IEMAllAImpl.asm@ 95578

Last change on this file since 95578 was 95578, checked in by vboxsync, 2 years ago
VMM/IEM: [v]ptest. bugref:9898
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 117.2 KB

Line
1	; $Id: IEMAllAImpl.asm 95578 2022-07-09 00:09:50Z vboxsync $
2	;; @file
3	; IEM - Instruction Implementation in Assembly.
4	;
5
6	;
7	; Copyright (C) 2011-2022 Oracle Corporation
8	;
9	; This file is part of VirtualBox Open Source Edition (OSE), as
10	; available from http://www.virtualbox.org. This file is free software;
11	; you can redistribute it and/or modify it under the terms of the GNU
12	; General Public License (GPL) as published by the Free Software
13	; Foundation, in version 2 as it comes in the "COPYING" file of the
14	; VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15	; hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16	;
17
18
19	;*********************************************************************************************************************************
20	;* Header Files *
21	;*********************************************************************************************************************************
22	%include "VBox/asmdefs.mac"
23	%include "VBox/err.mac"
24	%include "iprt/x86.mac"
25
26
27	;*********************************************************************************************************************************
28	;* Defined Constants And Macros *
29	;*********************************************************************************************************************************
30
;;
; Fastcall return helper.
;
; Emits 'ret %1' when assembling for 32-bit x86 on Windows (RT_ARCH_X86 and
; RT_OS_WINDOWS both defined) and a plain 'ret' in every other configuration.
;
; @param        1       The immediate operand used by the 'ret %1' form.
;
%macro RET_FASTCALL 1
 %ifndef RT_ARCH_X86
        ret
 %else
  %ifndef RT_OS_WINDOWS
        ret
  %else
        ret     %1
  %endif
 %endif
%endmacro
45
;;
; Symbol name for a fastcall function.
;
; The default definition simply expands to NAME(a_Name).  When both
; RT_ARCH_X86 and RT_OS_WINDOWS are defined it is replaced by a definition
; that pastes together a_Prefix, a_Name, '@' and a_cbArgs.
;
; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
;       escaping (or whatever the dollar is good for here).  Thus the ugly
;       prefix argument.
;
%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix)   NAME(a_Name)
%ifdef RT_OS_WINDOWS
 %ifdef RT_ARCH_X86
  %undef  NAME_FASTCALL
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
 %endif
%endif
60
;;
; Opens a fastcall function: emits whichever export/global directives apply
; to the selected output format, followed by the function's entry label.
;
; @param        1       The function name (C).
; @param        2       The argument size on x86.
;
%macro BEGINPROC_FASTCALL 2
 ; PE output: export under the plain name.
 %ifdef ASM_FORMAT_PE
  export %1=NAME_FASTCALL(%1,%2,$@)
 %endif
 ; OMF output: the export directive is only emitted when assembling with NASM.
 %ifdef ASM_FORMAT_OMF
  %ifdef __NASM__
  export NAME(%1) NAME_FASTCALL(%1,%2,$@)
  %endif
 %endif
 ; Everything but flat binary output gets a global symbol.
 %ifndef ASM_FORMAT_BIN
  global NAME_FASTCALL(%1,%2,$@)
 %endif
NAME_FASTCALL(%1,%2,@):
%endmacro
81
82
83	;
84	; We employ some macro assembly here to hide the calling convention differences.
85	;
86	%ifdef RT_ARCH_AMD64
;
; One argument, AMD64: the prologue is empty and the epilogue is a plain 'ret'.
;
%macro PROLOGUE_1_ARGS 0
%endmacro

%macro EPILOGUE_1_ARGS 0
        ret
%endmacro

;
; The 32-bit x86 variant of this macro takes the number of bytes to pop as
; its single parameter.  Accept (and ignore) that optional parameter here too,
; so the same 'EPILOGUE_1_ARGS_EX cb' invocation assembles for both
; architectures, in line with EPILOGUE_2/3/4_ARGS_EX.  Invocations without a
; parameter keep working.
;
; @param        1       Optional; ignored on AMD64.
;
%macro EPILOGUE_1_ARGS_EX 0-1
        ret
%endmacro
95
;
; Two, three and four arguments, AMD64: every prologue is empty and every
; epilogue is a plain 'ret'.  The single parameter of the _EX epilogues is
; accepted but not used.
;

; -- two arguments --
%macro PROLOGUE_2_ARGS 0
%endmacro

%macro EPILOGUE_2_ARGS 0
        ret
%endmacro

%macro EPILOGUE_2_ARGS_EX 1
        ret
%endmacro

; -- three arguments --
%macro PROLOGUE_3_ARGS 0
%endmacro

%macro EPILOGUE_3_ARGS 0
        ret
%endmacro

%macro EPILOGUE_3_ARGS_EX 1
        ret
%endmacro

; -- four arguments --
%macro PROLOGUE_4_ARGS 0
%endmacro

%macro EPILOGUE_4_ARGS 0
        ret
%endmacro

%macro EPILOGUE_4_ARGS_EX 1
        ret
%endmacro
122
%ifdef ASM_CALL64_GCC
 ; Argument register aliases used when ASM_CALL64_GCC is defined.
 %define A0             rdi
 %define A0_32          edi
 %define A0_16          di
 %define A0_8           dil

 %define A1             rsi
 %define A1_32          esi
 %define A1_16          si
 %define A1_8           sil

 %define A2             rdx
 %define A2_32          edx
 %define A2_16          dx
 %define A2_8           dl

 %define A3             rcx
 %define A3_32          ecx
 %define A3_16          cx
%endif

%ifdef ASM_CALL64_MSC
 ; Argument register aliases used when ASM_CALL64_MSC is defined.
 %define A0             rcx
 %define A0_32          ecx
 %define A0_16          cx
 %define A0_8           cl

 %define A1             rdx
 %define A1_32          edx
 %define A1_16          dx
 %define A1_8           dl

 %define A2             r8
 %define A2_32          r8d
 %define A2_16          r8w
 %define A2_8           r8b

 %define A3             r9
 %define A3_32          r9d
 %define A3_16          r9w
%endif

; Temporary register aliases (AMD64), the same for both calling conventions.
%define T0              rax
%define T0_32           eax
%define T0_16           ax
%define T0_8            al

%define T1              r11
%define T1_32           r11d
%define T1_16           r11w
%define T1_8            r11b

%define T2              r10             ; only AMD64
%define T2_32           r10d
%define T2_16           r10w
%define T2_8            r10b
179
180	%else
181	; x86
;
; One and two arguments (x86): the prologue only saves edi, which is T1 in
; this configuration; nothing is loaded from the stack.
;
%macro PROLOGUE_1_ARGS 0
        push    edi
%endmacro

%macro EPILOGUE_1_ARGS 0
        pop     edi
        ret     0
%endmacro

%macro EPILOGUE_1_ARGS_EX 1
        pop     edi
        ret     %1
%endmacro

%macro PROLOGUE_2_ARGS 0
        push    edi
%endmacro

%macro EPILOGUE_2_ARGS 0
        pop     edi
        ret     0
%endmacro

%macro EPILOGUE_2_ARGS_EX 1
        pop     edi
        ret     %1
%endmacro

;
; Three arguments (x86): ebx is saved and then loaded from the first stack
; argument slot (A2 is ebx in this configuration); edi is saved as well.
;
%macro PROLOGUE_3_ARGS 0
        push    ebx
        mov     ebx, [esp + 4 + 4]      ; skip the saved ebx and the return address
        push    edi
%endmacro

; Default three-argument epilogue: pops 4 bytes on return.
%macro EPILOGUE_3_ARGS 0
        EPILOGUE_3_ARGS_EX 4
%endmacro

; @param        1       Number of bytes to pop on return; must be at least 4.
%macro EPILOGUE_3_ARGS_EX 1
 %if (%1) < 4
  %error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
 %endif
        pop     edi
        pop     ebx
        ret     %1
%endmacro

;
; Four arguments (x86): ebx, edi and esi are saved, then ebx and esi are
; loaded from the first two stack argument slots (A2 is ebx and A3 is esi in
; this configuration).
;
%macro PROLOGUE_4_ARGS 0
        push    ebx
        push    edi
        push    esi
        mov     ebx, [esp + 12 + 4 + 0] ; 12 = three saved registers, 4 = return address
        mov     esi, [esp + 12 + 4 + 4]
%endmacro

; Default four-argument epilogue: pops 8 bytes on return.
%macro EPILOGUE_4_ARGS 0
        EPILOGUE_4_ARGS_EX 8
%endmacro

; @param        1       Number of bytes to pop on return; must be at least 8.
%macro EPILOGUE_4_ARGS_EX 1
 %if (%1) < 8
  %error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
 %endif
        pop     esi
        pop     edi
        pop     ebx
        ret     %1
%endmacro
242
; Argument register aliases (x86).  A2 and A3 are loaded from the stack by
; PROLOGUE_3_ARGS / PROLOGUE_4_ARGS.
%define A0              ecx
%define A0_32           ecx
%define A0_16           cx
%define A0_8            cl

%define A1              edx
%define A1_32           edx
%define A1_16           dx
%define A1_8            dl

%define A2              ebx
%define A2_32           ebx
%define A2_16           bx
%define A2_8            bl

%define A3              esi
%define A3_32           esi
%define A3_16           si

; Temporary register aliases (x86).
%define T0              eax
%define T0_32           eax
%define T0_16           ax
%define T0_8            al

%define T1              edi
%define T1_32           edi
%define T1_16           di
270	%endif
271
272
;;
; Loads the modified (%2) and undefined (%3) flag bits of the guest EFLAGS at
; [%1] into the host EFLAGS; all other host flag bits are kept as they are.
;
; The '%if (%3) != 0' guard is commented out, so the load is unconditional.
;
; @remarks      Clobbers T0, stack. Changes EFLAGS.
; @param        1       The parameter (A0..A3) pointing to the eflags.
; @param        2       The set of modified flags.
; @param        3       The set of undefined flags.
;
%macro IEM_MAYBE_LOAD_FLAGS 3
       ;%if (%3) != 0
        mov     T0_32, [%1]                     ; guest flags
        pushf                                   ; host flags -> top of stack
        and     T0_32, (%2 | %3)                ; guest: keep only the modified and undefined flags
        and     dword [xSP], ~(%2 | %3)         ; host: drop the modified and undefined flags
        or      [xSP], T0                       ; merge guest flags into the host image
        popf                                    ; activate the mixed flags
       ;%endif
%endmacro
292
;;
; Copies the modified (%2) and undefined (%3) flag bits from the host EFLAGS
; into the EFLAGS value stored at [%1].  Emits nothing if both masks are zero.
;
; @remarks      Clobbers T0, T1, stack.
; @param        1       The register pointing to the EFLAGS.
; @param        2       The mask of modified flags to save.
; @param        3       The mask of undefined flags to (maybe) save.
;
%macro IEM_SAVE_FLAGS 3
 %if (%2 | %3) != 0
        pushf
        pop     T1                              ; T1 = host flags
        and     T1_32, (%2 | %3)                ; host: keep only the modified and undefined flags
        mov     T0_32, [%1]                     ; T0 = stored flags
        and     T0_32, ~(%2 | %3)               ; stored: drop the modified and undefined flags
        or      T0_32, T1_32                    ; combine
        mov     [%1], T0_32                     ; write back
 %endif
%endmacro
312
;;
; Updates the EFLAGS value at [%1]: the modified flags (%2) are taken from the
; host EFLAGS, the flags in %3 are cleared and the flags in %4 are set.
; Emits nothing if all three masks are zero.
;
; @remarks      Clobbers T0, T1, stack.
; @param        1       The register pointing to the EFLAGS.
; @param        2       The mask of modified flags to save.
; @param        3       Mask of additional flags to always clear
; @param        4       Mask of additional flags to always set.
;
%macro IEM_SAVE_AND_ADJUST_FLAGS 4
 %if (%2 | %3 | %4) != 0
        pushf
        pop     T1                              ; T1 = host flags
        and     T1_32, (%2)                     ; host: keep only the modified flags
        mov     T0_32, [%1]                     ; T0 = stored flags
        and     T0_32, ~(%2 | %3)               ; stored: drop modified and always-cleared flags
        or      T0_32, T1_32                    ; combine
  %if (%4) != 0
        or      T0_32, %4                       ; add the always-set flags
  %endif
        mov     [%1], T0_32                     ; write back
 %endif
%endmacro
336
;;
; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
; signed input (%4[%5]) and parity index (%6).
;
; This is used by MUL and IMUL, where the result (%4 & %6) lives in xAX, which
; is also T0.  T1 therefore accumulates the new EFLAGS value, while the host
; flags are fetched into T2 on AMD64, or into T0 with T0 saved and restored
; around it on x86.
;
; @remarks      Clobbers T1, stack, %6, EFLAGS, and T2 on AMD64.
; @param        1       The register pointing to the EFLAGS.
; @param        2       The mask of modified flags to save.
; @param        3       Mask of additional flags to always clear
; @param        4       The result register to set SF by.
; @param        5       The width of the %4 register in bits (8, 16, 32, or 64).
; @param        6       The (full) register containing the parity table index. Will be modified!
;
%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF 6
 %ifdef RT_ARCH_AMD64
        pushf
        pop     T2                              ; T2 = host flags
        mov     T1_32, [%1]                     ; T1 = stored flags
        and     T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; drop modified, always-cleared, PF and SF
        and     T2_32, (%2)                     ; host: keep only the modified flags
        or      T1_32, T2_32                    ; combine
 %else
        push    T0                              ; T0 holds the result, keep it safe
        pushf
        pop     T0                              ; T0 = host flags
        mov     T1_32, [%1]                     ; T1 = stored flags
        and     T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; drop modified, always-cleared, PF and SF
        and     T0_32, (%2)                     ; host: keep only the modified flags
        or      T1_32, T0_32                    ; combine
        pop     T0                              ; restore the result
 %endif

        ; SF comes first, since %4 is likely to be the same register as %6,
        ; which is masked below.
        bt      %4, %5 - 1
        jnc     %%sign_bit_clear
        or      T1_32, X86_EFL_SF
%%sign_bit_clear:

        ; PF last: look it up in g_afParity using the low byte of %6.
        and     %6, 0xff
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T1_8, [T2 + %6]
 %else
        or      T1_8, [NAME(g_afParity) + %6]
 %endif

        mov     [%1], T1_32                     ; write back
%endmacro
390
;;
; Applies a fixed clear mask and a fixed set mask to the EFLAGS value at [%1].
; Emits nothing if both masks are zero.
;
; @remarks      Clobbers T0.
; @param        1       The register pointing to the EFLAGS.
; @param        2       Mask of additional flags to always clear
; @param        3       Mask of additional flags to always set.
;
%macro IEM_ADJUST_FLAGS 3
 %if (%2 | %3) != 0
        mov     T0_32, [%1]                     ; T0 = stored flags
  %if (%2) != 0
        and     T0_32, ~(%2)                    ; drop the always-cleared flags
  %endif
  %if (%3) != 0
        or      T0_32, %3                       ; add the always-set flags
  %endif
        mov     [%1], T0_32                     ; write back
 %endif
%endmacro
411
;;
; Applies a fixed clear mask and a fixed set mask to the EFLAGS value at [%1]
; and sets PF from the g_afParity table entry selected by the low byte of %4.
;
; @remarks      Clobbers T0, %4, EFLAGS, and T2 on AMD64.
; @param        1       The register pointing to the EFLAGS.
; @param        2       Mask of additional flags to always clear
; @param        3       Mask of additional flags to always set.
; @param        4       The (full) register containing the parity table index. Will be modified!
;
%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
        mov     T0_32, [%1]                     ; T0 = stored flags
        and     T0_32, ~(%2 | X86_EFL_PF)       ; drop PF and the always-cleared flags
 %if (%3) != 0
        or      T0_32, %3                       ; add the always-set flags
 %endif
        and     %4, 0xff                        ; table index = low byte only
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T0_8, [T2 + %4]
 %else
        or      T0_8, [NAME(g_afParity) + %4]
 %endif
        mov     [%1], T0_32                     ; write back
%endmacro
436
437
438	;*********************************************************************************************************************************
439	;* External Symbols *
440	;*********************************************************************************************************************************
441	extern NAME(g_afParity)
442
443
;;
; Generator for binary operators with a memory destination.
;
; Produces iemAImpl_<instr>_u8/_u16/_u32 and, when %2 is non-zero, the
; corresponding _locked functions.  The _u64 variants are only generated for
; RT_ARCH_AMD64; on 32-bit systems the 64-bit accesses require hand coding.
;
; Each function takes a pointer to the destination memory operand in A0, the
; source register operand in A1 and a pointer to eflags in A2.
;
; @param        1       The instruction mnemonic.
; @param        2       Non-zero if there should be a locked version.
; @param        3       The modified flags.
; @param        4       The undefined flags.
;
%macro IEMIMPL_BIN_OP 4
BEGINCODE

; --- 8-bit ---
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      byte [A0], A1_8
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

; --- 16-bit ---
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

; --- 32-bit ---
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

; --- 64-bit (AMD64 hosts only) ---
 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

; --- 8-bit, locked ---
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 byte [A0], A1_8
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

; --- 16-bit, locked ---
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

; --- 32-bit, locked ---
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

; --- 64-bit, locked (AMD64 hosts only) ---
  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

;              instr, lock, modified-flags,                                                                          undefined flags
IEMIMPL_BIN_OP add,   1,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP adc,   1,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sub,   1,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sbb,   1,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP or,    1,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF),              X86_EFL_AF
IEMIMPL_BIN_OP xor,   1,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF),              X86_EFL_AF
IEMIMPL_BIN_OP and,   1,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF),              X86_EFL_AF
IEMIMPL_BIN_OP cmp,   0,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP test,  0,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF),              X86_EFL_AF
543
544
;;
; Generator for three-operand (VEX) binary operators that update EFLAGS.
;
; Produces iemAImpl_<instr>_u32 and, for RT_ARCH_AMD64 only, iemAImpl_<instr>_u64;
; on 32-bit systems the 64-bit accesses require hand coding.
;
; Each function takes a pointer to the destination memory operand in A0, the
; first source register operand in A1, the second source register operand in
; A2 and a pointer to eflags in A3.  The instruction result is produced in T0
; and then stored to [A0].
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
%macro IEMIMPL_VEX_BIN_OP 3
; --- 32-bit ---
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        %1      T0_32, A1_32, A2_32
        mov     [A0], T0_32
        IEM_SAVE_FLAGS       A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

; --- 64-bit (AMD64 hosts only) ---
 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        %1      T0, A1, A2
        mov     [A0], T0
        IEM_SAVE_FLAGS       A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

;                  instr,  modified-flags,                                         undefined-flags
IEMIMPL_VEX_BIN_OP andn,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bextr,  (X86_EFL_OF | X86_EFL_ZF | X86_EFL_CF),              (X86_EFL_SF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bzhi,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
585
;;
; Macro for implementing BLSR, BLSMSK and BLSI (fallbacks implemented in C).
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit
; systems where the 64-bit accesses require hand coding.
;
; All the functions take a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; These are three-argument functions (12 bytes of arguments on x86), so the
; three-argument prologue/epilogue pair is used.  The four-argument pair would
; return with 'ret 8' on 32-bit hosts although only 4 bytes of arguments live
; on the stack, leaving the caller's stack pointer off by 4.  On AMD64 both
; pairs expand to the same code.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
%macro IEMIMPL_VEX_BIN_OP_2 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     T0_32, [A0]
        %1      T0_32, A1_32
        mov     [A0], T0_32
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     T0, [A0]
        %1      T0, A1
        mov     [A0], T0
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

;                    instr,  modified-flags,                                         undefined-flags
IEMIMPL_VEX_BIN_OP_2 blsr,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsmsk, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsi,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
627
628
;;
; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit
; systems where the 64-bit accesses require hand coding.
;
; All the functions take a pointer to the destination memory operand in A0,
; the first source register operand in A1 and the second source register
; operand in A2.  There is no eflags parameter.
;
; When %3 is non-zero, _fallback variants are generated as well.  They apply
; the two-operand instruction %2 to A1 with the count in CL and store the
; result through the destination pointer.  Where A0 is xCX (i.e. when
; ASM_CALL64_GCC is not defined) A0 and A2 are exchanged first, so from then
; on CL holds the count and A2 holds the destination pointer.
;
; @param        1       The instruction mnemonic.
; @param        2       Fallback instruction if applicable.
; @param        3       Whether to emit fallback or not.
;
%macro IEMIMPL_VEX_BIN_OP_NOEFL 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32, A2_32
        mov     [A0], T0_32
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %if %3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
        PROLOGUE_3_ARGS
  %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        %2      A1_32, cl
        mov     [A0], A1_32
  %else
        xchg    A2, A0                  ; CL = count, A2 = destination pointer
        %2      A1_32, cl
        mov     [A2], A1_32
  %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback
 %endif

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        %1      T0, A1, A2
        mov     [A0], T0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

  %if %3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
        PROLOGUE_3_ARGS
   %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        %2      A1, cl
        mov     [A0], A1                ; store the full 64-bit result
   %else
        xchg    A2, A0                  ; CL = count, A2 = destination pointer
        %2      A1, cl
        mov     [A2], A1                ; store the full 64-bit result; A0 is no longer a pointer here
   %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
  %endif
 %endif ; RT_ARCH_AMD64
%endmacro

;                        instr, fallback instr, emit fallback
IEMIMPL_VEX_BIN_OP_NOEFL sarx,  sar,            1
IEMIMPL_VEX_BIN_OP_NOEFL shlx,  shl,            1
IEMIMPL_VEX_BIN_OP_NOEFL shrx,  shr,            1
IEMIMPL_VEX_BIN_OP_NOEFL pdep,  nop,            0
IEMIMPL_VEX_BIN_OP_NOEFL pext,  nop,            0
700
701
;
; RORX uses an immediate byte for the shift count, so we only do
; fallback implementation of that one.
;
; iemAImpl_rorx_u32: rotates A1_32 right by the count in the low byte of A2
; and stores the result through the pointer passed in A0.
;
BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
        PROLOGUE_3_ARGS
%ifndef ASM_CALL64_GCC
        xchg    A2, A0                  ; CL = count, A2 = destination pointer
        ror     A1_32, cl
        mov     [A2], A1_32
%else
        mov     cl, A2_8                ; CL = count
        ror     A1_32, cl
        mov     [A0], A1_32
%endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u32
719
%ifdef RT_ARCH_AMD64
;
; iemAImpl_rorx_u64: rotates A1 right by the count in the low byte of A2 and
; stores the 64-bit result through the pointer passed in A0.
;
; Where A0 is xCX (ASM_CALL64_GCC not defined) A0 and A2 are exchanged so that
; CL holds the count; from then on the destination pointer lives in A2 and
; A0 must not be dereferenced.  The full 64-bit result is stored exactly once
; in each path.
;
BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8                ; CL = count
        ror     A1, cl
        mov     [A0], A1
 %else
        xchg    A2, A0                  ; CL = count, A2 = destination pointer
        ror     A1, cl
        mov     [A2], A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u64
%endif ; RT_ARCH_AMD64
736
737
;
; MULX
;
; iemAImpl_mulx_u32: executes MULX with A3_32 as the explicit source (EDX is
; the instruction's implicit source).  T0_32 and T1_32 are the two destination
; operands; T1_32 is stored through the second pointer argument, then T0_32
; through A0.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u32, 16
        PROLOGUE_4_ARGS
%ifndef ASM_CALL64_GCC
        ; A1 is xDX - must switch A1 and A2, so EDX=uSrc1
        xchg    A1, A2
        mulx    T0_32, T1_32, A3_32
        mov     [A2], T1_32 ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0_32
%else
        ; A2_32 is EDX - perfect
        mulx    T0_32, T1_32, A3_32
        mov     [A1], T1_32 ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0_32
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32
757
758
;
; iemAImpl_mulx_u32_fallback: MULX emulated with MUL, computing
; EDX:EAX = uSrc1 * uSrc2.  The low half (EAX) is stored through the second
; pointer argument, then the high half (EDX) through A0.
;
; In the non-GCC path A1 (xDX) and A2 are exchanged first.  After the
; exchange EDX (= A1_32) holds uSrc1 and A2 holds the low-half destination
; pointer, so the multiplier operand must be A1_32, not A2_32.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u32_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX, T0_32 is EAX
        mov     eax, A3_32
        mul     A2_32
        mov     [A1], eax ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%else
        ; A1 is xDX, T0_32 is EAX - must switch A1 and A2, so EDX=uSrc1
        xchg    A1, A2
        mov     eax, A3_32
        mul     A1_32     ; EDX:EAX = uSrc2 * uSrc1 (A2 now holds the destination pointer).
        mov     [A2], eax ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32_fallback
777
778	%ifdef RT_ARCH_AMD64
;
; iemAImpl_mulx_u64: executes MULX with A3 as the explicit source (RDX is the
; instruction's implicit source).  T0 and T1 are the two destination operands;
; T1 is stored through the second pointer argument, then T0 through A0.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u64, 16
        PROLOGUE_4_ARGS
 %ifndef ASM_CALL64_GCC
        ; A1 is xDX - must switch A1 and A2, so RDX=uSrc1
        xchg    A1, A2
        mulx    T0, T1, A3
        mov     [A2], T1 ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
 %else
        ; A2 is RDX - perfect
        mulx    T0, T1, A3
        mov     [A1], T1 ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
 %endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64
795
796
;
; iemAImpl_mulx_u64_fallback: MULX emulated with MUL, computing
; RDX:RAX = uSrc1 * uSrc2.  The low half (RAX) is stored through the second
; pointer argument, then the high half (RDX) through A0.
;
; In the non-GCC path A1 (xDX) and A2 are exchanged first.  After the
; exchange RDX (= A1) holds uSrc1 and A2 holds the low-half destination
; pointer, so the multiplier operand must be A1, not A2.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u64_fallback, 16
        PROLOGUE_4_ARGS
 %ifdef ASM_CALL64_GCC
        ; A2 is RDX, T0 is RAX
        mov     rax, A3
        mul     A2
        mov     [A1], rax ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
 %else
        ; A1 is xDX, T0 is RAX - must switch A1 and A2, so RDX=uSrc1
        xchg    A1, A2
        mov     rax, A3
        mul     A1        ; RDX:RAX = uSrc2 * uSrc1 (A2 now holds the destination pointer).
        mov     [A2], rax ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
 %endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64_fallback
815
816	%endif
817
818
;;
; Generator for bit operators with a memory destination.
;
; Produces iemAImpl_<instr>_u16/_u32 and, when %2 is non-zero, the
; corresponding _locked functions.  The _u64 variants are only generated for
; RT_ARCH_AMD64; on 32-bit systems the 64-bit accesses require hand coding.
;
; Each function takes a pointer to the destination memory operand in A0, the
; source register operand in A1 and a pointer to eflags in A2.
;
; @param        1       The instruction mnemonic.
; @param        2       Non-zero if there should be a locked version.
; @param        3       The modified flags.
; @param        4       The undefined flags.
;
%macro IEMIMPL_BIT_OP 4
BEGINCODE

; --- 16-bit ---
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

; --- 32-bit ---
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

; --- 64-bit (AMD64 hosts only) ---
 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

; --- 16-bit, locked ---
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

; --- 32-bit, locked ---
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

; --- 64-bit, locked (AMD64 hosts only) ---
  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

;              instr, lock, modified-flags, undefined flags
IEMIMPL_BIT_OP bt,    0,    (X86_EFL_CF),   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btc,   1,    (X86_EFL_CF),   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP bts,   1,    (X86_EFL_CF),   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btr,   1,    (X86_EFL_CF),   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
895
;;
; Generator for bit search operators.
;
; For each operand size (16 and 32 bits, plus 64 bits on RT_ARCH_AMD64 only)
; three functions are produced:
;   - iemAImpl_<instr>_uNN        takes the %2 and %3 flags from the host CPU,
;   - iemAImpl_<instr>_uNN_intel  sets the flags from fixed masks plus a parity lookup,
;   - iemAImpl_<instr>_uNN_amd    takes only the %2 flags from the host CPU.
;
; Each function takes a pointer to the destination memory operand in A0, the
; source register operand in A1 and a pointer to eflags in A2.
;
; In the ZF case the destination register is 'undefined', however it seems that
; both AMD and Intel just leave it as is.  The undefined EFLAGS differ between
; AMD and Intel and, according to https://www.sandpile.org/x86/flags.htm, between
; Intel microarchitectures.  We only implement 'intel' and 'amd' variations with
; the behaviour of more recent CPUs (Intel 10980X and AMD 3990X).
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
; @param        4       Non-zero if destination isn't written when ZF=1.  Zero if always written.
;
%macro IEMIMPL_BIT_OP2 4
BEGINCODE

;
; 16-bit
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_16, A1_16
 %if %4 != 0
        jz      .dst_untouched
 %endif
        mov     [A0], T0_16
.dst_untouched:
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
        PROLOGUE_3_ARGS
        %1      T1_16, A1_16
 %if %4 != 0
        jz      .dst_untouched
 %endif
        mov     [A0], T1_16
        ; Destination written: clear OF/SF/AF/CF/ZF, PF from the result's low byte.
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.dst_untouched:
        ; Destination left alone: clear OF/SF/AF/CF, set ZF and PF.
        IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
        PROLOGUE_3_ARGS
        %1      T0_16, A1_16
 %if %4 != 0
        jz      .dst_untouched
 %endif
        mov     [A0], T0_16
.dst_untouched:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0  ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_amd


;
; 32-bit
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_32, A1_32
 %if %4 != 0
        jz      .dst_untouched
 %endif
        mov     [A0], T0_32
.dst_untouched:
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
        PROLOGUE_3_ARGS
        %1      T1_32, A1_32
 %if %4 != 0
        jz      .dst_untouched
 %endif
        mov     [A0], T1_32
        ; Destination written: clear OF/SF/AF/CF/ZF, PF from the result's low byte.
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.dst_untouched:
        ; Destination left alone: clear OF/SF/AF/CF, set ZF and PF.
        IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32
 %if %4 != 0
        jz      .dst_untouched
 %endif
        mov     [A0], T0_32
.dst_untouched:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0  ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_amd


;
; 64-bit (AMD64 hosts only)
;
 %ifdef RT_ARCH_AMD64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0, A1
  %if %4 != 0
        jz      .dst_untouched
  %endif
        mov     [A0], T0
.dst_untouched:
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
        PROLOGUE_3_ARGS
        ; NOTE(review): unlike the 16- and 32-bit _intel variants this one loads
        ; the guest flags before the instruction - confirm whether that is intended.
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T1, A1
  %if %4 != 0
        jz      .dst_untouched
  %endif
        mov     [A0], T1
        ; Destination written: clear OF/SF/AF/CF/ZF, PF from the result's low byte.
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.dst_untouched:
        ; Destination left alone: clear OF/SF/AF/CF, set ZF and PF.
        IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
        PROLOGUE_3_ARGS
        %1      T0, A1
  %if %4 != 0
        jz      .dst_untouched
  %endif
        mov     [A0], T0
.dst_untouched:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0  ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_amd

 %endif ; RT_ARCH_AMD64
%endmacro

;               instr, modified-flags,             undefined-flags,                                                       skip-write-on-ZF
IEMIMPL_BIT_OP2 bsf,   (X86_EFL_ZF),               (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 bsr,   (X86_EFL_ZF),               (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF),  (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF),              0
IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF),  (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF),              0
1047
1048
;;
; Generator for POPCNT.
;
; Produces iemAImpl_<instr>_u16/_u32 and, for RT_ARCH_AMD64 only,
; iemAImpl_<instr>_u64; on 32-bit systems the 64-bit accesses require hand
; coding.
;
; Each function takes a pointer to the destination memory operand in A0, the
; source register operand in A1 and a pointer to eflags in A2.  The result is
; produced in T0 and then stored to [A0].
;
; ASSUMES Intel and AMD set EFLAGS the same way.
;
; ASSUMES the instruction does not support memory destination.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
%macro IEMIMPL_BIT_OP3 3
BEGINCODE

; --- 16-bit ---
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_16, A1_16
        mov     [A0], T0_16
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

; --- 32-bit ---
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_32, A1_32
        mov     [A0], T0_32
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

; --- 64-bit (AMD64 hosts only) ---
 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0, A1
        mov     [A0], T0
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

;               instr,  modified-flags,                                                                          undefined-flags
IEMIMPL_BIT_OP3 popcnt, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1098
1099
;
; Two-operand IMUL (no lock prefix, no memory destination on the instruction
; itself).  The rDX:rAX form of imul is generated together with mul further
; down in this file.
;
; Every generated function takes:
;   A0 - pointer to the first factor; the product is written back through it.
;   A1 - the second factor (by value); it doubles as the work register.
;   A2 - pointer to the eflags variable.
;
BEGINCODE
; @param        1       EFLAGS that are modified.
; @param        2       Undefined EFLAGS.
; @param        3       Function suffix (empty, _intel or _amd).
; @param        4       EFLAGS variation selector: 0 for native, 1 for Intel, 2 for AMD.
;                       Only the value 1 changes the generated code: it swaps the
;                       plain IEM_SAVE_FLAGS for IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF,
;                       which also receives the result register and operand width.
;                       Values 0 and 2 generate identical flag handling.
%macro IEMIMPL_IMUL_TWO 4

; 16-bit operands.
BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS                 A2, %1, %2
        imul    A1_16, word [A0]
        mov     [A0], A1_16                  ; store the product; mov keeps the flags.
 %if %4 != 1
        IEM_SAVE_FLAGS                       A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_16, 16, A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u16 %+ %3

; 32-bit operands.
BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS                 A2, %1, %2
        imul    A1_32, dword [A0]
        mov     [A0], A1_32                  ; store the product; mov keeps the flags.
 %if %4 != 1
        IEM_SAVE_FLAGS                       A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_32, 32, A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u32 %+ %3

 %ifdef RT_ARCH_AMD64
; 64-bit operands, 64-bit hosts only.
BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS                 A2, %1, %2
        imul    A1, qword [A0]
        mov     [A0], A1                     ; store the product; mov keeps the flags.
 %if %4 != 1
        IEM_SAVE_FLAGS                       A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1, 64, A1
 %endif
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_imul_two_u64 %+ %3
 %endif ; RT_ARCH_AMD64
%endmacro

; Native, Intel and AMD instantiations (note the empty suffix for native).
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF,       , 0
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0,                                                 _intel, 1
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0,                                                 _amd,   2
1155
1156
1157	;
1158	; XCHG for memory operands. This implies locking. No flag changes.
1159	;
1160	; Each function takes two arguments, first the pointer to the memory,
1161	; then the pointer to the register. They all return void.
1162	;
1163	BEGINCODE
; Byte exchange between the memory operand (A0) and the register slot (A1).
; xchg with a memory operand is what provides the locking here.
BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_8, [A1]              ; T0 = register value
        xchg    [A0], T0_8              ; memory <- register value, T0 <- old memory value
        mov     [A1], T0_8              ; register slot <- old memory value
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_locked
1171
; Word exchange between the memory operand (A0) and the register slot (A1).
; xchg with a memory operand is what provides the locking here.
BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A1]             ; T0 = register value
        xchg    [A0], T0_16             ; memory <- register value, T0 <- old memory value
        mov     [A1], T0_16             ; register slot <- old memory value
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_locked
1179
; Dword exchange between the memory operand (A0) and the register slot (A1).
; xchg with a memory operand is what provides the locking here.
BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A1]             ; T0 = register value
        xchg    [A0], T0_32             ; memory <- register value, T0 <- old memory value
        mov     [A1], T0_32             ; register slot <- old memory value
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_locked
1187
%ifdef RT_ARCH_AMD64
; Qword exchange between the memory operand (A0) and the register slot (A1).
; Only built for 64-bit hosts.
BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
        PROLOGUE_2_ARGS
        mov     T0, [A1]                ; T0 = register value
        xchg    [A0], T0                ; memory <- register value, T0 <- old memory value
        mov     [A1], T0                ; register slot <- old memory value
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_locked
%endif ; RT_ARCH_AMD64
1197
1198	; Unlocked variants for fDisregardLock mode.
1199
; Byte exchange done with plain loads and stores (no xchg instruction, hence
; no bus lock).  A0 = memory operand, A1 = register slot.
BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
        PROLOGUE_2_ARGS
        ; Read both sides first ...
        mov     T0_8, [A1]
        mov     T1_8, [A0]
        ; ... then write them back crosswise.
        mov     [A0], T0_8
        mov     [A1], T1_8
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_unlocked
1208
; Word exchange done with plain loads and stores (no xchg instruction, hence
; no bus lock).  A0 = memory operand, A1 = register slot.
BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
        PROLOGUE_2_ARGS
        ; Read both sides first ...
        mov     T0_16, [A1]
        mov     T1_16, [A0]
        ; ... then write them back crosswise.
        mov     [A0], T0_16
        mov     [A1], T1_16
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_unlocked
1217
; Dword exchange done with plain loads and stores (no xchg instruction, hence
; no bus lock).  A0 = memory operand, A1 = register slot.
BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
        PROLOGUE_2_ARGS
        ; Read both sides first ...
        mov     T0_32, [A1]
        mov     T1_32, [A0]
        ; ... then write them back crosswise.
        mov     [A0], T0_32
        mov     [A1], T1_32
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_unlocked
1226
%ifdef RT_ARCH_AMD64
; Qword exchange done with plain loads and stores (no xchg instruction, hence
; no bus lock).  A0 = memory operand, A1 = register slot.  64-bit hosts only.
BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
        PROLOGUE_2_ARGS
        ; Read both sides first ...
        mov     T0, [A1]
        mov     T1, [A0]
        ; ... then write them back crosswise.
        mov     [A0], T0
        mov     [A1], T1
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_unlocked
%endif ; RT_ARCH_AMD64
1237
1238
1239	;
1240	; XADD for memory operands.
1241	;
1242	; Each function takes three arguments, first the pointer to the
1243	; memory/register, then the pointer to the register, and finally a pointer to
1244	; eflags. They all return void.
1245	;
1246	BEGINCODE
; XADD, byte, no lock prefix.
; A0 = pointer to the memory/register destination, A1 = pointer to the
; register operand, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]              ; fetch the register operand
        xadd    [A0], T0_8              ; destination += T0, T0 <- previous destination
        mov     [A1], T0_8              ; hand the previous destination back
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8
1256
; XADD, word, no lock prefix.
; A0 = pointer to the memory/register destination, A1 = pointer to the
; register operand, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]             ; fetch the register operand
        xadd    [A0], T0_16             ; destination += T0, T0 <- previous destination
        mov     [A1], T0_16             ; hand the previous destination back
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16
1266
; XADD, dword, no lock prefix.
; A0 = pointer to the memory/register destination, A1 = pointer to the
; register operand, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]             ; fetch the register operand
        xadd    [A0], T0_32             ; destination += T0, T0 <- previous destination
        mov     [A1], T0_32             ; hand the previous destination back
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32
1276
%ifdef RT_ARCH_AMD64
; XADD, qword, no lock prefix.  64-bit hosts only.
; A0 = pointer to the memory/register destination, A1 = pointer to the
; register operand, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]                ; fetch the register operand
        xadd    [A0], T0                ; destination += T0, T0 <- previous destination
        mov     [A1], T0                ; hand the previous destination back
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64
%endif ; RT_ARCH_AMD64
1288
; XADD, byte, with lock prefix.
; A0 = pointer to the memory destination, A1 = pointer to the register
; operand, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]              ; fetch the register operand
        lock xadd [A0], T0_8            ; atomic: destination += T0, T0 <- previous destination
        mov     [A1], T0_8              ; hand the previous destination back
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8_locked
1298
; XADD, word, with lock prefix.
; A0 = pointer to the memory destination, A1 = pointer to the register
; operand, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]             ; fetch the register operand
        lock xadd [A0], T0_16           ; atomic: destination += T0, T0 <- previous destination
        mov     [A1], T0_16             ; hand the previous destination back
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16_locked
1308
; XADD, dword, with lock prefix.
; A0 = pointer to the memory destination, A1 = pointer to the register
; operand, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]             ; fetch the register operand
        lock xadd [A0], T0_32           ; atomic: destination += T0, T0 <- previous destination
        mov     [A1], T0_32             ; hand the previous destination back
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32_locked
1318
%ifdef RT_ARCH_AMD64
; XADD, qword, with lock prefix.  64-bit hosts only.
; A0 = pointer to the memory destination, A1 = pointer to the register
; operand, A2 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]                ; fetch the register operand
        lock xadd [A0], T0              ; atomic: destination += T0, T0 <- previous destination
        mov     [A1], T0                ; hand the previous destination back
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64_locked
%endif ; RT_ARCH_AMD64
1330
1331
;
; CMPXCHG8B.
;
; The instruction pins EDX:EAX and ECX:EBX, which collides with the argument
; registers, so the body is written out separately for each calling
; convention instead of using the generic prologue/epilogue macros.
;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
;       IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
;                                                   uint32_t *pEFlags));
;
; Note! Same structure as iemAImpl_cmpxchg16b.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
        ;
        ; Microsoft x64: rcx = pu64Dst, rdx = pu64EaxEdx, r8 = pu64EbxEcx, r9 = pEFlags.
        ;
        push    rbx                     ; rbx is callee-saved and needed for the low replacement half.

        mov     r11, rdx                ; pu64EaxEdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     ebx, [r8]               ; replacement value, low dword
        mov     ecx, [r8 + 4]           ; replacement value, high dword
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [r11]              ; comparand, low dword
        mov     edx, [r11 + 4]          ; comparand, high dword

        lock cmpxchg8b [r10]

        mov     [r11], eax              ; write EDX:EAX back to the caller
        mov     [r11 + 4], edx
        IEM_SAVE_FLAGS       r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        ;
        ; System V AMD64: rdi = pu64Dst, rsi = pu64EaxEdx, rdx = pu64EbxEcx, rcx = pEFlags.
        ;
        push    rbx                     ; rbx is callee-saved and needed for the low replacement half.

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64EbxEcx (is also T1)

        mov     ebx, [r11]              ; replacement value, low dword
        mov     ecx, [r11 + 4]          ; replacement value, high dword
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [rsi]              ; comparand, low dword
        mov     edx, [rsi + 4]          ; comparand, high dword

        lock cmpxchg8b [rdi]

        mov     [rsi], eax              ; write EDX:EAX back to the caller
        mov     [rsi + 4], edx
        IEM_SAVE_FLAGS       r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
%else
        ;
        ; 32-bit fastcall: ecx = pu64Dst, edx = pu64EaxEdx, the remaining two
        ; pointers are on the stack.
        ;
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64EaxEdx
        mov     ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx (16 = the four pushes, 4 = return address)
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]              ; replacement value, low dword
        mov     ecx, [ecx + 4]          ; replacement value, high dword (consumes the pointer last)
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]              ; comparand, low dword
        mov     edx, [esi + 4]          ; comparand, high dword

        lock cmpxchg8b [edi]

        mov     [esi], eax              ; write EDX:EAX back to the caller
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS       ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8                       ; drop the two stack arguments
%endif
ENDPROC iemAImpl_cmpxchg8b
1421
; The plain worker already issues 'lock cmpxchg8b', so the locked entry point
; simply tail-jumps to it.
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg8b,16,$@)
ENDPROC iemAImpl_cmpxchg8b_locked
1426
1427	%ifdef RT_ARCH_AMD64
1428
;
; CMPXCHG16B.
;
; The instruction pins RDX:RAX and RCX:RBX, which collides with the argument
; registers, so the body is written out separately for each calling
; convention instead of using the generic prologue/epilogue macros.
;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
;       IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
;                                                    uint32_t *pEFlags));
;
; Note! Same structure as iemAImpl_cmpxchg8b.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
 %ifdef ASM_CALL64_MSC
        ;
        ; Microsoft x64: rcx = pu128Dst, rdx = pu128RaxRdx, r8 = pu128RbxRcx, r9 = pEFlags.
        ;
        push    rbx                     ; rbx is callee-saved and needed for the low replacement half.

        mov     r11, rdx                ; pu128RaxRdx (is also T1)
        mov     r10, rcx                ; pu128Dst

        mov     rbx, [r8]               ; replacement value, low qword
        mov     rcx, [r8 + 8]           ; replacement value, high qword
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [r11]              ; comparand, low qword
        mov     rdx, [r11 + 8]          ; comparand, high qword

        lock cmpxchg16b [r10]

        mov     [r11], rax              ; write RDX:RAX back to the caller
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS       r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        ;
        ; System V AMD64: rdi = pu128Dst, rsi = pu128RaxRdx, rdx = pu128RbxRcx, rcx = pEFlags.
        ;
        push    rbx                     ; rbx is callee-saved and needed for the low replacement half.

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu128RbxRcx (is also T1)

        mov     rbx, [r11]              ; replacement value, low qword
        mov     rcx, [r11 + 8]          ; replacement value, high qword
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [rsi]              ; comparand, low qword
        mov     rdx, [rsi + 8]          ; comparand, high qword

        lock cmpxchg16b [rdi]

        mov     [rsi], rax              ; write RDX:RAX back to the caller
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS       r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b
1488
; The plain worker already issues 'lock cmpxchg16b', so the locked entry point
; simply tail-jumps to it.
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg16b,16,$@)
ENDPROC iemAImpl_cmpxchg16b_locked
1493
1494	%endif ; RT_ARCH_AMD64
1495
1496
;
; CMPXCHG.
;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
;       IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t *puEax, uintX_t uReg, uint32_t *pEFlags));
;
; Arguments as used by the code below:
;   A0 - pointer to the destination operand.
;   A1 - pointer to the accumulator value; it is read before and written back
;        after the instruction.
;   A2 - the replacement value, passed by value.  Exception: the 64-bit
;        function on 32-bit hosts receives a pointer to it instead.
;   A3 - pointer to the eflags variable.
;
BEGINCODE
; @param        1       Instruction prefix: empty or 'lock'.
; @param        2       Function name suffix: empty or '_locked'.
%macro IEMIMPL_CMPXCHG 2

; 8-bit operands.
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     al, [A1]
        %1 cmpxchg [A0], A2_8
        mov     [A1], al
        IEM_SAVE_FLAGS       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u8 %+ %2

; 16-bit operands.
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     ax, [A1]
        %1 cmpxchg [A0], A2_16
        mov     [A1], ax
        IEM_SAVE_FLAGS       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u16 %+ %2

; 32-bit operands.
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [A1]
        %1 cmpxchg [A0], A2_32
        mov     [A1], eax
        IEM_SAVE_FLAGS       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u32 %+ %2

; 64-bit operands.
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
%ifdef RT_ARCH_AMD64
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     rax, [A1]
        %1 cmpxchg [A0], A2
        mov     [A1], rax
        IEM_SAVE_FLAGS       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
%else
        ;
        ; Must use cmpxchg8b here.  See also iemAImpl_cmpxchg8b.
        ; The lock prefix is always applied, regardless of the macro prefix parameter.
        ;
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64Rax
        mov     ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]              ; replacement value, low dword
        mov     ecx, [ecx + 4]          ; replacement value, high dword (consumes the pointer last)
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        ; cmpxchg8b only reports ZF (1 = comparand matched and the exchange
        ; happened), so CF, PF, AF, SF and OF have to be produced here.
        ; Branch on ZF *clear*: a mismatch must never fall into the
        ; 'cmp eax, eax' below, as that would report ZF=1 to the guest.
        jnz     .cmpxchg8b_not_equal
        cmp     eax, eax                ; match: produce the flags of an equal compare.
.store:
        mov     [esi], eax              ; mov keeps the flags computed above.
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS       ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8                       ; drop the two stack arguments

.cmpxchg8b_not_equal:
        ; Mismatch: EDX:EAX now holds the destination value while [esi] still
        ; holds the caller's comparand, so compare comparand against destination,
        ; high dword first.
        cmp     [esi + 4], edx          ;; @todo FIXME - verify 64-bit compare implementation
        jne     .store
        cmp     [esi], eax
        jmp     .store

%endif
ENDPROC iemAImpl_cmpxchg_u64 %+ %2
%endmacro ; IEMIMPL_CMPXCHG

IEMIMPL_CMPXCHG , ,
IEMIMPL_CMPXCHG lock, _locked
1594
;;
; Generator macro for unary read-modify-write instructions.
;
; Emits plain and lock-prefixed workers for 8, 16 and 32 bit operands, plus
; the 64-bit pair when RT_ARCH_AMD64 is defined (32-bit hosts need hand-coded
; 64-bit variants elsewhere).
;
; Every generated function takes two arguments: a pointer to the operand in
; A0 (modified in place) and a pointer to eflags in A1.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
%macro IEMIMPL_UNARY_OP 3
BEGINCODE

; ---- 8-bit ----
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      byte [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 byte [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

; ---- 16-bit ----
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      word [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 word [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

; ---- 32-bit ----
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      dword [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 dword [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

 %ifdef RT_ARCH_AMD64
; ---- 64-bit (64-bit hosts only) ----
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      qword [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 qword [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64

%endmacro

; inc/dec leave CF out of their modified set; not is instantiated with no flags at all.
IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_UNARY_OP not, 0, 0
1683
1684
1685	;
1686	; BSWAP. No flag changes.
1687	;
1688	; Each function takes one argument, pointer to the value to bswap
1689	; (input/output). They all return void.
1690	;
; BSWAP with a 16-bit operand size.  A0 = pointer to the value (in/out).
; The instruction is emitted as the 32-bit bswap preceded by a raw 66h
; operand-size prefix byte, and a full dword is loaded and stored around it.
BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]             ; load all 32 bits, just in case any of the upper bits are used.
        db      66h                     ; operand-size override for the bswap that follows
        bswap   T0_32
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u16
1699
; BSWAP, 32-bit.  A0 = pointer to the value, byte-reversed in place.
BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]             ; fetch
        bswap   T0_32                   ; reverse the four bytes
        mov     [A0], T0_32             ; store back
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u32
1707
; BSWAP, 64-bit.  A0 = pointer to the value, byte-reversed in place.
BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
%ifdef RT_ARCH_AMD64
        ; 64-bit host: a single qword bswap does the job.
        PROLOGUE_1_ARGS
        mov     T0, [A0]
        bswap   T0
        mov     [A0], T0
        EPILOGUE_1_ARGS
%else
        ; 32-bit host: reverse each dword and store the two halves swapped.
        PROLOGUE_1_ARGS
        mov     T0, [A0]                ; low dword
        mov     T1, [A0 + 4]            ; high dword
        bswap   T0
        bswap   T1
        mov     [A0 + 4], T0            ; reversed low dword becomes the new high dword
        mov     [A0], T1                ; reversed high dword becomes the new low dword
        EPILOGUE_1_ARGS
%endif
ENDPROC iemAImpl_bswap_u64
1726
1727
;;
; Generator macro for shift and rotate instructions with a CL count.
;
; Emits workers for 8, 16 and 32 bit operands, plus the 64-bit one when
; RT_ARCH_AMD64 is defined (32-bit hosts need a hand-coded 64-bit variant).
;
; Every generated function takes:
;   A0 - pointer to the destination operand (shifted in place).
;   A1 - the shift count.
;   A2 - pointer to the eflags variable.
;
; The count has to end up in CL.  With ASM_CALL64_GCC it is copied there from
; the low byte of A1; otherwise A0 and A1 are swapped, after which the count
; is addressed as cl and the destination pointer through A1.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
;
; @note the _intel and _amd variants are implemented in C.
;
%macro IEMIMPL_SHIFT_OP 3
BEGINCODE

; 8-bit destination.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      byte [A0], cl
 %else
        xchg    A1, A0                  ; count into cl, destination pointer into A1
        %1      byte [A1], cl
 %endif
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

; 16-bit destination.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      word [A0], cl
 %else
        xchg    A1, A0                  ; count into cl, destination pointer into A1
        %1      word [A1], cl
 %endif
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

; 32-bit destination.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      dword [A0], cl
 %else
        xchg    A1, A0                  ; count into cl, destination pointer into A1
        %1      dword [A1], cl
 %endif
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
; 64-bit destination, 64-bit hosts only.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
  %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      qword [A0], cl
  %else
        xchg    A1, A0                  ; count into cl, destination pointer into A1
        %1      qword [A1], cl
  %endif
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

; Rotates list only OF and CF; the shifts also list SF/ZF/PF and mark AF as undefined.
IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1814
1815
;;
; Generator macro for the double precision shifts (SHLD / SHRD) with a CL count.
;
; Emits workers for 16 and 32 bit operands, plus the 64-bit one when
; RT_ARCH_AMD64 is defined (32-bit hosts need a hand-coded 64-bit variant).
;
; Every generated function takes:
;   A0 - pointer to the destination operand (r/m), updated in place.
;   A1 - the source register value.
;   A2 - the shift count.
;   A3 - pointer to the eflags variable/register.
;
; The count has to end up in CL.  With ASM_CALL64_GCC, A2 and A3 are swapped
; around the instruction and swapped back afterwards so IEM_SAVE_FLAGS still
; finds the eflags pointer in A3.  Otherwise A0 and A2 are swapped once and
; the destination is addressed through A2.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
;
; @note the _intel and _amd variants are implemented in C.
;
%macro IEMIMPL_SHIFT_DBL_OP 3
BEGINCODE

; 16-bit operands.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; count into cl; xchg does not touch the flags
        %1      [A0], A1_16, cl
        xchg    A3, A2                  ; eflags pointer back into A3
 %else
        xchg    A0, A2                  ; count into cl, destination pointer into A2
        %1      [A2], A1_16, cl
 %endif
        IEM_SAVE_FLAGS       A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

; 32-bit operands.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; count into cl; xchg does not touch the flags
        %1      [A0], A1_32, cl
        xchg    A3, A2                  ; eflags pointer back into A3
 %else
        xchg    A0, A2                  ; count into cl, destination pointer into A2
        %1      [A2], A1_32, cl
 %endif
        IEM_SAVE_FLAGS       A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
; 64-bit operands, 64-bit hosts only.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
  %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; count into cl; xchg does not touch the flags
        %1      [A0], A1, cl
        xchg    A3, A2                  ; eflags pointer back into A3
  %else
        xchg    A0, A2                  ; count into cl, destination pointer into A2
        %1      [A2], A1, cl
  %endif
        IEM_SAVE_FLAGS       A3, %2, %3
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1886
1887
;;
; Generator macro for the one-operand multiplications (MUL / IMUL using the
; accumulator and, for the wider sizes, rDX).
;
; Emits workers for 8, 16 and 32 bit operands, plus the 64-bit one when
; RT_ARCH_AMD64 is defined.
;
; The 8-bit function only operates on AX and therefore takes three arguments:
;   A0 - pointer to AX, A1 - the operand, A2 - pointer to eflags.
; The wider functions take four:
;   A0 - pointer to rAX, A1 - pointer to rDX, A2 - the operand,
;   A3 - pointer to eflags.
;
; All functions return 0 in eax, so that callers can share one code path with
; the div/idiv workers, which use the return value to report errors.
;
; Outside ASM_CALL64_GCC the rDX pointer is first copied from A1 to T1 and the
; high half of the product is stored through T1.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
; @param        4       Name suffix.
; @param        5       EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
;                       Only the value 1 changes the generated code: it swaps
;                       IEM_SAVE_FLAGS for IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF.
;
; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
;
%macro IEMIMPL_MUL_OP 5
BEGINCODE

; 8-bit: AX = AL * operand.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS                 A2, %2, %3
        mov     al, [A0]
        %1      A1_8
        mov     [A0], ax                     ; the full 16-bit product goes back.
 %if %5 != 1
        IEM_SAVE_FLAGS                       A2, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX
 %endif
        xor     eax, eax                     ; return 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4

; 16-bit: DX:AX = AX * operand.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS                 A3, %2, %3
        mov     ax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_16
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                       ; keep the rDX pointer in T1 across the multiply
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS                       A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX
 %endif
        xor     eax, eax                     ; return 0
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4

; 32-bit: EDX:EAX = EAX * operand.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS                 A3, %2, %3
        mov     eax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                       ; keep the rDX pointer in T1 across the multiply
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS                       A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX
 %endif
        xor     eax, eax                     ; return 0
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
; 64-bit: RDX:RAX = RAX * operand.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS                 A3, %2, %3
        mov     rax, [A0]
  %ifdef ASM_CALL64_GCC
        %1      A2
        mov     [A0], rax
        mov     [A1], rdx
  %else
        mov     T1, A1                       ; keep the rDX pointer in T1 across the multiply
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
  %endif
  %if %5 != 1
        IEM_SAVE_FLAGS                       A3, %2, %3
  %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX
  %endif
        xor     eax, eax                     ; return 0
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
 %endif ; RT_ARCH_AMD64

%endmacro

; Native, Intel and AMD flavours of both instructions (note the empty suffix for native).
IEMIMPL_MUL_OP mul,  (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF),       , 0
IEMIMPL_MUL_OP mul,  (X86_EFL_OF | X86_EFL_CF), 0,                                                   _intel, 1
IEMIMPL_MUL_OP mul,  (X86_EFL_OF | X86_EFL_CF), 0,                                                   _amd,   2
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF),       , 0
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0,                                                   _intel, 1
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0,                                                   _amd,   2
2005
2006
2007	BEGINCODE
;;
; Worker that negates the double-width value held in T1_32:T0_32
; (T0_32 = low half, T1_32 = high half).
;
; Two zeroed stack slots are swapped with the registers, so the registers
; become 0 and the slots hold the old halves; a sub/sbb pair then computes
; 0 - value across both halves.
;
; @uses     None (T0,T1)
BEGINPROC   iemAImpl_negate_T0_T1_u32
        push    0                       ; slot for the high half
        push    0                       ; slot for the low half
        xchg    T0_32, [xSP]            ; T0 = 0, slot = old low half
        xchg    T1_32, [xSP + xCB]      ; T1 = 0, slot = old high half
        sub     T0_32, [xSP]            ; low half: 0 - old low, borrow in CF
        sbb     T1_32, [xSP + xCB]      ; high half: 0 - old high - borrow
        add     xSP, xCB*2              ; release both slots
        ret
ENDPROC     iemAImpl_negate_T0_T1_u32
2021
%ifdef RT_ARCH_AMD64
;;
; Worker that negates the double-width value held in T1:T0
; (T0 = low half, T1 = high half).  64-bit hosts only.
;
; Two zeroed stack slots are swapped with the registers, so the registers
; become 0 and the slots hold the old halves; a sub/sbb pair then computes
; 0 - value across both halves.
;
; @uses     None (T0,T1)
BEGINPROC   iemAImpl_negate_T0_T1_u64
        push    0                       ; slot for the high half
        push    0                       ; slot for the low half
        xchg    T0, [xSP]               ; T0 = 0, slot = old low half
        xchg    T1, [xSP + xCB]         ; T1 = 0, slot = old high half
        sub     T0, [xSP]               ; low half: 0 - old low, borrow in CF
        sbb     T1, [xSP + xCB]         ; high half: 0 - old high - borrow
        add     xSP, xCB*2              ; release both slots
        ret
ENDPROC     iemAImpl_negate_T0_T1_u64
%endif
2037
2038
2039	;;
2040	; Macro for implementing a division operations.
2041	;
2042	; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2043	; 32-bit system where the 64-bit accesses requires hand coding.
2044	;
2045	; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2046	; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2047	; pointer to eflags in A3.
2048	;
2049	; The functions all return 0 on success and -1 if a divide error should be
2050	; raised by the caller.
2051	;
2052	; @param 1 The instruction mnemonic.
2053	; @param 2 The modified flags.
2054	; @param 3 The undefined flags.
2055	; @param 4 1 if signed, 0 if unsigned.
2056	; @param 5 Function suffix.
2057	; @param 6 EFLAGS variation: 0 for native, 1 for intel (ignored),
2058	; 2 for AMD (set AF, clear PF, ZF and SF).
2059	;
2060	; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2061	;
%macro IEMIMPL_DIV_OP 6
;
; Macro parameters, as they are used by the code below:
;       1    - The division instruction (div / idiv).
;       2, 3 - EFLAGS masks handed on to IEM_MAYBE_LOAD_FLAGS and IEM_SAVE_FLAGS.
;       4    - Zero selects the unsigned overflow check, non-zero the signed one.
;       5    - Suffix appended to the generated function names.
;       6    - EFLAGS flavour; 2 applies the AMD adjustment instead of saving flags.
;
; Every generated function returns 0 in eax on success and -1 when the divisor
; is zero or the quotient would not fit.  In the -1 case the instruction is
; never executed and nothing is written back through the pointers.
;
; Signed overflow check: the magnitudes of dividend and divisor are compared.
; The most negative dividend has no positive counterpart (negating it yields
; the same bit pattern) and always overflows since its magnitude is at least
; 2^(result-width + 1) times any possible divisor magnitude... it is therefore
; rejected right after the negation, before the magnitude compare that would
; otherwise lose its top bit.
;
BEGINCODE

;
; 8-bit: A0 = pointer to the 16-bit AX value, A1 = divisor,
;        A2 = operand for the IEM_*_FLAGS macros (presumably the EFLAGS pointer).
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
        PROLOGUE_3_ARGS

        ; div by chainsaw check.
        test    A1_8, A1_8
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A0 + 1], A1_8          ; Overflows when AH >= divisor.
        jae     .div_overflow
 %else
        mov     T0_16, [A0]             ; T0 = dividend
        mov     T1, A1                  ; T1 = saved divisor (because of missing T1_8 in 32-bit)
        test    A1_8, A1_8
        js      .divisor_negative
        test    T0_16, T0_16
        jns     .both_positive
        neg     T0_16
        js      .div_overflow           ; neg 0x8000 = 0x8000: no magnitude, always overflows.
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_16, 7
        cmp     T0_8, A1_8
        pop     T0                      ; (pop leaves the flags of the cmp intact)
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_8, 0x7f              ; Special case for covering (divisor - 1).
        cmp     T0_8, A1_8
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A1_8                    ; A1_8 = |divisor| (0x80 stays 0x80 = 128 unsigned)
        test    T0_16, T0_16
        jns     .one_of_each
        neg     T0_16
        js      .div_overflow           ; neg 0x8000 = 0x8000: no magnitude, always overflows.
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_16, 7
        cmp     T0_8, A1_8
        jae     .div_overflow
.div_no_overflow:
        mov     A1, T1                  ; restore divisor
 %endif

        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     ax, [A0]
        %1      A1_8
        mov     [A0], ax
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS       A2, %2, %3
 %endif
        xor     eax, eax

.return:
        EPILOGUE_3_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5

;
; 16-bit: A0 = pointer to AX, A1 = pointer to DX, A2 = divisor,
;         A3 = operand for the IEM_*_FLAGS macros.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_16, A2_16
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_16             ; Overflows when DX >= divisor.
        jae     .div_overflow
 %else
        mov     T0_16, [A1]
        shl     T0_32, 16
        mov     T0_16, [A0]             ; T0 = dividend (DX:AX)
        mov     T1, A2                  ; T1 = divisor
        test    T1_16, T1_16
        js      .divisor_negative
        test    T0_32, T0_32
        jns     .both_positive
        neg     T0_32
        js      .div_overflow           ; neg 0x80000000 = 0x80000000: always overflows.
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_32, 15
        cmp     T0_16, T1_16
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_16, 0x7fff           ; Special case for covering (divisor - 1).
        cmp     T0_16, T1_16
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     T1_16                   ; T1_16 = |divisor|
        test    T0_32, T0_32
        jns     .one_of_each
        neg     T0_32
        js      .div_overflow           ; neg 0x80000000 = 0x80000000: always overflows.
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_32, 15
        cmp     T0_16, T1_16
        jae     .div_overflow
.div_no_overflow:
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; The divisor is moved to T1 before dx is loaded.
        mov     ax, [A0]
        mov     dx, [A1]
        %1      T1_16
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                  ; The DX pointer is moved to T1 before dx is loaded.
        mov     ax, [A0]
        mov     dx, [T1]
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS       A3, %2, %3
 %endif
        xor     eax, eax

.return:
        EPILOGUE_4_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5

;
; 32-bit: A0 = pointer to EAX, A1 = pointer to EDX, A2 = divisor,
;         A3 = operand for the IEM_*_FLAGS macros.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_32, A2_32
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_32             ; Overflows when EDX >= divisor.
        jae     .div_overflow
 %else
        push    A2                      ; save A2 so we modify it (we out of regs on x86).
        mov     T0_32, [A0]             ; T0 = dividend low
        mov     T1_32, [A1]             ; T1 = dividend high
        test    A2_32, A2_32
        js      .divisor_negative
        test    T1_32, T1_32
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u32)
        test    T1_32, T1_32            ; Still negative (assuming the helper negates T1:T0 in place)
        js      .div_overflow           ;   means the dividend was -2^63, which always overflows.
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32            ; T1 = dividend magnitude >> 31
        cmp     T1_32, A2_32
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_32, 0x7fffffff       ; Special case for covering (divisor - 1).
        cmp     T0_32, A2_32
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2_32                   ; A2_32 = |divisor|
        test    T1_32, T1_32
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u32)
        test    T1_32, T1_32            ; Same -2^63 special case as above.
        js      .div_overflow
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        jae     .div_overflow
.div_no_overflow:
        pop     A2
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; The divisor is moved to T1 before edx is loaded.
        mov     eax, [A0]
        mov     edx, [A1]
        %1      T1_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                  ; The EDX pointer is moved to T1 before edx is loaded.
        mov     eax, [A0]
        mov     edx, [T1]
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS       A3, %2, %3
 %endif
        xor     eax, eax

.return:
        EPILOGUE_4_ARGS

.div_overflow:
 %if %4 != 0
        pop     A2                      ; Drop the divisor saved by the signed check.
 %endif
.div_zero:
        mov     eax, -1
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
;
; 64-bit: A0 = pointer to RAX, A1 = pointer to RDX, A2 = divisor,
;         A3 = operand for the IEM_*_FLAGS macros.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2, A2
        jz      .div_zero

        ; Overflow check, same scheme as the narrower variants.
 %if %4 == 0
        cmp     [A1], A2                ; Overflows when RDX >= divisor.
        jae     .div_overflow
 %else
        push    A2                      ; save A2 so we modify it (we out of regs on x86).
        mov     T0, [A0]                ; T0 = dividend low
        mov     T1, [A1]                ; T1 = dividend high
        test    A2, A2
        js      .divisor_negative
        test    T1, T1
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u64)
        test    T1, T1                  ; Still negative (assuming the helper negates T1:T0 in place)
        js      .div_overflow           ;   means the dividend was -2^127, which always overflows.
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0                  ; T1 = dividend magnitude >> 63
        cmp     T1, A2
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        mov     T1, 0x7fffffffffffffff
        and     T0, T1                  ; Special case for covering (divisor - 1).
        cmp     T0, A2
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2                      ; A2 = |divisor|
        test    T1, T1
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u64)
        test    T1, T1                  ; Same -2^127 special case as above.
        js      .div_overflow
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        jae     .div_overflow
.div_no_overflow:
        pop     A2
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; The divisor is moved to T1 before rdx is loaded.
        mov     rax, [A0]
        mov     rdx, [A1]
        %1      T1
        mov     [A0], rax
        mov     [A1], rdx
 %else
        mov     T1, A1                  ; The RDX pointer is moved to T1 before rdx is loaded.
        mov     rax, [A0]
        mov     rdx, [T1]
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS       A3, %2, %3
 %endif
        xor     eax, eax

.return:
        EPILOGUE_4_ARGS_EX 12

.div_overflow:
 %if %4 != 0
        pop     A2                      ; Drop the divisor saved by the signed check.
 %endif
.div_zero:
        mov     eax, -1
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_DIV_OP div,  0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0,       , 0
IEMIMPL_DIV_OP div,  0, 0,                                                                             0, _intel, 1
IEMIMPL_DIV_OP div,  0, 0,                                                                             0, _amd,   2
IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1,       , 0
IEMIMPL_DIV_OP idiv, 0, 0,                                                                             1, _intel, 1
IEMIMPL_DIV_OP idiv, 0, 0,                                                                             1, _amd,   2
2386
2387
;;
; Generates a helper that executes one memory fence instruction and returns.
;
; The generated function takes no arguments and produces no return value.
;
; @param    1       The fence instruction; it also becomes the name suffix.
;
%macro IEMIMPL_MEM_FENCE 1
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
        %1
        ret
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_MEM_FENCE lfence
IEMIMPL_MEM_FENCE sfence
IEMIMPL_MEM_FENCE mfence
2406
;;
; Memory fence substitute for hosts without SSE2.
;
; Pushes xAX, exchanges it with the stack slot just written (xchg with a
; memory operand is implicitly locked by the CPU) and then drops the slot
; again.  xAX and xSP hold their original values on return.
;
BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
        push    xAX                     ; Create a scratch slot holding xAX.
        xchg    xAX, [xSP]              ; Locked exchange with the identical value.
        add     xSP, xCB                ; Release the scratch slot.
        ret
ENDPROC iemAImpl_alt_mem_fence
2416
2417
;;
; Prepares the host FPU for executing the emulated instruction by loading
; selected parts of the guest control and status words.
;
; The current environment is stored at [xSP], patched and loaded back:
;   - FCW: the guest's exception mask, precision and rounding control bits.
;   - FSW: the host's TOP field combined with the guest's condition code bits.
;
; @uses     The FNSTENV image at [xSP], which the caller must have reserved
;           (28 bytes in the 32-bit protected mode layout).  Clobbers T0, T1.
; @param    1       Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
        fnstenv [xSP]

        ; FCW - for exception, precision and rounding control.
        movzx   T0, word [%1 + X86FXSTATE.FCW]
        and     T0, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx   T1, word [%1 + X86FXSTATE.FSW]
        and     T1, X86_FSW_C_MASK              ; guest condition codes
        movzx   T0, word [xSP + X86FSTENV32P.FSW]
        and     T0, X86_FSW_TOP_MASK            ; host TOP
        or      T0, T1
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        fldenv  [xSP]
%endmacro
2443
2444
;;
; Prepares the host FPU for executing the emulated instruction by loading
; selected parts of the guest control and status words, and additionally
; marks the top register as empty in the tag word when the guest's ST0 is
; empty.
;
; ASSUMES actual TOP=7
;
; @uses     The FNSTENV image at [xSP], which the caller must have reserved
;           (28 bytes in the 32-bit protected mode layout).  Clobbers T0, T1.
; @param    1       Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
        fnstenv [xSP]

        ; FCW - for exception, precision and rounding control.
        movzx   T0_32, word [%1 + X86FXSTATE.FCW]
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK           ; guest condition codes
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK         ; host TOP
        or      T0_32, T1_32
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        ; FTW - Only for ST0 (in/out).
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        shr     T1_32, X86_FSW_TOP_SHIFT
        and     T1_32, X86_FSW_TOP_SMASK        ; T1 = guest TOP, i.e. the register backing ST0
        bt      [%1 + X86FXSTATE.FTW], T1_16    ; Empty if FTW bit is clear. Fixed register order.
        jc      %%st0_not_empty
        or      word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3
%%st0_not_empty:

        fldenv  [xSP]
%endmacro
2482
2483
;;
; Result of an FPU operation: one 80-bit value followed by the status word.
;
; @todo Need to move this as well somewhere better?
;
struc IEMFPURESULT
    .r80Result      resw 5              ; 80-bit result value (10 bytes).
    .FSW            resw 1              ; Output FPU status word.
endstruc
2491
2492
;;
; Result of an FPU operation producing two 80-bit values; the status word
; sits between the two values.
;
; @todo Need to move this as well somewhere better?
;
struc IEMFPURESULTTWO
    .r80Result1     resw 5              ; First 80-bit result value (10 bytes).
    .FSW            resw 1              ; Output FPU status word.
    .r80Result2     resw 5              ; Second 80-bit result value (10 bytes).
endstruc
2501
2502
2503	;
2504	;---------------------- 16-bit signed integer operations ----------------------
2505	;
2506
2507
;;
; Converts a 16-bit signed integer to an 80-bit floating point value (fild).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 16-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Room for the FPU environment image.

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    word [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear the exception flags before the waiting fstp.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave the host FPU in its initial state.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i16
2531
2532
;;
; Stores an 80-bit floating point value (register) as a 16-bit signed
; integer in memory (fistp).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 16-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the FPU environment image.

        fninit
        fld     tword [A3]              ; Loaded before the guest FCW/FSW bits are applied.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   word [A2]

        fnstsw  word  [A1]

        fninit                          ; Leave the host FPU in its initial state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i16
2556
2557
;;
; Stores an 80-bit floating point value (register) as a 16-bit signed
; integer in memory, using the truncating store (fisttp).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 16-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the FPU environment image.

        fninit
        fld     tword [A3]              ; Loaded before the guest FCW/FSW bits are applied.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  word [A2]

        fnstsw  word  [A1]

        fninit                          ; Leave the host FPU in its initial state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i16
2582
2583
;;
; Generates a helper for an FPU instruction taking ST0 and a 16-bit signed
; integer memory operand, returning both the result value and FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the FPU environment image.

        fninit
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear the exception flags before the waiting fstp.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave the host FPU in its initial state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16 fiadd
IEMIMPL_FPU_R80_BY_I16 fimul
IEMIMPL_FPU_R80_BY_I16 fisub
IEMIMPL_FPU_R80_BY_I16 fisubr
IEMIMPL_FPU_R80_BY_I16 fidiv
IEMIMPL_FPU_R80_BY_I16 fidivr
2620
2621
;;
; Generates a helper for an FPU instruction taking ST0 and a 16-bit signed
; integer memory operand, returning only FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the FPU environment image.

        fninit
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]

        fnstsw  word  [A1]

        fninit                          ; Leave the host FPU in its initial state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16_FSW ficom
2652
2653
2654
2655	;
2656	;---------------------- 32-bit signed integer operations ----------------------
2657	;
2658
2659
;;
; Converts a 32-bit signed integer to an 80-bit floating point value (fild).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 32-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Room for the FPU environment image.

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    dword [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear the exception flags before the waiting fstp.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave the host FPU in its initial state.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i32
2683
2684
;;
; Stores an 80-bit floating point value (register) as a 32-bit signed
; integer in memory (fistp).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 32-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the FPU environment image.

        fninit
        fld     tword [A3]              ; Loaded before the guest FCW/FSW bits are applied.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   dword [A2]

        fnstsw  word  [A1]

        fninit                          ; Leave the host FPU in its initial state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i32
2708
2709
;;
; Stores an 80-bit floating point value (register) as a 32-bit signed
; integer in memory, using the truncating store (fisttp).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 32-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the FPU environment image.

        fninit
        fld     tword [A3]              ; Loaded before the guest FCW/FSW bits are applied.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  dword [A2]

        fnstsw  word  [A1]

        fninit                          ; Leave the host FPU in its initial state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i32
2734
2735
;;
; Generates a helper for an FPU instruction taking ST0 and a 32-bit signed
; integer memory operand, returning both the result value and FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the FPU environment image.

        fninit
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear the exception flags before the waiting fstp.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave the host FPU in its initial state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32 fiadd
IEMIMPL_FPU_R80_BY_I32 fimul
IEMIMPL_FPU_R80_BY_I32 fisub
IEMIMPL_FPU_R80_BY_I32 fisubr
IEMIMPL_FPU_R80_BY_I32 fidiv
IEMIMPL_FPU_R80_BY_I32 fidivr
2772
2773
;;
; Generates a helper for an FPU instruction taking ST0 and a 32-bit signed
; integer memory operand, returning only FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the FPU environment image.

        fninit
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word  [A1]

        fninit                          ; Leave the host FPU in its initial state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32_FSW ficom
2804
2805
2806
2807	;
2808	;---------------------- 64-bit signed integer operations ----------------------
2809	;
2810
2811
;;
; Converts a 64-bit signed integer to an 80-bit floating point value (fild).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 64-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Room for the FPU environment image.

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    qword [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear the exception flags before the waiting fstp.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave the host FPU in its initial state.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i64
2835
2836
;;
; Stores an 80-bit floating point value (register) as a 64-bit signed
; integer in memory (fistp).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 64-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the FPU environment image.

        fninit
        fld     tword [A3]              ; Loaded before the guest FCW/FSW bits are applied.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   qword [A2]

        fnstsw  word  [A1]

        fninit                          ; Leave the host FPU in its initial state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i64
2860
2861
;;
; Stores an 80-bit floating point value (register) as a 64-bit signed
; integer in memory, using the truncating store (fisttp).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 64-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the FPU environment image.

        fninit
        fld     tword [A3]              ; Loaded before the guest FCW/FSW bits are applied.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  qword [A2]

        fnstsw  word  [A1]

        fninit                          ; Leave the host FPU in its initial state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i64
2886
2887
2888
2889	;
2890	;---------------------- 32-bit floating point operations ----------------------
2891	;
2892
;;
; Loads a 32-bit floating point value from memory, widening it to the 80-bit
; register format (fld).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 32-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Room for the FPU environment image.

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     dword [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear the exception flags before the waiting fstp.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave the host FPU in its initial state.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r32
2916
2917
;;
; Stores an 80-bit floating point value (register) as a 32-bit floating
; point value in memory (fst).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 32-bit value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the FPU environment image.

        fninit
        fld     tword [A3]              ; Loaded before the guest FCW/FSW bits are applied.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fst     dword [A2]              ; Non-popping store; the fninit below resets the stack.

        fnstsw  word  [A1]

        fninit                          ; Leave the host FPU in its initial state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r32
2941
2942
;;
; Generates a helper for an FPU instruction taking ST0 and a 32-bit floating
; point memory operand, returning both the result value and FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the FPU environment image.

        fninit
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear the exception flags before the waiting fstp.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave the host FPU in its initial state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32 fadd
IEMIMPL_FPU_R80_BY_R32 fmul
IEMIMPL_FPU_R80_BY_R32 fsub
IEMIMPL_FPU_R80_BY_R32 fsubr
IEMIMPL_FPU_R80_BY_R32 fdiv
IEMIMPL_FPU_R80_BY_R32 fdivr
2979
2980
;;
; Generates a helper for an FPU instruction taking ST0 and a 32-bit floating
; point memory operand, returning only FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the FPU environment image.

        fninit
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word  [A1]

        fninit                          ; Leave the host FPU in its initial state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32_FSW fcom
3011
3012
3013
3014	;
3015	;---------------------- 64-bit floating point operations ----------------------
3016	;
3017
;;
; Loads a 64-bit floating point value from memory, widening it to the 80-bit
; register format (fld).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 64-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Room for the FPU environment image.

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     qword [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear the exception flags before the waiting fstp.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave the host FPU in its initial state.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r64
3041
3042
;;
; Stores an 80-bit floating point value (register) as a 64-bit floating
; point value in memory (fst).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 64-bit value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the FPU environment image.

        fninit
        fld     tword [A3]              ; Loaded before the guest FCW/FSW bits are applied.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fst     qword [A2]              ; Non-popping store; the fninit below resets the stack.

        fnstsw  word  [A1]

        fninit                          ; Leave the host FPU in its initial state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r64
3066
3067
;;
; Generates a helper for an FPU instruction taking ST0 and a 64-bit floating
; point memory operand, returning both the result value and FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the FPU environment image.

        fninit
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear the exception flags before the waiting fstp.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave the host FPU in its initial state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64 fadd
IEMIMPL_FPU_R80_BY_R64 fmul
IEMIMPL_FPU_R80_BY_R64 fsub
IEMIMPL_FPU_R80_BY_R64 fsubr
IEMIMPL_FPU_R80_BY_R64 fdiv
IEMIMPL_FPU_R80_BY_R64 fdivr
3104
;;
; Generates a helper for an FPU instruction taking ST0 and a 64-bit floating
; point memory operand, returning only FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the FPU environment image.

        fninit
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]

        fnstsw  word  [A1]

        fninit                          ; Leave the host FPU in its initial state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64_FSW fcom
3135
3136
3137
3138	;
3139	;---------------------- 80-bit floating point operations ----------------------
3140	;
3141
;;
; Loads an 80-bit floating point value from memory into a register (fld).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit floating point value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Room for the FPU environment image.

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     tword [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear the exception flags before the waiting fstp.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave the host FPU in its initial state.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r80
3165
3166
;;
; Stores an 80-bit floating point register value to memory (fstp).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 80-bit value.
; @param    A3      Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the FPU environment image.

        fninit
        fld     tword [A3]              ; Loaded before the guest FCW/FSW bits are applied.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fstp    tword [A2]

        fnstsw  word  [A1]

        fninit                          ; Leave the host FPU in its initial state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r80
3190
3191
;;
; Loads an 80-bit packed BCD value from memory, converting it to the 80-bit
; floating point register format (fbld).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit BCD value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Room for the FPU environment image.

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fbld    tword [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear the exception flags before the waiting fstp.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave the host FPU in its initial state.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_d80
3215
3216
;;
; Stores an 80-bit floating point register value to memory in packed BCD
; format (fbstp).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 80-bit BCD value.
; @param    A3      Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the FPU environment image.

        fninit
        fld     tword [A3]              ; Loaded before the guest FCW/FSW bits are applied.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fbstp   tword [A2]

        fnstsw  word  [A1]

        fninit                          ; Leave the host FPU in its initial state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_d80
3240
3241
;;
; Generates a helper for an FPU instruction working on two 80-bit floating
; point values, with the result taken from ST0 afterwards.
;
; @param    1       The instruction
; @param    2       The operand list for the instruction (may be empty).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the first 80-bit value (ST0)
; @param    A3      Pointer to the second 80-bit value (STn).
;
%macro IEMIMPL_FPU_R80_BY_R80 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the FPU environment image.

        fninit
        fld     tword [A3]              ; Pushed first, so it ends up in ST1.
        fld     tword [A2]              ; ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      %2

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear the exception flags before the waiting fstp.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave the host FPU in its initial state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80 fadd,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fmul,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsub,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsubr,  {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdiv,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdivr,  {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fprem,  {}
IEMIMPL_FPU_R80_BY_R80 fprem1, {}
IEMIMPL_FPU_R80_BY_R80 fscale, {}
3282
3283
;;
; Generates a helper for an FPU instruction working on two 80-bit floating
; point values, ST1 and ST0, which stores its result in ST1 and pops the
; stack.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the first 80-bit value (ST1).
; @param    A3      Pointer to the second 80-bit value (ST0).
;
%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the FPU environment image.

        fninit
        fld     tword [A2]              ; Pushed first, so it ends up in ST1.
        fld     tword [A3]              ; ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear the exception flags before the waiting fstp.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave the host FPU in its initial state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
3319
3320
;;
; Generates a helper for an FPU instruction working on two 80-bit floating
; point values and returning only FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a uint16_t for the resulting FSW.
; @param    A2      Pointer to the first 80-bit value.
; @param    A3      Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the FPU environment image.

        fninit
        fld     tword [A3]              ; Pushed first, so it ends up in ST1.
        fld     tword [A2]              ; ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st0, st1

        fnstsw  word  [A1]

        fninit                          ; Leave the host FPU in its initial state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_FSW fcom
IEMIMPL_FPU_R80_BY_R80_FSW fucom
3353
3354
;;
; Generates a helper for an FPU instruction working on two 80-bit floating
; point values, returning FSW via pointer and the resulting EFLAGS in eax.
;
; @param    1       The instruction
;
; @returns  EFLAGS in EAX.
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a uint16_t for the resulting FSW.
; @param    A2      Pointer to the first 80-bit value.
; @param    A3      Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Room for the FPU environment image.

        fninit
        fld     tword [A3]              ; Pushed first, so it ends up in ST1.
        fld     tword [A2]              ; ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st1

        fnstsw  word  [A1]
        pushf                           ; Capture the flags left by the instruction...
        pop     xAX                     ; ...as the return value.

        fninit                          ; Leave the host FPU in its initial state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_EFL fcomi
IEMIMPL_FPU_R80_BY_R80_EFL fucomi
3390
3391
3392	;;
3393	; FPU instruction working on one 80-bit floating point value.
3394	;
3395	; @param 1 The instruction (expanded without operands, i.e. it works on ST0).
3396	;
3397	; @param A0 FPU context (fxsave).
3398	; @param A1 Pointer to a IEMFPURESULT for the output (FSW and r80Result are written).
3399	; @param A2 Pointer to the 80-bit value.
3400	;
3401	%macro IEMIMPL_FPU_R80 1
3402	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
3403	PROLOGUE_3_ARGS
3404	sub xSP, 20h ; 32 bytes of stack scratch; presumably used by the FXSTATE load macro below - confirm.
3405
3406	fninit ; Start from the default FPU state with an empty register stack.
3407	fld tword [A2] ; ST0 = input value.
3408	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Going by the name: applies FCW and a sanitized FSW taken from A0 (defined elsewhere).
3409	%1
3410
3411	fnstsw word [A1 + IEMFPURESULT.FSW] ; Record the status before it is cleared below.
3412	fnclex ; Clear exception flags; presumably so the store below cannot trip over a pending one.
3413	fstp tword [A1 + IEMFPURESULT.r80Result] ; Pop the result into the output structure.
3414
3415	fninit ; Reset the FPU again before returning.
3416	add xSP, 20h
3417	EPILOGUE_3_ARGS
3418	ENDPROC iemAImpl_ %+ %1 %+ _r80
3419	%endmacro
3420
3421	IEMIMPL_FPU_R80 fchs
3422	IEMIMPL_FPU_R80 fabs
3423	IEMIMPL_FPU_R80 f2xm1
3424	IEMIMPL_FPU_R80 fsqrt
3425	IEMIMPL_FPU_R80 frndint
3426	IEMIMPL_FPU_R80 fsin
3427	IEMIMPL_FPU_R80 fcos
3428
3429
3430	;;
3431	; FPU instruction working on one 80-bit floating point value, only
3432	; returning FSW.
3433	;
3434	; @param 1 The instruction (expanded without operands, i.e. it works on ST0).
3435	; @param 2 Non-zero to also restore FTW (selects the ..._AND_FTW_0 state load macro).
3436	;
3437	; @param A0 FPU context (fxsave).
3438	; @param A1 Pointer to a uint16_t for the resulting FSW.
3439	; @param A2 Pointer to the 80-bit value.
3440	;
3441	%macro IEMIMPL_FPU_R80_FSW 2
3442	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
3443	PROLOGUE_3_ARGS
3444	sub xSP, 20h ; 32 bytes of stack scratch; presumably used by the FXSTATE load macros below - confirm.
3445
3446	fninit ; Start from the default FPU state with an empty register stack.
3447	fld tword [A2] ; ST0 = input value.
3448	%if %2 != 0
3449	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0 ; Variant that, going by the name, also restores the tag for register 0.
3450	%else
3451	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Going by the name: applies FCW and a sanitized FSW taken from A0 (defined elsewhere).
3452	%endif
3453	%1
3454
3455	fnstsw word [A1] ; The status word is the only output; ST0 is not stored.
3456
3457	fninit ; Reset the FPU again before returning.
3458	add xSP, 20h
3459	EPILOGUE_3_ARGS
3460	ENDPROC iemAImpl_ %+ %1 %+ _r80
3461	%endmacro
3462
3463	IEMIMPL_FPU_R80_FSW ftst, 0
3464	IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.
3465
3466
3467
3468	;;
3469	; FPU instruction loading a 80-bit floating point constant.
3470	;
3471	; @param 1 The instruction (expanded without operands; it pushes the constant onto the FPU stack).
3472	;
3473	; @param A0 FPU context (fxsave).
3474	; @param A1 Pointer to a IEMFPURESULT for the output (FSW and r80Result are written).
3475	;
3476	%macro IEMIMPL_FPU_R80_CONST 1
3477	BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
3478	PROLOGUE_2_ARGS
3479	sub xSP, 20h ; 32 bytes of stack scratch; presumably used by the FXSTATE load macro below - confirm.
3480
3481	fninit ; Start from the default FPU state with an empty register stack.
3482	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Going by the name: applies FCW and a sanitized FSW taken from A0 (defined elsewhere).
3483	%1
3484
3485	fnstsw word [A1 + IEMFPURESULT.FSW] ; Record the status before it is cleared below.
3486	fnclex ; Clear exception flags; presumably so the store below cannot trip over a pending one.
3487	fstp tword [A1 + IEMFPURESULT.r80Result] ; Pop the constant into the output structure.
3488
3489	fninit ; Reset the FPU again before returning.
3490	add xSP, 20h
3491	EPILOGUE_2_ARGS
3492	ENDPROC iemAImpl_ %+ %1 ; Same name expression as BEGINPROC_FASTCALL above (no dangling '%+').
3493	%endmacro
3494
3495	IEMIMPL_FPU_R80_CONST fld1
3496	IEMIMPL_FPU_R80_CONST fldl2t
3497	IEMIMPL_FPU_R80_CONST fldl2e
3498	IEMIMPL_FPU_R80_CONST fldpi
3499	IEMIMPL_FPU_R80_CONST fldlg2
3500	IEMIMPL_FPU_R80_CONST fldln2
3501	IEMIMPL_FPU_R80_CONST fldz
3502
3503
3504	;;
3505	; FPU instruction working on one 80-bit floating point value, outputting two.
3506	;
3507	; @param 1 The instruction (expanded without operands; leaves two values on the FPU stack).
3508	;
3509	; @param A0 FPU context (fxsave).
3510	; @param A1 Pointer to a IEMFPURESULTTWO for the output (FSW, r80Result1 and r80Result2 are written).
3511	; @param A2 Pointer to the 80-bit value.
3512	;
3513	%macro IEMIMPL_FPU_R80_R80 1
3514	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
3515	PROLOGUE_3_ARGS
3516	sub xSP, 20h ; 32 bytes of stack scratch; presumably used by the FXSTATE load macro below - confirm.
3517
3518	fninit ; Start from the default FPU state with an empty register stack.
3519	fld tword [A2] ; ST0 = input value.
3520	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Going by the name: applies FCW and a sanitized FSW taken from A0 (defined elsewhere).
3521	%1
3522
3523	fnstsw word [A1 + IEMFPURESULTTWO.FSW] ; Record the status before it is cleared below.
3524	fnclex ; Clear exception flags before each store; presumably so the stores cannot trip over a pending one.
3525	fstp tword [A1 + IEMFPURESULTTWO.r80Result2] ; The top of stack (ST0) is stored as the second result ...
3526	fnclex
3527	fstp tword [A1 + IEMFPURESULTTWO.r80Result1] ; ... and the value beneath it as the first.
3528
3529	fninit ; Reset the FPU again before returning.
3530	add xSP, 20h
3531	EPILOGUE_3_ARGS
3532	ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
3533	%endmacro
3534
3535	IEMIMPL_FPU_R80_R80 fptan
3536	IEMIMPL_FPU_R80_R80 fxtract
3537	IEMIMPL_FPU_R80_R80 fsincos
3538
3539
3540
3541
3542	;---------------------- SSE and MMX Operations ----------------------
3543
3544	;; @todo what do we need to do for MMX? (Both MMX hooks currently expand to nothing.)
3545	%macro IEMIMPL_MMX_PROLOGUE 0
3546	%endmacro
3547	%macro IEMIMPL_MMX_EPILOGUE 0
3548	%endmacro
3549
3550	;; @todo what do we need to do for SSE? (Both SSE hooks currently expand to nothing.)
3551	%macro IEMIMPL_SSE_PROLOGUE 0
3552	%endmacro
3553	%macro IEMIMPL_SSE_EPILOGUE 0
3554	%endmacro
3555
3556	;; @todo what do we need to do for AVX? (Both AVX hooks currently expand to nothing.)
3557	%macro IEMIMPL_AVX_PROLOGUE 0
3558	%endmacro
3559	%macro IEMIMPL_AVX_EPILOGUE 0
3560	%endmacro
3561
3562
3563	;;
3564	; Media instruction working on two full sized registers.
3565	;
3566	; Emits iemAImpl_<insn>_u128 and, when parameter 2 is non-zero, iemAImpl_<insn>_u64.
3567	;
3568	; @param 1 The instruction
3569	; @param 2 Whether there is an MMX variant (1) or not (0).
3570	;
3571	; @param A0 FPU context (fxsave). Not referenced by the generated code.
3572	; @param A1 Pointer to the first media register size operand (input/output).
3573	; @param A2 Pointer to the second media register size operand (input).
3574	;
3575	%macro IEMIMPL_MEDIA_F2 2
3576	%if %2 != 0
3577	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
3578	PROLOGUE_3_ARGS
3579	IEMIMPL_MMX_PROLOGUE
3580
3581	movq mm0, [A1] ; mm0 = destination operand.
3582	movq mm1, [A2] ; mm1 = source operand.
3583	%1 mm0, mm1
3584	movq [A1], mm0 ; Write the result back over the first operand.
3585
3586	IEMIMPL_MMX_EPILOGUE
3587	EPILOGUE_3_ARGS
3588	ENDPROC iemAImpl_ %+ %1 %+ _u64
3589	%endif
3590
3591	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
3592	PROLOGUE_3_ARGS
3593	IEMIMPL_SSE_PROLOGUE
3594
3595	movdqu xmm0, [A1] ; Unaligned loads: no alignment is assumed for either operand.
3596	movdqu xmm1, [A2]
3597	%1 xmm0, xmm1
3598	movdqu [A1], xmm0 ; Write the result back over the first operand.
3599
3600	IEMIMPL_SSE_EPILOGUE
3601	EPILOGUE_3_ARGS
3602	ENDPROC iemAImpl_ %+ %1 %+ _u128
3603	%endmacro
3604
3605	IEMIMPL_MEDIA_F2 pshufb, 1
3606	IEMIMPL_MEDIA_F2 pand, 1
3607	IEMIMPL_MEDIA_F2 pandn, 1
3608	IEMIMPL_MEDIA_F2 por, 1
3609	IEMIMPL_MEDIA_F2 pxor, 1
3610	IEMIMPL_MEDIA_F2 pcmpeqb, 1
3611	IEMIMPL_MEDIA_F2 pcmpeqw, 1
3612	IEMIMPL_MEDIA_F2 pcmpeqd, 1
3613	IEMIMPL_MEDIA_F2 pcmpeqq, 0
3614	IEMIMPL_MEDIA_F2 pcmpgtb, 1
3615	IEMIMPL_MEDIA_F2 pcmpgtw, 1
3616	IEMIMPL_MEDIA_F2 pcmpgtd, 1
3617	IEMIMPL_MEDIA_F2 pcmpgtq, 0
3618	IEMIMPL_MEDIA_F2 paddb, 1
3619	IEMIMPL_MEDIA_F2 paddw, 1
3620	IEMIMPL_MEDIA_F2 paddd, 1
3621	IEMIMPL_MEDIA_F2 paddq, 1
3622	IEMIMPL_MEDIA_F2 psubb, 1
3623	IEMIMPL_MEDIA_F2 psubw, 1
3624	IEMIMPL_MEDIA_F2 psubd, 1
3625	IEMIMPL_MEDIA_F2 psubq, 1
3624
3625
3626	;;
3627	; Media instruction working on two full sized registers, but no FXSAVE state argument.
3628	;
3629	; Emits iemAImpl_<insn>_u128 and, when parameter 2 is non-zero, iemAImpl_<insn>_u64.
3630	;
3631	; @param 1 The instruction
3632	; @param 2 Whether there is an MMX variant (1) or not (0).
3633	;
3634	; @param A0 Pointer to the first media register size operand (input/output).
3635	; @param A1 Pointer to the second media register size operand (input).
3636	;
3637	%macro IEMIMPL_MEDIA_OPT_F2 2
3638	%if %2 != 0
3639	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
3640	PROLOGUE_2_ARGS
3641	IEMIMPL_MMX_PROLOGUE
3642
3643	movq mm0, [A0] ; mm0 = destination operand.
3644	movq mm1, [A1] ; mm1 = source operand.
3645	%1 mm0, mm1
3646	movq [A0], mm0 ; Write the result back over the first operand.
3647
3648	IEMIMPL_MMX_EPILOGUE
3649	EPILOGUE_2_ARGS
3650	ENDPROC iemAImpl_ %+ %1 %+ _u64
3651	%endif
3652
3653	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
3654	PROLOGUE_2_ARGS
3655	IEMIMPL_SSE_PROLOGUE
3656
3657	movdqu xmm0, [A0] ; Unaligned loads: no alignment is assumed for either operand.
3658	movdqu xmm1, [A1]
3659	%1 xmm0, xmm1
3660	movdqu [A0], xmm0 ; Write the result back over the first operand.
3661
3662	IEMIMPL_SSE_EPILOGUE
3663	EPILOGUE_2_ARGS
3664	ENDPROC iemAImpl_ %+ %1 %+ _u128
3665	%endmacro
3666
3667	IEMIMPL_MEDIA_OPT_F2 packsswb, 1
3668	IEMIMPL_MEDIA_OPT_F2 packssdw, 1
3669	IEMIMPL_MEDIA_OPT_F2 packuswb, 1
3670	IEMIMPL_MEDIA_OPT_F2 packusdw, 0
3669
3670
3671	;;
3672	; Media instruction working on one full sized and one half sized register (lower half).
3673	;
3674	; Emits iemAImpl_<insn>_u128 and, when parameter 2 is non-zero, iemAImpl_<insn>_u64.
3675	;
3676	; @param 1 The instruction
3677	; @param 2 1 if MMX is included, 0 if not.
3678	;
3679	; @param A0 Pointer to the first full sized media register operand (input/output).
3680	; @param A1 Pointer to the second half sized media register operand (input).
3681	; Note that the code below reads a full register width from A1.
3682	;
3683	%macro IEMIMPL_MEDIA_F1L1 2
3684	%if %2 != 0
3685	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
3686	PROLOGUE_2_ARGS
3687	IEMIMPL_MMX_PROLOGUE
3688
3689	movq mm0, [A0] ; mm0 = destination operand.
3690	movq mm1, [A1] ; mm1 = source operand (all 64 bits are loaded).
3691	%1 mm0, mm1
3692	movq [A0], mm0 ; Write the result back over the first operand.
3693
3694	IEMIMPL_MMX_EPILOGUE
3695	EPILOGUE_2_ARGS
3696	ENDPROC iemAImpl_ %+ %1 %+ _u64
3697	%endif
3698
3699	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
3700	PROLOGUE_2_ARGS
3701	IEMIMPL_SSE_PROLOGUE
3702
3703	movdqu xmm0, [A0] ; xmm0 = destination operand.
3704	movdqu xmm1, [A1] ; xmm1 = source operand (all 128 bits are loaded).
3705	%1 xmm0, xmm1
3706	movdqu [A0], xmm0 ; Write the result back over the first operand.
3707
3708	IEMIMPL_SSE_EPILOGUE
3709	EPILOGUE_2_ARGS
3710	ENDPROC iemAImpl_ %+ %1 %+ _u128
3711	%endmacro
3712
3713	IEMIMPL_MEDIA_F1L1 punpcklbw, 1
3714	IEMIMPL_MEDIA_F1L1 punpcklwd, 1
3715	IEMIMPL_MEDIA_F1L1 punpckldq, 1
3716	IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
3714
3715
3716	;;
3717	; Media instruction working two half sized input registers (lower half) and a full sized
3718	; destination register (vpunpckl*).
3719	;
3720	; Emits iemAImpl_<insn>_u128 and iemAImpl_<insn>_u256.
3721	;
3722	; @param 1 The instruction
3723	;
3724	; @param A0 Pointer to the destination register (full sized, output only).
3725	; @param A1 Pointer to the first full sized media source register operand, where we
3726	; will only use the lower half as input - but we'll be loading it in full.
3727	; @param A2 Pointer to the second full sized media source register operand, where we
3728	; will only use the lower half as input - but we'll be loading it in full.
3729	;
3730	%macro IEMIMPL_MEDIA_F1L1L1 1
3731	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
3732	PROLOGUE_3_ARGS
3733	IEMIMPL_AVX_PROLOGUE
3734
3735	vmovdqu xmm0, [A1] ; First source, loaded in full.
3736	vmovdqu xmm1, [A2] ; Second source, loaded in full.
3737	%1 xmm0, xmm0, xmm1
3738	vmovdqu [A0], xmm0 ; The destination is written only, never read.
3739
3740	IEMIMPL_AVX_EPILOGUE ; Pairs with the prologue hook above.
3741	EPILOGUE_3_ARGS
3742	ENDPROC iemAImpl_ %+ %1 %+ _u128
3743
3744	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
3745	PROLOGUE_3_ARGS
3746	IEMIMPL_AVX_PROLOGUE
3747
3748	vmovdqu ymm0, [A1] ; First source, loaded in full.
3749	vmovdqu ymm1, [A2] ; Second source, loaded in full.
3750	%1 ymm0, ymm0, ymm1
3751	vmovdqu [A0], ymm0 ; The destination is written only, never read.
3752
3753	IEMIMPL_AVX_EPILOGUE ; Pairs with the prologue hook above.
3754	EPILOGUE_3_ARGS
3755	ENDPROC iemAImpl_ %+ %1 %+ _u256
3756	%endmacro
3757
3758	IEMIMPL_MEDIA_F1L1L1 vpunpcklbw
3759	IEMIMPL_MEDIA_F1L1L1 vpunpcklwd
3760	IEMIMPL_MEDIA_F1L1L1 vpunpckldq
3761	IEMIMPL_MEDIA_F1L1L1 vpunpcklqdq
3760
3761
3762	;;
3763	; Media instruction working on one full sized and one half sized register (high half).
3764	;
3765	; Thin alias: the generated code is identical to IEMIMPL_MEDIA_F1L1, since both
3766	; operands are loaded in full and the instruction itself selects the half it uses.
3767	;
3768	; @param 1 The instruction
3769	; @param 2 1 if MMX is included, 0 if not.
3770	;
3771	; @param A0 Pointer to the first full sized media register operand (input/output).
3772	; @param A1 Pointer to the second full sized media register operand, where we
3773	; will only use the upper half as input - but we'll load it in full.
3774	;
3775	%macro IEMIMPL_MEDIA_F1H1 2
3776	IEMIMPL_MEDIA_F1L1 %1, %2
3777	%endmacro
3778
3779	IEMIMPL_MEDIA_F1H1 punpckhbw, 1
3780	IEMIMPL_MEDIA_F1H1 punpckhwd, 1
3781	IEMIMPL_MEDIA_F1H1 punpckhdq, 1
3782	IEMIMPL_MEDIA_F1H1 punpckhqdq, 0
3780
3781
3782	;;
3783	; Media instruction working two half sized input registers (high half) and a full sized
3784	; destination register (vpunpckh*).
3785	;
3786	; Thin alias for IEMIMPL_MEDIA_F1L1L1: it emits the same _u128 and _u256 functions,
3787	; and the instruction itself selects which half of each source it uses.
3788	;
3789	; @param 1 The instruction
3790	;
3791	; @param A0 Pointer to the destination register (full sized, output only).
3792	; @param A1 Pointer to the first full sized media source register operand, where we
3793	; will only use the upper half as input - but we'll be loading it in full.
3794	; @param A2 Pointer to the second full sized media source register operand, where we
3795	; will only use the upper half as input - but we'll be loading it in full.
3796	;
3797	%macro IEMIMPL_MEDIA_F1H1H1 1
3798	IEMIMPL_MEDIA_F1L1L1 %1
3799	%endmacro
3800
3801	IEMIMPL_MEDIA_F1H1H1 vpunpckhbw
3802	IEMIMPL_MEDIA_F1H1H1 vpunpckhwd
3803	IEMIMPL_MEDIA_F1H1H1 vpunpckhdq
3804	IEMIMPL_MEDIA_F1H1H1 vpunpckhqdq
3802
3803
3804	;
3805	; Shufflers with evil 8-bit immediates.
3806	;
3807	; The immediate cannot be encoded at assembly time, so each function carries a table of
3808	; 256 fixed-size '<insn> imm; ret' stubs and calls the stub selected by the immediate.
3809	;
3810	; @param A0 Pointer to the 64-bit destination (output only).
3811	; @param A1 Pointer to the 64-bit source.
3812	; @param A2 The 8-bit immediate; only the low byte is used.
3813	;
3814	; NOTE(review): the fastcall size below says 16 bytes while the prologue handles three
3815	; arguments - confirm against the C prototype (affects 32-bit decoration / 'ret N' only).
3816	;
3817
3818	BEGINPROC_FASTCALL iemAImpl_pshufw_u64, 16
3819	PROLOGUE_3_ARGS
3820	IEMIMPL_MMX_PROLOGUE
3821
	and A2, 0ffh ; Keep the index inside the 256-entry table whatever the upper register bits hold.
3822	movq mm1, [A1]
3823	movq mm0, mm1 ; paranoia!
3824	lea T0, [A2 + A2*4] ; sizeof(pshufw+ret) == 5
3825	lea T1, [.imm0 xWrtRIP]
3826	lea T1, [T1 + T0] ; T1 = address of the stub for this immediate.
3827	call T1
3828	movq [A0], mm0
3829
3830	IEMIMPL_MMX_EPILOGUE
3831	EPILOGUE_3_ARGS
3832	%assign bImm 0
3833	%rep 256
3834	.imm %+ bImm:
3835	pshufw mm0, mm1, bImm
3836	ret
3837	%assign bImm bImm + 1
3838	%endrep
3839	.immEnd: ; 256*5 == 0x500
3840	dw 0xfaff + (.immEnd - .imm0) ; will cause warning if entries are too big.
3841	dw 0x104ff - (.immEnd - .imm0) ; will cause warning if entries are too small.
3842	ENDPROC iemAImpl_pshufw_u64
3833
3834
3835	;;
	; SSE shuffle taking an 8-bit immediate, dispatched through a table of 256
	; '<insn> xmm0, xmm1, imm; ret' stubs of 6 bytes each. Emits iemAImpl_<insn>_u128.
	;
	; @param 1 The instruction
	;
	; @param A0 Pointer to the 128-bit destination (output only).
	; @param A1 Pointer to the 128-bit source.
	; @param A2 The 8-bit immediate; only the low byte is used.
	;
	; NOTE(review): the fastcall size below says 16 bytes while the prologue handles three
	; arguments - confirm against the C prototype (affects 32-bit decoration / 'ret N' only).
	;
	%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
3836	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
3837	PROLOGUE_3_ARGS
3838	IEMIMPL_SSE_PROLOGUE
3839
	and A2, 0ffh ; Keep the index inside the 256-entry table whatever the upper register bits hold.
3840	movdqu xmm1, [A1]
3841	movdqu xmm0, xmm1 ; paranoia!
3842	lea T1, [.imm0 xWrtRIP]
3843	lea T0, [A2 + A2*2] ; sizeof(pshufXX+ret) == 6: (A2 * 3) * 2
3844	lea T1, [T1 + T0*2] ; T1 = address of the stub for this immediate.
3845	call T1
3846	movdqu [A0], xmm0
3847
3848	IEMIMPL_SSE_EPILOGUE
3849	EPILOGUE_3_ARGS
3850	%assign bImm 0
3851	%rep 256
3852	.imm %+ bImm:
3853	%1 xmm0, xmm1, bImm
3854	ret
3855	%assign bImm bImm + 1
3856	%endrep
3857	.immEnd: ; 256*6 == 0x600
3858	dw 0xf9ff + (.immEnd - .imm0) ; will cause warning if entries are too big.
3859	dw 0x105ff - (.immEnd - .imm0) ; will cause warning if entries are too small.
3860	ENDPROC iemAImpl_ %+ %1 %+ _u128
3861	%endmacro
3862
3863	IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
3864	IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
3865	IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
3866
3867
3868	;;
	; AVX shuffle taking an 8-bit immediate, dispatched through a table of 256
	; '<insn> ymm0, ymm1, imm; ret' stubs of 6 bytes each. Emits iemAImpl_<insn>_u256.
	;
	; @param 1 The instruction
	;
	; @param A0 Pointer to the 256-bit destination (output only).
	; @param A1 Pointer to the 256-bit source.
	; @param A2 The 8-bit immediate; only the low byte is used.
	;
	; NOTE(review): the fastcall size below says 16 bytes while the prologue handles three
	; arguments - confirm against the C prototype (affects 32-bit decoration / 'ret N' only).
	;
	%macro IEMIMPL_MEDIA_AVX_VPSHUFXX 1
3869	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
3870	PROLOGUE_3_ARGS
3871	IEMIMPL_AVX_PROLOGUE
3872
	and A2, 0ffh ; Keep the index inside the 256-entry table whatever the upper register bits hold.
3873	vmovdqu ymm1, [A1]
3874	vmovdqu ymm0, ymm1 ; paranoia!
3875	lea T1, [.imm0 xWrtRIP]
3876	lea T0, [A2 + A2*2] ; sizeof(vpshufXX+ret) == 6: (A2 * 3) * 2
3877	lea T1, [T1 + T0*2] ; T1 = address of the stub for this immediate.
3878	call T1
3879	vmovdqu [A0], ymm0
3880
3881	IEMIMPL_AVX_EPILOGUE
3882	EPILOGUE_3_ARGS
3883	%assign bImm 0
3884	%rep 256
3885	.imm %+ bImm:
3886	%1 ymm0, ymm1, bImm
3887	ret
3888	%assign bImm bImm + 1
3889	%endrep
3890	.immEnd: ; 256*6 == 0x600
3891	dw 0xf9ff + (.immEnd - .imm0) ; will cause warning if entries are too big.
3892	dw 0x105ff - (.immEnd - .imm0) ; will cause warning if entries are too small.
3893	ENDPROC iemAImpl_ %+ %1 %+ _u256
3894	%endmacro
3895
3896	IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufhw
3897	IEMIMPL_MEDIA_AVX_VPSHUFXX vpshuflw
3898	IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufd
3899
3900
3901	;
3902	; Move byte mask.
3903	;
3904	; MMX variant.
	;
	; @param A0 Pointer to the destination; 64 bits are written on every architecture.
	; @param A1 Pointer to the 64-bit source.
	;
3905	BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 8
3906	PROLOGUE_2_ARGS
3907	IEMIMPL_MMX_PROLOGUE
3908
3909	movq mm1, [A1]
3910	pmovmskb T0, mm1 ; T0 = one mask bit per source byte, upper bits zero.
3911	mov [A0], T0 ; Stores a full T0 register.
3912	%ifdef RT_ARCH_X86
3913	mov dword [A0 + 4], 0 ; On 32-bit hosts the high dword of the destination is zeroed separately.
3914	%endif
3915	IEMIMPL_MMX_EPILOGUE
3916	EPILOGUE_2_ARGS
3917	ENDPROC iemAImpl_pmovmskb_u64
3918
3919	BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 8 ; A0 = pointer to the destination (64 bits written), A1 = pointer to the 128-bit source.
3920	PROLOGUE_2_ARGS
3921	IEMIMPL_SSE_PROLOGUE
3922
3923	movdqu xmm1, [A1] ; Unaligned load: no alignment is assumed for the source.
3924	pmovmskb T0, xmm1 ; T0 = one mask bit per source byte, upper bits zero.
3925	mov [A0], T0 ; Stores a full T0 register.
3926	%ifdef RT_ARCH_X86
3927	mov dword [A0 + 4], 0 ; On 32-bit hosts the high dword of the destination is zeroed separately.
3928	%endif
3929	IEMIMPL_SSE_EPILOGUE
3930	EPILOGUE_2_ARGS
3931	ENDPROC iemAImpl_pmovmskb_u128
3932
3933	BEGINPROC_FASTCALL iemAImpl_vpmovmskb_u256, 8 ; A0 = pointer to the destination (64 bits written), A1 = pointer to the 256-bit source.
3934	PROLOGUE_2_ARGS
3935	IEMIMPL_AVX_PROLOGUE
3936
3937	vmovdqu ymm1, [A1] ; Unaligned load: no alignment is assumed for the source.
3938	vpmovmskb T0, ymm1 ; T0 = one mask bit per source byte, upper bits zero.
3939	mov [A0], T0 ; Stores a full T0 register.
3940	%ifdef RT_ARCH_X86
3941	mov dword [A0 + 4], 0 ; On 32-bit hosts the high dword of the destination is zeroed separately.
3942	%endif
3943	IEMIMPL_AVX_EPILOGUE
3944	EPILOGUE_2_ARGS
3945	ENDPROC iemAImpl_vpmovmskb_u256
3946
3947
3948	;;
3949	; Media instruction working on two full sized source registers and one destination (AVX).
3950	;
3951	; Emits iemAImpl_<insn>_u128 and iemAImpl_<insn>_u256.
3952	;
3953	; @param 1 The instruction
3954	;
3955	; @param A0 Pointer to the extended CPU/FPU state (X86XSAVEAREA). Not referenced by the generated code.
3956	; @param A1 Pointer to the destination media register size operand (output).
3957	; @param A2 Pointer to the first source media register size operand (input).
3958	; @param A3 Pointer to the second source media register size operand (input).
3959	;
3960	%macro IEMIMPL_MEDIA_F3 1
3961	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
3962	PROLOGUE_4_ARGS
3963	IEMIMPL_AVX_PROLOGUE
3964
3965	vmovdqu xmm0, [A2] ; First source.
3966	vmovdqu xmm1, [A3] ; Second source.
3967	%1 xmm0, xmm0, xmm1
3968	vmovdqu [A1], xmm0 ; The destination is written only, never read.
3969
3970	IEMIMPL_AVX_EPILOGUE ; Pairs with the prologue hook above.
3971	EPILOGUE_4_ARGS
3972	ENDPROC iemAImpl_ %+ %1 %+ _u128
3973
3974	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
3975	PROLOGUE_4_ARGS
3976	IEMIMPL_AVX_PROLOGUE
3977
3978	vmovdqu ymm0, [A2] ; First source.
3979	vmovdqu ymm1, [A3] ; Second source.
3980	%1 ymm0, ymm0, ymm1
3981	vmovdqu [A1], ymm0 ; The destination is written only, never read.
3982
3983	IEMIMPL_AVX_EPILOGUE ; Pairs with the prologue hook above.
3984	EPILOGUE_4_ARGS
3985	ENDPROC iemAImpl_ %+ %1 %+ _u256
3986	%endmacro
3987
3988	IEMIMPL_MEDIA_F3 vpshufb
3989	IEMIMPL_MEDIA_F3 vpand
3990	IEMIMPL_MEDIA_F3 vpandn
3991	IEMIMPL_MEDIA_F3 vpor
3992	IEMIMPL_MEDIA_F3 vpxor
3993	IEMIMPL_MEDIA_F3 vpcmpeqb
3994	IEMIMPL_MEDIA_F3 vpcmpeqw
3995	IEMIMPL_MEDIA_F3 vpcmpeqd
3996	IEMIMPL_MEDIA_F3 vpcmpeqq
3997	IEMIMPL_MEDIA_F3 vpcmpgtb
3998	IEMIMPL_MEDIA_F3 vpcmpgtw
3999	IEMIMPL_MEDIA_F3 vpcmpgtd
4000	IEMIMPL_MEDIA_F3 vpcmpgtq
4001	IEMIMPL_MEDIA_F3 vpaddb
4002	IEMIMPL_MEDIA_F3 vpaddw
4003	IEMIMPL_MEDIA_F3 vpaddd
4004	IEMIMPL_MEDIA_F3 vpaddq
4005	IEMIMPL_MEDIA_F3 vpsubb
4006	IEMIMPL_MEDIA_F3 vpsubw
4007	IEMIMPL_MEDIA_F3 vpsubd
4008	IEMIMPL_MEDIA_F3 vpsubq
4007
4008
4009	;;
4010	; Media instruction working on two full sized source registers and one destination (AVX),
4011	; but no XSAVE state pointer argument.
4012	;
4013	; Emits iemAImpl_<insn>_u128 and iemAImpl_<insn>_u256.
4014	;
4015	; @param 1 The instruction
4016	;
4017	; @param A0 Pointer to the destination media register size operand (output).
4018	; @param A1 Pointer to the first source media register size operand (input).
4019	; @param A2 Pointer to the second source media register size operand (input).
4020	;
4021	%macro IEMIMPL_MEDIA_OPT_F3 1
4022	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4023	PROLOGUE_3_ARGS
4024	IEMIMPL_AVX_PROLOGUE
4025
4026	vmovdqu xmm0, [A1] ; First source.
4027	vmovdqu xmm1, [A2] ; Second source.
4028	%1 xmm0, xmm0, xmm1
4029	vmovdqu [A0], xmm0 ; The destination is written only, never read.
4030
4031	IEMIMPL_AVX_EPILOGUE ; Pairs with the prologue hook above.
4032	EPILOGUE_3_ARGS
4033	ENDPROC iemAImpl_ %+ %1 %+ _u128
4034
4035	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
4036	PROLOGUE_3_ARGS
4037	IEMIMPL_AVX_PROLOGUE
4038
4039	vmovdqu ymm0, [A1] ; First source.
4040	vmovdqu ymm1, [A2] ; Second source.
4041	%1 ymm0, ymm0, ymm1
4042	vmovdqu [A0], ymm0 ; The destination is written only, never read.
4043
4044	IEMIMPL_AVX_EPILOGUE ; Pairs with the prologue hook above.
4045	EPILOGUE_3_ARGS
4046	ENDPROC iemAImpl_ %+ %1 %+ _u256
4047	%endmacro
4048
4049	IEMIMPL_MEDIA_OPT_F3 vpacksswb
4050	IEMIMPL_MEDIA_OPT_F3 vpackssdw
4051	IEMIMPL_MEDIA_OPT_F3 vpackuswb
4052	IEMIMPL_MEDIA_OPT_F3 vpackusdw
4051
4052
4053	;
4054	; The SSE 4.2 crc32
4055	;
4056	; @param A0 Pointer to the 32-bit destination (holds the running CRC on input).
4057	; @param A1 The source operand, sized according to the suffix.
4058	;
4059	BEGINPROC_FASTCALL iemAImpl_crc32_u8, 8 ; 8-bit source variant.
4060	PROLOGUE_2_ARGS
4061
4062	mov T0_32, [A0] ; Load the current 32-bit CRC value from the destination.
4063	crc32 T0_32, A1_8 ; Accumulate the low byte of the source argument.
4064	mov [A0], T0_32 ; Store the updated CRC back.
4065
4066	EPILOGUE_2_ARGS
4067	ENDPROC iemAImpl_crc32_u8
4068
4069	BEGINPROC_FASTCALL iemAImpl_crc32_u16, 8 ; 16-bit source variant.
4070	PROLOGUE_2_ARGS
4071
4072	mov T0_32, [A0] ; Load the current 32-bit CRC value from the destination.
4073	crc32 T0_32, A1_16 ; Accumulate the low word of the source argument.
4074	mov [A0], T0_32 ; Store the updated CRC back.
4075
4076	EPILOGUE_2_ARGS
4077	ENDPROC iemAImpl_crc32_u16
4078
4079	BEGINPROC_FASTCALL iemAImpl_crc32_u32, 8 ; 32-bit source variant.
4080	PROLOGUE_2_ARGS
4081
4082	mov T0_32, [A0] ; Load the current 32-bit CRC value from the destination.
4083	crc32 T0_32, A1_32 ; Accumulate the low dword of the source argument.
4084	mov [A0], T0_32 ; Store the updated CRC back.
4085
4086	EPILOGUE_2_ARGS
4087	ENDPROC iemAImpl_crc32_u32
4088
4089	%ifdef RT_ARCH_AMD64 ; The 64-bit source variant is only built for 64-bit hosts.
4090	BEGINPROC_FASTCALL iemAImpl_crc32_u64, 8
4091	PROLOGUE_2_ARGS
4092
4093	mov T0_32, [A0] ; Load the current 32-bit CRC value; the 32-bit write also zeroes the upper half of T0.
4094	crc32 T0, A1 ; Accumulate all 64 bits of the source argument.
4095	mov [A0], T0_32 ; Only the low 32 bits are stored back to the destination.
4096
4097	EPILOGUE_2_ARGS
4098	ENDPROC iemAImpl_crc32_u64
4099	%endif
4100
4101
4102	;
4103	; PTEST (SSE 4.1)
4104	;
4105	; @param A0 Pointer to the first source operand (aka readonly destination).
4106	; @param A1 Pointer to the second source operand.
4107	; @param A2 Pointer to the EFLAGS register.
4108	;
4109	BEGINPROC_FASTCALL iemAImpl_ptest_u128, 12
4110	PROLOGUE_3_ARGS
4111	IEMIMPL_SSE_PROLOGUE
4112
4113	movdqu xmm0, [A0] ; Unaligned loads: no alignment is assumed for either operand.
4114	movdqu xmm1, [A1]
4115	ptest xmm0, xmm1 ; Only sets flags; neither register is modified.
4116	IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0 ; Presumably merges the resulting status flags into the EFLAGS value at A2 (macro defined elsewhere) - confirm.
4117
4118	IEMIMPL_SSE_EPILOGUE
4119	EPILOGUE_3_ARGS
4120	ENDPROC iemAImpl_ptest_u128
4121
4122	BEGINPROC_FASTCALL iemAImpl_vptest_u256, 12 ; 256-bit VPTEST: A0/A1 = pointers to the two source operands, A2 = pointer to EFLAGS.
4123	PROLOGUE_3_ARGS
4124	IEMIMPL_AVX_PROLOGUE
4125
4126	vmovdqu ymm0, [A0] ; Unaligned loads: no alignment is assumed for either operand.
4127	vmovdqu ymm1, [A1]
4128	vptest ymm0, ymm1 ; Only sets flags; neither register is modified.
4129	IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0 ; Presumably merges the resulting status flags into the EFLAGS value at A2 (macro defined elsewhere) - confirm.
4130
4131	IEMIMPL_AVX_EPILOGUE
4132	EPILOGUE_3_ARGS
4133	ENDPROC iemAImpl_vptest_u256
4134

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 95578

Download in other formats: