VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 95341

Last change on this file since 95341 was 95341, checked in by vboxsync, 2 years ago

VMM/IEM: Implemented the BLSR, BLSMSK and BLSI instructions. bugref:9898

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 103.4 KB
Line 
1; $Id: IEMAllAImpl.asm 95341 2022-06-22 10:37:37Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2022 Oracle Corporation
8;
9; This file is part of VirtualBox Open Source Edition (OSE), as
10; available from http://www.virtualbox.org. This file is free software;
11; you can redistribute it and/or modify it under the terms of the GNU
12; General Public License (GPL) as published by the Free Software
13; Foundation, in version 2 as it comes in the "COPYING" file of the
14; VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15; hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16;
17
18
19;*********************************************************************************************************************************
20;* Header Files *
21;*********************************************************************************************************************************
22%include "VBox/asmdefs.mac"
23%include "VBox/err.mac"
24%include "iprt/x86.mac"
25
26
27;*********************************************************************************************************************************
28;* Defined Constants And Macros *
29;*********************************************************************************************************************************
30
;;
; RET XX / RET wrapper for fastcall.
;
; On 32-bit Windows the fastcall callee pops its own stack arguments, so the
; argument byte count is emitted with the RET.  Everywhere else a plain RET
; is used (32-bit non-Windows and all 64-bit ABIs are caller-cleanup here).
;
; @param 1      Number of argument bytes to pop on return (used on x86/Windows only).
;
%macro RET_FASTCALL 1
%ifdef RT_ARCH_X86
 %ifdef RT_OS_WINDOWS
        ret     %1                      ; fastcall: callee cleans up the stack arguments.
 %else
        ret                             ; caller cleans up.
 %endif
%else
        ret                             ; AMD64: caller cleans up.
%endif
%endmacro
45
;;
; NAME for fastcall functions.
;
; Defaults to the plain NAME() mangling; on 32-bit Windows it is redefined to
; the fastcall decoration <prefix><name>@<cbArgs>.
;
;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
; escaping (or whatever the dollar is good for here).  Thus the ugly
; prefix argument.
;
%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
%ifdef RT_ARCH_X86
 %ifdef RT_OS_WINDOWS
  %undef NAME_FASTCALL
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
 %endif
%endif
60
;;
; BEGINPROC for fastcall functions.
;
; Emits the export/global directives appropriate for the target object format
; and then the (decorated) function label.
;
; @param 1      The function name (C).
; @param 2      The argument size on x86.
;
%macro BEGINPROC_FASTCALL 2
 %ifdef ASM_FORMAT_PE
        export %1=NAME_FASTCALL(%1,%2,$@)               ; PE: export under the decorated name.
 %endif
 %ifdef __NASM__
  %ifdef ASM_FORMAT_OMF
        export NAME(%1) NAME_FASTCALL(%1,%2,$@)         ; OMF needs this with NASM.
  %endif
 %endif
 %ifndef ASM_FORMAT_BIN
        global NAME_FASTCALL(%1,%2,$@)                  ; make the symbol linkable.
 %endif
NAME_FASTCALL(%1,%2,@):
%endmacro
81
82
;
; We employ some macro assembly here to hide the calling convention differences.
;
; PROLOGUE_N_ARGS / EPILOGUE_N_ARGS pairs hide the per-ABI argument fetching
; and register preservation, and A0..A3 / T0..T2 name the argument and
; temporary registers for each convention:
;   - AMD64: all four arguments arrive in registers, so the prologues are
;     empty and the epilogues are a plain RET.
;   - x86 (fastcall): A0/A1 arrive in ecx/edx, A2/A3 are loaded from the
;     stack into ebx/esi (callee-saved, hence the pushes), and the _EX
;     epilogues pop the given number of stack argument bytes.
;
%ifdef RT_ARCH_AMD64
 %macro PROLOGUE_1_ARGS 0
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_1_ARGS_EX 0
        ret
 %endmacro

 %macro PROLOGUE_2_ARGS 0
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_3_ARGS 0
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_4_ARGS 0
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
        ret
 %endmacro

 %ifdef ASM_CALL64_GCC
  ; System V AMD64: arguments in rdi, rsi, rdx, rcx.
  %define A0        rdi
  %define A0_32     edi
  %define A0_16     di
  %define A0_8      dil

  %define A1        rsi
  %define A1_32     esi
  %define A1_16     si
  %define A1_8      sil

  %define A2        rdx
  %define A2_32     edx
  %define A2_16     dx
  %define A2_8      dl

  %define A3        rcx
  %define A3_32     ecx
  %define A3_16     cx
 %endif

 %ifdef ASM_CALL64_MSC
  ; Microsoft x64: arguments in rcx, rdx, r8, r9.
  %define A0        rcx
  %define A0_32     ecx
  %define A0_16     cx
  %define A0_8      cl

  %define A1        rdx
  %define A1_32     edx
  %define A1_16     dx
  %define A1_8      dl

  %define A2        r8
  %define A2_32     r8d
  %define A2_16     r8w
  %define A2_8      r8b

  %define A3        r9
  %define A3_32     r9d
  %define A3_16     r9w
 %endif

 ; Temporary/scratch registers (volatile in both 64-bit conventions).
 %define T0         rax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 %define T1         r11
 %define T1_32      r11d
 %define T1_16      r11w
 %define T1_8       r11b

 %define T2         r10                 ; only AMD64
 %define T2_32      r10d
 %define T2_16      r10w
 %define T2_8       r10b

%else
 ; x86
 ; edi/ebx/esi are callee-saved, so the prologues push whatever the function
 ; uses and the epilogues restore them before popping the stack arguments.
 %macro PROLOGUE_1_ARGS 0
        push    edi
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_1_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro

 %macro PROLOGUE_2_ARGS 0
        push    edi
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro

 %macro PROLOGUE_3_ARGS 0
        push    ebx
        mov     ebx, [esp + 4 + 4]      ; A2 = first stack argument (above ret addr + saved ebx).
        push    edi
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
  %if (%1) < 4
   %error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
  %endif
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        EPILOGUE_3_ARGS_EX 4
 %endmacro

 %macro PROLOGUE_4_ARGS 0
        push    ebx
        push    edi
        push    esi
        mov     ebx, [esp + 12 + 4 + 0] ; A2 = first stack argument (above ret addr + 3 saved regs).
        mov     esi, [esp + 12 + 4 + 4] ; A3 = second stack argument.
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
  %if (%1) < 8
   %error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
  %endif
        pop     esi
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        EPILOGUE_4_ARGS_EX 8
 %endmacro

 ; x86 fastcall: A0/A1 in ecx/edx, A2/A3 loaded into ebx/esi by the prologues.
 %define A0         ecx
 %define A0_32      ecx
 %define A0_16      cx
 %define A0_8       cl

 %define A1         edx
 %define A1_32      edx
 %define A1_16      dx
 %define A1_8       dl

 %define A2         ebx
 %define A2_32      ebx
 %define A2_16      bx
 %define A2_8       bl

 %define A3         esi
 %define A3_32      esi
 %define A3_16      si
 ; Note: no A3_8 - esi has no 8-bit alias on x86.

 %define T0         eax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 %define T1         edi
 %define T1_32      edi
 %define T1_16      di
%endif
271
272
;;
; Load the relevant flags from [%1] if there are undefined flags (%3).
;
; Merges the guest's modified+undefined flag subset into the host EFLAGS so
; the emulated instruction executes with the guest's flag state as input.
; (The %if guard is currently commented out, so the merge always happens.)
;
; @remarks      Clobbers T0, stack. Changes EFLAGS.
; @param 1      The parameter (A0..A3) holding the pointer to the eflags.
; @param 2      The set of modified flags.
; @param 3      The set of undefined flags.
;
%macro IEM_MAYBE_LOAD_FLAGS 3
 ;%if (%3) != 0
        pushf                           ; store current flags
        mov     T0_32, [%1]             ; load the guest flags
        and     dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
        and     T0_32, (%2 | %3)        ; select the modified and undefined flags.
        or      [xSP], T0               ; merge guest flags with host flags.
        popf                            ; load the mixed flags.
 ;%endif
%endmacro
292
;;
; Update the flag.
;
; Captures the host EFLAGS produced by the emulated instruction and merges
; the modified/undefined subset back into the guest eflags at [%1].
;
; @remarks      Clobbers T0, T1, stack.
; @param 1      The register pointing to the EFLAGS.
; @param 2      The mask of modified flags to save.
; @param 3      The mask of undefined flags to (maybe) save.
;
%macro IEM_SAVE_FLAGS 3
 %if (%2 | %3) != 0
        pushf
        pop     T1                      ; T1 = host flags after the instruction.
        mov     T0_32, [%1]             ; flags
        and     T0_32, ~(%2 | %3)       ; clear the modified & undefined flags.
        and     T1_32, (%2 | %3)        ; select the modified and undefined flags.
        or      T0_32, T1_32            ; combine the flags.
        mov     [%1], T0_32             ; save the flags.
 %endif
%endmacro
312
;;
; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
;
; Like IEM_SAVE_FLAGS, but additionally forces the %3 bits clear and the %4
; bits set in the saved guest eflags.
;
; @remarks      Clobbers T0, T1, stack.
; @param 1      The register pointing to the EFLAGS.
; @param 2      The mask of modified flags to save.
; @param 3      Mask of additional flags to always clear.
; @param 4      Mask of additional flags to always set.
;
%macro IEM_SAVE_AND_ADJUST_FLAGS 4
 %if (%2 | %3 | %4) != 0
        pushf
        pop     T1                      ; T1 = host flags after the instruction.
        mov     T0_32, [%1]             ; load flags.
        and     T0_32, ~(%2 | %3)       ; clear the modified and always cleared flags.
        and     T1_32, (%2)             ; select the modified flags.
        or      T0_32, T1_32            ; combine the flags.
  %if (%4) != 0
        or      T0_32, %4               ; add the always set flags.
  %endif
        mov     [%1], T0_32             ; save the result.
 %endif
%endmacro
336
;;
; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
; signed input (%4[%5]) and parity index (%6).
;
; This is used by MUL and IMUL, where we got result (%4 & %6) in xAX which is
; also T0. So, we have to use T1 for the EFLAGS calculation and save T0/xAX
; while we extract the %2 flags from the CPU EFLAGS or use T2 (only AMD64).
;
; @remarks      Clobbers T0, T1, stack, %6, EFLAGS.
; @param 1      The register pointing to the EFLAGS.
; @param 2      The mask of modified flags to save.
; @param 3      Mask of additional flags to always clear.
; @param 4      The result register to set SF by.
; @param 5      The width of the %4 register in bits (8, 16, 32, or 64).
; @param 6      The (full) register containing the parity table index. Will be modified!

%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF 6
 %ifdef RT_ARCH_AMD64
        pushf
        pop     T2                      ; T2 = host flags (T0/xAX holds the result).
 %else
        push    T0                      ; no T2 on x86, so borrow T0 for the host flags...
        pushf
        pop     T0
 %endif
        mov     T1_32, [%1]             ; load flags.
        and     T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; clear the modified, always cleared flags and the two flags we calc.
 %ifdef RT_ARCH_AMD64
        and     T2_32, (%2)             ; select the modified flags.
        or      T1_32, T2_32            ; combine the flags.
 %else
        and     T0_32, (%2)             ; select the modified flags.
        or      T1_32, T0_32            ; combine the flags.
        pop     T0                      ; ...and restore the saved result value.
 %endif

        ; First calculate SF as it's likely to be referring to the same register as %6 does.
        bt      %4, %5 - 1              ; CF = sign bit of the result.
        jnc     %%sf_clear
        or      T1_32, X86_EFL_SF
 %%sf_clear:

        ; Parity last.
        and     %6, 0xff                ; index by the low result byte only.
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T1_8, [T2 + %6]
 %else
        or      T1_8, [NAME(g_afParity) + %6]
 %endif

        mov     [%1], T1_32             ; save the result.
%endmacro
390
;;
; Calculates the new EFLAGS using fixed clear and set bit masks.
;
; Does not look at the host EFLAGS at all, only applies the constant clear
; and set masks to the guest eflags at [%1].
;
; @remarks      Clobbers T0.
; @param 1      The register pointing to the EFLAGS.
; @param 2      Mask of additional flags to always clear.
; @param 3      Mask of additional flags to always set.
;
%macro IEM_ADJUST_FLAGS 3
 %if (%2 | %3) != 0
        mov     T0_32, [%1]             ; Load flags.
  %if (%2) != 0
        and     T0_32, ~(%2)            ; Remove the always cleared flags.
  %endif
  %if (%3) != 0
        or      T0_32, %3               ; Add the always set flags.
  %endif
        mov     [%1], T0_32             ; Save the result.
 %endif
%endmacro
411
;;
; Calculates the new EFLAGS using fixed clear and set bit masks.
;
; Same as IEM_ADJUST_FLAGS but additionally computes PF from the low byte of
; the value in %4 via the g_afParity lookup table.
;
; @remarks      Clobbers T0, %4, EFLAGS; also T2 on AMD64 (table address).
; @param 1      The register pointing to the EFLAGS.
; @param 2      Mask of additional flags to always clear.
; @param 3      Mask of additional flags to always set.
; @param 4      The (full) register containing the parity table index. Will be modified!
;
%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
        mov     T0_32, [%1]             ; Load flags.
        and     T0_32, ~(%2 | X86_EFL_PF) ; Remove PF and the always cleared flags.
 %if (%3) != 0
        or      T0_32, %3               ; Add the always set flags.
 %endif
        and     %4, 0xff                ; Index by the low byte only.
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T0_8, [T2 + %4]
 %else
        or      T0_8, [NAME(g_afParity) + %4]
 %endif
        mov     [%1], T0_32             ; Save the result.
%endmacro
436
437
438;*********************************************************************************************************************************
439;* External Symbols *
440;*********************************************************************************************************************************
441extern NAME(g_afParity)
442
443
;;
; Macro for implementing a binary operator.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; @param 1      The instruction mnemonic.
; @param 2      Non-zero if there should be a locked version.
; @param 3      The modified flags.
; @param 4      The undefined flags.
;
%macro IEMIMPL_BIN_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      byte [A0], A1_8         ; do the operation directly on the memory operand.
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8            ; 64-bit value takes 8 argument bytes on the (x86) stack.
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 byte [A0], A1_8         ; same as above, but with a LOCK prefix.
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

; instr,lock, modified-flags, undefined flags
IEMIMPL_BIN_OP add,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP adc,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sub,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sbb,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP or,   1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP xor,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP and,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP cmp,  0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP test, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
543
544
;;
; Macro for implementing a binary operator, VEX variant with separate input/output.
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the first source register operand in A1, the second source register operand
; in A2 and a pointer to eflags in A3.
;
; @param 1      The instruction mnemonic.
; @param 2      The modified flags.
; @param 3      The undefined flags.
;
%macro IEMIMPL_VEX_BIN_OP 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        %1      T0_32, A1_32, A2_32     ; 3-operand VEX form: result goes to T0.
        mov     [A0], T0_32
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        %1      T0, A1, A2
        mov     [A0], T0
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

; instr, modified-flags, undefined-flags
IEMIMPL_VEX_BIN_OP andn,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bextr, (X86_EFL_OF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_AF | X86_EFL_PF)
584
;;
; Macro for implementing BLSR, BLSMSK and BLSI (fallbacks implemented in C).
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; @param 1      The instruction mnemonic.
; @param 2      The modified flags.
; @param 3      The undefined flags.
;
%macro IEMIMPL_VEX_BIN_OP_2 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        ; Only three arguments (A0=dst ptr, A1=src, A2=eflags ptr), so the
        ; 3-arg prologue/epilogue must be used: the 4-arg variants would load
        ; a non-existent fourth argument into esi and pop 8 instead of the
        ; 4 stack-argument bytes on 32-bit hosts, corrupting the caller stack.
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     T0_32, [A0]             ; load the destination operand,
        %1      T0_32, A1_32            ; ... apply the VEX op,
        mov     [A0], T0_32             ; ... and store the result.
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     T0, [A0]
        %1      T0, A1
        mov     [A0], T0
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

; instr, modified-flags, undefined-flags
IEMIMPL_VEX_BIN_OP_2 blsr,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsmsk, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsi,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
626
627
;;
; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the first source register operand in A1 and the shift count in A2.  No
; eflags pointer - these instructions leave the flags alone.
;
; @param 1      The instruction mnemonic.
; @param 2      The fallback instruction to use when the host lacks BMI2
;               (plain shift taking the count in cl).
;
%macro IEMIMPL_VEX_BIN_OP_NOEFL 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32, A2_32     ; 3-operand BMI2 form, flags untouched.
        mov     [A0], T0_32
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8                ; count -> cl for the legacy shift.
        %2      A1_32, cl
        mov     [A0], A1_32
 %else
        xchg    A2, A0                  ; MSC: A0 is rcx; swap so cl = count, A2 = dst ptr.
        %2      A1_32, cl
        mov     [A2], A1_32
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        %1      T0, A1, A2
        mov     [A0], T0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
        ; Note: the result must be stored with a full 64-bit move inside each
        ; branch.  The original code stored only A1_32 (truncating the result)
        ; and then did 'mov [A0], A1' after the %endif, which on the MSC path
        ; dereferenced A0 *after* the xchg - i.e. it wrote through the shift
        ; count instead of the destination pointer.
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8                ; count -> cl for the legacy shift.
        %2      A1, cl
        mov     [A0], A1                ; store the full 64-bit result.
 %else
        xchg    A2, A0                  ; MSC: cl = count, A2 = dst ptr.
        %2      A1, cl
        mov     [A2], A1                ; store the full 64-bit result.
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
 %endif ; RT_ARCH_AMD64
%endmacro

; instr, fallback instr
IEMIMPL_VEX_BIN_OP_NOEFL sarx, sar
IEMIMPL_VEX_BIN_OP_NOEFL shlx, shl
IEMIMPL_VEX_BIN_OP_NOEFL shrx, shr
691
692
;
; RORX uses a immediate byte for the shift count, so we only do
; fallback implementation of that one.
;
; A0 = pointer to the destination, A1 = source value, A2 = rotate count.
;
BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8                ; count -> cl for the legacy rotate.
        ror     A1_32, cl
        mov     [A0], A1_32
 %else
        xchg    A2, A0                  ; MSC: A0 is rcx; swap so cl = count, A2 = dst ptr.
        ror     A1_32, cl
        mov     [A2], A1_32
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u32
710
711 %ifdef RT_ARCH_AMD64
712BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
713 PROLOGUE_3_ARGS
714 %ifdef ASM_CALL64_GCC
715 mov cl, A2_8
716 ror A1, cl
717 mov [A0], A1_32
718 %else
719 xchg A2, A0
720 ror A1, cl
721 mov [A2], A1_32
722 %endif
723 mov [A0], A1
724 EPILOGUE_3_ARGS
725ENDPROC iemAImpl_rorx_u64
726 %endif ; RT_ARCH_AMD64
727
728
;;
; Macro for implementing a bit operator.
;
; This will generate code for the 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; @param 1      The instruction mnemonic.
; @param 2      Non-zero if there should be a locked version.
; @param 3      The modified flags.
; @param 4      The undefined flags.
;
%macro IEMIMPL_BIT_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro
; BT leaves the destination untouched, so no locked variant is needed for it.
IEMIMPL_BIT_OP bt,  0, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
805
;;
; Macro for implementing a bit search operator.
;
; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
; system where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; In the ZF case the destination register is 'undefined', however it seems that
; both AMD and Intel just leave it as is.  The undefined EFLAGS differs between
; AMD and Intel and according to https://www.sandpile.org/x86/flags.htm between
; Intel microarchitectures.  We only implement 'intel' and 'amd' variation with
; the behaviour of more recent CPUs (Intel 10980X and AMD 3990X).
;
; The _intel variants use T1 for the result because IEM_ADJUST_FLAGS_WITH_PARITY
; clobbers T0.
;
; @param 1      The instruction mnemonic.
; @param 2      The modified flags.
; @param 3      The undefined flags.
; @param 4      Non-zero if destination isn't written when ZF=1.  Zero if always written.
;
%macro IEMIMPL_BIT_OP2 4
BEGINCODE
; Native flag behaviour.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_16, A1_16
%if %4 != 0
        jz      .unchanged_dst          ; ZF=1: leave the destination untouched.
%endif
        mov     [A0], T0_16
.unchanged_dst:
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

; Intel flag behaviour (10980XE): all flags defined.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
        PROLOGUE_3_ARGS
        %1      T1_16, A1_16
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1_16
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_intel

; AMD flag behaviour (3990X): only ZF is modified.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
        PROLOGUE_3_ARGS
        %1      T0_16, A1_16
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_16
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0  ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_amd


BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_32
.unchanged_dst:
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
        PROLOGUE_3_ARGS
        %1      T1_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1_32
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_32
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0  ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_amd


 %ifdef RT_ARCH_AMD64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0
.unchanged_dst:
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T1, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
        PROLOGUE_3_ARGS
        %1      T0, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0  ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_amd

 %endif ; RT_ARCH_AMD64
%endmacro

; BSF/BSR leave the destination unchanged on ZF=1 (param 4 = 1); TZCNT/LZCNT always write it.
IEMIMPL_BIT_OP2 bsf,   (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 bsr,   (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
957
958
;
; IMUL is also a similar but yet different case (no lock, no mem dst).
; The rDX:rAX variant of imul is handled together with mul further down.
;
BEGINCODE
; @param 1      EFLAGS that are modified.
; @param 2      Undefined EFLAGS.
; @param 3      Function suffix.
; @param 4      EFLAGS variation: 0 for native, 1 for intel (ignored),
;               2 for AMD (set AF, clear PF, ZF and SF).
%macro IEMIMPL_IMUL_TWO 4
BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %1, %2
        imul    A1_16, word [A0]        ; A1 *= [A0]
        mov     [A0], A1_16
 %if %4 != 1
        IEM_SAVE_FLAGS A2, %1, %2
 %else
        ; Intel variation: SF/PF are calculated from the result.
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_16, 16, A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u16 %+ %3

BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %1, %2
        imul    A1_32, dword [A0]
        mov     [A0], A1_32
 %if %4 != 1
        IEM_SAVE_FLAGS A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_32, 32, A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u32 %+ %3

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %1, %2
        imul    A1, qword [A0]
        mov     [A0], A1
 %if %4 != 1
        IEM_SAVE_FLAGS A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1, 64, A1
 %endif
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_imul_two_u64 %+ %3
 %endif ; RT_ARCH_AMD64
%endmacro
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF,       , 0
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0,                                                _intel, 1
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0,                                                _amd,   2
1014
1015
;
; XCHG for memory operands. This implies locking. No flag changes.
;
; Each function takes two arguments, first the pointer to the memory,
; then the pointer to the register. They all return void.
;
; Note: XCHG with a memory operand is implicitly locked by the CPU, so no
; explicit LOCK prefix is emitted in the _locked variants.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_8, [A1]              ; T0 = register value.
        xchg    [A0], T0_8              ; atomically swap with memory.
        mov     [A1], T0_8              ; write old memory value back to the register.
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_locked

BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A1]
        xchg    [A0], T0_16
        mov     [A1], T0_16
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_locked

BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A1]
        xchg    [A0], T0_32
        mov     [A1], T0_32
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_locked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
        PROLOGUE_2_ARGS
        mov     T0, [A1]
        xchg    [A0], T0
        mov     [A1], T0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_locked
%endif

; Unlocked variants for fDisregardLock mode.
; These do two plain loads and two plain stores - no atomicity guarantees.

BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_8, [A1]              ; load both values, ...
        mov     T1_8, [A0]
        mov     [A0], T0_8              ; ... then store them swapped.
        mov     [A1], T1_8
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_unlocked

BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A1]
        mov     T1_16, [A0]
        mov     [A0], T0_16
        mov     [A1], T1_16
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_unlocked

BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A1]
        mov     T1_32, [A0]
        mov     [A0], T0_32
        mov     [A1], T1_32
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_unlocked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0, [A1]
        mov     T1, [A0]
        mov     [A0], T0
        mov     [A1], T1
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_unlocked
%endif
1096
1097
1098;
1099; XADD for memory operands.
1100;
1101; Each function takes three arguments, first the pointer to the
1102; memory/register, then the pointer to the register, and finally a pointer to
1103; eflags. They all return void.
1104;
1105BEGINCODE
;;
; XADD on the byte at [A0]: [A0] += *A1, *A1 = old [A0].
; Arithmetic flags are loaded from and merged back into the eflags
; variable pointed to by A2.
BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]              ; T0 = addend (register operand)
        xadd    [A0], T0_8              ; [A0] += T0, T0 = old [A0]
        mov     [A1], T0_8              ; return old memory value
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8
1115
;;
; XADD on the word at [A0]: [A0] += *A1, *A1 = old [A0].
; Flags are exchanged via the eflags variable pointed to by A2.
BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]             ; T0 = addend (register operand)
        xadd    [A0], T0_16             ; [A0] += T0, T0 = old [A0]
        mov     [A1], T0_16             ; return old memory value
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16
1125
;;
; XADD on the dword at [A0]: [A0] += *A1, *A1 = old [A0].
; Flags are exchanged via the eflags variable pointed to by A2.
BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]             ; T0 = addend (register operand)
        xadd    [A0], T0_32             ; [A0] += T0, T0 = old [A0]
        mov     [A1], T0_32             ; return old memory value
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32
1135
%ifdef RT_ARCH_AMD64
;;
; XADD on the qword at [A0]: [A0] += *A1, *A1 = old [A0].
; 64-bit hosts only.  Flags via the eflags variable pointed to by A2.
BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]                ; T0 = addend (register operand)
        xadd    [A0], T0                ; [A0] += T0, T0 = old [A0]
        mov     [A1], T0                ; return old memory value
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64
%endif ; RT_ARCH_AMD64
1147
;;
; Atomic (locked) XADD on the byte at [A0]; see iemAImpl_xadd_u8.
BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]              ; T0 = addend (register operand)
        lock xadd [A0], T0_8            ; atomic: [A0] += T0, T0 = old [A0]
        mov     [A1], T0_8              ; return old memory value
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8_locked
1157
;;
; Atomic (locked) XADD on the word at [A0]; see iemAImpl_xadd_u16.
BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]             ; T0 = addend (register operand)
        lock xadd [A0], T0_16           ; atomic: [A0] += T0, T0 = old [A0]
        mov     [A1], T0_16             ; return old memory value
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16_locked
1167
;;
; Atomic (locked) XADD on the dword at [A0]; see iemAImpl_xadd_u32.
BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]             ; T0 = addend (register operand)
        lock xadd [A0], T0_32           ; atomic: [A0] += T0, T0 = old [A0]
        mov     [A1], T0_32             ; return old memory value
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32_locked
1177
%ifdef RT_ARCH_AMD64
;;
; Atomic (locked) XADD on the qword at [A0]; see iemAImpl_xadd_u64.
; 64-bit hosts only.
BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]                ; T0 = addend (register operand)
        lock xadd [A0], T0              ; atomic: [A0] += T0, T0 = old [A0]
        mov     [A1], T0                ; return old memory value
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64_locked
%endif ; RT_ARCH_AMD64
1189
1190
1191;
1192; CMPXCHG8B.
1193;
1194; These are tricky register wise, so the code is duplicated for each calling
1195; convention.
1196;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
1198;
1199; C-proto:
1200; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
1201; uint32_t *pEFlags));
1202;
1203; Note! Identical to iemAImpl_cmpxchg16b.
1204;
1205BEGINCODE
;
; Loads EBX:ECX and EDX:EAX from the pointer arguments, executes a locked
; cmpxchg8b on *pu64Dst, stores the (possibly updated) EDX:EAX back and
; merges ZF into the caller's eflags variable.  Hand-written per calling
; convention because rbx/ebx must be preserved and the fixed register
; operands collide with the argument registers.
;
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
        push    rbx                     ; callee-saved, needed for the EBX operand

        mov     r11, rdx                ; pu64EaxEdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     ebx, [r8]               ; EBX = low half of *pu64EbxEcx
        mov     ecx, [r8 + 4]           ; ECX = high half of *pu64EbxEcx
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [r11]              ; EAX = low half of *pu64EaxEdx (after flag load; it clobbers eax)
        mov     edx, [r11 + 4]          ; EDX = high half of *pu64EaxEdx

        lock cmpxchg8b [r10]

        mov     [r11], eax              ; write back EDX:EAX (memory value on mismatch)
        mov     [r11 + 4], edx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx                     ; callee-saved, needed for the EBX operand

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64EbxEcx (is also T1)

        mov     ebx, [r11]              ; EBX = low half of *pu64EbxEcx
        mov     ecx, [r11 + 4]          ; ECX = high half of *pu64EbxEcx
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [rsi]              ; EAX = low half of *pu64EaxEdx
        mov     edx, [rsi + 4]          ; EDX = high half of *pu64EaxEdx

        lock cmpxchg8b [rdi]

        mov     [rsi], eax              ; write back EDX:EAX (memory value on mismatch)
        mov     [rsi + 4], edx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
%else
        ; x86 fastcall: ecx/edx carry the first two args, the rest are on the stack.
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64EaxEdx
        mov     ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx (16 = saved regs, 4 = return addr)
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]              ; EBX = low half of *pu64EbxEcx
        mov     ecx, [ecx + 4]          ; ECX = high half (pointer no longer needed)
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]              ; EAX = low half of *pu64EaxEdx
        mov     edx, [esi + 4]          ; EDX = high half of *pu64EaxEdx

        lock cmpxchg8b [edi]

        mov     [esi], eax              ; write back EDX:EAX (memory value on mismatch)
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8                       ; pop the two stack arguments (fastcall)
%endif
ENDPROC iemAImpl_cmpxchg8b
1280
;;
; Locked variant - simply forwards, since the unlocked implementation
; always uses a lock prefix anyway.
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
        ; Lazy bird always lock prefixes cmpxchg8b.
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg8b,16,$@)
ENDPROC iemAImpl_cmpxchg8b_locked
1285
1286%ifdef RT_ARCH_AMD64
1287
1288;
1289; CMPXCHG16B.
1290;
1291; These are tricky register wise, so the code is duplicated for each calling
1292; convention.
1293;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
1298; uint32_t *pEFlags));
1299;
1300; Note! Identical to iemAImpl_cmpxchg8b.
1301;
1302BEGINCODE
;
; Loads RBX:RCX and RDX:RAX from the pointer arguments, executes a locked
; cmpxchg16b on *pu128Dst, stores RDX:RAX back and merges ZF into the
; caller's eflags variable.  Same structure as iemAImpl_cmpxchg8b.
; NOTE(review): cmpxchg16b architecturally requires a 16-byte aligned
; memory operand - presumably guaranteed by the caller; confirm there.
;
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
 %ifdef ASM_CALL64_MSC
        push    rbx                     ; callee-saved, needed for the RBX operand

        mov     r11, rdx                ; pu64RaxRdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     rbx, [r8]               ; RBX = low qword of *pu128RbxRcx
        mov     rcx, [r8 + 8]           ; RCX = high qword of *pu128RbxRcx
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [r11]              ; RAX = low qword of *pu128RaxRdx
        mov     rdx, [r11 + 8]          ; RDX = high qword of *pu128RaxRdx

        lock cmpxchg16b [r10]

        mov     [r11], rax              ; write back RDX:RAX (memory value on mismatch)
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx                     ; callee-saved, needed for the RBX operand

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64RbxRcx (is also T1)

        mov     rbx, [r11]              ; RBX = low qword of *pu128RbxRcx
        mov     rcx, [r11 + 8]          ; RCX = high qword of *pu128RbxRcx
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [rsi]              ; RAX = low qword of *pu128RaxRdx
        mov     rdx, [rsi + 8]          ; RDX = high qword of *pu128RaxRdx

        lock cmpxchg16b [rdi]

        mov     [rsi], rax              ; write back RDX:RAX (memory value on mismatch)
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b
1347
;;
; Locked variant - simply forwards, since the unlocked implementation
; always uses a lock prefix anyway.
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
        ; Lazy bird always lock prefixes cmpxchg16b.
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg16b,16,$@)
ENDPROC iemAImpl_cmpxchg16b_locked
1352
1353%endif ; RT_ARCH_AMD64
1354
1355
1356;
1357; CMPXCHG.
1358;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
1360;
1361; C-proto:
1362; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t puEax, uintX_t uReg, uint32_t *pEFlags));
1363;
1364BEGINCODE
;;
; Emits the CMPXCHG implementations for 8, 16, 32 and 64-bit operands.
;
; @param 1      Lock prefix ('lock' or empty).
; @param 2      Function name suffix ('_locked' or empty).
;
; Each function: A0 = puXDst, A1 = puEax (accumulator in/out), A2 = uReg,
; A3 = pEFlags.
;
%macro IEMIMPL_CMPXCHG 2
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     al, [A1]                ; al = accumulator value (cmpxchg compares against al)
        %1 cmpxchg [A0], A2_8
        mov     [A1], al                ; hand back al (old memory value on mismatch)
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u8 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     ax, [A1]                ; ax = accumulator value
        %1 cmpxchg [A0], A2_16
        mov     [A1], ax                ; hand back ax (old memory value on mismatch)
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u16 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [A1]               ; eax = accumulator value
        %1 cmpxchg [A0], A2_32
        mov     [A1], eax               ; hand back eax (old memory value on mismatch)
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u32 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
%ifdef RT_ARCH_AMD64
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     rax, [A1]               ; rax = accumulator value
        %1 cmpxchg [A0], A2
        mov     [A1], rax               ; hand back rax (old memory value on mismatch)
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
%else
        ;
        ; Must use cmpxchg8b here. See also iemAImpl_cmpxchg8b.
        ;
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64Rax
        mov     ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]              ; EBX = low half of *pu64Reg
        mov     ecx, [ecx + 4]          ; ECX = high half of *pu64Reg
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]              ; EDX:EAX = accumulator value
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
        ; ZF=1 means the exchange succeeded (values were equal); in that case a
        ; self-compare produces the flags of comparing two equal values.  On
        ; mismatch (ZF=0) we must derive the flags from a 64-bit compare of the
        ; old accumulator (*pu64Rax) against the memory value now in EDX:EAX.
        ; Bugfix: this used to be 'jz', which fell through to 'cmp eax, eax' on
        ; a FAILED exchange and thus always reported ZF=1 to the guest.
        jnz     .cmpxchg8b_not_equal
        cmp     eax, eax                ; just set the other flags.
.store:
        mov     [esi], eax              ; hand back EDX:EAX (memory value on mismatch)
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8

.cmpxchg8b_not_equal:
        cmp     [esi + 4], edx          ;; @todo FIXME - verify 64-bit compare implementation
        jne     .store                  ; high halves differ: use their compare flags
        cmp     [esi], eax              ; high halves equal: compare the low halves
        jmp     .store

%endif
ENDPROC iemAImpl_cmpxchg_u64 %+ %2
%endmacro ; IEMIMPL_CMPXCHG

IEMIMPL_CMPXCHG , ,
IEMIMPL_CMPXCHG lock, _locked
1453
1454;;
1455; Macro for implementing a unary operator.
1456;
; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit systems where the 64-bit accesses require hand
; coding.
1460;
1461; All the functions takes a pointer to the destination memory operand in A0,
1462; the source register operand in A1 and a pointer to eflags in A2.
1463;
1464; @param 1 The instruction mnemonic.
1465; @param 2 The modified flags.
1466; @param 3 The undefined flags.
1467;
%macro IEMIMPL_UNARY_OP 3
BEGINCODE
;; 8-bit variant: A0 = pointer to operand, A1 = pointer to eflags.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      byte [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

;; 8-bit locked variant: same, with bus lock for atomicity.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 byte [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

;; 16-bit variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      word [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

;; 16-bit locked variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 word [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

;; 32-bit variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      dword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

;; 32-bit locked variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 dword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

 %ifdef RT_ARCH_AMD64
;; 64-bit variant (64-bit hosts only; the 32-bit host version is in C).
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      qword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

;; 64-bit locked variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 qword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64

%endmacro

; Instantiations: mnemonic, modified flags, undefined flags.
IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_UNARY_OP not, 0, 0
1542
1543
1544;
1545; BSWAP. No flag changes.
1546;
1547; Each function takes one argument, pointer to the value to bswap
1548; (input/output). They all return void.
1549;
;;
; BSWAP with a 16-bit operand size.  The 66h prefix turns the 32-bit bswap
; into its 16-bit encoding; the result of that form is architecturally
; undefined, so this reproduces whatever the host CPU does.
BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]             ; just in case any of the upper bits are used.
        db 66h                          ; operand-size prefix -> 16-bit bswap encoding
        bswap   T0_32
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u16
1558
;;
; BSWAP of the dword at [A0] (in place).
BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]
        bswap   T0_32                   ; reverse the four bytes
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u32
1566
;;
; BSWAP of the qword at [A0] (in place).  On 32-bit hosts this is done as
; two dword bswaps whose results are stored into each other's slots.
BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
%ifdef RT_ARCH_AMD64
        PROLOGUE_1_ARGS
        mov     T0, [A0]
        bswap   T0                      ; reverse all eight bytes
        mov     [A0], T0
        EPILOGUE_1_ARGS
%else
        PROLOGUE_1_ARGS
        mov     T0, [A0]                ; T0 = low dword
        mov     T1, [A0 + 4]            ; T1 = high dword
        bswap   T0
        bswap   T1
        mov     [A0 + 4], T0            ; swapped low dword becomes the high dword
        mov     [A0], T1                ; swapped high dword becomes the low dword
        EPILOGUE_1_ARGS
%endif
ENDPROC iemAImpl_bswap_u64
1585
1586
1587;;
1588; Macro for implementing a shift operation.
1589;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit systems where the 64-bit accesses require hand coding.
1592;
1593; All the functions takes a pointer to the destination memory operand in A0,
1594; the shift count in A1 and a pointer to eflags in A2.
1595;
1596; @param 1 The instruction mnemonic.
1597; @param 2 The modified flags.
1598; @param 3 The undefined flags.
1599;
1600; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
1601;
1602; @note the _intel and _amd variants are implemented in C.
1603;
%macro IEMIMPL_SHIFT_OP 3
BEGINCODE
;; 8-bit variant: A0 = pointer to operand, A1 = shift count, A2 = pEFlags.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8                ; count must be in cl; A1 is not rcx here
        %1      byte [A0], cl
 %else
        xchg    A1, A0                  ; MSC: count arrives in the cl-aliased reg, so swap ptr/count
        %1      byte [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

;; 16-bit variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      word [A0], cl
 %else
        xchg    A1, A0
        %1      word [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

;; 32-bit variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      dword [A0], cl
 %else
        xchg    A1, A0
        %1      dword [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
;; 64-bit variant (64-bit hosts only).
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      qword [A0], cl
 %else
        xchg    A1, A0
        %1      qword [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

; Instantiations: mnemonic, modified flags, undefined flags.
IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1673
1674
1675;;
1676; Macro for implementing a double precision shift operation.
1677;
; This will generate code for the 16, 32 and 64 bit accesses, except on
; 32-bit systems where the 64-bit accesses require hand coding.
1680;
1681; The functions takes the destination operand (r/m) in A0, the source (reg) in
1682; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
1683;
1684; @param 1 The instruction mnemonic.
1685; @param 2 The modified flags.
1686; @param 3 The undefined flags.
1687;
1688; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
1689;
1690; @note the _intel and _amd variants are implemented in C.
1691;
%macro IEMIMPL_SHIFT_DBL_OP 3
BEGINCODE
;; 16-bit variant: A0 = pDst, A1 = source reg, A2 = count, A3 = pEFlags.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; get the count into cl (A3 aliases rcx here)
        %1      [A0], A1_16, cl
        xchg    A3, A2                  ; restore A3 = pEFlags for IEM_SAVE_FLAGS
 %else
        xchg    A0, A2                  ; MSC: count into the cl-aliased reg, ptr into A2
        %1      [A2], A1_16, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

;; 32-bit variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2
        %1      [A0], A1_32, cl
        xchg    A3, A2
 %else
        xchg    A0, A2
        %1      [A2], A1_32, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
;; 64-bit variant (64-bit hosts only).
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2
        %1      [A0], A1, cl
        xchg    A3, A2
 %else
        xchg    A0, A2
        %1      [A2], A1, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS_EX 12           ; pop the extra stack bytes of the 5th..nth args
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

; Instantiations: mnemonic, modified flags, undefined flags.
IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1745
1746
1747;;
; Macro for implementing multiplication operations.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit systems where the 64-bit accesses require hand coding.
1752;
1753; The 8-bit function only operates on AX, so it takes no DX pointer. The other
1754; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
1755; pointer to eflags in A3.
1756;
1757; The functions all return 0 so the caller can be used for div/idiv as well as
1758; for the mul/imul implementation.
1759;
1760; @param 1 The instruction mnemonic.
1761; @param 2 The modified flags.
1762; @param 3 The undefined flags.
1763; @param 4 Name suffix.
1764; @param 5 EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
1765;
1766; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
1767;
%macro IEMIMPL_MUL_OP 5
BEGINCODE
;; 8-bit variant: A0 = pu16AX (in: AL, out: full AX result), A1 = operand,
;; A2 = pEFlags.  Always returns 0 in eax.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     al, [A0]                ; al = multiplicand
        %1      A1_8                    ; ax = al * operand
        mov     [A0], ax                ; 8-bit mul produces a 16-bit result in ax
 %if %5 != 1
        IEM_SAVE_FLAGS A2, %2, %3
 %else
        ; Intel EFLAGS variation: recalculate SF/PF from the result.
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX
 %endif
        xor     eax, eax                ; return 0 (shared convention with div/idiv)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4

;; 16-bit variant: A0 = pu16AX, A1 = pu16DX, A2 = operand, A3 = pEFlags.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOADS_FLAGS_DUMMY     ; (placeholder removed - see below)
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     ax, [A0]                ; ax = multiplicand
 %ifdef ASM_CALL64_GCC
        %1      A2_16                   ; dx:ax = ax * operand
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                  ; A1 lives in rdx here, which %1 clobbers; save the ptr
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX
 %endif
        xor     eax, eax                ; return 0
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4

;; 32-bit variant: A0 = pu32EAX, A1 = pu32EDX, A2 = operand, A3 = pEFlags.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     eax, [A0]               ; eax = multiplicand
 %ifdef ASM_CALL64_GCC
        %1      A2_32                   ; edx:eax = eax * operand
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                  ; A1 lives in rdx here, which %1 clobbers; save the ptr
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX
 %endif
        xor     eax, eax                ; return 0
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
;; 64-bit variant: A0 = pu64RAX, A1 = pu64RDX, A2 = operand, A3 = pEFlags.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     rax, [A0]               ; rax = multiplicand
 %ifdef ASM_CALL64_GCC
        %1      A2                      ; rdx:rax = rax * operand
        mov     [A0], rax
        mov     [A1], rdx
 %else
        mov     T1, A1                  ; A1 lives in rdx here, which %1 clobbers; save the ptr
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX
 %endif
        xor     eax, eax                ; return 0
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
 %endif ; !RT_ARCH_AMD64

%endmacro

; Instantiations: mnemonic, modified flags, undefined flags, suffix, eflags behaviour.
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
1864
1865
1866BEGINCODE
1867;;
1868; Worker function for negating a 32-bit number in T1:T0
1869; @uses None (T0,T1)
BEGINPROC iemAImpl_negate_T0_T1_u32
        ; Computes T1:T0 = 0 - T1:T0 by parking two zeros on the stack,
        ; swapping them with T0/T1, and subtracting with borrow.
        push    0
        push    0
        xchg    T0_32, [xSP]            ; stack = old T0, T0 = 0
        xchg    T1_32, [xSP + xCB]      ; stack = old T1, T1 = 0
        sub     T0_32, [xSP]            ; T0 = 0 - old T0
        sbb     T1_32, [xSP + xCB]      ; T1 = 0 - old T1 - borrow
        add     xSP, xCB*2              ; drop the two scratch slots
        ret
ENDPROC iemAImpl_negate_T0_T1_u32
1880
%ifdef RT_ARCH_AMD64
;;
; Worker function for negating a 64-bit number in T1:T0
; @uses None (T0,T1)
BEGINPROC iemAImpl_negate_T0_T1_u64
        ; Same technique as the 32-bit worker: T1:T0 = 0 - T1:T0.
        push    0
        push    0
        xchg    T0, [xSP]               ; stack = old T0, T0 = 0
        xchg    T1, [xSP + xCB]         ; stack = old T1, T1 = 0
        sub     T0, [xSP]               ; T0 = 0 - old T0
        sbb     T1, [xSP + xCB]         ; T1 = 0 - old T1 - borrow
        add     xSP, xCB*2              ; drop the two scratch slots
        ret
ENDPROC iemAImpl_negate_T0_T1_u64
%endif
1896
1897
1898;;
1899; Macro for implementing a division operations.
1900;
1901; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1902; 32-bit system where the 64-bit accesses requires hand coding.
1903;
1904; The 8-bit function only operates on AX, so it takes no DX pointer. The other
1905; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
1906; pointer to eflags in A3.
1907;
1908; The functions all return 0 on success and -1 if a divide error should be
1909; raised by the caller.
1910;
1911; @param 1 The instruction mnemonic.
1912; @param 2 The modified flags.
1913; @param 3 The undefined flags.
1914; @param 4 1 if signed, 0 if unsigned.
1915; @param 5 Function suffix.
1916; @param 6 EFLAGS variation: 0 for native, 1 for intel (ignored),
1917; 2 for AMD (set AF, clear PF, ZF and SF).
1918;
1919; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
1920;
1921%macro IEMIMPL_DIV_OP 6
1922BEGINCODE
1923BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
1924 PROLOGUE_3_ARGS
1925
1926 ; div by chainsaw check.
1927 test A1_8, A1_8
1928 jz .div_zero
1929
1930 ; Overflow check - unsigned division is simple to verify, haven't
1931 ; found a simple way to check signed division yet unfortunately.
1932 %if %4 == 0
1933 cmp [A0 + 1], A1_8
1934 jae .div_overflow
1935 %else
1936 mov T0_16, [A0] ; T0 = dividend
1937 mov T1, A1 ; T1 = saved divisor (because of missing T1_8 in 32-bit)
1938 test A1_8, A1_8
1939 js .divisor_negative
1940 test T0_16, T0_16
1941 jns .both_positive
1942 neg T0_16
1943.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
1944 push T0 ; Start off like unsigned below.
1945 shr T0_16, 7
1946 cmp T0_8, A1_8
1947 pop T0
1948 jb .div_no_overflow
1949 ja .div_overflow
1950 and T0_8, 0x7f ; Special case for covering (divisor - 1).
1951 cmp T0_8, A1_8
1952 jae .div_overflow
1953 jmp .div_no_overflow
1954
1955.divisor_negative:
1956 neg A1_8
1957 test T0_16, T0_16
1958 jns .one_of_each
1959 neg T0_16
1960.both_positive: ; Same as unsigned shifted by sign indicator bit.
1961 shr T0_16, 7
1962 cmp T0_8, A1_8
1963 jae .div_overflow
1964.div_no_overflow:
1965 mov A1, T1 ; restore divisor
1966 %endif
1967
1968 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1969 mov ax, [A0]
1970 %1 A1_8
1971 mov [A0], ax
1972 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
1973 IEM_ADJUST_FLAGS A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
1974 %else
1975 IEM_SAVE_FLAGS A2, %2, %3
1976 %endif
1977 xor eax, eax
1978
1979.return:
1980 EPILOGUE_3_ARGS
1981
1982.div_zero:
1983.div_overflow:
1984 mov eax, -1
1985 jmp .return
1986ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5
1987
1988BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
1989 PROLOGUE_4_ARGS
1990
1991 ; div by chainsaw check.
1992 test A2_16, A2_16
1993 jz .div_zero
1994
1995 ; Overflow check - unsigned division is simple to verify, haven't
1996 ; found a simple way to check signed division yet unfortunately.
1997 %if %4 == 0
1998 cmp [A1], A2_16
1999 jae .div_overflow
2000 %else
2001 mov T0_16, [A1]
2002 shl T0_32, 16
2003 mov T0_16, [A0] ; T0 = dividend
2004 mov T1, A2 ; T1 = divisor
2005 test T1_16, T1_16
2006 js .divisor_negative
2007 test T0_32, T0_32
2008 jns .both_positive
2009 neg T0_32
2010.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
2011 push T0 ; Start off like unsigned below.
2012 shr T0_32, 15
2013 cmp T0_16, T1_16
2014 pop T0
2015 jb .div_no_overflow
2016 ja .div_overflow
2017 and T0_16, 0x7fff ; Special case for covering (divisor - 1).
2018 cmp T0_16, T1_16
2019 jae .div_overflow
2020 jmp .div_no_overflow
2021
2022.divisor_negative:
2023 neg T1_16
2024 test T0_32, T0_32
2025 jns .one_of_each
2026 neg T0_32
2027.both_positive: ; Same as unsigned shifted by sign indicator bit.
2028 shr T0_32, 15
2029 cmp T0_16, T1_16
2030 jae .div_overflow
2031.div_no_overflow:
2032 %endif
2033
2034 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2035 %ifdef ASM_CALL64_GCC
2036 mov T1, A2
2037 mov ax, [A0]
2038 mov dx, [A1]
2039 %1 T1_16
2040 mov [A0], ax
2041 mov [A1], dx
2042 %else
2043 mov T1, A1
2044 mov ax, [A0]
2045 mov dx, [T1]
2046 %1 A2_16
2047 mov [A0], ax
2048 mov [T1], dx
2049 %endif
2050 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2051 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2052 %else
2053 IEM_SAVE_FLAGS A3, %2, %3
2054 %endif
2055 xor eax, eax
2056
2057.return:
2058 EPILOGUE_4_ARGS
2059
2060.div_zero:
2061.div_overflow:
2062 mov eax, -1
2063 jmp .return
2064ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5
2065
2066BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
2067 PROLOGUE_4_ARGS
2068
2069 ; div by chainsaw check.
2070 test A2_32, A2_32
2071 jz .div_zero
2072
2073 ; Overflow check - unsigned division is simple to verify, haven't
2074 ; found a simple way to check signed division yet unfortunately.
2075 %if %4 == 0
2076 cmp [A1], A2_32
2077 jae .div_overflow
2078 %else
2079 push A2 ; save A2 so we modify it (we out of regs on x86).
2080 mov T0_32, [A0] ; T0 = dividend low
2081 mov T1_32, [A1] ; T1 = dividend high
2082 test A2_32, A2_32
2083 js .divisor_negative
2084 test T1_32, T1_32
2085 jns .both_positive
2086 call NAME(iemAImpl_negate_T0_T1_u32)
2087.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
2088 push T0 ; Start off like unsigned below.
2089 shl T1_32, 1
2090 shr T0_32, 31
2091 or T1_32, T0_32
2092 cmp T1_32, A2_32
2093 pop T0
2094 jb .div_no_overflow
2095 ja .div_overflow
2096 and T0_32, 0x7fffffff ; Special case for covering (divisor - 1).
2097 cmp T0_32, A2_32
2098 jae .div_overflow
2099 jmp .div_no_overflow
2100
2101.divisor_negative:
2102 neg A2_32
2103 test T1_32, T1_32
2104 jns .one_of_each
2105 call NAME(iemAImpl_negate_T0_T1_u32)
2106.both_positive: ; Same as unsigned shifted by sign indicator bit.
2107 shl T1_32, 1
2108 shr T0_32, 31
2109 or T1_32, T0_32
2110 cmp T1_32, A2_32
2111 jae .div_overflow
2112.div_no_overflow:
2113 pop A2
2114 %endif
2115
2116 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2117 mov eax, [A0]
2118 %ifdef ASM_CALL64_GCC
2119 mov T1, A2
2120 mov eax, [A0]
2121 mov edx, [A1]
2122 %1 T1_32
2123 mov [A0], eax
2124 mov [A1], edx
2125 %else
2126 mov T1, A1
2127 mov eax, [A0]
2128 mov edx, [T1]
2129 %1 A2_32
2130 mov [A0], eax
2131 mov [T1], edx
2132 %endif
2133 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2134 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2135 %else
2136 IEM_SAVE_FLAGS A3, %2, %3
2137 %endif
2138 xor eax, eax
2139
2140.return:
2141 EPILOGUE_4_ARGS
2142
2143.div_overflow:
2144 %if %4 != 0
2145 pop A2
2146 %endif
2147.div_zero:
2148 mov eax, -1
2149 jmp .return
2150ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5
2151
2152 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
2153BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
2154 PROLOGUE_4_ARGS
2155
2156 test A2, A2
2157 jz .div_zero
2158 %if %4 == 0
2159 cmp [A1], A2
2160 jae .div_overflow
2161 %else
2162 push A2 ; save A2 so we modify it (we out of regs on x86).
2163 mov T0, [A0] ; T0 = dividend low
2164 mov T1, [A1] ; T1 = dividend high
2165 test A2, A2
2166 js .divisor_negative
2167 test T1, T1
2168 jns .both_positive
2169 call NAME(iemAImpl_negate_T0_T1_u64)
2170.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
2171 push T0 ; Start off like unsigned below.
2172 shl T1, 1
2173 shr T0, 63
2174 or T1, T0
2175 cmp T1, A2
2176 pop T0
2177 jb .div_no_overflow
2178 ja .div_overflow
2179 mov T1, 0x7fffffffffffffff
2180 and T0, T1 ; Special case for covering (divisor - 1).
2181 cmp T0, A2
2182 jae .div_overflow
2183 jmp .div_no_overflow
2184
2185.divisor_negative:
2186 neg A2
2187 test T1, T1
2188 jns .one_of_each
2189 call NAME(iemAImpl_negate_T0_T1_u64)
2190.both_positive: ; Same as unsigned shifted by sign indicator bit.
2191 shl T1, 1
2192 shr T0, 63
2193 or T1, T0
2194 cmp T1, A2
2195 jae .div_overflow
2196.div_no_overflow:
2197 pop A2
2198 %endif
2199
2200 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2201 mov rax, [A0]
2202 %ifdef ASM_CALL64_GCC
2203 mov T1, A2
2204 mov rax, [A0]
2205 mov rdx, [A1]
2206 %1 T1
2207 mov [A0], rax
2208 mov [A1], rdx
2209 %else
2210 mov T1, A1
2211 mov rax, [A0]
2212 mov rdx, [T1]
2213 %1 A2
2214 mov [A0], rax
2215 mov [T1], rdx
2216 %endif
2217 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2218 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2219 %else
2220 IEM_SAVE_FLAGS A3, %2, %3
2221 %endif
2222 xor eax, eax
2223
2224.return:
2225 EPILOGUE_4_ARGS_EX 12
2226
2227.div_overflow:
2228 %if %4 != 0
2229 pop A2
2230 %endif
2231.div_zero:
2232 mov eax, -1
2233 jmp .return
2234ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
2235 %endif ; !RT_ARCH_AMD64
2236
2237%endmacro
2238
; Instantiate the DIV/IDIV workers.
; Parameters: %1 = instruction, %2/%3 = EFLAGS masks handed to IEM_SAVE_FLAGS,
; %4 = non-zero for signed division (enables the idiv overflow pre-checks),
; %5 = function name suffix, %6 = vendor EFLAGS behaviour selector
; (0 = reference, 1 = intel, 2 = amd - the amd variant adjusts flags, see the macro).
IEMIMPL_DIV_OP div, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, , 0
IEMIMPL_DIV_OP div, 0, 0, 0, _intel, 1
IEMIMPL_DIV_OP div, 0, 0, 0, _amd, 2
IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1, , 0
IEMIMPL_DIV_OP idiv, 0, 0, 1, _intel, 1
IEMIMPL_DIV_OP idiv, 0, 0, 1, _amd, 2
2245
2246
2247;;
2248; Macro for implementing memory fence operation.
2249;
2250; No return value, no operands or anything.
2251;
2252; @param 1 The instruction.
2253;
%macro IEMIMPL_MEM_FENCE 1
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
        %1                              ; emit the fence instruction itself (lfence/sfence/mfence)
        ret
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_MEM_FENCE lfence
IEMIMPL_MEM_FENCE sfence
IEMIMPL_MEM_FENCE mfence
2265
2266;;
2267; Alternative for non-SSE2 host.
2268;
BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
        push    xAX
        xchg    xAX, [xSP]              ; xchg with a memory operand is implicitly LOCKed,
                                        ; which serializes memory accesses on pre-SSE2 CPUs.
        add     xSP, xCB                ; drop the scratch slot (xAX was restored by the xchg).
        ret
ENDPROC iemAImpl_alt_mem_fence
2275
2276
2277;;
2278; Initialize the FPU for the actual instruction being emulated, this means
2279; loading parts of the guest's control word and status word.
2280;
2281; @uses 24 bytes of stack. T0, T1
2282; @param 1 Expression giving the address of the FXSTATE of the guest.
2283;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
        fnstenv [xSP]                   ; capture the current (host) FPU environment so we can patch it.

        ; FCW - for exception, precision and rounding control.
        movzx T0, word [%1 + X86FXSTATE.FCW]
        and T0, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK ; only take mask/precision/rounding bits from the guest.
        mov [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx T1, word [%1 + X86FXSTATE.FSW]
        and T1, X86_FSW_C_MASK          ; guest condition code bits...
        movzx T0, word [xSP + X86FSTENV32P.FSW]
        and T0, X86_FSW_TOP_MASK        ; ...merged with the actual TOP so the stack layout stays valid.
        or T0, T1
        mov [xSP + X86FSTENV32P.FSW], T0_16

        fldenv [xSP]                    ; activate the merged environment.
%endmacro
2302
2303
2304;;
2305; Initialize the FPU for the actual instruction being emulated, this means
2306; loading parts of the guest's control word, status word, and update the
2307; tag word for the top register if it's empty.
2308;
2309; ASSUMES actual TOP=7
2310;
2311; @uses 24 bytes of stack. T0, T1
2312; @param 1 Expression giving the address of the FXSTATE of the guest.
2313;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
        fnstenv [xSP]                   ; capture the current (host) FPU environment so we can patch it.

        ; FCW - for exception, precision and rounding control.
        movzx T0_32, word [%1 + X86FXSTATE.FCW]
        and T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK ; only mask/precision/rounding bits.
        mov [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx T1_32, word [%1 + X86FXSTATE.FSW]
        and T1_32, X86_FSW_C_MASK       ; guest condition code bits...
        movzx T0_32, word [xSP + X86FSTENV32P.FSW]
        and T0_32, X86_FSW_TOP_MASK     ; ...merged with the actual TOP.
        or T0_32, T1_32
        mov [xSP + X86FSTENV32P.FSW], T0_16

        ; FTW - Only for ST0 (in/out).
        movzx T1_32, word [%1 + X86FXSTATE.FSW]
        shr T1_32, X86_FSW_TOP_SHIFT    ; T1 = guest TOP, i.e. which FTW bit describes guest ST0.
        and T1_32, X86_FSW_TOP_SMASK
        bt [%1 + X86FXSTATE.FTW], T1_16 ; Empty if FTW bit is clear. Fixed register order.
        jc %%st0_not_empty
        or word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3
%%st0_not_empty:

        fldenv [xSP]                    ; activate the merged environment.
%endmacro
2341
2342
2343;;
2344; Need to move this as well somewhere better?
2345;
struc IEMFPURESULT
    .r80Result resw 5                   ; 80-bit extended-precision result (5 words).
    .FSW resw 1                         ; resulting FPU status word.
endstruc
2350
2351
2352;;
2353; Need to move this as well somewhere better?
2354;
struc IEMFPURESULTTWO
    .r80Result1 resw 5                  ; first 80-bit result (5 words).
    .FSW resw 1                         ; resulting FPU status word.
    .r80Result2 resw 5                  ; second 80-bit result (5 words).
endstruc
2360
2361
2362;
2363;---------------------- 16-bit signed integer operations ----------------------
2364;
2365
2366
2367;;
; Converts a 16-bit signed integer value to an 80-bit floating point one (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 16-bit signed integer value to convert.
2373;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
        PROLOGUE_3_ARGS
        sub xSP, 20h                    ; scratch for the 28-byte fnstenv/fldenv area used below.

        fninit                          ; start from a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW and safe FSW bits.
        fild word [A2]

        fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word.
        fnclex                          ; clear pending exceptions so the fstp below cannot raise them.
        fstp tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i16
2390
2391
2392;;
2393; Store a 80-bit floating point value (register) as a 16-bit signed integer (memory).
2394;
2395; @param A0 FPU context (fxsave).
2396; @param A1 Where to return the output FSW.
2397; @param A2 Where to store the 16-bit signed integer value.
2398; @param A3 Pointer to the 80-bit value.
2399;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A3]                  ; load the 80-bit source before applying guest FCW/FSW.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp word [A2]

        fnstsw word [A1]                ; return only the resulting FSW.

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i16
2415
2416
2417;;
2418; Store a 80-bit floating point value (register) as a 16-bit signed integer
2419; (memory) with truncation.
2420;
2421; @param A0 FPU context (fxsave).
2422; @param A1 Where to return the output FSW.
2423; @param A2 Where to store the 16-bit signed integer value.
2424; @param A3 Pointer to the 80-bit value.
2425;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp word [A2]                ; truncating store (ignores the rounding control).

        fnstsw word [A1]                ; return only the resulting FSW.

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i16
2441
2442
2443;;
2444; FPU instruction working on one 80-bit and one 16-bit signed integer value.
2445;
2446; @param 1 The instruction
2447;
2448; @param A0 FPU context (fxsave).
2449; @param A1 Pointer to a IEMFPURESULT for the output.
2450; @param A2 Pointer to the 80-bit value.
2451; @param A3 Pointer to the 16-bit value.
2452;
%macro IEMIMPL_FPU_R80_BY_I16 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A2]                  ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1 word [A3]                    ; e.g. fiadd word [A3] -> ST0 op= int16.

        fnstsw word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the fstp cannot raise them.
        fstp tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16 fiadd
IEMIMPL_FPU_R80_BY_I16 fimul
IEMIMPL_FPU_R80_BY_I16 fisub
IEMIMPL_FPU_R80_BY_I16 fisubr
IEMIMPL_FPU_R80_BY_I16 fidiv
IEMIMPL_FPU_R80_BY_I16 fidivr
2479
2480
2481;;
2482; FPU instruction working on one 80-bit and one 16-bit signed integer value,
2483; only returning FSW.
2484;
2485; @param 1 The instruction
2486;
2487; @param A0 FPU context (fxsave).
2488; @param A1 Where to store the output FSW.
2489; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 16-bit value.
2491;
%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A2]                  ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1 word [A3]                    ; compare against the int16 in memory.

        fnstsw word [A1]                ; only the FSW is returned (no value result).

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16_FSW ficom
2511
2512
2513
2514;
2515;---------------------- 32-bit signed integer operations ----------------------
2516;
2517
2518
2519;;
; Converts a 32-bit signed integer value to an 80-bit floating point one (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 32-bit signed integer value to convert.
2525;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
        PROLOGUE_3_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild dword [A2]

        fnstsw word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the fstp cannot raise them.
        fstp tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i32
2542
2543
2544;;
2545; Store a 80-bit floating point value (register) as a 32-bit signed integer (memory).
2546;
2547; @param A0 FPU context (fxsave).
2548; @param A1 Where to return the output FSW.
2549; @param A2 Where to store the 32-bit signed integer value.
2550; @param A3 Pointer to the 80-bit value.
2551;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp dword [A2]

        fnstsw word [A1]                ; return only the resulting FSW.

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i32
2567
2568
2569;;
2570; Store a 80-bit floating point value (register) as a 32-bit signed integer
2571; (memory) with truncation.
2572;
2573; @param A0 FPU context (fxsave).
2574; @param A1 Where to return the output FSW.
2575; @param A2 Where to store the 32-bit signed integer value.
2576; @param A3 Pointer to the 80-bit value.
2577;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp dword [A2]               ; truncating store (ignores the rounding control).

        fnstsw word [A1]                ; return only the resulting FSW.

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i32
2593
2594
2595;;
2596; FPU instruction working on one 80-bit and one 32-bit signed integer value.
2597;
2598; @param 1 The instruction
2599;
2600; @param A0 FPU context (fxsave).
2601; @param A1 Pointer to a IEMFPURESULT for the output.
2602; @param A2 Pointer to the 80-bit value.
2603; @param A3 Pointer to the 32-bit value.
2604;
%macro IEMIMPL_FPU_R80_BY_I32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A2]                  ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1 dword [A3]                   ; e.g. fiadd dword [A3] -> ST0 op= int32.

        fnstsw word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the fstp cannot raise them.
        fstp tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32 fiadd
IEMIMPL_FPU_R80_BY_I32 fimul
IEMIMPL_FPU_R80_BY_I32 fisub
IEMIMPL_FPU_R80_BY_I32 fisubr
IEMIMPL_FPU_R80_BY_I32 fidiv
IEMIMPL_FPU_R80_BY_I32 fidivr
2631
2632
2633;;
2634; FPU instruction working on one 80-bit and one 32-bit signed integer value,
2635; only returning FSW.
2636;
2637; @param 1 The instruction
2638;
2639; @param A0 FPU context (fxsave).
2640; @param A1 Where to store the output FSW.
2641; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
2643;
%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A2]                  ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1 dword [A3]                   ; compare against the int32 in memory.

        fnstsw word [A1]                ; only the FSW is returned (no value result).

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32_FSW ficom
2663
2664
2665
2666;
2667;---------------------- 64-bit signed integer operations ----------------------
2668;
2669
2670
2671;;
; Converts a 64-bit signed integer value to an 80-bit floating point one (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 64-bit signed integer value to convert.
2677;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
        PROLOGUE_3_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild qword [A2]

        fnstsw word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the fstp cannot raise them.
        fstp tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i64
2694
2695
2696;;
2697; Store a 80-bit floating point value (register) as a 64-bit signed integer (memory).
2698;
2699; @param A0 FPU context (fxsave).
2700; @param A1 Where to return the output FSW.
2701; @param A2 Where to store the 64-bit signed integer value.
2702; @param A3 Pointer to the 80-bit value.
2703;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp qword [A2]

        fnstsw word [A1]                ; return only the resulting FSW.

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i64
2719
2720
2721;;
2722; Store a 80-bit floating point value (register) as a 64-bit signed integer
2723; (memory) with truncation.
2724;
2725; @param A0 FPU context (fxsave).
2726; @param A1 Where to return the output FSW.
2727; @param A2 Where to store the 64-bit signed integer value.
2728; @param A3 Pointer to the 80-bit value.
2729;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp qword [A2]               ; truncating store (ignores the rounding control).

        fnstsw word [A1]                ; return only the resulting FSW.

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i64
2745
2746
2747
2748;
2749;---------------------- 32-bit floating point operations ----------------------
2750;
2751
2752;;
2753; Converts a 32-bit floating point value to a 80-bit one (fpu register).
2754;
2755; @param A0 FPU context (fxsave).
2756; @param A1 Pointer to a IEMFPURESULT for the output.
2757; @param A2 Pointer to the 32-bit floating point value to convert.
2758;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
        PROLOGUE_3_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld dword [A2]                  ; convert r32 -> r80 on load.

        fnstsw word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the fstp cannot raise them.
        fstp tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r32
2775
2776
2777;;
2778; Store a 80-bit floating point value (register) as a 32-bit one (memory).
2779;
2780; @param A0 FPU context (fxsave).
2781; @param A1 Where to return the output FSW.
2782; @param A2 Where to store the 32-bit value.
2783; @param A3 Pointer to the 80-bit value.
2784;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fst dword [A2]                  ; r80 -> r32 conversion honouring guest rounding control.

        fnstsw word [A1]                ; return only the resulting FSW.

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r32
2800
2801
2802;;
2803; FPU instruction working on one 80-bit and one 32-bit floating point value.
2804;
2805; @param 1 The instruction
2806;
2807; @param A0 FPU context (fxsave).
2808; @param A1 Pointer to a IEMFPURESULT for the output.
2809; @param A2 Pointer to the 80-bit value.
2810; @param A3 Pointer to the 32-bit value.
2811;
%macro IEMIMPL_FPU_R80_BY_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A2]                  ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1 dword [A3]                   ; e.g. fadd dword [A3] -> ST0 op= r32.

        fnstsw word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the fstp cannot raise them.
        fstp tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32 fadd
IEMIMPL_FPU_R80_BY_R32 fmul
IEMIMPL_FPU_R80_BY_R32 fsub
IEMIMPL_FPU_R80_BY_R32 fsubr
IEMIMPL_FPU_R80_BY_R32 fdiv
IEMIMPL_FPU_R80_BY_R32 fdivr
2838
2839
2840;;
2841; FPU instruction working on one 80-bit and one 32-bit floating point value,
2842; only returning FSW.
2843;
2844; @param 1 The instruction
2845;
2846; @param A0 FPU context (fxsave).
2847; @param A1 Where to store the output FSW.
2848; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
2850;
%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A2]                  ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1 dword [A3]                   ; compare against the r32 in memory.

        fnstsw word [A1]                ; only the FSW is returned (no value result).

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32_FSW fcom
2870
2871
2872
2873;
2874;---------------------- 64-bit floating point operations ----------------------
2875;
2876
2877;;
2878; Converts a 64-bit floating point value to a 80-bit one (fpu register).
2879;
2880; @param A0 FPU context (fxsave).
2881; @param A1 Pointer to a IEMFPURESULT for the output.
2882; @param A2 Pointer to the 64-bit floating point value to convert.
2883;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
        PROLOGUE_3_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; was missing: reset to a clean FPU state like the r32/r80/d80 siblings,
                                        ; so stale host stack/exception state can't affect the load below.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld qword [A2]                  ; convert r64 -> r80 on load.

        fnstsw word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the fstp cannot raise them.
        fstp tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r64
2899
2900
2901;;
2902; Store a 80-bit floating point value (register) as a 64-bit one (memory).
2903;
2904; @param A0 FPU context (fxsave).
2905; @param A1 Where to return the output FSW.
2906; @param A2 Where to store the 64-bit value.
2907; @param A3 Pointer to the 80-bit value.
2908;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fst qword [A2]                  ; r80 -> r64 conversion honouring guest rounding control.

        fnstsw word [A1]                ; return only the resulting FSW.

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r64
2924
2925
2926;;
2927; FPU instruction working on one 80-bit and one 64-bit floating point value.
2928;
2929; @param 1 The instruction
2930;
2931; @param A0 FPU context (fxsave).
2932; @param A1 Pointer to a IEMFPURESULT for the output.
2933; @param A2 Pointer to the 80-bit value.
2934; @param A3 Pointer to the 64-bit value.
2935;
%macro IEMIMPL_FPU_R80_BY_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A2]                  ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1 qword [A3]                   ; e.g. fadd qword [A3] -> ST0 op= r64.

        fnstsw word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the fstp cannot raise them.
        fstp tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64 fadd
IEMIMPL_FPU_R80_BY_R64 fmul
IEMIMPL_FPU_R80_BY_R64 fsub
IEMIMPL_FPU_R80_BY_R64 fsubr
IEMIMPL_FPU_R80_BY_R64 fdiv
IEMIMPL_FPU_R80_BY_R64 fdivr
2962
2963;;
2964; FPU instruction working on one 80-bit and one 64-bit floating point value,
2965; only returning FSW.
2966;
2967; @param 1 The instruction
2968;
2969; @param A0 FPU context (fxsave).
2970; @param A1 Where to store the output FSW.
2971; @param A2 Pointer to the 80-bit value.
2972; @param A3 Pointer to the 64-bit value.
2973;
%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A2]                  ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1 qword [A3]                   ; compare against the r64 in memory.

        fnstsw word [A1]                ; only the FSW is returned (no value result).

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64_FSW fcom
2993
2994
2995
2996;
2997;---------------------- 80-bit floating point operations ----------------------
2998;
2999
3000;;
3001; Loads a 80-bit floating point register value from memory.
3002;
3003; @param A0 FPU context (fxsave).
3004; @param A1 Pointer to a IEMFPURESULT for the output.
3005; @param A2 Pointer to the 80-bit floating point value to load.
3006;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
        PROLOGUE_3_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld tword [A2]                  ; full 80-bit load, no conversion.

        fnstsw word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the fstp cannot raise them.
        fstp tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r80
3023
3024
3025;;
3026; Store a 80-bit floating point register to memory
3027;
3028; @param A0 FPU context (fxsave).
3029; @param A1 Where to return the output FSW.
3030; @param A2 Where to store the 80-bit value.
3031; @param A3 Pointer to the 80-bit register value.
3032;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fstp tword [A2]                 ; full 80-bit store, no conversion.

        fnstsw word [A1]                ; return only the resulting FSW.

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r80
3048
3049
3050;;
3051; Loads an 80-bit floating point register value in BCD format from memory.
3052;
3053; @param A0 FPU context (fxsave).
3054; @param A1 Pointer to a IEMFPURESULT for the output.
3055; @param A2 Pointer to the 80-bit BCD value to load.
3056;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
        PROLOGUE_3_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fbld tword [A2]                 ; packed BCD -> r80 conversion on load.

        fnstsw word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the fstp cannot raise them.
        fstp tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_d80
3073
3074
3075;;
3076; Store a 80-bit floating point register to memory as BCD
3077;
3078; @param A0 FPU context (fxsave).
3079; @param A1 Where to return the output FSW.
3080; @param A2 Where to store the 80-bit BCD value.
3081; @param A3 Pointer to the 80-bit register value.
3082;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fbstp tword [A2]                ; r80 -> packed BCD conversion on store.

        fnstsw word [A1]                ; return only the resulting FSW.

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_d80
3098
3099
3100;;
3101; FPU instruction working on two 80-bit floating point values.
3102;
3103; @param 1 The instruction
3104;
3105; @param A0 FPU context (fxsave).
3106; @param A1 Pointer to a IEMFPURESULT for the output.
3107; @param A2 Pointer to the first 80-bit value (ST0)
3108; @param A3 Pointer to the second 80-bit value (STn).
3109;
%macro IEMIMPL_FPU_R80_BY_R80 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A3]                  ; load second operand first so it ends up in ST1...
        fld tword [A2]                  ; ...and the first operand in ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1 %2                           ; %2 supplies the operand list ({st0, st1} or empty).

        fnstsw word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the fstp cannot raise them.
        fstp tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fmul, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsub, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsubr, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdiv, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdivr, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fprem, {}
IEMIMPL_FPU_R80_BY_R80 fprem1, {}
IEMIMPL_FPU_R80_BY_R80 fscale, {}
3140
3141
3142;;
3143; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
3144; storing the result in ST1 and popping the stack.
3145;
3146; @param 1 The instruction
3147;
3148; @param A0 FPU context (fxsave).
3149; @param A1 Pointer to a IEMFPURESULT for the output.
3150; @param A2 Pointer to the first 80-bit value (ST1).
3151; @param A3 Pointer to the second 80-bit value (ST0).
3152;
%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A2]                  ; ST1 operand loaded first...
        fld tword [A3]                  ; ...then the ST0 operand on top.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1                              ; instruction pops ST0, leaving the result in (new) ST0.

        fnstsw word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the fstp cannot raise them.
        fstp tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
3177
3178
3179;;
3180; FPU instruction working on two 80-bit floating point values, only
3181; returning FSW.
3182;
3183; @param 1 The instruction
3184;
3185; @param A0 FPU context (fxsave).
3186; @param A1 Pointer to a uint16_t for the resulting FSW.
3187; @param A2 Pointer to the first 80-bit value.
3188; @param A3 Pointer to the second 80-bit value.
3189;
%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A3]                  ; second operand -> ST1.
        fld tword [A2]                  ; first operand -> ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1 st0, st1

        fnstsw word [A1]                ; only the FSW (condition codes) is returned.

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_FSW fcom
IEMIMPL_FPU_R80_BY_R80_FSW fucom
3211
3212
3213;;
3214; FPU instruction working on two 80-bit floating point values,
3215; returning FSW and EFLAGS (eax).
3216;
3217; @param 1 The instruction
3218;
3219; @returns EFLAGS in EAX.
3220; @param A0 FPU context (fxsave).
3221; @param A1 Pointer to a uint16_t for the resulting FSW.
3222; @param A2 Pointer to the first 80-bit value.
3223; @param A3 Pointer to the second 80-bit value.
3224;
%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A3]                  ; second operand -> ST1.
        fld tword [A2]                  ; first operand -> ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1 st1                          ; fcomi/fucomi compare ST0 with ST1 and set EFLAGS.

        fnstsw word [A1]
        pushf                           ; return the EFLAGS produced by the comparison in xAX.
        pop xAX

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_EFL fcomi
IEMIMPL_FPU_R80_BY_R80_EFL fucomi
3248
3249
3250;;
3251; FPU instruction working on one 80-bit floating point value.
3252;
3253; @param 1 The instruction
3254;
3255; @param A0 FPU context (fxsave).
3256; @param A1 Pointer to a IEMFPURESULT for the output.
3257; @param A2 Pointer to the 80-bit value.
3258;
%macro IEMIMPL_FPU_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A2]                  ; ST0 = the operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1                              ; unary operation on ST0.

        fnstsw word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the fstp cannot raise them.
        fstp tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80 fchs
IEMIMPL_FPU_R80 fabs
IEMIMPL_FPU_R80 f2xm1
IEMIMPL_FPU_R80 fsqrt
IEMIMPL_FPU_R80 frndint
IEMIMPL_FPU_R80 fsin
IEMIMPL_FPU_R80 fcos
3286
3287
3288;;
3289; FPU instruction working on one 80-bit floating point value, only
3290; returning FSW.
3291;
3292; @param 1 The instruction
3293; @param 2 Non-zero to also restore FTW.
3294;
3295; @param A0 FPU context (fxsave).
3296; @param A1 Pointer to a uint16_t for the resulting FSW.
3297; @param A2 Pointer to the 80-bit value.
3298;
%macro IEMIMPL_FPU_R80_FSW 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A2]                  ; ST0 = the operand.
%if %2 != 0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0 ; also restore the ST0 tag (fxam reports empty regs).
%else
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
%endif
        %1

        fnstsw word [A1]                ; only the FSW (condition codes) is returned.

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80_FSW ftst, 0
IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.
3323
3324
3325
3326;;
3327; FPU instruction loading a 80-bit floating point constant.
3328;
3329; @param 1 The instruction
3330;
3331; @param A0 FPU context (fxsave).
3332; @param A1 Pointer to a IEMFPURESULT for the output.
3333;
%macro IEMIMPL_FPU_R80_CONST 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
        PROLOGUE_2_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1                              ; push the constant (fld1/fldpi/...); rounding control may matter.

        fnstsw word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions so the fstp cannot raise them.
        fstp tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+
%endmacro

IEMIMPL_FPU_R80_CONST fld1
IEMIMPL_FPU_R80_CONST fldl2t
IEMIMPL_FPU_R80_CONST fldl2e
IEMIMPL_FPU_R80_CONST fldpi
IEMIMPL_FPU_R80_CONST fldlg2
IEMIMPL_FPU_R80_CONST fldln2
IEMIMPL_FPU_R80_CONST fldz
3360
3361
3362;;
3363; FPU instruction working on one 80-bit floating point value, outputing two.
3364;
3365; @param 1 The instruction
3366;
3367; @param A0 FPU context (fxsave).
3368; @param A1 Pointer to a IEMFPURESULTTWO for the output.
3369; @param A2 Pointer to the 80-bit value.
3370;
%macro IEMIMPL_FPU_R80_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
        PROLOGUE_3_ARGS
        sub xSP, 20h                    ; scratch for the fnstenv/fldenv area.

        fninit                          ; start from a clean FPU state.
        fld tword [A2]                  ; ST0 = the operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1                              ; produces two values: result2 in ST0, result1 in ST1.

        fnstsw word [A1 + IEMFPURESULTTWO.FSW]
        fnclex                          ; clear pending exceptions before each store.
        fstp tword [A1 + IEMFPURESULTTWO.r80Result2]
        fnclex
        fstp tword [A1 + IEMFPURESULTTWO.r80Result1]

        fninit                          ; don't leak guest FPU state back to the host.
        add xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
%endmacro

IEMIMPL_FPU_R80_R80 fptan
IEMIMPL_FPU_R80_R80 fxtract
IEMIMPL_FPU_R80_R80 fsincos
3396
3397
3398
3399
3400;---------------------- SSE and MMX Operations ----------------------
3401
3402;; @todo what do we need to do for MMX?
; Currently empty placeholders - see the @todo notes above; kept so call sites
; have a hook if MMX/SSE state save/restore turns out to be needed.
%macro IEMIMPL_MMX_PROLOGUE 0
%endmacro
%macro IEMIMPL_MMX_EPILOGUE 0
%endmacro

;; @todo what do we need to do for SSE?
%macro IEMIMPL_SSE_PROLOGUE 0
%endmacro
%macro IEMIMPL_SSE_EPILOGUE 0
%endmacro
3413
3414
3415;;
3416; Media instruction working on two full sized registers.
3417;
3418; @param 1 The instruction
3419;
3420; @param A0 FPU context (fxsave).
3421; @param A1 Pointer to the first media register size operand (input/output).
3422; @param A2 Pointer to the second media register size operand (input).
3423;
%macro IEMIMPL_MEDIA_F2 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq mm0, [A1]                  ; first operand (input/output).
        movq mm1, [A2]                  ; second operand (input).
        %1 mm0, mm1
        movq [A1], mm0                  ; write the result back.

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu xmm0, [A1]               ; unaligned loads: no alignment guarantee on the pointers.
        movdqu xmm1, [A2]
        %1 xmm0, xmm1
        movdqu [A1], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F2 pxor
IEMIMPL_MEDIA_F2 pcmpeqb
IEMIMPL_MEDIA_F2 pcmpeqw
IEMIMPL_MEDIA_F2 pcmpeqd
3456
3457
3458;;
3459; Media instruction working on one full sized and one half sized register (lower half).
3460;
3461; @param 1 The instruction
3462; @param 2 1 if MMX is included, 0 if not.
3463;
3464; @param A0 FPU context (fxsave).
3465; @param A1 Pointer to the first full sized media register operand (input/output).
3466; @param A2 Pointer to the second half sized media register operand (input).
3467;
; The source loads mirror the PUNPCKL* memory operand forms: the MMX worker
; reads only the low 32 bits of the source (mm, mm/m32) and the SSE worker
; only the low 64 bits (xmm, xmm/m64); the rest of the host register is
; zeroed by movd/movq but not consumed by the instruction.
3468%macro IEMIMPL_MEDIA_F1L1 2
3469 %if %2 != 0
3470BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
3471 PROLOGUE_3_ARGS
3472 IEMIMPL_MMX_PROLOGUE
3473
3474 movq mm0, [A1] ; load full destination operand
3475 movd mm1, [A2] ; only the low 32 bits of the source are used
3476 %1 mm0, mm1
3477 movq [A1], mm0 ; write back the result
3478
3479 IEMIMPL_MMX_EPILOGUE
3480 EPILOGUE_3_ARGS
3481ENDPROC iemAImpl_ %+ %1 %+ _u64
3482 %endif
3483
3484BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
3485 PROLOGUE_3_ARGS
3486 IEMIMPL_SSE_PROLOGUE
3487
3488 movdqu xmm0, [A1] ; unaligned-safe load of the destination
3489 movq xmm1, [A2] ; only the low 64 bits of the source are used
3490 %1 xmm0, xmm1
3491 movdqu [A1], xmm0 ; write back the result
3492
3493 IEMIMPL_SSE_EPILOGUE
3494 EPILOGUE_3_ARGS
3495ENDPROC iemAImpl_ %+ %1 %+ _u128
3496%endmacro
3497
; Low-half unpack/interleave instructions; MMX variants exist for all of
; these except punpcklqdq, which is SSE2-only (second macro argument = 0).
3498IEMIMPL_MEDIA_F1L1 punpcklbw, 1
3499IEMIMPL_MEDIA_F1L1 punpcklwd, 1
3500IEMIMPL_MEDIA_F1L1 punpckldq, 1
3501IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
3502
3503
3504;;
3505; Media instruction working on one full sized and one half sized register (high half).
3506;
3507; @param 1 The instruction
3508; @param 2 1 if MMX is included, 0 if not.
3509;
3510; @param A0 FPU context (fxsave).
3511; @param A1 Pointer to the first full sized media register operand (input/output).
3512; @param A2 Pointer to the second full sized media register operand, where we
3513; will only use the upper half (input).
3514;
; Unlike IEMIMPL_MEDIA_F1L1 this loads the *entire* source operand (movq /
; movdqu) because the instruction consumes its upper half; a partial load
; would feed it zeros.
3515%macro IEMIMPL_MEDIA_F1H1 2
3516 %if %2 != 0
3517BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
3518 PROLOGUE_3_ARGS
3519 IEMIMPL_MMX_PROLOGUE
3520
3521 movq mm0, [A1] ; load full destination operand
3522 movq mm1, [A2] ; full 64-bit load - the high half is what gets used
3523 %1 mm0, mm1
3524 movq [A1], mm0 ; write back the result
3525
3526 IEMIMPL_MMX_EPILOGUE
3527 EPILOGUE_3_ARGS
3528ENDPROC iemAImpl_ %+ %1 %+ _u64
3529 %endif
3530
3531BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
3532 PROLOGUE_3_ARGS
3533 IEMIMPL_SSE_PROLOGUE
3534
3535 movdqu xmm0, [A1] ; unaligned-safe load of the destination
3536 movdqu xmm1, [A2] ; full 128-bit load - the high half is what gets used
3537 %1 xmm0, xmm1
3538 movdqu [A1], xmm0 ; write back the result
3539
3540 IEMIMPL_SSE_EPILOGUE
3541 EPILOGUE_3_ARGS
3542ENDPROC iemAImpl_ %+ %1 %+ _u128
3543%endmacro
3544
; High-half unpack/interleave instructions; MMX variants exist for all of
; these except punpckhqdq, which is SSE2-only (second macro argument = 0).
;
; These must be instantiated with IEMIMPL_MEDIA_F1H1, not IEMIMPL_MEDIA_F1L1:
; the F1L1 MMX body loads the source with movd (low 32 bits only, upper half
; zeroed), while PUNPCKH* consumes the *high* half of a full 64-bit source
; operand (mm, mm/m64), so the F1H1 body's full movq load is required for
; correct _u64 results.
3545IEMIMPL_MEDIA_F1H1 punpckhbw, 1
3546IEMIMPL_MEDIA_F1H1 punpckhwd, 1
3547IEMIMPL_MEDIA_F1H1 punpckhdq, 1
3548IEMIMPL_MEDIA_F1H1 punpckhqdq, 0
3549
3550
3551;
3552; Shufflers with evil 8-bit immediates.
3553;
3554
;;
; PSHUFW with the imm8 shuffle selector supplied at runtime.
;
; The immediate cannot be encoded dynamically, so we dispatch into a table of
; 256 pre-generated 'pshufw mm0, mm1, <imm> / ret' stubs, 5 bytes each,
; indexed by the immediate value.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to the destination operand (input/output).
; @param A2 Pointer to the source operand (input).
; @param A3 The 8-bit immediate (shuffle selector).
;
3555BEGINPROC_FASTCALL iemAImpl_pshufw, 16
3556 PROLOGUE_4_ARGS
3557 IEMIMPL_MMX_PROLOGUE
3558
3559 movq mm0, [A1]
3560 movq mm1, [A2]
3561 lea T0, [A3 + A3*4] ; sizeof(pshufw+ret) == 5
3562 lea T1, [.imm0 xWrtRIP]
3563 lea T1, [T1 + T0] ; T1 = &.imm0 + immediate * 5
3564 call T1 ; invoke the stub for this immediate
3565 movq [A1], mm0
3566
3567 IEMIMPL_MMX_EPILOGUE
3568 EPILOGUE_4_ARGS
3569%assign bImm 0
3570%rep 256
3571.imm %+ bImm:
3572 pshufw mm0, mm1, bImm
3573 ret
3574 %assign bImm bImm + 1
3575%endrep
3576.immEnd: ; 256*5 == 0x500
3577dw 0xfaff + (.immEnd - .imm0) ; will cause warning if entries are too big.
3578dw 0x104ff - (.immEnd - .imm0) ; will cause warning if entries are too small.
3579ENDPROC iemAImpl_pshufw
3580
3581
;;
; Generates an SSE shuffle worker (pshufhw/pshuflw/pshufd) taking the imm8
; shuffle selector at runtime. Like iemAImpl_pshufw this dispatches into a
; table of 256 pre-generated '<insn> xmm0, xmm1, <imm> / ret' stubs, 6 bytes
; each, indexed by the immediate value.
;
; @param 1 The instruction.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to the destination operand (input/output).
; @param A2 Pointer to the source operand (input).
; @param A3 The 8-bit immediate (shuffle selector).
;
3582%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
3583BEGINPROC_FASTCALL iemAImpl_ %+ %1, 16
3584 PROLOGUE_4_ARGS
3585 IEMIMPL_SSE_PROLOGUE
3586
3587 movdqu xmm0, [A1]
3588 movdqu xmm1, [A2]
3589 lea T1, [.imm0 xWrtRIP]
3590 lea T0, [A3 + A3*2] ; sizeof(pshufXX+ret) == 6: (A3 * 3) *2
3591 lea T1, [T1 + T0*2] ; T1 = &.imm0 + immediate * 6
3592 call T1 ; invoke the stub for this immediate
3593 movdqu [A1], xmm0
3594
3595 IEMIMPL_SSE_EPILOGUE
3596 EPILOGUE_4_ARGS
3597 %assign bImm 0
3598 %rep 256
3599.imm %+ bImm:
3600 %1 xmm0, xmm1, bImm
3601 ret
3602 %assign bImm bImm + 1
3603 %endrep
3604.immEnd: ; 256*6 == 0x600
3605dw 0xf9ff + (.immEnd - .imm0) ; will cause warning if entries are too big.
3606dw 0x105ff - (.immEnd - .imm0) ; will cause warning if entries are too small.
3607ENDPROC iemAImpl_ %+ %1
3608%endmacro
3609
; Instantiate the SSE shuffle workers: iemAImpl_pshufhw, iemAImpl_pshuflw
; and iemAImpl_pshufd.
3610IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
3611IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
3612IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
3613
3614
3615;
3616; Move byte mask.
3617;
3618
;;
; PMOVMSKB on a 64-bit (MMX) source operand: stores the byte-sign mask of
; the source to the 64-bit destination, zero-extended.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to the 64-bit destination (output).
; @param A2 Pointer to the 64-bit source operand (input).
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A2]
        pmovmskb T0, mm1        ; fully overwrites T0 (zero-extending), so the
                                ; old 'mov T0, [A1]' preload was dead and is gone
        mov     [A1], T0
%ifdef RT_ARCH_X86
        mov     dword [A1 + 4], 0 ; T0 is only 32 bits on x86 hosts; clear the
                                ; high dword of the 64-bit destination by hand
%endif
        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_pmovmskb_u64
3633
;;
; PMOVMSKB on a 128-bit (SSE) source operand: stores the byte-sign mask of
; the source to the 64-bit destination, zero-extended.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to the 64-bit destination (output).
; @param A2 Pointer to the 128-bit source operand (input).
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A2]      ; unaligned-safe load of the source
        pmovmskb T0, xmm1       ; fully overwrites T0 (zero-extending), so the
                                ; old 'mov T0, [A1]' preload was dead and is gone
        mov     [A1], T0
%ifdef RT_ARCH_X86
        mov     dword [A1 + 4], 0 ; T0 is only 32 bits on x86 hosts; clear the
                                ; high dword of the 64-bit destination by hand
%endif
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_pmovmskb_u128
3648
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette