VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 94169

Last change on this file since 94169 was 94164, checked in by vboxsync, 3 years ago

VMM/IEM: fixed bug in cmpxchg16b worker for gcc targets.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 92.0 KB
Line 
1; $Id: IEMAllAImpl.asm 94164 2022-03-11 09:05:10Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2022 Oracle Corporation
8;
9; This file is part of VirtualBox Open Source Edition (OSE), as
10; available from http://www.virtualbox.org. This file is free software;
11; you can redistribute it and/or modify it under the terms of the GNU
12; General Public License (GPL) as published by the Free Software
13; Foundation, in version 2 as it comes in the "COPYING" file of the
14; VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15; hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16;
17
18
19;*********************************************************************************************************************************
20;* Header Files *
21;*********************************************************************************************************************************
22%include "VBox/asmdefs.mac"
23%include "VBox/err.mac"
24%include "iprt/x86.mac"
25
26
27;*********************************************************************************************************************************
28;* Defined Constants And Macros *
29;*********************************************************************************************************************************
30
31;;
32; RET XX / RET wrapper for fastcall.
33;
; @param 1 The number of argument bytes to pop off the stack on return.
;          Only Windows/x86 uses the callee-cleanup 'ret NN' form; all other
;          targets emit a plain RET and ignore the parameter.
;
34%macro RET_FASTCALL 1
35%ifdef RT_ARCH_X86
36 %ifdef RT_OS_WINDOWS
37 ret %1 ; Windows x86 fastcall: callee pops the stack arguments.
38 %else
39 ret ; Other 32-bit hosts: caller cleans up.
40 %endif
41%else
42 ret ; 64-bit: arguments are in registers, nothing to pop.
43%endif
44%endmacro
45
46;;
47; NAME for fastcall functions.
48;
; @param a_Name   The C function name.
; @param a_cbArgs The size (bytes) of the stack arguments on x86; used to
;                 build the decorated '@name@NN' symbol on Windows/x86.
; @param a_Prefix Symbol prefix workaround for yasm (see @todo below).
;
49;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
50; escaping (or whatever the dollar is good for here). Thus the ugly
51; prefix argument.
52;
; Default: the plain platform-mangled name; overridden below for the
; decorated Windows/x86 fastcall form.
53%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
54%ifdef RT_ARCH_X86
55 %ifdef RT_OS_WINDOWS
56 %undef NAME_FASTCALL
57 %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
58 %endif
59%endif
60
61;;
62; BEGINPROC for fastcall functions.
63;
64; @param 1 The function name (C).
65; @param 2 The argument size on x86.
66;
; Emits the object-format specific export/global directives for the
; (possibly decorated) fastcall symbol and opens the procedure by defining
; its label.
;
67%macro BEGINPROC_FASTCALL 2
68 %ifdef ASM_FORMAT_PE
69 export %1=NAME_FASTCALL(%1,%2,$@) ; PE: export under the decorated name.
70 %endif
71 %ifdef __NASM__
72 %ifdef ASM_FORMAT_OMF
73 export NAME(%1) NAME_FASTCALL(%1,%2,$@) ; OMF: export alias for the decorated name.
74 %endif
75 %endif
76 %ifndef ASM_FORMAT_BIN
77 global NAME_FASTCALL(%1,%2,$@) ; Make the symbol visible to the linker.
78 %endif
79NAME_FASTCALL(%1,%2,@): ; The actual procedure label.
80%endmacro
81
82
83;
84; We employ some macro assembly here to hide the calling convention differences.
85;
; A0..A3 are the four argument registers and T0..T2 scratch registers; the
; PROLOGUE/EPILOGUE_n_ARGS macros load stack-passed arguments and preserve
; callee-saved registers as each convention requires.
;
86%ifdef RT_ARCH_AMD64
; AMD64: all four arguments arrive in registers, so the prologues are empty
; and the epilogues are plain returns.
87 %macro PROLOGUE_1_ARGS 0
88 %endmacro
89 %macro EPILOGUE_1_ARGS 0
90 ret
91 %endmacro
92 %macro EPILOGUE_1_ARGS_EX 0
93 ret
94 %endmacro
95
96 %macro PROLOGUE_2_ARGS 0
97 %endmacro
98 %macro EPILOGUE_2_ARGS 0
99 ret
100 %endmacro
101 %macro EPILOGUE_2_ARGS_EX 1
102 ret
103 %endmacro
104
105 %macro PROLOGUE_3_ARGS 0
106 %endmacro
107 %macro EPILOGUE_3_ARGS 0
108 ret
109 %endmacro
110 %macro EPILOGUE_3_ARGS_EX 1
111 ret
112 %endmacro
113
114 %macro PROLOGUE_4_ARGS 0
115 %endmacro
116 %macro EPILOGUE_4_ARGS 0
117 ret
118 %endmacro
119 %macro EPILOGUE_4_ARGS_EX 1
120 ret
121 %endmacro
122
; System V AMD64 (gcc targets): args in rdi, rsi, rdx, rcx.
123 %ifdef ASM_CALL64_GCC
124 %define A0 rdi
125 %define A0_32 edi
126 %define A0_16 di
127 %define A0_8 dil
128
129 %define A1 rsi
130 %define A1_32 esi
131 %define A1_16 si
132 %define A1_8 sil
133
134 %define A2 rdx
135 %define A2_32 edx
136 %define A2_16 dx
137 %define A2_8 dl
138
139 %define A3 rcx
140 %define A3_32 ecx
141 %define A3_16 cx
142 %endif
143
; Microsoft x64: args in rcx, rdx, r8, r9.
144 %ifdef ASM_CALL64_MSC
145 %define A0 rcx
146 %define A0_32 ecx
147 %define A0_16 cx
148 %define A0_8 cl
149
150 %define A1 rdx
151 %define A1_32 edx
152 %define A1_16 dx
153 %define A1_8 dl
154
155 %define A2 r8
156 %define A2_32 r8d
157 %define A2_16 r8w
158 %define A2_8 r8b
159
160 %define A3 r9
161 %define A3_32 r9d
162 %define A3_16 r9w
163 %endif
164
; Scratch registers - all volatile in both 64-bit conventions, so no saving
; is needed.
165 %define T0 rax
166 %define T0_32 eax
167 %define T0_16 ax
168 %define T0_8 al
169
170 %define T1 r11
171 %define T1_32 r11d
172 %define T1_16 r11w
173 %define T1_8 r11b
174
175 %define T2 r10 ; only AMD64
176 %define T2_32 r10d
177 %define T2_16 r10w
178 %define T2_8 r10b
179
180%else
181 ; x86
; 32-bit fastcall: A0/A1 in ecx/edx, A2/A3 loaded from the stack into the
; callee-saved ebx/esi, which the prologues/epilogues preserve. T1 (edi) is
; likewise callee-saved and pushed/popped by every prologue/epilogue pair.
182 %macro PROLOGUE_1_ARGS 0
183 push edi
184 %endmacro
185 %macro EPILOGUE_1_ARGS 0
186 pop edi
187 ret 0
188 %endmacro
189 %macro EPILOGUE_1_ARGS_EX 1
190 pop edi
191 ret %1
192 %endmacro
193
194 %macro PROLOGUE_2_ARGS 0
195 push edi
196 %endmacro
197 %macro EPILOGUE_2_ARGS 0
198 pop edi
199 ret 0
200 %endmacro
201 %macro EPILOGUE_2_ARGS_EX 1
202 pop edi
203 ret %1
204 %endmacro
205
206 %macro PROLOGUE_3_ARGS 0
207 push ebx
208 mov ebx, [esp + 4 + 4] ; A2 = 3rd argument (above return addr + saved ebx).
209 push edi
210 %endmacro
211 %macro EPILOGUE_3_ARGS_EX 1
212 %if (%1) < 4
213 %error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
214 %endif
215 pop edi
216 pop ebx
217 ret %1
218 %endmacro
219 %macro EPILOGUE_3_ARGS 0
220 EPILOGUE_3_ARGS_EX 4
221 %endmacro
222
223 %macro PROLOGUE_4_ARGS 0
224 push ebx
225 push edi
226 push esi
227 mov ebx, [esp + 12 + 4 + 0] ; A2 = 3rd argument (above 3 saves + return addr).
228 mov esi, [esp + 12 + 4 + 4] ; A3 = 4th argument.
229 %endmacro
230 %macro EPILOGUE_4_ARGS_EX 1
231 %if (%1) < 8
232 %error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
233 %endif
234 pop esi
235 pop edi
236 pop ebx
237 ret %1
238 %endmacro
239 %macro EPILOGUE_4_ARGS 0
240 EPILOGUE_4_ARGS_EX 8
241 %endmacro
242
243 %define A0 ecx
244 %define A0_32 ecx
245 %define A0_16 cx
246 %define A0_8 cl
247
248 %define A1 edx
249 %define A1_32 edx
250 %define A1_16 dx
251 %define A1_8 dl
252
253 %define A2 ebx
254 %define A2_32 ebx
255 %define A2_16 bx
256 %define A2_8 bl
257
258 %define A3 esi
259 %define A3_32 esi
260 %define A3_16 si
261
262 %define T0 eax
263 %define T0_32 eax
264 %define T0_16 ax
265 %define T0_8 al
266
267 %define T1 edi
268 %define T1_32 edi
269 %define T1_16 di
270%endif
271
272
273;;
274; Load the relevant flags from [%1] if there are undefined flags (%3).
275;
276; @remarks Clobbers T0, stack. Changes EFLAGS.
277; @param A2 The register pointing to the flags.
278; @param 1 The parameter (A0..A3) pointing to the eflags.
279; @param 2 The set of modified flags.
280; @param 3 The set of undefined flags.
281;
; Merges the guest's modified+undefined flag bits into the host EFLAGS so
; the instruction that follows starts from the guest flag state.
;
282%macro IEM_MAYBE_LOAD_FLAGS 3
283 ;%if (%3) != 0 ; Note: conditional is disabled - flags are currently loaded unconditionally.
284 pushf ; store current flags
285 mov T0_32, [%1] ; load the guest flags
286 and dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
287 and T0_32, (%2 | %3) ; select the modified and undefined flags.
288 or [xSP], T0 ; merge guest flags with host flags.
289 popf ; load the mixed flags.
290 ;%endif
291%endmacro
292
293;;
294; Update the eflags at [%1] with the modified and undefined flag state of the host EFLAGS.
295;
296; @remarks Clobbers T0, T1, stack.
297; @param 1 The register pointing to the EFLAGS.
298; @param 2 The mask of modified flags to save.
299; @param 3 The mask of undefined flags to (maybe) save.
300;
301%macro IEM_SAVE_FLAGS 3
302 %if (%2 | %3) != 0
303 pushf ; T1 = host flags after the emulated instruction.
304 pop T1
305 mov T0_32, [%1] ; flags
306 and T0_32, ~(%2 | %3) ; clear the modified & undefined flags.
307 and T1_32, (%2 | %3) ; select the modified and undefined flags.
308 or T0_32, T1_32 ; combine the flags.
309 mov [%1], T0_32 ; save the flags.
310 %endif
311%endmacro
312
313;;
314; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
315;
316; @remarks Clobbers T0, T1, stack.
317; @param 1 The register pointing to the EFLAGS.
318; @param 2 The mask of modified flags to save.
319; @param 3 Mask of additional flags to always clear.
320; @param 4 Mask of additional flags to always set.
321;
; Like IEM_SAVE_FLAGS, but additionally forces the %3 bits clear and the %4
; bits set in the result regardless of the host flag state.
;
322%macro IEM_SAVE_AND_ADJUST_FLAGS 4
323 %if (%2 | %3 | %4) != 0
324 pushf ; T1 = host flags after the emulated instruction.
325 pop T1
326 mov T0_32, [%1] ; load flags.
327 and T0_32, ~(%2 | %3) ; clear the modified and always cleared flags.
328 and T1_32, (%2) ; select the modified flags.
329 or T0_32, T1_32 ; combine the flags.
330 %if (%4) != 0
331 or T0_32, %4 ; add the always set flags.
332 %endif
333 mov [%1], T0_32 ; save the result.
334 %endif
335%endmacro
336
337;;
338; Calculates the new EFLAGS using fixed clear and set bit masks.
339;
340; @remarks Clobbers T0.
341; @param 1 The register pointing to the EFLAGS.
342; @param 2 Mask of additional flags to always clear.
343; @param 3 Mask of additional flags to always set.
344;
; Unlike IEM_SAVE_FLAGS this does not read the host EFLAGS at all - it only
; applies the constant clear/set masks to the stored guest flags.
;
345%macro IEM_ADJUST_FLAGS 3
346 %if (%2 | %3) != 0
347 mov T0_32, [%1] ; Load flags.
348 %if (%2) != 0
349 and T0_32, ~(%2) ; Remove the always cleared flags.
350 %endif
351 %if (%3) != 0
352 or T0_32, %3 ; Add the always set flags.
353 %endif
354 mov [%1], T0_32 ; Save the result.
355 %endif
356%endmacro
357
358;;
359; Calculates the new EFLAGS using fixed clear and set bit masks, with PF
; taken from the g_afParity lookup table.
360;
361; @remarks Clobbers T0, %4.
362; @param 1 The register pointing to the EFLAGS.
363; @param 2 Mask of additional flags to always clear
364; @param 3 Mask of additional flags to always set.
365; @param 4 The (full) register containing the parity table index. Will be modified!
366;
367%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
368 mov T0_32, [%1] ; Load flags.
369 and T0_32, ~(%2 | X86_EFL_PF) ; Remove PF and the always cleared flags.
370 %if (%3) != 0
371 or T0_32, %3 ; Add the always set flags.
372 %endif
373 and %4, 0xff ; Reduce the index to the low byte of the result value.
374 %ifdef RT_ARCH_AMD64
375 lea T2, [NAME(g_afParity) xWrtRIP] ; RIP-relative table address (needed for 64-bit PIC).
376 or T0_8, [T2 + %4] ; OR in the PF value for this byte.
377 %else
378 or T0_8, [NAME(g_afParity) + %4] ; 32-bit: absolute addressing is fine.
379 %endif
380 mov [%1], T0_32 ; Save the result.
381%endmacro
382
383
384;*********************************************************************************************************************************
385;* External Symbols *
386;*********************************************************************************************************************************
387extern NAME(g_afParity)
388
389
390;;
391; Macro for implementing a binary operator.
392;
393; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
394; variants, except on 32-bit systems where the 64-bit accesses require hand
395; coding.
396;
397; All the functions take a pointer to the destination memory operand in A0,
398; the source register operand in A1 and a pointer to eflags in A2.
399;
400; @param 1 The instruction mnemonic.
401; @param 2 Non-zero if there should be a locked version.
402; @param 3 The modified flags.
403; @param 4 The undefined flags.
404;
405%macro IEMIMPL_BIN_OP 4
406BEGINCODE
407BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
408 PROLOGUE_3_ARGS
409 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
410 %1 byte [A0], A1_8 ; Let the host CPU do the actual work.
411 IEM_SAVE_FLAGS A2, %3, %4
412 EPILOGUE_3_ARGS
413ENDPROC iemAImpl_ %+ %1 %+ _u8
414
415BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
416 PROLOGUE_3_ARGS
417 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
418 %1 word [A0], A1_16
419 IEM_SAVE_FLAGS A2, %3, %4
420 EPILOGUE_3_ARGS
421ENDPROC iemAImpl_ %+ %1 %+ _u16
422
423BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
424 PROLOGUE_3_ARGS
425 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
426 %1 dword [A0], A1_32
427 IEM_SAVE_FLAGS A2, %3, %4
428 EPILOGUE_3_ARGS
429ENDPROC iemAImpl_ %+ %1 %+ _u32
430
431 %ifdef RT_ARCH_AMD64
432BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
433 PROLOGUE_3_ARGS
434 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
435 %1 qword [A0], A1
436 IEM_SAVE_FLAGS A2, %3, %4
437 EPILOGUE_3_ARGS_EX 8
438ENDPROC iemAImpl_ %+ %1 %+ _u64
439 %endif ; RT_ARCH_AMD64
440
441 %if %2 != 0 ; locked versions requested?
442
443BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
444 PROLOGUE_3_ARGS
445 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
446 lock %1 byte [A0], A1_8 ; Same as above, but with a bus lock.
447 IEM_SAVE_FLAGS A2, %3, %4
448 EPILOGUE_3_ARGS
449ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
450
451BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
452 PROLOGUE_3_ARGS
453 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
454 lock %1 word [A0], A1_16
455 IEM_SAVE_FLAGS A2, %3, %4
456 EPILOGUE_3_ARGS
457ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
458
459BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
460 PROLOGUE_3_ARGS
461 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
462 lock %1 dword [A0], A1_32
463 IEM_SAVE_FLAGS A2, %3, %4
464 EPILOGUE_3_ARGS
465ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
466
467 %ifdef RT_ARCH_AMD64
468BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
469 PROLOGUE_3_ARGS
470 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
471 lock %1 qword [A0], A1
472 IEM_SAVE_FLAGS A2, %3, %4
473 EPILOGUE_3_ARGS_EX 8
474ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
475 %endif ; RT_ARCH_AMD64
476 %endif ; locked
477%endmacro
478
479; instr, lock, modified-flags, undefined-flags.
480IEMIMPL_BIN_OP add, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
481IEMIMPL_BIN_OP adc, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
482IEMIMPL_BIN_OP sub, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
483IEMIMPL_BIN_OP sbb, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
484IEMIMPL_BIN_OP or, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
485IEMIMPL_BIN_OP xor, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
486IEMIMPL_BIN_OP and, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
487IEMIMPL_BIN_OP cmp, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0 ; no locked form: cmp does not write memory.
488IEMIMPL_BIN_OP test, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF ; no locked form: test does not write memory.
489
490
491;;
492; Macro for implementing a bit operator.
493;
494; This will generate code for the 16, 32 and 64 bit accesses with locked
495; variants, except on 32-bit systems where the 64-bit accesses require hand
496; coding.
497;
498; All the functions take a pointer to the destination memory operand in A0,
499; the source register operand (bit index) in A1 and a pointer to eflags in A2.
500;
501; @param 1 The instruction mnemonic.
502; @param 2 Non-zero if there should be a locked version.
503; @param 3 The modified flags.
504; @param 4 The undefined flags.
505;
; Note: no 8-bit variants - the bt/btc/bts/btr instructions have none.
;
506%macro IEMIMPL_BIT_OP 4
507BEGINCODE
508BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
509 PROLOGUE_3_ARGS
510 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
511 %1 word [A0], A1_16
512 IEM_SAVE_FLAGS A2, %3, %4
513 EPILOGUE_3_ARGS
514ENDPROC iemAImpl_ %+ %1 %+ _u16
515
516BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
517 PROLOGUE_3_ARGS
518 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
519 %1 dword [A0], A1_32
520 IEM_SAVE_FLAGS A2, %3, %4
521 EPILOGUE_3_ARGS
522ENDPROC iemAImpl_ %+ %1 %+ _u32
523
524 %ifdef RT_ARCH_AMD64
525BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
526 PROLOGUE_3_ARGS
527 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
528 %1 qword [A0], A1
529 IEM_SAVE_FLAGS A2, %3, %4
530 EPILOGUE_3_ARGS_EX 8
531ENDPROC iemAImpl_ %+ %1 %+ _u64
532 %endif ; RT_ARCH_AMD64
533
534 %if %2 != 0 ; locked versions requested?
535
536BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
537 PROLOGUE_3_ARGS
538 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
539 lock %1 word [A0], A1_16
540 IEM_SAVE_FLAGS A2, %3, %4
541 EPILOGUE_3_ARGS
542ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
543
544BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
545 PROLOGUE_3_ARGS
546 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
547 lock %1 dword [A0], A1_32
548 IEM_SAVE_FLAGS A2, %3, %4
549 EPILOGUE_3_ARGS
550ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
551
552 %ifdef RT_ARCH_AMD64
553BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
554 PROLOGUE_3_ARGS
555 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
556 lock %1 qword [A0], A1
557 IEM_SAVE_FLAGS A2, %3, %4
558 EPILOGUE_3_ARGS_EX 8
559ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
560 %endif ; RT_ARCH_AMD64
561 %endif ; locked
562%endmacro
; instr, lock, modified-flags, undefined-flags.
563IEMIMPL_BIT_OP bt, 0, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF) ; no locked form: bt does not write memory.
564IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
565IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
566IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
567
568;;
569; Macro for implementing a bit search operator.
570;
571; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
572; systems where the 64-bit accesses require hand coding.
573;
574; All the functions take a pointer to the destination memory operand in A0,
575; the source register operand in A1 and a pointer to eflags in A2.
576;
577; In the ZF case the destination register is 'undefined', however it seems that
578; both AMD and Intel just leave it as is. The undefined EFLAGS differs between
579; AMD and Intel and according to https://www.sandpile.org/x86/flags.htm between
580; Intel microarchitectures. We only implement 'intel' and 'amd' variation with
581; the behaviour of more recent CPUs (Intel 10980X and AMD 3990X).
582;
583; @param 1 The instruction mnemonic.
584; @param 2 The modified flags.
585; @param 3 The undefined flags.
586;
; Generates three flag-behaviour variants per width: generic (host flags),
; _intel (ZF-based fixed flags + PF from parity table) and _amd (only ZF
; modified). The destination is only written when the source was non-zero
; (ZF clear), matching the 'leave as is' behaviour noted above.
;
587%macro IEMIMPL_BIT_OP 3
588BEGINCODE
589BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
590 PROLOGUE_3_ARGS
591 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
592 %1 T0_16, A1_16
593 jz .unchanged_dst ; ZF set: source was zero, leave the destination alone.
594 mov [A0], T0_16
595.unchanged_dst:
596 IEM_SAVE_FLAGS A2, %2, %3
597 EPILOGUE_3_ARGS
598ENDPROC iemAImpl_ %+ %1 %+ _u16
599
600BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
601 PROLOGUE_3_ARGS
602 %1 T1_16, A1_16
603 jz .unchanged_dst
604 mov [A0], T1_16
605 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
606 EPILOGUE_3_ARGS
607.unchanged_dst:
608 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
609 EPILOGUE_3_ARGS
610ENDPROC iemAImpl_ %+ %1 %+ _u16_intel
611
612BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
613 PROLOGUE_3_ARGS
614 %1 T0_16, A1_16
615 jz .unchanged_dst
616 mov [A0], T0_16
617.unchanged_dst:
618 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
619 EPILOGUE_3_ARGS
620ENDPROC iemAImpl_ %+ %1 %+ _u16_amd
621
622
623BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
624 PROLOGUE_3_ARGS
625 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
626 %1 T0_32, A1_32
627 jz .unchanged_dst
628 mov [A0], T0_32
629.unchanged_dst:
630 IEM_SAVE_FLAGS A2, %2, %3
631 EPILOGUE_3_ARGS
632ENDPROC iemAImpl_ %+ %1 %+ _u32
633
634BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
635 PROLOGUE_3_ARGS
636 %1 T1_32, A1_32
637 jz .unchanged_dst
638 mov [A0], T1_32
639 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
640 EPILOGUE_3_ARGS
641.unchanged_dst:
642 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
643 EPILOGUE_3_ARGS
644ENDPROC iemAImpl_ %+ %1 %+ _u32_intel
645
646BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
647 PROLOGUE_3_ARGS
648 %1 T0_32, A1_32
649 jz .unchanged_dst
650 mov [A0], T0_32
651.unchanged_dst:
652 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
653 EPILOGUE_3_ARGS
654ENDPROC iemAImpl_ %+ %1 %+ _u32_amd
655
656
657 %ifdef RT_ARCH_AMD64
658
659BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
660 PROLOGUE_3_ARGS
661 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
662 %1 T0, A1
663 jz .unchanged_dst
664 mov [A0], T0
665.unchanged_dst:
666 IEM_SAVE_FLAGS A2, %2, %3
667 EPILOGUE_3_ARGS_EX 8
668ENDPROC iemAImpl_ %+ %1 %+ _u64
669
670BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
671 PROLOGUE_3_ARGS
672 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
673 %1 T1, A1
674 jz .unchanged_dst
675 mov [A0], T1
676 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
677 EPILOGUE_3_ARGS
678.unchanged_dst:
679 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
680 EPILOGUE_3_ARGS
681ENDPROC iemAImpl_ %+ %1 %+ _u64_intel
682
683BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
684 PROLOGUE_3_ARGS
685 %1 T0, A1
686 jz .unchanged_dst
687 mov [A0], T0
688.unchanged_dst:
689 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
690 EPILOGUE_3_ARGS_EX 8
691ENDPROC iemAImpl_ %+ %1 %+ _u64_amd
692
693 %endif ; RT_ARCH_AMD64
694%endmacro
695
; instr, modified-flags, undefined-flags.
696IEMIMPL_BIT_OP bsf, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF)
697IEMIMPL_BIT_OP bsr, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF)
698
699
700;
701; IMUL is also a similar but yet different case (no lock, no mem dst).
702; The rDX:rAX variant of imul is handled together with mul further down.
703;
; Note! The _intel and _amd entry points are simple label aliases falling
; through to the shared body - this form of imul behaves the same on both.
; A0 = pointer to the destination operand, A1 = source register value,
; A2 = pointer to eflags.
;
704BEGINCODE
705BEGINPROC_FASTCALL iemAImpl_imul_two_u16_intel, 12
706BEGINPROC_FASTCALL iemAImpl_imul_two_u16_amd, 12
707BEGINPROC_FASTCALL iemAImpl_imul_two_u16, 12
708 PROLOGUE_3_ARGS
709 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
710 imul A1_16, word [A0] ; two-operand imul: result lands in the register...
711 mov [A0], A1_16 ; ...and is stored back through the destination pointer.
712 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
713 EPILOGUE_3_ARGS
714ENDPROC iemAImpl_imul_two_u16
715
716BEGINPROC_FASTCALL iemAImpl_imul_two_u32_intel, 12
717BEGINPROC_FASTCALL iemAImpl_imul_two_u32_amd, 12
718BEGINPROC_FASTCALL iemAImpl_imul_two_u32, 12
719 PROLOGUE_3_ARGS
720 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
721 imul A1_32, dword [A0]
722 mov [A0], A1_32
723 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
724 EPILOGUE_3_ARGS
725ENDPROC iemAImpl_imul_two_u32
726
727%ifdef RT_ARCH_AMD64
728BEGINPROC_FASTCALL iemAImpl_imul_two_u64_intel, 16
729BEGINPROC_FASTCALL iemAImpl_imul_two_u64_amd, 16
730BEGINPROC_FASTCALL iemAImpl_imul_two_u64, 16
731 PROLOGUE_3_ARGS
732 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
733 imul A1, qword [A0]
734 mov [A0], A1
735 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
736 EPILOGUE_3_ARGS_EX 8
737ENDPROC iemAImpl_imul_two_u64
738%endif ; RT_ARCH_AMD64
739
740
741;
742; XCHG for memory operands. This implies locking. No flag changes.
743;
744; Each function takes two arguments, first the pointer to the memory,
745; then the pointer to the register. They all return void.
746;
; The _locked variants rely on xchg's implicit bus lock when a memory
; operand is involved; no explicit LOCK prefix is needed.
;
747BEGINCODE
748BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
749 PROLOGUE_2_ARGS
750 mov T0_8, [A1] ; T0 = register value.
751 xchg [A0], T0_8 ; Atomically swap with memory.
752 mov [A1], T0_8 ; Hand the old memory value back.
753 EPILOGUE_2_ARGS
754ENDPROC iemAImpl_xchg_u8_locked
755
756BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
757 PROLOGUE_2_ARGS
758 mov T0_16, [A1]
759 xchg [A0], T0_16
760 mov [A1], T0_16
761 EPILOGUE_2_ARGS
762ENDPROC iemAImpl_xchg_u16_locked
763
764BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
765 PROLOGUE_2_ARGS
766 mov T0_32, [A1]
767 xchg [A0], T0_32
768 mov [A1], T0_32
769 EPILOGUE_2_ARGS
770ENDPROC iemAImpl_xchg_u32_locked
771
772%ifdef RT_ARCH_AMD64
773BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
774 PROLOGUE_2_ARGS
775 mov T0, [A1]
776 xchg [A0], T0
777 mov [A1], T0
778 EPILOGUE_2_ARGS
779ENDPROC iemAImpl_xchg_u64_locked
780%endif
781
782; Unlocked variants for fDisregardLock mode.
; These do two plain loads and two plain stores instead of an atomic swap.
783
784BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
785 PROLOGUE_2_ARGS
786 mov T0_8, [A1] ; T0 = register value.
787 mov T1_8, [A0] ; T1 = memory value.
788 mov [A0], T0_8 ; Non-atomic swap of the two.
789 mov [A1], T1_8
790 EPILOGUE_2_ARGS
791ENDPROC iemAImpl_xchg_u8_unlocked
792
793BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
794 PROLOGUE_2_ARGS
795 mov T0_16, [A1]
796 mov T1_16, [A0]
797 mov [A0], T0_16
798 mov [A1], T1_16
799 EPILOGUE_2_ARGS
800ENDPROC iemAImpl_xchg_u16_unlocked
801
802BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
803 PROLOGUE_2_ARGS
804 mov T0_32, [A1]
805 mov T1_32, [A0]
806 mov [A0], T0_32
807 mov [A1], T1_32
808 EPILOGUE_2_ARGS
809ENDPROC iemAImpl_xchg_u32_unlocked
810
811%ifdef RT_ARCH_AMD64
812BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
813 PROLOGUE_2_ARGS
814 mov T0, [A1]
815 mov T1, [A0]
816 mov [A0], T0
817 mov [A1], T1
818 EPILOGUE_2_ARGS
819ENDPROC iemAImpl_xchg_u64_unlocked
820%endif
821
822
823;
824; XADD for memory operands.
825;
826; Each function takes three arguments, first the pointer to the
827; memory/register, then the pointer to the register, and finally a pointer to
828; eflags. They all return void.
829;
; [A0] += [A1] and the previous [A0] value is returned through [A1];
; arithmetic flags (OF/SF/ZF/AF/PF/CF) are updated like ADD.
;
830BEGINCODE
831BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
832 PROLOGUE_3_ARGS
833 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
834 mov T0_8, [A1] ; T0 = addend from the register operand.
835 xadd [A0], T0_8 ; [A0] += T0; T0 = old [A0].
836 mov [A1], T0_8 ; Return the old destination value.
837 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
838 EPILOGUE_3_ARGS
839ENDPROC iemAImpl_xadd_u8
840
841BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
842 PROLOGUE_3_ARGS
843 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
844 mov T0_16, [A1]
845 xadd [A0], T0_16
846 mov [A1], T0_16
847 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
848 EPILOGUE_3_ARGS
849ENDPROC iemAImpl_xadd_u16
850
851BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
852 PROLOGUE_3_ARGS
853 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
854 mov T0_32, [A1]
855 xadd [A0], T0_32
856 mov [A1], T0_32
857 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
858 EPILOGUE_3_ARGS
859ENDPROC iemAImpl_xadd_u32
860
861%ifdef RT_ARCH_AMD64
862BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
863 PROLOGUE_3_ARGS
864 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
865 mov T0, [A1]
866 xadd [A0], T0
867 mov [A1], T0
868 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
869 EPILOGUE_3_ARGS
870ENDPROC iemAImpl_xadd_u64
871%endif ; RT_ARCH_AMD64
872
; Locked variants - same as above but with an explicit LOCK prefix on the
; xadd for atomicity.
873BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
874 PROLOGUE_3_ARGS
875 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
876 mov T0_8, [A1]
877 lock xadd [A0], T0_8
878 mov [A1], T0_8
879 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
880 EPILOGUE_3_ARGS
881ENDPROC iemAImpl_xadd_u8_locked
882
883BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
884 PROLOGUE_3_ARGS
885 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
886 mov T0_16, [A1]
887 lock xadd [A0], T0_16
888 mov [A1], T0_16
889 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
890 EPILOGUE_3_ARGS
891ENDPROC iemAImpl_xadd_u16_locked
892
893BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
894 PROLOGUE_3_ARGS
895 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
896 mov T0_32, [A1]
897 lock xadd [A0], T0_32
898 mov [A1], T0_32
899 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
900 EPILOGUE_3_ARGS
901ENDPROC iemAImpl_xadd_u32_locked
902
903%ifdef RT_ARCH_AMD64
904BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
905 PROLOGUE_3_ARGS
906 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
907 mov T0, [A1]
908 lock xadd [A0], T0
909 mov [A1], T0
910 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
911 EPILOGUE_3_ARGS
912ENDPROC iemAImpl_xadd_u64_locked
913%endif ; RT_ARCH_AMD64
914
915
916;
917; CMPXCHG8B.
918;
919; These are tricky register wise, so the code is duplicated for each calling
920; convention.
921;
922; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
923;
924; C-proto:
925; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
926; uint32_t *pEFlags));
927;
928; Note! Identical to iemAImpl_cmpxchg16b.
929;
; cmpxchg8b uses edx:eax as the comparand (updated with the old memory
; value on mismatch) and ecx:ebx as the exchange value, so the fixed
; registers must be hand-loaded; ebx (and ebp on x86) are callee-saved and
; preserved around the operation. Only ZF is reported back.
;
930BEGINCODE
931BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
932%ifdef RT_ARCH_AMD64
933 %ifdef ASM_CALL64_MSC
934 push rbx ; ebx is callee-saved and needed for the exchange value.
935
936 mov r11, rdx ; pu64EaxEdx (is also T1)
937 mov r10, rcx ; pu64Dst
938
939 mov ebx, [r8] ; ecx:ebx = exchange value (from pu64EbxEcx).
940 mov ecx, [r8 + 4]
941 IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
942 mov eax, [r11] ; edx:eax = comparand.
943 mov edx, [r11 + 4]
944
945 lock cmpxchg8b [r10]
946
947 mov [r11], eax ; Write back edx:eax (old value on mismatch).
948 mov [r11 + 4], edx
949 IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
950
951 pop rbx
952 ret
953 %else
; System V (gcc) convention: rdi=pu64Dst, rsi=pu64EaxEdx, rdx=pu64EbxEcx, rcx=pEFlags.
954 push rbx
955
956 mov r10, rcx ; pEFlags
957 mov r11, rdx ; pu64EbxEcx (is also T1)
958
959 mov ebx, [r11] ; ecx:ebx = exchange value.
960 mov ecx, [r11 + 4]
961 IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
962 mov eax, [rsi] ; edx:eax = comparand.
963 mov edx, [rsi + 4]
964
965 lock cmpxchg8b [rdi]
966
967 mov [rsi], eax
968 mov [rsi + 4], edx
969 IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
970
971 pop rbx
972 ret
973
974 %endif
975%else
; 32-bit host: fastcall passes A0/A1 in ecx/edx, the rest on the stack;
; all four callee-saved registers are needed as scratch here.
976 push esi
977 push edi
978 push ebx
979 push ebp
980
981 mov edi, ecx ; pu64Dst
982 mov esi, edx ; pu64EaxEdx
983 mov ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
984 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
985
986 mov ebx, [ecx] ; ecx:ebx = exchange value (ecx reused, so load ebx first).
987 mov ecx, [ecx + 4]
988 IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
989 mov eax, [esi] ; edx:eax = comparand.
990 mov edx, [esi + 4]
991
992 lock cmpxchg8b [edi]
993
994 mov [esi], eax
995 mov [esi + 4], edx
996 IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)
997
998 pop ebp
999 pop ebx
1000 pop edi
1001 pop esi
1002 ret 8 ; Pop the two stack arguments.
1003%endif
1004ENDPROC iemAImpl_cmpxchg8b
1005
1006BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
1007 ; Lazy bird always lock prefixes cmpxchg8b.
; The unlocked worker above already uses LOCK, so just tail-jump to it.
1008 jmp NAME_FASTCALL(iemAImpl_cmpxchg8b,16,$@)
1009ENDPROC iemAImpl_cmpxchg8b_locked
1010
1011%ifdef RT_ARCH_AMD64
1012
1013;
1014; CMPXCHG16B.
1015;
1016; These are tricky register wise, so the code is duplicated for each calling
1017; convention.
1018;
1019; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1020;
1021; C-proto:
1022; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
1023; uint32_t *pEFlags));
1024;
1025; Note! Identical to iemAImpl_cmpxchg8b.
1026;
; 128-bit sibling of cmpxchg8b: rdx:rax is the comparand (updated with the
; old memory value on mismatch), rcx:rbx the exchange value; rbx is
; callee-saved and preserved. Only ZF is reported back.
;
1027BEGINCODE
1028BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
1029 %ifdef ASM_CALL64_MSC
1030 push rbx ; rbx is callee-saved and needed for the exchange value.
1031
1032 mov r11, rdx ; pu64RaxRdx (is also T1)
1033 mov r10, rcx ; pu64Dst
1034
1035 mov rbx, [r8] ; rcx:rbx = exchange value (from pu128RbxRcx).
1036 mov rcx, [r8 + 8]
1037 IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1038 mov rax, [r11] ; rdx:rax = comparand.
1039 mov rdx, [r11 + 8]
1040
1041 lock cmpxchg16b [r10]
1042
1043 mov [r11], rax ; Write back rdx:rax (old value on mismatch).
1044 mov [r11 + 8], rdx
1045 IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1046
1047 pop rbx
1048 ret
1049 %else
; System V (gcc) convention: rdi=pu128Dst, rsi=pu128RaxRdx, rdx=pu128RbxRcx, rcx=pEFlags.
1050 push rbx
1051
1052 mov r10, rcx ; pEFlags
1053 mov r11, rdx ; pu64RbxRcx (is also T1)
1054
1055 mov rbx, [r11] ; rcx:rbx = exchange value.
1056 mov rcx, [r11 + 8]
1057 IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1058 mov rax, [rsi] ; rdx:rax = comparand.
1059 mov rdx, [rsi + 8]
1060
1061 lock cmpxchg16b [rdi]
1062
1063 mov [rsi], rax
1064 mov [rsi + 8], rdx
1065 IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1066
1067 pop rbx
1068 ret
1069
1070 %endif
1071ENDPROC iemAImpl_cmpxchg16b
1072
1073BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
1074 ; Lazy bird always lock prefixes cmpxchg16b.
; The unlocked worker above already uses LOCK, so just tail-jump to it.
1075 jmp NAME_FASTCALL(iemAImpl_cmpxchg16b,16,$@)
1076ENDPROC iemAImpl_cmpxchg16b_locked
1077
1078%endif ; RT_ARCH_AMD64
1079
1080
1081;
1082; CMPXCHG.
1083;
1084; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1085;
1086; C-proto:
1087; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t puEax, uintX_t uReg, uint32_t *pEFlags));
1088;
1089BEGINCODE
%macro IEMIMPL_CMPXCHG 2
;; 8-bit CMPXCHG: %1 = lock prefix or empty, %2 = function name suffix.
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     al, [A1]                ; al = expected value (*puAl)
        %1 cmpxchg [A0], A2_8           ; compare al with *puXDst, exchange on match.
        mov     [A1], al                ; write back al (current dst value on failure).
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u8 %+ %2

;; 16-bit CMPXCHG, same pattern as the 8-bit variant.
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     ax, [A1]
        %1 cmpxchg [A0], A2_16
        mov     [A1], ax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u16 %+ %2

;; 32-bit CMPXCHG, same pattern as the 8-bit variant.
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [A1]
        %1 cmpxchg [A0], A2_32
        mov     [A1], eax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u32 %+ %2

;; 64-bit CMPXCHG: native on AMD64, emulated via CMPXCHG8B on 32-bit hosts.
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
%ifdef RT_ARCH_AMD64
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     rax, [A1]
        %1 cmpxchg [A0], A2
        mov     [A1], rax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
%else
        ;
        ; Must use cmpxchg8b here. See also iemAImpl_cmpxchg8b.
        ; Hand-rolled 32-bit fastcall: first two args in ecx/edx, rest on the stack.
        ;
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64Rax
        mov     ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]              ; ecx:ebx = replacement value (*pu64Reg)
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]              ; edx:eax = expected value (*pu64Rax)
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
        ; NOTE(review): jz routes the ZF=1 (success) case through the re-compare
        ; below while the ZF=0 (failure) case falls into 'cmp eax, eax' which
        ; forces ZF=1 - this looks inverted (jnz expected); verify against the
        ; CMPXCHG8B ZF semantics and the iemAImpl_cmpxchg8b sibling.
        jz .cmpxchg8b_not_equal
        cmp     eax, eax                ; just set the other flags.
.store:
        mov     [esi], eax              ; write back edx:eax for the caller.
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8                       ; pop the two stack args (fastcall callee cleanup).

.cmpxchg8b_not_equal:
        cmp     [esi + 4], edx          ;; @todo FIXME - verify 64-bit compare implementation
        jne     .store
        cmp     [esi], eax
        jmp     .store

%endif
ENDPROC iemAImpl_cmpxchg_u64 %+ %2
%endmacro ; IEMIMPL_CMPXCHG
1175
; Instantiate the plain and LOCK-prefixed CMPXCHG worker sets.
IEMIMPL_CMPXCHG , ,
IEMIMPL_CMPXCHG lock, _locked
1178
1179;;
1180; Macro for implementing a unary operator.
1181;
1182; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
1183; variants, except on 32-bit system where the 64-bit accesses requires hand
1184; coding.
1185;
1186; All the functions takes a pointer to the destination memory operand in A0,
1187; the source register operand in A1 and a pointer to eflags in A2.
1188;
1189; @param 1 The instruction mnemonic.
1190; @param 2 The modified flags.
1191; @param 3 The undefined flags.
1192;
%macro IEMIMPL_UNARY_OP 3
BEGINCODE
;; 8-bit plain variant: %1 = mnemonic, %2 = modified flags, %3 = undefined flags.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      byte [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

;; 8-bit locked variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 byte [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

;; 16-bit plain variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      word [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

;; 16-bit locked variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 word [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

;; 32-bit plain variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      dword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

;; 32-bit locked variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 dword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

 %ifdef RT_ARCH_AMD64
;; 64-bit variants - only on 64-bit hosts (32-bit hosts hand-code these in C).
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      qword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 qword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64

%endmacro
1262
; Instantiate the unary workers; NOT modifies no flags, NEG additionally sets CF.
IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_UNARY_OP not, 0, 0
1267
1268
1269;
1270; BSWAP. No flag changes.
1271;
1272; Each function takes one argument, pointer to the value to bswap
1273; (input/output). They all return void.
1274;
BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]             ; just in case any of the upper bits are used.
        db 66h                          ; Operand-size prefix: emit the 16-bit BSWAP encoding,
        bswap T0_32                     ; reproducing the CPU's (undefined) 16-bit behaviour.
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u16
1283
BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]             ; Load, byte-swap in a register, store back.
        bswap   T0_32
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u32
1291
BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
%ifdef RT_ARCH_AMD64
        PROLOGUE_1_ARGS
        mov     T0, [A0]                ; Native 64-bit bswap.
        bswap   T0
        mov     [A0], T0
        EPILOGUE_1_ARGS
%else
        PROLOGUE_1_ARGS
        mov     T0, [A0]                ; 32-bit host: swap each half and exchange
        mov     T1, [A0 + 4]            ; their positions.
        bswap   T0
        bswap   T1
        mov     [A0 + 4], T0
        mov     [A0], T1
        EPILOGUE_1_ARGS
%endif
ENDPROC iemAImpl_bswap_u64
1310
1311
1312;;
1313; Macro for implementing a shift operation.
1314;
1315; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1316; 32-bit system where the 64-bit accesses requires hand coding.
1317;
1318; All the functions takes a pointer to the destination memory operand in A0,
1319; the shift count in A1 and a pointer to eflags in A2.
1320;
1321; @param 1 The instruction mnemonic.
1322; @param 2 The modified flags.
1323; @param 3 The undefined flags.
1324;
1325; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
1326;
%macro IEMIMPL_SHIFT_OP 3
BEGINCODE
;; 8-bit shift: %1 = mnemonic, %2 = modified flags, %3 = undefined flags.
;; The _intel/_amd entry points are aliases of the plain one.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_intel, 12
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_amd, 12
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8                ; Count must be in cl for the shift instruction.
        %1      byte [A0], cl
 %else
        xchg    A1, A0                  ; MSC: A1 is rdx, A0 is rcx - swap so count lands in cl.
        %1      byte [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

;; 16-bit shift, same pattern.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_intel, 12
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_amd, 12
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      word [A0], cl
 %else
        xchg    A1, A0
        %1      word [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

;; 32-bit shift, same pattern.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_intel, 12
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_amd, 12
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      dword [A0], cl
 %else
        xchg    A1, A0
        %1      dword [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
;; 64-bit shift - 64-bit hosts only (32-bit hosts implement this in C).
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_intel, 12
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_amd, 12
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      qword [A0], cl
 %else
        xchg    A1, A0
        %1      qword [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro
1396
; Instantiate the shift/rotate workers; AF is undefined for the plain shifts.
IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1404
1405
1406;;
1407; Macro for implementing a double precision shift operation.
1408;
1409; This will generate code for the 16, 32 and 64 bit accesses, except on
1410; 32-bit system where the 64-bit accesses requires hand coding.
1411;
1412; The functions takes the destination operand (r/m) in A0, the source (reg) in
1413; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
1414;
1415; @param 1 The instruction mnemonic.
1416; @param 2 The modified flags.
1417; @param 3 The undefined flags.
1418;
1419; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
1420;
%macro IEMIMPL_SHIFT_DBL_OP 3
BEGINCODE
;; 16-bit double shift: %1 = mnemonic, %2 = modified flags, %3 = undefined flags.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_intel, 16
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_amd, 16
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; GCC: count is in A2 (rdx) - swap it into rcx (cl)...
        %1      [A0], A1_16, cl
        xchg    A3, A2                  ; ...and restore pEFlags for IEM_SAVE_FLAGS below.
 %else
        xchg    A0, A2                  ; MSC: count is in A2 (r8) - swap it into rcx (cl).
        %1      [A2], A1_16, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

;; 32-bit double shift, same pattern.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_intel, 16
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_amd, 16
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2
        %1      [A0], A1_32, cl
        xchg    A3, A2
 %else
        xchg    A0, A2
        %1      [A2], A1_32, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
;; 64-bit double shift - 64-bit hosts only.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_intel, 20
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_amd, 20
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2
        %1      [A0], A1, cl
        xchg    A3, A2
 %else
        xchg    A0, A2
        %1      [A2], A1, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro
1477
; Instantiate the double precision shift workers.
IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1480
1481
1482;;
1483; Macro for implementing a multiplication operations.
1484;
1485; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1486; 32-bit system where the 64-bit accesses requires hand coding.
1487;
1488; The 8-bit function only operates on AX, so it takes no DX pointer. The other
1489; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
1490; pointer to eflags in A3.
1491;
1492; The functions all return 0 so the caller can be used for div/idiv as well as
1493; for the mul/imul implementation.
1494;
1495; @param 1 The instruction mnemonic.
1496; @param 2 The modified flags.
1497; @param 3 The undefined flags.
1498;
1499; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
1500;
%macro IEMIMPL_MUL_OP 3
BEGINCODE
;; 8-bit multiply: AL * operand -> AX; %1 = mnemonic, %2/%3 = modified/undefined flags.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_intel, 12
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_amd, 12
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     al, [A0]
        %1      A1_8
        mov     [A0], ax                ; full 16-bit product goes back into *pu16AX.
        IEM_SAVE_FLAGS A2, %2, %3
        xor     eax, eax                ; return 0 (shared convention with div/idiv).
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

;; 16-bit multiply: AX * operand -> DX:AX, written to *pu16AX / *pu16DX.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_intel, 16
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_amd, 16
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     ax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_16
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                  ; MSC: A1 is rdx which the multiply clobbers - save it.
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        xor     eax, eax
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

;; 32-bit multiply: EAX * operand -> EDX:EAX, same structure as 16-bit.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_intel, 16
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_amd, 16
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     eax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                  ; Save A1 (rdx) before the multiply clobbers edx.
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        xor     eax, eax
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
;; 64-bit multiply: RAX * operand -> RDX:RAX.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_intel, 20
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_amd, 20
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     rax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2
        mov     [A0], rax
        mov     [A1], rdx
 %else
        mov     T1, A1                  ; Save A1 (rdx) before the multiply clobbers rdx.
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        xor     eax, eax
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; !RT_ARCH_AMD64

%endmacro
1582
; Instantiate the multiply workers; SF/ZF/AF/PF are undefined after MUL/IMUL.
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
1585
1586
1587BEGINCODE
1588;;
1589; Worker function for negating a 32-bit number in T1:T0
1590; @uses None (T0,T1)
BEGINPROC iemAImpl_negate_T0_T1_u32
        ; Computes T1:T0 = 0 - T1:T0 using two zeros pushed on the stack,
        ; so no scratch register beyond T0/T1 is needed. EFLAGS are clobbered.
        push    0
        push    0
        xchg    T0_32, [xSP]            ; stack = old T1:T0, T1:T0 = 0.
        xchg    T1_32, [xSP + xCB]
        sub     T0_32, [xSP]            ; 0 - old value, with borrow into the high half.
        sbb     T1_32, [xSP + xCB]
        add     xSP, xCB*2
        ret
ENDPROC iemAImpl_negate_T0_T1_u32
1601
1602%ifdef RT_ARCH_AMD64
1603;;
1604; Worker function for negating a 64-bit number in T1:T0
1605; @uses None (T0,T1)
BEGINPROC iemAImpl_negate_T0_T1_u64
        ; 64-bit twin of iemAImpl_negate_T0_T1_u32: T1:T0 = 0 - T1:T0 via the
        ; stack; no extra registers used, EFLAGS clobbered.
        push    0
        push    0
        xchg    T0, [xSP]
        xchg    T1, [xSP + xCB]
        sub     T0, [xSP]
        sbb     T1, [xSP + xCB]
        add     xSP, xCB*2
        ret
ENDPROC iemAImpl_negate_T0_T1_u64
1616%endif
1617
1618
1619;;
1620; Macro for implementing a division operations.
1621;
1622; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1623; 32-bit system where the 64-bit accesses requires hand coding.
1624;
1625; The 8-bit function only operates on AX, so it takes no DX pointer. The other
1626; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
1627; pointer to eflags in A3.
1628;
1629; The functions all return 0 on success and -1 if a divide error should be
1630; raised by the caller.
1631;
1632; @param 1 The instruction mnemonic.
1633; @param 2 The modified flags.
1634; @param 3 The undefined flags.
1635; @param 4 1 if signed, 0 if unsigned.
1636;
1637; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
1638;
%macro IEMIMPL_DIV_OP 4
BEGINCODE
;;
; 8-bit divide: AX (in *pu16AX) / divisor (A1_8); quotient -> AL, remainder -> AH.
; %1 = mnemonic, %2 = modified flags, %3 = undefined flags, %4 = 1 if signed.
; Returns 0 on success, -1 when the caller should raise #DE.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_intel, 12
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_amd, 12
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS

        ; div by chainsaw check.
        test    A1_8, A1_8
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A0 + 1], A1_8          ; high byte of dividend must be < divisor.
        jae     .div_overflow
 %else
        mov     T0_16, [A0]             ; T0 = dividend
        mov     T1, A1                  ; T1 = saved divisor (because of missing T1_8 in 32-bit)
        test    A1_8, A1_8
        js      .divisor_negative
        test    T0_16, T0_16
        jns     .both_positive
        neg     T0_16
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_16, 7
        cmp     T0_8, A1_8
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_8, 0x7f              ; Special case for covering (divisor - 1).
        cmp     T0_8, A1_8
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A1_8
        test    T0_16, T0_16
        jns     .one_of_each
        neg     T0_16
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_16, 7
        cmp     T0_8, A1_8
        jae     .div_overflow
.div_no_overflow:
        mov     A1, T1                  ; restore divisor
 %endif

        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     ax, [A0]
        %1      A1_8
        mov     [A0], ax
        IEM_SAVE_FLAGS A2, %2, %3
        xor     eax, eax                ; return 0 = success.

.return:
        EPILOGUE_3_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1                 ; return -1 = raise #DE.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u8

;;
; 16-bit divide: DX:AX (via *pu16AX / *pu16DX) / divisor (A2_16).
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_intel, 16
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_amd, 16
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_16, A2_16
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_16             ; high word of dividend must be < divisor.
        jae     .div_overflow
 %else
        mov     T0_16, [A1]             ; Assemble the 32-bit dividend in T0_32...
        shl     T0_32, 16
        mov     T0_16, [A0]             ; T0 = dividend
        mov     T1, A2                  ; T1 = divisor
        test    T1_16, T1_16
        js      .divisor_negative
        test    T0_32, T0_32
        jns     .both_positive
        neg     T0_32
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_32, 15
        cmp     T0_16, T1_16
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_16, 0x7fff           ; Special case for covering (divisor - 1).
        cmp     T0_16, T1_16
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     T1_16
        test    T0_32, T0_32
        jns     .one_of_each
        neg     T0_32
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_32, 15
        cmp     T0_16, T1_16
        jae     .div_overflow
.div_no_overflow:
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; Divisor must survive loading dx (A2 is rdx on GCC).
        mov     ax, [A0]
        mov     dx, [A1]
        %1      T1_16
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                  ; A1 is rdx on MSC - keep the pointer in T1.
        mov     ax, [A0]
        mov     dx, [T1]
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        xor     eax, eax                ; return 0 = success.

.return:
        EPILOGUE_4_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1                 ; return -1 = raise #DE.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u16

;;
; 32-bit divide: EDX:EAX (via *pu32EAX / *pu32EDX) / divisor (A2_32).
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_intel, 16
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_amd, 16
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_32, A2_32
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_32             ; high dword of dividend must be < divisor.
        jae     .div_overflow
 %else
        push    A2                      ; save A2 so we modify it (we out of regs on x86).
        mov     T0_32, [A0]             ; T0 = dividend low
        mov     T1_32, [A1]             ; T1 = dividend high
        test    A2_32, A2_32
        js      .divisor_negative
        test    T1_32, T1_32
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u32)
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_32, 0x7fffffff       ; Special case for covering (divisor - 1).
        cmp     T0_32, A2_32
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2_32
        test    T1_32, T1_32
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u32)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        jae     .div_overflow
.div_no_overflow:
        pop     A2
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; Divisor must survive loading edx (A2 is rdx on GCC).
        mov     eax, [A0]
        mov     edx, [A1]
        %1      T1_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                  ; A1 is rdx on MSC - keep the pointer in T1.
        mov     eax, [A0]
        mov     edx, [T1]
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        xor     eax, eax                ; return 0 = success.

.return:
        EPILOGUE_4_ARGS

.div_overflow:
 %if %4 != 0
        pop     A2                      ; undo the divisor save from the signed overflow check.
 %endif
.div_zero:
        mov     eax, -1                 ; return -1 = raise #DE.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
;;
; 64-bit divide: RDX:RAX (via *pu64RAX / *pu64RDX) / divisor (A2).
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_intel, 20
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_amd, 20
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
        PROLOGUE_4_ARGS

        test    A2, A2
        jz      .div_zero
 %if %4 == 0
        cmp     [A1], A2                ; high qword of dividend must be < divisor.
        jae     .div_overflow
 %else
        push    A2                      ; save A2 so we modify it (we out of regs on x86).
        mov     T0, [A0]                ; T0 = dividend low
        mov     T1, [A1]                ; T1 = dividend high
        test    A2, A2
        js      .divisor_negative
        test    T1, T1
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u64)
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        mov     T1, 0x7fffffffffffffff
        and     T0, T1                  ; Special case for covering (divisor - 1).
        cmp     T0, A2
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2
        test    T1, T1
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u64)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        jae     .div_overflow
.div_no_overflow:
        pop     A2
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; Divisor must survive loading rdx (A2 is rdx on GCC).
        mov     rax, [A0]
        mov     rdx, [A1]
        %1      T1
        mov     [A0], rax
        mov     [A1], rdx
 %else
        mov     T1, A1                  ; A1 is rdx on MSC - keep the pointer in T1.
        mov     rax, [A0]
        mov     rdx, [T1]
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        xor     eax, eax                ; return 0 = success.

.return:
        EPILOGUE_4_ARGS_EX 12

.div_overflow:
 %if %4 != 0
        pop     A2                      ; undo the divisor save from the signed overflow check.
 %endif
.div_zero:
        mov     eax, -1                 ; return -1 = raise #DE.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; !RT_ARCH_AMD64

%endmacro
1948
; Instantiate the divide workers; DIV/IDIV leave all arithmetic flags undefined.
IEMIMPL_DIV_OP div, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
1951
1952
1953;;
1954; Macro for implementing memory fence operation.
1955;
1956; No return value, no operands or anything.
1957;
1958; @param 1 The instruction.
1959;
%macro IEMIMPL_MEM_FENCE 1
BEGINCODE
;; Emits iemAImpl_<fence>: executes the fence instruction (%1) and returns.
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
        %1
        ret
ENDPROC iemAImpl_ %+ %1
%endmacro
1967
; Instantiate the three SSE/SSE2 fence workers.
IEMIMPL_MEM_FENCE lfence
IEMIMPL_MEM_FENCE sfence
IEMIMPL_MEM_FENCE mfence
1971
1972;;
1973; Alternative for non-SSE2 host.
1974;
BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
        ; xchg with a memory operand is implicitly LOCKed, giving a full
        ; memory barrier on CPUs without SSE2 fence instructions.
        push    xAX
        xchg    xAX, [xSP]
        add     xSP, xCB
        ret
ENDPROC iemAImpl_alt_mem_fence
1981
1982
1983;;
1984; Initialize the FPU for the actual instruction being emulated, this means
1985; loading parts of the guest's control word and status word.
1986;
1987; @uses 24 bytes of stack.
1988; @param 1 Expression giving the address of the FXSTATE of the guest.
1989;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
        fnstenv [xSP]                   ; Capture the current FPU environment on the stack.

        ; FCW - for exception, precision and rounding control.
        movzx   T0, word [%1 + X86FXSTATE.FCW]
        and     T0, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx   T1, word [%1 + X86FXSTATE.FSW]
        and     T1, X86_FSW_C_MASK      ; Guest condition-code bits...
        movzx   T0, word [xSP + X86FSTENV32P.FSW]
        and     T0, X86_FSW_TOP_MASK    ; ...merged with the host's current TOP.
        or      T0, T1
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        fldenv  [xSP]                   ; Activate the merged environment.
%endmacro
2008
2009
2010;;
2011; Need to move this as well somewhere better?
2012;
struc IEMFPURESULT
    .r80Result  resw 5                  ; 80-bit (10 byte) extended-precision result.
    .FSW        resw 1                  ; Output FPU status word.
endstruc
2017
2018
2019;;
2020; Need to move this as well somewhere better?
2021;
struc IEMFPURESULTTWO
    .r80Result1 resw 5                  ; First 80-bit extended-precision result.
    .FSW        resw 1                  ; Output FPU status word.
    .r80Result2 resw 5                  ; Second 80-bit extended-precision result.
endstruc
2027
2028
2029;
2030;---------------------- 16-bit signed integer operations ----------------------
2031;
2032
2033
2034;;
; Converts a 16-bit signed integer value to an 80-bit floating point one (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 16-bit signed integer value to convert.
2040;
BEGINPROC_FASTCALL iemAImpl_fild_i16_to_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Scratch for fnstenv/fldenv in the macro below.

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    word [A2]               ; Load and convert the 16-bit signed integer.

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exceptions so fstp cannot fault.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_i16_to_r80
2057
2058
2059;;
2060; Store a 80-bit floating point value (register) as a 16-bit signed integer (memory).
2061;
2062; @param A0 FPU context (fxsave).
2063; @param A1 Where to return the output FSW.
2064; @param A2 Where to store the 16-bit signed integer value.
2065; @param A3 Pointer to the 80-bit value.
2066;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch for fnstenv/fldenv in the macro below.

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   word [A2]               ; Convert using the guest rounding mode (FCW.RC).

        fnstsw  word [A1]

        fninit                          ; Leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i16
2082
2083
2084;;
2085; Store a 80-bit floating point value (register) as a 16-bit signed integer
2086; (memory) with truncation.
2087;
2088; @param A0 FPU context (fxsave).
2089; @param A1 Where to return the output FSW.
2090; @param A2 Where to store the 16-bit signed integer value.
2091; @param A3 Pointer to the 80-bit value.
2092;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch for fnstenv/fldenv in the macro below.

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  word [A2]               ; Truncating convert; was 'dword' which stored
                                        ; 32 bits into the 16-bit destination (buffer
                                        ; overrun + wrong operand width for FISTTP m16int).

        fnstsw  word [A1]

        fninit                          ; Leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i16
2108
2109
2110;;
2111; FPU instruction working on one 80-bit and one 16-bit signed integer value.
2112;
2113; @param 1 The instruction
2114;
2115; @param A0 FPU context (fxsave).
2116; @param A1 Pointer to a IEMFPURESULT for the output.
2117; @param A2 Pointer to the 80-bit value.
2118; @param A3 Pointer to the 16-bit value.
2119;
%macro IEMIMPL_FPU_R80_BY_I16 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch for fnstenv/fldenv in the macro below.

        fninit
        fld     tword [A2]              ; ST(0) = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]               ; %1 = fiadd/fimul/... with the 16-bit int operand.

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exceptions so fstp cannot fault.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro
2139
; Instantiate the r80-by-i16 arithmetic workers.
IEMIMPL_FPU_R80_BY_I16 fiadd
IEMIMPL_FPU_R80_BY_I16 fimul
IEMIMPL_FPU_R80_BY_I16 fisub
IEMIMPL_FPU_R80_BY_I16 fisubr
IEMIMPL_FPU_R80_BY_I16 fidiv
IEMIMPL_FPU_R80_BY_I16 fidivr
2146
2147
2148;;
2149; FPU instruction working on one 80-bit and one 16-bit signed integer value,
2150; only returning FSW.
2151;
2152; @param 1 The instruction
2153;
2154; @param A0 FPU context (fxsave).
2155; @param A1 Where to store the output FSW.
2156; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 16-bit value.
2158;
%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch for fnstenv/fldenv in the macro below.

        fninit
        fld     tword [A2]              ; ST(0) = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]               ; Compare-style op: only FSW matters, no result stored.

        fnstsw  word [A1]

        fninit                          ; Leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro
2176
IEMIMPL_FPU_R80_BY_I16_FSW ficom        ; FICOM m16int - compare, FSW only.
2178
2179
2180
2181;
2182;---------------------- 32-bit signed integer operations ----------------------
2183;
2184
2185
2186;;
; Converts a 32-bit signed integer value to an 80-bit floating point one (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 32-bit signed integer value to convert.
2192;
BEGINPROC_FASTCALL iemAImpl_fild_i32_to_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Scratch for fnstenv/fldenv in the macro below.

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    dword [A2]              ; Load and convert the 32-bit signed integer.

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Clear exceptions so fstp cannot fault.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_i32_to_r80
2209
2210
2211;;
2212; Store a 80-bit floating point value (register) as a 32-bit signed integer (memory).
2213;
2214; @param A0 FPU context (fxsave).
2215; @param A1 Where to return the output FSW.
2216; @param A2 Where to store the 32-bit signed integer value.
2217; @param A3 Pointer to the 80-bit value.
2218;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch for fnstenv/fldenv in the macro below.

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   dword [A2]              ; Convert using the guest rounding mode (FCW.RC).

        fnstsw  word [A1]

        fninit                          ; Leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i32
2234
2235
2236;;
2237; Store a 80-bit floating point value (register) as a 32-bit signed integer
2238; (memory) with truncation.
2239;
2240; @param A0 FPU context (fxsave).
2241; @param A1 Where to return the output FSW.
2242; @param A2 Where to store the 32-bit signed integer value.
2243; @param A3 Pointer to the 80-bit value.
2244;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Scratch for fnstenv/fldenv in the macro below.

        fninit
        fld     tword [A3]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  dword [A2]              ; Truncating convert (ignores FCW.RC), SSE3.

        fnstsw  word [A1]

        fninit                          ; Leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i32
2260
2261
;;
; FPU instruction working on one 80-bit and one 32-bit signed integer value.
; Emits iemAImpl_<insn>_r80_by_i32 for the given instruction.
;
; @param 1 The instruction (e.g. fiadd).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit                                  ; clean FPU state
        fld     tword [A2]                      ; ST0 = 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0      ; install guest FCW
        %1      dword [A3]                      ; ST0 = ST0 <op> *(int32_t *)A3

        fnstsw  word [A1 + IEMFPURESULT.FSW]    ; capture resulting FSW
        fnclex                                  ; clear exceptions so the store cannot trap
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32 fiadd
IEMIMPL_FPU_R80_BY_I32 fimul
IEMIMPL_FPU_R80_BY_I32 fisub
IEMIMPL_FPU_R80_BY_I32 fisubr
IEMIMPL_FPU_R80_BY_I32 fidiv
IEMIMPL_FPU_R80_BY_I32 fidivr
2298
2299
;;
; FPU instruction working on one 80-bit and one 32-bit signed integer value,
; only returning FSW (comparison style).
;
; @param 1 The instruction (e.g. ficom).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to store the output FSW.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.  (Was documented as 64-bit; the code uses a dword operand.)
;
%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit                                  ; clean FPU state
        fld     tword [A2]                      ; ST0 = 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0      ; install guest FCW
        %1      dword [A3]                      ; compare ST0 against *(int32_t *)A3; sets C0..C3

        fnstsw  word [A1]                       ; return the FSW only

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32_FSW ficom
2330
2331
2332
2333;
2334;---------------------- 64-bit signed integer operations ----------------------
2335;
2336
2337
;;
; Converts a 64-bit signed integer value to an 80-bit floating point one (fpu register).
; (Implements FILD m64int; the header previously mis-described the input as floating point.)
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 64-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_i64_to_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit                                  ; clean FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0      ; install guest FCW
        fild    qword [A2]                      ; ST0 = (80-bit fp) *(int64_t *)A2

        fnstsw  word [A1 + IEMFPURESULT.FSW]    ; capture resulting FSW
        fnclex                                  ; clear exceptions so the store cannot trap
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_i64_to_r80
2361
2362
;;
; Store a 80-bit floating point value (register) as a 64-bit signed integer (memory).
; Rounding is controlled by the guest FCW loaded below.
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 64-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit                                  ; clean FPU state before loading the operand
        fld     tword [A3]                      ; ST0 = input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0      ; guest FCW decides rounding
        fistp   qword [A2]                      ; *(int64_t *)A2 = round(ST0); pops ST0

        fnstsw  word [A1]                       ; return the resulting FSW

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i64
2386
2387
;;
; Store a 80-bit floating point value (register) as a 64-bit signed integer
; (memory) with truncation (FISTTP ignores the FCW rounding mode).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 64-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit                                  ; clean FPU state before loading the operand
        fld     tword [A3]                      ; ST0 = input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0      ; install guest FCW
        fisttp  qword [A2]                      ; *(int64_t *)A2 = trunc(ST0); pops ST0

        fnstsw  word [A1]                       ; return the resulting FSW

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i64
2412
2413
2414
2415;
2416;---------------------- 32-bit floating point operations ----------------------
2417;
2418
;;
; Converts a 32-bit floating point value to a 80-bit one (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 32-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r32_to_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit                                  ; clean FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0      ; install guest FCW (denormal/invalid masks matter)
        fld     dword [A2]                      ; ST0 = widened *(float *)A2

        fnstsw  word [A1 + IEMFPURESULT.FSW]    ; capture resulting FSW
        fnclex                                  ; clear exceptions so the store cannot trap
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r32_to_r80
2442
2443
;;
; Store a 80-bit floating point value (register) as a 32-bit one (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 32-bit value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit                                  ; clean FPU state before loading the operand
        fld     tword [A3]                      ; ST0 = input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0      ; guest FCW decides rounding on narrowing
        fst     dword [A2]                      ; *(float *)A2 = ST0 (fst, not fstp - the trailing fninit discards the stack)

        fnstsw  word [A1]                       ; return the resulting FSW

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r32
2467
2468
;;
; FPU instruction working on one 80-bit and one 32-bit floating point value.
; Emits iemAImpl_<insn>_r80_by_r32 for the given instruction.
;
; @param 1 The instruction (e.g. fadd).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit                                  ; clean FPU state
        fld     tword [A2]                      ; ST0 = 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0      ; install guest FCW
        %1      dword [A3]                      ; ST0 = ST0 <op> *(float *)A3

        fnstsw  word [A1 + IEMFPURESULT.FSW]    ; capture resulting FSW
        fnclex                                  ; clear exceptions so the store cannot trap
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32 fadd
IEMIMPL_FPU_R80_BY_R32 fmul
IEMIMPL_FPU_R80_BY_R32 fsub
IEMIMPL_FPU_R80_BY_R32 fsubr
IEMIMPL_FPU_R80_BY_R32 fdiv
IEMIMPL_FPU_R80_BY_R32 fdivr
2505
2506
;;
; FPU instruction working on one 80-bit and one 32-bit floating point value,
; only returning FSW (comparison style).
;
; @param 1 The instruction (e.g. fcom).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to store the output FSW.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.  (Was documented as 64-bit; the code uses a dword operand.)
;
%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit                                  ; clean FPU state
        fld     tword [A2]                      ; ST0 = 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0      ; install guest FCW
        %1      dword [A3]                      ; compare ST0 against *(float *)A3; sets C0..C3

        fnstsw  word [A1]                       ; return the FSW only

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32_FSW fcom
2537
2538
2539
2540;
2541;---------------------- 64-bit floating point operations ----------------------
2542;
2543
;;
; Converts a 64-bit floating point value to a 80-bit one (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 64-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r64_to_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit                                  ; clean FPU state first - was missing here; every sibling
                                                ; loader (fild_i32/i64, fld_r32, fld_r80_from_r80/d80) does this
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0      ; install guest FCW
        fld     qword [A2]                      ; ST0 = widened *(double *)A2

        fnstsw  word [A1 + IEMFPURESULT.FSW]    ; capture resulting FSW
        fnclex                                  ; clear exceptions so the store cannot trap
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r64_to_r80
2566
2567
;;
; Store a 80-bit floating point value (register) as a 64-bit one (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 64-bit value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit                                  ; clean FPU state before loading the operand
        fld     tword [A3]                      ; ST0 = input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0      ; guest FCW decides rounding on narrowing
        fst     qword [A2]                      ; *(double *)A2 = ST0 (trailing fninit discards the stack)

        fnstsw  word [A1]                       ; return the resulting FSW

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r64
2591
2592
;;
; FPU instruction working on one 80-bit and one 64-bit floating point value.
; Emits iemAImpl_<insn>_r80_by_r64 for the given instruction.
;
; @param 1 The instruction (e.g. fadd).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit                                  ; clean FPU state
        fld     tword [A2]                      ; ST0 = 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0      ; install guest FCW
        %1      qword [A3]                      ; ST0 = ST0 <op> *(double *)A3

        fnstsw  word [A1 + IEMFPURESULT.FSW]    ; capture resulting FSW
        fnclex                                  ; clear exceptions so the store cannot trap
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64 fadd
IEMIMPL_FPU_R80_BY_R64 fmul
IEMIMPL_FPU_R80_BY_R64 fsub
IEMIMPL_FPU_R80_BY_R64 fsubr
IEMIMPL_FPU_R80_BY_R64 fdiv
IEMIMPL_FPU_R80_BY_R64 fdivr
2629
;;
; FPU instruction working on one 80-bit and one 64-bit floating point value,
; only returning FSW (comparison style).
;
; @param 1 The instruction (e.g. fcom).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to store the output FSW.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit                                  ; clean FPU state
        fld     tword [A2]                      ; ST0 = 80-bit operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0      ; install guest FCW
        %1      qword [A3]                      ; compare ST0 against *(double *)A3; sets C0..C3

        fnstsw  word [A1]                       ; return the FSW only

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64_FSW fcom
2660
2661
2662
2663;
2664;---------------------- 80-bit floating point operations ----------------------
2665;
2666
;;
; Loads a 80-bit floating point register value from memory.
; A same-width load still goes through the FPU so FSW reflects any faults.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit floating point value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit                                  ; clean FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0      ; install guest FCW
        fld     tword [A2]                      ; ST0 = *(long double *)A2

        fnstsw  word [A1 + IEMFPURESULT.FSW]    ; capture resulting FSW
        fnclex                                  ; clear exceptions so the store cannot trap
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r80
2690
2691
;;
; Store a 80-bit floating point register to memory
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 80-bit value.
; @param A3 Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit                                  ; clean FPU state before loading the operand
        fld     tword [A3]                      ; ST0 = input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0      ; install guest FCW
        fstp    tword [A2]                      ; *(long double *)A2 = ST0; pops (no narrowing, no rounding)

        fnstsw  word [A1]                       ; return the resulting FSW

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r80
2715
2716
;;
; Loads an 80-bit floating point register value in BCD format from memory (FBLD).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit BCD value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit                                  ; clean FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0      ; install guest FCW
        fbld    tword [A2]                      ; ST0 = BCD-decoded *(d80 *)A2

        fnstsw  word [A1 + IEMFPURESULT.FSW]    ; capture resulting FSW
        fnclex                                  ; clear exceptions so the store cannot trap
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_d80
2740
2741
;;
; Store a 80-bit floating point register to memory as BCD (FBSTP).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 80-bit BCD value.
; @param A3 Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit                                  ; clean FPU state before loading the operand
        fld     tword [A3]                      ; ST0 = input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0      ; install guest FCW
        fbstp   tword [A2]                      ; *(d80 *)A2 = BCD(ST0); pops ST0

        fnstsw  word [A1]                       ; return the resulting FSW

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_d80
2765
2766
;;
; FPU instruction working on two 80-bit floating point values.
;
; @param 1 The instruction
; @param 2 The operand list to pass to the instruction, e.g. {st0, st1},
;          or {} for instructions with implicit operands (fprem, fscale, ...).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the first 80-bit value (ST0)
; @param A3 Pointer to the second 80-bit value (STn).
;
%macro IEMIMPL_FPU_R80_BY_R80 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit                                  ; clean FPU state
        fld     tword [A3]                      ; push STn operand first...
        fld     tword [A2]                      ; ...so the A2 value ends up in ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0      ; install guest FCW
        %1      %2                              ; result lands in ST0

        fnstsw  word [A1 + IEMFPURESULT.FSW]    ; capture resulting FSW
        fnclex                                  ; clear exceptions so the store cannot trap
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fmul, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsub, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsubr, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdiv, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdivr, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fprem, {}
IEMIMPL_FPU_R80_BY_R80 fprem1, {}
IEMIMPL_FPU_R80_BY_R80 fscale, {}
2807
2808
;;
; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
; storing the result in ST1 and popping the stack.
;
; @param 1 The instruction (fpatan, fyl2x, fyl2xp1 - all implicit-operand).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the first 80-bit value (ST1).
; @param A3 Pointer to the second 80-bit value (ST0).
;
%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit                                  ; clean FPU state
        fld     tword [A2]                      ; push ST1 operand first...
        fld     tword [A3]                      ; ...then the ST0 operand on top
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0      ; install guest FCW
        %1                                      ; computes into ST1 and pops; result is now the only stack entry

        fnstsw  word [A1 + IEMFPURESULT.FSW]    ; capture resulting FSW
        fnclex                                  ; clear exceptions so the store cannot trap
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
2844
2845
;;
; FPU instruction working on two 80-bit floating point values, only
; returning FSW (comparison style).
;
; @param 1 The instruction (fcom, fucom).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a uint16_t for the resulting FSW.
; @param A2 Pointer to the first 80-bit value.
; @param A3 Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit                                  ; clean FPU state
        fld     tword [A3]                      ; push second operand first...
        fld     tword [A2]                      ; ...so the A2 value is ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0      ; install guest FCW
        %1      st0, st1                        ; compare; sets C0..C3 in FSW

        fnstsw  word [A1]                       ; return the FSW only

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_FSW fcom
IEMIMPL_FPU_R80_BY_R80_FSW fucom
2878
2879
;;
; FPU instruction working on two 80-bit floating point values,
; returning FSW and EFLAGS (eax).
;
; @param 1 The instruction (fcomi, fucomi - these set ZF/PF/CF directly).
;
; @returns EFLAGS in EAX.
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a uint16_t for the resulting FSW.
; @param A2 Pointer to the first 80-bit value.
; @param A3 Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit                                  ; clean FPU state
        fld     tword [A3]                      ; push second operand first...
        fld     tword [A2]                      ; ...so the A2 value is ST0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0      ; install guest FCW
        %1      st1                             ; compare ST0 with ST1; result goes to EFLAGS

        fnstsw  word [A1]                       ; return FSW via A1
        pushf                                   ; fetch the EFLAGS the instruction just set...
        pop     xAX                             ; ...into xAX for the return value

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_EFL fcomi
IEMIMPL_FPU_R80_BY_R80_EFL fucomi
2915
2916
;;
; FPU instruction working on one 80-bit floating point value.
;
; @param 1 The instruction (implicit ST0 operand).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit                                  ; clean FPU state
        fld     tword [A2]                      ; ST0 = operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0      ; install guest FCW
        %1                                      ; ST0 = op(ST0)

        fnstsw  word [A1 + IEMFPURESULT.FSW]    ; capture resulting FSW
        fnclex                                  ; clear exceptions so the store cannot trap
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80 fchs
IEMIMPL_FPU_R80 fabs
IEMIMPL_FPU_R80 f2xm1
IEMIMPL_FPU_R80 fsqrt
IEMIMPL_FPU_R80 frndint
IEMIMPL_FPU_R80 fsin
IEMIMPL_FPU_R80 fcos
2953
2954
;;
; FPU instruction working on one 80-bit floating point value, only
; returning FSW (ftst/fxam classify ST0 into C0..C3).
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a uint16_t for the resulting FSW.
; @param A2 Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit                                  ; clean FPU state
        fld     tword [A2]                      ; ST0 = operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0      ; install guest FCW
        %1                                      ; examine/test ST0; sets C0..C3

        fnstsw  word [A1]                       ; return the FSW only

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80_FSW ftst
IEMIMPL_FPU_R80_FSW fxam
2985
2986
2987
;;
; FPU instruction loading a 80-bit floating point constant.
;
; @param 1 The instruction (fld1, fldpi, fldz, ...; no operands).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
;
%macro IEMIMPL_FPU_R80_CONST 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
        PROLOGUE_2_ARGS
        sub     xSP, 20h

        fninit                                  ; clean FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0      ; install guest FCW (rounding can affect the constant)
        %1                                      ; push the constant onto the stack

        fnstsw  word [A1 + IEMFPURESULT.FSW]    ; capture resulting FSW
        fnclex                                  ; clear exceptions so the store cannot trap
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+
%endmacro

IEMIMPL_FPU_R80_CONST fld1
IEMIMPL_FPU_R80_CONST fldl2t
IEMIMPL_FPU_R80_CONST fldl2e
IEMIMPL_FPU_R80_CONST fldpi
IEMIMPL_FPU_R80_CONST fldlg2
IEMIMPL_FPU_R80_CONST fldln2
IEMIMPL_FPU_R80_CONST fldz
3022
3023
;;
; FPU instruction working on one 80-bit floating point value, outputting two.
;
; @param 1 The instruction (fptan, fxtract, fsincos - each pushes a second result).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULTTWO for the output.
; @param A2 Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit                                  ; clean FPU state
        fld     tword [A2]                      ; ST0 = operand
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0      ; install guest FCW
        %1                                      ; leaves two results: ST0 (pushed last) and ST1

        fnstsw  word [A1 + IEMFPURESULT TWO.FSW %substr 0,0] ; (placeholder - see below)
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result2]
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result1]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
%endmacro

IEMIMPL_FPU_R80_R80 fptan
IEMIMPL_FPU_R80_R80 fxtract
IEMIMPL_FPU_R80_R80 fsincos
3058
3059
3060
3061
3062;---------------------- SSE and MMX Operations ----------------------
3063
;; @todo what do we need to do for MMX?
; Currently empty placeholders so the media workers below have a single spot
; to add host MMX state save/restore should it turn out to be necessary.
%macro IEMIMPL_MMX_PROLOGUE 0
%endmacro
%macro IEMIMPL_MMX_EPILOGUE 0
%endmacro

;; @todo what do we need to do for SSE?
; Same placeholder arrangement for the SSE workers.
%macro IEMIMPL_SSE_PROLOGUE 0
%endmacro
%macro IEMIMPL_SSE_EPILOGUE 0
%endmacro
3075
3076
;;
; Media instruction working on two full sized registers.
; Emits both a 64-bit MMX worker (_u64) and a 128-bit SSE worker (_u128).
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to the first media register size operand (input/output).
; @param A2 Pointer to the second media register size operand (input).
;
%macro IEMIMPL_MEDIA_F2 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]                       ; mm0 = destination operand
        movq    mm1, [A2]                       ; mm1 = source operand
        %1      mm0, mm1
        movq    [A1], mm0                       ; write back the result

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]                      ; unaligned loads - operand pointers need not be 16-byte aligned
        movdqu  xmm1, [A2]
        %1      xmm0, xmm1
        movdqu  [A1], xmm0                      ; write back the result

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F2 pxor
IEMIMPL_MEDIA_F2 pcmpeqb
IEMIMPL_MEDIA_F2 pcmpeqw
IEMIMPL_MEDIA_F2 pcmpeqd
3118
3119
;;
; Media instruction working on one full sized and one half sized register (lower half).
; Only the low 32 bits (MMX) / 64 bits (SSE) of the second operand are loaded,
; which is exactly the part the punpckl* family reads.
;
; @param 1 The instruction
; @param 2 1 if MMX is included, 0 if not.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to the first full sized media register operand (input/output).
; @param A2 Pointer to the second half sized media register operand (input).
;
%macro IEMIMPL_MEDIA_F1L1 2
 %if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]                       ; full destination operand
        movd    mm1, [A2]                       ; only the low 32 bits of the source are used
        %1      mm0, mm1
        movq    [A1], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]                      ; full destination operand
        movq    xmm1, [A2]                      ; only the low 64 bits of the source are used
        %1      xmm0, xmm1
        movdqu  [A1], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F1L1 punpcklbw, 1
IEMIMPL_MEDIA_F1L1 punpcklwd, 1
IEMIMPL_MEDIA_F1L1 punpckldq, 1
IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
3164
3165
;;
; Media instruction working on one full sized and one half sized register (high half).
; The second operand is loaded in full because these instructions read its
; UPPER half (unlike the F1L1 variant which loads only the low bits).
;
; @param 1 The instruction
; @param 2 1 if MMX is included, 0 if not.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to the first full sized media register operand (input/output).
; @param A2 Pointer to the second full sized media register operand, where we
;           will only use the upper half (input).
;
%macro IEMIMPL_MEDIA_F1H1 2
 %if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]                       ; full destination operand
        movq    mm1, [A2]                       ; full source - the instruction reads its high half
        %1      mm0, mm1
        movq    [A1], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]                      ; full destination operand
        movdqu  xmm1, [A2]                      ; full source - the instruction reads its high half
        %1      xmm0, xmm1
        movdqu  [A1], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro
3206
; punpckhbw/wd/dq/qdq read the HIGH half of the second operand, so they must
; use the full-width-load variant IEMIMPL_MEDIA_F1H1 (defined above for
; exactly this purpose and otherwise unused).  The previously used
; IEMIMPL_MEDIA_F1L1 only loads the low 32/64 bits of [A2] (movd/movq),
; leaving the half these instructions actually consume as zero.
IEMIMPL_MEDIA_F1H1 punpckhbw, 1
IEMIMPL_MEDIA_F1H1 punpckhwd, 1
IEMIMPL_MEDIA_F1H1 punpckhdq, 1
IEMIMPL_MEDIA_F1H1 punpckhqdq, 0
3211
3212
3213;
3214; Shufflers with evil 8-bit immediates.
3215;
3216
;;
; PSHUFW with a run-time immediate: the imm8 cannot be encoded dynamically, so
; we generate 256 'pshufw mm0, mm1, imm8; ret' stubs (5 bytes each) and call
; the one selected by A3.
;
; @param A0 FPU context (fxsave) - not used by the shuffle itself.
; @param A1 Pointer to the 64-bit destination operand (input/output).
; @param A2 Pointer to the 64-bit source operand (input).
; @param A3 The 8-bit immediate (0..255) selecting the shuffle.
;
BEGINPROC_FASTCALL iemAImpl_pshufw, 16
        PROLOGUE_4_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]
        movq    mm1, [A2]
        lea     T0, [A3 + A3*4]                 ; T0 = A3 * 5; sizeof(pshufw+ret) == 5
        lea     T1, [.imm0 xWrtRIP]             ; T1 = base of the stub table
        lea     T1, [T1 + T0]                   ; T1 = stub for immediate A3
        call    T1
        movq    [A1], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_4_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        pshufw  mm0, mm1, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd:                                        ; 256*5 == 0x500
dw 0xfaff + (.immEnd - .imm0)                   ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)                  ; will cause warning if entries are too small.
ENDPROC iemAImpl_pshufw
3242
3243
;;
; SSE shuffles (pshufhw/pshuflw/pshufd) with a run-time immediate, using the
; same stub-table dispatch as iemAImpl_pshufw above, but with 6-byte stubs.
;
; @param 1 The instruction.
;
; @param A0 FPU context (fxsave) - not used by the shuffle itself.
; @param A1 Pointer to the 128-bit destination operand (input/output).
; @param A2 Pointer to the 128-bit source operand (input).
; @param A3 The 8-bit immediate (0..255) selecting the shuffle.
;
%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        movdqu  xmm1, [A2]
        lea     T1, [.imm0 xWrtRIP]             ; T1 = base of the stub table
        lea     T0, [A3 + A3*2]                 ; sizeof(pshufXX+ret) == 6: (A3 * 3) *2
        lea     T1, [T1 + T0*2]                 ; T1 = stub for immediate A3
        call    T1
        movdqu  [A1], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                        ; 256*6 == 0x600
dw 0xf9ff + (.immEnd - .imm0)                   ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)                  ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
3275
3276
3277;
3278; Move byte mask.
3279;
3280
;;
; PMOVMSKB on a 64-bit MMX source: collects the byte sign bits into a mask.
;
; @param A0 FPU context (fxsave) - not used here.
; @param A1 Pointer to the 64-bit destination (output).
; @param A2 Pointer to the 64-bit MMX source value (input).
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A2]
        pmovmskb T0, mm1                        ; fully overwrites T0 - the old 'mov T0, [A1]' preload was dead
        mov     [A1], T0
%ifdef RT_ARCH_X86
        mov     dword [A1 + 4], 0               ; T0 is only 32-bit on x86; zero the upper dword explicitly
%endif
        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_pmovmskb_u64
3295
;;
; PMOVMSKB on a 128-bit SSE source: collects the byte sign bits into a mask.
;
; @param A0 FPU context (fxsave) - not used here.
; @param A1 Pointer to the 64-bit destination (output).
; @param A2 Pointer to the 128-bit source value (input).
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A2]                      ; unaligned load - no alignment guarantee on A2
        pmovmskb T0, xmm1                       ; fully overwrites T0 - the old 'mov T0, [A1]' preload was dead
        mov     [A1], T0
%ifdef RT_ARCH_X86
        mov     dword [A1 + 4], 0               ; T0 is only 32-bit on x86; zero the upper dword explicitly
%endif
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_pmovmskb_u128
3310
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette