IEMAllAImpl.asm@ 94410

Last change on this file since 94410 was 94410, checked in by vboxsync, 3 years ago
VMM/IEM: Correction to iemAImpl_fistt_r80_to_i16 return variable size. bugref:9898
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 95.6 KB

Line
1	; $Id: IEMAllAImpl.asm 94410 2022-03-31 10:59:45Z vboxsync $
2	;; @file
3	; IEM - Instruction Implementation in Assembly.
4	;
5
6	;
7	; Copyright (C) 2011-2022 Oracle Corporation
8	;
9	; This file is part of VirtualBox Open Source Edition (OSE), as
10	; available from http://www.virtualbox.org. This file is free software;
11	; you can redistribute it and/or modify it under the terms of the GNU
12	; General Public License (GPL) as published by the Free Software
13	; Foundation, in version 2 as it comes in the "COPYING" file of the
14	; VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15	; hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16	;
17
18
19	;*********************************************************************************************************************************
20	;* Header Files *
21	;*********************************************************************************************************************************
22	%include "VBox/asmdefs.mac"
23	%include "VBox/err.mac"
24	%include "iprt/x86.mac"
25
26
27	;*********************************************************************************************************************************
28	;* Defined Constants And Macros *
29	;*********************************************************************************************************************************
30
;;
; Emits the return instruction of a fastcall function.
;
; Only the 32-bit Windows configuration (RT_ARCH_X86 together with
; RT_OS_WINDOWS) gets a 'ret imm16'; every other configuration gets a plain
; 'ret'.
;
; @param        1       Operand for 'ret' in the 32-bit Windows configuration.
;
%macro RET_FASTCALL 1
%ifndef RT_ARCH_X86
        ret
%else
 %ifndef RT_OS_WINDOWS
        ret
 %else
        ret     %1
 %endif
%endif
%endmacro
45
;;
; Symbol name for fastcall functions.
;
; Expands to NAME(a_Name) everywhere except on 32-bit Windows (RT_ARCH_X86
; together with RT_OS_WINDOWS), where a_Prefix, a_Name, '@' and a_cbArgs are
; pasted together instead.
;
; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
;       escaping (or whatever the dollar is good for here).  Thus the ugly
;       prefix argument.
;
%ifdef RT_ARCH_X86
 %ifdef RT_OS_WINDOWS
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
 %else
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
 %endif
%else
 %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
%endif
60
;;
; Opens a fastcall function: optional export/global directives followed by
; the entry label.
;
; An 'export' directive is emitted when ASM_FORMAT_PE is defined, and another
; one when both __NASM__ and ASM_FORMAT_OMF are defined.  A 'global' directive
; is emitted unless ASM_FORMAT_BIN is defined.  The directives pass '$@' as
; the NAME_FASTCALL prefix while the label itself passes a bare '@'.
;
; @param        1       The function name (C).
; @param        2       The argument size on x86.
;
%macro BEGINPROC_FASTCALL 2
 %ifdef ASM_FORMAT_PE
        export  %1=NAME_FASTCALL(%1,%2,$@)
 %endif
 %ifdef __NASM__
  %ifdef ASM_FORMAT_OMF
        export  NAME(%1) NAME_FASTCALL(%1,%2,$@)
  %endif
 %endif
 %ifndef ASM_FORMAT_BIN
        global  NAME_FASTCALL(%1,%2,$@)
 %endif
NAME_FASTCALL(%1,%2,@):
%endmacro
81
82
83	;
; We employ some macro assembly here to hide the calling convention differences.
85	;
86	%ifdef RT_ARCH_AMD64
; AMD64, one argument: the prologue emits nothing, the epilogue is a plain return.
%macro PROLOGUE_1_ARGS 0
%endmacro

%macro EPILOGUE_1_ARGS 0
        ret
%endmacro
91	%endmacro
; AMD64, one argument, "extended" epilogue.
;
; The x86 variant of this macro and all the other *_ARGS_EX epilogues take
; one parameter (the byte count for 'ret' on x86).  Accept it here as well so
; that an invocation written for x86 also assembles on AMD64; invoking the
; macro without a parameter keeps working.  The parameter is ignored.
;
; @param        1       Optional, ignored on AMD64.
%macro EPILOGUE_1_ARGS_EX 0-1
        ret
%endmacro
95
; AMD64, two to four arguments: the prologues emit nothing and every
; epilogue is a plain 'ret'.  The parameter of the _EX epilogues is accepted
; and ignored.

; -- two arguments --
%macro PROLOGUE_2_ARGS 0
%endmacro

%macro EPILOGUE_2_ARGS 0
        ret
%endmacro

%macro EPILOGUE_2_ARGS_EX 1
        ret
%endmacro

; -- three arguments --
%macro PROLOGUE_3_ARGS 0
%endmacro

%macro EPILOGUE_3_ARGS 0
        ret
%endmacro

%macro EPILOGUE_3_ARGS_EX 1
        ret
%endmacro

; -- four arguments --
%macro PROLOGUE_4_ARGS 0
%endmacro

%macro EPILOGUE_4_ARGS 0
        ret
%endmacro

%macro EPILOGUE_4_ARGS_EX 1
        ret
%endmacro
122
; Argument register aliases when ASM_CALL64_GCC is defined.
; A0..A3 are the first four arguments; the _32/_16/_8 suffix selects the
; sub-register width.  No A3_8 alias is defined.
%ifdef ASM_CALL64_GCC
 %define A0         rdi
 %define A0_32      edi
 %define A0_16      di
 %define A0_8       dil

 %define A1         rsi
 %define A1_32      esi
 %define A1_16      si
 %define A1_8       sil

 %define A2         rdx
 %define A2_32      edx
 %define A2_16      dx
 %define A2_8       dl

 %define A3         rcx
 %define A3_32      ecx
 %define A3_16      cx
%endif
143
; Argument register aliases when ASM_CALL64_MSC is defined.
; A0..A3 are the first four arguments; the _32/_16/_8 suffix selects the
; sub-register width.  No A3_8 alias is defined.
%ifdef ASM_CALL64_MSC
 %define A0         rcx
 %define A0_32      ecx
 %define A0_16      cx
 %define A0_8       cl

 %define A1         rdx
 %define A1_32      edx
 %define A1_16      dx
 %define A1_8       dl

 %define A2         r8
 %define A2_32      r8d
 %define A2_16      r8w
 %define A2_8       r8b

 %define A3         r9
 %define A3_32      r9d
 %define A3_16      r9w
%endif
164
; Temporary (scratch) register aliases for AMD64.
%define T0          rax
%define T0_32       eax
%define T0_16       ax
%define T0_8        al

%define T1          r11
%define T1_32       r11d
%define T1_16       r11w
%define T1_8        r11b

; T2 exists on AMD64 only; the x86 register set has no T2.
%define T2          r10
%define T2_32       r10d
%define T2_16       r10w
%define T2_8        r10b
179
180	%else
181	; x86
; x86, one argument: edi (T1) is saved by the prologue and restored by the
; epilogue.
%macro PROLOGUE_1_ARGS 0
        push    edi
%endmacro

%macro EPILOGUE_1_ARGS 0
        pop     edi
        ret     0
%endmacro

; @param        1       Operand for the 'ret' instruction.
%macro EPILOGUE_1_ARGS_EX 1
        pop     edi
        ret     %1
%endmacro
193
; x86, two arguments: edi (T1) is saved by the prologue and restored by the
; epilogue.
%macro PROLOGUE_2_ARGS 0
        push    edi
%endmacro

%macro EPILOGUE_2_ARGS 0
        pop     edi
        ret     0
%endmacro

; @param        1       Operand for the 'ret' instruction.
%macro EPILOGUE_2_ARGS_EX 1
        pop     edi
        ret     %1
%endmacro
205
; x86, three arguments: ebx (A2) and edi (T1) are saved, and A2 is loaded
; from the first dword above the return address.
%macro PROLOGUE_3_ARGS 0
        push    ebx
        push    edi
        mov     ebx, [esp + 8 + 4]      ; 8 bytes of saved registers + return address
%endmacro

; @param        1       Operand for the 'ret' instruction; must be at least 4.
%macro EPILOGUE_3_ARGS_EX 1
 %if (%1) < 4
  %error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
 %endif
        pop     edi
        pop     ebx
        ret     %1
%endmacro

; Default epilogue: pops the single stack argument (4 bytes).
%macro EPILOGUE_3_ARGS 0
        EPILOGUE_3_ARGS_EX 4
%endmacro
222
; x86, four arguments: ebx (A2), edi (T1) and esi (A3) are saved, then A2
; and A3 are loaded from the two dwords above the return address.
%macro PROLOGUE_4_ARGS 0
        push    ebx
        push    edi
        push    esi
        mov     esi, [esp + 12 + 4 + 4] ; second stack argument -> A3
        mov     ebx, [esp + 12 + 4 + 0] ; first stack argument  -> A2
%endmacro

; @param        1       Operand for the 'ret' instruction; must be at least 8.
%macro EPILOGUE_4_ARGS_EX 1
 %if (%1) < 8
  %error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
 %endif
        pop     esi
        pop     edi
        pop     ebx
        ret     %1
%endmacro

; Default epilogue: pops the two stack arguments (8 bytes).
%macro EPILOGUE_4_ARGS 0
        EPILOGUE_4_ARGS_EX 8
%endmacro
242
; Argument (A0..A3) and temporary (T0, T1) register aliases for x86.
; A2 (ebx) and A3 (esi) are filled in by the PROLOGUE_3_ARGS and
; PROLOGUE_4_ARGS macros.
%define A0          ecx
%define A0_32       ecx
%define A0_16       cx
%define A0_8        cl

%define A1          edx
%define A1_32       edx
%define A1_16       dx
%define A1_8        dl

%define A2          ebx
%define A2_32       ebx
%define A2_16       bx
%define A2_8        bl

%define A3          esi
%define A3_32       esi
%define A3_16       si

%define T0          eax
%define T0_32       eax
%define T0_16       ax
%define T0_8        al

; NOTE(review): there is no T1_8 here, and no T2 at all on x86.  Code that
; references T1_8 outside an RT_ARCH_AMD64 guard will not assemble for this
; target.
%define T1          edi
%define T1_32       edi
%define T1_16       di
270	%endif
271
272
;;
; Merges the selected guest flags into the CPU EFLAGS register.
;
; The bits given by parameters 2 and 3 are taken from the guest flags dword,
; all other bits keep their current CPU value.  The load is unconditional:
; the check on the undefined-flags mask is commented out.
;
; @remarks      Clobbers T0, stack. Changes EFLAGS.
; @param        1       The parameter (A0..A3) pointing to the eflags.
; @param        2       The set of modified flags.
; @param        3       The set of undefined flags.
;
%macro IEM_MAYBE_LOAD_FLAGS 3
 ;%if (%3) != 0
        mov     T0_32, [%1]                     ; guest flags
        pushf                                   ; CPU flags onto the stack
        and     dword [xSP], ~(%2 | %3)         ; drop the selected bits from the CPU copy
        and     T0_32, (%2 | %3)                ; keep only the selected bits of the guest copy
        or      [xSP], T0                       ; merge the two
        popf                                    ; and make the result the CPU flags
 ;%endif
%endmacro
292
;;
; Writes the selected CPU EFLAGS bits back to the guest flags dword.
;
; Nothing is emitted when both masks are zero.
;
; @remarks      Clobbers T0, T1, stack.
; @param        1       The register pointing to the EFLAGS.
; @param        2       The mask of modified flags to save.
; @param        3       The mask of undefined flags to (maybe) save.
;
%macro IEM_SAVE_FLAGS 3
 %if (%2 | %3) != 0
        pushf
        pop     T1                              ; T1 = CPU flags
        mov     T0_32, [%1]                     ; T0 = guest flags
        and     T1_32, (%2 | %3)                ; CPU copy: keep only the selected bits
        and     T0_32, ~(%2 | %3)               ; guest copy: drop the selected bits
        or      T0_32, T1_32
        mov     [%1], T0_32                     ; store the merged flags
 %endif
%endmacro
312
;;
; Writes the selected CPU EFLAGS bits back to the guest flags dword and then
; applies fixed clear and set masks.
;
; Nothing is emitted when all three masks are zero.
;
; @remarks      Clobbers T0, T1, stack.
; @param        1       The register pointing to the EFLAGS.
; @param        2       The mask of modified flags to save.
; @param        3       Mask of additional flags to always clear
; @param        4       Mask of additional flags to always set.
;
%macro IEM_SAVE_AND_ADJUST_FLAGS 4
 %if (%2 | %3 | %4) != 0
        pushf
        pop     T1                              ; T1 = CPU flags
        mov     T0_32, [%1]                     ; T0 = guest flags
        and     T1_32, (%2)                     ; CPU copy: keep only the modified flags
        and     T0_32, ~(%2 | %3)               ; guest copy: drop modified and always-cleared flags
        or      T0_32, T1_32
  %if (%4) != 0
        or      T0_32, %4                       ; force the always-set flags
  %endif
        mov     [%1], T0_32
 %endif
%endmacro
336
;;
; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
; signed input (%4[%5]) and parity index (%6).
;
; This is used by MUL and IMUL, where we got result (%4 & %6) in xAX which is
; also T0.  So, we have to use T1 for the EFLAGS calculation and save T0/xAX
; while we extract the %2 flags from the CPU EFLAGS or use T2 (only AMD64).
;
; On x86 there is no 8-bit alias for T1 (edi), so the parity table entry is
; loaded zero-extended into %6 and merged with a 32-bit OR.
;
; @remarks      Clobbers T1, T2 (AMD64 only), stack, %6, EFLAGS.  T0 is
;               preserved (saved and restored around its use on x86, not
;               used on AMD64).
; @param        1       The register pointing to the EFLAGS.
; @param        2       The mask of modified flags to save.
; @param        3       Mask of additional flags to always clear
; @param        4       The result register to set SF by.
; @param        5       The width of the %4 register in bits (8, 16, 32, or 64).
; @param        6       The (full) register containing the parity table index. Will be modified!

%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF 6
 %ifdef RT_ARCH_AMD64
        pushf
        pop     T2
 %else
        push    T0
        pushf
        pop     T0
 %endif
        mov     T1_32, [%1]                     ; load flags.
        and     T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; clear the modified, always cleared flags and the two flags we calc.
 %ifdef RT_ARCH_AMD64
        and     T2_32, (%2)                     ; select the modified flags.
        or      T1_32, T2_32                    ; combine the flags.
 %else
        and     T0_32, (%2)                     ; select the modified flags.
        or      T1_32, T0_32                    ; combine the flags.
        pop     T0
 %endif

        ; First calculate SF as it's likely to be referring to the same register as %6 does.
        bt      %4, %5 - 1
        jnc     %%sf_clear
        or      T1_32, X86_EFL_SF
 %%sf_clear:

        ; Parity last.
        and     %6, 0xff
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T1_8, [T2 + %6]
 %else
        movzx   %6, byte [NAME(g_afParity) + %6] ; table entry, zero extended
        or      T1_32, %6
 %endif

        mov     [%1], T1_32                     ; save the result.
%endmacro
390
;;
; Applies fixed clear and set masks to the guest flags dword.
;
; The CPU EFLAGS are not consulted.  Nothing is emitted when both masks are
; zero.
;
; @remarks      Clobbers T0.
; @param        1       The register pointing to the EFLAGS.
; @param        2       Mask of additional flags to always clear
; @param        3       Mask of additional flags to always set.
;
%macro IEM_ADJUST_FLAGS 3
 %if (%2 | %3) != 0
        mov     T0_32, [%1]                     ; current guest flags
  %if (%2) != 0
        and     T0_32, ~(%2)                    ; drop the always-cleared flags
  %endif
  %if (%3) != 0
        or      T0_32, %3                       ; force the always-set flags
  %endif
        mov     [%1], T0_32                     ; write back
 %endif
%endmacro
411
;;
; Applies fixed clear and set masks to the guest flags dword and ORs in the
; g_afParity table entry selected by the low byte of a register.
;
; PF is always cleared before the table entry is merged in.
;
; @remarks      Clobbers T0, T2 (AMD64 only), %4, EFLAGS.
; @param        1       The register pointing to the EFLAGS.
; @param        2       Mask of additional flags to always clear
; @param        3       Mask of additional flags to always set.
; @param        4       The (full) register containing the parity table index. Will be modified!
;
%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
        mov     T0_32, [%1]                     ; current guest flags
        and     T0_32, ~(%2 | X86_EFL_PF)       ; drop PF and the always-cleared flags
 %if (%3) != 0
        or      T0_32, %3                       ; force the always-set flags
 %endif
        and     %4, 0xff                        ; table index = low byte
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T0_8, [T2 + %4]
 %else
        or      T0_8, [NAME(g_afParity) + %4]
 %endif
        mov     [%1], T0_32                     ; write back
%endmacro
436
437
438	;*********************************************************************************************************************************
439	;* External Symbols *
440	;*********************************************************************************************************************************
441	extern NAME(g_afParity)
442
443
;;
; Generates the worker functions for a binary operator.
;
; One function is produced per operand size (8, 16 and 32 bits everywhere,
; 64 bits only when RT_ARCH_AMD64 is defined) and, on request, a second
; '_locked' set using the lock prefix.
;
; Every generated function receives a pointer to the destination memory
; operand in A0, the source register operand in A1 and a pointer to eflags in
; A2.
;
; @param        1       The instruction mnemonic.
; @param        2       Non-zero if there should be a locked version.
; @param        3       The modified flags.
; @param        4       The undefined flags.
;
%macro IEMIMPL_BIN_OP 4
BEGINCODE
;
; Plain variants.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      byte [A0], A1_8
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

;
; Lock-prefixed variants, only when requested.
;
 %if %2 != 0

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 byte [A0], A1_8
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro
532
; Instantiations.  Columns: instr, lock, modified-flags, undefined-flags.
; cmp and test get no locked variants.
IEMIMPL_BIN_OP add,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP adc,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sub,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sbb,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP or,   1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP xor,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP and,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP cmp,  0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP test, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
543
544
;;
; Generates the worker functions for a bit operator.
;
; One function is produced per operand size (16 and 32 bits everywhere,
; 64 bits only when RT_ARCH_AMD64 is defined) and, on request, a second
; '_locked' set using the lock prefix.
;
; Every generated function receives a pointer to the destination memory
; operand in A0, the source register operand in A1 and a pointer to eflags in
; A2.
;
; @param        1       The instruction mnemonic.
; @param        2       Non-zero if there should be a locked version.
; @param        3       The modified flags.
; @param        4       The undefined flags.
;
%macro IEMIMPL_BIT_OP 4
BEGINCODE
;
; Plain variants.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

;
; Lock-prefixed variants, only when requested.
;
 %if %2 != 0

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro
; Instantiations.  Columns: instr, lock, modified-flags, undefined-flags.
; bt gets no locked variants.
IEMIMPL_BIT_OP bt,  0, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
621
;;
; Macro for implementing a bit search operator.
;
; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
; system where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; In the ZF case the destination register is 'undefined', however it seems that
; both AMD and Intel just leaves it as is.  The undefined EFLAGS differs between
; AMD and Intel and according to https://www.sandpile.org/x86/flags.htm between
; Intel microarchitectures.  We only implement 'intel' and 'amd' variation with
; the behaviour of more recent CPUs (Intel 10980X and AMD 3990X).
;
; Three functions are generated per operand size:
;   - native:  guest flags are loaded, the instruction is executed and the
;              modified + undefined flags are saved back.
;   - _intel:  the guest flags are computed with fixed masks and the parity
;              table; the CPU flags produced by the instruction are only used
;              for the branch on ZF.
;   - _amd:    only the modified flags (parameter 2) are saved back.
;
; @note This macro shares its name with the four parameter IEMIMPL_BIT_OP
;       macro; the two are told apart by their parameter count.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
%macro IEMIMPL_BIT_OP 3
BEGINCODE
;
; 16-bit.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
        %1      T0_16, A1_16
        jz      .unchanged_dst                  ; ZF set: leave the destination alone
        mov     [A0], T0_16
.unchanged_dst:
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
        PROLOGUE_3_ARGS
        %1      T1_16, A1_16
        jz      .unchanged_dst
        mov     [A0], T1_16
        ; Result stored: clear OF, SF, AF, CF and ZF; PF comes from the result's low byte.
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        ; Nothing stored: clear OF, SF, AF and CF; set ZF and PF.
        IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
        PROLOGUE_3_ARGS
        %1      T0_16, A1_16
        jz      .unchanged_dst
        mov     [A0], T0_16
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0  ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_amd

;
; 32-bit.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
        %1      T0_32, A1_32
        jz      .unchanged_dst
        mov     [A0], T0_32
.unchanged_dst:
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
        PROLOGUE_3_ARGS
        %1      T1_32, A1_32
        jz      .unchanged_dst
        mov     [A0], T1_32
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32
        jz      .unchanged_dst
        mov     [A0], T0_32
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0  ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_amd

;
; 64-bit (AMD64 hosts only).
;
 %ifdef RT_ARCH_AMD64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
        %1      T0, A1
        jz      .unchanged_dst
        mov     [A0], T0
.unchanged_dst:
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
        PROLOGUE_3_ARGS
        ; No guest flag load here, same as the 16 and 32-bit intel variants:
        ; the guest flags are rebuilt from fixed masks below and the CPU flags
        ; are only consulted for ZF, which depends solely on the source.
        %1      T1, A1
        jz      .unchanged_dst
        mov     [A0], T1
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS_EX 8
.unchanged_dst:
        IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
        PROLOGUE_3_ARGS
        %1      T0, A1
        jz      .unchanged_dst
        mov     [A0], T0
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0  ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_amd

 %endif ; RT_ARCH_AMD64
%endmacro
749
; Instantiations of the three parameter (bit search) macro.
; Columns: instr, modified-flags, undefined-flags.
IEMIMPL_BIT_OP bsf, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF)
IEMIMPL_BIT_OP bsr, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF)
752
753
;
; IMUL is also a similar but yet different case (no lock, no mem dst).
; The rDX:rAX variant of imul is handled together with mul further down.
;
; Each generated function multiplies the register operand in A1 by the value
; A0 points to, stores the product through A0 and updates the eflags A2
; points to.
;
BEGINCODE
; @param        1       EFLAGS that are modified.
; @param        2       Undefined EFLAGS.
; @param        3       Function suffix.
; @param        4       EFLAGS variation.  The value 1 (used for the _intel
;                       functions) saves the modified flags, clears AF and ZF
;                       and calculates SF and PF from the result.  Any other
;                       value (0 is used for the native functions, 2 for the
;                       _amd ones) saves the modified and undefined flags as
;                       the CPU produced them.
%macro IEMIMPL_IMUL_TWO 4
BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS                    A2, %1, %2
        imul    A1_16, word [A0]
        mov     [A0], A1_16
 %if %4 == 1
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF    A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_16, 16, A1
 %else
        IEM_SAVE_FLAGS                          A2, %1, %2
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u16 %+ %3

BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS                    A2, %1, %2
        imul    A1_32, dword [A0]
        mov     [A0], A1_32
 %if %4 == 1
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF    A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_32, 32, A1
 %else
        IEM_SAVE_FLAGS                          A2, %1, %2
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u32 %+ %3

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS                    A2, %1, %2
        imul    A1, qword [A0]
        mov     [A0], A1
  %if %4 == 1
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF    A2, %1, X86_EFL_AF | X86_EFL_ZF, A1, 64, A1
  %else
        IEM_SAVE_FLAGS                          A2, %1, %2
  %endif
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_imul_two_u64 %+ %3
 %endif ; RT_ARCH_AMD64
%endmacro
; Instantiations.  Columns: modified-flags, undefined-flags, suffix, variation.
; The native set has an empty suffix.
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF,       , 0
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0,                                                 _intel, 1
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0,                                                 _amd,   2
809
810
811	;
812	; XCHG for memory operands. This implies locking. No flag changes.
813	;
814	; Each function takes two arguments, first the pointer to the memory,
815	; then the pointer to the register. They all return void.
816	;
817	BEGINCODE
; Swaps the byte at [A0] with the byte at [A1], using xchg on [A0].
BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_8, [A1]              ; T0 = register value
        xchg    T0_8, [A0]              ; swap with the memory operand
        mov     [A1], T0_8              ; old memory value -> register
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_locked
825
; Swaps the word at [A0] with the word at [A1], using xchg on [A0].
BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A1]             ; T0 = register value
        xchg    T0_16, [A0]             ; swap with the memory operand
        mov     [A1], T0_16             ; old memory value -> register
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_locked
833
; Swaps the dword at [A0] with the dword at [A1], using xchg on [A0].
BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A1]             ; T0 = register value
        xchg    T0_32, [A0]             ; swap with the memory operand
        mov     [A1], T0_32             ; old memory value -> register
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_locked
841
%ifdef RT_ARCH_AMD64
; Swaps the qword at [A0] with the qword at [A1], using xchg on [A0].
BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
        PROLOGUE_2_ARGS
        mov     T0, [A1]                ; T0 = register value
        xchg    T0, [A0]                ; swap with the memory operand
        mov     [A1], T0                ; old memory value -> register
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_locked
%endif
851
852	; Unlocked variants for fDisregardLock mode.
853
; Swaps the byte at [A0] with the byte at [A1] using plain loads and stores
; (no xchg on memory).
;
; The x86 register set has no 8-bit alias for T1 (edi), so 32-bit builds use
; ah, the second byte of T0, as the second temporary instead.
BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
        PROLOGUE_2_ARGS
%ifdef RT_ARCH_AMD64
        mov     T0_8, [A1]
        mov     T1_8, [A0]
        mov     [A0], T0_8
        mov     [A1], T1_8
%else
        mov     T0_8, [A1]
        mov     ah, [A0]
        mov     [A0], T0_8
        mov     [A1], ah
%endif
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_unlocked
862
; Swaps the word at [A0] with the word at [A1] using plain loads and stores.
BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A1]             ; both values are read first ...
        mov     T1_16, [A0]
        mov     [A0], T0_16             ; ... and then written crosswise
        mov     [A1], T1_16
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_unlocked
871
; Swaps the dword at [A0] with the dword at [A1] using plain loads and stores.
BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A1]             ; both values are read first ...
        mov     T1_32, [A0]
        mov     [A0], T0_32             ; ... and then written crosswise
        mov     [A1], T1_32
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_unlocked
880
%ifdef RT_ARCH_AMD64
; Swaps the qword at [A0] with the qword at [A1] using plain loads and stores.
BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0, [A1]                ; both values are read first ...
        mov     T1, [A0]
        mov     [A0], T0                ; ... and then written crosswise
        mov     [A1], T1
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_unlocked
%endif
891
892
893	;
894	; XADD for memory operands.
895	;
896	; Each function takes three arguments, first the pointer to the
897	; memory/register, then the pointer to the register, and finally a pointer to
898	; eflags. They all return void.
899	;
900	BEGINCODE
; 8-bit XADD: [A0] is the memory/register operand, [A1] the register operand
; and A2 points to the eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]              ; fetch the register operand
        xadd    [A0], T0_8
        mov     [A1], T0_8              ; hand back what xadd left in the register
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8
910
; 16-bit XADD: [A0] is the memory/register operand, [A1] the register operand
; and A2 points to the eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]             ; fetch the register operand
        xadd    [A0], T0_16
        mov     [A1], T0_16             ; hand back what xadd left in the register
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16
920
; 32-bit XADD: [A0] is the memory/register operand, [A1] the register operand
; and A2 points to the eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]             ; fetch the register operand
        xadd    [A0], T0_32
        mov     [A1], T0_32             ; hand back what xadd left in the register
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32
930
%ifdef RT_ARCH_AMD64
; 64-bit XADD: [A0] is the memory/register operand, [A1] the register operand
; and A2 points to the eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]                ; fetch the register operand
        xadd    [A0], T0
        mov     [A1], T0                ; hand back what xadd left in the register
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64
%endif ; RT_ARCH_AMD64
942
; 8-bit lock-prefixed XADD: [A0] is the memory operand, [A1] the register
; operand and A2 points to the eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]              ; fetch the register operand
        lock xadd [A0], T0_8
        mov     [A1], T0_8              ; hand back what xadd left in the register
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8_locked
952
; 16-bit lock-prefixed XADD: [A0] is the memory operand, [A1] the register
; operand and A2 points to the eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]             ; fetch the register operand
        lock xadd [A0], T0_16
        mov     [A1], T0_16             ; hand back what xadd left in the register
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16_locked
962
; 32-bit lock-prefixed XADD: [A0] is the memory operand, [A1] the register
; operand and A2 points to the eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]             ; fetch the register operand
        lock xadd [A0], T0_32
        mov     [A1], T0_32             ; hand back what xadd left in the register
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32_locked
972
%ifdef RT_ARCH_AMD64
; 64-bit lock-prefixed XADD: [A0] is the memory operand, [A1] the register
; operand and A2 points to the eflags.
BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]                ; fetch the register operand
        lock xadd [A0], T0
        mov     [A1], T0                ; hand back what xadd left in the register
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64_locked
%endif ; RT_ARCH_AMD64
984
985
986	;
987	; CMPXCHG8B.
988	;
989	; These are tricky register wise, so the code is duplicated for each calling
990	; convention.
991	;
992	; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
993	;
994	; C-proto:
995	; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
996	; uint32_t *pEFlags));
997	;
998	; Note! Identical to iemAImpl_cmpxchg16b.
999	;
1000	BEGINCODE
; All three variants follow the same plan: load ebx:ecx from pu64EbxEcx,
; merge the guest ZF into the CPU flags, load eax:edx from pu64EaxEdx, run a
; lock-prefixed cmpxchg8b on pu64Dst, write eax:edx back and save ZF.
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
        ;
        ; In:  rcx = pu64Dst, rdx = pu64EaxEdx, r8 = pu64EbxEcx, r9 = pEFlags.
        ;
        push    rbx

        mov     r11, rdx                ; pu64EaxEdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     ebx, [r8]
        mov     ecx, [r8 + 4]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [r11]
        mov     edx, [r11 + 4]

        lock cmpxchg8b [r10]

        mov     [r11], eax
        mov     [r11 + 4], edx
        IEM_SAVE_FLAGS       r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        ;
        ; In:  rdi = pu64Dst, rsi = pu64EaxEdx, rdx = pu64EbxEcx, rcx = pEFlags.
        ;
        push    rbx

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64EbxEcx (is also T1)

        mov     ebx, [r11]
        mov     ecx, [r11 + 4]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [rsi]
        mov     edx, [rsi + 4]

        lock cmpxchg8b [rdi]

        mov     [rsi], eax
        mov     [rsi + 4], edx
        IEM_SAVE_FLAGS       r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
%else
        ;
        ; In:  ecx = pu64Dst, edx = pu64EaxEdx, two dwords on the stack:
        ;      pu64EbxEcx and pEFlags.
        ;
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64EaxEdx
        mov     ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0  ; clobbers T0 (eax)
        mov     eax, [esi]
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS       ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8                       ; drop the two stack arguments
%endif
ENDPROC iemAImpl_cmpxchg8b
1075
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
        ; iemAImpl_cmpxchg8b always applies the lock prefix, so the locked
        ; variant is just a tail jump to it with the arguments untouched.
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg8b,16,$@)
ENDPROC iemAImpl_cmpxchg8b_locked
1080
1081	%ifdef RT_ARCH_AMD64
1082
;
; CMPXCHG16B.
;
; The instruction hard-wires RDX:RAX and RCX:RBX, which overlap the argument
; registers, so the register shuffling is written out by hand for each calling
; convention rather than going through the generic PROLOGUE/EPILOGUE macros.
;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu1284RaxRdx, PRTUINT128U pu128RbxRcx,
;                                              uint32_t *pEFlags));
;
; Note! Identical in structure to iemAImpl_cmpxchg8b.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
 %ifdef ASM_CALL64_MSC
        ; rcx = pu128Dst, rdx = pu128RaxRdx, r8 = pu128RbxRcx, r9 = pEFlags.
        push    rbx                     ; callee-saved, needed for the new value.

        mov     r10, rcx                ; pu128Dst
        mov     r11, rdx                ; pu128RaxRdx (is also T1)

        ; rcx:rbx = replacement value.
        mov     rcx, [r8 + 8]
        mov     rbx, [r8]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        ; rdx:rax = comparand; loaded after the flags macro since that trashes rax.
        mov     rdx, [r11 + 8]
        mov     rax, [r11]

        lock cmpxchg16b [r10]

        ; Write back rdx:rax before IEM_SAVE_FLAGS destroys rax and r11.
        mov     [r11 + 8], rdx
        mov     [r11], rax
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        ; rdi = pu128Dst, rsi = pu128RaxRdx, rdx = pu128RbxRcx, rcx = pEFlags.
        push    rbx                     ; callee-saved, needed for the new value.

        mov     r11, rdx                ; pu128RbxRcx (is also T1)
        mov     r10, rcx                ; pEFlags

        ; rcx:rbx = replacement value.
        mov     rcx, [r11 + 8]
        mov     rbx, [r11]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        ; rdx:rax = comparand; loaded after the flags macro since that trashes rax.
        mov     rdx, [rsi + 8]
        mov     rax, [rsi]

        lock cmpxchg16b [rdi]

        mov     [rsi + 8], rdx
        mov     [rsi], rax
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b
1142
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
        ; iemAImpl_cmpxchg16b always applies the lock prefix, so the locked
        ; variant is just a tail jump to it with the arguments untouched.
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg16b,16,$@)
ENDPROC iemAImpl_cmpxchg16b_locked
1147
1148	%endif ; RT_ARCH_AMD64
1149
1150
;
; CMPXCHG.
;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t *puEax, uintX_t uReg, uint32_t *pEFlags));
;
; A0 points to the destination operand, A1 to the guest accumulator value
; (read before and written back after the instruction), A2 holds the source
; register value and A3 points to the eflags.  On 32-bit hosts the 64-bit
; variant receives uReg as a pointer instead of a value.
;
; @param 1      Instruction prefix: empty or 'lock'.
; @param 2      Function name suffix: empty or '_locked'.
;
BEGINCODE
%macro IEMIMPL_CMPXCHG 2
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     al, [A1]                ; loaded after the flags macro since that trashes eax.
        %1 cmpxchg [A0], A2_8
        mov     [A1], al
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u8 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     ax, [A1]
        %1 cmpxchg [A0], A2_16
        mov     [A1], ax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u16 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [A1]
        %1 cmpxchg [A0], A2_32
        mov     [A1], eax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u32 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
%ifdef RT_ARCH_AMD64
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     rax, [A1]
        %1 cmpxchg [A0], A2
        mov     [A1], rax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
%else
        ;
        ; Must use cmpxchg8b here.  See also iemAImpl_cmpxchg8b.
        ;
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64Rax
        ; 16 bytes of saved registers + 4 bytes of return address precede the stack arguments.
        mov     ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        ; ecx:ebx = replacement value; ebx first because ecx is still the pointer.
        mov     ebx, [ecx]
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        ; cmpxchg8b only produces ZF, whereas cmpxchg also sets CF, PF, AF, SF
        ; and OF like a CMP of the accumulator with the destination, so those
        ; are synthesized here.
        ;
        ; ZF=0 means the compare failed and edx:eax now holds the destination
        ; value while [esi] still holds the old accumulator; only in that case
        ; must the two be compared.  (This used to branch on ZF=1, which made
        ; a failed compare fall into 'cmp eax, eax' and report ZF=1.)
        jnz     .cmpxchg8b_not_equal
        cmp     eax, eax                ; equal: just set the other flags.
.store:
        ; The MOVs leave the flags from the CMP above intact for IEM_SAVE_FLAGS.
        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8                       ; pops the two stack arguments.

.cmpxchg8b_not_equal:
        ; Old accumulator ([esi]) minus destination (edx:eax), high dword first.
        cmp     [esi + 4], edx          ;; @todo FIXME - verify 64-bit compare implementation
        jne     .store
        cmp     [esi], eax
        jmp     .store

%endif
ENDPROC iemAImpl_cmpxchg_u64 %+ %2
%endmacro ; IEMIMPL_CMPXCHG

IEMIMPL_CMPXCHG , ,
IEMIMPL_CMPXCHG lock, _locked
1248
;;
; Macro for implementing a unary operator.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions take a pointer to the destination memory operand in A0
; and a pointer to eflags in A1.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
%macro IEMIMPL_UNARY_OP 3
BEGINCODE
        ; --- 8-bit ---
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      byte [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 byte [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

        ; --- 16-bit ---
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      word [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 word [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

        ; --- 32-bit ---
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      dword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 dword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

        ; --- 64-bit, 64-bit hosts only ---
 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      qword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 qword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_UNARY_OP not, 0, 0
1337
1338
;
; BSWAP. No flag changes.
;
; Each function takes one argument, pointer to the value to bswap
; (input/output).  They all return void.
;
BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]             ; full dword, just in case any of the upper bits are used.
        ; Hand-emitted operand-size prefix turning the next instruction into
        ; the 16-bit form of BSWAP, so the host CPU produces whatever it
        ; produces for that (architecturally undefined) encoding.
        db 66h
        bswap   T0_32
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u16
1353
BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
        PROLOGUE_1_ARGS
        ; Load, byte-reverse and store the 32-bit value at A0.
        mov     T0_32, [A0]
        bswap   T0_32
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u32
1361
BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
        PROLOGUE_1_ARGS
%ifdef RT_ARCH_AMD64
        ; 64-bit host: a single 64-bit BSWAP does the job.
        mov     T0, [A0]
        bswap   T0
        mov     [A0], T0
%else
        ; 32-bit host: byte-reverse each half and store them swapped.
        mov     T1, [A0 + 4]            ; T1 = high dword
        mov     T0, [A0]                ; T0 = low dword
        bswap   T1
        bswap   T0
        mov     [A0], T1                ; reversed high dword becomes the low one
        mov     [A0 + 4], T0            ; ... and vice versa.
%endif
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u64
1380
1381
;;
; Macro for implementing a shift operation.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit system where the 64-bit accesses requires hand coding.
;
; All the functions take a pointer to the destination memory operand in A0,
; the shift count in A1 and a pointer to eflags in A2.
;
; The shift count has to be in CL.  With the GCC 64-bit convention it is
; copied there from A1; otherwise A0 and A1 are exchanged, which leaves the
; count in cl and the destination pointer in A1 (this relies on A0 being xCX
; in those conventions).
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
;
; @note the _intel and _amd variants are implemented in C.
;
%macro IEMIMPL_SHIFT_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8                ; count -> cl
        %1      byte [A0], cl
 %else
        xchg    A1, A0                  ; count -> cl, pointer -> A1
        %1      byte [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8                ; count -> cl
        %1      word [A0], cl
 %else
        xchg    A1, A0                  ; count -> cl, pointer -> A1
        %1      word [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8                ; count -> cl
        %1      dword [A0], cl
 %else
        xchg    A1, A0                  ; count -> cl, pointer -> A1
        %1      dword [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
  %ifdef ASM_CALL64_GCC
        mov     cl, A1_8                ; count -> cl
        %1      qword [A0], cl
  %else
        xchg    A1, A0                  ; count -> cl, pointer -> A1
        %1      qword [A1], cl
  %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1468
1469
;;
; Macro for implementing a double precision shift operation.
;
; This will generate code for the 16, 32 and 64 bit accesses, except on
; 32-bit system where the 64-bit accesses requires hand coding.
;
; The functions take the destination operand (r/m) in A0, the source (reg) in
; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
;
; The shift count has to be in CL.  With the GCC 64-bit convention A2 and A3
; are exchanged around the instruction (relies on A3 being rcx there);
; otherwise A0 and A2 are exchanged, leaving the count in cl and the
; destination pointer in A2 (relies on A0 being xCX).
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
;
; @note the _intel and _amd variants are implemented in C.
;
%macro IEMIMPL_SHIFT_DBL_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; count -> cl, pEFlags -> A2
        %1      [A0], A1_16, cl
        xchg    A3, A2                  ; put pEFlags back in A3 for IEM_SAVE_FLAGS.
 %else
        xchg    A0, A2                  ; count -> cl, pointer -> A2
        %1      [A2], A1_16, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; count -> cl, pEFlags -> A2
        %1      [A0], A1_32, cl
        xchg    A3, A2                  ; put pEFlags back in A3 for IEM_SAVE_FLAGS.
 %else
        xchg    A0, A2                  ; count -> cl, pointer -> A2
        %1      [A2], A1_32, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
  %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; count -> cl, pEFlags -> A2
        %1      [A0], A1, cl
        xchg    A3, A2                  ; put pEFlags back in A3 for IEM_SAVE_FLAGS.
  %else
        xchg    A0, A2                  ; count -> cl, pointer -> A2
        %1      [A2], A1, cl
  %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1540
1541
;;
; Macro for implementing a multiplication operations.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit system where the 64-bit accesses requires hand coding.
;
; The 8-bit function only operates on AX, so it takes no DX pointer.  The other
; functions take a pointer to rAX in A0, rDX in A1, the operand in A2 and a
; pointer to eflags in A3.
;
; The functions all return 0 so the caller can be used for div/idiv as well as
; for the mul/imul implementation.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
; @param        4       Name suffix.
; @param        5       EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
;
; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
;
%macro IEMIMPL_MUL_OP 5
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     al, [A0]                ; AL = multiplicand
        %1      A1_8
        mov     [A0], ax                ; the 16-bit product goes back to AX.
 %if %5 != 1
        IEM_SAVE_FLAGS A2, %2, %3
 %else
        ; Intel flavour: SF and PF are calculated from the result, AF and ZF adjusted.
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX
 %endif
        xor     eax, eax                ; return 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     ax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_16
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                  ; keep the rDX pointer in T1 across the instruction.
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX
 %endif
        xor     eax, eax                ; return 0
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     eax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                  ; keep the rDX pointer in T1 across the instruction.
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX
 %endif
        xor     eax, eax                ; return 0
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     rax, [A0]
  %ifdef ASM_CALL64_GCC
        %1      A2
        mov     [A0], rax
        mov     [A1], rdx
  %else
        mov     T1, A1                  ; keep the rDX pointer in T1 across the instruction.
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
  %endif
  %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
  %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX
  %endif
        xor     eax, eax                ; return 0
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
1659
1660
BEGINCODE
;;
; Worker function for negating the 64-bit number held in T1:T0 as two 32-bit
; halves (T0 = low dword, T1 = high dword), i.e. T1:T0 = 0 - T1:T0.
;
; Two zeroed stack slots are swapped with the registers so the subtraction
; can be done without any additional scratch register.
;
; @uses     None (T0,T1)
BEGINPROC iemAImpl_negate_T0_T1_u32
        push    0
        push    0
        xchg    T0_32, [xSP]            ; T0 = 0, slot 0 = old low half
        xchg    T1_32, [xSP + xCB]      ; T1 = 0, slot 1 = old high half
        sub     T0_32, [xSP]            ; low  = 0 - old low
        sbb     T1_32, [xSP + xCB]      ; high = 0 - old high - borrow
        add     xSP, xCB*2              ; drop the two slots.
        ret
ENDPROC iemAImpl_negate_T0_T1_u32
1675
1676	%ifdef RT_ARCH_AMD64
;;
; Worker function for negating the 128-bit number held in T1:T0 as two 64-bit
; halves (T0 = low qword, T1 = high qword), i.e. T1:T0 = 0 - T1:T0.
;
; Two zeroed stack slots are swapped with the registers so the subtraction
; can be done without any additional scratch register.
;
; @uses     None (T0,T1)
BEGINPROC iemAImpl_negate_T0_T1_u64
        push    0
        push    0
        xchg    T0, [xSP]               ; T0 = 0, slot 0 = old low half
        xchg    T1, [xSP + xCB]         ; T1 = 0, slot 1 = old high half
        sub     T0, [xSP]               ; low  = 0 - old low
        sbb     T1, [xSP + xCB]         ; high = 0 - old high - borrow
        add     xSP, xCB*2              ; drop the two slots.
        ret
ENDPROC iemAImpl_negate_T0_T1_u64
1690	%endif
1691
1692
;;
; Macro for implementing a division operations.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit system where the 64-bit accesses requires hand coding.
;
; The 8-bit function only operates on AX, so it takes no DX pointer.  The other
; functions take a pointer to rAX in A0, rDX in A1, the operand in A2 and a
; pointer to eflags in A3.
;
; The functions all return 0 on success and -1 if a divide error should be
; raised by the caller.  The divisor and the quotient range are checked up
; front so the host instruction is only executed when that check passes.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
; @param        4       1 if signed, 0 if unsigned.
; @param        5       Function suffix.
; @param        6       EFLAGS variation: 0 for native, 1 for intel (ignored),
;                       2 for AMD (set AF, clear PF, ZF and SF).
;
; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
;
%macro IEMIMPL_DIV_OP 6
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
        PROLOGUE_3_ARGS

        ; div by chainsaw check.
        test    A1_8, A1_8
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A0 + 1], A1_8          ; high byte of AX must be below the divisor.
        jae     .div_overflow
 %else
        mov     T0_16, [A0]             ; T0 = dividend
        mov     T1, A1                  ; T1 = saved divisor (because of missing T1_8 in 32-bit)
        test    A1_8, A1_8
        js      .divisor_negative
        test    T0_16, T0_16
        jns     .both_positive
        neg     T0_16
.one_of_each:                           ; OK range is 2^(result-with - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_16, 7
        cmp     T0_8, A1_8
        pop     T0                      ; pop leaves the flags from the cmp intact.
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_8, 0x7f              ; Special case for covering (divisor - 1).
        cmp     T0_8, A1_8
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A1_8
        test    T0_16, T0_16
        jns     .one_of_each
        neg     T0_16
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_16, 7
        cmp     T0_8, A1_8
        jae     .div_overflow
.div_no_overflow:
        mov     A1, T1                  ; restore divisor
 %endif

        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     ax, [A0]
        %1      A1_8
        mov     [A0], ax
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A2, %2, %3
 %endif
        xor     eax, eax                ; return 0

.return:
        EPILOGUE_3_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1                 ; tell the caller to raise a divide error.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_16, A2_16
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_16             ; DX must be below the divisor.
        jae     .div_overflow
 %else
        mov     T0_16, [A1]
        shl     T0_32, 16
        mov     T0_16, [A0]             ; T0 = dividend
        mov     T1, A2                  ; T1 = divisor
        test    T1_16, T1_16
        js      .divisor_negative
        test    T0_32, T0_32
        jns     .both_positive
        neg     T0_32
.one_of_each:                           ; OK range is 2^(result-with - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_32, 15
        cmp     T0_16, T1_16
        pop     T0                      ; pop leaves the flags from the cmp intact.
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_16, 0x7fff           ; Special case for covering (divisor - 1).
        cmp     T0_16, T1_16
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     T1_16
        test    T0_32, T0_32
        jns     .one_of_each
        neg     T0_32
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_32, 15
        cmp     T0_16, T1_16
        jae     .div_overflow
.div_no_overflow:
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; divisor -> T1, the instruction overwrites rdx.
        mov     ax, [A0]
        mov     dx, [A1]
        %1      T1_16
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                  ; keep the rDX pointer in T1 across the instruction.
        mov     ax, [A0]
        mov     dx, [T1]
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax                ; return 0

.return:
        EPILOGUE_4_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1                 ; tell the caller to raise a divide error.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_32, A2_32
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_32             ; EDX must be below the divisor.
        jae     .div_overflow
 %else
        push    A2                      ; save A2 so we modify it (we out of regs on x86).
        mov     T0_32, [A0]             ; T0 = dividend low
        mov     T1_32, [A1]             ; T1 = dividend high
        test    A2_32, A2_32
        js      .divisor_negative
        test    T1_32, T1_32
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u32)
.one_of_each:                           ; OK range is 2^(result-with - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        pop     T0                      ; pop leaves the flags from the cmp intact.
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_32, 0x7fffffff       ; Special case for covering (divisor - 1).
        cmp     T0_32, A2_32
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2_32
        test    T1_32, T1_32
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u32)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        jae     .div_overflow
.div_no_overflow:
        pop     A2                      ; restore the unmodified divisor.
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        ; Note: eax is loaded inside each branch below; the extra load that
        ;       used to precede the %ifdef was redundant and has been dropped.
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; divisor -> T1, the instruction overwrites rdx.
        mov     eax, [A0]
        mov     edx, [A1]
        %1      T1_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                  ; keep the rDX pointer in T1 across the instruction.
        mov     eax, [A0]
        mov     edx, [T1]
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax                ; return 0

.return:
        EPILOGUE_4_ARGS

.div_overflow:
 %if %4 != 0
        pop     A2                      ; balance the push done by the signed overflow check.
 %endif
.div_zero:
        mov     eax, -1                 ; tell the caller to raise a divide error.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2, A2
        jz      .div_zero
  %if %4 == 0
        cmp     [A1], A2                ; RDX must be below the divisor.
        jae     .div_overflow
  %else
        push    A2                      ; save A2 so we modify it (we out of regs on x86).
        mov     T0, [A0]                ; T0 = dividend low
        mov     T1, [A1]                ; T1 = dividend high
        test    A2, A2
        js      .divisor_negative
        test    T1, T1
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u64)
.one_of_each:                           ; OK range is 2^(result-with - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        pop     T0                      ; pop leaves the flags from the cmp intact.
        jb      .div_no_overflow
        ja      .div_overflow
        mov     T1, 0x7fffffffffffffff  ; AND takes no 64-bit immediate, so go via T1.
        and     T0, T1                  ; Special case for covering (divisor - 1).
        cmp     T0, A2
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2
        test    T1, T1
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u64)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        jae     .div_overflow
.div_no_overflow:
        pop     A2                      ; restore the unmodified divisor.
  %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        ; Note: rax is loaded inside each branch below; the extra load that
        ;       used to precede the %ifdef was redundant and has been dropped.
  %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; divisor -> T1, the instruction overwrites rdx.
        mov     rax, [A0]
        mov     rdx, [A1]
        %1      T1
        mov     [A0], rax
        mov     [A1], rdx
  %else
        mov     T1, A1                  ; keep the rDX pointer in T1 across the instruction.
        mov     rax, [A0]
        mov     rdx, [T1]
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
  %endif
  %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
  %else
        IEM_SAVE_FLAGS A3, %2, %3
  %endif
        xor     eax, eax                ; return 0

.return:
        EPILOGUE_4_ARGS_EX 12

.div_overflow:
  %if %4 != 0
        pop     A2                      ; balance the push done by the signed overflow check.
  %endif
.div_zero:
        mov     eax, -1                 ; tell the caller to raise a divide error.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_DIV_OP div, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, , 0
IEMIMPL_DIV_OP div, 0, 0, 0, _intel, 1
IEMIMPL_DIV_OP div, 0, 0, 0, _amd, 2
IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1, , 0
IEMIMPL_DIV_OP idiv, 0, 0, 1, _intel, 1
IEMIMPL_DIV_OP idiv, 0, 0, 1, _amd, 2
2040
2041
;;
; Macro for implementing memory fence operation.
;
; Each generated function consists of nothing but the fence instruction
; followed by a return: no operands and no return value.
;
; @param        1      The instruction.
;
%macro IEMIMPL_MEM_FENCE 1
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
        %1                              ; the fence itself
        ret
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_MEM_FENCE lfence
IEMIMPL_MEM_FENCE sfence
IEMIMPL_MEM_FENCE mfence
2060
;;
; Alternative for non-SSE2 host.
;
; Uses an XCHG with a memory operand (a scratch stack slot), which the CPU
; executes as a locked operation, in place of a real fence instruction.
; xAX ends up with the value it had on entry.
;
BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
        push    xAX                     ; scratch slot holding a copy of xAX
        xchg    xAX, [xSP]              ; swap xAX with its own copy
        add     xSP, xCB                ; drop the scratch slot.
        ret
ENDPROC iemAImpl_alt_mem_fence
2070
2071
;;
; Initialize the FPU for the actual instruction being emulated, this means
; loading parts of the guest's control word and status word.
;
; The current host FPU environment is stored at [xSP], its FCW and FSW fields
; are patched with guest values, and the result is loaded back.
;
; @uses     The FNSTENV image at [xSP], plus T0 and T1.
;           NOTE(review): this used to say "24 bytes of stack"; the 32-bit
;           protected mode FNSTENV image is 28 bytes according to the Intel
;           SDM - confirm.  The callers in this file reserve 20h bytes.
; @param    1       Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
        fnstenv [xSP]

        ; FCW - for exception, precision and rounding control.
        movzx   T0, word [%1 + X86FXSTATE.FCW]
        and     T0, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        ; Keep TOP from the host environment and take the condition codes from the guest.
        movzx   T0, word [xSP + X86FSTENV32P.FSW]
        and     T0, X86_FSW_TOP_MASK
        movzx   T1, word [%1 + X86FXSTATE.FSW]
        and     T1, X86_FSW_C_MASK
        or      T0, T1
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        fldenv  [xSP]
%endmacro
2097
2098
;;
; Layout of the result buffer filled in by the FPU helpers that produce one
; 80-bit value plus a status word.
;
; @todo Need to move this somewhere better?
;
struc IEMFPURESULT
    .r80Result      resw 5              ; 80-bit floating point result (10 bytes).
    .FSW            resw 1              ; FPU status word after the operation.
endstruc
2106
2107
;;
; Layout of a result buffer holding two 80-bit values with the status word
; in between.
;
; @todo Need to move this somewhere better?
;
struc IEMFPURESULTTWO
    .r80Result1     resw 5              ; first 80-bit result (10 bytes).
    .FSW            resw 1              ; FPU status word.
    .r80Result2     resw 5              ; second 80-bit result (10 bytes).
endstruc
2116
2117
2118	;
2119	;---------------------- 16-bit signed integer operations ----------------------
2120	;
2121
2122
;;
; Converts a 16-bit signed integer to an 80-bit floating point value (fpu register).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 16-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment image.

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    word [A2]

        ; Status word first, then clear the exception flags before storing the value.
        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i16
2146
2147
;;
; Store a 80-bit floating point value (register) as a 16-bit signed integer (memory).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 16-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment image.

        fninit
        fld     tword [A3]              ; value goes in before the guest FCW/FSW are applied.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   word [A2]

        fnstsw  word  [A1]

        fninit                          ; leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i16
2171
2172
;;
; Store a 80-bit floating point value (register) as a 16-bit signed integer
; (memory) with truncation.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 16-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment image.

        fninit
        fld     tword [A3]              ; value goes in before the guest FCW/FSW are applied.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  word [A2]

        fnstsw  word  [A1]

        fninit                          ; leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i16
2197
2198
;;
; FPU instruction working on one 80-bit and one 16-bit signed integer value.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch space for the FPU environment image.

        fninit
        fld     tword [A2]              ; value goes in before the guest FCW/FSW are applied.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]

        ; Status word first, then clear the exception flags before storing the value.
        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16 fiadd
IEMIMPL_FPU_R80_BY_I16 fimul
IEMIMPL_FPU_R80_BY_I16 fisub
IEMIMPL_FPU_R80_BY_I16 fisubr
IEMIMPL_FPU_R80_BY_I16 fidiv
IEMIMPL_FPU_R80_BY_I16 fidivr
2235
2236
2237	;;
2238	; FPU instruction working on one 80-bit and one 16-bit signed integer value,
2239	; only returning FSW.
2240	;
2241	; @param 1 The instruction
2242	;
2243	; @param A0 FPU context (fxsave).
2244	; @param A1 Where to store the output FSW.
2245	; @param A2 Pointer to the 80-bit value.
2246	; @param A3 Pointer to the 16-bit value.
2247	;
2248	%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
2249	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
2250	PROLOGUE_4_ARGS
2251	sub xSP, 20h ; reserve 20h bytes of stack; released again before the epilogue
2252
2253	fninit ; reset the host x87 unit so the register stack is empty
2254	fld tword [A2] ; ST0 = the 80-bit operand
2255	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies the guest FCW/FSW from the fxsave image at A0 (verify)
2256	%1 word [A3] ; operate on ST0 and the 16-bit integer at *A3; only the status word matters
2257
2258	fnstsw word [A1] ; hand the resulting status word back to the caller
2259
2260	fninit ; leave the host x87 unit in its initial state
2261	add xSP, 20h
2262	EPILOGUE_4_ARGS
2263	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
2264	%endmacro
2265
2266	IEMIMPL_FPU_R80_BY_I16_FSW ficom
2267
2268
2269
2270	;
2271	;---------------------- 32-bit signed integer operations ----------------------
2272	;
2273
2274
2275	;;
2276	; Converts a 32-bit signed integer value to a 80-bit one (fpu register).
2277	;
2278	; @param A0 FPU context (fxsave).
2279	; @param A1 Pointer to a IEMFPURESULT for the output.
2280	; @param A2 Pointer to the 32-bit signed integer value to convert.
2281	;
2282	BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
2283	PROLOGUE_3_ARGS
2284	sub xSP, 20h ; reserve 20h bytes of stack; released again before the epilogue
2285
2286	fninit ; reset the host x87 unit so the register stack is empty
2287	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies the guest FCW/FSW from the fxsave image at A0 (verify)
2288	fild dword [A2] ; ST0 = (80-bit) 32-bit integer at *A2
2289
2290	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status word before it is cleared below
2291	fnclex ; presumably so the fstp below cannot trip over a pending exception on the host
2292	fstp tword [A1 + IEMFPURESULT.r80Result] ; store the 80-bit result and pop
2293
2294	fninit ; leave the host x87 unit in its initial state
2295	add xSP, 20h
2296	EPILOGUE_3_ARGS
2297	ENDPROC iemAImpl_fild_r80_from_i32
2298
2299
2300	;;
2301	; Store a 80-bit floating point value (register) as a 32-bit signed integer (memory).
2302	;
2303	; @param A0 FPU context (fxsave).
2304	; @param A1 Where to return the output FSW.
2305	; @param A2 Where to store the 32-bit signed integer value.
2306	; @param A3 Pointer to the 80-bit value.
2307	;
2308	BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
2309	PROLOGUE_4_ARGS
2310	sub xSP, 20h ; reserve 20h bytes of stack; released again before the epilogue
2311
2312	fninit ; reset the host x87 unit so the register stack is empty
2313	fld tword [A3] ; ST0 = the 80-bit input value
2314	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies the guest FCW/FSW from the fxsave image at A0 (verify)
2315	fistp dword [A2] ; convert ST0 to a 32-bit integer per the active rounding control, store it and pop
2316
2317	fnstsw word [A1] ; hand the resulting status word back to the caller
2318
2319	fninit ; leave the host x87 unit in its initial state
2320	add xSP, 20h
2321	EPILOGUE_4_ARGS
2322	ENDPROC iemAImpl_fist_r80_to_i32
2323
2324
2325	;;
2326	; Store a 80-bit floating point value (register) as a 32-bit signed integer
2327	; (memory) with truncation.
2328	;
2329	; @param A0 FPU context (fxsave).
2330	; @param A1 Where to return the output FSW.
2331	; @param A2 Where to store the 32-bit signed integer value.
2332	; @param A3 Pointer to the 80-bit value.
2333	;
2334	BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
2335	PROLOGUE_4_ARGS
2336	sub xSP, 20h ; reserve 20h bytes of stack; released again before the epilogue
2337
2338	fninit ; reset the host x87 unit so the register stack is empty
2339	fld tword [A3] ; ST0 = the 80-bit input value
2340	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies the guest FCW/FSW from the fxsave image at A0 (verify)
2341	fisttp dword [A2] ; truncating (chop) conversion of ST0 to a 32-bit integer, store it and pop
2342
2343	fnstsw word [A1] ; hand the resulting status word back to the caller
2344
2345	fninit ; leave the host x87 unit in its initial state
2346	add xSP, 20h
2347	EPILOGUE_4_ARGS
2348	ENDPROC iemAImpl_fistt_r80_to_i32
2349
2350
2351	;;
2352	; FPU instruction working on one 80-bit and one 32-bit signed integer value.
2353	;
2354	; @param 1 The instruction
2355	;
2356	; @param A0 FPU context (fxsave).
2357	; @param A1 Pointer to a IEMFPURESULT for the output.
2358	; @param A2 Pointer to the 80-bit value.
2359	; @param A3 Pointer to the 32-bit value.
2360	;
2361	%macro IEMIMPL_FPU_R80_BY_I32 1
2362	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
2363	PROLOGUE_4_ARGS
2364	sub xSP, 20h ; reserve 20h bytes of stack; released again before the epilogue
2365
2366	fninit ; reset the host x87 unit so the register stack is empty
2367	fld tword [A2] ; ST0 = the 80-bit operand
2368	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies the guest FCW/FSW from the fxsave image at A0 (verify)
2369	%1 dword [A3] ; ST0 = ST0 <op> 32-bit integer at *A3
2370
2371	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status word before it is cleared below
2372	fnclex ; presumably so the fstp below cannot trip over a pending exception on the host
2373	fstp tword [A1 + IEMFPURESULT.r80Result] ; store the 80-bit result and pop
2374
2375	fninit ; leave the host x87 unit in its initial state
2376	add xSP, 20h
2377	EPILOGUE_4_ARGS
2378	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
2379	%endmacro
2380
2381	IEMIMPL_FPU_R80_BY_I32 fiadd
2382	IEMIMPL_FPU_R80_BY_I32 fimul
2383	IEMIMPL_FPU_R80_BY_I32 fisub
2384	IEMIMPL_FPU_R80_BY_I32 fisubr
2385	IEMIMPL_FPU_R80_BY_I32 fidiv
2386	IEMIMPL_FPU_R80_BY_I32 fidivr
2387
2388
2389	;;
2390	; FPU instruction working on one 80-bit and one 32-bit signed integer value,
2391	; only returning FSW.
2392	;
2393	; @param 1 The instruction
2394	;
2395	; @param A0 FPU context (fxsave).
2396	; @param A1 Where to store the output FSW.
2397	; @param A2 Pointer to the 80-bit value.
2398	; @param A3 Pointer to the 32-bit value.
2399	;
2400	%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
2401	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
2402	PROLOGUE_4_ARGS
2403	sub xSP, 20h ; reserve 20h bytes of stack; released again before the epilogue
2404
2405	fninit ; reset the host x87 unit so the register stack is empty
2406	fld tword [A2] ; ST0 = the 80-bit operand
2407	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies the guest FCW/FSW from the fxsave image at A0 (verify)
2408	%1 dword [A3] ; operate on ST0 and the 32-bit integer at *A3; only the status word matters
2409
2410	fnstsw word [A1] ; hand the resulting status word back to the caller
2411
2412	fninit ; leave the host x87 unit in its initial state
2413	add xSP, 20h
2414	EPILOGUE_4_ARGS
2415	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
2416	%endmacro
2417
2418	IEMIMPL_FPU_R80_BY_I32_FSW ficom
2419
2420
2421
2422	;
2423	;---------------------- 64-bit signed integer operations ----------------------
2424	;
2425
2426
2427	;;
2428	; Converts a 64-bit signed integer value to a 80-bit one (fpu register).
2429	;
2430	; @param A0 FPU context (fxsave).
2431	; @param A1 Pointer to a IEMFPURESULT for the output.
2432	; @param A2 Pointer to the 64-bit signed integer value to convert.
2433	;
2434	BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
2435	PROLOGUE_3_ARGS
2436	sub xSP, 20h ; reserve 20h bytes of stack; released again before the epilogue
2437
2438	fninit ; reset the host x87 unit so the register stack is empty
2439	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies the guest FCW/FSW from the fxsave image at A0 (verify)
2440	fild qword [A2] ; ST0 = (80-bit) 64-bit integer at *A2
2441
2442	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status word before it is cleared below
2443	fnclex ; presumably so the fstp below cannot trip over a pending exception on the host
2444	fstp tword [A1 + IEMFPURESULT.r80Result] ; store the 80-bit result and pop
2445
2446	fninit ; leave the host x87 unit in its initial state
2447	add xSP, 20h
2448	EPILOGUE_3_ARGS
2449	ENDPROC iemAImpl_fild_r80_from_i64
2450
2451
2452	;;
2453	; Store a 80-bit floating point value (register) as a 64-bit signed integer (memory).
2454	;
2455	; @param A0 FPU context (fxsave).
2456	; @param A1 Where to return the output FSW.
2457	; @param A2 Where to store the 64-bit signed integer value.
2458	; @param A3 Pointer to the 80-bit value.
2459	;
2460	BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
2461	PROLOGUE_4_ARGS
2462	sub xSP, 20h ; reserve 20h bytes of stack; released again before the epilogue
2463
2464	fninit ; reset the host x87 unit so the register stack is empty
2465	fld tword [A3] ; ST0 = the 80-bit input value
2466	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies the guest FCW/FSW from the fxsave image at A0 (verify)
2467	fistp qword [A2] ; convert ST0 to a 64-bit integer per the active rounding control, store it and pop
2468
2469	fnstsw word [A1] ; hand the resulting status word back to the caller
2470
2471	fninit ; leave the host x87 unit in its initial state
2472	add xSP, 20h
2473	EPILOGUE_4_ARGS
2474	ENDPROC iemAImpl_fist_r80_to_i64
2475
2476
2477	;;
2478	; Store a 80-bit floating point value (register) as a 64-bit signed integer
2479	; (memory) with truncation.
2480	;
2481	; @param A0 FPU context (fxsave).
2482	; @param A1 Where to return the output FSW.
2483	; @param A2 Where to store the 64-bit signed integer value.
2484	; @param A3 Pointer to the 80-bit value.
2485	;
2486	BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
2487	PROLOGUE_4_ARGS
2488	sub xSP, 20h ; reserve 20h bytes of stack; released again before the epilogue
2489
2490	fninit ; reset the host x87 unit so the register stack is empty
2491	fld tword [A3] ; ST0 = the 80-bit input value
2492	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies the guest FCW/FSW from the fxsave image at A0 (verify)
2493	fisttp qword [A2] ; truncating (chop) conversion of ST0 to a 64-bit integer, store it and pop
2494
2495	fnstsw word [A1] ; hand the resulting status word back to the caller
2496
2497	fninit ; leave the host x87 unit in its initial state
2498	add xSP, 20h
2499	EPILOGUE_4_ARGS
2500	ENDPROC iemAImpl_fistt_r80_to_i64
2501
2502
2503
2504	;
2505	;---------------------- 32-bit floating point operations ----------------------
2506	;
2507
2508	;;
2509	; Converts a 32-bit floating point value to a 80-bit one (fpu register).
2510	;
2511	; @param A0 FPU context (fxsave).
2512	; @param A1 Pointer to a IEMFPURESULT for the output.
2513	; @param A2 Pointer to the 32-bit floating point value to convert.
2514	;
2515	BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
2516	PROLOGUE_3_ARGS
2517	sub xSP, 20h ; reserve 20h bytes of stack; released again before the epilogue
2518
2519	fninit ; reset the host x87 unit so the register stack is empty
2520	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies the guest FCW/FSW from the fxsave image at A0 (verify)
2521	fld dword [A2] ; ST0 = the single-precision value at *A2, widened to 80 bits
2522
2523	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status word before it is cleared below
2524	fnclex ; presumably so the fstp below cannot trip over a pending exception on the host
2525	fstp tword [A1 + IEMFPURESULT.r80Result] ; store the 80-bit result and pop
2526
2527	fninit ; leave the host x87 unit in its initial state
2528	add xSP, 20h
2529	EPILOGUE_3_ARGS
2530	ENDPROC iemAImpl_fld_r80_from_r32
2531
2532
2533	;;
2534	; Store a 80-bit floating point value (register) as a 32-bit one (memory).
2535	;
2536	; @param A0 FPU context (fxsave).
2537	; @param A1 Where to return the output FSW.
2538	; @param A2 Where to store the 32-bit value.
2539	; @param A3 Pointer to the 80-bit value.
2540	;
2541	BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
2542	PROLOGUE_4_ARGS
2543	sub xSP, 20h ; reserve 20h bytes of stack; released again before the epilogue
2544
2545	fninit ; reset the host x87 unit so the register stack is empty
2546	fld tword [A3] ; ST0 = the 80-bit input value
2547	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies the guest FCW/FSW from the fxsave image at A0 (verify)
2548	fst dword [A2] ; store ST0 as single precision to *A2 (non-popping; the fninit below discards ST0)
2549
2550	fnstsw word [A1] ; hand the resulting status word back to the caller
2551
2552	fninit ; leave the host x87 unit in its initial state
2553	add xSP, 20h
2554	EPILOGUE_4_ARGS
2555	ENDPROC iemAImpl_fst_r80_to_r32
2556
2557
2558	;;
2559	; FPU instruction working on one 80-bit and one 32-bit floating point value.
2560	;
2561	; @param 1 The instruction
2562	;
2563	; @param A0 FPU context (fxsave).
2564	; @param A1 Pointer to a IEMFPURESULT for the output.
2565	; @param A2 Pointer to the 80-bit value.
2566	; @param A3 Pointer to the 32-bit value.
2567	;
2568	%macro IEMIMPL_FPU_R80_BY_R32 1
2569	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
2570	PROLOGUE_4_ARGS
2571	sub xSP, 20h ; reserve 20h bytes of stack; released again before the epilogue
2572
2573	fninit ; reset the host x87 unit so the register stack is empty
2574	fld tword [A2] ; ST0 = the 80-bit operand
2575	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies the guest FCW/FSW from the fxsave image at A0 (verify)
2576	%1 dword [A3] ; ST0 = ST0 <op> single-precision value at *A3
2577
2578	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status word before it is cleared below
2579	fnclex ; presumably so the fstp below cannot trip over a pending exception on the host
2580	fstp tword [A1 + IEMFPURESULT.r80Result] ; store the 80-bit result and pop
2581
2582	fninit ; leave the host x87 unit in its initial state
2583	add xSP, 20h
2584	EPILOGUE_4_ARGS
2585	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
2586	%endmacro
2587
2588	IEMIMPL_FPU_R80_BY_R32 fadd
2589	IEMIMPL_FPU_R80_BY_R32 fmul
2590	IEMIMPL_FPU_R80_BY_R32 fsub
2591	IEMIMPL_FPU_R80_BY_R32 fsubr
2592	IEMIMPL_FPU_R80_BY_R32 fdiv
2593	IEMIMPL_FPU_R80_BY_R32 fdivr
2594
2595
2596	;;
2597	; FPU instruction working on one 80-bit and one 32-bit floating point value,
2598	; only returning FSW.
2599	;
2600	; @param 1 The instruction
2601	;
2602	; @param A0 FPU context (fxsave).
2603	; @param A1 Where to store the output FSW.
2604	; @param A2 Pointer to the 80-bit value.
2605	; @param A3 Pointer to the 32-bit value.
2606	;
2607	%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
2608	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
2609	PROLOGUE_4_ARGS
2610	sub xSP, 20h ; reserve 20h bytes of stack; released again before the epilogue
2611
2612	fninit ; reset the host x87 unit so the register stack is empty
2613	fld tword [A2] ; ST0 = the 80-bit operand
2614	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies the guest FCW/FSW from the fxsave image at A0 (verify)
2615	%1 dword [A3] ; operate on ST0 and the single-precision value at *A3; only the status word matters
2616
2617	fnstsw word [A1] ; hand the resulting status word back to the caller
2618
2619	fninit ; leave the host x87 unit in its initial state
2620	add xSP, 20h
2621	EPILOGUE_4_ARGS
2622	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
2623	%endmacro
2624
2625	IEMIMPL_FPU_R80_BY_R32_FSW fcom
2626
2627
2628
2629	;
2630	;---------------------- 64-bit floating point operations ----------------------
2631	;
2632
2633	;;
2634	; Converts a 64-bit floating point value to a 80-bit one (fpu register).
2635	;
2636	; @param A0 FPU context (fxsave).
2637	; @param A1 Pointer to a IEMFPURESULT for the output.
2638	; @param A2 Pointer to the 64-bit floating point value to convert.
2639	;
2640	BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
2641	PROLOGUE_3_ARGS
2642	sub xSP, 20h ; reserve 20h bytes of stack; released again before the epilogue
2643	fninit ; reset the host x87 unit first, like the other load helpers, so the fld below pushes onto an empty stack
2644	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies the guest FCW/FSW from the fxsave image at A0 (verify)
2645	fld qword [A2] ; ST0 = the double-precision value at *A2, widened to 80 bits
2646
2647	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status word before it is cleared below
2648	fnclex ; presumably so the fstp below cannot trip over a pending exception on the host
2649	fstp tword [A1 + IEMFPURESULT.r80Result] ; store the 80-bit result and pop
2650
2651	fninit ; leave the host x87 unit in its initial state
2652	add xSP, 20h
2653	EPILOGUE_3_ARGS
2654	ENDPROC iemAImpl_fld_r80_from_r64
2655
2656
2657	;;
2658	; Store a 80-bit floating point value (register) as a 64-bit one (memory).
2659	;
2660	; @param A0 FPU context (fxsave).
2661	; @param A1 Where to return the output FSW.
2662	; @param A2 Where to store the 64-bit value.
2663	; @param A3 Pointer to the 80-bit value.
2664	;
2665	BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
2666	PROLOGUE_4_ARGS
2667	sub xSP, 20h ; reserve 20h bytes of stack; released again before the epilogue
2668
2669	fninit ; reset the host x87 unit so the register stack is empty
2670	fld tword [A3] ; ST0 = the 80-bit input value
2671	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies the guest FCW/FSW from the fxsave image at A0 (verify)
2672	fst qword [A2] ; store ST0 as double precision to *A2 (non-popping; the fninit below discards ST0)
2673
2674	fnstsw word [A1] ; hand the resulting status word back to the caller
2675
2676	fninit ; leave the host x87 unit in its initial state
2677	add xSP, 20h
2678	EPILOGUE_4_ARGS
2679	ENDPROC iemAImpl_fst_r80_to_r64
2680
2681
2682	;;
2683	; FPU instruction working on one 80-bit and one 64-bit floating point value.
2684	;
2685	; @param 1 The instruction
2686	;
2687	; @param A0 FPU context (fxsave).
2688	; @param A1 Pointer to a IEMFPURESULT for the output.
2689	; @param A2 Pointer to the 80-bit value.
2690	; @param A3 Pointer to the 64-bit value.
2691	;
2692	%macro IEMIMPL_FPU_R80_BY_R64 1
2693	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
2694	PROLOGUE_4_ARGS
2695	sub xSP, 20h ; reserve 20h bytes of stack; released again before the epilogue
2696
2697	fninit ; reset the host x87 unit so the register stack is empty
2698	fld tword [A2] ; ST0 = the 80-bit operand
2699	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies the guest FCW/FSW from the fxsave image at A0 (verify)
2700	%1 qword [A3] ; ST0 = ST0 <op> double-precision value at *A3
2701
2702	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status word before it is cleared below
2703	fnclex ; presumably so the fstp below cannot trip over a pending exception on the host
2704	fstp tword [A1 + IEMFPURESULT.r80Result] ; store the 80-bit result and pop
2705
2706	fninit ; leave the host x87 unit in its initial state
2707	add xSP, 20h
2708	EPILOGUE_4_ARGS
2709	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
2710	%endmacro
2711
2712	IEMIMPL_FPU_R80_BY_R64 fadd
2713	IEMIMPL_FPU_R80_BY_R64 fmul
2714	IEMIMPL_FPU_R80_BY_R64 fsub
2715	IEMIMPL_FPU_R80_BY_R64 fsubr
2716	IEMIMPL_FPU_R80_BY_R64 fdiv
2717	IEMIMPL_FPU_R80_BY_R64 fdivr
2718
2719	;;
2720	; FPU instruction working on one 80-bit and one 64-bit floating point value,
2721	; only returning FSW.
2722	;
2723	; @param 1 The instruction
2724	;
2725	; @param A0 FPU context (fxsave).
2726	; @param A1 Where to store the output FSW.
2727	; @param A2 Pointer to the 80-bit value.
2728	; @param A3 Pointer to the 64-bit value.
2729	;
2730	%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
2731	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
2732	PROLOGUE_4_ARGS
2733	sub xSP, 20h ; reserve 20h bytes of stack; released again before the epilogue
2734
2735	fninit ; reset the host x87 unit so the register stack is empty
2736	fld tword [A2] ; ST0 = the 80-bit operand
2737	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies the guest FCW/FSW from the fxsave image at A0 (verify)
2738	%1 qword [A3] ; operate on ST0 and the double-precision value at *A3; only the status word matters
2739
2740	fnstsw word [A1] ; hand the resulting status word back to the caller
2741
2742	fninit ; leave the host x87 unit in its initial state
2743	add xSP, 20h
2744	EPILOGUE_4_ARGS
2745	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
2746	%endmacro
2747
2748	IEMIMPL_FPU_R80_BY_R64_FSW fcom
2749
2750
2751
2752	;
2753	;---------------------- 80-bit floating point operations ----------------------
2754	;
2755
2756	;;
2757	; Loads a 80-bit floating point register value from memory.
2758	;
2759	; @param A0 FPU context (fxsave).
2760	; @param A1 Pointer to a IEMFPURESULT for the output.
2761	; @param A2 Pointer to the 80-bit floating point value to load.
2762	;
2763	BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
2764	PROLOGUE_3_ARGS
2765	sub xSP, 20h ; reserve 20h bytes of stack; released again before the epilogue
2766
2767	fninit ; reset the host x87 unit so the register stack is empty
2768	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies the guest FCW/FSW from the fxsave image at A0 (verify)
2769	fld tword [A2] ; ST0 = the 80-bit value at *A2
2770
2771	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status word before it is cleared below
2772	fnclex ; presumably so the fstp below cannot trip over a pending exception on the host
2773	fstp tword [A1 + IEMFPURESULT.r80Result] ; store the 80-bit result and pop
2774
2775	fninit ; leave the host x87 unit in its initial state
2776	add xSP, 20h
2777	EPILOGUE_3_ARGS
2778	ENDPROC iemAImpl_fld_r80_from_r80
2779
2780
2781	;;
2782	; Store a 80-bit floating point register to memory
2783	;
2784	; @param A0 FPU context (fxsave).
2785	; @param A1 Where to return the output FSW.
2786	; @param A2 Where to store the 80-bit value.
2787	; @param A3 Pointer to the 80-bit register value.
2788	;
2789	BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
2790	PROLOGUE_4_ARGS
2791	sub xSP, 20h ; reserve 20h bytes of stack; released again before the epilogue
2792
2793	fninit ; reset the host x87 unit so the register stack is empty
2794	fld tword [A3] ; ST0 = the 80-bit register value
2795	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies the guest FCW/FSW from the fxsave image at A0 (verify)
2796	fstp tword [A2] ; store ST0 as an 80-bit value to *A2 and pop
2797
2798	fnstsw word [A1] ; hand the resulting status word back to the caller
2799
2800	fninit ; leave the host x87 unit in its initial state
2801	add xSP, 20h
2802	EPILOGUE_4_ARGS
2803	ENDPROC iemAImpl_fst_r80_to_r80
2804
2805
2806	;;
2807	; Loads an 80-bit floating point register value in BCD format from memory.
2808	;
2809	; @param A0 FPU context (fxsave).
2810	; @param A1 Pointer to a IEMFPURESULT for the output.
2811	; @param A2 Pointer to the 80-bit BCD value to load.
2812	;
2813	BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
2814	PROLOGUE_3_ARGS
2815	sub xSP, 20h ; reserve 20h bytes of stack; released again before the epilogue
2816
2817	fninit ; reset the host x87 unit so the register stack is empty
2818	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies the guest FCW/FSW from the fxsave image at A0 (verify)
2819	fbld tword [A2] ; ST0 = the packed-BCD value at *A2 converted to 80-bit floating point
2820
2821	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status word before it is cleared below
2822	fnclex ; presumably so the fstp below cannot trip over a pending exception on the host
2823	fstp tword [A1 + IEMFPURESULT.r80Result] ; store the 80-bit result and pop
2824
2825	fninit ; leave the host x87 unit in its initial state
2826	add xSP, 20h
2827	EPILOGUE_3_ARGS
2828	ENDPROC iemAImpl_fld_r80_from_d80
2829
2830
2831	;;
2832	; Store a 80-bit floating point register to memory as BCD
2833	;
2834	; @param A0 FPU context (fxsave).
2835	; @param A1 Where to return the output FSW.
2836	; @param A2 Where to store the 80-bit BCD value.
2837	; @param A3 Pointer to the 80-bit register value.
2838	;
2839	BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
2840	PROLOGUE_4_ARGS
2841	sub xSP, 20h ; reserve 20h bytes of stack; released again before the epilogue
2842
2843	fninit ; reset the host x87 unit so the register stack is empty
2844	fld tword [A3] ; ST0 = the 80-bit register value
2845	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies the guest FCW/FSW from the fxsave image at A0 (verify)
2846	fbstp tword [A2] ; convert ST0 to packed BCD, store it to *A2 and pop
2847
2848	fnstsw word [A1] ; hand the resulting status word back to the caller
2849
2850	fninit ; leave the host x87 unit in its initial state
2851	add xSP, 20h
2852	EPILOGUE_4_ARGS
2853	ENDPROC iemAImpl_fst_r80_to_d80
2854
2855
2856	;;
2857	; FPU instruction working on two 80-bit floating point values.
2858	;
2859	; @param 1 The instruction
2860	; @param 2 The instruction operands ({st0, st1}, or {} when they are implicit).
2861	; @param A0 FPU context (fxsave).
2862	; @param A1 Pointer to a IEMFPURESULT for the output.
2863	; @param A2 Pointer to the first 80-bit value (ST0)
2864	; @param A3 Pointer to the second 80-bit value (STn).
2865	;
2866	%macro IEMIMPL_FPU_R80_BY_R80 2
2867	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
2868	PROLOGUE_4_ARGS
2869	sub xSP, 20h ; reserve 20h bytes of stack; released again before the epilogue
2870
2871	fninit ; reset the host x87 unit so the register stack is empty
2872	fld tword [A3] ; pushed first, so it ends up in ST1
2873	fld tword [A2] ; pushed last, so it ends up in ST0
2874	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies the guest FCW/FSW from the fxsave image at A0 (verify)
2875	%1 %2 ; the operation itself; result is left in ST0
2876
2877	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status word before it is cleared below
2878	fnclex ; presumably so the fstp below cannot trip over a pending exception on the host
2879	fstp tword [A1 + IEMFPURESULT.r80Result] ; store ST0 as the 80-bit result and pop
2880
2881	fninit ; leave the host x87 unit in its initial state
2882	add xSP, 20h
2883	EPILOGUE_4_ARGS
2884	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
2885	%endmacro
2886
2887	IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1}
2888	IEMIMPL_FPU_R80_BY_R80 fmul, {st0, st1}
2889	IEMIMPL_FPU_R80_BY_R80 fsub, {st0, st1}
2890	IEMIMPL_FPU_R80_BY_R80 fsubr, {st0, st1}
2891	IEMIMPL_FPU_R80_BY_R80 fdiv, {st0, st1}
2892	IEMIMPL_FPU_R80_BY_R80 fdivr, {st0, st1}
2893	IEMIMPL_FPU_R80_BY_R80 fprem, {}
2894	IEMIMPL_FPU_R80_BY_R80 fprem1, {}
2895	IEMIMPL_FPU_R80_BY_R80 fscale, {}
2896
2897
2898	;;
2899	; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
2900	; storing the result in ST1 and popping the stack.
2901	;
2902	; @param 1 The instruction
2903	;
2904	; @param A0 FPU context (fxsave).
2905	; @param A1 Pointer to a IEMFPURESULT for the output.
2906	; @param A2 Pointer to the first 80-bit value (ST1).
2907	; @param A3 Pointer to the second 80-bit value (ST0).
2908	;
2909	%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
2910	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
2911	PROLOGUE_4_ARGS
2912	sub xSP, 20h ; reserve 20h bytes of stack; released again before the epilogue
2913
2914	fninit ; reset the host x87 unit so the register stack is empty
2915	fld tword [A2] ; pushed first, so it ends up in ST1
2916	fld tword [A3] ; pushed last, so it ends up in ST0
2917	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies the guest FCW/FSW from the fxsave image at A0 (verify)
2918	%1 ; result goes to ST1 and the stack is popped, so it is in ST0 afterwards
2919
2920	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status word before it is cleared below
2921	fnclex ; presumably so the fstp below cannot trip over a pending exception on the host
2922	fstp tword [A1 + IEMFPURESULT.r80Result] ; store the 80-bit result and pop
2923
2924	fninit ; leave the host x87 unit in its initial state
2925	add xSP, 20h
2926	EPILOGUE_4_ARGS
2927	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
2928	%endmacro
2929
2930	IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
2931	IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
2932	IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
2933
2934
2935	;;
2936	; FPU instruction working on two 80-bit floating point values, only
2937	; returning FSW.
2938	;
2939	; @param 1 The instruction
2940	;
2941	; @param A0 FPU context (fxsave).
2942	; @param A1 Pointer to a uint16_t for the resulting FSW.
2943	; @param A2 Pointer to the first 80-bit value.
2944	; @param A3 Pointer to the second 80-bit value.
2945	;
2946	%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
2947	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
2948	PROLOGUE_4_ARGS
2949	sub xSP, 20h ; reserve 20h bytes of stack; released again before the epilogue
2950
2951	fninit ; reset the host x87 unit so the register stack is empty
2952	fld tword [A3] ; pushed first, so it ends up in ST1
2953	fld tword [A2] ; pushed last, so it ends up in ST0
2954	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies the guest FCW/FSW from the fxsave image at A0 (verify)
2955	%1 st0, st1 ; compare ST0 with ST1; the outcome is reported through the status word only
2956
2957	fnstsw word [A1] ; hand the resulting status word back to the caller
2958
2959	fninit ; leave the host x87 unit in its initial state
2960	add xSP, 20h
2961	EPILOGUE_4_ARGS
2962	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
2963	%endmacro
2964
2965	IEMIMPL_FPU_R80_BY_R80_FSW fcom
2966	IEMIMPL_FPU_R80_BY_R80_FSW fucom
2967
2968
2969	;;
2970	; FPU instruction working on two 80-bit floating point values,
2971	; returning FSW and EFLAGS (eax).
2972	;
2973	; @param 1 The instruction
2974	;
2975	; @returns EFLAGS in EAX.
2976	; @param A0 FPU context (fxsave).
2977	; @param A1 Pointer to a uint16_t for the resulting FSW.
2978	; @param A2 Pointer to the first 80-bit value.
2979	; @param A3 Pointer to the second 80-bit value.
2980	;
2981	%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
2982	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
2983	PROLOGUE_4_ARGS
2984	sub xSP, 20h ; reserve 20h bytes of stack; released again before the epilogue
2985
2986	fninit ; reset the host x87 unit so the register stack is empty
2987	fld tword [A3] ; pushed first, so it ends up in ST1
2988	fld tword [A2] ; pushed last, so it ends up in ST0
2989	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies the guest FCW/FSW from the fxsave image at A0 (verify)
2990	%1 st1 ; compare ST0 with ST1, setting EFLAGS
2991
2992	fnstsw word [A1] ; hand the resulting status word back to the caller
2993	pushf ; copy the EFLAGS produced by the compare ...
2994	pop xAX ; ... into xAX, which is the function's return value
2995
2996	fninit ; leave the host x87 unit in its initial state
2997	add xSP, 20h
2998	EPILOGUE_4_ARGS
2999	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3000	%endmacro
3001
3002	IEMIMPL_FPU_R80_BY_R80_EFL fcomi
3003	IEMIMPL_FPU_R80_BY_R80_EFL fucomi
3004
3005
3006	;;
3007	; FPU instruction working on one 80-bit floating point value.
3008	;
3009	; @param 1 The instruction
3010	;
3011	; @param A0 FPU context (fxsave).
3012	; @param A1 Pointer to a IEMFPURESULT for the output.
3013	; @param A2 Pointer to the 80-bit value.
3014	;
3015	%macro IEMIMPL_FPU_R80 1
3016	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
3017	PROLOGUE_3_ARGS
3018	sub xSP, 20h ; reserve 20h bytes of stack; released again before the epilogue
3019
3020	fninit ; reset the host x87 unit so the register stack is empty
3021	fld tword [A2] ; ST0 = the 80-bit operand
3022	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies the guest FCW/FSW from the fxsave image at A0 (verify)
3023	%1 ; unary operation on ST0, result replaces ST0
3024
3025	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status word before it is cleared below
3026	fnclex ; presumably so the fstp below cannot trip over a pending exception on the host
3027	fstp tword [A1 + IEMFPURESULT.r80Result] ; store the 80-bit result and pop
3028
3029	fninit ; leave the host x87 unit in its initial state
3030	add xSP, 20h
3031	EPILOGUE_3_ARGS
3032	ENDPROC iemAImpl_ %+ %1 %+ _r80
3033	%endmacro
3034
3035	IEMIMPL_FPU_R80 fchs
3036	IEMIMPL_FPU_R80 fabs
3037	IEMIMPL_FPU_R80 f2xm1
3038	IEMIMPL_FPU_R80 fsqrt
3039	IEMIMPL_FPU_R80 frndint
3040	IEMIMPL_FPU_R80 fsin
3041	IEMIMPL_FPU_R80 fcos
3042
3043
3044	;;
3045	; FPU instruction working on one 80-bit floating point value, only
3046	; returning FSW.
3047	;
3048	; @param 1 The instruction
3049	;
3050	; @param A0 FPU context (fxsave).
3051	; @param A1 Pointer to a uint16_t for the resulting FSW.
3052	; @param A2 Pointer to the 80-bit value.
3053	;
3054	%macro IEMIMPL_FPU_R80_FSW 1
3055	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
3056	PROLOGUE_3_ARGS
3057	sub xSP, 20h ; reserve 20h bytes of stack; released again before the epilogue
3058
3059	fninit ; reset the host x87 unit so the register stack is empty
3060	fld tword [A2] ; ST0 = the 80-bit operand
3061	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies the guest FCW/FSW from the fxsave image at A0 (verify)
3062	%1 ; examine ST0; the outcome is reported through the status word only
3063
3064	fnstsw word [A1] ; hand the resulting status word back to the caller
3065
3066	fninit ; leave the host x87 unit in its initial state
3067	add xSP, 20h
3068	EPILOGUE_3_ARGS
3069	ENDPROC iemAImpl_ %+ %1 %+ _r80
3070	%endmacro
3071
3072	IEMIMPL_FPU_R80_FSW ftst
3073	IEMIMPL_FPU_R80_FSW fxam
3074
3075
3076
3077	;;
3078	; FPU instruction loading a 80-bit floating point constant.
3079	;
3080	; @param 1 The instruction
3081	;
3082	; @param A0 FPU context (fxsave).
3083	; @param A1 Pointer to a IEMFPURESULT for the output.
3084	;
3085	%macro IEMIMPL_FPU_R80_CONST 1
3086	BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
3087	PROLOGUE_2_ARGS
3088	sub xSP, 20h ; reserve 20h bytes of stack; released again before the epilogue
3089
3090	fninit ; reset the host x87 unit so the register stack is empty
3091	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies the guest FCW/FSW from the fxsave image at A0 (verify)
3092	%1 ; push the constant onto the stack (ST0)
3093
3094	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status word before it is cleared below
3095	fnclex ; presumably so the fstp below cannot trip over a pending exception on the host
3096	fstp tword [A1 + IEMFPURESULT.r80Result] ; store the 80-bit constant and pop
3097
3098	fninit ; leave the host x87 unit in its initial state
3099	add xSP, 20h
3100	EPILOGUE_2_ARGS
3101	ENDPROC iemAImpl_ %+ %1 ; same symbol as on the BEGINPROC_FASTCALL line (no dangling %+)
3102	%endmacro
3103
3104	IEMIMPL_FPU_R80_CONST fld1
3105	IEMIMPL_FPU_R80_CONST fldl2t
3106	IEMIMPL_FPU_R80_CONST fldl2e
3107	IEMIMPL_FPU_R80_CONST fldpi
3108	IEMIMPL_FPU_R80_CONST fldlg2
3109	IEMIMPL_FPU_R80_CONST fldln2
3110	IEMIMPL_FPU_R80_CONST fldz
3111
3112
3113	;;
3114	; FPU instruction working on one 80-bit floating point value, outputting two.
3115	;
3116	; @param 1 The instruction
3117	;
3118	; @param A0 FPU context (fxsave).
3119	; @param A1 Pointer to a IEMFPURESULTTWO for the output.
3120	; @param A2 Pointer to the 80-bit value.
3121	;
3122	%macro IEMIMPL_FPU_R80_R80 1
3123	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
3124	PROLOGUE_3_ARGS
3125	sub xSP, 20h ; reserve 20h bytes of stack; released again before the epilogue
3126
3127	fninit ; reset the host x87 unit so the register stack is empty
3128	fld tword [A2] ; ST0 = the 80-bit operand
3129	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; macro defined elsewhere; presumably applies the guest FCW/FSW from the fxsave image at A0 (verify)
3130	%1 ; replaces ST0 and pushes a second value, leaving results in ST1 and ST0
3131
3132	fnstsw word [A1 + IEMFPURESULTTWO.FSW] ; capture the status word before it is cleared below
3133	fnclex ; presumably so the fstp below cannot trip over a pending exception on the host
3134	fstp tword [A1 + IEMFPURESULTTWO.r80Result2] ; top of stack (pushed last) is the second result
3135	fnclex ; clear again in case the store above flagged something
3136	fstp tword [A1 + IEMFPURESULTTWO.r80Result1] ; the value below it is the first result
3137
3138	fninit ; leave the host x87 unit in its initial state
3139	add xSP, 20h
3140	EPILOGUE_3_ARGS
3141	ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
3142	%endmacro
3143
3144	IEMIMPL_FPU_R80_R80 fptan
3145	IEMIMPL_FPU_R80_R80 fxtract
3146	IEMIMPL_FPU_R80_R80 fsincos
3147
3148
3149
3150
3151	;---------------------- SSE and MMX Operations ----------------------
3152
3153	;; Hooks bracketing every MMX helper below; both currently expand to nothing. @todo what do we need to do for MMX?
3154	%macro IEMIMPL_MMX_PROLOGUE 0
3155	%endmacro
3156	%macro IEMIMPL_MMX_EPILOGUE 0
3157	%endmacro
3158
3159	;; Hooks bracketing every SSE helper below; both currently expand to nothing. @todo what do we need to do for SSE?
3160	%macro IEMIMPL_SSE_PROLOGUE 0
3161	%endmacro
3162	%macro IEMIMPL_SSE_EPILOGUE 0
3163	%endmacro
3164
3165
3166	;;
3167	; Media instruction working on two full sized registers.
3168	;
3169	; @param 1 The instruction
3170	;
3171	; @param A0 FPU context (fxsave).
3172	; @param A1 Pointer to the first media register size operand (input/output).
3173	; @param A2 Pointer to the second media register size operand (input).
3174	;
3175	%macro IEMIMPL_MEDIA_F2 1
3176	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12 ; MMX (64-bit) variant
3177	PROLOGUE_3_ARGS
3178	IEMIMPL_MMX_PROLOGUE
3179
3180	movq mm0, [A1] ; mm0 = destination operand (input/output)
3181	movq mm1, [A2] ; mm1 = source operand
3182	%1 mm0, mm1
3183	movq [A1], mm0 ; write the result back over the first operand
3184
3185	IEMIMPL_MMX_EPILOGUE
3186	EPILOGUE_3_ARGS
3187	ENDPROC iemAImpl_ %+ %1 %+ _u64
3188
3189	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12 ; SSE (128-bit) variant
3190	PROLOGUE_3_ARGS
3191	IEMIMPL_SSE_PROLOGUE
3192
3193	movdqu xmm0, [A1] ; unaligned loads/stores: no alignment is required of A1/A2
3194	movdqu xmm1, [A2]
3195	%1 xmm0, xmm1
3196	movdqu [A1], xmm0 ; write the result back over the first operand
3197
3198	IEMIMPL_SSE_EPILOGUE
3199	EPILOGUE_3_ARGS
3200	ENDPROC iemAImpl_ %+ %1 %+ _u128
3201	%endmacro
3202
3203	IEMIMPL_MEDIA_F2 pxor
3204	IEMIMPL_MEDIA_F2 pcmpeqb
3205	IEMIMPL_MEDIA_F2 pcmpeqw
3206	IEMIMPL_MEDIA_F2 pcmpeqd
3207
3208
3209	;;
3210	; Media instruction working on one full sized and one half sized register (lower half).
3211	;
3212	; @param 1 The instruction
3213	; @param 2 1 if MMX is included, 0 if not.
3214	;
3215	; @param A0 FPU context (fxsave).
3216	; @param A1 Pointer to the first full sized media register operand (input/output).
3217	; @param A2 Pointer to the second half sized media register operand (input).
3218	;
3219	%macro IEMIMPL_MEDIA_F1L1 2
3220	%if %2 != 0 ; the MMX variant is only emitted when parameter 2 is non-zero
3221	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
3222	PROLOGUE_3_ARGS
3223	IEMIMPL_MMX_PROLOGUE
3224
3225	movq mm0, [A1] ; mm0 = full 64-bit destination operand
3226	movd mm1, [A2] ; only the low 32 bits of the source are read
3227	%1 mm0, mm1
3228	movq [A1], mm0 ; write the result back over the first operand
3229
3230	IEMIMPL_MMX_EPILOGUE
3231	EPILOGUE_3_ARGS
3232	ENDPROC iemAImpl_ %+ %1 %+ _u64
3233	%endif
3234
3235	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
3236	PROLOGUE_3_ARGS
3237	IEMIMPL_SSE_PROLOGUE
3238
3239	movdqu xmm0, [A1] ; xmm0 = full 128-bit destination operand
3240	movq xmm1, [A2] ; only the low 64 bits of the source are read
3241	%1 xmm0, xmm1
3242	movdqu [A1], xmm0 ; write the result back over the first operand
3243
3244	IEMIMPL_SSE_EPILOGUE
3245	EPILOGUE_3_ARGS
3246	ENDPROC iemAImpl_ %+ %1 %+ _u128
3247	%endmacro
3248
3249	IEMIMPL_MEDIA_F1L1 punpcklbw, 1
3250	IEMIMPL_MEDIA_F1L1 punpcklwd, 1
3251	IEMIMPL_MEDIA_F1L1 punpckldq, 1
3252	IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
3253
3254
3255	;;
3256	; Media instruction working on one full sized and one half sized register (high half).
3257	;
3258	; @param 1 The instruction
3259	; @param 2 1 if MMX is included, 0 if not.
3260	;
3261	; @param A0 FPU context (fxsave).
3262	; @param A1 Pointer to the first full sized media register operand (input/output).
3263	; @param A2 Pointer to the second full sized media register operand, where we
3264	; will only use the upper half (input).
3265	;
3266	%macro IEMIMPL_MEDIA_F1H1 2
3267	%if %2 != 0 ; the MMX variant is only emitted when parameter 2 is non-zero
3268	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
3269	PROLOGUE_3_ARGS
3270	IEMIMPL_MMX_PROLOGUE
3271
3272	movq mm0, [A1] ; mm0 = full 64-bit destination operand
3273	movq mm1, [A2] ; the whole source must be loaded: the instruction consumes its upper half
3274	%1 mm0, mm1
3275	movq [A1], mm0 ; write the result back over the first operand
3276
3277	IEMIMPL_MMX_EPILOGUE
3278	EPILOGUE_3_ARGS
3279	ENDPROC iemAImpl_ %+ %1 %+ _u64
3280	%endif
3281
3282	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
3283	PROLOGUE_3_ARGS
3284	IEMIMPL_SSE_PROLOGUE
3285
3286	movdqu xmm0, [A1] ; xmm0 = full 128-bit destination operand
3287	movdqu xmm1, [A2] ; the whole source must be loaded: the instruction consumes its upper half
3288	%1 xmm0, xmm1
3289	movdqu [A1], xmm0 ; write the result back over the first operand
3290
3291	IEMIMPL_SSE_EPILOGUE
3292	EPILOGUE_3_ARGS
3293	ENDPROC iemAImpl_ %+ %1 %+ _u128
3294	%endmacro
3295
3296	IEMIMPL_MEDIA_F1H1 punpckhbw, 1 ; high-half unpacks need the full-width source load (F1H1, not F1L1)
3297	IEMIMPL_MEDIA_F1H1 punpckhwd, 1
3298	IEMIMPL_MEDIA_F1H1 punpckhdq, 1
3299	IEMIMPL_MEDIA_F1H1 punpckhqdq, 0
3300
3301
3302	;
3303	; Shufflers with evil 8-bit immediates.
3304	;
3305
3306	BEGINPROC_FASTCALL iemAImpl_pshufw, 16
3307	PROLOGUE_4_ARGS
3308	IEMIMPL_MMX_PROLOGUE
3309
3310	movq mm0, [A1] ; mm0 = destination operand (overwritten by the shuffle)
3311	movq mm1, [A2] ; mm1 = source operand
3312	lea T0, [A3 + A3*4] ; sizeof(pshufw+ret) == 5, so T0 = A3 * 5 = byte offset of stub number A3 (NOTE(review): assumes A3 holds only the 8-bit immediate)
3313	lea T1, [.imm0 xWrtRIP] ; T1 = address of the first stub
3314	lea T1, [T1 + T0] ; T1 = address of the stub for this immediate
3315	call T1 ; run "pshufw mm0, mm1, imm; ret" for the requested immediate
3316	movq [A1], mm0 ; write the shuffled value back over the first operand
3317
3318	IEMIMPL_MMX_EPILOGUE
3319	EPILOGUE_4_ARGS
3320	%assign bImm 0
3321	%rep 256 ; one 5-byte stub per possible immediate value
3322	.imm %+ bImm:
3323	pshufw mm0, mm1, bImm
3324	ret
3325	%assign bImm bImm + 1
3326	%endrep
3327	.immEnd: ; 256*5 == 0x500
3328	dw 0xfaff + (.immEnd - .imm0) ; will cause warning if entries are too big.
3329	dw 0x104ff - (.immEnd - .imm0) ; will cause warning if entries are too small.
3330	ENDPROC iemAImpl_pshufw
3331
3332	;; SSE shuffle taking an 8-bit immediate in A3; dispatches into a table of 256 six-byte stubs. @param 1 The instruction.
3333	%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
3334	BEGINPROC_FASTCALL iemAImpl_ %+ %1, 16
3335	PROLOGUE_4_ARGS
3336	IEMIMPL_SSE_PROLOGUE
3337
3338	movdqu xmm0, [A1] ; xmm0 = destination operand (overwritten by the shuffle)
3339	movdqu xmm1, [A2] ; xmm1 = source operand
3340	lea T1, [.imm0 xWrtRIP] ; T1 = address of the first stub
3341	lea T0, [A3 + A3*2] ; sizeof(pshufXX+ret) == 6: (A3 * 3) * 2
3342	lea T1, [T1 + T0*2] ; T1 = .imm0 + A3 * 6 = address of the stub for this immediate
3343	call T1 ; run "<insn> xmm0, xmm1, imm; ret" for the requested immediate
3344	movdqu [A1], xmm0 ; write the shuffled value back over the first operand
3345
3346	IEMIMPL_SSE_EPILOGUE
3347	EPILOGUE_4_ARGS
3348	%assign bImm 0
3349	%rep 256 ; one 6-byte stub per possible immediate value
3350	.imm %+ bImm:
3351	%1 xmm0, xmm1, bImm
3352	ret
3353	%assign bImm bImm + 1
3354	%endrep
3355	.immEnd: ; 256*6 == 0x600
3356	dw 0xf9ff + (.immEnd - .imm0) ; will cause warning if entries are too big.
3357	dw 0x105ff - (.immEnd - .imm0) ; will cause warning if entries are too small.
3358	ENDPROC iemAImpl_ %+ %1
3359	%endmacro
3360
3361	IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
3362	IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
3363	IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
3364
3365
3366	;
3367	; Move byte mask.
3368	;
3369
3370	BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 12
3371	PROLOGUE_3_ARGS
3372	IEMIMPL_MMX_PROLOGUE
3373
3374	mov T0, [A1] ; NOTE(review): looks redundant, pmovmskb below overwrites T0 - confirm
3375	movq mm1, [A2] ; mm1 = source operand
3376	pmovmskb T0, mm1 ; T0 = mask built from the top bit of each source byte
3377	mov [A1], T0 ; store the mask to the destination
3378	%ifdef RT_ARCH_X86
3379	mov dword [A1 + 4], 0 ; 32-bit build: also zero the dword at A1+4, which the store above did not write
3380	%endif
3381	IEMIMPL_MMX_EPILOGUE
3382	EPILOGUE_3_ARGS
3383	ENDPROC iemAImpl_pmovmskb_u64
3384
3385	BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 12
3386	PROLOGUE_3_ARGS
3387	IEMIMPL_SSE_PROLOGUE
3388
3389	mov T0, [A1] ; NOTE(review): looks redundant, pmovmskb below overwrites T0 - confirm
3390	movdqu xmm1, [A2] ; xmm1 = source operand (unaligned load)
3391	pmovmskb T0, xmm1 ; T0 = mask built from the top bit of each source byte
3392	mov [A1], T0 ; store the mask to the destination
3393	%ifdef RT_ARCH_X86
3394	mov dword [A1 + 4], 0 ; 32-bit build: also zero the dword at A1+4, which the store above did not write
3395	%endif
3396	IEMIMPL_SSE_EPILOGUE
3397	EPILOGUE_3_ARGS
3398	ENDPROC iemAImpl_pmovmskb_u128
3399

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 94410

Download in other formats: