IEMAllAImpl.asm@ 88638

Last change on this file since 88638 was 87740, checked in by vboxsync, 4 years ago
SUP/Makefile-wrapper.gmk,iprt/asmdefs.mac: Call objtool on our .r0 object when ORCs are roaming freely across our Linux kernel. Otherwise we won't get any callstacks. bugref:9937
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 81.8 KB

Line
1	; $Id: IEMAllAImpl.asm 87740 2021-02-12 16:36:34Z vboxsync $
2	;; @file
3	; IEM - Instruction Implementation in Assembly.
4	;
5
6	;
7	; Copyright (C) 2011-2020 Oracle Corporation
8	;
9	; This file is part of VirtualBox Open Source Edition (OSE), as
10	; available from http://www.virtualbox.org. This file is free software;
11	; you can redistribute it and/or modify it under the terms of the GNU
12	; General Public License (GPL) as published by the Free Software
13	; Foundation, in version 2 as it comes in the "COPYING" file of the
14	; VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15	; hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16	;
17
18
19	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
20	; Header Files ;
21	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
22	%include "VBox/asmdefs.mac"
23	%include "VBox/err.mac"
24	%include "iprt/x86.mac"
25
26
27	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
28	; Defined Constants And Macros ;
29	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
30
;;
; Function return for fastcall functions.
;
; Expands to 'ret <cbArgs>' on 32-bit x86 Windows and to a bare 'ret' on
; every other target.
;
; @param    1   Byte count handed to 'ret' on 32-bit x86 Windows.
;
%macro RET_FASTCALL 1
 %ifndef RT_ARCH_X86
        ret
 %else
  %ifndef RT_OS_WINDOWS
        ret
  %else
        ret     %1
  %endif
 %endif
%endmacro
45
;;
; NAME for fastcall functions.
;
; Yields <a_Prefix><a_Name>@<a_cbArgs> on 32-bit x86 Windows and plain
; NAME(a_Name) on every other target.
;
;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
;        escaping (or whatever the dollar is good for here).  Thus the ugly
;        prefix argument.
;
%ifndef RT_ARCH_X86
 %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
%else
 %ifndef RT_OS_WINDOWS
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
 %else
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
 %endif
%endif
60
;;
; BEGINPROC for fastcall functions.
;
; Emits the export/global directives selected by the object format and then
; the entry label of the function.
;
; @param    1   The function name (C).
; @param    2   The argument size on x86.
;
%macro BEGINPROC_FASTCALL 2
 %ifdef ASM_FORMAT_PE
  export %1=NAME_FASTCALL(%1,%2,$@)
 %endif
 %ifdef ASM_FORMAT_OMF
  %ifdef __NASM__
  export NAME(%1) NAME_FASTCALL(%1,%2,$@)
  %endif
 %endif
 %ifndef ASM_FORMAT_BIN
  global NAME_FASTCALL(%1,%2,$@)
 %endif
NAME_FASTCALL(%1,%2,@):
%endmacro
81
82
83	;
84	; We employ some macro assembly here to hide the calling convention differences.
85	;
86	%ifdef RT_ARCH_AMD64
; AMD64 prologue/epilogue set.
;
; The prologues are empty and every epilogue is a plain 'ret'.  The _EX
; variants take the x86 stack clean-up size and ignore it.
; EPILOGUE_1_ARGS_EX accepts zero or one parameter so that both the historic
; parameterless use and the x86-style 'EPILOGUE_1_ARGS_EX <cb>' assemble.
%macro PROLOGUE_1_ARGS 0
%endmacro
%macro EPILOGUE_1_ARGS 0
        ret
%endmacro
%macro EPILOGUE_1_ARGS_EX 0-1
        ret
%endmacro

%macro PROLOGUE_2_ARGS 0
%endmacro
%macro EPILOGUE_2_ARGS 0
        ret
%endmacro
%macro EPILOGUE_2_ARGS_EX 1
        ret
%endmacro

%macro PROLOGUE_3_ARGS 0
%endmacro
%macro EPILOGUE_3_ARGS 0
        ret
%endmacro
%macro EPILOGUE_3_ARGS_EX 1
        ret
%endmacro

%macro PROLOGUE_4_ARGS 0
%endmacro
%macro EPILOGUE_4_ARGS 0
        ret
%endmacro
%macro EPILOGUE_4_ARGS_EX 1
        ret
%endmacro
122
; Argument register aliases for the GCC 64-bit calling convention.
; Note: there is no A3_8 alias.
%ifdef ASM_CALL64_GCC
 %define A0      rdi
 %define A0_32   edi
 %define A0_16   di
 %define A0_8    dil

 %define A1      rsi
 %define A1_32   esi
 %define A1_16   si
 %define A1_8    sil

 %define A2      rdx
 %define A2_32   edx
 %define A2_16   dx
 %define A2_8    dl

 %define A3      rcx
 %define A3_32   ecx
 %define A3_16   cx
%endif

; Argument register aliases for the Microsoft 64-bit calling convention.
; Note: there is no A3_8 alias.
%ifdef ASM_CALL64_MSC
 %define A0      rcx
 %define A0_32   ecx
 %define A0_16   cx
 %define A0_8    cl

 %define A1      rdx
 %define A1_32   edx
 %define A1_16   dx
 %define A1_8    dl

 %define A2      r8
 %define A2_32   r8d
 %define A2_16   r8w
 %define A2_8    r8b

 %define A3      r9
 %define A3_32   r9d
 %define A3_16   r9w
%endif

; Temporary registers, identical for both 64-bit conventions.
%define T0       rax
%define T0_32    eax
%define T0_16    ax
%define T0_8     al

%define T1       r11
%define T1_32    r11d
%define T1_16    r11w
%define T1_8     r11b
174
175	%else
; x86 (32-bit) prologue/epilogue set.
;
; Every prologue saves edi (used as T1).  The three and four argument
; prologues additionally save ebx / esi and load A2 / A3 from the stack.
%macro PROLOGUE_1_ARGS 0
        push    edi
%endmacro
%macro EPILOGUE_1_ARGS 0
        pop     edi
        ret     0
%endmacro
%macro EPILOGUE_1_ARGS_EX 1
        pop     edi
        ret     %1
%endmacro

%macro PROLOGUE_2_ARGS 0
        push    edi
%endmacro
%macro EPILOGUE_2_ARGS 0
        pop     edi
        ret     0
%endmacro
%macro EPILOGUE_2_ARGS_EX 1
        pop     edi
        ret     %1
%endmacro

%macro PROLOGUE_3_ARGS 0
        push    ebx
        mov     ebx, [esp + 4 + 4]      ; A2: skip saved ebx and the return address.
        push    edi
%endmacro
%macro EPILOGUE_3_ARGS_EX 1
 %if (%1) < 4
  %error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
 %endif
        pop     edi
        pop     ebx
        ret     %1
%endmacro
%macro EPILOGUE_3_ARGS 0
        EPILOGUE_3_ARGS_EX 4
%endmacro

%macro PROLOGUE_4_ARGS 0
        push    ebx
        push    edi
        push    esi
        mov     ebx, [esp + 12 + 4 + 0] ; A2: skip three saved registers and the return address.
        mov     esi, [esp + 12 + 4 + 4] ; A3: the stack slot after A2.
%endmacro
%macro EPILOGUE_4_ARGS_EX 1
 %if (%1) < 8
  %error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
 %endif
        pop     esi
        pop     edi
        pop     ebx
        ret     %1
%endmacro
%macro EPILOGUE_4_ARGS 0
        EPILOGUE_4_ARGS_EX 8
%endmacro
237
; x86 (32-bit) register aliases.  A0 and A1 are ecx and edx; A2 and A3 are
; ebx and esi, which the 3/4 argument prologues load from the stack.
; Note: there are no A3_8 and T1_8 aliases.
%define A0       ecx
%define A0_32    ecx
%define A0_16    cx
%define A0_8     cl

%define A1       edx
%define A1_32    edx
%define A1_16    dx
%define A1_8     dl

%define A2       ebx
%define A2_32    ebx
%define A2_16    bx
%define A2_8     bl

%define A3       esi
%define A3_32    esi
%define A3_16    si

%define T0       eax
%define T0_32    eax
%define T0_16    ax
%define T0_8     al

%define T1       edi
%define T1_32    edi
%define T1_16    di
265	%endif
266
267
;;
; Load the modified and undefined flags from [%1] into the host EFLAGS.
;
; The host flags are pushed, the bits in (%2 | %3) are replaced by the
; corresponding guest bits, and the result is popped back into EFLAGS.
; The load is unconditional: the '%if (%3) != 0' guard is commented out.
;
; @remarks  Clobbers T0, stack.  Changes EFLAGS.
; @param    1   The register (A0..A3 or other) pointing to the guest eflags.
; @param    2   The set of modified flags.
; @param    3   The set of undefined flags.
;
%macro IEM_MAYBE_LOAD_FLAGS 3
 ;%if (%3) != 0
        pushf                           ; store current flags
        mov     T0_32, [%1]             ; load the guest flags
        and     dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
        and     T0_32, (%2 | %3)        ; select the modified and undefined flags.
        or      [xSP], T0               ; merge guest flags with host flags.
        popf                            ; load the mixed flags.
 ;%endif
%endmacro
287
;;
; Update the guest flags at [%1] with the current host EFLAGS.
;
; Only the bits in (%2 | %3) are copied; all other guest flag bits are kept.
; Expands to nothing when (%2 | %3) is zero.
;
; @remarks  Clobbers T0, T1, stack.
; @param    1   The register pointing to the EFLAGS.
; @param    2   The mask of modified flags to save.
; @param    3   The mask of undefined flags to (maybe) save.
;
%macro IEM_SAVE_FLAGS 3
 %if (%2 | %3) != 0
        pushf
        pop     T1
        mov     T0_32, [%1]             ; flags
        and     T0_32, ~(%2 | %3)       ; clear the modified & undefined flags.
        and     T1_32, (%2 | %3)        ; select the modified and undefined flags.
        or      T0_32, T1_32            ; combine the flags.
        mov     [%1], T0_32             ; save the flags.
 %endif
%endmacro
307
308
;;
; Macro for implementing a binary operator.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; @param        1       The instruction mnemonic.
; @param        2       Non-zero if there should be a locked version.
; @param        3       The modified flags.
; @param        4       The undefined flags.
;
%macro IEMIMPL_BIN_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      byte [A0], A1_8
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 byte [A0], A1_8
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

;            instr, lock, modified-flags, undefined-flags.
IEMIMPL_BIN_OP add,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP adc,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sub,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sbb,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP or,   1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP xor,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP and,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP cmp,  0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP test, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
408
409
;;
; Macro for implementing a bit operator.
;
; This will generate code for the 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; @param        1       The instruction mnemonic.
; @param        2       Non-zero if there should be a locked version.
; @param        3       The modified flags.
; @param        4       The undefined flags.
;
%macro IEMIMPL_BIT_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro
IEMIMPL_BIT_OP bt,  0, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
486
;;
; Macro for implementing a bit search operator.
;
; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
; system where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.  The
; destination is only written when the instruction leaves ZF clear.
;
; This is the three parameter overload of IEMIMPL_BIT_OP; the assembler
; picks it by parameter count.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
%macro IEMIMPL_BIT_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_16, A1_16
        jz      .unchanged_dst          ; ZF set: leave the destination alone.
        mov     [A0], T0_16
.unchanged_dst:
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_32, A1_32
        jz      .unchanged_dst          ; ZF set: leave the destination alone.
        mov     [A0], T0_32
.unchanged_dst:
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0, A1
        jz      .unchanged_dst          ; ZF set: leave the destination alone.
        mov     [A0], T0
.unchanged_dst:
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro
IEMIMPL_BIT_OP bsf, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF)
IEMIMPL_BIT_OP bsr, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF)
539
540
;
; IMUL is also a similar but yet different case (no lock, no mem dst).
; The rDX:rAX variant of imul is handled together with mul further down.
;
; Each function multiplies the register operand in A1 by the value at [A0],
; stores the product back to [A0] and updates the eflags pointed to by A2.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_imul_two_u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
        imul    A1_16, word [A0]
        mov     [A0], A1_16
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u16

BEGINPROC_FASTCALL iemAImpl_imul_two_u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
        imul    A1_32, dword [A0]
        mov     [A0], A1_32
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u32

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_imul_two_u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
        imul    A1, qword [A0]
        mov     [A0], A1
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_imul_two_u64
%endif ; RT_ARCH_AMD64
574
575
;
; XCHG for memory operands.  This implies locking.  No flag changes.
;
; Each function takes two arguments, first the pointer to the memory,
; then the pointer to the register.  They all return void.
;
; Pattern: load the register value via A1, exchange it with [A0], then
; write the old memory value back via A1.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_xchg_u8, 8
        PROLOGUE_2_ARGS
        mov     T0_8, [A1]
        xchg    [A0], T0_8
        mov     [A1], T0_8
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8

BEGINPROC_FASTCALL iemAImpl_xchg_u16, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A1]
        xchg    [A0], T0_16
        mov     [A1], T0_16
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16

BEGINPROC_FASTCALL iemAImpl_xchg_u32, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A1]
        xchg    [A0], T0_32
        mov     [A1], T0_32
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xchg_u64, 8
        PROLOGUE_2_ARGS
        mov     T0, [A1]
        xchg    [A0], T0
        mov     [A1], T0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64
%endif
616
617
;
; XADD for memory operands.
;
; Each function takes three arguments, first the pointer to the
; memory/register, then the pointer to the register, and finally a pointer to
; eflags.  They all return void.
;
; Pattern: load the register value via A1, xadd it into [A0], then write the
; value xadd left in the register back via A1.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]
        xadd    [A0], T0_8
        mov     [A1], T0_8
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8

BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]
        xadd    [A0], T0_16
        mov     [A1], T0_16
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16

BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]
        xadd    [A0], T0_32
        mov     [A1], T0_32
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]
        xadd    [A0], T0
        mov     [A1], T0
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64
%endif ; RT_ARCH_AMD64

BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]
        lock xadd [A0], T0_8
        mov     [A1], T0_8
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]
        lock xadd [A0], T0_16
        mov     [A1], T0_16
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]
        lock xadd [A0], T0_32
        mov     [A1], T0_32
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32_locked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]
        lock xadd [A0], T0
        mov     [A1], T0
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64_locked
%endif ; RT_ARCH_AMD64
709
710
;
; CMPXCHG8B.
;
; These are tricky register wise, so the code is duplicated for each calling
; convention.
;
; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
;                                             uint32_t *pEFlags));
;
; Note! Identical to iemAImpl_cmpxchg16b.
;
; Common sequence in all three variants: load EBX:ECX from pu64EbxEcx, load
; the guest flags, load EAX:EDX from pu64EaxEdx, run the locked instruction,
; write EAX:EDX back and save ZF.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
        push    rbx

        mov     r11, rdx                ; pu64EaxEdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     ebx, [r8]
        mov     ecx, [r8 + 4]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [r11]
        mov     edx, [r11 + 4]

        lock cmpxchg8b [r10]

        mov     [r11], eax
        mov     [r11 + 4], edx
        IEM_SAVE_FLAGS       r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64EbxEcx (is also T1)

        mov     ebx, [r11]
        mov     ecx, [r11 + 4]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [rsi]
        mov     edx, [rsi + 4]

        lock cmpxchg8b [rdi]

        mov     [rsi], eax
        mov     [rsi + 4], edx
        IEM_SAVE_FLAGS       r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
%else
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64EaxEdx
        mov     ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS       ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8
%endif
ENDPROC iemAImpl_cmpxchg8b

BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
        ; Lazy bird always lock prefixes cmpxchg8b.
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg8b,16,$@)
ENDPROC iemAImpl_cmpxchg8b_locked
805
806	%ifdef RT_ARCH_AMD64
807
;
; CMPXCHG16B.
;
; These are tricky register wise, so the code is duplicated for each calling
; convention.
;
; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu1284RaxRdx, PRTUINT128U pu128RbxRcx,
;                                              uint32_t *pEFlags));
;
; Note! Identical to iemAImpl_cmpxchg8b, except that all values are 64 bits
;       wide, so the full RAX/RDX must be written back afterwards.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
 %ifdef ASM_CALL64_MSC
        push    rbx

        mov     r11, rdx                ; pu64RaxRdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     rbx, [r8]
        mov     rcx, [r8 + 8]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [r11]
        mov     rdx, [r11 + 8]

        lock cmpxchg16b [r10]

        mov     [r11], rax
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS       r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64RbxRcx (is also T1)

        mov     rbx, [r11]
        mov     rcx, [r11 + 8]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [rsi]
        mov     rdx, [rsi + 8]

        lock cmpxchg16b [rdi]

        ; Write back all 64 bits of each half; storing eax/edx here would
        ; leave the upper dwords of *pu1284RaxRdx stale.
        mov     [rsi], rax
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS       r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b

BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
        ; Lazy bird always lock prefixes cmpxchg16b.
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg16b,16,$@)
ENDPROC iemAImpl_cmpxchg16b_locked
872
873	%endif ; RT_ARCH_AMD64
874
875
;
; CMPXCHG.
;
; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t puXDst, uintX_t puEax, uintX_t uReg, uint32_t pEFlags));
;
; @param        1       Instruction prefix: empty or 'lock'.
; @param        2       Function name suffix: empty or '_locked'.
;
BEGINCODE
%macro IEMIMPL_CMPXCHG 2
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     al, [A1]
        %1 cmpxchg [A0], A2_8
        mov     [A1], al
        IEM_SAVE_FLAGS       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u8 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     ax, [A1]
        %1 cmpxchg [A0], A2_16
        mov     [A1], ax
        IEM_SAVE_FLAGS       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u16 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [A1]
        %1 cmpxchg [A0], A2_32
        mov     [A1], eax
        IEM_SAVE_FLAGS       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u32 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
 %ifdef RT_ARCH_AMD64
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     rax, [A1]
        %1 cmpxchg [A0], A2
        mov     [A1], rax
        IEM_SAVE_FLAGS       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
 %else
        ;
        ; Must use cmpxchg8b here.  See also iemAImpl_cmpxchg8b.
        ;
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64Rax
        mov     ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
        ; ZF=1 means the operands were equal, ZF=0 means they differed and
        ; edx:eax now holds the memory value.
        jnz     .cmpxchg8b_not_equal
        cmp     eax, eax                ; equal: produce the flags of an equal compare.
.store:
        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS       ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8

.cmpxchg8b_not_equal:
        ; Compare the caller's expected value (still at [esi]) against the
        ; memory value in edx:eax, high dword first.
        cmp     [esi + 4], edx  ;; @todo FIXME - verify 64-bit compare implementation
        jne     .store
        cmp     [esi], eax
        jmp     .store

 %endif
ENDPROC iemAImpl_cmpxchg_u64 %+ %2
%endmacro ; IEMIMPL_CMPXCHG

IEMIMPL_CMPXCHG , ,
IEMIMPL_CMPXCHG lock, _locked
973
;;
; Macro for implementing a unary operator.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions takes a pointer to the destination memory operand in A0
; and a pointer to eflags in A1.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
%macro IEMIMPL_UNARY_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      byte [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 byte [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      word [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 word [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      dword [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 dword [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      qword [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 qword [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_UNARY_OP not, 0, 0
1062
1063
;;
; Macro for implementing memory fence operation.
;
; Generates iemAImpl_<instruction>, which executes the instruction and
; returns.  No return value, no operands or anything.
;
; @param        1      The instruction.
;
%macro IEMIMPL_MEM_FENCE 1
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
        %1
        ret
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_MEM_FENCE lfence
IEMIMPL_MEM_FENCE sfence
IEMIMPL_MEM_FENCE mfence
1082
;;
; Alternative for non-SSE2 host.
;
; Pushes xAX, exchanges xAX with that stack slot, then drops the slot again.
;
BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
        push    xAX
        xchg    xAX, [xSP]
        add     xSP, xCB                ; discard the pushed slot.
        ret
ENDPROC iemAImpl_alt_mem_fence
1092
1093
1094
1095	;;
1096	; Macro for implementing a shift operation.
1097	;
1098	; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1099	; 32-bit systems where the 64-bit accesses require hand coding.
1100	;
1101	; All the functions take a pointer to the destination memory operand in A0,
1102	; the shift count in A1 and a pointer to eflags in A2.
1103	;
1104	; @param 1 The instruction mnemonic.
1105	; @param 2 The modified flags.
1106	; @param 3 The undefined flags.
1107	;
1108	; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
1109	; The count must end up in cl: the ASM_CALL64_GCC variant copies it there, the other variants swap A0 and A1 (presumably A0 is xCX there - the definitions are outside this section).
1110	%macro IEMIMPL_SHIFT_OP 3
1111	BEGINCODE
1112	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
1113	PROLOGUE_3_ARGS
1114	IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1115	%ifdef ASM_CALL64_GCC
1116	mov cl, A1_8 ; count into cl, destination pointer stays in A0
1117	%1 byte [A0], cl
1118	%else
1119	xchg A1, A0 ; after the swap A1 holds the destination pointer and cl is used as the count
1120	%1 byte [A1], cl
1121	%endif
1122	IEM_SAVE_FLAGS A2, %2, %3
1123	EPILOGUE_3_ARGS
1124	ENDPROC iemAImpl_ %+ %1 %+ _u8
1125
1126	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1127	PROLOGUE_3_ARGS
1128	IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1129	%ifdef ASM_CALL64_GCC
1130	mov cl, A1_8
1131	%1 word [A0], cl
1132	%else
1133	xchg A1, A0 ; same register shuffle as the 8-bit variant
1134	%1 word [A1], cl
1135	%endif
1136	IEM_SAVE_FLAGS A2, %2, %3
1137	EPILOGUE_3_ARGS
1138	ENDPROC iemAImpl_ %+ %1 %+ _u16
1139
1140	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1141	PROLOGUE_3_ARGS
1142	IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1143	%ifdef ASM_CALL64_GCC
1144	mov cl, A1_8
1145	%1 dword [A0], cl
1146	%else
1147	xchg A1, A0 ; same register shuffle as the 8-bit variant
1148	%1 dword [A1], cl
1149	%endif
1150	IEM_SAVE_FLAGS A2, %2, %3
1151	EPILOGUE_3_ARGS
1152	ENDPROC iemAImpl_ %+ %1 %+ _u32
1153
1154	%ifdef RT_ARCH_AMD64
1155	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
1156	PROLOGUE_3_ARGS
1157	IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1158	%ifdef ASM_CALL64_GCC
1159	mov cl, A1_8
1160	%1 qword [A0], cl
1161	%else
1162	xchg A1, A0 ; same register shuffle as the 8-bit variant
1163	%1 qword [A1], cl
1164	%endif
1165	IEM_SAVE_FLAGS A2, %2, %3
1166	EPILOGUE_3_ARGS
1167	ENDPROC iemAImpl_ %+ %1 %+ _u64
1168	%endif ; RT_ARCH_AMD64
1169
1170	%endmacro
1171
1172	IEMIMPL_SHIFT_OP rol, (X86_EFL_OF \| X86_EFL_CF), 0 ; rotates: only OF and CF are listed as modified, nothing as undefined
1173	IEMIMPL_SHIFT_OP ror, (X86_EFL_OF \| X86_EFL_CF), 0
1174	IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF \| X86_EFL_CF), 0
1175	IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF \| X86_EFL_CF), 0
1176	IEMIMPL_SHIFT_OP shl, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_PF \| X86_EFL_CF), (X86_EFL_AF) ; shifts: AF is passed as the undefined flag
1177	IEMIMPL_SHIFT_OP shr, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_PF \| X86_EFL_CF), (X86_EFL_AF)
1178	IEMIMPL_SHIFT_OP sar, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_PF \| X86_EFL_CF), (X86_EFL_AF)
1179
1180
1181	;;
1182	; Macro for implementing a double precision shift operation.
1183	;
1184	; This will generate code for the 16, 32 and 64 bit accesses, except on
1185	; 32-bit systems where the 64-bit accesses require hand coding.
1186	;
1187	; The functions take the destination operand (r/m) in A0, the source (reg) in
1188	; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
1189	;
1190	; @param 1 The instruction mnemonic.
1191	; @param 2 The modified flags.
1192	; @param 3 The undefined flags.
1193	;
1194	; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
1195	; The count must be in cl for the instruction, hence the xchg shuffles below (register assignments are defined outside this section).
1196	%macro IEMIMPL_SHIFT_DBL_OP 3
1197	BEGINCODE
1198	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
1199	PROLOGUE_4_ARGS
1200	IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1201	%ifdef ASM_CALL64_GCC
1202	xchg A3, A2 ; bring the count into the register providing cl
1203	%1 [A0], A1_16, cl
1204	xchg A3, A2 ; swap back so A3 is the eflags pointer again for IEM_SAVE_FLAGS
1205	%else
1206	xchg A0, A2 ; count goes where cl lives, destination pointer moves to A2
1207	%1 [A2], A1_16, cl
1208	%endif
1209	IEM_SAVE_FLAGS A3, %2, %3
1210	EPILOGUE_4_ARGS
1211	ENDPROC iemAImpl_ %+ %1 %+ _u16
1212
1213	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
1214	PROLOGUE_4_ARGS
1215	IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1216	%ifdef ASM_CALL64_GCC
1217	xchg A3, A2 ; same shuffle as the 16-bit variant
1218	%1 [A0], A1_32, cl
1219	xchg A3, A2
1220	%else
1221	xchg A0, A2
1222	%1 [A2], A1_32, cl
1223	%endif
1224	IEM_SAVE_FLAGS A3, %2, %3
1225	EPILOGUE_4_ARGS
1226	ENDPROC iemAImpl_ %+ %1 %+ _u32
1227
1228	%ifdef RT_ARCH_AMD64
1229	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
1230	PROLOGUE_4_ARGS
1231	IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1232	%ifdef ASM_CALL64_GCC
1233	xchg A3, A2 ; same shuffle as the 16-bit variant
1234	%1 [A0], A1, cl
1235	xchg A3, A2
1236	%else
1237	xchg A0, A2
1238	%1 [A2], A1, cl
1239	%endif
1240	IEM_SAVE_FLAGS A3, %2, %3
1241	EPILOGUE_4_ARGS_EX 12
1242	ENDPROC iemAImpl_ %+ %1 %+ _u64
1243	%endif ; RT_ARCH_AMD64
1244
1245	%endmacro
1246
1247	IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_PF \| X86_EFL_CF), (X86_EFL_AF) ; emits iemAImpl_shld_u16/_u32 and, on AMD64, _u64
1248	IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_PF \| X86_EFL_CF), (X86_EFL_AF) ; emits iemAImpl_shrd_u16/_u32 and, on AMD64, _u64
1249
1250
1251	;;
1252	; Macro for implementing a multiplication operation.
1253	;
1254	; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1255	; 32-bit systems where the 64-bit accesses require hand coding.
1256	;
1257	; The 8-bit function only operates on AX, so it takes no DX pointer. The other
1258	; functions take a pointer to rAX in A0, rDX in A1, the operand in A2 and a
1259	; pointer to eflags in A3.
1260	;
1261	; The functions all return 0 so the caller can be used for div/idiv as well as
1262	; for the mul/imul implementation.
1263	;
1264	; @param 1 The instruction mnemonic.
1265	; @param 2 The modified flags.
1266	; @param 3 The undefined flags.
1267	;
1268	; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
1269	;
1270	%macro IEMIMPL_MUL_OP 3
1271	BEGINCODE
1272	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
1273	PROLOGUE_3_ARGS
1274	IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1275	mov al, [A0] ; only the low byte at A0 is the input operand
1276	%1 A1_8
1277	mov [A0], ax ; the full 16-bit AX result is written back to A0
1278	IEM_SAVE_FLAGS A2, %2, %3
1279	xor eax, eax ; return 0
1280	EPILOGUE_3_ARGS
1281	ENDPROC iemAImpl_ %+ %1 %+ _u8
1282
1283	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
1284	PROLOGUE_4_ARGS
1285	IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1286	mov ax, [A0]
1287	%ifdef ASM_CALL64_GCC
1288	%1 A2_16
1289	mov [A0], ax
1290	mov [A1], dx
1291	%else
1292	mov T1, A1 ; keep the rDX pointer in T1 across the multiply (presumably A1 is xDX here, which the instruction overwrites)
1293	%1 A2_16
1294	mov [A0], ax
1295	mov [T1], dx
1296	%endif
1297	IEM_SAVE_FLAGS A3, %2, %3
1298	xor eax, eax ; return 0
1299	EPILOGUE_4_ARGS
1300	ENDPROC iemAImpl_ %+ %1 %+ _u16
1301
1302	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
1303	PROLOGUE_4_ARGS
1304	IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1305	mov eax, [A0]
1306	%ifdef ASM_CALL64_GCC
1307	%1 A2_32
1308	mov [A0], eax
1309	mov [A1], edx
1310	%else
1311	mov T1, A1 ; same pointer save as in the 16-bit variant
1312	%1 A2_32
1313	mov [A0], eax
1314	mov [T1], edx
1315	%endif
1316	IEM_SAVE_FLAGS A3, %2, %3
1317	xor eax, eax ; return 0
1318	EPILOGUE_4_ARGS
1319	ENDPROC iemAImpl_ %+ %1 %+ _u32
1320
1321	%ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
1322	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
1323	PROLOGUE_4_ARGS
1324	IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1325	mov rax, [A0]
1326	%ifdef ASM_CALL64_GCC
1327	%1 A2
1328	mov [A0], rax
1329	mov [A1], rdx
1330	%else
1331	mov T1, A1 ; same pointer save as in the 16-bit variant
1332	%1 A2
1333	mov [A0], rax
1334	mov [T1], rdx
1335	%endif
1336	IEM_SAVE_FLAGS A3, %2, %3
1337	xor eax, eax ; return 0
1338	EPILOGUE_4_ARGS_EX 12
1339	ENDPROC iemAImpl_ %+ %1 %+ _u64
1340	%endif ; RT_ARCH_AMD64
1341
1342	%endmacro
1343
1344	IEMIMPL_MUL_OP mul, (X86_EFL_OF \| X86_EFL_CF), (X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF) ; OF and CF modified, the other four status flags undefined
1345	IEMIMPL_MUL_OP imul, (X86_EFL_OF \| X86_EFL_CF), (X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF) ; same masks as mul
1346
1347
1348	BEGINCODE
1349	;;
1350	; Worker function for negating the number held in T1:T0 (T1_32 = high half, T0_32 = low half).
1351	; @uses None (T0,T1)
1352	BEGINPROC iemAImpl_negate_T0_T1_u32
1353	push 0 ; two zeroed stack slots
1354	push 0
1355	xchg T0_32, [xSP] ; T0 = 0, low half parked in the first slot
1356	xchg T1_32, [xSP + xCB] ; T1 = 0, high half parked in the second slot
1357	sub T0_32, [xSP] ; low = 0 - low
1358	sbb T1_32, [xSP + xCB] ; high = 0 - high - borrow
1359	add xSP, xCB*2 ; release the two slots
1360	ret
1361	ENDPROC iemAImpl_negate_T0_T1_u32
1362
1363	%ifdef RT_ARCH_AMD64
1364	;;
1365	; Worker function for negating the number held in T1:T0 (T1 = high half, T0 = low half, full-width registers).
1366	; @uses None (T0,T1)
1367	BEGINPROC iemAImpl_negate_T0_T1_u64
1368	push 0 ; two zeroed stack slots
1369	push 0
1370	xchg T0, [xSP] ; T0 = 0, low half parked in the first slot
1371	xchg T1, [xSP + xCB] ; T1 = 0, high half parked in the second slot
1372	sub T0, [xSP] ; low = 0 - low
1373	sbb T1, [xSP + xCB] ; high = 0 - high - borrow
1374	add xSP, xCB*2 ; release the two slots
1375	ret
1376	ENDPROC iemAImpl_negate_T0_T1_u64
1377	%endif
1378
1379
1380	;;
1381	; Macro for implementing a division operation.
1382	;
1383	; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1384	; 32-bit systems where the 64-bit accesses require hand coding.
1385	;
1386	; The 8-bit function only operates on AX, so it takes no DX pointer. The other
1387	; functions take a pointer to rAX in A0, rDX in A1, the operand in A2 and a
1388	; pointer to eflags in A3.
1389	;
1390	; The functions all return 0 on success and -1 if a divide error should be
1391	; raised by the caller.
1392	;
1393	; @param 1 The instruction mnemonic.
1394	; @param 2 The modified flags.
1395	; @param 3 The undefined flags.
1396	; @param 4 1 if signed, 0 if unsigned.
1397	;
1398	; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
1399	; Each variant rejects a zero divisor and pre-checks for quotient overflow before executing the real instruction.
1400	%macro IEMIMPL_DIV_OP 4
1401	BEGINCODE
1402	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
1403	PROLOGUE_3_ARGS
1404
1405	; div by chainsaw check.
1406	test A1_8, A1_8
1407	jz .div_zero
1408
1409	; Overflow check - unsigned division is simple to verify, haven't
1410	; found a simple way to check signed division yet unfortunately.
1411	%if %4 == 0
1412	cmp [A0 + 1], A1_8 ; compares the high byte of the 16-bit dividend with the divisor
1413	jae .div_overflow
1414	%else
1415	mov T0_16, [A0] ; T0 = dividend
1416	mov T1, A1 ; T1 = saved divisor (because of missing T1_8 in 32-bit)
1417	test A1_8, A1_8
1418	js .divisor_negative
1419	test T0_16, T0_16
1420	jns .both_positive
1421	neg T0_16
1422	.one_of_each: ; OK range is 2^(result-width - 1) + (divisor - 1).
1423	push T0 ; Start off like unsigned below.
1424	shr T0_16, 7
1425	cmp T0_8, A1_8
1426	pop T0 ; pop leaves the flags of the cmp intact
1427	jb .div_no_overflow
1428	ja .div_overflow
1429	and T0_8, 0x7f ; Special case for covering (divisor - 1).
1430	cmp T0_8, A1_8
1431	jae .div_overflow
1432	jmp .div_no_overflow
1433
1434	.divisor_negative:
1435	neg A1_8
1436	test T0_16, T0_16
1437	jns .one_of_each
1438	neg T0_16
1439	.both_positive: ; Same as unsigned shifted by sign indicator bit.
1440	shr T0_16, 7
1441	cmp T0_8, A1_8
1442	jae .div_overflow
1443	.div_no_overflow:
1444	mov A1, T1 ; restore divisor
1445	%endif
1446
1447	IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1448	mov ax, [A0]
1449	%1 A1_8
1450	mov [A0], ax
1451	IEM_SAVE_FLAGS A2, %2, %3
1452	xor eax, eax ; return 0 = success
1453
1454	.return:
1455	EPILOGUE_3_ARGS
1456
1457	.div_zero:
1458	.div_overflow:
1459	mov eax, -1 ; return -1 = caller should raise a divide error
1460	jmp .return
1461	ENDPROC iemAImpl_ %+ %1 %+ _u8
1462
1463	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
1464	PROLOGUE_4_ARGS
1465
1466	; div by chainsaw check.
1467	test A2_16, A2_16
1468	jz .div_zero
1469
1470	; Overflow check - unsigned division is simple to verify, haven't
1471	; found a simple way to check signed division yet unfortunately.
1472	%if %4 == 0
1473	cmp [A1], A2_16 ; compares the high word (DX value) of the dividend with the divisor
1474	jae .div_overflow
1475	%else
1476	mov T0_16, [A1]
1477	shl T0_32, 16
1478	mov T0_16, [A0] ; T0 = dividend
1479	mov T1, A2 ; T1 = divisor
1480	test T1_16, T1_16
1481	js .divisor_negative
1482	test T0_32, T0_32
1483	jns .both_positive
1484	neg T0_32
1485	.one_of_each: ; OK range is 2^(result-width - 1) + (divisor - 1).
1486	push T0 ; Start off like unsigned below.
1487	shr T0_32, 15
1488	cmp T0_16, T1_16
1489	pop T0 ; pop leaves the flags of the cmp intact
1490	jb .div_no_overflow
1491	ja .div_overflow
1492	and T0_16, 0x7fff ; Special case for covering (divisor - 1).
1493	cmp T0_16, T1_16
1494	jae .div_overflow
1495	jmp .div_no_overflow
1496
1497	.divisor_negative:
1498	neg T1_16
1499	test T0_32, T0_32
1500	jns .one_of_each
1501	neg T0_32
1502	.both_positive: ; Same as unsigned shifted by sign indicator bit.
1503	shr T0_32, 15
1504	cmp T0_16, T1_16
1505	jae .div_overflow
1506	.div_no_overflow:
1507	%endif
1508
1509	IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1510	%ifdef ASM_CALL64_GCC
1511	mov T1, A2 ; divisor moved to T1 before dx is loaded (presumably A2 is rdx in this convention)
1512	mov ax, [A0]
1513	mov dx, [A1]
1514	%1 T1_16
1515	mov [A0], ax
1516	mov [A1], dx
1517	%else
1518	mov T1, A1 ; rDX pointer moved to T1 before dx is loaded (presumably A1 is xDX here)
1519	mov ax, [A0]
1520	mov dx, [T1]
1521	%1 A2_16
1522	mov [A0], ax
1523	mov [T1], dx
1524	%endif
1525	IEM_SAVE_FLAGS A3, %2, %3
1526	xor eax, eax ; return 0 = success
1527
1528	.return:
1529	EPILOGUE_4_ARGS
1530
1531	.div_zero:
1532	.div_overflow:
1533	mov eax, -1 ; return -1 = caller should raise a divide error
1534	jmp .return
1535	ENDPROC iemAImpl_ %+ %1 %+ _u16
1536
1537	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
1538	PROLOGUE_4_ARGS
1539
1540	; div by chainsaw check.
1541	test A2_32, A2_32
1542	jz .div_zero
1543
1544	; Overflow check - unsigned division is simple to verify, haven't
1545	; found a simple way to check signed division yet unfortunately.
1546	%if %4 == 0
1547	cmp [A1], A2_32 ; compares the high dword (EDX value) of the dividend with the divisor
1548	jae .div_overflow
1549	%else
1550	push A2 ; save A2 so we can modify it (we are out of regs on x86).
1551	mov T0_32, [A0] ; T0 = dividend low
1552	mov T1_32, [A1] ; T1 = dividend high
1553	test A2_32, A2_32
1554	js .divisor_negative
1555	test T1_32, T1_32
1556	jns .both_positive
1557	call NAME(iemAImpl_negate_T0_T1_u32)
1558	.one_of_each: ; OK range is 2^(result-width - 1) + (divisor - 1).
1559	push T0 ; Start off like unsigned below.
1560	shl T1_32, 1
1561	shr T0_32, 31
1562	or T1_32, T0_32
1563	cmp T1_32, A2_32
1564	pop T0 ; pop leaves the flags of the cmp intact
1565	jb .div_no_overflow
1566	ja .div_overflow
1567	and T0_32, 0x7fffffff ; Special case for covering (divisor - 1).
1568	cmp T0_32, A2_32
1569	jae .div_overflow
1570	jmp .div_no_overflow
1571
1572	.divisor_negative:
1573	neg A2_32
1574	test T1_32, T1_32
1575	jns .one_of_each
1576	call NAME(iemAImpl_negate_T0_T1_u32)
1577	.both_positive: ; Same as unsigned shifted by sign indicator bit.
1578	shl T1_32, 1
1579	shr T0_32, 31
1580	or T1_32, T0_32
1581	cmp T1_32, A2_32
1582	jae .div_overflow
1583	.div_no_overflow:
1584	pop A2 ; restore the original divisor saved above
1585	%endif
1586
1587	IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1588	mov eax, [A0] ; NOTE(review): looks redundant - eax is loaded again in both branches below
1589	%ifdef ASM_CALL64_GCC
1590	mov T1, A2 ; divisor moved to T1 before edx is loaded (presumably A2 is rdx in this convention)
1591	mov eax, [A0]
1592	mov edx, [A1]
1593	%1 T1_32
1594	mov [A0], eax
1595	mov [A1], edx
1596	%else
1597	mov T1, A1 ; rDX pointer moved to T1 before edx is loaded (presumably A1 is xDX here)
1598	mov eax, [A0]
1599	mov edx, [T1]
1600	%1 A2_32
1601	mov [A0], eax
1602	mov [T1], edx
1603	%endif
1604	IEM_SAVE_FLAGS A3, %2, %3
1605	xor eax, eax ; return 0 = success
1606
1607	.return:
1608	EPILOGUE_4_ARGS
1609
1610	.div_overflow:
1611	%if %4 != 0
1612	pop A2 ; the signed path still has A2 on the stack when it gets here
1613	%endif
1614	.div_zero:
1615	mov eax, -1 ; return -1 = caller should raise a divide error
1616	jmp .return
1617	ENDPROC iemAImpl_ %+ %1 %+ _u32
1618
1619	%ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
1620	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
1621	PROLOGUE_4_ARGS
1622
1623	test A2, A2 ; div by zero check
1624	jz .div_zero
1625	%if %4 == 0
1626	cmp [A1], A2 ; compares the high qword (RDX value) of the dividend with the divisor
1627	jae .div_overflow
1628	%else
1629	push A2 ; save A2 so we can modify it (we are out of regs on x86).
1630	mov T0, [A0] ; T0 = dividend low
1631	mov T1, [A1] ; T1 = dividend high
1632	test A2, A2
1633	js .divisor_negative
1634	test T1, T1
1635	jns .both_positive
1636	call NAME(iemAImpl_negate_T0_T1_u64)
1637	.one_of_each: ; OK range is 2^(result-width - 1) + (divisor - 1).
1638	push T0 ; Start off like unsigned below.
1639	shl T1, 1
1640	shr T0, 63
1641	or T1, T0
1642	cmp T1, A2
1643	pop T0 ; pop leaves the flags of the cmp intact
1644	jb .div_no_overflow
1645	ja .div_overflow
1646	mov T1, 0x7fffffffffffffff ; mask goes via a register here, unlike the immediate and in the narrower variants
1647	and T0, T1 ; Special case for covering (divisor - 1).
1648	cmp T0, A2
1649	jae .div_overflow
1650	jmp .div_no_overflow
1651
1652	.divisor_negative:
1653	neg A2
1654	test T1, T1
1655	jns .one_of_each
1656	call NAME(iemAImpl_negate_T0_T1_u64)
1657	.both_positive: ; Same as unsigned shifted by sign indicator bit.
1658	shl T1, 1
1659	shr T0, 63
1660	or T1, T0
1661	cmp T1, A2
1662	jae .div_overflow
1663	.div_no_overflow:
1664	pop A2 ; restore the original divisor saved above
1665	%endif
1666
1667	IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1668	mov rax, [A0] ; NOTE(review): looks redundant - rax is loaded again in both branches below
1669	%ifdef ASM_CALL64_GCC
1670	mov T1, A2 ; divisor moved to T1 before rdx is loaded (presumably A2 is rdx in this convention)
1671	mov rax, [A0]
1672	mov rdx, [A1]
1673	%1 T1
1674	mov [A0], rax
1675	mov [A1], rdx
1676	%else
1677	mov T1, A1 ; rDX pointer moved to T1 before rdx is loaded (presumably A1 is rdx here)
1678	mov rax, [A0]
1679	mov rdx, [T1]
1680	%1 A2
1681	mov [A0], rax
1682	mov [T1], rdx
1683	%endif
1684	IEM_SAVE_FLAGS A3, %2, %3
1685	xor eax, eax ; return 0 = success
1686
1687	.return:
1688	EPILOGUE_4_ARGS_EX 12
1689
1690	.div_overflow:
1691	%if %4 != 0
1692	pop A2 ; the signed path still has A2 on the stack when it gets here
1693	%endif
1694	.div_zero:
1695	mov eax, -1 ; return -1 = caller should raise a divide error
1696	jmp .return
1697	ENDPROC iemAImpl_ %+ %1 %+ _u64
1698	%endif ; RT_ARCH_AMD64
1699
1700	%endmacro
1701
1702	IEMIMPL_DIV_OP div, 0, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0 ; unsigned: no modified flags, all six status flags passed as undefined
1703	IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 1 ; signed: same flag masks, selects the signed overflow checks
1704
1705
1706	;
1707	; BSWAP. No flag changes.
1708	;
1709	; Each function takes one argument, pointer to the value to bswap
1710	; (input/output). They all return void.
1711	;
1712	BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
1713	PROLOGUE_1_ARGS
1714	mov T0_32, [A0] ; just in case any of the upper bits are used.
1715	db 66h ; hand-emitted operand-size prefix in front of the bswap below (presumably to get the host CPU's own behaviour for the 16-bit form)
1716	bswap T0_32
1717	mov [A0], T0_32 ; all 32 bits are written back, not just the low word
1718	EPILOGUE_1_ARGS
1719	ENDPROC iemAImpl_bswap_u16
1720
1721	BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
1722	PROLOGUE_1_ARGS
1723	mov T0_32, [A0] ; load the 32-bit value A0 points to
1724	bswap T0_32 ; reverse its byte order
1725	mov [A0], T0_32 ; store it back in place
1726	EPILOGUE_1_ARGS
1727	ENDPROC iemAImpl_bswap_u32
1728
1729	BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
1730	%ifdef RT_ARCH_AMD64
1731	PROLOGUE_1_ARGS
1732	mov T0, [A0] ; 64-bit host: a single full-width bswap does the job
1733	bswap T0
1734	mov [A0], T0
1735	EPILOGUE_1_ARGS
1736	%else
1737	PROLOGUE_1_ARGS
1738	mov T0, [A0] ; 32-bit host: T0 = low dword
1739	mov T1, [A0 + 4] ; T1 = high dword
1740	bswap T0
1741	bswap T1
1742	mov [A0 + 4], T0 ; byte-swapped low dword becomes the new high dword
1743	mov [A0], T1 ; byte-swapped high dword becomes the new low dword
1744	EPILOGUE_1_ARGS
1745	%endif
1746	ENDPROC iemAImpl_bswap_u64
1747
1748
1749	;;
1750	; Initialize the FPU for the actual instruction being emulated, this means
1751	; loading parts of the guest's control word and status word.
1752	; The environment is stored at [xSP], patched in place and loaded back; T0 and T1 are used as scratch.
1753	; @uses 24 bytes of stack. NOTE(review): the fnstenv image starts at [xSP] and callers in this file reserve 20h bytes - confirm the size against X86FSTENV32P.
1754	; @param 1 Expression giving the address of the FXSTATE of the guest.
1755	;
1756	%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
1757	fnstenv [xSP]
1758
1759	; FCW - for exception, precision and rounding control.
1760	movzx T0, word [%1 + X86FXSTATE.FCW]
1761	and T0, X86_FCW_MASK_ALL \| X86_FCW_PC_MASK \| X86_FCW_RC_MASK
1762	mov [xSP + X86FSTENV32P.FCW], T0_16
1763
1764	; FSW - for undefined C0, C1, C2, and C3.
1765	movzx T1, word [%1 + X86FXSTATE.FSW]
1766	and T1, X86_FSW_C_MASK
1767	movzx T0, word [xSP + X86FSTENV32P.FSW]
1768	and T0, X86_FSW_TOP_MASK
1769	or T0, T1
1770	mov [xSP + X86FSTENV32P.FSW], T0_16
1771
1772	fldenv [xSP]
1773	%endmacro
1774
1775
1776	;;
1777	; Result record written by the FPU workers below: an 80-bit value followed by the FSW.
1778	; @todo Need to move this as well somewhere better?
1779	struc IEMFPURESULT
1780	.r80Result resw 5 ; 5 words = 10 bytes for the 80-bit result
1781	.FSW resw 1 ; status word
1782	endstruc
1783
1784
1785	;;
1786	; Two-result variant: first 80-bit value, FSW, then a second 80-bit value.
1787	; @todo Need to move this as well somewhere better?
1788	struc IEMFPURESULTTWO
1789	.r80Result1 resw 5 ; 5 words = 10 bytes for the first 80-bit result
1790	.FSW resw 1 ; status word, at the same offset as IEMFPURESULT.FSW
1791	.r80Result2 resw 5 ; 10 bytes for the second 80-bit result
1792	endstruc
1793
1794
1795	;
1796	;---------------------- 16-bit signed integer operations ----------------------
1797	;
1798
1799
1800	;;
1801	; Converts a 16-bit signed integer value to a 80-bit floating point one (fpu register).
1802	;
1803	; @param A0 FPU context (fxsave).
1804	; @param A1 Pointer to a IEMFPURESULT for the output.
1805	; @param A2 Pointer to the 16-bit signed integer value to convert.
1806	;
1807	BEGINPROC_FASTCALL iemAImpl_fild_i16_to_r80, 12
1808	PROLOGUE_3_ARGS
1809	sub xSP, 20h ; scratch area for the fnstenv/fldenv image used by the macro below
1810
1811	fninit ; start from a freshly initialised host FPU
1812	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and condition code bits
1813	fild word [A2] ; load the integer operand
1814
1815	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status word produced by the load
1816	fnclex ; clear exception flags before the store (presumably so the fstp cannot trip on a pending one)
1817	fstp tword [A1 + IEMFPURESULT.r80Result] ; write out the 80-bit result
1818
1819	fninit ; leave the host FPU in its initial state
1820	add xSP, 20h
1821	EPILOGUE_3_ARGS
1822	ENDPROC iemAImpl_fild_i16_to_r80
1823
1824
1825	;;
1826	; Store a 80-bit floating point value (register) as a 16-bit signed integer (memory).
1827	;
1828	; @param A0 FPU context (fxsave).
1829	; @param A1 Where to return the output FSW.
1830	; @param A2 Where to store the 16-bit signed integer value.
1831	; @param A3 Pointer to the 80-bit value.
1832	;
1833	BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
1834	PROLOGUE_4_ARGS
1835	sub xSP, 20h ; scratch area for the fnstenv/fldenv image used by the macro below
1836
1837	fninit ; start from a freshly initialised host FPU
1838	fld tword [A3] ; load the input before the guest control word is applied
1839	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and condition code bits
1840	fistp word [A2] ; convert and store as a 16-bit integer
1841
1842	fnstsw word [A1] ; hand the resulting status word back
1843
1844	fninit ; leave the host FPU in its initial state
1845	add xSP, 20h
1846	EPILOGUE_4_ARGS
1847	ENDPROC iemAImpl_fist_r80_to_i16
1848
1849
1850	;;
1851	; Store a 80-bit floating point value (register) as a 16-bit signed integer
1852	; (memory) with truncation.
1853	;
1854	; @param A0 FPU context (fxsave).
1855	; @param A1 Where to return the output FSW.
1856	; @param A2 Where to store the 16-bit signed integer value.
1857	; @param A3 Pointer to the 80-bit value.
1858	;
1859	BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
1860	PROLOGUE_4_ARGS
1861	sub xSP, 20h ; scratch area for the fnstenv/fldenv image used by the macro below
1862
1863	fninit ; start from a freshly initialised host FPU
1864	fld tword [A3] ; load the input before the guest control word is applied
1865	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and condition code bits
1866	fisttp word [A2] ; 16-bit destination: a dword store would write 2 bytes past it and use the 32-bit overflow range
1867
1868	fnstsw word [A1] ; hand the resulting status word back
1869
1870	fninit ; leave the host FPU in its initial state
1871	add xSP, 20h
1872	EPILOGUE_4_ARGS
1873	ENDPROC iemAImpl_fistt_r80_to_i16
1874
1875
1876	;;
1877	; FPU instruction working on one 80-bit and one 16-bit signed integer value.
1878	;
1879	; @param 1 The instruction
1880	;
1881	; @param A0 FPU context (fxsave).
1882	; @param A1 Pointer to a IEMFPURESULT for the output.
1883	; @param A2 Pointer to the 80-bit value.
1884	; @param A3 Pointer to the 16-bit value.
1885	;
1886	%macro IEMIMPL_FPU_R80_BY_I16 1
1887	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
1888	PROLOGUE_4_ARGS
1889	sub xSP, 20h ; scratch area for the fnstenv/fldenv image used by the macro below
1890
1891	fninit ; start from a freshly initialised host FPU
1892	fld tword [A2] ; load the 80-bit operand before the guest control word is applied
1893	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
1894	%1 word [A3]
1895
1896	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status word of the operation
1897	fnclex ; clear exception flags before the store (presumably so the fstp cannot trip on a pending one)
1898	fstp tword [A1 + IEMFPURESULT.r80Result] ; write out the 80-bit result
1899
1900	fninit ; leave the host FPU in its initial state
1901	add xSP, 20h
1902	EPILOGUE_4_ARGS
1903	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
1904	%endmacro
1905
1906	IEMIMPL_FPU_R80_BY_I16 fiadd ; emits iemAImpl_fiadd_r80_by_i16
1907	IEMIMPL_FPU_R80_BY_I16 fimul ; emits iemAImpl_fimul_r80_by_i16
1908	IEMIMPL_FPU_R80_BY_I16 fisub ; emits iemAImpl_fisub_r80_by_i16
1909	IEMIMPL_FPU_R80_BY_I16 fisubr ; emits iemAImpl_fisubr_r80_by_i16
1910	IEMIMPL_FPU_R80_BY_I16 fidiv ; emits iemAImpl_fidiv_r80_by_i16
1911	IEMIMPL_FPU_R80_BY_I16 fidivr ; emits iemAImpl_fidivr_r80_by_i16
1912
1913
1914	;;
1915	; FPU instruction working on one 80-bit and one 16-bit signed integer value,
1916	; only returning FSW.
1917	;
1918	; @param 1 The instruction
1919	;
1920	; @param A0 FPU context (fxsave).
1921	; @param A1 Where to store the output FSW.
1922	; @param A2 Pointer to the 80-bit value.
1923	; @param A3 Pointer to the 16-bit value.
1924	;
1925	%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
1926	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
1927	PROLOGUE_4_ARGS
1928	sub xSP, 20h ; scratch area for the fnstenv/fldenv image used by the macro below
1929
1930	fninit ; start from a freshly initialised host FPU
1931	fld tword [A2] ; load the 80-bit operand before the guest control word is applied
1932	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
1933	%1 word [A3]
1934
1935	fnstsw word [A1] ; only the status word is returned, no value is stored
1936
1937	fninit ; leave the host FPU in its initial state
1938	add xSP, 20h
1939	EPILOGUE_4_ARGS
1940	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
1941	%endmacro
1942
1943	IEMIMPL_FPU_R80_BY_I16_FSW ficom ; emits iemAImpl_ficom_r80_by_i16
1944
1945
1946
1947	;
1948	;---------------------- 32-bit signed integer operations ----------------------
1949	;
1950
1951
1952	;;
1953	; Converts a 32-bit signed integer value to a 80-bit floating point one (fpu register).
1954	;
1955	; @param A0 FPU context (fxsave).
1956	; @param A1 Pointer to a IEMFPURESULT for the output.
1957	; @param A2 Pointer to the 32-bit signed integer value to convert.
1958	;
1959	BEGINPROC_FASTCALL iemAImpl_fild_i32_to_r80, 12
1960	PROLOGUE_3_ARGS
1961	sub xSP, 20h ; scratch area for the fnstenv/fldenv image used by the macro below
1962
1963	fninit ; start from a freshly initialised host FPU
1964	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and condition code bits
1965	fild dword [A2] ; load the integer operand
1966
1967	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status word produced by the load
1968	fnclex ; clear exception flags before the store (presumably so the fstp cannot trip on a pending one)
1969	fstp tword [A1 + IEMFPURESULT.r80Result] ; write out the 80-bit result
1970
1971	fninit ; leave the host FPU in its initial state
1972	add xSP, 20h
1973	EPILOGUE_3_ARGS
1974	ENDPROC iemAImpl_fild_i32_to_r80
1975
1976
1977	;;
1978	; Store a 80-bit floating point value (register) as a 32-bit signed integer (memory).
1979	;
1980	; @param A0 FPU context (fxsave).
1981	; @param A1 Where to return the output FSW.
1982	; @param A2 Where to store the 32-bit signed integer value.
1983	; @param A3 Pointer to the 80-bit value.
1984	;
1985	BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
1986	PROLOGUE_4_ARGS
1987	sub xSP, 20h ; scratch area for the fnstenv/fldenv image used by the macro below
1988
1989	fninit ; start from a freshly initialised host FPU
1990	fld tword [A3] ; load the input before the guest control word is applied
1991	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and condition code bits
1992	fistp dword [A2] ; convert and store as a 32-bit integer
1993
1994	fnstsw word [A1] ; hand the resulting status word back
1995
1996	fninit ; leave the host FPU in its initial state
1997	add xSP, 20h
1998	EPILOGUE_4_ARGS
1999	ENDPROC iemAImpl_fist_r80_to_i32
2000
2001
2002	;;
2003	; Store a 80-bit floating point value (register) as a 32-bit signed integer
2004	; (memory) with truncation.
2005	;
2006	; @param A0 FPU context (fxsave).
2007	; @param A1 Where to return the output FSW.
2008	; @param A2 Where to store the 32-bit signed integer value.
2009	; @param A3 Pointer to the 80-bit value.
2010	;
2011	BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
2012	PROLOGUE_4_ARGS
2013	sub xSP, 20h ; scratch area for the fnstenv/fldenv image used by the macro below
2014
2015	fninit ; start from a freshly initialised host FPU
2016	fld tword [A3] ; load the input before the guest control word is applied
2017	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and condition code bits
2018	fisttp dword [A2] ; truncating convert and store as a 32-bit integer
2019
2020	fnstsw word [A1] ; hand the resulting status word back
2021
2022	fninit ; leave the host FPU in its initial state
2023	add xSP, 20h
2024	EPILOGUE_4_ARGS
2025	ENDPROC iemAImpl_fistt_r80_to_i32
2026
2027
2028	;;
2029	; FPU instruction working on one 80-bit and one 32-bit signed integer value.
2030	;
2031	; @param 1 The instruction
2032	;
2033	; @param A0 FPU context (fxsave).
2034	; @param A1 Pointer to a IEMFPURESULT for the output.
2035	; @param A2 Pointer to the 80-bit value.
2036	; @param A3 Pointer to the 32-bit value.
2037	;
2038	%macro IEMIMPL_FPU_R80_BY_I32 1
2039	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
2040	PROLOGUE_4_ARGS
2041	sub xSP, 20h ; scratch area for the fnstenv/fldenv image used by the macro below
2042
2043	fninit ; start from a freshly initialised host FPU
2044	fld tword [A2] ; load the 80-bit operand before the guest control word is applied
2045	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2046	%1 dword [A3]
2047
2048	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status word of the operation
2049	fnclex ; clear exception flags before the store (presumably so the fstp cannot trip on a pending one)
2050	fstp tword [A1 + IEMFPURESULT.r80Result] ; write out the 80-bit result
2051
2052	fninit ; leave the host FPU in its initial state
2053	add xSP, 20h
2054	EPILOGUE_4_ARGS
2055	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
2056	%endmacro
2057
2058	IEMIMPL_FPU_R80_BY_I32 fiadd ; emits iemAImpl_fiadd_r80_by_i32
2059	IEMIMPL_FPU_R80_BY_I32 fimul ; emits iemAImpl_fimul_r80_by_i32
2060	IEMIMPL_FPU_R80_BY_I32 fisub ; emits iemAImpl_fisub_r80_by_i32
2061	IEMIMPL_FPU_R80_BY_I32 fisubr ; emits iemAImpl_fisubr_r80_by_i32
2062	IEMIMPL_FPU_R80_BY_I32 fidiv ; emits iemAImpl_fidiv_r80_by_i32
2063	IEMIMPL_FPU_R80_BY_I32 fidivr ; emits iemAImpl_fidivr_r80_by_i32
2064
2065
2066	;;
2067	; FPU instruction working on one 80-bit and one 32-bit signed integer value,
2068	; only returning FSW.
2069	;
2070	; @param 1 The instruction
2071	;
2072	; @param A0 FPU context (fxsave).
2073	; @param A1 Where to store the output FSW.
2074	; @param A2 Pointer to the 80-bit value.
2075	; @param A3 Pointer to the 32-bit value.
2076	;
2077	%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
2078	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
2079	PROLOGUE_4_ARGS
2080	sub xSP, 20h ; scratch area for the fnstenv/fldenv image used by the macro below
2081
2082	fninit ; start from a freshly initialised host FPU
2083	fld tword [A2] ; load the 80-bit operand before the guest control word is applied
2084	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2085	%1 dword [A3]
2086
2087	fnstsw word [A1] ; only the status word is returned, no value is stored
2088
2089	fninit ; leave the host FPU in its initial state
2090	add xSP, 20h
2091	EPILOGUE_4_ARGS
2092	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
2093	%endmacro
2094
2095	IEMIMPL_FPU_R80_BY_I32_FSW ficom ; emits iemAImpl_ficom_r80_by_i32
2096
2097
2098
2099	;
2100	;---------------------- 64-bit signed integer operations ----------------------
2101	;
2102
2103
2104	;;
2105	; Converts a 64-bit signed integer value to a 80-bit floating point one (fpu register).
2106	;
2107	; @param A0 FPU context (fxsave).
2108	; @param A1 Pointer to a IEMFPURESULT for the output.
2109	; @param A2 Pointer to the 64-bit signed integer value to convert.
2110	;
2111	BEGINPROC_FASTCALL iemAImpl_fild_i64_to_r80, 12
2112	PROLOGUE_3_ARGS
2113	sub xSP, 20h ; scratch area for the fnstenv/fldenv image used by the macro below
2114
2115	fninit ; start from a freshly initialised host FPU
2116	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and condition code bits
2117	fild qword [A2] ; load the integer operand
2118
2119	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status word produced by the load
2120	fnclex ; clear exception flags before the store (presumably so the fstp cannot trip on a pending one)
2121	fstp tword [A1 + IEMFPURESULT.r80Result] ; write out the 80-bit result
2122
2123	fninit ; leave the host FPU in its initial state
2124	add xSP, 20h
2125	EPILOGUE_3_ARGS
2126	ENDPROC iemAImpl_fild_i64_to_r80
2127
2128
2129	;;
2130	; Store a 80-bit floating point value (register) as a 64-bit signed integer (memory).
2131	;
2132	; @param A0 FPU context (fxsave).
2133	; @param A1 Where to return the output FSW.
2134	; @param A2 Where to store the 64-bit signed integer value.
2135	; @param A3 Pointer to the 80-bit value.
2136	;
2137	BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
2138	PROLOGUE_4_ARGS
2139	sub xSP, 20h ; scratch area for the fnstenv/fldenv image used by the macro below
2140
2141	fninit ; start from a freshly initialised host FPU
2142	fld tword [A3] ; load the input before the guest control word is applied
2143	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and condition code bits
2144	fistp qword [A2] ; convert and store as a 64-bit integer
2145
2146	fnstsw word [A1] ; hand the resulting status word back
2147
2148	fninit ; leave the host FPU in its initial state
2149	add xSP, 20h
2150	EPILOGUE_4_ARGS
2151	ENDPROC iemAImpl_fist_r80_to_i64
2152
2153
2154	;;
2155	; Store a 80-bit floating point value (register) as a 64-bit signed integer
2156	; (memory) with truncation.
2157	;
2158	; @param A0 FPU context (fxsave).
2159	; @param A1 Where to return the output FSW.
2160	; @param A2 Where to store the 64-bit signed integer value.
2161	; @param A3 Pointer to the 80-bit value.
2162	;
2163	BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
2164	PROLOGUE_4_ARGS
2165	sub xSP, 20h ; scratch area for the fnstenv/fldenv image used by the macro below
2166
2167	fninit ; start from a freshly initialised host FPU
2168	fld tword [A3] ; load the input before the guest control word is applied
2169	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and condition code bits
2170	fisttp qword [A2] ; truncating convert and store as a 64-bit integer
2171
2172	fnstsw word [A1] ; hand the resulting status word back
2173
2174	fninit ; leave the host FPU in its initial state
2175	add xSP, 20h
2176	EPILOGUE_4_ARGS
2177	ENDPROC iemAImpl_fistt_r80_to_i64
2178
2179
2180
2181	;
2182	;---------------------- 32-bit floating point operations ----------------------
2183	;
2184
2185	;;
2186	; Converts a 32-bit floating point value to a 80-bit one (fpu register).
2187	;
2188	; @param A0 FPU context (fxsave).
2189	; @param A1 Pointer to a IEMFPURESULT for the output.
2190	; @param A2 Pointer to the 32-bit floating point value to convert.
2191	;
2192	BEGINPROC_FASTCALL iemAImpl_fld_r32_to_r80, 12
2193	PROLOGUE_3_ARGS
2194	sub xSP, 20h ; scratch area for the fnstenv/fldenv image used by the macro below
2195
2196	fninit ; start from a freshly initialised host FPU
2197	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and condition code bits
2198	fld dword [A2] ; load the 32-bit floating point operand
2199
2200	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status word produced by the load
2201	fnclex ; clear exception flags before the store (presumably so the fstp cannot trip on a pending one)
2202	fstp tword [A1 + IEMFPURESULT.r80Result] ; write out the 80-bit result
2203
2204	fninit ; leave the host FPU in its initial state
2205	add xSP, 20h
2206	EPILOGUE_3_ARGS
2207	ENDPROC iemAImpl_fld_r32_to_r80
2208
2209
2210	;;
2211	; Store a 80-bit floating point value (register) as a 32-bit one (memory).
2212	;
2213	; @param A0 FPU context (fxsave).
2214	; @param A1 Where to return the output FSW.
2215	; @param A2 Where to store the 32-bit value.
2216	; @param A3 Pointer to the 80-bit value.
2217	;
2218	BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
2219	PROLOGUE_4_ARGS
2220	sub xSP, 20h ; scratch area for the fnstenv/fldenv image used by the macro below
2221
2222	fninit ; start from a freshly initialised host FPU
2223	fld tword [A3] ; load the input before the guest control word is applied
2224	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and condition code bits
2225	fst dword [A2] ; non-popping store; the fninit below resets the register stack anyway
2226
2227	fnstsw word [A1] ; hand the resulting status word back
2228
2229	fninit ; leave the host FPU in its initial state
2230	add xSP, 20h
2231	EPILOGUE_4_ARGS
2232	ENDPROC iemAImpl_fst_r80_to_r32
2233
2234
2235	;;
2236	; FPU instruction working on one 80-bit and one 32-bit floating point value.
2237	;
2238	; @param 1 The instruction
2239	;
2240	; @param A0 FPU context (fxsave).
2241	; @param A1 Pointer to a IEMFPURESULT for the output.
2242	; @param A2 Pointer to the 80-bit value.
2243	; @param A3 Pointer to the 32-bit value.
2244	;
2245	%macro IEMIMPL_FPU_R80_BY_R32 1
2246	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
2247	PROLOGUE_4_ARGS
2248	sub xSP, 20h ; scratch area for the fnstenv/fldenv image used by the macro below
2249
2250	fninit ; start from a freshly initialised host FPU
2251	fld tword [A2] ; load the 80-bit operand before the guest control word is applied
2252	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2253	%1 dword [A3]
2254
2255	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status word of the operation
2256	fnclex ; clear exception flags before the store (presumably so the fstp cannot trip on a pending one)
2257	fstp tword [A1 + IEMFPURESULT.r80Result] ; write out the 80-bit result
2258
2259	fninit ; leave the host FPU in its initial state
2260	add xSP, 20h
2261	EPILOGUE_4_ARGS
2262	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
2263	%endmacro
2264
2265	IEMIMPL_FPU_R80_BY_R32 fadd ; emits iemAImpl_fadd_r80_by_r32
2266	IEMIMPL_FPU_R80_BY_R32 fmul ; emits iemAImpl_fmul_r80_by_r32
2267	IEMIMPL_FPU_R80_BY_R32 fsub ; emits iemAImpl_fsub_r80_by_r32
2268	IEMIMPL_FPU_R80_BY_R32 fsubr ; emits iemAImpl_fsubr_r80_by_r32
2269	IEMIMPL_FPU_R80_BY_R32 fdiv ; emits iemAImpl_fdiv_r80_by_r32
2270	IEMIMPL_FPU_R80_BY_R32 fdivr ; emits iemAImpl_fdivr_r80_by_r32
2271
2272
2273	;;
2274	; FPU instruction working on one 80-bit and one 32-bit floating point value,
2275	; only returning FSW.
2276	;
2277	; @param 1 The instruction
2278	;
2279	; @param A0 FPU context (fxsave).
2280	; @param A1 Where to store the output FSW.
2281	; @param A2 Pointer to the 80-bit value.
2282	; @param A3 Pointer to the 32-bit value.
2283	;
2284	%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
2285	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
2286	PROLOGUE_4_ARGS
2287	sub xSP, 20h ; scratch area for the fnstenv/fldenv image used by the macro below
2288
2289	fninit ; start from a freshly initialised host FPU
2290	fld tword [A2] ; load the 80-bit operand before the guest control word is applied
2291	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2292	%1 dword [A3]
2293
2294	fnstsw word [A1] ; only the status word is returned, no value is stored
2295
2296	fninit ; leave the host FPU in its initial state
2297	add xSP, 20h
2298	EPILOGUE_4_ARGS
2299	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
2300	%endmacro
2301
2302	IEMIMPL_FPU_R80_BY_R32_FSW fcom ; emits iemAImpl_fcom_r80_by_r32
2303
2304
2305
2306	;
2307	;---------------------- 64-bit floating point operations ----------------------
2308	;
2309
2310	;;
2311	; Converts a 64-bit floating point value to a 80-bit one (fpu register).
2312	;
2313	; @param A0 FPU context (fxsave).
2314	; @param A1 Pointer to a IEMFPURESULT for the output.
2315	; @param A2 Pointer to the 64-bit floating point value to convert.
2316	;
2317	BEGINPROC_FASTCALL iemAImpl_fld_r64_to_r80, 12
2318	PROLOGUE_3_ARGS
2319	sub xSP, 20h ; scratch area for the fnstenv/fldenv image used by the macro below
2320	fninit ; start from a freshly initialised host FPU, like every sibling worker does, so the load cannot hit stale register stack state
2321	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and condition code bits
2322	fld qword [A2] ; load the 64-bit floating point operand
2323
2324	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status word produced by the load
2325	fnclex ; clear exception flags before the store (presumably so the fstp cannot trip on a pending one)
2326	fstp tword [A1 + IEMFPURESULT.r80Result] ; write out the 80-bit result
2327
2328	fninit ; leave the host FPU in its initial state
2329	add xSP, 20h
2330	EPILOGUE_3_ARGS
2331	ENDPROC iemAImpl_fld_r64_to_r80
2332
2333
2334	;;
2335	; Store a 80-bit floating point value (register) as a 64-bit one (memory).
2336	;
2337	; @param A0 FPU context (fxsave).
2338	; @param A1 Where to return the output FSW.
2339	; @param A2 Where to store the 64-bit value.
2340	; @param A3 Pointer to the 80-bit value.
2341	;
2342	BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
2343	PROLOGUE_4_ARGS
2344	sub xSP, 20h	; reserve 32 bytes of stack; released again before the epilogue
2345
2346	fninit	; start from a freshly initialised x87 unit
2347	fld tword [A3]	; push the 80-bit source value as ST0
2348	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0	; macro defined outside this view; presumably applies FCW and a sanitised FSW from the fxsave area at A0 - confirm
2349	fst qword [A2]	; convert ST0 to 64 bits and store it (no pop; the stack is reset below anyway)
2350
2351	fnstsw word [A1]	; report the status word resulting from the store
2352
2353	fninit	; leave the x87 unit reinitialised on exit
2354	add xSP, 20h
2355	EPILOGUE_4_ARGS
2356	ENDPROC iemAImpl_fst_r80_to_r64
2357
2358
2359	;;
2360	; FPU instruction working on one 80-bit and one 64-bit floating point value.
2361	;
2362	; @param 1 The instruction
2363	;
2364	; @param A0 FPU context (fxsave).
2365	; @param A1 Pointer to a IEMFPURESULT for the output.
2366	; @param A2 Pointer to the 80-bit value.
2367	; @param A3 Pointer to the 64-bit value.
2368	;
2369	%macro IEMIMPL_FPU_R80_BY_R64 1
2370	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
2371	PROLOGUE_4_ARGS
2372	sub xSP, 20h	; reserve 32 bytes of stack; released again before the epilogue
2373
2374	fninit	; start from a freshly initialised x87 unit
2375	fld tword [A2]	; push the 80-bit operand so it becomes ST0
2376	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0	; macro defined outside this view; presumably applies FCW and a sanitised FSW from the fxsave area at A0 - confirm
2377	%1 qword [A3]	; run the instruction with the 64-bit memory operand; the result replaces ST0
2378
2379	fnstsw word [A1 + IEMFPURESULT.FSW]	; capture the resulting status word first
2380	fnclex	; drop pending exception flags so the store below does not trip over them
2381	fstp tword [A1 + IEMFPURESULT.r80Result]	; write out ST0 as the 80-bit result
2382
2383	fninit	; leave the x87 unit reinitialised on exit
2384	add xSP, 20h
2385	EPILOGUE_4_ARGS
2386	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
2387	%endmacro
2388
2389	IEMIMPL_FPU_R80_BY_R64 fadd
2390	IEMIMPL_FPU_R80_BY_R64 fmul
2391	IEMIMPL_FPU_R80_BY_R64 fsub
2392	IEMIMPL_FPU_R80_BY_R64 fsubr
2393	IEMIMPL_FPU_R80_BY_R64 fdiv
2394	IEMIMPL_FPU_R80_BY_R64 fdivr
2395
2396	;;
2397	; FPU instruction working on one 80-bit and one 64-bit floating point value,
2398	; only returning FSW.
2399	;
2400	; @param 1 The instruction
2401	;
2402	; @param A0 FPU context (fxsave).
2403	; @param A1 Where to store the output FSW.
2404	; @param A2 Pointer to the 80-bit value.
2405	; @param A3 Pointer to the 64-bit value.
2406	;
2407	%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
2408	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
2409	PROLOGUE_4_ARGS
2410	sub xSP, 20h	; reserve 32 bytes of stack; released again before the epilogue
2411
2412	fninit	; start from a freshly initialised x87 unit
2413	fld tword [A2]	; push the 80-bit operand so it becomes ST0
2414	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0	; macro defined outside this view; presumably applies FCW and a sanitised FSW from the fxsave area at A0 - confirm
2415	%1 qword [A3]	; run the instruction against the 64-bit memory operand
2416
2417	fnstsw word [A1]	; the status word is the only output
2418
2419	fninit	; leave the x87 unit reinitialised on exit
2420	add xSP, 20h
2421	EPILOGUE_4_ARGS
2422	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
2423	%endmacro
2424
2425	IEMIMPL_FPU_R80_BY_R64_FSW fcom
2426
2427
2428
2429	;
2430	;---------------------- 80-bit floating point operations ----------------------
2431	;
2432
2433	;;
2434	; Loads a 80-bit floating point register value from memory.
2435	;
2436	; @param A0 FPU context (fxsave).
2437	; @param A1 Pointer to a IEMFPURESULT for the output.
2438	; @param A2 Pointer to the 80-bit floating point value to load.
2439	;
2440	BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
2441	PROLOGUE_3_ARGS
2442	sub xSP, 20h	; reserve 32 bytes of stack; released again before the epilogue
2443
2444	fninit	; start from a freshly initialised x87 unit
2445	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0	; macro defined outside this view; presumably applies FCW and a sanitised FSW from the fxsave area at A0 - confirm
2446	fld tword [A2]	; the load being emulated; the value lands in ST0
2447
2448	fnstsw word [A1 + IEMFPURESULT.FSW]	; capture the status word produced by the load
2449	fnclex	; drop pending exception flags so the store below does not trip over them
2450	fstp tword [A1 + IEMFPURESULT.r80Result]	; write out the loaded value and pop it
2451
2452	fninit	; leave the x87 unit reinitialised on exit
2453	add xSP, 20h
2454	EPILOGUE_3_ARGS
2455	ENDPROC iemAImpl_fld_r80_from_r80
2456
2457
2458	;;
2459	; Store a 80-bit floating point register to memory
2460	;
2461	; @param A0 FPU context (fxsave).
2462	; @param A1 Where to return the output FSW.
2463	; @param A2 Where to store the 80-bit value.
2464	; @param A3 Pointer to the 80-bit register value.
2465	;
2466	BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
2467	PROLOGUE_4_ARGS
2468	sub xSP, 20h	; reserve 32 bytes of stack; released again before the epilogue
2469
2470	fninit	; start from a freshly initialised x87 unit
2471	fld tword [A3]	; push the 80-bit source value as ST0
2472	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0	; macro defined outside this view; presumably applies FCW and a sanitised FSW from the fxsave area at A0 - confirm
2473	fstp tword [A2]	; 80-bit stores only exist in the popping form, hence fstp rather than fst
2474
2475	fnstsw word [A1]	; report the status word resulting from the store
2476
2477	fninit	; leave the x87 unit reinitialised on exit
2478	add xSP, 20h
2479	EPILOGUE_4_ARGS
2480	ENDPROC iemAImpl_fst_r80_to_r80
2481
2482
2483	;;
2484	; FPU instruction working on two 80-bit floating point values.
2485	;
2486	; @param 1 The instruction
2487	; @param 2 The operand list to append to the instruction (may be empty).
2488	; @param A0 FPU context (fxsave).
2489	; @param A1 Pointer to a IEMFPURESULT for the output.
2490	; @param A2 Pointer to the first 80-bit value (ST0)
2491	; @param A3 Pointer to the second 80-bit value (STn).
2492	;
2493	%macro IEMIMPL_FPU_R80_BY_R80 2
2494	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
2495	PROLOGUE_4_ARGS
2496	sub xSP, 20h	; reserve 32 bytes of stack; released again before the epilogue
2497
2498	fninit	; start from a freshly initialised x87 unit
2499	fld tword [A3]	; pushed first, so it ends up as ST1 after the next load
2500	fld tword [A2]	; pushed last, so it is ST0
2501	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0	; macro defined outside this view; presumably applies FCW and a sanitised FSW from the fxsave area at A0 - confirm
2502	%1 %2	; the instruction plus whatever operand text the instantiation supplied
2503
2504	fnstsw word [A1 + IEMFPURESULT.FSW]	; capture the resulting status word first
2505	fnclex	; drop pending exception flags so the store below does not trip over them
2506	fstp tword [A1 + IEMFPURESULT.r80Result]	; write out ST0 as the 80-bit result
2507
2508	fninit	; leave the x87 unit reinitialised on exit
2509	add xSP, 20h
2510	EPILOGUE_4_ARGS
2511	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
2512	%endmacro
2513
2514	IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1}
2515	IEMIMPL_FPU_R80_BY_R80 fmul, {st0, st1}
2516	IEMIMPL_FPU_R80_BY_R80 fsub, {st0, st1}
2517	IEMIMPL_FPU_R80_BY_R80 fsubr, {st0, st1}
2518	IEMIMPL_FPU_R80_BY_R80 fdiv, {st0, st1}
2519	IEMIMPL_FPU_R80_BY_R80 fdivr, {st0, st1}
2520	IEMIMPL_FPU_R80_BY_R80 fprem, {}	; empty operand list: the instruction is emitted without explicit operands
2521	IEMIMPL_FPU_R80_BY_R80 fprem1, {}
2522	IEMIMPL_FPU_R80_BY_R80 fscale, {}
2523
2524
2525	;;
2526	; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
2527	; storing the result in ST1 and popping the stack.
2528	;
2529	; @param 1 The instruction
2530	;
2531	; @param A0 FPU context (fxsave).
2532	; @param A1 Pointer to a IEMFPURESULT for the output.
2533	; @param A2 Pointer to the first 80-bit value (ST1).
2534	; @param A3 Pointer to the second 80-bit value (ST0).
2535	;
2536	%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
2537	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
2538	PROLOGUE_4_ARGS
2539	sub xSP, 20h	; reserve 32 bytes of stack; released again before the epilogue
2540
2541	fninit	; start from a freshly initialised x87 unit
2542	fld tword [A2]	; pushed first, so it ends up as ST1 after the next load
2543	fld tword [A3]	; pushed last, so it is ST0
2544	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0	; macro defined outside this view; presumably applies FCW and a sanitised FSW from the fxsave area at A0 - confirm
2545	%1	; operand-less instruction; per the header it writes ST1 and pops, leaving the result as the new ST0
2546
2547	fnstsw word [A1 + IEMFPURESULT.FSW]	; capture the resulting status word first
2548	fnclex	; drop pending exception flags so the store below does not trip over them
2549	fstp tword [A1 + IEMFPURESULT.r80Result]	; write out the top of stack as the 80-bit result
2550
2551	fninit	; leave the x87 unit reinitialised on exit
2552	add xSP, 20h
2553	EPILOGUE_4_ARGS
2554	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
2555	%endmacro
2556
2557	IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
2558	IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
2559	IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
2560
2561
2562	;;
2563	; FPU instruction working on two 80-bit floating point values, only
2564	; returning FSW.
2565	;
2566	; @param 1 The instruction
2567	;
2568	; @param A0 FPU context (fxsave).
2569	; @param A1 Pointer to a uint16_t for the resulting FSW.
2570	; @param A2 Pointer to the first 80-bit value.
2571	; @param A3 Pointer to the second 80-bit value.
2572	;
2573	%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
2574	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
2575	PROLOGUE_4_ARGS
2576	sub xSP, 20h	; reserve 32 bytes of stack; released again before the epilogue
2577
2578	fninit	; start from a freshly initialised x87 unit
2579	fld tword [A3]	; second value: pushed first, so it ends up as ST1
2580	fld tword [A2]	; first value: pushed last, so it is ST0
2581	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0	; macro defined outside this view; presumably applies FCW and a sanitised FSW from the fxsave area at A0 - confirm
2582	%1 st0, st1	; compare the first value (ST0) with the second (ST1)
2583
2584	fnstsw word [A1]	; the status word is the only output
2585
2586	fninit	; leave the x87 unit reinitialised on exit
2587	add xSP, 20h
2588	EPILOGUE_4_ARGS
2589	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
2590	%endmacro
2591
2592	IEMIMPL_FPU_R80_BY_R80_FSW fcom
2593	IEMIMPL_FPU_R80_BY_R80_FSW fucom
2594
2595
2596	;;
2597	; FPU instruction working on two 80-bit floating point values,
2598	; returning FSW and EFLAGS (eax).
2599	;
2600	; @param 1 The instruction
2601	;
2602	; @returns EFLAGS in EAX.
2603	; @param A0 FPU context (fxsave).
2604	; @param A1 Pointer to a uint16_t for the resulting FSW.
2605	; @param A2 Pointer to the first 80-bit value.
2606	; @param A3 Pointer to the second 80-bit value.
2607	;
2608	%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
2609	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
2610	PROLOGUE_4_ARGS
2611	sub xSP, 20h	; reserve 32 bytes of stack; released again before the epilogue
2612
2613	fninit	; start from a freshly initialised x87 unit
2614	fld tword [A3]	; second value: pushed first, so it ends up as ST1
2615	fld tword [A2]	; first value: pushed last, so it is ST0
2616	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0	; macro defined outside this view; presumably applies FCW and a sanitised FSW from the fxsave area at A0 - confirm
2617	%1 st1	; compare ST0 with ST1; the instantiated instructions report the outcome in EFLAGS
2618
2619	fnstsw word [A1]	; hand back the status word ...
2620	pushf	; ... and the flags register, fetched via the stack before anything can change it
2621	pop xAX	; return value = flags as left by the compare
2622
2623	fninit	; leave the x87 unit reinitialised on exit
2624	add xSP, 20h
2625	EPILOGUE_4_ARGS
2626	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
2627	%endmacro
2628
2629	IEMIMPL_FPU_R80_BY_R80_EFL fcomi
2630	IEMIMPL_FPU_R80_BY_R80_EFL fucomi
2631
2632
2633	;;
2634	; FPU instruction working on one 80-bit floating point value.
2635	;
2636	; @param 1 The instruction
2637	;
2638	; @param A0 FPU context (fxsave).
2639	; @param A1 Pointer to a IEMFPURESULT for the output.
2640	; @param A2 Pointer to the 80-bit value.
2641	;
2642	%macro IEMIMPL_FPU_R80 1
2643	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
2644	PROLOGUE_3_ARGS
2645	sub xSP, 20h	; reserve 32 bytes of stack; released again before the epilogue
2646
2647	fninit	; start from a freshly initialised x87 unit
2648	fld tword [A2]	; push the operand so it becomes ST0
2649	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0	; macro defined outside this view; presumably applies FCW and a sanitised FSW from the fxsave area at A0 - confirm
2650	%1	; operand-less instruction acting on ST0 in place
2651
2652	fnstsw word [A1 + IEMFPURESULT.FSW]	; capture the resulting status word first
2653	fnclex	; drop pending exception flags so the store below does not trip over them
2654	fstp tword [A1 + IEMFPURESULT.r80Result]	; write out ST0 as the 80-bit result
2655
2656	fninit	; leave the x87 unit reinitialised on exit
2657	add xSP, 20h
2658	EPILOGUE_3_ARGS
2659	ENDPROC iemAImpl_ %+ %1 %+ _r80
2660	%endmacro
2661
2662	IEMIMPL_FPU_R80 fchs
2663	IEMIMPL_FPU_R80 fabs
2664	IEMIMPL_FPU_R80 f2xm1
2665	IEMIMPL_FPU_R80 fsqrt
2666	IEMIMPL_FPU_R80 frndint
2667	IEMIMPL_FPU_R80 fsin
2668	IEMIMPL_FPU_R80 fcos
2669
2670
2671	;;
2672	; FPU instruction working on one 80-bit floating point value, only
2673	; returning FSW.
2674	;
2675	; @param 1 The instruction
2676	;
2677	; @param A0 FPU context (fxsave).
2678	; @param A1 Pointer to a uint16_t for the resulting FSW.
2679	; @param A2 Pointer to the 80-bit value.
2680	;
2681	%macro IEMIMPL_FPU_R80_FSW 1
2682	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
2683	PROLOGUE_3_ARGS
2684	sub xSP, 20h	; reserve 32 bytes of stack; released again before the epilogue
2685
2686	fninit	; start from a freshly initialised x87 unit
2687	fld tword [A2]	; push the operand so it becomes ST0
2688	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0	; macro defined outside this view; presumably applies FCW and a sanitised FSW from the fxsave area at A0 - confirm
2689	%1	; operand-less instruction examining ST0
2690
2691	fnstsw word [A1]	; the status word is the only output
2692
2693	fninit	; leave the x87 unit reinitialised on exit
2694	add xSP, 20h
2695	EPILOGUE_3_ARGS
2696	ENDPROC iemAImpl_ %+ %1 %+ _r80
2697	%endmacro
2698
2699	IEMIMPL_FPU_R80_FSW ftst
2700	IEMIMPL_FPU_R80_FSW fxam
2701
2702
2703
2704	;;
2705	; FPU instruction loading a 80-bit floating point constant.
2706	;
2707	; @param 1 The instruction
2708	;
2709	; @param A0 FPU context (fxsave).
2710	; @param A1 Pointer to a IEMFPURESULT for the output.
2711	;
2712	%macro IEMIMPL_FPU_R80_CONST 1
2713	BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
2714	PROLOGUE_2_ARGS
2715	sub xSP, 20h	; reserve 32 bytes of stack; released again before the epilogue
2716
2717	fninit	; start from a freshly initialised x87 unit
2718	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0	; macro defined outside this view; presumably applies FCW and a sanitised FSW from the fxsave area at A0 - confirm
2719	%1	; push the constant; it becomes ST0
2720
2721	fnstsw word [A1 + IEMFPURESULT.FSW]	; capture the resulting status word first
2722	fnclex	; drop pending exception flags so the store below does not trip over them
2723	fstp tword [A1 + IEMFPURESULT.r80Result]	; write out the constant as the 80-bit result
2724
2725	fninit	; leave the x87 unit reinitialised on exit
2726	add xSP, 20h
2727	EPILOGUE_2_ARGS
2728	ENDPROC iemAImpl_ %+ %1
2729	%endmacro
2730
2731	IEMIMPL_FPU_R80_CONST fld1
2732	IEMIMPL_FPU_R80_CONST fldl2t
2733	IEMIMPL_FPU_R80_CONST fldl2e
2734	IEMIMPL_FPU_R80_CONST fldpi
2735	IEMIMPL_FPU_R80_CONST fldlg2
2736	IEMIMPL_FPU_R80_CONST fldln2
2737	IEMIMPL_FPU_R80_CONST fldz
2738
2739
2740	;;
2741	; FPU instruction working on one 80-bit floating point value, outputting two.
2742	;
2743	; @param 1 The instruction
2744	;
2745	; @param A0 FPU context (fxsave).
2746	; @param A1 Pointer to a IEMFPURESULTTWO for the output.
2747	; @param A2 Pointer to the 80-bit value.
2748	;
2749	%macro IEMIMPL_FPU_R80_R80 1
2750	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
2751	PROLOGUE_3_ARGS
2752	sub xSP, 20h	; reserve 32 bytes of stack; released again before the epilogue
2753
2754	fninit	; start from a freshly initialised x87 unit
2755	fld tword [A2]	; push the operand so it becomes ST0
2756	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0	; macro defined outside this view; presumably applies FCW and a sanitised FSW from the fxsave area at A0 - confirm
2757	%1	; operand-less instruction; two values are expected on the stack afterwards
2758
2759	fnstsw word [A1 + IEMFPURESULTTWO.FSW]	; capture the resulting status word first
2760	fnclex	; drop pending exception flags so the store below does not trip over them
2761	fstp tword [A1 + IEMFPURESULTTWO.r80Result2]	; top of stack goes to the second result slot
2762	fnclex	; same precaution again before the second store
2763	fstp tword [A1 + IEMFPURESULTTWO.r80Result1]	; the value beneath it goes to the first result slot
2764
2765	fninit	; leave the x87 unit reinitialised on exit
2766	add xSP, 20h
2767	EPILOGUE_3_ARGS
2768	ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
2769	%endmacro
2770
2771	IEMIMPL_FPU_R80_R80 fptan
2772	IEMIMPL_FPU_R80_R80 fxtract
2773	IEMIMPL_FPU_R80_R80 fsincos
2774
2775
2776
2777
2778	;---------------------- SSE and MMX Operations ----------------------
2779
2780	;; @todo what do we need to do for MMX?  (Placeholder hooks: both macros currently expand to nothing.)
2781	%macro IEMIMPL_MMX_PROLOGUE 0
2782	%endmacro
2783	%macro IEMIMPL_MMX_EPILOGUE 0
2784	%endmacro
2785
2786	;; @todo what do we need to do for SSE?  (Placeholder hooks: both macros currently expand to nothing.)
2787	%macro IEMIMPL_SSE_PROLOGUE 0
2788	%endmacro
2789	%macro IEMIMPL_SSE_EPILOGUE 0
2790	%endmacro
2791
2792
2793	;;
2794	; Media instruction working on two full sized registers.
2795	; Emits two helpers per instruction: a 64-bit MMX one (_u64) and a 128-bit SSE one (_u128).
2796	; @param 1 The instruction
2797	;
2798	; @param A0 FPU context (fxsave).
2799	; @param A1 Pointer to the first media register size operand (input/output).
2800	; @param A2 Pointer to the second media register size operand (input).
2801	;
2802	%macro IEMIMPL_MEDIA_F2 1
2803	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
2804	PROLOGUE_3_ARGS
2805	IEMIMPL_MMX_PROLOGUE
2806
2807	movq mm0, [A1]	; destination operand (64 bits)
2808	movq mm1, [A2]	; source operand (64 bits)
2809	%1 mm0, mm1
2810	movq [A1], mm0	; write the result back over the first operand
2811
2812	IEMIMPL_MMX_EPILOGUE
2813	EPILOGUE_3_ARGS
2814	ENDPROC iemAImpl_ %+ %1 %+ _u64
2815
2816	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
2817	PROLOGUE_3_ARGS
2818	IEMIMPL_SSE_PROLOGUE
2819
2820	movdqu xmm0, [A1]	; destination operand (128 bits, unaligned load)
2821	movdqu xmm1, [A2]	; source operand (128 bits, unaligned load)
2822	%1 xmm0, xmm1
2823	movdqu [A1], xmm0	; write the result back over the first operand
2824
2825	IEMIMPL_SSE_EPILOGUE
2826	EPILOGUE_3_ARGS
2827	ENDPROC iemAImpl_ %+ %1 %+ _u128
2828	%endmacro
2829
2830	IEMIMPL_MEDIA_F2 pxor
2831	IEMIMPL_MEDIA_F2 pcmpeqb
2832	IEMIMPL_MEDIA_F2 pcmpeqw
2833	IEMIMPL_MEDIA_F2 pcmpeqd
2834
2835
2836	;;
2837	; Media instruction working on one full sized and one half sized register (lower half).
2838	; Only half a register's worth of source data is read: 32 bits for MMX, 64 bits for SSE.
2839	; @param 1 The instruction
2840	; @param 2 1 if MMX is included, 0 if not.
2841	;
2842	; @param A0 FPU context (fxsave).
2843	; @param A1 Pointer to the first full sized media register operand (input/output).
2844	; @param A2 Pointer to the second half sized media register operand (input).
2845	;
2846	%macro IEMIMPL_MEDIA_F1L1 2
2847	%if %2 != 0
2848	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
2849	PROLOGUE_3_ARGS
2850	IEMIMPL_MMX_PROLOGUE
2851
2852	movq mm0, [A1]	; full 64-bit destination operand
2853	movd mm1, [A2]	; only the low 32 bits of the source are fetched
2854	%1 mm0, mm1
2855	movq [A1], mm0	; write the result back over the first operand
2856
2857	IEMIMPL_MMX_EPILOGUE
2858	EPILOGUE_3_ARGS
2859	ENDPROC iemAImpl_ %+ %1 %+ _u64
2860	%endif
2861
2862	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
2863	PROLOGUE_3_ARGS
2864	IEMIMPL_SSE_PROLOGUE
2865
2866	movdqu xmm0, [A1]	; full 128-bit destination operand
2867	movq xmm1, [A2]	; only the low 64 bits of the source are fetched
2868	%1 xmm0, xmm1
2869	movdqu [A1], xmm0	; write the result back over the first operand
2870
2871	IEMIMPL_SSE_EPILOGUE
2872	EPILOGUE_3_ARGS
2873	ENDPROC iemAImpl_ %+ %1 %+ _u128
2874	%endmacro
2875
2876	IEMIMPL_MEDIA_F1L1 punpcklbw, 1
2877	IEMIMPL_MEDIA_F1L1 punpcklwd, 1
2878	IEMIMPL_MEDIA_F1L1 punpckldq, 1
2879	IEMIMPL_MEDIA_F1L1 punpcklqdq, 0	; no MMX form, so only the _u128 helper is generated
2880
2881
2882	;;
2883	; Media instruction working on one full sized and one half sized register (high half).
2884	; The source is loaded at full width because the instruction consumes its upper half.
2885	; @param 1 The instruction
2886	; @param 2 1 if MMX is included, 0 if not.
2887	;
2888	; @param A0 FPU context (fxsave).
2889	; @param A1 Pointer to the first full sized media register operand (input/output).
2890	; @param A2 Pointer to the second full sized media register operand, where we
2891	; will only use the upper half (input).
2892	;
2893	%macro IEMIMPL_MEDIA_F1H1 2
2894	%if %2 != 0
2895	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
2896	PROLOGUE_3_ARGS
2897	IEMIMPL_MMX_PROLOGUE
2898
2899	movq mm0, [A1]	; full 64-bit destination operand
2900	movq mm1, [A2]	; full 64-bit source; its upper half is what the instruction reads
2901	%1 mm0, mm1
2902	movq [A1], mm0	; write the result back over the first operand
2903
2904	IEMIMPL_MMX_EPILOGUE
2905	EPILOGUE_3_ARGS
2906	ENDPROC iemAImpl_ %+ %1 %+ _u64
2907	%endif
2908
2909	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
2910	PROLOGUE_3_ARGS
2911	IEMIMPL_SSE_PROLOGUE
2912
2913	movdqu xmm0, [A1]	; full 128-bit destination operand
2914	movdqu xmm1, [A2]	; full 128-bit source; its upper half is what the instruction reads
2915	%1 xmm0, xmm1
2916	movdqu [A1], xmm0	; write the result back over the first operand
2917
2918	IEMIMPL_SSE_EPILOGUE
2919	EPILOGUE_3_ARGS
2920	ENDPROC iemAImpl_ %+ %1 %+ _u128
2921	%endmacro
2922
2923	IEMIMPL_MEDIA_F1H1 punpckhbw, 1	; high-half unpacks must use the full-width loader, not the low-half one
2924	IEMIMPL_MEDIA_F1H1 punpckhwd, 1
2925	IEMIMPL_MEDIA_F1H1 punpckhdq, 1
2926	IEMIMPL_MEDIA_F1H1 punpckhqdq, 0	; no MMX form, so only the _u128 helper is generated
2927
2928
2929	;
2930	; Shufflers with evil 8-bit immediates.
2931	; The immediate cannot be supplied in a register, so each helper is followed by 256 tiny stubs (one per immediate value) and calls the matching one.
2932
2933	BEGINPROC_FASTCALL iemAImpl_pshufw, 16	; A1 = destination (in/out), A2 = source, A3 = the 8-bit immediate; A0 is not referenced here
2934	PROLOGUE_4_ARGS
2935	IEMIMPL_MMX_PROLOGUE
2936	and A3, 0ffh	; only the low 8 bits carry the immediate; anything above would send the call outside the stub table
2937	movq mm0, [A1]
2938	movq mm1, [A2]
2939	lea T0, [A3 + A3*4] ; sizeof(pshufw+ret) == 5
2940	lea T1, [.imm0 xWrtRIP]	; address of the first stub
2941	lea T1, [T1 + T0]	; address of stub number A3
2942	call T1	; executes pshufw mm0, mm1, <imm> and returns here
2943	movq [A1], mm0	; write the shuffled value back
2944
2945	IEMIMPL_MMX_EPILOGUE
2946	EPILOGUE_4_ARGS
2947	%assign bImm 0
2948	%rep 256
2949	.imm %+ bImm:
2950	pshufw mm0, mm1, bImm
2951	ret
2952	%assign bImm bImm + 1
2953	%endrep
2954	.immEnd: ; 256*5 == 0x500
2955	dw 0xfaff + (.immEnd - .imm0) ; will cause warning if entries are too big.
2956	dw 0x104ff - (.immEnd - .imm0) ; will cause warning if entries are too small.
2957	ENDPROC iemAImpl_pshufw
2958
2959
2960	%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1	; param 1 = the shuffle instruction; generated helper takes A1 = destination (in/out), A2 = source, A3 = the 8-bit immediate
2961	BEGINPROC_FASTCALL iemAImpl_ %+ %1, 16
2962	PROLOGUE_4_ARGS
2963	IEMIMPL_SSE_PROLOGUE
2964	and A3, 0ffh	; only the low 8 bits carry the immediate; anything above would send the call outside the stub table
2965	movdqu xmm0, [A1]
2966	movdqu xmm1, [A2]
2967	lea T1, [.imm0 xWrtRIP]	; address of the first stub
2968	lea T0, [A3 + A3*2] ; sizeof(pshufXX+ret) == 6: (A3 * 3) * 2
2969	lea T1, [T1 + T0*2]	; address of stub number A3
2970	call T1	; executes the shuffle with the selected immediate and returns here
2971	movdqu [A1], xmm0	; write the shuffled value back
2972
2973	IEMIMPL_SSE_EPILOGUE
2974	EPILOGUE_4_ARGS
2975	%assign bImm 0
2976	%rep 256
2977	.imm %+ bImm:
2978	%1 xmm0, xmm1, bImm
2979	ret
2980	%assign bImm bImm + 1
2981	%endrep
2982	.immEnd: ; 256*6 == 0x600
2983	dw 0xf9ff + (.immEnd - .imm0) ; will cause warning if entries are too big.
2984	dw 0x105ff - (.immEnd - .imm0) ; will cause warning if entries are too small.
2985	ENDPROC iemAImpl_ %+ %1
2986	%endmacro
2987
2988	IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
2989	IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
2990	IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
2991
2992
2993	;
2994	; Move byte mask.
2995	; Both helpers take A1 = pointer to the destination and A2 = pointer to the media register value; A0 is not referenced.
2996
2997	BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 12
2998	PROLOGUE_3_ARGS
2999	IEMIMPL_MMX_PROLOGUE
3000
3001	mov T0, [A1]	; NOTE(review): looks redundant, T0 is overwritten by pmovmskb two lines down - confirm
3002	movq mm1, [A2]	; 64-bit source value
3003	pmovmskb T0, mm1	; collect the top bit of each source byte into T0
3004	mov [A1], T0	; store the mask at native register width
3005	%ifdef RT_ARCH_X86
3006	mov dword [A1 + 4], 0	; on 32-bit builds the store above covers 4 bytes only, so clear the upper dword separately
3007	%endif
3008	IEMIMPL_MMX_EPILOGUE
3009	EPILOGUE_3_ARGS
3010	ENDPROC iemAImpl_pmovmskb_u64
3011
3012	BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 12	; A1 = pointer to the destination, A2 = pointer to the 128-bit source; A0 is not referenced
3013	PROLOGUE_3_ARGS
3014	IEMIMPL_SSE_PROLOGUE
3015
3016	mov T0, [A1]	; NOTE(review): looks redundant, T0 is overwritten by pmovmskb two lines down - confirm
3017	movdqu xmm1, [A2]	; 128-bit source value (unaligned load)
3018	pmovmskb T0, xmm1	; collect the top bit of each source byte into T0
3019	mov [A1], T0	; store the mask at native register width
3020	%ifdef RT_ARCH_X86
3021	mov dword [A1 + 4], 0	; on 32-bit builds the store above covers 4 bytes only, so clear the upper dword separately
3022	%endif
3023	IEMIMPL_SSE_EPILOGUE
3024	EPILOGUE_3_ARGS
3025	ENDPROC iemAImpl_pmovmskb_u128
3026

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 88638

Download in other formats: