IEMAllAImpl.asm@ 74065

Last change on this file since 74065 was 69221, checked in by vboxsync, 7 years ago
VMM: scm cleanups
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 81.7 KB

Line
1	; $Id: IEMAllAImpl.asm 69221 2017-10-24 15:07:46Z vboxsync $
2	;; @file
3	; IEM - Instruction Implementation in Assembly.
4	;
5
6	;
7	; Copyright (C) 2011-2017 Oracle Corporation
8	;
9	; This file is part of VirtualBox Open Source Edition (OSE), as
10	; available from http://www.virtualbox.org. This file is free software;
11	; you can redistribute it and/or modify it under the terms of the GNU
12	; General Public License (GPL) as published by the Free Software
13	; Foundation, in version 2 as it comes in the "COPYING" file of the
14	; VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15	; hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16	;
17
18
19	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
20	; Header Files ;
21	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
22	%include "VBox/asmdefs.mac"
23	%include "VBox/err.mac"
24	%include "iprt/x86.mac"
25
26
27	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
28	; Defined Constants And Macros ;
29	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
30
;;
; Return from a fastcall function (RET imm16 / plain RET wrapper).
;
; Only the 32-bit Windows configuration pops the stack arguments as part of
; the return; all other configurations emit a plain RET and ignore %1.
;
; @param 1   The number of stack argument bytes to pop (32-bit Windows only).
;
%macro RET_FASTCALL 1
 %ifdef RT_ARCH_X86
  %ifndef RT_OS_WINDOWS
        ret
  %else
        ret     %1
  %endif
 %else
        ret
 %endif
%endmacro
45
;;
; Produces the assembler level symbol name of a fastcall function.
;
; On 32-bit Windows the name is decorated as <prefix><name>@<cbArgs>; in every
; other configuration it is simply NAME(a_Name).
;
; @param a_Name     The function name (C).
; @param a_cbArgs   The argument byte count used in the decorated name.
; @param a_Prefix   The prefix of the decorated name.
;
;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
;        escaping (or whatever the dollar is good for here).  Thus the ugly
;        prefix argument.
;
%ifdef RT_ARCH_X86
 %ifdef RT_OS_WINDOWS
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
 %else
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
 %endif
%else
 %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
%endif
60
;;
; BEGINPROC for fastcall functions.
;
; Emits the export/global declarations the output format asks for and then
; the entry label of the function.
;
; @param 1   The function name (C).
; @param 2   The argument size on x86.
;
%macro BEGINPROC_FASTCALL 2
 %ifdef ASM_FORMAT_PE
        export  %1=NAME_FASTCALL(%1,%2,$@)
 %endif
 %ifdef ASM_FORMAT_OMF
  %ifdef __NASM__
        export  NAME(%1) NAME_FASTCALL(%1,%2,$@)
  %endif
 %endif
 %ifndef ASM_FORMAT_BIN
        global  NAME_FASTCALL(%1,%2,$@)
 %endif
NAME_FASTCALL(%1,%2,@):
%endmacro
81
82
83	;
84	; We employ some macro assembly here to hide the calling convention differences.
85	;
86	%ifdef RT_ARCH_AMD64
;
; AMD64 prologues and epilogues.
;
; The argument registers are used directly (see the A0..A3 aliases), so the
; prologues emit nothing and every epilogue is a plain RET.  The _EX variants
; take the byte count that the 32-bit epilogues pass to RET and ignore it.
;
%macro PROLOGUE_1_ARGS 0
%endmacro
%macro EPILOGUE_1_ARGS 0
        ret
%endmacro
; Accepts an optional byte count: the 32-bit EPILOGUE_1_ARGS_EX requires one,
; so code shared between the two builds must be able to pass it here too,
; while existing parameterless uses keep assembling.
%macro EPILOGUE_1_ARGS_EX 0-1
        ret
%endmacro

%macro PROLOGUE_2_ARGS 0
%endmacro
%macro EPILOGUE_2_ARGS 0
        ret
%endmacro
%macro EPILOGUE_2_ARGS_EX 1
        ret
%endmacro

%macro PROLOGUE_3_ARGS 0
%endmacro
%macro EPILOGUE_3_ARGS 0
        ret
%endmacro
%macro EPILOGUE_3_ARGS_EX 1
        ret
%endmacro

%macro PROLOGUE_4_ARGS 0
%endmacro
%macro EPILOGUE_4_ARGS 0
        ret
%endmacro
%macro EPILOGUE_4_ARGS_EX 1
        ret
%endmacro
122
;
; AMD64 register aliases.
;
; A0..A3 name the four argument registers of the active 64-bit calling
; convention in their 64, 32, 16 and 8 bit widths (no 8-bit alias is defined
; for A3).  T0 and T1 are the scratch registers used by the flag macros.
;

; Arguments when ASM_CALL64_GCC is defined: rdi, rsi, rdx, rcx.
%ifdef ASM_CALL64_GCC
 %define A0         rdi
 %define A0_32      edi
 %define A0_16      di
 %define A0_8       dil

 %define A1         rsi
 %define A1_32      esi
 %define A1_16      si
 %define A1_8       sil

 %define A2         rdx
 %define A2_32      edx
 %define A2_16      dx
 %define A2_8       dl

 %define A3         rcx
 %define A3_32      ecx
 %define A3_16      cx
%endif

; Arguments when ASM_CALL64_MSC is defined: rcx, rdx, r8, r9.
%ifdef ASM_CALL64_MSC
 %define A0         rcx
 %define A0_32      ecx
 %define A0_16      cx
 %define A0_8       cl

 %define A1         rdx
 %define A1_32      edx
 %define A1_16      dx
 %define A1_8       dl

 %define A2         r8
 %define A2_32      r8d
 %define A2_16      r8w
 %define A2_8       r8b

 %define A3         r9
 %define A3_32      r9d
 %define A3_16      r9w
%endif

; Scratch registers, identical for both conventions.
%define T0          rax
%define T0_32       eax
%define T0_16       ax
%define T0_8        al

%define T1          r11
%define T1_32       r11d
%define T1_16       r11w
%define T1_8        r11b
174
175	%else
176	; x86
;
; 32-bit prologues and epilogues.
;
; The first two arguments are used from ecx and edx (A0, A1); the third and
; fourth are loaded from the stack into ebx and esi (A2, A3).  edi is saved
; by every prologue because it doubles as the T1 scratch register.  The
; epilogues restore what the matching prologue saved and pop the stack
; arguments with RET imm16.
;

; One argument: only edi (T1) needs preserving, nothing to pop on return.
%macro PROLOGUE_1_ARGS 0
        push    edi
%endmacro
%macro EPILOGUE_1_ARGS 0
        pop     edi
        ret     0
%endmacro
%macro EPILOGUE_1_ARGS_EX 1
        pop     edi
        ret     %1
%endmacro

; Two arguments: same register usage as the one argument case.
%macro PROLOGUE_2_ARGS 0
        push    edi
%endmacro
%macro EPILOGUE_2_ARGS 0
        pop     edi
        ret     0
%endmacro
%macro EPILOGUE_2_ARGS_EX 1
        pop     edi
        ret     %1
%endmacro

; Three arguments: the third one is fetched from the stack into ebx (A2).
%macro PROLOGUE_3_ARGS 0
        push    ebx
        push    edi
        mov     ebx, [esp + 8 + 4]      ; skip the two saved registers and the return address.
%endmacro
%macro EPILOGUE_3_ARGS_EX 1
 %if (%1) < 4
  %error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
 %endif
        pop     edi
        pop     ebx
        ret     %1
%endmacro
%macro EPILOGUE_3_ARGS 0
        EPILOGUE_3_ARGS_EX 4
%endmacro

; Four arguments: the third and fourth are fetched into ebx (A2) and esi (A3).
%macro PROLOGUE_4_ARGS 0
        push    ebx
        push    edi
        push    esi
        mov     esi, [esp + 12 + 4 + 4] ; fourth argument.
        mov     ebx, [esp + 12 + 4 + 0] ; third argument.
%endmacro
%macro EPILOGUE_4_ARGS_EX 1
 %if (%1) < 8
  %error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
 %endif
        pop     esi
        pop     edi
        pop     ebx
        ret     %1
%endmacro
%macro EPILOGUE_4_ARGS 0
        EPILOGUE_4_ARGS_EX 8
%endmacro
237
;
; 32-bit register aliases.
;
; A0/A1 are the register arguments (ecx, edx); A2/A3 are the registers the
; PROLOGUE_3_ARGS / PROLOGUE_4_ARGS macros load the stack arguments into
; (ebx, esi).  T0/T1 are the scratch registers (eax, edi).  No 8-bit alias is
; defined for A3 or T1.
;
%define A0          ecx
%define A0_32       ecx
%define A0_16       cx
%define A0_8        cl

%define A1          edx
%define A1_32       edx
%define A1_16       dx
%define A1_8        dl

%define A2          ebx
%define A2_32       ebx
%define A2_16       bx
%define A2_8        bl

%define A3          esi
%define A3_32       esi
%define A3_16       si

%define T0          eax
%define T0_32       eax
%define T0_16       ax
%define T0_8        al

%define T1          edi
%define T1_32       edi
%define T1_16       di
265	%endif
266
267
;;
; Loads the modified (%2) and undefined (%3) guest flags from [%1] into the
; host EFLAGS, keeping all other host flag bits as they are.
;
; Note that the '%if (%3) != 0' guard is commented out, so the load is
; currently emitted unconditionally.
;
; @remarks  Clobbers T0, stack.  Changes EFLAGS.
; @param 1  The register pointing to the guest eflags.
; @param 2  The set of modified flags.
; @param 3  The set of undefined flags.
;
%macro IEM_MAYBE_LOAD_FLAGS 3
        ;%if (%3) != 0
        pushf                                   ; host flags -> [xSP]
        and     dword [xSP], ~(%2 | %3)         ; drop the host copies of the flags in question.
        mov     T0_32, [%1]                     ; fetch the guest flags...
        and     T0_32, (%2 | %3)                ; ...and keep only the flags in question.
        or      [xSP], T0                       ; combine guest and host bits.
        popf                                    ; make the mix the current flags.
        ;%endif
%endmacro
287
;;
; Stores the modified (%2) and undefined (%3) bits of the current host EFLAGS
; into the guest eflags at [%1]; all other guest flag bits are preserved.
; Nothing is emitted when both masks are zero.
;
; @remarks  Clobbers T0, T1, stack.
; @param 1  The register pointing to the EFLAGS.
; @param 2  The mask of modified flags to save.
; @param 3  The mask of undefined flags to (maybe) save.
;
%macro IEM_SAVE_FLAGS 3
 %if (%2 | %3) != 0
        mov     T0_32, [%1]                     ; guest flags (MOV leaves EFLAGS alone).
        pushf
        pop     T1                              ; T1 = host flags produced by the instruction.
        and     T1_32, (%2 | %3)                ; the bits to take over from the host...
        and     T0_32, ~(%2 | %3)               ; ...replace these bits in the guest value.
        or      T0_32, T1_32
        mov     [%1], T0_32                     ; write the result back.
 %endif
%endmacro
307
308
;;
; Generates the workers for a binary operator instruction.
;
; Produces iemAImpl_<instr>_u8, _u16 and _u32, plus _u64 on AMD64 hosts (on
; 32-bit hosts the 64-bit access requires hand coding), and optionally the
; corresponding _locked variants.
;
; Each worker takes a pointer to the destination memory operand in A0, the
; source register operand in A1 and a pointer to eflags in A2.
;
; @param 1  The instruction mnemonic.
; @param 2  Non-zero if there should be a locked version.
; @param 3  The modified flags.
; @param 4  The undefined flags.
;
%macro IEMIMPL_BIN_OP 4
BEGINCODE
; Plain variants.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      byte [A0], A1_8
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if (%2) != 0 ; locked versions requested?
; LOCK prefixed variants.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 byte [A0], A1_8
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

;              instr, lock, modified-flags,                                                                           undefined-flags
IEMIMPL_BIN_OP add,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP adc,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sub,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sbb,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP or,   1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP xor,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP and,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP cmp,  0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP test, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
408
409
;;
; Generates the workers for a bit test operator instruction.
;
; Produces iemAImpl_<instr>_u16 and _u32, plus _u64 on AMD64 hosts (on 32-bit
; hosts the 64-bit access requires hand coding), and optionally the
; corresponding _locked variants.  There is no 8-bit variant.
;
; Each worker takes a pointer to the destination memory operand in A0, the
; source register operand in A1 and a pointer to eflags in A2.
;
; @param 1  The instruction mnemonic.
; @param 2  Non-zero if there should be a locked version.
; @param 3  The modified flags.
; @param 4  The undefined flags.
;
%macro IEMIMPL_BIT_OP 4
BEGINCODE
; Plain variants.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if (%2) != 0 ; locked versions requested?
; LOCK prefixed variants.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

;              instr, lock, modified-flags, undefined-flags
IEMIMPL_BIT_OP bt,  0, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
486
;;
; Generates the workers for a bit search operator instruction.
;
; This is the three parameter overload of IEMIMPL_BIT_OP; the assembler picks
; it by parameter count.  It produces iemAImpl_<instr>_u16 and _u32, plus
; _u64 on AMD64 hosts (on 32-bit hosts the 64-bit access requires hand
; coding).
;
; Each worker takes a pointer to the destination operand in A0, the source
; operand value in A1 and a pointer to eflags in A2.  The destination is only
; written when the instruction leaves ZF clear.
;
; @param 1  The instruction mnemonic.
; @param 2  The modified flags.
; @param 3  The undefined flags.
;
%macro IEMIMPL_BIT_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_16, A1_16
        jz      .dst_untouched          ; ZF set: skip the write, leave the destination as it is.
        mov     [A0], T0_16
.dst_untouched:
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_32, A1_32
        jz      .dst_untouched          ; ZF set: skip the write, leave the destination as it is.
        mov     [A0], T0_32
.dst_untouched:
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0, A1
        jz      .dst_untouched          ; ZF set: skip the write, leave the destination as it is.
        mov     [A0], T0
.dst_untouched:
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

;              instr, modified-flags, undefined-flags
IEMIMPL_BIT_OP bsf, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF)
IEMIMPL_BIT_OP bsr, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF)
539
540
;
; Two operand IMUL.  Similar to the binary operators, but different: there is
; no locked variant and the instruction cannot take a memory destination, so
; the product is computed in the A1 register and then stored to [A0].
; The rDX:rAX variant of imul is handled together with mul further down.
;
; Each function takes a pointer to the destination operand in A0, the source
; operand value in A1 and a pointer to eflags in A2.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_imul_two_u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
        imul    A1_16, word [A0]
        mov     [A0], A1_16             ; MOV leaves the flags for IEM_SAVE_FLAGS intact.
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u16

BEGINPROC_FASTCALL iemAImpl_imul_two_u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
        imul    A1_32, dword [A0]
        mov     [A0], A1_32             ; MOV leaves the flags for IEM_SAVE_FLAGS intact.
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u32

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_imul_two_u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
        imul    A1, qword [A0]
        mov     [A0], A1                ; MOV leaves the flags for IEM_SAVE_FLAGS intact.
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_imul_two_u64
%endif ; RT_ARCH_AMD64
574
575
;
; XCHG for memory operands.  This implies locking.  No flag changes.
;
; Each function takes two arguments: A0 points to the memory operand and A1
; points to the register value.  The register value is loaded into T0,
; exchanged with the memory operand and the old memory content is stored
; back through A1.  They all return void.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_xchg_u8, 8
        PROLOGUE_2_ARGS
        mov     T0_8, [A1]              ; T0 = register value.
        xchg    [A0], T0_8              ; swap with memory.
        mov     [A1], T0_8              ; hand back the old memory value.
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8

BEGINPROC_FASTCALL iemAImpl_xchg_u16, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A1]             ; T0 = register value.
        xchg    [A0], T0_16             ; swap with memory.
        mov     [A1], T0_16             ; hand back the old memory value.
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16

BEGINPROC_FASTCALL iemAImpl_xchg_u32, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A1]             ; T0 = register value.
        xchg    [A0], T0_32             ; swap with memory.
        mov     [A1], T0_32             ; hand back the old memory value.
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xchg_u64, 8
        PROLOGUE_2_ARGS
        mov     T0, [A1]                ; T0 = register value.
        xchg    [A0], T0                ; swap with memory.
        mov     [A1], T0                ; hand back the old memory value.
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64
%endif
616
617
;
; XADD for memory operands.
;
; Each function takes three arguments: A0 points to the memory/register
; destination, A1 points to the register operand and A2 points to eflags.
; The register operand is loaded into T0, XADD is executed against [A0] and
; the value XADD leaves in T0 is stored back through A1.  They all return
; void.  The _locked functions differ only in the LOCK prefix.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]
        xadd    [A0], T0_8
        mov     [A1], T0_8
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8

BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]
        xadd    [A0], T0_16
        mov     [A1], T0_16
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16

BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]
        xadd    [A0], T0_32
        mov     [A1], T0_32
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]
        xadd    [A0], T0
        mov     [A1], T0
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64
%endif ; RT_ARCH_AMD64

; LOCK prefixed variants.
BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]
        lock xadd [A0], T0_8
        mov     [A1], T0_8
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]
        lock xadd [A0], T0_16
        mov     [A1], T0_16
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]
        lock xadd [A0], T0_32
        mov     [A1], T0_32
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32_locked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]
        lock xadd [A0], T0
        mov     [A1], T0
        IEM_SAVE_FLAGS       A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64_locked
%endif ; RT_ARCH_AMD64
709
710
;
; CMPXCHG8B.
;
; The instruction has fixed register operands (EDX:EAX is the comparand and
; receives the memory value on mismatch, ECX:EBX is the replacement), which
; collide with the argument registers.  The code is therefore written out
; separately for each calling convention instead of using A0..A3.
;
; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
;                                             uint32_t *pEFlags));
;
; Note! Identical to iemAImpl_cmpxchg16b.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
        ; In: rcx=pu64Dst, rdx=pu64EaxEdx, r8=pu64EbxEcx, r9=pEFlags.
        push    rbx                     ; rbx is needed for the replacement value.

        mov     r10, rcx                ; pu64Dst
        mov     r11, rdx                ; pu64EaxEdx (is also T1)

        mov     ebx, [r8]
        mov     ecx, [r8 + 4]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [r11]
        mov     edx, [r11 + 4]

        lock cmpxchg8b [r10]

        mov     [r11], eax              ; pass EDX:EAX back to the caller.
        mov     [r11 + 4], edx
        IEM_SAVE_FLAGS       r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        ; In: rdi=pu64Dst, rsi=pu64EaxEdx, rdx=pu64EbxEcx, rcx=pEFlags.
        push    rbx                     ; rbx is needed for the replacement value.

        mov     r11, rdx                ; pu64EbxEcx (is also T1)
        mov     r10, rcx                ; pEFlags

        mov     ebx, [r11]
        mov     ecx, [r11 + 4]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [rsi]
        mov     edx, [rsi + 4]

        lock cmpxchg8b [rdi]

        mov     [rsi], eax              ; pass EDX:EAX back to the caller.
        mov     [rsi + 4], edx
        IEM_SAVE_FLAGS       r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
%else
        ; In: ecx=pu64Dst, edx=pu64EaxEdx, stack: pu64EbxEcx, pEFlags.
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64EaxEdx
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags
        mov     ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx

        mov     ebx, [ecx]
        mov     ecx, [ecx + 4]          ; must come last, it overwrites the pointer.
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        mov     [esi], eax              ; pass EDX:EAX back to the caller.
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS       ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8                       ; pops the two stack arguments.
%endif
ENDPROC iemAImpl_cmpxchg8b
800
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
        ; iemAImpl_cmpxchg8b always uses the LOCK prefix (lazy bird), so the
        ; locked variant simply tail-jumps to it with the arguments untouched.
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg8b,16,$@)
ENDPROC iemAImpl_cmpxchg8b_locked
805
806	%ifdef RT_ARCH_AMD64
807
;
; CMPXCHG16B.
;
; The instruction has fixed register operands (RDX:RAX is the comparand and
; receives the memory value on mismatch, RCX:RBX is the replacement), which
; collide with the argument registers.  The code is therefore written out
; separately for each calling convention instead of using A0..A3.
;
; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu1284RaxRdx, PRTUINT128U pu128RbxRcx,
;                                              uint32_t *pEFlags));
;
; Note! Identical to iemAImpl_cmpxchg8b.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
%ifdef ASM_CALL64_MSC
        ; In: rcx=pu128Dst, rdx=pu128RaxRdx, r8=pu128RbxRcx, r9=pEFlags.
        push    rbx                     ; rbx is needed for the replacement value.

        mov     r11, rdx                ; pu128RaxRdx (is also T1)
        mov     r10, rcx                ; pu128Dst

        mov     rbx, [r8]
        mov     rcx, [r8 + 8]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [r11]
        mov     rdx, [r11 + 8]

        lock cmpxchg16b [r10]

        mov     [r11], rax              ; pass RDX:RAX back to the caller.
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS       r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
%else
        ; In: rdi=pu128Dst, rsi=pu128RaxRdx, rdx=pu128RbxRcx, rcx=pEFlags.
        push    rbx                     ; rbx is needed for the replacement value.

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu128RbxRcx (is also T1)

        mov     rbx, [r11]
        mov     rcx, [r11 + 8]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [rsi]
        mov     rdx, [rsi + 8]

        lock cmpxchg16b [rdi]

        ; Write back the full 64-bit halves.  Storing only eax/edx here would
        ; truncate the 128-bit value the instruction returns on a mismatch.
        mov     [rsi], rax
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS       r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

%endif
ENDPROC iemAImpl_cmpxchg16b
867
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
        ; iemAImpl_cmpxchg16b always uses the LOCK prefix (lazy bird), so the
        ; locked variant simply tail-jumps to it with the arguments untouched.
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg16b,16,$@)
ENDPROC iemAImpl_cmpxchg16b_locked
872
873	%endif ; RT_ARCH_AMD64
874
875
;
; CMPXCHG.
;
; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t *puEax, uintX_t uReg, uint32_t *pEFlags));
;
; The accumulator value is loaded from [A1] into al/ax/eax/rax, the
; instruction is executed against [A0] with the A2 register as replacement
; value, and the accumulator is stored back to [A1].
;
; On 32-bit hosts the 64-bit variant receives a pointer to the replacement
; value and is implemented with cmpxchg8b (always LOCK prefixed).
;
; @param 1  The instruction prefix: empty or 'lock'.
; @param 2  The function name suffix: empty or '_locked'.
;
BEGINCODE
%macro IEMIMPL_CMPXCHG 2
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     al, [A1]
        %1 cmpxchg [A0], A2_8
        mov     [A1], al
        IEM_SAVE_FLAGS       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u8 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     ax, [A1]
        %1 cmpxchg [A0], A2_16
        mov     [A1], ax
        IEM_SAVE_FLAGS       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u16 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [A1]
        %1 cmpxchg [A0], A2_32
        mov     [A1], eax
        IEM_SAVE_FLAGS       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u32 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
 %ifdef RT_ARCH_AMD64
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     rax, [A1]
        %1 cmpxchg [A0], A2
        mov     [A1], rax
        IEM_SAVE_FLAGS       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
 %else
        ;
        ; Must use cmpxchg8b here.  See also iemAImpl_cmpxchg8b.
        ;
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64Rax
        mov     ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
        ; ZF clear means the compare failed and EDX:EAX now holds the memory
        ; value; only then do the flags need to be worked out by comparing.
        jnz     .cmpxchg8b_not_equal
        cmp     eax, eax                ; equal: just set the other flags (ZF stays set).
.store:
        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS       ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8

.cmpxchg8b_not_equal:
        ; [esi] still holds the caller's accumulator value at this point.
        cmp     [esi + 4], edx  ;; @todo FIXME - verify 64-bit compare implementation
        jne     .store
        cmp     [esi], eax              ; high halves match, so the low halves must differ (ZF=0).
        jmp     .store

 %endif
ENDPROC iemAImpl_cmpxchg_u64 %+ %2
%endmacro ; IEMIMPL_CMPXCHG

IEMIMPL_CMPXCHG , ,
IEMIMPL_CMPXCHG lock, _locked
973
;;
; Generates the workers for a unary operator instruction.
;
; Produces iemAImpl_<instr>_u8, _u16 and _u32, plus _u64 on AMD64 hosts (on
; 32-bit hosts the 64-bit access requires hand coding).  Every size comes in
; a plain and a _locked variant.
;
; Each worker takes a pointer to the destination memory operand in A0 and a
; pointer to eflags in A1.
;
; @param 1  The instruction mnemonic.
; @param 2  The modified flags.
; @param 3  The undefined flags.
;
%macro IEMIMPL_UNARY_OP 3
BEGINCODE
; 8-bit
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      byte [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 byte [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

; 16-bit
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      word [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 word [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

; 32-bit
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      dword [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 dword [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

 %ifdef RT_ARCH_AMD64
; 64-bit
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      qword [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 qword [A0]
        IEM_SAVE_FLAGS       A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64

%endmacro

;                instr, modified-flags, undefined-flags
IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_UNARY_OP not, 0, 0
1062
1063
;;
; Generates the worker for a memory fence instruction.
;
; The resulting function, iemAImpl_<instr>, executes the instruction and
; returns.  No return value, no operands or anything.
;
; @param 1  The instruction.
;
%macro IEMIMPL_MEM_FENCE 1
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
        %1
        ret
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_MEM_FENCE lfence
IEMIMPL_MEM_FENCE sfence
IEMIMPL_MEM_FENCE mfence
1082
;;
; Alternative memory fence for non-SSE2 hosts.
;
; Uses an XCHG against a scratch stack slot in place of a fence instruction;
; XCHG with a memory operand is performed with locked semantics.  xAX ends up
; with its original value and the stack is left balanced.
;
BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
        push    xAX                     ; scratch slot holding a copy of xAX.
        xchg    xAX, [xSP]              ; swaps xAX with its own copy - the fencing access.
        add     xSP, xCB                ; drop the scratch slot again.
        ret
ENDPROC iemAImpl_alt_mem_fence
1092
1093
1094
;;
; Generates the workers for a shift operation.
;
; Produces iemAImpl_<instr>_u8, _u16 and _u32, plus _u64 on AMD64 hosts (on
; 32-bit hosts the 64-bit access requires hand coding).
;
; Each worker takes a pointer to the destination memory operand in A0, the
; shift count in A1 and a pointer to eflags in A2.
;
; The instruction takes its count in cl.  With ASM_CALL64_GCC the count is
; copied there from A1; otherwise A0 is the xCX register, so A0 and A1 are
; exchanged, which puts the count in cl and the destination pointer in A1.
;
; @param 1  The instruction mnemonic.
; @param 2  The modified flags.
; @param 3  The undefined flags.
;
; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
;
%macro IEMIMPL_SHIFT_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      byte [A0], cl
 %else
        xchg    A1, A0                  ; count -> cl, destination pointer -> A1.
        %1      byte [A1], cl
 %endif
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      word [A0], cl
 %else
        xchg    A1, A0                  ; count -> cl, destination pointer -> A1.
        %1      word [A1], cl
 %endif
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      dword [A0], cl
 %else
        xchg    A1, A0                  ; count -> cl, destination pointer -> A1.
        %1      dword [A1], cl
 %endif
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
  %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      qword [A0], cl
  %else
        xchg    A1, A0                  ; count -> cl, destination pointer -> A1.
        %1      qword [A1], cl
  %endif
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro
1171
1172	IEMIMPL_SHIFT_OP rol, (X86_EFL_OF \| X86_EFL_CF), 0
1173	IEMIMPL_SHIFT_OP ror, (X86_EFL_OF \| X86_EFL_CF), 0
1174	IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF \| X86_EFL_CF), 0
1175	IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF \| X86_EFL_CF), 0
1176	IEMIMPL_SHIFT_OP shl, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_PF \| X86_EFL_CF), (X86_EFL_AF)
1177	IEMIMPL_SHIFT_OP shr, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_PF \| X86_EFL_CF), (X86_EFL_AF)
1178	IEMIMPL_SHIFT_OP sar, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_PF \| X86_EFL_CF), (X86_EFL_AF)
1179
1180
;;
; Generator for the double precision shift helpers (shld / shrd).
;
; Expands to iemAImpl_<mnemonic>_u16 and _u32, plus _u64 when building for
; AMD64 (on 32-bit hosts the 64-bit variant has to be hand coded elsewhere).
;
; Every generated function receives:
;       A0 - pointer to the destination operand (r/m),
;       A1 - the source operand (reg),
;       A2 - the shift count,
;       A3 - pointer to the eflags variable/register.
;
; The count has to be in CL for the instruction.  With ASM_CALL64_GCC this is
; done by temporarily exchanging A3 and A2 around the instruction; otherwise
; A0 and A2 are exchanged and the destination is addressed through A2.  Both
; tricks rely on the register assignments of the calling convention.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
;
%macro IEMIMPL_SHIFT_DBL_OP 3
BEGINCODE
;
; 16-bit operands.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; count -> cl (eflags pointer parked in A2)
        %1      [A0], A1_16, cl
        xchg    A3, A2                  ; undo, A3 is the eflags pointer again
 %else
        xchg    A0, A2                  ; count -> cl, destination pointer -> A2
        %1      [A2], A1_16, cl
 %endif
        IEM_SAVE_FLAGS       A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

;
; 32-bit operands.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; count -> cl (eflags pointer parked in A2)
        %1      [A0], A1_32, cl
        xchg    A3, A2                  ; undo, A3 is the eflags pointer again
 %else
        xchg    A0, A2                  ; count -> cl, destination pointer -> A2
        %1      [A2], A1_32, cl
 %endif
        IEM_SAVE_FLAGS       A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

;
; 64-bit operands, AMD64 hosts only.
;
 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
  %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; count -> cl (eflags pointer parked in A2)
        %1      [A0], A1, cl
        xchg    A3, A2                  ; undo, A3 is the eflags pointer again
  %else
        xchg    A0, A2                  ; count -> cl, destination pointer -> A2
        %1      [A2], A1, cl
  %endif
        IEM_SAVE_FLAGS       A3, %2, %3
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1249
1250
;;
; Generator for the multiplication helpers (mul / imul, one-operand forms).
;
; Expands to iemAImpl_<mnemonic>_u8, _u16 and _u32, plus _u64 when building
; for AMD64 (the 32-bit host version of the latter lives in IEMAllAImplC.cpp).
;
; The 8-bit function only operates on AX, so it takes no DX pointer:
;       A0 - pointer to AX, A1 - the operand, A2 - pointer to eflags.
; The other functions receive:
;       A0 - pointer to rAX, A1 - pointer to rDX, A2 - the operand,
;       A3 - pointer to eflags.
;
; The functions all return 0 so that the caller can share code between the
; mul/imul and the div/idiv implementations.
;
; Outside ASM_CALL64_GCC the rDX pointer (A1) is copied to T1 before the
; multiplication and the high half is stored through T1 afterwards.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
;
%macro IEMIMPL_MUL_OP 3
BEGINCODE
;
; 8-bit: AX = AL * operand.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     al, [A0]
        %1      A1_8
        mov     [A0], ax                ; the whole 16-bit product goes back
        IEM_SAVE_FLAGS       A2, %2, %3
        xor     eax, eax                ; return 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

;
; 16-bit: DX:AX = AX * operand.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     ax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_16
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                  ; keep the rDX pointer in T1
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
        IEM_SAVE_FLAGS       A3, %2, %3
        xor     eax, eax                ; return 0
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

;
; 32-bit: EDX:EAX = EAX * operand.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     eax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                  ; keep the rDX pointer in T1
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
        IEM_SAVE_FLAGS       A3, %2, %3
        xor     eax, eax                ; return 0
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

;
; 64-bit: RDX:RAX = RAX * operand.
;
 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     rax, [A0]
  %ifdef ASM_CALL64_GCC
        %1      A2
        mov     [A0], rax
        mov     [A1], rdx
  %else
        mov     T1, A1                  ; keep the rDX pointer in T1
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
  %endif
        IEM_SAVE_FLAGS       A3, %2, %3
        xor     eax, eax                ; return 0
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_MUL_OP mul,  (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
1346
1347
BEGINCODE
;;
; Worker negating the double-width number held in the 32-bit register pair
; T1:T0 (T1 = high half, T0 = low half), i.e. T1:T0 = 0 - T1:T0.
;
; Plain local label, reached with 'call'; no arguments besides T0/T1.
;
; Works without extra scratch registers: two zeros are pushed and exchanged
; with the registers, leaving the original value on the stack and zero in
; T1:T0, after which the stacked value is subtracted with borrow.
;
; @uses     None (T0,T1)
iemAImpl_negate_T0_T1_u32:
        push    0
        push    0
        xchg    T0_32, [xSP]            ; T0 = 0, stack slot 0 = old low half
        xchg    T1_32, [xSP + xCB]      ; T1 = 0, stack slot 1 = old high half
        sub     T0_32, [xSP]            ; low  = 0 - old low
        sbb     T1_32, [xSP + xCB]      ; high = 0 - old high - borrow
        add     xSP, xCB*2              ; drop both slots
        ret
1361
%ifdef RT_ARCH_AMD64
;;
; Worker negating the double-width number held in the 64-bit register pair
; T1:T0 (T1 = high half, T0 = low half), i.e. T1:T0 = 0 - T1:T0.
;
; Plain local label, reached with 'call'; no arguments besides T0/T1.
;
; Works without extra scratch registers: two zeros are pushed and exchanged
; with the registers, leaving the original value on the stack and zero in
; T1:T0, after which the stacked value is subtracted with borrow.
;
; @uses     None (T0,T1)
iemAImpl_negate_T0_T1_u64:
        push    0
        push    0
        xchg    T0, [xSP]               ; T0 = 0, stack slot 0 = old low half
        xchg    T1, [xSP + xCB]         ; T1 = 0, stack slot 1 = old high half
        sub     T0, [xSP]               ; low  = 0 - old low
        sbb     T1, [xSP + xCB]         ; high = 0 - old high - borrow
        add     xSP, xCB*2              ; drop both slots
        ret
%endif
1376
1377
;;
; Macro for implementing a division operations.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit system where the 64-bit accesses requires hand coding.
;
; The 8-bit function only operates on AX, so it takes no DX pointer.  The other
; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
; pointer to eflags in A3.
;
; The functions all return 0 on success and -1 if a divide error should be
; raised by the caller.  The division instruction is only executed after the
; divisor has been checked for zero and the quotient has been checked to fit,
; so the host instruction should never fault.
;
; Signed range check: both operands are turned into magnitudes and then
; checked like the unsigned case, with one bit less of quotient range.  When
; the signs differ the quotient may additionally reach 2^(width - 1), which is
; what the '(divisor - 1)' special case covers.  The most negative dividend is
; its own negation; it can never produce a quotient that fits, and it is
; rejected explicitly because the shift based comparison below would otherwise
; lose its top bit.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
; @param        4       1 if signed, 0 if unsigned.
;
; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
;
%macro IEMIMPL_DIV_OP 4
BEGINCODE
;
; 8-bit: AX / operand -> AL (quotient), AH (remainder).
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS

        ; div by chainsaw check.
        test    A1_8, A1_8
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A0 + 1], A1_8          ; AH >= divisor means the quotient won't fit in AL.
        jae     .div_overflow
 %else
        mov     T0_16, [A0]             ; T0 = dividend
        mov     T1, A1                  ; T1 = saved divisor (because of missing T1_8 in 32-bit)
        test    A1_8, A1_8
        js      .divisor_negative
        test    T0_16, T0_16
        jns     .both_positive
        neg     T0_16
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        test    T0_16, T0_16            ; neg 0x8000 = 0x8000: magnitude 2^15 never fits,
        js      .div_overflow           ; and 0x8000 >> 7 = 0x100 would compare as 0 below.
        push    T0                      ; Start off like unsigned below.
        shr     T0_16, 7
        cmp     T0_8, A1_8
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_8, 0x7f              ; Special case for covering (divisor - 1).
        cmp     T0_8, A1_8
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A1_8
        test    T0_16, T0_16
        jns     .one_of_each
        neg     T0_16
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        test    T0_16, T0_16            ; neg 0x8000 = 0x8000: magnitude 2^15 never fits,
        js      .div_overflow           ; and 0x8000 >> 7 = 0x100 would compare as 0 below.
        shr     T0_16, 7
        cmp     T0_8, A1_8
        jae     .div_overflow
.div_no_overflow:
        mov     A1, T1                  ; restore divisor
 %endif

        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     ax, [A0]
        %1      A1_8
        mov     [A0], ax
        IEM_SAVE_FLAGS       A2, %2, %3
        xor     eax, eax                ; return 0 (success)

.return:
        EPILOGUE_3_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1                 ; tell the caller to raise a divide error
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u8

;
; 16-bit: DX:AX / operand -> AX (quotient), DX (remainder).
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_16, A2_16
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_16             ; DX >= divisor means the quotient won't fit in AX.
        jae     .div_overflow
 %else
        mov     T0_16, [A1]
        shl     T0_32, 16
        mov     T0_16, [A0]             ; T0 = dividend
        mov     T1, A2                  ; T1 = divisor
        test    T1_16, T1_16
        js      .divisor_negative
        test    T0_32, T0_32
        jns     .both_positive
        neg     T0_32
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        test    T0_32, T0_32            ; neg 0x80000000 = 0x80000000: magnitude 2^31 never fits,
        js      .div_overflow           ; and 0x80000000 >> 15 = 0x10000 would compare as 0 below.
        push    T0                      ; Start off like unsigned below.
        shr     T0_32, 15
        cmp     T0_16, T1_16
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_16, 0x7fff           ; Special case for covering (divisor - 1).
        cmp     T0_16, T1_16
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     T1_16
        test    T0_32, T0_32
        jns     .one_of_each
        neg     T0_32
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        test    T0_32, T0_32            ; neg 0x80000000 = 0x80000000: magnitude 2^31 never fits,
        js      .div_overflow           ; and 0x80000000 >> 15 = 0x10000 would compare as 0 below.
        shr     T0_32, 15
        cmp     T0_16, T1_16
        jae     .div_overflow
.div_no_overflow:
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; divisor -> T1, the instruction takes it from there
        mov     ax, [A0]
        mov     dx, [A1]
        %1      T1_16
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                  ; rDX pointer -> T1
        mov     ax, [A0]
        mov     dx, [T1]
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
        IEM_SAVE_FLAGS       A3, %2, %3
        xor     eax, eax                ; return 0 (success)

.return:
        EPILOGUE_4_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1                 ; tell the caller to raise a divide error
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u16

;
; 32-bit: EDX:EAX / operand -> EAX (quotient), EDX (remainder).
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_32, A2_32
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_32             ; EDX >= divisor means the quotient won't fit in EAX.
        jae     .div_overflow
 %else
        push    A2                      ; save A2 so we modify it (we out of regs on x86).
        mov     T0_32, [A0]             ; T0 = dividend low
        mov     T1_32, [A1]             ; T1 = dividend high
        test    A2_32, A2_32
        js      .divisor_negative
        test    T1_32, T1_32
        jns     .both_positive
        call    iemAImpl_negate_T0_T1_u32
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        test    T1_32, T1_32            ; Negating -2^63 yields -2^63 again: never fits, and the
        js      .div_overflow           ; 'shl T1, 1' below would shift the top bit out.
        push    T0                      ; Start off like unsigned below.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_32, 0x7fffffff       ; Special case for covering (divisor - 1).
        cmp     T0_32, A2_32
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2_32
        test    T1_32, T1_32
        jns     .one_of_each
        call    iemAImpl_negate_T0_T1_u32
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        test    T1_32, T1_32            ; Negating -2^63 yields -2^63 again: never fits, and the
        js      .div_overflow           ; 'shl T1, 1' below would shift the top bit out.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        jae     .div_overflow
.div_no_overflow:
        pop     A2
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; divisor -> T1, the instruction takes it from there
        mov     eax, [A0]
        mov     edx, [A1]
        %1      T1_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                  ; rDX pointer -> T1
        mov     eax, [A0]
        mov     edx, [T1]
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
        IEM_SAVE_FLAGS       A3, %2, %3
        xor     eax, eax                ; return 0 (success)

.return:
        EPILOGUE_4_ARGS

.div_overflow:
 %if %4 != 0
        pop     A2                      ; balance the 'push A2' of the signed range check
 %endif
.div_zero:
        mov     eax, -1                 ; tell the caller to raise a divide error
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u32

;
; 64-bit: RDX:RAX / operand -> RAX (quotient), RDX (remainder).
;
 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2, A2
        jz      .div_zero
  %if %4 == 0
        cmp     [A1], A2                ; RDX >= divisor means the quotient won't fit in RAX.
        jae     .div_overflow
  %else
        push    A2                      ; save A2 so we modify it (we out of regs on x86).
        mov     T0, [A0]                ; T0 = dividend low
        mov     T1, [A1]                ; T1 = dividend high
        test    A2, A2
        js      .divisor_negative
        test    T1, T1
        jns     .both_positive
        call    iemAImpl_negate_T0_T1_u64
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        test    T1, T1                  ; Negating -2^127 yields -2^127 again: never fits, and the
        js      .div_overflow           ; 'shl T1, 1' below would shift the top bit out.
        push    T0                      ; Start off like unsigned below.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        mov     T1, 0x7fffffffffffffff
        and     T0, T1                  ; Special case for covering (divisor - 1).
        cmp     T0, A2
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2
        test    T1, T1
        jns     .one_of_each
        call    iemAImpl_negate_T0_T1_u64
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        test    T1, T1                  ; Negating -2^127 yields -2^127 again: never fits, and the
        js      .div_overflow           ; 'shl T1, 1' below would shift the top bit out.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        jae     .div_overflow
.div_no_overflow:
        pop     A2
  %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
  %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; divisor -> T1, the instruction takes it from there
        mov     rax, [A0]
        mov     rdx, [A1]
        %1      T1
        mov     [A0], rax
        mov     [A1], rdx
  %else
        mov     T1, A1                  ; rDX pointer -> T1
        mov     rax, [A0]
        mov     rdx, [T1]
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
  %endif
        IEM_SAVE_FLAGS       A3, %2, %3
        xor     eax, eax                ; return 0 (success)

.return:
        EPILOGUE_4_ARGS_EX 12

.div_overflow:
  %if %4 != 0
        pop     A2                      ; balance the 'push A2' of the signed range check
  %endif
.div_zero:
        mov     eax, -1                 ; tell the caller to raise a divide error
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_DIV_OP div,  0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
1702
1703
;
; BSWAP. No flag changes.
;
; Each function takes one argument, pointer to the value to bswap
; (input/output).  They all return void.
;

;;
; 16-bit operand size BSWAP.
;
; A 66h operand size prefix byte is emitted by hand in front of the 32-bit
; register BSWAP.  A full 32 bits is loaded and stored, just in case any of
; the upper bits are used.
; NOTE(review): presumably this is done so that the host CPU produces whatever
; result real hardware gives for the 16-bit form - confirm.
;
; @param        A0      Pointer to the value (input/output).
;
BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]             ; load all 32 bits
        db      66h                     ; operand size prefix for the bswap below
        bswap   T0_32
        mov     [A0], T0_32             ; store all 32 bits
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u16
1718
;;
; 32-bit BSWAP: reverses the byte order of the dword A0 points to, in place.
;
; @param        A0      Pointer to the 32-bit value (input/output).
;
BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]             ; fetch
        bswap   T0_32                   ; swap
        mov     [A0], T0_32             ; write back
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u32
1726
;;
; 64-bit BSWAP: reverses the byte order of the qword A0 points to, in place.
;
; AMD64 hosts use one 64-bit BSWAP.  Other hosts byte swap the two dwords
; individually and store them back in exchanged positions.
;
; @param        A0      Pointer to the 64-bit value (input/output).
;
BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
        PROLOGUE_1_ARGS
%ifdef RT_ARCH_AMD64
        mov     T0, [A0]
        bswap   T0
        mov     [A0], T0
%else
        mov     T1, [A0 + 4]            ; high dword
        mov     T0, [A0]                ; low dword
        bswap   T1
        bswap   T0
        mov     [A0], T1                ; swapped high dword becomes the low one
        mov     [A0 + 4], T0            ; swapped low dword becomes the high one
%endif
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u64
1745
1746
;;
; Initialize the FPU for the actual instruction being emulated, this means
; loading parts of the guest's control word and status word.
;
; The current FPU environment is stored at [xSP], its FCW and FSW fields are
; patched, and the result is loaded back:
;   - FCW: the guest value restricted to the exception mask, precision control
;     and rounding control bits.
;   - FSW: the guest condition code bits (C0-C3) merged with the TOP field of
;     the environment just stored.
;
; @uses     T0, T1 and the stack area at [xSP], which the caller must have
;           reserved (the users in this file reserve 20h bytes).
;           NOTE(review): this used to say "24 bytes of stack"; the 32-bit
;           protected mode FNSTENV image (X86FSTENV32P) is 28 bytes per the
;           Intel SDM - confirm.
; @param    1       Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
        fnstenv [xSP]

        ; FCW - for exception, precision and rounding control.
        movzx   T0, word [%1 + X86FXSTATE.FCW]
        and     T0, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx   T1, word [%1 + X86FXSTATE.FSW]
        and     T1, X86_FSW_C_MASK              ; guest condition codes
        movzx   T0, word [xSP + X86FSTENV32P.FSW]
        and     T0, X86_FSW_TOP_MASK            ; TOP of the stored environment
        or      T0, T1
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        fldenv  [xSP]
%endmacro
1772
1773
;;
; Output of the FPU helpers producing one 80-bit result: the value followed by
; the resulting FPU status word.
; NOTE(review): presumably mirrors a C structure of the same name - confirm.
;
; Need to move this as well somewhere better?
;
struc IEMFPURESULT
    .r80Result      resb 10             ; 80-bit result value
    .FSW            resb 2              ; output status word
endstruc
1781
1782
;;
; Output of FPU helpers producing two 80-bit results: the first value, the
; resulting FPU status word, then the second value.
; NOTE(review): presumably mirrors a C structure of the same name - confirm.
;
; Need to move this as well somewhere better?
;
struc IEMFPURESULTTWO
    .r80Result1     resb 10             ; first 80-bit result value
    .FSW            resb 2              ; output status word
    .r80Result2     resb 10             ; second 80-bit result value
endstruc
1791
1792
1793	;
1794	;---------------------- 16-bit signed integer operations ----------------------
1795	;
1796
1797
;;
; Converts a 16-bit signed integer value to a 80-bit floating point one (fpu
; register) using FILD.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 16-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_i16_to_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; room for the FPU environment image

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    word [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]       ; status first, fnclex wipes the exception flags
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in its initial state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_i16_to_r80
1821
1822
;;
; Store a 80-bit floating point value (register) as a 16-bit signed integer
; (memory) using FISTP.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 16-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; room for the FPU environment image

        fninit
        fld     tword [A3]              ; the value goes in before the guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   word [A2]

        fnstsw  word  [A1]

        fninit                          ; leave the FPU in its initial state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i16
1846
1847
;;
; Store a 80-bit floating point value (register) as a 16-bit signed integer
; (memory) with truncation, using FISTTP.
;
; The store is 16 bits wide, matching the destination: a wider store would
; write past the 16-bit output and would judge overflow against the wrong
; integer range.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 16-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; room for the FPU environment image

        fninit
        fld     tword [A3]              ; the value goes in before the guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  word [A2]               ; 16-bit destination, so 'word' and not 'dword'

        fnstsw  word  [A1]

        fninit                          ; leave the FPU in its initial state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i16
1872
1873
;;
; FPU instruction working on one 80-bit and one 16-bit signed integer value.
;
; Generates iemAImpl_<instruction>_r80_by_i16: the 80-bit value is loaded, the
; guest FCW/FSW bits are applied, the instruction is executed with the 16-bit
; memory operand, and the FSW and the 80-bit result are stored.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; room for the FPU environment image

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]       ; status first, fnclex wipes the exception flags
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in its initial state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16 fiadd
IEMIMPL_FPU_R80_BY_I16 fimul
IEMIMPL_FPU_R80_BY_I16 fisub
IEMIMPL_FPU_R80_BY_I16 fisubr
IEMIMPL_FPU_R80_BY_I16 fidiv
IEMIMPL_FPU_R80_BY_I16 fidivr
1910
1911
;;
; FPU instruction working on one 80-bit and one 16-bit signed integer value,
; only returning FSW.
;
; Generates iemAImpl_<instruction>_r80_by_i16: the 80-bit value is loaded, the
; guest FCW/FSW bits are applied, the instruction is executed with the 16-bit
; memory operand, and only the resulting FSW is stored.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; room for the FPU environment image

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]

        fnstsw  word  [A1]

        fninit                          ; leave the FPU in its initial state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16_FSW ficom
1942
1943
1944
1945	;
1946	;---------------------- 32-bit signed integer operations ----------------------
1947	;
1948
1949
;;
; Converts a 32-bit signed integer value to a 80-bit floating point one (fpu
; register) using FILD.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 32-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_i32_to_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; room for the FPU environment image

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    dword [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]       ; status first, fnclex wipes the exception flags
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in its initial state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_i32_to_r80
1973
1974
;;
; Store a 80-bit floating point value (register) as a 32-bit signed integer
; (memory) using FISTP.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 32-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; room for the FPU environment image

        fninit
        fld     tword [A3]              ; the value goes in before the guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   dword [A2]

        fnstsw  word  [A1]

        fninit                          ; leave the FPU in its initial state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i32
1998
1999
;;
; Store a 80-bit floating point value (register) as a 32-bit signed integer
; (memory) with truncation, using FISTTP.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 32-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; room for the FPU environment image

        fninit
        fld     tword [A3]              ; the value goes in before the guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  dword [A2]

        fnstsw  word  [A1]

        fninit                          ; leave the FPU in its initial state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i32
2024
2025
;;
; FPU instruction working on one 80-bit and one 32-bit signed integer value.
;
; Generates iemAImpl_<instruction>_r80_by_i32: the 80-bit value is loaded, the
; guest FCW/FSW bits are applied, the instruction is executed with the 32-bit
; memory operand, and the FSW and the 80-bit result are stored.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; room for the FPU environment image

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]       ; status first, fnclex wipes the exception flags
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in its initial state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32 fiadd
IEMIMPL_FPU_R80_BY_I32 fimul
IEMIMPL_FPU_R80_BY_I32 fisub
IEMIMPL_FPU_R80_BY_I32 fisubr
IEMIMPL_FPU_R80_BY_I32 fidiv
IEMIMPL_FPU_R80_BY_I32 fidivr
2062
2063
;;
; FPU instruction working on one 80-bit and one 32-bit signed integer value,
; only returning FSW.
;
; Generates iemAImpl_<instruction>_r80_by_i32: the 80-bit value is loaded, the
; guest FCW/FSW bits are applied, the instruction is executed with the 32-bit
; memory operand, and only the resulting FSW is stored.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; room for the FPU environment image

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word  [A1]

        fninit                          ; leave the FPU in its initial state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32_FSW ficom
2094
2095
2096
2097	;
2098	;---------------------- 64-bit signed integer operations ----------------------
2099	;
2100
2101
;;
; Converts a 64-bit signed integer value to a 80-bit floating point one (fpu
; register) using FILD.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 64-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_i64_to_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; room for the FPU environment image

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    qword [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]       ; status first, fnclex wipes the exception flags
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in its initial state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_i64_to_r80
2125
2126
;;
; Store a 80-bit floating point value (register) as a 64-bit signed integer
; (memory) using FISTP.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 64-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; room for the FPU environment image

        fninit
        fld     tword [A3]              ; the value goes in before the guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   qword [A2]

        fnstsw  word  [A1]

        fninit                          ; leave the FPU in its initial state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i64
2150
2151
;;
; Store a 80-bit floating point value (register) as a 64-bit signed integer
; (memory) with truncation, using FISTTP.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 64-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; room for the FPU environment image

        fninit
        fld     tword [A3]              ; the value goes in before the guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  qword [A2]

        fnstsw  word  [A1]

        fninit                          ; leave the FPU in its initial state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i64
2176
2177
2178
2179	;
2180	;---------------------- 32-bit floating point operations ----------------------
2181	;
2182
;;
; Converts a 32-bit floating point value to a 80-bit one (fpu register) using
; FLD.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 32-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r32_to_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; room for the FPU environment image

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     dword [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]       ; status first, fnclex wipes the exception flags
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in its initial state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r32_to_r80
2206
2207
;;
; Store a 80-bit floating point value (register) as a 32-bit one (memory)
; using FST.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 32-bit value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; room for the FPU environment image

        fninit
        fld     tword [A3]              ; the value goes in before the guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fst     dword [A2]

        fnstsw  word  [A1]

        fninit                          ; leave the FPU in its initial state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r32
2231
2232
;;
; FPU instruction working on one 80-bit and one 32-bit floating point value.
;
; Generates iemAImpl_<instruction>_r80_by_r32: the 80-bit value is loaded, the
; guest FCW/FSW bits are applied, the instruction is executed with the 32-bit
; memory operand, and the FSW and the 80-bit result are stored.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; room for the FPU environment image

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]       ; status first, fnclex wipes the exception flags
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in its initial state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32 fadd
IEMIMPL_FPU_R80_BY_R32 fmul
IEMIMPL_FPU_R80_BY_R32 fsub
IEMIMPL_FPU_R80_BY_R32 fsubr
IEMIMPL_FPU_R80_BY_R32 fdiv
IEMIMPL_FPU_R80_BY_R32 fdivr
2269
2270
;;
; FPU instruction working on one 80-bit and one 32-bit floating point value,
; only returning FSW.
;
; Generates iemAImpl_<instruction>_r80_by_r32: the 80-bit value is loaded, the
; guest FCW/FSW bits are applied, the instruction is executed with the 32-bit
; memory operand, and only the resulting FSW is stored.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; room for the FPU environment image

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word  [A1]

        fninit                          ; leave the FPU in its initial state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32_FSW fcom
2301
2302
2303
2304	;
2305	;---------------------- 64-bit floating point operations ----------------------
2306	;
2307
;;
; Converts a 64-bit floating point value to a 80-bit one (fpu register) using
; FLD.
;
; Like the other FPU helpers in this file, the FPU is reset with FNINIT before
; the guest FCW/FSW bits are applied, so that the load starts from an empty
; register stack and a known environment rather than from whatever the host
; FPU happened to contain.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 64-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r64_to_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; room for the FPU environment image

        fninit                          ; start from a clean FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     qword [A2]

        fnstsw  word  [A1 + IEMFPURESULT.FSW]       ; status first, fnclex wipes the exception flags
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in its initial state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r64_to_r80
2330
2331
;;
; Store a 80-bit floating point value (register) as a 64-bit one (memory)
; using FST.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 64-bit value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; room for the FPU environment image

        fninit
        fld     tword [A3]              ; the value goes in before the guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fst     qword [A2]

        fnstsw  word  [A1]

        fninit                          ; leave the FPU in its initial state
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r64
2355
2356
;;
; Generates an FPU helper that applies an instruction to one 80-bit value and
; one 64-bit floating point memory operand, returning both FSW and result.
;
; @param    1       The instruction mnemonic (also pasted into the function name).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value (loaded into ST0).
; @param    A3      Pointer to the 64-bit value (read as a qword operand).
;
%macro IEMIMPL_FPU_R80_BY_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; 32 bytes of stack, released again before the epilogue.

        fninit                          ; Begin with a freshly initialised x87 unit.
        fld     tword [A2]              ; ST0 = the 80-bit input.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]              ; Apply the instruction to ST0 and the 64-bit operand.

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Drop pending exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave the x87 unit initialised on exit.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64 fadd
IEMIMPL_FPU_R80_BY_R64 fmul
IEMIMPL_FPU_R80_BY_R64 fsub
IEMIMPL_FPU_R80_BY_R64 fsubr
IEMIMPL_FPU_R80_BY_R64 fdiv
IEMIMPL_FPU_R80_BY_R64 fdivr
2393
;;
; Generates an FPU helper that applies an instruction to one 80-bit value and
; one 64-bit floating point memory operand, handing back nothing but the FSW.
;
; @param    1       The instruction mnemonic (also pasted into the function name).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW (16 bits are written).
; @param    A2      Pointer to the 80-bit value (loaded into ST0).
; @param    A3      Pointer to the 64-bit value (read as a qword operand).
;
%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; 32 bytes of stack, released again before the epilogue.

        fninit                          ; Begin with a freshly initialised x87 unit.
        fld     tword [A2]              ; ST0 = the 80-bit input.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]              ; Apply the instruction to ST0 and the 64-bit operand.

        fnstsw  word  [A1]              ; Only the status word is returned.

        fninit                          ; Leave the x87 unit initialised on exit.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64_FSW fcom
2424
2425
2426
2427	;
2428	;---------------------- 80-bit floating point operations ----------------------
2429	;
2430
;;
; Loads an 80-bit floating point value from memory and hands it back together
; with the FSW produced by the load.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit floating point value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; 32 bytes of stack, released again before the epilogue.

        fninit                          ; Begin with a freshly initialised x87 unit.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     tword [A2]              ; The load under the guest control word.

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Drop pending exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave the x87 unit initialised on exit.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r80
2454
2455
;;
; Stores an 80-bit floating point register image to memory as an 80-bit value.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 80-bit value.
; @param    A3      Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; 32 bytes of stack, released again before the epilogue.

        fninit                          ; Begin with a freshly initialised x87 unit.
        fld     tword [A3]              ; ST0 = the register value to store.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fstp    tword [A2]              ; Store-and-pop form is used for the tword store.

        fnstsw  word  [A1]              ; Status word after the store.

        fninit                          ; Leave the x87 unit initialised on exit.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r80
2479
2480
;;
; Generates an FPU helper that applies an instruction to two 80-bit floating
; point values and returns the FSW plus the value left in ST0.
;
; @param    1       The instruction mnemonic (also pasted into the function name).
; @param    2       The operand list to put after the mnemonic; pass {} for
;                   instructions that take implicit operands.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the first 80-bit value (ST0)
; @param    A3      Pointer to the second 80-bit value (STn).
;
%macro IEMIMPL_FPU_R80_BY_R80 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; 32 bytes of stack, released again before the epilogue.

        fninit                          ; Begin with a freshly initialised x87 unit.
        fld     tword [A3]              ; Pushed first, so it ends up as ST1.
        fld     tword [A2]              ; Pushed last, so it ends up as ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      %2

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Drop pending exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave the x87 unit initialised on exit.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80 fadd,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fmul,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsub,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsubr,  {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdiv,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdivr,  {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fprem,  {}
IEMIMPL_FPU_R80_BY_R80 fprem1, {}
IEMIMPL_FPU_R80_BY_R80 fscale, {}
2521
2522
;;
; Generates an FPU helper for instructions that combine ST1 and ST0, put the
; result in ST1 and then pop the register stack.
;
; @param    1       The instruction mnemonic (also pasted into the function name).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the first 80-bit value (ST1).
; @param    A3      Pointer to the second 80-bit value (ST0).
;
%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; 32 bytes of stack, released again before the epilogue.

        fninit                          ; Begin with a freshly initialised x87 unit.
        fld     tword [A2]              ; Pushed first, so it ends up as ST1.
        fld     tword [A3]              ; Pushed last, so it ends up as ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Drop pending exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result] ; The value on top after the instruction.

        fninit                          ; Leave the x87 unit initialised on exit.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
2558
2559
;;
; Generates an FPU helper that applies an instruction to two 80-bit floating
; point values and hands back nothing but the FSW.
;
; @param    1       The instruction mnemonic (also pasted into the function name).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a uint16_t for the resulting FSW.
; @param    A2      Pointer to the first 80-bit value.
; @param    A3      Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; 32 bytes of stack, released again before the epilogue.

        fninit                          ; Begin with a freshly initialised x87 unit.
        fld     tword [A3]              ; Pushed first, so it ends up as ST1.
        fld     tword [A2]              ; Pushed last, so it ends up as ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st0, st1

        fnstsw  word  [A1]              ; Only the status word is returned.

        fninit                          ; Leave the x87 unit initialised on exit.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_FSW fcom
IEMIMPL_FPU_R80_BY_R80_FSW fucom
2592
2593
;;
; Generates an FPU helper that applies an instruction to two 80-bit floating
; point values, storing the FSW and returning the host EFLAGS in xAX.
;
; @param    1       The instruction mnemonic (also pasted into the function name).
;
; @returns  EFLAGS in EAX.
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a uint16_t for the resulting FSW.
; @param    A2      Pointer to the first 80-bit value.
; @param    A3      Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; 32 bytes of stack, released again before the epilogue.

        fninit                          ; Begin with a freshly initialised x87 unit.
        fld     tword [A3]              ; Pushed first, so it ends up as ST1.
        fld     tword [A2]              ; Pushed last, so it ends up as ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st1

        fnstsw  word  [A1]              ; Status word goes to the caller's buffer ...
        pushf                           ; ... and the flags register is
        pop     xAX                     ;     returned in xAX.

        fninit                          ; Leave the x87 unit initialised on exit.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_EFL fcomi
IEMIMPL_FPU_R80_BY_R80_EFL fucomi
2629
2630
;;
; Generates an FPU helper for an instruction with a single 80-bit floating
; point input, returning the FSW and the value left in ST0.
;
; @param    1       The instruction mnemonic (also pasted into the function name).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; 32 bytes of stack, released again before the epilogue.

        fninit                          ; Begin with a freshly initialised x87 unit.
        fld     tword [A2]              ; ST0 = the input value.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Drop pending exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave the x87 unit initialised on exit.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80 fchs
IEMIMPL_FPU_R80 fabs
IEMIMPL_FPU_R80 f2xm1
IEMIMPL_FPU_R80 fsqrt
IEMIMPL_FPU_R80 frndint
IEMIMPL_FPU_R80 fsin
IEMIMPL_FPU_R80 fcos
2667
2668
;;
; Generates an FPU helper for an instruction with a single 80-bit floating
; point input that hands back nothing but the FSW.
;
; @param    1       The instruction mnemonic (also pasted into the function name).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a uint16_t for the resulting FSW.
; @param    A2      Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; 32 bytes of stack, released again before the epilogue.

        fninit                          ; Begin with a freshly initialised x87 unit.
        fld     tword [A2]              ; ST0 = the value to examine.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        fnstsw  word  [A1]              ; Only the status word is returned.

        fninit                          ; Leave the x87 unit initialised on exit.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80_FSW ftst
IEMIMPL_FPU_R80_FSW fxam
2699
2700
2701
;;
; Generates an FPU helper for an instruction that loads an 80-bit floating
; point constant, returning the FSW and the loaded value.
;
; The generated function is named iemAImpl_<instruction>; there is no suffix,
; so the ENDPROC name must not end with a dangling token-paste operator.
;
; @param    1       The instruction mnemonic (also pasted into the function name).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
;
%macro IEMIMPL_FPU_R80_CONST 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
        PROLOGUE_2_ARGS
        sub     xSP, 20h                ; 32 bytes of stack, released again before the epilogue.

        fninit                          ; Begin with a freshly initialised x87 unit.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1                              ; Pushes the constant onto the register stack.

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Drop pending exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave the x87 unit initialised on exit.
        add     xSP, 20h
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_FPU_R80_CONST fld1
IEMIMPL_FPU_R80_CONST fldl2t
IEMIMPL_FPU_R80_CONST fldl2e
IEMIMPL_FPU_R80_CONST fldpi
IEMIMPL_FPU_R80_CONST fldlg2
IEMIMPL_FPU_R80_CONST fldln2
IEMIMPL_FPU_R80_CONST fldz
2736
2737
;;
; Generates an FPU helper for an instruction that takes one 80-bit floating
; point value and produces two.
;
; @param    1       The instruction mnemonic (also pasted into the function name).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULTTWO for the output.
; @param    A2      Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; 32 bytes of stack, released again before the epilogue.

        fninit                          ; Begin with a freshly initialised x87 unit.
        fld     tword [A2]              ; ST0 = the input value.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        fnstsw  word  [A1 + IEMFPURESULTTWO.FSW]
        fnclex                          ; Drop pending exceptions before each store.
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result2] ; Top of stack is the second result ...
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result1] ; ... and the entry below it the first.

        fninit                          ; Leave the x87 unit initialised on exit.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
%endmacro

IEMIMPL_FPU_R80_R80 fptan
IEMIMPL_FPU_R80_R80 fxtract
IEMIMPL_FPU_R80_R80 fsincos
2772
2773
2774
2775
2776	;---------------------- SSE and MMX Operations ----------------------
2777
;;
; Hooks placed around the body of every MMX helper.  Both currently expand to
; nothing.
;
; @todo what do we need to do for MMX?
;
%macro IEMIMPL_MMX_PROLOGUE 0
%endmacro
%macro IEMIMPL_MMX_EPILOGUE 0
%endmacro

;;
; Hooks placed around the body of every SSE helper.  Both currently expand to
; nothing.
;
; @todo what do we need to do for SSE?
;
%macro IEMIMPL_SSE_PROLOGUE 0
%endmacro
%macro IEMIMPL_SSE_EPILOGUE 0
%endmacro
2789
2790
;;
; Generates the 64-bit (MMX) and 128-bit (SSE) helpers for a media instruction
; that operates on two full sized registers.
;
; @param    1       The instruction mnemonic (also pasted into the function names).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to the first media register size operand (input/output).
; @param    A2      Pointer to the second media register size operand (input).
;
%macro IEMIMPL_MEDIA_F2 1
; MMX flavour: 64-bit operands go through mm0/mm1.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]               ; destination operand
        movq    mm1, [A2]               ; source operand
        %1      mm0, mm1
        movq    [A1], mm0               ; write the result back in place

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

; SSE flavour: 128-bit operands go through xmm0/xmm1, using unaligned moves.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]              ; destination operand
        movdqu  xmm1, [A2]              ; source operand
        %1      xmm0, xmm1
        movdqu  [A1], xmm0              ; write the result back in place

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F2 pxor
IEMIMPL_MEDIA_F2 pcmpeqb
IEMIMPL_MEDIA_F2 pcmpeqw
IEMIMPL_MEDIA_F2 pcmpeqd
2832
2833
;;
; Generates the helpers for a media instruction that combines one full sized
; register with the lower half of a second one.  Only the lower half of the
; source is read from memory.
;
; @param    1       The instruction mnemonic (also pasted into the function names).
; @param    2       1 if MMX is included, 0 if not.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to the first full sized media register operand (input/output).
; @param    A2      Pointer to the second half sized media register operand (input).
;
%macro IEMIMPL_MEDIA_F1L1 2
 %if %2 != 0
; MMX flavour: 64-bit destination, 32-bit source.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]               ; full destination operand
        movd    mm1, [A2]               ; half sized source: 32 bits
        %1      mm0, mm1
        movq    [A1], mm0               ; write the result back in place

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

; SSE flavour: 128-bit destination, 64-bit source.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]              ; full destination operand
        movq    xmm1, [A2]              ; half sized source: 64 bits
        %1      xmm0, xmm1
        movdqu  [A1], xmm0              ; write the result back in place

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F1L1 punpcklbw,  1
IEMIMPL_MEDIA_F1L1 punpcklwd,  1
IEMIMPL_MEDIA_F1L1 punpckldq,  1
IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
2878
2879
;;
; Media instruction working on one full sized and one half sized register (high half).
;
; The source operand is loaded in full, since the instruction consumes its
; upper half.  The punpckh* helpers below must therefore be generated by this
; macro and not by IEMIMPL_MEDIA_F1L1, which only loads the lower half of the
; source and would leave the half that is actually used zeroed.
;
; @param    1       The instruction
; @param    2       1 if MMX is included, 0 if not.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to the first full sized media register operand (input/output).
; @param    A2      Pointer to the second full sized media register operand, where we
;                   will only use the upper half (input).
;
%macro IEMIMPL_MEDIA_F1H1 2
 %if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]               ; full destination operand
        movq    mm1, [A2]               ; full source operand (upper half is what gets used)
        %1      mm0, mm1
        movq    [A1], mm0               ; write the result back in place

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]              ; full destination operand
        movdqu  xmm1, [A2]              ; full source operand (upper half is what gets used)
        %1      xmm0, xmm1
        movdqu  [A1], xmm0              ; write the result back in place

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F1H1 punpckhbw,  1
IEMIMPL_MEDIA_F1H1 punpckhwd,  1
IEMIMPL_MEDIA_F1H1 punpckhdq,  1
IEMIMPL_MEDIA_F1H1 punpckhqdq, 0
2925
2926
2927	;
2928	; Shufflers with evil 8-bit immediates.
2929	;
2930
;;
; PSHUFW helper.
;
; The shuffle control is an instruction immediate, so one 'pshufw + ret' stub
; is emitted per possible value (256 in total) and the right one is called
; through a computed address: .imm0 + imm * 5.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to the 64-bit destination operand (input/output).
; @param    A2      Pointer to the 64-bit source operand (input).
; @param    A3      The 8-bit immediate.  Only the low 8 bits are used; the
;                   rest of the register is masked off since the stub table
;                   only has 256 entries.
;
BEGINPROC_FASTCALL iemAImpl_pshufw, 16
        PROLOGUE_4_ARGS
        IEMIMPL_MMX_PROLOGUE

        and     A3, 0ffh                ; never index past the 256 stubs below
        movq    mm0, [A1]
        movq    mm1, [A2]
        lea     T0, [A3 + A3*4]         ; sizeof(pshufw+ret) == 5
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0]
        call    T1
        movq    [A1], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pshufw  mm0, mm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*5 == 0x500
dw 0xfaff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_pshufw
2956
2957
;;
; Generates a helper for an SSE shuffle instruction taking an 8-bit immediate.
;
; The shuffle control is an instruction immediate, so one '<instr> + ret' stub
; is emitted per possible value (256 in total) and the right one is called
; through a computed address: .imm0 + imm * 6.
;
; @param    1       The instruction mnemonic (also pasted into the function name).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to the 128-bit destination operand (input/output).
; @param    A2      Pointer to the 128-bit source operand (input).
; @param    A3      The 8-bit immediate.  Only the low 8 bits are used; the
;                   rest of the register is masked off since the stub table
;                   only has 256 entries.
;
%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        and     A3, 0ffh                ; never index past the 256 stubs below
        movdqu  xmm0, [A1]
        movdqu  xmm1, [A2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*2]         ; sizeof(pshufXX+ret) == 6: (A3 * 3) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        movdqu  [A1], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
2989
2990
2991	;
2992	; Move byte mask.
2993	;
2994
;;
; PMOVMSKB with a 64-bit (MMX) source.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to the 64-bit destination (general register image).
; @param    A2      Pointer to the 64-bit source operand.
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        mov     T0, [A1]                ; current destination value
        movq    mm1, [A2]               ; source operand
        pmovmskb T0, mm1                ; gather the byte sign bits into T0
        mov     [A1], T0
%ifdef RT_ARCH_X86
        mov     dword [A1 + 4], 0       ; 32-bit hosts: zero the upper destination dword separately
%endif
        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_pmovmskb_u64
3009
;;
; PMOVMSKB with a 128-bit (SSE) source.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to the 64-bit destination (general register image).
; @param    A2      Pointer to the 128-bit source operand.
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        mov     T0, [A1]                ; current destination value
        movdqu  xmm1, [A2]              ; source operand, unaligned load
        pmovmskb T0, xmm1               ; gather the byte sign bits into T0
        mov     [A1], T0
%ifdef RT_ARCH_X86
        mov     dword [A1 + 4], 0       ; 32-bit hosts: zero the upper destination dword separately
%endif
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_pmovmskb_u128
3024

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 74065

Download in other formats: