VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 93926

Last change on this file since 93926 was 93906, checked in by vboxsync, 3 years ago

IEM: Implemented fbstp instruction (used by OLE and indirectly MS Word 6.0 and similar).

1; $Id: IEMAllAImpl.asm 93906 2022-02-24 10:28:32Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2022 Oracle Corporation
8;
9; This file is part of VirtualBox Open Source Edition (OSE), as
10; available from http://www.virtualbox.org. This file is free software;
11; you can redistribute it and/or modify it under the terms of the GNU
12; General Public License (GPL) as published by the Free Software
13; Foundation, in version 2 as it comes in the "COPYING" file of the
14; VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15; hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16;
17
18
19;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
20; Header Files ;
21;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
22%include "VBox/asmdefs.mac"
23%include "VBox/err.mac"
24%include "iprt/x86.mac"
25
26
27;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
28; Defined Constants And Macros ;
29;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
30
31;;
32; RET XX / RET wrapper for fastcall.
33;
34%macro RET_FASTCALL 1
35%ifdef RT_ARCH_X86
36 %ifdef RT_OS_WINDOWS
37 ret %1
38 %else
39 ret
40 %endif
41%else
42 ret
43%endif
44%endmacro
45
46;;
47; NAME for fastcall functions.
48;
49;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
50; escaping (or whatever the dollar is good for here). Thus the ugly
51; prefix argument.
52;
53%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
54%ifdef RT_ARCH_X86
55 %ifdef RT_OS_WINDOWS
56 %undef NAME_FASTCALL
57 %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
58 %endif
59%endif
60
61;;
62; BEGINPROC for fastcall functions.
63;
64; @param 1 The function name (C).
65; @param 2 The argument size on x86.
66;
67%macro BEGINPROC_FASTCALL 2
68 %ifdef ASM_FORMAT_PE
69 export %1=NAME_FASTCALL(%1,%2,$@)
70 %endif
71 %ifdef __NASM__
72 %ifdef ASM_FORMAT_OMF
73 export NAME(%1) NAME_FASTCALL(%1,%2,$@)
74 %endif
75 %endif
76 %ifndef ASM_FORMAT_BIN
77 global NAME_FASTCALL(%1,%2,$@)
78 %endif
79NAME_FASTCALL(%1,%2,@):
80%endmacro
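;
; Editor's illustration (not part of the upstream source): a sketch of what the
; fastcall name decoration amounts to.  On a 32-bit Windows host an invocation
; such as
;       BEGINPROC_FASTCALL iemAImpl_add_u32, 12
; defines and exports a symbol along the lines of
;       global @iemAImpl_add_u32@12
;       @iemAImpl_add_u32@12:
; while on 64-bit hosts and non-Windows 32-bit hosts NAME_FASTCALL falls back
; to the plain NAME() label without the @-prefix and argument-size suffix.
;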
81
82
83;
84 ; We employ some macro assembly here to hide the calling convention differences.
85;
86%ifdef RT_ARCH_AMD64
87 %macro PROLOGUE_1_ARGS 0
88 %endmacro
89 %macro EPILOGUE_1_ARGS 0
90 ret
91 %endmacro
92 %macro EPILOGUE_1_ARGS_EX 0
93 ret
94 %endmacro
95
96 %macro PROLOGUE_2_ARGS 0
97 %endmacro
98 %macro EPILOGUE_2_ARGS 0
99 ret
100 %endmacro
101 %macro EPILOGUE_2_ARGS_EX 1
102 ret
103 %endmacro
104
105 %macro PROLOGUE_3_ARGS 0
106 %endmacro
107 %macro EPILOGUE_3_ARGS 0
108 ret
109 %endmacro
110 %macro EPILOGUE_3_ARGS_EX 1
111 ret
112 %endmacro
113
114 %macro PROLOGUE_4_ARGS 0
115 %endmacro
116 %macro EPILOGUE_4_ARGS 0
117 ret
118 %endmacro
119 %macro EPILOGUE_4_ARGS_EX 1
120 ret
121 %endmacro
122
123 %ifdef ASM_CALL64_GCC
124 %define A0 rdi
125 %define A0_32 edi
126 %define A0_16 di
127 %define A0_8 dil
128
129 %define A1 rsi
130 %define A1_32 esi
131 %define A1_16 si
132 %define A1_8 sil
133
134 %define A2 rdx
135 %define A2_32 edx
136 %define A2_16 dx
137 %define A2_8 dl
138
139 %define A3 rcx
140 %define A3_32 ecx
141 %define A3_16 cx
142 %endif
143
144 %ifdef ASM_CALL64_MSC
145 %define A0 rcx
146 %define A0_32 ecx
147 %define A0_16 cx
148 %define A0_8 cl
149
150 %define A1 rdx
151 %define A1_32 edx
152 %define A1_16 dx
153 %define A1_8 dl
154
155 %define A2 r8
156 %define A2_32 r8d
157 %define A2_16 r8w
158 %define A2_8 r8b
159
160 %define A3 r9
161 %define A3_32 r9d
162 %define A3_16 r9w
163 %endif
164
165 %define T0 rax
166 %define T0_32 eax
167 %define T0_16 ax
168 %define T0_8 al
169
170 %define T1 r11
171 %define T1_32 r11d
172 %define T1_16 r11w
173 %define T1_8 r11b
174
175%else
176 ; x86
177 %macro PROLOGUE_1_ARGS 0
178 push edi
179 %endmacro
180 %macro EPILOGUE_1_ARGS 0
181 pop edi
182 ret 0
183 %endmacro
184 %macro EPILOGUE_1_ARGS_EX 1
185 pop edi
186 ret %1
187 %endmacro
188
189 %macro PROLOGUE_2_ARGS 0
190 push edi
191 %endmacro
192 %macro EPILOGUE_2_ARGS 0
193 pop edi
194 ret 0
195 %endmacro
196 %macro EPILOGUE_2_ARGS_EX 1
197 pop edi
198 ret %1
199 %endmacro
200
201 %macro PROLOGUE_3_ARGS 0
202 push ebx
203 mov ebx, [esp + 4 + 4]
204 push edi
205 %endmacro
206 %macro EPILOGUE_3_ARGS_EX 1
207 %if (%1) < 4
208 %error "With three args, at least 4 bytes must be removed from the stack upon return (32-bit)."
209 %endif
210 pop edi
211 pop ebx
212 ret %1
213 %endmacro
214 %macro EPILOGUE_3_ARGS 0
215 EPILOGUE_3_ARGS_EX 4
216 %endmacro
217
218 %macro PROLOGUE_4_ARGS 0
219 push ebx
220 push edi
221 push esi
222 mov ebx, [esp + 12 + 4 + 0]
223 mov esi, [esp + 12 + 4 + 4]
224 %endmacro
225 %macro EPILOGUE_4_ARGS_EX 1
226 %if (%1) < 8
227 %error "With four args, at least 8 bytes must be removed from the stack upon return (32-bit)."
228 %endif
229 pop esi
230 pop edi
231 pop ebx
232 ret %1
233 %endmacro
234 %macro EPILOGUE_4_ARGS 0
235 EPILOGUE_4_ARGS_EX 8
236 %endmacro
237
238 %define A0 ecx
239 %define A0_32 ecx
240 %define A0_16 cx
241 %define A0_8 cl
242
243 %define A1 edx
244 %define A1_32 edx
245 %define A1_16 dx
246 %define A1_8 dl
247
248 %define A2 ebx
249 %define A2_32 ebx
250 %define A2_16 bx
251 %define A2_8 bl
252
253 %define A3 esi
254 %define A3_32 esi
255 %define A3_16 si
256
257 %define T0 eax
258 %define T0_32 eax
259 %define T0_16 ax
260 %define T0_8 al
261
262 %define T1 edi
263 %define T1_32 edi
264 %define T1_16 di
265%endif
266
267
268;;
269; Load the relevant flags from [%1] if there are undefined flags (%3).
270;
271; @remarks Clobbers T0, stack. Changes EFLAGS.
272; @param A2 The register pointing to the flags.
273; @param 1 The parameter (A0..A3) pointing to the eflags.
274; @param 2 The set of modified flags.
275; @param 3 The set of undefined flags.
276;
277%macro IEM_MAYBE_LOAD_FLAGS 3
278 ;%if (%3) != 0
279 pushf ; store current flags
280 mov T0_32, [%1] ; load the guest flags
281 and dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
282 and T0_32, (%2 | %3) ; select the modified and undefined flags.
283 or [xSP], T0 ; merge guest flags with host flags.
284 popf ; load the mixed flags.
285 ;%endif
286%endmacro
287
288;;
289; Update the flags.
290;
291; @remarks Clobbers T0, T1, stack.
292; @param 1 The register pointing to the EFLAGS.
293; @param 2 The mask of modified flags to save.
294; @param 3 The mask of undefined flags to (maybe) save.
295;
296%macro IEM_SAVE_FLAGS 3
297 %if (%2 | %3) != 0
298 pushf
299 pop T1
300 mov T0_32, [%1] ; flags
301 and T0_32, ~(%2 | %3) ; clear the modified & undefined flags.
302 and T1_32, (%2 | %3) ; select the modified and undefined flags.
303 or T0_32, T1_32 ; combine the flags.
304 mov [%1], T0_32 ; save the flags.
305 %endif
306%endmacro
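;
; Editor's worked sketch of how the two macros above pair up (assuming %1/A2
; points at the guest EFLAGS variable, using the 'add' masks as an example):
;
;       IEM_MAYBE_LOAD_FLAGS A2, <modified>, <undefined>
;           ; host EFLAGS = (host EFLAGS & ~mask) | (guest EFLAGS & mask)
;       add     dword [A0], A1_32
;       IEM_SAVE_FLAGS       A2, <modified>, <undefined>
;           ; guest EFLAGS = (guest EFLAGS & ~mask) | (host EFLAGS & mask)
;
; where mask = <modified> | <undefined>.  Only the bits in the mask travel
; between the guest flags variable and the real CPU flags; all other guest
; flag bits are preserved as-is.
;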
307
308
309;;
310; Macro for implementing a binary operator.
311;
312; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
313; variants, except on 32-bit systems where the 64-bit accesses require hand
314; coding.
315;
316; All the functions take a pointer to the destination memory operand in A0,
317; the source register operand in A1 and a pointer to eflags in A2.
318;
319; @param 1 The instruction mnemonic.
320; @param 2 Non-zero if there should be a locked version.
321; @param 3 The modified flags.
322; @param 4 The undefined flags.
323;
324%macro IEMIMPL_BIN_OP 4
325BEGINCODE
326BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
327 PROLOGUE_3_ARGS
328 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
329 %1 byte [A0], A1_8
330 IEM_SAVE_FLAGS A2, %3, %4
331 EPILOGUE_3_ARGS
332ENDPROC iemAImpl_ %+ %1 %+ _u8
333
334BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
335 PROLOGUE_3_ARGS
336 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
337 %1 word [A0], A1_16
338 IEM_SAVE_FLAGS A2, %3, %4
339 EPILOGUE_3_ARGS
340ENDPROC iemAImpl_ %+ %1 %+ _u16
341
342BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
343 PROLOGUE_3_ARGS
344 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
345 %1 dword [A0], A1_32
346 IEM_SAVE_FLAGS A2, %3, %4
347 EPILOGUE_3_ARGS
348ENDPROC iemAImpl_ %+ %1 %+ _u32
349
350 %ifdef RT_ARCH_AMD64
351BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
352 PROLOGUE_3_ARGS
353 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
354 %1 qword [A0], A1
355 IEM_SAVE_FLAGS A2, %3, %4
356 EPILOGUE_3_ARGS_EX 8
357ENDPROC iemAImpl_ %+ %1 %+ _u64
358 %endif ; RT_ARCH_AMD64
359
360 %if %2 != 0 ; locked versions requested?
361
362BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
363 PROLOGUE_3_ARGS
364 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
365 lock %1 byte [A0], A1_8
366 IEM_SAVE_FLAGS A2, %3, %4
367 EPILOGUE_3_ARGS
368ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
369
370BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
371 PROLOGUE_3_ARGS
372 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
373 lock %1 word [A0], A1_16
374 IEM_SAVE_FLAGS A2, %3, %4
375 EPILOGUE_3_ARGS
376ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
377
378BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
379 PROLOGUE_3_ARGS
380 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
381 lock %1 dword [A0], A1_32
382 IEM_SAVE_FLAGS A2, %3, %4
383 EPILOGUE_3_ARGS
384ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
385
386 %ifdef RT_ARCH_AMD64
387BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
388 PROLOGUE_3_ARGS
389 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
390 lock %1 qword [A0], A1
391 IEM_SAVE_FLAGS A2, %3, %4
392 EPILOGUE_3_ARGS_EX 8
393ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
394 %endif ; RT_ARCH_AMD64
395 %endif ; locked
396%endmacro
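;
; Editor's illustration of what one of the instantiations below generates; this
; is a rough, hand-expanded sketch rather than the exact assembler output:
;
;       BEGINPROC_FASTCALL iemAImpl_add_u32, 12
;               PROLOGUE_3_ARGS
;               IEM_MAYBE_LOAD_FLAGS A2, <status flags>, 0
;               add     dword [A0], A1_32
;               IEM_SAVE_FLAGS       A2, <status flags>, 0
;               EPILOGUE_3_ARGS
;       ENDPROC iemAImpl_add_u32
;
; Per the description above, the C side would call it roughly as
;       void iemAImpl_add_u32(uint32_t *pu32Dst, uint32_t u32Src, uint32_t *pEFlags);
; (assumed prototype, derived from the A0/A1/A2 assignments, not quoted from a
; header).
;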
397
398; instr, lock, modified-flags, undefined-flags.
399IEMIMPL_BIN_OP add, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
400IEMIMPL_BIN_OP adc, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
401IEMIMPL_BIN_OP sub, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
402IEMIMPL_BIN_OP sbb, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
403IEMIMPL_BIN_OP or, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
404IEMIMPL_BIN_OP xor, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
405IEMIMPL_BIN_OP and, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
406IEMIMPL_BIN_OP cmp, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
407IEMIMPL_BIN_OP test, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
408
409
410;;
411; Macro for implementing a bit operator.
412;
413; This will generate code for the 16, 32 and 64 bit accesses with locked
414; variants, except on 32-bit systems where the 64-bit accesses require hand
415; coding.
416;
417; All the functions take a pointer to the destination memory operand in A0,
418; the source register operand in A1 and a pointer to eflags in A2.
419;
420; @param 1 The instruction mnemonic.
421; @param 2 Non-zero if there should be a locked version.
422; @param 3 The modified flags.
423; @param 4 The undefined flags.
424;
425%macro IEMIMPL_BIT_OP 4
426BEGINCODE
427BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
428 PROLOGUE_3_ARGS
429 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
430 %1 word [A0], A1_16
431 IEM_SAVE_FLAGS A2, %3, %4
432 EPILOGUE_3_ARGS
433ENDPROC iemAImpl_ %+ %1 %+ _u16
434
435BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
436 PROLOGUE_3_ARGS
437 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
438 %1 dword [A0], A1_32
439 IEM_SAVE_FLAGS A2, %3, %4
440 EPILOGUE_3_ARGS
441ENDPROC iemAImpl_ %+ %1 %+ _u32
442
443 %ifdef RT_ARCH_AMD64
444BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
445 PROLOGUE_3_ARGS
446 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
447 %1 qword [A0], A1
448 IEM_SAVE_FLAGS A2, %3, %4
449 EPILOGUE_3_ARGS_EX 8
450ENDPROC iemAImpl_ %+ %1 %+ _u64
451 %endif ; RT_ARCH_AMD64
452
453 %if %2 != 0 ; locked versions requested?
454
455BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
456 PROLOGUE_3_ARGS
457 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
458 lock %1 word [A0], A1_16
459 IEM_SAVE_FLAGS A2, %3, %4
460 EPILOGUE_3_ARGS
461ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
462
463BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
464 PROLOGUE_3_ARGS
465 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
466 lock %1 dword [A0], A1_32
467 IEM_SAVE_FLAGS A2, %3, %4
468 EPILOGUE_3_ARGS
469ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
470
471 %ifdef RT_ARCH_AMD64
472BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
473 PROLOGUE_3_ARGS
474 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
475 lock %1 qword [A0], A1
476 IEM_SAVE_FLAGS A2, %3, %4
477 EPILOGUE_3_ARGS_EX 8
478ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
479 %endif ; RT_ARCH_AMD64
480 %endif ; locked
481%endmacro
482IEMIMPL_BIT_OP bt, 0, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
483IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
484IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
485IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
486
487;;
488; Macro for implementing a bit search operator.
489;
490; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
491; systems where the 64-bit accesses require hand coding.
492;
493; All the functions take a pointer to the destination memory operand in A0,
494; the source register operand in A1 and a pointer to eflags in A2.
495;
496; @param 1 The instruction mnemonic.
497; @param 2 The modified flags.
498; @param 3 The undefined flags.
499;
500%macro IEMIMPL_BIT_OP 3
501BEGINCODE
502BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
503 PROLOGUE_3_ARGS
504 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
505 %1 T0_16, A1_16
506 jz .unchanged_dst
507 mov [A0], T0_16
508.unchanged_dst:
509 IEM_SAVE_FLAGS A2, %2, %3
510 EPILOGUE_3_ARGS
511ENDPROC iemAImpl_ %+ %1 %+ _u16
512
513BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
514 PROLOGUE_3_ARGS
515 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
516 %1 T0_32, A1_32
517 jz .unchanged_dst
518 mov [A0], T0_32
519.unchanged_dst:
520 IEM_SAVE_FLAGS A2, %2, %3
521 EPILOGUE_3_ARGS
522ENDPROC iemAImpl_ %+ %1 %+ _u32
523
524 %ifdef RT_ARCH_AMD64
525BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
526 PROLOGUE_3_ARGS
527 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
528 %1 T0, A1
529 jz .unchanged_dst
530 mov [A0], T0
531.unchanged_dst:
532 IEM_SAVE_FLAGS A2, %2, %3
533 EPILOGUE_3_ARGS_EX 8
534ENDPROC iemAImpl_ %+ %1 %+ _u64
535 %endif ; RT_ARCH_AMD64
536%endmacro
537IEMIMPL_BIT_OP bsf, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF)
538IEMIMPL_BIT_OP bsr, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF)
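;
; Editor's note on the 'jz .unchanged_dst' in the macro above: the destination
; is only written when the source operand is non-zero.  For example, for bsf:
;       bsf     T0_32, A1_32        ; A1_32 == 0  ->  ZF=1, T0 undefined
;       jz      .unchanged_dst      ; taken, so [A0] keeps its previous value
;       mov     [A0], T0_32         ; only reached for a non-zero source
; so with a zero source only ZF (via IEM_SAVE_FLAGS) is propagated back.
;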
539
540
541;
542; IMUL is a similar yet different case (no lock, no memory destination).
543; The rDX:rAX variant of imul is handled together with mul further down.
544;
545BEGINCODE
546BEGINPROC_FASTCALL iemAImpl_imul_two_u16, 12
547 PROLOGUE_3_ARGS
548 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
549 imul A1_16, word [A0]
550 mov [A0], A1_16
551 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
552 EPILOGUE_3_ARGS
553ENDPROC iemAImpl_imul_two_u16
554
555BEGINPROC_FASTCALL iemAImpl_imul_two_u32, 12
556 PROLOGUE_3_ARGS
557 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
558 imul A1_32, dword [A0]
559 mov [A0], A1_32
560 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
561 EPILOGUE_3_ARGS
562ENDPROC iemAImpl_imul_two_u32
563
564%ifdef RT_ARCH_AMD64
565BEGINPROC_FASTCALL iemAImpl_imul_two_u64, 16
566 PROLOGUE_3_ARGS
567 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
568 imul A1, qword [A0]
569 mov [A0], A1
570 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
571 EPILOGUE_3_ARGS_EX 8
572ENDPROC iemAImpl_imul_two_u64
573%endif ; RT_ARCH_AMD64
574
575
576;
577; XCHG for memory operands. This implies locking. No flag changes.
578;
579; Each function takes two arguments, first the pointer to the memory,
580; then the pointer to the register. They all return void.
581;
582BEGINCODE
583BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
584 PROLOGUE_2_ARGS
585 mov T0_8, [A1]
586 xchg [A0], T0_8
587 mov [A1], T0_8
588 EPILOGUE_2_ARGS
589ENDPROC iemAImpl_xchg_u8_locked
590
591BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
592 PROLOGUE_2_ARGS
593 mov T0_16, [A1]
594 xchg [A0], T0_16
595 mov [A1], T0_16
596 EPILOGUE_2_ARGS
597ENDPROC iemAImpl_xchg_u16_locked
598
599BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
600 PROLOGUE_2_ARGS
601 mov T0_32, [A1]
602 xchg [A0], T0_32
603 mov [A1], T0_32
604 EPILOGUE_2_ARGS
605ENDPROC iemAImpl_xchg_u32_locked
606
607%ifdef RT_ARCH_AMD64
608BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
609 PROLOGUE_2_ARGS
610 mov T0, [A1]
611 xchg [A0], T0
612 mov [A1], T0
613 EPILOGUE_2_ARGS
614ENDPROC iemAImpl_xchg_u64_locked
615%endif
616
617; Unlocked variants for fDisregardLock mode.
618
619BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
620 PROLOGUE_2_ARGS
621 mov T0_8, [A1]
622 mov T1_8, [A0]
623 mov [A0], T0_8
624 mov [A1], T1_8
625 EPILOGUE_2_ARGS
626ENDPROC iemAImpl_xchg_u8_unlocked
627
628BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
629 PROLOGUE_2_ARGS
630 mov T0_16, [A1]
631 mov T1_16, [A0]
632 mov [A0], T0_16
633 mov [A1], T1_16
634 EPILOGUE_2_ARGS
635ENDPROC iemAImpl_xchg_u16_unlocked
636
637BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
638 PROLOGUE_2_ARGS
639 mov T0_32, [A1]
640 mov T1_32, [A0]
641 mov [A0], T0_32
642 mov [A1], T1_32
643 EPILOGUE_2_ARGS
644ENDPROC iemAImpl_xchg_u32_unlocked
645
646%ifdef RT_ARCH_AMD64
647BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
648 PROLOGUE_2_ARGS
649 mov T0, [A1]
650 mov T1, [A0]
651 mov [A0], T0
652 mov [A1], T1
653 EPILOGUE_2_ARGS
654ENDPROC iemAImpl_xchg_u64_unlocked
655%endif
656
657
658;
659; XADD for memory operands.
660;
661; Each function takes three arguments, first the pointer to the
662; memory/register, then the pointer to the register, and finally a pointer to
663; eflags. They all return void.
664;
665BEGINCODE
666BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
667 PROLOGUE_3_ARGS
668 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
669 mov T0_8, [A1]
670 xadd [A0], T0_8
671 mov [A1], T0_8
672 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
673 EPILOGUE_3_ARGS
674ENDPROC iemAImpl_xadd_u8
675
676BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
677 PROLOGUE_3_ARGS
678 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
679 mov T0_16, [A1]
680 xadd [A0], T0_16
681 mov [A1], T0_16
682 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
683 EPILOGUE_3_ARGS
684ENDPROC iemAImpl_xadd_u16
685
686BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
687 PROLOGUE_3_ARGS
688 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
689 mov T0_32, [A1]
690 xadd [A0], T0_32
691 mov [A1], T0_32
692 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
693 EPILOGUE_3_ARGS
694ENDPROC iemAImpl_xadd_u32
695
696%ifdef RT_ARCH_AMD64
697BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
698 PROLOGUE_3_ARGS
699 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
700 mov T0, [A1]
701 xadd [A0], T0
702 mov [A1], T0
703 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
704 EPILOGUE_3_ARGS
705ENDPROC iemAImpl_xadd_u64
706%endif ; RT_ARCH_AMD64
707
708BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
709 PROLOGUE_3_ARGS
710 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
711 mov T0_8, [A1]
712 lock xadd [A0], T0_8
713 mov [A1], T0_8
714 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
715 EPILOGUE_3_ARGS
716ENDPROC iemAImpl_xadd_u8_locked
717
718BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
719 PROLOGUE_3_ARGS
720 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
721 mov T0_16, [A1]
722 lock xadd [A0], T0_16
723 mov [A1], T0_16
724 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
725 EPILOGUE_3_ARGS
726ENDPROC iemAImpl_xadd_u16_locked
727
728BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
729 PROLOGUE_3_ARGS
730 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
731 mov T0_32, [A1]
732 lock xadd [A0], T0_32
733 mov [A1], T0_32
734 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
735 EPILOGUE_3_ARGS
736ENDPROC iemAImpl_xadd_u32_locked
737
738%ifdef RT_ARCH_AMD64
739BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
740 PROLOGUE_3_ARGS
741 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
742 mov T0, [A1]
743 lock xadd [A0], T0
744 mov [A1], T0
745 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
746 EPILOGUE_3_ARGS
747ENDPROC iemAImpl_xadd_u64_locked
748%endif ; RT_ARCH_AMD64
749
750
751;
752; CMPXCHG8B.
753;
754; These are tricky register wise, so the code is duplicated for each calling
755; convention.
756;
757; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
758;
759; C-proto:
760; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
761; uint32_t *pEFlags));
762;
763; Note! Identical to iemAImpl_cmpxchg16b.
764;
765BEGINCODE
766BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
767%ifdef RT_ARCH_AMD64
768 %ifdef ASM_CALL64_MSC
769 push rbx
770
771 mov r11, rdx ; pu64EaxEdx (is also T1)
772 mov r10, rcx ; pu64Dst
773
774 mov ebx, [r8]
775 mov ecx, [r8 + 4]
776 IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
777 mov eax, [r11]
778 mov edx, [r11 + 4]
779
780 lock cmpxchg8b [r10]
781
782 mov [r11], eax
783 mov [r11 + 4], edx
784 IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
785
786 pop rbx
787 ret
788 %else
789 push rbx
790
791 mov r10, rcx ; pEFlags
792 mov r11, rdx ; pu64EbxEcx (is also T1)
793
794 mov ebx, [r11]
795 mov ecx, [r11 + 4]
796 IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
797 mov eax, [rsi]
798 mov edx, [rsi + 4]
799
800 lock cmpxchg8b [rdi]
801
802 mov [rsi], eax
803 mov [rsi + 4], edx
804 IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
805
806 pop rbx
807 ret
808
809 %endif
810%else
811 push esi
812 push edi
813 push ebx
814 push ebp
815
816 mov edi, ecx ; pu64Dst
817 mov esi, edx ; pu64EaxEdx
818 mov ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
819 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
820
821 mov ebx, [ecx]
822 mov ecx, [ecx + 4]
823 IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
824 mov eax, [esi]
825 mov edx, [esi + 4]
826
827 lock cmpxchg8b [edi]
828
829 mov [esi], eax
830 mov [esi + 4], edx
831 IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)
832
833 pop ebp
834 pop ebx
835 pop edi
836 pop esi
837 ret 8
838%endif
839ENDPROC iemAImpl_cmpxchg8b
840
841BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
842 ; Lazy bird always lock prefixes cmpxchg8b.
843 jmp NAME_FASTCALL(iemAImpl_cmpxchg8b,16,$@)
844ENDPROC iemAImpl_cmpxchg8b_locked
845
846%ifdef RT_ARCH_AMD64
847
848;
849; CMPXCHG16B.
850;
851; These are tricky register wise, so the code is duplicated for each calling
852; convention.
853;
854; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
855;
856; C-proto:
857; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
858; uint32_t *pEFlags));
859;
860; Note! Identical to iemAImpl_cmpxchg8b.
861;
862BEGINCODE
863BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
864 %ifdef ASM_CALL64_MSC
865 push rbx
866
867 mov r11, rdx ; pu64RaxRdx (is also T1)
868 mov r10, rcx ; pu64Dst
869
870 mov rbx, [r8]
871 mov rcx, [r8 + 8]
872 IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
873 mov rax, [r11]
874 mov rdx, [r11 + 8]
875
876 lock cmpxchg16b [r10]
877
878 mov [r11], rax
879 mov [r11 + 8], rdx
880 IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
881
882 pop rbx
883 ret
884 %else
885 push rbx
886
887 mov r10, rcx ; pEFlags
888 mov r11, rdx ; pu64RbxRcx (is also T1)
889
890 mov rbx, [r11]
891 mov rcx, [r11 + 8]
892 IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
893 mov rax, [rsi]
894 mov rdx, [rsi + 8]
895
896 lock cmpxchg16b [rdi]
897
898 mov [rsi], rax
899 mov [rsi + 8], rdx
900 IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
901
902 pop rbx
903 ret
904
905 %endif
906ENDPROC iemAImpl_cmpxchg16b
907
908BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
909 ; Lazy bird always lock prefixes cmpxchg16b.
910 jmp NAME_FASTCALL(iemAImpl_cmpxchg16b,16,$@)
911ENDPROC iemAImpl_cmpxchg16b_locked
912
913%endif ; RT_ARCH_AMD64
914
915
916;
917; CMPXCHG.
918;
919; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
920;
921; C-proto:
922; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t *puEax, uintX_t uReg, uint32_t *pEFlags));
923;
924BEGINCODE
925%macro IEMIMPL_CMPXCHG 2
926BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
927 PROLOGUE_4_ARGS
928 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
929 mov al, [A1]
930 %1 cmpxchg [A0], A2_8
931 mov [A1], al
932 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
933 EPILOGUE_4_ARGS
934ENDPROC iemAImpl_cmpxchg_u8 %+ %2
935
936BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
937 PROLOGUE_4_ARGS
938 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
939 mov ax, [A1]
940 %1 cmpxchg [A0], A2_16
941 mov [A1], ax
942 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
943 EPILOGUE_4_ARGS
944ENDPROC iemAImpl_cmpxchg_u16 %+ %2
945
946BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
947 PROLOGUE_4_ARGS
948 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
949 mov eax, [A1]
950 %1 cmpxchg [A0], A2_32
951 mov [A1], eax
952 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
953 EPILOGUE_4_ARGS
954ENDPROC iemAImpl_cmpxchg_u32 %+ %2
955
956BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
957%ifdef RT_ARCH_AMD64
958 PROLOGUE_4_ARGS
959 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
960 mov rax, [A1]
961 %1 cmpxchg [A0], A2
962 mov [A1], rax
963 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
964 EPILOGUE_4_ARGS
965%else
966 ;
967 ; Must use cmpxchg8b here. See also iemAImpl_cmpxchg8b.
968 ;
969 push esi
970 push edi
971 push ebx
972 push ebp
973
974 mov edi, ecx ; pu64Dst
975 mov esi, edx ; pu64Rax
976 mov ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
977 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
978
979 mov ebx, [ecx]
980 mov ecx, [ecx + 4]
981 IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
982 mov eax, [esi]
983 mov edx, [esi + 4]
984
985 lock cmpxchg8b [edi]
986
987 ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
988 jz .cmpxchg8b_not_equal
989 cmp eax, eax ; just set the other flags.
990.store:
991 mov [esi], eax
992 mov [esi + 4], edx
993 IEM_SAVE_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)
994
995 pop ebp
996 pop ebx
997 pop edi
998 pop esi
999 ret 8
1000
1001.cmpxchg8b_not_equal:
1002 cmp [esi + 4], edx ;; @todo FIXME - verify 64-bit compare implementation
1003 jne .store
1004 cmp [esi], eax
1005 jmp .store
1006
1007%endif
1008ENDPROC iemAImpl_cmpxchg_u64 %+ %2
1009%endmacro ; IEMIMPL_CMPXCHG
1010
1011IEMIMPL_CMPXCHG , ,
1012IEMIMPL_CMPXCHG lock, _locked
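;
; Editor's sketch of the contract the macro above implements, using the 32-bit
; unlocked variant as an example:
;       mov     eax, [A1]           ; eax = expected value (*puEax)
;       cmpxchg [A0], A2_32         ; if [A0] == eax: [A0] = uReg, ZF=1
;                                   ; else:           eax = [A0],  ZF=0
;       mov     [A1], eax           ; hand the old/actual destination value back
; so after the call *puEax always holds the value the destination had when the
; compare was performed, and ZF in *pEFlags tells whether the exchange happened.
;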
1013
1014;;
1015; Macro for implementing a unary operator.
1016;
1017; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
1018; variants, except on 32-bit systems where the 64-bit accesses require hand
1019; coding.
1020;
1021; All the functions take a pointer to the destination memory operand in A0
1022; and a pointer to eflags in A1.
1023;
1024; @param 1 The instruction mnemonic.
1025; @param 2 The modified flags.
1026; @param 3 The undefined flags.
1027;
1028%macro IEMIMPL_UNARY_OP 3
1029BEGINCODE
1030BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
1031 PROLOGUE_2_ARGS
1032 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1033 %1 byte [A0]
1034 IEM_SAVE_FLAGS A1, %2, %3
1035 EPILOGUE_2_ARGS
1036ENDPROC iemAImpl_ %+ %1 %+ _u8
1037
1038BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
1039 PROLOGUE_2_ARGS
1040 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1041 lock %1 byte [A0]
1042 IEM_SAVE_FLAGS A1, %2, %3
1043 EPILOGUE_2_ARGS
1044ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
1045
1046BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
1047 PROLOGUE_2_ARGS
1048 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1049 %1 word [A0]
1050 IEM_SAVE_FLAGS A1, %2, %3
1051 EPILOGUE_2_ARGS
1052ENDPROC iemAImpl_ %+ %1 %+ _u16
1053
1054BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
1055 PROLOGUE_2_ARGS
1056 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1057 lock %1 word [A0]
1058 IEM_SAVE_FLAGS A1, %2, %3
1059 EPILOGUE_2_ARGS
1060ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
1061
1062BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
1063 PROLOGUE_2_ARGS
1064 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1065 %1 dword [A0]
1066 IEM_SAVE_FLAGS A1, %2, %3
1067 EPILOGUE_2_ARGS
1068ENDPROC iemAImpl_ %+ %1 %+ _u32
1069
1070BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
1071 PROLOGUE_2_ARGS
1072 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1073 lock %1 dword [A0]
1074 IEM_SAVE_FLAGS A1, %2, %3
1075 EPILOGUE_2_ARGS
1076ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
1077
1078 %ifdef RT_ARCH_AMD64
1079BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
1080 PROLOGUE_2_ARGS
1081 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1082 %1 qword [A0]
1083 IEM_SAVE_FLAGS A1, %2, %3
1084 EPILOGUE_2_ARGS
1085ENDPROC iemAImpl_ %+ %1 %+ _u64
1086
1087BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
1088 PROLOGUE_2_ARGS
1089 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1090 lock %1 qword [A0]
1091 IEM_SAVE_FLAGS A1, %2, %3
1092 EPILOGUE_2_ARGS
1093ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
1094 %endif ; RT_ARCH_AMD64
1095
1096%endmacro
1097
1098IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
1099IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
1100IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1101IEMIMPL_UNARY_OP not, 0, 0
1102
1103
1104;
1105; BSWAP. No flag changes.
1106;
1107; Each function takes one argument, a pointer to the value to bswap
1108; (input/output). They all return void.
1109;
1110BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
1111 PROLOGUE_1_ARGS
1112 mov T0_32, [A0] ; just in case any of the upper bits are used.
1113 db 66h
1114 bswap T0_32
1115 mov [A0], T0_32
1116 EPILOGUE_1_ARGS
1117ENDPROC iemAImpl_bswap_u16
1118
1119BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
1120 PROLOGUE_1_ARGS
1121 mov T0_32, [A0]
1122 bswap T0_32
1123 mov [A0], T0_32
1124 EPILOGUE_1_ARGS
1125ENDPROC iemAImpl_bswap_u32
1126
1127BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
1128%ifdef RT_ARCH_AMD64
1129 PROLOGUE_1_ARGS
1130 mov T0, [A0]
1131 bswap T0
1132 mov [A0], T0
1133 EPILOGUE_1_ARGS
1134%else
1135 PROLOGUE_1_ARGS
1136 mov T0, [A0]
1137 mov T1, [A0 + 4]
1138 bswap T0
1139 bswap T1
1140 mov [A0 + 4], T0
1141 mov [A0], T1
1142 EPILOGUE_1_ARGS
1143%endif
1144ENDPROC iemAImpl_bswap_u64
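;
; Editor's worked example for the 32-bit host path above: with the 64-bit value
; 0123456789ABCDEFh at [A0] (little endian, so the low dword is 89ABCDEFh):
;       T0 = 89ABCDEFh  -> bswap -> EFCDAB89h  -> stored at [A0 + 4]
;       T1 = 01234567h  -> bswap -> 67452301h  -> stored at [A0]
; yielding EFCDAB8967452301h, the same result a native 64-bit bswap produces.
;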
1145
1146
1147;;
1148; Macro for implementing a shift operation.
1149;
1150; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1151; 32-bit systems where the 64-bit accesses require hand coding.
1152;
1153; All the functions take a pointer to the destination memory operand in A0,
1154; the shift count in A1 and a pointer to eflags in A2.
1155;
1156; @param 1 The instruction mnemonic.
1157; @param 2 The modified flags.
1158; @param 3 The undefined flags.
1159;
1160; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
1161;
1162%macro IEMIMPL_SHIFT_OP 3
1163BEGINCODE
1164BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
1165 PROLOGUE_3_ARGS
1166 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1167 %ifdef ASM_CALL64_GCC
1168 mov cl, A1_8
1169 %1 byte [A0], cl
1170 %else
1171 xchg A1, A0
1172 %1 byte [A1], cl
1173 %endif
1174 IEM_SAVE_FLAGS A2, %2, %3
1175 EPILOGUE_3_ARGS
1176ENDPROC iemAImpl_ %+ %1 %+ _u8
1177
1178BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1179 PROLOGUE_3_ARGS
1180 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1181 %ifdef ASM_CALL64_GCC
1182 mov cl, A1_8
1183 %1 word [A0], cl
1184 %else
1185 xchg A1, A0
1186 %1 word [A1], cl
1187 %endif
1188 IEM_SAVE_FLAGS A2, %2, %3
1189 EPILOGUE_3_ARGS
1190ENDPROC iemAImpl_ %+ %1 %+ _u16
1191
1192BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1193 PROLOGUE_3_ARGS
1194 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1195 %ifdef ASM_CALL64_GCC
1196 mov cl, A1_8
1197 %1 dword [A0], cl
1198 %else
1199 xchg A1, A0
1200 %1 dword [A1], cl
1201 %endif
1202 IEM_SAVE_FLAGS A2, %2, %3
1203 EPILOGUE_3_ARGS
1204ENDPROC iemAImpl_ %+ %1 %+ _u32
1205
1206 %ifdef RT_ARCH_AMD64
1207BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
1208 PROLOGUE_3_ARGS
1209 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1210 %ifdef ASM_CALL64_GCC
1211 mov cl, A1_8
1212 %1 qword [A0], cl
1213 %else
1214 xchg A1, A0
1215 %1 qword [A1], cl
1216 %endif
1217 IEM_SAVE_FLAGS A2, %2, %3
1218 EPILOGUE_3_ARGS
1219ENDPROC iemAImpl_ %+ %1 %+ _u64
1220 %endif ; RT_ARCH_AMD64
1221
1222%endmacro
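;
; Editor's note on the register shuffling in the macro above, taking the
; Windows/MSC 64-bit convention as an example (A0 = rcx = pointer,
; A1 = rdx = shift count):
;       xchg    A1, A0              ; rcx <-> rdx: the count lands in cl and
;                                   ; the destination pointer in A1 (rdx)
;       shl     dword [A1], cl
; Under the SysV/GCC convention rcx is not used by the first three arguments,
; so the count is simply copied into cl and the pointer stays in A0.
;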
1223
1224IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0
1225IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0
1226IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0
1227IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0
1228IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1229IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1230IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1231
1232
1233;;
1234; Macro for implementing a double precision shift operation.
1235;
1236; This will generate code for the 16, 32 and 64 bit accesses, except on
1237; 32-bit systems where the 64-bit accesses require hand coding.
1238;
1239; The functions take the destination operand (r/m) in A0, the source (reg) in
1240; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
1241;
1242; @param 1 The instruction mnemonic.
1243; @param 2 The modified flags.
1244; @param 3 The undefined flags.
1245;
1246; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
1247;
1248%macro IEMIMPL_SHIFT_DBL_OP 3
1249BEGINCODE
1250BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
1251 PROLOGUE_4_ARGS
1252 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1253 %ifdef ASM_CALL64_GCC
1254 xchg A3, A2
1255 %1 [A0], A1_16, cl
1256 xchg A3, A2
1257 %else
1258 xchg A0, A2
1259 %1 [A2], A1_16, cl
1260 %endif
1261 IEM_SAVE_FLAGS A3, %2, %3
1262 EPILOGUE_4_ARGS
1263ENDPROC iemAImpl_ %+ %1 %+ _u16
1264
1265BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
1266 PROLOGUE_4_ARGS
1267 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1268 %ifdef ASM_CALL64_GCC
1269 xchg A3, A2
1270 %1 [A0], A1_32, cl
1271 xchg A3, A2
1272 %else
1273 xchg A0, A2
1274 %1 [A2], A1_32, cl
1275 %endif
1276 IEM_SAVE_FLAGS A3, %2, %3
1277 EPILOGUE_4_ARGS
1278ENDPROC iemAImpl_ %+ %1 %+ _u32
1279
1280 %ifdef RT_ARCH_AMD64
1281BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
1282 PROLOGUE_4_ARGS
1283 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1284 %ifdef ASM_CALL64_GCC
1285 xchg A3, A2
1286 %1 [A0], A1, cl
1287 xchg A3, A2
1288 %else
1289 xchg A0, A2
1290 %1 [A2], A1, cl
1291 %endif
1292 IEM_SAVE_FLAGS A3, %2, %3
1293 EPILOGUE_4_ARGS_EX 12
1294ENDPROC iemAImpl_ %+ %1 %+ _u64
1295 %endif ; RT_ARCH_AMD64
1296
1297%endmacro
1298
1299IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1300IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1301
1302
1303;;
1304; Macro for implementing multiplication operations.
1305;
1306; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1307; 32-bit systems where the 64-bit accesses require hand coding.
1308;
1309; The 8-bit function only operates on AX, so it takes no DX pointer. The other
1310; functions take a pointer to rAX in A0, rDX in A1, the operand in A2 and a
1311; pointer to eflags in A3.
1312;
1313; The functions all return 0 so that the same caller can be used for div/idiv
1314; as well as for the mul/imul implementation.
1315;
1316; @param 1 The instruction mnemonic.
1317; @param 2 The modified flags.
1318; @param 3 The undefined flags.
1319;
1320; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
1321;
1322%macro IEMIMPL_MUL_OP 3
1323BEGINCODE
1324BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
1325 PROLOGUE_3_ARGS
1326 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1327 mov al, [A0]
1328 %1 A1_8
1329 mov [A0], ax
1330 IEM_SAVE_FLAGS A2, %2, %3
1331 xor eax, eax
1332 EPILOGUE_3_ARGS
1333ENDPROC iemAImpl_ %+ %1 %+ _u8
1334
1335BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
1336 PROLOGUE_4_ARGS
1337 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1338 mov ax, [A0]
1339 %ifdef ASM_CALL64_GCC
1340 %1 A2_16
1341 mov [A0], ax
1342 mov [A1], dx
1343 %else
1344 mov T1, A1
1345 %1 A2_16
1346 mov [A0], ax
1347 mov [T1], dx
1348 %endif
1349 IEM_SAVE_FLAGS A3, %2, %3
1350 xor eax, eax
1351 EPILOGUE_4_ARGS
1352ENDPROC iemAImpl_ %+ %1 %+ _u16
1353
1354BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
1355 PROLOGUE_4_ARGS
1356 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1357 mov eax, [A0]
1358 %ifdef ASM_CALL64_GCC
1359 %1 A2_32
1360 mov [A0], eax
1361 mov [A1], edx
1362 %else
1363 mov T1, A1
1364 %1 A2_32
1365 mov [A0], eax
1366 mov [T1], edx
1367 %endif
1368 IEM_SAVE_FLAGS A3, %2, %3
1369 xor eax, eax
1370 EPILOGUE_4_ARGS
1371ENDPROC iemAImpl_ %+ %1 %+ _u32
1372
1373 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
1374BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
1375 PROLOGUE_4_ARGS
1376 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1377 mov rax, [A0]
1378 %ifdef ASM_CALL64_GCC
1379 %1 A2
1380 mov [A0], rax
1381 mov [A1], rdx
1382 %else
1383 mov T1, A1
1384 %1 A2
1385 mov [A0], rax
1386 mov [T1], rdx
1387 %endif
1388 IEM_SAVE_FLAGS A3, %2, %3
1389 xor eax, eax
1390 EPILOGUE_4_ARGS_EX 12
1391ENDPROC iemAImpl_ %+ %1 %+ _u64
1392 %endif ; RT_ARCH_AMD64
1393
1394%endmacro
1395
1396IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
1397IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
1398
1399
1400BEGINCODE
1401;;
1402; Worker function for negating the 64-bit value held in T1:T0 (two 32-bit registers)
1403; @uses None (T0,T1)
1404BEGINPROC iemAImpl_negate_T0_T1_u32
1405 push 0
1406 push 0
1407 xchg T0_32, [xSP]
1408 xchg T1_32, [xSP + xCB]
1409 sub T0_32, [xSP]
1410 sbb T1_32, [xSP + xCB]
1411 add xSP, xCB*2
1412 ret
1413ENDPROC iemAImpl_negate_T0_T1_u32
1414
1415%ifdef RT_ARCH_AMD64
1416;;
1417; Worker function for negating the 128-bit value held in T1:T0 (two 64-bit registers)
1418; @uses None (T0,T1)
1419BEGINPROC iemAImpl_negate_T0_T1_u64
1420 push 0
1421 push 0
1422 xchg T0, [xSP]
1423 xchg T1, [xSP + xCB]
1424 sub T0, [xSP]
1425 sbb T1, [xSP + xCB]
1426 add xSP, xCB*2
1427 ret
1428ENDPROC iemAImpl_negate_T0_T1_u64
1429%endif
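;
; Editor's worked illustration of the trick used by the helpers above: the
; value in T1:T0 is negated as 0 - (T1:T0) without any extra scratch register
; by parking the original value on the stack:
;       push 0 / push 0             ; two zero slots
;       xchg T0, [xSP]              ; T0 = 0, stack low  = old T0
;       xchg T1, [xSP + xCB]        ; T1 = 0, stack high = old T1
;       sub  T0, [xSP]              ; T0 = 0 - old T0 (sets borrow)
;       sbb  T1, [xSP + xCB]        ; T1 = 0 - old T1 - borrow
; e.g. T1:T0 = 00000000h:00000001h (+1) becomes FFFFFFFFh:FFFFFFFFh (-1).
;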
1430
1431
1432;;
1433; Macro for implementing division operations.
1434;
1435; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1436; 32-bit systems where the 64-bit accesses require hand coding.
1437;
1438; The 8-bit function only operates on AX, so it takes no DX pointer. The other
1439; functions take a pointer to rAX in A0, rDX in A1, the operand in A2 and a
1440; pointer to eflags in A3.
1441;
1442; The functions all return 0 on success and -1 if a divide error should be
1443; raised by the caller.
1444;
1445; @param 1 The instruction mnemonic.
1446; @param 2 The modified flags.
1447; @param 3 The undefined flags.
1448; @param 4 1 if signed, 0 if unsigned.
1449;
1450; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
1451;
1452%macro IEMIMPL_DIV_OP 4
1453BEGINCODE
1454BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
1455 PROLOGUE_3_ARGS
1456
1457 ; div by chainsaw check.
1458 test A1_8, A1_8
1459 jz .div_zero
1460
1461 ; Overflow check - unsigned division is simple to verify, but we haven't
1462 ; found a simple way to check signed division yet, unfortunately.
1463 %if %4 == 0
1464 cmp [A0 + 1], A1_8
1465 jae .div_overflow
1466 %else
1467 mov T0_16, [A0] ; T0 = dividend
1468 mov T1, A1 ; T1 = saved divisor (because of missing T1_8 in 32-bit)
1469 test A1_8, A1_8
1470 js .divisor_negative
1471 test T0_16, T0_16
1472 jns .both_positive
1473 neg T0_16
1474.one_of_each: ; OK range is 2^(result-width - 1) * divisor + (divisor - 1).
1475 push T0 ; Start off like unsigned below.
1476 shr T0_16, 7
1477 cmp T0_8, A1_8
1478 pop T0
1479 jb .div_no_overflow
1480 ja .div_overflow
1481 and T0_8, 0x7f ; Special case for covering (divisor - 1).
1482 cmp T0_8, A1_8
1483 jae .div_overflow
1484 jmp .div_no_overflow
1485
1486.divisor_negative:
1487 neg A1_8
1488 test T0_16, T0_16
1489 jns .one_of_each
1490 neg T0_16
1491.both_positive: ; Same as unsigned shifted by sign indicator bit.
1492 shr T0_16, 7
1493 cmp T0_8, A1_8
1494 jae .div_overflow
1495.div_no_overflow:
1496 mov A1, T1 ; restore divisor
1497 %endif
1498
1499 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1500 mov ax, [A0]
1501 %1 A1_8
1502 mov [A0], ax
1503 IEM_SAVE_FLAGS A2, %2, %3
1504 xor eax, eax
1505
1506.return:
1507 EPILOGUE_3_ARGS
1508
1509.div_zero:
1510.div_overflow:
1511 mov eax, -1
1512 jmp .return
1513ENDPROC iemAImpl_ %+ %1 %+ _u8
1514
1515BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
1516 PROLOGUE_4_ARGS
1517
1518 ; div by chainsaw check.
1519 test A2_16, A2_16
1520 jz .div_zero
1521
1522 ; Overflow check - unsigned division is simple to verify, but we haven't
1523 ; found a simple way to check signed division yet, unfortunately.
1524 %if %4 == 0
1525 cmp [A1], A2_16
1526 jae .div_overflow
1527 %else
1528 mov T0_16, [A1]
1529 shl T0_32, 16
1530 mov T0_16, [A0] ; T0 = dividend
1531 mov T1, A2 ; T1 = divisor
1532 test T1_16, T1_16
1533 js .divisor_negative
1534 test T0_32, T0_32
1535 jns .both_positive
1536 neg T0_32
1537.one_of_each: ; OK range is 2^(result-width - 1) * divisor + (divisor - 1).
1538 push T0 ; Start off like unsigned below.
1539 shr T0_32, 15
1540 cmp T0_16, T1_16
1541 pop T0
1542 jb .div_no_overflow
1543 ja .div_overflow
1544 and T0_16, 0x7fff ; Special case for covering (divisor - 1).
1545 cmp T0_16, T1_16
1546 jae .div_overflow
1547 jmp .div_no_overflow
1548
1549.divisor_negative:
1550 neg T1_16
1551 test T0_32, T0_32
1552 jns .one_of_each
1553 neg T0_32
1554.both_positive: ; Same as unsigned shifted by sign indicator bit.
1555 shr T0_32, 15
1556 cmp T0_16, T1_16
1557 jae .div_overflow
1558.div_no_overflow:
1559 %endif
1560
1561 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1562 %ifdef ASM_CALL64_GCC
1563 mov T1, A2
1564 mov ax, [A0]
1565 mov dx, [A1]
1566 %1 T1_16
1567 mov [A0], ax
1568 mov [A1], dx
1569 %else
1570 mov T1, A1
1571 mov ax, [A0]
1572 mov dx, [T1]
1573 %1 A2_16
1574 mov [A0], ax
1575 mov [T1], dx
1576 %endif
1577 IEM_SAVE_FLAGS A3, %2, %3
1578 xor eax, eax
1579
1580.return:
1581 EPILOGUE_4_ARGS
1582
1583.div_zero:
1584.div_overflow:
1585 mov eax, -1
1586 jmp .return
1587ENDPROC iemAImpl_ %+ %1 %+ _u16
1588
1589BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
1590 PROLOGUE_4_ARGS
1591
1592 ; div by chainsaw check.
1593 test A2_32, A2_32
1594 jz .div_zero
1595
1596 ; Overflow check - unsigned division is simple to verify, but we haven't
1597 ; found a simple way to check signed division yet, unfortunately.
1598 %if %4 == 0
1599 cmp [A1], A2_32
1600 jae .div_overflow
1601 %else
1602 push A2 ; save A2 so we can modify it (we're out of regs on x86).
1603 mov T0_32, [A0] ; T0 = dividend low
1604 mov T1_32, [A1] ; T1 = dividend high
1605 test A2_32, A2_32
1606 js .divisor_negative
1607 test T1_32, T1_32
1608 jns .both_positive
1609 call NAME(iemAImpl_negate_T0_T1_u32)
1610.one_of_each: ; OK range is 2^(result-width - 1) * divisor + (divisor - 1).
1611 push T0 ; Start off like unsigned below.
1612 shl T1_32, 1
1613 shr T0_32, 31
1614 or T1_32, T0_32
1615 cmp T1_32, A2_32
1616 pop T0
1617 jb .div_no_overflow
1618 ja .div_overflow
1619 and T0_32, 0x7fffffff ; Special case for covering (divisor - 1).
1620 cmp T0_32, A2_32
1621 jae .div_overflow
1622 jmp .div_no_overflow
1623
1624.divisor_negative:
1625 neg A2_32
1626 test T1_32, T1_32
1627 jns .one_of_each
1628 call NAME(iemAImpl_negate_T0_T1_u32)
1629.both_positive: ; Same as unsigned shifted by sign indicator bit.
1630 shl T1_32, 1
1631 shr T0_32, 31
1632 or T1_32, T0_32
1633 cmp T1_32, A2_32
1634 jae .div_overflow
1635.div_no_overflow:
1636 pop A2
1637 %endif
1638
1639 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1640 mov eax, [A0]
1641 %ifdef ASM_CALL64_GCC
1642 mov T1, A2
1643 mov eax, [A0]
1644 mov edx, [A1]
1645 %1 T1_32
1646 mov [A0], eax
1647 mov [A1], edx
1648 %else
1649 mov T1, A1
1650 mov eax, [A0]
1651 mov edx, [T1]
1652 %1 A2_32
1653 mov [A0], eax
1654 mov [T1], edx
1655 %endif
1656 IEM_SAVE_FLAGS A3, %2, %3
1657 xor eax, eax
1658
1659.return:
1660 EPILOGUE_4_ARGS
1661
1662.div_overflow:
1663 %if %4 != 0
1664 pop A2
1665 %endif
1666.div_zero:
1667 mov eax, -1
1668 jmp .return
1669ENDPROC iemAImpl_ %+ %1 %+ _u32
1670
1671 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
1672BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
1673 PROLOGUE_4_ARGS
1674
1675 test A2, A2
1676 jz .div_zero
1677 %if %4 == 0
1678 cmp [A1], A2
1679 jae .div_overflow
1680 %else
1681 push A2 ; save A2 so we can modify it (we're out of regs on x86).
1682 mov T0, [A0] ; T0 = dividend low
1683 mov T1, [A1] ; T1 = dividend high
1684 test A2, A2
1685 js .divisor_negative
1686 test T1, T1
1687 jns .both_positive
1688 call NAME(iemAImpl_negate_T0_T1_u64)
1689.one_of_each: ; OK range is 2^(result-width - 1) * divisor + (divisor - 1).
1690 push T0 ; Start off like unsigned below.
1691 shl T1, 1
1692 shr T0, 63
1693 or T1, T0
1694 cmp T1, A2
1695 pop T0
1696 jb .div_no_overflow
1697 ja .div_overflow
1698 mov T1, 0x7fffffffffffffff
1699 and T0, T1 ; Special case for covering (divisor - 1).
1700 cmp T0, A2
1701 jae .div_overflow
1702 jmp .div_no_overflow
1703
1704.divisor_negative:
1705 neg A2
1706 test T1, T1
1707 jns .one_of_each
1708 call NAME(iemAImpl_negate_T0_T1_u64)
1709.both_positive: ; Same as unsigned shifted by sign indicator bit.
1710 shl T1, 1
1711 shr T0, 63
1712 or T1, T0
1713 cmp T1, A2
1714 jae .div_overflow
1715.div_no_overflow:
1716 pop A2
1717 %endif
1718
1719 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1720 mov rax, [A0]
1721 %ifdef ASM_CALL64_GCC
1722 mov T1, A2
1723 mov rax, [A0]
1724 mov rdx, [A1]
1725 %1 T1
1726 mov [A0], rax
1727 mov [A1], rdx
1728 %else
1729 mov T1, A1
1730 mov rax, [A0]
1731 mov rdx, [T1]
1732 %1 A2
1733 mov [A0], rax
1734 mov [T1], rdx
1735 %endif
1736 IEM_SAVE_FLAGS A3, %2, %3
1737 xor eax, eax
1738
1739.return:
1740 EPILOGUE_4_ARGS_EX 12
1741
1742.div_overflow:
1743 %if %4 != 0
1744 pop A2
1745 %endif
1746.div_zero:
1747 mov eax, -1
1748 jmp .return
1749ENDPROC iemAImpl_ %+ %1 %+ _u64
1750 %endif ; RT_ARCH_AMD64
1751
1752%endmacro
1753
1754IEMIMPL_DIV_OP div, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1755IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
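;
; Editor's worked example of the signed overflow pre-check generated above,
; using iemAImpl_idiv_u16 with dividend DX:AX = 00012345h (+74565) and
; divisor +2 (both positive):
;       shr     T0_32, 15           ; 74565 >> 15 = 2
;       cmp     T0_16, T1_16        ; 2 >= 2  ->  jae .div_overflow
; and indeed 74565 / 2 = 37282 does not fit the signed 16-bit quotient range
; (max 32767), so the function returns -1 and the caller raises #DE instead of
; letting the host idiv fault.  With dividend 0000FFFEh (65534) the shifted
; value is 1, 1 < 2, and the division proceeds (quotient 32767, remainder 0).
;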
1756
1757
1758;;
1759; Macro for implementing memory fence operation.
1760;
1761; No return value, no operands or anything.
1762;
1763; @param 1 The instruction.
1764;
1765%macro IEMIMPL_MEM_FENCE 1
1766BEGINCODE
1767BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
1768 %1
1769 ret
1770ENDPROC iemAImpl_ %+ %1
1771%endmacro
1772
1773IEMIMPL_MEM_FENCE lfence
1774IEMIMPL_MEM_FENCE sfence
1775IEMIMPL_MEM_FENCE mfence
1776
1777;;
1778; Alternative for non-SSE2 host.
1779;
1780BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
1781 push xAX
1782 xchg xAX, [xSP]
1783 add xSP, xCB
1784 ret
1785ENDPROC iemAImpl_alt_mem_fence
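;
; Editor's note: the xchg with a memory operand above is implicitly locked, so
; the push/xchg/add sequence acts as a full memory barrier on CPUs that lack
; the SSE2 fence instructions, while leaving xAX and the stack unchanged.
;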
1786
1787
1788;;
1789; Initialize the FPU for the actual instruction being emulated; this means
1790; loading parts of the guest's control word and status word.
1791;
1792; @uses 24 bytes of stack.
1793; @param 1 Expression giving the address of the FXSTATE of the guest.
1794;
1795%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
1796 fnstenv [xSP]
1797
1798 ; FCW - for exception, precision and rounding control.
1799 movzx T0, word [%1 + X86FXSTATE.FCW]
1800 and T0, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
1801 mov [xSP + X86FSTENV32P.FCW], T0_16
1802
1803 ; FSW - for undefined C0, C1, C2, and C3.
1804 movzx T1, word [%1 + X86FXSTATE.FSW]
1805 and T1, X86_FSW_C_MASK
1806 movzx T0, word [xSP + X86FSTENV32P.FSW]
1807 and T0, X86_FSW_TOP_MASK
1808 or T0, T1
1809 mov [xSP + X86FSTENV32P.FSW], T0_16
1810
1811 fldenv [xSP]
1812%endmacro
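;
; Editor's note on the macro above: only the exception mask, precision control
; and rounding control bits of the guest FCW are copied into the freshly
; stored environment, and only the guest's C0-C3 condition bits are merged
; into an FSW whose TOP field still comes from the host (which the callers
; have just reset with fninit), so the subsequent fldenv cannot pull in a
; bogus stack top or pending unmasked exceptions from the guest image.
;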
1813
1814
1815;;
1816; Need to move this as well somewhere better?
1817;
1818struc IEMFPURESULT
1819 .r80Result resw 5
1820 .FSW resw 1
1821endstruc
1822
1823
1824;;
1825; Need to move this as well somewhere better?
1826;
1827struc IEMFPURESULTTWO
1828 .r80Result1 resw 5
1829 .FSW resw 1
1830 .r80Result2 resw 5
1831endstruc
1832
1833
1834;
1835;---------------------- 16-bit signed integer operations ----------------------
1836;
1837
1838
1839;;
1840; Converts a 16-bit signed integer to an 80-bit floating point value (fpu register).
1841;
1842; @param A0 FPU context (fxsave).
1843; @param A1 Pointer to a IEMFPURESULT for the output.
1844; @param A2 Pointer to the 16-bit signed integer value to convert.
1845;
1846BEGINPROC_FASTCALL iemAImpl_fild_i16_to_r80, 12
1847 PROLOGUE_3_ARGS
1848 sub xSP, 20h
1849
1850 fninit
1851 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
1852 fild word [A2]
1853
1854 fnstsw word [A1 + IEMFPURESULT.FSW]
1855 fnclex
1856 fstp tword [A1 + IEMFPURESULT.r80Result]
1857
1858 fninit
1859 add xSP, 20h
1860 EPILOGUE_3_ARGS
1861ENDPROC iemAImpl_fild_i16_to_r80
1862
1863
1864;;
1865; Store an 80-bit floating point value (register) as a 16-bit signed integer (memory).
1866;
1867; @param A0 FPU context (fxsave).
1868; @param A1 Where to return the output FSW.
1869; @param A2 Where to store the 16-bit signed integer value.
1870; @param A3 Pointer to the 80-bit value.
1871;
1872BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
1873 PROLOGUE_4_ARGS
1874 sub xSP, 20h
1875
1876 fninit
1877 fld tword [A3]
1878 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
1879 fistp word [A2]
1880
1881 fnstsw word [A1]
1882
1883 fninit
1884 add xSP, 20h
1885 EPILOGUE_4_ARGS
1886ENDPROC iemAImpl_fist_r80_to_i16
1887
1888
1889;;
1890; Store an 80-bit floating point value (register) as a 16-bit signed integer
1891; (memory) with truncation.
1892;
1893; @param A0 FPU context (fxsave).
1894; @param A1 Where to return the output FSW.
1895; @param A2 Where to store the 16-bit signed integer value.
1896; @param A3 Pointer to the 80-bit value.
1897;
1898BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
1899 PROLOGUE_4_ARGS
1900 sub xSP, 20h
1901
1902 fninit
1903 fld tword [A3]
1904 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
1905 fisttp word [A2]
1906
1907 fnstsw word [A1]
1908
1909 fninit
1910 add xSP, 20h
1911 EPILOGUE_4_ARGS
1912ENDPROC iemAImpl_fistt_r80_to_i16
1913
1914
1915;;
1916; FPU instruction working on one 80-bit and one 16-bit signed integer value.
1917;
1918; @param 1 The instruction
1919;
1920; @param A0 FPU context (fxsave).
1921; @param A1 Pointer to a IEMFPURESULT for the output.
1922; @param A2 Pointer to the 80-bit value.
1923; @param A3 Pointer to the 16-bit value.
1924;
1925%macro IEMIMPL_FPU_R80_BY_I16 1
1926BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
1927 PROLOGUE_4_ARGS
1928 sub xSP, 20h
1929
1930 fninit
1931 fld tword [A2]
1932 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
1933 %1 word [A3]
1934
1935 fnstsw word [A1 + IEMFPURESULT.FSW]
1936 fnclex
1937 fstp tword [A1 + IEMFPURESULT.r80Result]
1938
1939 fninit
1940 add xSP, 20h
1941 EPILOGUE_4_ARGS
1942ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
1943%endmacro
1944
1945IEMIMPL_FPU_R80_BY_I16 fiadd
1946IEMIMPL_FPU_R80_BY_I16 fimul
1947IEMIMPL_FPU_R80_BY_I16 fisub
1948IEMIMPL_FPU_R80_BY_I16 fisubr
1949IEMIMPL_FPU_R80_BY_I16 fidiv
1950IEMIMPL_FPU_R80_BY_I16 fidivr
1951
1952
1953;;
1954; FPU instruction working on one 80-bit and one 16-bit signed integer value,
1955; only returning FSW.
1956;
1957; @param 1 The instruction
1958;
1959; @param A0 FPU context (fxsave).
1960; @param A1 Where to store the output FSW.
1961; @param A2 Pointer to the 80-bit value.
1962; @param A3 Pointer to the 16-bit value.
1963;
1964%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
1965BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
1966 PROLOGUE_4_ARGS
1967 sub xSP, 20h
1968
1969 fninit
1970 fld tword [A2]
1971 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
1972 %1 word [A3]
1973
1974 fnstsw word [A1]
1975
1976 fninit
1977 add xSP, 20h
1978 EPILOGUE_4_ARGS
1979ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
1980%endmacro
1981
1982IEMIMPL_FPU_R80_BY_I16_FSW ficom
1983
1984
1985
1986;
1987;---------------------- 32-bit signed integer operations ----------------------
1988;
1989
1990
1991;;
1992; Converts a 32-bit signed integer to an 80-bit floating point value (fpu register).
1993;
1994; @param A0 FPU context (fxsave).
1995; @param A1 Pointer to a IEMFPURESULT for the output.
1996; @param A2 Pointer to the 32-bit signed integer value to convert.
1997;
1998BEGINPROC_FASTCALL iemAImpl_fild_i32_to_r80, 12
1999 PROLOGUE_3_ARGS
2000 sub xSP, 20h
2001
2002 fninit
2003 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2004 fild dword [A2]
2005
2006 fnstsw word [A1 + IEMFPURESULT.FSW]
2007 fnclex
2008 fstp tword [A1 + IEMFPURESULT.r80Result]
2009
2010 fninit
2011 add xSP, 20h
2012 EPILOGUE_3_ARGS
2013ENDPROC iemAImpl_fild_i32_to_r80
2014
2015
2016;;
2017; Store an 80-bit floating point value (register) as a 32-bit signed integer (memory).
2018;
2019; @param A0 FPU context (fxsave).
2020; @param A1 Where to return the output FSW.
2021; @param A2 Where to store the 32-bit signed integer value.
2022; @param A3 Pointer to the 80-bit value.
2023;
2024BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
2025 PROLOGUE_4_ARGS
2026 sub xSP, 20h
2027
2028 fninit
2029 fld tword [A3]
2030 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2031 fistp dword [A2]
2032
2033 fnstsw word [A1]
2034
2035 fninit
2036 add xSP, 20h
2037 EPILOGUE_4_ARGS
2038ENDPROC iemAImpl_fist_r80_to_i32
2039
2040
2041;;
2042; Store an 80-bit floating point value (register) as a 32-bit signed integer
2043; (memory) with truncation.
2044;
2045; @param A0 FPU context (fxsave).
2046; @param A1 Where to return the output FSW.
2047; @param A2 Where to store the 32-bit signed integer value.
2048; @param A3 Pointer to the 80-bit value.
2049;
2050BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
2051 PROLOGUE_4_ARGS
2052 sub xSP, 20h
2053
2054 fninit
2055 fld tword [A3]
2056 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2057 fisttp dword [A2]
2058
2059 fnstsw word [A1]
2060
2061 fninit
2062 add xSP, 20h
2063 EPILOGUE_4_ARGS
2064ENDPROC iemAImpl_fistt_r80_to_i32
2065
2066
2067;;
2068; FPU instruction working on one 80-bit and one 32-bit signed integer value.
2069;
2070; @param 1 The instruction
2071;
2072; @param A0 FPU context (fxsave).
2073; @param A1 Pointer to a IEMFPURESULT for the output.
2074; @param A2 Pointer to the 80-bit value.
2075; @param A3 Pointer to the 32-bit value.
2076;
2077%macro IEMIMPL_FPU_R80_BY_I32 1
2078BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
2079 PROLOGUE_4_ARGS
2080 sub xSP, 20h
2081
2082 fninit
2083 fld tword [A2]
2084 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2085 %1 dword [A3]
2086
2087 fnstsw word [A1 + IEMFPURESULT.FSW]
2088 fnclex
2089 fstp tword [A1 + IEMFPURESULT.r80Result]
2090
2091 fninit
2092 add xSP, 20h
2093 EPILOGUE_4_ARGS
2094ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
2095%endmacro
2096
2097IEMIMPL_FPU_R80_BY_I32 fiadd
2098IEMIMPL_FPU_R80_BY_I32 fimul
2099IEMIMPL_FPU_R80_BY_I32 fisub
2100IEMIMPL_FPU_R80_BY_I32 fisubr
2101IEMIMPL_FPU_R80_BY_I32 fidiv
2102IEMIMPL_FPU_R80_BY_I32 fidivr
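;
; For reference, each instantiation above emits one helper following the
; iemAImpl_<instr>_r80_by_i32 naming scheme, e.g. iemAImpl_fiadd_r80_by_i32 for
; the first one; the body is just the macro text with %1 replaced by the
; mnemonic, so the fiadd variant essentially executes (sketch of the expansion,
; not assembled here):
;
;       fld     tword [A2]              ; 80-bit operand -> ST0
;       fiadd   dword [A3]              ; ST0 += 32-bit signed integer
;       fstp    tword [A1 + IEMFPURESULT.r80Result]
;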
2103
2104
2105;;
2106; FPU instruction working on one 80-bit and one 32-bit signed integer value,
2107; only returning FSW.
2108;
2109; @param 1 The instruction
2110;
2111; @param A0 FPU context (fxsave).
2112; @param A1 Where to store the output FSW.
2113; @param A2 Pointer to the 80-bit value.
2114; @param A3 Pointer to the 32-bit value.
2115;
2116%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
2117BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
2118 PROLOGUE_4_ARGS
2119 sub xSP, 20h
2120
2121 fninit
2122 fld tword [A2]
2123 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2124 %1 dword [A3]
2125
2126 fnstsw word [A1]
2127
2128 fninit
2129 add xSP, 20h
2130 EPILOGUE_4_ARGS
2131ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
2132%endmacro
2133
2134IEMIMPL_FPU_R80_BY_I32_FSW ficom
2135
2136
2137
2138;
2139;---------------------- 64-bit signed integer operations ----------------------
2140;
2141
2142
2143;;
2144; Converts a 64-bit signed integer value to an 80-bit floating point one (fpu register).
2145;
2146; @param A0 FPU context (fxsave).
2147; @param A1 Pointer to a IEMFPURESULT for the output.
2148; @param A2 Pointer to the 64-bit signed integer value to convert.
2149;
2150BEGINPROC_FASTCALL iemAImpl_fild_i64_to_r80, 12
2151 PROLOGUE_3_ARGS
2152 sub xSP, 20h
2153
2154 fninit
2155 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2156 fild qword [A2]
2157
2158 fnstsw word [A1 + IEMFPURESULT.FSW]
2159 fnclex
2160 fstp tword [A1 + IEMFPURESULT.r80Result]
2161
2162 fninit
2163 add xSP, 20h
2164 EPILOGUE_3_ARGS
2165ENDPROC iemAImpl_fild_i64_to_r80
2166
2167
2168;;
2169; Store an 80-bit floating point value (register) as a 64-bit signed integer (memory).
2170;
2171; @param A0 FPU context (fxsave).
2172; @param A1 Where to return the output FSW.
2173; @param A2 Where to store the 64-bit signed integer value.
2174; @param A3 Pointer to the 80-bit value.
2175;
2176BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
2177 PROLOGUE_4_ARGS
2178 sub xSP, 20h
2179
2180 fninit
2181 fld tword [A3]
2182 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2183 fistp qword [A2]
2184
2185 fnstsw word [A1]
2186
2187 fninit
2188 add xSP, 20h
2189 EPILOGUE_4_ARGS
2190ENDPROC iemAImpl_fist_r80_to_i64
2191
2192
2193;;
2194; Store an 80-bit floating point value (register) as a 64-bit signed integer
2195; (memory) with truncation.
2196;
2197; @param A0 FPU context (fxsave).
2198; @param A1 Where to return the output FSW.
2199; @param A2 Where to store the 64-bit signed integer value.
2200; @param A3 Pointer to the 80-bit value.
2201;
2202BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
2203 PROLOGUE_4_ARGS
2204 sub xSP, 20h
2205
2206 fninit
2207 fld tword [A3]
2208 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2209 fisttp qword [A2]
2210
2211 fnstsw word [A1]
2212
2213 fninit
2214 add xSP, 20h
2215 EPILOGUE_4_ARGS
2216ENDPROC iemAImpl_fistt_r80_to_i64
2217
2218
2219
2220;
2221;---------------------- 32-bit floating point operations ----------------------
2222;
2223
2224;;
2225; Converts a 32-bit floating point value to an 80-bit one (fpu register).
2226;
2227; @param A0 FPU context (fxsave).
2228; @param A1 Pointer to a IEMFPURESULT for the output.
2229; @param A2 Pointer to the 32-bit floating point value to convert.
2230;
2231BEGINPROC_FASTCALL iemAImpl_fld_r32_to_r80, 12
2232 PROLOGUE_3_ARGS
2233 sub xSP, 20h
2234
2235 fninit
2236 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2237 fld dword [A2]
2238
2239 fnstsw word [A1 + IEMFPURESULT.FSW]
2240 fnclex
2241 fstp tword [A1 + IEMFPURESULT.r80Result]
2242
2243 fninit
2244 add xSP, 20h
2245 EPILOGUE_3_ARGS
2246ENDPROC iemAImpl_fld_r32_to_r80
2247
2248
2249;;
2250; Store an 80-bit floating point value (register) as a 32-bit one (memory).
2251;
2252; @param A0 FPU context (fxsave).
2253; @param A1 Where to return the output FSW.
2254; @param A2 Where to store the 32-bit value.
2255; @param A3 Pointer to the 80-bit value.
2256;
2257BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
2258 PROLOGUE_4_ARGS
2259 sub xSP, 20h
2260
2261 fninit
2262 fld tword [A3]
2263 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2264 fst dword [A2]
2265
2266 fnstsw word [A1]
2267
2268 fninit
2269 add xSP, 20h
2270 EPILOGUE_4_ARGS
2271ENDPROC iemAImpl_fst_r80_to_r32
2272
2273
2274;;
2275; FPU instruction working on one 80-bit and one 32-bit floating point value.
2276;
2277; @param 1 The instruction
2278;
2279; @param A0 FPU context (fxsave).
2280; @param A1 Pointer to a IEMFPURESULT for the output.
2281; @param A2 Pointer to the 80-bit value.
2282; @param A3 Pointer to the 32-bit value.
2283;
2284%macro IEMIMPL_FPU_R80_BY_R32 1
2285BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
2286 PROLOGUE_4_ARGS
2287 sub xSP, 20h
2288
2289 fninit
2290 fld tword [A2]
2291 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2292 %1 dword [A3]
2293
2294 fnstsw word [A1 + IEMFPURESULT.FSW]
2295 fnclex
2296 fstp tword [A1 + IEMFPURESULT.r80Result]
2297
2298 fninit
2299 add xSP, 20h
2300 EPILOGUE_4_ARGS
2301ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
2302%endmacro
2303
2304IEMIMPL_FPU_R80_BY_R32 fadd
2305IEMIMPL_FPU_R80_BY_R32 fmul
2306IEMIMPL_FPU_R80_BY_R32 fsub
2307IEMIMPL_FPU_R80_BY_R32 fsubr
2308IEMIMPL_FPU_R80_BY_R32 fdiv
2309IEMIMPL_FPU_R80_BY_R32 fdivr
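;
; Note for the group above: the 32-bit operand is not explicitly widened; the
; x87 instruction itself converts the memory operand to double extended
; precision before operating on ST0.  Illustrative expansion for the fmul case
; (sketch only, not assembled here):
;
;       fld     tword [A2]              ; ST0 = 80-bit operand
;       fmul    dword [A3]              ; ST0 *= 32-bit float from memory
;       fstp    tword [A1 + IEMFPURESULT.r80Result]
;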
2310
2311
2312;;
2313; FPU instruction working on one 80-bit and one 32-bit floating point value,
2314; only returning FSW.
2315;
2316; @param 1 The instruction
2317;
2318; @param A0 FPU context (fxsave).
2319; @param A1 Where to store the output FSW.
2320; @param A2 Pointer to the 80-bit value.
2321; @param A3 Pointer to the 32-bit value.
2322;
2323%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
2324BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
2325 PROLOGUE_4_ARGS
2326 sub xSP, 20h
2327
2328 fninit
2329 fld tword [A2]
2330 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2331 %1 dword [A3]
2332
2333 fnstsw word [A1]
2334
2335 fninit
2336 add xSP, 20h
2337 EPILOGUE_4_ARGS
2338ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
2339%endmacro
2340
2341IEMIMPL_FPU_R80_BY_R32_FSW fcom
2342
2343
2344
2345;
2346;---------------------- 64-bit floating point operations ----------------------
2347;
2348
2349;;
2350; Converts a 64-bit floating point value to an 80-bit one (fpu register).
2351;
2352; @param A0 FPU context (fxsave).
2353; @param A1 Pointer to a IEMFPURESULT for the output.
2354; @param A2 Pointer to the 64-bit floating point value to convert.
2355;
2356BEGINPROC_FASTCALL iemAImpl_fld_r64_to_r80, 12
2357 PROLOGUE_3_ARGS
2358 sub xSP, 20h
2359
     fninit
2360 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2361 fld qword [A2]
2362
2363 fnstsw word [A1 + IEMFPURESULT.FSW]
2364 fnclex
2365 fstp tword [A1 + IEMFPURESULT.r80Result]
2366
2367 fninit
2368 add xSP, 20h
2369 EPILOGUE_3_ARGS
2370ENDPROC iemAImpl_fld_r64_to_r80
2371
2372
2373;;
2374; Store an 80-bit floating point value (register) as a 64-bit one (memory).
2375;
2376; @param A0 FPU context (fxsave).
2377; @param A1 Where to return the output FSW.
2378; @param A2 Where to store the 64-bit value.
2379; @param A3 Pointer to the 80-bit value.
2380;
2381BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
2382 PROLOGUE_4_ARGS
2383 sub xSP, 20h
2384
2385 fninit
2386 fld tword [A3]
2387 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2388 fst qword [A2]
2389
2390 fnstsw word [A1]
2391
2392 fninit
2393 add xSP, 20h
2394 EPILOGUE_4_ARGS
2395ENDPROC iemAImpl_fst_r80_to_r64
2396
2397
2398;;
2399; FPU instruction working on one 80-bit and one 64-bit floating point value.
2400;
2401; @param 1 The instruction
2402;
2403; @param A0 FPU context (fxsave).
2404; @param A1 Pointer to a IEMFPURESULT for the output.
2405; @param A2 Pointer to the 80-bit value.
2406; @param A3 Pointer to the 64-bit value.
2407;
2408%macro IEMIMPL_FPU_R80_BY_R64 1
2409BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
2410 PROLOGUE_4_ARGS
2411 sub xSP, 20h
2412
2413 fninit
2414 fld tword [A2]
2415 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2416 %1 qword [A3]
2417
2418 fnstsw word [A1 + IEMFPURESULT.FSW]
2419 fnclex
2420 fstp tword [A1 + IEMFPURESULT.r80Result]
2421
2422 fninit
2423 add xSP, 20h
2424 EPILOGUE_4_ARGS
2425ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
2426%endmacro
2427
2428IEMIMPL_FPU_R80_BY_R64 fadd
2429IEMIMPL_FPU_R80_BY_R64 fmul
2430IEMIMPL_FPU_R80_BY_R64 fsub
2431IEMIMPL_FPU_R80_BY_R64 fsubr
2432IEMIMPL_FPU_R80_BY_R64 fdiv
2433IEMIMPL_FPU_R80_BY_R64 fdivr
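;
; Illustrative C-side shape shared by the six helpers above (a sketch; the
; parameter type names are assumptions, see the IEM headers for the real
; prototypes):
;
;   void iemAImpl_fadd_r80_by_r64(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
;                                 PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2);
;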
2434
2435;;
2436; FPU instruction working on one 80-bit and one 64-bit floating point value,
2437; only returning FSW.
2438;
2439; @param 1 The instruction
2440;
2441; @param A0 FPU context (fxsave).
2442; @param A1 Where to store the output FSW.
2443; @param A2 Pointer to the 80-bit value.
2444; @param A3 Pointer to the 64-bit value.
2445;
2446%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
2447BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
2448 PROLOGUE_4_ARGS
2449 sub xSP, 20h
2450
2451 fninit
2452 fld tword [A2]
2453 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2454 %1 qword [A3]
2455
2456 fnstsw word [A1]
2457
2458 fninit
2459 add xSP, 20h
2460 EPILOGUE_4_ARGS
2461ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
2462%endmacro
2463
2464IEMIMPL_FPU_R80_BY_R64_FSW fcom
2465
2466
2467
2468;
2469;---------------------- 80-bit floating point operations ----------------------
2470;
2471
2472;;
2473; Loads an 80-bit floating point register value from memory.
2474;
2475; @param A0 FPU context (fxsave).
2476; @param A1 Pointer to a IEMFPURESULT for the output.
2477; @param A2 Pointer to the 80-bit floating point value to load.
2478;
2479BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
2480 PROLOGUE_3_ARGS
2481 sub xSP, 20h
2482
2483 fninit
2484 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2485 fld tword [A2]
2486
2487 fnstsw word [A1 + IEMFPURESULT.FSW]
2488 fnclex
2489 fstp tword [A1 + IEMFPURESULT.r80Result]
2490
2491 fninit
2492 add xSP, 20h
2493 EPILOGUE_3_ARGS
2494ENDPROC iemAImpl_fld_r80_from_r80
2495
2496
2497;;
2498; Store an 80-bit floating point register to memory.
2499;
2500; @param A0 FPU context (fxsave).
2501; @param A1 Where to return the output FSW.
2502; @param A2 Where to store the 80-bit value.
2503; @param A3 Pointer to the 80-bit register value.
2504;
2505BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
2506 PROLOGUE_4_ARGS
2507 sub xSP, 20h
2508
2509 fninit
2510 fld tword [A3]
2511 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2512 fstp tword [A2]
2513
2514 fnstsw word [A1]
2515
2516 fninit
2517 add xSP, 20h
2518 EPILOGUE_4_ARGS
2519ENDPROC iemAImpl_fst_r80_to_r80
2520
2521
2522;;
2523; Loads an 80-bit floating point register value in BCD format from memory.
2524;
2525; @param A0 FPU context (fxsave).
2526; @param A1 Pointer to a IEMFPURESULT for the output.
2527; @param A2 Pointer to the 80-bit BCD value to load.
2528;
2529BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
2530 PROLOGUE_3_ARGS
2531 sub xSP, 20h
2532
2533 fninit
2534 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2535 fbld tword [A2]
2536
2537 fnstsw word [A1 + IEMFPURESULT.FSW]
2538 fnclex
2539 fstp tword [A1 + IEMFPURESULT.r80Result]
2540
2541 fninit
2542 add xSP, 20h
2543 EPILOGUE_3_ARGS
2544ENDPROC iemAImpl_fld_r80_from_d80
2545
2546
2547;;
2548; Store an 80-bit floating point register to memory as BCD.
2549;
2550; @param A0 FPU context (fxsave).
2551; @param A1 Where to return the output FSW.
2552; @param A2 Where to store the 80-bit BCD value.
2553; @param A3 Pointer to the 80-bit register value.
2554;
2555BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
2556 PROLOGUE_4_ARGS
2557 sub xSP, 20h
2558
2559 fninit
2560 fld tword [A3]
2561 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2562 fbstp tword [A2]
2563
2564 fnstsw word [A1]
2565
2566 fninit
2567 add xSP, 20h
2568 EPILOGUE_4_ARGS
2569ENDPROC iemAImpl_fst_r80_to_d80
2570
2571
2572;;
2573; FPU instruction working on two 80-bit floating point values.
2574;
2575; @param 1 The instruction
2576;
2577; @param A0 FPU context (fxsave).
2578; @param A1 Pointer to a IEMFPURESULT for the output.
2579; @param A2 Pointer to the first 80-bit value (ST0)
2580; @param A3 Pointer to the second 80-bit value (STn).
2581;
2582%macro IEMIMPL_FPU_R80_BY_R80 2
2583BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
2584 PROLOGUE_4_ARGS
2585 sub xSP, 20h
2586
2587 fninit
2588 fld tword [A3]
2589 fld tword [A2]
2590 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2591 %1 %2
2592
2593 fnstsw word [A1 + IEMFPURESULT.FSW]
2594 fnclex
2595 fstp tword [A1 + IEMFPURESULT.r80Result]
2596
2597 fninit
2598 add xSP, 20h
2599 EPILOGUE_4_ARGS
2600ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
2601%endmacro
2602
2603IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1}
2604IEMIMPL_FPU_R80_BY_R80 fmul, {st0, st1}
2605IEMIMPL_FPU_R80_BY_R80 fsub, {st0, st1}
2606IEMIMPL_FPU_R80_BY_R80 fsubr, {st0, st1}
2607IEMIMPL_FPU_R80_BY_R80 fdiv, {st0, st1}
2608IEMIMPL_FPU_R80_BY_R80 fdivr, {st0, st1}
2609IEMIMPL_FPU_R80_BY_R80 fprem, {}
2610IEMIMPL_FPU_R80_BY_R80 fprem1, {}
2611IEMIMPL_FPU_R80_BY_R80 fscale, {}
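;
; The second macro argument above supplies the explicit register operands: for
; the arithmetic instructions '%1 %2' expands to e.g. 'fadd st0, st1', while
; fprem, fprem1 and fscale take an empty brace pair because they implicitly
; operate on ST0 and ST1 and encode no operands at all.
;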
2612
2613
2614;;
2615; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
2616; storing the result in ST1 and popping the stack.
2617;
2618; @param 1 The instruction
2619;
2620; @param A0 FPU context (fxsave).
2621; @param A1 Pointer to a IEMFPURESULT for the output.
2622; @param A2 Pointer to the first 80-bit value (ST1).
2623; @param A3 Pointer to the second 80-bit value (ST0).
2624;
2625%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
2626BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
2627 PROLOGUE_4_ARGS
2628 sub xSP, 20h
2629
2630 fninit
2631 fld tword [A2]
2632 fld tword [A3]
2633 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2634 %1
2635
2636 fnstsw word [A1 + IEMFPURESULT.FSW]
2637 fnclex
2638 fstp tword [A1 + IEMFPURESULT.r80Result]
2639
2640 fninit
2641 add xSP, 20h
2642 EPILOGUE_4_ARGS
2643ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
2644%endmacro
2645
2646IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
2647IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
2648IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
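;
; Load order note for the macro above: the value destined for ST1 (A2) is
; loaded first and the ST0 value (A3) second, so the guest stack layout is
; reproduced before e.g. 'fpatan' runs.  These instructions store into ST1 and
; pop, leaving the result on top of the stack, which is what the single fstp
; writes back to IEMFPURESULT.r80Result.
;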
2649
2650
2651;;
2652; FPU instruction working on two 80-bit floating point values, only
2653; returning FSW.
2654;
2655; @param 1 The instruction
2656;
2657; @param A0 FPU context (fxsave).
2658; @param A1 Pointer to a uint16_t for the resulting FSW.
2659; @param A2 Pointer to the first 80-bit value.
2660; @param A3 Pointer to the second 80-bit value.
2661;
2662%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
2663BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
2664 PROLOGUE_4_ARGS
2665 sub xSP, 20h
2666
2667 fninit
2668 fld tword [A3]
2669 fld tword [A2]
2670 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2671 %1 st0, st1
2672
2673 fnstsw word [A1]
2674
2675 fninit
2676 add xSP, 20h
2677 EPILOGUE_4_ARGS
2678ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
2679%endmacro
2680
2681IEMIMPL_FPU_R80_BY_R80_FSW fcom
2682IEMIMPL_FPU_R80_BY_R80_FSW fucom
2683
2684
2685;;
2686; FPU instruction working on two 80-bit floating point values,
2687; returning FSW and EFLAGS (eax).
2688;
2689; @param 1 The instruction
2690;
2691; @returns EFLAGS in EAX.
2692; @param A0 FPU context (fxsave).
2693; @param A1 Pointer to a uint16_t for the resulting FSW.
2694; @param A2 Pointer to the first 80-bit value.
2695; @param A3 Pointer to the second 80-bit value.
2696;
2697%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
2698BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
2699 PROLOGUE_4_ARGS
2700 sub xSP, 20h
2701
2702 fninit
2703 fld tword [A3]
2704 fld tword [A2]
2705 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2706 %1 st1
2707
2708 fnstsw word [A1]
2709 pushf
2710 pop xAX
2711
2712 fninit
2713 add xSP, 20h
2714 EPILOGUE_4_ARGS
2715ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
2716%endmacro
2717
2718IEMIMPL_FPU_R80_BY_R80_EFL fcomi
2719IEMIMPL_FPU_R80_BY_R80_EFL fucomi
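;
; fcomi/fucomi set ZF/PF/CF directly, so the macro above captures the flags
; with pushf/pop and hands them back in xAX.  Rough C-side shape (a sketch;
; the parameter type names are assumptions):
;
;   uint32_t iemAImpl_fcomi_r80_by_r80(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw,
;                                      PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2);
;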
2720
2721
2722;;
2723; FPU instruction working on one 80-bit floating point value.
2724;
2725; @param 1 The instruction
2726;
2727; @param A0 FPU context (fxsave).
2728; @param A1 Pointer to a IEMFPURESULT for the output.
2729; @param A2 Pointer to the 80-bit value.
2730;
2731%macro IEMIMPL_FPU_R80 1
2732BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
2733 PROLOGUE_3_ARGS
2734 sub xSP, 20h
2735
2736 fninit
2737 fld tword [A2]
2738 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2739 %1
2740
2741 fnstsw word [A1 + IEMFPURESULT.FSW]
2742 fnclex
2743 fstp tword [A1 + IEMFPURESULT.r80Result]
2744
2745 fninit
2746 add xSP, 20h
2747 EPILOGUE_3_ARGS
2748ENDPROC iemAImpl_ %+ %1 %+ _r80
2749%endmacro
2750
2751IEMIMPL_FPU_R80 fchs
2752IEMIMPL_FPU_R80 fabs
2753IEMIMPL_FPU_R80 f2xm1
2754IEMIMPL_FPU_R80 fsqrt
2755IEMIMPL_FPU_R80 frndint
2756IEMIMPL_FPU_R80 fsin
2757IEMIMPL_FPU_R80 fcos
2758
2759
2760;;
2761; FPU instruction working on one 80-bit floating point value, only
2762; returning FSW.
2763;
2764; @param 1 The instruction
2765;
2766; @param A0 FPU context (fxsave).
2767; @param A1 Pointer to a uint16_t for the resulting FSW.
2768; @param A2 Pointer to the 80-bit value.
2769;
2770%macro IEMIMPL_FPU_R80_FSW 1
2771BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
2772 PROLOGUE_3_ARGS
2773 sub xSP, 20h
2774
2775 fninit
2776 fld tword [A2]
2777 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2778 %1
2779
2780 fnstsw word [A1]
2781
2782 fninit
2783 add xSP, 20h
2784 EPILOGUE_3_ARGS
2785ENDPROC iemAImpl_ %+ %1 %+ _r80
2786%endmacro
2787
2788IEMIMPL_FPU_R80_FSW ftst
2789IEMIMPL_FPU_R80_FSW fxam
2790
2791
2792
2793;;
2794; FPU instruction loading an 80-bit floating point constant.
2795;
2796; @param 1 The instruction
2797;
2798; @param A0 FPU context (fxsave).
2799; @param A1 Pointer to a IEMFPURESULT for the output.
2800;
2801%macro IEMIMPL_FPU_R80_CONST 1
2802BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
2803 PROLOGUE_2_ARGS
2804 sub xSP, 20h
2805
2806 fninit
2807 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2808 %1
2809
2810 fnstsw word [A1 + IEMFPURESULT.FSW]
2811 fnclex
2812 fstp tword [A1 + IEMFPURESULT.r80Result]
2813
2814 fninit
2815 add xSP, 20h
2816 EPILOGUE_2_ARGS
2817ENDPROC iemAImpl_ %+ %1
2818%endmacro
2819
2820IEMIMPL_FPU_R80_CONST fld1
2821IEMIMPL_FPU_R80_CONST fldl2t
2822IEMIMPL_FPU_R80_CONST fldl2e
2823IEMIMPL_FPU_R80_CONST fldpi
2824IEMIMPL_FPU_R80_CONST fldlg2
2825IEMIMPL_FPU_R80_CONST fldln2
2826IEMIMPL_FPU_R80_CONST fldz
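;
; Each line above generates one constant-loading helper, e.g. iemAImpl_fldpi.
; Rough C-side shape (a sketch; type names are assumptions):
;
;   void iemAImpl_fldpi(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes);
;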
2827
2828
2829;;
2830; FPU instruction working on one 80-bit floating point value, outputting two.
2831;
2832; @param 1 The instruction
2833;
2834; @param A0 FPU context (fxsave).
2835; @param A1 Pointer to a IEMFPURESULTTWO for the output.
2836; @param A2 Pointer to the 80-bit value.
2837;
2838%macro IEMIMPL_FPU_R80_R80 1
2839BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
2840 PROLOGUE_3_ARGS
2841 sub xSP, 20h
2842
2843 fninit
2844 fld tword [A2]
2845 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2846 %1
2847
2848 fnstsw word [A1 + IEMFPURESULTTWO.FSW]
2849 fnclex
2850 fstp tword [A1 + IEMFPURESULTTWO.r80Result2]
2851 fnclex
2852 fstp tword [A1 + IEMFPURESULTTWO.r80Result1]
2853
2854 fninit
2855 add xSP, 20h
2856 EPILOGUE_3_ARGS
2857ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
2858%endmacro
2859
2860IEMIMPL_FPU_R80_R80 fptan
2861IEMIMPL_FPU_R80_R80 fxtract
2862IEMIMPL_FPU_R80_R80 fsincos
2863
2864
2865
2866
2867;---------------------- SSE and MMX Operations ----------------------
2868
2869;; @todo what do we need to do for MMX?
2870%macro IEMIMPL_MMX_PROLOGUE 0
2871%endmacro
2872%macro IEMIMPL_MMX_EPILOGUE 0
2873%endmacro
2874
2875;; @todo what do we need to do for SSE?
2876%macro IEMIMPL_SSE_PROLOGUE 0
2877%endmacro
2878%macro IEMIMPL_SSE_EPILOGUE 0
2879%endmacro
2880
2881
2882;;
2883; Media instruction working on two full sized registers.
2884;
2885; @param 1 The instruction
2886;
2887; @param A0 FPU context (fxsave).
2888; @param A1 Pointer to the first media register size operand (input/output).
2889; @param A2 Pointer to the second media register size operand (input).
2890;
2891%macro IEMIMPL_MEDIA_F2 1
2892BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
2893 PROLOGUE_3_ARGS
2894 IEMIMPL_MMX_PROLOGUE
2895
2896 movq mm0, [A1]
2897 movq mm1, [A2]
2898 %1 mm0, mm1
2899 movq [A1], mm0
2900
2901 IEMIMPL_MMX_EPILOGUE
2902 EPILOGUE_3_ARGS
2903ENDPROC iemAImpl_ %+ %1 %+ _u64
2904
2905BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
2906 PROLOGUE_3_ARGS
2907 IEMIMPL_SSE_PROLOGUE
2908
2909 movdqu xmm0, [A1]
2910 movdqu xmm1, [A2]
2911 %1 xmm0, xmm1
2912 movdqu [A1], xmm0
2913
2914 IEMIMPL_SSE_EPILOGUE
2915 EPILOGUE_3_ARGS
2916ENDPROC iemAImpl_ %+ %1 %+ _u128
2917%endmacro
2918
2919IEMIMPL_MEDIA_F2 pxor
2920IEMIMPL_MEDIA_F2 pcmpeqb
2921IEMIMPL_MEDIA_F2 pcmpeqw
2922IEMIMPL_MEDIA_F2 pcmpeqd
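;
; Each instantiation above emits a pair of helpers, e.g. iemAImpl_pxor_u64 for
; the MMX form and iemAImpl_pxor_u128 for the SSE form.  Rough C-side shape
; (a sketch; the parameter type names are assumptions):
;
;   void iemAImpl_pxor_u64(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc);
;   void iemAImpl_pxor_u128(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc);
;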
2923
2924
2925;;
2926; Media instruction working on one full sized and one half sized register (lower half).
2927;
2928; @param 1 The instruction
2929; @param 2 1 if MMX is included, 0 if not.
2930;
2931; @param A0 FPU context (fxsave).
2932; @param A1 Pointer to the first full sized media register operand (input/output).
2933; @param A2 Pointer to the second half sized media register operand (input).
2934;
2935%macro IEMIMPL_MEDIA_F1L1 2
2936 %if %2 != 0
2937BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
2938 PROLOGUE_3_ARGS
2939 IEMIMPL_MMX_PROLOGUE
2940
2941 movq mm0, [A1]
2942 movd mm1, [A2]
2943 %1 mm0, mm1
2944 movq [A1], mm0
2945
2946 IEMIMPL_MMX_EPILOGUE
2947 EPILOGUE_3_ARGS
2948ENDPROC iemAImpl_ %+ %1 %+ _u64
2949 %endif
2950
2951BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
2952 PROLOGUE_3_ARGS
2953 IEMIMPL_SSE_PROLOGUE
2954
2955 movdqu xmm0, [A1]
2956 movq xmm1, [A2]
2957 %1 xmm0, xmm1
2958 movdqu [A1], xmm0
2959
2960 IEMIMPL_SSE_EPILOGUE
2961 EPILOGUE_3_ARGS
2962ENDPROC iemAImpl_ %+ %1 %+ _u128
2963%endmacro
2964
2965IEMIMPL_MEDIA_F1L1 punpcklbw, 1
2966IEMIMPL_MEDIA_F1L1 punpcklwd, 1
2967IEMIMPL_MEDIA_F1L1 punpckldq, 1
2968IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
2969
2970
2971;;
2972; Media instruction working on one full sized and one half sized register (high half).
2973;
2974; @param 1 The instruction
2975; @param 2 1 if MMX is included, 0 if not.
2976;
2977; @param A0 FPU context (fxsave).
2978; @param A1 Pointer to the first full sized media register operand (input/output).
2979; @param A2 Pointer to the second full sized media register operand, where we
2980; will only use the upper half (input).
2981;
2982%macro IEMIMPL_MEDIA_F1H1 2
2983 %if %2 != 0
2984BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
2985 PROLOGUE_3_ARGS
2986 IEMIMPL_MMX_PROLOGUE
2987
2988 movq mm0, [A1]
2989 movq mm1, [A2]
2990 %1 mm0, mm1
2991 movq [A1], mm0
2992
2993 IEMIMPL_MMX_EPILOGUE
2994 EPILOGUE_3_ARGS
2995ENDPROC iemAImpl_ %+ %1 %+ _u64
2996 %endif
2997
2998BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
2999 PROLOGUE_3_ARGS
3000 IEMIMPL_SSE_PROLOGUE
3001
3002 movdqu xmm0, [A1]
3003 movdqu xmm1, [A2]
3004 %1 xmm0, xmm1
3005 movdqu [A1], xmm0
3006
3007 IEMIMPL_SSE_EPILOGUE
3008 EPILOGUE_3_ARGS
3009ENDPROC iemAImpl_ %+ %1 %+ _u128
3010%endmacro
3011
3012IEMIMPL_MEDIA_F1H1 punpckhbw, 1
3013IEMIMPL_MEDIA_F1H1 punpckhwd, 1
3014IEMIMPL_MEDIA_F1H1 punpckhdq, 1
3015IEMIMPL_MEDIA_F1H1 punpckhqdq, 0
3016
3017
3018;
3019; Shufflers with evil 8-bit immediates.
3020;
3021
3022BEGINPROC_FASTCALL iemAImpl_pshufw, 16
3023 PROLOGUE_4_ARGS
3024 IEMIMPL_MMX_PROLOGUE
3025
3026 movq mm0, [A1]
3027 movq mm1, [A2]
3028 lea T0, [A3 + A3*4] ; sizeof(pshufw+ret) == 5
3029 lea T1, [.imm0 xWrtRIP]
3030 lea T1, [T1 + T0]
3031 call T1
3032 movq [A1], mm0
3033
3034 IEMIMPL_MMX_EPILOGUE
3035 EPILOGUE_4_ARGS
3036%assign bImm 0
3037%rep 256
3038.imm %+ bImm:
3039 pshufw mm0, mm1, bImm
3040 ret
3041 %assign bImm bImm + 1
3042%endrep
3043.immEnd: ; 256*5 == 0x500
3044dw 0xfaff + (.immEnd - .imm0) ; will cause warning if entries are too big.
3045dw 0x104ff - (.immEnd - .imm0) ; will cause warning if entries are too small.
3046ENDPROC iemAImpl_pshufw
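;
; How the immediate dispatch above works: pshufw needs its shuffle mask encoded
; as an instruction immediate, so the %rep block materialises 256 five-byte
; 'pshufw mm0, mm1, imm8 / ret' stubs and the entry code computes the target as
; .imm0 + bImm * 5 (the 'lea T0, [A3 + A3*4]').  Sketch of the effective call,
; assuming an immediate of 0x1b:
;
;       lea     T1, [.imm0 xWrtRIP]
;       lea     T1, [T1 + 0x1b * 5]
;       call    T1                      ; executes 'pshufw mm0, mm1, 0x1b'
;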
3047
3048
3049%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
3050BEGINPROC_FASTCALL iemAImpl_ %+ %1, 16
3051 PROLOGUE_4_ARGS
3052 IEMIMPL_SSE_PROLOGUE
3053
3054 movdqu xmm0, [A1]
3055 movdqu xmm1, [A2]
3056 lea T1, [.imm0 xWrtRIP]
3057 lea T0, [A3 + A3*2] ; sizeof(pshufXX+ret) == 6: (A3 * 3) *2
3058 lea T1, [T1 + T0*2]
3059 call T1
3060 movdqu [A1], xmm0
3061
3062 IEMIMPL_SSE_EPILOGUE
3063 EPILOGUE_4_ARGS
3064 %assign bImm 0
3065 %rep 256
3066.imm %+ bImm:
3067 %1 xmm0, xmm1, bImm
3068 ret
3069 %assign bImm bImm + 1
3070 %endrep
3071.immEnd: ; 256*6 == 0x600
3072dw 0xf9ff + (.immEnd - .imm0) ; will cause warning if entries are too big.
3073dw 0x105ff - (.immEnd - .imm0) ; will cause warning if entries are too small.
3074ENDPROC iemAImpl_ %+ %1
3075%endmacro
3076
3077IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
3078IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
3079IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
3080
3081
3082;
3083; Move byte mask.
3084;
3085
3086BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 12
3087 PROLOGUE_3_ARGS
3088 IEMIMPL_MMX_PROLOGUE
3089
3090 mov T0, [A1]
3091 movq mm1, [A2]
3092 pmovmskb T0, mm1
3093 mov [A1], T0
3094%ifdef RT_ARCH_X86
3095 mov dword [A1 + 4], 0
3096%endif
3097 IEMIMPL_MMX_EPILOGUE
3098 EPILOGUE_3_ARGS
3099ENDPROC iemAImpl_pmovmskb_u64
3100
3101BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 12
3102 PROLOGUE_3_ARGS
3103 IEMIMPL_SSE_PROLOGUE
3104
3105 mov T0, [A1]
3106 movdqu xmm1, [A2]
3107 pmovmskb T0, xmm1
3108 mov [A1], T0
3109%ifdef RT_ARCH_X86
3110 mov dword [A1 + 4], 0
3111%endif
3112 IEMIMPL_SSE_EPILOGUE
3113 EPILOGUE_3_ARGS
3114ENDPROC iemAImpl_pmovmskb_u128
3115