IEMAllAImpl.asm@ 47681

Last change on this file since 47681 was 47548, checked in by vboxsync, 11 years ago
IEM: Bunch of fixes, mostly DOS related.
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 81.3 KB

Line
1	; $Id: IEMAllAImpl.asm 47548 2013-08-06 03:58:21Z vboxsync $
2	;; @file
3	; IEM - Instruction Implementation in Assembly.
4	;
5
6	; Copyright (C) 2011-2012 Oracle Corporation
7	;
8	; This file is part of VirtualBox Open Source Edition (OSE), as
9	; available from http://www.virtualbox.org. This file is free software;
10	; you can redistribute it and/or modify it under the terms of the GNU
11	; General Public License (GPL) as published by the Free Software
12	; Foundation, in version 2 as it comes in the "COPYING" file of the
13	; VirtualBox OSE distribution. VirtualBox OSE is distributed in the
14	; hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
15	;
16
17
18	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
19	; Header Files ;
20	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
21	%include "VBox/asmdefs.mac"
22	%include "VBox/err.mac"
23	%include "iprt/x86.mac"
24
25
26	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
27	; Defined Constants And Macros ;
28	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
30	;;
31	; RET XX / RET wrapper for fastcall.
32	;
	; Emits 'ret %1' only when both RT_ARCH_X86 and RT_OS_WINDOWS are defined;
	; every other configuration gets a plain 'ret'.
	;
	; @param 1 The byte count handed to 'ret' in the 32-bit Windows case.
	;
33	%macro RET_FASTCALL 1
34	%ifdef RT_ARCH_X86
35	%ifdef RT_OS_WINDOWS
36	ret %1 ; return and release %1 bytes of stack
37	%else
38	ret
39	%endif
40	%else
41	ret
42	%endif
43	%endmacro
44
45	;;
46	; NAME for fastcall functions.
47	;
	; By default this simply expands to NAME(a_Name). When both RT_ARCH_X86 and
	; RT_OS_WINDOWS are defined it is redefined to paste together
	; a_Prefix, a_Name, '@' and a_cbArgs instead.
	;
48	;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
49	; escaping (or whatever the dollar is good for here). Thus the ugly
50	; prefix argument.
51	;
52	%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
53	%ifdef RT_ARCH_X86
54	%ifdef RT_OS_WINDOWS
55	%undef NAME_FASTCALL
56	%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
57	%endif
58	%endif
59
60	;;
61	; BEGINPROC for fastcall functions.
62	;
	; Emits an 'export' directive when ASM_FORMAT_PE is defined (or when both
	; __NASM__ and ASM_FORMAT_OMF are defined), a 'global' directive unless
	; ASM_FORMAT_BIN is defined, and finally the entry label itself.
	;
63	; @param 1 The function name (C).
64	; @param 2 The argument size on x86.
65	;
66	%macro BEGINPROC_FASTCALL 2
67	%ifdef ASM_FORMAT_PE
68	export %1=NAME_FASTCALL(%1,%2,$@)
69	%endif
70	%ifdef __NASM__
71	%ifdef ASM_FORMAT_OMF
72	export NAME(%1) NAME_FASTCALL(%1,%2,$@)
73	%endif
74	%endif
75	%ifndef ASM_FORMAT_BIN
76	global NAME_FASTCALL(%1,%2,$@)
77	%endif
	; Note: the label definition passes a bare '@' prefix, the directives above pass '$@'.
78	NAME_FASTCALL(%1,%2,@):
79	%endmacro
80
81
82	;
83	; We employ some macro assembly here to hide the calling convention differences.
84	;
	; AMD64 flavour: A0..A3 are all register aliases in this configuration, so
	; the prologues are empty and every epilogue is a plain 'ret'. The _EX
	; variants accept (and ignore) the stack byte count that the 32-bit
	; versions hand to 'ret'.
	;
85	%ifdef RT_ARCH_AMD64
86	%macro PROLOGUE_1_ARGS 0
87	%endmacro
88	%macro EPILOGUE_1_ARGS 0
89	ret
90	%endmacro
	; Takes zero or one parameter: the 32-bit variant and all the other _EX
	; macros take a byte count, so it is accepted (and ignored) here as well.
	; The zero parameter form is kept for existing users.
91	%macro EPILOGUE_1_ARGS_EX 0-1
92	ret
93	%endmacro
94
95	%macro PROLOGUE_2_ARGS 0
96	%endmacro
97	%macro EPILOGUE_2_ARGS 0
98	ret
99	%endmacro
100	%macro EPILOGUE_2_ARGS_EX 1
101	ret
102	%endmacro
103
104	%macro PROLOGUE_3_ARGS 0
105	%endmacro
106	%macro EPILOGUE_3_ARGS 0
107	ret
108	%endmacro
109	%macro EPILOGUE_3_ARGS_EX 1
110	ret
111	%endmacro
112
113	%macro PROLOGUE_4_ARGS 0
114	%endmacro
115	%macro EPILOGUE_4_ARGS 0
116	ret
117	%endmacro
118	%macro EPILOGUE_4_ARGS_EX 1
119	ret
120	%endmacro
121
122	%ifdef ASM_CALL64_GCC
	; ASM_CALL64_GCC: arguments 0..3 are rdi, rsi, rdx and rcx.
	; The _32/_16/_8 suffixes name the narrower views of the same register.
	; No A3_8 alias is defined.
123	%define A0 rdi
124	%define A0_32 edi
125	%define A0_16 di
126	%define A0_8 dil
127
128	%define A1 rsi
129	%define A1_32 esi
130	%define A1_16 si
131	%define A1_8 sil
132
133	%define A2 rdx
134	%define A2_32 edx
135	%define A2_16 dx
136	%define A2_8 dl
137
138	%define A3 rcx
139	%define A3_32 ecx
140	%define A3_16 cx
141	%endif
142
143	%ifdef ASM_CALL64_MSC
	; ASM_CALL64_MSC: arguments 0..3 are rcx, rdx, r8 and r9.
	; No A3_8 alias is defined.
144	%define A0 rcx
145	%define A0_32 ecx
146	%define A0_16 cx
147	%define A0_8 cl
148
149	%define A1 rdx
150	%define A1_32 edx
151	%define A1_16 dx
152	%define A1_8 dl
153
154	%define A2 r8
155	%define A2_32 r8d
156	%define A2_16 r8w
157	%define A2_8 r8b
158
159	%define A3 r9
160	%define A3_32 r9d
161	%define A3_16 r9w
162	%endif
163
	; Temporaries shared by both 64-bit conventions: T0 is rax, T1 is r11.
164	%define T0 rax
165	%define T0_32 eax
166	%define T0_16 ax
167	%define T0_8 al
168
169	%define T1 r11
170	%define T1_32 r11d
171	%define T1_16 r11w
172	%define T1_8 r11b
173
174	%else
175	; x86
	; Every prologue saves edi (which is T1 in this configuration). A2 and A3
	; are fetched from the stack into ebx and esi, which are saved first.
	; The epilogues restore in reverse order and 'ret' with a byte count.
176	%macro PROLOGUE_1_ARGS 0
177	push edi
178	%endmacro
179	%macro EPILOGUE_1_ARGS 0
180	pop edi
181	ret 0
182	%endmacro
183	%macro EPILOGUE_1_ARGS_EX 1
184	pop edi
185	ret %1
186	%endmacro
187
188	%macro PROLOGUE_2_ARGS 0
189	push edi
190	%endmacro
191	%macro EPILOGUE_2_ARGS 0
192	pop edi
193	ret 0
194	%endmacro
195	%macro EPILOGUE_2_ARGS_EX 1
196	pop edi
197	ret %1
198	%endmacro
199
200	%macro PROLOGUE_3_ARGS 0
201	push ebx
202	mov ebx, [esp + 4 + 4] ; skip the saved ebx and the return address to reach the first stack argument
203	push edi
204	%endmacro
205	%macro EPILOGUE_3_ARGS_EX 1
206	%if (%1) < 4
207	%error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
208	%endif
209	pop edi
210	pop ebx
211	ret %1
212	%endmacro
213	%macro EPILOGUE_3_ARGS 0
214	EPILOGUE_3_ARGS_EX 4
215	%endmacro
216
217	%macro PROLOGUE_4_ARGS 0
218	push ebx
219	push edi
220	push esi
221	mov ebx, [esp + 12 + 4 + 0] ; 12 bytes of saved registers + return address, then the first stack argument
222	mov esi, [esp + 12 + 4 + 4] ; the second stack argument
223	%endmacro
224	%macro EPILOGUE_4_ARGS_EX 1
225	%if (%1) < 8
226	%error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
227	%endif
228	pop esi
229	pop edi
230	pop ebx
231	ret %1
232	%endmacro
233	%macro EPILOGUE_4_ARGS 0
234	EPILOGUE_4_ARGS_EX 8
235	%endmacro
236
	; 32-bit register aliases: A0 is ecx and A1 is edx; A2 (ebx) and A3 (esi)
	; are the registers the 3 and 4 argument prologues load from the stack.
237	%define A0 ecx
238	%define A0_32 ecx
239	%define A0_16 cx
240	%define A0_8 cl
241
242	%define A1 edx
243	%define A1_32 edx
244	%define A1_16 dx
245	%define A1_8 dl
246
247	%define A2 ebx
248	%define A2_32 ebx
249	%define A2_16 bx
250	%define A2_8 bl
251
252	%define A3 esi
253	%define A3_32 esi
254	%define A3_16 si
255
	; Temporaries: T0 is eax, T1 is edi. No 8-bit alias is defined for A3 or T1.
256	%define T0 eax
257	%define T0_32 eax
258	%define T0_16 ax
259	%define T0_8 al
260
261	%define T1 edi
262	%define T1_32 edi
263	%define T1_16 di
264	%endif
265
266
267	;;
268	; Load the guest flags selected by %2 and %3 from [%1] into the host EFLAGS.
269	;
	; All other EFLAGS bits keep their current host value. The load is
	; currently unconditional: the '%if (%3) != 0' guard is commented out.
	;
270	; @remarks Clobbers T0, stack. Changes EFLAGS.
271	; @remarks Callers in this file pass A1..A3 as well as r9, r10 and ebp as parameter 1.
272	; @param 1 The register pointing to the guest eflags.
273	; @param 2 The set of modified flags.
274	; @param 3 The set of undefined flags.
275	;
276	%macro IEM_MAYBE_LOAD_FLAGS 3
277	;%if (%3) != 0
278	pushf ; store current flags
279	mov T0_32, [%1] ; load the guest flags
280	and dword [xSP], ~(%2 \| %3) ; mask out the modified and undefined flags
281	and T0_32, (%2 \| %3) ; select the modified and undefined flags.
282	or [xSP], T0 ; merge guest flags with host flags.
283	popf ; load the mixed flags.
284	;%endif
285	%endmacro
286
287	;;
288	; Write the host EFLAGS bits selected by %2 and %3 back to the guest flags at [%1].
289	;
	; Guest flag bits outside the two masks are left as they are. Nothing is
	; emitted when both masks are zero.
	;
290	; @remarks Clobbers T0, T1, stack.
291	; @param 1 The register pointing to the EFLAGS.
292	; @param 2 The mask of modified flags to save.
293	; @param 3 The mask of undefined flags to (maybe) save.
294	;
295	%macro IEM_SAVE_FLAGS 3
296	%if (%2 \| %3) != 0
297	pushf
298	pop T1 ; T1 = host flags after the emulated instruction
299	mov T0_32, [%1] ; flags
300	and T0_32, ~(%2 \| %3) ; clear the modified & undefined flags.
301	and T1_32, (%2 \| %3) ; select the modified and undefined flags.
302	or T0_32, T1_32 ; combine the flags.
303	mov [%1], T0_32 ; save the flags.
304	%endif
305	%endmacro
306
307
308	;;
309	; Macro for implementing a binary operator.
310	;
311	; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
312	; variants, except on 32-bit system where the 64-bit accesses requires hand
313	; coding.
314	;
315	; All the functions takes a pointer to the destination memory operand in A0,
316	; the source register operand in A1 and a pointer to eflags in A2.
317	;
	; Each function loads the guest flags, executes the instruction on the
	; destination memory and saves the resulting flags. On 32-bit hosts the
	; 64-bit variants are int3 stubs; both stubs return with 'ret 8' to drop
	; the 8 bytes of stack arguments of their 16 byte argument list.
	;
318	; @param 1 The instruction mnemonic.
319	; @param 2 Non-zero if there should be a locked version.
320	; @param 3 The modified flags.
321	; @param 4 The undefined flags.
322	;
323	%macro IEMIMPL_BIN_OP 4
324	BEGINCODE
325	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
326	PROLOGUE_3_ARGS
327	IEM_MAYBE_LOAD_FLAGS A2, %3, %4
328	%1 byte [A0], A1_8
329	IEM_SAVE_FLAGS A2, %3, %4
330	EPILOGUE_3_ARGS
331	ENDPROC iemAImpl_ %+ %1 %+ _u8
332
333	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
334	PROLOGUE_3_ARGS
335	IEM_MAYBE_LOAD_FLAGS A2, %3, %4
336	%1 word [A0], A1_16
337	IEM_SAVE_FLAGS A2, %3, %4
338	EPILOGUE_3_ARGS
339	ENDPROC iemAImpl_ %+ %1 %+ _u16
340
341	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
342	PROLOGUE_3_ARGS
343	IEM_MAYBE_LOAD_FLAGS A2, %3, %4
344	%1 dword [A0], A1_32
345	IEM_SAVE_FLAGS A2, %3, %4
346	EPILOGUE_3_ARGS
347	ENDPROC iemAImpl_ %+ %1 %+ _u32
348
349	%ifdef RT_ARCH_AMD64
350	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
351	PROLOGUE_3_ARGS
352	IEM_MAYBE_LOAD_FLAGS A2, %3, %4
353	%1 qword [A0], A1
354	IEM_SAVE_FLAGS A2, %3, %4
355	EPILOGUE_3_ARGS_EX 8
356	ENDPROC iemAImpl_ %+ %1 %+ _u64
357	%else ; stub it for now - later, replace with hand coded stuff.
358	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
359	int3
360	ret 8 ; drop the 8 bytes of stack arguments, same as the locked stub below
361	ENDPROC iemAImpl_ %+ %1 %+ _u64
362	%endif ; !RT_ARCH_AMD64
363
364	%if %2 != 0 ; locked versions requested?
365
366	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
367	PROLOGUE_3_ARGS
368	IEM_MAYBE_LOAD_FLAGS A2, %3, %4
369	lock %1 byte [A0], A1_8
370	IEM_SAVE_FLAGS A2, %3, %4
371	EPILOGUE_3_ARGS
372	ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
373
374	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
375	PROLOGUE_3_ARGS
376	IEM_MAYBE_LOAD_FLAGS A2, %3, %4
377	lock %1 word [A0], A1_16
378	IEM_SAVE_FLAGS A2, %3, %4
379	EPILOGUE_3_ARGS
380	ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
381
382	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
383	PROLOGUE_3_ARGS
384	IEM_MAYBE_LOAD_FLAGS A2, %3, %4
385	lock %1 dword [A0], A1_32
386	IEM_SAVE_FLAGS A2, %3, %4
387	EPILOGUE_3_ARGS
388	ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
389
390	%ifdef RT_ARCH_AMD64
391	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
392	PROLOGUE_3_ARGS
393	IEM_MAYBE_LOAD_FLAGS A2, %3, %4
394	lock %1 qword [A0], A1
395	IEM_SAVE_FLAGS A2, %3, %4
396	EPILOGUE_3_ARGS_EX 8
397	ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
398	%else ; stub it for now - later, replace with hand coded stuff.
399	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
400	int3
401	ret 8
402	ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
403	%endif ; !RT_ARCH_AMD64
404	%endif ; locked
405	%endmacro
406
407	; instr,lock,modified-flags,undefined-flags.
	; cmp and test are instantiated without locked variants; or, xor, and and
	; test list AF as undefined rather than modified.
408	IEMIMPL_BIN_OP add, 1, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
409	IEMIMPL_BIN_OP adc, 1, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
410	IEMIMPL_BIN_OP sub, 1, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
411	IEMIMPL_BIN_OP sbb, 1, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
412	IEMIMPL_BIN_OP or, 1, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_PF \| X86_EFL_CF), X86_EFL_AF
413	IEMIMPL_BIN_OP xor, 1, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_PF \| X86_EFL_CF), X86_EFL_AF
414	IEMIMPL_BIN_OP and, 1, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_PF \| X86_EFL_CF), X86_EFL_AF
415	IEMIMPL_BIN_OP cmp, 0, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
416	IEMIMPL_BIN_OP test, 0, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_PF \| X86_EFL_CF), X86_EFL_AF
417
418
419	;;
420	; Macro for implementing a bit operator.
421	;
422	; This will generate code for the 16, 32 and 64 bit accesses with locked
423	; variants, except on 32-bit system where the 64-bit accesses requires hand
424	; coding.
425	;
426	; All the functions takes a pointer to the destination memory operand in A0,
427	; the source register operand in A1 and a pointer to eflags in A2.
428	;
	; This is the four parameter form of IEMIMPL_BIT_OP; a three parameter
	; macro of the same name is defined further down for the bit searches.
	; On 32-bit hosts the 64-bit variants are int3 stubs returning with 'ret 8'.
	;
429	; @param 1 The instruction mnemonic.
430	; @param 2 Non-zero if there should be a locked version.
431	; @param 3 The modified flags.
432	; @param 4 The undefined flags.
433	;
434	%macro IEMIMPL_BIT_OP 4
435	BEGINCODE
436	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
437	PROLOGUE_3_ARGS
438	IEM_MAYBE_LOAD_FLAGS A2, %3, %4
439	%1 word [A0], A1_16
440	IEM_SAVE_FLAGS A2, %3, %4
441	EPILOGUE_3_ARGS
442	ENDPROC iemAImpl_ %+ %1 %+ _u16
443
444	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
445	PROLOGUE_3_ARGS
446	IEM_MAYBE_LOAD_FLAGS A2, %3, %4
447	%1 dword [A0], A1_32
448	IEM_SAVE_FLAGS A2, %3, %4
449	EPILOGUE_3_ARGS
450	ENDPROC iemAImpl_ %+ %1 %+ _u32
451
452	%ifdef RT_ARCH_AMD64
453	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
454	PROLOGUE_3_ARGS
455	IEM_MAYBE_LOAD_FLAGS A2, %3, %4
456	%1 qword [A0], A1
457	IEM_SAVE_FLAGS A2, %3, %4
458	EPILOGUE_3_ARGS_EX 8
459	ENDPROC iemAImpl_ %+ %1 %+ _u64
460	%else ; stub it for now - later, replace with hand coded stuff.
461	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
462	int3
463	ret 8
464	ENDPROC iemAImpl_ %+ %1 %+ _u64
465	%endif ; !RT_ARCH_AMD64
466
467	%if %2 != 0 ; locked versions requested?
468
469	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
470	PROLOGUE_3_ARGS
471	IEM_MAYBE_LOAD_FLAGS A2, %3, %4
472	lock %1 word [A0], A1_16
473	IEM_SAVE_FLAGS A2, %3, %4
474	EPILOGUE_3_ARGS
475	ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
476
477	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
478	PROLOGUE_3_ARGS
479	IEM_MAYBE_LOAD_FLAGS A2, %3, %4
480	lock %1 dword [A0], A1_32
481	IEM_SAVE_FLAGS A2, %3, %4
482	EPILOGUE_3_ARGS
483	ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
484
485	%ifdef RT_ARCH_AMD64
486	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
487	PROLOGUE_3_ARGS
488	IEM_MAYBE_LOAD_FLAGS A2, %3, %4
489	lock %1 qword [A0], A1
490	IEM_SAVE_FLAGS A2, %3, %4
491	EPILOGUE_3_ARGS_EX 8
492	ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
493	%else ; stub it for now - later, replace with hand coded stuff.
494	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
495	int3
496	ret 8
497	ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
498	%endif ; !RT_ARCH_AMD64
499	%endif ; locked
500	%endmacro
	; instr, lock, modified-flags (CF only), undefined-flags. bt gets no locked variants.
501	IEMIMPL_BIT_OP bt, 0, (X86_EFL_CF), (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF)
502	IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF)
503	IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF)
504	IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF)
505
506	;;
507	; Macro for implementing a bit search operator.
508	;
509	; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
510	; system where the 64-bit accesses requires hand coding.
511	;
512	; All the functions takes a pointer to the destination memory operand in A0,
513	; the source register operand in A1 and a pointer to eflags in A2.
514	;
	; This is the three parameter form of IEMIMPL_BIT_OP (the four parameter
	; form above handles bt/btc/bts/btr). The result is computed into T0 and
	; only stored to [A0] when the instruction leaves ZF clear.
	;
515	; @param 1 The instruction mnemonic.
516	; @param 2 The modified flags.
517	; @param 3 The undefined flags.
518	;
519	%macro IEMIMPL_BIT_OP 3
520	BEGINCODE
521	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
522	PROLOGUE_3_ARGS
523	IEM_MAYBE_LOAD_FLAGS A2, %2, %3
524	%1 T0_16, A1_16
525	jz .unchanged_dst ; ZF set: leave the destination untouched
526	mov [A0], T0_16
527	.unchanged_dst:
528	IEM_SAVE_FLAGS A2, %2, %3
529	EPILOGUE_3_ARGS
530	ENDPROC iemAImpl_ %+ %1 %+ _u16
531
532	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
533	PROLOGUE_3_ARGS
534	IEM_MAYBE_LOAD_FLAGS A2, %2, %3
535	%1 T0_32, A1_32
536	jz .unchanged_dst ; ZF set: leave the destination untouched
537	mov [A0], T0_32
538	.unchanged_dst:
539	IEM_SAVE_FLAGS A2, %2, %3
540	EPILOGUE_3_ARGS
541	ENDPROC iemAImpl_ %+ %1 %+ _u32
542
543	%ifdef RT_ARCH_AMD64
544	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
545	PROLOGUE_3_ARGS
546	IEM_MAYBE_LOAD_FLAGS A2, %2, %3
547	%1 T0, A1
548	jz .unchanged_dst ; ZF set: leave the destination untouched
549	mov [A0], T0
550	.unchanged_dst:
551	IEM_SAVE_FLAGS A2, %2, %3
552	EPILOGUE_3_ARGS_EX 8
553	ENDPROC iemAImpl_ %+ %1 %+ _u64
554	%else ; stub it for now - later, replace with hand coded stuff.
555	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
556	int3
557	ret 8
558	ENDPROC iemAImpl_ %+ %1 %+ _u64
559	%endif ; !RT_ARCH_AMD64
560	%endmacro
	; instr, modified-flags (ZF only), undefined-flags.
561	IEMIMPL_BIT_OP bsf, (X86_EFL_ZF), (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF)
562	IEMIMPL_BIT_OP bsr, (X86_EFL_ZF), (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF)
563
564
565	;
566	; IMUL is also a similar but yet different case (no lock, no mem dst).
567	; The rDX:rAX variant of imul is handled together with mul further down.
568	;
	; A0 points to the destination value, A1 holds the source value and A2
	; points to the eflags. OF and CF are modified, the rest is undefined.
	;
569	BEGINCODE
570	BEGINPROC_FASTCALL iemAImpl_imul_two_u16, 12
571	PROLOGUE_3_ARGS
572	IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF \| X86_EFL_CF), (X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF)
573	imul A1_16, word [A0] ; A1 = source * destination
574	mov [A0], A1_16 ; write the product back to the destination
575	IEM_SAVE_FLAGS A2, (X86_EFL_OF \| X86_EFL_CF), (X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF)
576	EPILOGUE_3_ARGS
577	ENDPROC iemAImpl_imul_two_u16
578
579	BEGINPROC_FASTCALL iemAImpl_imul_two_u32, 12
580	PROLOGUE_3_ARGS
581	IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF \| X86_EFL_CF), (X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF)
582	imul A1_32, dword [A0] ; A1 = source * destination
583	mov [A0], A1_32 ; write the product back to the destination
584	IEM_SAVE_FLAGS A2, (X86_EFL_OF \| X86_EFL_CF), (X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF)
585	EPILOGUE_3_ARGS
586	ENDPROC iemAImpl_imul_two_u32
587
588	BEGINPROC_FASTCALL iemAImpl_imul_two_u64, 16
589	PROLOGUE_3_ARGS
590	%ifdef RT_ARCH_AMD64
591	IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF \| X86_EFL_CF), (X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF)
592	imul A1, qword [A0] ; A1 = source * destination
593	mov [A0], A1 ; write the product back to the destination
594	IEM_SAVE_FLAGS A2, (X86_EFL_OF \| X86_EFL_CF), (X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF)
595	%else
596	int3 ;; @todo implement me
597	%endif
	; The prologue and epilogue are shared by both branches, so the 32-bit stub returns with 'ret 8'.
598	EPILOGUE_3_ARGS_EX 8
599	ENDPROC iemAImpl_imul_two_u64
600
601
602	;
603	; XCHG for memory operands. This implies locking. No flag changes.
604	;
605	; Each function takes two arguments, first the pointer to the memory,
606	; then the pointer to the register. They all return void.
607	;
608	BEGINCODE
609	BEGINPROC_FASTCALL iemAImpl_xchg_u8, 8
610	PROLOGUE_2_ARGS
611	mov T0_8, [A1] ; T0 = current register value
612	xchg [A0], T0_8 ; swap with memory; T0 now holds the old memory value
613	mov [A1], T0_8 ; hand the old memory value back via the register pointer
614	EPILOGUE_2_ARGS
615	ENDPROC iemAImpl_xchg_u8
616
617	BEGINPROC_FASTCALL iemAImpl_xchg_u16, 8
618	PROLOGUE_2_ARGS
619	mov T0_16, [A1] ; T0 = current register value
620	xchg [A0], T0_16 ; swap with memory; T0 now holds the old memory value
621	mov [A1], T0_16 ; hand the old memory value back via the register pointer
622	EPILOGUE_2_ARGS
623	ENDPROC iemAImpl_xchg_u16
624
625	BEGINPROC_FASTCALL iemAImpl_xchg_u32, 8
626	PROLOGUE_2_ARGS
627	mov T0_32, [A1] ; T0 = current register value
628	xchg [A0], T0_32 ; swap with memory; T0 now holds the old memory value
629	mov [A1], T0_32 ; hand the old memory value back via the register pointer
630	EPILOGUE_2_ARGS
631	ENDPROC iemAImpl_xchg_u32
632
633	BEGINPROC_FASTCALL iemAImpl_xchg_u64, 8
634	%ifdef RT_ARCH_AMD64
635	PROLOGUE_2_ARGS
636	mov T0, [A1] ; T0 = current register value
637	xchg [A0], T0 ; swap with memory; T0 now holds the old memory value
638	mov [A1], T0 ; hand the old memory value back via the register pointer
639	EPILOGUE_2_ARGS
640	%else
	; 32-bit hosts only get a breakpoint stub.
641	int3
642	ret 0
643	%endif
644	ENDPROC iemAImpl_xchg_u64
645
646
647	;
648	; XADD for memory operands.
649	;
650	; Each function takes three arguments, first the pointer to the
651	; memory/register, then the pointer to the register, and finally a pointer to
652	; eflags. They all return void.
653	;
654	BEGINCODE
655	BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
656	PROLOGUE_3_ARGS
657	IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
658	mov T0_8, [A1] ; T0 = current register value
659	xadd [A0], T0_8 ; memory += T0; T0 receives the old memory value
660	mov [A1], T0_8 ; hand the old memory value back via the register pointer
661	IEM_SAVE_FLAGS A2, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
662	EPILOGUE_3_ARGS
663	ENDPROC iemAImpl_xadd_u8
664
665	BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
666	PROLOGUE_3_ARGS
667	IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
668	mov T0_16, [A1] ; T0 = current register value
669	xadd [A0], T0_16 ; memory += T0; T0 receives the old memory value
670	mov [A1], T0_16 ; hand the old memory value back via the register pointer
671	IEM_SAVE_FLAGS A2, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
672	EPILOGUE_3_ARGS
673	ENDPROC iemAImpl_xadd_u16
674
675	BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
676	PROLOGUE_3_ARGS
677	IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
678	mov T0_32, [A1] ; T0 = current register value
679	xadd [A0], T0_32 ; memory += T0; T0 receives the old memory value
680	mov [A1], T0_32 ; hand the old memory value back via the register pointer
681	IEM_SAVE_FLAGS A2, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
682	EPILOGUE_3_ARGS
683	ENDPROC iemAImpl_xadd_u32
684
685	BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
686	%ifdef RT_ARCH_AMD64
687	PROLOGUE_3_ARGS
688	IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
689	mov T0, [A1] ; T0 = current register value
690	xadd [A0], T0 ; memory += T0; T0 receives the old memory value
691	mov [A1], T0 ; hand the old memory value back via the register pointer
692	IEM_SAVE_FLAGS A2, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
693	EPILOGUE_3_ARGS
694	%else
	; 32-bit hosts only get a breakpoint stub; 'ret 4' matches EPILOGUE_3_ARGS.
695	int3
696	ret 4
697	%endif
698	ENDPROC iemAImpl_xadd_u64
699
700	BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
701	PROLOGUE_3_ARGS
702	IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
703	mov T0_8, [A1] ; T0 = current register value
704	lock xadd [A0], T0_8 ; same as the plain variant, but with a lock prefix
705	mov [A1], T0_8 ; hand the old memory value back via the register pointer
706	IEM_SAVE_FLAGS A2, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
707	EPILOGUE_3_ARGS
708	ENDPROC iemAImpl_xadd_u8_locked
709
710	BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
711	PROLOGUE_3_ARGS
712	IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
713	mov T0_16, [A1] ; T0 = current register value
714	lock xadd [A0], T0_16 ; same as the plain variant, but with a lock prefix
715	mov [A1], T0_16 ; hand the old memory value back via the register pointer
716	IEM_SAVE_FLAGS A2, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
717	EPILOGUE_3_ARGS
718	ENDPROC iemAImpl_xadd_u16_locked
719
720	BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
721	PROLOGUE_3_ARGS
722	IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
723	mov T0_32, [A1] ; T0 = current register value
724	lock xadd [A0], T0_32 ; same as the plain variant, but with a lock prefix
725	mov [A1], T0_32 ; hand the old memory value back via the register pointer
726	IEM_SAVE_FLAGS A2, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
727	EPILOGUE_3_ARGS
728	ENDPROC iemAImpl_xadd_u32_locked
729
730	BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
731	%ifdef RT_ARCH_AMD64
732	PROLOGUE_3_ARGS
733	IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
734	mov T0, [A1] ; T0 = current register value
735	lock xadd [A0], T0 ; same as the plain variant, but with a lock prefix
736	mov [A1], T0 ; hand the old memory value back via the register pointer
737	IEM_SAVE_FLAGS A2, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
738	EPILOGUE_3_ARGS
739	%else
	; 32-bit hosts only get a breakpoint stub; 'ret 4' matches EPILOGUE_3_ARGS.
740	int3
741	ret 4
742	%endif
743	ENDPROC iemAImpl_xadd_u64_locked
744
745
746	;
747	; CMPXCHG8B.
748	;
749	; These are tricky register wise, so the code is duplicated for each calling
750	; convention.
751	;
	; cmpxchg8b uses fixed registers (edx:eax, ecx:ebx), so each variant first
	; moves the argument pointers out of the way, loads the ebx:ecx and eax:edx
	; halves, runs the instruction with a lock prefix and then writes eax:edx
	; back through pu64EaxEdx. Only ZF is transferred to and from the guest flags.
	;
752	; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
753	;
754	; C-proto:
755	; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
756	; uint32_t *pEFlags));
757	;
758	BEGINCODE
759	BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
760	%ifdef RT_ARCH_AMD64
761	%ifdef ASM_CALL64_MSC
762	push rbx
763
764	mov r11, rdx ; pu64EaxEdx (is also T1)
765	mov r10, rcx ; pu64Dst
766
767	mov ebx, [r8]
768	mov ecx, [r8 + 4]
769	IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
770	mov eax, [r11]
771	mov edx, [r11 + 4]
772
773	lock cmpxchg8b [r10]
774
775	mov [r11], eax
776	mov [r11 + 4], edx
777	IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
778
779	pop rbx
780	ret
781	%else
	; Not ASM_CALL64_MSC: the register use matches the ASM_CALL64_GCC argument mapping.
782	push rbx
783
784	mov r10, rcx ; pEFlags
785	mov r11, rdx ; pu64EbxEcx (is also T1)
786
787	mov ebx, [r11]
788	mov ecx, [r11 + 4]
789	IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
790	mov eax, [rsi]
791	mov edx, [rsi + 4]
792
793	lock cmpxchg8b [rdi]
794
795	mov [rsi], eax
796	mov [rsi + 4], edx
797	IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
798
799	pop rbx
800	ret
801
802	%endif
803	%else
	; 32-bit: four registers are saved, so the stack arguments start at esp + 16 + 4.
804	push esi
805	push edi
806	push ebx
807	push ebp
808
809	mov edi, ecx ; pu64Dst
810	mov esi, edx ; pu64EaxEdx
811	mov ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
812	mov ebp, [esp + 16 + 4 + 4] ; pEFlags
813
814	mov ebx, [ecx]
815	mov ecx, [ecx + 4]
816	IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
817	mov eax, [esi]
818	mov edx, [esi + 4]
819
820	lock cmpxchg8b [edi]
821
822	mov [esi], eax
823	mov [esi + 4], edx
824	IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)
825
826	pop ebp
827	pop ebx
828	pop edi
829	pop esi
830	ret 8
831	%endif
832	ENDPROC iemAImpl_cmpxchg8b
833
834	BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
835	; Lazy bird always lock prefixes cmpxchg8b.
	; So the locked entry point simply tail-jumps to the plain one.
836	jmp NAME_FASTCALL(iemAImpl_cmpxchg8b,16,$@)
837	ENDPROC iemAImpl_cmpxchg8b_locked
838
839
840
841	;
842	; CMPXCHG.
843	;
844	; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
845	;
846	; C-proto:
847	; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t *puEax, uintX_t uReg, uint32_t *pEFlags));
848	;
	; A0 points to the destination, A1 to the accumulator value, A2 holds the
	; new value and A3 points to the eflags. The accumulator is loaded from
	; [A1] and written back afterwards, so the caller sees the value cmpxchg
	; left in it. The 32-bit _u64 variant takes the new value by pointer and
	; is built on top of cmpxchg8b.
	;
	; @param 1 Instruction prefix, empty or 'lock'.
	; @param 2 Function name suffix, empty or '_locked'.
	;
849	BEGINCODE
850	%macro IEMIMPL_CMPXCHG 2
851	BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
852	PROLOGUE_4_ARGS
853	IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF \| X86_EFL_CF \| X86_EFL_PF \| X86_EFL_AF \| X86_EFL_SF \| X86_EFL_OF), 0 ; clobbers T0 (eax)
854	mov al, [A1]
855	%1 cmpxchg [A0], A2_8
856	mov [A1], al
857	IEM_SAVE_FLAGS A3, (X86_EFL_ZF \| X86_EFL_CF \| X86_EFL_PF \| X86_EFL_AF \| X86_EFL_SF \| X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
858	EPILOGUE_4_ARGS
859	ENDPROC iemAImpl_cmpxchg_u8 %+ %2
860
861	BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
862	PROLOGUE_4_ARGS
863	IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF \| X86_EFL_CF \| X86_EFL_PF \| X86_EFL_AF \| X86_EFL_SF \| X86_EFL_OF), 0 ; clobbers T0 (eax)
864	mov ax, [A1]
865	%1 cmpxchg [A0], A2_16
866	mov [A1], ax
867	IEM_SAVE_FLAGS A3, (X86_EFL_ZF \| X86_EFL_CF \| X86_EFL_PF \| X86_EFL_AF \| X86_EFL_SF \| X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
868	EPILOGUE_4_ARGS
869	ENDPROC iemAImpl_cmpxchg_u16 %+ %2
870
871	BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
872	PROLOGUE_4_ARGS
873	IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF \| X86_EFL_CF \| X86_EFL_PF \| X86_EFL_AF \| X86_EFL_SF \| X86_EFL_OF), 0 ; clobbers T0 (eax)
874	mov eax, [A1]
875	%1 cmpxchg [A0], A2_32
876	mov [A1], eax
877	IEM_SAVE_FLAGS A3, (X86_EFL_ZF \| X86_EFL_CF \| X86_EFL_PF \| X86_EFL_AF \| X86_EFL_SF \| X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
878	EPILOGUE_4_ARGS
879	ENDPROC iemAImpl_cmpxchg_u32 %+ %2
880
881	BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
882	%ifdef RT_ARCH_AMD64
883	PROLOGUE_4_ARGS
884	IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF \| X86_EFL_CF \| X86_EFL_PF \| X86_EFL_AF \| X86_EFL_SF \| X86_EFL_OF), 0 ; clobbers T0 (eax)
885	mov rax, [A1]
886	%1 cmpxchg [A0], A2
887	mov [A1], rax
888	IEM_SAVE_FLAGS A3, (X86_EFL_ZF \| X86_EFL_CF \| X86_EFL_PF \| X86_EFL_AF \| X86_EFL_SF \| X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
889	EPILOGUE_4_ARGS
890	%else
891	;
892	; Must use cmpxchg8b here. See also iemAImpl_cmpxchg8b.
893	;
894	push esi
895	push edi
896	push ebx
897	push ebp
898
899	mov edi, ecx ; pu64Dst
900	mov esi, edx ; pu64Rax
901	mov ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
902	mov ebp, [esp + 16 + 4 + 4] ; pEFlags
903
904	mov ebx, [ecx]
905	mov ecx, [ecx + 4]
906	IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF \| X86_EFL_CF \| X86_EFL_PF \| X86_EFL_AF \| X86_EFL_SF \| X86_EFL_OF), 0 ; clobbers T0 (eax)
907	mov eax, [esi]
908	mov edx, [esi + 4]
909
910	lock cmpxchg8b [edi]
911
912	; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
	; ZF clear means the compare failed and edx:eax now holds the destination
	; value, so only that case may take the real compare. On success (ZF set)
	; a compare of a register with itself yields the "equal" flag pattern.
913	jnz .cmpxchg8b_not_equal
914	cmp eax, eax ; just set the other flags.
915	.store:
916	mov [esi], eax
917	mov [esi + 4], edx
918	IEM_SAVE_FLAGS ebp, (X86_EFL_ZF \| X86_EFL_CF \| X86_EFL_PF \| X86_EFL_AF \| X86_EFL_SF \| X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)
919
920	pop ebp
921	pop ebx
922	pop edi
923	pop esi
924	ret 8
925
926	.cmpxchg8b_not_equal:
	; [esi] still holds the old accumulator value here; compare it against the
	; destination value in edx:eax, high dword first.
927	cmp [esi + 4], edx ;; @todo FIXME - verify 64-bit compare implementation
928	jne .store
929	cmp [esi], eax
930	jmp .store
931
932	%endif
933	ENDPROC iemAImpl_cmpxchg_u64 %+ %2
934	%endmacro ; IEMIMPL_CMPXCHG
935
	; Plain variants (empty prefix and empty name suffix), then the lock prefixed ones.
936	IEMIMPL_CMPXCHG , ,
937	IEMIMPL_CMPXCHG lock, _locked
938
939	;;
940	; Macro for implementing a unary operator.
941	;
942	; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
943	; variants, except on 32-bit system where the 64-bit accesses requires hand
944	; coding.
945	;
946	; All the functions takes a pointer to the destination memory operand in A0
947	; and a pointer to eflags in A1. There is no source operand.
948	;
949	; @param 1 The instruction mnemonic.
950	; @param 2 The modified flags.
951	; @param 3 The undefined flags.
952	;
953	%macro IEMIMPL_UNARY_OP 3
954	BEGINCODE
955	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
956	PROLOGUE_2_ARGS
957	IEM_MAYBE_LOAD_FLAGS A1, %2, %3
958	%1 byte [A0]
959	IEM_SAVE_FLAGS A1, %2, %3
960	EPILOGUE_2_ARGS
961	ENDPROC iemAImpl_ %+ %1 %+ _u8
962
963	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
964	PROLOGUE_2_ARGS
965	IEM_MAYBE_LOAD_FLAGS A1, %2, %3
966	lock %1 byte [A0]
967	IEM_SAVE_FLAGS A1, %2, %3
968	EPILOGUE_2_ARGS
969	ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
970
971	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
972	PROLOGUE_2_ARGS
973	IEM_MAYBE_LOAD_FLAGS A1, %2, %3
974	%1 word [A0]
975	IEM_SAVE_FLAGS A1, %2, %3
976	EPILOGUE_2_ARGS
977	ENDPROC iemAImpl_ %+ %1 %+ _u16
978
979	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
980	PROLOGUE_2_ARGS
981	IEM_MAYBE_LOAD_FLAGS A1, %2, %3
982	lock %1 word [A0]
983	IEM_SAVE_FLAGS A1, %2, %3
984	EPILOGUE_2_ARGS
985	ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
986
987	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
988	PROLOGUE_2_ARGS
989	IEM_MAYBE_LOAD_FLAGS A1, %2, %3
990	%1 dword [A0]
991	IEM_SAVE_FLAGS A1, %2, %3
992	EPILOGUE_2_ARGS
993	ENDPROC iemAImpl_ %+ %1 %+ _u32
994
995	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
996	PROLOGUE_2_ARGS
997	IEM_MAYBE_LOAD_FLAGS A1, %2, %3
998	lock %1 dword [A0]
999	IEM_SAVE_FLAGS A1, %2, %3
1000	EPILOGUE_2_ARGS
1001	ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
1002
1003	%ifdef RT_ARCH_AMD64
1004	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
1005	PROLOGUE_2_ARGS
1006	IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1007	%1 qword [A0]
1008	IEM_SAVE_FLAGS A1, %2, %3
1009	EPILOGUE_2_ARGS
1010	ENDPROC iemAImpl_ %+ %1 %+ _u64
1011
1012	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
1013	PROLOGUE_2_ARGS
1014	IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1015	lock %1 qword [A0]
1016	IEM_SAVE_FLAGS A1, %2, %3
1017	EPILOGUE_2_ARGS
1018	ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
1019	%else
1020	; stub them for now.
1021	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
1022	int3
1023	ret 0
1024	ENDPROC iemAImpl_ %+ %1 %+ _u64
1025	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
1026	int3
1027	ret 0
1028	ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
1029	%endif
1030
1031	%endmacro
1032
	; instr, modified-flags, undefined-flags. inc and dec leave CF out of the
	; modified set; not is instantiated with both masks zero.
1033	IEMIMPL_UNARY_OP inc, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF), 0
1034	IEMIMPL_UNARY_OP dec, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF), 0
1035	IEMIMPL_UNARY_OP neg, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
1036	IEMIMPL_UNARY_OP not, 0, 0
1037
1038
1039	;;
1040	; Macro for implementing memory fence operation.
1041	;
1042	; No return value, no operands or anything.
1043	;
	; The generated function is named iemAImpl_<instruction> and consists of
	; the instruction followed by a plain 'ret'.
	;
1044	; @param 1 The instruction.
1045	;
1046	%macro IEMIMPL_MEM_FENCE 1
1047	BEGINCODE
1048	BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
1049	%1
1050	ret
1051	ENDPROC iemAImpl_ %+ %1
1052	%endmacro
1053
1054	IEMIMPL_MEM_FENCE lfence
1055	IEMIMPL_MEM_FENCE sfence
1056	IEMIMPL_MEM_FENCE mfence
1057
1058	;;
1059	; Alternative for non-SSE2 host.
1060	;
	; Performs an xchg with a scratch stack slot instead of a fence instruction.
	; NOTE(review): presumably this relies on xchg with a memory operand being
	; implicitly locked (see the XCHG remark earlier in this file) - confirm.
	;
1061	BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
1062	push xAX ; create a scratch stack slot holding xAX
1063	xchg xAX, [xSP] ; exchange with the slot; xAX keeps its value
1064	add xSP, xCB ; drop the scratch slot again
1065	ret
1066	ENDPROC iemAImpl_alt_mem_fence
1067
1068
1069
1070	;;
1071	; Macro for implementing a shift operation.
1072	;
1073	; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1074	; 32-bit systems where the 64-bit variant is only an int3 stub for now.
1075	;
1076	; All the functions take a pointer to the destination memory operand in A0,
1077	; the shift count in A1 and a pointer to eflags in A2.
1078	; The instruction is always emitted with the count in CL (see the two code paths below).
1079	; @param 1 The instruction mnemonic.
1080	; @param 2 The modified flags.
1081	; @param 3 The undefined flags.
1082	;
1083	; Makes ASSUMPTIONS about A0, A1 and A2 assignments: the non-GCC path relies on 'xchg A1, A0' putting the count into cl.
1084	;
1085	%macro IEMIMPL_SHIFT_OP 3
1086	BEGINCODE
1087	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
1088	PROLOGUE_3_ARGS
1089	IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1090	%ifdef ASM_CALL64_GCC
1091	mov cl, A1_8 ; count -> cl, pointer stays in A0
1092	%1 byte [A0], cl
1093	%else
1094	xchg A1, A0 ; count -> A0 (used as cl below), pointer -> A1
1095	%1 byte [A1], cl
1096	%endif
1097	IEM_SAVE_FLAGS A2, %2, %3
1098	EPILOGUE_3_ARGS
1099	ENDPROC iemAImpl_ %+ %1 %+ _u8
1100
1101	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1102	PROLOGUE_3_ARGS
1103	IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1104	%ifdef ASM_CALL64_GCC
1105	mov cl, A1_8 ; count -> cl, pointer stays in A0
1106	%1 word [A0], cl
1107	%else
1108	xchg A1, A0 ; count -> A0 (used as cl below), pointer -> A1
1109	%1 word [A1], cl
1110	%endif
1111	IEM_SAVE_FLAGS A2, %2, %3
1112	EPILOGUE_3_ARGS
1113	ENDPROC iemAImpl_ %+ %1 %+ _u16
1114
1115	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1116	PROLOGUE_3_ARGS
1117	IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1118	%ifdef ASM_CALL64_GCC
1119	mov cl, A1_8 ; count -> cl, pointer stays in A0
1120	%1 dword [A0], cl
1121	%else
1122	xchg A1, A0 ; count -> A0 (used as cl below), pointer -> A1
1123	%1 dword [A1], cl
1124	%endif
1125	IEM_SAVE_FLAGS A2, %2, %3
1126	EPILOGUE_3_ARGS
1127	ENDPROC iemAImpl_ %+ %1 %+ _u32
1128
1129	%ifdef RT_ARCH_AMD64
1130	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
1131	PROLOGUE_3_ARGS
1132	IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1133	%ifdef ASM_CALL64_GCC
1134	mov cl, A1_8 ; count -> cl, pointer stays in A0
1135	%1 qword [A0], cl
1136	%else
1137	xchg A1, A0 ; count -> A0 (used as cl below), pointer -> A1
1138	%1 qword [A1], cl
1139	%endif
1140	IEM_SAVE_FLAGS A2, %2, %3
1141	EPILOGUE_3_ARGS
1142	ENDPROC iemAImpl_ %+ %1 %+ _u64
1143	%else ; stub it for now - later, replace with hand coded stuff.
1144	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
1145	int3 ; not implemented on 32-bit hosts: trap
1146	ret 4 ; pops 4 bytes of stack arguments
1147	ENDPROC iemAImpl_ %+ %1 %+ _u64
1148	%endif ; !RT_ARCH_AMD64
1149
1150	%endmacro
1151
1152	IEMIMPL_SHIFT_OP rol, (X86_EFL_OF \| X86_EFL_CF), 0
1153	IEMIMPL_SHIFT_OP ror, (X86_EFL_OF \| X86_EFL_CF), 0
1154	IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF \| X86_EFL_CF), 0
1155	IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF \| X86_EFL_CF), 0
1156	IEMIMPL_SHIFT_OP shl, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_PF \| X86_EFL_CF), (X86_EFL_AF)
1157	IEMIMPL_SHIFT_OP shr, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_PF \| X86_EFL_CF), (X86_EFL_AF)
1158	IEMIMPL_SHIFT_OP sar, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_PF \| X86_EFL_CF), (X86_EFL_AF)
1159
1160
1161	;;
1162	; Macro for implementing a double precision shift operation.
1163	;
1164	; This will generate code for the 16, 32 and 64 bit accesses, except on
1165	; 32-bit systems where the 64-bit variant is only an int3 stub for now.
1166	;
1167	; The functions take a pointer to the destination operand (r/m) in A0, the
1168	; source (reg) in A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
1169	; The count is moved into CL by exchanging registers (see the two code paths below).
1170	; @param 1 The instruction mnemonic.
1171	; @param 2 The modified flags.
1172	; @param 3 The undefined flags.
1173	;
1174	; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments: one of the exchanged registers must be xCX on each path.
1175	;
1176	%macro IEMIMPL_SHIFT_DBL_OP 3
1177	BEGINCODE
1178	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
1179	PROLOGUE_4_ARGS
1180	IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1181	%ifdef ASM_CALL64_GCC
1182	xchg A3, A2 ; count -> A3 (used as cl below), eflags pointer -> A2
1183	%1 [A0], A1_16, cl
1184	xchg A3, A2 ; undo, IEM_SAVE_FLAGS needs the pointer back in A3
1185	%else
1186	xchg A0, A2 ; count -> A0 (used as cl below), destination pointer -> A2
1187	%1 [A2], A1_16, cl
1188	%endif
1189	IEM_SAVE_FLAGS A3, %2, %3
1190	EPILOGUE_4_ARGS
1191	ENDPROC iemAImpl_ %+ %1 %+ _u16
1192
1193	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
1194	PROLOGUE_4_ARGS
1195	IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1196	%ifdef ASM_CALL64_GCC
1197	xchg A3, A2 ; count -> A3 (used as cl below), eflags pointer -> A2
1198	%1 [A0], A1_32, cl
1199	xchg A3, A2 ; undo, IEM_SAVE_FLAGS needs the pointer back in A3
1200	%else
1201	xchg A0, A2 ; count -> A0 (used as cl below), destination pointer -> A2
1202	%1 [A2], A1_32, cl
1203	%endif
1204	IEM_SAVE_FLAGS A3, %2, %3
1205	EPILOGUE_4_ARGS
1206	ENDPROC iemAImpl_ %+ %1 %+ _u32
1207
1208	%ifdef RT_ARCH_AMD64
1209	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
1210	PROLOGUE_4_ARGS
1211	IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1212	%ifdef ASM_CALL64_GCC
1213	xchg A3, A2 ; count -> A3 (used as cl below), eflags pointer -> A2
1214	%1 [A0], A1, cl
1215	xchg A3, A2 ; undo, IEM_SAVE_FLAGS needs the pointer back in A3
1216	%else
1217	xchg A0, A2 ; count -> A0 (used as cl below), destination pointer -> A2
1218	%1 [A2], A1, cl
1219	%endif
1220	IEM_SAVE_FLAGS A3, %2, %3
1221	EPILOGUE_4_ARGS_EX 12
1222	ENDPROC iemAImpl_ %+ %1 %+ _u64
1223	%else ; stub it for now - later, replace with hand coded stuff.
1224	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
1225	int3 ; not implemented on 32-bit hosts: trap
1226	ret 12 ; pops 12 bytes of stack arguments
1227	ENDPROC iemAImpl_ %+ %1 %+ _u64
1228	%endif ; !RT_ARCH_AMD64
1229
1230	%endmacro
1231
1232	IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_PF \| X86_EFL_CF), (X86_EFL_AF)
1233	IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_PF \| X86_EFL_CF), (X86_EFL_AF)
1234
1235
1236	;;
1237	; Macro for implementing multiplication operations.
1238	;
1239	; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1240	; 32-bit systems where no 64-bit variant is generated here.
1241	;
1242	; The 8-bit function only operates on AX, so it takes no DX pointer: pointer to AX in A0, the operand in A1 and a pointer to eflags in A2. The other
1243	; functions take a pointer to rAX in A0, rDX in A1, the operand in A2 and a
1244	; pointer to eflags in A3.
1245	;
1246	; The functions all return 0 so the caller can be used for div/idiv as well as
1247	; for the mul/imul implementation.
1248	;
1249	; @param 1 The instruction mnemonic.
1250	; @param 2 The modified flags.
1251	; @param 3 The undefined flags.
1252	;
1253	; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
1254	;
1255	%macro IEMIMPL_MUL_OP 3
1256	BEGINCODE
1257	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
1258	PROLOGUE_3_ARGS
1259	IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1260	mov al, [A0] ; al = low byte of the AX value
1261	%1 A1_8
1262	mov [A0], ax ; the full 16-bit product goes back to *A0
1263	IEM_SAVE_FLAGS A2, %2, %3
1264	xor eax, eax ; return 0
1265	EPILOGUE_3_ARGS
1266	ENDPROC iemAImpl_ %+ %1 %+ _u8
1267
1268	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
1269	PROLOGUE_4_ARGS
1270	IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1271	mov ax, [A0]
1272	%ifdef ASM_CALL64_GCC
1273	%1 A2_16
1274	mov [A0], ax
1275	mov [A1], dx
1276	%else
1277	mov T1, A1 ; keep the rDX pointer in T1 - presumably A1 is xDX here and gets overwritten by the multiply
1278	%1 A2_16
1279	mov [A0], ax
1280	mov [T1], dx
1281	%endif
1282	IEM_SAVE_FLAGS A3, %2, %3
1283	xor eax, eax ; return 0
1284	EPILOGUE_4_ARGS
1285	ENDPROC iemAImpl_ %+ %1 %+ _u16
1286
1287	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
1288	PROLOGUE_4_ARGS
1289	IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1290	mov eax, [A0]
1291	%ifdef ASM_CALL64_GCC
1292	%1 A2_32
1293	mov [A0], eax
1294	mov [A1], edx
1295	%else
1296	mov T1, A1 ; keep the rDX pointer in T1 (see the 16-bit variant)
1297	%1 A2_32
1298	mov [A0], eax
1299	mov [T1], edx
1300	%endif
1301	IEM_SAVE_FLAGS A3, %2, %3
1302	xor eax, eax ; return 0
1303	EPILOGUE_4_ARGS
1304	ENDPROC iemAImpl_ %+ %1 %+ _u32
1305
1306	%ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
1307	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
1308	PROLOGUE_4_ARGS
1309	IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1310	mov rax, [A0]
1311	%ifdef ASM_CALL64_GCC
1312	%1 A2
1313	mov [A0], rax
1314	mov [A1], rdx
1315	%else
1316	mov T1, A1 ; keep the rDX pointer in T1 (see the 16-bit variant)
1317	%1 A2
1318	mov [A0], rax
1319	mov [T1], rdx
1320	%endif
1321	IEM_SAVE_FLAGS A3, %2, %3
1322	xor eax, eax ; return 0
1323	EPILOGUE_4_ARGS_EX 12
1324	ENDPROC iemAImpl_ %+ %1 %+ _u64
1325	%endif ; RT_ARCH_AMD64
1326
1327	%endmacro
1328
1329	IEMIMPL_MUL_OP mul, (X86_EFL_OF \| X86_EFL_CF), (X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF)
1330	IEMIMPL_MUL_OP imul, (X86_EFL_OF \| X86_EFL_CF), (X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF)
1331
1332
1333	BEGINCODE
1334	;;
1335	; Worker function for negating the 64-bit number held as two 32-bit halves in T1_32:T0_32 (computes 0 - value with sub/sbb).
1336	; @uses None (T0,T1) - only T0/T1 are changed; EFLAGS are clobbered by sub/sbb/add.
1337	iemAImpl_negate_T0_T1_u32:
1338	push 0 ; two zeroed stack slots ...
1339	push 0
1340	xchg T0_32, [xSP] ; T0 = 0, slot 0 = old low half
1341	xchg T1_32, [xSP + xCB] ; T1 = 0, slot 1 = old high half
1342	sub T0_32, [xSP] ; low = 0 - low
1343	sbb T1_32, [xSP + xCB] ; high = 0 - high - borrow
1344	add xSP, xCB*2 ; drop the two slots
1345	ret
1346
1347	%ifdef RT_ARCH_AMD64
1348	;;
1349	; Worker function for negating the 128-bit number held as two 64-bit halves in T1:T0 (computes 0 - value with sub/sbb).
1350	; @uses None (T0,T1) - only T0/T1 are changed; EFLAGS are clobbered by sub/sbb/add.
1351	iemAImpl_negate_T0_T1_u64:
1352	push 0 ; two zeroed stack slots ...
1353	push 0
1354	xchg T0, [xSP] ; T0 = 0, slot 0 = old low half
1355	xchg T1, [xSP + xCB] ; T1 = 0, slot 1 = old high half
1356	sub T0, [xSP] ; low = 0 - low
1357	sbb T1, [xSP + xCB] ; high = 0 - high - borrow
1358	add xSP, xCB*2 ; drop the two slots
1359	ret
1360	%endif
1361
1362
1363	;;
1364	; Macro for implementing division operations.
1365	;
1366	; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1367	; 32-bit systems where no 64-bit variant is generated here.
1368	;
1369	; The 8-bit function only operates on AX, so it takes no DX pointer: pointer to AX in A0, the divisor in A1 and a pointer to eflags in A2. The other
1370	; functions take a pointer to rAX in A0, rDX in A1, the operand in A2 and a
1371	; pointer to eflags in A3.
1372	;
1373	; The functions all return 0 on success and -1 if a divide error should be
1374	; raised by the caller. Division by zero and quotient overflow are detected up front so the host instruction is never executed in a faulting configuration.
1375	;
1376	; @param 1 The instruction mnemonic.
1377	; @param 2 The modified flags.
1378	; @param 3 The undefined flags.
1379	; @param 4 1 if signed, 0 if unsigned.
1380	;
1381	; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
1382	;
	; Signed overflow check: both operands are turned into magnitudes. When the
	; signs are equal the quotient must be <= 2^(width-1) - 1, when they differ
	; it may additionally be exactly 2^(width-1). The most negative dividend
	; negates to itself (sign bit still set) and always overflows whatever the
	; divisor is, so it is rejected right after the negation; the shifted
	; compares that follow would otherwise truncate it and let it through.
	;
1383	%macro IEMIMPL_DIV_OP 4
1384	BEGINCODE
1385	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
1386	PROLOGUE_3_ARGS
1387
1388	; Division by zero check.
1389	test A1_8, A1_8
1390	jz .div_zero
1391
1392	; Overflow check - unsigned division is simple to verify, haven't
1393	; found a simple way to check signed division yet unfortunately.
1394	%if %4 == 0
1395	cmp [A0 + 1], A1_8 ; AH >= divisor means the quotient won't fit in AL
1396	jae .div_overflow
1397	%else
1398	mov T0_16, [A0] ; T0 = dividend
1399	mov T1, A1 ; T1 = saved divisor (because of missing T1_8 in 32-bit)
1400	test A1_8, A1_8
1401	js .divisor_negative
1402	test T0_16, T0_16
1403	jns .both_positive
1404	neg T0_16
	js .div_overflow ; dividend was 0x8000: it negates to itself and always overflows
1405	.one_of_each: ; OK range is 2^(result-width - 1) + (divisor - 1).
1406	push T0 ; Start off like unsigned below.
1407	shr T0_16, 7
1408	cmp T0_8, A1_8
1409	pop T0 ; pop leaves the flags of the compare intact
1410	jb .div_no_overflow
1411	ja .div_overflow
1412	and T0_8, 0x7f ; Special case for covering (divisor - 1).
1413	cmp T0_8, A1_8
1414	jae .div_overflow
1415	jmp .div_no_overflow
1416
1417	.divisor_negative:
1418	neg A1_8 ; A1_8 = |divisor|, compared as unsigned from here on
1419	test T0_16, T0_16
1420	jns .one_of_each
1421	neg T0_16
	js .div_overflow ; dividend was 0x8000: it negates to itself and always overflows
1422	.both_positive: ; Same as unsigned shifted by sign indicator bit.
1423	shr T0_16, 7
1424	cmp T0_8, A1_8
1425	jae .div_overflow
1426	.div_no_overflow:
1427	mov A1, T1 ; restore divisor
1428	%endif
1429
1430	IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1431	mov ax, [A0]
1432	%1 A1_8
1433	mov [A0], ax
1434	IEM_SAVE_FLAGS A2, %2, %3
1435	xor eax, eax ; return 0 (success)
1436
1437	.return:
1438	EPILOGUE_3_ARGS
1439
1440	.div_zero:
1441	.div_overflow:
1442	mov eax, -1 ; tell the caller to raise a divide error
1443	jmp .return
1444	ENDPROC iemAImpl_ %+ %1 %+ _u8
1445
1446	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
1447	PROLOGUE_4_ARGS
1448
1449	; Division by zero check.
1450	test A2_16, A2_16
1451	jz .div_zero
1452
1453	; Overflow check - unsigned division is simple to verify, haven't
1454	; found a simple way to check signed division yet unfortunately.
1455	%if %4 == 0
1456	cmp [A1], A2_16 ; DX >= divisor means the quotient won't fit in AX
1457	jae .div_overflow
1458	%else
1459	mov T0_16, [A1]
1460	shl T0_32, 16
1461	mov T0_16, [A0] ; T0 = dividend
1462	mov T1, A2 ; T1 = divisor
1463	test T1_16, T1_16
1464	js .divisor_negative
1465	test T0_32, T0_32
1466	jns .both_positive
1467	neg T0_32
	js .div_overflow ; dividend was 0x80000000: it negates to itself and always overflows
1468	.one_of_each: ; OK range is 2^(result-width - 1) + (divisor - 1).
1469	push T0 ; Start off like unsigned below.
1470	shr T0_32, 15
1471	cmp T0_16, T1_16
1472	pop T0 ; pop leaves the flags of the compare intact
1473	jb .div_no_overflow
1474	ja .div_overflow
1475	and T0_16, 0x7fff ; Special case for covering (divisor - 1).
1476	cmp T0_16, T1_16
1477	jae .div_overflow
1478	jmp .div_no_overflow
1479
1480	.divisor_negative:
1481	neg T1_16 ; T1_16 = |divisor|, compared as unsigned from here on
1482	test T0_32, T0_32
1483	jns .one_of_each
1484	neg T0_32
	js .div_overflow ; dividend was 0x80000000: it negates to itself and always overflows
1485	.both_positive: ; Same as unsigned shifted by sign indicator bit.
1486	shr T0_32, 15
1487	cmp T0_16, T1_16
1488	jae .div_overflow
1489	.div_no_overflow:
1490	%endif
1491
1492	IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1493	%ifdef ASM_CALL64_GCC
1494	mov T1, A2 ; divisor -> T1, the dx load below would otherwise destroy it
1495	mov ax, [A0]
1496	mov dx, [A1]
1497	%1 T1_16
1498	mov [A0], ax
1499	mov [A1], dx
1500	%else
1501	mov T1, A1 ; rDX pointer -> T1, the dx load below would otherwise destroy it
1502	mov ax, [A0]
1503	mov dx, [T1]
1504	%1 A2_16
1505	mov [A0], ax
1506	mov [T1], dx
1507	%endif
1508	IEM_SAVE_FLAGS A3, %2, %3
1509	xor eax, eax ; return 0 (success)
1510
1511	.return:
1512	EPILOGUE_4_ARGS
1513
1514	.div_zero:
1515	.div_overflow:
1516	mov eax, -1 ; tell the caller to raise a divide error
1517	jmp .return
1518	ENDPROC iemAImpl_ %+ %1 %+ _u16
1519
1520	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
1521	PROLOGUE_4_ARGS
1522
1523	; Division by zero check.
1524	test A2_32, A2_32
1525	jz .div_zero
1526
1527	; Overflow check - unsigned division is simple to verify, haven't
1528	; found a simple way to check signed division yet unfortunately.
1529	%if %4 == 0
1530	cmp [A1], A2_32 ; EDX >= divisor means the quotient won't fit in EAX
1531	jae .div_overflow
1532	%else
1533	push A2 ; save A2 so we can modify it (we're out of registers on x86).
1534	mov T0_32, [A0] ; T0 = dividend low
1535	mov T1_32, [A1] ; T1 = dividend high
1536	test A2_32, A2_32
1537	js .divisor_negative
1538	test T1_32, T1_32
1539	jns .both_positive
1540	call iemAImpl_negate_T0_T1_u32
	test T1_32, T1_32 ; the helper's flags are not usable, so test again
	js .div_overflow ; dividend was the most negative value: always overflows
1541	.one_of_each: ; OK range is 2^(result-width - 1) + (divisor - 1).
1542	push T0 ; Start off like unsigned below.
1543	shl T1_32, 1
1544	shr T0_32, 31
1545	or T1_32, T0_32
1546	cmp T1_32, A2_32
1547	pop T0 ; pop leaves the flags of the compare intact
1548	jb .div_no_overflow
1549	ja .div_overflow
1550	and T0_32, 0x7fffffff ; Special case for covering (divisor - 1).
1551	cmp T0_32, A2_32
1552	jae .div_overflow
1553	jmp .div_no_overflow
1554
1555	.divisor_negative:
1556	neg A2_32 ; A2_32 = |divisor|, compared as unsigned from here on
1557	test T1_32, T1_32
1558	jns .one_of_each
1559	call iemAImpl_negate_T0_T1_u32
	test T1_32, T1_32 ; the helper's flags are not usable, so test again
	js .div_overflow ; dividend was the most negative value: always overflows
1560	.both_positive: ; Same as unsigned shifted by sign indicator bit.
1561	shl T1_32, 1
1562	shr T0_32, 31
1563	or T1_32, T0_32
1564	cmp T1_32, A2_32
1565	jae .div_overflow
1566	.div_no_overflow:
1567	pop A2 ; restore the original (signed) divisor
1568	%endif
1569
1570	IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1572	%ifdef ASM_CALL64_GCC
1573	mov T1, A2 ; divisor -> T1, the edx load below would otherwise destroy it
1574	mov eax, [A0]
1575	mov edx, [A1]
1576	%1 T1_32
1577	mov [A0], eax
1578	mov [A1], edx
1579	%else
1580	mov T1, A1 ; rDX pointer -> T1, the edx load below would otherwise destroy it
1581	mov eax, [A0]
1582	mov edx, [T1]
1583	%1 A2_32
1584	mov [A0], eax
1585	mov [T1], edx
1586	%endif
1587	IEM_SAVE_FLAGS A3, %2, %3
1588	xor eax, eax ; return 0 (success)
1589
1590	.return:
1591	EPILOGUE_4_ARGS
1592
1593	.div_overflow:
1594	%if %4 != 0
1595	pop A2 ; signed variant: balance the push done before the overflow check
1596	%endif
1597	.div_zero:
1598	mov eax, -1 ; tell the caller to raise a divide error
1599	jmp .return
1600	ENDPROC iemAImpl_ %+ %1 %+ _u32
1601
1602	%ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
1603	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
1604	PROLOGUE_4_ARGS
1605
	; Division by zero check.
1606	test A2, A2
1607	jz .div_zero
1608	%if %4 == 0
1609	cmp [A1], A2 ; RDX >= divisor means the quotient won't fit in RAX
1610	jae .div_overflow
1611	%else
1612	push A2 ; save A2 so we can modify it.
1613	mov T0, [A0] ; T0 = dividend low
1614	mov T1, [A1] ; T1 = dividend high
1615	test A2, A2
1616	js .divisor_negative
1617	test T1, T1
1618	jns .both_positive
1619	call iemAImpl_negate_T0_T1_u64
	test T1, T1 ; the helper's flags are not usable, so test again
	js .div_overflow ; dividend was the most negative value: always overflows
1620	.one_of_each: ; OK range is 2^(result-width - 1) + (divisor - 1).
1621	push T0 ; Start off like unsigned below.
1622	shl T1, 1
1623	shr T0, 63
1624	or T1, T0
1625	cmp T1, A2
1626	pop T0 ; pop leaves the flags of the compare intact
1627	jb .div_no_overflow
1628	ja .div_overflow
1629	mov T1, 0x7fffffffffffffff ; mask via register, T1 is no longer needed here
1630	and T0, T1 ; Special case for covering (divisor - 1).
1631	cmp T0, A2
1632	jae .div_overflow
1633	jmp .div_no_overflow
1634
1635	.divisor_negative:
1636	neg A2 ; A2 = |divisor|, compared as unsigned from here on
1637	test T1, T1
1638	jns .one_of_each
1639	call iemAImpl_negate_T0_T1_u64
	test T1, T1 ; the helper's flags are not usable, so test again
	js .div_overflow ; dividend was the most negative value: always overflows
1640	.both_positive: ; Same as unsigned shifted by sign indicator bit.
1641	shl T1, 1
1642	shr T0, 63
1643	or T1, T0
1644	cmp T1, A2
1645	jae .div_overflow
1646	.div_no_overflow:
1647	pop A2 ; restore the original (signed) divisor
1648	%endif
1649
1650	IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1652	%ifdef ASM_CALL64_GCC
1653	mov T1, A2 ; divisor -> T1, the rdx load below would otherwise destroy it
1654	mov rax, [A0]
1655	mov rdx, [A1]
1656	%1 T1
1657	mov [A0], rax
1658	mov [A1], rdx
1659	%else
1660	mov T1, A1 ; rDX pointer -> T1, the rdx load below would otherwise destroy it
1661	mov rax, [A0]
1662	mov rdx, [T1]
1663	%1 A2
1664	mov [A0], rax
1665	mov [T1], rdx
1666	%endif
1667	IEM_SAVE_FLAGS A3, %2, %3
1668	xor eax, eax ; return 0 (success)
1669
1670	.return:
1671	EPILOGUE_4_ARGS_EX 12
1672
1673	.div_overflow:
1674	%if %4 != 0
1675	pop A2 ; signed variant: balance the push done before the overflow check
1676	%endif
1677	.div_zero:
1678	mov eax, -1 ; tell the caller to raise a divide error
1679	jmp .return
1680	ENDPROC iemAImpl_ %+ %1 %+ _u64
1681	%endif ; RT_ARCH_AMD64
1682
1683	%endmacro
1684
1685	IEMIMPL_DIV_OP div, 0, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 0
1686	IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF \| X86_EFL_SF \| X86_EFL_ZF \| X86_EFL_AF \| X86_EFL_PF \| X86_EFL_CF), 1
1687
1688
1689	;
1690	; BSWAP. No flag changes.
1691	;
1692	; Each function takes one argument, pointer to the value to bswap
1693	; (input/output). They all return void.
1694	; The u16 variant below reads and writes a full 32 bits at [A0] and executes BSWAP with an operand-size prefix.
1695	BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
1696	PROLOGUE_1_ARGS
1697	mov T0_32, [A0] ; just in case any of the upper bits are used.
1698	db 66h ; hand-emitted operand-size prefix for the bswap below
1699	bswap T0_32 ; NOTE(review): with the 66h prefix this is a 16-bit BSWAP, whose result Intel documents as undefined - confirm intent
1700	mov [A0], T0_32
1701	EPILOGUE_1_ARGS
1702	ENDPROC iemAImpl_bswap_u16
1703
1704	BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4 ; byte-swaps the 32-bit value at [A0] in place
1705	PROLOGUE_1_ARGS
1706	mov T0_32, [A0] ; load
1707	bswap T0_32 ; reverse the byte order
1708	mov [A0], T0_32 ; store back
1709	EPILOGUE_1_ARGS
1710	ENDPROC iemAImpl_bswap_u32
1711
1712	BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4 ; byte-swaps the 64-bit value at [A0] in place
1713	%ifdef RT_ARCH_AMD64
1714	PROLOGUE_1_ARGS
1715	mov T0, [A0]
1716	bswap T0 ; single 64-bit swap
1717	mov [A0], T0
1718	EPILOGUE_1_ARGS
1719	%else
1720	PROLOGUE_1_ARGS
1721	mov T0, [A0] ; T0 = low dword
1722	mov T1, [A0 + 4] ; T1 = high dword
1723	bswap T0 ; swap the bytes within each half ...
1724	bswap T1
1725	mov [A0 + 4], T0 ; ... and exchange the halves when storing
1726	mov [A0], T1
1727	EPILOGUE_1_ARGS
1728	%endif
1729	ENDPROC iemAImpl_bswap_u64
1730
1731
1732	;;
1733	; Initialize the FPU for the actual instruction being emulated, this means
1734	; loading parts of the guest's control word and status word.
1735	; The current FPU environment is stored at [xSP], patched (FCW and FSW fields) and loaded back; T0 and T1 are clobbered.
1736	; @uses 24 bytes of stack. NOTE(review): FNSTENV writes a 28 byte image in 32-bit protected mode layout per the Intel SDM - confirm; the callers visible here reserve 20h bytes.
1737	; @param 1 Expression giving the address of the FXSTATE of the guest.
1738	;
1739	%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
1740	fnstenv [xSP] ; current environment -> scratch area at the top of the stack
1741
1742	; FCW - for exception, precision and rounding control.
1743	movzx T0, word [%1 + X86FXSTATE.FCW]
1744	and T0, X86_FCW_MASK_ALL \| X86_FCW_PC_MASK \| X86_FCW_RC_MASK
1745	mov [xSP + X86FSTENV32P.FCW], T0_16
1746
1747	; FSW - for undefined C0, C1, C2, and C3.
1748	movzx T1, word [%1 + X86FXSTATE.FSW]
1749	and T1, X86_FSW_C_MASK ; guest: condition code bits only
1750	movzx T0, word [xSP + X86FSTENV32P.FSW]
1751	and T0, X86_FSW_TOP_MASK ; current: TOP field only
1752	or T0, T1
1753	mov [xSP + X86FSTENV32P.FSW], T0_16
1754
1755	fldenv [xSP] ; activate the patched environment
1756	%endmacro
1757
1758
1759	;;
1760	; Result of an FPU operation: an 80-bit value followed by the FSW. Need to move this as well somewhere better?
1761	;
1762	struc IEMFPURESULT
1763	.r80Result resw 5 ; 5 words = 10 bytes, stored with 'fstp tword'
1764	.FSW resw 1 ; status word, stored with 'fnstsw word'
1765	endstruc
1766
1767
1768	;;
1769	; Result of an FPU operation producing two 80-bit values, with the FSW in between. Need to move this as well somewhere better?
1770	;
1771	struc IEMFPURESULTTWO
1772	.r80Result1 resw 5 ; 5 words = 10 bytes
1773	.FSW resw 1 ; status word
1774	.r80Result2 resw 5 ; 5 words = 10 bytes
1775	endstruc
1776
1777
1778	;
1779	;---------------------- 16-bit signed integer operations ----------------------
1780	;
1781
1782
1783	;;
1784	; Converts a 16-bit signed integer value to a 80-bit one (fpu register).
1785	;
1786	; @param A0 FPU context (fxsave).
1787	; @param A1 Pointer to a IEMFPURESULT for the output.
1788	; @param A2 Pointer to the 16-bit signed integer value to convert.
1789	;
1790	BEGINPROC_FASTCALL iemAImpl_fild_i16_to_r80, 12
1791	PROLOGUE_3_ARGS
1792	sub xSP, 20h ; scratch area for the FPU environment image used by the macro below
1793
1794	fninit ; start from a freshly initialized FPU
1795	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
1796	fild word [A2]
1797
1798	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status before it is cleared
1799	fnclex ; presumably so the store below cannot trip over a pending exception
1800	fstp tword [A1 + IEMFPURESULT.r80Result]
1801
1802	fninit ; reinitialize the FPU before returning
1803	add xSP, 20h
1804	EPILOGUE_3_ARGS
1805	ENDPROC iemAImpl_fild_i16_to_r80
1806
1807
1808	;;
1809	; Store a 80-bit floating point value (register) as a 16-bit signed integer (memory).
1810	;
1811	; @param A0 FPU context (fxsave).
1812	; @param A1 Where to return the output FSW.
1813	; @param A2 Where to store the 16-bit signed integer value.
1814	; @param A3 Pointer to the 80-bit value.
1815	;
1816	BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
1817	PROLOGUE_4_ARGS
1818	sub xSP, 20h ; scratch area for the FPU environment image used by the macro below
1819
1820	fninit ; start from a freshly initialized FPU
1821	fld tword [A3] ; load the value before the guest control word is applied
1822	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
1823	fistp word [A2] ; popping form; the FPU is reinitialized below anyway
1824
1825	fnstsw word [A1]
1826
1827	fninit ; reinitialize the FPU before returning
1828	add xSP, 20h
1829	EPILOGUE_4_ARGS
1830	ENDPROC iemAImpl_fist_r80_to_i16
1831
1832
1833	;;
1834	; Store a 80-bit floating point value (register) as a 16-bit signed integer
1835	; (memory) with truncation.
1836	;
1837	; @param A0 FPU context (fxsave).
1838	; @param A1 Where to return the output FSW.
1839	; @param A2 Where to store the 16-bit signed integer value.
1840	; @param A3 Pointer to the 80-bit value.
1841	;
1842	BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
1843	PROLOGUE_4_ARGS
1844	sub xSP, 20h ; scratch area for the FPU environment image used by the macro below
1845
1846	fninit ; start from a freshly initialized FPU
1847	fld tword [A3] ; load the value before the guest control word is applied
1848	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
1849	fisttp word [A2] ; 16-bit store: a dword store would write 2 bytes past the destination
1850
1851	fnstsw word [A1]
1852
1853	fninit ; reinitialize the FPU before returning
1854	add xSP, 20h
1855	EPILOGUE_4_ARGS
1856	ENDPROC iemAImpl_fistt_r80_to_i16
1857
1858
1859	;;
1860	; FPU instruction working on one 80-bit and one 16-bit signed integer value.
1861	; Generates iemAImpl_<instruction>_r80_by_i16.
1862	; @param 1 The instruction
1863	;
1864	; @param A0 FPU context (fxsave).
1865	; @param A1 Pointer to a IEMFPURESULT for the output.
1866	; @param A2 Pointer to the 80-bit value.
1867	; @param A3 Pointer to the 16-bit value.
1868	;
1869	%macro IEMIMPL_FPU_R80_BY_I16 1
1870	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
1871	PROLOGUE_4_ARGS
1872	sub xSP, 20h ; scratch area for the FPU environment image used by the macro below
1873
1874	fninit ; start from a freshly initialized FPU
1875	fld tword [A2] ; first operand -> st0
1876	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
1877	%1 word [A3]
1878
1879	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status before it is cleared
1880	fnclex ; presumably so the store below cannot trip over a pending exception
1881	fstp tword [A1 + IEMFPURESULT.r80Result]
1882
1883	fninit ; reinitialize the FPU before returning
1884	add xSP, 20h
1885	EPILOGUE_4_ARGS
1886	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
1887	%endmacro
1888
1889	IEMIMPL_FPU_R80_BY_I16 fiadd
1890	IEMIMPL_FPU_R80_BY_I16 fimul
1891	IEMIMPL_FPU_R80_BY_I16 fisub
1892	IEMIMPL_FPU_R80_BY_I16 fisubr
1893	IEMIMPL_FPU_R80_BY_I16 fidiv
1894	IEMIMPL_FPU_R80_BY_I16 fidivr
1895
1896
1897	;;
1898	; FPU instruction working on one 80-bit and one 16-bit signed integer value,
1899	; only returning FSW.
1900	; Generates iemAImpl_<instruction>_r80_by_i16.
1901	; @param 1 The instruction
1902	;
1903	; @param A0 FPU context (fxsave).
1904	; @param A1 Where to store the output FSW.
1905	; @param A2 Pointer to the 80-bit value.
1906	; @param A3 Pointer to the 16-bit value.
1907	;
1908	%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
1909	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
1910	PROLOGUE_4_ARGS
1911	sub xSP, 20h ; scratch area for the FPU environment image used by the macro below
1912
1913	fninit ; start from a freshly initialized FPU
1914	fld tword [A2] ; first operand -> st0
1915	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
1916	%1 word [A3]
1917
1918	fnstsw word [A1] ; the status word is the only output
1919
1920	fninit ; reinitialize the FPU before returning
1921	add xSP, 20h
1922	EPILOGUE_4_ARGS
1923	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
1924	%endmacro
1925
1926	IEMIMPL_FPU_R80_BY_I16_FSW ficom
1927
1928
1929
1930	;
1931	;---------------------- 32-bit signed integer operations ----------------------
1932	;
1933
1934
1935	;;
1936	; Converts a 32-bit signed integer value to a 80-bit one (fpu register).
1937	;
1938	; @param A0 FPU context (fxsave).
1939	; @param A1 Pointer to a IEMFPURESULT for the output.
1940	; @param A2 Pointer to the 32-bit signed integer value to convert.
1941	;
1942	BEGINPROC_FASTCALL iemAImpl_fild_i32_to_r80, 12
1943	PROLOGUE_3_ARGS
1944	sub xSP, 20h ; scratch area for the FPU environment image used by the macro below
1945
1946	fninit ; start from a freshly initialized FPU
1947	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
1948	fild dword [A2]
1949
1950	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status before it is cleared
1951	fnclex ; presumably so the store below cannot trip over a pending exception
1952	fstp tword [A1 + IEMFPURESULT.r80Result]
1953
1954	fninit ; reinitialize the FPU before returning
1955	add xSP, 20h
1956	EPILOGUE_3_ARGS
1957	ENDPROC iemAImpl_fild_i32_to_r80
1958
1959
1960	;;
1961	; Store a 80-bit floating point value (register) as a 32-bit signed integer (memory).
1962	;
1963	; @param A0 FPU context (fxsave).
1964	; @param A1 Where to return the output FSW.
1965	; @param A2 Where to store the 32-bit signed integer value.
1966	; @param A3 Pointer to the 80-bit value.
1967	;
1968	BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
1969	PROLOGUE_4_ARGS
1970	sub xSP, 20h ; scratch area for the FPU environment image used by the macro below
1971
1972	fninit ; start from a freshly initialized FPU
1973	fld tword [A3] ; load the value before the guest control word is applied
1974	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
1975	fistp dword [A2] ; popping form; the FPU is reinitialized below anyway
1976
1977	fnstsw word [A1]
1978
1979	fninit ; reinitialize the FPU before returning
1980	add xSP, 20h
1981	EPILOGUE_4_ARGS
1982	ENDPROC iemAImpl_fist_r80_to_i32
1983
1984
1985	;;
1986	; Store a 80-bit floating point value (register) as a 32-bit signed integer
1987	; (memory) with truncation.
1988	;
1989	; @param A0 FPU context (fxsave).
1990	; @param A1 Where to return the output FSW.
1991	; @param A2 Where to store the 32-bit signed integer value.
1992	; @param A3 Pointer to the 80-bit value.
1993	;
1994	BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
1995	PROLOGUE_4_ARGS
1996	sub xSP, 20h ; scratch area for the FPU environment image used by the macro below
1997
1998	fninit ; start from a freshly initialized FPU
1999	fld tword [A3] ; load the value before the guest control word is applied
2000	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2001	fisttp dword [A2] ; truncating store
2002
2003	fnstsw word [A1]
2004
2005	fninit ; reinitialize the FPU before returning
2006	add xSP, 20h
2007	EPILOGUE_4_ARGS
2008	ENDPROC iemAImpl_fistt_r80_to_i32
2009
2010
2011	;;
2012	; FPU instruction working on one 80-bit and one 32-bit signed integer value.
2013	; Generates iemAImpl_<instruction>_r80_by_i32.
2014	; @param 1 The instruction
2015	;
2016	; @param A0 FPU context (fxsave).
2017	; @param A1 Pointer to a IEMFPURESULT for the output.
2018	; @param A2 Pointer to the 80-bit value.
2019	; @param A3 Pointer to the 32-bit value.
2020	;
2021	%macro IEMIMPL_FPU_R80_BY_I32 1
2022	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
2023	PROLOGUE_4_ARGS
2024	sub xSP, 20h ; scratch area for the FPU environment image used by the macro below
2025
2026	fninit ; start from a freshly initialized FPU
2027	fld tword [A2] ; first operand -> st0
2028	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2029	%1 dword [A3]
2030
2031	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status before it is cleared
2032	fnclex ; presumably so the store below cannot trip over a pending exception
2033	fstp tword [A1 + IEMFPURESULT.r80Result]
2034
2035	fninit ; reinitialize the FPU before returning
2036	add xSP, 20h
2037	EPILOGUE_4_ARGS
2038	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
2039	%endmacro
2040
2041	IEMIMPL_FPU_R80_BY_I32 fiadd
2042	IEMIMPL_FPU_R80_BY_I32 fimul
2043	IEMIMPL_FPU_R80_BY_I32 fisub
2044	IEMIMPL_FPU_R80_BY_I32 fisubr
2045	IEMIMPL_FPU_R80_BY_I32 fidiv
2046	IEMIMPL_FPU_R80_BY_I32 fidivr
2047
2048
2049	;;
2050	; FPU instruction working on one 80-bit and one 32-bit signed integer value,
2051	; only returning FSW.
2052	; Generates iemAImpl_<instruction>_r80_by_i32.
2053	; @param 1 The instruction
2054	;
2055	; @param A0 FPU context (fxsave).
2056	; @param A1 Where to store the output FSW.
2057	; @param A2 Pointer to the 80-bit value.
2058	; @param A3 Pointer to the 32-bit value.
2059	;
2060	%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
2061	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
2062	PROLOGUE_4_ARGS
2063	sub xSP, 20h ; scratch area for the FPU environment image used by the macro below
2064
2065	fninit ; start from a freshly initialized FPU
2066	fld tword [A2] ; first operand -> st0
2067	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2068	%1 dword [A3]
2069
2070	fnstsw word [A1] ; the status word is the only output
2071
2072	fninit ; reinitialize the FPU before returning
2073	add xSP, 20h
2074	EPILOGUE_4_ARGS
2075	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
2076	%endmacro
2077
2078	IEMIMPL_FPU_R80_BY_I32_FSW ficom
2079
2080
2081
2082	;
2083	;---------------------- 64-bit signed integer operations ----------------------
2084	;
2085
2086
2087	;;
2088	; Converts a 64-bit signed integer value to a 80-bit one (fpu register).
2089	;
2090	; @param A0 FPU context (fxsave).
2091	; @param A1 Pointer to a IEMFPURESULT for the output.
2092	; @param A2 Pointer to the 64-bit signed integer value to convert.
2093	;
2094	BEGINPROC_FASTCALL iemAImpl_fild_i64_to_r80, 12
2095	PROLOGUE_3_ARGS
2096	sub xSP, 20h ; scratch area for the FPU environment image used by the macro below
2097
2098	fninit ; start from a freshly initialized FPU
2099	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2100	fild qword [A2]
2101
2102	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status before it is cleared
2103	fnclex ; presumably so the store below cannot trip over a pending exception
2104	fstp tword [A1 + IEMFPURESULT.r80Result]
2105
2106	fninit ; reinitialize the FPU before returning
2107	add xSP, 20h
2108	EPILOGUE_3_ARGS
2109	ENDPROC iemAImpl_fild_i64_to_r80
2110
2111
2112	;;
2113	; Store a 80-bit floating point value (register) as a 64-bit signed integer (memory).
2114	;
2115	; @param A0 FPU context (fxsave).
2116	; @param A1 Where to return the output FSW.
2117	; @param A2 Where to store the 64-bit signed integer value.
2118	; @param A3 Pointer to the 80-bit value.
2119	;
2120	BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
2121	PROLOGUE_4_ARGS
2122	sub xSP, 20h ; scratch area for the FPU environment image used by the macro below
2123
2124	fninit ; start from a freshly initialized FPU
2125	fld tword [A3] ; load the value before the guest control word is applied
2126	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2127	fistp qword [A2] ; popping form; the FPU is reinitialized below anyway
2128
2129	fnstsw word [A1]
2130
2131	fninit ; reinitialize the FPU before returning
2132	add xSP, 20h
2133	EPILOGUE_4_ARGS
2134	ENDPROC iemAImpl_fist_r80_to_i64
2135
2136
2137	;;
2138	; Store a 80-bit floating point value (register) as a 64-bit signed integer
2139	; (memory) with truncation.
2140	;
2141	; @param A0 FPU context (fxsave).
2142	; @param A1 Where to return the output FSW.
2143	; @param A2 Where to store the 64-bit signed integer value.
2144	; @param A3 Pointer to the 80-bit value.
2145	;
2146	BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
2147	PROLOGUE_4_ARGS
2148	sub xSP, 20h ; scratch area for the FPU environment image used by the macro below
2149
2150	fninit ; start from a freshly initialized FPU
2151	fld tword [A3] ; load the value before the guest control word is applied
2152	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2153	fisttp qword [A2] ; truncating store
2154
2155	fnstsw word [A1]
2156
2157	fninit ; reinitialize the FPU before returning
2158	add xSP, 20h
2159	EPILOGUE_4_ARGS
2160	ENDPROC iemAImpl_fistt_r80_to_i64
2161
2162
2163
2164	;
2165	;---------------------- 32-bit floating point operations ----------------------
2166	;
2167
2168	;;
2169	; Converts a 32-bit floating point value to a 80-bit one (fpu register).
2170	;
2171	; @param A0 FPU context (fxsave).
2172	; @param A1 Pointer to a IEMFPURESULT for the output.
2173	; @param A2 Pointer to the 32-bit floating point value to convert.
2174	;
2175	BEGINPROC_FASTCALL iemAImpl_fld_r32_to_r80, 12
2176	PROLOGUE_3_ARGS
2177	sub xSP, 20h ; scratch area for the FPU environment image used by the macro below
2178
2179	fninit ; start from a freshly initialized FPU
2180	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2181	fld dword [A2]
2182
2183	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status before it is cleared
2184	fnclex ; presumably so the store below cannot trip over a pending exception
2185	fstp tword [A1 + IEMFPURESULT.r80Result]
2186
2187	fninit ; reinitialize the FPU before returning
2188	add xSP, 20h
2189	EPILOGUE_3_ARGS
2190	ENDPROC iemAImpl_fld_r32_to_r80
2191
2192
2193	;;
2194	; Store a 80-bit floating point value (register) as a 32-bit one (memory).
2195	;
2196	; @param A0 FPU context (fxsave).
2197	; @param A1 Where to return the output FSW.
2198	; @param A2 Where to store the 32-bit value.
2199	; @param A3 Pointer to the 80-bit value.
2200	;
2201	BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
2202	PROLOGUE_4_ARGS
2203	sub xSP, 20h ; scratch area for the FPU environment image used by the macro below
2204
2205	fninit ; start from a freshly initialized FPU
2206	fld tword [A3] ; load the value before the guest control word is applied
2207	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2208	fst dword [A2] ; non-popping store; the FPU is reinitialized below anyway
2209
2210	fnstsw word [A1]
2211
2212	fninit ; reinitialize the FPU before returning
2213	add xSP, 20h
2214	EPILOGUE_4_ARGS
2215	ENDPROC iemAImpl_fst_r80_to_r32
2216
2217
2218	;;
2219	; FPU instruction working on one 80-bit and one 32-bit floating point value.
2220	; Generates iemAImpl_<instruction>_r80_by_r32.
2221	; @param 1 The instruction
2222	;
2223	; @param A0 FPU context (fxsave).
2224	; @param A1 Pointer to a IEMFPURESULT for the output.
2225	; @param A2 Pointer to the 80-bit value.
2226	; @param A3 Pointer to the 32-bit value.
2227	;
2228	%macro IEMIMPL_FPU_R80_BY_R32 1
2229	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
2230	PROLOGUE_4_ARGS
2231	sub xSP, 20h ; scratch area for the FPU environment image used by the macro below
2232
2233	fninit ; start from a freshly initialized FPU
2234	fld tword [A2] ; first operand -> st0
2235	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2236	%1 dword [A3]
2237
2238	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status before it is cleared
2239	fnclex ; presumably so the store below cannot trip over a pending exception
2240	fstp tword [A1 + IEMFPURESULT.r80Result]
2241
2242	fninit ; reinitialize the FPU before returning
2243	add xSP, 20h
2244	EPILOGUE_4_ARGS
2245	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
2246	%endmacro
2247
2248	IEMIMPL_FPU_R80_BY_R32 fadd
2249	IEMIMPL_FPU_R80_BY_R32 fmul
2250	IEMIMPL_FPU_R80_BY_R32 fsub
2251	IEMIMPL_FPU_R80_BY_R32 fsubr
2252	IEMIMPL_FPU_R80_BY_R32 fdiv
2253	IEMIMPL_FPU_R80_BY_R32 fdivr
2254
2255
2256	;;
2257	; FPU instruction working on one 80-bit and one 32-bit floating point value,
2258	; only returning FSW.
2259	; Generates iemAImpl_<instruction>_r80_by_r32.
2260	; @param 1 The instruction
2261	;
2262	; @param A0 FPU context (fxsave).
2263	; @param A1 Where to store the output FSW.
2264	; @param A2 Pointer to the 80-bit value.
2265	; @param A3 Pointer to the 32-bit value.
2266	;
2267	%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
2268	BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
2269	PROLOGUE_4_ARGS
2270	sub xSP, 20h ; scratch area for the FPU environment image used by the macro below
2271
2272	fninit ; start from a freshly initialized FPU
2273	fld tword [A2] ; first operand -> st0
2274	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2275	%1 dword [A3]
2276
2277	fnstsw word [A1] ; the status word is the only output
2278
2279	fninit ; reinitialize the FPU before returning
2280	add xSP, 20h
2281	EPILOGUE_4_ARGS
2282	ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
2283	%endmacro
2284
2285	IEMIMPL_FPU_R80_BY_R32_FSW fcom
2286
2287
2288
2289	;
2290	;---------------------- 64-bit floating point operations ----------------------
2291	;
2292
2293	;;
2294	; Converts a 64-bit floating point value to a 80-bit one (fpu register).
2295	;
2296	; @param A0 FPU context (fxsave).
2297	; @param A1 Pointer to a IEMFPURESULT for the output.
2298	; @param A2 Pointer to the 64-bit floating point value to convert.
2299	;
2300	BEGINPROC_FASTCALL iemAImpl_fld_r64_to_r80, 12
2301	PROLOGUE_3_ARGS
2302	sub xSP, 20h ; scratch area for the FPU environment image used by the macro below
2303
	fninit ; start from a freshly initialized FPU, like the other load helpers, so the load does not depend on leftover host FPU state
2304	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2305	fld qword [A2]
2306
2307	fnstsw word [A1 + IEMFPURESULT.FSW] ; capture the status before it is cleared
2308	fnclex ; presumably so the store below cannot trip over a pending exception
2309	fstp tword [A1 + IEMFPURESULT.r80Result]
2310
2311	fninit ; reinitialize the FPU before returning
2312	add xSP, 20h
2313	EPILOGUE_3_ARGS
2314	ENDPROC iemAImpl_fld_r64_to_r80
2315
2316
2317	;;
2318	; Store a 80-bit floating point value (register) as a 64-bit one (memory).
2319	;
2320	; @param A0 FPU context (fxsave).
2321	; @param A1 Where to return the output FSW.
2322	; @param A2 Where to store the 64-bit value.
2323	; @param A3 Pointer to the 80-bit value.
2324	;
2325	BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
2326	PROLOGUE_4_ARGS
2327	sub xSP, 20h ; scratch area for the FPU environment image used by the macro below
2328
2329	fninit ; start from a freshly initialized FPU
2330	fld tword [A3] ; load the value before the guest control word is applied
2331	FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2332	fst qword [A2] ; non-popping store; the FPU is reinitialized below anyway
2333
2334	fnstsw word [A1]
2335
2336	fninit ; reinitialize the FPU before returning
2337	add xSP, 20h
2338	EPILOGUE_4_ARGS
2339	ENDPROC iemAImpl_fst_r80_to_r64
2340
2341
;;
; Generates a worker for an FPU instruction that combines an 80-bit value
; (loaded into ST0) with a 64-bit floating point memory operand, returning
; both the status word and the 80-bit result.
;
; @param    1       The instruction mnemonic.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Stack scratch; presumably consumed by FPU_LD_FXSTATE_FCW_AND_SAFE_FSW (defined elsewhere).

        fninit                          ; Start from an empty host FPU register stack.
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]              ; Execute against the 64-bit memory operand.

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Drop pending exceptions before the fstp.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64 fadd
IEMIMPL_FPU_R80_BY_R64 fmul
IEMIMPL_FPU_R80_BY_R64 fsub
IEMIMPL_FPU_R80_BY_R64 fsubr
IEMIMPL_FPU_R80_BY_R64 fdiv
IEMIMPL_FPU_R80_BY_R64 fdivr
2378
;;
; Generates a worker for an FPU instruction that combines an 80-bit value
; (loaded into ST0) with a 64-bit floating point memory operand and yields
; nothing but the resulting FPU status word.
;
; @param    1       The instruction mnemonic.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Stack scratch; presumably consumed by FPU_LD_FXSTATE_FCW_AND_SAFE_FSW (defined elsewhere).

        fninit                          ; Start from an empty host FPU register stack.
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]              ; Execute against the 64-bit memory operand.

        fnstsw  word  [A1]              ; Hand the status word back to the caller.

        fninit                          ; Leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64_FSW fcom
2409
2410
2411
2412	;
2413	;---------------------- 80-bit floating point operations ----------------------
2414	;
2415
;;
; Loads an 80-bit floating point value from memory into an FPU register and
; returns it together with the status word produced by the load.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit floating point value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Stack scratch; presumably consumed by FPU_LD_FXSTATE_FCW_AND_SAFE_FSW (defined elsewhere).

        fninit                          ; Start from an empty host FPU register stack.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     tword [A2]              ; ST0 = loaded value.

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Drop pending exceptions before the fstp.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r80
2439
2440
;;
; Stores an 80-bit floating point register value to memory in 80-bit format
; and reports the resulting FPU status word.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 80-bit value.
; @param    A3      Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Stack scratch; presumably consumed by FPU_LD_FXSTATE_FCW_AND_SAFE_FSW (defined elsewhere).

        fninit                          ; Start from an empty host FPU register stack.
        fld     tword [A3]              ; ST0 = value to store.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fstp    tword [A2]              ; There is no non-popping 80-bit store, hence fstp.

        fnstsw  word  [A1]

        fninit                          ; Leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r80
2464
2465
;;
; Generates a worker for an FPU instruction operating on two 80-bit values,
; leaving its result in ST0.
;
; @param    1       The instruction mnemonic.
; @param    2       The operand list to give the instruction; pass {} for
;                   instructions that take their operands implicitly.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the first 80-bit value (ST0)
; @param    A3      Pointer to the second 80-bit value (STn).
;
%macro IEMIMPL_FPU_R80_BY_R80 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Stack scratch; presumably consumed by FPU_LD_FXSTATE_FCW_AND_SAFE_FSW (defined elsewhere).

        fninit                          ; Start from an empty host FPU register stack.
        fld     tword [A3]              ; Pushed first, so it ends up in ST1.
        fld     tword [A2]              ; Pushed last, so it ends up in ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      %2

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Drop pending exceptions before the fstp.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80 fadd,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fmul,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsub,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsubr,  {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdiv,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdivr,  {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fprem,  {}
IEMIMPL_FPU_R80_BY_R80 fprem1, {}
IEMIMPL_FPU_R80_BY_R80 fscale, {}
2506
2507
;;
; Generates a worker for an FPU instruction that consumes ST1 and ST0, writes
; its result to ST1 and pops the stack (so the result is the new ST0).
;
; @param    1       The instruction mnemonic.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the first 80-bit value (ST1).
; @param    A3      Pointer to the second 80-bit value (ST0).
;
%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Stack scratch; presumably consumed by FPU_LD_FXSTATE_FCW_AND_SAFE_FSW (defined elsewhere).

        fninit                          ; Start from an empty host FPU register stack.
        fld     tword [A2]              ; Pushed first, so it ends up in ST1.
        fld     tword [A3]              ; Pushed last, so it ends up in ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Drop pending exceptions before the fstp.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
2542
2543
;;
; Generates a worker for an FPU instruction that takes two 80-bit values
; (as ST0 and ST1) and yields nothing but the resulting FPU status word.
;
; @param    1       The instruction mnemonic.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a uint16_t for the resulting FSW.
; @param    A2      Pointer to the first 80-bit value.
; @param    A3      Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Stack scratch; presumably consumed by FPU_LD_FXSTATE_FCW_AND_SAFE_FSW (defined elsewhere).

        fninit                          ; Start from an empty host FPU register stack.
        fld     tword [A3]              ; Pushed first, so it ends up in ST1.
        fld     tword [A2]              ; Pushed last, so it ends up in ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st0, st1

        fnstsw  word  [A1]              ; Hand the status word back to the caller.

        fninit                          ; Leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_FSW fcom
IEMIMPL_FPU_R80_BY_R80_FSW fucom
2576
2577
;;
; Generates a worker for an FPU compare instruction that takes two 80-bit
; values (as ST0 and ST1) and reports its outcome in both FSW and EFLAGS.
;
; @param    1       The instruction mnemonic.
;
; @returns  EFLAGS in EAX.
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a uint16_t for the resulting FSW.
; @param    A2      Pointer to the first 80-bit value.
; @param    A3      Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Stack scratch; presumably consumed by FPU_LD_FXSTATE_FCW_AND_SAFE_FSW (defined elsewhere).

        fninit                          ; Start from an empty host FPU register stack.
        fld     tword [A3]              ; Pushed first, so it ends up in ST1.
        fld     tword [A2]              ; Pushed last, so it ends up in ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st1

        fnstsw  word  [A1]
        pushf                           ; Grab the flags now, before the stack
        pop     xAX                     ; pointer adjustment below modifies them.

        fninit                          ; Leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_EFL fcomi
IEMIMPL_FPU_R80_BY_R80_EFL fucomi
2613
2614
;;
; Generates a worker for an FPU instruction that operates on a single 80-bit
; value in ST0 and leaves its result there.
;
; @param    1       The instruction mnemonic.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Stack scratch; presumably consumed by FPU_LD_FXSTATE_FCW_AND_SAFE_FSW (defined elsewhere).

        fninit                          ; Start from an empty host FPU register stack.
        fld     tword [A2]              ; ST0 = operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Drop pending exceptions before the fstp.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80 fchs
IEMIMPL_FPU_R80 fabs
IEMIMPL_FPU_R80 f2xm1
; NOTE(review): fyl2x reads both ST0 and ST1, but this macro only loads one
; value after fninit, so ST1 is empty when it executes.  It probably belongs
; with the IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP workers; moving it would rename
; the exported symbol, so it is left here until the callers can be updated.
IEMIMPL_FPU_R80 fyl2x
IEMIMPL_FPU_R80 fsqrt
IEMIMPL_FPU_R80 frndint
IEMIMPL_FPU_R80 fsin
IEMIMPL_FPU_R80 fcos
2652
2653
;;
; Generates a worker for an FPU instruction that examines a single 80-bit
; value in ST0 and yields nothing but the resulting FPU status word.
;
; @param    1       The instruction mnemonic.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a uint16_t for the resulting FSW.
; @param    A2      Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Stack scratch; presumably consumed by FPU_LD_FXSTATE_FCW_AND_SAFE_FSW (defined elsewhere).

        fninit                          ; Start from an empty host FPU register stack.
        fld     tword [A2]              ; ST0 = operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        fnstsw  word  [A1]              ; Hand the status word back to the caller.

        fninit                          ; Leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80_FSW ftst
IEMIMPL_FPU_R80_FSW fxam
2684
2685
2686
;;
; Generates a worker for an FPU instruction that pushes an 80-bit floating
; point constant, returning the constant and the resulting status word.
;
; @param    1       The instruction mnemonic.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
;
%macro IEMIMPL_FPU_R80_CONST 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
        PROLOGUE_2_ARGS
        sub     xSP, 20h                ; Stack scratch; presumably consumed by FPU_LD_FXSTATE_FCW_AND_SAFE_FSW (defined elsewhere).

        fninit                          ; Start from an empty host FPU register stack.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1                              ; ST0 = the constant.

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; Drop pending exceptions before the fstp.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; Leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1                 ; No trailing '%+': the name must match the BEGINPROC_FASTCALL one.
%endmacro

IEMIMPL_FPU_R80_CONST fld1
IEMIMPL_FPU_R80_CONST fldl2t
IEMIMPL_FPU_R80_CONST fldl2e
IEMIMPL_FPU_R80_CONST fldpi
IEMIMPL_FPU_R80_CONST fldlg2
IEMIMPL_FPU_R80_CONST fldln2
IEMIMPL_FPU_R80_CONST fldz
2721
2722
;;
; Generates a worker for an FPU instruction that takes a single 80-bit value
; and produces two 80-bit results (it pushes one additional stack entry).
;
; @param    1       The instruction mnemonic.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULTTWO for the output.
; @param    A2      Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Stack scratch; presumably consumed by FPU_LD_FXSTATE_FCW_AND_SAFE_FSW (defined elsewhere).

        fninit                          ; Start from an empty host FPU register stack.
        fld     tword [A2]              ; ST0 = operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        fnstsw  word  [A1 + IEMFPURESULTTWO.FSW]
        fnclex                          ; Drop pending exceptions before each fstp.
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result2]    ; Top of stack is the second result.
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result1]    ; Next entry is the first result.

        fninit                          ; Leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
%endmacro

IEMIMPL_FPU_R80_R80 fptan
IEMIMPL_FPU_R80_R80 fxtract
IEMIMPL_FPU_R80_R80 fsincos
2757
2758
2759
2760
2761	;---------------------- SSE and MMX Operations ----------------------
2762
;;
; Hooks expanded at the start and end of every MMX worker body.
; Both are currently empty placeholders.
;
;; @todo what do we need to do for MMX?
%macro IEMIMPL_MMX_PROLOGUE 0
%endmacro
%macro IEMIMPL_MMX_EPILOGUE 0
%endmacro

;;
; Hooks expanded at the start and end of every SSE worker body.
; Both are currently empty placeholders.
;
;; @todo what do we need to do for SSE?
%macro IEMIMPL_SSE_PROLOGUE 0
%endmacro
%macro IEMIMPL_SSE_EPILOGUE 0
%endmacro
2774
2775
;;
; Generates the MMX (_u64) and SSE (_u128) workers for a media instruction
; taking two full sized register operands.
;
; @param    1       The instruction mnemonic.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to the first media register size operand (input/output).
; @param    A2      Pointer to the second media register size operand (input).
;
%macro IEMIMPL_MEDIA_F2 1
; 64-bit MMX variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]               ; Destination / first source.
        movq    mm1, [A2]               ; Second source.
        %1      mm0, mm1
        movq    [A1], mm0               ; Write the result back in place.

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

; 128-bit SSE variant; movdqu is used so the operands need not be aligned.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]              ; Destination / first source.
        movdqu  xmm1, [A2]              ; Second source.
        %1      xmm0, xmm1
        movdqu  [A1], xmm0              ; Write the result back in place.

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F2 pxor
IEMIMPL_MEDIA_F2 pcmpeqb
IEMIMPL_MEDIA_F2 pcmpeqw
IEMIMPL_MEDIA_F2 pcmpeqd
2817
2818
;;
; Generates the workers for a media instruction taking one full sized register
; operand and the lower half of a second one.  Only the low half of the second
; operand is read from memory (32 bits for MMX, 64 bits for SSE).
;
; @param    1       The instruction mnemonic.
; @param    2       1 if MMX is included, 0 if not.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to the first full sized media register operand (input/output).
; @param    A2      Pointer to the second half sized media register operand (input).
;
%macro IEMIMPL_MEDIA_F1L1 2
 %if %2 != 0
; 64-bit MMX variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]               ; Destination / first source.
        movd    mm1, [A2]               ; Second source: low 32 bits only.
        %1      mm0, mm1
        movq    [A1], mm0               ; Write the result back in place.

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

; 128-bit SSE variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]              ; Destination / first source.
        movq    xmm1, [A2]              ; Second source: low 64 bits only.
        %1      xmm0, xmm1
        movdqu  [A1], xmm0              ; Write the result back in place.

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F1L1 punpcklbw,  1
IEMIMPL_MEDIA_F1L1 punpcklwd,  1
IEMIMPL_MEDIA_F1L1 punpckldq,  1
IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
2863
2864
;;
; Generates the workers for a media instruction taking one full sized register
; operand and the upper half of a second one.  The second operand is loaded in
; full so that its upper half is available to the instruction.
;
; @param    1       The instruction mnemonic.
; @param    2       1 if MMX is included, 0 if not.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to the first full sized media register operand (input/output).
; @param    A2      Pointer to the second full sized media register operand, where we
;                   will only use the upper half (input).
;
%macro IEMIMPL_MEDIA_F1H1 2
 %if %2 != 0
; 64-bit MMX variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]               ; Destination / first source.
        movq    mm1, [A2]               ; Second source, all 64 bits.
        %1      mm0, mm1
        movq    [A1], mm0               ; Write the result back in place.

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

; 128-bit SSE variant.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]              ; Destination / first source.
        movdqu  xmm1, [A2]              ; Second source, all 128 bits.
        %1      xmm0, xmm1
        movdqu  [A1], xmm0              ; Write the result back in place.

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

; The high-half unpack instructions must use the F1H1 template: the F1L1 one
; only loads the low half of the second operand, leaving the upper half that
; these instructions consume zeroed.
IEMIMPL_MEDIA_F1H1 punpckhbw,  1
IEMIMPL_MEDIA_F1H1 punpckhwd,  1
IEMIMPL_MEDIA_F1H1 punpckhdq,  1
IEMIMPL_MEDIA_F1H1 punpckhqdq, 0
2910
2911
;
; Shufflers with evil 8-bit immediates.
;

;;
; PSHUFW worker.  The shuffle order is an instruction immediate, so the body
; is followed by a table holding one 'pshufw mm0, mm1, imm / ret' stub per
; possible immediate value, and the matching stub is called.
;
; @param    A1      Pointer to the destination operand; read, then overwritten
;                   with the result.
; @param    A2      Pointer to the source operand.
; @param    A3      The 8-bit immediate, used as the index into the stub table.
;                   NOTE(review): the index math uses all of A3, so this assumes
;                   the bits above the low byte are zero - confirm with callers.
;
BEGINPROC_FASTCALL iemAImpl_pshufw, 16
        PROLOGUE_4_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]
        movq    mm1, [A2]
        lea     T0, [A3 + A3*4]         ; sizeof(pshufw+ret) == 5
        lea     T1, [.imm0 xWrtRIP]     ; T1 = start of the stub table.
        lea     T1, [T1 + T0]           ; T1 = stub for this immediate.
        call    T1
        movq    [A1], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_4_ARGS

        ; The stub table: .imm0 thru .imm255, five bytes apiece.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pshufw  mm0, mm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*5 == 0x500
        ; Assembly time size check: both words evaluate to 0xffff when the
        ; table is exactly 0x500 bytes and no longer fit in a word otherwise.
dw 0xfaff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_pshufw
2941
2942
;;
; Generates the worker for an SSE shuffle instruction with an 8-bit immediate.
; The shuffle order is an instruction immediate, so the body is followed by a
; table holding one '<instr> xmm0, xmm1, imm / ret' stub per possible immediate
; value, and the matching stub is called.
;
; @param    1       The instruction mnemonic.
;
; @param    A1      Pointer to the destination operand; read, then overwritten
;                   with the result.
; @param    A2      Pointer to the source operand.
; @param    A3      The 8-bit immediate, used as the index into the stub table.
;                   NOTE(review): the index math uses all of A3, so this assumes
;                   the bits above the low byte are zero - confirm with callers.
;
%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        movdqu  xmm1, [A2]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = start of the stub table.
        lea     T0, [A3 + A3*2]         ; sizeof(pshufXX+ret) == 6: (A3 * 3) * 2
        lea     T1, [T1 + T0*2]         ; T1 = stub for this immediate.
        call    T1
        movdqu  [A1], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS

        ; The stub table: .imm0 thru .imm255, six bytes apiece.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
        ; Assembly time size check: both words evaluate to 0xffff when the
        ; table is exactly 0x600 bytes and no longer fit in a word otherwise.
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
2974
2975
;
; Move byte mask.
;

;;
; PMOVMSKB worker for a 64-bit MMX source.
;
; @param    A0      Not referenced by this function.
; @param    A1      Pointer to the 64-bit destination; receives the byte mask
;                   zero extended.
; @param    A2      Pointer to the 64-bit source operand.
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        ; No need to preload T0 from [A1]: pmovmskb writes the whole
        ; destination register, zeroing everything above the mask bits.
        movq    mm1, [A2]
        pmovmskb T0, mm1
        mov     [A1], T0
%ifdef RT_ARCH_X86
        mov     dword [A1 + 4], 0       ; T0 only covers the low dword on 32-bit hosts.
%endif
        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_pmovmskb_u64
2994
;;
; PMOVMSKB worker for a 128-bit SSE source.
;
; @param    A0      Not referenced by this function.
; @param    A1      Pointer to the 64-bit destination; receives the byte mask
;                   zero extended.
; @param    A2      Pointer to the 128-bit source operand (need not be aligned).
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; No need to preload T0 from [A1]: pmovmskb writes the whole
        ; destination register, zeroing everything above the mask bits.
        movdqu  xmm1, [A2]
        pmovmskb T0, xmm1
        mov     [A1], T0
%ifdef RT_ARCH_X86
        mov     dword [A1 + 4], 0       ; T0 only covers the low dword on 32-bit hosts.
%endif
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_pmovmskb_u128
3009

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 47681

Download in other formats: