VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/target-x86/IEMAllN8veEmit-x86.h@106192

Last change on this file was in revision 106192, checked in by vboxsync, 4 months ago:

VMM/IEM: Added some basic stats & debug info for postponed EFLAGS calcs. Moved debug info structures from IEMInternal.h and into IEMN8veRecompiler.h. bugref:10720

1/* $Id: IEMAllN8veEmit-x86.h 106192 2024-10-01 12:57:32Z vboxsync $ */
2/** @file
3 * IEM - Native Recompiler, x86 Target - Code Emitters.
4 */
5
6/*
7 * Copyright (C) 2023-2024 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28#ifndef VMM_INCLUDED_SRC_VMMAll_target_x86_IEMAllN8veEmit_x86_h
29#define VMM_INCLUDED_SRC_VMMAll_target_x86_IEMAllN8veEmit_x86_h
30#ifndef RT_WITHOUT_PRAGMA_ONCE
31# pragma once
32#endif
33
34
35#ifdef RT_ARCH_AMD64
36
37/**
38 * Emits a ModR/M instruction with one opcode byte and only register operands.
39 */
40DECL_FORCE_INLINE(uint32_t)
41iemNativeEmitAmd64OneByteModRmInstrRREx(PIEMNATIVEINSTR pCodeBuf, uint32_t off, uint8_t bOpcode8, uint8_t bOpcodeOther,
42 uint8_t cOpBits, uint8_t idxRegReg, uint8_t idxRegRm)
43{
44 Assert(idxRegReg < 16); Assert(idxRegRm < 16);
45 switch (cOpBits)
46 {
47 case 16:
48 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
49 RT_FALL_THRU();
50 case 32:
51 if (idxRegReg >= 8 || idxRegRm >= 8)
52 pCodeBuf[off++] = (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
53 pCodeBuf[off++] = bOpcodeOther;
54 break;
55
56 default: AssertFailed(); RT_FALL_THRU();
57 case 64:
58 pCodeBuf[off++] = X86_OP_REX_W | (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
59 pCodeBuf[off++] = bOpcodeOther;
60 break;
61
62 case 8:
63 if (idxRegReg >= 8 || idxRegRm >= 8)
64 pCodeBuf[off++] = (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
65 else if (idxRegReg >= 4 || idxRegRm >= 4)
66 pCodeBuf[off++] = X86_OP_REX;
67 pCodeBuf[off++] = bOpcode8;
68 break;
69 }
70 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg & 7, idxRegRm & 7);
71 return off;
72}
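/* Example (illustrative): a call like
       off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0x22, 0x23,
                                                     32, 1 /*ECX*/, 2 /*EDX*/);
   emits the bytes 23 CA, i.e. 'and ecx, edx'.  With cOpBits=64 and idxRegReg=8
   (r8) a REX prefix comes first (0x4C = REX.W|REX.R), giving 4C 23 C2 for
   'and r8, rdx'. */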
73
74
75/**
76 * Emits a ModR/M instruction with two opcode bytes and only register operands.
77 */
78DECL_FORCE_INLINE(uint32_t)
79iemNativeEmitAmd64TwoByteModRmInstrRREx(PIEMNATIVEINSTR pCodeBuf, uint32_t off,
80 uint8_t bOpcode0, uint8_t bOpcode8, uint8_t bOpcodeOther,
81 uint8_t cOpBits, uint8_t idxRegReg, uint8_t idxRegRm)
82{
83 Assert(idxRegReg < 16); Assert(idxRegRm < 16);
84 switch (cOpBits)
85 {
86 case 16:
87 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
88 RT_FALL_THRU();
89 case 32:
90 if (idxRegReg >= 8 || idxRegRm >= 8)
91 pCodeBuf[off++] = (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
92 pCodeBuf[off++] = bOpcode0;
93 pCodeBuf[off++] = bOpcodeOther;
94 break;
95
96 default: AssertFailed(); RT_FALL_THRU();
97 case 64:
98 pCodeBuf[off++] = X86_OP_REX_W | (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
99 pCodeBuf[off++] = bOpcode0;
100 pCodeBuf[off++] = bOpcodeOther;
101 break;
102
103 case 8:
104 if (idxRegReg >= 8 || idxRegRm >= 8)
105 pCodeBuf[off++] = (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
106 else if (idxRegReg >= 4 || idxRegRm >= 4)
107 pCodeBuf[off++] = X86_OP_REX;
108 pCodeBuf[off++] = bOpcode0;
109 pCodeBuf[off++] = bOpcode8;
110 break;
111 }
112 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg & 7, idxRegRm & 7);
113 return off;
114}
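/* Example (illustrative): the ADC emitter below invokes this with
   bOpcode0=0x0f, bOpcodeOther=0xba and idxRegReg=4, so for cOpBits=32 and
   idxRegRm=1 (ECX) the bytes 0F BA E1 are produced, which together with the
   immediate byte appended by the caller form 'bt ecx, imm8'. */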
115
116
117/**
118 * Emits one of three opcodes with an immediate.
119 *
120 * These are expected to be of the /idxRegReg form.
121 */
122DECL_FORCE_INLINE(uint32_t)
123iemNativeEmitAmd64OneByteModRmInstrRIEx(PIEMNATIVEINSTR pCodeBuf, uint32_t off, uint8_t bOpcode8, uint8_t bOpcodeOtherImm8,
124 uint8_t bOpcodeOther, uint8_t cOpBits, uint8_t cImmBits, uint8_t idxRegReg,
125 uint8_t idxRegRm, uint64_t uImmOp)
126{
127 Assert(idxRegReg < 8); Assert(idxRegRm < 16);
128 if ( cImmBits == 8
129 || (uImmOp <= (uint64_t)0x7f && bOpcodeOtherImm8 != 0xcc))
130 {
131 switch (cOpBits)
132 {
133 case 16:
134 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
135 RT_FALL_THRU();
136 case 32:
137 if (idxRegRm >= 8)
138 pCodeBuf[off++] = X86_OP_REX_B;
139 pCodeBuf[off++] = bOpcodeOtherImm8; Assert(bOpcodeOtherImm8 != 0xcc);
140 break;
141
142 default: AssertFailed(); RT_FALL_THRU();
143 case 64:
144 pCodeBuf[off++] = X86_OP_REX_W | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
145 pCodeBuf[off++] = bOpcodeOtherImm8; Assert(bOpcodeOtherImm8 != 0xcc);
146 break;
147
148 case 8:
149 if (idxRegRm >= 8)
150 pCodeBuf[off++] = X86_OP_REX_B;
151 else if (idxRegRm >= 4)
152 pCodeBuf[off++] = X86_OP_REX;
153 pCodeBuf[off++] = bOpcode8; Assert(bOpcode8 != 0xcc);
154 break;
155 }
156 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg, idxRegRm & 7);
157 pCodeBuf[off++] = (uint8_t)uImmOp;
158 }
159 else
160 {
161 switch (cOpBits)
162 {
163 case 32:
164 if (idxRegRm >= 8)
165 pCodeBuf[off++] = X86_OP_REX_B;
166 break;
167
168 default: AssertFailed(); RT_FALL_THRU();
169 case 64:
170 pCodeBuf[off++] = X86_OP_REX_W | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
171 break;
172
173 case 16:
174 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
175 if (idxRegRm >= 8)
176 pCodeBuf[off++] = X86_OP_REX_B;
177 pCodeBuf[off++] = bOpcodeOther;
178 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg, idxRegRm & 7);
179 pCodeBuf[off++] = RT_BYTE1(uImmOp);
180 pCodeBuf[off++] = RT_BYTE2(uImmOp);
181 Assert(cImmBits == 16);
182 return off;
183 }
184 pCodeBuf[off++] = bOpcodeOther;
185 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg, idxRegRm & 7);
186 pCodeBuf[off++] = RT_BYTE1(uImmOp);
187 pCodeBuf[off++] = RT_BYTE2(uImmOp);
188 pCodeBuf[off++] = RT_BYTE3(uImmOp);
189 pCodeBuf[off++] = RT_BYTE4(uImmOp);
190 Assert(cImmBits == 32);
191 }
192 return off;
193}
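/* Example (illustrative): the AND-with-immediate emitter below passes
   bOpcode8=0x80, bOpcodeOtherImm8=0x83, bOpcodeOther=0x81 and idxRegReg=4.
   For cOpBits=32, cImmBits=32, idxRegRm=1 (ECX) and uImmOp=0x0f the short form
   83 E1 0F ('and ecx, 0x0f') is selected, while uImmOp=0x12345 takes the long
   form 81 E1 45 23 01 00 ('and ecx, 0x12345'). */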
194
195#endif /* RT_ARCH_AMD64 */
196
197
198
199/*********************************************************************************************************************************
200* EFLAGS *
201*********************************************************************************************************************************/
202
203#ifdef IEMNATIVE_WITH_EFLAGS_POSTPONING
204
205/** @def IEMNATIVE_POSTPONING_REG_MASK
206 * Register suitable for keeping the inputs or result for a postponed EFLAGS
207 * calculation.
208 *
209 * We use non-volatile registers here so we don't have to save & restore them
210 * across callouts (i.e. TLB loads).
211 *
212 * @note On x86 we cannot use RDI and RSI because these are used by the
213 * opcode checking code. The usual joy of the x86 instruction set.
214 */
215# ifdef RT_ARCH_AMD64
216# define IEMNATIVE_POSTPONING_REG_MASK \
217 (IEMNATIVE_CALL_NONVOLATILE_GREG_MASK & ~(RT_BIT_32(X86_GREG_xDI) | RT_BIT_32(X86_GREG_xSI)))
218# else
219# define IEMNATIVE_POSTPONING_REG_MASK IEMNATIVE_CALL_NONVOLATILE_GREG_MASK
220# endif
221
222/**
223 * This is normally invoked via IEMNATIVE_CLEAR_POSTPONED_EFLAGS().
224 */
225template<uint32_t const a_fEflClobbered>
226DECL_FORCE_INLINE(void) iemNativeClearPostponedEFlags(PIEMRECOMPILERSTATE pReNative)
227{
228 AssertCompile(!(a_fEflClobbered & ~X86_EFL_STATUS_BITS));
229 uint32_t fEFlags = pReNative->PostponedEfl.fEFlags;
230 if (fEFlags)
231 {
232 if RT_CONSTEXPR_IF(a_fEflClobbered != X86_EFL_STATUS_BITS)
233 {
234 fEFlags &= ~a_fEflClobbered;
235 if (!fEFlags)
236 { /* likely */ }
237 else
238 {
239 Log5(("iemNativeClearPostponedEFlags: Clobbering %#x: %#x -> %#x (op=%d bits=%u)\n", a_fEflClobbered,
240 pReNative->PostponedEfl.fEFlags, fEFlags, pReNative->PostponedEfl.enmOp, pReNative->PostponedEfl.cOpBits));
241 pReNative->PostponedEfl.fEFlags = fEFlags;
242 return;
243 }
244 }
245
246 /* Do cleanup. */
247 Log5(("iemNativeClearPostponedEFlags: Cleanup of op=%u bits=%u efl=%#x upon clobbering %#x\n",
248 pReNative->PostponedEfl.enmOp, pReNative->PostponedEfl.cOpBits, pReNative->PostponedEfl.fEFlags, a_fEflClobbered));
249 pReNative->PostponedEfl.fEFlags = 0;
250 pReNative->PostponedEfl.enmOp = kIemNativePostponedEflOp_Invalid;
251 pReNative->PostponedEfl.cOpBits = 0;
252 iemNativeRegFreeTmp(pReNative, pReNative->PostponedEfl.idxReg1);
253 if (pReNative->PostponedEfl.idxReg2 != UINT8_MAX)
254 iemNativeRegFreeTmp(pReNative, pReNative->PostponedEfl.idxReg2);
255 pReNative->PostponedEfl.idxReg1 = UINT8_MAX;
256 pReNative->PostponedEfl.idxReg2 = UINT8_MAX;
257#if defined(VBOX_WITH_STATISTICS) || defined(IEMNATIVE_WITH_TB_DEBUG_INFO)
258 STAM_PROFILE_ADD_PERIOD(&pReNative->pVCpu->iem.s.StatNativeEflPostponedEmits, pReNative->PostponedEfl.cEmits);
259 pReNative->PostponedEfl.cEmits = 0;
260#endif
261 }
262}
263
264DECL_INLINE_THROW(uint32_t) iemNativeEmitPostponedEFlagsCalcLogical(PIEMNATIVEINSTR pCodeBuf, uint32_t off, uint8_t cOpBits,
265 uint8_t idxRegResult, uint8_t idxRegEfl, uint8_t idxRegTmp)
266{
267#ifdef RT_ARCH_AMD64
268 /*
269 * Do a TEST to set the flags, then collect them and merge them into EFLAGS.
270 */
271 /* Do TEST idxRegResult, idxRegResult to set flags. */
272 off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0x84, 0x85, cOpBits, idxRegResult, idxRegResult);
273
274 if (idxRegTmp == X86_GREG_xAX)
275 {
276 /* lahf ; AH = EFLAGS */
277 pCodeBuf[off++] = 0x9f;
278 if (idxRegEfl <= X86_GREG_xBX)
279 {
280 /* mov [CDB]L, AH */
281 pCodeBuf[off++] = 0x88;
282 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, idxRegEfl);
283 }
284 else
285 {
286 /* mov AL, AH */
287 pCodeBuf[off++] = 0x88;
288 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, 0 /*AL*/);
289 /* mov xxL, AL */
290 pCodeBuf[off++] = idxRegEfl >= 8 ? X86_OP_REX_B : X86_OP_REX;
291 pCodeBuf[off++] = 0x88;
292 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 0 /*AL*/, idxRegEfl & 7);
293 }
294 }
295 else if (idxRegEfl != X86_GREG_xAX)
296 {
297 /* pushf */
298 pCodeBuf[off++] = 0x9c;
299 /* pop tmp */
300 if (idxRegTmp >= 8)
301 pCodeBuf[off++] = X86_OP_REX_B;
302 pCodeBuf[off++] = 0x58 + (idxRegTmp & 7);
303 /* mov byte(efl), byte(tmp) */
304 pCodeBuf[off++] = (idxRegEfl >= 8 ? X86_OP_REX_B : X86_OP_REX)
305 | (idxRegTmp >= 8 ? X86_OP_REX_R : 0);
306 pCodeBuf[off++] = 0x88;
307 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegTmp & 7, idxRegEfl & 7);
308 }
309 else
310 {
311 /* xchg al, ah */
312 pCodeBuf[off++] = 0x86;
313 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, 0 /*AL*/);
314 /* lahf ; AH = EFLAGS */
315 pCodeBuf[off++] = 0x9f;
316 /* xchg al, ah */
317 pCodeBuf[off++] = 0x86;
318 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, 0 /*AL*/);
319 }
320 /* BTR idxEfl, 11; Clear OF */
321 if (idxRegEfl >= 8)
322 pCodeBuf[off++] = X86_OP_REX_B;
323 pCodeBuf[off++] = 0xf;
324 pCodeBuf[off++] = 0xba;
325 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 6, idxRegEfl & 7);
326 pCodeBuf[off++] = X86_EFL_OF_BIT;
327
328#elif defined(RT_ARCH_ARM64)
329 /*
330 * Calculate flags.
331 */
332 /* Clear the status bits. ~0x8D5 (or ~0x8FD) can't be AND immediate, so use idxRegTmp for constant. */
333 off = iemNativeEmitLoadGpr32ImmExT<~X86_EFL_STATUS_BITS>(pCodeBuf, off, idxRegTmp);
334 off = iemNativeEmitAndGpr32ByGpr32Ex(pCodeBuf, off, idxRegEfl, idxRegTmp);
335
336 /* N,Z -> SF,ZF */
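/* (After the MRS below the host N and Z flags sit in bits 31 and 30 of
   idxRegTmp; the LSR #30 moves them down to bits 1 and 0, and the 2-bit BFI
   then lands Z in EFLAGS.ZF and N in EFLAGS.SF, relying on SF being the bit
   directly above ZF.) */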
337 if (cOpBits < 32)
338 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegResult, cOpBits > 8); /* sets NZ */
339 else
340 pCodeBuf[off++] = Armv8A64MkInstrAnds(ARMV8_A64_REG_XZR, idxRegResult, idxRegResult, cOpBits > 32 /*f64Bit*/);
341 pCodeBuf[off++] = Armv8A64MkInstrMrs(idxRegTmp, ARMV8_AARCH64_SYSREG_NZCV); /* Bits: 31=N; 30=Z; 29=C; 28=V; */
342 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegTmp, 30);
343 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_ZF_BIT, 2, false /*f64Bit*/);
344 AssertCompile(X86_EFL_ZF_BIT + 1 == X86_EFL_SF_BIT);
345
346 /* Calculate 8-bit parity of the result. */
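/* (The three shifted EORs fold the low 8 bits of the result down into bit 0,
   which then reads 1 for an odd number of set bits; the EOR with the immediate
   mask 1 inverts that, since X86_EFL_PF is set for even parity, before the BFI
   copies it into the PF position.) */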
347 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegResult, idxRegResult, false /*f64Bit*/,
348 4 /*offShift6*/, kArmv8A64InstrShift_Lsr);
349 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegTmp, idxRegTmp, false /*f64Bit*/,
350 2 /*offShift6*/, kArmv8A64InstrShift_Lsr);
351 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegTmp, idxRegTmp, false /*f64Bit*/,
352 1 /*offShift6*/, kArmv8A64InstrShift_Lsr);
353 Assert(Armv8A64ConvertImmRImmS2Mask32(0, 0) == 1);
354 pCodeBuf[off++] = Armv8A64MkInstrEorImm(idxRegTmp, idxRegTmp, 0, 0, false /*f64Bit*/);
355 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_PF_BIT, 1, false /*f64Bit*/);
356
357#else
358# error "port me"
359#endif
360 return off;
361}
362
363
364template<uint32_t const a_bmInputRegs, bool const a_fTlbMiss = false>
365static uint32_t iemNativeDoPostponedEFlagsInternal(PIEMRECOMPILERSTATE pReNative, uint32_t off, PIEMNATIVEINSTR pCodeBuf,
366 uint32_t bmExtraTlbMissRegs = 0)
367{
368#ifdef IEMNATIVE_WITH_TB_DEBUG_INFO
369 iemNativeDbgInfoAddPostponedEFlagsCalc(pReNative, off, pReNative->PostponedEfl.enmOp, pReNative->PostponedEfl.cOpBits,
370 pReNative->PostponedEfl.cEmits);
371#endif
372
373 /*
374 * In the TB exit code path we cannot do regular register allocation. Nor
375 * can we when we're in the TLB miss code, unless we're skipping the TLB
376 lookup. Since the latter isn't an important use case and should get along
377 * fine on just volatile registers, we do not need to do anything special
378 * for it.
379 *
380 * So, we do our own register allocation here. Any register may be used in the TB
381 * exit path, excluding a_bmInputRegs, fixed and postponed related registers.
382 * In the TLB miss we can use any volatile register and temporary registers
383 * allocated in the TLB state.
384 *
385 * Note! On x86 we prefer using RAX as the first TMP register, so we can
386 * make use of LAHF which is typically faster than PUSHF/POP. This
387 is why the idxRegTmp allocation comes first when there is no EFLAGS
388 * shadow, since RAX is represented by bit 0 in the mask.
389 */
390 uint32_t bmAvailableRegs;
391 if RT_CONSTEXPR_IF(!a_fTlbMiss)
392 {
393 bmAvailableRegs = ~(a_bmInputRegs | IEMNATIVE_REG_FIXED_MASK) & IEMNATIVE_HST_GREG_MASK;
394 if (pReNative->PostponedEfl.idxReg2 != UINT8_MAX)
395 bmAvailableRegs &= ~(RT_BIT_32(pReNative->PostponedEfl.idxReg1) | RT_BIT_32(pReNative->PostponedEfl.idxReg2));
396 else
397 bmAvailableRegs &= ~RT_BIT_32(pReNative->PostponedEfl.idxReg1);
398 }
399 else
400 {
401 /* Note! a_bmInputRegs takes precedence over bmExtraTlbMissRegs. */
402 bmAvailableRegs = (IEMNATIVE_CALL_VOLATILE_GREG_MASK | bmExtraTlbMissRegs)
403 & ~(a_bmInputRegs | IEMNATIVE_REG_FIXED_MASK)
404 & IEMNATIVE_HST_GREG_MASK;
405 }
406
407 /* Use existing EFLAGS shadow if available. For the TLB-miss code path we
408 need to weed out volatile registers here, as they will no longer be valid. */
409 uint8_t idxRegTmp;
410 uint8_t idxRegEfl = pReNative->Core.aidxGstRegShadows[kIemNativeGstReg_EFlags];
411 if ( (pReNative->Core.bmGstRegShadows & RT_BIT_64(kIemNativeGstReg_EFlags))
412 && (!a_fTlbMiss || !(RT_BIT_32(idxRegEfl) & IEMNATIVE_CALL_VOLATILE_GREG_MASK)))
413 {
414 Assert(idxRegEfl < IEMNATIVE_HST_GREG_COUNT);
415 Assert(!(a_bmInputRegs & RT_BIT_32(idxRegEfl)));
416 if RT_CONSTEXPR_IF(!a_fTlbMiss) Assert(bmAvailableRegs & RT_BIT_32(idxRegEfl));
417 bmAvailableRegs &= ~RT_BIT_32(idxRegEfl);
418#ifdef VBOX_STRICT
419 off = iemNativeEmitGuestRegValueCheckEx(pReNative, pCodeBuf, off, idxRegEfl, kIemNativeGstReg_EFlags);
420#endif
421
422 idxRegTmp = ASMBitFirstSetU32(bmAvailableRegs) - 1;
423 bmAvailableRegs &= ~RT_BIT_32(idxRegTmp);
424 }
425 else
426 {
427 idxRegTmp = ASMBitFirstSetU32(bmAvailableRegs) - 1; /* allocate the temp register first to prioritize EAX on x86. */
428 bmAvailableRegs &= ~RT_BIT_32(idxRegTmp);
429
430 idxRegEfl = ASMBitFirstSetU32(bmAvailableRegs) - 1;
431 bmAvailableRegs &= ~RT_BIT_32(idxRegEfl);
432 off = iemNativeEmitLoadGprFromVCpuU32Ex(pCodeBuf, off, idxRegEfl, RT_UOFFSETOF(VMCPU, cpum.GstCtx.eflags));
433 }
434 Assert(bmAvailableRegs != 0);
435
436 /*
437 * Do the actual EFLAGS calculation.
438 */
439 switch (pReNative->PostponedEfl.enmOp)
440 {
441 case kIemNativePostponedEflOp_Logical:
442 Assert(pReNative->PostponedEfl.idxReg2 == UINT8_MAX);
443 off = iemNativeEmitPostponedEFlagsCalcLogical(pCodeBuf, off, pReNative->PostponedEfl.cOpBits,
444 pReNative->PostponedEfl.idxReg1, idxRegEfl, idxRegTmp);
445 break;
446
447 default:
448 AssertFailedBreak();
449 }
450
451 /*
452 * Store EFLAGS.
453 */
454#ifdef VBOX_STRICT
455 /* check that X86_EFL_1 is set. */
456 uint32_t offFixup1;
457 off = iemNativeEmitTestBitInGprAndJmpToFixedIfSetEx(pCodeBuf, off, idxRegEfl, X86_EFL_1_BIT, off, &offFixup1);
458 off = iemNativeEmitBrkEx(pCodeBuf, off, 0x3330);
459 iemNativeFixupFixedJump(pReNative, offFixup1, off);
460 /* Check that X86_EFL_RAZ_LO_MASK is zero. */
461 off = iemNativeEmitTestAnyBitsInGpr32Ex(pCodeBuf, off, idxRegEfl, X86_EFL_RAZ_LO_MASK);
462 uint32_t const offFixup2 = off;
463 off = iemNativeEmitJccToFixedEx(pCodeBuf, off, off, kIemNativeInstrCond_e);
464 off = iemNativeEmitBrkEx(pCodeBuf, off, 0x3331);
465 iemNativeFixupFixedJump(pReNative, offFixup2, off);
466#endif
467 off = iemNativeEmitStoreGprToVCpuU32Ex(pCodeBuf, off, idxRegEfl, RT_UOFFSETOF(VMCPU, cpum.GstCtx.eflags));
468 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
469
470#if defined(VBOX_WITH_STATISTICS) || defined(IEMNATIVE_WITH_TB_DEBUG_INFO)
471 pReNative->PostponedEfl.cEmits++;
472#endif
473 return off;
474}
475
476
477
478template<uint32_t const a_bmInputRegs>
479DECL_FORCE_INLINE_THROW(uint32_t)
480iemNativeDoPostponedEFlagsAtTbExit(PIEMRECOMPILERSTATE pReNative, uint32_t off)
481{
482 if (pReNative->PostponedEfl.fEFlags)
483 {
484 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, IEMNATIVE_MAX_POSTPONED_EFLAGS_INSTRUCTIONS);
485 return iemNativeDoPostponedEFlagsInternal<a_bmInputRegs>(pReNative, off, pCodeBuf);
486 }
487 return off;
488}
489
490
491template<uint32_t const a_bmInputRegs>
492DECL_FORCE_INLINE_THROW(uint32_t)
493iemNativeDoPostponedEFlagsAtTbExitEx(PIEMRECOMPILERSTATE pReNative, uint32_t off, PIEMNATIVEINSTR pCodeBuf)
494{
495 if (pReNative->PostponedEfl.fEFlags)
496 return iemNativeDoPostponedEFlagsInternal<a_bmInputRegs>(pReNative, off, pCodeBuf);
497 return off;
498}
499
500
501template<uint32_t const a_bmInputRegs>
502DECL_FORCE_INLINE_THROW(uint32_t)
503iemNativeDoPostponedEFlagsAtTlbMiss(PIEMRECOMPILERSTATE pReNative, uint32_t off, const IEMNATIVEEMITTLBSTATE *pTlbState,
504 uint32_t bmTmpRegs)
505{
506 if (pReNative->PostponedEfl.fEFlags)
507 {
508 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, IEMNATIVE_MAX_POSTPONED_EFLAGS_INSTRUCTIONS);
509 return iemNativeDoPostponedEFlagsInternal<a_bmInputRegs, true>(pReNative, off, pCodeBuf,
510 pTlbState->getRegsNotToSave() | bmTmpRegs);
511 }
512 return off;
513}
514
515
516#endif /* IEMNATIVE_WITH_EFLAGS_POSTPONING */
517
518
519/**
520 * This is an implementation of IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL.
521 *
522 * It takes liveness stuff into account.
523 */
524DECL_INLINE_THROW(uint32_t)
525iemNativeEmitEFlagsForLogical(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarEfl,
526 uint8_t cOpBits, uint8_t idxRegResult
527#ifndef RT_ARCH_AMD64
528 , bool fNativeFlags = false
529#endif
530 )
531{
532 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflTotalLogical);
533 IEMNATIVE_CLEAR_POSTPONED_EFLAGS(pReNative, X86_EFL_STATUS_BITS);
534 RT_NOREF(cOpBits, idxRegResult);
535
536#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
537 /*
538 * See if we can skip this wholesale.
539 */
540 PCIEMLIVENESSENTRY const pLivenessEntry = &pReNative->paLivenessEntries[pReNative->idxCurCall];
541 uint64_t const fEflClobbered = IEMLIVENESS_STATE_GET_WILL_BE_CLOBBERED_SET(pLivenessEntry)
542 & IEMLIVENESSBIT_STATUS_EFL_MASK;
543# ifdef IEMNATIVE_WITH_EFLAGS_POSTPONING
544 uint64_t fEflPostponing;
545# endif
546 if ( fEflClobbered == IEMLIVENESSBIT_STATUS_EFL_MASK
547 && !(pReNative->fMc & IEM_MC_F_WITH_FLAGS))
548 {
549 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflSkippedLogical);
550 pReNative->fSkippingEFlags |= X86_EFL_STATUS_BITS;
551# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
552 off = iemNativeEmitOrImmIntoVCpuU32(pReNative, off, X86_EFL_STATUS_BITS, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
553# endif
554 Log5(("iemNativeEmitEFlagsForLogical: Skipping %#x\n", X86_EFL_STATUS_BITS));
555 return off;
556 }
557# ifdef IEMNATIVE_WITH_EFLAGS_POSTPONING
558 if ( ( (fEflPostponing = IEMLIVENESS_STATE_GET_CAN_BE_POSTPONED_SET(pLivenessEntry) & IEMLIVENESSBIT_STATUS_EFL_MASK)
559 | fEflClobbered)
560 == IEMLIVENESSBIT_STATUS_EFL_MASK
561 && idxRegResult != UINT8_MAX)
562 {
563 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflPostponedLogical);
564 pReNative->PostponedEfl.fEFlags = X86_EFL_STATUS_BITS;
565 pReNative->PostponedEfl.enmOp = kIemNativePostponedEflOp_Logical;
566 pReNative->PostponedEfl.cOpBits = cOpBits;
567 pReNative->PostponedEfl.idxReg1 = iemNativeRegAllocTmpEx(pReNative, &off, IEMNATIVE_POSTPONING_REG_MASK, false);
568 /** @todo it would normally be possible to use idxRegResult, iff it is
569 * already a non-volatile register and we can be sure the caller
570 * doesn't modify it. That'll save a register move and allocation. */
571 off = iemNativeEmitLoadGprFromGpr(pReNative, off, pReNative->PostponedEfl.idxReg1, idxRegResult);
572 Log5(("iemNativeEmitEFlagsForLogical: Postponing %#x op=%u bits=%u reg1=%u\n", X86_EFL_STATUS_BITS,
573 kIemNativePostponedEflOp_Logical, cOpBits, pReNative->PostponedEfl.idxReg1));
574 }
575# endif
576 else
577#endif
578 {
579#ifdef RT_ARCH_AMD64
580 /*
581 * Collect flags and merge them with eflags.
582 */
583 /** @todo we could alternatively use LAHF here when host rax is free, since
584 * OF is cleared. */
585 PIEMNATIVEINSTR pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
586 /* pushf - do this before any reg allocations as they may emit instructions too. */
587 pCodeBuf[off++] = 0x9c;
588
589 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
590 uint8_t const idxTmpReg = iemNativeRegAllocTmp(pReNative, &off);
591 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2 + 7 + 7 + 3);
592 /* pop tmp */
593 if (idxTmpReg >= 8)
594 pCodeBuf[off++] = X86_OP_REX_B;
595 pCodeBuf[off++] = 0x58 + (idxTmpReg & 7);
596 /* and tmp, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF */
597 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxTmpReg, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF);
598 /* Clear the status bits in EFLs. */
599 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegEfl, ~X86_EFL_STATUS_BITS);
600 /* OR in the flags we collected. */
601 off = iemNativeEmitOrGpr32ByGprEx(pCodeBuf, off, idxRegEfl, idxTmpReg);
602 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
603 iemNativeRegFreeTmp(pReNative, idxTmpReg);
604
605#elif defined(RT_ARCH_ARM64)
606 /*
607 * Calculate flags.
608 */
609 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
610 uint8_t const idxTmpReg = iemNativeRegAllocTmp(pReNative, &off);
611 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 15);
612
613 /* Clear the status bits. ~0x8D5 (or ~0x8FD) can't be AND immediate, so use idxTmpReg for constant. */
614 off = iemNativeEmitLoadGpr32ImmEx(pCodeBuf, off, idxTmpReg, ~X86_EFL_STATUS_BITS);
615 off = iemNativeEmitAndGpr32ByGpr32Ex(pCodeBuf, off, idxRegEfl, idxTmpReg);
616
617 /* N,Z -> SF,ZF */
618 if (cOpBits < 32)
619 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegResult, cOpBits > 8); /* sets NZ */
620 else if (!fNativeFlags)
621 pCodeBuf[off++] = Armv8A64MkInstrAnds(ARMV8_A64_REG_XZR, idxRegResult, idxRegResult, cOpBits > 32 /*f64Bit*/);
622 pCodeBuf[off++] = Armv8A64MkInstrMrs(idxTmpReg, ARMV8_AARCH64_SYSREG_NZCV); /* Bits: 31=N; 30=Z; 29=C; 28=V; */
623 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, 30);
624 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_ZF_BIT, 2, false /*f64Bit*/);
625 AssertCompile(X86_EFL_ZF_BIT + 1 == X86_EFL_SF_BIT);
626
627 /* Calculate 8-bit parity of the result. */
628 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxRegResult, idxRegResult, false /*f64Bit*/,
629 4 /*offShift6*/, kArmv8A64InstrShift_Lsr);
630 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxTmpReg, idxTmpReg, false /*f64Bit*/,
631 2 /*offShift6*/, kArmv8A64InstrShift_Lsr);
632 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxTmpReg, idxTmpReg, false /*f64Bit*/,
633 1 /*offShift6*/, kArmv8A64InstrShift_Lsr);
634 Assert(Armv8A64ConvertImmRImmS2Mask32(0, 0) == 1);
635 pCodeBuf[off++] = Armv8A64MkInstrEorImm(idxTmpReg, idxTmpReg, 0, 0, false /*f64Bit*/);
636 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_PF_BIT, 1, false /*f64Bit*/);
637
638 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
639 iemNativeRegFreeTmp(pReNative, idxTmpReg);
640#else
641# error "port me"
642#endif
643 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
644 }
645
646#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
647 pReNative->fSkippingEFlags = 0;
648# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
649 off = iemNativeEmitStoreImmToVCpuU32(pReNative, off, 0, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
650# endif
651#endif
652 return off;
653}
654
655
656/**
657 * This is an implementation of IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC.
658 *
659 * It takes liveness stuff into account.
660 */
661DECL_FORCE_INLINE_THROW(uint32_t)
662iemNativeEmitEFlagsForArithmetic(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarEfl, uint8_t idxRegEflIn
663#ifndef RT_ARCH_AMD64
664 , uint8_t cOpBits, uint8_t idxRegResult, uint8_t idxRegDstIn, uint8_t idxRegSrc
665 , bool fInvertCarry, uint64_t uImmSrc
666#endif
667 )
668{
669 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflTotalArithmetic);
670 IEMNATIVE_CLEAR_POSTPONED_EFLAGS(pReNative, X86_EFL_STATUS_BITS);
671
672#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
673 /*
674 * See if we can skip this wholesale.
675 */
676 PCIEMLIVENESSENTRY const pLivenessEntry = &pReNative->paLivenessEntries[pReNative->idxCurCall];
677 if ( IEMLIVENESS_STATE_ARE_STATUS_EFL_TO_BE_CLOBBERED(pLivenessEntry)
678 && !(pReNative->fMc & IEM_MC_F_WITH_FLAGS))
679 {
680 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflSkippedArithmetic);
681 pReNative->fSkippingEFlags |= X86_EFL_STATUS_BITS;
682# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
683 off = iemNativeEmitOrImmIntoVCpuU32(pReNative, off, X86_EFL_STATUS_BITS, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
684# endif
685 }
686 else
687#endif
688 {
689#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
690 uint32_t fSkipped = 0;
691#endif
692#ifdef RT_ARCH_AMD64
693 /*
694 * Collect flags and merge them with eflags.
695 */
696 PIEMNATIVEINSTR pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
697 /* pushf - do this before any reg allocations as they may emit instructions too. */
698 pCodeBuf[off++] = 0x9c;
699
700 uint8_t const idxRegEfl = idxRegEflIn != UINT8_MAX ? idxRegEflIn
701 : iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
702 uint8_t const idxTmpReg = iemNativeRegAllocTmp(pReNative, &off);
703 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2 + 7 + 7 + 3);
704 /* pop tmp */
705 if (idxTmpReg >= 8)
706 pCodeBuf[off++] = X86_OP_REX_B;
707 pCodeBuf[off++] = 0x58 + (idxTmpReg & 7);
708 /* Isolate the flags we want. */
709 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxTmpReg, X86_EFL_STATUS_BITS);
710 /* Clear the status bits in EFLs. */
711 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegEfl, ~X86_EFL_STATUS_BITS);
712 /* OR in the flags we collected. */
713 off = iemNativeEmitOrGpr32ByGprEx(pCodeBuf, off, idxRegEfl, idxTmpReg);
714 if (idxRegEflIn != idxRegEfl)
715 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
716 iemNativeRegFreeTmp(pReNative, idxTmpReg);
717
718#elif defined(RT_ARCH_ARM64)
719 /*
720 * Calculate flags.
721 */
722 uint8_t const idxRegEfl = idxRegEflIn != UINT8_MAX ? idxRegEflIn
723 : iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
724 uint8_t const idxTmpReg = iemNativeRegAllocTmp(pReNative, &off);
725 uint8_t const idxTmpReg2 = cOpBits >= 32 ? UINT8_MAX : iemNativeRegAllocTmp(pReNative, &off);
726 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 20);
727
728 /* Invert CF (stored inverted on ARM) and load the flags into the temporary register. */
729 if (fInvertCarry)
730 pCodeBuf[off++] = ARMV8_A64_INSTR_CFINV;
731 pCodeBuf[off++] = Armv8A64MkInstrMrs(idxTmpReg, ARMV8_AARCH64_SYSREG_NZCV); /* Bits: 31=N; 30=Z; 29=C; 28=V; */
732
733 if (cOpBits >= 32)
734 {
735 /* V -> OF */
736 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, 28);
737 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_OF_BIT, 1, false /*f64Bit*/);
738
739 /* C -> CF */
740 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, 1);
741 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_CF_BIT, 1, false /*f64Bit*/);
742 }
743
744 /* N,Z -> SF,ZF */
745 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, cOpBits >= 32 ? 1 : 30);
746 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_ZF_BIT, 2, false /*f64Bit*/);
747
748 /* For ADC and SBB we have to calculate overflow and carry ourselves. */
749 if (cOpBits < 32)
750 {
751 /* Since the carry flag is the zeroth flag, we just use BFXIL to copy it over. */
752 AssertCompile(X86_EFL_CF_BIT == 0);
753 pCodeBuf[off++] = Armv8A64MkInstrBfxil(idxRegEfl, idxRegResult, cOpBits, 1, false /*f64Bit*/);
754
755 /* The overflow flag is more work as we have to compare the signed bits for
756 both inputs and the result. See IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC.
757
758 Formula: ~(a_uDst ^ a_uSrcOf) & (a_uResult ^ a_uDst)
759 With a_uSrcOf as a_uSrc for additions and ~a_uSrc for subtractions.
760
761 It is a bit simpler when the right (source) side is constant:
762 adc: S D R -> OF sbb: S D R -> OF
763 0 0 0 -> 0 \ 0 0 0 -> 0 \
764 0 0 1 -> 1 \ 0 0 1 -> 0 \
765 0 1 0 -> 0 / and not(D), R 0 1 0 -> 1 / and D, not(R)
766 0 1 1 -> 0 / 0 1 1 -> 0 /
767 1 0 0 -> 0 \ 1 0 0 -> 0 \
768 1 0 1 -> 0 \ and D, not(R) 1 0 1 -> 1 \ and not(D), R
769 1 1 0 -> 1 / 1 1 0 -> 0 /
770 1 1 1 -> 0 / 1 1 1 -> 0 / */
771 if (idxRegSrc != UINT8_MAX)
772 {
773 if (fInvertCarry) /* sbb: ~((a_uDst) ^ ~(a_uSrcOf)) -> (a_uDst) ^ (a_uSrcOf); HACK ALERT: fInvertCarry == sbb */
774 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxRegDstIn, idxRegSrc, false);
775 else /* adc: ~((a_uDst) ^ (a_uSrcOf)) -> (a_uDst) ^ ~(a_uSrcOf) */
776 pCodeBuf[off++] = Armv8A64MkInstrEon(idxTmpReg, idxRegDstIn, idxRegSrc, false);
777 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg2, idxRegDstIn, idxRegResult, false); /* (a_uDst) ^ (a_uResult) */
778 pCodeBuf[off++] = Armv8A64MkInstrAnd(idxTmpReg, idxTmpReg, idxTmpReg2, false /*f64Bit*/);
779 }
780 else if (uImmSrc & RT_BIT_32(cOpBits - 1))
781 {
782 if (fInvertCarry) /* HACK ALERT: fInvertCarry == sbb */
783 pCodeBuf[off++] = Armv8A64MkInstrBic(idxTmpReg, idxRegResult, idxRegDstIn, false);
784 else
785 pCodeBuf[off++] = Armv8A64MkInstrBic(idxTmpReg, idxRegDstIn, idxRegResult, false);
786 }
787 else
788 {
789 if (fInvertCarry) /* HACK ALERT: fInvertCarry == sbb */
790 pCodeBuf[off++] = Armv8A64MkInstrBic(idxTmpReg, idxRegDstIn, idxRegResult, false);
791 else
792 pCodeBuf[off++] = Armv8A64MkInstrBic(idxTmpReg, idxRegResult, idxRegDstIn, false);
793 }
794 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, cOpBits - 1, false /*f64Bit*/);
795 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_OF_BIT, 1);
796 iemNativeRegFreeTmp(pReNative, idxTmpReg2);
797 }
798
799 /* Calculate 8-bit parity of the result. */
800 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxRegResult, idxRegResult, false /*f64Bit*/,
801 4 /*offShift6*/, kArmv8A64InstrShift_Lsr);
802 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxTmpReg, idxTmpReg, false /*f64Bit*/,
803 2 /*offShift6*/, kArmv8A64InstrShift_Lsr);
804 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxTmpReg, idxTmpReg, false /*f64Bit*/,
805 1 /*offShift6*/, kArmv8A64InstrShift_Lsr);
806 Assert(Armv8A64ConvertImmRImmS2Mask32(0, 0) == 1);
807 pCodeBuf[off++] = Armv8A64MkInstrEorImm(idxTmpReg, idxTmpReg, 0, 0, false /*f64Bit*/);
808 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_PF_BIT, 1, false /*f64Bit*/);
809
810 /* Calculate the auxiliary carry/borrow. This is related to 8-bit BCD.
811 General formula: ((uint32_t)(a_uResult) ^ (uint32_t)(a_uSrc) ^ (uint32_t)(a_uDst)) & X86_EFL_AF;
812 S D R
813 0 0 0 -> 0; \
814 0 0 1 -> 1; \ regular
815 0 1 0 -> 1; / xor R, D
816 0 1 1 -> 0; /
817 1 0 0 -> 1; \
818 1 0 1 -> 0; \ invert one of the two
819 1 1 0 -> 0; / xor not(R), D
820 1 1 1 -> 1; /
821 a_uSrc[bit 4]=0: ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF;
822 a_uSrc[bit 4]=1: ((uint32_t)~(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF;
823 */
824
825 if (idxRegSrc != UINT8_MAX)
826 {
827 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxRegDstIn, idxRegSrc, false /*f64Bit*/);
828 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxTmpReg, idxRegResult, false /*f64Bit*/);
829 }
830 else if (uImmSrc & X86_EFL_AF)
831 pCodeBuf[off++] = Armv8A64MkInstrEon(idxTmpReg, idxRegDstIn, idxRegResult, false /*f64Bit*/);
832 else
833 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxRegDstIn, idxRegResult, false /*f64Bit*/);
834 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, X86_EFL_AF_BIT, false /*f64Bit*/);
835 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_AF_BIT, 1, false /*f64Bit*/);
836
837 if (idxRegEflIn != idxRegEfl)
838 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
839 iemNativeRegFreeTmp(pReNative, idxTmpReg);
840
841#else
842# error "port me"
843#endif
844 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
845
846#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
847 pReNative->fSkippingEFlags = fSkipped;
848# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
849 off = iemNativeEmitStoreImmToVCpuU32(pReNative, off, fSkipped, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
850# endif
851#endif
852 }
853 return off;
854
855}
856
857
858
859/*********************************************************************************************************************************
860* Bitwise Logical Operations *
861*********************************************************************************************************************************/
862
863/**
864 * The AND instruction will clear OF, CF and AF (the latter is undefined) and
865 * set the other flags according to the result.
866 */
867DECL_INLINE_THROW(uint32_t)
868iemNativeEmit_and_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
869 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
870{
871 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
872 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
873#ifdef RT_ARCH_AMD64
874 /* On AMD64 we just use the correctly sized AND instruction to harvest the EFLAGS. */
875 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
876 0x22, 0x23, cOpBits, idxRegDst, idxRegSrc);
877 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
878 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
879
880 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
881
882#elif defined(RT_ARCH_ARM64)
883 /* On ARM64 we use 32-bit AND for the 8-bit and 16-bit ones. */
884 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
885 pCodeBuf[off++] = Armv8A64MkInstrAnds(idxRegDst, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/);
886 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
887 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
888
889 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst, true /*fNativeFlags*/);
890#else
891# error "Port me"
892#endif
893 iemNativeVarRegisterRelease(pReNative, idxVarDst);
894 return off;
895}
896
897
898/**
899 * The AND instruction with immediate value as right operand.
900 */
901DECL_INLINE_THROW(uint32_t)
902iemNativeEmit_and_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
903 uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl, uint8_t cOpBits, uint8_t cImmBits)
904{
905 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
906#ifdef RT_ARCH_AMD64
907 /* On AMD64 we just use the correctly sized AND instruction to harvest the EFLAGS. */
908 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
909 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, cOpBits, cImmBits, 4, idxRegDst, uImmOp);
910 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
911
912 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
913
914#elif defined(RT_ARCH_ARM64)
915 /* On ARM64 we use 32-bit AND for the 8-bit and 16-bit ones, and of
916 course the immediate variant when possible to save a register load. */
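 /* (ARM64 logical immediates can only encode a repeating pattern built from a
    single rotated run of ones (e.g. 0x0ff0 or 0xffff00ff encode, 0x1234 does
    not), hence the fallback below that loads the constant into a temporary
    register.) */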
917 uint32_t uImmSizeLen, uImmRotations;
918 if ( cOpBits > 32
919 ? Armv8A64ConvertMask64ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations)
920 : Armv8A64ConvertMask32ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations))
921 {
922 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
923 if (cOpBits >= 32)
924 pCodeBuf[off++] = Armv8A64MkInstrAndsImm(idxRegDst, idxRegDst, uImmSizeLen, uImmRotations, cOpBits > 32 /*f64Bit*/);
925 else
926 pCodeBuf[off++] = Armv8A64MkInstrAndImm(idxRegDst, idxRegDst, uImmSizeLen, uImmRotations, cOpBits > 32 /*f64Bit*/);
927 }
928 else
929 {
930 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
931 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
932 if (cOpBits >= 32)
933 pCodeBuf[off++] = Armv8A64MkInstrAnds(idxRegDst, idxRegDst, idxRegTmpImm, cOpBits > 32 /*f64Bit*/);
934 else
935 pCodeBuf[off++] = Armv8A64MkInstrAnd(idxRegDst, idxRegDst, idxRegTmpImm, cOpBits > 32 /*f64Bit*/);
936 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
937 }
938 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
939
940 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst, cOpBits >= 32 /*fNativeFlags*/);
941 RT_NOREF_PV(cImmBits);
942
943#else
944# error "Port me"
945#endif
946 iemNativeVarRegisterRelease(pReNative, idxVarDst);
947 return off;
948}
949
950
951/**
952 * The TEST instruction will clear OF, CF and AF (the latter is undefined) and
953 * set the other flags according to the result.
954 */
955DECL_INLINE_THROW(uint32_t)
956iemNativeEmit_test_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
957 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
958{
959 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
960 uint8_t const idxRegSrc = idxVarSrc == idxVarDst ? idxRegDst /* special case of 'test samereg,samereg' */
961 : iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
962#ifdef RT_ARCH_AMD64
963 /* On AMD64 we just use the correctly sized TEST instruction to harvest the EFLAGS. */
964 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
965 0x84, 0x85, cOpBits, idxRegSrc, idxRegDst);
966 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
967
968#elif defined(RT_ARCH_ARM64)
969 /* On ARM64 we use 32-bit AND for the 8-bit and 16-bit ones. We also
970 need to keep the result in order to calculate the flags. */
971 uint8_t const idxRegResult = iemNativeRegAllocTmp(pReNative, &off);
972 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
973 if (cOpBits >= 32)
974 pCodeBuf[off++] = Armv8A64MkInstrAnds(idxRegResult, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/);
975 else
976 pCodeBuf[off++] = Armv8A64MkInstrAnd(idxRegResult, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/);
977 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
978
979#else
980# error "Port me"
981#endif
982 if (idxVarSrc != idxVarDst)
983 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
984 iemNativeVarRegisterRelease(pReNative, idxVarDst);
985
986#ifdef RT_ARCH_AMD64
987 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, UINT8_MAX);
988#else
989 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegResult, cOpBits >= 32 /*fNativeFlags*/);
990 iemNativeRegFreeTmp(pReNative, idxRegResult);
991#endif
992 return off;
993}
994
995
996/**
997 * The TEST instruction with immediate value as right operand.
998 */
999DECL_INLINE_THROW(uint32_t)
1000iemNativeEmit_test_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1001 uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl, uint8_t cOpBits, uint8_t cImmBits)
1002{
1003 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1004#ifdef RT_ARCH_AMD64
1005 /* On AMD64 we just use the correctly sized TEST instruction to harvest the EFLAGS. */
1006 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1007 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0xf6, 0xcc, 0xf7, cOpBits, cImmBits, 0, idxRegDst, uImmOp);
1008 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1009 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1010
1011 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, UINT8_MAX);
1012
1013#elif defined(RT_ARCH_ARM64)
1014 /* On ARM64 we use 32-bit AND for the 8-bit and 16-bit ones, and of
1015 course the immediate variant when possible to save a register load.
1016 We also need to keep the result in order to calculate the flags. */
1017 uint8_t const idxRegResult = iemNativeRegAllocTmp(pReNative, &off);
1018 uint32_t uImmSizeLen, uImmRotations;
1019 if ( cOpBits > 32
1020 ? Armv8A64ConvertMask64ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations)
1021 : Armv8A64ConvertMask32ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations))
1022 {
1023 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1024 if (cOpBits >= 32)
1025 pCodeBuf[off++] = Armv8A64MkInstrAndsImm(idxRegResult, idxRegDst, uImmSizeLen, uImmRotations, cOpBits > 32 /*f64Bit*/);
1026 else
1027 pCodeBuf[off++] = Armv8A64MkInstrAndImm(idxRegResult, idxRegDst, uImmSizeLen, uImmRotations, cOpBits > 32 /*f64Bit*/);
1028 }
1029 else
1030 {
1031 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1032 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1033 if (cOpBits >= 32)
1034 pCodeBuf[off++] = Armv8A64MkInstrAnds(idxRegResult, idxRegDst, idxRegTmpImm, cOpBits > 32 /*f64Bit*/);
1035 else
1036 pCodeBuf[off++] = Armv8A64MkInstrAnd(idxRegResult, idxRegDst, idxRegTmpImm, cOpBits > 32 /*f64Bit*/);
1037 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1038 }
1039 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1040 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1041
1042 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegResult, cOpBits >= 32 /*fNativeFlags*/);
1043
1044 iemNativeRegFreeTmp(pReNative, idxRegResult);
1045 RT_NOREF_PV(cImmBits);
1046
1047#else
1048# error "Port me"
1049#endif
1050 return off;
1051}
1052
1053
1054/**
1055 * The OR instruction will clear OF, CF and AF (the latter is undefined) and
1056 * set the other flags according to the result.
1057 */
1058DECL_INLINE_THROW(uint32_t)
1059iemNativeEmit_or_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1060 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1061{
1062 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1063 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1064#ifdef RT_ARCH_AMD64
1065 /* On AMD64 we just use the correctly sized OR instruction to harvest the EFLAGS. */
1066 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1067 0x0a, 0x0b, cOpBits, idxRegDst, idxRegSrc);
1068 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1069 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1070
1071 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
1072
1073#elif defined(RT_ARCH_ARM64)
1074 /* On ARM64 we use 32-bit OR for the 8-bit and 16-bit ones. */
1075 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1076 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDst, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/);
1077 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1078 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1079
1080 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
1081
1082#else
1083# error "Port me"
1084#endif
1085 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1086 return off;
1087}
1088
1089
1090/**
1091 * The OR instruction with immediate value as right operand.
1092 */
1093DECL_INLINE_THROW(uint32_t)
1094iemNativeEmit_or_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1095 uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl, uint8_t cOpBits, uint8_t cImmBits)
1096{
1097 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1098#ifdef RT_ARCH_AMD64
1099 /* On AMD64 we just use the correctly sized OR instruction to harvest the EFLAGS. */
1100 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1101 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, cOpBits, cImmBits, 1, idxRegDst, uImmOp);
1102 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1103
1104 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
1105
1106#elif defined(RT_ARCH_ARM64)
1107 /* On ARM64 we use 32-bit OR for the 8-bit and 16-bit ones, and of
1108 course the immediate variant when possible to save a register load. */
1109 uint32_t uImmSizeLen, uImmRotations;
1110 if ( cOpBits > 32
1111 ? Armv8A64ConvertMask64ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations)
1112 : Armv8A64ConvertMask32ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations))
1113 {
1114 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1115 pCodeBuf[off++] = Armv8A64MkInstrOrrImm(idxRegDst, idxRegDst, uImmSizeLen, uImmRotations, cOpBits > 32 /*f64Bit*/);
1116 }
1117 else
1118 {
1119 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1120 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1121 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDst, idxRegDst, idxRegTmpImm, cOpBits > 32 /*f64Bit*/);
1122 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1123 }
1124 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1125
1126 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
1127 RT_NOREF_PV(cImmBits);
1128
1129#else
1130# error "Port me"
1131#endif
1132 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1133 return off;
1134}
1135
1136
1137/**
1138 * The XOR instruction will clear OF, CF and AF (the latter is undefined) and
1139 * set the other flags according to the result.
1140 */
1141DECL_INLINE_THROW(uint32_t)
1142iemNativeEmit_xor_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1143 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1144{
1145 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1146 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1147#ifdef RT_ARCH_AMD64
1148 /* On AMD64 we just use the correctly sized XOR instruction to harvest the EFLAGS. */
1149 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1150 0x32, 0x33, cOpBits, idxRegDst, idxRegSrc);
1151 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1152 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1153
1154 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
1155
1156#elif defined(RT_ARCH_ARM64)
1157 /* On ARM64 we use 32-bit EOR for the 8-bit and 16-bit ones. */
1158 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1159 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegDst, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/);
1160 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1161 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1162
1163 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
1164
1165#else
1166# error "Port me"
1167#endif
1168 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1169 return off;
1170}
1171
1172
1173/**
1174 * The XOR instruction with immediate value as right operand.
1175 */
1176DECL_INLINE_THROW(uint32_t)
1177iemNativeEmit_xor_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1178 uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl, uint8_t cOpBits, uint8_t cImmBits)
1179{
1180 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1181#ifdef RT_ARCH_AMD64
1182 /* On AMD64 we just use the correctly sized XOR instruction to harvest the EFLAGS. */
1183 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1184 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, cOpBits, cImmBits, 6, idxRegDst, uImmOp);
1185 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1186
1187 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
1188
1189#elif defined(RT_ARCH_ARM64)
1190 /* On ARM64 we use 32-bit EOR for the 8-bit and 16-bit ones, and of
1191 course the immediate variant when possible to save a register load. */
1192 uint32_t uImmSizeLen, uImmRotations;
1193 if ( cOpBits > 32
1194 ? Armv8A64ConvertMask64ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations)
1195 : Armv8A64ConvertMask32ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations))
1196 {
1197 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1198 pCodeBuf[off++] = Armv8A64MkInstrEorImm(idxRegDst, idxRegDst, uImmSizeLen, uImmRotations, cOpBits > 32 /*f64Bit*/);
1199 }
1200 else
1201 {
1202 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1203 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1204 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegDst, idxRegDst, idxRegTmpImm, cOpBits > 32 /*f64Bit*/);
1205 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1206 }
1207 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1208
1209 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
1210 RT_NOREF_PV(cImmBits);
1211
1212#else
1213# error "Port me"
1214#endif
1215 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1216 return off;
1217}
1218
1219
1220
1221/*********************************************************************************************************************************
1222* ADD, ADC, SUB, SBB, CMP *
1223*********************************************************************************************************************************/
1224
1225/**
1226 * The ADD instruction will set all status flags.
1227 */
1228DECL_INLINE_THROW(uint32_t)
1229iemNativeEmit_add_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1230 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1231{
1232 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1233 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1234
1235#ifdef RT_ARCH_AMD64
1236 /* On AMD64 we just use the correctly sized ADD instruction to get the right EFLAGS values. */
1237 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1238 0x02, 0x03, cOpBits, idxRegDst, idxRegSrc);
1239 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1240
1241 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1242 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1243
1244 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1245
1246#elif defined(RT_ARCH_ARM64)
1247 /* On ARM64 we'll need the two input operands as well as the result in order
1248 to calculate the right flags, even if we use ADDS and translate NZCV into
1249 OF, CF, ZF and SF. */
1250 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1251 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4);
1252 if (cOpBits >= 32)
1253 {
1254 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1255 pCodeBuf[off++] = Armv8A64MkInstrAddReg(idxRegDst, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1256 }
1257 else
1258 {
1259 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
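/* (An 8-bit or 16-bit x86 ADD derives CF, OF and SF from bit 7 or 15, so both
   operands are shifted into the top of a 32-bit register; the 32-bit ADDS then
   yields matching native C, V and N flags, and the values are shifted back
   down before the flag calculation below.) */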
1260 uint32_t const cShift = 32 - cOpBits;
1261 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDstIn, ARMV8_A64_REG_XZR, idxRegDst, false /*f64Bit*/, cShift);
1262 pCodeBuf[off++] = Armv8A64MkInstrAddReg(idxRegDst, idxRegDstIn, idxRegSrc, false /*f64Bit*/,
1263 true /*fSetFlags*/, cShift);
1264 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDstIn, idxRegDstIn, cShift, false /*f64Bit*/);
1265 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDst, idxRegDst, cShift, false /*f64Bit*/);
1266 cOpBits = 32;
1267 }
1268 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1269
1270 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, cOpBits, idxRegDst,
1271 idxRegDstIn, idxRegSrc, false /*fInvertCarry*/, 0);
1272
1273 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1274 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1275 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1276
1277#else
1278# error "port me"
1279#endif
1280 return off;
1281}
1282
1283
1284/**
1285 * The ADD instruction with immediate value as right operand.
1286 */
1287DECL_INLINE_THROW(uint32_t)
1288iemNativeEmit_add_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1289 uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl, uint8_t cOpBits, uint8_t cImmBits)
1290{
1291 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1292
1293#ifdef RT_ARCH_AMD64
1294 /* On AMD64 we just use the correctly sized ADD instruction to get the right EFLAGS values. */
1295 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1296 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, cOpBits, cImmBits, 0, idxRegDst, uImmOp);
1297 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1298
1299 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1300
1301 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1302
1303#elif defined(RT_ARCH_ARM64)
1304 /* On ARM64 we'll need the two input operands as well as the result in order
1305 to calculate the right flags, even if we use ADDS and translate NZCV into
1306 OF, CF, ZF and SF. */
1307 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1308 PIEMNATIVEINSTR pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1309 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1310 if (cOpBits >= 32)
1311 {
1312 if (uImmOp <= 0xfffU)
1313 pCodeBuf[off++] = Armv8A64MkInstrAddUImm12(idxRegDst, idxRegDst, uImmOp, cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1314 else if (uImmOp <= 0xfff000U && !(uImmOp & 0xfff))
1315 pCodeBuf[off++] = Armv8A64MkInstrAddUImm12(idxRegDst, idxRegDst, uImmOp >> 12, cOpBits > 32 /*f64Bit*/,
1316 true /*fSetFlags*/, true /*fShift12*/);
1317 else
1318 {
1319 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1320 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1321 pCodeBuf[off++] = Armv8A64MkInstrAddReg(idxRegDst, idxRegDst, idxRegTmpImm, cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1322 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1323 }
1324 }
1325 else
1326 {
1327 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1328 uint32_t const cShift = 32 - cOpBits;
1329 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp << cShift);
1330 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2);
1331 pCodeBuf[off++] = Armv8A64MkInstrAddReg(idxRegDst, idxRegTmpImm, idxRegDstIn, false /*f64Bit*/, true /*fSetFlags*/, cShift);
1332 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDst, idxRegDst, cShift, false /*f64Bit*/);
1333 cOpBits = 32;
1334 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1335 }
1336 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1337
1338 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, cOpBits, idxRegDst,
1339 idxRegDstIn, UINT8_MAX, false /*fInvertCarry*/, uImmOp);
1340
1341 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1342 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1343 RT_NOREF(cImmBits);
1344
1345#else
1346# error "port me"
1347#endif
1348 return off;
1349}
1350
1351
1352/**
1353 * The ADC instruction takes CF as input and will set all status flags.
1354 */
1355DECL_INLINE_THROW(uint32_t)
1356iemNativeEmit_adc_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1357 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1358{
1359 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1360 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1361 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
1362
1363#ifdef RT_ARCH_AMD64
1364 /* On AMD64 we use BT to set EFLAGS.CF and then issue an ADC instruction
1365 with matching size to get the correct flags. */
1366 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 9);
1367
1368 /* Use the BT instruction to set CF according to idxRegEfl. */
1369 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b, 0xba, 32 /*cOpBits*/, 4, idxRegEfl);
1370 pCodeBuf[off++] = X86_EFL_CF_BIT;
1371
1372 off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0x12, 0x13, cOpBits, idxRegDst, idxRegSrc);
1373 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1374
1375 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1376 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1377
1378 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl);
1379
1380#elif defined(RT_ARCH_ARM64)
1381 /* On ARM64 we use the RMIF instruction to load PSTATE.CF from idxRegEfl and
1382 then ADCS for the calculation. We need all inputs and result for the two
1383 flags (AF,PF) that can't be directly derived from PSTATE.NZCV. */
1384 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1385 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 7);
1386
1387 pCodeBuf[off++] = Armv8A64MkInstrRmif(idxRegEfl, (X86_EFL_CF_BIT - 1) & 63, RT_BIT_32(1) /*fMask=C*/);
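     /* RMIF rotates the source right by 63 (i.e. left by one), placing EFLAGS.CF (bit 0)
        in bit 1 of the rotated value; the 0b0010 mask then inserts just that bit into
        PSTATE.C, so the ADCS below sees the guest carry. */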
1388 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1389 if (cOpBits >= 32)
1390 pCodeBuf[off++] = Armv8A64MkInstrAdcs(idxRegDst, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/);
1391 else
1392 {
1393 /* Since we're also adding in the carry flag here, shifting operands up
1394 doesn't work. So, we have to calculate carry & overflow manually. */
1395 pCodeBuf[off++] = Armv8A64MkInstrAdc(idxRegDst, idxRegDst, idxRegSrc, false /*f64Bit*/);
1396 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegDst, cOpBits > 8); /* NZ are okay, CV aren't.*/
1397 }
1398 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1399
1400 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl, cOpBits, idxRegDst,
1401 idxRegDstIn, idxRegSrc, false /*fInvertCarry*/, 0);
1402
1403 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1404 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1405 if (cOpBits < 32)
1406 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegDst, RT_BIT_32(cOpBits) - 1U);
1407 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1408
1409#else
1410# error "port me"
1411#endif
1412 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
1413 return off;
1414}
1415
1416
1417/**
1418 * The ADC instruction with immediate value as right operand.
1419 */
1420DECL_INLINE_THROW(uint32_t)
1421iemNativeEmit_adc_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1422 uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl, uint8_t cOpBits, uint8_t cImmBits)
1423{
1424 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1425 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
1426
1427#ifdef RT_ARCH_AMD64
1428 /* On AMD64 we use BT to set EFLAGS.CF and then issue an ADC instruction
1429 with matching size to get the correct flags. */
1430 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 12);
1431
1432 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b, 0xba, 32 /*cOpBits*/, 4, idxRegEfl);
1433 pCodeBuf[off++] = X86_EFL_CF_BIT;
1434
1435 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, cOpBits, cImmBits, 2, idxRegDst, uImmOp);
1436 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1437
1438 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1439
1440 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl);
1441
1442#elif defined(RT_ARCH_ARM64)
1443 /* On ARM64 we use the RMIF instruction to load PSTATE.CF from idxRegEfl
1444 and then ADCS for the calculation. We need all inputs and result for
1445 the two flags (AF,PF) that can't be directly derived from PSTATE.NZCV. */
1446 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1447 uint8_t const idxRegImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1448 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4);
1449
1450 pCodeBuf[off++] = Armv8A64MkInstrRmif(idxRegEfl, (X86_EFL_CF_BIT - 1) & 63, RT_BIT_32(1) /*fMask=C*/);
1451 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1452 if (cOpBits >= 32)
1453 pCodeBuf[off++] = Armv8A64MkInstrAdcs(idxRegDst, idxRegDst, idxRegImm, cOpBits > 32 /*f64Bit*/);
1454 else
1455 {
1456 /* Since we're also adding in the carry flag here, shifting operands up
1457 doesn't work. So, we have to calculate carry & overflow manually. */
1458 pCodeBuf[off++] = Armv8A64MkInstrAdc(idxRegDst, idxRegDst, idxRegImm, false /*f64Bit*/);
1459 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegDst, cOpBits > 8); /* NZ are okay, CV aren't.*/
1460 }
1461 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1462
1463 iemNativeRegFreeTmp(pReNative, idxRegImm);
1464
1465 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl, cOpBits, idxRegDst,
1466 idxRegDstIn, UINT8_MAX, false /*fInvertCarry*/, uImmOp);
1467
1468 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1469 if (cOpBits < 32)
1470 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegDst, RT_BIT_32(cOpBits) - 1U);
1471 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1472 RT_NOREF(cImmBits);
1473
1474#else
1475# error "port me"
1476#endif
1477 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
1478 return off;
1479}
1480
1481
1482/**
1483 * The SUB instruction will set all status flags.
1484 */
1485DECL_INLINE_THROW(uint32_t)
1486iemNativeEmit_sub_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1487 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1488{
1489 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1490 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1491
1492#ifdef RT_ARCH_AMD64
1493 /* On AMD64 we just use the correctly sized SUB instruction to get the right EFLAGS.SF value. */
1494 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1495 0x2a, 0x2b, cOpBits, idxRegDst, idxRegSrc);
1496 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1497
1498 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1499 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1500
1501 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1502
1503#elif defined(RT_ARCH_ARM64)
1504 /* On ARM64 we'll need the two input operands as well as the result in order
1505 to calculate the right flags, even though we use SUBS and translate NZCV into
1506 OF, CF, ZF and SF. */
1507 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1508 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4);
1509 if (cOpBits >= 32)
1510 {
1511 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1512 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegDst, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1513 }
1514 else
1515 {
1516 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1517 uint32_t const cShift = 32 - cOpBits;
1518 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDstIn, ARMV8_A64_REG_XZR, idxRegDst, false /*f64Bit*/, cShift);
1519 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegDst, idxRegDstIn, idxRegSrc, false /*f64Bit*/,
1520 true /*fSetFlags*/, cShift);
1521 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDstIn, idxRegDstIn, cShift, false /*f64Bit*/);
1522 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDst, idxRegDst, cShift, false /*f64Bit*/);
1523 cOpBits = 32;
1524 }
1525 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1526
1527 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, cOpBits, idxRegDst,
1528 idxRegDstIn, idxRegSrc, true /*fInvertCarry*/, 0);
1529
1530 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1531 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1532 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1533
1534#else
1535# error "port me"
1536#endif
1537 return off;
1538}
1539
1540
1541/**
1542 * The SUB instruction with immediate value as right operand.
1543 */
1544DECL_INLINE_THROW(uint32_t)
1545iemNativeEmit_sub_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1546 uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl, uint8_t cOpBits, uint8_t cImmBits)
1547{
1548 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1549
1550#ifdef RT_ARCH_AMD64
1551 /* On AMD64 we just use the correctly sized SUB instruction to get the right EFLAGS.SF value. */
1552 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1553 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, cOpBits, cImmBits, 5, idxRegDst, uImmOp);
1554 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1555
1556 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1557
1558 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1559
1560#elif defined(RT_ARCH_ARM64)
1561 /* On ARM64 we'll need the two input operands as well as the result in order
1562 to calculate the right flags, even though we use SUBS and translate NZCV into
1563 OF, CF, ZF and SF. */
1564 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1565 PIEMNATIVEINSTR pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1566 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1567 if (cOpBits >= 32)
1568 {
1569 if (uImmOp <= 0xfffU)
1570 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegDst, idxRegDst, uImmOp, cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1571 else if (uImmOp <= 0xfff000U && !(uImmOp & 0xfff))
1572 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegDst, idxRegDst, uImmOp >> 12, cOpBits > 32 /*f64Bit*/,
1573 true /*fSetFlags*/, true /*fShift12*/);
1574 else
1575 {
1576 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1577 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1578 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegDst, idxRegDst, idxRegTmpImm, cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1579 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1580 }
1581 }
1582 else
1583 {
1584 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1585 uint32_t const cShift = 32 - cOpBits;
1586 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1587 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4);
1588 pCodeBuf[off++] = Armv8A64MkInstrLslImm(idxRegDstIn, idxRegDstIn, cShift, false /*f64Bit*/);
1589 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegDst, idxRegDstIn, idxRegTmpImm, false /*f64Bit*/, true /*fSetFlags*/, cShift);
1590 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDstIn, idxRegDstIn, cShift, false /*f64Bit*/);
1591 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDst, idxRegDst, cShift, false /*f64Bit*/);
1592 cOpBits = 32;
1593 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1594 }
1595 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1596
1597 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, cOpBits, idxRegDst,
1598 idxRegDstIn, UINT8_MAX, true /*fInvertCarry*/, uImmOp);
1599
1600 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1601 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1602 RT_NOREF(cImmBits);
1603
1604#else
1605# error "port me"
1606#endif
1607 return off;
1608}
1609
1610
1611/**
1612 * The CMP instruction will set all status flags, but modifies no registers.
1613 */
1614DECL_INLINE_THROW(uint32_t)
1615iemNativeEmit_cmp_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1616 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1617{
1618 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1619 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1620
1621#ifdef RT_ARCH_AMD64
1622 /* On AMD64 we just use the correctly sized CMP instruction to get the right EFLAGS.SF value. */
1623 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1624 0x3a, 0x3b, cOpBits, idxRegDst, idxRegSrc);
1625 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1626
1627 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1628 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1629
1630 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1631
1632#elif defined(RT_ARCH_ARM64)
1633 /* On ARM64 we'll need the actual result as well as both input operands in order
1634 to calculate the right flags, even though we use SUBS and translate NZCV into
1635 OF, CF, ZF and SF. */
1636 uint8_t const idxRegResult = iemNativeRegAllocTmp(pReNative, &off);
1637 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 3);
1638 if (cOpBits >= 32)
1639 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegResult, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1640 else
1641 {
1642 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1643 uint32_t const cShift = 32 - cOpBits;
1644 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegResult, ARMV8_A64_REG_XZR, idxRegDst, false /*f64Bit*/, cShift);
1645 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegResult, idxRegResult, idxRegSrc, false /*f64Bit*/,
1646 true /*fSetFlags*/, cShift);
1647 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegResult, idxRegResult, cShift, false /*f64Bit*/);
1648 cOpBits = 32;
1649 }
1650 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1651
1652 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, cOpBits, idxRegResult,
1653 idxRegDst, idxRegSrc, true /*fInvertCarry*/, 0);
1654
1655 iemNativeRegFreeTmp(pReNative, idxRegResult);
1656 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1657 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1658
1659#else
1660# error "port me"
1661#endif
1662 return off;
1663}
1664
1665
1666/**
1667 * The CMP instruction with immediate value as right operand.
1668 */
1669DECL_INLINE_THROW(uint32_t)
1670iemNativeEmit_cmp_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1671 uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl, uint8_t cOpBits, uint8_t cImmBits)
1672{
1673 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1674
1675#ifdef RT_ARCH_AMD64
1676 /* On AMD64 we just use the correctly sized CMP instruction to get the right EFLAGS.SF value. */
1677 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1678 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, cOpBits, cImmBits, 7, idxRegDst, uImmOp);
1679 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1680
1681 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1682
1683 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1684
1685#elif defined(RT_ARCH_ARM64)
1686 /* On ARM64 we'll need the actual result as well as both input operands in order
1687 to calculate the right flags, even though we use SUBS and translate NZCV into
1688 OF, CF, ZF and SF. */
1689 uint8_t const idxRegResult = iemNativeRegAllocTmp(pReNative, &off);
1690 PIEMNATIVEINSTR pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1691 if (cOpBits >= 32)
1692 {
1693 if (uImmOp <= 0xfffU)
1694 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegResult, idxRegDst, uImmOp, cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1695 else if (uImmOp <= 0xfff000U && !(uImmOp & 0xfff))
1696 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegResult, idxRegDst, uImmOp >> 12, cOpBits > 32 /*f64Bit*/,
1697 true /*fSetFlags*/, true /*fShift12*/);
1698 else
1699 {
1700 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1701 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1702 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegResult, idxRegDst, idxRegTmpImm, cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1703 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1704 }
1705 }
1706 else
1707 {
1708 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1709 uint32_t const cShift = 32 - cOpBits;
1710 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1711 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 3);
1712 pCodeBuf[off++] = Armv8A64MkInstrLslImm(idxRegResult, idxRegDst, cShift, false /*f64Bit*/);
1713 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegResult, idxRegResult, idxRegTmpImm, false /*f64Bit*/, true /*fSetFlags*/, cShift);
1714 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegResult, idxRegResult, cShift, false /*f64Bit*/);
1715 cOpBits = 32;
1716 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1717 }
1718 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1719
1720 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, cOpBits, idxRegResult,
1721 idxRegDst, UINT8_MAX, true /*fInvertCarry*/, uImmOp);
1722
1723 iemNativeRegFreeTmp(pReNative, idxRegResult);
1724 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1725 RT_NOREF(cImmBits);
1726
1727#else
1728# error "port me"
1729#endif
1730 return off;
1731}
1732
1733
1734/**
1735 * The SBB instruction takes CF as input and will set all status flags.
1736 */
1737DECL_INLINE_THROW(uint32_t)
1738iemNativeEmit_sbb_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1739 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1740{
1741 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1742 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1743 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
1744
1745#ifdef RT_ARCH_AMD64
1746 /* On AMD64 we use BT to set EFLAGS.CF and then issue an SBB instruction
1747 with matching size to get the correct flags. */
1748 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 9);
1749
1750 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b, 0xba, 32 /*cOpBits*/, 4, idxRegEfl);
1751 pCodeBuf[off++] = X86_EFL_CF_BIT;
1752
1753 off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0x1a, 0x1b, cOpBits, idxRegDst, idxRegSrc);
1754 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1755
1756 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1757 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1758
1759 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl);
1760
1761#elif defined(RT_ARCH_ARM64)
1762 /* On ARM64 we use the RMIF+CFINV instructions to load PSTATE.CF from
1763 idxRegEfl and then SBCS for the calculation. We need all inputs and
1764 result for the two flags (AF,PF) that can't be directly derived from
1765 PSTATE.NZCV. */
1766 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1767 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
1768
1769 pCodeBuf[off++] = Armv8A64MkInstrRmif(idxRegEfl, (X86_EFL_CF_BIT - 1) & 63, RT_BIT_32(1) /*fMask=C*/);
1770 pCodeBuf[off++] = ARMV8_A64_INSTR_CFINV;
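     /* CFINV is needed because ARM64 subtraction uses an inverted carry (SBC subtracts
        !C as the borrow), whereas x86 SBB subtracts CF directly, so the loaded carry
        must be inverted before the SBCS. */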
1771 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1772 if (cOpBits >= 32)
1773 pCodeBuf[off++] = Armv8A64MkInstrSbcs(idxRegDst, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/);
1774 else
1775 {
1776 /* Since we're also subtracting the carry (borrow) flag here, shifting operands up
1777 doesn't work. So we have to calculate carry & overflow manually. */
1778 pCodeBuf[off++] = Armv8A64MkInstrSbc(idxRegDst, idxRegDst, idxRegSrc, false /*f64Bit*/);
1779 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegDst, cOpBits > 8); /* NZ are okay, CV aren't.*/
1780 }
1781 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1782
1783 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl, cOpBits, idxRegDst,
1784 idxRegDstIn, idxRegSrc, true /*fInvertCarry*/, 0);
1785
1786 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1787 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1788 if (cOpBits < 32)
1789 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegDst, RT_BIT_32(cOpBits) - 1U);
1790 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1791
1792#else
1793# error "port me"
1794#endif
1795 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
1796 return off;
1797}
1798
1799
1800/**
1801 * The SBB instruction with immediate value as right operand.
1802 */
1803DECL_INLINE_THROW(uint32_t)
1804iemNativeEmit_sbb_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1805 uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl, uint8_t cOpBits, uint8_t cImmBits)
1806{
1807 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1808 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
1809
1810#ifdef RT_ARCH_AMD64
1811 /* On AMD64 we use BT to set EFLAGS.CF and then issue an SBB instruction
1812 with matching size to get the correct flags. */
1813 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 12);
1814
1815 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b, 0xba, 32 /*cOpBits*/, 4, idxRegEfl);
1816 pCodeBuf[off++] = X86_EFL_CF_BIT;
1817
1818 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, cOpBits, cImmBits, 3, idxRegDst, uImmOp);
1819 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1820
1821 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1822
1823 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl);
1824
1825#elif defined(RT_ARCH_ARM64)
1826 /* On ARM64 we use the RMIF+CFINV instructions to load PSTATE.CF from
1827 idxRegEfl and then SBCS for the calculation. We need all inputs and
1828 result for the two flags (AF,PF) that can't be directly derived from
1829 PSTATE.NZCV. */
1830 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1831 uint8_t const idxRegImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1832 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
1833
1834 pCodeBuf[off++] = Armv8A64MkInstrRmif(idxRegEfl, (X86_EFL_CF_BIT - 1) & 63, RT_BIT_32(1) /*fMask=C*/);
1835 pCodeBuf[off++] = ARMV8_A64_INSTR_CFINV;
1836 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1837 if (cOpBits >= 32)
1838 pCodeBuf[off++] = Armv8A64MkInstrSbcs(idxRegDst, idxRegDst, idxRegImm, cOpBits > 32 /*f64Bit*/);
1839 else
1840 {
1841 /* Since we're also subtracting the carry (borrow) flag here, shifting operands up
1842 doesn't work. So we have to calculate carry & overflow manually. */
1843 pCodeBuf[off++] = Armv8A64MkInstrSbc(idxRegDst, idxRegDst, idxRegImm, false /*f64Bit*/);
1844 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegDst, cOpBits > 8); /* NZ are okay, CV aren't.*/
1845 }
1846 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1847
1848 iemNativeRegFreeTmp(pReNative, idxRegImm);
1849
1850 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl, cOpBits, idxRegDst,
1851 idxRegDstIn, UINT8_MAX, true /*fInvertCarry*/, uImmOp);
1852
1853 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1854 if (cOpBits < 32)
1855 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegDst, RT_BIT_32(cOpBits) - 1U);
1856 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1857 RT_NOREF(cImmBits);
1858
1859#else
1860# error "port me"
1861#endif
1862 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
1863 return off;
1864}
1865
1866
1867DECL_INLINE_THROW(uint32_t)
1868iemNativeEmit_imul_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1869 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1870{
1871 RT_NOREF(idxVarDst, idxVarSrc, idxVarEfl, cOpBits);
1872 AssertFailed();
1873 return iemNativeEmitBrk(pReNative, off, 0x666);
1874}
1875
1876
1877DECL_INLINE_THROW(uint32_t)
1878iemNativeEmit_popcnt_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1879 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1880{
1881 RT_NOREF(idxVarDst, idxVarSrc, idxVarEfl, cOpBits);
1882 AssertFailed();
1883 return iemNativeEmitBrk(pReNative, off, 0x666);
1884}
1885
1886
1887DECL_INLINE_THROW(uint32_t)
1888iemNativeEmit_tzcnt_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1889 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1890{
1891 RT_NOREF(idxVarDst, idxVarSrc, idxVarEfl, cOpBits);
1892 AssertFailed();
1893 return iemNativeEmitBrk(pReNative, off, 0x666);
1894}
1895
1896
1897DECL_INLINE_THROW(uint32_t)
1898iemNativeEmit_lzcnt_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1899 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1900{
1901 RT_NOREF(idxVarDst, idxVarSrc, idxVarEfl, cOpBits);
1902 AssertFailed();
1903 return iemNativeEmitBrk(pReNative, off, 0x666);
1904}
1905
1906
1907
1908/*********************************************************************************************************************************
1909* Shifting and Rotating. *
1910*********************************************************************************************************************************/
1911
1912
1913typedef enum
1914{
1915 kIemNativeEmitEFlagsForShiftType_Left,
1916 kIemNativeEmitEFlagsForShiftType_Right,
1917 kIemNativeEmitEFlagsForShiftType_SignedRight
1918} IEMNATIVEEMITEFLAGSFORSHIFTTYPE;
1919
1920/**
1921 * This is used by SHL, SHR and SAR emulation.
1922 *
1923 * It takes liveness stuff into account.
1924 */
1925DECL_INLINE_THROW(uint32_t)
1926iemNativeEmitEFlagsForShift(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxRegEfl, uint8_t idxRegResult,
1927 uint8_t idxRegSrc, uint8_t idxRegCount, uint8_t cOpBits, IEMNATIVEEMITEFLAGSFORSHIFTTYPE enmType,
1928 uint8_t idxRegTmp)
1929{
1930 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflTotalShift);
1931
1932 RT_NOREF(pReNative, off, idxRegEfl, idxRegResult, idxRegSrc, idxRegCount, cOpBits, enmType);
1933#if 0 //def IEMNATIVE_WITH_EFLAGS_SKIPPING
1934 /*
1935 * See if we can skip this wholesale.
1936 */
1937 PCIEMLIVENESSENTRY const pLivenessEntry = &pReNative->paLivenessEntries[pReNative->idxCurCall];
1938 if ( IEMLIVENESS_STATE_ARE_STATUS_EFL_TO_BE_CLOBBERED(pLivenessEntry)
1939 && !(pReNative->fMc & IEM_MC_F_WITH_FLAGS))
1940 {
1941 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflSkippedShift);
1942 pReNative->fSkippingEFlags |= X86_EFL_STATUS_BITS;
1943# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
1944 off = iemNativeEmitOrImmIntoVCpuU32(pReNative, off, X86_EFL_STATUS_BITS, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
1945# endif
1946 }
1947 else
1948#endif
1949 {
1950 /*
1951 * The differences between Intel and AMD flags for SHL are:
1952 * - Intel always clears AF while AMD always sets it.
1953 * - Intel derives OF from the first 1-bit shift step, while AMD derives
1954 *   it from the last step.
1955 */
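     /* Example (illustrative): for 'shl al, 3' Intel derives OF from the first 1-bit
        step (bit 7 ^ bit 6 of the original AL), AMD from the last step (final CF ^
        bit 7 of the result), so the two flavours can disagree for counts > 1. */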
1956
1957#ifdef RT_ARCH_AMD64
1958 /*
1959 * We capture the flags and do the additional OF and AF calculations as needed.
1960 */
1961 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 64);
1962 /** @todo kIemNativeEmitEFlagsForShiftType_SignedRight: we could alternatively
1963 * use LAHF here when host rax is free, since OF is cleared. */
1964 /* pushf */
1965 pCodeBuf[off++] = 0x9c;
1966 /* pop tmp */
1967 if (idxRegTmp >= 8)
1968 pCodeBuf[off++] = X86_OP_REX_B;
1969 pCodeBuf[off++] = 0x58 + (idxRegTmp & 7);
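     /* The PUSHF/POP pair captures the host EFLAGS left behind by the shift
        instruction the caller just emitted; idxRegTmp now holds the raw host
        flags that get masked and merged into the guest EFLAGS below. */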
1970 /* Clear the status bits in EFLs. */
1971 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegEfl, ~X86_EFL_STATUS_BITS);
1972 uint8_t const idxTargetCpuEflFlavour = pReNative->pVCpu->iem.s.aidxTargetCpuEflFlavour[1];
1973 if (idxTargetCpuEflFlavour == IEMTARGETCPU_EFL_BEHAVIOR_NATIVE)
1974 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegTmp, X86_EFL_STATUS_BITS);
1975 else
1976 {
1977 /* and tmp, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_CF */
1978 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegTmp, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_CF);
1979 if (idxTargetCpuEflFlavour == IEMTARGETCPU_EFL_BEHAVIOR_AMD)
1980 off = iemNativeEmitOrGpr32ByImmEx(pCodeBuf, off, idxRegTmp, X86_EFL_AF);
1981 /* OR in the flags we collected. */
1982 off = iemNativeEmitOrGpr32ByGprEx(pCodeBuf, off, idxRegEfl, idxRegTmp);
1983
1984 /* Calculate OF */
1985 if (idxTargetCpuEflFlavour == IEMTARGETCPU_EFL_BEHAVIOR_AMD)
1986 {
1987 /* AMD last bit shifted: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
1988 /* bt idxRegResult, (cOpBits - 1) => CF=result-sign-bit */
1989 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b /*ud2*/, 0xba,
1990 RT_MAX(cOpBits, 16), 4, idxRegResult);
1991 pCodeBuf[off++] = cOpBits - 1;
1992 /* setc idxRegTmp */
1993 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x92, 0x0b /*ud2*/, 8, 0, idxRegTmp);
1994 /* xor idxRegTmp, idxRegEfl */
1995 off = iemNativeEmitXorGpr32ByGpr32Ex(pCodeBuf, off, idxRegTmp, idxRegEfl);
1996 /* and idxRegTmp, 1 */
1997 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegTmp, 1);
1998 /* shl idxRegTmp, X86_EFL_OF_BIT */
1999 off = iemNativeEmitShiftGpr32LeftEx(pCodeBuf, off, idxRegTmp, X86_EFL_OF_BIT);
2000 }
2001 else
2002 {
2003 /* Intel first bit shifted: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
2004 if (cOpBits <= 32)
2005 {
2006 /* mov idxRegTmp, idxRegSrc */
2007 off = iemNativeEmitLoadGprFromGpr32Ex(pCodeBuf, off, idxRegTmp, idxRegSrc);
2008 /* shl idxRegTmp, 1 */
2009 off = iemNativeEmitShiftGpr32LeftEx(pCodeBuf, off, idxRegTmp, 1);
2010 /* xor idxRegTmp, idxRegSrc */
2011 off = iemNativeEmitXorGprByGprEx(pCodeBuf, off, idxRegTmp, idxRegSrc);
2012 /* shr idxRegTmp, cOpBits - X86_EFL_OF_BIT - 1 or shl idxRegTmp, X86_EFL_OF_BIT - cOpBits + 1 */
2013 if (cOpBits >= X86_EFL_OF_BIT)
2014 off = iemNativeEmitShiftGpr32RightEx(pCodeBuf, off, idxRegTmp, cOpBits - X86_EFL_OF_BIT - 1);
2015 else
2016 off = iemNativeEmitShiftGpr32LeftEx(pCodeBuf, off, idxRegTmp, X86_EFL_OF_BIT - cOpBits + 1);
2017 }
2018 else
2019 {
2020 /* Same as above, but with 64-bit GPRs. */
2021 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegTmp, idxRegSrc);
2022 off = iemNativeEmitShiftGprLeftEx(pCodeBuf, off, idxRegTmp, 1);
2023 off = iemNativeEmitXorGprByGprEx(pCodeBuf, off, idxRegTmp, idxRegSrc);
2024 off = iemNativeEmitShiftGprRightEx(pCodeBuf, off, idxRegTmp, cOpBits - X86_EFL_OF_BIT - 1);
2025 }
2026 /* and idxRegTmp, X86_EFL_OF */
2027 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegTmp, X86_EFL_OF);
2028 }
2029 }
2030 /* Or in the collected flag(s) */
2031 off = iemNativeEmitOrGpr32ByGprEx(pCodeBuf, off, idxRegEfl, idxRegTmp);
2032
2033#elif defined(RT_ARCH_ARM64)
2034 /*
2035 * Calculate flags.
2036 */
2037 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 20);
2038
2039 /* Clear the status bits. ~0x8D5 (or ~0x8FD) can't be AND immediate, so use idxRegTmp for constant. */
2040 off = iemNativeEmitLoadGpr32ImmEx(pCodeBuf, off, idxRegTmp, ~X86_EFL_STATUS_BITS);
2041 off = iemNativeEmitAndGpr32ByGpr32Ex(pCodeBuf, off, idxRegEfl, idxRegTmp);
2042
2043 /* N,Z -> SF,ZF */
2044 if (cOpBits < 32)
2045 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegResult, cOpBits > 8); /* sets NZ */
2046 else
2047 pCodeBuf[off++] = Armv8A64MkInstrAnds(ARMV8_A64_REG_XZR, idxRegResult, idxRegResult, cOpBits > 32 /*f64Bit*/);
2048 pCodeBuf[off++] = Armv8A64MkInstrMrs(idxRegTmp, ARMV8_AARCH64_SYSREG_NZCV); /* Bits: 31=N; 30=Z; 29=C; 28=V; */
2049 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegTmp, 30);
2050 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_ZF_BIT, 2, false /*f64Bit*/);
2051 AssertCompile(X86_EFL_ZF_BIT + 1 == X86_EFL_SF_BIT);
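     /* After the LSR, idxRegTmp holds Z in bit 0 and N in bit 1, which the BFI drops
        straight into ZF (bit 6) and SF (bit 7) of the guest EFLAGS in one go. */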
2052
2053 /* Calculate 8-bit parity of the result. */
2054 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegResult, idxRegResult, false /*f64Bit*/,
2055 4 /*offShift6*/, kArmv8A64InstrShift_Lsr);
2056 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegTmp, idxRegTmp, false /*f64Bit*/,
2057 2 /*offShift6*/, kArmv8A64InstrShift_Lsr);
2058 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegTmp, idxRegTmp, false /*f64Bit*/,
2059 1 /*offShift6*/, kArmv8A64InstrShift_Lsr);
2060 Assert(Armv8A64ConvertImmRImmS2Mask32(0, 0) == 1);
2061 pCodeBuf[off++] = Armv8A64MkInstrEorImm(idxRegTmp, idxRegTmp, 0, 0, false /*f64Bit*/);
2062 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_PF_BIT, 1, false /*f64Bit*/);
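     /* Rough C equivalent of the parity fold above (illustrative only):
            uTmp  = uResult ^ (uResult >> 4);
            uTmp ^= uTmp >> 2;
            uTmp ^= uTmp >> 1;
            fEfl |= ((uTmp & 1) ^ 1) << X86_EFL_PF_BIT;
        i.e. the low 8 result bits are XOR-folded into bit 0 and inverted, since
        x86 PF is set for an even number of set bits. */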
2063
2064 /* Calculate carry - the last bit shifted out of the input value. */
2065 if (enmType == kIemNativeEmitEFlagsForShiftType_Left)
2066 {
2067 /* CF = (idxRegSrc >> (cOpBits - idxRegCount))) & 1 */
2068 pCodeBuf[off++] = Armv8A64MkInstrMovZ(idxRegTmp, cOpBits);
2069 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegTmp, idxRegTmp, idxRegCount, false /*f64Bit*/, cOpBits < 32 /*fSetFlags*/);
2070 if (cOpBits < 32)
2071 pCodeBuf[off++] = Armv8A64MkInstrBCond(kArmv8InstrCond_Cc, 3); /* 16 or 8 bit: CF is clear if all shifted out */
2072 pCodeBuf[off++] = Armv8A64MkInstrLsrv(idxRegTmp, idxRegSrc, idxRegTmp, cOpBits > 32);
2073 }
2074 else
2075 {
2076 /* CF = (idxRegSrc >> (idxRegCount - 1)) & 1 */
2077 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegTmp, idxRegCount, 1, false /*f64Bit*/);
2078 pCodeBuf[off++] = Armv8A64MkInstrLsrv(idxRegTmp, idxRegSrc, idxRegTmp, cOpBits > 32);
2079 }
2080 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_CF_BIT, 1, false /*f64Bit*/);
2081
2082 uint8_t const idxTargetCpuEflFlavour = pReNative->pVCpu->iem.s.aidxTargetCpuEflFlavour[0];
2083 if (idxTargetCpuEflFlavour != IEMTARGETCPU_EFL_BEHAVIOR_AMD)
2084 {
2085 /* Intel: OF = first bit shifted: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
2086 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegSrc, idxRegSrc, cOpBits > 32, 1 /*left shift count*/);
2087 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegTmp, cOpBits - 1, cOpBits > 32);
2088 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_OF_BIT, 1, false /*f64Bit*/);
2089 }
2090 else
2091 {
2092 /* AMD: OF = last bit shifted: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
2093 AssertCompile(X86_EFL_CF_BIT == 0);
2094 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegEfl, idxRegResult, cOpBits > 32, /* ASSUMES CF calculated! */
2095 cOpBits - 1, kArmv8A64InstrShift_Lsr);
2096 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_OF_BIT, 1, false /*f64Bit*/);
2097
2098 /* AMD unconditionally sets AF. */
2099 Assert(Armv8A64ConvertImmRImmS2Mask32(0, 32 - X86_EFL_AF_BIT) == X86_EFL_AF);
2100 pCodeBuf[off++] = Armv8A64MkInstrOrrImm(idxRegEfl, idxRegEfl, 0, 32 - X86_EFL_AF_BIT, false /*f64Bit*/);
2101 }
2102#else
2103# error "port me"
2104#endif
2105 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2106
2107#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
2108 pReNative->fSkippingEFlags = 0;
2109# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
2110 off = iemNativeEmitStoreImmToVCpuU32(pReNative, off, 0, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
2111# endif
2112#endif
2113 }
2114 return off;
2115}
2116
2117
2118DECL_INLINE_THROW(uint32_t)
2119iemNativeEmit_shl_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2120 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2121{
2122 /* Note! Since we're doing some branching here, we need to allocate all
2123 registers we need before the jump or we may end up with invalid
2124 register state if the branch is taken. */
2125 uint8_t const idxRegTmp = iemNativeRegAllocTmp(pReNative, &off); /* Do this first in hope we'll get EAX. */
2126 uint8_t const idxRegCount = iemNativeVarRegisterAcquire(pReNative, idxVarCount, &off, true /*fInitialized*/); /* modified on arm64 */
2127 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
2128 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
2129
2130#ifdef RT_ARCH_AMD64
2131 /* Make sure IEM_MC_NATIVE_AMD64_HOST_REG_FOR_LOCAL was used. */
2132 AssertStmt(idxRegCount == X86_GREG_xCX, IEMNATIVE_DO_LONGJMP(pReNative, VERR_IEM_EMIT_UNEXPECTED_VAR_REGISTER));
2133
2134 /* We only need a copy of the input value if the target CPU differs from the host CPU. */
2135 uint8_t const idxRegDstIn = pReNative->pVCpu->iem.s.aidxTargetCpuEflFlavour[1] == IEMTARGETCPU_EFL_BEHAVIOR_NATIVE
2136 ? UINT8_MAX : iemNativeRegAllocTmp(pReNative, &off);
2137 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4+2+3+4);
2138
2139 /* Check if it's NOP before we do anything. */
2140 off = iemNativeEmitTestAnyBitsInGpr8Ex(pCodeBuf, off, idxRegCount, cOpBits <= 32 ? 0x1f : 0x3f);
2141 uint32_t const offFixup = off;
2142 off = iemNativeEmitJccToFixedEx(pCodeBuf, off, off /*8-bit should be enough */, kIemNativeInstrCond_z);
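     /* A (masked) shift count of zero leaves all EFLAGS unchanged on x86, so the
        jump above skips both the shift and the entire flags calculation. */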
2143
2144 if (idxRegDstIn != UINT8_MAX)
2145 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
2146 off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0xd2, 0xd3, cOpBits, 4, idxRegDst);
2147
2148#elif defined(RT_ARCH_ARM64)
2149 /* We always need a copy of the input value (unless we can skip the EFLAGS calcs). */
2150 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
2151 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6);
2152
2153 /* Check if it's NOP before we do anything. We MODIFY idxRegCount here! */
2154 Assert(Armv8A64ConvertImmRImmS2Mask32(4, 0) == 0x1f);
2155 Assert(Armv8A64ConvertImmRImmS2Mask32(5, 0) == 0x3f);
2156 pCodeBuf[off++] = Armv8A64MkInstrAndsImm(idxRegCount, idxRegCount, cOpBits > 32 ? 5 : 4, 0, false /*f64Bit*/);
2157 uint32_t const offFixup = off;
2158 off = iemNativeEmitJccToFixedEx(pCodeBuf, off, off, kArmv8InstrCond_Eq);
2159
2160 pCodeBuf[off++] = Armv8A64MkInstrMov(idxRegDstIn, idxRegDst);
2161 pCodeBuf[off++] = Armv8A64MkInstrLslv(idxRegDst, idxRegDst, idxRegCount, cOpBits > 32 /*f64Bit*/);
2162 if (cOpBits < 32)
2163 {
2164 Assert(Armv8A64ConvertImmRImmS2Mask32(7, 0) == 0xff);
2165 Assert(Armv8A64ConvertImmRImmS2Mask32(15, 0) == 0xffff);
2166 pCodeBuf[off++] = Armv8A64MkInstrAndImm(idxRegDst, idxRegDst, cOpBits - 1, 0, false /*f64Bit*/);
2167 }
2168
2169#else
2170# error "port me"
2171#endif
2172
2173 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2174 off = iemNativeEmitEFlagsForShift(pReNative, off, idxRegEfl, idxRegDst, idxRegDstIn, idxRegCount,
2175 cOpBits, kIemNativeEmitEFlagsForShiftType_Left, idxRegTmp);
2176
2177 /* fixup the jump */
2178 iemNativeFixupFixedJump(pReNative, offFixup, off);
2179
2180#ifdef RT_ARCH_AMD64
2181 if (idxRegDstIn != UINT8_MAX)
2182#endif
2183 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
2184 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
2185 iemNativeVarRegisterRelease(pReNative, idxVarDst);
2186 iemNativeVarRegisterRelease(pReNative, idxVarCount);
2187 iemNativeRegFreeTmp(pReNative, idxRegTmp);
2188 return off;
2189}
2190
2191
2192DECL_INLINE_THROW(uint32_t)
2193iemNativeEmit_shr_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2194 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2195{
2196 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2197 AssertFailed();
2198 return iemNativeEmitBrk(pReNative, off, 0x666);
2199}
2200
2201
2202DECL_INLINE_THROW(uint32_t)
2203iemNativeEmit_sar_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2204 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2205{
2206 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2207 AssertFailed();
2208 return iemNativeEmitBrk(pReNative, off, 0x666);
2209}
2210
2211
2212DECL_INLINE_THROW(uint32_t)
2213iemNativeEmit_rol_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2214 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2215{
2216 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2217 AssertFailed();
2218 return iemNativeEmitBrk(pReNative, off, 0x666);
2219}
2220
2221
2222DECL_INLINE_THROW(uint32_t)
2223iemNativeEmit_ror_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2224 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2225{
2226 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2227 AssertFailed();
2228 return iemNativeEmitBrk(pReNative, off, 0x666);
2229}
2230
2231
2232DECL_INLINE_THROW(uint32_t)
2233iemNativeEmit_rcl_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2234 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2235{
2236 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2237 AssertFailed();
2238 return iemNativeEmitBrk(pReNative, off, 0x666);
2239}
2240
2241
2242DECL_INLINE_THROW(uint32_t)
2243iemNativeEmit_rcr_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2244 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2245{
2246 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2247 AssertFailed();
2248 return iemNativeEmitBrk(pReNative, off, 0x666);
2249}
2250
2251
2252
2253#ifdef IEMNATIVE_WITH_SIMD_REG_ALLOCATOR
2254/*********************************************************************************************************************************
2255* SIMD emitters. *
2256*********************************************************************************************************************************/
2257
2258/**
2259 * Common emitter for packed logical instructions (POR, PXOR, PAND).
2260 */
2261#ifdef RT_ARCH_AMD64
2262# define IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(a_Instr, a_enmArmOp, a_bOpcX86) \
2263 DECL_INLINE_THROW(uint32_t) \
2264 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2265 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2266 { \
2267 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2268 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2269 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2270 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2271 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2272 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2273 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2274 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2275 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2276 pCodeBuf[off++] = 0x0f; \
2277 pCodeBuf[off++] = (a_bOpcX86); \
2278 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2279 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2280 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2281 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2282 return off; \
2283 } \
2284 DECL_INLINE_THROW(uint32_t) \
2285 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2286 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2287 { \
2288 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2289 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2290 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2291 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2292 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2293 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2294 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2295 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2296 pCodeBuf[off++] = 0x0f; \
2297 pCodeBuf[off++] = (a_bOpcX86); \
2298 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2299 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2300 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2301 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2302 return off; \
2303 } \
2304 typedef int ignore_semicolon
2305#elif defined(RT_ARCH_ARM64)
2306# define IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(a_Instr, a_enmArmOp, a_bOpcX86) \
2307 DECL_INLINE_THROW(uint32_t) \
2308 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2309 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2310 { \
2311 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2312 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2313 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2314 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2315 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2316 pCodeBuf[off++] = Armv8A64MkVecInstrLogical((a_enmArmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc); \
2317 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2318 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2319 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2320 return off; \
2321 } \
2322 DECL_INLINE_THROW(uint32_t) \
2323 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2324 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2325 { \
2326 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2327 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2328 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2329 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2330 pCodeBuf[off++] = Armv8A64MkVecInstrLogical((a_enmArmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc); \
2331 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2332 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2333 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2334 return off; \
2335 } \
2336 typedef int ignore_semicolon
2337#else
2338# error "Port me"
2339#endif
2340
2341/* POR, ORPS, ORPD. */
2342IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(por, kArmv8VecInstrLogicOp_Orr, 0xeb);
2343/* PXOR, XORPS, XORPD. */
2344IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(pxor, kArmv8VecInstrLogicOp_Eor, 0xef);
2345/* PAND, ANDPS, ANDPD. */
2346IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(pand, kArmv8VecInstrLogicOp_And, 0xdb);
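     /* Note: each instantiation above expands into an iemNativeEmit_<instr>_rr_u128 and
        an _rv_u128 variant; on AMD64 they emit the SSE2 form 66 [REX] 0F <opc> /r, on
        ARM64 the corresponding NEON logical operation on the low 128 bits. */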
2347
2348
2349/**
2350 * Common emitter for the shift right with immediate instructions.
2351 */
2352#ifdef RT_ARCH_AMD64
2353# define IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(a_Instr, a_cShiftMax, a_ArmElemSz, a_bOpcX86) \
2354 DECL_INLINE_THROW(uint32_t) \
2355 RT_CONCAT3(iemNativeEmit_,a_Instr,_ri_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2356 uint8_t const idxSimdGstRegDst, uint8_t const bImm) \
2357 { \
2358 if (bImm) \
2359 { \
2360 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2361 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2362 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6); \
2363 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2364 if (idxSimdRegDst >= 8) \
2365 pCodeBuf[off++] = X86_OP_REX_B; \
2366 pCodeBuf[off++] = 0x0f; \
2367 pCodeBuf[off++] = (a_bOpcX86); \
2368 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 2, idxSimdRegDst & 7); \
2369 pCodeBuf[off++] = bImm; \
2370 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2371 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2372 } \
2373 /* Immediate 0 is a nop. */ \
2374 return off; \
2375 } \
2376 typedef int ignore_semicolon
2377#elif defined(RT_ARCH_ARM64)
2378# define IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(a_Instr, a_cShiftMax, a_ArmElemSz, a_bOpcX86) \
2379 DECL_INLINE_THROW(uint32_t) \
2380 RT_CONCAT3(iemNativeEmit_,a_Instr,_ri_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2381 uint8_t const idxSimdGstRegDst, uint8_t const bImm) \
2382 { \
2383 if (bImm) \
2384 { \
2385 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2386 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2387 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2388 pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegDst, idxSimdRegDst, RT_MIN(bImm, (a_cShiftMax)), (a_ArmElemSz)); \
2389 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2390 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2391 } \
2392 /* Immediate 0 is a nop. */ \
2393 return off; \
2394 } \
2395 typedef int ignore_semicolon
2396#else
2397# error "Port me"
2398#endif
2399
2400IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(psrlw, 16, kArmv8InstrShiftSz_U16, 0x71);
2401IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(psrld, 32, kArmv8InstrShiftSz_U32, 0x72);
2402IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(psrlq, 64, kArmv8InstrShiftSz_U64, 0x73);
2403
2404
2405/**
2406 * Common emitter for the shift left with immediate instructions.
2407 */
2408#ifdef RT_ARCH_AMD64
2409# define IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(a_Instr, a_cShiftMax, a_ArmElemSz, a_bOpcX86) \
2410 DECL_INLINE_THROW(uint32_t) \
2411 RT_CONCAT3(iemNativeEmit_,a_Instr,_ri_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2412 uint8_t const idxSimdGstRegDst, uint8_t const bImm) \
2413 { \
2414 if (bImm) \
2415 { \
2416 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2417 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2418 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6); \
2419 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2420 if (idxSimdRegDst >= 8) \
2421 pCodeBuf[off++] = X86_OP_REX_B; \
2422 pCodeBuf[off++] = 0x0f; \
2423 pCodeBuf[off++] = (a_bOpcX86); \
2424 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 6, idxSimdRegDst & 7); \
2425 pCodeBuf[off++] = bImm; \
2426 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2427 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2428 } \
2429 /* Immediate 0 is a nop. */ \
2430 return off; \
2431 } \
2432 typedef int ignore_semicolon
2433#elif defined(RT_ARCH_ARM64)
2434# define IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(a_Instr, a_cShiftMax, a_ArmElemSz, a_bOpcX86) \
2435 DECL_INLINE_THROW(uint32_t) \
2436 RT_CONCAT3(iemNativeEmit_,a_Instr,_ri_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2437 uint8_t const idxSimdGstRegDst, uint8_t const bImm) \
2438 { \
2439 if (bImm) /* bImm == 0 is a nop */ \
2440 { \
2441 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2442 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2443 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2444 if (bImm < (a_cShiftMax)) \
2445 pCodeBuf[off++] = Armv8A64MkVecInstrShlImm(idxSimdRegDst, idxSimdRegDst, bImm, (a_ArmElemSz)); \
2446 else /* Everything >= a_cShiftMax sets the register to zero. */ \
2447 pCodeBuf[off++] = Armv8A64MkVecInstrEor(idxSimdRegDst, idxSimdRegDst, idxSimdRegDst); \
2448 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2449 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2450 } \
2451 return off; \
2452 } \
2453 typedef int ignore_semicolon
2454#else
2455# error "Port me"
2456#endif
2457
2458IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(psllw, 16, kArmv8InstrShiftSz_U16, 0x71);
2459IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(pslld, 32, kArmv8InstrShiftSz_U32, 0x72);
2460IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(psllq, 64, kArmv8InstrShiftSz_U64, 0x73);
2461
2462
2463/**
2464 * Common emitter for packed arithmetic instructions.
2465 */
2466#ifdef RT_ARCH_AMD64
2467# define IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(a_Instr, a_enmArmOp, a_ArmElemSz, a_bOpcX86) \
2468 DECL_INLINE_THROW(uint32_t) \
2469 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2470 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2471 { \
2472 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2473 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2474 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2475 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2476 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2477 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2478 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2479 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2480 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2481 pCodeBuf[off++] = 0x0f; \
2482 pCodeBuf[off++] = (a_bOpcX86); \
2483 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2484 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2485 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2486 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2487 return off; \
2488 } \
2489 DECL_INLINE_THROW(uint32_t) \
2490 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2491 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2492 { \
2493 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2494 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2495 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2496 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2497 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2498 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2499 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2500 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2501 pCodeBuf[off++] = 0x0f; \
2502 pCodeBuf[off++] = (a_bOpcX86); \
2503 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2504 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2505 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2506 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2507 return off; \
2508 } \
2509 typedef int ignore_semicolon
2510#elif defined(RT_ARCH_ARM64)
2511# define IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(a_Instr, a_enmArmOp, a_ArmElemSz, a_bOpcX86) \
2512 DECL_INLINE_THROW(uint32_t) \
2513 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2514 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2515 { \
2516 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2517 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2518 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2519 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2520 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2521 pCodeBuf[off++] = Armv8A64MkVecInstrArithOp((a_enmArmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc, (a_ArmElemSz)); \
2522 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2523 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2524 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2525 return off; \
2526 } \
2527 DECL_INLINE_THROW(uint32_t) \
2528 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2529 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2530 { \
2531 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2532 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2533 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2534 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2535 pCodeBuf[off++] = Armv8A64MkVecInstrArithOp((a_enmArmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc, (a_ArmElemSz)); \
2536 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2537 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2538 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2539 return off; \
2540 } \
2541 typedef int ignore_semicolon
2542#else
2543# error "Port me"
2544#endif
2545
2546/*
2547 * PADDx.
2548 */
2549IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddb, kArmv8VecInstrArithOp_Add, kArmv8VecInstrArithSz_8, 0xfc);
2550IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddw, kArmv8VecInstrArithOp_Add, kArmv8VecInstrArithSz_16, 0xfd);
2551IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddd, kArmv8VecInstrArithOp_Add, kArmv8VecInstrArithSz_32, 0xfe);
2552IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddq, kArmv8VecInstrArithOp_Add, kArmv8VecInstrArithSz_64, 0xd4);
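/*
 * For illustration, the AMD64 path of IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128 emits the
 * plain SSE2 encoding 66 [REX] 0F <opc> ModRM. With a hypothetical register assignment
 * of xmm1 as destination and xmm9 as source, paddw would come out as:
 *
 *      66 41 0f fd c9      ; 66 = operand size prefix, 41 = REX.B (source >= xmm8),
 *                          ; fd = paddw opcode, c9 = ModRM(mod=3, reg=1, r/m=9 & 7)
 *
 * The ARM64 path is a single vector ADD with the matching element size, e.g.
 * ADD Vdst.8H, Vdst.8H, Vsrc.8H for paddw.
 */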
2553
2554/*
2555 * PSUBx.
2556 */
2557IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(psubb, kArmv8VecInstrArithOp_Sub, kArmv8VecInstrArithSz_8, 0xf8);
2558IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(psubw, kArmv8VecInstrArithOp_Sub, kArmv8VecInstrArithSz_16, 0xf9);
2559IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(psubd, kArmv8VecInstrArithOp_Sub, kArmv8VecInstrArithSz_32, 0xfa);
2560IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(psubq, kArmv8VecInstrArithOp_Sub, kArmv8VecInstrArithSz_64, 0xfb);
2561
2562/*
2563 * PADDUSx.
2564 */
2565IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddusb, kArmv8VecInstrArithOp_UnsignSat_Add, kArmv8VecInstrArithSz_8, 0xdc);
2566IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddusw, kArmv8VecInstrArithOp_UnsignSat_Add, kArmv8VecInstrArithSz_16, 0xdd);
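/*
 * Saturation example for the unsigned saturating adds (UQADD on ARM64): for paddusb
 * a lane holding 0xf0 plus a lane holding 0x20 yields 0xff instead of wrapping to 0x10.
 */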
2567
2568/*
2569 * PMULLx.
2570 */
2571IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(pmullw, kArmv8VecInstrArithOp_Mul, kArmv8VecInstrArithSz_16, 0xd5);
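/*
 * Note on pmullw: only the low 16 bits of each 32-bit product are kept, which is also
 * what the ARM64 vector MUL produces, so signedness does not matter here.
 * Example: 0x00ff * 0x00ff = 0x0000fe01, stored as 0xfe01 in the lane.
 */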
2572
2573
2574/**
2575 * Common emitter for the pcmpeqb/pcmpeqw/pcmpeqd instructions.
2576 */
2577#ifdef RT_ARCH_AMD64
2578# define IEMNATIVE_NATIVE_EMIT_PCMP_U128(a_Instr, a_enmOp, a_ArmElemSz, a_bOpcX86) \
2579 DECL_INLINE_THROW(uint32_t) \
2580 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2581 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2582 { \
2583 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2584 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2585 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2586 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2587 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2588 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2589 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2590 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2591 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2592 pCodeBuf[off++] = 0x0f; \
2593 pCodeBuf[off++] = (a_bOpcX86); \
2594 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2595 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2596 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2597 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2598 return off; \
2599 } \
2600 DECL_INLINE_THROW(uint32_t) \
2601 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2602 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2603 { \
2604 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2605 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2606 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2607 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2608 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2609 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2610 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2611 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2612 pCodeBuf[off++] = 0x0f; \
2613 pCodeBuf[off++] = (a_bOpcX86); \
2614 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2615 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2616 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2617 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2618 return off; \
2619 } \
2620 typedef int ignore_semicolon
2621#elif defined(RT_ARCH_ARM64)
2622# define IEMNATIVE_NATIVE_EMIT_PCMP_U128(a_Instr, a_enmOp, a_ArmElemSz, a_bOpcX86) \
2623 DECL_INLINE_THROW(uint32_t) \
2624 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2625 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2626 { \
2627 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2628 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2629 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2630 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2631 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2632 pCodeBuf[off++] = Armv8A64MkVecInstrCmp((a_enmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc, (a_ArmElemSz)); \
2633 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2634 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2635 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2636 return off; \
2637 } \
2638 DECL_INLINE_THROW(uint32_t) \
2639 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2640 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2641 { \
2642 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2643 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2644 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2645 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2646 pCodeBuf[off++] = Armv8A64MkVecInstrCmp((a_enmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc, (a_ArmElemSz)); \
2647 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2648 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2649 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2650 return off; \
2651 } \
2652 typedef int ignore_semicolon
2653#else
2654# error "Port me"
2655#endif
2656
2657IEMNATIVE_NATIVE_EMIT_PCMP_U128(pcmpeqb, kArmv8VecInstrCmpOp_Eq, kArmv8VecInstrArithSz_8, 0x74);
2658IEMNATIVE_NATIVE_EMIT_PCMP_U128(pcmpeqw, kArmv8VecInstrCmpOp_Eq, kArmv8VecInstrArithSz_16, 0x75);
2659IEMNATIVE_NATIVE_EMIT_PCMP_U128(pcmpeqd, kArmv8VecInstrCmpOp_Eq, kArmv8VecInstrArithSz_32, 0x76);
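/*
 * Semantics reminder: the packed equality compares produce an all-ones / all-zeroes
 * mask per element, which is why they map 1:1 onto the ARM64 CMEQ encodings above.
 * Worked example for pcmpeqb with hypothetical lane values:
 *
 *      dst lanes:   01 02 03 04 ...      src lanes:   01 ff 03 00 ...
 *      dst after:   ff 00 ff 00 ...
 */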
2660
2661
2662/**
2663 * Emitter for pmovmskb.
2664 */
2665DECL_INLINE_THROW(uint32_t)
2666iemNativeEmit_pmovmskb_rr_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2667 uint8_t const idxGstRegDst, uint8_t const idxSimdGstRegSrc)
2668{
2669#ifdef RT_ARCH_AMD64
2670 uint8_t const idxRegDst = iemNativeRegAllocTmpForGuestReg(pReNative, &off, IEMNATIVEGSTREG_GPR(idxGstRegDst),
2671 kIemNativeGstRegUse_ForFullWrite);
2672 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off,
2673 IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc),
2674 kIemNativeGstSimdRegLdStSz_Low128,
2675 kIemNativeGstRegUse_ReadOnly);
2676 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
2677
2678 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
2679 if (idxRegDst >= 8 || idxSimdRegSrc >= 8)
2680 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
2681 | (idxRegDst >= 8 ? X86_OP_REX_R : 0);
2682 pCodeBuf[off++] = 0x0f;
2683 pCodeBuf[off++] = 0xd7;
2684 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegDst & 7, idxSimdRegSrc & 7);
2685
2686 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc);
2687 iemNativeRegFreeTmp(pReNative, idxRegDst);
2688
2689#elif defined(RT_ARCH_ARM64)
2690 uint8_t const idxRegDst = iemNativeRegAllocTmpForGuestReg(pReNative, &off, IEMNATIVEGSTREG_GPR(idxGstRegDst),
2691 kIemNativeGstRegUse_ForFullWrite);
2692 uint8_t const idxRegTmp = iemNativeRegAllocTmp(pReNative, &off);
2693 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off,
2694 IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc),
2695 kIemNativeGstSimdRegLdStSz_Low128,
2696 kIemNativeGstRegUse_Calculation);
2697 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 7);
2698
2699 /*
2700 * See https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
2701 * for different approaches; NEON doesn't have an instruction equivalent to pmovmskb, so we have to emulate it.
2702 *
2703 * As there is no way around emulating the exact semantics of pmovmskb, we use the same algorithm
2704 * as the sse2neon implementation because there we can get away without loading any constants and the
2705 * base algorithm is only 4 NEON instructions (+ 3 for extracting the result to a general register).
2706 *
2707 * The following illustrates the algorithm:
2708 *
2709 * Byte vector Element -> 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
2710 * Instruction
2711 * |
2712 * V
2713 * Axxxxxxx Bxxxxxxx Cxxxxxxx Dxxxxxxx Exxxxxxx Fxxxxxxx Gxxxxxxx Hxxxxxxx Ixxxxxxx Jxxxxxxx Kxxxxxxx Lxxxxxxx Mxxxxxxx Nxxxxxxx Oxxxxxxx Pxxxxxxx
2714 * USHR v.16B, v.16B, #7 0000000A 0000000B 0000000C 0000000D 0000000E 0000000F 0000000G 0000000H 0000000I 0000000J 0000000K 0000000L 0000000M 0000000N 0000000O 0000000P
2715 * USRA v.8H, v.8H, #7 00000000 000000AB 00000000 000000CD 00000000 000000EF 00000000 000000GH 00000000 000000IJ 00000000 000000KL 00000000 000000MN 00000000 000000OP
2716 * USRA v.4S, v.4S, #14 00000000 00000000 00000000 0000ABCD 00000000 00000000 00000000 0000EFGH 00000000 00000000 00000000 0000IJKL 00000000 00000000 00000000 0000MNOP
2717 * USRA v.2D, v.2D, #28 00000000 00000000 00000000 00000000 00000000 00000000 00000000 ABCDEFGH 00000000 00000000 00000000 00000000 00000000 00000000 00000000 IJKLMNOP
2718 *
2719 * The extraction process
2720 * UMOV wTMP, v.16B[8] 00000000 00000000 00000000 00000000 00000000 00000000 00000000 ABCDEFGH
2721 * UMOV wRES, v.16B[0] 00000000 00000000 00000000 00000000 00000000 00000000 00000000 IJKLMNOP
2722 * ORR xRES, xRES, xTMP, LSL #8 00000000 00000000 00000000 00000000 00000000 00000000 ABCDEFGH IJKLMNOP
2723 */
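 /*
  * For reference, a scalar C sketch of the value the emitted sequence produces
  * (illustrative only; EmulPMovMskB is a hypothetical helper, not part of IEM):
  *
  *      static uint16_t EmulPMovMskB(uint8_t const *pabSrc)
  *      {
  *          uint16_t fMask = 0;
  *          for (unsigned i = 0; i < 16; i++)
  *              fMask |= (uint16_t)(pabSrc[i] >> 7) << i;   // MSB of each byte -> bit i
  *          return fMask;
  *      }
  *
  * The USHR/USRA chain gathers the byte MSBs pairwise inside the vector register,
  * and the two UMOVs plus the shifted ORR assemble the final 16-bit mask in the GPR.
  */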
2724 pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegSrc, idxSimdRegSrc, 7, kArmv8InstrShiftSz_U8);
2725 pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegSrc, idxSimdRegSrc, 7, kArmv8InstrShiftSz_U16, true /*fUnsigned*/, false /*fRound*/, true /*fAccum*/);
2726 pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegSrc, idxSimdRegSrc, 14, kArmv8InstrShiftSz_U32, true /*fUnsigned*/, false /*fRound*/, true /*fAccum*/);
2727 pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegSrc, idxSimdRegSrc, 28, kArmv8InstrShiftSz_U64, true /*fUnsigned*/, false /*fRound*/, true /*fAccum*/);
2728 pCodeBuf[off++] = Armv8A64MkVecInstrUmov(idxRegTmp, idxSimdRegSrc, 8, kArmv8InstrUmovInsSz_U8, false /*fDst64Bit*/);
2729 pCodeBuf[off++] = Armv8A64MkVecInstrUmov(idxRegDst, idxSimdRegSrc, 0, kArmv8InstrUmovInsSz_U8, false /*fDst64Bit*/);
2730 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDst, idxRegDst, idxRegTmp, true /*f64Bit*/, 8 /*offShift6*/);
2731
2732 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc);
2733 iemNativeRegFreeTmp(pReNative, idxRegTmp);
2734 iemNativeRegFreeTmp(pReNative, idxRegDst);
2735
2736#else
2737# error "Port me"
2738#endif
2739 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2740 return off;
2741}
2742
2743
2744/**
2745 * Common emitter for the PACKUSWB instruction - guest register / guest register variant.
2746 */
2747DECL_INLINE_THROW(uint32_t)
2748iemNativeEmit_packuswb_rr_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2749 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc)
2750{
2751 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
2752 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate);
2753 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc),
2754 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly);
2755
2756#ifdef RT_ARCH_AMD64
2757 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
2758
2759 /* packuswb xmm, xmm */
2760 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
2761 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8)
2762 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
2763 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0);
2764 pCodeBuf[off++] = 0x0f;
2765 pCodeBuf[off++] = 0x67;
2766 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7);
2767
2768#elif defined(RT_ARCH_ARM64)
2769 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2);
2770
2771 pCodeBuf[off++] = Armv8A64MkVecInstrQxtn(kArmv8VecInstrQxtnOp_Sqxtun, false /*fUpper*/, idxSimdRegDst, idxSimdRegDst, kArmv8VecInstrArithSz_8);
2772 pCodeBuf[off++] = Armv8A64MkVecInstrQxtn(kArmv8VecInstrQxtnOp_Sqxtun, true /*fUpper*/, idxSimdRegDst, idxSimdRegSrc, kArmv8VecInstrArithSz_8);
2773
2774#else
2775# error "port me"
2776#endif
2777
2778 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
2779 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc);
2780
2781 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2782 return off;
2783}
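/*
 * PACKUSWB saturation reminder: each signed 16-bit word is narrowed to an unsigned
 * byte with unsigned saturation, which is exactly what the SQXTUN/SQXTUN2 pair above
 * does on ARM64. Sample conversions:
 *
 *      0x007f (127)  -> 0x7f       0x0123 (291)  -> 0xff
 *      0xff80 (-128) -> 0x00       0x8000        -> 0x00
 */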
2784
2785
2786/**
2787 * Common emitter for the PACKUSWB instructions - guest register / recompiler variable variant.
2788 */
2789DECL_INLINE_THROW(uint32_t)
2790iemNativeEmit_packuswb_rv_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2791 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc)
2792{
2793 IEMNATIVE_ASSERT_VAR_IDX(pReNative, idxVarSrc);
2794 IEMNATIVE_ASSERT_VAR_SIZE(pReNative, idxVarSrc, sizeof(RTUINT128U));
2795
2796 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
2797 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate);
2798 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
2799
2800
2801#ifdef RT_ARCH_AMD64
2802 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
2803
2804 /* packuswb xmm, xmm */
2805 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
2806 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8)
2807 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
2808 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0);
2809 pCodeBuf[off++] = 0x0f;
2810 pCodeBuf[off++] = 0x67;
2811 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7);
2812
2813#elif defined(RT_ARCH_ARM64)
2814 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2);
2815
2816 pCodeBuf[off++] = Armv8A64MkVecInstrQxtn(kArmv8VecInstrQxtnOp_Sqxtun, false /*fUpper*/, idxSimdRegDst, idxSimdRegDst, kArmv8VecInstrArithSz_8);
2817 pCodeBuf[off++] = Armv8A64MkVecInstrQxtn(kArmv8VecInstrQxtnOp_Sqxtun, true /*fUpper*/, idxSimdRegDst, idxSimdRegSrc, kArmv8VecInstrArithSz_8);
2818
2819#else
2820# error "port me"
2821#endif
2822
2823 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
2824 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
2825
2826 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2827 return off;
2828}
2829
2830
2831/**
2832 * Common emitter for the pmov{s,z}x* instructions.
2833 */
2834#ifdef RT_ARCH_AMD64
2835# define IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(a_Instr, a_fArmUnsigned, a_ArmElemSz, a_bOpcX86) \
2836 DECL_INLINE_THROW(uint32_t) \
2837 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2838 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2839 { \
2840 if (idxSimdGstRegDst == idxSimdGstRegSrc) \
2841 { \
2842 uint8_t const idxSimdReg = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2843 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2844 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6); \
2845 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2846 if (idxSimdReg >= 8) \
2847 pCodeBuf[off++] = (idxSimdReg >= 8 ? X86_OP_REX_B | X86_OP_REX_R : 0); \
2848 pCodeBuf[off++] = 0x0f; \
2849 pCodeBuf[off++] = 0x38; \
2850 pCodeBuf[off++] = (a_bOpcX86); \
2851 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdReg & 7, idxSimdReg & 7); \
2852 iemNativeSimdRegFreeTmp(pReNative, idxSimdReg); \
2853 } \
2854 else \
2855 { \
2856 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2857 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2858 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2859 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite); \
2860 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6); \
2861 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2862 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2863 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2864 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2865 pCodeBuf[off++] = 0x0f; \
2866 pCodeBuf[off++] = 0x38; \
2867 pCodeBuf[off++] = (a_bOpcX86); \
2868 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2869 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2870 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2871 } \
2872 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2873 return off; \
2874 } \
2875 DECL_INLINE_THROW(uint32_t) \
2876 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2877 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2878 { \
2879 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2880 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite); \
2881 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2882 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 7 + 6); \
2883 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; /* Transfer value from GPR to temporary vector register using pinsrq. */ \
2884 pCodeBuf[off++] = X86_OP_REX_W \
2885 | (IEMNATIVE_SIMD_REG_FIXED_TMP0 < 8 ? 0 : X86_OP_REX_R) \
2886 | (idxRegSrc < 8 ? 0 : X86_OP_REX_B); \
2887 pCodeBuf[off++] = 0x0f; \
2888 pCodeBuf[off++] = 0x3a; \
2889 pCodeBuf[off++] = 0x22; \
2890 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, IEMNATIVE_SIMD_REG_FIXED_TMP0 & 7, idxRegSrc & 7); \
2891 pCodeBuf[off++] = 0; /* QWord */\
2892 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2893 if (idxSimdRegDst >= 8 || IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8) \
2894 pCodeBuf[off++] = (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 ? X86_OP_REX_B : 0) \
2895 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2896 pCodeBuf[off++] = 0x0f; \
2897 pCodeBuf[off++] = 0x38; \
2898 pCodeBuf[off++] = (a_bOpcX86); \
2899 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, IEMNATIVE_SIMD_REG_FIXED_TMP0 & 7); \
2900 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2901 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2902 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2903 return off; \
2904 } \
2905 typedef int ignore_semicolon
2906#elif defined(RT_ARCH_ARM64)
2907# define IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(a_Instr, a_fArmUnsigned, a_ArmElemSz, a_bOpcX86) \
2908 DECL_INLINE_THROW(uint32_t) \
2909 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2910 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2911 { \
2912 if (idxSimdGstRegDst == idxSimdGstRegSrc) \
2913 { \
2914 uint8_t const idxSimdReg = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2915 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2916 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2917 pCodeBuf[off++] = Armv8A64MkVecInstrUShll(idxSimdReg, idxSimdReg, 0, (a_ArmElemSz), (a_fArmUnsigned)); \
2918 iemNativeSimdRegFreeTmp(pReNative, idxSimdReg); \
2919 } \
2920 else \
2921 { \
2922 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2923 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2924 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2925 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite); \
2926 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2927 pCodeBuf[off++] = Armv8A64MkVecInstrUShll(idxSimdRegDst, idxSimdRegSrc, 0, (a_ArmElemSz), (a_fArmUnsigned)); \
2928 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2929 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2930 } \
2931 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2932 return off; \
2933 } \
2934 DECL_INLINE_THROW(uint32_t) \
2935 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2936 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2937 { \
2938 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2939 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite); \
2940 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2941 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2); \
2942 pCodeBuf[off++] = Armv8A64MkVecInstrIns(IEMNATIVE_SIMD_REG_FIXED_TMP0, idxRegSrc, 0 /*idxElem*/); /* Transfer value from GPR to temporary vector register. */ \
2943 pCodeBuf[off++] = Armv8A64MkVecInstrUShll(idxSimdRegDst, IEMNATIVE_SIMD_REG_FIXED_TMP0, 0, (a_ArmElemSz), (a_fArmUnsigned)); \
2944 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2945 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2946 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2947 return off; \
2948 } \
2949 typedef int ignore_semicolon
2950#else
2951# error "Port me"
2952#endif
2953
2954IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovzxbw, true, kArmv8InstrShiftSz_U8, 0x30);
2955IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovzxwd, true, kArmv8InstrShiftSz_U16, 0x33);
2956IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovzxdq, true, kArmv8InstrShiftSz_U32, 0x35);
2957
2958IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovsxbw, false, kArmv8InstrShiftSz_U8, 0x20);
2959IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovsxwd, false, kArmv8InstrShiftSz_U16, 0x23);
2960IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovsxdq, false, kArmv8InstrShiftSz_U32, 0x25);
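/*
 * Widening example with hypothetical source bytes (only the low half of the source
 * is consumed, matching USHLL/SSHLL #0 on ARM64):
 *
 *      source bytes:   01 82 7f 00 ...
 *      pmovzxbw:       0001 0082 007f 0000 ...   (zero extension)
 *      pmovsxbw:       0001 ff82 007f 0000 ...   (sign extension)
 */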
2961
2962
2963/**
2964 * Updates the MXCSR exception flags, raising any unmasked exceptions.
2965 */
2966DECL_INLINE_THROW(uint32_t)
2967iemNativeEmitMxcsrUpdate(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, uint8_t const idxSimdGstRegDst, uint8_t const idxSimdRegRes)
2968{
2969 uint8_t const idxRegMxCsr = iemNativeRegAllocTmpForGuestReg(pReNative, &off, kIemNativeGstReg_MxCsr, kIemNativeGstRegUse_ForUpdate);
2970 uint8_t const idxRegMxCsrXcptFlags = iemNativeRegAllocTmp(pReNative, &off);
2971 uint8_t const idxRegTmp = iemNativeRegAllocTmp(pReNative, &off);
2972
2973#ifdef RT_ARCH_AMD64
2974 PIEMNATIVEINSTR pbCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
2975
2976 /* stmxcsr */
2977 if (IEMNATIVE_REG_FIXED_PVMCPU >= 8)
2978 pbCodeBuf[off++] = X86_OP_REX_B;
2979 pbCodeBuf[off++] = 0x0f;
2980 pbCodeBuf[off++] = 0xae;
2981 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_MEM4, 3, IEMNATIVE_REG_FIXED_PVMCPU & 7);
2982 pbCodeBuf[off++] = RT_BYTE1(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2983 pbCodeBuf[off++] = RT_BYTE2(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2984 pbCodeBuf[off++] = RT_BYTE3(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2985 pbCodeBuf[off++] = RT_BYTE4(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2986
2987 /* Load MXCSR, mask everything except status flags and or into guest MXCSR. */
2988 off = iemNativeEmitLoadGprFromVCpuU32(pReNative, off, idxRegTmp, RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2989
2990 /* Store the flags in the MXCSR xcpt flags register. */
2991 off = iemNativeEmitLoadGprFromGpr32(pReNative, off, idxRegMxCsrXcptFlags, idxRegTmp);
2992 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegMxCsrXcptFlags, X86_MXCSR_XCPT_FLAGS);
2993
2994 /* Clear the status flags in the temporary copy and write it back to MXCSR. */
2995 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegTmp, ~X86_MXCSR_XCPT_FLAGS);
2996 off = iemNativeEmitStoreGprToVCpuU32(pReNative, off, idxRegTmp, RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2997
2998 pbCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
2999
3000 /* ldmxcsr */
3001 if (IEMNATIVE_REG_FIXED_PVMCPU >= 8)
3002 pbCodeBuf[off++] = X86_OP_REX_B;
3003 pbCodeBuf[off++] = 0x0f;
3004 pbCodeBuf[off++] = 0xae;
3005 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_MEM4, 2, IEMNATIVE_REG_FIXED_PVMCPU & 7);
3006 pbCodeBuf[off++] = RT_BYTE1(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
3007 pbCodeBuf[off++] = RT_BYTE2(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
3008 pbCodeBuf[off++] = RT_BYTE3(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
3009 pbCodeBuf[off++] = RT_BYTE4(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
3010
3011#elif defined(RT_ARCH_ARM64)
3012 PIEMNATIVEINSTR pu32CodeBuf = iemNativeInstrBufEnsure(pReNative, off, 7);
3013 pu32CodeBuf[off++] = Armv8A64MkInstrMrs(idxRegMxCsrXcptFlags, ARMV8_AARCH64_SYSREG_FPSR);
3014 pu32CodeBuf[off++] = Armv8A64MkInstrMsr(ARMV8_A64_REG_XZR, ARMV8_AARCH64_SYSREG_FPSR); /* Clear FPSR for next instruction. */
3015 pu32CodeBuf[off++] = Armv8A64MkInstrUxtb(idxRegMxCsrXcptFlags, idxRegMxCsrXcptFlags); /* Ensure there are only the exception flags set (clears QC, and any possible NZCV flags). */
3016
3017 /*
3018 * The exception flags layout differs between MXCSR and FPSR of course:
3019 *
3020 * Bit FPSR MXCSR
3021 *
3022 * 0 IOC ------> IE (bit 0)
3023 * 1 DZC ------> ZE (bit 2)
3024 * 2 OFC ------> OE (bit 3)
3025 * 3 UFC ------> UE (bit 4)
3026 * 4 IXC ------> PE (bit 5)
3027 * 5 (reserved)
3028 * 6 (reserved)
3029 * 7 IDC ------> DE (bit 1)
3030 *
3031 * I.e. IOC keeps its bit position, DZC thru IXC each move up one bit, and
3032 * IDC lands in bit 1 (DE).
3033 *
3034 */
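 /*
  * The intended mapping expressed as plain C (sketch; assumes the FPSR value has
  * already been truncated to its low byte by the UXTB above, and fFpsr/fMxcsrFlags
  * are illustrative names):
  *
  *      fMxcsrFlags = (fFpsr & 1)           // IOC -> IE (bit 0 stays put)
  *                  | ((fFpsr & 0x1e) << 1) // DZC/OFC/UFC/IXC -> ZE/OE/UE/PE (bits 2..5)
  *                  | ((fFpsr >> 7) << 1);  // IDC -> DE (bit 1)
  */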
3035 pu32CodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegMxCsrXcptFlags, 1); /* Shift the block of flags starting at DZC to the least significant bits. */
3036 pu32CodeBuf[off++] = Armv8A64MkInstrBfi(idxRegMxCsrXcptFlags, idxRegTmp, 2, 4); /* Insert DZC, OFC, UFC and IXC into the MXCSR positions. */
3037 pu32CodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegMxCsrXcptFlags, 6); /* Shift IDC (now at 6) into the LSB. */
3038 pu32CodeBuf[off++] = Armv8A64MkInstrBfi(idxRegMxCsrXcptFlags, idxRegTmp, 1, 1); /* Insert IDC into the MXCSR positions. */
3039#else
3040# error "Port me"
3041#endif
3042
3043 /*
3044 * If PE is set together with OE/UE and neither of the latter is masked,
3045 * PE needs to be cleared: on real hardware the exception is raised with
3046 * only OE/UE set, but since we run the instruction with all exceptions
3047 * masked, PE gets set as well.
3048 */
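 /*
  * Roughly the equivalent C of the register juggling below (sketch; fMxcsrFlags and
  * uGstMxCsr are illustrative names for the two allocated registers):
  *
  *      if (   (fMxcsrFlags & (X86_MXCSR_OE | X86_MXCSR_UE))
  *          & ~(uGstMxCsr >> X86_MXCSR_XCPT_MASK_SHIFT))
  *          fMxcsrFlags &= ~X86_MXCSR_PE;
  */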
3049 /** @todo On ARM we can combine the load+and into one and instruction. */
3050 /** @todo r=aeichner Can this be done more optimally? */
3051 uint8_t const idxRegTmp2 = iemNativeRegAllocTmp(pReNative, &off);
3052 off = iemNativeEmitLoadGprFromGpr32(pReNative, off, idxRegTmp, idxRegMxCsrXcptFlags);
3053 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegTmp, X86_MXCSR_OE | X86_MXCSR_UE);
3054 off = iemNativeEmitLoadGprFromGpr32(pReNative, off, idxRegTmp2, idxRegMxCsr);
3055 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegTmp2, X86_MXCSR_OM | X86_MXCSR_UM);
3056 off = iemNativeEmitShiftGprRight(pReNative, off, idxRegTmp2, X86_MXCSR_XCPT_MASK_SHIFT);
3057 off = iemNativeEmitInvBitsGpr(pReNative, off, idxRegTmp2, idxRegTmp2, false /*f64Bit*/);
3058 off = iemNativeEmitAndGpr32ByGpr32(pReNative, off, idxRegTmp2, idxRegTmp);
3059 off = iemNativeEmitTestAnyBitsInGpr(pReNative, off, idxRegTmp2, X86_MXCSR_OE | X86_MXCSR_UE);
3060
3061 uint32_t offFixup = off;
3062 off = iemNativeEmitJzToFixed(pReNative, off, off);
3063 off = iemNativeEmitBitClearInGpr32(pReNative, off, idxRegMxCsrXcptFlags, X86_MXCSR_PE_BIT);
3064 iemNativeFixupFixedJump(pReNative, offFixup, off);
3065 iemNativeRegFreeTmp(pReNative, idxRegTmp2);
3066
3067
3068 /* Set the MXCSR flags now. */
3069 off = iemNativeEmitOrGpr32ByGpr(pReNative, off, idxRegMxCsr, idxRegMxCsrXcptFlags);
3070
3071 /*
3072 * Make sure we don't have any outstanding guest register writes as we may
3073 * raise an \#UD or \#XF and all guest registers must be up to date in CPUMCTX.
3074 */
3075 off = iemNativeRegFlushPendingWrites(pReNative, off);
3076
3077#ifdef IEMNATIVE_WITH_INSTRUCTION_COUNTING
3078 off = iemNativeEmitStoreImmToVCpuU8(pReNative, off, idxInstr, RT_UOFFSETOF(VMCPUCC, iem.s.idxTbCurInstr));
3079#else
3080 RT_NOREF(idxInstr);
3081#endif
3082
3083 /* Check whether an exception is pending and only update the guest SIMD register if it isn't. */
3084 /* mov tmp, varmxcsr */
3085 off = iemNativeEmitLoadGprFromGpr32(pReNative, off, idxRegTmp, idxRegMxCsr);
3086 /* tmp >>= X86_MXCSR_XCPT_MASK_SHIFT */
3087 off = iemNativeEmitShiftGprRight(pReNative, off, idxRegTmp, X86_MXCSR_XCPT_MASK_SHIFT);
3088 /* tmp = ~tmp */
3089 off = iemNativeEmitInvBitsGpr(pReNative, off, idxRegTmp, idxRegTmp, false /*f64Bit*/);
3090 /* tmp &= mxcsr */
3091 off = iemNativeEmitAndGpr32ByGpr32(pReNative, off, idxRegMxCsrXcptFlags, idxRegTmp);
3092 off = iemNativeEmitTbExitIfAnyBitsSetInGpr<kIemNativeLabelType_RaiseSseAvxFpRelated>(pReNative, off, idxRegMxCsrXcptFlags,
3093 X86_MXCSR_XCPT_FLAGS);
3094
3095 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
3096 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite);
3097
3098 /* Move result to guest SIMD register (at this point there is no exception being raised). */
3099 off = iemNativeEmitSimdLoadVecRegFromVecRegU128(pReNative, off, idxSimdRegDst, idxSimdRegRes);
3100
3101 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
3102 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
3103 iemNativeRegFreeTmp(pReNative, idxRegTmp);
3104 iemNativeRegFreeTmp(pReNative, idxRegMxCsrXcptFlags);
3105 iemNativeRegFreeTmp(pReNative, idxRegMxCsr);
3106 return off;
3107}
3108
3109
3110/**
3111 * Common emitter for packed floating point instructions with 3 operands - register, register variant.
3112 */
3113DECL_INLINE_THROW(uint32_t) iemNativeEmitSimdFp3OpCommon_rr_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr,
3114 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc,
3115#ifdef RT_ARCH_AMD64
3116 uint8_t const bPrefixX86, uint8_t const bOpcX86
3117#elif defined(RT_ARCH_ARM64)
3118 ARMV8INSTRVECFPOP const enmFpOp, ARMV8INSTRVECFPSZ const enmFpSz
3119#endif
3120 )
3121{
3122 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
3123 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly);
3124 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc),
3125 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly);
3126
3127#ifdef RT_ARCH_AMD64
3128 off = iemNativeEmitSimdLoadVecRegFromVecRegU128(pReNative, off, IEMNATIVE_SIMD_REG_FIXED_TMP0, idxSimdRegDst);
3129 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
3130 if (bPrefixX86 != 0)
3131 pCodeBuf[off++] = bPrefixX86;
3132 if (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 || idxSimdRegSrc >= 8)
3133 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
3134 | (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 ? X86_OP_REX_R : 0);
3135 pCodeBuf[off++] = 0x0f;
3136 pCodeBuf[off++] = bOpcX86;
3137 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, IEMNATIVE_SIMD_REG_FIXED_TMP0 & 7, idxSimdRegSrc & 7);
3138#elif defined(RT_ARCH_ARM64)
3139 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
3140 pCodeBuf[off++] = Armv8A64MkVecInstrFp3Op(enmFpOp, enmFpSz, IEMNATIVE_SIMD_REG_FIXED_TMP0, idxSimdRegDst, idxSimdRegSrc);
3141#else
3142# error "Port me"
3143#endif
3144 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
3145 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc);
3146 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
3147 return iemNativeEmitMxcsrUpdate(pReNative, off, idxInstr, idxSimdGstRegDst, IEMNATIVE_SIMD_REG_FIXED_TMP0);
3148}
3149
3150
3151/**
3152 * Common emitter for packed floating point instructions with 3 operands - register, local variable variant.
3153 */
3154DECL_INLINE_THROW(uint32_t) iemNativeEmitSimdFp3OpCommon_rv_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr,
3155 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc,
3156#ifdef RT_ARCH_AMD64
3157 uint8_t const bPrefixX86, uint8_t const bOpcX86
3158#elif defined(RT_ARCH_ARM64)
3159 ARMV8INSTRVECFPOP const enmFpOp, ARMV8INSTRVECFPSZ const enmFpSz
3160#endif
3161 )
3162{
3163 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
3164 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly);
3165 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
3166
3167#ifdef RT_ARCH_AMD64
3168 off = iemNativeEmitSimdLoadVecRegFromVecRegU128(pReNative, off, IEMNATIVE_SIMD_REG_FIXED_TMP0, idxSimdRegDst);
3169 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
3170 if (bPrefixX86 != 0)
3171 pCodeBuf[off++] = bPrefixX86;
3172 if (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 || idxSimdRegSrc >= 8)
3173 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
3174 | (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 ? X86_OP_REX_R : 0);
3175 pCodeBuf[off++] = 0x0f;
3176 pCodeBuf[off++] = bOpcX86;
3177 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, IEMNATIVE_SIMD_REG_FIXED_TMP0 & 7, idxSimdRegSrc & 7);
3178#elif defined(RT_ARCH_ARM64)
3179 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
3180 pCodeBuf[off++] = Armv8A64MkVecInstrFp3Op(enmFpOp, enmFpSz, IEMNATIVE_SIMD_REG_FIXED_TMP0, idxSimdRegDst, idxSimdRegSrc);
3181#else
3182# error "Port me"
3183#endif
3184 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
3185 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
3186 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
3187 return iemNativeEmitMxcsrUpdate(pReNative, off, idxInstr, idxSimdGstRegDst, IEMNATIVE_SIMD_REG_FIXED_TMP0);
3188}
3189
3190
3191/**
3192 * Common emitter for packed floating point instructions with 3 operands.
3193 */
3194#ifdef RT_ARCH_AMD64
3195# define IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(a_Instr, a_enmArmOp, a_ArmElemSz, a_bPrefixX86, a_bOpcX86) \
3196 DECL_FORCE_INLINE_THROW(uint32_t) \
3197 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, \
3198 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
3199 { \
3200 return iemNativeEmitSimdFp3OpCommon_rr_u128(pReNative, off, idxInstr, idxSimdGstRegDst, idxSimdGstRegSrc, \
3201 a_bPrefixX86, a_bOpcX86); \
3202 } \
3203 DECL_FORCE_INLINE_THROW(uint32_t) \
3204 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, \
3205 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
3206 { \
3207 return iemNativeEmitSimdFp3OpCommon_rv_u128(pReNative, off, idxInstr, idxSimdGstRegDst, idxVarSrc, \
3208 a_bPrefixX86, a_bOpcX86); \
3209 } \
3210 typedef int ignore_semicolon
3211#elif defined(RT_ARCH_ARM64)
3212# define IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(a_Instr, a_enmArmOp, a_ArmElemSz, a_bPrefixX86, a_bOpcX86) \
3213 DECL_FORCE_INLINE_THROW(uint32_t) \
3214 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, \
3215 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
3216 { \
3217 return iemNativeEmitSimdFp3OpCommon_rr_u128(pReNative, off, idxInstr, idxSimdGstRegDst, idxSimdGstRegSrc, \
3218 a_enmArmOp, a_ArmElemSz); \
3219 } \
3220 DECL_FORCE_INLINE_THROW(uint32_t) \
3221 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, \
3222 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
3223 { \
3224 return iemNativeEmitSimdFp3OpCommon_rv_u128(pReNative, off, idxInstr, idxSimdGstRegDst, idxVarSrc, \
3225 a_enmArmOp, a_ArmElemSz); \
3226 } \
3227 typedef int ignore_semicolon
3228#else
3229# error "Port me"
3230#endif
3231
3232
3233IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(mulps, kArmv8VecInstrFpOp_Mul, kArmv8VecInstrFpSz_4x_Single, 0, 0x59);
3234IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(addps, kArmv8VecInstrFpOp_Add, kArmv8VecInstrFpSz_4x_Single, 0, 0x58);
3235IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(addpd, kArmv8VecInstrFpOp_Add, kArmv8VecInstrFpSz_2x_Double, X86_OP_PRF_SIZE_OP, 0x58);
3236IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(subps, kArmv8VecInstrFpOp_Sub, kArmv8VecInstrFpSz_4x_Single, 0, 0x5c);
3237
3238#endif /* IEMNATIVE_WITH_SIMD_REG_ALLOCATOR */
3239
3240#endif /* !VMM_INCLUDED_SRC_VMMAll_target_x86_IEMAllN8veEmit_x86_h */