VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/target-x86/IEMAllN8veEmit-x86.h@ 106201

Last change on this file since 106201 was 106201, checked in by vboxsync, 2 months ago

VMM/IEM: A couple of debug build fixes for arm. bugref:10720

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 160.0 KB
1/* $Id: IEMAllN8veEmit-x86.h 106201 2024-10-01 23:48:36Z vboxsync $ */
2/** @file
3 * IEM - Native Recompiler, x86 Target - Code Emitters.
4 */
5
6/*
7 * Copyright (C) 2023-2024 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28#ifndef VMM_INCLUDED_SRC_VMMAll_target_x86_IEMAllN8veEmit_x86_h
29#define VMM_INCLUDED_SRC_VMMAll_target_x86_IEMAllN8veEmit_x86_h
30#ifndef RT_WITHOUT_PRAGMA_ONCE
31# pragma once
32#endif
33
34
35#ifdef RT_ARCH_AMD64
36
37/**
38 * Emits a ModR/M instruction with one opcode byte and only register operands.
39 */
40DECL_FORCE_INLINE(uint32_t)
41iemNativeEmitAmd64OneByteModRmInstrRREx(PIEMNATIVEINSTR pCodeBuf, uint32_t off, uint8_t bOpcode8, uint8_t bOpcodeOther,
42 uint8_t cOpBits, uint8_t idxRegReg, uint8_t idxRegRm)
43{
44 Assert(idxRegReg < 16); Assert(idxRegRm < 16);
45 switch (cOpBits)
46 {
47 case 16:
48 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
49 RT_FALL_THRU();
50 case 32:
51 if (idxRegReg >= 8 || idxRegRm >= 8)
52 pCodeBuf[off++] = (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
53 pCodeBuf[off++] = bOpcodeOther;
54 break;
55
56 default: AssertFailed(); RT_FALL_THRU();
57 case 64:
58 pCodeBuf[off++] = X86_OP_REX_W | (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
59 pCodeBuf[off++] = bOpcodeOther;
60 break;
61
62 case 8:
63 if (idxRegReg >= 8 || idxRegRm >= 8)
64 pCodeBuf[off++] = (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
65 else if (idxRegReg >= 4 || idxRegRm >= 4)
66 pCodeBuf[off++] = X86_OP_REX;
67 pCodeBuf[off++] = bOpcode8;
68 break;
69 }
70 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg & 7, idxRegRm & 7);
71 return off;
72}
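/* Usage sketch (illustrative, mirroring the callers further down): emitting a 32-bit
   register-to-register AND would pass the 0x22 (Eb,Gb) / 0x23 (Ev,Gv) opcode pair:

       off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0x22, 0x23,
                                                     32, idxRegDst, idxRegSrc);

   With idxRegDst=2 (EDX) and idxRegSrc=8 (R8D) this should produce 41 23 d0, i.e.
   a REX.B prefix, the opcode, and a mod=11 ModR/M byte ("and edx, r8d"). */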
73
74
75/**
76 * Emits a ModR/M instruction with two opcode bytes and only register operands.
77 */
78DECL_FORCE_INLINE(uint32_t)
79iemNativeEmitAmd64TwoByteModRmInstrRREx(PIEMNATIVEINSTR pCodeBuf, uint32_t off,
80 uint8_t bOpcode0, uint8_t bOpcode8, uint8_t bOpcodeOther,
81 uint8_t cOpBits, uint8_t idxRegReg, uint8_t idxRegRm)
82{
83 Assert(idxRegReg < 16); Assert(idxRegRm < 16);
84 switch (cOpBits)
85 {
86 case 16:
87 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
88 RT_FALL_THRU();
89 case 32:
90 if (idxRegReg >= 8 || idxRegRm >= 8)
91 pCodeBuf[off++] = (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
92 pCodeBuf[off++] = bOpcode0;
93 pCodeBuf[off++] = bOpcodeOther;
94 break;
95
96 default: AssertFailed(); RT_FALL_THRU();
97 case 64:
98 pCodeBuf[off++] = X86_OP_REX_W | (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
99 pCodeBuf[off++] = bOpcode0;
100 pCodeBuf[off++] = bOpcodeOther;
101 break;
102
103 case 8:
104 if (idxRegReg >= 8 || idxRegRm >= 8)
105 pCodeBuf[off++] = (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
106 else if (idxRegReg >= 4 || idxRegRm >= 4)
107 pCodeBuf[off++] = X86_OP_REX;
108 pCodeBuf[off++] = bOpcode0;
109 pCodeBuf[off++] = bOpcode8;
110 break;
111 }
112 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg & 7, idxRegRm & 7);
113 return off;
114}
115
116
117/**
118 * Emits one of three opcodes with an immediate.
119 *
120 * These are expected to be of the /idxRegReg form.
121 */
122DECL_FORCE_INLINE(uint32_t)
123iemNativeEmitAmd64OneByteModRmInstrRIEx(PIEMNATIVEINSTR pCodeBuf, uint32_t off, uint8_t bOpcode8, uint8_t bOpcodeOtherImm8,
124 uint8_t bOpcodeOther, uint8_t cOpBits, uint8_t cImmBits, uint8_t idxRegReg,
125 uint8_t idxRegRm, uint64_t uImmOp)
126{
127 Assert(idxRegReg < 8); Assert(idxRegRm < 16);
128 if ( cImmBits == 8
129 || (uImmOp <= (uint64_t)0x7f && bOpcodeOtherImm8 != 0xcc))
130 {
131 switch (cOpBits)
132 {
133 case 16:
134 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
135 RT_FALL_THRU();
136 case 32:
137 if (idxRegRm >= 8)
138 pCodeBuf[off++] = X86_OP_REX_B;
139 pCodeBuf[off++] = bOpcodeOtherImm8; Assert(bOpcodeOtherImm8 != 0xcc);
140 break;
141
142 default: AssertFailed(); RT_FALL_THRU();
143 case 64:
144 pCodeBuf[off++] = X86_OP_REX_W | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
145 pCodeBuf[off++] = bOpcodeOtherImm8; Assert(bOpcodeOtherImm8 != 0xcc);
146 break;
147
148 case 8:
149 if (idxRegRm >= 8)
150 pCodeBuf[off++] = X86_OP_REX_B;
151 else if (idxRegRm >= 4)
152 pCodeBuf[off++] = X86_OP_REX;
153 pCodeBuf[off++] = bOpcode8; Assert(bOpcode8 != 0xcc);
154 break;
155 }
156 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg, idxRegRm & 7);
157 pCodeBuf[off++] = (uint8_t)uImmOp;
158 }
159 else
160 {
161 switch (cOpBits)
162 {
163 case 32:
164 if (idxRegRm >= 8)
165 pCodeBuf[off++] = X86_OP_REX_B;
166 break;
167
168 default: AssertFailed(); RT_FALL_THRU();
169 case 64:
170 pCodeBuf[off++] = X86_OP_REX_W | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
171 break;
172
173 case 16:
174 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
175 if (idxRegRm >= 8)
176 pCodeBuf[off++] = X86_OP_REX_B;
177 pCodeBuf[off++] = bOpcodeOther;
178 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg, idxRegRm & 7);
179 pCodeBuf[off++] = RT_BYTE1(uImmOp);
180 pCodeBuf[off++] = RT_BYTE2(uImmOp);
181 Assert(cImmBits == 16);
182 return off;
183 }
184 pCodeBuf[off++] = bOpcodeOther;
185 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg, idxRegRm & 7);
186 pCodeBuf[off++] = RT_BYTE1(uImmOp);
187 pCodeBuf[off++] = RT_BYTE2(uImmOp);
188 pCodeBuf[off++] = RT_BYTE3(uImmOp);
189 pCodeBuf[off++] = RT_BYTE4(uImmOp);
190 Assert(cImmBits == 32);
191 }
192 return off;
193}
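/* Usage sketch (illustrative): the group-1 immediate forms are selected via the
   ModR/M reg field, e.g. the AND-with-immediate emitter below passes /4:

       off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81,
                                                     a_cOpBits, a_cImmBits, 4 /*AND*/,
                                                     idxRegDst, uImmOp);

   Immediates no larger than 0x7f (or explicit 8-bit immediates) take the sign-extending
   0x83 form, larger ones the 0x81 imm16/imm32 form; passing 0xcc for bOpcodeOtherImm8
   (as the TEST emitter does) disables the short form, TEST having no such variant. */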
194
195#endif /* RT_ARCH_AMD64 */
196
197
198
199/*********************************************************************************************************************************
200* EFLAGS *
201*********************************************************************************************************************************/
202
203#ifdef IEMNATIVE_WITH_EFLAGS_POSTPONING
204
205/** @def IEMNATIVE_POSTPONING_REG_MASK
206 * Register suitable for keeping the inputs or result for a postponed EFLAGS
207 * calculation.
208 *
209 * We use non-volatile registers here so we don't have to save & restore them
210 * across callouts (i.e. TLB loads).
211 *
212 * @note On x86 we cannot use RDI and RSI because these are used by the
213 * opcode checking code. The usual joy of the x86 instruction set.
214 */
215# ifdef RT_ARCH_AMD64
216# define IEMNATIVE_POSTPONING_REG_MASK \
217 (IEMNATIVE_CALL_NONVOLATILE_GREG_MASK & ~(RT_BIT_32(X86_GREG_xDI) | RT_BIT_32(X86_GREG_xSI)))
218# else
219# define IEMNATIVE_POSTPONING_REG_MASK IEMNATIVE_CALL_NONVOLATILE_GREG_MASK
220# endif
221
222/**
223 * This is normally invoked via IEMNATIVE_CLEAR_POSTPONED_EFLAGS().
224 */
225template<uint32_t const a_fEflClobbered>
226DECL_FORCE_INLINE(void) iemNativeClearPostponedEFlags(PIEMRECOMPILERSTATE pReNative)
227{
228 AssertCompile(!(a_fEflClobbered & ~X86_EFL_STATUS_BITS));
229 uint32_t fEFlags = pReNative->PostponedEfl.fEFlags;
230 if (fEFlags)
231 {
232 if RT_CONSTEXPR_IF(a_fEflClobbered != X86_EFL_STATUS_BITS)
233 {
234 fEFlags &= ~a_fEflClobbered;
235 if (!fEFlags)
236 { /* likely */ }
237 else
238 {
239 Log5(("EFLAGS: Clobbering %#x: %#x -> %#x (op=%d bits=%u) - iemNativeClearPostponedEFlags\n", a_fEflClobbered,
240 pReNative->PostponedEfl.fEFlags, fEFlags, pReNative->PostponedEfl.enmOp, pReNative->PostponedEfl.cOpBits));
241 pReNative->PostponedEfl.fEFlags = fEFlags;
242 return;
243 }
244 }
245
246 /* Do cleanup. */
247 Log5(("EFLAGS: Cleanup of op=%u bits=%u efl=%#x upon clobbering %#x - iemNativeClearPostponedEFlags\n",
248 pReNative->PostponedEfl.enmOp, pReNative->PostponedEfl.cOpBits, pReNative->PostponedEfl.fEFlags, a_fEflClobbered));
249 pReNative->PostponedEfl.fEFlags = 0;
250 pReNative->PostponedEfl.enmOp = kIemNativePostponedEflOp_Invalid;
251 pReNative->PostponedEfl.cOpBits = 0;
252 iemNativeRegFreeTmp(pReNative, pReNative->PostponedEfl.idxReg1);
253 if (pReNative->PostponedEfl.idxReg2 != UINT8_MAX)
254 iemNativeRegFreeTmp(pReNative, pReNative->PostponedEfl.idxReg2);
255 pReNative->PostponedEfl.idxReg1 = UINT8_MAX;
256 pReNative->PostponedEfl.idxReg2 = UINT8_MAX;
257# if defined(VBOX_WITH_STATISTICS) || defined(IEMNATIVE_WITH_TB_DEBUG_INFO)
258 STAM_PROFILE_ADD_PERIOD(&pReNative->pVCpu->iem.s.StatNativeEflPostponedEmits, pReNative->PostponedEfl.cEmits);
259 pReNative->PostponedEfl.cEmits = 0;
260# endif
261 }
262}
263
264#endif /* IEMNATIVE_WITH_EFLAGS_POSTPONING */
265
266
267template<bool const a_fDoOp>
268DECL_INLINE_THROW(uint32_t) iemNativeEmitPostponedEFlagsCalcLogical(PIEMNATIVEINSTR pCodeBuf, uint32_t off, uint8_t cOpBits,
269 uint8_t idxRegResult, uint8_t idxRegEfl, uint8_t idxRegTmp)
270{
271#ifdef RT_ARCH_AMD64
272 /* Do TEST idxRegResult, idxRegResult to set flags. */
273 if RT_CONSTEXPR_IF(a_fDoOp)
274 off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0x84, 0x85, cOpBits, idxRegResult, idxRegResult);
275
276 /*
277 * Collect the EFLAGS status bits.
278 * We know that the overflow bit will always be cleared, so LAHF can be used.
279 */
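    /* (LAHF loads AH = SF:ZF:0:AF:0:PF:1:CF, i.e. the low byte of EFLAGS, which is
       exactly the set needed here given that OF is cleared separately below.) */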
280 if (idxRegTmp == X86_GREG_xAX)
281 {
282 /* lahf ; AH = EFLAGS */
283 pCodeBuf[off++] = 0x9f;
284 if (idxRegEfl <= X86_GREG_xBX)
285 {
286 /* mov [CDB]L, AH */
287 pCodeBuf[off++] = 0x88;
288 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, idxRegEfl);
289 }
290 else
291 {
292 /* mov AL, AH */
293 pCodeBuf[off++] = 0x88;
294 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, 0 /*AL*/);
295 /* mov xxL, AL */
296 pCodeBuf[off++] = idxRegEfl >= 8 ? X86_OP_REX_B : X86_OP_REX;
297 pCodeBuf[off++] = 0x88;
298 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 0 /*AL*/, idxRegEfl & 7);
299 }
300 }
301 else if (idxRegEfl != X86_GREG_xAX)
302 {
303# if 1 /* This is 1 or 4 bytes larger, but avoids the stack. */
304 /* xchg rax, tmp */
305 pCodeBuf[off++] = idxRegTmp < 8 ? X86_OP_REX_W : X86_OP_REX_B | X86_OP_REX_W;
306 pCodeBuf[off++] = 0x90 + (idxRegTmp & 7);
307
308 /* lahf ; AH = EFLAGS */
309 pCodeBuf[off++] = 0x9f;
310 if (idxRegEfl <= X86_GREG_xBX)
311 {
312 /* mov [CDB]L, AH */
313 pCodeBuf[off++] = 0x88;
314 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, idxRegEfl);
315 }
316 else
317 {
318 /* mov AL, AH */
319 pCodeBuf[off++] = 0x88;
320 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, 0 /*AL*/);
321 /* mov xxL, AL */
322 pCodeBuf[off++] = idxRegEfl >= 8 ? X86_OP_REX_B : X86_OP_REX;
323 pCodeBuf[off++] = 0x88;
324 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 0 /*AL*/, idxRegEfl & 7);
325 }
326
327 /* xchg rax, tmp */
328 pCodeBuf[off++] = idxRegTmp < 8 ? X86_OP_REX_W : X86_OP_REX_B | X86_OP_REX_W;
329 pCodeBuf[off++] = 0x90 + (idxRegTmp & 7);
330
331# else
332 /* pushf */
333 pCodeBuf[off++] = 0x9c;
334 /* pop tmp */
335 if (idxRegTmp >= 8)
336 pCodeBuf[off++] = X86_OP_REX_B;
337 pCodeBuf[off++] = 0x58 + (idxRegTmp & 7);
338 /* mov byte(efl), byte(tmp) */
339 if (idxRegEfl >= 4 || idxRegTmp >= 4)
340 pCodeBuf[off++] = (idxRegEfl >= 8 ? X86_OP_REX_B : X86_OP_REX)
341 | (idxRegTmp >= 8 ? X86_OP_REX_R : 0);
342 pCodeBuf[off++] = 0x88;
343 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegTmp & 7, idxRegEfl & 7);
344# endif
345 }
346 else
347 {
348 /* xchg al, ah */
349 pCodeBuf[off++] = 0x86;
350 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, 0 /*AL*/);
351 /* lahf ; AH = EFLAGS */
352 pCodeBuf[off++] = 0x9f;
353 /* xchg al, ah */
354 pCodeBuf[off++] = 0x86;
355 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, 0 /*AL*/);
356 }
357 /* BTR idxEfl, 11; Clear OF */
358 if (idxRegEfl >= 8)
359 pCodeBuf[off++] = X86_OP_REX_B;
360 pCodeBuf[off++] = 0xf;
361 pCodeBuf[off++] = 0xba;
362 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 6, idxRegEfl & 7);
363 pCodeBuf[off++] = X86_EFL_OF_BIT;
364
365#elif defined(RT_ARCH_ARM64)
366 /*
367 * Calculate flags.
368 */
369 /* Clear the status bits. ~0x8D5 (or ~0x8FD) can't be AND immediate, so use idxRegTmp for constant. */
370 off = iemNativeEmitLoadGpr32ImmExT<~X86_EFL_STATUS_BITS>(pCodeBuf, off, idxRegTmp);
371 off = iemNativeEmitAndGpr32ByGpr32Ex(pCodeBuf, off, idxRegEfl, idxRegTmp);
372
373 /* N,Z -> SF,ZF */
374 if (cOpBits < 32)
375 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegResult, cOpBits > 8); /* sets NZ */
376 else if RT_CONSTEXPR_IF(a_fDoOp)
377 pCodeBuf[off++] = Armv8A64MkInstrAnds(ARMV8_A64_REG_XZR, idxRegResult, idxRegResult, cOpBits > 32 /*f64Bit*/);
378 pCodeBuf[off++] = Armv8A64MkInstrMrs(idxRegTmp, ARMV8_AARCH64_SYSREG_NZCV); /* Bits: 31=N; 30=Z; 29=C; 28=V; */
379 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegTmp, 30);
380 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_ZF_BIT, 2, false /*f64Bit*/);
381 AssertCompile(X86_EFL_ZF_BIT + 1 == X86_EFL_SF_BIT);
382
383 /* Calculate 8-bit parity of the result. */
384 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegResult, idxRegResult, false /*f64Bit*/,
385 4 /*offShift6*/, kArmv8A64InstrShift_Lsr);
386 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegTmp, idxRegTmp, false /*f64Bit*/,
387 2 /*offShift6*/, kArmv8A64InstrShift_Lsr);
388 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegTmp, idxRegTmp, false /*f64Bit*/,
389 1 /*offShift6*/, kArmv8A64InstrShift_Lsr);
390 Assert(Armv8A64ConvertImmRImmS2Mask32(0, 0) == 1);
391 pCodeBuf[off++] = Armv8A64MkInstrEorImm(idxRegTmp, idxRegTmp, 0, 0, false /*f64Bit*/);
392 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_PF_BIT, 1, false /*f64Bit*/);
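    /* A rough C sketch of the EOR/LSR fold above (same idea, scalar form):
           r ^= r >> 4;  r ^= r >> 2;  r ^= r >> 1;   -- bit 0 = XOR of the low 8 result bits
           fPf = (r & 1) ^ 1;                          -- x86 PF is set for EVEN parity
       which is what the final EOR-with-1 and the BFI into X86_EFL_PF_BIT implement. */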
393
394#else
395# error "port me"
396#endif
397 return off;
398}
399
400#ifdef IEMNATIVE_WITH_EFLAGS_POSTPONING
401
402template<uint32_t const a_bmInputRegs, bool const a_fTlbMiss = false>
403static uint32_t iemNativeDoPostponedEFlagsInternal(PIEMRECOMPILERSTATE pReNative, uint32_t off, PIEMNATIVEINSTR pCodeBuf,
404 uint32_t bmExtraTlbMissRegs = 0)
405{
406# ifdef IEMNATIVE_WITH_TB_DEBUG_INFO
407 iemNativeDbgInfoAddPostponedEFlagsCalc(pReNative, off, pReNative->PostponedEfl.enmOp, pReNative->PostponedEfl.cOpBits,
408 pReNative->PostponedEfl.cEmits);
409# endif
410
411 /*
412 * In the TB exit code path we cannot do regular register allocation. Nor
413 * can we when we're in the TLB miss code, unless we're skipping the TLB
414 * lookup. Since the latter isn't an important use case and should get along
415 * fine on just volatile registers, we do not need to do anything special
416 * for it.
417 *
418 * So, we do our own register allocating here. Any register goes in the TB
419 * exit path, excluding a_bmInputRegs, fixed and postponed related registers.
420 * In the TLB miss we can use any volatile register and temporary registers
421 * allocated in the TLB state.
422 *
423 * Note! On x86 we prefer using RAX as the first TMP register, so we can
424 * make use of LAHF which is typically faster than PUSHF/POP. This
425 * is why the idxRegTmp allocation is first when there is no EFLAG
426 * shadow, since RAX is represented by bit 0 in the mask.
427 */
428 uint32_t bmAvailableRegs;
429 if RT_CONSTEXPR_IF(!a_fTlbMiss)
430 {
431 bmAvailableRegs = ~(a_bmInputRegs | IEMNATIVE_REG_FIXED_MASK) & IEMNATIVE_HST_GREG_MASK;
432 if (pReNative->PostponedEfl.idxReg2 != UINT8_MAX)
433 bmAvailableRegs &= ~(RT_BIT_32(pReNative->PostponedEfl.idxReg1) | RT_BIT_32(pReNative->PostponedEfl.idxReg2));
434 else
435 bmAvailableRegs &= ~RT_BIT_32(pReNative->PostponedEfl.idxReg1);
436 }
437 else
438 {
439 /* Note! a_bmInputRegs takes precedence over bmExtraTlbMissRegs. */
440 bmAvailableRegs = (IEMNATIVE_CALL_VOLATILE_GREG_MASK | bmExtraTlbMissRegs)
441 & ~(a_bmInputRegs | IEMNATIVE_REG_FIXED_MASK)
442 & IEMNATIVE_HST_GREG_MASK;
443 }
444
445 /* Use existing EFLAGS shadow if available. For the TLB-miss code path we
446 need to weed out volatile registers here, as they will no longer be valid. */
447 uint8_t idxRegTmp;
448 uint8_t idxRegEfl = pReNative->Core.aidxGstRegShadows[kIemNativeGstReg_EFlags];
449 if ( (pReNative->Core.bmGstRegShadows & RT_BIT_64(kIemNativeGstReg_EFlags))
450 && (!a_fTlbMiss || !(RT_BIT_32(idxRegEfl) & IEMNATIVE_CALL_VOLATILE_GREG_MASK)))
451 {
452 Assert(idxRegEfl < IEMNATIVE_HST_GREG_COUNT);
453 Assert(!(a_bmInputRegs & RT_BIT_32(idxRegEfl)));
454 if RT_CONSTEXPR_IF(!a_fTlbMiss) Assert(bmAvailableRegs & RT_BIT_32(idxRegEfl));
455 bmAvailableRegs &= ~RT_BIT_32(idxRegEfl);
456# ifdef VBOX_STRICT
457 off = iemNativeEmitGuestRegValueCheckEx(pReNative, pCodeBuf, off, idxRegEfl, kIemNativeGstReg_EFlags);
458# endif
459
460 idxRegTmp = ASMBitFirstSetU32(bmAvailableRegs) - 1;
461 bmAvailableRegs &= ~RT_BIT_32(idxRegTmp);
462 }
463 else
464 {
465 idxRegTmp = ASMBitFirstSetU32(bmAvailableRegs) - 1; /* allocate the temp register first to prioritize EAX on x86. */
466 bmAvailableRegs &= ~RT_BIT_32(idxRegTmp);
467
468 idxRegEfl = ASMBitFirstSetU32(bmAvailableRegs) - 1;
469 bmAvailableRegs &= ~RT_BIT_32(idxRegEfl);
470 off = iemNativeEmitLoadGprFromVCpuU32Ex(pCodeBuf, off, idxRegEfl, RT_UOFFSETOF(VMCPU, cpum.GstCtx.eflags));
471 }
472 Assert(bmAvailableRegs != 0);
473
474 /*
475 * Do the actual EFLAGS calculation.
476 */
477 switch (pReNative->PostponedEfl.enmOp)
478 {
479 case kIemNativePostponedEflOp_Logical:
480 Assert(pReNative->PostponedEfl.idxReg2 == UINT8_MAX);
481 off = iemNativeEmitPostponedEFlagsCalcLogical<true>(pCodeBuf, off, pReNative->PostponedEfl.cOpBits,
482 pReNative->PostponedEfl.idxReg1, idxRegEfl, idxRegTmp);
483 break;
484
485 default:
486 AssertFailedBreak();
487 }
488
489 /*
490 * Store EFLAGS.
491 */
492# ifdef VBOX_STRICT
493 /* check that X86_EFL_1 is set. */
494 uint32_t offFixup1;
495 off = iemNativeEmitTestBitInGprAndJmpToFixedIfSetEx(pCodeBuf, off, idxRegEfl, X86_EFL_1_BIT, off, &offFixup1);
496 off = iemNativeEmitBrkEx(pCodeBuf, off, 0x3330);
497 iemNativeFixupFixedJump(pReNative, offFixup1, off);
498 /* Check that X86_EFL_RAZ_LO_MASK is zero. */
499 off = iemNativeEmitTestAnyBitsInGpr32Ex(pCodeBuf, off, idxRegEfl, X86_EFL_RAZ_LO_MASK, idxRegTmp);
500 uint32_t const offFixup2 = off;
501 off = iemNativeEmitJccToFixedEx(pCodeBuf, off, off, kIemNativeInstrCond_e);
502 off = iemNativeEmitBrkEx(pCodeBuf, off, 0x3331);
503 iemNativeFixupFixedJump(pReNative, offFixup2, off);
504# endif
505 off = iemNativeEmitStoreGprToVCpuU32Ex(pCodeBuf, off, idxRegEfl, RT_UOFFSETOF(VMCPU, cpum.GstCtx.eflags));
506 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
507
508# if defined(VBOX_WITH_STATISTICS) || defined(IEMNATIVE_WITH_TB_DEBUG_INFO)
509 pReNative->PostponedEfl.cEmits++;
510# endif
511 return off;
512}
513
514
515
516template<uint32_t const a_bmInputRegs>
517DECL_FORCE_INLINE_THROW(uint32_t)
518iemNativeDoPostponedEFlagsAtTbExit(PIEMRECOMPILERSTATE pReNative, uint32_t off)
519{
520 if (pReNative->PostponedEfl.fEFlags)
521 {
522 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, IEMNATIVE_MAX_POSTPONED_EFLAGS_INSTRUCTIONS);
523 return iemNativeDoPostponedEFlagsInternal<a_bmInputRegs>(pReNative, off, pCodeBuf);
524 }
525 return off;
526}
527
528
529template<uint32_t const a_bmInputRegs>
530DECL_FORCE_INLINE_THROW(uint32_t)
531iemNativeDoPostponedEFlagsAtTbExitEx(PIEMRECOMPILERSTATE pReNative, uint32_t off, PIEMNATIVEINSTR pCodeBuf)
532{
533 if (pReNative->PostponedEfl.fEFlags)
534 return iemNativeDoPostponedEFlagsInternal<a_bmInputRegs>(pReNative, off, pCodeBuf);
535 return off;
536}
537
538
539template<uint32_t const a_bmInputRegs>
540DECL_FORCE_INLINE_THROW(uint32_t)
541iemNativeDoPostponedEFlagsAtTlbMiss(PIEMRECOMPILERSTATE pReNative, uint32_t off, const IEMNATIVEEMITTLBSTATE *pTlbState,
542 uint32_t bmTmpRegs)
543{
544 if (pReNative->PostponedEfl.fEFlags)
545 {
546 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, IEMNATIVE_MAX_POSTPONED_EFLAGS_INSTRUCTIONS);
547 return iemNativeDoPostponedEFlagsInternal<a_bmInputRegs, true>(pReNative, off, pCodeBuf,
548 pTlbState->getRegsNotToSave() | bmTmpRegs);
549 }
550 return off;
551}
552
553#endif /* IEMNATIVE_WITH_EFLAGS_POSTPONING */
554
555
556/**
557 * This is an implementation of IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL.
558 *
559 * It takes liveness stuff into account.
560 */
561template<bool a_fNeedToSetFlags>
562DECL_INLINE_THROW(uint32_t)
563iemNativeEmitEFlagsForLogical(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarEfl,
564 uint8_t cOpBits, uint8_t idxRegResult)
565{
566 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflTotalLogical);
567 IEMNATIVE_CLEAR_POSTPONED_EFLAGS(pReNative, X86_EFL_STATUS_BITS);
568 RT_NOREF(cOpBits, idxRegResult);
569
570#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
571 /*
572 * See if we can skip this wholesale.
573 */
574 PCIEMLIVENESSENTRY const pLivenessEntry = &pReNative->paLivenessEntries[pReNative->idxCurCall];
575 uint64_t const fEflClobbered = IEMLIVENESS_STATE_GET_WILL_BE_CLOBBERED_SET(pLivenessEntry)
576 & IEMLIVENESSBIT_STATUS_EFL_MASK;
577# ifdef IEMNATIVE_WITH_EFLAGS_POSTPONING
578 uint64_t fEflPostponing;
579# endif
580 if ( fEflClobbered == IEMLIVENESSBIT_STATUS_EFL_MASK
581 && !(pReNative->fMc & IEM_MC_F_WITH_FLAGS))
582 {
583 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflSkippedLogical);
584 pReNative->fSkippingEFlags = X86_EFL_STATUS_BITS;
585# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
586 off = iemNativeEmitOrImmIntoVCpuU32(pReNative, off, X86_EFL_STATUS_BITS, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
587# endif
588 Log5(("EFLAGS: Skipping %#x - iemNativeEmitEFlagsForLogical\n", X86_EFL_STATUS_BITS));
589 return off;
590 }
591# ifdef IEMNATIVE_WITH_EFLAGS_POSTPONING
592 if ( ( (fEflPostponing = IEMLIVENESS_STATE_GET_CAN_BE_POSTPONED_SET(pLivenessEntry) & IEMLIVENESSBIT_STATUS_EFL_MASK)
593 | fEflClobbered)
594 == IEMLIVENESSBIT_STATUS_EFL_MASK
595 && idxRegResult != UINT8_MAX)
596 {
597 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflPostponedLogical);
598 pReNative->PostponedEfl.fEFlags = X86_EFL_STATUS_BITS;
599 pReNative->PostponedEfl.enmOp = kIemNativePostponedEflOp_Logical;
600 pReNative->PostponedEfl.cOpBits = cOpBits;
601 pReNative->PostponedEfl.idxReg1 = iemNativeRegAllocTmpEx(pReNative, &off, IEMNATIVE_POSTPONING_REG_MASK, false);
602 /** @todo it would normally be possible to use idxRegResult, iff it is
603 * already a non-volatile register and we can be sure the caller
604 * doesn't modify it. That'll save a register move and allocation. */
605 off = iemNativeEmitLoadGprFromGpr(pReNative, off, pReNative->PostponedEfl.idxReg1, idxRegResult);
606 Log5(("EFLAGS: Postponing %#x op=%u bits=%u reg1=%u - iemNativeEmitEFlagsForLogical\n", X86_EFL_STATUS_BITS,
607 kIemNativePostponedEflOp_Logical, cOpBits, pReNative->PostponedEfl.idxReg1));
608 }
609# endif
610 else
611#endif
612 {
613 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
614 uint8_t const idxRegTmp = iemNativeRegAllocTmp(pReNative, &off);
615#ifdef RT_ARCH_AMD64
616 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 32);
617#elif defined(RT_ARCH_ARM64)
618 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 16);
619#else
620# error "port me"
621#endif
622 off = iemNativeEmitPostponedEFlagsCalcLogical<a_fNeedToSetFlags>(pCodeBuf, off, cOpBits, idxRegResult,
623 idxRegEfl, idxRegTmp);
624 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
625
626 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
627 iemNativeRegFreeTmp(pReNative, idxRegTmp);
628 }
629
630#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
631 if (pReNative->fSkippingEFlags)
632 Log5(("EFLAGS: fSkippingEFlags %#x -> 0 (iemNativeEmitEFlagsForLogical)\n", pReNative->fSkippingEFlags));
633 pReNative->fSkippingEFlags = 0;
634# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
635 off = iemNativeEmitStoreImmToVCpuU32(pReNative, off, 0, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
636# endif
637#endif
638 return off;
639}
640
641
642/**
643 * This is an implementation of IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC.
644 *
645 * It takes liveness stuff into account.
646 */
647DECL_FORCE_INLINE_THROW(uint32_t)
648iemNativeEmitEFlagsForArithmetic(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarEfl, uint8_t idxRegEflIn
649#ifndef RT_ARCH_AMD64
650 , uint8_t cOpBits, uint8_t idxRegResult, uint8_t idxRegDstIn, uint8_t idxRegSrc
651 , bool fInvertCarry, uint64_t uImmSrc
652#endif
653 )
654{
655 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflTotalArithmetic);
656 IEMNATIVE_CLEAR_POSTPONED_EFLAGS(pReNative, X86_EFL_STATUS_BITS);
657
658#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
659 /*
660 * See if we can skip this wholesale.
661 */
662 PCIEMLIVENESSENTRY const pLivenessEntry = &pReNative->paLivenessEntries[pReNative->idxCurCall];
663 if ( IEMLIVENESS_STATE_ARE_STATUS_EFL_TO_BE_CLOBBERED(pLivenessEntry)
664 && !(pReNative->fMc & IEM_MC_F_WITH_FLAGS))
665 {
666 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflSkippedArithmetic);
667 pReNative->fSkippingEFlags = X86_EFL_STATUS_BITS;
668 Log5(("EFLAGS: Skipping %#x - iemNativeEmitEFlagsForArithmetic\n", X86_EFL_STATUS_BITS));
669# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
670 off = iemNativeEmitOrImmIntoVCpuU32(pReNative, off, X86_EFL_STATUS_BITS, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
671# endif
672 }
673 else
674#endif
675 {
676#ifdef RT_ARCH_AMD64
677 /*
678 * Collect flags and merge them with eflags.
679 */
680 PIEMNATIVEINSTR pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
681 /* pushf - do this before any reg allocations as they may emit instructions too. */
682 pCodeBuf[off++] = 0x9c;
683
684 uint8_t const idxRegEfl = idxRegEflIn != UINT8_MAX ? idxRegEflIn
685 : iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
686 uint8_t const idxTmpReg = iemNativeRegAllocTmp(pReNative, &off);
687 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2 + 7 + 7 + 3);
688 /* pop tmp */
689 if (idxTmpReg >= 8)
690 pCodeBuf[off++] = X86_OP_REX_B;
691 pCodeBuf[off++] = 0x58 + (idxTmpReg & 7);
692 /* Isolate the flags we want. */
693 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxTmpReg, X86_EFL_STATUS_BITS);
694 /* Clear the status bits in EFLs. */
695 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegEfl, ~X86_EFL_STATUS_BITS);
696 /* OR in the flags we collected. */
697 off = iemNativeEmitOrGpr32ByGprEx(pCodeBuf, off, idxRegEfl, idxTmpReg);
698 if (idxRegEflIn != idxRegEfl)
699 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
700 iemNativeRegFreeTmp(pReNative, idxTmpReg);
701
702#elif defined(RT_ARCH_ARM64)
703 /*
704 * Calculate flags.
705 */
706 uint8_t const idxRegEfl = idxRegEflIn != UINT8_MAX ? idxRegEflIn
707 : iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
708 uint8_t const idxTmpReg = iemNativeRegAllocTmp(pReNative, &off);
709 uint8_t const idxTmpReg2 = cOpBits >= 32 ? UINT8_MAX : iemNativeRegAllocTmp(pReNative, &off);
710 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 20);
711
712 /* Invert CF (stored inverted on ARM) and load the flags into the temporary register. */
713 if (fInvertCarry)
714 pCodeBuf[off++] = ARMV8_A64_INSTR_CFINV;
715 pCodeBuf[off++] = Armv8A64MkInstrMrs(idxTmpReg, ARMV8_AARCH64_SYSREG_NZCV); /* Bits: 31=N; 30=Z; 29=C; 28=V; */
716
717 if (cOpBits >= 32)
718 {
719 /* V -> OF */
720 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, 28);
721 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_OF_BIT, 1, false /*f64Bit*/);
722
723 /* C -> CF */
724 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, 1);
725 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_CF_BIT, 1, false /*f64Bit*/);
726 }
727
728 /* N,Z -> SF,ZF */
729 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, cOpBits >= 32 ? 1 : 30);
730 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_ZF_BIT, 2, false /*f64Bit*/);
731
732 /* For ADC and SBB we have to calculate overflow and carry ourselves. */
733 if (cOpBits < 32)
734 {
735 /* Since the carry flag is the zeroth flag, we just use BFXIL to copy it over. */
736 AssertCompile(X86_EFL_CF_BIT == 0);
737 pCodeBuf[off++] = Armv8A64MkInstrBfxil(idxRegEfl, idxRegResult, cOpBits, 1, false /*f64Bit*/);
738
739 /* The overflow flag is more work as we have to compare the signed bits for
740 both inputs and the result. See IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC.
741
742 Formula: ~(a_uDst ^ a_uSrcOf) & (a_uResult ^ a_uDst)
743 With a_uSrcOf as a_uSrc for additions and ~a_uSrc for subtractions.
744
745 It is a bit simpler when the right (source) side is constant:
746 adc: S D R -> OF sbb: S D R -> OF
747 0 0 0 -> 0 \ 0 0 0 -> 0 \
748 0 0 1 -> 1 \ 0 0 1 -> 0 \
749 0 1 0 -> 0 / and not(D), R 0 1 0 -> 1 / and D, not(R)
750 0 1 1 -> 0 / 0 1 1 -> 0 /
751 1 0 0 -> 0 \ 1 0 0 -> 0 \
752 1 0 1 -> 0 \ and D, not(R) 1 0 1 -> 1 \ and not(D), R
753 1 1 0 -> 1 / 1 1 0 -> 0 /
754 1 1 1 -> 0 / 1 1 1 -> 0 / */
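        /* Plain C sketch of the general case (with uSrcOf = uSrc for adc and
           uSrcOf = ~uSrc for sbb, as per the formula above):
               fOf = ((~(uDst ^ uSrcOf) & (uResult ^ uDst)) >> (cOpBits - 1)) & 1;
           The three branches below specialise this for a register source and for
           constant sources with the sign bit set or clear. */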
755 if (idxRegSrc != UINT8_MAX)
756 {
757 if (fInvertCarry) /* sbb: ~((a_uDst) ^ ~(a_uSrcOf)) -> (a_uDst) ^ (a_uSrcOf); HACK ALERT: fInvertCarry == sbb */
758 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxRegDstIn, idxRegSrc, false);
759 else /* adc: ~((a_uDst) ^ (a_uSrcOf)) -> (a_uDst) ^ ~(a_uSrcOf) */
760 pCodeBuf[off++] = Armv8A64MkInstrEon(idxTmpReg, idxRegDstIn, idxRegSrc, false);
761 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg2, idxRegDstIn, idxRegResult, false); /* (a_uDst) ^ (a_uResult) */
762 pCodeBuf[off++] = Armv8A64MkInstrAnd(idxTmpReg, idxTmpReg, idxTmpReg2, false /*f64Bit*/);
763 }
764 else if (uImmSrc & RT_BIT_32(cOpBits - 1))
765 {
766 if (fInvertCarry) /* HACK ALERT: fInvertCarry == sbb */
767 pCodeBuf[off++] = Armv8A64MkInstrBic(idxTmpReg, idxRegResult, idxRegDstIn, false);
768 else
769 pCodeBuf[off++] = Armv8A64MkInstrBic(idxTmpReg, idxRegDstIn, idxRegResult, false);
770 }
771 else
772 {
773 if (fInvertCarry) /* HACK ALERT: fInvertCarry == sbb */
774 pCodeBuf[off++] = Armv8A64MkInstrBic(idxTmpReg, idxRegDstIn, idxRegResult, false);
775 else
776 pCodeBuf[off++] = Armv8A64MkInstrBic(idxTmpReg, idxRegResult, idxRegDstIn, false);
777 }
778 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, cOpBits - 1, false /*f64Bit*/);
779 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_OF_BIT, 1);
780 iemNativeRegFreeTmp(pReNative, idxTmpReg2);
781 }
782
783 /* Calculate 8-bit parity of the result. */
784 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxRegResult, idxRegResult, false /*f64Bit*/,
785 4 /*offShift6*/, kArmv8A64InstrShift_Lsr);
786 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxTmpReg, idxTmpReg, false /*f64Bit*/,
787 2 /*offShift6*/, kArmv8A64InstrShift_Lsr);
788 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxTmpReg, idxTmpReg, false /*f64Bit*/,
789 1 /*offShift6*/, kArmv8A64InstrShift_Lsr);
790 Assert(Armv8A64ConvertImmRImmS2Mask32(0, 0) == 1);
791 pCodeBuf[off++] = Armv8A64MkInstrEorImm(idxTmpReg, idxTmpReg, 0, 0, false /*f64Bit*/);
792 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_PF_BIT, 1, false /*f64Bit*/);
793
794 /* Calculate auxiliary carry/borrow. This is related to 8-bit BCD.
795 General formula: ((uint32_t)(a_uResult) ^ (uint32_t)(a_uSrc) ^ (uint32_t)(a_uDst)) & X86_EFL_AF;
796 S D R
797 0 0 0 -> 0; \
798 0 0 1 -> 1; \ regular
799 0 1 0 -> 1; / xor R, D
800 0 1 1 -> 0; /
801 1 0 0 -> 1; \
802 1 0 1 -> 0; \ invert one of the two
803 1 1 0 -> 0; / xor not(R), D
804 1 1 1 -> 1; /
805 a_uSrc[bit 4]=0: ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF;
806 a_uSrc[bit 4]=1: ((uint32_t)~(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF;
807 */
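    /* Condensed C sketch of the same: fAf = ((uResult ^ uSrc ^ uDst) >> X86_EFL_AF_BIT) & 1;
       for a constant source only its bit 4 matters, hence the EOR/EON choice below. */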
808
809 if (idxRegSrc != UINT8_MAX)
810 {
811 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxRegDstIn, idxRegSrc, false /*f64Bit*/);
812 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxTmpReg, idxRegResult, false /*f64Bit*/);
813 }
814 else if (uImmSrc & X86_EFL_AF)
815 pCodeBuf[off++] = Armv8A64MkInstrEon(idxTmpReg, idxRegDstIn, idxRegResult, false /*f64Bit*/);
816 else
817 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxRegDstIn, idxRegResult, false /*f64Bit*/);
818 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, X86_EFL_AF_BIT, false /*f64Bit*/);
819 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_AF_BIT, 1, false /*f64Bit*/);
820
821 if (idxRegEflIn != idxRegEfl)
822 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
823 iemNativeRegFreeTmp(pReNative, idxTmpReg);
824
825#else
826# error "port me"
827#endif
828 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
829
830#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
831 if (pReNative->fSkippingEFlags)
832 Log5(("EFLAGS: fSkippingEFlags %#x -> 0 (iemNativeEmitEFlagsForArithmetic)\n", pReNative->fSkippingEFlags));
833 pReNative->fSkippingEFlags = 0;
834# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
835 off = iemNativeEmitStoreImmToVCpuU32(pReNative, off, 0, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
836# endif
837#endif
838 }
839 return off;
840
841}
842
843
844
845/*********************************************************************************************************************************
846* Bitwise Logical Operations *
847*********************************************************************************************************************************/
848
849/**
850 * The AND instruction will clear OF, CF and AF (latter is undefined) and
851 * set the other flags according to the result.
852 */
853template<uint8_t const a_cOpBits>
854DECL_INLINE_THROW(uint32_t)
855iemNativeEmit_and_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
856{
857 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
858 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
859#ifdef RT_ARCH_AMD64
860 /* On AMD64 we just use the correctly sized AND instruction to harvest the EFLAGS. */
861 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
862 0x22, 0x23, a_cOpBits, idxRegDst, idxRegSrc);
863 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
864 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
865
866 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
867
868#elif defined(RT_ARCH_ARM64)
869 /* On ARM64 we use 32-bit AND for the 8-bit and 16-bit ones. */
870 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
871 pCodeBuf[off++] = Armv8A64MkInstrAnds(idxRegDst, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/);
872 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
873 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
874
875 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
876#else
877# error "Port me"
878#endif
879 iemNativeVarRegisterRelease(pReNative, idxVarDst);
880 return off;
881}
882
883
884/**
885 * The AND instruction with immediate value as right operand.
886 */
887template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
888DECL_INLINE_THROW(uint32_t)
889iemNativeEmit_and_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
890{
891 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
892#ifdef RT_ARCH_AMD64
893 /* On AMD64 we just use the correctly sized AND instruction to harvest the EFLAGS. */
894 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
895 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 4, idxRegDst, uImmOp);
896 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
897
898 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
899
900#elif defined(RT_ARCH_ARM64)
901 /* On ARM64 we use 32-bit AND for the 8-bit and 16-bit ones, and of
902 course the immediate variant when possible to save a register load. */
903 uint32_t uImmSizeLen, uImmRotations;
904 if ( a_cOpBits > 32
905 ? Armv8A64ConvertMask64ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations)
906 : Armv8A64ConvertMask32ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations))
907 {
908 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
909 if (a_cOpBits >= 32)
910 pCodeBuf[off++] = Armv8A64MkInstrAndsImm(idxRegDst, idxRegDst, uImmSizeLen, uImmRotations, a_cOpBits > 32 /*f64Bit*/);
911 else
912 pCodeBuf[off++] = Armv8A64MkInstrAndImm(idxRegDst, idxRegDst, uImmSizeLen, uImmRotations, a_cOpBits > 32 /*f64Bit*/);
913 }
914 else
915 {
916 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
917 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
918 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
919 pCodeBuf[off++] = Armv8A64MkInstrAnds(idxRegDst, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/);
920 else
921 pCodeBuf[off++] = Armv8A64MkInstrAnd(idxRegDst, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/);
922 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
923 }
924 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
925
926 off = iemNativeEmitEFlagsForLogical<a_cOpBits < 32>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
927
928#else
929# error "Port me"
930#endif
931 iemNativeVarRegisterRelease(pReNative, idxVarDst);
932 return off;
933}
934
935
936/**
937 * The TEST instruction will clear OF, CF and AF (latter is undefined) and
938 * set the other flags according to the result.
939 */
940template<uint8_t const a_cOpBits>
941DECL_INLINE_THROW(uint32_t)
942iemNativeEmit_test_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
943{
944 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
945 uint8_t const idxRegSrc = idxVarSrc == idxVarDst ? idxRegDst /* special case of 'test samereg,samereg' */
946 : iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
947#ifdef RT_ARCH_AMD64
948 /* On AMD64 we just use the correctly sized TEST instruction to harvest the EFLAGS. */
949 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
950 0x84, 0x85, a_cOpBits, idxRegSrc, idxRegDst);
951 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
952
953#elif defined(RT_ARCH_ARM64)
954 /* On ARM64 we use 32-bit AND for the 8-bit and 16-bit ones. We also
955 need to keep the result in order to calculate the flags. */
956 uint8_t const idxRegResult = iemNativeRegAllocTmp(pReNative, &off);
957 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
958 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
959 pCodeBuf[off++] = Armv8A64MkInstrAnds(idxRegResult, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/);
960 else
961 pCodeBuf[off++] = Armv8A64MkInstrAnd(idxRegResult, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/);
962 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
963
964#else
965# error "Port me"
966#endif
967 if (idxVarSrc != idxVarDst)
968 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
969 iemNativeVarRegisterRelease(pReNative, idxVarDst);
970
971#ifdef RT_ARCH_AMD64
972 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, UINT8_MAX);
973#else
974 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
975 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, idxRegResult);
976 else
977 off = iemNativeEmitEFlagsForLogical<true>(pReNative, off, idxVarEfl, a_cOpBits, idxRegResult);
978 iemNativeRegFreeTmp(pReNative, idxRegResult);
979#endif
980 return off;
981}
982
983
984/**
985 * The TEST instruction with immediate value as right operand.
986 */
987template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
988DECL_INLINE_THROW(uint32_t)
989iemNativeEmit_test_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
990{
991 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
992#ifdef RT_ARCH_AMD64
993 /* On AMD64 we just use the correctly sized TEST instruction to harvest the EFLAGS. */
994 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
995 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0xf6, 0xcc, 0xf7, a_cOpBits, a_cImmBits, 0, idxRegDst, uImmOp);
996 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
997 iemNativeVarRegisterRelease(pReNative, idxVarDst);
998
999 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, UINT8_MAX);
1000
1001#elif defined(RT_ARCH_ARM64)
1002 /* On ARM64 we use 32-bit AND for the 8-bit and 16-bit ones, and of
1003 course the immediate variant when possible to save a register load.
1004 We also need to keep the result in order to calculate the flags. */
1005 uint8_t const idxRegResult = iemNativeRegAllocTmp(pReNative, &off);
1006 uint32_t uImmSizeLen, uImmRotations;
1007 if ( a_cOpBits > 32
1008 ? Armv8A64ConvertMask64ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations)
1009 : Armv8A64ConvertMask32ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations))
1010 {
1011 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1012 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1013 pCodeBuf[off++] = Armv8A64MkInstrAndsImm(idxRegResult, idxRegDst, uImmSizeLen, uImmRotations, a_cOpBits > 32 /*f64Bit*/);
1014 else
1015 pCodeBuf[off++] = Armv8A64MkInstrAndImm(idxRegResult, idxRegDst, uImmSizeLen, uImmRotations, a_cOpBits > 32 /*f64Bit*/);
1016 }
1017 else
1018 {
1019 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1020 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1021 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1022 pCodeBuf[off++] = Armv8A64MkInstrAnds(idxRegResult, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/);
1023 else
1024 pCodeBuf[off++] = Armv8A64MkInstrAnd(idxRegResult, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/);
1025 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1026 }
1027 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1028 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1029
1030 off = iemNativeEmitEFlagsForLogical<a_cOpBits < 32>(pReNative, off, idxVarEfl, a_cOpBits, idxRegResult);
1031
1032 iemNativeRegFreeTmp(pReNative, idxRegResult);
1033
1034#else
1035# error "Port me"
1036#endif
1037 return off;
1038}
1039
1040
1041/**
1042 * The OR instruction will clear OF, CF and AF (latter is undefined) and
1043 * set the other flags according to the result.
1044 */
1045template<uint8_t const a_cOpBits>
1046DECL_INLINE_THROW(uint32_t)
1047iemNativeEmit_or_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1048{
1049 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1050 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1051#ifdef RT_ARCH_AMD64
1052 /* On AMD64 we just use the correctly sized OR instruction to harvest the EFLAGS. */
1053 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1054 0x0a, 0x0b, a_cOpBits, idxRegDst, idxRegSrc);
1055 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1056 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1057
1058 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1059
1060#elif defined(RT_ARCH_ARM64)
1061 /* On ARM64 we use 32-bit OR for the 8-bit and 16-bit ones. */
1062 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1063 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDst, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/);
1064 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1065 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1066
1067 off = iemNativeEmitEFlagsForLogical<true>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1068
1069#else
1070# error "Port me"
1071#endif
1072 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1073 return off;
1074}
1075
1076
1077/**
1078 * The OR instruction with immediate value as right operand.
1079 */
1080template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1081DECL_INLINE_THROW(uint32_t)
1082iemNativeEmit_or_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1083{
1084 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1085#ifdef RT_ARCH_AMD64
1086 /* On AMD64 we just use the correctly sized OR instruction to harvest the EFLAGS. */
1087 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1088 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 1, idxRegDst, uImmOp);
1089 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1090
1091 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1092
1093#elif defined(RT_ARCH_ARM64)
1094 /* On ARM64 we use 32-bit OR for the 8-bit and 16-bit ones, and of
1095 course the immediate variant when possible to save a register load. */
1096 uint32_t uImmSizeLen, uImmRotations;
1097 if ( a_cOpBits > 32
1098 ? Armv8A64ConvertMask64ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations)
1099 : Armv8A64ConvertMask32ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations))
1100 {
1101 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1102 pCodeBuf[off++] = Armv8A64MkInstrOrrImm(idxRegDst, idxRegDst, uImmSizeLen, uImmRotations, a_cOpBits > 32 /*f64Bit*/);
1103 }
1104 else
1105 {
1106 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1107 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1108 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDst, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/);
1109 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1110 }
1111 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1112
1113 off = iemNativeEmitEFlagsForLogical<true>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1114
1115#else
1116# error "Port me"
1117#endif
1118 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1119 return off;
1120}
1121
1122
1123/**
1124 * The XOR instruction will clear OF, CF and AF (latter is undefined) and
1125 * set the other flags according to the result.
1126 */
1127template<uint8_t const a_cOpBits>
1128DECL_INLINE_THROW(uint32_t)
1129iemNativeEmit_xor_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1130{
1131 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1132 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1133#ifdef RT_ARCH_AMD64
1134 /* On AMD64 we just use the correctly sized XOR instruction to harvest the EFLAGS. */
1135 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1136 0x32, 0x33, a_cOpBits, idxRegDst, idxRegSrc);
1137 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1138 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1139
1140 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1141
1142#elif defined(RT_ARCH_ARM64)
1143 /* On ARM64 we use 32-bit EOR for the 8-bit and 16-bit ones. */
1144 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1145 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegDst, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/);
1146 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1147 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1148
1149 off = iemNativeEmitEFlagsForLogical<true>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1150
1151#else
1152# error "Port me"
1153#endif
1154 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1155 return off;
1156}
1157
1158
1159/**
1160 * The XOR instruction with immediate value as right operand.
1161 */
1162template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1163DECL_INLINE_THROW(uint32_t)
1164iemNativeEmit_xor_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1165{
1166 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1167#ifdef RT_ARCH_AMD64
1168 /* On AMD64 we just use the correctly sized XOR instruction to harvest the EFLAGS. */
1169 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1170 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 6, idxRegDst, uImmOp);
1171 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1172
1173 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1174
1175#elif defined(RT_ARCH_ARM64)
1176 /* On ARM64 we use 32-bit EOR for the 8-bit and 16-bit ones, and of
1177 course the immediate variant when possible to save a register load. */
1178 uint32_t uImmSizeLen, uImmRotations;
1179 if ( a_cOpBits > 32
1180 ? Armv8A64ConvertMask64ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations)
1181 : Armv8A64ConvertMask32ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations))
1182 {
1183 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1184 pCodeBuf[off++] = Armv8A64MkInstrEorImm(idxRegDst, idxRegDst, uImmSizeLen, uImmRotations, a_cOpBits > 32 /*f64Bit*/);
1185 }
1186 else
1187 {
1188 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1189 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1190 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegDst, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/);
1191 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1192 }
1193 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1194
1195 off = iemNativeEmitEFlagsForLogical<true>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1196
1197#else
1198# error "Port me"
1199#endif
1200 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1201 return off;
1202}
1203
1204
1205
1206/*********************************************************************************************************************************
1207* ADD, ADC, SUB, SBB, CMP *
1208*********************************************************************************************************************************/
1209
1210/**
1211 * The ADD instruction will set all status flags.
1212 */
1213template<uint8_t const a_cOpBits>
1214DECL_INLINE_THROW(uint32_t)
1215iemNativeEmit_add_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1216{
1217 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1218 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1219
1220#ifdef RT_ARCH_AMD64
1221 /* On AMD64 we just use the correctly sized ADD instruction to get the right EFLAGS values. */
1222 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1223 0x02, 0x03, a_cOpBits, idxRegDst, idxRegSrc);
1224 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1225
1226 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1227 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1228
1229 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1230
1231#elif defined(RT_ARCH_ARM64)
1232 /* On ARM64 we'll need the two input operands as well as the result in order
1233 to calculate the right flags, even if we use ADDS and translate NZCV into
1234 OF, CF, ZF and SF. */
1235 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1236 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4);
1237 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1238 {
1239 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1240 pCodeBuf[off++] = Armv8A64MkInstrAddReg(idxRegDst, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1241 }
1242 else
1243 {
1244 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1245 uint32_t const cShift = 32 - a_cOpBits;
1246 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDstIn, ARMV8_A64_REG_XZR, idxRegDst, false /*f64Bit*/, cShift);
1247 pCodeBuf[off++] = Armv8A64MkInstrAddReg(idxRegDst, idxRegDstIn, idxRegSrc, false /*f64Bit*/,
1248 true /*fSetFlags*/, cShift);
1249 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDstIn, idxRegDstIn, cShift, false /*f64Bit*/);
1250 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDst, idxRegDst, cShift, false /*f64Bit*/);
1251 }
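        /* Worked example of the shift-up trick (sketch): for an 8-bit add the operands end
           up in bits 24..31, so 0x7f + 0x01 becomes 0x7f000000 + 0x01000000 = 0x80000000;
           the 32-bit ADDS then reports V=1 and C=0, exactly what the 8-bit add would have
           produced, and the two LSRs move the input copy and result back down afterwards. */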
1252 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1253
1254 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, a_cOpBits > 32 ? a_cOpBits : 32, idxRegDst,
1255 idxRegDstIn, idxRegSrc, false /*fInvertCarry*/, 0);
1256
1257 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1258 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1259 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1260
1261#else
1262# error "port me"
1263#endif
1264 return off;
1265}
1266
1267
1268/**
1269 * The ADD instruction with immediate value as right operand.
1270 */
1271template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1272DECL_INLINE_THROW(uint32_t)
1273iemNativeEmit_add_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1274{
1275 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1276
1277#ifdef RT_ARCH_AMD64
1278 /* On AMD64 we just use the correctly sized ADD instruction to get the right EFLAGS values. */
1279 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1280 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 0, idxRegDst, uImmOp);
1281 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1282
1283 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1284
1285 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1286
1287#elif defined(RT_ARCH_ARM64)
1288 /* On ARM64 we'll need the two input operands as well as the result in order
1289 to calculate the right flags, even if we use ADDS and translate NZCV into
1290 OF, CF, ZF and SF. */
1291 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1292 PIEMNATIVEINSTR pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1293 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1294 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1295 {
1296 if (uImmOp <= 0xfffU)
1297 pCodeBuf[off++] = Armv8A64MkInstrAddUImm12(idxRegDst, idxRegDst, uImmOp, a_cOpBits > 32 /*f64Bit*/,
1298 true /*fSetFlags*/);
1299 else if (uImmOp <= 0xfff000U && !(uImmOp & 0xfff))
1300 pCodeBuf[off++] = Armv8A64MkInstrAddUImm12(idxRegDst, idxRegDst, uImmOp >> 12, a_cOpBits > 32 /*f64Bit*/,
1301 true /*fSetFlags*/, true /*fShift12*/);
1302 else
1303 {
1304 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1305 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1306 pCodeBuf[off++] = Armv8A64MkInstrAddReg(idxRegDst, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/,
1307 true /*fSetFlags*/);
1308 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1309 }
1310 }
1311 else
1312 {
1313 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1314 uint32_t const cShift = 32 - a_cOpBits;
1315 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp << cShift);
1316 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2);
1317 pCodeBuf[off++] = Armv8A64MkInstrAddReg(idxRegDst, idxRegTmpImm, idxRegDstIn, false /*f64Bit*/, true /*fSetFlags*/, cShift);
1318 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDst, idxRegDst, cShift, false /*f64Bit*/);
1319 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1320 }
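    /* Rough scalar illustration of the shift-up trick used above (illustrative
       variable names only, not the code the emitter produces): for an 8-bit ADD
       the operands are placed in the top byte of a 32-bit register, so a 32-bit
       ADDS yields the same N, Z, C and V as the narrow x86 addition would:
           uint32_t const uShifted = (uint32_t)uDst8 << 24;
           uint32_t const uSum     = uShifted + ((uint32_t)uImm8 << 24);  // ADDS sets NZCV here
           uint8_t  const uRes8    = (uint8_t)(uSum >> 24);               // shift the result back down
       The carry out of bit 31 is exactly the carry out of bit 7 of the 8-bit add. */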
1321 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1322
1323 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, a_cOpBits > 32 ? a_cOpBits : 32, idxRegDst,
1324 idxRegDstIn, UINT8_MAX, false /*fInvertCarry*/, uImmOp);
1325
1326 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1327 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1328
1329#else
1330# error "port me"
1331#endif
1332 return off;
1333}
1334
1335
1336/**
1337 * The ADC instruction takes CF as input and will set all status flags.
1338 */
1339template<uint8_t const a_cOpBits>
1340DECL_INLINE_THROW(uint32_t)
1341iemNativeEmit_adc_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1342{
1343 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1344 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1345 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
1346
1347#ifdef RT_ARCH_AMD64
1348 /* On AMD64 we use BT to set EFLAGS.CF and then issue an ADC instruction
1349 with matching size to get the correct flags. */
1350 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 9);
1351
1352 /* Use the BT instruction to set CF according to idxRegEfl. */
1353 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b, 0xba, 32 /*cOpBits*/, 4, idxRegEfl);
1354 pCodeBuf[off++] = X86_EFL_CF_BIT;
1355
1356 off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0x12, 0x13, a_cOpBits, idxRegDst, idxRegSrc);
1357 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1358
1359 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1360 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1361
1362 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl);
1363
1364#elif defined(RT_ARCH_ARM64)
1365 /* On ARM64 we use the RMIF instruction to load PSTATE.CF from idxRegEfl and
1366 then ADCS for the calculation. We need all inputs and result for the two
1367 flags (AF,PF) that can't be directly derived from PSTATE.NZCV. */
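    /* For reference: AF is bit 4 of Dst ^ Src ^ Result and PF is the even parity
       of the result's low byte, which is why the raw inputs and the result are
       kept around for the EFLAGS helper called below. */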
1368 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1369 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 7);
1370
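    /* Note: RMIF rotates the first operand right by the immediate and copies the
       selected bits of the rotated value's low nibble into NZCV (mask bit 1 = C).
       Rotating EFLAGS right by 63, i.e. left by one, puts CF (bit 0) into bit 1,
       so only PSTATE.C is loaded from the guest carry flag here. */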
1371 pCodeBuf[off++] = Armv8A64MkInstrRmif(idxRegEfl, (X86_EFL_CF_BIT - 1) & 63, RT_BIT_32(1) /*fMask=C*/);
1372 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1373 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1374 pCodeBuf[off++] = Armv8A64MkInstrAdcs(idxRegDst, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/);
1375 else
1376 {
1377 /* Since we're also adding in the carry flag here, shifting operands up
1378 doesn't work. So, we have to calculate carry & overflow manually. */
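    /* (With shifted-up operands ADCS would inject the incoming CF at bit 0 of the
       host register rather than at the bottom bit of the shifted operands, so the
       shift-up trick used by the plain ADD/SUB emitters cannot be reused here.) */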
1379 pCodeBuf[off++] = Armv8A64MkInstrAdc(idxRegDst, idxRegDst, idxRegSrc, false /*f64Bit*/);
1380 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegDst, a_cOpBits > 8); /* NZ are okay, CV aren't.*/
1381 }
1382 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1383
1384 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl, a_cOpBits, idxRegDst,
1385 idxRegDstIn, idxRegSrc, false /*fInvertCarry*/, 0);
1386
1387 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1388 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1389 if RT_CONSTEXPR_IF(a_cOpBits < 32)
1390 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegDst, RT_BIT_32(a_cOpBits) - 1U);
1391 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1392
1393#else
1394# error "port me"
1395#endif
1396 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
1397 return off;
1398}
1399
1400
1401/**
1402 * The ADC instruction with immediate value as right operand.
1403 */
1404template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1405DECL_INLINE_THROW(uint32_t)
1406iemNativeEmit_adc_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1407{
1408 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1409 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
1410
1411#ifdef RT_ARCH_AMD64
1412 /* On AMD64 we use BT to set EFLAGS.CF and then issue an ADC instruction
1413 with matching size to get the correct flags. */
1414 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 12);
1415
1416 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b, 0xba, 32 /*cOpBits*/, 4, idxRegEfl);
1417 pCodeBuf[off++] = X86_EFL_CF_BIT;
1418
1419 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 2, idxRegDst, uImmOp);
1420 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1421
1422 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1423
1424 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl);
1425
1426#elif defined(RT_ARCH_ARM64)
1427 /* On ARM64 we use the RMIF instruction to load PSTATE.CF from idxRegEfl
1428 and then ADCS for the calculation. We need all inputs and result for
1429 the two flags (AF,PF) that can't be directly derived from PSTATE.NZCV. */
1430 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1431 uint8_t const idxRegImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1432 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4);
1433
1434 pCodeBuf[off++] = Armv8A64MkInstrRmif(idxRegEfl, (X86_EFL_CF_BIT - 1) & 63, RT_BIT_32(1) /*fMask=C*/);
1435 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1436 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1437 pCodeBuf[off++] = Armv8A64MkInstrAdcs(idxRegDst, idxRegDst, idxRegImm, a_cOpBits > 32 /*f64Bit*/);
1438 else
1439 {
1440 /* Since we're also adding in the carry flag here, shifting operands up
1441 doesn't work. So, we have to calculate carry & overflow manually. */
1442 pCodeBuf[off++] = Armv8A64MkInstrAdc(idxRegDst, idxRegDst, idxRegImm, false /*f64Bit*/);
1443 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegDst, a_cOpBits > 8); /* NZ are okay, CV aren't.*/
1444 }
1445 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1446
1447 iemNativeRegFreeTmp(pReNative, idxRegImm);
1448
1449 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl, a_cOpBits, idxRegDst,
1450 idxRegDstIn, UINT8_MAX, false /*fInvertCarry*/, uImmOp);
1451
1452 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1453 if RT_CONSTEXPR_IF(a_cOpBits < 32)
1454 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegDst, RT_BIT_32(a_cOpBits) - 1U);
1455 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1456
1457#else
1458# error "port me"
1459#endif
1460 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
1461 return off;
1462}
1463
1464
1465/**
1466 * The SUB instruction will set all status flags.
1467 */
1468template<uint8_t const a_cOpBits>
1469DECL_INLINE_THROW(uint32_t)
1470iemNativeEmit_sub_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1471{
1472 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1473 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1474
1475#ifdef RT_ARCH_AMD64
1476 /* On AMD64 we just use the correctly sized SUB instruction to get the right EFLAGS.SF value. */
1477 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1478 0x2a, 0x2b, a_cOpBits, idxRegDst, idxRegSrc);
1479 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1480
1481 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1482 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1483
1484 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1485
1486#elif defined(RT_ARCH_ARM64)
1487 /* On ARM64 we'll need the two input operands as well as the result in order
1488 to calculate the right flags, even though we use SUBS and translate NZCV into
1489 OF, CF, ZF and SF. */
1490 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1491 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4);
1492 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1493 {
1494 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1495 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegDst, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1496 }
1497 else
1498 {
1499 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1500 uint32_t const cShift = 32 - a_cOpBits;
1501 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDstIn, ARMV8_A64_REG_XZR, idxRegDst, false /*f64Bit*/, cShift);
1502 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegDst, idxRegDstIn, idxRegSrc, false /*f64Bit*/,
1503 true /*fSetFlags*/, cShift);
1504 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDstIn, idxRegDstIn, cShift, false /*f64Bit*/);
1505 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDst, idxRegDst, cShift, false /*f64Bit*/);
1506 }
1507 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1508
1509 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, a_cOpBits > 32 ? a_cOpBits : 32, idxRegDst,
1510 idxRegDstIn, idxRegSrc, true /*fInvertCarry*/, 0);
1511
1512 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1513 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1514 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1515
1516#else
1517# error "port me"
1518#endif
1519 return off;
1520}
1521
1522
1523/**
1524 * The SUB instruction with immediate value as right operand.
1525 */
1526template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1527DECL_INLINE_THROW(uint32_t)
1528iemNativeEmit_sub_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1529{
1530 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1531
1532#ifdef RT_ARCH_AMD64
1533 /* On AMD64 we just use the correctly sized SUB instruction to get the right EFLAGS.SF value. */
1534 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1535 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 5, idxRegDst, uImmOp);
1536 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1537
1538 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1539
1540 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1541
1542#elif defined(RT_ARCH_ARM64)
1543 /* On ARM64 we'll need the two input operands as well as the result in order
1544 to calculate the right flags, even though we use SUBS and translate NZCV into
1545 OF, CF, ZF and SF. */
1546 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1547 PIEMNATIVEINSTR pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1548 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1549 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1550 {
1551 if (uImmOp <= 0xfffU)
1552 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegDst, idxRegDst, uImmOp, a_cOpBits > 32 /*f64Bit*/,
1553 true /*fSetFlags*/);
1554 else if (uImmOp <= 0xfff000U && !(uImmOp & 0xfff))
1555 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegDst, idxRegDst, uImmOp >> 12, a_cOpBits > 32 /*f64Bit*/,
1556 true /*fSetFlags*/, true /*fShift12*/);
1557 else
1558 {
1559 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1560 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1561 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegDst, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/,
1562 true /*fSetFlags*/);
1563 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1564 }
1565 }
1566 else
1567 {
1568 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1569 uint32_t const cShift = 32 - a_cOpBits;
1570 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1571 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4);
1572 pCodeBuf[off++] = Armv8A64MkInstrLslImm(idxRegDstIn, idxRegDstIn, cShift, false /*f64Bit*/);
1573 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegDst, idxRegDstIn, idxRegTmpImm, false /*f64Bit*/, true /*fSetFlags*/, cShift);
1574 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDstIn, idxRegDstIn, cShift, false /*f64Bit*/);
1575 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDst, idxRegDst, cShift, false /*f64Bit*/);
1576 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1577 }
1578 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1579
1580 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, a_cOpBits > 32 ? a_cOpBits : 32, idxRegDst,
1581 idxRegDstIn, UINT8_MAX, true /*fInvertCarry*/, uImmOp);
1582
1583 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1584 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1585
1586#else
1587# error "port me"
1588#endif
1589 return off;
1590}
1591
1592
1593/**
1594 * The CMP instruction will set all status flags, but modifies no registers.
1595 */
1596template<uint8_t const a_cOpBits>
1597DECL_INLINE_THROW(uint32_t)
1598iemNativeEmit_cmp_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1599{
1600 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1601 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1602
1603#ifdef RT_ARCH_AMD64
1604 /* On AMD64 we just use the correctly sized CMP instruction to get the right EFLAGS.SF value. */
1605 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1606 0x3a, 0x3b, a_cOpBits, idxRegDst, idxRegSrc);
1607 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1608
1609 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1610 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1611
1612 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1613
1614#elif defined(RT_ARCH_ARM64)
1615 /* On ARM64 we'll need the actual result as well as both input operands in order
1616 to calculate the right flags, even though we use SUBS and translate NZCV into
1617 OF, CF, ZF and SF. */
1618 uint8_t const idxRegResult = iemNativeRegAllocTmp(pReNative, &off);
1619 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 3);
1620 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1621 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegResult, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1622 else
1623 {
1624 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1625 uint32_t const cShift = 32 - a_cOpBits;
1626 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegResult, ARMV8_A64_REG_XZR, idxRegDst, false /*f64Bit*/, cShift);
1627 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegResult, idxRegResult, idxRegSrc, false /*f64Bit*/,
1628 true /*fSetFlags*/, cShift);
1629 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegResult, idxRegResult, cShift, false /*f64Bit*/);
1630 }
1631 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1632
1633 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, a_cOpBits > 32 ? a_cOpBits : 32, idxRegResult,
1634 idxRegDst, idxRegSrc, true /*fInvertCarry*/, 0);
1635
1636 iemNativeRegFreeTmp(pReNative, idxRegResult);
1637 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1638 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1639
1640#else
1641# error "port me"
1642#endif
1643 return off;
1644}
1645
1646
1647/**
1648 * The CMP instruction with immediate value as right operand.
1649 */
1650template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1651DECL_INLINE_THROW(uint32_t)
1652iemNativeEmit_cmp_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1653{
1654 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1655
1656#ifdef RT_ARCH_AMD64
1657 /* On AMD64 we just use the correctly sized CMP instruction to get the right EFLAGS.SF value. */
1658 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1659 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 7, idxRegDst, uImmOp);
1660 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1661
1662 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1663
1664 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1665
1666#elif defined(RT_ARCH_ARM64)
1667 /* On ARM64 we'll need the actual result as well as both input operands in order
1668 to calculate the right flags, even though we use SUBS and translate NZCV into
1669 OF, CF, ZF and SF. */
1670 uint8_t const idxRegResult = iemNativeRegAllocTmp(pReNative, &off);
1671 PIEMNATIVEINSTR pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1672 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1673 {
1674 if (uImmOp <= 0xfffU)
1675 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegResult, idxRegDst, uImmOp, a_cOpBits > 32 /*f64Bit*/,
1676 true /*fSetFlags*/);
1677 else if (uImmOp <= 0xfff000U && !(uImmOp & 0xfff))
1678 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegResult, idxRegDst, uImmOp >> 12, a_cOpBits > 32 /*f64Bit*/,
1679 true /*fSetFlags*/, true /*fShift12*/);
1680 else
1681 {
1682 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1683 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1684 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegResult, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/,
1685 true /*fSetFlags*/);
1686 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1687 }
1688 }
1689 else
1690 {
1691 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1692 uint32_t const cShift = 32 - a_cOpBits;
1693 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1694 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 3);
1695 pCodeBuf[off++] = Armv8A64MkInstrLslImm(idxRegResult, idxRegDst, cShift, false /*f64Bit*/);
1696 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegResult, idxRegResult, idxRegTmpImm, false /*f64Bit*/, true /*fSetFlags*/, cShift);
1697 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegResult, idxRegResult, cShift, false /*f64Bit*/);
1698 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1699 }
1700 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1701
1702 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, a_cOpBits > 32 ? a_cOpBits : 32, idxRegResult,
1703 idxRegDst, UINT8_MAX, true /*fInvertCarry*/, uImmOp);
1704
1705 iemNativeRegFreeTmp(pReNative, idxRegResult);
1706 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1707
1708#else
1709# error "port me"
1710#endif
1711 return off;
1712}
1713
1714
1715/**
1716 * The SBB instruction takes CF as input and will set all status flags.
1717 */
1718template<uint8_t const a_cOpBits>
1719DECL_INLINE_THROW(uint32_t)
1720iemNativeEmit_sbb_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1721{
1722 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1723 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1724 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
1725
1726#ifdef RT_ARCH_AMD64
1727 /* On AMD64 we use BT to set EFLAGS.CF and then issue an SBB instruction
1728 with matching size to get the correct flags. */
1729 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 9);
1730
1731 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b, 0xba, 32 /*cOpBits*/, 4, idxRegEfl);
1732 pCodeBuf[off++] = X86_EFL_CF_BIT;
1733
1734 off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0x1a, 0x1b, a_cOpBits, idxRegDst, idxRegSrc);
1735 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1736
1737 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1738 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1739
1740 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl);
1741
1742#elif defined(RT_ARCH_ARM64)
1743 /* On ARM64 we use the RMIF+CFINV instructions to load PSTATE.CF from
1744 idxRegEfl and then SBCS for the calculation. We need all inputs and
1745 result for the two flags (AF,PF) that can't be directly derived from
1746 PSTATE.NZCV. */
1747 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1748 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
1749
1750 pCodeBuf[off++] = Armv8A64MkInstrRmif(idxRegEfl, (X86_EFL_CF_BIT - 1) & 63, RT_BIT_32(1) /*fMask=C*/);
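    /* The AArch64 carry is the inverse of the x86 borrow for subtractions, hence
       the CFINV after loading PSTATE.C from EFLAGS.CF (and the fInvertCarry=true
       passed to the EFLAGS helper below). */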
1751 pCodeBuf[off++] = ARMV8_A64_INSTR_CFINV;
1752 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1753 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1754 pCodeBuf[off++] = Armv8A64MkInstrSbcs(idxRegDst, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/);
1755 else
1756 {
1757 /* Since the borrow (carry) flag also goes into the calculation here, shifting operands up
1758 doesn't work. So, we have to calculate carry & overflow manually. */
1759 pCodeBuf[off++] = Armv8A64MkInstrSbc(idxRegDst, idxRegDst, idxRegSrc, false /*f64Bit*/);
1760 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegDst, a_cOpBits > 8); /* NZ are okay, CV aren't.*/
1761 }
1762 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1763
1764 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl, a_cOpBits, idxRegDst,
1765 idxRegDstIn, idxRegSrc, true /*fInvertCarry*/, 0);
1766
1767 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1768 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1769 if RT_CONSTEXPR_IF(a_cOpBits < 32)
1770 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegDst, RT_BIT_32(a_cOpBits) - 1U);
1771 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1772
1773#else
1774# error "port me"
1775#endif
1776 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
1777 return off;
1778}
1779
1780
1781/**
1782 * The SBB instruction with immediate value as right operand.
1783 */
1784template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1785DECL_INLINE_THROW(uint32_t)
1786iemNativeEmit_sbb_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1787{
1788 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1789 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
1790
1791#ifdef RT_ARCH_AMD64
1792 /* On AMD64 we use BT to set EFLAGS.CF and then issue an SBB instruction
1793 with matching size to get the correct flags. */
1794 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 12);
1795
1796 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b, 0xba, 32 /*cOpBits*/, 4, idxRegEfl);
1797 pCodeBuf[off++] = X86_EFL_CF_BIT;
1798
1799 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 3, idxRegDst, uImmOp);
1800 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1801
1802 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1803
1804 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl);
1805
1806#elif defined(RT_ARCH_ARM64)
1807 /* On ARM64 we use the RMIF+CFINV instructions to load PSTATE.CF from
1808 idxRegEfl and then SBCS for the calculation. We need all inputs and
1809 result for the two flags (AF,PF) that can't be directly derived from
1810 PSTATE.NZCV. */
1811 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1812 uint8_t const idxRegImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1813 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
1814
1815 pCodeBuf[off++] = Armv8A64MkInstrRmif(idxRegEfl, (X86_EFL_CF_BIT - 1) & 63, RT_BIT_32(1) /*fMask=C*/);
1816 pCodeBuf[off++] = ARMV8_A64_INSTR_CFINV;
1817 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1818 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1819 pCodeBuf[off++] = Armv8A64MkInstrSbcs(idxRegDst, idxRegDst, idxRegImm, a_cOpBits > 32 /*f64Bit*/);
1820 else
1821 {
1822 /* Since the borrow (carry) flag also goes into the calculation here, shifting operands up
1823 doesn't work. So, we have to calculate carry & overflow manually. */
1824 pCodeBuf[off++] = Armv8A64MkInstrSbc(idxRegDst, idxRegDst, idxRegImm, false /*f64Bit*/);
1825 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegDst, a_cOpBits > 8); /* NZ are okay, CV aren't.*/
1826 }
1827 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1828
1829 iemNativeRegFreeTmp(pReNative, idxRegImm);
1830
1831 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl, a_cOpBits, idxRegDst,
1832 idxRegDstIn, UINT8_MAX, true /*fInvertCarry*/, uImmOp);
1833
1834 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1835 if RT_CONSTEXPR_IF(a_cOpBits < 32)
1836 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegDst, RT_BIT_32(a_cOpBits) - 1U);
1837 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1838
1839#else
1840# error "port me"
1841#endif
1842 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
1843 return off;
1844}
1845
1846
1847template<uint8_t const a_cOpBits>
1848DECL_INLINE_THROW(uint32_t)
1849iemNativeEmit_imul_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1850{
1851 RT_NOREF(idxVarDst, idxVarSrc, idxVarEfl);
1852 AssertFailed();
1853 return iemNativeEmitBrk(pReNative, off, 0x666);
1854}
1855
1856
1857template<uint8_t const a_cOpBits>
1858DECL_INLINE_THROW(uint32_t)
1859iemNativeEmit_popcnt_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1860{
1861 RT_NOREF(idxVarDst, idxVarSrc, idxVarEfl);
1862 AssertFailed();
1863 return iemNativeEmitBrk(pReNative, off, 0x666);
1864}
1865
1866
1867template<uint8_t const a_cOpBits>
1868DECL_INLINE_THROW(uint32_t)
1869iemNativeEmit_tzcnt_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1870{
1871 RT_NOREF(idxVarDst, idxVarSrc, idxVarEfl);
1872 AssertFailed();
1873 return iemNativeEmitBrk(pReNative, off, 0x666);
1874}
1875
1876
1877template<uint8_t const a_cOpBits>
1878DECL_INLINE_THROW(uint32_t)
1879iemNativeEmit_lzcnt_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1880{
1881 RT_NOREF(idxVarDst, idxVarSrc, idxVarEfl);
1882 AssertFailed();
1883 return iemNativeEmitBrk(pReNative, off, 0x666);
1884}
1885
1886
1887
1888/*********************************************************************************************************************************
1889* Shifting and Rotating. *
1890*********************************************************************************************************************************/
1891
1892
1893typedef enum
1894{
1895 kIemNativeEmitEFlagsForShiftType_Left,
1896 kIemNativeEmitEFlagsForShiftType_Right,
1897 kIemNativeEmitEFlagsForShiftType_SignedRight
1898} IEMNATIVEEMITEFLAGSFORSHIFTTYPE;
1899
1900/**
1901 * This is used by SHL, SHR and SAR emulation.
1902 *
1903 * It takes liveness stuff into account.
1904 */
1905DECL_INLINE_THROW(uint32_t)
1906iemNativeEmitEFlagsForShift(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxRegEfl, uint8_t idxRegResult,
1907 uint8_t idxRegSrc, uint8_t idxRegCount, uint8_t cOpBits, IEMNATIVEEMITEFLAGSFORSHIFTTYPE enmType,
1908 uint8_t idxRegTmp)
1909{
1910 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflTotalShift);
1911
1912RT_NOREF(pReNative, off, idxRegEfl, idxRegResult, idxRegSrc, idxRegCount, cOpBits, enmType);
1913#if 0 //def IEMNATIVE_WITH_EFLAGS_SKIPPING
1914 /*
1915 * See if we can skip this wholesale.
1916 */
1917 PCIEMLIVENESSENTRY const pLivenessEntry = &pReNative->paLivenessEntries[pReNative->idxCurCall];
1918 if ( IEMLIVENESS_STATE_ARE_STATUS_EFL_TO_BE_CLOBBERED(pLivenessEntry)
1919 && !(pReNative->fMc & IEM_MC_F_WITH_FLAGS))
1920 {
1921 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflSkippedShift);
1922 pReNative->fSkippingEFlags |= X86_EFL_STATUS_BITS;
1923# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
1924 off = iemNativeEmitOrImmIntoVCpuU32(pReNative, off, X86_EFL_STATUS_BITS, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
1925# endif
1926 }
1927 else
1928#endif
1929 {
1930 /*
1931 * The differences between Intel and AMD flags for SHL are:
1932 * - Intel always clears AF while AMD always sets it.
1933 * - Intel sets OF for the first shift, while AMD for the last shift.
1934 *
1935 */
1936
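    /* Rough scalar reference for the two OF flavours of SHL (illustrative only,
       assuming fCarry has already been computed as the last bit shifted out):
           if (fIntel) // OF reflects the FIRST shift step
               fOf = ((uSrc >> (cOpBits - 1)) ^ (uSrc >> (cOpBits - 2))) & 1;
           else        // AMD: OF reflects the LAST shift step
               fOf = ((uResult >> (cOpBits - 1)) & 1) ^ fCarry;
    */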
1937#ifdef RT_ARCH_AMD64
1938 /*
1939 * We capture the flags and do the additional OF and AF calculations as needed.
1940 */
1941 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 64);
1942 /** @todo kIemNativeEmitEFlagsForShiftType_SignedRight: we could alternatively
1943 * use LAHF here when host rax is free, since OF is cleared. */
1944 /* pushf */
1945 pCodeBuf[off++] = 0x9c;
1946 /* pop tmp */
1947 if (idxRegTmp >= 8)
1948 pCodeBuf[off++] = X86_OP_REX_B;
1949 pCodeBuf[off++] = 0x58 + (idxRegTmp & 7);
1950 /* Clear the status bits in EFLs. */
1951 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegEfl, ~X86_EFL_STATUS_BITS);
1952 uint8_t const idxTargetCpuEflFlavour = pReNative->pVCpu->iem.s.aidxTargetCpuEflFlavour[1];
1953 if (idxTargetCpuEflFlavour == IEMTARGETCPU_EFL_BEHAVIOR_NATIVE)
1954 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegTmp, X86_EFL_STATUS_BITS);
1955 else
1956 {
1957 /* and tmp, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_CF */
1958 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegTmp, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_CF);
1959 if (idxTargetCpuEflFlavour == IEMTARGETCPU_EFL_BEHAVIOR_AMD)
1960 off = iemNativeEmitOrGpr32ByImmEx(pCodeBuf, off, idxRegTmp, X86_EFL_AF);
1961 /* OR in the flags we collected. */
1962 off = iemNativeEmitOrGpr32ByGprEx(pCodeBuf, off, idxRegEfl, idxRegTmp);
1963
1964 /* Calculate OF */
1965 if (idxTargetCpuEflFlavour == IEMTARGETCPU_EFL_BEHAVIOR_AMD)
1966 {
1967 /* AMD last bit shifted: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
1968 /* bt idxRegResult, (cOpBits - 1) => CF=result-sign-bit */
1969 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b /*ud2*/, 0xba,
1970 RT_MAX(cOpBits, 16), 4, idxRegResult);
1971 pCodeBuf[off++] = cOpBits - 1;
1972 /* setc idxRegTmp */
1973 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x92, 0x0b /*ud2*/, 8, 0, idxRegTmp);
1974 /* xor idxRegTmp, idxRegEfl */
1975 off = iemNativeEmitXorGpr32ByGpr32Ex(pCodeBuf, off, idxRegTmp, idxRegEfl);
1976 /* and idxRegTmp, 1 */
1977 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegTmp, 1);
1978 /* shl idxRegTmp, X86_EFL_OF_BIT */
1979 off = iemNativeEmitShiftGpr32LeftEx(pCodeBuf, off, idxRegTmp, X86_EFL_OF_BIT);
1980 }
1981 else
1982 {
1983 /* Intel first bit shifted: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
1984 if (cOpBits <= 32)
1985 {
1986 /* mov idxRegTmp, idxRegSrc */
1987 off = iemNativeEmitLoadGprFromGpr32Ex(pCodeBuf, off, idxRegTmp, idxRegSrc);
1988 /* shl idxRegTmp, 1 */
1989 off = iemNativeEmitShiftGpr32LeftEx(pCodeBuf, off, idxRegTmp, 1);
1990 /* xor idxRegTmp, idxRegSrc */
1991 off = iemNativeEmitXorGprByGprEx(pCodeBuf, off, idxRegTmp, idxRegSrc);
1992 /* shr idxRegTmp, cOpBits - X86_EFL_OF_BIT - 1 or shl idxRegTmp, X86_EFL_OF_BIT - cOpBits + 1 */
1993 if (cOpBits >= X86_EFL_OF_BIT)
1994 off = iemNativeEmitShiftGpr32RightEx(pCodeBuf, off, idxRegTmp, cOpBits - X86_EFL_OF_BIT - 1);
1995 else
1996 off = iemNativeEmitShiftGpr32LeftEx(pCodeBuf, off, idxRegTmp, X86_EFL_OF_BIT - cOpBits + 1);
1997 }
1998 else
1999 {
2000 /* Same as above but with 64-bit GPRs. */
2001 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegTmp, idxRegSrc);
2002 off = iemNativeEmitShiftGprLeftEx(pCodeBuf, off, idxRegTmp, 1);
2003 off = iemNativeEmitXorGprByGprEx(pCodeBuf, off, idxRegTmp, idxRegSrc);
2004 off = iemNativeEmitShiftGprRightEx(pCodeBuf, off, idxRegTmp, cOpBits - X86_EFL_OF_BIT - 1);
2005 }
2006 /* and idxRegTmp, X86_EFL_OF */
2007 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegTmp, X86_EFL_OF);
2008 }
2009 }
2010 /* Or in the collected flag(s) */
2011 off = iemNativeEmitOrGpr32ByGprEx(pCodeBuf, off, idxRegEfl, idxRegTmp);
2012
2013#elif defined(RT_ARCH_ARM64)
2014 /*
2015 * Calculate flags.
2016 */
2017 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 20);
2018
2019 /* Clear the status bits. ~0x8D5 (or ~0x8FD) can't be AND immediate, so use idxRegTmp for constant. */
2020 off = iemNativeEmitLoadGpr32ImmEx(pCodeBuf, off, idxRegTmp, ~X86_EFL_STATUS_BITS);
2021 off = iemNativeEmitAndGpr32ByGpr32Ex(pCodeBuf, off, idxRegEfl, idxRegTmp);
2022
2023 /* N,Z -> SF,ZF */
2024 if (cOpBits < 32)
2025 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegResult, cOpBits > 8); /* sets NZ */
2026 else
2027 pCodeBuf[off++] = Armv8A64MkInstrAnds(ARMV8_A64_REG_XZR, idxRegResult, idxRegResult, cOpBits > 32 /*f64Bit*/);
2028 pCodeBuf[off++] = Armv8A64MkInstrMrs(idxRegTmp, ARMV8_AARCH64_SYSREG_NZCV); /* Bits: 31=N; 30=Z; 29=C; 28=V; */
2029 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegTmp, 30);
2030 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_ZF_BIT, 2, false /*f64Bit*/);
2031 AssertCompile(X86_EFL_ZF_BIT + 1 == X86_EFL_SF_BIT);
2032
2033 /* Calculate 8-bit parity of the result. */
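    /* The three EOR+LSR steps below fold the low byte onto itself, i.e. the scalar
       x ^= x >> 4; x ^= x >> 2; x ^= x >> 1; leaving the XOR of bits 0..7 in bit 0.
       The final EOR with 1 inverts that bit, since x86 PF means EVEN parity. */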
2034 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegResult, idxRegResult, false /*f64Bit*/,
2035 4 /*offShift6*/, kArmv8A64InstrShift_Lsr);
2036 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegTmp, idxRegTmp, false /*f64Bit*/,
2037 2 /*offShift6*/, kArmv8A64InstrShift_Lsr);
2038 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegTmp, idxRegTmp, false /*f64Bit*/,
2039 1 /*offShift6*/, kArmv8A64InstrShift_Lsr);
2040 Assert(Armv8A64ConvertImmRImmS2Mask32(0, 0) == 1);
2041 pCodeBuf[off++] = Armv8A64MkInstrEorImm(idxRegTmp, idxRegTmp, 0, 0, false /*f64Bit*/);
2042 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_PF_BIT, 1, false /*f64Bit*/);
2043
2044 /* Calculate carry - the last bit shifted out of the input value. */
2045 if (enmType == kIemNativeEmitEFlagsForShiftType_Left)
2046 {
2047 /* CF = (idxRegSrc >> (cOpBits - idxRegCount)) & 1 */
2048 pCodeBuf[off++] = Armv8A64MkInstrMovZ(idxRegTmp, cOpBits);
2049 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegTmp, idxRegTmp, idxRegCount, false /*f64Bit*/, cOpBits < 32 /*fSetFlags*/);
2050 if (cOpBits < 32)
2051 pCodeBuf[off++] = Armv8A64MkInstrBCond(kArmv8InstrCond_Cc, 3); /* 16 or 8 bit: CF is clear if all shifted out */
2052 pCodeBuf[off++] = Armv8A64MkInstrLsrv(idxRegTmp, idxRegSrc, idxRegTmp, cOpBits > 32);
2053 }
2054 else
2055 {
2056 /* CF = (idxRegSrc >> (idxRegCount - 1)) & 1 */
2057 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegTmp, idxRegCount, 1, false /*f64Bit*/);
2058 pCodeBuf[off++] = Armv8A64MkInstrLsrv(idxRegTmp, idxRegSrc, idxRegTmp, cOpBits > 32);
2059 }
2060 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_CF_BIT, 1, false /*f64Bit*/);
2061
2062 uint8_t const idxTargetCpuEflFlavour = pReNative->pVCpu->iem.s.aidxTargetCpuEflFlavour[0];
2063 if (idxTargetCpuEflFlavour != IEMTARGETCPU_EFL_BEHAVIOR_AMD)
2064 {
2065 /* Intel: OF = first bit shifted: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
2066 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegSrc, idxRegSrc, cOpBits > 32, 1 /*left shift count*/);
2067 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegTmp, cOpBits - 1, cOpBits > 32);
2068 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_OF_BIT, 1, false /*f64Bit*/);
2069 }
2070 else
2071 {
2072 /* AMD: OF = last bit shifted: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
2073 AssertCompile(X86_EFL_CF_BIT == 0);
2074 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegEfl, idxRegResult, cOpBits > 32, /* ASSUMES CF calculated! */
2075 cOpBits - 1, kArmv8A64InstrShift_Lsr);
2076 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_OF_BIT, 1, false /*f64Bit*/);
2077
2078 /* AMD unconditionally sets AF. */
2079 Assert(Armv8A64ConvertImmRImmS2Mask32(0, 32 - X86_EFL_AF_BIT) == X86_EFL_AF);
2080 pCodeBuf[off++] = Armv8A64MkInstrOrrImm(idxRegEfl, idxRegEfl, 0, 32 - X86_EFL_AF_BIT, false /*f64Bit*/);
2081 }
2082#else
2083# error "port me"
2084#endif
2085 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2086
2087#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
2088 if (pReNative->fSkippingEFlags)
2089 Log5(("EFLAGS: fSkippingEFlags %#x -> 0 (iemNativeEmitEFlagsForShift)\n", pReNative->fSkippingEFlags));
2090 pReNative->fSkippingEFlags = 0;
2091# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
2092 off = iemNativeEmitStoreImmToVCpuU32(pReNative, off, 0, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
2093# endif
2094#endif
2095 }
2096 return off;
2097}
2098
2099
2100DECL_INLINE_THROW(uint32_t)
2101iemNativeEmit_shl_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2102 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2103{
2104 /* Note! Since we're doing some branching here, we need to allocate all
2105 registers we need before the jump or we may end up with invalid
2106 register state if the branch is taken. */
2107 uint8_t const idxRegTmp = iemNativeRegAllocTmp(pReNative, &off); /* Do this first in hope we'll get EAX. */
2108 uint8_t const idxRegCount = iemNativeVarRegisterAcquire(pReNative, idxVarCount, &off, true /*fInitialized*/); /* modified on arm64 */
2109 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
2110 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
2111
2112#ifdef RT_ARCH_AMD64
2113 /* Make sure IEM_MC_NATIVE_AMD64_HOST_REG_FOR_LOCAL was used. */
2114 AssertStmt(idxRegCount == X86_GREG_xCX, IEMNATIVE_DO_LONGJMP(pReNative, VERR_IEM_EMIT_UNEXPECTED_VAR_REGISTER));
2115
2116 /* We only need a copy of the input value if the target CPU differs from the host CPU. */
2117 uint8_t const idxRegDstIn = pReNative->pVCpu->iem.s.aidxTargetCpuEflFlavour[1] == IEMTARGETCPU_EFL_BEHAVIOR_NATIVE
2118 ? UINT8_MAX : iemNativeRegAllocTmp(pReNative, &off);
2119 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4+2+3+4);
2120
2121 /* Check if it's NOP before we do anything. */
2122 off = iemNativeEmitTestAnyBitsInGpr8Ex(pCodeBuf, off, idxRegCount, cOpBits <= 32 ? 0x1f : 0x3f);
2123 uint32_t const offFixup = off;
2124 off = iemNativeEmitJccToFixedEx(pCodeBuf, off, off /*8-bit should be enough */, kIemNativeInstrCond_z);
2125
2126 if (idxRegDstIn != UINT8_MAX)
2127 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
2128 off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0xd2, 0xd3, cOpBits, 4, idxRegDst);
2129
2130#elif defined(RT_ARCH_ARM64)
2131 /* We always need a copy of the input value (unless we can skip the EFLAGS calcs). */
2132 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
2133 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6);
2134
2135 /* Check if it's NOP before we do anything. We MODIFY idxRegCount here! */
2136 Assert(Armv8A64ConvertImmRImmS2Mask32(4, 0) == 0x1f);
2137 Assert(Armv8A64ConvertImmRImmS2Mask32(5, 0) == 0x3f);
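    /* The ANDS below both masks the count the way x86 does (mod 32, or mod 64 for
       64-bit operands) and sets Z so the following branch can skip the whole
       operation for a zero count. */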
2138 pCodeBuf[off++] = Armv8A64MkInstrAndsImm(idxRegCount, idxRegCount, cOpBits > 32 ? 5 : 4, 0, false /*f64Bit*/);
2139 uint32_t const offFixup = off;
2140 off = iemNativeEmitJccToFixedEx(pCodeBuf, off, off, kArmv8InstrCond_Eq);
2141
2142 pCodeBuf[off++] = Armv8A64MkInstrMov(idxRegDstIn, idxRegDst);
2143 pCodeBuf[off++] = Armv8A64MkInstrLslv(idxRegDst, idxRegDst, idxRegCount, cOpBits > 32 /*f64Bit*/);
2144 if (cOpBits < 32)
2145 {
2146 Assert(Armv8A64ConvertImmRImmS2Mask32(7, 0) == 0xff);
2147 Assert(Armv8A64ConvertImmRImmS2Mask32(15, 0) == 0xffff);
2148 pCodeBuf[off++] = Armv8A64MkInstrAndImm(idxRegDst, idxRegDst, cOpBits - 1, 0, false /*f64Bit*/);
2149 }
2150
2151#else
2152# error "port me"
2153#endif
2154
2155 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2156 off = iemNativeEmitEFlagsForShift(pReNative, off, idxRegEfl, idxRegDst, idxRegDstIn, idxRegCount,
2157 cOpBits, kIemNativeEmitEFlagsForShiftType_Left, idxRegTmp);
2158
2159 /* fixup the jump */
2160 iemNativeFixupFixedJump(pReNative, offFixup, off);
2161
2162#ifdef RT_ARCH_AMD64
2163 if (idxRegDstIn != UINT8_MAX)
2164#endif
2165 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
2166 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
2167 iemNativeVarRegisterRelease(pReNative, idxVarDst);
2168 iemNativeVarRegisterRelease(pReNative, idxVarCount);
2169 iemNativeRegFreeTmp(pReNative, idxRegTmp);
2170 return off;
2171}
2172
2173
2174DECL_INLINE_THROW(uint32_t)
2175iemNativeEmit_shr_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2176 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2177{
2178 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2179 AssertFailed();
2180 return iemNativeEmitBrk(pReNative, off, 0x666);
2181}
2182
2183
2184DECL_INLINE_THROW(uint32_t)
2185iemNativeEmit_sar_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2186 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2187{
2188 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2189 AssertFailed();
2190 return iemNativeEmitBrk(pReNative, off, 0x666);
2191}
2192
2193
2194DECL_INLINE_THROW(uint32_t)
2195iemNativeEmit_rol_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2196 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2197{
2198 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2199 AssertFailed();
2200 return iemNativeEmitBrk(pReNative, off, 0x666);
2201}
2202
2203
2204DECL_INLINE_THROW(uint32_t)
2205iemNativeEmit_ror_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2206 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2207{
2208 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2209 AssertFailed();
2210 return iemNativeEmitBrk(pReNative, off, 0x666);
2211}
2212
2213
2214DECL_INLINE_THROW(uint32_t)
2215iemNativeEmit_rcl_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2216 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2217{
2218 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2219 AssertFailed();
2220 return iemNativeEmitBrk(pReNative, off, 0x666);
2221}
2222
2223
2224DECL_INLINE_THROW(uint32_t)
2225iemNativeEmit_rcr_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2226 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2227{
2228 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2229 AssertFailed();
2230 return iemNativeEmitBrk(pReNative, off, 0x666);
2231}
2232
2233
2234
2235#ifdef IEMNATIVE_WITH_SIMD_REG_ALLOCATOR
2236/*********************************************************************************************************************************
2237* SIMD emitters. *
2238*********************************************************************************************************************************/
2239
2240/**
2241 * Common emitter for packed logical instructions (POR, PXOR, PAND & friends).
2242 */
2243#ifdef RT_ARCH_AMD64
2244# define IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(a_Instr, a_enmArmOp, a_bOpcX86) \
2245 DECL_INLINE_THROW(uint32_t) \
2246 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2247 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2248 { \
2249 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2250 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2251 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2252 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2253 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2254 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2255 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2256 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2257 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2258 pCodeBuf[off++] = 0x0f; \
2259 pCodeBuf[off++] = (a_bOpcX86); \
2260 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2261 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2262 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2263 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2264 return off; \
2265 } \
2266 DECL_INLINE_THROW(uint32_t) \
2267 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2268 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2269 { \
2270 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2271 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2272 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2273 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2274 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2275 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2276 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2277 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2278 pCodeBuf[off++] = 0x0f; \
2279 pCodeBuf[off++] = (a_bOpcX86); \
2280 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2281 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2282 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2283 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2284 return off; \
2285 } \
2286 typedef int ignore_semicolon
2287#elif defined(RT_ARCH_ARM64)
2288# define IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(a_Instr, a_enmArmOp, a_bOpcX86) \
2289 DECL_INLINE_THROW(uint32_t) \
2290 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2291 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2292 { \
2293 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2294 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2295 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2296 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2297 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2298 pCodeBuf[off++] = Armv8A64MkVecInstrLogical((a_enmArmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc); \
2299 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2300 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2301 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2302 return off; \
2303 } \
2304 DECL_INLINE_THROW(uint32_t) \
2305 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2306 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2307 { \
2308 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2309 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2310 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2311 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2312 pCodeBuf[off++] = Armv8A64MkVecInstrLogical((a_enmArmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc); \
2313 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2314 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2315 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2316 return off; \
2317 } \
2318 typedef int ignore_semicolon
2319#else
2320# error "Port me"
2321#endif
2322
2323/* POR, ORPS, ORPD. */
2324IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(por, kArmv8VecInstrLogicOp_Orr, 0xeb);
2325/* PXOR, XORPS, XORPD. */
2326IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(pxor, kArmv8VecInstrLogicOp_Eor, 0xef);
2327/* PAND, ANDPS, ANDPD. */
2328IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(pand, kArmv8VecInstrLogicOp_And, 0xdb);
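/* Each instantiation above defines a register/register and a register/variable
   emitter via RT_CONCAT3, e.g. the pand line produces iemNativeEmit_pand_rr_u128()
   and iemNativeEmit_pand_rv_u128(). */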
2329
2330
2331/**
2332 * Common emitter for the shift right with immediate instructions.
2333 */
2334#ifdef RT_ARCH_AMD64
2335# define IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(a_Instr, a_cShiftMax, a_ArmElemSz, a_bOpcX86) \
2336 DECL_INLINE_THROW(uint32_t) \
2337 RT_CONCAT3(iemNativeEmit_,a_Instr,_ri_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2338 uint8_t const idxSimdGstRegDst, uint8_t const bImm) \
2339 { \
2340 if (bImm) \
2341 { \
2342 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2343 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2344 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6); \
2345 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2346 if (idxSimdRegDst >= 8) \
2347 pCodeBuf[off++] = X86_OP_REX_B; \
2348 pCodeBuf[off++] = 0x0f; \
2349 pCodeBuf[off++] = (a_bOpcX86); \
2350 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 2, idxSimdRegDst & 7); \
2351 pCodeBuf[off++] = bImm; \
2352 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2353 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2354 } \
2355 /* Immediate 0 is a nop. */ \
2356 return off; \
2357 } \
2358 typedef int ignore_semicolon
2359#elif defined(RT_ARCH_ARM64)
2360# define IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(a_Instr, a_cShiftMax, a_ArmElemSz, a_bOpcX86) \
2361 DECL_INLINE_THROW(uint32_t) \
2362 RT_CONCAT3(iemNativeEmit_,a_Instr,_ri_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2363 uint8_t const idxSimdGstRegDst, uint8_t const bImm) \
2364 { \
2365 if (bImm) \
2366 { \
2367 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2368 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2369 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2370 pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegDst, idxSimdRegDst, RT_MIN(bImm, (a_cShiftMax)), (a_ArmElemSz)); \
2371 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2372 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2373 } \
2374 /* Immediate 0 is a nop. */ \
2375 return off; \
2376 } \
2377 typedef int ignore_semicolon
2378#else
2379# error "Port me"
2380#endif
2381
2382IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(psrlw, 16, kArmv8InstrShiftSz_U16, 0x71);
2383IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(psrld, 32, kArmv8InstrShiftSz_U32, 0x72);
2384IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(psrlq, 64, kArmv8InstrShiftSz_U64, 0x73);
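/* Note: on AMD64 these use the 0x0f 0x71/0x72/0x73 immediate forms with /2
   (shift right logical); the shift-left emitters further down use the same
   opcodes with /6. */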
2385
2386
2387/**
2388 * Common emitter for the shift left with immediate instructions.
2389 */
2390#ifdef RT_ARCH_AMD64
2391# define IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(a_Instr, a_cShiftMax, a_ArmElemSz, a_bOpcX86) \
2392 DECL_INLINE_THROW(uint32_t) \
2393 RT_CONCAT3(iemNativeEmit_,a_Instr,_ri_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2394 uint8_t const idxSimdGstRegDst, uint8_t const bImm) \
2395 { \
2396 if (bImm) \
2397 { \
2398 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2399 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2400 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6); \
2401 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2402 if (idxSimdRegDst >= 8) \
2403 pCodeBuf[off++] = X86_OP_REX_B; \
2404 pCodeBuf[off++] = 0x0f; \
2405 pCodeBuf[off++] = (a_bOpcX86); \
2406 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 6, idxSimdRegDst & 7); \
2407 pCodeBuf[off++] = bImm; \
2408 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2409 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2410 } \
2411 /* Immediate 0 is a nop. */ \
2412 return off; \
2413 } \
2414 typedef int ignore_semicolon
2415#elif defined(RT_ARCH_ARM64)
2416# define IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(a_Instr, a_cShiftMax, a_ArmElemSz, a_bOpcX86) \
2417 DECL_INLINE_THROW(uint32_t) \
2418 RT_CONCAT3(iemNativeEmit_,a_Instr,_ri_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2419 uint8_t const idxSimdGstRegDst, uint8_t const bImm) \
2420 { \
2421 if (bImm) /* bImm == 0 is a nop */ \
2422 { \
2423 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2424 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2425 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2426 if (bImm < (a_cShiftMax)) \
2427 pCodeBuf[off++] = Armv8A64MkVecInstrShlImm(idxSimdRegDst, idxSimdRegDst, bImm, (a_ArmElemSz)); \
2428 else /* Everything >= a_cShiftMax sets the register to zero. */ \
2429 pCodeBuf[off++] = Armv8A64MkVecInstrEor(idxSimdRegDst, idxSimdRegDst, idxSimdRegDst); \
2430 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2431 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2432 } \
2433 return off; \
2434 } \
2435 typedef int ignore_semicolon
2436#else
2437# error "Port me"
2438#endif
2439
2440IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(psllw, 16, kArmv8InstrShiftSz_U16, 0x71);
2441IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(pslld, 32, kArmv8InstrShiftSz_U32, 0x72);
2442IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(psllq, 64, kArmv8InstrShiftSz_U64, 0x73);
2443
2444
2445/**
2446 * Common emitter for packed arithmetic instructions.
2447 */
2448#ifdef RT_ARCH_AMD64
2449# define IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(a_Instr, a_enmArmOp, a_ArmElemSz, a_bOpcX86) \
2450 DECL_INLINE_THROW(uint32_t) \
2451 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2452 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2453 { \
2454 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2455 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2456 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2457 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2458 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2459 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2460 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2461 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2462 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2463 pCodeBuf[off++] = 0x0f; \
2464 pCodeBuf[off++] = (a_bOpcX86); \
2465 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2466 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2467 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2468 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2469 return off; \
2470 } \
2471 DECL_INLINE_THROW(uint32_t) \
2472 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2473 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2474 { \
2475 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2476 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2477 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2478 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2479 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2480 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2481 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2482 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2483 pCodeBuf[off++] = 0x0f; \
2484 pCodeBuf[off++] = (a_bOpcX86); \
2485 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2486 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2487 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2488 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2489 return off; \
2490 } \
2491 typedef int ignore_semicolon
2492#elif defined(RT_ARCH_ARM64)
2493# define IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(a_Instr, a_enmArmOp, a_ArmElemSz, a_bOpcX86) \
2494 DECL_INLINE_THROW(uint32_t) \
2495 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2496 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2497 { \
2498 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2499 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2500 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2501 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2502 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2503 pCodeBuf[off++] = Armv8A64MkVecInstrArithOp((a_enmArmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc, (a_ArmElemSz)); \
2504 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2505 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2506 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2507 return off; \
2508 } \
2509 DECL_INLINE_THROW(uint32_t) \
2510 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2511 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2512 { \
2513 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2514 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2515 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2516 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2517 pCodeBuf[off++] = Armv8A64MkVecInstrArithOp((a_enmArmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc, (a_ArmElemSz)); \
2518 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2519 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2520 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2521 return off; \
2522 } \
2523 typedef int ignore_semicolon
2524#else
2525# error "Port me"
2526#endif
2527
2528/*
2529 * PADDx.
2530 */
2531IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddb, kArmv8VecInstrArithOp_Add, kArmv8VecInstrArithSz_8, 0xfc);
2532IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddw, kArmv8VecInstrArithOp_Add, kArmv8VecInstrArithSz_16, 0xfd);
2533IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddd, kArmv8VecInstrArithOp_Add, kArmv8VecInstrArithSz_32, 0xfe);
2534IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddq, kArmv8VecInstrArithOp_Add, kArmv8VecInstrArithSz_64, 0xd4);
2535
2536/*
2537 * PSUBx.
2538 */
2539IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(psubb, kArmv8VecInstrArithOp_Sub, kArmv8VecInstrArithSz_8, 0xf8);
2540IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(psubw, kArmv8VecInstrArithOp_Sub, kArmv8VecInstrArithSz_16, 0xf9);
2541IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(psubd, kArmv8VecInstrArithOp_Sub, kArmv8VecInstrArithSz_32, 0xfa);
2542IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(psubq, kArmv8VecInstrArithOp_Sub, kArmv8VecInstrArithSz_64, 0xfb);
2543
2544/*
2545 * PADDUSx.
2546 */
2547IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddusb, kArmv8VecInstrArithOp_UnsignSat_Add, kArmv8VecInstrArithSz_8, 0xdc);
2548IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddusw, kArmv8VecInstrArithOp_UnsignSat_Add, kArmv8VecInstrArithSz_16, 0xdd);
2549
2550/*
2551 * PMULLx.
2552 */
2553IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(pmullw, kArmv8VecInstrArithOp_Mul, kArmv8VecInstrArithSz_16, 0xd5);
2554
2555
2556/**
2557 * Common emitter for the pcmpeqb/pcmpeqw/pcmpeqd instructions.
2558 */
2559#ifdef RT_ARCH_AMD64
2560# define IEMNATIVE_NATIVE_EMIT_PCMP_U128(a_Instr, a_enmOp, a_ArmElemSz, a_bOpcX86) \
2561 DECL_INLINE_THROW(uint32_t) \
2562 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2563 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2564 { \
2565 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2566 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2567 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2568 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2569 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2570 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2571 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2572 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2573 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2574 pCodeBuf[off++] = 0x0f; \
2575 pCodeBuf[off++] = (a_bOpcX86); \
2576 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2577 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2578 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2579 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2580 return off; \
2581 } \
2582 DECL_INLINE_THROW(uint32_t) \
2583 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2584 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2585 { \
2586 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2587 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2588 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2589 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2590 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2591 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2592 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2593 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2594 pCodeBuf[off++] = 0x0f; \
2595 pCodeBuf[off++] = (a_bOpcX86); \
2596 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2597 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2598 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2599 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2600 return off; \
2601 } \
2602 typedef int ignore_semicolon
2603#elif defined(RT_ARCH_ARM64)
2604# define IEMNATIVE_NATIVE_EMIT_PCMP_U128(a_Instr, a_enmOp, a_ArmElemSz, a_bOpcX86) \
2605 DECL_INLINE_THROW(uint32_t) \
2606 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2607 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2608 { \
2609 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2610 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2611 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2612 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2613 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2614 pCodeBuf[off++] = Armv8A64MkVecInstrCmp((a_enmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc, (a_ArmElemSz)); \
2615 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2616 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2617 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2618 return off; \
2619 } \
2620 DECL_INLINE_THROW(uint32_t) \
2621 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2622 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2623 { \
2624 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2625 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2626 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2627 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2628 pCodeBuf[off++] = Armv8A64MkVecInstrCmp((a_enmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc, (a_ArmElemSz)); \
2629 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2630 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2631 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2632 return off; \
2633 } \
2634 typedef int ignore_semicolon
2635#else
2636# error "Port me"
2637#endif
2638
2639IEMNATIVE_NATIVE_EMIT_PCMP_U128(pcmpeqb, kArmv8VecInstrCmpOp_Eq, kArmv8VecInstrArithSz_8, 0x74);
2640IEMNATIVE_NATIVE_EMIT_PCMP_U128(pcmpeqw, kArmv8VecInstrCmpOp_Eq, kArmv8VecInstrArithSz_16, 0x75);
2641IEMNATIVE_NATIVE_EMIT_PCMP_U128(pcmpeqd, kArmv8VecInstrCmpOp_Eq, kArmv8VecInstrArithSz_32, 0x76);
2642
2643
2644/**
2645 * Emitter for the pmovmskb instruction.
2646 */
2647DECL_INLINE_THROW(uint32_t)
2648iemNativeEmit_pmovmskb_rr_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2649 uint8_t const idxGstRegDst, uint8_t const idxSimdGstRegSrc)
2650{
2651#ifdef RT_ARCH_AMD64
2652 uint8_t const idxRegDst = iemNativeRegAllocTmpForGuestReg(pReNative, &off, IEMNATIVEGSTREG_GPR(idxGstRegDst),
2653 kIemNativeGstRegUse_ForFullWrite);
2654 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off,
2655 IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc),
2656 kIemNativeGstSimdRegLdStSz_Low128,
2657 kIemNativeGstRegUse_ReadOnly);
2658 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
2659
2660 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
2661 if (idxRegDst >= 8 || idxSimdRegSrc >= 8)
2662 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
2663 | (idxRegDst >= 8 ? X86_OP_REX_R : 0);
2664 pCodeBuf[off++] = 0x0f;
2665 pCodeBuf[off++] = 0xd7;
2666 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegDst & 7, idxSimdRegSrc & 7);
2667
2668 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc);
2669 iemNativeRegFreeTmp(pReNative, idxRegDst);
2670
2671#elif defined(RT_ARCH_ARM64)
2672 uint8_t const idxRegDst = iemNativeRegAllocTmpForGuestReg(pReNative, &off, IEMNATIVEGSTREG_GPR(idxGstRegDst),
2673 kIemNativeGstRegUse_ForFullWrite);
2674 uint8_t const idxRegTmp = iemNativeRegAllocTmp(pReNative, &off);
2675 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off,
2676 IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc),
2677 kIemNativeGstSimdRegLdStSz_Low128,
2678 kIemNativeGstRegUse_Calculation);
2679 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 7);
2680
2681 /*
2682 * See https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
2683 * for different approaches, as NEON doesn't have an instruction equivalent to pmovmskb, so we have to emulate it.
2684 *
2685 * As there is no way around emulating the exact semantics of pmovmskb, we use the same algorithm as the
2686 * sse2neon implementation, because it gets away without loading any constants and the base algorithm is
2687 * only 4 NEON instructions (+ 3 for extracting the result to a general register).
2688 *
2689 * The following illustrates the algorithm:
2690 *
2691 * Byte vector Element -> 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
2692 * Instruction
2693 * |
2694 * V
2695 * Axxxxxxx Bxxxxxxx Cxxxxxxx Dxxxxxxx Exxxxxxx Fxxxxxxx Gxxxxxxx Hxxxxxxx Ixxxxxxx Jxxxxxxx Kxxxxxxx Lxxxxxxx Mxxxxxxx Nxxxxxxx Oxxxxxxx Pxxxxxxx
2696 * USHR v.16B, v.16B, #7 0000000A 0000000B 0000000C 0000000D 0000000E 0000000F 0000000G 0000000H 0000000I 0000000J 0000000K 0000000L 0000000M 0000000N 0000000O 0000000P
2697 * USRA v.8H, v.8H, #7 00000000 000000AB 00000000 000000CD 00000000 000000EF 00000000 000000GH 00000000 000000IJ 00000000 000000KL 00000000 000000MN 00000000 000000OP
2698 * USRA v.4S, v.4S, #14 00000000 00000000 00000000 0000ABCD 00000000 00000000 00000000 0000EFGH 00000000 00000000 00000000 0000IJKL 00000000 00000000 00000000 0000MNOP
2699 * USRA v.2D, v.2D, #28 00000000 00000000 00000000 00000000 00000000 00000000 00000000 ABCDEFGH 00000000 00000000 00000000 00000000 00000000 00000000 00000000 IJKLMNOP
2700 *
2701 * The extraction process
2702 * UMOV wTMP, v.16B[8] 00000000 00000000 00000000 00000000 00000000 00000000 00000000 ABCDEFGH
2703 * UMOV wRES, v.16B[0] 00000000 00000000 00000000 00000000 00000000 00000000 00000000 IJKLMNOP
2704 * ORR xRES, xRES, xTMP, LSL #8 00000000 00000000 00000000 00000000 00000000 00000000 ABCDEFGH IJKLMNOP
2705 */
2706 pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegSrc, idxSimdRegSrc, 7, kArmv8InstrShiftSz_U8);
2707 pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegSrc, idxSimdRegSrc, 7, kArmv8InstrShiftSz_U16, true /*fUnsigned*/, false /*fRound*/, true /*fAccum*/);
2708 pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegSrc, idxSimdRegSrc, 14, kArmv8InstrShiftSz_U32, true /*fUnsigned*/, false /*fRound*/, true /*fAccum*/);
2709 pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegSrc, idxSimdRegSrc, 28, kArmv8InstrShiftSz_U64, true /*fUnsigned*/, false /*fRound*/, true /*fAccum*/);
2710 pCodeBuf[off++] = Armv8A64MkVecInstrUmov(idxRegTmp, idxSimdRegSrc, 8, kArmv8InstrUmovInsSz_U8, false /*fDst64Bit*/);
2711 pCodeBuf[off++] = Armv8A64MkVecInstrUmov(idxRegDst, idxSimdRegSrc, 0, kArmv8InstrUmovInsSz_U8, false /*fDst64Bit*/);
2712 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDst, idxRegDst, idxRegTmp, true /*f64Bit*/, 8 /*offShift6*/);
2713
2714 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc);
2715 iemNativeRegFreeTmp(pReNative, idxRegTmp);
2716 iemNativeRegFreeTmp(pReNative, idxRegDst);
2717
2718#else
2719# error "Port me"
2720#endif
2721 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2722 return off;
2723}
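
/*
 * Illustrative note (not part of the original emitter): a minimal scalar sketch of what pmovmskb has to
 * produce, i.e. the most significant bit of each of the 16 source bytes gathered into bits 0..15 of the
 * destination GPR with the remaining bits cleared.  Plain <stdint.h> types are assumed and the helper
 * name is made up for illustration only.
 */
#if 0 /* reference sketch, not compiled */
static uint32_t pmovmskbRefU128(const uint8_t abSrc[16])
{
    uint32_t fMask = 0;
    for (unsigned i = 0; i < 16; i++)
        fMask |= (uint32_t)(abSrc[i] >> 7) << i; /* copy the MSB of byte i into result bit i */
    return fMask;
}
#endif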
2724
2725
2726/**
2727 * Common emitter for the PACKUSWB instructions - guest register / guest register variant.
2728 */
2729DECL_INLINE_THROW(uint32_t)
2730iemNativeEmit_packuswb_rr_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2731 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc)
2732{
2733 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
2734 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate);
2735 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc),
2736 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly);
2737
2738#ifdef RT_ARCH_AMD64
2739 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
2740
2741 /* packuswb xmm, xmm */
2742 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
2743 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8)
2744 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
2745 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0);
2746 pCodeBuf[off++] = 0x0f;
2747 pCodeBuf[off++] = 0x67;
2748 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7);
2749
2750#elif defined(RT_ARCH_ARM64)
2751 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2);
2752
2753 pCodeBuf[off++] = Armv8A64MkVecInstrQxtn(kArmv8VecInstrQxtnOp_Sqxtun, false /*fUpper*/, idxSimdRegDst, idxSimdRegDst, kArmv8VecInstrArithSz_8);
2754 pCodeBuf[off++] = Armv8A64MkVecInstrQxtn(kArmv8VecInstrQxtnOp_Sqxtun, true /*fUpper*/, idxSimdRegDst, idxSimdRegSrc, kArmv8VecInstrArithSz_8);
2755
2756#else
2757# error "port me"
2758#endif
2759
2760 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
2761 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc);
2762
2763 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2764 return off;
2765}
2766
2767
2768/**
2769 * Common emitter for the PACKUSWB instructions - guest register / recompiler variable variant.
2770 */
2771DECL_INLINE_THROW(uint32_t)
2772iemNativeEmit_packuswb_rv_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2773 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc)
2774{
2775 IEMNATIVE_ASSERT_VAR_IDX(pReNative, idxVarSrc);
2776 IEMNATIVE_ASSERT_VAR_SIZE(pReNative, idxVarSrc, sizeof(RTUINT128U));
2777
2778 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
2779 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate);
2780 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
2781
2782
2783#ifdef RT_ARCH_AMD64
2784 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
2785
2786 /* packuswb xmm, xmm */
2787 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
2788 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8)
2789 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
2790 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0);
2791 pCodeBuf[off++] = 0x0f;
2792 pCodeBuf[off++] = 0x67;
2793 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7);
2794
2795#elif defined(RT_ARCH_ARM64)
2796 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2);
2797
2798 pCodeBuf[off++] = Armv8A64MkVecInstrQxtn(kArmv8VecInstrQxtnOp_Sqxtun, false /*fUpper*/, idxSimdRegDst, idxSimdRegDst, kArmv8VecInstrArithSz_8);
2799 pCodeBuf[off++] = Armv8A64MkVecInstrQxtn(kArmv8VecInstrQxtnOp_Sqxtun, true /*fUpper*/, idxSimdRegDst, idxSimdRegSrc, kArmv8VecInstrArithSz_8);
2800
2801#else
2802# error "port me"
2803#endif
2804
2805 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
2806 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
2807
2808 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2809 return off;
2810}
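
/*
 * Illustrative note (not part of the original emitters): a minimal scalar sketch of the PACKUSWB
 * semantics both variants above reproduce - each signed 16-bit word is saturated to an unsigned byte
 * (which is why the ARM64 path uses SQXTUN), the low eight result bytes coming from the destination
 * operand and the high eight from the source operand.  Plain <stdint.h> types are assumed and the
 * helper names are made up for illustration only.
 */
#if 0 /* reference sketch, not compiled */
static uint8_t packuswbSatU8Ref(int16_t i16)
{
    return i16 < 0 ? 0 : i16 > 255 ? 255 : (uint8_t)i16; /* unsigned saturation of a signed word */
}

static void packuswbRefU128(uint8_t abDst[16], const int16_t ai16Dst[8], const int16_t ai16Src[8])
{
    for (unsigned i = 0; i < 8; i++)
    {
        abDst[i]     = packuswbSatU8Ref(ai16Dst[i]); /* low half from the destination operand */
        abDst[i + 8] = packuswbSatU8Ref(ai16Src[i]); /* high half from the source operand */
    }
}
#endif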
2811
2812
2813/**
2814 * Common emitter for the pmov{s,z}x* instructions.
2815 */
2816#ifdef RT_ARCH_AMD64
2817# define IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(a_Instr, a_fArmUnsigned, a_ArmElemSz, a_bOpcX86) \
2818 DECL_INLINE_THROW(uint32_t) \
2819 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2820 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2821 { \
2822 if (idxSimdGstRegDst == idxSimdGstRegSrc) \
2823 { \
2824 uint8_t const idxSimdReg = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2825 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2826 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6); \
2827 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2828 if (idxSimdReg >= 8) \
2829 pCodeBuf[off++] = (idxSimdReg >= 8 ? X86_OP_REX_B | X86_OP_REX_R : 0); \
2830 pCodeBuf[off++] = 0x0f; \
2831 pCodeBuf[off++] = 0x38; \
2832 pCodeBuf[off++] = (a_bOpcX86); \
2833 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdReg & 7, idxSimdReg & 7); \
2834 iemNativeSimdRegFreeTmp(pReNative, idxSimdReg); \
2835 } \
2836 else \
2837 { \
2838 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2839 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2840 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2841 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite); \
2842 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6); \
2843 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2844 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2845 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2846 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2847 pCodeBuf[off++] = 0x0f; \
2848 pCodeBuf[off++] = 0x38; \
2849 pCodeBuf[off++] = (a_bOpcX86); \
2850 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2851 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2852 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2853 } \
2854 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2855 return off; \
2856 } \
2857 DECL_INLINE_THROW(uint32_t) \
2858 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2859 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2860 { \
2861 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2862 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite); \
2863 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2864 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 7 + 6); \
2865 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; /* Transfer value from GPR to temporary vector register using pinsrq. */ \
2866 pCodeBuf[off++] = X86_OP_REX_W \
2867 | (IEMNATIVE_SIMD_REG_FIXED_TMP0 < 8 ? 0 : X86_OP_REX_R) \
2868 | (idxRegSrc < 8 ? 0 : X86_OP_REX_B); \
2869 pCodeBuf[off++] = 0x0f; \
2870 pCodeBuf[off++] = 0x3a; \
2871 pCodeBuf[off++] = 0x22; \
2872 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, IEMNATIVE_SIMD_REG_FIXED_TMP0 & 7, idxRegSrc & 7); \
2873 pCodeBuf[off++] = 0; /* QWord */\
2874 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2875 if (idxSimdRegDst >= 8 || IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8) \
2876 pCodeBuf[off++] = (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 ? X86_OP_REX_B : 0) \
2877 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2878 pCodeBuf[off++] = 0x0f; \
2879 pCodeBuf[off++] = 0x38; \
2880 pCodeBuf[off++] = (a_bOpcX86); \
2881 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, IEMNATIVE_SIMD_REG_FIXED_TMP0 & 7); \
2882 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2883 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2884 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2885 return off; \
2886 } \
2887 typedef int ignore_semicolon
2888#elif defined(RT_ARCH_ARM64)
2889# define IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(a_Instr, a_fArmUnsigned, a_ArmElemSz, a_bOpcX86) \
2890 DECL_INLINE_THROW(uint32_t) \
2891 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2892 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2893 { \
2894 if (idxSimdGstRegDst == idxSimdGstRegSrc) \
2895 { \
2896 uint8_t const idxSimdReg = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2897 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2898 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2899 pCodeBuf[off++] = Armv8A64MkVecInstrUShll(idxSimdReg, idxSimdReg, 0, (a_ArmElemSz), (a_fArmUnsigned)); \
2900 iemNativeSimdRegFreeTmp(pReNative, idxSimdReg); \
2901 } \
2902 else \
2903 { \
2904 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2905 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2906 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2907 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite); \
2908 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2909 pCodeBuf[off++] = Armv8A64MkVecInstrUShll(idxSimdRegDst, idxSimdRegSrc, 0, (a_ArmElemSz), (a_fArmUnsigned)); \
2910 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2911 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2912 } \
2913 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2914 return off; \
2915 } \
2916 DECL_INLINE_THROW(uint32_t) \
2917 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2918 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2919 { \
2920 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2921 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite); \
2922 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2923 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2); \
2924 pCodeBuf[off++] = Armv8A64MkVecInstrIns(IEMNATIVE_SIMD_REG_FIXED_TMP0, idxRegSrc, 0 /*idxElem*/); /* Transfer value from GPR to temporary vector register. */ \
2925 pCodeBuf[off++] = Armv8A64MkVecInstrUShll(idxSimdRegDst, IEMNATIVE_SIMD_REG_FIXED_TMP0, 0, (a_ArmElemSz), (a_fArmUnsigned)); \
2926 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2927 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2928 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2929 return off; \
2930 } \
2931 typedef int ignore_semicolon
2932#else
2933# error "Port me"
2934#endif
2935
2936IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovzxbw, true, kArmv8InstrShiftSz_U8, 0x30);
2937IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovzxwd, true, kArmv8InstrShiftSz_U16, 0x33);
2938IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovzxdq, true, kArmv8InstrShiftSz_U32, 0x35);
2939
2940IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovsxbw, false, kArmv8InstrShiftSz_U8, 0x20);
2941IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovsxwd, false, kArmv8InstrShiftSz_U16, 0x23);
2942IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovsxdq, false, kArmv8InstrShiftSz_U32, 0x25);
2943
2944
2945/**
2946 * Updates the MXCSR exception flags, raising any unmasked exceptions.
2947 */
2948DECL_INLINE_THROW(uint32_t)
2949iemNativeEmitMxcsrUpdate(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, uint8_t const idxSimdGstRegDst, uint8_t const idxSimdRegRes)
2950{
2951 uint8_t const idxRegMxCsr = iemNativeRegAllocTmpForGuestReg(pReNative, &off, kIemNativeGstReg_MxCsr, kIemNativeGstRegUse_ForUpdate);
2952 uint8_t const idxRegMxCsrXcptFlags = iemNativeRegAllocTmp(pReNative, &off);
2953 uint8_t const idxRegTmp = iemNativeRegAllocTmp(pReNative, &off);
2954
2955#ifdef RT_ARCH_AMD64
2956 PIEMNATIVEINSTR pbCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
2957
2958 /* stmxcsr */
2959 if (IEMNATIVE_REG_FIXED_PVMCPU >= 8)
2960 pbCodeBuf[off++] = X86_OP_REX_B;
2961 pbCodeBuf[off++] = 0x0f;
2962 pbCodeBuf[off++] = 0xae;
2963 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_MEM4, 3, IEMNATIVE_REG_FIXED_PVMCPU & 7);
2964 pbCodeBuf[off++] = RT_BYTE1(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2965 pbCodeBuf[off++] = RT_BYTE2(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2966 pbCodeBuf[off++] = RT_BYTE3(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2967 pbCodeBuf[off++] = RT_BYTE4(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2968
2969 /* Load MXCSR, mask everything except status flags and or into guest MXCSR. */
2970 off = iemNativeEmitLoadGprFromVCpuU32(pReNative, off, idxRegTmp, RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2971
2972 /* Store the flags in the MXCSR xcpt flags register. */
2973 off = iemNativeEmitLoadGprFromGpr32(pReNative, off, idxRegMxCsrXcptFlags, idxRegTmp);
2974 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegMxCsrXcptFlags, X86_MXCSR_XCPT_FLAGS);
2975
2976 /* Clear the status flags in the temporary copy and write it back to MXCSR. */
2977 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegTmp, ~X86_MXCSR_XCPT_FLAGS);
2978 off = iemNativeEmitStoreGprToVCpuU32(pReNative, off, idxRegTmp, RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2979
2980 pbCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
2981
2982 /* ldmxcsr */
2983 if (IEMNATIVE_REG_FIXED_PVMCPU >= 8)
2984 pbCodeBuf[off++] = X86_OP_REX_B;
2985 pbCodeBuf[off++] = 0x0f;
2986 pbCodeBuf[off++] = 0xae;
2987 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_MEM4, 2, IEMNATIVE_REG_FIXED_PVMCPU & 7);
2988 pbCodeBuf[off++] = RT_BYTE1(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2989 pbCodeBuf[off++] = RT_BYTE2(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2990 pbCodeBuf[off++] = RT_BYTE3(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2991 pbCodeBuf[off++] = RT_BYTE4(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2992
2993#elif defined(RT_ARCH_ARM64)
2994 PIEMNATIVEINSTR pu32CodeBuf = iemNativeInstrBufEnsure(pReNative, off, 7);
2995 pu32CodeBuf[off++] = Armv8A64MkInstrMrs(idxRegMxCsrXcptFlags, ARMV8_AARCH64_SYSREG_FPSR);
2996 pu32CodeBuf[off++] = Armv8A64MkInstrMsr(ARMV8_A64_REG_XZR, ARMV8_AARCH64_SYSREG_FPSR); /* Clear FPSR for next instruction. */
2997 pu32CodeBuf[off++] = Armv8A64MkInstrUxtb(idxRegMxCsrXcptFlags, idxRegMxCsrXcptFlags); /* Ensure there are only the exception flags set (clears QC, and any possible NZCV flags). */
2998
2999 /*
3000 * The exception flags layout differs between MXCSR and FPSR of course:
3001 *
3002 * FPSR flag (bit)        MXCSR flag (bit)
3003 *
3004 * IOC (0) ----------->   IE (0)   invalid operation
3005 * DZC (1) ----------->   ZE (2)   divide by zero
3006 * OFC (2) ----------->   OE (3)   overflow
3007 * UFC (3) ----------->   UE (4)   underflow
3008 * IXC (4) ----------->   PE (5)   precision (inexact)
3009 * IDC (7) ----------->   DE (1)   denormal
3016 */
3017 pu32CodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegMxCsrXcptFlags, 1); /* Shift the block of flags starting at DZC to the least significant bits. */
3018 pu32CodeBuf[off++] = Armv8A64MkInstrBfi(idxRegMxCsrXcptFlags, idxRegTmp, 2, 4); /* Insert DZC, OFC, UFC and IXC into the MXCSR positions. */
3019 pu32CodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegMxCsrXcptFlags, 6); /* Shift IDC (now at 6) into the LSB. */
3020 pu32CodeBuf[off++] = Armv8A64MkInstrBfi(idxRegMxCsrXcptFlags, idxRegTmp, 1, 1); /* Insert IDC into the MXCSR positions. */
3021#else
3022# error "Port me"
3023#endif
3024
3025 /*
3026 * If PE is set together with OE/UE and neither of those is masked,
3027 * PE needs to be cleared: on real hardware only OE/UE would be set
3028 * when the exception is raised, but because we run with all exceptions
3029 * masked PE gets set as well.
3030 */
3031 /** @todo On ARM we can combine the load+and into a single AND instruction. */
3032 /** @todo r=aeichner Can this be done more optimally? */
3033 uint8_t const idxRegTmp2 = iemNativeRegAllocTmp(pReNative, &off);
3034 off = iemNativeEmitLoadGprFromGpr32(pReNative, off, idxRegTmp, idxRegMxCsrXcptFlags);
3035 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegTmp, X86_MXCSR_OE | X86_MXCSR_UE);
3036 off = iemNativeEmitLoadGprFromGpr32(pReNative, off, idxRegTmp2, idxRegMxCsr);
3037 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegTmp2, X86_MXCSR_OM | X86_MXCSR_UM);
3038 off = iemNativeEmitShiftGprRight(pReNative, off, idxRegTmp2, X86_MXCSR_XCPT_MASK_SHIFT);
3039 off = iemNativeEmitInvBitsGpr(pReNative, off, idxRegTmp2, idxRegTmp2, false /*f64Bit*/);
3040 off = iemNativeEmitAndGpr32ByGpr32(pReNative, off, idxRegTmp2, idxRegTmp);
3041 off = iemNativeEmitTestAnyBitsInGpr(pReNative, off, idxRegTmp2, X86_MXCSR_OE | X86_MXCSR_UE);
3042
3043 uint32_t offFixup = off;
3044 off = iemNativeEmitJzToFixed(pReNative, off, off);
3045 off = iemNativeEmitBitClearInGpr32(pReNative, off, idxRegMxCsrXcptFlags, X86_MXCSR_PE_BIT);
3046 iemNativeFixupFixedJump(pReNative, offFixup, off);
3047 iemNativeRegFreeTmp(pReNative, idxRegTmp2);
3048
3049
3050 /* Set the MXCSR flags now. */
3051 off = iemNativeEmitOrGpr32ByGpr(pReNative, off, idxRegMxCsr, idxRegMxCsrXcptFlags);
3052
3053 /*
3054 * Make sure we don't have any outstanding guest register writes as we may
3055 * raise an \#UD or \#XF and all guest registers must be up to date in CPUMCTX.
3056 */
3057 off = iemNativeRegFlushPendingWrites(pReNative, off);
3058
3059#ifdef IEMNATIVE_WITH_INSTRUCTION_COUNTING
3060 off = iemNativeEmitStoreImmToVCpuU8(pReNative, off, idxInstr, RT_UOFFSETOF(VMCPUCC, iem.s.idxTbCurInstr));
3061#else
3062 RT_NOREF(idxInstr);
3063#endif
3064
3065 /* Check whether an exception is pending and only update the guest SIMD register if it isn't. */
3066 /* mov tmp, varmxcsr */
3067 off = iemNativeEmitLoadGprFromGpr32(pReNative, off, idxRegTmp, idxRegMxCsr);
3068 /* tmp >>= X86_MXCSR_XCPT_MASK_SHIFT */
3069 off = iemNativeEmitShiftGprRight(pReNative, off, idxRegTmp, X86_MXCSR_XCPT_MASK_SHIFT);
3070 /* tmp = ~tmp */
3071 off = iemNativeEmitInvBitsGpr(pReNative, off, idxRegTmp, idxRegTmp, false /*f64Bit*/);
3072 /* tmp &= mxcsr */
3073 off = iemNativeEmitAndGpr32ByGpr32(pReNative, off, idxRegMxCsrXcptFlags, idxRegTmp);
3074 off = iemNativeEmitTbExitIfAnyBitsSetInGpr<kIemNativeLabelType_RaiseSseAvxFpRelated>(pReNative, off, idxRegMxCsrXcptFlags,
3075 X86_MXCSR_XCPT_FLAGS);
3076
3077 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
3078 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite);
3079
3080 /* Move result to guest SIMD register (at this point there is no exception being raised). */
3081 off = iemNativeEmitSimdLoadVecRegFromVecRegU128(pReNative, off, idxSimdRegDst, idxSimdRegRes);
3082
3083 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
3084 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
3085 iemNativeRegFreeTmp(pReNative, idxRegTmp);
3086 iemNativeRegFreeTmp(pReNative, idxRegMxCsrXcptFlags);
3087 iemNativeRegFreeTmp(pReNative, idxRegMxCsr);
3088 return off;
3089}
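
/*
 * Illustrative note (not part of the original emitter): a minimal scalar sketch of the FPSR -> MXCSR
 * exception flag mapping described in the comment inside the function above (IOC->IE, DZC->ZE, OFC->OE,
 * UFC->UE, IXC->PE, IDC->DE).  The FPSR bit positions are the architectural ones; the helper name is
 * made up for illustration only.
 */
#if 0 /* reference sketch, not compiled */
static uint32_t fpsrXcptFlagsToMxcsrRef(uint32_t fFpsr)
{
    uint32_t fMxcsr = 0;
    fMxcsr |= (fFpsr & RT_BIT_32(0)) ? X86_MXCSR_IE : 0; /* IOC -> invalid operation */
    fMxcsr |= (fFpsr & RT_BIT_32(1)) ? X86_MXCSR_ZE : 0; /* DZC -> divide by zero */
    fMxcsr |= (fFpsr & RT_BIT_32(2)) ? X86_MXCSR_OE : 0; /* OFC -> overflow */
    fMxcsr |= (fFpsr & RT_BIT_32(3)) ? X86_MXCSR_UE : 0; /* UFC -> underflow */
    fMxcsr |= (fFpsr & RT_BIT_32(4)) ? X86_MXCSR_PE : 0; /* IXC -> precision (inexact) */
    fMxcsr |= (fFpsr & RT_BIT_32(7)) ? X86_MXCSR_DE : 0; /* IDC -> denormal */
    return fMxcsr;
}
#endif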
3090
3091
3092/**
3093 * Common emitter for packed floating point instructions with 3 operands - register, register variant.
3094 */
3095DECL_INLINE_THROW(uint32_t) iemNativeEmitSimdFp3OpCommon_rr_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr,
3096 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc,
3097#ifdef RT_ARCH_AMD64
3098 uint8_t const bPrefixX86, uint8_t const bOpcX86
3099#elif defined(RT_ARCH_ARM64)
3100 ARMV8INSTRVECFPOP const enmFpOp, ARMV8INSTRVECFPSZ const enmFpSz
3101#endif
3102 )
3103{
3104 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
3105 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly);
3106 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc),
3107 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly);
3108
3109#ifdef RT_ARCH_AMD64
3110 off = iemNativeEmitSimdLoadVecRegFromVecRegU128(pReNative, off, IEMNATIVE_SIMD_REG_FIXED_TMP0, idxSimdRegDst);
3111 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
3112 if (bPrefixX86 != 0)
3113 pCodeBuf[off++] = bPrefixX86;
3114 if (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 || idxSimdRegSrc >= 8)
3115 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
3116 | (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 ? X86_OP_REX_R : 0);
3117 pCodeBuf[off++] = 0x0f;
3118 pCodeBuf[off++] = bOpcX86;
3119 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, IEMNATIVE_SIMD_REG_FIXED_TMP0 & 7, idxSimdRegSrc & 7);
3120#elif defined(RT_ARCH_ARM64)
3121 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
3122 pCodeBuf[off++] = Armv8A64MkVecInstrFp3Op(enmFpOp, enmFpSz, IEMNATIVE_SIMD_REG_FIXED_TMP0, idxSimdRegDst, idxSimdRegSrc);
3123#else
3124# error "Port me"
3125#endif
3126 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
3127 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc);
3128 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
3129 return iemNativeEmitMxcsrUpdate(pReNative, off, idxInstr, idxSimdGstRegDst, IEMNATIVE_SIMD_REG_FIXED_TMP0);
3130}
3131
3132
3133/**
3134 * Common emitter for packed floating point instructions with 3 operands - register, local variable variant.
3135 */
3136DECL_INLINE_THROW(uint32_t) iemNativeEmitSimdFp3OpCommon_rv_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr,
3137 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc,
3138#ifdef RT_ARCH_AMD64
3139 uint8_t const bPrefixX86, uint8_t const bOpcX86
3140#elif defined(RT_ARCH_ARM64)
3141 ARMV8INSTRVECFPOP const enmFpOp, ARMV8INSTRVECFPSZ const enmFpSz
3142#endif
3143 )
3144{
3145 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
3146 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly);
3147 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
3148
3149#ifdef RT_ARCH_AMD64
3150 off = iemNativeEmitSimdLoadVecRegFromVecRegU128(pReNative, off, IEMNATIVE_SIMD_REG_FIXED_TMP0, idxSimdRegDst);
3151 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
3152 if (bPrefixX86 != 0)
3153 pCodeBuf[off++] = bPrefixX86;
3154 if (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 || idxSimdRegSrc >= 8)
3155 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
3156 | (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 ? X86_OP_REX_R : 0);
3157 pCodeBuf[off++] = 0x0f;
3158 pCodeBuf[off++] = bOpcX86;
3159 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, IEMNATIVE_SIMD_REG_FIXED_TMP0 & 7, idxSimdRegSrc & 7);
3160#elif defined(RT_ARCH_ARM64)
3161 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
3162 pCodeBuf[off++] = Armv8A64MkVecInstrFp3Op(enmFpOp, enmFpSz, IEMNATIVE_SIMD_REG_FIXED_TMP0, idxSimdRegDst, idxSimdRegSrc);
3163#else
3164# error "Port me"
3165#endif
3166 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
3167 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
3168 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
3169 return iemNativeEmitMxcsrUpdate(pReNative, off, idxInstr, idxSimdGstRegDst, IEMNATIVE_SIMD_REG_FIXED_TMP0);
3170}
3171
3172
3173/**
3174 * Common emitter for packed floating point instructions with 3 operands.
3175 */
3176#ifdef RT_ARCH_AMD64
3177# define IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(a_Instr, a_enmArmOp, a_ArmElemSz, a_bPrefixX86, a_bOpcX86) \
3178 DECL_FORCE_INLINE_THROW(uint32_t) \
3179 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, \
3180 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
3181 { \
3182 return iemNativeEmitSimdFp3OpCommon_rr_u128(pReNative, off, idxInstr, idxSimdGstRegDst, idxSimdGstRegSrc, \
3183 a_bPrefixX86, a_bOpcX86); \
3184 } \
3185 DECL_FORCE_INLINE_THROW(uint32_t) \
3186 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, \
3187 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
3188 { \
3189 return iemNativeEmitSimdFp3OpCommon_rv_u128(pReNative, off, idxInstr, idxSimdGstRegDst, idxVarSrc, \
3190 a_bPrefixX86, a_bOpcX86); \
3191 } \
3192 typedef int ignore_semicolon
3193#elif defined(RT_ARCH_ARM64)
3194# define IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(a_Instr, a_enmArmOp, a_ArmElemSz, a_bPrefixX86, a_bOpcX86) \
3195 DECL_FORCE_INLINE_THROW(uint32_t) \
3196 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, \
3197 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
3198 { \
3199 return iemNativeEmitSimdFp3OpCommon_rr_u128(pReNative, off, idxInstr, idxSimdGstRegDst, idxSimdGstRegSrc, \
3200 a_enmArmOp, a_ArmElemSz); \
3201 } \
3202 DECL_FORCE_INLINE_THROW(uint32_t) \
3203 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, \
3204 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
3205 { \
3206 return iemNativeEmitSimdFp3OpCommon_rv_u128(pReNative, off, idxInstr, idxSimdGstRegDst, idxVarSrc, \
3207 a_enmArmOp, a_ArmElemSz); \
3208 } \
3209 typedef int ignore_semicolon
3210#else
3211# error "Port me"
3212#endif
3213
3214
3215IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(mulps, kArmv8VecInstrFpOp_Mul, kArmv8VecInstrFpSz_4x_Single, 0, 0x59);
3216IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(addps, kArmv8VecInstrFpOp_Add, kArmv8VecInstrFpSz_4x_Single, 0, 0x58);
3217IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(addpd, kArmv8VecInstrFpOp_Add, kArmv8VecInstrFpSz_2x_Double, X86_OP_PRF_SIZE_OP, 0x58);
3218IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(subps, kArmv8VecInstrFpOp_Sub, kArmv8VecInstrFpSz_4x_Single, 0, 0x5c);
3219
3220#endif /* IEMNATIVE_WITH_SIMD_REG_ALLOCATOR */
3221
3222#endif /* !VMM_INCLUDED_SRC_VMMAll_target_x86_IEMAllN8veEmit_x86_h */