VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/target-x86/IEMAllN8veEmit-x86.h@106192

Last change on this file was in revision 106192, checked in by vboxsync, 4 months ago:

VMM/IEM: Added some basic stats & debug info for postponed EFLAGS calcs. Moved debug info structures from IEMInternal.h and into IEMN8veRecompiler.h. bugref:10720

1/* $Id: IEMAllN8veEmit-x86.h 106192 2024-10-01 12:57:32Z vboxsync $ */
2/** @file
3 * IEM - Native Recompiler, x86 Target - Code Emitters.
4 */
5
6/*
7 * Copyright (C) 2023-2024 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28#ifndef VMM_INCLUDED_SRC_VMMAll_target_x86_IEMAllN8veEmit_x86_h
29#define VMM_INCLUDED_SRC_VMMAll_target_x86_IEMAllN8veEmit_x86_h
30#ifndef RT_WITHOUT_PRAGMA_ONCE
31# pragma once
32#endif
33
34
35#ifdef RT_ARCH_AMD64
36
37/**
38 * Emits a ModR/M instruction with one opcode byte and only register operands.
39 */
40DECL_FORCE_INLINE(uint32_t)
41iemNativeEmitAmd64OneByteModRmInstrRREx(PIEMNATIVEINSTR pCodeBuf, uint32_t off, uint8_t bOpcode8, uint8_t bOpcodeOther,
42 uint8_t cOpBits, uint8_t idxRegReg, uint8_t idxRegRm)
43{
44 Assert(idxRegReg < 16); Assert(idxRegRm < 16);
45 switch (cOpBits)
46 {
47 case 16:
48 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
49 RT_FALL_THRU();
50 case 32:
51 if (idxRegReg >= 8 || idxRegRm >= 8)
52 pCodeBuf[off++] = (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
53 pCodeBuf[off++] = bOpcodeOther;
54 break;
55
56 default: AssertFailed(); RT_FALL_THRU();
57 case 64:
58 pCodeBuf[off++] = X86_OP_REX_W | (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
59 pCodeBuf[off++] = bOpcodeOther;
60 break;
61
62 case 8:
63 if (idxRegReg >= 8 || idxRegRm >= 8)
64 pCodeBuf[off++] = (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
65 else if (idxRegReg >= 4 || idxRegRm >= 4)
66 pCodeBuf[off++] = X86_OP_REX;
67 pCodeBuf[off++] = bOpcode8;
68 break;
69 }
70 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg & 7, idxRegRm & 7);
71 return off;
72}
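/* Example (illustrative): a call like
       off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0x22, 0x23,
                                                     32, 1 /*ECX*/, 2 /*EDX*/);
   emits the bytes 23 CA, i.e. 'and ecx, edx'.  With cOpBits=64 and idxRegReg=8
   (r8) a REX prefix comes first (0x4C = REX.W|REX.R), giving 4C 23 C2 for
   'and r8, rdx'. */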
73
74
75/**
76 * Emits a ModR/M instruction with two opcode bytes and only register operands.
77 */
78DECL_FORCE_INLINE(uint32_t)
79iemNativeEmitAmd64TwoByteModRmInstrRREx(PIEMNATIVEINSTR pCodeBuf, uint32_t off,
80 uint8_t bOpcode0, uint8_t bOpcode8, uint8_t bOpcodeOther,
81 uint8_t cOpBits, uint8_t idxRegReg, uint8_t idxRegRm)
82{
83 Assert(idxRegReg < 16); Assert(idxRegRm < 16);
84 switch (cOpBits)
85 {
86 case 16:
87 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
88 RT_FALL_THRU();
89 case 32:
90 if (idxRegReg >= 8 || idxRegRm >= 8)
91 pCodeBuf[off++] = (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
92 pCodeBuf[off++] = bOpcode0;
93 pCodeBuf[off++] = bOpcodeOther;
94 break;
95
96 default: AssertFailed(); RT_FALL_THRU();
97 case 64:
98 pCodeBuf[off++] = X86_OP_REX_W | (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
99 pCodeBuf[off++] = bOpcode0;
100 pCodeBuf[off++] = bOpcodeOther;
101 break;
102
103 case 8:
104 if (idxRegReg >= 8 || idxRegRm >= 8)
105 pCodeBuf[off++] = (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
106 else if (idxRegReg >= 4 || idxRegRm >= 4)
107 pCodeBuf[off++] = X86_OP_REX;
108 pCodeBuf[off++] = bOpcode0;
109 pCodeBuf[off++] = bOpcode8;
110 break;
111 }
112 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg & 7, idxRegRm & 7);
113 return off;
114}
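/* Example (illustrative): the ADC emitter below invokes this with
   bOpcode0=0x0f, bOpcodeOther=0xba and idxRegReg=4, so for cOpBits=32 and
   idxRegRm=1 (ECX) the bytes 0F BA E1 are produced, which together with the
   immediate byte appended by the caller form 'bt ecx, imm8'. */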
115
116
117/**
118 * Emits one of three opcodes with an immediate.
119 *
120 * These are expected to be of the /idxRegReg form.
121 */
122DECL_FORCE_INLINE(uint32_t)
123iemNativeEmitAmd64OneByteModRmInstrRIEx(PIEMNATIVEINSTR pCodeBuf, uint32_t off, uint8_t bOpcode8, uint8_t bOpcodeOtherImm8,
124 uint8_t bOpcodeOther, uint8_t cOpBits, uint8_t cImmBits, uint8_t idxRegReg,
125 uint8_t idxRegRm, uint64_t uImmOp)
126{
127 Assert(idxRegReg < 8); Assert(idxRegRm < 16);
128 if ( cImmBits == 8
129 || (uImmOp <= (uint64_t)0x7f && bOpcodeOtherImm8 != 0xcc))
130 {
131 switch (cOpBits)
132 {
133 case 16:
134 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
135 RT_FALL_THRU();
136 case 32:
137 if (idxRegRm >= 8)
138 pCodeBuf[off++] = X86_OP_REX_B;
139 pCodeBuf[off++] = bOpcodeOtherImm8; Assert(bOpcodeOtherImm8 != 0xcc);
140 break;
141
142 default: AssertFailed(); RT_FALL_THRU();
143 case 64:
144 pCodeBuf[off++] = X86_OP_REX_W | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
145 pCodeBuf[off++] = bOpcodeOtherImm8; Assert(bOpcodeOtherImm8 != 0xcc);
146 break;
147
148 case 8:
149 if (idxRegRm >= 8)
150 pCodeBuf[off++] = X86_OP_REX_B;
151 else if (idxRegRm >= 4)
152 pCodeBuf[off++] = X86_OP_REX;
153 pCodeBuf[off++] = bOpcode8; Assert(bOpcode8 != 0xcc);
154 break;
155 }
156 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg, idxRegRm & 7);
157 pCodeBuf[off++] = (uint8_t)uImmOp;
158 }
159 else
160 {
161 switch (cOpBits)
162 {
163 case 32:
164 if (idxRegRm >= 8)
165 pCodeBuf[off++] = X86_OP_REX_B;
166 break;
167
168 default: AssertFailed(); RT_FALL_THRU();
169 case 64:
170 pCodeBuf[off++] = X86_OP_REX_W | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
171 break;
172
173 case 16:
174 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
175 if (idxRegRm >= 8)
176 pCodeBuf[off++] = X86_OP_REX_B;
177 pCodeBuf[off++] = bOpcodeOther;
178 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg, idxRegRm & 7);
179 pCodeBuf[off++] = RT_BYTE1(uImmOp);
180 pCodeBuf[off++] = RT_BYTE2(uImmOp);
181 Assert(cImmBits == 16);
182 return off;
183 }
184 pCodeBuf[off++] = bOpcodeOther;
185 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg, idxRegRm & 7);
186 pCodeBuf[off++] = RT_BYTE1(uImmOp);
187 pCodeBuf[off++] = RT_BYTE2(uImmOp);
188 pCodeBuf[off++] = RT_BYTE3(uImmOp);
189 pCodeBuf[off++] = RT_BYTE4(uImmOp);
190 Assert(cImmBits == 32);
191 }
192 return off;
193}
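/* Example (illustrative): the AND-with-immediate emitter below passes
   bOpcode8=0x80, bOpcodeOtherImm8=0x83, bOpcodeOther=0x81 and idxRegReg=4.
   For cOpBits=32, cImmBits=32, idxRegRm=1 (ECX) and uImmOp=0x0f the short form
   83 E1 0F ('and ecx, 0x0f') is selected, while uImmOp=0x12345 takes the long
   form 81 E1 45 23 01 00 ('and ecx, 0x12345'). */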
194
195#endif /* RT_ARCH_AMD64 */
196
197
198
199/*********************************************************************************************************************************
200* EFLAGS *
201*********************************************************************************************************************************/
202
203#ifdef IEMNATIVE_WITH_EFLAGS_POSTPONING
204
205/** @def IEMNATIVE_POSTPONING_REG_MASK
206 * Register suitable for keeping the inputs or result for a postponed EFLAGS
207 * calculation.
208 *
209 * We use non-volatile registers here so we don't have to save & restore them
210 * across callouts (i.e. TLB loads).
211 *
212 * @note On x86 we cannot use RDI and RSI because these are used by the
213 * opcode checking code. The usual joy of the x86 instruction set.
214 */
215# ifdef RT_ARCH_AMD64
216# define IEMNATIVE_POSTPONING_REG_MASK \
217 (IEMNATIVE_CALL_NONVOLATILE_GREG_MASK & ~(RT_BIT_32(X86_GREG_xDI) | RT_BIT_32(X86_GREG_xSI)))
218# else
219# define IEMNATIVE_POSTPONING_REG_MASK IEMNATIVE_CALL_NONVOLATILE_GREG_MASK
220# endif
221
222/**
223 * This is normally invoked via IEMNATIVE_CLEAR_POSTPONED_EFLAGS().
224 */
225template<uint32_t const a_fEflClobbered>
226DECL_FORCE_INLINE(void) iemNativeClearPostponedEFlags(PIEMRECOMPILERSTATE pReNative)
227{
228 AssertCompile(!(a_fEflClobbered & ~X86_EFL_STATUS_BITS));
229 uint32_t fEFlags = pReNative->PostponedEfl.fEFlags;
230 if (fEFlags)
231 {
232 if RT_CONSTEXPR_IF(a_fEflClobbered != X86_EFL_STATUS_BITS)
233 {
234 fEFlags &= ~a_fEflClobbered;
235 if (!fEFlags)
236 { /* likely */ }
237 else
238 {
239 Log5(("iemNativeClearPostponedEFlags: Clobbering %#x: %#x -> %#x (op=%d bits=%u)\n", a_fEflClobbered,
240 pReNative->PostponedEfl.fEFlags, fEFlags, pReNative->PostponedEfl.enmOp, pReNative->PostponedEfl.cOpBits));
241 pReNative->PostponedEfl.fEFlags = fEFlags;
242 return;
243 }
244 }
245
246 /* Do cleanup. */
247 Log5(("iemNativeClearPostponedEFlags: Cleanup of op=%u bits=%u efl=%#x upon clobbering %#x\n",
248 pReNative->PostponedEfl.enmOp, pReNative->PostponedEfl.cOpBits, pReNative->PostponedEfl.fEFlags, a_fEflClobbered));
249 pReNative->PostponedEfl.fEFlags = 0;
250 pReNative->PostponedEfl.enmOp = kIemNativePostponedEflOp_Invalid;
251 pReNative->PostponedEfl.cOpBits = 0;
252 iemNativeRegFreeTmp(pReNative, pReNative->PostponedEfl.idxReg1);
253 if (pReNative->PostponedEfl.idxReg2 != UINT8_MAX)
254 iemNativeRegFreeTmp(pReNative, pReNative->PostponedEfl.idxReg2);
255 pReNative->PostponedEfl.idxReg1 = UINT8_MAX;
256 pReNative->PostponedEfl.idxReg2 = UINT8_MAX;
257#if defined(VBOX_WITH_STATISTICS) || defined(IEMNATIVE_WITH_TB_DEBUG_INFO)
258 STAM_PROFILE_ADD_PERIOD(&pReNative->pVCpu->iem.s.StatNativeEflPostponedEmits, pReNative->PostponedEfl.cEmits);
259 pReNative->PostponedEfl.cEmits = 0;
260#endif
261 }
262}
263
264DECL_INLINE_THROW(uint32_t) iemNativeEmitPostponedEFlagsCalcLogical(PIEMNATIVEINSTR pCodeBuf, uint32_t off, uint8_t cOpBits,
265 uint8_t idxRegResult, uint8_t idxRegEfl, uint8_t idxRegTmp)
266{
267#ifdef RT_ARCH_AMD64
268 /*
269 * Do a TEST to set the flags, then collect them and merge them into EFLAGS.
270 */
271 /* Do TEST idxRegResult, idxRegResult to set flags. */
272 off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0x84, 0x85, cOpBits, idxRegResult, idxRegResult);
273
274 if (idxRegTmp == X86_GREG_xAX)
275 {
276 /* lahf ; AH = EFLAGS */
277 pCodeBuf[off++] = 0x9f;
278 if (idxRegEfl <= X86_GREG_xBX)
279 {
280 /* mov [CDB]L, AH */
281 pCodeBuf[off++] = 0x88;
282 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, idxRegEfl);
283 }
284 else
285 {
286 /* mov AL, AH */
287 pCodeBuf[off++] = 0x88;
288 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, 0 /*AL*/);
289 /* mov xxL, AL */
290 pCodeBuf[off++] = idxRegEfl >= 8 ? X86_OP_REX_B : X86_OP_REX;
291 pCodeBuf[off++] = 0x88;
292 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 0 /*AL*/, idxRegEfl & 7);
293 }
294 }
295 else if (idxRegEfl != X86_GREG_xAX)
296 {
297 /* pushf */
298 pCodeBuf[off++] = 0x9c;
299 /* pop tmp */
300 if (idxRegTmp >= 8)
301 pCodeBuf[off++] = X86_OP_REX_B;
302 pCodeBuf[off++] = 0x58 + (idxRegTmp & 7);
303 /* mov byte(efl), byte(tmp) */
304 pCodeBuf[off++] = (idxRegEfl >= 8 ? X86_OP_REX_B : X86_OP_REX)
305 | (idxRegTmp >= 8 ? X86_OP_REX_R : 0);
306 pCodeBuf[off++] = 0x88;
307 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegTmp & 7, idxRegEfl & 7);
308 }
309 else
310 {
311 /* xchg al, ah */
312 pCodeBuf[off++] = 0x86;
313 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, 0 /*AL*/);
314 /* lahf ; AH = EFLAGS */
315 pCodeBuf[off++] = 0x9f;
316 /* xchg al, ah */
317 pCodeBuf[off++] = 0x86;
318 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, 0 /*AL*/);
319 }
320 /* BTR idxEfl, 11; Clear OF */
321 if (idxRegEfl >= 8)
322 pCodeBuf[off++] = X86_OP_REX_B;
323 pCodeBuf[off++] = 0xf;
324 pCodeBuf[off++] = 0xba;
325 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 6, idxRegEfl & 7);
326 pCodeBuf[off++] = X86_EFL_OF_BIT;
327
328#elif defined(RT_ARCH_ARM64)
329 /*
330 * Calculate flags.
331 */
332 /* Clear the status bits. ~0x8D5 (or ~0x8FD) can't be AND immediate, so use idxRegTmp for constant. */
333 off = iemNativeEmitLoadGpr32ImmExT<~X86_EFL_STATUS_BITS>(pCodeBuf, off, idxRegTmp);
334 off = iemNativeEmitAndGpr32ByGpr32Ex(pCodeBuf, off, idxRegEfl, idxRegTmp);
335
336 /* N,Z -> SF,ZF */
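/* (After the MRS below the host N and Z flags sit in bits 31 and 30 of
   idxRegTmp; the LSR #30 moves them down to bits 1 and 0, and the 2-bit BFI
   then lands Z in EFLAGS.ZF and N in EFLAGS.SF, relying on SF being the bit
   directly above ZF.) */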
337 if (cOpBits < 32)
338 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegResult, cOpBits > 8); /* sets NZ */
339 else
340 pCodeBuf[off++] = Armv8A64MkInstrAnds(ARMV8_A64_REG_XZR, idxRegResult, idxRegResult, cOpBits > 32 /*f64Bit*/);
341 pCodeBuf[off++] = Armv8A64MkInstrMrs(idxRegTmp, ARMV8_AARCH64_SYSREG_NZCV); /* Bits: 31=N; 30=Z; 29=C; 28=V; */
342 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegTmp, 30);
343 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_ZF_BIT, 2, false /*f64Bit*/);
344 AssertCompile(X86_EFL_ZF_BIT + 1 == X86_EFL_SF_BIT);
345
346 /* Calculate 8-bit parity of the result. */
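/* (The three shifted EORs fold the low 8 bits of the result down into bit 0,
   which then reads 1 for an odd number of set bits; the EOR with the immediate
   mask 1 inverts that, since X86_EFL_PF is set for even parity, before the BFI
   copies it into the PF position.) */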
347 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegResult, idxRegResult, false /*f64Bit*/,
348 4 /*offShift6*/, kArmv8A64InstrShift_Lsr);
349 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegTmp, idxRegTmp, false /*f64Bit*/,
350 2 /*offShift6*/, kArmv8A64InstrShift_Lsr);
351 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegTmp, idxRegTmp, false /*f64Bit*/,
352 1 /*offShift6*/, kArmv8A64InstrShift_Lsr);
353 Assert(Armv8A64ConvertImmRImmS2Mask32(0, 0) == 1);
354 pCodeBuf[off++] = Armv8A64MkInstrEorImm(idxRegTmp, idxRegTmp, 0, 0, false /*f64Bit*/);
355 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_PF_BIT, 1, false /*f64Bit*/);
356
357#else
358# error "port me"
359#endif
360 return off;
361}
362
363
364template<uint32_t const a_bmInputRegs, bool const a_fTlbMiss = false>
365static uint32_t iemNativeDoPostponedEFlagsInternal(PIEMRECOMPILERSTATE pReNative, uint32_t off, PIEMNATIVEINSTR pCodeBuf,
366 uint32_t bmExtraTlbMissRegs = 0)
367{
368#ifdef IEMNATIVE_WITH_TB_DEBUG_INFO
369 iemNativeDbgInfoAddPostponedEFlagsCalc(pReNative, off, pReNative->PostponedEfl.enmOp, pReNative->PostponedEfl.cOpBits,
370 pReNative->PostponedEfl.cEmits);
371#endif
372
373 /*
374 * In the TB exit code path we cannot do regular register allocation. Nor
375 * can we when we're in the TLB miss code, unless we're skipping the TLB
376 lookup. Since the latter isn't an important use case and should get along
377 * fine on just volatile registers, we do not need to do anything special
378 * for it.
379 *
380 * So, we do our own register allocation here. Any register may be used in the TB
381 * exit path, excluding a_bmInputRegs, fixed and postponed related registers.
382 * In the TLB miss we can use any volatile register and temporary registers
383 * allocated in the TLB state.
384 *
385 * Note! On x86 we prefer using RAX as the first TMP register, so we can
386 * make use of LAHF which is typically faster than PUSHF/POP. This
387 is why the idxRegTmp allocation comes first when there is no EFLAGS
388 * shadow, since RAX is represented by bit 0 in the mask.
389 */
390 uint32_t bmAvailableRegs;
391 if RT_CONSTEXPR_IF(!a_fTlbMiss)
392 {
393 bmAvailableRegs = ~(a_bmInputRegs | IEMNATIVE_REG_FIXED_MASK) & IEMNATIVE_HST_GREG_MASK;
394 if (pReNative->PostponedEfl.idxReg2 != UINT8_MAX)
395 bmAvailableRegs &= ~(RT_BIT_32(pReNative->PostponedEfl.idxReg1) | RT_BIT_32(pReNative->PostponedEfl.idxReg2));
396 else
397 bmAvailableRegs &= ~RT_BIT_32(pReNative->PostponedEfl.idxReg1);
398 }
399 else
400 {
401 /* Note! a_bmInputRegs takes precedence over bmExtraTlbMissRegs. */
402 bmAvailableRegs = (IEMNATIVE_CALL_VOLATILE_GREG_MASK | bmExtraTlbMissRegs)
403 & ~(a_bmInputRegs | IEMNATIVE_REG_FIXED_MASK)
404 & IEMNATIVE_HST_GREG_MASK;
405 }
406
407 /* Use existing EFLAGS shadow if available. For the TLB-miss code path we
408 need to weed out volatile registers here, as they will no longer be valid. */
409 uint8_t idxRegTmp;
410 uint8_t idxRegEfl = pReNative->Core.aidxGstRegShadows[kIemNativeGstReg_EFlags];
411 if ( (pReNative->Core.bmGstRegShadows & RT_BIT_64(kIemNativeGstReg_EFlags))
412 && (!a_fTlbMiss || !(RT_BIT_32(idxRegEfl) & IEMNATIVE_CALL_VOLATILE_GREG_MASK)))
413 {
414 Assert(idxRegEfl < IEMNATIVE_HST_GREG_COUNT);
415 Assert(!(a_bmInputRegs & RT_BIT_32(idxRegEfl)));
416 if RT_CONSTEXPR_IF(!a_fTlbMiss) Assert(bmAvailableRegs & RT_BIT_32(idxRegEfl));
417 bmAvailableRegs &= ~RT_BIT_32(idxRegEfl);
418#ifdef VBOX_STRICT
419 off = iemNativeEmitGuestRegValueCheckEx(pReNative, pCodeBuf, off, idxRegEfl, kIemNativeGstReg_EFlags);
420#endif
421
422 idxRegTmp = ASMBitFirstSetU32(bmAvailableRegs) - 1;
423 bmAvailableRegs &= ~RT_BIT_32(idxRegTmp);
424 }
425 else
426 {
427 idxRegTmp = ASMBitFirstSetU32(bmAvailableRegs) - 1; /* allocate the temp register first to prioritize EAX on x86. */
428 bmAvailableRegs &= ~RT_BIT_32(idxRegTmp);
429
430 idxRegEfl = ASMBitFirstSetU32(bmAvailableRegs) - 1;
431 bmAvailableRegs &= ~RT_BIT_32(idxRegEfl);
432 off = iemNativeEmitLoadGprFromVCpuU32Ex(pCodeBuf, off, idxRegEfl, RT_UOFFSETOF(VMCPU, cpum.GstCtx.eflags));
433 }
434 Assert(bmAvailableRegs != 0);
435
436 /*
437 * Do the actual EFLAGS calculation.
438 */
439 switch (pReNative->PostponedEfl.enmOp)
440 {
441 case kIemNativePostponedEflOp_Logical:
442 Assert(pReNative->PostponedEfl.idxReg2 == UINT8_MAX);
443 off = iemNativeEmitPostponedEFlagsCalcLogical(pCodeBuf, off, pReNative->PostponedEfl.cOpBits,
444 pReNative->PostponedEfl.idxReg1, idxRegEfl, idxRegTmp);
445 break;
446
447 default:
448 AssertFailedBreak();
449 }
450
451 /*
452 * Store EFLAGS.
453 */
454#ifdef VBOX_STRICT
455 /* check that X86_EFL_1 is set. */
456 uint32_t offFixup1;
457 off = iemNativeEmitTestBitInGprAndJmpToFixedIfSetEx(pCodeBuf, off, idxRegEfl, X86_EFL_1_BIT, off, &offFixup1);
458 off = iemNativeEmitBrkEx(pCodeBuf, off, 0x3330);
459 iemNativeFixupFixedJump(pReNative, offFixup1, off);
460 /* Check that X86_EFL_RAZ_LO_MASK is zero. */
461 off = iemNativeEmitTestAnyBitsInGpr32Ex(pCodeBuf, off, idxRegEfl, X86_EFL_RAZ_LO_MASK);
462 uint32_t const offFixup2 = off;
463 off = iemNativeEmitJccToFixedEx(pCodeBuf, off, off, kIemNativeInstrCond_e);
464 off = iemNativeEmitBrkEx(pCodeBuf, off, 0x3331);
465 iemNativeFixupFixedJump(pReNative, offFixup2, off);
466#endif
467 off = iemNativeEmitStoreGprToVCpuU32Ex(pCodeBuf, off, idxRegEfl, RT_UOFFSETOF(VMCPU, cpum.GstCtx.eflags));
468 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
469
470#if defined(VBOX_WITH_STATISTICS) || defined(IEMNATIVE_WITH_TB_DEBUG_INFO)
471 pReNative->PostponedEfl.cEmits++;
472#endif
473 return off;
474}
475
476
477
478template<uint32_t const a_bmInputRegs>
479DECL_FORCE_INLINE_THROW(uint32_t)
480iemNativeDoPostponedEFlagsAtTbExit(PIEMRECOMPILERSTATE pReNative, uint32_t off)
481{
482 if (pReNative->PostponedEfl.fEFlags)
483 {
484 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, IEMNATIVE_MAX_POSTPONED_EFLAGS_INSTRUCTIONS);
485 return iemNativeDoPostponedEFlagsInternal<a_bmInputRegs>(pReNative, off, pCodeBuf);
486 }
487 return off;
488}
489
490
491template<uint32_t const a_bmInputRegs>
492DECL_FORCE_INLINE_THROW(uint32_t)
493iemNativeDoPostponedEFlagsAtTbExitEx(PIEMRECOMPILERSTATE pReNative, uint32_t off, PIEMNATIVEINSTR pCodeBuf)
494{
495 if (pReNative->PostponedEfl.fEFlags)
496 return iemNativeDoPostponedEFlagsInternal<a_bmInputRegs>(pReNative, off, pCodeBuf);
497 return off;
498}
499
500
501template<uint32_t const a_bmInputRegs>
502DECL_FORCE_INLINE_THROW(uint32_t)
503iemNativeDoPostponedEFlagsAtTlbMiss(PIEMRECOMPILERSTATE pReNative, uint32_t off, const IEMNATIVEEMITTLBSTATE *pTlbState,
504 uint32_t bmTmpRegs)
505{
506 if (pReNative->PostponedEfl.fEFlags)
507 {
508 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, IEMNATIVE_MAX_POSTPONED_EFLAGS_INSTRUCTIONS);
509 return iemNativeDoPostponedEFlagsInternal<a_bmInputRegs, true>(pReNative, off, pCodeBuf,
510 pTlbState->getRegsNotToSave() | bmTmpRegs);
511 }
512 return off;
513}
514
515
516#endif /* IEMNATIVE_WITH_EFLAGS_POSTPONING */
517
518
519/**
520 * This is an implementation of IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL.
521 *
522 * It takes liveness stuff into account.
523 */
524DECL_INLINE_THROW(uint32_t)
525iemNativeEmitEFlagsForLogical(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarEfl,
526 uint8_t cOpBits, uint8_t idxRegResult
527#ifndef RT_ARCH_AMD64
528 , bool fNativeFlags = false
529#endif
530 )
531{
532 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflTotalLogical);
533 IEMNATIVE_CLEAR_POSTPONED_EFLAGS(pReNative, X86_EFL_STATUS_BITS);
534 RT_NOREF(cOpBits, idxRegResult);
535
536#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
537 /*
538 * See if we can skip this wholesale.
539 */
540 PCIEMLIVENESSENTRY const pLivenessEntry = &pReNative->paLivenessEntries[pReNative->idxCurCall];
541 uint64_t const fEflClobbered = IEMLIVENESS_STATE_GET_WILL_BE_CLOBBERED_SET(pLivenessEntry)
542 & IEMLIVENESSBIT_STATUS_EFL_MASK;
543# ifdef IEMNATIVE_WITH_EFLAGS_POSTPONING
544 uint64_t fEflPostponing;
545# endif
546 if ( fEflClobbered == IEMLIVENESSBIT_STATUS_EFL_MASK
547 && !(pReNative->fMc & IEM_MC_F_WITH_FLAGS))
548 {
549 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflSkippedLogical);
550 pReNative->fSkippingEFlags |= X86_EFL_STATUS_BITS;
551# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
552 off = iemNativeEmitOrImmIntoVCpuU32(pReNative, off, X86_EFL_STATUS_BITS, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
553# endif
554 Log5(("iemNativeEmitEFlagsForLogical: Skipping %#x\n", X86_EFL_STATUS_BITS));
555 return off;
556 }
557# ifdef IEMNATIVE_WITH_EFLAGS_POSTPONING
558 if ( ( (fEflPostponing = IEMLIVENESS_STATE_GET_CAN_BE_POSTPONED_SET(pLivenessEntry) & IEMLIVENESSBIT_STATUS_EFL_MASK)
559 | fEflClobbered)
560 == IEMLIVENESSBIT_STATUS_EFL_MASK
561 && idxRegResult != UINT8_MAX)
562 {
563 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflPostponedLogical);
564 pReNative->PostponedEfl.fEFlags = X86_EFL_STATUS_BITS;
565 pReNative->PostponedEfl.enmOp = kIemNativePostponedEflOp_Logical;
566 pReNative->PostponedEfl.cOpBits = cOpBits;
567 pReNative->PostponedEfl.idxReg1 = iemNativeRegAllocTmpEx(pReNative, &off, IEMNATIVE_POSTPONING_REG_MASK, false);
568 /** @todo it would normally be possible to use idxRegResult, iff it is
569 * already a non-volatile register and we can be sure the caller
570 * doesn't modify it. That'll save a register move and allocation. */
571 off = iemNativeEmitLoadGprFromGpr(pReNative, off, pReNative->PostponedEfl.idxReg1, idxRegResult);
572 Log5(("iemNativeEmitEFlagsForLogical: Postponing %#x op=%u bits=%u reg1=%u\n", X86_EFL_STATUS_BITS,
573 kIemNativePostponedEflOp_Logical, cOpBits, pReNative->PostponedEfl.idxReg1));
574 }
575# endif
576 else
577#endif
578 {
579#ifdef RT_ARCH_AMD64
580 /*
581 * Collect flags and merge them with eflags.
582 */
583 /** @todo we could alternatively use LAHF here when host rax is free, since
584 * OF is cleared. */
585 PIEMNATIVEINSTR pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
586 /* pushf - do this before any reg allocations as they may emit instructions too. */
587 pCodeBuf[off++] = 0x9c;
588
589 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
590 uint8_t const idxTmpReg = iemNativeRegAllocTmp(pReNative, &off);
591 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2 + 7 + 7 + 3);
592 /* pop tmp */
593 if (idxTmpReg >= 8)
594 pCodeBuf[off++] = X86_OP_REX_B;
595 pCodeBuf[off++] = 0x58 + (idxTmpReg & 7);
596 /* and tmp, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF */
597 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxTmpReg, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF);
598 /* Clear the status bits in EFLs. */
599 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegEfl, ~X86_EFL_STATUS_BITS);
600 /* OR in the flags we collected. */
601 off = iemNativeEmitOrGpr32ByGprEx(pCodeBuf, off, idxRegEfl, idxTmpReg);
602 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
603 iemNativeRegFreeTmp(pReNative, idxTmpReg);
604
605#elif defined(RT_ARCH_ARM64)
606 /*
607 * Calculate flags.
608 */
609 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
610 uint8_t const idxTmpReg = iemNativeRegAllocTmp(pReNative, &off);
611 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 15);
612
613 /* Clear the status bits. ~0x8D5 (or ~0x8FD) can't be AND immediate, so use idxTmpReg for constant. */
614 off = iemNativeEmitLoadGpr32ImmEx(pCodeBuf, off, idxTmpReg, ~X86_EFL_STATUS_BITS);
615 off = iemNativeEmitAndGpr32ByGpr32Ex(pCodeBuf, off, idxRegEfl, idxTmpReg);
616
617 /* N,Z -> SF,ZF */
618 if (cOpBits < 32)
619 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegResult, cOpBits > 8); /* sets NZ */
620 else if (!fNativeFlags)
621 pCodeBuf[off++] = Armv8A64MkInstrAnds(ARMV8_A64_REG_XZR, idxRegResult, idxRegResult, cOpBits > 32 /*f64Bit*/);
622 pCodeBuf[off++] = Armv8A64MkInstrMrs(idxTmpReg, ARMV8_AARCH64_SYSREG_NZCV); /* Bits: 31=N; 30=Z; 29=C; 28=V; */
623 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, 30);
624 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_ZF_BIT, 2, false /*f64Bit*/);
625 AssertCompile(X86_EFL_ZF_BIT + 1 == X86_EFL_SF_BIT);
626
627 /* Calculate 8-bit parity of the result. */
628 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxRegResult, idxRegResult, false /*f64Bit*/,
629 4 /*offShift6*/, kArmv8A64InstrShift_Lsr);
630 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxTmpReg, idxTmpReg, false /*f64Bit*/,
631 2 /*offShift6*/, kArmv8A64InstrShift_Lsr);
632 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxTmpReg, idxTmpReg, false /*f64Bit*/,
633 1 /*offShift6*/, kArmv8A64InstrShift_Lsr);
634 Assert(Armv8A64ConvertImmRImmS2Mask32(0, 0) == 1);
635 pCodeBuf[off++] = Armv8A64MkInstrEorImm(idxTmpReg, idxTmpReg, 0, 0, false /*f64Bit*/);
636 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_PF_BIT, 1, false /*f64Bit*/);
637
638 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
639 iemNativeRegFreeTmp(pReNative, idxTmpReg);
640#else
641# error "port me"
642#endif
643 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
644 }
645
646#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
647 pReNative->fSkippingEFlags = 0;
648# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
649 off = iemNativeEmitStoreImmToVCpuU32(pReNative, off, 0, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
650# endif
651#endif
652 return off;
653}
654
655
656/**
657 * This is an implementation of IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC.
658 *
659 * It takes liveness stuff into account.
660 */
661DECL_FORCE_INLINE_THROW(uint32_t)
662iemNativeEmitEFlagsForArithmetic(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarEfl, uint8_t idxRegEflIn
663#ifndef RT_ARCH_AMD64
664 , uint8_t cOpBits, uint8_t idxRegResult, uint8_t idxRegDstIn, uint8_t idxRegSrc
665 , bool fInvertCarry, uint64_t uImmSrc
666#endif
667 )
668{
669 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflTotalArithmetic);
670 IEMNATIVE_CLEAR_POSTPONED_EFLAGS(pReNative, X86_EFL_STATUS_BITS);
671
672#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
673 /*
674 * See if we can skip this wholesale.
675 */
676 PCIEMLIVENESSENTRY const pLivenessEntry = &pReNative->paLivenessEntries[pReNative->idxCurCall];
677 if ( IEMLIVENESS_STATE_ARE_STATUS_EFL_TO_BE_CLOBBERED(pLivenessEntry)
678 && !(pReNative->fMc & IEM_MC_F_WITH_FLAGS))
679 {
680 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflSkippedArithmetic);
681 pReNative->fSkippingEFlags |= X86_EFL_STATUS_BITS;
682# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
683 off = iemNativeEmitOrImmIntoVCpuU32(pReNative, off, X86_EFL_STATUS_BITS, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
684# endif
685 }
686 else
687#endif
688 {
689#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
690 uint32_t fSkipped = 0;
691#endif
692#ifdef RT_ARCH_AMD64
693 /*
694 * Collect flags and merge them with eflags.
695 */
696 PIEMNATIVEINSTR pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
697 /* pushf - do this before any reg allocations as they may emit instructions too. */
698 pCodeBuf[off++] = 0x9c;
699
700 uint8_t const idxRegEfl = idxRegEflIn != UINT8_MAX ? idxRegEflIn
701 : iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
702 uint8_t const idxTmpReg = iemNativeRegAllocTmp(pReNative, &off);
703 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2 + 7 + 7 + 3);
704 /* pop tmp */
705 if (idxTmpReg >= 8)
706 pCodeBuf[off++] = X86_OP_REX_B;
707 pCodeBuf[off++] = 0x58 + (idxTmpReg & 7);
708 /* Isolate the flags we want. */
709 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxTmpReg, X86_EFL_STATUS_BITS);
710 /* Clear the status bits in EFLs. */
711 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegEfl, ~X86_EFL_STATUS_BITS);
712 /* OR in the flags we collected. */
713 off = iemNativeEmitOrGpr32ByGprEx(pCodeBuf, off, idxRegEfl, idxTmpReg);
714 if (idxRegEflIn != idxRegEfl)
715 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
716 iemNativeRegFreeTmp(pReNative, idxTmpReg);
717
718#elif defined(RT_ARCH_ARM64)
719 /*
720 * Calculate flags.
721 */
722 uint8_t const idxRegEfl = idxRegEflIn != UINT8_MAX ? idxRegEflIn
723 : iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
724 uint8_t const idxTmpReg = iemNativeRegAllocTmp(pReNative, &off);
725 uint8_t const idxTmpReg2 = cOpBits >= 32 ? UINT8_MAX : iemNativeRegAllocTmp(pReNative, &off);
726 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 20);
727
728 /* Invert CF (stored inverted on ARM) and load the flags into the temporary register. */
729 if (fInvertCarry)
730 pCodeBuf[off++] = ARMV8_A64_INSTR_CFINV;
731 pCodeBuf[off++] = Armv8A64MkInstrMrs(idxTmpReg, ARMV8_AARCH64_SYSREG_NZCV); /* Bits: 31=N; 30=Z; 29=C; 28=V; */
732
733 if (cOpBits >= 32)
734 {
735 /* V -> OF */
736 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, 28);
737 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_OF_BIT, 1, false /*f64Bit*/);
738
739 /* C -> CF */
740 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, 1);
741 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_CF_BIT, 1, false /*f64Bit*/);
742 }
743
744 /* N,Z -> SF,ZF */
745 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, cOpBits >= 32 ? 1 : 30);
746 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_ZF_BIT, 2, false /*f64Bit*/);
747
748 /* For ADC and SBB we have to calculate overflow and carry ourselves. */
749 if (cOpBits < 32)
750 {
751 /* Since the carry flag is the zeroth flag, we just use BFXIL to copy it over. */
752 AssertCompile(X86_EFL_CF_BIT == 0);
753 pCodeBuf[off++] = Armv8A64MkInstrBfxil(idxRegEfl, idxRegResult, cOpBits, 1, false /*f64Bit*/);
754
755 /* The overflow flag is more work as we have to compare the signed bits for
756 both inputs and the result. See IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC.
757
758 Formula: ~(a_uDst ^ a_uSrcOf) & (a_uResult ^ a_uDst)
759 With a_uSrcOf as a_uSrc for additions and ~a_uSrc for subtractions.
760
761 It is a bit simpler when the right (source) side is constant:
762 adc: S D R -> OF sbb: S D R -> OF
763 0 0 0 -> 0 \ 0 0 0 -> 0 \
764 0 0 1 -> 1 \ 0 0 1 -> 0 \
765 0 1 0 -> 0 / and not(D), R 0 1 0 -> 1 / and D, not(R)
766 0 1 1 -> 0 / 0 1 1 -> 0 /
767 1 0 0 -> 0 \ 1 0 0 -> 0 \
768 1 0 1 -> 0 \ and D, not(R) 1 0 1 -> 1 \ and not(D), R
769 1 1 0 -> 1 / 1 1 0 -> 0 /
770 1 1 1 -> 0 / 1 1 1 -> 0 / */
771 if (idxRegSrc != UINT8_MAX)
772 {
773 if (fInvertCarry) /* sbb: ~((a_uDst) ^ ~(a_uSrcOf)) -> (a_uDst) ^ (a_uSrcOf); HACK ALERT: fInvertCarry == sbb */
774 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxRegDstIn, idxRegSrc, false);
775 else /* adc: ~((a_uDst) ^ (a_uSrcOf)) -> (a_uDst) ^ ~(a_uSrcOf) */
776 pCodeBuf[off++] = Armv8A64MkInstrEon(idxTmpReg, idxRegDstIn, idxRegSrc, false);
777 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg2, idxRegDstIn, idxRegResult, false); /* (a_uDst) ^ (a_uResult) */
778 pCodeBuf[off++] = Armv8A64MkInstrAnd(idxTmpReg, idxTmpReg, idxTmpReg2, false /*f64Bit*/);
779 }
780 else if (uImmSrc & RT_BIT_32(cOpBits - 1))
781 {
782 if (fInvertCarry) /* HACK ALERT: fInvertCarry == sbb */
783 pCodeBuf[off++] = Armv8A64MkInstrBic(idxTmpReg, idxRegResult, idxRegDstIn, false);
784 else
785 pCodeBuf[off++] = Armv8A64MkInstrBic(idxTmpReg, idxRegDstIn, idxRegResult, false);
786 }
787 else
788 {
789 if (fInvertCarry) /* HACK ALERT: fInvertCarry == sbb */
790 pCodeBuf[off++] = Armv8A64MkInstrBic(idxTmpReg, idxRegDstIn, idxRegResult, false);
791 else
792 pCodeBuf[off++] = Armv8A64MkInstrBic(idxTmpReg, idxRegResult, idxRegDstIn, false);
793 }
794 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, cOpBits - 1, false /*f64Bit*/);
795 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_OF_BIT, 1);
796 iemNativeRegFreeTmp(pReNative, idxTmpReg2);
797 }
798
799 /* Calculate 8-bit parity of the result. */
800 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxRegResult, idxRegResult, false /*f64Bit*/,
801 4 /*offShift6*/, kArmv8A64InstrShift_Lsr);
802 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxTmpReg, idxTmpReg, false /*f64Bit*/,
803 2 /*offShift6*/, kArmv8A64InstrShift_Lsr);
804 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxTmpReg, idxTmpReg, false /*f64Bit*/,
805 1 /*offShift6*/, kArmv8A64InstrShift_Lsr);
806 Assert(Armv8A64ConvertImmRImmS2Mask32(0, 0) == 1);
807 pCodeBuf[off++] = Armv8A64MkInstrEorImm(idxTmpReg, idxTmpReg, 0, 0, false /*f64Bit*/);
808 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_PF_BIT, 1, false /*f64Bit*/);
809
810 /* Calculate the auxiliary carry/borrow. This is related to 8-bit BCD.
811 General formula: ((uint32_t)(a_uResult) ^ (uint32_t)(a_uSrc) ^ (uint32_t)(a_uDst)) & X86_EFL_AF;
812 S D R
813 0 0 0 -> 0; \
814 0 0 1 -> 1; \ regular
815 0 1 0 -> 1; / xor R, D
816 0 1 1 -> 0; /
817 1 0 0 -> 1; \
818 1 0 1 -> 0; \ invert one of the two
819 1 1 0 -> 0; / xor not(R), D
820 1 1 1 -> 1; /
821 a_uSrc[bit 4]=0: ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF;
822 a_uSrc[bit 4]=1: ((uint32_t)~(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF;
823 */
824
825 if (idxRegSrc != UINT8_MAX)
826 {
827 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxRegDstIn, idxRegSrc, false /*f64Bit*/);
828 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxTmpReg, idxRegResult, false /*f64Bit*/);
829 }
830 else if (uImmSrc & X86_EFL_AF)
831 pCodeBuf[off++] = Armv8A64MkInstrEon(idxTmpReg, idxRegDstIn, idxRegResult, false /*f64Bit*/);
832 else
833 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxRegDstIn, idxRegResult, false /*f64Bit*/);
834 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, X86_EFL_AF_BIT, false /*f64Bit*/);
835 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_AF_BIT, 1, false /*f64Bit*/);
836
837 if (idxRegEflIn != idxRegEfl)
838 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
839 iemNativeRegFreeTmp(pReNative, idxTmpReg);
840
841#else
842# error "port me"
843#endif
844 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
845
846#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
847 pReNative->fSkippingEFlags = fSkipped;
848# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
849 off = iemNativeEmitStoreImmToVCpuU32(pReNative, off, fSkipped, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
850# endif
851#endif
852 }
853 return off;
854
855}
856
857
858
859/*********************************************************************************************************************************
860* Bitwise Logical Operations *
861*********************************************************************************************************************************/
862
863/**
864 * The AND instruction will clear OF, CF and AF (the latter is undefined) and
865 * set the other flags according to the result.
866 */
867DECL_INLINE_THROW(uint32_t)
868iemNativeEmit_and_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
869 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
870{
871 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
872 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
873#ifdef RT_ARCH_AMD64
874 /* On AMD64 we just use the correctly sized AND instruction to harvest the EFLAGS. */
875 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
876 0x22, 0x23, cOpBits, idxRegDst, idxRegSrc);
877 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
878 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
879
880 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
881
882#elif defined(RT_ARCH_ARM64)
883 /* On ARM64 we use 32-bit AND for the 8-bit and 16-bit ones. */
884 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
885 pCodeBuf[off++] = Armv8A64MkInstrAnds(idxRegDst, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/);
886 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
887 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
888
889 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst, true /*fNativeFlags*/);
890#else
891# error "Port me"
892#endif
893 iemNativeVarRegisterRelease(pReNative, idxVarDst);
894 return off;
895}
896
897
898/**
899 * The AND instruction with immediate value as right operand.
900 */
901DECL_INLINE_THROW(uint32_t)
902iemNativeEmit_and_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
903 uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl, uint8_t cOpBits, uint8_t cImmBits)
904{
905 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
906#ifdef RT_ARCH_AMD64
907 /* On AMD64 we just use the correctly sized AND instruction to harvest the EFLAGS. */
908 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
909 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, cOpBits, cImmBits, 4, idxRegDst, uImmOp);
910 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
911
912 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
913
914#elif defined(RT_ARCH_ARM64)
915 /* On ARM64 we use 32-bit AND for the 8-bit and 16-bit ones, and of
916 course the immediate variant when possible to save a register load. */
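 /* (ARM64 logical immediates can only encode a repeating pattern built from a
    single rotated run of ones (e.g. 0x0ff0 or 0xffff00ff encode, 0x1234 does
    not), hence the fallback below that loads the constant into a temporary
    register.) */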
917 uint32_t uImmSizeLen, uImmRotations;
918 if ( cOpBits > 32
919 ? Armv8A64ConvertMask64ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations)
920 : Armv8A64ConvertMask32ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations))
921 {
922 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
923 if (cOpBits >= 32)
924 pCodeBuf[off++] = Armv8A64MkInstrAndsImm(idxRegDst, idxRegDst, uImmSizeLen, uImmRotations, cOpBits > 32 /*f64Bit*/);
925 else
926 pCodeBuf[off++] = Armv8A64MkInstrAndImm(idxRegDst, idxRegDst, uImmSizeLen, uImmRotations, cOpBits > 32 /*f64Bit*/);
927 }
928 else
929 {
930 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
931 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
932 if (cOpBits >= 32)
933 pCodeBuf[off++] = Armv8A64MkInstrAnds(idxRegDst, idxRegDst, idxRegTmpImm, cOpBits > 32 /*f64Bit*/);
934 else
935 pCodeBuf[off++] = Armv8A64MkInstrAnd(idxRegDst, idxRegDst, idxRegTmpImm, cOpBits > 32 /*f64Bit*/);
936 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
937 }
938 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
939
940 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst, cOpBits >= 32 /*fNativeFlags*/);
941 RT_NOREF_PV(cImmBits);
942
943#else
944# error "Port me"
945#endif
946 iemNativeVarRegisterRelease(pReNative, idxVarDst);
947 return off;
948}
949
950
951/**
952 * The TEST instruction will clear OF, CF and AF (the latter is undefined) and
953 * set the other flags according to the result.
954 */
955DECL_INLINE_THROW(uint32_t)
956iemNativeEmit_test_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
957 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
958{
959 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
960 uint8_t const idxRegSrc = idxVarSrc == idxVarDst ? idxRegDst /* special case of 'test samereg,samereg' */
961 : iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
962#ifdef RT_ARCH_AMD64
963 /* On AMD64 we just use the correctly sized TEST instruction to harvest the EFLAGS. */
964 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
965 0x84, 0x85, cOpBits, idxRegSrc, idxRegDst);
966 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
967
968#elif defined(RT_ARCH_ARM64)
969 /* On ARM64 we use 32-bit AND for the 8-bit and 16-bit ones. We also
970 need to keep the result in order to calculate the flags. */
971 uint8_t const idxRegResult = iemNativeRegAllocTmp(pReNative, &off);
972 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
973 if (cOpBits >= 32)
974 pCodeBuf[off++] = Armv8A64MkInstrAnds(idxRegResult, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/);
975 else
976 pCodeBuf[off++] = Armv8A64MkInstrAnd(idxRegResult, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/);
977 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
978
979#else
980# error "Port me"
981#endif
982 if (idxVarSrc != idxVarDst)
983 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
984 iemNativeVarRegisterRelease(pReNative, idxVarDst);
985
986#ifdef RT_ARCH_AMD64
987 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, UINT8_MAX);
988#else
989 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegResult, cOpBits >= 32 /*fNativeFlags*/);
990 iemNativeRegFreeTmp(pReNative, idxRegResult);
991#endif
992 return off;
993}
994
995
996/**
997 * The TEST instruction with immediate value as right operand.
998 */
999DECL_INLINE_THROW(uint32_t)
1000iemNativeEmit_test_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1001 uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl, uint8_t cOpBits, uint8_t cImmBits)
1002{
1003 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1004#ifdef RT_ARCH_AMD64
1005 /* On AMD64 we just use the correctly sized TEST instruction to harvest the EFLAGS. */
1006 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1007 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0xf6, 0xcc, 0xf7, cOpBits, cImmBits, 0, idxRegDst, uImmOp);
1008 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1009 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1010
1011 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, UINT8_MAX);
1012
1013#elif defined(RT_ARCH_ARM64)
1014 /* On ARM64 we use 32-bit AND for the 8-bit and 16-bit ones, and of
1015 course the immediate variant when possible to save a register load.
1016 We also need to keep the result in order to calculate the flags. */
1017 uint8_t const idxRegResult = iemNativeRegAllocTmp(pReNative, &off);
1018 uint32_t uImmSizeLen, uImmRotations;
1019 if ( cOpBits > 32
1020 ? Armv8A64ConvertMask64ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations)
1021 : Armv8A64ConvertMask32ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations))
1022 {
1023 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1024 if (cOpBits >= 32)
1025 pCodeBuf[off++] = Armv8A64MkInstrAndsImm(idxRegResult, idxRegDst, uImmSizeLen, uImmRotations, cOpBits > 32 /*f64Bit*/);
1026 else
1027 pCodeBuf[off++] = Armv8A64MkInstrAndImm(idxRegResult, idxRegDst, uImmSizeLen, uImmRotations, cOpBits > 32 /*f64Bit*/);
1028 }
1029 else
1030 {
1031 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1032 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1033 if (cOpBits >= 32)
1034 pCodeBuf[off++] = Armv8A64MkInstrAnds(idxRegResult, idxRegDst, idxRegTmpImm, cOpBits > 32 /*f64Bit*/);
1035 else
1036 pCodeBuf[off++] = Armv8A64MkInstrAnd(idxRegResult, idxRegDst, idxRegTmpImm, cOpBits > 32 /*f64Bit*/);
1037 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1038 }
1039 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1040 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1041
1042 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegResult, cOpBits >= 32 /*fNativeFlags*/);
1043
1044 iemNativeRegFreeTmp(pReNative, idxRegResult);
1045 RT_NOREF_PV(cImmBits);
1046
1047#else
1048# error "Port me"
1049#endif
1050 return off;
1051}
1052
1053
1054/**
1055 * The OR instruction will clear OF, CF and AF (the latter is undefined) and
1056 * set the other flags according to the result.
1057 */
1058DECL_INLINE_THROW(uint32_t)
1059iemNativeEmit_or_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1060 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1061{
1062 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1063 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1064#ifdef RT_ARCH_AMD64
1065 /* On AMD64 we just use the correctly sized OR instruction to harvest the EFLAGS. */
1066 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1067 0x0a, 0x0b, cOpBits, idxRegDst, idxRegSrc);
1068 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1069 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1070
1071 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
1072
1073#elif defined(RT_ARCH_ARM64)
1074 /* On ARM64 we use 32-bit OR for the 8-bit and 16-bit ones. */
1075 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1076 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDst, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/);
1077 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1078 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1079
1080 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
1081
1082#else
1083# error "Port me"
1084#endif
1085 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1086 return off;
1087}
1088
1089
1090/**
1091 * The OR instruction with immediate value as right operand.
1092 */
1093DECL_INLINE_THROW(uint32_t)
1094iemNativeEmit_or_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1095 uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl, uint8_t cOpBits, uint8_t cImmBits)
1096{
1097 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1098#ifdef RT_ARCH_AMD64
1099 /* On AMD64 we just use the correctly sized OR instruction to harvest the EFLAGS. */
1100 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1101 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, cOpBits, cImmBits, 1, idxRegDst, uImmOp);
1102 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1103
1104 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
1105
1106#elif defined(RT_ARCH_ARM64)
1107 /* On ARM64 we use 32-bit OR for the 8-bit and 16-bit ones, and of
1108 course the immediate variant when possible to save a register load. */
1109 uint32_t uImmSizeLen, uImmRotations;
1110 if ( cOpBits > 32
1111 ? Armv8A64ConvertMask64ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations)
1112 : Armv8A64ConvertMask32ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations))
1113 {
1114 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1115 pCodeBuf[off++] = Armv8A64MkInstrOrrImm(idxRegDst, idxRegDst, uImmSizeLen, uImmRotations, cOpBits > 32 /*f64Bit*/);
1116 }
1117 else
1118 {
1119 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1120 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1121 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDst, idxRegDst, idxRegTmpImm, cOpBits > 32 /*f64Bit*/);
1122 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1123 }
1124 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1125
1126 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
1127 RT_NOREF_PV(cImmBits);
1128
1129#else
1130# error "Port me"
1131#endif
1132 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1133 return off;
1134}
1135
1136
1137/**
1138 * The XOR instruction will clear OF, CF and AF (the latter is undefined) and
1139 * set the other flags according to the result.
1140 */
1141DECL_INLINE_THROW(uint32_t)
1142iemNativeEmit_xor_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1143 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1144{
1145 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1146 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1147#ifdef RT_ARCH_AMD64
1148 /* On AMD64 we just use the correctly sized XOR instruction to harvest the EFLAGS. */
1149 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1150 0x32, 0x33, cOpBits, idxRegDst, idxRegSrc);
1151 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1152 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1153
1154 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
1155
1156#elif defined(RT_ARCH_ARM64)
1157 /* On ARM64 we use 32-bit EOR for the 8-bit and 16-bit ones. */
1158 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1159 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegDst, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/);
1160 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1161 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1162
1163 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
1164
1165#else
1166# error "Port me"
1167#endif
1168 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1169 return off;
1170}
1171
1172
1173/**
1174 * The XOR instruction with immediate value as right operand.
1175 */
1176DECL_INLINE_THROW(uint32_t)
1177iemNativeEmit_xor_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1178 uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl, uint8_t cOpBits, uint8_t cImmBits)
1179{
1180 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1181#ifdef RT_ARCH_AMD64
1182 /* On AMD64 we just use the correctly sized XOR instruction to harvest the EFLAGS. */
1183 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1184 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, cOpBits, cImmBits, 6, idxRegDst, uImmOp);
1185 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1186
1187 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
1188
1189#elif defined(RT_ARCH_ARM64)
1190 /* On ARM64 we use 32-bit EOR for the 8-bit and 16-bit ones, and of
1191 course the immediate variant when possible to save a register load. */
1192 uint32_t uImmSizeLen, uImmRotations;
1193 if ( cOpBits > 32
1194 ? Armv8A64ConvertMask64ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations)
1195 : Armv8A64ConvertMask32ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations))
1196 {
1197 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1198 pCodeBuf[off++] = Armv8A64MkInstrEorImm(idxRegDst, idxRegDst, uImmSizeLen, uImmRotations, cOpBits > 32 /*f64Bit*/);
1199 }
1200 else
1201 {
1202 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1203 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1204 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegDst, idxRegDst, idxRegTmpImm, cOpBits > 32 /*f64Bit*/);
1205 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1206 }
1207 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1208
1209 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
1210 RT_NOREF_PV(cImmBits);
1211
1212#else
1213# error "Port me"
1214#endif
1215 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1216 return off;
1217}
1218
1219
1220
1221/*********************************************************************************************************************************
1222* ADD, ADC, SUB, SBB, CMP *
1223*********************************************************************************************************************************/
1224
1225/**
1226 * The ADD instruction will set all status flags.
1227 */
1228DECL_INLINE_THROW(uint32_t)
1229iemNativeEmit_add_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1230 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1231{
1232 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1233 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1234
1235#ifdef RT_ARCH_AMD64
1236 /* On AMD64 we just use the correctly sized ADD instruction to get the right EFLAGS values. */
1237 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1238 0x02, 0x03, cOpBits, idxRegDst, idxRegSrc);
1239 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1240
1241 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1242 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1243
1244 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1245
1246#elif defined(RT_ARCH_ARM64)
1247 /* On ARM64 we'll need the two input operands as well as the result in order
1248 to calculate the right flags, even if we use ADDS and translate NZCV into
1249 OF, CF, ZF and SF. */
1250 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1251 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4);
1252 if (cOpBits >= 32)
1253 {
1254 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1255 pCodeBuf[off++] = Armv8A64MkInstrAddReg(idxRegDst, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1256 }
1257 else
1258 {
1259 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
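/* (An 8-bit or 16-bit x86 ADD derives CF, OF and SF from bit 7 or 15, so both
   operands are shifted into the top of a 32-bit register; the 32-bit ADDS then
   yields matching native C, V and N flags, and the values are shifted back
   down before the flag calculation below.) */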
1260 uint32_t const cShift = 32 - cOpBits;
1261 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDstIn, ARMV8_A64_REG_XZR, idxRegDst, false /*f64Bit*/, cShift);
1262 pCodeBuf[off++] = Armv8A64MkInstrAddReg(idxRegDst, idxRegDstIn, idxRegSrc, false /*f64Bit*/,
1263 true /*fSetFlags*/, cShift);
1264 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDstIn, idxRegDstIn, cShift, false /*f64Bit*/);
1265 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDst, idxRegDst, cShift, false /*f64Bit*/);
1266 cOpBits = 32;
1267 }
1268 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1269
1270 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, cOpBits, idxRegDst,
1271 idxRegDstIn, idxRegSrc, false /*fInvertCarry*/, 0);
1272
1273 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1274 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1275 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1276
1277#else
1278# error "port me"
1279#endif
1280 return off;
1281}
1282
1283
1284/**
1285 * The ADD instruction with immediate value as right operand.
1286 */
1287DECL_INLINE_THROW(uint32_t)
1288iemNativeEmit_add_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1289 uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl, uint8_t cOpBits, uint8_t cImmBits)
1290{
1291 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1292
1293#ifdef RT_ARCH_AMD64
1294 /* On AMD64 we just use the correctly sized ADD instruction to get the right EFLAGS values. */
1295 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1296 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, cOpBits, cImmBits, 0, idxRegDst, uImmOp);
1297 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1298
1299 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1300
1301 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1302
1303#elif defined(RT_ARCH_ARM64)
1304 /* On ARM64 we'll need the two input operands as well as the result in order
1305 to calculate the right flags, even if we use ADDS and translate NZCV into
1306 OF, CF, ZF and SF. */
1307 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1308 PIEMNATIVEINSTR pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1309 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1310 if (cOpBits >= 32)
1311 {
1312 if (uImmOp <= 0xfffU)
1313 pCodeBuf[off++] = Armv8A64MkInstrAddUImm12(idxRegDst, idxRegDst, uImmOp, cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1314 else if (uImmOp <= 0xfff000U && !(uImmOp & 0xfff))
1315 pCodeBuf[off++] = Armv8A64MkInstrAddUImm12(idxRegDst, idxRegDst, uImmOp >> 12, cOpBits > 32 /*f64Bit*/,
1316 true /*fSetFlags*/, true /*fShift12*/);
1317 else
1318 {
1319 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1320 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1321 pCodeBuf[off++] = Armv8A64MkInstrAddReg(idxRegDst, idxRegDst, idxRegTmpImm, cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1322 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1323 }
1324 }
1325 else
1326 {
1327 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1328 uint32_t const cShift = 32 - cOpBits;
1329 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp << cShift);
1330 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2);
1331 pCodeBuf[off++] = Armv8A64MkInstrAddReg(idxRegDst, idxRegTmpImm, idxRegDstIn, false /*f64Bit*/, true /*fSetFlags*/, cShift);
1332 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDst, idxRegDst, cShift, false /*f64Bit*/);
1333 cOpBits = 32;
1334 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1335 }
1336 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1337
1338 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, cOpBits, idxRegDst,
1339 idxRegDstIn, UINT8_MAX, false /*fInvertCarry*/, uImmOp);
1340
1341 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1342 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1343 RT_NOREF(cImmBits);
1344
1345#else
1346# error "port me"
1347#endif
1348 return off;
1349}
1350
1351
1352/**
1353 * The ADC instruction takes CF as input and will set all status flags.
1354 */
1355DECL_INLINE_THROW(uint32_t)
1356iemNativeEmit_adc_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1357 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1358{
1359 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1360 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1361 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
1362
1363#ifdef RT_ARCH_AMD64
1364 /* On AMD64 we use BT to set EFLAGS.CF and then issue an ADC instruction
1365 with matching size to get the correct flags. */
1366 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 9);
1367
1368 /* Use the BT instruction to set CF according to idxRegEfl. */
1369 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b, 0xba, 32 /*cOpBits*/, 4, idxRegEfl);
1370 pCodeBuf[off++] = X86_EFL_CF_BIT;
1371
1372 off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0x12, 0x13, cOpBits, idxRegDst, idxRegSrc);
1373 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1374
1375 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1376 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1377
1378 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl);
1379
1380#elif defined(RT_ARCH_ARM64)
1381 /* On ARM64 we use the RMIF instruction to load PSTATE.CF from idxRegEfl and
1382 then ADCS for the calculation. We need all inputs and result for the two
1383 flags (AF,PF) that can't be directly derived from PSTATE.NZCV. */
1384 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1385 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 7);
1386
1387 pCodeBuf[off++] = Armv8A64MkInstrRmif(idxRegEfl, (X86_EFL_CF_BIT - 1) & 63, RT_BIT_32(1) /*fMask=C*/);
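     /* RMIF rotates the source right by 63 (i.e. left by one), placing EFLAGS.CF (bit 0)
        in bit 1 of the rotated value; the 0b0010 mask then inserts just that bit into
        PSTATE.C, so the ADCS below sees the guest carry. */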
1388 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1389 if (cOpBits >= 32)
1390 pCodeBuf[off++] = Armv8A64MkInstrAdcs(idxRegDst, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/);
1391 else
1392 {
1393 /* Since we're also adding in the carry flag here, shifting operands up
1394 doesn't work. So, we have to calculate carry & overflow manually. */
1395 pCodeBuf[off++] = Armv8A64MkInstrAdc(idxRegDst, idxRegDst, idxRegSrc, false /*f64Bit*/);
1396 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegDst, cOpBits > 8); /* NZ are okay, CV aren't.*/
1397 }
1398 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1399
1400 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl, cOpBits, idxRegDst,
1401 idxRegDstIn, idxRegSrc, false /*fInvertCarry*/, 0);
1402
1403 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1404 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1405 if (cOpBits < 32)
1406 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegDst, RT_BIT_32(cOpBits) - 1U);
1407 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1408
1409#else
1410# error "port me"
1411#endif
1412 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
1413 return off;
1414}
1415
1416
1417/**
1418 * The ADC instruction with immediate value as right operand.
1419 */
1420DECL_INLINE_THROW(uint32_t)
1421iemNativeEmit_adc_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1422 uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl, uint8_t cOpBits, uint8_t cImmBits)
1423{
1424 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1425 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
1426
1427#ifdef RT_ARCH_AMD64
1428 /* On AMD64 we use BT to set EFLAGS.CF and then issue an ADC instruction
1429 with matching size to get the correct flags. */
1430 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 12);
1431
1432 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b, 0xba, 32 /*cOpBits*/, 4, idxRegEfl);
1433 pCodeBuf[off++] = X86_EFL_CF_BIT;
1434
1435 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, cOpBits, cImmBits, 2, idxRegDst, uImmOp);
1436 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1437
1438 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1439
1440 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl);
1441
1442#elif defined(RT_ARCH_ARM64)
1443 /* On ARM64 we use the RMIF instruction to load PSTATE.CF from idxRegEfl
1444 and then ADCS for the calculation. We need all inputs and result for
1445 the two flags (AF,PF) that can't be directly derived from PSTATE.NZCV. */
1446 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1447 uint8_t const idxRegImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1448 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4);
1449
1450 pCodeBuf[off++] = Armv8A64MkInstrRmif(idxRegEfl, (X86_EFL_CF_BIT - 1) & 63, RT_BIT_32(1) /*fMask=C*/);
1451 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1452 if (cOpBits >= 32)
1453 pCodeBuf[off++] = Armv8A64MkInstrAdcs(idxRegDst, idxRegDst, idxRegImm, cOpBits > 32 /*f64Bit*/);
1454 else
1455 {
1456 /* Since we're also adding in the carry flag here, shifting operands up
1457 doesn't work. So, we have to calculate carry & overflow manually. */
1458 pCodeBuf[off++] = Armv8A64MkInstrAdc(idxRegDst, idxRegDst, idxRegImm, false /*f64Bit*/);
1459 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegDst, cOpBits > 8); /* NZ are okay, CV aren't.*/
1460 }
1461 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1462
1463 iemNativeRegFreeTmp(pReNative, idxRegImm);
1464
1465 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl, cOpBits, idxRegDst,
1466 idxRegDstIn, UINT8_MAX, false /*fInvertCarry*/, uImmOp);
1467
1468 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1469 if (cOpBits < 32)
1470 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegDst, RT_BIT_32(cOpBits) - 1U);
1471 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1472 RT_NOREF(cImmBits);
1473
1474#else
1475# error "port me"
1476#endif
1477 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
1478 return off;
1479}
1480
1481
1482/**
1483 * The SUB instruction will set all status flags.
1484 */
1485DECL_INLINE_THROW(uint32_t)
1486iemNativeEmit_sub_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1487 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1488{
1489 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1490 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1491
1492#ifdef RT_ARCH_AMD64
1493 /* On AMD64 we just use the correctly sized SUB instruction to get the right EFLAGS.SF value. */
1494 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1495 0x2a, 0x2b, cOpBits, idxRegDst, idxRegSrc);
1496 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1497
1498 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1499 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1500
1501 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1502
1503#elif defined(RT_ARCH_ARM64)
1504 /* On ARM64 we'll need the two input operands as well as the result in order
1505 to calculate the right flags, even though we use SUBS and translate NZCV into
1506 OF, CF, ZF and SF. */
1507 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1508 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4);
1509 if (cOpBits >= 32)
1510 {
1511 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1512 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegDst, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1513 }
1514 else
1515 {
1516 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1517 uint32_t const cShift = 32 - cOpBits;
1518 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDstIn, ARMV8_A64_REG_XZR, idxRegDst, false /*f64Bit*/, cShift);
1519 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegDst, idxRegDstIn, idxRegSrc, false /*f64Bit*/,
1520 true /*fSetFlags*/, cShift);
1521 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDstIn, idxRegDstIn, cShift, false /*f64Bit*/);
1522 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDst, idxRegDst, cShift, false /*f64Bit*/);
1523 cOpBits = 32;
1524 }
1525 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1526
1527 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, cOpBits, idxRegDst,
1528 idxRegDstIn, idxRegSrc, true /*fInvertCarry*/, 0);
1529
1530 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1531 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1532 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1533
1534#else
1535# error "port me"
1536#endif
1537 return off;
1538}
1539
1540
1541/**
1542 * The SUB instruction with immediate value as right operand.
1543 */
1544DECL_INLINE_THROW(uint32_t)
1545iemNativeEmit_sub_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1546 uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl, uint8_t cOpBits, uint8_t cImmBits)
1547{
1548 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1549
1550#ifdef RT_ARCH_AMD64
1551 /* On AMD64 we just use the correctly sized SUB instruction to get the right EFLAGS.SF value. */
1552 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1553 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, cOpBits, cImmBits, 5, idxRegDst, uImmOp);
1554 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1555
1556 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1557
1558 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1559
1560#elif defined(RT_ARCH_ARM64)
1561 /* On ARM64 we'll need the two input operands as well as the result in order
1562 to calculate the right flags, even though we use SUBS and translate NZCV into
1563 OF, CF, ZF and SF. */
1564 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1565 PIEMNATIVEINSTR pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1566 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1567 if (cOpBits >= 32)
1568 {
1569 if (uImmOp <= 0xfffU)
1570 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegDst, idxRegDst, uImmOp, cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1571 else if (uImmOp <= 0xfff000U && !(uImmOp & 0xfff))
1572 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegDst, idxRegDst, uImmOp >> 12, cOpBits > 32 /*f64Bit*/,
1573 true /*fSetFlags*/, true /*fShift12*/);
1574 else
1575 {
1576 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1577 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1578 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegDst, idxRegDst, idxRegTmpImm, cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1579 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1580 }
1581 }
1582 else
1583 {
1584 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1585 uint32_t const cShift = 32 - cOpBits;
1586 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1587 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4);
1588 pCodeBuf[off++] = Armv8A64MkInstrLslImm(idxRegDstIn, idxRegDstIn, cShift, false /*f64Bit*/);
1589 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegDst, idxRegDstIn, idxRegTmpImm, false /*f64Bit*/, true /*fSetFlags*/, cShift);
1590 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDstIn, idxRegDstIn, cShift, false /*f64Bit*/);
1591 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDst, idxRegDst, cShift, false /*f64Bit*/);
1592 cOpBits = 32;
1593 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1594 }
1595 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1596
1597 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, cOpBits, idxRegDst,
1598 idxRegDstIn, UINT8_MAX, true /*fInvertCarry*/, uImmOp);
1599
1600 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1601 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1602 RT_NOREF(cImmBits);
1603
1604#else
1605# error "port me"
1606#endif
1607 return off;
1608}
1609
1610
1611/**
1612 * The CMP instruction will set all status flags, but modifies no registers.
1613 */
1614DECL_INLINE_THROW(uint32_t)
1615iemNativeEmit_cmp_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1616 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1617{
1618 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1619 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1620
1621#ifdef RT_ARCH_AMD64
1622 /* On AMD64 we just use the correctly sized CMP instruction to get the right EFLAGS.SF value. */
1623 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1624 0x3a, 0x3b, cOpBits, idxRegDst, idxRegSrc);
1625 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1626
1627 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1628 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1629
1630 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1631
1632#elif defined(RT_ARCH_ARM64)
1633 /* On ARM64 we'll need the actual result as well as both input operands in order
1634 to calculate the right flags, even though we use SUBS and translate NZCV into
1635 OF, CF, ZF and SF. */
1636 uint8_t const idxRegResult = iemNativeRegAllocTmp(pReNative, &off);
1637 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 3);
1638 if (cOpBits >= 32)
1639 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegResult, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1640 else
1641 {
1642 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1643 uint32_t const cShift = 32 - cOpBits;
1644 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegResult, ARMV8_A64_REG_XZR, idxRegDst, false /*f64Bit*/, cShift);
1645 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegResult, idxRegResult, idxRegSrc, false /*f64Bit*/,
1646 true /*fSetFlags*/, cShift);
1647 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegResult, idxRegResult, cShift, false /*f64Bit*/);
1648 cOpBits = 32;
1649 }
1650 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1651
1652 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, cOpBits, idxRegResult,
1653 idxRegDst, idxRegSrc, true /*fInvertCarry*/, 0);
1654
1655 iemNativeRegFreeTmp(pReNative, idxRegResult);
1656 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1657 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1658
1659#else
1660# error "port me"
1661#endif
1662 return off;
1663}
1664
1665
1666/**
1667 * The CMP instruction with immediate value as right operand.
1668 */
1669DECL_INLINE_THROW(uint32_t)
1670iemNativeEmit_cmp_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1671 uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl, uint8_t cOpBits, uint8_t cImmBits)
1672{
1673 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1674
1675#ifdef RT_ARCH_AMD64
1676 /* On AMD64 we just use the correctly sized CMP instruction to get the right EFLAGS.SF value. */
1677 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1678 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, cOpBits, cImmBits, 7, idxRegDst, uImmOp);
1679 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1680
1681 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1682
1683 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1684
1685#elif defined(RT_ARCH_ARM64)
1686 /* On ARM64 we'll need the actual result as well as both input operands in order
1687 to calculate the right flags, even though we use SUBS and translate NZCV into
1688 OF, CF, ZF and SF. */
1689 uint8_t const idxRegResult = iemNativeRegAllocTmp(pReNative, &off);
1690 PIEMNATIVEINSTR pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1691 if (cOpBits >= 32)
1692 {
1693 if (uImmOp <= 0xfffU)
1694 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegResult, idxRegDst, uImmOp, cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1695 else if (uImmOp <= 0xfff000U && !(uImmOp & 0xfff))
1696 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegResult, idxRegDst, uImmOp >> 12, cOpBits > 32 /*f64Bit*/,
1697 true /*fSetFlags*/, true /*fShift12*/);
1698 else
1699 {
1700 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1701 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1702 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegResult, idxRegDst, idxRegTmpImm, cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1703 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1704 }
1705 }
1706 else
1707 {
1708 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1709 uint32_t const cShift = 32 - cOpBits;
1710 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1711 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 3);
1712 pCodeBuf[off++] = Armv8A64MkInstrLslImm(idxRegResult, idxRegDst, cShift, false /*f64Bit*/);
1713 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegResult, idxRegResult, idxRegTmpImm, false /*f64Bit*/, true /*fSetFlags*/, cShift);
1714 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegResult, idxRegResult, cShift, false /*f64Bit*/);
1715 cOpBits = 32;
1716 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1717 }
1718 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1719
1720 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, cOpBits, idxRegResult,
1721 idxRegDst, UINT8_MAX, true /*fInvertCarry*/, uImmOp);
1722
1723 iemNativeRegFreeTmp(pReNative, idxRegResult);
1724 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1725 RT_NOREF(cImmBits);
1726
1727#else
1728# error "port me"
1729#endif
1730 return off;
1731}
1732
1733
1734/**
1735 * The SBB instruction takes CF as input and will set all status flags.
1736 */
1737DECL_INLINE_THROW(uint32_t)
1738iemNativeEmit_sbb_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1739 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1740{
1741 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1742 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1743 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
1744
1745#ifdef RT_ARCH_AMD64
1746 /* On AMD64 we use BT to set EFLAGS.CF and then issue an SBB instruction
1747 with matching size to get the correct flags. */
1748 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 9);
1749
1750 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b, 0xba, 32 /*cOpBits*/, 4, idxRegEfl);
1751 pCodeBuf[off++] = X86_EFL_CF_BIT;
1752
1753 off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0x1a, 0x1b, cOpBits, idxRegDst, idxRegSrc);
1754 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1755
1756 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1757 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1758
1759 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl);
1760
1761#elif defined(RT_ARCH_ARM64)
1762 /* On ARM64 we use the RMIF+CFINV instructions to load PSTATE.CF from
1763 idxRegEfl and then SBCS for the calculation. We need all inputs and
1764 result for the two flags (AF,PF) that can't be directly derived from
1765 PSTATE.NZCV. */
1766 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1767 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
1768
1769 pCodeBuf[off++] = Armv8A64MkInstrRmif(idxRegEfl, (X86_EFL_CF_BIT - 1) & 63, RT_BIT_32(1) /*fMask=C*/);
1770 pCodeBuf[off++] = ARMV8_A64_INSTR_CFINV;
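     /* CFINV is needed because ARM64 subtraction uses an inverted carry (SBC subtracts
        !C as the borrow), whereas x86 SBB subtracts CF directly, so the loaded carry
        must be inverted before the SBCS. */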
1771 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1772 if (cOpBits >= 32)
1773 pCodeBuf[off++] = Armv8A64MkInstrSbcs(idxRegDst, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/);
1774 else
1775 {
1776 /* Since we're also subtracting the carry (borrow) flag here, shifting operands up
1777 doesn't work. So we have to calculate carry & overflow manually. */
1778 pCodeBuf[off++] = Armv8A64MkInstrSbc(idxRegDst, idxRegDst, idxRegSrc, false /*f64Bit*/);
1779 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegDst, cOpBits > 8); /* NZ are okay, CV aren't.*/
1780 }
1781 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1782
1783 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl, cOpBits, idxRegDst,
1784 idxRegDstIn, idxRegSrc, true /*fInvertCarry*/, 0);
1785
1786 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1787 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1788 if (cOpBits < 32)
1789 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegDst, RT_BIT_32(cOpBits) - 1U);
1790 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1791
1792#else
1793# error "port me"
1794#endif
1795 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
1796 return off;
1797}
1798
1799
1800/**
1801 * The SBB instruction with immediate value as right operand.
1802 */
1803DECL_INLINE_THROW(uint32_t)
1804iemNativeEmit_sbb_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1805 uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl, uint8_t cOpBits, uint8_t cImmBits)
1806{
1807 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1808 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
1809
1810#ifdef RT_ARCH_AMD64
1811 /* On AMD64 we use BT to set EFLAGS.CF and then issue an SBB instruction
1812 with matching size to get the correct flags. */
1813 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 12);
1814
1815 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b, 0xba, 32 /*cOpBits*/, 4, idxRegEfl);
1816 pCodeBuf[off++] = X86_EFL_CF_BIT;
1817
1818 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, cOpBits, cImmBits, 3, idxRegDst, uImmOp);
1819 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1820
1821 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1822
1823 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl);
1824
1825#elif defined(RT_ARCH_ARM64)
1826 /* On ARM64 we use the RMIF+CFINV instructions to load PSTATE.CF from
1827 idxRegEfl and then SBCS for the calculation. We need all inputs and
1828 result for the two flags (AF,PF) that can't be directly derived from
1829 PSTATE.NZCV. */
1830 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1831 uint8_t const idxRegImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1832 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
1833
1834 pCodeBuf[off++] = Armv8A64MkInstrRmif(idxRegEfl, (X86_EFL_CF_BIT - 1) & 63, RT_BIT_32(1) /*fMask=C*/);
1835 pCodeBuf[off++] = ARMV8_A64_INSTR_CFINV;
1836 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1837 if (cOpBits >= 32)
1838 pCodeBuf[off++] = Armv8A64MkInstrSbcs(idxRegDst, idxRegDst, idxRegImm, cOpBits > 32 /*f64Bit*/);
1839 else
1840 {
1841 /* Since we're also subtracting the carry (borrow) flag here, shifting operands up
1842 doesn't work. So we have to calculate carry & overflow manually. */
1843 pCodeBuf[off++] = Armv8A64MkInstrSbc(idxRegDst, idxRegDst, idxRegImm, false /*f64Bit*/);
1844 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegDst, cOpBits > 8); /* NZ are okay, CV aren't.*/
1845 }
1846 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1847
1848 iemNativeRegFreeTmp(pReNative, idxRegImm);
1849
1850 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl, cOpBits, idxRegDst,
1851 idxRegDstIn, UINT8_MAX, true /*fInvertCarry*/, uImmOp);
1852
1853 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1854 if (cOpBits < 32)
1855 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegDst, RT_BIT_32(cOpBits) - 1U);
1856 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1857 RT_NOREF(cImmBits);
1858
1859#else
1860# error "port me"
1861#endif
1862 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
1863 return off;
1864}
1865
1866
1867DECL_INLINE_THROW(uint32_t)
1868iemNativeEmit_imul_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1869 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1870{
1871 RT_NOREF(idxVarDst, idxVarSrc, idxVarEfl, cOpBits);
1872 AssertFailed();
1873 return iemNativeEmitBrk(pReNative, off, 0x666);
1874}
1875
1876
1877DECL_INLINE_THROW(uint32_t)
1878iemNativeEmit_popcnt_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1879 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1880{
1881 RT_NOREF(idxVarDst, idxVarSrc, idxVarEfl, cOpBits);
1882 AssertFailed();
1883 return iemNativeEmitBrk(pReNative, off, 0x666);
1884}
1885
1886
1887DECL_INLINE_THROW(uint32_t)
1888iemNativeEmit_tzcnt_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1889 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1890{
1891 RT_NOREF(idxVarDst, idxVarSrc, idxVarEfl, cOpBits);
1892 AssertFailed();
1893 return iemNativeEmitBrk(pReNative, off, 0x666);
1894}
1895
1896
1897DECL_INLINE_THROW(uint32_t)
1898iemNativeEmit_lzcnt_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1899 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1900{
1901 RT_NOREF(idxVarDst, idxVarSrc, idxVarEfl, cOpBits);
1902 AssertFailed();
1903 return iemNativeEmitBrk(pReNative, off, 0x666);
1904}
1905
1906
1907
1908/*********************************************************************************************************************************
1909* Shifting and Rotating. *
1910*********************************************************************************************************************************/
1911
1912
1913typedef enum
1914{
1915 kIemNativeEmitEFlagsForShiftType_Left,
1916 kIemNativeEmitEFlagsForShiftType_Right,
1917 kIemNativeEmitEFlagsForShiftType_SignedRight
1918} IEMNATIVEEMITEFLAGSFORSHIFTTYPE;
1919
1920/**
1921 * This is used by SHL, SHR and SAR emulation.
1922 *
1923 * It takes liveness stuff into account.
1924 */
1925DECL_INLINE_THROW(uint32_t)
1926iemNativeEmitEFlagsForShift(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxRegEfl, uint8_t idxRegResult,
1927 uint8_t idxRegSrc, uint8_t idxRegCount, uint8_t cOpBits, IEMNATIVEEMITEFLAGSFORSHIFTTYPE enmType,
1928 uint8_t idxRegTmp)
1929{
1930 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflTotalShift);
1931
1932 RT_NOREF(pReNative, off, idxRegEfl, idxRegResult, idxRegSrc, idxRegCount, cOpBits, enmType);
1933#if 0 //def IEMNATIVE_WITH_EFLAGS_SKIPPING
1934 /*
1935 * See if we can skip this wholesale.
1936 */
1937 PCIEMLIVENESSENTRY const pLivenessEntry = &pReNative->paLivenessEntries[pReNative->idxCurCall];
1938 if ( IEMLIVENESS_STATE_ARE_STATUS_EFL_TO_BE_CLOBBERED(pLivenessEntry)
1939 && !(pReNative->fMc & IEM_MC_F_WITH_FLAGS))
1940 {
1941 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflSkippedShift);
1942 pReNative->fSkippingEFlags |= X86_EFL_STATUS_BITS;
1943# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
1944 off = iemNativeEmitOrImmIntoVCpuU32(pReNative, off, X86_EFL_STATUS_BITS, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
1945# endif
1946 }
1947 else
1948#endif
1949 {
1950 /*
1951 * The differences between Intel and AMD flags for SHL are:
1952 * - Intel always clears AF while AMD always sets it.
1953 * - Intel derives OF from the first 1-bit shift step, while AMD derives
1954 *   it from the last step.
1955 */
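     /* Example (illustrative): for 'shl al, 3' Intel derives OF from the first 1-bit
        step (bit 7 ^ bit 6 of the original AL), AMD from the last step (final CF ^
        bit 7 of the result), so the two flavours can disagree for counts > 1. */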
1956
1957#ifdef RT_ARCH_AMD64
1958 /*
1959 * We capture the flags and do the additional OF and AF calculations as needed.
1960 */
1961 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 64);
1962 /** @todo kIemNativeEmitEFlagsForShiftType_SignedRight: we could alternatively
1963 * use LAHF here when host rax is free, since OF is cleared. */
1964 /* pushf */
1965 pCodeBuf[off++] = 0x9c;
1966 /* pop tmp */
1967 if (idxRegTmp >= 8)
1968 pCodeBuf[off++] = X86_OP_REX_B;
1969 pCodeBuf[off++] = 0x58 + (idxRegTmp & 7);
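     /* The PUSHF/POP pair captures the host EFLAGS left behind by the shift
        instruction the caller just emitted; idxRegTmp now holds the raw host
        flags that get masked and merged into the guest EFLAGS below. */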
1970 /* Clear the status bits in EFLs. */
1971 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegEfl, ~X86_EFL_STATUS_BITS);
1972 uint8_t const idxTargetCpuEflFlavour = pReNative->pVCpu->iem.s.aidxTargetCpuEflFlavour[1];
1973 if (idxTargetCpuEflFlavour == IEMTARGETCPU_EFL_BEHAVIOR_NATIVE)
1974 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegTmp, X86_EFL_STATUS_BITS);
1975 else
1976 {
1977 /* and tmp, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_CF */
1978 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegTmp, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_CF);
1979 if (idxTargetCpuEflFlavour == IEMTARGETCPU_EFL_BEHAVIOR_AMD)
1980 off = iemNativeEmitOrGpr32ByImmEx(pCodeBuf, off, idxRegTmp, X86_EFL_AF);
1981 /* OR in the flags we collected. */
1982 off = iemNativeEmitOrGpr32ByGprEx(pCodeBuf, off, idxRegEfl, idxRegTmp);
1983
1984 /* Calculate OF */
1985 if (idxTargetCpuEflFlavour == IEMTARGETCPU_EFL_BEHAVIOR_AMD)
1986 {
1987 /* AMD last bit shifted: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
1988 /* bt idxRegResult, (cOpBits - 1) => CF=result-sign-bit */
1989 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b /*ud2*/, 0xba,
1990 RT_MAX(cOpBits, 16), 4, idxRegResult);
1991 pCodeBuf[off++] = cOpBits - 1;
1992 /* setc idxRegTmp */
1993 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x92, 0x0b /*ud2*/, 8, 0, idxRegTmp);
1994 /* xor idxRegTmp, idxRegEfl */
1995 off = iemNativeEmitXorGpr32ByGpr32Ex(pCodeBuf, off, idxRegTmp, idxRegEfl);
1996 /* and idxRegTmp, 1 */
1997 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegTmp, 1);
1998 /* shl idxRegTmp, X86_EFL_OF_BIT */
1999 off = iemNativeEmitShiftGpr32LeftEx(pCodeBuf, off, idxRegTmp, X86_EFL_OF_BIT);
2000 }
2001 else
2002 {
2003 /* Intel first bit shifted: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
2004 if (cOpBits <= 32)
2005 {
2006 /* mov idxRegTmp, idxRegSrc */
2007 off = iemNativeEmitLoadGprFromGpr32Ex(pCodeBuf, off, idxRegTmp, idxRegSrc);
2008 /* shl idxRegTmp, 1 */
2009 off = iemNativeEmitShiftGpr32LeftEx(pCodeBuf, off, idxRegTmp, 1);
2010 /* xor idxRegTmp, idxRegSrc */
2011 off = iemNativeEmitXorGprByGprEx(pCodeBuf, off, idxRegTmp, idxRegSrc);
2012 /* shr idxRegTmp, cOpBits - X86_EFL_OF_BIT - 1 or shl idxRegTmp, X86_EFL_OF_BIT - cOpBits + 1 */
2013 if (cOpBits >= X86_EFL_OF_BIT)
2014 off = iemNativeEmitShiftGpr32RightEx(pCodeBuf, off, idxRegTmp, cOpBits - X86_EFL_OF_BIT - 1);
2015 else
2016 off = iemNativeEmitShiftGpr32LeftEx(pCodeBuf, off, idxRegTmp, X86_EFL_OF_BIT - cOpBits + 1);
2017 }
2018 else
2019 {
2020 /* Same as above, but with 64-bit GPRs. */
2021 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegTmp, idxRegSrc);
2022 off = iemNativeEmitShiftGprLeftEx(pCodeBuf, off, idxRegTmp, 1);
2023 off = iemNativeEmitXorGprByGprEx(pCodeBuf, off, idxRegTmp, idxRegSrc);
2024 off = iemNativeEmitShiftGprRightEx(pCodeBuf, off, idxRegTmp, cOpBits - X86_EFL_OF_BIT - 1);
2025 }
2026 /* and idxRegTmp, X86_EFL_OF */
2027 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegTmp, X86_EFL_OF);
2028 }
2029 }
2030 /* Or in the collected flag(s) */
2031 off = iemNativeEmitOrGpr32ByGprEx(pCodeBuf, off, idxRegEfl, idxRegTmp);
2032
2033#elif defined(RT_ARCH_ARM64)
2034 /*
2035 * Calculate flags.
2036 */
2037 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 20);
2038
2039 /* Clear the status bits. ~0x8D5 (or ~0x8FD) can't be AND immediate, so use idxRegTmp for constant. */
2040 off = iemNativeEmitLoadGpr32ImmEx(pCodeBuf, off, idxRegTmp, ~X86_EFL_STATUS_BITS);
2041 off = iemNativeEmitAndGpr32ByGpr32Ex(pCodeBuf, off, idxRegEfl, idxRegTmp);
2042
2043 /* N,Z -> SF,ZF */
2044 if (cOpBits < 32)
2045 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegResult, cOpBits > 8); /* sets NZ */
2046 else
2047 pCodeBuf[off++] = Armv8A64MkInstrAnds(ARMV8_A64_REG_XZR, idxRegResult, idxRegResult, cOpBits > 32 /*f64Bit*/);
2048 pCodeBuf[off++] = Armv8A64MkInstrMrs(idxRegTmp, ARMV8_AARCH64_SYSREG_NZCV); /* Bits: 31=N; 30=Z; 29=C; 28=V; */
2049 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegTmp, 30);
2050 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_ZF_BIT, 2, false /*f64Bit*/);
2051 AssertCompile(X86_EFL_ZF_BIT + 1 == X86_EFL_SF_BIT);
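     /* After the LSR, idxRegTmp holds Z in bit 0 and N in bit 1, which the BFI drops
        straight into ZF (bit 6) and SF (bit 7) of the guest EFLAGS in one go. */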
2052
2053 /* Calculate 8-bit parity of the result. */
2054 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegResult, idxRegResult, false /*f64Bit*/,
2055 4 /*offShift6*/, kArmv8A64InstrShift_Lsr);
2056 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegTmp, idxRegTmp, false /*f64Bit*/,
2057 2 /*offShift6*/, kArmv8A64InstrShift_Lsr);
2058 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegTmp, idxRegTmp, false /*f64Bit*/,
2059 1 /*offShift6*/, kArmv8A64InstrShift_Lsr);
2060 Assert(Armv8A64ConvertImmRImmS2Mask32(0, 0) == 1);
2061 pCodeBuf[off++] = Armv8A64MkInstrEorImm(idxRegTmp, idxRegTmp, 0, 0, false /*f64Bit*/);
2062 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_PF_BIT, 1, false /*f64Bit*/);
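     /* Rough C equivalent of the parity fold above (illustrative only):
            uTmp  = uResult ^ (uResult >> 4);
            uTmp ^= uTmp >> 2;
            uTmp ^= uTmp >> 1;
            fEfl |= ((uTmp & 1) ^ 1) << X86_EFL_PF_BIT;
        i.e. the low 8 result bits are XOR-folded into bit 0 and inverted, since
        x86 PF is set for an even number of set bits. */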
2063
2064 /* Calculate carry - the last bit shifted out of the input value. */
2065 if (enmType == kIemNativeEmitEFlagsForShiftType_Left)
2066 {
2067 /* CF = (idxRegSrc >> (cOpBits - idxRegCount))) & 1 */
2068 pCodeBuf[off++] = Armv8A64MkInstrMovZ(idxRegTmp, cOpBits);
2069 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegTmp, idxRegTmp, idxRegCount, false /*f64Bit*/, cOpBits < 32 /*fSetFlags*/);
2070 if (cOpBits < 32)
2071 pCodeBuf[off++] = Armv8A64MkInstrBCond(kArmv8InstrCond_Cc, 3); /* 16 or 8 bit: CF is clear if all shifted out */
2072 pCodeBuf[off++] = Armv8A64MkInstrLsrv(idxRegTmp, idxRegSrc, idxRegTmp, cOpBits > 32);
2073 }
2074 else
2075 {
2076 /* CF = (idxRegSrc >> (idxRegCount - 1)) & 1 */
2077 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegTmp, idxRegCount, 1, false /*f64Bit*/);
2078 pCodeBuf[off++] = Armv8A64MkInstrLsrv(idxRegTmp, idxRegSrc, idxRegTmp, cOpBits > 32);
2079 }
2080 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_CF_BIT, 1, false /*f64Bit*/);
2081
2082 uint8_t const idxTargetCpuEflFlavour = pReNative->pVCpu->iem.s.aidxTargetCpuEflFlavour[0];
2083 if (idxTargetCpuEflFlavour != IEMTARGETCPU_EFL_BEHAVIOR_AMD)
2084 {
2085 /* Intel: OF = first bit shifted: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
2086 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegSrc, idxRegSrc, cOpBits > 32, 1 /*left shift count*/);
2087 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegTmp, cOpBits - 1, cOpBits > 32);
2088 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_OF_BIT, 1, false /*f64Bit*/);
2089 }
2090 else
2091 {
2092 /* AMD: OF = last bit shifted: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
2093 AssertCompile(X86_EFL_CF_BIT == 0);
2094 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegEfl, idxRegResult, cOpBits > 32, /* ASSUMES CF calculated! */
2095 cOpBits - 1, kArmv8A64InstrShift_Lsr);
2096 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_OF_BIT, 1, false /*f64Bit*/);
2097
2098 /* AMD unconditionally sets AF. */
2099 Assert(Armv8A64ConvertImmRImmS2Mask32(0, 32 - X86_EFL_AF_BIT) == X86_EFL_AF);
2100 pCodeBuf[off++] = Armv8A64MkInstrOrrImm(idxRegEfl, idxRegEfl, 0, 32 - X86_EFL_AF_BIT, false /*f64Bit*/);
2101 }
2102#else
2103# error "port me"
2104#endif
2105 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2106
2107#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
2108 pReNative->fSkippingEFlags = 0;
2109# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
2110 off = iemNativeEmitStoreImmToVCpuU32(pReNative, off, 0, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
2111# endif
2112#endif
2113 }
2114 return off;
2115}
2116
2117
2118DECL_INLINE_THROW(uint32_t)
2119iemNativeEmit_shl_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2120 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2121{
2122 /* Note! Since we're doing some branching here, we need to allocate all
2123 registers we need before the jump or we may end up with invalid
2124 register state if the branch is taken. */
2125 uint8_t const idxRegTmp = iemNativeRegAllocTmp(pReNative, &off); /* Do this first in hope we'll get EAX. */
2126 uint8_t const idxRegCount = iemNativeVarRegisterAcquire(pReNative, idxVarCount, &off, true /*fInitialized*/); /* modified on arm64 */
2127 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
2128 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
2129
2130#ifdef RT_ARCH_AMD64
2131 /* Make sure IEM_MC_NATIVE_AMD64_HOST_REG_FOR_LOCAL was used. */
2132 AssertStmt(idxRegCount == X86_GREG_xCX, IEMNATIVE_DO_LONGJMP(pReNative, VERR_IEM_EMIT_UNEXPECTED_VAR_REGISTER));
2133
2134 /* We only need a copy of the input value if the target CPU differs from the host CPU. */
2135 uint8_t const idxRegDstIn = pReNative->pVCpu->iem.s.aidxTargetCpuEflFlavour[1] == IEMTARGETCPU_EFL_BEHAVIOR_NATIVE
2136 ? UINT8_MAX : iemNativeRegAllocTmp(pReNative, &off);
2137 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4+2+3+4);
2138
2139 /* Check if it's NOP before we do anything. */
2140 off = iemNativeEmitTestAnyBitsInGpr8Ex(pCodeBuf, off, idxRegCount, cOpBits <= 32 ? 0x1f : 0x3f);
2141 uint32_t const offFixup = off;
2142 off = iemNativeEmitJccToFixedEx(pCodeBuf, off, off /*8-bit should be enough */, kIemNativeInstrCond_z);
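     /* A (masked) shift count of zero leaves all EFLAGS unchanged on x86, so the
        jump above skips both the shift and the entire flags calculation. */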
2143
2144 if (idxRegDstIn != UINT8_MAX)
2145 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
2146 off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0xd2, 0xd3, cOpBits, 4, idxRegDst);
2147
2148#elif defined(RT_ARCH_ARM64)
2149 /* We always need a copy of the input value (unless we can skip the EFLAGS calcs). */
2150 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
2151 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6);
2152
2153 /* Check if it's NOP before we do anything. We MODIFY idxRegCount here! */
2154 Assert(Armv8A64ConvertImmRImmS2Mask32(4, 0) == 0x1f);
2155 Assert(Armv8A64ConvertImmRImmS2Mask32(5, 0) == 0x3f);
2156 pCodeBuf[off++] = Armv8A64MkInstrAndsImm(idxRegCount, idxRegCount, cOpBits > 32 ? 5 : 4, 0, false /*f64Bit*/);
2157 uint32_t const offFixup = off;
2158 off = iemNativeEmitJccToFixedEx(pCodeBuf, off, off, kArmv8InstrCond_Eq);
2159
2160 pCodeBuf[off++] = Armv8A64MkInstrMov(idxRegDstIn, idxRegDst);
2161 pCodeBuf[off++] = Armv8A64MkInstrLslv(idxRegDst, idxRegDst, idxRegCount, cOpBits > 32 /*f64Bit*/);
2162 if (cOpBits < 32)
2163 {
2164 Assert(Armv8A64ConvertImmRImmS2Mask32(7, 0) == 0xff);
2165 Assert(Armv8A64ConvertImmRImmS2Mask32(15, 0) == 0xffff);
2166 pCodeBuf[off++] = Armv8A64MkInstrAndImm(idxRegDst, idxRegDst, cOpBits - 1, 0, false /*f64Bit*/);
2167 }
2168
2169#else
2170# error "port me"
2171#endif
2172
2173 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2174 off = iemNativeEmitEFlagsForShift(pReNative, off, idxRegEfl, idxRegDst, idxRegDstIn, idxRegCount,
2175 cOpBits, kIemNativeEmitEFlagsForShiftType_Left, idxRegTmp);
2176
2177 /* fixup the jump */
2178 iemNativeFixupFixedJump(pReNative, offFixup, off);
2179
2180#ifdef RT_ARCH_AMD64
2181 if (idxRegDstIn != UINT8_MAX)
2182#endif
2183 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
2184 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
2185 iemNativeVarRegisterRelease(pReNative, idxVarDst);
2186 iemNativeVarRegisterRelease(pReNative, idxVarCount);
2187 iemNativeRegFreeTmp(pReNative, idxRegTmp);
2188 return off;
2189}
2190
2191
2192DECL_INLINE_THROW(uint32_t)
2193iemNativeEmit_shr_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2194 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2195{
2196 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2197 AssertFailed();
2198 return iemNativeEmitBrk(pReNative, off, 0x666);
2199}
2200
2201
2202DECL_INLINE_THROW(uint32_t)
2203iemNativeEmit_sar_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2204 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2205{
2206 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2207 AssertFailed();
2208 return iemNativeEmitBrk(pReNative, off, 0x666);
2209}
2210
2211
2212DECL_INLINE_THROW(uint32_t)
2213iemNativeEmit_rol_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2214 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2215{
2216 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2217 AssertFailed();
2218 return iemNativeEmitBrk(pReNative, off, 0x666);
2219}
2220
2221
2222DECL_INLINE_THROW(uint32_t)
2223iemNativeEmit_ror_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2224 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2225{
2226 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2227 AssertFailed();
2228 return iemNativeEmitBrk(pReNative, off, 0x666);
2229}
2230
2231
2232DECL_INLINE_THROW(uint32_t)
2233iemNativeEmit_rcl_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2234 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2235{
2236 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2237 AssertFailed();
2238 return iemNativeEmitBrk(pReNative, off, 0x666);
2239}
2240
2241
2242DECL_INLINE_THROW(uint32_t)
2243iemNativeEmit_rcr_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2244 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2245{
2246 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2247 AssertFailed();
2248 return iemNativeEmitBrk(pReNative, off, 0x666);
2249}
2250
2251
2252
2253#ifdef IEMNATIVE_WITH_SIMD_REG_ALLOCATOR
2254/*********************************************************************************************************************************
2255* SIMD emitters. *
2256*********************************************************************************************************************************/
2257
2258/**
2259 * Common emitter for packed logical instructions (POR, PXOR, PAND).
2260 */
2261#ifdef RT_ARCH_AMD64
2262# define IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(a_Instr, a_enmArmOp, a_bOpcX86) \
2263 DECL_INLINE_THROW(uint32_t) \
2264 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2265 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2266 { \
2267 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2268 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2269 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2270 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2271 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2272 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2273 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2274 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2275 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2276 pCodeBuf[off++] = 0x0f; \
2277 pCodeBuf[off++] = (a_bOpcX86); \
2278 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2279 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2280 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2281 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2282 return off; \
2283 } \
2284 DECL_INLINE_THROW(uint32_t) \
2285 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2286 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2287 { \
2288 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2289 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2290 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2291 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2292 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2293 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2294 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2295 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2296 pCodeBuf[off++] = 0x0f; \
2297 pCodeBuf[off++] = (a_bOpcX86); \
2298 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2299 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2300 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2301 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2302 return off; \
2303 } \
2304 typedef int ignore_semicolon
2305#elif defined(RT_ARCH_ARM64)
2306# define IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(a_Instr, a_enmArmOp, a_bOpcX86) \
2307 DECL_INLINE_THROW(uint32_t) \
2308 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2309 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2310 { \
2311 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2312 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2313 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2314 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2315 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2316 pCodeBuf[off++] = Armv8A64MkVecInstrLogical((a_enmArmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc); \
2317 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2318 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2319 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2320 return off; \
2321 } \
2322 DECL_INLINE_THROW(uint32_t) \
2323 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2324 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2325 { \
2326 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2327 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2328 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2329 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2330 pCodeBuf[off++] = Armv8A64MkVecInstrLogical((a_enmArmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc); \
2331 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2332 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2333 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2334 return off; \
2335 } \
2336 typedef int ignore_semicolon
2337#else
2338# error "Port me"
2339#endif
2340
2341/* POR, ORPS, ORPD. */
2342IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(por, kArmv8VecInstrLogicOp_Orr, 0xeb);
2343/* PXOR, XORPS, XORPD. */
2344IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(pxor, kArmv8VecInstrLogicOp_Eor, 0xef);
2345/* PAND, ANDPS, ANDPD. */
2346IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(pand, kArmv8VecInstrLogicOp_And, 0xdb);
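     /* Note: each instantiation above expands into an iemNativeEmit_<instr>_rr_u128 and
        an _rv_u128 variant; on AMD64 they emit the SSE2 form 66 [REX] 0F <opc> /r, on
        ARM64 the corresponding NEON logical operation on the low 128 bits. */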
2347
2348
2349/**
2350 * Common emitter for the shift right with immediate instructions.
2351 */
2352#ifdef RT_ARCH_AMD64
2353# define IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(a_Instr, a_cShiftMax, a_ArmElemSz, a_bOpcX86) \
2354 DECL_INLINE_THROW(uint32_t) \
2355 RT_CONCAT3(iemNativeEmit_,a_Instr,_ri_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2356 uint8_t const idxSimdGstRegDst, uint8_t const bImm) \
2357 { \
2358 if (bImm) \
2359 { \
2360 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2361 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2362 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6); \
2363 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2364 if (idxSimdRegDst >= 8) \
2365 pCodeBuf[off++] = X86_OP_REX_B; \
2366 pCodeBuf[off++] = 0x0f; \
2367 pCodeBuf[off++] = (a_bOpcX86); \
2368 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 2, idxSimdRegDst & 7); \
2369 pCodeBuf[off++] = bImm; \
2370 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2371 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2372 } \
2373 /* Immediate 0 is a nop. */ \
2374 return off; \
2375 } \
2376 typedef int ignore_semicolon
2377#elif defined(RT_ARCH_ARM64)
2378# define IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(a_Instr, a_cShiftMax, a_ArmElemSz, a_bOpcX86) \
2379 DECL_INLINE_THROW(uint32_t) \
2380 RT_CONCAT3(iemNativeEmit_,a_Instr,_ri_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2381 uint8_t const idxSimdGstRegDst, uint8_t const bImm) \
2382 { \
2383 if (bImm) \
2384 { \
2385 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2386 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2387 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2388 pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegDst, idxSimdRegDst, RT_MIN(bImm, (a_cShiftMax)), (a_ArmElemSz)); \
2389 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2390 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2391 } \
2392 /* Immediate 0 is a nop. */ \
2393 return off; \
2394 } \
2395 typedef int ignore_semicolon
2396#else
2397# error "Port me"
2398#endif
2399
2400IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(psrlw, 16, kArmv8InstrShiftSz_U16, 0x71);
2401IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(psrld, 32, kArmv8InstrShiftSz_U32, 0x72);
2402IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(psrlq, 64, kArmv8InstrShiftSz_U64, 0x73);
2403
2404
2405/**
2406 * Common emitter for the shift left with immediate instructions.
2407 */
2408#ifdef RT_ARCH_AMD64
2409# define IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(a_Instr, a_cShiftMax, a_ArmElemSz, a_bOpcX86) \
2410 DECL_INLINE_THROW(uint32_t) \
2411 RT_CONCAT3(iemNativeEmit_,a_Instr,_ri_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2412 uint8_t const idxSimdGstRegDst, uint8_t const bImm) \
2413 { \
2414 if (bImm) \
2415 { \
2416 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2417 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2418 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6); \
2419 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2420 if (idxSimdRegDst >= 8) \
2421 pCodeBuf[off++] = X86_OP_REX_B; \
2422 pCodeBuf[off++] = 0x0f; \
2423 pCodeBuf[off++] = (a_bOpcX86); \
2424 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 6, idxSimdRegDst & 7); \
2425 pCodeBuf[off++] = bImm; \
2426 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2427 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2428 } \
2429 /* Immediate 0 is a nop. */ \
2430 return off; \
2431 } \
2432 typedef int ignore_semicolon
2433#elif defined(RT_ARCH_ARM64)
2434# define IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(a_Instr, a_cShiftMax, a_ArmElemSz, a_bOpcX86) \
2435 DECL_INLINE_THROW(uint32_t) \
2436 RT_CONCAT3(iemNativeEmit_,a_Instr,_ri_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2437 uint8_t const idxSimdGstRegDst, uint8_t const bImm) \
2438 { \
2439 if (bImm) /* bImm == 0 is a nop */ \
2440 { \
2441 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2442 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2443 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2444 if (bImm < (a_cShiftMax)) \
2445 pCodeBuf[off++] = Armv8A64MkVecInstrShlImm(idxSimdRegDst, idxSimdRegDst, bImm, (a_ArmElemSz)); \
2446 else /* Everything >= a_cShiftMax sets the register to zero. */ \
2447 pCodeBuf[off++] = Armv8A64MkVecInstrEor(idxSimdRegDst, idxSimdRegDst, idxSimdRegDst); \
2448 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2449 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2450 } \
2451 return off; \
2452 } \
2453 typedef int ignore_semicolon
2454#else
2455# error "Port me"
2456#endif
2457
2458IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(psllw, 16, kArmv8InstrShiftSz_U16, 0x71);
2459IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(pslld, 32, kArmv8InstrShiftSz_U32, 0x72);
2460IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(psllq, 64, kArmv8InstrShiftSz_U64, 0x73);
2461
2462
2463/**
2464 * Common emitter for packed arithmetic instructions.
2465 */
2466#ifdef RT_ARCH_AMD64
2467# define IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(a_Instr, a_enmArmOp, a_ArmElemSz, a_bOpcX86) \
2468 DECL_INLINE_THROW(uint32_t) \
2469 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2470 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2471 { \
2472 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2473 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2474 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2475 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2476 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2477 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2478 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2479 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2480 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2481 pCodeBuf[off++] = 0x0f; \
2482 pCodeBuf[off++] = (a_bOpcX86); \
2483 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2484 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2485 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2486 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2487 return off; \
2488 } \
2489 DECL_INLINE_THROW(uint32_t) \
2490 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2491 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2492 { \
2493 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2494 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2495 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2496 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2497 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2498 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2499 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2500 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2501 pCodeBuf[off++] = 0x0f; \
2502 pCodeBuf[off++] = (a_bOpcX86); \
2503 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2504 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2505 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2506 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2507 return off; \
2508 } \
2509 typedef int ignore_semicolon
2510#elif defined(RT_ARCH_ARM64)
2511# define IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(a_Instr, a_enmArmOp, a_ArmElemSz, a_bOpcX86) \
2512 DECL_INLINE_THROW(uint32_t) \
2513 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2514 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2515 { \
2516 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2517 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2518 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2519 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2520 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2521 pCodeBuf[off++] = Armv8A64MkVecInstrArithOp((a_enmArmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc, (a_ArmElemSz)); \
2522 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2523 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2524 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2525 return off; \
2526 } \
2527 DECL_INLINE_THROW(uint32_t) \
2528 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2529 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2530 { \
2531 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2532 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2533 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2534 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2535 pCodeBuf[off++] = Armv8A64MkVecInstrArithOp((a_enmArmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc, (a_ArmElemSz)); \
2536 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2537 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2538 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2539 return off; \
2540 } \
2541 typedef int ignore_semicolon
2542#else
2543# error "Port me"
2544#endif
2545
2546/*
2547 * PADDx.
2548 */
2549IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddb, kArmv8VecInstrArithOp_Add, kArmv8VecInstrArithSz_8, 0xfc);
2550IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddw, kArmv8VecInstrArithOp_Add, kArmv8VecInstrArithSz_16, 0xfd);
2551IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddd, kArmv8VecInstrArithOp_Add, kArmv8VecInstrArithSz_32, 0xfe);
2552IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddq, kArmv8VecInstrArithOp_Add, kArmv8VecInstrArithSz_64, 0xd4);
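/*
 * For illustration, the AMD64 path of IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128 emits the
 * plain SSE2 encoding 66 [REX] 0F <opc> ModRM. With a hypothetical register assignment
 * of xmm1 as destination and xmm9 as source, paddw would come out as:
 *
 *      66 41 0f fd c9      ; 66 = operand size prefix, 41 = REX.B (source >= xmm8),
 *                          ; fd = paddw opcode, c9 = ModRM(mod=3, reg=1, r/m=9 & 7)
 *
 * The ARM64 path is a single vector ADD with the matching element size, e.g.
 * ADD Vdst.8H, Vdst.8H, Vsrc.8H for paddw.
 */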
2553
2554/*
2555 * PSUBx.
2556 */
2557IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(psubb, kArmv8VecInstrArithOp_Sub, kArmv8VecInstrArithSz_8, 0xf8);
2558IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(psubw, kArmv8VecInstrArithOp_Sub, kArmv8VecInstrArithSz_16, 0xf9);
2559IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(psubd, kArmv8VecInstrArithOp_Sub, kArmv8VecInstrArithSz_32, 0xfa);
2560IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(psubq, kArmv8VecInstrArithOp_Sub, kArmv8VecInstrArithSz_64, 0xfb);
2561
2562/*
2563 * PADDUSx.
2564 */
2565IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddusb, kArmv8VecInstrArithOp_UnsignSat_Add, kArmv8VecInstrArithSz_8, 0xdc);
2566IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddusw, kArmv8VecInstrArithOp_UnsignSat_Add, kArmv8VecInstrArithSz_16, 0xdd);
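/*
 * Saturation example for the unsigned saturating adds (UQADD on ARM64): for paddusb
 * a lane holding 0xf0 plus a lane holding 0x20 yields 0xff instead of wrapping to 0x10.
 */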
2567
2568/*
2569 * PMULLx.
2570 */
2571IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(pmullw, kArmv8VecInstrArithOp_Mul, kArmv8VecInstrArithSz_16, 0xd5);
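/*
 * Note on pmullw: only the low 16 bits of each 32-bit product are kept, which is also
 * what the ARM64 vector MUL produces, so signedness does not matter here.
 * Example: 0x00ff * 0x00ff = 0x0000fe01, stored as 0xfe01 in the lane.
 */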
2572
2573
2574/**
2575 * Common emitter for the pcmpeqb/pcmpeqw/pcmpeqd instructions.
2576 */
2577#ifdef RT_ARCH_AMD64
2578# define IEMNATIVE_NATIVE_EMIT_PCMP_U128(a_Instr, a_enmOp, a_ArmElemSz, a_bOpcX86) \
2579 DECL_INLINE_THROW(uint32_t) \
2580 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2581 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2582 { \
2583 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2584 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2585 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2586 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2587 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2588 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2589 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2590 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2591 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2592 pCodeBuf[off++] = 0x0f; \
2593 pCodeBuf[off++] = (a_bOpcX86); \
2594 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2595 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2596 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2597 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2598 return off; \
2599 } \
2600 DECL_INLINE_THROW(uint32_t) \
2601 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2602 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2603 { \
2604 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2605 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2606 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2607 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2608 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2609 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2610 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2611 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2612 pCodeBuf[off++] = 0x0f; \
2613 pCodeBuf[off++] = (a_bOpcX86); \
2614 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2615 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2616 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2617 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2618 return off; \
2619 } \
2620 typedef int ignore_semicolon
2621#elif defined(RT_ARCH_ARM64)
2622# define IEMNATIVE_NATIVE_EMIT_PCMP_U128(a_Instr, a_enmOp, a_ArmElemSz, a_bOpcX86) \
2623 DECL_INLINE_THROW(uint32_t) \
2624 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2625 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2626 { \
2627 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2628 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2629 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2630 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2631 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2632 pCodeBuf[off++] = Armv8A64MkVecInstrCmp((a_enmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc, (a_ArmElemSz)); \
2633 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2634 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2635 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2636 return off; \
2637 } \
2638 DECL_INLINE_THROW(uint32_t) \
2639 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2640 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2641 { \
2642 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2643 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2644 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2645 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2646 pCodeBuf[off++] = Armv8A64MkVecInstrCmp((a_enmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc, (a_ArmElemSz)); \
2647 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2648 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2649 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2650 return off; \
2651 } \
2652 typedef int ignore_semicolon
2653#else
2654# error "Port me"
2655#endif
2656
2657IEMNATIVE_NATIVE_EMIT_PCMP_U128(pcmpeqb, kArmv8VecInstrCmpOp_Eq, kArmv8VecInstrArithSz_8, 0x74);
2658IEMNATIVE_NATIVE_EMIT_PCMP_U128(pcmpeqw, kArmv8VecInstrCmpOp_Eq, kArmv8VecInstrArithSz_16, 0x75);
2659IEMNATIVE_NATIVE_EMIT_PCMP_U128(pcmpeqd, kArmv8VecInstrCmpOp_Eq, kArmv8VecInstrArithSz_32, 0x76);
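/*
 * Semantics reminder: the packed equality compares produce an all-ones / all-zeroes
 * mask per element, which is why they map 1:1 onto the ARM64 CMEQ encodings above.
 * Worked example for pcmpeqb with hypothetical lane values:
 *
 *      dst lanes:   01 02 03 04 ...      src lanes:   01 ff 03 00 ...
 *      dst after:   ff 00 ff 00 ...
 */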
2660
2661
2662/**
2663 * Emitter for pmovmskb.
2664 */
2665DECL_INLINE_THROW(uint32_t)
2666iemNativeEmit_pmovmskb_rr_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2667 uint8_t const idxGstRegDst, uint8_t const idxSimdGstRegSrc)
2668{
2669#ifdef RT_ARCH_AMD64
2670 uint8_t const idxRegDst = iemNativeRegAllocTmpForGuestReg(pReNative, &off, IEMNATIVEGSTREG_GPR(idxGstRegDst),
2671 kIemNativeGstRegUse_ForFullWrite);
2672 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off,
2673 IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc),
2674 kIemNativeGstSimdRegLdStSz_Low128,
2675 kIemNativeGstRegUse_ReadOnly);
2676 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
2677
2678 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
2679 if (idxRegDst >= 8 || idxSimdRegSrc >= 8)
2680 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
2681 | (idxRegDst >= 8 ? X86_OP_REX_R : 0);
2682 pCodeBuf[off++] = 0x0f;
2683 pCodeBuf[off++] = 0xd7;
2684 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegDst & 7, idxSimdRegSrc & 7);
2685
2686 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc);
2687 iemNativeRegFreeTmp(pReNative, idxRegDst);
2688
2689#elif defined(RT_ARCH_ARM64)
2690 uint8_t const idxRegDst = iemNativeRegAllocTmpForGuestReg(pReNative, &off, IEMNATIVEGSTREG_GPR(idxGstRegDst),
2691 kIemNativeGstRegUse_ForFullWrite);
2692 uint8_t const idxRegTmp = iemNativeRegAllocTmp(pReNative, &off);
2693 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off,
2694 IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc),
2695 kIemNativeGstSimdRegLdStSz_Low128,
2696 kIemNativeGstRegUse_Calculation);
2697 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 7);
2698
2699 /*
2700 * See https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
2701 * for different approaches; NEON doesn't have an instruction equivalent to pmovmskb, so we have to emulate it.
2702 *
2703 * As there is no way around emulating the exact semantics of pmovmskb, we use the same algorithm
2704 * as the sse2neon implementation because there we can get away without loading any constants and the
2705 * base algorithm is only 4 NEON instructions (+ 3 for extracting the result to a general register).
2706 *
2707 * The following illustrates the algorithm:
2708 *
2709 * Byte vector Element -> 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
2710 * Instruction
2711 * |
2712 * V
2713 * Axxxxxxx Bxxxxxxx Cxxxxxxx Dxxxxxxx Exxxxxxx Fxxxxxxx Gxxxxxxx Hxxxxxxx Ixxxxxxx Jxxxxxxx Kxxxxxxx Lxxxxxxx Mxxxxxxx Nxxxxxxx Oxxxxxxx Pxxxxxxx
2714 * USHR v.16B, v.16B, #7 0000000A 0000000B 0000000C 0000000D 0000000E 0000000F 0000000G 0000000H 0000000I 0000000J 0000000K 0000000L 0000000M 0000000N 0000000O 0000000P
2715 * USRA v.8H, v.8H, #7 00000000 000000AB 00000000 000000CD 00000000 000000EF 00000000 000000GH 00000000 000000IJ 00000000 000000KL 00000000 000000MN 00000000 000000OP
2716 * USRA v.4S, v.4S, #14 00000000 00000000 00000000 0000ABCD 00000000 00000000 00000000 0000EFGH 00000000 00000000 00000000 0000IJKL 00000000 00000000 00000000 0000MNOP
2717 * USRA v.2D, v.2D, #28 00000000 00000000 00000000 00000000 00000000 00000000 00000000 ABCDEFGH 00000000 00000000 00000000 00000000 00000000 00000000 00000000 IJKLMNOP
2718 *
2719 * The extraction process
2720 * UMOV wTMP, v.16B[8] 00000000 00000000 00000000 00000000 00000000 00000000 00000000 ABCDEFGH
2721 * UMOV wRES, v.16B[0] 00000000 00000000 00000000 00000000 00000000 00000000 00000000 IJKLMNOP
2722 * ORR xRES, xRES, xTMP, LSL #8 00000000 00000000 00000000 00000000 00000000 00000000 ABCDEFGH IJKLMNOP
2723 */
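 /*
  * For reference, a scalar C sketch of the value the emitted sequence produces
  * (illustrative only; EmulPMovMskB is a hypothetical helper, not part of IEM):
  *
  *      static uint16_t EmulPMovMskB(uint8_t const *pabSrc)
  *      {
  *          uint16_t fMask = 0;
  *          for (unsigned i = 0; i < 16; i++)
  *              fMask |= (uint16_t)(pabSrc[i] >> 7) << i;   // MSB of each byte -> bit i
  *          return fMask;
  *      }
  *
  * The USHR/USRA chain gathers the byte MSBs pairwise inside the vector register,
  * and the two UMOVs plus the shifted ORR assemble the final 16-bit mask in the GPR.
  */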
2724 pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegSrc, idxSimdRegSrc, 7, kArmv8InstrShiftSz_U8);
2725 pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegSrc, idxSimdRegSrc, 7, kArmv8InstrShiftSz_U16, true /*fUnsigned*/, false /*fRound*/, true /*fAccum*/);
2726 pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegSrc, idxSimdRegSrc, 14, kArmv8InstrShiftSz_U32, true /*fUnsigned*/, false /*fRound*/, true /*fAccum*/);
2727 pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegSrc, idxSimdRegSrc, 28, kArmv8InstrShiftSz_U64, true /*fUnsigned*/, false /*fRound*/, true /*fAccum*/);
2728 pCodeBuf[off++] = Armv8A64MkVecInstrUmov(idxRegTmp, idxSimdRegSrc, 8, kArmv8InstrUmovInsSz_U8, false /*fDst64Bit*/);
2729 pCodeBuf[off++] = Armv8A64MkVecInstrUmov(idxRegDst, idxSimdRegSrc, 0, kArmv8InstrUmovInsSz_U8, false /*fDst64Bit*/);
2730 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDst, idxRegDst, idxRegTmp, true /*f64Bit*/, 8 /*offShift6*/);
2731
2732 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc);
2733 iemNativeRegFreeTmp(pReNative, idxRegTmp);
2734 iemNativeRegFreeTmp(pReNative, idxRegDst);
2735
2736#else
2737# error "Port me"
2738#endif
2739 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2740 return off;
2741}
2742
2743
2744/**
2745 * Common emitter for the PACKUSWB instruction - guest register / guest register variant.
2746 */
2747DECL_INLINE_THROW(uint32_t)
2748iemNativeEmit_packuswb_rr_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2749 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc)
2750{
2751 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
2752 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate);
2753 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc),
2754 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly);
2755
2756#ifdef RT_ARCH_AMD64
2757 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
2758
2759 /* packuswb xmm, xmm */
2760 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
2761 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8)
2762 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
2763 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0);
2764 pCodeBuf[off++] = 0x0f;
2765 pCodeBuf[off++] = 0x67;
2766 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7);
2767
2768#elif defined(RT_ARCH_ARM64)
2769 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2);
2770
2771 pCodeBuf[off++] = Armv8A64MkVecInstrQxtn(kArmv8VecInstrQxtnOp_Sqxtun, false /*fUpper*/, idxSimdRegDst, idxSimdRegDst, kArmv8VecInstrArithSz_8);
2772 pCodeBuf[off++] = Armv8A64MkVecInstrQxtn(kArmv8VecInstrQxtnOp_Sqxtun, true /*fUpper*/, idxSimdRegDst, idxSimdRegSrc, kArmv8VecInstrArithSz_8);
2773
2774#else
2775# error "port me"
2776#endif
2777
2778 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
2779 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc);
2780
2781 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2782 return off;
2783}
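/*
 * PACKUSWB saturation reminder: each signed 16-bit word is narrowed to an unsigned
 * byte with unsigned saturation, which is exactly what the SQXTUN/SQXTUN2 pair above
 * does on ARM64. Sample conversions:
 *
 *      0x007f (127)  -> 0x7f       0x0123 (291)  -> 0xff
 *      0xff80 (-128) -> 0x00       0x8000        -> 0x00
 */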
2784
2785
2786/**
2787 * Common emitter for the PACKUSWB instructions - guest register / recompiler variable variant.
2788 */
2789DECL_INLINE_THROW(uint32_t)
2790iemNativeEmit_packuswb_rv_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2791 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc)
2792{
2793 IEMNATIVE_ASSERT_VAR_IDX(pReNative, idxVarSrc);
2794 IEMNATIVE_ASSERT_VAR_SIZE(pReNative, idxVarSrc, sizeof(RTUINT128U));
2795
2796 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
2797 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate);
2798 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
2799
2800
2801#ifdef RT_ARCH_AMD64
2802 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
2803
2804 /* packuswb xmm, xmm */
2805 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
2806 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8)
2807 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
2808 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0);
2809 pCodeBuf[off++] = 0x0f;
2810 pCodeBuf[off++] = 0x67;
2811 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7);
2812
2813#elif defined(RT_ARCH_ARM64)
2814 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2);
2815
2816 pCodeBuf[off++] = Armv8A64MkVecInstrQxtn(kArmv8VecInstrQxtnOp_Sqxtun, false /*fUpper*/, idxSimdRegDst, idxSimdRegDst, kArmv8VecInstrArithSz_8);
2817 pCodeBuf[off++] = Armv8A64MkVecInstrQxtn(kArmv8VecInstrQxtnOp_Sqxtun, true /*fUpper*/, idxSimdRegDst, idxSimdRegSrc, kArmv8VecInstrArithSz_8);
2818
2819#else
2820# error "port me"
2821#endif
2822
2823 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
2824 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
2825
2826 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2827 return off;
2828}
2829
2830
2831/**
2832 * Common emitter for the pmov{s,z}x* instructions.
2833 */
2834#ifdef RT_ARCH_AMD64
2835# define IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(a_Instr, a_fArmUnsigned, a_ArmElemSz, a_bOpcX86) \
2836 DECL_INLINE_THROW(uint32_t) \
2837 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2838 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2839 { \
2840 if (idxSimdGstRegDst == idxSimdGstRegSrc) \
2841 { \
2842 uint8_t const idxSimdReg = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2843 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2844 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6); \
2845 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2846 if (idxSimdReg >= 8) \
2847 pCodeBuf[off++] = (idxSimdReg >= 8 ? X86_OP_REX_B | X86_OP_REX_R : 0); \
2848 pCodeBuf[off++] = 0x0f; \
2849 pCodeBuf[off++] = 0x38; \
2850 pCodeBuf[off++] = (a_bOpcX86); \
2851 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdReg & 7, idxSimdReg & 7); \
2852 iemNativeSimdRegFreeTmp(pReNative, idxSimdReg); \
2853 } \
2854 else \
2855 { \
2856 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2857 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2858 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2859 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite); \
2860 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6); \
2861 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2862 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2863 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2864 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2865 pCodeBuf[off++] = 0x0f; \
2866 pCodeBuf[off++] = 0x38; \
2867 pCodeBuf[off++] = (a_bOpcX86); \
2868 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2869 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2870 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2871 } \
2872 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2873 return off; \
2874 } \
2875 DECL_INLINE_THROW(uint32_t) \
2876 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2877 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2878 { \
2879 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2880 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite); \
2881 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2882 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 7 + 6); \
2883 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; /* Transfer value from GPR to temporary vector register using pinsrq. */ \
2884 pCodeBuf[off++] = X86_OP_REX_W \
2885 | (IEMNATIVE_SIMD_REG_FIXED_TMP0 < 8 ? 0 : X86_OP_REX_R) \
2886 | (idxRegSrc < 8 ? 0 : X86_OP_REX_B); \
2887 pCodeBuf[off++] = 0x0f; \
2888 pCodeBuf[off++] = 0x3a; \
2889 pCodeBuf[off++] = 0x22; \
2890 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, IEMNATIVE_SIMD_REG_FIXED_TMP0 & 7, idxRegSrc & 7); \
2891 pCodeBuf[off++] = 0; /* QWord */\
2892 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2893 if (idxSimdRegDst >= 8 || IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8) \
2894 pCodeBuf[off++] = (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 ? X86_OP_REX_B : 0) \
2895 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2896 pCodeBuf[off++] = 0x0f; \
2897 pCodeBuf[off++] = 0x38; \
2898 pCodeBuf[off++] = (a_bOpcX86); \
2899 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, IEMNATIVE_SIMD_REG_FIXED_TMP0 & 7); \
2900 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2901 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2902 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2903 return off; \
2904 } \
2905 typedef int ignore_semicolon
2906#elif defined(RT_ARCH_ARM64)
2907# define IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(a_Instr, a_fArmUnsigned, a_ArmElemSz, a_bOpcX86) \
2908 DECL_INLINE_THROW(uint32_t) \
2909 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2910 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2911 { \
2912 if (idxSimdGstRegDst == idxSimdGstRegSrc) \
2913 { \
2914 uint8_t const idxSimdReg = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2915 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2916 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2917 pCodeBuf[off++] = Armv8A64MkVecInstrUShll(idxSimdReg, idxSimdReg, 0, (a_ArmElemSz), (a_fArmUnsigned)); \
2918 iemNativeSimdRegFreeTmp(pReNative, idxSimdReg); \
2919 } \
2920 else \
2921 { \
2922 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2923 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2924 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2925 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite); \
2926 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2927 pCodeBuf[off++] = Armv8A64MkVecInstrUShll(idxSimdRegDst, idxSimdRegSrc, 0, (a_ArmElemSz), (a_fArmUnsigned)); \
2928 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2929 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2930 } \
2931 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2932 return off; \
2933 } \
2934 DECL_INLINE_THROW(uint32_t) \
2935 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2936 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2937 { \
2938 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2939 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite); \
2940 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2941 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2); \
2942 pCodeBuf[off++] = Armv8A64MkVecInstrIns(IEMNATIVE_SIMD_REG_FIXED_TMP0, idxRegSrc, 0 /*idxElem*/); /* Transfer value from GPR to temporary vector register. */ \
2943 pCodeBuf[off++] = Armv8A64MkVecInstrUShll(idxSimdRegDst, IEMNATIVE_SIMD_REG_FIXED_TMP0, 0, (a_ArmElemSz), (a_fArmUnsigned)); \
2944 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2945 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2946 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2947 return off; \
2948 } \
2949 typedef int ignore_semicolon
2950#else
2951# error "Port me"
2952#endif
2953
2954IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovzxbw, true, kArmv8InstrShiftSz_U8, 0x30);
2955IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovzxwd, true, kArmv8InstrShiftSz_U16, 0x33);
2956IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovzxdq, true, kArmv8InstrShiftSz_U32, 0x35);
2957
2958IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovsxbw, false, kArmv8InstrShiftSz_U8, 0x20);
2959IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovsxwd, false, kArmv8InstrShiftSz_U16, 0x23);
2960IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovsxdq, false, kArmv8InstrShiftSz_U32, 0x25);
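/*
 * Widening example with hypothetical source bytes (only the low half of the source
 * is consumed, matching USHLL/SSHLL #0 on ARM64):
 *
 *      source bytes:   01 82 7f 00 ...
 *      pmovzxbw:       0001 0082 007f 0000 ...   (zero extension)
 *      pmovsxbw:       0001 ff82 007f 0000 ...   (sign extension)
 */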
2961
2962
2963/**
2964 * Updates the MXCSR exception flags, raising any unmasked exceptions.
2965 */
2966DECL_INLINE_THROW(uint32_t)
2967iemNativeEmitMxcsrUpdate(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, uint8_t const idxSimdGstRegDst, uint8_t const idxSimdRegRes)
2968{
2969 uint8_t const idxRegMxCsr = iemNativeRegAllocTmpForGuestReg(pReNative, &off, kIemNativeGstReg_MxCsr, kIemNativeGstRegUse_ForUpdate);
2970 uint8_t const idxRegMxCsrXcptFlags = iemNativeRegAllocTmp(pReNative, &off);
2971 uint8_t const idxRegTmp = iemNativeRegAllocTmp(pReNative, &off);
2972
2973#ifdef RT_ARCH_AMD64
2974 PIEMNATIVEINSTR pbCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
2975
2976 /* stmxcsr */
2977 if (IEMNATIVE_REG_FIXED_PVMCPU >= 8)
2978 pbCodeBuf[off++] = X86_OP_REX_B;
2979 pbCodeBuf[off++] = 0x0f;
2980 pbCodeBuf[off++] = 0xae;
2981 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_MEM4, 3, IEMNATIVE_REG_FIXED_PVMCPU & 7);
2982 pbCodeBuf[off++] = RT_BYTE1(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2983 pbCodeBuf[off++] = RT_BYTE2(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2984 pbCodeBuf[off++] = RT_BYTE3(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2985 pbCodeBuf[off++] = RT_BYTE4(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2986
2987 /* Load MXCSR, mask everything except status flags and or into guest MXCSR. */
2988 off = iemNativeEmitLoadGprFromVCpuU32(pReNative, off, idxRegTmp, RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2989
2990 /* Store the flags in the MXCSR xcpt flags register. */
2991 off = iemNativeEmitLoadGprFromGpr32(pReNative, off, idxRegMxCsrXcptFlags, idxRegTmp);
2992 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegMxCsrXcptFlags, X86_MXCSR_XCPT_FLAGS);
2993
2994 /* Clear the status flags in the temporary copy and write it back to MXCSR. */
2995 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegTmp, ~X86_MXCSR_XCPT_FLAGS);
2996 off = iemNativeEmitStoreGprToVCpuU32(pReNative, off, idxRegTmp, RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2997
2998 pbCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
2999
3000 /* ldmxcsr */
3001 if (IEMNATIVE_REG_FIXED_PVMCPU >= 8)
3002 pbCodeBuf[off++] = X86_OP_REX_B;
3003 pbCodeBuf[off++] = 0x0f;
3004 pbCodeBuf[off++] = 0xae;
3005 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_MEM4, 2, IEMNATIVE_REG_FIXED_PVMCPU & 7);
3006 pbCodeBuf[off++] = RT_BYTE1(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
3007 pbCodeBuf[off++] = RT_BYTE2(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
3008 pbCodeBuf[off++] = RT_BYTE3(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
3009 pbCodeBuf[off++] = RT_BYTE4(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
3010
3011#elif defined(RT_ARCH_ARM64)
3012 PIEMNATIVEINSTR pu32CodeBuf = iemNativeInstrBufEnsure(pReNative, off, 7);
3013 pu32CodeBuf[off++] = Armv8A64MkInstrMrs(idxRegMxCsrXcptFlags, ARMV8_AARCH64_SYSREG_FPSR);
3014 pu32CodeBuf[off++] = Armv8A64MkInstrMsr(ARMV8_A64_REG_XZR, ARMV8_AARCH64_SYSREG_FPSR); /* Clear FPSR for next instruction. */
3015 pu32CodeBuf[off++] = Armv8A64MkInstrUxtb(idxRegMxCsrXcptFlags, idxRegMxCsrXcptFlags); /* Ensure there are only the exception flags set (clears QC, and any possible NZCV flags). */
3016
3017 /*
3018 * The exception flags layout differs between MXCSR and FPSR of course:
3019 *
3020 * Bit FPSR MXCSR
3021 *
3022 * 0 IOC ------> IE (bit 0)
3023 * 1 DZC ------> ZE (bit 2)
3024 * 2 OFC ------> OE (bit 3)
3025 * 3 UFC ------> UE (bit 4)
3026 * 4 IXC ------> PE (bit 5)
3027 * 5 (reserved)
3028 * 6 (reserved)
3029 * 7 IDC ------> DE (bit 1)
3030 *
3031 * I.e. IOC keeps its bit position, DZC thru IXC each move up one bit, and
3032 * IDC lands in bit 1 (DE).
3033 *
3034 */
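 /*
  * The intended mapping expressed as plain C (sketch; assumes the FPSR value has
  * already been truncated to its low byte by the UXTB above, and fFpsr/fMxcsrFlags
  * are illustrative names):
  *
  *      fMxcsrFlags = (fFpsr & 1)           // IOC -> IE (bit 0 stays put)
  *                  | ((fFpsr & 0x1e) << 1) // DZC/OFC/UFC/IXC -> ZE/OE/UE/PE (bits 2..5)
  *                  | ((fFpsr >> 7) << 1);  // IDC -> DE (bit 1)
  */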
3035 pu32CodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegMxCsrXcptFlags, 1); /* Shift the block of flags starting at DZC to the least significant bits. */
3036 pu32CodeBuf[off++] = Armv8A64MkInstrBfi(idxRegMxCsrXcptFlags, idxRegTmp, 2, 4); /* Insert DZC, OFC, UFC and IXC into the MXCSR positions. */
3037 pu32CodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegMxCsrXcptFlags, 6); /* Shift IDC (now at 6) into the LSB. */
3038 pu32CodeBuf[off++] = Armv8A64MkInstrBfi(idxRegMxCsrXcptFlags, idxRegTmp, 1, 1); /* Insert IDC into the MXCSR positions. */
3039#else
3040# error "Port me"
3041#endif
3042
3043 /*
3044 * If PE is set together with OE/UE and neither of the latter is masked,
3045 * PE needs to be cleared: on real hardware the exception is raised with
3046 * only OE/UE set, but since we run the instruction with all exceptions
3047 * masked, PE gets set as well.
3048 */
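 /*
  * Roughly the equivalent C of the register juggling below (sketch; fMxcsrFlags and
  * uGstMxCsr are illustrative names for the two allocated registers):
  *
  *      if (   (fMxcsrFlags & (X86_MXCSR_OE | X86_MXCSR_UE))
  *          & ~(uGstMxCsr >> X86_MXCSR_XCPT_MASK_SHIFT))
  *          fMxcsrFlags &= ~X86_MXCSR_PE;
  */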
3049 /** @todo On ARM we can combine the load+and into one and instruction. */
3050 /** @todo r=aeichner Can this be done more optimally? */
3051 uint8_t const idxRegTmp2 = iemNativeRegAllocTmp(pReNative, &off);
3052 off = iemNativeEmitLoadGprFromGpr32(pReNative, off, idxRegTmp, idxRegMxCsrXcptFlags);
3053 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegTmp, X86_MXCSR_OE | X86_MXCSR_UE);
3054 off = iemNativeEmitLoadGprFromGpr32(pReNative, off, idxRegTmp2, idxRegMxCsr);
3055 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegTmp2, X86_MXCSR_OM | X86_MXCSR_UM);
3056 off = iemNativeEmitShiftGprRight(pReNative, off, idxRegTmp2, X86_MXCSR_XCPT_MASK_SHIFT);
3057 off = iemNativeEmitInvBitsGpr(pReNative, off, idxRegTmp2, idxRegTmp2, false /*f64Bit*/);
3058 off = iemNativeEmitAndGpr32ByGpr32(pReNative, off, idxRegTmp2, idxRegTmp);
3059 off = iemNativeEmitTestAnyBitsInGpr(pReNative, off, idxRegTmp2, X86_MXCSR_OE | X86_MXCSR_UE);
3060
3061 uint32_t offFixup = off;
3062 off = iemNativeEmitJzToFixed(pReNative, off, off);
3063 off = iemNativeEmitBitClearInGpr32(pReNative, off, idxRegMxCsrXcptFlags, X86_MXCSR_PE_BIT);
3064 iemNativeFixupFixedJump(pReNative, offFixup, off);
3065 iemNativeRegFreeTmp(pReNative, idxRegTmp2);
3066
3067
3068 /* Set the MXCSR flags now. */
3069 off = iemNativeEmitOrGpr32ByGpr(pReNative, off, idxRegMxCsr, idxRegMxCsrXcptFlags);
3070
3071 /*
3072 * Make sure we don't have any outstanding guest register writes as we may
3073 * raise an \#UD or \#XF and all guest registers must be up to date in CPUMCTX.
3074 */
3075 off = iemNativeRegFlushPendingWrites(pReNative, off);
3076
3077#ifdef IEMNATIVE_WITH_INSTRUCTION_COUNTING
3078 off = iemNativeEmitStoreImmToVCpuU8(pReNative, off, idxInstr, RT_UOFFSETOF(VMCPUCC, iem.s.idxTbCurInstr));
3079#else
3080 RT_NOREF(idxInstr);
3081#endif
3082
3083 /* Check whether an exception is pending and only update the guest SIMD register if it isn't. */
3084 /* mov tmp, varmxcsr */
3085 off = iemNativeEmitLoadGprFromGpr32(pReNative, off, idxRegTmp, idxRegMxCsr);
3086 /* tmp >>= X86_MXCSR_XCPT_MASK_SHIFT */
3087 off = iemNativeEmitShiftGprRight(pReNative, off, idxRegTmp, X86_MXCSR_XCPT_MASK_SHIFT);
3088 /* tmp = ~tmp */
3089 off = iemNativeEmitInvBitsGpr(pReNative, off, idxRegTmp, idxRegTmp, false /*f64Bit*/);
3090 /* tmp &= mxcsr */
3091 off = iemNativeEmitAndGpr32ByGpr32(pReNative, off, idxRegMxCsrXcptFlags, idxRegTmp);
3092 off = iemNativeEmitTbExitIfAnyBitsSetInGpr<kIemNativeLabelType_RaiseSseAvxFpRelated>(pReNative, off, idxRegMxCsrXcptFlags,
3093 X86_MXCSR_XCPT_FLAGS);
3094
3095 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
3096 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite);
3097
3098 /* Move result to guest SIMD register (at this point there is no exception being raised). */
3099 off = iemNativeEmitSimdLoadVecRegFromVecRegU128(pReNative, off, idxSimdRegDst, idxSimdRegRes);
3100
3101 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
3102 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
3103 iemNativeRegFreeTmp(pReNative, idxRegTmp);
3104 iemNativeRegFreeTmp(pReNative, idxRegMxCsrXcptFlags);
3105 iemNativeRegFreeTmp(pReNative, idxRegMxCsr);
3106 return off;
3107}
3108
3109
3110/**
3111 * Common emitter for packed floating point instructions with 3 operands - register, register variant.
3112 */
3113DECL_INLINE_THROW(uint32_t) iemNativeEmitSimdFp3OpCommon_rr_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr,
3114 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc,
3115#ifdef RT_ARCH_AMD64
3116 uint8_t const bPrefixX86, uint8_t const bOpcX86
3117#elif defined(RT_ARCH_ARM64)
3118 ARMV8INSTRVECFPOP const enmFpOp, ARMV8INSTRVECFPSZ const enmFpSz
3119#endif
3120 )
3121{
3122 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
3123 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly);
3124 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc),
3125 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly);
3126
3127#ifdef RT_ARCH_AMD64
3128 off = iemNativeEmitSimdLoadVecRegFromVecRegU128(pReNative, off, IEMNATIVE_SIMD_REG_FIXED_TMP0, idxSimdRegDst);
3129 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
3130 if (bPrefixX86 != 0)
3131 pCodeBuf[off++] = bPrefixX86;
3132 if (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 || idxSimdRegSrc >= 8)
3133 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
3134 | (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 ? X86_OP_REX_R : 0);
3135 pCodeBuf[off++] = 0x0f;
3136 pCodeBuf[off++] = bOpcX86;
3137 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, IEMNATIVE_SIMD_REG_FIXED_TMP0 & 7, idxSimdRegSrc & 7);
3138#elif defined(RT_ARCH_ARM64)
3139 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
3140 pCodeBuf[off++] = Armv8A64MkVecInstrFp3Op(enmFpOp, enmFpSz, IEMNATIVE_SIMD_REG_FIXED_TMP0, idxSimdRegDst, idxSimdRegSrc);
3141#else
3142# error "Port me"
3143#endif
3144 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
3145 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc);
3146 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
3147 return iemNativeEmitMxcsrUpdate(pReNative, off, idxInstr, idxSimdGstRegDst, IEMNATIVE_SIMD_REG_FIXED_TMP0);
3148}
3149
3150
3151/**
3152 * Common emitter for packed floating point instructions with 3 operands - register, local variable variant.
3153 */
3154DECL_INLINE_THROW(uint32_t) iemNativeEmitSimdFp3OpCommon_rv_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr,
3155 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc,
3156#ifdef RT_ARCH_AMD64
3157 uint8_t const bPrefixX86, uint8_t const bOpcX86
3158#elif defined(RT_ARCH_ARM64)
3159 ARMV8INSTRVECFPOP const enmFpOp, ARMV8INSTRVECFPSZ const enmFpSz
3160#endif
3161 )
3162{
3163 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
3164 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly);
3165 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
3166
3167#ifdef RT_ARCH_AMD64
3168 off = iemNativeEmitSimdLoadVecRegFromVecRegU128(pReNative, off, IEMNATIVE_SIMD_REG_FIXED_TMP0, idxSimdRegDst);
3169 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
3170 if (bPrefixX86 != 0)
3171 pCodeBuf[off++] = bPrefixX86;
3172 if (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 || idxSimdRegSrc >= 8)
3173 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
3174 | (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 ? X86_OP_REX_R : 0);
3175 pCodeBuf[off++] = 0x0f;
3176 pCodeBuf[off++] = bOpcX86;
3177 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, IEMNATIVE_SIMD_REG_FIXED_TMP0 & 7, idxSimdRegSrc & 7);
3178#elif defined(RT_ARCH_ARM64)
3179 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
3180 pCodeBuf[off++] = Armv8A64MkVecInstrFp3Op(enmFpOp, enmFpSz, IEMNATIVE_SIMD_REG_FIXED_TMP0, idxSimdRegDst, idxSimdRegSrc);
3181#else
3182# error "Port me"
3183#endif
3184 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
3185 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
3186 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
3187 return iemNativeEmitMxcsrUpdate(pReNative, off, idxInstr, idxSimdGstRegDst, IEMNATIVE_SIMD_REG_FIXED_TMP0);
3188}
3189
3190
3191/**
3192 * Common emitter for packed floating point instructions with 3 operands.
3193 */
3194#ifdef RT_ARCH_AMD64
3195# define IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(a_Instr, a_enmArmOp, a_ArmElemSz, a_bPrefixX86, a_bOpcX86) \
3196 DECL_FORCE_INLINE_THROW(uint32_t) \
3197 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, \
3198 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
3199 { \
3200 return iemNativeEmitSimdFp3OpCommon_rr_u128(pReNative, off, idxInstr, idxSimdGstRegDst, idxSimdGstRegSrc, \
3201 a_bPrefixX86, a_bOpcX86); \
3202 } \
3203 DECL_FORCE_INLINE_THROW(uint32_t) \
3204 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, \
3205 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
3206 { \
3207 return iemNativeEmitSimdFp3OpCommon_rv_u128(pReNative, off, idxInstr, idxSimdGstRegDst, idxVarSrc, \
3208 a_bPrefixX86, a_bOpcX86); \
3209 } \
3210 typedef int ignore_semicolon
3211#elif defined(RT_ARCH_ARM64)
3212# define IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(a_Instr, a_enmArmOp, a_ArmElemSz, a_bPrefixX86, a_bOpcX86) \
3213 DECL_FORCE_INLINE_THROW(uint32_t) \
3214 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, \
3215 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
3216 { \
3217 return iemNativeEmitSimdFp3OpCommon_rr_u128(pReNative, off, idxInstr, idxSimdGstRegDst, idxSimdGstRegSrc, \
3218 a_enmArmOp, a_ArmElemSz); \
3219 } \
3220 DECL_FORCE_INLINE_THROW(uint32_t) \
3221 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, \
3222 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
3223 { \
3224 return iemNativeEmitSimdFp3OpCommon_rv_u128(pReNative, off, idxInstr, idxSimdGstRegDst, idxVarSrc, \
3225 a_enmArmOp, a_ArmElemSz); \
3226 } \
3227 typedef int ignore_semicolon
3228#else
3229# error "Port me"
3230#endif
3231
3232
3233IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(mulps, kArmv8VecInstrFpOp_Mul, kArmv8VecInstrFpSz_4x_Single, 0, 0x59);
3234IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(addps, kArmv8VecInstrFpOp_Add, kArmv8VecInstrFpSz_4x_Single, 0, 0x58);
3235IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(addpd, kArmv8VecInstrFpOp_Add, kArmv8VecInstrFpSz_2x_Double, X86_OP_PRF_SIZE_OP, 0x58);
3236IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(subps, kArmv8VecInstrFpOp_Sub, kArmv8VecInstrFpSz_4x_Single, 0, 0x5c);
3237
3238#endif /* IEMNATIVE_WITH_SIMD_REG_ALLOCATOR */
3239
3240#endif /* !VMM_INCLUDED_SRC_VMMAll_target_x86_IEMAllN8veEmit_x86_h */