VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/target-x86/IEMAllN8veEmit-x86.h@ 106201

Last change on this file since 106201 was 106201, checked in by vboxsync, 2 months ago

VMM/IEM: A couple of debug build fixes for arm. bugref:10720

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 160.0 KB
1/* $Id: IEMAllN8veEmit-x86.h 106201 2024-10-01 23:48:36Z vboxsync $ */
2/** @file
3 * IEM - Native Recompiler, x86 Target - Code Emitters.
4 */
5
6/*
7 * Copyright (C) 2023-2024 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28#ifndef VMM_INCLUDED_SRC_VMMAll_target_x86_IEMAllN8veEmit_x86_h
29#define VMM_INCLUDED_SRC_VMMAll_target_x86_IEMAllN8veEmit_x86_h
30#ifndef RT_WITHOUT_PRAGMA_ONCE
31# pragma once
32#endif
33
34
35#ifdef RT_ARCH_AMD64
36
37/**
38 * Emits a ModR/M instruction with one opcode byte and only register operands.
39 */
40DECL_FORCE_INLINE(uint32_t)
41iemNativeEmitAmd64OneByteModRmInstrRREx(PIEMNATIVEINSTR pCodeBuf, uint32_t off, uint8_t bOpcode8, uint8_t bOpcodeOther,
42 uint8_t cOpBits, uint8_t idxRegReg, uint8_t idxRegRm)
43{
44 Assert(idxRegReg < 16); Assert(idxRegRm < 16);
45 switch (cOpBits)
46 {
47 case 16:
48 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
49 RT_FALL_THRU();
50 case 32:
51 if (idxRegReg >= 8 || idxRegRm >= 8)
52 pCodeBuf[off++] = (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
53 pCodeBuf[off++] = bOpcodeOther;
54 break;
55
56 default: AssertFailed(); RT_FALL_THRU();
57 case 64:
58 pCodeBuf[off++] = X86_OP_REX_W | (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
59 pCodeBuf[off++] = bOpcodeOther;
60 break;
61
62 case 8:
63 if (idxRegReg >= 8 || idxRegRm >= 8)
64 pCodeBuf[off++] = (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
65 else if (idxRegReg >= 4 || idxRegRm >= 4)
66 pCodeBuf[off++] = X86_OP_REX;
67 pCodeBuf[off++] = bOpcode8;
68 break;
69 }
70 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg & 7, idxRegRm & 7);
71 return off;
72}
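/* Usage sketch (illustrative, mirroring the callers further down): emitting a 32-bit
   register-to-register AND would pass the 0x22 (Eb,Gb) / 0x23 (Ev,Gv) opcode pair:

       off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0x22, 0x23,
                                                     32, idxRegDst, idxRegSrc);

   With idxRegDst=2 (EDX) and idxRegSrc=8 (R8D) this should produce 41 23 d0, i.e.
   a REX.B prefix, the opcode, and a mod=11 ModR/M byte ("and edx, r8d"). */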
73
74
75/**
76 * Emits a ModR/M instruction with two opcode bytes and only register operands.
77 */
78DECL_FORCE_INLINE(uint32_t)
79iemNativeEmitAmd64TwoByteModRmInstrRREx(PIEMNATIVEINSTR pCodeBuf, uint32_t off,
80 uint8_t bOpcode0, uint8_t bOpcode8, uint8_t bOpcodeOther,
81 uint8_t cOpBits, uint8_t idxRegReg, uint8_t idxRegRm)
82{
83 Assert(idxRegReg < 16); Assert(idxRegRm < 16);
84 switch (cOpBits)
85 {
86 case 16:
87 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
88 RT_FALL_THRU();
89 case 32:
90 if (idxRegReg >= 8 || idxRegRm >= 8)
91 pCodeBuf[off++] = (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
92 pCodeBuf[off++] = bOpcode0;
93 pCodeBuf[off++] = bOpcodeOther;
94 break;
95
96 default: AssertFailed(); RT_FALL_THRU();
97 case 64:
98 pCodeBuf[off++] = X86_OP_REX_W | (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
99 pCodeBuf[off++] = bOpcode0;
100 pCodeBuf[off++] = bOpcodeOther;
101 break;
102
103 case 8:
104 if (idxRegReg >= 8 || idxRegRm >= 8)
105 pCodeBuf[off++] = (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
106 else if (idxRegReg >= 4 || idxRegRm >= 4)
107 pCodeBuf[off++] = X86_OP_REX;
108 pCodeBuf[off++] = bOpcode0;
109 pCodeBuf[off++] = bOpcode8;
110 break;
111 }
112 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg & 7, idxRegRm & 7);
113 return off;
114}
115
116
117/**
118 * Emits one of three opcodes with an immediate.
119 *
120 * These are expected to be of the /idxRegReg form.
121 */
122DECL_FORCE_INLINE(uint32_t)
123iemNativeEmitAmd64OneByteModRmInstrRIEx(PIEMNATIVEINSTR pCodeBuf, uint32_t off, uint8_t bOpcode8, uint8_t bOpcodeOtherImm8,
124 uint8_t bOpcodeOther, uint8_t cOpBits, uint8_t cImmBits, uint8_t idxRegReg,
125 uint8_t idxRegRm, uint64_t uImmOp)
126{
127 Assert(idxRegReg < 8); Assert(idxRegRm < 16);
128 if ( cImmBits == 8
129 || (uImmOp <= (uint64_t)0x7f && bOpcodeOtherImm8 != 0xcc))
130 {
131 switch (cOpBits)
132 {
133 case 16:
134 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
135 RT_FALL_THRU();
136 case 32:
137 if (idxRegRm >= 8)
138 pCodeBuf[off++] = X86_OP_REX_B;
139 pCodeBuf[off++] = bOpcodeOtherImm8; Assert(bOpcodeOtherImm8 != 0xcc);
140 break;
141
142 default: AssertFailed(); RT_FALL_THRU();
143 case 64:
144 pCodeBuf[off++] = X86_OP_REX_W | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
145 pCodeBuf[off++] = bOpcodeOtherImm8; Assert(bOpcodeOtherImm8 != 0xcc);
146 break;
147
148 case 8:
149 if (idxRegRm >= 8)
150 pCodeBuf[off++] = X86_OP_REX_B;
151 else if (idxRegRm >= 4)
152 pCodeBuf[off++] = X86_OP_REX;
153 pCodeBuf[off++] = bOpcode8; Assert(bOpcode8 != 0xcc);
154 break;
155 }
156 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg, idxRegRm & 7);
157 pCodeBuf[off++] = (uint8_t)uImmOp;
158 }
159 else
160 {
161 switch (cOpBits)
162 {
163 case 32:
164 if (idxRegRm >= 8)
165 pCodeBuf[off++] = X86_OP_REX_B;
166 break;
167
168 default: AssertFailed(); RT_FALL_THRU();
169 case 64:
170 pCodeBuf[off++] = X86_OP_REX_W | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
171 break;
172
173 case 16:
174 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
175 if (idxRegRm >= 8)
176 pCodeBuf[off++] = X86_OP_REX_B;
177 pCodeBuf[off++] = bOpcodeOther;
178 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg, idxRegRm & 7);
179 pCodeBuf[off++] = RT_BYTE1(uImmOp);
180 pCodeBuf[off++] = RT_BYTE2(uImmOp);
181 Assert(cImmBits == 16);
182 return off;
183 }
184 pCodeBuf[off++] = bOpcodeOther;
185 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg, idxRegRm & 7);
186 pCodeBuf[off++] = RT_BYTE1(uImmOp);
187 pCodeBuf[off++] = RT_BYTE2(uImmOp);
188 pCodeBuf[off++] = RT_BYTE3(uImmOp);
189 pCodeBuf[off++] = RT_BYTE4(uImmOp);
190 Assert(cImmBits == 32);
191 }
192 return off;
193}
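/* Usage sketch (illustrative): the group-1 immediate forms are selected via the
   ModR/M reg field, e.g. the AND-with-immediate emitter below passes /4:

       off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81,
                                                     a_cOpBits, a_cImmBits, 4 /*AND*/,
                                                     idxRegDst, uImmOp);

   Immediates no larger than 0x7f (or explicit 8-bit immediates) take the sign-extending
   0x83 form, larger ones the 0x81 imm16/imm32 form; passing 0xcc for bOpcodeOtherImm8
   (as the TEST emitter does) disables the short form, TEST having no such variant. */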
194
195#endif /* RT_ARCH_AMD64 */
196
197
198
199/*********************************************************************************************************************************
200* EFLAGS *
201*********************************************************************************************************************************/
202
203#ifdef IEMNATIVE_WITH_EFLAGS_POSTPONING
204
205/** @def IEMNATIVE_POSTPONING_REG_MASK
206 * Register suitable for keeping the inputs or result for a postponed EFLAGS
207 * calculation.
208 *
209 * We use non-volatile registers here so we don't have to save & restore them
210 * across callouts (i.e. TLB loads).
211 *
212 * @note On x86 we cannot use RDI and RSI because these are used by the
213 * opcode checking code. The usual joy of the x86 instruction set.
214 */
215# ifdef RT_ARCH_AMD64
216# define IEMNATIVE_POSTPONING_REG_MASK \
217 (IEMNATIVE_CALL_NONVOLATILE_GREG_MASK & ~(RT_BIT_32(X86_GREG_xDI) | RT_BIT_32(X86_GREG_xSI)))
218# else
219# define IEMNATIVE_POSTPONING_REG_MASK IEMNATIVE_CALL_NONVOLATILE_GREG_MASK
220# endif
221
222/**
223 * This is normally invoked via IEMNATIVE_CLEAR_POSTPONED_EFLAGS().
224 */
225template<uint32_t const a_fEflClobbered>
226DECL_FORCE_INLINE(void) iemNativeClearPostponedEFlags(PIEMRECOMPILERSTATE pReNative)
227{
228 AssertCompile(!(a_fEflClobbered & ~X86_EFL_STATUS_BITS));
229 uint32_t fEFlags = pReNative->PostponedEfl.fEFlags;
230 if (fEFlags)
231 {
232 if RT_CONSTEXPR_IF(a_fEflClobbered != X86_EFL_STATUS_BITS)
233 {
234 fEFlags &= ~a_fEflClobbered;
235 if (!fEFlags)
236 { /* likely */ }
237 else
238 {
239 Log5(("EFLAGS: Clobbering %#x: %#x -> %#x (op=%d bits=%u) - iemNativeClearPostponedEFlags\n", a_fEflClobbered,
240 pReNative->PostponedEfl.fEFlags, fEFlags, pReNative->PostponedEfl.enmOp, pReNative->PostponedEfl.cOpBits));
241 pReNative->PostponedEfl.fEFlags = fEFlags;
242 return;
243 }
244 }
245
246 /* Do cleanup. */
247 Log5(("EFLAGS: Cleanup of op=%u bits=%u efl=%#x upon clobbering %#x - iemNativeClearPostponedEFlags\n",
248 pReNative->PostponedEfl.enmOp, pReNative->PostponedEfl.cOpBits, pReNative->PostponedEfl.fEFlags, a_fEflClobbered));
249 pReNative->PostponedEfl.fEFlags = 0;
250 pReNative->PostponedEfl.enmOp = kIemNativePostponedEflOp_Invalid;
251 pReNative->PostponedEfl.cOpBits = 0;
252 iemNativeRegFreeTmp(pReNative, pReNative->PostponedEfl.idxReg1);
253 if (pReNative->PostponedEfl.idxReg2 != UINT8_MAX)
254 iemNativeRegFreeTmp(pReNative, pReNative->PostponedEfl.idxReg2);
255 pReNative->PostponedEfl.idxReg1 = UINT8_MAX;
256 pReNative->PostponedEfl.idxReg2 = UINT8_MAX;
257# if defined(VBOX_WITH_STATISTICS) || defined(IEMNATIVE_WITH_TB_DEBUG_INFO)
258 STAM_PROFILE_ADD_PERIOD(&pReNative->pVCpu->iem.s.StatNativeEflPostponedEmits, pReNative->PostponedEfl.cEmits);
259 pReNative->PostponedEfl.cEmits = 0;
260# endif
261 }
262}
263
264#endif /* IEMNATIVE_WITH_EFLAGS_POSTPONING */
265
266
267template<bool const a_fDoOp>
268DECL_INLINE_THROW(uint32_t) iemNativeEmitPostponedEFlagsCalcLogical(PIEMNATIVEINSTR pCodeBuf, uint32_t off, uint8_t cOpBits,
269 uint8_t idxRegResult, uint8_t idxRegEfl, uint8_t idxRegTmp)
270{
271#ifdef RT_ARCH_AMD64
272 /* Do TEST idxRegResult, idxRegResult to set flags. */
273 if RT_CONSTEXPR_IF(a_fDoOp)
274 off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0x84, 0x85, cOpBits, idxRegResult, idxRegResult);
275
276 /*
277 * Collect the EFLAGS status bits.
278 * We know that the overflow bit will always be cleared, so LAHF can be used.
279 */
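    /* (LAHF loads AH = SF:ZF:0:AF:0:PF:1:CF, i.e. the low byte of EFLAGS, which is
       exactly the set needed here given that OF is cleared separately below.) */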
280 if (idxRegTmp == X86_GREG_xAX)
281 {
282 /* lahf ; AH = EFLAGS */
283 pCodeBuf[off++] = 0x9f;
284 if (idxRegEfl <= X86_GREG_xBX)
285 {
286 /* mov [CDB]L, AH */
287 pCodeBuf[off++] = 0x88;
288 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, idxRegEfl);
289 }
290 else
291 {
292 /* mov AL, AH */
293 pCodeBuf[off++] = 0x88;
294 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, 0 /*AL*/);
295 /* mov xxL, AL */
296 pCodeBuf[off++] = idxRegEfl >= 8 ? X86_OP_REX_B : X86_OP_REX;
297 pCodeBuf[off++] = 0x88;
298 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 0 /*AL*/, idxRegEfl & 7);
299 }
300 }
301 else if (idxRegEfl != X86_GREG_xAX)
302 {
303# if 1 /* This is 1 or 4 bytes larger, but avoids the stack. */
304 /* xchg rax, tmp */
305 pCodeBuf[off++] = idxRegTmp < 8 ? X86_OP_REX_W : X86_OP_REX_B | X86_OP_REX_W;
306 pCodeBuf[off++] = 0x90 + (idxRegTmp & 7);
307
308 /* lahf ; AH = EFLAGS */
309 pCodeBuf[off++] = 0x9f;
310 if (idxRegEfl <= X86_GREG_xBX)
311 {
312 /* mov [CDB]L, AH */
313 pCodeBuf[off++] = 0x88;
314 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, idxRegEfl);
315 }
316 else
317 {
318 /* mov AL, AH */
319 pCodeBuf[off++] = 0x88;
320 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, 0 /*AL*/);
321 /* mov xxL, AL */
322 pCodeBuf[off++] = idxRegEfl >= 8 ? X86_OP_REX_B : X86_OP_REX;
323 pCodeBuf[off++] = 0x88;
324 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 0 /*AL*/, idxRegEfl & 7);
325 }
326
327 /* xchg rax, tmp */
328 pCodeBuf[off++] = idxRegTmp < 8 ? X86_OP_REX_W : X86_OP_REX_B | X86_OP_REX_W;
329 pCodeBuf[off++] = 0x90 + (idxRegTmp & 7);
330
331# else
332 /* pushf */
333 pCodeBuf[off++] = 0x9c;
334 /* pop tmp */
335 if (idxRegTmp >= 8)
336 pCodeBuf[off++] = X86_OP_REX_B;
337 pCodeBuf[off++] = 0x58 + (idxRegTmp & 7);
338 /* mov byte(efl), byte(tmp) */
339 if (idxRegEfl >= 4 || idxRegTmp >= 4)
340 pCodeBuf[off++] = (idxRegEfl >= 8 ? X86_OP_REX_B : X86_OP_REX)
341 | (idxRegTmp >= 8 ? X86_OP_REX_R : 0);
342 pCodeBuf[off++] = 0x88;
343 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegTmp & 7, idxRegEfl & 7);
344# endif
345 }
346 else
347 {
348 /* xchg al, ah */
349 pCodeBuf[off++] = 0x86;
350 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, 0 /*AL*/);
351 /* lahf ; AH = EFLAGS */
352 pCodeBuf[off++] = 0x9f;
353 /* xchg al, ah */
354 pCodeBuf[off++] = 0x86;
355 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, 0 /*AL*/);
356 }
357 /* BTR idxEfl, 11; Clear OF */
358 if (idxRegEfl >= 8)
359 pCodeBuf[off++] = X86_OP_REX_B;
360 pCodeBuf[off++] = 0xf;
361 pCodeBuf[off++] = 0xba;
362 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 6, idxRegEfl & 7);
363 pCodeBuf[off++] = X86_EFL_OF_BIT;
364
365#elif defined(RT_ARCH_ARM64)
366 /*
367 * Calculate flags.
368 */
369 /* Clear the status bits. ~0x8D5 (or ~0x8FD) can't be AND immediate, so use idxRegTmp for constant. */
370 off = iemNativeEmitLoadGpr32ImmExT<~X86_EFL_STATUS_BITS>(pCodeBuf, off, idxRegTmp);
371 off = iemNativeEmitAndGpr32ByGpr32Ex(pCodeBuf, off, idxRegEfl, idxRegTmp);
372
373 /* N,Z -> SF,ZF */
374 if (cOpBits < 32)
375 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegResult, cOpBits > 8); /* sets NZ */
376 else if RT_CONSTEXPR_IF(a_fDoOp)
377 pCodeBuf[off++] = Armv8A64MkInstrAnds(ARMV8_A64_REG_XZR, idxRegResult, idxRegResult, cOpBits > 32 /*f64Bit*/);
378 pCodeBuf[off++] = Armv8A64MkInstrMrs(idxRegTmp, ARMV8_AARCH64_SYSREG_NZCV); /* Bits: 31=N; 30=Z; 29=C; 28=V; */
379 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegTmp, 30);
380 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_ZF_BIT, 2, false /*f64Bit*/);
381 AssertCompile(X86_EFL_ZF_BIT + 1 == X86_EFL_SF_BIT);
382
383 /* Calculate 8-bit parity of the result. */
384 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegResult, idxRegResult, false /*f64Bit*/,
385 4 /*offShift6*/, kArmv8A64InstrShift_Lsr);
386 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegTmp, idxRegTmp, false /*f64Bit*/,
387 2 /*offShift6*/, kArmv8A64InstrShift_Lsr);
388 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegTmp, idxRegTmp, false /*f64Bit*/,
389 1 /*offShift6*/, kArmv8A64InstrShift_Lsr);
390 Assert(Armv8A64ConvertImmRImmS2Mask32(0, 0) == 1);
391 pCodeBuf[off++] = Armv8A64MkInstrEorImm(idxRegTmp, idxRegTmp, 0, 0, false /*f64Bit*/);
392 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_PF_BIT, 1, false /*f64Bit*/);
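    /* A rough C sketch of the EOR/LSR fold above (same idea, scalar form):
           r ^= r >> 4;  r ^= r >> 2;  r ^= r >> 1;   -- bit 0 = XOR of the low 8 result bits
           fPf = (r & 1) ^ 1;                          -- x86 PF is set for EVEN parity
       which is what the final EOR-with-1 and the BFI into X86_EFL_PF_BIT implement. */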
393
394#else
395# error "port me"
396#endif
397 return off;
398}
399
400#ifdef IEMNATIVE_WITH_EFLAGS_POSTPONING
401
402template<uint32_t const a_bmInputRegs, bool const a_fTlbMiss = false>
403static uint32_t iemNativeDoPostponedEFlagsInternal(PIEMRECOMPILERSTATE pReNative, uint32_t off, PIEMNATIVEINSTR pCodeBuf,
404 uint32_t bmExtraTlbMissRegs = 0)
405{
406# ifdef IEMNATIVE_WITH_TB_DEBUG_INFO
407 iemNativeDbgInfoAddPostponedEFlagsCalc(pReNative, off, pReNative->PostponedEfl.enmOp, pReNative->PostponedEfl.cOpBits,
408 pReNative->PostponedEfl.cEmits);
409# endif
410
411 /*
412 * In the TB exit code path we cannot do regular register allocation. Nor
413 * can we when we're in the TLB miss code, unless we're skipping the TLB
414 * lookup. Since the latter isn't an important use case and should get along
415 * fine on just volatile registers, we do not need to do anything special
416 * for it.
417 *
418 * So, we do our own register allocating here. Any register goes in the TB
419 * exit path, excluding a_bmInputRegs, fixed and postponed related registers.
420 * In the TLB miss we can use any volatile register and temporary registers
421 * allocated in the TLB state.
422 *
423 * Note! On x86 we prefer using RAX as the first TMP register, so we can
424 * make use of LAHF which is typically faster than PUSHF/POP. This
425 * is why the idxRegTmp allocation is first when there is no EFLAG
426 * shadow, since RAX is represented by bit 0 in the mask.
427 */
428 uint32_t bmAvailableRegs;
429 if RT_CONSTEXPR_IF(!a_fTlbMiss)
430 {
431 bmAvailableRegs = ~(a_bmInputRegs | IEMNATIVE_REG_FIXED_MASK) & IEMNATIVE_HST_GREG_MASK;
432 if (pReNative->PostponedEfl.idxReg2 != UINT8_MAX)
433 bmAvailableRegs &= ~(RT_BIT_32(pReNative->PostponedEfl.idxReg1) | RT_BIT_32(pReNative->PostponedEfl.idxReg2));
434 else
435 bmAvailableRegs &= ~RT_BIT_32(pReNative->PostponedEfl.idxReg1);
436 }
437 else
438 {
439 /* Note! a_bmInputRegs takes precedence over bmExtraTlbMissRegs. */
440 bmAvailableRegs = (IEMNATIVE_CALL_VOLATILE_GREG_MASK | bmExtraTlbMissRegs)
441 & ~(a_bmInputRegs | IEMNATIVE_REG_FIXED_MASK)
442 & IEMNATIVE_HST_GREG_MASK;
443 }
444
445 /* Use existing EFLAGS shadow if available. For the TLB-miss code path we
446 need to weed out volatile registers here, as they will no longer be valid. */
447 uint8_t idxRegTmp;
448 uint8_t idxRegEfl = pReNative->Core.aidxGstRegShadows[kIemNativeGstReg_EFlags];
449 if ( (pReNative->Core.bmGstRegShadows & RT_BIT_64(kIemNativeGstReg_EFlags))
450 && (!a_fTlbMiss || !(RT_BIT_32(idxRegEfl) & IEMNATIVE_CALL_VOLATILE_GREG_MASK)))
451 {
452 Assert(idxRegEfl < IEMNATIVE_HST_GREG_COUNT);
453 Assert(!(a_bmInputRegs & RT_BIT_32(idxRegEfl)));
454 if RT_CONSTEXPR_IF(!a_fTlbMiss) Assert(bmAvailableRegs & RT_BIT_32(idxRegEfl));
455 bmAvailableRegs &= ~RT_BIT_32(idxRegEfl);
456# ifdef VBOX_STRICT
457 off = iemNativeEmitGuestRegValueCheckEx(pReNative, pCodeBuf, off, idxRegEfl, kIemNativeGstReg_EFlags);
458# endif
459
460 idxRegTmp = ASMBitFirstSetU32(bmAvailableRegs) - 1;
461 bmAvailableRegs &= ~RT_BIT_32(idxRegTmp);
462 }
463 else
464 {
465 idxRegTmp = ASMBitFirstSetU32(bmAvailableRegs) - 1; /* allocate the temp register first to prioritize EAX on x86. */
466 bmAvailableRegs &= ~RT_BIT_32(idxRegTmp);
467
468 idxRegEfl = ASMBitFirstSetU32(bmAvailableRegs) - 1;
469 bmAvailableRegs &= ~RT_BIT_32(idxRegEfl);
470 off = iemNativeEmitLoadGprFromVCpuU32Ex(pCodeBuf, off, idxRegEfl, RT_UOFFSETOF(VMCPU, cpum.GstCtx.eflags));
471 }
472 Assert(bmAvailableRegs != 0);
473
474 /*
475 * Do the actual EFLAGS calculation.
476 */
477 switch (pReNative->PostponedEfl.enmOp)
478 {
479 case kIemNativePostponedEflOp_Logical:
480 Assert(pReNative->PostponedEfl.idxReg2 == UINT8_MAX);
481 off = iemNativeEmitPostponedEFlagsCalcLogical<true>(pCodeBuf, off, pReNative->PostponedEfl.cOpBits,
482 pReNative->PostponedEfl.idxReg1, idxRegEfl, idxRegTmp);
483 break;
484
485 default:
486 AssertFailedBreak();
487 }
488
489 /*
490 * Store EFLAGS.
491 */
492# ifdef VBOX_STRICT
493 /* check that X86_EFL_1 is set. */
494 uint32_t offFixup1;
495 off = iemNativeEmitTestBitInGprAndJmpToFixedIfSetEx(pCodeBuf, off, idxRegEfl, X86_EFL_1_BIT, off, &offFixup1);
496 off = iemNativeEmitBrkEx(pCodeBuf, off, 0x3330);
497 iemNativeFixupFixedJump(pReNative, offFixup1, off);
498 /* Check that X86_EFL_RAZ_LO_MASK is zero. */
499 off = iemNativeEmitTestAnyBitsInGpr32Ex(pCodeBuf, off, idxRegEfl, X86_EFL_RAZ_LO_MASK, idxRegTmp);
500 uint32_t const offFixup2 = off;
501 off = iemNativeEmitJccToFixedEx(pCodeBuf, off, off, kIemNativeInstrCond_e);
502 off = iemNativeEmitBrkEx(pCodeBuf, off, 0x3331);
503 iemNativeFixupFixedJump(pReNative, offFixup2, off);
504# endif
505 off = iemNativeEmitStoreGprToVCpuU32Ex(pCodeBuf, off, idxRegEfl, RT_UOFFSETOF(VMCPU, cpum.GstCtx.eflags));
506 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
507
508# if defined(VBOX_WITH_STATISTICS) || defined(IEMNATIVE_WITH_TB_DEBUG_INFO)
509 pReNative->PostponedEfl.cEmits++;
510# endif
511 return off;
512}
513
514
515
516template<uint32_t const a_bmInputRegs>
517DECL_FORCE_INLINE_THROW(uint32_t)
518iemNativeDoPostponedEFlagsAtTbExit(PIEMRECOMPILERSTATE pReNative, uint32_t off)
519{
520 if (pReNative->PostponedEfl.fEFlags)
521 {
522 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, IEMNATIVE_MAX_POSTPONED_EFLAGS_INSTRUCTIONS);
523 return iemNativeDoPostponedEFlagsInternal<a_bmInputRegs>(pReNative, off, pCodeBuf);
524 }
525 return off;
526}
527
528
529template<uint32_t const a_bmInputRegs>
530DECL_FORCE_INLINE_THROW(uint32_t)
531iemNativeDoPostponedEFlagsAtTbExitEx(PIEMRECOMPILERSTATE pReNative, uint32_t off, PIEMNATIVEINSTR pCodeBuf)
532{
533 if (pReNative->PostponedEfl.fEFlags)
534 return iemNativeDoPostponedEFlagsInternal<a_bmInputRegs>(pReNative, off, pCodeBuf);
535 return off;
536}
537
538
539template<uint32_t const a_bmInputRegs>
540DECL_FORCE_INLINE_THROW(uint32_t)
541iemNativeDoPostponedEFlagsAtTlbMiss(PIEMRECOMPILERSTATE pReNative, uint32_t off, const IEMNATIVEEMITTLBSTATE *pTlbState,
542 uint32_t bmTmpRegs)
543{
544 if (pReNative->PostponedEfl.fEFlags)
545 {
546 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, IEMNATIVE_MAX_POSTPONED_EFLAGS_INSTRUCTIONS);
547 return iemNativeDoPostponedEFlagsInternal<a_bmInputRegs, true>(pReNative, off, pCodeBuf,
548 pTlbState->getRegsNotToSave() | bmTmpRegs);
549 }
550 return off;
551}
552
553#endif /* IEMNATIVE_WITH_EFLAGS_POSTPONING */
554
555
556/**
557 * This is an implementation of IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL.
558 *
559 * It takes liveness stuff into account.
560 */
561template<bool a_fNeedToSetFlags>
562DECL_INLINE_THROW(uint32_t)
563iemNativeEmitEFlagsForLogical(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarEfl,
564 uint8_t cOpBits, uint8_t idxRegResult)
565{
566 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflTotalLogical);
567 IEMNATIVE_CLEAR_POSTPONED_EFLAGS(pReNative, X86_EFL_STATUS_BITS);
568 RT_NOREF(cOpBits, idxRegResult);
569
570#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
571 /*
572 * See if we can skip this wholesale.
573 */
574 PCIEMLIVENESSENTRY const pLivenessEntry = &pReNative->paLivenessEntries[pReNative->idxCurCall];
575 uint64_t const fEflClobbered = IEMLIVENESS_STATE_GET_WILL_BE_CLOBBERED_SET(pLivenessEntry)
576 & IEMLIVENESSBIT_STATUS_EFL_MASK;
577# ifdef IEMNATIVE_WITH_EFLAGS_POSTPONING
578 uint64_t fEflPostponing;
579# endif
580 if ( fEflClobbered == IEMLIVENESSBIT_STATUS_EFL_MASK
581 && !(pReNative->fMc & IEM_MC_F_WITH_FLAGS))
582 {
583 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflSkippedLogical);
584 pReNative->fSkippingEFlags = X86_EFL_STATUS_BITS;
585# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
586 off = iemNativeEmitOrImmIntoVCpuU32(pReNative, off, X86_EFL_STATUS_BITS, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
587# endif
588 Log5(("EFLAGS: Skipping %#x - iemNativeEmitEFlagsForLogical\n", X86_EFL_STATUS_BITS));
589 return off;
590 }
591# ifdef IEMNATIVE_WITH_EFLAGS_POSTPONING
592 if ( ( (fEflPostponing = IEMLIVENESS_STATE_GET_CAN_BE_POSTPONED_SET(pLivenessEntry) & IEMLIVENESSBIT_STATUS_EFL_MASK)
593 | fEflClobbered)
594 == IEMLIVENESSBIT_STATUS_EFL_MASK
595 && idxRegResult != UINT8_MAX)
596 {
597 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflPostponedLogical);
598 pReNative->PostponedEfl.fEFlags = X86_EFL_STATUS_BITS;
599 pReNative->PostponedEfl.enmOp = kIemNativePostponedEflOp_Logical;
600 pReNative->PostponedEfl.cOpBits = cOpBits;
601 pReNative->PostponedEfl.idxReg1 = iemNativeRegAllocTmpEx(pReNative, &off, IEMNATIVE_POSTPONING_REG_MASK, false);
602 /** @todo it would normally be possible to use idxRegResult, iff it is
603 * already a non-volatile register and we can be sure the caller
604 * doesn't modify it. That'll save a register move and allocation. */
605 off = iemNativeEmitLoadGprFromGpr(pReNative, off, pReNative->PostponedEfl.idxReg1, idxRegResult);
606 Log5(("EFLAGS: Postponing %#x op=%u bits=%u reg1=%u - iemNativeEmitEFlagsForLogical\n", X86_EFL_STATUS_BITS,
607 kIemNativePostponedEflOp_Logical, cOpBits, pReNative->PostponedEfl.idxReg1));
608 }
609# endif
610 else
611#endif
612 {
613 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
614 uint8_t const idxRegTmp = iemNativeRegAllocTmp(pReNative, &off);
615#ifdef RT_ARCH_AMD64
616 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 32);
617#elif defined(RT_ARCH_ARM64)
618 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 16);
619#else
620# error "port me"
621#endif
622 off = iemNativeEmitPostponedEFlagsCalcLogical<a_fNeedToSetFlags>(pCodeBuf, off, cOpBits, idxRegResult,
623 idxRegEfl, idxRegTmp);
624 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
625
626 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
627 iemNativeRegFreeTmp(pReNative, idxRegTmp);
628 }
629
630#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
631 if (pReNative->fSkippingEFlags)
632 Log5(("EFLAGS: fSkippingEFlags %#x -> 0 (iemNativeEmitEFlagsForLogical)\n", pReNative->fSkippingEFlags));
633 pReNative->fSkippingEFlags = 0;
634# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
635 off = iemNativeEmitStoreImmToVCpuU32(pReNative, off, 0, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
636# endif
637#endif
638 return off;
639}
640
641
642/**
643 * This is an implementation of IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC.
644 *
645 * It takes liveness stuff into account.
646 */
647DECL_FORCE_INLINE_THROW(uint32_t)
648iemNativeEmitEFlagsForArithmetic(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarEfl, uint8_t idxRegEflIn
649#ifndef RT_ARCH_AMD64
650 , uint8_t cOpBits, uint8_t idxRegResult, uint8_t idxRegDstIn, uint8_t idxRegSrc
651 , bool fInvertCarry, uint64_t uImmSrc
652#endif
653 )
654{
655 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflTotalArithmetic);
656 IEMNATIVE_CLEAR_POSTPONED_EFLAGS(pReNative, X86_EFL_STATUS_BITS);
657
658#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
659 /*
660 * See if we can skip this wholesale.
661 */
662 PCIEMLIVENESSENTRY const pLivenessEntry = &pReNative->paLivenessEntries[pReNative->idxCurCall];
663 if ( IEMLIVENESS_STATE_ARE_STATUS_EFL_TO_BE_CLOBBERED(pLivenessEntry)
664 && !(pReNative->fMc & IEM_MC_F_WITH_FLAGS))
665 {
666 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflSkippedArithmetic);
667 pReNative->fSkippingEFlags = X86_EFL_STATUS_BITS;
668 Log5(("EFLAGS: Skipping %#x - iemNativeEmitEFlagsForArithmetic\n", X86_EFL_STATUS_BITS));
669# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
670 off = iemNativeEmitOrImmIntoVCpuU32(pReNative, off, X86_EFL_STATUS_BITS, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
671# endif
672 }
673 else
674#endif
675 {
676#ifdef RT_ARCH_AMD64
677 /*
678 * Collect flags and merge them with eflags.
679 */
680 PIEMNATIVEINSTR pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
681 /* pushf - do this before any reg allocations as they may emit instructions too. */
682 pCodeBuf[off++] = 0x9c;
683
684 uint8_t const idxRegEfl = idxRegEflIn != UINT8_MAX ? idxRegEflIn
685 : iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
686 uint8_t const idxTmpReg = iemNativeRegAllocTmp(pReNative, &off);
687 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2 + 7 + 7 + 3);
688 /* pop tmp */
689 if (idxTmpReg >= 8)
690 pCodeBuf[off++] = X86_OP_REX_B;
691 pCodeBuf[off++] = 0x58 + (idxTmpReg & 7);
692 /* Isolate the flags we want. */
693 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxTmpReg, X86_EFL_STATUS_BITS);
694 /* Clear the status bits in EFLs. */
695 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegEfl, ~X86_EFL_STATUS_BITS);
696 /* OR in the flags we collected. */
697 off = iemNativeEmitOrGpr32ByGprEx(pCodeBuf, off, idxRegEfl, idxTmpReg);
698 if (idxRegEflIn != idxRegEfl)
699 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
700 iemNativeRegFreeTmp(pReNative, idxTmpReg);
701
702#elif defined(RT_ARCH_ARM64)
703 /*
704 * Calculate flags.
705 */
706 uint8_t const idxRegEfl = idxRegEflIn != UINT8_MAX ? idxRegEflIn
707 : iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
708 uint8_t const idxTmpReg = iemNativeRegAllocTmp(pReNative, &off);
709 uint8_t const idxTmpReg2 = cOpBits >= 32 ? UINT8_MAX : iemNativeRegAllocTmp(pReNative, &off);
710 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 20);
711
712 /* Invert CF (stored inverted on ARM) and load the flags into the temporary register. */
713 if (fInvertCarry)
714 pCodeBuf[off++] = ARMV8_A64_INSTR_CFINV;
715 pCodeBuf[off++] = Armv8A64MkInstrMrs(idxTmpReg, ARMV8_AARCH64_SYSREG_NZCV); /* Bits: 31=N; 30=Z; 29=C; 28=V; */
716
717 if (cOpBits >= 32)
718 {
719 /* V -> OF */
720 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, 28);
721 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_OF_BIT, 1, false /*f64Bit*/);
722
723 /* C -> CF */
724 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, 1);
725 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_CF_BIT, 1, false /*f64Bit*/);
726 }
727
728 /* N,Z -> SF,ZF */
729 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, cOpBits >= 32 ? 1 : 30);
730 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_ZF_BIT, 2, false /*f64Bit*/);
731
732 /* For ADC and SBB we have to calculate overflow and carry ourselves. */
733 if (cOpBits < 32)
734 {
735 /* Since the carry flag is the zeroth flag, we just use BFXIL to copy it over. */
736 AssertCompile(X86_EFL_CF_BIT == 0);
737 pCodeBuf[off++] = Armv8A64MkInstrBfxil(idxRegEfl, idxRegResult, cOpBits, 1, false /*f64Bit*/);
738
739 /* The overflow flag is more work as we have to compare the signed bits for
740 both inputs and the result. See IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC.
741
742 Formula: ~(a_uDst ^ a_uSrcOf) & (a_uResult ^ a_uDst)
743 With a_uSrcOf as a_uSrc for additions and ~a_uSrc for subtractions.
744
745 It is a bit simpler when the right (source) side is constant:
746 adc: S D R -> OF sbb: S D R -> OF
747 0 0 0 -> 0 \ 0 0 0 -> 0 \
748 0 0 1 -> 1 \ 0 0 1 -> 0 \
749 0 1 0 -> 0 / and not(D), R 0 1 0 -> 1 / and D, not(R)
750 0 1 1 -> 0 / 0 1 1 -> 0 /
751 1 0 0 -> 0 \ 1 0 0 -> 0 \
752 1 0 1 -> 0 \ and D, not(R) 1 0 1 -> 1 \ and not(D), R
753 1 1 0 -> 1 / 1 1 0 -> 0 /
754 1 1 1 -> 0 / 1 1 1 -> 0 / */
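        /* Plain C sketch of the general case (with uSrcOf = uSrc for adc and
           uSrcOf = ~uSrc for sbb, as per the formula above):
               fOf = ((~(uDst ^ uSrcOf) & (uResult ^ uDst)) >> (cOpBits - 1)) & 1;
           The three branches below specialise this for a register source and for
           constant sources with the sign bit set or clear. */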
755 if (idxRegSrc != UINT8_MAX)
756 {
757 if (fInvertCarry) /* sbb: ~((a_uDst) ^ ~(a_uSrcOf)) -> (a_uDst) ^ (a_uSrcOf); HACK ALERT: fInvertCarry == sbb */
758 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxRegDstIn, idxRegSrc, false);
759 else /* adc: ~((a_uDst) ^ (a_uSrcOf)) -> (a_uDst) ^ ~(a_uSrcOf) */
760 pCodeBuf[off++] = Armv8A64MkInstrEon(idxTmpReg, idxRegDstIn, idxRegSrc, false);
761 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg2, idxRegDstIn, idxRegResult, false); /* (a_uDst) ^ (a_uResult) */
762 pCodeBuf[off++] = Armv8A64MkInstrAnd(idxTmpReg, idxTmpReg, idxTmpReg2, false /*f64Bit*/);
763 }
764 else if (uImmSrc & RT_BIT_32(cOpBits - 1))
765 {
766 if (fInvertCarry) /* HACK ALERT: fInvertCarry == sbb */
767 pCodeBuf[off++] = Armv8A64MkInstrBic(idxTmpReg, idxRegResult, idxRegDstIn, false);
768 else
769 pCodeBuf[off++] = Armv8A64MkInstrBic(idxTmpReg, idxRegDstIn, idxRegResult, false);
770 }
771 else
772 {
773 if (fInvertCarry) /* HACK ALERT: fInvertCarry == sbb */
774 pCodeBuf[off++] = Armv8A64MkInstrBic(idxTmpReg, idxRegDstIn, idxRegResult, false);
775 else
776 pCodeBuf[off++] = Armv8A64MkInstrBic(idxTmpReg, idxRegResult, idxRegDstIn, false);
777 }
778 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, cOpBits - 1, false /*f64Bit*/);
779 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_OF_BIT, 1);
780 iemNativeRegFreeTmp(pReNative, idxTmpReg2);
781 }
782
783 /* Calculate 8-bit parity of the result. */
784 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxRegResult, idxRegResult, false /*f64Bit*/,
785 4 /*offShift6*/, kArmv8A64InstrShift_Lsr);
786 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxTmpReg, idxTmpReg, false /*f64Bit*/,
787 2 /*offShift6*/, kArmv8A64InstrShift_Lsr);
788 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxTmpReg, idxTmpReg, false /*f64Bit*/,
789 1 /*offShift6*/, kArmv8A64InstrShift_Lsr);
790 Assert(Armv8A64ConvertImmRImmS2Mask32(0, 0) == 1);
791 pCodeBuf[off++] = Armv8A64MkInstrEorImm(idxTmpReg, idxTmpReg, 0, 0, false /*f64Bit*/);
792 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_PF_BIT, 1, false /*f64Bit*/);
793
794 /* Calculate auxiliary carry/borrow. This is related to 8-bit BCD.
795 General formula: ((uint32_t)(a_uResult) ^ (uint32_t)(a_uSrc) ^ (uint32_t)(a_uDst)) & X86_EFL_AF;
796 S D R
797 0 0 0 -> 0; \
798 0 0 1 -> 1; \ regular
799 0 1 0 -> 1; / xor R, D
800 0 1 1 -> 0; /
801 1 0 0 -> 1; \
802 1 0 1 -> 0; \ invert one of the two
803 1 1 0 -> 0; / xor not(R), D
804 1 1 1 -> 1; /
805 a_uSrc[bit 4]=0: ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF;
806 a_uSrc[bit 4]=1: ((uint32_t)~(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF;
807 */
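    /* Condensed C sketch of the same: fAf = ((uResult ^ uSrc ^ uDst) >> X86_EFL_AF_BIT) & 1;
       for a constant source only its bit 4 matters, hence the EOR/EON choice below. */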
808
809 if (idxRegSrc != UINT8_MAX)
810 {
811 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxRegDstIn, idxRegSrc, false /*f64Bit*/);
812 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxTmpReg, idxRegResult, false /*f64Bit*/);
813 }
814 else if (uImmSrc & X86_EFL_AF)
815 pCodeBuf[off++] = Armv8A64MkInstrEon(idxTmpReg, idxRegDstIn, idxRegResult, false /*f64Bit*/);
816 else
817 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxRegDstIn, idxRegResult, false /*f64Bit*/);
818 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, X86_EFL_AF_BIT, false /*f64Bit*/);
819 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_AF_BIT, 1, false /*f64Bit*/);
820
821 if (idxRegEflIn != idxRegEfl)
822 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
823 iemNativeRegFreeTmp(pReNative, idxTmpReg);
824
825#else
826# error "port me"
827#endif
828 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
829
830#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
831 if (pReNative->fSkippingEFlags)
832 Log5(("EFLAGS: fSkippingEFlags %#x -> 0 (iemNativeEmitEFlagsForArithmetic)\n", pReNative->fSkippingEFlags));
833 pReNative->fSkippingEFlags = 0;
834# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
835 off = iemNativeEmitStoreImmToVCpuU32(pReNative, off, 0, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
836# endif
837#endif
838 }
839 return off;
840
841}
842
843
844
845/*********************************************************************************************************************************
846* Bitwise Logical Operations *
847*********************************************************************************************************************************/
848
849/**
850 * The AND instruction will clear OF, CF and AF (latter is undefined) and
851 * set the other flags according to the result.
852 */
853template<uint8_t const a_cOpBits>
854DECL_INLINE_THROW(uint32_t)
855iemNativeEmit_and_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
856{
857 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
858 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
859#ifdef RT_ARCH_AMD64
860 /* On AMD64 we just use the correctly sized AND instruction to harvest the EFLAGS. */
861 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
862 0x22, 0x23, a_cOpBits, idxRegDst, idxRegSrc);
863 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
864 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
865
866 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
867
868#elif defined(RT_ARCH_ARM64)
869 /* On ARM64 we use 32-bit AND for the 8-bit and 16-bit ones. */
870 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
871 pCodeBuf[off++] = Armv8A64MkInstrAnds(idxRegDst, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/);
872 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
873 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
874
875 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
876#else
877# error "Port me"
878#endif
879 iemNativeVarRegisterRelease(pReNative, idxVarDst);
880 return off;
881}
882
883
884/**
885 * The AND instruction with immediate value as right operand.
886 */
887template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
888DECL_INLINE_THROW(uint32_t)
889iemNativeEmit_and_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
890{
891 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
892#ifdef RT_ARCH_AMD64
893 /* On AMD64 we just use the correctly sized AND instruction to harvest the EFLAGS. */
894 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
895 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 4, idxRegDst, uImmOp);
896 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
897
898 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
899
900#elif defined(RT_ARCH_ARM64)
901 /* On ARM64 we use 32-bit AND for the 8-bit and 16-bit ones, and of
902 course the immediate variant when possible to save a register load. */
903 uint32_t uImmSizeLen, uImmRotations;
904 if ( a_cOpBits > 32
905 ? Armv8A64ConvertMask64ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations)
906 : Armv8A64ConvertMask32ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations))
907 {
908 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
909 if (a_cOpBits >= 32)
910 pCodeBuf[off++] = Armv8A64MkInstrAndsImm(idxRegDst, idxRegDst, uImmSizeLen, uImmRotations, a_cOpBits > 32 /*f64Bit*/);
911 else
912 pCodeBuf[off++] = Armv8A64MkInstrAndImm(idxRegDst, idxRegDst, uImmSizeLen, uImmRotations, a_cOpBits > 32 /*f64Bit*/);
913 }
914 else
915 {
916 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
917 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
918 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
919 pCodeBuf[off++] = Armv8A64MkInstrAnds(idxRegDst, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/);
920 else
921 pCodeBuf[off++] = Armv8A64MkInstrAnd(idxRegDst, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/);
922 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
923 }
924 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
925
926 off = iemNativeEmitEFlagsForLogical<a_cOpBits < 32>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
927
928#else
929# error "Port me"
930#endif
931 iemNativeVarRegisterRelease(pReNative, idxVarDst);
932 return off;
933}
934
935
936/**
937 * The TEST instruction will clear OF, CF and AF (latter is undefined) and
938 * set the other flags according to the result.
939 */
940template<uint8_t const a_cOpBits>
941DECL_INLINE_THROW(uint32_t)
942iemNativeEmit_test_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
943{
944 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
945 uint8_t const idxRegSrc = idxVarSrc == idxVarDst ? idxRegDst /* special case of 'test samereg,samereg' */
946 : iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
947#ifdef RT_ARCH_AMD64
948 /* On AMD64 we just use the correctly sized TEST instruction to harvest the EFLAGS. */
949 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
950 0x84, 0x85, a_cOpBits, idxRegSrc, idxRegDst);
951 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
952
953#elif defined(RT_ARCH_ARM64)
954 /* On ARM64 we use 32-bit AND for the 8-bit and 16-bit ones. We also
955 need to keep the result in order to calculate the flags. */
956 uint8_t const idxRegResult = iemNativeRegAllocTmp(pReNative, &off);
957 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
958 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
959 pCodeBuf[off++] = Armv8A64MkInstrAnds(idxRegResult, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/);
960 else
961 pCodeBuf[off++] = Armv8A64MkInstrAnd(idxRegResult, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/);
962 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
963
964#else
965# error "Port me"
966#endif
967 if (idxVarSrc != idxVarDst)
968 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
969 iemNativeVarRegisterRelease(pReNative, idxVarDst);
970
971#ifdef RT_ARCH_AMD64
972 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, UINT8_MAX);
973#else
974 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
975 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, idxRegResult);
976 else
977 off = iemNativeEmitEFlagsForLogical<true>(pReNative, off, idxVarEfl, a_cOpBits, idxRegResult);
978 iemNativeRegFreeTmp(pReNative, idxRegResult);
979#endif
980 return off;
981}
982
983
984/**
985 * The TEST instruction with immediate value as right operand.
986 */
987template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
988DECL_INLINE_THROW(uint32_t)
989iemNativeEmit_test_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
990{
991 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
992#ifdef RT_ARCH_AMD64
993 /* On AMD64 we just use the correctly sized TEST instruction to harvest the EFLAGS. */
994 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
995 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0xf6, 0xcc, 0xf7, a_cOpBits, a_cImmBits, 0, idxRegDst, uImmOp);
996 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
997 iemNativeVarRegisterRelease(pReNative, idxVarDst);
998
999 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, UINT8_MAX);
1000
1001#elif defined(RT_ARCH_ARM64)
1002 /* On ARM64 we use 32-bit AND for the 8-bit and 16-bit ones, and of
1003 course the immediate variant when possible to save a register load.
1004 We also need to keep the result in order to calculate the flags. */
1005 uint8_t const idxRegResult = iemNativeRegAllocTmp(pReNative, &off);
1006 uint32_t uImmSizeLen, uImmRotations;
1007 if ( a_cOpBits > 32
1008 ? Armv8A64ConvertMask64ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations)
1009 : Armv8A64ConvertMask32ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations))
1010 {
1011 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1012 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1013 pCodeBuf[off++] = Armv8A64MkInstrAndsImm(idxRegResult, idxRegDst, uImmSizeLen, uImmRotations, a_cOpBits > 32 /*f64Bit*/);
1014 else
1015 pCodeBuf[off++] = Armv8A64MkInstrAndImm(idxRegResult, idxRegDst, uImmSizeLen, uImmRotations, a_cOpBits > 32 /*f64Bit*/);
1016 }
1017 else
1018 {
1019 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1020 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1021 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1022 pCodeBuf[off++] = Armv8A64MkInstrAnds(idxRegResult, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/);
1023 else
1024 pCodeBuf[off++] = Armv8A64MkInstrAnd(idxRegResult, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/);
1025 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1026 }
1027 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1028 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1029
1030 off = iemNativeEmitEFlagsForLogical<a_cOpBits < 32>(pReNative, off, idxVarEfl, a_cOpBits, idxRegResult);
1031
1032 iemNativeRegFreeTmp(pReNative, idxRegResult);
1033
1034#else
1035# error "Port me"
1036#endif
1037 return off;
1038}
1039
1040
1041/**
1042 * The OR instruction will clear OF, CF and AF (latter is undefined) and
1043 * set the other flags according to the result.
1044 */
1045template<uint8_t const a_cOpBits>
1046DECL_INLINE_THROW(uint32_t)
1047iemNativeEmit_or_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1048{
1049 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1050 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1051#ifdef RT_ARCH_AMD64
1052 /* On AMD64 we just use the correctly sized OR instruction to harvest the EFLAGS. */
1053 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1054 0x0a, 0x0b, a_cOpBits, idxRegDst, idxRegSrc);
1055 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1056 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1057
1058 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1059
1060#elif defined(RT_ARCH_ARM64)
1061 /* On ARM64 we use 32-bit OR for the 8-bit and 16-bit ones. */
1062 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1063 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDst, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/);
1064 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1065 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1066
1067 off = iemNativeEmitEFlagsForLogical<true>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1068
1069#else
1070# error "Port me"
1071#endif
1072 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1073 return off;
1074}
1075
1076
1077/**
1078 * The OR instruction with immediate value as right operand.
1079 */
1080template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1081DECL_INLINE_THROW(uint32_t)
1082iemNativeEmit_or_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1083{
1084 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1085#ifdef RT_ARCH_AMD64
1086 /* On AMD64 we just use the correctly sized OR instruction to harvest the EFLAGS. */
1087 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1088 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 1, idxRegDst, uImmOp);
1089 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1090
1091 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1092
1093#elif defined(RT_ARCH_ARM64)
1094 /* On ARM64 we use 32-bit OR for the 8-bit and 16-bit ones, and of
1095 course the immediate variant when possible to save a register load. */
1096 uint32_t uImmSizeLen, uImmRotations;
1097 if ( a_cOpBits > 32
1098 ? Armv8A64ConvertMask64ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations)
1099 : Armv8A64ConvertMask32ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations))
1100 {
1101 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1102 pCodeBuf[off++] = Armv8A64MkInstrOrrImm(idxRegDst, idxRegDst, uImmSizeLen, uImmRotations, a_cOpBits > 32 /*f64Bit*/);
1103 }
1104 else
1105 {
1106 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1107 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1108 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDst, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/);
1109 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1110 }
1111 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1112
1113 off = iemNativeEmitEFlagsForLogical<true>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1114
1115#else
1116# error "Port me"
1117#endif
1118 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1119 return off;
1120}
1121
1122
1123/**
1124 * The XOR instruction will clear OF, CF and AF (latter is undefined) and
1125 * set the other flags according to the result.
1126 */
1127template<uint8_t const a_cOpBits>
1128DECL_INLINE_THROW(uint32_t)
1129iemNativeEmit_xor_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1130{
1131 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1132 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1133#ifdef RT_ARCH_AMD64
1134 /* On AMD64 we just use the correctly sized XOR instruction to harvest the EFLAGS. */
1135 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1136 0x32, 0x33, a_cOpBits, idxRegDst, idxRegSrc);
1137 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1138 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1139
1140 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1141
1142#elif defined(RT_ARCH_ARM64)
1143 /* On ARM64 we use 32-bit EOR for the 8-bit and 16-bit ones. */
1144 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1145 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegDst, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/);
1146 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1147 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1148
1149 off = iemNativeEmitEFlagsForLogical<true>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1150
1151#else
1152# error "Port me"
1153#endif
1154 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1155 return off;
1156}
1157
1158
1159/**
1160 * The XOR instruction with immediate value as right operand.
1161 */
1162template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1163DECL_INLINE_THROW(uint32_t)
1164iemNativeEmit_xor_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1165{
1166 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1167#ifdef RT_ARCH_AMD64
1168 /* On AMD64 we just use the correctly sized XOR instruction to harvest the EFLAGS. */
1169 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1170 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 6, idxRegDst, uImmOp);
1171 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1172
1173 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1174
1175#elif defined(RT_ARCH_ARM64)
1176 /* On ARM64 we use 32-bit EOR for the 8-bit and 16-bit ones, and of
1177 course the immediate variant when possible to save a register load. */
1178 uint32_t uImmSizeLen, uImmRotations;
1179 if ( a_cOpBits > 32
1180 ? Armv8A64ConvertMask64ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations)
1181 : Armv8A64ConvertMask32ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations))
1182 {
1183 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1184 pCodeBuf[off++] = Armv8A64MkInstrEorImm(idxRegDst, idxRegDst, uImmSizeLen, uImmRotations, a_cOpBits > 32 /*f64Bit*/);
1185 }
1186 else
1187 {
1188 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1189 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1190 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegDst, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/);
1191 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1192 }
1193 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1194
1195 off = iemNativeEmitEFlagsForLogical<true>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1196
1197#else
1198# error "Port me"
1199#endif
1200 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1201 return off;
1202}
1203
1204
1205
1206/*********************************************************************************************************************************
1207* ADD, ADC, SUB, SBB, CMP *
1208*********************************************************************************************************************************/
1209
1210/**
1211 * The ADD instruction will set all status flags.
1212 */
1213template<uint8_t const a_cOpBits>
1214DECL_INLINE_THROW(uint32_t)
1215iemNativeEmit_add_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1216{
1217 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1218 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1219
1220#ifdef RT_ARCH_AMD64
1221 /* On AMD64 we just use the correctly sized ADD instruction to get the right EFLAGS values. */
1222 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1223 0x02, 0x03, a_cOpBits, idxRegDst, idxRegSrc);
1224 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1225
1226 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1227 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1228
1229 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1230
1231#elif defined(RT_ARCH_ARM64)
1232 /* On ARM64 we'll need the two input operands as well as the result in order
1233 to calculate the right flags, even if we use ADDS and translate NZCV into
1234 OF, CF, ZF and SF. */
1235 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1236 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4);
1237 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1238 {
1239 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1240 pCodeBuf[off++] = Armv8A64MkInstrAddReg(idxRegDst, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1241 }
1242 else
1243 {
1244 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1245 uint32_t const cShift = 32 - a_cOpBits;
1246 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDstIn, ARMV8_A64_REG_XZR, idxRegDst, false /*f64Bit*/, cShift);
1247 pCodeBuf[off++] = Armv8A64MkInstrAddReg(idxRegDst, idxRegDstIn, idxRegSrc, false /*f64Bit*/,
1248 true /*fSetFlags*/, cShift);
1249 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDstIn, idxRegDstIn, cShift, false /*f64Bit*/);
1250 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDst, idxRegDst, cShift, false /*f64Bit*/);
1251 }
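        /* Worked example of the shift-up trick (sketch): for an 8-bit add the operands end
           up in bits 24..31, so 0x7f + 0x01 becomes 0x7f000000 + 0x01000000 = 0x80000000;
           the 32-bit ADDS then reports V=1 and C=0, exactly what the 8-bit add would have
           produced, and the two LSRs move the input copy and result back down afterwards. */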
1252 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1253
1254 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, a_cOpBits > 32 ? a_cOpBits : 32, idxRegDst,
1255 idxRegDstIn, idxRegSrc, false /*fInvertCarry*/, 0);
1256
1257 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1258 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1259 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1260
1261#else
1262# error "port me"
1263#endif
1264 return off;
1265}
1266
1267
1268/**
1269 * The ADD instruction with immediate value as right operand.
1270 */
1271template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1272DECL_INLINE_THROW(uint32_t)
1273iemNativeEmit_add_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1274{
1275 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1276
1277#ifdef RT_ARCH_AMD64
1278 /* On AMD64 we just use the correctly sized ADD instruction to get the right EFLAGS values. */
1279 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1280 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 0, idxRegDst, uImmOp);
1281 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1282
1283 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1284
1285 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1286
1287#elif defined(RT_ARCH_ARM64)
1288 /* On ARM64 we'll need the two input operands as well as the result in order
1289 to calculate the right flags, even if we use ADDS and translate NZCV into
1290 OF, CF, ZF and SF. */
1291 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1292 PIEMNATIVEINSTR pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1293 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1294 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1295 {
1296 if (uImmOp <= 0xfffU)
1297 pCodeBuf[off++] = Armv8A64MkInstrAddUImm12(idxRegDst, idxRegDst, uImmOp, a_cOpBits > 32 /*f64Bit*/,
1298 true /*fSetFlags*/);
1299 else if (uImmOp <= 0xfff000U && !(uImmOp & 0xfff))
1300 pCodeBuf[off++] = Armv8A64MkInstrAddUImm12(idxRegDst, idxRegDst, uImmOp >> 12, a_cOpBits > 32 /*f64Bit*/,
1301 true /*fSetFlags*/, true /*fShift12*/);
1302 else
1303 {
1304 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1305 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1306 pCodeBuf[off++] = Armv8A64MkInstrAddReg(idxRegDst, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/,
1307 true /*fSetFlags*/);
1308 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1309 }
1310 }
1311 else
1312 {
1313 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1314 uint32_t const cShift = 32 - a_cOpBits;
1315 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp << cShift);
1316 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2);
1317 pCodeBuf[off++] = Armv8A64MkInstrAddReg(idxRegDst, idxRegTmpImm, idxRegDstIn, false /*f64Bit*/, true /*fSetFlags*/, cShift);
1318 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDst, idxRegDst, cShift, false /*f64Bit*/);
1319 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1320 }
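    /* Rough scalar illustration of the shift-up trick used above (illustrative
       variable names only, not the code the emitter produces): for an 8-bit ADD
       the operands are placed in the top byte of a 32-bit register, so a 32-bit
       ADDS yields the same N, Z, C and V as the narrow x86 addition would:
           uint32_t const uShifted = (uint32_t)uDst8 << 24;
           uint32_t const uSum     = uShifted + ((uint32_t)uImm8 << 24);  // ADDS sets NZCV here
           uint8_t  const uRes8    = (uint8_t)(uSum >> 24);               // shift the result back down
       The carry out of bit 31 is exactly the carry out of bit 7 of the 8-bit add. */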
1321 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1322
1323 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, a_cOpBits > 32 ? a_cOpBits : 32, idxRegDst,
1324 idxRegDstIn, UINT8_MAX, false /*fInvertCarry*/, uImmOp);
1325
1326 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1327 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1328
1329#else
1330# error "port me"
1331#endif
1332 return off;
1333}
1334
1335
1336/**
1337 * The ADC instruction takes CF as input and will set all status flags.
1338 */
1339template<uint8_t const a_cOpBits>
1340DECL_INLINE_THROW(uint32_t)
1341iemNativeEmit_adc_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1342{
1343 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1344 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1345 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
1346
1347#ifdef RT_ARCH_AMD64
1348 /* On AMD64 we use BT to set EFLAGS.CF and then issue an ADC instruction
1349 with matching size to get the correct flags. */
1350 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 9);
1351
1352 /* Use the BT instruction to set CF according to idxRegEfl. */
1353 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b, 0xba, 32 /*cOpBits*/, 4, idxRegEfl);
1354 pCodeBuf[off++] = X86_EFL_CF_BIT;
1355
1356 off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0x12, 0x13, a_cOpBits, idxRegDst, idxRegSrc);
1357 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1358
1359 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1360 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1361
1362 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl);
1363
1364#elif defined(RT_ARCH_ARM64)
1365 /* On ARM64 we use the RMIF instruction to load PSTATE.CF from idxRegEfl and
1366 then ADCS for the calculation. We need all inputs and result for the two
1367 flags (AF,PF) that can't be directly derived from PSTATE.NZCV. */
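    /* For reference: AF is bit 4 of Dst ^ Src ^ Result and PF is the even parity
       of the result's low byte, which is why the raw inputs and the result are
       kept around for the EFLAGS helper called below. */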
1368 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1369 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 7);
1370
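    /* Note: RMIF rotates the first operand right by the immediate and copies the
       selected bits of the rotated value's low nibble into NZCV (mask bit 1 = C).
       Rotating EFLAGS right by 63, i.e. left by one, puts CF (bit 0) into bit 1,
       so only PSTATE.C is loaded from the guest carry flag here. */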
1371 pCodeBuf[off++] = Armv8A64MkInstrRmif(idxRegEfl, (X86_EFL_CF_BIT - 1) & 63, RT_BIT_32(1) /*fMask=C*/);
1372 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1373 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1374 pCodeBuf[off++] = Armv8A64MkInstrAdcs(idxRegDst, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/);
1375 else
1376 {
1377 /* Since we're also adding in the carry flag here, shifting operands up
1378 doesn't work. So, we have to calculate carry & overflow manually. */
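    /* (With shifted-up operands ADCS would inject the incoming CF at bit 0 of the
       host register rather than at the bottom bit of the shifted operands, so the
       shift-up trick used by the plain ADD/SUB emitters cannot be reused here.) */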
1379 pCodeBuf[off++] = Armv8A64MkInstrAdc(idxRegDst, idxRegDst, idxRegSrc, false /*f64Bit*/);
1380 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegDst, a_cOpBits > 8); /* NZ are okay, CV aren't.*/
1381 }
1382 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1383
1384 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl, a_cOpBits, idxRegDst,
1385 idxRegDstIn, idxRegSrc, false /*fInvertCarry*/, 0);
1386
1387 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1388 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1389 if RT_CONSTEXPR_IF(a_cOpBits < 32)
1390 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegDst, RT_BIT_32(a_cOpBits) - 1U);
1391 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1392
1393#else
1394# error "port me"
1395#endif
1396 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
1397 return off;
1398}
1399
1400
1401/**
1402 * The ADC instruction with immediate value as right operand.
1403 */
1404template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1405DECL_INLINE_THROW(uint32_t)
1406iemNativeEmit_adc_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1407{
1408 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1409 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
1410
1411#ifdef RT_ARCH_AMD64
1412 /* On AMD64 we use BT to set EFLAGS.CF and then issue an ADC instruction
1413 with matching size to get the correct flags. */
1414 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 12);
1415
1416 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b, 0xba, 32 /*cOpBits*/, 4, idxRegEfl);
1417 pCodeBuf[off++] = X86_EFL_CF_BIT;
1418
1419 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 2, idxRegDst, uImmOp);
1420 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1421
1422 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1423
1424 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl);
1425
1426#elif defined(RT_ARCH_ARM64)
1427 /* On ARM64 we use the RMIF instruction to load PSTATE.CF from idxRegEfl
1428 and then ADCS for the calculation. We need all inputs and result for
1429 the two flags (AF,PF) that can't be directly derived from PSTATE.NZCV. */
1430 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1431 uint8_t const idxRegImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1432 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4);
1433
1434 pCodeBuf[off++] = Armv8A64MkInstrRmif(idxRegEfl, (X86_EFL_CF_BIT - 1) & 63, RT_BIT_32(1) /*fMask=C*/);
1435 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1436 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1437 pCodeBuf[off++] = Armv8A64MkInstrAdcs(idxRegDst, idxRegDst, idxRegImm, a_cOpBits > 32 /*f64Bit*/);
1438 else
1439 {
1440 /* Since we're also adding in the carry flag here, shifting operands up
1441 doesn't work. So, we have to calculate carry & overflow manually. */
1442 pCodeBuf[off++] = Armv8A64MkInstrAdc(idxRegDst, idxRegDst, idxRegImm, false /*f64Bit*/);
1443 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegDst, a_cOpBits > 8); /* NZ are okay, CV aren't.*/
1444 }
1445 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1446
1447 iemNativeRegFreeTmp(pReNative, idxRegImm);
1448
1449 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl, a_cOpBits, idxRegDst,
1450 idxRegDstIn, UINT8_MAX, false /*fInvertCarry*/, uImmOp);
1451
1452 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1453 if RT_CONSTEXPR_IF(a_cOpBits < 32)
1454 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegDst, RT_BIT_32(a_cOpBits) - 1U);
1455 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1456
1457#else
1458# error "port me"
1459#endif
1460 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
1461 return off;
1462}
1463
1464
1465/**
1466 * The SUB instruction will set all status flags.
1467 */
1468template<uint8_t const a_cOpBits>
1469DECL_INLINE_THROW(uint32_t)
1470iemNativeEmit_sub_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1471{
1472 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1473 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1474
1475#ifdef RT_ARCH_AMD64
1476 /* On AMD64 we just use the correctly sized SUB instruction to get the right EFLAGS.SF value. */
1477 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1478 0x2a, 0x2b, a_cOpBits, idxRegDst, idxRegSrc);
1479 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1480
1481 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1482 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1483
1484 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1485
1486#elif defined(RT_ARCH_ARM64)
1487 /* On ARM64 we'll need the two input operands as well as the result in order
1488 to calculate the right flags, even though we use SUBS and translate NZCV into
1489 OF, CF, ZF and SF. */
1490 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1491 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4);
1492 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1493 {
1494 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1495 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegDst, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1496 }
1497 else
1498 {
1499 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1500 uint32_t const cShift = 32 - a_cOpBits;
1501 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDstIn, ARMV8_A64_REG_XZR, idxRegDst, false /*f64Bit*/, cShift);
1502 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegDst, idxRegDstIn, idxRegSrc, false /*f64Bit*/,
1503 true /*fSetFlags*/, cShift);
1504 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDstIn, idxRegDstIn, cShift, false /*f64Bit*/);
1505 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDst, idxRegDst, cShift, false /*f64Bit*/);
1506 }
1507 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1508
1509 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, a_cOpBits > 32 ? a_cOpBits : 32, idxRegDst,
1510 idxRegDstIn, idxRegSrc, true /*fInvertCarry*/, 0);
1511
1512 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1513 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1514 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1515
1516#else
1517# error "port me"
1518#endif
1519 return off;
1520}
1521
1522
1523/**
1524 * The SUB instruction with immediate value as right operand.
1525 */
1526template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1527DECL_INLINE_THROW(uint32_t)
1528iemNativeEmit_sub_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1529{
1530 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1531
1532#ifdef RT_ARCH_AMD64
1533 /* On AMD64 we just use the correctly sized SUB instruction to get the right EFLAGS.SF value. */
1534 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1535 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 5, idxRegDst, uImmOp);
1536 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1537
1538 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1539
1540 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1541
1542#elif defined(RT_ARCH_ARM64)
1543 /* On ARM64 we'll need the two input operands as well as the result in order
1544 to calculate the right flags, even though we use SUBS and translate NZCV into
1545 OF, CF, ZF and SF. */
1546 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1547 PIEMNATIVEINSTR pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1548 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1549 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1550 {
1551 if (uImmOp <= 0xfffU)
1552 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegDst, idxRegDst, uImmOp, a_cOpBits > 32 /*f64Bit*/,
1553 true /*fSetFlags*/);
1554 else if (uImmOp <= 0xfff000U && !(uImmOp & 0xfff))
1555 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegDst, idxRegDst, uImmOp >> 12, a_cOpBits > 32 /*f64Bit*/,
1556 true /*fSetFlags*/, true /*fShift12*/);
1557 else
1558 {
1559 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1560 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1561 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegDst, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/,
1562 true /*fSetFlags*/);
1563 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1564 }
1565 }
1566 else
1567 {
1568 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1569 uint32_t const cShift = 32 - a_cOpBits;
1570 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1571 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4);
1572 pCodeBuf[off++] = Armv8A64MkInstrLslImm(idxRegDstIn, idxRegDstIn, cShift, false /*f64Bit*/);
1573 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegDst, idxRegDstIn, idxRegTmpImm, false /*f64Bit*/, true /*fSetFlags*/, cShift);
1574 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDstIn, idxRegDstIn, cShift, false /*f64Bit*/);
1575 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDst, idxRegDst, cShift, false /*f64Bit*/);
1576 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1577 }
1578 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1579
1580 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, a_cOpBits > 32 ? a_cOpBits : 32, idxRegDst,
1581 idxRegDstIn, UINT8_MAX, true /*fInvertCarry*/, uImmOp);
1582
1583 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1584 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1585
1586#else
1587# error "port me"
1588#endif
1589 return off;
1590}
1591
1592
1593/**
1594 * The CMP instruction will set all status flags, but modifies no registers.
1595 */
1596template<uint8_t const a_cOpBits>
1597DECL_INLINE_THROW(uint32_t)
1598iemNativeEmit_cmp_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1599{
1600 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1601 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1602
1603#ifdef RT_ARCH_AMD64
1604 /* On AMD64 we just use the correctly sized CMP instruction to get the right EFLAGS.SF value. */
1605 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1606 0x3a, 0x3b, a_cOpBits, idxRegDst, idxRegSrc);
1607 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1608
1609 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1610 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1611
1612 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1613
1614#elif defined(RT_ARCH_ARM64)
1615 /* On ARM64 we'll need the actual result as well as both input operands in order
1616 to calculate the right flags, even though we use SUBS and translate NZCV into
1617 OF, CF, ZF and SF. */
1618 uint8_t const idxRegResult = iemNativeRegAllocTmp(pReNative, &off);
1619 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 3);
1620 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1621 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegResult, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1622 else
1623 {
1624 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1625 uint32_t const cShift = 32 - a_cOpBits;
1626 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegResult, ARMV8_A64_REG_XZR, idxRegDst, false /*f64Bit*/, cShift);
1627 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegResult, idxRegResult, idxRegSrc, false /*f64Bit*/,
1628 true /*fSetFlags*/, cShift);
1629 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegResult, idxRegResult, cShift, false /*f64Bit*/);
1630 }
1631 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1632
1633 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, a_cOpBits > 32 ? a_cOpBits : 32, idxRegResult,
1634 idxRegDst, idxRegSrc, true /*fInvertCarry*/, 0);
1635
1636 iemNativeRegFreeTmp(pReNative, idxRegResult);
1637 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1638 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1639
1640#else
1641# error "port me"
1642#endif
1643 return off;
1644}
1645
1646
1647/**
1648 * The CMP instruction with immediate value as right operand.
1649 */
1650template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1651DECL_INLINE_THROW(uint32_t)
1652iemNativeEmit_cmp_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1653{
1654 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1655
1656#ifdef RT_ARCH_AMD64
1657 /* On AMD64 we just use the correctly sized CMP instruction to get the right EFLAGS.SF value. */
1658 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1659 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 7, idxRegDst, uImmOp);
1660 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1661
1662 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1663
1664 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1665
1666#elif defined(RT_ARCH_ARM64)
1667 /* On ARM64 we'll need the actual result as well as both input operands in order
1668 to calculate the right flags, even though we use SUBS and translate NZCV into
1669 OF, CF, ZF and SF. */
1670 uint8_t const idxRegResult = iemNativeRegAllocTmp(pReNative, &off);
1671 PIEMNATIVEINSTR pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1672 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1673 {
1674 if (uImmOp <= 0xfffU)
1675 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegResult, idxRegDst, uImmOp, a_cOpBits > 32 /*f64Bit*/,
1676 true /*fSetFlags*/);
1677 else if (uImmOp <= 0xfff000U && !(uImmOp & 0xfff))
1678 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegResult, idxRegDst, uImmOp >> 12, a_cOpBits > 32 /*f64Bit*/,
1679 true /*fSetFlags*/, true /*fShift12*/);
1680 else
1681 {
1682 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1683 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1684 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegResult, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/,
1685 true /*fSetFlags*/);
1686 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1687 }
1688 }
1689 else
1690 {
1691 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1692 uint32_t const cShift = 32 - a_cOpBits;
1693 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1694 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 3);
1695 pCodeBuf[off++] = Armv8A64MkInstrLslImm(idxRegResult, idxRegDst, cShift, false /*f64Bit*/);
1696 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegResult, idxRegResult, idxRegTmpImm, false /*f64Bit*/, true /*fSetFlags*/, cShift);
1697 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegResult, idxRegResult, cShift, false /*f64Bit*/);
1698 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1699 }
1700 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1701
1702 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, a_cOpBits > 32 ? a_cOpBits : 32, idxRegResult,
1703 idxRegDst, UINT8_MAX, true /*fInvertCarry*/, uImmOp);
1704
1705 iemNativeRegFreeTmp(pReNative, idxRegResult);
1706 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1707
1708#else
1709# error "port me"
1710#endif
1711 return off;
1712}
1713
1714
1715/**
1716 * The SBB instruction takes CF as input and will set all status flags.
1717 */
1718template<uint8_t const a_cOpBits>
1719DECL_INLINE_THROW(uint32_t)
1720iemNativeEmit_sbb_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1721{
1722 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1723 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1724 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
1725
1726#ifdef RT_ARCH_AMD64
1727 /* On AMD64 we use BT to set EFLAGS.CF and then issue an SBB instruction
1728 with matching size to get the correct flags. */
1729 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 9);
1730
1731 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b, 0xba, 32 /*cOpBits*/, 4, idxRegEfl);
1732 pCodeBuf[off++] = X86_EFL_CF_BIT;
1733
1734 off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0x1a, 0x1b, a_cOpBits, idxRegDst, idxRegSrc);
1735 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1736
1737 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1738 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1739
1740 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl);
1741
1742#elif defined(RT_ARCH_ARM64)
1743 /* On ARM64 we use the RMIF+CFINV instructions to load PSTATE.CF from
1744 idxRegEfl and then SBCS for the calculation. We need all inputs and
1745 result for the two flags (AF,PF) that can't be directly derived from
1746 PSTATE.NZCV. */
1747 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1748 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
1749
1750 pCodeBuf[off++] = Armv8A64MkInstrRmif(idxRegEfl, (X86_EFL_CF_BIT - 1) & 63, RT_BIT_32(1) /*fMask=C*/);
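    /* The AArch64 carry is the inverse of the x86 borrow for subtractions, hence
       the CFINV after loading PSTATE.C from EFLAGS.CF (and the fInvertCarry=true
       passed to the EFLAGS helper below). */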
1751 pCodeBuf[off++] = ARMV8_A64_INSTR_CFINV;
1752 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1753 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1754 pCodeBuf[off++] = Armv8A64MkInstrSbcs(idxRegDst, idxRegDst, idxRegSrc, a_cOpBits > 32 /*f64Bit*/);
1755 else
1756 {
1757 /* Since the borrow (carry) flag also goes into the calculation here, shifting operands up
1758 doesn't work. So, we have to calculate carry & overflow manually. */
1759 pCodeBuf[off++] = Armv8A64MkInstrSbc(idxRegDst, idxRegDst, idxRegSrc, false /*f64Bit*/);
1760 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegDst, a_cOpBits > 8); /* NZ are okay, CV aren't.*/
1761 }
1762 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1763
1764 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl, a_cOpBits, idxRegDst,
1765 idxRegDstIn, idxRegSrc, true /*fInvertCarry*/, 0);
1766
1767 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1768 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1769 if RT_CONSTEXPR_IF(a_cOpBits < 32)
1770 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegDst, RT_BIT_32(a_cOpBits) - 1U);
1771 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1772
1773#else
1774# error "port me"
1775#endif
1776 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
1777 return off;
1778}
1779
1780
1781/**
1782 * The SBB instruction with immediate value as right operand.
1783 */
1784template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1785DECL_INLINE_THROW(uint32_t)
1786iemNativeEmit_sbb_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1787{
1788 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1789 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
1790
1791#ifdef RT_ARCH_AMD64
1792 /* On AMD64 we use BT to set EFLAGS.CF and then issue an SBB instruction
1793 with matching size to get the correct flags. */
1794 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 12);
1795
1796 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b, 0xba, 32 /*cOpBits*/, 4, idxRegEfl);
1797 pCodeBuf[off++] = X86_EFL_CF_BIT;
1798
1799 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 3, idxRegDst, uImmOp);
1800 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1801
1802 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1803
1804 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl);
1805
1806#elif defined(RT_ARCH_ARM64)
1807 /* On ARM64 we use the RMIF+CFINV instructions to load PSTATE.CF from
1808 idxRegEfl and then SBCS for the calculation. We need all inputs and
1809 result for the two flags (AF,PF) that can't be directly derived from
1810 PSTATE.NZCV. */
1811 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1812 uint8_t const idxRegImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1813 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
1814
1815 pCodeBuf[off++] = Armv8A64MkInstrRmif(idxRegEfl, (X86_EFL_CF_BIT - 1) & 63, RT_BIT_32(1) /*fMask=C*/);
1816 pCodeBuf[off++] = ARMV8_A64_INSTR_CFINV;
1817 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1818 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1819 pCodeBuf[off++] = Armv8A64MkInstrSbcs(idxRegDst, idxRegDst, idxRegImm, a_cOpBits > 32 /*f64Bit*/);
1820 else
1821 {
1822 /* Since the borrow (carry) flag also goes into the calculation here, shifting operands up
1823 doesn't work. So, we have to calculate carry & overflow manually. */
1824 pCodeBuf[off++] = Armv8A64MkInstrSbc(idxRegDst, idxRegDst, idxRegImm, false /*f64Bit*/);
1825 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegDst, a_cOpBits > 8); /* NZ are okay, CV aren't.*/
1826 }
1827 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1828
1829 iemNativeRegFreeTmp(pReNative, idxRegImm);
1830
1831 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl, a_cOpBits, idxRegDst,
1832 idxRegDstIn, UINT8_MAX, true /*fInvertCarry*/, uImmOp);
1833
1834 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1835 if RT_CONSTEXPR_IF(a_cOpBits < 32)
1836 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegDst, RT_BIT_32(a_cOpBits) - 1U);
1837 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1838
1839#else
1840# error "port me"
1841#endif
1842 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
1843 return off;
1844}
1845
1846
1847template<uint8_t const a_cOpBits>
1848DECL_INLINE_THROW(uint32_t)
1849iemNativeEmit_imul_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1850{
1851 RT_NOREF(idxVarDst, idxVarSrc, idxVarEfl);
1852 AssertFailed();
1853 return iemNativeEmitBrk(pReNative, off, 0x666);
1854}
1855
1856
1857template<uint8_t const a_cOpBits>
1858DECL_INLINE_THROW(uint32_t)
1859iemNativeEmit_popcnt_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1860{
1861 RT_NOREF(idxVarDst, idxVarSrc, idxVarEfl);
1862 AssertFailed();
1863 return iemNativeEmitBrk(pReNative, off, 0x666);
1864}
1865
1866
1867template<uint8_t const a_cOpBits>
1868DECL_INLINE_THROW(uint32_t)
1869iemNativeEmit_tzcnt_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1870{
1871 RT_NOREF(idxVarDst, idxVarSrc, idxVarEfl);
1872 AssertFailed();
1873 return iemNativeEmitBrk(pReNative, off, 0x666);
1874}
1875
1876
1877template<uint8_t const a_cOpBits>
1878DECL_INLINE_THROW(uint32_t)
1879iemNativeEmit_lzcnt_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl)
1880{
1881 RT_NOREF(idxVarDst, idxVarSrc, idxVarEfl);
1882 AssertFailed();
1883 return iemNativeEmitBrk(pReNative, off, 0x666);
1884}
1885
1886
1887
1888/*********************************************************************************************************************************
1889* Shifting and Rotating. *
1890*********************************************************************************************************************************/
1891
1892
1893typedef enum
1894{
1895 kIemNativeEmitEFlagsForShiftType_Left,
1896 kIemNativeEmitEFlagsForShiftType_Right,
1897 kIemNativeEmitEFlagsForShiftType_SignedRight
1898} IEMNATIVEEMITEFLAGSFORSHIFTTYPE;
1899
1900/**
1901 * This is used by SHL, SHR and SAR emulation.
1902 *
1903 * It takes liveness stuff into account.
1904 */
1905DECL_INLINE_THROW(uint32_t)
1906iemNativeEmitEFlagsForShift(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxRegEfl, uint8_t idxRegResult,
1907 uint8_t idxRegSrc, uint8_t idxRegCount, uint8_t cOpBits, IEMNATIVEEMITEFLAGSFORSHIFTTYPE enmType,
1908 uint8_t idxRegTmp)
1909{
1910 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflTotalShift);
1911
1912RT_NOREF(pReNative, off, idxRegEfl, idxRegResult, idxRegSrc, idxRegCount, cOpBits, enmType);
1913#if 0 //def IEMNATIVE_WITH_EFLAGS_SKIPPING
1914 /*
1915 * See if we can skip this wholesale.
1916 */
1917 PCIEMLIVENESSENTRY const pLivenessEntry = &pReNative->paLivenessEntries[pReNative->idxCurCall];
1918 if ( IEMLIVENESS_STATE_ARE_STATUS_EFL_TO_BE_CLOBBERED(pLivenessEntry)
1919 && !(pReNative->fMc & IEM_MC_F_WITH_FLAGS))
1920 {
1921 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflSkippedShift);
1922 pReNative->fSkippingEFlags |= X86_EFL_STATUS_BITS;
1923# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
1924 off = iemNativeEmitOrImmIntoVCpuU32(pReNative, off, X86_EFL_STATUS_BITS, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
1925# endif
1926 }
1927 else
1928#endif
1929 {
1930 /*
1931 * The differences between Intel and AMD flags for SHL are:
1932 * - Intel always clears AF while AMD always sets it.
1933 * - Intel sets OF for the first shift, while AMD for the last shift.
1934 *
1935 */
1936
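    /* Rough scalar reference for the two OF flavours of SHL (illustrative only,
       assuming fCarry has already been computed as the last bit shifted out):
           if (fIntel) // OF reflects the FIRST shift step
               fOf = ((uSrc >> (cOpBits - 1)) ^ (uSrc >> (cOpBits - 2))) & 1;
           else        // AMD: OF reflects the LAST shift step
               fOf = ((uResult >> (cOpBits - 1)) & 1) ^ fCarry;
    */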
1937#ifdef RT_ARCH_AMD64
1938 /*
1939 * We capture the flags and do the additional OF and AF calculations as needed.
1940 */
1941 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 64);
1942 /** @todo kIemNativeEmitEFlagsForShiftType_SignedRight: we could alternatively
1943 * use LAHF here when host rax is free, since OF is cleared. */
1944 /* pushf */
1945 pCodeBuf[off++] = 0x9c;
1946 /* pop tmp */
1947 if (idxRegTmp >= 8)
1948 pCodeBuf[off++] = X86_OP_REX_B;
1949 pCodeBuf[off++] = 0x58 + (idxRegTmp & 7);
1950 /* Clear the status bits in EFLs. */
1951 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegEfl, ~X86_EFL_STATUS_BITS);
1952 uint8_t const idxTargetCpuEflFlavour = pReNative->pVCpu->iem.s.aidxTargetCpuEflFlavour[1];
1953 if (idxTargetCpuEflFlavour == IEMTARGETCPU_EFL_BEHAVIOR_NATIVE)
1954 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegTmp, X86_EFL_STATUS_BITS);
1955 else
1956 {
1957 /* and tmp, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_CF */
1958 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegTmp, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_CF);
1959 if (idxTargetCpuEflFlavour == IEMTARGETCPU_EFL_BEHAVIOR_AMD)
1960 off = iemNativeEmitOrGpr32ByImmEx(pCodeBuf, off, idxRegTmp, X86_EFL_AF);
1961 /* OR in the flags we collected. */
1962 off = iemNativeEmitOrGpr32ByGprEx(pCodeBuf, off, idxRegEfl, idxRegTmp);
1963
1964 /* Calculate OF */
1965 if (idxTargetCpuEflFlavour == IEMTARGETCPU_EFL_BEHAVIOR_AMD)
1966 {
1967 /* AMD last bit shifted: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
1968 /* bt idxRegResult, (cOpBits - 1) => CF=result-sign-bit */
1969 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b /*ud2*/, 0xba,
1970 RT_MAX(cOpBits, 16), 4, idxRegResult);
1971 pCodeBuf[off++] = cOpBits - 1;
1972 /* setc idxRegTmp */
1973 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x92, 0x0b /*ud2*/, 8, 0, idxRegTmp);
1974 /* xor idxRegTmp, idxRegEfl */
1975 off = iemNativeEmitXorGpr32ByGpr32Ex(pCodeBuf, off, idxRegTmp, idxRegEfl);
1976 /* and idxRegTmp, 1 */
1977 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegTmp, 1);
1978 /* shl idxRegTmp, X86_EFL_OF_BIT */
1979 off = iemNativeEmitShiftGpr32LeftEx(pCodeBuf, off, idxRegTmp, X86_EFL_OF_BIT);
1980 }
1981 else
1982 {
1983 /* Intel first bit shifted: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
1984 if (cOpBits <= 32)
1985 {
1986 /* mov idxRegTmp, idxRegSrc */
1987 off = iemNativeEmitLoadGprFromGpr32Ex(pCodeBuf, off, idxRegTmp, idxRegSrc);
1988 /* shl idxRegTmp, 1 */
1989 off = iemNativeEmitShiftGpr32LeftEx(pCodeBuf, off, idxRegTmp, 1);
1990 /* xor idxRegTmp, idxRegSrc */
1991 off = iemNativeEmitXorGprByGprEx(pCodeBuf, off, idxRegTmp, idxRegSrc);
1992 /* shr idxRegTmp, cOpBits - X86_EFL_OF_BIT - 1 or shl idxRegTmp, X86_EFL_OF_BIT - cOpBits + 1 */
1993 if (cOpBits >= X86_EFL_OF_BIT)
1994 off = iemNativeEmitShiftGpr32RightEx(pCodeBuf, off, idxRegTmp, cOpBits - X86_EFL_OF_BIT - 1);
1995 else
1996 off = iemNativeEmitShiftGpr32LeftEx(pCodeBuf, off, idxRegTmp, X86_EFL_OF_BIT - cOpBits + 1);
1997 }
1998 else
1999 {
2000 /* Same as above but with 64-bit GPRs. */
2001 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegTmp, idxRegSrc);
2002 off = iemNativeEmitShiftGprLeftEx(pCodeBuf, off, idxRegTmp, 1);
2003 off = iemNativeEmitXorGprByGprEx(pCodeBuf, off, idxRegTmp, idxRegSrc);
2004 off = iemNativeEmitShiftGprRightEx(pCodeBuf, off, idxRegTmp, cOpBits - X86_EFL_OF_BIT - 1);
2005 }
2006 /* and idxRegTmp, X86_EFL_OF */
2007 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegTmp, X86_EFL_OF);
2008 }
2009 }
2010 /* Or in the collected flag(s) */
2011 off = iemNativeEmitOrGpr32ByGprEx(pCodeBuf, off, idxRegEfl, idxRegTmp);
2012
2013#elif defined(RT_ARCH_ARM64)
2014 /*
2015 * Calculate flags.
2016 */
2017 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 20);
2018
2019 /* Clear the status bits. ~0x8D5 (or ~0x8FD) can't be AND immediate, so use idxRegTmp for constant. */
2020 off = iemNativeEmitLoadGpr32ImmEx(pCodeBuf, off, idxRegTmp, ~X86_EFL_STATUS_BITS);
2021 off = iemNativeEmitAndGpr32ByGpr32Ex(pCodeBuf, off, idxRegEfl, idxRegTmp);
2022
2023 /* N,Z -> SF,ZF */
2024 if (cOpBits < 32)
2025 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegResult, cOpBits > 8); /* sets NZ */
2026 else
2027 pCodeBuf[off++] = Armv8A64MkInstrAnds(ARMV8_A64_REG_XZR, idxRegResult, idxRegResult, cOpBits > 32 /*f64Bit*/);
2028 pCodeBuf[off++] = Armv8A64MkInstrMrs(idxRegTmp, ARMV8_AARCH64_SYSREG_NZCV); /* Bits: 31=N; 30=Z; 29=C; 28=V; */
2029 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegTmp, 30);
2030 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_ZF_BIT, 2, false /*f64Bit*/);
2031 AssertCompile(X86_EFL_ZF_BIT + 1 == X86_EFL_SF_BIT);
2032
2033 /* Calculate 8-bit parity of the result. */
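    /* The three EOR+LSR steps below fold the low byte onto itself, i.e. the scalar
       x ^= x >> 4; x ^= x >> 2; x ^= x >> 1; leaving the XOR of bits 0..7 in bit 0.
       The final EOR with 1 inverts that bit, since x86 PF means EVEN parity. */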
2034 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegResult, idxRegResult, false /*f64Bit*/,
2035 4 /*offShift6*/, kArmv8A64InstrShift_Lsr);
2036 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegTmp, idxRegTmp, false /*f64Bit*/,
2037 2 /*offShift6*/, kArmv8A64InstrShift_Lsr);
2038 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegTmp, idxRegTmp, false /*f64Bit*/,
2039 1 /*offShift6*/, kArmv8A64InstrShift_Lsr);
2040 Assert(Armv8A64ConvertImmRImmS2Mask32(0, 0) == 1);
2041 pCodeBuf[off++] = Armv8A64MkInstrEorImm(idxRegTmp, idxRegTmp, 0, 0, false /*f64Bit*/);
2042 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_PF_BIT, 1, false /*f64Bit*/);
2043
2044 /* Calculate carry - the last bit shifted out of the input value. */
2045 if (enmType == kIemNativeEmitEFlagsForShiftType_Left)
2046 {
2047 /* CF = (idxRegSrc >> (cOpBits - idxRegCount)) & 1 */
2048 pCodeBuf[off++] = Armv8A64MkInstrMovZ(idxRegTmp, cOpBits);
2049 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegTmp, idxRegTmp, idxRegCount, false /*f64Bit*/, cOpBits < 32 /*fSetFlags*/);
2050 if (cOpBits < 32)
2051 pCodeBuf[off++] = Armv8A64MkInstrBCond(kArmv8InstrCond_Cc, 3); /* 16 or 8 bit: CF is clear if all shifted out */
2052 pCodeBuf[off++] = Armv8A64MkInstrLsrv(idxRegTmp, idxRegSrc, idxRegTmp, cOpBits > 32);
2053 }
2054 else
2055 {
2056 /* CF = (idxRegSrc >> (idxRegCount - 1)) & 1 */
2057 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegTmp, idxRegCount, 1, false /*f64Bit*/);
2058 pCodeBuf[off++] = Armv8A64MkInstrLsrv(idxRegTmp, idxRegSrc, idxRegTmp, cOpBits > 32);
2059 }
2060 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_CF_BIT, 1, false /*f64Bit*/);
2061
2062 uint8_t const idxTargetCpuEflFlavour = pReNative->pVCpu->iem.s.aidxTargetCpuEflFlavour[0];
2063 if (idxTargetCpuEflFlavour != IEMTARGETCPU_EFL_BEHAVIOR_AMD)
2064 {
2065 /* Intel: OF = first bit shifted: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
2066 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegSrc, idxRegSrc, cOpBits > 32, 1 /*left shift count*/);
2067 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegTmp, cOpBits - 1, cOpBits > 32);
2068 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_OF_BIT, 1, false /*f64Bit*/);
2069 }
2070 else
2071 {
2072 /* AMD: OF = last bit shifted: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
2073 AssertCompile(X86_EFL_CF_BIT == 0);
2074 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegEfl, idxRegResult, cOpBits > 32, /* ASSUMES CF calculated! */
2075 cOpBits - 1, kArmv8A64InstrShift_Lsr);
2076 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_OF_BIT, 1, false /*f64Bit*/);
2077
2078 /* AMD unconditionally sets AF. */
2079 Assert(Armv8A64ConvertImmRImmS2Mask32(0, 32 - X86_EFL_AF_BIT) == X86_EFL_AF);
2080 pCodeBuf[off++] = Armv8A64MkInstrOrrImm(idxRegEfl, idxRegEfl, 0, 32 - X86_EFL_AF_BIT, false /*f64Bit*/);
2081 }
2082#else
2083# error "port me"
2084#endif
2085 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2086
2087#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
2088 if (pReNative->fSkippingEFlags)
2089 Log5(("EFLAGS: fSkippingEFlags %#x -> 0 (iemNativeEmitEFlagsForShift)\n", pReNative->fSkippingEFlags));
2090 pReNative->fSkippingEFlags = 0;
2091# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
2092 off = iemNativeEmitStoreImmToVCpuU32(pReNative, off, 0, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
2093# endif
2094#endif
2095 }
2096 return off;
2097}
2098
2099
2100DECL_INLINE_THROW(uint32_t)
2101iemNativeEmit_shl_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2102 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2103{
2104 /* Note! Since we're doing some branching here, we need to allocate all
2105 registers we need before the jump or we may end up with invalid
2106 register state if the branch is taken. */
2107 uint8_t const idxRegTmp = iemNativeRegAllocTmp(pReNative, &off); /* Do this first in hope we'll get EAX. */
2108 uint8_t const idxRegCount = iemNativeVarRegisterAcquire(pReNative, idxVarCount, &off, true /*fInitialized*/); /* modified on arm64 */
2109 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
2110 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
2111
2112#ifdef RT_ARCH_AMD64
2113 /* Make sure IEM_MC_NATIVE_AMD64_HOST_REG_FOR_LOCAL was used. */
2114 AssertStmt(idxRegCount == X86_GREG_xCX, IEMNATIVE_DO_LONGJMP(pReNative, VERR_IEM_EMIT_UNEXPECTED_VAR_REGISTER));
2115
2116 /* We only need a copy of the input value if the target CPU differs from the host CPU. */
2117 uint8_t const idxRegDstIn = pReNative->pVCpu->iem.s.aidxTargetCpuEflFlavour[1] == IEMTARGETCPU_EFL_BEHAVIOR_NATIVE
2118 ? UINT8_MAX : iemNativeRegAllocTmp(pReNative, &off);
2119 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4+2+3+4);
2120
2121 /* Check if it's NOP before we do anything. */
2122 off = iemNativeEmitTestAnyBitsInGpr8Ex(pCodeBuf, off, idxRegCount, cOpBits <= 32 ? 0x1f : 0x3f);
2123 uint32_t const offFixup = off;
2124 off = iemNativeEmitJccToFixedEx(pCodeBuf, off, off /*8-bit should be enough */, kIemNativeInstrCond_z);
2125
2126 if (idxRegDstIn != UINT8_MAX)
2127 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
2128 off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0xd2, 0xd3, cOpBits, 4, idxRegDst);
2129
2130#elif defined(RT_ARCH_ARM64)
2131 /* We always need a copy of the input value (unless we can skip the EFLAGS calcs). */
2132 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
2133 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6);
2134
2135 /* Check if it's NOP before we do anything. We MODIFY idxRegCount here! */
2136 Assert(Armv8A64ConvertImmRImmS2Mask32(4, 0) == 0x1f);
2137 Assert(Armv8A64ConvertImmRImmS2Mask32(5, 0) == 0x3f);
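    /* The ANDS below both masks the count the way x86 does (mod 32, or mod 64 for
       64-bit operands) and sets Z so the following branch can skip the whole
       operation for a zero count. */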
2138 pCodeBuf[off++] = Armv8A64MkInstrAndsImm(idxRegCount, idxRegCount, cOpBits > 32 ? 5 : 4, 0, false /*f64Bit*/);
2139 uint32_t const offFixup = off;
2140 off = iemNativeEmitJccToFixedEx(pCodeBuf, off, off, kArmv8InstrCond_Eq);
2141
2142 pCodeBuf[off++] = Armv8A64MkInstrMov(idxRegDstIn, idxRegDst);
2143 pCodeBuf[off++] = Armv8A64MkInstrLslv(idxRegDst, idxRegDst, idxRegCount, cOpBits > 32 /*f64Bit*/);
2144 if (cOpBits < 32)
2145 {
2146 Assert(Armv8A64ConvertImmRImmS2Mask32(7, 0) == 0xff);
2147 Assert(Armv8A64ConvertImmRImmS2Mask32(15, 0) == 0xffff);
2148 pCodeBuf[off++] = Armv8A64MkInstrAndImm(idxRegDst, idxRegDst, cOpBits - 1, 0, false /*f64Bit*/);
2149 }
2150
2151#else
2152# error "port me"
2153#endif
2154
2155 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2156 off = iemNativeEmitEFlagsForShift(pReNative, off, idxRegEfl, idxRegDst, idxRegDstIn, idxRegCount,
2157 cOpBits, kIemNativeEmitEFlagsForShiftType_Left, idxRegTmp);
2158
2159 /* fixup the jump */
2160 iemNativeFixupFixedJump(pReNative, offFixup, off);
2161
2162#ifdef RT_ARCH_AMD64
2163 if (idxRegDstIn != UINT8_MAX)
2164#endif
2165 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
2166 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
2167 iemNativeVarRegisterRelease(pReNative, idxVarDst);
2168 iemNativeVarRegisterRelease(pReNative, idxVarCount);
2169 iemNativeRegFreeTmp(pReNative, idxRegTmp);
2170 return off;
2171}
2172
2173
2174DECL_INLINE_THROW(uint32_t)
2175iemNativeEmit_shr_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2176 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2177{
2178 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2179 AssertFailed();
2180 return iemNativeEmitBrk(pReNative, off, 0x666);
2181}
2182
2183
2184DECL_INLINE_THROW(uint32_t)
2185iemNativeEmit_sar_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2186 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2187{
2188 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2189 AssertFailed();
2190 return iemNativeEmitBrk(pReNative, off, 0x666);
2191}
2192
2193
2194DECL_INLINE_THROW(uint32_t)
2195iemNativeEmit_rol_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2196 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2197{
2198 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2199 AssertFailed();
2200 return iemNativeEmitBrk(pReNative, off, 0x666);
2201}
2202
2203
2204DECL_INLINE_THROW(uint32_t)
2205iemNativeEmit_ror_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2206 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2207{
2208 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2209 AssertFailed();
2210 return iemNativeEmitBrk(pReNative, off, 0x666);
2211}
2212
2213
2214DECL_INLINE_THROW(uint32_t)
2215iemNativeEmit_rcl_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2216 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2217{
2218 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2219 AssertFailed();
2220 return iemNativeEmitBrk(pReNative, off, 0x666);
2221}
2222
2223
2224DECL_INLINE_THROW(uint32_t)
2225iemNativeEmit_rcr_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2226 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2227{
2228 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2229 AssertFailed();
2230 return iemNativeEmitBrk(pReNative, off, 0x666);
2231}
2232
2233
2234
2235#ifdef IEMNATIVE_WITH_SIMD_REG_ALLOCATOR
2236/*********************************************************************************************************************************
2237* SIMD emitters. *
2238*********************************************************************************************************************************/
2239
2240/**
2241 * Common emitter for packed logical instructions (POR, PXOR, PAND & friends).
2242 */
2243#ifdef RT_ARCH_AMD64
2244# define IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(a_Instr, a_enmArmOp, a_bOpcX86) \
2245 DECL_INLINE_THROW(uint32_t) \
2246 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2247 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2248 { \
2249 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2250 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2251 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2252 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2253 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2254 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2255 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2256 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2257 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2258 pCodeBuf[off++] = 0x0f; \
2259 pCodeBuf[off++] = (a_bOpcX86); \
2260 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2261 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2262 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2263 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2264 return off; \
2265 } \
2266 DECL_INLINE_THROW(uint32_t) \
2267 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2268 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2269 { \
2270 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2271 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2272 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2273 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2274 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2275 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2276 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2277 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2278 pCodeBuf[off++] = 0x0f; \
2279 pCodeBuf[off++] = (a_bOpcX86); \
2280 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2281 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2282 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2283 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2284 return off; \
2285 } \
2286 typedef int ignore_semicolon
2287#elif defined(RT_ARCH_ARM64)
2288# define IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(a_Instr, a_enmArmOp, a_bOpcX86) \
2289 DECL_INLINE_THROW(uint32_t) \
2290 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2291 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2292 { \
2293 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2294 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2295 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2296 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2297 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2298 pCodeBuf[off++] = Armv8A64MkVecInstrLogical((a_enmArmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc); \
2299 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2300 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2301 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2302 return off; \
2303 } \
2304 DECL_INLINE_THROW(uint32_t) \
2305 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2306 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2307 { \
2308 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2309 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2310 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2311 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2312 pCodeBuf[off++] = Armv8A64MkVecInstrLogical((a_enmArmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc); \
2313 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2314 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2315 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2316 return off; \
2317 } \
2318 typedef int ignore_semicolon
2319#else
2320# error "Port me"
2321#endif
2322
2323/* POR, ORPS, ORPD. */
2324IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(por, kArmv8VecInstrLogicOp_Orr, 0xeb);
2325/* PXOR, XORPS, XORPD. */
2326IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(pxor, kArmv8VecInstrLogicOp_Eor, 0xef);
2327/* PAND, ANDPS, ANDPD. */
2328IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(pand, kArmv8VecInstrLogicOp_And, 0xdb);
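/* Each instantiation above defines a register/register and a register/variable
   emitter via RT_CONCAT3, e.g. the pand line produces iemNativeEmit_pand_rr_u128()
   and iemNativeEmit_pand_rv_u128(). */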
2329
2330
2331/**
2332 * Common emitter for the shift right with immediate instructions.
2333 */
2334#ifdef RT_ARCH_AMD64
2335# define IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(a_Instr, a_cShiftMax, a_ArmElemSz, a_bOpcX86) \
2336 DECL_INLINE_THROW(uint32_t) \
2337 RT_CONCAT3(iemNativeEmit_,a_Instr,_ri_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2338 uint8_t const idxSimdGstRegDst, uint8_t const bImm) \
2339 { \
2340 if (bImm) \
2341 { \
2342 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2343 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2344 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6); \
2345 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2346 if (idxSimdRegDst >= 8) \
2347 pCodeBuf[off++] = X86_OP_REX_B; \
2348 pCodeBuf[off++] = 0x0f; \
2349 pCodeBuf[off++] = (a_bOpcX86); \
2350 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 2, idxSimdRegDst & 7); \
2351 pCodeBuf[off++] = bImm; \
2352 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2353 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2354 } \
2355 /* Immediate 0 is a nop. */ \
2356 return off; \
2357 } \
2358 typedef int ignore_semicolon
2359#elif defined(RT_ARCH_ARM64)
2360# define IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(a_Instr, a_cShiftMax, a_ArmElemSz, a_bOpcX86) \
2361 DECL_INLINE_THROW(uint32_t) \
2362 RT_CONCAT3(iemNativeEmit_,a_Instr,_ri_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2363 uint8_t const idxSimdGstRegDst, uint8_t const bImm) \
2364 { \
2365 if (bImm) \
2366 { \
2367 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2368 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2369 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2370 pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegDst, idxSimdRegDst, RT_MIN(bImm, (a_cShiftMax)), (a_ArmElemSz)); \
2371 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2372 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2373 } \
2374 /* Immediate 0 is a nop. */ \
2375 return off; \
2376 } \
2377 typedef int ignore_semicolon
2378#else
2379# error "Port me"
2380#endif
2381
2382IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(psrlw, 16, kArmv8InstrShiftSz_U16, 0x71);
2383IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(psrld, 32, kArmv8InstrShiftSz_U32, 0x72);
2384IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(psrlq, 64, kArmv8InstrShiftSz_U64, 0x73);
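/* Note: on AMD64 these use the 0x0f 0x71/0x72/0x73 immediate forms with /2
   (shift right logical); the shift-left emitters further down use the same
   opcodes with /6. */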
2385
2386
2387/**
2388 * Common emitter for the shift left with immediate instructions.
2389 */
2390#ifdef RT_ARCH_AMD64
2391# define IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(a_Instr, a_cShiftMax, a_ArmElemSz, a_bOpcX86) \
2392 DECL_INLINE_THROW(uint32_t) \
2393 RT_CONCAT3(iemNativeEmit_,a_Instr,_ri_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2394 uint8_t const idxSimdGstRegDst, uint8_t const bImm) \
2395 { \
2396 if (bImm) \
2397 { \
2398 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2399 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2400 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6); \
2401 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2402 if (idxSimdRegDst >= 8) \
2403 pCodeBuf[off++] = X86_OP_REX_B; \
2404 pCodeBuf[off++] = 0x0f; \
2405 pCodeBuf[off++] = (a_bOpcX86); \
2406 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 6, idxSimdRegDst & 7); \
2407 pCodeBuf[off++] = bImm; \
2408 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2409 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2410 } \
2411 /* Immediate 0 is a nop. */ \
2412 return off; \
2413 } \
2414 typedef int ignore_semicolon
2415#elif defined(RT_ARCH_ARM64)
2416# define IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(a_Instr, a_cShiftMax, a_ArmElemSz, a_bOpcX86) \
2417 DECL_INLINE_THROW(uint32_t) \
2418 RT_CONCAT3(iemNativeEmit_,a_Instr,_ri_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2419 uint8_t const idxSimdGstRegDst, uint8_t const bImm) \
2420 { \
2421 if (bImm) /* bImm == 0 is a nop */ \
2422 { \
2423 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2424 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2425 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2426 if (bImm < (a_cShiftMax)) \
2427 pCodeBuf[off++] = Armv8A64MkVecInstrShlImm(idxSimdRegDst, idxSimdRegDst, bImm, (a_ArmElemSz)); \
2428 else /* Everything >= a_cShiftMax sets the register to zero. */ \
2429 pCodeBuf[off++] = Armv8A64MkVecInstrEor(idxSimdRegDst, idxSimdRegDst, idxSimdRegDst); \
2430 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2431 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2432 } \
2433 return off; \
2434 } \
2435 typedef int ignore_semicolon
2436#else
2437# error "Port me"
2438#endif
2439
2440IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(psllw, 16, kArmv8InstrShiftSz_U16, 0x71);
2441IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(pslld, 32, kArmv8InstrShiftSz_U32, 0x72);
2442IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(psllq, 64, kArmv8InstrShiftSz_U64, 0x73);
2443
2444
2445/**
2446 * Common emitter for packed arithmetic instructions.
2447 */
2448#ifdef RT_ARCH_AMD64
2449# define IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(a_Instr, a_enmArmOp, a_ArmElemSz, a_bOpcX86) \
2450 DECL_INLINE_THROW(uint32_t) \
2451 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2452 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2453 { \
2454 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2455 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2456 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2457 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2458 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2459 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2460 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2461 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2462 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2463 pCodeBuf[off++] = 0x0f; \
2464 pCodeBuf[off++] = (a_bOpcX86); \
2465 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2466 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2467 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2468 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2469 return off; \
2470 } \
2471 DECL_INLINE_THROW(uint32_t) \
2472 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2473 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2474 { \
2475 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2476 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2477 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2478 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2479 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2480 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2481 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2482 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2483 pCodeBuf[off++] = 0x0f; \
2484 pCodeBuf[off++] = (a_bOpcX86); \
2485 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2486 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2487 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2488 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2489 return off; \
2490 } \
2491 typedef int ignore_semicolon
2492#elif defined(RT_ARCH_ARM64)
2493# define IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(a_Instr, a_enmArmOp, a_ArmElemSz, a_bOpcX86) \
2494 DECL_INLINE_THROW(uint32_t) \
2495 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2496 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2497 { \
2498 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2499 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2500 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2501 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2502 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2503 pCodeBuf[off++] = Armv8A64MkVecInstrArithOp((a_enmArmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc, (a_ArmElemSz)); \
2504 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2505 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2506 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2507 return off; \
2508 } \
2509 DECL_INLINE_THROW(uint32_t) \
2510 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2511 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2512 { \
2513 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2514 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2515 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2516 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2517 pCodeBuf[off++] = Armv8A64MkVecInstrArithOp((a_enmArmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc, (a_ArmElemSz)); \
2518 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2519 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2520 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2521 return off; \
2522 } \
2523 typedef int ignore_semicolon
2524#else
2525# error "Port me"
2526#endif
2527
2528/*
2529 * PADDx.
2530 */
2531IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddb, kArmv8VecInstrArithOp_Add, kArmv8VecInstrArithSz_8, 0xfc);
2532IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddw, kArmv8VecInstrArithOp_Add, kArmv8VecInstrArithSz_16, 0xfd);
2533IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddd, kArmv8VecInstrArithOp_Add, kArmv8VecInstrArithSz_32, 0xfe);
2534IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddq, kArmv8VecInstrArithOp_Add, kArmv8VecInstrArithSz_64, 0xd4);
2535
2536/*
2537 * PSUBx.
2538 */
2539IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(psubb, kArmv8VecInstrArithOp_Sub, kArmv8VecInstrArithSz_8, 0xf8);
2540IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(psubw, kArmv8VecInstrArithOp_Sub, kArmv8VecInstrArithSz_16, 0xf9);
2541IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(psubd, kArmv8VecInstrArithOp_Sub, kArmv8VecInstrArithSz_32, 0xfa);
2542IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(psubq, kArmv8VecInstrArithOp_Sub, kArmv8VecInstrArithSz_64, 0xfb);
2543
2544/*
2545 * PADDUSx.
2546 */
2547IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddusb, kArmv8VecInstrArithOp_UnsignSat_Add, kArmv8VecInstrArithSz_8, 0xdc);
2548IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddusw, kArmv8VecInstrArithOp_UnsignSat_Add, kArmv8VecInstrArithSz_16, 0xdd);
2549
2550/*
2551 * PMULLx.
2552 */
2553IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(pmullw, kArmv8VecInstrArithOp_Mul, kArmv8VecInstrArithSz_16, 0xd5);
2554
2555
2556/**
2557 * Common emitter for the pcmpeqb/pcmpeqw/pcmpeqd instructions.
2558 */
2559#ifdef RT_ARCH_AMD64
2560# define IEMNATIVE_NATIVE_EMIT_PCMP_U128(a_Instr, a_enmOp, a_ArmElemSz, a_bOpcX86) \
2561 DECL_INLINE_THROW(uint32_t) \
2562 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2563 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2564 { \
2565 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2566 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2567 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2568 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2569 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2570 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2571 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2572 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2573 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2574 pCodeBuf[off++] = 0x0f; \
2575 pCodeBuf[off++] = (a_bOpcX86); \
2576 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2577 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2578 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2579 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2580 return off; \
2581 } \
2582 DECL_INLINE_THROW(uint32_t) \
2583 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2584 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2585 { \
2586 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2587 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2588 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2589 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2590 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2591 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2592 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2593 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2594 pCodeBuf[off++] = 0x0f; \
2595 pCodeBuf[off++] = (a_bOpcX86); \
2596 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2597 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2598 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2599 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2600 return off; \
2601 } \
2602 typedef int ignore_semicolon
2603#elif defined(RT_ARCH_ARM64)
2604# define IEMNATIVE_NATIVE_EMIT_PCMP_U128(a_Instr, a_enmOp, a_ArmElemSz, a_bOpcX86) \
2605 DECL_INLINE_THROW(uint32_t) \
2606 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2607 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2608 { \
2609 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2610 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2611 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2612 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2613 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2614 pCodeBuf[off++] = Armv8A64MkVecInstrCmp((a_enmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc, (a_ArmElemSz)); \
2615 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2616 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2617 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2618 return off; \
2619 } \
2620 DECL_INLINE_THROW(uint32_t) \
2621 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2622 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2623 { \
2624 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2625 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2626 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2627 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2628 pCodeBuf[off++] = Armv8A64MkVecInstrCmp((a_enmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc, (a_ArmElemSz)); \
2629 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2630 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2631 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2632 return off; \
2633 } \
2634 typedef int ignore_semicolon
2635#else
2636# error "Port me"
2637#endif
2638
2639IEMNATIVE_NATIVE_EMIT_PCMP_U128(pcmpeqb, kArmv8VecInstrCmpOp_Eq, kArmv8VecInstrArithSz_8, 0x74);
2640IEMNATIVE_NATIVE_EMIT_PCMP_U128(pcmpeqw, kArmv8VecInstrCmpOp_Eq, kArmv8VecInstrArithSz_16, 0x75);
2641IEMNATIVE_NATIVE_EMIT_PCMP_U128(pcmpeqd, kArmv8VecInstrCmpOp_Eq, kArmv8VecInstrArithSz_32, 0x76);
2642
2643
2644/**
2645 * Emitter for the pmovmskb instruction.
2646 */
2647DECL_INLINE_THROW(uint32_t)
2648iemNativeEmit_pmovmskb_rr_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2649 uint8_t const idxGstRegDst, uint8_t const idxSimdGstRegSrc)
2650{
2651#ifdef RT_ARCH_AMD64
2652 uint8_t const idxRegDst = iemNativeRegAllocTmpForGuestReg(pReNative, &off, IEMNATIVEGSTREG_GPR(idxGstRegDst),
2653 kIemNativeGstRegUse_ForFullWrite);
2654 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off,
2655 IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc),
2656 kIemNativeGstSimdRegLdStSz_Low128,
2657 kIemNativeGstRegUse_ReadOnly);
2658 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
2659
2660 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
2661 if (idxRegDst >= 8 || idxSimdRegSrc >= 8)
2662 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
2663 | (idxRegDst >= 8 ? X86_OP_REX_R : 0);
2664 pCodeBuf[off++] = 0x0f;
2665 pCodeBuf[off++] = 0xd7;
2666 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegDst & 7, idxSimdRegSrc & 7);
2667
2668 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc);
2669 iemNativeRegFreeTmp(pReNative, idxRegDst);
2670
2671#elif defined(RT_ARCH_ARM64)
2672 uint8_t const idxRegDst = iemNativeRegAllocTmpForGuestReg(pReNative, &off, IEMNATIVEGSTREG_GPR(idxGstRegDst),
2673 kIemNativeGstRegUse_ForFullWrite);
2674 uint8_t const idxRegTmp = iemNativeRegAllocTmp(pReNative, &off);
2675 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off,
2676 IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc),
2677 kIemNativeGstSimdRegLdStSz_Low128,
2678 kIemNativeGstRegUse_Calculation);
2679 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 7);
2680
2681 /*
2682 * See https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
2683 * for different approaches, as NEON doesn't have an instruction equivalent to pmovmskb, so we have to emulate it.
2684 *
2685 * As there is no way around emulating the exact semantics of pmovmskb, we use the same algorithm as the
2686 * sse2neon implementation, because it gets away without loading any constants and the base algorithm is
2687 * only 4 NEON instructions (+ 3 for extracting the result to a general register).
2688 *
2689 * The following illustrates the algorithm:
2690 *
2691 * Byte vector Element -> 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
2692 * Instruction
2693 * |
2694 * V
2695 * Axxxxxxx Bxxxxxxx Cxxxxxxx Dxxxxxxx Exxxxxxx Fxxxxxxx Gxxxxxxx Hxxxxxxx Ixxxxxxx Jxxxxxxx Kxxxxxxx Lxxxxxxx Mxxxxxxx Nxxxxxxx Oxxxxxxx Pxxxxxxx
2696 * USHR v.16B, v.16B, #7 0000000A 0000000B 0000000C 0000000D 0000000E 0000000F 0000000G 0000000H 0000000I 0000000J 0000000K 0000000L 0000000M 0000000N 0000000O 0000000P
2697 * USRA v.8H, v.8H, #7 00000000 000000AB 00000000 000000CD 00000000 000000EF 00000000 000000GH 00000000 000000IJ 00000000 000000KL 00000000 000000MN 00000000 000000OP
2698 * USRA v.4S, v.4S, #14 00000000 00000000 00000000 0000ABCD 00000000 00000000 00000000 0000EFGH 00000000 00000000 00000000 0000IJKL 00000000 00000000 00000000 0000MNOP
2699 * USRA v.2D, v.2D, #28 00000000 00000000 00000000 00000000 00000000 00000000 00000000 ABCDEFGH 00000000 00000000 00000000 00000000 00000000 00000000 00000000 IJKLMNOP
2700 *
2701 * The extraction process
2702 * UMOV wTMP, v.16B[8] 00000000 00000000 00000000 00000000 00000000 00000000 00000000 ABCDEFGH
2703 * UMOV wRES, v.16B[0] 00000000 00000000 00000000 00000000 00000000 00000000 00000000 IJKLMNOP
2704 * ORR xRES, xRES, xTMP, LSL #8 00000000 00000000 00000000 00000000 00000000 00000000 ABCDEFGH IJKLMNOP
2705 */
2706 pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegSrc, idxSimdRegSrc, 7, kArmv8InstrShiftSz_U8);
2707 pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegSrc, idxSimdRegSrc, 7, kArmv8InstrShiftSz_U16, true /*fUnsigned*/, false /*fRound*/, true /*fAccum*/);
2708 pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegSrc, idxSimdRegSrc, 14, kArmv8InstrShiftSz_U32, true /*fUnsigned*/, false /*fRound*/, true /*fAccum*/);
2709 pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegSrc, idxSimdRegSrc, 28, kArmv8InstrShiftSz_U64, true /*fUnsigned*/, false /*fRound*/, true /*fAccum*/);
2710 pCodeBuf[off++] = Armv8A64MkVecInstrUmov(idxRegTmp, idxSimdRegSrc, 8, kArmv8InstrUmovInsSz_U8, false /*fDst64Bit*/);
2711 pCodeBuf[off++] = Armv8A64MkVecInstrUmov(idxRegDst, idxSimdRegSrc, 0, kArmv8InstrUmovInsSz_U8, false /*fDst64Bit*/);
2712 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDst, idxRegDst, idxRegTmp, true /*f64Bit*/, 8 /*offShift6*/);
2713
2714 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc);
2715 iemNativeRegFreeTmp(pReNative, idxRegTmp);
2716 iemNativeRegFreeTmp(pReNative, idxRegDst);
2717
2718#else
2719# error "Port me"
2720#endif
2721 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2722 return off;
2723}
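
/*
 * Illustrative note (not part of the original emitter): a minimal scalar sketch of what pmovmskb has to
 * produce, i.e. the most significant bit of each of the 16 source bytes gathered into bits 0..15 of the
 * destination GPR with the remaining bits cleared.  Plain <stdint.h> types are assumed and the helper
 * name is made up for illustration only.
 */
#if 0 /* reference sketch, not compiled */
static uint32_t pmovmskbRefU128(const uint8_t abSrc[16])
{
    uint32_t fMask = 0;
    for (unsigned i = 0; i < 16; i++)
        fMask |= (uint32_t)(abSrc[i] >> 7) << i; /* copy the MSB of byte i into result bit i */
    return fMask;
}
#endif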
2724
2725
2726/**
2727 * Common emitter for the PACKUSWB instructions - guest register / guest register variant.
2728 */
2729DECL_INLINE_THROW(uint32_t)
2730iemNativeEmit_packuswb_rr_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2731 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc)
2732{
2733 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
2734 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate);
2735 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc),
2736 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly);
2737
2738#ifdef RT_ARCH_AMD64
2739 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
2740
2741 /* packuswb xmm, xmm */
2742 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
2743 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8)
2744 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
2745 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0);
2746 pCodeBuf[off++] = 0x0f;
2747 pCodeBuf[off++] = 0x67;
2748 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7);
2749
2750#elif defined(RT_ARCH_ARM64)
2751 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2);
2752
2753 pCodeBuf[off++] = Armv8A64MkVecInstrQxtn(kArmv8VecInstrQxtnOp_Sqxtun, false /*fUpper*/, idxSimdRegDst, idxSimdRegDst, kArmv8VecInstrArithSz_8);
2754 pCodeBuf[off++] = Armv8A64MkVecInstrQxtn(kArmv8VecInstrQxtnOp_Sqxtun, true /*fUpper*/, idxSimdRegDst, idxSimdRegSrc, kArmv8VecInstrArithSz_8);
2755
2756#else
2757# error "port me"
2758#endif
2759
2760 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
2761 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc);
2762
2763 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2764 return off;
2765}
2766
2767
2768/**
2769 * Common emitter for the PACKUSWB instructions - guest register / recompiler variable variant.
2770 */
2771DECL_INLINE_THROW(uint32_t)
2772iemNativeEmit_packuswb_rv_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2773 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc)
2774{
2775 IEMNATIVE_ASSERT_VAR_IDX(pReNative, idxVarSrc);
2776 IEMNATIVE_ASSERT_VAR_SIZE(pReNative, idxVarSrc, sizeof(RTUINT128U));
2777
2778 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
2779 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate);
2780 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
2781
2782
2783#ifdef RT_ARCH_AMD64
2784 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
2785
2786 /* packuswb xmm, xmm */
2787 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
2788 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8)
2789 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
2790 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0);
2791 pCodeBuf[off++] = 0x0f;
2792 pCodeBuf[off++] = 0x67;
2793 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7);
2794
2795#elif defined(RT_ARCH_ARM64)
2796 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2);
2797
2798 pCodeBuf[off++] = Armv8A64MkVecInstrQxtn(kArmv8VecInstrQxtnOp_Sqxtun, false /*fUpper*/, idxSimdRegDst, idxSimdRegDst, kArmv8VecInstrArithSz_8);
2799 pCodeBuf[off++] = Armv8A64MkVecInstrQxtn(kArmv8VecInstrQxtnOp_Sqxtun, true /*fUpper*/, idxSimdRegDst, idxSimdRegSrc, kArmv8VecInstrArithSz_8);
2800
2801#else
2802# error "port me"
2803#endif
2804
2805 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
2806 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
2807
2808 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2809 return off;
2810}
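
/*
 * Illustrative note (not part of the original emitters): a minimal scalar sketch of the PACKUSWB
 * semantics both variants above reproduce - each signed 16-bit word is saturated to an unsigned byte
 * (which is why the ARM64 path uses SQXTUN), the low eight result bytes coming from the destination
 * operand and the high eight from the source operand.  Plain <stdint.h> types are assumed and the
 * helper names are made up for illustration only.
 */
#if 0 /* reference sketch, not compiled */
static uint8_t packuswbSatU8Ref(int16_t i16)
{
    return i16 < 0 ? 0 : i16 > 255 ? 255 : (uint8_t)i16; /* unsigned saturation of a signed word */
}

static void packuswbRefU128(uint8_t abDst[16], const int16_t ai16Dst[8], const int16_t ai16Src[8])
{
    for (unsigned i = 0; i < 8; i++)
    {
        abDst[i]     = packuswbSatU8Ref(ai16Dst[i]); /* low half from the destination operand */
        abDst[i + 8] = packuswbSatU8Ref(ai16Src[i]); /* high half from the source operand */
    }
}
#endif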
2811
2812
2813/**
2814 * Common emitter for the pmov{s,z}x* instructions.
2815 */
2816#ifdef RT_ARCH_AMD64
2817# define IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(a_Instr, a_fArmUnsigned, a_ArmElemSz, a_bOpcX86) \
2818 DECL_INLINE_THROW(uint32_t) \
2819 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2820 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2821 { \
2822 if (idxSimdGstRegDst == idxSimdGstRegSrc) \
2823 { \
2824 uint8_t const idxSimdReg = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2825 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2826 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6); \
2827 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2828 if (idxSimdReg >= 8) \
2829 pCodeBuf[off++] = (idxSimdReg >= 8 ? X86_OP_REX_B | X86_OP_REX_R : 0); \
2830 pCodeBuf[off++] = 0x0f; \
2831 pCodeBuf[off++] = 0x38; \
2832 pCodeBuf[off++] = (a_bOpcX86); \
2833 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdReg & 7, idxSimdReg & 7); \
2834 iemNativeSimdRegFreeTmp(pReNative, idxSimdReg); \
2835 } \
2836 else \
2837 { \
2838 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2839 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2840 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2841 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite); \
2842 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6); \
2843 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2844 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2845 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2846 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2847 pCodeBuf[off++] = 0x0f; \
2848 pCodeBuf[off++] = 0x38; \
2849 pCodeBuf[off++] = (a_bOpcX86); \
2850 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2851 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2852 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2853 } \
2854 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2855 return off; \
2856 } \
2857 DECL_INLINE_THROW(uint32_t) \
2858 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2859 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2860 { \
2861 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2862 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite); \
2863 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2864 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 7 + 6); \
2865 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; /* Transfer value from GPR to temporary vector register using pinsrq. */ \
2866 pCodeBuf[off++] = X86_OP_REX_W \
2867 | (IEMNATIVE_SIMD_REG_FIXED_TMP0 < 8 ? 0 : X86_OP_REX_R) \
2868 | (idxRegSrc < 8 ? 0 : X86_OP_REX_B); \
2869 pCodeBuf[off++] = 0x0f; \
2870 pCodeBuf[off++] = 0x3a; \
2871 pCodeBuf[off++] = 0x22; \
2872 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, IEMNATIVE_SIMD_REG_FIXED_TMP0 & 7, idxRegSrc & 7); \
2873 pCodeBuf[off++] = 0; /* QWord */\
2874 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2875 if (idxSimdRegDst >= 8 || IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8) \
2876 pCodeBuf[off++] = (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 ? X86_OP_REX_B : 0) \
2877 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2878 pCodeBuf[off++] = 0x0f; \
2879 pCodeBuf[off++] = 0x38; \
2880 pCodeBuf[off++] = (a_bOpcX86); \
2881 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, IEMNATIVE_SIMD_REG_FIXED_TMP0 & 7); \
2882 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2883 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2884 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2885 return off; \
2886 } \
2887 typedef int ignore_semicolon
2888#elif defined(RT_ARCH_ARM64)
2889# define IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(a_Instr, a_fArmUnsigned, a_ArmElemSz, a_bOpcX86) \
2890 DECL_INLINE_THROW(uint32_t) \
2891 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2892 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2893 { \
2894 if (idxSimdGstRegDst == idxSimdGstRegSrc) \
2895 { \
2896 uint8_t const idxSimdReg = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2897 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2898 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2899 pCodeBuf[off++] = Armv8A64MkVecInstrUShll(idxSimdReg, idxSimdReg, 0, (a_ArmElemSz), (a_fArmUnsigned)); \
2900 iemNativeSimdRegFreeTmp(pReNative, idxSimdReg); \
2901 } \
2902 else \
2903 { \
2904 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2905 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2906 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2907 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite); \
2908 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2909 pCodeBuf[off++] = Armv8A64MkVecInstrUShll(idxSimdRegDst, idxSimdRegSrc, 0, (a_ArmElemSz), (a_fArmUnsigned)); \
2910 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2911 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2912 } \
2913 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2914 return off; \
2915 } \
2916 DECL_INLINE_THROW(uint32_t) \
2917 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2918 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2919 { \
2920 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2921 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite); \
2922 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2923 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2); \
2924 pCodeBuf[off++] = Armv8A64MkVecInstrIns(IEMNATIVE_SIMD_REG_FIXED_TMP0, idxRegSrc, 0 /*idxElem*/); /* Transfer value from GPR to temporary vector register. */ \
2925 pCodeBuf[off++] = Armv8A64MkVecInstrUShll(idxSimdRegDst, IEMNATIVE_SIMD_REG_FIXED_TMP0, 0, (a_ArmElemSz), (a_fArmUnsigned)); \
2926 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2927 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2928 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2929 return off; \
2930 } \
2931 typedef int ignore_semicolon
2932#else
2933# error "Port me"
2934#endif
2935
2936IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovzxbw, true, kArmv8InstrShiftSz_U8, 0x30);
2937IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovzxwd, true, kArmv8InstrShiftSz_U16, 0x33);
2938IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovzxdq, true, kArmv8InstrShiftSz_U32, 0x35);
2939
2940IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovsxbw, false, kArmv8InstrShiftSz_U8, 0x20);
2941IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovsxwd, false, kArmv8InstrShiftSz_U16, 0x23);
2942IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovsxdq, false, kArmv8InstrShiftSz_U32, 0x25);
2943
2944
2945/**
2946 * Updates the MXCSR exception flags, raising any unmasked exceptions.
2947 */
2948DECL_INLINE_THROW(uint32_t)
2949iemNativeEmitMxcsrUpdate(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, uint8_t const idxSimdGstRegDst, uint8_t const idxSimdRegRes)
2950{
2951 uint8_t const idxRegMxCsr = iemNativeRegAllocTmpForGuestReg(pReNative, &off, kIemNativeGstReg_MxCsr, kIemNativeGstRegUse_ForUpdate);
2952 uint8_t const idxRegMxCsrXcptFlags = iemNativeRegAllocTmp(pReNative, &off);
2953 uint8_t const idxRegTmp = iemNativeRegAllocTmp(pReNative, &off);
2954
2955#ifdef RT_ARCH_AMD64
2956 PIEMNATIVEINSTR pbCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
2957
2958 /* stmxcsr */
2959 if (IEMNATIVE_REG_FIXED_PVMCPU >= 8)
2960 pbCodeBuf[off++] = X86_OP_REX_B;
2961 pbCodeBuf[off++] = 0x0f;
2962 pbCodeBuf[off++] = 0xae;
2963 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_MEM4, 3, IEMNATIVE_REG_FIXED_PVMCPU & 7);
2964 pbCodeBuf[off++] = RT_BYTE1(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2965 pbCodeBuf[off++] = RT_BYTE2(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2966 pbCodeBuf[off++] = RT_BYTE3(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2967 pbCodeBuf[off++] = RT_BYTE4(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2968
2969 /* Load MXCSR, mask everything except status flags and or into guest MXCSR. */
2970 off = iemNativeEmitLoadGprFromVCpuU32(pReNative, off, idxRegTmp, RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2971
2972 /* Store the flags in the MXCSR xcpt flags register. */
2973 off = iemNativeEmitLoadGprFromGpr32(pReNative, off, idxRegMxCsrXcptFlags, idxRegTmp);
2974 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegMxCsrXcptFlags, X86_MXCSR_XCPT_FLAGS);
2975
2976 /* Clear the status flags in the temporary copy and write it back to MXCSR. */
2977 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegTmp, ~X86_MXCSR_XCPT_FLAGS);
2978 off = iemNativeEmitStoreGprToVCpuU32(pReNative, off, idxRegTmp, RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2979
2980 pbCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
2981
2982 /* ldmxcsr */
2983 if (IEMNATIVE_REG_FIXED_PVMCPU >= 8)
2984 pbCodeBuf[off++] = X86_OP_REX_B;
2985 pbCodeBuf[off++] = 0x0f;
2986 pbCodeBuf[off++] = 0xae;
2987 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_MEM4, 2, IEMNATIVE_REG_FIXED_PVMCPU & 7);
2988 pbCodeBuf[off++] = RT_BYTE1(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2989 pbCodeBuf[off++] = RT_BYTE2(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2990 pbCodeBuf[off++] = RT_BYTE3(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2991 pbCodeBuf[off++] = RT_BYTE4(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2992
2993#elif defined(RT_ARCH_ARM64)
2994 PIEMNATIVEINSTR pu32CodeBuf = iemNativeInstrBufEnsure(pReNative, off, 7);
2995 pu32CodeBuf[off++] = Armv8A64MkInstrMrs(idxRegMxCsrXcptFlags, ARMV8_AARCH64_SYSREG_FPSR);
2996 pu32CodeBuf[off++] = Armv8A64MkInstrMsr(ARMV8_A64_REG_XZR, ARMV8_AARCH64_SYSREG_FPSR); /* Clear FPSR for next instruction. */
2997 pu32CodeBuf[off++] = Armv8A64MkInstrUxtb(idxRegMxCsrXcptFlags, idxRegMxCsrXcptFlags); /* Ensure there are only the exception flags set (clears QC, and any possible NZCV flags). */
2998
2999 /*
3000 * The exception flags layout differs between MXCSR and FPSR of course:
3001 *
3002 * FPSR flag (bit)        MXCSR flag (bit)
3003 *
3004 * IOC (0) ----------->   IE (0)   invalid operation
3005 * DZC (1) ----------->   ZE (2)   divide by zero
3006 * OFC (2) ----------->   OE (3)   overflow
3007 * UFC (3) ----------->   UE (4)   underflow
3008 * IXC (4) ----------->   PE (5)   precision (inexact)
3009 * IDC (7) ----------->   DE (1)   denormal
3016 */
3017 pu32CodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegMxCsrXcptFlags, 1); /* Shift the block of flags starting at DZC to the least significant bits. */
3018 pu32CodeBuf[off++] = Armv8A64MkInstrBfi(idxRegMxCsrXcptFlags, idxRegTmp, 2, 4); /* Insert DZC, OFC, UFC and IXC into the MXCSR positions. */
3019 pu32CodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegMxCsrXcptFlags, 6); /* Shift IDC (now at 6) into the LSB. */
3020 pu32CodeBuf[off++] = Armv8A64MkInstrBfi(idxRegMxCsrXcptFlags, idxRegTmp, 1, 1); /* Insert IDC into the MXCSR positions. */
3021#else
3022# error "Port me"
3023#endif
3024
3025 /*
3026 * If PE is set together with OE/UE and neither of those is masked,
3027 * PE needs to be cleared: on real hardware only OE/UE would be set
3028 * when the exception is raised, but because we run with all exceptions
3029 * masked PE gets set as well.
3030 */
3031 /** @todo On ARM we can combine the load+and into a single AND instruction. */
3032 /** @todo r=aeichner Can this be done more optimally? */
3033 uint8_t const idxRegTmp2 = iemNativeRegAllocTmp(pReNative, &off);
3034 off = iemNativeEmitLoadGprFromGpr32(pReNative, off, idxRegTmp, idxRegMxCsrXcptFlags);
3035 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegTmp, X86_MXCSR_OE | X86_MXCSR_UE);
3036 off = iemNativeEmitLoadGprFromGpr32(pReNative, off, idxRegTmp2, idxRegMxCsr);
3037 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegTmp2, X86_MXCSR_OM | X86_MXCSR_UM);
3038 off = iemNativeEmitShiftGprRight(pReNative, off, idxRegTmp2, X86_MXCSR_XCPT_MASK_SHIFT);
3039 off = iemNativeEmitInvBitsGpr(pReNative, off, idxRegTmp2, idxRegTmp2, false /*f64Bit*/);
3040 off = iemNativeEmitAndGpr32ByGpr32(pReNative, off, idxRegTmp2, idxRegTmp);
3041 off = iemNativeEmitTestAnyBitsInGpr(pReNative, off, idxRegTmp2, X86_MXCSR_OE | X86_MXCSR_UE);
3042
3043 uint32_t offFixup = off;
3044 off = iemNativeEmitJzToFixed(pReNative, off, off);
3045 off = iemNativeEmitBitClearInGpr32(pReNative, off, idxRegMxCsrXcptFlags, X86_MXCSR_PE_BIT);
3046 iemNativeFixupFixedJump(pReNative, offFixup, off);
3047 iemNativeRegFreeTmp(pReNative, idxRegTmp2);
3048
3049
3050 /* Set the MXCSR flags now. */
3051 off = iemNativeEmitOrGpr32ByGpr(pReNative, off, idxRegMxCsr, idxRegMxCsrXcptFlags);
3052
3053 /*
3054 * Make sure we don't have any outstanding guest register writes as we may
3055 * raise an \#UD or \#XF and all guest registers must be up to date in CPUMCTX.
3056 */
3057 off = iemNativeRegFlushPendingWrites(pReNative, off);
3058
3059#ifdef IEMNATIVE_WITH_INSTRUCTION_COUNTING
3060 off = iemNativeEmitStoreImmToVCpuU8(pReNative, off, idxInstr, RT_UOFFSETOF(VMCPUCC, iem.s.idxTbCurInstr));
3061#else
3062 RT_NOREF(idxInstr);
3063#endif
3064
3065 /* Check whether an exception is pending and only update the guest SIMD register if it isn't. */
3066 /* mov tmp, varmxcsr */
3067 off = iemNativeEmitLoadGprFromGpr32(pReNative, off, idxRegTmp, idxRegMxCsr);
3068 /* tmp >>= X86_MXCSR_XCPT_MASK_SHIFT */
3069 off = iemNativeEmitShiftGprRight(pReNative, off, idxRegTmp, X86_MXCSR_XCPT_MASK_SHIFT);
3070 /* tmp = ~tmp */
3071 off = iemNativeEmitInvBitsGpr(pReNative, off, idxRegTmp, idxRegTmp, false /*f64Bit*/);
3072 /* tmp &= mxcsr */
3073 off = iemNativeEmitAndGpr32ByGpr32(pReNative, off, idxRegMxCsrXcptFlags, idxRegTmp);
3074 off = iemNativeEmitTbExitIfAnyBitsSetInGpr<kIemNativeLabelType_RaiseSseAvxFpRelated>(pReNative, off, idxRegMxCsrXcptFlags,
3075 X86_MXCSR_XCPT_FLAGS);
3076
3077 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
3078 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite);
3079
3080 /* Move result to guest SIMD register (at this point there is no exception being raised). */
3081 off = iemNativeEmitSimdLoadVecRegFromVecRegU128(pReNative, off, idxSimdRegDst, idxSimdRegRes);
3082
3083 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
3084 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
3085 iemNativeRegFreeTmp(pReNative, idxRegTmp);
3086 iemNativeRegFreeTmp(pReNative, idxRegMxCsrXcptFlags);
3087 iemNativeRegFreeTmp(pReNative, idxRegMxCsr);
3088 return off;
3089}
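
/*
 * Illustrative note (not part of the original emitter): a minimal scalar sketch of the FPSR -> MXCSR
 * exception flag mapping described in the comment inside the function above (IOC->IE, DZC->ZE, OFC->OE,
 * UFC->UE, IXC->PE, IDC->DE).  The FPSR bit positions are the architectural ones; the helper name is
 * made up for illustration only.
 */
#if 0 /* reference sketch, not compiled */
static uint32_t fpsrXcptFlagsToMxcsrRef(uint32_t fFpsr)
{
    uint32_t fMxcsr = 0;
    fMxcsr |= (fFpsr & RT_BIT_32(0)) ? X86_MXCSR_IE : 0; /* IOC -> invalid operation */
    fMxcsr |= (fFpsr & RT_BIT_32(1)) ? X86_MXCSR_ZE : 0; /* DZC -> divide by zero */
    fMxcsr |= (fFpsr & RT_BIT_32(2)) ? X86_MXCSR_OE : 0; /* OFC -> overflow */
    fMxcsr |= (fFpsr & RT_BIT_32(3)) ? X86_MXCSR_UE : 0; /* UFC -> underflow */
    fMxcsr |= (fFpsr & RT_BIT_32(4)) ? X86_MXCSR_PE : 0; /* IXC -> precision (inexact) */
    fMxcsr |= (fFpsr & RT_BIT_32(7)) ? X86_MXCSR_DE : 0; /* IDC -> denormal */
    return fMxcsr;
}
#endif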
3090
3091
3092/**
3093 * Common emitter for packed floating point instructions with 3 operands - register, register variant.
3094 */
3095DECL_INLINE_THROW(uint32_t) iemNativeEmitSimdFp3OpCommon_rr_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr,
3096 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc,
3097#ifdef RT_ARCH_AMD64
3098 uint8_t const bPrefixX86, uint8_t const bOpcX86
3099#elif defined(RT_ARCH_ARM64)
3100 ARMV8INSTRVECFPOP const enmFpOp, ARMV8INSTRVECFPSZ const enmFpSz
3101#endif
3102 )
3103{
3104 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
3105 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly);
3106 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc),
3107 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly);
3108
3109#ifdef RT_ARCH_AMD64
3110 off = iemNativeEmitSimdLoadVecRegFromVecRegU128(pReNative, off, IEMNATIVE_SIMD_REG_FIXED_TMP0, idxSimdRegDst);
3111 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
3112 if (bPrefixX86 != 0)
3113 pCodeBuf[off++] = bPrefixX86;
3114 if (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 || idxSimdRegSrc >= 8)
3115 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
3116 | (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 ? X86_OP_REX_R : 0);
3117 pCodeBuf[off++] = 0x0f;
3118 pCodeBuf[off++] = bOpcX86;
3119 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, IEMNATIVE_SIMD_REG_FIXED_TMP0 & 7, idxSimdRegSrc & 7);
3120#elif defined(RT_ARCH_ARM64)
3121 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
3122 pCodeBuf[off++] = Armv8A64MkVecInstrFp3Op(enmFpOp, enmFpSz, IEMNATIVE_SIMD_REG_FIXED_TMP0, idxSimdRegDst, idxSimdRegSrc);
3123#else
3124# error "Port me"
3125#endif
3126 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
3127 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc);
3128 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
3129 return iemNativeEmitMxcsrUpdate(pReNative, off, idxInstr, idxSimdGstRegDst, IEMNATIVE_SIMD_REG_FIXED_TMP0);
3130}
3131
3132
3133/**
3134 * Common emitter for packed floating point instructions with 3 operands - register, local variable variant.
3135 */
3136DECL_INLINE_THROW(uint32_t) iemNativeEmitSimdFp3OpCommon_rv_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr,
3137 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc,
3138#ifdef RT_ARCH_AMD64
3139 uint8_t const bPrefixX86, uint8_t const bOpcX86
3140#elif defined(RT_ARCH_ARM64)
3141 ARMV8INSTRVECFPOP const enmFpOp, ARMV8INSTRVECFPSZ const enmFpSz
3142#endif
3143 )
3144{
3145 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
3146 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly);
3147 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
3148
3149#ifdef RT_ARCH_AMD64
3150 off = iemNativeEmitSimdLoadVecRegFromVecRegU128(pReNative, off, IEMNATIVE_SIMD_REG_FIXED_TMP0, idxSimdRegDst);
3151 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
3152 if (bPrefixX86 != 0)
3153 pCodeBuf[off++] = bPrefixX86;
3154 if (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 || idxSimdRegSrc >= 8)
3155 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
3156 | (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 ? X86_OP_REX_R : 0);
3157 pCodeBuf[off++] = 0x0f;
3158 pCodeBuf[off++] = bOpcX86;
3159 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, IEMNATIVE_SIMD_REG_FIXED_TMP0 & 7, idxSimdRegSrc & 7);
3160#elif defined(RT_ARCH_ARM64)
3161 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
3162 pCodeBuf[off++] = Armv8A64MkVecInstrFp3Op(enmFpOp, enmFpSz, IEMNATIVE_SIMD_REG_FIXED_TMP0, idxSimdRegDst, idxSimdRegSrc);
3163#else
3164# error "Port me"
3165#endif
3166 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
3167 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
3168 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
3169 return iemNativeEmitMxcsrUpdate(pReNative, off, idxInstr, idxSimdGstRegDst, IEMNATIVE_SIMD_REG_FIXED_TMP0);
3170}
3171
3172
3173/**
3174 * Common emitter for packed floating point instructions with 3 operands.
3175 */
3176#ifdef RT_ARCH_AMD64
3177# define IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(a_Instr, a_enmArmOp, a_ArmElemSz, a_bPrefixX86, a_bOpcX86) \
3178 DECL_FORCE_INLINE_THROW(uint32_t) \
3179 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, \
3180 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
3181 { \
3182 return iemNativeEmitSimdFp3OpCommon_rr_u128(pReNative, off, idxInstr, idxSimdGstRegDst, idxSimdGstRegSrc, \
3183 a_bPrefixX86, a_bOpcX86); \
3184 } \
3185 DECL_FORCE_INLINE_THROW(uint32_t) \
3186 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, \
3187 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
3188 { \
3189 return iemNativeEmitSimdFp3OpCommon_rv_u128(pReNative, off, idxInstr, idxSimdGstRegDst, idxVarSrc, \
3190 a_bPrefixX86, a_bOpcX86); \
3191 } \
3192 typedef int ignore_semicolon
3193#elif defined(RT_ARCH_ARM64)
3194# define IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(a_Instr, a_enmArmOp, a_ArmElemSz, a_bPrefixX86, a_bOpcX86) \
3195 DECL_FORCE_INLINE_THROW(uint32_t) \
3196 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, \
3197 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
3198 { \
3199 return iemNativeEmitSimdFp3OpCommon_rr_u128(pReNative, off, idxInstr, idxSimdGstRegDst, idxSimdGstRegSrc, \
3200 a_enmArmOp, a_ArmElemSz); \
3201 } \
3202 DECL_FORCE_INLINE_THROW(uint32_t) \
3203 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, \
3204 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
3205 { \
3206 return iemNativeEmitSimdFp3OpCommon_rv_u128(pReNative, off, idxInstr, idxSimdGstRegDst, idxVarSrc, \
3207 a_enmArmOp, a_ArmElemSz); \
3208 } \
3209 typedef int ignore_semicolon
3210#else
3211# error "Port me"
3212#endif
3213
3214
3215IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(mulps, kArmv8VecInstrFpOp_Mul, kArmv8VecInstrFpSz_4x_Single, 0, 0x59);
3216IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(addps, kArmv8VecInstrFpOp_Add, kArmv8VecInstrFpSz_4x_Single, 0, 0x58);
3217IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(addpd, kArmv8VecInstrFpOp_Add, kArmv8VecInstrFpSz_2x_Double, X86_OP_PRF_SIZE_OP, 0x58);
3218IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(subps, kArmv8VecInstrFpOp_Sub, kArmv8VecInstrFpSz_4x_Single, 0, 0x5c);
3219
3220#endif /* IEMNATIVE_WITH_SIMD_REG_ALLOCATOR */
3221
3222#endif /* !VMM_INCLUDED_SRC_VMMAll_target_x86_IEMAllN8veEmit_x86_h */