VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/target-x86/IEMAllN8veEmit-x86.h@ 106199

Last change on this file since 106199 was 106199, checked in by vboxsync, 6 months ago

VMM/IEM: Refactored the xxxxx_r_i_efl functions to take the constant arguments (cOpBits & cImmBits) as template arguments. Fixed some arm build issues from previous commit. bugref:10720

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
1/* $Id: IEMAllN8veEmit-x86.h 106199 2024-10-01 23:08:47Z vboxsync $ */
2/** @file
3 * IEM - Native Recompiler, x86 Target - Code Emitters.
4 */
5
6/*
7 * Copyright (C) 2023-2024 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28#ifndef VMM_INCLUDED_SRC_VMMAll_target_x86_IEMAllN8veEmit_x86_h
29#define VMM_INCLUDED_SRC_VMMAll_target_x86_IEMAllN8veEmit_x86_h
30#ifndef RT_WITHOUT_PRAGMA_ONCE
31# pragma once
32#endif
33
34
35#ifdef RT_ARCH_AMD64
36
37/**
38 * Emits a ModR/M instruction with one opcode byte and only register operands.
39 */
40DECL_FORCE_INLINE(uint32_t)
41iemNativeEmitAmd64OneByteModRmInstrRREx(PIEMNATIVEINSTR pCodeBuf, uint32_t off, uint8_t bOpcode8, uint8_t bOpcodeOther,
42 uint8_t cOpBits, uint8_t idxRegReg, uint8_t idxRegRm)
43{
44 Assert(idxRegReg < 16); Assert(idxRegRm < 16);
45 switch (cOpBits)
46 {
47 case 16:
48 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
49 RT_FALL_THRU();
50 case 32:
51 if (idxRegReg >= 8 || idxRegRm >= 8)
52 pCodeBuf[off++] = (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
53 pCodeBuf[off++] = bOpcodeOther;
54 break;
55
56 default: AssertFailed(); RT_FALL_THRU();
57 case 64:
58 pCodeBuf[off++] = X86_OP_REX_W | (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
59 pCodeBuf[off++] = bOpcodeOther;
60 break;
61
62 case 8:
63 if (idxRegReg >= 8 || idxRegRm >= 8)
64 pCodeBuf[off++] = (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
65 else if (idxRegReg >= 4 || idxRegRm >= 4)
66 pCodeBuf[off++] = X86_OP_REX;
67 pCodeBuf[off++] = bOpcode8;
68 break;
69 }
70 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg & 7, idxRegRm & 7);
71 return off;
72}
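/* Illustrative example (an assumption for clarity, not emitted anywhere as such): with
   bOpcodeOther=0x23 (AND r32, r/m32), cOpBits=32, idxRegReg=9 (r9d) and idxRegRm=2 (edx),
   the routine above produces the byte sequence 44 23 ca, i.e. "and r9d, edx". */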
73
74
75/**
76 * Emits a ModR/M instruction with two opcode bytes and only register operands.
77 */
78DECL_FORCE_INLINE(uint32_t)
79iemNativeEmitAmd64TwoByteModRmInstrRREx(PIEMNATIVEINSTR pCodeBuf, uint32_t off,
80 uint8_t bOpcode0, uint8_t bOpcode8, uint8_t bOpcodeOther,
81 uint8_t cOpBits, uint8_t idxRegReg, uint8_t idxRegRm)
82{
83 Assert(idxRegReg < 16); Assert(idxRegRm < 16);
84 switch (cOpBits)
85 {
86 case 16:
87 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
88 RT_FALL_THRU();
89 case 32:
90 if (idxRegReg >= 8 || idxRegRm >= 8)
91 pCodeBuf[off++] = (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
92 pCodeBuf[off++] = bOpcode0;
93 pCodeBuf[off++] = bOpcodeOther;
94 break;
95
96 default: AssertFailed(); RT_FALL_THRU();
97 case 64:
98 pCodeBuf[off++] = X86_OP_REX_W | (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
99 pCodeBuf[off++] = bOpcode0;
100 pCodeBuf[off++] = bOpcodeOther;
101 break;
102
103 case 8:
104 if (idxRegReg >= 8 || idxRegRm >= 8)
105 pCodeBuf[off++] = (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
106 else if (idxRegReg >= 4 || idxRegRm >= 4)
107 pCodeBuf[off++] = X86_OP_REX;
108 pCodeBuf[off++] = bOpcode0;
109 pCodeBuf[off++] = bOpcode8;
110 break;
111 }
112 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg & 7, idxRegRm & 7);
113 return off;
114}
115
116
117/**
118 * Emits one of three opcodes with an immediate.
119 *
120 * These are expected to use the /idxRegReg form (i.e. the opcode extension goes in ModR/M.reg).
121 */
122DECL_FORCE_INLINE(uint32_t)
123iemNativeEmitAmd64OneByteModRmInstrRIEx(PIEMNATIVEINSTR pCodeBuf, uint32_t off, uint8_t bOpcode8, uint8_t bOpcodeOtherImm8,
124 uint8_t bOpcodeOther, uint8_t cOpBits, uint8_t cImmBits, uint8_t idxRegReg,
125 uint8_t idxRegRm, uint64_t uImmOp)
126{
127 Assert(idxRegReg < 8); Assert(idxRegRm < 16);
128 if ( cImmBits == 8
129 || (uImmOp <= (uint64_t)0x7f && bOpcodeOtherImm8 != 0xcc))
130 {
131 switch (cOpBits)
132 {
133 case 16:
134 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
135 RT_FALL_THRU();
136 case 32:
137 if (idxRegRm >= 8)
138 pCodeBuf[off++] = X86_OP_REX_B;
139 pCodeBuf[off++] = bOpcodeOtherImm8; Assert(bOpcodeOtherImm8 != 0xcc);
140 break;
141
142 default: AssertFailed(); RT_FALL_THRU();
143 case 64:
144 pCodeBuf[off++] = X86_OP_REX_W | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
145 pCodeBuf[off++] = bOpcodeOtherImm8; Assert(bOpcodeOtherImm8 != 0xcc);
146 break;
147
148 case 8:
149 if (idxRegRm >= 8)
150 pCodeBuf[off++] = X86_OP_REX_B;
151 else if (idxRegRm >= 4)
152 pCodeBuf[off++] = X86_OP_REX;
153 pCodeBuf[off++] = bOpcode8; Assert(bOpcode8 != 0xcc);
154 break;
155 }
156 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg, idxRegRm & 7);
157 pCodeBuf[off++] = (uint8_t)uImmOp;
158 }
159 else
160 {
161 switch (cOpBits)
162 {
163 case 32:
164 if (idxRegRm >= 8)
165 pCodeBuf[off++] = X86_OP_REX_B;
166 break;
167
168 default: AssertFailed(); RT_FALL_THRU();
169 case 64:
170 pCodeBuf[off++] = X86_OP_REX_W | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
171 break;
172
173 case 16:
174 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
175 if (idxRegRm >= 8)
176 pCodeBuf[off++] = X86_OP_REX_B;
177 pCodeBuf[off++] = bOpcodeOther;
178 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg, idxRegRm & 7);
179 pCodeBuf[off++] = RT_BYTE1(uImmOp);
180 pCodeBuf[off++] = RT_BYTE2(uImmOp);
181 Assert(cImmBits == 16);
182 return off;
183 }
184 pCodeBuf[off++] = bOpcodeOther;
185 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg, idxRegRm & 7);
186 pCodeBuf[off++] = RT_BYTE1(uImmOp);
187 pCodeBuf[off++] = RT_BYTE2(uImmOp);
188 pCodeBuf[off++] = RT_BYTE3(uImmOp);
189 pCodeBuf[off++] = RT_BYTE4(uImmOp);
190 Assert(cImmBits == 32);
191 }
192 return off;
193}
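/* Illustrative examples (assumed values for clarity): with the immediate group-1 opcodes
   bOpcode8=0x80, bOpcodeOtherImm8=0x83, bOpcodeOther=0x81, idxRegReg=4 (/4 = AND), idxRegRm=2,
   cOpBits=32 and cImmBits=32, uImmOp=0x12345 yields 81 e2 45 23 01 00 ("and edx, 0x12345"),
   while uImmOp=0x40 takes the sign-extended imm8 path and yields 83 e2 40. */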
194
195#endif /* RT_ARCH_AMD64 */
196
197
198
199/*********************************************************************************************************************************
200* EFLAGS *
201*********************************************************************************************************************************/
202
203#ifdef IEMNATIVE_WITH_EFLAGS_POSTPONING
204
205/** @def IEMNATIVE_POSTPONING_REG_MASK
206 * Register suitable for keeping the inputs or result for a postponed EFLAGS
207 * calculation.
208 *
209 * We use non-volatile registers here so we don't have to save & restore them
210 * across callouts (i.e. TLB loads).
211 *
212 * @note On x86 we cannot use RDI and RSI because these are used by the
213 * opcode checking code. The usual joy of the x86 instruction set.
214 */
215# ifdef RT_ARCH_AMD64
216# define IEMNATIVE_POSTPONING_REG_MASK \
217 (IEMNATIVE_CALL_NONVOLATILE_GREG_MASK & ~(RT_BIT_32(X86_GREG_xDI) | RT_BIT_32(X86_GREG_xSI)))
218# else
219# define IEMNATIVE_POSTPONING_REG_MASK IEMNATIVE_CALL_NONVOLATILE_GREG_MASK
220# endif
221
222/**
223 * This is normally invoked via IEMNATIVE_CLEAR_POSTPONED_EFLAGS().
224 */
225template<uint32_t const a_fEflClobbered>
226DECL_FORCE_INLINE(void) iemNativeClearPostponedEFlags(PIEMRECOMPILERSTATE pReNative)
227{
228 AssertCompile(!(a_fEflClobbered & ~X86_EFL_STATUS_BITS));
229 uint32_t fEFlags = pReNative->PostponedEfl.fEFlags;
230 if (fEFlags)
231 {
232 if RT_CONSTEXPR_IF(a_fEflClobbered != X86_EFL_STATUS_BITS)
233 {
234 fEFlags &= ~a_fEflClobbered;
235 if (!fEFlags)
236 { /* likely */ }
237 else
238 {
239 Log5(("EFLAGS: Clobbering %#x: %#x -> %#x (op=%d bits=%u) - iemNativeClearPostponedEFlags\n", a_fEflClobbered,
240 pReNative->PostponedEfl.fEFlags, fEFlags, pReNative->PostponedEfl.enmOp, pReNative->PostponedEfl.cOpBits));
241 pReNative->PostponedEfl.fEFlags = fEFlags;
242 return;
243 }
244 }
245
246 /* Do cleanup. */
247 Log5(("EFLAGS: Cleanup of op=%u bits=%u efl=%#x upon clobbering %#x - iemNativeClearPostponedEFlags\n",
248 pReNative->PostponedEfl.enmOp, pReNative->PostponedEfl.cOpBits, pReNative->PostponedEfl.fEFlags, a_fEflClobbered));
249 pReNative->PostponedEfl.fEFlags = 0;
250 pReNative->PostponedEfl.enmOp = kIemNativePostponedEflOp_Invalid;
251 pReNative->PostponedEfl.cOpBits = 0;
252 iemNativeRegFreeTmp(pReNative, pReNative->PostponedEfl.idxReg1);
253 if (pReNative->PostponedEfl.idxReg2 != UINT8_MAX)
254 iemNativeRegFreeTmp(pReNative, pReNative->PostponedEfl.idxReg2);
255 pReNative->PostponedEfl.idxReg1 = UINT8_MAX;
256 pReNative->PostponedEfl.idxReg2 = UINT8_MAX;
257# if defined(VBOX_WITH_STATISTICS) || defined(IEMNATIVE_WITH_TB_DEBUG_INFO)
258 STAM_PROFILE_ADD_PERIOD(&pReNative->pVCpu->iem.s.StatNativeEflPostponedEmits, pReNative->PostponedEfl.cEmits);
259 pReNative->PostponedEfl.cEmits = 0;
260# endif
261 }
262}
263
264#endif /* IEMNATIVE_WITH_EFLAGS_POSTPONING */
265
266
267template<bool const a_fDoOp>
268DECL_INLINE_THROW(uint32_t) iemNativeEmitPostponedEFlagsCalcLogical(PIEMNATIVEINSTR pCodeBuf, uint32_t off, uint8_t cOpBits,
269 uint8_t idxRegResult, uint8_t idxRegEfl, uint8_t idxRegTmp)
270{
271#ifdef RT_ARCH_AMD64
272 /* Do TEST idxRegResult, idxRegResult to set flags. */
273 if RT_CONSTEXPR_IF(a_fDoOp)
274 off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0x84, 0x85, cOpBits, idxRegResult, idxRegResult);
275
276 /*
277 * Collect the EFLAGS status bits.
278 * We know that the overflow bit will always be cleared, so LAHF can be used.
279 */
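 /* Note: LAHF loads SF, ZF, AF, PF and CF (plus the always-set bit 1) from the low byte of
    EFLAGS into AH, which is why OF has to be cleared separately with BTR further down. */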
280 if (idxRegTmp == X86_GREG_xAX)
281 {
282 /* lahf ; AH = EFLAGS */
283 pCodeBuf[off++] = 0x9f;
284 if (idxRegEfl <= X86_GREG_xBX)
285 {
286 /* mov [CDB]L, AH */
287 pCodeBuf[off++] = 0x88;
288 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, idxRegEfl);
289 }
290 else
291 {
292 /* mov AL, AH */
293 pCodeBuf[off++] = 0x88;
294 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, 0 /*AL*/);
295 /* mov xxL, AL */
296 pCodeBuf[off++] = idxRegEfl >= 8 ? X86_OP_REX_B : X86_OP_REX;
297 pCodeBuf[off++] = 0x88;
298 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 0 /*AL*/, idxRegEfl & 7);
299 }
300 }
301 else if (idxRegEfl != X86_GREG_xAX)
302 {
303# if 1 /* This is 1 or 4 bytes larger, but avoids the stack. */
304 /* xchg rax, tmp */
305 pCodeBuf[off++] = idxRegTmp < 8 ? X86_OP_REX_W : X86_OP_REX_B | X86_OP_REX_W;
306 pCodeBuf[off++] = 0x90 + (idxRegTmp & 7);
307
308 /* lahf ; AH = EFLAGS */
309 pCodeBuf[off++] = 0x9f;
310 if (idxRegEfl <= X86_GREG_xBX)
311 {
312 /* mov [CDB]L, AH */
313 pCodeBuf[off++] = 0x88;
314 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, idxRegEfl);
315 }
316 else
317 {
318 /* mov AL, AH */
319 pCodeBuf[off++] = 0x88;
320 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, 0 /*AL*/);
321 /* mov xxL, AL */
322 pCodeBuf[off++] = idxRegEfl >= 8 ? X86_OP_REX_B : X86_OP_REX;
323 pCodeBuf[off++] = 0x88;
324 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 0 /*AL*/, idxRegEfl & 7);
325 }
326
327 /* xchg rax, tmp */
328 pCodeBuf[off++] = idxRegTmp < 8 ? X86_OP_REX_W : X86_OP_REX_B | X86_OP_REX_W;
329 pCodeBuf[off++] = 0x90 + (idxRegTmp & 7);
330
331# else
332 /* pushf */
333 pCodeBuf[off++] = 0x9c;
334 /* pop tmp */
335 if (idxRegTmp >= 8)
336 pCodeBuf[off++] = X86_OP_REX_B;
337 pCodeBuf[off++] = 0x58 + (idxRegTmp & 7);
338 /* mov byte(efl), byte(tmp) */
339 if (idxRegEfl >= 4 || idxRegTmp >= 4)
340 pCodeBuf[off++] = (idxRegEfl >= 8 ? X86_OP_REX_B : X86_OP_REX)
341 | (idxRegTmp >= 8 ? X86_OP_REX_R : 0);
342 pCodeBuf[off++] = 0x88;
343 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegTmp & 7, idxRegEfl & 7);
344# endif
345 }
346 else
347 {
348 /* xchg al, ah */
349 pCodeBuf[off++] = 0x86;
350 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, 0 /*AL*/);
351 /* lahf ; AH = EFLAGS */
352 pCodeBuf[off++] = 0x9f;
353 /* xchg al, ah */
354 pCodeBuf[off++] = 0x86;
355 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, 0 /*AL*/);
356 }
357 /* BTR idxEfl, 11; Clear OF */
358 if (idxRegEfl >= 8)
359 pCodeBuf[off++] = X86_OP_REX_B;
360 pCodeBuf[off++] = 0xf;
361 pCodeBuf[off++] = 0xba;
362 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 6, idxRegEfl & 7);
363 pCodeBuf[off++] = X86_EFL_OF_BIT;
364
365#elif defined(RT_ARCH_ARM64)
366 /*
367 * Calculate flags.
368 */
369 /* Clear the status bits. ~0x8D5 (or ~0x8FD) can't be AND immediate, so use idxRegTmp for constant. */
370 off = iemNativeEmitLoadGpr32ImmExT<~X86_EFL_STATUS_BITS>(pCodeBuf, off, idxRegTmp);
371 off = iemNativeEmitAndGpr32ByGpr32Ex(pCodeBuf, off, idxRegEfl, idxRegTmp);
372
373 /* N,Z -> SF,ZF */
374 if (cOpBits < 32)
375 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegResult, cOpBits > 8); /* sets NZ */
376 else if RT_CONSTEXPR_IF(a_fDoOp)
377 pCodeBuf[off++] = Armv8A64MkInstrAnds(ARMV8_A64_REG_XZR, idxRegResult, idxRegResult, cOpBits > 32 /*f64Bit*/);
378 pCodeBuf[off++] = Armv8A64MkInstrMrs(idxRegTmp, ARMV8_AARCH64_SYSREG_NZCV); /* Bits: 31=N; 30=Z; 29=C; 28=V; */
379 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegTmp, 30);
380 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_ZF_BIT, 2, false /*f64Bit*/);
381 AssertCompile(X86_EFL_ZF_BIT + 1 == X86_EFL_SF_BIT);
382
383 /* Calculate 8-bit parity of the result. */
384 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegResult, idxRegResult, false /*f64Bit*/,
385 4 /*offShift6*/, kArmv8A64InstrShift_Lsr);
386 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegTmp, idxRegTmp, false /*f64Bit*/,
387 2 /*offShift6*/, kArmv8A64InstrShift_Lsr);
388 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegTmp, idxRegTmp, false /*f64Bit*/,
389 1 /*offShift6*/, kArmv8A64InstrShift_Lsr);
390 Assert(Armv8A64ConvertImmRImmS2Mask32(0, 0) == 1);
391 pCodeBuf[off++] = Armv8A64MkInstrEorImm(idxRegTmp, idxRegTmp, 0, 0, false /*f64Bit*/);
392 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_PF_BIT, 1, false /*f64Bit*/);
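 /* The three shifted EORs fold the low 8 result bits down to a single parity bit in bit 0;
    the final EOR-immediate inverts it because PF is set on even parity.  Rough C sketch:
        u ^= u >> 4; u ^= u >> 2; u ^= u >> 1; fPF = ~u & 1; */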
393
394#else
395# error "port me"
396#endif
397 return off;
398}
399
400#ifdef IEMNATIVE_WITH_EFLAGS_POSTPONING
401
402template<uint32_t const a_bmInputRegs, bool const a_fTlbMiss = false>
403static uint32_t iemNativeDoPostponedEFlagsInternal(PIEMRECOMPILERSTATE pReNative, uint32_t off, PIEMNATIVEINSTR pCodeBuf,
404 uint32_t bmExtraTlbMissRegs = 0)
405{
406# ifdef IEMNATIVE_WITH_TB_DEBUG_INFO
407 iemNativeDbgInfoAddPostponedEFlagsCalc(pReNative, off, pReNative->PostponedEfl.enmOp, pReNative->PostponedEfl.cOpBits,
408 pReNative->PostponedEfl.cEmits);
409# endif
410
411 /*
412 * In the TB exit code path we cannot do regular register allocation. Nor
413 * can we when we're in the TLB miss code, unless we're skipping the TLB
414 * lookup. Since the latter isn't an important use case and should get along
415 * fine on just volatile registers, we do not need to do anything special
416 * for it.
417 *
418 * So, we do our own register allocation here. In the TB exit path any register
419 * goes, excluding a_bmInputRegs, fixed registers and the postponing related registers.
420 * In the TLB miss we can use any volatile register and temporary registers
421 * allocated in the TLB state.
422 *
423 * Note! On x86 we prefer using RAX as the first TMP register, so we can
424 * make use of LAHF which is typically faster than PUSHF/POP. This
425 * is why the idxRegTmp allocation is done first when there is no EFLAGS
426 * shadow, since RAX is represented by bit 0 in the mask.
427 */
428 uint32_t bmAvailableRegs;
429 if RT_CONSTEXPR_IF(!a_fTlbMiss)
430 {
431 bmAvailableRegs = ~(a_bmInputRegs | IEMNATIVE_REG_FIXED_MASK) & IEMNATIVE_HST_GREG_MASK;
432 if (pReNative->PostponedEfl.idxReg2 != UINT8_MAX)
433 bmAvailableRegs &= ~(RT_BIT_32(pReNative->PostponedEfl.idxReg1) | RT_BIT_32(pReNative->PostponedEfl.idxReg2));
434 else
435 bmAvailableRegs &= ~RT_BIT_32(pReNative->PostponedEfl.idxReg1);
436 }
437 else
438 {
439 /* Note! a_bmInputRegs takes precedence over bmExtraTlbMissRegs. */
440 bmAvailableRegs = (IEMNATIVE_CALL_VOLATILE_GREG_MASK | bmExtraTlbMissRegs)
441 & ~(a_bmInputRegs | IEMNATIVE_REG_FIXED_MASK)
442 & IEMNATIVE_HST_GREG_MASK;
443 }
444
445 /* Use existing EFLAGS shadow if available. For the TLB-miss code path we
446 need to weed out volatile registers here, as they will no longer be valid. */
447 uint8_t idxRegTmp;
448 uint8_t idxRegEfl = pReNative->Core.aidxGstRegShadows[kIemNativeGstReg_EFlags];
449 if ( (pReNative->Core.bmGstRegShadows & RT_BIT_64(kIemNativeGstReg_EFlags))
450 && (!a_fTlbMiss || !(RT_BIT_32(idxRegEfl) & IEMNATIVE_CALL_VOLATILE_GREG_MASK)))
451 {
452 Assert(idxRegEfl < IEMNATIVE_HST_GREG_COUNT);
453 Assert(!(a_bmInputRegs & RT_BIT_32(idxRegEfl)));
454 if RT_CONSTEXPR_IF(!a_fTlbMiss) Assert(bmAvailableRegs & RT_BIT_32(idxRegEfl));
455 bmAvailableRegs &= ~RT_BIT_32(idxRegEfl);
456# ifdef VBOX_STRICT
457 off = iemNativeEmitGuestRegValueCheckEx(pReNative, pCodeBuf, off, idxRegEfl, kIemNativeGstReg_EFlags);
458# endif
459
460 idxRegTmp = ASMBitFirstSetU32(bmAvailableRegs) - 1;
461 bmAvailableRegs &= ~RT_BIT_32(idxRegTmp);
462 }
463 else
464 {
465 idxRegTmp = ASMBitFirstSetU32(bmAvailableRegs) - 1; /* allocate the temp register first to prioritize EAX on x86. */
466 bmAvailableRegs &= ~RT_BIT_32(idxRegTmp);
467
468 idxRegEfl = ASMBitFirstSetU32(bmAvailableRegs) - 1;
470 bmAvailableRegs &= ~RT_BIT_32(idxRegEfl);
470 off = iemNativeEmitLoadGprFromVCpuU32Ex(pCodeBuf, off, idxRegEfl, RT_UOFFSETOF(VMCPU, cpum.GstCtx.eflags));
471 }
472 Assert(bmAvailableRegs != 0);
473
474 /*
475 * Do the actual EFLAGS calculation.
476 */
477 switch (pReNative->PostponedEfl.enmOp)
478 {
479 case kIemNativePostponedEflOp_Logical:
480 Assert(pReNative->PostponedEfl.idxReg2 == UINT8_MAX);
481 off = iemNativeEmitPostponedEFlagsCalcLogical<true>(pCodeBuf, off, pReNative->PostponedEfl.cOpBits,
482 pReNative->PostponedEfl.idxReg1, idxRegEfl, idxRegTmp);
483 break;
484
485 default:
486 AssertFailedBreak();
487 }
488
489 /*
490 * Store EFLAGS.
491 */
492# ifdef VBOX_STRICT
493 /* check that X86_EFL_1 is set. */
494 uint32_t offFixup1;
495 off = iemNativeEmitTestBitInGprAndJmpToFixedIfSetEx(pCodeBuf, off, idxRegEfl, X86_EFL_1_BIT, off, &offFixup1);
496 off = iemNativeEmitBrkEx(pCodeBuf, off, 0x3330);
497 iemNativeFixupFixedJump(pReNative, offFixup1, off);
498 /* Check that X86_EFL_RAZ_LO_MASK is zero. */
499 off = iemNativeEmitTestAnyBitsInGpr32Ex(pCodeBuf, off, idxRegEfl, X86_EFL_RAZ_LO_MASK);
500 uint32_t const offFixup2 = off;
501 off = iemNativeEmitJccToFixedEx(pCodeBuf, off, off, kIemNativeInstrCond_e);
502 off = iemNativeEmitBrkEx(pCodeBuf, off, 0x3331);
503 iemNativeFixupFixedJump(pReNative, offFixup2, off);
504# endif
505 off = iemNativeEmitStoreGprToVCpuU32Ex(pCodeBuf, off, idxRegEfl, RT_UOFFSETOF(VMCPU, cpum.GstCtx.eflags));
506 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
507
508# if defined(VBOX_WITH_STATISTICS) || defined(IEMNATIVE_WITH_TB_DEBUG_INFO)
509 pReNative->PostponedEfl.cEmits++;
510# endif
511 return off;
512}
513
514
515
516template<uint32_t const a_bmInputRegs>
517DECL_FORCE_INLINE_THROW(uint32_t)
518iemNativeDoPostponedEFlagsAtTbExit(PIEMRECOMPILERSTATE pReNative, uint32_t off)
519{
520 if (pReNative->PostponedEfl.fEFlags)
521 {
522 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, IEMNATIVE_MAX_POSTPONED_EFLAGS_INSTRUCTIONS);
523 return iemNativeDoPostponedEFlagsInternal<a_bmInputRegs>(pReNative, off, pCodeBuf);
524 }
525 return off;
526}
527
528
529template<uint32_t const a_bmInputRegs>
530DECL_FORCE_INLINE_THROW(uint32_t)
531iemNativeDoPostponedEFlagsAtTbExitEx(PIEMRECOMPILERSTATE pReNative, uint32_t off, PIEMNATIVEINSTR pCodeBuf)
532{
533 if (pReNative->PostponedEfl.fEFlags)
534 return iemNativeDoPostponedEFlagsInternal<a_bmInputRegs>(pReNative, off, pCodeBuf);
535 return off;
536}
537
538
539template<uint32_t const a_bmInputRegs>
540DECL_FORCE_INLINE_THROW(uint32_t)
541iemNativeDoPostponedEFlagsAtTlbMiss(PIEMRECOMPILERSTATE pReNative, uint32_t off, const IEMNATIVEEMITTLBSTATE *pTlbState,
542 uint32_t bmTmpRegs)
543{
544 if (pReNative->PostponedEfl.fEFlags)
545 {
546 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, IEMNATIVE_MAX_POSTPONED_EFLAGS_INSTRUCTIONS);
547 return iemNativeDoPostponedEFlagsInternal<a_bmInputRegs, true>(pReNative, off, pCodeBuf,
548 pTlbState->getRegsNotToSave() | bmTmpRegs);
549 }
550 return off;
551}
552
553#endif /* IEMNATIVE_WITH_EFLAGS_POSTPONING */
554
555
556/**
557 * This is an implementation of IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL.
558 *
559 * It takes liveness stuff into account.
560 */
561template<bool a_fNeedToSetFlags>
562DECL_INLINE_THROW(uint32_t)
563iemNativeEmitEFlagsForLogical(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarEfl,
564 uint8_t cOpBits, uint8_t idxRegResult)
565{
566 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflTotalLogical);
567 IEMNATIVE_CLEAR_POSTPONED_EFLAGS(pReNative, X86_EFL_STATUS_BITS);
568 RT_NOREF(cOpBits, idxRegResult);
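 /* For reference, the semantics being reproduced are roughly the following (illustrative
    sketch only, not the actual IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL definition):
        fEfl &= ~X86_EFL_STATUS_BITS;
        if (!uResult)                          fEfl |= X86_EFL_ZF;
        if (uResult & RT_BIT_64(cOpBits - 1))  fEfl |= X86_EFL_SF;
        if (low 8 result bits have even parity) fEfl |= X86_EFL_PF;
    with CF and OF left cleared and AF (architecturally undefined) left cleared as well. */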
569
570#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
571 /*
572 * See if we can skip this wholesale.
573 */
574 PCIEMLIVENESSENTRY const pLivenessEntry = &pReNative->paLivenessEntries[pReNative->idxCurCall];
575 uint64_t const fEflClobbered = IEMLIVENESS_STATE_GET_WILL_BE_CLOBBERED_SET(pLivenessEntry)
576 & IEMLIVENESSBIT_STATUS_EFL_MASK;
577# ifdef IEMNATIVE_WITH_EFLAGS_POSTPONING
578 uint64_t fEflPostponing;
579# endif
580 if ( fEflClobbered == IEMLIVENESSBIT_STATUS_EFL_MASK
581 && !(pReNative->fMc & IEM_MC_F_WITH_FLAGS))
582 {
583 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflSkippedLogical);
584 pReNative->fSkippingEFlags = X86_EFL_STATUS_BITS;
585# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
586 off = iemNativeEmitOrImmIntoVCpuU32(pReNative, off, X86_EFL_STATUS_BITS, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
587# endif
588 Log5(("EFLAGS: Skipping %#x - iemNativeEmitEFlagsForLogical\n", X86_EFL_STATUS_BITS));
589 return off;
590 }
591# ifdef IEMNATIVE_WITH_EFLAGS_POSTPONING
592 if ( ( (fEflPostponing = IEMLIVENESS_STATE_GET_CAN_BE_POSTPONED_SET(pLivenessEntry) & IEMLIVENESSBIT_STATUS_EFL_MASK)
593 | fEflClobbered)
594 == IEMLIVENESSBIT_STATUS_EFL_MASK
595 && idxRegResult != UINT8_MAX)
596 {
597 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflPostponedLogical);
598 pReNative->PostponedEfl.fEFlags = X86_EFL_STATUS_BITS;
599 pReNative->PostponedEfl.enmOp = kIemNativePostponedEflOp_Logical;
600 pReNative->PostponedEfl.cOpBits = cOpBits;
601 pReNative->PostponedEfl.idxReg1 = iemNativeRegAllocTmpEx(pReNative, &off, IEMNATIVE_POSTPONING_REG_MASK, false);
602 /** @todo it would normally be possible to use idxRegResult, iff it is
603 * already a non-volatile register and we can be sure the caller
604 * doesn't modify it. That'll save a register move and allocation. */
605 off = iemNativeEmitLoadGprFromGpr(pReNative, off, pReNative->PostponedEfl.idxReg1, idxRegResult);
606 Log5(("EFLAGS: Postponing %#x op=%u bits=%u reg1=%u - iemNativeEmitEFlagsForLogical\n", X86_EFL_STATUS_BITS,
607 kIemNativePostponedEflOp_Logical, cOpBits, pReNative->PostponedEfl.idxReg1));
608 }
609# endif
610 else
611#endif
612 {
613 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
614 uint8_t const idxRegTmp = iemNativeRegAllocTmp(pReNative, &off);
615#ifdef RT_ARCH_AMD64
616 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 32);
617#elif defined(RT_ARCH_ARM64)
618 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 16);
619#else
620# error "port me"
621#endif
622 off = iemNativeEmitPostponedEFlagsCalcLogical<a_fNeedToSetFlags>(pCodeBuf, off, cOpBits, idxRegResult,
623 idxRegEfl, idxRegTmp);
624 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
625
626 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
627 iemNativeRegFreeTmp(pReNative, idxRegTmp);
628 }
629
630#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
631 if (pReNative->fSkippingEFlags)
632 Log5(("EFLAGS: fSkippingEFlags %#x -> 0 (iemNativeEmitEFlagsForLogical)\n", pReNative->fSkippingEFlags));
633 pReNative->fSkippingEFlags = 0;
634# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
635 off = iemNativeEmitStoreImmToVCpuU32(pReNative, off, 0, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
636# endif
637#endif
638 return off;
639}
640
641
642/**
643 * This is an implementation of IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC.
644 *
645 * It takes liveness stuff into account.
646 */
647DECL_FORCE_INLINE_THROW(uint32_t)
648iemNativeEmitEFlagsForArithmetic(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarEfl, uint8_t idxRegEflIn
649#ifndef RT_ARCH_AMD64
650 , uint8_t cOpBits, uint8_t idxRegResult, uint8_t idxRegDstIn, uint8_t idxRegSrc
651 , bool fInvertCarry, uint64_t uImmSrc
652#endif
653 )
654{
655 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflTotalArithmetic);
656 IEMNATIVE_CLEAR_POSTPONED_EFLAGS(pReNative, X86_EFL_STATUS_BITS);
657
658#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
659 /*
660 * See if we can skip this wholesale.
661 */
662 PCIEMLIVENESSENTRY const pLivenessEntry = &pReNative->paLivenessEntries[pReNative->idxCurCall];
663 if ( IEMLIVENESS_STATE_ARE_STATUS_EFL_TO_BE_CLOBBERED(pLivenessEntry)
664 && !(pReNative->fMc & IEM_MC_F_WITH_FLAGS))
665 {
666 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflSkippedArithmetic);
667 pReNative->fSkippingEFlags = X86_EFL_STATUS_BITS;
668 Log5(("EFLAGS: Skipping %#x - iemNativeEmitEFlagsForArithmetic\n", X86_EFL_STATUS_BITS));
669# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
670 off = iemNativeEmitOrImmIntoVCpuU32(pReNative, off, X86_EFL_STATUS_BITS, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
671# endif
672 }
673 else
674#endif
675 {
676#ifdef RT_ARCH_AMD64
677 /*
678 * Collect flags and merge them with eflags.
679 */
680 PIEMNATIVEINSTR pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
681 /* pushf - do this before any reg allocations as they may emit instructions too. */
682 pCodeBuf[off++] = 0x9c;
683
684 uint8_t const idxRegEfl = idxRegEflIn != UINT8_MAX ? idxRegEflIn
685 : iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
686 uint8_t const idxTmpReg = iemNativeRegAllocTmp(pReNative, &off);
687 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2 + 7 + 7 + 3);
688 /* pop tmp */
689 if (idxTmpReg >= 8)
690 pCodeBuf[off++] = X86_OP_REX_B;
691 pCodeBuf[off++] = 0x58 + (idxTmpReg & 7);
692 /* Isolate the flags we want. */
693 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxTmpReg, X86_EFL_STATUS_BITS);
694 /* Clear the status bits in EFLs. */
695 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegEfl, ~X86_EFL_STATUS_BITS);
696 /* OR in the flags we collected. */
697 off = iemNativeEmitOrGpr32ByGprEx(pCodeBuf, off, idxRegEfl, idxTmpReg);
698 if (idxRegEflIn != idxRegEfl)
699 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
700 iemNativeRegFreeTmp(pReNative, idxTmpReg);
701
702#elif defined(RT_ARCH_ARM64)
703 /*
704 * Calculate flags.
705 */
706 uint8_t const idxRegEfl = idxRegEflIn != UINT8_MAX ? idxRegEflIn
707 : iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
708 uint8_t const idxTmpReg = iemNativeRegAllocTmp(pReNative, &off);
709 uint8_t const idxTmpReg2 = cOpBits >= 32 ? UINT8_MAX : iemNativeRegAllocTmp(pReNative, &off);
710 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 20);
711
712 /* Invert CF (stored inverted on ARM) and load the flags into the temporary register. */
713 if (fInvertCarry)
714 pCodeBuf[off++] = ARMV8_A64_INSTR_CFINV;
715 pCodeBuf[off++] = Armv8A64MkInstrMrs(idxTmpReg, ARMV8_AARCH64_SYSREG_NZCV); /* Bits: 31=N; 30=Z; 29=C; 28=V; */
716
717 if (cOpBits >= 32)
718 {
719 /* V -> OF */
720 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, 28);
721 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_OF_BIT, 1, false /*f64Bit*/);
722
723 /* C -> CF */
724 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, 1);
725 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_CF_BIT, 1, false /*f64Bit*/);
726 }
727
728 /* N,Z -> SF,ZF */
729 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, cOpBits >= 32 ? 1 : 30);
730 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_ZF_BIT, 2, false /*f64Bit*/);
731
732 /* For ADC and SBB we have to calculate overflow and carry ourselves. */
733 if (cOpBits < 32)
734 {
735 /* Since the carry flag is the zeroth flag, we just use BFXIL to copy it over. */
736 AssertCompile(X86_EFL_CF_BIT == 0);
737 pCodeBuf[off++] = Armv8A64MkInstrBfxil(idxRegEfl, idxRegResult, cOpBits, 1, false /*f64Bit*/);
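 /* BFXIL takes bit cOpBits of the full-width result - the unsigned carry/borrow out of the
    narrow 8/16-bit operation - and inserts it into bit 0 (CF) of idxRegEfl. */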
738
739 /* The overflow flag is more work as we have to compare the signed bits for
740 both inputs and the result. See IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC.
741
742 Formula: ~(a_uDst ^ a_uSrcOf) & (a_uResult ^ a_uDst)
743 With a_uSrcOf as a_uSrc for additions and ~a_uSrc for subtractions.
744
745 It is a bit simpler when the right (source) side is constant:
746 adc: S D R -> OF sbb: S D R -> OF
747 0 0 0 -> 0 \ 0 0 0 -> 0 \
748 0 0 1 -> 1 \ 0 0 1 -> 0 \
749 0 1 0 -> 0 / and not(D), R 0 1 0 -> 1 / and D, not(R)
750 0 1 1 -> 0 / 0 1 1 -> 0 /
751 1 0 0 -> 0 \ 1 0 0 -> 0 \
752 1 0 1 -> 0 \ and D, not(R) 1 0 1 -> 1 \ and not(D), R
753 1 1 0 -> 1 / 1 1 0 -> 0 /
754 1 1 1 -> 0 / 1 1 1 -> 0 / */
755 if (idxRegSrc != UINT8_MAX)
756 {
757 if (fInvertCarry) /* sbb: ~((a_uDst) ^ ~(a_uSrcOf)) -> (a_uDst) ^ (a_uSrcOf); HACK ALERT: fInvertCarry == sbb */
758 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxRegDstIn, idxRegSrc, false);
759 else /* adc: ~((a_uDst) ^ (a_uSrcOf)) -> (a_uDst) ^ ~(a_uSrcOf) */
760 pCodeBuf[off++] = Armv8A64MkInstrEon(idxTmpReg, idxRegDstIn, idxRegSrc, false);
761 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg2, idxRegDstIn, idxRegResult, false); /* (a_uDst) ^ (a_uResult) */
762 pCodeBuf[off++] = Armv8A64MkInstrAnd(idxTmpReg, idxTmpReg, idxTmpReg2, false /*f64Bit*/);
763 }
764 else if (uImmSrc & RT_BIT_32(cOpBits - 1))
765 {
766 if (fInvertCarry) /* HACK ALERT: fInvertCarry == sbb */
767 pCodeBuf[off++] = Armv8A64MkInstrBic(idxTmpReg, idxRegResult, idxRegDstIn, false);
768 else
769 pCodeBuf[off++] = Armv8A64MkInstrBic(idxTmpReg, idxRegDstIn, idxRegResult, false);
770 }
771 else
772 {
773 if (fInvertCarry) /* HACK ALERT: fInvertCarry == sbb */
774 pCodeBuf[off++] = Armv8A64MkInstrBic(idxTmpReg, idxRegDstIn, idxRegResult, false);
775 else
776 pCodeBuf[off++] = Armv8A64MkInstrBic(idxTmpReg, idxRegResult, idxRegDstIn, false);
777 }
778 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, cOpBits - 1, false /*f64Bit*/);
779 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_OF_BIT, 1);
780 iemNativeRegFreeTmp(pReNative, idxTmpReg2);
781 }
782
783 /* Calculate 8-bit parity of the result. */
784 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxRegResult, idxRegResult, false /*f64Bit*/,
785 4 /*offShift6*/, kArmv8A64InstrShift_Lsr);
786 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxTmpReg, idxTmpReg, false /*f64Bit*/,
787 2 /*offShift6*/, kArmv8A64InstrShift_Lsr);
788 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxTmpReg, idxTmpReg, false /*f64Bit*/,
789 1 /*offShift6*/, kArmv8A64InstrShift_Lsr);
790 Assert(Armv8A64ConvertImmRImmS2Mask32(0, 0) == 1);
791 pCodeBuf[off++] = Armv8A64MkInstrEorImm(idxTmpReg, idxTmpReg, 0, 0, false /*f64Bit*/);
792 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_PF_BIT, 1, false /*f64Bit*/);
793
794 /* Calculate auxiliary carry/borrow. This is related to 8-bit BCD.
795 General formula: ((uint32_t)(a_uResult) ^ (uint32_t)(a_uSrc) ^ (uint32_t)(a_uDst)) & X86_EFL_AF;
796 S D R
797 0 0 0 -> 0; \
798 0 0 1 -> 1; \ regular
799 0 1 0 -> 1; / xor R, D
800 0 1 1 -> 0; /
801 1 0 0 -> 1; \
802 1 0 1 -> 0; \ invert one of the two
803 1 1 0 -> 0; / xor not(R), D
804 1 1 1 -> 1; /
805 a_uSrc[bit 4]=0: ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF;
806 a_uSrc[bit 4]=1: ((uint32_t)~(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF;
807 */
808
809 if (idxRegSrc != UINT8_MAX)
810 {
811 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxRegDstIn, idxRegSrc, false /*f64Bit*/);
812 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxTmpReg, idxRegResult, false /*f64Bit*/);
813 }
814 else if (uImmSrc & X86_EFL_AF)
815 pCodeBuf[off++] = Armv8A64MkInstrEon(idxTmpReg, idxRegDstIn, idxRegResult, false /*f64Bit*/);
816 else
817 pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxRegDstIn, idxRegResult, false /*f64Bit*/);
818 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, X86_EFL_AF_BIT, false /*f64Bit*/);
819 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_AF_BIT, 1, false /*f64Bit*/);
820
821 if (idxRegEflIn != idxRegEfl)
822 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
823 iemNativeRegFreeTmp(pReNative, idxTmpReg);
824
825#else
826# error "port me"
827#endif
828 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
829
830#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
831 if (pReNative->fSkippingEFlags)
832 Log5(("EFLAGS: fSkippingEFlags %#x -> 0 (iemNativeEmitEFlagsForArithmetic)\n", pReNative->fSkippingEFlags));
833 pReNative->fSkippingEFlags = 0;
834# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
835 off = iemNativeEmitStoreImmToVCpuU32(pReNative, off, 0, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
836# endif
837#endif
838 }
839 return off;
840
841}
842
843
844
845/*********************************************************************************************************************************
846* Bitwise Logical Operations *
847*********************************************************************************************************************************/
848
849/**
850 * The AND instruction will clear OF, CF and AF (latter is undefined) and
851 * set the other flags according to the result.
852 */
853DECL_INLINE_THROW(uint32_t)
854iemNativeEmit_and_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
855 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
856{
857 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
858 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
859#ifdef RT_ARCH_AMD64
860 /* On AMD64 we just use the correctly sized AND instruction to harvest the EFLAGS. */
861 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
862 0x22, 0x23, cOpBits, idxRegDst, idxRegSrc);
863 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
864 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
865
866 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
867
868#elif defined(RT_ARCH_ARM64)
869 /* On ARM64 we use 32-bit AND for the 8-bit and 16-bit ones. */
870 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
871 pCodeBuf[off++] = Armv8A64MkInstrAnds(idxRegDst, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/);
872 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
873 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
874
875 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
876#else
877# error "Port me"
878#endif
879 iemNativeVarRegisterRelease(pReNative, idxVarDst);
880 return off;
881}
882
883
884/**
885 * The AND instruction with immediate value as right operand.
886 */
887template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
888DECL_INLINE_THROW(uint32_t)
889iemNativeEmit_and_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
890{
891 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
892#ifdef RT_ARCH_AMD64
893 /* On AMD64 we just use the correctly sized AND instruction to harvest the EFLAGS. */
894 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
895 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 4, idxRegDst, uImmOp);
896 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
897
898 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
899
900#elif defined(RT_ARCH_ARM64)
901 /* On ARM64 we use 32-bit AND for the 8-bit and 16-bit ones, and of
902 course the immediate variant when possible to save a register load. */
903 uint32_t uImmSizeLen, uImmRotations;
904 if ( a_cOpBits > 32
905 ? Armv8A64ConvertMask64ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations)
906 : Armv8A64ConvertMask32ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations))
907 {
908 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
909 if (a_cOpBits >= 32)
910 pCodeBuf[off++] = Armv8A64MkInstrAndsImm(idxRegDst, idxRegDst, uImmSizeLen, uImmRotations, a_cOpBits > 32 /*f64Bit*/);
911 else
912 pCodeBuf[off++] = Armv8A64MkInstrAndImm(idxRegDst, idxRegDst, uImmSizeLen, uImmRotations, a_cOpBits > 32 /*f64Bit*/);
913 }
914 else
915 {
916 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
917 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
918 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
919 pCodeBuf[off++] = Armv8A64MkInstrAnds(idxRegDst, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/);
920 else
921 pCodeBuf[off++] = Armv8A64MkInstrAnd(idxRegDst, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/);
922 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
923 }
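 /* For instance, a mask like 0x0ff0 (one contiguous run of set bits) is representable as an
    ARM64 bitmask immediate and takes the first path, whereas something like 0x1234 is not and
    falls back to loading the immediate into a temporary register. */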
924 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
925
926 off = iemNativeEmitEFlagsForLogical<a_cOpBits < 32>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
927
928#else
929# error "Port me"
930#endif
931 iemNativeVarRegisterRelease(pReNative, idxVarDst);
932 return off;
933}
934
935
936/**
937 * The TEST instruction will clear OF, CF and AF (latter is undefined) and
938 * set the other flags according to the result.
939 */
940DECL_INLINE_THROW(uint32_t)
941iemNativeEmit_test_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
942 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
943{
944 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
945 uint8_t const idxRegSrc = idxVarSrc == idxVarDst ? idxRegDst /* special case of 'test samereg,samereg' */
946 : iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
947#ifdef RT_ARCH_AMD64
948 /* On AMD64 we just use the correctly sized TEST instruction to harvest the EFLAGS. */
949 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
950 0x84, 0x85, cOpBits, idxRegSrc, idxRegDst);
951 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
952
953#elif defined(RT_ARCH_ARM64)
954 /* On ARM64 we use 32-bit AND for the 8-bit and 16-bit ones. We also
955 need to keep the result in order to calculate the flags. */
956 uint8_t const idxRegResult = iemNativeRegAllocTmp(pReNative, &off);
957 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
958 if (cOpBits >= 32)
959 pCodeBuf[off++] = Armv8A64MkInstrAnds(idxRegResult, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/);
960 else
961 pCodeBuf[off++] = Armv8A64MkInstrAnd(idxRegResult, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/);
962 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
963
964#else
965# error "Port me"
966#endif
967 if (idxVarSrc != idxVarDst)
968 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
969 iemNativeVarRegisterRelease(pReNative, idxVarDst);
970
971#ifdef RT_ARCH_AMD64
972 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, cOpBits, UINT8_MAX);
973#else
974 if (cOpBits >= 32)
975 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, cOpBits, idxRegResult);
976 else
977 off = iemNativeEmitEFlagsForLogical<true>(pReNative, off, idxVarEfl, cOpBits, idxRegResult);
978 iemNativeRegFreeTmp(pReNative, idxRegResult);
979#endif
980 return off;
981}
982
983
984/**
985 * The TEST instruction with immediate value as right operand.
986 */
987template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
988DECL_INLINE_THROW(uint32_t)
989iemNativeEmit_test_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
990{
991 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
992#ifdef RT_ARCH_AMD64
993 /* On AMD64 we just use the correctly sized TEST instruction to harvest the EFLAGS. */
994 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
995 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0xf6, 0xcc, 0xf7, a_cOpBits, a_cImmBits, 0, idxRegDst, uImmOp);
996 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
997 iemNativeVarRegisterRelease(pReNative, idxVarDst);
998
999 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, UINT8_MAX);
1000
1001#elif defined(RT_ARCH_ARM64)
1002 /* On ARM64 we use 32-bit AND for the 8-bit and 16-bit ones, and of
1003 course the immediate variant when possible to save a register load.
1004 We also need to keep the result in order to calculate the flags. */
1005 uint8_t const idxRegResult = iemNativeRegAllocTmp(pReNative, &off);
1006 uint32_t uImmSizeLen, uImmRotations;
1007 if ( a_cOpBits > 32
1008 ? Armv8A64ConvertMask64ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations)
1009 : Armv8A64ConvertMask32ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations))
1010 {
1011 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1012 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1013 pCodeBuf[off++] = Armv8A64MkInstrAndsImm(idxRegResult, idxRegDst, uImmSizeLen, uImmRotations, a_cOpBits > 32 /*f64Bit*/);
1014 else
1015 pCodeBuf[off++] = Armv8A64MkInstrAndImm(idxRegResult, idxRegDst, uImmSizeLen, uImmRotations, a_cOpBits > 32 /*f64Bit*/);
1016 }
1017 else
1018 {
1019 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1020 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1021 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1022 pCodeBuf[off++] = Armv8A64MkInstrAnds(idxRegResult, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/);
1023 else
1024 pCodeBuf[off++] = Armv8A64MkInstrAnd(idxRegResult, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/);
1025 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1026 }
1027 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1028 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1029
1030 off = iemNativeEmitEFlagsForLogical<a_cOpBits < 32>(pReNative, off, idxVarEfl, a_cOpBits, idxRegResult);
1031
1032 iemNativeRegFreeTmp(pReNative, idxRegResult);
1033
1034#else
1035# error "Port me"
1036#endif
1037 return off;
1038}
1039
1040
1041/**
1042 * The OR instruction will clear OF, CF and AF (latter is undefined) and
1043 * set the other flags according to the result.
1044 */
1045DECL_INLINE_THROW(uint32_t)
1046iemNativeEmit_or_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1047 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1048{
1049 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1050 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1051#ifdef RT_ARCH_AMD64
1052 /* On AMD64 we just use the correctly sized OR instruction to harvest the EFLAGS. */
1053 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1054 0x0a, 0x0b, cOpBits, idxRegDst, idxRegSrc);
1055 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1056 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1057
1058 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
1059
1060#elif defined(RT_ARCH_ARM64)
1061 /* On ARM64 we use 32-bit OR for the 8-bit and 16-bit ones. */
1062 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1063 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDst, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/);
1064 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1065 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1066
1067 off = iemNativeEmitEFlagsForLogical<true>(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
1068
1069#else
1070# error "Port me"
1071#endif
1072 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1073 return off;
1074}
1075
1076
1077/**
1078 * The OR instruction with immediate value as right operand.
1079 */
1080template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1081DECL_INLINE_THROW(uint32_t)
1082iemNativeEmit_or_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1083{
1084 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1085#ifdef RT_ARCH_AMD64
1086 /* On AMD64 we just use the correctly sized OR instruction to harvest the EFLAGS. */
1087 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1088 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 1, idxRegDst, uImmOp);
1089 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1090
1091 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1092
1093#elif defined(RT_ARCH_ARM64)
1094 /* On ARM64 we use 32-bit OR for the 8-bit and 16-bit ones, and of
1095 course the immediate variant when possible to save a register load. */
1096 uint32_t uImmSizeLen, uImmRotations;
1097 if ( a_cOpBits > 32
1098 ? Armv8A64ConvertMask64ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations)
1099 : Armv8A64ConvertMask32ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations))
1100 {
1101 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1102 pCodeBuf[off++] = Armv8A64MkInstrOrrImm(idxRegDst, idxRegDst, uImmSizeLen, uImmRotations, a_cOpBits > 32 /*f64Bit*/);
1103 }
1104 else
1105 {
1106 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1107 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1108 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDst, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/);
1109 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1110 }
1111 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1112
1113 off = iemNativeEmitEFlagsForLogical<true>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1114
1115#else
1116# error "Port me"
1117#endif
1118 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1119 return off;
1120}
1121
1122
1123/**
1124 * The XOR instruction will clear OF, CF and AF (latter is undefined) and
1125 * set the other flags according to the result.
1126 */
1127DECL_INLINE_THROW(uint32_t)
1128iemNativeEmit_xor_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1129 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1130{
1131 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1132 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1133#ifdef RT_ARCH_AMD64
1134 /* On AMD64 we just use the correctly sized XOR instruction to harvest the EFLAGS. */
1135 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1136 0x32, 0x33, cOpBits, idxRegDst, idxRegSrc);
1137 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1138 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1139
1140 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
1141
1142#elif defined(RT_ARCH_ARM64)
1143 /* On ARM64 we use 32-bit EOR (XOR) for the 8-bit and 16-bit ones. */
1144 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1145 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegDst, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/);
1146 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1147 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1148
1149 off = iemNativeEmitEFlagsForLogical<true>(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
1150
1151#else
1152# error "Port me"
1153#endif
1154 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1155 return off;
1156}
1157
1158
1159/**
1160 * The XOR instruction with immediate value as right operand.
1161 */
1162template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1163DECL_INLINE_THROW(uint32_t)
1164iemNativeEmit_xor_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1165{
1166 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1167#ifdef RT_ARCH_AMD64
1168 /* On AMD64 we just use the correctly sized XOR instruction to harvest the EFLAGS. */
1169 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1170 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 6, idxRegDst, uImmOp);
1171 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1172
1173 off = iemNativeEmitEFlagsForLogical<false>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1174
1175#elif defined(RT_ARCH_ARM64)
1176 /* On ARM64 we use 32-bit EOR (XOR) for the 8-bit and 16-bit ones, and of
1177 course the immediate variant when possible to save a register load. */
1178 uint32_t uImmSizeLen, uImmRotations;
1179 if ( a_cOpBits > 32
1180 ? Armv8A64ConvertMask64ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations)
1181 : Armv8A64ConvertMask32ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations))
1182 {
1183 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1184 pCodeBuf[off++] = Armv8A64MkInstrEorImm(idxRegDst, idxRegDst, uImmSizeLen, uImmRotations, a_cOpBits > 32 /*f64Bit*/);
1185 }
1186 else
1187 {
1188 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1189 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1190 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegDst, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/);
1191 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1192 }
1193 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1194
1195 off = iemNativeEmitEFlagsForLogical<true>(pReNative, off, idxVarEfl, a_cOpBits, idxRegDst);
1196
1197#else
1198# error "Port me"
1199#endif
1200 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1201 return off;
1202}
1203
1204
1205
1206/*********************************************************************************************************************************
1207* ADD, ADC, SUB, SBB, CMP *
1208*********************************************************************************************************************************/
1209
1210/**
1211 * The ADD instruction will set all status flags.
1212 */
1213DECL_INLINE_THROW(uint32_t)
1214iemNativeEmit_add_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1215 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1216{
1217 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1218 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1219
1220#ifdef RT_ARCH_AMD64
1221 /* On AMD64 we just use the correctly sized ADD instruction to get the right EFLAGS values. */
1222 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1223 0x02, 0x03, cOpBits, idxRegDst, idxRegSrc);
1224 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1225
1226 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1227 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1228
1229 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1230
1231#elif defined(RT_ARCH_ARM64)
1232 /* On ARM64 we'll need the two input operands as well as the result in order
1233 to calculate the right flags, even if we use ADDS and translate NZCV into
1234 OF, CF, ZF and SF. */
1235 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1236 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4);
1237 if (cOpBits >= 32)
1238 {
1239 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1240 pCodeBuf[off++] = Armv8A64MkInstrAddReg(idxRegDst, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1241 }
1242 else
1243 {
1244 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1245 uint32_t const cShift = 32 - cOpBits;
1246 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDstIn, ARMV8_A64_REG_XZR, idxRegDst, false /*f64Bit*/, cShift);
1247 pCodeBuf[off++] = Armv8A64MkInstrAddReg(idxRegDst, idxRegDstIn, idxRegSrc, false /*f64Bit*/,
1248 true /*fSetFlags*/, cShift);
1249 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDstIn, idxRegDstIn, cShift, false /*f64Bit*/);
1250 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDst, idxRegDst, cShift, false /*f64Bit*/);
1251 cOpBits = 32;
1252 }
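 /* E.g. for an 8-bit ADD both operands end up in bits 24..31, so the 32-bit ADDS produces
    NZCV matching the 8-bit operation; the input copy and the result are then shifted back
    down and cOpBits is forced to 32 for the flag calculation below. */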
1253 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1254
1255 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, cOpBits, idxRegDst,
1256 idxRegDstIn, idxRegSrc, false /*fInvertCarry*/, 0);
1257
1258 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1259 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1260 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1261
1262#else
1263# error "port me"
1264#endif
1265 return off;
1266}
1267
1268
1269/**
1270 * The ADD instruction with immediate value as right operand.
1271 */
1272template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1273DECL_INLINE_THROW(uint32_t)
1274iemNativeEmit_add_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1275{
1276 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1277
1278#ifdef RT_ARCH_AMD64
1279 /* On AMD64 we just use the correctly sized ADD instruction to get the right EFLAGS values. */
1280 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1281 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 0, idxRegDst, uImmOp);
1282 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1283
1284 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1285
1286 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1287
1288#elif defined(RT_ARCH_ARM64)
1289 /* On ARM64 we'll need the two input operands as well as the result in order
1290 to calculate the right flags, even if we use ADDS and translate NZCV into
1291 OF, CF, ZF and SF. */
1292 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1293 PIEMNATIVEINSTR pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1294 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1295 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1296 {
1297 if (uImmOp <= 0xfffU)
1298 pCodeBuf[off++] = Armv8A64MkInstrAddUImm12(idxRegDst, idxRegDst, uImmOp, a_cOpBits > 32 /*f64Bit*/,
1299 true /*fSetFlags*/);
1300 else if (uImmOp <= 0xfff000U && !(uImmOp & 0xfff))
1301 pCodeBuf[off++] = Armv8A64MkInstrAddUImm12(idxRegDst, idxRegDst, uImmOp >> 12, a_cOpBits > 32 /*f64Bit*/,
1302 true /*fSetFlags*/, true /*fShift12*/);
1303 else
1304 {
1305 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1306 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1307 pCodeBuf[off++] = Armv8A64MkInstrAddReg(idxRegDst, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/,
1308 true /*fSetFlags*/);
1309 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1310 }
1311 }
1312 else
1313 {
1314 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1315 uint32_t const cShift = 32 - a_cOpBits;
1316 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp << cShift);
1317 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2);
1318 pCodeBuf[off++] = Armv8A64MkInstrAddReg(idxRegDst, idxRegTmpImm, idxRegDstIn, false /*f64Bit*/, true /*fSetFlags*/, cShift);
1319 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDst, idxRegDst, cShift, false /*f64Bit*/);
1320 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1321 }
1322 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1323
1324 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, a_cOpBits > 32 ? a_cOpBits : 32, idxRegDst,
1325 idxRegDstIn, UINT8_MAX, false /*fInvertCarry*/, uImmOp);
1326
1327 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1328 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1329
1330#else
1331# error "port me"
1332#endif
1333 return off;
1334}
1335
1336
1337/**
1338 * The ADC instruction takes CF as input and will set all status flags.
1339 */
1340DECL_INLINE_THROW(uint32_t)
1341iemNativeEmit_adc_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1342 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1343{
1344 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1345 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1346 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
1347
1348#ifdef RT_ARCH_AMD64
1349 /* On AMD64 we use BT to set EFLAGS.CF and then issue an ADC instruction
1350 with matching size to get the correct flags. */
1351 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 9);
1352
1353 /* Use the BT instruction to set CF according to idxRegEfl. */
1354 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b, 0xba, 32 /*cOpBits*/, 4, idxRegEfl);
1355 pCodeBuf[off++] = X86_EFL_CF_BIT;
1356
1357 off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0x12, 0x13, cOpBits, idxRegDst, idxRegSrc);
1358 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1359
1360 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1361 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1362
1363 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl);
1364
1365#elif defined(RT_ARCH_ARM64)
1366 /* On ARM64 we use the RMIF instruction to load PSTATE.CF from idxRegEfl and
1367 then ADCS for the calculation. We need all inputs and result for the two
1368 flags (AF,PF) that can't be directly derived from PSTATE.NZCV. */
1369 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1370 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 7);
1371
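    /* Note on the RMIF usage: RMIF rotates the source right by the immediate and
       inserts bits 3:0 of the rotated value into N,Z,C,V under the given mask.
       Rotating EFLAGS right by 63 (i.e. left by one) moves CF (bit 0) up into
       bit 1, and the mask RT_BIT_32(1) selects PSTATE.C, so the net effect is
       PSTATE.C = EFLAGS.CF. */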
1372 pCodeBuf[off++] = Armv8A64MkInstrRmif(idxRegEfl, (X86_EFL_CF_BIT - 1) & 63, RT_BIT_32(1) /*fMask=C*/);
1373 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1374 if (cOpBits >= 32)
1375 pCodeBuf[off++] = Armv8A64MkInstrAdcs(idxRegDst, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/);
1376 else
1377 {
1378 /* Since we're also adding in the carry flag here, shifting operands up
1379 doesn't work. So, we have to calculate carry & overflow manually. */
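        /* (The shift-up trick fails here because the incoming CF would have to be
           added at bit cShift rather than at bit 0.  SETF8/SETF16 below fixes up
           N and Z for the narrow width; C and V are presumably recomputed by the
           EFLAGS helper from the saved input value and the result.) */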
1380 pCodeBuf[off++] = Armv8A64MkInstrAdc(idxRegDst, idxRegDst, idxRegSrc, false /*f64Bit*/);
1381 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegDst, cOpBits > 8); /* NZ are okay, CV aren't.*/
1382 }
1383 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1384
1385 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl, cOpBits, idxRegDst,
1386 idxRegDstIn, idxRegSrc, false /*fInvertCarry*/, 0);
1387
1388 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1389 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1390 if (cOpBits < 32)
1391 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegDst, RT_BIT_32(cOpBits) - 1U);
1392 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1393
1394#else
1395# error "port me"
1396#endif
1397 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
1398 return off;
1399}
1400
1401
1402/**
1403 * The ADC instruction with immediate value as right operand.
1404 */
1405template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1406DECL_INLINE_THROW(uint32_t)
1407iemNativeEmit_adc_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1408{
1409 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1410 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
1411
1412#ifdef RT_ARCH_AMD64
1413 /* On AMD64 we use BT to set EFLAGS.CF and then issue an ADC instruction
1414 with matching size to get the correct flags. */
1415 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 12);
1416
1417 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b, 0xba, 32 /*cOpBits*/, 4, idxRegEfl);
1418 pCodeBuf[off++] = X86_EFL_CF_BIT;
1419
1420 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 2, idxRegDst, uImmOp);
1421 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1422
1423 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1424
1425 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl);
1426
1427#elif defined(RT_ARCH_ARM64)
1428 /* On ARM64 we use the RMIF instruction to load PSTATE.CF from idxRegEfl
1429 and then ADCS for the calculation. We need all inputs and result for
1430 the two flags (AF,PF) that can't be directly derived from PSTATE.NZCV. */
1431 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1432 uint8_t const idxRegImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1433 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4);
1434
1435 pCodeBuf[off++] = Armv8A64MkInstrRmif(idxRegEfl, (X86_EFL_CF_BIT - 1) & 63, RT_BIT_32(1) /*fMask=C*/);
1436 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1437 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1438 pCodeBuf[off++] = Armv8A64MkInstrAdcs(idxRegDst, idxRegDst, idxRegImm, a_cOpBits > 32 /*f64Bit*/);
1439 else
1440 {
1441 /* Since we're also adding in the carry flag here, shifting operands up
1442 doesn't work. So, we have to calculate carry & overflow manually. */
1443 pCodeBuf[off++] = Armv8A64MkInstrAdc(idxRegDst, idxRegDst, idxRegImm, false /*f64Bit*/);
1444 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegDst, a_cOpBits > 8); /* NZ are okay, CV aren't.*/
1445 }
1446 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1447
1448 iemNativeRegFreeTmp(pReNative, idxRegImm);
1449
1450 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl, a_cOpBits, idxRegDst,
1451 idxRegDstIn, UINT8_MAX, false /*fInvertCarry*/, uImmOp);
1452
1453 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1454 if RT_CONSTEXPR_IF(a_cOpBits < 32)
1455 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegDst, RT_BIT_32(a_cOpBits) - 1U);
1456 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1457
1458#else
1459# error "port me"
1460#endif
1461 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
1462 return off;
1463}
1464
1465
1466/**
1467 * The SUB instruction will set all status flags.
1468 */
1469DECL_INLINE_THROW(uint32_t)
1470iemNativeEmit_sub_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1471 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1472{
1473 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1474 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1475
1476#ifdef RT_ARCH_AMD64
1477 /* On AMD64 we just use the correctly sized SUB instruction to get the right EFLAGS.SF value. */
1478 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1479 0x2a, 0x2b, cOpBits, idxRegDst, idxRegSrc);
1480 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1481
1482 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1483 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1484
1485 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1486
1487#elif defined(RT_ARCH_ARM64)
1488 /* On ARM64 we'll need the two input operands as well as the result in order
1489 to calculate the right flags, even though we use SUBS and translate NZCV
1490 into OF, CF, ZF and SF. */
1491 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1492 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4);
1493 if (cOpBits >= 32)
1494 {
1495 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1496 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegDst, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1497 }
1498 else
1499 {
1500 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1501 uint32_t const cShift = 32 - cOpBits;
1502 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDstIn, ARMV8_A64_REG_XZR, idxRegDst, false /*f64Bit*/, cShift);
1503 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegDst, idxRegDstIn, idxRegSrc, false /*f64Bit*/,
1504 true /*fSetFlags*/, cShift);
1505 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDstIn, idxRegDstIn, cShift, false /*f64Bit*/);
1506 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDst, idxRegDst, cShift, false /*f64Bit*/);
1507 cOpBits = 32;
1508 }
1509 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1510
1511 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, cOpBits, idxRegDst,
1512 idxRegDstIn, idxRegSrc, true /*fInvertCarry*/, 0);
1513
1514 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1515 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1516 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1517
1518#else
1519# error "port me"
1520#endif
1521 return off;
1522}
1523
1524
1525/**
1526 * The SUB instruction with immediate value as right operand.
1527 */
1528template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1529DECL_INLINE_THROW(uint32_t)
1530iemNativeEmit_sub_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1531{
1532 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1533
1534#ifdef RT_ARCH_AMD64
1535 /* On AMD64 we just use the correctly sized SUB instruction to get the right EFLAGS.SF value. */
1536 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1537 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 5, idxRegDst, uImmOp);
1538 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1539
1540 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1541
1542 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1543
1544#elif defined(RT_ARCH_ARM64)
1545 /* On ARM64 we'll need the two input operands as well as the result in order
1546 to calculate the right flags, even though we use SUBS and translate NZCV
1547 into OF, CF, ZF and SF. */
1548 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1549 PIEMNATIVEINSTR pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1550 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1551 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1552 {
1553 if (uImmOp <= 0xfffU)
1554 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegDst, idxRegDst, uImmOp, a_cOpBits > 32 /*f64Bit*/,
1555 true /*fSetFlags*/);
1556 else if (uImmOp <= 0xfff000U && !(uImmOp & 0xfff))
1557 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegDst, idxRegDst, uImmOp >> 12, a_cOpBits > 32 /*f64Bit*/,
1558 true /*fSetFlags*/, true /*fShift12*/);
1559 else
1560 {
1561 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1562 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1563 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegDst, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/,
1564 true /*fSetFlags*/);
1565 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1566 }
1567 }
1568 else
1569 {
1570 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1571 uint32_t const cShift = 32 - a_cOpBits;
1572 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1573 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4);
1574 pCodeBuf[off++] = Armv8A64MkInstrLslImm(idxRegDstIn, idxRegDstIn, cShift, false /*f64Bit*/);
1575 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegDst, idxRegDstIn, idxRegTmpImm, false /*f64Bit*/, true /*fSetFlags*/, cShift);
1576 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDstIn, idxRegDstIn, cShift, false /*f64Bit*/);
1577 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDst, idxRegDst, cShift, false /*f64Bit*/);
1578 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1579 }
1580 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1581
1582 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, a_cOpBits > 32 ? a_cOpBits : 32, idxRegDst,
1583 idxRegDstIn, UINT8_MAX, true /*fInvertCarry*/, uImmOp);
1584
1585 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1586 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1587
1588#else
1589# error "port me"
1590#endif
1591 return off;
1592}
1593
1594
1595/**
1596 * The CMP instruction will set all status flags, but modifies no registers.
1597 */
1598DECL_INLINE_THROW(uint32_t)
1599iemNativeEmit_cmp_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1600 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1601{
1602 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1603 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1604
1605#ifdef RT_ARCH_AMD64
1606 /* On AMD64 we just use the correctly sized CMP instruction to get the right EFLAGS.SF value. */
1607 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1608 0x3a, 0x3b, cOpBits, idxRegDst, idxRegSrc);
1609 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1610
1611 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1612 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1613
1614 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1615
1616#elif defined(RT_ARCH_ARM64)
1617 /* On ARM64 we'll need the actual result as well as both input operands in order
1618 to calculate the right flags, even though we use SUBS and translate NZCV
1619 into OF, CF, ZF and SF. */
1620 uint8_t const idxRegResult = iemNativeRegAllocTmp(pReNative, &off);
1621 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 3);
1622 if (cOpBits >= 32)
1623 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegResult, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1624 else
1625 {
1626 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1627 uint32_t const cShift = 32 - cOpBits;
1628 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegResult, ARMV8_A64_REG_XZR, idxRegDst, false /*f64Bit*/, cShift);
1629 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegResult, idxRegResult, idxRegSrc, false /*f64Bit*/,
1630 true /*fSetFlags*/, cShift);
1631 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegResult, idxRegResult, cShift, false /*f64Bit*/);
1632 cOpBits = 32;
1633 }
1634 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1635
1636 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, cOpBits, idxRegResult,
1637 idxRegDst, idxRegSrc, true /*fInvertCarry*/, 0);
1638
1639 iemNativeRegFreeTmp(pReNative, idxRegResult);
1640 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1641 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1642
1643#else
1644# error "port me"
1645#endif
1646 return off;
1647}
1648
1649
1650/**
1651 * The CMP instruction with immediate value as right operand.
1652 */
1653template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1654DECL_INLINE_THROW(uint32_t)
1655iemNativeEmit_cmp_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1656{
1657 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1658
1659#ifdef RT_ARCH_AMD64
1660 /* On AMD64 we just use the correctly sized CMP instruction to get the right EFLAGS.SF value. */
1661 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1662 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 7, idxRegDst, uImmOp);
1663 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1664
1665 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1666
1667 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1668
1669#elif defined(RT_ARCH_ARM64)
1670 /* On ARM64 we'll need the actual result as well as both input operands in order
1671 to calculate the right flags, even though we use SUBS and translate NZCV
1672 into OF, CF, ZF and SF. */
1673 uint8_t const idxRegResult = iemNativeRegAllocTmp(pReNative, &off);
1674 PIEMNATIVEINSTR pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1675 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1676 {
1677 if (uImmOp <= 0xfffU)
1678 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegResult, idxRegDst, uImmOp, a_cOpBits > 32 /*f64Bit*/,
1679 true /*fSetFlags*/);
1680 else if (uImmOp <= 0xfff000U && !(uImmOp & 0xfff))
1681 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegResult, idxRegDst, uImmOp >> 12, a_cOpBits > 32 /*f64Bit*/,
1682 true /*fSetFlags*/, true /*fShift12*/);
1683 else
1684 {
1685 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1686 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1687 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegResult, idxRegDst, idxRegTmpImm, a_cOpBits > 32 /*f64Bit*/,
1688 true /*fSetFlags*/);
1689 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1690 }
1691 }
1692 else
1693 {
1694 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1695 uint32_t const cShift = 32 - a_cOpBits;
1696 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1697 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 3);
1698 pCodeBuf[off++] = Armv8A64MkInstrLslImm(idxRegResult, idxRegDst, cShift, false /*f64Bit*/);
1699 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegResult, idxRegResult, idxRegTmpImm, false /*f64Bit*/, true /*fSetFlags*/, cShift);
1700 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegResult, idxRegResult, cShift, false /*f64Bit*/);
1701 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1702 }
1703 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1704
1705 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, a_cOpBits > 32 ? a_cOpBits : 32, idxRegResult,
1706 idxRegDst, UINT8_MAX, true /*fInvertCarry*/, uImmOp);
1707
1708 iemNativeRegFreeTmp(pReNative, idxRegResult);
1709 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1710
1711#else
1712# error "port me"
1713#endif
1714 return off;
1715}
1716
1717
1718/**
1719 * The SBB instruction takes CF as input and will set all status flags.
1720 */
1721DECL_INLINE_THROW(uint32_t)
1722iemNativeEmit_sbb_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1723 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1724{
1725 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1726 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1727 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
1728
1729#ifdef RT_ARCH_AMD64
1730 /* On AMD64 we use BT to set EFLAGS.CF and then issue an SBB instruction
1731 with matching size to get the correct flags. */
1732 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 9);
1733
1734 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b, 0xba, 32 /*cOpBits*/, 4, idxRegEfl);
1735 pCodeBuf[off++] = X86_EFL_CF_BIT;
1736
1737 off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0x1a, 0x1b, cOpBits, idxRegDst, idxRegSrc);
1738 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1739
1740 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1741 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1742
1743 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl);
1744
1745#elif defined(RT_ARCH_ARM64)
1746 /* On ARM64 we use the RMIF+CFINV instructions to load PSTATE.CF from
1747 idxRegEfl and then SBCS for the calculation. We need all inputs and
1748 result for the two flags (AF,PF) that can't be directly derived from
1749 PSTATE.NZCV. */
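    /* Background note on the CFINV: AArch64 uses an inverted borrow convention
       for subtraction - PSTATE.C = 1 means "no borrow" - whereas x86 CF = 1 means
       a borrow occurred.  So after RMIF has copied EFLAGS.CF into PSTATE.C it is
       inverted before the SBCS, and the fInvertCarry argument passed further down
       presumably tells the EFLAGS helper to undo the inversion on the way out. */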
1750 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1751 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
1752
1753 pCodeBuf[off++] = Armv8A64MkInstrRmif(idxRegEfl, (X86_EFL_CF_BIT - 1) & 63, RT_BIT_32(1) /*fMask=C*/);
1754 pCodeBuf[off++] = ARMV8_A64_INSTR_CFINV;
1755 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1756 if (cOpBits >= 32)
1757 pCodeBuf[off++] = Armv8A64MkInstrSbcs(idxRegDst, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/);
1758 else
1759 {
1760 /* Since we're also subtracting the borrow (CF) here, shifting operands up
1761 doesn't work. So, we have to calculate carry & overflow manually. */
1762 pCodeBuf[off++] = Armv8A64MkInstrSbc(idxRegDst, idxRegDst, idxRegSrc, false /*f64Bit*/);
1763 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegDst, cOpBits > 8); /* NZ are okay, CV aren't.*/
1764 }
1765 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1766
1767 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl, cOpBits, idxRegDst,
1768 idxRegDstIn, idxRegSrc, true /*fInvertCarry*/, 0);
1769
1770 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1771 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1772 if (cOpBits < 32)
1773 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegDst, RT_BIT_32(cOpBits) - 1U);
1774 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1775
1776#else
1777# error "port me"
1778#endif
1779 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
1780 return off;
1781}
1782
1783
1784/**
1785 * The SBB instruction with immediate value as right operand.
1786 */
1787template<uint8_t const a_cOpBits, uint8_t const a_cImmBits>
1788DECL_INLINE_THROW(uint32_t)
1789iemNativeEmit_sbb_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl)
1790{
1791 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1792 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
1793
1794#ifdef RT_ARCH_AMD64
1795 /* On AMD64 we use BT to set EFLAGS.CF and then issue an SBB instruction
1796 with matching size to get the correct flags. */
1797 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 12);
1798
1799 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b, 0xba, 32 /*cOpBits*/, 4, idxRegEfl);
1800 pCodeBuf[off++] = X86_EFL_CF_BIT;
1801
1802 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, a_cOpBits, a_cImmBits, 3, idxRegDst, uImmOp);
1803 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1804
1805 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1806
1807 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl);
1808
1809#elif defined(RT_ARCH_ARM64)
1810 /* On ARM64 we use the RMIF+CFINV instructions to load PSTATE.CF from
1811 idxRegEfl and then SBCS for the calculation. We need all inputs and
1812 result for the two flags (AF,PF) that can't be directly derived from
1813 PSTATE.NZCV. */
1814 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1815 uint8_t const idxRegImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1816 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
1817
1818 pCodeBuf[off++] = Armv8A64MkInstrRmif(idxRegEfl, (X86_EFL_CF_BIT - 1) & 63, RT_BIT_32(1) /*fMask=C*/);
1819 pCodeBuf[off++] = ARMV8_A64_INSTR_CFINV;
1820 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1821 if RT_CONSTEXPR_IF(a_cOpBits >= 32)
1822 pCodeBuf[off++] = Armv8A64MkInstrSbcs(idxRegDst, idxRegDst, idxRegImm, a_cOpBits > 32 /*f64Bit*/);
1823 else
1824 {
1825 /* Since we're also subtracting the borrow (CF) here, shifting operands up
1826 doesn't work. So, we have to calculate carry & overflow manually. */
1827 pCodeBuf[off++] = Armv8A64MkInstrSbc(idxRegDst, idxRegDst, idxRegImm, false /*f64Bit*/);
1828 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegDst, a_cOpBits > 8); /* NZ are okay, CV aren't.*/
1829 }
1830 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1831
1832 iemNativeRegFreeTmp(pReNative, idxRegImm);
1833
1834 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl, a_cOpBits, idxRegDst,
1835 idxRegDstIn, UINT8_MAX, true /*fInvertCarry*/, uImmOp);
1836
1837 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1838 if RT_CONSTEXPR_IF(a_cOpBits < 32)
1839 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegDst, RT_BIT_32(a_cOpBits) - 1U);
1840 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1841
1842#else
1843# error "port me"
1844#endif
1845 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
1846 return off;
1847}
1848
1849
1850DECL_INLINE_THROW(uint32_t)
1851iemNativeEmit_imul_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1852 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1853{
1854 RT_NOREF(idxVarDst, idxVarSrc, idxVarEfl, cOpBits);
1855 AssertFailed();
1856 return iemNativeEmitBrk(pReNative, off, 0x666);
1857}
1858
1859
1860DECL_INLINE_THROW(uint32_t)
1861iemNativeEmit_popcnt_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1862 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1863{
1864 RT_NOREF(idxVarDst, idxVarSrc, idxVarEfl, cOpBits);
1865 AssertFailed();
1866 return iemNativeEmitBrk(pReNative, off, 0x666);
1867}
1868
1869
1870DECL_INLINE_THROW(uint32_t)
1871iemNativeEmit_tzcnt_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1872 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1873{
1874 RT_NOREF(idxVarDst, idxVarSrc, idxVarEfl, cOpBits);
1875 AssertFailed();
1876 return iemNativeEmitBrk(pReNative, off, 0x666);
1877}
1878
1879
1880DECL_INLINE_THROW(uint32_t)
1881iemNativeEmit_lzcnt_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1882 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1883{
1884 RT_NOREF(idxVarDst, idxVarSrc, idxVarEfl, cOpBits);
1885 AssertFailed();
1886 return iemNativeEmitBrk(pReNative, off, 0x666);
1887}
1888
1889
1890
1891/*********************************************************************************************************************************
1892* Shifting and Rotating. *
1893*********************************************************************************************************************************/
1894
1895
1896typedef enum
1897{
1898 kIemNativeEmitEFlagsForShiftType_Left,
1899 kIemNativeEmitEFlagsForShiftType_Right,
1900 kIemNativeEmitEFlagsForShiftType_SignedRight
1901} IEMNATIVEEMITEFLAGSFORSHIFTTYPE;
1902
1903/**
1904 * This is used by SHL, SHR and SAR emulation.
1905 *
1906 * It takes the EFLAGS liveness information into account.
1907 */
1908DECL_INLINE_THROW(uint32_t)
1909iemNativeEmitEFlagsForShift(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxRegEfl, uint8_t idxRegResult,
1910 uint8_t idxRegSrc, uint8_t idxRegCount, uint8_t cOpBits, IEMNATIVEEMITEFLAGSFORSHIFTTYPE enmType,
1911 uint8_t idxRegTmp)
1912{
1913 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflTotalShift);
1914
1915    RT_NOREF(pReNative, off, idxRegEfl, idxRegResult, idxRegSrc, idxRegCount, cOpBits, enmType);
1916#if 0 //def IEMNATIVE_WITH_EFLAGS_SKIPPING
1917 /*
1918 * See if we can skip this wholesale.
1919 */
1920 PCIEMLIVENESSENTRY const pLivenessEntry = &pReNative->paLivenessEntries[pReNative->idxCurCall];
1921 if ( IEMLIVENESS_STATE_ARE_STATUS_EFL_TO_BE_CLOBBERED(pLivenessEntry)
1922 && !(pReNative->fMc & IEM_MC_F_WITH_FLAGS))
1923 {
1924 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflSkippedShift);
1925 pReNative->fSkippingEFlags |= X86_EFL_STATUS_BITS;
1926# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
1927 off = iemNativeEmitOrImmIntoVCpuU32(pReNative, off, X86_EFL_STATUS_BITS, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
1928# endif
1929 }
1930 else
1931#endif
1932 {
1933 /*
1934 * The differences between Intel and AMD flag handling for SHL are:
1935 * - Intel always clears AF while AMD always sets it.
1936 * - Intel sets OF based on the first shift, while AMD bases it on the last shift.
1937 *
1938 */
1939
1940#ifdef RT_ARCH_AMD64
1941 /*
1942 * We capture the flags and do the additional OF and AF calculations as needed.
1943 */
1944 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 64);
1945 /** @todo kIemNativeEmitEFlagsForShiftType_SignedRight: we could alternatively
1946 * use LAHF here when host rax is free, since OF is cleared. */
1947 /* pushf */
1948 pCodeBuf[off++] = 0x9c;
1949 /* pop tmp */
1950 if (idxRegTmp >= 8)
1951 pCodeBuf[off++] = X86_OP_REX_B;
1952 pCodeBuf[off++] = 0x58 + (idxRegTmp & 7);
1953 /* Clear the status bits in EFLs. */
1954 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegEfl, ~X86_EFL_STATUS_BITS);
1955 uint8_t const idxTargetCpuEflFlavour = pReNative->pVCpu->iem.s.aidxTargetCpuEflFlavour[1];
1956 if (idxTargetCpuEflFlavour == IEMTARGETCPU_EFL_BEHAVIOR_NATIVE)
1957 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegTmp, X86_EFL_STATUS_BITS);
1958 else
1959 {
1960 /* and tmp, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_CF */
1961 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegTmp, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_CF);
1962 if (idxTargetCpuEflFlavour == IEMTARGETCPU_EFL_BEHAVIOR_AMD)
1963 off = iemNativeEmitOrGpr32ByImmEx(pCodeBuf, off, idxRegTmp, X86_EFL_AF);
1964 /* OR in the flags we collected. */
1965 off = iemNativeEmitOrGpr32ByGprEx(pCodeBuf, off, idxRegEfl, idxRegTmp);
1966
1967 /* Calculate OF */
1968 if (idxTargetCpuEflFlavour == IEMTARGETCPU_EFL_BEHAVIOR_AMD)
1969 {
1970 /* AMD last bit shifted: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
1971 /* bt idxRegResult, (cOpBits - 1) => CF=result-sign-bit */
1972 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b /*ud2*/, 0xba,
1973 RT_MAX(cOpBits, 16), 4, idxRegResult);
1974 pCodeBuf[off++] = cOpBits - 1;
1975 /* setc idxRegTmp */
1976 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x92, 0x0b /*ud2*/, 8, 0, idxRegTmp);
1977 /* xor idxRegTmp, idxRegEfl */
1978 off = iemNativeEmitXorGpr32ByGpr32Ex(pCodeBuf, off, idxRegTmp, idxRegEfl);
1979 /* and idxRegTmp, 1 */
1980 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegTmp, 1);
1981 /* shl idxRegTmp, X86_EFL_OF_BIT */
1982 off = iemNativeEmitShiftGpr32LeftEx(pCodeBuf, off, idxRegTmp, X86_EFL_OF_BIT);
1983 }
1984 else
1985 {
1986 /* Intel first bit shifted: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
1987 if (cOpBits <= 32)
1988 {
1989 /* mov idxRegTmp, idxRegSrc */
1990 off = iemNativeEmitLoadGprFromGpr32Ex(pCodeBuf, off, idxRegTmp, idxRegSrc);
1991 /* shl idxRegTmp, 1 */
1992 off = iemNativeEmitShiftGpr32LeftEx(pCodeBuf, off, idxRegTmp, 1);
1993 /* xor idxRegTmp, idxRegSrc */
1994 off = iemNativeEmitXorGprByGprEx(pCodeBuf, off, idxRegTmp, idxRegSrc);
1995 /* shr idxRegTmp, cOpBits - X86_EFL_OF_BIT - 1 or shl idxRegTmp, X86_EFL_OF_BIT - cOpBits + 1 */
1996 if (cOpBits >= X86_EFL_OF_BIT)
1997 off = iemNativeEmitShiftGpr32RightEx(pCodeBuf, off, idxRegTmp, cOpBits - X86_EFL_OF_BIT - 1);
1998 else
1999 off = iemNativeEmitShiftGpr32LeftEx(pCodeBuf, off, idxRegTmp, X86_EFL_OF_BIT - cOpBits + 1);
2000 }
2001 else
2002 {
2003 /* Same as above but with 64-bit GPRs. */
2004 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegTmp, idxRegSrc);
2005 off = iemNativeEmitShiftGprLeftEx(pCodeBuf, off, idxRegTmp, 1);
2006 off = iemNativeEmitXorGprByGprEx(pCodeBuf, off, idxRegTmp, idxRegSrc);
2007 off = iemNativeEmitShiftGprRightEx(pCodeBuf, off, idxRegTmp, cOpBits - X86_EFL_OF_BIT - 1);
2008 }
2009 /* and idxRegTmp, X86_EFL_OF */
2010 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegTmp, X86_EFL_OF);
2011 }
2012 }
2013 /* Or in the collected flag(s) */
2014 off = iemNativeEmitOrGpr32ByGprEx(pCodeBuf, off, idxRegEfl, idxRegTmp);
2015
2016#elif defined(RT_ARCH_ARM64)
2017 /*
2018 * Calculate flags.
2019 */
2020 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 20);
2021
2022 /* Clear the status bits. ~0x8D5 (or ~0x8FD) can't be AND immediate, so use idxRegTmp for constant. */
2023 off = iemNativeEmitLoadGpr32ImmEx(pCodeBuf, off, idxRegTmp, ~X86_EFL_STATUS_BITS);
2024 off = iemNativeEmitAndGpr32ByGpr32Ex(pCodeBuf, off, idxRegEfl, idxRegTmp);
2025
2026 /* N,Z -> SF,ZF */
2027 if (cOpBits < 32)
2028 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegResult, cOpBits > 8); /* sets NZ */
2029 else
2030 pCodeBuf[off++] = Armv8A64MkInstrAnds(ARMV8_A64_REG_XZR, idxRegResult, idxRegResult, cOpBits > 32 /*f64Bit*/);
2031 pCodeBuf[off++] = Armv8A64MkInstrMrs(idxRegTmp, ARMV8_AARCH64_SYSREG_NZCV); /* Bits: 31=N; 30=Z; 29=C; 28=V; */
2032 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegTmp, 30);
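        /* (The MRS+LSR above leaves Z in bit 0 and N in bit 1 of idxRegTmp, so the
           two-bit BFI below drops them straight into ZF and SF, which are adjacent
           bits in EFLAGS.) */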
2033 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_ZF_BIT, 2, false /*f64Bit*/);
2034 AssertCompile(X86_EFL_ZF_BIT + 1 == X86_EFL_SF_BIT);
2035
2036 /* Calculate 8-bit parity of the result. */
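        /* Rough C equivalent of the parity folding below (illustration only):
               uint32_t uPar = uResult ^ (uResult >> 4);
               uPar ^= uPar >> 2;
               uPar ^= uPar >> 1;
               fPF   = (uPar & 1) ^ 1;   // PF is set when the low byte has even parity
           The final EOR-immediate does the '^ 1' and the BFI puts the bit in place. */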
2037 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegResult, idxRegResult, false /*f64Bit*/,
2038 4 /*offShift6*/, kArmv8A64InstrShift_Lsr);
2039 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegTmp, idxRegTmp, false /*f64Bit*/,
2040 2 /*offShift6*/, kArmv8A64InstrShift_Lsr);
2041 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegTmp, idxRegTmp, false /*f64Bit*/,
2042 1 /*offShift6*/, kArmv8A64InstrShift_Lsr);
2043 Assert(Armv8A64ConvertImmRImmS2Mask32(0, 0) == 1);
2044 pCodeBuf[off++] = Armv8A64MkInstrEorImm(idxRegTmp, idxRegTmp, 0, 0, false /*f64Bit*/);
2045 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_PF_BIT, 1, false /*f64Bit*/);
2046
2047 /* Calculate carry - the last bit shifted out of the input value. */
2048 if (enmType == kIemNativeEmitEFlagsForShiftType_Left)
2049 {
2050 /* CF = (idxRegSrc >> (cOpBits - idxRegCount))) & 1 */
2051 pCodeBuf[off++] = Armv8A64MkInstrMovZ(idxRegTmp, cOpBits);
2052 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegTmp, idxRegTmp, idxRegCount, false /*f64Bit*/, cOpBits < 32 /*fSetFlags*/);
2053 if (cOpBits < 32)
2054 pCodeBuf[off++] = Armv8A64MkInstrBCond(kArmv8InstrCond_Cc, 3); /* 16 or 8 bit: CF is clear if all shifted out */
2055 pCodeBuf[off++] = Armv8A64MkInstrLsrv(idxRegTmp, idxRegSrc, idxRegTmp, cOpBits > 32);
2056 }
2057 else
2058 {
2059 /* CF = (idxRegSrc >> (idxRegCount - 1)) & 1 */
2060 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegTmp, idxRegCount, 1, false /*f64Bit*/);
2061 pCodeBuf[off++] = Armv8A64MkInstrLsrv(idxRegTmp, idxRegSrc, idxRegTmp, cOpBits > 32);
2062 }
2063 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_CF_BIT, 1, false /*f64Bit*/);
2064
2065 uint8_t const idxTargetCpuEflFlavour = pReNative->pVCpu->iem.s.aidxTargetCpuEflFlavour[0];
2066 if (idxTargetCpuEflFlavour != IEMTARGETCPU_EFL_BEHAVIOR_AMD)
2067 {
2068 /* Intel: OF = first bit shifted: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
2069 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegSrc, idxRegSrc, cOpBits > 32, 1 /*left shift count*/);
2070 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegTmp, cOpBits - 1, cOpBits > 32);
2071 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_OF_BIT, 1, false /*f64Bit*/);
2072 }
2073 else
2074 {
2075 /* AMD: OF = last bit shifted: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
2076 AssertCompile(X86_EFL_CF_BIT == 0);
2077 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegEfl, idxRegResult, cOpBits > 32, /* ASSUMES CF calculated! */
2078 cOpBits - 1, kArmv8A64InstrShift_Lsr);
2079 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_OF_BIT, 1, false /*f64Bit*/);
2080
2081 /* AMD unconditionally sets AF. */
2082 Assert(Armv8A64ConvertImmRImmS2Mask32(0, 32 - X86_EFL_AF_BIT) == X86_EFL_AF);
2083 pCodeBuf[off++] = Armv8A64MkInstrOrrImm(idxRegEfl, idxRegEfl, 0, 32 - X86_EFL_AF_BIT, false /*f64Bit*/);
2084 }
2085#else
2086# error "port me"
2087#endif
2088 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2089
2090#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
2091 if (pReNative->fSkippingEFlags)
2092 Log5(("EFLAGS: fSkippingEFlags %#x -> 0 (iemNativeEmitEFlagsForShift)\n", pReNative->fSkippingEFlags));
2093 pReNative->fSkippingEFlags = 0;
2094# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
2095 off = iemNativeEmitStoreImmToVCpuU32(pReNative, off, 0, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
2096# endif
2097#endif
2098 }
2099 return off;
2100}
2101
2102
2103DECL_INLINE_THROW(uint32_t)
2104iemNativeEmit_shl_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2105 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2106{
2107 /* Note! Since we're doing some branching here, we need to allocate all
2108 registers we need before the jump or we may end up with invalid
2109 register state if the branch is taken. */
2110 uint8_t const idxRegTmp = iemNativeRegAllocTmp(pReNative, &off); /* Do this first in hope we'll get EAX. */
2111 uint8_t const idxRegCount = iemNativeVarRegisterAcquire(pReNative, idxVarCount, &off, true /*fInitialized*/); /* modified on arm64 */
2112 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
2113 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
2114
2115#ifdef RT_ARCH_AMD64
2116 /* Make sure IEM_MC_NATIVE_AMD64_HOST_REG_FOR_LOCAL was used. */
2117 AssertStmt(idxRegCount == X86_GREG_xCX, IEMNATIVE_DO_LONGJMP(pReNative, VERR_IEM_EMIT_UNEXPECTED_VAR_REGISTER));
2118
2119 /* We only need a copy of the input value if the target CPU differs from the host CPU. */
2120 uint8_t const idxRegDstIn = pReNative->pVCpu->iem.s.aidxTargetCpuEflFlavour[1] == IEMTARGETCPU_EFL_BEHAVIOR_NATIVE
2121 ? UINT8_MAX : iemNativeRegAllocTmp(pReNative, &off);
2122 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4+2+3+4);
2123
2124 /* Check if it's NOP before we do anything. */
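    /* (x86 masks the shift count to 5 or 6 bits, and a resulting count of zero must
       leave both the destination and all EFLAGS untouched, which is why the whole
       operation including the EFLAGS calculation is skipped in that case.) */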
2125 off = iemNativeEmitTestAnyBitsInGpr8Ex(pCodeBuf, off, idxRegCount, cOpBits <= 32 ? 0x1f : 0x3f);
2126 uint32_t const offFixup = off;
2127 off = iemNativeEmitJccToFixedEx(pCodeBuf, off, off /*8-bit should be enough */, kIemNativeInstrCond_z);
2128
2129 if (idxRegDstIn != UINT8_MAX)
2130 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
2131 off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0xd2, 0xd3, cOpBits, 4, idxRegDst);
2132
2133#elif defined(RT_ARCH_ARM64)
2134 /* We always need a copy of the input value (except when we can skip the EFLAGS calcs). */
2135 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
2136 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6);
2137
2138 /* Check if it's NOP before we do anything. We MODIFY idxRegCount here! */
2139 Assert(Armv8A64ConvertImmRImmS2Mask32(4, 0) == 0x1f);
2140 Assert(Armv8A64ConvertImmRImmS2Mask32(5, 0) == 0x3f);
2141 pCodeBuf[off++] = Armv8A64MkInstrAndsImm(idxRegCount, idxRegCount, cOpBits > 32 ? 5 : 4, 0, false /*f64Bit*/);
2142 uint32_t const offFixup = off;
2143 off = iemNativeEmitJccToFixedEx(pCodeBuf, off, off, kArmv8InstrCond_Eq);
2144
2145 pCodeBuf[off++] = Armv8A64MkInstrMov(idxRegDstIn, idxRegDst);
2146 pCodeBuf[off++] = Armv8A64MkInstrLslv(idxRegDst, idxRegDst, idxRegCount, cOpBits > 32 /*f64Bit*/);
2147 if (cOpBits < 32)
2148 {
2149 Assert(Armv8A64ConvertImmRImmS2Mask32(7, 0) == 0xff);
2150 Assert(Armv8A64ConvertImmRImmS2Mask32(15, 0) == 0xffff);
2151 pCodeBuf[off++] = Armv8A64MkInstrAndImm(idxRegDst, idxRegDst, cOpBits - 1, 0, false /*f64Bit*/);
2152 }
2153
2154#else
2155# error "port me"
2156#endif
2157
2158 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2159 off = iemNativeEmitEFlagsForShift(pReNative, off, idxRegEfl, idxRegDst, idxRegDstIn, idxRegCount,
2160 cOpBits, kIemNativeEmitEFlagsForShiftType_Left, idxRegTmp);
2161
2162 /* fixup the jump */
2163 iemNativeFixupFixedJump(pReNative, offFixup, off);
2164
2165#ifdef RT_ARCH_AMD64
2166 if (idxRegDstIn != UINT8_MAX)
2167#endif
2168 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
2169 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
2170 iemNativeVarRegisterRelease(pReNative, idxVarDst);
2171 iemNativeVarRegisterRelease(pReNative, idxVarCount);
2172 iemNativeRegFreeTmp(pReNative, idxRegTmp);
2173 return off;
2174}
2175
2176
2177DECL_INLINE_THROW(uint32_t)
2178iemNativeEmit_shr_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2179 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2180{
2181 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2182 AssertFailed();
2183 return iemNativeEmitBrk(pReNative, off, 0x666);
2184}
2185
2186
2187DECL_INLINE_THROW(uint32_t)
2188iemNativeEmit_sar_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2189 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2190{
2191 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2192 AssertFailed();
2193 return iemNativeEmitBrk(pReNative, off, 0x666);
2194}
2195
2196
2197DECL_INLINE_THROW(uint32_t)
2198iemNativeEmit_rol_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2199 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2200{
2201 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2202 AssertFailed();
2203 return iemNativeEmitBrk(pReNative, off, 0x666);
2204}
2205
2206
2207DECL_INLINE_THROW(uint32_t)
2208iemNativeEmit_ror_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2209 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2210{
2211 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2212 AssertFailed();
2213 return iemNativeEmitBrk(pReNative, off, 0x666);
2214}
2215
2216
2217DECL_INLINE_THROW(uint32_t)
2218iemNativeEmit_rcl_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2219 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2220{
2221 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2222 AssertFailed();
2223 return iemNativeEmitBrk(pReNative, off, 0x666);
2224}
2225
2226
2227DECL_INLINE_THROW(uint32_t)
2228iemNativeEmit_rcr_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2229 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2230{
2231 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2232 AssertFailed();
2233 return iemNativeEmitBrk(pReNative, off, 0x666);
2234}
2235
2236
2237
2238#ifdef IEMNATIVE_WITH_SIMD_REG_ALLOCATOR
2239/*********************************************************************************************************************************
2240* SIMD emitters. *
2241*********************************************************************************************************************************/
2242
2243/**
2244 * Common emitter for packed logical instructions.
2245 */
2246#ifdef RT_ARCH_AMD64
2247# define IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(a_Instr, a_enmArmOp, a_bOpcX86) \
2248 DECL_INLINE_THROW(uint32_t) \
2249 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2250 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2251 { \
2252 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2253 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2254 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2255 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2256 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2257 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2258 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2259 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2260 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2261 pCodeBuf[off++] = 0x0f; \
2262 pCodeBuf[off++] = (a_bOpcX86); \
2263 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2264 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2265 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2266 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2267 return off; \
2268 } \
2269 DECL_INLINE_THROW(uint32_t) \
2270 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2271 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2272 { \
2273 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2274 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2275 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2276 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2277 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2278 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2279 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2280 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2281 pCodeBuf[off++] = 0x0f; \
2282 pCodeBuf[off++] = (a_bOpcX86); \
2283 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2284 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2285 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2286 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2287 return off; \
2288 } \
2289 typedef int ignore_semicolon
2290#elif defined(RT_ARCH_ARM64)
2291# define IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(a_Instr, a_enmArmOp, a_bOpcX86) \
2292 DECL_INLINE_THROW(uint32_t) \
2293 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2294 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2295 { \
2296 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2297 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2298 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2299 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2300 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2301 pCodeBuf[off++] = Armv8A64MkVecInstrLogical((a_enmArmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc); \
2302 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2303 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2304 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2305 return off; \
2306 } \
2307 DECL_INLINE_THROW(uint32_t) \
2308 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2309 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2310 { \
2311 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2312 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2313 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2314 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2315 pCodeBuf[off++] = Armv8A64MkVecInstrLogical((a_enmArmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc); \
2316 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2317 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2318 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2319 return off; \
2320 } \
2321 typedef int ignore_semicolon
2322#else
2323# error "Port me"
2324#endif
2325
2326/* POR, ORPS, ORPD. */
2327IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(por, kArmv8VecInstrLogicOp_Orr, 0xeb);
2328/* PXOR, XORPS, XORPD. */
2329IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(pxor, kArmv8VecInstrLogicOp_Eor, 0xef);
2330/* PAND, ANDPS, ANDPD. */
2331IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(pand, kArmv8VecInstrLogicOp_And, 0xdb);
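/* Example of what the AMD64 path of one instantiation emits (illustrative only):
   iemNativeEmit_por_rr_u128 with host registers xmm1 (dst) and xmm9 (src) produces
       66 41 0F EB C9      ; por xmm1, xmm9
   i.e. operand-size prefix, REX.B for the high source register, 0F EB, ModRM. */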
2332
2333
2334/**
2335 * Common emitter for the shift-right-by-immediate instructions.
2336 */
2337#ifdef RT_ARCH_AMD64
2338# define IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(a_Instr, a_cShiftMax, a_ArmElemSz, a_bOpcX86) \
2339 DECL_INLINE_THROW(uint32_t) \
2340 RT_CONCAT3(iemNativeEmit_,a_Instr,_ri_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2341 uint8_t const idxSimdGstRegDst, uint8_t const bImm) \
2342 { \
2343 if (bImm) \
2344 { \
2345 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2346 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2347 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6); \
2348 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2349 if (idxSimdRegDst >= 8) \
2350 pCodeBuf[off++] = X86_OP_REX_B; \
2351 pCodeBuf[off++] = 0x0f; \
2352 pCodeBuf[off++] = (a_bOpcX86); \
2353 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 2, idxSimdRegDst & 7); \
2354 pCodeBuf[off++] = bImm; \
2355 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2356 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2357 } \
2358 /* Immediate 0 is a nop. */ \
2359 return off; \
2360 } \
2361 typedef int ignore_semicolon
2362#elif defined(RT_ARCH_ARM64)
2363# define IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(a_Instr, a_cShiftMax, a_ArmElemSz, a_bOpcX86) \
2364 DECL_INLINE_THROW(uint32_t) \
2365 RT_CONCAT3(iemNativeEmit_,a_Instr,_ri_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2366 uint8_t const idxSimdGstRegDst, uint8_t const bImm) \
2367 { \
2368 if (bImm) \
2369 { \
2370 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2371 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2372 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2373 pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegDst, idxSimdRegDst, RT_MIN(bImm, (a_cShiftMax)), (a_ArmElemSz)); \
2374 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2375 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2376 } \
2377 /* Immediate 0 is a nop. */ \
2378 return off; \
2379 } \
2380 typedef int ignore_semicolon
2381#else
2382# error "Port me"
2383#endif
2384
2385IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(psrlw, 16, kArmv8InstrShiftSz_U16, 0x71);
2386IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(psrld, 32, kArmv8InstrShiftSz_U32, 0x72);
2387IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(psrlq, 64, kArmv8InstrShiftSz_U64, 0x73);
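/* For reference (illustrative): on AMD64 these are the 66 0F 71/72/73 immediate
   groups with ModRM.reg = 2, e.g. 'psrlw xmm0, 5' encodes as 66 0F 71 D0 05.
   On ARM64 the count is clamped to the element width (the RT_MIN above), which
   still yields an all-zero element, matching what x86 does for oversized counts. */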
2388
2389
2390/**
2391 * Common emitter for the shift-left-by-immediate instructions.
2392 */
2393#ifdef RT_ARCH_AMD64
2394# define IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(a_Instr, a_cShiftMax, a_ArmElemSz, a_bOpcX86) \
2395 DECL_INLINE_THROW(uint32_t) \
2396 RT_CONCAT3(iemNativeEmit_,a_Instr,_ri_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2397 uint8_t const idxSimdGstRegDst, uint8_t const bImm) \
2398 { \
2399 if (bImm) \
2400 { \
2401 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2402 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2403 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6); \
2404 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2405 if (idxSimdRegDst >= 8) \
2406 pCodeBuf[off++] = X86_OP_REX_B; \
2407 pCodeBuf[off++] = 0x0f; \
2408 pCodeBuf[off++] = (a_bOpcX86); \
2409 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 6, idxSimdRegDst & 7); \
2410 pCodeBuf[off++] = bImm; \
2411 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2412 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2413 } \
2414 /* Immediate 0 is a nop. */ \
2415 return off; \
2416 } \
2417 typedef int ignore_semicolon
2418#elif defined(RT_ARCH_ARM64)
2419# define IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(a_Instr, a_cShiftMax, a_ArmElemSz, a_bOpcX86) \
2420 DECL_INLINE_THROW(uint32_t) \
2421 RT_CONCAT3(iemNativeEmit_,a_Instr,_ri_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2422 uint8_t const idxSimdGstRegDst, uint8_t const bImm) \
2423 { \
2424 if (bImm) /* bImm == 0 is a nop */ \
2425 { \
2426 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2427 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2428 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2429 if (bImm < (a_cShiftMax)) \
2430 pCodeBuf[off++] = Armv8A64MkVecInstrShlImm(idxSimdRegDst, idxSimdRegDst, bImm, (a_ArmElemSz)); \
2431 else /* Everything >= a_cShiftMax sets the register to zero. */ \
2432 pCodeBuf[off++] = Armv8A64MkVecInstrEor(idxSimdRegDst, idxSimdRegDst, idxSimdRegDst); \
2433 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2434 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2435 } \
2436 return off; \
2437 } \
2438 typedef int ignore_semicolon
2439#else
2440# error "Port me"
2441#endif
2442
2443IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(psllw, 16, kArmv8InstrShiftSz_U16, 0x71);
2444IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(pslld, 32, kArmv8InstrShiftSz_U32, 0x72);
2445IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(psllq, 64, kArmv8InstrShiftSz_U64, 0x73);
2446
2447
2448/**
2449 * Common emitter for packed arithmetic instructions.
2450 */
2451#ifdef RT_ARCH_AMD64
2452# define IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(a_Instr, a_enmArmOp, a_ArmElemSz, a_bOpcX86) \
2453 DECL_INLINE_THROW(uint32_t) \
2454 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2455 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2456 { \
2457 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2458 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2459 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2460 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2461 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2462 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2463 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2464 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2465 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2466 pCodeBuf[off++] = 0x0f; \
2467 pCodeBuf[off++] = (a_bOpcX86); \
2468 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2469 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2470 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2471 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2472 return off; \
2473 } \
2474 DECL_INLINE_THROW(uint32_t) \
2475 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2476 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2477 { \
2478 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2479 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2480 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2481 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2482 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2483 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2484 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2485 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2486 pCodeBuf[off++] = 0x0f; \
2487 pCodeBuf[off++] = (a_bOpcX86); \
2488 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2489 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2490 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2491 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2492 return off; \
2493 } \
2494 typedef int ignore_semicolon
2495#elif defined(RT_ARCH_ARM64)
2496# define IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(a_Instr, a_enmArmOp, a_ArmElemSz, a_bOpcX86) \
2497 DECL_INLINE_THROW(uint32_t) \
2498 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2499 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2500 { \
2501 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2502 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2503 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2504 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2505 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2506 pCodeBuf[off++] = Armv8A64MkVecInstrArithOp((a_enmArmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc, (a_ArmElemSz)); \
2507 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2508 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2509 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2510 return off; \
2511 } \
2512 DECL_INLINE_THROW(uint32_t) \
2513 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2514 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2515 { \
2516 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2517 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2518 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2519 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2520 pCodeBuf[off++] = Armv8A64MkVecInstrArithOp((a_enmArmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc, (a_ArmElemSz)); \
2521 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2522 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2523 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2524 return off; \
2525 } \
2526 typedef int ignore_semicolon
2527#else
2528# error "Port me"
2529#endif
2530
2531/*
2532 * PADDx.
2533 */
2534IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddb, kArmv8VecInstrArithOp_Add, kArmv8VecInstrArithSz_8, 0xfc);
2535IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddw, kArmv8VecInstrArithOp_Add, kArmv8VecInstrArithSz_16, 0xfd);
2536IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddd, kArmv8VecInstrArithOp_Add, kArmv8VecInstrArithSz_32, 0xfe);
2537IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddq, kArmv8VecInstrArithOp_Add, kArmv8VecInstrArithSz_64, 0xd4);
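/*
 * A concrete example of what such an expansion emits (host register numbers are
 * invented for illustration): for paddb with the destination allocated to host
 * SIMD register 1 and the source to host SIMD register 9, the AMD64 path produces
 * the bytes 66 41 0f fc c9 ("paddb xmm1, xmm9" - operand-size prefix, REX.B for
 * the high source register, opcode 0f fc, register-form ModR/M), while the ARM64
 * path emits a single vector instruction along the lines of
 * "add v1.16b, v1.16b, v9.16b".
 */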
2538
2539/*
2540 * PSUBx.
2541 */
2542IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(psubb, kArmv8VecInstrArithOp_Sub, kArmv8VecInstrArithSz_8, 0xf8);
2543IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(psubw, kArmv8VecInstrArithOp_Sub, kArmv8VecInstrArithSz_16, 0xf9);
2544IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(psubd, kArmv8VecInstrArithOp_Sub, kArmv8VecInstrArithSz_32, 0xfa);
2545IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(psubq, kArmv8VecInstrArithOp_Sub, kArmv8VecInstrArithSz_64, 0xfb);
2546
2547/*
2548 * PADDUSx.
2549 */
2550IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddusb, kArmv8VecInstrArithOp_UnsignSat_Add, kArmv8VecInstrArithSz_8, 0xdc);
2551IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddusw, kArmv8VecInstrArithOp_UnsignSat_Add, kArmv8VecInstrArithSz_16, 0xdd);
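/*
 * PADDUSB/PADDUSW saturate rather than wrap: a byte/word lane that would overflow
 * is clamped to 0xff/0xffff (e.g. 0xf0 + 0x20 yields 0xff in a byte lane).  The
 * kArmv8VecInstrArithOp_UnsignSat_Add operation is the NEON unsigned saturating
 * add, which matches these semantics.
 */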
2552
2553/*
2554 * PMULLx.
2555 */
2556IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(pmullw, kArmv8VecInstrArithOp_Mul, kArmv8VecInstrArithSz_16, 0xd5);
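/*
 * PMULLW keeps only the low 16 bits of each lane product (e.g. 0x4000 * 0x0004 =
 * 0x10000 stores 0x0000), which is why a plain element-wise multiply suffices on
 * both hosts here.
 */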
2557
2558
2559/**
2560 * Common emitter for the pcmpeqb/pcmpeqw/pcmpeqd instructions.
2561 */
2562#ifdef RT_ARCH_AMD64
2563# define IEMNATIVE_NATIVE_EMIT_PCMP_U128(a_Instr, a_enmOp, a_ArmElemSz, a_bOpcX86) \
2564 DECL_INLINE_THROW(uint32_t) \
2565 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2566 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2567 { \
2568 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2569 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2570 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2571 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2572 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2573 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2574 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2575 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2576 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2577 pCodeBuf[off++] = 0x0f; \
2578 pCodeBuf[off++] = (a_bOpcX86); \
2579 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2580 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2581 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2582 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2583 return off; \
2584 } \
2585 DECL_INLINE_THROW(uint32_t) \
2586 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2587 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2588 { \
2589 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2590 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2591 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2592 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
2593 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2594 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2595 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2596 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2597 pCodeBuf[off++] = 0x0f; \
2598 pCodeBuf[off++] = (a_bOpcX86); \
2599 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2600 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2601 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2602 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2603 return off; \
2604 } \
2605 typedef int ignore_semicolon
2606#elif defined(RT_ARCH_ARM64)
2607# define IEMNATIVE_NATIVE_EMIT_PCMP_U128(a_Instr, a_enmOp, a_ArmElemSz, a_bOpcX86) \
2608 DECL_INLINE_THROW(uint32_t) \
2609 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2610 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2611 { \
2612 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2613 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2614 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2615 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2616 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2617 pCodeBuf[off++] = Armv8A64MkVecInstrCmp((a_enmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc, (a_ArmElemSz)); \
2618 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2619 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2620 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2621 return off; \
2622 } \
2623 DECL_INLINE_THROW(uint32_t) \
2624 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2625 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2626 { \
2627 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2628 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2629 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2630 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2631 pCodeBuf[off++] = Armv8A64MkVecInstrCmp((a_enmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc, (a_ArmElemSz)); \
2632 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2633 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2634 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2635 return off; \
2636 } \
2637 typedef int ignore_semicolon
2638#else
2639# error "Port me"
2640#endif
2641
2642IEMNATIVE_NATIVE_EMIT_PCMP_U128(pcmpeqb, kArmv8VecInstrCmpOp_Eq, kArmv8VecInstrArithSz_8, 0x74);
2643IEMNATIVE_NATIVE_EMIT_PCMP_U128(pcmpeqw, kArmv8VecInstrCmpOp_Eq, kArmv8VecInstrArithSz_16, 0x75);
2644IEMNATIVE_NATIVE_EMIT_PCMP_U128(pcmpeqd, kArmv8VecInstrCmpOp_Eq, kArmv8VecInstrArithSz_32, 0x76);
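/*
 * These produce all-ones/all-zeroes lane masks.  As a made-up example, pcmpeqb
 * with destination host register 0 and source host register 2 comes out as
 * 66 0f 74 c2 on AMD64 ("pcmpeqb xmm0, xmm2") and as the NEON compare-equal
 * "cmeq v0.16b, v0.16b, v2.16b" on ARM64.
 */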
2645
2646
2647/**
 2648 * Emitter for pmovmskb.
2649 */
2650DECL_INLINE_THROW(uint32_t)
2651iemNativeEmit_pmovmskb_rr_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2652 uint8_t const idxGstRegDst, uint8_t const idxSimdGstRegSrc)
2653{
2654#ifdef RT_ARCH_AMD64
2655 uint8_t const idxRegDst = iemNativeRegAllocTmpForGuestReg(pReNative, &off, IEMNATIVEGSTREG_GPR(idxGstRegDst),
2656 kIemNativeGstRegUse_ForFullWrite);
2657 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off,
2658 IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc),
2659 kIemNativeGstSimdRegLdStSz_Low128,
2660 kIemNativeGstRegUse_ReadOnly);
2661 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
2662
2663 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
2664 if (idxRegDst >= 8 || idxSimdRegSrc >= 8)
2665 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
2666 | (idxRegDst >= 8 ? X86_OP_REX_R : 0);
2667 pCodeBuf[off++] = 0x0f;
2668 pCodeBuf[off++] = 0xd7;
2669 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegDst & 7, idxSimdRegSrc & 7);
2670
2671 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc);
2672 iemNativeRegFreeTmp(pReNative, idxRegDst);
2673
2674#elif defined(RT_ARCH_ARM64)
2675 uint8_t const idxRegDst = iemNativeRegAllocTmpForGuestReg(pReNative, &off, IEMNATIVEGSTREG_GPR(idxGstRegDst),
2676 kIemNativeGstRegUse_ForFullWrite);
2677 uint8_t const idxRegTmp = iemNativeRegAllocTmp(pReNative, &off);
2678 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off,
2679 IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc),
2680 kIemNativeGstSimdRegLdStSz_Low128,
2681 kIemNativeGstRegUse_Calculation);
2682 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 7);
2683
2684 /*
2685 * See https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
 2686 * for different approaches, as NEON doesn't have an instruction equivalent to pmovmskb, so we have to emulate it.
 2687 *
 2688 * As there is no way around emulating the exact semantics of pmovmskb, we use the same algorithm
 2689 * as the sse2neon implementation because it gets by without loading any constants and the
2690 * base algorithm is only 4 NEON instructions (+ 3 for extracting the result to a general register).
2691 *
2692 * The following illustrates the algorithm:
2693 *
2694 * Byte vector Element -> 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
2695 * Instruction
2696 * |
2697 * V
2698 * Axxxxxxx Bxxxxxxx Cxxxxxxx Dxxxxxxx Exxxxxxx Fxxxxxxx Gxxxxxxx Hxxxxxxx Ixxxxxxx Jxxxxxxx Kxxxxxxx Lxxxxxxx Mxxxxxxx Nxxxxxxx Oxxxxxxx Pxxxxxxx
2699 * USHR v.16B, v.16B, #7 0000000A 0000000B 0000000C 0000000D 0000000E 0000000F 0000000G 0000000H 0000000I 0000000J 0000000K 0000000L 0000000M 0000000N 0000000O 0000000P
2700 * USRA v.8H, v.8H, #7 00000000 000000AB 00000000 000000CD 00000000 000000EF 00000000 000000GH 00000000 000000IJ 00000000 000000KL 00000000 000000MN 00000000 000000OP
2701 * USRA v.4S, v.4S, #14 00000000 00000000 00000000 0000ABCD 00000000 00000000 00000000 0000EFGH 00000000 00000000 00000000 0000IJKL 00000000 00000000 00000000 0000MNOP
2702 * USRA v.2D, v.2D, #28 00000000 00000000 00000000 00000000 00000000 00000000 00000000 ABCDEFGH 00000000 00000000 00000000 00000000 00000000 00000000 00000000 IJKLMNOP
2703 *
2704 * The extraction process
2705 * UMOV wTMP, v.16B[8] 00000000 00000000 00000000 00000000 00000000 00000000 00000000 ABCDEFGH
2706 * UMOV wRES, v.16B[0] 00000000 00000000 00000000 00000000 00000000 00000000 00000000 IJKLMNOP
2707 * ORR xRES, xRES, xTMP, LSL #8 00000000 00000000 00000000 00000000 00000000 00000000 ABCDEFGH IJKLMNOP
2708 */
2709 pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegSrc, idxSimdRegSrc, 7, kArmv8InstrShiftSz_U8);
2710 pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegSrc, idxSimdRegSrc, 7, kArmv8InstrShiftSz_U16, true /*fUnsigned*/, false /*fRound*/, true /*fAccum*/);
2711 pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegSrc, idxSimdRegSrc, 14, kArmv8InstrShiftSz_U32, true /*fUnsigned*/, false /*fRound*/, true /*fAccum*/);
2712 pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegSrc, idxSimdRegSrc, 28, kArmv8InstrShiftSz_U64, true /*fUnsigned*/, false /*fRound*/, true /*fAccum*/);
2713 pCodeBuf[off++] = Armv8A64MkVecInstrUmov(idxRegTmp, idxSimdRegSrc, 8, kArmv8InstrUmovInsSz_U8, false /*fDst64Bit*/);
2714 pCodeBuf[off++] = Armv8A64MkVecInstrUmov(idxRegDst, idxSimdRegSrc, 0, kArmv8InstrUmovInsSz_U8, false /*fDst64Bit*/);
2715 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDst, idxRegDst, idxRegTmp, true /*f64Bit*/, 8 /*offShift6*/);
2716
2717 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc);
2718 iemNativeRegFreeTmp(pReNative, idxRegTmp);
2719 iemNativeRegFreeTmp(pReNative, idxRegDst);
2720
2721#else
2722# error "Port me"
2723#endif
2724 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2725 return off;
2726}
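/*
 * For reference, a plain C sketch of what pmovmskb computes (illustrative only,
 * variable names invented): gather the most significant bit of each of the 16
 * source bytes into the low 16 bits of the destination and clear the rest.
 *
 *     uint32_t fResult = 0;
 *     for (unsigned iByte = 0; iByte < 16; iByte++)
 *         fResult |= (uint32_t)(pbSrc[iByte] >> 7) << iByte;   // pbSrc: the 16 packed source bytes
 */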
2727
2728
2729/**
 2730 * Common emitter for the PACKUSWB instruction - guest register / guest register variant.
2731 */
2732DECL_INLINE_THROW(uint32_t)
2733iemNativeEmit_packuswb_rr_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2734 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc)
2735{
2736 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
2737 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate);
2738 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc),
2739 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly);
2740
2741#ifdef RT_ARCH_AMD64
2742 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
2743
2744 /* packuswb xmm, xmm */
2745 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
2746 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8)
2747 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
2748 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0);
2749 pCodeBuf[off++] = 0x0f;
2750 pCodeBuf[off++] = 0x67;
2751 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7);
2752
2753#elif defined(RT_ARCH_ARM64)
2754 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2);
2755
2756 pCodeBuf[off++] = Armv8A64MkVecInstrQxtn(kArmv8VecInstrQxtnOp_Sqxtun, false /*fUpper*/, idxSimdRegDst, idxSimdRegDst, kArmv8VecInstrArithSz_8);
2757 pCodeBuf[off++] = Armv8A64MkVecInstrQxtn(kArmv8VecInstrQxtnOp_Sqxtun, true /*fUpper*/, idxSimdRegDst, idxSimdRegSrc, kArmv8VecInstrArithSz_8);
2758
2759#else
2760# error "port me"
2761#endif
2762
2763 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
2764 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc);
2765
2766 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2767 return off;
2768}
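/*
 * Note on the ARM64 path of the PACKUSWB emitters: SQXTUN narrows each signed
 * 16-bit word to an unsigned byte with saturation (negative values become 0x00,
 * values above 0xff become 0xff), which matches the PACKUSWB semantics; the first
 * instruction packs the destination words into the low half of the result, the
 * second packs the source words into the high half.
 */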
2769
2770
2771/**
 2772 * Common emitter for the PACKUSWB instruction - guest register / recompiler variable variant.
2773 */
2774DECL_INLINE_THROW(uint32_t)
2775iemNativeEmit_packuswb_rv_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2776 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc)
2777{
2778 IEMNATIVE_ASSERT_VAR_IDX(pReNative, idxVarSrc);
2779 IEMNATIVE_ASSERT_VAR_SIZE(pReNative, idxVarSrc, sizeof(RTUINT128U));
2780
2781 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
2782 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate);
2783 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
2784
2785
2786#ifdef RT_ARCH_AMD64
2787 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
2788
2789 /* packuswb xmm, xmm */
2790 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
2791 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8)
2792 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
2793 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0);
2794 pCodeBuf[off++] = 0x0f;
2795 pCodeBuf[off++] = 0x67;
2796 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7);
2797
2798#elif defined(RT_ARCH_ARM64)
2799 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2);
2800
2801 pCodeBuf[off++] = Armv8A64MkVecInstrQxtn(kArmv8VecInstrQxtnOp_Sqxtun, false /*fUpper*/, idxSimdRegDst, idxSimdRegDst, kArmv8VecInstrArithSz_8);
2802 pCodeBuf[off++] = Armv8A64MkVecInstrQxtn(kArmv8VecInstrQxtnOp_Sqxtun, true /*fUpper*/, idxSimdRegDst, idxSimdRegSrc, kArmv8VecInstrArithSz_8);
2803
2804#else
2805# error "port me"
2806#endif
2807
2808 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
2809 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
2810
2811 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2812 return off;
2813}
2814
2815
2816/**
2817 * Common emitter for the pmov{s,z}x* instructions.
2818 */
2819#ifdef RT_ARCH_AMD64
2820# define IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(a_Instr, a_fArmUnsigned, a_ArmElemSz, a_bOpcX86) \
2821 DECL_INLINE_THROW(uint32_t) \
2822 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2823 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2824 { \
2825 if (idxSimdGstRegDst == idxSimdGstRegSrc) \
2826 { \
2827 uint8_t const idxSimdReg = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2828 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2829 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6); \
2830 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2831 if (idxSimdReg >= 8) \
2832 pCodeBuf[off++] = (idxSimdReg >= 8 ? X86_OP_REX_B | X86_OP_REX_R : 0); \
2833 pCodeBuf[off++] = 0x0f; \
2834 pCodeBuf[off++] = 0x38; \
2835 pCodeBuf[off++] = (a_bOpcX86); \
2836 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdReg & 7, idxSimdReg & 7); \
2837 iemNativeSimdRegFreeTmp(pReNative, idxSimdReg); \
2838 } \
2839 else \
2840 { \
2841 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2842 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2843 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2844 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite); \
2845 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6); \
2846 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2847 if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
2848 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
2849 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2850 pCodeBuf[off++] = 0x0f; \
2851 pCodeBuf[off++] = 0x38; \
2852 pCodeBuf[off++] = (a_bOpcX86); \
2853 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
2854 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2855 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2856 } \
2857 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2858 return off; \
2859 } \
2860 DECL_INLINE_THROW(uint32_t) \
2861 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2862 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2863 { \
2864 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2865 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite); \
2866 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2867 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 7 + 6); \
2868 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; /* Transfer value from GPR to temporary vector register using pinsrq. */ \
2869 pCodeBuf[off++] = X86_OP_REX_W \
2870 | (IEMNATIVE_SIMD_REG_FIXED_TMP0 < 8 ? 0 : X86_OP_REX_R) \
2871 | (idxRegSrc < 8 ? 0 : X86_OP_REX_B); \
2872 pCodeBuf[off++] = 0x0f; \
2873 pCodeBuf[off++] = 0x3a; \
2874 pCodeBuf[off++] = 0x22; \
2875 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, IEMNATIVE_SIMD_REG_FIXED_TMP0 & 7, idxRegSrc & 7); \
2876 pCodeBuf[off++] = 0; /* QWord */\
2877 pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
2878 if (idxSimdRegDst >= 8 || IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8) \
2879 pCodeBuf[off++] = (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 ? X86_OP_REX_B : 0) \
2880 | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
2881 pCodeBuf[off++] = 0x0f; \
2882 pCodeBuf[off++] = 0x38; \
2883 pCodeBuf[off++] = (a_bOpcX86); \
2884 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, IEMNATIVE_SIMD_REG_FIXED_TMP0 & 7); \
2885 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2886 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2887 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2888 return off; \
2889 } \
2890 typedef int ignore_semicolon
2891#elif defined(RT_ARCH_ARM64)
2892# define IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(a_Instr, a_fArmUnsigned, a_ArmElemSz, a_bOpcX86) \
2893 DECL_INLINE_THROW(uint32_t) \
2894 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2895 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
2896 { \
2897 if (idxSimdGstRegDst == idxSimdGstRegSrc) \
2898 { \
2899 uint8_t const idxSimdReg = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2900 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
2901 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2902 pCodeBuf[off++] = Armv8A64MkVecInstrUShll(idxSimdReg, idxSimdReg, 0, (a_ArmElemSz), (a_fArmUnsigned)); \
2903 iemNativeSimdRegFreeTmp(pReNative, idxSimdReg); \
2904 } \
2905 else \
2906 { \
2907 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
2908 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
2909 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2910 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite); \
2911 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
2912 pCodeBuf[off++] = Armv8A64MkVecInstrUShll(idxSimdRegDst, idxSimdRegSrc, 0, (a_ArmElemSz), (a_fArmUnsigned)); \
2913 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2914 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
2915 } \
2916 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2917 return off; \
2918 } \
2919 DECL_INLINE_THROW(uint32_t) \
2920 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
2921 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
2922 { \
2923 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
2924 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite); \
2925 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
2926 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2); \
2927 pCodeBuf[off++] = Armv8A64MkVecInstrIns(IEMNATIVE_SIMD_REG_FIXED_TMP0, idxRegSrc, 0 /*idxElem*/); /* Transfer value from GPR to temporary vector register. */ \
2928 pCodeBuf[off++] = Armv8A64MkVecInstrUShll(idxSimdRegDst, IEMNATIVE_SIMD_REG_FIXED_TMP0, 0, (a_ArmElemSz), (a_fArmUnsigned)); \
2929 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
2930 iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
2931 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
2932 return off; \
2933 } \
2934 typedef int ignore_semicolon
2935#else
2936# error "Port me"
2937#endif
2938
2939IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovzxbw, true, kArmv8InstrShiftSz_U8, 0x30);
2940IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovzxwd, true, kArmv8InstrShiftSz_U16, 0x33);
2941IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovzxdq, true, kArmv8InstrShiftSz_U32, 0x35);
2942
2943IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovsxbw, false, kArmv8InstrShiftSz_U8, 0x20);
2944IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovsxwd, false, kArmv8InstrShiftSz_U16, 0x23);
2945IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovsxdq, false, kArmv8InstrShiftSz_U32, 0x25);
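/*
 * Example expansion (host register numbers invented): pmovzxbw with destination
 * host register 0 and source host register 1 emits 66 0f 38 30 c1 on AMD64
 * ("pmovzxbw xmm0, xmm1"); the ARM64 variants widen via USHLL/SSHLL with a shift
 * of zero, which zero- respectively sign-extends the low eight/four/two elements.
 */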
2946
2947
2948/**
2949 * Updates the MXCSR exception flags, raising any unmasked exceptions.
2950 */
2951DECL_INLINE_THROW(uint32_t)
2952iemNativeEmitMxcsrUpdate(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, uint8_t const idxSimdGstRegDst, uint8_t const idxSimdRegRes)
2953{
2954 uint8_t const idxRegMxCsr = iemNativeRegAllocTmpForGuestReg(pReNative, &off, kIemNativeGstReg_MxCsr, kIemNativeGstRegUse_ForUpdate);
2955 uint8_t const idxRegMxCsrXcptFlags = iemNativeRegAllocTmp(pReNative, &off);
2956 uint8_t const idxRegTmp = iemNativeRegAllocTmp(pReNative, &off);
2957
2958#ifdef RT_ARCH_AMD64
2959 PIEMNATIVEINSTR pbCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
2960
2961 /* stmxcsr */
2962 if (IEMNATIVE_REG_FIXED_PVMCPU >= 8)
2963 pbCodeBuf[off++] = X86_OP_REX_B;
2964 pbCodeBuf[off++] = 0x0f;
2965 pbCodeBuf[off++] = 0xae;
2966 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_MEM4, 3, IEMNATIVE_REG_FIXED_PVMCPU & 7);
2967 pbCodeBuf[off++] = RT_BYTE1(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2968 pbCodeBuf[off++] = RT_BYTE2(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2969 pbCodeBuf[off++] = RT_BYTE3(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2970 pbCodeBuf[off++] = RT_BYTE4(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2971
2972 /* Load MXCSR, mask everything except status flags and or into guest MXCSR. */
2973 off = iemNativeEmitLoadGprFromVCpuU32(pReNative, off, idxRegTmp, RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2974
2975 /* Store the flags in the MXCSR xcpt flags register. */
2976 off = iemNativeEmitLoadGprFromGpr32(pReNative, off, idxRegMxCsrXcptFlags, idxRegTmp);
2977 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegMxCsrXcptFlags, X86_MXCSR_XCPT_FLAGS);
2978
2979 /* Clear the status flags in the temporary copy and write it back to MXCSR. */
2980 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegTmp, ~X86_MXCSR_XCPT_FLAGS);
2981 off = iemNativeEmitStoreGprToVCpuU32(pReNative, off, idxRegTmp, RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2982
2983 pbCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
2984
2985 /* ldmxcsr */
2986 if (IEMNATIVE_REG_FIXED_PVMCPU >= 8)
2987 pbCodeBuf[off++] = X86_OP_REX_B;
2988 pbCodeBuf[off++] = 0x0f;
2989 pbCodeBuf[off++] = 0xae;
2990 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_MEM4, 2, IEMNATIVE_REG_FIXED_PVMCPU & 7);
2991 pbCodeBuf[off++] = RT_BYTE1(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2992 pbCodeBuf[off++] = RT_BYTE2(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2993 pbCodeBuf[off++] = RT_BYTE3(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2994 pbCodeBuf[off++] = RT_BYTE4(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
2995
2996#elif defined(RT_ARCH_ARM64)
2997 PIEMNATIVEINSTR pu32CodeBuf = iemNativeInstrBufEnsure(pReNative, off, 7);
2998 pu32CodeBuf[off++] = Armv8A64MkInstrMrs(idxRegMxCsrXcptFlags, ARMV8_AARCH64_SYSREG_FPSR);
2999 pu32CodeBuf[off++] = Armv8A64MkInstrMsr(ARMV8_A64_REG_XZR, ARMV8_AARCH64_SYSREG_FPSR); /* Clear FPSR for next instruction. */
 3000 pu32CodeBuf[off++] = Armv8A64MkInstrUxtb(idxRegMxCsrXcptFlags, idxRegMxCsrXcptFlags); /* Ensure only the exception flags remain set (clears QC and any possible NZCV flags). */
3001
3002 /*
3003 * The exception flags layout differs between MXCSR and FPSR of course:
3004 *
3005 * Bit FPSR MXCSR
3006 * 0 IOC ------> IE
3007 *
3008 * 1 DZC ---- DE <-+
3009 * \ |
3010 * 2 OFC --- -> ZE |
3011 * \ |
3012 * 3 UFC -- --> OE |
3013 * \ |
3014 * 4 IXC - ---> UE |
3015 * \ |
3016 * 5 ----> PE |
3017 * 6 |
3018 * 7 IDC --------------+
3019 */
3020 pu32CodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegMxCsrXcptFlags, 1); /* Shift the block of flags starting at DZC to the least significant bits. */
3021 pu32CodeBuf[off++] = Armv8A64MkInstrBfi(idxRegMxCsrXcptFlags, idxRegTmp, 2, 4); /* Insert DZC, OFC, UFC and IXC into the MXCSR positions. */
3022 pu32CodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegMxCsrXcptFlags, 6); /* Shift IDC (now at 6) into the LSB. */
3023 pu32CodeBuf[off++] = Armv8A64MkInstrBfi(idxRegMxCsrXcptFlags, idxRegTmp, 1, 1); /* Insert IDC into the MXCSR positions. */
3024#else
3025# error "Port me"
3026#endif
3027
3028 /*
 3029 * If PE is set together with OE/UE and neither of those is masked,
 3030 * PE needs to be cleared, because on real hardware
 3031 * an exception is generated with only OE/UE being set,
 3032 * but since we run with all exceptions masked PE gets set as well.
3033 */
 3034 /** @todo On ARM we can combine the load+and into one AND instruction. */
 3035 /** @todo r=aeichner Can this be done more optimally? */
3036 uint8_t const idxRegTmp2 = iemNativeRegAllocTmp(pReNative, &off);
3037 off = iemNativeEmitLoadGprFromGpr32(pReNative, off, idxRegTmp, idxRegMxCsrXcptFlags);
3038 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegTmp, X86_MXCSR_OE | X86_MXCSR_UE);
3039 off = iemNativeEmitLoadGprFromGpr32(pReNative, off, idxRegTmp2, idxRegMxCsr);
3040 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegTmp2, X86_MXCSR_OM | X86_MXCSR_UM);
3041 off = iemNativeEmitShiftGprRight(pReNative, off, idxRegTmp2, X86_MXCSR_XCPT_MASK_SHIFT);
3042 off = iemNativeEmitInvBitsGpr(pReNative, off, idxRegTmp2, idxRegTmp2, false /*f64Bit*/);
3043 off = iemNativeEmitAndGpr32ByGpr32(pReNative, off, idxRegTmp2, idxRegTmp);
3044 off = iemNativeEmitTestAnyBitsInGpr(pReNative, off, idxRegTmp2, X86_MXCSR_OE | X86_MXCSR_UE);
3045
3046 uint32_t offFixup = off;
3047 off = iemNativeEmitJzToFixed(pReNative, off, off);
3048 off = iemNativeEmitBitClearInGpr32(pReNative, off, idxRegMxCsrXcptFlags, X86_MXCSR_PE_BIT);
3049 iemNativeFixupFixedJump(pReNative, offFixup, off);
3050 iemNativeRegFreeTmp(pReNative, idxRegTmp2);
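    /*
     * In plain C the check above amounts to roughly the following (sketch only,
     * variable names invented; the exception mask bits sit X86_MXCSR_XCPT_MASK_SHIFT
     * bits above their corresponding flag bits):
     *
     *     if ((fXcptFlags & (X86_MXCSR_OE | X86_MXCSR_UE)) & ~(uGstMxCsr >> X86_MXCSR_XCPT_MASK_SHIFT))
     *         fXcptFlags &= ~X86_MXCSR_PE;
     */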
3051
3052
3053 /* Set the MXCSR flags now. */
3054 off = iemNativeEmitOrGpr32ByGpr(pReNative, off, idxRegMxCsr, idxRegMxCsrXcptFlags);
3055
3056 /*
3057 * Make sure we don't have any outstanding guest register writes as we may
 3058 * raise an \#UD or \#XF and all guest registers must be up to date in CPUMCTX.
3059 */
3060 off = iemNativeRegFlushPendingWrites(pReNative, off);
3061
3062#ifdef IEMNATIVE_WITH_INSTRUCTION_COUNTING
3063 off = iemNativeEmitStoreImmToVCpuU8(pReNative, off, idxInstr, RT_UOFFSETOF(VMCPUCC, iem.s.idxTbCurInstr));
3064#else
3065 RT_NOREF(idxInstr);
3066#endif
3067
3068 /* Check whether an exception is pending and only update the guest SIMD register if it isn't. */
3069 /* mov tmp, varmxcsr */
3070 off = iemNativeEmitLoadGprFromGpr32(pReNative, off, idxRegTmp, idxRegMxCsr);
3071 /* tmp >>= X86_MXCSR_XCPT_MASK_SHIFT */
3072 off = iemNativeEmitShiftGprRight(pReNative, off, idxRegTmp, X86_MXCSR_XCPT_MASK_SHIFT);
3073 /* tmp = ~tmp */
3074 off = iemNativeEmitInvBitsGpr(pReNative, off, idxRegTmp, idxRegTmp, false /*f64Bit*/);
3075 /* tmp &= mxcsr */
3076 off = iemNativeEmitAndGpr32ByGpr32(pReNative, off, idxRegMxCsrXcptFlags, idxRegTmp);
3077 off = iemNativeEmitTbExitIfAnyBitsSetInGpr<kIemNativeLabelType_RaiseSseAvxFpRelated>(pReNative, off, idxRegMxCsrXcptFlags,
3078 X86_MXCSR_XCPT_FLAGS);
3079
3080 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
3081 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite);
3082
3083 /* Move result to guest SIMD register (at this point there is no exception being raised). */
3084 off = iemNativeEmitSimdLoadVecRegFromVecRegU128(pReNative, off, idxSimdRegDst, idxSimdRegRes);
3085
3086 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
3087 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
3088 iemNativeRegFreeTmp(pReNative, idxRegTmp);
3089 iemNativeRegFreeTmp(pReNative, idxRegMxCsrXcptFlags);
3090 iemNativeRegFreeTmp(pReNative, idxRegMxCsr);
3091 return off;
3092}
3093
3094
3095/**
3096 * Common emitter for packed floating point instructions with 3 operands - register, register variant.
3097 */
3098DECL_INLINE_THROW(uint32_t) iemNativeEmitSimdFp3OpCommon_rr_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr,
3099 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc,
3100#ifdef RT_ARCH_AMD64
3101 uint8_t const bPrefixX86, uint8_t const bOpcX86
3102#elif defined(RT_ARCH_ARM64)
3103 ARMV8INSTRVECFPOP const enmFpOp, ARMV8INSTRVECFPSZ const enmFpSz
3104#endif
3105 )
3106{
3107 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
3108 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly);
3109 uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc),
3110 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly);
3111
3112#ifdef RT_ARCH_AMD64
3113 off = iemNativeEmitSimdLoadVecRegFromVecRegU128(pReNative, off, IEMNATIVE_SIMD_REG_FIXED_TMP0, idxSimdRegDst);
3114 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
3115 if (bPrefixX86 != 0)
3116 pCodeBuf[off++] = bPrefixX86;
3117 if (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 || idxSimdRegSrc >= 8)
3118 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
3119 | (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 ? X86_OP_REX_R : 0);
3120 pCodeBuf[off++] = 0x0f;
3121 pCodeBuf[off++] = bOpcX86;
3122 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, IEMNATIVE_SIMD_REG_FIXED_TMP0 & 7, idxSimdRegSrc & 7);
3123#elif defined(RT_ARCH_ARM64)
3124 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
3125 pCodeBuf[off++] = Armv8A64MkVecInstrFp3Op(enmFpOp, enmFpSz, IEMNATIVE_SIMD_REG_FIXED_TMP0, idxSimdRegDst, idxSimdRegSrc);
3126#else
3127# error "Port me"
3128#endif
3129 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
3130 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc);
3131 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
3132 return iemNativeEmitMxcsrUpdate(pReNative, off, idxInstr, idxSimdGstRegDst, IEMNATIVE_SIMD_REG_FIXED_TMP0);
3133}
3134
3135
3136/**
3137 * Common emitter for packed floating point instructions with 3 operands - register, local variable variant.
3138 */
3139DECL_INLINE_THROW(uint32_t) iemNativeEmitSimdFp3OpCommon_rv_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr,
3140 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc,
3141#ifdef RT_ARCH_AMD64
3142 uint8_t const bPrefixX86, uint8_t const bOpcX86
3143#elif defined(RT_ARCH_ARM64)
3144 ARMV8INSTRVECFPOP const enmFpOp, ARMV8INSTRVECFPSZ const enmFpSz
3145#endif
3146 )
3147{
3148 uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
3149 kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly);
3150 uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
3151
3152#ifdef RT_ARCH_AMD64
3153 off = iemNativeEmitSimdLoadVecRegFromVecRegU128(pReNative, off, IEMNATIVE_SIMD_REG_FIXED_TMP0, idxSimdRegDst);
3154 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
3155 if (bPrefixX86 != 0)
3156 pCodeBuf[off++] = bPrefixX86;
3157 if (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 || idxSimdRegSrc >= 8)
3158 pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
3159 | (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 ? X86_OP_REX_R : 0);
3160 pCodeBuf[off++] = 0x0f;
3161 pCodeBuf[off++] = bOpcX86;
3162 pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, IEMNATIVE_SIMD_REG_FIXED_TMP0 & 7, idxSimdRegSrc & 7);
3163#elif defined(RT_ARCH_ARM64)
3164 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
3165 pCodeBuf[off++] = Armv8A64MkVecInstrFp3Op(enmFpOp, enmFpSz, IEMNATIVE_SIMD_REG_FIXED_TMP0, idxSimdRegDst, idxSimdRegSrc);
3166#else
3167# error "Port me"
3168#endif
3169 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
3170 iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
3171 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
3172 return iemNativeEmitMxcsrUpdate(pReNative, off, idxInstr, idxSimdGstRegDst, IEMNATIVE_SIMD_REG_FIXED_TMP0);
3173}
3174
3175
3176/**
3177 * Common emitter for packed floating point instructions with 3 operands.
3178 */
3179#ifdef RT_ARCH_AMD64
3180# define IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(a_Instr, a_enmArmOp, a_ArmElemSz, a_bPrefixX86, a_bOpcX86) \
3181 DECL_FORCE_INLINE_THROW(uint32_t) \
3182 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, \
3183 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
3184 { \
3185 return iemNativeEmitSimdFp3OpCommon_rr_u128(pReNative, off, idxInstr, idxSimdGstRegDst, idxSimdGstRegSrc, \
3186 a_bPrefixX86, a_bOpcX86); \
3187 } \
3188 DECL_FORCE_INLINE_THROW(uint32_t) \
3189 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, \
3190 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
3191 { \
3192 return iemNativeEmitSimdFp3OpCommon_rv_u128(pReNative, off, idxInstr, idxSimdGstRegDst, idxVarSrc, \
3193 a_bPrefixX86, a_bOpcX86); \
3194 } \
3195 typedef int ignore_semicolon
3196#elif defined(RT_ARCH_ARM64)
3197# define IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(a_Instr, a_enmArmOp, a_ArmElemSz, a_bPrefixX86, a_bOpcX86) \
3198 DECL_FORCE_INLINE_THROW(uint32_t) \
3199 RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, \
3200 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
3201 { \
3202 return iemNativeEmitSimdFp3OpCommon_rr_u128(pReNative, off, idxInstr, idxSimdGstRegDst, idxSimdGstRegSrc, \
3203 a_enmArmOp, a_ArmElemSz); \
3204 } \
3205 DECL_FORCE_INLINE_THROW(uint32_t) \
3206 RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, \
3207 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
3208 { \
3209 return iemNativeEmitSimdFp3OpCommon_rv_u128(pReNative, off, idxInstr, idxSimdGstRegDst, idxVarSrc, \
3210 a_enmArmOp, a_ArmElemSz); \
3211 } \
3212 typedef int ignore_semicolon
3213#else
3214# error "Port me"
3215#endif
3216
3217
3218IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(mulps, kArmv8VecInstrFpOp_Mul, kArmv8VecInstrFpSz_4x_Single, 0, 0x59);
3219IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(addps, kArmv8VecInstrFpOp_Add, kArmv8VecInstrFpSz_4x_Single, 0, 0x58);
3220IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(addpd, kArmv8VecInstrFpOp_Add, kArmv8VecInstrFpSz_2x_Double, X86_OP_PRF_SIZE_OP, 0x58);
3221IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(subps, kArmv8VecInstrFpOp_Sub, kArmv8VecInstrFpSz_4x_Single, 0, 0x5c);
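/*
 * The a_bPrefixX86 argument selects between the single and double precision forms
 * on AMD64: addps has no prefix (0f 58 /r) while addpd carries the operand-size
 * prefix (66 0f 58 /r).  On ARM64 the element size enum (4x single vs. 2x double)
 * picks the matching vector FADD/FMUL/FSUB form.
 */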
3222
3223#endif /* IEMNATIVE_WITH_SIMD_REG_ALLOCATOR */
3224
3225#endif /* !VMM_INCLUDED_SRC_VMMAll_target_x86_IEMAllN8veEmit_x86_h */