VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/target-x86/IEMAllN8veEmit-x86.h@ 106197

Last change on this file since 106197 was 106197, checked in by vboxsync, 6 months ago

VMM/IEM: Use iemNativeEmitEFlagsForLogical as emitter for all cases. bugref:10720

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 159.6 KB
Line 
1/* $Id: IEMAllN8veEmit-x86.h 106197 2024-10-01 15:35:13Z vboxsync $ */
2/** @file
3 * IEM - Native Recompiler, x86 Target - Code Emitters.
4 */
5
6/*
7 * Copyright (C) 2023-2024 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28#ifndef VMM_INCLUDED_SRC_VMMAll_target_x86_IEMAllN8veEmit_x86_h
29#define VMM_INCLUDED_SRC_VMMAll_target_x86_IEMAllN8veEmit_x86_h
30#ifndef RT_WITHOUT_PRAGMA_ONCE
31# pragma once
32#endif
33
34
35#ifdef RT_ARCH_AMD64
36
/**
 * Emits a ModR/M instruction with one opcode byte and only register operands.
 *
 * Selects bOpcode8 for 8-bit operand size and bOpcodeOther for 16/32/64-bit,
 * emitting the operand-size (0x66) and/or REX prefixes required by the
 * operand width and the register indexes.
 *
 * @returns New code buffer offset (off).
 * @param   pCodeBuf        The instruction output buffer.
 * @param   off             Current offset into pCodeBuf.
 * @param   bOpcode8        Opcode byte for the 8-bit operand-size form.
 * @param   bOpcodeOther    Opcode byte for the 16-, 32- and 64-bit forms.
 * @param   cOpBits         Operand size in bits: 8, 16, 32 or 64 (anything
 *                          else asserts and is treated as 64).
 * @param   idxRegReg       Register for the ModR/M reg field (0..15).
 * @param   idxRegRm        Register for the ModR/M r/m field (0..15).
 */
DECL_FORCE_INLINE(uint32_t)
iemNativeEmitAmd64OneByteModRmInstrRREx(PIEMNATIVEINSTR pCodeBuf, uint32_t off, uint8_t bOpcode8, uint8_t bOpcodeOther,
                                        uint8_t cOpBits, uint8_t idxRegReg, uint8_t idxRegRm)
{
    Assert(idxRegReg < 16); Assert(idxRegRm < 16);
    switch (cOpBits)
    {
        case 16:
            pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; /* 0x66 selects the 16-bit operand size. */
            RT_FALL_THRU();
        case 32:
            if (idxRegReg >= 8 || idxRegRm >= 8)
                pCodeBuf[off++] = (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
            pCodeBuf[off++] = bOpcodeOther;
            break;

        default: AssertFailed(); RT_FALL_THRU(); /* Unexpected widths are encoded as 64-bit. */
        case 64:
            pCodeBuf[off++] = X86_OP_REX_W | (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
            pCodeBuf[off++] = bOpcodeOther;
            break;

        case 8:
            if (idxRegReg >= 8 || idxRegRm >= 8)
                pCodeBuf[off++] = (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
            else if (idxRegReg >= 4 || idxRegRm >= 4)
                pCodeBuf[off++] = X86_OP_REX; /* Plain REX so regs 4-7 select SPL/BPL/SIL/DIL, not AH/CH/DH/BH. */
            pCodeBuf[off++] = bOpcode8;
            break;
    }
    pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg & 7, idxRegRm & 7);
    return off;
}
73
74
/**
 * Emits a ModR/M instruction with two opcode bytes and only register operands.
 *
 * Like iemNativeEmitAmd64OneByteModRmInstrRREx, but the opcode is the byte
 * pair bOpcode0 + bOpcode8 (8-bit operand size) or bOpcode0 + bOpcodeOther
 * (16/32/64-bit), with any REX prefix emitted before both opcode bytes.
 *
 * @returns New code buffer offset (off).
 * @param   pCodeBuf        The instruction output buffer.
 * @param   off             Current offset into pCodeBuf.
 * @param   bOpcode0        First opcode byte (escape byte, e.g. 0x0f).
 * @param   bOpcode8        Second opcode byte for the 8-bit form.
 * @param   bOpcodeOther    Second opcode byte for the 16/32/64-bit forms.
 * @param   cOpBits         Operand size in bits: 8, 16, 32 or 64 (anything
 *                          else asserts and is treated as 64).
 * @param   idxRegReg       Register for the ModR/M reg field (0..15).
 * @param   idxRegRm        Register for the ModR/M r/m field (0..15).
 */
DECL_FORCE_INLINE(uint32_t)
iemNativeEmitAmd64TwoByteModRmInstrRREx(PIEMNATIVEINSTR pCodeBuf, uint32_t off,
                                        uint8_t bOpcode0, uint8_t bOpcode8, uint8_t bOpcodeOther,
                                        uint8_t cOpBits, uint8_t idxRegReg, uint8_t idxRegRm)
{
    Assert(idxRegReg < 16); Assert(idxRegRm < 16);
    switch (cOpBits)
    {
        case 16:
            pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; /* 0x66 selects the 16-bit operand size. */
            RT_FALL_THRU();
        case 32:
            if (idxRegReg >= 8 || idxRegRm >= 8)
                pCodeBuf[off++] = (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
            pCodeBuf[off++] = bOpcode0;
            pCodeBuf[off++] = bOpcodeOther;
            break;

        default: AssertFailed(); RT_FALL_THRU(); /* Unexpected widths are encoded as 64-bit. */
        case 64:
            pCodeBuf[off++] = X86_OP_REX_W | (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
            pCodeBuf[off++] = bOpcode0;
            pCodeBuf[off++] = bOpcodeOther;
            break;

        case 8:
            if (idxRegReg >= 8 || idxRegRm >= 8)
                pCodeBuf[off++] = (idxRegReg >= 8 ? X86_OP_REX_R : 0) | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
            else if (idxRegReg >= 4 || idxRegRm >= 4)
                pCodeBuf[off++] = X86_OP_REX; /* Plain REX so regs 4-7 select SPL/BPL/SIL/DIL, not AH/CH/DH/BH. */
            pCodeBuf[off++] = bOpcode0;
            pCodeBuf[off++] = bOpcode8;
            break;
    }
    pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg & 7, idxRegRm & 7);
    return off;
}
115
116
/**
 * Emits one of three opcodes with an immediate.
 *
 * These are expected to be a /idxRegReg form.
 *
 * The sign-extended imm8 form (bOpcodeOtherImm8, e.g. 0x83) is used when
 * cImmBits is 8, or when uImmOp fits in 0..0x7f and that form exists
 * (bOpcodeOtherImm8 == 0xcc marks it as unavailable).  Otherwise the full
 * form (bOpcodeOther) is emitted with a 16-bit immediate (cOpBits == 16)
 * or a 32-bit immediate (cOpBits == 32/64).
 *
 * @returns New code buffer offset (off).
 * @param   pCodeBuf            The instruction output buffer.
 * @param   off                 Current offset into pCodeBuf.
 * @param   bOpcode8            Opcode for the 8-bit operand + imm8 form.
 * @param   bOpcodeOtherImm8    Opcode for the 16/32/64-bit operand + sign
 *                              extended imm8 form; 0xcc if not available.
 * @param   bOpcodeOther        Opcode for the 16/32/64-bit operand with a
 *                              full-sized immediate.
 * @param   cOpBits             Operand size in bits: 8, 16, 32 or 64.
 * @param   cImmBits            Immediate size in bits (asserted per path).
 * @param   idxRegReg           Value for the ModR/M reg field (0..7), i.e.
 *                              the /r opcode extension.
 * @param   idxRegRm            Register for the ModR/M r/m field (0..15).
 * @param   uImmOp              The immediate operand value.
 */
DECL_FORCE_INLINE(uint32_t)
iemNativeEmitAmd64OneByteModRmInstrRIEx(PIEMNATIVEINSTR pCodeBuf, uint32_t off, uint8_t bOpcode8, uint8_t bOpcodeOtherImm8,
                                        uint8_t bOpcodeOther, uint8_t cOpBits, uint8_t cImmBits, uint8_t idxRegReg,
                                        uint8_t idxRegRm, uint64_t uImmOp)
{
    Assert(idxRegReg < 8); Assert(idxRegRm < 16);
    if (   cImmBits == 8
        || (uImmOp <= (uint64_t)0x7f && bOpcodeOtherImm8 != 0xcc))
    {
        /* The imm8 path. */
        switch (cOpBits)
        {
            case 16:
                pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; /* 0x66 selects the 16-bit operand size. */
                RT_FALL_THRU();
            case 32:
                if (idxRegRm >= 8)
                    pCodeBuf[off++] = X86_OP_REX_B;
                pCodeBuf[off++] = bOpcodeOtherImm8; Assert(bOpcodeOtherImm8 != 0xcc);
                break;

            default: AssertFailed(); RT_FALL_THRU(); /* Unexpected widths are encoded as 64-bit. */
            case 64:
                pCodeBuf[off++] = X86_OP_REX_W | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
                pCodeBuf[off++] = bOpcodeOtherImm8; Assert(bOpcodeOtherImm8 != 0xcc);
                break;

            case 8:
                if (idxRegRm >= 8)
                    pCodeBuf[off++] = X86_OP_REX_B;
                else if (idxRegRm >= 4)
                    pCodeBuf[off++] = X86_OP_REX; /* Plain REX so regs 4-7 select SPL/BPL/SIL/DIL. */
                pCodeBuf[off++] = bOpcode8; Assert(bOpcode8 != 0xcc);
                break;
        }
        pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg, idxRegRm & 7);
        pCodeBuf[off++] = (uint8_t)uImmOp;
    }
    else
    {
        /* The full-sized immediate path; case 16 emits and returns directly. */
        switch (cOpBits)
        {
            case 32:
                if (idxRegRm >= 8)
                    pCodeBuf[off++] = X86_OP_REX_B;
                break;

            default: AssertFailed(); RT_FALL_THRU(); /* Unexpected widths are encoded as 64-bit. */
            case 64:
                pCodeBuf[off++] = X86_OP_REX_W | (idxRegRm >= 8 ? X86_OP_REX_B : 0);
                break;

            case 16:
                pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
                if (idxRegRm >= 8)
                    pCodeBuf[off++] = X86_OP_REX_B;
                pCodeBuf[off++] = bOpcodeOther;
                pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg, idxRegRm & 7);
                pCodeBuf[off++] = RT_BYTE1(uImmOp);
                pCodeBuf[off++] = RT_BYTE2(uImmOp);
                Assert(cImmBits == 16);
                return off;
        }
        pCodeBuf[off++] = bOpcodeOther;
        pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegReg, idxRegRm & 7);
        pCodeBuf[off++] = RT_BYTE1(uImmOp);
        pCodeBuf[off++] = RT_BYTE2(uImmOp);
        pCodeBuf[off++] = RT_BYTE3(uImmOp);
        pCodeBuf[off++] = RT_BYTE4(uImmOp);
        Assert(cImmBits == 32);
    }
    return off;
}
194
195#endif /* RT_ARCH_AMD64 */
196
197
198
199/*********************************************************************************************************************************
200* EFLAGS *
201*********************************************************************************************************************************/
202
203#ifdef IEMNATIVE_WITH_EFLAGS_POSTPONING
204
205/** @def IEMNATIVE_POSTPONING_REG_MASK
206 * Register suitable for keeping the inputs or result for a postponed EFLAGS
207 * calculation.
208 *
209 * We use non-volatile register here so we don't have to save & restore them
 * across callouts (i.e. TLB loads).
211 *
212 * @note On x86 we cannot use RDI and RSI because these are used by the
213 * opcode checking code. The usual joy of the x86 instruction set.
214 */
215# ifdef RT_ARCH_AMD64
216# define IEMNATIVE_POSTPONING_REG_MASK \
217 (IEMNATIVE_CALL_NONVOLATILE_GREG_MASK & ~(RT_BIT_32(X86_GREG_xDI) | RT_BIT_32(X86_GREG_xSI)))
218# else
219# define IEMNATIVE_POSTPONING_REG_MASK IEMNATIVE_CALL_NONVOLATILE_GREG_MASK
220# endif
221
/**
 * This is normally invoked via IEMNATIVE_CLEAR_POSTPONED_EFLAGS().
 *
 * Removes @a a_fEflClobbered from the set of postponed EFLAGS bits.  If no
 * postponed bits remain, the pending calculation is abandoned entirely: the
 * postponed-EFLAGS state is reset and its register(s) are freed.
 *
 * @tparam  a_fEflClobbered     Status flags clobbered by the current
 *                              instruction; must be a subset of
 *                              X86_EFL_STATUS_BITS (compile-time checked).
 */
template<uint32_t const a_fEflClobbered>
DECL_FORCE_INLINE(void) iemNativeClearPostponedEFlags(PIEMRECOMPILERSTATE pReNative)
{
    AssertCompile(!(a_fEflClobbered & ~X86_EFL_STATUS_BITS));
    uint32_t fEFlags = pReNative->PostponedEfl.fEFlags;
    if (fEFlags)
    {
        /* When only some status bits are clobbered, we may be able to keep the
           postponed calculation alive for the remaining ones. */
        if RT_CONSTEXPR_IF(a_fEflClobbered != X86_EFL_STATUS_BITS)
        {
            fEFlags &= ~a_fEflClobbered;
            if (!fEFlags)
            { /* likely */ }
            else
            {
                Log5(("EFLAGS: Clobbering %#x: %#x -> %#x (op=%d bits=%u) - iemNativeClearPostponedEFlags\n", a_fEflClobbered,
                      pReNative->PostponedEfl.fEFlags, fEFlags, pReNative->PostponedEfl.enmOp, pReNative->PostponedEfl.cOpBits));
                pReNative->PostponedEfl.fEFlags = fEFlags;
                return;
            }
        }

        /* Do cleanup. */
        Log5(("EFLAGS: Cleanup of op=%u bits=%u efl=%#x upon clobbering %#x - iemNativeClearPostponedEFlags\n",
              pReNative->PostponedEfl.enmOp, pReNative->PostponedEfl.cOpBits, pReNative->PostponedEfl.fEFlags, a_fEflClobbered));
        pReNative->PostponedEfl.fEFlags = 0;
        pReNative->PostponedEfl.enmOp   = kIemNativePostponedEflOp_Invalid;
        pReNative->PostponedEfl.cOpBits = 0;
        /* idxReg1 is always valid while a calculation is pending; idxReg2 only sometimes. */
        iemNativeRegFreeTmp(pReNative, pReNative->PostponedEfl.idxReg1);
        if (pReNative->PostponedEfl.idxReg2 != UINT8_MAX)
            iemNativeRegFreeTmp(pReNative, pReNative->PostponedEfl.idxReg2);
        pReNative->PostponedEfl.idxReg1 = UINT8_MAX;
        pReNative->PostponedEfl.idxReg2 = UINT8_MAX;
# if defined(VBOX_WITH_STATISTICS) || defined(IEMNATIVE_WITH_TB_DEBUG_INFO)
        STAM_PROFILE_ADD_PERIOD(&pReNative->pVCpu->iem.s.StatNativeEflPostponedEmits, pReNative->PostponedEfl.cEmits);
        pReNative->PostponedEfl.cEmits = 0;
# endif
    }
}
263
264#endif /* IEMNATIVE_WITH_EFLAGS_POSTPONING */
265
266
/**
 * Emits code calculating the EFLAGS status bits for a logical operation
 * (OF and CF cleared; SF, ZF and PF taken from the result).
 *
 * @returns New code buffer offset (off).
 * @tparam  a_fDoOp         Whether to first emit an instruction that sets the
 *                          host flags from idxRegResult (TEST on AMD64, ANDS
 *                          on ARM64); pass false when the preceding emitted
 *                          instruction already set the host flags.
 * @param   pCodeBuf        The instruction output buffer.
 * @param   off             Current offset into pCodeBuf.
 * @param   cOpBits         Width of the operation in bits (8/16/32/64).
 * @param   idxRegResult    Host register holding the operation result.
 * @param   idxRegEfl       Host register holding the guest EFLAGS; the low
 *                          byte is overwritten with the new status bits.
 * @param   idxRegTmp       Scratch host register.
 */
template<bool const a_fDoOp>
DECL_INLINE_THROW(uint32_t) iemNativeEmitPostponedEFlagsCalcLogical(PIEMNATIVEINSTR pCodeBuf, uint32_t off, uint8_t cOpBits,
                                                                    uint8_t idxRegResult, uint8_t idxRegEfl, uint8_t idxRegTmp)
{
#ifdef RT_ARCH_AMD64
    /* Do TEST idxRegResult, idxRegResult to set flags. */
    if RT_CONSTEXPR_IF(a_fDoOp)
        off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0x84, 0x85, cOpBits, idxRegResult, idxRegResult);

    /*
     * Collect the EFLAGS status bits.
     * We know that the overflow bit will always be cleared, so LAHF can be used.
     */
    if (idxRegTmp == X86_GREG_xAX)
    {
        /* lahf ; AH = EFLAGS */
        pCodeBuf[off++] = 0x9f;
        if (idxRegEfl <= X86_GREG_xBX)
        {
            /* mov [CDB]L, AH */
            pCodeBuf[off++] = 0x88;
            pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, idxRegEfl);
        }
        else
        {
            /* AH cannot be addressed together with REX, so bounce via AL. */
            /* mov AL, AH */
            pCodeBuf[off++] = 0x88;
            pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, 0 /*AL*/);
            /* mov xxL, AL */
            pCodeBuf[off++] = idxRegEfl >= 8 ? X86_OP_REX_B : X86_OP_REX;
            pCodeBuf[off++] = 0x88;
            pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 0 /*AL*/, idxRegEfl & 7);
        }
    }
    else if (idxRegEfl != X86_GREG_xAX)
    {
        /* Neither register is RAX: temporarily swap RAX with the scratch reg so LAHF works. */
# if 1 /* This is 1 or 4 bytes larger, but avoids the stack. */
        /* xchg rax, tmp */
        pCodeBuf[off++] = idxRegTmp < 8 ? X86_OP_REX_W : X86_OP_REX_B | X86_OP_REX_W;
        pCodeBuf[off++] = 0x90 + (idxRegTmp & 7);

        /* lahf ; AH = EFLAGS */
        pCodeBuf[off++] = 0x9f;
        if (idxRegEfl <= X86_GREG_xBX)
        {
            /* mov [CDB]L, AH */
            pCodeBuf[off++] = 0x88;
            pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, idxRegEfl);
        }
        else
        {
            /* mov AL, AH */
            pCodeBuf[off++] = 0x88;
            pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, 0 /*AL*/);
            /* mov xxL, AL */
            pCodeBuf[off++] = idxRegEfl >= 8 ? X86_OP_REX_B : X86_OP_REX;
            pCodeBuf[off++] = 0x88;
            pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 0 /*AL*/, idxRegEfl & 7);
        }

        /* xchg rax, tmp ; restore RAX. */
        pCodeBuf[off++] = idxRegTmp < 8 ? X86_OP_REX_W : X86_OP_REX_B | X86_OP_REX_W;
        pCodeBuf[off++] = 0x90 + (idxRegTmp & 7);

# else
        /* pushf */
        pCodeBuf[off++] = 0x9c;
        /* pop tmp */
        if (idxRegTmp >= 8)
            pCodeBuf[off++] = X86_OP_REX_B;
        pCodeBuf[off++] = 0x58 + (idxRegTmp & 7);
        /* mov byte(efl), byte(tmp) */
        if (idxRegEfl >= 4 || idxRegTmp >= 4)
            pCodeBuf[off++] = (idxRegEfl >= 8 ? X86_OP_REX_B : X86_OP_REX)
                            | (idxRegTmp >= 8 ? X86_OP_REX_R : 0);
        pCodeBuf[off++] = 0x88;
        pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegTmp & 7, idxRegEfl & 7);
# endif
    }
    else
    {
        /* idxRegEfl == RAX: preserve AL around LAHF by swapping it with AH. */
        /* xchg al, ah */
        pCodeBuf[off++] = 0x86;
        pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, 0 /*AL*/);
        /* lahf ; AH = EFLAGS */
        pCodeBuf[off++] = 0x9f;
        /* xchg al, ah */
        pCodeBuf[off++] = 0x86;
        pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 4 /*AH*/, 0 /*AL*/);
    }
    /* BTR idxEfl, 11; Clear OF (LAHF does not provide it). */
    if (idxRegEfl >= 8)
        pCodeBuf[off++] = X86_OP_REX_B;
    pCodeBuf[off++] = 0xf;
    pCodeBuf[off++] = 0xba;
    pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 6, idxRegEfl & 7);
    pCodeBuf[off++] = X86_EFL_OF_BIT;

#elif defined(RT_ARCH_ARM64)
    /*
     * Calculate flags.
     */
    /* Clear the status bits. ~0x8D5 (or ~0x8FD) can't be AND immediate, so use idxRegTmp for constant. */
    off = iemNativeEmitLoadGpr32ImmExT<~X86_EFL_STATUS_BITS>(pCodeBuf, off, idxRegTmp);
    off = iemNativeEmitAndGpr32ByGpr32Ex(pCodeBuf, off, idxRegEfl, idxRegTmp);

    /* N,Z -> SF,ZF */
    if (cOpBits < 32)
        pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegResult, cOpBits > 8); /* sets NZ */
    else if RT_CONSTEXPR_IF(a_fDoOp)
        pCodeBuf[off++] = Armv8A64MkInstrAnds(ARMV8_A64_REG_XZR, idxRegResult, idxRegResult, cOpBits > 32 /*f64Bit*/);
    pCodeBuf[off++] = Armv8A64MkInstrMrs(idxRegTmp, ARMV8_AARCH64_SYSREG_NZCV); /* Bits: 31=N; 30=Z; 29=C; 28=V; */
    pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegTmp, 30);
    pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_ZF_BIT, 2, false /*f64Bit*/);
    AssertCompile(X86_EFL_ZF_BIT + 1 == X86_EFL_SF_BIT);

    /* Calculate 8-bit parity of the result by xor-folding the low byte down to one bit. */
    pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegResult, idxRegResult, false /*f64Bit*/,
                                         4 /*offShift6*/, kArmv8A64InstrShift_Lsr);
    pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegTmp, idxRegTmp, false /*f64Bit*/,
                                         2 /*offShift6*/, kArmv8A64InstrShift_Lsr);
    pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegTmp, idxRegTmp, false /*f64Bit*/,
                                         1 /*offShift6*/, kArmv8A64InstrShift_Lsr);
    Assert(Armv8A64ConvertImmRImmS2Mask32(0, 0) == 1);
    pCodeBuf[off++] = Armv8A64MkInstrEorImm(idxRegTmp, idxRegTmp, 0, 0, false /*f64Bit*/); /* invert: x86 PF means even parity */
    pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_PF_BIT, 1, false /*f64Bit*/);

#else
# error "port me"
#endif
    return off;
}
399
400#ifdef IEMNATIVE_WITH_EFLAGS_POSTPONING
401
402template<uint32_t const a_bmInputRegs, bool const a_fTlbMiss = false>
403static uint32_t iemNativeDoPostponedEFlagsInternal(PIEMRECOMPILERSTATE pReNative, uint32_t off, PIEMNATIVEINSTR pCodeBuf,
404 uint32_t bmExtraTlbMissRegs = 0)
405{
406# ifdef IEMNATIVE_WITH_TB_DEBUG_INFO
407 iemNativeDbgInfoAddPostponedEFlagsCalc(pReNative, off, pReNative->PostponedEfl.enmOp, pReNative->PostponedEfl.cOpBits,
408 pReNative->PostponedEfl.cEmits);
409# endif
410
411 /*
412 * In the TB exit code path we cannot do regular register allocation. Nor
413 * can we when we're in the TLB miss code, unless we're skipping the TLB
414 * lookup. Since the latter isn't an important usecase and should get along
415 * fine on just volatile registers, we do not need to do anything special
416 * for it.
417 *
418 * So, we do our own register allocating here. Any register goes in the TB
419 * exit path, excluding a_bmInputRegs, fixed and postponed related registers.
420 * In the TLB miss we can use any volatile register and temporary registers
421 * allocated in the TLB state.
422 *
423 * Note! On x86 we prefer using RAX as the first TMP register, so we can
424 * make use of LAHF which is typically faster than PUSHF/POP. This
425 * is why the idxRegTmp allocation is first when there is no EFLAG
426 * shadow, since RAX is represented by bit 0 in the mask.
427 */
428 uint32_t bmAvailableRegs;
429 if RT_CONSTEXPR_IF(!a_fTlbMiss)
430 {
431 bmAvailableRegs = ~(a_bmInputRegs | IEMNATIVE_REG_FIXED_MASK) & IEMNATIVE_HST_GREG_MASK;
432 if (pReNative->PostponedEfl.idxReg2 != UINT8_MAX)
433 bmAvailableRegs &= ~(RT_BIT_32(pReNative->PostponedEfl.idxReg1) | RT_BIT_32(pReNative->PostponedEfl.idxReg2));
434 else
435 bmAvailableRegs &= ~RT_BIT_32(pReNative->PostponedEfl.idxReg1);
436 }
437 else
438 {
439 /* Note! a_bmInputRegs takes precedence over bmExtraTlbMissRegs. */
440 bmAvailableRegs = (IEMNATIVE_CALL_VOLATILE_GREG_MASK | bmExtraTlbMissRegs)
441 & ~(a_bmInputRegs | IEMNATIVE_REG_FIXED_MASK)
442 & IEMNATIVE_HST_GREG_MASK;
443 }
444
445 /* Use existing EFLAGS shadow if available. For the TLB-miss code path we
446 need to weed out volatile registers here, as they will no longer be valid. */
447 uint8_t idxRegTmp;
448 uint8_t idxRegEfl = pReNative->Core.aidxGstRegShadows[kIemNativeGstReg_EFlags];
449 if ( (pReNative->Core.bmGstRegShadows & RT_BIT_64(kIemNativeGstReg_EFlags))
450 && (!a_fTlbMiss || !(RT_BIT_32(idxRegEfl) & IEMNATIVE_CALL_VOLATILE_GREG_MASK)))
451 {
452 Assert(idxRegEfl < IEMNATIVE_HST_GREG_COUNT);
453 Assert(!(a_bmInputRegs & RT_BIT_32(idxRegEfl)));
454 if RT_CONSTEXPR_IF(!a_fTlbMiss) Assert(bmAvailableRegs & RT_BIT_32(idxRegEfl));
455 bmAvailableRegs &= ~RT_BIT_32(idxRegEfl);
456# ifdef VBOX_STRICT
457 off = iemNativeEmitGuestRegValueCheckEx(pReNative, pCodeBuf, off, idxRegEfl, kIemNativeGstReg_EFlags);
458# endif
459
460 idxRegTmp = ASMBitFirstSetU32(bmAvailableRegs) - 1;
461 bmAvailableRegs &= ~RT_BIT_32(idxRegTmp);
462 }
463 else
464 {
465 idxRegTmp = ASMBitFirstSetU32(bmAvailableRegs) - 1; /* allocate the temp register first to prioritize EAX on x86. */
466 bmAvailableRegs &= ~RT_BIT_32(idxRegTmp);
467
468 idxRegEfl = ASMBitFirstSetU32(bmAvailableRegs) - 1;
469 bmAvailableRegs &= ~RT_BIT_32(idxRegTmp);
470 off = iemNativeEmitLoadGprFromVCpuU32Ex(pCodeBuf, off, idxRegEfl, RT_UOFFSETOF(VMCPU, cpum.GstCtx.eflags));
471 }
472 Assert(bmAvailableRegs != 0);
473
474 /*
475 * Do the actual EFLAGS calculation.
476 */
477 switch (pReNative->PostponedEfl.enmOp)
478 {
479 case kIemNativePostponedEflOp_Logical:
480 Assert(pReNative->PostponedEfl.idxReg2 == UINT8_MAX);
481 off = iemNativeEmitPostponedEFlagsCalcLogical<true>(pCodeBuf, off, pReNative->PostponedEfl.cOpBits,
482 pReNative->PostponedEfl.idxReg1, idxRegEfl, idxRegTmp);
483 break;
484
485 default:
486 AssertFailedBreak();
487 }
488
489 /*
490 * Store EFLAGS.
491 */
492# ifdef VBOX_STRICT
493 /* check that X86_EFL_1 is set. */
494 uint32_t offFixup1;
495 off = iemNativeEmitTestBitInGprAndJmpToFixedIfSetEx(pCodeBuf, off, idxRegEfl, X86_EFL_1_BIT, off, &offFixup1);
496 off = iemNativeEmitBrkEx(pCodeBuf, off, 0x3330);
497 iemNativeFixupFixedJump(pReNative, offFixup1, off);
498 /* Check that X86_EFL_RAZ_LO_MASK is zero. */
499 off = iemNativeEmitTestAnyBitsInGpr32Ex(pCodeBuf, off, idxRegEfl, X86_EFL_RAZ_LO_MASK);
500 uint32_t const offFixup2 = off;
501 off = iemNativeEmitJccToFixedEx(pCodeBuf, off, off, kIemNativeInstrCond_e);
502 off = iemNativeEmitBrkEx(pCodeBuf, off, 0x3331);
503 iemNativeFixupFixedJump(pReNative, offFixup2, off);
504# endif
505 off = iemNativeEmitStoreGprToVCpuU32Ex(pCodeBuf, off, idxRegEfl, RT_UOFFSETOF(VMCPU, cpum.GstCtx.eflags));
506 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
507
508# if defined(VBOX_WITH_STATISTICS) || defined(IEMNATIVE_WITH_TB_DEBUG_INFO)
509 pReNative->PostponedEfl.cEmits++;
510# endif
511 return off;
512}
513
514
515
516template<uint32_t const a_bmInputRegs>
517DECL_FORCE_INLINE_THROW(uint32_t)
518iemNativeDoPostponedEFlagsAtTbExit(PIEMRECOMPILERSTATE pReNative, uint32_t off)
519{
520 if (pReNative->PostponedEfl.fEFlags)
521 {
522 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, IEMNATIVE_MAX_POSTPONED_EFLAGS_INSTRUCTIONS);
523 return iemNativeDoPostponedEFlagsInternal<a_bmInputRegs>(pReNative, off, pCodeBuf);
524 }
525 return off;
526}
527
528
529template<uint32_t const a_bmInputRegs>
530DECL_FORCE_INLINE_THROW(uint32_t)
531iemNativeDoPostponedEFlagsAtTbExitEx(PIEMRECOMPILERSTATE pReNative, uint32_t off, PIEMNATIVEINSTR pCodeBuf)
532{
533 if (pReNative->PostponedEfl.fEFlags)
534 return iemNativeDoPostponedEFlagsInternal<a_bmInputRegs>(pReNative, off, pCodeBuf);
535 return off;
536}
537
538
539template<uint32_t const a_bmInputRegs>
540DECL_FORCE_INLINE_THROW(uint32_t)
541iemNativeDoPostponedEFlagsAtTlbMiss(PIEMRECOMPILERSTATE pReNative, uint32_t off, const IEMNATIVEEMITTLBSTATE *pTlbState,
542 uint32_t bmTmpRegs)
543{
544 if (pReNative->PostponedEfl.fEFlags)
545 {
546 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, IEMNATIVE_MAX_POSTPONED_EFLAGS_INSTRUCTIONS);
547 return iemNativeDoPostponedEFlagsInternal<a_bmInputRegs, true>(pReNative, off, pCodeBuf,
548 pTlbState->getRegsNotToSave() | bmTmpRegs);
549 }
550 return off;
551}
552
553#endif /* IEMNATIVE_WITH_EFLAGS_POSTPONING */
554
555
/**
 * This is an implementation of IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL.
 *
 * It takes liveness stuff into account: the whole update may be skipped when
 * all status flags will be clobbered before being read, or postponed when
 * they can be calculated later (e.g. only at a TB exit).  Otherwise the
 * flags are calculated and stored into the idxVarEfl variable right away.
 *
 * @returns New code buffer offset (off).
 * @param   pReNative       The native recompiler state.
 * @param   off             Current instruction buffer offset.
 * @param   idxVarEfl       Variable holding the guest EFLAGS.
 * @param   cOpBits         Width of the logical operation in bits.
 * @param   idxRegResult    Host register holding the result (UINT8_MAX
 *                          disables the postponing path).
 * @param   fNativeFlags    (non-AMD64 only) Whether the host flags are
 *                          already set by the preceding instruction.
 */
/** @todo make fNativeFlags a template argument. */
DECL_INLINE_THROW(uint32_t)
iemNativeEmitEFlagsForLogical(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarEfl,
                              uint8_t cOpBits, uint8_t idxRegResult
#ifndef RT_ARCH_AMD64
                              , bool fNativeFlags = false
#endif
                              )
{
    STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflTotalLogical);
    IEMNATIVE_CLEAR_POSTPONED_EFLAGS(pReNative, X86_EFL_STATUS_BITS);
    RT_NOREF(cOpBits, idxRegResult);

#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
    /*
     * See if we can skip this wholesale.
     */
    PCIEMLIVENESSENTRY const pLivenessEntry = &pReNative->paLivenessEntries[pReNative->idxCurCall];
    uint64_t const fEflClobbered = IEMLIVENESS_STATE_GET_WILL_BE_CLOBBERED_SET(pLivenessEntry)
                                 & IEMLIVENESSBIT_STATUS_EFL_MASK;
# ifdef IEMNATIVE_WITH_EFLAGS_POSTPONING
    uint64_t fEflPostponing;
# endif
    if (   fEflClobbered == IEMLIVENESSBIT_STATUS_EFL_MASK
        && !(pReNative->fMc & IEM_MC_F_WITH_FLAGS))
    {
        /* All status flags are clobbered before use: skip the calculation entirely. */
        STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflSkippedLogical);
        pReNative->fSkippingEFlags = X86_EFL_STATUS_BITS;
# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
        off = iemNativeEmitOrImmIntoVCpuU32(pReNative, off, X86_EFL_STATUS_BITS, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
# endif
        Log5(("EFLAGS: Skipping %#x - iemNativeEmitEFlagsForLogical\n", X86_EFL_STATUS_BITS));
        return off;
    }
# ifdef IEMNATIVE_WITH_EFLAGS_POSTPONING
    /* Every status flag is either clobbered or postponable: defer the work,
       keeping a copy of the result in a non-volatile register. */
    if (   (   (fEflPostponing = IEMLIVENESS_STATE_GET_CAN_BE_POSTPONED_SET(pLivenessEntry) & IEMLIVENESSBIT_STATUS_EFL_MASK)
             | fEflClobbered)
          == IEMLIVENESSBIT_STATUS_EFL_MASK
        && idxRegResult != UINT8_MAX)
    {
        STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflPostponedLogical);
        pReNative->PostponedEfl.fEFlags = X86_EFL_STATUS_BITS;
        pReNative->PostponedEfl.enmOp   = kIemNativePostponedEflOp_Logical;
        pReNative->PostponedEfl.cOpBits = cOpBits;
        pReNative->PostponedEfl.idxReg1 = iemNativeRegAllocTmpEx(pReNative, &off, IEMNATIVE_POSTPONING_REG_MASK, false);
        /** @todo it would normally be possible to use idxRegResult, iff it is
         *        already a non-volatile register and we can be sure the caller
         *        doesn't modify it. That'll save a register move and allocation. */
        off = iemNativeEmitLoadGprFromGpr(pReNative, off, pReNative->PostponedEfl.idxReg1, idxRegResult);
        Log5(("EFLAGS: Postponing %#x op=%u bits=%u reg1=%u - iemNativeEmitEFlagsForLogical\n", X86_EFL_STATUS_BITS,
              kIemNativePostponedEflOp_Logical, cOpBits, pReNative->PostponedEfl.idxReg1));
    }
# endif
    else
#endif
    {
        /* Calculate the flags right away. */
        uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
        uint8_t const idxRegTmp = iemNativeRegAllocTmp(pReNative, &off);
#ifdef RT_ARCH_AMD64
        PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 32);
#elif defined(RT_ARCH_ARM64)
        PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 16);
#else
# error "port me"
#endif
        /* On AMD64 the logical instruction itself set the host flags, so no
           extra flag-setting op (a_fDoOp=false) is needed there. */
#ifndef RT_ARCH_AMD64
        if (!fNativeFlags)
            off = iemNativeEmitPostponedEFlagsCalcLogical<true>(pCodeBuf, off, cOpBits, idxRegResult, idxRegEfl, idxRegTmp);
        else
#endif
            off = iemNativeEmitPostponedEFlagsCalcLogical<false>(pCodeBuf, off, cOpBits, idxRegResult, idxRegEfl, idxRegTmp);
        IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);

        iemNativeVarRegisterRelease(pReNative, idxVarEfl);
        iemNativeRegFreeTmp(pReNative, idxRegTmp);
    }

#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
    if (pReNative->fSkippingEFlags)
        Log5(("EFLAGS: fSkippingEFlags %#x -> 0 (iemNativeEmitEFlagsForLogical)\n", pReNative->fSkippingEFlags));
    pReNative->fSkippingEFlags = 0;
# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
    off = iemNativeEmitStoreImmToVCpuU32(pReNative, off, 0, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
# endif
#endif
    return off;
}
648
649
/**
 * This is an implementation of IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC.
 *
 * It takes liveness stuff into account: when all status flags will be
 * clobbered before use the update is skipped, otherwise the flags are
 * calculated and merged into the EFLAGS register/variable.
 *
 * @returns New code buffer offset (off).
 * @param   pReNative       The native recompiler state.
 * @param   off             Current instruction buffer offset.
 * @param   idxVarEfl       Variable holding the guest EFLAGS; only used when
 *                          idxRegEflIn is UINT8_MAX.
 * @param   idxRegEflIn     Host register already holding EFLAGS, or UINT8_MAX
 *                          to acquire one from idxVarEfl.
 * @param   cOpBits         (non-AMD64 only) Width of the operation in bits.
 * @param   idxRegResult    (non-AMD64 only) Register holding the result.
 * @param   idxRegDstIn     (non-AMD64 only) Register holding the original
 *                          destination operand value.
 * @param   idxRegSrc       (non-AMD64 only) Register holding the source
 *                          operand, or UINT8_MAX when uImmSrc is used.
 * @param   fInvertCarry    (non-AMD64 only) Whether to invert the host carry
 *                          (ARM stores borrow inverted); also doubles as the
 *                          sbb-vs-adc indicator below (see HACK ALERT notes).
 * @param   uImmSrc         (non-AMD64 only) Immediate source operand, used
 *                          when idxRegSrc is UINT8_MAX.
 */
DECL_FORCE_INLINE_THROW(uint32_t)
iemNativeEmitEFlagsForArithmetic(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxVarEfl, uint8_t idxRegEflIn
#ifndef RT_ARCH_AMD64
                                 , uint8_t cOpBits, uint8_t idxRegResult, uint8_t idxRegDstIn, uint8_t idxRegSrc
                                 , bool fInvertCarry, uint64_t uImmSrc
#endif
                                 )
{
    STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflTotalArithmetic);
    IEMNATIVE_CLEAR_POSTPONED_EFLAGS(pReNative, X86_EFL_STATUS_BITS);

#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
    /*
     * See if we can skip this wholesale.
     */
    PCIEMLIVENESSENTRY const pLivenessEntry = &pReNative->paLivenessEntries[pReNative->idxCurCall];
    if (   IEMLIVENESS_STATE_ARE_STATUS_EFL_TO_BE_CLOBBERED(pLivenessEntry)
        && !(pReNative->fMc & IEM_MC_F_WITH_FLAGS))
    {
        STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflSkippedArithmetic);
        pReNative->fSkippingEFlags = X86_EFL_STATUS_BITS;
        Log5(("EFLAGS: Skipping %#x - iemNativeEmitEFlagsForArithmetic\n", X86_EFL_STATUS_BITS));
# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
        off = iemNativeEmitOrImmIntoVCpuU32(pReNative, off, X86_EFL_STATUS_BITS, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
# endif
    }
    else
#endif
    {
#ifdef RT_ARCH_AMD64
        /*
         * Collect flags and merge them with eflags.
         */
        PIEMNATIVEINSTR pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
        /* pushf - do this before any reg allocations as they may emit instructions too. */
        pCodeBuf[off++] = 0x9c;

        uint8_t const idxRegEfl = idxRegEflIn != UINT8_MAX ? idxRegEflIn
                                : iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
        uint8_t const idxTmpReg = iemNativeRegAllocTmp(pReNative, &off);
        pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2 + 7 + 7 + 3);
        /* pop tmp */
        if (idxTmpReg >= 8)
            pCodeBuf[off++] = X86_OP_REX_B;
        pCodeBuf[off++] = 0x58 + (idxTmpReg & 7);
        /* Isolate the flags we want. */
        off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxTmpReg, X86_EFL_STATUS_BITS);
        /* Clear the status bits in EFLs. */
        off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegEfl, ~X86_EFL_STATUS_BITS);
        /* OR in the flags we collected. */
        off = iemNativeEmitOrGpr32ByGprEx(pCodeBuf, off, idxRegEfl, idxTmpReg);
        if (idxRegEflIn != idxRegEfl)
            iemNativeVarRegisterRelease(pReNative, idxVarEfl);
        iemNativeRegFreeTmp(pReNative, idxTmpReg);

#elif defined(RT_ARCH_ARM64)
        /*
         * Calculate flags.
         */
        uint8_t const idxRegEfl = idxRegEflIn != UINT8_MAX ? idxRegEflIn
                                : iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
        uint8_t const idxTmpReg = iemNativeRegAllocTmp(pReNative, &off);
        /* A second temp is only needed for the sub-32-bit OF calculation below. */
        uint8_t const idxTmpReg2 = cOpBits >= 32 ? UINT8_MAX : iemNativeRegAllocTmp(pReNative, &off);
        PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 20);

        /* Invert CF (stored inverted on ARM) and load the flags into the temporary register. */
        if (fInvertCarry)
            pCodeBuf[off++] = ARMV8_A64_INSTR_CFINV;
        pCodeBuf[off++] = Armv8A64MkInstrMrs(idxTmpReg, ARMV8_AARCH64_SYSREG_NZCV); /* Bits: 31=N; 30=Z; 29=C; 28=V; */

        if (cOpBits >= 32)
        {
            /* V -> OF */
            pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, 28);
            pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_OF_BIT, 1, false /*f64Bit*/);

            /* C -> CF */
            pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, 1);
            pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_CF_BIT, 1, false /*f64Bit*/);
        }

        /* N,Z -> SF,ZF */
        pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, cOpBits >= 32 ? 1 : 30);
        pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_ZF_BIT, 2, false /*f64Bit*/);

        /* For ADC and SBB we have to calculate overflow and carry ourselves. */
        if (cOpBits < 32)
        {
            /* Since the carry flag is the zero'th flag, we just use BFXIL to copy it over. */
            AssertCompile(X86_EFL_CF_BIT == 0);
            pCodeBuf[off++] = Armv8A64MkInstrBfxil(idxRegEfl, idxRegResult, cOpBits, 1, false /*f64Bit*/);

            /* The overflow flag is more work as we have to compare the signed bits for
               both inputs and the result. See IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC.

               Formula: ~(a_uDst ^ a_uSrcOf) & (a_uResult ^ a_uDst)
               With a_uSrcOf as a_uSrc for additions and ~a_uSrc for subtractions.

               It is a bit simpler when the right (source) side is constant:
               adc: S D R -> OF     sbb: S D R -> OF
                    0 0 0 -> 0 \         0 0 0 -> 0 \
                    0 0 1 -> 1  \        0 0 1 -> 0  \
                    0 1 0 -> 0  / and not(D), R      0 1 0 -> 1  / and D, not(R)
                    0 1 1 -> 0 /         0 1 1 -> 0 /
                    1 0 0 -> 0 \         1 0 0 -> 0 \
                    1 0 1 -> 0  \ and D, not(R)      1 0 1 -> 1  \ and not(D), R
                    1 1 0 -> 1  /        1 1 0 -> 0  /
                    1 1 1 -> 0 /         1 1 1 -> 0 / */
            if (idxRegSrc != UINT8_MAX)
            {
                if (fInvertCarry) /* sbb:  ~((a_uDst) ^ ~(a_uSrcOf)) ->  (a_uDst) ^ (a_uSrcOf); HACK ALERT: fInvertCarry == sbb */
                    pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxRegDstIn, idxRegSrc, false);
                else              /* adc:  ~((a_uDst) ^ (a_uSrcOf)) -> (a_uDst) ^ ~(a_uSrcOf) */
                    pCodeBuf[off++] = Armv8A64MkInstrEon(idxTmpReg, idxRegDstIn, idxRegSrc, false);
                pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg2, idxRegDstIn, idxRegResult, false); /* (a_uDst) ^ (a_uResult) */
                pCodeBuf[off++] = Armv8A64MkInstrAnd(idxTmpReg, idxTmpReg, idxTmpReg2, false /*f64Bit*/);
            }
            else if (uImmSrc & RT_BIT_32(cOpBits - 1))
            {
                /* Constant source with sign bit set; pick the AND per the tables above. */
                if (fInvertCarry) /* HACK ALERT: fInvertCarry == sbb */
                    pCodeBuf[off++] = Armv8A64MkInstrBic(idxTmpReg, idxRegResult, idxRegDstIn, false);
                else
                    pCodeBuf[off++] = Armv8A64MkInstrBic(idxTmpReg, idxRegDstIn, idxRegResult, false);
            }
            else
            {
                /* Constant source with sign bit clear. */
                if (fInvertCarry) /* HACK ALERT: fInvertCarry == sbb */
                    pCodeBuf[off++] = Armv8A64MkInstrBic(idxTmpReg, idxRegDstIn, idxRegResult, false);
                else
                    pCodeBuf[off++] = Armv8A64MkInstrBic(idxTmpReg, idxRegResult, idxRegDstIn, false);
            }
            pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, cOpBits - 1, false /*f64Bit*/);
            pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_OF_BIT, 1);
            iemNativeRegFreeTmp(pReNative, idxTmpReg2);
        }

        /* Calculate 8-bit parity of the result (xor-fold of the low byte down to one bit). */
        pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxRegResult, idxRegResult, false /*f64Bit*/,
                                             4 /*offShift6*/, kArmv8A64InstrShift_Lsr);
        pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxTmpReg, idxTmpReg, false /*f64Bit*/,
                                             2 /*offShift6*/, kArmv8A64InstrShift_Lsr);
        pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxTmpReg, idxTmpReg, false /*f64Bit*/,
                                             1 /*offShift6*/, kArmv8A64InstrShift_Lsr);
        Assert(Armv8A64ConvertImmRImmS2Mask32(0, 0) == 1);
        pCodeBuf[off++] = Armv8A64MkInstrEorImm(idxTmpReg, idxTmpReg, 0, 0, false /*f64Bit*/);
        pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_PF_BIT, 1, false /*f64Bit*/);

        /* Calculate auxiliary carry/borrow. This is related to 8-bit BCD.
           General formula: ((uint32_t)(a_uResult) ^ (uint32_t)(a_uSrc) ^ (uint32_t)(a_uDst)) & X86_EFL_AF;
               S D R
               0 0 0 -> 0;  \
               0 0 1 -> 1;   \  regular
               0 1 0 -> 1;   /    xor R, D
               0 1 1 -> 0;  /
               1 0 0 -> 1;  \
               1 0 1 -> 0;   \  invert one of the two
               1 1 0 -> 0;   /    xor not(R), D
               1 1 1 -> 1;  /
           a_uSrc[bit 4]=0: ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF;
           a_uSrc[bit 4]=1: ((uint32_t)~(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF;
         */
        if (idxRegSrc != UINT8_MAX)
        {
            pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxRegDstIn, idxRegSrc, false /*f64Bit*/);
            pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxTmpReg, idxRegResult, false /*f64Bit*/);
        }
        else if (uImmSrc & X86_EFL_AF)
            pCodeBuf[off++] = Armv8A64MkInstrEon(idxTmpReg, idxRegDstIn, idxRegResult, false /*f64Bit*/);
        else
            pCodeBuf[off++] = Armv8A64MkInstrEor(idxTmpReg, idxRegDstIn, idxRegResult, false /*f64Bit*/);
        pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxTmpReg, idxTmpReg, X86_EFL_AF_BIT, false /*f64Bit*/);
        pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxTmpReg, X86_EFL_AF_BIT, 1, false /*f64Bit*/);

        if (idxRegEflIn != idxRegEfl)
            iemNativeVarRegisterRelease(pReNative, idxVarEfl);
        iemNativeRegFreeTmp(pReNative, idxTmpReg);

#else
# error "port me"
#endif
        IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);

#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
        if (pReNative->fSkippingEFlags)
            Log5(("EFLAGS: fSkippingEFlags %#x -> 0 (iemNativeEmitEFlagsForArithmetic)\n", pReNative->fSkippingEFlags));
        pReNative->fSkippingEFlags = 0;
# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
        off = iemNativeEmitStoreImmToVCpuU32(pReNative, off, 0, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
# endif
#endif
    }
    return off;

}
850
851
852
853/*********************************************************************************************************************************
854* Bitwise Logical Operations *
855*********************************************************************************************************************************/
856
857/**
858 * The AND instruction will clear OF, CF and AF (latter is undefined) and
859 * set the other flags according to the result.
860 */
861DECL_INLINE_THROW(uint32_t)
862iemNativeEmit_and_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
863 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
864{
865 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
866 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
867#ifdef RT_ARCH_AMD64
868 /* On AMD64 we just use the correctly sized AND instruction harvest the EFLAGS. */
869 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
870 0x22, 0x23, cOpBits, idxRegDst, idxRegSrc);
871 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
872 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
873
874 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
875
876#elif defined(RT_ARCH_ARM64)
877 /* On ARM64 we use 32-bit AND for the 8-bit and 16-bit bit ones. */
878 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
879 pCodeBuf[off++] = Armv8A64MkInstrAnds(idxRegDst, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/);
880 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
881 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
882
883 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst, true /*fNativeFlags*/);
884#else
885# error "Port me"
886#endif
887 iemNativeVarRegisterRelease(pReNative, idxVarDst);
888 return off;
889}
890
891
892/**
893 * The AND instruction with immediate value as right operand.
894 */
895DECL_INLINE_THROW(uint32_t)
896iemNativeEmit_and_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
897 uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl, uint8_t cOpBits, uint8_t cImmBits)
898{
899 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
900#ifdef RT_ARCH_AMD64
901 /* On AMD64 we just use the correctly sized AND instruction harvest the EFLAGS. */
902 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
903 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, cOpBits, cImmBits, 4, idxRegDst, uImmOp);
904 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
905
906 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
907
908#elif defined(RT_ARCH_ARM64)
909 /* On ARM64 we use 32-bit AND for the 8-bit and 16-bit bit ones, and of
910 course the immediate variant when possible to save a register load. */
911 uint32_t uImmSizeLen, uImmRotations;
912 if ( cOpBits > 32
913 ? Armv8A64ConvertMask64ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations)
914 : Armv8A64ConvertMask32ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations))
915 {
916 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
917 if (cOpBits >= 32)
918 pCodeBuf[off++] = Armv8A64MkInstrAndsImm(idxRegDst, idxRegDst, uImmSizeLen, uImmRotations, cOpBits > 32 /*f64Bit*/);
919 else
920 pCodeBuf[off++] = Armv8A64MkInstrAndImm(idxRegDst, idxRegDst, uImmSizeLen, uImmRotations, cOpBits > 32 /*f64Bit*/);
921 }
922 else
923 {
924 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
925 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
926 if (cOpBits >= 32)
927 pCodeBuf[off++] = Armv8A64MkInstrAnds(idxRegDst, idxRegDst, idxRegTmpImm, cOpBits > 32 /*f64Bit*/);
928 else
929 pCodeBuf[off++] = Armv8A64MkInstrAnd(idxRegDst, idxRegDst, idxRegTmpImm, cOpBits > 32 /*f64Bit*/);
930 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
931 }
932 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
933
934 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst, cOpBits >= 32 /*fNativeFlags*/);
935 RT_NOREF_PV(cImmBits);
936
937#else
938# error "Port me"
939#endif
940 iemNativeVarRegisterRelease(pReNative, idxVarDst);
941 return off;
942}
943
944
945/**
946 * The TEST instruction will clear OF, CF and AF (latter is undefined) and
947 * set the other flags according to the result.
948 */
949DECL_INLINE_THROW(uint32_t)
950iemNativeEmit_test_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
951 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
952{
953 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
954 uint8_t const idxRegSrc = idxVarSrc == idxVarDst ? idxRegDst /* special case of 'test samereg,samereg' */
955 : iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
956#ifdef RT_ARCH_AMD64
957 /* On AMD64 we just use the correctly sized TEST instruction harvest the EFLAGS. */
958 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
959 0x84, 0x85, cOpBits, idxRegSrc, idxRegDst);
960 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
961
962#elif defined(RT_ARCH_ARM64)
963 /* On ARM64 we use 32-bit AND for the 8-bit and 16-bit bit ones. We also
964 need to keep the result in order to calculate the flags. */
965 uint8_t const idxRegResult = iemNativeRegAllocTmp(pReNative, &off);
966 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
967 if (cOpBits >= 32)
968 pCodeBuf[off++] = Armv8A64MkInstrAnds(idxRegResult, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/);
969 else
970 pCodeBuf[off++] = Armv8A64MkInstrAnd(idxRegResult, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/);
971 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
972
973#else
974# error "Port me"
975#endif
976 if (idxVarSrc != idxVarDst)
977 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
978 iemNativeVarRegisterRelease(pReNative, idxVarDst);
979
980#ifdef RT_ARCH_AMD64
981 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, UINT8_MAX);
982#else
983 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegResult, cOpBits >= 32 /*fNativeFlags*/);
984 iemNativeRegFreeTmp(pReNative, idxRegResult);
985#endif
986 return off;
987}
988
989
990/**
991 * The TEST instruction with immediate value as right operand.
992 */
993DECL_INLINE_THROW(uint32_t)
994iemNativeEmit_test_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
995 uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl, uint8_t cOpBits, uint8_t cImmBits)
996{
997 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
998#ifdef RT_ARCH_AMD64
999 /* On AMD64 we just use the correctly sized AND instruction harvest the EFLAGS. */
1000 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1001 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0xf6, 0xcc, 0xf7, cOpBits, cImmBits, 0, idxRegDst, uImmOp);
1002 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1003 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1004
1005 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, UINT8_MAX);
1006
1007#elif defined(RT_ARCH_ARM64)
1008 /* On ARM64 we use 32-bit AND for the 8-bit and 16-bit bit ones, and of
1009 course the immediate variant when possible to save a register load.
1010 We also need to keep the result in order to calculate the flags. */
1011 uint8_t const idxRegResult = iemNativeRegAllocTmp(pReNative, &off);
1012 uint32_t uImmSizeLen, uImmRotations;
1013 if ( cOpBits > 32
1014 ? Armv8A64ConvertMask64ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations)
1015 : Armv8A64ConvertMask32ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations))
1016 {
1017 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1018 if (cOpBits >= 32)
1019 pCodeBuf[off++] = Armv8A64MkInstrAndsImm(idxRegResult, idxRegDst, uImmSizeLen, uImmRotations, cOpBits > 32 /*f64Bit*/);
1020 else
1021 pCodeBuf[off++] = Armv8A64MkInstrAndImm(idxRegResult, idxRegDst, uImmSizeLen, uImmRotations, cOpBits > 32 /*f64Bit*/);
1022 }
1023 else
1024 {
1025 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1026 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1027 if (cOpBits >= 32)
1028 pCodeBuf[off++] = Armv8A64MkInstrAnds(idxRegResult, idxRegDst, idxRegTmpImm, cOpBits > 32 /*f64Bit*/);
1029 else
1030 pCodeBuf[off++] = Armv8A64MkInstrAnd(idxRegResult, idxRegDst, idxRegTmpImm, cOpBits > 32 /*f64Bit*/);
1031 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1032 }
1033 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1034 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1035
1036 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegResult, cOpBits >= 32 /*fNativeFlags*/);
1037
1038 iemNativeRegFreeTmp(pReNative, idxRegResult);
1039 RT_NOREF_PV(cImmBits);
1040
1041#else
1042# error "Port me"
1043#endif
1044 return off;
1045}
1046
1047
1048/**
1049 * The OR instruction will clear OF, CF and AF (latter is undefined) and
1050 * set the other flags according to the result.
1051 */
1052DECL_INLINE_THROW(uint32_t)
1053iemNativeEmit_or_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1054 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1055{
1056 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1057 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1058#ifdef RT_ARCH_AMD64
1059 /* On AMD64 we just use the correctly sized OR instruction harvest the EFLAGS. */
1060 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1061 0x0a, 0x0b, cOpBits, idxRegDst, idxRegSrc);
1062 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1063 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1064
1065 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
1066
1067#elif defined(RT_ARCH_ARM64)
1068 /* On ARM64 we use 32-bit OR for the 8-bit and 16-bit bit ones. */
1069 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1070 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDst, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/);
1071 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1072 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1073
1074 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
1075
1076#else
1077# error "Port me"
1078#endif
1079 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1080 return off;
1081}
1082
1083
1084/**
1085 * The OR instruction with immediate value as right operand.
1086 */
1087DECL_INLINE_THROW(uint32_t)
1088iemNativeEmit_or_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1089 uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl, uint8_t cOpBits, uint8_t cImmBits)
1090{
1091 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1092#ifdef RT_ARCH_AMD64
1093 /* On AMD64 we just use the correctly sized OR instruction harvest the EFLAGS. */
1094 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1095 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, cOpBits, cImmBits, 1, idxRegDst, uImmOp);
1096 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1097
1098 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
1099
1100#elif defined(RT_ARCH_ARM64)
1101 /* On ARM64 we use 32-bit OR for the 8-bit and 16-bit bit ones, and of
1102 course the immediate variant when possible to save a register load. */
1103 uint32_t uImmSizeLen, uImmRotations;
1104 if ( cOpBits > 32
1105 ? Armv8A64ConvertMask64ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations)
1106 : Armv8A64ConvertMask32ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations))
1107 {
1108 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1109 pCodeBuf[off++] = Armv8A64MkInstrOrrImm(idxRegDst, idxRegDst, uImmSizeLen, uImmRotations, cOpBits > 32 /*f64Bit*/);
1110 }
1111 else
1112 {
1113 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1114 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1115 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDst, idxRegDst, idxRegTmpImm, cOpBits > 32 /*f64Bit*/);
1116 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1117 }
1118 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1119
1120 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
1121 RT_NOREF_PV(cImmBits);
1122
1123#else
1124# error "Port me"
1125#endif
1126 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1127 return off;
1128}
1129
1130
1131/**
1132 * The XOR instruction will clear OF, CF and AF (latter is undefined) and
1133 * set the other flags according to the result.
1134 */
1135DECL_INLINE_THROW(uint32_t)
1136iemNativeEmit_xor_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1137 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1138{
1139 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1140 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1141#ifdef RT_ARCH_AMD64
1142 /* On AMD64 we just use the correctly sized OR instruction harvest the EFLAGS. */
1143 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1144 0x32, 0x33, cOpBits, idxRegDst, idxRegSrc);
1145 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1146 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1147
1148 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
1149
1150#elif defined(RT_ARCH_ARM64)
1151 /* On ARM64 we use 32-bit OR for the 8-bit and 16-bit bit ones. */
1152 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1153 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegDst, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/);
1154 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1155 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1156
1157 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
1158
1159#else
1160# error "Port me"
1161#endif
1162 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1163 return off;
1164}
1165
1166
1167/**
1168 * The XOR instruction with immediate value as right operand.
1169 */
1170DECL_INLINE_THROW(uint32_t)
1171iemNativeEmit_xor_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1172 uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl, uint8_t cOpBits, uint8_t cImmBits)
1173{
1174 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1175#ifdef RT_ARCH_AMD64
1176 /* On AMD64 we just use the correctly sized XOR instruction harvest the EFLAGS. */
1177 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1178 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, cOpBits, cImmBits, 6, idxRegDst, uImmOp);
1179 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1180
1181 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
1182
1183#elif defined(RT_ARCH_ARM64)
1184 /* On ARM64 we use 32-bit OR for the 8-bit and 16-bit bit ones, and of
1185 course the immediate variant when possible to save a register load. */
1186 uint32_t uImmSizeLen, uImmRotations;
1187 if ( cOpBits > 32
1188 ? Armv8A64ConvertMask64ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations)
1189 : Armv8A64ConvertMask32ToImmRImmS(uImmOp, &uImmSizeLen, &uImmRotations))
1190 {
1191 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1192 pCodeBuf[off++] = Armv8A64MkInstrEorImm(idxRegDst, idxRegDst, uImmSizeLen, uImmRotations, cOpBits > 32 /*f64Bit*/);
1193 }
1194 else
1195 {
1196 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1197 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1198 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegDst, idxRegDst, idxRegTmpImm, cOpBits > 32 /*f64Bit*/);
1199 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1200 }
1201 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1202
1203 off = iemNativeEmitEFlagsForLogical(pReNative, off, idxVarEfl, cOpBits, idxRegDst);
1204 RT_NOREF_PV(cImmBits);
1205
1206#else
1207# error "Port me"
1208#endif
1209 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1210 return off;
1211}
1212
1213
1214
1215/*********************************************************************************************************************************
1216* ADD, ADC, SUB, SBB, CMP *
1217*********************************************************************************************************************************/
1218
1219/**
1220 * The ADD instruction will set all status flags.
1221 */
1222DECL_INLINE_THROW(uint32_t)
1223iemNativeEmit_add_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1224 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1225{
1226 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1227 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1228
1229#ifdef RT_ARCH_AMD64
1230 /* On AMD64 we just use the correctly sized ADD instruction to get the right EFLAGS.SF value. */
1231 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1232 0x02, 0x03, cOpBits, idxRegDst, idxRegSrc);
1233 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1234
1235 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1236 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1237
1238 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1239
1240#elif defined(RT_ARCH_ARM64)
1241 /* On ARM64 we'll need the two input operands as well as the result in order
1242 to calculate the right flags, even if we use ADDS and translates NZCV into
1243 OF, CF, ZF and SF. */
1244 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1245 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4);
1246 if (cOpBits >= 32)
1247 {
1248 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1249 pCodeBuf[off++] = Armv8A64MkInstrAddReg(idxRegDst, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1250 }
1251 else
1252 {
1253 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1254 uint32_t const cShift = 32 - cOpBits;
1255 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDstIn, ARMV8_A64_REG_XZR, idxRegDst, false /*f64Bit*/, cShift);
1256 pCodeBuf[off++] = Armv8A64MkInstrAddReg(idxRegDst, idxRegDstIn, idxRegSrc, false /*f64Bit*/,
1257 true /*fSetFlags*/, cShift);
1258 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDstIn, idxRegDstIn, cShift, false /*f64Bit*/);
1259 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDst, idxRegDst, cShift, false /*f64Bit*/);
1260 cOpBits = 32;
1261 }
1262 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1263
1264 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, cOpBits, idxRegDst,
1265 idxRegDstIn, idxRegSrc, false /*fInvertCarry*/, 0);
1266
1267 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1268 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1269 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1270
1271#else
1272# error "port me"
1273#endif
1274 return off;
1275}
1276
1277
1278/**
1279 * The ADD instruction with immediate value as right operand.
1280 */
1281DECL_INLINE_THROW(uint32_t)
1282iemNativeEmit_add_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1283 uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl, uint8_t cOpBits, uint8_t cImmBits)
1284{
1285 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1286
1287#ifdef RT_ARCH_AMD64
1288 /* On AMD64 we just use the correctly sized ADD instruction to get the right EFLAGS.SF value. */
1289 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1290 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, cOpBits, cImmBits, 0, idxRegDst, uImmOp);
1291 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1292
1293 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1294
1295 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1296
1297#elif defined(RT_ARCH_ARM64)
1298 /* On ARM64 we'll need the two input operands as well as the result in order
1299 to calculate the right flags, even if we use ADDS and translates NZCV into
1300 OF, CF, ZF and SF. */
1301 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1302 PIEMNATIVEINSTR pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1303 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1304 if (cOpBits >= 32)
1305 {
1306 if (uImmOp <= 0xfffU)
1307 pCodeBuf[off++] = Armv8A64MkInstrAddUImm12(idxRegDst, idxRegDst, uImmOp, cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1308 else if (uImmOp <= 0xfff000U && !(uImmOp & 0xfff))
1309 pCodeBuf[off++] = Armv8A64MkInstrAddUImm12(idxRegDst, idxRegDst, uImmOp >> 12, cOpBits > 32 /*f64Bit*/,
1310 true /*fSetFlags*/, true /*fShift12*/);
1311 else
1312 {
1313 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1314 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1315 pCodeBuf[off++] = Armv8A64MkInstrAddReg(idxRegDst, idxRegDst, idxRegTmpImm, cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1316 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1317 }
1318 }
1319 else
1320 {
1321 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1322 uint32_t const cShift = 32 - cOpBits;
1323 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp << cShift);
1324 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2);
1325 pCodeBuf[off++] = Armv8A64MkInstrAddReg(idxRegDst, idxRegTmpImm, idxRegDstIn, false /*f64Bit*/, true /*fSetFlags*/, cShift);
1326 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDst, idxRegDst, cShift, false /*f64Bit*/);
1327 cOpBits = 32;
1328 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1329 }
1330 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1331
1332 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, cOpBits, idxRegDst,
1333 idxRegDstIn, UINT8_MAX, false /*fInvertCarry*/, uImmOp);
1334
1335 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1336 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1337 RT_NOREF(cImmBits);
1338
1339#else
1340# error "port me"
1341#endif
1342 return off;
1343}
1344
1345
1346/**
1347 * The ADC instruction takes CF as input and will set all status flags.
1348 */
1349DECL_INLINE_THROW(uint32_t)
1350iemNativeEmit_adc_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1351 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1352{
1353 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1354 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1355 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
1356
1357#ifdef RT_ARCH_AMD64
1358 /* On AMD64 we use BT to set EFLAGS.CF and then issue an ADC instruction
1359 with matching size to get the correct flags. */
1360 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 9);
1361
1362 /* Use the BT instruction to set CF according to idxRegEfl. */
1363 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b, 0xba, 32 /*cOpBits*/, 4, idxRegEfl);
1364 pCodeBuf[off++] = X86_EFL_CF_BIT;
1365
1366 off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0x12, 0x13, cOpBits, idxRegDst, idxRegSrc);
1367 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1368
1369 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1370 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1371
1372 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl);
1373
1374#elif defined(RT_ARCH_ARM64)
1375 /* On ARM64 we use the RMIF instruction to load PSTATE.CF from idxRegEfl and
1376 then ADCS for the calculation. We need all inputs and result for the two
1377 flags (AF,PF) that can't be directly derived from PSTATE.NZCV. */
1378 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1379 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 7);
1380
1381 pCodeBuf[off++] = Armv8A64MkInstrRmif(idxRegEfl, (X86_EFL_CF_BIT - 1) & 63, RT_BIT_32(1) /*fMask=C*/);
1382 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1383 if (cOpBits >= 32)
1384 pCodeBuf[off++] = Armv8A64MkInstrAdcs(idxRegDst, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/);
1385 else
1386 {
1387 /* Since we're also adding in the carry flag here, shifting operands up
1388 doesn't work. So, we have to calculate carry & overflow manually. */
1389 pCodeBuf[off++] = Armv8A64MkInstrAdc(idxRegDst, idxRegDst, idxRegSrc, false /*f64Bit*/);
1390 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegDst, cOpBits > 8); /* NZ are okay, CV aren't.*/
1391 }
1392 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1393
1394 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl, cOpBits, idxRegDst,
1395 idxRegDstIn, idxRegSrc, false /*fInvertCarry*/, 0);
1396
1397 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1398 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1399 if (cOpBits < 32)
1400 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegDst, RT_BIT_32(cOpBits) - 1U);
1401 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1402
1403#else
1404# error "port me"
1405#endif
1406 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
1407 return off;
1408}
1409
1410
1411/**
1412 * The ADC instruction with immediate value as right operand.
1413 */
1414DECL_INLINE_THROW(uint32_t)
1415iemNativeEmit_adc_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1416 uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl, uint8_t cOpBits, uint8_t cImmBits)
1417{
1418 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1419 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
1420
1421#ifdef RT_ARCH_AMD64
1422 /* On AMD64 we use BT to set EFLAGS.CF and then issue an ADC instruction
1423 with matching size to get the correct flags. */
1424 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 12);
1425
1426 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b, 0xba, 32 /*cOpBits*/, 4, idxRegEfl);
1427 pCodeBuf[off++] = X86_EFL_CF_BIT;
1428
1429 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, cOpBits, cImmBits, 2, idxRegDst, uImmOp);
1430 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1431
1432 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1433
1434 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl);
1435
1436#elif defined(RT_ARCH_ARM64)
1437 /* On ARM64 we use the RMIF instructions to load PSTATE.CF from idxRegEfl
1438 and then ADCS for the calculation. We need all inputs and result for
1439 the two flags (AF,PF) that can't be directly derived from PSTATE.NZCV. */
1440 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1441 uint8_t const idxRegImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1442 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4);
1443
1444 pCodeBuf[off++] = Armv8A64MkInstrRmif(idxRegEfl, (X86_EFL_CF_BIT - 1) & 63, RT_BIT_32(1) /*fMask=C*/);
1445 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1446 if (cOpBits >= 32)
1447 pCodeBuf[off++] = Armv8A64MkInstrAdcs(idxRegDst, idxRegDst, idxRegImm, cOpBits > 32 /*f64Bit*/);
1448 else
1449 {
1450 /* Since we're also adding in the carry flag here, shifting operands up
1451 doesn't work. So, we have to calculate carry & overflow manually. */
1452 pCodeBuf[off++] = Armv8A64MkInstrAdc(idxRegDst, idxRegDst, idxRegImm, false /*f64Bit*/);
1453 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegDst, cOpBits > 8); /* NZ are okay, CV aren't.*/
1454 }
1455 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1456
1457 iemNativeRegFreeTmp(pReNative, idxRegImm);
1458
1459 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl, cOpBits, idxRegDst,
1460 idxRegDstIn, UINT8_MAX, false /*fInvertCarry*/, uImmOp);
1461
1462 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1463 if (cOpBits < 32)
1464 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegDst, RT_BIT_32(cOpBits) - 1U);
1465 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1466 RT_NOREF(cImmBits);
1467
1468#else
1469# error "port me"
1470#endif
1471 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
1472 return off;
1473}
1474
1475
1476/**
1477 * The SUB instruction will set all status flags.
1478 */
1479DECL_INLINE_THROW(uint32_t)
1480iemNativeEmit_sub_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1481 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1482{
1483 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1484 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1485
1486#ifdef RT_ARCH_AMD64
1487 /* On AMD64 we just use the correctly sized SUB instruction to get the right EFLAGS.SF value. */
1488 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1489 0x2a, 0x2b, cOpBits, idxRegDst, idxRegSrc);
1490 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1491
1492 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1493 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1494
1495 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1496
1497#elif defined(RT_ARCH_ARM64)
1498 /* On ARM64 we'll need the two input operands as well as the result in order
1499 to calculate the right flags, even if we use SUBS and translates NZCV into
1500 OF, CF, ZF and SF. */
1501 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1502 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4);
1503 if (cOpBits >= 32)
1504 {
1505 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1506 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegDst, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1507 }
1508 else
1509 {
1510 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1511 uint32_t const cShift = 32 - cOpBits;
1512 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDstIn, ARMV8_A64_REG_XZR, idxRegDst, false /*f64Bit*/, cShift);
1513 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegDst, idxRegDstIn, idxRegSrc, false /*f64Bit*/,
1514 true /*fSetFlags*/, cShift);
1515 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDstIn, idxRegDstIn, cShift, false /*f64Bit*/);
1516 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDst, idxRegDst, cShift, false /*f64Bit*/);
1517 cOpBits = 32;
1518 }
1519 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1520
1521 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, cOpBits, idxRegDst,
1522 idxRegDstIn, idxRegSrc, true /*fInvertCarry*/, 0);
1523
1524 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1525 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1526 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1527
1528#else
1529# error "port me"
1530#endif
1531 return off;
1532}
1533
1534
1535/**
1536 * The SUB instruction with immediate value as right operand.
1537 */
1538DECL_INLINE_THROW(uint32_t)
1539iemNativeEmit_sub_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1540 uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl, uint8_t cOpBits, uint8_t cImmBits)
1541{
1542 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1543
1544#ifdef RT_ARCH_AMD64
1545 /* On AMD64 we just use the correctly sized SUB instruction to get the right EFLAGS.SF value. */
1546 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1547 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, cOpBits, cImmBits, 5, idxRegDst, uImmOp);
1548 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1549
1550 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1551
1552 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1553
1554#elif defined(RT_ARCH_ARM64)
1555 /* On ARM64 we'll need the two input operands as well as the result in order
1556 to calculate the right flags, even if we use SUBS and translates NZCV into
1557 OF, CF, ZF and SF. */
1558 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1559 PIEMNATIVEINSTR pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1560 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1561 if (cOpBits >= 32)
1562 {
1563 if (uImmOp <= 0xfffU)
1564 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegDst, idxRegDst, uImmOp, cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1565 else if (uImmOp <= 0xfff000U && !(uImmOp & 0xfff))
1566 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegDst, idxRegDst, uImmOp >> 12, cOpBits > 32 /*f64Bit*/,
1567 true /*fSetFlags*/, true /*fShift12*/);
1568 else
1569 {
1570 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1571 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1572 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegDst, idxRegDst, idxRegTmpImm, cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1573 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1574 }
1575 }
1576 else
1577 {
1578 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1579 uint32_t const cShift = 32 - cOpBits;
1580 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1581 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4);
1582 pCodeBuf[off++] = Armv8A64MkInstrLslImm(idxRegDstIn, idxRegDstIn, cShift, false /*f64Bit*/);
1583 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegDst, idxRegDstIn, idxRegTmpImm, false /*f64Bit*/, true /*fSetFlags*/, cShift);
1584 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDstIn, idxRegDstIn, cShift, false /*f64Bit*/);
1585 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegDst, idxRegDst, cShift, false /*f64Bit*/);
1586 cOpBits = 32;
1587 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1588 }
1589 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1590
1591 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, cOpBits, idxRegDst,
1592 idxRegDstIn, UINT8_MAX, true /*fInvertCarry*/, uImmOp);
1593
1594 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1595 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1596 RT_NOREF(cImmBits);
1597
1598#else
1599# error "port me"
1600#endif
1601 return off;
1602}
1603
1604
1605/**
1606 * The CMP instruction will set all status flags, but modifies no registers.
1607 */
1608DECL_INLINE_THROW(uint32_t)
1609iemNativeEmit_cmp_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1610 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1611{
1612 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1613 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1614
1615#ifdef RT_ARCH_AMD64
1616 /* On AMD64 we just use the correctly sized CMP instruction to get the right EFLAGS.SF value. */
1617 off = iemNativeEmitAmd64OneByteModRmInstrRREx(iemNativeInstrBufEnsure(pReNative, off, 4), off,
1618 0x3a, 0x3b, cOpBits, idxRegDst, idxRegSrc);
1619 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1620
1621 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1622 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1623
1624 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1625
1626#elif defined(RT_ARCH_ARM64)
1627 /* On ARM64 we'll need the actual result as well as both input operands in order
1628 to calculate the right flags, even if we use SUBS and translates NZCV into
1629 OF, CF, ZF and SF. */
1630 uint8_t const idxRegResult = iemNativeRegAllocTmp(pReNative, &off);
1631 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 3);
1632 if (cOpBits >= 32)
1633 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegResult, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1634 else
1635 {
1636 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1637 uint32_t const cShift = 32 - cOpBits;
1638 pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegResult, ARMV8_A64_REG_XZR, idxRegDst, false /*f64Bit*/, cShift);
1639 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegResult, idxRegResult, idxRegSrc, false /*f64Bit*/,
1640 true /*fSetFlags*/, cShift);
1641 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegResult, idxRegResult, cShift, false /*f64Bit*/);
1642 cOpBits = 32;
1643 }
1644 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1645
1646 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, cOpBits, idxRegResult,
1647 idxRegDst, idxRegSrc, true /*fInvertCarry*/, 0);
1648
1649 iemNativeRegFreeTmp(pReNative, idxRegResult);
1650 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1651 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1652
1653#else
1654# error "port me"
1655#endif
1656 return off;
1657}
1658
1659
1660/**
1661 * The CMP instruction with immediate value as right operand.
1662 */
1663DECL_INLINE_THROW(uint32_t)
1664iemNativeEmit_cmp_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1665 uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl, uint8_t cOpBits, uint8_t cImmBits)
1666{
1667 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1668
1669#ifdef RT_ARCH_AMD64
1670 /* On AMD64 we just use the correctly sized CMP instruction to get the right EFLAGS.SF value. */
1671 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1672 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, cOpBits, cImmBits, 7, idxRegDst, uImmOp);
1673 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1674
1675 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1676
1677 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX);
1678
1679#elif defined(RT_ARCH_ARM64)
1680 /* On ARM64 we'll need the actual result as well as both input operands in order
1681 to calculate the right flags, even if we use SUBS and translates NZCV into
1682 OF, CF, ZF and SF. */
1683 uint8_t const idxRegResult = iemNativeRegAllocTmp(pReNative, &off);
1684 PIEMNATIVEINSTR pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);
1685 if (cOpBits >= 32)
1686 {
1687 if (uImmOp <= 0xfffU)
1688 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegResult, idxRegDst, uImmOp, cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1689 else if (uImmOp <= 0xfff000U && !(uImmOp & 0xfff))
1690 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegResult, idxRegDst, uImmOp >> 12, cOpBits > 32 /*f64Bit*/,
1691 true /*fSetFlags*/, true /*fShift12*/);
1692 else
1693 {
1694 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1695 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1696 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegResult, idxRegDst, idxRegTmpImm, cOpBits > 32 /*f64Bit*/, true /*fSetFlags*/);
1697 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1698 }
1699 }
1700 else
1701 {
1702 /* Shift the operands up so we can perform a 32-bit operation and get all four flags. */
1703 uint32_t const cShift = 32 - cOpBits;
1704 uint8_t const idxRegTmpImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1705 pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 3);
1706 pCodeBuf[off++] = Armv8A64MkInstrLslImm(idxRegResult, idxRegDst, cShift, false /*f64Bit*/);
1707 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegResult, idxRegResult, idxRegTmpImm, false /*f64Bit*/, true /*fSetFlags*/, cShift);
1708 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegResult, idxRegResult, cShift, false /*f64Bit*/);
1709 cOpBits = 32;
1710 iemNativeRegFreeTmpImm(pReNative, idxRegTmpImm);
1711 }
1712 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1713
1714 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, idxVarEfl, UINT8_MAX, cOpBits, idxRegResult,
1715 idxRegDst, UINT8_MAX, true /*fInvertCarry*/, uImmOp);
1716
1717 iemNativeRegFreeTmp(pReNative, idxRegResult);
1718 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1719 RT_NOREF(cImmBits);
1720
1721#else
1722# error "port me"
1723#endif
1724 return off;
1725}
1726
1727
1728/**
1729 * The SBB instruction takes CF as input and will set all status flags.
1730 */
1731DECL_INLINE_THROW(uint32_t)
1732iemNativeEmit_sbb_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1733 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1734{
1735 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1736 uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);
1737 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
1738
1739#ifdef RT_ARCH_AMD64
1740 /* On AMD64 we use BT to set EFLAGS.CF and then issue an SBB instruction
1741 with matching size to get the correct flags. */
1742 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 9);
1743
1744 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b, 0xba, 32 /*cOpBits*/, 4, idxRegEfl);
1745 pCodeBuf[off++] = X86_EFL_CF_BIT;
1746
1747 off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0x1a, 0x1b, cOpBits, idxRegDst, idxRegSrc);
1748 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1749
1750 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1751 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1752
1753 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl);
1754
1755#elif defined(RT_ARCH_ARM64)
1756 /* On ARM64 we use the RMIF+CFINV instructions to load PSTATE.CF from
1757 idxRegEfl and then SBCS for the calculation. We need all inputs and
1758 result for the two flags (AF,PF) that can't be directly derived from
1759 PSTATE.NZCV. */
1760 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1761 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
1762
1763 pCodeBuf[off++] = Armv8A64MkInstrRmif(idxRegEfl, (X86_EFL_CF_BIT - 1) & 63, RT_BIT_32(1) /*fMask=C*/);
1764 pCodeBuf[off++] = ARMV8_A64_INSTR_CFINV;
1765 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1766 if (cOpBits >= 32)
1767 pCodeBuf[off++] = Armv8A64MkInstrSbcs(idxRegDst, idxRegDst, idxRegSrc, cOpBits > 32 /*f64Bit*/);
1768 else
1769 {
1770 /* Since we're also adding in the carry flag here, shifting operands up
1771 doesn't work. So, we have to calculate carry & overflow manually. */
1772 pCodeBuf[off++] = Armv8A64MkInstrSbc(idxRegDst, idxRegDst, idxRegSrc, false /*f64Bit*/);
1773 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegDst, cOpBits > 8); /* NZ are okay, CV aren't.*/
1774 }
1775 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1776
1777 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl, cOpBits, idxRegDst,
1778 idxRegDstIn, idxRegSrc, true /*fInvertCarry*/, 0);
1779
1780 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1781 iemNativeVarRegisterRelease(pReNative, idxVarSrc);
1782 if (cOpBits < 32)
1783 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegDst, RT_BIT_32(cOpBits) - 1U);
1784 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1785
1786#else
1787# error "port me"
1788#endif
1789 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
1790 return off;
1791}
1792
1793
1794/**
1795 * The SBB instruction with immediate value as right operand.
1796 */
1797DECL_INLINE_THROW(uint32_t)
1798iemNativeEmit_sbb_r_i_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1799 uint8_t idxVarDst, uint64_t uImmOp, uint8_t idxVarEfl, uint8_t cOpBits, uint8_t cImmBits)
1800{
1801 uint8_t const idxRegDst = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
1802 uint8_t const idxRegEfl = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);
1803
1804#ifdef RT_ARCH_AMD64
1805 /* On AMD64 we use BT to set EFLAGS.CF and then issue an SBB instruction
1806 with matching size to get the correct flags. */
1807 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 12);
1808
1809 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b, 0xba, 32 /*cOpBits*/, 4, idxRegEfl);
1810 pCodeBuf[off++] = X86_EFL_CF_BIT;
1811
1812 off = iemNativeEmitAmd64OneByteModRmInstrRIEx(pCodeBuf, off, 0x80, 0x83, 0x81, cOpBits, cImmBits, 3, idxRegDst, uImmOp);
1813 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1814
1815 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1816
1817 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl);
1818
1819#elif defined(RT_ARCH_ARM64)
1820 /* On ARM64 we use the RMIF+CFINV instructions to load PSTATE.CF from
1821 idxRegEfl and then SBCS for the calculation. We need all inputs and
1822 result for the two flags (AF,PF) that can't be directly derived from
1823 PSTATE.NZCV. */
1824 uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
1825 uint8_t const idxRegImm = iemNativeRegAllocTmpImm(pReNative, &off, uImmOp);
1826 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
1827
1828 pCodeBuf[off++] = Armv8A64MkInstrRmif(idxRegEfl, (X86_EFL_CF_BIT - 1) & 63, RT_BIT_32(1) /*fMask=C*/);
1829 pCodeBuf[off++] = ARMV8_A64_INSTR_CFINV;
1830 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
1831 if (cOpBits >= 32)
1832 pCodeBuf[off++] = Armv8A64MkInstrSbcs(idxRegDst, idxRegDst, idxRegImm, cOpBits > 32 /*f64Bit*/);
1833 else
1834 {
1835 /* Since we're also adding in the carry flag here, shifting operands up
1836 doesn't work. So, we have to calculate carry & overflow manually. */
1837 pCodeBuf[off++] = Armv8A64MkInstrSbc(idxRegDst, idxRegDst, idxRegImm, false /*f64Bit*/);
1838 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegDst, cOpBits > 8); /* NZ are okay, CV aren't.*/
1839 }
1840 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
1841
1842 iemNativeRegFreeTmp(pReNative, idxRegImm);
1843
1844 off = iemNativeEmitEFlagsForArithmetic(pReNative, off, UINT8_MAX, idxRegEfl, cOpBits, idxRegDst,
1845 idxRegDstIn, UINT8_MAX, true /*fInvertCarry*/, uImmOp);
1846
1847 iemNativeRegFreeTmp(pReNative, idxRegDstIn);
1848 if (cOpBits < 32)
1849 off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegDst, RT_BIT_32(cOpBits) - 1U);
1850 iemNativeVarRegisterRelease(pReNative, idxVarDst);
1851 RT_NOREF(cImmBits);
1852
1853#else
1854# error "port me"
1855#endif
1856 iemNativeVarRegisterRelease(pReNative, idxVarEfl);
1857 return off;
1858}
1859
1860
1861DECL_INLINE_THROW(uint32_t)
1862iemNativeEmit_imul_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1863 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1864{
1865 RT_NOREF(idxVarDst, idxVarSrc, idxVarEfl, cOpBits);
1866 AssertFailed();
1867 return iemNativeEmitBrk(pReNative, off, 0x666);
1868}
1869
1870
1871DECL_INLINE_THROW(uint32_t)
1872iemNativeEmit_popcnt_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1873 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1874{
1875 RT_NOREF(idxVarDst, idxVarSrc, idxVarEfl, cOpBits);
1876 AssertFailed();
1877 return iemNativeEmitBrk(pReNative, off, 0x666);
1878}
1879
1880
1881DECL_INLINE_THROW(uint32_t)
1882iemNativeEmit_tzcnt_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1883 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1884{
1885 RT_NOREF(idxVarDst, idxVarSrc, idxVarEfl, cOpBits);
1886 AssertFailed();
1887 return iemNativeEmitBrk(pReNative, off, 0x666);
1888}
1889
1890
1891DECL_INLINE_THROW(uint32_t)
1892iemNativeEmit_lzcnt_r_r_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1893 uint8_t idxVarDst, uint8_t idxVarSrc, uint8_t idxVarEfl, uint8_t cOpBits)
1894{
1895 RT_NOREF(idxVarDst, idxVarSrc, idxVarEfl, cOpBits);
1896 AssertFailed();
1897 return iemNativeEmitBrk(pReNative, off, 0x666);
1898}
1899
1900
1901
1902/*********************************************************************************************************************************
1903* Shifting and Rotating. *
1904*********************************************************************************************************************************/
1905
1906
/** Shift kind selector for iemNativeEmitEFlagsForShift().
 * @note The enumerator order is relied upon implicitly (implicit values 0..2);
 *       do not reorder. */
typedef enum
{
    kIemNativeEmitEFlagsForShiftType_Left,        /**< SHL - shift left. */
    kIemNativeEmitEFlagsForShiftType_Right,       /**< SHR - logical shift right. */
    kIemNativeEmitEFlagsForShiftType_SignedRight  /**< SAR - arithmetic shift right. */
} IEMNATIVEEMITEFLAGSFORSHIFTTYPE;
1913
1914/**
1915 * This is used by SHL, SHR and SAR emulation.
1916 *
1917 * It takes liveness stuff into account.
1918 */
1919DECL_INLINE_THROW(uint32_t)
1920iemNativeEmitEFlagsForShift(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxRegEfl, uint8_t idxRegResult,
1921 uint8_t idxRegSrc, uint8_t idxRegCount, uint8_t cOpBits, IEMNATIVEEMITEFLAGSFORSHIFTTYPE enmType,
1922 uint8_t idxRegTmp)
1923{
1924 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflTotalShift);
1925
1926RT_NOREF(pReNative, off, idxRegEfl, idxRegResult, idxRegSrc, idxRegCount, cOpBits, enmType);
1927#if 0 //def IEMNATIVE_WITH_EFLAGS_SKIPPING
1928 /*
1929 * See if we can skip this wholesale.
1930 */
1931 PCIEMLIVENESSENTRY const pLivenessEntry = &pReNative->paLivenessEntries[pReNative->idxCurCall];
1932 if ( IEMLIVENESS_STATE_ARE_STATUS_EFL_TO_BE_CLOBBERED(pLivenessEntry)
1933 && !(pReNative->fMc & IEM_MC_F_WITH_FLAGS))
1934 {
1935 STAM_COUNTER_INC(&pReNative->pVCpu->iem.s.StatNativeEflSkippedShift);
1936 pReNative->fSkippingEFlags |= X86_EFL_STATUS_BITS;
1937# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
1938 off = iemNativeEmitOrImmIntoVCpuU32(pReNative, off, X86_EFL_STATUS_BITS, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
1939# endif
1940 }
1941 else
1942#endif
1943 {
1944 /*
1945 * The difference between Intel and AMD flags for SHL are:
1946 * - Intel always clears AF while AMD always sets it.
1947 * - Intel sets OF for the first shift, while AMD for the last shift.
1948 *
1949 */
1950
1951#ifdef RT_ARCH_AMD64
1952 /*
1953 * We capture flags and does the additional OF and AF calculations as needed.
1954 */
1955 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 64);
1956 /** @todo kIemNativeEmitEFlagsForShiftType_SignedRight: we could alternatively
1957 * use LAHF here when host rax is free since, OF is cleared. */
1958 /* pushf */
1959 pCodeBuf[off++] = 0x9c;
1960 /* pop tmp */
1961 if (idxRegTmp >= 8)
1962 pCodeBuf[off++] = X86_OP_REX_B;
1963 pCodeBuf[off++] = 0x58 + (idxRegTmp & 7);
1964 /* Clear the status bits in EFLs. */
1965 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegEfl, ~X86_EFL_STATUS_BITS);
1966 uint8_t const idxTargetCpuEflFlavour = pReNative->pVCpu->iem.s.aidxTargetCpuEflFlavour[1];
1967 if (idxTargetCpuEflFlavour == IEMTARGETCPU_EFL_BEHAVIOR_NATIVE)
1968 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegTmp, X86_EFL_STATUS_BITS);
1969 else
1970 {
1971 /* and tmp, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_CF */
1972 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegTmp, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_CF);
1973 if (idxTargetCpuEflFlavour == IEMTARGETCPU_EFL_BEHAVIOR_AMD)
1974 off = iemNativeEmitOrGpr32ByImmEx(pCodeBuf, off, idxRegTmp, X86_EFL_AF);
1975 /* OR in the flags we collected. */
1976 off = iemNativeEmitOrGpr32ByGprEx(pCodeBuf, off, idxRegEfl, idxRegTmp);
1977
1978 /* Calculate OF */
1979 if (idxTargetCpuEflFlavour == IEMTARGETCPU_EFL_BEHAVIOR_AMD)
1980 {
1981 /* AMD last bit shifted: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
1982 /* bt idxRegResult, (cOpBits - 1) => CF=result-sign-bit */
1983 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x0b /*ud2*/, 0xba,
1984 RT_MAX(cOpBits, 16), 4, idxRegResult);
1985 pCodeBuf[off++] = cOpBits - 1;
1986 /* setc idxRegTmp */
1987 off = iemNativeEmitAmd64TwoByteModRmInstrRREx(pCodeBuf, off, 0x0f, 0x92, 0x0b /*ud2*/, 8, 0, idxRegTmp);
1988 /* xor idxRegTmp, idxRegEfl */
1989 off = iemNativeEmitXorGpr32ByGpr32Ex(pCodeBuf, off, idxRegTmp, idxRegEfl);
1990 /* and idxRegTmp, 1 */
1991 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegTmp, 1);
1992 /* shl idxRegTmp, X86_EFL_OF_BIT */
1993 off = iemNativeEmitShiftGpr32LeftEx(pCodeBuf, off, idxRegTmp, X86_EFL_OF_BIT);
1994 }
1995 else
1996 {
1997 /* Intel first bit shifted: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
1998 if (cOpBits <= 32)
1999 {
2000 /* mov idxRegTmp, idxRegSrc */
2001 off = iemNativeEmitLoadGprFromGpr32Ex(pCodeBuf, off, idxRegTmp, idxRegSrc);
2002 /* shl idxRegTmp, 1 */
2003 off = iemNativeEmitShiftGpr32LeftEx(pCodeBuf, off, idxRegTmp, 1);
2004 /* xor idxRegTmp, idxRegSrc */
2005 off = iemNativeEmitXorGprByGprEx(pCodeBuf, off, idxRegTmp, idxRegSrc);
2006 /* shr idxRegTmp, cOpBits - X86_EFL_OF_BIT - 1 or shl idxRegTmp, X86_EFL_OF_BIT - cOpBits + 1 */
2007 if (cOpBits >= X86_EFL_OF_BIT)
2008 off = iemNativeEmitShiftGpr32RightEx(pCodeBuf, off, idxRegTmp, cOpBits - X86_EFL_OF_BIT - 1);
2009 else
2010 off = iemNativeEmitShiftGpr32LeftEx(pCodeBuf, off, idxRegTmp, X86_EFL_OF_BIT - cOpBits + 1);
2011 }
2012 else
2013 {
2014 /* same as above but with 64-bit grps*/
2015 off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegTmp, idxRegSrc);
2016 off = iemNativeEmitShiftGprLeftEx(pCodeBuf, off, idxRegTmp, 1);
2017 off = iemNativeEmitXorGprByGprEx(pCodeBuf, off, idxRegTmp, idxRegSrc);
2018 off = iemNativeEmitShiftGprRightEx(pCodeBuf, off, idxRegTmp, cOpBits - X86_EFL_OF_BIT - 1);
2019 }
2020 /* and idxRegTmp, X86_EFL_OF */
2021 off = iemNativeEmitAndGpr32ByImmEx(pCodeBuf, off, idxRegTmp, X86_EFL_OF);
2022 }
2023 }
2024 /* Or in the collected flag(s) */
2025 off = iemNativeEmitOrGpr32ByGprEx(pCodeBuf, off, idxRegEfl, idxRegTmp);
2026
2027#elif defined(RT_ARCH_ARM64)
2028 /*
2029 * Calculate flags.
2030 */
2031 PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 20);
2032
2033 /* Clear the status bits. ~0x8D5 (or ~0x8FD) can't be AND immediate, so use idxRegTmp for constant. */
2034 off = iemNativeEmitLoadGpr32ImmEx(pCodeBuf, off, idxRegTmp, ~X86_EFL_STATUS_BITS);
2035 off = iemNativeEmitAndGpr32ByGpr32Ex(pCodeBuf, off, idxRegEfl, idxRegTmp);
2036
2037 /* N,Z -> SF,ZF */
2038 if (cOpBits < 32)
2039 pCodeBuf[off++] = Armv8A64MkInstrSetF8SetF16(idxRegResult, cOpBits > 8); /* sets NZ */
2040 else
2041 pCodeBuf[off++] = Armv8A64MkInstrAnds(ARMV8_A64_REG_XZR, idxRegResult, idxRegResult, cOpBits > 32 /*f64Bit*/);
2042 pCodeBuf[off++] = Armv8A64MkInstrMrs(idxRegTmp, ARMV8_AARCH64_SYSREG_NZCV); /* Bits: 31=N; 30=Z; 29=C; 28=V; */
2043 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegTmp, 30);
2044 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_ZF_BIT, 2, false /*f64Bit*/);
2045 AssertCompile(X86_EFL_ZF_BIT + 1 == X86_EFL_SF_BIT);
2046
2047 /* Calculate 8-bit parity of the result. */
2048 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegResult, idxRegResult, false /*f64Bit*/,
2049 4 /*offShift6*/, kArmv8A64InstrShift_Lsr);
2050 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegTmp, idxRegTmp, false /*f64Bit*/,
2051 2 /*offShift6*/, kArmv8A64InstrShift_Lsr);
2052 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegTmp, idxRegTmp, false /*f64Bit*/,
2053 1 /*offShift6*/, kArmv8A64InstrShift_Lsr);
2054 Assert(Armv8A64ConvertImmRImmS2Mask32(0, 0) == 1);
2055 pCodeBuf[off++] = Armv8A64MkInstrEorImm(idxRegTmp, idxRegTmp, 0, 0, false /*f64Bit*/);
2056 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_PF_BIT, 1, false /*f64Bit*/);
2057
2058 /* Calculate carry - the last bit shifted out of the input value. */
2059 if (enmType == kIemNativeEmitEFlagsForShiftType_Left)
2060 {
2061 /* CF = (idxRegSrc >> (cOpBits - idxRegCount))) & 1 */
2062 pCodeBuf[off++] = Armv8A64MkInstrMovZ(idxRegTmp, cOpBits);
2063 pCodeBuf[off++] = Armv8A64MkInstrSubReg(idxRegTmp, idxRegTmp, idxRegCount, false /*f64Bit*/, cOpBits < 32 /*fSetFlags*/);
2064 if (cOpBits < 32)
2065 pCodeBuf[off++] = Armv8A64MkInstrBCond(kArmv8InstrCond_Cc, 3); /* 16 or 8 bit: CF is clear if all shifted out */
2066 pCodeBuf[off++] = Armv8A64MkInstrLsrv(idxRegTmp, idxRegSrc, idxRegTmp, cOpBits > 32);
2067 }
2068 else
2069 {
2070 /* CF = (idxRegSrc >> (idxRegCount - 1)) & 1 */
2071 pCodeBuf[off++] = Armv8A64MkInstrSubUImm12(idxRegTmp, idxRegCount, 1, false /*f64Bit*/);
2072 pCodeBuf[off++] = Armv8A64MkInstrLsrv(idxRegTmp, idxRegSrc, idxRegTmp, cOpBits > 32);
2073 }
2074 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_CF_BIT, 1, false /*f64Bit*/);
2075
2076 uint8_t const idxTargetCpuEflFlavour = pReNative->pVCpu->iem.s.aidxTargetCpuEflFlavour[0];
2077 if (idxTargetCpuEflFlavour != IEMTARGETCPU_EFL_BEHAVIOR_AMD)
2078 {
2079 /* Intel: OF = first bit shifted: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
2080 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegSrc, idxRegSrc, cOpBits > 32, 1 /*left shift count*/);
2081 pCodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegTmp, cOpBits - 1, cOpBits > 32);
2082 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_OF_BIT, 1, false /*f64Bit*/);
2083 }
2084 else
2085 {
2086 /* AMD: OF = last bit shifted: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
2087 AssertCompile(X86_EFL_CF_BIT == 0);
2088 pCodeBuf[off++] = Armv8A64MkInstrEor(idxRegTmp, idxRegEfl, idxRegResult, cOpBits > 32, /* ASSUMES CF calculated! */
2089 cOpBits - 1, kArmv8A64InstrShift_Lsr);
2090 pCodeBuf[off++] = Armv8A64MkInstrBfi(idxRegEfl, idxRegTmp, X86_EFL_OF_BIT, 1, false /*f64Bit*/);
2091
2092 /* AMD unconditionally clears AF. */
2093 Assert(Armv8A64ConvertImmRImmS2Mask32(0, 32 - X86_EFL_AF_BIT) == X86_EFL_AF);
2094 pCodeBuf[off++] = Armv8A64MkInstrOrrImm(idxRegEfl, idxRegEfl, 0, 32 - X86_EFL_AF_BIT, false /*f64Bit*/);
2095 }
2096#else
2097# error "port me"
2098#endif
2099 IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
2100
2101#ifdef IEMNATIVE_WITH_EFLAGS_SKIPPING
2102 if (pReNative->fSkippingEFlags)
2103 Log5(("EFLAGS: fSkippingEFlags %#x -> 0 (iemNativeEmitEFlagsForShift)\n", pReNative->fSkippingEFlags));
2104 pReNative->fSkippingEFlags = 0;
2105# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
2106 off = iemNativeEmitStoreImmToVCpuU32(pReNative, off, 0, RT_UOFFSETOF(VMCPU, iem.s.fSkippingEFlags));
2107# endif
2108#endif
2109 }
2110 return off;
2111}
2112
2113
/**
 * The SHL instruction with the shift count in CL.
 *
 * A zero (masked) shift count is a NOP and leaves EFLAGS untouched, hence the
 * conditional jump around both the shift and the flag calculation.
 *
 * @returns New code buffer offset.
 * @param   pReNative     The native recompiler state.
 * @param   off           Current offset into the code buffer.
 * @param   idxVarDst     Variable index of the value to shift (destination).
 * @param   idxVarCount   Variable index of the shift count (modified on arm64).
 * @param   idxVarEfl     Variable index of the EFLAGS value to update.
 * @param   cOpBits       The x86 operand width in bits.
 */
DECL_INLINE_THROW(uint32_t)
iemNativeEmit_shl_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
                           uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
{
    /* Note! Since we're doing some branching here, we need to allocate all
             registers we need before the jump or we may end up with invalid
             register state if the branch is taken. */
    uint8_t const idxRegTmp   = iemNativeRegAllocTmp(pReNative, &off); /* Do this first in hope we'll get EAX. */
    uint8_t const idxRegCount = iemNativeVarRegisterAcquire(pReNative, idxVarCount, &off, true /*fInitialized*/); /* modified on arm64 */
    uint8_t const idxRegDst   = iemNativeVarRegisterAcquire(pReNative, idxVarDst, &off, true /*fInitialized*/);
    uint8_t const idxRegEfl   = iemNativeVarRegisterAcquire(pReNative, idxVarEfl, &off, true /*fInitialized*/);

#ifdef RT_ARCH_AMD64
    /* Make sure IEM_MC_NATIVE_AMD64_HOST_REG_FOR_LOCAL was used. */
    AssertStmt(idxRegCount == X86_GREG_xCX, IEMNATIVE_DO_LONGJMP(pReNative, VERR_IEM_EMIT_UNEXPECTED_VAR_REGISTER));

    /* We only need a copy of the input value if the target CPU differs from the host CPU. */
    uint8_t const idxRegDstIn = pReNative->pVCpu->iem.s.aidxTargetCpuEflFlavour[1] == IEMTARGETCPU_EFL_BEHAVIOR_NATIVE
                              ? UINT8_MAX : iemNativeRegAllocTmp(pReNative, &off);
    PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 4+2+3+4);

    /* Check if it's NOP before we do anything. */
    off = iemNativeEmitTestAnyBitsInGpr8Ex(pCodeBuf, off, idxRegCount, cOpBits <= 32 ? 0x1f : 0x3f);
    uint32_t const offFixup = off;
    off = iemNativeEmitJccToFixedEx(pCodeBuf, off, off /*8-bit should be enough */, kIemNativeInstrCond_z);

    if (idxRegDstIn != UINT8_MAX)
        off = iemNativeEmitLoadGprFromGprEx(pCodeBuf, off, idxRegDstIn, idxRegDst);
    /* shl idxRegDst, cl (group 2, /4) */
    off = iemNativeEmitAmd64OneByteModRmInstrRREx(pCodeBuf, off, 0xd2, 0xd3, cOpBits, 4, idxRegDst);

#elif defined(RT_ARCH_ARM64)
    /* We always need (unless we can skip the EFLAGS calcs) a copy of the input value. */
    uint8_t const idxRegDstIn = iemNativeRegAllocTmp(pReNative, &off);
    PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6);

    /* Check if it's NOP before we do anything. We MODIFY idxRegCount here! */
    Assert(Armv8A64ConvertImmRImmS2Mask32(4, 0) == 0x1f);
    Assert(Armv8A64ConvertImmRImmS2Mask32(5, 0) == 0x3f);
    pCodeBuf[off++] = Armv8A64MkInstrAndsImm(idxRegCount, idxRegCount, cOpBits > 32 ? 5 : 4, 0, false /*f64Bit*/);
    uint32_t const offFixup = off;
    off = iemNativeEmitJccToFixedEx(pCodeBuf, off, off, kArmv8InstrCond_Eq);

    pCodeBuf[off++] = Armv8A64MkInstrMov(idxRegDstIn, idxRegDst);
    pCodeBuf[off++] = Armv8A64MkInstrLslv(idxRegDst, idxRegDst, idxRegCount, cOpBits > 32 /*f64Bit*/);
    if (cOpBits < 32)
    {
        /* Mask the result back down to the operand width. */
        Assert(Armv8A64ConvertImmRImmS2Mask32(7, 0) == 0xff);
        Assert(Armv8A64ConvertImmRImmS2Mask32(15, 0) == 0xffff);
        pCodeBuf[off++] = Armv8A64MkInstrAndImm(idxRegDst, idxRegDst, cOpBits - 1, 0, false /*f64Bit*/);
    }

#else
# error "port me"
#endif

    IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
    off = iemNativeEmitEFlagsForShift(pReNative, off, idxRegEfl, idxRegDst, idxRegDstIn, idxRegCount,
                                      cOpBits, kIemNativeEmitEFlagsForShiftType_Left, idxRegTmp);

    /* fixup the jump */
    iemNativeFixupFixedJump(pReNative, offFixup, off);

#ifdef RT_ARCH_AMD64
    if (idxRegDstIn != UINT8_MAX)
#endif
        iemNativeRegFreeTmp(pReNative, idxRegDstIn);
    iemNativeVarRegisterRelease(pReNative, idxVarEfl);
    iemNativeVarRegisterRelease(pReNative, idxVarDst);
    iemNativeVarRegisterRelease(pReNative, idxVarCount);
    iemNativeRegFreeTmp(pReNative, idxRegTmp);
    return off;
}
2186
2187
2188DECL_INLINE_THROW(uint32_t)
2189iemNativeEmit_shr_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2190 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2191{
2192 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2193 AssertFailed();
2194 return iemNativeEmitBrk(pReNative, off, 0x666);
2195}
2196
2197
2198DECL_INLINE_THROW(uint32_t)
2199iemNativeEmit_sar_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2200 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2201{
2202 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2203 AssertFailed();
2204 return iemNativeEmitBrk(pReNative, off, 0x666);
2205}
2206
2207
2208DECL_INLINE_THROW(uint32_t)
2209iemNativeEmit_rol_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2210 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2211{
2212 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2213 AssertFailed();
2214 return iemNativeEmitBrk(pReNative, off, 0x666);
2215}
2216
2217
2218DECL_INLINE_THROW(uint32_t)
2219iemNativeEmit_ror_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2220 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2221{
2222 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2223 AssertFailed();
2224 return iemNativeEmitBrk(pReNative, off, 0x666);
2225}
2226
2227
2228DECL_INLINE_THROW(uint32_t)
2229iemNativeEmit_rcl_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2230 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2231{
2232 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2233 AssertFailed();
2234 return iemNativeEmitBrk(pReNative, off, 0x666);
2235}
2236
2237
2238DECL_INLINE_THROW(uint32_t)
2239iemNativeEmit_rcr_r_CL_efl(PIEMRECOMPILERSTATE pReNative, uint32_t off,
2240 uint8_t idxVarDst, uint8_t idxVarCount, uint8_t idxVarEfl, uint8_t cOpBits)
2241{
2242 RT_NOREF(idxVarDst, idxVarCount, idxVarEfl, cOpBits);
2243 AssertFailed();
2244 return iemNativeEmitBrk(pReNative, off, 0x666);
2245}
2246
2247
2248
2249#ifdef IEMNATIVE_WITH_SIMD_REG_ALLOCATOR
2250/*********************************************************************************************************************************
2251* SIMD emitters. *
2252*********************************************************************************************************************************/
2253
/**
 * Common emitter for 128-bit packed logical instructions (POR, PXOR, PAND).
 *
 * Instantiates two emitters per instruction:
 *   - iemNativeEmit_<a_Instr>_rr_u128: guest SIMD reg <- guest SIMD reg.
 *   - iemNativeEmit_<a_Instr>_rv_u128: guest SIMD reg <- recompiler variable.
 *
 * @param   a_Instr     The instruction mnemonic (lowercase), used to form the
 *                      emitter function names.
 * @param   a_enmArmOp  The ARMv8 NEON logical operation (kArmv8VecInstrLogicOp_XXX).
 * @param   a_bOpcX86   The x86 opcode byte following the 0x0f escape byte.
 */
#ifdef RT_ARCH_AMD64
# define IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(a_Instr, a_enmArmOp, a_bOpcX86) \
    DECL_INLINE_THROW(uint32_t) \
    RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
                                                uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
    { \
        uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
                                                                              kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
        uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
                                                                              kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
        PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
        pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
        if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
            pCodeBuf[off++] =   (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
                              | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
        pCodeBuf[off++] = 0x0f; \
        pCodeBuf[off++] = (a_bOpcX86); \
        pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
        iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
        iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
        IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
        return off; \
    } \
    DECL_INLINE_THROW(uint32_t) \
    RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
                                                uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
    { \
        uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
                                                                              kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
        uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
        PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
        pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
        if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
            pCodeBuf[off++] =   (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
                              | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
        pCodeBuf[off++] = 0x0f; \
        pCodeBuf[off++] = (a_bOpcX86); \
        pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
        iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
        iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
        IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
        return off; \
    } \
    typedef int ignore_semicolon
#elif defined(RT_ARCH_ARM64)
# define IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(a_Instr, a_enmArmOp, a_bOpcX86) \
    DECL_INLINE_THROW(uint32_t) \
    RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
                                                uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
    { \
        uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
                                                                              kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
        uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
                                                                              kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
        PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
        pCodeBuf[off++] = Armv8A64MkVecInstrLogical((a_enmArmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc); \
        iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
        iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
        IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
        return off; \
    } \
    DECL_INLINE_THROW(uint32_t) \
    RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
                                                uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
    { \
        uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
                                                                              kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
        uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
        PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
        pCodeBuf[off++] = Armv8A64MkVecInstrLogical((a_enmArmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc); \
        iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
        iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
        IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
        return off; \
    } \
    typedef int ignore_semicolon
#else
# error "Port me"
#endif

/* POR, ORPS, ORPD. */
IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(por, kArmv8VecInstrLogicOp_Orr, 0xeb);
/* PXOR, XORPS, XORPD. */
IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(pxor, kArmv8VecInstrLogicOp_Eor, 0xef);
/* PAND, ANDPS, ANDPD. */
IEMNATIVE_NATIVE_EMIT_LOGICAL_OP_U128(pand, kArmv8VecInstrLogicOp_And, 0xdb);
2343
2344
/**
 * Common emitter for the 128-bit packed shift-right-by-immediate instructions
 * (PSRLW, PSRLD, PSRLQ).
 *
 * Instantiates iemNativeEmit_<a_Instr>_ri_u128 (guest SIMD reg, imm8).
 * An immediate of zero is a no-op and emits no code.
 *
 * @param   a_Instr     The instruction mnemonic (lowercase).
 * @param   a_cShiftMax The element width in bits; on ARM the shift count is
 *                      clamped to this (shifting out all bits yields zero,
 *                      matching x86 semantics for counts >= element width).
 * @param   a_ArmElemSz The ARMv8 NEON element size (kArmv8InstrShiftSz_XXX).
 * @param   a_bOpcX86   The x86 opcode byte following the 0x0f escape byte.
 */
#ifdef RT_ARCH_AMD64
# define IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(a_Instr, a_cShiftMax, a_ArmElemSz, a_bOpcX86) \
    DECL_INLINE_THROW(uint32_t) \
    RT_CONCAT3(iemNativeEmit_,a_Instr,_ri_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
                                                uint8_t const idxSimdGstRegDst, uint8_t const bImm) \
    { \
        if (bImm) \
        { \
            uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
                                                                                  kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
            PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6); \
            pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
            if (idxSimdRegDst >= 8) \
                pCodeBuf[off++] = X86_OP_REX_B; \
            pCodeBuf[off++] = 0x0f; \
            pCodeBuf[off++] = (a_bOpcX86); \
            pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 2, idxSimdRegDst & 7); \
            pCodeBuf[off++] = bImm; \
            iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
            IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
        } \
        /* Immediate 0 is a nop. */ \
        return off; \
    } \
    typedef int ignore_semicolon
#elif defined(RT_ARCH_ARM64)
# define IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(a_Instr, a_cShiftMax, a_ArmElemSz, a_bOpcX86) \
    DECL_INLINE_THROW(uint32_t) \
    RT_CONCAT3(iemNativeEmit_,a_Instr,_ri_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
                                                uint8_t const idxSimdGstRegDst, uint8_t const bImm) \
    { \
        if (bImm) \
        { \
            uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
                                                                                  kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
            PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
            pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegDst, idxSimdRegDst, RT_MIN(bImm, (a_cShiftMax)), (a_ArmElemSz)); \
            iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
            IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
        } \
        /* Immediate 0 is a nop. */ \
        return off; \
    } \
    typedef int ignore_semicolon
#else
# error "Port me"
#endif

IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(psrlw, 16, kArmv8InstrShiftSz_U16, 0x71);
IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(psrld, 32, kArmv8InstrShiftSz_U32, 0x72);
IEMNATIVE_NATIVE_EMIT_SHIFT_RIGHT_IMM_U128(psrlq, 64, kArmv8InstrShiftSz_U64, 0x73);
2399
2400
/**
 * Common emitter for the 128-bit packed shift-left-by-immediate instructions
 * (PSLLW, PSLLD, PSLLQ).
 *
 * Instantiates iemNativeEmit_<a_Instr>_ri_u128 (guest SIMD reg, imm8).
 * An immediate of zero is a no-op and emits no code.
 *
 * @param   a_Instr     The instruction mnemonic (lowercase).
 * @param   a_cShiftMax The element width in bits; on ARM counts >= this zero
 *                      the destination (NEON SHL cannot encode such counts,
 *                      so an EOR with itself is emitted instead).
 * @param   a_ArmElemSz The ARMv8 NEON element size (kArmv8InstrShiftSz_XXX).
 * @param   a_bOpcX86   The x86 opcode byte following the 0x0f escape byte.
 */
#ifdef RT_ARCH_AMD64
# define IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(a_Instr, a_cShiftMax, a_ArmElemSz, a_bOpcX86) \
    DECL_INLINE_THROW(uint32_t) \
    RT_CONCAT3(iemNativeEmit_,a_Instr,_ri_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
                                                uint8_t const idxSimdGstRegDst, uint8_t const bImm) \
    { \
        if (bImm) \
        { \
            uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
                                                                                  kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
            PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6); \
            pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
            if (idxSimdRegDst >= 8) \
                pCodeBuf[off++] = X86_OP_REX_B; \
            pCodeBuf[off++] = 0x0f; \
            pCodeBuf[off++] = (a_bOpcX86); \
            pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 6, idxSimdRegDst & 7); \
            pCodeBuf[off++] = bImm; \
            iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
            IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
        } \
        /* Immediate 0 is a nop. */ \
        return off; \
    } \
    typedef int ignore_semicolon
#elif defined(RT_ARCH_ARM64)
# define IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(a_Instr, a_cShiftMax, a_ArmElemSz, a_bOpcX86) \
    DECL_INLINE_THROW(uint32_t) \
    RT_CONCAT3(iemNativeEmit_,a_Instr,_ri_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
                                                uint8_t const idxSimdGstRegDst, uint8_t const bImm) \
    { \
        if (bImm) /* bImm == 0 is a nop */ \
        { \
            uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
                                                                                  kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
            PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
            if (bImm < (a_cShiftMax)) \
                pCodeBuf[off++] = Armv8A64MkVecInstrShlImm(idxSimdRegDst, idxSimdRegDst, bImm, (a_ArmElemSz)); \
            else /* Everything >= a_cShiftMax sets the register to zero. */ \
                pCodeBuf[off++] = Armv8A64MkVecInstrEor(idxSimdRegDst, idxSimdRegDst, idxSimdRegDst); \
            iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
            IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
        } \
        return off; \
    } \
    typedef int ignore_semicolon
#else
# error "Port me"
#endif

IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(psllw, 16, kArmv8InstrShiftSz_U16, 0x71);
IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(pslld, 32, kArmv8InstrShiftSz_U32, 0x72);
IEMNATIVE_NATIVE_EMIT_SHIFT_LEFT_IMM_U128(psllq, 64, kArmv8InstrShiftSz_U64, 0x73);
2457
2458
/**
 * Common emitter for 128-bit packed arithmetic instructions
 * (PADDx, PSUBx, PADDUSx, PMULLx).
 *
 * Instantiates two emitters per instruction:
 *   - iemNativeEmit_<a_Instr>_rr_u128: guest SIMD reg <- guest SIMD reg.
 *   - iemNativeEmit_<a_Instr>_rv_u128: guest SIMD reg <- recompiler variable.
 *
 * @param   a_Instr     The instruction mnemonic (lowercase).
 * @param   a_enmArmOp  The ARMv8 NEON arithmetic op (kArmv8VecInstrArithOp_XXX).
 * @param   a_ArmElemSz The ARMv8 NEON element size (kArmv8VecInstrArithSz_XXX).
 * @param   a_bOpcX86   The x86 opcode byte following the 0x0f escape byte.
 */
#ifdef RT_ARCH_AMD64
# define IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(a_Instr, a_enmArmOp, a_ArmElemSz, a_bOpcX86) \
    DECL_INLINE_THROW(uint32_t) \
    RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
                                                uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
    { \
        uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
                                                                              kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
        uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
                                                                              kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
        PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
        pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
        if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
            pCodeBuf[off++] =   (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
                              | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
        pCodeBuf[off++] = 0x0f; \
        pCodeBuf[off++] = (a_bOpcX86); \
        pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
        iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
        iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
        IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
        return off; \
    } \
    DECL_INLINE_THROW(uint32_t) \
    RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
                                                uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
    { \
        uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
                                                                              kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
        uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
        PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
        pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
        if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
            pCodeBuf[off++] =   (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
                              | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
        pCodeBuf[off++] = 0x0f; \
        pCodeBuf[off++] = (a_bOpcX86); \
        pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
        iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
        iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
        IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
        return off; \
    } \
    typedef int ignore_semicolon
#elif defined(RT_ARCH_ARM64)
# define IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(a_Instr, a_enmArmOp, a_ArmElemSz, a_bOpcX86) \
    DECL_INLINE_THROW(uint32_t) \
    RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
                                                uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
    { \
        uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
                                                                              kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
        uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
                                                                              kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
        PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
        pCodeBuf[off++] = Armv8A64MkVecInstrArithOp((a_enmArmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc, (a_ArmElemSz)); \
        iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
        iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
        IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
        return off; \
    } \
    DECL_INLINE_THROW(uint32_t) \
    RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
                                                uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
    { \
        uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
                                                                              kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
        uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
        PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
        pCodeBuf[off++] = Armv8A64MkVecInstrArithOp((a_enmArmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc, (a_ArmElemSz)); \
        iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
        iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
        IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
        return off; \
    } \
    typedef int ignore_semicolon
#else
# error "Port me"
#endif

/*
 * PADDx.
 */
IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddb, kArmv8VecInstrArithOp_Add, kArmv8VecInstrArithSz_8, 0xfc);
IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddw, kArmv8VecInstrArithOp_Add, kArmv8VecInstrArithSz_16, 0xfd);
IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddd, kArmv8VecInstrArithOp_Add, kArmv8VecInstrArithSz_32, 0xfe);
IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddq, kArmv8VecInstrArithOp_Add, kArmv8VecInstrArithSz_64, 0xd4);

/*
 * PSUBx.
 */
IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(psubb, kArmv8VecInstrArithOp_Sub, kArmv8VecInstrArithSz_8, 0xf8);
IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(psubw, kArmv8VecInstrArithOp_Sub, kArmv8VecInstrArithSz_16, 0xf9);
IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(psubd, kArmv8VecInstrArithOp_Sub, kArmv8VecInstrArithSz_32, 0xfa);
IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(psubq, kArmv8VecInstrArithOp_Sub, kArmv8VecInstrArithSz_64, 0xfb);

/*
 * PADDUSx.
 */
IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddusb, kArmv8VecInstrArithOp_UnsignSat_Add, kArmv8VecInstrArithSz_8, 0xdc);
IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(paddusw, kArmv8VecInstrArithOp_UnsignSat_Add, kArmv8VecInstrArithSz_16, 0xdd);

/*
 * PMULLx.
 */
IEMNATIVE_NATIVE_EMIT_ARITH_OP_U128(pmullw, kArmv8VecInstrArithOp_Mul, kArmv8VecInstrArithSz_16, 0xd5);
2568
2569
/**
 * Common emitter for the 128-bit packed compare instructions
 * (PCMPEQB, PCMPEQW, PCMPEQD).
 *
 * Instantiates two emitters per instruction:
 *   - iemNativeEmit_<a_Instr>_rr_u128: guest SIMD reg <- guest SIMD reg.
 *   - iemNativeEmit_<a_Instr>_rv_u128: guest SIMD reg <- recompiler variable.
 *
 * @param   a_Instr     The instruction mnemonic (lowercase).
 * @param   a_enmOp     The ARMv8 NEON compare op (kArmv8VecInstrCmpOp_XXX).
 * @param   a_ArmElemSz The ARMv8 NEON element size (kArmv8VecInstrArithSz_XXX).
 * @param   a_bOpcX86   The x86 opcode byte following the 0x0f escape byte.
 */
#ifdef RT_ARCH_AMD64
# define IEMNATIVE_NATIVE_EMIT_PCMP_U128(a_Instr, a_enmOp, a_ArmElemSz, a_bOpcX86) \
    DECL_INLINE_THROW(uint32_t) \
    RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
                                                uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
    { \
        uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
                                                                              kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
        uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
                                                                              kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
        PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
        pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
        if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
            pCodeBuf[off++] =   (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
                              | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
        pCodeBuf[off++] = 0x0f; \
        pCodeBuf[off++] = (a_bOpcX86); \
        pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
        iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
        iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
        IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
        return off; \
    } \
    DECL_INLINE_THROW(uint32_t) \
    RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
                                                uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
    { \
        uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
                                                                              kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
        uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
        PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5); \
        pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
        if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
            pCodeBuf[off++] =   (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
                              | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
        pCodeBuf[off++] = 0x0f; \
        pCodeBuf[off++] = (a_bOpcX86); \
        pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
        iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
        iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
        IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
        return off; \
    } \
    typedef int ignore_semicolon
#elif defined(RT_ARCH_ARM64)
# define IEMNATIVE_NATIVE_EMIT_PCMP_U128(a_Instr, a_enmOp, a_ArmElemSz, a_bOpcX86) \
    DECL_INLINE_THROW(uint32_t) \
    RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
                                                uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
    { \
        uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
                                                                              kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
        uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
                                                                              kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
        PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
        pCodeBuf[off++] = Armv8A64MkVecInstrCmp((a_enmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc, (a_ArmElemSz)); \
        iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
        iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
        IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
        return off; \
    } \
    DECL_INLINE_THROW(uint32_t) \
    RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
                                                uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
    { \
        uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
                                                                              kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
        uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
        PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
        pCodeBuf[off++] = Armv8A64MkVecInstrCmp((a_enmOp), idxSimdRegDst, idxSimdRegDst, idxSimdRegSrc, (a_ArmElemSz)); \
        iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
        iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
        IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
        return off; \
    } \
    typedef int ignore_semicolon
#else
# error "Port me"
#endif

IEMNATIVE_NATIVE_EMIT_PCMP_U128(pcmpeqb, kArmv8VecInstrCmpOp_Eq, kArmv8VecInstrArithSz_8, 0x74);
IEMNATIVE_NATIVE_EMIT_PCMP_U128(pcmpeqw, kArmv8VecInstrCmpOp_Eq, kArmv8VecInstrArithSz_16, 0x75);
IEMNATIVE_NATIVE_EMIT_PCMP_U128(pcmpeqd, kArmv8VecInstrCmpOp_Eq, kArmv8VecInstrArithSz_32, 0x76);
2656
2657
/**
 * Emitter for pmovmskb.
 *
 * Gathers the most significant bit of each byte in the source 128-bit SIMD
 * register into the low 16 bits of the destination GPR (full-write).
 *
 * @param   idxGstRegDst        The guest GPR receiving the byte mask.
 * @param   idxSimdGstRegSrc    The guest SIMD register supplying the bytes.
 */
DECL_INLINE_THROW(uint32_t)
iemNativeEmit_pmovmskb_rr_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off,
                               uint8_t const idxGstRegDst, uint8_t const idxSimdGstRegSrc)
{
#ifdef RT_ARCH_AMD64
    uint8_t const idxRegDst = iemNativeRegAllocTmpForGuestReg(pReNative, &off, IEMNATIVEGSTREG_GPR(idxGstRegDst),
                                                              kIemNativeGstRegUse_ForFullWrite);
    uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off,
                                                                          IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc),
                                                                          kIemNativeGstSimdRegLdStSz_Low128,
                                                                          kIemNativeGstRegUse_ReadOnly);
    PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);

    /* pmovmskb gpr, xmm: 66 [REX] 0f d7 /r */
    pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
    if (idxRegDst >= 8 || idxSimdRegSrc >= 8)
        pCodeBuf[off++] =   (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
                          | (idxRegDst >= 8 ? X86_OP_REX_R : 0);
    pCodeBuf[off++] = 0x0f;
    pCodeBuf[off++] = 0xd7;
    pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxRegDst & 7, idxSimdRegSrc & 7);

    iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc);
    iemNativeRegFreeTmp(pReNative, idxRegDst);

#elif defined(RT_ARCH_ARM64)
    uint8_t const idxRegDst = iemNativeRegAllocTmpForGuestReg(pReNative, &off, IEMNATIVEGSTREG_GPR(idxGstRegDst),
                                                              kIemNativeGstRegUse_ForFullWrite);
    uint8_t const idxRegTmp = iemNativeRegAllocTmp(pReNative, &off);
    /* Note! kIemNativeGstRegUse_Calculation: the shift/accumulate sequence below
             destroys the source register copy we work on. */
    uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off,
                                                                          IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc),
                                                                          kIemNativeGstSimdRegLdStSz_Low128,
                                                                          kIemNativeGstRegUse_Calculation);
    PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 7);

    /*
     * See https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
     * for different approaches as NEON doesn't has an instruction equivalent for pmovmskb, so we have to emulate that.
     *
     * As there is no way around emulating the exact semantics of pmovmskb we will use the same algorithm
     * as the sse2neon implementation because there we can get away with loading any constants and the
     * base algorithm is only 4 NEON instructions (+ 3 for extracting the result to a general register).
     *
     * The following illustrates the algorithm:
     *
     *     Byte vector Element ->       15       14       13       12       11       10        9        8        7        6        5        4        3        2        1        0
     *     Instruction
     *            |
     *            V
     *                           Axxxxxxx Bxxxxxxx Cxxxxxxx Dxxxxxxx Exxxxxxx Fxxxxxxx Gxxxxxxx Hxxxxxxx Ixxxxxxx Jxxxxxxx Kxxxxxxx Lxxxxxxx Mxxxxxxx Nxxxxxxx Oxxxxxxx Pxxxxxxx
     *     USHR v.16B, v.16B, #7 0000000A 0000000B 0000000C 0000000D 0000000E 0000000F 0000000G 0000000H 0000000I 0000000J 0000000K 0000000L 0000000M 0000000N 0000000O 0000000P
     *     USRA v.8H,  v.8H,  #7 00000000 000000AB 00000000 000000CD 00000000 000000EF 00000000 000000GH 00000000 000000IJ 00000000 000000KL 00000000 000000MN 00000000 000000OP
     *     USRA v.4S,  v.4S, #14 00000000 00000000 00000000 0000ABCD 00000000 00000000 00000000 0000EFGH 00000000 00000000 00000000 0000IJKL 00000000 00000000 00000000 0000MNOP
     *     USRA v.2D,  v.2D, #28 00000000 00000000 00000000 00000000 00000000 00000000 00000000 ABCDEFGH 00000000 00000000 00000000 00000000 00000000 00000000 00000000 IJKLMNOP
     *
     *     The extraction process
     *     UMOV wTMP, v.16B[8]           00000000 00000000 00000000 00000000 00000000 00000000 00000000 ABCDEFGH
     *     UMOV wRES, v.16B[0]           00000000 00000000 00000000 00000000 00000000 00000000 00000000 IJKLMNOP
     *     ORR xRES, xRES, xTMP, LSL #8  00000000 00000000 00000000 00000000 00000000 00000000 ABCDEFGH IJKLMNOP
     */
    pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegSrc, idxSimdRegSrc, 7, kArmv8InstrShiftSz_U8);
    pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegSrc, idxSimdRegSrc, 7, kArmv8InstrShiftSz_U16, true /*fUnsigned*/, false /*fRound*/, true /*fAccum*/);
    pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegSrc, idxSimdRegSrc, 14, kArmv8InstrShiftSz_U32, true /*fUnsigned*/, false /*fRound*/, true /*fAccum*/);
    pCodeBuf[off++] = Armv8A64MkVecInstrShrImm(idxSimdRegSrc, idxSimdRegSrc, 28, kArmv8InstrShiftSz_U64, true /*fUnsigned*/, false /*fRound*/, true /*fAccum*/);
    pCodeBuf[off++] = Armv8A64MkVecInstrUmov(idxRegTmp, idxSimdRegSrc, 8, kArmv8InstrUmovInsSz_U8, false /*fDst64Bit*/);
    pCodeBuf[off++] = Armv8A64MkVecInstrUmov(idxRegDst, idxSimdRegSrc, 0, kArmv8InstrUmovInsSz_U8, false /*fDst64Bit*/);
    pCodeBuf[off++] = Armv8A64MkInstrOrr(idxRegDst, idxRegDst, idxRegTmp, true /*f64Bit*/, 8 /*offShift6*/);

    iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc);
    iemNativeRegFreeTmp(pReNative, idxRegTmp);
    iemNativeRegFreeTmp(pReNative, idxRegDst);

#else
# error "Port me"
#endif
    IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
    return off;
}
2738
2739
/**
 * Common emitter for the PACKUSWB instructions - guest register / guest register variant.
 *
 * Packs the signed words of destination and source into unsigned saturated
 * bytes: low 8 result bytes come from the destination, high 8 from the source.
 *
 * @param   idxSimdGstRegDst    The guest SIMD register that is both first input
 *                              and output (update access).
 * @param   idxSimdGstRegSrc    The guest SIMD register supplying the second
 *                              input (read-only access).
 */
DECL_INLINE_THROW(uint32_t)
iemNativeEmit_packuswb_rr_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off,
                               uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc)
{
    uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
                                                                          kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate);
    uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc),
                                                                          kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly);

#ifdef RT_ARCH_AMD64
    PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);

    /* packuswb xmm, xmm: 66 [REX] 0f 67 /r */
    pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
    if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8)
        pCodeBuf[off++] =   (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
                          | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0);
    pCodeBuf[off++] = 0x0f;
    pCodeBuf[off++] = 0x67;
    pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7);

#elif defined(RT_ARCH_ARM64)
    PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2);

    /* SQXTUN (lower half from dst), then SQXTUN2 (upper half from src). */
    pCodeBuf[off++] = Armv8A64MkVecInstrQxtn(kArmv8VecInstrQxtnOp_Sqxtun, false /*fUpper*/, idxSimdRegDst, idxSimdRegDst, kArmv8VecInstrArithSz_8);
    pCodeBuf[off++] = Armv8A64MkVecInstrQxtn(kArmv8VecInstrQxtnOp_Sqxtun, true /*fUpper*/, idxSimdRegDst, idxSimdRegSrc, kArmv8VecInstrArithSz_8);

#else
# error "port me"
#endif

    iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
    iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc);

    IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
    return off;
}
2780
2781
/**
 * Common emitter for the PACKUSWB instructions - guest register / recompiler variable variant.
 *
 * Same operation as the _rr_ variant, but the second source is a 128-bit
 * recompiler variable instead of a guest SIMD register.
 *
 * @returns New code buffer offset.
 * @param   pReNative           The native recompiler state.
 * @param   off                 Current offset into the code buffer.
 * @param   idxSimdGstRegDst    Guest SIMD register index; both first source and
 *                              destination (allocated for update).
 * @param   idxVarSrc           Recompiler variable (RTUINT128U) holding the
 *                              second source operand.
 */
DECL_INLINE_THROW(uint32_t)
iemNativeEmit_packuswb_rv_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off,
                               uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc)
{
    IEMNATIVE_ASSERT_VAR_IDX(pReNative, idxVarSrc);
    IEMNATIVE_ASSERT_VAR_SIZE(pReNative, idxVarSrc, sizeof(RTUINT128U));

    uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
                                                                          kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate);
    uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);


#ifdef RT_ARCH_AMD64
    PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);

    /* packuswb xmm, xmm (66 0f 67 /r); REX byte only when an extended register is involved. */
    pCodeBuf[off++] = X86_OP_PRF_SIZE_OP;
    if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8)
        pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
                        | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0);
    pCodeBuf[off++] = 0x0f;
    pCodeBuf[off++] = 0x67;
    pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7);

#elif defined(RT_ARCH_ARM64)
    PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2);

    /* SQXTUN fills the lower 8 bytes from dst, SQXTUN2 the upper 8 from src. */
    pCodeBuf[off++] = Armv8A64MkVecInstrQxtn(kArmv8VecInstrQxtnOp_Sqxtun, false /*fUpper*/, idxSimdRegDst, idxSimdRegDst, kArmv8VecInstrArithSz_8);
    pCodeBuf[off++] = Armv8A64MkVecInstrQxtn(kArmv8VecInstrQxtnOp_Sqxtun, true /*fUpper*/,  idxSimdRegDst, idxSimdRegSrc, kArmv8VecInstrArithSz_8);

#else
# error "port me"
#endif

    iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
    /* NOTE(review): acquired via iemNativeVarSimdRegisterAcquire but released with the
       generic iemNativeVarRegisterRelease - presumably the generic release handles SIMD
       variables as well; confirm against the variable management code. */
    iemNativeVarRegisterRelease(pReNative, idxVarSrc);

    IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
    return off;
}
2825
2826
/**
 * Common emitter for the pmov{s,z}x* instructions.
 *
 * Expands to two emitter functions per instruction:
 *  - iemNativeEmit_<instr>_rr_u128: guest SIMD reg <- guest SIMD reg.
 *  - iemNativeEmit_<instr>_rv_u128: guest SIMD reg <- recompiler variable
 *    (the source value arrives in a GPR and is first transferred into
 *    IEMNATIVE_SIMD_REG_FIXED_TMP0).
 *
 * @param   a_Instr         The instruction mnemonic (used as function name infix).
 * @param   a_fArmUnsigned  ARM64 only: true for zero extension (USHLL),
 *                          false for sign extension.
 * @param   a_ArmElemSz     ARM64 only: the source element size.
 * @param   a_bOpcX86       AMD64 only: the opcode byte in the 0x0f 0x38 map.
 */
#ifdef RT_ARCH_AMD64
# define IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(a_Instr, a_fArmUnsigned, a_ArmElemSz, a_bOpcX86) \
    DECL_INLINE_THROW(uint32_t) \
    RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
                                                uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
    { \
        if (idxSimdGstRegDst == idxSimdGstRegSrc) \
        { \
            /* Source and destination are the same guest register: one allocation, for update. */ \
            uint8_t const idxSimdReg = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
                                                                               kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
            PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6); \
            /* 66 [REX] 0f 38 <opc> /r with reg == rm. */ \
            pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
            if (idxSimdReg >= 8) \
                pCodeBuf[off++] = (idxSimdReg >= 8 ? X86_OP_REX_B | X86_OP_REX_R : 0); \
            pCodeBuf[off++] = 0x0f; \
            pCodeBuf[off++] = 0x38; \
            pCodeBuf[off++] = (a_bOpcX86); \
            pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdReg & 7, idxSimdReg & 7); \
            iemNativeSimdRegFreeTmp(pReNative, idxSimdReg); \
        } \
        else \
        { \
            uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
                                                                                  kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
            uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
                                                                                  kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite); \
            PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 6); \
            /* 66 [REX] 0f 38 <opc> /r */ \
            pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
            if (idxSimdRegDst >= 8 || idxSimdRegSrc >= 8) \
                pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0) \
                                | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
            pCodeBuf[off++] = 0x0f; \
            pCodeBuf[off++] = 0x38; \
            pCodeBuf[off++] = (a_bOpcX86); \
            pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, idxSimdRegSrc & 7); \
            iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
            iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
        } \
        IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
        return off; \
    } \
    DECL_INLINE_THROW(uint32_t) \
    RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
                                                uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
    { \
        uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
                                                                              kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite); \
        uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
        PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 7 + 6); \
        pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; /* Transfer value from GPR to temporary vector register using pinsrq. */ \
        pCodeBuf[off++] = X86_OP_REX_W \
                        | (IEMNATIVE_SIMD_REG_FIXED_TMP0 < 8 ? 0 : X86_OP_REX_R) \
                        | (idxRegSrc < 8 ? 0 : X86_OP_REX_B); \
        pCodeBuf[off++] = 0x0f; \
        pCodeBuf[off++] = 0x3a; \
        pCodeBuf[off++] = 0x22; \
        pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, IEMNATIVE_SIMD_REG_FIXED_TMP0 & 7, idxRegSrc & 7); \
        pCodeBuf[off++] = 0; /* QWord */\
        /* Now the actual pmov{s,z}x* from the temporary vector register. */ \
        pCodeBuf[off++] = X86_OP_PRF_SIZE_OP; \
        if (idxSimdRegDst >= 8 || IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8) \
            pCodeBuf[off++] = (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 ? X86_OP_REX_B : 0) \
                            | (idxSimdRegDst >= 8 ? X86_OP_REX_R : 0); \
        pCodeBuf[off++] = 0x0f; \
        pCodeBuf[off++] = 0x38; \
        pCodeBuf[off++] = (a_bOpcX86); \
        pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, idxSimdRegDst & 7, IEMNATIVE_SIMD_REG_FIXED_TMP0 & 7); \
        iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
        iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
        IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
        return off; \
    } \
    typedef int ignore_semicolon
#elif defined(RT_ARCH_ARM64)
# define IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(a_Instr, a_fArmUnsigned, a_ArmElemSz, a_bOpcX86) \
    DECL_INLINE_THROW(uint32_t) \
    RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
                                                uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
    { \
        if (idxSimdGstRegDst == idxSimdGstRegSrc) \
        { \
            /* Same guest register: widening shift-left-long by 0 in place. */ \
            uint8_t const idxSimdReg = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
                                                                               kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForUpdate); \
            PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
            pCodeBuf[off++] = Armv8A64MkVecInstrUShll(idxSimdReg, idxSimdReg, 0, (a_ArmElemSz), (a_fArmUnsigned)); \
            iemNativeSimdRegFreeTmp(pReNative, idxSimdReg); \
        } \
        else \
        { \
            uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc), \
                                                                                  kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly); \
            uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
                                                                                  kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite); \
            PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1); \
            pCodeBuf[off++] = Armv8A64MkVecInstrUShll(idxSimdRegDst, idxSimdRegSrc, 0, (a_ArmElemSz), (a_fArmUnsigned)); \
            iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
            iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc); \
        } \
        IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
        return off; \
    } \
    DECL_INLINE_THROW(uint32_t) \
    RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, \
                                                uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
    { \
        uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst), \
                                                                              kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite); \
        uint8_t const idxRegSrc = iemNativeVarRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/); \
        PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2); \
        pCodeBuf[off++] = Armv8A64MkVecInstrIns(IEMNATIVE_SIMD_REG_FIXED_TMP0, idxRegSrc, 0 /*idxElem*/); /* Transfer value from GPR to temporary vector register. */ \
        pCodeBuf[off++] = Armv8A64MkVecInstrUShll(idxSimdRegDst, IEMNATIVE_SIMD_REG_FIXED_TMP0, 0, (a_ArmElemSz), (a_fArmUnsigned)); \
        iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst); \
        iemNativeVarRegisterRelease(pReNative, idxVarSrc); \
        IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off); \
        return off; \
    } \
    typedef int ignore_semicolon
#else
# error "Port me"
#endif
2949
/* Zero-extending variants (USHLL on ARM64; 0f 38 30/33/35 on AMD64). */
IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovzxbw, true,  kArmv8InstrShiftSz_U8,  0x30);
IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovzxwd, true,  kArmv8InstrShiftSz_U16, 0x33);
IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovzxdq, true,  kArmv8InstrShiftSz_U32, 0x35);

/* Sign-extending variants (SSHLL on ARM64; 0f 38 20/23/25 on AMD64). */
IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovsxbw, false, kArmv8InstrShiftSz_U8,  0x20);
IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovsxwd, false, kArmv8InstrShiftSz_U16, 0x23);
IEMNATIVE_NATIVE_EMIT_PMOV_S_Z_U128(pmovsxdq, false, kArmv8InstrShiftSz_U32, 0x25);
2957
2958
/**
 * Updates the MXCSR exception flags, raising any unmasked exceptions.
 *
 * Reads the host FP status (STMXCSR on AMD64, FPSR on ARM64) into a scratch
 * register, translates it into the guest MXCSR exception-flag layout, merges
 * the flags into the guest MXCSR, and exits the TB with
 * kIemNativeLabelType_RaiseSseAvxFpRelated when any set flag is unmasked.
 * Only when no exception is to be raised is the result register copied into
 * the destination guest SIMD register.
 *
 * @returns New code buffer offset.
 * @param   pReNative           The native recompiler state.
 * @param   off                 Current offset into the code buffer.
 * @param   idxInstr            The current instruction index in the TB (used
 *                              for instruction counting before a possible exit).
 * @param   idxSimdGstRegDst    Guest SIMD register receiving the result.
 * @param   idxSimdRegRes       Host SIMD register holding the computed result.
 */
DECL_INLINE_THROW(uint32_t)
iemNativeEmitMxcsrUpdate(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, uint8_t const idxSimdGstRegDst, uint8_t const idxSimdRegRes)
{
    uint8_t const idxRegMxCsr          = iemNativeRegAllocTmpForGuestReg(pReNative, &off, kIemNativeGstReg_MxCsr, kIemNativeGstRegUse_ForUpdate);
    uint8_t const idxRegMxCsrXcptFlags = iemNativeRegAllocTmp(pReNative, &off);
    uint8_t const idxRegTmp            = iemNativeRegAllocTmp(pReNative, &off);

#ifdef RT_ARCH_AMD64
    PIEMNATIVEINSTR pbCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);

    /* stmxcsr [pVCpu + iem.s.uRegMxcsrTmp]  (0f ae /3 with 32-bit displacement). */
    if (IEMNATIVE_REG_FIXED_PVMCPU >= 8)
        pbCodeBuf[off++] = X86_OP_REX_B;
    pbCodeBuf[off++] = 0x0f;
    pbCodeBuf[off++] = 0xae;
    pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_MEM4, 3, IEMNATIVE_REG_FIXED_PVMCPU & 7);
    pbCodeBuf[off++] = RT_BYTE1(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
    pbCodeBuf[off++] = RT_BYTE2(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
    pbCodeBuf[off++] = RT_BYTE3(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
    pbCodeBuf[off++] = RT_BYTE4(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));

    /* Load MXCSR, mask everything except status flags and or into guest MXCSR. */
    off = iemNativeEmitLoadGprFromVCpuU32(pReNative, off, idxRegTmp, RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));

    /* Store the flags in the MXCSR xcpt flags register. */
    off = iemNativeEmitLoadGprFromGpr32(pReNative, off, idxRegMxCsrXcptFlags, idxRegTmp);
    off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegMxCsrXcptFlags, X86_MXCSR_XCPT_FLAGS);

    /* Clear the status flags in the temporary copy and write it back to MXCSR. */
    off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegTmp, ~X86_MXCSR_XCPT_FLAGS);
    off = iemNativeEmitStoreGprToVCpuU32(pReNative, off, idxRegTmp, RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));

    pbCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 8);

    /* ldmxcsr [pVCpu + iem.s.uRegMxcsrTmp]  (0f ae /2) - re-arm the host MXCSR with cleared flags. */
    if (IEMNATIVE_REG_FIXED_PVMCPU >= 8)
        pbCodeBuf[off++] = X86_OP_REX_B;
    pbCodeBuf[off++] = 0x0f;
    pbCodeBuf[off++] = 0xae;
    pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_MEM4, 2, IEMNATIVE_REG_FIXED_PVMCPU & 7);
    pbCodeBuf[off++] = RT_BYTE1(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
    pbCodeBuf[off++] = RT_BYTE2(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
    pbCodeBuf[off++] = RT_BYTE3(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));
    pbCodeBuf[off++] = RT_BYTE4(RT_UOFFSETOF(VMCPU, iem.s.uRegMxcsrTmp));

#elif defined(RT_ARCH_ARM64)
    PIEMNATIVEINSTR pu32CodeBuf = iemNativeInstrBufEnsure(pReNative, off, 7);
    pu32CodeBuf[off++] = Armv8A64MkInstrMrs(idxRegMxCsrXcptFlags, ARMV8_AARCH64_SYSREG_FPSR);
    pu32CodeBuf[off++] = Armv8A64MkInstrMsr(ARMV8_A64_REG_XZR, ARMV8_AARCH64_SYSREG_FPSR); /* Clear FPSR for next instruction. */
    pu32CodeBuf[off++] = Armv8A64MkInstrUxtb(idxRegMxCsrXcptFlags, idxRegMxCsrXcptFlags); /* Ensure there are only the exception flags set (clears QC, and any possible NZCV flags). */

    /*
     * The exception flags layout differs between MXCSR and FPSR of course:
     *
     * Bit    FPSR   MXCSR
     *   0    IOC ------> IE
     *
     *   1    DZC ----  DE <-+
     *            \          |
     *   2    OFC ---  -> ZE |
     *            \          |
     *   3    UFC --  --> OE |
     *            \          |
     *   4    IXC -  ---> UE |
     *            \          |
     *                  ---> PE |
     *   5                      |
     *   6                      |
     *   7    IDC --------------+
     */
    pu32CodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegMxCsrXcptFlags, 1); /* Shift the block of flags starting at DZC to the least significant bits. */
    pu32CodeBuf[off++] = Armv8A64MkInstrBfi(idxRegMxCsrXcptFlags, idxRegTmp, 2, 4); /* Insert DZC, OFC, UFC and IXC into the MXCSR positions. */
    pu32CodeBuf[off++] = Armv8A64MkInstrLsrImm(idxRegTmp, idxRegMxCsrXcptFlags, 6); /* Shift IDC (now at 6) into the LSB. */
    pu32CodeBuf[off++] = Armv8A64MkInstrBfi(idxRegMxCsrXcptFlags, idxRegTmp, 1, 1); /* Insert IDC into the MXCSR positions. */
#else
# error "Port me"
#endif

    /*
     * If PE is set together with OE/UE and neither are masked
     * PE needs to be cleared, because on real hardware
     * an exception is generated with only OE/UE being set,
     * but because we mask all exceptions PE will get set as well.
     */
    /** @todo On ARM we can combine the load+and into one and instruction. */
    /** @todo r=aeichner Can this be done more optimal? */
    uint8_t const idxRegTmp2 = iemNativeRegAllocTmp(pReNative, &off);
    off = iemNativeEmitLoadGprFromGpr32(pReNative, off, idxRegTmp, idxRegMxCsrXcptFlags);
    off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegTmp, X86_MXCSR_OE | X86_MXCSR_UE);
    off = iemNativeEmitLoadGprFromGpr32(pReNative, off, idxRegTmp2, idxRegMxCsr);
    off = iemNativeEmitAndGpr32ByImm(pReNative, off, idxRegTmp2, X86_MXCSR_OM | X86_MXCSR_UM);
    off = iemNativeEmitShiftGprRight(pReNative, off, idxRegTmp2, X86_MXCSR_XCPT_MASK_SHIFT);
    off = iemNativeEmitInvBitsGpr(pReNative, off, idxRegTmp2, idxRegTmp2, false /*f64Bit*/);
    off = iemNativeEmitAndGpr32ByGpr32(pReNative, off, idxRegTmp2, idxRegTmp);
    off = iemNativeEmitTestAnyBitsInGpr(pReNative, off, idxRegTmp2, X86_MXCSR_OE | X86_MXCSR_UE);

    /* Skip the PE clearing when neither OE nor UE is both set and unmasked (jump fixed up below). */
    uint32_t offFixup = off;
    off = iemNativeEmitJzToFixed(pReNative, off, off);
    off = iemNativeEmitBitClearInGpr32(pReNative, off, idxRegMxCsrXcptFlags, X86_MXCSR_PE_BIT);
    iemNativeFixupFixedJump(pReNative, offFixup, off);
    iemNativeRegFreeTmp(pReNative, idxRegTmp2);


    /* Set the MXCSR flags now. */
    off = iemNativeEmitOrGpr32ByGpr(pReNative, off, idxRegMxCsr, idxRegMxCsrXcptFlags);

    /*
     * Make sure we don't have any outstanding guest register writes as we may
     * raise an \#UD or \#XF and all guest register must be up to date in CPUMCTX.
     */
    off = iemNativeRegFlushPendingWrites(pReNative, off);

#ifdef IEMNATIVE_WITH_INSTRUCTION_COUNTING
    off = iemNativeEmitStoreImmToVCpuU8(pReNative, off, idxInstr, RT_UOFFSETOF(VMCPUCC, iem.s.idxTbCurInstr));
#else
    RT_NOREF(idxInstr);
#endif

    /* Check whether an exception is pending and only update the guest SIMD register if it isn't. */
    /* mov tmp, varmxcsr */
    off = iemNativeEmitLoadGprFromGpr32(pReNative, off, idxRegTmp, idxRegMxCsr);
    /* tmp >>= X86_MXCSR_XCPT_MASK_SHIFT */
    off = iemNativeEmitShiftGprRight(pReNative, off, idxRegTmp, X86_MXCSR_XCPT_MASK_SHIFT);
    /* tmp = ~tmp */
    off = iemNativeEmitInvBitsGpr(pReNative, off, idxRegTmp, idxRegTmp, false /*f64Bit*/);
    /* tmp &= mxcsr */
    off = iemNativeEmitAndGpr32ByGpr32(pReNative, off, idxRegMxCsrXcptFlags, idxRegTmp);
    /* Exit the TB to raise the SSE/AVX FP exception when any unmasked flag is set. */
    off = iemNativeEmitTbExitIfAnyBitsSetInGpr<kIemNativeLabelType_RaiseSseAvxFpRelated>(pReNative, off, idxRegMxCsrXcptFlags,
                                                                                         X86_MXCSR_XCPT_FLAGS);

    uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
                                                                          kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ForFullWrite);

    /* Move result to guest SIMD register (at this point there is no exception being raised). */
    off = iemNativeEmitSimdLoadVecRegFromVecRegU128(pReNative, off, idxSimdRegDst, idxSimdRegRes);

    IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
    iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
    iemNativeRegFreeTmp(pReNative, idxRegTmp);
    iemNativeRegFreeTmp(pReNative, idxRegMxCsrXcptFlags);
    iemNativeRegFreeTmp(pReNative, idxRegMxCsr);
    return off;
}
3104
3105
/**
 * Common emitter for packed floating point instructions with 3 operands - register, register variant.
 *
 * Computes the result into IEMNATIVE_SIMD_REG_FIXED_TMP0 (both guest operands
 * are allocated read-only) and then defers to iemNativeEmitMxcsrUpdate() which
 * checks for unmasked exceptions and commits the result to the destination
 * guest register.
 *
 * @returns New code buffer offset.
 * @param   pReNative           The native recompiler state.
 * @param   off                 Current offset into the code buffer.
 * @param   idxInstr            The current instruction index in the TB.
 * @param   idxSimdGstRegDst    Guest SIMD register; first source and destination.
 * @param   idxSimdGstRegSrc    Guest SIMD register; second source.
 * @param   bPrefixX86          AMD64: opcode prefix byte, 0 for none.
 * @param   bOpcX86             AMD64: the 0x0f map opcode byte.
 * @param   enmFpOp             ARM64: the vector FP operation.
 * @param   enmFpSz             ARM64: the vector FP element size/count.
 */
DECL_INLINE_THROW(uint32_t) iemNativeEmitSimdFp3OpCommon_rr_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr,
                                                                 uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc,
#ifdef RT_ARCH_AMD64
                                                                 uint8_t const bPrefixX86, uint8_t const bOpcX86
#elif defined(RT_ARCH_ARM64)
                                                                 ARMV8INSTRVECFPOP const enmFpOp, ARMV8INSTRVECFPSZ const enmFpSz
#endif
                                                                 )
{
    uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
                                                                          kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly);
    uint8_t const idxSimdRegSrc = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegSrc),
                                                                          kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly);

#ifdef RT_ARCH_AMD64
    /* x86 has two-operand ops: copy dst into the temp and operate on the temp. */
    off = iemNativeEmitSimdLoadVecRegFromVecRegU128(pReNative, off, IEMNATIVE_SIMD_REG_FIXED_TMP0, idxSimdRegDst);
    PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
    if (bPrefixX86 != 0)
        pCodeBuf[off++] = bPrefixX86;
    if (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 || idxSimdRegSrc >= 8)
        pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
                        | (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 ? X86_OP_REX_R : 0);
    pCodeBuf[off++] = 0x0f;
    pCodeBuf[off++] = bOpcX86;
    pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, IEMNATIVE_SIMD_REG_FIXED_TMP0 & 7, idxSimdRegSrc & 7);
#elif defined(RT_ARCH_ARM64)
    /* ARM64 is three-operand: result goes straight into the temp register. */
    PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
    pCodeBuf[off++] = Armv8A64MkVecInstrFp3Op(enmFpOp, enmFpSz, IEMNATIVE_SIMD_REG_FIXED_TMP0, idxSimdRegDst, idxSimdRegSrc);
#else
# error "Port me"
#endif
    iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
    iemNativeSimdRegFreeTmp(pReNative, idxSimdRegSrc);
    IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
    return iemNativeEmitMxcsrUpdate(pReNative, off, idxInstr, idxSimdGstRegDst, IEMNATIVE_SIMD_REG_FIXED_TMP0);
}
3145
3146
/**
 * Common emitter for packed floating point instructions with 3 operands - register, local variable variant.
 *
 * Like iemNativeEmitSimdFp3OpCommon_rr_u128, but the second source operand is
 * a 128-bit recompiler variable.  The result is computed into
 * IEMNATIVE_SIMD_REG_FIXED_TMP0 and committed by iemNativeEmitMxcsrUpdate().
 *
 * @returns New code buffer offset.
 * @param   pReNative           The native recompiler state.
 * @param   off                 Current offset into the code buffer.
 * @param   idxInstr            The current instruction index in the TB.
 * @param   idxSimdGstRegDst    Guest SIMD register; first source and destination.
 * @param   idxVarSrc           Recompiler variable holding the second source.
 * @param   bPrefixX86          AMD64: opcode prefix byte, 0 for none.
 * @param   bOpcX86             AMD64: the 0x0f map opcode byte.
 * @param   enmFpOp             ARM64: the vector FP operation.
 * @param   enmFpSz             ARM64: the vector FP element size/count.
 */
DECL_INLINE_THROW(uint32_t) iemNativeEmitSimdFp3OpCommon_rv_u128(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr,
                                                                 uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc,
#ifdef RT_ARCH_AMD64
                                                                 uint8_t const bPrefixX86, uint8_t const bOpcX86
#elif defined(RT_ARCH_ARM64)
                                                                 ARMV8INSTRVECFPOP const enmFpOp, ARMV8INSTRVECFPSZ const enmFpSz
#endif
                                                                 )
{
    uint8_t const idxSimdRegDst = iemNativeSimdRegAllocTmpForGuestSimdReg(pReNative, &off, IEMNATIVEGSTSIMDREG_SIMD(idxSimdGstRegDst),
                                                                          kIemNativeGstSimdRegLdStSz_Low128, kIemNativeGstRegUse_ReadOnly);
    uint8_t const idxSimdRegSrc = iemNativeVarSimdRegisterAcquire(pReNative, idxVarSrc, &off, true /*fInitialized*/);

#ifdef RT_ARCH_AMD64
    /* x86 has two-operand ops: copy dst into the temp and operate on the temp. */
    off = iemNativeEmitSimdLoadVecRegFromVecRegU128(pReNative, off, IEMNATIVE_SIMD_REG_FIXED_TMP0, idxSimdRegDst);
    PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 5);
    if (bPrefixX86 != 0)
        pCodeBuf[off++] = bPrefixX86;
    if (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 || idxSimdRegSrc >= 8)
        pCodeBuf[off++] = (idxSimdRegSrc >= 8 ? X86_OP_REX_B : 0)
                        | (IEMNATIVE_SIMD_REG_FIXED_TMP0 >= 8 ? X86_OP_REX_R : 0);
    pCodeBuf[off++] = 0x0f;
    pCodeBuf[off++] = bOpcX86;
    pCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, IEMNATIVE_SIMD_REG_FIXED_TMP0 & 7, idxSimdRegSrc & 7);
#elif defined(RT_ARCH_ARM64)
    /* ARM64 is three-operand: result goes straight into the temp register. */
    PIEMNATIVEINSTR const pCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
    pCodeBuf[off++] = Armv8A64MkVecInstrFp3Op(enmFpOp, enmFpSz, IEMNATIVE_SIMD_REG_FIXED_TMP0, idxSimdRegDst, idxSimdRegSrc);
#else
# error "Port me"
#endif
    iemNativeVarRegisterRelease(pReNative, idxVarSrc);
    iemNativeSimdRegFreeTmp(pReNative, idxSimdRegDst);
    IEMNATIVE_ASSERT_INSTR_BUF_ENSURE(pReNative, off);
    return iemNativeEmitMxcsrUpdate(pReNative, off, idxInstr, idxSimdGstRegDst, IEMNATIVE_SIMD_REG_FIXED_TMP0);
}
3185
3186
/**
 * Common emitter for packed floating point instructions with 3 operands.
 *
 * Expands to thin _rr_/_rv_ wrappers forwarding to the shared
 * iemNativeEmitSimdFp3OpCommon_rr_u128 / _rv_u128 workers with the
 * architecture-appropriate encoding parameters.
 *
 * @param   a_Instr      Instruction mnemonic (function name infix).
 * @param   a_enmArmOp   ARM64: the vector FP operation enum value.
 * @param   a_ArmElemSz  ARM64: the vector FP element size/count enum value.
 * @param   a_bPrefixX86 AMD64: opcode prefix byte, 0 for none.
 * @param   a_bOpcX86    AMD64: the 0x0f map opcode byte.
 */
#ifdef RT_ARCH_AMD64
# define IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(a_Instr, a_enmArmOp, a_ArmElemSz, a_bPrefixX86, a_bOpcX86) \
    DECL_FORCE_INLINE_THROW(uint32_t) \
    RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, \
                                                uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
    { \
        return iemNativeEmitSimdFp3OpCommon_rr_u128(pReNative, off, idxInstr, idxSimdGstRegDst, idxSimdGstRegSrc, \
                                                    a_bPrefixX86, a_bOpcX86); \
    } \
    DECL_FORCE_INLINE_THROW(uint32_t) \
    RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, \
                                                uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
    { \
        return iemNativeEmitSimdFp3OpCommon_rv_u128(pReNative, off, idxInstr, idxSimdGstRegDst, idxVarSrc, \
                                                    a_bPrefixX86, a_bOpcX86); \
    } \
    typedef int ignore_semicolon
#elif defined(RT_ARCH_ARM64)
# define IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(a_Instr, a_enmArmOp, a_ArmElemSz, a_bPrefixX86, a_bOpcX86) \
    DECL_FORCE_INLINE_THROW(uint32_t) \
    RT_CONCAT3(iemNativeEmit_,a_Instr,_rr_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, \
                                                uint8_t const idxSimdGstRegDst, uint8_t const idxSimdGstRegSrc) \
    { \
        return iemNativeEmitSimdFp3OpCommon_rr_u128(pReNative, off, idxInstr, idxSimdGstRegDst, idxSimdGstRegSrc, \
                                                    a_enmArmOp, a_ArmElemSz); \
    } \
    DECL_FORCE_INLINE_THROW(uint32_t) \
    RT_CONCAT3(iemNativeEmit_,a_Instr,_rv_u128)(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t const idxInstr, \
                                                uint8_t const idxSimdGstRegDst, uint8_t const idxVarSrc) \
    { \
        return iemNativeEmitSimdFp3OpCommon_rv_u128(pReNative, off, idxInstr, idxSimdGstRegDst, idxVarSrc, \
                                                    a_enmArmOp, a_ArmElemSz); \
    } \
    typedef int ignore_semicolon
#else
# error "Port me"
#endif
3227
3228
/* Packed single/double FP 3-operand emitters (packed-double forms carry the 0x66 prefix). */
IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(mulps, kArmv8VecInstrFpOp_Mul, kArmv8VecInstrFpSz_4x_Single, 0,                  0x59);
IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(addps, kArmv8VecInstrFpOp_Add, kArmv8VecInstrFpSz_4x_Single, 0,                  0x58);
IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(addpd, kArmv8VecInstrFpOp_Add, kArmv8VecInstrFpSz_2x_Double, X86_OP_PRF_SIZE_OP, 0x58);
IEMNATIVE_NATIVE_EMIT_FP_3OP_U128(subps, kArmv8VecInstrFpOp_Sub, kArmv8VecInstrFpSz_4x_Single, 0,                  0x5c);
3233
3234#endif /* IEMNATIVE_WITH_SIMD_REG_ALLOCATOR */
3235
3236#endif /* !VMM_INCLUDED_SRC_VMMAll_target_x86_IEMAllN8veEmit_x86_h */
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette