
source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl-arm64.S@104195

Last change on this file since 104195 was 104195, checked in by vboxsync, 8 months ago

VMM/IEM: Refactoring assembly helpers to not pass eflags by reference but instead by value and return the updated value (via eax/w0) - first chunk: ADD,ADC,SUB,SBB,CMP,TEST,AND,OR,XOR. bugref:10376

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 24.4 KB
/* $Id: IEMAllAImpl-arm64.S 104195 2024-04-05 14:45:23Z vboxsync $ */
/** @file
 * IEM - Instruction Implementation in Assembly, ARM64 variant.
 */

/*
 * Copyright (C) 2023 Oracle and/or its affiliates.
 *
 * This file is part of VirtualBox base platform packages, as
 * available from https://www.virtualbox.org.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, in version 3 of the
 * License.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <https://www.gnu.org/licenses>.
 *
 * SPDX-License-Identifier: GPL-3.0-only
 */


/*********************************************************************************************************************************
*   Header Files                                                                                                                 *
*********************************************************************************************************************************/
#include <iprt/asmdefs-arm.h>
#include <iprt/x86.h>

#if RT_CLANG_PREREQ(15, 0)
        .arch_extension flagm   /* not necessary */
#else
        /* clang 12.0.x defaults to apple-a12.  The M1 is closer to the A14, I guess.
           For some reason adding +crc makes cfinv work (with clang 12); 'flagm'
           isn't recognized, nor is the 'fmi' mentioned in the error message for
           cfinv.  'flagm' works with v15 and seems to be enabled by default. */
        .cpu apple-a14+crc
#endif


.macro BEGINPROC, a_Name
        .private_extern NAME(\a_Name)
        .globl          NAME(\a_Name)
NAME(\a_Name):
.endm


.macro CALC_EFLAGS_PARITY, regEfl, regResult, regTmp
        /*
         * Parity calculation for low byte of the result (sucks that there is no popcount for gprs).
         */
        eor     \regTmp, \regResult, \regResult, LSR #4
        eor     \regTmp, \regTmp, \regTmp, LSR #2
        eor     \regTmp, \regTmp, \regTmp, LSR #1
        eor     \regTmp, \regTmp, #1
        bfi     \regEfl, \regTmp, #X86_EFL_PF_BIT, #1   /* PF(2) = (popcount(result & 0xff) & 1) ^ 1 */
.endm
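

/* Reference sketch (not from the source, C-like): the XOR fold above is the
   classic parity reduction; PF is set when the low byte of the result has an
   even number of set bits:

       unsigned uTmp = uResult ^ (uResult >> 4);
       uTmp ^= uTmp >> 2;
       uTmp ^= uTmp >> 1;
       unsigned fPF  = (uTmp & 1) ^ 1;      // == !(popcount(uResult & 0xff) & 1)

   Only bits 0..7 of uResult reach bit 0 of uTmp, so no extra masking is needed. */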


.macro CALC_EFLAGS_AUX_CARRY, regEfl, regResult, regLeft, regRight, regTmp
        /*
         * Auxiliary carry / borrow flag.  This is related to 8-bit BCD.
         */
        eor     \regTmp, \regLeft, \regRight
        eor     \regTmp, \regTmp, \regResult
        lsr     \regTmp, \regTmp, #X86_EFL_AF_BIT
        bfi     \regEfl, \regTmp, #X86_EFL_AF_BIT, #1   /* AF(4) = ((left ^ right ^ result) & X86_EFL_AF) >> X86_EFL_AF_BIT */
.endm
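

/* Reference sketch (not from the source, C-like): since at every bit position
   result = left ^ right ^ carry-in, the carry/borrow out of bit 3 can be
   recovered from the operands and the result alone:

       unsigned fAF = ((uLeft ^ uRight ^ uResult) >> X86_EFL_AF_BIT) & 1;

   This works for both addition and subtraction. */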

.macro CALC_EFLAGS, regEfl, regResult, regLeft, regRight, regTmp, fSkipFlags=0
        /*
         * Translate the arm NZCV bits into corresponding EFLAGS bits.
         */
 .if \fSkipFlags == 0 || \fSkipFlags == X86_EFL_OF
#if 0
        /* Maybe just a tiny bit slower than the next one. */
        mrs     \regTmp, NZCV                           /* [31] = N; [30] = Z; [29] = C; [28] = V */
 .ifeq (\fSkipFlags & X86_EFL_OF)
        lsr     \regTmp, \regTmp, #28
        bfi     \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
        lsr     \regTmp, \regTmp, #1
 .else
        lsr     \regTmp, \regTmp, #29
 .endif
        eor     \regTmp, \regTmp, #1                    /* inverts the carry flag to x86 style. */
        bfi     \regEfl, \regTmp, #X86_EFL_CF_BIT, #1   /* CF(0) = C */
        lsr     \regTmp, \regTmp, #1
        bfi     \regEfl, \regTmp, #X86_EFL_ZF_BIT, #2   /* SF(7),ZF(6) = NZ */
#else
        /* This seems to be the faster one... */
        cfinv
        mrs     \regTmp, NZCV                           /* [31] = N; [30] = Z; [29] = C; [28] = V */
 .ifeq (\fSkipFlags & X86_EFL_OF)
        lsr     \regTmp, \regTmp, #28
        bfi     \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
        lsr     \regTmp, \regTmp, #1
 .else
        lsr     \regTmp, \regTmp, #29
 .endif
        bfi     \regEfl, \regTmp, #X86_EFL_CF_BIT, #1   /* CF(0) = C */
        lsr     \regTmp, \regTmp, #1
        bfi     \regEfl, \regTmp, #X86_EFL_ZF_BIT, #2   /* SF(7),ZF(6) = NZ */
#endif
 .else
        /* Definitely slower than the above two, but easier to handle wrt skipping parts. */
 .ifeq (\fSkipFlags & X86_EFL_ZF)
        cset    \regTmp, eq
        bfi     \regEfl, \regTmp, #X86_EFL_ZF_BIT, #1
 .endif
 .ifeq (\fSkipFlags & X86_EFL_CF)
        cset    \regTmp, cc
        bfi     \regEfl, \regTmp, #X86_EFL_CF_BIT, #1
 .endif
 .ifeq (\fSkipFlags & X86_EFL_OF)
        cset    \regTmp, vs
        bfi     \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
 .endif
 .ifeq (\fSkipFlags & X86_EFL_SF)
        cset    \regTmp, mi
        bfi     \regEfl, \regTmp, #X86_EFL_SF_BIT, #1
 .endif
 .endif


        /*
         * Parity calculation for low byte of the result (sucks that there is no popcount for gprs).
         */
        eor     \regTmp, \regResult, \regResult, LSR #4
        eor     \regTmp, \regTmp, \regTmp, LSR #2
        eor     \regTmp, \regTmp, \regTmp, LSR #1
        eor     \regTmp, \regTmp, #1
        bfi     \regEfl, \regTmp, #X86_EFL_PF_BIT, #1   /* PF(2) = (popcount(result & 0xff) & 1) ^ 1 */

        /*
         * Auxiliary carry / borrow flag.  This is related to 8-bit BCD.
         */
        eor     \regTmp, \regLeft, \regRight
        eor     \regTmp, \regTmp, \regResult
        lsr     \regTmp, \regTmp, #X86_EFL_AF_BIT
        bfi     \regEfl, \regTmp, #X86_EFL_AF_BIT, #1   /* AF(4) = ((left ^ right ^ result) & X86_EFL_AF) >> X86_EFL_AF_BIT */

        /* done */
.endm
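

/* Reference sketch (not from the source, C-like): what the fast path above
   does.  ARM keeps NZCV in bits [31:28] of the NZCV system register.  For
   subtraction ARM sets C = NOT borrow while x86 wants CF = borrow, hence the
   cfinv (or the explicit eor with 1) before C is copied into CF:

       uint32_t uNzcv = ReadNZCV();     // hypothetical mrs wrapper; [31]=N [30]=Z [29]=C [28]=V
       fEfl = (fEfl & ~X86_EFL_OF) | ((( uNzcv >> 28) & 1)      << X86_EFL_OF_BIT);
       fEfl = (fEfl & ~X86_EFL_CF) | (((((uNzcv >> 29) & 1)) ^ 1) << X86_EFL_CF_BIT);
       fEfl = (fEfl & ~X86_EFL_ZF) | ((( uNzcv >> 30) & 1)      << X86_EFL_ZF_BIT);
       fEfl = (fEfl & ~X86_EFL_SF) | ((( uNzcv >> 31) & 1)      << X86_EFL_SF_BIT);
*/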


BEGINCODE
        .p2align        2
        .private_extern NAME(iemAImpl_placeholder)
        .globl          NAME(iemAImpl_placeholder)
NAME(iemAImpl_placeholder):
        brk     #1
        ret

/* Some sketches.

// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked, (uint8_t *pu8Mem, uint8_t *pu8Reg));
        .p2align        2
        .private_extern NAME(iemAImpl_xchg_u8_locked)
        .globl          NAME(iemAImpl_xchg_u8_locked)
NAME(iemAImpl_xchg_u8_locked):
        ldrb    w2, [x1]
        swpalb  w2, w2, [x0]
        strb    w2, [x1]
        ret

// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_locked,(uint16_t *pu16Mem, uint16_t *pu16Reg));
        .p2align        2
        .private_extern NAME(iemAImpl_xchg_u16_locked)
        .globl          NAME(iemAImpl_xchg_u16_locked)
NAME(iemAImpl_xchg_u16_locked):
        ldrh    w2, [x1]
        swpalh  w2, w2, [x0]
        strh    w2, [x1]
        ret

// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_locked,(uint32_t *pu32Mem, uint32_t *pu32Reg));
// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_locked,(uint64_t *pu64Mem, uint64_t *pu64Reg));

*/
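
/* Illustrative note (not from the source): the xchg sketches above use the
   ARMv8.1-A LSE atomics; swpalb/swpalh perform an atomic swap with acquire
   and release semantics.  Hypothetical u32/u64 variants following the same
   pattern might look like this (an assumption, not VBox code):

        ldr     w2, [x1]
        swpal   w2, w2, [x0]        // atomic 32-bit swap, acquire+release
        str     w2, [x1]
        ret

        ldr     x2, [x1]
        swpal   x2, x2, [x0]        // 64-bit form, selected by the Xn registers
        str     x2, [x1]
        ret
*/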


/* IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked, (uint8_t *pu8Mem, uint8_t *pu8Reg)); */

/*
 * The SUB instruction.
 */

/* uint32_t iemAImpl_sub_u8(uint32_t fEFlags, uint8_t *puDst, uint8_t uSrc); */
        .p2align        2
        .private_extern NAME(iemAImpl_sub_u8)
        .globl          NAME(iemAImpl_sub_u8)
NAME(iemAImpl_sub_u8):
        .cfi_startproc
        /* Do the subtraction. */
        ldrb    w8, [x1]
        /*and     w2, w2, #0xff - should not be necessary. */
        subs    w9, w8, w2                              /* w9 = w8 (*puDst) - w2 (uSrc) */
        strb    w9, [x1]
        setf8   w9

        /* Calculate EFLAGS (passed in and returned via x0). */
        and     w9, w9, #0xff
        CALC_EFLAGS x0, x9, x8, x2, x11, X86_EFL_OF

        /* The overflow flag calc done by setf8 isn't correct for subtraction, so we have to
           figure it out ourselves. (See IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC for details.) */
        eor     w11, w8, w2                             /* input dst ^ source (simplified from ~(dst ^ (source ^ 0x80)) ). */
        eor     w12, w8, w9
        and     w11, w12, w11
        lsr     w11, w11, #7
        bfi     w0, w11, #X86_EFL_OF_BIT, #1

        ret
        .cfi_endproc
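

/* Reference sketch (not from the source, C-like): the signed-overflow rule
   used above.  For uResult = uDst - uSrc, overflow occurs exactly when the
   operands have different signs and the result's sign differs from uDst:

       // 8-bit version; use bit 15/31/63 for the wider variants.
       unsigned fOF = (((uDst ^ uSrc) & (uDst ^ uResult)) >> 7) & 1;
*/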


/* uint32_t iemAImpl_sub_u16(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc); */
        .p2align        2
        .private_extern NAME(iemAImpl_sub_u16)
        .globl          NAME(iemAImpl_sub_u16)
NAME(iemAImpl_sub_u16):
        .cfi_startproc
        /* Do the subtraction. */
        ldrh    w8, [x1]
        /*and     w2, w2, #0xffff - should not be necessary. */
        subs    w9, w8, w2                              /* w9 = w8 (*puDst) - w2 (uSrc) */
        setf16  w9
        strh    w9, [x1]

        /* Calculate EFLAGS (passed in and returned via x0). */
        and     w9, w9, #0xffff
        CALC_EFLAGS x0, x9, x8, x2, x11, X86_EFL_OF

        /* The overflow flag calc done by setf16 isn't correct for subtraction, so we have to
           figure it out ourselves. (See IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC for details.) */
        eor     w11, w8, w2                             /* input dst ^ source (simplified from ~(dst ^ (source ^ 0x8000)) ). */
        eor     w12, w8, w9
        and     w11, w12, w11
        lsr     w11, w11, #15
        bfi     w0, w11, #X86_EFL_OF_BIT, #1

        ret
        .cfi_endproc


/* uint32_t iemAImpl_sub_u32(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc); */
        .p2align        2
        .private_extern NAME(iemAImpl_sub_u32)
        .globl          NAME(iemAImpl_sub_u32)
NAME(iemAImpl_sub_u32):
        .cfi_startproc
        /* Do the subtraction. */
        ldr     w8, [x1]
        subs    w9, w8, w2                              /* w9 = w8 (*puDst) - w2 (uSrc) */
        str     w9, [x1]

        /* Calculate EFLAGS (passed in and returned via x0). */

#if 0
        /* Translate the arm NZCV bits into corresponding EFLAGS bits. */
#if 0 /* maybe just a tiny bit slower than the next one. */
        mrs     x11, NZCV                               /* w11[31] = N; w11[30] = Z; w11[29] = C; w11[28] = V */
        lsr     w11, w11, #28
        bfi     w0, w11, #X86_EFL_OF_BIT, #1
        lsr     w11, w11, #1
        eor     w11, w11, #1                            /* inverts the carry flag to x86 style. */
        bfi     w0, w11, #X86_EFL_CF_BIT, #1            /* CF(0) = C */
        lsr     w11, w11, #1
        bfi     w0, w11, #X86_EFL_ZF_BIT, #2            /* SF(7),ZF(6) = NZ */
#elif 1 /* seems to be the faster one... */
        cfinv
        mrs     x11, NZCV                               /* w11[31] = N; w11[30] = Z; w11[29] = C; w11[28] = V */
        lsr     w11, w11, #28
        bfi     w0, w11, #X86_EFL_OF_BIT, #1
        lsr     w11, w11, #1
        bfi     w0, w11, #X86_EFL_CF_BIT, #1            /* CF(0) = C */
        lsr     w11, w11, #1
        bfi     w0, w11, #X86_EFL_ZF_BIT, #2            /* SF(7),ZF(6) = NZ */
#else
        cset    w11, eq
        bfi     w0, w11, #X86_EFL_ZF_BIT, #1
        cset    w11, cc
        bfi     w0, w11, #X86_EFL_CF_BIT, #1
        cset    w11, vs
        bfi     w0, w11, #X86_EFL_OF_BIT, #1
        cset    w11, mi
        bfi     w0, w11, #X86_EFL_SF_BIT, #1
#endif

        /* Parity calculation for low byte of the result (sucks that there is no popcount for gprs). */
        eor     w11, w9, w9, LSR #4
        eor     w11, w11, w11, LSR #2
        eor     w11, w11, w11, LSR #1
        eor     w11, w11, #1
        bfi     w0, w11, #X86_EFL_PF_BIT, #1            /* PF(2) = (popcount(w9 & 0xff) & 1) ^ 1 */

        /* Auxiliary carry / borrow flag. This is related to 8-bit BCD. */
        eor     w11, w8, w2
        eor     w11, w11, w9
        lsr     w11, w11, #X86_EFL_AF_BIT
        bfi     w0, w11, #X86_EFL_AF_BIT, #1            /* AF(4) = ((w8 ^ w2 ^ w9) & X86_EFL_AF) >> X86_EFL_AF_BIT */
#else
        CALC_EFLAGS x0, x9, x8, x2, x11
#endif

        ret
        .cfi_endproc


/* uint32_t iemAImpl_sub_u64(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc); */
        .p2align        2
        .private_extern NAME(iemAImpl_sub_u64)
        .globl          NAME(iemAImpl_sub_u64)
NAME(iemAImpl_sub_u64):
        .cfi_startproc
        /* Do the subtraction. */
        ldr     x8, [x1]
        subs    x9, x8, x2                              /* x9 = x8 (*puDst) - x2 (uSrc) */
        str     x9, [x1]

        /* Calculate EFLAGS (passed in and returned via x0). */
        CALC_EFLAGS x0, x9, x8, x2, x11

        ret
        .cfi_endproc


/*
 * Shift Left.
 */

/* void iemAImpl_shl_u8(uint8_t *pu8Dst, uint8_t cShift, uint32_t *pEFlags); */
/* void iemAImpl_shl_u16(uint16_t *pu16Dst, uint8_t cShift, uint32_t *pEFlags); */
/* void iemAImpl_shl_u32(uint32_t *pu32Dst, uint8_t cShift, uint32_t *pEFlags); */
.macro SHL_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdStSuff
        .p2align        2
BEGINPROC \a_Name
        .cfi_startproc

        /* Do we need to shift anything at all? */
        and     w1, w1, #0x1f
        cbz     w1, 99f

        /*
         * Do the shifting
         */
        ldr\a_LdStSuff  w8, [x0]
.ifne \a_cBits < 32
        lslv    w9, w8, w1
.else
        lslv    x9, x8, x1                              /* use 64-bit registers here so we get CF for free. We know x1 != 0. */
.endif
        str\a_LdStSuff  w9, [x0]

        /*
         * Calculate EFLAGS.
         */
        ldr     w10, [x2]                               /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */

        CALC_EFLAGS_PARITY w10, w9, w12

.ifne \a_cBits < 32
        setf\a_cBits w9                                 /* Sets NZ */
.else
        ands    wzr, w9, w9                             /* Sets NZ */
.endif
#if 1
        mrs     x11, NZCV
        lsr     w11, w11, #30                           /* N=1; Z=0 */
        bfi     w10, w11, X86_EFL_ZF_BIT, 2             /* EFLAGS.ZF and EFLAGS.SF */
#else
        cset    x11, eq
        bfi     w10, w11, X86_EFL_ZF_BIT, 1
        cset    x12, pl
        bfi     w10, w12, X86_EFL_SF_BIT, 1
#endif

.ifne \a_cBits < 32
        bfxil   w10, w9, #\a_cBits, #1                  /* w9 bit 8/16 contains the carry. (X86_EFL_CF_BIT == 0) */
.else
        bfxil   x10, x9, #\a_cBits, #1                  /* x9 bit 32 contains the carry. (X86_EFL_CF_BIT == 0) */
.endif

.ifne \a_fIntelFlags
        /* Intel: OF = first bit shifted: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
        eor     w11, w8, w8, LSL #1
        lsr     w11, w11, #(\a_cBits - 1)
        bfi     w10, w11, #X86_EFL_OF_BIT, #1

        and     w10, w10, ~X86_EFL_AF                   /* AF is cleared */
.else
        /* AMD: OF = last bit shifted: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
 .ifne \a_cBits < 32
        eor     w11, w9, w9, LSR #1
        lsr     w11, w11, #(\a_cBits - 1)
 .else
        eor     x11, x9, x9, LSR #1
        lsr     x11, x11, #(\a_cBits - 1)
 .endif
        bfi     w10, w11, #X86_EFL_OF_BIT, #1

        orr     w10, w10, X86_EFL_AF                    /* AF is set */
.endif

        str     w10, [x2]
99:
        ret
        .cfi_endproc
.endm
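

/* Reference sketch (not from the source, C-like; names are illustrative): the
   two OF conventions implemented by the macro above, for an N-bit SHL with
   1 <= cShift <= N:

       unsigned fCarry   = (uDst >> (N - cShift)) & 1;             // last bit shifted out -> CF
       // Intel: only the first shift step counts: OF = MSB(uDst ^ (uDst << 1)).
       unsigned fOFIntel = ((uDst ^ (uDst << 1)) >> (N - 1)) & 1;
       // AMD: only the last shift step counts: OF = MSB(uResult) ^ CF.
       unsigned fOFAmd   = ((uResult >> (N - 1)) & 1) ^ fCarry;
*/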

SHL_8_16_32 iemAImpl_shl_u8,        8, 1, b
SHL_8_16_32 iemAImpl_shl_u8_intel,  8, 1, b
SHL_8_16_32 iemAImpl_shl_u8_amd,    8, 0, b

SHL_8_16_32 iemAImpl_shl_u16,       16, 1, h
SHL_8_16_32 iemAImpl_shl_u16_intel, 16, 1, h
SHL_8_16_32 iemAImpl_shl_u16_amd,   16, 0, h

SHL_8_16_32 iemAImpl_shl_u32,       32, 1,
SHL_8_16_32 iemAImpl_shl_u32_intel, 32, 1,
SHL_8_16_32 iemAImpl_shl_u32_amd,   32, 0,

/** @todo this is slightly slower than the C version (release) on an M2. Investigate why. */
/* void iemAImpl_shl_u64(uint64_t *pu64Dst, uint8_t cShift, uint32_t *pEFlags); */
.macro SHL_64, a_Name, a_fIntelFlags
        .p2align        2
BEGINPROC \a_Name
        .cfi_startproc

        /* Do we need to shift anything at all? */
        and     w1, w1, #0x3f
        cbz     w1, 99f

        /*
         * Do the shifting
         */
        ldr     x8, [x0]
        lslv    x9, x8, x1
        str     x9, [x0]

        /*
         * Calculate EFLAGS.
         */
        ldr     w10, [x2]                               /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */

        CALC_EFLAGS_PARITY w10, w9, w11

        ands    xzr, x9, x9                             /* Sets NZ */
        mrs     x11, NZCV
        lsr     w11, w11, #30                           /* N=1; Z=0 */
        bfi     w10, w11, X86_EFL_ZF_BIT, 2             /* EFLAGS.ZF and EFLAGS.SF */

        neg     w11, w1                                 /* the shift count is MODed by the data size, so this is safe. */
        lsrv    x11, x8, x11
        bfi     w10, w11, X86_EFL_CF_BIT, 1

.ifne \a_fIntelFlags
        /* Intel: OF = first bit shifted: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
        eor     x11, x8, x8, LSL #1
        lsr     x11, x11, #63
        bfi     w10, w11, #X86_EFL_OF_BIT, #1

        and     w10, w10, ~X86_EFL_AF                   /* AF is cleared */
.else
        /* AMD: OF = last bit shifted: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
        eor     x11, x11, x9, LSR #63                   /* w11[0]=CF from above */
        bfi     w10, w11, #X86_EFL_OF_BIT, #1

        orr     w10, w10, X86_EFL_AF                    /* AF is set */
.endif
        str     w10, [x2]
99:
        ret
        .cfi_endproc
.endm
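

/* Reference sketch (not from the source): the neg/lsrv pair above recovers CF
   without needing a 65-bit shift.  The last bit shifted out of a 64-bit SHL is

       unsigned fCarry = (uDst >> (64 - cShift)) & 1;

   and since lsrv only uses the low 6 bits of the count, the negated count
   (-cShift) & 0x3f equals 64 - cShift over the masked range 1..63. */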

SHL_64 iemAImpl_shl_u64, 1
SHL_64 iemAImpl_shl_u64_intel, 1
SHL_64 iemAImpl_shl_u64_amd, 0


/*
 * Shift Right, Unsigned.
 */

/* void iemAImpl_shr_u8(uint8_t *pu8Dst, uint8_t cShift, uint32_t *pEFlags); */
/* void iemAImpl_shr_u16(uint16_t *pu16Dst, uint8_t cShift, uint32_t *pEFlags); */
/* void iemAImpl_shr_u32(uint32_t *pu32Dst, uint8_t cShift, uint32_t *pEFlags); */
.macro shr_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdStSuff
        .p2align        2
BEGINPROC \a_Name
        .cfi_startproc

        /* Do we need to shift anything at all? */
        and     w1, w1, #0x1f
        cbz     w1, 99f

        /* Load EFLAGS before we start the calculation. */
        ldr     w10, [x2]                               /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */

        /*
         * Do the shifting.
         */
        ldr\a_LdStSuff  w8, [x0]
        lsrv    w9, w8, w1
        str\a_LdStSuff  w9, [x0]

        /*
         * Calculate EFLAGS.
         */
        sub     w11, w1, #1
        lsrv    w11, w8, w11
        bfxil   w10, w11, #X86_EFL_CF_BIT, #1

.ifne \a_fIntelFlags
        and     w10, w10, ~X86_EFL_AF                   /* AF is cleared */
        /* Intel: OF = one bit shift: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDstIn); */
        lsr     w11, w8, #(\a_cBits - 1)
        bfi     w10, w11, #X86_EFL_OF_BIT, #1
.else
        orr     w10, w10, X86_EFL_AF                    /* AF is set */
        /* AMD: OF = last bits shifted: fEfl |= (uResult >> (cOpBits - 2)) << X86_EFL_OF_BIT; */
        lsr     w11, w9, #(\a_cBits - 2)
        bfi     w10, w11, #X86_EFL_OF_BIT, #1
.endif

        CALC_EFLAGS_PARITY w10, w9, w11

.ifne \a_cBits < 32
        setf\a_cBits w9                                 /* Sets NZ */
.else
        ands    wzr, w9, w9                             /* Sets NZ */
.endif
        mrs     x11, NZCV
        lsr     w11, w11, #30                           /* N=1; Z=0 */
        bfi     w10, w11, X86_EFL_ZF_BIT, 2             /* EFLAGS.ZF and EFLAGS.SF */

        str     w10, [x2]
99:
        ret
        .cfi_endproc
.endm
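

/* Reference sketch (not from the source, C-like; names are illustrative): the
   flag rules the macro above implements for an N-bit SHR:

       unsigned fCarry   = (uDst >> (cShift - 1)) & 1;     // last bit shifted out -> CF
       unsigned fOFIntel = (uDst >> (N - 1)) & 1;          // old MSB (one-bit-shift rule)
       unsigned fOFAmd   = (uResult >> (N - 2)) & 1;       // "last bits shifted" rule
*/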

shr_8_16_32 iemAImpl_shr_u8,        8, 1, b
shr_8_16_32 iemAImpl_shr_u8_intel,  8, 1, b
shr_8_16_32 iemAImpl_shr_u8_amd,    8, 0, b

shr_8_16_32 iemAImpl_shr_u16,       16, 1, h
shr_8_16_32 iemAImpl_shr_u16_intel, 16, 1, h
shr_8_16_32 iemAImpl_shr_u16_amd,   16, 0, h

shr_8_16_32 iemAImpl_shr_u32,       32, 1,
shr_8_16_32 iemAImpl_shr_u32_intel, 32, 1,
shr_8_16_32 iemAImpl_shr_u32_amd,   32, 0,

/** @todo this is slightly slower than the C version (release) on an M2. Investigate why. */
/* void iemAImpl_shr_u64(uint64_t *pu64Dst, uint8_t cShift, uint32_t *pEFlags); */
.macro shr_64, a_Name, a_fIntelFlags
        .p2align        2
BEGINPROC \a_Name
        .cfi_startproc

        /* Do we need to shift anything at all? */
        ands    w1, w1, #0x3f
        b.eq    99f

        /* Load EFLAGS before we start the calculation. */
        ldr     w10, [x2]                               /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */

        /*
         * Do the shifting
         */
        ldr     x8, [x0]
        lsrv    x9, x8, x1
        str     x9, [x0]

        /*
         * Calculate EFLAGS.
         */
        sub     w11, w1, #1
        lsrv    x11, x8, x11
        bfxil   w10, w11, #X86_EFL_CF_BIT, #1

.ifne \a_fIntelFlags
        and     w10, w10, ~X86_EFL_AF                   /* AF is cleared */
        /* Intel: OF = one bit shift: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDstIn); */
        lsr     x11, x8, #63
        bfi     w10, w11, #X86_EFL_OF_BIT, #1
.else
        orr     w10, w10, X86_EFL_AF                    /* AF is set */
        /* AMD: OF = last bits shifted: fEfl |= (uResult >> (cOpBits - 2)) << X86_EFL_OF_BIT; */
        lsr     x11, x9, #62
        bfi     w10, w11, #X86_EFL_OF_BIT, #1
.endif

        CALC_EFLAGS_PARITY w10, w9, w11

        ands    xzr, x9, x9                             /* Sets NZ */
        mrs     x11, NZCV
        lsr     w11, w11, #30                           /* N=1; Z=0 */
        bfi     w10, w11, X86_EFL_ZF_BIT, 2             /* EFLAGS.ZF and EFLAGS.SF */

        str     w10, [x2]
99:
        ret
        .cfi_endproc
.endm

shr_64 iemAImpl_shr_u64, 1
shr_64 iemAImpl_shr_u64_intel, 1
shr_64 iemAImpl_shr_u64_amd, 0


/*
 * Shift Right, Signed.
 */

/* void iemAImpl_sar_u8(uint8_t *pu8Dst, uint8_t cShift, uint32_t *pEFlags); */
/* void iemAImpl_sar_u16(uint16_t *pu16Dst, uint8_t cShift, uint32_t *pEFlags); */
/* void iemAImpl_sar_u32(uint32_t *pu32Dst, uint8_t cShift, uint32_t *pEFlags); */
.macro sar_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdSuff, a_StSuff
        .p2align        2
BEGINPROC \a_Name
        .cfi_startproc

        /* Do we need to shift anything at all? */
        and     w1, w1, #0x1f
        cbz     w1, 99f

        /* Load EFLAGS before we start the calculation. */
        ldr     w10, [x2]                               /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */

        /*
         * Do the shifting.
         */
        ldr\a_LdSuff    w8, [x0]                        /* Sign-extending for 8 and 16 bits! */
        asrv    w9, w8, w1
        str\a_StSuff    w9, [x0]

        /*
         * Calculate EFLAGS.
         */
        sub     w11, w1, #1
        lsrv    w11, w8, w11
        bfxil   w10, w11, #X86_EFL_CF_BIT, #1

.ifne \a_fIntelFlags
        mov     w11, ~(X86_EFL_AF | X86_EFL_OF)
        and     w10, w10, w11                           /* AF and OF are cleared */
.else
        orr     w10, w10, X86_EFL_AF                    /* AF is set */
        and     w10, w10, ~X86_EFL_OF                   /* OF is cleared */
.endif

        CALC_EFLAGS_PARITY w10, w9, w11

.ifne \a_cBits < 32
        setf\a_cBits w9                                 /* Sets NZ */
.else
        ands    wzr, w9, w9                             /* Sets NZ */
.endif
        mrs     x11, NZCV
        lsr     w11, w11, #30                           /* N=1; Z=0 */
        bfi     w10, w11, X86_EFL_ZF_BIT, 2             /* EFLAGS.ZF and EFLAGS.SF */

        str     w10, [x2]
99:
        ret
        .cfi_endproc
.endm
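

/* Reference sketch (not from the source, C-like): SAR flag handling as done
   above.  The input is loaded sign-extended, so CF picks up copies of the
   sign bit once cShift exceeds the operand width, and OF is always cleared
   because an arithmetic right shift can never change the sign:

       unsigned fCarry = ((uint32_t)iDstSignExtended >> (cShift - 1)) & 1;
       unsigned fOF    = 0;   // cleared on both the Intel and AMD paths
       // AF is undefined by the architecture; the macro clears it on the
       // Intel path and sets it on the AMD path.
*/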

sar_8_16_32 iemAImpl_sar_u8,        8, 1, sb, b
sar_8_16_32 iemAImpl_sar_u8_intel,  8, 1, sb, b
sar_8_16_32 iemAImpl_sar_u8_amd,    8, 0, sb, b

sar_8_16_32 iemAImpl_sar_u16,       16, 1, sh, h
sar_8_16_32 iemAImpl_sar_u16_intel, 16, 1, sh, h
sar_8_16_32 iemAImpl_sar_u16_amd,   16, 0, sh, h

sar_8_16_32 iemAImpl_sar_u32,       32, 1, ,
sar_8_16_32 iemAImpl_sar_u32_intel, 32, 1, ,
sar_8_16_32 iemAImpl_sar_u32_amd,   32, 0, ,

/** @todo this is slightly slower than the C version (release) on an M2. Investigate why. */
/* void iemAImpl_sar_u64(uint64_t *pu64Dst, uint8_t cShift, uint32_t *pEFlags); */
.macro sar_64, a_Name, a_fIntelFlags
        .p2align        2
BEGINPROC \a_Name
        .cfi_startproc

        /* Do we need to shift anything at all? */
        ands    w1, w1, #0x3f
        b.eq    99f

        /* Load EFLAGS before we start the calculation. */
        ldr     w10, [x2]                               /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */

        /*
         * Do the shifting
         */
        ldr     x8, [x0]
        asrv    x9, x8, x1
        str     x9, [x0]

        /*
         * Calculate EFLAGS.
         */
        sub     w11, w1, #1
        lsrv    x11, x8, x11
        bfxil   w10, w11, #X86_EFL_CF_BIT, #1

.ifne \a_fIntelFlags
        mov     w11, ~(X86_EFL_AF | X86_EFL_OF)
        and     w10, w10, w11                           /* AF and OF are cleared */
.else
        orr     w10, w10, X86_EFL_AF                    /* AF is set */
        and     w10, w10, ~X86_EFL_OF                   /* OF is cleared */
.endif

        CALC_EFLAGS_PARITY w10, w9, w11

        ands    xzr, x9, x9                             /* Sets NZ */
        mrs     x11, NZCV
        lsr     w11, w11, #30                           /* N=1; Z=0 */
        bfi     w10, w11, X86_EFL_ZF_BIT, 2             /* EFLAGS.ZF and EFLAGS.SF */

        str     w10, [x2]
99:
        ret
        .cfi_endproc
.endm

sar_64 iemAImpl_sar_u64, 1
sar_64 iemAImpl_sar_u64_intel, 1
sar_64 iemAImpl_sar_u64_amd, 0