VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl-arm64.S@104231

Last change on this file since 104231 was 104231, checked in by vboxsync, 12 months ago

VMM/IEMAllAImpl-arm64.S: Make it build with gcc, bugref:10391

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 23.5 KB
/* $Id: IEMAllAImpl-arm64.S 104231 2024-04-08 13:46:29Z vboxsync $ */
/** @file
 * IEM - Instruction Implementation in Assembly, ARM64 variant.
 */

/*
 * Copyright (C) 2023 Oracle and/or its affiliates.
 *
 * This file is part of VirtualBox base platform packages, as
 * available from https://www.virtualbox.org.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, in version 3 of the
 * License.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <https://www.gnu.org/licenses>.
 *
 * SPDX-License-Identifier: GPL-3.0-only
 */


/*********************************************************************************************************************************
*   Header Files                                                                                                                 *
*********************************************************************************************************************************/
#include <iprt/asmdefs-arm.h>
#include <iprt/x86.h>

#if RT_CLANG_PREREQ(15, 0)
        .arch_extension flagm                           /* not necessary */
#else
        /* clang 12.0.x defaults to apple-a12.  The M1 is more similar to the A14, I guess.
           For some reason the +crc makes cfinv work (with clang 12).  'flagm' isn't
           recognized, nor is the 'fmi' mentioned in the error message for cfinv.  'flagm'
           works with v15 and seems to be enabled by default there. */
# ifdef RT_OS_DARWIN
        .cpu apple-a14+crc
# else
        .cpu cortex-a53+flagm
# endif
#endif


.macro CALC_EFLAGS_PARITY, regEfl, regResult, regTmp
        /*
         * Parity calculation for low byte of the result (sucks that there is no popcount for gprs).
         */
        eor     \regTmp, \regResult, \regResult, LSR #4
        eor     \regTmp, \regTmp, \regTmp, LSR #2
        eor     \regTmp, \regTmp, \regTmp, LSR #1
        eor     \regTmp, \regTmp, #1
        bfi     \regEfl, \regTmp, #X86_EFL_PF_BIT, #1   /* PF(2) = !(popcount(result & 0xff) & 1) */
.endm
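/* The eor chain above is an XOR fold: after the three shifted eors, bit 0 of
   the temporary register is the XOR of result bits 7:0 (higher bits never
   reach bit 0), i.e. the parity of the low byte, and the final eor #1 flips
   it into x86 PF polarity.  As a C sketch (hypothetical helper, not part of
   this file):

    static inline uint32_t CalcPF(uint32_t uResult)
    {
        uint32_t uTmp = uResult ^ (uResult >> 4);   // fold bits 7:4 onto 3:0
        uTmp ^= uTmp >> 2;                          // fold bits 3:2 onto 1:0
        uTmp ^= uTmp >> 1;                          // bit 0 = popcount(uResult & 0xff) & 1
        return (uTmp & 1) ^ 1;                      // PF = 1 for even parity
    }
*/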


.macro CALC_EFLAGS_AUX_CARRY, regEfl, regResult, regLeft, regRight, regTmp
        /*
         * Auxiliary carry / borrow flag.  This is related to 8-bit BCD.
         */
        eor     \regTmp, \regLeft, \regRight
        eor     \regTmp, \regTmp, \regResult
        lsr     \regTmp, \regTmp, #X86_EFL_AF_BIT
        bfi     \regEfl, \regTmp, #X86_EFL_AF_BIT, #1   /* AF(4) = ((left ^ right ^ result) & X86_EFL_AF) >> X86_EFL_AF_BIT */
.endm
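/* Why this works: at every bit position of an add or subtract, the result
   bit equals left ^ right ^ carry-in, so the carry/borrow that crossed the
   low-nibble boundary is recoverable as bit 4 of (left ^ right ^ result).
   Hypothetical C equivalent:

    static inline uint32_t CalcAF(uint32_t uLeft, uint32_t uRight, uint32_t uResult)
    {
        return ((uLeft ^ uRight ^ uResult) >> 4) & 1;   // 4 == X86_EFL_AF_BIT
    }
*/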

.macro CALC_EFLAGS, regEfl, regResult, regLeft, regRight, regTmp, fSkipFlags=0
        /*
         * Translate the arm NZCV bits into corresponding EFLAGS bits.
         */
        .if \fSkipFlags == 0 || \fSkipFlags == X86_EFL_OF
#if 0
        /* Maybe just a tiny bit slower than the next one. */
        mrs     \regTmp, NZCV                           /* [31] = N; [30] = Z; [29] = C; [28] = V */
        .ifeq \fSkipFlags & X86_EFL_OF
        lsr     \regTmp, \regTmp, #28
        bfi     \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
        lsr     \regTmp, \regTmp, #1
        .else
        lsr     \regTmp, \regTmp, #29
        .endif
        eor     \regTmp, \regTmp, #1                    /* inverts the carry flag to x86 style. */
        bfi     \regEfl, \regTmp, #X86_EFL_CF_BIT, #1   /* CF(0) = C */
        lsr     \regTmp, \regTmp, #1
        bfi     \regEfl, \regTmp, #X86_EFL_ZF_BIT, #2   /* SF(7),ZF(6) = NZ */
#else
        /* This seems to be the faster one... */
        cfinv
        mrs     \regTmp, NZCV                           /* [31] = N; [30] = Z; [29] = C; [28] = V */
        .ifeq (\fSkipFlags & X86_EFL_OF)
        lsr     \regTmp, \regTmp, #28
        bfi     \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
        lsr     \regTmp, \regTmp, #1
        .else
        lsr     \regTmp, \regTmp, #29
        .endif
        bfi     \regEfl, \regTmp, #X86_EFL_CF_BIT, #1   /* CF(0) = C */
        lsr     \regTmp, \regTmp, #1
        bfi     \regEfl, \regTmp, #X86_EFL_ZF_BIT, #2   /* SF(7),ZF(6) = NZ */
#endif
        .else
        /* Definitely slower than the above two, but easier to handle wrt skipping parts. */
        .ifeq \fSkipFlags & X86_EFL_ZF
        cset    \regTmp, eq
        bfi     \regEfl, \regTmp, #X86_EFL_ZF_BIT, #1
        .endif
        .ifeq \fSkipFlags & X86_EFL_CF
        cset    \regTmp, cc
        bfi     \regEfl, \regTmp, #X86_EFL_CF_BIT, #1
        .endif
        .ifeq \fSkipFlags & X86_EFL_OF
        cset    \regTmp, vs
        bfi     \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
        .endif
        .ifeq \fSkipFlags & X86_EFL_SF
        cset    \regTmp, mi
        bfi     \regEfl, \regTmp, #X86_EFL_SF_BIT, #1
        .endif
        .endif


        /*
         * Parity calculation for low byte of the result (sucks that there is no popcount for gprs).
         */
        eor     \regTmp, \regResult, \regResult, LSR #4
        eor     \regTmp, \regTmp, \regTmp, LSR #2
        eor     \regTmp, \regTmp, \regTmp, LSR #1
        eor     \regTmp, \regTmp, #1
        bfi     \regEfl, \regTmp, #X86_EFL_PF_BIT, #1   /* PF(2) = !(popcount(result & 0xff) & 1) */

        /*
         * Auxiliary carry / borrow flag.  This is related to 8-bit BCD.
         */
        eor     \regTmp, \regLeft, \regRight
        eor     \regTmp, \regTmp, \regResult
        lsr     \regTmp, \regTmp, #X86_EFL_AF_BIT
        bfi     \regEfl, \regTmp, #X86_EFL_AF_BIT, #1   /* AF(4) = ((left ^ right ^ result) & X86_EFL_AF) >> X86_EFL_AF_BIT */

        /* done */
.endm
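/* The subtle point in the two fast paths is carry polarity: for the subs-based
   operations below, ARM sets C=1 for "no borrow" while x86 wants CF=1 for
   "borrow", hence the cfinv (or the explicit eor with #1 in the disabled
   variant).  The bit shuffling itself, as a hypothetical C sketch (fNzcv as
   read by mrs, carry already inverted):

    static inline uint32_t NzcvToEfl(uint32_t fNzcv, uint32_t fEfl)
    {
        uint32_t uBits = fNzcv >> 28;                               // [3]=N [2]=Z [1]=C [0]=V
        fEfl = (fEfl & ~(1u << 11)) | (( uBits       & 1) << 11);   // OF(11) = V
        fEfl = (fEfl & ~(1u <<  0)) | (((uBits >> 1) & 1) <<  0);   // CF(0)  = C
        fEfl = (fEfl & ~(1u <<  6)) | (((uBits >> 2) & 1) <<  6);   // ZF(6)  = Z
        fEfl = (fEfl & ~(1u <<  7)) | (((uBits >> 3) & 1) <<  7);   // SF(7)  = N
        return fEfl;
    }
*/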


BEGINCODE

BEGINPROC_HIDDEN iemAImpl_placeholder
        brk #1
        ret

/* Some sketches.

// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked, (uint8_t *pu8Mem, uint8_t *pu8Reg));
BEGINPROC_HIDDEN iemAImpl_xchg_u8_locked
        ldrb    w2, [x1]
        swpalb  w2, w2, [x0]
        strb    w2, [x1]
        ret

// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_locked,(uint16_t *pu16Mem, uint16_t *pu16Reg));
BEGINPROC_HIDDEN iemAImpl_xchg_u16_locked
        ldrh    w2, [x1]
        swpalh  w2, w2, [x0]
        strh    w2, [x1]
        ret

// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_locked,(uint32_t *pu32Mem, uint32_t *pu32Reg));
// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_locked,(uint64_t *pu64Mem, uint64_t *pu64Reg));

*/
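/* Note: swpalb/swpalh are ARMv8.1 LSE atomic swaps with acquire+release
   ordering, which is what a locked x86 XCHG needs.  A portable C11 sketch of
   the same operation (not this file's actual interface, and seq_cst is if
   anything stronger than required):

    static void xchg_u8_locked(uint8_t *pu8Mem, uint8_t *pu8Reg)
    {
        *pu8Reg = atomic_exchange_explicit((_Atomic uint8_t *)pu8Mem, *pu8Reg,
                                           memory_order_seq_cst);
    }
*/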


/* IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked, (uint8_t *pu8Mem, uint8_t *pu8Reg)); */

/*
 * The SUB instruction.
 */

/* uint32_t iemAImpl_sub_u8(uint32_t fEFlags, uint8_t *puDst, uint8_t uSrc); */
BEGINPROC_HIDDEN iemAImpl_sub_u8
        .cfi_startproc
        /* Do the subtraction. */
        ldrb    w8, [x1]
        /*and     w2, w2, #0xff - should not be necessary. */
        subs    w9, w8, w2                              /* w9 = w8 (*puDst) - w2 (uSrc) */
        strb    w9, [x1]
        setf8   w9

        /* Calculate EFLAGS (passed in and returned via x0). */
        and     w9, w9, #0xffff
        CALC_EFLAGS x0, x9, x8, x2, x11, X86_EFL_OF

        /* The overflow flag calc done by setf8 isn't correct for subtraction, so we have to
           figure it out ourselves.  (See IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC for details.) */
        eor     w11, w8, w2                             /* input dst ^ source (simplified from ~(dst ^ (source ^ 0x80))). */
        eor     w12, w8, w9
        and     w11, w12, w11
        lsr     w11, w11, #7
        bfi     w0, w11, #X86_EFL_OF_BIT, #1

        ret
        .cfi_endproc

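/* The signed-overflow rule being implemented: subtraction overflows iff the
   operands differ in sign and the result's sign differs from the minuend's,
   i.e. OF = sign bit of (dst ^ src) & (dst ^ result).  Hypothetical C sketch
   for the 8-bit case (bit 7 is the sign bit):

    static inline uint32_t CalcOfSubU8(uint32_t uDst, uint32_t uSrc, uint32_t uResult)
    {
        return (((uDst ^ uSrc) & (uDst ^ uResult)) >> 7) & 1;
    }
*/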

/* uint32_t iemAImpl_sub_u16(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc); */
BEGINPROC_HIDDEN iemAImpl_sub_u16
        .cfi_startproc
        /* Do the subtraction. */
        ldrh    w8, [x1]
        /*and     w2, w2, #0xffff - should not be necessary. */
        subs    w9, w8, w2                              /* w9 = w8 (*puDst) - w2 (uSrc) */
        setf16  w9
        strh    w9, [x1]

        /* Calculate EFLAGS (passed in and returned via x0). */
        and     w9, w9, #0xffff
        CALC_EFLAGS x0, x9, x8, x2, x11, X86_EFL_OF

        /* The overflow flag calc done by setf16 isn't correct for subtraction, so we have to
           figure it out ourselves.  (See IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC for details.) */
        eor     w11, w8, w2                             /* input dst ^ source (simplified from ~(dst ^ (source ^ 0x8000))). */
        eor     w12, w8, w9
        and     w11, w12, w11
        lsr     w11, w11, #15
        bfi     w0, w11, #X86_EFL_OF_BIT, #1

        ret
        .cfi_endproc


/* uint32_t iemAImpl_sub_u32(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc); */
BEGINPROC_HIDDEN iemAImpl_sub_u32
        .cfi_startproc
        /* Do the subtraction. */
        ldr     w8, [x1]
        subs    w9, w8, w2                              /* w9 = w8 (*puDst) - w2 (uSrc) */
        str     w9, [x1]

        /* Calculate EFLAGS (passed in and returned via x0). */

#if 0
        /* Translate the arm NZCV bits into corresponding EFLAGS bits. */
#if 0 /* maybe just a tiny bit slower than the next one. */
        mrs     x11, NZCV                               /* w11[31] = N; w11[30] = Z; w11[29] = C; w11[28] = V */
        lsr     w11, w11, #28
        bfi     w0, w11, #X86_EFL_OF_BIT, #1
        lsr     w11, w11, #1
        eor     w11, w11, #1                            /* inverts the carry flag to x86 style. */
        bfi     w0, w11, #X86_EFL_CF_BIT, #1            /* CF(0) = C */
        lsr     w11, w11, #1
        bfi     w0, w11, #X86_EFL_ZF_BIT, #2            /* SF(7),ZF(6) = NZ */
#elif 1 /* seems the faster one... */
        cfinv
        mrs     x11, NZCV                               /* w11[31] = N; w11[30] = Z; w11[29] = C; w11[28] = V */
        lsr     w11, w11, #28
        bfi     w0, w11, #X86_EFL_OF_BIT, #1
        lsr     w11, w11, #1
        bfi     w0, w11, #X86_EFL_CF_BIT, #1            /* CF(0) = C */
        lsr     w11, w11, #1
        bfi     w0, w11, #X86_EFL_ZF_BIT, #2            /* SF(7),ZF(6) = NZ */
#else
        cset    w11, eq
        bfi     w0, w11, #X86_EFL_ZF_BIT, #1
        cset    w11, cc
        bfi     w0, w11, #X86_EFL_CF_BIT, #1
        cset    w11, vs
        bfi     w0, w11, #X86_EFL_OF_BIT, #1
        cset    w11, mi
        bfi     w0, w11, #X86_EFL_SF_BIT, #1
#endif

        /* Parity calculation for low byte of the result (sucks that there is no popcount for gprs). */
        eor     w11, w9, w9, LSR #4
        eor     w11, w11, w11, LSR #2
        eor     w11, w11, w11, LSR #1
        eor     w11, w11, #1
        bfi     w0, w11, #X86_EFL_PF_BIT, #1            /* PF(2) = !(popcount(w9 & 0xff) & 1) */

        /* Auxiliary carry / borrow flag.  This is related to 8-bit BCD. */
        eor     w11, w8, w2
        eor     w11, w11, w9
        lsr     w11, w11, #X86_EFL_AF_BIT
        bfi     w0, w11, #X86_EFL_AF_BIT, #1            /* AF(4) = ((w8 ^ w2 ^ w9) & X86_EFL_AF) >> X86_EFL_AF_BIT */
#else
        CALC_EFLAGS x0, x9, x8, x2, x11
#endif

        ret
        .cfi_endproc


/* uint32_t iemAImpl_sub_u64(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc); */
BEGINPROC_HIDDEN iemAImpl_sub_u64
        .cfi_startproc
        /* Do the subtraction. */
        ldr     x8, [x1]
        subs    x9, x8, x2                              /* x9 = x8 (*puDst) - x2 (uSrc) */
        str     x9, [x1]

        /* Calculate EFLAGS (passed in and returned via x0). */
        CALC_EFLAGS x0, x9, x8, x2, x11

        ret
        .cfi_endproc



/*
 * Shift Left.
 */

/* void iemAImpl_shl_u8(uint8_t *pu8Dst, uint8_t cShift, uint32_t *pEFlags); */
/* void iemAImpl_shl_u16(uint16_t *pu16Dst, uint8_t cShift, uint32_t *pEFlags); */
/* void iemAImpl_shl_u32(uint32_t *pu32Dst, uint8_t cShift, uint32_t *pEFlags); */
.macro SHL_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdStSuff
BEGINPROC_HIDDEN \a_Name
        .cfi_startproc

        /* Do we need to shift anything at all? */
        and     w1, w1, #0x1f
        cbz     w1, 99f

        /*
         * Do the shifting
         */
        ldr\a_LdStSuff  w8, [x0]
.ifne \a_cBits < 32
        lslv    w9, w8, w1
.else
        lslv    x9, x8, x1                              /* use 64-bit registers here so we get CF for free.  We know x1 != 0. */
.endif
        str\a_LdStSuff  w9, [x0]

        /*
         * Calculate EFLAGS.
         */
        ldr     w10, [x2]                               /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */

        CALC_EFLAGS_PARITY w10, w9, w12

.ifne \a_cBits < 32
        setf\a_cBits w9                                 /* Sets NZ */
.else
        ands    wzr, w9, w9                             /* Sets NZ */
.endif
#if 1
        mrs     x11, NZCV
        lsr     w11, w11, #30                           /* [1]=N; [0]=Z */
        bfi     w10, w11, X86_EFL_ZF_BIT, 2             /* EFLAGS.ZF and EFLAGS.SF */
#else
        cset    x11, eq
        bfi     w10, w11, X86_EFL_ZF_BIT, 1
        cset    x12, mi
        bfi     w10, w12, X86_EFL_SF_BIT, 1
#endif

.ifne \a_cBits < 32
        bfxil   w10, w9, #\a_cBits, #1                  /* w9 bit 8/16 contains carry. (X86_EFL_CF_BIT == 0) */
.else
        bfxil   x10, x9, #\a_cBits, #1                  /* x9 bit 32 contains carry. (X86_EFL_CF_BIT == 0) */
.endif

.ifne \a_fIntelFlags
        /* Intel: OF = first bit shifted: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
        eor     w11, w8, w8, LSL #1
        lsr     w11, w11, #(\a_cBits - 1)
        bfi     w10, w11, #X86_EFL_OF_BIT, #1

        and     w10, w10, ~X86_EFL_AF                   /* AF is cleared */
.else
        /* AMD: OF = last bit shifted: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
        .ifne \a_cBits < 32
        eor     w11, w9, w9, LSR #1
        lsr     w11, w11, #(\a_cBits - 1)
        .else
        eor     x11, x9, x9, LSR #1
        lsr     x11, x11, #(\a_cBits - 1)
        .endif
        bfi     w10, w11, #X86_EFL_OF_BIT, #1

        orr     w10, w10, X86_EFL_AF                    /* AF is set */
.endif

        str     w10, [x2]
99:
        ret
        .cfi_endproc
.endm

SHL_8_16_32 iemAImpl_shl_u8,        8, 1, b
SHL_8_16_32 iemAImpl_shl_u8_intel,  8, 1, b
SHL_8_16_32 iemAImpl_shl_u8_amd,    8, 0, b

SHL_8_16_32 iemAImpl_shl_u16,       16, 1, h
SHL_8_16_32 iemAImpl_shl_u16_intel, 16, 1, h
SHL_8_16_32 iemAImpl_shl_u16_amd,   16, 0, h

SHL_8_16_32 iemAImpl_shl_u32,       32, 1,
SHL_8_16_32 iemAImpl_shl_u32_intel, 32, 1,
SHL_8_16_32 iemAImpl_shl_u32_amd,   32, 0,

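/* Intel and AMD disagree on the undefined flags of multi-bit shifts: Intel
   derives OF from the first shift step and clears AF, AMD derives OF from
   the last step and sets AF, which is why each operation comes in an _intel
   and an _amd flavour.  The two OF formulas for the 8-bit SHL case, as
   hypothetical C helpers:

    // Intel: OF = sign change on the first step = bit 7 of dst ^ (dst << 1).
    static inline uint32_t ShlOfIntelU8(uint32_t uDst)
    {
        return ((uDst ^ (uDst << 1)) >> 7) & 1;
    }

    // AMD: OF = result sign bit XOR the carry (the last bit shifted out).
    static inline uint32_t ShlOfAmdU8(uint32_t uResult, uint32_t fCarry)
    {
        return ((uResult >> 7) ^ fCarry) & 1;
    }
*/
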
/** @todo this is slightly slower than the C version (release) on an M2.  Investigate why. */
/* void iemAImpl_shl_u64(uint64_t *pu64Dst, uint8_t cShift, uint32_t *pEFlags); */
.macro SHL_64, a_Name, a_fIntelFlags
BEGINPROC_HIDDEN \a_Name
        .cfi_startproc

        /* Do we need to shift anything at all? */
        and     w1, w1, #0x3f
        cbz     w1, 99f

        /*
         * Do the shifting
         */
        ldr     x8, [x0]
        lslv    x9, x8, x1
        str     x9, [x0]

        /*
         * Calculate EFLAGS.
         */
        ldr     w10, [x2]                               /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */

        CALC_EFLAGS_PARITY w10, w9, w11

        ands    xzr, x9, x9                             /* Sets NZ */
        mrs     x11, NZCV
        lsr     w11, w11, #30                           /* [1]=N; [0]=Z */
        bfi     w10, w11, X86_EFL_ZF_BIT, 2             /* EFLAGS.ZF and EFLAGS.SF */

        neg     w11, w1                                 /* the shift count is MODed by the data size, so this is safe. */
        lsrv    x11, x8, x11
        bfi     w10, w11, X86_EFL_CF_BIT, 1

.ifne \a_fIntelFlags
        /* Intel: OF = first bit shifted: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
        eor     x11, x8, x8, LSL #1
        lsr     x11, x11, #63
        bfi     w10, w11, #X86_EFL_OF_BIT, #1

        and     w10, w10, ~X86_EFL_AF                   /* AF is cleared */
.else
        /* AMD: OF = last bit shifted: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
        eor     x11, x11, x9, LSR #63                   /* x11[0] = CF from above */
        bfi     w10, w11, #X86_EFL_OF_BIT, #1

        orr     w10, w10, X86_EFL_AF                    /* AF is set */
.endif
        str     w10, [x2]
99:
        ret
        .cfi_endproc
.endm

SHL_64 iemAImpl_shl_u64,       1
SHL_64 iemAImpl_shl_u64_intel, 1
SHL_64 iemAImpl_shl_u64_amd,   0

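/* The CF computation in SHL_64 leans on ARM's shift-count semantics: lsrv
   only uses the low 6 bits of the count, so "neg w11, w1" yields
   (64 - cShift) mod 64, and shifting the input right by that amount drops
   the last bit shifted out into bit 0.  Hypothetical C sketch, valid for
   cShift in 1..63 (zero has already been branched around):

    static inline uint32_t ShlCfU64(uint64_t uDst, unsigned cShift)
    {
        return (uint32_t)(uDst >> ((64 - cShift) & 63)) & 1;    // (-cShift) & 63
    }
*/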

/*
 * Shift Right, Unsigned.
 */

/* void iemAImpl_shr_u8(uint8_t *pu8Dst, uint8_t cShift, uint32_t *pEFlags); */
/* void iemAImpl_shr_u16(uint16_t *pu16Dst, uint8_t cShift, uint32_t *pEFlags); */
/* void iemAImpl_shr_u32(uint32_t *pu32Dst, uint8_t cShift, uint32_t *pEFlags); */
.macro shr_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdStSuff
BEGINPROC_HIDDEN \a_Name
        .cfi_startproc

        /* Do we need to shift anything at all? */
        and     w1, w1, #0x1f
        cbz     w1, 99f

        /* Load EFLAGS before we start the calculation. */
        ldr     w10, [x2]                               /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */

        /*
         * Do the shifting.
         */
        ldr\a_LdStSuff  w8, [x0]
        lsrv    w9, w8, w1
        str\a_LdStSuff  w9, [x0]

        /*
         * Calculate EFLAGS.
         */
        sub     w11, w1, #1
        lsrv    w11, w8, w11
        bfxil   w10, w11, #X86_EFL_CF_BIT, #1

.ifne \a_fIntelFlags
        and     w10, w10, ~X86_EFL_AF                   /* AF is cleared */
        /* Intel: OF = one bit shift: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDstIn); */
        lsr     w11, w8, #(\a_cBits - 1)
        bfi     w10, w11, #X86_EFL_OF_BIT, #1
.else
        orr     w10, w10, X86_EFL_AF                    /* AF is set */
        /* AMD: OF = last bits shifted: fEfl |= (uResult >> (cOpBits - 2)) << X86_EFL_OF_BIT; */
        lsr     w11, w9, #(\a_cBits - 2)
        bfi     w10, w11, #X86_EFL_OF_BIT, #1
.endif

        CALC_EFLAGS_PARITY w10, w9, w11

.ifne \a_cBits < 32
        setf\a_cBits w9                                 /* Sets NZ */
.else
        ands    wzr, w9, w9                             /* Sets NZ */
.endif
        mrs     x11, NZCV
        lsr     w11, w11, #30                           /* [1]=N; [0]=Z */
        bfi     w10, w11, X86_EFL_ZF_BIT, 2             /* EFLAGS.ZF and EFLAGS.SF */

        str     w10, [x2]
99:
        ret
        .cfi_endproc
.endm

shr_8_16_32 iemAImpl_shr_u8,        8, 1, b
shr_8_16_32 iemAImpl_shr_u8_intel,  8, 1, b
shr_8_16_32 iemAImpl_shr_u8_amd,    8, 0, b

shr_8_16_32 iemAImpl_shr_u16,       16, 1, h
shr_8_16_32 iemAImpl_shr_u16_intel, 16, 1, h
shr_8_16_32 iemAImpl_shr_u16_amd,   16, 0, h

shr_8_16_32 iemAImpl_shr_u32,       32, 1,
shr_8_16_32 iemAImpl_shr_u32_intel, 32, 1,
shr_8_16_32 iemAImpl_shr_u32_amd,   32, 0,

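/* For right shifts CF is the last bit shifted out, i.e. bit (cShift - 1) of
   the input, which is what the sub/lsrv/bfxil triple above computes.
   Hypothetical C sketch, assuming the count was masked and checked non-zero
   already:

    static inline uint32_t ShrCf(uint32_t uDst, unsigned cShift)
    {
        return (uDst >> (cShift - 1)) & 1;
    }
*/
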
/** @todo this is slightly slower than the C version (release) on an M2.  Investigate why. */
/* void iemAImpl_shr_u64(uint64_t *pu64Dst, uint8_t cShift, uint32_t *pEFlags); */
.macro shr_64, a_Name, a_fIntelFlags
BEGINPROC_HIDDEN \a_Name
        .cfi_startproc

        /* Do we need to shift anything at all? */
        ands    w1, w1, #0x3f
        b.eq    99f

        /* Load EFLAGS before we start the calculation. */
        ldr     w10, [x2]                               /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */

        /*
         * Do the shifting
         */
        ldr     x8, [x0]
        lsrv    x9, x8, x1
        str     x9, [x0]

        /*
         * Calculate EFLAGS.
         */
        sub     w11, w1, #1
        lsrv    x11, x8, x11
        bfxil   w10, w11, #X86_EFL_CF_BIT, #1

.ifne \a_fIntelFlags
        and     w10, w10, ~X86_EFL_AF                   /* AF is cleared */
        /* Intel: OF = one bit shift: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDstIn); */
        lsr     x11, x8, #63
        bfi     w10, w11, #X86_EFL_OF_BIT, #1
.else
        orr     w10, w10, X86_EFL_AF                    /* AF is set */
        /* AMD: OF = last bits shifted: fEfl |= (uResult >> (cOpBits - 2)) << X86_EFL_OF_BIT; */
        lsr     x11, x9, #62
        bfi     w10, w11, #X86_EFL_OF_BIT, #1
.endif

        CALC_EFLAGS_PARITY w10, w9, w11

        ands    xzr, x9, x9                             /* Sets NZ */
        mrs     x11, NZCV
        lsr     w11, w11, #30                           /* [1]=N; [0]=Z */
        bfi     w10, w11, X86_EFL_ZF_BIT, 2             /* EFLAGS.ZF and EFLAGS.SF */

        str     w10, [x2]
99:
        ret
        .cfi_endproc
.endm

shr_64 iemAImpl_shr_u64,       1
shr_64 iemAImpl_shr_u64_intel, 1
shr_64 iemAImpl_shr_u64_amd,   0


/*
 * Shift Right, Signed.
 */

/* void iemAImpl_sar_u8(uint8_t *pu8Dst, uint8_t cShift, uint32_t *pEFlags); */
/* void iemAImpl_sar_u16(uint16_t *pu16Dst, uint8_t cShift, uint32_t *pEFlags); */
/* void iemAImpl_sar_u32(uint32_t *pu32Dst, uint8_t cShift, uint32_t *pEFlags); */
.macro sar_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdSuff, a_StSuff
BEGINPROC_HIDDEN \a_Name
        .cfi_startproc

        /* Do we need to shift anything at all? */
        and     w1, w1, #0x1f
        cbz     w1, 99f

        /* Load EFLAGS before we start the calculation. */
        ldr     w10, [x2]                               /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */

        /*
         * Do the shifting.
         */
        ldr\a_LdSuff    w8, [x0]                        /* Sign-extending for 8 and 16 bits! */
        asrv    w9, w8, w1
        str\a_StSuff    w9, [x0]

        /*
         * Calculate EFLAGS.
         */
        sub     w11, w1, #1
        lsrv    w11, w8, w11
        bfxil   w10, w11, #X86_EFL_CF_BIT, #1

.ifne \a_fIntelFlags
        mov     w11, ~(X86_EFL_AF | X86_EFL_OF)
        and     w10, w10, w11                           /* AF and OF are cleared */
.else
        orr     w10, w10, X86_EFL_AF                    /* AF is set */
        and     w10, w10, ~X86_EFL_OF                   /* OF is cleared */
.endif

        CALC_EFLAGS_PARITY w10, w9, w11

.ifne \a_cBits < 32
        setf\a_cBits w9                                 /* Sets NZ */
.else
        ands    wzr, w9, w9                             /* Sets NZ */
.endif
        mrs     x11, NZCV
        lsr     w11, w11, #30                           /* [1]=N; [0]=Z */
        bfi     w10, w11, X86_EFL_ZF_BIT, 2             /* EFLAGS.ZF and EFLAGS.SF */

        str     w10, [x2]
99:
        ret
        .cfi_endproc
.endm

sar_8_16_32 iemAImpl_sar_u8,        8, 1, sb, b
sar_8_16_32 iemAImpl_sar_u8_intel,  8, 1, sb, b
sar_8_16_32 iemAImpl_sar_u8_amd,    8, 0, sb, b

sar_8_16_32 iemAImpl_sar_u16,       16, 1, sh, h
sar_8_16_32 iemAImpl_sar_u16_intel, 16, 1, sh, h
sar_8_16_32 iemAImpl_sar_u16_amd,   16, 0, sh, h

sar_8_16_32 iemAImpl_sar_u32,       32, 1, ,
sar_8_16_32 iemAImpl_sar_u32_intel, 32, 1, ,
sar_8_16_32 iemAImpl_sar_u32_amd,   32, 0, ,

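/* The 8/16-bit SAR variants load with ldrsb/ldrsh, so the sign bit is
   already replicated through bit 31 and a plain 32-bit asrv behaves like a
   narrow arithmetic shift.  Hypothetical C sketch for the 8-bit case (relies
   on the usual arithmetic right shift of negative values):

    static inline uint8_t SarU8(uint8_t uDst, unsigned cShift)
    {
        int32_t iSrc = (int8_t)uDst;        // sign-extend like ldrsb
        return (uint8_t)(iSrc >> cShift);   // arithmetic shift like asrv
    }
*/
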
/** @todo this is slightly slower than the C version (release) on an M2.  Investigate why. */
/* void iemAImpl_sar_u64(uint64_t *pu64Dst, uint8_t cShift, uint32_t *pEFlags); */
.macro sar_64, a_Name, a_fIntelFlags
BEGINPROC_HIDDEN \a_Name
        .cfi_startproc

        /* Do we need to shift anything at all? */
        ands    w1, w1, #0x3f
        b.eq    99f

        /* Load EFLAGS before we start the calculation. */
        ldr     w10, [x2]                               /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */

        /*
         * Do the shifting
         */
        ldr     x8, [x0]
        asrv    x9, x8, x1
        str     x9, [x0]

        /*
         * Calculate EFLAGS.
         */
        sub     w11, w1, #1
        lsrv    x11, x8, x11
        bfxil   w10, w11, #X86_EFL_CF_BIT, #1

.ifne \a_fIntelFlags
        mov     w11, ~(X86_EFL_AF | X86_EFL_OF)
        and     w10, w10, w11                           /* AF and OF are cleared */
.else
        orr     w10, w10, X86_EFL_AF                    /* AF is set */
        and     w10, w10, ~X86_EFL_OF                   /* OF is cleared */
.endif

        CALC_EFLAGS_PARITY w10, w9, w11

        ands    xzr, x9, x9                             /* Sets NZ */
        mrs     x11, NZCV
        lsr     w11, w11, #30                           /* [1]=N; [0]=Z */
        bfi     w10, w11, X86_EFL_ZF_BIT, 2             /* EFLAGS.ZF and EFLAGS.SF */

        str     w10, [x2]
99:
        ret
        .cfi_endproc
.endm

sar_64 iemAImpl_sar_u64,       1
sar_64 iemAImpl_sar_u64_intel, 1
sar_64 iemAImpl_sar_u64_amd,   0
