VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl-arm64.S @ 104173

Last change on this file: r104173, checked in by vboxsync, 8 months ago:

VMM/IEM: ARM64 assembly renditions of shl, shr and sar assembly helpers. bugref:10376

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision

File size: 24.8 KB

/* $Id: IEMAllAImpl-arm64.S 104173 2024-04-05 09:38:49Z vboxsync $ */
/** @file
 * IEM - Instruction Implementation in Assembly, ARM64 variant.
 */

/*
 * Copyright (C) 2023 Oracle and/or its affiliates.
 *
 * This file is part of VirtualBox base platform packages, as
 * available from https://www.virtualbox.org.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, in version 3 of the
 * License.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <https://www.gnu.org/licenses>.
 *
 * SPDX-License-Identifier: GPL-3.0-only
 */


/*********************************************************************************************************************************
*   Header Files                                                                                                                 *
*********************************************************************************************************************************/
#include <iprt/asmdefs-arm.h>
#include <iprt/x86.h>

#if RT_CLANG_PREREQ(15, 0)
        .arch_extension flagm   /* not necessary */
#else
        /* clang 12.0.x defaults to apple-a12. M1 is more similar to A14, I guess.
           For some reason the +crc makes cfinv work (with clang 12). 'flagm' isn't
           recognized, nor is the 'fmi' in the error message for cfinv. 'flagm'
           works for v15 and seems to be enabled by default. */
        .cpu apple-a14+crc
#endif

.macro BEGINPROC, a_Name
        .private_extern NAME(\a_Name)
        .globl          NAME(\a_Name)
NAME(\a_Name):
.endm

.macro CALC_EFLAGS_PARITY, regEfl, regResult, regTmp
        /*
         * Parity calculation for the low byte of the result (sucks that there
         * is no popcount for gprs).
         */
        eor     \regTmp, \regResult, \regResult, LSR #4
        eor     \regTmp, \regTmp, \regTmp, LSR #2
        eor     \regTmp, \regTmp, \regTmp, LSR #1
        eor     \regTmp, \regTmp, #1
        bfi     \regEfl, \regTmp, #X86_EFL_PF_BIT, #1   /* PF(2) = !(popcount(result & 0xff) & 1) */
.endm
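
/* Illustrative C model of the parity folding above (a sketch, not IEM API;
   the name is made up and <stdint.h> is assumed). Each eor folds the upper
   half of the low byte onto the lower half, so bit 0 ends up holding
   popcount(result & 0xff) & 1:

        // Returns 1 when the low byte of the result has even parity (PF set).
        static unsigned parityFoldSketch(uint8_t bResult)
        {
            unsigned u = bResult ^ (bResult >> 4);  // fold bits 7:4 onto 3:0
            u ^= u >> 2;                            // fold bits 3:2 onto 1:0
            u ^= u >> 1;                            // bit 0 = popcount & 1
            return (u & 1) ^ 1;                     // invert: 1 = PF set
        }
*/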


.macro CALC_EFLAGS_AUX_CARRY, regEfl, regResult, regLeft, regRight, regTmp
        /*
         * Auxiliary carry / borrow flag. This is related to 8-bit BCD.
         */
        eor     \regTmp, \regLeft, \regRight
        eor     \regTmp, \regTmp, \regResult
        lsr     \regTmp, \regTmp, #X86_EFL_AF_BIT
        bfi     \regEfl, \regTmp, #X86_EFL_AF_BIT, #1   /* AF(4) = ((left ^ right ^ result) & X86_EFL_AF) >> X86_EFL_AF_BIT */
.endm
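
/* Illustrative C model of the AF derivation (a sketch; the helper name is
   made up). A carry or borrow out of bit 3 happened exactly when bit 4 of
   left ^ right ^ result is set:

        static unsigned auxCarrySketch(uint32_t uLeft, uint32_t uRight, uint32_t uResult)
        {
            return ((uLeft ^ uRight ^ uResult) >> 4) & 1;   // 1 = AF set
        }
*/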

.macro CALC_EFLAGS, regEfl, regResult, regLeft, regRight, regTmp, fSkipFlags=0
        /*
         * Translate the ARM NZCV bits into corresponding EFLAGS bits.
         */
        .if \fSkipFlags == 0 || \fSkipFlags == X86_EFL_OF
#if 0
        /* Maybe just a tiny bit slower than the next one. */
        mrs     \regTmp, NZCV                           /* [31] = N; [30] = Z; [29] = C; [28] = V */
        .ifeq   (\fSkipFlags & X86_EFL_OF)
        lsr     \regTmp, \regTmp, #28
        bfi     \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
        lsr     \regTmp, \regTmp, #1
        .else
        lsr     \regTmp, \regTmp, #29
        .endif
        eor     \regTmp, \regTmp, #1                    /* inverts the carry flag to x86 style. */
        bfi     \regEfl, \regTmp, #X86_EFL_CF_BIT, #1   /* CF(0) = C */
        lsr     \regTmp, \regTmp, #1
        bfi     \regEfl, \regTmp, #X86_EFL_ZF_BIT, #2   /* SF(7),ZF(6) = NZ */
#else
        /* This seems to be the faster one... */
        cfinv
        mrs     \regTmp, NZCV                           /* [31] = N; [30] = Z; [29] = C; [28] = V */
        .ifeq   (\fSkipFlags & X86_EFL_OF)
        lsr     \regTmp, \regTmp, #28
        bfi     \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
        lsr     \regTmp, \regTmp, #1
        .else
        lsr     \regTmp, \regTmp, #29
        .endif
        bfi     \regEfl, \regTmp, #X86_EFL_CF_BIT, #1   /* CF(0) = C */
        lsr     \regTmp, \regTmp, #1
        bfi     \regEfl, \regTmp, #X86_EFL_ZF_BIT, #2   /* SF(7),ZF(6) = NZ */
#endif
        .else
        /* Definitely slower than the above two, but easier to handle wrt skipping parts. */
        .ifeq   (\fSkipFlags & X86_EFL_ZF)
        cset    \regTmp, eq
        bfi     \regEfl, \regTmp, #X86_EFL_ZF_BIT, #1
        .endif
        .ifeq   (\fSkipFlags & X86_EFL_CF)
        cset    \regTmp, cc
        bfi     \regEfl, \regTmp, #X86_EFL_CF_BIT, #1
        .endif
        .ifeq   (\fSkipFlags & X86_EFL_OF)
        cset    \regTmp, vs
        bfi     \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
        .endif
        .ifeq   (\fSkipFlags & X86_EFL_SF)
        cset    \regTmp, mi
        bfi     \regEfl, \regTmp, #X86_EFL_SF_BIT, #1
        .endif
        .endif


        /*
         * Parity calculation for the low byte of the result (sucks that there
         * is no popcount for gprs).
         */
        eor     \regTmp, \regResult, \regResult, LSR #4
        eor     \regTmp, \regTmp, \regTmp, LSR #2
        eor     \regTmp, \regTmp, \regTmp, LSR #1
        eor     \regTmp, \regTmp, #1
        bfi     \regEfl, \regTmp, #X86_EFL_PF_BIT, #1   /* PF(2) = !(popcount(result & 0xff) & 1) */

        /*
         * Auxiliary carry / borrow flag. This is related to 8-bit BCD.
         */
        eor     \regTmp, \regLeft, \regRight
        eor     \regTmp, \regTmp, \regResult
        lsr     \regTmp, \regTmp, #X86_EFL_AF_BIT
        bfi     \regEfl, \regTmp, #X86_EFL_AF_BIT, #1   /* AF(4) = ((left ^ right ^ result) & X86_EFL_AF) >> X86_EFL_AF_BIT */

        /* done */
.endm
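
/* Illustrative C model of the NZCV-to-EFLAGS translation above for the
   fSkipFlags == 0 case (a sketch with a made-up name, not IEM API; the
   X86_EFL_* constants come from iprt/x86.h). Note that ARM sets C on a
   subtraction when there is *no* borrow, the inverse of x86 CF; that is what
   cfinv (or the eor #1 in the first variant) corrects.

        // fNzcv = raw NZCV system register value read after a SUBS:
        // N = bit 31, Z = bit 30, C = bit 29, V = bit 28.
        static uint32_t nzcvToEflAfterSubSketch(uint32_t fNzcv, uint32_t fEfl)
        {
            fEfl &= ~(X86_EFL_CF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_OF);
            fEfl |= (fNzcv >> 29) & 1 ? 0 : X86_EFL_CF;     // x86 CF = !C (borrow)
            fEfl |= (fNzcv >> 30) & 1 ? X86_EFL_ZF : 0;
            fEfl |= (fNzcv >> 31) & 1 ? X86_EFL_SF : 0;
            fEfl |= (fNzcv >> 28) & 1 ? X86_EFL_OF : 0;
            return fEfl;
        }
*/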


BEGINCODE
        .p2align 2
        .private_extern NAME(iemAImpl_placeholder)
        .globl          NAME(iemAImpl_placeholder)
NAME(iemAImpl_placeholder):
        brk     #1
        ret

/* Some sketches.

// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked, (uint8_t *pu8Mem, uint8_t *pu8Reg));
        .p2align 2
        .private_extern NAME(iemAImpl_xchg_u8_locked)
        .globl          NAME(iemAImpl_xchg_u8_locked)
NAME(iemAImpl_xchg_u8_locked):
        ldrb    w2, [x1]
        swpalb  w2, w2, [x0]
        strb    w2, [x1]
        ret

// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_locked,(uint16_t *pu16Mem, uint16_t *pu16Reg));
        .p2align 2
        .private_extern NAME(iemAImpl_xchg_u16_locked)
        .globl          NAME(iemAImpl_xchg_u16_locked)
NAME(iemAImpl_xchg_u16_locked):
        ldrh    w2, [x1]
        swpalh  w2, w2, [x0]
        strh    w2, [x1]
        ret

// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_locked,(uint32_t *pu32Mem, uint32_t *pu32Reg));
// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_locked,(uint64_t *pu64Mem, uint64_t *pu64Reg));

*/
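
/* Sketch of what the swpalb above amounts to in C11 atomics (illustration
   only; the name is made up and seq_cst is a conservative stand-in for
   swpal's acquire+release ordering):

        #include <stdatomic.h>
        #include <stdint.h>

        static void xchgU8LockedSketch(uint8_t *pu8Mem, uint8_t *pu8Reg)
        {
            *pu8Reg = atomic_exchange_explicit((_Atomic uint8_t *)pu8Mem,
                                               *pu8Reg, memory_order_seq_cst);
        }
*/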


/* IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked, (uint8_t *pu8Mem, uint8_t *pu8Reg)); */

/*
 * The SUB instruction.
 */

/* void iemAImpl_sub_u8(uint8_t *puDst, uint8_t uSrc, uint32_t *pEFlags); */
        .p2align 2
        .private_extern NAME(iemAImpl_sub_u8)
        .globl          NAME(iemAImpl_sub_u8)
NAME(iemAImpl_sub_u8):
        .cfi_startproc
        /* Do the subtraction. */
        ldrb    w8, [x0]
        /*and     w1, w1, #0xff - should not be necessary. */
        subs    w9, w8, w1          /* w9 = w8 (*puDst) - w1 (uSrc) */
        setf8   w9
        strb    w9, [x0]

        /* Load EFLAGS. */
        ldr     w10, [x2]           /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */
        and     w9, w9, #0xffff
        CALC_EFLAGS x10, x9, x8, x1, x11, X86_EFL_OF

        /* The overflow flag calc done by setf8 isn't correct for subtraction, so we have to
           figure it out ourselves. (See IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC for details.) */
        eor     w11, w8, w1         /* input dst ^ source (simplified from ~(dst ^ (source ^ 0x80)) ). */
        eor     w12, w8, w9
        and     w11, w12, w11
        lsr     w11, w11, #7
        bfi     w10, w11, #X86_EFL_OF_BIT, #1

        /* Done with EFLAGS. */
        str     w10, [x2]
        ret
        .cfi_endproc
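
/* Illustrative C model of the OF fix-up above (a sketch; the name is made
   up). Subtraction overflows exactly when the operands have different signs
   and the result's sign differs from the minuend's:

        static unsigned subOfSketch(uint8_t uDst, uint8_t uSrc, uint8_t uResult)
        {
            return (((uDst ^ uSrc) & (uDst ^ uResult)) >> 7) & 1;   // 1 = OF set
        }
*/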


/* void iemAImpl_sub_u16(uint16_t *puDst, uint16_t uSrc, uint32_t *pEFlags); */
        .p2align 2
        .private_extern NAME(iemAImpl_sub_u16)
        .globl          NAME(iemAImpl_sub_u16)
NAME(iemAImpl_sub_u16):
        .cfi_startproc
        /* Do the subtraction. */
        ldrh    w8, [x0]
        /*and     w1, w1, #0xffff - should not be necessary. */
        subs    w9, w8, w1          /* w9 = w8 (*puDst) - w1 (uSrc) */
        setf16  w9
        strh    w9, [x0]

        /* Load EFLAGS. */
        ldr     w10, [x2]           /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */
        and     w9, w9, #0xffff
        CALC_EFLAGS x10, x9, x8, x1, x11, X86_EFL_OF

        /* The overflow flag calc done by setf16 isn't correct for subtraction, so we have to
           figure it out ourselves. (See IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC for details.) */
        eor     w11, w8, w1         /* input dst ^ source (simplified from ~(dst ^ (source ^ 0x8000)) ). */
        eor     w12, w8, w9
        and     w11, w12, w11
        lsr     w11, w11, #15
        bfi     w10, w11, #X86_EFL_OF_BIT, #1

        /* Done with EFLAGS. */
        str     w10, [x2]
        ret
        .cfi_endproc


/* void iemAImpl_sub_u32(uint32_t *puDst, uint32_t uSrc, uint32_t *pEFlags); */
        .p2align 2
        .private_extern NAME(iemAImpl_sub_u32)
        .globl          NAME(iemAImpl_sub_u32)
NAME(iemAImpl_sub_u32):
        .cfi_startproc
        /* Do the subtraction. */
        ldr     w8, [x0]
        subs    w9, w8, w1          /* w9 = w8 (*puDst) - w1 (uSrc) */
        str     w9, [x0]

        /* Load EFLAGS. */
        ldr     w10, [x2]           /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */

#if 0
        /* Translate the ARM NZCV bits into corresponding EFLAGS bits. */
#if 0   /* maybe just a tiny bit slower than the next one. */
        mrs     x11, NZCV           /* w11[31] = N; w11[30] = Z; w11[29] = C; w11[28] = V */
        lsr     w11, w11, #28
        bfi     w10, w11, #X86_EFL_OF_BIT, #1
        lsr     w11, w11, #1
        eor     w11, w11, #1        /* inverts the carry flag to x86 style. */
        bfi     w10, w11, #X86_EFL_CF_BIT, #1   /* CF(0) = C */
        lsr     w11, w11, #1
        bfi     w10, w11, #X86_EFL_ZF_BIT, #2   /* SF(7),ZF(6) = NZ */
#elif 1 /* seems to be the faster one... */
        cfinv
        mrs     x11, NZCV           /* w11[31] = N; w11[30] = Z; w11[29] = C; w11[28] = V */
        lsr     w11, w11, #28
        bfi     w10, w11, #X86_EFL_OF_BIT, #1
        lsr     w11, w11, #1
        bfi     w10, w11, #X86_EFL_CF_BIT, #1   /* CF(0) = C */
        lsr     w11, w11, #1
        bfi     w10, w11, #X86_EFL_ZF_BIT, #2   /* SF(7),ZF(6) = NZ */
#else
        cset    w11, eq
        bfi     w10, w11, #X86_EFL_ZF_BIT, #1
        cset    w11, cc
        bfi     w10, w11, #X86_EFL_CF_BIT, #1
        cset    w11, vs
        bfi     w10, w11, #X86_EFL_OF_BIT, #1
        cset    w11, mi
        bfi     w10, w11, #X86_EFL_SF_BIT, #1
#endif

        /* Parity calculation for the low byte of the result (sucks that there is no popcount for gprs). */
        eor     w11, w9, w9, LSR #4
        eor     w11, w11, w11, LSR #2
        eor     w11, w11, w11, LSR #1
        eor     w11, w11, #1
        bfi     w10, w11, #X86_EFL_PF_BIT, #1   /* PF(2) = !(popcount(w9 & 0xff) & 1) */

        /* Auxiliary carry / borrow flag. This is related to 8-bit BCD. */
        eor     w11, w8, w1
        eor     w11, w11, w9
        lsr     w11, w11, #X86_EFL_AF_BIT
        bfi     w10, w11, #X86_EFL_AF_BIT, #1   /* AF(4) = ((w8 ^ w1 ^ w9) & X86_EFL_AF) >> X86_EFL_AF_BIT */
#else
        CALC_EFLAGS x10, x9, x8, x1, x11
#endif

        str     w10, [x2]
        ret
        .cfi_endproc


/* void iemAImpl_sub_u64(uint64_t *puDst, uint64_t uSrc, uint32_t *pEFlags); */
        .p2align 2
        .private_extern NAME(iemAImpl_sub_u64)
        .globl          NAME(iemAImpl_sub_u64)
NAME(iemAImpl_sub_u64):
        .cfi_startproc
        /* Do the subtraction. */
        ldr     x8, [x0]
        subs    x9, x8, x1          /* x9 = x8 (*puDst) - x1 (uSrc) */
        str     x9, [x0]

        /* Load EFLAGS. */
        ldr     w10, [x2]           /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */
        CALC_EFLAGS x10, x9, x8, x1, x11

        str     w10, [x2]
        ret
        .cfi_endproc



/*
 * Shift Left.
 */

/* void iemAImpl_shl_u8(uint8_t *pu8Dst, uint8_t cShift, uint32_t *pEFlags); */
/* void iemAImpl_shl_u16(uint16_t *pu16Dst, uint8_t cShift, uint32_t *pEFlags); */
/* void iemAImpl_shl_u32(uint32_t *pu32Dst, uint8_t cShift, uint32_t *pEFlags); */
.macro SHL_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdStSuff
        .p2align 2
BEGINPROC \a_Name
        .cfi_startproc

        /* Do we need to shift anything at all? */
        and     w1, w1, #0x1f
        cbz     w1, 99f

        /*
         * Do the shifting.
         */
        ldr\a_LdStSuff w8, [x0]
.ifne \a_cBits < 32
        lslv    w9, w8, w1
.else
        lslv    x9, x8, x1          /* use 64-bit registers here so we get CF for free. We know x1 != 0. */
.endif
        str\a_LdStSuff w9, [x0]

        /*
         * Calculate EFLAGS.
         */
        ldr     w10, [x2]           /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */

        CALC_EFLAGS_PARITY w10, w9, w12

.ifne \a_cBits < 32
        setf\a_cBits w9             /* Sets NZ */
.else
        ands    wzr, w9, w9         /* Sets NZ */
.endif
#if 1
        mrs     x11, NZCV
        lsr     w11, w11, #30       /* N=1; Z=0 */
        bfi     w10, w11, X86_EFL_ZF_BIT, 2     /* EFLAGS.ZF and EFLAGS.SF */
#else
        cset    x11, eq
        bfi     w10, w11, X86_EFL_ZF_BIT, 1
        cset    x12, mi
        bfi     w10, w12, X86_EFL_SF_BIT, 1
#endif

.ifne \a_cBits < 32
        bfxil   w10, w9, #\a_cBits, #1          /* w9 bit 8/16 contains the carry. (X86_EFL_CF_BIT == 0) */
.else
        bfxil   x10, x9, #\a_cBits, #1          /* x9 bit 32 contains the carry. (X86_EFL_CF_BIT == 0) */
.endif

.ifne \a_fIntelFlags
        /* Intel: OF = first bit shifted: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
        eor     w11, w8, w8, LSL #1
        lsr     w11, w11, #(\a_cBits - 1)
        bfi     w10, w11, #X86_EFL_OF_BIT, #1

        and     w10, w10, ~X86_EFL_AF           /* AF is cleared */
.else
        /* AMD: OF = last bit shifted: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
        .ifne \a_cBits < 32
        eor     w11, w9, w9, LSR #1
        lsr     w11, w11, #(\a_cBits - 1)
        .else
        eor     x11, x9, x9, LSR #1
        lsr     x11, x11, #(\a_cBits - 1)
        .endif
        bfi     w10, w11, #X86_EFL_OF_BIT, #1

        orr     w10, w10, X86_EFL_AF            /* AF is set */
.endif

        str     w10, [x2]
99:
        ret
        .cfi_endproc
.endm

SHL_8_16_32 iemAImpl_shl_u8, 8, 1, b
SHL_8_16_32 iemAImpl_shl_u8_intel, 8, 1, b
SHL_8_16_32 iemAImpl_shl_u8_amd, 8, 0, b

SHL_8_16_32 iemAImpl_shl_u16, 16, 1, h
SHL_8_16_32 iemAImpl_shl_u16_intel, 16, 1, h
SHL_8_16_32 iemAImpl_shl_u16_amd, 16, 0, h

SHL_8_16_32 iemAImpl_shl_u32, 32, 1,
SHL_8_16_32 iemAImpl_shl_u32_intel, 32, 1,
SHL_8_16_32 iemAImpl_shl_u32_amd, 32, 0,
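
/* Illustrative C model of the CF/OF handling in SHL_8_16_32 for the 8-bit
   case (a sketch with made-up names; AF/ZF/SF/PF omitted). Shifting within a
   wider register leaves the last bit shifted out in bit 8 of the result:

        static uint32_t shlU8FlagsSketch(uint8_t uDst, unsigned cShift,    // cShift: 1..31
                                         uint32_t fEfl, int fIntel)
        {
            uint32_t uWide = (uint32_t)uDst << cShift;
            fEfl = (fEfl & ~X86_EFL_CF) | ((uWide >> 8) & X86_EFL_CF);  // CF = last bit out
            fEfl &= ~X86_EFL_OF;
            if (fIntel)     // Intel: OF = first bit shifted out:
                fEfl |= (((uDst ^ (uDst << 1)) >> 7) & 1) << X86_EFL_OF_BIT;
            else            // AMD: OF = result MSB ^ CF:
                fEfl |= (((uWide >> 7) ^ (uWide >> 8)) & 1) << X86_EFL_OF_BIT;
            return fEfl;
        }
*/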

/** @todo this is slightly slower than the C version (release) on an M2. Investigate why. */
/* void iemAImpl_shl_u64(uint64_t *pu64Dst, uint8_t cShift, uint32_t *pEFlags); */
.macro SHL_64, a_Name, a_fIntelFlags
        .p2align 2
BEGINPROC \a_Name
        .cfi_startproc

        /* Do we need to shift anything at all? */
        and     w1, w1, #0x3f
        cbz     w1, 99f

        /*
         * Do the shifting.
         */
        ldr     x8, [x0]
        lslv    x9, x8, x1
        str     x9, [x0]

        /*
         * Calculate EFLAGS.
         */
        ldr     w10, [x2]           /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */

        CALC_EFLAGS_PARITY w10, w9, w11

        ands    xzr, x9, x9         /* Sets NZ */
        mrs     x11, NZCV
        lsr     w11, w11, #30       /* N=1; Z=0 */
        bfi     w10, w11, X86_EFL_ZF_BIT, 2     /* EFLAGS.ZF and EFLAGS.SF */

        neg     w11, w1             /* the shift count is MODed by the data size, so this is safe. */
        lsrv    x11, x8, x11
        bfi     w10, w11, X86_EFL_CF_BIT, 1

.ifne \a_fIntelFlags
        /* Intel: OF = first bit shifted: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
        eor     x11, x8, x8, LSL #1
        lsr     x11, x11, #63
        bfi     w10, w11, #X86_EFL_OF_BIT, #1

        and     w10, w10, ~X86_EFL_AF           /* AF is cleared */
.else
        /* AMD: OF = last bit shifted: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
        eor     x11, x11, x9, LSR #63           /* w11[0] = CF from above */
        bfi     w10, w11, #X86_EFL_OF_BIT, #1

        orr     w10, w10, X86_EFL_AF            /* AF is set */
.endif
        str     w10, [x2]
99:
        ret
        .cfi_endproc
.endm

SHL_64 iemAImpl_shl_u64, 1
SHL_64 iemAImpl_shl_u64_intel, 1
SHL_64 iemAImpl_shl_u64_amd, 0
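
/* Illustrative C model of the neg/lsrv carry extraction in SHL_64 (a sketch;
   the name is made up). lsrv only uses the low six bits of the count
   register, so the negated count works out to (64 - cShift) & 63, the
   position of the last bit shifted out:

        static unsigned shlU64CarrySketch(uint64_t uDst, unsigned cShift)  // cShift: 1..63
        {
            return (unsigned)(uDst >> ((64 - cShift) & 63)) & 1;           // 1 = CF set
        }
*/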


/*
 * Shift Right, Unsigned.
 */

/* void iemAImpl_shr_u8(uint8_t *pu8Dst, uint8_t cShift, uint32_t *pEFlags); */
/* void iemAImpl_shr_u16(uint16_t *pu16Dst, uint8_t cShift, uint32_t *pEFlags); */
/* void iemAImpl_shr_u32(uint32_t *pu32Dst, uint8_t cShift, uint32_t *pEFlags); */
.macro shr_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdStSuff
        .p2align 2
BEGINPROC \a_Name
        .cfi_startproc

        /* Do we need to shift anything at all? */
        and     w1, w1, #0x1f
        cbz     w1, 99f

        /* Load EFLAGS before we start the calculation. */
        ldr     w10, [x2]           /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */

        /*
         * Do the shifting.
         */
        ldr\a_LdStSuff w8, [x0]
        lsrv    w9, w8, w1
        str\a_LdStSuff w9, [x0]

        /*
         * Calculate EFLAGS.
         */
        sub     w11, w1, #1
        lsrv    w11, w8, w11
        bfxil   w10, w11, #X86_EFL_CF_BIT, #1

.ifne \a_fIntelFlags
        and     w10, w10, ~X86_EFL_AF           /* AF is cleared */
        /* Intel: OF = one bit shift: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDstIn); */
        lsr     w11, w8, #(\a_cBits - 1)
        bfi     w10, w11, #X86_EFL_OF_BIT, #1
.else
        orr     w10, w10, X86_EFL_AF            /* AF is set */
        /* AMD: OF = last bits shifted: fEfl |= (uResult >> (cOpBits - 2)) << X86_EFL_OF_BIT; */
        lsr     w11, w9, #(\a_cBits - 2)
        bfi     w10, w11, #X86_EFL_OF_BIT, #1
.endif

        CALC_EFLAGS_PARITY w10, w9, w11

.ifne \a_cBits < 32
        setf\a_cBits w9             /* Sets NZ */
.else
        ands    wzr, w9, w9         /* Sets NZ */
.endif
        mrs     x11, NZCV
        lsr     w11, w11, #30       /* N=1; Z=0 */
        bfi     w10, w11, X86_EFL_ZF_BIT, 2     /* EFLAGS.ZF and EFLAGS.SF */

        str     w10, [x2]
99:
        ret
        .cfi_endproc
.endm

shr_8_16_32 iemAImpl_shr_u8, 8, 1, b
shr_8_16_32 iemAImpl_shr_u8_intel, 8, 1, b
shr_8_16_32 iemAImpl_shr_u8_amd, 8, 0, b

shr_8_16_32 iemAImpl_shr_u16, 16, 1, h
shr_8_16_32 iemAImpl_shr_u16_intel, 16, 1, h
shr_8_16_32 iemAImpl_shr_u16_amd, 16, 0, h

shr_8_16_32 iemAImpl_shr_u32, 32, 1,
shr_8_16_32 iemAImpl_shr_u32_intel, 32, 1,
shr_8_16_32 iemAImpl_shr_u32_amd, 32, 0,
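
/* Illustrative C model of the CF/OF handling in shr_8_16_32 for the 8-bit
   case (a sketch with made-up names; AF/ZF/SF/PF omitted). CF is the last
   bit shifted out, i.e. bit (cShift - 1) of the input:

        static uint32_t shrU8FlagsSketch(uint8_t uDst, unsigned cShift,    // cShift: 1..31
                                         uint32_t fEfl, int fIntel)
        {
            uint8_t uResult = (uint8_t)(uDst >> cShift);
            fEfl = (fEfl & ~X86_EFL_CF) | ((uDst >> (cShift - 1)) & X86_EFL_CF);
            fEfl &= ~X86_EFL_OF;
            if (fIntel)
                fEfl |= ((uDst >> 7) & 1) << X86_EFL_OF_BIT;    // MSB of the input
            else
                fEfl |= ((uResult >> 6) & 1) << X86_EFL_OF_BIT; // bit (cOpBits - 2) of the result
            return fEfl;
        }
*/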

/** @todo this is slightly slower than the C version (release) on an M2. Investigate why. */
/* void iemAImpl_shr_u64(uint64_t *pu64Dst, uint8_t cShift, uint32_t *pEFlags); */
.macro shr_64, a_Name, a_fIntelFlags
        .p2align 2
BEGINPROC \a_Name
        .cfi_startproc

        /* Do we need to shift anything at all? */
        ands    w1, w1, #0x3f
        b.eq    99f

        /* Load EFLAGS before we start the calculation. */
        ldr     w10, [x2]           /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */

        /*
         * Do the shifting.
         */
        ldr     x8, [x0]
        lsrv    x9, x8, x1
        str     x9, [x0]

        /*
         * Calculate EFLAGS.
         */
        sub     w11, w1, #1
        lsrv    x11, x8, x11
        bfxil   w10, w11, #X86_EFL_CF_BIT, #1

.ifne \a_fIntelFlags
        and     w10, w10, ~X86_EFL_AF           /* AF is cleared */
        /* Intel: OF = one bit shift: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDstIn); */
        lsr     x11, x8, #63
        bfi     w10, w11, #X86_EFL_OF_BIT, #1
.else
        orr     w10, w10, X86_EFL_AF            /* AF is set */
        /* AMD: OF = last bits shifted: fEfl |= (uResult >> (cOpBits - 2)) << X86_EFL_OF_BIT; */
        lsr     x11, x9, #62
        bfi     w10, w11, #X86_EFL_OF_BIT, #1
.endif

        CALC_EFLAGS_PARITY w10, w9, w11

        ands    xzr, x9, x9         /* Sets NZ */
        mrs     x11, NZCV
        lsr     w11, w11, #30       /* N=1; Z=0 */
        bfi     w10, w11, X86_EFL_ZF_BIT, 2     /* EFLAGS.ZF and EFLAGS.SF */

        str     w10, [x2]
99:
        ret
        .cfi_endproc
.endm

shr_64 iemAImpl_shr_u64, 1
shr_64 iemAImpl_shr_u64_intel, 1
shr_64 iemAImpl_shr_u64_amd, 0


/*
 * Shift Right, Signed.
 */

/* void iemAImpl_sar_u8(uint8_t *pu8Dst, uint8_t cShift, uint32_t *pEFlags); */
/* void iemAImpl_sar_u16(uint16_t *pu16Dst, uint8_t cShift, uint32_t *pEFlags); */
/* void iemAImpl_sar_u32(uint32_t *pu32Dst, uint8_t cShift, uint32_t *pEFlags); */
.macro sar_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdSuff, a_StSuff
        .p2align 2
BEGINPROC \a_Name
        .cfi_startproc

        /* Do we need to shift anything at all? */
        and     w1, w1, #0x1f
        cbz     w1, 99f

        /* Load EFLAGS before we start the calculation. */
        ldr     w10, [x2]           /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */

        /*
         * Do the shifting.
         */
        ldr\a_LdSuff w8, [x0]       /* Sign-extending load for 8 and 16 bits! */
        asrv    w9, w8, w1
        str\a_StSuff w9, [x0]

        /*
         * Calculate EFLAGS.
         */
        sub     w11, w1, #1
        lsrv    w11, w8, w11
        bfxil   w10, w11, #X86_EFL_CF_BIT, #1

.ifne \a_fIntelFlags
        mov     w11, ~(X86_EFL_AF | X86_EFL_OF)
        and     w10, w10, w11       /* AF and OF are cleared */
.else
        orr     w10, w10, X86_EFL_AF            /* AF is set */
        and     w10, w10, ~X86_EFL_OF           /* OF is cleared */
.endif

        CALC_EFLAGS_PARITY w10, w9, w11

.ifne \a_cBits < 32
        setf\a_cBits w9             /* Sets NZ */
.else
        ands    wzr, w9, w9         /* Sets NZ */
.endif
        mrs     x11, NZCV
        lsr     w11, w11, #30       /* N=1; Z=0 */
        bfi     w10, w11, X86_EFL_ZF_BIT, 2     /* EFLAGS.ZF and EFLAGS.SF */

        str     w10, [x2]
99:
        ret
        .cfi_endproc
.endm

sar_8_16_32 iemAImpl_sar_u8, 8, 1, sb, b
sar_8_16_32 iemAImpl_sar_u8_intel, 8, 1, sb, b
sar_8_16_32 iemAImpl_sar_u8_amd, 8, 0, sb, b

sar_8_16_32 iemAImpl_sar_u16, 16, 1, sh, h
sar_8_16_32 iemAImpl_sar_u16_intel, 16, 1, sh, h
sar_8_16_32 iemAImpl_sar_u16_amd, 16, 0, sh, h

sar_8_16_32 iemAImpl_sar_u32, 32, 1, ,
sar_8_16_32 iemAImpl_sar_u32_intel, 32, 1, ,
sar_8_16_32 iemAImpl_sar_u32_amd, 32, 0, ,
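
/* Illustrative C model of the 8-bit SAR above (a sketch; the name is made
   up, and the arithmetic right shift of a negative value is assumed to be
   sign-propagating, as on all mainstream compilers). The sign-extending load
   lets a plain arithmetic shift produce the right result; CF is the last bit
   shifted out of the sign-extended value, and OF is always cleared for SAR:

        static uint8_t sarU8Sketch(uint8_t uDst, unsigned cShift,  // cShift: 1..31
                                   uint32_t *pfEfl)
        {
            int32_t iExt = (int8_t)uDst;                           // ldrsb equivalent
            *pfEfl = (*pfEfl & ~(X86_EFL_CF | X86_EFL_OF))
                   | (((uint32_t)iExt >> (cShift - 1)) & X86_EFL_CF);
            return (uint8_t)(iExt >> cShift);                      // asrv equivalent
        }
*/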

/** @todo this is slightly slower than the C version (release) on an M2. Investigate why. */
/* void iemAImpl_sar_u64(uint64_t *pu64Dst, uint8_t cShift, uint32_t *pEFlags); */
.macro sar_64, a_Name, a_fIntelFlags
        .p2align 2
BEGINPROC \a_Name
        .cfi_startproc

        /* Do we need to shift anything at all? */
        ands    w1, w1, #0x3f
        b.eq    99f

        /* Load EFLAGS before we start the calculation. */
        ldr     w10, [x2]           /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */

        /*
         * Do the shifting.
         */
        ldr     x8, [x0]
        asrv    x9, x8, x1
        str     x9, [x0]

        /*
         * Calculate EFLAGS.
         */
        sub     w11, w1, #1
        lsrv    x11, x8, x11
        bfxil   w10, w11, #X86_EFL_CF_BIT, #1

.ifne \a_fIntelFlags
        mov     w11, ~(X86_EFL_AF | X86_EFL_OF)
        and     w10, w10, w11       /* AF and OF are cleared */
.else
        orr     w10, w10, X86_EFL_AF            /* AF is set */
        and     w10, w10, ~X86_EFL_OF           /* OF is cleared */
.endif

        CALC_EFLAGS_PARITY w10, w9, w11

        ands    xzr, x9, x9         /* Sets NZ */
        mrs     x11, NZCV
        lsr     w11, w11, #30       /* N=1; Z=0 */
        bfi     w10, w11, X86_EFL_ZF_BIT, 2     /* EFLAGS.ZF and EFLAGS.SF */

        str     w10, [x2]
99:
        ret
        .cfi_endproc
.endm

sar_64 iemAImpl_sar_u64, 1
sar_64 iemAImpl_sar_u64_intel, 1
sar_64 iemAImpl_sar_u64_amd, 0
