VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl-arm64.S@ 103853

Last change on this file since 103853 was 103003, checked in by vboxsync, 11 months ago

VMM/IEM: Assembly version of iemAImpl_sub_*. bugref:10376

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 11.4 KB
Line 
1/* $Id: IEMAllAImpl-arm64.S 103003 2024-01-23 16:19:17Z vboxsync $ */
2/** @file
3 * IEM - Instruction Implementation in Assembly, ARM64 variant.
4 */
5
6/*
7 * Copyright (C) 2023 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28
29/*********************************************************************************************************************************
30* Header Files *
31*********************************************************************************************************************************/
32#include <iprt/asmdefs-arm.h>
33#include <iprt/x86.h>
34
35
/* Select a target the assembler will accept the cfinv instruction for
   (cfinv is used by CALC_EFLAGS below). */
#if RT_CLANG_PREREQ(15, 0)
        .arch_extension flagm /* not necessary */
#else
        /* clang 12.0.x defaults to apple-a12.  M1 is more similar to A14, I guess.
           For some reason the +crc makes cfinv work (with clang 12).  'flagm' isn't
           recognized, nor is the 'fmi' in the error message for cfinv.  'flagm'
           works for v15 and seems to be enabled by default. */
        .cpu apple-a14+crc
#endif
45
46
/*
 * CALC_EFLAGS - fold the outcome of the preceding flag-setting ARM instruction
 * into an x86 EFLAGS value.
 *
 * Parameters:
 *   regEfl      Register holding EFLAGS; the status bits are replaced in place.
 *   regResult   Result of the operation (feeds PF and AF).
 *   regLeft     Left operand (feeds AF).
 *   regRight    Right operand (feeds AF).
 *   regTmp      Scratch register, clobbered.
 *   fSkipFlags  Mask of X86_EFL_CF / ZF / SF / OF bits that must NOT be taken
 *               from NZCV (default 0 = take all four).  PF and AF are always
 *               recalculated.
 *
 * The ARM carry flag is interpreted as "no borrow" (as left by SUBS), so it is
 * inverted on its way into X86_EFL_CF.
 *
 * Side effects: the two fast paths (fSkipFlags == 0 and fSkipFlags == X86_EFL_OF)
 * execute CFINV, so PSTATE.C is left inverted.  They also read NZCV with MRS;
 * the callers in this file pass X registers for all register parameters.
 */
.macro CALC_EFLAGS, regEfl, regResult, regLeft, regRight, regTmp, fSkipFlags=0
 .if \fSkipFlags == 0
        /*
         * Fast path, all four flags: pull N, Z, C and V out of NZCV by shifting.
         */
        cfinv                                           /* ARM C = !borrow -> x86 CF = borrow */
        mrs     \regTmp, NZCV                           /* [31] = N; [30] = Z; [29] = C; [28] = V */
        lsr     \regTmp, \regTmp, #28                   /* bit 0 = V */
        bfi     \regEfl, \regTmp, #X86_EFL_OF_BIT, #1   /* OF = V */
        lsr     \regTmp, \regTmp, #1                    /* bit 0 = C */
        bfi     \regEfl, \regTmp, #X86_EFL_CF_BIT, #1   /* CF = C (already inverted) */
        lsr     \regTmp, \regTmp, #1                    /* bit 0 = Z, bit 1 = N */
        bfi     \regEfl, \regTmp, #X86_EFL_ZF_BIT, #2   /* SF,ZF = N,Z */
 .elseif \fSkipFlags == X86_EFL_OF
        /*
         * Fast path without OF: same as above, but V is shifted out unused and
         * the OF bit in regEfl is left untouched for the caller to fill in.
         */
        cfinv                                           /* ARM C = !borrow -> x86 CF = borrow */
        mrs     \regTmp, NZCV                           /* [31] = N; [30] = Z; [29] = C; [28] = V */
        lsr     \regTmp, \regTmp, #29                   /* bit 0 = C */
        bfi     \regEfl, \regTmp, #X86_EFL_CF_BIT, #1   /* CF = C (already inverted) */
        lsr     \regTmp, \regTmp, #1                    /* bit 0 = Z, bit 1 = N */
        bfi     \regEfl, \regTmp, #X86_EFL_ZF_BIT, #2   /* SF,ZF = N,Z */
 .else
        /*
         * Generic path: one CSET + BFI pair per wanted flag.  Slower than the
         * NZCV shifting above, but any subset of flags can be skipped.
         */
  .ifeq (\fSkipFlags & X86_EFL_CF)
        cset    \regTmp, cc                             /* carry clear = borrow */
        bfi     \regEfl, \regTmp, #X86_EFL_CF_BIT, #1
  .endif
  .ifeq (\fSkipFlags & X86_EFL_ZF)
        cset    \regTmp, eq
        bfi     \regEfl, \regTmp, #X86_EFL_ZF_BIT, #1
  .endif
  .ifeq (\fSkipFlags & X86_EFL_SF)
        cset    \regTmp, mi
        bfi     \regEfl, \regTmp, #X86_EFL_SF_BIT, #1
  .endif
  .ifeq (\fSkipFlags & X86_EFL_OF)
        cset    \regTmp, vs
        bfi     \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
  .endif
 .endif

        /*
         * PF: set when the low byte of the result has an even number of 1 bits.
         * The three folds XOR bits 7:0 of the result together into bit 0, which
         * is then inverted.
         */
        eor     \regTmp, \regResult, \regResult, LSR #4
        eor     \regTmp, \regTmp, \regTmp, LSR #2
        eor     \regTmp, \regTmp, \regTmp, LSR #1
        eor     \regTmp, \regTmp, #1
        bfi     \regEfl, \regTmp, #X86_EFL_PF_BIT, #1   /* PF(2) = (popcount(result & 0xff) & 1) ^ 1 */

        /*
         * AF: auxiliary carry / borrow out of bit 3 (8-bit BCD support), which
         * is bit 4 of left ^ right ^ result.
         */
        eor     \regTmp, \regLeft, \regRight
        eor     \regTmp, \regTmp, \regResult
        lsr     \regTmp, \regTmp, #X86_EFL_AF_BIT
        bfi     \regEfl, \regTmp, #X86_EFL_AF_BIT, #1   /* AF(4) = ((left ^ right ^ result) >> X86_EFL_AF_BIT) & 1 */
.endm
121
122
BEGINCODE

/*
 * Placeholder symbol.  Executing it hits BRK #1; the RET follows in case
 * execution is resumed past the breakpoint.
 */
        .globl          NAME(iemAImpl_placeholder)
        .private_extern NAME(iemAImpl_placeholder)
        .p2align        2
NAME(iemAImpl_placeholder):
        brk     #1
        ret
130
131/* Some sketches.
132
133// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked, (uint8_t *pu8Mem, uint8_t *pu8Reg));
134 .p2align 2
135 .private_extern NAME(iemAImpl_xchg_u8_locked)
136 .globl NAME(iemAImpl_xchg_u8_locked)
137NAME(iemAImpl_xchg_u8_locked):
138 ldrb w2, [x1]
139 swpalb w2, w2, [x0]
140 strb w2, [x1]
141 ret
142
143// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_locked,(uint16_t *pu16Mem, uint16_t *pu16Reg));
144 .p2align 2
145 .private_extern NAME(iemAImpl_xchg_u16_locked)
146 .globl NAME(iemAImpl_xchg_u16_locked)
147NAME(iemAImpl_xchg_u16_locked):
148 ldrh w2, [x1]
149 swpalh w2, w2, [x0]
150 strh w2, [x1]
151 ret
152
153// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_locked,(uint32_t *pu32Mem, uint32_t *pu32Reg));
154// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_locked,(uint64_t *pu64Mem, uint64_t *pu64Reg));
155
156*/
157
158
159/* IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked, (uint8_t *pu8Mem, uint8_t *pu8Reg)); */
160
161/*
 * The SUB instruction.
163 */
164
/*
 * void iemAImpl_sub_u8(uint8_t *puDst, uint8_t uSrc, uint32_t *pEFlags);
 *
 * 8-bit SUB: *puDst = *puDst - uSrc, with CF, PF, AF, ZF, SF and OF in
 * *pEFlags updated to match.
 *
 * In:       x0 = puDst, w1 = uSrc, x2 = pEFlags
 * Clobbers: x8-x12, NZCV
 */
        .p2align 2
        .private_extern NAME(iemAImpl_sub_u8)
        .globl          NAME(iemAImpl_sub_u8)
NAME(iemAImpl_sub_u8):
        .cfi_startproc
        /* Do the subtraction. */
        ldrb    w8, [x0]
        /*and     w1, w1, #0xff - should not be necessary. */
        subs    w9, w8, w1                      /* w9 = w8 (*puDst) - w1 (uSrc); C = no borrow */
        setf8   w9                              /* redo N and Z for the low 8 bits; C is kept */
        strb    w9, [x0]

        /* Load EFLAGS. */
        ldr     w10, [x2]                       /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */
        /* Trim the result to the 8-bit operand width.  Everything below only
           looks at bits 7:0 of w9 (PF: bits 7:0, AF: bit 4, OF: bit 7). */
        and     w9, w9, #0xff
        CALC_EFLAGS x10, x9, x8, x1, x11, X86_EFL_OF

        /* The overflow flag calc done by setf8 isn't correct for subtraction, so we have to
           figure it out ourselves.  (See IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC for details.)
           OF = bit 7 of (dst ^ src) & (dst ^ result). */
        eor     w11, w8, w1                     /* input dst ^ source (simplified from ~(dst ^ (source ^ 0x80)) ). */
        eor     w12, w8, w9                     /* input dst ^ result */
        and     w11, w12, w11
        lsr     w11, w11, #7                    /* sign bit of the 8-bit operands -> bit 0 */
        bfi     w10, w11, #X86_EFL_OF_BIT, #1

        /* Done with EFLAGS. */
        str     w10, [x2]
        ret
        .cfi_endproc
195
196
/*
 * void iemAImpl_sub_u16(uint16_t *puDst, uint16_t uSrc, uint32_t *pEFlags);
 *
 * 16-bit SUB: *puDst = *puDst - uSrc, with CF, PF, AF, ZF, SF and OF in
 * *pEFlags updated to match.
 *
 * In:       x0 = puDst, w1 = uSrc, x2 = pEFlags
 * Clobbers: x8-x12, NZCV
 */
        .p2align 2
        .globl          NAME(iemAImpl_sub_u16)
        .private_extern NAME(iemAImpl_sub_u16)
NAME(iemAImpl_sub_u16):
        .cfi_startproc
        /* Subtract in 32 bits, then redo N and Z for the low 16 bits. */
        ldrh    w8, [x0]                        /* w8 = *puDst */
        /* w1 is used as passed; masking it with #0xffff should not be necessary. */
        subs    w9, w8, w1                      /* w9 = w8 - w1; C = no borrow */
        setf16  w9                              /* C is kept */
        strh    w9, [x0]                        /* *puDst = result */

        /* Fetch EFLAGS and update everything except OF. */
        ldr     w10, [x2]                       /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */
        uxth    w9, w9                          /* keep the 16-bit result only */
        CALC_EFLAGS x10, x9, x8, x1, x11, X86_EFL_OF

        /* setf16 does not produce the right overflow flag for a subtraction, so
           derive it here: OF = bit 15 of (dst ^ result) & (dst ^ src).
           (See IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC for details.) */
        eor     w12, w9, w8                     /* result ^ input dst */
        eor     w11, w1, w8                     /* source ^ input dst */
        and     w11, w11, w12
        ubfx    w11, w11, #15, #1               /* sign bit of the 16-bit operands -> bit 0 */
        bfi     w10, w11, #X86_EFL_OF_BIT, #1

        /* Write EFLAGS back. */
        str     w10, [x2]
        ret
        .cfi_endproc
227
228
/*
 * void iemAImpl_sub_u32(uint32_t *puDst, uint32_t uSrc, uint32_t *pEFlags);
 *
 * 32-bit SUB: *puDst = *puDst - uSrc, with CF, PF, AF, ZF, SF and OF in
 * *pEFlags updated to match.
 *
 * In:       x0 = puDst, w1 = uSrc, x2 = pEFlags
 * Clobbers: x8-x11, NZCV
 */
        .p2align 2
        .globl          NAME(iemAImpl_sub_u32)
        .private_extern NAME(iemAImpl_sub_u32)
NAME(iemAImpl_sub_u32):
        .cfi_startproc
        /* 32-bit SUBS leaves NZCV exactly as needed for this operand size. */
        ldr     w8, [x0]                        /* w8 = *puDst */
        subs    w9, w8, w1                      /* w9 = w8 - w1 (uSrc) */
        str     w9, [x0]                        /* *puDst = result */

        /* Fetch EFLAGS, fold in all six status flags, and write it back. */
        ldr     w10, [x2]                       /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */
        CALC_EFLAGS x10, x9, x8, x1, x11
        str     w10, [x2]
        ret
        .cfi_endproc
293
294
/*
 * void iemAImpl_sub_u64(uint64_t *puDst, uint64_t uSrc, uint32_t *pEFlags);
 *
 * 64-bit SUB: *puDst = *puDst - uSrc, with CF, PF, AF, ZF, SF and OF in
 * *pEFlags updated to match.
 *
 * In:       x0 = puDst, x1 = uSrc, x2 = pEFlags
 * Clobbers: x8-x11, NZCV
 */
        .p2align 2
        .globl          NAME(iemAImpl_sub_u64)
        .private_extern NAME(iemAImpl_sub_u64)
NAME(iemAImpl_sub_u64):
        .cfi_startproc
        /* 64-bit SUBS leaves NZCV exactly as needed for this operand size. */
        ldr     x8, [x0]                        /* x8 = *puDst */
        subs    x9, x8, x1                      /* x9 = x8 - x1 (uSrc) */
        str     x9, [x0]                        /* *puDst = result */

        /* Fetch EFLAGS, fold in all six status flags, and write it back. */
        ldr     w10, [x2]                       /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */
        CALC_EFLAGS x10, x9, x8, x1, x11
        str     w10, [x2]
        ret
        .cfi_endproc
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette