IEMAllAImpl-arm64.S@ 103862

Last change on this file since 103862 was 103003, checked in by vboxsync, 11 months ago
VMM/IEM: Assembly version of iemAImpl_sub_*. bugref:10376
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 11.4 KB

Line
1	/* $Id: IEMAllAImpl-arm64.S 103003 2024-01-23 16:19:17Z vboxsync $ */
2	/** @file
3	* IEM - Instruction Implementation in Assembly, ARM64 variant.
4	*/
5
6	/*
7	* Copyright (C) 2023 Oracle and/or its affiliates.
8	*
9	* This file is part of VirtualBox base platform packages, as
10	* available from https://www.virtualbox.org.
11	*
12	* This program is free software; you can redistribute it and/or
13	* modify it under the terms of the GNU General Public License
14	* as published by the Free Software Foundation, in version 3 of the
15	* License.
16	*
17	* This program is distributed in the hope that it will be useful, but
18	* WITHOUT ANY WARRANTY; without even the implied warranty of
19	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20	* General Public License for more details.
21	*
22	* You should have received a copy of the GNU General Public License
23	* along with this program; if not, see <https://www.gnu.org/licenses>.
24	*
25	* SPDX-License-Identifier: GPL-3.0-only
26	*/
27
28
29	/*********************************************************************************************************************************
30	* Header Files *
31	*********************************************************************************************************************************/
32	#include <iprt/asmdefs-arm.h>
33	#include <iprt/x86.h>
34
35
/*
 * Make sure the assembler accepts the flag manipulation instructions this
 * file relies on (cfinv, setf8 and setf16).
 */
#if RT_CLANG_PREREQ(15, 0)
        .arch_extension flagm /* not necessary */
#else
        /* clang 12.0.x defaults to apple-a12.  M1 is more similar to A14, I guess.
           For some reason the +crc makes cfinv work (with clang 12).  'flagm' isn't
           recognized, nor is the 'fmi' in the error message for cfinv.  'flagm'
           works for v15 and is enabled by default, it seems. */
        .cpu apple-a14+crc
#endif
45
46
/**
 * CALC_EFLAGS - merge the outcome of an arithmetic operation into an x86
 * EFLAGS image.
 *
 * Must be expanded while the ARM NZCV flags still hold the outcome of the
 * flag-setting instruction.  All updates to \regEfl are single-bit (or
 * two-bit for SF+ZF) bfi inserts, other EFLAGS bits are left alone.
 *
 * @param   regEfl      Register holding the EFLAGS value, updated in place.
 * @param   regResult   Register holding the arithmetic result (PF and AF input).
 * @param   regLeft     Register holding the left/destination operand (AF input).
 * @param   regRight    Register holding the right/source operand (AF input).
 * @param   regTmp      Scratch register, clobbered.
 * @param   fSkipFlags  X86_EFL_xxx mask of flags that must NOT be taken from
 *                      NZCV.  0 and X86_EFL_OF select the mrs based fast
 *                      path; any other value selects the cset based path,
 *                      which honours X86_EFL_ZF, X86_EFL_CF, X86_EFL_OF and
 *                      X86_EFL_SF individually.
 *
 * @note    The fast path executes cfinv, so the ARM carry flag is inverted
 *          when the macro is done.
 * @note    PF and AF are always calculated; fSkipFlags does not cover them.
 * @note    Only bit 0 of the folded parity value and bit 4 of the AF value
 *          are inserted, so bits above 7 in \regResult do not influence PF.
 */
.macro CALC_EFLAGS, regEfl, regResult, regLeft, regRight, regTmp, fSkipFlags=0
        /*
         * Translate the arm NZCV bits into corresponding EFLAGS bits.
         */
 .if \fSkipFlags == 0 || \fSkipFlags == X86_EFL_OF
#if 0
        /* Maybe just a tiny bit slower than the next one. */
        mrs     \regTmp, NZCV                   /* [31] = N; [30] = Z; [29] = C; [28] = V */
  .ifeq (\fSkipFlags & X86_EFL_OF)
        lsr     \regTmp, \regTmp, #28
        bfi     \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
        lsr     \regTmp, \regTmp, #1
  .else
        lsr     \regTmp, \regTmp, #29
  .endif
        eor     \regTmp, \regTmp, #1            /* inverts the carry flag to x86 style. */
        bfi     \regEfl, \regTmp, #X86_EFL_CF_BIT, #1  /* CF(0) = C */
        lsr     \regTmp, \regTmp, #1
        bfi     \regEfl, \regTmp, #X86_EFL_ZF_BIT, #2  /* SF(7),ZF(6) = NZ */
#else
        /* This seems to be the faster one... */
        cfinv                                   /* ARM C is 'no borrow', x86 CF is 'borrow'. */
        mrs     \regTmp, NZCV                   /* [31] = N; [30] = Z; [29] = C; [28] = V */
  .ifeq (\fSkipFlags & X86_EFL_OF)
        lsr     \regTmp, \regTmp, #28           /* bit 0 = V */
        bfi     \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
        lsr     \regTmp, \regTmp, #1            /* bit 0 = C */
  .else
        lsr     \regTmp, \regTmp, #29           /* bit 0 = C (V is skipped) */
  .endif
        bfi     \regEfl, \regTmp, #X86_EFL_CF_BIT, #1  /* CF(0) = C */
        lsr     \regTmp, \regTmp, #1            /* bit 0 = Z, bit 1 = N */
        bfi     \regEfl, \regTmp, #X86_EFL_ZF_BIT, #2  /* SF(7),ZF(6) = NZ */
#endif
 .else
        /* Definitely slower than the above two, but easier to handle wrt skipping parts. */
  .ifeq (\fSkipFlags & X86_EFL_ZF)
        cset    \regTmp, eq
        bfi     \regEfl, \regTmp, #X86_EFL_ZF_BIT, #1
  .endif
  .ifeq (\fSkipFlags & X86_EFL_CF)
        cset    \regTmp, cc                     /* carry clear on ARM = borrow = x86 CF set */
        bfi     \regEfl, \regTmp, #X86_EFL_CF_BIT, #1
  .endif
  .ifeq (\fSkipFlags & X86_EFL_OF)
        cset    \regTmp, vs
        bfi     \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
  .endif
  .ifeq (\fSkipFlags & X86_EFL_SF)
        cset    \regTmp, mi
        bfi     \regEfl, \regTmp, #X86_EFL_SF_BIT, #1
  .endif
 .endif


        /*
         * Parity calculation for low byte of the result (sucks that there is no popcount for gprs).
         * The three folds XOR bits 0 thru 7 into bit 0; the final eor flips it since
         * x86 PF is set for an even number of one bits.
         */
        eor     \regTmp, \regResult, \regResult, LSR #4
        eor     \regTmp, \regTmp, \regTmp, LSR #2
        eor     \regTmp, \regTmp, \regTmp, LSR #1
        eor     \regTmp, \regTmp, #1
        bfi     \regEfl, \regTmp, #X86_EFL_PF_BIT, #1  /* PF(2) = popcount(result & 0xff) & 1 ^ 1 */

        /*
         * Auxiliary carry / borrow flag.  This is related to 8-bit BCD.
         */
        eor     \regTmp, \regLeft, \regRight
        eor     \regTmp, \regTmp, \regResult
        lsr     \regTmp, \regTmp, #X86_EFL_AF_BIT
        bfi     \regEfl, \regTmp, #X86_EFL_AF_BIT, #1  /* AF(4) = ((left ^ right ^ result) & X86_EFL_AF) >> X86_EFL_AF_BIT */

        /* done */
.endm
121
122
/*
 * Start of the code section, with a placeholder symbol.
 *
 * iemAImpl_placeholder takes no inputs and produces no outputs; it executes
 * a breakpoint instruction (brk #1) should anything ever call it.
 */
BEGINCODE
        .p2align        2
        .private_extern NAME(iemAImpl_placeholder)
        .globl          NAME(iemAImpl_placeholder)
NAME(iemAImpl_placeholder):
        brk     #1
        ret
130
/* Some sketches.

// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked, (uint8_t  *pu8Mem,  uint8_t  *pu8Reg));
        .p2align        2
        .private_extern NAME(iemAImpl_xchg_u8_locked)
        .globl          NAME(iemAImpl_xchg_u8_locked)
NAME(iemAImpl_xchg_u8_locked):
        ldrb    w2, [x1]
        swpalb  w2, w2, [x0]
        strb    w2, [x1]
        ret

// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_locked,(uint16_t *pu16Mem, uint16_t *pu16Reg));
        .p2align        2
        .private_extern NAME(iemAImpl_xchg_u16_locked)
        .globl          NAME(iemAImpl_xchg_u16_locked)
NAME(iemAImpl_xchg_u16_locked):
        ldrh    w2, [x1]
        swpalh  w2, w2, [x0]
        strh    w2, [x1]
        ret

// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_locked,(uint32_t *pu32Mem, uint32_t *pu32Reg));
// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_locked,(uint64_t *pu64Mem, uint64_t *pu64Reg));

*/
157
158
/* IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked, (uint8_t *pu8Mem, uint8_t *pu8Reg)); */
160
/*
 * The SUB instruction.
 */
164
/**
 * void iemAImpl_sub_u8(uint8_t *puDst, uint8_t uSrc, uint32_t *pEFlags);
 *
 * 8-bit SUB: stores *puDst - uSrc back to *puDst and updates the CF, PF, AF,
 * ZF, SF and OF bits in *pEFlags.  (Prototype reconstructed from the register
 * usage below.)
 *
 * In:          x0 = puDst, w1 = uSrc, x2 = pEFlags
 * Out:         nothing in registers; *puDst and *pEFlags are updated.
 * Clobbers:    w8, w9, w10, w11, w12 and NZCV.
 *
 * NOTE(review): relies on the caller passing uSrc zero-extended to 32 bits;
 * the carry produced by the 32-bit subs would be off otherwise.
 */
        .p2align        2
        .private_extern NAME(iemAImpl_sub_u8)
        .globl          NAME(iemAImpl_sub_u8)
NAME(iemAImpl_sub_u8):
        .cfi_startproc
        /* Do the subtraction. */
        ldrb    w8, [x0]
        /*and     w1, w1, #0xff - should not be necessary. */
        subs    w9, w8, w1                      /* w9 = w8 (*puDst) - w1 (uSrc) */
        setf8   w9                              /* N and Z from the 8-bit result; C stays as set by subs. */
        strb    w9, [x0]

        /* Load EFLAGS. */
        ldr     w10, [x2]                       /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */
        and     w9, w9, #0xff                   /* keep just the 8-bit result */
        CALC_EFLAGS x10, x9, x8, x1, x11, X86_EFL_OF

        /* The overflow flag calc done by setf8 isn't correct for subtraction, so we have to
           figure it out ourselves. (See IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC for details.) */
        eor     w11, w8, w1                     /* input dst ^ source (simplified from ~(dst ^ (source ^ 0x80)) ). */
        eor     w12, w8, w9                     /* input dst ^ result */
        and     w11, w12, w11
        lsr     w11, w11, #7                    /* sign bit of the 8-bit value -> bit 0 */
        bfi     w10, w11, #X86_EFL_OF_BIT, #1

        /* Done with EFLAGS. */
        str     w10, [x2]
        ret
        .cfi_endproc
195
196
/**
 * void iemAImpl_sub_u16(uint16_t *puDst, uint16_t uSrc, uint32_t *pEFlags);
 *
 * 16-bit SUB: stores *puDst - uSrc back to *puDst and updates the CF, PF, AF,
 * ZF, SF and OF bits in *pEFlags.  (Prototype reconstructed from the register
 * usage below.)
 *
 * In:          x0 = puDst, w1 = uSrc, x2 = pEFlags
 * Out:         nothing in registers; *puDst and *pEFlags are updated.
 * Clobbers:    w8, w9, w10, w11, w12 and NZCV.
 *
 * NOTE(review): relies on the caller passing uSrc zero-extended to 32 bits;
 * the carry produced by the 32-bit subs would be off otherwise.
 */
        .p2align        2
        .private_extern NAME(iemAImpl_sub_u16)
        .globl          NAME(iemAImpl_sub_u16)
NAME(iemAImpl_sub_u16):
        .cfi_startproc
        /* Do the subtraction. */
        ldrh    w8, [x0]
        /*and     w1, w1, #0xffff - should not be necessary. */
        subs    w9, w8, w1                      /* w9 = w8 (*puDst) - w1 (uSrc) */
        setf16  w9                              /* N and Z from the 16-bit result; C stays as set by subs. */
        strh    w9, [x0]

        /* Load EFLAGS. */
        ldr     w10, [x2]                       /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */
        and     w9, w9, #0xffff                 /* keep just the 16-bit result */
        CALC_EFLAGS x10, x9, x8, x1, x11, X86_EFL_OF

        /* The overflow flag calc done by setf16 isn't correct for subtraction, so we have to
           figure it out ourselves. (See IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC for details.) */
        eor     w11, w8, w1                     /* input dst ^ source (simplified from ~(dst ^ (source ^ 0x8000)) ). */
        eor     w12, w8, w9                     /* input dst ^ result */
        and     w11, w12, w11
        lsr     w11, w11, #15                   /* sign bit of the 16-bit value -> bit 0 */
        bfi     w10, w11, #X86_EFL_OF_BIT, #1

        /* Done with EFLAGS. */
        str     w10, [x2]
        ret
        .cfi_endproc
227
228
/**
 * void iemAImpl_sub_u32(uint32_t *puDst, uint32_t uSrc, uint32_t *pEFlags);
 *
 * 32-bit SUB: stores *puDst - uSrc back to *puDst and updates the CF, PF, AF,
 * ZF, SF and OF bits in *pEFlags.  (Prototype reconstructed from the register
 * usage below.)
 *
 * In:          x0 = puDst, w1 = uSrc, x2 = pEFlags
 * Out:         nothing in registers; *puDst and *pEFlags are updated.
 * Clobbers:    w8, w9, w10, w11 and NZCV.
 */
        .p2align        2
        .private_extern NAME(iemAImpl_sub_u32)
        .globl          NAME(iemAImpl_sub_u32)
NAME(iemAImpl_sub_u32):
        .cfi_startproc
        /* Do the subtraction. */
        ldr     w8, [x0]
        subs    w9, w8, w1                      /* w9 = w8 (*puDst) - w1 (uSrc) */
        str     w9, [x0]

        /* Load EFLAGS. */
        ldr     w10, [x2]                       /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */

        /* The NZCV flags from the 32-bit subs are used as they are, so nothing is skipped. */
        CALC_EFLAGS x10, x9, x8, x1, x11

        /* Done with EFLAGS. */
        str     w10, [x2]
        ret
        .cfi_endproc
293
294
/**
 * void iemAImpl_sub_u64(uint64_t *puDst, uint64_t uSrc, uint32_t *pEFlags);
 *
 * 64-bit SUB: stores *puDst - uSrc back to *puDst and updates the CF, PF, AF,
 * ZF, SF and OF bits in *pEFlags.  (Prototype reconstructed from the register
 * usage below.)
 *
 * In:          x0 = puDst, x1 = uSrc, x2 = pEFlags
 * Out:         nothing in registers; *puDst and *pEFlags are updated.
 * Clobbers:    x8, x9, x10, x11 and NZCV.
 */
        .p2align        2
        .private_extern NAME(iemAImpl_sub_u64)
        .globl          NAME(iemAImpl_sub_u64)
NAME(iemAImpl_sub_u64):
        .cfi_startproc
        /* Do the subtraction. */
        ldr     x8, [x0]
        subs    x9, x8, x1                      /* x9 = x8 (*puDst) - x1 (uSrc) */
        str     x9, [x0]

        /* Load EFLAGS. */
        ldr     w10, [x2]                       /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */

        /* The NZCV flags from the 64-bit subs are used as they are, so nothing is skipped. */
        CALC_EFLAGS x10, x9, x8, x1, x11

        /* Done with EFLAGS. */
        str     w10, [x2]
        ret
        .cfi_endproc

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format