IEMAllAImpl-arm64.S@ 104195

Last change on this file since 104195 was 104195, checked in by vboxsync, 8 months ago
VMM/IEM: Refactoring assembly helpers to not pass eflags by reference but instead by value and return the updated value (via eax/w0) - first chunk: ADD,ADC,SUB,SBB,CMP,TEST,AND,OR,XOR. bugref:10376
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 24.4 KB

Line
1	/* $Id: IEMAllAImpl-arm64.S 104195 2024-04-05 14:45:23Z vboxsync $ */
2	/** @file
3	* IEM - Instruction Implementation in Assembly, ARM64 variant.
4	*/
5
6	/*
7	* Copyright (C) 2023 Oracle and/or its affiliates.
8	*
9	* This file is part of VirtualBox base platform packages, as
10	* available from https://www.virtualbox.org.
11	*
12	* This program is free software; you can redistribute it and/or
13	* modify it under the terms of the GNU General Public License
14	* as published by the Free Software Foundation, in version 3 of the
15	* License.
16	*
17	* This program is distributed in the hope that it will be useful, but
18	* WITHOUT ANY WARRANTY; without even the implied warranty of
19	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20	* General Public License for more details.
21	*
22	* You should have received a copy of the GNU General Public License
23	* along with this program; if not, see <https://www.gnu.org/licenses>.
24	*
25	* SPDX-License-Identifier: GPL-3.0-only
26	*/
27
28
29	/*********************************************************************************************************************************
30	* Header Files *
31	*********************************************************************************************************************************/
32	#include <iprt/asmdefs-arm.h>
33	#include <iprt/x86.h>
34
35
36	#if RT_CLANG_PREREQ(15, 0)
37	.arch_extension flagm /* not necessary; flagm is what provides the cfinv / setf8 / setf16 instructions used below. */
38	#else
39	/* clang 12.0.x defaults to apple-a12. M1 is more similar to A14, I guess.
40	For some reason the +crc makes cfinv work (with clang 12). 'flagm' isn't
41	recognized, nor is the 'fmi' in the error message for cfinv. 'flagm'
42	works for v15 and seems to be enabled by default. */
43	.cpu apple-a14+crc
44	#endif
45	/* BEGINPROC a_Name: emits the symbol directives and the entry label for procedure a_Name. Alignment and .cfi_startproc are left to the user of the macro. */
46	.macro BEGINPROC, a_Name
47	.private_extern NAME(\a_Name)
48	.globl NAME(\a_Name)
49	NAME(\a_Name):
50	.endm
51
52
53	.macro CALC_EFLAGS_PARITY, regEfl, regResult, regTmp
54	/*
55	* Parity calculation for the low byte of the result: sets EFLAGS.PF in regEfl from regResult bits 7:0. There is no popcount for GPRs, so the eight bits are XOR-folded into bit 0 of regTmp. Clobbers regTmp.
56	*/
57	eor \regTmp, \regResult, \regResult, LSR #4 /* bits 3:0 = low nibble ^ high nibble */
58	eor \regTmp, \regTmp, \regTmp, LSR #2 /* bits 1:0 = XOR of the two bit pairs */
59	eor \regTmp, \regTmp, \regTmp, LSR #1 /* bit 0 = XOR of result bits 7:0 (1 = odd number of set bits) */
60	eor \regTmp, \regTmp, #1 /* PF is set for an even number of set bits, so invert bit 0. */
61	bfi \regEfl, \regTmp, #X86_EFL_PF_BIT, #1 /* PF(2) = popcount(regResult & 0xff) & 1 ^ 1 */
62	.endm
63
64
65	.macro CALC_EFLAGS_AUX_CARRY, regEfl, regResult, regLeft, regRight, regTmp
66	/*
67	* Auxiliary carry / borrow flag. This is related to 8-bit BCD. Sets EFLAGS.AF in regEfl to bit 4 of (regLeft ^ regRight ^ regResult). Clobbers regTmp.
68	*/
69	eor \regTmp, \regLeft, \regRight
70	eor \regTmp, \regTmp, \regResult /* bit 4 is set only if a carry/borrow crossed from bit 3 into bit 4. */
71	lsr \regTmp, \regTmp, #X86_EFL_AF_BIT /* move that bit down to bit 0 for the bfi below */
72	bfi \regEfl, \regTmp, #X86_EFL_AF_BIT, #1 /* AF(4) = ((regLeft ^ regRight ^ regResult) & X86_EFL_AF) >> X86_EFL_AF_BIT */
73	.endm
74
75	.macro CALC_EFLAGS, regEfl, regResult, regLeft, regRight, regTmp, fSkipFlags=0
76	/*
77	* Merges the x86 status flags for a subtraction into regEfl: CF, ZF, SF and OF are
78	* taken from the arm NZCV flags (C inverted, i.e. NZCV as left by subs/setf), PF and AF
79	* from regResult, regLeft and regRight. CF/ZF/SF/OF bits set in fSkipFlags are left
80	* untouched; anything but 0 or X86_EFL_OF selects the slower cset path. Clobbers regTmp
81	* and (cfinv) the arm C flag. Pass 64-bit registers, regTmp is the mrs destination.
82	*/
83	/*
84	* Translate the arm NZCV bits into corresponding EFLAGS bits.
85	*/
86	.if \fSkipFlags == 0 || \fSkipFlags == X86_EFL_OF
87	#if 0
88	/* Maybe just a tiny bit slower than the next one. */
89	mrs \regTmp, NZCV /* [31] = N; [30] = Z; [29] = C; [28] = V */
90	.ifeq \fSkipFlags & X86_EFL_OF
91	lsr \regTmp, \regTmp, #28
92	bfi \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
93	lsr \regTmp, \regTmp, #1
94	.else
95	lsr \regTmp, \regTmp, #29
96	.endif
97	eor \regTmp, \regTmp, #1 /* inverts the carry flag to x86 style. */
98	bfi \regEfl, \regTmp, #X86_EFL_CF_BIT, #1 /* CF(0) = C */
99	lsr \regTmp, \regTmp, #1
100	bfi \regEfl, \regTmp, #X86_EFL_ZF_BIT, #2 /* SF(7),ZF(6) = NZ */
101	#else
102	/* This seems to be the faster one... */
103	cfinv /* arm C = no borrow, x86 CF = borrow: flip it before reading NZCV. */
104	mrs \regTmp, NZCV /* [31] = N; [30] = Z; [29] = C; [28] = V */
105	.ifeq (\fSkipFlags & X86_EFL_OF)
106	lsr \regTmp, \regTmp, #28 /* bit 0 = V */
107	bfi \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
108	lsr \regTmp, \regTmp, #1 /* bit 0 = C */
109	.else
110	lsr \regTmp, \regTmp, #29 /* bit 0 = C; OF is left to the caller. */
111	.endif
112	bfi \regEfl, \regTmp, #X86_EFL_CF_BIT, #1 /* CF(0) = C */
113	lsr \regTmp, \regTmp, #1 /* bit 1 = N; bit 0 = Z */
114	bfi \regEfl, \regTmp, #X86_EFL_ZF_BIT, #2 /* SF(7),ZF(6) = NZ */
115	#endif
116	.else
117	/* Definitely slower than the above two, but easier to handle wrt skipping parts. */
118	.ifeq \fSkipFlags & X86_EFL_ZF
119	cset \regTmp, eq
120	bfi \regEfl, \regTmp, #X86_EFL_ZF_BIT, #1
121	.endif
122	.ifeq \fSkipFlags & X86_EFL_CF
123	cset \regTmp, cc /* carry clear on arm = borrow = x86 CF set. */
124	bfi \regEfl, \regTmp, #X86_EFL_CF_BIT, #1
125	.endif
126	.ifeq \fSkipFlags & X86_EFL_OF
127	cset \regTmp, vs
128	bfi \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
129	.endif
130	.ifeq \fSkipFlags & X86_EFL_SF
131	cset \regTmp, mi
132	bfi \regEfl, \regTmp, #X86_EFL_SF_BIT, #1
133	.endif
134	.endif
135
136
137	/*
138	* Parity calculation for low byte of the result (shared with the shift helpers).
139	*/
140	CALC_EFLAGS_PARITY \regEfl, \regResult, \regTmp
141
142	/*
143	* Auxiliary carry / borrow flag. This is related to 8-bit BCD.
144	*/
145	CALC_EFLAGS_AUX_CARRY \regEfl, \regResult, \regLeft, \regRight, \regTmp
146
147	/* done */
148	.endm
149
150
151	BEGINCODE
152	/* iemAImpl_placeholder: stand-in entry point without a real implementation. */
153	/* It hits a breakpoint instruction (brk #1) as soon as it is called. */
154	.p2align 2
155	BEGINPROC iemAImpl_placeholder
156	brk #1
157	ret
158
159	/* Some sketches.
160
161	// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked, (uint8_t *pu8Mem, uint8_t *pu8Reg));
162	.p2align 2
163	.private_extern NAME(iemAImpl_xchg_u8_locked)
164	.globl NAME(iemAImpl_xchg_u8_locked)
165	NAME(iemAImpl_xchg_u8_locked):
166	ldrb w2, [x1]
167	swpalb w2, w2, [x0]
168	strb w2, [x1]
169	ret
170
171	// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_locked,(uint16_t *pu16Mem, uint16_t *pu16Reg));
172	.p2align 2
173	.private_extern NAME(iemAImpl_xchg_u16_locked)
174	.globl NAME(iemAImpl_xchg_u16_locked)
175	NAME(iemAImpl_xchg_u16_locked):
176	ldrh w2, [x1]
177	swpalh w2, w2, [x0]
178	strh w2, [x1]
179	ret
180
181	// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_locked,(uint32_t *pu32Mem, uint32_t *pu32Reg));
182	// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_locked,(uint64_t *pu64Mem, uint64_t *pu64Reg));
183
184	*/
185
186
187	/* IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked, (uint8_t *pu8Mem, uint8_t *pu8Reg)); */
188
189	/*
190	* The SUB instruction.
191	*/
192
193	/* uint32_t iemAImpl_sub_u8(uint32_t fEFlags, uint8_t *puDst, uint8_t uSrc); In: w0 = fEFlags, x1 = puDst, w2 = uSrc. Out: w0 = updated EFLAGS, *puDst = difference. Uses w8, w9, w11, w12 and NZCV as scratch. */
194	.p2align 2
195	.private_extern NAME(iemAImpl_sub_u8)
196	.globl NAME(iemAImpl_sub_u8)
197	NAME(iemAImpl_sub_u8):
198	.cfi_startproc
199	/* Do the subtraction. */
200	ldrb w8, [x1] /* w8 = *puDst, zero-extended */
201	/*and w2, w2, #0xff - should not be necessary. */
202	subs w9, w8, w2 /* w9 = w8 (*puDst) - w2 (uSrc) */
203	strb w9, [x1]
204	setf8 w9 /* N and Z from the 8-bit result; C stays as set by subs. */
205
206	/* Calculate EFLAGS (passed in and returned via x0). */
207	and w9, w9, #0xff /* 8-bit result; the flag code below only looks at bits 0..7. */
208	CALC_EFLAGS x0, x9, x8, x2, x11, X86_EFL_OF
209
210	/* The overflow flag calc done by setf8 isn't correct for subtraction, so we have to
211	figure it out ourselves. (See IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC for details.) */
212	eor w11, w8, w2 /* input dst ^ source (simplified from ~(dst ^ (source ^ 0x80)) ). */
213	eor w12, w8, w9 /* input dst ^ result */
214	and w11, w12, w11 /* bit 7 set: operand signs differ and the result sign differs from dst. */
215	lsr w11, w11, #7
216	bfi w0, w11, #X86_EFL_OF_BIT, #1
217
218	ret
219	.cfi_endproc
220
221
222	/* uint32_t iemAImpl_sub_u16(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc); In: w0 = fEFlags, x1 = puDst, w2 = uSrc. Out: w0 = updated EFLAGS, *puDst = difference. Uses w8, w9, w11, w12 and NZCV as scratch. */
223	.p2align 2
224	.private_extern NAME(iemAImpl_sub_u16)
225	.globl NAME(iemAImpl_sub_u16)
226	NAME(iemAImpl_sub_u16):
227	.cfi_startproc
228	/* Do the subtraction. */
229	ldrh w8, [x1] /* w8 = *puDst, zero-extended */
230	/*and w2, w2, #0xffff - should not be necessary. */
231	subs w9, w8, w2 /* w9 = w8 (*puDst) - w2 (uSrc) */
232	setf16 w9 /* N and Z from the 16-bit result; C stays as set by subs. */
233	strh w9, [x1]
234
235	/* Calculate EFLAGS (passed in and returned via x0). */
236	and w9, w9, #0xffff
237	CALC_EFLAGS x0, x9, x8, x2, x11, X86_EFL_OF
238
239	/* The overflow flag calc done by setf16 isn't correct for subtraction, so we have to
240	figure it out ourselves. (See IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC for details.) */
241	eor w11, w8, w2 /* input dst ^ source (simplified from ~(dst ^ (source ^ 0x8000)) ). */
242	eor w12, w8, w9 /* input dst ^ result */
243	and w11, w12, w11 /* bit 15 set: operand signs differ and the result sign differs from dst. */
244	lsr w11, w11, #15
245	bfi w0, w11, #X86_EFL_OF_BIT, #1
246
247	ret
248	.cfi_endproc
249
250
251	/* uint32_t iemAImpl_sub_u32(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc); In: w0 = fEFlags, x1 = puDst, w2 = uSrc. Out: w0 = updated EFLAGS, *puDst = difference. Uses w8, w9, w11 and NZCV as scratch. */
252	.p2align 2
253	.private_extern NAME(iemAImpl_sub_u32)
254	.globl NAME(iemAImpl_sub_u32)
255	NAME(iemAImpl_sub_u32):
256	.cfi_startproc
257	/* Do the subtraction. */
258	ldr w8, [x1]
259	subs w9, w8, w2 /* w9 = w8 (*puDst) - w2 (uSrc) */
260	str w9, [x1]
261
262	/* Calculate EFLAGS (passed in and returned via x0). */
263	/* The '#if 0' part is the hand-expanded form of CALC_EFLAGS, kept disabled for reference. */
264	#if 0
265	/* Translate the arm NZCV bits into corresponding EFLAGS bits. */
266	#if 0 /* maybe just a tiny bit slower than the next one. */
267	mrs x11, NZCV /* w11[31] = N; w11[30] = Z; w11[29] = C; w11[28] = V */
268	lsr w11, w11, #28
269	bfi w0, w11, #X86_EFL_OF_BIT, #1
270	lsr w11, w11, #1
271	eor w11, w11, #1 /* inverts the carry flag to x86 style. */
272	bfi w0, w11, #X86_EFL_CF_BIT, #1 /* CF(0) = C */
273	lsr w11, w11, #1
274	bfi w0, w11, #X86_EFL_ZF_BIT, #2 /* SF(7),ZF(6) = NZ */
275	#elif 1 /* seems the faster one... */
276	cfinv
277	mrs x11, NZCV /* w11[31] = N; w11[30] = Z; w11[29] = C; w11[28] = V */
278	lsr w11, w11, #28
279	bfi w0, w11, #X86_EFL_OF_BIT, #1
280	lsr w11, w11, #1
281	bfi w0, w11, #X86_EFL_CF_BIT, #1 /* CF(0) = C */
282	lsr w11, w11, #1
283	bfi w0, w11, #X86_EFL_ZF_BIT, #2 /* SF(7),ZF(6) = NZ */
284	#else
285	cset w11, eq
286	bfi w0, w11, #X86_EFL_ZF_BIT, #1
287	cset w11, cc
288	bfi w0, w11, #X86_EFL_CF_BIT, #1
289	cset w11, vs
290	bfi w0, w11, #X86_EFL_OF_BIT, #1
291	cset w11, mi
292	bfi w0, w11, #X86_EFL_SF_BIT, #1
293	#endif
294
295	/* Parity calculation for low byte of the result (sucks that there is no popcount for gprs). */
296	eor w11, w9, w9, LSR #4
297	eor w11, w11, w11, LSR #2
298	eor w11, w11, w11, LSR #1
299	eor w11, w11, #1
300	bfi w0, w11, #X86_EFL_PF_BIT, #1 /* PF(2) = popcount(w9 & 0xff) & 1 ^ 1 */
301
302	/* Auxiliary carry / borrow flag. This is related to 8-bit BCD. */
303	eor w11, w8, w2
304	eor w11, w11, w9
305	lsr w11, w11, #X86_EFL_AF_BIT
306	bfi w0, w11, #X86_EFL_AF_BIT, #1 /* AF(4) = ((w8 ^ w2 ^ w9) & X86_EFL_AF) >> X86_EFL_AF_BIT */
307	#else
308	CALC_EFLAGS x0, x9, x8, x2, x11 /* All six flags, OF included: V from the 32-bit subs is usable as is. */
309	#endif
310
311	ret
312	.cfi_endproc
313
314
315	/* uint32_t iemAImpl_sub_u64(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc); In: w0 = fEFlags, x1 = puDst, x2 = uSrc. Out: w0 = updated EFLAGS, *puDst = difference. Uses x8, x9, x11 and NZCV as scratch. */
316	.p2align 2
317	.private_extern NAME(iemAImpl_sub_u64)
318	.globl NAME(iemAImpl_sub_u64)
319	NAME(iemAImpl_sub_u64):
320	.cfi_startproc
321	/* Do the subtraction. */
322	ldr x8, [x1]
323	subs x9, x8, x2 /* x9 = x8 (*puDst) - x2 (uSrc) */
324	str x9, [x1]
325
326	/* Calculate EFLAGS (passed in and returned via x0); NZCV from the 64-bit subs covers CF, ZF, SF and OF. */
327	CALC_EFLAGS x0, x9, x8, x2, x11
328
329	ret
330	.cfi_endproc
331
332
333
334	/*
335	* Shift Left.
336	*/
337
338	/* void iemAImpl_shl_u8(uint8_t *pu8Dst, uint8_t cShift, uint32_t *pEFlags); */
339	/* void iemAImpl_shl_u16(uint16_t *pu16Dst, uint8_t cShift, uint32_t *pEFlags); */
340	/* void iemAImpl_shl_u32(uint32_t *pu32Dst, uint8_t cShift, uint32_t *pEFlags); */
341	.macro SHL_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdStSuff
342	.p2align 2
343	BEGINPROC \a_Name
344	.cfi_startproc
345	/* In: x0 = destination pointer, w1 = shift count, x2 = EFLAGS pointer. a_cBits = operand width, a_fIntelFlags = non-zero for the Intel OF/AF variant (zero: AMD), a_LdStSuff = ldr/str size suffix. Uses w1, w8-w12 and NZCV as scratch. */
346	/* Do we need to shift anything at all? (Count is masked to 5 bits; a zero count leaves destination and EFLAGS untouched.) */
347	and w1, w1, #0x1f
348	cbz w1, 99f
349
350	/*
351	* Do the shifting
352	*/
353	ldr\a_LdStSuff w8, [x0]
354	.ifne \a_cBits < 32
355	lslv w9, w8, w1
356	.else
357	lslv x9, x8, x1 /* use 64-bit registers here so we get CF for free. We know x1 != 0. */
358	.endif
359	str\a_LdStSuff w9, [x0]
360
361	/*
362	* Calculate EFLAGS.
363	*/
364	ldr w10, [x2] /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */
365
366	CALC_EFLAGS_PARITY w10, w9, w12
367
368	.ifne \a_cBits < 32
369	setf\a_cBits w9 /* Sets NZ */
370	.else
371	ands wzr, w9, w9 /* Sets NZ */
372	.endif
373	#if 1
374	mrs x11, NZCV
375	lsr w11, w11, #30 /* w11[1] = N; w11[0] = Z */
376	bfi w10, w11, X86_EFL_ZF_BIT, 2 /* EFLAGS.ZF and EFLAGS.SF */
377	#else
378	cset x11, eq
379	bfi w10, w11, X86_EFL_ZF_BIT, 1
380	cset x12, mi /* SF mirrors N, hence 'mi' (this disabled variant had 'pl', which would store the inverse). */
381	bfi w10, w12, X86_EFL_SF_BIT, 1
382	#endif
383
384	.ifne \a_cBits < 32
385	bfxil w10, w9, #\a_cBits, #1 /* w9 bit 8/16 contains carry. (X86_EFL_CF_BIT == 0) */
386	.else
387	bfxil x10, x9, #\a_cBits, #1 /* x9 bit 32 contains carry. (X86_EFL_CF_BIT == 0) */
388	.endif
389
390	.ifne \a_fIntelFlags
391	/* Intel: OF = first bit shifted: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
392	eor w11, w8, w8, LSL #1
393	lsr w11, w11, #(\a_cBits - 1)
394	bfi w10, w11, #X86_EFL_OF_BIT, #1
395
396	and w10, w10, ~X86_EFL_AF /* AF is cleared */
397	.else
398	/* AMD: OF = last bit shifted: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
399	.ifne \a_cBits < 32
400	eor w11, w9, w9, LSR #1 /* bit (cOpBits - 1) = result MSB ^ carry (the carry sits one bit above the MSB) */
401	lsr w11, w11, #(\a_cBits - 1)
402	.else
403	eor x11, x9, x9, LSR #1
404	lsr x11, x11, #(\a_cBits - 1)
405	.endif
406	bfi w10, w11, #X86_EFL_OF_BIT, #1
407
408	orr w10, w10, X86_EFL_AF /* AF is set */
409	.endif
410
411	str w10, [x2]
412	99:
413	ret
414	.cfi_endproc
415	.endm
416	/* Instantiations: name, operand bits, Intel (1) / AMD (0) flag variant, ldr/str suffix. The unsuffixed names use the Intel variant. */
417	SHL_8_16_32 iemAImpl_shl_u8, 8, 1, b
418	SHL_8_16_32 iemAImpl_shl_u8_intel, 8, 1, b
419	SHL_8_16_32 iemAImpl_shl_u8_amd, 8, 0, b
420
421	SHL_8_16_32 iemAImpl_shl_u16, 16, 1, h
422	SHL_8_16_32 iemAImpl_shl_u16_intel, 16, 1, h
423	SHL_8_16_32 iemAImpl_shl_u16_amd, 16, 0, h
424
425	SHL_8_16_32 iemAImpl_shl_u32, 32, 1,
426	SHL_8_16_32 iemAImpl_shl_u32_intel, 32, 1,
427	SHL_8_16_32 iemAImpl_shl_u32_amd, 32, 0,
428
429	/** @todo this is slightly slower than the C version (release) on an M2. Investigate why. */
430	/* void iemAImpl_shl_u64(uint64_t *pu64Dst, uint8_t cShift, uint32_t *pEFlags); */
431	.macro SHL_64, a_Name, a_fIntelFlags
432	.p2align 2
433	BEGINPROC \a_Name
434	.cfi_startproc
435	/* In: x0 = destination pointer, w1 = shift count, x2 = EFLAGS pointer. a_fIntelFlags = non-zero for the Intel OF/AF variant (zero: AMD). Uses w1, x8-x11 and NZCV as scratch. */
436	/* Do we need to shift anything at all? (Count is masked to 6 bits; a zero count leaves destination and EFLAGS untouched.) */
437	and w1, w1, #0x3f
438	cbz w1, 99f
439
440	/*
441	* Do the shifting
442	*/
443	ldr x8, [x0]
444	lslv x9, x8, x1
445	str x9, [x0]
446
447	/*
448	* Calculate EFLAGS.
449	*/
450	ldr w10, [x2] /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */
451
452	CALC_EFLAGS_PARITY w10, w9, w11
453
454	ands xzr, x9, x9 /* Sets NZ */
455	mrs x11, NZCV
456	lsr w11, w11, #30 /* w11[1] = N; w11[0] = Z */
457	bfi w10, w11, X86_EFL_ZF_BIT, 2 /* EFLAGS.ZF and EFLAGS.SF */
458
459	neg w11, w1 /* the shift count is MODed by the data size, so this is safe. */
460	lsrv x11, x8, x11 /* shifts right by 64 - count: x11 bit 0 = last bit shifted out of x8. */
461	bfi w10, w11, X86_EFL_CF_BIT, 1
462
463	.ifne \a_fIntelFlags
464	/* Intel: OF = first bit shifted: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
465	eor x11, x8, x8, LSL #1
466	lsr x11, x11, #63
467	bfi w10, w11, #X86_EFL_OF_BIT, #1
468
469	and w10, w10, ~X86_EFL_AF /* AF is cleared */
470	.else
471	/* AMD: OF = last bit shifted: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
472	eor x11, x11, x9, LSR #63 /* w11[0]=CF from above */
473	bfi w10, w11, #X86_EFL_OF_BIT, #1
474
475	orr w10, w10, X86_EFL_AF /* AF is set */
476	.endif
477	str w10, [x2]
478	99:
479	ret
480	.cfi_endproc
481	.endm
482	/* Instantiations: name, Intel (1) / AMD (0) flag variant. The unsuffixed name uses the Intel variant. */
483	SHL_64 iemAImpl_shl_u64, 1
484	SHL_64 iemAImpl_shl_u64_intel, 1
485	SHL_64 iemAImpl_shl_u64_amd, 0
486
487
488	/*
489	* Shift Right, Unsigned.
490	*/
491
492	/* void iemAImpl_shr_u8(uint8_t *pu8Dst, uint8_t cShift, uint32_t *pEFlags); */
493	/* void iemAImpl_shr_u16(uint16_t *pu16Dst, uint8_t cShift, uint32_t *pEFlags); */
494	/* void iemAImpl_shr_u32(uint32_t *pu32Dst, uint8_t cShift, uint32_t *pEFlags); */
495	.macro shr_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdStSuff
496	.p2align 2
497	BEGINPROC \a_Name
498	.cfi_startproc
499	/* In: x0 = destination pointer, w1 = shift count, x2 = EFLAGS pointer. a_cBits = operand width, a_fIntelFlags = non-zero for the Intel OF/AF variant (zero: AMD), a_LdStSuff = ldr/str size suffix. Uses w1, w8-w11 and NZCV as scratch. */
500	/* Do we need to shift anything at all? (Count is masked to 5 bits; a zero count leaves destination and EFLAGS untouched.) */
501	and w1, w1, #0x1f
502	cbz w1, 99f
503
504	/* Load EFLAGS before we start the calculation. */
505	ldr w10, [x2] /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */
506
507	/*
508	* Do the shifting.
509	*/
510	ldr\a_LdStSuff w8, [x0]
511	lsrv w9, w8, w1
512	str\a_LdStSuff w9, [x0]
513
514	/*
515	* Calculate EFLAGS.
516	*/
517	sub w11, w1, #1
518	lsrv w11, w8, w11 /* shift by count - 1: w11 bit 0 = last bit shifted out. */
519	bfxil w10, w11, #X86_EFL_CF_BIT, #1 /* copies w11 bit 0 into w10 bit 0; relies on X86_EFL_CF_BIT being 0. */
520
521	.ifne \a_fIntelFlags
522	and w10, w10, ~X86_EFL_AF /* AF is cleared */
523	/* Intel: OF = one bit shift: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDstIn); */
524	lsr w11, w8, #(\a_cBits - 1)
525	bfi w10, w11, #X86_EFL_OF_BIT, #1
526	.else
527	orr w10, w10, X86_EFL_AF /* AF is set */
528	/* AMD: OF = last bits shifted: fEfl |= (uResult >> (cOpBits - 2)) << X86_EFL_OF_BIT; */
529	lsr w11, w9, #(\a_cBits - 2)
530	bfi w10, w11, #X86_EFL_OF_BIT, #1
531	.endif
532
533	CALC_EFLAGS_PARITY w10, w9, w11
534
535	.ifne \a_cBits < 32
536	setf\a_cBits w9 /* Sets NZ */
537	.else
538	ands wzr, w9, w9 /* Sets NZ */
539	.endif
540	mrs x11, NZCV
541	lsr w11, w11, #30 /* w11[1] = N; w11[0] = Z */
542	bfi w10, w11, X86_EFL_ZF_BIT, 2 /* EFLAGS.ZF and EFLAGS.SF */
543
544	str w10, [x2]
545	99:
546	ret
547	.cfi_endproc
548	.endm
549	/* Instantiations: name, operand bits, Intel (1) / AMD (0) flag variant, ldr/str suffix. The unsuffixed names use the Intel variant. */
550	shr_8_16_32 iemAImpl_shr_u8, 8, 1, b
551	shr_8_16_32 iemAImpl_shr_u8_intel, 8, 1, b
552	shr_8_16_32 iemAImpl_shr_u8_amd, 8, 0, b
553
554	shr_8_16_32 iemAImpl_shr_u16, 16, 1, h
555	shr_8_16_32 iemAImpl_shr_u16_intel, 16, 1, h
556	shr_8_16_32 iemAImpl_shr_u16_amd, 16, 0, h
557
558	shr_8_16_32 iemAImpl_shr_u32, 32, 1,
559	shr_8_16_32 iemAImpl_shr_u32_intel, 32, 1,
560	shr_8_16_32 iemAImpl_shr_u32_amd, 32, 0,
561
562	/** @todo this is slightly slower than the C version (release) on an M2. Investigate why. */
563	/* void iemAImpl_shr_u64(uint64_t *pu64Dst, uint8_t cShift, uint32_t *pEFlags); */
564	.macro shr_64, a_Name, a_fIntelFlags
565	.p2align 2
566	BEGINPROC \a_Name
567	.cfi_startproc
568	/* In: x0 = destination pointer, w1 = shift count, x2 = EFLAGS pointer. a_fIntelFlags = non-zero for the Intel OF/AF variant (zero: AMD). Uses w1, x8-x11 and NZCV as scratch. */
569	/* Do we need to shift anything at all? (Count is masked to 6 bits; a zero count leaves destination and EFLAGS untouched.) */
570	ands w1, w1, #0x3f
571	b.eq 99f
572
573	/* Load EFLAGS before we start the calculation. */
574	ldr w10, [x2] /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */
575
576	/*
577	* Do the shifting
578	*/
579	ldr x8, [x0]
580	lsrv x9, x8, x1
581	str x9, [x0]
582
583	/*
584	* Calculate EFLAGS.
585	*/
586	sub w11, w1, #1
587	lsrv x11, x8, x11 /* shift by count - 1: x11 bit 0 = last bit shifted out. */
588	bfxil w10, w11, #X86_EFL_CF_BIT, #1 /* copies w11 bit 0 into w10 bit 0; relies on X86_EFL_CF_BIT being 0. */
589
590	.ifne \a_fIntelFlags
591	and w10, w10, ~X86_EFL_AF /* AF is cleared */
592	/* Intel: OF = one bit shift: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDstIn); */
593	lsr x11, x8, #63
594	bfi w10, w11, #X86_EFL_OF_BIT, #1
595	.else
596	orr w10, w10, X86_EFL_AF /* AF is set */
597	/* AMD: OF = last bits shifted: fEfl |= (uResult >> (cOpBits - 2)) << X86_EFL_OF_BIT; */
598	lsr x11, x9, #62
599	bfi w10, w11, #X86_EFL_OF_BIT, #1
600	.endif
601
602	CALC_EFLAGS_PARITY w10, w9, w11
603
604	ands xzr, x9, x9 /* Sets NZ */
605	mrs x11, NZCV
606	lsr w11, w11, #30 /* w11[1] = N; w11[0] = Z */
607	bfi w10, w11, X86_EFL_ZF_BIT, 2 /* EFLAGS.ZF and EFLAGS.SF */
608
609	str w10, [x2]
610	99:
611	ret
612	.cfi_endproc
613	.endm
614	/* Instantiations: name, Intel (1) / AMD (0) flag variant. The unsuffixed name uses the Intel variant. */
615	shr_64 iemAImpl_shr_u64, 1
616	shr_64 iemAImpl_shr_u64_intel, 1
617	shr_64 iemAImpl_shr_u64_amd, 0
618
619
620	/*
621	* Shift Right, Signed
622	*/
623
624	/* void iemAImpl_sar_u8(uint8_t *pu8Dst, uint8_t cShift, uint32_t *pEFlags); */
625	/* void iemAImpl_sar_u16(uint16_t *pu16Dst, uint8_t cShift, uint32_t *pEFlags); */
626	/* void iemAImpl_sar_u32(uint32_t *pu32Dst, uint8_t cShift, uint32_t *pEFlags); */
627	.macro sar_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdSuff, a_StSuff
628	.p2align 2
629	BEGINPROC \a_Name
630	.cfi_startproc
631	/* In: x0 = destination pointer, w1 = shift count, x2 = EFLAGS pointer. a_cBits = operand width, a_fIntelFlags = non-zero for the Intel AF variant (zero: AMD), a_LdSuff = sign-extending ldr suffix, a_StSuff = str suffix. Uses w1, w8-w11 and NZCV as scratch. */
632	/* Do we need to shift anything at all? (Count is masked to 5 bits; a zero count leaves destination and EFLAGS untouched.) */
633	and w1, w1, #0x1f
634	cbz w1, 99f
635
636	/* Load EFLAGS before we start the calculation. */
637	ldr w10, [x2] /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */
638
639	/*
640	* Do the shifting.
641	*/
642	ldr\a_LdSuff w8, [x0] /* Sign-extending for 8 and 16 bits! */
643	asrv w9, w8, w1
644	str\a_StSuff w9, [x0]
645
646	/*
647	* Calculate EFLAGS.
648	*/
649	sub w11, w1, #1
650	lsrv w11, w8, w11 /* shift by count - 1: w11 bit 0 = last bit shifted out (a sign copy once count exceeds the operand width). */
651	bfxil w10, w11, #X86_EFL_CF_BIT, #1 /* copies w11 bit 0 into w10 bit 0; relies on X86_EFL_CF_BIT being 0. */
652
653	.ifne \a_fIntelFlags
654	mov w11, ~(X86_EFL_AF | X86_EFL_OF)
655	and w10, w10, w11 /* AF and OF are cleared */
656	.else
657	orr w10, w10, X86_EFL_AF /* AF is set */
658	and w10, w10, ~X86_EFL_OF /* OF is cleared */
659	.endif
660
661	CALC_EFLAGS_PARITY w10, w9, w11
662
663	.ifne \a_cBits < 32
664	setf\a_cBits w9 /* Sets NZ */
665	.else
666	ands wzr, w9, w9 /* Sets NZ */
667	.endif
668	mrs x11, NZCV
669	lsr w11, w11, #30 /* w11[1] = N; w11[0] = Z */
670	bfi w10, w11, X86_EFL_ZF_BIT, 2 /* EFLAGS.ZF and EFLAGS.SF */
671
672	str w10, [x2]
673	99:
674	ret
675	.cfi_endproc
676	.endm
677	/* Instantiations: name, operand bits, Intel (1) / AMD (0) flag variant, load suffix, store suffix. The unsuffixed names use the Intel variant. */
678	sar_8_16_32 iemAImpl_sar_u8, 8, 1, sb, b
679	sar_8_16_32 iemAImpl_sar_u8_intel, 8, 1, sb, b
680	sar_8_16_32 iemAImpl_sar_u8_amd, 8, 0, sb, b
681
682	sar_8_16_32 iemAImpl_sar_u16, 16, 1, sh, h
683	sar_8_16_32 iemAImpl_sar_u16_intel, 16, 1, sh, h
684	sar_8_16_32 iemAImpl_sar_u16_amd, 16, 0, sh, h
685
686	sar_8_16_32 iemAImpl_sar_u32, 32, 1, ,
687	sar_8_16_32 iemAImpl_sar_u32_intel, 32, 1, ,
688	sar_8_16_32 iemAImpl_sar_u32_amd, 32, 0, ,
689
690	/** @todo this is slightly slower than the C version (release) on an M2. Investigate why. */
691	/* void iemAImpl_sar_u64(uint64_t *pu64Dst, uint8_t cShift, uint32_t *pEFlags); */
692	.macro sar_64, a_Name, a_fIntelFlags
693	.p2align 2
694	BEGINPROC \a_Name
695	.cfi_startproc
696	/* In: x0 = destination pointer, w1 = shift count, x2 = EFLAGS pointer. a_fIntelFlags = non-zero for the Intel AF variant (zero: AMD). Uses w1, x8-x11 and NZCV as scratch. */
697	/* Do we need to shift anything at all? (Count is masked to 6 bits; a zero count leaves destination and EFLAGS untouched.) */
698	ands w1, w1, #0x3f
699	b.eq 99f
700
701	/* Load EFLAGS before we start the calculation. */
702	ldr w10, [x2] /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */
703
704	/*
705	* Do the shifting
706	*/
707	ldr x8, [x0]
708	asrv x9, x8, x1
709	str x9, [x0]
710
711	/*
712	* Calculate EFLAGS.
713	*/
714	sub w11, w1, #1
715	lsrv x11, x8, x11 /* shift by count - 1: x11 bit 0 = last bit shifted out. */
716	bfxil w10, w11, #X86_EFL_CF_BIT, #1 /* copies w11 bit 0 into w10 bit 0; relies on X86_EFL_CF_BIT being 0. */
717
718	.ifne \a_fIntelFlags
719	mov w11, ~(X86_EFL_AF | X86_EFL_OF)
720	and w10, w10, w11 /* AF and OF are cleared */
721	.else
722	orr w10, w10, X86_EFL_AF /* AF is set */
723	and w10, w10, ~X86_EFL_OF /* OF is cleared */
724	.endif
725
726	CALC_EFLAGS_PARITY w10, w9, w11
727
728	ands xzr, x9, x9 /* Sets NZ */
729	mrs x11, NZCV
730	lsr w11, w11, #30 /* w11[1] = N; w11[0] = Z */
731	bfi w10, w11, X86_EFL_ZF_BIT, 2 /* EFLAGS.ZF and EFLAGS.SF */
732
733	str w10, [x2]
734	99:
735	ret
736	.cfi_endproc
737	.endm
738	/* Instantiations: name, Intel (1) / AMD (0) flag variant. The unsuffixed name uses the Intel variant. */
739	sar_64 iemAImpl_sar_u64, 1
740	sar_64 iemAImpl_sar_u64_intel, 1
741	sar_64 iemAImpl_sar_u64_amd, 0
742

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format