VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl-arm64.S@104231

Last change on this file since 104231 was 104231, checked in by vboxsync, 12 months ago

VMM/IEMAllAImpl-arm64.S: Make it build with gcc, bugref:10391

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 23.5 KB
/* $Id: IEMAllAImpl-arm64.S 104231 2024-04-08 13:46:29Z vboxsync $ */
/** @file
 * IEM - Instruction Implementation in Assembly, ARM64 variant.
 */

/*
 * Copyright (C) 2023 Oracle and/or its affiliates.
 *
 * This file is part of VirtualBox base platform packages, as
 * available from https://www.virtualbox.org.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, in version 3 of the
 * License.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <https://www.gnu.org/licenses>.
 *
 * SPDX-License-Identifier: GPL-3.0-only
 */


/*********************************************************************************************************************************
*   Header Files                                                                                                                 *
*********************************************************************************************************************************/
#include <iprt/asmdefs-arm.h>
#include <iprt/x86.h>

#if RT_CLANG_PREREQ(15, 0)
        .arch_extension flagm                           /* not necessary */
#else
        /* clang 12.0.x defaults to apple-a12.  The M1 is more similar to the A14, I guess.
           For some reason the +crc makes cfinv work (with clang 12).  'flagm' isn't
           recognized, nor is the 'fmi' mentioned in the error message for cfinv.  'flagm'
           works with v15 and seems to be enabled by default there. */
# ifdef RT_OS_DARWIN
        .cpu apple-a14+crc
# else
        .cpu cortex-a53+flagm
# endif
#endif


.macro CALC_EFLAGS_PARITY, regEfl, regResult, regTmp
        /*
         * Parity calculation for low byte of the result (sucks that there is no popcount for gprs).
         */
        eor     \regTmp, \regResult, \regResult, LSR #4
        eor     \regTmp, \regTmp, \regTmp, LSR #2
        eor     \regTmp, \regTmp, \regTmp, LSR #1
        eor     \regTmp, \regTmp, #1
        bfi     \regEfl, \regTmp, #X86_EFL_PF_BIT, #1   /* PF(2) = !(popcount(result & 0xff) & 1) */
.endm
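/* The eor chain above is an XOR fold: after the three shifted eors, bit 0 of
   the temporary register is the XOR of result bits 7:0 (higher bits never
   reach bit 0), i.e. the parity of the low byte, and the final eor #1 flips
   it into x86 PF polarity.  As a C sketch (hypothetical helper, not part of
   this file):

    static inline uint32_t CalcPF(uint32_t uResult)
    {
        uint32_t uTmp = uResult ^ (uResult >> 4);   // fold bits 7:4 onto 3:0
        uTmp ^= uTmp >> 2;                          // fold bits 3:2 onto 1:0
        uTmp ^= uTmp >> 1;                          // bit 0 = popcount(uResult & 0xff) & 1
        return (uTmp & 1) ^ 1;                      // PF = 1 for even parity
    }
*/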


.macro CALC_EFLAGS_AUX_CARRY, regEfl, regResult, regLeft, regRight, regTmp
        /*
         * Auxiliary carry / borrow flag.  This is related to 8-bit BCD.
         */
        eor     \regTmp, \regLeft, \regRight
        eor     \regTmp, \regTmp, \regResult
        lsr     \regTmp, \regTmp, #X86_EFL_AF_BIT
        bfi     \regEfl, \regTmp, #X86_EFL_AF_BIT, #1   /* AF(4) = ((left ^ right ^ result) & X86_EFL_AF) >> X86_EFL_AF_BIT */
.endm
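/* Why this works: at every bit position of an add or subtract, the result
   bit equals left ^ right ^ carry-in, so the carry/borrow that crossed the
   low-nibble boundary is recoverable as bit 4 of (left ^ right ^ result).
   Hypothetical C equivalent:

    static inline uint32_t CalcAF(uint32_t uLeft, uint32_t uRight, uint32_t uResult)
    {
        return ((uLeft ^ uRight ^ uResult) >> 4) & 1;   // 4 == X86_EFL_AF_BIT
    }
*/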

.macro CALC_EFLAGS, regEfl, regResult, regLeft, regRight, regTmp, fSkipFlags=0
        /*
         * Translate the arm NZCV bits into corresponding EFLAGS bits.
         */
        .if \fSkipFlags == 0 || \fSkipFlags == X86_EFL_OF
#if 0
        /* Maybe just a tiny bit slower than the next one. */
        mrs     \regTmp, NZCV                           /* [31] = N; [30] = Z; [29] = C; [28] = V */
        .ifeq \fSkipFlags & X86_EFL_OF
        lsr     \regTmp, \regTmp, #28
        bfi     \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
        lsr     \regTmp, \regTmp, #1
        .else
        lsr     \regTmp, \regTmp, #29
        .endif
        eor     \regTmp, \regTmp, #1                    /* inverts the carry flag to x86 style. */
        bfi     \regEfl, \regTmp, #X86_EFL_CF_BIT, #1   /* CF(0) = C */
        lsr     \regTmp, \regTmp, #1
        bfi     \regEfl, \regTmp, #X86_EFL_ZF_BIT, #2   /* SF(7),ZF(6) = NZ */
#else
        /* This seems to be the faster one... */
        cfinv
        mrs     \regTmp, NZCV                           /* [31] = N; [30] = Z; [29] = C; [28] = V */
        .ifeq (\fSkipFlags & X86_EFL_OF)
        lsr     \regTmp, \regTmp, #28
        bfi     \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
        lsr     \regTmp, \regTmp, #1
        .else
        lsr     \regTmp, \regTmp, #29
        .endif
        bfi     \regEfl, \regTmp, #X86_EFL_CF_BIT, #1   /* CF(0) = C */
        lsr     \regTmp, \regTmp, #1
        bfi     \regEfl, \regTmp, #X86_EFL_ZF_BIT, #2   /* SF(7),ZF(6) = NZ */
#endif
        .else
        /* Definitely slower than the above two, but easier to handle wrt skipping parts. */
        .ifeq \fSkipFlags & X86_EFL_ZF
        cset    \regTmp, eq
        bfi     \regEfl, \regTmp, #X86_EFL_ZF_BIT, #1
        .endif
        .ifeq \fSkipFlags & X86_EFL_CF
        cset    \regTmp, cc
        bfi     \regEfl, \regTmp, #X86_EFL_CF_BIT, #1
        .endif
        .ifeq \fSkipFlags & X86_EFL_OF
        cset    \regTmp, vs
        bfi     \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
        .endif
        .ifeq \fSkipFlags & X86_EFL_SF
        cset    \regTmp, mi
        bfi     \regEfl, \regTmp, #X86_EFL_SF_BIT, #1
        .endif
        .endif


        /*
         * Parity calculation for low byte of the result (sucks that there is no popcount for gprs).
         */
        eor     \regTmp, \regResult, \regResult, LSR #4
        eor     \regTmp, \regTmp, \regTmp, LSR #2
        eor     \regTmp, \regTmp, \regTmp, LSR #1
        eor     \regTmp, \regTmp, #1
        bfi     \regEfl, \regTmp, #X86_EFL_PF_BIT, #1   /* PF(2) = !(popcount(result & 0xff) & 1) */

        /*
         * Auxiliary carry / borrow flag.  This is related to 8-bit BCD.
         */
        eor     \regTmp, \regLeft, \regRight
        eor     \regTmp, \regTmp, \regResult
        lsr     \regTmp, \regTmp, #X86_EFL_AF_BIT
        bfi     \regEfl, \regTmp, #X86_EFL_AF_BIT, #1   /* AF(4) = ((left ^ right ^ result) & X86_EFL_AF) >> X86_EFL_AF_BIT */

        /* done */
.endm
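/* The subtle point in the two fast paths is carry polarity: for the subs-based
   operations below, ARM sets C=1 for "no borrow" while x86 wants CF=1 for
   "borrow", hence the cfinv (or the explicit eor with #1 in the disabled
   variant).  The bit shuffling itself, as a hypothetical C sketch (fNzcv as
   read by mrs, carry already inverted):

    static inline uint32_t NzcvToEfl(uint32_t fNzcv, uint32_t fEfl)
    {
        uint32_t uBits = fNzcv >> 28;                               // [3]=N [2]=Z [1]=C [0]=V
        fEfl = (fEfl & ~(1u << 11)) | (( uBits       & 1) << 11);   // OF(11) = V
        fEfl = (fEfl & ~(1u <<  0)) | (((uBits >> 1) & 1) <<  0);   // CF(0)  = C
        fEfl = (fEfl & ~(1u <<  6)) | (((uBits >> 2) & 1) <<  6);   // ZF(6)  = Z
        fEfl = (fEfl & ~(1u <<  7)) | (((uBits >> 3) & 1) <<  7);   // SF(7)  = N
        return fEfl;
    }
*/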


BEGINCODE

BEGINPROC_HIDDEN iemAImpl_placeholder
        brk #1
        ret

/* Some sketches.

// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked, (uint8_t *pu8Mem, uint8_t *pu8Reg));
BEGINPROC_HIDDEN iemAImpl_xchg_u8_locked
        ldrb    w2, [x1]
        swpalb  w2, w2, [x0]
        strb    w2, [x1]
        ret

// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_locked,(uint16_t *pu16Mem, uint16_t *pu16Reg));
BEGINPROC_HIDDEN iemAImpl_xchg_u16_locked
        ldrh    w2, [x1]
        swpalh  w2, w2, [x0]
        strh    w2, [x1]
        ret

// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_locked,(uint32_t *pu32Mem, uint32_t *pu32Reg));
// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_locked,(uint64_t *pu64Mem, uint64_t *pu64Reg));

*/
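/* Note: swpalb/swpalh are ARMv8.1 LSE atomic swaps with acquire+release
   ordering, which is what a locked x86 XCHG needs.  A portable C11 sketch of
   the same operation (not this file's actual interface, and seq_cst is if
   anything stronger than required):

    static void xchg_u8_locked(uint8_t *pu8Mem, uint8_t *pu8Reg)
    {
        *pu8Reg = atomic_exchange_explicit((_Atomic uint8_t *)pu8Mem, *pu8Reg,
                                           memory_order_seq_cst);
    }
*/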


/* IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked, (uint8_t *pu8Mem, uint8_t *pu8Reg)); */

/*
 * The SUB instruction.
 */

/* uint32_t iemAImpl_sub_u8(uint32_t fEFlags, uint8_t *puDst, uint8_t uSrc); */
BEGINPROC_HIDDEN iemAImpl_sub_u8
        .cfi_startproc
        /* Do the subtraction. */
        ldrb    w8, [x1]
        /*and     w2, w2, #0xff - should not be necessary. */
        subs    w9, w8, w2                              /* w9 = w8 (*puDst) - w2 (uSrc) */
        strb    w9, [x1]
        setf8   w9

        /* Calculate EFLAGS (passed in and returned via x0). */
        and     w9, w9, #0xffff
        CALC_EFLAGS x0, x9, x8, x2, x11, X86_EFL_OF

        /* The overflow flag calc done by setf8 isn't correct for subtraction, so we have to
           figure it out ourselves.  (See IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC for details.) */
        eor     w11, w8, w2                             /* input dst ^ source (simplified from ~(dst ^ (source ^ 0x80))). */
        eor     w12, w8, w9
        and     w11, w12, w11
        lsr     w11, w11, #7
        bfi     w0, w11, #X86_EFL_OF_BIT, #1

        ret
        .cfi_endproc

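/* The signed-overflow rule being implemented: subtraction overflows iff the
   operands differ in sign and the result's sign differs from the minuend's,
   i.e. OF = sign bit of (dst ^ src) & (dst ^ result).  Hypothetical C sketch
   for the 8-bit case (bit 7 is the sign bit):

    static inline uint32_t CalcOfSubU8(uint32_t uDst, uint32_t uSrc, uint32_t uResult)
    {
        return (((uDst ^ uSrc) & (uDst ^ uResult)) >> 7) & 1;
    }
*/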

/* uint32_t iemAImpl_sub_u16(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc); */
BEGINPROC_HIDDEN iemAImpl_sub_u16
        .cfi_startproc
        /* Do the subtraction. */
        ldrh    w8, [x1]
        /*and     w2, w2, #0xffff - should not be necessary. */
        subs    w9, w8, w2                              /* w9 = w8 (*puDst) - w2 (uSrc) */
        setf16  w9
        strh    w9, [x1]

        /* Calculate EFLAGS (passed in and returned via x0). */
        and     w9, w9, #0xffff
        CALC_EFLAGS x0, x9, x8, x2, x11, X86_EFL_OF

        /* The overflow flag calc done by setf16 isn't correct for subtraction, so we have to
           figure it out ourselves.  (See IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC for details.) */
        eor     w11, w8, w2                             /* input dst ^ source (simplified from ~(dst ^ (source ^ 0x8000))). */
        eor     w12, w8, w9
        and     w11, w12, w11
        lsr     w11, w11, #15
        bfi     w0, w11, #X86_EFL_OF_BIT, #1

        ret
        .cfi_endproc


/* uint32_t iemAImpl_sub_u32(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc); */
BEGINPROC_HIDDEN iemAImpl_sub_u32
        .cfi_startproc
        /* Do the subtraction. */
        ldr     w8, [x1]
        subs    w9, w8, w2                              /* w9 = w8 (*puDst) - w2 (uSrc) */
        str     w9, [x1]

        /* Calculate EFLAGS (passed in and returned via x0). */

#if 0
        /* Translate the arm NZCV bits into corresponding EFLAGS bits. */
#if 0 /* maybe just a tiny bit slower than the next one. */
        mrs     x11, NZCV                               /* w11[31] = N; w11[30] = Z; w11[29] = C; w11[28] = V */
        lsr     w11, w11, #28
        bfi     w0, w11, #X86_EFL_OF_BIT, #1
        lsr     w11, w11, #1
        eor     w11, w11, #1                            /* inverts the carry flag to x86 style. */
        bfi     w0, w11, #X86_EFL_CF_BIT, #1            /* CF(0) = C */
        lsr     w11, w11, #1
        bfi     w0, w11, #X86_EFL_ZF_BIT, #2            /* SF(7),ZF(6) = NZ */
#elif 1 /* seems the faster one... */
        cfinv
        mrs     x11, NZCV                               /* w11[31] = N; w11[30] = Z; w11[29] = C; w11[28] = V */
        lsr     w11, w11, #28
        bfi     w0, w11, #X86_EFL_OF_BIT, #1
        lsr     w11, w11, #1
        bfi     w0, w11, #X86_EFL_CF_BIT, #1            /* CF(0) = C */
        lsr     w11, w11, #1
        bfi     w0, w11, #X86_EFL_ZF_BIT, #2            /* SF(7),ZF(6) = NZ */
#else
        cset    w11, eq
        bfi     w0, w11, #X86_EFL_ZF_BIT, #1
        cset    w11, cc
        bfi     w0, w11, #X86_EFL_CF_BIT, #1
        cset    w11, vs
        bfi     w0, w11, #X86_EFL_OF_BIT, #1
        cset    w11, mi
        bfi     w0, w11, #X86_EFL_SF_BIT, #1
#endif

        /* Parity calculation for low byte of the result (sucks that there is no popcount for gprs). */
        eor     w11, w9, w9, LSR #4
        eor     w11, w11, w11, LSR #2
        eor     w11, w11, w11, LSR #1
        eor     w11, w11, #1
        bfi     w0, w11, #X86_EFL_PF_BIT, #1            /* PF(2) = !(popcount(w9 & 0xff) & 1) */

        /* Auxiliary carry / borrow flag.  This is related to 8-bit BCD. */
        eor     w11, w8, w2
        eor     w11, w11, w9
        lsr     w11, w11, #X86_EFL_AF_BIT
        bfi     w0, w11, #X86_EFL_AF_BIT, #1            /* AF(4) = ((w8 ^ w2 ^ w9) & X86_EFL_AF) >> X86_EFL_AF_BIT */
#else
        CALC_EFLAGS x0, x9, x8, x2, x11
#endif

        ret
        .cfi_endproc


/* uint32_t iemAImpl_sub_u64(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc); */
BEGINPROC_HIDDEN iemAImpl_sub_u64
        .cfi_startproc
        /* Do the subtraction. */
        ldr     x8, [x1]
        subs    x9, x8, x2                              /* x9 = x8 (*puDst) - x2 (uSrc) */
        str     x9, [x1]

        /* Calculate EFLAGS (passed in and returned via x0). */
        CALC_EFLAGS x0, x9, x8, x2, x11

        ret
        .cfi_endproc



/*
 * Shift Left.
 */

/* void iemAImpl_shl_u8(uint8_t *pu8Dst, uint8_t cShift, uint32_t *pEFlags); */
/* void iemAImpl_shl_u16(uint16_t *pu16Dst, uint8_t cShift, uint32_t *pEFlags); */
/* void iemAImpl_shl_u32(uint32_t *pu32Dst, uint8_t cShift, uint32_t *pEFlags); */
.macro SHL_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdStSuff
BEGINPROC_HIDDEN \a_Name
        .cfi_startproc

        /* Do we need to shift anything at all? */
        and     w1, w1, #0x1f
        cbz     w1, 99f

        /*
         * Do the shifting
         */
        ldr\a_LdStSuff  w8, [x0]
.ifne \a_cBits < 32
        lslv    w9, w8, w1
.else
        lslv    x9, x8, x1                              /* use 64-bit registers here so we get CF for free.  We know x1 != 0. */
.endif
        str\a_LdStSuff  w9, [x0]

        /*
         * Calculate EFLAGS.
         */
        ldr     w10, [x2]                               /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */

        CALC_EFLAGS_PARITY w10, w9, w12

.ifne \a_cBits < 32
        setf\a_cBits w9                                 /* Sets NZ */
.else
        ands    wzr, w9, w9                             /* Sets NZ */
.endif
#if 1
        mrs     x11, NZCV
        lsr     w11, w11, #30                           /* [1]=N; [0]=Z */
        bfi     w10, w11, X86_EFL_ZF_BIT, 2             /* EFLAGS.ZF and EFLAGS.SF */
#else
        cset    x11, eq
        bfi     w10, w11, X86_EFL_ZF_BIT, 1
        cset    x12, mi
        bfi     w10, w12, X86_EFL_SF_BIT, 1
#endif

.ifne \a_cBits < 32
        bfxil   w10, w9, #\a_cBits, #1                  /* w9 bit 8/16 contains carry. (X86_EFL_CF_BIT == 0) */
.else
        bfxil   x10, x9, #\a_cBits, #1                  /* x9 bit 32 contains carry. (X86_EFL_CF_BIT == 0) */
.endif

.ifne \a_fIntelFlags
        /* Intel: OF = first bit shifted: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
        eor     w11, w8, w8, LSL #1
        lsr     w11, w11, #(\a_cBits - 1)
        bfi     w10, w11, #X86_EFL_OF_BIT, #1

        and     w10, w10, ~X86_EFL_AF                   /* AF is cleared */
.else
        /* AMD: OF = last bit shifted: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
        .ifne \a_cBits < 32
        eor     w11, w9, w9, LSR #1
        lsr     w11, w11, #(\a_cBits - 1)
        .else
        eor     x11, x9, x9, LSR #1
        lsr     x11, x11, #(\a_cBits - 1)
        .endif
        bfi     w10, w11, #X86_EFL_OF_BIT, #1

        orr     w10, w10, X86_EFL_AF                    /* AF is set */
.endif

        str     w10, [x2]
99:
        ret
        .cfi_endproc
.endm

SHL_8_16_32 iemAImpl_shl_u8,        8, 1, b
SHL_8_16_32 iemAImpl_shl_u8_intel,  8, 1, b
SHL_8_16_32 iemAImpl_shl_u8_amd,    8, 0, b

SHL_8_16_32 iemAImpl_shl_u16,       16, 1, h
SHL_8_16_32 iemAImpl_shl_u16_intel, 16, 1, h
SHL_8_16_32 iemAImpl_shl_u16_amd,   16, 0, h

SHL_8_16_32 iemAImpl_shl_u32,       32, 1,
SHL_8_16_32 iemAImpl_shl_u32_intel, 32, 1,
SHL_8_16_32 iemAImpl_shl_u32_amd,   32, 0,

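/* Intel and AMD disagree on the undefined flags of multi-bit shifts: Intel
   derives OF from the first shift step and clears AF, AMD derives OF from
   the last step and sets AF, which is why each operation comes in an _intel
   and an _amd flavour.  The two OF formulas for the 8-bit SHL case, as
   hypothetical C helpers:

    // Intel: OF = sign change on the first step = bit 7 of dst ^ (dst << 1).
    static inline uint32_t ShlOfIntelU8(uint32_t uDst)
    {
        return ((uDst ^ (uDst << 1)) >> 7) & 1;
    }

    // AMD: OF = result sign bit XOR the carry (the last bit shifted out).
    static inline uint32_t ShlOfAmdU8(uint32_t uResult, uint32_t fCarry)
    {
        return ((uResult >> 7) ^ fCarry) & 1;
    }
*/
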
/** @todo this is slightly slower than the C version (release) on an M2.  Investigate why. */
/* void iemAImpl_shl_u64(uint64_t *pu64Dst, uint8_t cShift, uint32_t *pEFlags); */
.macro SHL_64, a_Name, a_fIntelFlags
BEGINPROC_HIDDEN \a_Name
        .cfi_startproc

        /* Do we need to shift anything at all? */
        and     w1, w1, #0x3f
        cbz     w1, 99f

        /*
         * Do the shifting
         */
        ldr     x8, [x0]
        lslv    x9, x8, x1
        str     x9, [x0]

        /*
         * Calculate EFLAGS.
         */
        ldr     w10, [x2]                               /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */

        CALC_EFLAGS_PARITY w10, w9, w11

        ands    xzr, x9, x9                             /* Sets NZ */
        mrs     x11, NZCV
        lsr     w11, w11, #30                           /* [1]=N; [0]=Z */
        bfi     w10, w11, X86_EFL_ZF_BIT, 2             /* EFLAGS.ZF and EFLAGS.SF */

        neg     w11, w1                                 /* the shift count is MODed by the data size, so this is safe. */
        lsrv    x11, x8, x11
        bfi     w10, w11, X86_EFL_CF_BIT, 1

.ifne \a_fIntelFlags
        /* Intel: OF = first bit shifted: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
        eor     x11, x8, x8, LSL #1
        lsr     x11, x11, #63
        bfi     w10, w11, #X86_EFL_OF_BIT, #1

        and     w10, w10, ~X86_EFL_AF                   /* AF is cleared */
.else
        /* AMD: OF = last bit shifted: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
        eor     x11, x11, x9, LSR #63                   /* x11[0] = CF from above */
        bfi     w10, w11, #X86_EFL_OF_BIT, #1

        orr     w10, w10, X86_EFL_AF                    /* AF is set */
.endif
        str     w10, [x2]
99:
        ret
        .cfi_endproc
.endm

SHL_64 iemAImpl_shl_u64,       1
SHL_64 iemAImpl_shl_u64_intel, 1
SHL_64 iemAImpl_shl_u64_amd,   0

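/* The CF computation in SHL_64 leans on ARM's shift-count semantics: lsrv
   only uses the low 6 bits of the count, so "neg w11, w1" yields
   (64 - cShift) mod 64, and shifting the input right by that amount drops
   the last bit shifted out into bit 0.  Hypothetical C sketch, valid for
   cShift in 1..63 (zero has already been branched around):

    static inline uint32_t ShlCfU64(uint64_t uDst, unsigned cShift)
    {
        return (uint32_t)(uDst >> ((64 - cShift) & 63)) & 1;    // (-cShift) & 63
    }
*/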

/*
 * Shift Right, Unsigned.
 */

/* void iemAImpl_shr_u8(uint8_t *pu8Dst, uint8_t cShift, uint32_t *pEFlags); */
/* void iemAImpl_shr_u16(uint16_t *pu16Dst, uint8_t cShift, uint32_t *pEFlags); */
/* void iemAImpl_shr_u32(uint32_t *pu32Dst, uint8_t cShift, uint32_t *pEFlags); */
.macro shr_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdStSuff
BEGINPROC_HIDDEN \a_Name
        .cfi_startproc

        /* Do we need to shift anything at all? */
        and     w1, w1, #0x1f
        cbz     w1, 99f

        /* Load EFLAGS before we start the calculation. */
        ldr     w10, [x2]                               /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */

        /*
         * Do the shifting.
         */
        ldr\a_LdStSuff  w8, [x0]
        lsrv    w9, w8, w1
        str\a_LdStSuff  w9, [x0]

        /*
         * Calculate EFLAGS.
         */
        sub     w11, w1, #1
        lsrv    w11, w8, w11
        bfxil   w10, w11, #X86_EFL_CF_BIT, #1

.ifne \a_fIntelFlags
        and     w10, w10, ~X86_EFL_AF                   /* AF is cleared */
        /* Intel: OF = one bit shift: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDstIn); */
        lsr     w11, w8, #(\a_cBits - 1)
        bfi     w10, w11, #X86_EFL_OF_BIT, #1
.else
        orr     w10, w10, X86_EFL_AF                    /* AF is set */
        /* AMD: OF = last bits shifted: fEfl |= (uResult >> (cOpBits - 2)) << X86_EFL_OF_BIT; */
        lsr     w11, w9, #(\a_cBits - 2)
        bfi     w10, w11, #X86_EFL_OF_BIT, #1
.endif

        CALC_EFLAGS_PARITY w10, w9, w11

.ifne \a_cBits < 32
        setf\a_cBits w9                                 /* Sets NZ */
.else
        ands    wzr, w9, w9                             /* Sets NZ */
.endif
        mrs     x11, NZCV
        lsr     w11, w11, #30                           /* [1]=N; [0]=Z */
        bfi     w10, w11, X86_EFL_ZF_BIT, 2             /* EFLAGS.ZF and EFLAGS.SF */

        str     w10, [x2]
99:
        ret
        .cfi_endproc
.endm

shr_8_16_32 iemAImpl_shr_u8,        8, 1, b
shr_8_16_32 iemAImpl_shr_u8_intel,  8, 1, b
shr_8_16_32 iemAImpl_shr_u8_amd,    8, 0, b

shr_8_16_32 iemAImpl_shr_u16,       16, 1, h
shr_8_16_32 iemAImpl_shr_u16_intel, 16, 1, h
shr_8_16_32 iemAImpl_shr_u16_amd,   16, 0, h

shr_8_16_32 iemAImpl_shr_u32,       32, 1,
shr_8_16_32 iemAImpl_shr_u32_intel, 32, 1,
shr_8_16_32 iemAImpl_shr_u32_amd,   32, 0,

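/* For right shifts CF is the last bit shifted out, i.e. bit (cShift - 1) of
   the input, which is what the sub/lsrv/bfxil triple above computes.
   Hypothetical C sketch, assuming the count was masked and checked non-zero
   already:

    static inline uint32_t ShrCf(uint32_t uDst, unsigned cShift)
    {
        return (uDst >> (cShift - 1)) & 1;
    }
*/
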
/** @todo this is slightly slower than the C version (release) on an M2.  Investigate why. */
/* void iemAImpl_shr_u64(uint64_t *pu64Dst, uint8_t cShift, uint32_t *pEFlags); */
.macro shr_64, a_Name, a_fIntelFlags
BEGINPROC_HIDDEN \a_Name
        .cfi_startproc

        /* Do we need to shift anything at all? */
        ands    w1, w1, #0x3f
        b.eq    99f

        /* Load EFLAGS before we start the calculation. */
        ldr     w10, [x2]                               /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */

        /*
         * Do the shifting
         */
        ldr     x8, [x0]
        lsrv    x9, x8, x1
        str     x9, [x0]

        /*
         * Calculate EFLAGS.
         */
        sub     w11, w1, #1
        lsrv    x11, x8, x11
        bfxil   w10, w11, #X86_EFL_CF_BIT, #1

.ifne \a_fIntelFlags
        and     w10, w10, ~X86_EFL_AF                   /* AF is cleared */
        /* Intel: OF = one bit shift: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDstIn); */
        lsr     x11, x8, #63
        bfi     w10, w11, #X86_EFL_OF_BIT, #1
.else
        orr     w10, w10, X86_EFL_AF                    /* AF is set */
        /* AMD: OF = last bits shifted: fEfl |= (uResult >> (cOpBits - 2)) << X86_EFL_OF_BIT; */
        lsr     x11, x9, #62
        bfi     w10, w11, #X86_EFL_OF_BIT, #1
.endif

        CALC_EFLAGS_PARITY w10, w9, w11

        ands    xzr, x9, x9                             /* Sets NZ */
        mrs     x11, NZCV
        lsr     w11, w11, #30                           /* [1]=N; [0]=Z */
        bfi     w10, w11, X86_EFL_ZF_BIT, 2             /* EFLAGS.ZF and EFLAGS.SF */

        str     w10, [x2]
99:
        ret
        .cfi_endproc
.endm

shr_64 iemAImpl_shr_u64,       1
shr_64 iemAImpl_shr_u64_intel, 1
shr_64 iemAImpl_shr_u64_amd,   0


/*
 * Shift Right, Signed.
 */

/* void iemAImpl_sar_u8(uint8_t *pu8Dst, uint8_t cShift, uint32_t *pEFlags); */
/* void iemAImpl_sar_u16(uint16_t *pu16Dst, uint8_t cShift, uint32_t *pEFlags); */
/* void iemAImpl_sar_u32(uint32_t *pu32Dst, uint8_t cShift, uint32_t *pEFlags); */
.macro sar_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdSuff, a_StSuff
BEGINPROC_HIDDEN \a_Name
        .cfi_startproc

        /* Do we need to shift anything at all? */
        and     w1, w1, #0x1f
        cbz     w1, 99f

        /* Load EFLAGS before we start the calculation. */
        ldr     w10, [x2]                               /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */

        /*
         * Do the shifting.
         */
        ldr\a_LdSuff    w8, [x0]                        /* Sign-extending for 8 and 16 bits! */
        asrv    w9, w8, w1
        str\a_StSuff    w9, [x0]

        /*
         * Calculate EFLAGS.
         */
        sub     w11, w1, #1
        lsrv    w11, w8, w11
        bfxil   w10, w11, #X86_EFL_CF_BIT, #1

.ifne \a_fIntelFlags
        mov     w11, ~(X86_EFL_AF | X86_EFL_OF)
        and     w10, w10, w11                           /* AF and OF are cleared */
.else
        orr     w10, w10, X86_EFL_AF                    /* AF is set */
        and     w10, w10, ~X86_EFL_OF                   /* OF is cleared */
.endif

        CALC_EFLAGS_PARITY w10, w9, w11

.ifne \a_cBits < 32
        setf\a_cBits w9                                 /* Sets NZ */
.else
        ands    wzr, w9, w9                             /* Sets NZ */
.endif
        mrs     x11, NZCV
        lsr     w11, w11, #30                           /* [1]=N; [0]=Z */
        bfi     w10, w11, X86_EFL_ZF_BIT, 2             /* EFLAGS.ZF and EFLAGS.SF */

        str     w10, [x2]
99:
        ret
        .cfi_endproc
.endm

sar_8_16_32 iemAImpl_sar_u8,        8, 1, sb, b
sar_8_16_32 iemAImpl_sar_u8_intel,  8, 1, sb, b
sar_8_16_32 iemAImpl_sar_u8_amd,    8, 0, sb, b

sar_8_16_32 iemAImpl_sar_u16,       16, 1, sh, h
sar_8_16_32 iemAImpl_sar_u16_intel, 16, 1, sh, h
sar_8_16_32 iemAImpl_sar_u16_amd,   16, 0, sh, h

sar_8_16_32 iemAImpl_sar_u32,       32, 1, ,
sar_8_16_32 iemAImpl_sar_u32_intel, 32, 1, ,
sar_8_16_32 iemAImpl_sar_u32_amd,   32, 0, ,

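/* The 8/16-bit SAR variants load with ldrsb/ldrsh, so the sign bit is
   already replicated through bit 31 and a plain 32-bit asrv behaves like a
   narrow arithmetic shift.  Hypothetical C sketch for the 8-bit case (relies
   on the usual arithmetic right shift of negative values):

    static inline uint8_t SarU8(uint8_t uDst, unsigned cShift)
    {
        int32_t iSrc = (int8_t)uDst;        // sign-extend like ldrsb
        return (uint8_t)(iSrc >> cShift);   // arithmetic shift like asrv
    }
*/
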
/** @todo this is slightly slower than the C version (release) on an M2.  Investigate why. */
/* void iemAImpl_sar_u64(uint64_t *pu64Dst, uint8_t cShift, uint32_t *pEFlags); */
.macro sar_64, a_Name, a_fIntelFlags
BEGINPROC_HIDDEN \a_Name
        .cfi_startproc

        /* Do we need to shift anything at all? */
        ands    w1, w1, #0x3f
        b.eq    99f

        /* Load EFLAGS before we start the calculation. */
        ldr     w10, [x2]                               /* w10 = eflags; CF=0 PF=2 AF=4 ZF=6 SF=7 OF=11 */

        /*
         * Do the shifting
         */
        ldr     x8, [x0]
        asrv    x9, x8, x1
        str     x9, [x0]

        /*
         * Calculate EFLAGS.
         */
        sub     w11, w1, #1
        lsrv    x11, x8, x11
        bfxil   w10, w11, #X86_EFL_CF_BIT, #1

.ifne \a_fIntelFlags
        mov     w11, ~(X86_EFL_AF | X86_EFL_OF)
        and     w10, w10, w11                           /* AF and OF are cleared */
.else
        orr     w10, w10, X86_EFL_AF                    /* AF is set */
        and     w10, w10, ~X86_EFL_OF                   /* OF is cleared */
.endif

        CALC_EFLAGS_PARITY w10, w9, w11

        ands    xzr, x9, x9                             /* Sets NZ */
        mrs     x11, NZCV
        lsr     w11, w11, #30                           /* [1]=N; [0]=Z */
        bfi     w10, w11, X86_EFL_ZF_BIT, 2             /* EFLAGS.ZF and EFLAGS.SF */

        str     w10, [x2]
99:
        ret
        .cfi_endproc
.endm

sar_64 iemAImpl_sar_u64,       1
sar_64 iemAImpl_sar_u64_intel, 1
sar_64 iemAImpl_sar_u64_amd,   0
