IEMAllAImpl-arm64.S@ 104239

Last change on this file since 104239 was 104239, checked in by vboxsync, 8 months ago
VMM/IEM: ARM assembly rendition of ROR. bugref:10376
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 26.3 KB

Line
1	/* $Id: IEMAllAImpl-arm64.S 104239 2024-04-08 21:33:56Z vboxsync $ */
2	/** @file
3	* IEM - Instruction Implementation in Assembly, ARM64 variant.
4	*/
5
6	/*
7	* Copyright (C) 2023 Oracle and/or its affiliates.
8	*
9	* This file is part of VirtualBox base platform packages, as
10	* available from https://www.virtualbox.org.
11	*
12	* This program is free software; you can redistribute it and/or
13	* modify it under the terms of the GNU General Public License
14	* as published by the Free Software Foundation, in version 3 of the
15	* License.
16	*
17	* This program is distributed in the hope that it will be useful, but
18	* WITHOUT ANY WARRANTY; without even the implied warranty of
19	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20	* General Public License for more details.
21	*
22	* You should have received a copy of the GNU General Public License
23	* along with this program; if not, see <https://www.gnu.org/licenses>.
24	*
25	* SPDX-License-Identifier: GPL-3.0-only
26	*/
27
28
29	/*********************************************************************************************************************************
30	* Header Files *
31	*********************************************************************************************************************************/
32	#include <iprt/asmdefs-arm.h>
33	#include <iprt/x86.h>
34
35	#define IEM_AIMPL_FUNCTION_ALIGNMENT 0x20
36
37
38	#if RT_CLANG_PREREQ(15, 0)
39	.arch_extension flagm /* not necessary */
40	#else
41	/* clang 12.0.x defaults to apple-a12. M1 is more similar to A14, I guess.
42	For some reason the +crc make cfinv work (with clang 12). 'flagm' isn't
43	recognized, nor is the 'fmi' in the error message for cfinv. 'flagm'
44	work for v15 and is enabled by default it seems. */
45	# ifdef RT_OS_DARWIN
46	.cpu apple-a14+crc
47	# else
48	.cpu cortex-a53+flagm
49	# endif
50	#endif
51
52
/**
 * Updates EFLAGS.PF from the low byte of a result.
 *
 * @param   regEfl      Register holding the EFLAGS value; only bit
 *                      X86_EFL_PF_BIT is modified.
 * @param   regResult   Register holding the result.  Only bits 7:0 end up
 *                      in the parity bit, higher bits are folded away.
 * @param   regTmp      Scratch register, clobbered.
 *
 * None of the instructions used are flag-setting variants, so the arm NZCV
 * flags are left as they were.
 */
.macro CALC_EFLAGS_PARITY, regEfl, regResult, regTmp
        /*
         * Parity calculation for low byte of the result (sucks that there is no popcount for gprs).
         * XOR-fold the byte onto bit 0: 8 -> 4 -> 2 -> 1 bits.
         */
        eor     \regTmp, \regResult, \regResult, LSR #4
        eor     \regTmp, \regTmp, \regTmp, LSR #2
        eor     \regTmp, \regTmp, \regTmp, LSR #1
        eor     \regTmp, \regTmp, #1                    /* x86 PF is set for an EVEN number of set bits. */
        bfi     \regEfl, \regTmp, #X86_EFL_PF_BIT, #1   /* PF(2) = popcount(result & 0xff) & 1 ^ 1 */
.endm
63
64
/**
 * Updates EFLAGS.AF (auxiliary carry / borrow out of bit 3).
 *
 * @param   regEfl      Register holding the EFLAGS value; only bit
 *                      X86_EFL_AF_BIT is modified.
 * @param   regResult   Register holding the result of the operation.
 * @param   regLeft     Register holding the left (destination) input.
 * @param   regRight    Register holding the right (source) input.
 * @param   regTmp      Scratch register, clobbered.
 *
 * None of the instructions used are flag-setting variants, so the arm NZCV
 * flags are left as they were.
 */
.macro CALC_EFLAGS_AUX_CARRY, regEfl, regResult, regLeft, regRight, regTmp
        /*
         * Auxiliary carry / borrow flag.  This is related to 8-bit BCD.
         * Bit 4 of (left ^ right ^ result) is the carry/borrow into bit 4.
         */
        eor     \regTmp, \regLeft, \regRight
        eor     \regTmp, \regTmp, \regResult
        lsr     \regTmp, \regTmp, #X86_EFL_AF_BIT
        bfi     \regEfl, \regTmp, #X86_EFL_AF_BIT, #1   /* AF(4) = ((left ^ right ^ result) & X86_EFL_AF) >> X86_EFL_AF_BIT */
.endm
74
/**
 * Calculates the x86 status flags after a flag-setting arm instruction.
 *
 * Must be invoked while the arm NZCV flags still hold the outcome of the
 * operation (e.g. right after subs / setf8 / setf16).
 *
 * @param   regEfl      Register holding the EFLAGS value, updated in place.
 * @param   regResult   Register holding the result (for PF and AF).
 * @param   regLeft     Register holding the left input (for AF).
 * @param   regRight    Register holding the right input (for AF).
 * @param   regTmp      Scratch register, clobbered.
 * @param   fSkipFlags  Mask of X86_EFL_OF/CF/ZF/SF bits that must NOT be
 *                      derived from NZCV.  0 and exactly X86_EFL_OF take the
 *                      mrs based path, anything else the cset based one.
 *
 * PF and AF are always calculated.  The mrs based path executes cfinv and
 * thus leaves the arm carry flag inverted.
 */
.macro CALC_EFLAGS, regEfl, regResult, regLeft, regRight, regTmp, fSkipFlags=0
        /*
         * Translate the arm NZCV bits into corresponding EFLAGS bits.
         */
 .if \fSkipFlags == 0 || \fSkipFlags == X86_EFL_OF
#if 0
        /* Maybe just a tiny bit slow than the next one. */
        mrs     \regTmp, NZCV                           /* [31] = N; [30] = Z; [29] = C; [28] = V */
  .ifeq \fSkipFlags & X86_EFL_OF
        lsr     \regTmp, \regTmp, #28
        bfi     \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
        lsr     \regTmp, \regTmp, #1
  .else
        lsr     \regTmp, \regTmp, #29
  .endif
        eor     \regTmp, \regTmp, #1                    /* inverts the carry flag to x86 style. */
        bfi     \regEfl, \regTmp, #X86_EFL_CF_BIT, #1   /* CF(0) = C */
        lsr     \regTmp, \regTmp, #1
        bfi     \regEfl, \regTmp, #X86_EFL_ZF_BIT, #2   /* SF(7),ZF(6) = NZ */
#else
        /* This seems to be the faster one... */
        cfinv                                           /* arm C is "no borrow", x86 CF is "borrow". */
        mrs     \regTmp, NZCV                           /* [31] = N; [30] = Z; [29] = C; [28] = V */
  .ifeq (\fSkipFlags & X86_EFL_OF)
        lsr     \regTmp, \regTmp, #28
        bfi     \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
        lsr     \regTmp, \regTmp, #1
  .else
        lsr     \regTmp, \regTmp, #29
  .endif
        bfi     \regEfl, \regTmp, #X86_EFL_CF_BIT, #1   /* CF(0) = C */
        lsr     \regTmp, \regTmp, #1
        bfi     \regEfl, \regTmp, #X86_EFL_ZF_BIT, #2   /* SF(7),ZF(6) = NZ */
#endif
 .else
        /* Definitely slower than the above two, but easier to handle wrt skipping parts. */
  .ifeq \fSkipFlags & X86_EFL_ZF
        cset    \regTmp, eq
        bfi     \regEfl, \regTmp, #X86_EFL_ZF_BIT, #1
  .endif
  .ifeq \fSkipFlags & X86_EFL_CF
        cset    \regTmp, cc                             /* carry clear on arm = borrow on x86. */
        bfi     \regEfl, \regTmp, #X86_EFL_CF_BIT, #1
  .endif
  .ifeq \fSkipFlags & X86_EFL_OF
        cset    \regTmp, vs
        bfi     \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
  .endif
  .ifeq \fSkipFlags & X86_EFL_SF
        cset    \regTmp, mi
        bfi     \regEfl, \regTmp, #X86_EFL_SF_BIT, #1
  .endif
 .endif

        /*
         * Parity calculation for low byte of the result.
         * (Same instruction sequence as was previously spelled out inline here.)
         */
        CALC_EFLAGS_PARITY      \regEfl, \regResult, \regTmp

        /*
         * Auxiliary carry / borrow flag.  This is related to 8-bit BCD.
         * (Same instruction sequence as was previously spelled out inline here.)
         */
        CALC_EFLAGS_AUX_CARRY   \regEfl, \regResult, \regLeft, \regRight, \regTmp

        /* done */
.endm
149
150
151	BEGINCODE
152
153
154
155	/* Some sketches.
156
157	// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked, (uint8_t *pu8Mem, uint8_t *pu8Reg));
158	BEGINPROC_HIDDEN iemAImpl_xchg_u8_locked
159	ldrb w2, [x1]
160	swpalb w2, w2, [x0]
161	strb w2, [x1]
162	ret
163
164	// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_locked,(uint16_t *pu16Mem, uint16_t *pu16Reg));
165	BEGINPROC_HIDDEN iemAImpl_xchg_u16_locked
166	ldrh w2, [x1]
167	swpalh w2, w2, [x0]
168	strh w2, [x1]
169	ret
170
171	// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_locked,(uint32_t *pu32Mem, uint32_t *pu32Reg));
172	// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_locked,(uint64_t *pu64Mem, uint64_t *pu64Reg));
173
174	*/
175
176
177	/* IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked, (uint8_t *pu8Mem, uint8_t *pu8Reg)); */
178
179	/*
180	* The SUB instruction (the result is stored back to the destination).
181	*/
182
/*
 * uint32_t iemAImpl_sub_u8(uint32_t fEFlags, uint8_t *puDst, uint8_t uSrc);
 *
 * In:     w0 = fEFlags, x1 = puDst, w2 = uSrc.
 * Out:    w0 = fEFlags with CF, PF, AF, ZF, SF and OF updated; *puDst -= uSrc.
 * Clobb:  w8, w9, w11, w12, NZCV.
 */
ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
BEGINPROC_HIDDEN iemAImpl_sub_u8
        .cfi_startproc
        /* Do the subtraction. */
        ldrb    w8, [x1]
        /*and     w2, w2, #0xff - should not be necessary. */
        /* NOTE(review): the borrow below is only right if the caller passes uSrc
           zero-extended in w2 - confirm this holds for all supported ABIs. */
        subs    w9, w8, w2                      /* w9 = w8 (*puDst) - w2 (uSrc); C = no borrow */
        strb    w9, [x1]
        setf8   w9                              /* N and Z from the 8-bit result; C is kept. */

        /* Calculate EFLAGS (passed in and returned via x0). */
        and     w9, w9, #0xff
        CALC_EFLAGS x0, x9, x8, x2, x11, X86_EFL_OF

        /* The overflow flag calc done by setf8 isn't correct for subtraction, so we have to
           figure it out ourselves. (See IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC for details.) */
        eor     w11, w8, w2                     /* input dst ^ source (simplified from ~(dst ^ (source ^ 0x80)) ). */
        eor     w12, w8, w9                     /* input dst ^ result */
        and     w11, w12, w11
        lsr     w11, w11, #7                    /* the sign bit of the 8-bit operand */
        bfi     w0, w11, #X86_EFL_OF_BIT, #1

        ret
        .cfi_endproc
208
209
/*
 * uint32_t iemAImpl_sub_u16(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc);
 *
 * In:     w0 = fEFlags, x1 = puDst, w2 = uSrc.
 * Out:    w0 = fEFlags with CF, PF, AF, ZF, SF and OF updated; *puDst -= uSrc.
 * Clobb:  w8, w9, w11, w12, NZCV.
 */
ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
BEGINPROC_HIDDEN iemAImpl_sub_u16
        .cfi_startproc
        /* Do the subtraction. */
        ldrh    w8, [x1]
        /*and     w2, w2, #0xffff - should not be necessary. */
        /* NOTE(review): the borrow below is only right if the caller passes uSrc
           zero-extended in w2 - confirm this holds for all supported ABIs. */
        subs    w9, w8, w2                      /* w9 = w8 (*puDst) - w2 (uSrc); C = no borrow */
        setf16  w9                              /* N and Z from the 16-bit result; C is kept. */
        strh    w9, [x1]

        /* Calculate EFLAGS (passed in and returned via x0). */
        and     w9, w9, #0xffff
        CALC_EFLAGS x0, x9, x8, x2, x11, X86_EFL_OF

        /* The overflow flag calc done by setf16 isn't correct for subtraction, so we have to
           figure it out ourselves. (See IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC for details.) */
        eor     w11, w8, w2                     /* input dst ^ source (simplified from ~(dst ^ (source ^ 0x8000)) ). */
        eor     w12, w8, w9                     /* input dst ^ result */
        and     w11, w12, w11
        lsr     w11, w11, #15                   /* the sign bit of the 16-bit operand */
        bfi     w0, w11, #X86_EFL_OF_BIT, #1

        ret
        .cfi_endproc
235
236
/*
 * uint32_t iemAImpl_sub_u32(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc);
 *
 * In:     w0 = fEFlags, x1 = puDst, w2 = uSrc.
 * Out:    w0 = fEFlags with CF, PF, AF, ZF, SF and OF updated; *puDst -= uSrc.
 * Clobb:  w8, w9, x11, NZCV.
 *
 * The hand-expanded flag conversion variants that used to sit here under
 * '#if 0' were copies of what the CALC_EFLAGS macro contains and have been
 * dropped; only the macro invocation was ever assembled.
 */
ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
BEGINPROC_HIDDEN iemAImpl_sub_u32
        .cfi_startproc
        /* Do the subtraction. */
        ldr     w8, [x1]
        subs    w9, w8, w2                      /* w9 = w8 (*puDst) - w2 (uSrc) */
        str     w9, [x1]

        /* Calculate EFLAGS (passed in and returned via x0).  The 32-bit subs
           produces all four of N, Z, C and V directly, so nothing is skipped. */
        CALC_EFLAGS x0, x9, x8, x2, x11

        ret
        .cfi_endproc
297
298
/*
 * uint32_t iemAImpl_sub_u64(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc);
 *
 * In:     w0 = fEFlags, x1 = puDst, x2 = uSrc.
 * Out:    w0 = fEFlags with CF, PF, AF, ZF, SF and OF updated; *puDst -= uSrc.
 * Clobb:  x8, x9, x11, NZCV.
 */
ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
BEGINPROC_HIDDEN iemAImpl_sub_u64
        .cfi_startproc
        /* Do the subtraction. */
        ldr     x8, [x1]
        subs    x9, x8, x2                      /* x9 = x8 (*puDst) - x2 (uSrc) */
        str     x9, [x1]

        /* Calculate EFLAGS (passed in and returned via x0). */
        CALC_EFLAGS x0, x9, x8, x2, x11

        ret
        .cfi_endproc
313
314
315
316	/*
317	* Shift Left.
318	*/
319
/* uint32_t iemAImpl_shl_u8( uint32_t fEFlagsIn, uint8_t  *pu8Dst,  uint8_t cShift); */
/* uint32_t iemAImpl_shl_u16(uint32_t fEFlagsIn, uint16_t *pu16Dst, uint8_t cShift); */
/* uint32_t iemAImpl_shl_u32(uint32_t fEFlagsIn, uint32_t *pu32Dst, uint8_t cShift); */
/**
 * Emits one SHL worker for an 8, 16 or 32-bit operand.
 *
 * @param   a_Name          The procedure name.
 * @param   a_cBits         Operand width in bits: 8, 16 or 32.
 * @param   a_fIntelFlags   Non-zero: Intel flavoured OF and AF cleared.
 *                          Zero: AMD flavoured OF and AF set.
 * @param   a_LdStSuff      ldr/str size suffix: b, h or empty (32-bit).
 *
 * Generated code: w0 = EFLAGS in/out, x1 = pointer to the operand,
 * w2 = shift count (masked with 0x1f; nothing is touched if that is zero).
 * Clobbers w2, x8, x9, x11, w12 and NZCV.
 */
.macro SHL_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdStSuff
ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
BEGINPROC_HIDDEN \a_Name
        .cfi_startproc

        /* Do we need to shift anything at all? */
        and     w2, w2, #0x1f
        cbz     w2, 99f

        /*
         * Do the shifting
         */
        ldr\a_LdStSuff  w8, [x1]
.ifne \a_cBits < 32
        lslv    w9, w8, w2
.else
        lslv    x9, x8, x2                      /* use 64-bit registers here so we get CF for free. We know x2 != 0. */
.endif
        str\a_LdStSuff  w9, [x1]

        /*
         * Calculate EFLAGS.
         */
        CALC_EFLAGS_PARITY w0, w9, w12

.ifne \a_cBits < 32
        setf\a_cBits w9                         /* Sets NZ */
.else
        ands    wzr, w9, w9                     /* Sets NZ */
.endif
#if 1
        mrs     x11, NZCV
        lsr     w11, w11, #30                   /* N=1; Z=0 */
        bfi     w0, w11, X86_EFL_ZF_BIT, 2      /* EFLAGS.ZF and EFLAGS.SF */
#else
        cset    x11, eq
        bfi     w0, w11, X86_EFL_ZF_BIT, 1
        cset    x12, mi                         /* SF = N (this disabled variant used 'pl', i.e. the inverse). */
        bfi     w0, w12, X86_EFL_SF_BIT, 1
#endif

.ifne \a_cBits < 32
        bfxil   w0, w9, #\a_cBits, #1           /* w9 bit 8/16 contains carry. (X86_EFL_CF_BIT == 0) */
.else
        bfxil   x0, x9, #\a_cBits, #1           /* x9 bit 32 contains carry. (X86_EFL_CF_BIT == 0) */
.endif

.ifne \a_fIntelFlags
        /* Intel: OF = first bit shifted: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
        eor     w11, w8, w8, LSL #1
        lsr     w11, w11, #(\a_cBits - 1)
        bfi     w0, w11, #X86_EFL_OF_BIT, #1

        and     w0, w0, ~X86_EFL_AF             /* AF is cleared */
.else
        /* AMD: OF = last bit shifted: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
 .ifne \a_cBits < 32
        eor     w11, w9, w9, LSR #1
        lsr     w11, w11, #(\a_cBits - 1)
 .else
        eor     x11, x9, x9, LSR #1
        lsr     x11, x11, #(\a_cBits - 1)
 .endif
        bfi     w0, w11, #X86_EFL_OF_BIT, #1

        orr     w0, w0, X86_EFL_AF              /* AF is set */
.endif

99:
        ret
        .cfi_endproc
.endm

SHL_8_16_32 iemAImpl_shl_u8,        8, 1, b
SHL_8_16_32 iemAImpl_shl_u8_intel,  8, 1, b
SHL_8_16_32 iemAImpl_shl_u8_amd,    8, 0, b

SHL_8_16_32 iemAImpl_shl_u16,       16, 1, h
SHL_8_16_32 iemAImpl_shl_u16_intel, 16, 1, h
SHL_8_16_32 iemAImpl_shl_u16_amd,   16, 0, h

SHL_8_16_32 iemAImpl_shl_u32,       32, 1,
SHL_8_16_32 iemAImpl_shl_u32_intel, 32, 1,
SHL_8_16_32 iemAImpl_shl_u32_amd,   32, 0,
407
/** @todo this is slightly slower than the C version (release) on an M2. Investigate why. */
/* uint32_t iemAImpl_shl_u64(uint32_t fEFlagsIn, uint64_t *pu64Dst, uint8_t cShift); */
/**
 * Emits one 64-bit SHL worker.
 *
 * @param   a_Name          The procedure name.
 * @param   a_fIntelFlags   Non-zero: Intel flavoured OF and AF cleared.
 *                          Zero: AMD flavoured OF and AF set.
 *
 * Generated code: w0 = EFLAGS in/out, x1 = pointer to the operand,
 * w2 = shift count (masked with 0x3f; nothing is touched if that is zero).
 * Clobbers w2, x8, x9, x11 and NZCV.
 */
.macro SHL_64, a_Name, a_fIntelFlags
ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
BEGINPROC_HIDDEN \a_Name
        .cfi_startproc

        /* Do we need to shift anything at all? */
        and     w2, w2, #0x3f
        cbz     w2, 99f

        /*
         * Do the shifting
         */
        ldr     x8, [x1]
        lslv    x9, x8, x2
        str     x9, [x1]

        /*
         * Calculate EFLAGS.
         */
        CALC_EFLAGS_PARITY w0, w9, w11

        ands    xzr, x9, x9                     /* Sets NZ */
        mrs     x11, NZCV
        lsr     w11, w11, #30                   /* N=1; Z=0 */
        bfi     w0, w11, X86_EFL_ZF_BIT, 2      /* EFLAGS.ZF and EFLAGS.SF */

        /* CF = the last bit shifted out = input bit (64 - cShift). */
        neg     w11, w2                         /* the shift count is MODed by the data size, so this is safe. */
        lsrv    x11, x8, x11
        bfi     w0, w11, X86_EFL_CF_BIT, 1

.ifne \a_fIntelFlags
        /* Intel: OF = first bit shifted: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
        eor     x11, x8, x8, LSL #1
        lsr     x11, x11, #63
        bfi     w0, w11, #X86_EFL_OF_BIT, #1

        and     w0, w0, ~X86_EFL_AF             /* AF is cleared */
.else
        /* AMD: OF = last bit shifted: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
        eor     x11, x11, x9, LSR #63           /* w11[0]=CF from above */
        bfi     w0, w11, #X86_EFL_OF_BIT, #1

        orr     w0, w0, X86_EFL_AF              /* AF is set */
.endif
99:
        ret
        .cfi_endproc
.endm

SHL_64 iemAImpl_shl_u64,        1
SHL_64 iemAImpl_shl_u64_intel,  1
SHL_64 iemAImpl_shl_u64_amd,    0
462
463
464	/*
465	* Shift Right, Unsigned.
466	*/
467
/* uint32_t iemAImpl_shr_u8( uint32_t fEFlagsIn, uint8_t  *pu8Dst,  uint8_t cShift); */
/* uint32_t iemAImpl_shr_u16(uint32_t fEFlagsIn, uint16_t *pu16Dst, uint8_t cShift); */
/* uint32_t iemAImpl_shr_u32(uint32_t fEFlagsIn, uint32_t *pu32Dst, uint8_t cShift); */
/**
 * Emits one SHR (logical shift right) worker for an 8, 16 or 32-bit operand.
 *
 * @param   a_Name          The procedure name.
 * @param   a_cBits         Operand width in bits: 8, 16 or 32.
 * @param   a_fIntelFlags   Non-zero: Intel flavoured OF and AF cleared.
 *                          Zero: AMD flavoured OF and AF set.
 * @param   a_LdStSuff      ldr/str size suffix: b, h or empty (32-bit).
 *
 * Generated code: w0 = EFLAGS in/out, x1 = pointer to the operand,
 * w2 = shift count (masked with 0x1f; nothing is touched if that is zero).
 * Clobbers w2, w8, w9, x11 and NZCV.
 */
.macro shr_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdStSuff
ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
BEGINPROC_HIDDEN \a_Name
        .cfi_startproc

        /* Do we need to shift anything at all? */
        and     w2, w2, #0x1f
        cbz     w2, 99f

        /*
         * Do the shifting.
         */
        ldr\a_LdStSuff  w8, [x1]
        lsrv    w9, w8, w2
        str\a_LdStSuff  w9, [x1]

        /*
         * Calculate EFLAGS.
         */
        /* CF = the last bit shifted out = input bit (cShift - 1). */
        sub     w11, w2, #1
        lsrv    w11, w8, w11
        bfxil   w0, w11, #X86_EFL_CF_BIT, #1

.ifne \a_fIntelFlags
        and     w0, w0, ~X86_EFL_AF             /* AF is cleared */
        /* Intel: OF = one bit shift: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDstIn); */
        lsr     w11, w8, #(\a_cBits - 1)
        bfi     w0, w11, #X86_EFL_OF_BIT, #1
.else
        orr     w0, w0, X86_EFL_AF              /* AF is set */
        /* AMD: OF = last bits shifted: fEfl |= (uResult >> (cOpBits - 2)) << X86_EFL_OF_BIT; */
        lsr     w11, w9, #(\a_cBits - 2)
        bfi     w0, w11, #X86_EFL_OF_BIT, #1
.endif

        CALC_EFLAGS_PARITY w0, w9, w11

.ifne \a_cBits < 32
        setf\a_cBits w9                         /* Sets NZ */
.else
        ands    wzr, w9, w9                     /* Sets NZ */
.endif
        mrs     x11, NZCV
        lsr     w11, w11, #30                   /* N=1; Z=0 */
        bfi     w0, w11, X86_EFL_ZF_BIT, 2      /* EFLAGS.ZF and EFLAGS.SF */

99:
        ret
        .cfi_endproc
.endm

shr_8_16_32 iemAImpl_shr_u8,        8, 1, b
shr_8_16_32 iemAImpl_shr_u8_intel,  8, 1, b
shr_8_16_32 iemAImpl_shr_u8_amd,    8, 0, b

shr_8_16_32 iemAImpl_shr_u16,       16, 1, h
shr_8_16_32 iemAImpl_shr_u16_intel, 16, 1, h
shr_8_16_32 iemAImpl_shr_u16_amd,   16, 0, h

shr_8_16_32 iemAImpl_shr_u32,       32, 1,
shr_8_16_32 iemAImpl_shr_u32_intel, 32, 1,
shr_8_16_32 iemAImpl_shr_u32_amd,   32, 0,
533
/** @todo this is slightly slower than the C version (release) on an M2. Investigate why. */
/* uint32_t iemAImpl_shr_u64(uint32_t fEFlagsIn, uint64_t *pu64Dst, uint8_t cShift); */
/**
 * Emits one 64-bit SHR (logical shift right) worker.
 *
 * @param   a_Name          The procedure name.
 * @param   a_fIntelFlags   Non-zero: Intel flavoured OF and AF cleared.
 *                          Zero: AMD flavoured OF and AF set.
 *
 * Generated code: w0 = EFLAGS in/out, x1 = pointer to the operand,
 * w2 = shift count (masked with 0x3f; nothing is touched if that is zero).
 * Clobbers w2, x8, x9, x11 and NZCV.
 */
.macro shr_64, a_Name, a_fIntelFlags
ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
BEGINPROC_HIDDEN \a_Name
        .cfi_startproc

        /* Do we need to shift anything at all? */
        ands    w2, w2, #0x3f
        b.eq    99f

        /*
         * Do the shifting
         */
        ldr     x8, [x1]
        lsrv    x9, x8, x2
        str     x9, [x1]

        /*
         * Calculate EFLAGS.
         */
        /* CF = the last bit shifted out = input bit (cShift - 1). */
        sub     w11, w2, #1
        lsrv    x11, x8, x11
        bfxil   w0, w11, #X86_EFL_CF_BIT, #1

.ifne \a_fIntelFlags
        and     w0, w0, ~X86_EFL_AF             /* AF is cleared */
        /* Intel: OF = one bit shift: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDstIn); */
        lsr     x11, x8, #63
        bfi     w0, w11, #X86_EFL_OF_BIT, #1
.else
        orr     w0, w0, X86_EFL_AF              /* AF is set */
        /* AMD: OF = last bits shifted: fEfl |= (uResult >> (cOpBits - 2)) << X86_EFL_OF_BIT; */
        lsr     x11, x9, #62
        bfi     w0, w11, #X86_EFL_OF_BIT, #1
.endif

        CALC_EFLAGS_PARITY w0, w9, w11

        ands    xzr, x9, x9                     /* Sets NZ */
        mrs     x11, NZCV
        lsr     w11, w11, #30                   /* N=1; Z=0 */
        bfi     w0, w11, X86_EFL_ZF_BIT, 2      /* EFLAGS.ZF and EFLAGS.SF */

99:
        ret
        .cfi_endproc
.endm

shr_64 iemAImpl_shr_u64,        1
shr_64 iemAImpl_shr_u64_intel,  1
shr_64 iemAImpl_shr_u64_amd,    0
586
587
588	/*
589	* Shift Right, Signed
590	*/
591
/* uint32_t iemAImpl_sar_u8( uint32_t fEFlagsIn, uint8_t  *pu8Dst,  uint8_t cShift); */
/* uint32_t iemAImpl_sar_u16(uint32_t fEFlagsIn, uint16_t *pu16Dst, uint8_t cShift); */
/* uint32_t iemAImpl_sar_u32(uint32_t fEFlagsIn, uint32_t *pu32Dst, uint8_t cShift); */
/**
 * Emits one SAR (arithmetic shift right) worker for an 8, 16 or 32-bit operand.
 *
 * @param   a_Name          The procedure name.
 * @param   a_cBits         Operand width in bits: 8, 16 or 32.
 * @param   a_fIntelFlags   Non-zero: AF and OF cleared (Intel).
 *                          Zero: AF set and OF cleared (AMD).
 * @param   a_LdSuff        ldr suffix; the sign-extending form (sb, sh) for
 *                          8/16 bits so a 32-bit asrv shifts in the right bit.
 * @param   a_StSuff        str size suffix: b, h or empty (32-bit).
 *
 * Generated code: w0 = EFLAGS in/out, x1 = pointer to the operand,
 * w2 = shift count (masked with 0x1f; nothing is touched if that is zero).
 * Clobbers w2, w8, w9, x11 and NZCV.
 */
.macro sar_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdSuff, a_StSuff
ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
BEGINPROC_HIDDEN \a_Name
        .cfi_startproc

        /* Do we need to shift anything at all? */
        and     w2, w2, #0x1f
        cbz     w2, 99f

        /*
         * Do the shifting.
         */
        ldr\a_LdSuff    w8, [x1]                /* Sign-extending for 8 and 16 bits! */
        asrv    w9, w8, w2
        str\a_StSuff    w9, [x1]

        /*
         * Calculate EFLAGS.
         */
        /* CF = the last bit shifted out = bit (cShift - 1) of the sign-extended input. */
        sub     w11, w2, #1
        lsrv    w11, w8, w11
        bfxil   w0, w11, #X86_EFL_CF_BIT, #1

.ifne \a_fIntelFlags
        mov     w11, ~(X86_EFL_AF | X86_EFL_OF)
        and     w0, w0, w11                     /* AF and OF are cleared */
.else
        orr     w0, w0, X86_EFL_AF              /* AF is set */
        and     w0, w0, ~X86_EFL_OF             /* OF is cleared */
.endif

        CALC_EFLAGS_PARITY w0, w9, w11

.ifne \a_cBits < 32
        setf\a_cBits w9                         /* Sets NZ */
.else
        ands    wzr, w9, w9                     /* Sets NZ */
.endif
        mrs     x11, NZCV
        lsr     w11, w11, #30                   /* N=1; Z=0 */
        bfi     w0, w11, X86_EFL_ZF_BIT, 2      /* EFLAGS.ZF and EFLAGS.SF */

99:
        ret
        .cfi_endproc
.endm

sar_8_16_32 iemAImpl_sar_u8,        8, 1, sb, b
sar_8_16_32 iemAImpl_sar_u8_intel,  8, 1, sb, b
sar_8_16_32 iemAImpl_sar_u8_amd,    8, 0, sb, b

sar_8_16_32 iemAImpl_sar_u16,       16, 1, sh, h
sar_8_16_32 iemAImpl_sar_u16_intel, 16, 1, sh, h
sar_8_16_32 iemAImpl_sar_u16_amd,   16, 0, sh, h

sar_8_16_32 iemAImpl_sar_u32,       32, 1, ,
sar_8_16_32 iemAImpl_sar_u32_intel, 32, 1, ,
sar_8_16_32 iemAImpl_sar_u32_amd,   32, 0, ,
653
/** @todo this is slightly slower than the C version (release) on an M2. Investigate why. */
/* uint32_t iemAImpl_sar_u64(uint32_t fEFlagsIn, uint64_t *pu64Dst, uint8_t cShift); */
/**
 * Emits one 64-bit SAR (arithmetic shift right) worker.
 *
 * @param   a_Name          The procedure name.
 * @param   a_fIntelFlags   Non-zero: AF and OF cleared (Intel).
 *                          Zero: AF set and OF cleared (AMD).
 *
 * Generated code: w0 = EFLAGS in/out, x1 = pointer to the operand,
 * w2 = shift count (masked with 0x3f; nothing is touched if that is zero).
 * Clobbers w2, x8, x9, x11 and NZCV.
 */
.macro sar_64, a_Name, a_fIntelFlags
ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
BEGINPROC_HIDDEN \a_Name
        .cfi_startproc

        /* Do we need to shift anything at all? */
        ands    w2, w2, #0x3f
        b.eq    99f

        /*
         * Do the shifting
         */
        ldr     x8, [x1]
        asrv    x9, x8, x2
        str     x9, [x1]

        /*
         * Calculate EFLAGS.
         */
        /* CF = the last bit shifted out = input bit (cShift - 1). */
        sub     w11, w2, #1
        lsrv    x11, x8, x11
        bfxil   w0, w11, #X86_EFL_CF_BIT, #1

.ifne \a_fIntelFlags
        mov     w11, ~(X86_EFL_AF | X86_EFL_OF)
        and     w0, w0, w11                     /* AF and OF are cleared */
.else
        orr     w0, w0, X86_EFL_AF              /* AF is set */
        and     w0, w0, ~X86_EFL_OF             /* OF is cleared */
.endif

        CALC_EFLAGS_PARITY w0, w9, w11

        ands    xzr, x9, x9                     /* Sets NZ */
        mrs     x11, NZCV
        lsr     w11, w11, #30                   /* N=1; Z=0 */
        bfi     w0, w11, X86_EFL_ZF_BIT, 2      /* EFLAGS.ZF and EFLAGS.SF */

99:
        ret
        .cfi_endproc
.endm

sar_64 iemAImpl_sar_u64,        1
sar_64 iemAImpl_sar_u64_intel,  1
sar_64 iemAImpl_sar_u64_amd,    0
702
703
704	/*
705	* Rotate Left.
706	*/
707
/* uint32_t iemAImpl_rol_u8( uint32_t fEFlagsIn, uint8_t  *pu8Dst,  uint8_t cShift); */
/* uint32_t iemAImpl_rol_u16(uint32_t fEFlagsIn, uint16_t *pu16Dst, uint8_t cShift); */
/* uint32_t iemAImpl_rol_u32(uint32_t fEFlagsIn, uint32_t *pu32Dst, uint8_t cShift); */
/**
 * Emits one ROL worker for an 8, 16 or 32-bit operand.
 *
 * @param   a_Name          The procedure name.
 * @param   a_cBits         Operand width in bits: 8, 16 or 32.
 * @param   a_fIntelFlags   Non-zero: OF from the first rotate step (Intel).
 *                          Zero: OF from the last rotate step (AMD).
 * @param   a_LdStSuff      ldr/str size suffix: b, h or empty (32-bit).
 *
 * Generated code: w0 = EFLAGS in/out (only CF and OF are updated),
 * x1 = pointer to the operand, w2 = rotate count (masked with 0x1f; nothing
 * is touched if that is zero).  Clobbers w2, w3, w8, w9, w10 and w11.
 */
.macro ROL_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdStSuff
ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
BEGINPROC_HIDDEN \a_Name
        .cfi_startproc

        /* Do we need to rotate anything at all? */
        and     w2, w2, #0x1f
        cbz     w2, 99f

        /*
         * Do the shifting
         */
.ifne \a_cBits < 32
        /* There is no 8/16-bit rotate, so combine a left and a right shift:
           result = (uDst << n) | (uDst >> ((cBits - n) % cBits)), n = count % cBits. */
        and     w2, w2, #(\a_cBits - 1)
        neg     w3, w2
        and     w3, w3, #(\a_cBits - 1)
        ldr\a_LdStSuff  w8, [x1]
        lslv    w9, w8, w2
        lsrv    w10, w8, w3
        orr     w9, w9, w10
        str\a_LdStSuff  w9, [x1]
        and     w9, w9, #(RT_BIT_32(\a_cBits) - 1)     /* drop what was shifted past the operand width. */
.else
        neg     w3, w2                          /* the count is MODed by the data size, so this is safe. */
        ldr\a_LdStSuff  w8, [x1]
        rorv    w9, w8, w3                      /* rotating right by -n is rotating left by n. */
        str\a_LdStSuff  w9, [x1]
.endif

        /*
         * Calculate EFLAGS - only CF and OF.
         */
        bfi     w0, w9, #0, #1                  /* CF = last bit rotated around */

.ifne \a_fIntelFlags
        /* Intel: OF = first rotate step: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
        eor     w11, w8, w8, LSL #1
        lsr     w11, w11, #(\a_cBits - 1)
        bfi     w0, w11, #X86_EFL_OF_BIT, #1
.else
        /* AMD: OF = last rotate step: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
        eor     w11, w0, w9, LSR #(\a_cBits - 1)       /* w0[0] = CF from above */
        bfi     w0, w11, #X86_EFL_OF_BIT, #1
.endif

99:
        ret
        .cfi_endproc
.endm

ROL_8_16_32 iemAImpl_rol_u8,        8, 1, b
ROL_8_16_32 iemAImpl_rol_u8_intel,  8, 1, b
ROL_8_16_32 iemAImpl_rol_u8_amd,    8, 0, b

ROL_8_16_32 iemAImpl_rol_u16,       16, 1, h
ROL_8_16_32 iemAImpl_rol_u16_intel, 16, 1, h
ROL_8_16_32 iemAImpl_rol_u16_amd,   16, 0, h

ROL_8_16_32 iemAImpl_rol_u32,       32, 1,
ROL_8_16_32 iemAImpl_rol_u32_intel, 32, 1,
ROL_8_16_32 iemAImpl_rol_u32_amd,   32, 0,
772
/** @todo this is slightly slower than the C version (release) on an M2. Investigate why. */
/* uint32_t iemAImpl_rol_u64(uint32_t fEFlagsIn, uint64_t *pu64Dst, uint8_t cShift); */
/**
 * Emits one 64-bit ROL worker.
 *
 * @param   a_Name          The procedure name.
 * @param   a_fIntelFlags   Non-zero: OF from the first rotate step (Intel).
 *                          Zero: OF from the last rotate step (AMD).
 *
 * Generated code: w0 = EFLAGS in/out (only CF and OF are updated),
 * x1 = pointer to the operand, w2 = rotate count (masked with 0x3f; nothing
 * is touched if that is zero).  Clobbers w2, x3, x8, x9 and x11.
 */
.macro ROL_64, a_Name, a_fIntelFlags
ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
BEGINPROC_HIDDEN \a_Name
        .cfi_startproc

        /* Do we need to shift anything at all? */
        and     w2, w2, #0x3f
        cbz     w2, 99f

        /*
         * Do the shifting
         */
        neg     w3, w2                          /* rorv only looks at the low 6 bits of the count. */
        ldr     x8, [x1]
        rorv    x9, x8, x3                      /* rotating right by -n is rotating left by n. */
        str     x9, [x1]

        /*
         * Calculate EFLAGS - only CF and OF.
         */
        bfi     w0, w9, #0, #1                  /* CF = last bit rotated around */

.ifne \a_fIntelFlags
        /* Intel: OF = first rotate step: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
        eor     x11, x8, x8, LSL #1
        lsr     x11, x11, #(64 - 1)
        bfi     w0, w11, #X86_EFL_OF_BIT, #1
.else
        /* AMD: OF = last rotate step: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
        eor     x11, x0, x9, LSR #(64 - 1)      /* w0[0] = CF from above */
        bfi     w0, w11, #X86_EFL_OF_BIT, #1
.endif

99:
        ret
        .cfi_endproc
.endm

ROL_64 iemAImpl_rol_u64,        1
ROL_64 iemAImpl_rol_u64_intel,  1
ROL_64 iemAImpl_rol_u64_amd,    0
816

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format