1/* $Id: tstIEMAImpl.cpp 103050 2024-01-25 00:42:30Z vboxsync $ */
2/** @file
3 * IEM Assembly Instruction Helper Testcase.
4 */
5
6/*
7 * Copyright (C) 2022-2023 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28
29/*********************************************************************************************************************************
30* Header Files *
31*********************************************************************************************************************************/
32#include "../include/IEMInternal.h"
33
34#include <iprt/errcore.h>
35#include <VBox/log.h>
36#include <iprt/assert.h>
37#include <iprt/ctype.h>
38#include <iprt/err.h>
39#include <iprt/getopt.h>
40#include <iprt/initterm.h>
41#include <iprt/file.h>
42#include <iprt/mem.h>
43#include <iprt/message.h>
44#include <iprt/mp.h>
45#include <iprt/rand.h>
46#include <iprt/stream.h>
47#include <iprt/string.h>
48#include <iprt/test.h>
49#include <iprt/time.h>
50#include <iprt/thread.h>
51#include <iprt/vfs.h>
52#include <iprt/zip.h>
53#include <VBox/version.h>
54
55#include "tstIEMAImpl.h"
56
57
58/*********************************************************************************************************************************
59* Defined Constants And Macros *
60*********************************************************************************************************************************/
61#define ENTRY(a_Name) ENTRY_EX(a_Name, 0)
62#define ENTRY_EX(a_Name, a_uExtra) \
63 { RT_XSTR(a_Name), iemAImpl_ ## a_Name, NULL, \
64 g_aTests_ ## a_Name, &g_cTests_ ## a_Name, \
65 a_uExtra, IEMTARGETCPU_EFL_BEHAVIOR_NATIVE /* means same for all here */ }
66
67#define ENTRY_FIX(a_Name) ENTRY_FIX_EX(a_Name, 0)
68#ifdef TSTIEMAIMPL_WITH_GENERATOR
69# define ENTRY_FIX_EX(a_Name, a_uExtra) \
70 { RT_XSTR(a_Name), iemAImpl_ ## a_Name, NULL, \
71 g_aTests_ ## a_Name, &g_cTests_ ## a_Name, \
72 a_uExtra, IEMTARGETCPU_EFL_BEHAVIOR_NATIVE /* means same for all here */, \
73 false, false, RT_ELEMENTS(g_aFixedTests_ ## a_Name), g_aFixedTests_ ## a_Name }
74#else
75# define ENTRY_FIX_EX(a_Name, a_uExtra) ENTRY_EX(a_Name, a_uExtra)
76#endif
77
78#define ENTRY_PFN_CAST(a_Name, a_pfnType) ENTRY_PFN_CAST_EX(a_Name, a_pfnType, 0)
79#define ENTRY_PFN_CAST_EX(a_Name, a_pfnType, a_uExtra) \
80 { RT_XSTR(a_Name), (a_pfnType)iemAImpl_ ## a_Name, NULL, \
81 g_aTests_ ## a_Name, &g_cTests_ ## a_Name, \
82 a_uExtra, IEMTARGETCPU_EFL_BEHAVIOR_NATIVE /* means same for all here */ }
83
84#define ENTRY_BIN(a_Name) ENTRY_EX_BIN(a_Name, 0)
85#define ENTRY_EX_BIN(a_Name, a_uExtra) \
86 { RT_XSTR(a_Name), iemAImpl_ ## a_Name, NULL, \
87 g_aTests_ ## a_Name, &g_cbTests_ ## a_Name, \
88 a_uExtra, IEMTARGETCPU_EFL_BEHAVIOR_NATIVE /* means same for all here */, true /*fBinary*/, true /*fCompressed*/ }
89
90#define ENTRY_BIN_AVX(a_Name) ENTRY_BIN_AVX_EX(a_Name, 0)
91#ifndef IEM_WITHOUT_ASSEMBLY
92# define ENTRY_BIN_AVX_EX(a_Name, a_uExtra) \
93 { RT_XSTR(a_Name), iemAImpl_ ## a_Name, NULL, \
94 g_aTests_ ## a_Name, &g_cbTests_ ## a_Name, \
95 a_uExtra, IEMTARGETCPU_EFL_BEHAVIOR_NATIVE /* means same for all here */, true /*fBinary*/, true /*fCompressed*/ }
96#else
97# define ENTRY_BIN_AVX_EX(a_Name, a_uExtra) \
98 { RT_XSTR(a_Name), iemAImpl_ ## a_Name ## _fallback, NULL, \
99 g_aTests_ ## a_Name, &g_cbTests_ ## a_Name, \
100 a_uExtra, IEMTARGETCPU_EFL_BEHAVIOR_NATIVE /* means same for all here */, true /*fBinary*/, true /*fCompressed*/ }
101#endif
102
103#define ENTRY_BIN_SSE_OPT(a_Name) ENTRY_BIN_SSE_OPT_EX(a_Name, 0)
104#ifndef IEM_WITHOUT_ASSEMBLY
105# define ENTRY_BIN_SSE_OPT_EX(a_Name, a_uExtra) \
106 { RT_XSTR(a_Name), iemAImpl_ ## a_Name, NULL, \
107 g_aTests_ ## a_Name, &g_cbTests_ ## a_Name, \
108 a_uExtra, IEMTARGETCPU_EFL_BEHAVIOR_NATIVE /* means same for all here */, true /*fBinary*/, true /*fCompressed*/ }
109#else
110# define ENTRY_BIN_SSE_OPT_EX(a_Name, a_uExtra) \
111 { RT_XSTR(a_Name), iemAImpl_ ## a_Name ## _fallback, NULL, \
112 g_aTests_ ## a_Name, &g_cbTests_ ## a_Name, \
113 a_uExtra, IEMTARGETCPU_EFL_BEHAVIOR_NATIVE /* means same for all here */, true /*fBinary*/, true /*fCompressed*/ }
114#endif
115
116
117#define ENTRY_INTEL(a_Name, a_fEflUndef) ENTRY_INTEL_EX(a_Name, a_fEflUndef, 0)
118#define ENTRY_INTEL_EX(a_Name, a_fEflUndef, a_uExtra) \
119 { RT_XSTR(a_Name) "_intel", iemAImpl_ ## a_Name ## _intel, iemAImpl_ ## a_Name, \
120 g_aTests_ ## a_Name ## _intel, &g_cTests_ ## a_Name ## _intel, \
121 a_uExtra, IEMTARGETCPU_EFL_BEHAVIOR_INTEL }
122
123#define ENTRY_AMD(a_Name, a_fEflUndef) ENTRY_AMD_EX(a_Name, a_fEflUndef, 0)
124#define ENTRY_AMD_EX(a_Name, a_fEflUndef, a_uExtra) \
125 { RT_XSTR(a_Name) "_amd", iemAImpl_ ## a_Name ## _amd, iemAImpl_ ## a_Name, \
126 g_aTests_ ## a_Name ## _amd, &g_cTests_ ## a_Name ## _amd, \
127 a_uExtra, IEMTARGETCPU_EFL_BEHAVIOR_AMD }
128
129#define TYPEDEF_SUBTEST_TYPE(a_TypeName, a_TestType, a_FunctionPtrType) \
130 typedef struct a_TypeName \
131 { \
132 const char *pszName; \
133 const a_FunctionPtrType pfn; \
134 const a_FunctionPtrType pfnNative; \
135 a_TestType const *paTests; /**< These are updated when compressed tests are decompressed. */ \
136 uint32_t const *pcTests; /**< These are updated when compressed tests are decompressed. */ \
137 uint32_t const uExtra; \
138 uint8_t const idxCpuEflFlavour; \
139 bool const fBinary; \
140 bool fCompressed; /**< This is cleared after decompressing the tests. */ \
141 uint16_t const cFixedTests; \
142 a_TestType const * const paFixedTests; \
143 } a_TypeName
144
145#define COUNT_VARIATIONS(a_SubTest) \
146 (1 + ((a_SubTest).idxCpuEflFlavour == g_idxCpuEflFlavour && (a_SubTest).pfnNative) )
147
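/* For illustration only (not compiled): a plain ENTRY(add_u8) line in one of the
   subtest tables below expands to roughly this initializer, using the usual
   iemAImpl_xxx worker naming and the g_aTests_xxx / g_cTests_xxx data symbols:

       { "add_u8", iemAImpl_add_u8, NULL,
         g_aTests_add_u8, &g_cTests_add_u8,
         0, IEMTARGETCPU_EFL_BEHAVIOR_NATIVE }

   COUNT_VARIATIONS() then evaluates to 2 for entries that carry a pfnNative worker
   and whose EFLAGS flavour matches the host (ENTRY_INTEL / ENTRY_AMD), 1 otherwise. */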
148
149/*********************************************************************************************************************************
150* Structures and Typedefs *
151*********************************************************************************************************************************/
152typedef struct IEMBINARYOUTPUT
153{
154 /** The output file. */
155 RTVFSFILE hVfsFile;
156 /** The stream we write uncompressed binary test data to. */
157 RTVFSIOSTREAM hVfsUncompressed;
158 /** Write status. */
159 int rcWrite;
160 /** Set if the output is discarded (NULL filename format). */
161 bool fNull;
162 /** Filename. */
163 char szFilename[79];
164} IEMBINARYOUTPUT;
165typedef IEMBINARYOUTPUT *PIEMBINARYOUTPUT;
166
167
168/*********************************************************************************************************************************
169* Global Variables *
170*********************************************************************************************************************************/
171static RTTEST g_hTest;
172static uint8_t g_idxCpuEflFlavour = IEMTARGETCPU_EFL_BEHAVIOR_INTEL;
173#ifdef TSTIEMAIMPL_WITH_GENERATOR
174static uint32_t g_cZeroDstTests = 2;
175static uint32_t g_cZeroSrcTests = 4;
176#endif
177static uint8_t *g_pu8, *g_pu8Two;
178static uint16_t *g_pu16, *g_pu16Two;
179static uint32_t *g_pu32, *g_pu32Two, *g_pfEfl;
180static uint64_t *g_pu64, *g_pu64Two;
181static RTUINT128U *g_pu128, *g_pu128Two;
182
183static char g_aszBuf[32][256];
184static unsigned g_idxBuf = 0;
185
186static uint32_t g_cIncludeTestPatterns;
187static uint32_t g_cExcludeTestPatterns;
188static const char *g_apszIncludeTestPatterns[64];
189static const char *g_apszExcludeTestPatterns[64];
190
191/** Higher value means longer benchmarking. */
192static uint64_t g_cPicoSecBenchmark = 0;
193
194static unsigned g_cVerbosity = 0;
195
196
197/*********************************************************************************************************************************
198* Internal Functions *
199*********************************************************************************************************************************/
200static const char *FormatR80(PCRTFLOAT80U pr80);
201static const char *FormatR64(PCRTFLOAT64U pr64);
202static const char *FormatR32(PCRTFLOAT32U pr32);
203
204
205/*
206 * Random helpers.
207 */
208
209static uint32_t RandEFlags(void)
210{
211 uint32_t fEfl = RTRandU32();
212 return (fEfl & X86_EFL_LIVE_MASK) | X86_EFL_RA1_MASK;
213}
214
215#ifdef TSTIEMAIMPL_WITH_GENERATOR
216
217static uint8_t RandU8(void)
218{
219 return RTRandU32Ex(0, 0xff);
220}
221
222
223static uint16_t RandU16(void)
224{
225 return RTRandU32Ex(0, 0xffff);
226}
227
228
229static uint32_t RandU32(void)
230{
231 return RTRandU32();
232}
233
234#endif
235
236static uint64_t RandU64(void)
237{
238 return RTRandU64();
239}
240
241
242static RTUINT128U RandU128(void)
243{
244 RTUINT128U Ret;
245 Ret.s.Hi = RTRandU64();
246 Ret.s.Lo = RTRandU64();
247 return Ret;
248}
249
250#ifdef TSTIEMAIMPL_WITH_GENERATOR
251
252static uint8_t RandU8Dst(uint32_t iTest)
253{
254 if (iTest < g_cZeroDstTests)
255 return 0;
256 return RandU8();
257}
258
259
260static uint8_t RandU8Src(uint32_t iTest)
261{
262 if (iTest < g_cZeroSrcTests)
263 return 0;
264 return RandU8();
265}
266
267
268static uint16_t RandU16Dst(uint32_t iTest)
269{
270 if (iTest < g_cZeroDstTests)
271 return 0;
272 return RandU16();
273}
274
275
276static uint16_t RandU16Src(uint32_t iTest)
277{
278 if (iTest < g_cZeroSrcTests)
279 return 0;
280 return RandU16();
281}
282
283
284static uint32_t RandU32Dst(uint32_t iTest)
285{
286 if (iTest < g_cZeroDstTests)
287 return 0;
288 return RandU32();
289}
290
291
292static uint32_t RandU32Src(uint32_t iTest)
293{
294 if (iTest < g_cZeroSrcTests)
295 return 0;
296 return RandU32();
297}
298
299
300static uint64_t RandU64Dst(uint32_t iTest)
301{
302 if (iTest < g_cZeroDstTests)
303 return 0;
304 return RandU64();
305}
306
307
308static uint64_t RandU64Src(uint32_t iTest)
309{
310 if (iTest < g_cZeroSrcTests)
311 return 0;
312 return RandU64();
313}
314
315
316/** 2nd operand for an FPU instruction, pairing with RandR80Src1. */
317static int16_t RandI16Src2(uint32_t iTest)
318{
319 if (iTest < 18 * 4)
320 switch (iTest % 4)
321 {
322 case 0: return 0;
323 case 1: return INT16_MAX;
324 case 2: return INT16_MIN;
325 case 3: break;
326 }
327 return (int16_t)RandU16();
328}
329
330
331/** 2nd operand for an FPU instruction, pairing with RandR80Src1. */
332static int32_t RandI32Src2(uint32_t iTest)
333{
334 if (iTest < 18 * 4)
335 switch (iTest % 4)
336 {
337 case 0: return 0;
338 case 1: return INT32_MAX;
339 case 2: return INT32_MIN;
340 case 3: break;
341 }
342 return (int32_t)RandU32();
343}
344
345
346static int64_t RandI64Src(uint32_t iTest)
347{
348 RT_NOREF(iTest);
349 return (int64_t)RandU64();
350}
351
352
353static uint16_t RandFcw(void)
354{
355 return RandU16() & ~X86_FCW_ZERO_MASK;
356}
357
358
359static uint16_t RandFsw(void)
360{
361 AssertCompile((X86_FSW_C_MASK | X86_FSW_XCPT_ES_MASK | X86_FSW_TOP_MASK | X86_FSW_B) == 0xffff);
362 return RandU16();
363}
364
365
366static uint32_t RandMxcsr(void)
367{
368 return RandU32() & ~X86_MXCSR_ZERO_MASK;
369}
370
371
372static void SafeR80FractionShift(PRTFLOAT80U pr80, uint8_t cShift)
373{
374 if (pr80->sj64.uFraction >= RT_BIT_64(cShift))
375 pr80->sj64.uFraction >>= cShift;
376 else
377 pr80->sj64.uFraction = (cShift % 19) + 1;
378}
379
380
381
382static RTFLOAT80U RandR80Ex(uint8_t bType, unsigned cTarget = 80, bool fIntTarget = false)
383{
384 Assert(cTarget == (!fIntTarget ? 80U : 16U) || cTarget == 64U || cTarget == 32U || (cTarget == 59U && fIntTarget));
385
386 RTFLOAT80U r80;
387 r80.au64[0] = RandU64();
388 r80.au16[4] = RandU16();
389
390 /*
391 * Adjust the random stuff according to bType.
392 */
393 bType &= 0x1f;
394 if (bType == 0 || bType == 1 || bType == 2 || bType == 3)
395 {
396 /* Zero (0), Pseudo-Infinity (1), Infinity (2), Indefinite (3). We only keep fSign here. */
397 r80.sj64.uExponent = bType == 0 ? 0 : 0x7fff;
398 r80.sj64.uFraction = bType <= 2 ? 0 : RT_BIT_64(62);
399 r80.sj64.fInteger = bType >= 2 ? 1 : 0;
400 AssertMsg(bType != 0 || RTFLOAT80U_IS_ZERO(&r80), ("%s\n", FormatR80(&r80)));
401 AssertMsg(bType != 1 || RTFLOAT80U_IS_PSEUDO_INF(&r80), ("%s\n", FormatR80(&r80)));
402 Assert( bType != 1 || RTFLOAT80U_IS_387_INVALID(&r80));
403 AssertMsg(bType != 2 || RTFLOAT80U_IS_INF(&r80), ("%s\n", FormatR80(&r80)));
404 AssertMsg(bType != 3 || RTFLOAT80U_IS_INDEFINITE(&r80), ("%s\n", FormatR80(&r80)));
405 }
406 else if (bType == 4 || bType == 5 || bType == 6 || bType == 7)
407 {
408 /* Denormals (4,5) and Pseudo denormals (6,7) */
409 if (bType & 1)
410 SafeR80FractionShift(&r80, r80.sj64.uExponent % 62);
411 else if (r80.sj64.uFraction == 0 && bType < 6)
412 r80.sj64.uFraction = RTRandU64Ex(1, RT_BIT_64(RTFLOAT80U_FRACTION_BITS) - 1);
413 r80.sj64.uExponent = 0;
414 r80.sj64.fInteger = bType >= 6;
415 AssertMsg(bType >= 6 || RTFLOAT80U_IS_DENORMAL(&r80), ("%s bType=%#x\n", FormatR80(&r80), bType));
416 AssertMsg(bType < 6 || RTFLOAT80U_IS_PSEUDO_DENORMAL(&r80), ("%s bType=%#x\n", FormatR80(&r80), bType));
417 }
418 else if (bType == 8 || bType == 9)
419 {
420 /* Pseudo NaN. */
421 if (bType & 1)
422 SafeR80FractionShift(&r80, r80.sj64.uExponent % 62);
423 else if (r80.sj64.uFraction == 0 && !r80.sj64.fInteger)
424 r80.sj64.uFraction = RTRandU64Ex(1, RT_BIT_64(RTFLOAT80U_FRACTION_BITS) - 1);
425 r80.sj64.uExponent = 0x7fff;
426 if (r80.sj64.fInteger)
427 r80.sj64.uFraction |= RT_BIT_64(62);
428 else
429 r80.sj64.uFraction &= ~RT_BIT_64(62);
430 r80.sj64.fInteger = 0;
431 AssertMsg(RTFLOAT80U_IS_PSEUDO_NAN(&r80), ("%s bType=%#x\n", FormatR80(&r80), bType));
432 AssertMsg(RTFLOAT80U_IS_NAN(&r80), ("%s bType=%#x\n", FormatR80(&r80), bType));
433 Assert(RTFLOAT80U_IS_387_INVALID(&r80));
434 }
435 else if (bType == 10 || bType == 11 || bType == 12 || bType == 13)
436 {
437 /* Quiet and signalling NaNs. */
438 if (bType & 1)
439 SafeR80FractionShift(&r80, r80.sj64.uExponent % 62);
440 else if (r80.sj64.uFraction == 0)
441 r80.sj64.uFraction = RTRandU64Ex(1, RT_BIT_64(RTFLOAT80U_FRACTION_BITS) - 1);
442 r80.sj64.uExponent = 0x7fff;
443 if (bType < 12)
444 r80.sj64.uFraction |= RT_BIT_64(62); /* quiet */
445 else
446 r80.sj64.uFraction &= ~RT_BIT_64(62); /* signaling */
447 r80.sj64.fInteger = 1;
448 AssertMsg(bType >= 12 || RTFLOAT80U_IS_QUIET_NAN(&r80), ("%s\n", FormatR80(&r80)));
449 AssertMsg(bType < 12 || RTFLOAT80U_IS_SIGNALLING_NAN(&r80), ("%s\n", FormatR80(&r80)));
450 AssertMsg(RTFLOAT80U_IS_SIGNALLING_NAN(&r80) || RTFLOAT80U_IS_QUIET_NAN(&r80), ("%s\n", FormatR80(&r80)));
451 AssertMsg(RTFLOAT80U_IS_QUIET_OR_SIGNALLING_NAN(&r80), ("%s\n", FormatR80(&r80)));
452 AssertMsg(RTFLOAT80U_IS_NAN(&r80), ("%s\n", FormatR80(&r80)));
453 }
454 else if (bType == 14 || bType == 15)
455 {
456 /* Unnormals */
457 if (bType & 1)
458 SafeR80FractionShift(&r80, RandU8() % 62);
459 r80.sj64.fInteger = 0;
460 if (r80.sj64.uExponent == RTFLOAT80U_EXP_MAX || r80.sj64.uExponent == 0)
461 r80.sj64.uExponent = (uint16_t)RTRandU32Ex(1, RTFLOAT80U_EXP_MAX - 1);
462 AssertMsg(RTFLOAT80U_IS_UNNORMAL(&r80), ("%s\n", FormatR80(&r80)));
463 Assert(RTFLOAT80U_IS_387_INVALID(&r80));
464 }
465 else if (bType < 26)
466 {
467 /* Make sure we have lots of normalized values. */
468 if (!fIntTarget)
469 {
470 const unsigned uMinExp = cTarget == 64 ? RTFLOAT80U_EXP_BIAS - RTFLOAT64U_EXP_BIAS
471 : cTarget == 32 ? RTFLOAT80U_EXP_BIAS - RTFLOAT32U_EXP_BIAS : 0;
472 const unsigned uMaxExp = cTarget == 64 ? uMinExp + RTFLOAT64U_EXP_MAX
473 : cTarget == 32 ? uMinExp + RTFLOAT32U_EXP_MAX : RTFLOAT80U_EXP_MAX;
474 r80.sj64.fInteger = 1;
475 if (r80.sj64.uExponent <= uMinExp)
476 r80.sj64.uExponent = uMinExp + 1;
477 else if (r80.sj64.uExponent >= uMaxExp)
478 r80.sj64.uExponent = uMaxExp - 1;
479
480 if (bType == 16)
481 { /* All 1s is useful for testing rounding. Also try to trigger special
482 behaviour by sometimes rounding out of range, while we're at it. */
483 r80.sj64.uFraction = RT_BIT_64(63) - 1;
484 uint8_t bExp = RandU8();
485 if ((bExp & 3) == 0)
486 r80.sj64.uExponent = uMaxExp - 1;
487 else if ((bExp & 3) == 1)
488 r80.sj64.uExponent = uMinExp + 1;
489 else if ((bExp & 3) == 2)
490 r80.sj64.uExponent = uMinExp - (bExp & 15); /* (small numbers are mapped to subnormal values) */
491 }
492 }
493 else
494 {
495 /* integer target: */
496 const unsigned uMinExp = RTFLOAT80U_EXP_BIAS;
497 const unsigned uMaxExp = RTFLOAT80U_EXP_BIAS + cTarget - 2;
498 r80.sj64.fInteger = 1;
499 if (r80.sj64.uExponent < uMinExp)
500 r80.sj64.uExponent = uMinExp;
501 else if (r80.sj64.uExponent > uMaxExp)
502 r80.sj64.uExponent = uMaxExp;
503
504 if (bType == 16)
505 { /* All 1s is useful for testing rounding. Also try to trigger special
506 behaviour by sometimes rounding out of range, while we're at it. */
507 r80.sj64.uFraction = RT_BIT_64(63) - 1;
508 uint8_t bExp = RandU8();
509 if ((bExp & 3) == 0)
510 r80.sj64.uExponent = uMaxExp;
511 else if ((bExp & 3) == 1)
512 r80.sj64.uFraction &= ~(RT_BIT_64(cTarget - 1 - r80.sj64.uExponent) - 1); /* no rounding */
513 }
514 }
515
516 AssertMsg(RTFLOAT80U_IS_NORMAL(&r80), ("%s\n", FormatR80(&r80)));
517 }
518 return r80;
519}
520
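/* Quick reference for the bType encoding handled above (derived from the code,
   kept here for readability):
       0      zero                   10/11  quiet NaN
       1      pseudo-infinity        12/13  signalling NaN
       2      infinity               14/15  unnormal
       3      indefinite             16     normal, all-ones fraction (rounding)
       4/5    denormal               17..25 normal
       6/7    pseudo-denormal        26..31 raw random bits (any class)
       8/9    pseudo NaN
   The odd value in each special pair shifts the random fraction with
   SafeR80FractionShift instead of forcing a non-zero fraction. */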
521
522static RTFLOAT80U RandR80(unsigned cTarget = 80, bool fIntTarget = false)
523{
524 /*
525 * Make it more likely that we get a good selection of special values.
526 */
527 return RandR80Ex(RandU8(), cTarget, fIntTarget);
528
529}
530
531
532static RTFLOAT80U RandR80Src(uint32_t iTest, unsigned cTarget = 80, bool fIntTarget = false)
533{
534 /* Make sure we cover all the basic types first before going for random selection: */
535 if (iTest <= 18)
536 return RandR80Ex(18 - iTest, cTarget, fIntTarget); /* Starting with 3 normals. */
537 return RandR80(cTarget, fIntTarget);
538}
539
540
541/**
542 * Helper for RandR80Src1 and RandR80Src2 that remaps bType from a 0..11 range
543 * onto the 0..18 values used by RandR80Ex, covering all basic value types.
544 */
545static uint8_t RandR80Src12RemapType(uint8_t bType)
546{
547 switch (bType)
548 {
549 case 0: return 18; /* normal */
550 case 1: return 16; /* normal extreme rounding */
551 case 2: return 14; /* unnormal */
552 case 3: return 12; /* Signalling NaN */
553 case 4: return 10; /* Quiet NaN */
554 case 5: return 8; /* PseudoNaN */
555 case 6: return 6; /* Pseudo Denormal */
556 case 7: return 4; /* Denormal */
557 case 8: return 3; /* Indefinite */
558 case 9: return 2; /* Infinity */
559 case 10: return 1; /* Pseudo-Infinity */
560 case 11: return 0; /* Zero */
561 default: AssertFailedReturn(18);
562 }
563}
564
565
566/**
567 * This works in tandem with RandR80Src2 to make sure we cover all operand
568 * type mixes first before we venture into regular random testing.
569 *
570 * There are 11 basic variations when we leave out the five odd ones using
571 * SafeR80FractionShift. Because of the special normalized value targeting
572 * rounding, we make it an even 12. So 144 combinations for two operands.
573 */
574static RTFLOAT80U RandR80Src1(uint32_t iTest, unsigned cPartnerBits = 80, bool fPartnerInt = false)
575{
576 if (cPartnerBits == 80)
577 {
578 Assert(!fPartnerInt);
579 if (iTest < 12 * 12)
580 return RandR80Ex(RandR80Src12RemapType(iTest / 12));
581 }
582 else if ((cPartnerBits == 64 || cPartnerBits == 32) && !fPartnerInt)
583 {
584 if (iTest < 12 * 10)
585 return RandR80Ex(RandR80Src12RemapType(iTest / 10));
586 }
587 else if (iTest < 18 * 4 && fPartnerInt)
588 return RandR80Ex(iTest / 4);
589 return RandR80();
590}
591
592
593/** Partner to RandR80Src1. */
594static RTFLOAT80U RandR80Src2(uint32_t iTest)
595{
596 if (iTest < 12 * 12)
597 return RandR80Ex(RandR80Src12RemapType(iTest % 12));
598 return RandR80();
599}
600
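/* Worked example of the Src1/Src2 pairing for two 80-bit operands (illustrative):
   for iTest = 0..143, Src1 uses RandR80Src12RemapType(iTest / 12) and Src2 uses
   RandR80Src12RemapType(iTest % 12); e.g. iTest = 38 gives 38 / 12 = 3 (signalling
   NaN) paired with 38 % 12 = 2 (unnormal).  Each of the 12 x 12 type combinations
   is thus produced exactly once before both helpers fall back to plain RandR80(). */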
601
602static void SafeR64FractionShift(PRTFLOAT64U pr64, uint8_t cShift)
603{
604 if (pr64->s64.uFraction >= RT_BIT_64(cShift))
605 pr64->s64.uFraction >>= cShift;
606 else
607 pr64->s64.uFraction = (cShift % 19) + 1;
608}
609
610
611static RTFLOAT64U RandR64Ex(uint8_t bType)
612{
613 RTFLOAT64U r64;
614 r64.u = RandU64();
615
616 /*
617 * Make it more likely that we get a good selection of special values.
618 * On average 6 out of 16 calls should return a special value.
619 */
620 bType &= 0xf;
621 if (bType == 0 || bType == 1)
622 {
623 /* 0 or Infinity. We only keep fSign here. */
624 r64.s.uExponent = bType == 0 ? 0 : 0x7ff;
625 r64.s.uFractionHigh = 0;
626 r64.s.uFractionLow = 0;
627 AssertMsg(bType != 0 || RTFLOAT64U_IS_ZERO(&r64), ("%s bType=%#x\n", FormatR64(&r64), bType));
628 AssertMsg(bType != 1 || RTFLOAT64U_IS_INF(&r64), ("%s bType=%#x\n", FormatR64(&r64), bType));
629 }
630 else if (bType == 2 || bType == 3)
631 {
632 /* Subnormals */
633 if (bType == 3)
634 SafeR64FractionShift(&r64, r64.s64.uExponent % 51);
635 else if (r64.s64.uFraction == 0)
636 r64.s64.uFraction = RTRandU64Ex(1, RT_BIT_64(RTFLOAT64U_FRACTION_BITS) - 1);
637 r64.s64.uExponent = 0;
638 AssertMsg(RTFLOAT64U_IS_SUBNORMAL(&r64), ("%s bType=%#x\n", FormatR64(&r64), bType));
639 }
640 else if (bType == 4 || bType == 5 || bType == 6 || bType == 7)
641 {
642 /* NaNs */
643 if (bType & 1)
644 SafeR64FractionShift(&r64, r64.s64.uExponent % 51);
645 else if (r64.s64.uFraction == 0)
646 r64.s64.uFraction = RTRandU64Ex(1, RT_BIT_64(RTFLOAT64U_FRACTION_BITS) - 1);
647 r64.s64.uExponent = 0x7ff;
648 if (bType < 6)
649 r64.s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1); /* quiet */
650 else
651 r64.s64.uFraction &= ~RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1); /* signalling */
652 AssertMsg(bType >= 6 || RTFLOAT64U_IS_QUIET_NAN(&r64), ("%s bType=%#x\n", FormatR64(&r64), bType));
653 AssertMsg(bType < 6 || RTFLOAT64U_IS_SIGNALLING_NAN(&r64), ("%s bType=%#x\n", FormatR64(&r64), bType));
654 AssertMsg(RTFLOAT64U_IS_NAN(&r64), ("%s bType=%#x\n", FormatR64(&r64), bType));
655 }
656 else if (bType < 12)
657 {
658 /* Make sure we have lots of normalized values. */
659 if (r64.s.uExponent == 0)
660 r64.s.uExponent = 1;
661 else if (r64.s.uExponent == 0x7ff)
662 r64.s.uExponent = 0x7fe;
663 AssertMsg(RTFLOAT64U_IS_NORMAL(&r64), ("%s bType=%#x\n", FormatR64(&r64), bType));
664 }
665 return r64;
666}
667
668
669static RTFLOAT64U RandR64Src(uint32_t iTest)
670{
671 if (iTest < 16)
672 return RandR64Ex(iTest);
673 return RandR64Ex(RandU8());
674}
675
676
677/** Pairing with an 80-bit floating point arg. */
678static RTFLOAT64U RandR64Src2(uint32_t iTest)
679{
680 if (iTest < 12 * 10)
681 return RandR64Ex(9 - iTest % 10); /* start with normal values */
682 return RandR64Ex(RandU8());
683}
684
685
686static void SafeR32FractionShift(PRTFLOAT32U pr32, uint8_t cShift)
687{
688 if (pr32->s.uFraction >= RT_BIT_32(cShift))
689 pr32->s.uFraction >>= cShift;
690 else
691 pr32->s.uFraction = (cShift % 19) + 1;
692}
693
694
695static RTFLOAT32U RandR32Ex(uint8_t bType)
696{
697 RTFLOAT32U r32;
698 r32.u = RandU32();
699
700 /*
701 * Make it more likely that we get a good selection of special values.
702 * On average 6 out of 16 calls should return a special value.
703 */
704 bType &= 0xf;
705 if (bType == 0 || bType == 1)
706 {
707 /* 0 or Infinity. We only keep fSign here. */
708 r32.s.uExponent = bType == 0 ? 0 : 0xff;
709 r32.s.uFraction = 0;
710 AssertMsg(bType != 0 || RTFLOAT32U_IS_ZERO(&r32), ("%s\n", FormatR32(&r32)));
711 AssertMsg(bType != 1 || RTFLOAT32U_IS_INF(&r32), ("%s\n", FormatR32(&r32)));
712 }
713 else if (bType == 2 || bType == 3)
714 {
715 /* Subnormals */
716 if (bType == 3)
717 SafeR32FractionShift(&r32, r32.s.uExponent % 22);
718 else if (r32.s.uFraction == 0)
719 r32.s.uFraction = RTRandU32Ex(1, RT_BIT_32(RTFLOAT32U_FRACTION_BITS) - 1);
720 r32.s.uExponent = 0;
721 AssertMsg(RTFLOAT32U_IS_SUBNORMAL(&r32), ("%s bType=%#x\n", FormatR32(&r32), bType));
722 }
723 else if (bType == 4 || bType == 5 || bType == 6 || bType == 7)
724 {
725 /* NaNs */
726 if (bType & 1)
727 SafeR32FractionShift(&r32, r32.s.uExponent % 22);
728 else if (r32.s.uFraction == 0)
729 r32.s.uFraction = RTRandU32Ex(1, RT_BIT_32(RTFLOAT32U_FRACTION_BITS) - 1);
730 r32.s.uExponent = 0xff;
731 if (bType < 6)
732 r32.s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1); /* quiet */
733 else
734 r32.s.uFraction &= ~RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1); /* signalling */
735 AssertMsg(bType >= 6 || RTFLOAT32U_IS_QUIET_NAN(&r32), ("%s bType=%#x\n", FormatR32(&r32), bType));
736 AssertMsg(bType < 6 || RTFLOAT32U_IS_SIGNALLING_NAN(&r32), ("%s bType=%#x\n", FormatR32(&r32), bType));
737 AssertMsg(RTFLOAT32U_IS_NAN(&r32), ("%s bType=%#x\n", FormatR32(&r32), bType));
738 }
739 else if (bType < 12)
740 {
741 /* Make sure we have lots of normalized values. */
742 if (r32.s.uExponent == 0)
743 r32.s.uExponent = 1;
744 else if (r32.s.uExponent == 0xff)
745 r32.s.uExponent = 0xfe;
746 AssertMsg(RTFLOAT32U_IS_NORMAL(&r32), ("%s bType=%#x\n", FormatR32(&r32), bType));
747 }
748 return r32;
749}
750
751
752static RTFLOAT32U RandR32Src(uint32_t iTest)
753{
754 if (iTest < 16)
755 return RandR32Ex(iTest);
756 return RandR32Ex(RandU8());
757}
758
759
760/** Pairing with an 80-bit floating point arg. */
761static RTFLOAT32U RandR32Src2(uint32_t iTest)
762{
763 if (iTest < 12 * 10)
764 return RandR32Ex(9 - iTest % 10); /* start with normal values */
765 return RandR32Ex(RandU8());
766}
767
768
769static RTPBCD80U RandD80Src(uint32_t iTest)
770{
771 if (iTest < 3)
772 {
773 RTPBCD80U d80Zero = RTPBCD80U_INIT_ZERO(!(iTest & 1));
774 return d80Zero;
775 }
776 if (iTest < 5)
777 {
778 RTPBCD80U d80Ind = RTPBCD80U_INIT_INDEFINITE();
779 return d80Ind;
780 }
781
782 RTPBCD80U d80;
783 uint8_t b = RandU8();
784 d80.s.fSign = b & 1;
785
786 if ((iTest & 7) >= 6)
787 {
788 /* Illegal */
789 d80.s.uPad = (iTest & 7) == 7 ? b >> 1 : 0;
790 for (size_t iPair = 0; iPair < RT_ELEMENTS(d80.s.abPairs); iPair++)
791 d80.s.abPairs[iPair] = RandU8();
792 }
793 else
794 {
795 /* Normal */
796 d80.s.uPad = 0;
797 for (size_t iPair = 0; iPair < RT_ELEMENTS(d80.s.abPairs); iPair++)
798 {
799 uint8_t const uLo = (uint8_t)RTRandU32Ex(0, 9);
800 uint8_t const uHi = (uint8_t)RTRandU32Ex(0, 9);
801 d80.s.abPairs[iPair] = RTPBCD80U_MAKE_PAIR(uHi, uLo);
802 }
803 }
804 return d80;
805}
806
807
808static const char *GenFormatR80(PCRTFLOAT80U plrd)
809{
810 if (RTFLOAT80U_IS_ZERO(plrd))
811 return plrd->s.fSign ? "RTFLOAT80U_INIT_ZERO(1)" : "RTFLOAT80U_INIT_ZERO(0)";
812 if (RTFLOAT80U_IS_INF(plrd))
813 return plrd->s.fSign ? "RTFLOAT80U_INIT_INF(1)" : "RTFLOAT80U_INIT_INF(0)";
814 if (RTFLOAT80U_IS_INDEFINITE(plrd))
815 return plrd->s.fSign ? "RTFLOAT80U_INIT_IND(1)" : "RTFLOAT80U_INIT_IND(0)";
816 if (RTFLOAT80U_IS_QUIET_NAN(plrd) && (plrd->s.uMantissa & (RT_BIT_64(62) - 1)) == 1)
817 return plrd->s.fSign ? "RTFLOAT80U_INIT_QNAN(1)" : "RTFLOAT80U_INIT_QNAN(0)";
818 if (RTFLOAT80U_IS_SIGNALLING_NAN(plrd) && (plrd->s.uMantissa & (RT_BIT_64(62) - 1)) == 1)
819 return plrd->s.fSign ? "RTFLOAT80U_INIT_SNAN(1)" : "RTFLOAT80U_INIT_SNAN(0)";
820
821 char *pszBuf = g_aszBuf[g_idxBuf++ % RT_ELEMENTS(g_aszBuf)];
822 RTStrPrintf(pszBuf, sizeof(g_aszBuf[0]), "RTFLOAT80U_INIT_C(%d,%#RX64,%u)",
823 plrd->s.fSign, plrd->s.uMantissa, plrd->s.uExponent);
824 return pszBuf;
825}
826
827static const char *GenFormatR64(PCRTFLOAT64U prd)
828{
829 char *pszBuf = g_aszBuf[g_idxBuf++ % RT_ELEMENTS(g_aszBuf)];
830 RTStrPrintf(pszBuf, sizeof(g_aszBuf[0]), "RTFLOAT64U_INIT_C(%d,%#RX64,%u)",
831 prd->s.fSign, RT_MAKE_U64(prd->s.uFractionLow, prd->s.uFractionHigh), prd->s.uExponent);
832 return pszBuf;
833}
834
835
836static const char *GenFormatR32(PCRTFLOAT32U pr)
837{
838 char *pszBuf = g_aszBuf[g_idxBuf++ % RT_ELEMENTS(g_aszBuf)];
839 RTStrPrintf(pszBuf, sizeof(g_aszBuf[0]), "RTFLOAT32U_INIT_C(%d,%#RX32,%u)", pr->s.fSign, pr->s.uFraction, pr->s.uExponent);
840 return pszBuf;
841}
842
843
844static const char *GenFormatD80(PCRTPBCD80U pd80)
845{
846 char *pszBuf = g_aszBuf[g_idxBuf++ % RT_ELEMENTS(g_aszBuf)];
847 size_t off;
848 if (pd80->s.uPad == 0)
849 off = RTStrPrintf(pszBuf, sizeof(g_aszBuf[0]), "RTPBCD80U_INIT_C(%d", pd80->s.fSign);
850 else
851 off = RTStrPrintf(pszBuf, sizeof(g_aszBuf[0]), "RTPBCD80U_INIT_EX_C(%#x,%d", pd80->s.uPad, pd80->s.fSign);
852 size_t iPair = RT_ELEMENTS(pd80->s.abPairs);
853 while (iPair-- > 0)
854 off += RTStrPrintf(&pszBuf[off], sizeof(g_aszBuf[0]) - off, ",%d,%d",
855 RTPBCD80U_HI_DIGIT(pd80->s.abPairs[iPair]),
856 RTPBCD80U_LO_DIGIT(pd80->s.abPairs[iPair]));
857 pszBuf[off++] = ')';
858 pszBuf[off++] = '\0';
859 return pszBuf;
860}
861
862
863static const char *GenFormatI64(int64_t i64)
864{
865 if (i64 == INT64_MIN) /* This one is problematic */
866 return "INT64_MIN";
867 if (i64 == INT64_MAX)
868 return "INT64_MAX";
869 char *pszBuf = g_aszBuf[g_idxBuf++ % RT_ELEMENTS(g_aszBuf)];
870 RTStrPrintf(pszBuf, sizeof(g_aszBuf[0]), "INT64_C(%RI64)", i64);
871 return pszBuf;
872}
873
874#if 0 /* unused */
875static const char *GenFormatI64(int64_t const *pi64)
876{
877 return GenFormatI64(*pi64);
878}
879#endif
880
881static const char *GenFormatI32(int32_t i32)
882{
883 if (i32 == INT32_MIN) /* This one is problematic */
884 return "INT32_MIN";
885 if (i32 == INT32_MAX)
886 return "INT32_MAX";
887 char *pszBuf = g_aszBuf[g_idxBuf++ % RT_ELEMENTS(g_aszBuf)];
888 RTStrPrintf(pszBuf, sizeof(g_aszBuf[0]), "INT32_C(%RI32)", i32);
889 return pszBuf;
890}
891
892
893const char *GenFormatI32(int32_t const *pi32)
894{
895 return GenFormatI32(*pi32);
896}
897
898
899const char *GenFormatI16(int16_t i16)
900{
901 if (i16 == INT16_MIN) /* This one is problematic */
902 return "INT16_MIN";
903 if (i16 == INT16_MAX)
904 return "INT16_MAX";
905 char *pszBuf = g_aszBuf[g_idxBuf++ % RT_ELEMENTS(g_aszBuf)];
906 RTStrPrintf(pszBuf, sizeof(g_aszBuf[0]), "INT16_C(%RI16)", i16);
907 return pszBuf;
908}
909
910
911const char *GenFormatI16(int16_t const *pi16)
912{
913 return GenFormatI16(*pi16);
914}
915
916
917static void GenerateHeader(PRTSTREAM pOut, const char *pszCpuDesc, const char *pszCpuType)
918{
919 /* We want to tag the generated source code with the revision that produced it. */
920 static char s_szRev[] = "$Revision: 103050 $";
921 const char *pszRev = RTStrStripL(strchr(s_szRev, ':') + 1);
922 size_t cchRev = 0;
923 while (RT_C_IS_DIGIT(pszRev[cchRev]))
924 cchRev++;
925
926 RTStrmPrintf(pOut,
927 "/* $Id: tstIEMAImpl.cpp 103050 2024-01-25 00:42:30Z vboxsync $ */\n"
928 "/** @file\n"
929 " * IEM Assembly Instruction Helper Testcase Data%s%s - r%.*s on %s.\n"
930 " */\n"
931 "\n"
932 "/*\n"
933 " * Copyright (C) 2022-" VBOX_C_YEAR " Oracle and/or its affiliates.\n"
934 " *\n"
935 " * This file is part of VirtualBox base platform packages, as\n"
936 " * available from https://www.virtualbox.org.\n"
937 " *\n"
938 " * This program is free software; you can redistribute it and/or\n"
939 " * modify it under the terms of the GNU General Public License\n"
940 " * as published by the Free Software Foundation, in version 3 of the\n"
941 " * License.\n"
942 " *\n"
943 " * This program is distributed in the hope that it will be useful, but\n"
944 " * WITHOUT ANY WARRANTY; without even the implied warranty of\n"
945 " * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU\n"
946 " * General Public License for more details.\n"
947 " *\n"
948 " * You should have received a copy of the GNU General Public License\n"
949 " * along with this program; if not, see <https://www.gnu.org/licenses>.\n"
950 " *\n"
951 " * SPDX-License-Identifier: GPL-3.0-only\n"
952 " */\n"
953 "\n"
954 "#include \"tstIEMAImpl.h\"\n"
955 "\n"
956 ,
957 pszCpuType ? " " : "", pszCpuType ? pszCpuType : "", cchRev, pszRev, pszCpuDesc);
958}
959
960
961static PRTSTREAM GenerateOpenWithHdr(const char *pszFilename, const char *pszCpuDesc, const char *pszCpuType)
962{
963 PRTSTREAM pOut = NULL;
964 int rc = RTStrmOpen(pszFilename, "w", &pOut);
965 if (RT_SUCCESS(rc))
966 {
967 GenerateHeader(pOut, pszCpuDesc, pszCpuType);
968 return pOut;
969 }
970 RTMsgError("Failed to open %s for writing: %Rrc", pszFilename, rc);
971 return NULL;
972}
973
974
975static RTEXITCODE GenerateFooterAndClose(PRTSTREAM pOut, const char *pszFilename, RTEXITCODE rcExit)
976{
977 RTStrmPrintf(pOut,
978 "\n"
979 "/* end of file */\n");
980 int rc = RTStrmClose(pOut);
981 if (RT_SUCCESS(rc))
982 return rcExit;
983 return RTMsgErrorExitFailure("RTStrmClose failed on %s: %Rrc", pszFilename, rc);
984}
985
986
987static void GenerateArrayStart(PRTSTREAM pOut, const char *pszName, const char *pszType)
988{
989 RTStrmPrintf(pOut, "%s const g_aTests_%s[] =\n{\n", pszType, pszName);
990}
991
992
993static void GenerateArrayEnd(PRTSTREAM pOut, const char *pszName)
994{
995 RTStrmPrintf(pOut,
996 "};\n"
997 "uint32_t const g_cTests_%s = RT_ELEMENTS(g_aTests_%s);\n"
998 "\n",
999 pszName, pszName);
1000}
1001
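/* A minimal sketch of how the C++-source output helpers above are meant to be
   strung together (illustrative only; the file name and array/type names are
   made up for this example): */
#if 0
{
    PRTSTREAM pOut = GenerateOpenWithHdr("tstIEMAImplDataExample.cpp", "host cpu", NULL);
    if (!pOut)
        return RTEXITCODE_FAILURE;
    GenerateArrayStart(pOut, "example_u16", "BINU16_TEST_T");
    /* ... one "{ ... }," initializer line per generated test goes here ... */
    GenerateArrayEnd(pOut, "example_u16");
    return GenerateFooterAndClose(pOut, "tstIEMAImplDataExample.cpp", RTEXITCODE_SUCCESS);
}
#endif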
1002
1003static bool GenerateBinaryOpen(PIEMBINARYOUTPUT pBinOut, const char *pszFilenameFmt, const char *pszName)
1004{
1005 pBinOut->hVfsFile = NIL_RTVFSFILE;
1006 pBinOut->hVfsUncompressed = NIL_RTVFSIOSTREAM;
1007 if (pszFilenameFmt)
1008 {
1009 pBinOut->fNull = false;
1010 if (RTStrPrintf2(pBinOut->szFilename, sizeof(pBinOut->szFilename), pszFilenameFmt, pszName) > 0)
1011 {
1012 RTMsgInfo("GenerateBinaryOpen: %s...\n", pBinOut->szFilename);
1013 pBinOut->rcWrite = RTVfsFileOpenNormal(pBinOut->szFilename,
1014 RTFILE_O_CREATE_REPLACE | RTFILE_O_WRITE | RTFILE_O_DENY_READWRITE,
1015 &pBinOut->hVfsFile);
1016 if (RT_SUCCESS(pBinOut->rcWrite))
1017 {
1018 RTVFSIOSTREAM hVfsIoFile = RTVfsFileToIoStream(pBinOut->hVfsFile);
1019 if (hVfsIoFile != NIL_RTVFSIOSTREAM)
1020 {
1021 pBinOut->rcWrite = RTZipGzipCompressIoStream(hVfsIoFile, 0 /*fFlags*/, 9, &pBinOut->hVfsUncompressed);
1022 RTVfsIoStrmRelease(hVfsIoFile);
1023 if (RT_SUCCESS(pBinOut->rcWrite))
1024 {
1025 pBinOut->rcWrite = VINF_SUCCESS;
1026 return true;
1027 }
1028
1029 RTMsgError("RTZipGzipCompressIoStream: %Rrc", pBinOut->rcWrite);
1030 }
1031 else
1032 {
1033 RTMsgError("RTVfsFileToIoStream failed!");
1034 pBinOut->rcWrite = VERR_VFS_CHAIN_CAST_FAILED;
1035 }
1036 RTVfsFileRelease(pBinOut->hVfsFile);
1037 RTFileDelete(pBinOut->szFilename);
1038 }
1039 else
1040 RTMsgError("Failed to open '%s' for writing: %Rrc", pBinOut->szFilename, pBinOut->rcWrite);
1041 }
1042 else
1043 {
1044 RTMsgError("filename too long: %s + %s", pszFilenameFmt, pszName);
1045 pBinOut->rcWrite = VERR_BUFFER_OVERFLOW;
1046 }
1047 return false;
1048 }
1049 RTMsgInfo("GenerateBinaryOpen: %s -> /dev/null\n", pszName);
1050 pBinOut->rcWrite = VERR_IGNORED;
1051 pBinOut->fNull = true;
1052 pBinOut->szFilename[0] = '\0';
1053 return true;
1054}
1055
1056
1057static void GenerateBinaryWrite(PIEMBINARYOUTPUT pBinOut, const void *pvData, size_t cbData)
1058{
1059 if (RT_SUCCESS_NP(pBinOut->rcWrite))
1060 {
1061 pBinOut->rcWrite = RTVfsIoStrmWrite(pBinOut->hVfsUncompressed, pvData, cbData, true /*fBlocking*/, NULL);
1062 if (RT_SUCCESS(pBinOut->rcWrite))
1063 return;
1064 RTMsgError("Error writing '%s': %Rrc", pBinOut->szFilename, pBinOut->rcWrite);
1065 }
1066}
1067
1068
1069static bool GenerateBinaryClose(PIEMBINARYOUTPUT pBinOut)
1070{
1071 if (!pBinOut->fNull)
1072 {
1073 /* This is rather jovial about rcWrite. */
1074 int const rc1 = RTVfsIoStrmFlush(pBinOut->hVfsUncompressed);
1075 RTVfsIoStrmRelease(pBinOut->hVfsUncompressed);
1076 pBinOut->hVfsUncompressed = NIL_RTVFSIOSTREAM;
1077 if (RT_FAILURE(rc1))
1078 RTMsgError("Error flushing '%s' (uncompressed stream): %Rrc", pBinOut->szFilename, rc1);
1079
1080 int const rc2 = RTVfsFileFlush(pBinOut->hVfsFile);
1081 RTVfsFileRelease(pBinOut->hVfsFile);
1082 pBinOut->hVfsFile = NIL_RTVFSFILE;
1083 if (RT_FAILURE(rc2))
1084 RTMsgError("Error flushing '%s' (compressed file): %Rrc", pBinOut->szFilename, rc2);
1085
1086 return RT_SUCCESS(rc2) && RT_SUCCESS(rc1) && RT_SUCCESS(pBinOut->rcWrite);
1087 }
1088 return true;
1089}
1090
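/* The matching sketch for the compressed binary output path, mirroring what the
   GEN_BINARY_TESTS macro further down does (illustrative only; the format string
   and test record are made up for this example): */
#if 0
{
    IEMBINARYOUTPUT BinOut;
    if (!GenerateBinaryOpen(&BinOut, "tstIEMAImplDataExample-%s.bin.gz", "add_u16"))
        return RTEXITCODE_FAILURE;
    BINU16_TEST_T Test = { 0 }; /* fill in a real test record here */
    GenerateBinaryWrite(&BinOut, &Test, sizeof(Test));
    if (!GenerateBinaryClose(&BinOut))
        return RTEXITCODE_FAILURE;
}
#endif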
1091
1092#endif /* TSTIEMAIMPL_WITH_GENERATOR */
1093
1094
1095/*
1096 * Test helpers.
1097 */
1098static bool IsTestEnabled(const char *pszName)
1099{
1100 /* Process excludes first: */
1101 uint32_t i = g_cExcludeTestPatterns;
1102 while (i-- > 0)
1103 if (RTStrSimplePatternMultiMatch(g_apszExcludeTestPatterns[i], RTSTR_MAX, pszName, RTSTR_MAX, NULL))
1104 return false;
1105
1106 /* If no include patterns, everything is included: */
1107 i = g_cIncludeTestPatterns;
1108 if (!i)
1109 return true;
1110
1111 /* Otherwise only tests matching the include patterns get tested: */
1112 while (i-- > 0)
1113 if (RTStrSimplePatternMultiMatch(g_apszIncludeTestPatterns[i], RTSTR_MAX, pszName, RTSTR_MAX, NULL))
1114 return true;
1115
1116 return false;
1117}
1118
1119
1120static bool SubTestAndCheckIfEnabled(const char *pszName)
1121{
1122 RTTestSub(g_hTest, pszName);
1123 if (IsTestEnabled(pszName))
1124 return true;
1125 RTTestSkipped(g_hTest, g_cVerbosity > 0 ? "excluded" : NULL);
1126 return false;
1127}
1128
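/* Illustrative pattern semantics (based on RTStrSimplePatternMultiMatch): each
   include/exclude entry is a simple '*'/'?' pattern, and several alternatives can
   be packed into one entry separated by '|'.  So an exclude pattern of "*_locked"
   drops all the locked variants, while an include pattern of "add_u8|sub_u8"
   limits the run to just those two subtests. */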
1129
1130/** Decompresses test data before use as required. */
1131static int DecompressBinaryTest(bool *pfCompressed, void **ppvTests, uint32_t const **ppcTests, size_t cbEntry, bool fBinary)
1132{
1133 if (!*pfCompressed)
1134 return VINF_SUCCESS;
1135
1136 /* Open a memory stream for the compressed binary data. */
1137 uint32_t const cbCompressed = **ppcTests;
1138 RTVFSIOSTREAM hVfsIos = NIL_RTVFSIOSTREAM;
1139 int rc = RTVfsIoStrmFromBuffer(RTFILE_O_READ, *ppvTests, cbCompressed, &hVfsIos);
1140 RTTESTI_CHECK_RC_OK_RET(rc, rc);
1141
1142 /* Open a decompressed stream for it. */
1143 RTVFSIOSTREAM hVfsIosDecomp = NIL_RTVFSIOSTREAM;
1144 rc = RTZipGzipDecompressIoStream(hVfsIos, RTZIPGZIPDECOMP_F_ALLOW_ZLIB_HDR, &hVfsIosDecomp);
1145 RTTESTI_CHECK_RC_OK(rc);
1146 if (RT_SUCCESS(rc))
1147 {
1148 /* Initial output buffer allocation. */
1149 size_t cbDecompressedAlloc = cbCompressed <= _16M ? (size_t)cbCompressed * 16 : (size_t)cbCompressed * 4;
1150 uint8_t *pbDecompressed = (uint8_t *)RTMemAllocZ(cbDecompressedAlloc);
1151 if (pbDecompressed)
1152 {
1153 size_t off = 0;
1154 for (;;)
1155 {
1156 size_t cbRead = 0;
1157 rc = RTVfsIoStrmRead(hVfsIosDecomp, &pbDecompressed[off], cbDecompressedAlloc - off, true /*fBlocking*/, &cbRead);
1158 if (RT_FAILURE(rc))
1159 break;
1160 if (rc == VINF_EOF && cbRead == 0)
1161 break;
1162 off += cbRead;
1163
1164 if (cbDecompressedAlloc < off + 256)
1165 {
1166 size_t const cbNew = cbDecompressedAlloc < _128M ? cbDecompressedAlloc * 2 : cbDecompressedAlloc + _32M;
1167 void * const pvNew = RTMemRealloc(pbDecompressed, cbNew);
1168 AssertBreakStmt(pvNew, rc = VERR_NO_MEMORY);
1169 cbDecompressedAlloc = cbNew;
1170 pbDecompressed = (uint8_t *)pvNew;
1171 }
1172 }
1173 if (RT_SUCCESS(rc))
1174 {
1175 if ((off % cbEntry) == 0)
1176 {
1177 if (cbDecompressedAlloc - off > _512K)
1178 {
1179 void * const pvNew = RTMemRealloc(pbDecompressed, off);
1180 if (pvNew)
1181 pbDecompressed = (uint8_t *)pvNew;
1182 }
1183 uint32_t *pcTests = (uint32_t *)RTMemAlloc(sizeof(uint32_t));
1184 if (pcTests)
1185 {
1186 /* Done! */
1187 *pcTests = (uint32_t)(fBinary ? off : off / cbEntry);
1188 *ppvTests = pbDecompressed;
1189 *ppcTests = pcTests;
1190 *pfCompressed = false;
1191
1192 pbDecompressed = NULL;
1193 rc = VINF_SUCCESS;
1194 }
1195 else
1196 {
1197 RTTestIFailed("Out of memory decompressing test data (uint32_t)");
1198 rc = VERR_NO_MEMORY;
1199 }
1200 }
1201 else
1202 {
1203 RTTestIFailed("Uneven decompressed data size: %#zx vs entry size %#zx -> %#zx", off, cbEntry, off % cbEntry);
1204 rc = VERR_IO_BAD_LENGTH;
1205 }
1206 }
1207 else
1208 RTTestIFailed("Failed to decompress binary stream: %Rrc (off=%#zx, cbCompressed=%#x)", rc, off, cbCompressed);
1209 RTMemFree(pbDecompressed);
1210 }
1211 else
1212 {
1213 RTTestIFailed("Out of memory decompressing test data");
1214 rc = VERR_NO_MEMORY;
1215 }
1216 RTVfsIoStrmRelease(hVfsIosDecomp);
1217 }
1218 RTVfsIoStrmRelease(hVfsIos);
1219 return rc;
1220}
1221
1222#define DECOMPRESS_TESTS(a_Entry) \
1223 RT_SUCCESS(DecompressBinaryTest(&(a_Entry).fCompressed, (void **)&(a_Entry).paTests, &(a_Entry).pcTests, \
1224 sizeof((a_Entry).paTests[0]), (a_Entry).fBinary))
1225
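/* Note on the bookkeeping above (see DecompressBinaryTest): while an entry is
   still compressed, paTests points at the gzipped blob and *pcTests holds its
   size in bytes; after decompression, paTests points at the expanded data and
   *pcTests holds either the byte count (fBinary) or the number of fixed-size
   test records (!fBinary). */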
1226
1227/** Checks whether the subtest is enabled and decompresses its test data as required. */
1228static bool SubTestAndCheckIfEnabledAndDecompress(const char *pszName, size_t cbEntry, bool fBinary,
1229 bool *pfCompressed, void **ppvTests, uint32_t const **ppcTests)
1230{
1231 if (SubTestAndCheckIfEnabled(pszName))
1232 {
1233 int const rc = DecompressBinaryTest(pfCompressed, ppvTests, ppcTests, cbEntry, fBinary);
1234 if (RT_SUCCESS(rc))
1235 return true;
1236 }
1237 return false;
1238}
1239
1240#define SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(a_Entry) \
1241 SubTestAndCheckIfEnabledAndDecompress((a_Entry).pszName, sizeof((a_Entry).paTests[0]), (a_Entry).fBinary, \
1242 &(a_Entry).fCompressed, (void **)&(a_Entry).paTests, &(a_Entry).pcTests)
1243
1244
1245static const char *EFlagsDiff(uint32_t fActual, uint32_t fExpected)
1246{
1247 if (fActual == fExpected)
1248 return "";
1249
1250 uint32_t const fXor = fActual ^ fExpected;
1251 char *pszBuf = g_aszBuf[g_idxBuf++ % RT_ELEMENTS(g_aszBuf)];
1252 size_t cch = RTStrPrintf(pszBuf, sizeof(g_aszBuf[0]), " - %#x", fXor);
1253
1254 static struct
1255 {
1256 const char *pszName;
1257 uint32_t fFlag;
1258 } const s_aFlags[] =
1259 {
1260#define EFL_ENTRY(a_Flags) { #a_Flags, X86_EFL_ ## a_Flags }
1261 EFL_ENTRY(CF),
1262 EFL_ENTRY(PF),
1263 EFL_ENTRY(AF),
1264 EFL_ENTRY(ZF),
1265 EFL_ENTRY(SF),
1266 EFL_ENTRY(TF),
1267 EFL_ENTRY(IF),
1268 EFL_ENTRY(DF),
1269 EFL_ENTRY(OF),
1270 EFL_ENTRY(IOPL),
1271 EFL_ENTRY(NT),
1272 EFL_ENTRY(RF),
1273 EFL_ENTRY(VM),
1274 EFL_ENTRY(AC),
1275 EFL_ENTRY(VIF),
1276 EFL_ENTRY(VIP),
1277 EFL_ENTRY(ID),
1278 };
1279 for (size_t i = 0; i < RT_ELEMENTS(s_aFlags); i++)
1280 if (s_aFlags[i].fFlag & fXor)
1281 cch += RTStrPrintf(&pszBuf[cch], sizeof(g_aszBuf[0]) - cch,
1282 s_aFlags[i].fFlag & fActual ? "/%s" : "/!%s", s_aFlags[i].pszName);
1283 RTStrPrintf(&pszBuf[cch], sizeof(g_aszBuf[0]) - cch, "");
1284 return pszBuf;
1285}
1286
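/* Example of the diff format (illustrative): EFlagsDiff(0x203, 0x202) returns
   " - 0x1/CF", i.e. the XOR of the two values followed by one /NAME item per
   differing flag, where /NAME means the flag is set in the actual value and
   /!NAME means it is clear.  FswDiff and MxcsrDiff below use the same scheme. */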
1287
1288static const char *FswDiff(uint16_t fActual, uint16_t fExpected)
1289{
1290 if (fActual == fExpected)
1291 return "";
1292
1293 uint16_t const fXor = fActual ^ fExpected;
1294 char *pszBuf = g_aszBuf[g_idxBuf++ % RT_ELEMENTS(g_aszBuf)];
1295 size_t cch = RTStrPrintf(pszBuf, sizeof(g_aszBuf[0]), " - %#x", fXor);
1296
1297 static struct
1298 {
1299 const char *pszName;
1300 uint32_t fFlag;
1301 } const s_aFlags[] =
1302 {
1303#define FSW_ENTRY(a_Flags) { #a_Flags, X86_FSW_ ## a_Flags }
1304 FSW_ENTRY(IE),
1305 FSW_ENTRY(DE),
1306 FSW_ENTRY(ZE),
1307 FSW_ENTRY(OE),
1308 FSW_ENTRY(UE),
1309 FSW_ENTRY(PE),
1310 FSW_ENTRY(SF),
1311 FSW_ENTRY(ES),
1312 FSW_ENTRY(C0),
1313 FSW_ENTRY(C1),
1314 FSW_ENTRY(C2),
1315 FSW_ENTRY(C3),
1316 FSW_ENTRY(B),
1317 };
1318 for (size_t i = 0; i < RT_ELEMENTS(s_aFlags); i++)
1319 if (s_aFlags[i].fFlag & fXor)
1320 cch += RTStrPrintf(&pszBuf[cch], sizeof(g_aszBuf[0]) - cch,
1321 s_aFlags[i].fFlag & fActual ? "/%s" : "/!%s", s_aFlags[i].pszName);
1322 if (fXor & X86_FSW_TOP_MASK)
1323 cch += RTStrPrintf(&pszBuf[cch], sizeof(g_aszBuf[0]) - cch, "/TOP%u!%u",
1324 X86_FSW_TOP_GET(fActual), X86_FSW_TOP_GET(fExpected));
1325#if 0 /* For debugging fprem & fprem1 */
1326 cch += RTStrPrintf(&pszBuf[cch], sizeof(g_aszBuf[0]) - cch, " - Q=%d (vs %d)",
1327 X86_FSW_CX_TO_QUOTIENT(fActual), X86_FSW_CX_TO_QUOTIENT(fExpected));
1328#endif
1329 RTStrPrintf(&pszBuf[cch], sizeof(g_aszBuf[0]) - cch, "");
1330 return pszBuf;
1331}
1332
1333
1334static const char *MxcsrDiff(uint32_t fActual, uint32_t fExpected)
1335{
1336 if (fActual == fExpected)
1337 return "";
1338
1339 uint16_t const fXor = fActual ^ fExpected;
1340 char *pszBuf = g_aszBuf[g_idxBuf++ % RT_ELEMENTS(g_aszBuf)];
1341 size_t cch = RTStrPrintf(pszBuf, sizeof(g_aszBuf[0]), " - %#x", fXor);
1342
1343 static struct
1344 {
1345 const char *pszName;
1346 uint32_t fFlag;
1347 } const s_aFlags[] =
1348 {
1349#define MXCSR_ENTRY(a_Flags) { #a_Flags, X86_MXCSR_ ## a_Flags }
1350 MXCSR_ENTRY(IE),
1351 MXCSR_ENTRY(DE),
1352 MXCSR_ENTRY(ZE),
1353 MXCSR_ENTRY(OE),
1354 MXCSR_ENTRY(UE),
1355 MXCSR_ENTRY(PE),
1356
1357 MXCSR_ENTRY(IM),
1358 MXCSR_ENTRY(DM),
1359 MXCSR_ENTRY(ZM),
1360 MXCSR_ENTRY(OM),
1361 MXCSR_ENTRY(UM),
1362 MXCSR_ENTRY(PM),
1363
1364 MXCSR_ENTRY(DAZ),
1365 MXCSR_ENTRY(FZ),
1366#undef MXCSR_ENTRY
1367 };
1368 for (size_t i = 0; i < RT_ELEMENTS(s_aFlags); i++)
1369 if (s_aFlags[i].fFlag & fXor)
1370 cch += RTStrPrintf(&pszBuf[cch], sizeof(g_aszBuf[0]) - cch,
1371 s_aFlags[i].fFlag & fActual ? "/%s" : "/!%s", s_aFlags[i].pszName);
1372 RTStrPrintf(&pszBuf[cch], sizeof(g_aszBuf[0]) - cch, "");
1373 return pszBuf;
1374}
1375
1376
1377static const char *FormatFcw(uint16_t fFcw)
1378{
1379 char *pszBuf = g_aszBuf[g_idxBuf++ % RT_ELEMENTS(g_aszBuf)];
1380
1381 const char *pszPC = NULL; /* (msc+gcc are too stupid) */
1382 switch (fFcw & X86_FCW_PC_MASK)
1383 {
1384 case X86_FCW_PC_24: pszPC = "PC24"; break;
1385 case X86_FCW_PC_RSVD: pszPC = "PCRSVD!"; break;
1386 case X86_FCW_PC_53: pszPC = "PC53"; break;
1387 case X86_FCW_PC_64: pszPC = "PC64"; break;
1388 }
1389
1390 const char *pszRC = NULL; /* (msc+gcc are too stupid) */
1391 switch (fFcw & X86_FCW_RC_MASK)
1392 {
1393 case X86_FCW_RC_NEAREST: pszRC = "NEAR"; break;
1394 case X86_FCW_RC_DOWN: pszRC = "DOWN"; break;
1395 case X86_FCW_RC_UP: pszRC = "UP"; break;
1396 case X86_FCW_RC_ZERO: pszRC = "ZERO"; break;
1397 }
1398 size_t cch = RTStrPrintf(&pszBuf[0], sizeof(g_aszBuf[0]), "%s %s", pszPC, pszRC);
1399
1400 static struct
1401 {
1402 const char *pszName;
1403 uint32_t fFlag;
1404 } const s_aFlags[] =
1405 {
1406#define FCW_ENTRY(a_Flags) { #a_Flags, X86_FCW_ ## a_Flags }
1407 FCW_ENTRY(IM),
1408 FCW_ENTRY(DM),
1409 FCW_ENTRY(ZM),
1410 FCW_ENTRY(OM),
1411 FCW_ENTRY(UM),
1412 FCW_ENTRY(PM),
1413 { "6M", 64 },
1414 };
1415 for (size_t i = 0; i < RT_ELEMENTS(s_aFlags); i++)
1416 if (fFcw & s_aFlags[i].fFlag)
1417 cch += RTStrPrintf(&pszBuf[cch], sizeof(g_aszBuf[0]) - cch, " %s", s_aFlags[i].pszName);
1418
1419 RTStrPrintf(&pszBuf[cch], sizeof(g_aszBuf[0]) - cch, "");
1420 return pszBuf;
1421}
1422
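/* Example (assuming the standard x87 FCW bit layout): FormatFcw(0x037f), the
   FNINIT default control word, comes out as "PC64 NEAR IM DM ZM OM UM PM 6M",
   where "6M" marks the reserved bit 6 that happens to be set in that value. */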
1423
1424static const char *FormatMxcsr(uint32_t fMxcsr)
1425{
1426 char *pszBuf = g_aszBuf[g_idxBuf++ % RT_ELEMENTS(g_aszBuf)];
1427
1428 const char *pszRC = NULL; /* (msc+gcc are too stupid) */
1429 switch (fMxcsr & X86_MXCSR_RC_MASK)
1430 {
1431 case X86_MXCSR_RC_NEAREST: pszRC = "NEAR"; break;
1432 case X86_MXCSR_RC_DOWN: pszRC = "DOWN"; break;
1433 case X86_MXCSR_RC_UP: pszRC = "UP"; break;
1434 case X86_MXCSR_RC_ZERO: pszRC = "ZERO"; break;
1435 }
1436
1437 const char *pszDAZ = fMxcsr & X86_MXCSR_DAZ ? " DAZ" : "";
1438 const char *pszFZ = fMxcsr & X86_MXCSR_FZ ? " FZ" : "";
1439 size_t cch = RTStrPrintf(&pszBuf[0], sizeof(g_aszBuf[0]), "%s%s%s", pszRC, pszDAZ, pszFZ);
1440
1441 static struct
1442 {
1443 const char *pszName;
1444 uint32_t fFlag;
1445 } const s_aFlags[] =
1446 {
1447#define MXCSR_ENTRY(a_Flags) { #a_Flags, X86_MXCSR_ ## a_Flags }
1448 MXCSR_ENTRY(IE),
1449 MXCSR_ENTRY(DE),
1450 MXCSR_ENTRY(ZE),
1451 MXCSR_ENTRY(OE),
1452 MXCSR_ENTRY(UE),
1453 MXCSR_ENTRY(PE),
1454
1455 MXCSR_ENTRY(IM),
1456 MXCSR_ENTRY(DM),
1457 MXCSR_ENTRY(ZM),
1458 MXCSR_ENTRY(OM),
1459 MXCSR_ENTRY(UM),
1460 MXCSR_ENTRY(PM),
1461 { "6M", 64 },
1462 };
1463 for (size_t i = 0; i < RT_ELEMENTS(s_aFlags); i++)
1464 if (fMxcsr & s_aFlags[i].fFlag)
1465 cch += RTStrPrintf(&pszBuf[cch], sizeof(g_aszBuf[0]) - cch, " %s", s_aFlags[i].pszName);
1466
1467 RTStrPrintf(&pszBuf[cch], sizeof(g_aszBuf[0]) - cch, "");
1468 return pszBuf;
1469}
1470
1471
1472static const char *FormatR80(PCRTFLOAT80U pr80)
1473{
1474 char *pszBuf = g_aszBuf[g_idxBuf++ % RT_ELEMENTS(g_aszBuf)];
1475 RTStrFormatR80(pszBuf, sizeof(g_aszBuf[0]), pr80, 0, 0, RTSTR_F_SPECIAL);
1476 return pszBuf;
1477}
1478
1479
1480static const char *FormatR64(PCRTFLOAT64U pr64)
1481{
1482 char *pszBuf = g_aszBuf[g_idxBuf++ % RT_ELEMENTS(g_aszBuf)];
1483 RTStrFormatR64(pszBuf, sizeof(g_aszBuf[0]), pr64, 0, 0, RTSTR_F_SPECIAL);
1484 return pszBuf;
1485}
1486
1487
1488static const char *FormatR32(PCRTFLOAT32U pr32)
1489{
1490 char *pszBuf = g_aszBuf[g_idxBuf++ % RT_ELEMENTS(g_aszBuf)];
1491 RTStrFormatR32(pszBuf, sizeof(g_aszBuf[0]), pr32, 0, 0, RTSTR_F_SPECIAL);
1492 return pszBuf;
1493}
1494
1495
1496static const char *FormatD80(PCRTPBCD80U pd80)
1497{
1498 /* There is only one indefinite encoding (same as for 80-bit
1499 floating point), so get it out of the way first: */
1500 if (RTPBCD80U_IS_INDEFINITE(pd80))
1501 return "Ind";
1502
1503 char *pszBuf = g_aszBuf[g_idxBuf++ % RT_ELEMENTS(g_aszBuf)];
1504 size_t off = 0;
1505 pszBuf[off++] = pd80->s.fSign ? '-' : '+';
1506 unsigned cBadDigits = 0;
1507 size_t iPair = RT_ELEMENTS(pd80->s.abPairs);
1508 while (iPair-- > 0)
1509 {
1510 static const char s_szDigits[] = "0123456789abcdef";
1511 static const uint8_t s_bBadDigits[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1 };
1512 pszBuf[off++] = s_szDigits[RTPBCD80U_HI_DIGIT(pd80->s.abPairs[iPair])];
1513 pszBuf[off++] = s_szDigits[RTPBCD80U_LO_DIGIT(pd80->s.abPairs[iPair])];
1514 cBadDigits += s_bBadDigits[RTPBCD80U_HI_DIGIT(pd80->s.abPairs[iPair])]
1515 + s_bBadDigits[RTPBCD80U_LO_DIGIT(pd80->s.abPairs[iPair])];
1516 }
1517 if (cBadDigits || pd80->s.uPad != 0)
1518 off += RTStrPrintf(&pszBuf[off], sizeof(g_aszBuf[0]) - off, "[%u,%#x]", cBadDigits, pd80->s.uPad);
1519 pszBuf[off] = '\0';
1520 return pszBuf;
1521}
1522
1523
1524#if 0
1525static const char *FormatI64(int64_t const *piVal)
1526{
1527 char *pszBuf = g_aszBuf[g_idxBuf++ % RT_ELEMENTS(g_aszBuf)];
1528 RTStrFormatU64(pszBuf, sizeof(g_aszBuf[0]), *piVal, 16, 0, 0, RTSTR_F_SPECIAL | RTSTR_F_VALSIGNED);
1529 return pszBuf;
1530}
1531#endif
1532
1533
1534static const char *FormatI32(int32_t const *piVal)
1535{
1536 char *pszBuf = g_aszBuf[g_idxBuf++ % RT_ELEMENTS(g_aszBuf)];
1537 RTStrFormatU32(pszBuf, sizeof(g_aszBuf[0]), *piVal, 16, 0, 0, RTSTR_F_SPECIAL | RTSTR_F_VALSIGNED);
1538 return pszBuf;
1539}
1540
1541
1542static const char *FormatI16(int16_t const *piVal)
1543{
1544 char *pszBuf = g_aszBuf[g_idxBuf++ % RT_ELEMENTS(g_aszBuf)];
1545 RTStrFormatU16(pszBuf, sizeof(g_aszBuf[0]), *piVal, 16, 0, 0, RTSTR_F_SPECIAL | RTSTR_F_VALSIGNED);
1546 return pszBuf;
1547}
1548
1549
1550static const char *FormatU128(PCRTUINT128U puVal)
1551{
1552 char *pszBuf = g_aszBuf[g_idxBuf++ % RT_ELEMENTS(g_aszBuf)];
1553 RTStrFormatU128(pszBuf, sizeof(g_aszBuf[0]), puVal, 16, 0, 0, RTSTR_F_SPECIAL);
1554 return pszBuf;
1555}
1556
1557
1558/*
1559 * Binary operations.
1560 */
1561TYPEDEF_SUBTEST_TYPE(BINU8_T, BINU8_TEST_T, PFNIEMAIMPLBINU8);
1562TYPEDEF_SUBTEST_TYPE(BINU16_T, BINU16_TEST_T, PFNIEMAIMPLBINU16);
1563TYPEDEF_SUBTEST_TYPE(BINU32_T, BINU32_TEST_T, PFNIEMAIMPLBINU32);
1564TYPEDEF_SUBTEST_TYPE(BINU64_T, BINU64_TEST_T, PFNIEMAIMPLBINU64);
1565
1566#ifdef TSTIEMAIMPL_WITH_GENERATOR
1567# define GEN_BINARY_TESTS(a_cBits, a_Fmt, a_TestType) \
1568static RTEXITCODE BinU ## a_cBits ## Generate(uint32_t cTests, const char * const * papszNameFmts) \
1569{ \
1570 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aBinU ## a_cBits); iFn++) \
1571 { \
1572 PFNIEMAIMPLBINU ## a_cBits const pfn = g_aBinU ## a_cBits[iFn].pfnNative \
1573 ? g_aBinU ## a_cBits[iFn].pfnNative : g_aBinU ## a_cBits[iFn].pfn; \
1574 IEMBINARYOUTPUT BinOut; \
1575 if ( g_aBinU ## a_cBits[iFn].idxCpuEflFlavour != IEMTARGETCPU_EFL_BEHAVIOR_NATIVE \
1576 && g_aBinU ## a_cBits[iFn].idxCpuEflFlavour != g_idxCpuEflFlavour) \
1577 continue; \
1578 AssertReturn(GenerateBinaryOpen(&BinOut, papszNameFmts[g_aBinU ## a_cBits[iFn].idxCpuEflFlavour], \
1579 g_aBinU ## a_cBits[iFn].pszName), RTEXITCODE_FAILURE); \
1580 for (uint32_t iTest = 0; iTest < cTests; iTest++ ) \
1581 { \
1582 a_TestType Test; \
1583 Test.fEflIn = RandEFlags(); \
1584 Test.fEflOut = Test.fEflIn; \
1585 Test.uDstIn = RandU ## a_cBits ## Dst(iTest); \
1586 Test.uDstOut = Test.uDstIn; \
1587 Test.uSrcIn = RandU ## a_cBits ## Src(iTest); \
1588 if (g_aBinU ## a_cBits[iFn].uExtra) \
1589 Test.uSrcIn &= a_cBits - 1; /* Restrict bit index according to operand width */ \
1590 Test.uMisc = 0; \
1591 pfn(&Test.uDstOut, Test.uSrcIn, &Test.fEflOut); \
1592 GenerateBinaryWrite(&BinOut, &Test, sizeof(Test)); \
1593 } \
1594 for (uint32_t iTest = 0; iTest < g_aBinU ## a_cBits[iFn].cFixedTests; iTest++ ) \
1595 { \
1596 a_TestType Test; \
1597 Test.fEflIn = g_aBinU ## a_cBits[iFn].paFixedTests[iTest].fEflIn == UINT32_MAX ? RandEFlags() \
1598 : g_aBinU ## a_cBits[iFn].paFixedTests[iTest].fEflIn; \
1599 Test.fEflOut = Test.fEflIn; \
1600 Test.uDstIn = g_aBinU ## a_cBits[iFn].paFixedTests[iTest].uDstIn; \
1601 Test.uDstOut = Test.uDstIn; \
1602 Test.uSrcIn = g_aBinU ## a_cBits[iFn].paFixedTests[iTest].uSrcIn; \
1603 Test.uMisc = g_aBinU ## a_cBits[iFn].paFixedTests[iTest].uMisc; \
1604 pfn(&Test.uDstOut, Test.uSrcIn, &Test.fEflOut); \
1605 GenerateBinaryWrite(&BinOut, &Test, sizeof(Test)); \
1606 } \
1607 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE); \
1608 } \
1609 return RTEXITCODE_SUCCESS; \
1610} \
1611/* Temp for conversion. */ \
1612static RTEXITCODE BinU ## a_cBits ## DumpAll(const char * const * papszNameFmts) \
1613{ \
1614 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aBinU ## a_cBits); iFn++) \
1615 { \
1616 AssertReturn(DECOMPRESS_TESTS(g_aBinU ## a_cBits[iFn]), RTEXITCODE_FAILURE); \
1617 IEMBINARYOUTPUT BinOut; \
1618 AssertReturn(GenerateBinaryOpen(&BinOut, papszNameFmts[g_aBinU ## a_cBits[iFn].idxCpuEflFlavour], \
1619 g_aBinU ## a_cBits[iFn].pszName), RTEXITCODE_FAILURE); \
1620 size_t cbTests = g_aBinU ## a_cBits[iFn].pcTests[0]; \
1621 if (!g_aBinU ## a_cBits[iFn].fBinary) \
1622 cbTests *= sizeof(g_aBinU ## a_cBits[iFn].paTests[0]); \
1623 GenerateBinaryWrite(&BinOut, g_aBinU ## a_cBits[iFn].paTests, cbTests); \
1624 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE); \
1625 } \
1626 return RTEXITCODE_SUCCESS; \
1627}
1628
1629#else
1630# define GEN_BINARY_TESTS(a_cBits, a_Fmt, a_TestType)
1631#endif
1632
1633
1634/** Based on a quick probe run, guess how long to run the benchmark. */
1635static uint32_t EstimateIterations(uint32_t cProbeIterations, uint64_t cNsProbe)
1636{
1637 uint64_t cPicoSecPerIteration = cNsProbe * 1000 / cProbeIterations;
1638 uint64_t cIterations = g_cPicoSecBenchmark / cPicoSecPerIteration;
1639 if (cIterations > _2G)
1640 return _2G;
1641 if (cIterations < _4K)
1642 return _4K;
1643 return RT_ALIGN_32((uint32_t)cIterations, _4K);
1644}
1645
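/* Worked example with made-up numbers: if the 64K-iteration probe takes 1ms,
   that is 1000000 ns * 1000 / 65536, i.e. roughly 15258 ps per call.  With a
   benchmark budget of 3 seconds (g_cPicoSecBenchmark = 3 * 10^12 ps) this yields
   about 196.6 million iterations, which is then rounded up to a 4K multiple and
   clamped to the [4K, 2G] range. */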
1646
1647#define TEST_BINARY_OPS(a_cBits, a_uType, a_Fmt, a_TestType, a_aSubTests) \
1648GEN_BINARY_TESTS(a_cBits, a_Fmt, a_TestType) \
1649\
1650static uint64_t BinU ## a_cBits ## Bench(uint32_t cIterations, PFNIEMAIMPLBINU ## a_cBits pfn, a_TestType const *pEntry) \
1651{ \
1652 uint32_t const fEflIn = pEntry->fEflIn; \
1653 a_uType const uDstIn = pEntry->uDstIn; \
1654 a_uType const uSrcIn = pEntry->uSrcIn; \
1655 cIterations /= 4; \
1656 RTThreadYield(); \
1657 uint64_t const nsStart = RTTimeNanoTS(); \
1658 for (uint32_t i = 0; i < cIterations; i++) \
1659 { \
1660 uint32_t fBenchEfl = fEflIn; \
1661 a_uType uBenchDst = uDstIn; \
1662 pfn(&uBenchDst, uSrcIn, &fBenchEfl); \
1663 \
1664 fBenchEfl = fEflIn; \
1665 uBenchDst = uDstIn; \
1666 pfn(&uBenchDst, uSrcIn, &fBenchEfl); \
1667 \
1668 fBenchEfl = fEflIn; \
1669 uBenchDst = uDstIn; \
1670 pfn(&uBenchDst, uSrcIn, &fBenchEfl); \
1671 \
1672 fBenchEfl = fEflIn; \
1673 uBenchDst = uDstIn; \
1674 pfn(&uBenchDst, uSrcIn, &fBenchEfl); \
1675 } \
1676 return RTTimeNanoTS() - nsStart; \
1677} \
1678\
1679static void BinU ## a_cBits ## Test(void) \
1680{ \
1681 for (size_t iFn = 0; iFn < RT_ELEMENTS(a_aSubTests); iFn++) \
1682 { \
1683 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(a_aSubTests[iFn])) \
1684 continue; \
1685 a_TestType const * const paTests = a_aSubTests[iFn].paTests; \
1686 uint32_t const cTests = *a_aSubTests[iFn].pcTests; \
1687 PFNIEMAIMPLBINU ## a_cBits pfn = a_aSubTests[iFn].pfn; \
1688 uint32_t const cVars = COUNT_VARIATIONS(a_aSubTests[iFn]); \
1689 if (!cTests) { RTTestSkipped(g_hTest, "no tests"); continue; } \
1690 for (uint32_t iVar = 0; iVar < cVars; iVar++) \
1691 { \
1692 for (uint32_t iTest = 0; iTest < cTests; iTest++ ) \
1693 { \
1694 uint32_t fEfl = paTests[iTest].fEflIn; \
1695 a_uType uDst = paTests[iTest].uDstIn; \
1696 pfn(&uDst, paTests[iTest].uSrcIn, &fEfl); \
1697 if ( uDst != paTests[iTest].uDstOut \
1698 || fEfl != paTests[iTest].fEflOut ) \
1699 RTTestFailed(g_hTest, "#%u%s: efl=%#08x dst=" a_Fmt " src=" a_Fmt " -> efl=%#08x dst=" a_Fmt ", expected %#08x & " a_Fmt "%s - %s\n", \
1700 iTest, !iVar ? "" : "/n", paTests[iTest].fEflIn, paTests[iTest].uDstIn, paTests[iTest].uSrcIn, \
1701 fEfl, uDst, paTests[iTest].fEflOut, paTests[iTest].uDstOut, \
1702 EFlagsDiff(fEfl, paTests[iTest].fEflOut), \
1703 uDst == paTests[iTest].uDstOut ? "eflags" : fEfl == paTests[iTest].fEflOut ? "dst" : "both"); \
1704 else \
1705 { \
1706 *g_pu ## a_cBits = paTests[iTest].uDstIn; \
1707 *g_pfEfl = paTests[iTest].fEflIn; \
1708 pfn(g_pu ## a_cBits, paTests[iTest].uSrcIn, g_pfEfl); \
1709 RTTEST_CHECK(g_hTest, *g_pu ## a_cBits == paTests[iTest].uDstOut); \
1710 RTTEST_CHECK(g_hTest, *g_pfEfl == paTests[iTest].fEflOut); \
1711 } \
1712 } \
1713 \
1714 /* Benchmark if all succeeded. */ \
1715 if (g_cPicoSecBenchmark && RTTestSubErrorCount(g_hTest) == 0) \
1716 { \
1717 uint32_t const iTest = cTests / 2; \
1718 uint32_t const cIterations = EstimateIterations(_64K, BinU ## a_cBits ## Bench(_64K, pfn, &paTests[iTest])); \
1719 uint64_t const cNsRealRun = BinU ## a_cBits ## Bench(cIterations, pfn, &paTests[iTest]); \
1720 RTTestValueF(g_hTest, cNsRealRun * 1000 / cIterations, RTTESTUNIT_PS_PER_CALL, \
1721 "%s%s", a_aSubTests[iFn].pszName, iVar ? "-native" : ""); \
1722 } \
1723 \
1724 /* Next variation is native. */ \
1725 pfn = a_aSubTests[iFn].pfnNative; \
1726 } \
1727 } \
1728}
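

/*
 * How the macro above is meant to be read: each TEST_BINARY_OPS instantiation
 * further down (e.g. the 8-bit one right below) emits a BinU<N>Bench() and
 * BinU<N>Test() pair, and every worker in the corresponding table is driven
 * through the same in/out shape.  A minimal sketch with arbitrarily chosen
 * values (not taken from the test data):
 *
 *      uint8_t  uDst = 0x7f;
 *      uint32_t fEfl = 0;
 *      iemAImpl_add_u8(&uDst, 1, &fEfl);   // uDst == 0x80; OF and SF get set
 */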
1729
1730
1731/*
1732 * 8-bit binary operations.
1733 */
1734static BINU8_T g_aBinU8[] =
1735{
1736 ENTRY(add_u8),
1737 ENTRY(add_u8_locked),
1738 ENTRY(adc_u8),
1739 ENTRY(adc_u8_locked),
1740 ENTRY(sub_u8),
1741 ENTRY(sub_u8_locked),
1742 ENTRY(sbb_u8),
1743 ENTRY(sbb_u8_locked),
1744 ENTRY(or_u8),
1745 ENTRY(or_u8_locked),
1746 ENTRY(xor_u8),
1747 ENTRY(xor_u8_locked),
1748 ENTRY(and_u8),
1749 ENTRY(and_u8_locked),
1750 ENTRY_PFN_CAST(cmp_u8, PFNIEMAIMPLBINU8),
1751 ENTRY_PFN_CAST(test_u8, PFNIEMAIMPLBINU8),
1752};
1753TEST_BINARY_OPS(8, uint8_t, "%#04x", BINU8_TEST_T, g_aBinU8)
1754
1755
1756/*
1757 * 16-bit binary operations.
1758 */
1759#ifdef TSTIEMAIMPL_WITH_GENERATOR
1760static const BINU16_TEST_T g_aFixedTests_add_u16[] =
1761{
1762 /* efl in, efl out, uDstIn, uDstOut, uSrc, uExtra */
1763 { UINT32_MAX, 0, 1, 0, UINT16_MAX, 0 },
1764};
1765#endif
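/* The single fixed entry above pins the 1 + 0xffff wrap-around case into the
 * generated data regardless of what the random inputs happen to cover.  Only
 * the input columns matter; the expected-output columns are recomputed by the
 * generator (see the fixed-test loop further up).  The 32-bit and 64-bit
 * tables below do the same for their widths. */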
1766static BINU16_T g_aBinU16[] =
1767{
1768 ENTRY_FIX(add_u16),
1769 ENTRY(add_u16_locked),
1770 ENTRY(adc_u16),
1771 ENTRY(adc_u16_locked),
1772 ENTRY(sub_u16),
1773 ENTRY(sub_u16_locked),
1774 ENTRY(sbb_u16),
1775 ENTRY(sbb_u16_locked),
1776 ENTRY(or_u16),
1777 ENTRY(or_u16_locked),
1778 ENTRY(xor_u16),
1779 ENTRY(xor_u16_locked),
1780 ENTRY(and_u16),
1781 ENTRY(and_u16_locked),
1782 ENTRY_PFN_CAST(cmp_u16, PFNIEMAIMPLBINU16),
1783 ENTRY_PFN_CAST(test_u16, PFNIEMAIMPLBINU16),
1784 ENTRY_PFN_CAST_EX(bt_u16, PFNIEMAIMPLBINU16, 1),
1785 ENTRY_EX(btc_u16, 1),
1786 ENTRY_EX(btc_u16_locked, 1),
1787 ENTRY_EX(btr_u16, 1),
1788 ENTRY_EX(btr_u16_locked, 1),
1789 ENTRY_EX(bts_u16, 1),
1790 ENTRY_EX(bts_u16_locked, 1),
1791 ENTRY_AMD( bsf_u16, X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF),
1792 ENTRY_INTEL(bsf_u16, X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF),
1793 ENTRY_AMD( bsr_u16, X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF),
1794 ENTRY_INTEL(bsr_u16, X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF),
1795 ENTRY_AMD( imul_two_u16, X86_EFL_PF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF),
1796 ENTRY_INTEL(imul_two_u16, X86_EFL_PF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF),
1797 ENTRY(arpl),
1798};
1799TEST_BINARY_OPS(16, uint16_t, "%#06x", BINU16_TEST_T, g_aBinU16)
1800
1801
1802/*
1803 * 32-bit binary operations.
1804 */
1805#ifdef TSTIEMAIMPL_WITH_GENERATOR
1806static const BINU32_TEST_T g_aFixedTests_add_u32[] =
1807{
1808 /* efl in, efl out, uDstIn, uDstOut, uSrc, uExtra */
1809 { UINT32_MAX, 0, 1, 0, UINT32_MAX, 0 },
1810};
1811#endif
1812static BINU32_T g_aBinU32[] =
1813{
1814 ENTRY_FIX(add_u32),
1815 ENTRY(add_u32_locked),
1816 ENTRY(adc_u32),
1817 ENTRY(adc_u32_locked),
1818 ENTRY(sub_u32),
1819 ENTRY(sub_u32_locked),
1820 ENTRY(sbb_u32),
1821 ENTRY(sbb_u32_locked),
1822 ENTRY(or_u32),
1823 ENTRY(or_u32_locked),
1824 ENTRY(xor_u32),
1825 ENTRY(xor_u32_locked),
1826 ENTRY(and_u32),
1827 ENTRY(and_u32_locked),
1828 ENTRY_PFN_CAST(cmp_u32, PFNIEMAIMPLBINU32),
1829 ENTRY_PFN_CAST(test_u32, PFNIEMAIMPLBINU32),
1830 ENTRY_PFN_CAST_EX(bt_u32, PFNIEMAIMPLBINU32, 1),
1831 ENTRY_EX(btc_u32, 1),
1832 ENTRY_EX(btc_u32_locked, 1),
1833 ENTRY_EX(btr_u32, 1),
1834 ENTRY_EX(btr_u32_locked, 1),
1835 ENTRY_EX(bts_u32, 1),
1836 ENTRY_EX(bts_u32_locked, 1),
1837 ENTRY_AMD( bsf_u32, X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF),
1838 ENTRY_INTEL(bsf_u32, X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF),
1839 ENTRY_AMD( bsr_u32, X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF),
1840 ENTRY_INTEL(bsr_u32, X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF),
1841 ENTRY_AMD( imul_two_u32, X86_EFL_PF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF),
1842 ENTRY_INTEL(imul_two_u32, X86_EFL_PF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF),
1843 ENTRY(adcx_u32),
1844 ENTRY(adox_u32),
1845};
1846TEST_BINARY_OPS(32, uint32_t, "%#010RX32", BINU32_TEST_T, g_aBinU32)
1847
1848
1849/*
1850 * 64-bit binary operations.
1851 */
1852#ifdef TSTIEMAIMPL_WITH_GENERATOR
1853static const BINU64_TEST_T g_aFixedTests_add_u64[] =
1854{
1855 /* efl in, efl out, uDstIn, uDstOut, uSrc, uExtra */
1856 { UINT32_MAX, 0, 1, 0, UINT64_MAX, 0 },
1857};
1858#endif
1859static BINU64_T g_aBinU64[] =
1860{
1861 ENTRY_FIX(add_u64),
1862 ENTRY(add_u64_locked),
1863 ENTRY(adc_u64),
1864 ENTRY(adc_u64_locked),
1865 ENTRY(sub_u64),
1866 ENTRY(sub_u64_locked),
1867 ENTRY(sbb_u64),
1868 ENTRY(sbb_u64_locked),
1869 ENTRY(or_u64),
1870 ENTRY(or_u64_locked),
1871 ENTRY(xor_u64),
1872 ENTRY(xor_u64_locked),
1873 ENTRY(and_u64),
1874 ENTRY(and_u64_locked),
1875 ENTRY_PFN_CAST(cmp_u64, PFNIEMAIMPLBINU64),
1876 ENTRY_PFN_CAST(test_u64, PFNIEMAIMPLBINU64),
1877 ENTRY_PFN_CAST_EX(bt_u64, PFNIEMAIMPLBINU64, 1),
1878 ENTRY_EX(btc_u64, 1),
1879 ENTRY_EX(btc_u64_locked, 1),
1880 ENTRY_EX(btr_u64, 1),
1881 ENTRY_EX(btr_u64_locked, 1),
1882 ENTRY_EX(bts_u64, 1),
1883 ENTRY_EX(bts_u64_locked, 1),
1884 ENTRY_AMD( bsf_u64, X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF),
1885 ENTRY_INTEL(bsf_u64, X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF),
1886 ENTRY_AMD( bsr_u64, X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF),
1887 ENTRY_INTEL(bsr_u64, X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF),
1888 ENTRY_AMD( imul_two_u64, X86_EFL_PF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF),
1889 ENTRY_INTEL(imul_two_u64, X86_EFL_PF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF),
1890 ENTRY(adcx_u64),
1891 ENTRY(adox_u64),
1892};
1893TEST_BINARY_OPS(64, uint64_t, "%#018RX64", BINU64_TEST_T, g_aBinU64)
1894
1895
1896/*
1897 * XCHG
1898 */
1899static void XchgTest(void)
1900{
1901 if (!SubTestAndCheckIfEnabled("xchg"))
1902 return;
1903 typedef IEM_DECL_IMPL_TYPE(void, FNIEMAIMPLXCHGU8, (uint8_t *pu8Mem, uint8_t *pu8Reg));
1904 typedef IEM_DECL_IMPL_TYPE(void, FNIEMAIMPLXCHGU16,(uint16_t *pu16Mem, uint16_t *pu16Reg));
1905 typedef IEM_DECL_IMPL_TYPE(void, FNIEMAIMPLXCHGU32,(uint32_t *pu32Mem, uint32_t *pu32Reg));
1906 typedef IEM_DECL_IMPL_TYPE(void, FNIEMAIMPLXCHGU64,(uint64_t *pu64Mem, uint64_t *pu64Reg));
1907
1908 static struct
1909 {
1910 uint8_t cb; uint64_t fMask;
1911 union
1912 {
1913 uintptr_t pfn;
1914 FNIEMAIMPLXCHGU8 *pfnU8;
1915 FNIEMAIMPLXCHGU16 *pfnU16;
1916 FNIEMAIMPLXCHGU32 *pfnU32;
1917 FNIEMAIMPLXCHGU64 *pfnU64;
1918 } u;
1919 }
1920 s_aXchgWorkers[] =
1921 {
1922 { 1, UINT8_MAX, { (uintptr_t)iemAImpl_xchg_u8_locked } },
1923 { 2, UINT16_MAX, { (uintptr_t)iemAImpl_xchg_u16_locked } },
1924 { 4, UINT32_MAX, { (uintptr_t)iemAImpl_xchg_u32_locked } },
1925 { 8, UINT64_MAX, { (uintptr_t)iemAImpl_xchg_u64_locked } },
1926 { 1, UINT8_MAX, { (uintptr_t)iemAImpl_xchg_u8_unlocked } },
1927 { 2, UINT16_MAX, { (uintptr_t)iemAImpl_xchg_u16_unlocked } },
1928 { 4, UINT32_MAX, { (uintptr_t)iemAImpl_xchg_u32_unlocked } },
1929 { 8, UINT64_MAX, { (uintptr_t)iemAImpl_xchg_u64_unlocked } },
1930 };
1931 for (size_t i = 0; i < RT_ELEMENTS(s_aXchgWorkers); i++)
1932 {
1933 RTUINT64U uIn1, uIn2, uMem, uDst;
1934 uMem.u = uIn1.u = RTRandU64Ex(0, s_aXchgWorkers[i].fMask);
1935 uDst.u = uIn2.u = RTRandU64Ex(0, s_aXchgWorkers[i].fMask);
1936 if (uIn1.u == uIn2.u)
1937 uDst.u = uIn2.u = ~uIn2.u;
1938
1939 switch (s_aXchgWorkers[i].cb)
1940 {
1941 case 1:
1942 s_aXchgWorkers[i].u.pfnU8(g_pu8, g_pu8Two);
1943 s_aXchgWorkers[i].u.pfnU8(&uMem.au8[0], &uDst.au8[0]);
1944 break;
1945 case 2:
1946 s_aXchgWorkers[i].u.pfnU16(g_pu16, g_pu16Two);
1947 s_aXchgWorkers[i].u.pfnU16(&uMem.Words.w0, &uDst.Words.w0);
1948 break;
1949 case 4:
1950 s_aXchgWorkers[i].u.pfnU32(g_pu32, g_pu32Two);
1951 s_aXchgWorkers[i].u.pfnU32(&uMem.DWords.dw0, &uDst.DWords.dw0);
1952 break;
1953 case 8:
1954 s_aXchgWorkers[i].u.pfnU64(g_pu64, g_pu64Two);
1955 s_aXchgWorkers[i].u.pfnU64(&uMem.u, &uDst.u);
1956 break;
1957 default: RTTestFailed(g_hTest, "%d\n", s_aXchgWorkers[i].cb); break;
1958 }
1959
1960 if (uMem.u != uIn2.u || uDst.u != uIn1.u)
1961 RTTestFailed(g_hTest, "i=%u: %#RX64, %#RX64 -> %#RX64, %#RX64\n", i, uIn1.u, uIn2.u, uMem.u, uDst.u);
1962 }
1963}
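

/*
 * For reference, the xchg workers are a plain two-operand swap; a minimal
 * standalone use of one of them (values picked arbitrarily) looks like:
 *
 *      uint8_t uMem = 0x12, uReg = 0x34;
 *      iemAImpl_xchg_u8_locked(&uMem, &uReg);
 *      // uMem == 0x34, uReg == 0x12
 *
 * As far as this test is concerned the locked and unlocked variants must
 * produce identical results; only the memory access discipline differs.
 */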
1964
1965
1966/*
1967 * XADD
1968 */
1969static void XaddTest(void)
1970{
1971#define TEST_XADD(a_cBits, a_Type, a_Fmt) do { \
1972 typedef IEM_DECL_IMPL_TYPE(void, FNIEMAIMPLXADDU ## a_cBits, (a_Type *, a_Type *, uint32_t *)); \
1973 static struct \
1974 { \
1975 const char *pszName; \
1976 FNIEMAIMPLXADDU ## a_cBits *pfn; \
1977 BINU ## a_cBits ## _TEST_T const *paTests; \
1978 uint32_t const *pcTests; \
1979 } const s_aFuncs[] = \
1980 { \
1981 { "xadd_u" # a_cBits, iemAImpl_xadd_u ## a_cBits, \
1982 g_aTests_add_u ## a_cBits, &g_cTests_add_u ## a_cBits }, \
1983        { "xadd_u" # a_cBits "_locked", iemAImpl_xadd_u ## a_cBits ## _locked, \
1984 g_aTests_add_u ## a_cBits, &g_cTests_add_u ## a_cBits }, \
1985 }; \
1986 for (size_t iFn = 0; iFn < RT_ELEMENTS(s_aFuncs); iFn++) \
1987 { \
1988 if (!SubTestAndCheckIfEnabled(s_aFuncs[iFn].pszName)) continue; \
1989 uint32_t const cTests = *s_aFuncs[iFn].pcTests; \
1990 BINU ## a_cBits ## _TEST_T const * const paTests = s_aFuncs[iFn].paTests; \
1991 if (!cTests) RTTestSkipped(g_hTest, "no tests"); \
1992 for (uint32_t iTest = 0; iTest < cTests; iTest++) \
1993 { \
1994 uint32_t fEfl = paTests[iTest].fEflIn; \
1995 a_Type uSrc = paTests[iTest].uSrcIn; \
1996 *g_pu ## a_cBits = paTests[iTest].uDstIn; \
1997 s_aFuncs[iFn].pfn(g_pu ## a_cBits, &uSrc, &fEfl); \
1998 if ( fEfl != paTests[iTest].fEflOut \
1999 || *g_pu ## a_cBits != paTests[iTest].uDstOut \
2000 || uSrc != paTests[iTest].uDstIn) \
2001 RTTestFailed(g_hTest, "%s/#%u: efl=%#08x dst=" a_Fmt " src=" a_Fmt " -> efl=%#08x dst=" a_Fmt " src=" a_Fmt ", expected %#08x, " a_Fmt ", " a_Fmt "%s\n", \
2002 s_aFuncs[iFn].pszName, iTest, paTests[iTest].fEflIn, paTests[iTest].uDstIn, paTests[iTest].uSrcIn, \
2003 fEfl, *g_pu ## a_cBits, uSrc, paTests[iTest].fEflOut, paTests[iTest].uDstOut, paTests[iTest].uDstIn, \
2004 EFlagsDiff(fEfl, paTests[iTest].fEflOut)); \
2005 } \
2006 } \
2007 } while(0)
2008 TEST_XADD(8, uint8_t, "%#04x");
2009 TEST_XADD(16, uint16_t, "%#06x");
2010 TEST_XADD(32, uint32_t, "%#010RX32");
2011 TEST_XADD(64, uint64_t, "%#010RX64");
2012}
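

/*
 * The checks above encode the xadd contract: the destination ends up holding
 * the sum (which is why the add_u* test data can be reused here) while the
 * source register receives the previous destination value.  Rough sketch,
 * values picked arbitrarily:
 *
 *      uint32_t fEfl = 0;
 *      uint16_t uDst = 100, uSrc = 42;
 *      iemAImpl_xadd_u16(&uDst, &uSrc, &fEfl);
 *      // uDst == 142, uSrc == 100
 */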
2013
2014
2015/*
2016 * CMPXCHG
2017 */
2018
2019static void CmpXchgTest(void)
2020{
2021#define TEST_CMPXCHG(a_cBits, a_Type, a_Fmt) do {\
2022 typedef IEM_DECL_IMPL_TYPE(void, FNIEMAIMPLCMPXCHGU ## a_cBits, (a_Type *, a_Type *, a_Type, uint32_t *)); \
2023 static struct \
2024 { \
2025 const char *pszName; \
2026 FNIEMAIMPLCMPXCHGU ## a_cBits *pfn; \
2027 PFNIEMAIMPLBINU ## a_cBits pfnSub; \
2028 BINU ## a_cBits ## _TEST_T const *paTests; \
2029 uint32_t const *pcTests; \
2030 } const s_aFuncs[] = \
2031 { \
2032 { "cmpxchg_u" # a_cBits, iemAImpl_cmpxchg_u ## a_cBits, iemAImpl_sub_u ## a_cBits, \
2033 g_aTests_cmp_u ## a_cBits, &g_cTests_cmp_u ## a_cBits }, \
2034 { "cmpxchg_u" # a_cBits "_locked", iemAImpl_cmpxchg_u ## a_cBits ## _locked, iemAImpl_sub_u ## a_cBits, \
2035 g_aTests_cmp_u ## a_cBits, &g_cTests_cmp_u ## a_cBits }, \
2036 }; \
2037 for (size_t iFn = 0; iFn < RT_ELEMENTS(s_aFuncs); iFn++) \
2038 { \
2039 if (!SubTestAndCheckIfEnabled(s_aFuncs[iFn].pszName)) continue; \
2040 BINU ## a_cBits ## _TEST_T const * const paTests = s_aFuncs[iFn].paTests; \
2041 uint32_t const cTests = *s_aFuncs[iFn].pcTests; \
2042 if (!cTests) RTTestSkipped(g_hTest, "no tests"); \
2043 for (uint32_t iTest = 0; iTest < cTests; iTest++) \
2044 { \
2045            /* As-is: with random data the compare is almost certain to fail (negative test). */ \
2046 uint32_t fEfl = paTests[iTest].fEflIn; \
2047 a_Type const uNew = paTests[iTest].uSrcIn + 0x42; \
2048 a_Type uA = paTests[iTest].uDstIn; \
2049 *g_pu ## a_cBits = paTests[iTest].uSrcIn; \
2050 a_Type const uExpect = uA != paTests[iTest].uSrcIn ? paTests[iTest].uSrcIn : uNew; \
2051 s_aFuncs[iFn].pfn(g_pu ## a_cBits, &uA, uNew, &fEfl); \
2052 if ( fEfl != paTests[iTest].fEflOut \
2053 || *g_pu ## a_cBits != uExpect \
2054 || uA != paTests[iTest].uSrcIn) \
2055 RTTestFailed(g_hTest, "%s/#%ua: efl=%#08x dst=" a_Fmt " cmp=" a_Fmt " new=" a_Fmt " -> efl=%#08x dst=" a_Fmt " old=" a_Fmt ", expected %#08x, " a_Fmt ", " a_Fmt "%s\n", \
2056 s_aFuncs[iFn].pszName, iTest, paTests[iTest].fEflIn, paTests[iTest].uSrcIn, paTests[iTest].uDstIn, \
2057 uNew, fEfl, *g_pu ## a_cBits, uA, paTests[iTest].fEflOut, uExpect, paTests[iTest].uSrcIn, \
2058 EFlagsDiff(fEfl, paTests[iTest].fEflOut)); \
2059 /* positive */ \
2060 uint32_t fEflExpect = paTests[iTest].fEflIn; \
2061 uA = paTests[iTest].uDstIn; \
2062 s_aFuncs[iFn].pfnSub(&uA, uA, &fEflExpect); \
2063 fEfl = paTests[iTest].fEflIn; \
2064 uA = paTests[iTest].uDstIn; \
2065 *g_pu ## a_cBits = uA; \
2066 s_aFuncs[iFn].pfn(g_pu ## a_cBits, &uA, uNew, &fEfl); \
2067 if ( fEfl != fEflExpect \
2068 || *g_pu ## a_cBits != uNew \
2069 || uA != paTests[iTest].uDstIn) \
2070 RTTestFailed(g_hTest, "%s/#%ua: efl=%#08x dst=" a_Fmt " cmp=" a_Fmt " new=" a_Fmt " -> efl=%#08x dst=" a_Fmt " old=" a_Fmt ", expected %#08x, " a_Fmt ", " a_Fmt "%s\n", \
2071 s_aFuncs[iFn].pszName, iTest, paTests[iTest].fEflIn, paTests[iTest].uDstIn, paTests[iTest].uDstIn, \
2072 uNew, fEfl, *g_pu ## a_cBits, uA, fEflExpect, uNew, paTests[iTest].uDstIn, \
2073 EFlagsDiff(fEfl, fEflExpect)); \
2074 } \
2075 } \
2076 } while(0)
2077 TEST_CMPXCHG(8, uint8_t, "%#04RX8");
2078 TEST_CMPXCHG(16, uint16_t, "%#06x");
2079 TEST_CMPXCHG(32, uint32_t, "%#010RX32");
2080#if ARCH_BITS != 32 /* calling convention issue, skipping as it's an unsupported host */
2081 TEST_CMPXCHG(64, uint64_t, "%#010RX64");
2082#endif
2083}
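
/*
 * Both passes above follow the cmpxchg contract: when the accumulator equals
 * the destination, the destination is replaced by the new value and ZF is set
 * (the "positive" pass, which borrows the sub_u* worker to compute the
 * expected flags); otherwise the accumulator is loaded from the destination
 * and ZF is cleared.  Rough sketch of the positive case, arbitrary values:
 *
 *      uint32_t fEfl = 0;
 *      uint16_t uDst = 7, uAccu = 7;
 *      iemAImpl_cmpxchg_u16(&uDst, &uAccu, 9, &fEfl);
 *      // uDst == 9, uAccu == 7, ZF set in fEfl
 */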
2084
2085static void CmpXchg8bTest(void)
2086{
2087 typedef IEM_DECL_IMPL_TYPE(void, FNIEMAIMPLCMPXCHG8B,(uint64_t *, PRTUINT64U, PRTUINT64U, uint32_t *));
2088 static struct
2089 {
2090 const char *pszName;
2091 FNIEMAIMPLCMPXCHG8B *pfn;
2092 } const s_aFuncs[] =
2093 {
2094 { "cmpxchg8b", iemAImpl_cmpxchg8b },
2095 { "cmpxchg8b_locked", iemAImpl_cmpxchg8b_locked },
2096 };
2097 for (size_t iFn = 0; iFn < RT_ELEMENTS(s_aFuncs); iFn++)
2098 {
2099 if (!SubTestAndCheckIfEnabled(s_aFuncs[iFn].pszName))
2100 continue;
2101 for (uint32_t iTest = 0; iTest < 4; iTest += 2)
2102 {
2103 uint64_t const uOldValue = RandU64();
2104 uint64_t const uNewValue = RandU64();
2105
2106 /* positive test. */
2107 RTUINT64U uA, uB;
2108 uB.u = uNewValue;
2109 uA.u = uOldValue;
2110 *g_pu64 = uOldValue;
2111 uint32_t fEflIn = RandEFlags();
2112 uint32_t fEfl = fEflIn;
2113 s_aFuncs[iFn].pfn(g_pu64, &uA, &uB, &fEfl);
2114 if ( fEfl != (fEflIn | X86_EFL_ZF)
2115 || *g_pu64 != uNewValue
2116 || uA.u != uOldValue)
2117 RTTestFailed(g_hTest, "#%u: efl=%#08x dst=%#018RX64 cmp=%#018RX64 new=%#018RX64\n -> efl=%#08x dst=%#018RX64 old=%#018RX64,\n wanted %#08x, %#018RX64, %#018RX64%s\n",
2118 iTest, fEflIn, uOldValue, uOldValue, uNewValue,
2119 fEfl, *g_pu64, uA.u,
2120 (fEflIn | X86_EFL_ZF), uNewValue, uOldValue, EFlagsDiff(fEfl, fEflIn | X86_EFL_ZF));
2121 RTTEST_CHECK(g_hTest, uB.u == uNewValue);
2122
2123 /* negative */
2124 uint64_t const uExpect = ~uOldValue;
2125 *g_pu64 = uExpect;
2126 uA.u = uOldValue;
2127 uB.u = uNewValue;
2128 fEfl = fEflIn = RandEFlags();
2129 s_aFuncs[iFn].pfn(g_pu64, &uA, &uB, &fEfl);
2130 if ( fEfl != (fEflIn & ~X86_EFL_ZF)
2131 || *g_pu64 != uExpect
2132 || uA.u != uExpect)
2133 RTTestFailed(g_hTest, "#%u: efl=%#08x dst=%#018RX64 cmp=%#018RX64 new=%#018RX64\n -> efl=%#08x dst=%#018RX64 old=%#018RX64,\n wanted %#08x, %#018RX64, %#018RX64%s\n",
2134 iTest + 1, fEflIn, uExpect, uOldValue, uNewValue,
2135 fEfl, *g_pu64, uA.u,
2136 (fEflIn & ~X86_EFL_ZF), uExpect, uExpect, EFlagsDiff(fEfl, fEflIn & ~X86_EFL_ZF));
2137 RTTEST_CHECK(g_hTest, uB.u == uNewValue);
2138 }
2139 }
2140}
2141
2142static void CmpXchg16bTest(void)
2143{
2144 typedef IEM_DECL_IMPL_TYPE(void, FNIEMAIMPLCMPXCHG16B,(PRTUINT128U, PRTUINT128U, PRTUINT128U, uint32_t *));
2145 static struct
2146 {
2147 const char *pszName;
2148 FNIEMAIMPLCMPXCHG16B *pfn;
2149 } const s_aFuncs[] =
2150 {
2151 { "cmpxchg16b", iemAImpl_cmpxchg16b },
2152 { "cmpxchg16b_locked", iemAImpl_cmpxchg16b_locked },
2153#if !defined(RT_ARCH_ARM64)
2154 { "cmpxchg16b_fallback", iemAImpl_cmpxchg16b_fallback },
2155#endif
2156 };
2157 for (size_t iFn = 0; iFn < RT_ELEMENTS(s_aFuncs); iFn++)
2158 {
2159 if (!SubTestAndCheckIfEnabled(s_aFuncs[iFn].pszName))
2160 continue;
2161#if !defined(IEM_WITHOUT_ASSEMBLY) && defined(RT_ARCH_AMD64)
2162 if (!(ASMCpuId_ECX(1) & X86_CPUID_FEATURE_ECX_CX16))
2163 {
2164 RTTestSkipped(g_hTest, "no hardware cmpxchg16b");
2165 continue;
2166 }
2167#endif
2168 for (uint32_t iTest = 0; iTest < 4; iTest += 2)
2169 {
2170 RTUINT128U const uOldValue = RandU128();
2171 RTUINT128U const uNewValue = RandU128();
2172
2173 /* positive test. */
2174 RTUINT128U uA, uB;
2175 uB = uNewValue;
2176 uA = uOldValue;
2177 *g_pu128 = uOldValue;
2178 uint32_t fEflIn = RandEFlags();
2179 uint32_t fEfl = fEflIn;
2180 s_aFuncs[iFn].pfn(g_pu128, &uA, &uB, &fEfl);
2181 if ( fEfl != (fEflIn | X86_EFL_ZF)
2182 || g_pu128->s.Lo != uNewValue.s.Lo
2183 || g_pu128->s.Hi != uNewValue.s.Hi
2184 || uA.s.Lo != uOldValue.s.Lo
2185 || uA.s.Hi != uOldValue.s.Hi)
2186 RTTestFailed(g_hTest, "#%u: efl=%#08x dst=%#018RX64'%016RX64 cmp=%#018RX64'%016RX64 new=%#018RX64'%016RX64\n"
2187 " -> efl=%#08x dst=%#018RX64'%016RX64 old=%#018RX64'%016RX64,\n"
2188 " wanted %#08x, %#018RX64'%016RX64, %#018RX64'%016RX64%s\n",
2189 iTest, fEflIn, uOldValue.s.Hi, uOldValue.s.Lo, uOldValue.s.Hi, uOldValue.s.Lo, uNewValue.s.Hi, uNewValue.s.Lo,
2190 fEfl, g_pu128->s.Hi, g_pu128->s.Lo, uA.s.Hi, uA.s.Lo,
2191 (fEflIn | X86_EFL_ZF), uNewValue.s.Hi, uNewValue.s.Lo, uOldValue.s.Hi, uOldValue.s.Lo,
2192 EFlagsDiff(fEfl, fEflIn | X86_EFL_ZF));
2193 RTTEST_CHECK(g_hTest, uB.s.Lo == uNewValue.s.Lo && uB.s.Hi == uNewValue.s.Hi);
2194
2195 /* negative */
2196 RTUINT128U const uExpect = RTUINT128_INIT(~uOldValue.s.Hi, ~uOldValue.s.Lo);
2197 *g_pu128 = uExpect;
2198 uA = uOldValue;
2199 uB = uNewValue;
2200 fEfl = fEflIn = RandEFlags();
2201 s_aFuncs[iFn].pfn(g_pu128, &uA, &uB, &fEfl);
2202 if ( fEfl != (fEflIn & ~X86_EFL_ZF)
2203 || g_pu128->s.Lo != uExpect.s.Lo
2204 || g_pu128->s.Hi != uExpect.s.Hi
2205 || uA.s.Lo != uExpect.s.Lo
2206 || uA.s.Hi != uExpect.s.Hi)
2207 RTTestFailed(g_hTest, "#%u: efl=%#08x dst=%#018RX64'%016RX64 cmp=%#018RX64'%016RX64 new=%#018RX64'%016RX64\n"
2208 " -> efl=%#08x dst=%#018RX64'%016RX64 old=%#018RX64'%016RX64,\n"
2209 " wanted %#08x, %#018RX64'%016RX64, %#018RX64'%016RX64%s\n",
2210 iTest + 1, fEflIn, uExpect.s.Hi, uExpect.s.Lo, uOldValue.s.Hi, uOldValue.s.Lo, uNewValue.s.Hi, uNewValue.s.Lo,
2211 fEfl, g_pu128->s.Hi, g_pu128->s.Lo, uA.s.Hi, uA.s.Lo,
2212 (fEflIn & ~X86_EFL_ZF), uExpect.s.Hi, uExpect.s.Lo, uExpect.s.Hi, uExpect.s.Lo,
2213 EFlagsDiff(fEfl, fEflIn & ~X86_EFL_ZF));
2214 RTTEST_CHECK(g_hTest, uB.s.Lo == uNewValue.s.Lo && uB.s.Hi == uNewValue.s.Hi);
2215 }
2216 }
2217}
2218
2219
2220/*
2221 * Double shifts.
2222 *
2223 * Note! We use BINUxx_TEST_T with the shift value in the uMisc field.
2224 */
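/*
 * Rough illustration of the call shape exercised below (arbitrary values;
 * the AMD and Intel flavours differ only in how some flags come out):
 * driving one of the shld_u16 workers from the table below as
 *
 *      uint32_t fEfl = 0;
 *      uint16_t uDst = 0x1234;
 *      pfn(&uDst, 0xabcd, 4, &fEfl);   // 4 = shift count (the uMisc column)
 *
 * leaves uDst == 0x234a: four bits go out at the top and the gap is refilled
 * from the high bits of the source operand.
 */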
2225#ifdef TSTIEMAIMPL_WITH_GENERATOR
2226# define GEN_SHIFT_DBL(a_cBits, a_Fmt, a_TestType, a_aSubTests) \
2227static RTEXITCODE ShiftDblU ## a_cBits ## Generate(uint32_t cTests, const char * const * papszNameFmts) \
2228{ \
2229 for (size_t iFn = 0; iFn < RT_ELEMENTS(a_aSubTests); iFn++) \
2230 { \
2231 if ( a_aSubTests[iFn].idxCpuEflFlavour != IEMTARGETCPU_EFL_BEHAVIOR_NATIVE \
2232 && a_aSubTests[iFn].idxCpuEflFlavour != g_idxCpuEflFlavour) \
2233 continue; \
2234 IEMBINARYOUTPUT BinOut; \
2235 AssertReturn(GenerateBinaryOpen(&BinOut, papszNameFmts[a_aSubTests[iFn].idxCpuEflFlavour], a_aSubTests[iFn].pszName), \
2236 RTEXITCODE_FAILURE); \
2237 for (uint32_t iTest = 0; iTest < cTests; iTest++ ) \
2238 { \
2239 a_TestType Test; \
2240 Test.fEflIn = RandEFlags(); \
2241 Test.fEflOut = Test.fEflIn; \
2242 Test.uDstIn = RandU ## a_cBits ## Dst(iTest); \
2243 Test.uDstOut = Test.uDstIn; \
2244 Test.uSrcIn = RandU ## a_cBits ## Src(iTest); \
2245 Test.uMisc = RandU8() & (a_cBits * 4 - 1); /* need to go way beyond the a_cBits limit */ \
2246 a_aSubTests[iFn].pfnNative(&Test.uDstOut, Test.uSrcIn, Test.uMisc, &Test.fEflOut); \
2247 GenerateBinaryWrite(&BinOut, &Test, sizeof(Test)); \
2248 } \
2249 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE); \
2250 } \
2251 return RTEXITCODE_SUCCESS; \
2252} \
2253static RTEXITCODE ShiftDblU ## a_cBits ## DumpAll(const char * const * papszNameFmts) \
2254{ \
2255 for (size_t iFn = 0; iFn < RT_ELEMENTS(a_aSubTests); iFn++) \
2256 { \
2257 AssertReturn(DECOMPRESS_TESTS(a_aSubTests[iFn]), RTEXITCODE_FAILURE); \
2258 IEMBINARYOUTPUT BinOut; \
2259 AssertReturn(GenerateBinaryOpen(&BinOut, papszNameFmts[a_aSubTests[iFn].idxCpuEflFlavour], a_aSubTests[iFn].pszName), \
2260 RTEXITCODE_FAILURE); \
2261 size_t cbTests = a_aSubTests[iFn].pcTests[0]; \
2262 if (!a_aSubTests[iFn].fBinary) \
2263 cbTests *= sizeof(a_aSubTests[iFn].paTests[0]); \
2264 GenerateBinaryWrite(&BinOut, a_aSubTests[iFn].paTests, cbTests); \
2265 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE); \
2266 } \
2267 return RTEXITCODE_SUCCESS; \
2268}
2269
2270#else
2271# define GEN_SHIFT_DBL(a_cBits, a_Fmt, a_TestType, a_aSubTests)
2272#endif
2273
2274#define TEST_SHIFT_DBL(a_cBits, a_Type, a_Fmt, a_TestType, a_SubTestType, a_aSubTests) \
2275TYPEDEF_SUBTEST_TYPE(a_SubTestType, a_TestType, PFNIEMAIMPLSHIFTDBLU ## a_cBits); \
2276\
2277static a_SubTestType a_aSubTests[] = \
2278{ \
2279 ENTRY_AMD(shld_u ## a_cBits, X86_EFL_OF | X86_EFL_CF), \
2280 ENTRY_INTEL(shld_u ## a_cBits, X86_EFL_OF | X86_EFL_CF), \
2281 ENTRY_AMD(shrd_u ## a_cBits, X86_EFL_OF | X86_EFL_CF), \
2282 ENTRY_INTEL(shrd_u ## a_cBits, X86_EFL_OF | X86_EFL_CF), \
2283}; \
2284\
2285GEN_SHIFT_DBL(a_cBits, a_Fmt, a_TestType, a_aSubTests) \
2286\
2287static void ShiftDblU ## a_cBits ## Test(void) \
2288{ \
2289 for (size_t iFn = 0; iFn < RT_ELEMENTS(a_aSubTests); iFn++) \
2290 { \
2291 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(a_aSubTests[iFn])) \
2292 continue; \
2293 a_TestType const * const paTests = a_aSubTests[iFn].paTests; \
2294 PFNIEMAIMPLSHIFTDBLU ## a_cBits pfn = a_aSubTests[iFn].pfn; \
2295 uint32_t const cTests = *a_aSubTests[iFn].pcTests; \
2296 uint32_t const cVars = COUNT_VARIATIONS(a_aSubTests[iFn]); \
2297 if (!cTests) RTTestSkipped(g_hTest, "no tests"); \
2298 for (uint32_t iVar = 0; iVar < cVars; iVar++) \
2299 { \
2300 for (uint32_t iTest = 0; iTest < cTests; iTest++ ) \
2301 { \
2302 uint32_t fEfl = paTests[iTest].fEflIn; \
2303 a_Type uDst = paTests[iTest].uDstIn; \
2304 pfn(&uDst, paTests[iTest].uSrcIn, paTests[iTest].uMisc, &fEfl); \
2305 if ( uDst != paTests[iTest].uDstOut \
2306 || fEfl != paTests[iTest].fEflOut) \
2307 RTTestFailed(g_hTest, "#%03u%s: efl=%#08x dst=" a_Fmt " src=" a_Fmt " shift=%-2u -> efl=%#08x dst=" a_Fmt ", expected %#08x & " a_Fmt "%s%s\n", \
2308 iTest, iVar == 0 ? "" : "/n", paTests[iTest].fEflIn, \
2309 paTests[iTest].uDstIn, paTests[iTest].uSrcIn, (unsigned)paTests[iTest].uMisc, \
2310 fEfl, uDst, paTests[iTest].fEflOut, paTests[iTest].uDstOut, \
2311 EFlagsDiff(fEfl, paTests[iTest].fEflOut), uDst == paTests[iTest].uDstOut ? "" : " dst!"); \
2312 else \
2313 { \
2314 *g_pu ## a_cBits = paTests[iTest].uDstIn; \
2315 *g_pfEfl = paTests[iTest].fEflIn; \
2316 pfn(g_pu ## a_cBits, paTests[iTest].uSrcIn, paTests[iTest].uMisc, g_pfEfl); \
2317 RTTEST_CHECK(g_hTest, *g_pu ## a_cBits == paTests[iTest].uDstOut); \
2318 RTTEST_CHECK(g_hTest, *g_pfEfl == paTests[iTest].fEflOut); \
2319 } \
2320 } \
2321 pfn = a_aSubTests[iFn].pfnNative; \
2322 } \
2323 } \
2324}
2325TEST_SHIFT_DBL(16, uint16_t, "%#06RX16", BINU16_TEST_T, SHIFT_DBL_U16_T, g_aShiftDblU16)
2326TEST_SHIFT_DBL(32, uint32_t, "%#010RX32", BINU32_TEST_T, SHIFT_DBL_U32_T, g_aShiftDblU32)
2327TEST_SHIFT_DBL(64, uint64_t, "%#018RX64", BINU64_TEST_T, SHIFT_DBL_U64_T, g_aShiftDblU64)
2328
2329#ifdef TSTIEMAIMPL_WITH_GENERATOR
2330static RTEXITCODE ShiftDblGenerate(uint32_t cTests, const char * const * papszNameFmts)
2331{
2332 RTEXITCODE rcExit = ShiftDblU16Generate(cTests, papszNameFmts);
2333 if (rcExit == RTEXITCODE_SUCCESS)
2334 rcExit = ShiftDblU32Generate(cTests, papszNameFmts);
2335 if (rcExit == RTEXITCODE_SUCCESS)
2336 rcExit = ShiftDblU64Generate(cTests, papszNameFmts);
2337 return rcExit;
2338}
2339
2340static RTEXITCODE ShiftDblDumpAll(const char * const * papszNameFmts)
2341{
2342 RTEXITCODE rcExit = ShiftDblU16DumpAll(papszNameFmts);
2343 if (rcExit == RTEXITCODE_SUCCESS)
2344 rcExit = ShiftDblU32DumpAll(papszNameFmts);
2345 if (rcExit == RTEXITCODE_SUCCESS)
2346 rcExit = ShiftDblU64DumpAll(papszNameFmts);
2347 return rcExit;
2348}
2349#endif
2350
2351static void ShiftDblTest(void)
2352{
2353 ShiftDblU16Test();
2354 ShiftDblU32Test();
2355 ShiftDblU64Test();
2356}
2357
2358
2359/*
2360 * Unary operators.
2361 *
2362 * Note! We use BINUxx_TEST_T, ignoring uSrcIn and uMisc.
2363 */
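/*
 * The unary workers only take the destination and the eflags, e.g. (values
 * chosen purely for illustration):
 *
 *      uint32_t fEfl = 0;
 *      uint8_t  uDst = 0xff;
 *      iemAImpl_neg_u8(&uDst, &fEfl);  // uDst == 0x01, CF set (operand was non-zero)
 */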
2364#ifdef TSTIEMAIMPL_WITH_GENERATOR
2365# define GEN_UNARY(a_cBits, a_Type, a_Fmt, a_TestType, a_SubTestType) \
2366static RTEXITCODE UnaryU ## a_cBits ## Generate(uint32_t cTests, const char * const * papszNameFmts) \
2367{ \
2368 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aUnaryU ## a_cBits); iFn++) \
2369 { \
2370 IEMBINARYOUTPUT BinOut; \
2371 AssertReturn(GenerateBinaryOpen(&BinOut, papszNameFmts[0], g_aUnaryU ## a_cBits[iFn].pszName), RTEXITCODE_FAILURE); \
2372 for (uint32_t iTest = 0; iTest < cTests; iTest++ ) \
2373 { \
2374 a_TestType Test; \
2375 Test.fEflIn = RandEFlags(); \
2376 Test.fEflOut = Test.fEflIn; \
2377 Test.uDstIn = RandU ## a_cBits(); \
2378 Test.uDstOut = Test.uDstIn; \
2379 Test.uSrcIn = 0; \
2380 Test.uMisc = 0; \
2381 g_aUnaryU ## a_cBits[iFn].pfn(&Test.uDstOut, &Test.fEflOut); \
2382 GenerateBinaryWrite(&BinOut, &Test, sizeof(Test)); \
2383 } \
2384 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE); \
2385 } \
2386 return RTEXITCODE_SUCCESS; \
2387} \
2388static RTEXITCODE UnaryU ## a_cBits ## DumpAll(const char * const * papszNameFmts) \
2389{ \
2390 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aUnaryU ## a_cBits); iFn++) \
2391 { \
2392 AssertReturn(DECOMPRESS_TESTS(g_aUnaryU ## a_cBits[iFn]), RTEXITCODE_FAILURE); \
2393 IEMBINARYOUTPUT BinOut; \
2394 AssertReturn(GenerateBinaryOpen(&BinOut, papszNameFmts[0], g_aUnaryU ## a_cBits[iFn].pszName), RTEXITCODE_FAILURE); \
2395 uint32_t cbTests = g_aUnaryU ## a_cBits[iFn].pcTests[0]; \
2396 if (!g_aUnaryU ## a_cBits[iFn].fBinary) \
2397 cbTests *= sizeof(g_aUnaryU ## a_cBits[iFn].paTests[0]); \
2398 GenerateBinaryWrite(&BinOut, g_aUnaryU ## a_cBits[iFn].paTests, cbTests); \
2399 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE); \
2400 } \
2401 return RTEXITCODE_SUCCESS; \
2402}
2403#else
2404# define GEN_UNARY(a_cBits, a_Type, a_Fmt, a_TestType, a_SubTestType)
2405#endif
2406
2407#define TEST_UNARY(a_cBits, a_Type, a_Fmt, a_TestType, a_SubTestType) \
2408TYPEDEF_SUBTEST_TYPE(a_SubTestType, a_TestType, PFNIEMAIMPLUNARYU ## a_cBits); \
2409static a_SubTestType g_aUnaryU ## a_cBits [] = \
2410{ \
2411 ENTRY(inc_u ## a_cBits), \
2412 ENTRY(inc_u ## a_cBits ## _locked), \
2413 ENTRY(dec_u ## a_cBits), \
2414 ENTRY(dec_u ## a_cBits ## _locked), \
2415 ENTRY(not_u ## a_cBits), \
2416 ENTRY(not_u ## a_cBits ## _locked), \
2417 ENTRY(neg_u ## a_cBits), \
2418 ENTRY(neg_u ## a_cBits ## _locked), \
2419}; \
2420\
2421GEN_UNARY(a_cBits, a_Type, a_Fmt, a_TestType, a_SubTestType) \
2422\
2423static void UnaryU ## a_cBits ## Test(void) \
2424{ \
2425 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aUnaryU ## a_cBits); iFn++) \
2426 { \
2427 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aUnaryU ## a_cBits[iFn])) \
2428 continue; \
2429 a_TestType const * const paTests = g_aUnaryU ## a_cBits[iFn].paTests; \
2430 uint32_t const cTests = *g_aUnaryU ## a_cBits[iFn].pcTests; \
2431 if (!cTests) RTTestSkipped(g_hTest, "no tests"); \
2432 for (uint32_t iTest = 0; iTest < cTests; iTest++ ) \
2433 { \
2434 uint32_t fEfl = paTests[iTest].fEflIn; \
2435 a_Type uDst = paTests[iTest].uDstIn; \
2436 g_aUnaryU ## a_cBits[iFn].pfn(&uDst, &fEfl); \
2437 if ( uDst != paTests[iTest].uDstOut \
2438 || fEfl != paTests[iTest].fEflOut) \
2439 RTTestFailed(g_hTest, "#%u: efl=%#08x dst=" a_Fmt " -> efl=%#08x dst=" a_Fmt ", expected %#08x & " a_Fmt "%s\n", \
2440 iTest, paTests[iTest].fEflIn, paTests[iTest].uDstIn, \
2441 fEfl, uDst, paTests[iTest].fEflOut, paTests[iTest].uDstOut, \
2442 EFlagsDiff(fEfl, paTests[iTest].fEflOut)); \
2443 else \
2444 { \
2445 *g_pu ## a_cBits = paTests[iTest].uDstIn; \
2446 *g_pfEfl = paTests[iTest].fEflIn; \
2447 g_aUnaryU ## a_cBits[iFn].pfn(g_pu ## a_cBits, g_pfEfl); \
2448 RTTEST_CHECK(g_hTest, *g_pu ## a_cBits == paTests[iTest].uDstOut); \
2449 RTTEST_CHECK(g_hTest, *g_pfEfl == paTests[iTest].fEflOut); \
2450 } \
2451 } \
2452 } \
2453}
2454TEST_UNARY(8, uint8_t, "%#04RX8", BINU8_TEST_T, INT_UNARY_U8_T)
2455TEST_UNARY(16, uint16_t, "%#06RX16", BINU16_TEST_T, INT_UNARY_U16_T)
2456TEST_UNARY(32, uint32_t, "%#010RX32", BINU32_TEST_T, INT_UNARY_U32_T)
2457TEST_UNARY(64, uint64_t, "%#018RX64", BINU64_TEST_T, INT_UNARY_U64_T)
2458
2459#ifdef TSTIEMAIMPL_WITH_GENERATOR
2460static RTEXITCODE UnaryGenerate(uint32_t cTests, const char * const * papszNameFmts)
2461{
2462 RTEXITCODE rcExit = UnaryU8Generate(cTests, papszNameFmts);
2463 if (rcExit == RTEXITCODE_SUCCESS)
2464 rcExit = UnaryU16Generate(cTests, papszNameFmts);
2465 if (rcExit == RTEXITCODE_SUCCESS)
2466 rcExit = UnaryU32Generate(cTests, papszNameFmts);
2467 if (rcExit == RTEXITCODE_SUCCESS)
2468 rcExit = UnaryU64Generate(cTests, papszNameFmts);
2469 return rcExit;
2470}
2471
2472static RTEXITCODE UnaryDumpAll(const char * const * papszNameFmts)
2473{
2474 RTEXITCODE rcExit = UnaryU8DumpAll(papszNameFmts);
2475 if (rcExit == RTEXITCODE_SUCCESS)
2476 rcExit = UnaryU16DumpAll(papszNameFmts);
2477 if (rcExit == RTEXITCODE_SUCCESS)
2478 rcExit = UnaryU32DumpAll(papszNameFmts);
2479 if (rcExit == RTEXITCODE_SUCCESS)
2480 rcExit = UnaryU64DumpAll(papszNameFmts);
2481 return rcExit;
2482}
2483#endif
2484
2485static void UnaryTest(void)
2486{
2487 UnaryU8Test();
2488 UnaryU16Test();
2489 UnaryU32Test();
2490 UnaryU64Test();
2491}
2492
2493
2494/*
2495 * Shifts.
2496 *
2497 * Note! We use BINUxx_TEST_T with the shift count in uMisc and uSrcIn unused.
2498 */
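/*
 * The shift and rotate workers take the count the same way (uMisc), e.g.
 * driving the rol_u8 entry from the table below with arbitrary values:
 *
 *      uint32_t fEfl = 0;
 *      uint8_t  uDst = 0x81;
 *      pfn(&uDst, 1, &fEfl);           // uDst == 0x03, CF set (top bit rotated out)
 *
 * The generator deliberately masks the random count with (a_cBits * 4 - 1)
 * so that counts beyond the operand width are covered as well.
 */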
2499#ifdef TSTIEMAIMPL_WITH_GENERATOR
2500# define GEN_SHIFT(a_cBits, a_Fmt, a_TestType, a_aSubTests) \
2501static RTEXITCODE ShiftU ## a_cBits ## Generate(uint32_t cTests, const char * const * papszNameFmts) \
2502{ \
2503 for (size_t iFn = 0; iFn < RT_ELEMENTS(a_aSubTests); iFn++) \
2504 { \
2505 if ( a_aSubTests[iFn].idxCpuEflFlavour != IEMTARGETCPU_EFL_BEHAVIOR_NATIVE \
2506 && a_aSubTests[iFn].idxCpuEflFlavour != g_idxCpuEflFlavour) \
2507 continue; \
2508 IEMBINARYOUTPUT BinOut; \
2509 AssertReturn(GenerateBinaryOpen(&BinOut, papszNameFmts[a_aSubTests[iFn].idxCpuEflFlavour], a_aSubTests[iFn].pszName), \
2510 RTEXITCODE_FAILURE); \
2511 for (uint32_t iTest = 0; iTest < cTests; iTest++ ) \
2512 { \
2513 a_TestType Test; \
2514 Test.fEflIn = RandEFlags(); \
2515 Test.fEflOut = Test.fEflIn; \
2516 Test.uDstIn = RandU ## a_cBits ## Dst(iTest); \
2517 Test.uDstOut = Test.uDstIn; \
2518 Test.uSrcIn = 0; \
2519 Test.uMisc = RandU8() & (a_cBits * 4 - 1); /* need to go way beyond the a_cBits limit */ \
2520 a_aSubTests[iFn].pfnNative(&Test.uDstOut, Test.uMisc, &Test.fEflOut); \
2521 GenerateBinaryWrite(&BinOut, &Test, sizeof(Test)); \
2522 \
2523 Test.fEflIn = (~Test.fEflIn & X86_EFL_LIVE_MASK) | X86_EFL_RA1_MASK; \
2524 Test.fEflOut = Test.fEflIn; \
2525 Test.uDstOut = Test.uDstIn; \
2526 a_aSubTests[iFn].pfnNative(&Test.uDstOut, Test.uMisc, &Test.fEflOut); \
2527 GenerateBinaryWrite(&BinOut, &Test, sizeof(Test)); \
2528 } \
2529 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE); \
2530 } \
2531 return RTEXITCODE_SUCCESS; \
2532} \
2533static RTEXITCODE ShiftU ## a_cBits ## DumpAll(const char * const * papszNameFmts) \
2534{ \
2535 for (size_t iFn = 0; iFn < RT_ELEMENTS(a_aSubTests); iFn++) \
2536 { \
2537 AssertReturn(DECOMPRESS_TESTS(a_aSubTests[iFn]), RTEXITCODE_FAILURE); \
2538 IEMBINARYOUTPUT BinOut; \
2539 AssertReturn(GenerateBinaryOpen(&BinOut, papszNameFmts[a_aSubTests[iFn].idxCpuEflFlavour], a_aSubTests[iFn].pszName), \
2540 RTEXITCODE_FAILURE); \
2541 uint32_t cbTests = a_aSubTests[iFn].pcTests[0]; \
2542 if (!a_aSubTests[iFn].fBinary) \
2543 cbTests *= sizeof(a_aSubTests[iFn].paTests[0]); \
2544 GenerateBinaryWrite(&BinOut, a_aSubTests[iFn].paTests, cbTests); \
2545 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE); \
2546 } \
2547 return RTEXITCODE_SUCCESS; \
2548}
2549#else
2550# define GEN_SHIFT(a_cBits, a_Fmt, a_TestType, a_aSubTests)
2551#endif
2552
2553#define TEST_SHIFT(a_cBits, a_Type, a_Fmt, a_TestType, a_SubTestType, a_aSubTests) \
2554TYPEDEF_SUBTEST_TYPE(a_SubTestType, a_TestType, PFNIEMAIMPLSHIFTU ## a_cBits); \
2555static a_SubTestType a_aSubTests[] = \
2556{ \
2557 ENTRY_AMD( rol_u ## a_cBits, X86_EFL_OF), \
2558 ENTRY_INTEL(rol_u ## a_cBits, X86_EFL_OF), \
2559 ENTRY_AMD( ror_u ## a_cBits, X86_EFL_OF), \
2560 ENTRY_INTEL(ror_u ## a_cBits, X86_EFL_OF), \
2561 ENTRY_AMD( rcl_u ## a_cBits, X86_EFL_OF), \
2562 ENTRY_INTEL(rcl_u ## a_cBits, X86_EFL_OF), \
2563 ENTRY_AMD( rcr_u ## a_cBits, X86_EFL_OF), \
2564 ENTRY_INTEL(rcr_u ## a_cBits, X86_EFL_OF), \
2565 ENTRY_AMD( shl_u ## a_cBits, X86_EFL_OF | X86_EFL_AF), \
2566 ENTRY_INTEL(shl_u ## a_cBits, X86_EFL_OF | X86_EFL_AF), \
2567 ENTRY_AMD( shr_u ## a_cBits, X86_EFL_OF | X86_EFL_AF), \
2568 ENTRY_INTEL(shr_u ## a_cBits, X86_EFL_OF | X86_EFL_AF), \
2569 ENTRY_AMD( sar_u ## a_cBits, X86_EFL_OF | X86_EFL_AF), \
2570 ENTRY_INTEL(sar_u ## a_cBits, X86_EFL_OF | X86_EFL_AF), \
2571}; \
2572\
2573GEN_SHIFT(a_cBits, a_Fmt, a_TestType, a_aSubTests) \
2574\
2575static void ShiftU ## a_cBits ## Test(void) \
2576{ \
2577 for (size_t iFn = 0; iFn < RT_ELEMENTS(a_aSubTests); iFn++) \
2578 { \
2579 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(a_aSubTests[iFn])) \
2580 continue; \
2581 PFNIEMAIMPLSHIFTU ## a_cBits pfn = a_aSubTests[iFn].pfn; \
2582 a_TestType const * const paTests = a_aSubTests[iFn].paTests; \
2583 uint32_t const cTests = *a_aSubTests[iFn].pcTests; \
2584 uint32_t const cVars = COUNT_VARIATIONS(a_aSubTests[iFn]); \
2585 if (!cTests) RTTestSkipped(g_hTest, "no tests"); \
2586 for (uint32_t iVar = 0; iVar < cVars; iVar++) \
2587 { \
2588 for (uint32_t iTest = 0; iTest < cTests; iTest++ ) \
2589 { \
2590 uint32_t fEfl = paTests[iTest].fEflIn; \
2591 a_Type uDst = paTests[iTest].uDstIn; \
2592 pfn(&uDst, paTests[iTest].uMisc, &fEfl); \
2593 if ( uDst != paTests[iTest].uDstOut \
2594 || fEfl != paTests[iTest].fEflOut ) \
2595 RTTestFailed(g_hTest, "#%u%s: efl=%#08x dst=" a_Fmt " shift=%2u -> efl=%#08x dst=" a_Fmt ", expected %#08x & " a_Fmt "%s\n", \
2596 iTest, iVar == 0 ? "" : "/n", \
2597 paTests[iTest].fEflIn, paTests[iTest].uDstIn, paTests[iTest].uMisc, \
2598 fEfl, uDst, paTests[iTest].fEflOut, paTests[iTest].uDstOut, \
2599 EFlagsDiff(fEfl, paTests[iTest].fEflOut)); \
2600 else \
2601 { \
2602 *g_pu ## a_cBits = paTests[iTest].uDstIn; \
2603 *g_pfEfl = paTests[iTest].fEflIn; \
2604 pfn(g_pu ## a_cBits, paTests[iTest].uMisc, g_pfEfl); \
2605 RTTEST_CHECK(g_hTest, *g_pu ## a_cBits == paTests[iTest].uDstOut); \
2606 RTTEST_CHECK(g_hTest, *g_pfEfl == paTests[iTest].fEflOut); \
2607 } \
2608 } \
2609 pfn = a_aSubTests[iFn].pfnNative; \
2610 } \
2611 } \
2612}
2613TEST_SHIFT(8, uint8_t, "%#04RX8", BINU8_TEST_T, INT_BINARY_U8_T, g_aShiftU8)
2614TEST_SHIFT(16, uint16_t, "%#06RX16", BINU16_TEST_T, INT_BINARY_U16_T, g_aShiftU16)
2615TEST_SHIFT(32, uint32_t, "%#010RX32", BINU32_TEST_T, INT_BINARY_U32_T, g_aShiftU32)
2616TEST_SHIFT(64, uint64_t, "%#018RX64", BINU64_TEST_T, INT_BINARY_U64_T, g_aShiftU64)
2617
2618#ifdef TSTIEMAIMPL_WITH_GENERATOR
2619static RTEXITCODE ShiftGenerate(uint32_t cTests, const char * const * papszNameFmts)
2620{
2621 RTEXITCODE rcExit = ShiftU8Generate(cTests, papszNameFmts);
2622 if (rcExit == RTEXITCODE_SUCCESS)
2623 rcExit = ShiftU16Generate(cTests, papszNameFmts);
2624 if (rcExit == RTEXITCODE_SUCCESS)
2625 rcExit = ShiftU32Generate(cTests, papszNameFmts);
2626 if (rcExit == RTEXITCODE_SUCCESS)
2627 rcExit = ShiftU64Generate(cTests, papszNameFmts);
2628 return rcExit;
2629}
2630
2631static RTEXITCODE ShiftDumpAll(const char * const * papszNameFmts)
2632{
2633 RTEXITCODE rcExit = ShiftU8DumpAll(papszNameFmts);
2634 if (rcExit == RTEXITCODE_SUCCESS)
2635 rcExit = ShiftU16DumpAll(papszNameFmts);
2636 if (rcExit == RTEXITCODE_SUCCESS)
2637 rcExit = ShiftU32DumpAll(papszNameFmts);
2638 if (rcExit == RTEXITCODE_SUCCESS)
2639 rcExit = ShiftU64DumpAll(papszNameFmts);
2640 return rcExit;
2641}
2642#endif
2643
2644static void ShiftTest(void)
2645{
2646 ShiftU8Test();
2647 ShiftU16Test();
2648 ShiftU32Test();
2649 ShiftU64Test();
2650}
2651
2652
2653/*
2654 * Multiplication and division.
2655 *
2656 * Note! The 8-bit functions have a different format, so we need to duplicate things.
2657 * Note! Currently ignoring undefined bits.
2658 */
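
/*
 * Shape of the calls, for orientation: the 16/32/64-bit workers take two
 * destination pointers (the AX/DX style halves of the implicit register
 * pair; which pointer maps to which half is not spelled out here, so treat
 * the sketch below as shape only), the explicit operand and the eflags, and
 * return a status code that the tests compare against the recorded rc; that
 * status is how the divide workers can signal a #DE style failure.  The
 * 8-bit variants instead use a single 16-bit destination holding the whole
 * AX value, hence the duplicated code right below.
 *
 *      uint32_t fEfl = 0;
 *      uint16_t uDst1 = 100, uDst2 = 0;
 *      int rc = pfn(&uDst1, &uDst2, 7, &fEfl);
 */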
2659
2660/* U8 */
2661TYPEDEF_SUBTEST_TYPE(INT_MULDIV_U8_T, MULDIVU8_TEST_T, PFNIEMAIMPLMULDIVU8);
2662static INT_MULDIV_U8_T g_aMulDivU8[] =
2663{
2664 ENTRY_AMD_EX(mul_u8, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF,
2665 X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF),
2666 ENTRY_INTEL_EX(mul_u8, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, 0),
2667 ENTRY_AMD_EX(imul_u8, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF,
2668 X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF),
2669 ENTRY_INTEL_EX(imul_u8, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, 0),
2670 ENTRY_AMD_EX(div_u8, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF | X86_EFL_OF, 0),
2671 ENTRY_INTEL_EX(div_u8, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF | X86_EFL_OF, 0),
2672 ENTRY_AMD_EX(idiv_u8, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF | X86_EFL_OF, 0),
2673 ENTRY_INTEL_EX(idiv_u8, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF | X86_EFL_OF, 0),
2674};
2675
2676#ifdef TSTIEMAIMPL_WITH_GENERATOR
2677static RTEXITCODE MulDivU8Generate(uint32_t cTests, const char * const * papszNameFmts)
2678{
2679 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aMulDivU8); iFn++)
2680 {
2681 if ( g_aMulDivU8[iFn].idxCpuEflFlavour != IEMTARGETCPU_EFL_BEHAVIOR_NATIVE
2682 && g_aMulDivU8[iFn].idxCpuEflFlavour != g_idxCpuEflFlavour)
2683 continue;
2684        IEMBINARYOUTPUT BinOut;
2685 AssertReturn(GenerateBinaryOpen(&BinOut, papszNameFmts[g_aMulDivU8[iFn].idxCpuEflFlavour], g_aMulDivU8[iFn].pszName),
2686 RTEXITCODE_FAILURE);
2687 for (uint32_t iTest = 0; iTest < cTests; iTest++ )
2688 {
2689 MULDIVU8_TEST_T Test;
2690 Test.fEflIn = RandEFlags();
2691 Test.fEflOut = Test.fEflIn;
2692 Test.uDstIn = RandU16Dst(iTest);
2693 Test.uDstOut = Test.uDstIn;
2694 Test.uSrcIn = RandU8Src(iTest);
2695 Test.rc = g_aMulDivU8[iFn].pfnNative(&Test.uDstOut, Test.uSrcIn, &Test.fEflOut);
2696 GenerateBinaryWrite(&BinOut, &Test, sizeof(Test));
2697 }
2698 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE);
2699 }
2700 return RTEXITCODE_SUCCESS;
2701}
2702static RTEXITCODE MulDivU8DumpAll(const char * const * papszNameFmts)
2703{
2704 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aMulDivU8); iFn++)
2705 {
2706 AssertReturn(DECOMPRESS_TESTS(g_aMulDivU8[iFn]), RTEXITCODE_FAILURE);
2707 IEMBINARYOUTPUT BinOut;
2708 AssertReturn(GenerateBinaryOpen(&BinOut, papszNameFmts[g_aMulDivU8[iFn].idxCpuEflFlavour], g_aMulDivU8[iFn].pszName),
2709 RTEXITCODE_FAILURE);
2710 uint32_t cbTests = g_aMulDivU8[iFn].pcTests[0];
2711 if (!g_aMulDivU8[iFn].fBinary)
2712 cbTests *= sizeof(g_aMulDivU8[iFn].paTests[0]);
2713 GenerateBinaryWrite(&BinOut, g_aMulDivU8[iFn].paTests, cbTests);
2714 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE);
2715 }
2716 return RTEXITCODE_SUCCESS;
2717}
2718#endif
2719
2720static void MulDivU8Test(void)
2721{
2722 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aMulDivU8); iFn++)
2723 {
2724        if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aMulDivU8[iFn]))
2725            continue;
2726 MULDIVU8_TEST_T const * const paTests = g_aMulDivU8[iFn].paTests;
2727 uint32_t const cTests = *g_aMulDivU8[iFn].pcTests;
2728 uint32_t const fEflIgn = g_aMulDivU8[iFn].uExtra;
2729 PFNIEMAIMPLMULDIVU8 pfn = g_aMulDivU8[iFn].pfn;
2730        uint32_t const               cVars    = COUNT_VARIATIONS(g_aMulDivU8[iFn]);
2731 if (!cTests) RTTestSkipped(g_hTest, "no tests");
2732 for (uint32_t iVar = 0; iVar < cVars; iVar++)
2733 {
2734 for (uint32_t iTest = 0; iTest < cTests; iTest++ )
2735 {
2736 uint32_t fEfl = paTests[iTest].fEflIn;
2737 uint16_t uDst = paTests[iTest].uDstIn;
2738                int rc = pfn(&uDst, paTests[iTest].uSrcIn, &fEfl);
2739 if ( uDst != paTests[iTest].uDstOut
2740 || (fEfl | fEflIgn) != (paTests[iTest].fEflOut | fEflIgn)
2741 || rc != paTests[iTest].rc)
2742 RTTestFailed(g_hTest, "#%02u%s: efl=%#08x dst=%#06RX16 src=%#04RX8\n"
2743 " %s-> efl=%#08x dst=%#06RX16 rc=%d\n"
2744 "%sexpected %#08x %#06RX16 %d%s\n",
2745 iTest, iVar ? "/n" : "", paTests[iTest].fEflIn, paTests[iTest].uDstIn, paTests[iTest].uSrcIn,
2746 iVar ? " " : "", fEfl, uDst, rc,
2747 iVar ? " " : "", paTests[iTest].fEflOut, paTests[iTest].uDstOut, paTests[iTest].rc,
2748 EFlagsDiff(fEfl | fEflIgn, paTests[iTest].fEflOut | fEflIgn));
2749 else
2750 {
2751 *g_pu16 = paTests[iTest].uDstIn;
2752 *g_pfEfl = paTests[iTest].fEflIn;
2753                    rc = pfn(g_pu16, paTests[iTest].uSrcIn, g_pfEfl);
2754 RTTEST_CHECK(g_hTest, *g_pu16 == paTests[iTest].uDstOut);
2755 RTTEST_CHECK(g_hTest, (*g_pfEfl | fEflIgn) == (paTests[iTest].fEflOut | fEflIgn));
2756 RTTEST_CHECK(g_hTest, rc == paTests[iTest].rc);
2757 }
2758 }
2759 pfn = g_aMulDivU8[iFn].pfnNative;
2760 }
2761 }
2762}
2763
2764#ifdef TSTIEMAIMPL_WITH_GENERATOR
2765# define GEN_MULDIV(a_cBits, a_Fmt, a_TestType, a_aSubTests) \
2766static RTEXITCODE MulDivU ## a_cBits ## Generate(uint32_t cTests, const char * const * papszNameFmts) \
2767{ \
2768 for (size_t iFn = 0; iFn < RT_ELEMENTS(a_aSubTests); iFn++) \
2769 { \
2770 if ( a_aSubTests[iFn].idxCpuEflFlavour != IEMTARGETCPU_EFL_BEHAVIOR_NATIVE \
2771 && a_aSubTests[iFn].idxCpuEflFlavour != g_idxCpuEflFlavour) \
2772 continue; \
2773 IEMBINARYOUTPUT BinOut; \
2774 AssertReturn(GenerateBinaryOpen(&BinOut, papszNameFmts[a_aSubTests[iFn].idxCpuEflFlavour], a_aSubTests[iFn].pszName), \
2775 RTEXITCODE_FAILURE); \
2776 for (uint32_t iTest = 0; iTest < cTests; iTest++ ) \
2777 { \
2778 a_TestType Test; \
2779 Test.fEflIn = RandEFlags(); \
2780 Test.fEflOut = Test.fEflIn; \
2781 Test.uDst1In = RandU ## a_cBits ## Dst(iTest); \
2782 Test.uDst1Out = Test.uDst1In; \
2783 Test.uDst2In = RandU ## a_cBits ## Dst(iTest); \
2784 Test.uDst2Out = Test.uDst2In; \
2785 Test.uSrcIn = RandU ## a_cBits ## Src(iTest); \
2786 Test.rc = a_aSubTests[iFn].pfnNative(&Test.uDst1Out, &Test.uDst2Out, Test.uSrcIn, &Test.fEflOut); \
2787 GenerateBinaryWrite(&BinOut, &Test, sizeof(Test)); \
2788 } \
2789 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE); \
2790 } \
2791 return RTEXITCODE_SUCCESS; \
2792} \
2793static RTEXITCODE MulDivU ## a_cBits ## DumpAll(const char * const * papszNameFmts) \
2794{ \
2795 for (size_t iFn = 0; iFn < RT_ELEMENTS(a_aSubTests); iFn++) \
2796 { \
2797 AssertReturn(DECOMPRESS_TESTS(a_aSubTests[iFn]), RTEXITCODE_FAILURE); \
2798 IEMBINARYOUTPUT BinOut; \
2799 AssertReturn(GenerateBinaryOpen(&BinOut, papszNameFmts[a_aSubTests[iFn].idxCpuEflFlavour], a_aSubTests[iFn].pszName), \
2800 RTEXITCODE_FAILURE); \
2801 uint32_t cbTests = a_aSubTests[iFn].pcTests[0]; \
2802 if (!a_aSubTests[iFn].fBinary) \
2803 cbTests *= sizeof(a_aSubTests[iFn].paTests[0]); \
2804 GenerateBinaryWrite(&BinOut, a_aSubTests[iFn].paTests, cbTests); \
2805 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE); \
2806 } \
2807 return RTEXITCODE_SUCCESS; \
2808}
2809#else
2810# define GEN_MULDIV(a_cBits, a_Fmt, a_TestType, a_aSubTests)
2811#endif
2812
2813#define TEST_MULDIV(a_cBits, a_Type, a_Fmt, a_TestType, a_SubTestType, a_aSubTests) \
2814TYPEDEF_SUBTEST_TYPE(a_SubTestType, a_TestType, PFNIEMAIMPLMULDIVU ## a_cBits); \
2815static a_SubTestType a_aSubTests [] = \
2816{ \
2817 ENTRY_AMD_EX(mul_u ## a_cBits, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, 0), \
2818 ENTRY_INTEL_EX(mul_u ## a_cBits, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, 0), \
2819 ENTRY_AMD_EX(imul_u ## a_cBits, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, 0), \
2820 ENTRY_INTEL_EX(imul_u ## a_cBits, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, 0), \
2821 ENTRY_AMD_EX(div_u ## a_cBits, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF | X86_EFL_OF, 0), \
2822 ENTRY_INTEL_EX(div_u ## a_cBits, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF | X86_EFL_OF, 0), \
2823 ENTRY_AMD_EX(idiv_u ## a_cBits, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF | X86_EFL_OF, 0), \
2824 ENTRY_INTEL_EX(idiv_u ## a_cBits, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF | X86_EFL_OF, 0), \
2825}; \
2826\
2827GEN_MULDIV(a_cBits, a_Fmt, a_TestType, a_aSubTests) \
2828\
2829static void MulDivU ## a_cBits ## Test(void) \
2830{ \
2831 for (size_t iFn = 0; iFn < RT_ELEMENTS(a_aSubTests); iFn++) \
2832 { \
2833 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(a_aSubTests[iFn])) \
2834 continue; \
2835 a_TestType const * const paTests = a_aSubTests[iFn].paTests; \
2836 uint32_t const cTests = *a_aSubTests[iFn].pcTests; \
2837 uint32_t const fEflIgn = a_aSubTests[iFn].uExtra; \
2838 PFNIEMAIMPLMULDIVU ## a_cBits pfn = a_aSubTests[iFn].pfn; \
2839 uint32_t const cVars = COUNT_VARIATIONS(a_aSubTests[iFn]); \
2840 if (!cTests) RTTestSkipped(g_hTest, "no tests"); \
2841 for (uint32_t iVar = 0; iVar < cVars; iVar++) \
2842 { \
2843 for (uint32_t iTest = 0; iTest < cTests; iTest++ ) \
2844 { \
2845 uint32_t fEfl = paTests[iTest].fEflIn; \
2846 a_Type uDst1 = paTests[iTest].uDst1In; \
2847 a_Type uDst2 = paTests[iTest].uDst2In; \
2848 int rc = pfn(&uDst1, &uDst2, paTests[iTest].uSrcIn, &fEfl); \
2849 if ( uDst1 != paTests[iTest].uDst1Out \
2850 || uDst2 != paTests[iTest].uDst2Out \
2851 || (fEfl | fEflIgn) != (paTests[iTest].fEflOut | fEflIgn)\
2852 || rc != paTests[iTest].rc) \
2853 RTTestFailed(g_hTest, "#%02u%s: efl=%#08x dst1=" a_Fmt " dst2=" a_Fmt " src=" a_Fmt "\n" \
2854 " -> efl=%#08x dst1=" a_Fmt " dst2=" a_Fmt " rc=%d\n" \
2855 "expected %#08x " a_Fmt " " a_Fmt " %d%s -%s%s%s\n", \
2856 iTest, iVar == 0 ? "" : "/n", \
2857 paTests[iTest].fEflIn, paTests[iTest].uDst1In, paTests[iTest].uDst2In, paTests[iTest].uSrcIn, \
2858 fEfl, uDst1, uDst2, rc, \
2859 paTests[iTest].fEflOut, paTests[iTest].uDst1Out, paTests[iTest].uDst2Out, paTests[iTest].rc, \
2860 EFlagsDiff(fEfl | fEflIgn, paTests[iTest].fEflOut | fEflIgn), \
2861 uDst1 != paTests[iTest].uDst1Out ? " dst1" : "", uDst2 != paTests[iTest].uDst2Out ? " dst2" : "", \
2862 (fEfl | fEflIgn) != (paTests[iTest].fEflOut | fEflIgn) ? " eflags" : ""); \
2863 else \
2864 { \
2865 *g_pu ## a_cBits = paTests[iTest].uDst1In; \
2866 *g_pu ## a_cBits ## Two = paTests[iTest].uDst2In; \
2867 *g_pfEfl = paTests[iTest].fEflIn; \
2868 rc = pfn(g_pu ## a_cBits, g_pu ## a_cBits ## Two, paTests[iTest].uSrcIn, g_pfEfl); \
2869 RTTEST_CHECK(g_hTest, *g_pu ## a_cBits == paTests[iTest].uDst1Out); \
2870 RTTEST_CHECK(g_hTest, *g_pu ## a_cBits ## Two == paTests[iTest].uDst2Out); \
2871 RTTEST_CHECK(g_hTest, (*g_pfEfl | fEflIgn) == (paTests[iTest].fEflOut | fEflIgn)); \
2872 RTTEST_CHECK(g_hTest, rc == paTests[iTest].rc); \
2873 } \
2874 } \
2875 pfn = a_aSubTests[iFn].pfnNative; \
2876 } \
2877 } \
2878}
2879TEST_MULDIV(16, uint16_t, "%#06RX16", MULDIVU16_TEST_T, INT_MULDIV_U16_T, g_aMulDivU16)
2880TEST_MULDIV(32, uint32_t, "%#010RX32", MULDIVU32_TEST_T, INT_MULDIV_U32_T, g_aMulDivU32)
2881TEST_MULDIV(64, uint64_t, "%#018RX64", MULDIVU64_TEST_T, INT_MULDIV_U64_T, g_aMulDivU64)
2882
2883#ifdef TSTIEMAIMPL_WITH_GENERATOR
2884static RTEXITCODE MulDivGenerate(uint32_t cTests, const char * const * papszNameFmts)
2885{
2886 RTEXITCODE rcExit = MulDivU8Generate(cTests, papszNameFmts);
2887 if (rcExit == RTEXITCODE_SUCCESS)
2888 rcExit = MulDivU16Generate(cTests, papszNameFmts);
2889 if (rcExit == RTEXITCODE_SUCCESS)
2890 rcExit = MulDivU32Generate(cTests, papszNameFmts);
2891 if (rcExit == RTEXITCODE_SUCCESS)
2892 rcExit = MulDivU64Generate(cTests, papszNameFmts);
2893 return rcExit;
2894}
2895
2896static RTEXITCODE MulDivDumpAll(const char * const * papszNameFmts)
2897{
2898 RTEXITCODE rcExit = MulDivU8DumpAll(papszNameFmts);
2899 if (rcExit == RTEXITCODE_SUCCESS)
2900 rcExit = MulDivU16DumpAll(papszNameFmts);
2901 if (rcExit == RTEXITCODE_SUCCESS)
2902 rcExit = MulDivU32DumpAll(papszNameFmts);
2903 if (rcExit == RTEXITCODE_SUCCESS)
2904 rcExit = MulDivU64DumpAll(papszNameFmts);
2905 return rcExit;
2906}
2907#endif
2908
2909static void MulDivTest(void)
2910{
2911 MulDivU8Test();
2912 MulDivU16Test();
2913 MulDivU32Test();
2914 MulDivU64Test();
2915}
2916
2917
2918/*
2919 * BSWAP
2920 */
2921static void BswapTest(void)
2922{
2923 if (SubTestAndCheckIfEnabled("bswap_u16"))
2924 {
2925 *g_pu32 = UINT32_C(0x12345678);
2926 iemAImpl_bswap_u16(g_pu32);
2927#if 0
2928 RTTEST_CHECK_MSG(g_hTest, *g_pu32 == UINT32_C(0x12347856), (g_hTest, "*g_pu32=%#RX32\n", *g_pu32));
2929#else
2930 RTTEST_CHECK_MSG(g_hTest, *g_pu32 == UINT32_C(0x12340000), (g_hTest, "*g_pu32=%#RX32\n", *g_pu32));
2931#endif
2932 *g_pu32 = UINT32_C(0xffff1122);
2933 iemAImpl_bswap_u16(g_pu32);
2934#if 0
2935 RTTEST_CHECK_MSG(g_hTest, *g_pu32 == UINT32_C(0xffff2211), (g_hTest, "*g_pu32=%#RX32\n", *g_pu32));
2936#else
2937 RTTEST_CHECK_MSG(g_hTest, *g_pu32 == UINT32_C(0xffff0000), (g_hTest, "*g_pu32=%#RX32\n", *g_pu32));
2938#endif
2939 }
2940
2941 if (SubTestAndCheckIfEnabled("bswap_u32"))
2942 {
2943 *g_pu32 = UINT32_C(0x12345678);
2944 iemAImpl_bswap_u32(g_pu32);
2945 RTTEST_CHECK(g_hTest, *g_pu32 == UINT32_C(0x78563412));
2946 }
2947
2948 if (SubTestAndCheckIfEnabled("bswap_u64"))
2949 {
2950 *g_pu64 = UINT64_C(0x0123456789abcdef);
2951 iemAImpl_bswap_u64(g_pu64);
2952 RTTEST_CHECK(g_hTest, *g_pu64 == UINT64_C(0xefcdab8967452301));
2953 }
2954}
2955
2956
2957
2958/*********************************************************************************************************************************
2959* Floating point (x87 style) *
2960*********************************************************************************************************************************/
2961
2962/*
2963 * FPU constant loading.
2964 */
2965TYPEDEF_SUBTEST_TYPE(FPU_LD_CONST_T, FPU_LD_CONST_TEST_T, PFNIEMAIMPLFPUR80LDCONST);
2966
2967static FPU_LD_CONST_T g_aFpuLdConst[] =
2968{
2969 ENTRY(fld1),
2970 ENTRY(fldl2t),
2971 ENTRY(fldl2e),
2972 ENTRY(fldpi),
2973 ENTRY(fldlg2),
2974 ENTRY(fldln2),
2975 ENTRY(fldz),
2976};
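
/* For reference, the constants these workers push: fld1 = +1.0, fldl2t = log2(10),
 * fldl2e = log2(e), fldpi = pi, fldlg2 = log10(2), fldln2 = ln(2) and fldz = +0.0;
 * each is rounded according to FCW.RC where that matters, which is exactly what
 * the generator below varies. */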
2977
2978#ifdef TSTIEMAIMPL_WITH_GENERATOR
2979static void FpuLdConstGenerate(PRTSTREAM pOut, uint32_t cTests)
2980{
2981 X86FXSTATE State;
2982 RT_ZERO(State);
2983 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aFpuLdConst); iFn++)
2984 {
2985 GenerateArrayStart(pOut, g_aFpuLdConst[iFn].pszName, "FPU_LD_CONST_TEST_T");
2986 for (uint32_t iTest = 0; iTest < cTests; iTest += 4)
2987 {
2988 State.FCW = RandFcw();
2989 State.FSW = RandFsw();
2990
2991 for (uint16_t iRounding = 0; iRounding < 4; iRounding++)
2992 {
2993 IEMFPURESULT Res = { RTFLOAT80U_INIT(0, 0, 0), 0 };
2994 State.FCW = (State.FCW & ~X86_FCW_RC_MASK) | (iRounding << X86_FCW_RC_SHIFT);
2995 g_aFpuLdConst[iFn].pfn(&State, &Res);
2996 RTStrmPrintf(pOut, " { %#06x, %#06x, %#06x, %s }, /* #%u */\n",
2997 State.FCW, State.FSW, Res.FSW, GenFormatR80(&Res.r80Result), iTest + iRounding);
2998 }
2999 }
3000 GenerateArrayEnd(pOut, g_aFpuLdConst[iFn].pszName);
3001 }
3002}
3003#endif
3004
3005static void FpuLoadConstTest(void)
3006{
3007 /*
3008 * Inputs:
3009 * - FSW: C0, C1, C2, C3
3010 * - FCW: Exception masks, Precision control, Rounding control.
3011 *
3012 * C1 set to 1 on stack overflow, zero otherwise. C0, C2, and C3 are "undefined".
3013 */
3014 X86FXSTATE State;
3015 RT_ZERO(State);
3016 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aFpuLdConst); iFn++)
3017 {
3018 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aFpuLdConst[iFn]))
3019 continue;
3020
3021 uint32_t const cTests = *g_aFpuLdConst[iFn].pcTests;
3022 FPU_LD_CONST_TEST_T const *paTests = g_aFpuLdConst[iFn].paTests;
3023 PFNIEMAIMPLFPUR80LDCONST pfn = g_aFpuLdConst[iFn].pfn;
3024        uint32_t const           cVars   = COUNT_VARIATIONS(g_aFpuLdConst[iFn]);
3025 if (!cTests) RTTestSkipped(g_hTest, "no tests");
3026 for (uint32_t iVar = 0; iVar < cVars; iVar++)
3027 {
3028 for (uint32_t iTest = 0; iTest < cTests; iTest++)
3029 {
3030 State.FCW = paTests[iTest].fFcw;
3031 State.FSW = paTests[iTest].fFswIn;
3032 IEMFPURESULT Res = { RTFLOAT80U_INIT(0, 0, 0), 0 };
3033 pfn(&State, &Res);
3034 if ( Res.FSW != paTests[iTest].fFswOut
3035 || !RTFLOAT80U_ARE_IDENTICAL(&Res.r80Result, &paTests[iTest].rdResult))
3036 RTTestFailed(g_hTest, "#%u%s: fcw=%#06x fsw=%#06x -> fsw=%#06x %s, expected %#06x %s%s%s (%s)\n",
3037 iTest, iVar ? "/n" : "", paTests[iTest].fFcw, paTests[iTest].fFswIn,
3038 Res.FSW, FormatR80(&Res.r80Result),
3039 paTests[iTest].fFswOut, FormatR80(&paTests[iTest].rdResult),
3040 FswDiff(Res.FSW, paTests[iTest].fFswOut),
3041 !RTFLOAT80U_ARE_IDENTICAL(&Res.r80Result, &paTests[iTest].rdResult) ? " - val" : "",
3042 FormatFcw(paTests[iTest].fFcw) );
3043 }
3044 pfn = g_aFpuLdConst[iFn].pfnNative;
3045 }
3046 }
3047}
3048
3049
3050/*
3051 * Load floating point values from memory.
3052 */
3053#ifdef TSTIEMAIMPL_WITH_GENERATOR
3054# define GEN_FPU_LOAD(a_cBits, a_rdTypeIn, a_aSubTests, a_TestType) \
3055static void FpuLdR ## a_cBits ## Generate(PRTSTREAM pOut, uint32_t cTests) \
3056{ \
3057 X86FXSTATE State; \
3058 RT_ZERO(State); \
3059 for (size_t iFn = 0; iFn < RT_ELEMENTS(a_aSubTests); iFn++) \
3060 { \
3061 GenerateArrayStart(pOut, a_aSubTests[iFn].pszName, #a_TestType); \
3062 for (uint32_t iTest = 0; iTest < cTests; iTest++) \
3063 { \
3064 State.FCW = RandFcw(); \
3065 State.FSW = RandFsw(); \
3066 a_rdTypeIn InVal = RandR ## a_cBits ## Src(iTest); \
3067 \
3068 for (uint16_t iRounding = 0; iRounding < 4; iRounding++) \
3069 { \
3070 IEMFPURESULT Res = { RTFLOAT80U_INIT(0, 0, 0), 0 }; \
3071 State.FCW = (State.FCW & ~X86_FCW_RC_MASK) | (iRounding << X86_FCW_RC_SHIFT); \
3072 a_aSubTests[iFn].pfn(&State, &Res, &InVal); \
3073 RTStrmPrintf(pOut, " { %#06x, %#06x, %#06x, %s, %s }, /* #%u/%u */\n", \
3074 State.FCW, State.FSW, Res.FSW, GenFormatR80(&Res.r80Result), \
3075 GenFormatR ## a_cBits(&InVal), iTest, iRounding); \
3076 } \
3077 } \
3078 GenerateArrayEnd(pOut, a_aSubTests[iFn].pszName); \
3079 } \
3080}
3081#else
3082# define GEN_FPU_LOAD(a_cBits, a_rdTypeIn, a_aSubTests, a_TestType)
3083#endif
3084
3085#define TEST_FPU_LOAD(a_cBits, a_rdTypeIn, a_SubTestType, a_aSubTests, a_TestType) \
3086typedef IEM_DECL_IMPL_TYPE(void, FNIEMAIMPLFPULDR80FROM ## a_cBits,(PCX86FXSTATE, PIEMFPURESULT, PC ## a_rdTypeIn)); \
3087typedef FNIEMAIMPLFPULDR80FROM ## a_cBits *PFNIEMAIMPLFPULDR80FROM ## a_cBits; \
3088TYPEDEF_SUBTEST_TYPE(a_SubTestType, a_TestType, PFNIEMAIMPLFPULDR80FROM ## a_cBits); \
3089\
3090static a_SubTestType a_aSubTests[] = \
3091{ \
3092 ENTRY(RT_CONCAT(fld_r80_from_r,a_cBits)) \
3093}; \
3094GEN_FPU_LOAD(a_cBits, a_rdTypeIn, a_aSubTests, a_TestType) \
3095\
3096static void FpuLdR ## a_cBits ## Test(void) \
3097{ \
3098 X86FXSTATE State; \
3099 RT_ZERO(State); \
3100 for (size_t iFn = 0; iFn < RT_ELEMENTS(a_aSubTests); iFn++) \
3101 { \
3102 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(a_aSubTests[iFn])) \
3103 continue; \
3104 \
3105 uint32_t const cTests = *a_aSubTests[iFn].pcTests; \
3106 a_TestType const * const paTests = a_aSubTests[iFn].paTests; \
3107 PFNIEMAIMPLFPULDR80FROM ## a_cBits pfn = a_aSubTests[iFn].pfn; \
3108 uint32_t const cVars = COUNT_VARIATIONS(a_aSubTests[iFn]); \
3109 if (!cTests) RTTestSkipped(g_hTest, "no tests"); \
3110 for (uint32_t iVar = 0; iVar < cVars; iVar++) \
3111 { \
3112 for (uint32_t iTest = 0; iTest < cTests; iTest++) \
3113 { \
3114 a_rdTypeIn const InVal = paTests[iTest].InVal; \
3115 State.FCW = paTests[iTest].fFcw; \
3116 State.FSW = paTests[iTest].fFswIn; \
3117 IEMFPURESULT Res = { RTFLOAT80U_INIT(0, 0, 0), 0 }; \
3118 pfn(&State, &Res, &InVal); \
3119 if ( Res.FSW != paTests[iTest].fFswOut \
3120 || !RTFLOAT80U_ARE_IDENTICAL(&Res.r80Result, &paTests[iTest].rdResult)) \
3121 RTTestFailed(g_hTest, "#%03u%s: fcw=%#06x fsw=%#06x in=%s\n" \
3122 "%s -> fsw=%#06x %s\n" \
3123 "%s expected %#06x %s%s%s (%s)\n", \
3124 iTest, iVar ? "/n" : "", paTests[iTest].fFcw, paTests[iTest].fFswIn, \
3125 FormatR ## a_cBits(&paTests[iTest].InVal), \
3126 iVar ? " " : "", Res.FSW, FormatR80(&Res.r80Result), \
3127 iVar ? " " : "", paTests[iTest].fFswOut, FormatR80(&paTests[iTest].rdResult), \
3128 FswDiff(Res.FSW, paTests[iTest].fFswOut), \
3129 !RTFLOAT80U_ARE_IDENTICAL(&Res.r80Result, &paTests[iTest].rdResult) ? " - val" : "", \
3130 FormatFcw(paTests[iTest].fFcw) ); \
3131 } \
3132 pfn = a_aSubTests[iFn].pfnNative; \
3133 } \
3134 } \
3135}
3136
3137TEST_FPU_LOAD(80, RTFLOAT80U, FPU_LD_R80_T, g_aFpuLdR80, FPU_R80_IN_TEST_T)
3138TEST_FPU_LOAD(64, RTFLOAT64U, FPU_LD_R64_T, g_aFpuLdR64, FPU_R64_IN_TEST_T)
3139TEST_FPU_LOAD(32, RTFLOAT32U, FPU_LD_R32_T, g_aFpuLdR32, FPU_R32_IN_TEST_T)
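/*
 * Each TEST_FPU_LOAD() instantiation above produces a one-entry subtest table
 * plus a FpuLdR<bits>Test() driver.  The 64-bit line expands roughly along
 * these lines (sketch only, the real code comes from the macro above):
 */
#if 0
static FPU_LD_R64_T g_aFpuLdR64[] =
{
    ENTRY(fld_r80_from_r64)         /* -> iemAImpl_fld_r80_from_r64 + its test data */
};
static void FpuLdR64Test(void)
{
    /* For each FPU_R64_IN_TEST_T record: load FCW/FSW from the record, call the
       worker with the RTFLOAT64U input, and compare FSW and the 80-bit result. */
}
#endif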
3140
3141#ifdef TSTIEMAIMPL_WITH_GENERATOR
3142static void FpuLdMemGenerate(PRTSTREAM pOut, uint32_t cTests)
3143{
3144 FpuLdR80Generate(pOut, cTests);
3145 FpuLdR64Generate(pOut, cTests);
3146 FpuLdR32Generate(pOut, cTests);
3147}
3148#endif
3149
3150static void FpuLdMemTest(void)
3151{
3152 FpuLdR80Test();
3153 FpuLdR64Test();
3154 FpuLdR32Test();
3155}
3156
3157
3158/*
3159 * Load integer values from memory.
3160 */
3161#ifdef TSTIEMAIMPL_WITH_GENERATOR
3162# define GEN_FPU_LOAD_INT(a_cBits, a_iTypeIn, a_szFmtIn, a_aSubTests, a_TestType) \
3163static void FpuLdI ## a_cBits ## Generate(PRTSTREAM pOut, uint32_t cTests) \
3164{ \
3165 X86FXSTATE State; \
3166 RT_ZERO(State); \
3167 for (size_t iFn = 0; iFn < RT_ELEMENTS(a_aSubTests); iFn++) \
3168 { \
3169 GenerateArrayStart(pOut, a_aSubTests[iFn].pszName, #a_TestType); \
3170 for (uint32_t iTest = 0; iTest < cTests; iTest++) \
3171 { \
3172 State.FCW = RandFcw(); \
3173 State.FSW = RandFsw(); \
3174 a_iTypeIn InVal = (a_iTypeIn)RandU ## a_cBits ## Src(iTest); \
3175 \
3176 for (uint16_t iRounding = 0; iRounding < 4; iRounding++) \
3177 { \
3178 IEMFPURESULT Res = { RTFLOAT80U_INIT(0, 0, 0), 0 }; \
3179 State.FCW = (State.FCW & ~X86_FCW_RC_MASK) | (iRounding << X86_FCW_RC_SHIFT); \
3180 a_aSubTests[iFn].pfn(&State, &Res, &InVal); \
3181 RTStrmPrintf(pOut, " { %#06x, %#06x, %#06x, %s, " a_szFmtIn " }, /* #%u/%u */\n", \
3182 State.FCW, State.FSW, Res.FSW, GenFormatR80(&Res.r80Result), InVal, iTest, iRounding); \
3183 } \
3184 } \
3185 GenerateArrayEnd(pOut, a_aSubTests[iFn].pszName); \
3186 } \
3187}
3188#else
3189# define GEN_FPU_LOAD_INT(a_cBits, a_iTypeIn, a_szFmtIn, a_aSubTests, a_TestType)
3190#endif
3191
3192#define TEST_FPU_LOAD_INT(a_cBits, a_iTypeIn, a_szFmtIn, a_SubTestType, a_aSubTests, a_TestType) \
3193typedef IEM_DECL_IMPL_TYPE(void, FNIEMAIMPLFPULDR80FROMI ## a_cBits,(PCX86FXSTATE, PIEMFPURESULT, a_iTypeIn const *)); \
3194typedef FNIEMAIMPLFPULDR80FROMI ## a_cBits *PFNIEMAIMPLFPULDR80FROMI ## a_cBits; \
3195TYPEDEF_SUBTEST_TYPE(a_SubTestType, a_TestType, PFNIEMAIMPLFPULDR80FROMI ## a_cBits); \
3196\
3197static a_SubTestType a_aSubTests[] = \
3198{ \
3199 ENTRY(RT_CONCAT(fild_r80_from_i,a_cBits)) \
3200}; \
3201GEN_FPU_LOAD_INT(a_cBits, a_iTypeIn, a_szFmtIn, a_aSubTests, a_TestType) \
3202\
3203static void FpuLdI ## a_cBits ## Test(void) \
3204{ \
3205 X86FXSTATE State; \
3206 RT_ZERO(State); \
3207 for (size_t iFn = 0; iFn < RT_ELEMENTS(a_aSubTests); iFn++) \
3208 { \
3209 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(a_aSubTests[iFn])) \
3210 continue; \
3211 \
3212 uint32_t const cTests = *a_aSubTests[iFn].pcTests; \
3213 a_TestType const * const paTests = a_aSubTests[iFn].paTests; \
3214 PFNIEMAIMPLFPULDR80FROMI ## a_cBits pfn = a_aSubTests[iFn].pfn; \
3215 uint32_t const cVars = COUNT_VARIATIONS(a_aSubTests[iFn]); \
3216 if (!cTests) RTTestSkipped(g_hTest, "no tests"); \
3217 for (uint32_t iVar = 0; iVar < cVars; iVar++) \
3218 { \
3219 for (uint32_t iTest = 0; iTest < cTests; iTest++) \
3220 { \
3221 a_iTypeIn const iInVal = paTests[iTest].iInVal; \
3222 State.FCW = paTests[iTest].fFcw; \
3223 State.FSW = paTests[iTest].fFswIn; \
3224 IEMFPURESULT Res = { RTFLOAT80U_INIT(0, 0, 0), 0 }; \
3225 pfn(&State, &Res, &iInVal); \
3226 if ( Res.FSW != paTests[iTest].fFswOut \
3227 || !RTFLOAT80U_ARE_IDENTICAL(&Res.r80Result, &paTests[iTest].rdResult)) \
3228 RTTestFailed(g_hTest, "#%03u%s: fcw=%#06x fsw=%#06x in=" a_szFmtIn "\n" \
3229 "%s -> fsw=%#06x %s\n" \
3230 "%s expected %#06x %s%s%s (%s)\n", \
3231 iTest, iVar ? "/n" : "", paTests[iTest].fFcw, paTests[iTest].fFswIn, paTests[iTest].iInVal, \
3232 iVar ? " " : "", Res.FSW, FormatR80(&Res.r80Result), \
3233 iVar ? " " : "", paTests[iTest].fFswOut, FormatR80(&paTests[iTest].rdResult), \
3234 FswDiff(Res.FSW, paTests[iTest].fFswOut), \
3235 !RTFLOAT80U_ARE_IDENTICAL(&Res.r80Result, &paTests[iTest].rdResult) ? " - val" : "", \
3236 FormatFcw(paTests[iTest].fFcw) ); \
3237 } \
3238 pfn = a_aSubTests[iFn].pfnNative; \
3239 } \
3240 } \
3241}
3242
3243TEST_FPU_LOAD_INT(64, int64_t, "%RI64", FPU_LD_I64_T, g_aFpuLdU64, FPU_I64_IN_TEST_T)
3244TEST_FPU_LOAD_INT(32, int32_t, "%RI32", FPU_LD_I32_T, g_aFpuLdU32, FPU_I32_IN_TEST_T)
3245TEST_FPU_LOAD_INT(16, int16_t, "%RI16", FPU_LD_I16_T, g_aFpuLdU16, FPU_I16_IN_TEST_T)
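/*
 * Worked example for the fild tests above: every i16/i32/i64 value fits into
 * the 64-bit significand of the 80-bit format, so the conversion itself is
 * always exact and rounding control cannot change the value.  E.g.
 * (int16_t)-3 = -1.1b * 2^1 loads as the following (sketch, not used by the
 * tests):
 */
#if 0
static RTFLOAT80U const s_r80MinusThree = RTFLOAT80U_INIT_C(1, 0xc000000000000000, 1 + RTFLOAT80U_EXP_BIAS);
#endif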
3246
3247#ifdef TSTIEMAIMPL_WITH_GENERATOR
3248static void FpuLdIntGenerate(PRTSTREAM pOut, uint32_t cTests)
3249{
3250 FpuLdI64Generate(pOut, cTests);
3251 FpuLdI32Generate(pOut, cTests);
3252 FpuLdI16Generate(pOut, cTests);
3253}
3254#endif
3255
3256static void FpuLdIntTest(void)
3257{
3258 FpuLdI64Test();
3259 FpuLdI32Test();
3260 FpuLdI16Test();
3261}
3262
3263
3264/*
3265 * Load binary coded decimal values from memory.
3266 */
3267typedef IEM_DECL_IMPL_TYPE(void, FNIEMAIMPLFPULDR80FROMD80,(PCX86FXSTATE, PIEMFPURESULT, PCRTPBCD80U));
3268typedef FNIEMAIMPLFPULDR80FROMD80 *PFNIEMAIMPLFPULDR80FROMD80;
3269TYPEDEF_SUBTEST_TYPE(FPU_LD_D80_T, FPU_D80_IN_TEST_T, PFNIEMAIMPLFPULDR80FROMD80);
3270
3271static FPU_LD_D80_T g_aFpuLdD80[] =
3272{
3273 ENTRY(fld_r80_from_d80)
3274};
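/*
 * Memory format reminder for fbld (and fbst further down): 18 packed BCD digits
 * in bytes 0..8, two digits per byte with the least significant pair first, and
 * the sign in bit 7 of byte 9 (the remaining bits of that byte are ignored on
 * load).  E.g. decimal -45 as raw bytes (illustration only, the tests use
 * RTPBCD80U rather than a byte array):
 */
#if 0
static uint8_t const s_abPackedMinus45[10] = { 0x45, 0, 0, 0, 0, 0, 0, 0, 0, 0x80 };
#endif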
3275
3276#ifdef TSTIEMAIMPL_WITH_GENERATOR
3277static void FpuLdD80Generate(PRTSTREAM pOut, uint32_t cTests)
3278{
3279 X86FXSTATE State;
3280 RT_ZERO(State);
3281 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aFpuLdD80); iFn++)
3282 {
3283 GenerateArrayStart(pOut, g_aFpuLdD80[iFn].pszName, "FPU_D80_IN_TEST_T");
3284 for (uint32_t iTest = 0; iTest < cTests; iTest++)
3285 {
3286 State.FCW = RandFcw();
3287 State.FSW = RandFsw();
3288 RTPBCD80U InVal = RandD80Src(iTest);
3289
3290 for (uint16_t iRounding = 0; iRounding < 4; iRounding++)
3291 {
3292 IEMFPURESULT Res = { RTFLOAT80U_INIT(0, 0, 0), 0 };
3293 State.FCW = (State.FCW & ~X86_FCW_RC_MASK) | (iRounding << X86_FCW_RC_SHIFT);
3294 g_aFpuLdD80[iFn].pfn(&State, &Res, &InVal);
3295 RTStrmPrintf(pOut, " { %#06x, %#06x, %#06x, %s, %s }, /* #%u/%u */\n",
3296 State.FCW, State.FSW, Res.FSW, GenFormatR80(&Res.r80Result), GenFormatD80(&InVal),
3297 iTest, iRounding);
3298 }
3299 }
3300 GenerateArrayEnd(pOut, g_aFpuLdD80[iFn].pszName);
3301 }
3302}
3303#endif
3304
3305static void FpuLdD80Test(void)
3306{
3307 X86FXSTATE State;
3308 RT_ZERO(State);
3309 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aFpuLdD80); iFn++)
3310 {
3311 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aFpuLdD80[iFn]))
3312 continue;
3313
3314 uint32_t const cTests = *g_aFpuLdD80[iFn].pcTests;
3315 FPU_D80_IN_TEST_T const * const paTests = g_aFpuLdD80[iFn].paTests;
3316 PFNIEMAIMPLFPULDR80FROMD80 pfn = g_aFpuLdD80[iFn].pfn;
3317 uint32_t const cVars = COUNT_VARIATIONS(g_aFpuLdD80[iFn]);
3318 if (!cTests) RTTestSkipped(g_hTest, "no tests");
3319 for (uint32_t iVar = 0; iVar < cVars; iVar++)
3320 {
3321 for (uint32_t iTest = 0; iTest < cTests; iTest++)
3322 {
3323 RTPBCD80U const InVal = paTests[iTest].InVal;
3324 State.FCW = paTests[iTest].fFcw;
3325 State.FSW = paTests[iTest].fFswIn;
3326 IEMFPURESULT Res = { RTFLOAT80U_INIT(0, 0, 0), 0 };
3327 pfn(&State, &Res, &InVal);
3328 if ( Res.FSW != paTests[iTest].fFswOut
3329 || !RTFLOAT80U_ARE_IDENTICAL(&Res.r80Result, &paTests[iTest].rdResult))
3330 RTTestFailed(g_hTest, "#%03u%s: fcw=%#06x fsw=%#06x in=%s\n"
3331 "%s -> fsw=%#06x %s\n"
3332 "%s expected %#06x %s%s%s (%s)\n",
3333 iTest, iVar ? "/n" : "", paTests[iTest].fFcw, paTests[iTest].fFswIn,
3334 FormatD80(&paTests[iTest].InVal),
3335 iVar ? " " : "", Res.FSW, FormatR80(&Res.r80Result),
3336 iVar ? " " : "", paTests[iTest].fFswOut, FormatR80(&paTests[iTest].rdResult),
3337 FswDiff(Res.FSW, paTests[iTest].fFswOut),
3338 !RTFLOAT80U_ARE_IDENTICAL(&Res.r80Result, &paTests[iTest].rdResult) ? " - val" : "",
3339 FormatFcw(paTests[iTest].fFcw) );
3340 }
3341 pfn = g_aFpuLdD80[iFn].pfnNative;
3342 }
3343 }
3344}
3345
3346
3347/*
3348 * Store floating point values to memory.
3349 */
3350#ifdef TSTIEMAIMPL_WITH_GENERATOR
3351static const RTFLOAT80U g_aFpuStR32Specials[] =
3352{
3353 RTFLOAT80U_INIT_C(0, 0xffffff8000000000, RTFLOAT80U_EXP_BIAS), /* near rounding with carry */
3354 RTFLOAT80U_INIT_C(1, 0xffffff8000000000, RTFLOAT80U_EXP_BIAS), /* near rounding with carry */
3355 RTFLOAT80U_INIT_C(0, 0xfffffe8000000000, RTFLOAT80U_EXP_BIAS), /* near rounding */
3356 RTFLOAT80U_INIT_C(1, 0xfffffe8000000000, RTFLOAT80U_EXP_BIAS), /* near rounding */
3357};
3358static const RTFLOAT80U g_aFpuStR64Specials[] =
3359{
3360 RTFLOAT80U_INIT_C(0, 0xfffffffffffffc00, RTFLOAT80U_EXP_BIAS), /* near rounding with carry */
3361 RTFLOAT80U_INIT_C(1, 0xfffffffffffffc00, RTFLOAT80U_EXP_BIAS), /* near rounding with carry */
3362 RTFLOAT80U_INIT_C(0, 0xfffffffffffff400, RTFLOAT80U_EXP_BIAS), /* near rounding */
3363 RTFLOAT80U_INIT_C(1, 0xfffffffffffff400, RTFLOAT80U_EXP_BIAS), /* near rounding */
3364 RTFLOAT80U_INIT_C(0, 0xd0b9e6fdda887400, 687 + RTFLOAT80U_EXP_BIAS), /* random example for this */
3365};
3366static const RTFLOAT80U g_aFpuStR80Specials[] =
3367{
3368 RTFLOAT80U_INIT_C(0, 0x8000000000000000, RTFLOAT80U_EXP_BIAS), /* placeholder */
3369};
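/*
 * Why the r32 specials above are interesting: storing to single precision keeps
 * the top 24 significand bits.  For 0xffffff8000000000 those 24 bits are all
 * ones and the first discarded bit is set with nothing below it, so the
 * round-to-nearest-even tie rounds up, the carry ripples through the 24 ones
 * and bumps the exponent ("near rounding with carry").  For 0xfffffe8000000000
 * the kept LSB is zero, so the same tie rounds to even without a carry ("near
 * rounding").  The r64 specials play the same game with the 53-bit double
 * precision significand.
 */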
3370# define GEN_FPU_STORE(a_cBits, a_rdType, a_aSubTests, a_TestType) \
3371static void FpuStR ## a_cBits ## Generate(PRTSTREAM pOut, uint32_t cTests) \
3372{ \
3373 uint32_t const cTotalTests = cTests + RT_ELEMENTS(g_aFpuStR ## a_cBits ## Specials); \
3374 X86FXSTATE State; \
3375 RT_ZERO(State); \
3376 for (size_t iFn = 0; iFn < RT_ELEMENTS(a_aSubTests); iFn++) \
3377 { \
3378 GenerateArrayStart(pOut, a_aSubTests[iFn].pszName, #a_TestType); \
3379 for (uint32_t iTest = 0; iTest < cTotalTests; iTest++) \
3380 { \
3381 uint16_t const fFcw = RandFcw(); \
3382 State.FSW = RandFsw(); \
3383 RTFLOAT80U const InVal = iTest < cTests ? RandR80Src(iTest, a_cBits) \
3384 : g_aFpuStR ## a_cBits ## Specials[iTest - cTests]; \
3385 \
3386 for (uint16_t iRounding = 0; iRounding < 4; iRounding++) \
3387 { \
3388 /* PC doesn't influence these, so leave as is. */ \
3389 AssertCompile(X86_FCW_OM_BIT + 1 == X86_FCW_UM_BIT && X86_FCW_UM_BIT + 1 == X86_FCW_PM_BIT); \
3390 for (uint16_t iMask = 0; iMask < 16; iMask += 2 /*1*/) \
3391 { \
3392 uint16_t uFswOut = 0; \
3393 a_rdType OutVal; \
3394 RT_ZERO(OutVal); \
3395 memset(&OutVal, 0xfe, sizeof(OutVal)); \
3396 State.FCW = (fFcw & ~(X86_FCW_RC_MASK | X86_FCW_OM | X86_FCW_UM | X86_FCW_PM)) \
3397 | (iRounding << X86_FCW_RC_SHIFT); \
3398 /*if (iMask & 1) State.FCW ^= X86_FCW_MASK_ALL;*/ \
3399 State.FCW |= (iMask >> 1) << X86_FCW_OM_BIT; \
3400 a_aSubTests[iFn].pfn(&State, &uFswOut, &OutVal, &InVal); \
3401 RTStrmPrintf(pOut, " { %#06x, %#06x, %#06x, %s, %s }, /* #%u/%u/%u */\n", \
3402 State.FCW, State.FSW, uFswOut, GenFormatR80(&InVal), \
3403 GenFormatR ## a_cBits(&OutVal), iTest, iRounding, iMask); \
3404 } \
3405 } \
3406 } \
3407 GenerateArrayEnd(pOut, a_aSubTests[iFn].pszName); \
3408 } \
3409}
3410#else
3411# define GEN_FPU_STORE(a_cBits, a_rdType, a_aSubTests, a_TestType)
3412#endif
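/*
 * The iMask loop in the generator above leans on X86_FCW_OM/UM/PM being three
 * adjacent FCW bits (see the AssertCompile): iMask runs over the even values
 * 0,2,..,14, so (iMask >> 1) << X86_FCW_OM_BIT selects every combination of the
 * overflow/underflow/precision exception masks, while the invalid/denormal/
 * zero-divide masks keep whatever RandFcw() handed out.
 */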
3413
3414#define TEST_FPU_STORE(a_cBits, a_rdType, a_SubTestType, a_aSubTests, a_TestType) \
3415typedef IEM_DECL_IMPL_TYPE(void, FNIEMAIMPLFPUSTR80TOR ## a_cBits,(PCX86FXSTATE, uint16_t *, \
3416 PRTFLOAT ## a_cBits ## U, PCRTFLOAT80U)); \
3417typedef FNIEMAIMPLFPUSTR80TOR ## a_cBits *PFNIEMAIMPLFPUSTR80TOR ## a_cBits; \
3418TYPEDEF_SUBTEST_TYPE(a_SubTestType, a_TestType, PFNIEMAIMPLFPUSTR80TOR ## a_cBits); \
3419\
3420static a_SubTestType a_aSubTests[] = \
3421{ \
3422 ENTRY(RT_CONCAT(fst_r80_to_r,a_cBits)) \
3423}; \
3424GEN_FPU_STORE(a_cBits, a_rdType, a_aSubTests, a_TestType) \
3425\
3426static void FpuStR ## a_cBits ## Test(void) \
3427{ \
3428 X86FXSTATE State; \
3429 RT_ZERO(State); \
3430 for (size_t iFn = 0; iFn < RT_ELEMENTS(a_aSubTests); iFn++) \
3431 { \
3432 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(a_aSubTests[iFn])) \
3433 continue; \
3434 \
3435 uint32_t const cTests = *a_aSubTests[iFn].pcTests; \
3436 a_TestType const * const paTests = a_aSubTests[iFn].paTests; \
3437 PFNIEMAIMPLFPUSTR80TOR ## a_cBits pfn = a_aSubTests[iFn].pfn; \
3438 uint32_t const cVars = COUNT_VARIATIONS(a_aSubTests[iFn]); \
3439 if (!cTests) RTTestSkipped(g_hTest, "no tests"); \
3440 for (uint32_t iVar = 0; iVar < cVars; iVar++) \
3441 { \
3442 for (uint32_t iTest = 0; iTest < cTests; iTest++) \
3443 { \
3444 RTFLOAT80U const InVal = paTests[iTest].InVal; \
3445 uint16_t uFswOut = 0; \
3446 a_rdType OutVal; \
3447 RT_ZERO(OutVal); \
3448 memset(&OutVal, 0xfe, sizeof(OutVal)); \
3449 State.FCW = paTests[iTest].fFcw; \
3450 State.FSW = paTests[iTest].fFswIn; \
3451 pfn(&State, &uFswOut, &OutVal, &InVal); \
3452 if ( uFswOut != paTests[iTest].fFswOut \
3453 || !RTFLOAT ## a_cBits ## U_ARE_IDENTICAL(&OutVal, &paTests[iTest].OutVal)) \
3454 RTTestFailed(g_hTest, "#%04u%s: fcw=%#06x fsw=%#06x in=%s\n" \
3455 "%s -> fsw=%#06x %s\n" \
3456 "%s expected %#06x %s%s%s (%s)\n", \
3457 iTest, iVar ? "/n" : "", paTests[iTest].fFcw, paTests[iTest].fFswIn, \
3458 FormatR80(&paTests[iTest].InVal), \
3459 iVar ? " " : "", uFswOut, FormatR ## a_cBits(&OutVal), \
3460 iVar ? " " : "", paTests[iTest].fFswOut, FormatR ## a_cBits(&paTests[iTest].OutVal), \
3461 FswDiff(uFswOut, paTests[iTest].fFswOut), \
3462 !RTFLOAT ## a_cBits ## U_ARE_IDENTICAL(&OutVal, &paTests[iTest].OutVal) ? " - val" : "", \
3463 FormatFcw(paTests[iTest].fFcw) ); \
3464 } \
3465 pfn = a_aSubTests[iFn].pfnNative; \
3466 } \
3467 } \
3468}
3469
3470TEST_FPU_STORE(80, RTFLOAT80U, FPU_ST_R80_T, g_aFpuStR80, FPU_ST_R80_TEST_T)
3471TEST_FPU_STORE(64, RTFLOAT64U, FPU_ST_R64_T, g_aFpuStR64, FPU_ST_R64_TEST_T)
3472TEST_FPU_STORE(32, RTFLOAT32U, FPU_ST_R32_T, g_aFpuStR32, FPU_ST_R32_TEST_T)
3473
3474#ifdef TSTIEMAIMPL_WITH_GENERATOR
3475static void FpuStMemGenerate(PRTSTREAM pOut, uint32_t cTests)
3476{
3477 FpuStR80Generate(pOut, cTests);
3478 FpuStR64Generate(pOut, cTests);
3479 FpuStR32Generate(pOut, cTests);
3480}
3481#endif
3482
3483static void FpuStMemTest(void)
3484{
3485 FpuStR80Test();
3486 FpuStR64Test();
3487 FpuStR32Test();
3488}
3489
3490
3491/*
3492 * Store integer values to memory or register.
3493 */
3494TYPEDEF_SUBTEST_TYPE(FPU_ST_I16_T, FPU_ST_I16_TEST_T, PFNIEMAIMPLFPUSTR80TOI16);
3495TYPEDEF_SUBTEST_TYPE(FPU_ST_I32_T, FPU_ST_I32_TEST_T, PFNIEMAIMPLFPUSTR80TOI32);
3496TYPEDEF_SUBTEST_TYPE(FPU_ST_I64_T, FPU_ST_I64_TEST_T, PFNIEMAIMPLFPUSTR80TOI64);
3497
3498static FPU_ST_I16_T g_aFpuStI16[] =
3499{
3500 ENTRY(fist_r80_to_i16),
3501 ENTRY_AMD( fistt_r80_to_i16, 0),
3502 ENTRY_INTEL(fistt_r80_to_i16, 0),
3503};
3504static FPU_ST_I32_T g_aFpuStI32[] =
3505{
3506 ENTRY(fist_r80_to_i32),
3507 ENTRY(fistt_r80_to_i32),
3508};
3509static FPU_ST_I64_T g_aFpuStI64[] =
3510{
3511 ENTRY(fist_r80_to_i64),
3512 ENTRY(fistt_r80_to_i64),
3513};
3514
3515#ifdef TSTIEMAIMPL_WITH_GENERATOR
3516static const RTFLOAT80U g_aFpuStI16Specials[] = /* 16-bit variant borrows properties from the 32-bit one, thus all this stuff. */
3517{
3518 RTFLOAT80U_INIT_C(0, 0x8000000000000000, 13 + RTFLOAT80U_EXP_BIAS),
3519 RTFLOAT80U_INIT_C(0, 0xfffffffffffffff0, 13 + RTFLOAT80U_EXP_BIAS),
3520 RTFLOAT80U_INIT_C(0, 0x8000000000000000, 14 + RTFLOAT80U_EXP_BIAS),
3521 RTFLOAT80U_INIT_C(1, 0x8000000000000000, 14 + RTFLOAT80U_EXP_BIAS),
3522 RTFLOAT80U_INIT_C(0, 0x8000080000000000, 14 + RTFLOAT80U_EXP_BIAS),
3523 RTFLOAT80U_INIT_C(1, 0x8000080000000000, 14 + RTFLOAT80U_EXP_BIAS),
3524 RTFLOAT80U_INIT_C(0, 0x8000100000000000, 14 + RTFLOAT80U_EXP_BIAS),
3525 RTFLOAT80U_INIT_C(1, 0x8000100000000000, 14 + RTFLOAT80U_EXP_BIAS),
3526 RTFLOAT80U_INIT_C(0, 0x8000200000000000, 14 + RTFLOAT80U_EXP_BIAS),
3527 RTFLOAT80U_INIT_C(1, 0x8000200000000000, 14 + RTFLOAT80U_EXP_BIAS),
3528 RTFLOAT80U_INIT_C(0, 0x8000400000000000, 14 + RTFLOAT80U_EXP_BIAS),
3529 RTFLOAT80U_INIT_C(1, 0x8000400000000000, 14 + RTFLOAT80U_EXP_BIAS),
3530 RTFLOAT80U_INIT_C(0, 0x8000800000000000, 14 + RTFLOAT80U_EXP_BIAS),
3531 RTFLOAT80U_INIT_C(1, 0x8000800000000000, 14 + RTFLOAT80U_EXP_BIAS),
3532 RTFLOAT80U_INIT_C(1, 0x8000ffffffffffff, 14 + RTFLOAT80U_EXP_BIAS),
3533 RTFLOAT80U_INIT_C(0, 0x8001000000000000, 14 + RTFLOAT80U_EXP_BIAS),
3534 RTFLOAT80U_INIT_C(1, 0x8001000000000000, 14 + RTFLOAT80U_EXP_BIAS),
3535 RTFLOAT80U_INIT_C(0, 0xfffffffffffffff0, 14 + RTFLOAT80U_EXP_BIAS),
3536 RTFLOAT80U_INIT_C(1, 0xfffffffffffffff0, 14 + RTFLOAT80U_EXP_BIAS),
3537 RTFLOAT80U_INIT_C(0, 0xffff800000000000, 14 + RTFLOAT80U_EXP_BIAS),
3538 RTFLOAT80U_INIT_C(0, 0xffff000000000000, 14 + RTFLOAT80U_EXP_BIAS), /* overflow to min/nan */
3539 RTFLOAT80U_INIT_C(0, 0xfffe000000000000, 14 + RTFLOAT80U_EXP_BIAS),
3540 RTFLOAT80U_INIT_C(1, 0xffff800000000000, 14 + RTFLOAT80U_EXP_BIAS),
3541 RTFLOAT80U_INIT_C(1, 0xffff000000000000, 14 + RTFLOAT80U_EXP_BIAS), /* min */
3542 RTFLOAT80U_INIT_C(1, 0xfffe000000000000, 14 + RTFLOAT80U_EXP_BIAS),
3543 RTFLOAT80U_INIT_C(0, 0x8000000000000000, 15 + RTFLOAT80U_EXP_BIAS),
3544 RTFLOAT80U_INIT_C(0, 0xfffffffffffffff0, 15 + RTFLOAT80U_EXP_BIAS),
3545 RTFLOAT80U_INIT_C(0, 0x8000000000000000, 16 + RTFLOAT80U_EXP_BIAS),
3546 RTFLOAT80U_INIT_C(0, 0x8000000000000000, 17 + RTFLOAT80U_EXP_BIAS),
3547 RTFLOAT80U_INIT_C(0, 0x8000000000000000, 20 + RTFLOAT80U_EXP_BIAS),
3548 RTFLOAT80U_INIT_C(0, 0x8000000000000000, 24 + RTFLOAT80U_EXP_BIAS),
3549 RTFLOAT80U_INIT_C(0, 0x8000000000000000, 28 + RTFLOAT80U_EXP_BIAS),
3550 RTFLOAT80U_INIT_C(0, 0x8000000000000000, 30 + RTFLOAT80U_EXP_BIAS),
3551 RTFLOAT80U_INIT_C(1, 0x8000000000000000, 30 + RTFLOAT80U_EXP_BIAS),
3552 RTFLOAT80U_INIT_C(0, 0xfffffffffffffff0, 30 + RTFLOAT80U_EXP_BIAS),
3553 RTFLOAT80U_INIT_C(1, 0xfffffffffffffff0, 30 + RTFLOAT80U_EXP_BIAS),
3554 RTFLOAT80U_INIT_C(0, 0x8000000000000000, 31 + RTFLOAT80U_EXP_BIAS),
3555 RTFLOAT80U_INIT_C(1, 0x8000000000000000, 31 + RTFLOAT80U_EXP_BIAS),
3556 RTFLOAT80U_INIT_C(0, 0x8000000000000001, 31 + RTFLOAT80U_EXP_BIAS),
3557 RTFLOAT80U_INIT_C(1, 0x8000000000000001, 31 + RTFLOAT80U_EXP_BIAS),
3558 RTFLOAT80U_INIT_C(0, 0x8000ffffffffffff, 31 + RTFLOAT80U_EXP_BIAS),
3559 RTFLOAT80U_INIT_C(1, 0x8000ffffffffffff, 31 + RTFLOAT80U_EXP_BIAS),
3560 RTFLOAT80U_INIT_C(0, 0x8001000000000000, 31 + RTFLOAT80U_EXP_BIAS),
3561 RTFLOAT80U_INIT_C(1, 0x8001000000000000, 31 + RTFLOAT80U_EXP_BIAS),
3562 RTFLOAT80U_INIT_C(0, 0xfffffffffffffff0, 31 + RTFLOAT80U_EXP_BIAS),
3563 RTFLOAT80U_INIT_C(1, 0xfffffffffffffff0, 31 + RTFLOAT80U_EXP_BIAS),
3564 RTFLOAT80U_INIT_C(0, 0x8000000000000000, 32 + RTFLOAT80U_EXP_BIAS),
3565};
3566static const RTFLOAT80U g_aFpuStI32Specials[] =
3567{
3568 RTFLOAT80U_INIT_C(0, 0x8000000000000000, 30 + RTFLOAT80U_EXP_BIAS),
3569 RTFLOAT80U_INIT_C(1, 0x8000000000000000, 30 + RTFLOAT80U_EXP_BIAS),
3570 RTFLOAT80U_INIT_C(0, 0xfffffffffffffff0, 30 + RTFLOAT80U_EXP_BIAS), /* overflow to min/nan */
3571 RTFLOAT80U_INIT_C(1, 0xfffffffffffffff0, 30 + RTFLOAT80U_EXP_BIAS), /* min */
3572 RTFLOAT80U_INIT_C(0, 0xffffffff80000000, 30 + RTFLOAT80U_EXP_BIAS), /* overflow to min/nan */
3573 RTFLOAT80U_INIT_C(1, 0xffffffff80000000, 30 + RTFLOAT80U_EXP_BIAS), /* min */
3574 RTFLOAT80U_INIT_C(0, 0xffffffff00000000, 30 + RTFLOAT80U_EXP_BIAS), /* overflow to min/nan */
3575 RTFLOAT80U_INIT_C(1, 0xffffffff00000000, 30 + RTFLOAT80U_EXP_BIAS), /* min */
3576 RTFLOAT80U_INIT_C(0, 0xfffffffe00000000, 30 + RTFLOAT80U_EXP_BIAS),
3577 RTFLOAT80U_INIT_C(1, 0xfffffffe00000000, 30 + RTFLOAT80U_EXP_BIAS),
3578 RTFLOAT80U_INIT_C(0, 0x8000000000000000, 31 + RTFLOAT80U_EXP_BIAS),
3579 RTFLOAT80U_INIT_C(1, 0x8000000000000000, 31 + RTFLOAT80U_EXP_BIAS),
3580 RTFLOAT80U_INIT_C(0, 0x8000000000000001, 31 + RTFLOAT80U_EXP_BIAS),
3581 RTFLOAT80U_INIT_C(1, 0x8000000000000001, 31 + RTFLOAT80U_EXP_BIAS),
3582 RTFLOAT80U_INIT_C(0, 0xfffffffffffffff0, 31 + RTFLOAT80U_EXP_BIAS),
3583 RTFLOAT80U_INIT_C(1, 0xfffffffffffffff0, 31 + RTFLOAT80U_EXP_BIAS),
3584};
3585static const RTFLOAT80U g_aFpuStI64Specials[] =
3586{
3587 RTFLOAT80U_INIT_C(0, 0x8000000000000000, 61 + RTFLOAT80U_EXP_BIAS),
3588 RTFLOAT80U_INIT_C(0, 0xffffffffffffffff, 61 + RTFLOAT80U_EXP_BIAS),
3589 RTFLOAT80U_INIT_C(0, 0x8000000000000000, 62 + RTFLOAT80U_EXP_BIAS),
3590 RTFLOAT80U_INIT_C(1, 0x8000000000000000, 62 + RTFLOAT80U_EXP_BIAS),
3591 RTFLOAT80U_INIT_C(0, 0xfffffffffffffff0, 62 + RTFLOAT80U_EXP_BIAS),
3592 RTFLOAT80U_INIT_C(1, 0xfffffffffffffff0, 62 + RTFLOAT80U_EXP_BIAS),
3593 RTFLOAT80U_INIT_C(0, 0xffffffffffffffff, 62 + RTFLOAT80U_EXP_BIAS), /* overflow to min/nan */
3594 RTFLOAT80U_INIT_C(1, 0xffffffffffffffff, 62 + RTFLOAT80U_EXP_BIAS), /* min */
3595 RTFLOAT80U_INIT_C(0, 0xfffffffffffffffe, 62 + RTFLOAT80U_EXP_BIAS),
3596 RTFLOAT80U_INIT_C(1, 0xfffffffffffffffe, 62 + RTFLOAT80U_EXP_BIAS),
3597 RTFLOAT80U_INIT_C(0, 0x8000000000000000, 63 + RTFLOAT80U_EXP_BIAS),
3598 RTFLOAT80U_INIT_C(1, 0x8000000000000000, 63 + RTFLOAT80U_EXP_BIAS),
3599 RTFLOAT80U_INIT_C(0, 0x8000000000000001, 63 + RTFLOAT80U_EXP_BIAS),
3600 RTFLOAT80U_INIT_C(1, 0x8000000000000001, 63 + RTFLOAT80U_EXP_BIAS),
3601 RTFLOAT80U_INIT_C(0, 0x8000000000000002, 63 + RTFLOAT80U_EXP_BIAS),
3602 RTFLOAT80U_INIT_C(1, 0x8000000000000002, 63 + RTFLOAT80U_EXP_BIAS),
3603 RTFLOAT80U_INIT_C(0, 0xfffffffffffffff0, 63 + RTFLOAT80U_EXP_BIAS),
3604};
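/*
 * About the "overflow to min/nan" remarks in the tables above: when the rounded
 * value does not fit the destination integer, fist/fistt raise #IA, and with
 * the invalid exception masked they store the "integer indefinite" value -
 * 0x8000, 0x80000000 or 0x8000000000000000 for i16/i32/i64 - which doubles as
 * the most negative representable value, hence the min/NaN-ish wording.
 */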
3605
3606# define GEN_FPU_STORE_INT(a_cBits, a_iType, a_szFmt, a_aSubTests, a_TestType) \
3607static void FpuStI ## a_cBits ## Generate(PRTSTREAM pOut, PRTSTREAM pOutCpu, uint32_t cTests) \
3608{ \
3609 X86FXSTATE State; \
3610 RT_ZERO(State); \
3611 for (size_t iFn = 0; iFn < RT_ELEMENTS(a_aSubTests); iFn++) \
3612 { \
3613 PFNIEMAIMPLFPUSTR80TOI ## a_cBits const pfn = a_aSubTests[iFn].pfnNative \
3614 ? a_aSubTests[iFn].pfnNative : a_aSubTests[iFn].pfn; \
3615 PRTSTREAM pOutFn = pOut; \
3616 if (a_aSubTests[iFn].idxCpuEflFlavour != IEMTARGETCPU_EFL_BEHAVIOR_NATIVE) \
3617 { \
3618 if (a_aSubTests[iFn].idxCpuEflFlavour != g_idxCpuEflFlavour) \
3619 continue; \
3620 pOutFn = pOutCpu; \
3621 } \
3622 \
3623 GenerateArrayStart(pOutFn, a_aSubTests[iFn].pszName, #a_TestType); \
3624 uint32_t const cTotalTests = cTests + RT_ELEMENTS(g_aFpuStI ## a_cBits ## Specials); \
3625 for (uint32_t iTest = 0; iTest < cTotalTests; iTest++) \
3626 { \
3627 uint16_t const fFcw = RandFcw(); \
3628 State.FSW = RandFsw(); \
3629 RTFLOAT80U const InVal = iTest < cTests ? RandR80Src(iTest, a_cBits, true) \
3630 : g_aFpuStI ## a_cBits ## Specials[iTest - cTests]; \
3631 \
3632 for (uint16_t iRounding = 0; iRounding < 4; iRounding++) \
3633 { \
3634 /* PC doesn't influence these, so leave as is. */ \
3635 AssertCompile(X86_FCW_OM_BIT + 1 == X86_FCW_UM_BIT && X86_FCW_UM_BIT + 1 == X86_FCW_PM_BIT); \
3636 for (uint16_t iMask = 0; iMask < 16; iMask += 2 /*1*/) \
3637 { \
3638 uint16_t uFswOut = 0; \
3639 a_iType iOutVal = ~(a_iType)2; \
3640 State.FCW = (fFcw & ~(X86_FCW_RC_MASK | X86_FCW_OM | X86_FCW_UM | X86_FCW_PM)) \
3641 | (iRounding << X86_FCW_RC_SHIFT); \
3642 /*if (iMask & 1) State.FCW ^= X86_FCW_MASK_ALL;*/ \
3643 State.FCW |= (iMask >> 1) << X86_FCW_OM_BIT; \
3644 pfn(&State, &uFswOut, &iOutVal, &InVal); \
3645 RTStrmPrintf(pOutFn, " { %#06x, %#06x, %#06x, %s, %s }, /* #%u/%u/%u */\n", \
3646 State.FCW, State.FSW, uFswOut, GenFormatR80(&InVal), \
3647 GenFormatI ## a_cBits(iOutVal), iTest, iRounding, iMask); \
3648 } \
3649 } \
3650 } \
3651 GenerateArrayEnd(pOutFn, a_aSubTests[iFn].pszName); \
3652 } \
3653}
3654#else
3655# define GEN_FPU_STORE_INT(a_cBits, a_iType, a_szFmt, a_aSubTests, a_TestType)
3656#endif
3657
3658#define TEST_FPU_STORE_INT(a_cBits, a_iType, a_szFmt, a_SubTestType, a_aSubTests, a_TestType) \
3659GEN_FPU_STORE_INT(a_cBits, a_iType, a_szFmt, a_aSubTests, a_TestType) \
3660\
3661static void FpuStI ## a_cBits ## Test(void) \
3662{ \
3663 X86FXSTATE State; \
3664 RT_ZERO(State); \
3665 for (size_t iFn = 0; iFn < RT_ELEMENTS(a_aSubTests); iFn++) \
3666 { \
3667 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(a_aSubTests[iFn])) \
3668 continue; \
3669 \
3670 uint32_t const cTests = *a_aSubTests[iFn].pcTests; \
3671 a_TestType const * const paTests = a_aSubTests[iFn].paTests; \
3672 PFNIEMAIMPLFPUSTR80TOI ## a_cBits pfn = a_aSubTests[iFn].pfn; \
3673 uint32_t const cVars = COUNT_VARIATIONS(a_aSubTests[iFn]); \
3674 if (!cTests) RTTestSkipped(g_hTest, "no tests"); \
3675 for (uint32_t iVar = 0; iVar < cVars; iVar++) \
3676 { \
3677 for (uint32_t iTest = 0; iTest < cTests; iTest++) \
3678 { \
3679 RTFLOAT80U const InVal = paTests[iTest].InVal; \
3680 uint16_t uFswOut = 0; \
3681 a_iType iOutVal = ~(a_iType)2; \
3682 State.FCW = paTests[iTest].fFcw; \
3683 State.FSW = paTests[iTest].fFswIn; \
3684 pfn(&State, &uFswOut, &iOutVal, &InVal); \
3685 if ( uFswOut != paTests[iTest].fFswOut \
3686 || iOutVal != paTests[iTest].iOutVal) \
3687 RTTestFailed(g_hTest, "#%04u%s: fcw=%#06x fsw=%#06x in=%s\n" \
3688 "%s -> fsw=%#06x " a_szFmt "\n" \
3689 "%s expected %#06x " a_szFmt "%s%s (%s)\n", \
3690 iTest, iVar ? "/n" : "", paTests[iTest].fFcw, paTests[iTest].fFswIn, \
3691 FormatR80(&paTests[iTest].InVal), \
3692 iVar ? " " : "", uFswOut, iOutVal, \
3693 iVar ? " " : "", paTests[iTest].fFswOut, paTests[iTest].iOutVal, \
3694 FswDiff(uFswOut, paTests[iTest].fFswOut), \
3695 iOutVal != paTests[iTest].iOutVal ? " - val" : "", FormatFcw(paTests[iTest].fFcw) ); \
3696 } \
3697 pfn = a_aSubTests[iFn].pfnNative; \
3698 } \
3699 } \
3700}
3701
3702//fistt_r80_to_i16 diffs for AMD, of course :-)
3703
3704TEST_FPU_STORE_INT(64, int64_t, "%RI64", FPU_ST_I64_T, g_aFpuStI64, FPU_ST_I64_TEST_T)
3705TEST_FPU_STORE_INT(32, int32_t, "%RI32", FPU_ST_I32_T, g_aFpuStI32, FPU_ST_I32_TEST_T)
3706TEST_FPU_STORE_INT(16, int16_t, "%RI16", FPU_ST_I16_T, g_aFpuStI16, FPU_ST_I16_TEST_T)
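/*
 * fist honours the FCW rounding control when converting, whereas fistt (the
 * SSE3 fisttp worker) always truncates towards zero regardless of RC.  That is
 * why both get their own rows in the tables above, and why the i16 variant has
 * separate AMD/Intel entries (see the note right above).
 */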
3707
3708#ifdef TSTIEMAIMPL_WITH_GENERATOR
3709static void FpuStIntGenerate(PRTSTREAM pOut, PRTSTREAM pOutCpu, uint32_t cTests)
3710{
3711 FpuStI64Generate(pOut, pOutCpu, cTests);
3712 FpuStI32Generate(pOut, pOutCpu, cTests);
3713 FpuStI16Generate(pOut, pOutCpu, cTests);
3714}
3715#endif
3716
3717static void FpuStIntTest(void)
3718{
3719 FpuStI64Test();
3720 FpuStI32Test();
3721 FpuStI16Test();
3722}
3723
3724
3725/*
3726 * Store as packed BCD value (memory).
3727 */
3728typedef IEM_DECL_IMPL_TYPE(void, FNIEMAIMPLFPUSTR80TOD80,(PCX86FXSTATE, uint16_t *, PRTPBCD80U, PCRTFLOAT80U));
3729typedef FNIEMAIMPLFPUSTR80TOD80 *PFNIEMAIMPLFPUSTR80TOD80;
3730TYPEDEF_SUBTEST_TYPE(FPU_ST_D80_T, FPU_ST_D80_TEST_T, PFNIEMAIMPLFPUSTR80TOD80);
3731
3732static FPU_ST_D80_T g_aFpuStD80[] =
3733{
3734 ENTRY(fst_r80_to_d80),
3735};
3736
3737#ifdef TSTIEMAIMPL_WITH_GENERATOR
3738static void FpuStD80Generate(PRTSTREAM pOut, uint32_t cTests)
3739{
3740 static RTFLOAT80U const s_aSpecials[] =
3741 {
3742 RTFLOAT80U_INIT_C(0, 0xde0b6b3a763fffe0, RTFLOAT80U_EXP_BIAS + 59), /* 1 below max */
3743 RTFLOAT80U_INIT_C(1, 0xde0b6b3a763fffe0, RTFLOAT80U_EXP_BIAS + 59), /* 1 above min */
3744 RTFLOAT80U_INIT_C(0, 0xde0b6b3a763ffff0, RTFLOAT80U_EXP_BIAS + 59), /* exact max */
3745 RTFLOAT80U_INIT_C(1, 0xde0b6b3a763ffff0, RTFLOAT80U_EXP_BIAS + 59), /* exact min */
3746 RTFLOAT80U_INIT_C(0, 0xde0b6b3a763fffff, RTFLOAT80U_EXP_BIAS + 59), /* max & all rounded off bits set */
3747 RTFLOAT80U_INIT_C(1, 0xde0b6b3a763fffff, RTFLOAT80U_EXP_BIAS + 59), /* min & all rounded off bits set */
3748 RTFLOAT80U_INIT_C(0, 0xde0b6b3a763ffff8, RTFLOAT80U_EXP_BIAS + 59), /* max & some rounded off bits set */
3749 RTFLOAT80U_INIT_C(1, 0xde0b6b3a763ffff8, RTFLOAT80U_EXP_BIAS + 59), /* min & some rounded off bits set */
3750 RTFLOAT80U_INIT_C(0, 0xde0b6b3a763ffff1, RTFLOAT80U_EXP_BIAS + 59), /* max & some other rounded off bits set */
3751 RTFLOAT80U_INIT_C(1, 0xde0b6b3a763ffff1, RTFLOAT80U_EXP_BIAS + 59), /* min & some other rounded off bits set */
3752 RTFLOAT80U_INIT_C(0, 0xde0b6b3a76400000, RTFLOAT80U_EXP_BIAS + 59), /* 1 above max */
3753 RTFLOAT80U_INIT_C(1, 0xde0b6b3a76400000, RTFLOAT80U_EXP_BIAS + 59), /* 1 below min */
3754 };
3755
3756 X86FXSTATE State;
3757 RT_ZERO(State);
3758 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aFpuStD80); iFn++)
3759 {
3760 GenerateArrayStart(pOut, g_aFpuStD80[iFn].pszName, "FPU_ST_D80_TEST_T");
3761 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aSpecials); iTest += 1)
3762 {
3763 uint16_t const fFcw = RandFcw();
3764 State.FSW = RandFsw();
3765 RTFLOAT80U const InVal = iTest < cTests ? RandR80Src(iTest, 59, true) : s_aSpecials[iTest - cTests];
3766
3767 for (uint16_t iRounding = 0; iRounding < 4; iRounding++)
3768 {
3769 /* PC doesn't influence these, so leave as is. */
3770 AssertCompile(X86_FCW_OM_BIT + 1 == X86_FCW_UM_BIT && X86_FCW_UM_BIT + 1 == X86_FCW_PM_BIT);
3771 for (uint16_t iMask = 0; iMask < 16; iMask += 2 /*1*/)
3772 {
3773 uint16_t uFswOut = 0;
3774 RTPBCD80U OutVal = RTPBCD80U_INIT_ZERO(0);
3775 State.FCW = (fFcw & ~(X86_FCW_RC_MASK | X86_FCW_OM | X86_FCW_UM | X86_FCW_PM))
3776 | (iRounding << X86_FCW_RC_SHIFT);
3777 /*if (iMask & 1) State.FCW ^= X86_FCW_MASK_ALL;*/
3778 State.FCW |= (iMask >> 1) << X86_FCW_OM_BIT;
3779 g_aFpuStD80[iFn].pfn(&State, &uFswOut, &OutVal, &InVal);
3780 RTStrmPrintf(pOut, " { %#06x, %#06x, %#06x, %s, %s }, /* #%u/%u/%u */\n",
3781 State.FCW, State.FSW, uFswOut, GenFormatR80(&InVal),
3782 GenFormatD80(&OutVal), iTest, iRounding, iMask);
3783 }
3784 }
3785 }
3786 GenerateArrayEnd(pOut, g_aFpuStD80[iFn].pszName);
3787 }
3788}
3789#endif
3790
3791
3792static void FpuStD80Test(void)
3793{
3794 X86FXSTATE State;
3795 RT_ZERO(State);
3796 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aFpuStD80); iFn++)
3797 {
3798 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aFpuStD80[iFn]))
3799 continue;
3800
3801 uint32_t const cTests = *g_aFpuStD80[iFn].pcTests;
3802 FPU_ST_D80_TEST_T const * const paTests = g_aFpuStD80[iFn].paTests;
3803 PFNIEMAIMPLFPUSTR80TOD80 pfn = g_aFpuStD80[iFn].pfn;
3804 uint32_t const cVars = COUNT_VARIATIONS(g_aFpuStD80[iFn]);
3805 if (!cTests) RTTestSkipped(g_hTest, "no tests");
3806 for (uint32_t iVar = 0; iVar < cVars; iVar++)
3807 {
3808 for (uint32_t iTest = 0; iTest < cTests; iTest++)
3809 {
3810 RTFLOAT80U const InVal = paTests[iTest].InVal;
3811 uint16_t uFswOut = 0;
3812 RTPBCD80U OutVal = RTPBCD80U_INIT_ZERO(0);
3813 State.FCW = paTests[iTest].fFcw;
3814 State.FSW = paTests[iTest].fFswIn;
3815 pfn(&State, &uFswOut, &OutVal, &InVal);
3816 if ( uFswOut != paTests[iTest].fFswOut
3817 || !RTPBCD80U_ARE_IDENTICAL(&OutVal, &paTests[iTest].OutVal))
3818 RTTestFailed(g_hTest, "#%04u%s: fcw=%#06x fsw=%#06x in=%s\n"
3819 "%s -> fsw=%#06x %s\n"
3820 "%s expected %#06x %s%s%s (%s)\n",
3821 iTest, iVar ? "/n" : "", paTests[iTest].fFcw, paTests[iTest].fFswIn,
3822 FormatR80(&paTests[iTest].InVal),
3823 iVar ? " " : "", uFswOut, FormatD80(&OutVal),
3824 iVar ? " " : "", paTests[iTest].fFswOut, FormatD80(&paTests[iTest].OutVal),
3825 FswDiff(uFswOut, paTests[iTest].fFswOut),
3826 !RTPBCD80U_ARE_IDENTICAL(&OutVal, &paTests[iTest].OutVal) ? " - val" : "",
3827 FormatFcw(paTests[iTest].fFcw) );
3828 }
3829 pfn = g_aFpuStD80[iFn].pfnNative;
3830 }
3831 }
3832}
3833
3834
3835
3836/*********************************************************************************************************************************
3837* x87 FPU Binary Operations *
3838*********************************************************************************************************************************/
3839
3840/*
3841 * Binary FPU operations on two 80-bit floating point values.
3842 */
3843TYPEDEF_SUBTEST_TYPE(FPU_BINARY_R80_T, FPU_BINARY_R80_TEST_T, PFNIEMAIMPLFPUR80);
3844enum { kFpuBinaryHint_fprem = 1, };
3845
3846static FPU_BINARY_R80_T g_aFpuBinaryR80[] =
3847{
3848 ENTRY(fadd_r80_by_r80),
3849 ENTRY(fsub_r80_by_r80),
3850 ENTRY(fsubr_r80_by_r80),
3851 ENTRY(fmul_r80_by_r80),
3852 ENTRY(fdiv_r80_by_r80),
3853 ENTRY(fdivr_r80_by_r80),
3854 ENTRY_EX(fprem_r80_by_r80, kFpuBinaryHint_fprem),
3855 ENTRY_EX(fprem1_r80_by_r80, kFpuBinaryHint_fprem),
3856 ENTRY(fscale_r80_by_r80),
3857 ENTRY_AMD( fpatan_r80_by_r80, 0), // C1 and rounding differs on AMD
3858 ENTRY_INTEL(fpatan_r80_by_r80, 0), // C1 and rounding differs on AMD
3859 ENTRY_AMD( fyl2x_r80_by_r80, 0), // C1 and rounding differs on AMD
3860 ENTRY_INTEL(fyl2x_r80_by_r80, 0), // C1 and rounding differs on AMD
3861 ENTRY_AMD( fyl2xp1_r80_by_r80, 0), // C1 and rounding differs on AMD
3862 ENTRY_INTEL(fyl2xp1_r80_by_r80, 0), // C1 and rounding differs on AMD
3863};
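/*
 * kFpuBinaryHint_fprem flags the partial remainder workers: fprem/fprem1 leave
 * C2 set in the FSW while the reduction is still partial (exponent difference
 * of 64 or more), so the generator below biases its random inputs towards
 * pairs whose exponents differ by at least 64 and, when such a pair goes
 * through without exceptions, replays the worker on its own result until C2
 * clears - mirroring how guest code loops on fprem.
 */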
3864
3865#ifdef TSTIEMAIMPL_WITH_GENERATOR
3866static void FpuBinaryR80Generate(PRTSTREAM pOut, PRTSTREAM pOutCpu, uint32_t cTests)
3867{
3868 cTests = RT_MAX(192, cTests); /* there are 144 standard input variations */
3869
3870 static struct { RTFLOAT80U Val1, Val2; } const s_aSpecials[] =
3871 {
3872 { RTFLOAT80U_INIT_C(1, 0xdd762f07f2e80eef, 30142), /* causes weird overflows with DOWN and NEAR rounding. */
3873 RTFLOAT80U_INIT_C(1, 0xffffffffffffffff, RTFLOAT80U_EXP_MAX - 1) },
3874 { RTFLOAT80U_INIT_ZERO(0), /* causes weird overflows with UP and NEAR rounding when precision is lower than 64. */
3875 RTFLOAT80U_INIT_C(0, 0xffffffffffffffff, RTFLOAT80U_EXP_MAX - 1) },
3876 { RTFLOAT80U_INIT_ZERO(0), /* minus variant */
3877 RTFLOAT80U_INIT_C(1, 0xffffffffffffffff, RTFLOAT80U_EXP_MAX - 1) },
3878 { RTFLOAT80U_INIT_C(0, 0xcef238bb9a0afd86, 577 + RTFLOAT80U_EXP_BIAS), /* for fprem and fprem1, max sequence length */
3879 RTFLOAT80U_INIT_C(0, 0xf11684ec0beaad94, 1 + RTFLOAT80U_EXP_BIAS) },
3880 { RTFLOAT80U_INIT_C(0, 0xffffffffffffffff, -13396 + RTFLOAT80U_EXP_BIAS), /* for fdiv. We missed PE. */
3881 RTFLOAT80U_INIT_C(1, 0xffffffffffffffff, 16383 + RTFLOAT80U_EXP_BIAS) },
3882 { RTFLOAT80U_INIT_C(0, 0x8000000000000000, 1 + RTFLOAT80U_EXP_BIAS), /* for fprem/fprem1 */
3883 RTFLOAT80U_INIT_C(0, 0xe000000000000000, 0 + RTFLOAT80U_EXP_BIAS) },
3884 { RTFLOAT80U_INIT_C(0, 0x8000000000000000, 1 + RTFLOAT80U_EXP_BIAS), /* for fprem/fprem1 */
3885 RTFLOAT80U_INIT_C(0, 0x8000000000000000, 0 + RTFLOAT80U_EXP_BIAS) },
3886 /* fscale: This may seriously increase the exponent, and it turns out overflow and underflow behaviour changes
3887 once RTFLOAT80U_EXP_BIAS_ADJUST is exceeded. */
3888 { RTFLOAT80U_INIT_C(0, 0xffffffffffffffff, RTFLOAT80U_EXP_MAX - 1), /* for fscale: max * 2^1 */
3889 RTFLOAT80U_INIT_C(0, 0x8000000000000000, 0 + RTFLOAT80U_EXP_BIAS) },
3890 { RTFLOAT80U_INIT_C(0, 0xffffffffffffffff, RTFLOAT80U_EXP_MAX - 1), /* for fscale: max * 2^64 */
3891 RTFLOAT80U_INIT_C(0, 0x8000000000000000, 6 + RTFLOAT80U_EXP_BIAS) },
3892 { RTFLOAT80U_INIT_C(0, 0xffffffffffffffff, RTFLOAT80U_EXP_MAX - 1), /* for fscale: max * 2^1024 */
3893 RTFLOAT80U_INIT_C(0, 0x8000000000000000, 10 + RTFLOAT80U_EXP_BIAS) },
3894 { RTFLOAT80U_INIT_C(0, 0xffffffffffffffff, RTFLOAT80U_EXP_MAX - 1), /* for fscale: max * 2^4096 */
3895 RTFLOAT80U_INIT_C(0, 0x8000000000000000, 12 + RTFLOAT80U_EXP_BIAS) },
3896 { RTFLOAT80U_INIT_C(0, 0xffffffffffffffff, RTFLOAT80U_EXP_MAX - 1), /* for fscale: max * 2^16384 */
3897 RTFLOAT80U_INIT_C(0, 0x8000000000000000, 14 + RTFLOAT80U_EXP_BIAS) }, /* resulting exponent: 49150 */
3898 { RTFLOAT80U_INIT_C(0, 0xffffffffffffffff, RTFLOAT80U_EXP_MAX - 1), /* for fscale: max * 2^24576 (RTFLOAT80U_EXP_BIAS_ADJUST) */
3899 RTFLOAT80U_INIT_C(0, 0xc000000000000000, 14 + RTFLOAT80U_EXP_BIAS) }, /* resulting exponent: 57342 - within 10980XE range */
3900 { RTFLOAT80U_INIT_C(0, 0xffffffffffffffff, RTFLOAT80U_EXP_MAX - 1), /* for fscale: max * 2^24577 */
3901 RTFLOAT80U_INIT_C(0, 0xc002000000000000, 14 + RTFLOAT80U_EXP_BIAS) }, /* resulting exponent: 57343 - outside 10980XE range, behaviour changes! */
3902 { RTFLOAT80U_INIT_C(0, 0xffffffffffffffff, RTFLOAT80U_EXP_MAX - 1), /* for fscale: max * 2^32768 - result is within range on 10980XE */
3903 RTFLOAT80U_INIT_C(0, 0x8000000000000000, 15 + RTFLOAT80U_EXP_BIAS) }, /* resulting exponent: 65534 */
3904 { RTFLOAT80U_INIT_C(0, 0xffffffffffffffff, RTFLOAT80U_EXP_MAX - 1), /* for fscale: max * 2^65536 */
3905 RTFLOAT80U_INIT_C(0, 0x8000000000000000, 16 + RTFLOAT80U_EXP_BIAS) },
3906 { RTFLOAT80U_INIT_C(0, 0xffffffffffffffff, RTFLOAT80U_EXP_MAX - 1), /* for fscale: max * 2^1048576 */
3907 RTFLOAT80U_INIT_C(0, 0x8000000000000000, 20 + RTFLOAT80U_EXP_BIAS) },
3908 { RTFLOAT80U_INIT_C(0, 0xffffffffffffffff, RTFLOAT80U_EXP_MAX - 1), /* for fscale: max * 2^16777216 */
3909 RTFLOAT80U_INIT_C(0, 0x8000000000000000, 24 + RTFLOAT80U_EXP_BIAS) },
3910 { RTFLOAT80U_INIT_C(0, 0x8000000000000000, 1), /* for fscale: min * 2^-24576 (RTFLOAT80U_EXP_BIAS_ADJUST) */
3911 RTFLOAT80U_INIT_C(1, 0xc000000000000000, 14 + RTFLOAT80U_EXP_BIAS) }, /* resulting exponent: -24575 - within 10980XE range */
3912 { RTFLOAT80U_INIT_C(0, 0x8000000000000000, 1), /* for fscale: min * 2^-24577 (RTFLOAT80U_EXP_BIAS_ADJUST) */
3913 RTFLOAT80U_INIT_C(1, 0xc002000000000000, 14 + RTFLOAT80U_EXP_BIAS) }, /* resulting exponent: -24576 - outside 10980XE range, behaviour changes! */
3914 /* fscale: Negative variants for the essentials of the above. */
3915 { RTFLOAT80U_INIT_C(1, 0xffffffffffffffff, RTFLOAT80U_EXP_MAX - 1), /* for fscale: max * 2^24576 (RTFLOAT80U_EXP_BIAS_ADJUST) */
3916 RTFLOAT80U_INIT_C(0, 0xc000000000000000, 14 + RTFLOAT80U_EXP_BIAS) }, /* resulting exponent: 57342 - within 10980XE range */
3917 { RTFLOAT80U_INIT_C(1, 0xffffffffffffffff, RTFLOAT80U_EXP_MAX - 1), /* for fscale: max * 2^24577 */
3918 RTFLOAT80U_INIT_C(0, 0xc002000000000000, 14 + RTFLOAT80U_EXP_BIAS) }, /* resulting exponent: 57343 - outside 10980XE range, behaviour changes! */
3919 { RTFLOAT80U_INIT_C(1, 0x8000000000000000, 1), /* for fscale: min * 2^-24576 (RTFLOAT80U_EXP_BIAS_ADJUST) */
3920 RTFLOAT80U_INIT_C(1, 0xc000000000000000, 14 + RTFLOAT80U_EXP_BIAS) }, /* resulting exponent: -57342 - within 10980XE range */
3921 { RTFLOAT80U_INIT_C(1, 0x8000000000000000, 1), /* for fscale: min * 2^-24577 (RTFLOAT80U_EXP_BIAS_ADJUST) */
3922 RTFLOAT80U_INIT_C(1, 0xc002000000000000, 14 + RTFLOAT80U_EXP_BIAS) }, /* resulting exponent: -57343 - outside 10980XE range, behaviour changes! */
3923 /* fscale: Some fun with denormals and pseudo-denormals. */
3924 { RTFLOAT80U_INIT_C(0, 0x0800000000000000, 0), /* for fscale: max * 2^-4 */
3925 RTFLOAT80U_INIT_C(1, 0x8000000000000000, 2 + RTFLOAT80U_EXP_BIAS) },
3926 { RTFLOAT80U_INIT_C(0, 0x0800000000000000, 0), /* for fscale: max * 2^+1 */
3927 RTFLOAT80U_INIT_C(0, 0x8000000000000000, 0 + RTFLOAT80U_EXP_BIAS) },
3928 { RTFLOAT80U_INIT_C(0, 0x0800000000000000, 0), RTFLOAT80U_INIT_ZERO(0) }, /* for fscale: max * 2^+0 */
3929 { RTFLOAT80U_INIT_C(0, 0x0000000000000008, 0), /* for fscale: max * 2^-4 => underflow */
3930 RTFLOAT80U_INIT_C(1, 0x8000000000000000, 2 + RTFLOAT80U_EXP_BIAS) },
3931 { RTFLOAT80U_INIT_C(0, 0x8005000300020001, 0), RTFLOAT80U_INIT_ZERO(0) }, /* pseudo-normal number * 2^+0. */
3932 { RTFLOAT80U_INIT_C(1, 0x8005000300020001, 0), RTFLOAT80U_INIT_ZERO(0) }, /* pseudo-normal number * 2^+0. */
3933 { RTFLOAT80U_INIT_C(0, 0x8005000300020001, 0), /* pseudo-normal number * 2^-4 */
3934 RTFLOAT80U_INIT_C(1, 0x8000000000000000, 2 + RTFLOAT80U_EXP_BIAS) },
3935 { RTFLOAT80U_INIT_C(0, 0x8005000300020001, 0), /* pseudo-normal number * 2^+0 */
3936 RTFLOAT80U_INIT_C(0, 0x8000000000000000, 0 + RTFLOAT80U_EXP_BIAS) },
3937 { RTFLOAT80U_INIT_C(0, 0x8005000300020001, 0), /* pseudo-normal number * 2^+1 */
3938 RTFLOAT80U_INIT_C(0, 0x8000000000000000, 1 + RTFLOAT80U_EXP_BIAS) },
3939 };
3940
3941 X86FXSTATE State;
3942 RT_ZERO(State);
3943 uint32_t cMinNormalPairs = (cTests - 144) / 4;
3944 uint32_t cMinTargetRangeInputs = cMinNormalPairs / 2;
3945 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aFpuBinaryR80); iFn++)
3946 {
3947 PFNIEMAIMPLFPUR80 const pfn = g_aFpuBinaryR80[iFn].pfnNative ? g_aFpuBinaryR80[iFn].pfnNative : g_aFpuBinaryR80[iFn].pfn;
3948 PRTSTREAM pOutFn = pOut;
3949 if (g_aFpuBinaryR80[iFn].idxCpuEflFlavour != IEMTARGETCPU_EFL_BEHAVIOR_NATIVE)
3950 {
3951 if (g_aFpuBinaryR80[iFn].idxCpuEflFlavour != g_idxCpuEflFlavour)
3952 continue;
3953 pOutFn = pOutCpu;
3954 }
3955
3956 GenerateArrayStart(pOutFn, g_aFpuBinaryR80[iFn].pszName, "FPU_BINARY_R80_TEST_T");
3957 uint32_t iTestOutput = 0;
3958 uint32_t cNormalInputPairs = 0;
3959 uint32_t cTargetRangeInputs = 0;
3960 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aSpecials); iTest += 1)
3961 {
3962 RTFLOAT80U InVal1 = iTest < cTests ? RandR80Src1(iTest) : s_aSpecials[iTest - cTests].Val1;
3963 RTFLOAT80U InVal2 = iTest < cTests ? RandR80Src2(iTest) : s_aSpecials[iTest - cTests].Val2;
3964 bool fTargetRange = false;
3965 if (RTFLOAT80U_IS_NORMAL(&InVal1) && RTFLOAT80U_IS_NORMAL(&InVal2))
3966 {
3967 cNormalInputPairs++;
3968 if ( g_aFpuBinaryR80[iFn].uExtra == kFpuBinaryHint_fprem
3969 && (uint32_t)InVal1.s.uExponent - (uint32_t)InVal2.s.uExponent - (uint32_t)64 <= (uint32_t)512)
3970 cTargetRangeInputs += fTargetRange = true;
3971 else if (cTargetRangeInputs < cMinTargetRangeInputs && iTest < cTests)
3972 if (g_aFpuBinaryR80[iFn].uExtra == kFpuBinaryHint_fprem)
3973 { /* The aim is two values with an exponent difference between 64 and 640 so we can do the whole sequence. */
3974 InVal2.s.uExponent = RTRandU32Ex(1, RTFLOAT80U_EXP_MAX - 66);
3975 InVal1.s.uExponent = RTRandU32Ex(InVal2.s.uExponent + 64, RT_MIN(InVal2.s.uExponent + 512, RTFLOAT80U_EXP_MAX - 1));
3976 cTargetRangeInputs += fTargetRange = true;
3977 }
3978 }
3979 else if (cNormalInputPairs < cMinNormalPairs && iTest + cMinNormalPairs >= cTests && iTest < cTests)
3980 {
3981 iTest -= 1;
3982 continue;
3983 }
3984
3985 uint16_t const fFcwExtra = 0;
3986 uint16_t const fFcw = RandFcw();
3987 State.FSW = RandFsw();
3988
3989 for (uint16_t iRounding = 0; iRounding < 4; iRounding++)
3990 for (uint16_t iPrecision = 0; iPrecision < 4; iPrecision++)
3991 {
3992 State.FCW = (fFcw & ~(X86_FCW_RC_MASK | X86_FCW_PC_MASK | X86_FCW_MASK_ALL))
3993 | (iRounding << X86_FCW_RC_SHIFT)
3994 | (iPrecision << X86_FCW_PC_SHIFT)
3995 | X86_FCW_MASK_ALL;
3996 IEMFPURESULT ResM = { RTFLOAT80U_INIT(0, 0, 0), 0 };
3997 pfn(&State, &ResM, &InVal1, &InVal2);
3998 RTStrmPrintf(pOutFn, " { %#06x, %#06x, %#06x, %s, %s, %s }, /* #%u/%u/%u/m = #%u */\n",
3999 State.FCW | fFcwExtra, State.FSW, ResM.FSW, GenFormatR80(&InVal1), GenFormatR80(&InVal2),
4000 GenFormatR80(&ResM.r80Result), iTest, iRounding, iPrecision, iTestOutput++);
4001
4002 State.FCW = State.FCW & ~X86_FCW_MASK_ALL;
4003 IEMFPURESULT ResU = { RTFLOAT80U_INIT(0, 0, 0), 0 };
4004 pfn(&State, &ResU, &InVal1, &InVal2);
4005 RTStrmPrintf(pOutFn, " { %#06x, %#06x, %#06x, %s, %s, %s }, /* #%u/%u/%u/u = #%u */\n",
4006 State.FCW | fFcwExtra, State.FSW, ResU.FSW, GenFormatR80(&InVal1), GenFormatR80(&InVal2),
4007 GenFormatR80(&ResU.r80Result), iTest, iRounding, iPrecision, iTestOutput++);
4008
4009 uint16_t fXcpt = (ResM.FSW | ResU.FSW) & X86_FSW_XCPT_MASK & ~X86_FSW_SF;
4010 if (fXcpt)
4011 {
4012 State.FCW = (State.FCW & ~X86_FCW_MASK_ALL) | fXcpt;
4013 IEMFPURESULT Res1 = { RTFLOAT80U_INIT(0, 0, 0), 0 };
4014 pfn(&State, &Res1, &InVal1, &InVal2);
4015 RTStrmPrintf(pOutFn, " { %#06x, %#06x, %#06x, %s, %s, %s }, /* #%u/%u/%u/%#x = #%u */\n",
4016 State.FCW | fFcwExtra, State.FSW, Res1.FSW, GenFormatR80(&InVal1), GenFormatR80(&InVal2),
4017 GenFormatR80(&Res1.r80Result), iTest, iRounding, iPrecision, fXcpt, iTestOutput++);
4018 if (((Res1.FSW & X86_FSW_XCPT_MASK) & fXcpt) != (Res1.FSW & X86_FSW_XCPT_MASK))
4019 {
4020 fXcpt |= Res1.FSW & X86_FSW_XCPT_MASK;
4021 State.FCW = (State.FCW & ~X86_FCW_MASK_ALL) | fXcpt;
4022 IEMFPURESULT Res2 = { RTFLOAT80U_INIT(0, 0, 0), 0 };
4023 pfn(&State, &Res2, &InVal1, &InVal2);
4024 RTStrmPrintf(pOutFn, " { %#06x, %#06x, %#06x, %s, %s, %s }, /* #%u/%u/%u/%#x[!] = #%u */\n",
4025 State.FCW | fFcwExtra, State.FSW, Res2.FSW, GenFormatR80(&InVal1), GenFormatR80(&InVal2),
4026 GenFormatR80(&Res2.r80Result), iTest, iRounding, iPrecision, fXcpt, iTestOutput++);
4027 }
4028 if (!RT_IS_POWER_OF_TWO(fXcpt))
4029 for (uint16_t fUnmasked = 1; fUnmasked <= X86_FCW_PM; fUnmasked <<= 1)
4030 if (fUnmasked & fXcpt)
4031 {
4032 State.FCW = (State.FCW & ~X86_FCW_MASK_ALL) | (fXcpt & ~fUnmasked);
4033 IEMFPURESULT Res3 = { RTFLOAT80U_INIT(0, 0, 0), 0 };
4034 pfn(&State, &Res3, &InVal1, &InVal2);
4035 RTStrmPrintf(pOutFn, " { %#06x, %#06x, %#06x, %s, %s, %s }, /* #%u/%u/%u/u%#x = #%u */\n",
4036 State.FCW | fFcwExtra, State.FSW, Res3.FSW, GenFormatR80(&InVal1), GenFormatR80(&InVal2),
4037 GenFormatR80(&Res3.r80Result), iTest, iRounding, iPrecision, fUnmasked, iTestOutput++);
4038 }
4039 }
4040
4041 /* If the values are in range and caused no exceptions, do the whole series of
4042 partial remainders until we get the non-partial one or run into an exception. */
4043 if (fTargetRange && fXcpt == 0 && g_aFpuBinaryR80[iFn].uExtra == kFpuBinaryHint_fprem)
4044 {
4045 IEMFPURESULT ResPrev = ResM;
4046 for (unsigned i = 0; i < 32 && (ResPrev.FSW & (X86_FSW_C2 | X86_FSW_XCPT_MASK)) == X86_FSW_C2; i++)
4047 {
4048 State.FCW = State.FCW | X86_FCW_MASK_ALL;
4049 State.FSW = ResPrev.FSW;
4050 IEMFPURESULT ResSeq = { RTFLOAT80U_INIT(0, 0, 0), 0 };
4051 pfn(&State, &ResSeq, &ResPrev.r80Result, &InVal2);
4052 RTStrmPrintf(pOutFn, " { %#06x, %#06x, %#06x, %s, %s, %s }, /* #%u/%u/%u/seq%u = #%u */\n",
4053 State.FCW | fFcwExtra, State.FSW, ResSeq.FSW, GenFormatR80(&ResPrev.r80Result),
4054 GenFormatR80(&InVal2), GenFormatR80(&ResSeq.r80Result),
4055 iTest, iRounding, iPrecision, i + 1, iTestOutput++);
4056 ResPrev = ResSeq;
4057 }
4058 }
4059 }
4060 }
4061 GenerateArrayEnd(pOutFn, g_aFpuBinaryR80[iFn].pszName);
4062 }
4063}
4064#endif
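/*
 * FCW coverage strategy of the generator above, per input pair: one run with
 * every exception masked (the /m row), one with everything unmasked (/u), and,
 * if anything was raised, a run with just the raised exceptions masked plus
 * runs that unmask the raised exceptions one at a time.  This gives test data
 * for both the quiet (masked) results and the unmasked reporting paths.
 */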
4065
4066
4067static void FpuBinaryR80Test(void)
4068{
4069 X86FXSTATE State;
4070 RT_ZERO(State);
4071 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aFpuBinaryR80); iFn++)
4072 {
4073 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aFpuBinaryR80[iFn]))
4074 continue;
4075
4076 uint32_t const cTests = *g_aFpuBinaryR80[iFn].pcTests;
4077 FPU_BINARY_R80_TEST_T const * const paTests = g_aFpuBinaryR80[iFn].paTests;
4078 PFNIEMAIMPLFPUR80 pfn = g_aFpuBinaryR80[iFn].pfn;
4079 uint32_t const cVars = COUNT_VARIATIONS(g_aFpuBinaryR80[iFn]);
4080 if (!cTests) RTTestSkipped(g_hTest, "no tests");
4081 for (uint32_t iVar = 0; iVar < cVars; iVar++)
4082 {
4083 for (uint32_t iTest = 0; iTest < cTests; iTest++)
4084 {
4085 RTFLOAT80U const InVal1 = paTests[iTest].InVal1;
4086 RTFLOAT80U const InVal2 = paTests[iTest].InVal2;
4087 IEMFPURESULT Res = { RTFLOAT80U_INIT(0, 0, 0), 0 };
4088 State.FCW = paTests[iTest].fFcw;
4089 State.FSW = paTests[iTest].fFswIn;
4090 pfn(&State, &Res, &InVal1, &InVal2);
4091 if ( Res.FSW != paTests[iTest].fFswOut
4092 || !RTFLOAT80U_ARE_IDENTICAL(&Res.r80Result, &paTests[iTest].OutVal))
4093 RTTestFailed(g_hTest, "#%04u%s: fcw=%#06x fsw=%#06x in1=%s in2=%s\n"
4094 "%s -> fsw=%#06x %s\n"
4095 "%s expected %#06x %s%s%s (%s)\n",
4096 iTest, iVar ? "/n" : "", paTests[iTest].fFcw, paTests[iTest].fFswIn,
4097 FormatR80(&paTests[iTest].InVal1), FormatR80(&paTests[iTest].InVal2),
4098 iVar ? " " : "", Res.FSW, FormatR80(&Res.r80Result),
4099 iVar ? " " : "", paTests[iTest].fFswOut, FormatR80(&paTests[iTest].OutVal),
4100 FswDiff(Res.FSW, paTests[iTest].fFswOut),
4101 !RTFLOAT80U_ARE_IDENTICAL(&Res.r80Result, &paTests[iTest].OutVal) ? " - val" : "",
4102 FormatFcw(paTests[iTest].fFcw) );
4103 }
4104 pfn = g_aFpuBinaryR80[iFn].pfnNative;
4105 }
4106 }
4107}
4108
4109
4110/*
4111 * Binary FPU operations on one 80-bit floating point value and one 64-bit or 32-bit one.
4112 * Binary FPU operations on one 80-bit floating point value and one 64-bit/32-bit floating point or 32-bit/16-bit integer value.
4113#define int64_t_IS_NORMAL(a) 1
4114#define int32_t_IS_NORMAL(a) 1
4115#define int16_t_IS_NORMAL(a) 1
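/*
 * The three defines above let the shared GEN_/TEST_FPU_BINARY_* macros apply
 * a_Type2##_IS_NORMAL() uniformly: for integer second operands every value
 * counts as "normal", so only the r80 side decides whether an input pair is
 * kept as a normal/normal pair.
 */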
4116
4117#ifdef TSTIEMAIMPL_WITH_GENERATOR
4118static struct { RTFLOAT80U Val1; RTFLOAT64U Val2; } const s_aFpuBinaryR64Specials[] =
4119{
4120 { RTFLOAT80U_INIT_C(0, 0xffffeeeeddddcccc, RTFLOAT80U_EXP_BIAS),
4121 RTFLOAT64U_INIT_C(0, 0xfeeeeddddcccc, RTFLOAT64U_EXP_BIAS) }, /* whatever */
4122};
4123static struct { RTFLOAT80U Val1; RTFLOAT32U Val2; } const s_aFpuBinaryR32Specials[] =
4124{
4125 { RTFLOAT80U_INIT_C(0, 0xffffeeeeddddcccc, RTFLOAT80U_EXP_BIAS),
4126 RTFLOAT32U_INIT_C(0, 0x7fffee, RTFLOAT32U_EXP_BIAS) }, /* whatever */
4127};
4128static struct { RTFLOAT80U Val1; int32_t Val2; } const s_aFpuBinaryI32Specials[] =
4129{
4130 { RTFLOAT80U_INIT_C(0, 0xffffeeeeddddcccc, RTFLOAT80U_EXP_BIAS), INT32_MAX }, /* whatever */
4131};
4132static struct { RTFLOAT80U Val1; int16_t Val2; } const s_aFpuBinaryI16Specials[] =
4133{
4134 { RTFLOAT80U_INIT_C(0, 0xffffeeeeddddcccc, RTFLOAT80U_EXP_BIAS), INT16_MAX }, /* whatever */
4135};
4136
4137# define GEN_FPU_BINARY_SMALL(a_fIntType, a_cBits, a_LoBits, a_UpBits, a_Type2, a_aSubTests, a_TestType) \
4138static void FpuBinary ## a_UpBits ## Generate(PRTSTREAM pOut, uint32_t cTests) \
4139{ \
4140 cTests = RT_MAX(160, cTests); /* there are 144 standard input variations for r80 by r80 */ \
4141 \
4142 X86FXSTATE State; \
4143 RT_ZERO(State); \
4144 uint32_t cMinNormalPairs = (cTests - 144) / 4; \
4145 for (size_t iFn = 0; iFn < RT_ELEMENTS(a_aSubTests); iFn++) \
4146 { \
4147 GenerateArrayStart(pOut, a_aSubTests[iFn].pszName, #a_TestType); \
4148 uint32_t cNormalInputPairs = 0; \
4149 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aFpuBinary ## a_UpBits ## Specials); iTest += 1) \
4150 { \
4151 RTFLOAT80U const InVal1 = iTest < cTests ? RandR80Src1(iTest, a_cBits, a_fIntType) \
4152 : s_aFpuBinary ## a_UpBits ## Specials[iTest - cTests].Val1; \
4153 a_Type2 const InVal2 = iTest < cTests ? Rand ## a_UpBits ## Src2(iTest) \
4154 : s_aFpuBinary ## a_UpBits ## Specials[iTest - cTests].Val2; \
4155 if (RTFLOAT80U_IS_NORMAL(&InVal1) && a_Type2 ## _IS_NORMAL(&InVal2)) \
4156 cNormalInputPairs++; \
4157 else if (cNormalInputPairs < cMinNormalPairs && iTest + cMinNormalPairs >= cTests && iTest < cTests) \
4158 { \
4159 iTest -= 1; \
4160 continue; \
4161 } \
4162 \
4163 uint16_t const fFcw = RandFcw(); \
4164 State.FSW = RandFsw(); \
4165 \
4166 for (uint16_t iRounding = 0; iRounding < 4; iRounding++) \
4167 { \
4168 for (uint16_t iPrecision = 0; iPrecision < 4; iPrecision++) \
4169 { \
4170 for (uint16_t iMask = 0; iMask <= X86_FCW_MASK_ALL; iMask += X86_FCW_MASK_ALL) \
4171 { \
4172 State.FCW = (fFcw & ~(X86_FCW_RC_MASK | X86_FCW_PC_MASK | X86_FCW_MASK_ALL)) \
4173 | (iRounding << X86_FCW_RC_SHIFT) \
4174 | (iPrecision << X86_FCW_PC_SHIFT) \
4175 | iMask; \
4176 IEMFPURESULT Res = { RTFLOAT80U_INIT(0, 0, 0), 0 }; \
4177 a_aSubTests[iFn].pfn(&State, &Res, &InVal1, &InVal2); \
4178 RTStrmPrintf(pOut, " { %#06x, %#06x, %#06x, %s, %s, %s }, /* #%u/%u/%u/%c */\n", \
4179 State.FCW, State.FSW, Res.FSW, GenFormatR80(&InVal1), GenFormat ## a_UpBits(&InVal2), \
4180 GenFormatR80(&Res.r80Result), iTest, iRounding, iPrecision, iMask ? 'c' : 'u'); \
4181 } \
4182 } \
4183 } \
4184 } \
4185 GenerateArrayEnd(pOut, a_aSubTests[iFn].pszName); \
4186 } \
4187}
4188#else
4189# define GEN_FPU_BINARY_SMALL(a_fIntType, a_cBits, a_LoBits, a_UpBits, a_Type2, a_aSubTests, a_TestType)
4190#endif
4191
4192#define TEST_FPU_BINARY_SMALL(a_fIntType, a_cBits, a_LoBits, a_UpBits, a_I, a_Type2, a_SubTestType, a_aSubTests, a_TestType) \
4193TYPEDEF_SUBTEST_TYPE(a_SubTestType, a_TestType, PFNIEMAIMPLFPU ## a_UpBits); \
4194\
4195static a_SubTestType a_aSubTests[] = \
4196{ \
4197 ENTRY(RT_CONCAT4(f, a_I, add_r80_by_, a_LoBits)), \
4198 ENTRY(RT_CONCAT4(f, a_I, mul_r80_by_, a_LoBits)), \
4199 ENTRY(RT_CONCAT4(f, a_I, sub_r80_by_, a_LoBits)), \
4200 ENTRY(RT_CONCAT4(f, a_I, subr_r80_by_, a_LoBits)), \
4201 ENTRY(RT_CONCAT4(f, a_I, div_r80_by_, a_LoBits)), \
4202 ENTRY(RT_CONCAT4(f, a_I, divr_r80_by_, a_LoBits)), \
4203}; \
4204\
4205GEN_FPU_BINARY_SMALL(a_fIntType, a_cBits, a_LoBits, a_UpBits, a_Type2, a_aSubTests, a_TestType) \
4206\
4207static void FpuBinary ## a_UpBits ## Test(void) \
4208{ \
4209 X86FXSTATE State; \
4210 RT_ZERO(State); \
4211 for (size_t iFn = 0; iFn < RT_ELEMENTS(a_aSubTests); iFn++) \
4212 { \
4213 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(a_aSubTests[iFn])) \
4214 continue; \
4215 \
4216 uint32_t const cTests = *a_aSubTests[iFn].pcTests; \
4217 a_TestType const * const paTests = a_aSubTests[iFn].paTests; \
4218 PFNIEMAIMPLFPU ## a_UpBits pfn = a_aSubTests[iFn].pfn; \
4219 uint32_t const cVars = COUNT_VARIATIONS(a_aSubTests[iFn]); \
4220 if (!cTests) RTTestSkipped(g_hTest, "no tests"); \
4221 for (uint32_t iVar = 0; iVar < cVars; iVar++) \
4222 { \
4223 for (uint32_t iTest = 0; iTest < cTests; iTest++) \
4224 { \
4225 RTFLOAT80U const InVal1 = paTests[iTest].InVal1; \
4226 a_Type2 const InVal2 = paTests[iTest].InVal2; \
4227 IEMFPURESULT Res = { RTFLOAT80U_INIT(0, 0, 0), 0 }; \
4228 State.FCW = paTests[iTest].fFcw; \
4229 State.FSW = paTests[iTest].fFswIn; \
4230 pfn(&State, &Res, &InVal1, &InVal2); \
4231 if ( Res.FSW != paTests[iTest].fFswOut \
4232 || !RTFLOAT80U_ARE_IDENTICAL(&Res.r80Result, &paTests[iTest].OutVal)) \
4233 RTTestFailed(g_hTest, "#%04u%s: fcw=%#06x fsw=%#06x in1=%s in2=%s\n" \
4234 "%s -> fsw=%#06x %s\n" \
4235 "%s expected %#06x %s%s%s (%s)\n", \
4236 iTest, iVar ? "/n" : "", paTests[iTest].fFcw, paTests[iTest].fFswIn, \
4237 FormatR80(&paTests[iTest].InVal1), Format ## a_UpBits(&paTests[iTest].InVal2), \
4238 iVar ? " " : "", Res.FSW, FormatR80(&Res.r80Result), \
4239 iVar ? " " : "", paTests[iTest].fFswOut, FormatR80(&paTests[iTest].OutVal), \
4240 FswDiff(Res.FSW, paTests[iTest].fFswOut), \
4241 !RTFLOAT80U_ARE_IDENTICAL(&Res.r80Result, &paTests[iTest].OutVal) ? " - val" : "", \
4242 FormatFcw(paTests[iTest].fFcw) ); \
4243 } \
4244 pfn = a_aSubTests[iFn].pfnNative; \
4245 } \
4246 } \
4247}
4248
4249TEST_FPU_BINARY_SMALL(0, 64, r64, R64, RT_NOTHING, RTFLOAT64U, FPU_BINARY_R64_T, g_aFpuBinaryR64, FPU_BINARY_R64_TEST_T)
4250TEST_FPU_BINARY_SMALL(0, 32, r32, R32, RT_NOTHING, RTFLOAT32U, FPU_BINARY_R32_T, g_aFpuBinaryR32, FPU_BINARY_R32_TEST_T)
4251TEST_FPU_BINARY_SMALL(1, 32, i32, I32, i, int32_t, FPU_BINARY_I32_T, g_aFpuBinaryI32, FPU_BINARY_I32_TEST_T)
4252TEST_FPU_BINARY_SMALL(1, 16, i16, I16, i, int16_t, FPU_BINARY_I16_T, g_aFpuBinaryI16, FPU_BINARY_I16_TEST_T)
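/*
 * Naming note: the a_I parameter above only supplies the 'i' infix for the
 * integer forms, so RT_CONCAT4(f, a_I, add_r80_by_, a_LoBits) yields e.g.
 * fadd_r80_by_r64 for the first instantiation and fiadd_r80_by_i32 for the
 * third one.
 */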
4253
4254
4255/*
4256 * Binary operations on an 80-bit value and an 80-/64-/32-bit floating point or 32-/16-bit integer value, only affecting FSW.
4257 */
4258#ifdef TSTIEMAIMPL_WITH_GENERATOR
4259static struct { RTFLOAT80U Val1, Val2; } const s_aFpuBinaryFswR80Specials[] =
4260{
4261 { RTFLOAT80U_INIT_C(0, 0xffffeeeeddddcccc, RTFLOAT80U_EXP_BIAS),
4262 RTFLOAT80U_INIT_C(0, 0xffffeeeeddddcccc, RTFLOAT80U_EXP_BIAS) }, /* whatever */
4263};
4264static struct { RTFLOAT80U Val1; RTFLOAT64U Val2; } const s_aFpuBinaryFswR64Specials[] =
4265{
4266 { RTFLOAT80U_INIT_C(0, 0xffffeeeeddddcccc, RTFLOAT80U_EXP_BIAS),
4267 RTFLOAT64U_INIT_C(0, 0xfeeeeddddcccc, RTFLOAT64U_EXP_BIAS) }, /* whatever */
4268};
4269static struct { RTFLOAT80U Val1; RTFLOAT32U Val2; } const s_aFpuBinaryFswR32Specials[] =
4270{
4271 { RTFLOAT80U_INIT_C(0, 0xffffeeeeddddcccc, RTFLOAT80U_EXP_BIAS),
4272 RTFLOAT32U_INIT_C(0, 0x7fffee, RTFLOAT32U_EXP_BIAS) }, /* whatever */
4273};
4274static struct { RTFLOAT80U Val1; int32_t Val2; } const s_aFpuBinaryFswI32Specials[] =
4275{
4276 { RTFLOAT80U_INIT_C(0, 0xffffeeeeddddcccc, RTFLOAT80U_EXP_BIAS), INT32_MAX }, /* whatever */
4277};
4278static struct { RTFLOAT80U Val1; int16_t Val2; } const s_aFpuBinaryFswI16Specials[] =
4279{
4280 { RTFLOAT80U_INIT_C(0, 0xffffeeeeddddcccc, RTFLOAT80U_EXP_BIAS), INT16_MAX }, /* whatever */
4281};
4282
4283# define GEN_FPU_BINARY_FSW(a_fIntType, a_cBits, a_UpBits, a_Type2, a_aSubTests, a_TestType) \
4284static void FpuBinaryFsw ## a_UpBits ## Generate(PRTSTREAM pOut, uint32_t cTests) \
4285{ \
4286 cTests = RT_MAX(160, cTests); /* there are 144 standard input variations for r80 by r80 */ \
4287 \
4288 X86FXSTATE State; \
4289 RT_ZERO(State); \
4290 uint32_t cMinNormalPairs = (cTests - 144) / 4; \
4291 for (size_t iFn = 0; iFn < RT_ELEMENTS(a_aSubTests); iFn++) \
4292 { \
4293 GenerateArrayStart(pOut, a_aSubTests[iFn].pszName, #a_TestType); \
4294 uint32_t cNormalInputPairs = 0; \
4295 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aFpuBinaryFsw ## a_UpBits ## Specials); iTest += 1) \
4296 { \
4297 RTFLOAT80U const InVal1 = iTest < cTests ? RandR80Src1(iTest, a_cBits, a_fIntType) \
4298 : s_aFpuBinaryFsw ## a_UpBits ## Specials[iTest - cTests].Val1; \
4299 a_Type2 const InVal2 = iTest < cTests ? Rand ## a_UpBits ## Src2(iTest) \
4300 : s_aFpuBinaryFsw ## a_UpBits ## Specials[iTest - cTests].Val2; \
4301 if (RTFLOAT80U_IS_NORMAL(&InVal1) && a_Type2 ## _IS_NORMAL(&InVal2)) \
4302 cNormalInputPairs++; \
4303 else if (cNormalInputPairs < cMinNormalPairs && iTest + cMinNormalPairs >= cTests && iTest < cTests) \
4304 { \
4305 iTest -= 1; \
4306 continue; \
4307 } \
4308 \
4309 uint16_t const fFcw = RandFcw(); \
4310 State.FSW = RandFsw(); \
4311 \
4312 /* Guess these aren't affected by precision or rounding, so just flip the exception mask. */ \
4313 for (uint16_t iMask = 0; iMask <= X86_FCW_MASK_ALL; iMask += X86_FCW_MASK_ALL) \
4314 { \
4315 State.FCW = (fFcw & ~(X86_FCW_MASK_ALL)) | iMask; \
4316 uint16_t fFswOut = 0; \
4317 a_aSubTests[iFn].pfn(&State, &fFswOut, &InVal1, &InVal2); \
4318 RTStrmPrintf(pOut, " { %#06x, %#06x, %#06x, %s, %s }, /* #%u/%c */\n", \
4319 State.FCW, State.FSW, fFswOut, GenFormatR80(&InVal1), GenFormat ## a_UpBits(&InVal2), \
4320 iTest, iMask ? 'c' : 'u'); \
4321 } \
4322 } \
4323 GenerateArrayEnd(pOut, a_aSubTests[iFn].pszName); \
4324 } \
4325}
4326#else
4327# define GEN_FPU_BINARY_FSW(a_fIntType, a_cBits, a_UpBits, a_Type2, a_aSubTests, a_TestType)
4328#endif
4329
4330#define TEST_FPU_BINARY_FSW(a_fIntType, a_cBits, a_UpBits, a_Type2, a_SubTestType, a_aSubTests, a_TestType, ...) \
4331TYPEDEF_SUBTEST_TYPE(a_SubTestType, a_TestType, PFNIEMAIMPLFPU ## a_UpBits ## FSW); \
4332\
4333static a_SubTestType a_aSubTests[] = \
4334{ \
4335 __VA_ARGS__ \
4336}; \
4337\
4338GEN_FPU_BINARY_FSW(a_fIntType, a_cBits, a_UpBits, a_Type2, a_aSubTests, a_TestType) \
4339\
4340static void FpuBinaryFsw ## a_UpBits ## Test(void) \
4341{ \
4342 X86FXSTATE State; \
4343 RT_ZERO(State); \
4344 for (size_t iFn = 0; iFn < RT_ELEMENTS(a_aSubTests); iFn++) \
4345 { \
4346 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(a_aSubTests[iFn])) \
4347 continue; \
4348 \
4349 uint32_t const cTests = *a_aSubTests[iFn].pcTests; \
4350 a_TestType const * const paTests = a_aSubTests[iFn].paTests; \
4351 PFNIEMAIMPLFPU ## a_UpBits ## FSW pfn = a_aSubTests[iFn].pfn; \
4352 uint32_t const cVars = COUNT_VARIATIONS(a_aSubTests[iFn]); \
4353 if (!cTests) RTTestSkipped(g_hTest, "no tests"); \
4354 for (uint32_t iVar = 0; iVar < cVars; iVar++) \
4355 { \
4356 for (uint32_t iTest = 0; iTest < cTests; iTest++) \
4357 { \
4358 uint16_t fFswOut = 0; \
4359 RTFLOAT80U const InVal1 = paTests[iTest].InVal1; \
4360 a_Type2 const InVal2 = paTests[iTest].InVal2; \
4361 State.FCW = paTests[iTest].fFcw; \
4362 State.FSW = paTests[iTest].fFswIn; \
4363 pfn(&State, &fFswOut, &InVal1, &InVal2); \
4364 if (fFswOut != paTests[iTest].fFswOut) \
4365 RTTestFailed(g_hTest, "#%04u%s: fcw=%#06x fsw=%#06x in1=%s in2=%s\n" \
4366 "%s -> fsw=%#06x\n" \
4367 "%s expected %#06x %s (%s)\n", \
4368 iTest, iVar ? "/n" : "", paTests[iTest].fFcw, paTests[iTest].fFswIn, \
4369 FormatR80(&paTests[iTest].InVal1), Format ## a_UpBits(&paTests[iTest].InVal2), \
4370 iVar ? " " : "", fFswOut, \
4371 iVar ? " " : "", paTests[iTest].fFswOut, \
4372 FswDiff(fFswOut, paTests[iTest].fFswOut), FormatFcw(paTests[iTest].fFcw) ); \
4373 } \
4374 pfn = a_aSubTests[iFn].pfnNative; \
4375 } \
4376 } \
4377}
4378
4379TEST_FPU_BINARY_FSW(0, 80, R80, RTFLOAT80U, FPU_BINARY_FSW_R80_T, g_aFpuBinaryFswR80, FPU_BINARY_R80_TEST_T, ENTRY(fcom_r80_by_r80), ENTRY(fucom_r80_by_r80))
4380TEST_FPU_BINARY_FSW(0, 64, R64, RTFLOAT64U, FPU_BINARY_FSW_R64_T, g_aFpuBinaryFswR64, FPU_BINARY_R64_TEST_T, ENTRY(fcom_r80_by_r64))
4381TEST_FPU_BINARY_FSW(0, 32, R32, RTFLOAT32U, FPU_BINARY_FSW_R32_T, g_aFpuBinaryFswR32, FPU_BINARY_R32_TEST_T, ENTRY(fcom_r80_by_r32))
4382TEST_FPU_BINARY_FSW(1, 32, I32, int32_t, FPU_BINARY_FSW_I32_T, g_aFpuBinaryFswI32, FPU_BINARY_I32_TEST_T, ENTRY(ficom_r80_by_i32))
4383TEST_FPU_BINARY_FSW(1, 16, I16, int16_t, FPU_BINARY_FSW_I16_T, g_aFpuBinaryFswI16, FPU_BINARY_I16_TEST_T, ENTRY(ficom_r80_by_i16))
4384
4385
4386/*
4387 * Binary operations on 80-bit floating point that affect only EFLAGS and possibly FSW.
4388 */
4389TYPEDEF_SUBTEST_TYPE(FPU_BINARY_EFL_R80_T, FPU_BINARY_EFL_R80_TEST_T, PFNIEMAIMPLFPUR80EFL);
4390
4391static FPU_BINARY_EFL_R80_T g_aFpuBinaryEflR80[] =
4392{
4393 ENTRY(fcomi_r80_by_r80),
4394 ENTRY(fucomi_r80_by_r80),
4395};
4396
4397#ifdef TSTIEMAIMPL_WITH_GENERATOR
4398static struct { RTFLOAT80U Val1, Val2; } const s_aFpuBinaryEflR80Specials[] =
4399{
4400 { RTFLOAT80U_INIT_C(0, 0xffffeeeeddddcccc, RTFLOAT80U_EXP_BIAS),
4401 RTFLOAT80U_INIT_C(0, 0xffffeeeeddddcccc, RTFLOAT80U_EXP_BIAS) }, /* whatever */
4402};
4403
4404static void FpuBinaryEflR80Generate(PRTSTREAM pOut, uint32_t cTests)
4405{
4406 cTests = RT_MAX(160, cTests); /* there are 144 standard input variations */
4407
4408 X86FXSTATE State;
4409 RT_ZERO(State);
4410 uint32_t cMinNormalPairs = (cTests - 144) / 4;
4411 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aFpuBinaryEflR80); iFn++)
4412 {
4413 GenerateArrayStart(pOut, g_aFpuBinaryEflR80[iFn].pszName, "FPU_BINARY_EFL_R80_TEST_T");
4414 uint32_t cNormalInputPairs = 0;
4415 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aFpuBinaryEflR80Specials); iTest += 1)
4416 {
4417 RTFLOAT80U const InVal1 = iTest < cTests ? RandR80Src1(iTest) : s_aFpuBinaryEflR80Specials[iTest - cTests].Val1;
4418 RTFLOAT80U const InVal2 = iTest < cTests ? RandR80Src2(iTest) : s_aFpuBinaryEflR80Specials[iTest - cTests].Val2;
4419 if (RTFLOAT80U_IS_NORMAL(&InVal1) && RTFLOAT80U_IS_NORMAL(&InVal2))
4420 cNormalInputPairs++;
4421 else if (cNormalInputPairs < cMinNormalPairs && iTest + cMinNormalPairs >= cTests && iTest < cTests)
4422 {
4423 iTest -= 1;
4424 continue;
4425 }
4426
4427 uint16_t const fFcw = RandFcw();
4428 State.FSW = RandFsw();
4429
4430 /* Guess these aren't affected by precision or rounding, so just flip the exception mask. */
4431 for (uint16_t iMask = 0; iMask <= X86_FCW_MASK_ALL; iMask += X86_FCW_MASK_ALL)
4432 {
4433 State.FCW = (fFcw & ~(X86_FCW_MASK_ALL)) | iMask;
4434 uint16_t uFswOut = 0;
4435 uint32_t fEflOut = g_aFpuBinaryEflR80[iFn].pfn(&State, &uFswOut, &InVal1, &InVal2);
4436 RTStrmPrintf(pOut, " { %#06x, %#06x, %#06x, %s, %s, %#08x }, /* #%u/%c */\n",
4437 State.FCW, State.FSW, uFswOut, GenFormatR80(&InVal1), GenFormatR80(&InVal2), fEflOut,
4438 iTest, iMask ? 'c' : 'u');
4439 }
4440 }
4441 GenerateArrayEnd(pOut, g_aFpuBinaryEflR80[iFn].pszName);
4442 }
4443}
4444#endif /*TSTIEMAIMPL_WITH_GENERATOR*/
4445
4446static void FpuBinaryEflR80Test(void)
4447{
4448 X86FXSTATE State;
4449 RT_ZERO(State);
4450 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aFpuBinaryEflR80); iFn++)
4451 {
4452 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aFpuBinaryEflR80[iFn]))
4453 continue;
4454
4455 uint32_t const cTests = *g_aFpuBinaryEflR80[iFn].pcTests;
4456 FPU_BINARY_EFL_R80_TEST_T const * const paTests = g_aFpuBinaryEflR80[iFn].paTests;
4457 PFNIEMAIMPLFPUR80EFL pfn = g_aFpuBinaryEflR80[iFn].pfn;
4458 uint32_t const cVars = COUNT_VARIATIONS(g_aFpuBinaryEflR80[iFn]);
4459 if (!cTests) RTTestSkipped(g_hTest, "no tests");
4460 for (uint32_t iVar = 0; iVar < cVars; iVar++)
4461 {
4462 for (uint32_t iTest = 0; iTest < cTests; iTest++)
4463 {
4464 RTFLOAT80U const InVal1 = paTests[iTest].InVal1;
4465 RTFLOAT80U const InVal2 = paTests[iTest].InVal2;
4466 State.FCW = paTests[iTest].fFcw;
4467 State.FSW = paTests[iTest].fFswIn;
4468 uint16_t uFswOut = 0;
4469 uint32_t fEflOut = pfn(&State, &uFswOut, &InVal1, &InVal2);
4470 if ( uFswOut != paTests[iTest].fFswOut
4471 || fEflOut != paTests[iTest].fEflOut)
4472 RTTestFailed(g_hTest, "#%04u%s: fcw=%#06x fsw=%#06x in1=%s in2=%s\n"
4473 "%s -> fsw=%#06x efl=%#08x\n"
4474 "%s expected %#06x %#08x %s%s (%s)\n",
4475 iTest, iVar ? "/n" : "", paTests[iTest].fFcw, paTests[iTest].fFswIn,
4476 FormatR80(&paTests[iTest].InVal1), FormatR80(&paTests[iTest].InVal2),
4477 iVar ? " " : "", uFswOut, fEflOut,
4478 iVar ? " " : "", paTests[iTest].fFswOut, paTests[iTest].fEflOut,
4479 FswDiff(uFswOut, paTests[iTest].fFswOut), EFlagsDiff(fEflOut, paTests[iTest].fEflOut),
4480 FormatFcw(paTests[iTest].fFcw));
4481 }
4482 pfn = g_aFpuBinaryEflR80[iFn].pfnNative;
4483 }
4484 }
4485}
4486
4487
4488/*********************************************************************************************************************************
4489* x87 FPU Unary Operations *
4490*********************************************************************************************************************************/
4491
4492/*
4493 * Unary FPU operations on one 80-bit floating point value.
4494 *
4495 * Note! The FCW reserved bit 7 is used to indicate whether a test may produce
4496 * a rounding error or not.
4497 */
4498TYPEDEF_SUBTEST_TYPE(FPU_UNARY_R80_T, FPU_UNARY_R80_TEST_T, PFNIEMAIMPLFPUR80UNARY);
4499
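/** Rough classification of each unary worker, used by the generator to pick
 * suitable input ranges and to flag tests that may legitimately differ by a
 * rounding error (see FpuUnaryR80MayHaveRoundingError below). */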
4500enum { kUnary_Accurate = 0, kUnary_Accurate_Trigonometry /*probably not accurate, but need impl to know*/, kUnary_Rounding_F2xm1 };
4501static FPU_UNARY_R80_T g_aFpuUnaryR80[] =
4502{
4503 ENTRY_EX( fabs_r80, kUnary_Accurate),
4504 ENTRY_EX( fchs_r80, kUnary_Accurate),
4505 ENTRY_AMD_EX( f2xm1_r80, 0, kUnary_Accurate), // C1 differs for -1m0x3fb263cc2c331e15^-2654 (different ln2 constant?)
4506 ENTRY_INTEL_EX(f2xm1_r80, 0, kUnary_Rounding_F2xm1),
4507 ENTRY_EX( fsqrt_r80, kUnary_Accurate),
4508 ENTRY_EX( frndint_r80, kUnary_Accurate),
4509 ENTRY_AMD_EX( fsin_r80, 0, kUnary_Accurate_Trigonometry), // value & C1 differences for pseudo denormals and others (e.g. -1m0x2b1e5683cbca5725^-3485)
4510 ENTRY_INTEL_EX(fsin_r80, 0, kUnary_Accurate_Trigonometry),
4511 ENTRY_AMD_EX( fcos_r80, 0, kUnary_Accurate_Trigonometry), // value & C1 differences
4512 ENTRY_INTEL_EX(fcos_r80, 0, kUnary_Accurate_Trigonometry),
4513};
4514
4515#ifdef TSTIEMAIMPL_WITH_GENERATOR
4516
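/**
 * Checks whether a generated input may produce a result that differs from the
 * reference by a rounding error.
 *
 * Only f2xm1 (kUnary_Rounding_F2xm1) is flagged, and only for normal inputs
 * with a magnitude in [2^-69, 1).  The generator records the verdict in the
 * otherwise reserved FCW bit 7 (see the FCW bit 7 note above).
 */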
4517static bool FpuUnaryR80MayHaveRoundingError(PCRTFLOAT80U pr80Val, int enmKind)
4518{
4519 if ( enmKind == kUnary_Rounding_F2xm1
4520 && RTFLOAT80U_IS_NORMAL(pr80Val)
4521 && pr80Val->s.uExponent < RTFLOAT80U_EXP_BIAS
4522 && pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS - 69)
4523 return true;
4524 return false;
4525}
4526
4527static void FpuUnaryR80Generate(PRTSTREAM pOut, PRTSTREAM pOutCpu, uint32_t cTests)
4528{
4529 static RTFLOAT80U const s_aSpecials[] =
4530 {
4531 RTFLOAT80U_INIT_C(0, 0x8000000000000000, RTFLOAT80U_EXP_BIAS - 1), /* 0.5 (for f2xm1) */
4532 RTFLOAT80U_INIT_C(1, 0x8000000000000000, RTFLOAT80U_EXP_BIAS - 1), /* -0.5 (for f2xm1) */
4533 RTFLOAT80U_INIT_C(0, 0x8000000000000000, RTFLOAT80U_EXP_BIAS), /* 1.0 (for f2xm1) */
4534 RTFLOAT80U_INIT_C(1, 0x8000000000000000, RTFLOAT80U_EXP_BIAS), /* -1.0 (for f2xm1) */
4535 RTFLOAT80U_INIT_C(0, 0x8000000000000000, 0), /* +1.0^-16382 */
4536 RTFLOAT80U_INIT_C(1, 0x8000000000000000, 0), /* -1.0^-16382 */
4537 RTFLOAT80U_INIT_C(0, 0xc000000000000000, 0), /* +1.1^-16382 */
4538 RTFLOAT80U_INIT_C(1, 0xc000000000000000, 0), /* -1.1^-16382 */
4539 RTFLOAT80U_INIT_C(0, 0xc000100000000000, 0), /* +1.1xxx1^-16382 */
4540 RTFLOAT80U_INIT_C(1, 0xc000100000000000, 0), /* -1.1xxx1^-16382 */
4541 };
4542 X86FXSTATE State;
4543 RT_ZERO(State);
4544 uint32_t cMinNormals = cTests / 4;
4545 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aFpuUnaryR80); iFn++)
4546 {
4547 PFNIEMAIMPLFPUR80UNARY const pfn = g_aFpuUnaryR80[iFn].pfnNative ? g_aFpuUnaryR80[iFn].pfnNative : g_aFpuUnaryR80[iFn].pfn;
4548 PRTSTREAM pOutFn = pOut;
4549 if (g_aFpuUnaryR80[iFn].idxCpuEflFlavour != IEMTARGETCPU_EFL_BEHAVIOR_NATIVE)
4550 {
4551 if (g_aFpuUnaryR80[iFn].idxCpuEflFlavour != g_idxCpuEflFlavour)
4552 continue;
4553 pOutFn = pOutCpu;
4554 }
4555
4556 GenerateArrayStart(pOutFn, g_aFpuUnaryR80[iFn].pszName, "FPU_UNARY_R80_TEST_T");
4557 uint32_t iTestOutput = 0;
4558 uint32_t cNormalInputs = 0;
4559 uint32_t cTargetRangeInputs = 0;
4560 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aSpecials); iTest += 1)
4561 {
4562 RTFLOAT80U InVal = iTest < cTests ? RandR80Src(iTest) : s_aSpecials[iTest - cTests];
4563 if (RTFLOAT80U_IS_NORMAL(&InVal))
4564 {
4565 if (g_aFpuUnaryR80[iFn].uExtra == kUnary_Rounding_F2xm1)
4566 {
4567 unsigned uTargetExp = g_aFpuUnaryR80[iFn].uExtra == kUnary_Rounding_F2xm1
4568 ? RTFLOAT80U_EXP_BIAS /* 2^0..2^-69 */ : RTFLOAT80U_EXP_BIAS + 63 + 1 /* 2^64..2^-64 */;
4569 unsigned cTargetExp = g_aFpuUnaryR80[iFn].uExtra == kUnary_Rounding_F2xm1 ? 69 : 63*2 + 2;
4570 if (InVal.s.uExponent <= uTargetExp && InVal.s.uExponent >= uTargetExp - cTargetExp)
4571 cTargetRangeInputs++;
4572 else if (cTargetRangeInputs < cMinNormals / 2 && iTest + cMinNormals / 2 >= cTests && iTest < cTests)
4573 {
4574 InVal.s.uExponent = RTRandU32Ex(uTargetExp - cTargetExp, uTargetExp);
4575 cTargetRangeInputs++;
4576 }
4577 }
4578 cNormalInputs++;
4579 }
4580 else if (cNormalInputs < cMinNormals && iTest + cMinNormals >= cTests && iTest < cTests)
4581 {
4582 iTest -= 1;
4583 continue;
4584 }
4585
4586 uint16_t const fFcwExtra = FpuUnaryR80MayHaveRoundingError(&InVal, g_aFpuUnaryR80[iFn].uExtra) ? 0x80 : 0;
4587 uint16_t const fFcw = RandFcw();
4588 State.FSW = RandFsw();
4589
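 /* Each input is run for all 4x4 rounding/precision control combinations:
    first with all exceptions masked (ResM), then with all unmasked (ResU).
    Exception flags raised by those two runs trigger extra variations with
    just the raised exceptions masked and, if more than one was raised,
    with each of them unmasked individually. */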
4590 for (uint16_t iRounding = 0; iRounding < 4; iRounding++)
4591 for (uint16_t iPrecision = 0; iPrecision < 4; iPrecision++)
4592 {
4593 State.FCW = (fFcw & ~(X86_FCW_RC_MASK | X86_FCW_PC_MASK | X86_FCW_MASK_ALL))
4594 | (iRounding << X86_FCW_RC_SHIFT)
4595 | (iPrecision << X86_FCW_PC_SHIFT)
4596 | X86_FCW_MASK_ALL;
4597 IEMFPURESULT ResM = { RTFLOAT80U_INIT(0, 0, 0), 0 };
4598 pfn(&State, &ResM, &InVal);
4599 RTStrmPrintf(pOutFn, " { %#06x, %#06x, %#06x, %s, %s }, /* #%u/%u/%u/m = #%u */\n",
4600 State.FCW | fFcwExtra, State.FSW, ResM.FSW, GenFormatR80(&InVal),
4601 GenFormatR80(&ResM.r80Result), iTest, iRounding, iPrecision, iTestOutput++);
4602
4603 State.FCW = State.FCW & ~X86_FCW_MASK_ALL;
4604 IEMFPURESULT ResU = { RTFLOAT80U_INIT(0, 0, 0), 0 };
4605 pfn(&State, &ResU, &InVal);
4606 RTStrmPrintf(pOutFn, " { %#06x, %#06x, %#06x, %s, %s }, /* #%u/%u/%u/u = #%u */\n",
4607 State.FCW | fFcwExtra, State.FSW, ResU.FSW, GenFormatR80(&InVal),
4608 GenFormatR80(&ResU.r80Result), iTest, iRounding, iPrecision, iTestOutput++);
4609
4610 uint16_t fXcpt = (ResM.FSW | ResU.FSW) & X86_FSW_XCPT_MASK & ~X86_FSW_SF;
4611 if (fXcpt)
4612 {
4613 State.FCW = (State.FCW & ~X86_FCW_MASK_ALL) | fXcpt;
4614 IEMFPURESULT Res1 = { RTFLOAT80U_INIT(0, 0, 0), 0 };
4615 pfn(&State, &Res1, &InVal);
4616 RTStrmPrintf(pOutFn, " { %#06x, %#06x, %#06x, %s, %s }, /* #%u/%u/%u/%#x = #%u */\n",
4617 State.FCW | fFcwExtra, State.FSW, Res1.FSW, GenFormatR80(&InVal),
4618 GenFormatR80(&Res1.r80Result), iTest, iRounding, iPrecision, fXcpt, iTestOutput++);
4619 if (((Res1.FSW & X86_FSW_XCPT_MASK) & fXcpt) != (Res1.FSW & X86_FSW_XCPT_MASK))
4620 {
4621 fXcpt |= Res1.FSW & X86_FSW_XCPT_MASK;
4622 State.FCW = (State.FCW & ~X86_FCW_MASK_ALL) | fXcpt;
4623 IEMFPURESULT Res2 = { RTFLOAT80U_INIT(0, 0, 0), 0 };
4624 pfn(&State, &Res2, &InVal);
4625 RTStrmPrintf(pOutFn, " { %#06x, %#06x, %#06x, %s, %s }, /* #%u/%u/%u/%#x[!] = #%u */\n",
4626 State.FCW | fFcwExtra, State.FSW, Res2.FSW, GenFormatR80(&InVal),
4627 GenFormatR80(&Res2.r80Result), iTest, iRounding, iPrecision, fXcpt, iTestOutput++);
4628 }
4629 if (!RT_IS_POWER_OF_TWO(fXcpt))
4630 for (uint16_t fUnmasked = 1; fUnmasked <= X86_FCW_PM; fUnmasked <<= 1)
4631 if (fUnmasked & fXcpt)
4632 {
4633 State.FCW = (State.FCW & ~X86_FCW_MASK_ALL) | (fXcpt & ~fUnmasked);
4634 IEMFPURESULT Res3 = { RTFLOAT80U_INIT(0, 0, 0), 0 };
4635 pfn(&State, &Res3, &InVal);
4636 RTStrmPrintf(pOutFn, " { %#06x, %#06x, %#06x, %s, %s }, /* #%u/%u/%u/u%#x = #%u */\n",
4637 State.FCW | fFcwExtra, State.FSW, Res3.FSW, GenFormatR80(&InVal),
4638 GenFormatR80(&Res3.r80Result), iTest, iRounding, iPrecision, fUnmasked, iTestOutput++);
4639 }
4640 }
4641 }
4642 }
4643 GenerateArrayEnd(pOutFn, g_aFpuUnaryR80[iFn].pszName);
4644 }
4645}
4646#endif
4647
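/**
 * Compares two FPU status words, optionally accepting a difference in C1
 * alone as a rounding error.
 *
 * @returns true if considered equal, false otherwise.
 * @param   fFcw1       The first status word (the actual result).
 * @param   fFcw2       The second status word (the expected result).
 * @param   fRndErrOk   Whether rounding errors are acceptable for this test.
 * @param   pfRndErr    Set to true when the difference was accepted as a
 *                      rounding error; never cleared.
 */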
4648static bool FpuIsEqualFcwMaybeIgnoreRoundErr(uint16_t fFcw1, uint16_t fFcw2, bool fRndErrOk, bool *pfRndErr)
4649{
4650 if (fFcw1 == fFcw2)
4651 return true;
4652 if (fRndErrOk && (fFcw1 & ~X86_FSW_C1) == (fFcw2 & ~X86_FSW_C1))
4653 {
4654 *pfRndErr = true;
4655 return true;
4656 }
4657 return false;
4658}
4659
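/**
 * Compares two 80-bit floating point values, optionally accepting a one ULP
 * difference as a rounding error.
 *
 * The signs must match.  Tolerated differences are mantissas one apart at the
 * same exponent, and the wrap-around case where one value has an all-ones
 * mantissa and the other the next higher exponent with only the integer bit
 * set.
 *
 * @returns true if considered identical, false otherwise.
 */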
4660static bool FpuIsEqualR80MaybeIgnoreRoundErr(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, bool fRndErrOk, bool *pfRndErr)
4661{
4662 if (RTFLOAT80U_ARE_IDENTICAL(pr80Val1, pr80Val2))
4663 return true;
4664 if ( fRndErrOk
4665 && pr80Val1->s.fSign == pr80Val2->s.fSign)
4666 {
4667 if ( ( pr80Val1->s.uExponent == pr80Val2->s.uExponent
4668 && ( pr80Val1->s.uMantissa > pr80Val2->s.uMantissa
4669 ? pr80Val1->s.uMantissa - pr80Val2->s.uMantissa == 1
4670 : pr80Val2->s.uMantissa - pr80Val1->s.uMantissa == 1))
4671 ||
4672 ( pr80Val1->s.uExponent + 1 == pr80Val2->s.uExponent
4673 && pr80Val1->s.uMantissa == UINT64_MAX
4674 && pr80Val2->s.uMantissa == RT_BIT_64(63))
4675 ||
4676 ( pr80Val1->s.uExponent == pr80Val2->s.uExponent + 1
4677 && pr80Val2->s.uMantissa == UINT64_MAX
4678 && pr80Val1->s.uMantissa == RT_BIT_64(63)) )
4679 {
4680 *pfRndErr = true;
4681 return true;
4682 }
4683 }
4684 return false;
4685}
4686
4687
4688static void FpuUnaryR80Test(void)
4689{
4690 X86FXSTATE State;
4691 RT_ZERO(State);
4692 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aFpuUnaryR80); iFn++)
4693 {
4694 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aFpuUnaryR80[iFn]))
4695 continue;
4696
4697 uint32_t const cTests = *g_aFpuUnaryR80[iFn].pcTests;
4698 FPU_UNARY_R80_TEST_T const * const paTests = g_aFpuUnaryR80[iFn].paTests;
4699 PFNIEMAIMPLFPUR80UNARY pfn = g_aFpuUnaryR80[iFn].pfn;
4700 uint32_t const cVars = COUNT_VARIATIONS(g_aFpuUnaryR80[iFn]);
4701 uint32_t cRndErrs = 0;
4702 uint32_t cPossibleRndErrs = 0;
4703 if (!cTests) RTTestSkipped(g_hTest, "no tests");
4704 for (uint32_t iVar = 0; iVar < cVars; iVar++)
4705 {
4706 for (uint32_t iTest = 0; iTest < cTests; iTest++)
4707 {
4708 RTFLOAT80U const InVal = paTests[iTest].InVal;
4709 IEMFPURESULT Res = { RTFLOAT80U_INIT(0, 0, 0), 0 };
4710 bool const fRndErrOk = RT_BOOL(paTests[iTest].fFcw & 0x80);
4711 State.FCW = paTests[iTest].fFcw & ~(uint16_t)0x80;
4712 State.FSW = paTests[iTest].fFswIn;
4713 pfn(&State, &Res, &InVal);
4714 bool fRndErr = false;
4715 if ( !FpuIsEqualFcwMaybeIgnoreRoundErr(Res.FSW, paTests[iTest].fFswOut, fRndErrOk, &fRndErr)
4716 || !FpuIsEqualR80MaybeIgnoreRoundErr(&Res.r80Result, &paTests[iTest].OutVal, fRndErrOk, &fRndErr))
4717 RTTestFailed(g_hTest, "#%04u%s: fcw=%#06x fsw=%#06x in=%s\n"
4718 "%s -> fsw=%#06x %s\n"
4719 "%s expected %#06x %s%s%s%s (%s)\n",
4720 iTest, iVar ? "/n" : "", paTests[iTest].fFcw, paTests[iTest].fFswIn,
4721 FormatR80(&paTests[iTest].InVal),
4722 iVar ? " " : "", Res.FSW, FormatR80(&Res.r80Result),
4723 iVar ? " " : "", paTests[iTest].fFswOut, FormatR80(&paTests[iTest].OutVal),
4724 FswDiff(Res.FSW, paTests[iTest].fFswOut),
4725 !RTFLOAT80U_ARE_IDENTICAL(&Res.r80Result, &paTests[iTest].OutVal) ? " - val" : "",
4726 fRndErrOk ? " - rounding errors ok" : "", FormatFcw(paTests[iTest].fFcw));
4727 cRndErrs += fRndErr;
4728 cPossibleRndErrs += fRndErrOk;
4729 }
4730 pfn = g_aFpuUnaryR80[iFn].pfnNative;
4731 }
4732 if (cPossibleRndErrs > 0)
4733 RTTestPrintf(g_hTest, RTTESTLVL_ALWAYS, "rounding errors: %u out of %u\n", cRndErrs, cPossibleRndErrs);
4734 }
4735}
4736
4737
4738/*
4739 * Unary FPU operations on one 80-bit floating point value that only affect the FSW.
4740 */
4741TYPEDEF_SUBTEST_TYPE(FPU_UNARY_FSW_R80_T, FPU_UNARY_R80_TEST_T, PFNIEMAIMPLFPUR80UNARYFSW);
4742
4743static FPU_UNARY_FSW_R80_T g_aFpuUnaryFswR80[] =
4744{
4745 ENTRY(ftst_r80),
4746 ENTRY_EX(fxam_r80, 1),
4747};
4748
4749#ifdef TSTIEMAIMPL_WITH_GENERATOR
4750static void FpuUnaryFswR80Generate(PRTSTREAM pOut, PRTSTREAM pOutCpu, uint32_t cTests)
4751{
4752 static RTFLOAT80U const s_aSpecials[] =
4753 {
4754 RTFLOAT80U_INIT_C(0, 0xffffeeeeddddcccc, RTFLOAT80U_EXP_BIAS), /* whatever */
4755 };
4756
4757 X86FXSTATE State;
4758 RT_ZERO(State);
4759 uint32_t cMinNormals = cTests / 4;
4760 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aFpuUnaryFswR80); iFn++)
4761 {
4762 bool const fIsFxam = g_aFpuUnaryFswR80[iFn].uExtra == 1;
4763 PFNIEMAIMPLFPUR80UNARYFSW const pfn = g_aFpuUnaryFswR80[iFn].pfnNative ? g_aFpuUnaryFswR80[iFn].pfnNative : g_aFpuUnaryFswR80[iFn].pfn;
4764 PRTSTREAM pOutFn = pOut;
4765 if (g_aFpuUnaryFswR80[iFn].idxCpuEflFlavour != IEMTARGETCPU_EFL_BEHAVIOR_NATIVE)
4766 {
4767 if (g_aFpuUnaryFswR80[iFn].idxCpuEflFlavour != g_idxCpuEflFlavour)
4768 continue;
4769 pOutFn = pOutCpu;
4770 }
4771 State.FTW = 0;
4772
4773 GenerateArrayStart(pOutFn, g_aFpuUnaryFswR80[iFn].pszName, "FPU_UNARY_R80_TEST_T");
4774 uint32_t cNormalInputs = 0;
4775 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aSpecials); iTest += 1)
4776 {
4777 RTFLOAT80U const InVal = iTest < cTests ? RandR80Src(iTest) : s_aSpecials[iTest - cTests];
4778 if (RTFLOAT80U_IS_NORMAL(&InVal))
4779 cNormalInputs++;
4780 else if (cNormalInputs < cMinNormals && iTest + cMinNormals >= cTests && iTest < cTests)
4781 {
4782 iTest -= 1;
4783 continue;
4784 }
4785
4786 uint16_t const fFcw = RandFcw();
4787 State.FSW = RandFsw();
4788 if (!fIsFxam)
4789 {
4790 for (uint16_t iRounding = 0; iRounding < 4; iRounding++)
4791 {
4792 for (uint16_t iPrecision = 0; iPrecision < 4; iPrecision++)
4793 {
4794 for (uint16_t iMask = 0; iMask <= X86_FCW_MASK_ALL; iMask += X86_FCW_MASK_ALL)
4795 {
4796 State.FCW = (fFcw & ~(X86_FCW_RC_MASK | X86_FCW_PC_MASK | X86_FCW_MASK_ALL))
4797 | (iRounding << X86_FCW_RC_SHIFT)
4798 | (iPrecision << X86_FCW_PC_SHIFT)
4799 | iMask;
4800 uint16_t fFswOut = 0;
4801 pfn(&State, &fFswOut, &InVal);
4802 RTStrmPrintf(pOutFn, " { %#06x, %#06x, %#06x, %s }, /* #%u/%u/%u/%c */\n",
4803 State.FCW, State.FSW, fFswOut, GenFormatR80(&InVal),
4804 iTest, iRounding, iPrecision, iMask ? 'c' : 'u');
4805 }
4806 }
4807 }
4808 }
4809 else
4810 {
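 /* FXAM reports class Empty based on the tag word, so randomly mark ST(0)
    as empty and record that choice in the otherwise unused FCW bit 7,
    allowing FpuUnaryFswR80Test to reconstruct FTW on replay. */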
4811 uint16_t fFswOut = 0;
4812 uint16_t const fEmpty = RTRandU32Ex(0, 3) == 3 ? 0x80 : 0; /* Using MBZ bit 7 in FCW to indicate empty tag value. */
4813 State.FTW = !fEmpty ? 1 << X86_FSW_TOP_GET(State.FSW) : 0;
4814 State.FCW = fFcw;
4815 pfn(&State, &fFswOut, &InVal);
4816 RTStrmPrintf(pOutFn, " { %#06x, %#06x, %#06x, %s }, /* #%u%s */\n",
4817 fFcw | fEmpty, State.FSW, fFswOut, GenFormatR80(&InVal), iTest, fEmpty ? "/empty" : "");
4818 }
4819 }
4820 GenerateArrayEnd(pOutFn, g_aFpuUnaryFswR80[iFn].pszName);
4821 }
4822}
4823#endif
4824
4825
4826static void FpuUnaryFswR80Test(void)
4827{
4828 X86FXSTATE State;
4829 RT_ZERO(State);
4830 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aFpuUnaryFswR80); iFn++)
4831 {
4832 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aFpuUnaryFswR80[iFn]))
4833 continue;
4834
4835 uint32_t const cTests = *g_aFpuUnaryFswR80[iFn].pcTests;
4836 FPU_UNARY_R80_TEST_T const * const paTests = g_aFpuUnaryFswR80[iFn].paTests;
4837 PFNIEMAIMPLFPUR80UNARYFSW pfn = g_aFpuUnaryFswR80[iFn].pfn;
4838 uint32_t const cVars = COUNT_VARIATIONS(g_aFpuUnaryFswR80[iFn]);
4839 if (!cTests) RTTestSkipped(g_hTest, "no tests");
4840 for (uint32_t iVar = 0; iVar < cVars; iVar++)
4841 {
4842 for (uint32_t iTest = 0; iTest < cTests; iTest++)
4843 {
4844 RTFLOAT80U const InVal = paTests[iTest].InVal;
4845 uint16_t fFswOut = 0;
4846 State.FSW = paTests[iTest].fFswIn;
4847 State.FCW = paTests[iTest].fFcw & ~(uint16_t)0x80; /* see generator code */
4848 State.FTW = paTests[iTest].fFcw & 0x80 ? 0 : 1 << X86_FSW_TOP_GET(paTests[iTest].fFswIn);
4849 pfn(&State, &fFswOut, &InVal);
4850 if (fFswOut != paTests[iTest].fFswOut)
4851 RTTestFailed(g_hTest, "#%04u%s: fcw=%#06x fsw=%#06x in=%s\n"
4852 "%s -> fsw=%#06x\n"
4853 "%s expected %#06x %s (%s%s)\n",
4854 iTest, iVar ? "/n" : "", paTests[iTest].fFcw, paTests[iTest].fFswIn,
4855 FormatR80(&paTests[iTest].InVal),
4856 iVar ? " " : "", fFswOut,
4857 iVar ? " " : "", paTests[iTest].fFswOut,
4858 FswDiff(fFswOut, paTests[iTest].fFswOut), FormatFcw(paTests[iTest].fFcw),
4859 paTests[iTest].fFcw & 0x80 ? " empty" : "");
4860 }
4861 pfn = g_aFpuUnaryFswR80[iFn].pfnNative;
4862 }
4863 }
4864}
4865
4866/*
4867 * Unary FPU operations on one 80-bit floating point value, but with two outputs.
4868 */
4869TYPEDEF_SUBTEST_TYPE(FPU_UNARY_TWO_R80_T, FPU_UNARY_TWO_R80_TEST_T, PFNIEMAIMPLFPUR80UNARYTWO);
4870
4871static FPU_UNARY_TWO_R80_T g_aFpuUnaryTwoR80[] =
4872{
4873 ENTRY(fxtract_r80_r80),
4874 ENTRY_AMD( fptan_r80_r80, 0), // rounding differences
4875 ENTRY_INTEL(fptan_r80_r80, 0),
4876 ENTRY_AMD( fsincos_r80_r80, 0), // C1 differences & value differences (e.g. -1m0x235cf2f580244a27^-1696)
4877 ENTRY_INTEL(fsincos_r80_r80, 0),
4878};
4879
4880#ifdef TSTIEMAIMPL_WITH_GENERATOR
4881static void FpuUnaryTwoR80Generate(PRTSTREAM pOut, PRTSTREAM pOutCpu, uint32_t cTests)
4882{
4883 static RTFLOAT80U const s_aSpecials[] =
4884 {
4885 RTFLOAT80U_INIT_C(0, 0xffffeeeeddddcccc, RTFLOAT80U_EXP_BIAS), /* whatever */
4886 };
4887
4888 X86FXSTATE State;
4889 RT_ZERO(State);
4890 uint32_t cMinNormals = cTests / 4;
4891 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aFpuUnaryTwoR80); iFn++)
4892 {
4893 PFNIEMAIMPLFPUR80UNARYTWO const pfn = g_aFpuUnaryTwoR80[iFn].pfnNative ? g_aFpuUnaryTwoR80[iFn].pfnNative : g_aFpuUnaryTwoR80[iFn].pfn;
4894 PRTSTREAM pOutFn = pOut;
4895 if (g_aFpuUnaryTwoR80[iFn].idxCpuEflFlavour != IEMTARGETCPU_EFL_BEHAVIOR_NATIVE)
4896 {
4897 if (g_aFpuUnaryTwoR80[iFn].idxCpuEflFlavour != g_idxCpuEflFlavour)
4898 continue;
4899 pOutFn = pOutCpu;
4900 }
4901
4902 GenerateArrayStart(pOutFn, g_aFpuUnaryTwoR80[iFn].pszName, "FPU_UNARY_TWO_R80_TEST_T");
4903 uint32_t iTestOutput = 0;
4904 uint32_t cNormalInputs = 0;
4905 uint32_t cTargetRangeInputs = 0;
4906 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aSpecials); iTest += 1)
4907 {
4908 RTFLOAT80U InVal = iTest < cTests ? RandR80Src(iTest) : s_aSpecials[iTest - cTests];
4909 if (RTFLOAT80U_IS_NORMAL(&InVal))
4910 {
4911 if (iFn != 0)
4912 {
4913 unsigned uTargetExp = RTFLOAT80U_EXP_BIAS + 63 + 1 /* 2^64..2^-64 */;
4914 unsigned cTargetExp = 63*2 + 2; /* 2^64..2^-64, matching uTargetExp above */
4915 if (InVal.s.uExponent <= uTargetExp && InVal.s.uExponent >= uTargetExp - cTargetExp)
4916 cTargetRangeInputs++;
4917 else if (cTargetRangeInputs < cMinNormals / 2 && iTest + cMinNormals / 2 >= cTests && iTest < cTests)
4918 {
4919 InVal.s.uExponent = RTRandU32Ex(uTargetExp - cTargetExp, uTargetExp);
4920 cTargetRangeInputs++;
4921 }
4922 }
4923 cNormalInputs++;
4924 }
4925 else if (cNormalInputs < cMinNormals && iTest + cMinNormals >= cTests && iTest < cTests)
4926 {
4927 iTest -= 1;
4928 continue;
4929 }
4930
4931 uint16_t const fFcwExtra = 0; /* for rounding error indication */
4932 uint16_t const fFcw = RandFcw();
4933 State.FSW = RandFsw();
4934
4935 for (uint16_t iRounding = 0; iRounding < 4; iRounding++)
4936 for (uint16_t iPrecision = 0; iPrecision < 4; iPrecision++)
4937 {
4938 State.FCW = (fFcw & ~(X86_FCW_RC_MASK | X86_FCW_PC_MASK | X86_FCW_MASK_ALL))
4939 | (iRounding << X86_FCW_RC_SHIFT)
4940 | (iPrecision << X86_FCW_PC_SHIFT)
4941 | X86_FCW_MASK_ALL;
4942 IEMFPURESULTTWO ResM = { RTFLOAT80U_INIT(0, 0, 0), 0, RTFLOAT80U_INIT(0, 0, 0) };
4943 pfn(&State, &ResM, &InVal);
4944 RTStrmPrintf(pOutFn, " { %#06x, %#06x, %#06x, %s, %s, %s }, /* #%u/%u/%u/m = #%u */\n",
4945 State.FCW | fFcwExtra, State.FSW, ResM.FSW, GenFormatR80(&InVal), GenFormatR80(&ResM.r80Result1),
4946 GenFormatR80(&ResM.r80Result2), iTest, iRounding, iPrecision, iTestOutput++);
4947
4948 State.FCW = State.FCW & ~X86_FCW_MASK_ALL;
4949 IEMFPURESULTTWO ResU = { RTFLOAT80U_INIT(0, 0, 0), 0, RTFLOAT80U_INIT(0, 0, 0) };
4950 pfn(&State, &ResU, &InVal);
4951 RTStrmPrintf(pOutFn, " { %#06x, %#06x, %#06x, %s, %s, %s }, /* #%u/%u/%u/u = #%u */\n",
4952 State.FCW | fFcwExtra, State.FSW, ResU.FSW, GenFormatR80(&InVal), GenFormatR80(&ResU.r80Result1),
4953 GenFormatR80(&ResU.r80Result2), iTest, iRounding, iPrecision, iTestOutput++);
4954
4955 uint16_t fXcpt = (ResM.FSW | ResU.FSW) & X86_FSW_XCPT_MASK & ~X86_FSW_SF;
4956 if (fXcpt)
4957 {
4958 State.FCW = (State.FCW & ~X86_FCW_MASK_ALL) | fXcpt;
4959 IEMFPURESULTTWO Res1 = { RTFLOAT80U_INIT(0, 0, 0), 0, RTFLOAT80U_INIT(0, 0, 0) };
4960 pfn(&State, &Res1, &InVal);
4961 RTStrmPrintf(pOutFn, " { %#06x, %#06x, %#06x, %s, %s, %s }, /* #%u/%u/%u/%#x = #%u */\n",
4962 State.FCW | fFcwExtra, State.FSW, Res1.FSW, GenFormatR80(&InVal), GenFormatR80(&Res1.r80Result1),
4963 GenFormatR80(&Res1.r80Result2), iTest, iRounding, iPrecision, fXcpt, iTestOutput++);
4964 if (((Res1.FSW & X86_FSW_XCPT_MASK) & fXcpt) != (Res1.FSW & X86_FSW_XCPT_MASK))
4965 {
4966 fXcpt |= Res1.FSW & X86_FSW_XCPT_MASK;
4967 State.FCW = (State.FCW & ~X86_FCW_MASK_ALL) | fXcpt;
4968 IEMFPURESULTTWO Res2 = { RTFLOAT80U_INIT(0, 0, 0), 0, RTFLOAT80U_INIT(0, 0, 0) };
4969 pfn(&State, &Res2, &InVal);
4970 RTStrmPrintf(pOutFn, " { %#06x, %#06x, %#06x, %s, %s, %s }, /* #%u/%u/%u/%#x[!] = #%u */\n",
4971 State.FCW | fFcwExtra, State.FSW, Res2.FSW, GenFormatR80(&InVal), GenFormatR80(&Res2.r80Result1),
4972 GenFormatR80(&Res2.r80Result2), iTest, iRounding, iPrecision, fXcpt, iTestOutput++);
4973 }
4974 if (!RT_IS_POWER_OF_TWO(fXcpt))
4975 for (uint16_t fUnmasked = 1; fUnmasked <= X86_FCW_PM; fUnmasked <<= 1)
4976 if (fUnmasked & fXcpt)
4977 {
4978 State.FCW = (State.FCW & ~X86_FCW_MASK_ALL) | (fXcpt & ~fUnmasked);
4979 IEMFPURESULTTWO Res3 = { RTFLOAT80U_INIT(0, 0, 0), 0, RTFLOAT80U_INIT(0, 0, 0) };
4980 pfn(&State, &Res3, &InVal);
4981 RTStrmPrintf(pOutFn, " { %#06x, %#06x, %#06x, %s, %s, %s }, /* #%u/%u/%u/u%#x = #%u */\n",
4982 State.FCW | fFcwExtra, State.FSW, Res3.FSW, GenFormatR80(&InVal), GenFormatR80(&Res3.r80Result1),
4983 GenFormatR80(&Res3.r80Result2), iTest, iRounding, iPrecision, fUnmasked, iTestOutput++);
4984 }
4985 }
4986 }
4987 }
4988 GenerateArrayEnd(pOutFn, g_aFpuUnaryTwoR80[iFn].pszName);
4989 }
4990}
4991#endif
4992
4993
4994static void FpuUnaryTwoR80Test(void)
4995{
4996 X86FXSTATE State;
4997 RT_ZERO(State);
4998 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aFpuUnaryTwoR80); iFn++)
4999 {
5000 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aFpuUnaryTwoR80[iFn]))
5001 continue;
5002
5003 uint32_t const cTests = *g_aFpuUnaryTwoR80[iFn].pcTests;
5004 FPU_UNARY_TWO_R80_TEST_T const * const paTests = g_aFpuUnaryTwoR80[iFn].paTests;
5005 PFNIEMAIMPLFPUR80UNARYTWO pfn = g_aFpuUnaryTwoR80[iFn].pfn;
5006 uint32_t const cVars = COUNT_VARIATIONS(g_aFpuUnaryTwoR80[iFn]);
5007 if (!cTests) RTTestSkipped(g_hTest, "no tests");
5008 for (uint32_t iVar = 0; iVar < cVars; iVar++)
5009 {
5010 for (uint32_t iTest = 0; iTest < cTests; iTest++)
5011 {
5012 IEMFPURESULTTWO Res = { RTFLOAT80U_INIT(0, 0, 0), 0, RTFLOAT80U_INIT(0, 0, 0) };
5013 RTFLOAT80U const InVal = paTests[iTest].InVal;
5014 State.FCW = paTests[iTest].fFcw;
5015 State.FSW = paTests[iTest].fFswIn;
5016 pfn(&State, &Res, &InVal);
5017 if ( Res.FSW != paTests[iTest].fFswOut
5018 || !RTFLOAT80U_ARE_IDENTICAL(&Res.r80Result1, &paTests[iTest].OutVal1)
5019 || !RTFLOAT80U_ARE_IDENTICAL(&Res.r80Result2, &paTests[iTest].OutVal2) )
5020 RTTestFailed(g_hTest, "#%04u%s: fcw=%#06x fsw=%#06x in=%s\n"
5021 "%s -> fsw=%#06x %s %s\n"
5022 "%s expected %#06x %s %s %s%s%s (%s)\n",
5023 iTest, iVar ? "/n" : "", paTests[iTest].fFcw, paTests[iTest].fFswIn,
5024 FormatR80(&paTests[iTest].InVal),
5025 iVar ? " " : "", Res.FSW, FormatR80(&Res.r80Result1), FormatR80(&Res.r80Result2),
5026 iVar ? " " : "", paTests[iTest].fFswOut,
5027 FormatR80(&paTests[iTest].OutVal1), FormatR80(&paTests[iTest].OutVal2),
5028 !RTFLOAT80U_ARE_IDENTICAL(&Res.r80Result1, &paTests[iTest].OutVal1) ? " - val1" : "",
5029 !RTFLOAT80U_ARE_IDENTICAL(&Res.r80Result2, &paTests[iTest].OutVal2) ? " - val2" : "",
5030 FswDiff(Res.FSW, paTests[iTest].fFswOut), FormatFcw(paTests[iTest].fFcw) );
5031 }
5032 pfn = g_aFpuUnaryTwoR80[iFn].pfnNative;
5033 }
5034 }
5035}
5036
5037
5038/*********************************************************************************************************************************
5039* SSE floating point Binary Operations *
5040*********************************************************************************************************************************/
5041
5042/*
5043 * Binary SSE operations on packed single precision floating point values.
5044 */
5045TYPEDEF_SUBTEST_TYPE(SSE_BINARY_R32_T, SSE_BINARY_TEST_T, PFNIEMAIMPLFPSSEF2U128);
5046
5047static SSE_BINARY_R32_T g_aSseBinaryR32[] =
5048{
5049 ENTRY_BIN(addps_u128),
5050 ENTRY_BIN(mulps_u128),
5051 ENTRY_BIN(subps_u128),
5052 ENTRY_BIN(minps_u128),
5053 ENTRY_BIN(divps_u128),
5054 ENTRY_BIN(maxps_u128),
5055 ENTRY_BIN(haddps_u128),
5056 ENTRY_BIN(hsubps_u128),
5057 ENTRY_BIN(sqrtps_u128),
5058 ENTRY_BIN(addsubps_u128),
5059 ENTRY_BIN(cvtps2pd_u128),
5060};
5061
5062#ifdef TSTIEMAIMPL_WITH_GENERATOR
5063static RTEXITCODE SseBinaryR32Generate(const char *pszDataFileFmt, uint32_t cTests)
5064{
5065 cTests = RT_MAX(192, cTests); /* there are 144 standard input variations */
5066
5067 static struct { RTFLOAT32U aVal1[4], aVal2[4]; } const s_aSpecials[] =
5068 {
5069 { { RTFLOAT32U_INIT_ZERO(0), RTFLOAT32U_INIT_ZERO(0), RTFLOAT32U_INIT_ZERO(0), RTFLOAT32U_INIT_ZERO(0), },
5070 { RTFLOAT32U_INIT_C(0, 8388607, RTFLOAT32U_EXP_MAX - 1), RTFLOAT32U_INIT_C(0, 8388607, RTFLOAT32U_EXP_MAX - 1), RTFLOAT32U_INIT_C(0, 8388607, RTFLOAT32U_EXP_MAX - 1), RTFLOAT32U_INIT_C(0, 8388607, RTFLOAT32U_EXP_MAX - 1) } },
5071 /** @todo More specials. */
5072 };
5073
5074 X86FXSTATE State;
5075 RT_ZERO(State);
5076 uint32_t cMinNormalPairs = (cTests - 144) / 4;
5077 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseBinaryR32); iFn++)
5078 {
5079 PFNIEMAIMPLFPSSEF2U128 const pfn = g_aSseBinaryR32[iFn].pfnNative ? g_aSseBinaryR32[iFn].pfnNative : g_aSseBinaryR32[iFn].pfn;
5080
5081 IEMBINARYOUTPUT BinOut;
5082 AssertReturn(GenerateBinaryOpen(&BinOut, pszDataFileFmt, g_aSseBinaryR32[iFn].pszName), RTEXITCODE_FAILURE);
5083
5084 uint32_t cNormalInputPairs = 0;
5085 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aSpecials); iTest += 1)
5086 {
5087 SSE_BINARY_TEST_T TestData; RT_ZERO(TestData);
5088
5089 TestData.InVal1.ar32[0] = iTest < cTests ? RandR32Src(iTest) : s_aSpecials[iTest - cTests].aVal1[0];
5090 TestData.InVal1.ar32[1] = iTest < cTests ? RandR32Src(iTest) : s_aSpecials[iTest - cTests].aVal1[1];
5091 TestData.InVal1.ar32[2] = iTest < cTests ? RandR32Src(iTest) : s_aSpecials[iTest - cTests].aVal1[2];
5092 TestData.InVal1.ar32[3] = iTest < cTests ? RandR32Src(iTest) : s_aSpecials[iTest - cTests].aVal1[3];
5093
5094 TestData.InVal2.ar32[0] = iTest < cTests ? RandR32Src2(iTest) : s_aSpecials[iTest - cTests].aVal2[0];
5095 TestData.InVal2.ar32[1] = iTest < cTests ? RandR32Src2(iTest) : s_aSpecials[iTest - cTests].aVal2[1];
5096 TestData.InVal2.ar32[2] = iTest < cTests ? RandR32Src2(iTest) : s_aSpecials[iTest - cTests].aVal2[2];
5097 TestData.InVal2.ar32[3] = iTest < cTests ? RandR32Src2(iTest) : s_aSpecials[iTest - cTests].aVal2[3];
5098
5099 if ( RTFLOAT32U_IS_NORMAL(&TestData.InVal1.ar32[0]) && RTFLOAT32U_IS_NORMAL(&TestData.InVal2.ar32[0])
5100 && RTFLOAT32U_IS_NORMAL(&TestData.InVal1.ar32[1]) && RTFLOAT32U_IS_NORMAL(&TestData.InVal2.ar32[1])
5101 && RTFLOAT32U_IS_NORMAL(&TestData.InVal1.ar32[2]) && RTFLOAT32U_IS_NORMAL(&TestData.InVal2.ar32[2])
5102 && RTFLOAT32U_IS_NORMAL(&TestData.InVal1.ar32[3]) && RTFLOAT32U_IS_NORMAL(&TestData.InVal2.ar32[3]))
5103 cNormalInputPairs++;
5104 else if (cNormalInputPairs < cMinNormalPairs && iTest + cMinNormalPairs >= cTests && iTest < cTests)
5105 {
5106 iTest -= 1;
5107 continue;
5108 }
5109
5110 uint32_t const fMxcsr = RandMxcsr() & X86_MXCSR_XCPT_FLAGS;
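 /* For each rounding/DAZ/FZ combination, record one result with all MXCSR
    exceptions masked and one with all of them unmasked; if any exception
    flags get raised, add further variations derived from those flags so
    both masked and unmasked exception handling gets covered.  The other
    SSE binary generators below use the same scheme. */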
5111 for (uint16_t iRounding = 0; iRounding < 4; iRounding++)
5112 for (uint8_t iDaz = 0; iDaz < 2; iDaz++)
5113 for (uint8_t iFz = 0; iFz < 2; iFz++)
5114 {
5115 State.MXCSR = (fMxcsr & ~X86_MXCSR_RC_MASK)
5116 | (iRounding << X86_MXCSR_RC_SHIFT)
5117 | (iDaz ? X86_MXCSR_DAZ : 0)
5118 | (iFz ? X86_MXCSR_FZ : 0)
5119 | X86_MXCSR_XCPT_MASK;
5120 IEMSSERESULT ResM; RT_ZERO(ResM);
5121 pfn(&State, &ResM, &TestData.InVal1, &TestData.InVal2);
5122 TestData.fMxcsrIn = State.MXCSR;
5123 TestData.fMxcsrOut = ResM.MXCSR;
5124 TestData.OutVal = ResM.uResult;
5125 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
5126
5127 State.MXCSR = State.MXCSR & ~X86_MXCSR_XCPT_MASK;
5128 IEMSSERESULT ResU; RT_ZERO(ResU);
5129 pfn(&State, &ResU, &TestData.InVal1, &TestData.InVal2);
5130 TestData.fMxcsrIn = State.MXCSR;
5131 TestData.fMxcsrOut = ResU.MXCSR;
5132 TestData.OutVal = ResU.uResult;
5133 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
5134
5135 uint16_t fXcpt = (ResM.MXCSR | ResU.MXCSR) & X86_MXCSR_XCPT_FLAGS;
5136 if (fXcpt)
5137 {
5138 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | fXcpt;
5139 IEMSSERESULT Res1; RT_ZERO(Res1);
5140 pfn(&State, &Res1, &TestData.InVal1, &TestData.InVal2);
5141 TestData.fMxcsrIn = State.MXCSR;
5142 TestData.fMxcsrOut = Res1.MXCSR;
5143 TestData.OutVal = Res1.uResult;
5144 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
5145
5146 if (((Res1.MXCSR & X86_MXCSR_XCPT_FLAGS) & fXcpt) != (Res1.MXCSR & X86_MXCSR_XCPT_FLAGS))
5147 {
5148 fXcpt |= Res1.MXCSR & X86_MXCSR_XCPT_FLAGS;
5149 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | (fXcpt << X86_MXCSR_XCPT_MASK_SHIFT);
5150 IEMSSERESULT Res2; RT_ZERO(Res2);
5151 pfn(&State, &Res2, &TestData.InVal1, &TestData.InVal2);
5152 TestData.fMxcsrIn = State.MXCSR;
5153 TestData.fMxcsrOut = Res2.MXCSR;
5154 TestData.OutVal = Res2.uResult;
5155 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
5156 }
5157 if (!RT_IS_POWER_OF_TWO(fXcpt))
5158 for (uint16_t fUnmasked = 1; fUnmasked <= X86_MXCSR_PE; fUnmasked <<= 1)
5159 if (fUnmasked & fXcpt)
5160 {
5161 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | ((fXcpt & ~fUnmasked) << X86_MXCSR_XCPT_MASK_SHIFT);
5162 IEMSSERESULT Res3; RT_ZERO(Res3);
5163 pfn(&State, &Res3, &TestData.InVal1, &TestData.InVal2);
5164 TestData.fMxcsrIn = State.MXCSR;
5165 TestData.fMxcsrOut = Res3.MXCSR;
5166 TestData.OutVal = Res3.uResult;
5167 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
5168 }
5169 }
5170 }
5171 }
5172 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE);
5173 }
5174
5175 return RTEXITCODE_SUCCESS;
5176}
5177#endif
5178
5179static void SseBinaryR32Test(void)
5180{
5181 X86FXSTATE State;
5182 RT_ZERO(State);
5183 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseBinaryR32); iFn++)
5184 {
5185 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aSseBinaryR32[iFn]))
5186 continue;
5187
5188 uint32_t const cbTests = *g_aSseBinaryR32[iFn].pcTests;
5189 SSE_BINARY_TEST_T const * const paTests = g_aSseBinaryR32[iFn].paTests;
5190 PFNIEMAIMPLFPSSEF2U128 pfn = g_aSseBinaryR32[iFn].pfn;
5191 uint32_t const cVars = COUNT_VARIATIONS(g_aSseBinaryR32[iFn]);
5192 if (!cbTests) RTTestSkipped(g_hTest, "no tests");
5193 for (uint32_t iVar = 0; iVar < cVars; iVar++)
5194 {
5195 for (uint32_t iTest = 0; iTest < cbTests / sizeof(paTests[0]); iTest++)
5196 {
5197 IEMSSERESULT Res; RT_ZERO(Res);
5198
5199 State.MXCSR = paTests[iTest].fMxcsrIn;
5200 pfn(&State, &Res, &paTests[iTest].InVal1, &paTests[iTest].InVal2);
5201 bool fValsIdentical = RTFLOAT32U_ARE_IDENTICAL(&Res.uResult.ar32[0], &paTests[iTest].OutVal.ar32[0])
5202 && RTFLOAT32U_ARE_IDENTICAL(&Res.uResult.ar32[1], &paTests[iTest].OutVal.ar32[1])
5203 && RTFLOAT32U_ARE_IDENTICAL(&Res.uResult.ar32[2], &paTests[iTest].OutVal.ar32[2])
5204 && RTFLOAT32U_ARE_IDENTICAL(&Res.uResult.ar32[3], &paTests[iTest].OutVal.ar32[3]);
5205 if ( Res.MXCSR != paTests[iTest].fMxcsrOut
5206 || !fValsIdentical)
5207 RTTestFailed(g_hTest, "#%04u%s: mxcsr=%#08x in1=%s'%s'%s'%s in2=%s'%s'%s'%s\n"
5208 "%s -> mxcsr=%#08x %s'%s'%s'%s\n"
5209 "%s expected %#08x %s'%s'%s'%s%s%s (%s)\n",
5210 iTest, iVar ? "/n" : "", paTests[iTest].fMxcsrIn,
5211 FormatR32(&paTests[iTest].InVal1.ar32[0]), FormatR32(&paTests[iTest].InVal1.ar32[1]),
5212 FormatR32(&paTests[iTest].InVal1.ar32[2]), FormatR32(&paTests[iTest].InVal1.ar32[3]),
5213 FormatR32(&paTests[iTest].InVal2.ar32[0]), FormatR32(&paTests[iTest].InVal2.ar32[1]),
5214 FormatR32(&paTests[iTest].InVal2.ar32[2]), FormatR32(&paTests[iTest].InVal2.ar32[3]),
5215 iVar ? " " : "", Res.MXCSR,
5216 FormatR32(&Res.uResult.ar32[0]), FormatR32(&Res.uResult.ar32[1]),
5217 FormatR32(&Res.uResult.ar32[2]), FormatR32(&Res.uResult.ar32[3]),
5218 iVar ? " " : "", paTests[iTest].fMxcsrOut,
5219 FormatR32(&paTests[iTest].OutVal.ar32[0]), FormatR32(&paTests[iTest].OutVal.ar32[1]),
5220 FormatR32(&paTests[iTest].OutVal.ar32[2]), FormatR32(&paTests[iTest].OutVal.ar32[3]),
5221 MxcsrDiff(Res.MXCSR, paTests[iTest].fMxcsrOut),
5222 !fValsIdentical ? " - val" : "",
5223 FormatMxcsr(paTests[iTest].fMxcsrIn) );
5224 }
5225 pfn = g_aSseBinaryR32[iFn].pfnNative;
5226 }
5227 }
5228}
5229
5230
5231/*
5232 * Binary SSE operations on packed double precision floating point values.
5233 */
5234TYPEDEF_SUBTEST_TYPE(SSE_BINARY_R64_T, SSE_BINARY_TEST_T, PFNIEMAIMPLFPSSEF2U128);
5235
5236static SSE_BINARY_R64_T g_aSseBinaryR64[] =
5237{
5238 ENTRY_BIN(addpd_u128),
5239 ENTRY_BIN(mulpd_u128),
5240 ENTRY_BIN(subpd_u128),
5241 ENTRY_BIN(minpd_u128),
5242 ENTRY_BIN(divpd_u128),
5243 ENTRY_BIN(maxpd_u128),
5244 ENTRY_BIN(haddpd_u128),
5245 ENTRY_BIN(hsubpd_u128),
5246 ENTRY_BIN(sqrtpd_u128),
5247 ENTRY_BIN(addsubpd_u128),
5248 ENTRY_BIN(cvtpd2ps_u128),
5249};
5250
5251#ifdef TSTIEMAIMPL_WITH_GENERATOR
5252static RTEXITCODE SseBinaryR64Generate(const char *pszDataFileFmt, uint32_t cTests)
5253{
5254 cTests = RT_MAX(192, cTests); /* there are 144 standard input variations */
5255
5256 static struct { RTFLOAT64U aVal1[2], aVal2[2]; } const s_aSpecials[] =
5257 {
5258 { { RTFLOAT64U_INIT_ZERO(0), RTFLOAT64U_INIT_ZERO(0) },
5259 { RTFLOAT64U_INIT_C(0, 8388607, RTFLOAT64U_EXP_MAX - 1), RTFLOAT64U_INIT_C(0, 8388607, RTFLOAT64U_EXP_MAX - 1) } },
5260 /** @todo More specials. */
5261 };
5262
5263 X86FXSTATE State;
5264 RT_ZERO(State);
5265 uint32_t cMinNormalPairs = (cTests - 144) / 4;
5266 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseBinaryR64); iFn++)
5267 {
5268 PFNIEMAIMPLFPSSEF2U128 const pfn = g_aSseBinaryR64[iFn].pfnNative ? g_aSseBinaryR64[iFn].pfnNative : g_aSseBinaryR64[iFn].pfn;
5269
5270 IEMBINARYOUTPUT BinOut;
5271 AssertReturn(GenerateBinaryOpen(&BinOut, pszDataFileFmt, g_aSseBinaryR64[iFn].pszName), RTEXITCODE_FAILURE);
5272
5273 uint32_t cNormalInputPairs = 0;
5274 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aSpecials); iTest += 1)
5275 {
5276 SSE_BINARY_TEST_T TestData; RT_ZERO(TestData);
5277
5278 TestData.InVal1.ar64[0] = iTest < cTests ? RandR64Src(iTest) : s_aSpecials[iTest - cTests].aVal1[0];
5279 TestData.InVal1.ar64[1] = iTest < cTests ? RandR64Src(iTest) : s_aSpecials[iTest - cTests].aVal1[1];
5280 TestData.InVal2.ar64[0] = iTest < cTests ? RandR64Src2(iTest) : s_aSpecials[iTest - cTests].aVal2[0];
5281 TestData.InVal2.ar64[1] = iTest < cTests ? RandR64Src2(iTest) : s_aSpecials[iTest - cTests].aVal2[1];
5282
5283 if ( RTFLOAT64U_IS_NORMAL(&TestData.InVal1.ar64[0]) && RTFLOAT64U_IS_NORMAL(&TestData.InVal1.ar64[1])
5284 && RTFLOAT64U_IS_NORMAL(&TestData.InVal2.ar64[0]) && RTFLOAT64U_IS_NORMAL(&TestData.InVal2.ar64[1]))
5285 cNormalInputPairs++;
5286 else if (cNormalInputPairs < cMinNormalPairs && iTest + cMinNormalPairs >= cTests && iTest < cTests)
5287 {
5288 iTest -= 1;
5289 continue;
5290 }
5291
5292 uint32_t const fMxcsr = RandMxcsr() & X86_MXCSR_XCPT_FLAGS;
5293 for (uint16_t iRounding = 0; iRounding < 4; iRounding++)
5294 for (uint8_t iDaz = 0; iDaz < 2; iDaz++)
5295 for (uint8_t iFz = 0; iFz < 2; iFz++)
5296 {
5297 State.MXCSR = (fMxcsr & ~X86_MXCSR_RC_MASK)
5298 | (iRounding << X86_MXCSR_RC_SHIFT)
5299 | (iDaz ? X86_MXCSR_DAZ : 0)
5300 | (iFz ? X86_MXCSR_FZ : 0)
5301 | X86_MXCSR_XCPT_MASK;
5302 IEMSSERESULT ResM; RT_ZERO(ResM);
5303 pfn(&State, &ResM, &TestData.InVal1, &TestData.InVal2);
5304 TestData.fMxcsrIn = State.MXCSR;
5305 TestData.fMxcsrOut = ResM.MXCSR;
5306 TestData.OutVal = ResM.uResult;
5307 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
5308
5309 State.MXCSR = State.MXCSR & ~X86_MXCSR_XCPT_MASK;
5310 IEMSSERESULT ResU; RT_ZERO(ResU);
5311 pfn(&State, &ResU, &TestData.InVal1, &TestData.InVal2);
5312 TestData.fMxcsrIn = State.MXCSR;
5313 TestData.fMxcsrOut = ResU.MXCSR;
5314 TestData.OutVal = ResU.uResult;
5315 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
5316
5317 uint16_t fXcpt = (ResM.MXCSR | ResU.MXCSR) & X86_MXCSR_XCPT_FLAGS;
5318 if (fXcpt)
5319 {
5320 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | fXcpt;
5321 IEMSSERESULT Res1; RT_ZERO(Res1);
5322 pfn(&State, &Res1, &TestData.InVal1, &TestData.InVal2);
5323 TestData.fMxcsrIn = State.MXCSR;
5324 TestData.fMxcsrOut = Res1.MXCSR;
5325 TestData.OutVal = Res1.uResult;
5326 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
5327
5328 if (((Res1.MXCSR & X86_MXCSR_XCPT_FLAGS) & fXcpt) != (Res1.MXCSR & X86_MXCSR_XCPT_FLAGS))
5329 {
5330 fXcpt |= Res1.MXCSR & X86_MXCSR_XCPT_FLAGS;
5331 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | (fXcpt << X86_MXCSR_XCPT_MASK_SHIFT);
5332 IEMSSERESULT Res2; RT_ZERO(Res2);
5333 pfn(&State, &Res2, &TestData.InVal1, &TestData.InVal2);
5334 TestData.fMxcsrIn = State.MXCSR;
5335 TestData.fMxcsrOut = Res2.MXCSR;
5336 TestData.OutVal = Res2.uResult;
5337 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
5338 }
5339 if (!RT_IS_POWER_OF_TWO(fXcpt))
5340 for (uint16_t fUnmasked = 1; fUnmasked <= X86_MXCSR_PE; fUnmasked <<= 1)
5341 if (fUnmasked & fXcpt)
5342 {
5343 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | ((fXcpt & ~fUnmasked) << X86_MXCSR_XCPT_MASK_SHIFT);
5344 IEMSSERESULT Res3; RT_ZERO(Res3);
5345 pfn(&State, &Res3, &TestData.InVal1, &TestData.InVal2);
5346 TestData.fMxcsrIn = State.MXCSR;
5347 TestData.fMxcsrOut = Res3.MXCSR;
5348 TestData.OutVal = Res3.uResult;
5349 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
5350 }
5351 }
5352 }
5353 }
5354 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE);
5355 }
5356
5357 return RTEXITCODE_SUCCESS;
5358}
5359#endif
5360
5361
5362static void SseBinaryR64Test(void)
5363{
5364 X86FXSTATE State;
5365 RT_ZERO(State);
5366 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseBinaryR64); iFn++)
5367 {
5368 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aSseBinaryR64[iFn]))
5369 continue;
5370
5371 uint32_t const cbTests = *g_aSseBinaryR64[iFn].pcTests;
5372 SSE_BINARY_TEST_T const * const paTests = g_aSseBinaryR64[iFn].paTests;
5373 PFNIEMAIMPLFPSSEF2U128 pfn = g_aSseBinaryR64[iFn].pfn;
5374 uint32_t const cVars = COUNT_VARIATIONS(g_aSseBinaryR64[iFn]);
5375 if (!cbTests) RTTestSkipped(g_hTest, "no tests");
5376 for (uint32_t iVar = 0; iVar < cVars; iVar++)
5377 {
5378 for (uint32_t iTest = 0; iTest < cbTests / sizeof(paTests[0]); iTest++)
5379 {
5380 IEMSSERESULT Res; RT_ZERO(Res);
5381
5382 State.MXCSR = paTests[iTest].fMxcsrIn;
5383 pfn(&State, &Res, &paTests[iTest].InVal1, &paTests[iTest].InVal2);
5384 if ( Res.MXCSR != paTests[iTest].fMxcsrOut
5385 || !RTFLOAT64U_ARE_IDENTICAL(&Res.uResult.ar64[0], &paTests[iTest].OutVal.ar64[0])
5386 || !RTFLOAT64U_ARE_IDENTICAL(&Res.uResult.ar64[1], &paTests[iTest].OutVal.ar64[1]))
5387 RTTestFailed(g_hTest, "#%04u%s: mxcsr=%#08x in1=%s'%s in2=%s'%s\n"
5388 "%s -> mxcsr=%#08x %s'%s\n"
5389 "%s expected %#08x %s'%s%s%s (%s)\n",
5390 iTest, iVar ? "/n" : "", paTests[iTest].fMxcsrIn,
5391 FormatR64(&paTests[iTest].InVal1.ar64[0]), FormatR64(&paTests[iTest].InVal1.ar64[1]),
5392 FormatR64(&paTests[iTest].InVal2.ar64[0]), FormatR64(&paTests[iTest].InVal2.ar64[1]),
5393 iVar ? " " : "", Res.MXCSR,
5394 FormatR64(&Res.uResult.ar64[0]), FormatR64(&Res.uResult.ar64[1]),
5395 iVar ? " " : "", paTests[iTest].fMxcsrOut,
5396 FormatR64(&paTests[iTest].OutVal.ar64[0]), FormatR64(&paTests[iTest].OutVal.ar64[1]),
5397 MxcsrDiff(Res.MXCSR, paTests[iTest].fMxcsrOut),
5398 ( !RTFLOAT64U_ARE_IDENTICAL(&Res.uResult.ar64[0], &paTests[iTest].OutVal.ar64[0])
5399 || !RTFLOAT64U_ARE_IDENTICAL(&Res.uResult.ar64[1], &paTests[iTest].OutVal.ar64[1]))
5400 ? " - val" : "",
5401 FormatMxcsr(paTests[iTest].fMxcsrIn) );
5402 }
5403 pfn = g_aSseBinaryR64[iFn].pfnNative;
5404 }
5405 }
5406}
5407
5408
5409/*
5410 * Binary SSE operations on scalar single precision floating point values (xxxss xmm1, r/m32).
5411 */
5412TYPEDEF_SUBTEST_TYPE(SSE_BINARY_U128_R32_T, SSE_BINARY_U128_R32_TEST_T, PFNIEMAIMPLFPSSEF2U128R32);
5413
5414static SSE_BINARY_U128_R32_T g_aSseBinaryU128R32[] =
5415{
5416 ENTRY_BIN(addss_u128_r32),
5417 ENTRY_BIN(mulss_u128_r32),
5418 ENTRY_BIN(subss_u128_r32),
5419 ENTRY_BIN(minss_u128_r32),
5420 ENTRY_BIN(divss_u128_r32),
5421 ENTRY_BIN(maxss_u128_r32),
5422 ENTRY_BIN(cvtss2sd_u128_r32),
5423 ENTRY_BIN(sqrtss_u128_r32),
5424};
5425
5426#ifdef TSTIEMAIMPL_WITH_GENERATOR
5427static RTEXITCODE SseBinaryU128R32Generate(const char *pszDataFileFmt, uint32_t cTests)
5428{
5429 cTests = RT_MAX(192, cTests); /* there are 144 standard input variations */
5430
5431 static struct { RTFLOAT32U aVal1[4], Val2; } const s_aSpecials[] =
5432 {
5433 { { RTFLOAT32U_INIT_ZERO(0), RTFLOAT32U_INIT_ZERO(0), RTFLOAT32U_INIT_ZERO(0), RTFLOAT32U_INIT_ZERO(0), }, RTFLOAT32U_INIT_C(0, 8388607, RTFLOAT32U_EXP_MAX - 1) },
5434 /** @todo More specials. */
5435 };
5436
5437 X86FXSTATE State;
5438 RT_ZERO(State);
5439 uint32_t cMinNormalPairs = (cTests - 144) / 4;
5440 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseBinaryU128R32); iFn++)
5441 {
5442 PFNIEMAIMPLFPSSEF2U128R32 const pfn = g_aSseBinaryU128R32[iFn].pfnNative ? g_aSseBinaryU128R32[iFn].pfnNative : g_aSseBinaryU128R32[iFn].pfn;
5443
5444 IEMBINARYOUTPUT BinOut;
5445 AssertReturn(GenerateBinaryOpen(&BinOut, pszDataFileFmt, g_aSseBinaryU128R32[iFn].pszName), RTEXITCODE_FAILURE);
5446
5447 uint32_t cNormalInputPairs = 0;
5448 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aSpecials); iTest += 1)
5449 {
5450 SSE_BINARY_U128_R32_TEST_T TestData; RT_ZERO(TestData);
5451
5452 TestData.InVal1.ar32[0] = iTest < cTests ? RandR32Src(iTest) : s_aSpecials[iTest - cTests].aVal1[0];
5453 TestData.InVal1.ar32[1] = iTest < cTests ? RandR32Src(iTest) : s_aSpecials[iTest - cTests].aVal1[1];
5454 TestData.InVal1.ar32[2] = iTest < cTests ? RandR32Src(iTest) : s_aSpecials[iTest - cTests].aVal1[2];
5455 TestData.InVal1.ar32[3] = iTest < cTests ? RandR32Src(iTest) : s_aSpecials[iTest - cTests].aVal1[3];
5456
5457 TestData.r32Val2 = iTest < cTests ? RandR32Src2(iTest) : s_aSpecials[iTest - cTests].Val2;
5458
5459 if ( RTFLOAT32U_IS_NORMAL(&TestData.InVal1.ar32[0])
5460 && RTFLOAT32U_IS_NORMAL(&TestData.InVal1.ar32[1])
5461 && RTFLOAT32U_IS_NORMAL(&TestData.InVal1.ar32[2])
5462 && RTFLOAT32U_IS_NORMAL(&TestData.InVal1.ar32[3])
5463 && RTFLOAT32U_IS_NORMAL(&TestData.r32Val2))
5464 cNormalInputPairs++;
5465 else if (cNormalInputPairs < cMinNormalPairs && iTest + cMinNormalPairs >= cTests && iTest < cTests)
5466 {
5467 iTest -= 1;
5468 continue;
5469 }
5470
5471 uint32_t const fMxcsr = RandMxcsr() & X86_MXCSR_XCPT_FLAGS;
5472 for (uint16_t iRounding = 0; iRounding < 4; iRounding++)
5473 for (uint8_t iDaz = 0; iDaz < 2; iDaz++)
5474 for (uint8_t iFz = 0; iFz < 2; iFz++)
5475 {
5476 State.MXCSR = (fMxcsr & ~X86_MXCSR_RC_MASK)
5477 | (iRounding << X86_MXCSR_RC_SHIFT)
5478 | (iDaz ? X86_MXCSR_DAZ : 0)
5479 | (iFz ? X86_MXCSR_FZ : 0)
5480 | X86_MXCSR_XCPT_MASK;
5481 IEMSSERESULT ResM; RT_ZERO(ResM);
5482 pfn(&State, &ResM, &TestData.InVal1, &TestData.r32Val2);
5483 TestData.fMxcsrIn = State.MXCSR;
5484 TestData.fMxcsrOut = ResM.MXCSR;
5485 TestData.OutVal = ResM.uResult;
5486 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
5487
5488 State.MXCSR = State.MXCSR & ~X86_MXCSR_XCPT_MASK;
5489 IEMSSERESULT ResU; RT_ZERO(ResU);
5490 pfn(&State, &ResU, &TestData.InVal1, &TestData.r32Val2);
5491 TestData.fMxcsrIn = State.MXCSR;
5492 TestData.fMxcsrOut = ResU.MXCSR;
5493 TestData.OutVal = ResU.uResult;
5494 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
5495
5496 uint16_t fXcpt = (ResM.MXCSR | ResU.MXCSR) & X86_MXCSR_XCPT_FLAGS;
5497 if (fXcpt)
5498 {
5499 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | fXcpt;
5500 IEMSSERESULT Res1; RT_ZERO(Res1);
5501 pfn(&State, &Res1, &TestData.InVal1, &TestData.r32Val2);
5502 TestData.fMxcsrIn = State.MXCSR;
5503 TestData.fMxcsrOut = Res1.MXCSR;
5504 TestData.OutVal = Res1.uResult;
5505 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
5506
5507 if (((Res1.MXCSR & X86_MXCSR_XCPT_FLAGS) & fXcpt) != (Res1.MXCSR & X86_MXCSR_XCPT_FLAGS))
5508 {
5509 fXcpt |= Res1.MXCSR & X86_MXCSR_XCPT_FLAGS;
5510 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | (fXcpt << X86_MXCSR_XCPT_MASK_SHIFT);
5511 IEMSSERESULT Res2; RT_ZERO(Res2);
5512 pfn(&State, &Res2, &TestData.InVal1, &TestData.r32Val2);
5513 TestData.fMxcsrIn = State.MXCSR;
5514 TestData.fMxcsrOut = Res2.MXCSR;
5515 TestData.OutVal = Res2.uResult;
5516 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
5517 }
5518 if (!RT_IS_POWER_OF_TWO(fXcpt))
5519 for (uint16_t fUnmasked = 1; fUnmasked <= X86_MXCSR_PE; fUnmasked <<= 1)
5520 if (fUnmasked & fXcpt)
5521 {
5522 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | ((fXcpt & ~fUnmasked) << X86_MXCSR_XCPT_MASK_SHIFT);
5523 IEMSSERESULT Res3; RT_ZERO(Res3);
5524 pfn(&State, &Res3, &TestData.InVal1, &TestData.r32Val2);
5525 TestData.fMxcsrIn = State.MXCSR;
5526 TestData.fMxcsrOut = Res3.MXCSR;
5527 TestData.OutVal = Res3.uResult;
5528 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
5529 }
5530 }
5531 }
5532 }
5533 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE);
5534 }
5535
5536 return RTEXITCODE_SUCCESS;
5537}
5538#endif
5539
5540static void SseBinaryU128R32Test(void)
5541{
5542 X86FXSTATE State;
5543 RT_ZERO(State);
5544 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseBinaryU128R32); iFn++)
5545 {
5546 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aSseBinaryU128R32[iFn]))
5547 continue;
5548
5549 uint32_t const cbTests = *g_aSseBinaryU128R32[iFn].pcTests;
5550 SSE_BINARY_U128_R32_TEST_T const * const paTests = g_aSseBinaryU128R32[iFn].paTests;
5551 PFNIEMAIMPLFPSSEF2U128R32 pfn = g_aSseBinaryU128R32[iFn].pfn;
5552 uint32_t const cVars = COUNT_VARIATIONS(g_aSseBinaryU128R32[iFn]);
5553 if (!cbTests) RTTestSkipped(g_hTest, "no tests");
5554 for (uint32_t iVar = 0; iVar < cVars; iVar++)
5555 {
5556 for (uint32_t iTest = 0; iTest < cbTests / sizeof(paTests[0]); iTest++)
5557 {
5558 IEMSSERESULT Res; RT_ZERO(Res);
5559
5560 State.MXCSR = paTests[iTest].fMxcsrIn;
5561 pfn(&State, &Res, &paTests[iTest].InVal1, &paTests[iTest].r32Val2);
5562 bool fValsIdentical = RTFLOAT32U_ARE_IDENTICAL(&Res.uResult.ar32[0], &paTests[iTest].OutVal.ar32[0])
5563 && RTFLOAT32U_ARE_IDENTICAL(&Res.uResult.ar32[1], &paTests[iTest].OutVal.ar32[1])
5564 && RTFLOAT32U_ARE_IDENTICAL(&Res.uResult.ar32[2], &paTests[iTest].OutVal.ar32[2])
5565 && RTFLOAT32U_ARE_IDENTICAL(&Res.uResult.ar32[3], &paTests[iTest].OutVal.ar32[3]);
5566 if ( Res.MXCSR != paTests[iTest].fMxcsrOut
5567 || !fValsIdentical)
5568 RTTestFailed(g_hTest, "#%04u%s: mxcsr=%#08x in1=%s'%s'%s'%s in2=%s\n"
5569 "%s -> mxcsr=%#08x %s'%s'%s'%s\n"
5570 "%s expected %#08x %s'%s'%s'%s%s%s (%s)\n",
5571 iTest, iVar ? "/n" : "", paTests[iTest].fMxcsrIn,
5572 FormatR32(&paTests[iTest].InVal1.ar32[0]), FormatR32(&paTests[iTest].InVal1.ar32[1]),
5573 FormatR32(&paTests[iTest].InVal1.ar32[2]), FormatR32(&paTests[iTest].InVal1.ar32[3]),
5574 FormatR32(&paTests[iTest].r32Val2),
5575 iVar ? " " : "", Res.MXCSR,
5576 FormatR32(&Res.uResult.ar32[0]), FormatR32(&Res.uResult.ar32[1]),
5577 FormatR32(&Res.uResult.ar32[2]), FormatR32(&Res.uResult.ar32[3]),
5578 iVar ? " " : "", paTests[iTest].fMxcsrOut,
5579 FormatR32(&paTests[iTest].OutVal.ar32[0]), FormatR32(&paTests[iTest].OutVal.ar32[1]),
5580 FormatR32(&paTests[iTest].OutVal.ar32[2]), FormatR32(&paTests[iTest].OutVal.ar32[3]),
5581 MxcsrDiff(Res.MXCSR, paTests[iTest].fMxcsrOut),
5582 !fValsIdentical ? " - val" : "",
5583 FormatMxcsr(paTests[iTest].fMxcsrIn) );
5584 }
 pfn = g_aSseBinaryU128R32[iFn].pfnNative;
5585 }
5586 }
5587}
5588
5589
5590/*
5591 * Binary SSE operations on scalar double precision floating point values (xxxsd xmm1, r/m64).
5592 */
5593TYPEDEF_SUBTEST_TYPE(SSE_BINARY_U128_R64_T, SSE_BINARY_U128_R64_TEST_T, PFNIEMAIMPLFPSSEF2U128R64);
5594
5595static SSE_BINARY_U128_R64_T g_aSseBinaryU128R64[] =
5596{
5597 ENTRY_BIN(addsd_u128_r64),
5598 ENTRY_BIN(mulsd_u128_r64),
5599 ENTRY_BIN(subsd_u128_r64),
5600 ENTRY_BIN(minsd_u128_r64),
5601 ENTRY_BIN(divsd_u128_r64),
5602 ENTRY_BIN(maxsd_u128_r64),
5603 ENTRY_BIN(cvtsd2ss_u128_r64),
5604 ENTRY_BIN(sqrtsd_u128_r64),
5605};
5606
5607#ifdef TSTIEMAIMPL_WITH_GENERATOR
5608static RTEXITCODE SseBinaryU128R64Generate(const char *pszDataFileFmt, uint32_t cTests)
5609{
5610 cTests = RT_MAX(192, cTests); /* there are 144 standard input variations */
5611
5612 static struct { RTFLOAT64U aVal1[2], Val2; } const s_aSpecials[] =
5613 {
5614 { { RTFLOAT64U_INIT_ZERO(0), RTFLOAT64U_INIT_ZERO(0) }, RTFLOAT64U_INIT_C(0, 8388607, RTFLOAT64U_EXP_MAX - 1) },
5615 /** @todo More specials. */
5616 };
5617
5618 X86FXSTATE State;
5619 RT_ZERO(State);
5620 uint32_t cMinNormalPairs = (cTests - 144) / 4;
5621 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseBinaryU128R64); iFn++)
5622 {
5623 PFNIEMAIMPLFPSSEF2U128R64 const pfn = g_aSseBinaryU128R64[iFn].pfnNative ? g_aSseBinaryU128R64[iFn].pfnNative : g_aSseBinaryU128R64[iFn].pfn;
5624
5625 IEMBINARYOUTPUT BinOut;
5626 AssertReturn(GenerateBinaryOpen(&BinOut, pszDataFileFmt, g_aSseBinaryU128R64[iFn].pszName), RTEXITCODE_FAILURE);
5627
5628 uint32_t cNormalInputPairs = 0;
5629 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aSpecials); iTest += 1)
5630 {
5631 SSE_BINARY_U128_R64_TEST_T TestData; RT_ZERO(TestData);
5632
5633 TestData.InVal1.ar64[0] = iTest < cTests ? RandR64Src(iTest) : s_aSpecials[iTest - cTests].aVal1[0];
5634 TestData.InVal1.ar64[1] = iTest < cTests ? RandR64Src(iTest) : s_aSpecials[iTest - cTests].aVal1[1];
5635 TestData.r64Val2 = iTest < cTests ? RandR64Src2(iTest) : s_aSpecials[iTest - cTests].Val2;
5636
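            /* Ensure a minimum number of normal (non-special) input pairs: if we are running
               short towards the end of the run, redo this iteration with fresh random inputs. */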
5637 if ( RTFLOAT64U_IS_NORMAL(&TestData.InVal1.ar64[0]) && RTFLOAT64U_IS_NORMAL(&TestData.InVal1.ar64[1])
5638 && RTFLOAT64U_IS_NORMAL(&TestData.r64Val2))
5639 cNormalInputPairs++;
5640 else if (cNormalInputPairs < cMinNormalPairs && iTest + cMinNormalPairs >= cTests && iTest < cTests)
5641 {
5642 iTest -= 1;
5643 continue;
5644 }
5645
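            /*
             * Sweep all four rounding modes and the DAZ/FZ combinations for this input (DAZ
             * treats denormal inputs as zero, FZ flushes denormal results to zero).  Each
             * combination is recorded twice: once with all exceptions masked and once with
             * them all unmasked.  If exception flags are raised, further variations are
             * recorded with those status flags pre-set on input and with the raised
             * exceptions masked or unmasked in different combinations, so the flag
             * accumulation and unmasking behaviour is covered as well.
             */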
5646 uint32_t const fMxcsr = RandMxcsr() & X86_MXCSR_XCPT_FLAGS;
5647 for (uint16_t iRounding = 0; iRounding < 4; iRounding++)
5648 for (uint8_t iDaz = 0; iDaz < 2; iDaz++)
5649 for (uint8_t iFz = 0; iFz < 2; iFz++)
5650 {
5651 State.MXCSR = (fMxcsr & ~X86_MXCSR_RC_MASK)
5652 | (iRounding << X86_MXCSR_RC_SHIFT)
5653 | (iDaz ? X86_MXCSR_DAZ : 0)
5654 | (iFz ? X86_MXCSR_FZ : 0)
5655 | X86_MXCSR_XCPT_MASK;
5656 IEMSSERESULT ResM; RT_ZERO(ResM);
5657 pfn(&State, &ResM, &TestData.InVal1, &TestData.r64Val2);
5658 TestData.fMxcsrIn = State.MXCSR;
5659 TestData.fMxcsrOut = ResM.MXCSR;
5660 TestData.OutVal = ResM.uResult;
5661 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
5662
5663 State.MXCSR = State.MXCSR & ~X86_MXCSR_XCPT_MASK;
5664 IEMSSERESULT ResU; RT_ZERO(ResU);
5665 pfn(&State, &ResU, &TestData.InVal1, &TestData.r64Val2);
5666 TestData.fMxcsrIn = State.MXCSR;
5667 TestData.fMxcsrOut = ResU.MXCSR;
5668 TestData.OutVal = ResU.uResult;
5669 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
5670
5671 uint16_t fXcpt = (ResM.MXCSR | ResU.MXCSR) & X86_MXCSR_XCPT_FLAGS;
5672 if (fXcpt)
5673 {
5674 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | fXcpt;
5675 IEMSSERESULT Res1; RT_ZERO(Res1);
5676 pfn(&State, &Res1, &TestData.InVal1, &TestData.r64Val2);
5677 TestData.fMxcsrIn = State.MXCSR;
5678 TestData.fMxcsrOut = Res1.MXCSR;
5679 TestData.OutVal = Res1.uResult;
5680 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
5681
5682 if (((Res1.MXCSR & X86_MXCSR_XCPT_FLAGS) & fXcpt) != (Res1.MXCSR & X86_MXCSR_XCPT_FLAGS))
5683 {
5684 fXcpt |= Res1.MXCSR & X86_MXCSR_XCPT_FLAGS;
5685 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | (fXcpt << X86_MXCSR_XCPT_MASK_SHIFT);
5686 IEMSSERESULT Res2; RT_ZERO(Res2);
5687 pfn(&State, &Res2, &TestData.InVal1, &TestData.r64Val2);
5688 TestData.fMxcsrIn = State.MXCSR;
5689 TestData.fMxcsrOut = Res2.MXCSR;
5690 TestData.OutVal = Res2.uResult;
5691 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
5692 }
5693 if (!RT_IS_POWER_OF_TWO(fXcpt))
5694 for (uint16_t fUnmasked = 1; fUnmasked <= X86_MXCSR_PE; fUnmasked <<= 1)
5695 if (fUnmasked & fXcpt)
5696 {
5697 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | ((fXcpt & ~fUnmasked) << X86_MXCSR_XCPT_MASK_SHIFT);
5698 IEMSSERESULT Res3; RT_ZERO(Res3);
5699 pfn(&State, &Res3, &TestData.InVal1, &TestData.r64Val2);
5700 TestData.fMxcsrIn = State.MXCSR;
5701 TestData.fMxcsrOut = Res3.MXCSR;
5702 TestData.OutVal = Res3.uResult;
5703 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
5704 }
5705 }
5706 }
5707 }
5708 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE);
5709 }
5710
5711 return RTEXITCODE_SUCCESS;
5712}
5713#endif
5714
5715
5716static void SseBinaryU128R64Test(void)
5717{
5718 X86FXSTATE State;
5719 RT_ZERO(State);
5720 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseBinaryU128R64); iFn++)
5721 {
5722 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aSseBinaryU128R64[iFn]))
5723 continue;
5724
5725 uint32_t const cTests = *g_aSseBinaryU128R64[iFn].pcTests;
5726 SSE_BINARY_U128_R64_TEST_T const * const paTests = g_aSseBinaryU128R64[iFn].paTests;
5727 PFNIEMAIMPLFPSSEF2U128R64 pfn = g_aSseBinaryU128R64[iFn].pfn;
5728 uint32_t const cVars = COUNT_VARIATIONS(g_aSseBinaryU128R64[iFn]);
5729 if (!cTests) RTTestSkipped(g_hTest, "no tests");
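        /* Note: cTests is the size of the decompressed test data in bytes, hence the
           division by the entry size in the loop below. */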
5730 for (uint32_t iVar = 0; iVar < cVars; iVar++)
5731 {
5732 for (uint32_t iTest = 0; iTest < cTests / sizeof(SSE_BINARY_U128_R64_TEST_T); iTest++)
5733 {
5734 IEMSSERESULT Res; RT_ZERO(Res);
5735
5736 State.MXCSR = paTests[iTest].fMxcsrIn;
5737 pfn(&State, &Res, &paTests[iTest].InVal1, &paTests[iTest].r64Val2);
5738 if ( Res.MXCSR != paTests[iTest].fMxcsrOut
5739 || !RTFLOAT64U_ARE_IDENTICAL(&Res.uResult.ar64[0], &paTests[iTest].OutVal.ar64[0])
5740 || !RTFLOAT64U_ARE_IDENTICAL(&Res.uResult.ar64[1], &paTests[iTest].OutVal.ar64[1]))
5741 RTTestFailed(g_hTest, "#%04u%s: mxcsr=%#08x in1=%s'%s in2=%s\n"
5742 "%s -> mxcsr=%#08x %s'%s\n"
5743 "%s expected %#08x %s'%s%s%s (%s)\n",
5744 iTest, iVar ? "/n" : "", paTests[iTest].fMxcsrIn,
5745 FormatR64(&paTests[iTest].InVal1.ar64[0]), FormatR64(&paTests[iTest].InVal1.ar64[1]),
5746 FormatR64(&paTests[iTest].r64Val2),
5747 iVar ? " " : "", Res.MXCSR,
5748 FormatR64(&Res.uResult.ar64[0]), FormatR64(&Res.uResult.ar64[1]),
5749 iVar ? " " : "", paTests[iTest].fMxcsrOut,
5750 FormatR64(&paTests[iTest].OutVal.ar64[0]), FormatR64(&paTests[iTest].OutVal.ar64[1]),
5751 MxcsrDiff(Res.MXCSR, paTests[iTest].fMxcsrOut),
5752 ( !RTFLOAT64U_ARE_IDENTICAL(&Res.uResult.ar64[0], &paTests[iTest].OutVal.ar64[0])
5753 || !RTFLOAT64U_ARE_IDENTICAL(&Res.uResult.ar64[1], &paTests[iTest].OutVal.ar64[1]))
5754 ? " - val" : "",
5755 FormatMxcsr(paTests[iTest].fMxcsrIn) );
5756 }
5757 }
5758 }
5759}
5760
5761
5762/*
5763 * SSE operations converting single double-precision floating point values to signed double-word integers (cvttsd2si and friends).
5764 */
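/* Note: cvttsd2si always truncates (rounds towards zero), whereas cvtsd2si converts using
   the rounding mode currently selected in MXCSR.RC. */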
5765TYPEDEF_SUBTEST_TYPE(SSE_BINARY_I32_R64_T, SSE_BINARY_I32_R64_TEST_T, PFNIEMAIMPLSSEF2I32U64);
5766
5767static SSE_BINARY_I32_R64_T g_aSseBinaryI32R64[] =
5768{
5769 ENTRY_BIN(cvttsd2si_i32_r64),
5770 ENTRY_BIN(cvtsd2si_i32_r64),
5771};
5772
5773#ifdef TSTIEMAIMPL_WITH_GENERATOR
5774static RTEXITCODE SseBinaryI32R64Generate(const char *pszDataFileFmt, uint32_t cTests)
5775{
5776 cTests = RT_MAX(192, cTests); /* there are 144 standard input variations */
5777
5778 static struct { RTFLOAT64U Val; } const s_aSpecials[] =
5779 {
5780 { RTFLOAT64U_INIT_C(0, 8388607, RTFLOAT64U_EXP_MAX - 1) },
5781 /** @todo More specials. */
5782 };
5783
5784 X86FXSTATE State;
5785 RT_ZERO(State);
5786 uint32_t cMinNormalPairs = (cTests - 144) / 4;
5787 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseBinaryI32R64); iFn++)
5788 {
5789 PFNIEMAIMPLSSEF2I32U64 const pfn = g_aSseBinaryI32R64[iFn].pfnNative ? g_aSseBinaryI32R64[iFn].pfnNative : g_aSseBinaryI32R64[iFn].pfn;
5790
5791 IEMBINARYOUTPUT BinOut;
5792 AssertReturn(GenerateBinaryOpen(&BinOut, pszDataFileFmt, g_aSseBinaryI32R64[iFn].pszName), RTEXITCODE_FAILURE);
5793
5794 uint32_t cNormalInputPairs = 0;
5795 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aSpecials); iTest += 1)
5796 {
5797 SSE_BINARY_I32_R64_TEST_T TestData; RT_ZERO(TestData);
5798
5799 TestData.r64ValIn = iTest < cTests ? RandR64Src(iTest) : s_aSpecials[iTest - cTests].Val;
5800
5801 if (RTFLOAT64U_IS_NORMAL(&TestData.r64ValIn))
5802 cNormalInputPairs++;
5803 else if (cNormalInputPairs < cMinNormalPairs && iTest + cMinNormalPairs >= cTests && iTest < cTests)
5804 {
5805 iTest -= 1;
5806 continue;
5807 }
5808
5809 uint32_t const fMxcsr = RandMxcsr() & X86_MXCSR_XCPT_FLAGS;
5810 for (uint16_t iRounding = 0; iRounding < 4; iRounding++)
5811 for (uint8_t iDaz = 0; iDaz < 2; iDaz++)
5812 for (uint8_t iFz = 0; iFz < 2; iFz++)
5813 {
5814 State.MXCSR = (fMxcsr & ~X86_MXCSR_RC_MASK)
5815 | (iRounding << X86_MXCSR_RC_SHIFT)
5816 | (iDaz ? X86_MXCSR_DAZ : 0)
5817 | (iFz ? X86_MXCSR_FZ : 0)
5818 | X86_MXCSR_XCPT_MASK;
5819 uint32_t fMxcsrM; int32_t i32OutM;
5820 pfn(&State, &fMxcsrM, &i32OutM, &TestData.r64ValIn.u);
5821 TestData.fMxcsrIn = State.MXCSR;
5822 TestData.fMxcsrOut = fMxcsrM;
5823 TestData.i32ValOut = i32OutM;
5824 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
5825
5826 State.MXCSR = State.MXCSR & ~X86_MXCSR_XCPT_MASK;
5827 uint32_t fMxcsrU; int32_t i32OutU;
5828 pfn(&State, &fMxcsrU, &i32OutU, &TestData.r64ValIn.u);
5829 TestData.fMxcsrIn = State.MXCSR;
5830 TestData.fMxcsrOut = fMxcsrU;
5831 TestData.i32ValOut = i32OutU;
5832 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
5833
5834 uint16_t fXcpt = (fMxcsrM | fMxcsrU) & X86_MXCSR_XCPT_FLAGS;
5835 if (fXcpt)
5836 {
5837 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | fXcpt;
5838 uint32_t fMxcsr1; int32_t i32Out1;
5839 pfn(&State, &fMxcsr1, &i32Out1, &TestData.r64ValIn.u);
5840 TestData.fMxcsrIn = State.MXCSR;
5841 TestData.fMxcsrOut = fMxcsr1;
5842 TestData.i32ValOut = i32Out1;
5843 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
5844
5845 if (((fMxcsr1 & X86_MXCSR_XCPT_FLAGS) & fXcpt) != (fMxcsr1 & X86_MXCSR_XCPT_FLAGS))
5846 {
5847 fXcpt |= fMxcsr1 & X86_MXCSR_XCPT_FLAGS;
5848 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | (fXcpt << X86_MXCSR_XCPT_MASK_SHIFT);
5849 uint32_t fMxcsr2; int32_t i32Out2;
5850 pfn(&State, &fMxcsr2, &i32Out2, &TestData.r64ValIn.u);
5851 TestData.fMxcsrIn = State.MXCSR;
5852 TestData.fMxcsrOut = fMxcsr2;
5853 TestData.i32ValOut = i32Out2;
5854 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
5855 }
5856 if (!RT_IS_POWER_OF_TWO(fXcpt))
5857 for (uint16_t fUnmasked = 1; fUnmasked <= X86_MXCSR_PE; fUnmasked <<= 1)
5858 if (fUnmasked & fXcpt)
5859 {
5860 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | ((fXcpt & ~fUnmasked) << X86_MXCSR_XCPT_MASK_SHIFT);
5861 uint32_t fMxcsr3; int32_t i32Out3;
5862 pfn(&State, &fMxcsr3, &i32Out3, &TestData.r64ValIn.u);
5863 TestData.fMxcsrIn = State.MXCSR;
5864 TestData.fMxcsrOut = fMxcsr3;
5865 TestData.i32ValOut = i32Out3;
5866 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
5867 }
5868 }
5869 }
5870 }
5871 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE);
5872 }
5873
5874 return RTEXITCODE_SUCCESS;
5875}
5876#endif
5877
5878
5879static void SseBinaryI32R64Test(void)
5880{
5881 X86FXSTATE State;
5882 RT_ZERO(State);
5883 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseBinaryI32R64); iFn++)
5884 {
5885 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aSseBinaryI32R64[iFn]))
5886 continue;
5887
5888 uint32_t const cTests = *g_aSseBinaryI32R64[iFn].pcTests;
5889 SSE_BINARY_I32_R64_TEST_T const * const paTests = g_aSseBinaryI32R64[iFn].paTests;
5890 PFNIEMAIMPLSSEF2I32U64 pfn = g_aSseBinaryI32R64[iFn].pfn;
5891 uint32_t const cVars = COUNT_VARIATIONS(g_aSseBinaryI32R64[iFn]);
5892 if (!cTests) RTTestSkipped(g_hTest, "no tests");
5893 for (uint32_t iVar = 0; iVar < cVars; iVar++)
5894 {
5895 for (uint32_t iTest = 0; iTest < cTests / sizeof(SSE_BINARY_I32_R64_TEST_T); iTest++)
5896 {
5897 uint32_t fMxcsr = 0;
5898 int32_t i32Dst = 0;
5899
5900 State.MXCSR = paTests[iTest].fMxcsrIn;
5901 pfn(&State, &fMxcsr, &i32Dst, &paTests[iTest].r64ValIn.u);
5902 if ( fMxcsr != paTests[iTest].fMxcsrOut
5903 || i32Dst != paTests[iTest].i32ValOut)
5904 RTTestFailed(g_hTest, "#%04u%s: mxcsr=%#08x in1=%s\n"
5905 "%s -> mxcsr=%#08x %RI32\n"
5906 "%s expected %#08x %RI32%s%s (%s)\n",
5907 iTest, iVar ? "/n" : "", paTests[iTest].fMxcsrIn,
5908 FormatR64(&paTests[iTest].r64ValIn),
5909 iVar ? " " : "", fMxcsr, i32Dst,
5910 iVar ? " " : "", paTests[iTest].fMxcsrOut, paTests[iTest].i32ValOut,
5911 MxcsrDiff(fMxcsr, paTests[iTest].fMxcsrOut),
5912 i32Dst != paTests[iTest].i32ValOut
5913 ? " - val" : "",
5914 FormatMxcsr(paTests[iTest].fMxcsrIn) );
5915 }
5916 }
5917 }
5918}
5919
5920
5921/*
5922 * SSE operations converting single double-precision floating point values to signed quad-word integers (cvttsd2si and friends).
5923 */
5924TYPEDEF_SUBTEST_TYPE(SSE_BINARY_I64_R64_T, SSE_BINARY_I64_R64_TEST_T, PFNIEMAIMPLSSEF2I64U64);
5925
5926static SSE_BINARY_I64_R64_T g_aSseBinaryI64R64[] =
5927{
5928 ENTRY_BIN(cvttsd2si_i64_r64),
5929 ENTRY_BIN(cvtsd2si_i64_r64),
5930};
5931
5932#ifdef TSTIEMAIMPL_WITH_GENERATOR
5933static RTEXITCODE SseBinaryI64R64Generate(const char *pszDataFileFmt, uint32_t cTests)
5934{
5935 cTests = RT_MAX(192, cTests); /* there are 144 standard input variations */
5936
5937 static struct { RTFLOAT64U Val; } const s_aSpecials[] =
5938 {
5939 { RTFLOAT64U_INIT_C(0, 8388607, RTFLOAT64U_EXP_MAX - 1) },
5940 /** @todo More specials. */
5941 };
5942
5943 X86FXSTATE State;
5944 RT_ZERO(State);
5945 uint32_t cMinNormalPairs = (cTests - 144) / 4;
5946 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseBinaryI64R64); iFn++)
5947 {
5948 PFNIEMAIMPLSSEF2I64U64 const pfn = g_aSseBinaryI64R64[iFn].pfnNative ? g_aSseBinaryI64R64[iFn].pfnNative : g_aSseBinaryI64R64[iFn].pfn;
5949
5950 IEMBINARYOUTPUT BinOut;
5951 AssertReturn(GenerateBinaryOpen(&BinOut, pszDataFileFmt, g_aSseBinaryI64R64[iFn].pszName), RTEXITCODE_FAILURE);
5952
5953 uint32_t cNormalInputPairs = 0;
5954 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aSpecials); iTest += 1)
5955 {
5956 SSE_BINARY_I64_R64_TEST_T TestData; RT_ZERO(TestData);
5957
5958 TestData.r64ValIn = iTest < cTests ? RandR64Src(iTest) : s_aSpecials[iTest - cTests].Val;
5959
5960 if (RTFLOAT64U_IS_NORMAL(&TestData.r64ValIn))
5961 cNormalInputPairs++;
5962 else if (cNormalInputPairs < cMinNormalPairs && iTest + cMinNormalPairs >= cTests && iTest < cTests)
5963 {
5964 iTest -= 1;
5965 continue;
5966 }
5967
5968 uint32_t const fMxcsr = RandMxcsr() & X86_MXCSR_XCPT_FLAGS;
5969 for (uint16_t iRounding = 0; iRounding < 4; iRounding++)
5970 for (uint8_t iDaz = 0; iDaz < 2; iDaz++)
5971 for (uint8_t iFz = 0; iFz < 2; iFz++)
5972 {
5973 State.MXCSR = (fMxcsr & ~X86_MXCSR_RC_MASK)
5974 | (iRounding << X86_MXCSR_RC_SHIFT)
5975 | (iDaz ? X86_MXCSR_DAZ : 0)
5976 | (iFz ? X86_MXCSR_FZ : 0)
5977 | X86_MXCSR_XCPT_MASK;
5978 uint32_t fMxcsrM; int64_t i64OutM;
5979 pfn(&State, &fMxcsrM, &i64OutM, &TestData.r64ValIn.u);
5980 TestData.fMxcsrIn = State.MXCSR;
5981 TestData.fMxcsrOut = fMxcsrM;
5982 TestData.i64ValOut = i64OutM;
5983 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
5984
5985 State.MXCSR = State.MXCSR & ~X86_MXCSR_XCPT_MASK;
5986 uint32_t fMxcsrU; int64_t i64OutU;
5987 pfn(&State, &fMxcsrU, &i64OutU, &TestData.r64ValIn.u);
5988 TestData.fMxcsrIn = State.MXCSR;
5989 TestData.fMxcsrOut = fMxcsrU;
5990 TestData.i64ValOut = i64OutU;
5991 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
5992
5993 uint16_t fXcpt = (fMxcsrM | fMxcsrU) & X86_MXCSR_XCPT_FLAGS;
5994 if (fXcpt)
5995 {
5996 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | fXcpt;
5997 uint32_t fMxcsr1; int64_t i64Out1;
5998 pfn(&State, &fMxcsr1, &i64Out1, &TestData.r64ValIn.u);
5999 TestData.fMxcsrIn = State.MXCSR;
6000 TestData.fMxcsrOut = fMxcsr1;
6001 TestData.i64ValOut = i64Out1;
6002 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
6003
6004 if (((fMxcsr1 & X86_MXCSR_XCPT_FLAGS) & fXcpt) != (fMxcsr1 & X86_MXCSR_XCPT_FLAGS))
6005 {
6006 fXcpt |= fMxcsr1 & X86_MXCSR_XCPT_FLAGS;
6007 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | (fXcpt << X86_MXCSR_XCPT_MASK_SHIFT);
6008 uint32_t fMxcsr2; int64_t i64Out2;
6009 pfn(&State, &fMxcsr2, &i64Out2, &TestData.r64ValIn.u);
6010 TestData.fMxcsrIn = State.MXCSR;
6011 TestData.fMxcsrOut = fMxcsr2;
6012 TestData.i64ValOut = i64Out2;
6013 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
6014 }
6015 if (!RT_IS_POWER_OF_TWO(fXcpt))
6016 for (uint16_t fUnmasked = 1; fUnmasked <= X86_MXCSR_PE; fUnmasked <<= 1)
6017 if (fUnmasked & fXcpt)
6018 {
6019 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | ((fXcpt & ~fUnmasked) << X86_MXCSR_XCPT_MASK_SHIFT);
6020 uint32_t fMxcsr3; int64_t i64Out3;
6021 pfn(&State, &fMxcsr3, &i64Out3, &TestData.r64ValIn.u);
6022 TestData.fMxcsrIn = State.MXCSR;
6023 TestData.fMxcsrOut = fMxcsr3;
6024 TestData.i64ValOut = i64Out3;
6025 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
6026 }
6027 }
6028 }
6029 }
6030 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE);
6031 }
6032
6033 return RTEXITCODE_SUCCESS;
6034}
6035#endif
6036
6037
6038static void SseBinaryI64R64Test(void)
6039{
6040 X86FXSTATE State;
6041 RT_ZERO(State);
6042 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseBinaryI64R64); iFn++)
6043 {
6044 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aSseBinaryI64R64[iFn]))
6045 continue;
6046
6047 uint32_t const cTests = *g_aSseBinaryI64R64[iFn].pcTests;
6048 SSE_BINARY_I64_R64_TEST_T const * const paTests = g_aSseBinaryI64R64[iFn].paTests;
6049 PFNIEMAIMPLSSEF2I64U64 pfn = g_aSseBinaryI64R64[iFn].pfn;
6050        uint32_t const cVars = COUNT_VARIATIONS(g_aSseBinaryI64R64[iFn]);
6051 if (!cTests) RTTestSkipped(g_hTest, "no tests");
6052 for (uint32_t iVar = 0; iVar < cVars; iVar++)
6053 {
6054 for (uint32_t iTest = 0; iTest < cTests / sizeof(SSE_BINARY_I64_R64_TEST_T); iTest++)
6055 {
6056 uint32_t fMxcsr = 0;
6057 int64_t i64Dst = 0;
6058
6059 State.MXCSR = paTests[iTest].fMxcsrIn;
6060 pfn(&State, &fMxcsr, &i64Dst, &paTests[iTest].r64ValIn.u);
6061 if ( fMxcsr != paTests[iTest].fMxcsrOut
6062 || i64Dst != paTests[iTest].i64ValOut)
6063 RTTestFailed(g_hTest, "#%04u%s: mxcsr=%#08x in1=%s\n"
6064 "%s -> mxcsr=%#08x %RI64\n"
6065 "%s expected %#08x %RI64%s%s (%s)\n",
6066 iTest, iVar ? "/n" : "", paTests[iTest].fMxcsrIn,
6067 FormatR64(&paTests[iTest].r64ValIn),
6068 iVar ? " " : "", fMxcsr, i64Dst,
6069 iVar ? " " : "", paTests[iTest].fMxcsrOut, paTests[iTest].i64ValOut,
6070 MxcsrDiff(fMxcsr, paTests[iTest].fMxcsrOut),
6071 i64Dst != paTests[iTest].i64ValOut
6072 ? " - val" : "",
6073 FormatMxcsr(paTests[iTest].fMxcsrIn) );
6074 }
6075 }
6076 }
6077}
6078
6079
6080/*
6081 * SSE operations converting single single-precision floating point values to signed double-word integers (cvttss2si and friends).
6082 */
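/* Note: when the source cannot be converted (NaN, infinity or an out-of-range result) and the
   invalid-operation exception is masked, these instructions return the integer indefinite
   value (INT32_MIN for a 32-bit destination). */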
6083TYPEDEF_SUBTEST_TYPE(SSE_BINARY_I32_R32_T, SSE_BINARY_I32_R32_TEST_T, PFNIEMAIMPLSSEF2I32U32);
6084
6085static SSE_BINARY_I32_R32_T g_aSseBinaryI32R32[] =
6086{
6087 ENTRY_BIN(cvttss2si_i32_r32),
6088 ENTRY_BIN(cvtss2si_i32_r32),
6089};
6090
6091#ifdef TSTIEMAIMPL_WITH_GENERATOR
6092static RTEXITCODE SseBinaryI32R32Generate(const char *pszDataFileFmt, uint32_t cTests)
6093{
6094 cTests = RT_MAX(192, cTests); /* there are 144 standard input variations */
6095
6096 static struct { RTFLOAT32U Val; } const s_aSpecials[] =
6097 {
6098 { RTFLOAT32U_INIT_C(0, 8388607, RTFLOAT32U_EXP_MAX - 1) },
6099 /** @todo More specials. */
6100 };
6101
6102 X86FXSTATE State;
6103 RT_ZERO(State);
6104 uint32_t cMinNormalPairs = (cTests - 144) / 4;
6105 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseBinaryI32R32); iFn++)
6106 {
6107 PFNIEMAIMPLSSEF2I32U32 const pfn = g_aSseBinaryI32R32[iFn].pfnNative ? g_aSseBinaryI32R32[iFn].pfnNative : g_aSseBinaryI32R32[iFn].pfn;
6108
6109 IEMBINARYOUTPUT BinOut;
6110 AssertReturn(GenerateBinaryOpen(&BinOut, pszDataFileFmt, g_aSseBinaryI32R32[iFn].pszName), RTEXITCODE_FAILURE);
6111
6112 uint32_t cNormalInputPairs = 0;
6113 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aSpecials); iTest += 1)
6114 {
6115 SSE_BINARY_I32_R32_TEST_T TestData; RT_ZERO(TestData);
6116
6117 TestData.r32ValIn = iTest < cTests ? RandR32Src(iTest) : s_aSpecials[iTest - cTests].Val;
6118
6119 if (RTFLOAT32U_IS_NORMAL(&TestData.r32ValIn))
6120 cNormalInputPairs++;
6121 else if (cNormalInputPairs < cMinNormalPairs && iTest + cMinNormalPairs >= cTests && iTest < cTests)
6122 {
6123 iTest -= 1;
6124 continue;
6125 }
6126
6127 uint32_t const fMxcsr = RandMxcsr() & X86_MXCSR_XCPT_FLAGS;
6128 for (uint16_t iRounding = 0; iRounding < 4; iRounding++)
6129 for (uint8_t iDaz = 0; iDaz < 2; iDaz++)
6130 for (uint8_t iFz = 0; iFz < 2; iFz++)
6131 {
6132 State.MXCSR = (fMxcsr & ~X86_MXCSR_RC_MASK)
6133 | (iRounding << X86_MXCSR_RC_SHIFT)
6134 | (iDaz ? X86_MXCSR_DAZ : 0)
6135 | (iFz ? X86_MXCSR_FZ : 0)
6136 | X86_MXCSR_XCPT_MASK;
6137 uint32_t fMxcsrM; int32_t i32OutM;
6138 pfn(&State, &fMxcsrM, &i32OutM, &TestData.r32ValIn.u);
6139 TestData.fMxcsrIn = State.MXCSR;
6140 TestData.fMxcsrOut = fMxcsrM;
6141 TestData.i32ValOut = i32OutM;
6142 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
6143
6144 State.MXCSR = State.MXCSR & ~X86_MXCSR_XCPT_MASK;
6145 uint32_t fMxcsrU; int32_t i32OutU;
6146 pfn(&State, &fMxcsrU, &i32OutU, &TestData.r32ValIn.u);
6147 TestData.fMxcsrIn = State.MXCSR;
6148 TestData.fMxcsrOut = fMxcsrU;
6149 TestData.i32ValOut = i32OutU;
6150 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
6151
6152 uint16_t fXcpt = (fMxcsrM | fMxcsrU) & X86_MXCSR_XCPT_FLAGS;
6153 if (fXcpt)
6154 {
6155 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | fXcpt;
6156 uint32_t fMxcsr1; int32_t i32Out1;
6157 pfn(&State, &fMxcsr1, &i32Out1, &TestData.r32ValIn.u);
6158 TestData.fMxcsrIn = State.MXCSR;
6159 TestData.fMxcsrOut = fMxcsr1;
6160 TestData.i32ValOut = i32Out1;
6161 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
6162
6163 if (((fMxcsr1 & X86_MXCSR_XCPT_FLAGS) & fXcpt) != (fMxcsr1 & X86_MXCSR_XCPT_FLAGS))
6164 {
6165 fXcpt |= fMxcsr1 & X86_MXCSR_XCPT_FLAGS;
6166 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | (fXcpt << X86_MXCSR_XCPT_MASK_SHIFT);
6167 uint32_t fMxcsr2; int32_t i32Out2;
6168 pfn(&State, &fMxcsr2, &i32Out2, &TestData.r32ValIn.u);
6169 TestData.fMxcsrIn = State.MXCSR;
6170 TestData.fMxcsrOut = fMxcsr2;
6171 TestData.i32ValOut = i32Out2;
6172 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
6173 }
6174 if (!RT_IS_POWER_OF_TWO(fXcpt))
6175 for (uint16_t fUnmasked = 1; fUnmasked <= X86_MXCSR_PE; fUnmasked <<= 1)
6176 if (fUnmasked & fXcpt)
6177 {
6178 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | ((fXcpt & ~fUnmasked) << X86_MXCSR_XCPT_MASK_SHIFT);
6179 uint32_t fMxcsr3; int32_t i32Out3;
6180 pfn(&State, &fMxcsr3, &i32Out3, &TestData.r32ValIn.u);
6181 TestData.fMxcsrIn = State.MXCSR;
6182 TestData.fMxcsrOut = fMxcsr3;
6183 TestData.i32ValOut = i32Out3;
6184 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
6185 }
6186 }
6187 }
6188 }
6189 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE);
6190 }
6191
6192 return RTEXITCODE_SUCCESS;
6193}
6194#endif
6195
6196
6197static void SseBinaryI32R32Test(void)
6198{
6199 X86FXSTATE State;
6200 RT_ZERO(State);
6201 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseBinaryI32R32); iFn++)
6202 {
6203 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aSseBinaryI32R32[iFn]))
6204 continue;
6205
6206 uint32_t const cTests = *g_aSseBinaryI32R32[iFn].pcTests;
6207 SSE_BINARY_I32_R32_TEST_T const * const paTests = g_aSseBinaryI32R32[iFn].paTests;
6208 PFNIEMAIMPLSSEF2I32U32 pfn = g_aSseBinaryI32R32[iFn].pfn;
6209 uint32_t const cVars = COUNT_VARIATIONS(g_aSseBinaryI32R32[iFn]);
6210 if (!cTests) RTTestSkipped(g_hTest, "no tests");
6211 for (uint32_t iVar = 0; iVar < cVars; iVar++)
6212 {
6213 for (uint32_t iTest = 0; iTest < cTests / sizeof(SSE_BINARY_I32_R32_TEST_T); iTest++)
6214 {
6215 uint32_t fMxcsr = 0;
6216 int32_t i32Dst = 0;
6217
6218 State.MXCSR = paTests[iTest].fMxcsrIn;
6219 pfn(&State, &fMxcsr, &i32Dst, &paTests[iTest].r32ValIn.u);
6220 if ( fMxcsr != paTests[iTest].fMxcsrOut
6221 || i32Dst != paTests[iTest].i32ValOut)
6222 RTTestFailed(g_hTest, "#%04u%s: mxcsr=%#08x in1=%s\n"
6223 "%s -> mxcsr=%#08x %RI32\n"
6224 "%s expected %#08x %RI32%s%s (%s)\n",
6225 iTest, iVar ? "/n" : "", paTests[iTest].fMxcsrIn,
6226 FormatR32(&paTests[iTest].r32ValIn),
6227 iVar ? " " : "", fMxcsr, i32Dst,
6228 iVar ? " " : "", paTests[iTest].fMxcsrOut, paTests[iTest].i32ValOut,
6229 MxcsrDiff(fMxcsr, paTests[iTest].fMxcsrOut),
6230 i32Dst != paTests[iTest].i32ValOut
6231 ? " - val" : "",
6232 FormatMxcsr(paTests[iTest].fMxcsrIn) );
6233 }
6234 }
6235 }
6236}
6237
6238
6239/*
6240 * SSE operations converting single single-precision floating point values to signed quad-word integers (cvttss2si and friends).
6241 */
6242TYPEDEF_SUBTEST_TYPE(SSE_BINARY_I64_R32_T, SSE_BINARY_I64_R32_TEST_T, PFNIEMAIMPLSSEF2I64U32);
6243
6244static SSE_BINARY_I64_R32_T g_aSseBinaryI64R32[] =
6245{
6246 ENTRY_BIN(cvttss2si_i64_r32),
6247 ENTRY_BIN(cvtss2si_i64_r32),
6248};
6249
6250#ifdef TSTIEMAIMPL_WITH_GENERATOR
6251static RTEXITCODE SseBinaryI64R32Generate(const char *pszDataFileFmt, uint32_t cTests)
6252{
6253 cTests = RT_MAX(192, cTests); /* there are 144 standard input variations */
6254
6255 static struct { RTFLOAT32U Val; } const s_aSpecials[] =
6256 {
6257 { RTFLOAT32U_INIT_C(0, 8388607, RTFLOAT32U_EXP_MAX - 1) },
6258 /** @todo More specials. */
6259 };
6260
6261 X86FXSTATE State;
6262 RT_ZERO(State);
6263 uint32_t cMinNormalPairs = (cTests - 144) / 4;
6264 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseBinaryI64R32); iFn++)
6265 {
6266 PFNIEMAIMPLSSEF2I64U32 const pfn = g_aSseBinaryI64R32[iFn].pfnNative ? g_aSseBinaryI64R32[iFn].pfnNative : g_aSseBinaryI64R32[iFn].pfn;
6267
6268 IEMBINARYOUTPUT BinOut;
6269 AssertReturn(GenerateBinaryOpen(&BinOut, pszDataFileFmt, g_aSseBinaryI64R32[iFn].pszName), RTEXITCODE_FAILURE);
6270
6271 uint32_t cNormalInputPairs = 0;
6272 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aSpecials); iTest += 1)
6273 {
6274 SSE_BINARY_I64_R32_TEST_T TestData; RT_ZERO(TestData);
6275
6276 TestData.r32ValIn = iTest < cTests ? RandR32Src(iTest) : s_aSpecials[iTest - cTests].Val;
6277
6278 if (RTFLOAT32U_IS_NORMAL(&TestData.r32ValIn))
6279 cNormalInputPairs++;
6280 else if (cNormalInputPairs < cMinNormalPairs && iTest + cMinNormalPairs >= cTests && iTest < cTests)
6281 {
6282 iTest -= 1;
6283 continue;
6284 }
6285
6286 uint32_t const fMxcsr = RandMxcsr() & X86_MXCSR_XCPT_FLAGS;
6287 for (uint16_t iRounding = 0; iRounding < 4; iRounding++)
6288 for (uint8_t iDaz = 0; iDaz < 2; iDaz++)
6289 for (uint8_t iFz = 0; iFz < 2; iFz++)
6290 {
6291 State.MXCSR = (fMxcsr & ~X86_MXCSR_RC_MASK)
6292 | (iRounding << X86_MXCSR_RC_SHIFT)
6293 | (iDaz ? X86_MXCSR_DAZ : 0)
6294 | (iFz ? X86_MXCSR_FZ : 0)
6295 | X86_MXCSR_XCPT_MASK;
6296 uint32_t fMxcsrM; int64_t i64OutM;
6297 pfn(&State, &fMxcsrM, &i64OutM, &TestData.r32ValIn.u);
6298 TestData.fMxcsrIn = State.MXCSR;
6299 TestData.fMxcsrOut = fMxcsrM;
6300 TestData.i64ValOut = i64OutM;
6301 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
6302
6303 State.MXCSR = State.MXCSR & ~X86_MXCSR_XCPT_MASK;
6304 uint32_t fMxcsrU; int64_t i64OutU;
6305 pfn(&State, &fMxcsrU, &i64OutU, &TestData.r32ValIn.u);
6306 TestData.fMxcsrIn = State.MXCSR;
6307 TestData.fMxcsrOut = fMxcsrU;
6308 TestData.i64ValOut = i64OutU;
6309 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
6310
6311 uint16_t fXcpt = (fMxcsrM | fMxcsrU) & X86_MXCSR_XCPT_FLAGS;
6312 if (fXcpt)
6313 {
6314 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | fXcpt;
6315 uint32_t fMxcsr1; int64_t i64Out1;
6316 pfn(&State, &fMxcsr1, &i64Out1, &TestData.r32ValIn.u);
6317 TestData.fMxcsrIn = State.MXCSR;
6318 TestData.fMxcsrOut = fMxcsr1;
6319 TestData.i64ValOut = i64Out1;
6320 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
6321
6322 if (((fMxcsr1 & X86_MXCSR_XCPT_FLAGS) & fXcpt) != (fMxcsr1 & X86_MXCSR_XCPT_FLAGS))
6323 {
6324 fXcpt |= fMxcsr1 & X86_MXCSR_XCPT_FLAGS;
6325 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | (fXcpt << X86_MXCSR_XCPT_MASK_SHIFT);
6326 uint32_t fMxcsr2; int64_t i64Out2;
6327 pfn(&State, &fMxcsr2, &i64Out2, &TestData.r32ValIn.u);
6328 TestData.fMxcsrIn = State.MXCSR;
6329 TestData.fMxcsrOut = fMxcsr2;
6330 TestData.i64ValOut = i64Out2;
6331 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
6332 }
6333 if (!RT_IS_POWER_OF_TWO(fXcpt))
6334 for (uint16_t fUnmasked = 1; fUnmasked <= X86_MXCSR_PE; fUnmasked <<= 1)
6335 if (fUnmasked & fXcpt)
6336 {
6337 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | ((fXcpt & ~fUnmasked) << X86_MXCSR_XCPT_MASK_SHIFT);
6338 uint32_t fMxcsr3; int64_t i64Out3;
6339 pfn(&State, &fMxcsr3, &i64Out3, &TestData.r32ValIn.u);
6340 TestData.fMxcsrIn = State.MXCSR;
6341 TestData.fMxcsrOut = fMxcsr3;
6342 TestData.i64ValOut = i64Out3;
6343 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
6344 }
6345 }
6346 }
6347 }
6348 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE);
6349 }
6350
6351 return RTEXITCODE_SUCCESS;
6352}
6353#endif
6354
6355
6356static void SseBinaryI64R32Test(void)
6357{
6358 X86FXSTATE State;
6359 RT_ZERO(State);
6360 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseBinaryI64R32); iFn++)
6361 {
6362 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aSseBinaryI64R32[iFn]))
6363 continue;
6364
6365 uint32_t const cTests = *g_aSseBinaryI64R32[iFn].pcTests;
6366 SSE_BINARY_I64_R32_TEST_T const * const paTests = g_aSseBinaryI64R32[iFn].paTests;
6367 PFNIEMAIMPLSSEF2I64U32 pfn = g_aSseBinaryI64R32[iFn].pfn;
6368 uint32_t const cVars = COUNT_VARIATIONS(g_aSseBinaryI64R32[iFn]);
6369 if (!cTests) RTTestSkipped(g_hTest, "no tests");
6370 for (uint32_t iVar = 0; iVar < cVars; iVar++)
6371 {
6372 for (uint32_t iTest = 0; iTest < cTests / sizeof(SSE_BINARY_I64_R32_TEST_T); iTest++)
6373 {
6374 uint32_t fMxcsr = 0;
6375 int64_t i64Dst = 0;
6376
6377 State.MXCSR = paTests[iTest].fMxcsrIn;
6378 pfn(&State, &fMxcsr, &i64Dst, &paTests[iTest].r32ValIn.u);
6379 if ( fMxcsr != paTests[iTest].fMxcsrOut
6380 || i64Dst != paTests[iTest].i64ValOut)
6381 RTTestFailed(g_hTest, "#%04u%s: mxcsr=%#08x in1=%s\n"
6382 "%s -> mxcsr=%#08x %RI64\n"
6383 "%s expected %#08x %RI64%s%s (%s)\n",
6384 iTest, iVar ? "/n" : "", paTests[iTest].fMxcsrIn,
6385 FormatR32(&paTests[iTest].r32ValIn),
6386 iVar ? " " : "", fMxcsr, i64Dst,
6387 iVar ? " " : "", paTests[iTest].fMxcsrOut, paTests[iTest].i64ValOut,
6388 MxcsrDiff(fMxcsr, paTests[iTest].fMxcsrOut),
6389 i64Dst != paTests[iTest].i64ValOut
6390 ? " - val" : "",
6391 FormatMxcsr(paTests[iTest].fMxcsrIn) );
6392 }
6393 }
6394 }
6395}
6396
6397
6398/*
6399 * SSE operations converting single signed double-word integers to double-precision floating point values (probably only cvtsi2sd).
6400 */
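/* Note: every 32-bit signed integer is exactly representable in double precision, so
   cvtsi2sd with a 32-bit source is always exact regardless of the rounding mode. */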
6401TYPEDEF_SUBTEST_TYPE(SSE_BINARY_R64_I32_T, SSE_BINARY_R64_I32_TEST_T, PFNIEMAIMPLSSEF2R64I32);
6402
6403static SSE_BINARY_R64_I32_T g_aSseBinaryR64I32[] =
6404{
6405 ENTRY_BIN(cvtsi2sd_r64_i32)
6406};
6407
6408#ifdef TSTIEMAIMPL_WITH_GENERATOR
6409static RTEXITCODE SseBinaryR64I32Generate(const char *pszDataFileFmt, uint32_t cTests)
6410{
6411 cTests = RT_MAX(192, cTests); /* there are 144 standard input variations */
6412
6413 static int32_t const s_aSpecials[] =
6414 {
6415 INT32_MIN,
6416 INT32_MAX,
6417 /** @todo More specials. */
6418 };
6419
6420 X86FXSTATE State;
6421 RT_ZERO(State);
6422 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseBinaryR64I32); iFn++)
6423 {
6424 PFNIEMAIMPLSSEF2R64I32 const pfn = g_aSseBinaryR64I32[iFn].pfnNative ? g_aSseBinaryR64I32[iFn].pfnNative : g_aSseBinaryR64I32[iFn].pfn;
6425
6426 IEMBINARYOUTPUT BinOut;
6427 AssertReturn(GenerateBinaryOpen(&BinOut, pszDataFileFmt, g_aSseBinaryR64I32[iFn].pszName), RTEXITCODE_FAILURE);
6428
6429 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aSpecials); iTest += 1)
6430 {
6431 SSE_BINARY_R64_I32_TEST_T TestData; RT_ZERO(TestData);
6432
6433 TestData.i32ValIn = iTest < cTests ? RandI32Src2(iTest) : s_aSpecials[iTest - cTests];
6434
6435 uint32_t const fMxcsr = RandMxcsr() & X86_MXCSR_XCPT_FLAGS;
6436 for (uint16_t iRounding = 0; iRounding < 4; iRounding++)
6437 for (uint8_t iDaz = 0; iDaz < 2; iDaz++)
6438 for (uint8_t iFz = 0; iFz < 2; iFz++)
6439 {
6440 State.MXCSR = (fMxcsr & ~X86_MXCSR_RC_MASK)
6441 | (iRounding << X86_MXCSR_RC_SHIFT)
6442 | (iDaz ? X86_MXCSR_DAZ : 0)
6443 | (iFz ? X86_MXCSR_FZ : 0)
6444 | X86_MXCSR_XCPT_MASK;
6445 uint32_t fMxcsrM; RTFLOAT64U r64OutM;
6446 pfn(&State, &fMxcsrM, &r64OutM, &TestData.i32ValIn);
6447 TestData.fMxcsrIn = State.MXCSR;
6448 TestData.fMxcsrOut = fMxcsrM;
6449 TestData.r64ValOut = r64OutM;
6450 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
6451
6452 State.MXCSR = State.MXCSR & ~X86_MXCSR_XCPT_MASK;
6453 uint32_t fMxcsrU; RTFLOAT64U r64OutU;
6454 pfn(&State, &fMxcsrU, &r64OutU, &TestData.i32ValIn);
6455 TestData.fMxcsrIn = State.MXCSR;
6456 TestData.fMxcsrOut = fMxcsrU;
6457 TestData.r64ValOut = r64OutU;
6458 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
6459
6460 uint16_t fXcpt = (fMxcsrM | fMxcsrU) & X86_MXCSR_XCPT_FLAGS;
6461 if (fXcpt)
6462 {
6463 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | fXcpt;
6464 uint32_t fMxcsr1; RTFLOAT64U r64Out1;
6465 pfn(&State, &fMxcsr1, &r64Out1, &TestData.i32ValIn);
6466 TestData.fMxcsrIn = State.MXCSR;
6467 TestData.fMxcsrOut = fMxcsr1;
6468 TestData.r64ValOut = r64Out1;
6469 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
6470
6471 if (((fMxcsr1 & X86_MXCSR_XCPT_FLAGS) & fXcpt) != (fMxcsr1 & X86_MXCSR_XCPT_FLAGS))
6472 {
6473 fXcpt |= fMxcsr1 & X86_MXCSR_XCPT_FLAGS;
6474 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | (fXcpt << X86_MXCSR_XCPT_MASK_SHIFT);
6475 uint32_t fMxcsr2; RTFLOAT64U r64Out2;
6476 pfn(&State, &fMxcsr2, &r64Out2, &TestData.i32ValIn);
6477 TestData.fMxcsrIn = State.MXCSR;
6478 TestData.fMxcsrOut = fMxcsr2;
6479 TestData.r64ValOut = r64Out2;
6480 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
6481 }
6482 if (!RT_IS_POWER_OF_TWO(fXcpt))
6483 for (uint16_t fUnmasked = 1; fUnmasked <= X86_MXCSR_PE; fUnmasked <<= 1)
6484 if (fUnmasked & fXcpt)
6485 {
6486 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | ((fXcpt & ~fUnmasked) << X86_MXCSR_XCPT_MASK_SHIFT);
6487 uint32_t fMxcsr3; RTFLOAT64U r64Out3;
6488 pfn(&State, &fMxcsr3, &r64Out3, &TestData.i32ValIn);
6489 TestData.fMxcsrIn = State.MXCSR;
6490 TestData.fMxcsrOut = fMxcsr3;
6491 TestData.r64ValOut = r64Out3;
6492 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
6493 }
6494 }
6495 }
6496 }
6497 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE);
6498 }
6499
6500 return RTEXITCODE_SUCCESS;
6501}
6502#endif
6503
6504
6505static void SseBinaryR64I32Test(void)
6506{
6507 X86FXSTATE State;
6508 RT_ZERO(State);
6509 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseBinaryR64I32); iFn++)
6510 {
6511 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aSseBinaryR64I32[iFn]))
6512 continue;
6513
6514 uint32_t const cTests = *g_aSseBinaryR64I32[iFn].pcTests;
6515 SSE_BINARY_R64_I32_TEST_T const * const paTests = g_aSseBinaryR64I32[iFn].paTests;
6516 PFNIEMAIMPLSSEF2R64I32 pfn = g_aSseBinaryR64I32[iFn].pfn;
6517 uint32_t const cVars = COUNT_VARIATIONS(g_aSseBinaryR64I32[iFn]);
6518 if (!cTests) RTTestSkipped(g_hTest, "no tests");
6519 for (uint32_t iVar = 0; iVar < cVars; iVar++)
6520 {
6521 for (uint32_t iTest = 0; iTest < cTests / sizeof(SSE_BINARY_R64_I32_TEST_T); iTest++)
6522 {
6523 uint32_t fMxcsr = 0;
6524 RTFLOAT64U r64Dst; RT_ZERO(r64Dst);
6525
6526 State.MXCSR = paTests[iTest].fMxcsrIn;
6527 pfn(&State, &fMxcsr, &r64Dst, &paTests[iTest].i32ValIn);
6528 if ( fMxcsr != paTests[iTest].fMxcsrOut
6529 || !RTFLOAT64U_ARE_IDENTICAL(&r64Dst, &paTests[iTest].r64ValOut))
6530 RTTestFailed(g_hTest, "#%04u%s: mxcsr=%#08x in1=%RI32\n"
6531 "%s -> mxcsr=%#08x %s\n"
6532 "%s expected %#08x %s%s%s (%s)\n",
6533 iTest, iVar ? "/n" : "", paTests[iTest].fMxcsrIn,
6534                             paTests[iTest].i32ValIn,
6535 iVar ? " " : "", fMxcsr, FormatR64(&r64Dst),
6536 iVar ? " " : "", paTests[iTest].fMxcsrOut, FormatR64(&paTests[iTest].r64ValOut),
6537 MxcsrDiff(fMxcsr, paTests[iTest].fMxcsrOut),
6538 !RTFLOAT64U_ARE_IDENTICAL(&r64Dst, &paTests[iTest].r64ValOut)
6539 ? " - val" : "",
6540 FormatMxcsr(paTests[iTest].fMxcsrIn) );
6541 }
6542 }
6543 }
6544}
6545
6546
6547/*
6548 * SSE operations converting single signed quad-word integers to double-precision floating point values (probably only cvtsi2sd).
6549 */
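/* Note: unlike the 32-bit variant, 64-bit integers with more than 53 significant bits cannot
   be represented exactly in double precision, so these conversions can be inexact and thus
   depend on the rounding mode. */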
6550TYPEDEF_SUBTEST_TYPE(SSE_BINARY_R64_I64_T, SSE_BINARY_R64_I64_TEST_T, PFNIEMAIMPLSSEF2R64I64);
6551
6552static SSE_BINARY_R64_I64_T g_aSseBinaryR64I64[] =
6553{
6554 ENTRY_BIN(cvtsi2sd_r64_i64),
6555};
6556
6557#ifdef TSTIEMAIMPL_WITH_GENERATOR
6558static RTEXITCODE SseBinaryR64I64Generate(const char *pszDataFileFmt, uint32_t cTests)
6559{
6560 cTests = RT_MAX(192, cTests); /* there are 144 standard input variations */
6561
6562 static int64_t const s_aSpecials[] =
6563 {
6564 INT64_MIN,
6565 INT64_MAX
6566 /** @todo More specials. */
6567 };
6568
6569 X86FXSTATE State;
6570 RT_ZERO(State);
6571 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseBinaryR64I64); iFn++)
6572 {
6573 PFNIEMAIMPLSSEF2R64I64 const pfn = g_aSseBinaryR64I64[iFn].pfnNative ? g_aSseBinaryR64I64[iFn].pfnNative : g_aSseBinaryR64I64[iFn].pfn;
6574
6575 IEMBINARYOUTPUT BinOut;
6576 AssertReturn(GenerateBinaryOpen(&BinOut, pszDataFileFmt, g_aSseBinaryR64I64[iFn].pszName), RTEXITCODE_FAILURE);
6577
6578 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aSpecials); iTest += 1)
6579 {
6580 SSE_BINARY_R64_I64_TEST_T TestData; RT_ZERO(TestData);
6581
6582 TestData.i64ValIn = iTest < cTests ? RandI64Src(iTest) : s_aSpecials[iTest - cTests];
6583
6584 uint32_t const fMxcsr = RandMxcsr() & X86_MXCSR_XCPT_FLAGS;
6585 for (uint16_t iRounding = 0; iRounding < 4; iRounding++)
6586 for (uint8_t iDaz = 0; iDaz < 2; iDaz++)
6587 for (uint8_t iFz = 0; iFz < 2; iFz++)
6588 {
6589 State.MXCSR = (fMxcsr & ~X86_MXCSR_RC_MASK)
6590 | (iRounding << X86_MXCSR_RC_SHIFT)
6591 | (iDaz ? X86_MXCSR_DAZ : 0)
6592 | (iFz ? X86_MXCSR_FZ : 0)
6593 | X86_MXCSR_XCPT_MASK;
6594 uint32_t fMxcsrM; RTFLOAT64U r64OutM;
6595 pfn(&State, &fMxcsrM, &r64OutM, &TestData.i64ValIn);
6596 TestData.fMxcsrIn = State.MXCSR;
6597 TestData.fMxcsrOut = fMxcsrM;
6598 TestData.r64ValOut = r64OutM;
6599 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
6600
6601 State.MXCSR = State.MXCSR & ~X86_MXCSR_XCPT_MASK;
6602 uint32_t fMxcsrU; RTFLOAT64U r64OutU;
6603 pfn(&State, &fMxcsrU, &r64OutU, &TestData.i64ValIn);
6604 TestData.fMxcsrIn = State.MXCSR;
6605 TestData.fMxcsrOut = fMxcsrU;
6606 TestData.r64ValOut = r64OutU;
6607 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
6608
6609 uint16_t fXcpt = (fMxcsrM | fMxcsrU) & X86_MXCSR_XCPT_FLAGS;
6610 if (fXcpt)
6611 {
6612 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | fXcpt;
6613 uint32_t fMxcsr1; RTFLOAT64U r64Out1;
6614 pfn(&State, &fMxcsr1, &r64Out1, &TestData.i64ValIn);
6615 TestData.fMxcsrIn = State.MXCSR;
6616 TestData.fMxcsrOut = fMxcsr1;
6617 TestData.r64ValOut = r64Out1;
6618 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
6619
6620 if (((fMxcsr1 & X86_MXCSR_XCPT_FLAGS) & fXcpt) != (fMxcsr1 & X86_MXCSR_XCPT_FLAGS))
6621 {
6622 fXcpt |= fMxcsr1 & X86_MXCSR_XCPT_FLAGS;
6623 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | (fXcpt << X86_MXCSR_XCPT_MASK_SHIFT);
6624 uint32_t fMxcsr2; RTFLOAT64U r64Out2;
6625 pfn(&State, &fMxcsr2, &r64Out2, &TestData.i64ValIn);
6626 TestData.fMxcsrIn = State.MXCSR;
6627 TestData.fMxcsrOut = fMxcsr2;
6628 TestData.r64ValOut = r64Out2;
6629 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
6630 }
6631 if (!RT_IS_POWER_OF_TWO(fXcpt))
6632 for (uint16_t fUnmasked = 1; fUnmasked <= X86_MXCSR_PE; fUnmasked <<= 1)
6633 if (fUnmasked & fXcpt)
6634 {
6635 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | ((fXcpt & ~fUnmasked) << X86_MXCSR_XCPT_MASK_SHIFT);
6636 uint32_t fMxcsr3; RTFLOAT64U r64Out3;
6637 pfn(&State, &fMxcsr3, &r64Out3, &TestData.i64ValIn);
6638 TestData.fMxcsrIn = State.MXCSR;
6639 TestData.fMxcsrOut = fMxcsr3;
6640 TestData.r64ValOut = r64Out3;
6641 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
6642 }
6643 }
6644 }
6645 }
6646 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE);
6647 }
6648
6649 return RTEXITCODE_SUCCESS;
6650}
6651#endif
6652
6653
6654static void SseBinaryR64I64Test(void)
6655{
6656 X86FXSTATE State;
6657 RT_ZERO(State);
6658 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseBinaryR64I64); iFn++)
6659 {
6660 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aSseBinaryR64I64[iFn]))
6661 continue;
6662
6663 uint32_t const cTests = *g_aSseBinaryR64I64[iFn].pcTests;
6664 SSE_BINARY_R64_I64_TEST_T const * const paTests = g_aSseBinaryR64I64[iFn].paTests;
6665 PFNIEMAIMPLSSEF2R64I64 pfn = g_aSseBinaryR64I64[iFn].pfn;
6666 uint32_t const cVars = COUNT_VARIATIONS(g_aSseBinaryR64I64[iFn]);
6667 if (!cTests) RTTestSkipped(g_hTest, "no tests");
6668 for (uint32_t iVar = 0; iVar < cVars; iVar++)
6669 {
6670 for (uint32_t iTest = 0; iTest < cTests / sizeof(SSE_BINARY_R64_I64_TEST_T); iTest++)
6671 {
6672 uint32_t fMxcsr = 0;
6673 RTFLOAT64U r64Dst; RT_ZERO(r64Dst);
6674
6675 State.MXCSR = paTests[iTest].fMxcsrIn;
6676 pfn(&State, &fMxcsr, &r64Dst, &paTests[iTest].i64ValIn);
6677 if ( fMxcsr != paTests[iTest].fMxcsrOut
6678 || !RTFLOAT64U_ARE_IDENTICAL(&r64Dst, &paTests[iTest].r64ValOut))
6679 RTTestFailed(g_hTest, "#%04u%s: mxcsr=%#08x in1=%RI64\n"
6680 "%s -> mxcsr=%#08x %s\n"
6681 "%s expected %#08x %s%s%s (%s)\n",
6682 iTest, iVar ? "/n" : "", paTests[iTest].fMxcsrIn,
6683                             paTests[iTest].i64ValIn,
6684 iVar ? " " : "", fMxcsr, FormatR64(&r64Dst),
6685 iVar ? " " : "", paTests[iTest].fMxcsrOut, FormatR64(&paTests[iTest].r64ValOut),
6686 MxcsrDiff(fMxcsr, paTests[iTest].fMxcsrOut),
6687 !RTFLOAT64U_ARE_IDENTICAL(&r64Dst, &paTests[iTest].r64ValOut)
6688 ? " - val" : "",
6689 FormatMxcsr(paTests[iTest].fMxcsrIn) );
6690 }
6691 }
6692 }
6693}
6694
6695
6696/*
6697 * SSE operations converting single signed double-word integers to single-precision floating point values (probably only cvtsi2ss).
6698 */
6699TYPEDEF_SUBTEST_TYPE(SSE_BINARY_R32_I32_T, SSE_BINARY_R32_I32_TEST_T, PFNIEMAIMPLSSEF2R32I32);
6700
6701static SSE_BINARY_R32_I32_T g_aSseBinaryR32I32[] =
6702{
6703 ENTRY_BIN(cvtsi2ss_r32_i32),
6704};
6705
6706#ifdef TSTIEMAIMPL_WITH_GENERATOR
6707static RTEXITCODE SseBinaryR32I32Generate(const char *pszDataFileFmt, uint32_t cTests)
6708{
6709 cTests = RT_MAX(192, cTests); /* there are 144 standard input variations */
6710
6711 static int32_t const s_aSpecials[] =
6712 {
6713 INT32_MIN,
6714 INT32_MAX,
6715 /** @todo More specials. */
6716 };
6717
6718 X86FXSTATE State;
6719 RT_ZERO(State);
6720 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseBinaryR32I32); iFn++)
6721 {
6722 PFNIEMAIMPLSSEF2R32I32 const pfn = g_aSseBinaryR32I32[iFn].pfnNative ? g_aSseBinaryR32I32[iFn].pfnNative : g_aSseBinaryR32I32[iFn].pfn;
6723
6724 IEMBINARYOUTPUT BinOut;
6725 AssertReturn(GenerateBinaryOpen(&BinOut, pszDataFileFmt, g_aSseBinaryR32I32[iFn].pszName), RTEXITCODE_FAILURE);
6726
6727 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aSpecials); iTest += 1)
6728 {
6729 SSE_BINARY_R32_I32_TEST_T TestData; RT_ZERO(TestData);
6730
6731 TestData.i32ValIn = iTest < cTests ? RandI32Src2(iTest) : s_aSpecials[iTest - cTests];
6732
6733 uint32_t const fMxcsr = RandMxcsr() & X86_MXCSR_XCPT_FLAGS;
6734 for (uint16_t iRounding = 0; iRounding < 4; iRounding++)
6735 for (uint8_t iDaz = 0; iDaz < 2; iDaz++)
6736 for (uint8_t iFz = 0; iFz < 2; iFz++)
6737 {
6738 State.MXCSR = (fMxcsr & ~X86_MXCSR_RC_MASK)
6739 | (iRounding << X86_MXCSR_RC_SHIFT)
6740 | (iDaz ? X86_MXCSR_DAZ : 0)
6741 | (iFz ? X86_MXCSR_FZ : 0)
6742 | X86_MXCSR_XCPT_MASK;
6743 uint32_t fMxcsrM; RTFLOAT32U r32OutM;
6744 pfn(&State, &fMxcsrM, &r32OutM, &TestData.i32ValIn);
6745 TestData.fMxcsrIn = State.MXCSR;
6746 TestData.fMxcsrOut = fMxcsrM;
6747 TestData.r32ValOut = r32OutM;
6748 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
6749
6750 State.MXCSR = State.MXCSR & ~X86_MXCSR_XCPT_MASK;
6751 uint32_t fMxcsrU; RTFLOAT32U r32OutU;
6752 pfn(&State, &fMxcsrU, &r32OutU, &TestData.i32ValIn);
6753 TestData.fMxcsrIn = State.MXCSR;
6754 TestData.fMxcsrOut = fMxcsrU;
6755 TestData.r32ValOut = r32OutU;
6756 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
6757
6758 uint16_t fXcpt = (fMxcsrM | fMxcsrU) & X86_MXCSR_XCPT_FLAGS;
6759 if (fXcpt)
6760 {
6761 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | fXcpt;
6762 uint32_t fMxcsr1; RTFLOAT32U r32Out1;
6763 pfn(&State, &fMxcsr1, &r32Out1, &TestData.i32ValIn);
6764 TestData.fMxcsrIn = State.MXCSR;
6765 TestData.fMxcsrOut = fMxcsr1;
6766 TestData.r32ValOut = r32Out1;
6767 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
6768
6769 if (((fMxcsr1 & X86_MXCSR_XCPT_FLAGS) & fXcpt) != (fMxcsr1 & X86_MXCSR_XCPT_FLAGS))
6770 {
6771 fXcpt |= fMxcsr1 & X86_MXCSR_XCPT_FLAGS;
6772 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | (fXcpt << X86_MXCSR_XCPT_MASK_SHIFT);
6773 uint32_t fMxcsr2; RTFLOAT32U r32Out2;
6774 pfn(&State, &fMxcsr2, &r32Out2, &TestData.i32ValIn);
6775 TestData.fMxcsrIn = State.MXCSR;
6776 TestData.fMxcsrOut = fMxcsr2;
6777 TestData.r32ValOut = r32Out2;
6778 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
6779 }
6780 if (!RT_IS_POWER_OF_TWO(fXcpt))
6781 for (uint16_t fUnmasked = 1; fUnmasked <= X86_MXCSR_PE; fUnmasked <<= 1)
6782 if (fUnmasked & fXcpt)
6783 {
6784 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | ((fXcpt & ~fUnmasked) << X86_MXCSR_XCPT_MASK_SHIFT);
6785 uint32_t fMxcsr3; RTFLOAT32U r32Out3;
6786 pfn(&State, &fMxcsr3, &r32Out3, &TestData.i32ValIn);
6787 TestData.fMxcsrIn = State.MXCSR;
6788 TestData.fMxcsrOut = fMxcsr3;
6789 TestData.r32ValOut = r32Out3;
6790 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
6791 }
6792 }
6793 }
6794 }
6795 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE);
6796 }
6797
6798 return RTEXITCODE_SUCCESS;
6799}
6800#endif
6801
6802
6803static void SseBinaryR32I32Test(void)
6804{
6805 X86FXSTATE State;
6806 RT_ZERO(State);
6807 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseBinaryR32I32); iFn++)
6808 {
6809 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aSseBinaryR32I32[iFn]))
6810 continue;
6811
6812 uint32_t const cTests = *g_aSseBinaryR32I32[iFn].pcTests;
6813 SSE_BINARY_R32_I32_TEST_T const * const paTests = g_aSseBinaryR32I32[iFn].paTests;
6814 PFNIEMAIMPLSSEF2R32I32 pfn = g_aSseBinaryR32I32[iFn].pfn;
6815 uint32_t const cVars = COUNT_VARIATIONS(g_aSseBinaryR32I32[iFn]);
6816 if (!cTests) RTTestSkipped(g_hTest, "no tests");
6817 for (uint32_t iVar = 0; iVar < cVars; iVar++)
6818 {
6819 for (uint32_t iTest = 0; iTest < cTests / sizeof(SSE_BINARY_R32_I32_TEST_T); iTest++)
6820 {
6821 uint32_t fMxcsr = 0;
6822 RTFLOAT32U r32Dst; RT_ZERO(r32Dst);
6823
6824 State.MXCSR = paTests[iTest].fMxcsrIn;
6825 pfn(&State, &fMxcsr, &r32Dst, &paTests[iTest].i32ValIn);
6826 if ( fMxcsr != paTests[iTest].fMxcsrOut
6827 || !RTFLOAT32U_ARE_IDENTICAL(&r32Dst, &paTests[iTest].r32ValOut))
6828 RTTestFailed(g_hTest, "#%04u%s: mxcsr=%#08x in1=%RI32\n"
6829                             "%s -> mxcsr=%#08x %s\n"
6830                             "%s expected %#08x %s%s%s (%s)\n",
6831 iTest, iVar ? "/n" : "", paTests[iTest].fMxcsrIn,
6832                             paTests[iTest].i32ValIn,
6833 iVar ? " " : "", fMxcsr, FormatR32(&r32Dst),
6834 iVar ? " " : "", paTests[iTest].fMxcsrOut, FormatR32(&paTests[iTest].r32ValOut),
6835 MxcsrDiff(fMxcsr, paTests[iTest].fMxcsrOut),
6836 !RTFLOAT32U_ARE_IDENTICAL(&r32Dst, &paTests[iTest].r32ValOut)
6837 ? " - val" : "",
6838 FormatMxcsr(paTests[iTest].fMxcsrIn) );
6839 }
6840 }
6841 }
6842}
6843
6844
6845/*
6846 * SSE operations converting single signed quad-word integers to single-precision floating point values (probably only cvtsi2ss).
6847 */
6848TYPEDEF_SUBTEST_TYPE(SSE_BINARY_R32_I64_T, SSE_BINARY_R32_I64_TEST_T, PFNIEMAIMPLSSEF2R32I64);
6849
6850static SSE_BINARY_R32_I64_T g_aSseBinaryR32I64[] =
6851{
6852 ENTRY_BIN(cvtsi2ss_r32_i64),
6853};
6854
6855#ifdef TSTIEMAIMPL_WITH_GENERATOR
6856static RTEXITCODE SseBinaryR32I64Generate(const char *pszDataFileFmt, uint32_t cTests)
6857{
6858 cTests = RT_MAX(192, cTests); /* there are 144 standard input variations */
6859
6860 static int64_t const s_aSpecials[] =
6861 {
6862 INT64_MIN,
6863 INT64_MAX
6864 /** @todo More specials. */
6865 };
6866
6867 X86FXSTATE State;
6868 RT_ZERO(State);
6869 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseBinaryR32I64); iFn++)
6870 {
6871 PFNIEMAIMPLSSEF2R32I64 const pfn = g_aSseBinaryR32I64[iFn].pfnNative ? g_aSseBinaryR32I64[iFn].pfnNative : g_aSseBinaryR32I64[iFn].pfn;
6872
6873 IEMBINARYOUTPUT BinOut;
6874 AssertReturn(GenerateBinaryOpen(&BinOut, pszDataFileFmt, g_aSseBinaryR32I64[iFn].pszName), RTEXITCODE_FAILURE);
6875
6876 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aSpecials); iTest += 1)
6877 {
6878 SSE_BINARY_R32_I64_TEST_T TestData; RT_ZERO(TestData);
6879
6880 TestData.i64ValIn = iTest < cTests ? RandI64Src(iTest) : s_aSpecials[iTest - cTests];
6881
6882 uint32_t const fMxcsr = RandMxcsr() & X86_MXCSR_XCPT_FLAGS;
6883 for (uint16_t iRounding = 0; iRounding < 4; iRounding++)
6884 for (uint8_t iDaz = 0; iDaz < 2; iDaz++)
6885 for (uint8_t iFz = 0; iFz < 2; iFz++)
6886 {
6887 State.MXCSR = (fMxcsr & ~X86_MXCSR_RC_MASK)
6888 | (iRounding << X86_MXCSR_RC_SHIFT)
6889 | (iDaz ? X86_MXCSR_DAZ : 0)
6890 | (iFz ? X86_MXCSR_FZ : 0)
6891 | X86_MXCSR_XCPT_MASK;
6892 uint32_t fMxcsrM; RTFLOAT32U r32OutM;
6893 pfn(&State, &fMxcsrM, &r32OutM, &TestData.i64ValIn);
6894 TestData.fMxcsrIn = State.MXCSR;
6895 TestData.fMxcsrOut = fMxcsrM;
6896 TestData.r32ValOut = r32OutM;
6897 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
6898
6899 State.MXCSR = State.MXCSR & ~X86_MXCSR_XCPT_MASK;
6900 uint32_t fMxcsrU; RTFLOAT32U r32OutU;
6901 pfn(&State, &fMxcsrU, &r32OutU, &TestData.i64ValIn);
6902 TestData.fMxcsrIn = State.MXCSR;
6903 TestData.fMxcsrOut = fMxcsrU;
6904 TestData.r32ValOut = r32OutU;
6905 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
6906
6907 uint16_t fXcpt = (fMxcsrM | fMxcsrU) & X86_MXCSR_XCPT_FLAGS;
6908 if (fXcpt)
6909 {
6910 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | fXcpt;
6911 uint32_t fMxcsr1; RTFLOAT32U r32Out1;
6912 pfn(&State, &fMxcsr1, &r32Out1, &TestData.i64ValIn);
6913 TestData.fMxcsrIn = State.MXCSR;
6914 TestData.fMxcsrOut = fMxcsr1;
6915 TestData.r32ValOut = r32Out1;
6916 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
6917
6918 if (((fMxcsr1 & X86_MXCSR_XCPT_FLAGS) & fXcpt) != (fMxcsr1 & X86_MXCSR_XCPT_FLAGS))
6919 {
6920 fXcpt |= fMxcsr1 & X86_MXCSR_XCPT_FLAGS;
6921 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | (fXcpt << X86_MXCSR_XCPT_MASK_SHIFT);
6922 uint32_t fMxcsr2; RTFLOAT32U r32Out2;
6923 pfn(&State, &fMxcsr2, &r32Out2, &TestData.i64ValIn);
6924 TestData.fMxcsrIn = State.MXCSR;
6925 TestData.fMxcsrOut = fMxcsr2;
6926 TestData.r32ValOut = r32Out2;
6927 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
6928 }
6929 if (!RT_IS_POWER_OF_TWO(fXcpt))
6930 for (uint16_t fUnmasked = 1; fUnmasked <= X86_MXCSR_PE; fUnmasked <<= 1)
6931 if (fUnmasked & fXcpt)
6932 {
6933 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | ((fXcpt & ~fUnmasked) << X86_MXCSR_XCPT_MASK_SHIFT);
6934 uint32_t fMxcsr3; RTFLOAT32U r32Out3;
6935 pfn(&State, &fMxcsr3, &r32Out3, &TestData.i64ValIn);
6936 TestData.fMxcsrIn = State.MXCSR;
6937 TestData.fMxcsrOut = fMxcsr3;
6938 TestData.r32ValOut = r32Out3;
6939 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
6940 }
6941 }
6942 }
6943 }
6944 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE);
6945 }
6946
6947 return RTEXITCODE_SUCCESS;
6948}
6949#endif
6950
6951
6952static void SseBinaryR32I64Test(void)
6953{
6954 X86FXSTATE State;
6955 RT_ZERO(State);
6956 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseBinaryR32I64); iFn++)
6957 {
6958 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aSseBinaryR32I64[iFn]))
6959 continue;
6960
6961 uint32_t const cTests = *g_aSseBinaryR32I64[iFn].pcTests;
6962 SSE_BINARY_R32_I64_TEST_T const * const paTests = g_aSseBinaryR32I64[iFn].paTests;
6963 PFNIEMAIMPLSSEF2R32I64 pfn = g_aSseBinaryR32I64[iFn].pfn;
6964 uint32_t const cVars = COUNT_VARIATIONS(g_aSseBinaryR32I64[iFn]);
6965 if (!cTests) RTTestSkipped(g_hTest, "no tests");
6966 for (uint32_t iVar = 0; iVar < cVars; iVar++)
6967 {
6968 for (uint32_t iTest = 0; iTest < cTests / sizeof(SSE_BINARY_R32_I64_TEST_T); iTest++)
6969 {
6970 uint32_t fMxcsr = 0;
6971 RTFLOAT32U r32Dst; RT_ZERO(r32Dst);
6972
6973 State.MXCSR = paTests[iTest].fMxcsrIn;
6974 pfn(&State, &fMxcsr, &r32Dst, &paTests[iTest].i64ValIn);
6975 if ( fMxcsr != paTests[iTest].fMxcsrOut
6976 || !RTFLOAT32U_ARE_IDENTICAL(&r32Dst, &paTests[iTest].r32ValOut))
6977 RTTestFailed(g_hTest, "#%04u%s: mxcsr=%#08x in1=%RI64\n"
6978                             "%s -> mxcsr=%#08x %s\n"
6979                             "%s expected %#08x %s%s%s (%s)\n",
6980 iTest, iVar ? "/n" : "", paTests[iTest].fMxcsrIn,
6981                             paTests[iTest].i64ValIn,
6982 iVar ? " " : "", fMxcsr, FormatR32(&r32Dst),
6983 iVar ? " " : "", paTests[iTest].fMxcsrOut, FormatR32(&paTests[iTest].r32ValOut),
6984 MxcsrDiff(fMxcsr, paTests[iTest].fMxcsrOut),
6985 !RTFLOAT32U_ARE_IDENTICAL(&r32Dst, &paTests[iTest].r32ValOut)
6986 ? " - val" : "",
6987 FormatMxcsr(paTests[iTest].fMxcsrIn) );
6988 }
6989 }
6990 }
6991}
6992
6993
6994/*
6995 * Compare SSE operations on single single-precision floating point values - outputting only EFLAGS.
6996 */
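/* Note: these instructions report the comparison result solely in EFLAGS: ZF, PF and CF are
   set from the compare (all three set for an unordered result) while OF, SF and AF are
   cleared.  comiss raises the invalid-operation exception for QNaN as well as SNaN operands,
   whereas ucomiss only does so for SNaNs. */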
6997TYPEDEF_SUBTEST_TYPE(SSE_COMPARE_EFL_R32_R32_T, SSE_COMPARE_EFL_R32_R32_TEST_T, PFNIEMAIMPLF2EFLMXCSR128);
6998
6999static SSE_COMPARE_EFL_R32_R32_T g_aSseCompareEflR32R32[] =
7000{
7001 ENTRY_BIN(ucomiss_u128),
7002 ENTRY_BIN(comiss_u128),
7003 ENTRY_BIN_AVX(vucomiss_u128),
7004 ENTRY_BIN_AVX(vcomiss_u128),
7005};
7006
7007#ifdef TSTIEMAIMPL_WITH_GENERATOR
7008static RTEXITCODE SseCompareEflR32R32Generate(const char *pszDataFileFmt, uint32_t cTests)
7009{
7010 cTests = RT_MAX(192, cTests); /* there are 144 standard input variations */
7011
7012 static struct { RTFLOAT32U Val1, Val2; } const s_aSpecials[] =
7013 {
7014 { RTFLOAT32U_INIT_ZERO(0), RTFLOAT32U_INIT_ZERO(0) },
7015 { RTFLOAT32U_INIT_ZERO(0), RTFLOAT32U_INIT_ZERO(1) },
7016 { RTFLOAT32U_INIT_ZERO(1), RTFLOAT32U_INIT_ZERO(0) },
7017 { RTFLOAT32U_INIT_ZERO(1), RTFLOAT32U_INIT_ZERO(1) },
7018 { RTFLOAT32U_INIT_INF(0), RTFLOAT32U_INIT_INF(0) },
7019 { RTFLOAT32U_INIT_INF(0), RTFLOAT32U_INIT_INF(1) },
7020 { RTFLOAT32U_INIT_INF(1), RTFLOAT32U_INIT_INF(0) },
7021 { RTFLOAT32U_INIT_INF(1), RTFLOAT32U_INIT_INF(1) },
7022 /** @todo More specials. */
7023 };
7024
7025 uint32_t cMinNormalPairs = (cTests - 144) / 4;
7026 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseCompareEflR32R32); iFn++)
7027 {
7028 PFNIEMAIMPLF2EFLMXCSR128 const pfn = g_aSseCompareEflR32R32[iFn].pfnNative ? g_aSseCompareEflR32R32[iFn].pfnNative : g_aSseCompareEflR32R32[iFn].pfn;
7029
7030 IEMBINARYOUTPUT BinOut;
7031 AssertReturn(GenerateBinaryOpen(&BinOut, pszDataFileFmt, g_aSseCompareEflR32R32[iFn].pszName), RTEXITCODE_FAILURE);
7032
7033 uint32_t cNormalInputPairs = 0;
7034 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aSpecials); iTest += 1)
7035 {
7036 SSE_COMPARE_EFL_R32_R32_TEST_T TestData; RT_ZERO(TestData);
7037 X86XMMREG ValIn1; RT_ZERO(ValIn1);
7038 X86XMMREG ValIn2; RT_ZERO(ValIn2);
7039
7040 TestData.r32ValIn1 = iTest < cTests ? RandR32Src(iTest) : s_aSpecials[iTest - cTests].Val1;
7041 TestData.r32ValIn2 = iTest < cTests ? RandR32Src(iTest) : s_aSpecials[iTest - cTests].Val2;
7042
7043 ValIn1.ar32[0] = TestData.r32ValIn1;
7044 ValIn2.ar32[0] = TestData.r32ValIn2;
7045
7046 if ( RTFLOAT32U_IS_NORMAL(&TestData.r32ValIn1)
7047 && RTFLOAT32U_IS_NORMAL(&TestData.r32ValIn2))
7048 cNormalInputPairs++;
7049 else if (cNormalInputPairs < cMinNormalPairs && iTest + cMinNormalPairs >= cTests && iTest < cTests)
7050 {
7051 iTest -= 1;
7052 continue;
7053 }
7054
7055 uint32_t const fMxcsr = RandMxcsr() & X86_MXCSR_XCPT_FLAGS;
7056 uint32_t const fEFlags = RandEFlags();
7057 for (uint16_t iRounding = 0; iRounding < 4; iRounding++)
7058 for (uint8_t iDaz = 0; iDaz < 2; iDaz++)
7059 for (uint8_t iFz = 0; iFz < 2; iFz++)
7060 {
7061 uint32_t fMxcsrIn = (fMxcsr & ~X86_MXCSR_RC_MASK)
7062 | (iRounding << X86_MXCSR_RC_SHIFT)
7063 | (iDaz ? X86_MXCSR_DAZ : 0)
7064 | (iFz ? X86_MXCSR_FZ : 0)
7065 | X86_MXCSR_XCPT_MASK;
7066 uint32_t fMxcsrM = fMxcsrIn;
7067 uint32_t fEFlagsM = fEFlags;
7068 pfn(&fMxcsrM, &fEFlagsM, &ValIn1, &ValIn2);
7069 TestData.fMxcsrIn = fMxcsrIn;
7070 TestData.fMxcsrOut = fMxcsrM;
7071 TestData.fEflIn = fEFlags;
7072 TestData.fEflOut = fEFlagsM;
7073 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
7074
7075 fMxcsrIn &= ~X86_MXCSR_XCPT_MASK;
7076 uint32_t fMxcsrU = fMxcsrIn;
7077 uint32_t fEFlagsU = fEFlags;
7078 pfn(&fMxcsrU, &fEFlagsU, &ValIn1, &ValIn2);
7079 TestData.fMxcsrIn = fMxcsrIn;
7080 TestData.fMxcsrOut = fMxcsrU;
7081 TestData.fEflIn = fEFlags;
7082 TestData.fEflOut = fEFlagsU;
7083 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
7084
7085 uint16_t fXcpt = (fMxcsrM | fMxcsrU) & X86_MXCSR_XCPT_FLAGS;
7086 if (fXcpt)
7087 {
7088 fMxcsrIn = (fMxcsrIn & ~X86_MXCSR_XCPT_MASK) | fXcpt;
7089 uint32_t fMxcsr1 = fMxcsrIn;
7090 uint32_t fEFlags1 = fEFlags;
7091 pfn(&fMxcsr1, &fEFlags1, &ValIn1, &ValIn2);
7092 TestData.fMxcsrIn = fMxcsrIn;
7093 TestData.fMxcsrOut = fMxcsr1;
7094 TestData.fEflIn = fEFlags;
7095 TestData.fEflOut = fEFlags1;
7096 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
7097
7098 if (((fMxcsr1 & X86_MXCSR_XCPT_FLAGS) & fXcpt) != (fMxcsr1 & X86_MXCSR_XCPT_FLAGS))
7099 {
7100 fXcpt |= fMxcsr1 & X86_MXCSR_XCPT_FLAGS;
7101 fMxcsrIn = (fMxcsrIn & ~X86_MXCSR_XCPT_MASK) | (fXcpt << X86_MXCSR_XCPT_MASK_SHIFT);
7102 uint32_t fMxcsr2 = fMxcsrIn;
7103 uint32_t fEFlags2 = fEFlags;
7104 pfn(&fMxcsr2, &fEFlags2, &ValIn1, &ValIn2);
7105 TestData.fMxcsrIn = fMxcsrIn;
7106 TestData.fMxcsrOut = fMxcsr2;
7107 TestData.fEflIn = fEFlags;
7108 TestData.fEflOut = fEFlags2;
7109 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
7110 }
7111 if (!RT_IS_POWER_OF_TWO(fXcpt))
7112 for (uint16_t fUnmasked = 1; fUnmasked <= X86_MXCSR_PE; fUnmasked <<= 1)
7113 if (fUnmasked & fXcpt)
7114 {
7115 fMxcsrIn = (fMxcsrIn & ~X86_MXCSR_XCPT_MASK) | ((fXcpt & ~fUnmasked) << X86_MXCSR_XCPT_MASK_SHIFT);
7116 uint32_t fMxcsr3 = fMxcsrIn;
7117 uint32_t fEFlags3 = fEFlags;
7118 pfn(&fMxcsr3, &fEFlags3, &ValIn1, &ValIn2);
7119 TestData.fMxcsrIn = fMxcsrIn;
7120 TestData.fMxcsrOut = fMxcsr3;
7121 TestData.fEflIn = fEFlags;
7122 TestData.fEflOut = fEFlags3;
7123 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
7124 }
7125 }
7126 }
7127 }
7128 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE);
7129 }
7130
7131 return RTEXITCODE_SUCCESS;
7132}
7133#endif
7134
7135static void SseCompareEflR32R32Test(void)
7136{
7137 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseCompareEflR32R32); iFn++)
7138 {
7139 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aSseCompareEflR32R32[iFn]))
7140 continue;
7141
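        /* Note: pcTests seems to hold the decompressed data size in bytes,
           hence the division by the test record size in the loop below. */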
7142 uint32_t const cTests = *g_aSseCompareEflR32R32[iFn].pcTests;
7143 SSE_COMPARE_EFL_R32_R32_TEST_T const * const paTests = g_aSseCompareEflR32R32[iFn].paTests;
7144 PFNIEMAIMPLF2EFLMXCSR128 pfn = g_aSseCompareEflR32R32[iFn].pfn;
7145 uint32_t const cVars = COUNT_VARIATIONS(g_aSseCompareEflR32R32[iFn]);
7146 if (!cTests) RTTestSkipped(g_hTest, "no tests");
7147 for (uint32_t iVar = 0; iVar < cVars; iVar++)
7148 {
7149 for (uint32_t iTest = 0; iTest < cTests / sizeof(SSE_COMPARE_EFL_R32_R32_TEST_T); iTest++)
7150 {
7151 X86XMMREG ValIn1; RT_ZERO(ValIn1);
7152 X86XMMREG ValIn2; RT_ZERO(ValIn2);
7153
7154 ValIn1.ar32[0] = paTests[iTest].r32ValIn1;
7155 ValIn2.ar32[0] = paTests[iTest].r32ValIn2;
7156 uint32_t fMxcsr = paTests[iTest].fMxcsrIn;
7157 uint32_t fEFlags = paTests[iTest].fEflIn;
7158 pfn(&fMxcsr, &fEFlags, &ValIn1, &ValIn2);
7159 if ( fMxcsr != paTests[iTest].fMxcsrOut
7160 || fEFlags != paTests[iTest].fEflOut)
7161 RTTestFailed(g_hTest, "#%04u%s: mxcsr=%#08x efl=%#08x in1=%s in2=%s\n"
7162 "%s -> mxcsr=%#08x %#08x\n"
7163 "%s expected %#08x %#08x%s (%s) (EFL: %s)\n",
7164 iTest, iVar ? "/n" : "", paTests[iTest].fMxcsrIn, paTests[iTest].fEflIn,
7165 FormatR32(&paTests[iTest].r32ValIn1), FormatR32(&paTests[iTest].r32ValIn2),
7166 iVar ? " " : "", fMxcsr, fEFlags,
7167 iVar ? " " : "", paTests[iTest].fMxcsrOut, paTests[iTest].fEflOut,
7168 MxcsrDiff(fMxcsr, paTests[iTest].fMxcsrOut),
7169 FormatMxcsr(paTests[iTest].fMxcsrIn),
7170 EFlagsDiff(fEFlags, paTests[iTest].fEflOut));
7171 }
7172 }
7173 }
7174}
7175
7176
7177/*
7178 * Compare SSE operations on single double-precision floating point values - outputting only EFLAGS.
7179 */
7180TYPEDEF_SUBTEST_TYPE(SSE_COMPARE_EFL_R64_R64_T, SSE_COMPARE_EFL_R64_R64_TEST_T, PFNIEMAIMPLF2EFLMXCSR128);
7181
7182static SSE_COMPARE_EFL_R64_R64_T g_aSseCompareEflR64R64[] =
7183{
7184 ENTRY_BIN(ucomisd_u128),
7185 ENTRY_BIN(comisd_u128),
7186 ENTRY_BIN_AVX(vucomisd_u128),
7187 ENTRY_BIN_AVX(vcomisd_u128)
7188};
7189
7190#ifdef TSTIEMAIMPL_WITH_GENERATOR
7191static RTEXITCODE SseCompareEflR64R64Generate(const char *pszDataFileFmt, uint32_t cTests)
7192{
7193 cTests = RT_MAX(192, cTests); /* there are 144 standard input variations */
7194
7195 static struct { RTFLOAT64U Val1, Val2; } const s_aSpecials[] =
7196 {
7197 { RTFLOAT64U_INIT_ZERO(0), RTFLOAT64U_INIT_ZERO(0) },
7198 { RTFLOAT64U_INIT_ZERO(0), RTFLOAT64U_INIT_ZERO(1) },
7199 { RTFLOAT64U_INIT_ZERO(1), RTFLOAT64U_INIT_ZERO(0) },
7200 { RTFLOAT64U_INIT_ZERO(1), RTFLOAT64U_INIT_ZERO(1) },
7201 { RTFLOAT64U_INIT_INF(0), RTFLOAT64U_INIT_INF(0) },
7202 { RTFLOAT64U_INIT_INF(0), RTFLOAT64U_INIT_INF(1) },
7203 { RTFLOAT64U_INIT_INF(1), RTFLOAT64U_INIT_INF(0) },
7204 { RTFLOAT64U_INIT_INF(1), RTFLOAT64U_INIT_INF(1) },
7205 /** @todo More specials. */
7206 };
7207
7208 uint32_t cMinNormalPairs = (cTests - 144) / 4;
7209 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseCompareEflR64R64); iFn++)
7210 {
7211 PFNIEMAIMPLF2EFLMXCSR128 const pfn = g_aSseCompareEflR64R64[iFn].pfnNative ? g_aSseCompareEflR64R64[iFn].pfnNative : g_aSseCompareEflR64R64[iFn].pfn;
7212
7213 IEMBINARYOUTPUT BinOut;
7214 AssertReturn(GenerateBinaryOpen(&BinOut, pszDataFileFmt, g_aSseCompareEflR64R64[iFn].pszName), RTEXITCODE_FAILURE);
7215
7216 uint32_t cNormalInputPairs = 0;
7217 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aSpecials); iTest += 1)
7218 {
7219 SSE_COMPARE_EFL_R64_R64_TEST_T TestData; RT_ZERO(TestData);
7220 X86XMMREG ValIn1; RT_ZERO(ValIn1);
7221 X86XMMREG ValIn2; RT_ZERO(ValIn2);
7222
7223 TestData.r64ValIn1 = iTest < cTests ? RandR64Src(iTest) : s_aSpecials[iTest - cTests].Val1;
7224 TestData.r64ValIn2 = iTest < cTests ? RandR64Src(iTest) : s_aSpecials[iTest - cTests].Val2;
7225
7226 ValIn1.ar64[0] = TestData.r64ValIn1;
7227 ValIn2.ar64[0] = TestData.r64ValIn2;
7228
7229 if ( RTFLOAT64U_IS_NORMAL(&TestData.r64ValIn1)
7230 && RTFLOAT64U_IS_NORMAL(&TestData.r64ValIn2))
7231 cNormalInputPairs++;
7232 else if (cNormalInputPairs < cMinNormalPairs && iTest + cMinNormalPairs >= cTests && iTest < cTests)
7233 {
7234 iTest -= 1;
7235 continue;
7236 }
7237
7238 uint32_t const fMxcsr = RandMxcsr() & X86_MXCSR_XCPT_FLAGS;
7239 uint32_t const fEFlags = RandEFlags();
7240 for (uint16_t iRounding = 0; iRounding < 4; iRounding++)
7241 for (uint8_t iDaz = 0; iDaz < 2; iDaz++)
7242 for (uint8_t iFz = 0; iFz < 2; iFz++)
7243 {
7244 uint32_t fMxcsrIn = (fMxcsr & ~X86_MXCSR_RC_MASK)
7245 | (iRounding << X86_MXCSR_RC_SHIFT)
7246 | (iDaz ? X86_MXCSR_DAZ : 0)
7247 | (iFz ? X86_MXCSR_FZ : 0)
7248 | X86_MXCSR_XCPT_MASK;
7249 uint32_t fMxcsrM = fMxcsrIn;
7250 uint32_t fEFlagsM = fEFlags;
7251 pfn(&fMxcsrM, &fEFlagsM, &ValIn1, &ValIn2);
7252 TestData.fMxcsrIn = fMxcsrIn;
7253 TestData.fMxcsrOut = fMxcsrM;
7254 TestData.fEflIn = fEFlags;
7255 TestData.fEflOut = fEFlagsM;
7256 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
7257
7258 fMxcsrIn &= ~X86_MXCSR_XCPT_MASK;
7259 uint32_t fMxcsrU = fMxcsrIn;
7260 uint32_t fEFlagsU = fEFlags;
7261 pfn(&fMxcsrU, &fEFlagsU, &ValIn1, &ValIn2);
7262 TestData.fMxcsrIn = fMxcsrIn;
7263 TestData.fMxcsrOut = fMxcsrU;
7264 TestData.fEflIn = fEFlags;
7265 TestData.fEflOut = fEFlagsU;
7266 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
7267
7268 uint16_t fXcpt = (fMxcsrM | fMxcsrU) & X86_MXCSR_XCPT_FLAGS;
7269 if (fXcpt)
7270 {
7271 fMxcsrIn = (fMxcsrIn & ~X86_MXCSR_XCPT_MASK) | fXcpt;
7272 uint32_t fMxcsr1 = fMxcsrIn;
7273 uint32_t fEFlags1 = fEFlags;
7274 pfn(&fMxcsr1, &fEFlags1, &ValIn1, &ValIn2);
7275 TestData.fMxcsrIn = fMxcsrIn;
7276 TestData.fMxcsrOut = fMxcsr1;
7277 TestData.fEflIn = fEFlags;
7278 TestData.fEflOut = fEFlags1;
7279 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
7280
7281 if (((fMxcsr1 & X86_MXCSR_XCPT_FLAGS) & fXcpt) != (fMxcsr1 & X86_MXCSR_XCPT_FLAGS))
7282 {
7283 fXcpt |= fMxcsr1 & X86_MXCSR_XCPT_FLAGS;
7284 fMxcsrIn = (fMxcsrIn & ~X86_MXCSR_XCPT_MASK) | (fXcpt << X86_MXCSR_XCPT_MASK_SHIFT);
7285 uint32_t fMxcsr2 = fMxcsrIn;
7286 uint32_t fEFlags2 = fEFlags;
7287 pfn(&fMxcsr2, &fEFlags2, &ValIn1, &ValIn2);
7288 TestData.fMxcsrIn = fMxcsrIn;
7289 TestData.fMxcsrOut = fMxcsr2;
7290 TestData.fEflIn = fEFlags;
7291 TestData.fEflOut = fEFlags2;
7292 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
7293 }
7294 if (!RT_IS_POWER_OF_TWO(fXcpt))
7295 for (uint16_t fUnmasked = 1; fUnmasked <= X86_MXCSR_PE; fUnmasked <<= 1)
7296 if (fUnmasked & fXcpt)
7297 {
7298 fMxcsrIn = (fMxcsrIn & ~X86_MXCSR_XCPT_MASK) | ((fXcpt & ~fUnmasked) << X86_MXCSR_XCPT_MASK_SHIFT);
7299 uint32_t fMxcsr3 = fMxcsrIn;
7300 uint32_t fEFlags3 = fEFlags;
7301 pfn(&fMxcsr3, &fEFlags3, &ValIn1, &ValIn2);
7302 TestData.fMxcsrIn = fMxcsrIn;
7303 TestData.fMxcsrOut = fMxcsr3;
7304 TestData.fEflIn = fEFlags;
7305 TestData.fEflOut = fEFlags3;
7306 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
7307 }
7308 }
7309 }
7310 }
7311 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE);
7312 }
7313
7314 return RTEXITCODE_SUCCESS;
7315}
7316#endif
7317
7318static void SseCompareEflR64R64Test(void)
7319{
7320 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseCompareEflR64R64); iFn++)
7321 {
7322 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aSseCompareEflR64R64[iFn]))
7323 continue;
7324
7325 uint32_t const cTests = *g_aSseCompareEflR64R64[iFn].pcTests;
7326 SSE_COMPARE_EFL_R64_R64_TEST_T const * const paTests = g_aSseCompareEflR64R64[iFn].paTests;
7327 PFNIEMAIMPLF2EFLMXCSR128 pfn = g_aSseCompareEflR64R64[iFn].pfn;
7328 uint32_t const cVars = COUNT_VARIATIONS(g_aSseCompareEflR64R64[iFn]);
7329 if (!cTests) RTTestSkipped(g_hTest, "no tests");
7330 for (uint32_t iVar = 0; iVar < cVars; iVar++)
7331 {
7332 for (uint32_t iTest = 0; iTest < cTests / sizeof(SSE_COMPARE_EFL_R64_R64_TEST_T); iTest++)
7333 {
7334 X86XMMREG ValIn1; RT_ZERO(ValIn1);
7335 X86XMMREG ValIn2; RT_ZERO(ValIn2);
7336
7337 ValIn1.ar64[0] = paTests[iTest].r64ValIn1;
7338 ValIn2.ar64[0] = paTests[iTest].r64ValIn2;
7339 uint32_t fMxcsr = paTests[iTest].fMxcsrIn;
7340 uint32_t fEFlags = paTests[iTest].fEflIn;
7341 pfn(&fMxcsr, &fEFlags, &ValIn1, &ValIn2);
7342 if ( fMxcsr != paTests[iTest].fMxcsrOut
7343 || fEFlags != paTests[iTest].fEflOut)
7344 RTTestFailed(g_hTest, "#%04u%s: mxcsr=%#08x efl=%#08x in1=%s in2=%s\n"
7345 "%s -> mxcsr=%#08x %#08x\n"
7346 "%s expected %#08x %#08x%s (%s) (EFL: %s)\n",
7347 iTest, iVar ? "/n" : "", paTests[iTest].fMxcsrIn, paTests[iTest].fEflIn,
7348 FormatR64(&paTests[iTest].r64ValIn1), FormatR64(&paTests[iTest].r64ValIn2),
7349 iVar ? " " : "", fMxcsr, fEFlags,
7350 iVar ? " " : "", paTests[iTest].fMxcsrOut, paTests[iTest].fEflOut,
7351 MxcsrDiff(fMxcsr, paTests[iTest].fMxcsrOut),
7352 FormatMxcsr(paTests[iTest].fMxcsrIn),
7353 EFlagsDiff(fEFlags, paTests[iTest].fEflOut));
7354 }
7355 }
7356 }
7357}
7358
7359
7360/*
7361 * Compare SSE operations on packed and single single-precision floating point values - outputting a mask.
7362 */
7363/** Maximum immediate to try, to keep the test data size under control (at least a little bit). */
7364#define SSE_COMPARE_F2_XMM_IMM8_MAX 0x1f
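/*
 * Note: plain SSE cmpps/cmpss only define the eight compare predicates 0..7
 * (EQ, LT, LE, UNORD, NEQ, NLT, NLE, ORD); sweeping the immediate up to 0x1f
 * additionally exercises whatever the implementation does with the extended
 * AVX-style predicate range (0..31).
 */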
7365
7366TYPEDEF_SUBTEST_TYPE(SSE_COMPARE_F2_XMM_IMM8_T, SSE_COMPARE_F2_XMM_IMM8_TEST_T, PFNIEMAIMPLMXCSRF2XMMIMM8);
7367
7368static SSE_COMPARE_F2_XMM_IMM8_T g_aSseCompareF2XmmR32Imm8[] =
7369{
7370 ENTRY_BIN(cmpps_u128),
7371 ENTRY_BIN(cmpss_u128)
7372};
7373
7374#ifdef TSTIEMAIMPL_WITH_GENERATOR
7375static RTEXITCODE SseCompareF2XmmR32Imm8Generate(const char *pszDataFileFmt, uint32_t cTests)
7376{
7377 cTests = RT_MAX(192, cTests); /* there are 144 standard input variations */
7378
7379 static struct { RTFLOAT32U Val1, Val2; } const s_aSpecials[] =
7380 {
7381 { RTFLOAT32U_INIT_ZERO(0), RTFLOAT32U_INIT_ZERO(0) },
7382 { RTFLOAT32U_INIT_ZERO(0), RTFLOAT32U_INIT_ZERO(1) },
7383 { RTFLOAT32U_INIT_ZERO(1), RTFLOAT32U_INIT_ZERO(0) },
7384 { RTFLOAT32U_INIT_ZERO(1), RTFLOAT32U_INIT_ZERO(1) },
7385 { RTFLOAT32U_INIT_INF(0), RTFLOAT32U_INIT_INF(0) },
7386 { RTFLOAT32U_INIT_INF(0), RTFLOAT32U_INIT_INF(1) },
7387 { RTFLOAT32U_INIT_INF(1), RTFLOAT32U_INIT_INF(0) },
7388 { RTFLOAT32U_INIT_INF(1), RTFLOAT32U_INIT_INF(1) },
7389 /** @todo More specials. */
7390 };
7391
7392 uint32_t cMinNormalPairs = (cTests - 144) / 4;
7393 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseCompareF2XmmR32Imm8); iFn++)
7394 {
7395 PFNIEMAIMPLMXCSRF2XMMIMM8 const pfn = g_aSseCompareF2XmmR32Imm8[iFn].pfnNative ? g_aSseCompareF2XmmR32Imm8[iFn].pfnNative : g_aSseCompareF2XmmR32Imm8[iFn].pfn;
7396
7397 IEMBINARYOUTPUT BinOut;
7398 AssertReturn(GenerateBinaryOpen(&BinOut, pszDataFileFmt, g_aSseCompareF2XmmR32Imm8[iFn].pszName), RTEXITCODE_FAILURE);
7399
7400 uint32_t cNormalInputPairs = 0;
7401 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aSpecials); iTest += 1)
7402 {
7403 SSE_COMPARE_F2_XMM_IMM8_TEST_T TestData; RT_ZERO(TestData);
7404
7405 TestData.InVal1.ar32[0] = iTest < cTests ? RandR32Src(iTest) : s_aSpecials[iTest - cTests].Val1;
7406 TestData.InVal1.ar32[1] = iTest < cTests ? RandR32Src(iTest) : s_aSpecials[iTest - cTests].Val1;
7407 TestData.InVal1.ar32[2] = iTest < cTests ? RandR32Src(iTest) : s_aSpecials[iTest - cTests].Val1;
7408 TestData.InVal1.ar32[3] = iTest < cTests ? RandR32Src(iTest) : s_aSpecials[iTest - cTests].Val1;
7409
7410 TestData.InVal2.ar32[0] = iTest < cTests ? RandR32Src(iTest) : s_aSpecials[iTest - cTests].Val2;
7411 TestData.InVal2.ar32[1] = iTest < cTests ? RandR32Src(iTest) : s_aSpecials[iTest - cTests].Val2;
7412 TestData.InVal2.ar32[2] = iTest < cTests ? RandR32Src(iTest) : s_aSpecials[iTest - cTests].Val2;
7413 TestData.InVal2.ar32[3] = iTest < cTests ? RandR32Src(iTest) : s_aSpecials[iTest - cTests].Val2;
7414
7415 if ( RTFLOAT32U_IS_NORMAL(&TestData.InVal1.ar32[0])
7416 && RTFLOAT32U_IS_NORMAL(&TestData.InVal1.ar32[1])
7417 && RTFLOAT32U_IS_NORMAL(&TestData.InVal1.ar32[2])
7418 && RTFLOAT32U_IS_NORMAL(&TestData.InVal1.ar32[3])
7419 && RTFLOAT32U_IS_NORMAL(&TestData.InVal2.ar32[0])
7420 && RTFLOAT32U_IS_NORMAL(&TestData.InVal2.ar32[1])
7421 && RTFLOAT32U_IS_NORMAL(&TestData.InVal2.ar32[2])
7422 && RTFLOAT32U_IS_NORMAL(&TestData.InVal2.ar32[3]))
7423 cNormalInputPairs++;
7424 else if (cNormalInputPairs < cMinNormalPairs && iTest + cMinNormalPairs >= cTests && iTest < cTests)
7425 {
7426 iTest -= 1;
7427 continue;
7428 }
7429
7430 IEMMEDIAF2XMMSRC Src;
7431 Src.uSrc1 = TestData.InVal1;
7432 Src.uSrc2 = TestData.InVal2;
7433 uint32_t const fMxcsr = RandMxcsr() & X86_MXCSR_XCPT_FLAGS;
7434 for (uint8_t bImm = 0; bImm <= SSE_COMPARE_F2_XMM_IMM8_MAX; bImm++)
7435 for (uint16_t iRounding = 0; iRounding < 4; iRounding++)
7436 for (uint8_t iDaz = 0; iDaz < 2; iDaz++)
7437 for (uint8_t iFz = 0; iFz < 2; iFz++)
7438 {
7439 uint32_t fMxcsrIn = (fMxcsr & ~X86_MXCSR_RC_MASK)
7440 | (iRounding << X86_MXCSR_RC_SHIFT)
7441 | (iDaz ? X86_MXCSR_DAZ : 0)
7442 | (iFz ? X86_MXCSR_FZ : 0)
7443 | X86_MXCSR_XCPT_MASK;
7444 uint32_t fMxcsrM = fMxcsrIn;
7445 X86XMMREG ResM;
7446 pfn(&fMxcsrM, &ResM, &Src, bImm);
7447 TestData.fMxcsrIn = fMxcsrIn;
7448 TestData.fMxcsrOut = fMxcsrM;
7449 TestData.bImm = bImm;
7450 TestData.OutVal = ResM;
7451 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
7452
7453 fMxcsrIn &= ~X86_MXCSR_XCPT_MASK;
7454 uint32_t fMxcsrU = fMxcsrIn;
7455 X86XMMREG ResU;
7456 pfn(&fMxcsrU, &ResU, &Src, bImm);
7457 TestData.fMxcsrIn = fMxcsrIn;
7458 TestData.fMxcsrOut = fMxcsrU;
7459 TestData.bImm = bImm;
7460 TestData.OutVal = ResU;
7461 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
7462
7463 uint16_t fXcpt = (fMxcsrM | fMxcsrU) & X86_MXCSR_XCPT_FLAGS;
7464 if (fXcpt)
7465 {
7466 fMxcsrIn = (fMxcsrIn & ~X86_MXCSR_XCPT_MASK) | fXcpt;
7467 uint32_t fMxcsr1 = fMxcsrIn;
7468 X86XMMREG Res1;
7469 pfn(&fMxcsr1, &Res1, &Src, bImm);
7470 TestData.fMxcsrIn = fMxcsrIn;
7471 TestData.fMxcsrOut = fMxcsr1;
7472 TestData.bImm = bImm;
7473 TestData.OutVal = Res1;
7474 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
7475
7476 if (((fMxcsr1 & X86_MXCSR_XCPT_FLAGS) & fXcpt) != (fMxcsr1 & X86_MXCSR_XCPT_FLAGS))
7477 {
7478 fXcpt |= fMxcsr1 & X86_MXCSR_XCPT_FLAGS;
7479 fMxcsrIn = (fMxcsrIn & ~X86_MXCSR_XCPT_MASK) | (fXcpt << X86_MXCSR_XCPT_MASK_SHIFT);
7480 uint32_t fMxcsr2 = fMxcsrIn;
7481 X86XMMREG Res2;
7482 pfn(&fMxcsr2, &Res2, &Src, bImm);
7483 TestData.fMxcsrIn = fMxcsrIn;
7484 TestData.fMxcsrOut = fMxcsr2;
7485 TestData.bImm = bImm;
7486 TestData.OutVal = Res2;
7487 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
7488 }
7489 if (!RT_IS_POWER_OF_TWO(fXcpt))
7490 for (uint16_t fUnmasked = 1; fUnmasked <= X86_MXCSR_PE; fUnmasked <<= 1)
7491 if (fUnmasked & fXcpt)
7492 {
7493 fMxcsrIn = (fMxcsrIn & ~X86_MXCSR_XCPT_MASK) | ((fXcpt & ~fUnmasked) << X86_MXCSR_XCPT_MASK_SHIFT);
7494 uint32_t fMxcsr3 = fMxcsrIn;
7495 X86XMMREG Res3;
7496 pfn(&fMxcsr3, &Res3, &Src, bImm);
7497 TestData.fMxcsrIn = fMxcsrIn;
7498 TestData.fMxcsrOut = fMxcsr3;
7499 TestData.bImm = bImm;
7500 TestData.OutVal = Res3;
7501 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
7502 }
7503 }
7504 }
7505 }
7506 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE);
7507 }
7508
7509 return RTEXITCODE_SUCCESS;
7510}
7511#endif
7512
7513static void SseCompareF2XmmR32Imm8Test(void)
7514{
7515 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseCompareF2XmmR32Imm8); iFn++)
7516 {
7517 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aSseCompareF2XmmR32Imm8[iFn]))
7518 continue;
7519
7520 uint32_t const cTests = *g_aSseCompareF2XmmR32Imm8[iFn].pcTests;
7521 SSE_COMPARE_F2_XMM_IMM8_TEST_T const * const paTests = g_aSseCompareF2XmmR32Imm8[iFn].paTests;
7522 PFNIEMAIMPLMXCSRF2XMMIMM8 pfn = g_aSseCompareF2XmmR32Imm8[iFn].pfn;
7523 uint32_t const cVars = COUNT_VARIATIONS(g_aSseCompareF2XmmR32Imm8[iFn]);
7524 if (!cTests) RTTestSkipped(g_hTest, "no tests");
7525 for (uint32_t iVar = 0; iVar < cVars; iVar++)
7526 {
7527 for (uint32_t iTest = 0; iTest < cTests / sizeof(SSE_COMPARE_F2_XMM_IMM8_TEST_T); iTest++)
7528 {
7529 IEMMEDIAF2XMMSRC Src;
7530 X86XMMREG ValOut;
7531
7532 Src.uSrc1 = paTests[iTest].InVal1;
7533 Src.uSrc2 = paTests[iTest].InVal2;
7534 uint32_t fMxcsr = paTests[iTest].fMxcsrIn;
7535 pfn(&fMxcsr, &ValOut, &Src, paTests[iTest].bImm);
7536 if ( fMxcsr != paTests[iTest].fMxcsrOut
7537 || ValOut.au32[0] != paTests[iTest].OutVal.au32[0]
7538 || ValOut.au32[1] != paTests[iTest].OutVal.au32[1]
7539 || ValOut.au32[2] != paTests[iTest].OutVal.au32[2]
7540 || ValOut.au32[3] != paTests[iTest].OutVal.au32[3])
7541 RTTestFailed(g_hTest, "#%04u%s: mxcsr=%#08x in1=%s'%s'%s'%s in2=%s'%s'%s'%s imm8=%x\n"
7542 "%s -> mxcsr=%#08x %RX32'%RX32'%RX32'%RX32\n"
7543 "%s expected %#08x %RX32'%RX32'%RX32'%RX32%s%s (%s)\n",
7544 iTest, iVar ? "/n" : "", paTests[iTest].fMxcsrIn,
7545 FormatR32(&paTests[iTest].InVal1.ar32[0]), FormatR32(&paTests[iTest].InVal1.ar32[1]),
7546 FormatR32(&paTests[iTest].InVal1.ar32[2]), FormatR32(&paTests[iTest].InVal1.ar32[3]),
7547 FormatR32(&paTests[iTest].InVal2.ar32[0]), FormatR32(&paTests[iTest].InVal2.ar32[1]),
7548 FormatR32(&paTests[iTest].InVal2.ar32[2]), FormatR32(&paTests[iTest].InVal2.ar32[3]),
7549 paTests[iTest].bImm,
7550 iVar ? " " : "", fMxcsr, ValOut.au32[0], ValOut.au32[1], ValOut.au32[2], ValOut.au32[3],
7551 iVar ? " " : "", paTests[iTest].fMxcsrOut,
7552 paTests[iTest].OutVal.au32[0], paTests[iTest].OutVal.au32[1],
7553 paTests[iTest].OutVal.au32[2], paTests[iTest].OutVal.au32[3],
7554 MxcsrDiff(fMxcsr, paTests[iTest].fMxcsrOut),
7555 ( ValOut.au32[0] != paTests[iTest].OutVal.au32[0]
7556 || ValOut.au32[1] != paTests[iTest].OutVal.au32[1]
7557 || ValOut.au32[2] != paTests[iTest].OutVal.au32[2]
7558 || ValOut.au32[3] != paTests[iTest].OutVal.au32[3])
7559 ? " - val" : "",
7560 FormatMxcsr(paTests[iTest].fMxcsrIn));
7561 }
7562 }
7563 }
7564}
7565
7566
7567/*
7568 * Compare SSE operations on packed and single double-precision floating point values - outputting a mask.
7569 */
7570static SSE_COMPARE_F2_XMM_IMM8_T g_aSseCompareF2XmmR64Imm8[] =
7571{
7572 ENTRY_BIN(cmppd_u128),
7573 ENTRY_BIN(cmpsd_u128)
7574};
7575
7576#ifdef TSTIEMAIMPL_WITH_GENERATOR
7577static RTEXITCODE SseCompareF2XmmR64Imm8Generate(const char *pszDataFileFmt, uint32_t cTests)
7578{
7579 cTests = RT_MAX(192, cTests); /* there are 144 standard input variations */
7580
7581 static struct { RTFLOAT64U Val1, Val2; } const s_aSpecials[] =
7582 {
7583 { RTFLOAT64U_INIT_ZERO(0), RTFLOAT64U_INIT_ZERO(0) },
7584 { RTFLOAT64U_INIT_ZERO(0), RTFLOAT64U_INIT_ZERO(1) },
7585 { RTFLOAT64U_INIT_ZERO(1), RTFLOAT64U_INIT_ZERO(0) },
7586 { RTFLOAT64U_INIT_ZERO(1), RTFLOAT64U_INIT_ZERO(1) },
7587 { RTFLOAT64U_INIT_INF(0), RTFLOAT64U_INIT_INF(0) },
7588 { RTFLOAT64U_INIT_INF(0), RTFLOAT64U_INIT_INF(1) },
7589 { RTFLOAT64U_INIT_INF(1), RTFLOAT64U_INIT_INF(0) },
7590 { RTFLOAT64U_INIT_INF(1), RTFLOAT64U_INIT_INF(1) },
7591 /** @todo More specials. */
7592 };
7593
7594 uint32_t cMinNormalPairs = (cTests - 144) / 4;
7595 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseCompareF2XmmR64Imm8); iFn++)
7596 {
7597 PFNIEMAIMPLMXCSRF2XMMIMM8 const pfn = g_aSseCompareF2XmmR64Imm8[iFn].pfnNative ? g_aSseCompareF2XmmR64Imm8[iFn].pfnNative : g_aSseCompareF2XmmR64Imm8[iFn].pfn;
7598
7599 IEMBINARYOUTPUT BinOut;
7600 AssertReturn(GenerateBinaryOpen(&BinOut, pszDataFileFmt, g_aSseCompareF2XmmR64Imm8[iFn].pszName), RTEXITCODE_FAILURE);
7601
7602 uint32_t cNormalInputPairs = 0;
7603 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aSpecials); iTest += 1)
7604 {
7605 SSE_COMPARE_F2_XMM_IMM8_TEST_T TestData; RT_ZERO(TestData);
7606
7607 TestData.InVal1.ar64[0] = iTest < cTests ? RandR64Src(iTest) : s_aSpecials[iTest - cTests].Val1;
7608 TestData.InVal1.ar64[1] = iTest < cTests ? RandR64Src(iTest) : s_aSpecials[iTest - cTests].Val1;
7609
7610 TestData.InVal2.ar64[0] = iTest < cTests ? RandR64Src(iTest) : s_aSpecials[iTest - cTests].Val2;
7611 TestData.InVal2.ar64[1] = iTest < cTests ? RandR64Src(iTest) : s_aSpecials[iTest - cTests].Val2;
7612
7613 if ( RTFLOAT64U_IS_NORMAL(&TestData.InVal1.ar64[0])
7614 && RTFLOAT64U_IS_NORMAL(&TestData.InVal1.ar64[1])
7615 && RTFLOAT64U_IS_NORMAL(&TestData.InVal2.ar64[0])
7616 && RTFLOAT64U_IS_NORMAL(&TestData.InVal2.ar64[1]))
7617 cNormalInputPairs++;
7618 else if (cNormalInputPairs < cMinNormalPairs && iTest + cMinNormalPairs >= cTests && iTest < cTests)
7619 {
7620 iTest -= 1;
7621 continue;
7622 }
7623
7624 IEMMEDIAF2XMMSRC Src;
7625 Src.uSrc1 = TestData.InVal1;
7626 Src.uSrc2 = TestData.InVal2;
7627 uint32_t const fMxcsr = RandMxcsr() & X86_MXCSR_XCPT_FLAGS;
7628 for (uint8_t bImm = 0; bImm <= SSE_COMPARE_F2_XMM_IMM8_MAX; bImm++)
7629 for (uint16_t iRounding = 0; iRounding < 4; iRounding++)
7630 for (uint8_t iDaz = 0; iDaz < 2; iDaz++)
7631 for (uint8_t iFz = 0; iFz < 2; iFz++)
7632 {
7633 uint32_t fMxcsrIn = (fMxcsr & ~X86_MXCSR_RC_MASK)
7634 | (iRounding << X86_MXCSR_RC_SHIFT)
7635 | (iDaz ? X86_MXCSR_DAZ : 0)
7636 | (iFz ? X86_MXCSR_FZ : 0)
7637 | X86_MXCSR_XCPT_MASK;
7638 uint32_t fMxcsrM = fMxcsrIn;
7639 X86XMMREG ResM;
7640 pfn(&fMxcsrM, &ResM, &Src, bImm);
7641 TestData.fMxcsrIn = fMxcsrIn;
7642 TestData.fMxcsrOut = fMxcsrM;
7643 TestData.bImm = bImm;
7644 TestData.OutVal = ResM;
7645 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
7646
7647 fMxcsrIn &= ~X86_MXCSR_XCPT_MASK;
7648 uint32_t fMxcsrU = fMxcsrIn;
7649 X86XMMREG ResU;
7650 pfn(&fMxcsrU, &ResU, &Src, bImm);
7651 TestData.fMxcsrIn = fMxcsrIn;
7652 TestData.fMxcsrOut = fMxcsrU;
7653 TestData.bImm = bImm;
7654 TestData.OutVal = ResU;
7655 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
7656
7657 uint16_t fXcpt = (fMxcsrM | fMxcsrU) & X86_MXCSR_XCPT_FLAGS;
7658 if (fXcpt)
7659 {
7660 fMxcsrIn = (fMxcsrIn & ~X86_MXCSR_XCPT_MASK) | fXcpt;
7661 uint32_t fMxcsr1 = fMxcsrIn;
7662 X86XMMREG Res1;
7663 pfn(&fMxcsr1, &Res1, &Src, bImm);
7664 TestData.fMxcsrIn = fMxcsrIn;
7665 TestData.fMxcsrOut = fMxcsr1;
7666 TestData.bImm = bImm;
7667 TestData.OutVal = Res1;
7668 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
7669
7670 if (((fMxcsr1 & X86_MXCSR_XCPT_FLAGS) & fXcpt) != (fMxcsr1 & X86_MXCSR_XCPT_FLAGS))
7671 {
7672 fXcpt |= fMxcsr1 & X86_MXCSR_XCPT_FLAGS;
7673 fMxcsrIn = (fMxcsrIn & ~X86_MXCSR_XCPT_MASK) | (fXcpt << X86_MXCSR_XCPT_MASK_SHIFT);
7674 uint32_t fMxcsr2 = fMxcsrIn;
7675 X86XMMREG Res2;
7676 pfn(&fMxcsr2, &Res2, &Src, bImm);
7677 TestData.fMxcsrIn = fMxcsrIn;
7678 TestData.fMxcsrOut = fMxcsr2;
7679 TestData.bImm = bImm;
7680 TestData.OutVal = Res2;
7681 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
7682 }
7683 if (!RT_IS_POWER_OF_TWO(fXcpt))
7684 for (uint16_t fUnmasked = 1; fUnmasked <= X86_MXCSR_PE; fUnmasked <<= 1)
7685 if (fUnmasked & fXcpt)
7686 {
7687 fMxcsrIn = (fMxcsrIn & ~X86_MXCSR_XCPT_MASK) | ((fXcpt & ~fUnmasked) << X86_MXCSR_XCPT_MASK_SHIFT);
7688 uint32_t fMxcsr3 = fMxcsrIn;
7689 X86XMMREG Res3;
7690 pfn(&fMxcsr3, &Res3, &Src, bImm);
7691 TestData.fMxcsrIn = fMxcsrIn;
7692 TestData.fMxcsrOut = fMxcsr3;
7693 TestData.bImm = bImm;
7694 TestData.OutVal = Res3;
7695 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
7696 }
7697 }
7698 }
7699 }
7700 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE);
7701 }
7702
7703 return RTEXITCODE_SUCCESS;
7704}
7705#endif
7706
7707static void SseCompareF2XmmR64Imm8Test(void)
7708{
7709 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseCompareF2XmmR64Imm8); iFn++)
7710 {
7711 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aSseCompareF2XmmR64Imm8[iFn]))
7712 continue;
7713
7714 uint32_t const cTests = *g_aSseCompareF2XmmR64Imm8[iFn].pcTests;
7715 SSE_COMPARE_F2_XMM_IMM8_TEST_T const * const paTests = g_aSseCompareF2XmmR64Imm8[iFn].paTests;
7716 PFNIEMAIMPLMXCSRF2XMMIMM8 pfn = g_aSseCompareF2XmmR64Imm8[iFn].pfn;
7717 uint32_t const cVars = COUNT_VARIATIONS(g_aSseCompareF2XmmR64Imm8[iFn]);
7718 if (!cTests) RTTestSkipped(g_hTest, "no tests");
7719 for (uint32_t iVar = 0; iVar < cVars; iVar++)
7720 {
7721 for (uint32_t iTest = 0; iTest < cTests / sizeof(SSE_COMPARE_F2_XMM_IMM8_TEST_T); iTest++)
7722 {
7723 IEMMEDIAF2XMMSRC Src;
7724 X86XMMREG ValOut;
7725
7726 Src.uSrc1 = paTests[iTest].InVal1;
7727 Src.uSrc2 = paTests[iTest].InVal2;
7728 uint32_t fMxcsr = paTests[iTest].fMxcsrIn;
7729 pfn(&fMxcsr, &ValOut, &Src, paTests[iTest].bImm);
7730 if ( fMxcsr != paTests[iTest].fMxcsrOut
7731 || ValOut.au64[0] != paTests[iTest].OutVal.au64[0]
7732 || ValOut.au64[1] != paTests[iTest].OutVal.au64[1])
7733 RTTestFailed(g_hTest, "#%04u%s: mxcsr=%#08x in1=%s'%s in2=%s'%s imm8=%x\n"
7734 "%s -> mxcsr=%#08x %RX64'%RX64\n"
7735 "%s expected %#08x %RX64'%RX64%s%s (%s)\n",
7736 iTest, iVar ? "/n" : "", paTests[iTest].fMxcsrIn,
7737 FormatR64(&paTests[iTest].InVal1.ar64[0]), FormatR64(&paTests[iTest].InVal1.ar64[1]),
7738 FormatR64(&paTests[iTest].InVal2.ar64[0]), FormatR64(&paTests[iTest].InVal2.ar64[1]),
7739 paTests[iTest].bImm,
7740 iVar ? " " : "", fMxcsr, ValOut.au64[0], ValOut.au64[1],
7741 iVar ? " " : "", paTests[iTest].fMxcsrOut,
7742 paTests[iTest].OutVal.au64[0], paTests[iTest].OutVal.au64[1],
7743 MxcsrDiff(fMxcsr, paTests[iTest].fMxcsrOut),
7744 ( ValOut.au64[0] != paTests[iTest].OutVal.au64[0]
7745 || ValOut.au64[1] != paTests[iTest].OutVal.au64[1])
7746 ? " - val" : "",
7747 FormatMxcsr(paTests[iTest].fMxcsrIn));
7748 }
7749 }
7750 }
7751}
7752
7753
7754/*
7755 * Convert SSE operations - signed double-words to single-precision floating point values.
7756 */
7757TYPEDEF_SUBTEST_TYPE(SSE_CONVERT_XMM_T, SSE_CONVERT_XMM_TEST_T, PFNIEMAIMPLFPSSEF2U128);
7758
7759static SSE_CONVERT_XMM_T g_aSseConvertXmmI32R32[] =
7760{
7761 ENTRY_BIN(cvtdq2ps_u128)
7762};
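/*
 * Note for the reader: even an integer to single-precision conversion can be
 * inexact, since not every 32-bit integer is exactly representable in
 * binary32, which is why the generator below still sweeps the rounding
 * control bits and may see the precision flag being raised.
 */
#if 0 /* documentation-only sketch, not built */
/* Example: INT32_MAX (2147483647) has no exact binary32 representation; under
   round-to-nearest it converts to 2147483648.0f, i.e. an inexact result. */
static const float s_rCvtDq2PsExample = (float)INT32_MAX;
#endif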
7763
7764#ifdef TSTIEMAIMPL_WITH_GENERATOR
7765static RTEXITCODE SseConvertXmmI32R32Generate(const char *pszDataFileFmt, uint32_t cTests)
7766{
7767 cTests = RT_MAX(192, cTests); /* there are 144 standard input variations */
7768
7769 static int32_t const s_aSpecials[] =
7770 {
7771 INT32_MIN,
7772 INT32_MIN / 2,
7773 0,
7774 INT32_MAX / 2,
7775 INT32_MAX,
7776 (int32_t)0x80000000
7777 /** @todo More specials. */
7778 };
7779
7780 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseConvertXmmI32R32); iFn++)
7781 {
7782 PFNIEMAIMPLFPSSEF2U128 const pfn = g_aSseConvertXmmI32R32[iFn].pfnNative ? g_aSseConvertXmmI32R32[iFn].pfnNative : g_aSseConvertXmmI32R32[iFn].pfn;
7783
7784 IEMBINARYOUTPUT BinOut;
7785 AssertReturn(GenerateBinaryOpen(&BinOut, pszDataFileFmt, g_aSseConvertXmmI32R32[iFn].pszName), RTEXITCODE_FAILURE);
7786
7787 X86FXSTATE State;
7788 RT_ZERO(State);
7789 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aSpecials); iTest += 1)
7790 {
7791 SSE_CONVERT_XMM_TEST_T TestData; RT_ZERO(TestData);
7792
7793 TestData.InVal.ai32[0] = iTest < cTests ? RandI32Src2(iTest) : s_aSpecials[iTest - cTests];
7794 TestData.InVal.ai32[1] = iTest < cTests ? RandI32Src2(iTest) : s_aSpecials[iTest - cTests];
7795 TestData.InVal.ai32[2] = iTest < cTests ? RandI32Src2(iTest) : s_aSpecials[iTest - cTests];
7796 TestData.InVal.ai32[3] = iTest < cTests ? RandI32Src2(iTest) : s_aSpecials[iTest - cTests];
7797
7798 uint32_t const fMxcsr = RandMxcsr() & X86_MXCSR_XCPT_FLAGS;
7799 for (uint16_t iRounding = 0; iRounding < 4; iRounding++)
7800 for (uint8_t iDaz = 0; iDaz < 2; iDaz++)
7801 for (uint8_t iFz = 0; iFz < 2; iFz++)
7802 {
7803 State.MXCSR = (fMxcsr & ~X86_MXCSR_RC_MASK)
7804 | (iRounding << X86_MXCSR_RC_SHIFT)
7805 | (iDaz ? X86_MXCSR_DAZ : 0)
7806 | (iFz ? X86_MXCSR_FZ : 0)
7807 | X86_MXCSR_XCPT_MASK;
7808 IEMSSERESULT ResM; RT_ZERO(ResM);
7809 pfn(&State, &ResM, &ResM.uResult, &TestData.InVal);
7810 TestData.fMxcsrIn = State.MXCSR;
7811 TestData.fMxcsrOut = ResM.MXCSR;
7812 TestData.OutVal = ResM.uResult;
7813 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
7814
7815 State.MXCSR = State.MXCSR & ~X86_MXCSR_XCPT_MASK;
7816 IEMSSERESULT ResU; RT_ZERO(ResU);
7817 pfn(&State, &ResU, &ResU.uResult, &TestData.InVal);
7818 TestData.fMxcsrIn = State.MXCSR;
7819 TestData.fMxcsrOut = ResU.MXCSR;
7820 TestData.OutVal = ResU.uResult;
7821 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
7822
7823 uint16_t fXcpt = (ResM.MXCSR | ResU.MXCSR) & X86_MXCSR_XCPT_FLAGS;
7824 if (fXcpt)
7825 {
7826 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | fXcpt;
7827 IEMSSERESULT Res1; RT_ZERO(Res1);
7828 pfn(&State, &Res1, &Res1.uResult, &TestData.InVal);
7829 TestData.fMxcsrIn = State.MXCSR;
7830 TestData.fMxcsrOut = Res1.MXCSR;
7831 TestData.OutVal = Res1.uResult;
7832 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
7833
7834 if (((Res1.MXCSR & X86_MXCSR_XCPT_FLAGS) & fXcpt) != (Res1.MXCSR & X86_MXCSR_XCPT_FLAGS))
7835 {
7836 fXcpt |= Res1.MXCSR & X86_MXCSR_XCPT_FLAGS;
7837 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | (fXcpt << X86_MXCSR_XCPT_MASK_SHIFT);
7838 IEMSSERESULT Res2; RT_ZERO(Res2);
7839 pfn(&State, &Res2, &Res2.uResult, &TestData.InVal);
7840 TestData.fMxcsrIn = State.MXCSR;
7841 TestData.fMxcsrOut = Res2.MXCSR;
7842 TestData.OutVal = Res2.uResult;
7843 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
7844 }
7845 if (!RT_IS_POWER_OF_TWO(fXcpt))
7846 for (uint16_t fUnmasked = 1; fUnmasked <= X86_MXCSR_PE; fUnmasked <<= 1)
7847 if (fUnmasked & fXcpt)
7848 {
7849 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | ((fXcpt & ~fUnmasked) << X86_MXCSR_XCPT_MASK_SHIFT);
7850 IEMSSERESULT Res3; RT_ZERO(Res3);
7851 pfn(&State, &Res3, &Res3.uResult, &TestData.InVal);
7852 TestData.fMxcsrIn = State.MXCSR;
7853 TestData.fMxcsrOut = Res3.MXCSR;
7854 TestData.OutVal = Res3.uResult;
7855 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
7856 }
7857 }
7858 }
7859 }
7860 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE);
7861 }
7862
7863 return RTEXITCODE_SUCCESS;
7864}
7865#endif
7866
7867static void SseConvertXmmI32R32Test(void)
7868{
7869 X86FXSTATE State;
7870 RT_ZERO(State);
7871
7872 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseConvertXmmI32R32); iFn++)
7873 {
7874 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aSseConvertXmmI32R32[iFn]))
7875 continue;
7876
7877 uint32_t const cTests = *g_aSseConvertXmmI32R32[iFn].pcTests;
7878 SSE_CONVERT_XMM_TEST_T const * const paTests = g_aSseConvertXmmI32R32[iFn].paTests;
7879 PFNIEMAIMPLFPSSEF2U128 pfn = g_aSseConvertXmmI32R32[iFn].pfn;
7880 uint32_t const cVars = COUNT_VARIATIONS(g_aSseConvertXmmI32R32[iFn]);
7881 if (!cTests) RTTestSkipped(g_hTest, "no tests");
7882 for (uint32_t iVar = 0; iVar < cVars; iVar++)
7883 {
7884 for (uint32_t iTest = 0; iTest < cTests / sizeof(*paTests); iTest++)
7885 {
7886 IEMSSERESULT Res; RT_ZERO(Res);
7887
7888 State.MXCSR = paTests[iTest].fMxcsrIn;
7889 pfn(&State, &Res, &Res.uResult, &paTests[iTest].InVal);
7890 if ( Res.MXCSR != paTests[iTest].fMxcsrOut
7891 || !RTFLOAT32U_ARE_IDENTICAL(&Res.uResult.ar32[0], &paTests[iTest].OutVal.ar32[0])
7892 || !RTFLOAT32U_ARE_IDENTICAL(&Res.uResult.ar32[1], &paTests[iTest].OutVal.ar32[1])
7893 || !RTFLOAT32U_ARE_IDENTICAL(&Res.uResult.ar32[2], &paTests[iTest].OutVal.ar32[2])
7894 || !RTFLOAT32U_ARE_IDENTICAL(&Res.uResult.ar32[3], &paTests[iTest].OutVal.ar32[3]))
7895 RTTestFailed(g_hTest, "#%04u%s: mxcsr=%#08x in1=%RI32'%RI32'%RI32'%RI32 \n"
7896 "%s -> mxcsr=%#08x %s'%s'%s'%s\n"
7897 "%s expected %#08x %s'%s'%s'%s%s%s (%s)\n",
7898 iTest, iVar ? "/n" : "", paTests[iTest].fMxcsrIn,
7899 paTests[iTest].InVal.ai32[0], paTests[iTest].InVal.ai32[1],
7900 paTests[iTest].InVal.ai32[2], paTests[iTest].InVal.ai32[3],
7901 iVar ? " " : "", Res.MXCSR,
7902 FormatR32(&Res.uResult.ar32[0]), FormatR32(&Res.uResult.ar32[1]),
7903 FormatR32(&Res.uResult.ar32[2]), FormatR32(&Res.uResult.ar32[3]),
7904 iVar ? " " : "", paTests[iTest].fMxcsrOut,
7905 FormatR32(&paTests[iTest].OutVal.ar32[0]), FormatR32(&paTests[iTest].OutVal.ar32[1]),
7906 FormatR32(&paTests[iTest].OutVal.ar32[2]), FormatR32(&paTests[iTest].OutVal.ar32[3]),
7907 MxcsrDiff(Res.MXCSR, paTests[iTest].fMxcsrOut),
7908 ( !RTFLOAT32U_ARE_IDENTICAL(&Res.uResult.ar32[0], &paTests[iTest].OutVal.ar32[0])
7909 || !RTFLOAT32U_ARE_IDENTICAL(&Res.uResult.ar32[1], &paTests[iTest].OutVal.ar32[1])
7910 || !RTFLOAT32U_ARE_IDENTICAL(&Res.uResult.ar32[2], &paTests[iTest].OutVal.ar32[2])
7911 || !RTFLOAT32U_ARE_IDENTICAL(&Res.uResult.ar32[3], &paTests[iTest].OutVal.ar32[3]))
7912 ? " - val" : "",
7913 FormatMxcsr(paTests[iTest].fMxcsrIn));
7914 }
7915 }
7916 }
7917}
7918
7919
7920/*
7921 * Convert SSE operations - single-precision floating point values to signed double-words.
7922 */
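/*
 * Note: the 'tt' variant (cvttps2dq) truncates, i.e. always rounds towards
 * zero regardless of MXCSR.RC, while cvtps2dq honours the rounding control;
 * NaNs and out-of-range inputs produce the integer indefinite value
 * 0x80000000 and raise the invalid-operation exception.
 */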
7923static SSE_CONVERT_XMM_T g_aSseConvertXmmR32I32[] =
7924{
7925 ENTRY_BIN(cvtps2dq_u128),
7926 ENTRY_BIN(cvttps2dq_u128)
7927};
7928
7929#ifdef TSTIEMAIMPL_WITH_GENERATOR
7930static RTEXITCODE SseConvertXmmR32I32Generate(const char *pszDataFileFmt, uint32_t cTests)
7931{
7932 cTests = RT_MAX(192, cTests); /* there are 144 standard input variations */
7933
7934 static struct { RTFLOAT32U aVal1[4]; } const s_aSpecials[] =
7935 {
7936 { { RTFLOAT32U_INIT_ZERO(0), RTFLOAT32U_INIT_ZERO(0), RTFLOAT32U_INIT_ZERO(0), RTFLOAT32U_INIT_ZERO(0) } },
7937 { { RTFLOAT32U_INIT_ZERO(1), RTFLOAT32U_INIT_ZERO(1), RTFLOAT32U_INIT_ZERO(1), RTFLOAT32U_INIT_ZERO(1) } },
7938 { { RTFLOAT32U_INIT_INF(0), RTFLOAT32U_INIT_INF(0), RTFLOAT32U_INIT_INF(0), RTFLOAT32U_INIT_INF(0) } },
7939 { { RTFLOAT32U_INIT_INF(1), RTFLOAT32U_INIT_INF(1), RTFLOAT32U_INIT_INF(1), RTFLOAT32U_INIT_INF(1) } }
7940 /** @todo More specials. */
7941 };
7942
7943 X86FXSTATE State;
7944 RT_ZERO(State);
7945 uint32_t cMinNormalPairs = (cTests - 144) / 4;
7946 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseConvertXmmR32I32); iFn++)
7947 {
7948 PFNIEMAIMPLFPSSEF2U128 const pfn = g_aSseConvertXmmR32I32[iFn].pfnNative ? g_aSseConvertXmmR32I32[iFn].pfnNative : g_aSseConvertXmmR32I32[iFn].pfn;
7949
7950 IEMBINARYOUTPUT BinOut;
7951 AssertReturn(GenerateBinaryOpen(&BinOut, pszDataFileFmt, g_aSseConvertXmmR32I32[iFn].pszName), RTEXITCODE_FAILURE);
7952
7953 uint32_t cNormalInputPairs = 0;
7954 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aSpecials); iTest += 1)
7955 {
7956 SSE_CONVERT_XMM_TEST_T TestData; RT_ZERO(TestData);
7957
7958 TestData.InVal.ar32[0] = iTest < cTests ? RandR32Src(iTest) : s_aSpecials[iTest - cTests].aVal1[0];
7959 TestData.InVal.ar32[1] = iTest < cTests ? RandR32Src(iTest) : s_aSpecials[iTest - cTests].aVal1[1];
7960 TestData.InVal.ar32[2] = iTest < cTests ? RandR32Src(iTest) : s_aSpecials[iTest - cTests].aVal1[2];
7961 TestData.InVal.ar32[3] = iTest < cTests ? RandR32Src(iTest) : s_aSpecials[iTest - cTests].aVal1[3];
7962
7963 if ( RTFLOAT32U_IS_NORMAL(&TestData.InVal.ar32[0])
7964 && RTFLOAT32U_IS_NORMAL(&TestData.InVal.ar32[1])
7965 && RTFLOAT32U_IS_NORMAL(&TestData.InVal.ar32[2])
7966 && RTFLOAT32U_IS_NORMAL(&TestData.InVal.ar32[3]))
7967 cNormalInputPairs++;
7968 else if (cNormalInputPairs < cMinNormalPairs && iTest + cMinNormalPairs >= cTests && iTest < cTests)
7969 {
7970 iTest -= 1;
7971 continue;
7972 }
7973
7974 uint32_t const fMxcsr = RandMxcsr() & X86_MXCSR_XCPT_FLAGS;
7975 for (uint16_t iRounding = 0; iRounding < 4; iRounding++)
7976 for (uint8_t iDaz = 0; iDaz < 2; iDaz++)
7977 for (uint8_t iFz = 0; iFz < 2; iFz++)
7978 {
7979 State.MXCSR = (fMxcsr & ~X86_MXCSR_RC_MASK)
7980 | (iRounding << X86_MXCSR_RC_SHIFT)
7981 | (iDaz ? X86_MXCSR_DAZ : 0)
7982 | (iFz ? X86_MXCSR_FZ : 0)
7983 | X86_MXCSR_XCPT_MASK;
7984 IEMSSERESULT ResM; RT_ZERO(ResM);
7985 pfn(&State, &ResM, &ResM.uResult, &TestData.InVal);
7986 TestData.fMxcsrIn = State.MXCSR;
7987 TestData.fMxcsrOut = ResM.MXCSR;
7988 TestData.OutVal = ResM.uResult;
7989 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
7990
7991 State.MXCSR = State.MXCSR & ~X86_MXCSR_XCPT_MASK;
7992 IEMSSERESULT ResU; RT_ZERO(ResU);
7993 pfn(&State, &ResU, &ResU.uResult, &TestData.InVal);
7994 TestData.fMxcsrIn = State.MXCSR;
7995 TestData.fMxcsrOut = ResU.MXCSR;
7996 TestData.OutVal = ResU.uResult;
7997 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
7998
7999 uint16_t fXcpt = (ResM.MXCSR | ResU.MXCSR) & X86_MXCSR_XCPT_FLAGS;
8000 if (fXcpt)
8001 {
8002 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | fXcpt;
8003 IEMSSERESULT Res1; RT_ZERO(Res1);
8004 pfn(&State, &Res1, &Res1.uResult, &TestData.InVal);
8005 TestData.fMxcsrIn = State.MXCSR;
8006 TestData.fMxcsrOut = Res1.MXCSR;
8007 TestData.OutVal = Res1.uResult;
8008 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
8009
8010 if (((Res1.MXCSR & X86_MXCSR_XCPT_FLAGS) & fXcpt) != (Res1.MXCSR & X86_MXCSR_XCPT_FLAGS))
8011 {
8012 fXcpt |= Res1.MXCSR & X86_MXCSR_XCPT_FLAGS;
8013 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | (fXcpt << X86_MXCSR_XCPT_MASK_SHIFT);
8014 IEMSSERESULT Res2; RT_ZERO(Res2);
8015 pfn(&State, &Res2, &Res2.uResult, &TestData.InVal);
8016 TestData.fMxcsrIn = State.MXCSR;
8017 TestData.fMxcsrOut = Res2.MXCSR;
8018 TestData.OutVal = Res2.uResult;
8019 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
8020 }
8021 if (!RT_IS_POWER_OF_TWO(fXcpt))
8022 for (uint16_t fUnmasked = 1; fUnmasked <= X86_MXCSR_PE; fUnmasked <<= 1)
8023 if (fUnmasked & fXcpt)
8024 {
8025 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | ((fXcpt & ~fUnmasked) << X86_MXCSR_XCPT_MASK_SHIFT);
8026 IEMSSERESULT Res3; RT_ZERO(Res3);
8027 pfn(&State, &Res3, &Res3.uResult, &TestData.InVal);
8028 TestData.fMxcsrIn = State.MXCSR;
8029 TestData.fMxcsrOut = Res3.MXCSR;
8030 TestData.OutVal = Res3.uResult;
8031 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
8032 }
8033 }
8034 }
8035 }
8036 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE);
8037 }
8038
8039 return RTEXITCODE_SUCCESS;
8040}
8041#endif
8042
8043static void SseConvertXmmR32I32Test(void)
8044{
8045 X86FXSTATE State;
8046 RT_ZERO(State);
8047
8048 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseConvertXmmR32I32); iFn++)
8049 {
8050 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aSseConvertXmmR32I32[iFn]))
8051 continue;
8052
8053 uint32_t const cTests = *g_aSseConvertXmmR32I32[iFn].pcTests;
8054 SSE_CONVERT_XMM_TEST_T const * const paTests = g_aSseConvertXmmR32I32[iFn].paTests;
8055 PFNIEMAIMPLFPSSEF2U128 pfn = g_aSseConvertXmmR32I32[iFn].pfn;
8056 uint32_t const cVars = COUNT_VARIATIONS(g_aSseConvertXmmR32I32[iFn]);
8057 if (!cTests) RTTestSkipped(g_hTest, "no tests");
8058 for (uint32_t iVar = 0; iVar < cVars; iVar++)
8059 {
8060 for (uint32_t iTest = 0; iTest < cTests / sizeof(*paTests); iTest++)
8061 {
8062 IEMSSERESULT Res; RT_ZERO(Res);
8063
8064 State.MXCSR = paTests[iTest].fMxcsrIn;
8065 pfn(&State, &Res, &Res.uResult, &paTests[iTest].InVal);
8066 if ( Res.MXCSR != paTests[iTest].fMxcsrOut
8067 || Res.uResult.ai32[0] != paTests[iTest].OutVal.ai32[0]
8068 || Res.uResult.ai32[1] != paTests[iTest].OutVal.ai32[1]
8069 || Res.uResult.ai32[2] != paTests[iTest].OutVal.ai32[2]
8070 || Res.uResult.ai32[3] != paTests[iTest].OutVal.ai32[3])
8071 RTTestFailed(g_hTest, "#%04u%s: mxcsr=%#08x in1=%s'%s'%s'%s \n"
8072 "%s -> mxcsr=%#08x %RI32'%RI32'%RI32'%RI32\n"
8073 "%s expected %#08x %RI32'%RI32'%RI32'%RI32%s%s (%s)\n",
8074 iTest, iVar ? "/n" : "", paTests[iTest].fMxcsrIn,
8075 FormatR32(&paTests[iTest].InVal.ar32[0]), FormatR32(&paTests[iTest].InVal.ar32[1]),
8076 FormatR32(&paTests[iTest].InVal.ar32[2]), FormatR32(&paTests[iTest].InVal.ar32[3]),
8077 iVar ? " " : "", Res.MXCSR,
8078 Res.uResult.ai32[0], Res.uResult.ai32[1],
8079 Res.uResult.ai32[2], Res.uResult.ai32[3],
8080 iVar ? " " : "", paTests[iTest].fMxcsrOut,
8081 paTests[iTest].OutVal.ai32[0], paTests[iTest].OutVal.ai32[1],
8082 paTests[iTest].OutVal.ai32[2], paTests[iTest].OutVal.ai32[3],
8083 MxcsrDiff(Res.MXCSR, paTests[iTest].fMxcsrOut),
8084 ( Res.uResult.ai32[0] != paTests[iTest].OutVal.ai32[0]
8085 || Res.uResult.ai32[1] != paTests[iTest].OutVal.ai32[1]
8086 || Res.uResult.ai32[2] != paTests[iTest].OutVal.ai32[2]
8087 || Res.uResult.ai32[3] != paTests[iTest].OutVal.ai32[3])
8088 ? " - val" : "",
8089 FormatMxcsr(paTests[iTest].fMxcsrIn));
8090 }
8091 }
8092 }
8093}
8094
8095
8096/*
8097 * Convert SSE operations - signed double-words to double-precision floating point values.
8098 */
8099static SSE_CONVERT_XMM_T g_aSseConvertXmmI32R64[] =
8100{
8101 ENTRY_BIN(cvtdq2pd_u128)
8102};
8103
8104#ifdef TSTIEMAIMPL_WITH_GENERATOR
8105static RTEXITCODE SseConvertXmmI32R64Generate(const char *pszDataFileFmt, uint32_t cTests)
8106{
8107 cTests = RT_MAX(192, cTests); /* there are 144 standard input variations */
8108
8109 static int32_t const s_aSpecials[] =
8110 {
8111 INT32_MIN,
8112 INT32_MIN / 2,
8113 0,
8114 INT32_MAX / 2,
8115 INT32_MAX,
8116 (int32_t)0x80000000
8117 /** @todo More specials. */
8118 };
8119
8120 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseConvertXmmI32R64); iFn++)
8121 {
8122 PFNIEMAIMPLFPSSEF2U128 const pfn = g_aSseConvertXmmI32R64[iFn].pfnNative ? g_aSseConvertXmmI32R64[iFn].pfnNative : g_aSseConvertXmmI32R64[iFn].pfn;
8123
8124 IEMBINARYOUTPUT BinOut;
8125 AssertReturn(GenerateBinaryOpen(&BinOut, pszDataFileFmt, g_aSseConvertXmmI32R64[iFn].pszName), RTEXITCODE_FAILURE);
8126
8127 X86FXSTATE State;
8128 RT_ZERO(State);
8129 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aSpecials); iTest += 1)
8130 {
8131 SSE_CONVERT_XMM_TEST_T TestData; RT_ZERO(TestData);
8132
8133 TestData.InVal.ai32[0] = iTest < cTests ? RandI32Src2(iTest) : s_aSpecials[iTest - cTests];
8134 TestData.InVal.ai32[1] = iTest < cTests ? RandI32Src2(iTest) : s_aSpecials[iTest - cTests];
8135 TestData.InVal.ai32[2] = iTest < cTests ? RandI32Src2(iTest) : s_aSpecials[iTest - cTests];
8136 TestData.InVal.ai32[3] = iTest < cTests ? RandI32Src2(iTest) : s_aSpecials[iTest - cTests];
8137
8138 uint32_t const fMxcsr = RandMxcsr() & X86_MXCSR_XCPT_FLAGS;
8139 for (uint16_t iRounding = 0; iRounding < 4; iRounding++)
8140 for (uint8_t iDaz = 0; iDaz < 2; iDaz++)
8141 for (uint8_t iFz = 0; iFz < 2; iFz++)
8142 {
8143 State.MXCSR = (fMxcsr & ~X86_MXCSR_RC_MASK)
8144 | (iRounding << X86_MXCSR_RC_SHIFT)
8145 | (iDaz ? X86_MXCSR_DAZ : 0)
8146 | (iFz ? X86_MXCSR_FZ : 0)
8147 | X86_MXCSR_XCPT_MASK;
8148 IEMSSERESULT ResM; RT_ZERO(ResM);
8149 pfn(&State, &ResM, &ResM.uResult, &TestData.InVal);
8150 TestData.fMxcsrIn = State.MXCSR;
8151 TestData.fMxcsrOut = ResM.MXCSR;
8152 TestData.OutVal = ResM.uResult;
8153 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
8154
8155 State.MXCSR = State.MXCSR & ~X86_MXCSR_XCPT_MASK;
8156 IEMSSERESULT ResU; RT_ZERO(ResU);
8157 pfn(&State, &ResU, &ResU.uResult, &TestData.InVal);
8158 TestData.fMxcsrIn = State.MXCSR;
8159 TestData.fMxcsrOut = ResU.MXCSR;
8160 TestData.OutVal = ResU.uResult;
8161 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
8162
8163 uint16_t fXcpt = (ResM.MXCSR | ResU.MXCSR) & X86_MXCSR_XCPT_FLAGS;
8164 if (fXcpt)
8165 {
8166 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | fXcpt;
8167 IEMSSERESULT Res1; RT_ZERO(Res1);
8168 pfn(&State, &Res1, &Res1.uResult, &TestData.InVal);
8169 TestData.fMxcsrIn = State.MXCSR;
8170 TestData.fMxcsrOut = Res1.MXCSR;
8171 TestData.OutVal = Res1.uResult;
8172 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
8173
8174 if (((Res1.MXCSR & X86_MXCSR_XCPT_FLAGS) & fXcpt) != (Res1.MXCSR & X86_MXCSR_XCPT_FLAGS))
8175 {
8176 fXcpt |= Res1.MXCSR & X86_MXCSR_XCPT_FLAGS;
8177 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | (fXcpt << X86_MXCSR_XCPT_MASK_SHIFT);
8178 IEMSSERESULT Res2; RT_ZERO(Res2);
8179 pfn(&State, &Res2, &Res2.uResult, &TestData.InVal);
8180 TestData.fMxcsrIn = State.MXCSR;
8181 TestData.fMxcsrOut = Res2.MXCSR;
8182 TestData.OutVal = Res2.uResult;
8183 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
8184 }
8185 if (!RT_IS_POWER_OF_TWO(fXcpt))
8186 for (uint16_t fUnmasked = 1; fUnmasked <= X86_MXCSR_PE; fUnmasked <<= 1)
8187 if (fUnmasked & fXcpt)
8188 {
8189 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | ((fXcpt & ~fUnmasked) << X86_MXCSR_XCPT_MASK_SHIFT);
8190 IEMSSERESULT Res3; RT_ZERO(Res3);
8191 pfn(&State, &Res3, &Res3.uResult, &TestData.InVal);
8192 TestData.fMxcsrIn = State.MXCSR;
8193 TestData.fMxcsrOut = Res3.MXCSR;
8194 TestData.OutVal = Res3.uResult;
8195 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
8196 }
8197 }
8198 }
8199 }
8200 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE);
8201 }
8202
8203 return RTEXITCODE_SUCCESS;
8204}
8205#endif
8206
8207static void SseConvertXmmI32R64Test(void)
8208{
8209 X86FXSTATE State;
8210 RT_ZERO(State);
8211
8212 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseConvertXmmI32R64); iFn++)
8213 {
8214 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aSseConvertXmmI32R64[iFn]))
8215 continue;
8216
8217 uint32_t const cTests = *g_aSseConvertXmmI32R64[iFn].pcTests;
8218 SSE_CONVERT_XMM_TEST_T const * const paTests = g_aSseConvertXmmI32R64[iFn].paTests;
8219 PFNIEMAIMPLFPSSEF2U128 pfn = g_aSseConvertXmmI32R64[iFn].pfn;
8220 uint32_t const cVars = COUNT_VARIATIONS(g_aSseConvertXmmI32R64[iFn]);
8221 if (!cTests) RTTestSkipped(g_hTest, "no tests");
8222 for (uint32_t iVar = 0; iVar < cVars; iVar++)
8223 {
8224 for (uint32_t iTest = 0; iTest < cTests / sizeof(*paTests); iTest++)
8225 {
8226 IEMSSERESULT Res; RT_ZERO(Res);
8227
8228 State.MXCSR = paTests[iTest].fMxcsrIn;
8229 pfn(&State, &Res, &Res.uResult, &paTests[iTest].InVal);
8230 if ( Res.MXCSR != paTests[iTest].fMxcsrOut
8231 || !RTFLOAT64U_ARE_IDENTICAL(&Res.uResult.ar64[0], &paTests[iTest].OutVal.ar64[0])
8232 || !RTFLOAT64U_ARE_IDENTICAL(&Res.uResult.ar64[1], &paTests[iTest].OutVal.ar64[1]))
8233 RTTestFailed(g_hTest, "#%04u%s: mxcsr=%#08x in1=%RI32'%RI32'%RI32'%RI32 \n"
8234 "%s -> mxcsr=%#08x %s'%s\n"
8235 "%s expected %#08x %s'%s%s%s (%s)\n",
8236 iTest, iVar ? "/n" : "", paTests[iTest].fMxcsrIn,
8237 paTests[iTest].InVal.ai32[0], paTests[iTest].InVal.ai32[1],
8238 paTests[iTest].InVal.ai32[2], paTests[iTest].InVal.ai32[3],
8239 iVar ? " " : "", Res.MXCSR,
8240 FormatR64(&Res.uResult.ar64[0]), FormatR64(&Res.uResult.ar64[1]),
8241 iVar ? " " : "", paTests[iTest].fMxcsrOut,
8242 FormatR64(&paTests[iTest].OutVal.ar64[0]), FormatR64(&paTests[iTest].OutVal.ar64[1]),
8243 MxcsrDiff(Res.MXCSR, paTests[iTest].fMxcsrOut),
8244 ( !RTFLOAT64U_ARE_IDENTICAL(&Res.uResult.ar64[0], &paTests[iTest].OutVal.ar64[0])
8245 || !RTFLOAT64U_ARE_IDENTICAL(&Res.uResult.ar64[1], &paTests[iTest].OutVal.ar64[1]))
8246 ? " - val" : "",
8247 FormatMxcsr(paTests[iTest].fMxcsrIn));
8248 }
8249 }
8250 }
8251}
8252
8253
8254/*
8255 * Convert SSE operations - double-precision floating point values to signed double-words.
8256 */
8257static SSE_CONVERT_XMM_T g_aSseConvertXmmR64I32[] =
8258{
8259 ENTRY_BIN(cvtpd2dq_u128),
8260 ENTRY_BIN(cvttpd2dq_u128)
8261};
8262
8263#ifdef TSTIEMAIMPL_WITH_GENERATOR
8264static RTEXITCODE SseConvertXmmR64I32Generate(const char *pszDataFileFmt, uint32_t cTests)
8265{
8266 cTests = RT_MAX(192, cTests); /* there are 144 standard input variations */
8267
8268 static struct { RTFLOAT64U aVal1[2]; } const s_aSpecials[] =
8269 {
8270 { { RTFLOAT64U_INIT_ZERO(0), RTFLOAT64U_INIT_ZERO(0) } },
8271 { { RTFLOAT64U_INIT_ZERO(1), RTFLOAT64U_INIT_ZERO(1) } },
8272 { { RTFLOAT64U_INIT_INF(0), RTFLOAT64U_INIT_INF(0) } },
8273 { { RTFLOAT64U_INIT_INF(1), RTFLOAT64U_INIT_INF(1) } }
8274 /** @todo More specials. */
8275 };
8276
8277 X86FXSTATE State;
8278 RT_ZERO(State);
8279 uint32_t cMinNormalPairs = (cTests - 144) / 4;
8280 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseConvertXmmR64I32); iFn++)
8281 {
8282 PFNIEMAIMPLFPSSEF2U128 const pfn = g_aSseConvertXmmR64I32[iFn].pfnNative ? g_aSseConvertXmmR64I32[iFn].pfnNative : g_aSseConvertXmmR64I32[iFn].pfn;
8283
8284 IEMBINARYOUTPUT BinOut;
8285 AssertReturn(GenerateBinaryOpen(&BinOut, pszDataFileFmt, g_aSseConvertXmmR64I32[iFn].pszName), RTEXITCODE_FAILURE);
8286
8287 uint32_t cNormalInputPairs = 0;
8288 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aSpecials); iTest += 1)
8289 {
8290 SSE_CONVERT_XMM_TEST_T TestData; RT_ZERO(TestData);
8291
8292 TestData.InVal.ar64[0] = iTest < cTests ? RandR64Src(iTest) : s_aSpecials[iTest - cTests].aVal1[0];
8293 TestData.InVal.ar64[1] = iTest < cTests ? RandR64Src(iTest) : s_aSpecials[iTest - cTests].aVal1[1];
8294
8295 if ( RTFLOAT64U_IS_NORMAL(&TestData.InVal.ar64[0])
8296 && RTFLOAT64U_IS_NORMAL(&TestData.InVal.ar64[1]))
8297 cNormalInputPairs++;
8298 else if (cNormalInputPairs < cMinNormalPairs && iTest + cMinNormalPairs >= cTests && iTest < cTests)
8299 {
8300 iTest -= 1;
8301 continue;
8302 }
8303
8304 uint32_t const fMxcsr = RandMxcsr() & X86_MXCSR_XCPT_FLAGS;
8305 for (uint16_t iRounding = 0; iRounding < 4; iRounding++)
8306 for (uint8_t iDaz = 0; iDaz < 2; iDaz++)
8307 for (uint8_t iFz = 0; iFz < 2; iFz++)
8308 {
8309 State.MXCSR = (fMxcsr & ~X86_MXCSR_RC_MASK)
8310 | (iRounding << X86_MXCSR_RC_SHIFT)
8311 | (iDaz ? X86_MXCSR_DAZ : 0)
8312 | (iFz ? X86_MXCSR_FZ : 0)
8313 | X86_MXCSR_XCPT_MASK;
8314 IEMSSERESULT ResM; RT_ZERO(ResM);
8315 pfn(&State, &ResM, &ResM.uResult, &TestData.InVal);
8316 TestData.fMxcsrIn = State.MXCSR;
8317 TestData.fMxcsrOut = ResM.MXCSR;
8318 TestData.OutVal = ResM.uResult;
8319 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
8320
8321 State.MXCSR = State.MXCSR & ~X86_MXCSR_XCPT_MASK;
8322 IEMSSERESULT ResU; RT_ZERO(ResU);
8323 pfn(&State, &ResU, &ResU.uResult, &TestData.InVal);
8324 TestData.fMxcsrIn = State.MXCSR;
8325 TestData.fMxcsrOut = ResU.MXCSR;
8326 TestData.OutVal = ResU.uResult;
8327 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
8328
8329 uint16_t fXcpt = (ResM.MXCSR | ResU.MXCSR) & X86_MXCSR_XCPT_FLAGS;
8330 if (fXcpt)
8331 {
8332 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | fXcpt;
8333 IEMSSERESULT Res1; RT_ZERO(Res1);
8334 pfn(&State, &Res1, &Res1.uResult, &TestData.InVal);
8335 TestData.fMxcsrIn = State.MXCSR;
8336 TestData.fMxcsrOut = Res1.MXCSR;
8337 TestData.OutVal = Res1.uResult;
8338 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
8339
8340 if (((Res1.MXCSR & X86_MXCSR_XCPT_FLAGS) & fXcpt) != (Res1.MXCSR & X86_MXCSR_XCPT_FLAGS))
8341 {
8342 fXcpt |= Res1.MXCSR & X86_MXCSR_XCPT_FLAGS;
8343 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | (fXcpt << X86_MXCSR_XCPT_MASK_SHIFT);
8344 IEMSSERESULT Res2; RT_ZERO(Res2);
8345 pfn(&State, &Res2, &Res2.uResult, &TestData.InVal);
8346 TestData.fMxcsrIn = State.MXCSR;
8347 TestData.fMxcsrOut = Res2.MXCSR;
8348 TestData.OutVal = Res2.uResult;
8349 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
8350 }
8351 if (!RT_IS_POWER_OF_TWO(fXcpt))
8352 for (uint16_t fUnmasked = 1; fUnmasked <= X86_MXCSR_PE; fUnmasked <<= 1)
8353 if (fUnmasked & fXcpt)
8354 {
8355 State.MXCSR = (State.MXCSR & ~X86_MXCSR_XCPT_MASK) | ((fXcpt & ~fUnmasked) << X86_MXCSR_XCPT_MASK_SHIFT);
8356 IEMSSERESULT Res3; RT_ZERO(Res3);
8357 pfn(&State, &Res3, &Res3.uResult, &TestData.InVal);
8358 TestData.fMxcsrIn = State.MXCSR;
8359 TestData.fMxcsrOut = Res3.MXCSR;
8360 TestData.OutVal = Res3.uResult;
8361 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
8362 }
8363 }
8364 }
8365 }
8366 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE);
8367 }
8368
8369 return RTEXITCODE_SUCCESS;
8370}
8371#endif
8372
8373static void SseConvertXmmR64I32Test(void)
8374{
8375 X86FXSTATE State;
8376 RT_ZERO(State);
8377
8378 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseConvertXmmR64I32); iFn++)
8379 {
8380 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aSseConvertXmmR64I32[iFn]))
8381 continue;
8382
8383 uint32_t const cTests = *g_aSseConvertXmmR64I32[iFn].pcTests;
8384 SSE_CONVERT_XMM_TEST_T const * const paTests = g_aSseConvertXmmR64I32[iFn].paTests;
8385 PFNIEMAIMPLFPSSEF2U128 pfn = g_aSseConvertXmmR64I32[iFn].pfn;
8386 uint32_t const cVars = COUNT_VARIATIONS(g_aSseConvertXmmR64I32[iFn]);
8387 if (!cTests) RTTestSkipped(g_hTest, "no tests");
8388 for (uint32_t iVar = 0; iVar < cVars; iVar++)
8389 {
8390 for (uint32_t iTest = 0; iTest < cTests / sizeof(*paTests); iTest++)
8391 {
8392 IEMSSERESULT Res; RT_ZERO(Res);
8393
8394 State.MXCSR = paTests[iTest].fMxcsrIn;
8395 pfn(&State, &Res, &Res.uResult, &paTests[iTest].InVal);
8396 if ( Res.MXCSR != paTests[iTest].fMxcsrOut
8397 || Res.uResult.ai32[0] != paTests[iTest].OutVal.ai32[0]
8398 || Res.uResult.ai32[1] != paTests[iTest].OutVal.ai32[1]
8399 || Res.uResult.ai32[2] != paTests[iTest].OutVal.ai32[2]
8400 || Res.uResult.ai32[3] != paTests[iTest].OutVal.ai32[3])
8401 RTTestFailed(g_hTest, "#%04u%s: mxcsr=%#08x in1=%s'%s \n"
8402 "%s -> mxcsr=%#08x %RI32'%RI32'%RI32'%RI32\n"
8403 "%s expected %#08x %RI32'%RI32'%RI32'%RI32%s%s (%s)\n",
8404 iTest, iVar ? "/n" : "", paTests[iTest].fMxcsrIn,
8405 FormatR64(&paTests[iTest].InVal.ar64[0]), FormatR64(&paTests[iTest].InVal.ar64[1]),
8406 iVar ? " " : "", Res.MXCSR,
8407 Res.uResult.ai32[0], Res.uResult.ai32[1],
8408 Res.uResult.ai32[2], Res.uResult.ai32[3],
8409 iVar ? " " : "", paTests[iTest].fMxcsrOut,
8410 paTests[iTest].OutVal.ai32[0], paTests[iTest].OutVal.ai32[1],
8411 paTests[iTest].OutVal.ai32[2], paTests[iTest].OutVal.ai32[3],
8412 MxcsrDiff(Res.MXCSR, paTests[iTest].fMxcsrOut),
8413 ( Res.uResult.ai32[0] != paTests[iTest].OutVal.ai32[0]
8414 || Res.uResult.ai32[1] != paTests[iTest].OutVal.ai32[1]
8415 || Res.uResult.ai32[2] != paTests[iTest].OutVal.ai32[2]
8416 || Res.uResult.ai32[3] != paTests[iTest].OutVal.ai32[3])
8417 ? " - val" : "",
8418 FormatMxcsr(paTests[iTest].fMxcsrIn));
8419 }
8420 }
8421 }
8422}
8423
8424
8425/*
8426 * Convert SSE operations - double-precision floating point values to signed double-words (64-bit MMX destination).
8427 */
8428TYPEDEF_SUBTEST_TYPE(SSE_CONVERT_MM_XMM_T, SSE_CONVERT_MM_XMM_TEST_T, PFNIEMAIMPLMXCSRU64U128);
8429
8430static SSE_CONVERT_MM_XMM_T g_aSseConvertMmXmm[] =
8431{
8432 ENTRY_BIN(cvtpd2pi_u128),
8433 ENTRY_BIN(cvttpd2pi_u128)
8434};
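/*
 * Note: these are the MMX-destination forms, so the helper returns the two
 * packed int32 results as a single 64-bit value via the uint64_t output
 * parameter rather than a full X86XMMREG, hence the PFNIEMAIMPLMXCSRU64U128
 * signature and the TestData.OutVal.u assignments in the generator below.
 */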
8435
8436#ifdef TSTIEMAIMPL_WITH_GENERATOR
8437static RTEXITCODE SseConvertMmXmmGenerate(const char *pszDataFileFmt, uint32_t cTests)
8438{
8439 cTests = RT_MAX(192, cTests); /* there are 144 standard input variations */
8440
8441 static struct { RTFLOAT64U aVal1[2]; } const s_aSpecials[] =
8442 {
8443 { { RTFLOAT64U_INIT_ZERO(0), RTFLOAT64U_INIT_ZERO(0) } },
8444 { { RTFLOAT64U_INIT_ZERO(1), RTFLOAT64U_INIT_ZERO(1) } },
8445 { { RTFLOAT64U_INIT_INF(0), RTFLOAT64U_INIT_INF(0) } },
8446 { { RTFLOAT64U_INIT_INF(1), RTFLOAT64U_INIT_INF(1) } }
8447 /** @todo More specials. */
8448 };
8449
8450 uint32_t cMinNormalPairs = (cTests - 144) / 4;
8451 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseConvertMmXmm); iFn++)
8452 {
8453 PFNIEMAIMPLMXCSRU64U128 const pfn = g_aSseConvertMmXmm[iFn].pfnNative ? g_aSseConvertMmXmm[iFn].pfnNative : g_aSseConvertMmXmm[iFn].pfn;
8454
8455 IEMBINARYOUTPUT BinOut;
8456 AssertReturn(GenerateBinaryOpen(&BinOut, pszDataFileFmt, g_aSseConvertMmXmm[iFn].pszName), RTEXITCODE_FAILURE);
8457
8458 uint32_t cNormalInputPairs = 0;
8459 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aSpecials); iTest += 1)
8460 {
8461 SSE_CONVERT_MM_XMM_TEST_T TestData; RT_ZERO(TestData);
8462
8463 TestData.InVal.ar64[0] = iTest < cTests ? RandR64Src(iTest) : s_aSpecials[iTest - cTests].aVal1[0];
8464 TestData.InVal.ar64[1] = iTest < cTests ? RandR64Src(iTest) : s_aSpecials[iTest - cTests].aVal1[1];
8465
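            /* Ensure a minimum share of the random inputs are normal/normal pairs: when we are
               close to running out of iterations and still short of cMinNormalPairs, retry the
               iteration until a normal pair comes up. */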
8466 if ( RTFLOAT64U_IS_NORMAL(&TestData.InVal.ar64[0])
8467 && RTFLOAT64U_IS_NORMAL(&TestData.InVal.ar64[1]))
8468 cNormalInputPairs++;
8469 else if (cNormalInputPairs < cMinNormalPairs && iTest + cMinNormalPairs >= cTests && iTest < cTests)
8470 {
8471 iTest -= 1;
8472 continue;
8473 }
8474
8475 uint32_t const fMxcsr = RandMxcsr() & X86_MXCSR_XCPT_FLAGS;
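            /* The nested loops below push this input through every rounding mode and DAZ/FZ
               combination: first with all exceptions masked, then with all of them unmasked,
               and - whenever exception flags were raised - again with those flags pre-set as
               sticky status bits and with assorted mask combinations (the status flags live in
               MXCSR bits 0..5 and their mask bits in 7..12, hence X86_MXCSR_XCPT_MASK_SHIFT).
               This captures both masked and unmasked exception behaviour for the same input. */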
8476 for (uint16_t iRounding = 0; iRounding < 4; iRounding++)
8477 for (uint8_t iDaz = 0; iDaz < 2; iDaz++)
8478 for (uint8_t iFz = 0; iFz < 2; iFz++)
8479 {
8480 uint32_t fMxcsrIn = (fMxcsr & ~X86_MXCSR_RC_MASK)
8481 | (iRounding << X86_MXCSR_RC_SHIFT)
8482 | (iDaz ? X86_MXCSR_DAZ : 0)
8483 | (iFz ? X86_MXCSR_FZ : 0)
8484 | X86_MXCSR_XCPT_MASK;
8485 uint32_t fMxcsrM = fMxcsrIn;
8486 uint64_t u64ResM;
8487 pfn(&fMxcsrM, &u64ResM, &TestData.InVal);
8488 TestData.fMxcsrIn = fMxcsrIn;
8489 TestData.fMxcsrOut = fMxcsrM;
8490 TestData.OutVal.u = u64ResM;
8491 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
8492
8493 fMxcsrIn &= ~X86_MXCSR_XCPT_MASK;
8494 uint32_t fMxcsrU = fMxcsrIn;
8495 uint64_t u64ResU;
8496 pfn(&fMxcsrU, &u64ResU, &TestData.InVal);
8497 TestData.fMxcsrIn = fMxcsrIn;
8498 TestData.fMxcsrOut = fMxcsrU;
8499 TestData.OutVal.u = u64ResU;
8500 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
8501
8502 uint16_t fXcpt = (fMxcsrM | fMxcsrU) & X86_MXCSR_XCPT_FLAGS;
8503 if (fXcpt)
8504 {
8505 fMxcsrIn = (fMxcsrIn & ~X86_MXCSR_XCPT_MASK) | fXcpt;
8506 uint32_t fMxcsr1 = fMxcsrIn;
8507 uint64_t u64Res1;
8508 pfn(&fMxcsr1, &u64Res1, &TestData.InVal);
8509 TestData.fMxcsrIn = fMxcsrIn;
8510 TestData.fMxcsrOut = fMxcsr1;
8511 TestData.OutVal.u = u64Res1;
8512 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
8513
8514 if (((fMxcsr1 & X86_MXCSR_XCPT_FLAGS) & fXcpt) != (fMxcsr1 & X86_MXCSR_XCPT_FLAGS))
8515 {
8516 fXcpt |= fMxcsr1 & X86_MXCSR_XCPT_FLAGS;
8517 fMxcsrIn = (fMxcsrIn & ~X86_MXCSR_XCPT_MASK) | (fXcpt << X86_MXCSR_XCPT_MASK_SHIFT);
8518 uint32_t fMxcsr2 = fMxcsrIn;
8519 uint64_t u64Res2;
8520 pfn(&fMxcsr2, &u64Res2, &TestData.InVal);
8521 TestData.fMxcsrIn = fMxcsrIn;
8522 TestData.fMxcsrOut = fMxcsr2;
8523 TestData.OutVal.u = u64Res2;
8524 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
8525 }
8526 if (!RT_IS_POWER_OF_TWO(fXcpt))
8527 for (uint16_t fUnmasked = 1; fUnmasked <= X86_MXCSR_PE; fUnmasked <<= 1)
8528 if (fUnmasked & fXcpt)
8529 {
8530 fMxcsrIn = (fMxcsrIn & ~X86_MXCSR_XCPT_MASK) | ((fXcpt & ~fUnmasked) << X86_MXCSR_XCPT_MASK_SHIFT);
8531 uint32_t fMxcsr3 = fMxcsrIn;
8532 uint64_t u64Res3;
8533 pfn(&fMxcsr3, &u64Res3, &TestData.InVal);
8534 TestData.fMxcsrIn = fMxcsrIn;
8535 TestData.fMxcsrOut = fMxcsr3;
8536 TestData.OutVal.u = u64Res3;
8537 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
8538 }
8539 }
8540 }
8541 }
8542 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE);
8543 }
8544
8545 return RTEXITCODE_SUCCESS;
8546}
8547#endif
8548
8549static void SseConvertMmXmmTest(void)
8550{
8551 X86FXSTATE State;
8552 RT_ZERO(State);
8553
8554 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseConvertMmXmm); iFn++)
8555 {
8556 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aSseConvertMmXmm[iFn]))
8557 continue;
8558
8559 uint32_t const cTests = *g_aSseConvertMmXmm[iFn].pcTests;
8560 SSE_CONVERT_MM_XMM_TEST_T const * const paTests = g_aSseConvertMmXmm[iFn].paTests;
8561 PFNIEMAIMPLMXCSRU64U128 pfn = g_aSseConvertMmXmm[iFn].pfn;
8562 uint32_t const cVars = COUNT_VARIATIONS(g_aSseConvertMmXmm[iFn]);
8563 if (!cTests) RTTestSkipped(g_hTest, "no tests");
8564 for (uint32_t iVar = 0; iVar < cVars; iVar++)
8565 {
8566 for (uint32_t iTest = 0; iTest < cTests / sizeof(*paTests); iTest++)
8567 {
8568 RTUINT64U ValOut;
8569 uint32_t fMxcsr = paTests[iTest].fMxcsrIn;
8570 pfn(&fMxcsr, &ValOut.u, &paTests[iTest].InVal);
8571 if ( fMxcsr != paTests[iTest].fMxcsrOut
8572 || ValOut.ai32[0] != paTests[iTest].OutVal.ai32[0]
8573 || ValOut.ai32[1] != paTests[iTest].OutVal.ai32[1])
8574 RTTestFailed(g_hTest, "#%04u%s: mxcsr=%#08x in1=%s'%s\n"
8575 "%s -> mxcsr=%#08x %RI32'%RI32\n"
8576 "%s expected %#08x %RI32'%RI32%s%s (%s)\n",
8577 iTest, iVar ? "/n" : "", paTests[iTest].fMxcsrIn,
8578 FormatR64(&paTests[iTest].InVal.ar64[0]), FormatR64(&paTests[iTest].InVal.ar64[1]),
8579 iVar ? " " : "", fMxcsr, ValOut.ai32[0], ValOut.ai32[1],
8580 iVar ? " " : "", paTests[iTest].fMxcsrOut,
8581 paTests[iTest].OutVal.ai32[0], paTests[iTest].OutVal.ai32[1],
8582 MxcsrDiff(fMxcsr, paTests[iTest].fMxcsrOut),
8583 ( ValOut.ai32[0] != paTests[iTest].OutVal.ai32[0]
8584 || ValOut.ai32[1] != paTests[iTest].OutVal.ai32[1])
8585 ? " - val" : "",
8586 FormatMxcsr(paTests[iTest].fMxcsrIn));
8587 }
8588 }
8589 }
8590}
8591
8592
8593/*
8594 * SSE operations converting signed double-word values to double-precision floating-point values (probably only cvtpi2pd).
8595 */
8596TYPEDEF_SUBTEST_TYPE(SSE_CONVERT_XMM_R64_MM_T, SSE_CONVERT_XMM_MM_TEST_T, PFNIEMAIMPLMXCSRU128U64);
8597
8598static SSE_CONVERT_XMM_R64_MM_T g_aSseConvertXmmR64Mm[] =
8599{
8600 ENTRY_BIN(cvtpi2pd_u128)
8601};
8602
8603#ifdef TSTIEMAIMPL_WITH_GENERATOR
8604static RTEXITCODE SseConvertXmmR64MmGenerate(const char *pszDataFileFmt, uint32_t cTests)
8605{
8606 cTests = RT_MAX(192, cTests); /* there are 144 standard input variations */
8607
8608 static struct { int32_t aVal[2]; } const s_aSpecials[] =
8609 {
8610 { { INT32_MIN, INT32_MIN } },
8611 { { INT32_MAX, INT32_MAX } }
8612 /** @todo More specials. */
8613 };
8614
8615 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseConvertXmmR64Mm); iFn++)
8616 {
8617 PFNIEMAIMPLMXCSRU128U64 const pfn = g_aSseConvertXmmR64Mm[iFn].pfnNative ? g_aSseConvertXmmR64Mm[iFn].pfnNative : g_aSseConvertXmmR64Mm[iFn].pfn;
8618
8619 IEMBINARYOUTPUT BinOut;
8620 AssertReturn(GenerateBinaryOpen(&BinOut, pszDataFileFmt, g_aSseConvertXmmR64Mm[iFn].pszName), RTEXITCODE_FAILURE);
8621
8622 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aSpecials); iTest += 1)
8623 {
8624 SSE_CONVERT_XMM_MM_TEST_T TestData; RT_ZERO(TestData);
8625
8626 TestData.InVal.ai32[0] = iTest < cTests ? RandI32Src2(iTest) : s_aSpecials[iTest - cTests].aVal[0];
8627 TestData.InVal.ai32[1] = iTest < cTests ? RandI32Src2(iTest) : s_aSpecials[iTest - cTests].aVal[1];
8628
8629 uint32_t const fMxcsr = RandMxcsr() & X86_MXCSR_XCPT_FLAGS;
8630 for (uint16_t iRounding = 0; iRounding < 4; iRounding++)
8631 for (uint8_t iDaz = 0; iDaz < 2; iDaz++)
8632 for (uint8_t iFz = 0; iFz < 2; iFz++)
8633 {
8634 uint32_t fMxcsrIn = (fMxcsr & ~X86_MXCSR_RC_MASK)
8635 | (iRounding << X86_MXCSR_RC_SHIFT)
8636 | (iDaz ? X86_MXCSR_DAZ : 0)
8637 | (iFz ? X86_MXCSR_FZ : 0)
8638 | X86_MXCSR_XCPT_MASK;
8639 uint32_t fMxcsrM = fMxcsrIn;
8640 pfn(&fMxcsrM, &TestData.OutVal, TestData.InVal.u);
8641 TestData.fMxcsrIn = fMxcsrIn;
8642 TestData.fMxcsrOut = fMxcsrM;
8643 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
8644
8645 fMxcsrIn &= ~X86_MXCSR_XCPT_MASK;
8646 uint32_t fMxcsrU = fMxcsrIn;
8647 pfn(&fMxcsrU, &TestData.OutVal, TestData.InVal.u);
8648 TestData.fMxcsrIn = fMxcsrIn;
8649 TestData.fMxcsrOut = fMxcsrU;
8650 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
8651
8652 uint16_t fXcpt = (fMxcsrM | fMxcsrU) & X86_MXCSR_XCPT_FLAGS;
8653 if (fXcpt)
8654 {
8655 fMxcsrIn = (fMxcsrIn & ~X86_MXCSR_XCPT_MASK) | fXcpt;
8656 uint32_t fMxcsr1 = fMxcsrIn;
8657 pfn(&fMxcsr1, &TestData.OutVal, TestData.InVal.u);
8658 TestData.fMxcsrIn = fMxcsrIn;
8659 TestData.fMxcsrOut = fMxcsr1;
8660 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
8661
8662 if (((fMxcsr1 & X86_MXCSR_XCPT_FLAGS) & fXcpt) != (fMxcsr1 & X86_MXCSR_XCPT_FLAGS))
8663 {
8664 fXcpt |= fMxcsr1 & X86_MXCSR_XCPT_FLAGS;
8665 fMxcsrIn = (fMxcsrIn & ~X86_MXCSR_XCPT_MASK) | (fXcpt << X86_MXCSR_XCPT_MASK_SHIFT);
8666 uint32_t fMxcsr2 = fMxcsrIn;
8667 pfn(&fMxcsr2, &TestData.OutVal, TestData.InVal.u);
8668 TestData.fMxcsrIn = fMxcsrIn;
8669 TestData.fMxcsrOut = fMxcsr2;
8670 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
8671 }
8672 if (!RT_IS_POWER_OF_TWO(fXcpt))
8673 for (uint16_t fUnmasked = 1; fUnmasked <= X86_MXCSR_PE; fUnmasked <<= 1)
8674 if (fUnmasked & fXcpt)
8675 {
8676 fMxcsrIn = (fMxcsrIn & ~X86_MXCSR_XCPT_MASK) | ((fXcpt & ~fUnmasked) << X86_MXCSR_XCPT_MASK_SHIFT);
8677 uint32_t fMxcsr3 = fMxcsrIn;
8678 pfn(&fMxcsr3, &TestData.OutVal, TestData.InVal.u);
8679 TestData.fMxcsrIn = fMxcsrIn;
8680 TestData.fMxcsrOut = fMxcsr3;
8681 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
8682 }
8683 }
8684 }
8685 }
8686 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE);
8687 }
8688
8689 return RTEXITCODE_SUCCESS;
8690}
8691#endif
8692
8693static void SseConvertXmmR64MmTest(void)
8694{
8695 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseConvertXmmR64Mm); iFn++)
8696 {
8697 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aSseConvertXmmR64Mm[iFn]))
8698 continue;
8699
8700 uint32_t const cTests = *g_aSseConvertXmmR64Mm[iFn].pcTests;
8701 SSE_CONVERT_XMM_MM_TEST_T const * const paTests = g_aSseConvertXmmR64Mm[iFn].paTests;
8702 PFNIEMAIMPLMXCSRU128U64 pfn = g_aSseConvertXmmR64Mm[iFn].pfn;
8703 uint32_t const cVars = COUNT_VARIATIONS(g_aSseConvertXmmR64Mm[iFn]);
8704 if (!cTests) RTTestSkipped(g_hTest, "no tests");
8705 for (uint32_t iVar = 0; iVar < cVars; iVar++)
8706 {
8707 for (uint32_t iTest = 0; iTest < cTests / sizeof(*paTests); iTest++)
8708 {
8709 X86XMMREG ValOut;
8710 uint32_t fMxcsr = paTests[iTest].fMxcsrIn;
8711 pfn(&fMxcsr, &ValOut, paTests[iTest].InVal.u);
8712 if ( fMxcsr != paTests[iTest].fMxcsrOut
8713 || !RTFLOAT64U_ARE_IDENTICAL(&ValOut.ar64[0], &paTests[iTest].OutVal.ar64[0])
8714 || !RTFLOAT64U_ARE_IDENTICAL(&ValOut.ar64[1], &paTests[iTest].OutVal.ar64[1]))
8715 RTTestFailed(g_hTest, "#%04u%s: mxcsr=%#08x in1=%RI32'%RI32\n"
8716 "%s -> mxcsr=%#08x %s'%s\n"
8717 "%s expected %#08x %s'%s%s%s (%s)\n",
8718 iTest, iVar ? "/n" : "", paTests[iTest].fMxcsrIn,
8719 paTests[iTest].InVal.ai32[0], paTests[iTest].InVal.ai32[1],
8720 iVar ? " " : "", fMxcsr,
8721 FormatR64(&ValOut.ar64[0]), FormatR64(&ValOut.ar64[1]),
8722 iVar ? " " : "", paTests[iTest].fMxcsrOut,
8723 FormatR64(&paTests[iTest].OutVal.ar64[0]), FormatR64(&paTests[iTest].OutVal.ar64[1]),
8724 MxcsrDiff(fMxcsr, paTests[iTest].fMxcsrOut),
8725 ( !RTFLOAT64U_ARE_IDENTICAL(&ValOut.ar64[0], &paTests[iTest].OutVal.ar64[0])
8726 || !RTFLOAT64U_ARE_IDENTICAL(&ValOut.ar64[1], &paTests[iTest].OutVal.ar64[1]))
8727 ? " - val" : "",
8728 FormatMxcsr(paTests[iTest].fMxcsrIn));
8729 }
8730 }
8731 }
8732}
8733
8734
8735/*
8736 * SSE operations converting signed double-word values to single-precision floating-point values (probably only cvtpi2ps).
8737 */
8738TYPEDEF_SUBTEST_TYPE(SSE_CONVERT_XMM_R32_MM_T, SSE_CONVERT_XMM_MM_TEST_T, PFNIEMAIMPLMXCSRU128U64);
8739
8740static SSE_CONVERT_XMM_R32_MM_T g_aSseConvertXmmR32Mm[] =
8741{
8742 ENTRY_BIN(cvtpi2ps_u128)
8743};
8744
8745#ifdef TSTIEMAIMPL_WITH_GENERATOR
8746static RTEXITCODE SseConvertXmmR32MmGenerate(const char *pszDataFileFmt, uint32_t cTests)
8747{
8748 cTests = RT_MAX(192, cTests); /* there are 144 standard input variations */
8749
8750 static struct { int32_t aVal[2]; } const s_aSpecials[] =
8751 {
8752 { { INT32_MIN, INT32_MIN } },
8753 { { INT32_MAX, INT32_MAX } }
8754 /** @todo More specials. */
8755 };
8756
8757 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseConvertXmmR32Mm); iFn++)
8758 {
8759 PFNIEMAIMPLMXCSRU128U64 const pfn = g_aSseConvertXmmR32Mm[iFn].pfnNative ? g_aSseConvertXmmR32Mm[iFn].pfnNative : g_aSseConvertXmmR32Mm[iFn].pfn;
8760
8761 IEMBINARYOUTPUT BinOut;
8762 AssertReturn(GenerateBinaryOpen(&BinOut, pszDataFileFmt, g_aSseConvertXmmR32Mm[iFn].pszName), RTEXITCODE_FAILURE);
8763
8764 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aSpecials); iTest += 1)
8765 {
8766 SSE_CONVERT_XMM_MM_TEST_T TestData; RT_ZERO(TestData);
8767
8768 TestData.InVal.ai32[0] = iTest < cTests ? RandI32Src2(iTest) : s_aSpecials[iTest - cTests].aVal[0];
8769 TestData.InVal.ai32[1] = iTest < cTests ? RandI32Src2(iTest) : s_aSpecials[iTest - cTests].aVal[1];
8770
8771 uint32_t const fMxcsr = RandMxcsr() & X86_MXCSR_XCPT_FLAGS;
8772 for (uint16_t iRounding = 0; iRounding < 4; iRounding++)
8773 for (uint8_t iDaz = 0; iDaz < 2; iDaz++)
8774 for (uint8_t iFz = 0; iFz < 2; iFz++)
8775 {
8776 uint32_t fMxcsrIn = (fMxcsr & ~X86_MXCSR_RC_MASK)
8777 | (iRounding << X86_MXCSR_RC_SHIFT)
8778 | (iDaz ? X86_MXCSR_DAZ : 0)
8779 | (iFz ? X86_MXCSR_FZ : 0)
8780 | X86_MXCSR_XCPT_MASK;
8781 uint32_t fMxcsrM = fMxcsrIn;
8782 pfn(&fMxcsrM, &TestData.OutVal, TestData.InVal.u);
8783 TestData.fMxcsrIn = fMxcsrIn;
8784 TestData.fMxcsrOut = fMxcsrM;
8785 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
8786
8787 fMxcsrIn &= ~X86_MXCSR_XCPT_MASK;
8788 uint32_t fMxcsrU = fMxcsrIn;
8789 pfn(&fMxcsrU, &TestData.OutVal, TestData.InVal.u);
8790 TestData.fMxcsrIn = fMxcsrIn;
8791 TestData.fMxcsrOut = fMxcsrU;
8792 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
8793
8794 uint16_t fXcpt = (fMxcsrM | fMxcsrU) & X86_MXCSR_XCPT_FLAGS;
8795 if (fXcpt)
8796 {
8797 fMxcsrIn = (fMxcsrIn & ~X86_MXCSR_XCPT_MASK) | fXcpt;
8798 uint32_t fMxcsr1 = fMxcsrIn;
8799 pfn(&fMxcsr1, &TestData.OutVal, TestData.InVal.u);
8800 TestData.fMxcsrIn = fMxcsrIn;
8801 TestData.fMxcsrOut = fMxcsr1;
8802 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
8803
8804 if (((fMxcsr1 & X86_MXCSR_XCPT_FLAGS) & fXcpt) != (fMxcsr1 & X86_MXCSR_XCPT_FLAGS))
8805 {
8806 fXcpt |= fMxcsr1 & X86_MXCSR_XCPT_FLAGS;
8807 fMxcsrIn = (fMxcsrIn & ~X86_MXCSR_XCPT_MASK) | (fXcpt << X86_MXCSR_XCPT_MASK_SHIFT);
8808 uint32_t fMxcsr2 = fMxcsrIn;
8809 pfn(&fMxcsr2, &TestData.OutVal, TestData.InVal.u);
8810 TestData.fMxcsrIn = fMxcsrIn;
8811 TestData.fMxcsrOut = fMxcsr2;
8812 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
8813 }
8814 if (!RT_IS_POWER_OF_TWO(fXcpt))
8815 for (uint16_t fUnmasked = 1; fUnmasked <= X86_MXCSR_PE; fUnmasked <<= 1)
8816 if (fUnmasked & fXcpt)
8817 {
8818 fMxcsrIn = (fMxcsrIn & ~X86_MXCSR_XCPT_MASK) | ((fXcpt & ~fUnmasked) << X86_MXCSR_XCPT_MASK_SHIFT);
8819 uint32_t fMxcsr3 = fMxcsrIn;
8820 pfn(&fMxcsr3, &TestData.OutVal, TestData.InVal.u);
8821 TestData.fMxcsrIn = fMxcsrIn;
8822 TestData.fMxcsrOut = fMxcsr3;
8823 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
8824 }
8825 }
8826 }
8827 }
8828 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE);
8829 }
8830
8831 return RTEXITCODE_SUCCESS;
8832}
8833#endif
8834
8835static void SseConvertXmmR32MmTest(void)
8836{
8837 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseConvertXmmR32Mm); iFn++)
8838 {
8839 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aSseConvertXmmR32Mm[iFn]))
8840 continue;
8841
8842 uint32_t const cTests = *g_aSseConvertXmmR32Mm[iFn].pcTests;
8843 SSE_CONVERT_XMM_MM_TEST_T const * const paTests = g_aSseConvertXmmR32Mm[iFn].paTests;
8844 PFNIEMAIMPLMXCSRU128U64 pfn = g_aSseConvertXmmR32Mm[iFn].pfn;
8845 uint32_t const cVars = COUNT_VARIATIONS(g_aSseConvertXmmR32Mm[iFn]);
8846 if (!cTests) RTTestSkipped(g_hTest, "no tests");
8847 for (uint32_t iVar = 0; iVar < cVars; iVar++)
8848 {
8849 for (uint32_t iTest = 0; iTest < cTests / sizeof(*paTests); iTest++)
8850 {
8851 X86XMMREG ValOut;
8852 uint32_t fMxcsr = paTests[iTest].fMxcsrIn;
8853 pfn(&fMxcsr, &ValOut, paTests[iTest].InVal.u);
8854 if ( fMxcsr != paTests[iTest].fMxcsrOut
8855 || !RTFLOAT32U_ARE_IDENTICAL(&ValOut.ar32[0], &paTests[iTest].OutVal.ar32[0])
8856 || !RTFLOAT32U_ARE_IDENTICAL(&ValOut.ar32[1], &paTests[iTest].OutVal.ar32[1]))
8857 RTTestFailed(g_hTest, "#%04u%s: mxcsr=%#08x in1=%RI32'%RI32\n"
8858 "%s -> mxcsr=%#08x %s'%s\n"
8859 "%s expected %#08x %s'%s%s%s (%s)\n",
8860 iTest, iVar ? "/n" : "", paTests[iTest].fMxcsrIn,
8861 paTests[iTest].InVal.ai32[0], paTests[iTest].InVal.ai32[1],
8862 iVar ? " " : "", fMxcsr,
8863 FormatR32(&ValOut.ar32[0]), FormatR32(&ValOut.ar32[1]),
8864 iVar ? " " : "", paTests[iTest].fMxcsrOut,
8865 FormatR32(&paTests[iTest].OutVal.ar32[0]), FormatR32(&paTests[iTest].OutVal.ar32[1]),
8866 MxcsrDiff(fMxcsr, paTests[iTest].fMxcsrOut),
8867 ( !RTFLOAT32U_ARE_IDENTICAL(&ValOut.ar32[0], &paTests[iTest].OutVal.ar32[0])
8868 || !RTFLOAT32U_ARE_IDENTICAL(&ValOut.ar32[1], &paTests[iTest].OutVal.ar32[1]))
8869 ? " - val" : "",
8870 FormatMxcsr(paTests[iTest].fMxcsrIn));
8871 }
8872 }
8873 }
8874}
8875
8876
8877/*
8878 * SSE operations converting single-precision floating-point values to signed double-word values (cvtps2pi, cvttps2pi).
8879 */
8880TYPEDEF_SUBTEST_TYPE(SSE_CONVERT_MM_I32_XMM_R32_T, SSE_CONVERT_MM_R32_TEST_T, PFNIEMAIMPLMXCSRU64U64);
8881
8882static SSE_CONVERT_MM_I32_XMM_R32_T g_aSseConvertMmI32XmmR32[] =
8883{
8884 ENTRY_BIN(cvtps2pi_u128),
8885 ENTRY_BIN(cvttps2pi_u128)
8886};
8887
8888#ifdef TSTIEMAIMPL_WITH_GENERATOR
8889static RTEXITCODE SseConvertMmI32XmmR32Generate(const char *pszDataFileFmt, uint32_t cTests)
8890{
8891 cTests = RT_MAX(192, cTests); /* there are 144 standard input variations */
8892
8893 static struct { RTFLOAT32U aVal1[2]; } const s_aSpecials[] =
8894 {
8895 { { RTFLOAT32U_INIT_ZERO(0), RTFLOAT32U_INIT_ZERO(0) } },
8896 { { RTFLOAT32U_INIT_ZERO(1), RTFLOAT32U_INIT_ZERO(1) } },
8897 { { RTFLOAT32U_INIT_INF(0), RTFLOAT32U_INIT_INF(0) } },
8898 { { RTFLOAT32U_INIT_INF(1), RTFLOAT32U_INIT_INF(1) } }
8899 /** @todo More specials. */
8900 };
8901
8902 uint32_t cMinNormalPairs = (cTests - 144) / 4;
8903 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseConvertMmI32XmmR32); iFn++)
8904 {
8905 PFNIEMAIMPLMXCSRU64U64 const pfn = g_aSseConvertMmI32XmmR32[iFn].pfnNative ? g_aSseConvertMmI32XmmR32[iFn].pfnNative : g_aSseConvertMmI32XmmR32[iFn].pfn;
8906
8907 IEMBINARYOUTPUT BinOut;
8908 AssertReturn(GenerateBinaryOpen(&BinOut, pszDataFileFmt, g_aSseConvertMmI32XmmR32[iFn].pszName), RTEXITCODE_FAILURE);
8909
8910 uint32_t cNormalInputPairs = 0;
8911 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aSpecials); iTest += 1)
8912 {
8913 SSE_CONVERT_MM_R32_TEST_T TestData; RT_ZERO(TestData);
8914
8915 TestData.ar32InVal[0] = iTest < cTests ? RandR32Src(iTest) : s_aSpecials[iTest - cTests].aVal1[0];
8916 TestData.ar32InVal[1] = iTest < cTests ? RandR32Src(iTest) : s_aSpecials[iTest - cTests].aVal1[1];
8917
8918 if ( RTFLOAT32U_IS_NORMAL(&TestData.ar32InVal[0])
8919 && RTFLOAT32U_IS_NORMAL(&TestData.ar32InVal[1]))
8920 cNormalInputPairs++;
8921 else if (cNormalInputPairs < cMinNormalPairs && iTest + cMinNormalPairs >= cTests && iTest < cTests)
8922 {
8923 iTest -= 1;
8924 continue;
8925 }
8926
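            /* The worker consumes its source as one 64-bit MM-register style value, so pack
               the two single-precision inputs into a single 64-bit container first. */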
8927 RTFLOAT64U TestVal;
8928 TestVal.au32[0] = TestData.ar32InVal[0].u;
8929 TestVal.au32[1] = TestData.ar32InVal[1].u;
8930
8931 uint32_t const fMxcsr = RandMxcsr() & X86_MXCSR_XCPT_FLAGS;
8932 for (uint16_t iRounding = 0; iRounding < 4; iRounding++)
8933 for (uint8_t iDaz = 0; iDaz < 2; iDaz++)
8934 for (uint8_t iFz = 0; iFz < 2; iFz++)
8935 {
8936 uint32_t fMxcsrIn = (fMxcsr & ~X86_MXCSR_RC_MASK)
8937 | (iRounding << X86_MXCSR_RC_SHIFT)
8938 | (iDaz ? X86_MXCSR_DAZ : 0)
8939 | (iFz ? X86_MXCSR_FZ : 0)
8940 | X86_MXCSR_XCPT_MASK;
8941 uint32_t fMxcsrM = fMxcsrIn;
8942 uint64_t u64ResM;
8943 pfn(&fMxcsrM, &u64ResM, TestVal.u);
8944 TestData.fMxcsrIn = fMxcsrIn;
8945 TestData.fMxcsrOut = fMxcsrM;
8946 TestData.OutVal.u = u64ResM;
8947 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
8948
8949 fMxcsrIn &= ~X86_MXCSR_XCPT_MASK;
8950 uint32_t fMxcsrU = fMxcsrIn;
8951 uint64_t u64ResU;
8952 pfn(&fMxcsrU, &u64ResU, TestVal.u);
8953 TestData.fMxcsrIn = fMxcsrIn;
8954 TestData.fMxcsrOut = fMxcsrU;
8955 TestData.OutVal.u = u64ResU;
8956 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
8957
8958 uint16_t fXcpt = (fMxcsrM | fMxcsrU) & X86_MXCSR_XCPT_FLAGS;
8959 if (fXcpt)
8960 {
8961 fMxcsrIn = (fMxcsrIn & ~X86_MXCSR_XCPT_MASK) | fXcpt;
8962 uint32_t fMxcsr1 = fMxcsrIn;
8963 uint64_t u64Res1;
8964 pfn(&fMxcsr1, &u64Res1, TestVal.u);
8965 TestData.fMxcsrIn = fMxcsrIn;
8966 TestData.fMxcsrOut = fMxcsr1;
8967 TestData.OutVal.u = u64Res1;
8968 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
8969
8970 if (((fMxcsr1 & X86_MXCSR_XCPT_FLAGS) & fXcpt) != (fMxcsr1 & X86_MXCSR_XCPT_FLAGS))
8971 {
8972 fXcpt |= fMxcsr1 & X86_MXCSR_XCPT_FLAGS;
8973 fMxcsrIn = (fMxcsrIn & ~X86_MXCSR_XCPT_MASK) | (fXcpt << X86_MXCSR_XCPT_MASK_SHIFT);
8974 uint32_t fMxcsr2 = fMxcsrIn;
8975 uint64_t u64Res2;
8976 pfn(&fMxcsr2, &u64Res2, TestVal.u);
8977 TestData.fMxcsrIn = fMxcsrIn;
8978 TestData.fMxcsrOut = fMxcsr2;
8979 TestData.OutVal.u = u64Res2;
8980 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
8981 }
8982 if (!RT_IS_POWER_OF_TWO(fXcpt))
8983 for (uint16_t fUnmasked = 1; fUnmasked <= X86_MXCSR_PE; fUnmasked <<= 1)
8984 if (fUnmasked & fXcpt)
8985 {
8986 fMxcsrIn = (fMxcsrIn & ~X86_MXCSR_XCPT_MASK) | ((fXcpt & ~fUnmasked) << X86_MXCSR_XCPT_MASK_SHIFT);
8987 uint32_t fMxcsr3 = fMxcsrIn;
8988 uint64_t u64Res3;
8989 pfn(&fMxcsr3, &u64Res3, TestVal.u);
8990 TestData.fMxcsrIn = fMxcsrIn;
8991 TestData.fMxcsrOut = fMxcsr3;
8992 TestData.OutVal.u = u64Res3;
8993 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
8994 }
8995 }
8996 }
8997 }
8998 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE);
8999 }
9000
9001 return RTEXITCODE_SUCCESS;
9002}
9003#endif
9004
9005static void SseConvertMmI32XmmR32Test(void)
9006{
9007 X86FXSTATE State;
9008 RT_ZERO(State);
9009
9010 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSseConvertMmI32XmmR32); iFn++)
9011 {
9012 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aSseConvertMmI32XmmR32[iFn]))
9013 continue;
9014
9015 uint32_t const cTests = *g_aSseConvertMmI32XmmR32[iFn].pcTests;
9016 SSE_CONVERT_MM_R32_TEST_T const * const paTests = g_aSseConvertMmI32XmmR32[iFn].paTests;
9017 PFNIEMAIMPLMXCSRU64U64 pfn = g_aSseConvertMmI32XmmR32[iFn].pfn;
9018 uint32_t const cVars = COUNT_VARIATIONS(g_aSseConvertMmI32XmmR32[iFn]);
9019 if (!cTests) RTTestSkipped(g_hTest, "no tests");
9020 for (uint32_t iVar = 0; iVar < cVars; iVar++)
9021 {
9022 for (uint32_t iTest = 0; iTest < cTests / sizeof(*paTests); iTest++)
9023 {
9024 RTUINT64U ValOut;
9025 RTUINT64U ValIn;
9026
9027 ValIn.au32[0] = paTests[iTest].ar32InVal[0].u;
9028 ValIn.au32[1] = paTests[iTest].ar32InVal[1].u;
9029
9030 uint32_t fMxcsr = paTests[iTest].fMxcsrIn;
9031 pfn(&fMxcsr, &ValOut.u, ValIn.u);
9032 if ( fMxcsr != paTests[iTest].fMxcsrOut
9033 || ValOut.ai32[0] != paTests[iTest].OutVal.ai32[0]
9034 || ValOut.ai32[1] != paTests[iTest].OutVal.ai32[1])
9035 RTTestFailed(g_hTest, "#%04u%s: mxcsr=%#08x in1=%s'%s \n"
9036 "%s -> mxcsr=%#08x %RI32'%RI32\n"
9037 "%s expected %#08x %RI32'%RI32%s%s (%s)\n",
9038 iTest, iVar ? "/n" : "", paTests[iTest].fMxcsrIn,
9039 FormatR32(&paTests[iTest].ar32InVal[0]), FormatR32(&paTests[iTest].ar32InVal[1]),
9040 iVar ? " " : "", fMxcsr,
9041 ValOut.ai32[0], ValOut.ai32[1],
9042 iVar ? " " : "", paTests[iTest].fMxcsrOut,
9043 paTests[iTest].OutVal.ai32[0], paTests[iTest].OutVal.ai32[1],
9044 MxcsrDiff(fMxcsr, paTests[iTest].fMxcsrOut),
9045 ( ValOut.ai32[0] != paTests[iTest].OutVal.ai32[0]
9046 || ValOut.ai32[1] != paTests[iTest].OutVal.ai32[1])
9047 ? " - val" : "",
9048 FormatMxcsr(paTests[iTest].fMxcsrIn));
9049 }
9050 }
9051 }
9052}
9053
9054
9055/*
9056 * SSE 4.2 pcmpxstrx instructions.
9057 */
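/* Note: pcmpistri/pcmpistrm use implicit (NUL element terminated) string lengths, while
   pcmpestri/pcmpestrm take explicit lengths in RAX/RDX.  The ...i forms return an index
   in ECX and the ...m forms a mask in XMM0, which is why the subtests below record either
   u32EcxOut or an XMM OutVal. */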
9058TYPEDEF_SUBTEST_TYPE(SSE_PCMPISTRI_T, SSE_PCMPISTRI_TEST_T, PFNIEMAIMPLPCMPISTRIU128IMM8);
9059
9060static SSE_PCMPISTRI_T g_aSsePcmpistri[] =
9061{
9062 ENTRY_BIN_SSE_OPT(pcmpistri_u128),
9063};
9064
9065#ifdef TSTIEMAIMPL_WITH_GENERATOR
9066static RTEXITCODE SseComparePcmpistriGenerate(const char *pszDataFileFmt, uint32_t cTests)
9067{
9068 cTests = RT_MAX(192, cTests); /* there are 144 standard input variations */
9069
9070 static struct { RTUINT128U uSrc1; RTUINT128U uSrc2; } const s_aSpecials[] =
9071 {
9072 { RTUINT128_INIT_C(0, 0), RTUINT128_INIT_C(0, 0) },
9073 /** @todo More specials. */
9074 };
9075
9076 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSsePcmpistri); iFn++)
9077 {
9078 PFNIEMAIMPLPCMPISTRIU128IMM8 const pfn = g_aSsePcmpistri[iFn].pfnNative ? g_aSsePcmpistri[iFn].pfnNative : g_aSsePcmpistri[iFn].pfn;
9079
9080 IEMBINARYOUTPUT BinOut;
9081 AssertReturn(GenerateBinaryOpen(&BinOut, pszDataFileFmt, g_aSsePcmpistri[iFn].pszName), RTEXITCODE_FAILURE);
9082
9083 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aSpecials); iTest += 1)
9084 {
9085 SSE_PCMPISTRI_TEST_T TestData; RT_ZERO(TestData);
9086
9087 TestData.InVal1.uXmm = iTest < cTests ? RandU128() : s_aSpecials[iTest - cTests].uSrc1;
9088 TestData.InVal2.uXmm = iTest < cTests ? RandU128() : s_aSpecials[iTest - cTests].uSrc2;
9089
9090 IEMPCMPISTRXSRC TestVal;
9091 TestVal.uSrc1 = TestData.InVal1.uXmm;
9092 TestVal.uSrc2 = TestData.InVal2.uXmm;
9093
9094 uint32_t const fEFlagsIn = RandEFlags();
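            /* Run all 256 immediate values so that every combination of the imm8 control
               fields (source data format, aggregation operation, polarity and output
               selection) ends up in the generated data. */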
9095 for (uint16_t u16Imm = 0; u16Imm < 256; u16Imm++)
9096 {
9097 uint32_t fEFlagsOut = fEFlagsIn;
9098 pfn(&TestData.u32EcxOut, &fEFlagsOut, &TestVal, (uint8_t)u16Imm);
9099 TestData.fEFlagsIn = fEFlagsIn;
9100 TestData.fEFlagsOut = fEFlagsOut;
9101 TestData.bImm = (uint8_t)u16Imm;
9102 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
9103 }
9104
9105            /* Repeat the test with the second input identical to the first. */
9106 TestData.InVal2.uXmm = TestData.InVal1.uXmm;
9107 TestVal.uSrc1 = TestData.InVal1.uXmm;
9108 TestVal.uSrc2 = TestData.InVal2.uXmm;
9109
9110 for (uint16_t u16Imm = 0; u16Imm < 256; u16Imm++)
9111 {
9112 uint32_t fEFlagsOut = fEFlagsIn;
9113 pfn(&TestData.u32EcxOut, &fEFlagsOut, &TestVal, (uint8_t)u16Imm);
9114 TestData.fEFlagsIn = fEFlagsIn;
9115 TestData.fEFlagsOut = fEFlagsOut;
9116 TestData.bImm = (uint8_t)u16Imm;
9117 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
9118 }
9119 }
9120 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE);
9121 }
9122
9123 return RTEXITCODE_SUCCESS;
9124}
9125#endif
9126
9127static void SseComparePcmpistriTest(void)
9128{
9129 X86FXSTATE State;
9130 RT_ZERO(State);
9131
9132 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSsePcmpistri); iFn++)
9133 {
9134 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aSsePcmpistri[iFn]))
9135 continue;
9136
9137 uint32_t const cTests = *g_aSsePcmpistri[iFn].pcTests;
9138 SSE_PCMPISTRI_TEST_T const * const paTests = g_aSsePcmpistri[iFn].paTests;
9139 PFNIEMAIMPLPCMPISTRIU128IMM8 pfn = g_aSsePcmpistri[iFn].pfn;
9140 uint32_t const cVars = COUNT_VARIATIONS(g_aSsePcmpistri[iFn]);
9141 if (!cTests) RTTestSkipped(g_hTest, "no tests");
9142 for (uint32_t iVar = 0; iVar < cVars; iVar++)
9143 {
9144 for (uint32_t iTest = 0; iTest < cTests / sizeof(*paTests); iTest++)
9145 {
9146 IEMPCMPISTRXSRC TestVal;
9147 TestVal.uSrc1 = paTests[iTest].InVal1.uXmm;
9148 TestVal.uSrc2 = paTests[iTest].InVal2.uXmm;
9149
9150 uint32_t fEFlags = paTests[iTest].fEFlagsIn;
9151 uint32_t u32EcxOut = 0;
9152 pfn(&u32EcxOut, &fEFlags, &TestVal, paTests[iTest].bImm);
9153 if ( fEFlags != paTests[iTest].fEFlagsOut
9154 || u32EcxOut != paTests[iTest].u32EcxOut)
9155 RTTestFailed(g_hTest, "#%04u%s: efl=%#08x in1=%s in2=%s bImm=%#x\n"
9156 "%s -> efl=%#08x %RU32\n"
9157 "%s expected %#08x %RU32%s%s\n",
9158 iTest, iVar ? "/n" : "", paTests[iTest].fEFlagsIn,
9159 FormatU128(&paTests[iTest].InVal1.uXmm), FormatU128(&paTests[iTest].InVal2.uXmm), paTests[iTest].bImm,
9160 iVar ? " " : "", fEFlags, u32EcxOut,
9161 iVar ? " " : "", paTests[iTest].fEFlagsOut, paTests[iTest].u32EcxOut,
9162 EFlagsDiff(fEFlags, paTests[iTest].fEFlagsOut),
9163 (u32EcxOut != paTests[iTest].u32EcxOut) ? " - val" : "");
9164 }
9165 }
9166 }
9167}
9168
9169
9170TYPEDEF_SUBTEST_TYPE(SSE_PCMPISTRM_T, SSE_PCMPISTRM_TEST_T, PFNIEMAIMPLPCMPISTRMU128IMM8);
9171
9172static SSE_PCMPISTRM_T g_aSsePcmpistrm[] =
9173{
9174 ENTRY_BIN_SSE_OPT(pcmpistrm_u128),
9175};
9176
9177#ifdef TSTIEMAIMPL_WITH_GENERATOR
9178static RTEXITCODE SseComparePcmpistrmGenerate(const char *pszDataFileFmt, uint32_t cTests)
9179{
9180 cTests = RT_MAX(192, cTests); /* there are 144 standard input variations */
9181
9182 static struct { RTUINT128U uSrc1; RTUINT128U uSrc2; } const s_aSpecials[] =
9183 {
9184 { RTUINT128_INIT_C(0, 0), RTUINT128_INIT_C(0, 0) },
9185 /** @todo More specials. */
9186 };
9187
9188 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSsePcmpistrm); iFn++)
9189 {
9190 PFNIEMAIMPLPCMPISTRMU128IMM8 const pfn = g_aSsePcmpistrm[iFn].pfnNative ? g_aSsePcmpistrm[iFn].pfnNative : g_aSsePcmpistrm[iFn].pfn;
9191
9192 IEMBINARYOUTPUT BinOut;
9193 AssertReturn(GenerateBinaryOpen(&BinOut, pszDataFileFmt, g_aSsePcmpistrm[iFn].pszName), RTEXITCODE_FAILURE);
9194
9195 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aSpecials); iTest += 1)
9196 {
9197 SSE_PCMPISTRM_TEST_T TestData; RT_ZERO(TestData);
9198
9199 TestData.InVal1.uXmm = iTest < cTests ? RandU128() : s_aSpecials[iTest - cTests].uSrc1;
9200 TestData.InVal2.uXmm = iTest < cTests ? RandU128() : s_aSpecials[iTest - cTests].uSrc2;
9201
9202 IEMPCMPISTRXSRC TestVal;
9203 TestVal.uSrc1 = TestData.InVal1.uXmm;
9204 TestVal.uSrc2 = TestData.InVal2.uXmm;
9205
9206 uint32_t const fEFlagsIn = RandEFlags();
9207 for (uint16_t u16Imm = 0; u16Imm < 256; u16Imm++)
9208 {
9209 uint32_t fEFlagsOut = fEFlagsIn;
9210 pfn(&TestData.OutVal.uXmm, &fEFlagsOut, &TestVal, (uint8_t)u16Imm);
9211 TestData.fEFlagsIn = fEFlagsIn;
9212 TestData.fEFlagsOut = fEFlagsOut;
9213 TestData.bImm = (uint8_t)u16Imm;
9214 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
9215 }
9216
9217            /* Repeat the test with the second input identical to the first. */
9218 TestData.InVal2.uXmm = TestData.InVal1.uXmm;
9219 TestVal.uSrc1 = TestData.InVal1.uXmm;
9220 TestVal.uSrc2 = TestData.InVal2.uXmm;
9221
9222 for (uint16_t u16Imm = 0; u16Imm < 256; u16Imm++)
9223 {
9224 uint32_t fEFlagsOut = fEFlagsIn;
9225 pfn(&TestData.OutVal.uXmm, &fEFlagsOut, &TestVal, (uint8_t)u16Imm);
9226 TestData.fEFlagsIn = fEFlagsIn;
9227 TestData.fEFlagsOut = fEFlagsOut;
9228 TestData.bImm = (uint8_t)u16Imm;
9229 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
9230 }
9231 }
9232 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE);
9233 }
9234
9235 return RTEXITCODE_SUCCESS;
9236}
9237#endif
9238
9239static void SseComparePcmpistrmTest(void)
9240{
9241 X86FXSTATE State;
9242 RT_ZERO(State);
9243
9244 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSsePcmpistrm); iFn++)
9245 {
9246 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aSsePcmpistrm[iFn]))
9247 continue;
9248
9249 uint32_t const cTests = *g_aSsePcmpistrm[iFn].pcTests;
9250 SSE_PCMPISTRM_TEST_T const * const paTests = g_aSsePcmpistrm[iFn].paTests;
9251 PFNIEMAIMPLPCMPISTRMU128IMM8 pfn = g_aSsePcmpistrm[iFn].pfn;
9252 uint32_t const cVars = COUNT_VARIATIONS(g_aSsePcmpistrm[iFn]);
9253 if (!cTests) RTTestSkipped(g_hTest, "no tests");
9254 for (uint32_t iVar = 0; iVar < cVars; iVar++)
9255 {
9256 for (uint32_t iTest = 0; iTest < cTests / sizeof(*paTests); iTest++)
9257 {
9258 IEMPCMPISTRXSRC TestVal;
9259 TestVal.uSrc1 = paTests[iTest].InVal1.uXmm;
9260 TestVal.uSrc2 = paTests[iTest].InVal2.uXmm;
9261
9262 uint32_t fEFlags = paTests[iTest].fEFlagsIn;
9263 RTUINT128U OutVal;
9264 pfn(&OutVal, &fEFlags, &TestVal, paTests[iTest].bImm);
9265 if ( fEFlags != paTests[iTest].fEFlagsOut
9266 || OutVal.s.Hi != paTests[iTest].OutVal.uXmm.s.Hi
9267 || OutVal.s.Lo != paTests[iTest].OutVal.uXmm.s.Lo)
9268 RTTestFailed(g_hTest, "#%04u%s: efl=%#08x in1=%s in2=%s bImm=%#x\n"
9269 "%s -> efl=%#08x %s\n"
9270 "%s expected %#08x %s%s%s\n",
9271 iTest, iVar ? "/n" : "", paTests[iTest].fEFlagsIn,
9272 FormatU128(&paTests[iTest].InVal1.uXmm), FormatU128(&paTests[iTest].InVal2.uXmm), paTests[iTest].bImm,
9273 iVar ? " " : "", fEFlags, FormatU128(&OutVal),
9274 iVar ? " " : "", paTests[iTest].fEFlagsOut, FormatU128(&paTests[iTest].OutVal.uXmm),
9275 EFlagsDiff(fEFlags, paTests[iTest].fEFlagsOut),
9276 ( OutVal.s.Hi != paTests[iTest].OutVal.uXmm.s.Hi
9277 || OutVal.s.Lo != paTests[iTest].OutVal.uXmm.s.Lo) ? " - val" : "");
9278 }
9279 }
9280 }
9281}
9282
9283
9284TYPEDEF_SUBTEST_TYPE(SSE_PCMPESTRI_T, SSE_PCMPESTRI_TEST_T, PFNIEMAIMPLPCMPESTRIU128IMM8);
9285
9286static SSE_PCMPESTRI_T g_aSsePcmpestri[] =
9287{
9288 ENTRY_BIN_SSE_OPT(pcmpestri_u128),
9289};
9290
9291#ifdef TSTIEMAIMPL_WITH_GENERATOR
9292static RTEXITCODE SseComparePcmpestriGenerate(const char *pszDataFileFmt, uint32_t cTests)
9293{
9294 cTests = RT_MAX(192, cTests); /* there are 144 standard input variations */
9295
9296 static struct { RTUINT128U uSrc1; RTUINT128U uSrc2; } const s_aSpecials[] =
9297 {
9298 { RTUINT128_INIT_C(0, 0), RTUINT128_INIT_C(0, 0) },
9299 /** @todo More specials. */
9300 };
9301
9302 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSsePcmpestri); iFn++)
9303 {
9304 PFNIEMAIMPLPCMPESTRIU128IMM8 const pfn = g_aSsePcmpestri[iFn].pfnNative ? g_aSsePcmpestri[iFn].pfnNative : g_aSsePcmpestri[iFn].pfn;
9305
9306 IEMBINARYOUTPUT BinOut;
9307 AssertReturn(GenerateBinaryOpen(&BinOut, pszDataFileFmt, g_aSsePcmpestri[iFn].pszName), RTEXITCODE_FAILURE);
9308
9309 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aSpecials); iTest += 1)
9310 {
9311 SSE_PCMPESTRI_TEST_T TestData; RT_ZERO(TestData);
9312
9313 TestData.InVal1.uXmm = iTest < cTests ? RandU128() : s_aSpecials[iTest - cTests].uSrc1;
9314 TestData.InVal2.uXmm = iTest < cTests ? RandU128() : s_aSpecials[iTest - cTests].uSrc2;
9315
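                /* The explicit-length forms take element counts from RAX and RDX; stepping
                   them through -20 and 0 exercises both negative and zero length inputs. */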
9316 for (int64_t i64Rax = -20; i64Rax < 20; i64Rax += 20)
9317 for (int64_t i64Rdx = -20; i64Rdx < 20; i64Rdx += 20)
9318 {
9319 TestData.u64Rax = (uint64_t)i64Rax;
9320 TestData.u64Rdx = (uint64_t)i64Rdx;
9321
9322 IEMPCMPESTRXSRC TestVal;
9323 TestVal.uSrc1 = TestData.InVal1.uXmm;
9324 TestVal.uSrc2 = TestData.InVal2.uXmm;
9325 TestVal.u64Rax = TestData.u64Rax;
9326 TestVal.u64Rdx = TestData.u64Rdx;
9327
9328 uint32_t const fEFlagsIn = RandEFlags();
9329 for (uint16_t u16Imm = 0; u16Imm < 256; u16Imm++)
9330 {
9331 uint32_t fEFlagsOut = fEFlagsIn;
9332 pfn(&TestData.u32EcxOut, &fEFlagsOut, &TestVal, (uint8_t)u16Imm);
9333 TestData.fEFlagsIn = fEFlagsIn;
9334 TestData.fEFlagsOut = fEFlagsOut;
9335 TestData.bImm = (uint8_t)u16Imm;
9336 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
9337 }
9338
9339                /* Repeat the test with the second input identical to the first. */
9340 TestData.InVal2.uXmm = TestData.InVal1.uXmm;
9341 TestVal.uSrc1 = TestData.InVal1.uXmm;
9342 TestVal.uSrc2 = TestData.InVal2.uXmm;
9343
9344 for (uint16_t u16Imm = 0; u16Imm < 256; u16Imm++)
9345 {
9346 uint32_t fEFlagsOut = fEFlagsIn;
9347 pfn(&TestData.u32EcxOut, &fEFlagsOut, &TestVal, (uint8_t)u16Imm);
9348 TestData.fEFlagsIn = fEFlagsIn;
9349 TestData.fEFlagsOut = fEFlagsOut;
9350 TestData.bImm = (uint8_t)u16Imm;
9351 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
9352 }
9353 }
9354 }
9355 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE);
9356 }
9357
9358 return RTEXITCODE_SUCCESS;
9359}
9360#endif
9361
9362static void SseComparePcmpestriTest(void)
9363{
9364 X86FXSTATE State;
9365 RT_ZERO(State);
9366
9367 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSsePcmpestri); iFn++)
9368 {
9369 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aSsePcmpestri[iFn]))
9370 continue;
9371
9372 uint32_t const cTests = *g_aSsePcmpestri[iFn].pcTests;
9373 SSE_PCMPESTRI_TEST_T const * const paTests = g_aSsePcmpestri[iFn].paTests;
9374 PFNIEMAIMPLPCMPESTRIU128IMM8 pfn = g_aSsePcmpestri[iFn].pfn;
9375 uint32_t const cVars = COUNT_VARIATIONS(g_aSsePcmpestri[iFn]);
9376 if (!cTests) RTTestSkipped(g_hTest, "no tests");
9377 for (uint32_t iVar = 0; iVar < cVars; iVar++)
9378 {
9379 for (uint32_t iTest = 0; iTest < cTests / sizeof(*paTests); iTest++)
9380 {
9381 IEMPCMPESTRXSRC TestVal;
9382 TestVal.uSrc1 = paTests[iTest].InVal1.uXmm;
9383 TestVal.uSrc2 = paTests[iTest].InVal2.uXmm;
9384 TestVal.u64Rax = paTests[iTest].u64Rax;
9385 TestVal.u64Rdx = paTests[iTest].u64Rdx;
9386
9387 uint32_t fEFlags = paTests[iTest].fEFlagsIn;
9388 uint32_t u32EcxOut = 0;
9389 pfn(&u32EcxOut, &fEFlags, &TestVal, paTests[iTest].bImm);
9390 if ( fEFlags != paTests[iTest].fEFlagsOut
9391 || u32EcxOut != paTests[iTest].u32EcxOut)
9392 RTTestFailed(g_hTest, "#%04u%s: efl=%#08x in1=%s rax1=%RI64 in2=%s rdx2=%RI64 bImm=%#x\n"
9393 "%s -> efl=%#08x %RU32\n"
9394 "%s expected %#08x %RU32%s%s\n",
9395 iTest, iVar ? "/n" : "", paTests[iTest].fEFlagsIn,
9396 FormatU128(&paTests[iTest].InVal1.uXmm), paTests[iTest].u64Rax,
9397 FormatU128(&paTests[iTest].InVal2.uXmm), paTests[iTest].u64Rdx,
9398 paTests[iTest].bImm,
9399 iVar ? " " : "", fEFlags, u32EcxOut,
9400 iVar ? " " : "", paTests[iTest].fEFlagsOut, paTests[iTest].u32EcxOut,
9401 EFlagsDiff(fEFlags, paTests[iTest].fEFlagsOut),
9402 (u32EcxOut != paTests[iTest].u32EcxOut) ? " - val" : "");
9403 }
9404 }
9405 }
9406}
9407
9408
9409TYPEDEF_SUBTEST_TYPE(SSE_PCMPESTRM_T, SSE_PCMPESTRM_TEST_T, PFNIEMAIMPLPCMPESTRMU128IMM8);
9410
9411static SSE_PCMPESTRM_T g_aSsePcmpestrm[] =
9412{
9413 ENTRY_BIN_SSE_OPT(pcmpestrm_u128),
9414};
9415
9416#ifdef TSTIEMAIMPL_WITH_GENERATOR
9417static RTEXITCODE SseComparePcmpestrmGenerate(const char *pszDataFileFmt, uint32_t cTests)
9418{
9419 cTests = RT_MAX(192, cTests); /* there are 144 standard input variations */
9420
9421 static struct { RTUINT128U uSrc1; RTUINT128U uSrc2; } const s_aSpecials[] =
9422 {
9423 { RTUINT128_INIT_C(0, 0), RTUINT128_INIT_C(0, 0) },
9424 /** @todo More specials. */
9425 };
9426
9427 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSsePcmpestrm); iFn++)
9428 {
9429 PFNIEMAIMPLPCMPESTRMU128IMM8 const pfn = g_aSsePcmpestrm[iFn].pfnNative ? g_aSsePcmpestrm[iFn].pfnNative : g_aSsePcmpestrm[iFn].pfn;
9430
9431 IEMBINARYOUTPUT BinOut;
9432 AssertReturn(GenerateBinaryOpen(&BinOut, pszDataFileFmt, g_aSsePcmpestrm[iFn].pszName), RTEXITCODE_FAILURE);
9433
9434 for (uint32_t iTest = 0; iTest < cTests + RT_ELEMENTS(s_aSpecials); iTest += 1)
9435 {
9436 SSE_PCMPESTRM_TEST_T TestData; RT_ZERO(TestData);
9437
9438 TestData.InVal1.uXmm = iTest < cTests ? RandU128() : s_aSpecials[iTest - cTests].uSrc1;
9439 TestData.InVal2.uXmm = iTest < cTests ? RandU128() : s_aSpecials[iTest - cTests].uSrc2;
9440
9441 for (int64_t i64Rax = -20; i64Rax < 20; i64Rax += 20)
9442 for (int64_t i64Rdx = -20; i64Rdx < 20; i64Rdx += 20)
9443 {
9444 TestData.u64Rax = (uint64_t)i64Rax;
9445 TestData.u64Rdx = (uint64_t)i64Rdx;
9446
9447 IEMPCMPESTRXSRC TestVal;
9448 TestVal.uSrc1 = TestData.InVal1.uXmm;
9449 TestVal.uSrc2 = TestData.InVal2.uXmm;
9450 TestVal.u64Rax = TestData.u64Rax;
9451 TestVal.u64Rdx = TestData.u64Rdx;
9452
9453 uint32_t const fEFlagsIn = RandEFlags();
9454 for (uint16_t u16Imm = 0; u16Imm < 256; u16Imm++)
9455 {
9456 uint32_t fEFlagsOut = fEFlagsIn;
9457 pfn(&TestData.OutVal.uXmm, &fEFlagsOut, &TestVal, (uint8_t)u16Imm);
9458 TestData.fEFlagsIn = fEFlagsIn;
9459 TestData.fEFlagsOut = fEFlagsOut;
9460 TestData.bImm = (uint8_t)u16Imm;
9461 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
9462 }
9463
9464                /* Repeat the test with the second input identical to the first. */
9465 TestData.InVal2.uXmm = TestData.InVal1.uXmm;
9466 TestVal.uSrc1 = TestData.InVal1.uXmm;
9467 TestVal.uSrc2 = TestData.InVal2.uXmm;
9468
9469 for (uint16_t u16Imm = 0; u16Imm < 256; u16Imm++)
9470 {
9471 uint32_t fEFlagsOut = fEFlagsIn;
9472 pfn(&TestData.OutVal.uXmm, &fEFlagsOut, &TestVal, (uint8_t)u16Imm);
9473 TestData.fEFlagsIn = fEFlagsIn;
9474 TestData.fEFlagsOut = fEFlagsOut;
9475 TestData.bImm = (uint8_t)u16Imm;
9476 GenerateBinaryWrite(&BinOut, &TestData, sizeof(TestData));
9477 }
9478 }
9479 }
9480 AssertReturn(GenerateBinaryClose(&BinOut), RTEXITCODE_FAILURE);
9481 }
9482
9483 return RTEXITCODE_SUCCESS;
9484}
9485#endif
9486
9487static void SseComparePcmpestrmTest(void)
9488{
9489 X86FXSTATE State;
9490 RT_ZERO(State);
9491
9492 for (size_t iFn = 0; iFn < RT_ELEMENTS(g_aSsePcmpestrm); iFn++)
9493 {
9494 if (!SUBTEST_CHECK_IF_ENABLED_AND_DECOMPRESS(g_aSsePcmpestrm[iFn]))
9495 continue;
9496
9497 uint32_t const cTests = *g_aSsePcmpestrm[iFn].pcTests;
9498 SSE_PCMPESTRM_TEST_T const * const paTests = g_aSsePcmpestrm[iFn].paTests;
9499 PFNIEMAIMPLPCMPESTRMU128IMM8 pfn = g_aSsePcmpestrm[iFn].pfn;
9500 uint32_t const cVars = COUNT_VARIATIONS(g_aSsePcmpestrm[iFn]);
9501 if (!cTests) RTTestSkipped(g_hTest, "no tests");
9502 for (uint32_t iVar = 0; iVar < cVars; iVar++)
9503 {
9504 for (uint32_t iTest = 0; iTest < cTests / sizeof(*paTests); iTest++)
9505 {
9506 IEMPCMPESTRXSRC TestVal;
9507 TestVal.uSrc1 = paTests[iTest].InVal1.uXmm;
9508 TestVal.uSrc2 = paTests[iTest].InVal2.uXmm;
9509 TestVal.u64Rax = paTests[iTest].u64Rax;
9510 TestVal.u64Rdx = paTests[iTest].u64Rdx;
9511
9512 uint32_t fEFlags = paTests[iTest].fEFlagsIn;
9513 RTUINT128U OutVal;
9514 pfn(&OutVal, &fEFlags, &TestVal, paTests[iTest].bImm);
9515 if ( fEFlags != paTests[iTest].fEFlagsOut
9516 || OutVal.s.Hi != paTests[iTest].OutVal.uXmm.s.Hi
9517 || OutVal.s.Lo != paTests[iTest].OutVal.uXmm.s.Lo)
9518 RTTestFailed(g_hTest, "#%04u%s: efl=%#08x in1=%s rax1=%RI64 in2=%s rdx2=%RI64 bImm=%#x\n"
9519 "%s -> efl=%#08x %s\n"
9520 "%s expected %#08x %s%s%s\n",
9521 iTest, iVar ? "/n" : "", paTests[iTest].fEFlagsIn,
9522 FormatU128(&paTests[iTest].InVal1.uXmm), paTests[iTest].u64Rax,
9523 FormatU128(&paTests[iTest].InVal2.uXmm), paTests[iTest].u64Rdx,
9524 paTests[iTest].bImm,
9525 iVar ? " " : "", fEFlags, FormatU128(&OutVal),
9526 iVar ? " " : "", paTests[iTest].fEFlagsOut, FormatU128(&paTests[iTest].OutVal.uXmm),
9527 EFlagsDiff(fEFlags, paTests[iTest].fEFlagsOut),
9528 ( OutVal.s.Hi != paTests[iTest].OutVal.uXmm.s.Hi
9529 || OutVal.s.Lo != paTests[iTest].OutVal.uXmm.s.Lo) ? " - val" : "");
9530 }
9531 }
9532 }
9533}
9534
9535
9536
9537int main(int argc, char **argv)
9538{
9539 int rc = RTR3InitExe(argc, &argv, 0);
9540 if (RT_FAILURE(rc))
9541 return RTMsgInitFailure(rc);
9542
9543 /*
9544     * Determine the host CPU.
9545 * If not using the IEMAllAImpl.asm code, this will be set to Intel.
9546 */
9547#if (defined(RT_ARCH_X86) || defined(RT_ARCH_AMD64)) && !defined(IEM_WITHOUT_ASSEMBLY)
9548 g_idxCpuEflFlavour = ASMIsAmdCpu() || ASMIsHygonCpu()
9549 ? IEMTARGETCPU_EFL_BEHAVIOR_AMD
9550 : IEMTARGETCPU_EFL_BEHAVIOR_INTEL;
9551#else
9552 g_idxCpuEflFlavour = IEMTARGETCPU_EFL_BEHAVIOR_INTEL;
9553#endif
9554
9555 /*
9556 * Parse arguments.
9557 */
9558 enum { kModeNotSet, kModeTest, kModeGenerate, kModeDump }
9559 enmMode = kModeNotSet;
9560 bool fInt = true;
9561 bool fFpuLdSt = true;
9562 bool fFpuBinary1 = true;
9563 bool fFpuBinary2 = true;
9564 bool fFpuOther = true;
9565 bool fCpuData = true;
9566 bool fCommonData = true;
9567 bool fSseFpBinary = true;
9568 bool fSseFpOther = true;
9569 bool fSsePcmpxstrx = true;
9570 uint32_t const cDefaultTests = 96;
9571 uint32_t cTests = cDefaultTests;
9572 RTGETOPTDEF const s_aOptions[] =
9573 {
9574 // mode:
9575 { "--generate", 'g', RTGETOPT_REQ_NOTHING },
9576 { "--dump", 'G', RTGETOPT_REQ_NOTHING },
9577 { "--test", 't', RTGETOPT_REQ_NOTHING },
9578 { "--benchmark", 'b', RTGETOPT_REQ_NOTHING },
9579 // test selection (both)
9580 { "--all", 'a', RTGETOPT_REQ_NOTHING },
9581 { "--none", 'z', RTGETOPT_REQ_NOTHING },
9582 { "--zap", 'z', RTGETOPT_REQ_NOTHING },
9583 { "--fpu-ld-st", 'F', RTGETOPT_REQ_NOTHING }, /* FPU stuff is upper case */
9584 { "--fpu-load-store", 'F', RTGETOPT_REQ_NOTHING },
9585 { "--fpu-binary-1", 'B', RTGETOPT_REQ_NOTHING },
9586 { "--fpu-binary-2", 'P', RTGETOPT_REQ_NOTHING },
9587 { "--fpu-other", 'O', RTGETOPT_REQ_NOTHING },
9588 { "--sse-fp-binary", 'S', RTGETOPT_REQ_NOTHING },
9589 { "--sse-fp-other", 'T', RTGETOPT_REQ_NOTHING },
9590 { "--sse-pcmpxstrx", 'C', RTGETOPT_REQ_NOTHING },
9591 { "--int", 'i', RTGETOPT_REQ_NOTHING },
9592 { "--include", 'I', RTGETOPT_REQ_STRING },
9593 { "--exclude", 'X', RTGETOPT_REQ_STRING },
9594 // generation parameters
9595 { "--common", 'm', RTGETOPT_REQ_NOTHING },
9596 { "--cpu", 'c', RTGETOPT_REQ_NOTHING },
9597 { "--number-of-tests", 'n', RTGETOPT_REQ_UINT32 },
9598 { "--verbose", 'v', RTGETOPT_REQ_NOTHING },
9599 { "--quiet", 'q', RTGETOPT_REQ_NOTHING },
9600 };
9601
9602 RTGETOPTSTATE State;
9603 rc = RTGetOptInit(&State, argc, argv, s_aOptions, RT_ELEMENTS(s_aOptions), 1, 0);
9604 AssertRCReturn(rc, RTEXITCODE_FAILURE);
9605
9606 RTGETOPTUNION ValueUnion;
9607 while ((rc = RTGetOpt(&State, &ValueUnion)))
9608 {
9609 switch (rc)
9610 {
9611 case 'g':
9612 enmMode = kModeGenerate;
9613 g_cPicoSecBenchmark = 0;
9614 break;
9615 case 'G':
9616 enmMode = kModeDump;
9617 g_cPicoSecBenchmark = 0;
9618 break;
9619 case 't':
9620 enmMode = kModeTest;
9621 g_cPicoSecBenchmark = 0;
9622 break;
9623 case 'b':
9624 enmMode = kModeTest;
9625                g_cPicoSecBenchmark += RT_NS_1SEC / 2 * UINT64_C(1000); /* half a second in picoseconds */
9626 break;
9627
9628 case 'a':
9629 fCpuData = true;
9630 fCommonData = true;
9631 fInt = true;
9632 fFpuLdSt = true;
9633 fFpuBinary1 = true;
9634 fFpuBinary2 = true;
9635 fFpuOther = true;
9636 fSseFpBinary = true;
9637 fSseFpOther = true;
9638 fSsePcmpxstrx = true;
9639 break;
9640 case 'z':
9641 fCpuData = false;
9642 fCommonData = false;
9643 fInt = false;
9644 fFpuLdSt = false;
9645 fFpuBinary1 = false;
9646 fFpuBinary2 = false;
9647 fFpuOther = false;
9648 fSseFpBinary = false;
9649 fSseFpOther = false;
9650 fSsePcmpxstrx = false;
9651 break;
9652
9653 case 'F':
9654 fFpuLdSt = true;
9655 break;
9656 case 'O':
9657 fFpuOther = true;
9658 break;
9659 case 'B':
9660 fFpuBinary1 = true;
9661 break;
9662 case 'P':
9663 fFpuBinary2 = true;
9664 break;
9665 case 'S':
9666 fSseFpBinary = true;
9667 break;
9668 case 'T':
9669 fSseFpOther = true;
9670 break;
9671 case 'C':
9672 fSsePcmpxstrx = true;
9673 break;
9674 case 'i':
9675 fInt = true;
9676 break;
9677
9678 case 'I':
9679 if (g_cIncludeTestPatterns >= RT_ELEMENTS(g_apszIncludeTestPatterns))
9680 return RTMsgErrorExit(RTEXITCODE_SYNTAX, "Too many include patterns (max %zu)",
9681 RT_ELEMENTS(g_apszIncludeTestPatterns));
9682 g_apszIncludeTestPatterns[g_cIncludeTestPatterns++] = ValueUnion.psz;
9683 break;
9684 case 'X':
9685 if (g_cExcludeTestPatterns >= RT_ELEMENTS(g_apszExcludeTestPatterns))
9686 return RTMsgErrorExit(RTEXITCODE_SYNTAX, "Too many exclude patterns (max %zu)",
9687 RT_ELEMENTS(g_apszExcludeTestPatterns));
9688 g_apszExcludeTestPatterns[g_cExcludeTestPatterns++] = ValueUnion.psz;
9689 break;
9690
9691 case 'm':
9692 fCommonData = true;
9693 break;
9694 case 'c':
9695 fCpuData = true;
9696 break;
9697 case 'n':
9698 cTests = ValueUnion.u32;
9699 break;
9700
9701 case 'q':
9702 g_cVerbosity = 0;
9703 break;
9704 case 'v':
9705 g_cVerbosity++;
9706 break;
9707
9708 case 'h':
9709 RTPrintf("usage: %s <-g|-t> [options]\n"
9710 "\n"
9711 "Mode:\n"
9712 " -g, --generate\n"
9713 " Generate test data.\n"
9714 " -t, --test\n"
9715 " Execute tests.\n"
9716 " -b, --benchmark\n"
9717 " Execute tests and do 1/2 seconds of benchmarking.\n"
9718 " Repeating the option increases the benchmark duration by 0.5 seconds.\n"
9719 "\n"
9720 "Test selection (both modes):\n"
9721 " -a, --all\n"
9722 " Enable all tests and generated test data. (default)\n"
9723 " -z, --zap, --none\n"
9724 " Disable all tests and test data types.\n"
9725 " -i, --int\n"
9726 " Enable non-FPU tests.\n"
9727 " -F, --fpu-ld-st\n"
9728 " Enable FPU load and store tests.\n"
9729 " -B, --fpu-binary-1\n"
9730 " Enable FPU binary 80-bit FP tests.\n"
9731 " -P, --fpu-binary-2\n"
9732 " Enable FPU binary 64- and 32-bit FP tests.\n"
9733 " -O, --fpu-other\n"
9734                      "    Enable other FPU tests.\n"
9735 " -S, --sse-fp-binary\n"
9736 " Enable SSE binary 64- and 32-bit FP tests.\n"
9737 " -T, --sse-fp-other\n"
9738 " Enable misc SSE 64- and 32-bit FP tests.\n"
9739 " -C, --sse-pcmpxstrx\n"
9740 " Enable SSE pcmpxstrx tests.\n"
9741                      " -I, --include=<test-pattern>\n"
9742 " Enable tests matching the given pattern.\n"
9743                      " -X, --exclude=<test-pattern>\n"
9744 " Skip tests matching the given pattern (overrides --include).\n"
9745 "\n"
9746 "Generation:\n"
9747 " -m, --common\n"
9748 " Enable generating common test data.\n"
9749                      " -c, --cpu\n"
9750 " Enable generating CPU specific test data.\n"
9751                      " -n, --number-of-tests <count>\n"
9752 " Number of tests to generate. Default: %u\n"
9753 "\n"
9754 "Other:\n"
9755 " -v, --verbose\n"
9756 " -q, --quiet\n"
9757 " Noise level. Default: --quiet\n"
9758 , argv[0], cDefaultTests);
9759 return RTEXITCODE_SUCCESS;
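            /* Illustrative invocations (assuming the binary is named tstIEMAImpl):
             *   tstIEMAImpl -t                    - run all tests
             *   tstIEMAImpl -t -z -C              - run only the SSE pcmpxstrx tests
             *   tstIEMAImpl -g -n 1024            - generate test data, 1024 tests per function
             *   tstIEMAImpl -t -I cvtpd2pi_u128   - run just the tests matching that pattern */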
9760 default:
9761 return RTGetOptPrintError(rc, &ValueUnion);
9762 }
9763 }
9764
9765 /*
9766 * Generate data?
9767 */
9768 if (enmMode == kModeGenerate)
9769 {
9770#ifdef TSTIEMAIMPL_WITH_GENERATOR
9771 char szCpuDesc[256] = {0};
9772 RTMpGetDescription(NIL_RTCPUID, szCpuDesc, sizeof(szCpuDesc));
9773 const char * const pszCpuType = g_idxCpuEflFlavour == IEMTARGETCPU_EFL_BEHAVIOR_AMD ? "Amd" : "Intel";
9774# if defined(RT_OS_WINDOWS) || defined(RT_OS_OS2)
9775 const char * const pszBitBucket = "NUL";
9776# else
9777 const char * const pszBitBucket = "/dev/null";
9778# endif
9779
9780 if (cTests == 0)
9781 cTests = cDefaultTests;
9782 g_cZeroDstTests = RT_MIN(cTests / 16, 32);
9783 g_cZeroSrcTests = g_cZeroDstTests * 2;
9784
9785 if (fInt)
9786 {
9787 const char * const apszNameFmts[] =
9788 {
9789 /*[IEMTARGETCPU_EFL_BEHAVIOR_NATIVE] =*/ fCommonData ? "tstIEMAImplDataInt-%s.bin.gz" : NULL,
9790 /*[IEMTARGETCPU_EFL_BEHAVIOR_INTEL] =*/ fCpuData ? "tstIEMAImplDataInt-%s-Intel.bin.gz" : NULL,
9791 /*[IEMTARGETCPU_EFL_BEHAVIOR_AMD] =*/ fCpuData ? "tstIEMAImplDataInt-%s-Amd.bin.gz" : NULL,
9792 };
9793 RTEXITCODE rcExit = BinU8Generate(cTests, apszNameFmts);
9794 if (rcExit == RTEXITCODE_SUCCESS)
9795 rcExit = BinU16Generate(cTests, apszNameFmts);
9796 if (rcExit == RTEXITCODE_SUCCESS)
9797 rcExit = BinU32Generate(cTests, apszNameFmts);
9798 if (rcExit == RTEXITCODE_SUCCESS)
9799 rcExit = BinU64Generate(cTests, apszNameFmts);
9800 if (rcExit == RTEXITCODE_SUCCESS)
9801 rcExit = ShiftDblGenerate(RT_MAX(cTests, 128), apszNameFmts);
9802 if (rcExit == RTEXITCODE_SUCCESS)
9803 rcExit = UnaryGenerate(cTests, apszNameFmts);
9804 if (rcExit == RTEXITCODE_SUCCESS)
9805 rcExit = ShiftGenerate(cTests, apszNameFmts);
9806 if (rcExit == RTEXITCODE_SUCCESS)
9807 rcExit = MulDivGenerate(cTests, apszNameFmts);
9808 if (rcExit != RTEXITCODE_SUCCESS)
9809 return rcExit;
9810 }
9811
9812 if (fFpuLdSt)
9813 {
9814 const char *pszDataFile = fCommonData ? "tstIEMAImplDataFpuLdSt.cpp" : pszBitBucket;
9815 PRTSTREAM pStrmData = GenerateOpenWithHdr(pszDataFile, szCpuDesc, NULL);
9816 const char *pszDataCpuFile = !fCpuData ? pszBitBucket : g_idxCpuEflFlavour == IEMTARGETCPU_EFL_BEHAVIOR_AMD
9817 ? "tstIEMAImplDataFpuLdSt-Amd.cpp" : "tstIEMAImplDataFpuLdSt-Intel.cpp";
9818 PRTSTREAM pStrmDataCpu = GenerateOpenWithHdr(pszDataCpuFile, szCpuDesc, pszCpuType);
9819 if (!pStrmData || !pStrmDataCpu)
9820 return RTEXITCODE_FAILURE;
9821
9822 FpuLdConstGenerate(pStrmData, cTests);
9823 FpuLdIntGenerate(pStrmData, cTests);
9824 FpuLdD80Generate(pStrmData, cTests);
9825 FpuStIntGenerate(pStrmData, pStrmDataCpu, cTests);
9826 FpuStD80Generate(pStrmData, cTests);
9827 uint32_t const cTests2 = RT_MAX(cTests, 384); /* need better coverage for the next ones. */
9828 FpuLdMemGenerate(pStrmData, cTests2);
9829 FpuStMemGenerate(pStrmData, cTests2);
9830
9831 RTEXITCODE rcExit = GenerateFooterAndClose(pStrmDataCpu, pszDataCpuFile,
9832 GenerateFooterAndClose(pStrmData, pszDataFile, RTEXITCODE_SUCCESS));
9833 if (rcExit != RTEXITCODE_SUCCESS)
9834 return rcExit;
9835 }
9836
9837 if (fFpuBinary1)
9838 {
9839 const char *pszDataFile = fCommonData ? "tstIEMAImplDataFpuBinary1.cpp" : pszBitBucket;
9840 PRTSTREAM pStrmData = GenerateOpenWithHdr(pszDataFile, szCpuDesc, NULL);
9841 const char *pszDataCpuFile = !fCpuData ? pszBitBucket : g_idxCpuEflFlavour == IEMTARGETCPU_EFL_BEHAVIOR_AMD
9842 ? "tstIEMAImplDataFpuBinary1-Amd.cpp" : "tstIEMAImplDataFpuBinary1-Intel.cpp";
9843 PRTSTREAM pStrmDataCpu = GenerateOpenWithHdr(pszDataCpuFile, szCpuDesc, pszCpuType);
9844 if (!pStrmData || !pStrmDataCpu)
9845 return RTEXITCODE_FAILURE;
9846
9847 FpuBinaryR80Generate(pStrmData, pStrmDataCpu, cTests);
9848 FpuBinaryFswR80Generate(pStrmData, cTests);
9849 FpuBinaryEflR80Generate(pStrmData, cTests);
9850
9851 RTEXITCODE rcExit = GenerateFooterAndClose(pStrmDataCpu, pszDataCpuFile,
9852 GenerateFooterAndClose(pStrmData, pszDataFile, RTEXITCODE_SUCCESS));
9853 if (rcExit != RTEXITCODE_SUCCESS)
9854 return rcExit;
9855 }
9856
9857 if (fFpuBinary2)
9858 {
9859 const char *pszDataFile = fCommonData ? "tstIEMAImplDataFpuBinary2.cpp" : pszBitBucket;
9860 PRTSTREAM pStrmData = GenerateOpenWithHdr(pszDataFile, szCpuDesc, NULL);
9861 const char *pszDataCpuFile = pszBitBucket; /*!fCpuData ? pszBitBucket : g_idxCpuEflFlavour == IEMTARGETCPU_EFL_BEHAVIOR_AMD
9862 ? "tstIEMAImplDataFpuBinary2-Amd.cpp" : "tstIEMAImplDataFpuBinary2-Intel.cpp"; */
9863 PRTSTREAM pStrmDataCpu = GenerateOpenWithHdr(pszDataCpuFile, szCpuDesc, pszCpuType);
9864 if (!pStrmData || !pStrmDataCpu)
9865 return RTEXITCODE_FAILURE;
9866
9867 FpuBinaryR64Generate(pStrmData, cTests);
9868 FpuBinaryR32Generate(pStrmData, cTests);
9869 FpuBinaryI32Generate(pStrmData, cTests);
9870 FpuBinaryI16Generate(pStrmData, cTests);
9871 FpuBinaryFswR64Generate(pStrmData, cTests);
9872 FpuBinaryFswR32Generate(pStrmData, cTests);
9873 FpuBinaryFswI32Generate(pStrmData, cTests);
9874 FpuBinaryFswI16Generate(pStrmData, cTests);
9875
9876 RTEXITCODE rcExit = GenerateFooterAndClose(pStrmDataCpu, pszDataCpuFile,
9877 GenerateFooterAndClose(pStrmData, pszDataFile, RTEXITCODE_SUCCESS));
9878 if (rcExit != RTEXITCODE_SUCCESS)
9879 return rcExit;
9880 }
9881
9882 if (fFpuOther)
9883 {
9884 const char *pszDataFile = fCommonData ? "tstIEMAImplDataFpuOther.cpp" : pszBitBucket;
9885 PRTSTREAM pStrmData = GenerateOpenWithHdr(pszDataFile, szCpuDesc, NULL);
9886 const char *pszDataCpuFile = !fCpuData ? pszBitBucket : g_idxCpuEflFlavour == IEMTARGETCPU_EFL_BEHAVIOR_AMD
9887 ? "tstIEMAImplDataFpuOther-Amd.cpp" : "tstIEMAImplDataFpuOther-Intel.cpp";
9888 PRTSTREAM pStrmDataCpu = GenerateOpenWithHdr(pszDataCpuFile, szCpuDesc, pszCpuType);
9889 if (!pStrmData || !pStrmDataCpu)
9890 return RTEXITCODE_FAILURE;
9891
9892 FpuUnaryR80Generate(pStrmData, pStrmDataCpu, cTests);
9893 FpuUnaryFswR80Generate(pStrmData, pStrmDataCpu, cTests);
9894 FpuUnaryTwoR80Generate(pStrmData, pStrmDataCpu, cTests);
9895
9896 RTEXITCODE rcExit = GenerateFooterAndClose(pStrmDataCpu, pszDataCpuFile,
9897 GenerateFooterAndClose(pStrmData, pszDataFile, RTEXITCODE_SUCCESS));
9898 if (rcExit != RTEXITCODE_SUCCESS)
9899 return rcExit;
9900 }
9901
9902 if (fSseFpBinary)
9903 {
9904 const char * const pszDataFileFmt = fCommonData ? "tstIEMAImplDataSseBinary-%s.bin.gz" : NULL;
9905
9906 RTEXITCODE rcExit = SseBinaryR32Generate(pszDataFileFmt, cTests);
9907 if (rcExit == RTEXITCODE_SUCCESS)
9908 rcExit = SseBinaryR64Generate(pszDataFileFmt, cTests);
9909 if (rcExit == RTEXITCODE_SUCCESS)
9910 rcExit = SseBinaryU128R32Generate(pszDataFileFmt, cTests);
9911 if (rcExit == RTEXITCODE_SUCCESS)
9912 rcExit = SseBinaryU128R64Generate(pszDataFileFmt, cTests);
9913
9914 if (rcExit == RTEXITCODE_SUCCESS)
9915 rcExit = SseBinaryI32R64Generate(pszDataFileFmt, cTests);
9916 if (rcExit == RTEXITCODE_SUCCESS)
9917 rcExit = SseBinaryI64R64Generate(pszDataFileFmt, cTests);
9918 if (rcExit == RTEXITCODE_SUCCESS)
9919 rcExit = SseBinaryI32R32Generate(pszDataFileFmt, cTests);
9920 if (rcExit == RTEXITCODE_SUCCESS)
9921 rcExit = SseBinaryI64R32Generate(pszDataFileFmt, cTests);
9922
9923 if (rcExit == RTEXITCODE_SUCCESS)
9924 rcExit = SseBinaryR64I32Generate(pszDataFileFmt, cTests);
9925 if (rcExit == RTEXITCODE_SUCCESS)
9926 rcExit = SseBinaryR64I64Generate(pszDataFileFmt, cTests);
9927 if (rcExit == RTEXITCODE_SUCCESS)
9928 rcExit = SseBinaryR32I32Generate(pszDataFileFmt, cTests);
9929 if (rcExit == RTEXITCODE_SUCCESS)
9930 rcExit = SseBinaryR32I64Generate(pszDataFileFmt, cTests);
9931 if (rcExit != RTEXITCODE_SUCCESS)
9932 return rcExit;
9933 }
9934
9935 if (fSseFpOther)
9936 {
9937 const char * const pszDataFileFmtCmp = fCommonData ? "tstIEMAImplDataSseCompare-%s.bin.gz" : NULL;
9938 const char * const pszDataFileFmtConv = fCommonData ? "tstIEMAImplDataSseConvert-%s.bin.gz" : NULL;
9939
9940 RTEXITCODE rcExit = SseCompareEflR32R32Generate(pszDataFileFmtCmp, cTests);
9941 if (rcExit == RTEXITCODE_SUCCESS)
9942 rcExit = SseCompareEflR64R64Generate(pszDataFileFmtCmp, cTests);
9943 if (rcExit == RTEXITCODE_SUCCESS)
9944 rcExit = SseCompareF2XmmR32Imm8Generate(pszDataFileFmtCmp, cTests);
9945 if (rcExit == RTEXITCODE_SUCCESS)
9946 rcExit = SseCompareF2XmmR64Imm8Generate(pszDataFileFmtCmp, cTests);
9947 if (rcExit == RTEXITCODE_SUCCESS)
9948 rcExit = SseConvertXmmI32R32Generate(pszDataFileFmtConv, cTests);
9949 if (rcExit == RTEXITCODE_SUCCESS)
9950 rcExit = SseConvertXmmR32I32Generate(pszDataFileFmtConv, cTests);
9951 if (rcExit == RTEXITCODE_SUCCESS)
9952 rcExit = SseConvertXmmI32R64Generate(pszDataFileFmtConv, cTests);
9953 if (rcExit == RTEXITCODE_SUCCESS)
9954 rcExit = SseConvertXmmR64I32Generate(pszDataFileFmtConv, cTests);
9955 if (rcExit == RTEXITCODE_SUCCESS)
9956 rcExit = SseConvertMmXmmGenerate(pszDataFileFmtConv, cTests);
9957 if (rcExit == RTEXITCODE_SUCCESS)
9958 rcExit = SseConvertXmmR32MmGenerate(pszDataFileFmtConv, cTests);
9959 if (rcExit == RTEXITCODE_SUCCESS)
9960 rcExit = SseConvertXmmR64MmGenerate(pszDataFileFmtConv, cTests);
9961 if (rcExit == RTEXITCODE_SUCCESS)
9962 rcExit = SseConvertMmI32XmmR32Generate(pszDataFileFmtConv, cTests);
9963 if (rcExit != RTEXITCODE_SUCCESS)
9964 return rcExit;
9965 }
9966
9967 if (fSsePcmpxstrx)
9968 {
9969 const char * const pszDataFileFmtCmp = fCommonData ? "tstIEMAImplDataSsePcmpxstrx-%s.bin.gz" : NULL;
9970
9971 RTEXITCODE rcExit = SseComparePcmpistriGenerate(pszDataFileFmtCmp, cTests);
9972 if (rcExit == RTEXITCODE_SUCCESS)
9973 rcExit = SseComparePcmpistrmGenerate(pszDataFileFmtCmp, cTests);
9974 if (rcExit == RTEXITCODE_SUCCESS)
9975 rcExit = SseComparePcmpestriGenerate(pszDataFileFmtCmp, cTests);
9976 if (rcExit == RTEXITCODE_SUCCESS)
9977 rcExit = SseComparePcmpestrmGenerate(pszDataFileFmtCmp, cTests);
9978 if (rcExit != RTEXITCODE_SUCCESS)
9979 return rcExit;
9980 }
9981
9982 return RTEXITCODE_SUCCESS;
9983#else
9984 return RTMsgErrorExitFailure("Test data generator not compiled in!");
9985#endif
9986 }
9987
9988 /*
9989 * Dump tables.
9990 */
9991 if (enmMode == kModeDump)
9992 {
9993#ifdef TSTIEMAIMPL_WITH_GENERATOR
9994 if (fInt)
9995 {
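                /* One output name format per EFLAGS behaviour flavour, indexed as the
                   inline comments indicate; entries left NULL because the corresponding
                   data set was not requested are presumably skipped by the dumpers. */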
9996 const char * const apszNameFmts[] =
9997 {
9998 /*[IEMTARGETCPU_EFL_BEHAVIOR_NATIVE] =*/ fCommonData ? "tstIEMAImplDataInt-%s.bin.gz" : NULL,
9999 /*[IEMTARGETCPU_EFL_BEHAVIOR_INTEL] =*/ fCpuData ? "tstIEMAImplDataInt-%s-Intel.bin.gz" : NULL,
10000 /*[IEMTARGETCPU_EFL_BEHAVIOR_AMD] =*/ fCpuData ? "tstIEMAImplDataInt-%s-Amd.bin.gz" : NULL,
10001 };
10002 RTEXITCODE rcExit = BinU8DumpAll(apszNameFmts);
10003 if (rcExit == RTEXITCODE_SUCCESS)
10004 rcExit = BinU16DumpAll(apszNameFmts);
10005 if (rcExit == RTEXITCODE_SUCCESS)
10006 rcExit = BinU32DumpAll(apszNameFmts);
10007 if (rcExit == RTEXITCODE_SUCCESS)
10008 rcExit = BinU64DumpAll(apszNameFmts);
10009 if (rcExit == RTEXITCODE_SUCCESS)
10010 rcExit = ShiftDblDumpAll(apszNameFmts);
10011 if (rcExit == RTEXITCODE_SUCCESS)
10012 rcExit = UnaryDumpAll(apszNameFmts);
10013 if (rcExit == RTEXITCODE_SUCCESS)
10014 rcExit = ShiftDumpAll(apszNameFmts);
10015 if (rcExit == RTEXITCODE_SUCCESS)
10016 rcExit = MulDivDumpAll(apszNameFmts);
10017 if (rcExit != RTEXITCODE_SUCCESS)
10018 return rcExit;
10019 }
10020
10021 return RTEXITCODE_SUCCESS;
10022#else
10023 return RTMsgErrorExitFailure("Test data generator not compiled in!");
10024#endif
10025 }
10026
10027
10028 /*
10029	 * Do testing. Currently disabled by default as the data needs to be checked
10030	 * on both Intel and AMD systems first.
10031 */
10032 rc = RTTestCreate("tstIEMAimpl", &g_hTest);
10033 AssertRCReturn(rc, RTEXITCODE_FAILURE);
10034 if (enmMode == kModeTest)
10035 {
10036 RTTestBanner(g_hTest);
10037
10038 /* Allocate guarded memory for use in the tests. */
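        /* Each variable lives in its own guarded allocation, aligned to its own size,
           so stray accesses by the assembly workers fault immediately; fHead=false
           presumably puts the guard page after the block rather than in front of it. */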
10039#define ALLOC_GUARDED_VAR(a_puVar) do { \
10040 rc = RTTestGuardedAlloc(g_hTest, sizeof(*a_puVar), sizeof(*a_puVar), false /*fHead*/, (void **)&a_puVar); \
10041 if (RT_FAILURE(rc)) RTTestFailed(g_hTest, "Failed to allocate guarded mem: " #a_puVar); \
10042 } while (0)
10043 ALLOC_GUARDED_VAR(g_pu8);
10044 ALLOC_GUARDED_VAR(g_pu16);
10045 ALLOC_GUARDED_VAR(g_pu32);
10046 ALLOC_GUARDED_VAR(g_pu64);
10047 ALLOC_GUARDED_VAR(g_pu128);
10048 ALLOC_GUARDED_VAR(g_pu8Two);
10049 ALLOC_GUARDED_VAR(g_pu16Two);
10050 ALLOC_GUARDED_VAR(g_pu32Two);
10051 ALLOC_GUARDED_VAR(g_pu64Two);
10052 ALLOC_GUARDED_VAR(g_pu128Two);
10053 ALLOC_GUARDED_VAR(g_pfEfl);
10054 if (RTTestErrorCount(g_hTest) == 0)
10055 {
10056 if (fInt)
10057 {
10058 BinU8Test();
10059 BinU16Test();
10060 BinU32Test();
10061 BinU64Test();
10062 XchgTest();
10063 XaddTest();
10064 CmpXchgTest();
10065 CmpXchg8bTest();
10066 CmpXchg16bTest();
10067 ShiftDblTest();
10068 UnaryTest();
10069 ShiftTest();
10070 MulDivTest();
10071 BswapTest();
10072 }
10073
10074 if (fFpuLdSt)
10075 {
10076 FpuLoadConstTest();
10077 FpuLdMemTest();
10078 FpuLdIntTest();
10079 FpuLdD80Test();
10080 FpuStMemTest();
10081 FpuStIntTest();
10082 FpuStD80Test();
10083 }
10084
10085 if (fFpuBinary1)
10086 {
10087 FpuBinaryR80Test();
10088 FpuBinaryFswR80Test();
10089 FpuBinaryEflR80Test();
10090 }
10091
10092 if (fFpuBinary2)
10093 {
10094 FpuBinaryR64Test();
10095 FpuBinaryR32Test();
10096 FpuBinaryI32Test();
10097 FpuBinaryI16Test();
10098 FpuBinaryFswR64Test();
10099 FpuBinaryFswR32Test();
10100 FpuBinaryFswI32Test();
10101 FpuBinaryFswI16Test();
10102 }
10103
10104 if (fFpuOther)
10105 {
10106 FpuUnaryR80Test();
10107 FpuUnaryFswR80Test();
10108 FpuUnaryTwoR80Test();
10109 }
10110
10111 if (fSseFpBinary)
10112 {
10113 SseBinaryR32Test();
10114 SseBinaryR64Test();
10115 SseBinaryU128R32Test();
10116 SseBinaryU128R64Test();
10117
10118 SseBinaryI32R64Test();
10119 SseBinaryI64R64Test();
10120 SseBinaryI32R32Test();
10121 SseBinaryI64R32Test();
10122
10123 SseBinaryR64I32Test();
10124 SseBinaryR64I64Test();
10125 SseBinaryR32I32Test();
10126 SseBinaryR32I64Test();
10127 }
10128
10129 if (fSseFpOther)
10130 {
10131 SseCompareEflR32R32Test();
10132 SseCompareEflR64R64Test();
10134 SseCompareF2XmmR32Imm8Test();
10135 SseCompareF2XmmR64Imm8Test();
10136 SseConvertXmmI32R32Test();
10137 SseConvertXmmR32I32Test();
10138 SseConvertXmmI32R64Test();
10139 SseConvertXmmR64I32Test();
10140 SseConvertMmXmmTest();
10141 SseConvertXmmR32MmTest();
10142 SseConvertXmmR64MmTest();
10143 SseConvertMmI32XmmR32Test();
10144 }
10145
10146 if (fSsePcmpxstrx)
10147 {
10148 SseComparePcmpistriTest();
10149 SseComparePcmpistrmTest();
10150 SseComparePcmpestriTest();
10151 SseComparePcmpestrmTest();
10152 }
10153 }
10154 return RTTestSummaryAndDestroy(g_hTest);
10155 }
10156 return RTTestSkipAndDestroy(g_hTest, "unfinished testcase");
10157}
10158