VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllN8veRecompiler.cpp@101181

Last change on this file since 101181 was 101181, checked in by vboxsync, 19 months ago

VMM/IEM: We don't need to emit code for setting up parameters that the threaded worker functions don't use, so generate a table with the parameter counts for all of the threaded functions. bugref:10370

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 63.6 KB
1/* $Id: IEMAllN8veRecompiler.cpp 101181 2023-09-19 23:14:45Z vboxsync $ */
2/** @file
3 * IEM - Native Recompiler
4 *
5 * Logging group IEM_RE_NATIVE assignments:
6 * - Level 1 (Log) : ...
7 * - Flow (LogFlow) : ...
8 * - Level 2 (Log2) : ...
9 * - Level 3 (Log3) : ...
10 * - Level 4 (Log4) : ...
11 * - Level 5 (Log5) : ...
12 * - Level 6 (Log6) : ...
13 * - Level 7 (Log7) : ...
14 * - Level 8 (Log8) : ...
15 * - Level 9 (Log9) : ...
16 * - Level 10 (Log10): ...
17 * - Level 11 (Log11): ...
18 * - Level 12 (Log12): ...
19 */
20
21/*
22 * Copyright (C) 2023 Oracle and/or its affiliates.
23 *
24 * This file is part of VirtualBox base platform packages, as
25 * available from https://www.virtualbox.org.
26 *
27 * This program is free software; you can redistribute it and/or
28 * modify it under the terms of the GNU General Public License
29 * as published by the Free Software Foundation, in version 3 of the
30 * License.
31 *
32 * This program is distributed in the hope that it will be useful, but
33 * WITHOUT ANY WARRANTY; without even the implied warranty of
34 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
35 * General Public License for more details.
36 *
37 * You should have received a copy of the GNU General Public License
38 * along with this program; if not, see <https://www.gnu.org/licenses>.
39 *
40 * SPDX-License-Identifier: GPL-3.0-only
41 */
42
43
44/*********************************************************************************************************************************
45* Header Files *
46*********************************************************************************************************************************/
47#define LOG_GROUP LOG_GROUP_IEM_RE_THREADED
48#define IEM_WITH_OPAQUE_DECODER_STATE
49#define VMCPU_INCL_CPUM_GST_CTX
50#include <VBox/vmm/iem.h>
51#include <VBox/vmm/cpum.h>
52#include "IEMInternal.h"
53#include <VBox/vmm/vmcc.h>
54#include <VBox/log.h>
55#include <VBox/err.h>
56#include <VBox/param.h>
57#include <iprt/assert.h>
58#include <iprt/heap.h>
59#include <iprt/mem.h>
60#include <iprt/string.h>
61
62#ifdef RT_OS_WINDOWS
63# include <iprt/formats/pecoff.h> /* this is incompatible with windows.h, thus: */
64extern "C" DECLIMPORT(uint8_t) __cdecl RtlAddFunctionTable(void *pvFunctionTable, uint32_t cEntries, uintptr_t uBaseAddress);
65extern "C" DECLIMPORT(uint8_t) __cdecl RtlDelFunctionTable(void *pvFunctionTable);
66#else
67# include <iprt/formats/dwarf.h>
68extern "C" void __register_frame_info(void *begin, void *ob); /* found no header for these two */
69#endif
70
71#include "IEMInline.h"
72#include "IEMThreadedFunctions.h"
73
74
75/*
76 * Narrow down configs here to avoid wasting time on unused configs.
77 * Note! Same checks in IEMAllThrdRecompiler.cpp.
78 */
79
80#ifndef IEM_WITH_CODE_TLB
81# error The code TLB must be enabled for the recompiler.
82#endif
83
84#ifndef IEM_WITH_DATA_TLB
85# error The data TLB must be enabled for the recompiler.
86#endif
87
88#ifndef IEM_WITH_SETJMP
89# error The setjmp approach must be enabled for the recompiler.
90#endif
91
92
93/*********************************************************************************************************************************
94* Defined Constants And Macros *
95*********************************************************************************************************************************/
96/** @name Stack Frame Layout
97 *
98 * @{ */
99/** The size of the area for stack variables and spills and stuff. */
100#define IEMNATIVE_FRAME_VAR_SIZE 0x40
101#ifdef RT_ARCH_AMD64
102/** Number of stack argument slots for calls made from the frame. */
103# define IEMNATIVE_FRAME_STACK_ARG_COUNT 4
104/** A stack alignment adjustment (between the non-volatile register pushes and
105 * the stack variable area, so the latter is better aligned). */
106# define IEMNATIVE_FRAME_ALIGN_SIZE 8
107/** Number of any shadow arguments (spill area) for calls we make. */
108# ifdef RT_OS_WINDOWS
109# define IEMNATIVE_FRAME_SHADOW_ARG_COUNT 4
110# else
111# define IEMNATIVE_FRAME_SHADOW_ARG_COUNT 0
112# endif
113
114/** Frame pointer (RBP) relative offset of the last push. */
115# ifdef RT_OS_WINDOWS
116# define IEMNATIVE_FP_OFF_LAST_PUSH (7 * -8)
117# else
118# define IEMNATIVE_FP_OFF_LAST_PUSH (5 * -8)
119# endif
120/** Frame pointer (RBP) relative offset of the stack variable area (the lowest
121 * address for it). */
122# define IEMNATIVE_FP_OFF_STACK_VARS (IEMNATIVE_FP_OFF_LAST_PUSH - IEMNATIVE_FRAME_ALIGN_SIZE - IEMNATIVE_FRAME_VAR_SIZE)
123/** Frame pointer (RBP) relative offset of the first stack argument for calls. */
124# define IEMNATIVE_FP_OFF_STACK_ARG0 (IEMNATIVE_FP_OFF_STACK_VARS - IEMNATIVE_FRAME_STACK_ARG_COUNT * 8)
125/** Frame pointer (RBP) relative offset of the second stack argument for calls. */
126# define IEMNATIVE_FP_OFF_STACK_ARG1 (IEMNATIVE_FP_OFF_STACK_ARG0 + 8)
127/** Frame pointer (RBP) relative offset of the third stack argument for calls. */
128# define IEMNATIVE_FP_OFF_STACK_ARG2 (IEMNATIVE_FP_OFF_STACK_ARG0 + 16)
129/** Frame pointer (RBP) relative offset of the fourth stack argument for calls. */
130# define IEMNATIVE_FP_OFF_STACK_ARG3 (IEMNATIVE_FP_OFF_STACK_ARG0 + 24)
131
132# ifdef RT_OS_WINDOWS
133/** Frame pointer (RBP) relative offset of the first incoming shadow argument. */
134# define IEMNATIVE_FP_OFF_IN_SHADOW_ARG0 (16)
135/** Frame pointer (RBP) relative offset of the second incoming shadow argument. */
136# define IEMNATIVE_FP_OFF_IN_SHADOW_ARG1 (24)
137/** Frame pointer (RBP) relative offset of the third incoming shadow argument. */
138# define IEMNATIVE_FP_OFF_IN_SHADOW_ARG2 (32)
139/** Frame pointer (RBP) relative offset of the fourth incoming shadow argument. */
140# define IEMNATIVE_FP_OFF_IN_SHADOW_ARG3 (40)
141# endif
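/*
 * For illustration (derived from the macros above; Windows/AMD64 configuration),
 * the frame laid out by iemNativeEmitProlog looks like this relative to RBP:
 *      RBP+0x10..+0x2f   incoming shadow (home) argument slots 0..3
 *      RBP+0x08          return address
 *      RBP+0x00          saved RBP
 *      RBP-0x38..-0x01   saved RBX, RSI, RDI, R12..R15 (last push at -0x38)
 *      RBP-0x40..-0x39   8 byte alignment adjustment
 *      RBP-0x80..-0x41   stack variable area (0x40 bytes)
 *      RBP-0xa0..-0x81   outgoing stack argument slots 0..3
 *      RBP-0xc0..-0xa1   shadow (home) space for callees; RSP points here
 */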
142
143#elif RT_ARCH_ARM64
144
145#else
146# error "port me"
147#endif
148/** @} */
149
150
151
152/*********************************************************************************************************************************
153* Executable Memory Allocator *
154*********************************************************************************************************************************/
155
156#if defined(IN_RING3) && !defined(RT_OS_WINDOWS)
157/**
158 * Per-chunk unwind info for non-windows hosts.
159 */
160typedef struct IEMEXECMEMCHUNKEHFRAME
161{
162 /** struct object storage area. */
163 uint8_t abObject[1024];
 164 /** The DWARF .eh_frame data for the chunk. */
165 uint8_t abEhFrame[512];
166} IEMEXECMEMCHUNKEHFRAME;
167/** Pointer to per-chunk unwind info for non-windows hosts. */
168typedef IEMEXECMEMCHUNKEHFRAME *PIEMEXECMEMCHUNKEHFRAME;
169#endif
170
171
172/**
173 * A chunk of executable memory.
174 */
175typedef struct IEMEXECMEMCHUNK
176{
177 /** The heap handle. */
178 RTHEAPSIMPLE hHeap;
179 /** Pointer to the chunk. */
180 void *pvChunk;
181#ifdef IN_RING3
182# ifdef RT_OS_WINDOWS
183 /** Pointer to the unwind information. This is allocated from hHeap on
184 * windows because (at least for AMD64) the UNWIND_INFO structure address
185 * in the RUNTIME_FUNCTION entry is an RVA and the chunk is the "image". */
186 void *pvUnwindInfo;
187# else
188 /** Exception handling frame information for proper unwinding during C++
189 * throws and (possibly) longjmp(). */
190 PIEMEXECMEMCHUNKEHFRAME pEhFrame;
191# endif
192#elif defined(IN_RING0)
193 /** Allocation handle. */
194 RTR0MEMOBJ hMemObj;
195#endif
196} IEMEXECMEMCHUNK;
197/** Pointer to a memory chunk. */
198typedef IEMEXECMEMCHUNK *PIEMEXECMEMCHUNK;
199
200
201/**
202 * Executable memory allocator for the native recompiler.
203 */
204typedef struct IEMEXECMEMALLOCATOR
205{
206 /** Magic value (IEMEXECMEMALLOCATOR_MAGIC). */
207 uint32_t uMagic;
208
209 /** The chunk size. */
210 uint32_t cbChunk;
211 /** The maximum number of chunks. */
212 uint32_t cMaxChunks;
213 /** The current number of chunks. */
214 uint32_t cChunks;
215 /** Hint where to start looking for available memory. */
216 uint32_t idxChunkHint;
217 /** Statistics: Current number of allocations. */
218 uint32_t cAllocations;
219
220 /** The total amount of memory available. */
221 uint64_t cbTotal;
222 /** Total amount of free memory. */
223 uint64_t cbFree;
224 /** Total amount of memory allocated. */
225 uint64_t cbAllocated;
226
 227 /** @name Tweaks to get 64 byte aligned allocations w/o unnecessary fragmentation.
228 * @{ */
229 /** The size of the heap internal block header. This is used to adjust the
 230 * requested memory size to make sure there is exactly enough room for a header at
231 * the end of the blocks we allocate before the next 64 byte alignment line. */
232 uint32_t cbHeapBlockHdr;
 233 /** The size of the initial heap allocation required to make sure the first
234 * allocation is correctly aligned. */
235 uint32_t cbHeapAlignTweak;
236 /** The alignment tweak allocation address. */
237 void *pvAlignTweak;
238 /** @} */
239
240 /** The allocation chunks. */
241 RT_FLEXIBLE_ARRAY_EXTENSION
242 IEMEXECMEMCHUNK aChunks[RT_FLEXIBLE_ARRAY];
243} IEMEXECMEMALLOCATOR;
244/** Pointer to an executable memory allocator. */
245typedef IEMEXECMEMALLOCATOR *PIEMEXECMEMALLOCATOR;
246
247/** Magic value for IEMEXECMEMALLOCATOR::uMagic (Scott Frederick Turow). */
248#define IEMEXECMEMALLOCATOR_MAGIC UINT32_C(0x19490412)
249
250
251#ifdef IN_RING3
252# ifdef RT_OS_WINDOWS
253
254/**
255 * Initializes the unwind info structures for windows hosts.
256 */
257static void *iemExecMemAllocatorInitAndRegisterUnwindInfoForChunk(PIEMEXECMEMALLOCATOR pExecMemAllocator,
258 RTHEAPSIMPLE hHeap, void *pvChunk)
259{
260 /*
261 * The AMD64 unwind opcodes.
262 *
263 * This is a program that starts with RSP after a RET instruction that
264 * ends up in recompiled code, and the operations we describe here will
265 * restore all non-volatile registers and bring RSP back to where our
266 * RET address is. This means it's reverse order from what happens in
267 * the prologue.
268 *
 269 * Note! Using a frame register approach here, both because we have one
 270 * and mainly because the UWOP_ALLOC_LARGE argument values
 271 * would be a pain to write initializers for. On the positive
 272 * side, we're impervious to changes in the stack variable
 273 * area and can deal with dynamic stack allocations if necessary.
274 */
275 static const IMAGE_UNWIND_CODE s_aOpcodes[] =
276 {
277 { { 16, IMAGE_AMD64_UWOP_SET_FPREG, 0 } }, /* RSP = RBP - FrameOffset * 10 (0x60) */
278 { { 16, IMAGE_AMD64_UWOP_ALLOC_SMALL, 0 } }, /* RSP += 8; */
279 { { 14, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_x15 } }, /* R15 = [RSP]; RSP += 8; */
280 { { 12, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_x14 } }, /* R14 = [RSP]; RSP += 8; */
281 { { 10, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_x13 } }, /* R13 = [RSP]; RSP += 8; */
282 { { 8, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_x12 } }, /* R12 = [RSP]; RSP += 8; */
283 { { 7, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_xDI } }, /* RDI = [RSP]; RSP += 8; */
284 { { 6, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_xSI } }, /* RSI = [RSP]; RSP += 8; */
285 { { 5, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_xBX } }, /* RBX = [RSP]; RSP += 8; */
286 { { 4, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_xBP } }, /* RBP = [RSP]; RSP += 8; */
287 };
288 union
289 {
290 IMAGE_UNWIND_INFO Info;
291 uint8_t abPadding[RT_UOFFSETOF(IMAGE_UNWIND_INFO, aOpcodes) + 16];
292 } s_UnwindInfo =
293 {
294 {
295 /* .Version = */ 1,
296 /* .Flags = */ 0,
297 /* .SizeOfProlog = */ 16, /* whatever */
298 /* .CountOfCodes = */ RT_ELEMENTS(s_aOpcodes),
299 /* .FrameRegister = */ X86_GREG_xBP,
300 /* .FrameOffset = */ (-IEMNATIVE_FP_OFF_LAST_PUSH + 8) / 16 /* we're off by one slot. sigh. */,
301 }
302 };
303 AssertCompile(-IEMNATIVE_FP_OFF_LAST_PUSH < 240 && -IEMNATIVE_FP_OFF_LAST_PUSH > 0);
304 AssertCompile((-IEMNATIVE_FP_OFF_LAST_PUSH & 0xf) == 8);
305
306 /*
307 * Calc how much space we need and allocate it off the exec heap.
308 */
309 unsigned const cFunctionEntries = 1;
310 unsigned const cbUnwindInfo = sizeof(s_aOpcodes) + RT_UOFFSETOF(IMAGE_UNWIND_INFO, aOpcodes);
311 unsigned const cbNeeded = sizeof(IMAGE_RUNTIME_FUNCTION_ENTRY) * cFunctionEntries + cbUnwindInfo;
312 unsigned const cbNeededAligned = RT_ALIGN_32(cbNeeded + pExecMemAllocator->cbHeapBlockHdr, 64)
313 - pExecMemAllocator->cbHeapBlockHdr;
314
315 PIMAGE_RUNTIME_FUNCTION_ENTRY const paFunctions = (PIMAGE_RUNTIME_FUNCTION_ENTRY)RTHeapSimpleAlloc(hHeap, cbNeededAligned,
316 32 /*cbAlignment*/);
317 AssertReturn(paFunctions, NULL);
318
319 /*
320 * Initialize the structures.
321 */
322 PIMAGE_UNWIND_INFO const pInfo = (PIMAGE_UNWIND_INFO)&paFunctions[cFunctionEntries];
323
324 paFunctions[0].BeginAddress = 0;
325 paFunctions[0].EndAddress = pExecMemAllocator->cbChunk;
326 paFunctions[0].UnwindInfoAddress = (uint32_t)((uintptr_t)pInfo - (uintptr_t)pvChunk);
327
328 memcpy(pInfo, &s_UnwindInfo, RT_UOFFSETOF(IMAGE_UNWIND_INFO, aOpcodes));
329 memcpy(&pInfo->aOpcodes[0], s_aOpcodes, sizeof(s_aOpcodes));
330
331 /*
332 * Register it.
333 */
334 uint8_t fRet = RtlAddFunctionTable(paFunctions, cFunctionEntries, (uintptr_t)pvChunk);
 335 AssertReturn(fRet, NULL); /* Nothing to clean up on failure, since it's within the chunk itself. */
336
337 return paFunctions;
338}
339
340
341# else /* !RT_OS_WINDOWS */
342
343/**
 344 * Emits a signed LEB128 (SLEB128) encoded value between -0x2000 and 0x2000 (both exclusive).
345 */
346DECLINLINE(RTPTRUNION) iemDwarfPutLeb128(RTPTRUNION Ptr, int32_t iValue)
347{
348 if (iValue >= 64)
349 {
350 Assert(iValue < 0x2000);
351 *Ptr.pb++ = ((uint8_t)iValue & 0x7f) | 0x80;
352 *Ptr.pb++ = (uint8_t)(iValue >> 7) & 0x3f;
353 }
354 else if (iValue >= 0)
355 *Ptr.pb++ = (uint8_t)iValue;
356 else if (iValue > -64)
357 *Ptr.pb++ = ((uint8_t)iValue & 0x3f) | 0x40;
358 else
359 {
360 Assert(iValue > -0x2000);
361 *Ptr.pb++ = ((uint8_t)iValue & 0x7f) | 0x80;
362 *Ptr.pb++ = ((uint8_t)(iValue >> 7) & 0x3f) | 0x40;
363 }
364 return Ptr;
365}
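/* A few sample encodings from the limited signed encoder above:
 *      1    -> 0x01
 *      300  -> 0xac 0x02   (low 7 bits with continuation bit, then 300 >> 7)
 *      -8   -> 0x78        (7-bit two's complement, bit 6 doubles as the sign)
 *      -300 -> 0xd4 0x7d */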
366
367
368/**
369 * Emits an ULEB128 encoded value (up to 64-bit wide).
370 */
371DECLINLINE(RTPTRUNION) iemDwarfPutUleb128(RTPTRUNION Ptr, uint64_t uValue)
372{
373 while (uValue >= 0x80)
374 {
375 *Ptr.pb++ = ((uint8_t)uValue & 0x7f) | 0x80;
376 uValue >>= 7;
377 }
378 *Ptr.pb++ = (uint8_t)uValue;
379 return Ptr;
380}
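/* Example: 624485 (0x98765) encodes as 0xe5 0x8e 0x26 - seven bits per byte,
 * least significant group first, bit 7 set on all but the last byte. */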
381
382
383/**
384 * Emits a CFA rule as register @a uReg + offset @a off.
385 */
386DECLINLINE(RTPTRUNION) iemDwarfPutCfaDefCfa(RTPTRUNION Ptr, uint32_t uReg, uint32_t off)
387{
388 *Ptr.pb++ = DW_CFA_def_cfa;
389 Ptr = iemDwarfPutUleb128(Ptr, uReg);
390 Ptr = iemDwarfPutUleb128(Ptr, off);
391 return Ptr;
392}
393
394
395/**
396 * Emits a register (@a uReg) save location:
397 * CFA + @a off * data_alignment_factor
398 */
399DECLINLINE(RTPTRUNION) iemDwarfPutCfaOffset(RTPTRUNION Ptr, uint32_t uReg, uint32_t off)
400{
401 if (uReg < 0x40)
402 *Ptr.pb++ = DW_CFA_offset | uReg;
403 else
404 {
405 *Ptr.pb++ = DW_CFA_offset_extended;
406 Ptr = iemDwarfPutUleb128(Ptr, uReg);
407 }
408 Ptr = iemDwarfPutUleb128(Ptr, off);
409 return Ptr;
410}
411
412
413/**
414 * Emits a register (@a uReg) save location, using signed offset:
415 * CFA + @a offSigned * data_alignment_factor
416 */
417DECLINLINE(RTPTRUNION) iemDwarfPutCfaSignedOffset(RTPTRUNION Ptr, uint32_t uReg, int32_t offSigned)
418{
419 *Ptr.pb++ = DW_CFA_offset_extended_sf;
420 Ptr = iemDwarfPutUleb128(Ptr, uReg);
421 Ptr = iemDwarfPutLeb128(Ptr, offSigned);
422 return Ptr;
423}
424
425
426/**
427 * Initializes the unwind info section for non-windows hosts.
428 */
429static void iemExecMemAllocatorInitEhFrameForChunk(PIEMEXECMEMALLOCATOR pExecMemAllocator,
430 PIEMEXECMEMCHUNKEHFRAME pEhFrame, void *pvChunk)
431{
 432 RTPTRUNION Ptr = { pEhFrame->abEhFrame };
433
434 /*
435 * Generate the CIE first.
436 */
437 RTPTRUNION const PtrCie = Ptr;
438 *Ptr.pu32++ = 123; /* The CIE length will be determined later. */
439 *Ptr.pu32++ = 0 /*UINT32_MAX*/; /* I'm a CIE in .eh_frame speak. */
 440 *Ptr.pb++ = 4; /* DWARF v4. */
441 *Ptr.pb++ = 0; /* Augmentation. */
442 *Ptr.pb++ = sizeof(uintptr_t); /* Address size. */
443 *Ptr.pb++ = 0; /* Segment selector size. */
444 Ptr = iemDwarfPutLeb128(Ptr, 1); /* Code alignment factor (LEB128 = 1). */
445 Ptr = iemDwarfPutLeb128(Ptr, -8); /* Data alignment factor (LEB128 = -8). */
446 Ptr = iemDwarfPutUleb128(Ptr, DWREG_AMD64_RA); /* Return address column (ULEB128) */
447 /* Initial instructions: */
448 Ptr = iemDwarfPutCfaDefCfa(Ptr, DWREG_AMD64_RBP, 16); /* CFA = RBP + 0x10 - first stack parameter */
449 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_RA, 1); /* Ret RIP = [CFA + 1*-8] */
450 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_RBP, 2); /* RBP = [CFA + 2*-8] */
451 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_RBX, 3); /* RBX = [CFA + 3*-8] */
452 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_R12, 4); /* R12 = [CFA + 4*-8] */
453 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_R13, 5); /* R13 = [CFA + 5*-8] */
454 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_R14, 6); /* R14 = [CFA + 6*-8] */
455 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_R15, 7); /* R15 = [CFA + 7*-8] */
456 while ((Ptr.u - PtrCie.u) & 3)
457 *Ptr.pb++ = DW_CFA_nop;
458 /* Finalize the CIE size. */
459 *PtrCie.pu32 = Ptr.u - PtrCie.u - sizeof(uint32_t);
460
461 /*
462 * Generate an FDE for the whole chunk area.
463 */
464 RTPTRUNION const PtrFde = Ptr;
 465 *Ptr.pu32++ = 123; /* The FDE length will be determined later. */
466 *Ptr.pu32 = Ptr.u - PtrCie.u; /* Negated self relative CIE address. */
467 Ptr.pu32++;
468 *Ptr.pu64++ = (uintptr_t)pvChunk; /* Absolute start PC of this FDE. */
 469 *Ptr.pu64++ = pExecMemAllocator->cbChunk; /* PC range length for this FDE. */
470 //*Ptr.pb++ = DW_CFA_nop; - not required for recent libgcc/glibc.
471 while ((Ptr.u - PtrFde.u) & 3)
472 *Ptr.pb++ = DW_CFA_nop;
473 /* Finalize the FDE size. */
474 *PtrFde.pu32 = Ptr.u - PtrFde.u - sizeof(uint32_t);
475
476 /* Terminator entry. */
477 *Ptr.pu32++ = 0;
478 *Ptr.pu32++ = 0; /* just to be sure... */
479 Assert(Ptr.u - (uintptr_t)&pEhFrame->abEhFrame[0] <= sizeof(pEhFrame->abEhFrame));
480}
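/* The resulting abEhFrame content is thus: a length-prefixed CIE (version,
 * augmentation, alignment factors, return column and the initial CFA rules
 * above), one length-prefixed FDE covering [pvChunk, pvChunk + cbChunk), and
 * a zero terminator entry. */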
481
482# endif /* !RT_OS_WINDOWS */
483#endif /* IN_RING3 */
484
485
486/**
487 * Adds another chunk to the executable memory allocator.
488 *
489 * This is used by the init code for the initial allocation and later by the
490 * regular allocator function when it's out of memory.
491 */
492static int iemExecMemAllocatorGrow(PIEMEXECMEMALLOCATOR pExecMemAllocator)
493{
494 /* Check that we've room for growth. */
495 uint32_t const idxChunk = pExecMemAllocator->cChunks;
496 AssertLogRelReturn(idxChunk < pExecMemAllocator->cMaxChunks, VERR_OUT_OF_RESOURCES);
497
498 /* Allocate a chunk. */
499 void *pvChunk = RTMemPageAllocEx(pExecMemAllocator->cbChunk, RTMEMPAGEALLOC_F_EXECUTABLE);
500 AssertLogRelReturn(pvChunk, VERR_NO_EXEC_MEMORY);
501
502 /* Initialize the heap for the chunk. */
503 RTHEAPSIMPLE hHeap = NIL_RTHEAPSIMPLE;
504 int rc = RTHeapSimpleInit(&hHeap, pvChunk, pExecMemAllocator->cbChunk);
505 AssertRC(rc);
506 if (RT_SUCCESS(rc))
507 {
508 /*
509 * We want the memory to be aligned on 64 byte, so the first time thru
510 * here we do some exploratory allocations to see how we can achieve this.
511 * On subsequent runs we only make an initial adjustment allocation, if
512 * necessary.
513 *
514 * Since we own the heap implementation, we know that the internal block
515 * header is 32 bytes in size for 64-bit systems (see RTHEAPSIMPLEBLOCK),
 516 * so all we need to do wrt allocation size adjustments is to add 32 bytes
517 * to the size, align up by 64 bytes, and subtract 32 bytes.
518 *
519 * The heap anchor block is 8 * sizeof(void *) (see RTHEAPSIMPLEINTERNAL),
 520 * which means 64 bytes on a 64-bit system, so we need to make a 64 byte
521 * allocation to force subsequent allocations to return 64 byte aligned
522 * user areas.
523 */
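        /* Worked example of the adjustment described above: a 100 byte request
         * becomes RT_ALIGN_32(100 + 32, 64) - 32 = 160 bytes.  With the user
         * area starting on a 64 byte boundary, the 32 byte header of the next
         * block then ends exactly on the following 64 byte line (160 + 32 = 192). */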
524 if (!pExecMemAllocator->cbHeapBlockHdr)
525 {
526 pExecMemAllocator->cbHeapBlockHdr = sizeof(void *) * 4; /* See RTHEAPSIMPLEBLOCK. */
527 pExecMemAllocator->cbHeapAlignTweak = 64;
528 pExecMemAllocator->pvAlignTweak = RTHeapSimpleAlloc(hHeap, pExecMemAllocator->cbHeapAlignTweak,
529 32 /*cbAlignment*/);
530 AssertStmt(pExecMemAllocator->pvAlignTweak, rc = VERR_INTERNAL_ERROR_2);
531
532 void *pvTest1 = RTHeapSimpleAlloc(hHeap,
533 RT_ALIGN_32(256 + pExecMemAllocator->cbHeapBlockHdr, 64)
534 - pExecMemAllocator->cbHeapBlockHdr, 32 /*cbAlignment*/);
535 AssertStmt(pvTest1, rc = VERR_INTERNAL_ERROR_2);
536 AssertStmt(!((uintptr_t)pvTest1 & 63), rc = VERR_INTERNAL_ERROR_3);
537
538 void *pvTest2 = RTHeapSimpleAlloc(hHeap,
539 RT_ALIGN_32(687 + pExecMemAllocator->cbHeapBlockHdr, 64)
540 - pExecMemAllocator->cbHeapBlockHdr, 32 /*cbAlignment*/);
541 AssertStmt(pvTest2, rc = VERR_INTERNAL_ERROR_2);
542 AssertStmt(!((uintptr_t)pvTest2 & 63), rc = VERR_INTERNAL_ERROR_3);
543
544 RTHeapSimpleFree(hHeap, pvTest2);
545 RTHeapSimpleFree(hHeap, pvTest1);
546 }
547 else
548 {
549 pExecMemAllocator->pvAlignTweak = RTHeapSimpleAlloc(hHeap, pExecMemAllocator->cbHeapAlignTweak, 32 /*cbAlignment*/);
550 AssertStmt(pExecMemAllocator->pvAlignTweak, rc = VERR_INTERNAL_ERROR_4);
551 }
552 if (RT_SUCCESS(rc))
553 {
554#ifdef IN_RING3
555# ifdef RT_OS_WINDOWS
556 /*
 557 * The unwind information needs to reside inside the chunk (at least
 558 * the UNWIND_INFO structure does), as the UnwindInfoAddress member
559 * of RUNTIME_FUNCTION (AMD64) is relative to the "image base".
560 *
561 * We need unwind info because even longjmp() does a C++ stack unwind.
562 */
563 void *pvUnwindInfo = iemExecMemAllocatorInitAndRegisterUnwindInfoForChunk(pExecMemAllocator, hHeap, pvChunk);
564 AssertStmt(pvUnwindInfo, rc = VERR_INTERNAL_ERROR_3);
565# else
566 /*
567 * Generate an .eh_frame section for the chunk and register it so
568 * the unwinding code works (required for C++ exceptions and
569 * probably also for longjmp()).
570 */
571 PIEMEXECMEMCHUNKEHFRAME pEhFrame = (PIEMEXECMEMCHUNKEHFRAME)RTMemAllocZ(sizeof(IEMEXECMEMCHUNKEHFRAME));
572 if (pEhFrame)
573 {
574 iemExecMemAllocatorInitEhFrameForChunk(pExecMemAllocator, pEhFrame, pvChunk);
575 memset(pEhFrame->abObject, 0xf6, sizeof(pEhFrame->abObject)); /* color the memory to better spot usage */
576 __register_frame_info(pEhFrame->abEhFrame, pEhFrame->abObject);
577 }
578 else
579 rc = VERR_NO_MEMORY;
580# endif
581 if (RT_SUCCESS(rc))
582#endif
583 {
584 /*
585 * Finalize the adding of the chunk.
586 */
587 pExecMemAllocator->aChunks[idxChunk].pvChunk = pvChunk;
588 pExecMemAllocator->aChunks[idxChunk].hHeap = hHeap;
589#ifdef IN_RING3
590# ifdef RT_OS_WINDOWS
591 pExecMemAllocator->aChunks[idxChunk].pvUnwindInfo = pvUnwindInfo;
592# else
593 pExecMemAllocator->aChunks[idxChunk].pEhFrame = pEhFrame;
594# endif
595#endif
596
597 pExecMemAllocator->cChunks = idxChunk + 1;
598 pExecMemAllocator->idxChunkHint = idxChunk;
599
600 size_t const cbFree = RTHeapSimpleGetFreeSize(hHeap);
601 pExecMemAllocator->cbTotal += cbFree;
602 pExecMemAllocator->cbFree += cbFree;
603
604 return VINF_SUCCESS;
605 }
606 }
607 }
608 RTMemPageFree(pvChunk, pExecMemAllocator->cbChunk);
609 return rc;
610}
611
612
613/**
614 * Initializes the executable memory allocator for native recompilation on the
615 * calling EMT.
616 *
617 * @returns VBox status code.
618 * @param pVCpu The cross context virtual CPU structure of the calling
619 * thread.
620 * @param cbMax The max size of the allocator.
621 * @param cbInitial The initial allocator size.
622 * @param cbChunk The chunk size, 0 or UINT32_MAX for default (@a cbMax
623 * dependent).
624 */
625int iemExecMemAllocatorInit(PVMCPU pVCpu, uint64_t cbMax, uint64_t cbInitial, uint32_t cbChunk)
626{
627 /*
628 * Validate input.
629 */
630 AssertLogRelMsgReturn(cbMax >= _1M && cbMax <= _4G+_4G, ("cbMax=%RU64 (%RX64)\n", cbMax, cbMax), VERR_OUT_OF_RANGE);
631 AssertReturn(cbInitial <= cbMax, VERR_OUT_OF_RANGE);
 632 AssertLogRelMsgReturn( cbChunk == UINT32_MAX
633 || cbChunk == 0
634 || ( RT_IS_POWER_OF_TWO(cbChunk)
635 && cbChunk >= _1M
636 && cbChunk <= _256M
637 && cbChunk <= cbMax),
638 ("cbChunk=%RU32 (%RX32) cbMax=%RU64\n", cbChunk, cbChunk, cbMax),
639 VERR_OUT_OF_RANGE);
640
641 /*
642 * Adjust/figure out the chunk size.
643 */
644 if (cbChunk == 0 || cbChunk == UINT32_MAX)
645 {
646 if (cbMax >= _256M)
647 cbChunk = _64M;
648 else
649 {
650 if (cbMax < _16M)
651 cbChunk = cbMax >= _4M ? _4M : (uint32_t)cbMax;
652 else
653 cbChunk = (uint32_t)cbMax / 4;
654 if (!RT_IS_POWER_OF_TWO(cbChunk))
655 cbChunk = RT_BIT_32(ASMBitLastSetU32(cbChunk));
656 }
657 }
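    /* For instance, with no explicit cbChunk: cbMax >= 256 MiB gives 64 MiB
     * chunks, cbMax = 8 MiB gives 4 MiB chunks, and a cbMax below 4 MiB is
     * used as a single chunk; a non-power-of-two intermediate result is
     * rounded to a power of two by the RT_BIT_32/ASMBitLastSetU32 line above. */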
658
659 if (cbChunk > cbMax)
660 cbMax = cbChunk;
661 else
662 cbMax = (cbMax - 1 + cbChunk) / cbChunk * cbChunk;
663 uint32_t const cMaxChunks = (uint32_t)(cbMax / cbChunk);
664 AssertLogRelReturn((uint64_t)cMaxChunks * cbChunk == cbMax, VERR_INTERNAL_ERROR_3);
665
666 /*
 667 * Allocate and initialize the allocator instance.
668 */
669 PIEMEXECMEMALLOCATOR pExecMemAllocator = (PIEMEXECMEMALLOCATOR)RTMemAllocZ(RT_UOFFSETOF_DYN(IEMEXECMEMALLOCATOR,
670 aChunks[cMaxChunks]));
671 AssertReturn(pExecMemAllocator, VERR_NO_MEMORY);
672 pExecMemAllocator->uMagic = IEMEXECMEMALLOCATOR_MAGIC;
673 pExecMemAllocator->cbChunk = cbChunk;
674 pExecMemAllocator->cMaxChunks = cMaxChunks;
675 pExecMemAllocator->cChunks = 0;
676 pExecMemAllocator->idxChunkHint = 0;
677 pExecMemAllocator->cAllocations = 0;
678 pExecMemAllocator->cbTotal = 0;
679 pExecMemAllocator->cbFree = 0;
680 pExecMemAllocator->cbAllocated = 0;
681 for (uint32_t i = 0; i < cMaxChunks; i++)
682 {
683 pExecMemAllocator->aChunks[i].hHeap = NIL_RTHEAPSIMPLE;
684 pExecMemAllocator->aChunks[i].pvChunk = NULL;
685#ifdef IN_RING0
686 pExecMemAllocator->aChunks[i].hMemObj = NIL_RTR0MEMOBJ;
687#elif !defined(RT_OS_WINDOWS)
688 pExecMemAllocator->aChunks[i].pEhFrame = NULL;
689#endif
690 }
691 pVCpu->iem.s.pExecMemAllocatorR3 = pExecMemAllocator;
692
693 /*
694 * Do the initial allocations.
695 */
 696 while ((uint64_t)pExecMemAllocator->cChunks * pExecMemAllocator->cbChunk < cbInitial)
697 {
698 int rc = iemExecMemAllocatorGrow(pExecMemAllocator);
699 AssertLogRelRCReturn(rc, rc);
700 }
701
702 pExecMemAllocator->idxChunkHint = 0;
703
704 return VINF_SUCCESS;
705}
706
707/**
708 * Worker for iemExecMemAllocatorAlloc that returns @a pvRet after updating
709 * the heap statistics.
710 */
711DECL_FORCE_INLINE(void *) iemExecMemAllocatorAllocTailCode(PIEMEXECMEMALLOCATOR pExecMemAllocator, void *pvRet,
712 uint32_t cbReq, uint32_t idxChunk)
713{
714 pExecMemAllocator->cAllocations += 1;
715 pExecMemAllocator->cbAllocated += cbReq;
716 pExecMemAllocator->cbFree -= RT_ALIGN_32(cbReq, 64);
717 pExecMemAllocator->idxChunkHint = idxChunk;
718 return pvRet;
719}
720
721
722/**
723 * Allocates @a cbReq bytes of executable memory.
724 *
725 * @returns Pointer to the memory, NULL if out of memory or other problem
726 * encountered.
727 * @param pVCpu The cross context virtual CPU structure of the calling
728 * thread.
729 * @param cbReq How many bytes are required.
730 */
731static void *iemExecMemAllocatorAlloc(PVMCPU pVCpu, uint32_t cbReq)
732{
733 PIEMEXECMEMALLOCATOR pExecMemAllocator = pVCpu->iem.s.pExecMemAllocatorR3;
734 AssertReturn(pExecMemAllocator && pExecMemAllocator->uMagic == IEMEXECMEMALLOCATOR_MAGIC, NULL);
735 AssertMsgReturn(cbReq > 32 && cbReq < _512K, ("%#x\n", cbReq), NULL);
736
737 /*
738 * Adjust the request size as per the logic described in
739 * iemExecMemAllocatorGrow and attempt to allocate it from one of the
740 * existing chunks if we think we've got sufficient free memory around.
741 */
742 cbReq = RT_ALIGN_32(cbReq + pExecMemAllocator->cbHeapBlockHdr, 64) - pExecMemAllocator->cbHeapBlockHdr;
743 if (cbReq <= pExecMemAllocator->cbFree)
744 {
745 uint32_t const cChunks = pExecMemAllocator->cChunks;
746 uint32_t const idxChunkHint = pExecMemAllocator->idxChunkHint < cChunks ? pExecMemAllocator->idxChunkHint : 0;
747 for (uint32_t idxChunk = idxChunkHint; idxChunk < cChunks; idxChunk++)
748 {
749 void *pvRet = RTHeapSimpleAlloc(pExecMemAllocator->aChunks[idxChunk].hHeap, cbReq, 32);
750 if (pvRet)
751 return iemExecMemAllocatorAllocTailCode(pExecMemAllocator, pvRet, cbReq, idxChunk);
752 }
753 for (uint32_t idxChunk = 0; idxChunk < idxChunkHint; idxChunk++)
754 {
755 void *pvRet = RTHeapSimpleAlloc(pExecMemAllocator->aChunks[idxChunk].hHeap, cbReq, 32);
756 if (pvRet)
757 return iemExecMemAllocatorAllocTailCode(pExecMemAllocator, pvRet, cbReq, idxChunk);
758 }
759 }
760
761 /*
762 * Can we grow it with another chunk?
763 */
764 if (pExecMemAllocator->cChunks < pExecMemAllocator->cMaxChunks)
765 {
766 int rc = iemExecMemAllocatorGrow(pExecMemAllocator);
767 AssertLogRelRCReturn(rc, NULL);
768
769 uint32_t const idxChunk = pExecMemAllocator->cChunks - 1;
770 void *pvRet = RTHeapSimpleAlloc(pExecMemAllocator->aChunks[idxChunk].hHeap, cbReq, 32);
771 if (pvRet)
772 return iemExecMemAllocatorAllocTailCode(pExecMemAllocator, pvRet, cbReq, idxChunk);
773 AssertFailed();
774 }
775
776 /* What now? Prune native translation blocks from the cache? */
777 AssertFailed();
778 return NULL;
779}
780
781
782/** This is a hook that we may need later for changing memory protection back
783 * to readonly+exec */
784static void iemExecMemAllocatorReadyForUse(PVMCPUCC pVCpu, void *pv, size_t cb)
785{
786 RT_NOREF(pVCpu, pv, cb);
787}
788
789
790/**
791 * Frees executable memory.
792 */
793void iemExecMemAllocatorFree(PVMCPU pVCpu, void *pv, size_t cb)
794{
795 PIEMEXECMEMALLOCATOR pExecMemAllocator = pVCpu->iem.s.pExecMemAllocatorR3;
796 Assert(pExecMemAllocator && pExecMemAllocator->uMagic == IEMEXECMEMALLOCATOR_MAGIC);
797 Assert(pv);
798
799 /* Align the size as we did when allocating the block. */
800 cb = RT_ALIGN_Z(cb + pExecMemAllocator->cbHeapBlockHdr, 64) - pExecMemAllocator->cbHeapBlockHdr;
801
802 /* Assert sanity if strict build. */
803#ifdef VBOX_STRICT
804 uint32_t const cChunks = pExecMemAllocator->cChunks;
805 uint32_t const cbChunk = pExecMemAllocator->cbChunk;
806 bool fFound = false;
807 for (uint32_t idxChunk = 0; idxChunk < cChunks; idxChunk++)
808 {
809 fFound = (uintptr_t)pv - (uintptr_t)pExecMemAllocator->aChunks[idxChunk].pvChunk < cbChunk;
810 if (fFound)
811 {
812 Assert(RTHeapSimpleSize(pExecMemAllocator->aChunks[idxChunk].hHeap, pv) == cb);
813 break;
814 }
815 }
816 Assert(fFound);
817#endif
818
819 /* Update stats while cb is freshly calculated.*/
820 pExecMemAllocator->cbAllocated -= cb;
821 pExecMemAllocator->cbFree += RT_ALIGN_Z(cb, 64);
822 pExecMemAllocator->cAllocations -= 1;
823
824 /* Do the actual freeing. */
825 RTHeapSimpleFree(NIL_RTHEAPSIMPLE, pv);
826}
827
828
829/*********************************************************************************************************************************
830* Native Recompilation *
831*********************************************************************************************************************************/
832
833/** Native code generator label types. */
834typedef enum
835{
836 kIemNativeLabelType_Invalid = 0,
837 kIemNativeLabelType_Return,
838 kIemNativeLabelType_NonZeroRetOrPassUp,
839 kIemNativeLabelType_End
840} IEMNATIVELABELTYPE;
841
842/** Native code generator label definition. */
843typedef struct IEMNATIVELABEL
844{
845 /** Code offset if defined, UINT32_MAX if it needs to be generated after/in
846 * the epilog. */
847 uint32_t off;
848 /** The type of label (IEMNATIVELABELTYPE). */
849 uint16_t enmType;
850 /** Additional label data, type specific. */
851 uint16_t uData;
852} IEMNATIVELABEL;
853/** Pointer to a label. */
854typedef IEMNATIVELABEL *PIEMNATIVELABEL;
855
856
857/** Native code generator fixup types. */
858typedef enum
859{
860 kIemNativeFixupType_Invalid = 0,
861#if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
 862 /** AMD64 fixup: PC relative 32-bit with addend in offAddend. */
863 kIemNativeFixupType_Rel32,
864#elif defined(RT_ARCH_ARM64)
865#endif
866 kIemNativeFixupType_End
867} IEMNATIVEFIXUPTYPE;
868
869/** Native code generator fixup. */
870typedef struct IEMNATIVEFIXUP
871{
872 /** Code offset of the fixup location. */
873 uint32_t off;
874 /** The IEMNATIVELABEL this is a fixup for. */
875 uint16_t idxLabel;
876 /** The fixup type (IEMNATIVEFIXUPTYPE). */
877 uint8_t enmType;
878 /** Addend or other data. */
879 int8_t offAddend;
880} IEMNATIVEFIXUP;
881/** Pointer to a native code generator fixup. */
882typedef IEMNATIVEFIXUP *PIEMNATIVEFIXUP;
883
884
885/**
886 * Used by TB code when encountering a non-zero status or rcPassUp after a call.
887 */
888IEM_DECL_IMPL_DEF(int, iemNativeHlpExecStatusCodeFiddling,(PVMCPUCC pVCpu, int rc, uint8_t idxInstr))
889{
890 pVCpu->iem.s.cInstructions += idxInstr;
891 return VBOXSTRICTRC_VAL(iemExecStatusCodeFiddling(pVCpu, rc == VINF_IEM_REEXEC_BREAK ? VINF_SUCCESS : rc));
892}
893
894
895static void iemNativeReInit(PVMCPUCC pVCpu)
896{
897 pVCpu->iem.s.Native.cLabels = 0;
898 pVCpu->iem.s.Native.cFixups = 0;
899}
900
901
902static bool iemNativeInit(PVMCPUCC pVCpu)
903{
904 /*
905 * Try allocate all the buffers and stuff we need.
906 */
907 pVCpu->iem.s.Native.pInstrBuf = (PIEMNATIVEINSTR)RTMemAllocZ(_64K);
908 pVCpu->iem.s.Native.paLabels = (PIEMNATIVELABEL)RTMemAllocZ(sizeof(IEMNATIVELABEL) * _8K);
909 pVCpu->iem.s.Native.paFixups = (PIEMNATIVEFIXUP)RTMemAllocZ(sizeof(IEMNATIVEFIXUP) * _16K);
910 if (RT_LIKELY( pVCpu->iem.s.Native.pInstrBuf
911 && pVCpu->iem.s.Native.paLabels
912 && pVCpu->iem.s.Native.paFixups))
913 {
914 /*
915 * Set the buffer & array sizes on success.
916 */
917 pVCpu->iem.s.Native.cInstrBufAlloc = _64K / sizeof(IEMNATIVEINSTR);
918 pVCpu->iem.s.Native.cLabelsAlloc = _8K;
919 pVCpu->iem.s.Native.cFixupsAlloc = _16K;
920 iemNativeReInit(pVCpu);
921 return true;
922 }
923
924 /*
 925 * Failed. Clean up and reset the state.
926 */
927 AssertFailed();
928 RTMemFree(pVCpu->iem.s.Native.pInstrBuf);
929 RTMemFree(pVCpu->iem.s.Native.paLabels);
930 RTMemFree(pVCpu->iem.s.Native.paFixups);
931 pVCpu->iem.s.Native.pInstrBuf = NULL;
932 pVCpu->iem.s.Native.paLabels = NULL;
933 pVCpu->iem.s.Native.paFixups = NULL;
934 return false;
935}
936
937
938static uint32_t iemNativeMakeLabel(PVMCPUCC pVCpu, IEMNATIVELABELTYPE enmType,
939 uint32_t offWhere = UINT32_MAX, uint16_t uData = 0)
940{
941 /*
942 * Do we have the label already?
943 */
944 PIEMNATIVELABEL paLabels = pVCpu->iem.s.Native.paLabels;
945 uint32_t const cLabels = pVCpu->iem.s.Native.cLabels;
946 for (uint32_t i = 0; i < cLabels; i++)
947 if ( paLabels[i].enmType == enmType
948 && paLabels[i].uData == uData)
949 {
950 if (paLabels[i].off == offWhere || offWhere == UINT32_MAX)
951 return i;
952 if (paLabels[i].off == UINT32_MAX)
953 {
954 paLabels[i].off = offWhere;
955 return i;
956 }
957 }
958
959 /*
960 * Make sure we've got room for another label.
961 */
962 if (RT_LIKELY(cLabels < pVCpu->iem.s.Native.cLabelsAlloc))
963 { /* likely */ }
964 else
965 {
966 uint32_t cNew = pVCpu->iem.s.Native.cLabelsAlloc;
967 AssertReturn(cNew, UINT32_MAX);
968 AssertReturn(cLabels == cNew, UINT32_MAX);
969 cNew *= 2;
 970 AssertReturn(cNew <= _64K, UINT32_MAX); /* IEMNATIVEFIXUP::idxLabel type restricts this */
971 paLabels = (PIEMNATIVELABEL)RTMemRealloc(paLabels, cNew * sizeof(paLabels[0]));
972 AssertReturn(paLabels, UINT32_MAX);
973 pVCpu->iem.s.Native.paLabels = paLabels;
974 pVCpu->iem.s.Native.cLabelsAlloc = cNew;
975 }
976
977 /*
978 * Define a new label.
979 */
980 paLabels[cLabels].off = offWhere;
981 paLabels[cLabels].enmType = enmType;
982 paLabels[cLabels].uData = uData;
983 pVCpu->iem.s.Native.cLabels = cLabels + 1;
984 return cLabels;
985}
986
987
988static uint32_t iemNativeFindLabel(PVMCPUCC pVCpu, IEMNATIVELABELTYPE enmType,
989 uint32_t offWhere = UINT32_MAX, uint16_t uData = 0)
990{
991 PIEMNATIVELABEL paLabels = pVCpu->iem.s.Native.paLabels;
992 uint32_t const cLabels = pVCpu->iem.s.Native.cLabels;
993 for (uint32_t i = 0; i < cLabels; i++)
994 if ( paLabels[i].enmType == enmType
995 && paLabels[i].uData == uData
996 && ( paLabels[i].off == offWhere
997 || offWhere == UINT32_MAX
998 || paLabels[i].off == UINT32_MAX))
999 return i;
1000 return UINT32_MAX;
1001}
1002
1003
1004
1005static bool iemNativeAddFixup(PVMCPUCC pVCpu, uint32_t offWhere, uint32_t idxLabel,
1006 IEMNATIVEFIXUPTYPE enmType, int8_t offAddend = 0)
1007{
1008 Assert(idxLabel <= UINT16_MAX);
1009 Assert((unsigned)enmType <= UINT8_MAX);
1010
1011 /*
1012 * Make sure we've room.
1013 */
1014 PIEMNATIVEFIXUP paFixups = pVCpu->iem.s.Native.paFixups;
1015 uint32_t const cFixups = pVCpu->iem.s.Native.cFixups;
1016 if (RT_LIKELY(cFixups < pVCpu->iem.s.Native.cFixupsAlloc))
1017 { /* likely */ }
1018 else
1019 {
1020 uint32_t cNew = pVCpu->iem.s.Native.cFixupsAlloc;
1021 AssertReturn(cNew, false);
1022 AssertReturn(cFixups == cNew, false);
1023 cNew *= 2;
1024 AssertReturn(cNew <= _128K, false);
1025 paFixups = (PIEMNATIVEFIXUP)RTMemRealloc(paFixups, cNew * sizeof(paFixups[0]));
1026 AssertReturn(paFixups, false);
1027 pVCpu->iem.s.Native.paFixups = paFixups;
1028 pVCpu->iem.s.Native.cFixupsAlloc = cNew;
1029 }
1030
1031 /*
1032 * Add the fixup.
1033 */
1034 paFixups[cFixups].off = offWhere;
1035 paFixups[cFixups].idxLabel = (uint16_t)idxLabel;
1036 paFixups[cFixups].enmType = enmType;
1037 paFixups[cFixups].offAddend = offAddend;
1038 pVCpu->iem.s.Native.cFixups = cFixups + 1;
1039 return true;
1040}
1041
1042
1043static PIEMNATIVEINSTR iemNativeInstrBufEnsureSlow(PVMCPUCC pVCpu, uint32_t off, uint32_t cInstrReq)
1044{
1045 /* Double the buffer size till we meet the request. */
1046 uint32_t cNew = pVCpu->iem.s.Native.cInstrBufAlloc;
1047 AssertReturn(cNew > 0, NULL);
1048 do
1049 cNew *= 2;
1050 while (cNew < off + cInstrReq);
1051
1052 uint32_t const cbNew = cNew * sizeof(IEMNATIVEINSTR);
1053 AssertReturn(cbNew <= _2M, NULL);
1054
1055 void *pvNew = RTMemRealloc(pVCpu->iem.s.Native.pInstrBuf, cbNew);
1056 AssertReturn(pvNew, NULL);
1057
1058 pVCpu->iem.s.Native.cInstrBufAlloc = cNew;
1059 return pVCpu->iem.s.Native.pInstrBuf = (PIEMNATIVEINSTR)pvNew;
1060}
1061
1062
1063DECL_FORCE_INLINE(PIEMNATIVEINSTR) iemNativeInstrBufEnsure(PVMCPUCC pVCpu, uint32_t off, uint32_t cInstrReq)
1064{
1065 if (RT_LIKELY(off + cInstrReq <= pVCpu->iem.s.Native.cInstrBufAlloc))
1066 return pVCpu->iem.s.Native.pInstrBuf;
1067 return iemNativeInstrBufEnsureSlow(pVCpu, off, cInstrReq);
1068}
1069
1070
1071/**
1072 * Emit a simple marker instruction to more easily tell where something starts
1073 * in the disassembly.
1074 */
1075uint32_t iemNativeEmitMarker(PVMCPUCC pVCpu, uint32_t off)
1076{
1077#ifdef RT_ARCH_AMD64
1078 uint8_t *pbCodeBuf = iemNativeInstrBufEnsure(pVCpu, off, 1);
1079 AssertReturn(pbCodeBuf, UINT32_MAX);
1080 pbCodeBuf[off++] = 0x90; /* nop */
1081
1082#elif RT_ARCH_ARM64
1083 uint32_t *pu32CodeBuf = iemNativeInstrBufEnsure(pVCpu, off, 1);
 1084 pu32CodeBuf[off++] = 0xd503201f; /* nop */
1085
1086#else
1087# error "port me"
1088#endif
1089 return off;
1090}
1091
1092
1093static uint32_t iemNativeEmitGprZero(PVMCPUCC pVCpu, uint32_t off, uint8_t iGpr)
1094{
1095#ifdef RT_ARCH_AMD64
1096 uint8_t *pbCodeBuf = iemNativeInstrBufEnsure(pVCpu, off, 3);
1097 AssertReturn(pbCodeBuf, UINT32_MAX);
1098 if (iGpr >= 8) /* xor gpr32, gpr32 */
1099 pbCodeBuf[off++] = X86_OP_REX_R | X86_OP_REX_B;
1100 pbCodeBuf[off++] = 0x33;
1101 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, iGpr & 7, iGpr & 7);
1102
1103#elif RT_ARCH_ARM64
 1104 RT_NOREF(pVCpu, iGpr);
1105 off = UINT32_MAX;
1106
1107#else
1108# error "port me"
1109#endif
1110 RT_NOREF(pVCpu);
1111 return off;
1112}
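/* For reference, the encoder above produces 33 c0 (xor eax, eax) for iGpr=0
 * and 45 33 d2 (xor r10d, r10d) for iGpr=10; a 32-bit xor suffices because
 * it zero-extends into the upper half of the 64-bit register. */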
1113
1114
1115static uint32_t iemNativeEmitLoadGprImm64(PVMCPUCC pVCpu, uint32_t off, uint8_t iGpr, uint64_t uImm64)
1116{
1117 if (!uImm64)
1118 return iemNativeEmitGprZero(pVCpu, off, iGpr);
1119
1120#ifdef RT_ARCH_AMD64
1121 if (uImm64 <= UINT32_MAX)
1122 {
1123 /* mov gpr, imm32 */
1124 uint8_t *pbCodeBuf = iemNativeInstrBufEnsure(pVCpu, off, 6);
1125 AssertReturn(pbCodeBuf, UINT32_MAX);
1126 if (iGpr >= 8)
1127 pbCodeBuf[off++] = X86_OP_REX_B;
1128 pbCodeBuf[off++] = 0xb8 + (iGpr & 7);
1129 pbCodeBuf[off++] = RT_BYTE1(uImm64);
1130 pbCodeBuf[off++] = RT_BYTE2(uImm64);
1131 pbCodeBuf[off++] = RT_BYTE3(uImm64);
1132 pbCodeBuf[off++] = RT_BYTE4(uImm64);
1133 }
1134 else
1135 {
1136 /* mov gpr, imm64 */
1137 uint8_t *pbCodeBuf = iemNativeInstrBufEnsure(pVCpu, off, 10);
1138 AssertReturn(pbCodeBuf, UINT32_MAX);
1139 if (iGpr < 8)
1140 pbCodeBuf[off++] = X86_OP_REX_W;
1141 else
1142 pbCodeBuf[off++] = X86_OP_REX_W | X86_OP_REX_B;
1143 pbCodeBuf[off++] = 0xb8 + (iGpr & 7);
1144 pbCodeBuf[off++] = RT_BYTE1(uImm64);
1145 pbCodeBuf[off++] = RT_BYTE2(uImm64);
1146 pbCodeBuf[off++] = RT_BYTE3(uImm64);
1147 pbCodeBuf[off++] = RT_BYTE4(uImm64);
1148 pbCodeBuf[off++] = RT_BYTE5(uImm64);
1149 pbCodeBuf[off++] = RT_BYTE6(uImm64);
1150 pbCodeBuf[off++] = RT_BYTE7(uImm64);
1151 pbCodeBuf[off++] = RT_BYTE8(uImm64);
1152 }
1153
1154#elif RT_ARCH_ARM64
1155 RT_NOREF(pVCpu, iGpr, uImm64);
1156 off = UINT32_MAX;
1157
1158#else
1159# error "port me"
1160#endif
1161 return off;
1162}
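/* Examples of the two forms above: iGpr=7 (RDI) with uImm64=0x12345678 gives
 * bf 78 56 34 12 (mov edi, imm32; the high half is zero-extended), while
 * iGpr=0 with a value above UINT32_MAX gives 48 b8 <8 immediate bytes>
 * (mov rax, imm64). */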
1163
1164
1165static uint32_t iemNativeEmitLoadGprFromVCpuU32(PVMCPUCC pVCpu, uint32_t off, uint8_t iGpr, uint32_t offVCpu)
1166{
1167#ifdef RT_ARCH_AMD64
1168 uint8_t *pbCodeBuf = iemNativeInstrBufEnsure(pVCpu, off, 7);
1169 AssertReturn(pbCodeBuf, UINT32_MAX);
1170
1171 /* mov reg32, mem32 */
1172 if (iGpr >= 8)
1173 pbCodeBuf[off++] = X86_OP_REX_R;
1174 pbCodeBuf[off++] = 0x8b;
1175 if (offVCpu < 128)
1176 {
1177 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_MEM1, iGpr & 7, X86_GREG_xBX);
1178 pbCodeBuf[off++] = (uint8_t)offVCpu;
1179 }
1180 else
1181 {
1182 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_MEM4, iGpr & 7, X86_GREG_xBX);
1183 pbCodeBuf[off++] = RT_BYTE1(offVCpu);
1184 pbCodeBuf[off++] = RT_BYTE2(offVCpu);
1185 pbCodeBuf[off++] = RT_BYTE3(offVCpu);
1186 pbCodeBuf[off++] = RT_BYTE4(offVCpu);
1187 }
1188
1189#elif RT_ARCH_ARM64
 1190 RT_NOREF(pVCpu, iGpr, offVCpu);
1191 off = UINT32_MAX;
1192
1193#else
1194# error "port me"
1195#endif
1196 return off;
1197}
1198
1199
1200static uint32_t iemNativeEmitLoadGprFromGpr(PVMCPUCC pVCpu, uint32_t off, uint8_t iGprDst, uint8_t iGprSrc)
1201{
1202#ifdef RT_ARCH_AMD64
1203 /* mov gprdst, gprsrc */
1204 uint8_t *pbCodeBuf = iemNativeInstrBufEnsure(pVCpu, off, 3);
1205 AssertReturn(pbCodeBuf, UINT32_MAX);
1206 if ((iGprDst | iGprSrc) >= 8)
1207 pbCodeBuf[off++] = iGprDst < 8 ? X86_OP_REX_W | X86_OP_REX_B
1208 : iGprSrc >= 8 ? X86_OP_REX_W | X86_OP_REX_R | X86_OP_REX_B
1209 : X86_OP_REX_W | X86_OP_REX_R;
1210 else
1211 pbCodeBuf[off++] = X86_OP_REX_W;
1212 pbCodeBuf[off++] = 0x8b;
1213 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, iGprDst & 7, iGprSrc & 7);
1214
1215#elif RT_ARCH_ARM64
1216 RT_NOREF(pVCpu, iGprDst, iGprSrc);
1217 off = UINT32_MAX;
1218
1219#else
1220# error "port me"
1221#endif
1222 return off;
1223}
1224
1225#ifdef RT_ARCH_AMD64
1226/**
1227 * Common bit of iemNativeEmitLoadGprByBp and friends.
1228 */
1229DECL_FORCE_INLINE(uint32_t) iemNativeEmitGprByBpDisp(uint8_t *pbCodeBuf, uint32_t off, uint8_t iGprReg, int32_t offDisp)
1230{
1231 if (offDisp < 128 && offDisp >= -128)
1232 {
1233 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_MEM1, iGprReg & 7, X86_GREG_xBP);
1234 pbCodeBuf[off++] = (uint8_t)(int8_t)offDisp;
1235 }
1236 else
1237 {
1238 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_MEM4, iGprReg & 7, X86_GREG_xBP);
1239 pbCodeBuf[off++] = RT_BYTE1((uint32_t)offDisp);
1240 pbCodeBuf[off++] = RT_BYTE2((uint32_t)offDisp);
1241 pbCodeBuf[off++] = RT_BYTE3((uint32_t)offDisp);
1242 pbCodeBuf[off++] = RT_BYTE4((uint32_t)offDisp);
1243 }
1244 return off;
1245}
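/* E.g. a displacement of -0x60 selects the disp8 form; combined with the REX.W
 * and 8b opcode bytes emitted by the callers this yields 48 8b 45 a0
 * (mov rax, [rbp-60h]) for iGprReg=0.  Displacements outside [-128,127] fall
 * back to the 4 byte disp32 form. */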
1246#endif
1247
1248
1249#ifdef RT_ARCH_AMD64
1250/**
 1251 * Emits a 64-bit GPR load instruction with a BP relative source address.
1252 */
1253static uint32_t iemNativeEmitLoadGprByBp(PVMCPUCC pVCpu, uint32_t off, uint8_t iGprDst, int32_t offDisp)
1254{
1255 /* mov gprdst, qword [rbp + offDisp] */
1256 uint8_t *pbCodeBuf = iemNativeInstrBufEnsure(pVCpu, off, 7);
1257 if (iGprDst < 8)
1258 pbCodeBuf[off++] = X86_OP_REX_W;
1259 else
1260 pbCodeBuf[off++] = X86_OP_REX_W | X86_OP_REX_R;
1261 pbCodeBuf[off++] = 0x8b;
1262 return iemNativeEmitGprByBpDisp(pbCodeBuf, off, iGprDst, offDisp);
1263}
1264#endif
1265
1266
1267#ifdef RT_ARCH_AMD64
1268/**
 1269 * Emits a 32-bit GPR load instruction with a BP relative source address.
1270 */
1271static uint32_t iemNativeEmitLoadGprByBpU32(PVMCPUCC pVCpu, uint32_t off, uint8_t iGprDst, int32_t offDisp)
1272{
1273 /* mov gprdst, dword [rbp + offDisp] */
1274 uint8_t *pbCodeBuf = iemNativeInstrBufEnsure(pVCpu, off, 7);
1275 if (iGprDst >= 8)
1276 pbCodeBuf[off++] = X86_OP_REX_R;
1277 pbCodeBuf[off++] = 0x8b;
1278 return iemNativeEmitGprByBpDisp(pbCodeBuf, off, iGprDst, offDisp);
1279}
1280#endif
1281
1282
1283#ifdef RT_ARCH_AMD64
1284/**
 1285 * Emits a load effective address to a GPR with a BP relative source address.
1286 */
1287static uint32_t iemNativeEmitLeaGrpByBp(PVMCPUCC pVCpu, uint32_t off, uint8_t iGprDst, int32_t offDisp)
1288{
1289 /* lea gprdst, [rbp + offDisp] */
1290 uint8_t *pbCodeBuf = iemNativeInstrBufEnsure(pVCpu, off, 7);
1291 if (iGprDst < 8)
1292 pbCodeBuf[off++] = X86_OP_REX_W;
1293 else
1294 pbCodeBuf[off++] = X86_OP_REX_W | X86_OP_REX_R;
1295 pbCodeBuf[off++] = 0x8d;
1296 return iemNativeEmitGprByBpDisp(pbCodeBuf, off, iGprDst, offDisp);
1297}
1298#endif
1299
1300
1301#ifdef RT_ARCH_AMD64
1302/**
 1303 * Emits a 64-bit GPR store with a BP relative destination address.
1304 */
1305static uint32_t iemNativeEmitStoreGprByBp(PVMCPUCC pVCpu, uint32_t off, int32_t offDisp, uint8_t iGprSrc)
1306{
 1307 /* mov qword [rbp + offDisp], gprsrc */
1308 uint8_t *pbCodeBuf = iemNativeInstrBufEnsure(pVCpu, off, 7);
1309 if (iGprSrc < 8)
1310 pbCodeBuf[off++] = X86_OP_REX_W;
1311 else
1312 pbCodeBuf[off++] = X86_OP_REX_W | X86_OP_REX_R;
1313 pbCodeBuf[off++] = 0x89;
1314 return iemNativeEmitGprByBpDisp(pbCodeBuf, off, iGprSrc, offDisp);
1315}
1316#endif
1317
1318
1319#ifdef RT_ARCH_AMD64
1320/**
1321 * Emits a 64-bit GPR subtract with a signed immediate subtrahend.
1322 */
1323static uint32_t iemNativeEmitSubGprImm(PVMCPUCC pVCpu, uint32_t off, uint8_t iGprDst, int32_t iSubtrahend)
1324{
1325 /* sub gprdst, imm8/imm32 */
1326 uint8_t *pbCodeBuf = iemNativeInstrBufEnsure(pVCpu, off, 7);
 1327 if (iGprDst < 8)
1328 pbCodeBuf[off++] = X86_OP_REX_W;
1329 else
1330 pbCodeBuf[off++] = X86_OP_REX_W | X86_OP_REX_B;
1331 if (iSubtrahend < 128 && iSubtrahend >= -128)
1332 {
1333 pbCodeBuf[off++] = 0x83;
1334 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 5, iGprDst & 7);
1335 pbCodeBuf[off++] = (uint8_t)iSubtrahend;
1336 }
1337 else
1338 {
1339 pbCodeBuf[off++] = 0x81;
1340 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 5, iGprDst & 7);
1341 pbCodeBuf[off++] = RT_BYTE1(iSubtrahend);
1342 pbCodeBuf[off++] = RT_BYTE2(iSubtrahend);
1343 pbCodeBuf[off++] = RT_BYTE3(iSubtrahend);
1344 pbCodeBuf[off++] = RT_BYTE4(iSubtrahend);
1345 }
1346 return off;
1347}
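/* E.g. iGprDst=X86_GREG_xSP with iSubtrahend=0x28 yields 48 83 ec 28
 * (sub rsp, 28h); subtrahends outside the signed 8-bit range switch to the
 * 81 /5 form with a 32-bit immediate. */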
1348#endif
1349
1350
1351/**
 1352 * Emits code for checking the return code of a call and rcPassUp, returning
 1353 * from the code if either is non-zero.
1354 */
1355static uint32_t iemNativeEmitCheckCallRetAndPassUp(PVMCPUCC pVCpu, uint32_t off, uint8_t idxInstr)
1356{
1357#ifdef RT_ARCH_AMD64
1358 /* eax = call status code.*/
1359
1360 /* edx = rcPassUp */
1361 off = iemNativeEmitLoadGprFromVCpuU32(pVCpu, off, X86_GREG_xDX, RT_UOFFSETOF(VMCPUCC, iem.s.rcPassUp));
1362 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1363
1364 uint8_t *pbCodeBuf = iemNativeInstrBufEnsure(pVCpu, off, 10);
1365 AssertReturn(pbCodeBuf, UINT32_MAX);
1366
1367 /* edx = eax | rcPassUp*/
1368 pbCodeBuf[off++] = 0x0b; /* or edx, eax */
1369 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, X86_GREG_xDX, X86_GREG_xAX);
1370
1371 /* Jump to non-zero status return path, loading cl with the instruction number. */
1372 pbCodeBuf[off++] = 0xb0 + X86_GREG_xCX; /* mov cl, imm8 (pCallEntry->idxInstr) */
1373 pbCodeBuf[off++] = idxInstr;
1374
1375 pbCodeBuf[off++] = 0x0f; /* jnz rel32 */
1376 pbCodeBuf[off++] = 0x85;
1377 uint32_t const idxLabel = iemNativeMakeLabel(pVCpu, kIemNativeLabelType_NonZeroRetOrPassUp);
1378 AssertReturn(idxLabel != UINT32_MAX, UINT32_MAX);
1379 AssertReturn(iemNativeAddFixup(pVCpu, off, idxLabel, kIemNativeFixupType_Rel32, -4), UINT32_MAX);
1380 pbCodeBuf[off++] = 0x00;
1381 pbCodeBuf[off++] = 0x00;
1382 pbCodeBuf[off++] = 0x00;
1383 pbCodeBuf[off++] = 0x00;
1384
1385 /* done. */
1386
1387#elif RT_ARCH_ARM64
1388 RT_NOREF(pVCpu, idxInstr);
1389 off = UINT32_MAX;
1390
1391#else
1392# error "port me"
1393#endif
1394 return off;
1395}
1396
1397
1398/**
1399 * Emits a call to a threaded worker function.
1400 */
1401static uint32_t iemNativeEmitThreadedCall(PVMCPUCC pVCpu, uint32_t off, PCIEMTHRDEDCALLENTRY pCallEntry)
1402{
1403#ifdef VBOX_STRICT
1404 off = iemNativeEmitMarker(pVCpu, off);
1405 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1406#endif
1407 uint8_t const cParams = g_acIemThreadedFunctionUsedArgs[pCallEntry->enmFunction];
1408
1409#ifdef RT_ARCH_AMD64
1410 /* Load the parameters and emit the call. */
1411# ifdef RT_OS_WINDOWS
1412# ifndef VBOXSTRICTRC_STRICT_ENABLED
1413 off = iemNativeEmitLoadGprFromGpr(pVCpu, off, X86_GREG_xCX, X86_GREG_xBX);
1414 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1415 if (cParams > 0)
1416 {
1417 off = iemNativeEmitLoadGprImm64(pVCpu, off, X86_GREG_xDX, pCallEntry->auParams[0]);
1418 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1419 }
1420 if (cParams > 1)
1421 {
1422 off = iemNativeEmitLoadGprImm64(pVCpu, off, X86_GREG_x8, pCallEntry->auParams[1]);
1423 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1424 }
1425 if (cParams > 2)
1426 {
1427 off = iemNativeEmitLoadGprImm64(pVCpu, off, X86_GREG_x9, pCallEntry->auParams[2]);
1428 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1429 }
1430# else /* VBOXSTRICTRC: Returned via hidden parameter. Sigh. */
1431 off = iemNativeEmitLoadGprFromGpr(pVCpu, off, X86_GREG_xDX, X86_GREG_xBX);
1432 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1433 if (cParams > 0)
1434 {
1435 off = iemNativeEmitLoadGprImm64(pVCpu, off, X86_GREG_x8, pCallEntry->auParams[0]);
1436 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1437 }
1438 if (cParams > 1)
1439 {
1440 off = iemNativeEmitLoadGprImm64(pVCpu, off, X86_GREG_x9, pCallEntry->auParams[1]);
1441 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1442 }
1443 if (cParams > 2)
1444 {
1445 off = iemNativeEmitLoadGprImm64(pVCpu, off, X86_GREG_x10, pCallEntry->auParams[2]);
1446 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1447 }
1448 off = iemNativeEmitStoreGprByBp(pVCpu, off, IEMNATIVE_FP_OFF_STACK_ARG0, X86_GREG_x10);
1449 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1450 off = iemNativeEmitLeaGrpByBp(pVCpu, off, X86_GREG_xCX, IEMNATIVE_FP_OFF_IN_SHADOW_ARG0); /* rcStrict */
1451 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1452# endif /* VBOXSTRICTRC_STRICT_ENABLED */
1453# else
1454 off = iemNativeEmitLoadGprFromGpr(pVCpu, off, X86_GREG_xDI, X86_GREG_xBX);
1455 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1456 if (cParams > 0)
1457 {
1458 off = iemNativeEmitLoadGprImm64(pVCpu, off, X86_GREG_xSI, pCallEntry->auParams[0]);
1459 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1460 }
1461 if (cParams > 1)
1462 {
1463 off = iemNativeEmitLoadGprImm64(pVCpu, off, X86_GREG_xDX, pCallEntry->auParams[1]);
1464 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1465 }
1466 if (cParams > 2)
1467 {
1468 off = iemNativeEmitLoadGprImm64(pVCpu, off, X86_GREG_xCX, pCallEntry->auParams[2]);
1469 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1470 }
1471# endif
1472 off = iemNativeEmitLoadGprImm64(pVCpu, off, X86_GREG_xAX, (uintptr_t)g_apfnIemThreadedFunctions[pCallEntry->enmFunction]);
1473 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1474
1475 uint8_t *pbCodeBuf = iemNativeInstrBufEnsure(pVCpu, off, 2);
1476 AssertReturn(pbCodeBuf, UINT32_MAX);
1477 pbCodeBuf[off++] = 0xff; /* call rax */
1478 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 2, X86_GREG_xAX);
1479
1480# if defined(VBOXSTRICTRC_STRICT_ENABLED) && defined(RT_OS_WINDOWS)
1481 off = iemNativeEmitLoadGprByBpU32(pVCpu, off, X86_GREG_xAX, IEMNATIVE_FP_OFF_IN_SHADOW_ARG0); /* rcStrict (see above) */
1482# endif
1483
1484 /* Check the status code. */
1485 off = iemNativeEmitCheckCallRetAndPassUp(pVCpu, off, pCallEntry->idxInstr);
1486 AssertReturn(off != UINT32_MAX, off);
1487
1488
1489#elif RT_ARCH_ARM64
1490 RT_NOREF(pVCpu, pCallEntry);
1491 off = UINT32_MAX;
1492
1493#else
1494# error "port me"
1495#endif
1496 return off;
1497}
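/* As a rough sketch, for a two-parameter threaded function on a linux.amd64
 * host (without VBOXSTRICTRC_STRICT_ENABLED) the code emitted above amounts to:
 *      mov     rdi, rbx                        ; pVCpu
 *      mov     rsi, <auParams[0]>
 *      mov     rdx, <auParams[1]>
 *      mov     rax, <g_apfnIemThreadedFunctions[enmFunction]>
 *      call    rax
 * followed by the iemNativeEmitCheckCallRetAndPassUp status/rcPassUp check. */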
1498
1499
1500/**
1501 * Emits a standard epilog.
1502 */
1503static uint32_t iemNativeEmitEpilog(PVMCPUCC pVCpu, uint32_t off)
1504{
1505#ifdef RT_ARCH_AMD64
1506 uint8_t *pbCodeBuf = iemNativeInstrBufEnsure(pVCpu, off, 20);
1507 AssertReturn(pbCodeBuf, UINT32_MAX);
1508
1509 /*
1510 * Successful return, so clear eax.
1511 */
1512 pbCodeBuf[off++] = 0x33; /* xor eax, eax */
1513 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, X86_GREG_xAX, X86_GREG_xAX);
1514
1515 /*
1516 * Define label for common return point.
1517 */
1518 uint32_t const idxReturn = iemNativeMakeLabel(pVCpu, kIemNativeLabelType_Return, off);
1519 AssertReturn(idxReturn != UINT32_MAX, UINT32_MAX);
1520
 1521 /* Reposition rsp at the r15 restore point. */
1522 pbCodeBuf[off++] = X86_OP_REX_W;
1523 pbCodeBuf[off++] = 0x8d; /* lea rsp, [rbp - (gcc ? 5 : 7) * 8] */
1524 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_MEM1, X86_GREG_xSP, X86_GREG_xBP);
1525 pbCodeBuf[off++] = (uint8_t)IEMNATIVE_FP_OFF_LAST_PUSH;
1526
1527 /* Pop non-volatile registers and return */
1528 pbCodeBuf[off++] = X86_OP_REX_B; /* pop r15 */
1529 pbCodeBuf[off++] = 0x58 + X86_GREG_x15 - 8;
1530 pbCodeBuf[off++] = X86_OP_REX_B; /* pop r14 */
1531 pbCodeBuf[off++] = 0x58 + X86_GREG_x14 - 8;
1532 pbCodeBuf[off++] = X86_OP_REX_B; /* pop r13 */
1533 pbCodeBuf[off++] = 0x58 + X86_GREG_x13 - 8;
1534 pbCodeBuf[off++] = X86_OP_REX_B; /* pop r12 */
1535 pbCodeBuf[off++] = 0x58 + X86_GREG_x12 - 8;
1536# ifdef RT_OS_WINDOWS
1537 pbCodeBuf[off++] = 0x58 + X86_GREG_xDI; /* pop rdi */
1538 pbCodeBuf[off++] = 0x58 + X86_GREG_xSI; /* pop rsi */
1539# endif
1540 pbCodeBuf[off++] = 0x58 + X86_GREG_xBX; /* pop rbx */
1541 pbCodeBuf[off++] = 0xc9; /* leave */
1542 pbCodeBuf[off++] = 0xc3; /* ret */
1543 pbCodeBuf[off++] = 0xcc; /* int3 poison */
1544
1545 /*
1546 * Generate the rc + rcPassUp fiddling code if needed.
1547 */
1548 uint32_t idxLabel = iemNativeFindLabel(pVCpu, kIemNativeLabelType_NonZeroRetOrPassUp);
1549 if (idxLabel != UINT32_MAX)
1550 {
1551 Assert(pVCpu->iem.s.Native.paLabels[idxLabel].off == UINT32_MAX);
1552 pVCpu->iem.s.Native.paLabels[idxLabel].off = off;
1553
1554 /* Call helper and jump to return point. */
1555# ifdef RT_OS_WINDOWS
1556 off = iemNativeEmitLoadGprFromGpr(pVCpu, off, X86_GREG_x8, X86_GREG_xCX); /* cl = instruction number */
1557 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1558 off = iemNativeEmitLoadGprFromGpr(pVCpu, off, X86_GREG_xCX, X86_GREG_xBX);
1559 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1560 off = iemNativeEmitLoadGprFromGpr(pVCpu, off, X86_GREG_xDX, X86_GREG_xAX);
1561 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1562# else
1563 off = iemNativeEmitLoadGprFromGpr(pVCpu, off, X86_GREG_xDI, X86_GREG_xBX);
1564 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1565 off = iemNativeEmitLoadGprFromGpr(pVCpu, off, X86_GREG_xSI, X86_GREG_xAX);
1566 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1567 off = iemNativeEmitLoadGprFromGpr(pVCpu, off, X86_GREG_xDX, X86_GREG_xCX); /* cl = instruction number */
1568 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1569# endif
1570 off = iemNativeEmitLoadGprImm64(pVCpu, off, X86_GREG_xAX, (uintptr_t)iemNativeHlpExecStatusCodeFiddling);
1571 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1572
1573 pbCodeBuf = iemNativeInstrBufEnsure(pVCpu, off, 10);
1574 AssertReturn(pbCodeBuf, UINT32_MAX);
1575 pbCodeBuf[off++] = 0xff; /* call rax */
1576 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 2, X86_GREG_xAX);
1577
1578 /* Jump to common return point. */
1579 uint32_t offRel = pVCpu->iem.s.Native.paLabels[idxReturn].off - (off + 2);
1580 if (-(int32_t)offRel <= 127)
1581 {
1582 pbCodeBuf[off++] = 0xeb; /* jmp rel8 */
1583 pbCodeBuf[off++] = (uint8_t)offRel;
1584 off++;
1585 }
1586 else
1587 {
1588 offRel -= 3;
1589 pbCodeBuf[off++] = 0xe9; /* jmp rel32 */
1590 pbCodeBuf[off++] = RT_BYTE1(offRel);
1591 pbCodeBuf[off++] = RT_BYTE2(offRel);
1592 pbCodeBuf[off++] = RT_BYTE3(offRel);
1593 pbCodeBuf[off++] = RT_BYTE4(offRel);
1594 }
1595 pbCodeBuf[off++] = 0xcc; /* int3 poison */
1596 }
1597
1598#elif RT_ARCH_ARM64
1599 RT_NOREF(pVCpu);
1600 off = UINT32_MAX;
1601
1602#else
1603# error "port me"
1604#endif
1605 return off;
1606}
1607
1608
1609/**
1610 * Emits a standard prolog.
1611 */
1612static uint32_t iemNativeEmitProlog(PVMCPUCC pVCpu, uint32_t off)
1613{
1614#ifdef RT_ARCH_AMD64
1615 /*
1616 * Set up a regular xBP stack frame, pushing all non-volatile GPRs,
1617 * reserving 64 bytes for stack variables plus 4 non-register argument
1618 * slots. Fixed register assignment: xBX = pVCpu;
1619 *
1620 * Since we always do the same register spilling, we can use the same
1621 * unwind description for all the code.
1622 */
1623 uint8_t *pbCodeBuf = iemNativeInstrBufEnsure(pVCpu, off, 32);
1624 AssertReturn(pbCodeBuf, UINT32_MAX);
1625 pbCodeBuf[off++] = 0x50 + X86_GREG_xBP; /* push rbp */
1626 pbCodeBuf[off++] = X86_OP_REX_W; /* mov rbp, rsp */
1627 pbCodeBuf[off++] = 0x8b;
1628 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, X86_GREG_xBP, X86_GREG_xSP);
1629 pbCodeBuf[off++] = 0x50 + X86_GREG_xBX; /* push rbx */
1630# ifdef RT_OS_WINDOWS
1631 pbCodeBuf[off++] = X86_OP_REX_W; /* mov rbx, rcx ; RBX = pVCpu */
1632 pbCodeBuf[off++] = 0x8b;
1633 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, X86_GREG_xBX, X86_GREG_xCX);
1634 pbCodeBuf[off++] = 0x50 + X86_GREG_xSI; /* push rsi */
1635 pbCodeBuf[off++] = 0x50 + X86_GREG_xDI; /* push rdi */
1636# else
1637 pbCodeBuf[off++] = X86_OP_REX_W; /* mov rbx, rdi ; RBX = pVCpu */
1638 pbCodeBuf[off++] = 0x8b;
1639 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, X86_GREG_xBX, X86_GREG_xDI);
1640# endif
1641 pbCodeBuf[off++] = X86_OP_REX_B; /* push r12 */
1642 pbCodeBuf[off++] = 0x50 + X86_GREG_x12 - 8;
1643 pbCodeBuf[off++] = X86_OP_REX_B; /* push r13 */
1644 pbCodeBuf[off++] = 0x50 + X86_GREG_x13 - 8;
1645 pbCodeBuf[off++] = X86_OP_REX_B; /* push r14 */
1646 pbCodeBuf[off++] = 0x50 + X86_GREG_x14 - 8;
1647 pbCodeBuf[off++] = X86_OP_REX_B; /* push r15 */
1648 pbCodeBuf[off++] = 0x50 + X86_GREG_x15 - 8;
1649
1650 off = iemNativeEmitSubGprImm(pVCpu, off, /* sub rsp, byte 28h */
1651 X86_GREG_xSP,
1652 IEMNATIVE_FRAME_ALIGN_SIZE
1653 + IEMNATIVE_FRAME_VAR_SIZE
1654 + IEMNATIVE_FRAME_STACK_ARG_COUNT * 8
1655 + IEMNATIVE_FRAME_SHADOW_ARG_COUNT * 8);
1656 AssertCompile(!(IEMNATIVE_FRAME_VAR_SIZE & 0xf));
1657 AssertCompile(!(IEMNATIVE_FRAME_STACK_ARG_COUNT & 0x1));
1658 AssertCompile(!(IEMNATIVE_FRAME_SHADOW_ARG_COUNT & 0x1));
1659
1660#elif RT_ARCH_ARM64
1661 RT_NOREF(pVCpu);
1662 off = UINT32_MAX;
1663
1664#else
1665# error "port me"
1666#endif
1667 return off;
1668}
1669
1670
1671/**
1672 * Recompiles the given threaded TB into a native one.
1673 *
1674 * In case of failure the translation block will be returned as-is.
1675 *
1676 * @returns pTb.
1677 * @param pVCpu The cross context virtual CPU structure of the calling
1678 * thread.
1679 * @param pTb The threaded translation to recompile to native.
1680 */
1681PIEMTB iemNativeRecompile(PVMCPUCC pVCpu, PIEMTB pTb)
1682{
1683 /*
1684 * The first time thru, we allocate the recompiler state, the other times
1685 * we just need to reset it before using it again.
1686 */
1687 if (RT_LIKELY(pVCpu->iem.s.Native.pInstrBuf))
1688 iemNativeReInit(pVCpu);
1689 else
1690 AssertReturn(iemNativeInit(pVCpu), pTb);
1691
1692 /*
1693 * Emit prolog code (fixed atm).
1694 */
1695 uint32_t off = iemNativeEmitProlog(pVCpu, 0);
1696 AssertReturn(off != UINT32_MAX, pTb);
1697
1698 /*
1699 * Convert the calls to native code.
1700 */
1701 PCIEMTHRDEDCALLENTRY pCallEntry = pTb->Thrd.paCalls;
1702 uint32_t cCallsLeft = pTb->Thrd.cCalls;
1703 while (cCallsLeft-- > 0)
1704 {
1705 off = iemNativeEmitThreadedCall(pVCpu, off, pCallEntry);
1706 AssertReturn(off != UINT32_MAX, pTb);
1707
1708 pCallEntry++;
1709 }
1710
1711 /*
1712 * Emit the epilog code.
1713 */
1714 off = iemNativeEmitEpilog(pVCpu, off);
1715 AssertReturn(off != UINT32_MAX, pTb);
1716
1717 /*
 1718 * Make sure all labels have been defined.
1719 */
1720 PIEMNATIVELABEL const paLabels = pVCpu->iem.s.Native.paLabels;
1721#ifdef VBOX_STRICT
1722 uint32_t const cLabels = pVCpu->iem.s.Native.cLabels;
1723 for (uint32_t i = 0; i < cLabels; i++)
1724 AssertMsgReturn(paLabels[i].off < off, ("i=%d enmType=%d\n", i, paLabels[i].enmType), pTb);
1725#endif
1726
1727 /*
1728 * Allocate executable memory, copy over the code we've generated.
1729 */
1730 PIEMTBALLOCATOR const pTbAllocator = pVCpu->iem.s.pTbAllocatorR3;
1731 if (pTbAllocator->pDelayedFreeHead)
1732 iemTbAllocatorProcessDelayedFrees(pVCpu, pVCpu->iem.s.pTbAllocatorR3);
1733
1734 PIEMNATIVEINSTR const paFinalInstrBuf = (PIEMNATIVEINSTR)iemExecMemAllocatorAlloc(pVCpu, off * sizeof(IEMNATIVEINSTR));
1735 AssertReturn(paFinalInstrBuf, pTb);
1736 memcpy(paFinalInstrBuf, pVCpu->iem.s.Native.pInstrBuf, off * sizeof(paFinalInstrBuf[0]));
1737
1738 /*
1739 * Apply fixups.
1740 */
1741 PIEMNATIVEFIXUP const paFixups = pVCpu->iem.s.Native.paFixups;
1742 uint32_t const cFixups = pVCpu->iem.s.Native.cFixups;
1743 for (uint32_t i = 0; i < cFixups; i++)
1744 {
1745 Assert(paFixups[i].off < off);
1746 Assert(paFixups[i].idxLabel < cLabels);
1747 RTPTRUNION const Ptr = { &paFinalInstrBuf[paFixups[i].off] };
1748 switch (paFixups[i].enmType)
1749 {
1750#if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
1751 case kIemNativeFixupType_Rel32:
1752 Assert(paFixups[i].off + 4 <= off);
1753 *Ptr.pi32 = paLabels[paFixups[i].idxLabel].off - paFixups[i].off + paFixups[i].offAddend;
1754 continue;
1755
1756#elif defined(RT_ARCH_ARM64)
1757#endif
1758 case kIemNativeFixupType_Invalid:
1759 case kIemNativeFixupType_End:
1760 break;
1761 }
1762 AssertFailed();
1763 }
1764
1765 iemExecMemAllocatorReadyForUse(pVCpu, paFinalInstrBuf, off * sizeof(IEMNATIVEINSTR));
1766
1767 /*
1768 * Convert the translation block.
1769 */
1770 //RT_BREAKPOINT();
1771 RTMemFree(pTb->Thrd.paCalls);
1772 pTb->Native.paInstructions = paFinalInstrBuf;
1773 pTb->Native.cInstructions = off;
1774 pTb->fFlags = (pTb->fFlags & ~IEMTB_F_TYPE_MASK) | IEMTB_F_TYPE_NATIVE;
1775
1776 Assert(pTbAllocator->cThreadedTbs > 0);
1777 pTbAllocator->cThreadedTbs -= 1;
1778 pTbAllocator->cNativeTbs += 1;
1779 Assert(pTbAllocator->cNativeTbs <= pTbAllocator->cTotalTbs);
1780
1781 return pTb;
1782}
1783