VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllN8veExecMem.cpp@ 106309

Last change on this file since 106309 was 106309, checked in by vboxsync, 5 months ago

VMM/IEM: Some more tweaking of the bitmap scanning. bugref:10720

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 83.4 KB
1/* $Id: IEMAllN8veExecMem.cpp 106309 2024-10-14 14:58:19Z vboxsync $ */
2/** @file
3 * IEM - Native Recompiler, Executable Memory Allocator.
4 */
5
6/*
7 * Copyright (C) 2023-2024 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28
29/*********************************************************************************************************************************
30* Header Files *
31*********************************************************************************************************************************/
32#define LOG_GROUP LOG_GROUP_IEM_RE_NATIVE
33#define IEM_WITH_OPAQUE_DECODER_STATE
34#define VMM_INCLUDED_SRC_include_IEMMc_h /* block IEMMc.h inclusion. */
35#include <VBox/vmm/iem.h>
36#include <VBox/vmm/cpum.h>
37#include "IEMInternal.h"
38#include <VBox/vmm/vmcc.h>
39#include <VBox/log.h>
40#include <VBox/err.h>
41#include <VBox/param.h>
42#include <iprt/assert.h>
43#include <iprt/mem.h>
44#include <iprt/string.h>
45#if defined(RT_ARCH_AMD64)
46# include <iprt/x86.h>
47#elif defined(RT_ARCH_ARM64)
48# include <iprt/armv8.h>
49#endif
50
51#ifdef RT_OS_WINDOWS
52# include <iprt/formats/pecoff.h> /* this is incompatible with windows.h, thus: */
53extern "C" DECLIMPORT(uint8_t) __cdecl RtlAddFunctionTable(void *pvFunctionTable, uint32_t cEntries, uintptr_t uBaseAddress);
54extern "C" DECLIMPORT(uint8_t) __cdecl RtlDelFunctionTable(void *pvFunctionTable);
55#else
56# include <iprt/formats/dwarf.h>
57# if defined(RT_OS_DARWIN)
58# include <libkern/OSCacheControl.h>
59# include <mach/mach.h>
60# include <mach/mach_vm.h>
61# define IEMNATIVE_USE_LIBUNWIND
62extern "C" void __register_frame(const void *pvFde);
63extern "C" void __deregister_frame(const void *pvFde);
64# else
65# ifdef DEBUG_bird /** @todo not thread safe yet */
66# define IEMNATIVE_USE_GDB_JIT
67# endif
68# ifdef IEMNATIVE_USE_GDB_JIT
69# include <iprt/critsect.h>
70# include <iprt/once.h>
71# include <iprt/formats/elf64.h>
72# endif
73extern "C" void __register_frame_info(void *pvBegin, void *pvObj); /* found no header for these two */
74extern "C" void *__deregister_frame_info(void *pvBegin); /* (returns pvObj from __register_frame_info call) */
75# endif
76#endif
77
78#include "IEMN8veRecompiler.h"
79
80
81/*********************************************************************************************************************************
82* Executable Memory Allocator *
83*********************************************************************************************************************************/
84/** The chunk sub-allocation unit size in bytes. */
85#define IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE 256
86/** The chunk sub-allocation unit size as a shift factor. */
87#define IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT 8
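/* Worked example (illustrative figures, not taken from this file): with the 256 byte
   unit size, a request of cbReq = 2000 bytes plus the 16 byte allocation header
   (typical 64-bit layout of IEMEXECMEMALLOCHDR, see below) rounds up to
   (2000 + 16 + 255) >> 8 = 8 units, i.e. a 2048 byte block inside the chunk. */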
88/** Enables adding a header to the sub-allocator allocations.
89 * This is useful for freeing up executable memory among other things. */
90#define IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER
91/** Use alternative pruning. */
92#define IEMEXECMEM_ALT_SUB_WITH_ALT_PRUNING
93
94
95#if defined(IN_RING3) && !defined(RT_OS_WINDOWS)
96# ifdef IEMNATIVE_USE_GDB_JIT
97# define IEMNATIVE_USE_GDB_JIT_ET_DYN
98
99/** GDB JIT: Code entry. */
100typedef struct GDBJITCODEENTRY
101{
102 struct GDBJITCODEENTRY *pNext;
103 struct GDBJITCODEENTRY *pPrev;
104 uint8_t *pbSymFile;
105 uint64_t cbSymFile;
106} GDBJITCODEENTRY;
107
108/** GDB JIT: Actions. */
109typedef enum GDBJITACTIONS : uint32_t
110{
111 kGdbJitaction_NoAction = 0, kGdbJitaction_Register, kGdbJitaction_Unregister
112} GDBJITACTIONS;
113
114/** GDB JIT: Descriptor. */
115typedef struct GDBJITDESCRIPTOR
116{
117 uint32_t uVersion;
118 GDBJITACTIONS enmAction;
119 GDBJITCODEENTRY *pRelevant;
120 GDBJITCODEENTRY *pHead;
121 /** Our addition: */
122 GDBJITCODEENTRY *pTail;
123} GDBJITDESCRIPTOR;
124
125/** GDB JIT: Our simple symbol file data. */
126typedef struct GDBJITSYMFILE
127{
128 Elf64_Ehdr EHdr;
129# ifndef IEMNATIVE_USE_GDB_JIT_ET_DYN
130 Elf64_Shdr aShdrs[5];
131# else
132 Elf64_Shdr aShdrs[7];
133 Elf64_Phdr aPhdrs[2];
134# endif
135 /** The dwarf ehframe data for the chunk. */
136 uint8_t abEhFrame[512];
137 char szzStrTab[128];
138 Elf64_Sym aSymbols[3];
139# ifdef IEMNATIVE_USE_GDB_JIT_ET_DYN
140 Elf64_Sym aDynSyms[2];
141 Elf64_Dyn aDyn[6];
142# endif
143} GDBJITSYMFILE;
144
145extern "C" GDBJITDESCRIPTOR __jit_debug_descriptor;
146extern "C" DECLEXPORT(void) __jit_debug_register_code(void);
147
148/** Init once for g_IemNativeGdbJitLock. */
149static RTONCE g_IemNativeGdbJitOnce = RTONCE_INITIALIZER;
150/** Critical section serializing the GDB JIT registrations. */
151static RTCRITSECT g_IemNativeGdbJitLock;
152
153/** GDB reads the info here. */
154GDBJITDESCRIPTOR __jit_debug_descriptor = { 1, kGdbJitaction_NoAction, NULL, NULL };
155
156/** GDB sets a breakpoint on this and checks __jit_debug_descriptor when hit. */
157DECL_NO_INLINE(RT_NOTHING, DECLEXPORT(void)) __jit_debug_register_code(void)
158{
159 ASMNopPause();
160}
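/* Registration sketch (how the GDB JIT interface above is typically driven; not
   code from this file): the JIT links a new GDBJITCODEENTRY into
   __jit_debug_descriptor.pRelevant/pHead, sets enmAction to
   kGdbJitaction_Register and then calls __jit_debug_register_code(); GDB keeps a
   breakpoint on that function and reads the descriptor when it is hit. */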
161
162/** @callback_method_impl{FNRTONCE} */
163static DECLCALLBACK(int32_t) iemNativeGdbJitInitOnce(void *pvUser)
164{
165 RT_NOREF(pvUser);
166 return RTCritSectInit(&g_IemNativeGdbJitLock);
167}
168
169
170# endif /* IEMNATIVE_USE_GDB_JIT */
171
172/**
173 * Per-chunk unwind info for non-windows hosts.
174 */
175typedef struct IEMEXECMEMCHUNKEHFRAME
176{
177# ifdef IEMNATIVE_USE_LIBUNWIND
178 /** The offset of the FDA into abEhFrame. */
179 uintptr_t offFda;
180# else
181 /** 'struct object' storage area. */
182 uint8_t abObject[1024];
183# endif
184# ifdef IEMNATIVE_USE_GDB_JIT
185# if 0
186 /** The GDB JIT 'symbol file' data. */
187 GDBJITSYMFILE GdbJitSymFile;
188# endif
189 /** The GDB JIT list entry. */
190 GDBJITCODEENTRY GdbJitEntry;
191# endif
192 /** The dwarf ehframe data for the chunk. */
193 uint8_t abEhFrame[512];
194} IEMEXECMEMCHUNKEHFRAME;
195/** Pointer to per-chunk unwind info for non-windows hosts. */
196typedef IEMEXECMEMCHUNKEHFRAME *PIEMEXECMEMCHUNKEHFRAME;
197#endif
198
199
200/**
201 * A chunk of executable memory.
202 */
203typedef struct IEMEXECMEMCHUNK
204{
205 /** Number of free items in this chunk. */
206 uint32_t cFreeUnits;
207 /** Hint where to start searching for free space in the allocation bitmap. */
208 uint32_t idxFreeHint;
209 /** Pointer to the readable/writeable view of the memory chunk. */
210 void *pvChunkRw;
211 /** Pointer to the readable/executable view of the memory chunk. */
212 void *pvChunkRx;
213 /** Pointer to the context structure detailing the per chunk common code. */
214 PCIEMNATIVEPERCHUNKCTX pCtx;
215#ifdef IN_RING3
216 /**
217 * Pointer to the unwind information.
218 *
219 * This is used during C++ throw and longjmp (windows and probably most other
220 * platforms). Some debuggers (windbg) make use of it as well.
221 *
222 * Windows: This is allocated from hHeap on windows because (at least for
223 * AMD64) the UNWIND_INFO structure address in the
224 * RUNTIME_FUNCTION entry is an RVA and the chunk is the "image".
225 *
226 * Others: Allocated from the regular heap to avoid unnecessary executable data
227 * structures. This points to an IEMEXECMEMCHUNKEHFRAME structure. */
228 void *pvUnwindInfo;
229#elif defined(IN_RING0)
230 /** Allocation handle. */
231 RTR0MEMOBJ hMemObj;
232#endif
233} IEMEXECMEMCHUNK;
234/** Pointer to a memory chunk. */
235typedef IEMEXECMEMCHUNK *PIEMEXECMEMCHUNK;
236
237
238/**
239 * Executable memory allocator for the native recompiler.
240 */
241typedef struct IEMEXECMEMALLOCATOR
242{
243 /** Magic value (IEMEXECMEMALLOCATOR_MAGIC). */
244 uint32_t uMagic;
245
246 /** The chunk size. */
247 uint32_t cbChunk;
248 /** The maximum number of chunks. */
249 uint32_t cMaxChunks;
250 /** The current number of chunks. */
251 uint32_t cChunks;
252 /** Hint where to start looking for available memory. */
253 uint32_t idxChunkHint;
254 /** Statistics: Current number of allocations. */
255 uint32_t cAllocations;
256
257 /** The total amount of memory available. */
258 uint64_t cbTotal;
259 /** Total amount of free memory. */
260 uint64_t cbFree;
261 /** Total amount of memory allocated. */
262 uint64_t cbAllocated;
263
264 /** Pointer to the allocation bitmaps for all the chunks (follows aChunks).
265 *
266 * Since the chunk size is a power of two and the minimum chunk size is a lot
267 * higher than the IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE, each chunk will always
268 * require a whole number of uint64_t elements in the allocation bitmap. So,
269 * for the sake of simplicity, they are allocated as one contiguous
270 * block. */
271 uint64_t *pbmAlloc;
272 /** Number of units (IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE) per chunk. */
273 uint32_t cUnitsPerChunk;
274 /** Number of bitmap elements per chunk (for quickly locating the bitmap
275 * portion corresponding to a chunk). */
276 uint32_t cBitmapElementsPerChunk;
277
278 /** Number of times we fruitlessly scanned a chunk for free space. */
279 uint64_t cFruitlessChunkScans;
280
281#ifdef IEMEXECMEM_ALT_SUB_WITH_ALT_PRUNING
282 /** The next chunk to prune in. */
283 uint32_t idxChunkPrune;
284 /** Offset into the chunk at which to start pruning. */
285 uint32_t offChunkPrune;
286 /** Profiling the pruning code. */
287 STAMPROFILE StatPruneProf;
288 /** Number of bytes recovered by the pruning. */
289 STAMPROFILE StatPruneRecovered;
290#endif
291
292#ifdef VBOX_WITH_STATISTICS
293 STAMPROFILE StatAlloc;
294 /** Total amount of memory currently unusable due to IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE rounding. */
295 uint64_t cbUnusable;
296#endif
297
298#if defined(IN_RING3) && !defined(RT_OS_WINDOWS)
299 /** Pointer to the array of unwind info running parallel to aChunks (same
300 * allocation as this structure, located after the bitmaps).
301 * (For Windows, the structures must reside within 32-bit RVA distance of the
302 * actual chunk, so they are allocated off the chunk.) */
303 PIEMEXECMEMCHUNKEHFRAME paEhFrames;
304#endif
305
306 /** The allocation chunks. */
307 RT_FLEXIBLE_ARRAY_EXTENSION
308 IEMEXECMEMCHUNK aChunks[RT_FLEXIBLE_ARRAY];
309} IEMEXECMEMALLOCATOR;
310/** Pointer to an executable memory allocator. */
311typedef IEMEXECMEMALLOCATOR *PIEMEXECMEMALLOCATOR;
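/* Sizing illustration (hypothetical chunk size, not a value taken from this
   file): a 16 MiB chunk divided into 256 byte units gives 65536 units per
   chunk, i.e. 1024 uint64_t bitmap elements per chunk, so the bitmap words for
   chunk idxChunk start at pbmAlloc[1024 * idxChunk]. */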
312
313/** Magic value for IEMEXECMEMALLOCATOR::uMagic (Scott Frederick Turow). */
314#define IEMEXECMEMALLOCATOR_MAGIC UINT32_C(0x19490412)
315
316
317#ifdef IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER
318/**
319 * Allocation header.
320 */
321typedef struct IEMEXECMEMALLOCHDR
322{
323 /** Magic value / eyecatcher (IEMEXECMEMALLOCHDR_MAGIC). */
324 uint32_t uMagic;
325 /** The allocation chunk (for speeding up freeing). */
326 uint32_t idxChunk;
327 /** Pointer to the translation block the allocation belongs to.
328 * This is the whole point of the header. */
329 PIEMTB pTb;
330} IEMEXECMEMALLOCHDR;
331/** Pointer to an allocation header. */
332typedef IEMEXECMEMALLOCHDR *PIEMEXECMEMALLOCHDR;
333/** Magic value for IEMEXECMEMALLOCHDR ('ExeM'). */
334# define IEMEXECMEMALLOCHDR_MAGIC UINT32_C(0x4d657845)
335#endif
336
337
338static int iemExecMemAllocatorGrow(PVMCPUCC pVCpu, PIEMEXECMEMALLOCATOR pExecMemAllocator);
339
340
341#ifdef IEMEXECMEM_ALT_SUB_WITH_ALT_PRUNING
342/**
343 * Frees up executable memory when we're out of space.
344 *
345 * This is an alternative to iemTbAllocatorFreeupNativeSpace() that frees up
346 * space in a more linear fashion from the allocator's point of view. It may
347 * also defragment if implemented & enabled.
348 */
349static void iemExecMemAllocatorPrune(PVMCPU pVCpu, PIEMEXECMEMALLOCATOR pExecMemAllocator)
350{
351# ifndef IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER
352# error "IEMEXECMEM_ALT_SUB_WITH_ALT_PRUNING requires IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER"
353# endif
354 STAM_REL_PROFILE_START(&pExecMemAllocator->StatPruneProf, a);
355
356 /*
357 * Before we can start, we must process delayed frees.
358 */
359 iemTbAllocatorProcessDelayedFrees(pVCpu, pVCpu->iem.s.pTbAllocatorR3);
360
361 AssertCompile(RT_IS_POWER_OF_TWO(IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE));
362
363 uint32_t const cbChunk = pExecMemAllocator->cbChunk;
364 AssertReturnVoid(RT_IS_POWER_OF_TWO(cbChunk));
365 AssertReturnVoid(cbChunk >= _1M && cbChunk <= _256M); /* see iemExecMemAllocatorInit */
366
367 uint32_t const cChunks = pExecMemAllocator->cChunks;
368 AssertReturnVoid(cChunks == pExecMemAllocator->cMaxChunks);
369 AssertReturnVoid(cChunks >= 1);
370
371 Assert(!pVCpu->iem.s.pCurTbR3);
372
373 /*
374 * Decide how much to prune. The chunk size is a power of two, so we'll be
375 * scanning a power of two sized range here as well.
376 */
377 uint32_t cbToPrune = cbChunk;
378
379 /* Never more than 25%. */
380 if (cChunks < 4)
381 cbToPrune /= cChunks == 1 ? 4 : 2;
382
383 /* Upper limit. In a debug build a 4MB limit averages out at ~0.6ms per call. */
384 if (cbToPrune > _4M)
385 cbToPrune = _4M;
386
387 /*
388 * Adjust the pruning chunk and offset accordingly.
389 */
390 uint32_t idxChunk = pExecMemAllocator->idxChunkPrune;
391 uint32_t offChunk = pExecMemAllocator->offChunkPrune;
392 offChunk &= ~(uint32_t)(IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE - 1U);
393 if (offChunk >= cbChunk)
394 {
395 offChunk = 0;
396 idxChunk += 1;
397 }
398 if (idxChunk >= cChunks)
399 {
400 offChunk = 0;
401 idxChunk = 0;
402 }
403
404 uint32_t const offPruneEnd = RT_MIN(offChunk + cbToPrune, cbChunk);
405
406 /*
407 * Do the pruning. The current approach is the severe kind.
408 */
409 uint64_t cbPruned = 0;
410 uint8_t * const pbChunk = (uint8_t *)pExecMemAllocator->aChunks[idxChunk].pvChunkRx;
411 while (offChunk < offPruneEnd)
412 {
413 PIEMEXECMEMALLOCHDR pHdr = (PIEMEXECMEMALLOCHDR)&pbChunk[offChunk];
414
415 /* Is this the start of an allocation block for a TB? (We typically have
416 one allocation at the start of each chunk for the unwind info where
417 pTb is NULL.) */
418 if ( pHdr->uMagic == IEMEXECMEMALLOCHDR_MAGIC
419 && pHdr->pTb != NULL
420 && pHdr->idxChunk == idxChunk)
421 {
422 PIEMTB const pTb = pHdr->pTb;
423 AssertPtr(pTb);
424
425 uint32_t const cbBlock = RT_ALIGN_32(pTb->Native.cInstructions * sizeof(IEMNATIVEINSTR) + sizeof(*pHdr),
426 IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE);
427 AssertBreakStmt(offChunk + cbBlock <= cbChunk, offChunk += IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE); /* paranoia */
428
429 iemTbAllocatorFree(pVCpu, pTb);
430
431 cbPruned += cbBlock;
432 offChunk += cbBlock;
433 }
434 else
435 offChunk += IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE;
436 }
437 STAM_REL_PROFILE_ADD_PERIOD(&pExecMemAllocator->StatPruneRecovered, cbPruned);
438
439 /*
440 * Save the current pruning point.
441 */
442 pExecMemAllocator->offChunkPrune = offChunk;
443 pExecMemAllocator->idxChunkPrune = idxChunk;
444
445 /* Set the hint to the start of the pruned region. */
446 pExecMemAllocator->idxChunkHint = idxChunk;
447 pExecMemAllocator->aChunks[idxChunk].idxFreeHint = offChunk / IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE;
448
449 STAM_REL_PROFILE_STOP(&pExecMemAllocator->StatPruneProf, a);
450}
451#endif /* IEMEXECMEM_ALT_SUB_WITH_ALT_PRUNING */
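/* Pruning budget illustration (hypothetical sizes): with a single 2 MiB chunk
   the code above prunes 2 MiB / 4 = 512 KiB per call; with two or three chunks
   it prunes half a chunk; with four or more chunks it starts from a whole
   chunk, but the amount is always capped at 4 MiB per call. */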
452
453
454#if defined(VBOX_STRICT) || 0
455/**
456 * The old bitmap scanner code, for comparison and assertions.
457 */
458static uint32_t iemExecMemAllocatorFindReqFreeUnitsOld(uint64_t *pbmAlloc, uint32_t cToScan, uint32_t cReqUnits)
459{
460 /** @todo This can probably be done more efficiently for non-x86 systems. */
461 int iBit = ASMBitFirstClear(pbmAlloc, cToScan);
462 while (iBit >= 0 && (uint32_t)iBit <= cToScan - cReqUnits)
463 {
464 uint32_t idxAddBit = 1;
465 while (idxAddBit < cReqUnits && !ASMBitTest(pbmAlloc, (uint32_t)iBit + idxAddBit))
466 idxAddBit++;
467 if (idxAddBit >= cReqUnits)
468 return (uint32_t)iBit;
469 iBit = ASMBitNextClear(pbmAlloc, cToScan, iBit + idxAddBit - 1);
470 }
471 return UINT32_MAX;
472}
473#endif
474
475
476/**
477 * Bitmap scanner code that looks for a bunch of @a cReqUnits zero bits.
478 *
479 * Booting win11 with a r165098 release build the average native TB size is
480 * around 9 units (of 256 bytes). So, it is unlikely we need to scan any
481 * subsequent words once we hit a patch of zeros, thus @a a_fBig.
482 *
483 * @todo This needs more tweaking. While it *is* faster than the old code,
484 * it doesn't seem like it's all that much. :/
485 */
486template<const bool a_fBig>
487static uint32_t iemExecMemAllocatorFindReqFreeUnits(uint64_t *pbmAlloc, uint32_t c64WordsToScan, uint32_t cReqUnits)
488{
489 /*
490 * Scan the (section of the) allocation bitmap in 64-bit words.
491 */
492 unsigned cPrevLeadingZeros = 0;
493 for (uint32_t off = 0; off < c64WordsToScan; off++)
494 {
495 uint64_t uWord = pbmAlloc[off];
496 if (uWord == UINT64_MAX)
497 {
498 /*
499 * Getting thru patches of UINT64_MAX is a frequent problem when the allocator
500 * fills up, so it's definitely worth optimizing.
501 *
502 * The complicated code below is a bit faster on arm, reducing the per TB cost
503 * from 4255ns to 4106ns (best run out of 10). On win/x86 the gain isn't so
504 * marked, despite more full bitmap scans.
505 */
506#if 1
507 off++;
508 uint32_t cQuads = (c64WordsToScan - off) / 4;
509
510 /* Align. */
511 if (cQuads > 1)
512 switch (((uintptr_t)&pbmAlloc[off] / sizeof(uint64_t)) & 3)
513 {
514 case 0:
515 break;
516 case 1:
517 {
518 uWord = pbmAlloc[off];
519 uint64_t uWord1 = pbmAlloc[off + 1];
520 uint64_t uWord2 = pbmAlloc[off + 2];
521 if ((uWord & uWord1 & uWord2) == UINT64_MAX)
522 {
523 off += 3;
524 cQuads = (c64WordsToScan - off) / 4;
525 }
526 else if (uWord == UINT64_MAX)
527 {
528 if (uWord1 != UINT64_MAX)
529 {
530 uWord = uWord1;
531 off += 1;
532 }
533 else
534 {
535 uWord = uWord2;
536 off += 2;
537 }
538 }
539 break;
540 }
541 case 2:
542 {
543 uWord = pbmAlloc[off];
544 uint64_t uWord1 = pbmAlloc[off + 1];
545 if ((uWord & uWord1) == UINT64_MAX)
546 {
547 off += 2;
548 cQuads = (c64WordsToScan - off) / 4;
549 }
550 else if (uWord == UINT64_MAX)
551 {
552 uWord = uWord1;
553 off += 1;
554 }
555 break;
556 }
557 case 3:
558 uWord = pbmAlloc[off];
559 if (uWord == UINT64_MAX)
560 {
561 off++;
562 cQuads = (c64WordsToScan - off) / 4;
563 }
564 break;
565 }
566 if (uWord == UINT64_MAX)
567 {
568 /* Looping over 32 bytes at a time. */
569 for (;;)
570 {
571 if (cQuads-- > 0)
572 {
573 uWord = pbmAlloc[off + 0];
574 uint64_t uWord1 = pbmAlloc[off + 1];
575 uint64_t uWord2 = pbmAlloc[off + 2];
576 uint64_t uWord3 = pbmAlloc[off + 3];
577 if ((uWord & uWord1 & uWord2 & uWord3) == UINT64_MAX)
578 off += 4;
579 else
580 {
581 if (uWord != UINT64_MAX)
582 { }
583 else if (uWord1 != UINT64_MAX)
584 {
585 uWord = uWord1;
586 off += 1;
587 }
588 else if (uWord2 != UINT64_MAX)
589 {
590 uWord = uWord2;
591 off += 2;
592 }
593 else
594 {
595 uWord = uWord3;
596 off += 3;
597 }
598 break;
599 }
600 }
601 else
602 {
603 if (off < c64WordsToScan)
604 {
605 uWord = pbmAlloc[off];
606 if (uWord != UINT64_MAX)
607 break;
608 off++;
609 if (off < c64WordsToScan)
610 {
611 uWord = pbmAlloc[off];
612 if (uWord != UINT64_MAX)
613 break;
614 off++;
615 if (off < c64WordsToScan)
616 {
617 uWord = pbmAlloc[off];
618 if (uWord != UINT64_MAX)
619 break;
620 Assert(off + 1 == c64WordsToScan);
621 }
622 }
623 }
624 return UINT32_MAX;
625 }
626 }
627 }
628#else
629 do
630 {
631 off++;
632 if (off < c64WordsToScan)
633 uWord = pbmAlloc[off];
634 else
635 return UINT32_MAX;
636 } while (uWord == UINT64_MAX);
637#endif
638 cPrevLeadingZeros = 0;
639 }
640
641 if (uWord != 0)
642 {
643 /*
644 * Fend off large requests we cannot satisfy here before doing anything else.
645 */
646 if (!a_fBig || cReqUnits < 64 + cPrevLeadingZeros)
647 {
648#ifdef __GNUC__
649 unsigned cZerosInWord = __builtin_popcountl(~uWord);
650#else
651# ifdef RT_ARCH_AMD64
652 unsigned cZerosInWord = __popcnt64(~uWord);
653# else
654# pragma message("need popcount intrinsic or something...") /** @todo port me: Win/ARM. */
655 unsigned cZerosInWord = 0;
656 for (uint64_t uTmp = ~uWord; uTmp; cZerosInWord++)
657 uTmp &= uTmp - 1; /* Clears the least significant bit set. */
658# endif
659#endif
660 if (cZerosInWord + cPrevLeadingZeros >= cReqUnits)
661 {
662 /* Check if we've got a patch of zeros at the trailing end
663 when joined with the previous word: */
664#ifdef __GNUC__
665 unsigned cTrailingZeros = __builtin_ctzl(uWord);
666#else
667 unsigned cTrailingZeros = ASMBitFirstSetU64(uWord) - 1;
668#endif
669 if (cPrevLeadingZeros + cTrailingZeros >= cReqUnits)
670 return off * 64 - cPrevLeadingZeros;
671
672 /*
673 * Try leading zeros before we get on with the tedious stuff.
674 */
675#ifdef __GNUC__
676 cPrevLeadingZeros = __builtin_clzl(uWord);
677#else
678 cPrevLeadingZeros = 64 - ASMBitLastSetU64(uWord);
679#endif
680 if (cPrevLeadingZeros >= cReqUnits)
681 return (off + 1) * 64 - cPrevLeadingZeros;
682
683 /*
684 * Check the popcount again sans leading & trailing before looking
685 * inside the word.
686 */
687 cZerosInWord -= cPrevLeadingZeros + cTrailingZeros;
688 if (cZerosInWord >= cReqUnits)
689 {
690 /* 1; 64 - 0 - 1 = 63; */
691 unsigned const iBitLast = 64 - cPrevLeadingZeros - cReqUnits; /** @todo boundary */
692 unsigned iBit = cTrailingZeros;
693 uWord >>= cTrailingZeros;
694 do
695 {
696 Assert(uWord & 1);
697#ifdef __GNUC__
698 unsigned iZeroBit = __builtin_ctzl(~uWord);
699#else
700 unsigned iZeroBit = ASMBitFirstSetU64(~uWord) - 1;
701#endif
702 iBit += iZeroBit;
703 uWord >>= iZeroBit;
704 Assert(iBit <= iBitLast);
705 Assert((uWord & 1) == 0);
706#ifdef __GNUC__
707 unsigned cZeros = __builtin_ctzl(uWord);
708#else
709 unsigned cZeros = ASMBitFirstSetU64(uWord) - 1;
710#endif
711 if (cZeros >= cReqUnits)
712 return off * 64 + iBit;
713
714 cZerosInWord -= cZeros; /* (may underflow as we will count shifted in zeros) */
715 iBit += cZeros;
716 uWord >>= cZeros;
717 } while ((int)cZerosInWord >= (int)cReqUnits && iBit < iBitLast);
718 }
719 continue; /* we've already calculated cPrevLeadingZeros */
720 }
721 }
722
723 /* Update the leading (MSB) zero count. */
724#ifdef __GNUC__
725 cPrevLeadingZeros = __builtin_clzl(uWord);
726#else
727 cPrevLeadingZeros = 64 - ASMBitLastSetU64(uWord);
728#endif
729 }
730 /*
731 * uWord == 0
732 */
733 else
734 {
735 if RT_CONSTEXPR_IF(!a_fBig)
736 return off * 64 - cPrevLeadingZeros;
737 else
738 {
739 if (cPrevLeadingZeros + 64 >= cReqUnits)
740 return off * 64 - cPrevLeadingZeros;
741 for (uint32_t off2 = off + 1;; off2++)
742 {
743 if (off2 < c64WordsToScan)
744 {
745 uWord = pbmAlloc[off2];
746 if (uWord == UINT64_MAX)
747 {
748 cPrevLeadingZeros = 0;
749 break;
750 }
751 if (uWord == 0)
752 {
753 if (cPrevLeadingZeros + (off2 - off + 1) * 64 >= cReqUnits)
754 return off * 64 - cPrevLeadingZeros;
755 }
756 else
757 {
758#ifdef __GNUC__
759 unsigned cTrailingZeros = __builtin_ctzl(uWord);
760#else
761 unsigned cTrailingZeros = ASMBitFirstSetU64(uWord) - 1;
762#endif
763 if (cPrevLeadingZeros + (off2 - off) * 64 + cTrailingZeros >= cReqUnits)
764 return off * 64 - cPrevLeadingZeros;
765#ifdef __GNUC__
766 cPrevLeadingZeros = __builtin_clzl(uWord);
767#else
768 cPrevLeadingZeros = 64 - ASMBitLastSetU64(uWord);
769#endif
770 break;
771 }
772 }
773 else
774 return UINT32_MAX;
775 }
776 }
777 }
778 }
779 return UINT32_MAX;
780}
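/* Word-level illustration of the scan above (bit pattern chosen for the
   example): for uWord = 0x00000000FFFF0000, where set bits mean allocated
   units, the scanner sees 16 trailing zero bits (free units that may join the
   previous word's leading zeros), popcount(~uWord) = 48 free units in total,
   and 32 leading zero bits that carry over as cPrevLeadingZeros into the next
   word. */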
781
782
783/**
784 * Tries to allocate a block of @a cReqUnits in the chunk @a idxChunk.
785 */
786static void *
787iemExecMemAllocatorAllocInChunkInt(PIEMEXECMEMALLOCATOR pExecMemAllocator, uint64_t *pbmAlloc, uint32_t idxFirst,
788 uint32_t cToScan, uint32_t cReqUnits, uint32_t idxChunk, PIEMTB pTb,
789 void **ppvExec, PCIEMNATIVEPERCHUNKCTX *ppChunkCtx)
790{
791 /*
792 * Shift the bitmap to the idxFirst bit so we can use ASMBitFirstClear.
793 */
794 Assert(!(cToScan & 63));
795 Assert(!(idxFirst & 63));
796 Assert(cToScan + idxFirst <= pExecMemAllocator->cUnitsPerChunk);
797 pbmAlloc += idxFirst / 64;
798 cToScan += idxFirst & 63;
799 Assert(!(cToScan & 63));
800
801#if 1
802 uint32_t const iBit = cReqUnits < 64
803 ? iemExecMemAllocatorFindReqFreeUnits<false>(pbmAlloc, cToScan / 64, cReqUnits)
804 : iemExecMemAllocatorFindReqFreeUnits<true>( pbmAlloc, cToScan / 64, cReqUnits);
805 Assert(iBit == iemExecMemAllocatorFindReqFreeUnitsOld(pbmAlloc, cToScan, cReqUnits));
806#else
807 uint32_t const iBit = iemExecMemAllocatorFindReqFreeUnitsOld(pbmAlloc, cToScan, cReqUnits);
808#endif
809 if (iBit != UINT32_MAX)
810 {
811 ASMBitSetRange(pbmAlloc, (uint32_t)iBit, (uint32_t)iBit + cReqUnits);
812
813 PIEMEXECMEMCHUNK const pChunk = &pExecMemAllocator->aChunks[idxChunk];
814 pChunk->cFreeUnits -= cReqUnits;
815 pChunk->idxFreeHint = (uint32_t)iBit + cReqUnits;
816
817 pExecMemAllocator->cAllocations += 1;
818 uint32_t const cbReq = cReqUnits << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT;
819 pExecMemAllocator->cbAllocated += cbReq;
820 pExecMemAllocator->cbFree -= cbReq;
821 pExecMemAllocator->idxChunkHint = idxChunk;
822
823 void * const pvMemRw = (uint8_t *)pChunk->pvChunkRw
824 + ((idxFirst + (uint32_t)iBit) << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT);
825
826 if (ppChunkCtx)
827 *ppChunkCtx = pChunk->pCtx;
828
829 /*
830 * Initialize the header and return.
831 */
832# ifdef IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER
833 PIEMEXECMEMALLOCHDR const pHdr = (PIEMEXECMEMALLOCHDR)pvMemRw;
834 pHdr->uMagic = IEMEXECMEMALLOCHDR_MAGIC;
835 pHdr->idxChunk = idxChunk;
836 pHdr->pTb = pTb;
837
838 if (ppvExec)
839 *ppvExec = (uint8_t *)pChunk->pvChunkRx
840 + ((idxFirst + (uint32_t)iBit) << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT)
841 + sizeof(*pHdr);
842
843 return pHdr + 1;
844#else
845 if (ppvExec)
846 *ppvExec = (uint8_t *)pChunk->pvChunkRx
847 + ((idxFirst + (uint32_t)iBit) << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT);
848
849 RT_NOREF(pTb);
850 return pvMemRw;
851#endif
852 }
853
854 return NULL;
855}
856
857
858static PIEMNATIVEINSTR
859iemExecMemAllocatorAllocInChunk(PIEMEXECMEMALLOCATOR pExecMemAllocator, uint32_t idxChunk, uint32_t cbReq, PIEMTB pTb,
860 PIEMNATIVEINSTR *ppaExec, PCIEMNATIVEPERCHUNKCTX *ppChunkCtx)
861{
862 /*
863 * Figure out how much to allocate.
864 */
865#ifdef IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER
866 uint32_t const cReqUnits = (cbReq + sizeof(IEMEXECMEMALLOCHDR) + IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE - 1)
867#else
868 uint32_t const cReqUnits = (cbReq + IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE - 1)
869#endif
870 >> IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT;
871 if (cReqUnits <= pExecMemAllocator->aChunks[idxChunk].cFreeUnits)
872 {
873 uint64_t * const pbmAlloc = &pExecMemAllocator->pbmAlloc[pExecMemAllocator->cBitmapElementsPerChunk * idxChunk];
874 uint32_t const idxHint = pExecMemAllocator->aChunks[idxChunk].idxFreeHint & ~(uint32_t)63;
875 if (idxHint + cReqUnits <= pExecMemAllocator->cUnitsPerChunk)
876 {
877 void *pvRet = iemExecMemAllocatorAllocInChunkInt(pExecMemAllocator, pbmAlloc, idxHint,
878 pExecMemAllocator->cUnitsPerChunk - idxHint,
879 cReqUnits, idxChunk, pTb, (void **)ppaExec, ppChunkCtx);
880 if (pvRet)
881 {
882#ifdef VBOX_WITH_STATISTICS
883 pExecMemAllocator->cbUnusable += (cReqUnits << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT) - cbReq;
884#endif
885 return (PIEMNATIVEINSTR)pvRet;
886 }
887 }
888 void *pvRet = iemExecMemAllocatorAllocInChunkInt(pExecMemAllocator, pbmAlloc, 0,
889 RT_MIN(pExecMemAllocator->cUnitsPerChunk,
890 RT_ALIGN_32(idxHint + cReqUnits, 64*4)),
891 cReqUnits, idxChunk, pTb, (void **)ppaExec, ppChunkCtx);
892 if (!pvRet)
893 pExecMemAllocator->cFruitlessChunkScans += 1;
894#ifdef VBOX_WITH_STATISTICS
895 else
896 pExecMemAllocator->cbUnusable += (cReqUnits << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT) - cbReq;
897#endif
898 return (PIEMNATIVEINSTR)pvRet;
899 }
900 return NULL;
901}
902
903
904/**
905 * Allocates @a cbReq bytes of executable memory.
906 *
907 * @returns Pointer to the readable/writeable memory, NULL if out of memory
908 * or some other problem was encountered.
909 * @param pVCpu The cross context virtual CPU structure of the
910 * calling thread.
911 * @param cbReq How many bytes are required.
912 * @param pTb The translation block that will be using the allocation.
913 * @param ppaExec Where to return the pointer to executable view of
914 * the allocated memory, optional.
915 * @param ppChunkCtx Where to return the per chunk attached context
916 * if available, optional.
917 */
918DECLHIDDEN(PIEMNATIVEINSTR) iemExecMemAllocatorAlloc(PVMCPU pVCpu, uint32_t cbReq, PIEMTB pTb,
919 PIEMNATIVEINSTR *ppaExec, PCIEMNATIVEPERCHUNKCTX *ppChunkCtx) RT_NOEXCEPT
920{
921 PIEMEXECMEMALLOCATOR pExecMemAllocator = pVCpu->iem.s.pExecMemAllocatorR3;
922 AssertReturn(pExecMemAllocator && pExecMemAllocator->uMagic == IEMEXECMEMALLOCATOR_MAGIC, NULL);
923 AssertMsgReturn(cbReq > 32 && cbReq < _512K, ("%#x\n", cbReq), NULL);
924 STAM_PROFILE_START(&pExecMemAllocator->StatAlloc, a);
925
926 for (unsigned iIteration = 0;; iIteration++)
927 {
928 if (cbReq <= pExecMemAllocator->cbFree)
929 {
930 uint32_t const cChunks = pExecMemAllocator->cChunks;
931 uint32_t const idxChunkHint = pExecMemAllocator->idxChunkHint < cChunks ? pExecMemAllocator->idxChunkHint : 0;
932 for (uint32_t idxChunk = idxChunkHint; idxChunk < cChunks; idxChunk++)
933 {
934 PIEMNATIVEINSTR const pRet = iemExecMemAllocatorAllocInChunk(pExecMemAllocator, idxChunk, cbReq, pTb,
935 ppaExec, ppChunkCtx);
936 if (pRet)
937 {
938 STAM_PROFILE_STOP(&pExecMemAllocator->StatAlloc, a);
939 return pRet;
940 }
941 }
942 for (uint32_t idxChunk = 0; idxChunk < idxChunkHint; idxChunk++)
943 {
944 PIEMNATIVEINSTR const pRet = iemExecMemAllocatorAllocInChunk(pExecMemAllocator, idxChunk, cbReq, pTb,
945 ppaExec, ppChunkCtx);
946 if (pRet)
947 {
948 STAM_PROFILE_STOP(&pExecMemAllocator->StatAlloc, a);
949 return pRet;
950 }
951 }
952 }
953
954 /*
955 * Can we grow it with another chunk?
956 */
957 if (pExecMemAllocator->cChunks < pExecMemAllocator->cMaxChunks)
958 {
959 int rc = iemExecMemAllocatorGrow(pVCpu, pExecMemAllocator);
960 AssertLogRelRCReturn(rc, NULL);
961
962 uint32_t const idxChunk = pExecMemAllocator->cChunks - 1;
963 PIEMNATIVEINSTR const pRet = iemExecMemAllocatorAllocInChunk(pExecMemAllocator, idxChunk, cbReq, pTb,
964 ppaExec, ppChunkCtx);
965 if (pRet)
966 {
967 STAM_PROFILE_STOP(&pExecMemAllocator->StatAlloc, a);
968 return pRet;
969 }
970 AssertFailed();
971 }
972
973 /*
974 * Try prune native TBs once.
975 */
976 if (iIteration == 0)
977 {
978#ifdef IEMEXECMEM_ALT_SUB_WITH_ALT_PRUNING
979 iemExecMemAllocatorPrune(pVCpu, pExecMemAllocator);
980#else
981 /* No header included in the instruction count here. */
982 uint32_t const cNeededInstrs = RT_ALIGN_32(cbReq, IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE) / sizeof(IEMNATIVEINSTR);
983 iemTbAllocatorFreeupNativeSpace(pVCpu, cNeededInstrs);
984#endif
985 }
986 else
987 {
988 STAM_REL_COUNTER_INC(&pVCpu->iem.s.StatNativeExecMemInstrBufAllocFailed);
989 STAM_PROFILE_STOP(&pExecMemAllocator->StatAlloc, a);
990 return NULL;
991 }
992 }
993}
994
995
996/** This is a hook to ensure the instruction cache is properly flushed before the code in the memory
997 * given by @a pv and @a cb is executed. */
998DECLHIDDEN(void) iemExecMemAllocatorReadyForUse(PVMCPUCC pVCpu, void *pv, size_t cb) RT_NOEXCEPT
999{
1000#ifdef RT_OS_DARWIN
1001 /*
1002 * We need to synchronize the stuff we wrote to the data cache with the
1003 * instruction cache, since these aren't coherent on arm (or at least not
1004 * on Apple Mn CPUs).
1005 *
1006 * Note! Since we don't share any JIT'ed code with the other CPUs, we don't
1007 * really care whether the dcache is fully flushed back to memory. It
1008 * only needs to hit the level 2 cache, which the level 1 instruction
1009 * and data caches seem to be sharing. In ARM terms, we need to reach
1010 * a point of unification (PoU), rather than a point of coherency (PoC).
1011 *
1012 * https://developer.apple.com/documentation/apple-silicon/porting-just-in-time-compilers-to-apple-silicon
1013 *
1014 * https://developer.arm.com/documentation/den0013/d/Caches/Point-of-coherency-and-unification
1015 *
1016 * Experimenting with the approach used by sys_icache_invalidate() and
1017 * tweaking it a little could let us shave off a bit of effort. The thing
1018 * that slows the apple code down on an M2 (running Sonoma 13.4) seems to be
1019 * the 'DSB ISH' instructions performed every 20 icache line flushes.
1020 * Skipping these saves ~100ns or more per TB when profiling the native
1021 * recompiler on the TBs from a win11 full boot-desktop-shutdown sequence.
1022 * Thus we will leave DCACHE_ICACHE_SYNC_WITH_WITH_IVAU_DSB undefined if we
1023 * can.
1024 *
1025 * There appears not to be much difference between DSB options 'ISH',
1026 * 'ISHST', 'NSH' and 'NSHST'. The latter is theoretically all we need, so
1027 * we'll use that one.
1028 *
1029 * See https://developer.arm.com/documentation/100941/0101/Barriers for
1030 * details on the barrier options.
1031 *
1032 * Note! The CFG value "/IEM/HostICacheInvalidationViaHostAPI" can be used
1033 * to disable the experimental code should it misbehave.
1034 */
1035 uint8_t const fHostICacheInvalidation = pVCpu->iem.s.fHostICacheInvalidation;
1036 if (!(fHostICacheInvalidation & IEMNATIVE_ICACHE_F_USE_HOST_API))
1037 {
1038# define DCACHE_ICACHE_SYNC_DSB_OPTION "nshst"
1039/*# define DCACHE_ICACHE_SYNC_WITH_WITH_IVAU_DSB*/
1040
1041 /* Skipping this is fine, but doesn't impact perf much. */
1042 __asm__ __volatile__("dsb " DCACHE_ICACHE_SYNC_DSB_OPTION);
1043
1044 /* Invalidate the icache for the range [pv,pv+cb). */
1045# ifdef DCACHE_ICACHE_SYNC_WITH_WITH_IVAU_DSB
1046 size_t const cIvauDsbEvery = 20;
1047 unsigned cDsb = cIvauDsbEvery;
1048# endif
1049 size_t const cbCacheLine = 64;
1050 size_t cbInvalidate = cb + ((uintptr_t)pv & (cbCacheLine - 1));
1051 size_t cCacheLines = RT_ALIGN_Z(cbInvalidate, cbCacheLine) / cbCacheLine;
1052 uintptr_t uPtr = (uintptr_t)pv & ~(uintptr_t)(cbCacheLine - 1);
1053 for (;; uPtr += cbCacheLine)
1054 {
1055 __asm__ /*__volatile__*/("ic ivau, %0" : : "r" (uPtr));
1056 cCacheLines -= 1;
1057 if (!cCacheLines)
1058 break;
1059# ifdef DCACHE_ICACHE_SYNC_WITH_WITH_IVAU_DSB
1060 cDsb -= 1;
1061 if (cDsb != 0)
1062 { /* likely */ }
1063 else
1064 {
1065 __asm__ __volatile__("dsb " DCACHE_ICACHE_SYNC_DSB_OPTION);
1066 cDsb = cIvauDsbEvery;
1067 }
1068# endif
1069 }
1070
1071 /*
1072 * The DSB here is non-optional, it seems.
1073 *
1074 * The following ISB can be omitted on M2 without any obvious side effects,
1075 * and it produces better numbers in the above mentioned profiling scenario.
1076 * This could be related to the kHasICDSB flag in cpu_capabilities.h,
1077 * but it doesn't look like that flag is set here (M2, Sonoma 13.4).
1078 *
1079 * I've made the inclusion of the ISH barrier configurable, with a default
1080 * of skipping it.
1081 */
1082 if (!(fHostICacheInvalidation & IEMNATIVE_ICACHE_F_END_WITH_ISH))
1083 __asm__ __volatile__("dsb " DCACHE_ICACHE_SYNC_DSB_OPTION
1084 ::: "memory");
1085 else
1086 __asm__ __volatile__("dsb " DCACHE_ICACHE_SYNC_DSB_OPTION "\n\t"
1087 "isb"
1088 ::: "memory");
1089 }
1090 else
1091 sys_icache_invalidate(pv, cb);
1092
1093#elif defined(RT_OS_LINUX) && defined(RT_ARCH_ARM64)
1094 RT_NOREF(pVCpu);
1095
1096 /* There is __builtin___clear_cache() but it flushes both the instruction and data cache, so do it manually. */
1097 static uint32_t s_u32CtrEl0 = 0;
1098 if (!s_u32CtrEl0)
1099 asm volatile ("mrs %0, ctr_el0":"=r" (s_u32CtrEl0));
1100 uintptr_t cbICacheLine = (uintptr_t)4 << (s_u32CtrEl0 & 0xf);
1101
1102 uintptr_t pb = (uintptr_t)pv & ~(cbICacheLine - 1);
1103 for (; pb < (uintptr_t)pv + cb; pb += cbICacheLine)
1104 asm volatile ("ic ivau, %0" : : "r" (pb) : "memory");
1105
1106 asm volatile ("dsb ish\n\t isb\n\t" : : : "memory");
1107
1108#else
1109 RT_NOREF(pVCpu, pv, cb);
1110#endif
1111}
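/* Typical usage sketch (not a verbatim caller from this file): the recompiler
   allocates a block with iemExecMemAllocatorAlloc(), emits the native
   instructions through the returned read/write view, and then calls
   iemExecMemAllocatorReadyForUse() on the read/execute view and size before
   the translation block is executed for the first time. */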
1112
1113
1114/**
1115 * Frees executable memory.
1116 */
1117DECLHIDDEN(void) iemExecMemAllocatorFree(PVMCPU pVCpu, void *pv, size_t cb) RT_NOEXCEPT
1118{
1119 PIEMEXECMEMALLOCATOR pExecMemAllocator = pVCpu->iem.s.pExecMemAllocatorR3;
1120 Assert(pExecMemAllocator && pExecMemAllocator->uMagic == IEMEXECMEMALLOCATOR_MAGIC);
1121 AssertPtr(pv);
1122#ifdef VBOX_WITH_STATISTICS
1123 size_t const cbOrig = cb;
1124#endif
1125#ifndef IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER
1126 Assert(!((uintptr_t)pv & (IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE - 1)));
1127
1128 /* Align the size as we did when allocating the block. */
1129 cb = RT_ALIGN_Z(cb, IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE);
1130
1131#else
1132 PIEMEXECMEMALLOCHDR pHdr = (PIEMEXECMEMALLOCHDR)pv - 1;
1133 Assert(!((uintptr_t)pHdr & (IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE - 1)));
1134 AssertReturnVoid(pHdr->uMagic == IEMEXECMEMALLOCHDR_MAGIC);
1135 uint32_t const idxChunk = pHdr->idxChunk;
1136 AssertReturnVoid(idxChunk < pExecMemAllocator->cChunks);
1137 pv = pHdr;
1138
1139 /* Adjust and align the size to cover the whole allocation area. */
1140 cb = RT_ALIGN_Z(cb + sizeof(*pHdr), IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE);
1141#endif
1142
1143 /* Free it / assert sanity. */
1144 bool fFound = false;
1145 uint32_t const cbChunk = pExecMemAllocator->cbChunk;
1146#ifndef IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER
1147 uint32_t const cChunks = pExecMemAllocator->cChunks;
1148 for (uint32_t idxChunk = 0; idxChunk < cChunks; idxChunk++)
1149#endif
1150 {
1151 uintptr_t const offChunk = (uintptr_t)pv - (uintptr_t)pExecMemAllocator->aChunks[idxChunk].pvChunkRx;
1152 fFound = offChunk < cbChunk;
1153 if (fFound)
1154 {
1155 uint32_t const idxFirst = (uint32_t)offChunk >> IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT;
1156 uint32_t const cReqUnits = (uint32_t)cb >> IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT;
1157
1158 /* Check that it's valid and free it. */
1159 uint64_t * const pbmAlloc = &pExecMemAllocator->pbmAlloc[pExecMemAllocator->cBitmapElementsPerChunk * idxChunk];
1160 AssertReturnVoid(ASMBitTest(pbmAlloc, idxFirst));
1161 for (uint32_t i = 1; i < cReqUnits; i++)
1162 AssertReturnVoid(ASMBitTest(pbmAlloc, idxFirst + i));
1163 ASMBitClearRange(pbmAlloc, idxFirst, idxFirst + cReqUnits);
1164
1165 /* Invalidate the header using the writeable memory view. */
1166 pHdr = (PIEMEXECMEMALLOCHDR)((uintptr_t)pExecMemAllocator->aChunks[idxChunk].pvChunkRw + offChunk);
1167#ifdef IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER
1168 pHdr->uMagic = 0;
1169 pHdr->idxChunk = 0;
1170 pHdr->pTb = NULL;
1171#endif
1172 pExecMemAllocator->aChunks[idxChunk].cFreeUnits += cReqUnits;
1173 pExecMemAllocator->aChunks[idxChunk].idxFreeHint = idxFirst;
1174
1175 /* Update the stats. */
1176 pExecMemAllocator->cbAllocated -= cb;
1177 pExecMemAllocator->cbFree += cb;
1178 pExecMemAllocator->cAllocations -= 1;
1179#ifdef VBOX_WITH_STATISTICS
1180 pExecMemAllocator->cbUnusable -= (cReqUnits << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT) - cbOrig;
1181#endif
1182 return;
1183 }
1184 }
1185 AssertFailed();
1186}
1187
1188
1189/**
1190 * Interface used by iemNativeRecompileAttachExecMemChunkCtx and unwind info
1191 * generators.
1192 */
1193DECLHIDDEN(PIEMNATIVEINSTR)
1194iemExecMemAllocatorAllocFromChunk(PVMCPU pVCpu, uint32_t idxChunk, uint32_t cbReq, PIEMNATIVEINSTR *ppaExec)
1195{
1196 PIEMEXECMEMALLOCATOR pExecMemAllocator = pVCpu->iem.s.pExecMemAllocatorR3;
1197 AssertReturn(idxChunk < pExecMemAllocator->cChunks, NULL);
1198 Assert(cbReq < _1M);
1199 return iemExecMemAllocatorAllocInChunk(pExecMemAllocator, idxChunk, cbReq, NULL /*pTb*/, ppaExec, NULL /*ppChunkCtx*/);
1200}
1201
1202
1203/**
1204 * For getting the per-chunk context detailing common code for a TB.
1205 *
1206 * This is for use by the disassembler.
1207 */
1208DECLHIDDEN(PCIEMNATIVEPERCHUNKCTX) iemExecMemGetTbChunkCtx(PVMCPU pVCpu, PCIEMTB pTb)
1209{
1210 PIEMEXECMEMALLOCATOR pExecMemAllocator = pVCpu->iem.s.pExecMemAllocatorR3;
1211 if ((pTb->fFlags & IEMTB_F_TYPE_MASK) == IEMTB_F_TYPE_NATIVE)
1212 {
1213 uintptr_t const uAddress = (uintptr_t)pTb->Native.paInstructions;
1214 uint32_t const cbChunk = pExecMemAllocator->cbChunk;
1215 uint32_t idxChunk = pExecMemAllocator->cChunks;
1216 while (idxChunk-- > 0)
1217 if (uAddress - (uintptr_t)pExecMemAllocator->aChunks[idxChunk].pvChunkRx < cbChunk)
1218 return pExecMemAllocator->aChunks[idxChunk].pCtx;
1219 }
1220 return NULL;
1221}
1222
1223
1224#ifdef IN_RING3
1225# ifdef RT_OS_WINDOWS
1226
1227/**
1228 * Initializes the unwind info structures for windows hosts.
1229 */
1230static int
1231iemExecMemAllocatorInitAndRegisterUnwindInfoForChunk(PVMCPUCC pVCpu, PIEMEXECMEMALLOCATOR pExecMemAllocator,
1232 void *pvChunk, uint32_t idxChunk)
1233{
1234 RT_NOREF(pVCpu);
1235
1236 /*
1237 * The AMD64 unwind opcodes.
1238 *
1239 * This is a program that starts with RSP after a RET instruction that
1240 * ends up in recompiled code, and the operations we describe here will
1241 * restore all non-volatile registers and bring RSP back to where our
1242 * RET address is. This means it's reverse order from what happens in
1243 * the prologue.
1244 *
1245 * Note! Using a frame register approach here both because we have one
1246 * and mainly because the UWOP_ALLOC_LARGE argument values
1247 * would be a pain to write initializers for. On the positive
1248 * side, we're impervious to changes in the stack variable
1249 * area and can deal with dynamic stack allocations if necessary.
1250 */
1251 static const IMAGE_UNWIND_CODE s_aOpcodes[] =
1252 {
1253 { { 16, IMAGE_AMD64_UWOP_SET_FPREG, 0 } }, /* RSP = RBP - FrameOffset * 10 (0x60) */
1254 { { 16, IMAGE_AMD64_UWOP_ALLOC_SMALL, 0 } }, /* RSP += 8; */
1255 { { 14, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_x15 } }, /* R15 = [RSP]; RSP += 8; */
1256 { { 12, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_x14 } }, /* R14 = [RSP]; RSP += 8; */
1257 { { 10, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_x13 } }, /* R13 = [RSP]; RSP += 8; */
1258 { { 8, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_x12 } }, /* R12 = [RSP]; RSP += 8; */
1259 { { 7, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_xDI } }, /* RDI = [RSP]; RSP += 8; */
1260 { { 6, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_xSI } }, /* RSI = [RSP]; RSP += 8; */
1261 { { 5, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_xBX } }, /* RBX = [RSP]; RSP += 8; */
1262 { { 4, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_xBP } }, /* RBP = [RSP]; RSP += 8; */
1263 };
1264 union
1265 {
1266 IMAGE_UNWIND_INFO Info;
1267 uint8_t abPadding[RT_UOFFSETOF(IMAGE_UNWIND_INFO, aOpcodes) + 16];
1268 } s_UnwindInfo =
1269 {
1270 {
1271 /* .Version = */ 1,
1272 /* .Flags = */ 0,
1273 /* .SizeOfProlog = */ 16, /* whatever */
1274 /* .CountOfCodes = */ RT_ELEMENTS(s_aOpcodes),
1275 /* .FrameRegister = */ X86_GREG_xBP,
1276 /* .FrameOffset = */ (-IEMNATIVE_FP_OFF_LAST_PUSH + 8) / 16 /* we're off by one slot. sigh. */,
1277 }
1278 };
1279 AssertCompile(-IEMNATIVE_FP_OFF_LAST_PUSH < 240 && -IEMNATIVE_FP_OFF_LAST_PUSH > 0);
1280 AssertCompile((-IEMNATIVE_FP_OFF_LAST_PUSH & 0xf) == 8);
1281
1282 /*
1283 * Calc how much space we need and allocate it off the exec heap.
1284 */
1285 unsigned const cFunctionEntries = 1;
1286 unsigned const cbUnwindInfo = sizeof(s_aOpcodes) + RT_UOFFSETOF(IMAGE_UNWIND_INFO, aOpcodes);
1287 unsigned const cbNeeded = sizeof(IMAGE_RUNTIME_FUNCTION_ENTRY) * cFunctionEntries + cbUnwindInfo;
1288 PIMAGE_RUNTIME_FUNCTION_ENTRY const paFunctions
1289 = (PIMAGE_RUNTIME_FUNCTION_ENTRY)iemExecMemAllocatorAllocInChunk(pExecMemAllocator, idxChunk, cbNeeded, NULL, NULL, NULL);
1290 AssertReturn(paFunctions, VERR_INTERNAL_ERROR_5);
1291 pExecMemAllocator->aChunks[idxChunk].pvUnwindInfo = paFunctions;
1292
1293 /*
1294 * Initialize the structures.
1295 */
1296 PIMAGE_UNWIND_INFO const pInfo = (PIMAGE_UNWIND_INFO)&paFunctions[cFunctionEntries];
1297
1298 paFunctions[0].BeginAddress = 0;
1299 paFunctions[0].EndAddress = pExecMemAllocator->cbChunk;
1300 paFunctions[0].UnwindInfoAddress = (uint32_t)((uintptr_t)pInfo - (uintptr_t)pvChunk);
1301
1302 memcpy(pInfo, &s_UnwindInfo, RT_UOFFSETOF(IMAGE_UNWIND_INFO, aOpcodes));
1303 memcpy(&pInfo->aOpcodes[0], s_aOpcodes, sizeof(s_aOpcodes));
1304
1305 /*
1306 * Register it.
1307 */
1308 uint8_t fRet = RtlAddFunctionTable(paFunctions, cFunctionEntries, (uintptr_t)pvChunk);
1309 AssertReturn(fRet, VERR_INTERNAL_ERROR_3); /* Nothing to clean up on failure, since it's within the chunk itself. */
1310
1311 return VINF_SUCCESS;
1312}
1313
1314
1315# else /* !RT_OS_WINDOWS */
1316
1317/**
1318 * Emits a LEB128 encoded value between -0x2000 and 0x2000 (both exclusive).
1319 */
1320DECLINLINE(RTPTRUNION) iemDwarfPutLeb128(RTPTRUNION Ptr, int32_t iValue)
1321{
1322 if (iValue >= 64)
1323 {
1324 Assert(iValue < 0x2000);
1325 *Ptr.pb++ = ((uint8_t)iValue & 0x7f) | 0x80;
1326 *Ptr.pb++ = (uint8_t)(iValue >> 7) & 0x3f;
1327 }
1328 else if (iValue >= 0)
1329 *Ptr.pb++ = (uint8_t)iValue;
1330 else if (iValue > -64)
1331 *Ptr.pb++ = ((uint8_t)iValue & 0x3f) | 0x40;
1332 else
1333 {
1334 Assert(iValue > -0x2000);
1335 *Ptr.pb++ = ((uint8_t)iValue & 0x7f) | 0x80;
1336 *Ptr.pb++ = ((uint8_t)(iValue >> 7) & 0x3f) | 0x40;
1337 }
1338 return Ptr;
1339}
1340
1341
1342/**
1343 * Emits an ULEB128 encoded value (up to 64-bit wide).
1344 */
1345DECLINLINE(RTPTRUNION) iemDwarfPutUleb128(RTPTRUNION Ptr, uint64_t uValue)
1346{
1347 while (uValue >= 0x80)
1348 {
1349 *Ptr.pb++ = ((uint8_t)uValue & 0x7f) | 0x80;
1350 uValue >>= 7;
1351 }
1352 *Ptr.pb++ = (uint8_t)uValue;
1353 return Ptr;
1354}
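/* Encoding examples for the two helpers above (values chosen purely for
   illustration):
     iemDwarfPutUleb128(Ptr, 300)  emits 0xAC 0x02  (low 7 bits 0x2C + continuation, then 2),
     iemDwarfPutLeb128(Ptr, 200)   emits 0xC8 0x01  (two bytes since 200 >= 64),
     iemDwarfPutLeb128(Ptr, -8)    emits 0x78       (sign bit 0x40 set, no continuation). */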
1355
1356
1357/**
1358 * Emits a CFA rule as register @a uReg + offset @a off.
1359 */
1360DECLINLINE(RTPTRUNION) iemDwarfPutCfaDefCfa(RTPTRUNION Ptr, uint32_t uReg, uint32_t off)
1361{
1362 *Ptr.pb++ = DW_CFA_def_cfa;
1363 Ptr = iemDwarfPutUleb128(Ptr, uReg);
1364 Ptr = iemDwarfPutUleb128(Ptr, off);
1365 return Ptr;
1366}
1367
1368
1369/**
1370 * Emits a register (@a uReg) save location:
1371 * CFA + @a off * data_alignment_factor
1372 */
1373DECLINLINE(RTPTRUNION) iemDwarfPutCfaOffset(RTPTRUNION Ptr, uint32_t uReg, uint32_t off)
1374{
1375 if (uReg < 0x40)
1376 *Ptr.pb++ = DW_CFA_offset | uReg;
1377 else
1378 {
1379 *Ptr.pb++ = DW_CFA_offset_extended;
1380 Ptr = iemDwarfPutUleb128(Ptr, uReg);
1381 }
1382 Ptr = iemDwarfPutUleb128(Ptr, off);
1383 return Ptr;
1384}
1385
1386
1387# if 0 /* unused */
1388/**
1389 * Emits a register (@a uReg) save location, using signed offset:
1390 * CFA + @a offSigned * data_alignment_factor
1391 */
1392DECLINLINE(RTPTRUNION) iemDwarfPutCfaSignedOffset(RTPTRUNION Ptr, uint32_t uReg, int32_t offSigned)
1393{
1394 *Ptr.pb++ = DW_CFA_offset_extended_sf;
1395 Ptr = iemDwarfPutUleb128(Ptr, uReg);
1396 Ptr = iemDwarfPutLeb128(Ptr, offSigned);
1397 return Ptr;
1398}
1399# endif
1400
1401
1402/**
1403 * Initializes the unwind info section for non-windows hosts.
1404 */
1405static int
1406iemExecMemAllocatorInitAndRegisterUnwindInfoForChunk(PVMCPUCC pVCpu, PIEMEXECMEMALLOCATOR pExecMemAllocator,
1407 void *pvChunk, uint32_t idxChunk)
1408{
1409 PIEMEXECMEMCHUNKEHFRAME const pEhFrame = &pExecMemAllocator->paEhFrames[idxChunk];
1410 pExecMemAllocator->aChunks[idxChunk].pvUnwindInfo = pEhFrame; /* not necessary, but whatever */
1411
1412 RTPTRUNION Ptr = { pEhFrame->abEhFrame };
1413
1414 /*
1415 * Generate the CIE first.
1416 */
1417# ifdef IEMNATIVE_USE_LIBUNWIND /* libunwind (llvm, darwin) only supports v1 and v3. */
1418 uint8_t const iDwarfVer = 3;
1419# else
1420 uint8_t const iDwarfVer = 4;
1421# endif
1422 RTPTRUNION const PtrCie = Ptr;
1423 *Ptr.pu32++ = 123; /* The CIE length will be determined later. */
1424 *Ptr.pu32++ = 0 /*UINT32_MAX*/; /* I'm a CIE in .eh_frame speak. */
1425 *Ptr.pb++ = iDwarfVer; /* DWARF version */
1426 *Ptr.pb++ = 0; /* Augmentation. */
1427 if (iDwarfVer >= 4)
1428 {
1429 *Ptr.pb++ = sizeof(uintptr_t); /* Address size. */
1430 *Ptr.pb++ = 0; /* Segment selector size. */
1431 }
1432# ifdef RT_ARCH_AMD64
1433 Ptr = iemDwarfPutLeb128(Ptr, 1); /* Code alignment factor (LEB128 = 1). */
1434# else
1435 Ptr = iemDwarfPutLeb128(Ptr, 4); /* Code alignment factor (LEB128 = 4). */
1436# endif
1437 Ptr = iemDwarfPutLeb128(Ptr, -8); /* Data alignment factor (LEB128 = -8). */
1438# ifdef RT_ARCH_AMD64
1439 Ptr = iemDwarfPutUleb128(Ptr, DWREG_AMD64_RA); /* Return address column (ULEB128) */
1440# elif defined(RT_ARCH_ARM64)
1441 Ptr = iemDwarfPutUleb128(Ptr, DWREG_ARM64_LR); /* Return address column (ULEB128) */
1442# else
1443# error "port me"
1444# endif
1445 /* Initial instructions: */
1446# ifdef RT_ARCH_AMD64
1447 Ptr = iemDwarfPutCfaDefCfa(Ptr, DWREG_AMD64_RBP, 16); /* CFA = RBP + 0x10 - first stack parameter */
1448 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_RA, 1); /* Ret RIP = [CFA + 1*-8] */
1449 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_RBP, 2); /* RBP = [CFA + 2*-8] */
1450 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_RBX, 3); /* RBX = [CFA + 3*-8] */
1451 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_R12, 4); /* R12 = [CFA + 4*-8] */
1452 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_R13, 5); /* R13 = [CFA + 5*-8] */
1453 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_R14, 6); /* R14 = [CFA + 6*-8] */
1454 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_R15, 7); /* R15 = [CFA + 7*-8] */
1455# elif defined(RT_ARCH_ARM64)
1456# if 1
1457 Ptr = iemDwarfPutCfaDefCfa(Ptr, DWREG_ARM64_BP, 16); /* CFA = BP + 0x10 - first stack parameter */
1458# else
1459 Ptr = iemDwarfPutCfaDefCfa(Ptr, DWREG_ARM64_SP, IEMNATIVE_FRAME_VAR_SIZE + IEMNATIVE_FRAME_SAVE_REG_SIZE);
1460# endif
1461 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_LR, 1); /* Ret PC = [CFA + 1*-8] */
1462 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_BP, 2); /* Ret BP = [CFA + 2*-8] */
1463 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X28, 3); /* X28 = [CFA + 3*-8] */
1464 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X27, 4); /* X27 = [CFA + 4*-8] */
1465 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X26, 5); /* X26 = [CFA + 5*-8] */
1466 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X25, 6); /* X25 = [CFA + 6*-8] */
1467 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X24, 7); /* X24 = [CFA + 7*-8] */
1468 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X23, 8); /* X23 = [CFA + 8*-8] */
1469 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X22, 9); /* X22 = [CFA + 9*-8] */
1470 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X21, 10); /* X21 = [CFA +10*-8] */
1471 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X20, 11); /* X20 = [CFA +11*-8] */
1472 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X19, 12); /* X19 = [CFA +12*-8] */
1473 AssertCompile(IEMNATIVE_FRAME_SAVE_REG_SIZE / 8 == 12);
1474 /** @todo do we need to do something about clearing DWREG_ARM64_RA_SIGN_STATE or something? */
1475# else
1476# error "port me"
1477# endif
1478 while ((Ptr.u - PtrCie.u) & 3)
1479 *Ptr.pb++ = DW_CFA_nop;
1480 /* Finalize the CIE size. */
1481 *PtrCie.pu32 = Ptr.u - PtrCie.u - sizeof(uint32_t);
1482
1483 /*
1484 * Generate an FDE for the whole chunk area.
1485 */
1486# ifdef IEMNATIVE_USE_LIBUNWIND
1487 pEhFrame->offFda = Ptr.u - (uintptr_t)&pEhFrame->abEhFrame[0];
1488# endif
1489 RTPTRUNION const PtrFde = Ptr;
1490 *Ptr.pu32++ = 123; /* The FDE length will be determined later. */
1491 *Ptr.pu32 = Ptr.u - PtrCie.u; /* Negated self relative CIE address. */
1492 Ptr.pu32++;
1493 *Ptr.pu64++ = (uintptr_t)pvChunk; /* Absolute start PC of this FDE. */
1494 *Ptr.pu64++ = pExecMemAllocator->cbChunk; /* PC range length for this FDE. */
1495# if 0 /* not required for recent libunwind.dylib nor recent libgcc/glib. */
1496 *Ptr.pb++ = DW_CFA_nop;
1497# endif
1498 while ((Ptr.u - PtrFde.u) & 3)
1499 *Ptr.pb++ = DW_CFA_nop;
1500 /* Finalize the FDE size. */
1501 *PtrFde.pu32 = Ptr.u - PtrFde.u - sizeof(uint32_t);
1502
1503 /* Terminator entry. */
1504 *Ptr.pu32++ = 0;
1505 *Ptr.pu32++ = 0; /* just to be sure... */
1506 Assert(Ptr.u - (uintptr_t)&pEhFrame->abEhFrame[0] <= sizeof(pEhFrame->abEhFrame));
1507
1508 /*
1509 * Register it.
1510 */
1511# ifdef IEMNATIVE_USE_LIBUNWIND
1512 __register_frame(&pEhFrame->abEhFrame[pEhFrame->offFda]);
1513# else
1514 memset(pEhFrame->abObject, 0xf6, sizeof(pEhFrame->abObject)); /* color the memory to better spot usage */
1515 __register_frame_info(pEhFrame->abEhFrame, pEhFrame->abObject);
1516# endif
1517
1518# ifdef IEMNATIVE_USE_GDB_JIT
1519 /*
1520 * Now for telling GDB about this (experimental).
1521 *
1522 * This seems to work best with ET_DYN.
1523 */
1524 GDBJITSYMFILE * const pSymFile = (GDBJITSYMFILE *)iemExecMemAllocatorAllocInChunk(pExecMemAllocator, idxChunk,
1525 sizeof(GDBJITSYMFILE), NULL, NULL, NULL);
1526 AssertReturn(pSymFile, VERR_INTERNAL_ERROR_5);
1527 unsigned const offSymFileInChunk = (uintptr_t)pSymFile - (uintptr_t)pvChunk;
1528
1529 RT_ZERO(*pSymFile);
1530
1531 /*
1532 * The ELF header:
1533 */
1534 pSymFile->EHdr.e_ident[0] = ELFMAG0;
1535 pSymFile->EHdr.e_ident[1] = ELFMAG1;
1536 pSymFile->EHdr.e_ident[2] = ELFMAG2;
1537 pSymFile->EHdr.e_ident[3] = ELFMAG3;
1538 pSymFile->EHdr.e_ident[EI_VERSION] = EV_CURRENT;
1539 pSymFile->EHdr.e_ident[EI_CLASS] = ELFCLASS64;
1540 pSymFile->EHdr.e_ident[EI_DATA] = ELFDATA2LSB;
1541 pSymFile->EHdr.e_ident[EI_OSABI] = ELFOSABI_NONE;
1542# ifdef IEMNATIVE_USE_GDB_JIT_ET_DYN
1543 pSymFile->EHdr.e_type = ET_DYN;
1544# else
1545 pSymFile->EHdr.e_type = ET_REL;
1546# endif
1547# ifdef RT_ARCH_AMD64
1548 pSymFile->EHdr.e_machine = EM_AMD64;
1549# elif defined(RT_ARCH_ARM64)
1550 pSymFile->EHdr.e_machine = EM_AARCH64;
1551# else
1552# error "port me"
1553# endif
1554 pSymFile->EHdr.e_version = 1; /*?*/
1555 pSymFile->EHdr.e_entry = 0;
1556# if defined(IEMNATIVE_USE_GDB_JIT_ET_DYN)
1557 pSymFile->EHdr.e_phoff = RT_UOFFSETOF(GDBJITSYMFILE, aPhdrs);
1558# else
1559 pSymFile->EHdr.e_phoff = 0;
1560# endif
1561 pSymFile->EHdr.e_shoff = sizeof(pSymFile->EHdr);
1562 pSymFile->EHdr.e_flags = 0;
1563 pSymFile->EHdr.e_ehsize = sizeof(pSymFile->EHdr);
1564# if defined(IEMNATIVE_USE_GDB_JIT_ET_DYN)
1565 pSymFile->EHdr.e_phentsize = sizeof(pSymFile->aPhdrs[0]);
1566 pSymFile->EHdr.e_phnum = RT_ELEMENTS(pSymFile->aPhdrs);
1567# else
1568 pSymFile->EHdr.e_phentsize = 0;
1569 pSymFile->EHdr.e_phnum = 0;
1570# endif
1571 pSymFile->EHdr.e_shentsize = sizeof(pSymFile->aShdrs[0]);
1572 pSymFile->EHdr.e_shnum = RT_ELEMENTS(pSymFile->aShdrs);
1573 pSymFile->EHdr.e_shstrndx = 0; /* set later */
1574
1575 uint32_t offStrTab = 0;
1576#define APPEND_STR(a_szStr) do { \
1577 memcpy(&pSymFile->szzStrTab[offStrTab], a_szStr, sizeof(a_szStr)); \
1578 offStrTab += sizeof(a_szStr); \
1579 Assert(offStrTab < sizeof(pSymFile->szzStrTab)); \
1580 } while (0)
1581#define APPEND_STR_FMT(a_szStr, ...) do { \
1582 offStrTab += RTStrPrintf(&pSymFile->szzStrTab[offStrTab], sizeof(pSymFile->szzStrTab) - offStrTab, a_szStr, __VA_ARGS__); \
1583 offStrTab++; \
1584 Assert(offStrTab < sizeof(pSymFile->szzStrTab)); \
1585 } while (0)
1586
1587 /*
1588 * Section headers.
1589 */
1590 /* Section header #0: NULL */
1591 unsigned i = 0;
1592 APPEND_STR("");
1593 RT_ZERO(pSymFile->aShdrs[i]);
1594 i++;
1595
1596 /* Section header: .eh_frame */
1597 pSymFile->aShdrs[i].sh_name = offStrTab;
1598 APPEND_STR(".eh_frame");
1599 pSymFile->aShdrs[i].sh_type = SHT_PROGBITS;
1600 pSymFile->aShdrs[i].sh_flags = SHF_ALLOC | SHF_EXECINSTR;
1601# if defined(IEMNATIVE_USE_GDB_JIT_ET_DYN) || defined(IEMNATIVE_USE_GDB_JIT_ELF_RVAS)
1602 pSymFile->aShdrs[i].sh_offset
1603 = pSymFile->aShdrs[i].sh_addr = RT_UOFFSETOF(GDBJITSYMFILE, abEhFrame);
1604# else
1605 pSymFile->aShdrs[i].sh_addr = (uintptr_t)&pSymFile->abEhFrame[0];
1606 pSymFile->aShdrs[i].sh_offset = 0;
1607# endif
1608
1609 pSymFile->aShdrs[i].sh_size = sizeof(pEhFrame->abEhFrame);
1610 pSymFile->aShdrs[i].sh_link = 0;
1611 pSymFile->aShdrs[i].sh_info = 0;
1612 pSymFile->aShdrs[i].sh_addralign = 1;
1613 pSymFile->aShdrs[i].sh_entsize = 0;
1614 memcpy(pSymFile->abEhFrame, pEhFrame->abEhFrame, sizeof(pEhFrame->abEhFrame));
1615 i++;
1616
1617 /* Section header: .shstrtab */
1618 unsigned const iShStrTab = i;
1619 pSymFile->EHdr.e_shstrndx = iShStrTab;
1620 pSymFile->aShdrs[i].sh_name = offStrTab;
1621 APPEND_STR(".shstrtab");
1622 pSymFile->aShdrs[i].sh_type = SHT_STRTAB;
1623 pSymFile->aShdrs[i].sh_flags = SHF_ALLOC;
1624# if defined(IEMNATIVE_USE_GDB_JIT_ET_DYN) || defined(IEMNATIVE_USE_GDB_JIT_ELF_RVAS)
1625 pSymFile->aShdrs[i].sh_offset
1626 = pSymFile->aShdrs[i].sh_addr = RT_UOFFSETOF(GDBJITSYMFILE, szzStrTab);
1627# else
1628 pSymFile->aShdrs[i].sh_addr = (uintptr_t)&pSymFile->szzStrTab[0];
1629 pSymFile->aShdrs[i].sh_offset = 0;
1630# endif
1631 pSymFile->aShdrs[i].sh_size = sizeof(pSymFile->szzStrTab);
1632 pSymFile->aShdrs[i].sh_link = 0;
1633 pSymFile->aShdrs[i].sh_info = 0;
1634 pSymFile->aShdrs[i].sh_addralign = 1;
1635 pSymFile->aShdrs[i].sh_entsize = 0;
1636 i++;
1637
1638    /* Section header: .symtab */
1639 pSymFile->aShdrs[i].sh_name = offStrTab;
1640 APPEND_STR(".symtab");
1641 pSymFile->aShdrs[i].sh_type = SHT_SYMTAB;
1642 pSymFile->aShdrs[i].sh_flags = SHF_ALLOC;
1643 pSymFile->aShdrs[i].sh_offset
1644 = pSymFile->aShdrs[i].sh_addr = RT_UOFFSETOF(GDBJITSYMFILE, aSymbols);
1645 pSymFile->aShdrs[i].sh_size = sizeof(pSymFile->aSymbols);
1646 pSymFile->aShdrs[i].sh_link = iShStrTab;
1647 pSymFile->aShdrs[i].sh_info = RT_ELEMENTS(pSymFile->aSymbols);
1648 pSymFile->aShdrs[i].sh_addralign = sizeof(pSymFile->aSymbols[0].st_value);
1649 pSymFile->aShdrs[i].sh_entsize = sizeof(pSymFile->aSymbols[0]);
1650 i++;
1651
1652# if defined(IEMNATIVE_USE_GDB_JIT_ET_DYN)
1653    /* Section header: .dynsym */
1654 pSymFile->aShdrs[i].sh_name = offStrTab;
1655 APPEND_STR(".dynsym");
1656 pSymFile->aShdrs[i].sh_type = SHT_DYNSYM;
1657 pSymFile->aShdrs[i].sh_flags = SHF_ALLOC;
1658 pSymFile->aShdrs[i].sh_offset
1659 = pSymFile->aShdrs[i].sh_addr = RT_UOFFSETOF(GDBJITSYMFILE, aDynSyms);
1660 pSymFile->aShdrs[i].sh_size = sizeof(pSymFile->aDynSyms);
1661 pSymFile->aShdrs[i].sh_link = iShStrTab;
1662 pSymFile->aShdrs[i].sh_info = RT_ELEMENTS(pSymFile->aDynSyms);
1663 pSymFile->aShdrs[i].sh_addralign = sizeof(pSymFile->aDynSyms[0].st_value);
1664 pSymFile->aShdrs[i].sh_entsize = sizeof(pSymFile->aDynSyms[0]);
1665 i++;
1666# endif
1667
1668# if defined(IEMNATIVE_USE_GDB_JIT_ET_DYN)
1669 /* Section header: .dynamic */
1670 pSymFile->aShdrs[i].sh_name = offStrTab;
1671 APPEND_STR(".dynamic");
1672 pSymFile->aShdrs[i].sh_type = SHT_DYNAMIC;
1673 pSymFile->aShdrs[i].sh_flags = SHF_ALLOC;
1674 pSymFile->aShdrs[i].sh_offset
1675 = pSymFile->aShdrs[i].sh_addr = RT_UOFFSETOF(GDBJITSYMFILE, aDyn);
1676 pSymFile->aShdrs[i].sh_size = sizeof(pSymFile->aDyn);
1677 pSymFile->aShdrs[i].sh_link = iShStrTab;
1678 pSymFile->aShdrs[i].sh_info = 0;
1679 pSymFile->aShdrs[i].sh_addralign = 1;
1680 pSymFile->aShdrs[i].sh_entsize = sizeof(pSymFile->aDyn[0]);
1681 i++;
1682# endif
1683
1684 /* Section header: .text */
1685 unsigned const iShText = i;
1686 pSymFile->aShdrs[i].sh_name = offStrTab;
1687 APPEND_STR(".text");
1688 pSymFile->aShdrs[i].sh_type = SHT_PROGBITS;
1689 pSymFile->aShdrs[i].sh_flags = SHF_ALLOC | SHF_EXECINSTR;
1690# if defined(IEMNATIVE_USE_GDB_JIT_ET_DYN) || defined(IEMNATIVE_USE_GDB_JIT_ELF_RVAS)
1691 pSymFile->aShdrs[i].sh_offset
1692 = pSymFile->aShdrs[i].sh_addr = sizeof(GDBJITSYMFILE);
1693# else
1694 pSymFile->aShdrs[i].sh_addr = (uintptr_t)(pSymFile + 1);
1695 pSymFile->aShdrs[i].sh_offset = 0;
1696# endif
1697 pSymFile->aShdrs[i].sh_size = pExecMemAllocator->cbChunk - offSymFileInChunk - sizeof(GDBJITSYMFILE);
1698 pSymFile->aShdrs[i].sh_link = 0;
1699 pSymFile->aShdrs[i].sh_info = 0;
1700 pSymFile->aShdrs[i].sh_addralign = 1;
1701 pSymFile->aShdrs[i].sh_entsize = 0;
1702 i++;
1703
1704 Assert(i == RT_ELEMENTS(pSymFile->aShdrs));
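    /* Resulting section header table: [0] SHN_UNDEF, [1] .eh_frame, [2] .shstrtab,
       [3] .symtab, then (ET_DYN builds only) .dynsym and .dynamic, and finally .text
       covering the remainder of the chunk behind this symbol file. */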
1705
1706# if defined(IEMNATIVE_USE_GDB_JIT_ET_DYN)
1707 /*
1708 * The program headers:
1709 */
1710 /* Everything in a single LOAD segment: */
1711 i = 0;
1712 pSymFile->aPhdrs[i].p_type = PT_LOAD;
1713 pSymFile->aPhdrs[i].p_flags = PF_X | PF_R;
1714 pSymFile->aPhdrs[i].p_offset
1715 = pSymFile->aPhdrs[i].p_vaddr
1716 = pSymFile->aPhdrs[i].p_paddr = 0;
1717 pSymFile->aPhdrs[i].p_filesz /* Size of segment in file. */
1718 = pSymFile->aPhdrs[i].p_memsz = pExecMemAllocator->cbChunk - offSymFileInChunk;
1719 pSymFile->aPhdrs[i].p_align = HOST_PAGE_SIZE;
1720 i++;
1721 /* The .dynamic segment. */
1722 pSymFile->aPhdrs[i].p_type = PT_DYNAMIC;
1723 pSymFile->aPhdrs[i].p_flags = PF_R;
1724 pSymFile->aPhdrs[i].p_offset
1725 = pSymFile->aPhdrs[i].p_vaddr
1726 = pSymFile->aPhdrs[i].p_paddr = RT_UOFFSETOF(GDBJITSYMFILE, aDyn);
1727 pSymFile->aPhdrs[i].p_filesz /* Size of segment in file. */
1728 = pSymFile->aPhdrs[i].p_memsz = sizeof(pSymFile->aDyn);
1729 pSymFile->aPhdrs[i].p_align = sizeof(pSymFile->aDyn[0].d_tag);
1730 i++;
1731
1732 Assert(i == RT_ELEMENTS(pSymFile->aPhdrs));
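    /* So in the ET_DYN case there are exactly two program headers: a single PT_LOAD
       covering the symbol file plus the executable remainder of the chunk, and a
       PT_DYNAMIC entry pointing the debugger at the aDyn table below. */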
1733
1734 /*
1735 * The dynamic section:
1736 */
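    /* Only the bare minimum needed for GDB to treat the image as a tiny shared object:
       a unique SONAME per chunk, the location and size of the string table, and the
       dynamic symbol table with its entry size, terminated by DT_NULL. */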
1737 i = 0;
1738 pSymFile->aDyn[i].d_tag = DT_SONAME;
1739 pSymFile->aDyn[i].d_un.d_val = offStrTab;
1740 APPEND_STR_FMT("iem-exec-chunk-%u-%u", pVCpu->idCpu, idxChunk);
1741 i++;
1742 pSymFile->aDyn[i].d_tag = DT_STRTAB;
1743 pSymFile->aDyn[i].d_un.d_ptr = RT_UOFFSETOF(GDBJITSYMFILE, szzStrTab);
1744 i++;
1745 pSymFile->aDyn[i].d_tag = DT_STRSZ;
1746 pSymFile->aDyn[i].d_un.d_val = sizeof(pSymFile->szzStrTab);
1747 i++;
1748 pSymFile->aDyn[i].d_tag = DT_SYMTAB;
1749 pSymFile->aDyn[i].d_un.d_ptr = RT_UOFFSETOF(GDBJITSYMFILE, aDynSyms);
1750 i++;
1751 pSymFile->aDyn[i].d_tag = DT_SYMENT;
1752 pSymFile->aDyn[i].d_un.d_val = sizeof(pSymFile->aDynSyms[0]);
1753 i++;
1754 pSymFile->aDyn[i].d_tag = DT_NULL;
1755 i++;
1756 Assert(i == RT_ELEMENTS(pSymFile->aDyn));
1757# endif /* IEMNATIVE_USE_GDB_JIT_ET_DYN */
1758
1759 /*
1760 * Symbol tables:
1761 */
1762 /** @todo gdb doesn't seem to really like this ... */
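    /* Three entries: the mandatory null symbol at index 0, an STT_FILE marker, and a
       single global STT_FUNC spanning the whole executable part of the chunk. */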
1763 i = 0;
1764 pSymFile->aSymbols[i].st_name = 0;
1765 pSymFile->aSymbols[i].st_shndx = SHN_UNDEF;
1766 pSymFile->aSymbols[i].st_value = 0;
1767 pSymFile->aSymbols[i].st_size = 0;
1768 pSymFile->aSymbols[i].st_info = ELF64_ST_INFO(STB_LOCAL, STT_NOTYPE);
1769 pSymFile->aSymbols[i].st_other = 0 /* STV_DEFAULT */;
1770# ifdef IEMNATIVE_USE_GDB_JIT_ET_DYN
1771 pSymFile->aDynSyms[0] = pSymFile->aSymbols[i];
1772# endif
1773 i++;
1774
1775 pSymFile->aSymbols[i].st_name = 0;
1776 pSymFile->aSymbols[i].st_shndx = SHN_ABS;
1777 pSymFile->aSymbols[i].st_value = 0;
1778 pSymFile->aSymbols[i].st_size = 0;
1779 pSymFile->aSymbols[i].st_info = ELF64_ST_INFO(STB_LOCAL, STT_FILE);
1780 pSymFile->aSymbols[i].st_other = 0 /* STV_DEFAULT */;
1781 i++;
1782
1783 pSymFile->aSymbols[i].st_name = offStrTab;
1784 APPEND_STR_FMT("iem_exec_chunk_%u_%u", pVCpu->idCpu, idxChunk);
1785# if 0
1786 pSymFile->aSymbols[i].st_shndx = iShText;
1787 pSymFile->aSymbols[i].st_value = 0;
1788# else
1789 pSymFile->aSymbols[i].st_shndx = SHN_ABS;
1790 pSymFile->aSymbols[i].st_value = (uintptr_t)(pSymFile + 1);
1791# endif
1792 pSymFile->aSymbols[i].st_size = pSymFile->aShdrs[iShText].sh_size;
1793 pSymFile->aSymbols[i].st_info = ELF64_ST_INFO(STB_GLOBAL, STT_FUNC);
1794 pSymFile->aSymbols[i].st_other = 0 /* STV_DEFAULT */;
1795# ifdef IEMNATIVE_USE_GDB_JIT_ET_DYN
1796 pSymFile->aDynSyms[1] = pSymFile->aSymbols[i];
1797 pSymFile->aDynSyms[1].st_value = (uintptr_t)(pSymFile + 1);
1798# endif
1799 i++;
1800
1801 Assert(i == RT_ELEMENTS(pSymFile->aSymbols));
1802 Assert(offStrTab < sizeof(pSymFile->szzStrTab));
1803
1804 /*
1805 * The GDB JIT entry and informing GDB.
1806 */
1807 pEhFrame->GdbJitEntry.pbSymFile = (uint8_t *)pSymFile;
1808# if 1
1809 pEhFrame->GdbJitEntry.cbSymFile = pExecMemAllocator->cbChunk - ((uintptr_t)pSymFile - (uintptr_t)pvChunk);
1810# else
1811 pEhFrame->GdbJitEntry.cbSymFile = sizeof(GDBJITSYMFILE);
1812# endif
1813
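    /* This follows the GDB JIT interface: the new entry is linked onto the global
       __jit_debug_descriptor list and __jit_debug_register_code() is called.  A
       debugger supporting the interface places a breakpoint on that (empty) function
       and, when it fires, reads the in-memory ELF image described by the entry. */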
1814 RTOnce(&g_IemNativeGdbJitOnce, iemNativeGdbJitInitOnce, NULL);
1815 RTCritSectEnter(&g_IemNativeGdbJitLock);
1816 pEhFrame->GdbJitEntry.pNext = NULL;
1817 pEhFrame->GdbJitEntry.pPrev = __jit_debug_descriptor.pTail;
1818 if (__jit_debug_descriptor.pTail)
1819 __jit_debug_descriptor.pTail->pNext = &pEhFrame->GdbJitEntry;
1820 else
1821 __jit_debug_descriptor.pHead = &pEhFrame->GdbJitEntry;
1822 __jit_debug_descriptor.pTail = &pEhFrame->GdbJitEntry;
1823 __jit_debug_descriptor.pRelevant = &pEhFrame->GdbJitEntry;
1824
1825 /* Notify GDB: */
1826 __jit_debug_descriptor.enmAction = kGdbJitaction_Register;
1827 __jit_debug_register_code();
1828 __jit_debug_descriptor.enmAction = kGdbJitaction_NoAction;
1829 RTCritSectLeave(&g_IemNativeGdbJitLock);
1830
1831# else /* !IEMNATIVE_USE_GDB_JIT */
1832 RT_NOREF(pVCpu);
1833# endif /* !IEMNATIVE_USE_GDB_JIT */
1834
1835 return VINF_SUCCESS;
1836}
1837
1838# endif /* !RT_OS_WINDOWS */
1839#endif /* IN_RING3 */
1840
1841
1842/**
1843 * Adds another chunk to the executable memory allocator.
1844 *
1845 * This is used by the init code for the initial allocation and later by the
1846 * regular allocator function when it's out of memory.
1847 */
1848static int iemExecMemAllocatorGrow(PVMCPUCC pVCpu, PIEMEXECMEMALLOCATOR pExecMemAllocator)
1849{
1850    /* Check that we have room for growth. */
1851 uint32_t const idxChunk = pExecMemAllocator->cChunks;
1852 AssertLogRelReturn(idxChunk < pExecMemAllocator->cMaxChunks, VERR_OUT_OF_RESOURCES);
1853
1854 /* Allocate a chunk. */
1855#ifdef RT_OS_DARWIN
1856 void *pvChunk = RTMemPageAllocEx(pExecMemAllocator->cbChunk, 0);
1857#else
1858 void *pvChunk = RTMemPageAllocEx(pExecMemAllocator->cbChunk, RTMEMPAGEALLOC_F_EXECUTABLE);
1859#endif
1860 AssertLogRelReturn(pvChunk, VERR_NO_EXEC_MEMORY);
1861
1862#ifdef RT_OS_DARWIN
1863 /*
1864     * Because it is impossible to have an RWX memory allocation on macOS, try to remap the memory
1865     * chunk readable/executable somewhere else so we can save ourselves the hassle of switching between
1866     * protections when executable memory is allocated.
1867 */
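    /* The net result is two mappings backed by the same pages: the original allocation
       stays writable (pvChunkRw below) for the recompiler to emit code into, while the
       remapped alias is switched to read+execute (pvChunkRx). */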
1868 int rc = VERR_NO_EXEC_MEMORY;
1869 mach_port_t hPortTask = mach_task_self();
1870 mach_vm_address_t AddrChunk = (mach_vm_address_t)pvChunk;
1871 mach_vm_address_t AddrRemapped = 0;
1872 vm_prot_t ProtCur = 0;
1873 vm_prot_t ProtMax = 0;
1874 kern_return_t krc = mach_vm_remap(hPortTask, &AddrRemapped, pExecMemAllocator->cbChunk, 0,
1875 VM_FLAGS_ANYWHERE | VM_FLAGS_RETURN_DATA_ADDR,
1876 hPortTask, AddrChunk, FALSE, &ProtCur, &ProtMax,
1877 VM_INHERIT_NONE);
1878 if (krc == KERN_SUCCESS)
1879 {
1880 krc = mach_vm_protect(mach_task_self(), AddrRemapped, pExecMemAllocator->cbChunk, FALSE, VM_PROT_READ | VM_PROT_EXECUTE);
1881 if (krc == KERN_SUCCESS)
1882 rc = VINF_SUCCESS;
1883 else
1884 {
1885 AssertLogRelMsgFailed(("mach_vm_protect -> %d (%#x)\n", krc, krc));
1886 krc = mach_vm_deallocate(hPortTask, AddrRemapped, pExecMemAllocator->cbChunk);
1887 Assert(krc == KERN_SUCCESS);
1888 }
1889 }
1890 else
1891 AssertLogRelMsgFailed(("mach_vm_remap -> %d (%#x)\n", krc, krc));
1892 if (RT_FAILURE(rc))
1893 {
1894 RTMemPageFree(pvChunk, pExecMemAllocator->cbChunk);
1895 return rc;
1896 }
1897
1898 void *pvChunkRx = (void *)AddrRemapped;
1899#else
1900 int rc = VINF_SUCCESS;
1901 void *pvChunkRx = pvChunk;
1902#endif
1903
1904 /*
1905 * Add the chunk.
1906 *
1907     * This must be done before the unwind init so Windows can allocate
1908 * memory from the chunk when using the alternative sub-allocator.
1909 */
1910 pExecMemAllocator->aChunks[idxChunk].pvChunkRw = pvChunk;
1911 pExecMemAllocator->aChunks[idxChunk].pvChunkRx = pvChunkRx;
1912#ifdef IN_RING3
1913 pExecMemAllocator->aChunks[idxChunk].pvUnwindInfo = NULL;
1914#endif
1915 pExecMemAllocator->aChunks[idxChunk].cFreeUnits = pExecMemAllocator->cUnitsPerChunk;
1916 pExecMemAllocator->aChunks[idxChunk].idxFreeHint = 0;
1917 memset(&pExecMemAllocator->pbmAlloc[pExecMemAllocator->cBitmapElementsPerChunk * idxChunk],
1918 0, sizeof(pExecMemAllocator->pbmAlloc[0]) * pExecMemAllocator->cBitmapElementsPerChunk);
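    /* One bit per allocation unit (RT_BIT_32(IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT) bytes),
       set means allocated.  The init code pre-marks the bitmap of every not-yet-added
       chunk as fully allocated, so clearing this slice is what actually opens the new
       chunk up for allocation. */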
1919
1920 pExecMemAllocator->cChunks = idxChunk + 1;
1921 pExecMemAllocator->idxChunkHint = idxChunk;
1922
1923 pExecMemAllocator->cbTotal += pExecMemAllocator->cbChunk;
1924 pExecMemAllocator->cbFree += pExecMemAllocator->cbChunk;
1925
1926    /* If there is a chunk context init callback, call it. */
1927 rc = iemNativeRecompileAttachExecMemChunkCtx(pVCpu, idxChunk, &pExecMemAllocator->aChunks[idxChunk].pCtx);
1928#ifdef IN_RING3
1929 /*
1930 * Initialize the unwind information (this cannot really fail atm).
1931 * (This sets pvUnwindInfo.)
1932 */
1933 if (RT_SUCCESS(rc))
1934 rc = iemExecMemAllocatorInitAndRegisterUnwindInfoForChunk(pVCpu, pExecMemAllocator, pvChunkRx, idxChunk);
1935#endif
1936 if (RT_SUCCESS(rc))
1937 { /* likely */ }
1938 else
1939 {
1940        /* Just in case the impossible happens, undo the above: */
1941 pExecMemAllocator->cbTotal -= pExecMemAllocator->cbChunk;
1942 pExecMemAllocator->cbFree -= pExecMemAllocator->aChunks[idxChunk].cFreeUnits << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT;
1943 pExecMemAllocator->cChunks = idxChunk;
1944 memset(&pExecMemAllocator->pbmAlloc[pExecMemAllocator->cBitmapElementsPerChunk * idxChunk],
1945 0xff, sizeof(pExecMemAllocator->pbmAlloc[0]) * pExecMemAllocator->cBitmapElementsPerChunk);
1946 pExecMemAllocator->aChunks[idxChunk].pvChunkRw = NULL;
1947 pExecMemAllocator->aChunks[idxChunk].cFreeUnits = 0;
1948
1949# ifdef RT_OS_DARWIN
1950 krc = mach_vm_deallocate(mach_task_self(), (mach_vm_address_t)pExecMemAllocator->aChunks[idxChunk].pvChunkRx,
1951 pExecMemAllocator->cbChunk);
1952 Assert(krc == KERN_SUCCESS);
1953# endif
1954
1955 RTMemPageFree(pvChunk, pExecMemAllocator->cbChunk);
1956 return rc;
1957 }
1958
1959 return VINF_SUCCESS;
1960}
1961
1962
1963/**
1964 * Initializes the executable memory allocator for native recompilation on the
1965 * calling EMT.
1966 *
1967 * @returns VBox status code.
1968 * @param pVCpu The cross context virtual CPU structure of the calling
1969 * thread.
1970 * @param cbMax The max size of the allocator.
1971 * @param cbInitial The initial allocator size.
1972 * @param cbChunk The chunk size, 0 or UINT32_MAX for default (@a cbMax
1973 * dependent).
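 * @note   Purely illustrative (not values used by any caller): asking for at most
 *         64 MB with 1 MB allocated up front and the default chunk size would be
 *         iemExecMemAllocatorInit(pVCpu, _64M, _1M, UINT32_MAX).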
1974 */
1975int iemExecMemAllocatorInit(PVMCPU pVCpu, uint64_t cbMax, uint64_t cbInitial, uint32_t cbChunk) RT_NOEXCEPT
1976{
1977 /*
1978 * Validate input.
1979 */
1980 AssertLogRelMsgReturn(cbMax >= _1M && cbMax <= _4G+_4G, ("cbMax=%RU64 (%RX64)\n", cbMax, cbMax), VERR_OUT_OF_RANGE);
1981 AssertReturn(cbInitial <= cbMax, VERR_OUT_OF_RANGE);
1982    AssertLogRelMsgReturn(   cbChunk == UINT32_MAX
1983 || cbChunk == 0
1984 || ( RT_IS_POWER_OF_TWO(cbChunk)
1985 && cbChunk >= _1M
1986 && cbChunk <= _256M
1987 && cbChunk <= cbMax),
1988 ("cbChunk=%RU32 (%RX32) cbMax=%RU64\n", cbChunk, cbChunk, cbMax),
1989 VERR_OUT_OF_RANGE);
1990
1991 /*
1992 * Adjust/figure out the chunk size.
1993 */
1994 if (cbChunk == 0 || cbChunk == UINT32_MAX)
1995 {
1996 if (cbMax >= _256M)
1997 cbChunk = _64M;
1998 else
1999 {
2000 if (cbMax < _16M)
2001 cbChunk = cbMax >= _4M ? _4M : (uint32_t)cbMax;
2002 else
2003 cbChunk = (uint32_t)cbMax / 4;
2004 if (!RT_IS_POWER_OF_TWO(cbChunk))
2005 cbChunk = RT_BIT_32(ASMBitLastSetU32(cbChunk));
2006 }
2007 }
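    /* For instance (illustrative numbers): cbMax = 512M picks 64M chunks, cbMax = 8M
       picks 4M chunks, and cbMax = 24M yields 24M/4 = 6M, which the power-of-two fixup
       above rounds up to 8M. */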
2008#if defined(RT_ARCH_AMD64)
2009 Assert(cbChunk <= _2G);
2010#elif defined(RT_ARCH_ARM64)
2011 if (cbChunk > _128M)
2012 cbChunk = _128M; /* Max relative branch distance is +/-2^(25+2) = +/-0x8000000 (134 217 728). */
2013#endif
2014
2015 if (cbChunk > cbMax)
2016 cbMax = cbChunk;
2017 else
2018 cbMax = (cbMax - 1 + cbChunk) / cbChunk * cbChunk;
2019 uint32_t const cMaxChunks = (uint32_t)(cbMax / cbChunk);
2020 AssertLogRelReturn((uint64_t)cMaxChunks * cbChunk == cbMax, VERR_INTERNAL_ERROR_3);
2021
2022 /*
2023     * Allocate and initialize the allocator instance.
2024 */
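    /* A single RTMemAllocZ call carries three things: the allocator structure itself
       (including aChunks[cMaxChunks]), the allocation bitmaps (one bit per allocation
       unit, i.e. cbChunk >> (IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT + 3) bytes per chunk,
       tracked in 64-bit elements), and on non-Windows ring-3 hosts one eh_frame scratch
       area per chunk for the unwind registration. */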
2025 size_t const offBitmaps = RT_ALIGN_Z(RT_UOFFSETOF_DYN(IEMEXECMEMALLOCATOR, aChunks[cMaxChunks]), RT_CACHELINE_SIZE);
2026 size_t const cbBitmaps = (size_t)(cbChunk >> (IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT + 3)) * cMaxChunks;
2027 size_t cbNeeded = offBitmaps + cbBitmaps;
2028 AssertCompile(IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT <= 10);
2029 Assert(cbChunk > RT_BIT_32(IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT + 3));
2030#if defined(IN_RING3) && !defined(RT_OS_WINDOWS)
2031 size_t const offEhFrames = RT_ALIGN_Z(cbNeeded, RT_CACHELINE_SIZE);
2032    cbNeeded  = offEhFrames + sizeof(IEMEXECMEMCHUNKEHFRAME) * cMaxChunks;
2033#endif
2034 PIEMEXECMEMALLOCATOR pExecMemAllocator = (PIEMEXECMEMALLOCATOR)RTMemAllocZ(cbNeeded);
2035 AssertLogRelMsgReturn(pExecMemAllocator, ("cbNeeded=%zx cMaxChunks=%#x cbChunk=%#x\n", cbNeeded, cMaxChunks, cbChunk),
2036 VERR_NO_MEMORY);
2037 pExecMemAllocator->uMagic = IEMEXECMEMALLOCATOR_MAGIC;
2038 pExecMemAllocator->cbChunk = cbChunk;
2039 pExecMemAllocator->cMaxChunks = cMaxChunks;
2040 pExecMemAllocator->cChunks = 0;
2041 pExecMemAllocator->idxChunkHint = 0;
2042 pExecMemAllocator->cAllocations = 0;
2043 pExecMemAllocator->cbTotal = 0;
2044 pExecMemAllocator->cbFree = 0;
2045 pExecMemAllocator->cbAllocated = 0;
2046#ifdef VBOX_WITH_STATISTICS
2047 pExecMemAllocator->cbUnusable = 0;
2048#endif
2049 pExecMemAllocator->pbmAlloc = (uint64_t *)((uintptr_t)pExecMemAllocator + offBitmaps);
2050 pExecMemAllocator->cUnitsPerChunk = cbChunk >> IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT;
2051 pExecMemAllocator->cBitmapElementsPerChunk = cbChunk >> (IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT + 6);
2052 memset(pExecMemAllocator->pbmAlloc, 0xff, cbBitmaps); /* Mark everything as allocated. Clear when chunks are added. */
2053#if defined(IN_RING3) && !defined(RT_OS_WINDOWS)
2054 pExecMemAllocator->paEhFrames = (PIEMEXECMEMCHUNKEHFRAME)((uintptr_t)pExecMemAllocator + offEhFrames);
2055#endif
2056 for (uint32_t i = 0; i < cMaxChunks; i++)
2057 {
2058 pExecMemAllocator->aChunks[i].cFreeUnits = 0;
2059 pExecMemAllocator->aChunks[i].idxFreeHint = 0;
2060 pExecMemAllocator->aChunks[i].pvChunkRw = NULL;
2061#ifdef IN_RING0
2062 pExecMemAllocator->aChunks[i].hMemObj = NIL_RTR0MEMOBJ;
2063#else
2064 pExecMemAllocator->aChunks[i].pvUnwindInfo = NULL;
2065#endif
2066 }
2067 pVCpu->iem.s.pExecMemAllocatorR3 = pExecMemAllocator;
2068
2069 /*
2070 * Do the initial allocations.
2071 */
2072    while ((uint64_t)pExecMemAllocator->cChunks * pExecMemAllocator->cbChunk < cbInitial)
2073 {
2074 int rc = iemExecMemAllocatorGrow(pVCpu, pExecMemAllocator);
2075 AssertLogRelRCReturn(rc, rc);
2076 }
2077
2078 pExecMemAllocator->idxChunkHint = 0;
2079
2080 /*
2081 * Register statistics.
2082 */
2083 PUVM const pUVM = pVCpu->pUVCpu->pUVM;
2084    STAMR3RegisterFU(pUVM, &pExecMemAllocator->cAllocations, STAMTYPE_U32, STAMVISIBILITY_ALWAYS, STAMUNIT_COUNT,
2085 "Current number of allocations", "/IEM/CPU%u/re/ExecMem/cAllocations", pVCpu->idCpu);
2086 STAMR3RegisterFU(pUVM, &pExecMemAllocator->cChunks, STAMTYPE_U32, STAMVISIBILITY_ALWAYS, STAMUNIT_COUNT,
2087 "Currently allocated chunks", "/IEM/CPU%u/re/ExecMem/cChunks", pVCpu->idCpu);
2088 STAMR3RegisterFU(pUVM, &pExecMemAllocator->cMaxChunks, STAMTYPE_U32, STAMVISIBILITY_ALWAYS, STAMUNIT_COUNT,
2089 "Maximum number of chunks", "/IEM/CPU%u/re/ExecMem/cMaxChunks", pVCpu->idCpu);
2090 STAMR3RegisterFU(pUVM, &pExecMemAllocator->cbChunk, STAMTYPE_U32, STAMVISIBILITY_ALWAYS, STAMUNIT_BYTES,
2091 "Allocation chunk size", "/IEM/CPU%u/re/ExecMem/cbChunk", pVCpu->idCpu);
2092 STAMR3RegisterFU(pUVM, &pExecMemAllocator->cbAllocated, STAMTYPE_U64, STAMVISIBILITY_ALWAYS, STAMUNIT_BYTES,
2093                     "Number of bytes currently allocated", "/IEM/CPU%u/re/ExecMem/cbAllocated", pVCpu->idCpu);
2094 STAMR3RegisterFU(pUVM, &pExecMemAllocator->cbFree, STAMTYPE_U64, STAMVISIBILITY_ALWAYS, STAMUNIT_BYTES,
2095                     "Number of bytes currently free", "/IEM/CPU%u/re/ExecMem/cbFree", pVCpu->idCpu);
2096 STAMR3RegisterFU(pUVM, &pExecMemAllocator->cbTotal, STAMTYPE_U64, STAMVISIBILITY_ALWAYS, STAMUNIT_BYTES,
2097                     "Total number of bytes", "/IEM/CPU%u/re/ExecMem/cbTotal", pVCpu->idCpu);
2098#ifdef VBOX_WITH_STATISTICS
2099 STAMR3RegisterFU(pUVM, &pExecMemAllocator->cbUnusable, STAMTYPE_U64, STAMVISIBILITY_ALWAYS, STAMUNIT_BYTES,
2100                     "Total number of unusable bytes", "/IEM/CPU%u/re/ExecMem/cbUnusable", pVCpu->idCpu);
2101 STAMR3RegisterFU(pUVM, &pExecMemAllocator->StatAlloc, STAMTYPE_PROFILE, STAMVISIBILITY_ALWAYS, STAMUNIT_TICKS_PER_CALL,
2102 "Profiling the allocator", "/IEM/CPU%u/re/ExecMem/ProfAlloc", pVCpu->idCpu);
2103#endif
2104#ifdef IEMEXECMEM_ALT_SUB_WITH_ALT_PRUNING
2105 STAMR3RegisterFU(pUVM, &pExecMemAllocator->StatPruneProf, STAMTYPE_PROFILE, STAMVISIBILITY_ALWAYS, STAMUNIT_TICKS_PER_CALL,
2106 "Pruning executable memory (alt)", "/IEM/CPU%u/re/ExecMem/Pruning", pVCpu->idCpu);
2107 STAMR3RegisterFU(pUVM, &pExecMemAllocator->StatPruneRecovered, STAMTYPE_PROFILE, STAMVISIBILITY_ALWAYS, STAMUNIT_BYTES_PER_CALL,
2108 "Bytes recovered while pruning", "/IEM/CPU%u/re/ExecMem/PruningRecovered", pVCpu->idCpu);
2109#endif
2110 STAMR3RegisterFU(pUVM, &pExecMemAllocator->cFruitlessChunkScans, STAMTYPE_U64_RESET, STAMVISIBILITY_ALWAYS, STAMUNIT_COUNT,
2111 "Chunks fruitlessly scanned for free space", "/IEM/CPU%u/re/ExecMem/FruitlessChunkScans", pVCpu->idCpu);
2112
2113 return VINF_SUCCESS;
2114}
2115