VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllN8veExecMem.cpp@ 107361

Last change on this file since 107361 was 107211, checked in by vboxsync, 2 months ago

VMM/IEM: Unwind info update for win.arm64. jiraref:VBP-1466

1/* $Id: IEMAllN8veExecMem.cpp 107211 2024-12-02 10:52:56Z vboxsync $ */
2/** @file
3 * IEM - Native Recompiler, Executable Memory Allocator.
4 */
5
6/*
7 * Copyright (C) 2023-2024 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28
29/*********************************************************************************************************************************
30* Header Files *
31*********************************************************************************************************************************/
32#define LOG_GROUP LOG_GROUP_IEM_RE_NATIVE
33#define IEM_WITH_OPAQUE_DECODER_STATE
34#define VMM_INCLUDED_SRC_include_IEMMc_h /* block IEMMc.h inclusion. */
35#include <VBox/vmm/iem.h>
36#include <VBox/vmm/cpum.h>
37#include "IEMInternal.h"
38#include <VBox/vmm/vmcc.h>
39#include <VBox/log.h>
40#include <VBox/err.h>
41#include <VBox/param.h>
42#include <iprt/assert.h>
43#include <iprt/mem.h>
44#include <iprt/string.h>
45#if defined(RT_ARCH_AMD64)
46# include <iprt/x86.h>
47#elif defined(RT_ARCH_ARM64)
48# include <iprt/armv8.h>
49#endif
50
51#ifdef RT_OS_WINDOWS
52# include <iprt/formats/pecoff.h> /* this is incompatible with windows.h, thus: */
53extern "C" DECLIMPORT(uint8_t) __cdecl RtlAddFunctionTable(void *pvFunctionTable, uint32_t cEntries, uintptr_t uBaseAddress);
54extern "C" DECLIMPORT(uint8_t) __cdecl RtlDelFunctionTable(void *pvFunctionTable);
55#else
56# include <iprt/formats/dwarf.h>
57# if defined(RT_OS_DARWIN)
58# include <libkern/OSCacheControl.h>
59# include <mach/mach.h>
60# include <mach/mach_vm.h>
61# define IEMNATIVE_USE_LIBUNWIND
62extern "C" void __register_frame(const void *pvFde);
63extern "C" void __deregister_frame(const void *pvFde);
64# else
65# ifdef DEBUG_bird /** @todo not thread safe yet */
66# define IEMNATIVE_USE_GDB_JIT
67# endif
68# ifdef IEMNATIVE_USE_GDB_JIT
69# include <iprt/critsect.h>
70# include <iprt/once.h>
71# include <iprt/formats/elf64.h>
72# endif
73extern "C" void __register_frame_info(void *pvBegin, void *pvObj); /* found no header for these two */
74extern "C" void *__deregister_frame_info(void *pvBegin); /* (returns pvObj from __register_frame_info call) */
75# endif
76#endif
77
78#include "IEMN8veRecompiler.h"
79
80
81/*********************************************************************************************************************************
82* Executable Memory Allocator *
83*********************************************************************************************************************************/
84/** The chunk sub-allocation unit size in bytes. */
85#define IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE 256
86/** The chunk sub-allocation unit size as a shift factor. */
87#define IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT 8
88/** Enables adding a header to the sub-allocator allocations.
89 * This is useful for freeing up executable memory among other things. */
90#define IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER
91/** Use alternative pruning. */
92#define IEMEXECMEM_ALT_SUB_WITH_ALT_PRUNING
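/* Note: the unit size and shift above must agree (256 == 1 << 8), as the
   allocator converts between byte counts and unit counts by shifting rather
   than dividing (see iemExecMemAllocBytesToUnits below). */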
93
94
95#if defined(IN_RING3) && !defined(RT_OS_WINDOWS)
96# ifdef IEMNATIVE_USE_GDB_JIT
97# define IEMNATIVE_USE_GDB_JIT_ET_DYN
98
99/** GDB JIT: Code entry. */
100typedef struct GDBJITCODEENTRY
101{
102 struct GDBJITCODEENTRY *pNext;
103 struct GDBJITCODEENTRY *pPrev;
104 uint8_t *pbSymFile;
105 uint64_t cbSymFile;
106} GDBJITCODEENTRY;
107
108/** GDB JIT: Actions. */
109typedef enum GDBJITACTIONS : uint32_t
110{
111 kGdbJitaction_NoAction = 0, kGdbJitaction_Register, kGdbJitaction_Unregister
112} GDBJITACTIONS;
113
114/** GDB JIT: Descriptor. */
115typedef struct GDBJITDESCRIPTOR
116{
117 uint32_t uVersion;
118 GDBJITACTIONS enmAction;
119 GDBJITCODEENTRY *pRelevant;
120 GDBJITCODEENTRY *pHead;
121 /** Our addition: */
122 GDBJITCODEENTRY *pTail;
123} GDBJITDESCRIPTOR;
124
125/** GDB JIT: Our simple symbol file data. */
126typedef struct GDBJITSYMFILE
127{
128 Elf64_Ehdr EHdr;
129# ifndef IEMNATIVE_USE_GDB_JIT_ET_DYN
130 Elf64_Shdr aShdrs[5];
131# else
132 Elf64_Shdr aShdrs[7];
133 Elf64_Phdr aPhdrs[2];
134# endif
135 /** The dwarf ehframe data for the chunk. */
136 uint8_t abEhFrame[512];
137 char szzStrTab[128];
138 Elf64_Sym aSymbols[3];
139# ifdef IEMNATIVE_USE_GDB_JIT_ET_DYN
140 Elf64_Sym aDynSyms[2];
141 Elf64_Dyn aDyn[6];
142# endif
143} GDBJITSYMFILE;
144
145extern "C" GDBJITDESCRIPTOR __jit_debug_descriptor;
146extern "C" DECLEXPORT(void) __jit_debug_register_code(void);
147
148/** Init once for g_IemNativeGdbJitLock. */
149static RTONCE g_IemNativeGdbJitOnce = RTONCE_INITIALIZER;
151/** The critical section (initialized via g_IemNativeGdbJitOnce). */
151static RTCRITSECT g_IemNativeGdbJitLock;
152
153/** GDB reads the info here. */
154GDBJITDESCRIPTOR __jit_debug_descriptor = { 1, kGdbJitaction_NoAction, NULL, NULL };
155
156/** GDB sets a breakpoint on this and checks __jit_debug_descriptor when hit. */
157DECL_NO_INLINE(RT_NOTHING, DECLEXPORT(void)) __jit_debug_register_code(void)
158{
159 ASMNopPause();
160}
161
162/** @callback_method_impl{FNRTONCE} */
163static DECLCALLBACK(int32_t) iemNativeGdbJitInitOnce(void *pvUser)
164{
165 RT_NOREF(pvUser);
166 return RTCritSectInit(&g_IemNativeGdbJitLock);
167}
168
169
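# if 0
/* Illustrative sketch of how a chunk's symbol file could be announced to GDB
   using the structures above, assuming the standard GDB JIT interface
   semantics: link the entry into the list, mark it as the relevant one, and
   call __jit_debug_register_code() so the debugger's breakpoint on that
   function fires and it re-reads the descriptor. The helper name and its
   parameters are made up for the example and it is not called anywhere. */
static void iemNativeGdbJitRegisterExample(GDBJITCODEENTRY *pEntry, uint8_t *pbSymFile, uint64_t cbSymFile)
{
    RTOnce(&g_IemNativeGdbJitOnce, iemNativeGdbJitInitOnce, NULL);
    RTCritSectEnter(&g_IemNativeGdbJitLock);

    pEntry->pbSymFile = pbSymFile;
    pEntry->cbSymFile = cbSymFile;
    pEntry->pPrev     = NULL;
    pEntry->pNext     = __jit_debug_descriptor.pHead;
    if (pEntry->pNext)
        pEntry->pNext->pPrev = pEntry;
    else
        __jit_debug_descriptor.pTail = pEntry;           /* list was empty */
    __jit_debug_descriptor.pHead     = pEntry;
    __jit_debug_descriptor.pRelevant = pEntry;
    __jit_debug_descriptor.enmAction = kGdbJitaction_Register;
    __jit_debug_register_code();

    RTCritSectLeave(&g_IemNativeGdbJitLock);
}
# endif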
170# endif /* IEMNATIVE_USE_GDB_JIT */
171
172/**
173 * Per-chunk unwind info for non-windows hosts.
174 */
175typedef struct IEMEXECMEMCHUNKEHFRAME
176{
177# ifdef IEMNATIVE_USE_LIBUNWIND
178 /** The offset of the FDA into abEhFrame. */
179 uintptr_t offFda;
180# else
181 /** 'struct object' storage area. */
182 uint8_t abObject[1024];
183# endif
184# ifdef IEMNATIVE_USE_GDB_JIT
185# if 0
186 /** The GDB JIT 'symbol file' data. */
187 GDBJITSYMFILE GdbJitSymFile;
188# endif
189 /** The GDB JIT list entry. */
190 GDBJITCODEENTRY GdbJitEntry;
191# endif
192 /** The dwarf ehframe data for the chunk. */
193 uint8_t abEhFrame[512];
194} IEMEXECMEMCHUNKEHFRAME;
195/** Pointer to per-chunk unwind info for non-windows hosts. */
196typedef IEMEXECMEMCHUNKEHFRAME *PIEMEXECMEMCHUNKEHFRAME;
197#endif
198
199
200/**
201 * A chunk of executable memory.
202 */
203typedef struct IEMEXECMEMCHUNK
204{
205 /** Number of free items in this chunk. */
206 uint32_t cFreeUnits;
207 /** Hint where to start searching for free space in the allocation bitmap. */
208 uint32_t idxFreeHint;
209 /** Pointer to the readable/writeable view of the memory chunk. */
210 void *pvChunkRw;
211 /** Pointer to the readable/executable view of the memory chunk. */
212 void *pvChunkRx;
213 /** Pointer to the context structure detailing the per chunk common code. */
214 PCIEMNATIVEPERCHUNKCTX pCtx;
215#ifdef IN_RING3
216 /**
217 * Pointer to the unwind information.
218 *
219 * This is used during C++ throw and longjmp (windows and probably most other
220 * platforms). Some debuggers (windbg) make use of it as well.
221 *
222 * Windows: This is allocated from hHeap on windows because (at least for
223 * AMD64) the UNWIND_INFO structure address in the
224 * RUNTIME_FUNCTION entry is an RVA and the chunk is the "image".
225 *
226 * Others: Allocated from the regular heap to avoid unnecessary executable data
227 * structures. This points to an IEMEXECMEMCHUNKEHFRAME structure. */
228 void *pvUnwindInfo;
229#elif defined(IN_RING0)
230 /** Allocation handle. */
231 RTR0MEMOBJ hMemObj;
232#endif
233} IEMEXECMEMCHUNK;
234/** Pointer to a memory chunk. */
235typedef IEMEXECMEMCHUNK *PIEMEXECMEMCHUNK;
236
237
238/**
239 * Executable memory allocator for the native recompiler.
240 */
241typedef struct IEMEXECMEMALLOCATOR
242{
243 /** Magic value (IEMEXECMEMALLOCATOR_MAGIC). */
244 uint32_t uMagic;
245
246 /** The chunk size. */
247 uint32_t cbChunk;
248 /** The maximum number of chunks. */
249 uint32_t cMaxChunks;
250 /** The current number of chunks. */
251 uint32_t cChunks;
252 /** Hint where to start looking for available memory. */
253 uint32_t idxChunkHint;
254 /** Statistics: Current number of allocations. */
255 uint32_t cAllocations;
256
257 /** The total amount of memory available. */
258 uint64_t cbTotal;
259 /** Total amount of free memory. */
260 uint64_t cbFree;
261 /** Total amount of memory allocated. */
262 uint64_t cbAllocated;
263
264 /** Pointer to the allocation bitmaps for all the chunks (follows aChunks).
265 *
266 * Since the chunk size is a power of two and the minimum chunk size is a lot
267 * higher than the IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE, each chunk will always
268 * require a whole number of uint64_t elements in the allocation bitmap. So,
269 * for sake of simplicity, they are allocated as one continous chunk for
270 * simplicity/laziness. */
271 uint64_t *pbmAlloc;
272 /** Number of units (IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE) per chunk. */
273 uint32_t cUnitsPerChunk;
274 /** Number of bitmap elements per chunk (for quickly locating the bitmap
275 * portion corresponding to a chunk). */
276 uint32_t cBitmapElementsPerChunk;
277
278 /** Number of times we fruitlessly scanned a chunk for free space. */
279 uint64_t cFruitlessChunkScans;
280
281#ifdef IEMEXECMEM_ALT_SUB_WITH_ALT_PRUNING
282 /** The next chunk to prune in. */
283 uint32_t idxChunkPrune;
284 /** Offset within the chunk to start pruning at. */
285 uint32_t offChunkPrune;
286 /** Profiling the pruning code. */
287 STAMPROFILE StatPruneProf;
288 /** Number of bytes recovered by the pruning. */
289 STAMPROFILE StatPruneRecovered;
290#endif
291
292#ifdef VBOX_WITH_STATISTICS
293 STAMPROFILE StatAlloc;
294 /** Total amount of memory not being usable currently due to IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE. */
295 uint64_t cbUnusable;
296 /** Allocation size distribution (in alloc units; 0 is the slop bucket). */
297 STAMCOUNTER aStatSizes[16];
298#endif
299
300#if defined(IN_RING3) && !defined(RT_OS_WINDOWS)
301 /** Pointer to the array of unwind info running parallel to aChunks (same
302 * allocation as this structure, located after the bitmaps).
303 * (For Windows, the structures must reside in 32-bit RVA distance to the
304 * actual chunk, so they are allocated off the chunk.) */
305 PIEMEXECMEMCHUNKEHFRAME paEhFrames;
306#endif
307
308 /** The allocation chunks. */
309 RT_FLEXIBLE_ARRAY_EXTENSION
310 IEMEXECMEMCHUNK aChunks[RT_FLEXIBLE_ARRAY];
311} IEMEXECMEMALLOCATOR;
312/** Pointer to an executable memory allocator. */
313typedef IEMEXECMEMALLOCATOR *PIEMEXECMEMALLOCATOR;
314
315/** Magic value for IEMEXECMEMALLOCATOR::uMagic (Scott Frederick Turow). */
316#define IEMEXECMEMALLOCATOR_MAGIC UINT32_C(0x19490412)
317
318
319#ifdef IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER
320/**
321 * Allocation header.
322 */
323typedef struct IEMEXECMEMALLOCHDR
324{
325 RT_GCC_EXTENSION
326 union
327 {
328 struct
329 {
330 /** Magic value / eyecatcher (IEMEXECMEMALLOCHDR_MAGIC). */
331 uint32_t uMagic;
332 /** The allocation chunk (for speeding up freeing). */
333 uint32_t idxChunk;
334 };
335 /** Combined magic and chunk index, for the pruning scanner code. */
336 uint64_t u64MagicAndChunkIdx;
337 };
338 /** Pointer to the translation block the allocation belongs to.
339 * This is the whole point of the header. */
340 PIEMTB pTb;
341} IEMEXECMEMALLOCHDR;
342/** Pointer to an allocation header. */
343typedef IEMEXECMEMALLOCHDR *PIEMEXECMEMALLOCHDR;
344/** Magic value for IEMEXECMEMALLOCHDR ('ExeM'). */
345# define IEMEXECMEMALLOCHDR_MAGIC UINT32_C(0x4d657845)
346#endif
347
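/* With the allocation header enabled, the pointer handed out by the allocator
   points directly after an IEMEXECMEMALLOCHDR, so freeing can recover the
   header via ((PIEMEXECMEMALLOCHDR)pv - 1) and the pruning scanner can identify
   live blocks by matching u64MagicAndChunkIdx (see iemExecMemAllocatorPrune and
   iemExecMemAllocatorFree below). */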
348
349static int iemExecMemAllocatorGrow(PVMCPUCC pVCpu, PIEMEXECMEMALLOCATOR pExecMemAllocator);
350
351
352#ifdef IEMEXECMEM_ALT_SUB_WITH_ALT_PRUNING
353/**
354 * Frees up executable memory when we're out of space.
355 *
356 * This is an alternative to iemTbAllocatorFreeupNativeSpace() that frees up
357 * space in a more linear fashion from the allocator's point of view. It may
358 * also defragment if implemented & enabled.
359 */
360static void iemExecMemAllocatorPrune(PVMCPU pVCpu, PIEMEXECMEMALLOCATOR pExecMemAllocator)
361{
362# ifndef IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER
363# error "IEMEXECMEM_ALT_SUB_WITH_ALT_PRUNING requires IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER"
364# endif
365 STAM_REL_PROFILE_START(&pExecMemAllocator->StatPruneProf, a);
366
367 /*
368 * Before we can start, we must process delayed frees.
369 */
370#if 1
371 PIEMTBALLOCATOR const pTbAllocator = iemTbAllocatorFreeBulkStart(pVCpu);
372#else
373 iemTbAllocatorProcessDelayedFrees(pVCpu, pVCpu->iem.s.pTbAllocatorR3);
374#endif
375
376 AssertCompile(RT_IS_POWER_OF_TWO(IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE));
377
378 uint32_t const cbChunk = pExecMemAllocator->cbChunk;
379 AssertReturnVoid(RT_IS_POWER_OF_TWO(cbChunk));
380 AssertReturnVoid(cbChunk >= _1M && cbChunk <= _256M); /* see iemExecMemAllocatorInit */
381
382 uint32_t const cChunks = pExecMemAllocator->cChunks;
383 AssertReturnVoid(cChunks == pExecMemAllocator->cMaxChunks);
384 AssertReturnVoid(cChunks >= 1);
385
386 Assert(!pVCpu->iem.s.pCurTbR3);
387
388 /*
389 * Decide how much to prune. The chunk size is a power of two, so we'll be
390 * scanning a power-of-two sized portion of it here as well.
391 */
392 uint32_t cbToPrune = cbChunk;
393
394 /* Never more than 25%. */
395 if (cChunks < 4)
396 cbToPrune /= cChunks == 1 ? 4 : 2;
397
398 /* Upper limit. In a debug build a 4MB limit averages out at ~0.6ms per call. */
399 if (cbToPrune > _4M)
400 cbToPrune = _4M;
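    /* For example: a single 8 MB chunk gets 2 MB pruned per call (25% rule),
       while with four or more 8 MB chunks the 4 MB cap above is what applies. */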
401
402 /*
403 * Adjust the pruning chunk and offset accordingly.
404 */
405 uint32_t idxChunk = pExecMemAllocator->idxChunkPrune;
406 uint32_t offChunk = pExecMemAllocator->offChunkPrune;
407 offChunk &= ~(uint32_t)(IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE - 1U);
408 if (offChunk >= cbChunk)
409 {
410 offChunk = 0;
411 idxChunk += 1;
412 }
413 if (idxChunk >= cChunks)
414 {
415 offChunk = 0;
416 idxChunk = 0;
417 }
418
419 uint32_t const offPruneStart = offChunk;
420 uint32_t const offPruneEnd = RT_MIN(offChunk + cbToPrune, cbChunk);
421
422 /*
423 * Do the pruning. The current approach is the severe kind.
424 *
425 * This is memory bound, as we must load both the allocation header and the
426 * associated TB and then modify them. So, the CPU isn't all that utilized
427 * here. We try to apply some prefetching to speed it up a tiny bit.
428 */
429 uint64_t cbPruned = 0;
430 uint64_t const u64MagicAndChunkIdx = RT_MAKE_U64(IEMEXECMEMALLOCHDR_MAGIC, idxChunk);
431 uint8_t * const pbChunk = (uint8_t *)pExecMemAllocator->aChunks[idxChunk].pvChunkRx;
432 while (offChunk < offPruneEnd)
433 {
434 PIEMEXECMEMALLOCHDR pHdr = (PIEMEXECMEMALLOCHDR)&pbChunk[offChunk];
435
436 /* Is this the start of an allocation block for a TB? (We typically
437 have one allocation at the start of each chunk for the unwind info
438 where pTb is NULL.) */
439 PIEMTB pTb;
440 if ( pHdr->u64MagicAndChunkIdx == u64MagicAndChunkIdx
441 && RT_LIKELY((pTb = pHdr->pTb) != NULL))
442 {
443 AssertPtr(pTb);
444
445 uint32_t const cbBlock = RT_ALIGN_32(pTb->Native.cInstructions * sizeof(IEMNATIVEINSTR) + sizeof(*pHdr),
446 IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE);
447
448 /* Prefetch the next header before freeing the current one and its TB. */
449 /** @todo Iff the block size was part of the header in some way, this could be
450 * a tiny bit faster. */
451 offChunk += cbBlock;
452#if defined(_MSC_VER) && defined(RT_ARCH_AMD64)
453 _mm_prefetch((char *)&pbChunk[offChunk], _MM_HINT_T0);
454#elif defined(_MSC_VER) && defined(RT_ARCH_ARM64)
455 __prefetch(&pbChunk[offChunk]);
456#else
457 __builtin_prefetch(&pbChunk[offChunk], 1 /*rw*/);
458#endif
459 /* Some paranoia first, though. */
460 AssertBreakStmt(offChunk <= cbChunk, offChunk -= cbBlock - IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE);
461 cbPruned += cbBlock;
462
463#if 1
464 iemTbAllocatorFreeBulk(pVCpu, pTbAllocator, pTb);
465#else
466 iemTbAllocatorFree(pVCpu, pTb);
467#endif
468 }
469 else
470 offChunk += IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE;
471 }
472 STAM_REL_PROFILE_ADD_PERIOD(&pExecMemAllocator->StatPruneRecovered, cbPruned);
473
474 pVCpu->iem.s.ppTbLookupEntryR3 = &pVCpu->iem.s.pTbLookupEntryDummyR3;
475
476 /*
477 * Save the current pruning point.
478 */
479 pExecMemAllocator->offChunkPrune = offChunk;
480 pExecMemAllocator->idxChunkPrune = idxChunk;
481
482 /* Set the hint to the start of the pruned region. */
483 pExecMemAllocator->idxChunkHint = idxChunk;
484 pExecMemAllocator->aChunks[idxChunk].idxFreeHint = offPruneStart / IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE;
485
486 STAM_REL_PROFILE_STOP(&pExecMemAllocator->StatPruneProf, a);
487}
488#endif /* IEMEXECMEM_ALT_SUB_WITH_ALT_PRUNING */
489
490
491#if defined(VBOX_STRICT) || 0
492/**
493 * The old bitmap scanner code, for comparison and assertions.
494 */
495static uint32_t iemExecMemAllocatorFindReqFreeUnitsOld(uint64_t *pbmAlloc, uint32_t cToScan, uint32_t cReqUnits)
496{
497 /** @todo This can probably be done more efficiently for non-x86 systems. */
498 int iBit = ASMBitFirstClear(pbmAlloc, cToScan);
499 while (iBit >= 0 && (uint32_t)iBit <= cToScan - cReqUnits)
500 {
501 uint32_t idxAddBit = 1;
502 while (idxAddBit < cReqUnits && !ASMBitTest(pbmAlloc, (uint32_t)iBit + idxAddBit))
503 idxAddBit++;
504 if (idxAddBit >= cReqUnits)
505 return (uint32_t)iBit;
506 iBit = ASMBitNextClear(pbmAlloc, cToScan, iBit + idxAddBit - 1);
507 }
508 return UINT32_MAX;
509}
510#endif
511
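/* Both scanners return the index of the first bit in a run of at least
   cReqUnits clear bits, or UINT32_MAX if no such run exists. For example,
   with pbmAlloc[0] == 0xff (units 0 thru 7 allocated, units 8 thru 63 free)
   and cReqUnits == 4, both should return 8. */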
512
513/**
514 * Bitmap scanner code that looks for a bunch of @a cReqUnits zero bits.
515 *
516 * Booting win11 with an r165098 release build, the average native TB size is
517 * around 9 units (of 256 bytes). So, it is unlikely we need to scan any
518 * subsequent words once we hit a patch of zeros, thus @a a_fBig.
519 *
520 * @todo This needs more tweaking. While it *is* faster than the old code,
521 * it doesn't seem like it's all that much. :/
522 */
523template<const bool a_fBig>
524static uint32_t iemExecMemAllocatorFindReqFreeUnits(uint64_t *pbmAlloc, uint32_t c64WordsToScan, uint32_t cReqUnits)
525{
526 /*
527 * Scan the (section of the) allocation bitmap in 64-bit words.
528 */
529 unsigned cPrevLeadingZeros = 0;
530 for (uint32_t off = 0; off < c64WordsToScan; off++)
531 {
532 uint64_t uWord = pbmAlloc[off];
533 if (uWord == UINT64_MAX)
534 {
535 /*
536 * Getting thru patches of UINT64_MAX is a frequent problem when the allocator
537 * fills up, so it's definitely worth optimizing.
538 *
539 * The complicated code below is a bit faster on arm. Reducing the per TB cost
540 * from 4255ns to 4106ns (best run out of 10). On win/amd64 there isn't an
541 * obvious gain here, at least not with the data currently being profiled.
542 */
543#if 1
544 off++;
545 uint32_t cQuads = (c64WordsToScan - off) / 4;
546
547 /* Align. */
548 if (cQuads > 1)
549 switch (((uintptr_t)&pbmAlloc[off] / sizeof(uint64_t)) & 3)
550 {
551 case 0:
552 break;
553 case 1:
554 {
555 uWord = pbmAlloc[off];
556 uint64_t uWord1 = pbmAlloc[off + 1];
557 uint64_t uWord2 = pbmAlloc[off + 2];
558 if ((uWord & uWord1 & uWord2) == UINT64_MAX)
559 {
560 off += 3;
561 cQuads = (c64WordsToScan - off) / 4;
562 }
563 else if (uWord == UINT64_MAX)
564 {
565 if (uWord1 != UINT64_MAX)
566 {
567 uWord = uWord1;
568 off += 1;
569 }
570 else
571 {
572 uWord = uWord2;
573 off += 2;
574 }
575 }
576 break;
577 }
578 case 2:
579 {
580 uWord = pbmAlloc[off];
581 uint64_t uWord1 = pbmAlloc[off + 1];
582 if ((uWord & uWord1) == UINT64_MAX)
583 {
584 off += 2;
585 cQuads = (c64WordsToScan - off) / 4;
586 }
587 else if (uWord == UINT64_MAX)
588 {
589 uWord = uWord1;
590 off += 1;
591 }
592 break;
593 }
594 case 3:
595 uWord = pbmAlloc[off];
596 if (uWord == UINT64_MAX)
597 {
598 off++;
599 cQuads = (c64WordsToScan - off) / 4;
600 }
601 break;
602 }
603 if (uWord == UINT64_MAX)
604 {
605 /* Looping over 32 bytes at a time. */
606 for (;;)
607 {
608 if (cQuads-- > 0)
609 {
610 uWord = pbmAlloc[off + 0];
611 uint64_t uWord1 = pbmAlloc[off + 1];
612 uint64_t uWord2 = pbmAlloc[off + 2];
613 uint64_t uWord3 = pbmAlloc[off + 3];
614 if ((uWord & uWord1 & uWord2 & uWord3) == UINT64_MAX)
615 off += 4;
616 else
617 {
618 if (uWord != UINT64_MAX)
619 { }
620 else if (uWord1 != UINT64_MAX)
621 {
622 uWord = uWord1;
623 off += 1;
624 }
625 else if (uWord2 != UINT64_MAX)
626 {
627 uWord = uWord2;
628 off += 2;
629 }
630 else
631 {
632 uWord = uWord3;
633 off += 3;
634 }
635 break;
636 }
637 }
638 else
639 {
640 if (off < c64WordsToScan)
641 {
642 uWord = pbmAlloc[off];
643 if (uWord != UINT64_MAX)
644 break;
645 off++;
646 if (off < c64WordsToScan)
647 {
648 uWord = pbmAlloc[off];
649 if (uWord != UINT64_MAX)
650 break;
651 off++;
652 if (off < c64WordsToScan)
653 {
654 uWord = pbmAlloc[off];
655 if (uWord != UINT64_MAX)
656 break;
657 Assert(off + 1 == c64WordsToScan);
658 }
659 }
660 }
661 return UINT32_MAX;
662 }
663 }
664 }
665#else
666 do
667 {
668 off++;
669 if (off < c64WordsToScan)
670 uWord = pbmAlloc[off];
671 else
672 return UINT32_MAX;
673 } while (uWord == UINT64_MAX);
674#endif
675 cPrevLeadingZeros = 0;
676 }
677
678 /*
679 * If we get down here, we have a word that isn't UINT64_MAX.
680 */
681 if (uWord != 0)
682 {
683 /*
684 * Fend off large requests we cannot satisfy before the first set bit.
685 */
686 if (!a_fBig || cReqUnits < 64 + cPrevLeadingZeros)
687 {
688#ifdef __GNUC__
689 unsigned cZerosInWord = __builtin_popcountl(~uWord);
690#elif defined(_MSC_VER) && defined(RT_ARCH_AMD64)
691 unsigned cZerosInWord = __popcnt64(~uWord);
692#elif defined(_MSC_VER) && defined(RT_ARCH_ARM64)
693 unsigned cZerosInWord = _CountOneBits64(~uWord);
694#else
695# pragma message("need popcount intrinsic or something...")
696 unsigned cZerosInWord = 0;
697 for (uint64_t uTmp = ~uWord; uTmp; cZerosInWord++)
698 uTmp &= uTmp - 1; /* Clears the least significant bit set. */
699#endif
700 if (cZerosInWord + cPrevLeadingZeros >= cReqUnits)
701 {
702 /* Check if we've got a patch of zeros at the trailing end
703 when joined with the previous word: */
704#ifdef __GNUC__
705 unsigned cTrailingZeros = __builtin_ctzl(uWord);
706#else
707 unsigned cTrailingZeros = ASMBitFirstSetU64(uWord) - 1;
708#endif
709 if (cPrevLeadingZeros + cTrailingZeros >= cReqUnits)
710 return off * 64 - cPrevLeadingZeros;
711
712 /*
713 * Try leading zeros before we get on with the tedious stuff.
714 */
715#ifdef __GNUC__
716 cPrevLeadingZeros = __builtin_clzl(uWord);
717#else
718 cPrevLeadingZeros = 64 - ASMBitLastSetU64(uWord);
719#endif
720 if (cPrevLeadingZeros >= cReqUnits)
721 return (off + 1) * 64 - cPrevLeadingZeros;
722
723 /*
724 * Check the popcount again sans leading & trailing before looking
725 * inside the word.
726 */
727 cZerosInWord -= cPrevLeadingZeros + cTrailingZeros;
728 if (cZerosInWord >= cReqUnits)
729 {
730 /* 1; 64 - 0 - 1 = 63; */
731 unsigned const iBitLast = 64 - cPrevLeadingZeros - cReqUnits; /** @todo boundary */
732 unsigned iBit = cTrailingZeros;
733 uWord >>= cTrailingZeros;
734 do
735 {
736 Assert(uWord & 1);
737#ifdef __GNUC__
738 unsigned iZeroBit = __builtin_ctzl(~uWord);
739#else
740 unsigned iZeroBit = ASMBitFirstSetU64(~uWord) - 1;
741#endif
742 iBit += iZeroBit;
743 uWord >>= iZeroBit;
744 Assert(iBit <= iBitLast);
745 Assert((uWord & 1) == 0);
746#ifdef __GNUC__
747 unsigned cZeros = __builtin_ctzl(uWord);
748#else
749 unsigned cZeros = ASMBitFirstSetU64(uWord) - 1;
750#endif
751 if (cZeros >= cReqUnits)
752 return off * 64 + iBit;
753
754 cZerosInWord -= cZeros; /* (may underflow as we will count shifted in zeros) */
755 iBit += cZeros;
756 uWord >>= cZeros;
757 } while ((int)cZerosInWord >= (int)cReqUnits && iBit < iBitLast);
758 }
759 continue; /* we've already calculated cPrevLeadingZeros */
760 }
761 }
762
763 /* Update the leading (MSB) zero count. */
764#ifdef __GNUC__
765 cPrevLeadingZeros = __builtin_clzl(uWord);
766#else
767 cPrevLeadingZeros = 64 - ASMBitLastSetU64(uWord);
768#endif
769 }
770 /*
771 * uWord == 0
772 */
773 else
774 {
775 if RT_CONSTEXPR_IF(!a_fBig)
776 return off * 64 - cPrevLeadingZeros;
777 else /* keep else */
778 {
779 if (cPrevLeadingZeros + 64 >= cReqUnits)
780 return off * 64 - cPrevLeadingZeros;
781 for (uint32_t off2 = off + 1;; off2++)
782 {
783 if (off2 < c64WordsToScan)
784 {
785 uWord = pbmAlloc[off2];
786 if (uWord == UINT64_MAX)
787 {
788 cPrevLeadingZeros = 0;
789 break;
790 }
791 if (uWord == 0)
792 {
793 if (cPrevLeadingZeros + (off2 - off + 1) * 64 >= cReqUnits)
794 return off * 64 - cPrevLeadingZeros;
795 }
796 else
797 {
798#ifdef __GNUC__
799 unsigned cTrailingZeros = __builtin_ctzl(uWord);
800#else
801 unsigned cTrailingZeros = ASMBitFirstSetU64(uWord) - 1;
802#endif
803 if (cPrevLeadingZeros + (off2 - off) * 64 + cTrailingZeros >= cReqUnits)
804 return off * 64 - cPrevLeadingZeros;
805#ifdef __GNUC__
806 cPrevLeadingZeros = __builtin_clzl(uWord);
807#else
808 cPrevLeadingZeros = 64 - ASMBitLastSetU64(uWord);
809#endif
810 break;
811 }
812 }
813 else
814 return UINT32_MAX;
815 }
816 }
817 }
818 }
819 return UINT32_MAX;
820}
821
822
823/**
824 * Try allocate a block of @a cReqUnits in the chunk @a idxChunk.
825 */
826static void *
827iemExecMemAllocatorAllocInChunkInt(PIEMEXECMEMALLOCATOR pExecMemAllocator, uint64_t *pbmAlloc, uint32_t idxFirst,
828 uint32_t cToScan, uint32_t cReqUnits, uint32_t idxChunk, PIEMTB pTb,
829 void **ppvExec, PCIEMNATIVEPERCHUNKCTX *ppChunkCtx)
830{
831 /*
832 * Shift the bitmap to the idxFirst bit so we can use ASMBitFirstClear.
833 */
834 Assert(!(cToScan & 63));
835 Assert(!(idxFirst & 63));
836 Assert(cToScan + idxFirst <= pExecMemAllocator->cUnitsPerChunk);
837 pbmAlloc += idxFirst / 64;
838 cToScan += idxFirst & 63;
839 Assert(!(cToScan & 63));
840
841#if 1
842 uint32_t const iBit = cReqUnits < 64
843 ? iemExecMemAllocatorFindReqFreeUnits<false>(pbmAlloc, cToScan / 64, cReqUnits)
844 : iemExecMemAllocatorFindReqFreeUnits<true>( pbmAlloc, cToScan / 64, cReqUnits);
845# ifdef VBOX_STRICT
846 uint32_t const iBitOld = iemExecMemAllocatorFindReqFreeUnitsOld(pbmAlloc, cToScan, cReqUnits);
847 AssertMsg( iBit == iBitOld
848 || (iBit / 64) == (iBitOld / 64), /* New algorithm will return trailing hit before middle. */
849 ("iBit=%#x (%#018RX64); iBitOld=%#x (%#018RX64); cReqUnits=%#x\n",
850 iBit, iBit != UINT32_MAX ? pbmAlloc[iBit / 64] : 0,
851 iBitOld, iBitOld != UINT32_MAX ? pbmAlloc[iBitOld / 64] : 0, cReqUnits));
852# endif
853#else
854 uint32_t const iBit = iemExecMemAllocatorFindReqFreeUnitsOld(pbmAlloc, cToScan, cReqUnits);
855#endif
856 if (iBit != UINT32_MAX)
857 {
858 ASMBitSetRange(pbmAlloc, (uint32_t)iBit, (uint32_t)iBit + cReqUnits);
859
860 PIEMEXECMEMCHUNK const pChunk = &pExecMemAllocator->aChunks[idxChunk];
861 pChunk->cFreeUnits -= cReqUnits;
862 pChunk->idxFreeHint = (uint32_t)iBit + cReqUnits;
863
864 pExecMemAllocator->cAllocations += 1;
865 uint32_t const cbReq = cReqUnits << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT;
866 pExecMemAllocator->cbAllocated += cbReq;
867 pExecMemAllocator->cbFree -= cbReq;
868 pExecMemAllocator->idxChunkHint = idxChunk;
869
870 void * const pvMemRw = (uint8_t *)pChunk->pvChunkRw
871 + ((idxFirst + (uint32_t)iBit) << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT);
872
873 if (ppChunkCtx)
874 *ppChunkCtx = pChunk->pCtx;
875
876 /*
877 * Initialize the header and return.
878 */
879# ifdef IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER
880 PIEMEXECMEMALLOCHDR const pHdr = (PIEMEXECMEMALLOCHDR)pvMemRw;
881 pHdr->uMagic = IEMEXECMEMALLOCHDR_MAGIC;
882 pHdr->idxChunk = idxChunk;
883 pHdr->pTb = pTb;
884
885 if (ppvExec)
886 *ppvExec = (uint8_t *)pChunk->pvChunkRx
887 + ((idxFirst + (uint32_t)iBit) << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT)
888 + sizeof(*pHdr);
889
890 return pHdr + 1;
891#else
892 if (ppvExec)
893 *ppvExec = (uint8_t *)pChunk->pvChunkRx
894 + ((idxFirst + (uint32_t)iBit) << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT);
895
896 RT_NOREF(pTb);
897 return pvMemRw;
898#endif
899 }
900
901 return NULL;
902}
903
904
905/**
906 * Converts requested number of bytes into a unit count.
907 */
908DECL_FORCE_INLINE(uint32_t) iemExecMemAllocBytesToUnits(uint32_t cbReq)
909{
910#ifdef IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER
911 return (cbReq + sizeof(IEMEXECMEMALLOCHDR) + IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE - 1)
912#else
913 return (cbReq + IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE - 1)
914#endif
915 >> IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT;
916}
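/* Worked example (assuming a 64-bit host where the allocation header is 16
   bytes): a 2000 byte request becomes (2000 + 16 + 255) >> 8 = 8 units, i.e.
   2048 bytes, of which the 48 bytes beyond the request are what cbUnusable
   accounts for in statistics builds. */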
917
918
919DECL_FORCE_INLINE(PIEMNATIVEINSTR)
920iemExecMemAllocatorAllocUnitsInChunkInner(PIEMEXECMEMALLOCATOR pExecMemAllocator, uint32_t idxChunk, uint32_t cReqUnits,
921 PIEMTB pTb, PIEMNATIVEINSTR *ppaExec, PCIEMNATIVEPERCHUNKCTX *ppChunkCtx)
922{
923 uint64_t * const pbmAlloc = &pExecMemAllocator->pbmAlloc[pExecMemAllocator->cBitmapElementsPerChunk * idxChunk];
924 uint32_t const idxHint = pExecMemAllocator->aChunks[idxChunk].idxFreeHint & ~(uint32_t)63;
925 if (idxHint + cReqUnits <= pExecMemAllocator->cUnitsPerChunk)
926 {
927 void *pvRet = iemExecMemAllocatorAllocInChunkInt(pExecMemAllocator, pbmAlloc, idxHint,
928 pExecMemAllocator->cUnitsPerChunk - idxHint,
929 cReqUnits, idxChunk, pTb, (void **)ppaExec, ppChunkCtx);
930 if (pvRet)
931 return (PIEMNATIVEINSTR)pvRet;
932 }
933 void *pvRet = iemExecMemAllocatorAllocInChunkInt(pExecMemAllocator, pbmAlloc, 0,
934 RT_MIN(pExecMemAllocator->cUnitsPerChunk,
935 RT_ALIGN_32(idxHint + cReqUnits, 64*4)),
936 cReqUnits, idxChunk, pTb, (void **)ppaExec, ppChunkCtx);
937 if (pvRet)
938 return (PIEMNATIVEINSTR)pvRet;
939
940 pExecMemAllocator->cFruitlessChunkScans += 1;
941 return NULL;
942}
943
944
945DECLINLINE(PIEMNATIVEINSTR)
946iemExecMemAllocatorAllocBytesInChunk(PIEMEXECMEMALLOCATOR pExecMemAllocator, uint32_t idxChunk, uint32_t cbReq,
947 PIEMNATIVEINSTR *ppaExec)
948{
949 uint32_t const cReqUnits = iemExecMemAllocBytesToUnits(cbReq);
950 if (cReqUnits <= pExecMemAllocator->aChunks[idxChunk].cFreeUnits)
951 return iemExecMemAllocatorAllocUnitsInChunkInner(pExecMemAllocator, idxChunk, cReqUnits, NULL /*pTb*/,
952 ppaExec, NULL /*ppChunkCtx*/);
953 return NULL;
954}
955
956
957/**
958 * Allocates @a cbReq bytes of executable memory.
959 *
960 * @returns Pointer to the readable/writeable memory, NULL if out of memory or other problem
961 * encountered.
962 * @param pVCpu The cross context virtual CPU structure of the
963 * calling thread.
964 * @param cbReq How many bytes are required.
965 * @param pTb The translation block that will be using the allocation.
966 * @param ppaExec Where to return the pointer to executable view of
967 * the allocated memory, optional.
968 * @param ppChunkCtx Where to return the per chunk attached context
969 * if available, optional.
970 */
971DECLHIDDEN(PIEMNATIVEINSTR) iemExecMemAllocatorAlloc(PVMCPU pVCpu, uint32_t cbReq, PIEMTB pTb,
972 PIEMNATIVEINSTR *ppaExec, PCIEMNATIVEPERCHUNKCTX *ppChunkCtx) RT_NOEXCEPT
973{
974 PIEMEXECMEMALLOCATOR pExecMemAllocator = pVCpu->iem.s.pExecMemAllocatorR3;
975 AssertReturn(pExecMemAllocator && pExecMemAllocator->uMagic == IEMEXECMEMALLOCATOR_MAGIC, NULL);
976 AssertMsgReturn(cbReq > 32 && cbReq < _512K, ("%#x\n", cbReq), NULL);
977 STAM_PROFILE_START(&pExecMemAllocator->StatAlloc, a);
978
979 uint32_t const cReqUnits = iemExecMemAllocBytesToUnits(cbReq);
980 STAM_COUNTER_INC(&pExecMemAllocator->aStatSizes[cReqUnits < RT_ELEMENTS(pExecMemAllocator->aStatSizes) ? cReqUnits : 0]);
981 for (unsigned iIteration = 0;; iIteration++)
982 {
983 if ( cbReq * 2 <= pExecMemAllocator->cbFree
984 || (cReqUnits == 1 || pExecMemAllocator->cbFree >= IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE) )
985 {
986 uint32_t const cChunks = pExecMemAllocator->cChunks;
987 uint32_t const idxChunkHint = pExecMemAllocator->idxChunkHint < cChunks ? pExecMemAllocator->idxChunkHint : 0;
988
989 /*
990 * We do two passes here: in the first pass we skip chunks with fewer than cReqUnits * 16
991 * free units; the second pass then checks the chunks skipped in the first pass.
992 */
993 for (uint32_t cMinFreePass = cReqUnits == 1 ? cReqUnits : cReqUnits * 16, cMaxFreePass = UINT32_MAX;;)
994 {
995 for (uint32_t idxChunk = idxChunkHint; idxChunk < cChunks; idxChunk++)
996 if ( pExecMemAllocator->aChunks[idxChunk].cFreeUnits >= cMinFreePass
997 && pExecMemAllocator->aChunks[idxChunk].cFreeUnits <= cMaxFreePass)
998 {
999 PIEMNATIVEINSTR const pRet = iemExecMemAllocatorAllocUnitsInChunkInner(pExecMemAllocator, idxChunk,
1000 cReqUnits, pTb, ppaExec, ppChunkCtx);
1001 if (pRet)
1002 {
1003 STAM_PROFILE_STOP(&pExecMemAllocator->StatAlloc, a);
1004#ifdef VBOX_WITH_STATISTICS
1005 pExecMemAllocator->cbUnusable += (cReqUnits << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT) - cbReq;
1006#endif
1007 return pRet;
1008 }
1009 }
1010 for (uint32_t idxChunk = 0; idxChunk < idxChunkHint; idxChunk++)
1011 if ( pExecMemAllocator->aChunks[idxChunk].cFreeUnits >= cMinFreePass
1012 && pExecMemAllocator->aChunks[idxChunk].cFreeUnits <= cMaxFreePass)
1013 {
1014 PIEMNATIVEINSTR const pRet = iemExecMemAllocatorAllocUnitsInChunkInner(pExecMemAllocator, idxChunk,
1015 cReqUnits, pTb, ppaExec, ppChunkCtx);
1016 if (pRet)
1017 {
1018 STAM_PROFILE_STOP(&pExecMemAllocator->StatAlloc, a);
1019#ifdef VBOX_WITH_STATISTICS
1020 pExecMemAllocator->cbUnusable += (cReqUnits << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT) - cbReq;
1021#endif
1022 return pRet;
1023 }
1024 }
1025 if (cMinFreePass <= cReqUnits * 2)
1026 break;
1027 cMaxFreePass = cMinFreePass - 1;
1028 cMinFreePass = cReqUnits * 2;
1029 }
1030 }
1031
1032 /*
1033 * Can we grow it with another chunk?
1034 */
1035 if (pExecMemAllocator->cChunks < pExecMemAllocator->cMaxChunks)
1036 {
1037 int rc = iemExecMemAllocatorGrow(pVCpu, pExecMemAllocator);
1038 AssertLogRelRCReturn(rc, NULL);
1039
1040 uint32_t const idxChunk = pExecMemAllocator->cChunks - 1;
1041 PIEMNATIVEINSTR const pRet = iemExecMemAllocatorAllocUnitsInChunkInner(pExecMemAllocator, idxChunk, cReqUnits, pTb,
1042 ppaExec, ppChunkCtx);
1043 if (pRet)
1044 {
1045 STAM_PROFILE_STOP(&pExecMemAllocator->StatAlloc, a);
1046#ifdef VBOX_WITH_STATISTICS
1047 pExecMemAllocator->cbUnusable += (cReqUnits << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT) - cbReq;
1048#endif
1049 return pRet;
1050 }
1051 AssertFailed();
1052 }
1053
1054 /*
1055 * Try prune native TBs once.
1056 */
1057 if (iIteration == 0)
1058 {
1059#ifdef IEMEXECMEM_ALT_SUB_WITH_ALT_PRUNING
1060 iemExecMemAllocatorPrune(pVCpu, pExecMemAllocator);
1061#else
1062 /* No header included in the instruction count here. */
1063 uint32_t const cNeededInstrs = RT_ALIGN_32(cbReq, IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE) / sizeof(IEMNATIVEINSTR);
1064 iemTbAllocatorFreeupNativeSpace(pVCpu, cNeededInstrs);
1065#endif
1066 }
1067 else
1068 {
1069 STAM_REL_COUNTER_INC(&pVCpu->iem.s.StatNativeExecMemInstrBufAllocFailed);
1070 STAM_PROFILE_STOP(&pExecMemAllocator->StatAlloc, a);
1071 return NULL;
1072 }
1073 }
1074}
1075
1076
1077/** This is a hook to ensure the instruction cache is properly flushed before the code in the memory
1078 * given by @a pv and @a cb is executed */
1079DECLHIDDEN(void) iemExecMemAllocatorReadyForUse(PVMCPUCC pVCpu, void *pv, size_t cb) RT_NOEXCEPT
1080{
1081#ifdef RT_OS_DARWIN
1082 /*
1083 * We need to synchronize the stuff we wrote to the data cache with the
1084 * instruction cache, since these aren't coherent on arm (or at least not
1085 * on Apple Mn CPUs).
1086 *
1087 * Note! Since we don't share any JIT'ed code with the other CPUs, we don't
1088 * really care whether the dcache is fully flushed back to memory. It
1089 * only needs to hit the level 2 cache, which the level 1 instruction
1090 * and data caches seem to be sharing. In ARM terms, we need to reach
1091 * a point of unification (PoU), rather than a point of coherency (PoC).
1092 *
1093 * https://developer.apple.com/documentation/apple-silicon/porting-just-in-time-compilers-to-apple-silicon
1094 *
1095 * https://developer.arm.com/documentation/den0013/d/Caches/Point-of-coherency-and-unification
1096 *
1097 * Experimenting with the approach used by sys_icache_invalidate() and
1098 * tweaking it a little could let us shave off a bit of effort. The thing
1099 * that slows the apple code down on an M2 (running Sonoma 13.4) seems to be
1100 * the 'DSB ISH' instructions performed every 20 icache line flushes.
1101 * Skipping these saves ~100ns or more per TB when profiling the native
1102 * recompiler on the TBs from a win11 full boot-desktop-shutdown sequence.
1103 * Thus we will leave DCACHE_ICACHE_SYNC_WITH_WITH_IVAU_DSB undefined if we
1104 * can.
1105 *
1106 * There appears not to be much difference between DSB options 'ISH',
1107 * 'ISHST', 'NSH' and 'NSHST'. The latter is theoretically all we need, so
1108 * we'll use that one.
1109 *
1110 * See https://developer.arm.com/documentation/100941/0101/Barriers for
1111 * details on the barrier options.
1112 *
1113 * Note! The CFG value "/IEM/HostICacheInvalidationViaHostAPI" can be used
1114 * to disable the experimental code should it misbehave.
1115 */
1116 uint8_t const fHostICacheInvalidation = pVCpu->iem.s.fHostICacheInvalidation;
1117 if (!(fHostICacheInvalidation & IEMNATIVE_ICACHE_F_USE_HOST_API))
1118 {
1119# define DCACHE_ICACHE_SYNC_DSB_OPTION "nshst"
1120/*# define DCACHE_ICACHE_SYNC_WITH_WITH_IVAU_DSB*/
1121
1122 /* Skipping this is fine, but doesn't impact perf much. */
1123 __asm__ __volatile__("dsb " DCACHE_ICACHE_SYNC_DSB_OPTION);
1124
1125 /* Invalidate the icache for the range [pv,pv+cb). */
1126# ifdef DCACHE_ICACHE_SYNC_WITH_WITH_IVAU_DSB
1127 size_t const cIvauDsbEvery = 20;
1128 unsigned cDsb = cIvauDsbEvery;
1129# endif
1130 size_t const cbCacheLine = 64;
1131 size_t cbInvalidate = cb + ((uintptr_t)pv & (cbCacheLine - 1));
1132 size_t cCacheLines = RT_ALIGN_Z(cbInvalidate, cbCacheLine) / cbCacheLine;
1133 uintptr_t uPtr = (uintptr_t)pv & ~(uintptr_t)(cbCacheLine - 1);
1134 for (;; uPtr += cbCacheLine)
1135 {
1136 __asm__ /*__volatile__*/("ic ivau, %0" : : "r" (uPtr));
1137 cCacheLines -= 1;
1138 if (!cCacheLines)
1139 break;
1140# ifdef DCACHE_ICACHE_SYNC_WITH_WITH_IVAU_DSB
1141 cDsb -= 1;
1142 if (cDsb != 0)
1143 { /* likely */ }
1144 else
1145 {
1146 __asm__ __volatile__("dsb " DCACHE_ICACHE_SYNC_DSB_OPTION);
1147 cDsb = cIvauDsbEvery;
1148 }
1149# endif
1150 }
1151
1152 /*
1153 * The DSB here is non-optional it seems.
1154 *
1155 * The following ISB can be omitted on M2 without any obvious side effects;
1156 * omitting it produces better numbers in the above mentioned profiling scenario.
1157 * This could be related to the kHasICDSB flag in cpu_capabilities.h,
1158 * but it doesn't look like that flag is set here (M2, Sonoma 13.4).
1159 *
1160 * I've made the inclusion of the ISH barrier configurable, with
1161 * a default of skipping it.
1162 */
1163 if (!(fHostICacheInvalidation & IEMNATIVE_ICACHE_F_END_WITH_ISH))
1164 __asm__ __volatile__("dsb " DCACHE_ICACHE_SYNC_DSB_OPTION
1165 ::: "memory");
1166 else
1167 __asm__ __volatile__("dsb " DCACHE_ICACHE_SYNC_DSB_OPTION "\n\t"
1168 "isb"
1169 ::: "memory");
1170 }
1171 else
1172 sys_icache_invalidate(pv, cb);
1173
1174#elif defined(RT_OS_LINUX) && defined(RT_ARCH_ARM64)
1175 RT_NOREF(pVCpu);
1176
1177 /* There is __builtin___clear_cache() but it flushes both the instruction and data cache, so do it manually. */
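    /* CTR_EL0 bits [3:0] (IminLine) hold log2 of the smallest icache line size
       in 4-byte words, hence the '4 << (ctr_el0 & 0xf)' conversion to bytes below. */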
1178 static uint32_t s_u32CtrEl0 = 0;
1179 if (!s_u32CtrEl0)
1180 asm volatile ("mrs %0, ctr_el0":"=r" (s_u32CtrEl0));
1181 uintptr_t cbICacheLine = (uintptr_t)4 << (s_u32CtrEl0 & 0xf);
1182
1183 uintptr_t pb = (uintptr_t)pv & ~(cbICacheLine - 1);
1184 for (; pb < (uintptr_t)pv + cb; pb += cbICacheLine)
1185 asm volatile ("ic ivau, %0" : : "r" (pb) : "memory");
1186
1187 asm volatile ("dsb ish\n\t isb\n\t" : : : "memory");
1188
1189#else
1190 RT_NOREF(pVCpu, pv, cb);
1191#endif
1192}
1193
1194
1195/**
1196 * Frees executable memory.
1197 */
1198DECLHIDDEN(void) iemExecMemAllocatorFree(PVMCPU pVCpu, void *pv, size_t cb) RT_NOEXCEPT
1199{
1200 PIEMEXECMEMALLOCATOR pExecMemAllocator = pVCpu->iem.s.pExecMemAllocatorR3;
1201 Assert(pExecMemAllocator && pExecMemAllocator->uMagic == IEMEXECMEMALLOCATOR_MAGIC);
1202 AssertPtr(pv);
1203#ifdef VBOX_WITH_STATISTICS
1204 size_t const cbOrig = cb;
1205#endif
1206#ifndef IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER
1207 Assert(!((uintptr_t)pv & (IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE - 1)));
1208
1209 /* Align the size as we did when allocating the block. */
1210 cb = RT_ALIGN_Z(cb, IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE);
1211
1212#else
1213 PIEMEXECMEMALLOCHDR pHdr = (PIEMEXECMEMALLOCHDR)pv - 1;
1214 Assert(!((uintptr_t)pHdr & (IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE - 1)));
1215 AssertReturnVoid(pHdr->uMagic == IEMEXECMEMALLOCHDR_MAGIC);
1216 uint32_t const idxChunk = pHdr->idxChunk;
1217 AssertReturnVoid(idxChunk < pExecMemAllocator->cChunks);
1218 pv = pHdr;
1219
1220 /* Adjust and align the size to cover the whole allocation area. */
1221 cb = RT_ALIGN_Z(cb + sizeof(*pHdr), IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE);
1222#endif
1223
1224 /* Free it / assert sanity. */
1225 bool fFound = false;
1226 uint32_t const cbChunk = pExecMemAllocator->cbChunk;
1227#ifndef IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER
1228 uint32_t const cChunks = pExecMemAllocator->cChunks;
1229 for (uint32_t idxChunk = 0; idxChunk < cChunks; idxChunk++)
1230#endif
1231 {
1232 uintptr_t const offChunk = (uintptr_t)pv - (uintptr_t)pExecMemAllocator->aChunks[idxChunk].pvChunkRx;
1233 fFound = offChunk < cbChunk;
1234 if (fFound)
1235 {
1236 uint32_t const idxFirst = (uint32_t)offChunk >> IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT;
1237 uint32_t const cReqUnits = (uint32_t)cb >> IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT;
1238
1239 /* Check that it's valid and free it. */
1240 uint64_t * const pbmAlloc = &pExecMemAllocator->pbmAlloc[pExecMemAllocator->cBitmapElementsPerChunk * idxChunk];
1241 AssertReturnVoid(ASMBitTest(pbmAlloc, idxFirst));
1242 for (uint32_t i = 1; i < cReqUnits; i++)
1243 AssertReturnVoid(ASMBitTest(pbmAlloc, idxFirst + i));
1244 ASMBitClearRange(pbmAlloc, idxFirst, idxFirst + cReqUnits);
1245
1246 /* Invalidate the header using the writeable memory view. */
1247 pHdr = (PIEMEXECMEMALLOCHDR)((uintptr_t)pExecMemAllocator->aChunks[idxChunk].pvChunkRw + offChunk);
1248#ifdef IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER
1249 pHdr->uMagic = 0;
1250 pHdr->idxChunk = 0;
1251 pHdr->pTb = NULL;
1252#endif
1253 pExecMemAllocator->aChunks[idxChunk].cFreeUnits += cReqUnits;
1254 pExecMemAllocator->aChunks[idxChunk].idxFreeHint = idxFirst;
1255
1256 /* Update the stats. */
1257 pExecMemAllocator->cbAllocated -= cb;
1258 pExecMemAllocator->cbFree += cb;
1259 pExecMemAllocator->cAllocations -= 1;
1260#ifdef VBOX_WITH_STATISTICS
1261 pExecMemAllocator->cbUnusable -= (cReqUnits << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT) - cbOrig;
1262#endif
1263 return;
1264 }
1265 }
1266 AssertFailed();
1267}
1268
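#if 0
/* Illustrative sketch of the intended call sequence for the three entry points
   above. The helper name, the pbCode/cbCode parameters and the error handling
   are made up for the example; code is emitted via the read/write view, the
   instruction cache is synchronized for the executable view, and it is the
   executable address that gets run and eventually passed to
   iemExecMemAllocatorFree(). (The allocator asserts 32 < cbCode < 512K.) */
static PIEMNATIVEINSTR iemExecMemExampleEmit(PVMCPUCC pVCpu, PIEMTB pTb, uint8_t const *pbCode, uint32_t cbCode)
{
    PIEMNATIVEINSTR paExec = NULL;
    PIEMNATIVEINSTR const paRw = iemExecMemAllocatorAlloc(pVCpu, cbCode, pTb, &paExec, NULL /*ppChunkCtx*/);
    if (!paRw)
        return NULL;
    memcpy(paRw, pbCode, cbCode);                           /* emit into the RW view */
    iemExecMemAllocatorReadyForUse(pVCpu, paExec, cbCode);  /* sync the icache for the RX view */
    return paExec;
}
#endif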
1269
1270/**
1271 * Interface used by iemNativeRecompileAttachExecMemChunkCtx and unwind info
1272 * generators.
1273 */
1274DECLHIDDEN(PIEMNATIVEINSTR)
1275iemExecMemAllocatorAllocFromChunk(PVMCPU pVCpu, uint32_t idxChunk, uint32_t cbReq, PIEMNATIVEINSTR *ppaExec)
1276{
1277 PIEMEXECMEMALLOCATOR pExecMemAllocator = pVCpu->iem.s.pExecMemAllocatorR3;
1278 AssertReturn(idxChunk < pExecMemAllocator->cChunks, NULL);
1279 Assert(cbReq < _1M);
1280 return iemExecMemAllocatorAllocBytesInChunk(pExecMemAllocator, idxChunk, cbReq, ppaExec);
1281}
1282
1283
1284/**
1285 * For getting the per-chunk context detailing common code for a TB.
1286 *
1287 * This is for use by the disassembler.
1288 */
1289DECLHIDDEN(PCIEMNATIVEPERCHUNKCTX) iemExecMemGetTbChunkCtx(PVMCPU pVCpu, PCIEMTB pTb)
1290{
1291 PIEMEXECMEMALLOCATOR pExecMemAllocator = pVCpu->iem.s.pExecMemAllocatorR3;
1292 if ((pTb->fFlags & IEMTB_F_TYPE_MASK) == IEMTB_F_TYPE_NATIVE)
1293 {
1294 uintptr_t const uAddress = (uintptr_t)pTb->Native.paInstructions;
1295 uint32_t const cbChunk = pExecMemAllocator->cbChunk;
1296 uint32_t idxChunk = pExecMemAllocator->cChunks;
1297 while (idxChunk-- > 0)
1298 if (uAddress - (uintptr_t)pExecMemAllocator->aChunks[idxChunk].pvChunkRx < cbChunk)
1299 return pExecMemAllocator->aChunks[idxChunk].pCtx;
1300 }
1301 return NULL;
1302}
1303
1304
1305#ifdef IN_RING3
1306# ifdef RT_OS_WINDOWS
1307
1308/**
1309 * Initializes the unwind info structures for windows hosts.
1310 */
1311static int
1312iemExecMemAllocatorInitAndRegisterUnwindInfoForChunk(PVMCPUCC pVCpu, PIEMEXECMEMALLOCATOR pExecMemAllocator,
1313 void *pvChunk, uint32_t idxChunk)
1314{
1315 RT_NOREF(pVCpu);
1316
1317# ifdef RT_ARCH_AMD64
1318 /*
1319 * The AMD64 unwind opcodes.
1320 *
1321 * This is a program that starts with RSP after a RET instruction that
1322 * ends up in recompiled code, and the operations we describe here will
1323 * restore all non-volatile registers and bring RSP back to where our
1324 * RET address is. This means it's reverse order from what happens in
1325 * the prologue.
1326 *
1327 * Note! Using a frame register approach here both because we have one
1328 * and mainly because the UWOP_ALLOC_LARGE argument values
1329 * would be a pain to write initializers for. On the positive
1330 * side, we're impervious to changes in the stack variable
1331 * area and can deal with dynamic stack allocations if necessary.
1332 */
1333 static const IMAGE_UNWIND_CODE s_aOpcodes[] =
1334 {
1335 { { 16, IMAGE_AMD64_UWOP_SET_FPREG, 0 } }, /* RSP = RBP - FrameOffset * 10 (0x60) */
1336 { { 16, IMAGE_AMD64_UWOP_ALLOC_SMALL, 0 } }, /* RSP += 8; */
1337 { { 14, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_x15 } }, /* R15 = [RSP]; RSP += 8; */
1338 { { 12, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_x14 } }, /* R14 = [RSP]; RSP += 8; */
1339 { { 10, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_x13 } }, /* R13 = [RSP]; RSP += 8; */
1340 { { 8, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_x12 } }, /* R12 = [RSP]; RSP += 8; */
1341 { { 7, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_xDI } }, /* RDI = [RSP]; RSP += 8; */
1342 { { 6, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_xSI } }, /* RSI = [RSP]; RSP += 8; */
1343 { { 5, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_xBX } }, /* RBX = [RSP]; RSP += 8; */
1344 { { 4, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_xBP } }, /* RBP = [RSP]; RSP += 8; */
1345 };
1346 union
1347 {
1348 IMAGE_UNWIND_INFO Info;
1349 uint8_t abPadding[RT_UOFFSETOF(IMAGE_UNWIND_INFO, aOpcodes) + 16];
1350 } s_UnwindInfo =
1351 {
1352 {
1353 /* .Version = */ 1,
1354 /* .Flags = */ 0,
1355 /* .SizeOfProlog = */ 16, /* whatever */
1356 /* .CountOfCodes = */ RT_ELEMENTS(s_aOpcodes),
1357 /* .FrameRegister = */ X86_GREG_xBP,
1358 /* .FrameOffset = */ (-IEMNATIVE_FP_OFF_LAST_PUSH + 8) / 16 /* we're off by one slot. sigh. */,
1359 }
1360 };
1361 AssertCompile(-IEMNATIVE_FP_OFF_LAST_PUSH < 240 && -IEMNATIVE_FP_OFF_LAST_PUSH > 0);
1362 AssertCompile((-IEMNATIVE_FP_OFF_LAST_PUSH & 0xf) == 8);
1363
1364# elif defined(RT_ARCH_ARM64)
1365 /*
1366 * The ARM64 unwind codes.
1367 *
1368 * See https://learn.microsoft.com/en-us/cpp/build/arm64-exception-handling?view=msvc-170
1369 */
1370 static const uint8_t s_abOpcodes[] =
1371 {
1372 /* Prolog: None. */
1373 0xe5, /* end_c */
1374 /* Epilog / unwind info: */
1375 (IEMNATIVE_FRAME_VAR_SIZE + IEMNATIVE_FRAME_ALIGN_SIZE) / 16, /* alloc_s */
1376 0xc8, 0x00, /* save_regp x19, x20, [sp + #0] */
1377 0xc8, 0x82, /* save_regp x21, x22, [sp + #2*8] */
1378 0xc9, 0x04, /* save_regp x23, x24, [sp + #4*8] */
1379 0xc9, 0x86, /* save_regp x25, x26, [sp + #6*8] */
1380 0xca, 0x08, /* save_regp x27, x28, [sp + #8*8] */
1381 0x4a, /* save_fplr x29, x30, [sp + #10*8] */
1382 12*8 / 16, /* alloc_s */
1383 0xc4, /* end */
1384 0xc5 /* nop */
1385 };
1386 AssertCompile(!(sizeof(s_abOpcodes) & 3));
1387 AssertCompile(!((IEMNATIVE_FRAME_VAR_SIZE + IEMNATIVE_FRAME_ALIGN_SIZE) & 15));
1388 AssertCompile((IEMNATIVE_FRAME_VAR_SIZE + IEMNATIVE_FRAME_ALIGN_SIZE) < 512);
1389
1390# else
1391# error "Port me!"
1392# endif
1393
1394 /*
1395 * Calc how much space we need and allocate it off the exec heap.
1396 */
1397# ifdef RT_ARCH_ARM64
1398 unsigned const cbPerEntry = _1M - 4;
1399 unsigned const cFunctionEntries = (pExecMemAllocator->cbChunk + cbPerEntry - 1) / cbPerEntry;
1400 unsigned const cbUnwindInfo = (sizeof(uint32_t) * 2 + sizeof(s_abOpcodes)) * cFunctionEntries;
1401# else
1402 unsigned const cbUnwindInfo = sizeof(s_aOpcodes) + RT_UOFFSETOF(IMAGE_UNWIND_INFO, aOpcodes);
1403 unsigned const cFunctionEntries = 1;
1404# endif
1405 unsigned const cbNeeded = sizeof(IMAGE_RUNTIME_FUNCTION_ENTRY) * cFunctionEntries + cbUnwindInfo;
1406 PIMAGE_RUNTIME_FUNCTION_ENTRY const paFunctions
1407 = (PIMAGE_RUNTIME_FUNCTION_ENTRY)iemExecMemAllocatorAllocBytesInChunk(pExecMemAllocator, idxChunk, cbNeeded, NULL);
1408 AssertReturn(paFunctions, VERR_INTERNAL_ERROR_5);
1409 pExecMemAllocator->aChunks[idxChunk].pvUnwindInfo = paFunctions;
1410
1411 /*
1412 * Initialize the structures.
1413 */
1414# ifdef RT_ARCH_AMD64
1415 PIMAGE_UNWIND_INFO const pInfo = (PIMAGE_UNWIND_INFO)&paFunctions[cFunctionEntries];
1416
1417 paFunctions[0].BeginAddress = 0;
1418 paFunctions[0].EndAddress = pExecMemAllocator->cbChunk;
1419 paFunctions[0].UnwindInfoAddress = (uint32_t)((uintptr_t)pInfo - (uintptr_t)pvChunk);
1420
1421 memcpy(pInfo, &s_UnwindInfo, RT_UOFFSETOF(IMAGE_UNWIND_INFO, aOpcodes));
1422 memcpy(&pInfo->aOpcodes[0], s_aOpcodes, sizeof(s_aOpcodes));
1423
1424# elif defined(RT_ARCH_ARM64)
1425
1426 PIMAGE_ARM64_RUNTIME_FUNCTION_ENTRY_XDATA pInfo = (PIMAGE_ARM64_RUNTIME_FUNCTION_ENTRY_XDATA)&paFunctions[cFunctionEntries];
1427 for (uint32_t i = 0, off = 0; i < cFunctionEntries; i++)
1428 {
1429 paFunctions[i].BeginAddress = off;
1430 paFunctions[i].UnwindData = (uint32_t)((uintptr_t)pInfo - (uintptr_t)pvChunk) | PdataRefToFullXdata;
1431
1432 uint32_t const cFunctionLengthInWords = RT_MAX(cbPerEntry, pExecMemAllocator->cbChunk - off) / 4;
1433 pInfo[0].FunctionLength = cFunctionLengthInWords;
1434 pInfo[0].Version = 0;
1435 pInfo[0].ExceptionDataPresent = 0;
1436 pInfo[0].EpilogInHeader = 0;
1437 pInfo[0].EpilogCount = 1;
1438 pInfo[0].CodeWords = sizeof(s_abOpcodes) / sizeof(uint32_t);
1439
1440 pInfo[1].EpilogInfo.EpilogStartOffset = cFunctionLengthInWords;
1441 pInfo[1].EpilogInfo.Reserved = 0;
1442 pInfo[1].EpilogInfo.EpilogStartIndex = 1;
1443 pInfo += 2;
1444
1445 memcpy(pInfo, s_abOpcodes, sizeof(s_abOpcodes));
1446 pInfo += sizeof(s_abOpcodes) / sizeof(*pInfo);
1447 }
1448
1449# else
1450# error "Port me!"
1451# endif
1452
1453 /*
1454 * Register them.
1455 */
1456 uint8_t fRet = RtlAddFunctionTable(paFunctions, cFunctionEntries, (uintptr_t)pvChunk);
1457 AssertReturn(fRet, VERR_INTERNAL_ERROR_3); /* Nothing to clean up on failure, since it's within the chunk itself. */
1458
1459 return VINF_SUCCESS;
1460}
1461
1462
1463# else /* !RT_OS_WINDOWS */
1464
1465/**
1466 * Emits a LEB128 encoded value between -0x2000 and 0x2000 (both exclusive).
1467 */
1468DECLINLINE(RTPTRUNION) iemDwarfPutLeb128(RTPTRUNION Ptr, int32_t iValue)
1469{
1470 if (iValue >= 64)
1471 {
1472 Assert(iValue < 0x2000);
1473 *Ptr.pb++ = ((uint8_t)iValue & 0x7f) | 0x80;
1474 *Ptr.pb++ = (uint8_t)(iValue >> 7) & 0x3f;
1475 }
1476 else if (iValue >= 0)
1477 *Ptr.pb++ = (uint8_t)iValue;
1478 else if (iValue > -64)
1479 *Ptr.pb++ = ((uint8_t)iValue & 0x3f) | 0x40;
1480 else
1481 {
1482 Assert(iValue > -0x2000);
1483 *Ptr.pb++ = ((uint8_t)iValue & 0x7f) | 0x80;
1484 *Ptr.pb++ = ((uint8_t)(iValue >> 7) & 0x3f) | 0x40;
1485 }
1486 return Ptr;
1487}
1488
1489
1490/**
1491 * Emits an ULEB128 encoded value (up to 64-bit wide).
1492 */
1493DECLINLINE(RTPTRUNION) iemDwarfPutUleb128(RTPTRUNION Ptr, uint64_t uValue)
1494{
1495 while (uValue >= 0x80)
1496 {
1497 *Ptr.pb++ = ((uint8_t)uValue & 0x7f) | 0x80;
1498 uValue >>= 7;
1499 }
1500 *Ptr.pb++ = (uint8_t)uValue;
1501 return Ptr;
1502}
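/* Example encodings produced by the two helpers above: iemDwarfPutUleb128 with
   a value of 300 emits the bytes 0xac 0x02, while iemDwarfPutLeb128 with -8
   emits the single byte 0x78. */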
1503
1504
1505/**
1506 * Emits a CFA rule as register @a uReg + offset @a off.
1507 */
1508DECLINLINE(RTPTRUNION) iemDwarfPutCfaDefCfa(RTPTRUNION Ptr, uint32_t uReg, uint32_t off)
1509{
1510 *Ptr.pb++ = DW_CFA_def_cfa;
1511 Ptr = iemDwarfPutUleb128(Ptr, uReg);
1512 Ptr = iemDwarfPutUleb128(Ptr, off);
1513 return Ptr;
1514}
1515
1516
1517/**
1518 * Emits a register (@a uReg) save location:
1519 * CFA + @a off * data_alignment_factor
1520 */
1521DECLINLINE(RTPTRUNION) iemDwarfPutCfaOffset(RTPTRUNION Ptr, uint32_t uReg, uint32_t off)
1522{
1523 if (uReg < 0x40)
1524 *Ptr.pb++ = DW_CFA_offset | uReg;
1525 else
1526 {
1527 *Ptr.pb++ = DW_CFA_offset_extended;
1528 Ptr = iemDwarfPutUleb128(Ptr, uReg);
1529 }
1530 Ptr = iemDwarfPutUleb128(Ptr, off);
1531 return Ptr;
1532}
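/* Example: with the data alignment factor of -8 emitted into the CIE below,
   iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_RBP, 2) records that RBP was saved at
   CFA - 16 (i.e. [CFA + 2*-8]). */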
1533
1534
1535# if 0 /* unused */
1536/**
1537 * Emits a register (@a uReg) save location, using signed offset:
1538 * CFA + @a offSigned * data_alignment_factor
1539 */
1540DECLINLINE(RTPTRUNION) iemDwarfPutCfaSignedOffset(RTPTRUNION Ptr, uint32_t uReg, int32_t offSigned)
1541{
1542 *Ptr.pb++ = DW_CFA_offset_extended_sf;
1543 Ptr = iemDwarfPutUleb128(Ptr, uReg);
1544 Ptr = iemDwarfPutLeb128(Ptr, offSigned);
1545 return Ptr;
1546}
1547# endif
1548
1549
1550/**
1551 * Initializes the unwind info section for non-windows hosts.
1552 */
1553static int
1554iemExecMemAllocatorInitAndRegisterUnwindInfoForChunk(PVMCPUCC pVCpu, PIEMEXECMEMALLOCATOR pExecMemAllocator,
1555 void *pvChunk, uint32_t idxChunk)
1556{
1557 PIEMEXECMEMCHUNKEHFRAME const pEhFrame = &pExecMemAllocator->paEhFrames[idxChunk];
1558 pExecMemAllocator->aChunks[idxChunk].pvUnwindInfo = pEhFrame; /* not necessary, but whatever */
1559
1560 RTPTRUNION Ptr = { pEhFrame->abEhFrame };
1561
1562 /*
1563 * Generate the CIE first.
1564 */
1565# ifdef IEMNATIVE_USE_LIBUNWIND /* libunwind (llvm, darwin) only supports v1 and v3. */
1566 uint8_t const iDwarfVer = 3;
1567# else
1568 uint8_t const iDwarfVer = 4;
1569# endif
1570 RTPTRUNION const PtrCie = Ptr;
1571 *Ptr.pu32++ = 123; /* The CIE length will be determined later. */
1572 *Ptr.pu32++ = 0 /*UINT32_MAX*/; /* I'm a CIE in .eh_frame speak. */
1573 *Ptr.pb++ = iDwarfVer; /* DWARF version */
1574 *Ptr.pb++ = 0; /* Augmentation. */
1575 if (iDwarfVer >= 4)
1576 {
1577 *Ptr.pb++ = sizeof(uintptr_t); /* Address size. */
1578 *Ptr.pb++ = 0; /* Segment selector size. */
1579 }
1580# ifdef RT_ARCH_AMD64
1581 Ptr = iemDwarfPutLeb128(Ptr, 1); /* Code alignment factor (LEB128 = 1). */
1582# else
1583 Ptr = iemDwarfPutLeb128(Ptr, 4); /* Code alignment factor (LEB128 = 4). */
1584# endif
1585 Ptr = iemDwarfPutLeb128(Ptr, -8); /* Data alignment factor (LEB128 = -8). */
1586# ifdef RT_ARCH_AMD64
1587 Ptr = iemDwarfPutUleb128(Ptr, DWREG_AMD64_RA); /* Return address column (ULEB128) */
1588# elif defined(RT_ARCH_ARM64)
1589 Ptr = iemDwarfPutUleb128(Ptr, DWREG_ARM64_LR); /* Return address column (ULEB128) */
1590# else
1591# error "port me"
1592# endif
1593 /* Initial instructions: */
1594# ifdef RT_ARCH_AMD64
1595 Ptr = iemDwarfPutCfaDefCfa(Ptr, DWREG_AMD64_RBP, 16); /* CFA = RBP + 0x10 - first stack parameter */
1596 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_RA, 1); /* Ret RIP = [CFA + 1*-8] */
1597 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_RBP, 2); /* RBP = [CFA + 2*-8] */
1598 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_RBX, 3); /* RBX = [CFA + 3*-8] */
1599 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_R12, 4); /* R12 = [CFA + 4*-8] */
1600 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_R13, 5); /* R13 = [CFA + 5*-8] */
1601 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_R14, 6); /* R14 = [CFA + 6*-8] */
1602 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_R15, 7); /* R15 = [CFA + 7*-8] */
1603# elif defined(RT_ARCH_ARM64)
1604# if 1
1605 Ptr = iemDwarfPutCfaDefCfa(Ptr, DWREG_ARM64_BP, 16); /* CFA = BP + 0x10 - first stack parameter */
1606# else
1607 Ptr = iemDwarfPutCfaDefCfa(Ptr, DWREG_ARM64_SP, IEMNATIVE_FRAME_VAR_SIZE + IEMNATIVE_FRAME_SAVE_REG_SIZE);
1608# endif
1609 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_LR, 1); /* Ret PC = [CFA + 1*-8] */
1610 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_BP, 2); /* Ret BP = [CFA + 2*-8] */
1611 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X28, 3); /* X28 = [CFA + 3*-8] */
1612 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X27, 4); /* X27 = [CFA + 4*-8] */
1613 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X26, 5); /* X26 = [CFA + 5*-8] */
1614 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X25, 6); /* X25 = [CFA + 6*-8] */
1615 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X24, 7); /* X24 = [CFA + 7*-8] */
1616 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X23, 8); /* X23 = [CFA + 8*-8] */
1617 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X22, 9); /* X22 = [CFA + 9*-8] */
1618 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X21, 10); /* X21 = [CFA +10*-8] */
1619 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X20, 11); /* X20 = [CFA +11*-8] */
1620 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X19, 12); /* X19 = [CFA +12*-8] */
1621 AssertCompile(IEMNATIVE_FRAME_SAVE_REG_SIZE / 8 == 12);
1622    /** @todo do we need to do something about clearing DWREG_ARM64_RA_SIGN_STATE or something? */
1623# else
1624# error "port me"
1625# endif
1626 while ((Ptr.u - PtrCie.u) & 3)
1627 *Ptr.pb++ = DW_CFA_nop;
1628 /* Finalize the CIE size. */
1629 *PtrCie.pu32 = Ptr.u - PtrCie.u - sizeof(uint32_t);
1630
1631 /*
1632 * Generate an FDE for the whole chunk area.
1633 */
1634# ifdef IEMNATIVE_USE_LIBUNWIND
1635 pEhFrame->offFda = Ptr.u - (uintptr_t)&pEhFrame->abEhFrame[0];
1636# endif
1637 RTPTRUNION const PtrFde = Ptr;
1638    *Ptr.pu32++ = 123;                                      /* The FDE length will be determined later. */
1639 *Ptr.pu32 = Ptr.u - PtrCie.u; /* Negated self relative CIE address. */
1640 Ptr.pu32++;
1641 *Ptr.pu64++ = (uintptr_t)pvChunk; /* Absolute start PC of this FDE. */
1642    *Ptr.pu64++ = pExecMemAllocator->cbChunk;               /* PC range length for this FDE. */
1643# if 0 /* not required for recent libunwind.dylib nor recent libgcc/glibc. */
1644 *Ptr.pb++ = DW_CFA_nop;
1645# endif
1646 while ((Ptr.u - PtrFde.u) & 3)
1647 *Ptr.pb++ = DW_CFA_nop;
1648 /* Finalize the FDE size. */
1649 *PtrFde.pu32 = Ptr.u - PtrFde.u - sizeof(uint32_t);
1650
1651 /* Terminator entry. */
1652 *Ptr.pu32++ = 0;
1653 *Ptr.pu32++ = 0; /* just to be sure... */
1654 Assert(Ptr.u - (uintptr_t)&pEhFrame->abEhFrame[0] <= sizeof(pEhFrame->abEhFrame));
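    /*
     * Illustrative summary (not part of the original file) of what abEhFrame now contains:
     *
     *   +0x00   uint32   CIE length (excludes this field)
     *   +0x04   uint32   0 - marks this record as a CIE in .eh_frame
     *   +0x08   ...      version, augmentation, alignment factors, return address column,
     *                    initial CFI instructions, DW_CFA_nop padding to a 4 byte boundary
     *   +N      uint32   FDE length (excludes this field)
     *   +N+4    uint32   self-relative offset back to the CIE
     *   +N+8    uint64   absolute start address of the chunk
     *   +N+16   uint64   size of the chunk
     *   ...              DW_CFA_nop padding
     *   end     2x uint32 zero terminator entry
     */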
1655
1656 /*
1657 * Register it.
1658 */
1659# ifdef IEMNATIVE_USE_LIBUNWIND
1660 __register_frame(&pEhFrame->abEhFrame[pEhFrame->offFda]);
1661# else
1662 memset(pEhFrame->abObject, 0xf6, sizeof(pEhFrame->abObject)); /* color the memory to better spot usage */
1663 __register_frame_info(pEhFrame->abEhFrame, pEhFrame->abObject);
1664# endif
1665
1666# ifdef IEMNATIVE_USE_GDB_JIT
1667 /*
1668 * Now for telling GDB about this (experimental).
1669 *
1670 * This seems to work best with ET_DYN.
1671 */
1672 GDBJITSYMFILE * const pSymFile = (GDBJITSYMFILE *)iemExecMemAllocatorAllocBytesInChunk(pExecMemAllocator, idxChunk,
1673 sizeof(GDBJITSYMFILE), NULL);
1674 AssertReturn(pSymFile, VERR_INTERNAL_ERROR_5);
1675 unsigned const offSymFileInChunk = (uintptr_t)pSymFile - (uintptr_t)pvChunk;
1676
1677 RT_ZERO(*pSymFile);
1678
1679 /*
1680 * The ELF header:
1681 */
1682 pSymFile->EHdr.e_ident[0] = ELFMAG0;
1683 pSymFile->EHdr.e_ident[1] = ELFMAG1;
1684 pSymFile->EHdr.e_ident[2] = ELFMAG2;
1685 pSymFile->EHdr.e_ident[3] = ELFMAG3;
1686 pSymFile->EHdr.e_ident[EI_VERSION] = EV_CURRENT;
1687 pSymFile->EHdr.e_ident[EI_CLASS] = ELFCLASS64;
1688 pSymFile->EHdr.e_ident[EI_DATA] = ELFDATA2LSB;
1689 pSymFile->EHdr.e_ident[EI_OSABI] = ELFOSABI_NONE;
1690# ifdef IEMNATIVE_USE_GDB_JIT_ET_DYN
1691 pSymFile->EHdr.e_type = ET_DYN;
1692# else
1693 pSymFile->EHdr.e_type = ET_REL;
1694# endif
1695# ifdef RT_ARCH_AMD64
1696 pSymFile->EHdr.e_machine = EM_AMD64;
1697# elif defined(RT_ARCH_ARM64)
1698 pSymFile->EHdr.e_machine = EM_AARCH64;
1699# else
1700# error "port me"
1701# endif
1702 pSymFile->EHdr.e_version = 1; /*?*/
1703 pSymFile->EHdr.e_entry = 0;
1704# if defined(IEMNATIVE_USE_GDB_JIT_ET_DYN)
1705 pSymFile->EHdr.e_phoff = RT_UOFFSETOF(GDBJITSYMFILE, aPhdrs);
1706# else
1707 pSymFile->EHdr.e_phoff = 0;
1708# endif
1709 pSymFile->EHdr.e_shoff = sizeof(pSymFile->EHdr);
1710 pSymFile->EHdr.e_flags = 0;
1711 pSymFile->EHdr.e_ehsize = sizeof(pSymFile->EHdr);
1712# if defined(IEMNATIVE_USE_GDB_JIT_ET_DYN)
1713 pSymFile->EHdr.e_phentsize = sizeof(pSymFile->aPhdrs[0]);
1714 pSymFile->EHdr.e_phnum = RT_ELEMENTS(pSymFile->aPhdrs);
1715# else
1716 pSymFile->EHdr.e_phentsize = 0;
1717 pSymFile->EHdr.e_phnum = 0;
1718# endif
1719 pSymFile->EHdr.e_shentsize = sizeof(pSymFile->aShdrs[0]);
1720 pSymFile->EHdr.e_shnum = RT_ELEMENTS(pSymFile->aShdrs);
1721 pSymFile->EHdr.e_shstrndx = 0; /* set later */
1722
1723 uint32_t offStrTab = 0;
1724#define APPEND_STR(a_szStr) do { \
1725 memcpy(&pSymFile->szzStrTab[offStrTab], a_szStr, sizeof(a_szStr)); \
1726 offStrTab += sizeof(a_szStr); \
1727 Assert(offStrTab < sizeof(pSymFile->szzStrTab)); \
1728 } while (0)
1729#define APPEND_STR_FMT(a_szStr, ...) do { \
1730 offStrTab += RTStrPrintf(&pSymFile->szzStrTab[offStrTab], sizeof(pSymFile->szzStrTab) - offStrTab, a_szStr, __VA_ARGS__); \
1731 offStrTab++; \
1732 Assert(offStrTab < sizeof(pSymFile->szzStrTab)); \
1733 } while (0)
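    /* Illustrative note (not from the original file): these two macros build a classic packed
       ELF string table.  After the section headers below have been emitted, szzStrTab holds
       something along the lines of "\0.eh_frame\0.shstrtab\0.symtab\0...", and every sh_name
       and st_name field stores the byte offset of its string within that table. */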
1734
1735 /*
1736 * Section headers.
1737 */
1738 /* Section header #0: NULL */
1739 unsigned i = 0;
1740 APPEND_STR("");
1741 RT_ZERO(pSymFile->aShdrs[i]);
1742 i++;
1743
1744 /* Section header: .eh_frame */
1745 pSymFile->aShdrs[i].sh_name = offStrTab;
1746 APPEND_STR(".eh_frame");
1747 pSymFile->aShdrs[i].sh_type = SHT_PROGBITS;
1748 pSymFile->aShdrs[i].sh_flags = SHF_ALLOC | SHF_EXECINSTR;
1749# if defined(IEMNATIVE_USE_GDB_JIT_ET_DYN) || defined(IEMNATIVE_USE_GDB_JIT_ELF_RVAS)
1750 pSymFile->aShdrs[i].sh_offset
1751 = pSymFile->aShdrs[i].sh_addr = RT_UOFFSETOF(GDBJITSYMFILE, abEhFrame);
1752# else
1753 pSymFile->aShdrs[i].sh_addr = (uintptr_t)&pSymFile->abEhFrame[0];
1754 pSymFile->aShdrs[i].sh_offset = 0;
1755# endif
1756
1757 pSymFile->aShdrs[i].sh_size = sizeof(pEhFrame->abEhFrame);
1758 pSymFile->aShdrs[i].sh_link = 0;
1759 pSymFile->aShdrs[i].sh_info = 0;
1760 pSymFile->aShdrs[i].sh_addralign = 1;
1761 pSymFile->aShdrs[i].sh_entsize = 0;
1762 memcpy(pSymFile->abEhFrame, pEhFrame->abEhFrame, sizeof(pEhFrame->abEhFrame));
1763 i++;
1764
1765 /* Section header: .shstrtab */
1766 unsigned const iShStrTab = i;
1767 pSymFile->EHdr.e_shstrndx = iShStrTab;
1768 pSymFile->aShdrs[i].sh_name = offStrTab;
1769 APPEND_STR(".shstrtab");
1770 pSymFile->aShdrs[i].sh_type = SHT_STRTAB;
1771 pSymFile->aShdrs[i].sh_flags = SHF_ALLOC;
1772# if defined(IEMNATIVE_USE_GDB_JIT_ET_DYN) || defined(IEMNATIVE_USE_GDB_JIT_ELF_RVAS)
1773 pSymFile->aShdrs[i].sh_offset
1774 = pSymFile->aShdrs[i].sh_addr = RT_UOFFSETOF(GDBJITSYMFILE, szzStrTab);
1775# else
1776 pSymFile->aShdrs[i].sh_addr = (uintptr_t)&pSymFile->szzStrTab[0];
1777 pSymFile->aShdrs[i].sh_offset = 0;
1778# endif
1779 pSymFile->aShdrs[i].sh_size = sizeof(pSymFile->szzStrTab);
1780 pSymFile->aShdrs[i].sh_link = 0;
1781 pSymFile->aShdrs[i].sh_info = 0;
1782 pSymFile->aShdrs[i].sh_addralign = 1;
1783 pSymFile->aShdrs[i].sh_entsize = 0;
1784 i++;
1785
1786    /* Section header: .symtab */
1787 pSymFile->aShdrs[i].sh_name = offStrTab;
1788 APPEND_STR(".symtab");
1789 pSymFile->aShdrs[i].sh_type = SHT_SYMTAB;
1790 pSymFile->aShdrs[i].sh_flags = SHF_ALLOC;
1791 pSymFile->aShdrs[i].sh_offset
1792 = pSymFile->aShdrs[i].sh_addr = RT_UOFFSETOF(GDBJITSYMFILE, aSymbols);
1793 pSymFile->aShdrs[i].sh_size = sizeof(pSymFile->aSymbols);
1794 pSymFile->aShdrs[i].sh_link = iShStrTab;
1795 pSymFile->aShdrs[i].sh_info = RT_ELEMENTS(pSymFile->aSymbols);
1796 pSymFile->aShdrs[i].sh_addralign = sizeof(pSymFile->aSymbols[0].st_value);
1797 pSymFile->aShdrs[i].sh_entsize = sizeof(pSymFile->aSymbols[0]);
1798 i++;
1799
1800# if defined(IEMNATIVE_USE_GDB_JIT_ET_DYN)
1801    /* Section header: .dynsym */
1802 pSymFile->aShdrs[i].sh_name = offStrTab;
1803 APPEND_STR(".dynsym");
1804 pSymFile->aShdrs[i].sh_type = SHT_DYNSYM;
1805 pSymFile->aShdrs[i].sh_flags = SHF_ALLOC;
1806 pSymFile->aShdrs[i].sh_offset
1807 = pSymFile->aShdrs[i].sh_addr = RT_UOFFSETOF(GDBJITSYMFILE, aDynSyms);
1808 pSymFile->aShdrs[i].sh_size = sizeof(pSymFile->aDynSyms);
1809 pSymFile->aShdrs[i].sh_link = iShStrTab;
1810 pSymFile->aShdrs[i].sh_info = RT_ELEMENTS(pSymFile->aDynSyms);
1811 pSymFile->aShdrs[i].sh_addralign = sizeof(pSymFile->aDynSyms[0].st_value);
1812 pSymFile->aShdrs[i].sh_entsize = sizeof(pSymFile->aDynSyms[0]);
1813 i++;
1814# endif
1815
1816# if defined(IEMNATIVE_USE_GDB_JIT_ET_DYN)
1817 /* Section header: .dynamic */
1818 pSymFile->aShdrs[i].sh_name = offStrTab;
1819 APPEND_STR(".dynamic");
1820 pSymFile->aShdrs[i].sh_type = SHT_DYNAMIC;
1821 pSymFile->aShdrs[i].sh_flags = SHF_ALLOC;
1822 pSymFile->aShdrs[i].sh_offset
1823 = pSymFile->aShdrs[i].sh_addr = RT_UOFFSETOF(GDBJITSYMFILE, aDyn);
1824 pSymFile->aShdrs[i].sh_size = sizeof(pSymFile->aDyn);
1825 pSymFile->aShdrs[i].sh_link = iShStrTab;
1826 pSymFile->aShdrs[i].sh_info = 0;
1827 pSymFile->aShdrs[i].sh_addralign = 1;
1828 pSymFile->aShdrs[i].sh_entsize = sizeof(pSymFile->aDyn[0]);
1829 i++;
1830# endif
1831
1832 /* Section header: .text */
1833 unsigned const iShText = i;
1834 pSymFile->aShdrs[i].sh_name = offStrTab;
1835 APPEND_STR(".text");
1836 pSymFile->aShdrs[i].sh_type = SHT_PROGBITS;
1837 pSymFile->aShdrs[i].sh_flags = SHF_ALLOC | SHF_EXECINSTR;
1838# if defined(IEMNATIVE_USE_GDB_JIT_ET_DYN) || defined(IEMNATIVE_USE_GDB_JIT_ELF_RVAS)
1839 pSymFile->aShdrs[i].sh_offset
1840 = pSymFile->aShdrs[i].sh_addr = sizeof(GDBJITSYMFILE);
1841# else
1842 pSymFile->aShdrs[i].sh_addr = (uintptr_t)(pSymFile + 1);
1843 pSymFile->aShdrs[i].sh_offset = 0;
1844# endif
1845 pSymFile->aShdrs[i].sh_size = pExecMemAllocator->cbChunk - offSymFileInChunk - sizeof(GDBJITSYMFILE);
1846 pSymFile->aShdrs[i].sh_link = 0;
1847 pSymFile->aShdrs[i].sh_info = 0;
1848 pSymFile->aShdrs[i].sh_addralign = 1;
1849 pSymFile->aShdrs[i].sh_entsize = 0;
1850 i++;
1851
1852 Assert(i == RT_ELEMENTS(pSymFile->aShdrs));
1853
1854# if defined(IEMNATIVE_USE_GDB_JIT_ET_DYN)
1855 /*
1856 * The program headers:
1857 */
1858 /* Everything in a single LOAD segment: */
1859 i = 0;
1860 pSymFile->aPhdrs[i].p_type = PT_LOAD;
1861 pSymFile->aPhdrs[i].p_flags = PF_X | PF_R;
1862 pSymFile->aPhdrs[i].p_offset
1863 = pSymFile->aPhdrs[i].p_vaddr
1864 = pSymFile->aPhdrs[i].p_paddr = 0;
1865 pSymFile->aPhdrs[i].p_filesz /* Size of segment in file. */
1866 = pSymFile->aPhdrs[i].p_memsz = pExecMemAllocator->cbChunk - offSymFileInChunk;
1867 pSymFile->aPhdrs[i].p_align = HOST_PAGE_SIZE;
1868 i++;
1869 /* The .dynamic segment. */
1870 pSymFile->aPhdrs[i].p_type = PT_DYNAMIC;
1871 pSymFile->aPhdrs[i].p_flags = PF_R;
1872 pSymFile->aPhdrs[i].p_offset
1873 = pSymFile->aPhdrs[i].p_vaddr
1874 = pSymFile->aPhdrs[i].p_paddr = RT_UOFFSETOF(GDBJITSYMFILE, aDyn);
1875 pSymFile->aPhdrs[i].p_filesz /* Size of segment in file. */
1876 = pSymFile->aPhdrs[i].p_memsz = sizeof(pSymFile->aDyn);
1877 pSymFile->aPhdrs[i].p_align = sizeof(pSymFile->aDyn[0].d_tag);
1878 i++;
1879
1880 Assert(i == RT_ELEMENTS(pSymFile->aPhdrs));
1881
1882 /*
1883 * The dynamic section:
1884 */
1885 i = 0;
1886 pSymFile->aDyn[i].d_tag = DT_SONAME;
1887 pSymFile->aDyn[i].d_un.d_val = offStrTab;
1888 APPEND_STR_FMT("iem-exec-chunk-%u-%u", pVCpu->idCpu, idxChunk);
1889 i++;
1890 pSymFile->aDyn[i].d_tag = DT_STRTAB;
1891 pSymFile->aDyn[i].d_un.d_ptr = RT_UOFFSETOF(GDBJITSYMFILE, szzStrTab);
1892 i++;
1893 pSymFile->aDyn[i].d_tag = DT_STRSZ;
1894 pSymFile->aDyn[i].d_un.d_val = sizeof(pSymFile->szzStrTab);
1895 i++;
1896 pSymFile->aDyn[i].d_tag = DT_SYMTAB;
1897 pSymFile->aDyn[i].d_un.d_ptr = RT_UOFFSETOF(GDBJITSYMFILE, aDynSyms);
1898 i++;
1899 pSymFile->aDyn[i].d_tag = DT_SYMENT;
1900 pSymFile->aDyn[i].d_un.d_val = sizeof(pSymFile->aDynSyms[0]);
1901 i++;
1902 pSymFile->aDyn[i].d_tag = DT_NULL;
1903 i++;
1904 Assert(i == RT_ELEMENTS(pSymFile->aDyn));
1905# endif /* IEMNATIVE_USE_GDB_JIT_ET_DYN */
1906
1907 /*
1908 * Symbol tables:
1909 */
1910 /** @todo gdb doesn't seem to really like this ... */
1911 i = 0;
1912 pSymFile->aSymbols[i].st_name = 0;
1913 pSymFile->aSymbols[i].st_shndx = SHN_UNDEF;
1914 pSymFile->aSymbols[i].st_value = 0;
1915 pSymFile->aSymbols[i].st_size = 0;
1916 pSymFile->aSymbols[i].st_info = ELF64_ST_INFO(STB_LOCAL, STT_NOTYPE);
1917 pSymFile->aSymbols[i].st_other = 0 /* STV_DEFAULT */;
1918# ifdef IEMNATIVE_USE_GDB_JIT_ET_DYN
1919 pSymFile->aDynSyms[0] = pSymFile->aSymbols[i];
1920# endif
1921 i++;
1922
1923 pSymFile->aSymbols[i].st_name = 0;
1924 pSymFile->aSymbols[i].st_shndx = SHN_ABS;
1925 pSymFile->aSymbols[i].st_value = 0;
1926 pSymFile->aSymbols[i].st_size = 0;
1927 pSymFile->aSymbols[i].st_info = ELF64_ST_INFO(STB_LOCAL, STT_FILE);
1928 pSymFile->aSymbols[i].st_other = 0 /* STV_DEFAULT */;
1929 i++;
1930
1931 pSymFile->aSymbols[i].st_name = offStrTab;
1932 APPEND_STR_FMT("iem_exec_chunk_%u_%u", pVCpu->idCpu, idxChunk);
1933# if 0
1934 pSymFile->aSymbols[i].st_shndx = iShText;
1935 pSymFile->aSymbols[i].st_value = 0;
1936# else
1937 pSymFile->aSymbols[i].st_shndx = SHN_ABS;
1938 pSymFile->aSymbols[i].st_value = (uintptr_t)(pSymFile + 1);
1939# endif
1940 pSymFile->aSymbols[i].st_size = pSymFile->aShdrs[iShText].sh_size;
1941 pSymFile->aSymbols[i].st_info = ELF64_ST_INFO(STB_GLOBAL, STT_FUNC);
1942 pSymFile->aSymbols[i].st_other = 0 /* STV_DEFAULT */;
1943# ifdef IEMNATIVE_USE_GDB_JIT_ET_DYN
1944 pSymFile->aDynSyms[1] = pSymFile->aSymbols[i];
1945 pSymFile->aDynSyms[1].st_value = (uintptr_t)(pSymFile + 1);
1946# endif
1947 i++;
1948
1949 Assert(i == RT_ELEMENTS(pSymFile->aSymbols));
1950 Assert(offStrTab < sizeof(pSymFile->szzStrTab));
1951
1952 /*
1953 * The GDB JIT entry and informing GDB.
1954 */
1955 pEhFrame->GdbJitEntry.pbSymFile = (uint8_t *)pSymFile;
1956# if 1
1957 pEhFrame->GdbJitEntry.cbSymFile = pExecMemAllocator->cbChunk - ((uintptr_t)pSymFile - (uintptr_t)pvChunk);
1958# else
1959 pEhFrame->GdbJitEntry.cbSymFile = sizeof(GDBJITSYMFILE);
1960# endif
1961
1962 RTOnce(&g_IemNativeGdbJitOnce, iemNativeGdbJitInitOnce, NULL);
1963 RTCritSectEnter(&g_IemNativeGdbJitLock);
1964 pEhFrame->GdbJitEntry.pNext = NULL;
1965 pEhFrame->GdbJitEntry.pPrev = __jit_debug_descriptor.pTail;
1966 if (__jit_debug_descriptor.pTail)
1967 __jit_debug_descriptor.pTail->pNext = &pEhFrame->GdbJitEntry;
1968 else
1969 __jit_debug_descriptor.pHead = &pEhFrame->GdbJitEntry;
1970 __jit_debug_descriptor.pTail = &pEhFrame->GdbJitEntry;
1971 __jit_debug_descriptor.pRelevant = &pEhFrame->GdbJitEntry;
1972
1973 /* Notify GDB: */
1974 __jit_debug_descriptor.enmAction = kGdbJitaction_Register;
1975 __jit_debug_register_code();
1976 __jit_debug_descriptor.enmAction = kGdbJitaction_NoAction;
1977 RTCritSectLeave(&g_IemNativeGdbJitLock);
1978
1979# else /* !IEMNATIVE_USE_GDB_JIT */
1980 RT_NOREF(pVCpu);
1981# endif /* !IEMNATIVE_USE_GDB_JIT */
1982
1983 return VINF_SUCCESS;
1984}
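# if 0 /* Illustrative sketch, not part of the original file: the plain-C shape of the GDB JIT
        * interface which the GDBJIT* structures and the __jit_debug_register_code() call above
        * rely on, as documented in the GDB manual ("JIT Compilation Interface").  This file
        * declares its own equivalents elsewhere; the names below are the canonical GDB ones. */
typedef enum
{
    JIT_NOACTION = 0,
    JIT_REGISTER_FN,
    JIT_UNREGISTER_FN
} jit_actions_t;

struct jit_code_entry
{
    struct jit_code_entry *next_entry;
    struct jit_code_entry *prev_entry;
    const char            *symfile_addr;    /* The in-memory ELF image (GDBJITSYMFILE above). */
    uint64_t               symfile_size;
};

struct jit_descriptor
{
    uint32_t               version;         /* Must be 1. */
    uint32_t               action_flag;     /* One of jit_actions_t. */
    struct jit_code_entry *relevant_entry;
    struct jit_code_entry *first_entry;
};

/* GDB sets a breakpoint in this function; it must not be optimized away. */
void __attribute__((noinline)) __jit_debug_register_code(void) { __asm__ volatile (""); }

/* The debugger finds this symbol by name in the inferior. */
struct jit_descriptor __jit_debug_descriptor = { 1, JIT_NOACTION, NULL, NULL };
# endif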
1985
1986# endif /* !RT_OS_WINDOWS */
1987#endif /* IN_RING3 */
1988
1989
1990/**
1991 * Adds another chunk to the executable memory allocator.
1992 *
1993 * This is used by the init code for the initial allocation and later by the
1994 * regular allocator function when it's out of memory.
1995 */
1996static int iemExecMemAllocatorGrow(PVMCPUCC pVCpu, PIEMEXECMEMALLOCATOR pExecMemAllocator)
1997{
1998    /* Check that we have room for growth. */
1999 uint32_t const idxChunk = pExecMemAllocator->cChunks;
2000 AssertLogRelReturn(idxChunk < pExecMemAllocator->cMaxChunks, VERR_OUT_OF_RESOURCES);
2001
2002 /* Allocate a chunk. */
2003#ifdef RT_OS_DARWIN
2004 void *pvChunk = RTMemPageAllocEx(pExecMemAllocator->cbChunk, 0);
2005#else
2006 void *pvChunk = RTMemPageAllocEx(pExecMemAllocator->cbChunk, RTMEMPAGEALLOC_F_EXECUTABLE);
2007#endif
2008 AssertLogRelReturn(pvChunk, VERR_NO_EXEC_MEMORY);
2009
2010#ifdef RT_OS_DARWIN
2011 /*
2012     * Because it is impossible to have an RWX memory allocation on macOS, try to remap the memory
2013     * chunk readable/executable somewhere else so we can save ourselves the hassle of switching between
2014     * protections when executable memory is allocated.
2015 */
2016 int rc = VERR_NO_EXEC_MEMORY;
2017 mach_port_t hPortTask = mach_task_self();
2018 mach_vm_address_t AddrChunk = (mach_vm_address_t)pvChunk;
2019 mach_vm_address_t AddrRemapped = 0;
2020 vm_prot_t ProtCur = 0;
2021 vm_prot_t ProtMax = 0;
2022 kern_return_t krc = mach_vm_remap(hPortTask, &AddrRemapped, pExecMemAllocator->cbChunk, 0,
2023 VM_FLAGS_ANYWHERE | VM_FLAGS_RETURN_DATA_ADDR,
2024 hPortTask, AddrChunk, FALSE, &ProtCur, &ProtMax,
2025 VM_INHERIT_NONE);
2026 if (krc == KERN_SUCCESS)
2027 {
2028 krc = mach_vm_protect(mach_task_self(), AddrRemapped, pExecMemAllocator->cbChunk, FALSE, VM_PROT_READ | VM_PROT_EXECUTE);
2029 if (krc == KERN_SUCCESS)
2030 rc = VINF_SUCCESS;
2031 else
2032 {
2033 AssertLogRelMsgFailed(("mach_vm_protect -> %d (%#x)\n", krc, krc));
2034 krc = mach_vm_deallocate(hPortTask, AddrRemapped, pExecMemAllocator->cbChunk);
2035 Assert(krc == KERN_SUCCESS);
2036 }
2037 }
2038 else
2039 AssertLogRelMsgFailed(("mach_vm_remap -> %d (%#x)\n", krc, krc));
2040 if (RT_FAILURE(rc))
2041 {
2042 RTMemPageFree(pvChunk, pExecMemAllocator->cbChunk);
2043 return rc;
2044 }
2045
2046 void *pvChunkRx = (void *)AddrRemapped;
2047#else
2048 int rc = VINF_SUCCESS;
2049 void *pvChunkRx = pvChunk;
2050#endif
2051
2052 /*
2053 * Add the chunk.
2054 *
2055 * This must be done before the unwind init so windows can allocate
2056 * memory from the chunk when using the alternative sub-allocator.
2057 */
2058 pExecMemAllocator->aChunks[idxChunk].pvChunkRw = pvChunk;
2059 pExecMemAllocator->aChunks[idxChunk].pvChunkRx = pvChunkRx;
2060#ifdef IN_RING3
2061 pExecMemAllocator->aChunks[idxChunk].pvUnwindInfo = NULL;
2062#endif
2063 pExecMemAllocator->aChunks[idxChunk].cFreeUnits = pExecMemAllocator->cUnitsPerChunk;
2064 pExecMemAllocator->aChunks[idxChunk].idxFreeHint = 0;
2065 memset(&pExecMemAllocator->pbmAlloc[pExecMemAllocator->cBitmapElementsPerChunk * idxChunk],
2066 0, sizeof(pExecMemAllocator->pbmAlloc[0]) * pExecMemAllocator->cBitmapElementsPerChunk);
2067
2068 pExecMemAllocator->cChunks = idxChunk + 1;
2069 pExecMemAllocator->idxChunkHint = idxChunk;
2070
2071 pExecMemAllocator->cbTotal += pExecMemAllocator->cbChunk;
2072 pExecMemAllocator->cbFree += pExecMemAllocator->cbChunk;
2073
2074    /* If there is a chunk context init callback, call it. */
2075 rc = iemNativeRecompileAttachExecMemChunkCtx(pVCpu, idxChunk, &pExecMemAllocator->aChunks[idxChunk].pCtx);
2076#ifdef IN_RING3
2077 /*
2078 * Initialize the unwind information (this cannot really fail atm).
2079 * (This sets pvUnwindInfo.)
2080 */
2081 if (RT_SUCCESS(rc))
2082 rc = iemExecMemAllocatorInitAndRegisterUnwindInfoForChunk(pVCpu, pExecMemAllocator, pvChunkRx, idxChunk);
2083#endif
2084 if (RT_SUCCESS(rc))
2085 { /* likely */ }
2086 else
2087 {
2088        /* Just in case the impossible happens, undo the above: */
2089 pExecMemAllocator->cbTotal -= pExecMemAllocator->cbChunk;
2090 pExecMemAllocator->cbFree -= pExecMemAllocator->aChunks[idxChunk].cFreeUnits << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT;
2091 pExecMemAllocator->cChunks = idxChunk;
2092 memset(&pExecMemAllocator->pbmAlloc[pExecMemAllocator->cBitmapElementsPerChunk * idxChunk],
2093 0xff, sizeof(pExecMemAllocator->pbmAlloc[0]) * pExecMemAllocator->cBitmapElementsPerChunk);
2094 pExecMemAllocator->aChunks[idxChunk].pvChunkRw = NULL;
2095 pExecMemAllocator->aChunks[idxChunk].cFreeUnits = 0;
2096
2097# ifdef RT_OS_DARWIN
2098 krc = mach_vm_deallocate(mach_task_self(), (mach_vm_address_t)pExecMemAllocator->aChunks[idxChunk].pvChunkRx,
2099 pExecMemAllocator->cbChunk);
2100 Assert(krc == KERN_SUCCESS);
2101# endif
2102
2103 RTMemPageFree(pvChunk, pExecMemAllocator->cbChunk);
2104 return rc;
2105 }
2106
2107 return VINF_SUCCESS;
2108}
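#if 0 /* Illustrative sketch, not part of the original file: how a dual-mapped chunk
       * (pvChunkRw for writing, pvChunkRx for executing) is typically consumed on hosts where
       * RWX mappings are not available.  The function and its callers are hypothetical; the
       * real emit path lives in the recompiler itself. */
# include <string.h>
# ifdef RT_OS_DARWIN
#  include <libkern/OSCacheControl.h>
# endif

static void *sketchCopyAndActivate(void *pvChunkRw, void *pvChunkRx, uint32_t offInChunk,
                                   const void *pvCode, size_t cbCode)
{
    /* 1. Write the freshly generated instructions through the writable alias. */
    memcpy((uint8_t *)pvChunkRw + offInChunk, pvCode, cbCode);

    /* 2. Make sure instruction fetches through the RX alias see the new bytes.
          (Other ARM hosts would use __builtin___clear_cache or similar.) */
# ifdef RT_OS_DARWIN
    sys_icache_invalidate((uint8_t *)pvChunkRx + offInChunk, cbCode);
# endif

    /* 3. Execution only ever goes through the read/execute alias. */
    return (uint8_t *)pvChunkRx + offInChunk;
}
#endif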
2109
2110
2111/**
2112 * Initializes the executable memory allocator for native recompilation on the
2113 * calling EMT.
2114 *
2115 * @returns VBox status code.
2116 * @param pVCpu The cross context virtual CPU structure of the calling
2117 * thread.
2118 * @param cbMax The max size of the allocator.
2119 * @param cbInitial The initial allocator size.
2120 * @param cbChunk The chunk size, 0 or UINT32_MAX for default (@a cbMax
2121 * dependent).
2122 */
2123int iemExecMemAllocatorInit(PVMCPU pVCpu, uint64_t cbMax, uint64_t cbInitial, uint32_t cbChunk) RT_NOEXCEPT
2124{
2125 /*
2126 * Validate input.
2127 */
2128 AssertLogRelMsgReturn(cbMax >= _1M && cbMax <= _4G+_4G, ("cbMax=%RU64 (%RX64)\n", cbMax, cbMax), VERR_OUT_OF_RANGE);
2129 AssertReturn(cbInitial <= cbMax, VERR_OUT_OF_RANGE);
2130    AssertLogRelMsgReturn(   cbChunk == UINT32_MAX
2131 || cbChunk == 0
2132 || ( RT_IS_POWER_OF_TWO(cbChunk)
2133 && cbChunk >= _1M
2134 && cbChunk <= _256M
2135 && cbChunk <= cbMax),
2136 ("cbChunk=%RU32 (%RX32) cbMax=%RU64\n", cbChunk, cbChunk, cbMax),
2137 VERR_OUT_OF_RANGE);
2138
2139 /*
2140 * Adjust/figure out the chunk size.
2141 */
2142 if (cbChunk == 0 || cbChunk == UINT32_MAX)
2143 {
2144 if (cbMax >= _256M)
2145 cbChunk = _64M;
2146 else
2147 {
2148 if (cbMax < _16M)
2149 cbChunk = cbMax >= _4M ? _4M : (uint32_t)cbMax;
2150 else
2151 cbChunk = (uint32_t)cbMax / 4;
2152 if (!RT_IS_POWER_OF_TWO(cbChunk))
2153 cbChunk = RT_BIT_32(ASMBitLastSetU32(cbChunk));
2154 }
2155 }
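    /* Worked examples of the heuristic above (illustrative, not from the original file):
       cbMax = 512M -> cbChunk = 64M;  cbMax = 64M -> 64M/4 = 16M;  cbMax = 40M -> 40M/4 = 10M,
       rounded up to the next power of two = 16M;  cbMax = 8M -> 4M;  cbMax = 2M -> 2M. */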
2156#if defined(RT_ARCH_AMD64)
2157 Assert(cbChunk <= _2G);
2158#elif defined(RT_ARCH_ARM64)
2159 if (cbChunk > _128M)
2160 cbChunk = _128M; /* Max relative branch distance is +/-2^(25+2) = +/-0x8000000 (134 217 728). */
2161#endif
2162
2163 if (cbChunk > cbMax)
2164 cbMax = cbChunk;
2165 else
2166 cbMax = (cbMax - 1 + cbChunk) / cbChunk * cbChunk;
2167 uint32_t const cMaxChunks = (uint32_t)(cbMax / cbChunk);
2168 AssertLogRelReturn((uint64_t)cMaxChunks * cbChunk == cbMax, VERR_INTERNAL_ERROR_3);
2169
2170 /*
2171 * Allocate and initialize the allocator instance.
2172 */
2173 size_t const offBitmaps = RT_ALIGN_Z(RT_UOFFSETOF_DYN(IEMEXECMEMALLOCATOR, aChunks[cMaxChunks]), RT_CACHELINE_SIZE);
2174 size_t const cbBitmaps = (size_t)(cbChunk >> (IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT + 3)) * cMaxChunks;
2175 size_t cbNeeded = offBitmaps + cbBitmaps;
2176 AssertCompile(IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT <= 10);
2177 Assert(cbChunk > RT_BIT_32(IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT + 3));
2178#if defined(IN_RING3) && !defined(RT_OS_WINDOWS)
2179 size_t const offEhFrames = RT_ALIGN_Z(cbNeeded, RT_CACHELINE_SIZE);
2180 cbNeeded += sizeof(IEMEXECMEMCHUNKEHFRAME) * cMaxChunks;
2181#endif
2182 PIEMEXECMEMALLOCATOR pExecMemAllocator = (PIEMEXECMEMALLOCATOR)RTMemAllocZ(cbNeeded);
2183 AssertLogRelMsgReturn(pExecMemAllocator, ("cbNeeded=%zx cMaxChunks=%#x cbChunk=%#x\n", cbNeeded, cMaxChunks, cbChunk),
2184 VERR_NO_MEMORY);
2185 pExecMemAllocator->uMagic = IEMEXECMEMALLOCATOR_MAGIC;
2186 pExecMemAllocator->cbChunk = cbChunk;
2187 pExecMemAllocator->cMaxChunks = cMaxChunks;
2188 pExecMemAllocator->cChunks = 0;
2189 pExecMemAllocator->idxChunkHint = 0;
2190 pExecMemAllocator->cAllocations = 0;
2191 pExecMemAllocator->cbTotal = 0;
2192 pExecMemAllocator->cbFree = 0;
2193 pExecMemAllocator->cbAllocated = 0;
2194#ifdef VBOX_WITH_STATISTICS
2195 pExecMemAllocator->cbUnusable = 0;
2196#endif
2197 pExecMemAllocator->pbmAlloc = (uint64_t *)((uintptr_t)pExecMemAllocator + offBitmaps);
2198 pExecMemAllocator->cUnitsPerChunk = cbChunk >> IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT;
2199 pExecMemAllocator->cBitmapElementsPerChunk = cbChunk >> (IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT + 6);
2200 memset(pExecMemAllocator->pbmAlloc, 0xff, cbBitmaps); /* Mark everything as allocated. Clear when chunks are added. */
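    /* Illustrative arithmetic (not from the original file): with, say, a 64M chunk and
       256 byte allocation units (a unit shift of 8), each chunk is covered by
       64M >> 8 = 262144 units, i.e. 64M >> (8 + 6) = 4096 uint64_t bitmap words (32 KB)
       in pbmAlloc. */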
2201#if defined(IN_RING3) && !defined(RT_OS_WINDOWS)
2202 pExecMemAllocator->paEhFrames = (PIEMEXECMEMCHUNKEHFRAME)((uintptr_t)pExecMemAllocator + offEhFrames);
2203#endif
2204 for (uint32_t i = 0; i < cMaxChunks; i++)
2205 {
2206 pExecMemAllocator->aChunks[i].cFreeUnits = 0;
2207 pExecMemAllocator->aChunks[i].idxFreeHint = 0;
2208 pExecMemAllocator->aChunks[i].pvChunkRw = NULL;
2209#ifdef IN_RING0
2210 pExecMemAllocator->aChunks[i].hMemObj = NIL_RTR0MEMOBJ;
2211#else
2212 pExecMemAllocator->aChunks[i].pvUnwindInfo = NULL;
2213#endif
2214 }
2215 pVCpu->iem.s.pExecMemAllocatorR3 = pExecMemAllocator;
2216
2217 /*
2218 * Do the initial allocations.
2219 */
2220    while ((uint64_t)pExecMemAllocator->cChunks * pExecMemAllocator->cbChunk < cbInitial)
2221 {
2222 int rc = iemExecMemAllocatorGrow(pVCpu, pExecMemAllocator);
2223 AssertLogRelRCReturn(rc, rc);
2224 }
2225
2226 pExecMemAllocator->idxChunkHint = 0;
2227
2228 /*
2229 * Register statistics.
2230 */
2231 PUVM const pUVM = pVCpu->pUVCpu->pUVM;
2232    STAMR3RegisterFU(pUVM, &pExecMemAllocator->cAllocations, STAMTYPE_U32, STAMVISIBILITY_ALWAYS, STAMUNIT_COUNT,
2233 "Current number of allocations", "/IEM/CPU%u/re/ExecMem/cAllocations", pVCpu->idCpu);
2234 STAMR3RegisterFU(pUVM, &pExecMemAllocator->cChunks, STAMTYPE_U32, STAMVISIBILITY_ALWAYS, STAMUNIT_COUNT,
2235 "Currently allocated chunks", "/IEM/CPU%u/re/ExecMem/cChunks", pVCpu->idCpu);
2236 STAMR3RegisterFU(pUVM, &pExecMemAllocator->cMaxChunks, STAMTYPE_U32, STAMVISIBILITY_ALWAYS, STAMUNIT_COUNT,
2237 "Maximum number of chunks", "/IEM/CPU%u/re/ExecMem/cMaxChunks", pVCpu->idCpu);
2238 STAMR3RegisterFU(pUVM, &pExecMemAllocator->cbChunk, STAMTYPE_U32, STAMVISIBILITY_ALWAYS, STAMUNIT_BYTES,
2239 "Allocation chunk size", "/IEM/CPU%u/re/ExecMem/cbChunk", pVCpu->idCpu);
2240 STAMR3RegisterFU(pUVM, &pExecMemAllocator->cbAllocated, STAMTYPE_U64, STAMVISIBILITY_ALWAYS, STAMUNIT_BYTES,
2241                     "Number of bytes currently allocated", "/IEM/CPU%u/re/ExecMem/cbAllocated", pVCpu->idCpu);
2242 STAMR3RegisterFU(pUVM, &pExecMemAllocator->cbFree, STAMTYPE_U64, STAMVISIBILITY_ALWAYS, STAMUNIT_BYTES,
2243                     "Number of bytes currently free", "/IEM/CPU%u/re/ExecMem/cbFree", pVCpu->idCpu);
2244 STAMR3RegisterFU(pUVM, &pExecMemAllocator->cbTotal, STAMTYPE_U64, STAMVISIBILITY_ALWAYS, STAMUNIT_BYTES,
2245                     "Total number of bytes", "/IEM/CPU%u/re/ExecMem/cbTotal", pVCpu->idCpu);
2246#ifdef VBOX_WITH_STATISTICS
2247 STAMR3RegisterFU(pUVM, &pExecMemAllocator->cbUnusable, STAMTYPE_U64, STAMVISIBILITY_ALWAYS, STAMUNIT_BYTES,
2248                     "Total number of unusable bytes", "/IEM/CPU%u/re/ExecMem/cbUnusable", pVCpu->idCpu);
2249 STAMR3RegisterFU(pUVM, &pExecMemAllocator->StatAlloc, STAMTYPE_PROFILE, STAMVISIBILITY_ALWAYS, STAMUNIT_TICKS_PER_CALL,
2250 "Profiling the allocator", "/IEM/CPU%u/re/ExecMem/ProfAlloc", pVCpu->idCpu);
2251 for (unsigned i = 1; i < RT_ELEMENTS(pExecMemAllocator->aStatSizes); i++)
2252 STAMR3RegisterFU(pUVM, &pExecMemAllocator->aStatSizes[i], STAMTYPE_COUNTER, STAMVISIBILITY_ALWAYS, STAMUNIT_COUNT,
2253 "Number of allocations of this number of allocation units",
2254 "/IEM/CPU%u/re/ExecMem/aSize%02u", pVCpu->idCpu, i);
2255 STAMR3RegisterFU(pUVM, &pExecMemAllocator->aStatSizes[0], STAMTYPE_COUNTER, STAMVISIBILITY_ALWAYS, STAMUNIT_COUNT,
2256 "Number of allocations 16 units or larger", "/IEM/CPU%u/re/ExecMem/aSize16OrLarger", pVCpu->idCpu);
2257#endif
2258#ifdef IEMEXECMEM_ALT_SUB_WITH_ALT_PRUNING
2259 STAMR3RegisterFU(pUVM, &pExecMemAllocator->StatPruneProf, STAMTYPE_PROFILE, STAMVISIBILITY_ALWAYS, STAMUNIT_TICKS_PER_CALL,
2260 "Pruning executable memory (alt)", "/IEM/CPU%u/re/ExecMem/Pruning", pVCpu->idCpu);
2261 STAMR3RegisterFU(pUVM, &pExecMemAllocator->StatPruneRecovered, STAMTYPE_PROFILE, STAMVISIBILITY_ALWAYS, STAMUNIT_BYTES_PER_CALL,
2262 "Bytes recovered while pruning", "/IEM/CPU%u/re/ExecMem/PruningRecovered", pVCpu->idCpu);
2263#endif
2264 STAMR3RegisterFU(pUVM, &pExecMemAllocator->cFruitlessChunkScans, STAMTYPE_U64_RESET, STAMVISIBILITY_ALWAYS, STAMUNIT_COUNT,
2265 "Chunks fruitlessly scanned for free space", "/IEM/CPU%u/re/ExecMem/FruitlessChunkScans", pVCpu->idCpu);
2266
2267 return VINF_SUCCESS;
2268}
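#if 0 /* Illustrative only, not part of the original file: a hypothetical call during EMT
       * initialization.  The real sizes come from the VM configuration and are not shown here. */
    int rc = iemExecMemAllocatorInit(pVCpu, /* cbMax */ _64M, /* cbInitial */ _4M, /* cbChunk */ 0 /* default */);
    AssertLogRelRCReturn(rc, rc);
#endif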
2269