VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllN8veExecMem.cpp@ 108195

Last change on this file since 108195 was 108195, checked in by vboxsync, 3 days ago

VMM/IEM: Splitting out most of the x86 target specific stuff from IEMInternal.h and into VMMAll/target-x86/IEMInternal-x86.h. jiraref:VBP-1431

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 90.9 KB
1/* $Id: IEMAllN8veExecMem.cpp 108195 2025-02-13 14:57:25Z vboxsync $ */
2/** @file
3 * IEM - Native Recompiler, Executable Memory Allocator.
4 */
5
6/*
7 * Copyright (C) 2023-2024 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28
29/*********************************************************************************************************************************
30* Header Files *
31*********************************************************************************************************************************/
32#define LOG_GROUP LOG_GROUP_IEM_RE_NATIVE
33#define IEM_WITH_OPAQUE_DECODER_STATE
34#define VMM_INCLUDED_SRC_include_IEMMc_h /* block IEMMc.h inclusion. */
35#ifdef IN_RING0
36# error "port me!"
37#endif
38#include <VBox/vmm/iem.h>
39#include <VBox/vmm/cpum.h>
40#include "IEMInternal.h"
41#include <VBox/vmm/vmcc.h>
42#include <VBox/log.h>
43#include <VBox/err.h>
44#include <VBox/param.h>
45#include <iprt/assert.h>
46#include <iprt/mem.h>
47#include <iprt/string.h>
48#if defined(RT_ARCH_AMD64)
49# include <iprt/x86.h>
50#elif defined(RT_ARCH_ARM64)
51# include <iprt/armv8.h>
52#endif
53
54#ifdef RT_OS_WINDOWS
55# include <iprt/formats/pecoff.h> /* this is incompatible with windows.h, thus: */
56extern "C" DECLIMPORT(uint8_t) __cdecl RtlAddFunctionTable(void *pvFunctionTable, uint32_t cEntries, uintptr_t uBaseAddress);
57extern "C" DECLIMPORT(uint8_t) __cdecl RtlDelFunctionTable(void *pvFunctionTable);
58#else
59# include <iprt/formats/dwarf.h>
60# if defined(RT_OS_DARWIN)
61# include <libkern/OSCacheControl.h>
62# include <mach/mach.h>
63# include <mach/mach_vm.h>
64# define IEMNATIVE_USE_LIBUNWIND
65extern "C" void __register_frame(const void *pvFde);
66extern "C" void __deregister_frame(const void *pvFde);
67# else
68# ifdef DEBUG_bird /** @todo not thread safe yet */
69# define IEMNATIVE_USE_GDB_JIT
70# endif
71# ifdef IEMNATIVE_USE_GDB_JIT
72# include <iprt/critsect.h>
73# include <iprt/once.h>
74# include <iprt/formats/elf64.h>
75# endif
76extern "C" void __register_frame_info(void *pvBegin, void *pvObj); /* found no header for these two */
77extern "C" void *__deregister_frame_info(void *pvBegin); /* (returns pvObj from __register_frame_info call) */
78# endif
79#endif
80
81#include "IEMN8veRecompiler.h"
82
83
84/*********************************************************************************************************************************
85* Executable Memory Allocator *
86*********************************************************************************************************************************/
87/** The chunk sub-allocation unit size in bytes. */
88#define IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE 256
89/** The chunk sub-allocation unit size as a shift factor. */
90#define IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT 8
91/** Enables adding a header to the sub-allocator allocations.
92 * This is useful for freeing up executable memory among other things. */
93#define IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER
94/** Use alternative pruning. */
95#define IEMEXECMEM_ALT_SUB_WITH_ALT_PRUNING
96
97
98#if defined(IN_RING3) && !defined(RT_OS_WINDOWS)
99# ifdef IEMNATIVE_USE_GDB_JIT
100# define IEMNATIVE_USE_GDB_JIT_ET_DYN
101
102/** GDB JIT: Code entry. */
103typedef struct GDBJITCODEENTRY
104{
105 struct GDBJITCODEENTRY *pNext;
106 struct GDBJITCODEENTRY *pPrev;
107 uint8_t *pbSymFile;
108 uint64_t cbSymFile;
109} GDBJITCODEENTRY;
110
111/** GDB JIT: Actions. */
112typedef enum GDBJITACTIONS : uint32_t
113{
114 kGdbJitaction_NoAction = 0, kGdbJitaction_Register, kGdbJitaction_Unregister
115} GDBJITACTIONS;
116
117/** GDB JIT: Descriptor. */
118typedef struct GDBJITDESCRIPTOR
119{
120 uint32_t uVersion;
121 GDBJITACTIONS enmAction;
122 GDBJITCODEENTRY *pRelevant;
123 GDBJITCODEENTRY *pHead;
124 /** Our addition: */
125 GDBJITCODEENTRY *pTail;
126} GDBJITDESCRIPTOR;
127
128/** GDB JIT: Our simple symbol file data. */
129typedef struct GDBJITSYMFILE
130{
131 Elf64_Ehdr EHdr;
132# ifndef IEMNATIVE_USE_GDB_JIT_ET_DYN
133 Elf64_Shdr aShdrs[5];
134# else
135 Elf64_Shdr aShdrs[7];
136 Elf64_Phdr aPhdrs[2];
137# endif
138 /** The dwarf ehframe data for the chunk. */
139 uint8_t abEhFrame[512];
140 char szzStrTab[128];
141 Elf64_Sym aSymbols[3];
142# ifdef IEMNATIVE_USE_GDB_JIT_ET_DYN
143 Elf64_Sym aDynSyms[2];
144 Elf64_Dyn aDyn[6];
145# endif
146} GDBJITSYMFILE;
147
148extern "C" GDBJITDESCRIPTOR __jit_debug_descriptor;
149extern "C" DECLEXPORT(void) __jit_debug_register_code(void);
150
151/** Init once for g_IemNativeGdbJitLock. */
152static RTONCE g_IemNativeGdbJitOnce = RTONCE_INITIALIZER;
153/** Critical section protecting the GDB JIT registration (initialized via g_IemNativeGdbJitOnce). */
154static RTCRITSECT g_IemNativeGdbJitLock;
155
156/** GDB reads the info here. */
157GDBJITDESCRIPTOR __jit_debug_descriptor = { 1, kGdbJitaction_NoAction, NULL, NULL };
158
159/** GDB sets a breakpoint on this and checks __jit_debug_descriptor when hit. */
160DECL_NO_INLINE(RT_NOTHING, DECLEXPORT(void)) __jit_debug_register_code(void)
161{
162 ASMNopPause();
163}
164
165/** @callback_method_impl{FNRTONCE} */
166static DECLCALLBACK(int32_t) iemNativeGdbJitInitOnce(void *pvUser)
167{
168 RT_NOREF(pvUser);
169 return RTCritSectInit(&g_IemNativeGdbJitLock);
170}
171
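# if 0
/* Illustrative sketch only (not part of the upstream code): per GDB's JIT
 * interface, which the GDBJITDESCRIPTOR/GDBJITCODEENTRY structures above
 * mirror, registering a freshly generated symbol file with the debugger
 * would look roughly like the hypothetical helper below.  It ignores the
 * extra pTail member and the g_IemNativeGdbJitLock serialization that real
 * registration code would also have to take care of. */
static void iemNativeGdbJitRegisterSketch(GDBJITCODEENTRY *pEntry, uint8_t *pbSymFile, uint64_t cbSymFile)
{
    pEntry->pbSymFile = pbSymFile;
    pEntry->cbSymFile = cbSymFile;
    pEntry->pPrev     = NULL;
    pEntry->pNext     = __jit_debug_descriptor.pHead;       /* link in at the head of the list */
    if (pEntry->pNext)
        pEntry->pNext->pPrev = pEntry;
    __jit_debug_descriptor.pHead     = pEntry;
    __jit_debug_descriptor.pRelevant = pEntry;               /* tell GDB which entry changed */
    __jit_debug_descriptor.enmAction = kGdbJitaction_Register;
    __jit_debug_register_code();                             /* GDB has a breakpoint here and re-reads the descriptor */
}
# endif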
172
173# endif /* IEMNATIVE_USE_GDB_JIT */
174
175/**
176 * Per-chunk unwind info for non-windows hosts.
177 */
178typedef struct IEMEXECMEMCHUNKEHFRAME
179{
180# ifdef IEMNATIVE_USE_LIBUNWIND
181 /** The offset of the FDE into abEhFrame. */
182 uintptr_t offFda;
183# else
184 /** 'struct object' storage area. */
185 uint8_t abObject[1024];
186# endif
187# ifdef IEMNATIVE_USE_GDB_JIT
188# if 0
189 /** The GDB JIT 'symbol file' data. */
190 GDBJITSYMFILE GdbJitSymFile;
191# endif
192 /** The GDB JIT list entry. */
193 GDBJITCODEENTRY GdbJitEntry;
194# endif
195 /** The dwarf ehframe data for the chunk. */
196 uint8_t abEhFrame[512];
197} IEMEXECMEMCHUNKEHFRAME;
198/** Pointer to per-chunk unwind info for non-windows hosts. */
199typedef IEMEXECMEMCHUNKEHFRAME *PIEMEXECMEMCHUNKEHFRAME;
200#endif
201
202
203/**
204 * A chunk of executable memory.
205 */
206typedef struct IEMEXECMEMCHUNK
207{
208 /** Number of free items in this chunk. */
209 uint32_t cFreeUnits;
210 /** Hint where to start searching for free space in the allocation bitmap. */
211 uint32_t idxFreeHint;
212 /** Pointer to the readable/writeable view of the memory chunk. */
213 void *pvChunkRw;
214 /** Pointer to the readable/executable view of the memory chunk. */
215 void *pvChunkRx;
216 /** Pointer to the context structure detailing the per chunk common code. */
217 PCIEMNATIVEPERCHUNKCTX pCtx;
218#ifdef IN_RING3
219 /**
220 * Pointer to the unwind information.
221 *
222 * This is used during C++ throw and longjmp (windows and probably most other
223 * platforms). Some debuggers (windbg) make use of it as well.
224 *
225 * Windows: This is allocated from hHeap on windows because (at least for
226 * AMD64) the UNWIND_INFO structure address in the
227 * RUNTIME_FUNCTION entry is an RVA and the chunk is the "image".
228 *
229 * Others: Allocated from the regular heap to avoid unnecessary executable data
230 * structures. This points to an IEMEXECMEMCHUNKEHFRAME structure. */
231 void *pvUnwindInfo;
232#elif defined(IN_RING0)
233 /** Allocation handle. */
234 RTR0MEMOBJ hMemObj;
235#endif
236} IEMEXECMEMCHUNK;
237/** Pointer to a memory chunk. */
238typedef IEMEXECMEMCHUNK *PIEMEXECMEMCHUNK;
239
240
241/**
242 * Executable memory allocator for the native recompiler.
243 */
244typedef struct IEMEXECMEMALLOCATOR
245{
246 /** Magic value (IEMEXECMEMALLOCATOR_MAGIC). */
247 uint32_t uMagic;
248
249 /** The chunk size. */
250 uint32_t cbChunk;
251 /** The maximum number of chunks. */
252 uint32_t cMaxChunks;
253 /** The current number of chunks. */
254 uint32_t cChunks;
255 /** Hint where to start looking for available memory. */
256 uint32_t idxChunkHint;
257 /** Statistics: Current number of allocations. */
258 uint32_t cAllocations;
259
260 /** The total amount of memory available. */
261 uint64_t cbTotal;
262 /** Total amount of free memory. */
263 uint64_t cbFree;
264 /** Total amount of memory allocated. */
265 uint64_t cbAllocated;
266
267 /** Pointer to the allocation bitmaps for all the chunks (follows aChunks).
268 *
269 * Since the chunk size is a power of two and the minimum chunk size is a lot
270 * higher than the IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE, each chunk will always
271 * require a whole number of uint64_t elements in the allocation bitmap. So,
272 * for sake of simplicity, they are allocated as one continous chunk for
273 * simplicity/laziness. */
274 uint64_t *pbmAlloc;
275 /** Number of units (IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE) per chunk. */
276 uint32_t cUnitsPerChunk;
277 /** Number of bitmap elements per chunk (for quickly locating the bitmap
278 * portion corresponding to a chunk). */
279 uint32_t cBitmapElementsPerChunk;
280
281 /** Number of times we fruitlessly scanned a chunk for free space. */
282 uint64_t cFruitlessChunkScans;
283
284#ifdef IEMEXECMEM_ALT_SUB_WITH_ALT_PRUNING
285 /** The next chunk to prune in. */
286 uint32_t idxChunkPrune;
288 /** The offset within the chunk at which to start pruning. */
288 uint32_t offChunkPrune;
289 /** Profiling the pruning code. */
290 STAMPROFILE StatPruneProf;
291 /** Number of bytes recovered by the pruning. */
292 STAMPROFILE StatPruneRecovered;
293#endif
294
295#ifdef VBOX_WITH_STATISTICS
296 STAMPROFILE StatAlloc;
298 /** Total amount of memory currently not usable due to IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE rounding. */
298 uint64_t cbUnusable;
299 /** Allocation size distribution (in alloc units; 0 is the slop bucket). */
300 STAMCOUNTER aStatSizes[16];
301#endif
302
303#if defined(IN_RING3) && !defined(RT_OS_WINDOWS)
304 /** Pointer to the array of unwind info running parallel to aChunks (same
305 * allocation as this structure, located after the bitmaps).
306 * (For Windows, the structures must reside within 32-bit RVA distance of the
307 * actual chunk, so they are allocated off the chunk.) */
308 PIEMEXECMEMCHUNKEHFRAME paEhFrames;
309#endif
310
311 /** The allocation chunks. */
312 RT_FLEXIBLE_ARRAY_EXTENSION
313 IEMEXECMEMCHUNK aChunks[RT_FLEXIBLE_ARRAY];
314} IEMEXECMEMALLOCATOR;
315/** Pointer to an executable memory allocator. */
316typedef IEMEXECMEMALLOCATOR *PIEMEXECMEMALLOCATOR;
317
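/* Sizing illustration (hypothetical numbers; the actual chunk size is picked
 * by iemExecMemAllocatorInit): a 2 MiB chunk carved into 256 byte units gives
 * cUnitsPerChunk = 2 MiB / 256 = 8192, which in turn needs 8192 / 64 = 128
 * uint64_t words of allocation bitmap per chunk (cBitmapElementsPerChunk). */
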
318/** Magic value for IEMEXECMEMALLOCATOR::uMagic (Scott Frederick Turow). */
319#define IEMEXECMEMALLOCATOR_MAGIC UINT32_C(0x19490412)
320
321
322#ifdef IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER
323/**
324 * Allocation header.
325 */
326typedef struct IEMEXECMEMALLOCHDR
327{
328 RT_GCC_EXTENSION
329 union
330 {
331 struct
332 {
333 /** Magic value / eyecatcher (IEMEXECMEMALLOCHDR_MAGIC). */
334 uint32_t uMagic;
335 /** The allocation chunk (for speeding up freeing). */
336 uint32_t idxChunk;
337 };
338 /** Combined magic and chunk index, for the pruning scanner code. */
339 uint64_t u64MagicAndChunkIdx;
340 };
341 /** Pointer to the translation block the allocation belongs to.
342 * This is the whole point of the header. */
343 PIEMTB pTb;
344} IEMEXECMEMALLOCHDR;
345/** Pointer to an allocation header. */
346typedef IEMEXECMEMALLOCHDR *PIEMEXECMEMALLOCHDR;
347/** Magic value for IEMEXECMEMALLOCHDR ('ExeM'). */
348# define IEMEXECMEMALLOCHDR_MAGIC UINT32_C(0x4d657845)
349#endif
350
351
352static int iemExecMemAllocatorGrow(PVMCPUCC pVCpu, PIEMEXECMEMALLOCATOR pExecMemAllocator);
353
354
355#ifdef IEMEXECMEM_ALT_SUB_WITH_ALT_PRUNING
356/**
357 * Frees up executable memory when we're out of space.
358 *
359 * This is an alternative to iemTbAllocatorFreeupNativeSpace() that frees up
360 * space in a more linear fashion from the allocator's point of view. It may
361 * also defragment if implemented & enabled.
362 */
363static void iemExecMemAllocatorPrune(PVMCPU pVCpu, PIEMEXECMEMALLOCATOR pExecMemAllocator)
364{
365# ifndef IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER
366# error "IEMEXECMEM_ALT_SUB_WITH_ALT_PRUNING requires IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER"
367# endif
368 STAM_REL_PROFILE_START(&pExecMemAllocator->StatPruneProf, a);
369
370 /*
371 * Before we can start, we must process delayed frees.
372 */
373#if 1
374 PIEMTBALLOCATOR const pTbAllocator = iemTbAllocatorFreeBulkStart(pVCpu);
375#else
376 iemTbAllocatorProcessDelayedFrees(pVCpu, pVCpu->iem.s.pTbAllocatorR3);
377#endif
378
379 AssertCompile(RT_IS_POWER_OF_TWO(IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE));
380
381 uint32_t const cbChunk = pExecMemAllocator->cbChunk;
382 AssertReturnVoid(RT_IS_POWER_OF_TWO(cbChunk));
383 AssertReturnVoid(cbChunk >= _1M && cbChunk <= _256M); /* see iemExecMemAllocatorInit */
384
385 uint32_t const cChunks = pExecMemAllocator->cChunks;
386 AssertReturnVoid(cChunks == pExecMemAllocator->cMaxChunks);
387 AssertReturnVoid(cChunks >= 1);
388
389 Assert(!pVCpu->iem.s.pCurTbR3);
390
391 /*
392 * Decide how much to prune. The chunk size is a power of two, so we'll be
393 * scanning a power-of-two sized area here as well.
394 */
395 uint32_t cbToPrune = cbChunk;
396
397 /* Never more than 25%. */
398 if (cChunks < 4)
399 cbToPrune /= cChunks == 1 ? 4 : 2;
400
401 /* Upper limit. In a debug build a 4MB limit averages out at ~0.6ms per call. */
402 if (cbToPrune > _4M)
403 cbToPrune = _4M;
404
405 /*
406 * Adjust the pruning chunk and offset accordingly.
407 */
408 uint32_t idxChunk = pExecMemAllocator->idxChunkPrune;
409 uint32_t offChunk = pExecMemAllocator->offChunkPrune;
410 offChunk &= ~(uint32_t)(IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE - 1U);
411 if (offChunk >= cbChunk)
412 {
413 offChunk = 0;
414 idxChunk += 1;
415 }
416 if (idxChunk >= cChunks)
417 {
418 offChunk = 0;
419 idxChunk = 0;
420 }
421
422 uint32_t const offPruneStart = offChunk;
423 uint32_t const offPruneEnd = RT_MIN(offChunk + cbToPrune, cbChunk);
424
425 /*
426 * Do the pruning. The current approach is the severe kind.
427 *
428 * This is memory bound, as we must load both the allocation header and the
429 * associated TB and then modify them. So, the CPU isn't all that utilized
430 * here. We try to apply some prefetching to speed it up a tiny bit.
431 */
432 uint64_t cbPruned = 0;
433 uint64_t const u64MagicAndChunkIdx = RT_MAKE_U64(IEMEXECMEMALLOCHDR_MAGIC, idxChunk);
434 uint8_t * const pbChunk = (uint8_t *)pExecMemAllocator->aChunks[idxChunk].pvChunkRx;
435 while (offChunk < offPruneEnd)
436 {
437 PIEMEXECMEMALLOCHDR pHdr = (PIEMEXECMEMALLOCHDR)&pbChunk[offChunk];
438
439 /* Is this the start of an allocation block for a TB? (We typically
440 have one allocation at the start of each chunk for the unwind info
441 where pTb is NULL.) */
442 PIEMTB pTb;
443 if ( pHdr->u64MagicAndChunkIdx == u64MagicAndChunkIdx
444 && RT_LIKELY((pTb = pHdr->pTb) != NULL))
445 {
446 AssertPtr(pTb);
447
448 uint32_t const cbBlock = RT_ALIGN_32(pTb->Native.cInstructions * sizeof(IEMNATIVEINSTR) + sizeof(*pHdr),
449 IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE);
450
451 /* Prefetch the next header before freeing the current one and its TB. */
452 /** @todo Iff the block size was part of the header in some way, this could be
453 * a tiny bit faster. */
454 offChunk += cbBlock;
455#if defined(_MSC_VER) && defined(RT_ARCH_AMD64)
456 _mm_prefetch((char *)&pbChunk[offChunk], _MM_HINT_T0);
457#elif defined(_MSC_VER) && defined(RT_ARCH_ARM64)
458 __prefetch(&pbChunk[offChunk]);
459#else
460 __builtin_prefetch(&pbChunk[offChunk], 1 /*rw*/);
461#endif
462 /* Some paranoia first, though. */
463 AssertBreakStmt(offChunk <= cbChunk, offChunk -= cbBlock - IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE);
464 cbPruned += cbBlock;
465
466#if 1
467 iemTbAllocatorFreeBulk(pVCpu, pTbAllocator, pTb);
468#else
469 iemTbAllocatorFree(pVCpu, pTb);
470#endif
471 }
472 else
473 offChunk += IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE;
474 }
475 STAM_REL_PROFILE_ADD_PERIOD(&pExecMemAllocator->StatPruneRecovered, cbPruned);
476
477 pVCpu->iem.s.ppTbLookupEntryR3 = &pVCpu->iem.s.pTbLookupEntryDummyR3;
478
479 /*
480 * Save the current pruning point.
481 */
482 pExecMemAllocator->offChunkPrune = offChunk;
483 pExecMemAllocator->idxChunkPrune = idxChunk;
484
485 /* Set the hint to the start of the pruned region. */
486 pExecMemAllocator->idxChunkHint = idxChunk;
487 pExecMemAllocator->aChunks[idxChunk].idxFreeHint = offPruneStart / IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE;
488
489 STAM_REL_PROFILE_STOP(&pExecMemAllocator->StatPruneProf, a);
490}
491#endif /* IEMEXECMEM_ALT_SUB_WITH_ALT_PRUNING */
492
493
494#if defined(VBOX_STRICT) || 0
495/**
496 * The old bitmap scanner code, for comparison and assertions.
497 */
498static uint32_t iemExecMemAllocatorFindReqFreeUnitsOld(uint64_t *pbmAlloc, uint32_t cToScan, uint32_t cReqUnits)
499{
500 /** @todo This can probably be done more efficiently for non-x86 systems. */
501 int iBit = ASMBitFirstClear(pbmAlloc, cToScan);
502 while (iBit >= 0 && (uint32_t)iBit <= cToScan - cReqUnits)
503 {
504 uint32_t idxAddBit = 1;
505 while (idxAddBit < cReqUnits && !ASMBitTest(pbmAlloc, (uint32_t)iBit + idxAddBit))
506 idxAddBit++;
507 if (idxAddBit >= cReqUnits)
508 return (uint32_t)iBit;
509 iBit = ASMBitNextClear(pbmAlloc, cToScan, iBit + idxAddBit - 1);
510 }
511 return UINT32_MAX;
512}
513#endif
514
515
516/**
517 * Bitmap scanner code that looks for a bunch of @a cReqUnits zero bits.
518 *
519 * When booting win11 with an r165098 release build, the average native TB size is
520 * around 9 units (of 256 bytes). So, it is unlikely we need to scan any
521 * subsequent words once we hit a patch of zeros, thus @a a_fBig.
522 *
523 * @todo This needs more tweaking. While it *is* faster than the old code,
524 * it doesn't seem like it's all that much. :/
525 */
526template<const bool a_fBig>
527static uint32_t iemExecMemAllocatorFindReqFreeUnits(uint64_t *pbmAlloc, uint32_t c64WordsToScan, uint32_t cReqUnits)
528{
529 /*
530 * Scan the (section of the) allocation bitmap in 64-bit words.
531 */
532 unsigned cPrevLeadingZeros = 0;
533 for (uint32_t off = 0; off < c64WordsToScan; off++)
534 {
535 uint64_t uWord = pbmAlloc[off];
536 if (uWord == UINT64_MAX)
537 {
538 /*
539 * Getting through patches of UINT64_MAX is a frequent problem when the allocator
540 * fills up, so it's definitely worth optimizing.
541 *
542 * The complicated code below is a bit faster on arm, reducing the per TB cost
543 * from 4255ns to 4106ns (best run out of 10). On win/amd64 there isn't an
544 * obvious gain here, at least not with the data currently being profiled.
545 */
546#if 1
547 off++;
548 uint32_t cQuads = (c64WordsToScan - off) / 4;
549
550 /* Align. */
551 if (cQuads > 1)
552 switch (((uintptr_t)&pbmAlloc[off] / sizeof(uint64_t)) & 3)
553 {
554 case 0:
555 break;
556 case 1:
557 {
558 uWord = pbmAlloc[off];
559 uint64_t uWord1 = pbmAlloc[off + 1];
560 uint64_t uWord2 = pbmAlloc[off + 2];
561 if ((uWord & uWord1 & uWord2) == UINT64_MAX)
562 {
563 off += 3;
564 cQuads = (c64WordsToScan - off) / 4;
565 }
566 else if (uWord == UINT64_MAX)
567 {
568 if (uWord1 != UINT64_MAX)
569 {
570 uWord = uWord1;
571 off += 1;
572 }
573 else
574 {
575 uWord = uWord2;
576 off += 2;
577 }
578 }
579 break;
580 }
581 case 2:
582 {
583 uWord = pbmAlloc[off];
584 uint64_t uWord1 = pbmAlloc[off + 1];
585 if ((uWord & uWord1) == UINT64_MAX)
586 {
587 off += 2;
588 cQuads = (c64WordsToScan - off) / 4;
589 }
590 else if (uWord == UINT64_MAX)
591 {
592 uWord = uWord1;
593 off += 1;
594 }
595 break;
596 }
597 case 3:
598 uWord = pbmAlloc[off];
599 if (uWord == UINT64_MAX)
600 {
601 off++;
602 cQuads = (c64WordsToScan - off) / 4;
603 }
604 break;
605 }
606 if (uWord == UINT64_MAX)
607 {
608 /* Looping over 32 bytes at a time. */
609 for (;;)
610 {
611 if (cQuads-- > 0)
612 {
613 uWord = pbmAlloc[off + 0];
614 uint64_t uWord1 = pbmAlloc[off + 1];
615 uint64_t uWord2 = pbmAlloc[off + 2];
616 uint64_t uWord3 = pbmAlloc[off + 3];
617 if ((uWord & uWord1 & uWord2 & uWord3) == UINT64_MAX)
618 off += 4;
619 else
620 {
621 if (uWord != UINT64_MAX)
622 { }
623 else if (uWord1 != UINT64_MAX)
624 {
625 uWord = uWord1;
626 off += 1;
627 }
628 else if (uWord2 != UINT64_MAX)
629 {
630 uWord = uWord2;
631 off += 2;
632 }
633 else
634 {
635 uWord = uWord3;
636 off += 3;
637 }
638 break;
639 }
640 }
641 else
642 {
643 if (off < c64WordsToScan)
644 {
645 uWord = pbmAlloc[off];
646 if (uWord != UINT64_MAX)
647 break;
648 off++;
649 if (off < c64WordsToScan)
650 {
651 uWord = pbmAlloc[off];
652 if (uWord != UINT64_MAX)
653 break;
654 off++;
655 if (off < c64WordsToScan)
656 {
657 uWord = pbmAlloc[off];
658 if (uWord != UINT64_MAX)
659 break;
660 Assert(off + 1 == c64WordsToScan);
661 }
662 }
663 }
664 return UINT32_MAX;
665 }
666 }
667 }
668#else
669 do
670 {
671 off++;
672 if (off < c64WordsToScan)
673 uWord = pbmAlloc[off];
674 else
675 return UINT32_MAX;
676 } while (uWord == UINT64_MAX);
677#endif
678 cPrevLeadingZeros = 0;
679 }
680
681 /*
682 * If we get down here, we have a word that isn't UINT64_MAX.
683 */
684 if (uWord != 0)
685 {
686 /*
687 * Fend off large requests we cannot satisfy before the first set bit.
688 */
689 if (!a_fBig || cReqUnits < 64 + cPrevLeadingZeros)
690 {
691#ifdef __GNUC__
692 unsigned cZerosInWord = __builtin_popcountl(~uWord);
693#elif defined(_MSC_VER) && defined(RT_ARCH_AMD64)
694 unsigned cZerosInWord = __popcnt64(~uWord);
695#elif defined(_MSC_VER) && defined(RT_ARCH_ARM64)
696 unsigned cZerosInWord = _CountOneBits64(~uWord);
697#else
698# pragma message("need popcount intrinsic or something...")
699 unsigned cZerosInWord = 0;
700 for (uint64_t uTmp = ~uWord; uTmp; cZerosInWord++)
701 uTmp &= uTmp - 1; /* Clears the least significant bit set. */
702#endif
703 if (cZerosInWord + cPrevLeadingZeros >= cReqUnits)
704 {
705 /* Check if we've got a patch of zeros at the trailing end
706 when joined with the previous word: */
707#ifdef __GNUC__
708 unsigned cTrailingZeros = __builtin_ctzl(uWord);
709#else
710 unsigned cTrailingZeros = ASMBitFirstSetU64(uWord) - 1;
711#endif
712 if (cPrevLeadingZeros + cTrailingZeros >= cReqUnits)
713 return off * 64 - cPrevLeadingZeros;
714
715 /*
716 * Try leading zeros before we get on with the tedious stuff.
717 */
718#ifdef __GNUC__
719 cPrevLeadingZeros = __builtin_clzl(uWord);
720#else
721 cPrevLeadingZeros = 64 - ASMBitLastSetU64(uWord);
722#endif
723 if (cPrevLeadingZeros >= cReqUnits)
724 return (off + 1) * 64 - cPrevLeadingZeros;
725
726 /*
727 * Check the popcount again sans leading & trailing before looking
728 * inside the word.
729 */
730 cZerosInWord -= cPrevLeadingZeros + cTrailingZeros;
731 if (cZerosInWord >= cReqUnits)
732 {
733 /* 1; 64 - 0 - 1 = 63; */
734 unsigned const iBitLast = 64 - cPrevLeadingZeros - cReqUnits; /** @todo boundary */
735 unsigned iBit = cTrailingZeros;
736 uWord >>= cTrailingZeros;
737 do
738 {
739 Assert(uWord & 1);
740#ifdef __GNUC__
741 unsigned iZeroBit = __builtin_ctzl(~uWord);
742#else
743 unsigned iZeroBit = ASMBitFirstSetU64(~uWord) - 1;
744#endif
745 iBit += iZeroBit;
746 uWord >>= iZeroBit;
747 Assert(iBit <= iBitLast);
748 Assert((uWord & 1) == 0);
749#ifdef __GNUC__
750 unsigned cZeros = __builtin_ctzl(uWord);
751#else
752 unsigned cZeros = ASMBitFirstSetU64(uWord) - 1;
753#endif
754 if (cZeros >= cReqUnits)
755 return off * 64 + iBit;
756
757 cZerosInWord -= cZeros; /* (may underflow as we will count shifted in zeros) */
758 iBit += cZeros;
759 uWord >>= cZeros;
760 } while ((int)cZerosInWord >= (int)cReqUnits && iBit < iBitLast);
761 }
762 continue; /* we've already calculated cPrevLeadingZeros */
763 }
764 }
765
766 /* Update the leading (MSB) zero count. */
767#ifdef __GNUC__
768 cPrevLeadingZeros = __builtin_clzl(uWord);
769#else
770 cPrevLeadingZeros = 64 - ASMBitLastSetU64(uWord);
771#endif
772 }
773 /*
774 * uWord == 0
775 */
776 else
777 {
778 if RT_CONSTEXPR_IF(!a_fBig)
779 return off * 64 - cPrevLeadingZeros;
780 else /* keep else */
781 {
782 if (cPrevLeadingZeros + 64 >= cReqUnits)
783 return off * 64 - cPrevLeadingZeros;
784 for (uint32_t off2 = off + 1;; off2++)
785 {
786 if (off2 < c64WordsToScan)
787 {
788 uWord = pbmAlloc[off2];
789 if (uWord == UINT64_MAX)
790 {
791 cPrevLeadingZeros = 0;
792 break;
793 }
794 if (uWord == 0)
795 {
796 if (cPrevLeadingZeros + (off2 - off + 1) * 64 >= cReqUnits)
797 return off * 64 - cPrevLeadingZeros;
798 }
799 else
800 {
801#ifdef __GNUC__
802 unsigned cTrailingZeros = __builtin_ctzl(uWord);
803#else
804 unsigned cTrailingZeros = ASMBitFirstSetU64(uWord) - 1;
805#endif
806 if (cPrevLeadingZeros + (off2 - off) * 64 + cTrailingZeros >= cReqUnits)
807 return off * 64 - cPrevLeadingZeros;
808#ifdef __GNUC__
809 cPrevLeadingZeros = __builtin_clzl(uWord);
810#else
811 cPrevLeadingZeros = 64 - ASMBitLastSetU64(uWord);
812#endif
813 break;
814 }
815 }
816 else
817 return UINT32_MAX;
818 }
819 }
820 }
821 }
822 return UINT32_MAX;
823}
824
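/* Worked example for the scanner above: with cReqUnits = 9 and a bitmap word
 * pbmAlloc[off] = UINT64_C(0xFFFFFF0000FF00FF) - i.e. 8 free units at bits
 * 8..15 and 16 free units at bits 24..39 - the popcount test passes (24 zero
 * bits), the trailing and leading zero counts are both 0, and the inner loop
 * then skips the too-small 8 bit hole before returning off * 64 + 24, the
 * start of the 16 bit hole. */
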
825
826/**
827 * Try to allocate a block of @a cReqUnits in the chunk @a idxChunk.
828 */
829static void *
830iemExecMemAllocatorAllocInChunkInt(PIEMEXECMEMALLOCATOR pExecMemAllocator, uint64_t *pbmAlloc, uint32_t idxFirst,
831 uint32_t cToScan, uint32_t cReqUnits, uint32_t idxChunk, PIEMTB pTb,
832 void **ppvExec, PCIEMNATIVEPERCHUNKCTX *ppChunkCtx)
833{
834 /*
835 * Shift the bitmap to the idxFirst bit so we can use ASMBitFirstClear.
836 */
837 Assert(!(cToScan & 63));
838 Assert(!(idxFirst & 63));
839 Assert(cToScan + idxFirst <= pExecMemAllocator->cUnitsPerChunk);
840 pbmAlloc += idxFirst / 64;
841 cToScan += idxFirst & 63;
842 Assert(!(cToScan & 63));
843
844#if 1
845 uint32_t const iBit = cReqUnits < 64
846 ? iemExecMemAllocatorFindReqFreeUnits<false>(pbmAlloc, cToScan / 64, cReqUnits)
847 : iemExecMemAllocatorFindReqFreeUnits<true>( pbmAlloc, cToScan / 64, cReqUnits);
848# ifdef VBOX_STRICT
849 uint32_t const iBitOld = iemExecMemAllocatorFindReqFreeUnitsOld(pbmAlloc, cToScan, cReqUnits);
850 AssertMsg( iBit == iBitOld
851 || (iBit / 64) == (iBitOld / 64), /* New algorithm will return trailing hit before middle. */
852 ("iBit=%#x (%#018RX64); iBitOld=%#x (%#018RX64); cReqUnits=%#x\n",
853 iBit, iBit != UINT32_MAX ? pbmAlloc[iBit / 64] : 0,
854 iBitOld, iBitOld != UINT32_MAX ? pbmAlloc[iBitOld / 64] : 0, cReqUnits));
855# endif
856#else
857 uint32_t const iBit = iemExecMemAllocatorFindReqFreeUnitsOld(pbmAlloc, cToScan, cReqUnits);
858#endif
859 if (iBit != UINT32_MAX)
860 {
861 ASMBitSetRange(pbmAlloc, (uint32_t)iBit, (uint32_t)iBit + cReqUnits);
862
863 PIEMEXECMEMCHUNK const pChunk = &pExecMemAllocator->aChunks[idxChunk];
864 pChunk->cFreeUnits -= cReqUnits;
865 pChunk->idxFreeHint = (uint32_t)iBit + cReqUnits;
866
867 pExecMemAllocator->cAllocations += 1;
868 uint32_t const cbReq = cReqUnits << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT;
869 pExecMemAllocator->cbAllocated += cbReq;
870 pExecMemAllocator->cbFree -= cbReq;
871 pExecMemAllocator->idxChunkHint = idxChunk;
872
873 void * const pvMemRw = (uint8_t *)pChunk->pvChunkRw
874 + ((idxFirst + (uint32_t)iBit) << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT);
875
876 if (ppChunkCtx)
877 *ppChunkCtx = pChunk->pCtx;
878
879 /*
880 * Initialize the header and return.
881 */
882# ifdef IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER
883 PIEMEXECMEMALLOCHDR const pHdr = (PIEMEXECMEMALLOCHDR)pvMemRw;
884 pHdr->uMagic = IEMEXECMEMALLOCHDR_MAGIC;
885 pHdr->idxChunk = idxChunk;
886 pHdr->pTb = pTb;
887
888 if (ppvExec)
889 *ppvExec = (uint8_t *)pChunk->pvChunkRx
890 + ((idxFirst + (uint32_t)iBit) << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT)
891 + sizeof(*pHdr);
892
893 return pHdr + 1;
894#else
895 if (ppvExec)
896 *ppvExec = (uint8_t *)pChunk->pvChunkRx
897 + ((idxFirst + (uint32_t)iBit) << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT);
898
899 RT_NOREF(pTb);
900 return pvMemRw;
901#endif
902 }
903
904 return NULL;
905}
906
907
908/**
909 * Converts requested number of bytes into a unit count.
910 */
911DECL_FORCE_INLINE(uint32_t) iemExecMemAllocBytesToUnits(uint32_t cbReq)
912{
913#ifdef IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER
914 return (cbReq + sizeof(IEMEXECMEMALLOCHDR) + IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE - 1)
915#else
916 return (cbReq + IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE - 1)
917#endif
918 >> IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT;
919}
920
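/* Worked example (assuming the 16 byte IEMEXECMEMALLOCHDR on 64-bit hosts):
 * a 1000 byte request becomes (1000 + 16 + 255) >> 8 = 4 units, i.e. 1024
 * bytes of chunk space; the 24 byte difference to cbReq is what the
 * statistics code later books as cbUnusable. */
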
921
922DECL_FORCE_INLINE(PIEMNATIVEINSTR)
923iemExecMemAllocatorAllocUnitsInChunkInner(PIEMEXECMEMALLOCATOR pExecMemAllocator, uint32_t idxChunk, uint32_t cReqUnits,
924 PIEMTB pTb, PIEMNATIVEINSTR *ppaExec, PCIEMNATIVEPERCHUNKCTX *ppChunkCtx)
925{
926 uint64_t * const pbmAlloc = &pExecMemAllocator->pbmAlloc[pExecMemAllocator->cBitmapElementsPerChunk * idxChunk];
927 uint32_t const idxHint = pExecMemAllocator->aChunks[idxChunk].idxFreeHint & ~(uint32_t)63;
928 if (idxHint + cReqUnits <= pExecMemAllocator->cUnitsPerChunk)
929 {
930 void *pvRet = iemExecMemAllocatorAllocInChunkInt(pExecMemAllocator, pbmAlloc, idxHint,
931 pExecMemAllocator->cUnitsPerChunk - idxHint,
932 cReqUnits, idxChunk, pTb, (void **)ppaExec, ppChunkCtx);
933 if (pvRet)
934 return (PIEMNATIVEINSTR)pvRet;
935 }
936 void *pvRet = iemExecMemAllocatorAllocInChunkInt(pExecMemAllocator, pbmAlloc, 0,
937 RT_MIN(pExecMemAllocator->cUnitsPerChunk,
938 RT_ALIGN_32(idxHint + cReqUnits, 64*4)),
939 cReqUnits, idxChunk, pTb, (void **)ppaExec, ppChunkCtx);
940 if (pvRet)
941 return (PIEMNATIVEINSTR)pvRet;
942
943 pExecMemAllocator->cFruitlessChunkScans += 1;
944 return NULL;
945}
946
947
948DECLINLINE(PIEMNATIVEINSTR)
949iemExecMemAllocatorAllocBytesInChunk(PIEMEXECMEMALLOCATOR pExecMemAllocator, uint32_t idxChunk, uint32_t cbReq,
950 PIEMNATIVEINSTR *ppaExec)
951{
952 uint32_t const cReqUnits = iemExecMemAllocBytesToUnits(cbReq);
953 if (cReqUnits <= pExecMemAllocator->aChunks[idxChunk].cFreeUnits)
954 return iemExecMemAllocatorAllocUnitsInChunkInner(pExecMemAllocator, idxChunk, cReqUnits, NULL /*pTb*/,
955 ppaExec, NULL /*ppChunkCtx*/);
956 return NULL;
957}
958
959
960/**
961 * Allocates @a cbReq bytes of executable memory.
962 *
963 * @returns Pointer to the readable/writeable memory, NULL if out of memory or other problem
964 * encountered.
965 * @param pVCpu The cross context virtual CPU structure of the
966 * calling thread.
967 * @param cbReq How many bytes are required.
968 * @param pTb The translation block that will be using the allocation.
969 * @param ppaExec Where to return the pointer to executable view of
970 * the allocated memory, optional.
971 * @param ppChunkCtx Where to return the per chunk attached context
972 * if available, optional.
973 */
974DECLHIDDEN(PIEMNATIVEINSTR) iemExecMemAllocatorAlloc(PVMCPU pVCpu, uint32_t cbReq, PIEMTB pTb,
975 PIEMNATIVEINSTR *ppaExec, PCIEMNATIVEPERCHUNKCTX *ppChunkCtx) RT_NOEXCEPT
976{
977 PIEMEXECMEMALLOCATOR pExecMemAllocator = pVCpu->iem.s.pExecMemAllocatorR3;
978 AssertReturn(pExecMemAllocator && pExecMemAllocator->uMagic == IEMEXECMEMALLOCATOR_MAGIC, NULL);
979 AssertMsgReturn(cbReq > 32 && cbReq < _512K, ("%#x\n", cbReq), NULL);
980 STAM_PROFILE_START(&pExecMemAllocator->StatAlloc, a);
981
982 uint32_t const cReqUnits = iemExecMemAllocBytesToUnits(cbReq);
983 STAM_COUNTER_INC(&pExecMemAllocator->aStatSizes[cReqUnits < RT_ELEMENTS(pExecMemAllocator->aStatSizes) ? cReqUnits : 0]);
984 for (unsigned iIteration = 0;; iIteration++)
985 {
986 if ( cbReq * 2 <= pExecMemAllocator->cbFree
987 || (cReqUnits == 1 || pExecMemAllocator->cbFree >= IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE) )
988 {
989 uint32_t const cChunks = pExecMemAllocator->cChunks;
990 uint32_t const idxChunkHint = pExecMemAllocator->idxChunkHint < cChunks ? pExecMemAllocator->idxChunkHint : 0;
991
992 /*
993 * We do two passes here: in the first pass we skip chunks with fewer than cReqUnits * 16
994 * free units; the second pass then checks the chunks that were skipped in the first pass.
995 */
996 for (uint32_t cMinFreePass = cReqUnits == 1 ? cReqUnits : cReqUnits * 16, cMaxFreePass = UINT32_MAX;;)
997 {
998 for (uint32_t idxChunk = idxChunkHint; idxChunk < cChunks; idxChunk++)
999 if ( pExecMemAllocator->aChunks[idxChunk].cFreeUnits >= cMinFreePass
1000 && pExecMemAllocator->aChunks[idxChunk].cFreeUnits <= cMaxFreePass)
1001 {
1002 PIEMNATIVEINSTR const pRet = iemExecMemAllocatorAllocUnitsInChunkInner(pExecMemAllocator, idxChunk,
1003 cReqUnits, pTb, ppaExec, ppChunkCtx);
1004 if (pRet)
1005 {
1006 STAM_PROFILE_STOP(&pExecMemAllocator->StatAlloc, a);
1007#ifdef VBOX_WITH_STATISTICS
1008 pExecMemAllocator->cbUnusable += (cReqUnits << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT) - cbReq;
1009#endif
1010 return pRet;
1011 }
1012 }
1013 for (uint32_t idxChunk = 0; idxChunk < idxChunkHint; idxChunk++)
1014 if ( pExecMemAllocator->aChunks[idxChunk].cFreeUnits >= cMinFreePass
1015 && pExecMemAllocator->aChunks[idxChunk].cFreeUnits <= cMaxFreePass)
1016 {
1017 PIEMNATIVEINSTR const pRet = iemExecMemAllocatorAllocUnitsInChunkInner(pExecMemAllocator, idxChunk,
1018 cReqUnits, pTb, ppaExec, ppChunkCtx);
1019 if (pRet)
1020 {
1021 STAM_PROFILE_STOP(&pExecMemAllocator->StatAlloc, a);
1022#ifdef VBOX_WITH_STATISTICS
1023 pExecMemAllocator->cbUnusable += (cReqUnits << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT) - cbReq;
1024#endif
1025 return pRet;
1026 }
1027 }
1028 if (cMinFreePass <= cReqUnits * 2)
1029 break;
1030 cMaxFreePass = cMinFreePass - 1;
1031 cMinFreePass = cReqUnits * 2;
1032 }
1033 }
1034
1035 /*
1036 * Can we grow it with another chunk?
1037 */
1038 if (pExecMemAllocator->cChunks < pExecMemAllocator->cMaxChunks)
1039 {
1040 int rc = iemExecMemAllocatorGrow(pVCpu, pExecMemAllocator);
1041 AssertLogRelRCReturn(rc, NULL);
1042
1043 uint32_t const idxChunk = pExecMemAllocator->cChunks - 1;
1044 PIEMNATIVEINSTR const pRet = iemExecMemAllocatorAllocUnitsInChunkInner(pExecMemAllocator, idxChunk, cReqUnits, pTb,
1045 ppaExec, ppChunkCtx);
1046 if (pRet)
1047 {
1048 STAM_PROFILE_STOP(&pExecMemAllocator->StatAlloc, a);
1049#ifdef VBOX_WITH_STATISTICS
1050 pExecMemAllocator->cbUnusable += (cReqUnits << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT) - cbReq;
1051#endif
1052 return pRet;
1053 }
1054 AssertFailed();
1055 }
1056
1057 /*
1058 * Try prune native TBs once.
1059 */
1060 if (iIteration == 0)
1061 {
1062#ifdef IEMEXECMEM_ALT_SUB_WITH_ALT_PRUNING
1063 iemExecMemAllocatorPrune(pVCpu, pExecMemAllocator);
1064#else
1065 /* No header included in the instruction count here. */
1066 uint32_t const cNeededInstrs = RT_ALIGN_32(cbReq, IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE) / sizeof(IEMNATIVEINSTR);
1067 iemTbAllocatorFreeupNativeSpace(pVCpu, cNeededInstrs);
1068#endif
1069 }
1070 else
1071 {
1072 STAM_REL_COUNTER_INC(&pVCpu->iem.s.StatNativeExecMemInstrBufAllocFailed);
1073 STAM_PROFILE_STOP(&pExecMemAllocator->StatAlloc, a);
1074 return NULL;
1075 }
1076 }
1077}
1078
1079
1080/** This is a hook to ensure the instruction cache is properly flushed before the code in the memory
1081 * given by @a pv and @a cb is executed. */
1082DECLHIDDEN(void) iemExecMemAllocatorReadyForUse(PVMCPUCC pVCpu, void *pv, size_t cb) RT_NOEXCEPT
1083{
1084#ifdef RT_OS_DARWIN
1085 /*
1086 * We need to synchronize the stuff we wrote to the data cache with the
1087 * instruction cache, since these aren't coherent on arm (or at least not
1088 * on Apple Mn CPUs).
1089 *
1090 * Note! Since we don't share any JIT'ed code with the other CPUs, we don't
1091 * really care whether the dcache is fully flushed back to memory. It
1092 * only needs to hit the level 2 cache, which the level 1 instruction
1093 * and data caches seem to be sharing. In ARM terms, we need to reach
1094 * a point of unification (PoU), rather than a point of coherency (PoC).
1095 *
1096 * https://developer.apple.com/documentation/apple-silicon/porting-just-in-time-compilers-to-apple-silicon
1097 *
1098 * https://developer.arm.com/documentation/den0013/d/Caches/Point-of-coherency-and-unification
1099 *
1100 * Experimenting with the approach used by sys_icache_invalidate() and
1101 * tweaking it a little, could let us shave off a bit of effort. The thing
1102 * that slows the apple code down on an M2 (running Sonoma 13.4), seems to be
1103 * the 'DSB ISH' instructions performed every 20 icache line flushes.
1104 * Skipping these saves ~100ns or more per TB when profiling the native
1105 * recompiler on the TBs from a win11 full boot-desktop-shutdown sequence.
1106 * Thus we will leave DCACHE_ICACHE_SYNC_WITH_WITH_IVAU_DSB undefined if we
1107 * can.
1108 *
1109 * There appears not to be much difference between DSB options 'ISH',
1110 * 'ISHST', 'NSH' and 'NSHST'. The latter is theoretically all we need, so
1111 * we'll use that one.
1112 *
1113 * See https://developer.arm.com/documentation/100941/0101/Barriers for
1114 * details on the barrier options.
1115 *
1116 * Note! The CFG value "/IEM/HostICacheInvalidationViaHostAPI" can be used
1117 * to disable the experimental code should it misbehave.
1118 */
1119 uint8_t const fHostICacheInvalidation = pVCpu->iem.s.fHostICacheInvalidation;
1120 if (!(fHostICacheInvalidation & IEMNATIVE_ICACHE_F_USE_HOST_API))
1121 {
1122# define DCACHE_ICACHE_SYNC_DSB_OPTION "nshst"
1123/*# define DCACHE_ICACHE_SYNC_WITH_WITH_IVAU_DSB*/
1124
1125 /* Skipping this is fine, but doesn't impact perf much. */
1126 __asm__ __volatile__("dsb " DCACHE_ICACHE_SYNC_DSB_OPTION);
1127
1128 /* Invalidate the icache for the range [pv,pv+cb). */
1129# ifdef DCACHE_ICACHE_SYNC_WITH_WITH_IVAU_DSB
1130 size_t const cIvauDsbEvery = 20;
1131 unsigned cDsb = cIvauDsbEvery;
1132# endif
1133 size_t const cbCacheLine = 64;
1134 size_t cbInvalidate = cb + ((uintptr_t)pv & (cbCacheLine - 1));
1135 size_t cCacheLines = RT_ALIGN_Z(cbInvalidate, cbCacheLine) / cbCacheLine;
1136 uintptr_t uPtr = (uintptr_t)pv & ~(uintptr_t)(cbCacheLine - 1);
1137 for (;; uPtr += cbCacheLine)
1138 {
1139 __asm__ /*__volatile__*/("ic ivau, %0" : : "r" (uPtr));
1140 cCacheLines -= 1;
1141 if (!cCacheLines)
1142 break;
1143# ifdef DCACHE_ICACHE_SYNC_WITH_WITH_IVAU_DSB
1144 cDsb -= 1;
1145 if (cDsb != 0)
1146 { /* likely */ }
1147 else
1148 {
1149 __asm__ __volatile__("dsb " DCACHE_ICACHE_SYNC_DSB_OPTION);
1150 cDsb = cIvauDsbEvery;
1151 }
1152# endif
1153 }
1154
1155 /*
1156 * The DSB here is non-optional it seems.
1157 *
1158 * The following ISB can be omitted on M2 without any obvious side effects;
1159 * it produces better numbers in the above mentioned profiling scenario.
1160 * This could be related to the kHasICDSB flag in cpu_capabilities.h,
1161 * but it doesn't look like that flag is set here (M2, Sonoma 13.4).
1162 *
1163 * I've made the inclusion of this final ISB configurable, with a default
1164 * of skipping it.
1165 */
1166 if (!(fHostICacheInvalidation & IEMNATIVE_ICACHE_F_END_WITH_ISH))
1167 __asm__ __volatile__("dsb " DCACHE_ICACHE_SYNC_DSB_OPTION
1168 ::: "memory");
1169 else
1170 __asm__ __volatile__("dsb " DCACHE_ICACHE_SYNC_DSB_OPTION "\n\t"
1171 "isb"
1172 ::: "memory");
1173 }
1174 else
1175 sys_icache_invalidate(pv, cb);
1176
1177#elif defined(RT_OS_LINUX) && defined(RT_ARCH_ARM64)
1178 RT_NOREF(pVCpu);
1179
1180 /* There is __builtin___clear_cache() but it flushes both the instruction and data cache, so do it manually. */
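    /* CTR_EL0[3:0] (IminLine) is the log2 of the smallest icache line size in
       4 byte words, so the line size in bytes computed below is 4 << IminLine
       (64 bytes on most recent arm64 cores). */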
1181 static uint32_t s_u32CtrEl0 = 0;
1182 if (!s_u32CtrEl0)
1183 asm volatile ("mrs %0, ctr_el0":"=r" (s_u32CtrEl0));
1184 uintptr_t cbICacheLine = (uintptr_t)4 << (s_u32CtrEl0 & 0xf);
1185
1186 uintptr_t pb = (uintptr_t)pv & ~(cbICacheLine - 1);
1187 for (; pb < (uintptr_t)pv + cb; pb += cbICacheLine)
1188 asm volatile ("ic ivau, %0" : : "r" (pb) : "memory");
1189
1190 asm volatile ("dsb ish\n\t isb\n\t" : : : "memory");
1191
1192#else
1193 RT_NOREF(pVCpu, pv, cb);
1194#endif
1195}
1196
1197
1198/**
1199 * Frees executable memory.
1200 */
1201DECLHIDDEN(void) iemExecMemAllocatorFree(PVMCPU pVCpu, void *pv, size_t cb) RT_NOEXCEPT
1202{
1203 PIEMEXECMEMALLOCATOR pExecMemAllocator = pVCpu->iem.s.pExecMemAllocatorR3;
1204 Assert(pExecMemAllocator && pExecMemAllocator->uMagic == IEMEXECMEMALLOCATOR_MAGIC);
1205 AssertPtr(pv);
1206#ifdef VBOX_WITH_STATISTICS
1207 size_t const cbOrig = cb;
1208#endif
1209#ifndef IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER
1210 Assert(!((uintptr_t)pv & (IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE - 1)));
1211
1212 /* Align the size as we did when allocating the block. */
1213 cb = RT_ALIGN_Z(cb, IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE);
1214
1215#else
1216 PIEMEXECMEMALLOCHDR pHdr = (PIEMEXECMEMALLOCHDR)pv - 1;
1217 Assert(!((uintptr_t)pHdr & (IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE - 1)));
1218 AssertReturnVoid(pHdr->uMagic == IEMEXECMEMALLOCHDR_MAGIC);
1219 uint32_t const idxChunk = pHdr->idxChunk;
1220 AssertReturnVoid(idxChunk < pExecMemAllocator->cChunks);
1221 pv = pHdr;
1222
1223 /* Adjust and align the size to cover the whole allocation area. */
1224 cb = RT_ALIGN_Z(cb + sizeof(*pHdr), IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE);
1225#endif
1226
1227 /* Free it / assert sanity. */
1228 bool fFound = false;
1229 uint32_t const cbChunk = pExecMemAllocator->cbChunk;
1230#ifndef IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER
1231 uint32_t const cChunks = pExecMemAllocator->cChunks;
1232 for (uint32_t idxChunk = 0; idxChunk < cChunks; idxChunk++)
1233#endif
1234 {
1235 uintptr_t const offChunk = (uintptr_t)pv - (uintptr_t)pExecMemAllocator->aChunks[idxChunk].pvChunkRx;
1236 fFound = offChunk < cbChunk;
1237 if (fFound)
1238 {
1239 uint32_t const idxFirst = (uint32_t)offChunk >> IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT;
1240 uint32_t const cReqUnits = (uint32_t)cb >> IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT;
1241
1242 /* Check that it's valid and free it. */
1243 uint64_t * const pbmAlloc = &pExecMemAllocator->pbmAlloc[pExecMemAllocator->cBitmapElementsPerChunk * idxChunk];
1244 AssertReturnVoid(ASMBitTest(pbmAlloc, idxFirst));
1245 for (uint32_t i = 1; i < cReqUnits; i++)
1246 AssertReturnVoid(ASMBitTest(pbmAlloc, idxFirst + i));
1247 ASMBitClearRange(pbmAlloc, idxFirst, idxFirst + cReqUnits);
1248
1249 /* Invalidate the header using the writeable memory view. */
1250 pHdr = (PIEMEXECMEMALLOCHDR)((uintptr_t)pExecMemAllocator->aChunks[idxChunk].pvChunkRw + offChunk);
1251#ifdef IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER
1252 pHdr->uMagic = 0;
1253 pHdr->idxChunk = 0;
1254 pHdr->pTb = NULL;
1255#endif
1256 pExecMemAllocator->aChunks[idxChunk].cFreeUnits += cReqUnits;
1257 pExecMemAllocator->aChunks[idxChunk].idxFreeHint = idxFirst;
1258
1259 /* Update the stats. */
1260 pExecMemAllocator->cbAllocated -= cb;
1261 pExecMemAllocator->cbFree += cb;
1262 pExecMemAllocator->cAllocations -= 1;
1263#ifdef VBOX_WITH_STATISTICS
1264 pExecMemAllocator->cbUnusable -= (cReqUnits << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT) - cbOrig;
1265#endif
1266 return;
1267 }
1268 }
1269 AssertFailed();
1270}
1271
1272
1273/**
1274 * Interface used by iemNativeRecompileAttachExecMemChunkCtx and unwind info
1275 * generators.
1276 */
1277DECLHIDDEN(PIEMNATIVEINSTR)
1278iemExecMemAllocatorAllocFromChunk(PVMCPU pVCpu, uint32_t idxChunk, uint32_t cbReq, PIEMNATIVEINSTR *ppaExec)
1279{
1280 PIEMEXECMEMALLOCATOR pExecMemAllocator = pVCpu->iem.s.pExecMemAllocatorR3;
1281 AssertReturn(idxChunk < pExecMemAllocator->cChunks, NULL);
1282 Assert(cbReq < _1M);
1283 return iemExecMemAllocatorAllocBytesInChunk(pExecMemAllocator, idxChunk, cbReq, ppaExec);
1284}
1285
1286
1287/**
1288 * For getting the per-chunk context detailing common code for a TB.
1289 *
1290 * This is for use by the disassembler.
1291 */
1292DECLHIDDEN(PCIEMNATIVEPERCHUNKCTX) iemExecMemGetTbChunkCtx(PVMCPU pVCpu, PCIEMTB pTb)
1293{
1294 PIEMEXECMEMALLOCATOR pExecMemAllocator = pVCpu->iem.s.pExecMemAllocatorR3;
1295 if ((pTb->fFlags & IEMTB_F_TYPE_MASK) == IEMTB_F_TYPE_NATIVE)
1296 {
1297 uintptr_t const uAddress = (uintptr_t)pTb->Native.paInstructions;
1298 uint32_t const cbChunk = pExecMemAllocator->cbChunk;
1299 uint32_t idxChunk = pExecMemAllocator->cChunks;
1300 while (idxChunk-- > 0)
1301 if (uAddress - (uintptr_t)pExecMemAllocator->aChunks[idxChunk].pvChunkRx < cbChunk)
1302 return pExecMemAllocator->aChunks[idxChunk].pCtx;
1303 }
1304 return NULL;
1305}
1306
1307
1308#ifdef IN_RING3
1309# ifdef RT_OS_WINDOWS
1310
1311/**
1312 * Initializes the unwind info structures for windows hosts.
1313 */
1314static int
1315iemExecMemAllocatorInitAndRegisterUnwindInfoForChunk(PVMCPUCC pVCpu, PIEMEXECMEMALLOCATOR pExecMemAllocator,
1316 void *pvChunk, uint32_t idxChunk)
1317{
1318 RT_NOREF(pVCpu);
1319
1320# ifdef RT_ARCH_AMD64
1321 /*
1322 * The AMD64 unwind opcodes.
1323 *
1324 * This is a program that starts with RSP after a RET instruction that
1325 * ends up in recompiled code, and the operations we describe here will
1326 * restore all non-volatile registers and bring RSP back to where our
1327 * RET address is. This means it's reverse order from what happens in
1328 * the prologue.
1329 *
1330 * Note! Using a frame register approach here both because we have one
1331 * and mainly because the UWOP_ALLOC_LARGE argument values
1332 * would be a pain to write initializers for. On the positive
1333 * side, we're impervious to changes in the stack variable
1334 * area and can deal with dynamic stack allocations if necessary.
1335 */
1336 static const IMAGE_UNWIND_CODE s_aOpcodes[] =
1337 {
1338 { { 16, IMAGE_AMD64_UWOP_SET_FPREG, 0 } }, /* RSP = RBP - FrameOffset * 10 (0x60) */
1339 { { 16, IMAGE_AMD64_UWOP_ALLOC_SMALL, 0 } }, /* RSP += 8; */
1340 { { 14, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_x15 } }, /* R15 = [RSP]; RSP += 8; */
1341 { { 12, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_x14 } }, /* R14 = [RSP]; RSP += 8; */
1342 { { 10, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_x13 } }, /* R13 = [RSP]; RSP += 8; */
1343 { { 8, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_x12 } }, /* R12 = [RSP]; RSP += 8; */
1344 { { 7, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_xDI } }, /* RDI = [RSP]; RSP += 8; */
1345 { { 6, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_xSI } }, /* RSI = [RSP]; RSP += 8; */
1346 { { 5, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_xBX } }, /* RBX = [RSP]; RSP += 8; */
1347 { { 4, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_xBP } }, /* RBP = [RSP]; RSP += 8; */
1348 };
1349 union
1350 {
1351 IMAGE_UNWIND_INFO Info;
1352 uint8_t abPadding[RT_UOFFSETOF(IMAGE_UNWIND_INFO, aOpcodes) + 16];
1353 } s_UnwindInfo =
1354 {
1355 {
1356 /* .Version = */ 1,
1357 /* .Flags = */ 0,
1358 /* .SizeOfProlog = */ 16, /* whatever */
1359 /* .CountOfCodes = */ RT_ELEMENTS(s_aOpcodes),
1360 /* .FrameRegister = */ X86_GREG_xBP,
1361 /* .FrameOffset = */ (-IEMNATIVE_FP_OFF_LAST_PUSH + 8) / 16 /* we're off by one slot. sigh. */,
1362 }
1363 };
1364 AssertCompile(-IEMNATIVE_FP_OFF_LAST_PUSH < 240 && -IEMNATIVE_FP_OFF_LAST_PUSH > 0);
1365 AssertCompile((-IEMNATIVE_FP_OFF_LAST_PUSH & 0xf) == 8);
1366
1367# elif defined(RT_ARCH_ARM64)
1368 /*
1369 * The ARM64 unwind codes.
1370 *
1371 * See https://learn.microsoft.com/en-us/cpp/build/arm64-exception-handling?view=msvc-170
1372 */
1373 static const uint8_t s_abOpcodes[] =
1374 {
1375 /* Prolog: None. */
1376 0xe5, /* end_c */
1377 /* Epilog / unwind info: */
1378 (IEMNATIVE_FRAME_VAR_SIZE + IEMNATIVE_FRAME_ALIGN_SIZE) / 16, /* alloc_s */
1379 0xc8, 0x00, /* save_regp x19, x20, [sp + #0] */
1380 0xc8, 0x82, /* save_regp x21, x22, [sp + #2*8] */
1381 0xc9, 0x04, /* save_regp x23, x24, [sp + #4*8] */
1382 0xc9, 0x86, /* save_regp x25, x26, [sp + #6*8] */
1383 0xca, 0x08, /* save_regp x27, x28, [sp + #8*8] */
1384 0x4a, /* save_fplr x29, x30, [sp + #10*8] */
1385 12*8 / 16, /* alloc_s */
1386 0xc4, /* end */
1387 0xc5 /* nop */
1388 };
1389 AssertCompile(!(sizeof(s_abOpcodes) & 3));
1390 AssertCompile(!((IEMNATIVE_FRAME_VAR_SIZE + IEMNATIVE_FRAME_ALIGN_SIZE) & 15));
1391 AssertCompile((IEMNATIVE_FRAME_VAR_SIZE + IEMNATIVE_FRAME_ALIGN_SIZE) < 512);
1392
1393# else
1394# error "Port me!"
1395# endif
1396
1397 /*
1398 * Calc how much space we need and allocate it off the exec heap.
1399 */
1400# ifdef RT_ARCH_ARM64
1401 unsigned const cbPerEntry = _1M - 4;
1402 unsigned const cFunctionEntries = (pExecMemAllocator->cbChunk + cbPerEntry - 1) / cbPerEntry;
1403 unsigned const cbUnwindInfo = (sizeof(uint32_t) * 2 + sizeof(s_abOpcodes)) * cFunctionEntries;
1404# else
1405 unsigned const cbUnwindInfo = sizeof(s_aOpcodes) + RT_UOFFSETOF(IMAGE_UNWIND_INFO, aOpcodes);
1406 unsigned const cFunctionEntries = 1;
1407# endif
1408 unsigned const cbNeeded = sizeof(IMAGE_RUNTIME_FUNCTION_ENTRY) * cFunctionEntries + cbUnwindInfo;
1409 PIMAGE_RUNTIME_FUNCTION_ENTRY const paFunctions
1410 = (PIMAGE_RUNTIME_FUNCTION_ENTRY)iemExecMemAllocatorAllocBytesInChunk(pExecMemAllocator, idxChunk, cbNeeded, NULL);
1411 AssertReturn(paFunctions, VERR_INTERNAL_ERROR_5);
1412 pExecMemAllocator->aChunks[idxChunk].pvUnwindInfo = paFunctions;
1413
1414 /*
1415 * Initialize the structures.
1416 */
1417# ifdef RT_ARCH_AMD64
1418 PIMAGE_UNWIND_INFO const pInfo = (PIMAGE_UNWIND_INFO)&paFunctions[cFunctionEntries];
1419
1420 paFunctions[0].BeginAddress = 0;
1421 paFunctions[0].EndAddress = pExecMemAllocator->cbChunk;
1422 paFunctions[0].UnwindInfoAddress = (uint32_t)((uintptr_t)pInfo - (uintptr_t)pvChunk);
1423
1424 memcpy(pInfo, &s_UnwindInfo, RT_UOFFSETOF(IMAGE_UNWIND_INFO, aOpcodes));
1425 memcpy(&pInfo->aOpcodes[0], s_aOpcodes, sizeof(s_aOpcodes));
1426
1427# elif defined(RT_ARCH_ARM64)
1428
1429 PIMAGE_ARM64_RUNTIME_FUNCTION_ENTRY_XDATA pInfo = (PIMAGE_ARM64_RUNTIME_FUNCTION_ENTRY_XDATA)&paFunctions[cFunctionEntries];
1430 for (uint32_t i = 0, off = 0; i < cFunctionEntries; i++)
1431 {
1432 paFunctions[i].BeginAddress = off;
1433 paFunctions[i].UnwindData = (uint32_t)((uintptr_t)pInfo - (uintptr_t)pvChunk) | PdataRefToFullXdata;
1434
1435 uint32_t const cFunctionLengthInWords = RT_MAX(cbPerEntry, pExecMemAllocator->cbChunk - off) / 4;
1436 pInfo[0].FunctionLength = cFunctionLengthInWords;
1437 pInfo[0].Version = 0;
1438 pInfo[0].ExceptionDataPresent = 0;
1439 pInfo[0].EpilogInHeader = 0;
1440 pInfo[0].EpilogCount = 1;
1441 pInfo[0].CodeWords = sizeof(s_abOpcodes) / sizeof(uint32_t);
1442
1443 pInfo[1].EpilogInfo.EpilogStartOffset = cFunctionLengthInWords;
1444 pInfo[1].EpilogInfo.Reserved = 0;
1445 pInfo[1].EpilogInfo.EpilogStartIndex = 1;
1446 pInfo += 2;
1447
1448 memcpy(pInfo, s_abOpcodes, sizeof(s_abOpcodes));
1449 pInfo += sizeof(s_abOpcodes) / sizeof(*pInfo);
1450 }
1451
1452# else
1453# error "Port me!"
1454# endif
1455
1456 /*
1457 * Register them.
1458 */
1459 uint8_t fRet = RtlAddFunctionTable(paFunctions, cFunctionEntries, (uintptr_t)pvChunk);
1460 AssertReturn(fRet, VERR_INTERNAL_ERROR_3); /* Nothing to clean up on failure, since it's within the chunk itself. */
1461
1462 return VINF_SUCCESS;
1463}
1464
1465
1466# else /* !RT_OS_WINDOWS */
1467
1468/**
1469 * Emits a LEB128 encoded value between -0x2000 and 0x2000 (both exclusive).
1470 */
1471DECLINLINE(RTPTRUNION) iemDwarfPutLeb128(RTPTRUNION Ptr, int32_t iValue)
1472{
1473 if (iValue >= 64)
1474 {
1475 Assert(iValue < 0x2000);
1476 *Ptr.pb++ = ((uint8_t)iValue & 0x7f) | 0x80;
1477 *Ptr.pb++ = (uint8_t)(iValue >> 7) & 0x3f;
1478 }
1479 else if (iValue >= 0)
1480 *Ptr.pb++ = (uint8_t)iValue;
1481 else if (iValue > -64)
1482 *Ptr.pb++ = ((uint8_t)iValue & 0x3f) | 0x40;
1483 else
1484 {
1485 Assert(iValue > -0x2000);
1486 *Ptr.pb++ = ((uint8_t)iValue & 0x7f) | 0x80;
1487 *Ptr.pb++ = ((uint8_t)(iValue >> 7) & 0x3f) | 0x40;
1488 }
1489 return Ptr;
1490}
1491
1492
1493/**
1494 * Emits an ULEB128 encoded value (up to 64-bit wide).
1495 */
1496DECLINLINE(RTPTRUNION) iemDwarfPutUleb128(RTPTRUNION Ptr, uint64_t uValue)
1497{
1498 while (uValue >= 0x80)
1499 {
1500 *Ptr.pb++ = ((uint8_t)uValue & 0x7f) | 0x80;
1501 uValue >>= 7;
1502 }
1503 *Ptr.pb++ = (uint8_t)uValue;
1504 return Ptr;
1505}
1506
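/* Encoding examples for the two emitters above: iemDwarfPutUleb128(Ptr, 300)
 * emits the two bytes 0xac 0x02, while iemDwarfPutLeb128(Ptr, -8) - the data
 * alignment factor used in the CIE below - fits in the single byte 0x78. */
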
1507
1508/**
1509 * Emits a CFA rule as register @a uReg + offset @a off.
1510 */
1511DECLINLINE(RTPTRUNION) iemDwarfPutCfaDefCfa(RTPTRUNION Ptr, uint32_t uReg, uint32_t off)
1512{
1513 *Ptr.pb++ = DW_CFA_def_cfa;
1514 Ptr = iemDwarfPutUleb128(Ptr, uReg);
1515 Ptr = iemDwarfPutUleb128(Ptr, off);
1516 return Ptr;
1517}
1518
1519
1520/**
1521 * Emits a register (@a uReg) save location:
1522 * CFA + @a off * data_alignment_factor
1523 */
1524DECLINLINE(RTPTRUNION) iemDwarfPutCfaOffset(RTPTRUNION Ptr, uint32_t uReg, uint32_t off)
1525{
1526 if (uReg < 0x40)
1527 *Ptr.pb++ = DW_CFA_offset | uReg;
1528 else
1529 {
1530 *Ptr.pb++ = DW_CFA_offset_extended;
1531 Ptr = iemDwarfPutUleb128(Ptr, uReg);
1532 }
1533 Ptr = iemDwarfPutUleb128(Ptr, off);
1534 return Ptr;
1535}
1536
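/* Example: assuming the standard System V AMD64 DWARF numbering where
 * DWREG_AMD64_RBP is register 6, iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_RBP, 2)
 * emits 0x86 0x02 (DW_CFA_offset | 6 followed by ULEB128(2)), which with the
 * data alignment factor of -8 from the CIE means RBP is saved at CFA - 16. */
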
1537
1538# if 0 /* unused */
1539/**
1540 * Emits a register (@a uReg) save location, using signed offset:
1541 * CFA + @a offSigned * data_alignment_factor
1542 */
1543DECLINLINE(RTPTRUNION) iemDwarfPutCfaSignedOffset(RTPTRUNION Ptr, uint32_t uReg, int32_t offSigned)
1544{
1545 *Ptr.pb++ = DW_CFA_offset_extended_sf;
1546 Ptr = iemDwarfPutUleb128(Ptr, uReg);
1547 Ptr = iemDwarfPutLeb128(Ptr, offSigned);
1548 return Ptr;
1549}
1550# endif
1551
1552
1553/**
1554 * Initializes the unwind info section for non-windows hosts.
1555 */
1556static int
1557iemExecMemAllocatorInitAndRegisterUnwindInfoForChunk(PVMCPUCC pVCpu, PIEMEXECMEMALLOCATOR pExecMemAllocator,
1558 void *pvChunk, uint32_t idxChunk)
1559{
1560 PIEMEXECMEMCHUNKEHFRAME const pEhFrame = &pExecMemAllocator->paEhFrames[idxChunk];
1561 pExecMemAllocator->aChunks[idxChunk].pvUnwindInfo = pEhFrame; /* not necessary, but whatever */
1562
1563 RTPTRUNION Ptr = { pEhFrame->abEhFrame };
1564
1565 /*
1566 * Generate the CIE first.
1567 */
1568# ifdef IEMNATIVE_USE_LIBUNWIND /* libunwind (llvm, darwin) only supports v1 and v3. */
1569 uint8_t const iDwarfVer = 3;
1570# else
1571 uint8_t const iDwarfVer = 4;
1572# endif
1573 RTPTRUNION const PtrCie = Ptr;
1574 *Ptr.pu32++ = 123; /* The CIE length will be determined later. */
1575 *Ptr.pu32++ = 0 /*UINT32_MAX*/; /* I'm a CIE in .eh_frame speak. */
1576 *Ptr.pb++ = iDwarfVer; /* DWARF version */
1577 *Ptr.pb++ = 0; /* Augmentation. */
1578 if (iDwarfVer >= 4)
1579 {
1580 *Ptr.pb++ = sizeof(uintptr_t); /* Address size. */
1581 *Ptr.pb++ = 0; /* Segment selector size. */
1582 }
1583# ifdef RT_ARCH_AMD64
1584 Ptr = iemDwarfPutLeb128(Ptr, 1); /* Code alignment factor (LEB128 = 1). */
1585# else
1586 Ptr = iemDwarfPutLeb128(Ptr, 4); /* Code alignment factor (LEB128 = 4). */
1587# endif
1588 Ptr = iemDwarfPutLeb128(Ptr, -8); /* Data alignment factor (LEB128 = -8). */
1589# ifdef RT_ARCH_AMD64
1590 Ptr = iemDwarfPutUleb128(Ptr, DWREG_AMD64_RA); /* Return address column (ULEB128) */
1591# elif defined(RT_ARCH_ARM64)
1592 Ptr = iemDwarfPutUleb128(Ptr, DWREG_ARM64_LR); /* Return address column (ULEB128) */
1593# else
1594# error "port me"
1595# endif
1596 /* Initial instructions: */
1597# ifdef RT_ARCH_AMD64
1598 Ptr = iemDwarfPutCfaDefCfa(Ptr, DWREG_AMD64_RBP, 16); /* CFA = RBP + 0x10 - first stack parameter */
1599 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_RA, 1); /* Ret RIP = [CFA + 1*-8] */
1600 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_RBP, 2); /* RBP = [CFA + 2*-8] */
1601 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_RBX, 3); /* RBX = [CFA + 3*-8] */
1602 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_R12, 4); /* R12 = [CFA + 4*-8] */
1603 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_R13, 5); /* R13 = [CFA + 5*-8] */
1604 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_R14, 6); /* R14 = [CFA + 6*-8] */
1605 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_R15, 7); /* R15 = [CFA + 7*-8] */
1606# elif defined(RT_ARCH_ARM64)
1607# if 1
1608 Ptr = iemDwarfPutCfaDefCfa(Ptr, DWREG_ARM64_BP, 16); /* CFA = BP + 0x10 - first stack parameter */
1609# else
1610 Ptr = iemDwarfPutCfaDefCfa(Ptr, DWREG_ARM64_SP, IEMNATIVE_FRAME_VAR_SIZE + IEMNATIVE_FRAME_SAVE_REG_SIZE);
1611# endif
1612 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_LR, 1); /* Ret PC = [CFA + 1*-8] */
1613 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_BP, 2); /* Ret BP = [CFA + 2*-8] */
1614 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X28, 3); /* X28 = [CFA + 3*-8] */
1615 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X27, 4); /* X27 = [CFA + 4*-8] */
1616 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X26, 5); /* X26 = [CFA + 5*-8] */
1617 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X25, 6); /* X25 = [CFA + 6*-8] */
1618 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X24, 7); /* X24 = [CFA + 7*-8] */
1619 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X23, 8); /* X23 = [CFA + 8*-8] */
1620 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X22, 9); /* X22 = [CFA + 9*-8] */
1621 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X21, 10); /* X21 = [CFA +10*-8] */
1622 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X20, 11); /* X20 = [CFA +11*-8] */
1623 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X19, 12); /* X19 = [CFA +12*-8] */
1624 AssertCompile(IEMNATIVE_FRAME_SAVE_REG_SIZE / 8 == 12);
1625 /** @todo do we need to do something about clearing DWREG_ARM64_RA_SIGN_STATE or something? */
1626# else
1627# error "port me"
1628# endif
1629 while ((Ptr.u - PtrCie.u) & 3)
1630 *Ptr.pb++ = DW_CFA_nop;
1631 /* Finalize the CIE size. */
1632 *PtrCie.pu32 = Ptr.u - PtrCie.u - sizeof(uint32_t);
1633
1634 /*
1635 * Generate an FDE for the whole chunk area.
1636 */
1637# ifdef IEMNATIVE_USE_LIBUNWIND
1638 pEhFrame->offFda = Ptr.u - (uintptr_t)&pEhFrame->abEhFrame[0];
1639# endif
1640 RTPTRUNION const PtrFde = Ptr;
1641 *Ptr.pu32++ = 123; /* The FDE length will be determined later. */
1642 *Ptr.pu32 = Ptr.u - PtrCie.u; /* Negated self relative CIE address. */
1643 Ptr.pu32++;
1644 *Ptr.pu64++ = (uintptr_t)pvChunk; /* Absolute start PC of this FDE. */
1645 *Ptr.pu64++ = pExecMemAllocator->cbChunk; /* PC range length for this FDE. */
1646 # if 0 /* not required for recent libunwind.dylib nor recent libgcc/glibc. */
1647 *Ptr.pb++ = DW_CFA_nop;
1648# endif
1649 while ((Ptr.u - PtrFde.u) & 3)
1650 *Ptr.pb++ = DW_CFA_nop;
1651 /* Finalize the FDE size. */
1652 *PtrFde.pu32 = Ptr.u - PtrFde.u - sizeof(uint32_t);
1653
1654 /* Terminator entry. */
1655 *Ptr.pu32++ = 0;
1656 *Ptr.pu32++ = 0; /* just to be sure... */
1657 Assert(Ptr.u - (uintptr_t)&pEhFrame->abEhFrame[0] <= sizeof(pEhFrame->abEhFrame));
1658
1659 /*
1660 * Register it.
1661 */
1662# ifdef IEMNATIVE_USE_LIBUNWIND
1663 __register_frame(&pEhFrame->abEhFrame[pEhFrame->offFda]);
1664# else
1665 memset(pEhFrame->abObject, 0xf6, sizeof(pEhFrame->abObject)); /* color the memory to better spot usage */
1666 __register_frame_info(pEhFrame->abEhFrame, pEhFrame->abObject);
1667# endif
1668
1669# ifdef IEMNATIVE_USE_GDB_JIT
1670 /*
1671 * Now for telling GDB about this (experimental).
1672 *
1673 * This seems to work best with ET_DYN.
1674 */
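 /*
  * Rough picture of what gets built here (a sketch of the code below, not a spec):
  * a tiny in-memory ELF image (GDBJITSYMFILE) is placed at the start of the chunk
  * and the remainder of the chunk is described as its .text section, so GDB can
  * attribute recompiled code to something resembling a loaded module:
  *
  *   GDBJITSYMFILE: ELF header
  *                  section headers (.eh_frame, .shstrtab, .symtab, ...)
  *                  [program headers + .dynamic - ET_DYN builds only]
  *                  string/symbol tables and a copy of the .eh_frame blob
  *   rest of chunk: .text (the actual recompiled code)
  */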
1675 GDBJITSYMFILE * const pSymFile = (GDBJITSYMFILE *)iemExecMemAllocatorAllocBytesInChunk(pExecMemAllocator, idxChunk,
1676 sizeof(GDBJITSYMFILE), NULL);
1677 AssertReturn(pSymFile, VERR_INTERNAL_ERROR_5);
1678 unsigned const offSymFileInChunk = (uintptr_t)pSymFile - (uintptr_t)pvChunk;
1679
1680 RT_ZERO(*pSymFile);
1681
1682 /*
1683 * The ELF header:
1684 */
1685 pSymFile->EHdr.e_ident[0] = ELFMAG0;
1686 pSymFile->EHdr.e_ident[1] = ELFMAG1;
1687 pSymFile->EHdr.e_ident[2] = ELFMAG2;
1688 pSymFile->EHdr.e_ident[3] = ELFMAG3;
1689 pSymFile->EHdr.e_ident[EI_VERSION] = EV_CURRENT;
1690 pSymFile->EHdr.e_ident[EI_CLASS] = ELFCLASS64;
1691 pSymFile->EHdr.e_ident[EI_DATA] = ELFDATA2LSB;
1692 pSymFile->EHdr.e_ident[EI_OSABI] = ELFOSABI_NONE;
1693# ifdef IEMNATIVE_USE_GDB_JIT_ET_DYN
1694 pSymFile->EHdr.e_type = ET_DYN;
1695# else
1696 pSymFile->EHdr.e_type = ET_REL;
1697# endif
1698# ifdef RT_ARCH_AMD64
1699 pSymFile->EHdr.e_machine = EM_AMD64;
1700# elif defined(RT_ARCH_ARM64)
1701 pSymFile->EHdr.e_machine = EM_AARCH64;
1702# else
1703# error "port me"
1704# endif
1705 pSymFile->EHdr.e_version = 1; /*?*/
1706 pSymFile->EHdr.e_entry = 0;
1707# if defined(IEMNATIVE_USE_GDB_JIT_ET_DYN)
1708 pSymFile->EHdr.e_phoff = RT_UOFFSETOF(GDBJITSYMFILE, aPhdrs);
1709# else
1710 pSymFile->EHdr.e_phoff = 0;
1711# endif
1712 pSymFile->EHdr.e_shoff = sizeof(pSymFile->EHdr);
1713 pSymFile->EHdr.e_flags = 0;
1714 pSymFile->EHdr.e_ehsize = sizeof(pSymFile->EHdr);
1715# if defined(IEMNATIVE_USE_GDB_JIT_ET_DYN)
1716 pSymFile->EHdr.e_phentsize = sizeof(pSymFile->aPhdrs[0]);
1717 pSymFile->EHdr.e_phnum = RT_ELEMENTS(pSymFile->aPhdrs);
1718# else
1719 pSymFile->EHdr.e_phentsize = 0;
1720 pSymFile->EHdr.e_phnum = 0;
1721# endif
1722 pSymFile->EHdr.e_shentsize = sizeof(pSymFile->aShdrs[0]);
1723 pSymFile->EHdr.e_shnum = RT_ELEMENTS(pSymFile->aShdrs);
1724 pSymFile->EHdr.e_shstrndx = 0; /* set later */
1725
1726 uint32_t offStrTab = 0;
1727#define APPEND_STR(a_szStr) do { \
1728 memcpy(&pSymFile->szzStrTab[offStrTab], a_szStr, sizeof(a_szStr)); \
1729 offStrTab += sizeof(a_szStr); \
1730 Assert(offStrTab < sizeof(pSymFile->szzStrTab)); \
1731 } while (0)
1732#define APPEND_STR_FMT(a_szStr, ...) do { \
1733 offStrTab += RTStrPrintf(&pSymFile->szzStrTab[offStrTab], sizeof(pSymFile->szzStrTab) - offStrTab, a_szStr, __VA_ARGS__); \
1734 offStrTab++; \
1735 Assert(offStrTab < sizeof(pSymFile->szzStrTab)); \
1736 } while (0)
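/* Typical usage of the two helpers above (illustration only): record the current
   string table offset as the name index first, then append the name, e.g.:
       pSymFile->aShdrs[i].sh_name = offStrTab;
       APPEND_STR(".eh_frame");
   APPEND_STR_FMT() works the same way but with RTStrPrintf-style formatting. */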
1737
1738 /*
1739 * Section headers.
1740 */
1741 /* Section header #0: NULL */
1742 unsigned i = 0;
1743 APPEND_STR("");
1744 RT_ZERO(pSymFile->aShdrs[i]);
1745 i++;
1746
1747 /* Section header: .eh_frame */
1748 pSymFile->aShdrs[i].sh_name = offStrTab;
1749 APPEND_STR(".eh_frame");
1750 pSymFile->aShdrs[i].sh_type = SHT_PROGBITS;
1751 pSymFile->aShdrs[i].sh_flags = SHF_ALLOC | SHF_EXECINSTR;
1752# if defined(IEMNATIVE_USE_GDB_JIT_ET_DYN) || defined(IEMNATIVE_USE_GDB_JIT_ELF_RVAS)
1753 pSymFile->aShdrs[i].sh_offset
1754 = pSymFile->aShdrs[i].sh_addr = RT_UOFFSETOF(GDBJITSYMFILE, abEhFrame);
1755# else
1756 pSymFile->aShdrs[i].sh_addr = (uintptr_t)&pSymFile->abEhFrame[0];
1757 pSymFile->aShdrs[i].sh_offset = 0;
1758# endif
1759
1760 pSymFile->aShdrs[i].sh_size = sizeof(pEhFrame->abEhFrame);
1761 pSymFile->aShdrs[i].sh_link = 0;
1762 pSymFile->aShdrs[i].sh_info = 0;
1763 pSymFile->aShdrs[i].sh_addralign = 1;
1764 pSymFile->aShdrs[i].sh_entsize = 0;
1765 memcpy(pSymFile->abEhFrame, pEhFrame->abEhFrame, sizeof(pEhFrame->abEhFrame));
1766 i++;
1767
1768 /* Section header: .shstrtab */
1769 unsigned const iShStrTab = i;
1770 pSymFile->EHdr.e_shstrndx = iShStrTab;
1771 pSymFile->aShdrs[i].sh_name = offStrTab;
1772 APPEND_STR(".shstrtab");
1773 pSymFile->aShdrs[i].sh_type = SHT_STRTAB;
1774 pSymFile->aShdrs[i].sh_flags = SHF_ALLOC;
1775# if defined(IEMNATIVE_USE_GDB_JIT_ET_DYN) || defined(IEMNATIVE_USE_GDB_JIT_ELF_RVAS)
1776 pSymFile->aShdrs[i].sh_offset
1777 = pSymFile->aShdrs[i].sh_addr = RT_UOFFSETOF(GDBJITSYMFILE, szzStrTab);
1778# else
1779 pSymFile->aShdrs[i].sh_addr = (uintptr_t)&pSymFile->szzStrTab[0];
1780 pSymFile->aShdrs[i].sh_offset = 0;
1781# endif
1782 pSymFile->aShdrs[i].sh_size = sizeof(pSymFile->szzStrTab);
1783 pSymFile->aShdrs[i].sh_link = 0;
1784 pSymFile->aShdrs[i].sh_info = 0;
1785 pSymFile->aShdrs[i].sh_addralign = 1;
1786 pSymFile->aShdrs[i].sh_entsize = 0;
1787 i++;
1788
1789 /* Section header: .symtab */
1790 pSymFile->aShdrs[i].sh_name = offStrTab;
1791 APPEND_STR(".symtab");
1792 pSymFile->aShdrs[i].sh_type = SHT_SYMTAB;
1793 pSymFile->aShdrs[i].sh_flags = SHF_ALLOC;
1794 pSymFile->aShdrs[i].sh_offset
1795 = pSymFile->aShdrs[i].sh_addr = RT_UOFFSETOF(GDBJITSYMFILE, aSymbols);
1796 pSymFile->aShdrs[i].sh_size = sizeof(pSymFile->aSymbols);
1797 pSymFile->aShdrs[i].sh_link = iShStrTab;
1798 pSymFile->aShdrs[i].sh_info = RT_ELEMENTS(pSymFile->aSymbols);
1799 pSymFile->aShdrs[i].sh_addralign = sizeof(pSymFile->aSymbols[0].st_value);
1800 pSymFile->aShdrs[i].sh_entsize = sizeof(pSymFile->aSymbols[0]);
1801 i++;
1802
1803# if defined(IEMNATIVE_USE_GDB_JIT_ET_DYN)
1804 /* Section header: .dynsym */
1805 pSymFile->aShdrs[i].sh_name = offStrTab;
1806 APPEND_STR(".dynsym");
1807 pSymFile->aShdrs[i].sh_type = SHT_DYNSYM;
1808 pSymFile->aShdrs[i].sh_flags = SHF_ALLOC;
1809 pSymFile->aShdrs[i].sh_offset
1810 = pSymFile->aShdrs[i].sh_addr = RT_UOFFSETOF(GDBJITSYMFILE, aDynSyms);
1811 pSymFile->aShdrs[i].sh_size = sizeof(pSymFile->aDynSyms);
1812 pSymFile->aShdrs[i].sh_link = iShStrTab;
1813 pSymFile->aShdrs[i].sh_info = RT_ELEMENTS(pSymFile->aDynSyms);
1814 pSymFile->aShdrs[i].sh_addralign = sizeof(pSymFile->aDynSyms[0].st_value);
1815 pSymFile->aShdrs[i].sh_entsize = sizeof(pSymFile->aDynSyms[0]);
1816 i++;
1817# endif
1818
1819# if defined(IEMNATIVE_USE_GDB_JIT_ET_DYN)
1820 /* Section header: .dynamic */
1821 pSymFile->aShdrs[i].sh_name = offStrTab;
1822 APPEND_STR(".dynamic");
1823 pSymFile->aShdrs[i].sh_type = SHT_DYNAMIC;
1824 pSymFile->aShdrs[i].sh_flags = SHF_ALLOC;
1825 pSymFile->aShdrs[i].sh_offset
1826 = pSymFile->aShdrs[i].sh_addr = RT_UOFFSETOF(GDBJITSYMFILE, aDyn);
1827 pSymFile->aShdrs[i].sh_size = sizeof(pSymFile->aDyn);
1828 pSymFile->aShdrs[i].sh_link = iShStrTab;
1829 pSymFile->aShdrs[i].sh_info = 0;
1830 pSymFile->aShdrs[i].sh_addralign = 1;
1831 pSymFile->aShdrs[i].sh_entsize = sizeof(pSymFile->aDyn[0]);
1832 i++;
1833# endif
1834
1835 /* Section header: .text */
1836 unsigned const iShText = i;
1837 pSymFile->aShdrs[i].sh_name = offStrTab;
1838 APPEND_STR(".text");
1839 pSymFile->aShdrs[i].sh_type = SHT_PROGBITS;
1840 pSymFile->aShdrs[i].sh_flags = SHF_ALLOC | SHF_EXECINSTR;
1841# if defined(IEMNATIVE_USE_GDB_JIT_ET_DYN) || defined(IEMNATIVE_USE_GDB_JIT_ELF_RVAS)
1842 pSymFile->aShdrs[i].sh_offset
1843 = pSymFile->aShdrs[i].sh_addr = sizeof(GDBJITSYMFILE);
1844# else
1845 pSymFile->aShdrs[i].sh_addr = (uintptr_t)(pSymFile + 1);
1846 pSymFile->aShdrs[i].sh_offset = 0;
1847# endif
1848 pSymFile->aShdrs[i].sh_size = pExecMemAllocator->cbChunk - offSymFileInChunk - sizeof(GDBJITSYMFILE);
1849 pSymFile->aShdrs[i].sh_link = 0;
1850 pSymFile->aShdrs[i].sh_info = 0;
1851 pSymFile->aShdrs[i].sh_addralign = 1;
1852 pSymFile->aShdrs[i].sh_entsize = 0;
1853 i++;
1854
1855 Assert(i == RT_ELEMENTS(pSymFile->aShdrs));
1856
1857# if defined(IEMNATIVE_USE_GDB_JIT_ET_DYN)
1858 /*
1859 * The program headers:
1860 */
1861 /* Everything in a single LOAD segment: */
1862 i = 0;
1863 pSymFile->aPhdrs[i].p_type = PT_LOAD;
1864 pSymFile->aPhdrs[i].p_flags = PF_X | PF_R;
1865 pSymFile->aPhdrs[i].p_offset
1866 = pSymFile->aPhdrs[i].p_vaddr
1867 = pSymFile->aPhdrs[i].p_paddr = 0;
1868 pSymFile->aPhdrs[i].p_filesz /* Size of segment in file. */
1869 = pSymFile->aPhdrs[i].p_memsz = pExecMemAllocator->cbChunk - offSymFileInChunk;
1870 pSymFile->aPhdrs[i].p_align = HOST_PAGE_SIZE;
1871 i++;
1872 /* The .dynamic segment. */
1873 pSymFile->aPhdrs[i].p_type = PT_DYNAMIC;
1874 pSymFile->aPhdrs[i].p_flags = PF_R;
1875 pSymFile->aPhdrs[i].p_offset
1876 = pSymFile->aPhdrs[i].p_vaddr
1877 = pSymFile->aPhdrs[i].p_paddr = RT_UOFFSETOF(GDBJITSYMFILE, aDyn);
1878 pSymFile->aPhdrs[i].p_filesz /* Size of segment in file. */
1879 = pSymFile->aPhdrs[i].p_memsz = sizeof(pSymFile->aDyn);
1880 pSymFile->aPhdrs[i].p_align = sizeof(pSymFile->aDyn[0].d_tag);
1881 i++;
1882
1883 Assert(i == RT_ELEMENTS(pSymFile->aPhdrs));
1884
1885 /*
1886 * The dynamic section:
1887 */
1888 i = 0;
1889 pSymFile->aDyn[i].d_tag = DT_SONAME;
1890 pSymFile->aDyn[i].d_un.d_val = offStrTab;
1891 APPEND_STR_FMT("iem-exec-chunk-%u-%u", pVCpu->idCpu, idxChunk);
1892 i++;
1893 pSymFile->aDyn[i].d_tag = DT_STRTAB;
1894 pSymFile->aDyn[i].d_un.d_ptr = RT_UOFFSETOF(GDBJITSYMFILE, szzStrTab);
1895 i++;
1896 pSymFile->aDyn[i].d_tag = DT_STRSZ;
1897 pSymFile->aDyn[i].d_un.d_val = sizeof(pSymFile->szzStrTab);
1898 i++;
1899 pSymFile->aDyn[i].d_tag = DT_SYMTAB;
1900 pSymFile->aDyn[i].d_un.d_ptr = RT_UOFFSETOF(GDBJITSYMFILE, aDynSyms);
1901 i++;
1902 pSymFile->aDyn[i].d_tag = DT_SYMENT;
1903 pSymFile->aDyn[i].d_un.d_val = sizeof(pSymFile->aDynSyms[0]);
1904 i++;
1905 pSymFile->aDyn[i].d_tag = DT_NULL;
1906 i++;
1907 Assert(i == RT_ELEMENTS(pSymFile->aDyn));
1908# endif /* IEMNATIVE_USE_GDB_JIT_ET_DYN */
1909
1910 /*
1911 * Symbol tables:
1912 */
1913 /** @todo gdb doesn't seem to really like this ... */
1914 i = 0;
1915 pSymFile->aSymbols[i].st_name = 0;
1916 pSymFile->aSymbols[i].st_shndx = SHN_UNDEF;
1917 pSymFile->aSymbols[i].st_value = 0;
1918 pSymFile->aSymbols[i].st_size = 0;
1919 pSymFile->aSymbols[i].st_info = ELF64_ST_INFO(STB_LOCAL, STT_NOTYPE);
1920 pSymFile->aSymbols[i].st_other = 0 /* STV_DEFAULT */;
1921# ifdef IEMNATIVE_USE_GDB_JIT_ET_DYN
1922 pSymFile->aDynSyms[0] = pSymFile->aSymbols[i];
1923# endif
1924 i++;
1925
1926 pSymFile->aSymbols[i].st_name = 0;
1927 pSymFile->aSymbols[i].st_shndx = SHN_ABS;
1928 pSymFile->aSymbols[i].st_value = 0;
1929 pSymFile->aSymbols[i].st_size = 0;
1930 pSymFile->aSymbols[i].st_info = ELF64_ST_INFO(STB_LOCAL, STT_FILE);
1931 pSymFile->aSymbols[i].st_other = 0 /* STV_DEFAULT */;
1932 i++;
1933
1934 pSymFile->aSymbols[i].st_name = offStrTab;
1935 APPEND_STR_FMT("iem_exec_chunk_%u_%u", pVCpu->idCpu, idxChunk);
1936# if 0
1937 pSymFile->aSymbols[i].st_shndx = iShText;
1938 pSymFile->aSymbols[i].st_value = 0;
1939# else
1940 pSymFile->aSymbols[i].st_shndx = SHN_ABS;
1941 pSymFile->aSymbols[i].st_value = (uintptr_t)(pSymFile + 1);
1942# endif
1943 pSymFile->aSymbols[i].st_size = pSymFile->aShdrs[iShText].sh_size;
1944 pSymFile->aSymbols[i].st_info = ELF64_ST_INFO(STB_GLOBAL, STT_FUNC);
1945 pSymFile->aSymbols[i].st_other = 0 /* STV_DEFAULT */;
1946# ifdef IEMNATIVE_USE_GDB_JIT_ET_DYN
1947 pSymFile->aDynSyms[1] = pSymFile->aSymbols[i];
1948 pSymFile->aDynSyms[1].st_value = (uintptr_t)(pSymFile + 1);
1949# endif
1950 i++;
1951
1952 Assert(i == RT_ELEMENTS(pSymFile->aSymbols));
1953 Assert(offStrTab < sizeof(pSymFile->szzStrTab));
1954
1955 /*
1956 * The GDB JIT entry and informing GDB.
1957 */
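 /*
  * The registration below follows the usual GDB JIT interface pattern as used by
  * this code: the new entry is appended to the __jit_debug_descriptor doubly
  * linked list, pRelevant is pointed at it, enmAction is set to "register" and
  * __jit_debug_register_code() is called; a debugger implementing the JIT
  * interface typically breakpoints that function and then picks up the new
  * symbol file from the descriptor.
  */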
1958 pEhFrame->GdbJitEntry.pbSymFile = (uint8_t *)pSymFile;
1959# if 1
1960 pEhFrame->GdbJitEntry.cbSymFile = pExecMemAllocator->cbChunk - ((uintptr_t)pSymFile - (uintptr_t)pvChunk);
1961# else
1962 pEhFrame->GdbJitEntry.cbSymFile = sizeof(GDBJITSYMFILE);
1963# endif
1964
1965 RTOnce(&g_IemNativeGdbJitOnce, iemNativeGdbJitInitOnce, NULL);
1966 RTCritSectEnter(&g_IemNativeGdbJitLock);
1967 pEhFrame->GdbJitEntry.pNext = NULL;
1968 pEhFrame->GdbJitEntry.pPrev = __jit_debug_descriptor.pTail;
1969 if (__jit_debug_descriptor.pTail)
1970 __jit_debug_descriptor.pTail->pNext = &pEhFrame->GdbJitEntry;
1971 else
1972 __jit_debug_descriptor.pHead = &pEhFrame->GdbJitEntry;
1973 __jit_debug_descriptor.pTail = &pEhFrame->GdbJitEntry;
1974 __jit_debug_descriptor.pRelevant = &pEhFrame->GdbJitEntry;
1975
1976 /* Notify GDB: */
1977 __jit_debug_descriptor.enmAction = kGdbJitaction_Register;
1978 __jit_debug_register_code();
1979 __jit_debug_descriptor.enmAction = kGdbJitaction_NoAction;
1980 RTCritSectLeave(&g_IemNativeGdbJitLock);
1981
1982# else /* !IEMNATIVE_USE_GDB_JIT */
1983 RT_NOREF(pVCpu);
1984# endif /* !IEMNATIVE_USE_GDB_JIT */
1985
1986 return VINF_SUCCESS;
1987}
1988
1989# endif /* !RT_OS_WINDOWS */
1990#endif /* IN_RING3 */
1991
1992
1993/**
1994 * Adds another chunk to the executable memory allocator.
1995 *
1996 * This is used by the init code for the initial allocation and later by the
1997 * regular allocator function when it's out of memory.
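 *
 * @returns VBox status code.
 * @param   pVCpu               The cross context virtual CPU structure of the
 *                              calling thread.
 * @param   pExecMemAllocator   The executable memory allocator to grow.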
1998 */
1999static int iemExecMemAllocatorGrow(PVMCPUCC pVCpu, PIEMEXECMEMALLOCATOR pExecMemAllocator)
2000{
2001 /* Check that we've room for growth. */
2002 uint32_t const idxChunk = pExecMemAllocator->cChunks;
2003 AssertLogRelReturn(idxChunk < pExecMemAllocator->cMaxChunks, VERR_OUT_OF_RESOURCES);
2004
2005 /* Allocate a chunk. */
2006#ifdef RT_OS_DARWIN
2007 void *pvChunk = RTMemPageAllocEx(pExecMemAllocator->cbChunk, 0);
2008#else
2009 void *pvChunk = RTMemPageAllocEx(pExecMemAllocator->cbChunk, RTMEMPAGEALLOC_F_EXECUTABLE);
2010#endif
2011 AssertLogRelReturn(pvChunk, VERR_NO_EXEC_MEMORY);
2012
2013#ifdef RT_OS_DARWIN
2014 /*
2015 * Because it is impossible to have an RWX memory allocation on macOS, try to remap the memory
2016 * chunk readable/executable somewhere else, sparing us the hassle of switching between
2017 * protections whenever executable memory is allocated.
2018 */
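 /*
  * The resulting scheme (an illustration of the calls below): the same physical
  * pages end up mapped twice into this task -
  *
  *     pvChunk      (original)  RW  - the recompiler writes code here
  *     AddrRemapped (alias)     RX  - translation blocks execute from here
  *
  * mach_vm_remap() creates the alias and mach_vm_protect() drops write access on
  * it, so neither mapping is ever writable and executable at the same time.
  */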
2019 int rc = VERR_NO_EXEC_MEMORY;
2020 mach_port_t hPortTask = mach_task_self();
2021 mach_vm_address_t AddrChunk = (mach_vm_address_t)pvChunk;
2022 mach_vm_address_t AddrRemapped = 0;
2023 vm_prot_t ProtCur = 0;
2024 vm_prot_t ProtMax = 0;
2025 kern_return_t krc = mach_vm_remap(hPortTask, &AddrRemapped, pExecMemAllocator->cbChunk, 0,
2026 VM_FLAGS_ANYWHERE | VM_FLAGS_RETURN_DATA_ADDR,
2027 hPortTask, AddrChunk, FALSE, &ProtCur, &ProtMax,
2028 VM_INHERIT_NONE);
2029 if (krc == KERN_SUCCESS)
2030 {
2031 krc = mach_vm_protect(mach_task_self(), AddrRemapped, pExecMemAllocator->cbChunk, FALSE, VM_PROT_READ | VM_PROT_EXECUTE);
2032 if (krc == KERN_SUCCESS)
2033 rc = VINF_SUCCESS;
2034 else
2035 {
2036 AssertLogRelMsgFailed(("mach_vm_protect -> %d (%#x)\n", krc, krc));
2037 krc = mach_vm_deallocate(hPortTask, AddrRemapped, pExecMemAllocator->cbChunk);
2038 Assert(krc == KERN_SUCCESS);
2039 }
2040 }
2041 else
2042 AssertLogRelMsgFailed(("mach_vm_remap -> %d (%#x)\n", krc, krc));
2043 if (RT_FAILURE(rc))
2044 {
2045 RTMemPageFree(pvChunk, pExecMemAllocator->cbChunk);
2046 return rc;
2047 }
2048
2049 void *pvChunkRx = (void *)AddrRemapped;
2050#else
2051 int rc = VINF_SUCCESS;
2052 void *pvChunkRx = pvChunk;
2053#endif
2054
2055 /*
2056 * Add the chunk.
2057 *
2058 * This must be done before the unwind init so windows can allocate
2059 * memory from the chunk when using the alternative sub-allocator.
2060 */
2061 pExecMemAllocator->aChunks[idxChunk].pvChunkRw = pvChunk;
2062 pExecMemAllocator->aChunks[idxChunk].pvChunkRx = pvChunkRx;
2063#ifdef IN_RING3
2064 pExecMemAllocator->aChunks[idxChunk].pvUnwindInfo = NULL;
2065#endif
2066 pExecMemAllocator->aChunks[idxChunk].cFreeUnits = pExecMemAllocator->cUnitsPerChunk;
2067 pExecMemAllocator->aChunks[idxChunk].idxFreeHint = 0;
2068 memset(&pExecMemAllocator->pbmAlloc[pExecMemAllocator->cBitmapElementsPerChunk * idxChunk],
2069 0, sizeof(pExecMemAllocator->pbmAlloc[0]) * pExecMemAllocator->cBitmapElementsPerChunk);
2070
2071 pExecMemAllocator->cChunks = idxChunk + 1;
2072 pExecMemAllocator->idxChunkHint = idxChunk;
2073
2074 pExecMemAllocator->cbTotal += pExecMemAllocator->cbChunk;
2075 pExecMemAllocator->cbFree += pExecMemAllocator->cbChunk;
2076
2077 /* If there is a chunk context init callback, call it. */
2078 rc = iemNativeRecompileAttachExecMemChunkCtx(pVCpu, idxChunk, &pExecMemAllocator->aChunks[idxChunk].pCtx);
2079#ifdef IN_RING3
2080 /*
2081 * Initialize the unwind information (this cannot really fail atm).
2082 * (This sets pvUnwindInfo.)
2083 */
2084 if (RT_SUCCESS(rc))
2085 rc = iemExecMemAllocatorInitAndRegisterUnwindInfoForChunk(pVCpu, pExecMemAllocator, pvChunkRx, idxChunk);
2086#endif
2087 if (RT_SUCCESS(rc))
2088 { /* likely */ }
2089 else
2090 {
2091 /* Just in case the impossible happens, undo the above: */
2092 pExecMemAllocator->cbTotal -= pExecMemAllocator->cbChunk;
2093 pExecMemAllocator->cbFree -= pExecMemAllocator->aChunks[idxChunk].cFreeUnits << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT;
2094 pExecMemAllocator->cChunks = idxChunk;
2095 memset(&pExecMemAllocator->pbmAlloc[pExecMemAllocator->cBitmapElementsPerChunk * idxChunk],
2096 0xff, sizeof(pExecMemAllocator->pbmAlloc[0]) * pExecMemAllocator->cBitmapElementsPerChunk);
2097 pExecMemAllocator->aChunks[idxChunk].pvChunkRw = NULL;
2098 pExecMemAllocator->aChunks[idxChunk].cFreeUnits = 0;
2099
2100# ifdef RT_OS_DARWIN
2101 krc = mach_vm_deallocate(mach_task_self(), (mach_vm_address_t)pExecMemAllocator->aChunks[idxChunk].pvChunkRx,
2102 pExecMemAllocator->cbChunk);
2103 Assert(krc == KERN_SUCCESS);
2104# endif
2105
2106 RTMemPageFree(pvChunk, pExecMemAllocator->cbChunk);
2107 return rc;
2108 }
2109
2110 return VINF_SUCCESS;
2111}
2112
2113
2114/**
2115 * Initializes the executable memory allocator for native recompilation on the
2116 * calling EMT.
2117 *
2118 * @returns VBox status code.
2119 * @param pVCpu The cross context virtual CPU structure of the calling
2120 * thread.
2121 * @param cbMax The max size of the allocator.
2122 * @param cbInitial The initial allocator size.
2123 * @param cbChunk The chunk size, 0 or UINT32_MAX for default (@a cbMax
2124 * dependent).
2125 */
2126int iemExecMemAllocatorInit(PVMCPU pVCpu, uint64_t cbMax, uint64_t cbInitial, uint32_t cbChunk) RT_NOEXCEPT
2127{
2128 /*
2129 * Validate input.
2130 */
2131 AssertLogRelMsgReturn(cbMax >= _1M && cbMax <= _4G+_4G, ("cbMax=%RU64 (%RX64)\n", cbMax, cbMax), VERR_OUT_OF_RANGE);
2132 AssertReturn(cbInitial <= cbMax, VERR_OUT_OF_RANGE);
2133 AssertLogRelMsgReturn( cbChunk == UINT32_MAX
2134 || cbChunk == 0
2135 || ( RT_IS_POWER_OF_TWO(cbChunk)
2136 && cbChunk >= _1M
2137 && cbChunk <= _256M
2138 && cbChunk <= cbMax),
2139 ("cbChunk=%RU32 (%RX32) cbMax=%RU64\n", cbChunk, cbChunk, cbMax),
2140 VERR_OUT_OF_RANGE);
2141
2142 /*
2143 * Adjust/figure out the chunk size.
2144 */
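 /*
  * Examples of the defaulting below (derived from this code, not a spec):
  * cbMax >= 256 MiB gives 64 MiB chunks; cbMax = 8 MiB gives 4 MiB chunks
  * (i.e. two chunks); in between, roughly a quarter of cbMax rounded to a
  * power of two is used.
  */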
2145 if (cbChunk == 0 || cbChunk == UINT32_MAX)
2146 {
2147 if (cbMax >= _256M)
2148 cbChunk = _64M;
2149 else
2150 {
2151 if (cbMax < _16M)
2152 cbChunk = cbMax >= _4M ? _4M : (uint32_t)cbMax;
2153 else
2154 cbChunk = (uint32_t)cbMax / 4;
2155 if (!RT_IS_POWER_OF_TWO(cbChunk))
2156 cbChunk = RT_BIT_32(ASMBitLastSetU32(cbChunk));
2157 }
2158 }
2159 #if defined(RT_ARCH_AMD64)
2160 Assert(cbChunk <= _2G);
2161 #elif defined(RT_ARCH_ARM64)
2162 if (cbChunk > _128M)
2163 cbChunk = _128M; /* Max relative branch distance is +/-2^(25+2) = +/-0x8000000 (134 217 728). */
2164#endif
2165
2166 if (cbChunk > cbMax)
2167 cbMax = cbChunk;
2168 else
2169 cbMax = (cbMax - 1 + cbChunk) / cbChunk * cbChunk;
2170 uint32_t const cMaxChunks = (uint32_t)(cbMax / cbChunk);
2171 AssertLogRelReturn((uint64_t)cMaxChunks * cbChunk == cbMax, VERR_INTERNAL_ERROR_3);
2172
2173 /*
2174 * Allocate and initialize the allocator instance.
2175 */
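 /*
  * Layout of the single RTMemAllocZ block set up below (sketch):
  *
  *   [ IEMEXECMEMALLOCATOR header + aChunks[cMaxChunks] ]
  *   [ allocation bitmaps - one bit per allocation unit, per chunk ]
  *   [ eh_frame buffers - ring-3 non-Windows builds only ]
  *
  * offBitmaps/offEhFrames below are the cache-line aligned offsets of the
  * last two parts.
  */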
2176 size_t const offBitmaps = RT_ALIGN_Z(RT_UOFFSETOF_DYN(IEMEXECMEMALLOCATOR, aChunks[cMaxChunks]), RT_CACHELINE_SIZE);
2177 size_t const cbBitmaps = (size_t)(cbChunk >> (IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT + 3)) * cMaxChunks;
2178 size_t cbNeeded = offBitmaps + cbBitmaps;
2179 AssertCompile(IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT <= 10);
2180 Assert(cbChunk > RT_BIT_32(IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT + 3));
2181#if defined(IN_RING3) && !defined(RT_OS_WINDOWS)
2182 size_t const offEhFrames = RT_ALIGN_Z(cbNeeded, RT_CACHELINE_SIZE);
2183 cbNeeded += sizeof(IEMEXECMEMCHUNKEHFRAME) * cMaxChunks;
2184#endif
2185 PIEMEXECMEMALLOCATOR pExecMemAllocator = (PIEMEXECMEMALLOCATOR)RTMemAllocZ(cbNeeded);
2186 AssertLogRelMsgReturn(pExecMemAllocator, ("cbNeeded=%zx cMaxChunks=%#x cbChunk=%#x\n", cbNeeded, cMaxChunks, cbChunk),
2187 VERR_NO_MEMORY);
2188 pExecMemAllocator->uMagic = IEMEXECMEMALLOCATOR_MAGIC;
2189 pExecMemAllocator->cbChunk = cbChunk;
2190 pExecMemAllocator->cMaxChunks = cMaxChunks;
2191 pExecMemAllocator->cChunks = 0;
2192 pExecMemAllocator->idxChunkHint = 0;
2193 pExecMemAllocator->cAllocations = 0;
2194 pExecMemAllocator->cbTotal = 0;
2195 pExecMemAllocator->cbFree = 0;
2196 pExecMemAllocator->cbAllocated = 0;
2197#ifdef VBOX_WITH_STATISTICS
2198 pExecMemAllocator->cbUnusable = 0;
2199#endif
2200 pExecMemAllocator->pbmAlloc = (uint64_t *)((uintptr_t)pExecMemAllocator + offBitmaps);
2201 pExecMemAllocator->cUnitsPerChunk = cbChunk >> IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT;
2202 pExecMemAllocator->cBitmapElementsPerChunk = cbChunk >> (IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT + 6);
2203 memset(pExecMemAllocator->pbmAlloc, 0xff, cbBitmaps); /* Mark everything as allocated. Clear when chunks are added. */
2204#if defined(IN_RING3) && !defined(RT_OS_WINDOWS)
2205 pExecMemAllocator->paEhFrames = (PIEMEXECMEMCHUNKEHFRAME)((uintptr_t)pExecMemAllocator + offEhFrames);
2206#endif
2207 for (uint32_t i = 0; i < cMaxChunks; i++)
2208 {
2209 pExecMemAllocator->aChunks[i].cFreeUnits = 0;
2210 pExecMemAllocator->aChunks[i].idxFreeHint = 0;
2211 pExecMemAllocator->aChunks[i].pvChunkRw = NULL;
2212#ifdef IN_RING0
2213 pExecMemAllocator->aChunks[i].hMemObj = NIL_RTR0MEMOBJ;
2214#else
2215 pExecMemAllocator->aChunks[i].pvUnwindInfo = NULL;
2216#endif
2217 }
2218 pVCpu->iem.s.pExecMemAllocatorR3 = pExecMemAllocator;
2219
2220 /*
2221 * Do the initial allocations.
2222 */
2223 while ((uint64_t)pExecMemAllocator->cChunks * pExecMemAllocator->cbChunk < cbInitial)
2224 {
2225 int rc = iemExecMemAllocatorGrow(pVCpu, pExecMemAllocator);
2226 AssertLogRelRCReturn(rc, rc);
2227 }
2228
2229 pExecMemAllocator->idxChunkHint = 0;
2230
2231 /*
2232 * Register statistics.
2233 */
2234 PUVM const pUVM = pVCpu->pUVCpu->pUVM;
2235 STAMR3RegisterFU(pUVM, &pExecMemAllocator->cAllocations, STAMTYPE_U32, STAMVISIBILITY_ALWAYS, STAMUNIT_COUNT,
2236 "Current number of allocations", "/IEM/CPU%u/re/ExecMem/cAllocations", pVCpu->idCpu);
2237 STAMR3RegisterFU(pUVM, &pExecMemAllocator->cChunks, STAMTYPE_U32, STAMVISIBILITY_ALWAYS, STAMUNIT_COUNT,
2238 "Currently allocated chunks", "/IEM/CPU%u/re/ExecMem/cChunks", pVCpu->idCpu);
2239 STAMR3RegisterFU(pUVM, &pExecMemAllocator->cMaxChunks, STAMTYPE_U32, STAMVISIBILITY_ALWAYS, STAMUNIT_COUNT,
2240 "Maximum number of chunks", "/IEM/CPU%u/re/ExecMem/cMaxChunks", pVCpu->idCpu);
2241 STAMR3RegisterFU(pUVM, &pExecMemAllocator->cbChunk, STAMTYPE_U32, STAMVISIBILITY_ALWAYS, STAMUNIT_BYTES,
2242 "Allocation chunk size", "/IEM/CPU%u/re/ExecMem/cbChunk", pVCpu->idCpu);
2243 STAMR3RegisterFU(pUVM, &pExecMemAllocator->cbAllocated, STAMTYPE_U64, STAMVISIBILITY_ALWAYS, STAMUNIT_BYTES,
2244 "Number of bytes currently allocated", "/IEM/CPU%u/re/ExecMem/cbAllocated", pVCpu->idCpu);
2245 STAMR3RegisterFU(pUVM, &pExecMemAllocator->cbFree, STAMTYPE_U64, STAMVISIBILITY_ALWAYS, STAMUNIT_BYTES,
2246 "Number of bytes currently free", "/IEM/CPU%u/re/ExecMem/cbFree", pVCpu->idCpu);
2247 STAMR3RegisterFU(pUVM, &pExecMemAllocator->cbTotal, STAMTYPE_U64, STAMVISIBILITY_ALWAYS, STAMUNIT_BYTES,
2248 "Total number of bytes", "/IEM/CPU%u/re/ExecMem/cbTotal", pVCpu->idCpu);
2249#ifdef VBOX_WITH_STATISTICS
2250 STAMR3RegisterFU(pUVM, &pExecMemAllocator->cbUnusable, STAMTYPE_U64, STAMVISIBILITY_ALWAYS, STAMUNIT_BYTES,
2251 "Total number of bytes being unusable", "/IEM/CPU%u/re/ExecMem/cbUnusable", pVCpu->idCpu);
2252 STAMR3RegisterFU(pUVM, &pExecMemAllocator->StatAlloc, STAMTYPE_PROFILE, STAMVISIBILITY_ALWAYS, STAMUNIT_TICKS_PER_CALL,
2253 "Profiling the allocator", "/IEM/CPU%u/re/ExecMem/ProfAlloc", pVCpu->idCpu);
2254 for (unsigned i = 1; i < RT_ELEMENTS(pExecMemAllocator->aStatSizes); i++)
2255 STAMR3RegisterFU(pUVM, &pExecMemAllocator->aStatSizes[i], STAMTYPE_COUNTER, STAMVISIBILITY_ALWAYS, STAMUNIT_COUNT,
2256 "Number of allocations of this number of allocation units",
2257 "/IEM/CPU%u/re/ExecMem/aSize%02u", pVCpu->idCpu, i);
2258 STAMR3RegisterFU(pUVM, &pExecMemAllocator->aStatSizes[0], STAMTYPE_COUNTER, STAMVISIBILITY_ALWAYS, STAMUNIT_COUNT,
2259 "Number of allocations 16 units or larger", "/IEM/CPU%u/re/ExecMem/aSize16OrLarger", pVCpu->idCpu);
2260#endif
2261#ifdef IEMEXECMEM_ALT_SUB_WITH_ALT_PRUNING
2262 STAMR3RegisterFU(pUVM, &pExecMemAllocator->StatPruneProf, STAMTYPE_PROFILE, STAMVISIBILITY_ALWAYS, STAMUNIT_TICKS_PER_CALL,
2263 "Pruning executable memory (alt)", "/IEM/CPU%u/re/ExecMem/Pruning", pVCpu->idCpu);
2264 STAMR3RegisterFU(pUVM, &pExecMemAllocator->StatPruneRecovered, STAMTYPE_PROFILE, STAMVISIBILITY_ALWAYS, STAMUNIT_BYTES_PER_CALL,
2265 "Bytes recovered while pruning", "/IEM/CPU%u/re/ExecMem/PruningRecovered", pVCpu->idCpu);
2266#endif
2267 STAMR3RegisterFU(pUVM, &pExecMemAllocator->cFruitlessChunkScans, STAMTYPE_U64_RESET, STAMVISIBILITY_ALWAYS, STAMUNIT_COUNT,
2268 "Chunks fruitlessly scanned for free space", "/IEM/CPU%u/re/ExecMem/FruitlessChunkScans", pVCpu->idCpu);
2269
2270 return VINF_SUCCESS;
2271}
2272