VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllN8veExecMem.cpp@ 106329

Last change on this file since 106329 was 106329, checked in by vboxsync, 6 weeks ago

VMM/IEM: Some minor perf tweaks for iemExecMemAllocatorPrune. bugref:10720

1/* $Id: IEMAllN8veExecMem.cpp 106329 2024-10-15 14:19:43Z vboxsync $ */
2/** @file
3 * IEM - Native Recompiler, Executable Memory Allocator.
4 */
5
6/*
7 * Copyright (C) 2023-2024 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28
29/*********************************************************************************************************************************
30* Header Files *
31*********************************************************************************************************************************/
32#define LOG_GROUP LOG_GROUP_IEM_RE_NATIVE
33#define IEM_WITH_OPAQUE_DECODER_STATE
34#define VMM_INCLUDED_SRC_include_IEMMc_h /* block IEMMc.h inclusion. */
35#include <VBox/vmm/iem.h>
36#include <VBox/vmm/cpum.h>
37#include "IEMInternal.h"
38#include <VBox/vmm/vmcc.h>
39#include <VBox/log.h>
40#include <VBox/err.h>
41#include <VBox/param.h>
42#include <iprt/assert.h>
43#include <iprt/mem.h>
44#include <iprt/string.h>
45#if defined(RT_ARCH_AMD64)
46# include <iprt/x86.h>
47#elif defined(RT_ARCH_ARM64)
48# include <iprt/armv8.h>
49#endif
50
51#ifdef RT_OS_WINDOWS
52# include <iprt/formats/pecoff.h> /* this is incompatible with windows.h, thus: */
53extern "C" DECLIMPORT(uint8_t) __cdecl RtlAddFunctionTable(void *pvFunctionTable, uint32_t cEntries, uintptr_t uBaseAddress);
54extern "C" DECLIMPORT(uint8_t) __cdecl RtlDelFunctionTable(void *pvFunctionTable);
55#else
56# include <iprt/formats/dwarf.h>
57# if defined(RT_OS_DARWIN)
58# include <libkern/OSCacheControl.h>
59# include <mach/mach.h>
60# include <mach/mach_vm.h>
61# define IEMNATIVE_USE_LIBUNWIND
62extern "C" void __register_frame(const void *pvFde);
63extern "C" void __deregister_frame(const void *pvFde);
64# else
65# ifdef DEBUG_bird /** @todo not thread safe yet */
66# define IEMNATIVE_USE_GDB_JIT
67# endif
68# ifdef IEMNATIVE_USE_GDB_JIT
69# include <iprt/critsect.h>
70# include <iprt/once.h>
71# include <iprt/formats/elf64.h>
72# endif
73extern "C" void __register_frame_info(void *pvBegin, void *pvObj); /* found no header for these two */
74extern "C" void *__deregister_frame_info(void *pvBegin); /* (returns pvObj from __register_frame_info call) */
75# endif
76#endif
77
78#include "IEMN8veRecompiler.h"
79
80
81/*********************************************************************************************************************************
82* Executable Memory Allocator *
83*********************************************************************************************************************************/
84/** The chunk sub-allocation unit size in bytes. */
85#define IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE 256
86/** The chunk sub-allocation unit size as a shift factor. */
87#define IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT 8
88/** Enables adding a header to the sub-allocator allocations.
89 * This is useful for freeing up executable memory among other things. */
90#define IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER
91/** Use alternative pruning. */
92#define IEMEXECMEM_ALT_SUB_WITH_ALT_PRUNING
93
94
95#if defined(IN_RING3) && !defined(RT_OS_WINDOWS)
96# ifdef IEMNATIVE_USE_GDB_JIT
97# define IEMNATIVE_USE_GDB_JIT_ET_DYN
98
99/** GDB JIT: Code entry. */
100typedef struct GDBJITCODEENTRY
101{
102 struct GDBJITCODEENTRY *pNext;
103 struct GDBJITCODEENTRY *pPrev;
104 uint8_t *pbSymFile;
105 uint64_t cbSymFile;
106} GDBJITCODEENTRY;
107
108/** GDB JIT: Actions. */
109typedef enum GDBJITACTIONS : uint32_t
110{
111 kGdbJitaction_NoAction = 0, kGdbJitaction_Register, kGdbJitaction_Unregister
112} GDBJITACTIONS;
113
114/** GDB JIT: Descriptor. */
115typedef struct GDBJITDESCRIPTOR
116{
117 uint32_t uVersion;
118 GDBJITACTIONS enmAction;
119 GDBJITCODEENTRY *pRelevant;
120 GDBJITCODEENTRY *pHead;
121 /** Our addition: */
122 GDBJITCODEENTRY *pTail;
123} GDBJITDESCRIPTOR;
124
125/** GDB JIT: Our simple symbol file data. */
126typedef struct GDBJITSYMFILE
127{
128 Elf64_Ehdr EHdr;
129# ifndef IEMNATIVE_USE_GDB_JIT_ET_DYN
130 Elf64_Shdr aShdrs[5];
131# else
132 Elf64_Shdr aShdrs[7];
133 Elf64_Phdr aPhdrs[2];
134# endif
135 /** The dwarf ehframe data for the chunk. */
136 uint8_t abEhFrame[512];
137 char szzStrTab[128];
138 Elf64_Sym aSymbols[3];
139# ifdef IEMNATIVE_USE_GDB_JIT_ET_DYN
140 Elf64_Sym aDynSyms[2];
141 Elf64_Dyn aDyn[6];
142# endif
143} GDBJITSYMFILE;
144
145extern "C" GDBJITDESCRIPTOR __jit_debug_descriptor;
146extern "C" DECLEXPORT(void) __jit_debug_register_code(void);
147
148/** Init once for g_IemNativeGdbJitLock. */
149static RTONCE g_IemNativeGdbJitOnce = RTONCE_INITIALIZER;
150/** Init once for the critical section. */
151static RTCRITSECT g_IemNativeGdbJitLock;
152
153/** GDB reads the info here. */
154GDBJITDESCRIPTOR __jit_debug_descriptor = { 1, kGdbJitaction_NoAction, NULL, NULL };
155
156/** GDB sets a breakpoint on this and checks __jit_debug_descriptor when hit. */
157DECL_NO_INLINE(RT_NOTHING, DECLEXPORT(void)) __jit_debug_register_code(void)
158{
159 ASMNopPause();
160}
161
162/** @callback_method_impl{FNRTONCE} */
163static DECLCALLBACK(int32_t) iemNativeGdbJitInitOnce(void *pvUser)
164{
165 RT_NOREF(pvUser);
166 return RTCritSectInit(&g_IemNativeGdbJitLock);
167}
168
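/*
 * For orientation: GDB's JIT interface works by the debugger placing a
 * breakpoint on __jit_debug_register_code() and inspecting
 * __jit_debug_descriptor whenever it is hit.  A minimal, illustrative sketch
 * of how a chunk's symbol file would be published (the entry allocation and
 * locking shown here are hypothetical; the per-chunk registration proper is
 * done elsewhere in this file):
 *
 *     GDBJITCODEENTRY *pEntry = ...;  // pbSymFile/cbSymFile already filled in
 *     RTOnce(&g_IemNativeGdbJitOnce, iemNativeGdbJitInitOnce, NULL);
 *     RTCritSectEnter(&g_IemNativeGdbJitLock);
 *     pEntry->pNext = NULL;                            // append to the list
 *     pEntry->pPrev = __jit_debug_descriptor.pTail;
 *     if (pEntry->pPrev)
 *         pEntry->pPrev->pNext = pEntry;
 *     else
 *         __jit_debug_descriptor.pHead = pEntry;
 *     __jit_debug_descriptor.pTail     = pEntry;
 *     __jit_debug_descriptor.pRelevant = pEntry;       // entry being acted on
 *     __jit_debug_descriptor.enmAction = kGdbJitaction_Register;
 *     __jit_debug_register_code();                     // GDB breakpoints here
 *     RTCritSectLeave(&g_IemNativeGdbJitLock);
 */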
169
170# endif /* IEMNATIVE_USE_GDB_JIT */
171
172/**
173 * Per-chunk unwind info for non-windows hosts.
174 */
175typedef struct IEMEXECMEMCHUNKEHFRAME
176{
177# ifdef IEMNATIVE_USE_LIBUNWIND
178 /** The offset of the FDA into abEhFrame. */
179 uintptr_t offFda;
180# else
181 /** 'struct object' storage area. */
182 uint8_t abObject[1024];
183# endif
184# ifdef IEMNATIVE_USE_GDB_JIT
185# if 0
186 /** The GDB JIT 'symbol file' data. */
187 GDBJITSYMFILE GdbJitSymFile;
188# endif
189 /** The GDB JIT list entry. */
190 GDBJITCODEENTRY GdbJitEntry;
191# endif
192 /** The dwarf ehframe data for the chunk. */
193 uint8_t abEhFrame[512];
194} IEMEXECMEMCHUNKEHFRAME;
195/** Pointer to per-chunk info for non-windows hosts. */
196typedef IEMEXECMEMCHUNKEHFRAME *PIEMEXECMEMCHUNKEHFRAME;
197#endif
198
199
200/**
201 * A chunk of executable memory.
202 */
203typedef struct IEMEXECMEMCHUNK
204{
205 /** Number of free items in this chunk. */
206 uint32_t cFreeUnits;
207 /** Hint where to start searching for free space in the allocation bitmap. */
208 uint32_t idxFreeHint;
209 /** Pointer to the readable/writeable view of the memory chunk. */
210 void *pvChunkRw;
211 /** Pointer to the readable/executable view of the memory chunk. */
212 void *pvChunkRx;
213 /** Pointer to the context structure detailing the per chunk common code. */
214 PCIEMNATIVEPERCHUNKCTX pCtx;
215#ifdef IN_RING3
216 /**
217 * Pointer to the unwind information.
218 *
219 * This is used during C++ throw and longjmp (windows and probably most other
220 * platforms). Some debuggers (windbg) make use of it as well.
221 *
222 * Windows: This is allocated from hHeap on windows because (at least for
223 * AMD64) the UNWIND_INFO structure address in the
224 * RUNTIME_FUNCTION entry is an RVA and the chunk is the "image".
225 *
226 * Others: Allocated from the regular heap to avoid unnecessary executable data
227 * structures. This points to an IEMEXECMEMCHUNKEHFRAME structure. */
228 void *pvUnwindInfo;
229#elif defined(IN_RING0)
230 /** Allocation handle. */
231 RTR0MEMOBJ hMemObj;
232#endif
233} IEMEXECMEMCHUNK;
234/** Pointer to a memory chunk. */
235typedef IEMEXECMEMCHUNK *PIEMEXECMEMCHUNK;
236
237
238/**
239 * Executable memory allocator for the native recompiler.
240 */
241typedef struct IEMEXECMEMALLOCATOR
242{
243 /** Magic value (IEMEXECMEMALLOCATOR_MAGIC). */
244 uint32_t uMagic;
245
246 /** The chunk size. */
247 uint32_t cbChunk;
248 /** The maximum number of chunks. */
249 uint32_t cMaxChunks;
250 /** The current number of chunks. */
251 uint32_t cChunks;
252 /** Hint where to start looking for available memory. */
253 uint32_t idxChunkHint;
254 /** Statistics: Current number of allocations. */
255 uint32_t cAllocations;
256
257 /** The total amount of memory available. */
258 uint64_t cbTotal;
259 /** Total amount of free memory. */
260 uint64_t cbFree;
261 /** Total amount of memory allocated. */
262 uint64_t cbAllocated;
263
264 /** Pointer to the allocation bitmaps for all the chunks (follows aChunks).
265 *
266 * Since the chunk size is a power of two and the minimum chunk size is a lot
267 * higher than the IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE, each chunk will always
268 * require a whole number of uint64_t elements in the allocation bitmap. So,
269 * for the sake of simplicity/laziness, they are allocated as one
270 * contiguous block. */
271 uint64_t *pbmAlloc;
272 /** Number of units (IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE) per chunk. */
273 uint32_t cUnitsPerChunk;
274 /** Number of bitmap elements per chunk (for quickly locating the bitmap
275 * portion corresponding to a chunk). */
276 uint32_t cBitmapElementsPerChunk;
277
278 /** Number of times we fruitlessly scanned a chunk for free space. */
279 uint64_t cFruitlessChunkScans;
280
281#ifdef IEMEXECMEM_ALT_SUB_WITH_ALT_PRUNING
282 /** The next chunk to prune in. */
283 uint32_t idxChunkPrune;
285 /** The offset within the chunk to start pruning at. */
285 uint32_t offChunkPrune;
286 /** Profiling the pruning code. */
287 STAMPROFILE StatPruneProf;
288 /** Number of bytes recovered by the pruning. */
289 STAMPROFILE StatPruneRecovered;
290#endif
291
292#ifdef VBOX_WITH_STATISTICS
293 STAMPROFILE StatAlloc;
294 /** Total amount of memory currently unusable due to IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE rounding. */
295 uint64_t cbUnusable;
296 /** Allocation size distribution (in alloc units; 0 is the slop bucket). */
297 STAMCOUNTER aStatSizes[16];
298#endif
299
300#if defined(IN_RING3) && !defined(RT_OS_WINDOWS)
301 /** Pointer to the array of unwind info running parallel to aChunks (same
302 * allocation as this structure, located after the bitmaps).
303 * (For Windows, the structures must reside in 32-bit RVA distance to the
304 * actual chunk, so they are allocated off the chunk.) */
305 PIEMEXECMEMCHUNKEHFRAME paEhFrames;
306#endif
307
308 /** The allocation chunks. */
309 RT_FLEXIBLE_ARRAY_EXTENSION
310 IEMEXECMEMCHUNK aChunks[RT_FLEXIBLE_ARRAY];
311} IEMEXECMEMALLOCATOR;
312/** Pointer to an executable memory allocator. */
313typedef IEMEXECMEMALLOCATOR *PIEMEXECMEMALLOCATOR;
314
315/** Magic value for IEMEXECMEMALLOCATOR::uMagic (Scott Frederick Turow). */
316#define IEMEXECMEMALLOCATOR_MAGIC UINT32_C(0x19490412)
317
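/*
 * Illustrative numbers for the bitmap layout described in IEMEXECMEMALLOCATOR
 * above (the chunk size is configurable; 2 MiB is just an example): with
 * cbChunk = 2 MiB and 256-byte allocation units, cUnitsPerChunk is
 * 2 MiB / 256 = 8192 and cBitmapElementsPerChunk is 8192 / 64 = 128 uint64_t
 * words.  The bitmap for chunk idxChunk thus starts at
 * &pbmAlloc[idxChunk * cBitmapElementsPerChunk].
 */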
318
319#ifdef IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER
320/**
321 * Allocation header.
322 */
323typedef struct IEMEXECMEMALLOCHDR
324{
325 union
326 {
327 struct
328 {
329 /** Magic value / eyecatcher (IEMEXECMEMALLOCHDR_MAGIC). */
330 uint32_t uMagic;
331 /** The allocation chunk (for speeding up freeing). */
332 uint32_t idxChunk;
333 };
334 /** Combined magic and chunk index, for the pruning scanner code. */
335 uint64_t u64MagicAndChunkIdx;
336 };
337 /** Pointer to the translation block the allocation belongs to.
338 * This is the whole point of the header. */
339 PIEMTB pTb;
340} IEMEXECMEMALLOCHDR;
341/** Pointer to an allocation header. */
342typedef IEMEXECMEMALLOCHDR *PIEMEXECMEMALLOCHDR;
343/** Magic value for IEMEXECMEMALLOCHDR ('ExeM'). */
344# define IEMEXECMEMALLOCHDR_MAGIC UINT32_C(0x4d657845)
345#endif
346
347
348static int iemExecMemAllocatorGrow(PVMCPUCC pVCpu, PIEMEXECMEMALLOCATOR pExecMemAllocator);
349
350
351#ifdef IEMEXECMEM_ALT_SUB_WITH_ALT_PRUNING
352/**
353 * Frees up executable memory when we're out of space.
354 *
355 * This is an alternative to iemTbAllocatorFreeupNativeSpace() that frees up
356 * space in a more linear fashion from the allocator's point of view. It may
357 * also defragment if implemented & enabled.
358 */
359static void iemExecMemAllocatorPrune(PVMCPU pVCpu, PIEMEXECMEMALLOCATOR pExecMemAllocator)
360{
361# ifndef IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER
362# error "IEMEXECMEM_ALT_SUB_WITH_ALT_PRUNING requires IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER"
363# endif
364 STAM_REL_PROFILE_START(&pExecMemAllocator->StatPruneProf, a);
365
366 /*
367 * Before we can start, we must process delayed frees.
368 */
369#if 1
370 PIEMTBALLOCATOR const pTbAllocator = iemTbAllocatorFreeBulkStart(pVCpu);
371#else
372 iemTbAllocatorProcessDelayedFrees(pVCpu, pVCpu->iem.s.pTbAllocatorR3);
373#endif
374
375 AssertCompile(RT_IS_POWER_OF_TWO(IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE));
376
377 uint32_t const cbChunk = pExecMemAllocator->cbChunk;
378 AssertReturnVoid(RT_IS_POWER_OF_TWO(cbChunk));
379 AssertReturnVoid(cbChunk >= _1M && cbChunk <= _256M); /* see iemExecMemAllocatorInit */
380
381 uint32_t const cChunks = pExecMemAllocator->cChunks;
382 AssertReturnVoid(cChunks == pExecMemAllocator->cMaxChunks);
383 AssertReturnVoid(cChunks >= 1);
384
385 Assert(!pVCpu->iem.s.pCurTbR3);
386
387 /*
388 * Decide how much to prune. The chunk size is a power of two, so we'll be
389 * scanning a power-of-two sized area here as well.
390 */
391 uint32_t cbToPrune = cbChunk;
392
393 /* Never more than 25%. */
394 if (cChunks < 4)
395 cbToPrune /= cChunks == 1 ? 4 : 2;
396
397 /* Upper limit. In a debug build a 4MB limit averages out at ~0.6ms per call. */
398 if (cbToPrune > _4M)
399 cbToPrune = _4M;
400
401 /*
402 * Adjust the pruning chunk and offset accordingly.
403 */
404 uint32_t idxChunk = pExecMemAllocator->idxChunkPrune;
405 uint32_t offChunk = pExecMemAllocator->offChunkPrune;
406 offChunk &= ~(uint32_t)(IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE - 1U);
407 if (offChunk >= cbChunk)
408 {
409 offChunk = 0;
410 idxChunk += 1;
411 }
412 if (idxChunk >= cChunks)
413 {
414 offChunk = 0;
415 idxChunk = 0;
416 }
417
418 uint32_t const offPruneEnd = RT_MIN(offChunk + cbToPrune, cbChunk);
419
420 /*
421 * Do the pruning. The current approach is the severe kind.
422 *
423 * This is memory bound, as we must load both the allocation header and the
424 * associated TB and then modify them. So, the CPU isn't all that utilized
425 * here. We try to apply some prefetching to speed it up a tiny bit.
426 */
427 uint64_t cbPruned = 0;
428 uint64_t const u64MagicAndChunkIdx = RT_MAKE_U64(IEMEXECMEMALLOCHDR_MAGIC, idxChunk);
429 uint8_t * const pbChunk = (uint8_t *)pExecMemAllocator->aChunks[idxChunk].pvChunkRx;
430 while (offChunk < offPruneEnd)
431 {
432 PIEMEXECMEMALLOCHDR pHdr = (PIEMEXECMEMALLOCHDR)&pbChunk[offChunk];
433
434 /* Is this the start of an allocation block for a TB? (We typically
435 have one allocation at the start of each chunk for the unwind info
436 where pTb is NULL.) */
437 PIEMTB pTb;
438 if ( pHdr->u64MagicAndChunkIdx == u64MagicAndChunkIdx
439 && RT_LIKELY((pTb = pHdr->pTb) != NULL))
440 {
441 AssertPtr(pTb);
442
443 uint32_t const cbBlock = RT_ALIGN_32(pTb->Native.cInstructions * sizeof(IEMNATIVEINSTR) + sizeof(*pHdr),
444 IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE);
445
446 /* Prefetch the next header before freeing the current one and its TB. */
447 /** @todo Iff the block size was part of the header in some way, this could be
448 * a tiny bit faster. */
449 offChunk += cbBlock;
450#if defined(_MSC_VER) && defined(RT_ARCH_AMD64)
451 _mm_prefetch((char *)&pbChunk[offChunk], _MM_HINT_T0);
452#elif defined(_MSC_VER) && defined(RT_ARCH_ARM64)
453 __prefetch(&pbChunk[offChunk]);
454#else
455 __builtin_prefetch(&pbChunk[offChunk], 1 /*rw*/);
456#endif
457 /* Some paranoia first, though. */
458 AssertBreakStmt(offChunk <= cbChunk, offChunk -= cbBlock - IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE);
459 cbPruned += cbBlock;
460
461#if 1
462 iemTbAllocatorFreeBulk(pVCpu, pTbAllocator, pTb);
463#else
464 iemTbAllocatorFree(pVCpu, pTb);
465#endif
466 }
467 else
468 offChunk += IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE;
469 }
470 STAM_REL_PROFILE_ADD_PERIOD(&pExecMemAllocator->StatPruneRecovered, cbPruned);
471
472 pVCpu->iem.s.ppTbLookupEntryR3 = &pVCpu->iem.s.pTbLookupEntryDummyR3;
473
474 /*
475 * Save the current pruning point.
476 */
477 pExecMemAllocator->offChunkPrune = offChunk;
478 pExecMemAllocator->idxChunkPrune = idxChunk;
479
480 /* Set the hint to the start of the pruned region. */
481 pExecMemAllocator->idxChunkHint = idxChunk;
482 pExecMemAllocator->aChunks[idxChunk].idxFreeHint = offChunk / IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE;
483
484 STAM_REL_PROFILE_STOP(&pExecMemAllocator->StatPruneProf, a);
485}
486#endif /* IEMEXECMEM_ALT_SUB_WITH_ALT_PRUNING */
487
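/*
 * Rough worked example of the pruning window above (the chunk size is
 * configurable; 16 MiB is just an illustration): with cbChunk = 16 MiB and a
 * single chunk, cbToPrune starts at 16 MiB / 4 = 4 MiB and stays at the 4 MiB
 * cap; the scan then covers [offChunkPrune, offChunkPrune + 4 MiB) within the
 * chunk, and the saved idxChunkPrune/offChunkPrune make the next call resume
 * right after it, wrapping to the next chunk (and back to chunk 0) when the
 * end of a chunk is reached.
 */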
488
489#if defined(VBOX_STRICT) || 0
490/**
491 * The old bitmap scanner code, for comparison and assertions.
492 */
493static uint32_t iemExecMemAllocatorFindReqFreeUnitsOld(uint64_t *pbmAlloc, uint32_t cToScan, uint32_t cReqUnits)
494{
495 /** @todo This can probably be done more efficiently for non-x86 systems. */
496 int iBit = ASMBitFirstClear(pbmAlloc, cToScan);
497 while (iBit >= 0 && (uint32_t)iBit <= cToScan - cReqUnits)
498 {
499 uint32_t idxAddBit = 1;
500 while (idxAddBit < cReqUnits && !ASMBitTest(pbmAlloc, (uint32_t)iBit + idxAddBit))
501 idxAddBit++;
502 if (idxAddBit >= cReqUnits)
503 return (uint32_t)iBit;
504 iBit = ASMBitNextClear(pbmAlloc, cToScan, iBit + idxAddBit - 1);
505 }
506 return UINT32_MAX;
507}
508#endif
509
510
511/**
512 * Bitmap scanner code that looks for a bunch of @a cReqUnits zero bits.
513 *
514 * Booting win11 with an r165098 release build, the average native TB size is
515 * around 9 units (of 256 bytes). So, it is unlikely we need to scan any
516 * subsequent words once we hit a patch of zeros, thus @a a_fBig.
517 *
518 * @todo This needs more tweaking. While it *is* faster than the old code,
519 * it doesn't seem like it's all that much. :/
520 */
521template<const bool a_fBig>
522static uint32_t iemExecMemAllocatorFindReqFreeUnits(uint64_t *pbmAlloc, uint32_t c64WordsToScan, uint32_t cReqUnits)
523{
524 /*
525 * Scan the (section of the) allocation bitmap in 64-bit words.
526 */
527 unsigned cPrevLeadingZeros = 0;
528 for (uint32_t off = 0; off < c64WordsToScan; off++)
529 {
530 uint64_t uWord = pbmAlloc[off];
531 if (uWord == UINT64_MAX)
532 {
533 /*
534 * Getting thru patches of UINT64_MAX is a frequent problem when the allocator
535 * fills up, so it's definitely worth optimizing.
536 *
537 * The complicated code below is a bit faster on arm. Reducing the per TB cost
538 * from 4255ns to 4106ns (best run out of 10). On win/amd64 there isn't an
539 * obvious gain here, at least not with the data currently being profiled.
540 */
541#if 1
542 off++;
543 uint32_t cQuads = (c64WordsToScan - off) / 4;
544
545 /* Align. */
546 if (cQuads > 1)
547 switch (((uintptr_t)&pbmAlloc[off] / sizeof(uint64_t)) & 3)
548 {
549 case 0:
550 break;
551 case 1:
552 {
553 uWord = pbmAlloc[off];
554 uint64_t uWord1 = pbmAlloc[off + 1];
555 uint64_t uWord2 = pbmAlloc[off + 2];
556 if ((uWord & uWord1 & uWord2) == UINT64_MAX)
557 {
558 off += 3;
559 cQuads = (c64WordsToScan - off) / 4;
560 }
561 else if (uWord == UINT64_MAX)
562 {
563 if (uWord1 != UINT64_MAX)
564 {
565 uWord = uWord1;
566 off += 1;
567 }
568 else
569 {
570 uWord = uWord2;
571 off += 2;
572 }
573 }
574 break;
575 }
576 case 2:
577 {
578 uWord = pbmAlloc[off];
579 uint64_t uWord1 = pbmAlloc[off + 1];
580 if ((uWord & uWord1) == UINT64_MAX)
581 {
582 off += 2;
583 cQuads = (c64WordsToScan - off) / 4;
584 }
585 else if (uWord == UINT64_MAX)
586 {
587 uWord = uWord1;
588 off += 1;
589 }
590 break;
591 }
592 case 3:
593 uWord = pbmAlloc[off];
594 if (uWord == UINT64_MAX)
595 {
596 off++;
597 cQuads = (c64WordsToScan - off) / 4;
598 }
599 break;
600 }
601 if (uWord == UINT64_MAX)
602 {
603 /* Looping over 32 bytes at a time. */
604 for (;;)
605 {
606 if (cQuads-- > 0)
607 {
608 uWord = pbmAlloc[off + 0];
609 uint64_t uWord1 = pbmAlloc[off + 1];
610 uint64_t uWord2 = pbmAlloc[off + 2];
611 uint64_t uWord3 = pbmAlloc[off + 3];
612 if ((uWord & uWord1 & uWord2 & uWord3) == UINT64_MAX)
613 off += 4;
614 else
615 {
616 if (uWord != UINT64_MAX)
617 { }
618 else if (uWord1 != UINT64_MAX)
619 {
620 uWord = uWord1;
621 off += 1;
622 }
623 else if (uWord2 != UINT64_MAX)
624 {
625 uWord = uWord2;
626 off += 2;
627 }
628 else
629 {
630 uWord = uWord3;
631 off += 3;
632 }
633 break;
634 }
635 }
636 else
637 {
638 if (off < c64WordsToScan)
639 {
640 uWord = pbmAlloc[off];
641 if (uWord != UINT64_MAX)
642 break;
643 off++;
644 if (off < c64WordsToScan)
645 {
646 uWord = pbmAlloc[off];
647 if (uWord != UINT64_MAX)
648 break;
649 off++;
650 if (off < c64WordsToScan)
651 {
652 uWord = pbmAlloc[off];
653 if (uWord != UINT64_MAX)
654 break;
655 Assert(off + 1 == c64WordsToScan);
656 }
657 }
658 }
659 return UINT32_MAX;
660 }
661 }
662 }
663#else
664 do
665 {
666 off++;
667 if (off < c64WordsToScan)
668 uWord = pbmAlloc[off];
669 else
670 return UINT32_MAX;
671 } while (uWord == UINT64_MAX);
672#endif
673 cPrevLeadingZeros = 0;
674 }
675
676 /*
677 * If we get down here, we have a word that isn't UINT64_MAX.
678 */
679 if (uWord != 0)
680 {
681 /*
682 * Fend off large requests we cannot satisfy before the first set bit.
683 */
684 if (!a_fBig || cReqUnits < 64 + cPrevLeadingZeros)
685 {
686#ifdef __GNUC__
687 unsigned cZerosInWord = __builtin_popcountl(~uWord);
688#elif defined(_MSC_VER) && defined(RT_ARCH_AMD64)
689 unsigned cZerosInWord = __popcnt64(~uWord);
690#elif defined(_MSC_VER) && defined(RT_ARCH_ARM64)
691 unsigned cZerosInWord = _CountOneBits64(~uWord);
692#else
693# pragma message("need popcount intrinsic or something...")
694 unsigned cZerosInWord = 0;
695 for (uint64_t uTmp = ~uWord; uTmp; cZerosInWord++)
696 uTmp &= uTmp - 1; /* Clears the least significant bit set. */
697#endif
698 if (cZerosInWord + cPrevLeadingZeros >= cReqUnits)
699 {
700 /* Check if we've got a patch of zeros at the trailing end
701 when joined with the previous word: */
702#ifdef __GNUC__
703 unsigned cTrailingZeros = __builtin_ctzl(uWord);
704#else
705 unsigned cTrailingZeros = ASMBitFirstSetU64(uWord) - 1;
706#endif
707 if (cPrevLeadingZeros + cTrailingZeros >= cReqUnits)
708 return off * 64 - cPrevLeadingZeros;
709
710 /*
711 * Try leading zeros before we get on with the tedious stuff.
712 */
713#ifdef __GNUC__
714 cPrevLeadingZeros = __builtin_clzl(uWord);
715#else
716 cPrevLeadingZeros = 64 - ASMBitLastSetU64(uWord);
717#endif
718 if (cPrevLeadingZeros >= cReqUnits)
719 return (off + 1) * 64 - cPrevLeadingZeros;
720
721 /*
722 * Check the popcount again sans leading & trailing before looking
723 * inside the word.
724 */
725 cZerosInWord -= cPrevLeadingZeros + cTrailingZeros;
726 if (cZerosInWord >= cReqUnits)
727 {
728 /* 1; 64 - 0 - 1 = 63; */
729 unsigned const iBitLast = 64 - cPrevLeadingZeros - cReqUnits; /** @todo boundary */
730 unsigned iBit = cTrailingZeros;
731 uWord >>= cTrailingZeros;
732 do
733 {
734 Assert(uWord & 1);
735#ifdef __GNUC__
736 unsigned iZeroBit = __builtin_ctzl(~uWord);
737#else
738 unsigned iZeroBit = ASMBitFirstSetU64(~uWord) - 1;
739#endif
740 iBit += iZeroBit;
741 uWord >>= iZeroBit;
742 Assert(iBit <= iBitLast);
743 Assert((uWord & 1) == 0);
744#ifdef __GNUC__
745 unsigned cZeros = __builtin_ctzl(uWord);
746#else
747 unsigned cZeros = ASMBitFirstSetU64(uWord) - 1;
748#endif
749 if (cZeros >= cReqUnits)
750 return off * 64 + iBit;
751
752 cZerosInWord -= cZeros; /* (may underflow as we will count shifted-in zeros) */
753 iBit += cZeros;
754 uWord >>= cZeros;
755 } while ((int)cZerosInWord >= (int)cReqUnits && iBit < iBitLast);
756 }
757 continue; /* we've already calculated cPrevLeadingZeros */
758 }
759 }
760
761 /* Update the leading (MSB) zero count. */
762#ifdef __GNUC__
763 cPrevLeadingZeros = __builtin_clzl(uWord);
764#else
765 cPrevLeadingZeros = 64 - ASMBitLastSetU64(uWord);
766#endif
767 }
768 /*
769 * uWord == 0
770 */
771 else
772 {
773 if RT_CONSTEXPR_IF(!a_fBig)
774 return off * 64 - cPrevLeadingZeros;
775 else /* keep else */
776 {
777 if (cPrevLeadingZeros + 64 >= cReqUnits)
778 return off * 64 - cPrevLeadingZeros;
779 for (uint32_t off2 = off + 1;; off2++)
780 {
781 if (off2 < c64WordsToScan)
782 {
783 uWord = pbmAlloc[off2];
784 if (uWord == UINT64_MAX)
785 {
786 cPrevLeadingZeros = 0;
787 break;
788 }
789 if (uWord == 0)
790 {
791 if (cPrevLeadingZeros + (off2 - off + 1) * 64 >= cReqUnits)
792 return off * 64 - cPrevLeadingZeros;
793 }
794 else
795 {
796#ifdef __GNUC__
797 unsigned cTrailingZeros = __builtin_ctzl(uWord);
798#else
799 unsigned cTrailingZeros = ASMBitFirstSetU64(uWord) - 1;
800#endif
801 if (cPrevLeadingZeros + (off2 - off) * 64 + cTrailingZeros >= cReqUnits)
802 return off * 64 - cPrevLeadingZeros;
803#ifdef __GNUC__
804 cPrevLeadingZeros = __builtin_clzl(uWord);
805#else
806 cPrevLeadingZeros = 64 - ASMBitLastSetU64(uWord);
807#endif
808 break;
809 }
810 }
811 else
812 return UINT32_MAX;
813 }
814 }
815 }
816 }
817 return UINT32_MAX;
818}
819
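/*
 * Worked example of the cross-word case handled above (the numbers are made
 * up for illustration): with cReqUnits = 9, suppose word 'off' has its five
 * most significant bits clear and word 'off + 1' has its four least
 * significant bits clear.  After processing word 'off', cPrevLeadingZeros is
 * 5; for word 'off + 1', cTrailingZeros is 4, so cPrevLeadingZeros +
 * cTrailingZeros >= cReqUnits and the function returns (off + 1) * 64 - 5,
 * i.e. the free run starts five units before the word boundary.
 */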
820
821/**
822 * Try allocate a block of @a cReqUnits in the chunk @a idxChunk.
823 */
824static void *
825iemExecMemAllocatorAllocInChunkInt(PIEMEXECMEMALLOCATOR pExecMemAllocator, uint64_t *pbmAlloc, uint32_t idxFirst,
826 uint32_t cToScan, uint32_t cReqUnits, uint32_t idxChunk, PIEMTB pTb,
827 void **ppvExec, PCIEMNATIVEPERCHUNKCTX *ppChunkCtx)
828{
829 /*
830 * Shift the bitmap to the idxFirst bit so we can use ASMBitFirstClear.
831 */
832 Assert(!(cToScan & 63));
833 Assert(!(idxFirst & 63));
834 Assert(cToScan + idxFirst <= pExecMemAllocator->cUnitsPerChunk);
835 pbmAlloc += idxFirst / 64;
836 cToScan += idxFirst & 63;
837 Assert(!(cToScan & 63));
838
839#if 1
840 uint32_t const iBit = cReqUnits < 64
841 ? iemExecMemAllocatorFindReqFreeUnits<false>(pbmAlloc, cToScan / 64, cReqUnits)
842 : iemExecMemAllocatorFindReqFreeUnits<true>( pbmAlloc, cToScan / 64, cReqUnits);
843 Assert(iBit == iemExecMemAllocatorFindReqFreeUnitsOld(pbmAlloc, cToScan, cReqUnits));
844#else
845 uint32_t const iBit = iemExecMemAllocatorFindReqFreeUnitsOld(pbmAlloc, cToScan, cReqUnits);
846#endif
847 if (iBit != UINT32_MAX)
848 {
849 ASMBitSetRange(pbmAlloc, (uint32_t)iBit, (uint32_t)iBit + cReqUnits);
850
851 PIEMEXECMEMCHUNK const pChunk = &pExecMemAllocator->aChunks[idxChunk];
852 pChunk->cFreeUnits -= cReqUnits;
853 pChunk->idxFreeHint = (uint32_t)iBit + cReqUnits;
854
855 pExecMemAllocator->cAllocations += 1;
856 uint32_t const cbReq = cReqUnits << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT;
857 pExecMemAllocator->cbAllocated += cbReq;
858 pExecMemAllocator->cbFree -= cbReq;
859 pExecMemAllocator->idxChunkHint = idxChunk;
860
861 void * const pvMemRw = (uint8_t *)pChunk->pvChunkRw
862 + ((idxFirst + (uint32_t)iBit) << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT);
863
864 if (ppChunkCtx)
865 *ppChunkCtx = pChunk->pCtx;
866
867 /*
868 * Initialize the header and return.
869 */
870# ifdef IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER
871 PIEMEXECMEMALLOCHDR const pHdr = (PIEMEXECMEMALLOCHDR)pvMemRw;
872 pHdr->uMagic = IEMEXECMEMALLOCHDR_MAGIC;
873 pHdr->idxChunk = idxChunk;
874 pHdr->pTb = pTb;
875
876 if (ppvExec)
877 *ppvExec = (uint8_t *)pChunk->pvChunkRx
878 + ((idxFirst + (uint32_t)iBit) << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT)
879 + sizeof(*pHdr);
880
881 return pHdr + 1;
882#else
883 if (ppvExec)
884 *ppvExec = (uint8_t *)pChunk->pvChunkRx
885 + ((idxFirst + (uint32_t)iBit) << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT);
886
887 RT_NOREF(pTb);
888 return pvMemRw;
889#endif
890 }
891
892 return NULL;
893}
894
895
896/**
897 * Converts requested number of bytes into a unit count.
898 */
899DECL_FORCE_INLINE(uint32_t) iemExecMemAllocBytesToUnits(uint32_t cbReq)
900{
901#ifdef IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER
902 return (cbReq + sizeof(IEMEXECMEMALLOCHDR) + IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE - 1)
903#else
904 return (cbReq + IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE - 1)
905#endif
906 >> IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT;
907}
908
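/*
 * Worked example for the conversion above (64-bit host with the allocation
 * header enabled, so sizeof(IEMEXECMEMALLOCHDR) is 16 bytes): a request of
 * cbReq = 1000 bytes yields (1000 + 16 + 255) >> 8 = 4 units, i.e. 1024
 * bytes, of which 1008 bytes are usable by the caller.  The 24 bytes of
 * rounding slop is what ends up in cbUnusable when VBOX_WITH_STATISTICS is
 * defined.
 */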
909
910DECL_FORCE_INLINE(PIEMNATIVEINSTR)
911iemExecMemAllocatorAllocUnitsInChunkInner(PIEMEXECMEMALLOCATOR pExecMemAllocator, uint32_t idxChunk, uint32_t cReqUnits,
912 PIEMTB pTb, PIEMNATIVEINSTR *ppaExec, PCIEMNATIVEPERCHUNKCTX *ppChunkCtx)
913{
914 uint64_t * const pbmAlloc = &pExecMemAllocator->pbmAlloc[pExecMemAllocator->cBitmapElementsPerChunk * idxChunk];
915 uint32_t const idxHint = pExecMemAllocator->aChunks[idxChunk].idxFreeHint & ~(uint32_t)63;
916 if (idxHint + cReqUnits <= pExecMemAllocator->cUnitsPerChunk)
917 {
918 void *pvRet = iemExecMemAllocatorAllocInChunkInt(pExecMemAllocator, pbmAlloc, idxHint,
919 pExecMemAllocator->cUnitsPerChunk - idxHint,
920 cReqUnits, idxChunk, pTb, (void **)ppaExec, ppChunkCtx);
921 if (pvRet)
922 return (PIEMNATIVEINSTR)pvRet;
923 }
924 void *pvRet = iemExecMemAllocatorAllocInChunkInt(pExecMemAllocator, pbmAlloc, 0,
925 RT_MIN(pExecMemAllocator->cUnitsPerChunk,
926 RT_ALIGN_32(idxHint + cReqUnits, 64*4)),
927 cReqUnits, idxChunk, pTb, (void **)ppaExec, ppChunkCtx);
928 if (pvRet)
929 return (PIEMNATIVEINSTR)pvRet;
930
931 pExecMemAllocator->cFruitlessChunkScans += 1;
932 return NULL;
933}
934
935
936DECLINLINE(PIEMNATIVEINSTR)
937iemExecMemAllocatorAllocBytesInChunk(PIEMEXECMEMALLOCATOR pExecMemAllocator, uint32_t idxChunk, uint32_t cbReq,
938 PIEMNATIVEINSTR *ppaExec)
939{
940 uint32_t const cReqUnits = iemExecMemAllocBytesToUnits(cbReq);
941 if (cReqUnits <= pExecMemAllocator->aChunks[idxChunk].cFreeUnits)
942 return iemExecMemAllocatorAllocUnitsInChunkInner(pExecMemAllocator, idxChunk, cReqUnits, NULL /*pTb*/,
943 ppaExec, NULL /*ppChunkCtx*/);
944 return NULL;
945}
946
947
948/**
949 * Allocates @a cbReq bytes of executable memory.
950 *
951 * @returns Pointer to the readable/writeable memory, NULL if out of memory or another
952 * problem was encountered.
953 * @param pVCpu The cross context virtual CPU structure of the
954 * calling thread.
955 * @param cbReq How many bytes are required.
956 * @param pTb The translation block that will be using the allocation.
957 * @param ppaExec Where to return the pointer to executable view of
958 * the allocated memory, optional.
959 * @param ppChunkCtx Where to return the per chunk attached context
960 * if available, optional.
961 */
962DECLHIDDEN(PIEMNATIVEINSTR) iemExecMemAllocatorAlloc(PVMCPU pVCpu, uint32_t cbReq, PIEMTB pTb,
963 PIEMNATIVEINSTR *ppaExec, PCIEMNATIVEPERCHUNKCTX *ppChunkCtx) RT_NOEXCEPT
964{
965 PIEMEXECMEMALLOCATOR pExecMemAllocator = pVCpu->iem.s.pExecMemAllocatorR3;
966 AssertReturn(pExecMemAllocator && pExecMemAllocator->uMagic == IEMEXECMEMALLOCATOR_MAGIC, NULL);
967 AssertMsgReturn(cbReq > 32 && cbReq < _512K, ("%#x\n", cbReq), NULL);
968 STAM_PROFILE_START(&pExecMemAllocator->StatAlloc, a);
969
970 uint32_t const cReqUnits = iemExecMemAllocBytesToUnits(cbReq);
971 STAM_COUNTER_INC(&pExecMemAllocator->aStatSizes[cReqUnits < RT_ELEMENTS(pExecMemAllocator->aStatSizes) ? cReqUnits : 0]);
972 for (unsigned iIteration = 0;; iIteration++)
973 {
974 if ( cbReq * 2 <= pExecMemAllocator->cbFree
975 || (cReqUnits == 1 || pExecMemAllocator->cbFree >= IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE) )
976 {
977 uint32_t const cChunks = pExecMemAllocator->cChunks;
978 uint32_t const idxChunkHint = pExecMemAllocator->idxChunkHint < cChunks ? pExecMemAllocator->idxChunkHint : 0;
979
980 /*
981 * We do two passes here: the first pass skips chunks with fewer than cReqUnits * 16 free units;
982 * the second pass then checks those skipped chunks (except ones with fewer than cReqUnits * 2).
983 */
984 for (uint32_t cMinFreePass = cReqUnits == 1 ? cReqUnits : cReqUnits * 16, cMaxFreePass = UINT32_MAX;;)
985 {
986 for (uint32_t idxChunk = idxChunkHint; idxChunk < cChunks; idxChunk++)
987 if ( pExecMemAllocator->aChunks[idxChunk].cFreeUnits >= cMinFreePass
988 && pExecMemAllocator->aChunks[idxChunk].cFreeUnits <= cMaxFreePass)
989 {
990 PIEMNATIVEINSTR const pRet = iemExecMemAllocatorAllocUnitsInChunkInner(pExecMemAllocator, idxChunk,
991 cReqUnits, pTb, ppaExec, ppChunkCtx);
992 if (pRet)
993 {
994 STAM_PROFILE_STOP(&pExecMemAllocator->StatAlloc, a);
995#ifdef VBOX_WITH_STATISTICS
996 pExecMemAllocator->cbUnusable += (cReqUnits << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT) - cbReq;
997#endif
998 return pRet;
999 }
1000 }
1001 for (uint32_t idxChunk = 0; idxChunk < idxChunkHint; idxChunk++)
1002 if ( pExecMemAllocator->aChunks[idxChunk].cFreeUnits >= cMinFreePass
1003 && pExecMemAllocator->aChunks[idxChunk].cFreeUnits <= cMaxFreePass)
1004 {
1005 PIEMNATIVEINSTR const pRet = iemExecMemAllocatorAllocUnitsInChunkInner(pExecMemAllocator, idxChunk,
1006 cReqUnits, pTb, ppaExec, ppChunkCtx);
1007 if (pRet)
1008 {
1009 STAM_PROFILE_STOP(&pExecMemAllocator->StatAlloc, a);
1010#ifdef VBOX_WITH_STATISTICS
1011 pExecMemAllocator->cbUnusable += (cReqUnits << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT) - cbReq;
1012#endif
1013 return pRet;
1014 }
1015 }
1016 if (cMinFreePass <= cReqUnits * 2)
1017 break;
1018 cMaxFreePass = cMinFreePass - 1;
1019 cMinFreePass = cReqUnits * 2;
1020 }
1021 }
1022
1023 /*
1024 * Can we grow it with another chunk?
1025 */
1026 if (pExecMemAllocator->cChunks < pExecMemAllocator->cMaxChunks)
1027 {
1028 int rc = iemExecMemAllocatorGrow(pVCpu, pExecMemAllocator);
1029 AssertLogRelRCReturn(rc, NULL);
1030
1031 uint32_t const idxChunk = pExecMemAllocator->cChunks - 1;
1032 PIEMNATIVEINSTR const pRet = iemExecMemAllocatorAllocUnitsInChunkInner(pExecMemAllocator, idxChunk, cReqUnits, pTb,
1033 ppaExec, ppChunkCtx);
1034 if (pRet)
1035 {
1036 STAM_PROFILE_STOP(&pExecMemAllocator->StatAlloc, a);
1037#ifdef VBOX_WITH_STATISTICS
1038 pExecMemAllocator->cbUnusable += (cReqUnits << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT) - cbReq;
1039#endif
1040 return pRet;
1041 }
1042 AssertFailed();
1043 }
1044
1045 /*
1046 * Try pruning native TBs once.
1047 */
1048 if (iIteration == 0)
1049 {
1050#ifdef IEMEXECMEM_ALT_SUB_WITH_ALT_PRUNING
1051 iemExecMemAllocatorPrune(pVCpu, pExecMemAllocator);
1052#else
1053 /* No header included in the instruction count here. */
1054 uint32_t const cNeededInstrs = RT_ALIGN_32(cbReq, IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE) / sizeof(IEMNATIVEINSTR);
1055 iemTbAllocatorFreeupNativeSpace(pVCpu, cNeededInstrs);
1056#endif
1057 }
1058 else
1059 {
1060 STAM_REL_COUNTER_INC(&pVCpu->iem.s.StatNativeExecMemInstrBufAllocFailed);
1061 STAM_PROFILE_STOP(&pExecMemAllocator->StatAlloc, a);
1062 return NULL;
1063 }
1064 }
1065}
1066
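/*
 * Illustrative numbers for the two-pass chunk selection above: with the
 * typical cReqUnits = 9 mentioned earlier, the first pass only scans chunks
 * with at least 144 free units (9 * 16) and the second pass those with 18 to
 * 143 free units; chunks with fewer than 18 free units are not scanned at
 * all, as a scan there would most likely be fruitless.  If neither pass
 * succeeds, the function tries to add a chunk or, failing that, prunes old
 * TBs and retries before giving up.
 */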
1067
1068/** This is a hook to ensure the instruction cache is properly flushed before the code in the memory
1069 * given by @a pv and @a cb is executed. */
1070DECLHIDDEN(void) iemExecMemAllocatorReadyForUse(PVMCPUCC pVCpu, void *pv, size_t cb) RT_NOEXCEPT
1071{
1072#ifdef RT_OS_DARWIN
1073 /*
1074 * We need to synchronize the stuff we wrote to the data cache with the
1075 * instruction cache, since these aren't coherent on arm (or at least not
1076 * on Apple Mn CPUs).
1077 *
1078 * Note! Since we don't share any JIT'ed code with the other CPUs, we don't
1079 * really care whether the dcache is fully flushed back to memory. It
1080 * only needs to hit the level 2 cache, which the level 1 instruction
1081 * and data caches seem to be sharing. In ARM terms, we need to reach
1082 * a point of unification (PoU), rather than a point of coherency (PoC).
1083 *
1084 * https://developer.apple.com/documentation/apple-silicon/porting-just-in-time-compilers-to-apple-silicon
1085 *
1086 * https://developer.arm.com/documentation/den0013/d/Caches/Point-of-coherency-and-unification
1087 *
1088 * Experimenting with the approach used by sys_icache_invalidate() and
1089 * tweaking it a little could let us shave off a bit of effort. The thing
1090 * that slows the apple code down on an M2 (running Sonoma 13.4) seems to be
1091 * the 'DSB ISH' instructions performed every 20 icache line flushes.
1092 * Skipping these saves ~100ns or more per TB when profiling the native
1093 * recompiler on the TBs from a win11 full boot-desktop-shutdown sequence.
1094 * Thus we will leave DCACHE_ICACHE_SYNC_WITH_WITH_IVAU_DSB undefined if we
1095 * can.
1096 *
1097 * There appears not to be much difference between DSB options 'ISH',
1098 * 'ISHST', 'NSH' and 'NSHST'. The latter is theoretically all we need, so
1099 * we'll use that one.
1100 *
1101 * See https://developer.arm.com/documentation/100941/0101/Barriers for
1102 * details on the barrier options.
1103 *
1104 * Note! The CFG value "/IEM/HostICacheInvalidationViaHostAPI" can be used
1105 * to disable the experimental code should it misbehave.
1106 */
1107 uint8_t const fHostICacheInvalidation = pVCpu->iem.s.fHostICacheInvalidation;
1108 if (!(fHostICacheInvalidation & IEMNATIVE_ICACHE_F_USE_HOST_API))
1109 {
1110# define DCACHE_ICACHE_SYNC_DSB_OPTION "nshst"
1111/*# define DCACHE_ICACHE_SYNC_WITH_WITH_IVAU_DSB*/
1112
1113 /* Skipping this is fine, but doesn't impact perf much. */
1114 __asm__ __volatile__("dsb " DCACHE_ICACHE_SYNC_DSB_OPTION);
1115
1116 /* Invalidate the icache for the range [pv,pv+cb). */
1117# ifdef DCACHE_ICACHE_SYNC_WITH_WITH_IVAU_DSB
1118 size_t const cIvauDsbEvery = 20;
1119 unsigned cDsb = cIvauDsbEvery;
1120# endif
1121 size_t const cbCacheLine = 64;
1122 size_t cbInvalidate = cb + ((uintptr_t)pv & (cbCacheLine - 1));
1123 size_t cCacheLines = RT_ALIGN_Z(cbInvalidate, cbCacheLine) / cbCacheLine;
1124 uintptr_t uPtr = (uintptr_t)pv & ~(uintptr_t)(cbCacheLine - 1);
1125 for (;; uPtr += cbCacheLine)
1126 {
1127 __asm__ /*__volatile__*/("ic ivau, %0" : : "r" (uPtr));
1128 cCacheLines -= 1;
1129 if (!cCacheLines)
1130 break;
1131# ifdef DCACHE_ICACHE_SYNC_WITH_WITH_IVAU_DSB
1132 cDsb -= 1;
1133 if (cDsb != 0)
1134 { /* likely */ }
1135 else
1136 {
1137 __asm__ __volatile__("dsb " DCACHE_ICACHE_SYNC_DSB_OPTION);
1138 cDsb = cIvauDsbEvery;
1139 }
1140# endif
1141 }
1142
1143 /*
1144 * The DSB here is non-optional, it seems.
1145 *
1146 * The following ISB can be omitted on M2 without any obvious side effects;
1147 * omitting it produces better numbers in the above mentioned profiling scenario.
1148 * This could be related to the kHasICDSB flag in cpu_capabilities.h,
1149 * but it doesn't look like that flag is set here (M2, Sonoma 13.4).
1150 *
1151 * I've made the inclusion of the final ISB configurable, with a default
1152 * of skipping it.
1153 */
1154 if (!(fHostICacheInvalidation & IEMNATIVE_ICACHE_F_END_WITH_ISH))
1155 __asm__ __volatile__("dsb " DCACHE_ICACHE_SYNC_DSB_OPTION
1156 ::: "memory");
1157 else
1158 __asm__ __volatile__("dsb " DCACHE_ICACHE_SYNC_DSB_OPTION "\n\t"
1159 "isb"
1160 ::: "memory");
1161 }
1162 else
1163 sys_icache_invalidate(pv, cb);
1164
1165#elif defined(RT_OS_LINUX) && defined(RT_ARCH_ARM64)
1166 RT_NOREF(pVCpu);
1167
1168 /* There is __builtin___clear_cache() but it flushes both the instruction and data cache, so do it manually. */
1169 static uint32_t s_u32CtrEl0 = 0;
1170 if (!s_u32CtrEl0)
1171 asm volatile ("mrs %0, ctr_el0":"=r" (s_u32CtrEl0));
1172 uintptr_t cbICacheLine = (uintptr_t)4 << (s_u32CtrEl0 & 0xf);
1173
1174 uintptr_t pb = (uintptr_t)pv & ~(cbICacheLine - 1);
1175 for (; pb < (uintptr_t)pv + cb; pb += cbICacheLine)
1176 asm volatile ("ic ivau, %0" : : "r" (pb) : "memory");
1177
1178 asm volatile ("dsb ish\n\t isb\n\t" : : : "memory");
1179
1180#else
1181 RT_NOREF(pVCpu, pv, cb);
1182#endif
1183}
1184
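/*
 * Hypothetical usage sketch for the flushing hook above (the actual emitter
 * lives in the native recompiler; names like abCode/cbCode are made up here):
 * the code is written through the RW view returned by
 * iemExecMemAllocatorAlloc and the flush is done on the RX alias before the
 * TB is ever executed:
 *
 *     PIEMNATIVEINSTR paExec = NULL;
 *     PIEMNATIVEINSTR paRw   = iemExecMemAllocatorAlloc(pVCpu, cbCode, pTb, &paExec, NULL);
 *     if (paRw)
 *     {
 *         memcpy(paRw, abCode, cbCode);                          // emit via the RW view
 *         iemExecMemAllocatorReadyForUse(pVCpu, paExec, cbCode); // sync I-cache for the RX view
 *     }
 */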
1185
1186/**
1187 * Frees executable memory.
1188 */
1189DECLHIDDEN(void) iemExecMemAllocatorFree(PVMCPU pVCpu, void *pv, size_t cb) RT_NOEXCEPT
1190{
1191 PIEMEXECMEMALLOCATOR pExecMemAllocator = pVCpu->iem.s.pExecMemAllocatorR3;
1192 Assert(pExecMemAllocator && pExecMemAllocator->uMagic == IEMEXECMEMALLOCATOR_MAGIC);
1193 AssertPtr(pv);
1194#ifdef VBOX_WITH_STATISTICS
1195 size_t const cbOrig = cb;
1196#endif
1197#ifndef IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER
1198 Assert(!((uintptr_t)pv & (IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE - 1)));
1199
1200 /* Align the size as we did when allocating the block. */
1201 cb = RT_ALIGN_Z(cb, IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE);
1202
1203#else
1204 PIEMEXECMEMALLOCHDR pHdr = (PIEMEXECMEMALLOCHDR)pv - 1;
1205 Assert(!((uintptr_t)pHdr & (IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE - 1)));
1206 AssertReturnVoid(pHdr->uMagic == IEMEXECMEMALLOCHDR_MAGIC);
1207 uint32_t const idxChunk = pHdr->idxChunk;
1208 AssertReturnVoid(idxChunk < pExecMemAllocator->cChunks);
1209 pv = pHdr;
1210
1211 /* Adjust and align the size to cover the whole allocation area. */
1212 cb = RT_ALIGN_Z(cb + sizeof(*pHdr), IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE);
1213#endif
1214
1215 /* Free it / assert sanity. */
1216 bool fFound = false;
1217 uint32_t const cbChunk = pExecMemAllocator->cbChunk;
1218#ifndef IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER
1219 uint32_t const cChunks = pExecMemAllocator->cChunks;
1220 for (uint32_t idxChunk = 0; idxChunk < cChunks; idxChunk++)
1221#endif
1222 {
1223 uintptr_t const offChunk = (uintptr_t)pv - (uintptr_t)pExecMemAllocator->aChunks[idxChunk].pvChunkRx;
1224 fFound = offChunk < cbChunk;
1225 if (fFound)
1226 {
1227 uint32_t const idxFirst = (uint32_t)offChunk >> IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT;
1228 uint32_t const cReqUnits = (uint32_t)cb >> IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT;
1229
1230 /* Check that it's valid and free it. */
1231 uint64_t * const pbmAlloc = &pExecMemAllocator->pbmAlloc[pExecMemAllocator->cBitmapElementsPerChunk * idxChunk];
1232 AssertReturnVoid(ASMBitTest(pbmAlloc, idxFirst));
1233 for (uint32_t i = 1; i < cReqUnits; i++)
1234 AssertReturnVoid(ASMBitTest(pbmAlloc, idxFirst + i));
1235 ASMBitClearRange(pbmAlloc, idxFirst, idxFirst + cReqUnits);
1236
1237 /* Invalidate the header using the writeable memory view. */
1238 pHdr = (PIEMEXECMEMALLOCHDR)((uintptr_t)pExecMemAllocator->aChunks[idxChunk].pvChunkRw + offChunk);
1239#ifdef IEMEXECMEM_ALT_SUB_WITH_ALLOC_HEADER
1240 pHdr->uMagic = 0;
1241 pHdr->idxChunk = 0;
1242 pHdr->pTb = NULL;
1243#endif
1244 pExecMemAllocator->aChunks[idxChunk].cFreeUnits += cReqUnits;
1245 pExecMemAllocator->aChunks[idxChunk].idxFreeHint = idxFirst;
1246
1247 /* Update the stats. */
1248 pExecMemAllocator->cbAllocated -= cb;
1249 pExecMemAllocator->cbFree += cb;
1250 pExecMemAllocator->cAllocations -= 1;
1251#ifdef VBOX_WITH_STATISTICS
1252 pExecMemAllocator->cbUnusable -= (cReqUnits << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT) - cbOrig;
1253#endif
1254 return;
1255 }
1256 }
1257 AssertFailed();
1258}
1259
1260
1261/**
1262 * Interface used by iemNativeRecompileAttachExecMemChunkCtx and unwind info
1263 * generators.
1264 */
1265DECLHIDDEN(PIEMNATIVEINSTR)
1266iemExecMemAllocatorAllocFromChunk(PVMCPU pVCpu, uint32_t idxChunk, uint32_t cbReq, PIEMNATIVEINSTR *ppaExec)
1267{
1268 PIEMEXECMEMALLOCATOR pExecMemAllocator = pVCpu->iem.s.pExecMemAllocatorR3;
1269 AssertReturn(idxChunk < pExecMemAllocator->cChunks, NULL);
1270 Assert(cbReq < _1M);
1271 return iemExecMemAllocatorAllocBytesInChunk(pExecMemAllocator, idxChunk, cbReq, ppaExec);
1272}
1273
1274
1275/**
1276 * For getting the per-chunk context detailing common code for a TB.
1277 *
1278 * This is for use by the disassembler.
1279 */
1280DECLHIDDEN(PCIEMNATIVEPERCHUNKCTX) iemExecMemGetTbChunkCtx(PVMCPU pVCpu, PCIEMTB pTb)
1281{
1282 PIEMEXECMEMALLOCATOR pExecMemAllocator = pVCpu->iem.s.pExecMemAllocatorR3;
1283 if ((pTb->fFlags & IEMTB_F_TYPE_MASK) == IEMTB_F_TYPE_NATIVE)
1284 {
1285 uintptr_t const uAddress = (uintptr_t)pTb->Native.paInstructions;
1286 uint32_t const cbChunk = pExecMemAllocator->cbChunk;
1287 uint32_t idxChunk = pExecMemAllocator->cChunks;
1288 while (idxChunk-- > 0)
1289 if (uAddress - (uintptr_t)pExecMemAllocator->aChunks[idxChunk].pvChunkRx < cbChunk)
1290 return pExecMemAllocator->aChunks[idxChunk].pCtx;
1291 }
1292 return NULL;
1293}
1294
1295
1296#ifdef IN_RING3
1297# ifdef RT_OS_WINDOWS
1298
1299/**
1300 * Initializes the unwind info structures for windows hosts.
1301 */
1302static int
1303iemExecMemAllocatorInitAndRegisterUnwindInfoForChunk(PVMCPUCC pVCpu, PIEMEXECMEMALLOCATOR pExecMemAllocator,
1304 void *pvChunk, uint32_t idxChunk)
1305{
1306 RT_NOREF(pVCpu);
1307
1308 /*
1309 * The AMD64 unwind opcodes.
1310 *
1311 * This is a program that starts with RSP after a RET instruction that
1312 * ends up in recompiled code, and the operations we describe here will
1313 * restore all non-volatile registers and bring RSP back to where our
1314 * RET address is. This means it's reverse order from what happens in
1315 * the prologue.
1316 *
1317 * Note! We use a frame register approach here both because we have one,
1318 * but mainly because the UWOP_ALLOC_LARGE argument values
1319 * would be a pain to write initializers for. On the positive
1320 * side, we're impervious to changes in the stack variable
1321 * area and can deal with dynamic stack allocations if necessary.
1322 */
1323 static const IMAGE_UNWIND_CODE s_aOpcodes[] =
1324 {
1325 { { 16, IMAGE_AMD64_UWOP_SET_FPREG, 0 } }, /* RSP = RBP - FrameOffset * 10 (0x60) */
1326 { { 16, IMAGE_AMD64_UWOP_ALLOC_SMALL, 0 } }, /* RSP += 8; */
1327 { { 14, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_x15 } }, /* R15 = [RSP]; RSP += 8; */
1328 { { 12, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_x14 } }, /* R14 = [RSP]; RSP += 8; */
1329 { { 10, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_x13 } }, /* R13 = [RSP]; RSP += 8; */
1330 { { 8, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_x12 } }, /* R12 = [RSP]; RSP += 8; */
1331 { { 7, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_xDI } }, /* RDI = [RSP]; RSP += 8; */
1332 { { 6, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_xSI } }, /* RSI = [RSP]; RSP += 8; */
1333 { { 5, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_xBX } }, /* RBX = [RSP]; RSP += 8; */
1334 { { 4, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_xBP } }, /* RBP = [RSP]; RSP += 8; */
1335 };
1336 union
1337 {
1338 IMAGE_UNWIND_INFO Info;
1339 uint8_t abPadding[RT_UOFFSETOF(IMAGE_UNWIND_INFO, aOpcodes) + 16];
1340 } s_UnwindInfo =
1341 {
1342 {
1343 /* .Version = */ 1,
1344 /* .Flags = */ 0,
1345 /* .SizeOfProlog = */ 16, /* whatever */
1346 /* .CountOfCodes = */ RT_ELEMENTS(s_aOpcodes),
1347 /* .FrameRegister = */ X86_GREG_xBP,
1348 /* .FrameOffset = */ (-IEMNATIVE_FP_OFF_LAST_PUSH + 8) / 16 /* we're off by one slot. sigh. */,
1349 }
1350 };
1351 AssertCompile(-IEMNATIVE_FP_OFF_LAST_PUSH < 240 && -IEMNATIVE_FP_OFF_LAST_PUSH > 0);
1352 AssertCompile((-IEMNATIVE_FP_OFF_LAST_PUSH & 0xf) == 8);
1353
1354 /*
1355 * Calc how much space we need and allocate it off the exec heap.
1356 */
1357 unsigned const cFunctionEntries = 1;
1358 unsigned const cbUnwindInfo = sizeof(s_aOpcodes) + RT_UOFFSETOF(IMAGE_UNWIND_INFO, aOpcodes);
1359 unsigned const cbNeeded = sizeof(IMAGE_RUNTIME_FUNCTION_ENTRY) * cFunctionEntries + cbUnwindInfo;
1360 PIMAGE_RUNTIME_FUNCTION_ENTRY const paFunctions
1361 = (PIMAGE_RUNTIME_FUNCTION_ENTRY)iemExecMemAllocatorAllocBytesInChunk(pExecMemAllocator, idxChunk, cbNeeded, NULL);
1362 AssertReturn(paFunctions, VERR_INTERNAL_ERROR_5);
1363 pExecMemAllocator->aChunks[idxChunk].pvUnwindInfo = paFunctions;
1364
1365 /*
1366 * Initialize the structures.
1367 */
1368 PIMAGE_UNWIND_INFO const pInfo = (PIMAGE_UNWIND_INFO)&paFunctions[cFunctionEntries];
1369
1370 paFunctions[0].BeginAddress = 0;
1371 paFunctions[0].EndAddress = pExecMemAllocator->cbChunk;
1372 paFunctions[0].UnwindInfoAddress = (uint32_t)((uintptr_t)pInfo - (uintptr_t)pvChunk);
1373
1374 memcpy(pInfo, &s_UnwindInfo, RT_UOFFSETOF(IMAGE_UNWIND_INFO, aOpcodes));
1375 memcpy(&pInfo->aOpcodes[0], s_aOpcodes, sizeof(s_aOpcodes));
1376
1377 /*
1378 * Register it.
1379 */
1380 uint8_t fRet = RtlAddFunctionTable(paFunctions, cFunctionEntries, (uintptr_t)pvChunk);
1381 AssertReturn(fRet, VERR_INTERNAL_ERROR_3); /* Nothing to clean up on failure, since it's within the chunk itself. */
1382
1383 return VINF_SUCCESS;
1384}
1385
1386
1387# else /* !RT_OS_WINDOWS */
1388
1389/**
1390 * Emits a LEB128 encoded value between -0x2000 and 0x2000 (both exclusive).
1391 */
1392DECLINLINE(RTPTRUNION) iemDwarfPutLeb128(RTPTRUNION Ptr, int32_t iValue)
1393{
1394 if (iValue >= 64)
1395 {
1396 Assert(iValue < 0x2000);
1397 *Ptr.pb++ = ((uint8_t)iValue & 0x7f) | 0x80;
1398 *Ptr.pb++ = (uint8_t)(iValue >> 7) & 0x3f;
1399 }
1400 else if (iValue >= 0)
1401 *Ptr.pb++ = (uint8_t)iValue;
1402 else if (iValue > -64)
1403 *Ptr.pb++ = ((uint8_t)iValue & 0x3f) | 0x40;
1404 else
1405 {
1406 Assert(iValue > -0x2000);
1407 *Ptr.pb++ = ((uint8_t)iValue & 0x7f) | 0x80;
1408 *Ptr.pb++ = ((uint8_t)(iValue >> 7) & 0x3f) | 0x40;
1409 }
1410 return Ptr;
1411}
1412
1413
1414/**
1415 * Emits an ULEB128 encoded value (up to 64-bit wide).
1416 */
1417DECLINLINE(RTPTRUNION) iemDwarfPutUleb128(RTPTRUNION Ptr, uint64_t uValue)
1418{
1419 while (uValue >= 0x80)
1420 {
1421 *Ptr.pb++ = ((uint8_t)uValue & 0x7f) | 0x80;
1422 uValue >>= 7;
1423 }
1424 *Ptr.pb++ = (uint8_t)uValue;
1425 return Ptr;
1426}
1427
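/*
 * Worked byte examples for the two encoders above: iemDwarfPutLeb128(Ptr, -8)
 * emits the single byte 0x78 (((-8) & 0x3f) | 0x40), which is how the -8 data
 * alignment factor used below ends up encoded; iemDwarfPutUleb128(Ptr, 300)
 * emits 0xac 0x02 ((300 & 0x7f) | 0x80, then 300 >> 7).
 */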
1428
1429/**
1430 * Emits a CFA rule as register @a uReg + offset @a off.
1431 */
1432DECLINLINE(RTPTRUNION) iemDwarfPutCfaDefCfa(RTPTRUNION Ptr, uint32_t uReg, uint32_t off)
1433{
1434 *Ptr.pb++ = DW_CFA_def_cfa;
1435 Ptr = iemDwarfPutUleb128(Ptr, uReg);
1436 Ptr = iemDwarfPutUleb128(Ptr, off);
1437 return Ptr;
1438}
1439
1440
1441/**
1442 * Emits a register (@a uReg) save location:
1443 * CFA + @a off * data_alignment_factor
1444 */
1445DECLINLINE(RTPTRUNION) iemDwarfPutCfaOffset(RTPTRUNION Ptr, uint32_t uReg, uint32_t off)
1446{
1447 if (uReg < 0x40)
1448 *Ptr.pb++ = DW_CFA_offset | uReg;
1449 else
1450 {
1451 *Ptr.pb++ = DW_CFA_offset_extended;
1452 Ptr = iemDwarfPutUleb128(Ptr, uReg);
1453 }
1454 Ptr = iemDwarfPutUleb128(Ptr, off);
1455 return Ptr;
1456}
1457
1458
1459# if 0 /* unused */
1460/**
1461 * Emits a register (@a uReg) save location, using signed offset:
1462 * CFA + @a offSigned * data_alignment_factor
1463 */
1464DECLINLINE(RTPTRUNION) iemDwarfPutCfaSignedOffset(RTPTRUNION Ptr, uint32_t uReg, int32_t offSigned)
1465{
1466 *Ptr.pb++ = DW_CFA_offset_extended_sf;
1467 Ptr = iemDwarfPutUleb128(Ptr, uReg);
1468 Ptr = iemDwarfPutLeb128(Ptr, offSigned);
1469 return Ptr;
1470}
1471# endif
1472
1473
1474/**
1475 * Initializes the unwind info section for non-windows hosts.
1476 */
1477static int
1478iemExecMemAllocatorInitAndRegisterUnwindInfoForChunk(PVMCPUCC pVCpu, PIEMEXECMEMALLOCATOR pExecMemAllocator,
1479 void *pvChunk, uint32_t idxChunk)
1480{
1481 PIEMEXECMEMCHUNKEHFRAME const pEhFrame = &pExecMemAllocator->paEhFrames[idxChunk];
1482 pExecMemAllocator->aChunks[idxChunk].pvUnwindInfo = pEhFrame; /* not necessary, but whatever */
1483
1484 RTPTRUNION Ptr = { pEhFrame->abEhFrame };
1485
1486 /*
1487 * Generate the CIE first.
1488 */
1489# ifdef IEMNATIVE_USE_LIBUNWIND /* libunwind (llvm, darwin) only supports v1 and v3. */
1490 uint8_t const iDwarfVer = 3;
1491# else
1492 uint8_t const iDwarfVer = 4;
1493# endif
1494 RTPTRUNION const PtrCie = Ptr;
1495 *Ptr.pu32++ = 123; /* The CIE length will be determined later. */
1496 *Ptr.pu32++ = 0 /*UINT32_MAX*/; /* I'm a CIE in .eh_frame speak. */
1497 *Ptr.pb++ = iDwarfVer; /* DWARF version */
1498 *Ptr.pb++ = 0; /* Augmentation. */
1499 if (iDwarfVer >= 4)
1500 {
1501 *Ptr.pb++ = sizeof(uintptr_t); /* Address size. */
1502 *Ptr.pb++ = 0; /* Segment selector size. */
1503 }
1504# ifdef RT_ARCH_AMD64
1505 Ptr = iemDwarfPutLeb128(Ptr, 1); /* Code alignment factor (LEB128 = 1). */
1506# else
1507 Ptr = iemDwarfPutLeb128(Ptr, 4); /* Code alignment factor (LEB128 = 4). */
1508# endif
1509 Ptr = iemDwarfPutLeb128(Ptr, -8); /* Data alignment factor (LEB128 = -8). */
1510# ifdef RT_ARCH_AMD64
1511 Ptr = iemDwarfPutUleb128(Ptr, DWREG_AMD64_RA); /* Return address column (ULEB128) */
1512# elif defined(RT_ARCH_ARM64)
1513 Ptr = iemDwarfPutUleb128(Ptr, DWREG_ARM64_LR); /* Return address column (ULEB128) */
1514# else
1515# error "port me"
1516# endif
1517 /* Initial instructions: */
1518# ifdef RT_ARCH_AMD64
1519 Ptr = iemDwarfPutCfaDefCfa(Ptr, DWREG_AMD64_RBP, 16); /* CFA = RBP + 0x10 - first stack parameter */
1520 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_RA, 1); /* Ret RIP = [CFA + 1*-8] */
1521 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_RBP, 2); /* RBP = [CFA + 2*-8] */
1522 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_RBX, 3); /* RBX = [CFA + 3*-8] */
1523 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_R12, 4); /* R12 = [CFA + 4*-8] */
1524 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_R13, 5); /* R13 = [CFA + 5*-8] */
1525 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_R14, 6); /* R14 = [CFA + 6*-8] */
1526 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_R15, 7); /* R15 = [CFA + 7*-8] */
1527# elif defined(RT_ARCH_ARM64)
1528# if 1
1529 Ptr = iemDwarfPutCfaDefCfa(Ptr, DWREG_ARM64_BP, 16); /* CFA = BP + 0x10 - first stack parameter */
1530# else
1531 Ptr = iemDwarfPutCfaDefCfa(Ptr, DWREG_ARM64_SP, IEMNATIVE_FRAME_VAR_SIZE + IEMNATIVE_FRAME_SAVE_REG_SIZE);
1532# endif
1533 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_LR, 1); /* Ret PC = [CFA + 1*-8] */
1534 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_BP, 2); /* Ret BP = [CFA + 2*-8] */
1535 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X28, 3); /* X28 = [CFA + 3*-8] */
1536 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X27, 4); /* X27 = [CFA + 4*-8] */
1537 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X26, 5); /* X26 = [CFA + 5*-8] */
1538 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X25, 6); /* X25 = [CFA + 6*-8] */
1539 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X24, 7); /* X24 = [CFA + 7*-8] */
1540 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X23, 8); /* X23 = [CFA + 8*-8] */
1541 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X22, 9); /* X22 = [CFA + 9*-8] */
1542 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X21, 10); /* X21 = [CFA +10*-8] */
1543 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X20, 11); /* X20 = [CFA +11*-8] */
1544 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X19, 12); /* X19 = [CFA +12*-8] */
1545 AssertCompile(IEMNATIVE_FRAME_SAVE_REG_SIZE / 8 == 12);
1546 /** @todo do we need to do something about clearing DWREG_ARM64_RA_SIGN_STATE or something? */
1547# else
1548# error "port me"
1549# endif
1550 while ((Ptr.u - PtrCie.u) & 3)
1551 *Ptr.pb++ = DW_CFA_nop;
1552 /* Finalize the CIE size. */
1553 *PtrCie.pu32 = Ptr.u - PtrCie.u - sizeof(uint32_t);
1554
1555 /*
1556 * Generate an FDE for the whole chunk area.
1557 */
1558# ifdef IEMNATIVE_USE_LIBUNWIND
1559 pEhFrame->offFda = Ptr.u - (uintptr_t)&pEhFrame->abEhFrame[0];
1560# endif
1561 RTPTRUNION const PtrFde = Ptr;
1562 *Ptr.pu32++ = 123; /* The FDE length will be determined later. */
1563 *Ptr.pu32 = Ptr.u - PtrCie.u; /* Negated self relative CIE address. */
1564 Ptr.pu32++;
1565 *Ptr.pu64++ = (uintptr_t)pvChunk; /* Absolute start PC of this FDE. */
1566 *Ptr.pu64++ = pExecMemAllocator->cbChunk; /* PC range length for this FDE. */
1567# if 0 /* not required for recent libunwind.dylib nor recent libgcc/glibc. */
1568 *Ptr.pb++ = DW_CFA_nop;
1569# endif
1570 while ((Ptr.u - PtrFde.u) & 3)
1571 *Ptr.pb++ = DW_CFA_nop;
1572 /* Finalize the FDE size. */
1573 *PtrFde.pu32 = Ptr.u - PtrFde.u - sizeof(uint32_t);
1574
1575 /* Terminator entry. */
1576 *Ptr.pu32++ = 0;
1577 *Ptr.pu32++ = 0; /* just to be sure... */
1578 Assert(Ptr.u - (uintptr_t)&pEhFrame->abEhFrame[0] <= sizeof(pEhFrame->abEhFrame));
1579
1580 /*
1581 * Register it.
1582 */
1583# ifdef IEMNATIVE_USE_LIBUNWIND
1584 __register_frame(&pEhFrame->abEhFrame[pEhFrame->offFda]);
1585# else
1586 memset(pEhFrame->abObject, 0xf6, sizeof(pEhFrame->abObject)); /* color the memory to better spot usage */
1587 __register_frame_info(pEhFrame->abEhFrame, pEhFrame->abObject);
1588# endif
1589
1590# ifdef IEMNATIVE_USE_GDB_JIT
1591 /*
1592 * Now for telling GDB about this (experimental).
1593 *
1594 * This seems to work best with ET_DYN.
1595 */
1596 GDBJITSYMFILE * const pSymFile = (GDBJITSYMFILE *)iemExecMemAllocatorAllocBytesInChunk(pExecMemAllocator, idxChunk,
1597 sizeof(GDBJITSYMFILE), NULL);
1598 AssertReturn(pSymFile, VERR_INTERNAL_ERROR_5);
1599 unsigned const offSymFileInChunk = (uintptr_t)pSymFile - (uintptr_t)pvChunk;
1600
1601 RT_ZERO(*pSymFile);
1602
1603 /*
1604 * The ELF header:
1605 */
1606 pSymFile->EHdr.e_ident[0] = ELFMAG0;
1607 pSymFile->EHdr.e_ident[1] = ELFMAG1;
1608 pSymFile->EHdr.e_ident[2] = ELFMAG2;
1609 pSymFile->EHdr.e_ident[3] = ELFMAG3;
1610 pSymFile->EHdr.e_ident[EI_VERSION] = EV_CURRENT;
1611 pSymFile->EHdr.e_ident[EI_CLASS] = ELFCLASS64;
1612 pSymFile->EHdr.e_ident[EI_DATA] = ELFDATA2LSB;
1613 pSymFile->EHdr.e_ident[EI_OSABI] = ELFOSABI_NONE;
1614# ifdef IEMNATIVE_USE_GDB_JIT_ET_DYN
1615 pSymFile->EHdr.e_type = ET_DYN;
1616# else
1617 pSymFile->EHdr.e_type = ET_REL;
1618# endif
1619# ifdef RT_ARCH_AMD64
1620 pSymFile->EHdr.e_machine = EM_AMD64;
1621# elif defined(RT_ARCH_ARM64)
1622 pSymFile->EHdr.e_machine = EM_AARCH64;
1623# else
1624# error "port me"
1625# endif
1626 pSymFile->EHdr.e_version = 1; /*?*/
1627 pSymFile->EHdr.e_entry = 0;
1628# if defined(IEMNATIVE_USE_GDB_JIT_ET_DYN)
1629 pSymFile->EHdr.e_phoff = RT_UOFFSETOF(GDBJITSYMFILE, aPhdrs);
1630# else
1631 pSymFile->EHdr.e_phoff = 0;
1632# endif
1633 pSymFile->EHdr.e_shoff = sizeof(pSymFile->EHdr);
1634 pSymFile->EHdr.e_flags = 0;
1635 pSymFile->EHdr.e_ehsize = sizeof(pSymFile->EHdr);
1636# if defined(IEMNATIVE_USE_GDB_JIT_ET_DYN)
1637 pSymFile->EHdr.e_phentsize = sizeof(pSymFile->aPhdrs[0]);
1638 pSymFile->EHdr.e_phnum = RT_ELEMENTS(pSymFile->aPhdrs);
1639# else
1640 pSymFile->EHdr.e_phentsize = 0;
1641 pSymFile->EHdr.e_phnum = 0;
1642# endif
1643 pSymFile->EHdr.e_shentsize = sizeof(pSymFile->aShdrs[0]);
1644 pSymFile->EHdr.e_shnum = RT_ELEMENTS(pSymFile->aShdrs);
1645 pSymFile->EHdr.e_shstrndx = 0; /* set later */
1646
1647 uint32_t offStrTab = 0;
1648#define APPEND_STR(a_szStr) do { \
1649 memcpy(&pSymFile->szzStrTab[offStrTab], a_szStr, sizeof(a_szStr)); \
1650 offStrTab += sizeof(a_szStr); \
1651 Assert(offStrTab < sizeof(pSymFile->szzStrTab)); \
1652 } while (0)
1653#define APPEND_STR_FMT(a_szStr, ...) do { \
1654 offStrTab += RTStrPrintf(&pSymFile->szzStrTab[offStrTab], sizeof(pSymFile->szzStrTab) - offStrTab, a_szStr, __VA_ARGS__); \
1655 offStrTab++; \
1656 Assert(offStrTab < sizeof(pSymFile->szzStrTab)); \
1657 } while (0)
1658
1659 /*
1660 * Section headers.
1661 */
1662 /* Section header #0: NULL */
1663 unsigned i = 0;
1664 APPEND_STR("");
1665 RT_ZERO(pSymFile->aShdrs[i]);
1666 i++;
1667
1668 /* Section header: .eh_frame */
1669 pSymFile->aShdrs[i].sh_name = offStrTab;
1670 APPEND_STR(".eh_frame");
1671 pSymFile->aShdrs[i].sh_type = SHT_PROGBITS;
1672 pSymFile->aShdrs[i].sh_flags = SHF_ALLOC | SHF_EXECINSTR;
1673# if defined(IEMNATIVE_USE_GDB_JIT_ET_DYN) || defined(IEMNATIVE_USE_GDB_JIT_ELF_RVAS)
1674 pSymFile->aShdrs[i].sh_offset
1675 = pSymFile->aShdrs[i].sh_addr = RT_UOFFSETOF(GDBJITSYMFILE, abEhFrame);
1676# else
1677 pSymFile->aShdrs[i].sh_addr = (uintptr_t)&pSymFile->abEhFrame[0];
1678 pSymFile->aShdrs[i].sh_offset = 0;
1679# endif
1680
1681 pSymFile->aShdrs[i].sh_size = sizeof(pEhFrame->abEhFrame);
1682 pSymFile->aShdrs[i].sh_link = 0;
1683 pSymFile->aShdrs[i].sh_info = 0;
1684 pSymFile->aShdrs[i].sh_addralign = 1;
1685 pSymFile->aShdrs[i].sh_entsize = 0;
1686 memcpy(pSymFile->abEhFrame, pEhFrame->abEhFrame, sizeof(pEhFrame->abEhFrame));
1687 i++;
1688
1689 /* Section header: .shstrtab */
1690 unsigned const iShStrTab = i;
1691 pSymFile->EHdr.e_shstrndx = iShStrTab;
1692 pSymFile->aShdrs[i].sh_name = offStrTab;
1693 APPEND_STR(".shstrtab");
1694 pSymFile->aShdrs[i].sh_type = SHT_STRTAB;
1695 pSymFile->aShdrs[i].sh_flags = SHF_ALLOC;
1696# if defined(IEMNATIVE_USE_GDB_JIT_ET_DYN) || defined(IEMNATIVE_USE_GDB_JIT_ELF_RVAS)
1697 pSymFile->aShdrs[i].sh_offset
1698 = pSymFile->aShdrs[i].sh_addr = RT_UOFFSETOF(GDBJITSYMFILE, szzStrTab);
1699# else
1700 pSymFile->aShdrs[i].sh_addr = (uintptr_t)&pSymFile->szzStrTab[0];
1701 pSymFile->aShdrs[i].sh_offset = 0;
1702# endif
1703 pSymFile->aShdrs[i].sh_size = sizeof(pSymFile->szzStrTab);
1704 pSymFile->aShdrs[i].sh_link = 0;
1705 pSymFile->aShdrs[i].sh_info = 0;
1706 pSymFile->aShdrs[i].sh_addralign = 1;
1707 pSymFile->aShdrs[i].sh_entsize = 0;
1708 i++;
1709
1710    /* Section header: .symtab */
1711 pSymFile->aShdrs[i].sh_name = offStrTab;
1712 APPEND_STR(".symtab");
1713 pSymFile->aShdrs[i].sh_type = SHT_SYMTAB;
1714 pSymFile->aShdrs[i].sh_flags = SHF_ALLOC;
1715 pSymFile->aShdrs[i].sh_offset
1716 = pSymFile->aShdrs[i].sh_addr = RT_UOFFSETOF(GDBJITSYMFILE, aSymbols);
1717 pSymFile->aShdrs[i].sh_size = sizeof(pSymFile->aSymbols);
1718 pSymFile->aShdrs[i].sh_link = iShStrTab;
1719 pSymFile->aShdrs[i].sh_info = RT_ELEMENTS(pSymFile->aSymbols);
1720 pSymFile->aShdrs[i].sh_addralign = sizeof(pSymFile->aSymbols[0].st_value);
1721 pSymFile->aShdrs[i].sh_entsize = sizeof(pSymFile->aSymbols[0]);
1722 i++;
1723
1724# if defined(IEMNATIVE_USE_GDB_JIT_ET_DYN)
1725    /* Section header: .dynsym */
1726 pSymFile->aShdrs[i].sh_name = offStrTab;
1727 APPEND_STR(".dynsym");
1728 pSymFile->aShdrs[i].sh_type = SHT_DYNSYM;
1729 pSymFile->aShdrs[i].sh_flags = SHF_ALLOC;
1730 pSymFile->aShdrs[i].sh_offset
1731 = pSymFile->aShdrs[i].sh_addr = RT_UOFFSETOF(GDBJITSYMFILE, aDynSyms);
1732 pSymFile->aShdrs[i].sh_size = sizeof(pSymFile->aDynSyms);
1733 pSymFile->aShdrs[i].sh_link = iShStrTab;
1734 pSymFile->aShdrs[i].sh_info = RT_ELEMENTS(pSymFile->aDynSyms);
1735 pSymFile->aShdrs[i].sh_addralign = sizeof(pSymFile->aDynSyms[0].st_value);
1736 pSymFile->aShdrs[i].sh_entsize = sizeof(pSymFile->aDynSyms[0]);
1737 i++;
1738# endif
1739
1740# if defined(IEMNATIVE_USE_GDB_JIT_ET_DYN)
1741 /* Section header: .dynamic */
1742 pSymFile->aShdrs[i].sh_name = offStrTab;
1743 APPEND_STR(".dynamic");
1744 pSymFile->aShdrs[i].sh_type = SHT_DYNAMIC;
1745 pSymFile->aShdrs[i].sh_flags = SHF_ALLOC;
1746 pSymFile->aShdrs[i].sh_offset
1747 = pSymFile->aShdrs[i].sh_addr = RT_UOFFSETOF(GDBJITSYMFILE, aDyn);
1748 pSymFile->aShdrs[i].sh_size = sizeof(pSymFile->aDyn);
1749 pSymFile->aShdrs[i].sh_link = iShStrTab;
1750 pSymFile->aShdrs[i].sh_info = 0;
1751 pSymFile->aShdrs[i].sh_addralign = 1;
1752 pSymFile->aShdrs[i].sh_entsize = sizeof(pSymFile->aDyn[0]);
1753 i++;
1754# endif
1755
1756 /* Section header: .text */
1757 unsigned const iShText = i;
1758 pSymFile->aShdrs[i].sh_name = offStrTab;
1759 APPEND_STR(".text");
1760 pSymFile->aShdrs[i].sh_type = SHT_PROGBITS;
1761 pSymFile->aShdrs[i].sh_flags = SHF_ALLOC | SHF_EXECINSTR;
1762# if defined(IEMNATIVE_USE_GDB_JIT_ET_DYN) || defined(IEMNATIVE_USE_GDB_JIT_ELF_RVAS)
1763 pSymFile->aShdrs[i].sh_offset
1764 = pSymFile->aShdrs[i].sh_addr = sizeof(GDBJITSYMFILE);
1765# else
1766 pSymFile->aShdrs[i].sh_addr = (uintptr_t)(pSymFile + 1);
1767 pSymFile->aShdrs[i].sh_offset = 0;
1768# endif
1769 pSymFile->aShdrs[i].sh_size = pExecMemAllocator->cbChunk - offSymFileInChunk - sizeof(GDBJITSYMFILE);
1770 pSymFile->aShdrs[i].sh_link = 0;
1771 pSymFile->aShdrs[i].sh_info = 0;
1772 pSymFile->aShdrs[i].sh_addralign = 1;
1773 pSymFile->aShdrs[i].sh_entsize = 0;
1774 i++;
1775
1776 Assert(i == RT_ELEMENTS(pSymFile->aShdrs));
1777
1778# if defined(IEMNATIVE_USE_GDB_JIT_ET_DYN)
1779 /*
1780 * The program headers:
1781 */
1782 /* Everything in a single LOAD segment: */
1783 i = 0;
1784 pSymFile->aPhdrs[i].p_type = PT_LOAD;
1785 pSymFile->aPhdrs[i].p_flags = PF_X | PF_R;
1786 pSymFile->aPhdrs[i].p_offset
1787 = pSymFile->aPhdrs[i].p_vaddr
1788 = pSymFile->aPhdrs[i].p_paddr = 0;
1789 pSymFile->aPhdrs[i].p_filesz /* Size of segment in file. */
1790 = pSymFile->aPhdrs[i].p_memsz = pExecMemAllocator->cbChunk - offSymFileInChunk;
1791 pSymFile->aPhdrs[i].p_align = HOST_PAGE_SIZE;
1792 i++;
1793 /* The .dynamic segment. */
1794 pSymFile->aPhdrs[i].p_type = PT_DYNAMIC;
1795 pSymFile->aPhdrs[i].p_flags = PF_R;
1796 pSymFile->aPhdrs[i].p_offset
1797 = pSymFile->aPhdrs[i].p_vaddr
1798 = pSymFile->aPhdrs[i].p_paddr = RT_UOFFSETOF(GDBJITSYMFILE, aDyn);
1799 pSymFile->aPhdrs[i].p_filesz /* Size of segment in file. */
1800 = pSymFile->aPhdrs[i].p_memsz = sizeof(pSymFile->aDyn);
1801 pSymFile->aPhdrs[i].p_align = sizeof(pSymFile->aDyn[0].d_tag);
1802 i++;
1803
1804 Assert(i == RT_ELEMENTS(pSymFile->aPhdrs));
1805
1806 /*
1807 * The dynamic section:
1808 */
1809 i = 0;
1810 pSymFile->aDyn[i].d_tag = DT_SONAME;
1811 pSymFile->aDyn[i].d_un.d_val = offStrTab;
1812 APPEND_STR_FMT("iem-exec-chunk-%u-%u", pVCpu->idCpu, idxChunk);
1813 i++;
1814 pSymFile->aDyn[i].d_tag = DT_STRTAB;
1815 pSymFile->aDyn[i].d_un.d_ptr = RT_UOFFSETOF(GDBJITSYMFILE, szzStrTab);
1816 i++;
1817 pSymFile->aDyn[i].d_tag = DT_STRSZ;
1818 pSymFile->aDyn[i].d_un.d_val = sizeof(pSymFile->szzStrTab);
1819 i++;
1820 pSymFile->aDyn[i].d_tag = DT_SYMTAB;
1821 pSymFile->aDyn[i].d_un.d_ptr = RT_UOFFSETOF(GDBJITSYMFILE, aDynSyms);
1822 i++;
1823 pSymFile->aDyn[i].d_tag = DT_SYMENT;
1824 pSymFile->aDyn[i].d_un.d_val = sizeof(pSymFile->aDynSyms[0]);
1825 i++;
1826 pSymFile->aDyn[i].d_tag = DT_NULL;
1827 i++;
1828 Assert(i == RT_ELEMENTS(pSymFile->aDyn));
1829# endif /* IEMNATIVE_USE_GDB_JIT_ET_DYN */
1830
1831 /*
1832 * Symbol tables:
1833 */
1834 /** @todo gdb doesn't seem to really like this ... */
1835 i = 0;
1836 pSymFile->aSymbols[i].st_name = 0;
1837 pSymFile->aSymbols[i].st_shndx = SHN_UNDEF;
1838 pSymFile->aSymbols[i].st_value = 0;
1839 pSymFile->aSymbols[i].st_size = 0;
1840 pSymFile->aSymbols[i].st_info = ELF64_ST_INFO(STB_LOCAL, STT_NOTYPE);
1841 pSymFile->aSymbols[i].st_other = 0 /* STV_DEFAULT */;
1842# ifdef IEMNATIVE_USE_GDB_JIT_ET_DYN
1843 pSymFile->aDynSyms[0] = pSymFile->aSymbols[i];
1844# endif
1845 i++;
1846
1847 pSymFile->aSymbols[i].st_name = 0;
1848 pSymFile->aSymbols[i].st_shndx = SHN_ABS;
1849 pSymFile->aSymbols[i].st_value = 0;
1850 pSymFile->aSymbols[i].st_size = 0;
1851 pSymFile->aSymbols[i].st_info = ELF64_ST_INFO(STB_LOCAL, STT_FILE);
1852 pSymFile->aSymbols[i].st_other = 0 /* STV_DEFAULT */;
1853 i++;
1854
1855 pSymFile->aSymbols[i].st_name = offStrTab;
1856 APPEND_STR_FMT("iem_exec_chunk_%u_%u", pVCpu->idCpu, idxChunk);
1857# if 0
1858 pSymFile->aSymbols[i].st_shndx = iShText;
1859 pSymFile->aSymbols[i].st_value = 0;
1860# else
1861 pSymFile->aSymbols[i].st_shndx = SHN_ABS;
1862 pSymFile->aSymbols[i].st_value = (uintptr_t)(pSymFile + 1);
1863# endif
1864 pSymFile->aSymbols[i].st_size = pSymFile->aShdrs[iShText].sh_size;
1865 pSymFile->aSymbols[i].st_info = ELF64_ST_INFO(STB_GLOBAL, STT_FUNC);
1866 pSymFile->aSymbols[i].st_other = 0 /* STV_DEFAULT */;
1867# ifdef IEMNATIVE_USE_GDB_JIT_ET_DYN
1868 pSymFile->aDynSyms[1] = pSymFile->aSymbols[i];
1869 pSymFile->aDynSyms[1].st_value = (uintptr_t)(pSymFile + 1);
1870# endif
1871 i++;
1872
1873 Assert(i == RT_ELEMENTS(pSymFile->aSymbols));
1874 Assert(offStrTab < sizeof(pSymFile->szzStrTab));
1875
1876 /*
1877 * The GDB JIT entry and informing GDB.
1878 */
1879 pEhFrame->GdbJitEntry.pbSymFile = (uint8_t *)pSymFile;
1880# if 1
1881 pEhFrame->GdbJitEntry.cbSymFile = pExecMemAllocator->cbChunk - ((uintptr_t)pSymFile - (uintptr_t)pvChunk);
1882# else
1883 pEhFrame->GdbJitEntry.cbSymFile = sizeof(GDBJITSYMFILE);
1884# endif
1885
1886 RTOnce(&g_IemNativeGdbJitOnce, iemNativeGdbJitInitOnce, NULL);
1887 RTCritSectEnter(&g_IemNativeGdbJitLock);
1888 pEhFrame->GdbJitEntry.pNext = NULL;
1889 pEhFrame->GdbJitEntry.pPrev = __jit_debug_descriptor.pTail;
1890 if (__jit_debug_descriptor.pTail)
1891 __jit_debug_descriptor.pTail->pNext = &pEhFrame->GdbJitEntry;
1892 else
1893 __jit_debug_descriptor.pHead = &pEhFrame->GdbJitEntry;
1894 __jit_debug_descriptor.pTail = &pEhFrame->GdbJitEntry;
1895 __jit_debug_descriptor.pRelevant = &pEhFrame->GdbJitEntry;
1896
1897 /* Notify GDB: */
1898 __jit_debug_descriptor.enmAction = kGdbJitaction_Register;
1899 __jit_debug_register_code();
1900 __jit_debug_descriptor.enmAction = kGdbJitaction_NoAction;
1901 RTCritSectLeave(&g_IemNativeGdbJitLock);
1902
1903# else /* !IEMNATIVE_USE_GDB_JIT */
1904 RT_NOREF(pVCpu);
1905# endif /* !IEMNATIVE_USE_GDB_JIT */
1906
1907 return VINF_SUCCESS;
1908}
1909
1910# endif /* !RT_OS_WINDOWS */
1911#endif /* IN_RING3 */
1912
1913
1914/**
1915 * Adds another chunk to the executable memory allocator.
1916 *
1917 * This is used by the init code for the initial allocation and later by the
1918 * regular allocator function when it's out of memory.
1919 */
1920static int iemExecMemAllocatorGrow(PVMCPUCC pVCpu, PIEMEXECMEMALLOCATOR pExecMemAllocator)
1921{
1922 /* Check that we've room for growth. */
1923 uint32_t const idxChunk = pExecMemAllocator->cChunks;
1924 AssertLogRelReturn(idxChunk < pExecMemAllocator->cMaxChunks, VERR_OUT_OF_RESOURCES);
1925
1926 /* Allocate a chunk. */
1927#ifdef RT_OS_DARWIN
1928 void *pvChunk = RTMemPageAllocEx(pExecMemAllocator->cbChunk, 0);
1929#else
1930 void *pvChunk = RTMemPageAllocEx(pExecMemAllocator->cbChunk, RTMEMPAGEALLOC_F_EXECUTABLE);
1931#endif
1932 AssertLogRelReturn(pvChunk, VERR_NO_EXEC_MEMORY);
1933
1934#ifdef RT_OS_DARWIN
1935 /*
1936     * Because it is impossible to have an RWX memory allocation on macOS, try to remap the memory
1937     * chunk readable/executable somewhere else so we can save ourselves the hassle of switching
1938     * between protections when executable memory is allocated.
1939 */
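        /* The net result is two mappings of the same pages: the original RW mapping at
           pvChunk which the recompiler writes into, and an RX alias at AddrRemapped
           which the generated code is executed from. */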
1940 int rc = VERR_NO_EXEC_MEMORY;
1941 mach_port_t hPortTask = mach_task_self();
1942 mach_vm_address_t AddrChunk = (mach_vm_address_t)pvChunk;
1943 mach_vm_address_t AddrRemapped = 0;
1944 vm_prot_t ProtCur = 0;
1945 vm_prot_t ProtMax = 0;
1946 kern_return_t krc = mach_vm_remap(hPortTask, &AddrRemapped, pExecMemAllocator->cbChunk, 0,
1947 VM_FLAGS_ANYWHERE | VM_FLAGS_RETURN_DATA_ADDR,
1948 hPortTask, AddrChunk, FALSE, &ProtCur, &ProtMax,
1949 VM_INHERIT_NONE);
1950 if (krc == KERN_SUCCESS)
1951 {
1952 krc = mach_vm_protect(mach_task_self(), AddrRemapped, pExecMemAllocator->cbChunk, FALSE, VM_PROT_READ | VM_PROT_EXECUTE);
1953 if (krc == KERN_SUCCESS)
1954 rc = VINF_SUCCESS;
1955 else
1956 {
1957 AssertLogRelMsgFailed(("mach_vm_protect -> %d (%#x)\n", krc, krc));
1958 krc = mach_vm_deallocate(hPortTask, AddrRemapped, pExecMemAllocator->cbChunk);
1959 Assert(krc == KERN_SUCCESS);
1960 }
1961 }
1962 else
1963 AssertLogRelMsgFailed(("mach_vm_remap -> %d (%#x)\n", krc, krc));
1964 if (RT_FAILURE(rc))
1965 {
1966 RTMemPageFree(pvChunk, pExecMemAllocator->cbChunk);
1967 return rc;
1968 }
1969
1970 void *pvChunkRx = (void *)AddrRemapped;
1971#else
1972 int rc = VINF_SUCCESS;
1973 void *pvChunkRx = pvChunk;
1974#endif
1975
1976 /*
1977 * Add the chunk.
1978 *
1979 * This must be done before the unwind init so windows can allocate
1980 * memory from the chunk when using the alternative sub-allocator.
1981 */
1982 pExecMemAllocator->aChunks[idxChunk].pvChunkRw = pvChunk;
1983 pExecMemAllocator->aChunks[idxChunk].pvChunkRx = pvChunkRx;
1984#ifdef IN_RING3
1985 pExecMemAllocator->aChunks[idxChunk].pvUnwindInfo = NULL;
1986#endif
1987 pExecMemAllocator->aChunks[idxChunk].cFreeUnits = pExecMemAllocator->cUnitsPerChunk;
1988 pExecMemAllocator->aChunks[idxChunk].idxFreeHint = 0;
1989 memset(&pExecMemAllocator->pbmAlloc[pExecMemAllocator->cBitmapElementsPerChunk * idxChunk],
1990 0, sizeof(pExecMemAllocator->pbmAlloc[0]) * pExecMemAllocator->cBitmapElementsPerChunk);
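        /* The allocation bitmap for this chunk was initialized to all ones (fully
           allocated) by iemExecMemAllocatorInit; clearing it here makes the new
           chunk's units available for allocation. */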
1991
1992 pExecMemAllocator->cChunks = idxChunk + 1;
1993 pExecMemAllocator->idxChunkHint = idxChunk;
1994
1995 pExecMemAllocator->cbTotal += pExecMemAllocator->cbChunk;
1996 pExecMemAllocator->cbFree += pExecMemAllocator->cbChunk;
1997
1998    /* If there is a chunk context init callback, call it. */
1999 rc = iemNativeRecompileAttachExecMemChunkCtx(pVCpu, idxChunk, &pExecMemAllocator->aChunks[idxChunk].pCtx);
2000#ifdef IN_RING3
2001 /*
2002 * Initialize the unwind information (this cannot really fail atm).
2003 * (This sets pvUnwindInfo.)
2004 */
2005 if (RT_SUCCESS(rc))
2006 rc = iemExecMemAllocatorInitAndRegisterUnwindInfoForChunk(pVCpu, pExecMemAllocator, pvChunkRx, idxChunk);
2007#endif
2008 if (RT_SUCCESS(rc))
2009 { /* likely */ }
2010 else
2011 {
2012        /* Just in case the impossible happens, undo the setup above: */
2013 pExecMemAllocator->cbTotal -= pExecMemAllocator->cbChunk;
2014 pExecMemAllocator->cbFree -= pExecMemAllocator->aChunks[idxChunk].cFreeUnits << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT;
2015 pExecMemAllocator->cChunks = idxChunk;
2016 memset(&pExecMemAllocator->pbmAlloc[pExecMemAllocator->cBitmapElementsPerChunk * idxChunk],
2017 0xff, sizeof(pExecMemAllocator->pbmAlloc[0]) * pExecMemAllocator->cBitmapElementsPerChunk);
2018 pExecMemAllocator->aChunks[idxChunk].pvChunkRw = NULL;
2019 pExecMemAllocator->aChunks[idxChunk].cFreeUnits = 0;
2020
2021# ifdef RT_OS_DARWIN
2022 krc = mach_vm_deallocate(mach_task_self(), (mach_vm_address_t)pExecMemAllocator->aChunks[idxChunk].pvChunkRx,
2023 pExecMemAllocator->cbChunk);
2024 Assert(krc == KERN_SUCCESS);
2025# endif
2026
2027 RTMemPageFree(pvChunk, pExecMemAllocator->cbChunk);
2028 return rc;
2029 }
2030
2031 return VINF_SUCCESS;
2032}
2033
2034
2035/**
2036 * Initializes the executable memory allocator for native recompilation on the
2037 * calling EMT.
2038 *
2039 * @returns VBox status code.
2040 * @param pVCpu The cross context virtual CPU structure of the calling
2041 * thread.
2042 * @param cbMax The max size of the allocator.
2043 * @param cbInitial The initial allocator size.
2044 * @param cbChunk The chunk size, 0 or UINT32_MAX for default (@a cbMax
2045 * dependent).
2046 */
2047int iemExecMemAllocatorInit(PVMCPU pVCpu, uint64_t cbMax, uint64_t cbInitial, uint32_t cbChunk) RT_NOEXCEPT
2048{
2049 /*
2050 * Validate input.
2051 */
2052 AssertLogRelMsgReturn(cbMax >= _1M && cbMax <= _4G+_4G, ("cbMax=%RU64 (%RX64)\n", cbMax, cbMax), VERR_OUT_OF_RANGE);
2053 AssertReturn(cbInitial <= cbMax, VERR_OUT_OF_RANGE);
2054    AssertLogRelMsgReturn(   cbChunk == UINT32_MAX
2055 || cbChunk == 0
2056 || ( RT_IS_POWER_OF_TWO(cbChunk)
2057 && cbChunk >= _1M
2058 && cbChunk <= _256M
2059 && cbChunk <= cbMax),
2060 ("cbChunk=%RU32 (%RX32) cbMax=%RU64\n", cbChunk, cbChunk, cbMax),
2061 VERR_OUT_OF_RANGE);
2062
2063 /*
2064 * Adjust/figure out the chunk size.
2065 */
2066 if (cbChunk == 0 || cbChunk == UINT32_MAX)
2067 {
2068 if (cbMax >= _256M)
2069 cbChunk = _64M;
2070 else
2071 {
2072 if (cbMax < _16M)
2073 cbChunk = cbMax >= _4M ? _4M : (uint32_t)cbMax;
2074 else
2075 cbChunk = (uint32_t)cbMax / 4;
2076 if (!RT_IS_POWER_OF_TWO(cbChunk))
2077 cbChunk = RT_BIT_32(ASMBitLastSetU32(cbChunk));
2078 }
2079 }
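        /* Example: cbMax = 48 MiB gives cbChunk = 48M/4 = 12M, which is then rounded
           up to the 16M power of two (assuming ASMBitLastSetU32 returns the 1-based
           index of the most significant set bit). */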
2080#if defined(RT_OS_AMD64)
2081 Assert(cbChunk <= _2G);
2082#elif defined(RT_OS_ARM64)
2083 if (cbChunk > _128M)
2084 cbChunk = _128M; /* Max relative branch distance is +/-2^(25+2) = +/-0x8000000 (134 217 728). */
2085#endif
2086
2087 if (cbChunk > cbMax)
2088 cbMax = cbChunk;
2089 else
2090 cbMax = (cbMax - 1 + cbChunk) / cbChunk * cbChunk;
2091 uint32_t const cMaxChunks = (uint32_t)(cbMax / cbChunk);
2092 AssertLogRelReturn((uint64_t)cMaxChunks * cbChunk == cbMax, VERR_INTERNAL_ERROR_3);
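        /* cbMax is rounded up to a whole number of chunks, e.g. cbMax = 40M with
           cbChunk = 16M becomes 48M and cMaxChunks = 3. */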
2093
2094 /*
2095     * Allocate and initialize the allocator instance.
2096 */
2097 size_t const offBitmaps = RT_ALIGN_Z(RT_UOFFSETOF_DYN(IEMEXECMEMALLOCATOR, aChunks[cMaxChunks]), RT_CACHELINE_SIZE);
2098 size_t const cbBitmaps = (size_t)(cbChunk >> (IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT + 3)) * cMaxChunks;
2099 size_t cbNeeded = offBitmaps + cbBitmaps;
2100 AssertCompile(IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT <= 10);
2101 Assert(cbChunk > RT_BIT_32(IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT + 3));
2102#if defined(IN_RING3) && !defined(RT_OS_WINDOWS)
2103 size_t const offEhFrames = RT_ALIGN_Z(cbNeeded, RT_CACHELINE_SIZE);
2104 cbNeeded += sizeof(IEMEXECMEMCHUNKEHFRAME) * cMaxChunks;
2105#endif
2106 PIEMEXECMEMALLOCATOR pExecMemAllocator = (PIEMEXECMEMALLOCATOR)RTMemAllocZ(cbNeeded);
2107 AssertLogRelMsgReturn(pExecMemAllocator, ("cbNeeded=%zx cMaxChunks=%#x cbChunk=%#x\n", cbNeeded, cMaxChunks, cbChunk),
2108 VERR_NO_MEMORY);
2109 pExecMemAllocator->uMagic = IEMEXECMEMALLOCATOR_MAGIC;
2110 pExecMemAllocator->cbChunk = cbChunk;
2111 pExecMemAllocator->cMaxChunks = cMaxChunks;
2112 pExecMemAllocator->cChunks = 0;
2113 pExecMemAllocator->idxChunkHint = 0;
2114 pExecMemAllocator->cAllocations = 0;
2115 pExecMemAllocator->cbTotal = 0;
2116 pExecMemAllocator->cbFree = 0;
2117 pExecMemAllocator->cbAllocated = 0;
2118#ifdef VBOX_WITH_STATISTICS
2119 pExecMemAllocator->cbUnusable = 0;
2120#endif
2121 pExecMemAllocator->pbmAlloc = (uint64_t *)((uintptr_t)pExecMemAllocator + offBitmaps);
2122 pExecMemAllocator->cUnitsPerChunk = cbChunk >> IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT;
2123 pExecMemAllocator->cBitmapElementsPerChunk = cbChunk >> (IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT + 6);
2124 memset(pExecMemAllocator->pbmAlloc, 0xff, cbBitmaps); /* Mark everything as allocated. Clear when chunks are added. */
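        /* Each allocation unit is RT_BIT_32(IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT) bytes and
           each 64-bit bitmap element tracks 64 units, hence the additional +6 in the shift
           for cBitmapElementsPerChunk above. */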
2125#if defined(IN_RING3) && !defined(RT_OS_WINDOWS)
2126 pExecMemAllocator->paEhFrames = (PIEMEXECMEMCHUNKEHFRAME)((uintptr_t)pExecMemAllocator + offEhFrames);
2127#endif
2128 for (uint32_t i = 0; i < cMaxChunks; i++)
2129 {
2130 pExecMemAllocator->aChunks[i].cFreeUnits = 0;
2131 pExecMemAllocator->aChunks[i].idxFreeHint = 0;
2132 pExecMemAllocator->aChunks[i].pvChunkRw = NULL;
2133#ifdef IN_RING0
2134 pExecMemAllocator->aChunks[i].hMemObj = NIL_RTR0MEMOBJ;
2135#else
2136 pExecMemAllocator->aChunks[i].pvUnwindInfo = NULL;
2137#endif
2138 }
2139 pVCpu->iem.s.pExecMemAllocatorR3 = pExecMemAllocator;
2140
2141 /*
2142 * Do the initial allocations.
2143 */
2144    while ((uint64_t)pExecMemAllocator->cChunks * pExecMemAllocator->cbChunk < cbInitial)
2145 {
2146 int rc = iemExecMemAllocatorGrow(pVCpu, pExecMemAllocator);
2147 AssertLogRelRCReturn(rc, rc);
2148 }
2149
2150 pExecMemAllocator->idxChunkHint = 0;
2151
2152 /*
2153 * Register statistics.
2154 */
2155 PUVM const pUVM = pVCpu->pUVCpu->pUVM;
2156    STAMR3RegisterFU(pUVM, &pExecMemAllocator->cAllocations, STAMTYPE_U32, STAMVISIBILITY_ALWAYS, STAMUNIT_COUNT,
2157 "Current number of allocations", "/IEM/CPU%u/re/ExecMem/cAllocations", pVCpu->idCpu);
2158 STAMR3RegisterFU(pUVM, &pExecMemAllocator->cChunks, STAMTYPE_U32, STAMVISIBILITY_ALWAYS, STAMUNIT_COUNT,
2159 "Currently allocated chunks", "/IEM/CPU%u/re/ExecMem/cChunks", pVCpu->idCpu);
2160 STAMR3RegisterFU(pUVM, &pExecMemAllocator->cMaxChunks, STAMTYPE_U32, STAMVISIBILITY_ALWAYS, STAMUNIT_COUNT,
2161 "Maximum number of chunks", "/IEM/CPU%u/re/ExecMem/cMaxChunks", pVCpu->idCpu);
2162 STAMR3RegisterFU(pUVM, &pExecMemAllocator->cbChunk, STAMTYPE_U32, STAMVISIBILITY_ALWAYS, STAMUNIT_BYTES,
2163 "Allocation chunk size", "/IEM/CPU%u/re/ExecMem/cbChunk", pVCpu->idCpu);
2164 STAMR3RegisterFU(pUVM, &pExecMemAllocator->cbAllocated, STAMTYPE_U64, STAMVISIBILITY_ALWAYS, STAMUNIT_BYTES,
2165                     "Number of bytes currently allocated", "/IEM/CPU%u/re/ExecMem/cbAllocated", pVCpu->idCpu);
2166 STAMR3RegisterFU(pUVM, &pExecMemAllocator->cbFree, STAMTYPE_U64, STAMVISIBILITY_ALWAYS, STAMUNIT_BYTES,
2167                     "Number of bytes currently free", "/IEM/CPU%u/re/ExecMem/cbFree", pVCpu->idCpu);
2168 STAMR3RegisterFU(pUVM, &pExecMemAllocator->cbTotal, STAMTYPE_U64, STAMVISIBILITY_ALWAYS, STAMUNIT_BYTES,
2169                     "Total number of bytes", "/IEM/CPU%u/re/ExecMem/cbTotal", pVCpu->idCpu);
2170#ifdef VBOX_WITH_STATISTICS
2171 STAMR3RegisterFU(pUVM, &pExecMemAllocator->cbUnusable, STAMTYPE_U64, STAMVISIBILITY_ALWAYS, STAMUNIT_BYTES,
2172                     "Total number of unusable bytes", "/IEM/CPU%u/re/ExecMem/cbUnusable", pVCpu->idCpu);
2173 STAMR3RegisterFU(pUVM, &pExecMemAllocator->StatAlloc, STAMTYPE_PROFILE, STAMVISIBILITY_ALWAYS, STAMUNIT_TICKS_PER_CALL,
2174 "Profiling the allocator", "/IEM/CPU%u/re/ExecMem/ProfAlloc", pVCpu->idCpu);
2175 for (unsigned i = 1; i < RT_ELEMENTS(pExecMemAllocator->aStatSizes); i++)
2176 STAMR3RegisterFU(pUVM, &pExecMemAllocator->aStatSizes[i], STAMTYPE_COUNTER, STAMVISIBILITY_ALWAYS, STAMUNIT_COUNT,
2177 "Number of allocations of this number of allocation units",
2178 "/IEM/CPU%u/re/ExecMem/aSize%02u", pVCpu->idCpu, i);
2179 STAMR3RegisterFU(pUVM, &pExecMemAllocator->aStatSizes[0], STAMTYPE_COUNTER, STAMVISIBILITY_ALWAYS, STAMUNIT_COUNT,
2180 "Number of allocations 16 units or larger", "/IEM/CPU%u/re/ExecMem/aSize16OrLarger", pVCpu->idCpu);
2181#endif
2182#ifdef IEMEXECMEM_ALT_SUB_WITH_ALT_PRUNING
2183 STAMR3RegisterFU(pUVM, &pExecMemAllocator->StatPruneProf, STAMTYPE_PROFILE, STAMVISIBILITY_ALWAYS, STAMUNIT_TICKS_PER_CALL,
2184 "Pruning executable memory (alt)", "/IEM/CPU%u/re/ExecMem/Pruning", pVCpu->idCpu);
2185 STAMR3RegisterFU(pUVM, &pExecMemAllocator->StatPruneRecovered, STAMTYPE_PROFILE, STAMVISIBILITY_ALWAYS, STAMUNIT_BYTES_PER_CALL,
2186 "Bytes recovered while pruning", "/IEM/CPU%u/re/ExecMem/PruningRecovered", pVCpu->idCpu);
2187#endif
2188 STAMR3RegisterFU(pUVM, &pExecMemAllocator->cFruitlessChunkScans, STAMTYPE_U64_RESET, STAMVISIBILITY_ALWAYS, STAMUNIT_COUNT,
2189 "Chunks fruitlessly scanned for free space", "/IEM/CPU%u/re/ExecMem/FruitlessChunkScans", pVCpu->idCpu);
2190
2191 return VINF_SUCCESS;
2192}
2193