VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllThrdRecompiler.cpp@106212

Last change on this file was 106212, checked in by vboxsync, 2 months ago

VMM/IEM: Some machinery for saving threaded TBs and a program for loading them and running them through the native recompiler. The program can be used for profiling or to check the effect of code changes. bugref:10720

1/* $Id: IEMAllThrdRecompiler.cpp 106212 2024-10-03 02:42:55Z vboxsync $ */
2/** @file
3 * IEM - Instruction Decoding and Threaded Recompilation.
4 *
5 * Logging group IEM_RE_THREADED assignments:
6 * - Level 1 (Log) : Errors, exceptions, interrupts and such major events. [same as IEM]
7 * - Flow (LogFlow) : TB calls being emitted.
8 * - Level 2 (Log2) : Basic instruction execution state info. [same as IEM]
9 * - Level 3 (Log3) : More detailed execution state info. [same as IEM]
10 * - Level 4 (Log4) : Decoding mnemonics w/ EIP. [same as IEM]
11 * - Level 5 (Log5) : Decoding details. [same as IEM]
12 * - Level 6 (Log6) : TB opcode range management.
13 * - Level 7 (Log7) : TB obsoletion.
14 * - Level 8 (Log8) : TB compilation.
15 * - Level 9 (Log9) : TB exec.
16 * - Level 10 (Log10): TB block lookup.
17 * - Level 11 (Log11): TB block lookup details.
18 * - Level 12 (Log12): TB insertion.
19 */
20
21/*
22 * Copyright (C) 2011-2024 Oracle and/or its affiliates.
23 *
24 * This file is part of VirtualBox base platform packages, as
25 * available from https://www.virtualbox.org.
26 *
27 * This program is free software; you can redistribute it and/or
28 * modify it under the terms of the GNU General Public License
29 * as published by the Free Software Foundation, in version 3 of the
30 * License.
31 *
32 * This program is distributed in the hope that it will be useful, but
33 * WITHOUT ANY WARRANTY; without even the implied warranty of
34 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
35 * General Public License for more details.
36 *
37 * You should have received a copy of the GNU General Public License
38 * along with this program; if not, see <https://www.gnu.org/licenses>.
39 *
40 * SPDX-License-Identifier: GPL-3.0-only
41 */
42
43
44/*********************************************************************************************************************************
45* Header Files *
46*********************************************************************************************************************************/
47#ifndef LOG_GROUP /* defined when included by tstIEMCheckMc.cpp */
48# define LOG_GROUP LOG_GROUP_IEM_RE_THREADED
49#endif
50#define IEM_WITH_CODE_TLB_AND_OPCODE_BUF /* A bit hackish, but it's all in IEMInline.h. */
51#define VMCPU_INCL_CPUM_GST_CTX
52#include <VBox/vmm/iem.h>
53#include <VBox/vmm/cpum.h>
54#include <VBox/vmm/apic.h>
55#include <VBox/vmm/pdm.h>
56#include <VBox/vmm/pgm.h>
57#include <VBox/vmm/iom.h>
58#include <VBox/vmm/em.h>
59#include <VBox/vmm/hm.h>
60#include <VBox/vmm/nem.h>
61#include <VBox/vmm/gim.h>
62#ifdef VBOX_WITH_NESTED_HWVIRT_SVM
63# include <VBox/vmm/em.h>
64# include <VBox/vmm/hm_svm.h>
65#endif
66#ifdef VBOX_WITH_NESTED_HWVIRT_VMX
67# include <VBox/vmm/hmvmxinline.h>
68#endif
69#include <VBox/vmm/tm.h>
70#include <VBox/vmm/dbgf.h>
71#include <VBox/vmm/dbgftrace.h>
72#ifndef TST_IEM_CHECK_MC
73# include "IEMInternal.h"
74#endif
75#include <VBox/vmm/vmcc.h>
76#include <VBox/log.h>
77#include <VBox/err.h>
78#include <VBox/param.h>
79#include <VBox/dis.h>
80#include <VBox/disopcode-x86-amd64.h>
81#include <iprt/asm-math.h>
82#include <iprt/assert.h>
83#include <iprt/mem.h>
84#include <iprt/string.h>
85#include <iprt/sort.h>
86#include <iprt/x86.h>
87
88#ifndef TST_IEM_CHECK_MC
89# include "IEMInline.h"
90# include "IEMOpHlp.h"
91# include "IEMMc.h"
92#endif
93
94#include "IEMThreadedFunctions.h"
95#ifdef VBOX_WITH_IEM_NATIVE_RECOMPILER
96# include "IEMN8veRecompiler.h"
97#endif
98
99
100/*
101 * Narrow down configs here to avoid wasting time on unused configs.
102 */
103
104#ifndef IEM_WITH_CODE_TLB
105# error The code TLB must be enabled for the recompiler.
106#endif
107
108#ifndef IEM_WITH_DATA_TLB
109# error The data TLB must be enabled for the recompiler.
110#endif
111
112#ifndef IEM_WITH_SETJMP
113# error The setjmp approach must be enabled for the recompiler.
114#endif
115
116#if defined(IEMNATIVE_WITH_SIMD_FP_NATIVE_EMITTERS) && !defined(IEMNATIVE_WITH_SIMD_REG_ALLOCATOR)
117# error "IEMNATIVE_WITH_SIMD_FP_NATIVE_EMITTERS requires IEMNATIVE_WITH_SIMD_REG_ALLOCATOR"
118#endif
119
120
121/**
122 * Calculates the effective address of a ModR/M memory operand, extended version
123 * for use in the recompilers.
124 *
125 * Meant to be used via IEM_MC_CALC_RM_EFF_ADDR.
126 *
127 * May longjmp on internal error.
128 *
129 * @return The effective address.
130 * @param pVCpu The cross context virtual CPU structure of the calling thread.
131 * @param bRm The ModRM byte.
132 * @param cbImmAndRspOffset - First byte: The size of any immediate
133 * following the effective address opcode bytes
134 * (only for RIP relative addressing).
135 * - Second byte: RSP displacement (for POP [ESP]).
136 * @param puInfo Extra info: 32-bit displacement (bits 31:0) and
137 * SIB byte (bits 39:32).
138 *
139 * @note This must be defined in a source file with matching
140 * IEM_WITH_CODE_TLB_AND_OPCODE_BUF define till the define is made default
141 * or implemented differently...
142 */
143RTGCPTR iemOpHlpCalcRmEffAddrJmpEx(PVMCPUCC pVCpu, uint8_t bRm, uint32_t cbImmAndRspOffset, uint64_t *puInfo) IEM_NOEXCEPT_MAY_LONGJMP
144{
145 Log5(("iemOpHlpCalcRmEffAddrJmp: bRm=%#x\n", bRm));
146# define SET_SS_DEF() \
147 do \
148 { \
149 if (!(pVCpu->iem.s.fPrefixes & IEM_OP_PRF_SEG_MASK)) \
150 pVCpu->iem.s.iEffSeg = X86_SREG_SS; \
151 } while (0)
152
153 if (!IEM_IS_64BIT_CODE(pVCpu))
154 {
155/** @todo Check the effective address size crap! */
156 if (pVCpu->iem.s.enmEffAddrMode == IEMMODE_16BIT)
157 {
158 uint16_t u16EffAddr;
159
160 /* Handle the disp16 form with no registers first. */
161 if ((bRm & (X86_MODRM_MOD_MASK | X86_MODRM_RM_MASK)) == 6)
162 {
163 IEM_OPCODE_GET_NEXT_U16(&u16EffAddr);
164 *puInfo = u16EffAddr;
165 }
166 else
167 {
168 /* Get the displacement. */
169 switch ((bRm >> X86_MODRM_MOD_SHIFT) & X86_MODRM_MOD_SMASK)
170 {
171 case 0: u16EffAddr = 0; break;
172 case 1: IEM_OPCODE_GET_NEXT_S8_SX_U16(&u16EffAddr); break;
173 case 2: IEM_OPCODE_GET_NEXT_U16(&u16EffAddr); break;
174 default: AssertFailedStmt(IEM_DO_LONGJMP(pVCpu, VERR_IEM_IPE_1)); /* (caller checked for these) */
175 }
176 *puInfo = u16EffAddr;
177
178 /* Add the base and index registers to the disp. */
179 switch (bRm & X86_MODRM_RM_MASK)
180 {
181 case 0: u16EffAddr += pVCpu->cpum.GstCtx.bx + pVCpu->cpum.GstCtx.si; break;
182 case 1: u16EffAddr += pVCpu->cpum.GstCtx.bx + pVCpu->cpum.GstCtx.di; break;
183 case 2: u16EffAddr += pVCpu->cpum.GstCtx.bp + pVCpu->cpum.GstCtx.si; SET_SS_DEF(); break;
184 case 3: u16EffAddr += pVCpu->cpum.GstCtx.bp + pVCpu->cpum.GstCtx.di; SET_SS_DEF(); break;
185 case 4: u16EffAddr += pVCpu->cpum.GstCtx.si; break;
186 case 5: u16EffAddr += pVCpu->cpum.GstCtx.di; break;
187 case 6: u16EffAddr += pVCpu->cpum.GstCtx.bp; SET_SS_DEF(); break;
188 case 7: u16EffAddr += pVCpu->cpum.GstCtx.bx; break;
189 }
190 }
191
192 Log5(("iemOpHlpCalcRmEffAddrJmp: EffAddr=%#06RX16 uInfo=%#RX64\n", u16EffAddr, *puInfo));
193 return u16EffAddr;
194 }
195
196 Assert(pVCpu->iem.s.enmEffAddrMode == IEMMODE_32BIT);
197 uint32_t u32EffAddr;
198 uint64_t uInfo;
199
200 /* Handle the disp32 form with no registers first. */
201 if ((bRm & (X86_MODRM_MOD_MASK | X86_MODRM_RM_MASK)) == 5)
202 {
203 IEM_OPCODE_GET_NEXT_U32(&u32EffAddr);
204 uInfo = u32EffAddr;
205 }
206 else
207 {
208 /* Get the register (or SIB) value. */
209 uInfo = 0;
210 switch ((bRm & X86_MODRM_RM_MASK))
211 {
212 case 0: u32EffAddr = pVCpu->cpum.GstCtx.eax; break;
213 case 1: u32EffAddr = pVCpu->cpum.GstCtx.ecx; break;
214 case 2: u32EffAddr = pVCpu->cpum.GstCtx.edx; break;
215 case 3: u32EffAddr = pVCpu->cpum.GstCtx.ebx; break;
216 case 4: /* SIB */
217 {
218 uint8_t bSib; IEM_OPCODE_GET_NEXT_U8(&bSib);
219 uInfo = (uint64_t)bSib << 32;
220
221 /* Get the index and scale it. */
222 switch ((bSib >> X86_SIB_INDEX_SHIFT) & X86_SIB_INDEX_SMASK)
223 {
224 case 0: u32EffAddr = pVCpu->cpum.GstCtx.eax; break;
225 case 1: u32EffAddr = pVCpu->cpum.GstCtx.ecx; break;
226 case 2: u32EffAddr = pVCpu->cpum.GstCtx.edx; break;
227 case 3: u32EffAddr = pVCpu->cpum.GstCtx.ebx; break;
228 case 4: u32EffAddr = 0; /*none */ break;
229 case 5: u32EffAddr = pVCpu->cpum.GstCtx.ebp; break;
230 case 6: u32EffAddr = pVCpu->cpum.GstCtx.esi; break;
231 case 7: u32EffAddr = pVCpu->cpum.GstCtx.edi; break;
232 IEM_NOT_REACHED_DEFAULT_CASE_RET2(RTGCPTR_MAX);
233 }
234 u32EffAddr <<= (bSib >> X86_SIB_SCALE_SHIFT) & X86_SIB_SCALE_SMASK;
235
236 /* add base */
237 switch (bSib & X86_SIB_BASE_MASK)
238 {
239 case 0: u32EffAddr += pVCpu->cpum.GstCtx.eax; break;
240 case 1: u32EffAddr += pVCpu->cpum.GstCtx.ecx; break;
241 case 2: u32EffAddr += pVCpu->cpum.GstCtx.edx; break;
242 case 3: u32EffAddr += pVCpu->cpum.GstCtx.ebx; break;
243 case 4: u32EffAddr += pVCpu->cpum.GstCtx.esp + (cbImmAndRspOffset >> 8); SET_SS_DEF(); break;
244 case 5:
245 if ((bRm & X86_MODRM_MOD_MASK) != 0)
246 {
247 u32EffAddr += pVCpu->cpum.GstCtx.ebp;
248 SET_SS_DEF();
249 }
250 else
251 {
252 uint32_t u32Disp;
253 IEM_OPCODE_GET_NEXT_U32(&u32Disp);
254 u32EffAddr += u32Disp;
255 uInfo |= u32Disp;
256 }
257 break;
258 case 6: u32EffAddr += pVCpu->cpum.GstCtx.esi; break;
259 case 7: u32EffAddr += pVCpu->cpum.GstCtx.edi; break;
260 IEM_NOT_REACHED_DEFAULT_CASE_RET2(RTGCPTR_MAX);
261 }
262 break;
263 }
264 case 5: u32EffAddr = pVCpu->cpum.GstCtx.ebp; SET_SS_DEF(); break;
265 case 6: u32EffAddr = pVCpu->cpum.GstCtx.esi; break;
266 case 7: u32EffAddr = pVCpu->cpum.GstCtx.edi; break;
267 IEM_NOT_REACHED_DEFAULT_CASE_RET2(RTGCPTR_MAX);
268 }
269
270 /* Get and add the displacement. */
271 switch ((bRm >> X86_MODRM_MOD_SHIFT) & X86_MODRM_MOD_SMASK)
272 {
273 case 0:
274 break;
275 case 1:
276 {
277 int8_t i8Disp; IEM_OPCODE_GET_NEXT_S8(&i8Disp);
278 u32EffAddr += i8Disp;
279 uInfo |= (uint32_t)(int32_t)i8Disp;
280 break;
281 }
282 case 2:
283 {
284 uint32_t u32Disp; IEM_OPCODE_GET_NEXT_U32(&u32Disp);
285 u32EffAddr += u32Disp;
286 uInfo |= u32Disp;
287 break;
288 }
289 default:
290 AssertFailedStmt(IEM_DO_LONGJMP(pVCpu, VERR_IEM_IPE_2)); /* (caller checked for these) */
291 }
292 }
293
294 *puInfo = uInfo;
295 Log5(("iemOpHlpCalcRmEffAddrJmp: EffAddr=%#010RX32 uInfo=%#RX64\n", u32EffAddr, uInfo));
296 return u32EffAddr;
297 }
298
299 uint64_t u64EffAddr;
300 uint64_t uInfo;
301
302 /* Handle the rip+disp32 form with no registers first. */
303 if ((bRm & (X86_MODRM_MOD_MASK | X86_MODRM_RM_MASK)) == 5)
304 {
305 IEM_OPCODE_GET_NEXT_S32_SX_U64(&u64EffAddr);
306 uInfo = (uint32_t)u64EffAddr;
307 u64EffAddr += pVCpu->cpum.GstCtx.rip + IEM_GET_INSTR_LEN(pVCpu) + (cbImmAndRspOffset & UINT32_C(0xff));
308 }
309 else
310 {
311 /* Get the register (or SIB) value. */
312 uInfo = 0;
313 switch ((bRm & X86_MODRM_RM_MASK) | pVCpu->iem.s.uRexB)
314 {
315 case 0: u64EffAddr = pVCpu->cpum.GstCtx.rax; break;
316 case 1: u64EffAddr = pVCpu->cpum.GstCtx.rcx; break;
317 case 2: u64EffAddr = pVCpu->cpum.GstCtx.rdx; break;
318 case 3: u64EffAddr = pVCpu->cpum.GstCtx.rbx; break;
319 case 5: u64EffAddr = pVCpu->cpum.GstCtx.rbp; SET_SS_DEF(); break;
320 case 6: u64EffAddr = pVCpu->cpum.GstCtx.rsi; break;
321 case 7: u64EffAddr = pVCpu->cpum.GstCtx.rdi; break;
322 case 8: u64EffAddr = pVCpu->cpum.GstCtx.r8; break;
323 case 9: u64EffAddr = pVCpu->cpum.GstCtx.r9; break;
324 case 10: u64EffAddr = pVCpu->cpum.GstCtx.r10; break;
325 case 11: u64EffAddr = pVCpu->cpum.GstCtx.r11; break;
326 case 13: u64EffAddr = pVCpu->cpum.GstCtx.r13; break;
327 case 14: u64EffAddr = pVCpu->cpum.GstCtx.r14; break;
328 case 15: u64EffAddr = pVCpu->cpum.GstCtx.r15; break;
329 /* SIB */
330 case 4:
331 case 12:
332 {
333 uint8_t bSib; IEM_OPCODE_GET_NEXT_U8(&bSib);
334 uInfo = (uint64_t)bSib << 32;
335
336 /* Get the index and scale it. */
337 switch (((bSib >> X86_SIB_INDEX_SHIFT) & X86_SIB_INDEX_SMASK) | pVCpu->iem.s.uRexIndex)
338 {
339 case 0: u64EffAddr = pVCpu->cpum.GstCtx.rax; break;
340 case 1: u64EffAddr = pVCpu->cpum.GstCtx.rcx; break;
341 case 2: u64EffAddr = pVCpu->cpum.GstCtx.rdx; break;
342 case 3: u64EffAddr = pVCpu->cpum.GstCtx.rbx; break;
343 case 4: u64EffAddr = 0; /*none */ break;
344 case 5: u64EffAddr = pVCpu->cpum.GstCtx.rbp; break;
345 case 6: u64EffAddr = pVCpu->cpum.GstCtx.rsi; break;
346 case 7: u64EffAddr = pVCpu->cpum.GstCtx.rdi; break;
347 case 8: u64EffAddr = pVCpu->cpum.GstCtx.r8; break;
348 case 9: u64EffAddr = pVCpu->cpum.GstCtx.r9; break;
349 case 10: u64EffAddr = pVCpu->cpum.GstCtx.r10; break;
350 case 11: u64EffAddr = pVCpu->cpum.GstCtx.r11; break;
351 case 12: u64EffAddr = pVCpu->cpum.GstCtx.r12; break;
352 case 13: u64EffAddr = pVCpu->cpum.GstCtx.r13; break;
353 case 14: u64EffAddr = pVCpu->cpum.GstCtx.r14; break;
354 case 15: u64EffAddr = pVCpu->cpum.GstCtx.r15; break;
355 IEM_NOT_REACHED_DEFAULT_CASE_RET2(RTGCPTR_MAX);
356 }
357 u64EffAddr <<= (bSib >> X86_SIB_SCALE_SHIFT) & X86_SIB_SCALE_SMASK;
358
359 /* add base */
360 switch ((bSib & X86_SIB_BASE_MASK) | pVCpu->iem.s.uRexB)
361 {
362 case 0: u64EffAddr += pVCpu->cpum.GstCtx.rax; break;
363 case 1: u64EffAddr += pVCpu->cpum.GstCtx.rcx; break;
364 case 2: u64EffAddr += pVCpu->cpum.GstCtx.rdx; break;
365 case 3: u64EffAddr += pVCpu->cpum.GstCtx.rbx; break;
366 case 4: u64EffAddr += pVCpu->cpum.GstCtx.rsp + (cbImmAndRspOffset >> 8); SET_SS_DEF(); break;
367 case 6: u64EffAddr += pVCpu->cpum.GstCtx.rsi; break;
368 case 7: u64EffAddr += pVCpu->cpum.GstCtx.rdi; break;
369 case 8: u64EffAddr += pVCpu->cpum.GstCtx.r8; break;
370 case 9: u64EffAddr += pVCpu->cpum.GstCtx.r9; break;
371 case 10: u64EffAddr += pVCpu->cpum.GstCtx.r10; break;
372 case 11: u64EffAddr += pVCpu->cpum.GstCtx.r11; break;
373 case 12: u64EffAddr += pVCpu->cpum.GstCtx.r12; break;
374 case 14: u64EffAddr += pVCpu->cpum.GstCtx.r14; break;
375 case 15: u64EffAddr += pVCpu->cpum.GstCtx.r15; break;
376 /* complicated encodings */
377 case 5:
378 case 13:
379 if ((bRm & X86_MODRM_MOD_MASK) != 0)
380 {
381 if (!pVCpu->iem.s.uRexB)
382 {
383 u64EffAddr += pVCpu->cpum.GstCtx.rbp;
384 SET_SS_DEF();
385 }
386 else
387 u64EffAddr += pVCpu->cpum.GstCtx.r13;
388 }
389 else
390 {
391 uint32_t u32Disp;
392 IEM_OPCODE_GET_NEXT_U32(&u32Disp);
393 u64EffAddr += (int32_t)u32Disp;
394 uInfo |= u32Disp;
395 }
396 break;
397 IEM_NOT_REACHED_DEFAULT_CASE_RET2(RTGCPTR_MAX);
398 }
399 break;
400 }
401 IEM_NOT_REACHED_DEFAULT_CASE_RET2(RTGCPTR_MAX);
402 }
403
404 /* Get and add the displacement. */
405 switch ((bRm >> X86_MODRM_MOD_SHIFT) & X86_MODRM_MOD_SMASK)
406 {
407 case 0:
408 break;
409 case 1:
410 {
411 int8_t i8Disp;
412 IEM_OPCODE_GET_NEXT_S8(&i8Disp);
413 u64EffAddr += i8Disp;
414 uInfo |= (uint32_t)(int32_t)i8Disp;
415 break;
416 }
417 case 2:
418 {
419 uint32_t u32Disp;
420 IEM_OPCODE_GET_NEXT_U32(&u32Disp);
421 u64EffAddr += (int32_t)u32Disp;
422 uInfo |= u32Disp;
423 break;
424 }
425 IEM_NOT_REACHED_DEFAULT_CASE_RET2(RTGCPTR_MAX); /* (caller checked for these) */
426 }
427
428 }
429
430 *puInfo = uInfo;
431 if (pVCpu->iem.s.enmEffAddrMode == IEMMODE_64BIT)
432 {
433 Log5(("iemOpHlpCalcRmEffAddrJmp: EffAddr=%#010RGv uInfo=%#RX64\n", u64EffAddr, uInfo));
434 return u64EffAddr;
435 }
436 Assert(pVCpu->iem.s.enmEffAddrMode == IEMMODE_32BIT);
437 Log5(("iemOpHlpCalcRmEffAddrJmp: EffAddr=%#010RGv uInfo=%#RX64\n", u64EffAddr & UINT32_MAX, uInfo));
438 return u64EffAddr & UINT32_MAX;
439}
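
/*
 * Illustrative sketch (not from the actual sources): stripped of the IEM
 * opcode-fetch macros, logging and REX handling, the 32-bit ModR/M decoding
 * above reduces to "base + scaled index + displacement".  A minimal standalone
 * version, using a hypothetical aGprs[8] register array and a raw pbCode byte
 * pointer positioned right after the ModR/M byte:
 */
#if 0 /* illustration only */
static uint32_t CalcEffAddr32Sketch(uint8_t bRm, uint8_t const *pbCode, uint32_t const aGprs[8])
{
    uint8_t const iMod = (bRm >> 6) & 3;
    uint8_t const iRm  = bRm & 7;
    unsigned      off  = 0;
    uint32_t      uAddr;

    if (iRm != 4)
        uAddr = iMod == 0 && iRm == 5 ? 0 : aGprs[iRm];            /* mod=00 rm=101: disp32 only */
    else
    {
        uint8_t const bSib  = pbCode[off++];                       /* rm=100: SIB byte follows */
        uint8_t const iIdx  = (bSib >> 3) & 7;
        uint8_t const iBase = bSib & 7;
        uAddr  = iIdx == 4 ? 0 : aGprs[iIdx] << (bSib >> 6);       /* index=100: no index */
        uAddr += iBase == 5 && iMod == 0 ? 0 : aGprs[iBase];       /* base=101 + mod=00: disp32 only */
    }

    /* Displacement: none, sign-extended disp8, or disp32 (incl. the two disp32-only cases). */
    if (iMod == 1)
        uAddr += (uint32_t)(int32_t)(int8_t)pbCode[off];
    else if (   iMod == 2
             || (iMod == 0 && (iRm == 5 || (iRm == 4 && (pbCode[0] & 7) == 5))))
        uAddr += (uint32_t)pbCode[off]
               | ((uint32_t)pbCode[off + 1] <<  8)
               | ((uint32_t)pbCode[off + 2] << 16)
               | ((uint32_t)pbCode[off + 3] << 24);
    return uAddr;
}
#endif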
440
441
442
443/*********************************************************************************************************************************
444* Translation Block Cache. *
445*********************************************************************************************************************************/
446
447/** @callback_method_impl{FNRTSORTCMP, Compare two TBs for pruning sorting purposes.} */
448static DECLCALLBACK(int) iemTbCachePruneCmpTb(void const *pvElement1, void const *pvElement2, void *pvUser)
449{
450 PCIEMTB const pTb1 = (PCIEMTB)pvElement1;
451 PCIEMTB const pTb2 = (PCIEMTB)pvElement2;
452 uint32_t const cMsSinceUse1 = (uint32_t)(uintptr_t)pvUser - pTb1->msLastUsed;
453 uint32_t const cMsSinceUse2 = (uint32_t)(uintptr_t)pvUser - pTb2->msLastUsed;
454 if (cMsSinceUse1 != cMsSinceUse2)
455 return cMsSinceUse1 < cMsSinceUse2 ? -1 : 1;
456 if (pTb1->cUsed != pTb2->cUsed)
457 return pTb1->cUsed > pTb2->cUsed ? -1 : 1;
458 if ((pTb1->fFlags & IEMTB_F_TYPE_MASK) != (pTb2->fFlags & IEMTB_F_TYPE_MASK))
459 return (pTb1->fFlags & IEMTB_F_TYPE_MASK) == IEMTB_F_TYPE_NATIVE ? -1 : 1;
460 return 0;
461}
462
463#ifdef VBOX_STRICT
464/**
465 * Assertion helper that checks a collisions list count.
466 */
467static void iemTbCacheAssertCorrectCount(PIEMTBCACHE pTbCache, uint32_t idxHash, const char *pszOperation)
468{
469 PIEMTB pTb = IEMTBCACHE_PTR_GET_TB(pTbCache->apHash[idxHash]);
470 int cLeft = IEMTBCACHE_PTR_GET_COUNT(pTbCache->apHash[idxHash]);
471 while (pTb)
472 {
473 pTb = pTb->pNext;
474 cLeft--;
475 }
476 AssertMsg(cLeft == 0,
477 ("idxHash=%#x cLeft=%d; entry count=%d; %s\n",
478 idxHash, cLeft, IEMTBCACHE_PTR_GET_COUNT(pTbCache->apHash[idxHash]), pszOperation));
479}
480#endif
481
482
483DECL_NO_INLINE(static, void) iemTbCacheAddWithPruning(PVMCPUCC pVCpu, PIEMTBCACHE pTbCache, PIEMTB pTb, uint32_t idxHash)
484{
485 STAM_PROFILE_START(&pTbCache->StatPrune, a);
486
487 /*
488 * First convert the collision list to an array.
489 */
490 PIEMTB apSortedTbs[IEMTBCACHE_PTR_MAX_COUNT];
491 uintptr_t cInserted = 0;
492 PIEMTB pTbCollision = IEMTBCACHE_PTR_GET_TB(pTbCache->apHash[idxHash]);
493
494 pTbCache->apHash[idxHash] = NULL; /* Must NULL the entry before trying to free anything. */
495
496 while (pTbCollision && cInserted < RT_ELEMENTS(apSortedTbs))
497 {
498 apSortedTbs[cInserted++] = pTbCollision;
499 pTbCollision = pTbCollision->pNext;
500 }
501
502 /* Free any excess (impossible). */
503 if (RT_LIKELY(!pTbCollision))
504 Assert(cInserted == RT_ELEMENTS(apSortedTbs));
505 else
506 do
507 {
508 PIEMTB pTbToFree = pTbCollision;
509 pTbCollision = pTbToFree->pNext;
510 iemTbAllocatorFree(pVCpu, pTbToFree);
511 } while (pTbCollision);
512
513 /*
514 * Sort it by most recently used and usage count.
515 */
516 RTSortApvShell((void **)apSortedTbs, cInserted, iemTbCachePruneCmpTb, (void *)(uintptr_t)pVCpu->iem.s.msRecompilerPollNow);
517
518 /* We keep half the list for now. Perhaps a bit aggressive... */
519 uintptr_t const cKeep = cInserted / 2;
520
521 /* First free up the TBs we don't wish to keep (before creating the new
522 list because otherwise the free code will scan the list for each one
523 without ever finding it). */
524 for (uintptr_t idx = cKeep; idx < cInserted; idx++)
525 iemTbAllocatorFree(pVCpu, apSortedTbs[idx]);
526
527 /* Then chain the new TB together with the existing ones we want to keep
528 and insert this list into the hash table. */
529 pTbCollision = pTb;
530 for (uintptr_t idx = 0; idx < cKeep; idx++)
531 pTbCollision = pTbCollision->pNext = apSortedTbs[idx];
532 pTbCollision->pNext = NULL;
533
534 pTbCache->apHash[idxHash] = IEMTBCACHE_PTR_MAKE(pTb, cKeep + 1);
535#ifdef VBOX_STRICT
536 iemTbCacheAssertCorrectCount(pTbCache, idxHash, "add w/ pruning");
537#endif
538
539 STAM_PROFILE_STOP(&pTbCache->StatPrune, a);
540}
541
542
543static void iemTbCacheAdd(PVMCPUCC pVCpu, PIEMTBCACHE pTbCache, PIEMTB pTb)
544{
545 uint32_t const idxHash = IEMTBCACHE_HASH(pTbCache, pTb->fFlags, pTb->GCPhysPc);
546 PIEMTB const pTbOldHead = pTbCache->apHash[idxHash];
547 if (!pTbOldHead)
548 {
549 pTb->pNext = NULL;
550 pTbCache->apHash[idxHash] = IEMTBCACHE_PTR_MAKE(pTb, 1); /** @todo could make 1 implicit... */
551 }
552 else
553 {
554 STAM_REL_COUNTER_INC(&pTbCache->cCollisions);
555 uintptr_t cCollisions = IEMTBCACHE_PTR_GET_COUNT(pTbOldHead);
556 if (cCollisions < IEMTBCACHE_PTR_MAX_COUNT)
557 {
558 pTb->pNext = IEMTBCACHE_PTR_GET_TB(pTbOldHead);
559 pTbCache->apHash[idxHash] = IEMTBCACHE_PTR_MAKE(pTb, cCollisions + 1);
560#ifdef VBOX_STRICT
561 iemTbCacheAssertCorrectCount(pTbCache, idxHash, "add");
562#endif
563 }
564 else
565 iemTbCacheAddWithPruning(pVCpu, pTbCache, pTb, idxHash);
566 }
567}
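
/*
 * Illustrative sketch (not from the actual sources): the IEMTBCACHE_PTR_MAKE,
 * IEMTBCACHE_PTR_GET_TB and IEMTBCACHE_PTR_GET_COUNT macros used above appear
 * to keep the collision-chain length in low bits of the chain head pointer that
 * are free due to alignment (see the sizeof(IEMTB) vs IEMTBCACHE_PTR_COUNT_MASK
 * assertion in iemTbInit).  The generic tagged-pointer idea, with made-up names:
 */
#if 0 /* illustration only */
# define MY_TAG_BITS    3                               /* assumes at least 8-byte pointer alignment */
# define MY_TAG_MASK    ((uintptr_t)((1u << MY_TAG_BITS) - 1))

DECLINLINE(void *) myTaggedPtrMake(void *pv, unsigned cCount)
{
    Assert(!((uintptr_t)pv & MY_TAG_MASK));             /* pointer must be suitably aligned */
    Assert(cCount <= MY_TAG_MASK);
    return (void *)((uintptr_t)pv | cCount);
}

DECLINLINE(void *) myTaggedPtrGetPtr(void *pvTagged)
{
    return (void *)((uintptr_t)pvTagged & ~MY_TAG_MASK);
}

DECLINLINE(unsigned) myTaggedPtrGetCount(void *pvTagged)
{
    return (unsigned)((uintptr_t)pvTagged & MY_TAG_MASK);
}
#endif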
568
569
570/**
571 * Unlinks @a pTb from the hash table if found in it.
572 *
573 * @returns true if unlinked, false if not present.
574 * @param pTbCache The hash table.
575 * @param pTb The TB to remove.
576 */
577static bool iemTbCacheRemove(PIEMTBCACHE pTbCache, PIEMTB pTb)
578{
579 uint32_t const idxHash = IEMTBCACHE_HASH(pTbCache, pTb->fFlags, pTb->GCPhysPc);
580 PIEMTB pTbHash = IEMTBCACHE_PTR_GET_TB(pTbCache->apHash[idxHash]);
581 uint32_t volatile cLength = IEMTBCACHE_PTR_GET_COUNT(pTbCache->apHash[idxHash]); RT_NOREF(cLength);
582
583 /*
584 * At the head of the collision list?
585 */
586 if (pTbHash == pTb)
587 {
588 if (!pTb->pNext)
589 pTbCache->apHash[idxHash] = NULL;
590 else
591 {
592 pTbCache->apHash[idxHash] = IEMTBCACHE_PTR_MAKE(pTb->pNext,
593 IEMTBCACHE_PTR_GET_COUNT(pTbCache->apHash[idxHash]) - 1);
594#ifdef VBOX_STRICT
595 iemTbCacheAssertCorrectCount(pTbCache, idxHash, "remove #1");
596#endif
597 }
598 return true;
599 }
600
601 /*
602 * Search the collision list.
603 */
604 PIEMTB const pTbHead = pTbHash;
605 while (pTbHash)
606 {
607 PIEMTB const pNextTb = pTbHash->pNext;
608 if (pNextTb == pTb)
609 {
610 pTbHash->pNext = pTb->pNext;
611 pTbCache->apHash[idxHash] = IEMTBCACHE_PTR_MAKE(pTbHead, IEMTBCACHE_PTR_GET_COUNT(pTbCache->apHash[idxHash]) - 1);
612#ifdef VBOX_STRICT
613 iemTbCacheAssertCorrectCount(pTbCache, idxHash, "remove #2");
614#endif
615 return true;
616 }
617 pTbHash = pNextTb;
618 }
619 return false;
620}
621
622
623/**
624 * Looks up a TB for the given PC and flags in the cache.
625 *
626 * @returns Pointer to TB on success, NULL if not found.
627 * @param pVCpu The cross context virtual CPU structure of the
628 * calling thread.
629 * @param pTbCache The translation block cache.
630 * @param GCPhysPc The PC to look up a TB for.
631 * @param fExtraFlags The extra flags to join with IEMCPU::fExec for
632 * the lookup.
633 * @thread EMT(pVCpu)
634 */
635static PIEMTB iemTbCacheLookup(PVMCPUCC pVCpu, PIEMTBCACHE pTbCache,
636 RTGCPHYS GCPhysPc, uint32_t fExtraFlags) IEM_NOEXCEPT_MAY_LONGJMP /** @todo r=bird: no longjumping here, right? iemNativeRecompile is noexcept. */
637{
638 uint32_t const fFlags = ((pVCpu->iem.s.fExec & IEMTB_F_IEM_F_MASK) | fExtraFlags) & IEMTB_F_KEY_MASK;
639
640 /*
641 * First consult the lookup table entry.
642 */
643 PIEMTB * const ppTbLookup = pVCpu->iem.s.ppTbLookupEntryR3;
644 PIEMTB pTb = *ppTbLookup;
645 if (pTb)
646 {
647 if (pTb->GCPhysPc == GCPhysPc)
648 {
649 if ( (pTb->fFlags & (IEMTB_F_KEY_MASK | IEMTB_F_TYPE_MASK)) == (fFlags | IEMTB_F_TYPE_NATIVE)
650 || (pTb->fFlags & (IEMTB_F_KEY_MASK | IEMTB_F_TYPE_MASK)) == (fFlags | IEMTB_F_TYPE_THREADED) )
651 {
652 if (pTb->x86.fAttr == (uint16_t)pVCpu->cpum.GstCtx.cs.Attr.u)
653 {
654 STAM_COUNTER_INC(&pTbCache->cLookupHitsViaTbLookupTable);
655 pTb->msLastUsed = pVCpu->iem.s.msRecompilerPollNow;
656 pTb->cUsed++;
657#ifdef VBOX_WITH_IEM_NATIVE_RECOMPILER
658 if ((pTb->fFlags & IEMTB_F_TYPE_NATIVE) || pTb->cUsed != pVCpu->iem.s.uTbNativeRecompileAtUsedCount)
659 {
660 Log10(("TB lookup: fFlags=%#x GCPhysPc=%RGp: %p (@ %p)\n", fFlags, GCPhysPc, pTb, ppTbLookup));
661 return pTb;
662 }
663 Log10(("TB lookup: fFlags=%#x GCPhysPc=%RGp: %p (@ %p) - recompiling\n", fFlags, GCPhysPc, pTb, ppTbLookup));
664# ifdef VBOX_WITH_SAVE_THREADED_TBS_FOR_PROFILING
665 iemThreadedSaveTbForProfiling(pVCpu, pTb);
666# endif
667 return iemNativeRecompile(pVCpu, pTb);
668#else
669 Log10(("TB lookup: fFlags=%#x GCPhysPc=%RGp: %p (@ %p)\n", fFlags, GCPhysPc, pTb, ppTbLookup));
670 return pTb;
671#endif
672 }
673 }
674 }
675 }
676
677 /*
678 * Then consult the hash table.
679 */
680 uint32_t const idxHash = IEMTBCACHE_HASH_NO_KEY_MASK(pTbCache, fFlags, GCPhysPc);
681#if defined(VBOX_STRICT) || defined(LOG_ENABLED)
682 int cLeft = IEMTBCACHE_PTR_GET_COUNT(pTbCache->apHash[idxHash]);
683#endif
684 pTb = IEMTBCACHE_PTR_GET_TB(pTbCache->apHash[idxHash]);
685 while (pTb)
686 {
687 if (pTb->GCPhysPc == GCPhysPc)
688 {
689 if ((pTb->fFlags & IEMTB_F_KEY_MASK) == fFlags)
690 {
691 if (pTb->x86.fAttr == (uint16_t)pVCpu->cpum.GstCtx.cs.Attr.u)
692 {
693 STAM_COUNTER_INC(&pTbCache->cLookupHits);
694 AssertMsg(cLeft > 0, ("%d\n", cLeft));
695
696 *ppTbLookup = pTb;
697 pTb->msLastUsed = pVCpu->iem.s.msRecompilerPollNow;
698 pTb->cUsed++;
699#ifdef VBOX_WITH_IEM_NATIVE_RECOMPILER
700 if ((pTb->fFlags & IEMTB_F_TYPE_NATIVE) || pTb->cUsed != pVCpu->iem.s.uTbNativeRecompileAtUsedCount)
701 {
702 Log10(("TB lookup: fFlags=%#x GCPhysPc=%RGp idxHash=%#x: %p (@ %d / %d)\n",
703 fFlags, GCPhysPc, idxHash, pTb, IEMTBCACHE_PTR_GET_COUNT(pTbCache->apHash[idxHash]) - cLeft,
704 IEMTBCACHE_PTR_GET_COUNT(pTbCache->apHash[idxHash]) ));
705 return pTb;
706 }
707 Log10(("TB lookup: fFlags=%#x GCPhysPc=%RGp idxHash=%#x: %p (@ %d / %d) - recompiling\n",
708 fFlags, GCPhysPc, idxHash, pTb, IEMTBCACHE_PTR_GET_COUNT(pTbCache->apHash[idxHash]) - cLeft,
709 IEMTBCACHE_PTR_GET_COUNT(pTbCache->apHash[idxHash]) ));
710 return iemNativeRecompile(pVCpu, pTb);
711#else
712 Log10(("TB lookup: fFlags=%#x GCPhysPc=%RGp idxHash=%#x: %p (@ %d / %d)\n",
713 fFlags, GCPhysPc, idxHash, pTb, IEMTBCACHE_PTR_GET_COUNT(pTbCache->apHash[idxHash]) - cLeft,
714 IEMTBCACHE_PTR_GET_COUNT(pTbCache->apHash[idxHash]) ));
715 return pTb;
716#endif
717 }
718 Log11(("TB miss: CS: %#x, wanted %#x\n", pTb->x86.fAttr, (uint16_t)pVCpu->cpum.GstCtx.cs.Attr.u));
719 }
720 else
721 Log11(("TB miss: fFlags: %#x, wanted %#x\n", pTb->fFlags, fFlags));
722 }
723 else
724 Log11(("TB miss: GCPhysPc: %#x, wanted %#x\n", pTb->GCPhysPc, GCPhysPc));
725
726 pTb = pTb->pNext;
727#ifdef VBOX_STRICT
728 cLeft--;
729#endif
730 }
731 AssertMsg(cLeft == 0, ("%d\n", cLeft));
732 STAM_REL_COUNTER_INC(&pTbCache->cLookupMisses);
733 Log10(("TB lookup: fFlags=%#x GCPhysPc=%RGp idxHash=%#x: NULL - (%p L %d)\n", fFlags, GCPhysPc, idxHash,
734 IEMTBCACHE_PTR_GET_TB(pTbCache->apHash[idxHash]), IEMTBCACHE_PTR_GET_COUNT(pTbCache->apHash[idxHash]) ));
735 return pTb;
736}
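
/*
 * Illustrative note (not from the actual sources): the lookup above is two
 * level - the per-call TB lookup table slot is tried first, then the hash
 * table - and a threaded TB is handed to the native recompiler once its use
 * counter reaches IEMCPU::uTbNativeRecompileAtUsedCount.  The promotion logic
 * boils down to:
 */
#if 0 /* illustration only */
    pTb->cUsed++;
    if (   !(pTb->fFlags & IEMTB_F_TYPE_NATIVE)
        && pTb->cUsed == pVCpu->iem.s.uTbNativeRecompileAtUsedCount)
        pTb = iemNativeRecompile(pVCpu, pTb);   /* hot enough: recompile threaded TB to native */
#endif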
737
738
739/*********************************************************************************************************************************
740* Translation Block Allocator.
741*********************************************************************************************************************************/
742/*
743 * Translation block allocation management.
744 */
745
746#ifdef IEMTB_SIZE_IS_POWER_OF_TWO
747# define IEMTBALLOC_IDX_TO_CHUNK(a_pTbAllocator, a_idxTb) \
748 ((a_idxTb) >> (a_pTbAllocator)->cChunkShift)
749# define IEMTBALLOC_IDX_TO_INDEX_IN_CHUNK(a_pTbAllocator, a_idxTb, a_idxChunk) \
750 ((a_idxTb) & (a_pTbAllocator)->fChunkMask)
751# define IEMTBALLOC_IDX_FOR_CHUNK(a_pTbAllocator, a_idxChunk) \
752 ((uint32_t)(a_idxChunk) << (a_pTbAllocator)->cChunkShift)
753#else
754# define IEMTBALLOC_IDX_TO_CHUNK(a_pTbAllocator, a_idxTb) \
755 ((a_idxTb) / (a_pTbAllocator)->cTbsPerChunk)
756# define IEMTBALLOC_IDX_TO_INDEX_IN_CHUNK(a_pTbAllocator, a_idxTb, a_idxChunk) \
757 ((a_idxTb) - (a_idxChunk) * (a_pTbAllocator)->cTbsPerChunk)
758# define IEMTBALLOC_IDX_FOR_CHUNK(a_pTbAllocator, a_idxChunk) \
759 ((uint32_t)(a_idxChunk) * (a_pTbAllocator)->cTbsPerChunk)
760#endif
761/** Makes a TB index from a chunk index and TB index within that chunk. */
762#define IEMTBALLOC_IDX_MAKE(a_pTbAllocator, a_idxChunk, a_idxInChunk) \
763 (IEMTBALLOC_IDX_FOR_CHUNK(a_pTbAllocator, a_idxChunk) + (a_idxInChunk))
764
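
/*
 * Illustrative note (not from the actual sources): these macros just split a
 * flat TB index into (chunk, index-within-chunk) and back.  For example, with a
 * hypothetical 16384 TBs per chunk (power-of-two case, cChunkShift = 14):
 *
 *      idxTb = 40000  ->  idxChunk   = 40000 >> 14    = 2
 *                         idxInChunk = 40000 & 0x3fff = 7232
 *      and back:          2 * 16384 + 7232            = 40000
 *
 * The non-power-of-two variants do the same with division and multiplication.
 */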
765
766/**
767 * Initializes the TB allocator and cache for an EMT.
768 *
769 * @returns VBox status code.
770 * @param pVM The VM handle.
771 * @param cInitialTbs The initial number of translation blocks to
772 * preallocate.
773 * @param cMaxTbs The max number of translation blocks allowed.
774 * @param cbInitialExec The initial size of the executable memory allocator.
775 * @param cbMaxExec The max size of the executable memory allocator.
776 * @param cbChunkExec The chunk size for executable memory allocator. Zero
777 * or UINT32_MAX for automatically determining this.
778 * @thread EMT
779 */
780DECLCALLBACK(int) iemTbInit(PVMCC pVM, uint32_t cInitialTbs, uint32_t cMaxTbs,
781 uint64_t cbInitialExec, uint64_t cbMaxExec, uint32_t cbChunkExec)
782{
783 PVMCPUCC pVCpu = VMMGetCpu(pVM);
784 Assert(!pVCpu->iem.s.pTbCacheR3);
785 Assert(!pVCpu->iem.s.pTbAllocatorR3);
786
787 /*
788 * Calculate the chunk size of the TB allocator.
789 * The minimum chunk size is 2MiB.
790 */
791 AssertCompile(!(sizeof(IEMTB) & IEMTBCACHE_PTR_COUNT_MASK));
792 uint32_t cbPerChunk = _2M;
793 uint32_t cTbsPerChunk = _2M / sizeof(IEMTB);
794#ifdef IEMTB_SIZE_IS_POWER_OF_TWO
795 uint8_t const cTbShift = ASMBitFirstSetU32((uint32_t)sizeof(IEMTB)) - 1;
796 uint8_t cChunkShift = 21 - cTbShift;
797 AssertCompile(RT_BIT_32(21) == _2M); Assert(RT_BIT_32(cChunkShift) == cTbsPerChunk);
798#endif
799 for (;;)
800 {
801 if (cMaxTbs <= cTbsPerChunk * (uint64_t)RT_ELEMENTS(pVCpu->iem.s.pTbAllocatorR3->aChunks))
802 break;
803 cbPerChunk *= 2;
804 cTbsPerChunk = cbPerChunk / sizeof(IEMTB);
805#ifdef IEMTB_SIZE_IS_POWER_OF_TWO
806 cChunkShift += 1;
807#endif
808 }
809
810 uint32_t cMaxChunks = (cMaxTbs + cTbsPerChunk - 1) / cTbsPerChunk;
811 Assert(cMaxChunks * cTbsPerChunk >= cMaxTbs);
812 Assert(cMaxChunks <= RT_ELEMENTS(pVCpu->iem.s.pTbAllocatorR3->aChunks));
813
814 cMaxTbs = cMaxChunks * cTbsPerChunk;
815
816 /*
817 * Allocate and initialize it.
818 */
819 PIEMTBALLOCATOR const pTbAllocator = (PIEMTBALLOCATOR)RTMemAllocZ(sizeof(*pTbAllocator));
820 if (!pTbAllocator)
821 return VMSetError(pVM, VERR_NO_MEMORY, RT_SRC_POS,
822 "Failed to allocate %zu bytes (max %u TBs) for the TB allocator of VCpu #%u",
823 sizeof(*pTbAllocator), cMaxTbs, pVCpu->idCpu);
824 pTbAllocator->uMagic = IEMTBALLOCATOR_MAGIC;
825 pTbAllocator->cMaxChunks = (uint8_t)cMaxChunks;
826 pTbAllocator->cTbsPerChunk = cTbsPerChunk;
827 pTbAllocator->cbPerChunk = cbPerChunk;
828 pTbAllocator->cMaxTbs = cMaxTbs;
829 pTbAllocator->pTbsFreeHead = NULL;
830#ifdef IEMTB_SIZE_IS_POWER_OF_TWO
831 pTbAllocator->fChunkMask = cTbsPerChunk - 1;
832 pTbAllocator->cChunkShift = cChunkShift;
833 Assert(RT_BIT_32(cChunkShift) == cTbsPerChunk);
834#endif
835
836 pVCpu->iem.s.pTbAllocatorR3 = pTbAllocator;
837
838 /*
839 * Allocate the initial chunks.
840 */
841 for (uint32_t idxChunk = 0; ; idxChunk++)
842 {
843 PIEMTB const paTbs = pTbAllocator->aChunks[idxChunk].paTbs = (PIEMTB)RTMemPageAllocZ(cbPerChunk);
844 if (!paTbs)
845 return VMSetError(pVM, VERR_NO_MEMORY, RT_SRC_POS,
846 "Failed to initial %zu bytes for the #%u chunk of TBs for VCpu #%u",
847 cbPerChunk, idxChunk, pVCpu->idCpu);
848
849 for (uint32_t iTb = 0; iTb < cTbsPerChunk; iTb++)
850 {
851 paTbs[iTb].idxAllocChunk = idxChunk; /* This is not strictly necessary... */
852 paTbs[iTb].pNext = pTbAllocator->pTbsFreeHead;
853 pTbAllocator->pTbsFreeHead = &paTbs[iTb];
854 }
855 pTbAllocator->cAllocatedChunks = (uint16_t)(idxChunk + 1);
856 pTbAllocator->cTotalTbs += cTbsPerChunk;
857
858 if ((idxChunk + 1) * cTbsPerChunk >= cInitialTbs)
859 break;
860 }
861
862 /*
863 * Calculate the size of the hash table. We double the max TB count and
864 * round it up to the nearest power of two.
865 */
866 uint32_t cCacheEntries = cMaxTbs * 2;
867 if (!RT_IS_POWER_OF_TWO(cCacheEntries))
868 {
869 uint8_t const iBitTop = ASMBitFirstSetU32(cCacheEntries);
870 cCacheEntries = RT_BIT_32(iBitTop);
871 Assert(cCacheEntries >= cMaxTbs * 2);
872 }
873
874 size_t const cbTbCache = RT_UOFFSETOF_DYN(IEMTBCACHE, apHash[cCacheEntries]);
875 PIEMTBCACHE const pTbCache = (PIEMTBCACHE)RTMemAllocZ(cbTbCache);
876 if (!pTbCache)
877 return VMSetError(pVM, VERR_NO_MEMORY, RT_SRC_POS,
878 "Failed to allocate %zu bytes (%u entries) for the TB cache of VCpu #%u",
879 cbTbCache, cCacheEntries, pVCpu->idCpu);
880
881 /*
882 * Initialize it (assumes zeroed by the allocator).
883 */
884 pTbCache->uMagic = IEMTBCACHE_MAGIC;
885 pTbCache->cHash = cCacheEntries;
886 pTbCache->uHashMask = cCacheEntries - 1;
887 Assert(pTbCache->cHash > pTbCache->uHashMask);
888 pVCpu->iem.s.pTbCacheR3 = pTbCache;
889
890 /*
891 * Initialize the native executable memory allocator.
892 */
893#ifdef VBOX_WITH_IEM_NATIVE_RECOMPILER
894 int rc = iemExecMemAllocatorInit(pVCpu, cbMaxExec, cbInitialExec, cbChunkExec);
895 AssertLogRelRCReturn(rc, rc);
896#else
897 RT_NOREF(cbMaxExec, cbInitialExec, cbChunkExec);
898#endif
899
900 return VINF_SUCCESS;
901}
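
/*
 * Illustrative note (not from the actual sources): the hash table size is
 * rounded up to a power of two above so a lookup can reduce the hash value with
 * a simple mask instead of a modulo; conceptually (uHashValue being whatever
 * IEMTBCACHE_HASH computes from the flags and physical PC):
 */
#if 0 /* illustration only */
    uint32_t const idxHash = uHashValue & pTbCache->uHashMask;  /* == uHashValue % pTbCache->cHash, since cHash is a power of two */
#endif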
902
903
904/**
905 * Inner free worker.
906 */
907static void iemTbAllocatorFreeInner(PVMCPUCC pVCpu, PIEMTBALLOCATOR pTbAllocator,
908 PIEMTB pTb, uint32_t idxChunk, uint32_t idxInChunk)
909{
910 Assert(idxChunk < pTbAllocator->cAllocatedChunks); RT_NOREF(idxChunk);
911 Assert(idxInChunk < pTbAllocator->cTbsPerChunk); RT_NOREF(idxInChunk);
912 Assert((uintptr_t)(pTb - pTbAllocator->aChunks[idxChunk].paTbs) == idxInChunk);
913#ifdef VBOX_STRICT
914 for (PIEMTB pTbOther = pTbAllocator->pDelayedFreeHead; pTbOther; pTbOther = pTbOther->pNext)
915 Assert(pTbOther != pTb);
916#endif
917
918 /*
919 * Unlink the TB from the hash table.
920 */
921 iemTbCacheRemove(pVCpu->iem.s.pTbCacheR3, pTb);
922
923 /*
924 * Free the TB itself.
925 */
926 switch (pTb->fFlags & IEMTB_F_TYPE_MASK)
927 {
928 case IEMTB_F_TYPE_THREADED:
929 pTbAllocator->cThreadedTbs -= 1;
930 RTMemFree(pTb->Thrd.paCalls);
931 break;
932#ifdef VBOX_WITH_IEM_NATIVE_RECOMPILER
933 case IEMTB_F_TYPE_NATIVE:
934 pTbAllocator->cNativeTbs -= 1;
935 iemExecMemAllocatorFree(pVCpu, pTb->Native.paInstructions,
936 pTb->Native.cInstructions * sizeof(pTb->Native.paInstructions[0]));
937 pTb->Native.paInstructions = NULL; /* required by iemExecMemAllocatorPrune */
938 break;
939#endif
940 default:
941 AssertFailed();
942 }
943
944 RTMemFree(IEMTB_GET_TB_LOOKUP_TAB_ENTRY(pTb, 0)); /* Frees both the TB lookup table and opcode bytes. */
945
946 pTb->pNext = pTbAllocator->pTbsFreeHead;
947 pTbAllocator->pTbsFreeHead = pTb;
948 pTb->fFlags = 0;
949 pTb->GCPhysPc = UINT64_MAX;
950 pTb->Gen.uPtr = 0;
951 pTb->Gen.uData = 0;
952 pTb->cTbLookupEntries = 0;
953 pTb->cbOpcodes = 0;
954 pTb->pabOpcodes = NULL;
955
956 Assert(pTbAllocator->cInUseTbs > 0);
957
958 pTbAllocator->cInUseTbs -= 1;
959 STAM_REL_COUNTER_INC(&pTbAllocator->StatFrees);
960}
961
962
963/**
964 * Frees the given TB.
965 *
966 * @param pVCpu The cross context virtual CPU structure of the calling
967 * thread.
968 * @param pTb The translation block to free.
969 * @thread EMT(pVCpu)
970 */
971DECLHIDDEN(void) iemTbAllocatorFree(PVMCPUCC pVCpu, PIEMTB pTb)
972{
973 /*
974 * Validate state.
975 */
976 PIEMTBALLOCATOR const pTbAllocator = pVCpu->iem.s.pTbAllocatorR3;
977 Assert(pTbAllocator && pTbAllocator->uMagic == IEMTBALLOCATOR_MAGIC);
978 uint8_t const idxChunk = pTb->idxAllocChunk;
979 AssertLogRelReturnVoid(idxChunk < pTbAllocator->cAllocatedChunks);
980 uintptr_t const idxInChunk = pTb - pTbAllocator->aChunks[idxChunk].paTbs;
981 AssertLogRelReturnVoid(idxInChunk < pTbAllocator->cTbsPerChunk);
982
983 /*
984 * Invalidate the TB lookup pointer and call the inner worker.
985 */
986 pVCpu->iem.s.ppTbLookupEntryR3 = &pVCpu->iem.s.pTbLookupEntryDummyR3;
987 iemTbAllocatorFreeInner(pVCpu, pTbAllocator, pTb, idxChunk, (uint32_t)idxInChunk);
988}
989
990
991/**
992 * Schedules a TB for freeing when it's no longer being executed and/or part of
993 * the caller's call stack.
994 *
995 * The TB will be removed from the translation block cache, though, so it isn't
996 * possible to execute it again and the IEMTB::pNext member can be used to link
997 * it together with other TBs awaiting freeing.
998 *
999 * @param pVCpu The cross context virtual CPU structure of the calling
1000 * thread.
1001 * @param pTb The translation block to schedule for freeing.
1002 */
1003static void iemTbAlloctorScheduleForFree(PVMCPUCC pVCpu, PIEMTB pTb)
1004{
1005 /*
1006 * Validate state.
1007 */
1008 PIEMTBALLOCATOR const pTbAllocator = pVCpu->iem.s.pTbAllocatorR3;
1009 Assert(pTbAllocator && pTbAllocator->uMagic == IEMTBALLOCATOR_MAGIC);
1010 Assert(pTb->idxAllocChunk < pTbAllocator->cAllocatedChunks);
1011 Assert((uintptr_t)(pTb - pTbAllocator->aChunks[pTb->idxAllocChunk].paTbs) < pTbAllocator->cTbsPerChunk);
1012 Assert( (pTb->fFlags & IEMTB_F_TYPE_MASK) == IEMTB_F_TYPE_NATIVE
1013 || (pTb->fFlags & IEMTB_F_TYPE_MASK) == IEMTB_F_TYPE_THREADED);
1014#ifdef VBOX_STRICT
1015 for (PIEMTB pTbOther = pTbAllocator->pDelayedFreeHead; pTbOther; pTbOther = pTbOther->pNext)
1016 Assert(pTbOther != pTb);
1017#endif
1018
1019 /*
1020 * Remove it from the cache and prepend it to the allocator's todo list.
1021 *
1022 * Note! It could still be in various lookup tables, so we trash the GCPhys
1023 * and CS attribs to ensure it won't be reused.
1024 */
1025 iemTbCacheRemove(pVCpu->iem.s.pTbCacheR3, pTb);
1026 pTb->GCPhysPc = NIL_RTGCPHYS;
1027 pTb->x86.fAttr = UINT16_MAX;
1028
1029 pTb->pNext = pTbAllocator->pDelayedFreeHead;
1030 pTbAllocator->pDelayedFreeHead = pTb;
1031}
1032
1033
1034/**
1035 * Processes the delayed frees.
1036 *
1037 * This is called by the allocator function as well as the native recompile
1038 * function before making any TB or executable memory allocations respectively.
1039 */
1040void iemTbAllocatorProcessDelayedFrees(PVMCPUCC pVCpu, PIEMTBALLOCATOR pTbAllocator)
1041{
1042 /** @todo r-bird: these have already been removed from the cache,
1043 * iemTbAllocatorFree/Inner redoes that, which is a waste of time. */
1044 PIEMTB pTb = pTbAllocator->pDelayedFreeHead;
1045 pTbAllocator->pDelayedFreeHead = NULL;
1046 while (pTb)
1047 {
1048 PIEMTB const pTbNext = pTb->pNext;
1049 Assert(pVCpu->iem.s.pCurTbR3 != pTb);
1050 iemTbAllocatorFree(pVCpu, pTb);
1051 pTb = pTbNext;
1052 }
1053}
1054
1055
1056#if 0
1057/**
1058 * Frees all TBs.
1059 */
1060static int iemTbAllocatorFreeAll(PVMCPUCC pVCpu)
1061{
1062 PIEMTBALLOCATOR const pTbAllocator = pVCpu->iem.s.pTbAllocatorR3;
1063 AssertReturn(pTbAllocator, VERR_WRONG_ORDER);
1064 AssertReturn(pTbAllocator->uMagic == IEMTBALLOCATOR_MAGIC, VERR_INVALID_MAGIC);
1065
1066 iemTbAllocatorProcessDelayedFrees(pVCpu, pTbAllocator);
1067
1068 uint32_t idxChunk = pTbAllocator->cAllocatedChunks;
1069 while (idxChunk-- > 0)
1070 {
1071 PIEMTB const paTbs = pTbAllocator->aChunks[idxChunk].paTbs;
1072 uint32_t idxTb = pTbAllocator->cTbsPerChunk;
1073 while (idxTb-- > 0)
1074 {
1075 PIEMTB const pTb = &paTbs[idxTb];
1076 if (pTb->fFlags)
1077 iemTbAllocatorFreeInner(pVCpu, pTbAllocator, pTb, idxChunk, idxTb);
1078 }
1079 }
1080
1081 pVCpu->iem.s.ppTbLookupEntryR3 = &pVCpu->iem.s.pTbLookupEntryDummyR3;
1082
1083# if 1
1084 /* Reset the free list. */
1085 pTbAllocator->pTbsFreeHead = NULL;
1086 idxChunk = pTbAllocator->cAllocatedChunks;
1087 while (idxChunk-- > 0)
1088 {
1089 uint32_t const cTbsPerChunk = pTbAllocator->cTbsPerChunk;
1090 PIEMTB const paTbs = pTbAllocator->aChunks[idxChunk].paTbs;
1091 RT_BZERO(paTbs, sizeof(paTbs[0]) * cTbsPerChunk);
1092 for (uint32_t idxTb = 0; idxTb < cTbsPerChunk; idxTb++)
1093 {
1094 paTbs[idxTb].idxAllocChunk = idxChunk; /* This is not strictly necessary... */
1095 paTbs[idxTb].pNext = pTbAllocator->pTbsFreeHead;
1096 pTbAllocator->pTbsFreeHead = &paTbs[idxTb];
1097 }
1098 }
1099# endif
1100
1101# if 1
1102 /* Completely reset the TB cache. */
1103 RT_BZERO(pVCpu->iem.s.pTbCacheR3->apHash, sizeof(pVCpu->iem.s.pTbCacheR3->apHash[0]) * pVCpu->iem.s.pTbCacheR3->cHash);
1104# endif
1105
1106 return VINF_SUCCESS;
1107}
1108#endif
1109
1110
1111/**
1112 * Grow the translation block allocator with another chunk.
1113 */
1114static int iemTbAllocatorGrow(PVMCPUCC pVCpu)
1115{
1116 /*
1117 * Validate state.
1118 */
1119 PIEMTBALLOCATOR const pTbAllocator = pVCpu->iem.s.pTbAllocatorR3;
1120 AssertReturn(pTbAllocator, VERR_WRONG_ORDER);
1121 AssertReturn(pTbAllocator->uMagic == IEMTBALLOCATOR_MAGIC, VERR_INVALID_MAGIC);
1122 uint32_t const idxChunk = pTbAllocator->cAllocatedChunks;
1123 AssertReturn(idxChunk < pTbAllocator->cMaxChunks, VERR_OUT_OF_RESOURCES);
1124
1125 /*
1126 * Allocate a new chunk and add it to the allocator.
1127 */
1128 PIEMTB const paTbs = (PIEMTB)RTMemPageAllocZ(pTbAllocator->cbPerChunk);
1129 AssertLogRelReturn(paTbs, VERR_NO_PAGE_MEMORY);
1130 pTbAllocator->aChunks[idxChunk].paTbs = paTbs;
1131
1132 uint32_t const cTbsPerChunk = pTbAllocator->cTbsPerChunk;
1133 for (uint32_t iTb = 0; iTb < cTbsPerChunk; iTb++)
1134 {
1135 paTbs[iTb].idxAllocChunk = idxChunk; /* This is not strictly necessary... */
1136 paTbs[iTb].pNext = pTbAllocator->pTbsFreeHead;
1137 pTbAllocator->pTbsFreeHead = &paTbs[iTb];
1138 }
1139 pTbAllocator->cAllocatedChunks = (uint16_t)(idxChunk + 1);
1140 pTbAllocator->cTotalTbs += cTbsPerChunk;
1141
1142 return VINF_SUCCESS;
1143}
1144
1145
1146/**
1147 * Allocates a TB from an allocator with a free block available.
1148 *
1149 * This is common code to both the fast and slow allocator code paths.
1150 */
1151DECL_FORCE_INLINE(PIEMTB) iemTbAllocatorAllocCore(PIEMTBALLOCATOR const pTbAllocator, bool fThreaded)
1152{
1153 Assert(pTbAllocator->cInUseTbs < pTbAllocator->cTotalTbs);
1154 Assert(pTbAllocator->pTbsFreeHead);
1155
1156 PIEMTB const pTb = pTbAllocator->pTbsFreeHead;
1157 pTbAllocator->pTbsFreeHead = pTb->pNext;
1158 pTbAllocator->cInUseTbs += 1;
1159 if (fThreaded)
1160 pTbAllocator->cThreadedTbs += 1;
1161 else
1162 pTbAllocator->cNativeTbs += 1;
1163 STAM_REL_COUNTER_INC(&pTbAllocator->StatAllocs);
1164 return pTb;
1165}
1166
1167
1168/**
1169 * Slow path for iemTbAllocatorAlloc.
1170 */
1171static PIEMTB iemTbAllocatorAllocSlow(PVMCPUCC pVCpu, PIEMTBALLOCATOR const pTbAllocator, bool fThreaded)
1172{
1173 /*
1174 * With some luck we can add another chunk.
1175 */
1176 if (pTbAllocator->cAllocatedChunks < pTbAllocator->cMaxChunks)
1177 {
1178 int rc = iemTbAllocatorGrow(pVCpu);
1179 if (RT_SUCCESS(rc))
1180 return iemTbAllocatorAllocCore(pTbAllocator, fThreaded);
1181 }
1182
1183 /*
1184 * We have to prune stuff. Sigh.
1185 *
1186 * This requires scanning for older TBs and kicking them out. Not sure how to
1187 * best do this as we don't want to maintain any list of TBs ordered by last
1188 * usage time. But one reasonably simple approach would be that each time we
1189 * get here we continue a sequential scan of the allocation chunks,
1190 * considering just a smallish number of TBs and freeing a fixed portion of
1191 * them. Say, we consider the next 128 TBs, freeing the least recently used
1192 * out of each group of 4 TBs, resulting in 32 free TBs.
1193 */
1194 STAM_PROFILE_START(&pTbAllocator->StatPrune, a);
1195 uint32_t const msNow = pVCpu->iem.s.msRecompilerPollNow;
1196 uint32_t const cTbsToPrune = 128;
1197 uint32_t const cTbsPerGroup = 4;
1198 uint32_t cFreedTbs = 0;
1199#ifdef IEMTB_SIZE_IS_POWER_OF_TWO
1200 uint32_t idxTbPruneFrom = pTbAllocator->iPruneFrom & ~(uint32_t)(cTbsToPrune - 1); /* Stay within a chunk! */
1201#else
1202 uint32_t idxTbPruneFrom = pTbAllocator->iPruneFrom;
1203#endif
1204 if (idxTbPruneFrom >= pTbAllocator->cMaxTbs)
1205 idxTbPruneFrom = 0;
1206 for (uint32_t i = 0; i < cTbsToPrune; i += cTbsPerGroup, idxTbPruneFrom += cTbsPerGroup)
1207 {
1208 uint32_t idxChunk = IEMTBALLOC_IDX_TO_CHUNK(pTbAllocator, idxTbPruneFrom);
1209 uint32_t idxInChunk = IEMTBALLOC_IDX_TO_INDEX_IN_CHUNK(pTbAllocator, idxTbPruneFrom, idxChunk);
1210 PIEMTB pTb = &pTbAllocator->aChunks[idxChunk].paTbs[idxInChunk];
1211 uint32_t cMsAge = msNow - pTb->msLastUsed;
1212 Assert(pTb->fFlags & IEMTB_F_TYPE_MASK);
1213
1214 for (uint32_t j = 1, idxChunk2 = idxChunk, idxInChunk2 = idxInChunk + 1; j < cTbsPerGroup; j++, idxInChunk2++)
1215 {
1216#ifndef IEMTB_SIZE_IS_POWER_OF_TWO
1217 if (idxInChunk2 < pTbAllocator->cTbsPerChunk)
1218 { /* likely */ }
1219 else
1220 {
1221 idxInChunk2 = 0;
1222 idxChunk2 += 1;
1223 if (idxChunk2 >= pTbAllocator->cAllocatedChunks)
1224 idxChunk2 = 0;
1225 }
1226#endif
1227 PIEMTB const pTb2 = &pTbAllocator->aChunks[idxChunk2].paTbs[idxInChunk2];
1228 uint32_t const cMsAge2 = msNow - pTb2->msLastUsed;
1229 if ( cMsAge2 > cMsAge
1230 || (cMsAge2 == cMsAge && pTb2->cUsed < pTb->cUsed))
1231 {
1232 Assert(pTb2->fFlags & IEMTB_F_TYPE_MASK);
1233 pTb = pTb2;
1234 idxChunk = idxChunk2;
1235 idxInChunk = idxInChunk2;
1236 cMsAge = cMsAge2;
1237 }
1238 }
1239
1240 /* Free the TB. */
1241 iemTbAllocatorFreeInner(pVCpu, pTbAllocator, pTb, idxChunk, idxInChunk);
1242 cFreedTbs++; /* paranoia */
1243 }
1244 pTbAllocator->iPruneFrom = idxTbPruneFrom;
1245 STAM_PROFILE_STOP(&pTbAllocator->StatPrune, a);
1246
1247 /* Flush the TB lookup entry pointer. */
1248 pVCpu->iem.s.ppTbLookupEntryR3 = &pVCpu->iem.s.pTbLookupEntryDummyR3;
1249
1250 /*
1251 * Allocate a TB from the ones we've pruned.
1252 */
1253 if (cFreedTbs)
1254 return iemTbAllocatorAllocCore(pTbAllocator, fThreaded);
1255 return NULL;
1256}
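
/*
 * Illustrative sketch (not from the actual sources): the pruning strategy
 * described above - walk a window of slots in fixed-size groups and evict the
 * least recently used entry of each group, with the use count as tie breaker -
 * in generic form, with hypothetical MYENTRY/myEvict names:
 */
#if 0 /* illustration only */
typedef struct MYENTRY { uint32_t msLastUsed; uint32_t cUsed; } MYENTRY;
static void myEvict(MYENTRY *pEntry);                   /* hypothetical eviction hook (not implemented here) */

static uint32_t myPruneWindow(MYENTRY *paEntries, uint32_t cEntries, uint32_t idxStart, uint32_t msNow)
{
    uint32_t const cToScan   = 128;                     /* window size */
    uint32_t const cPerGroup = 4;                       /* one eviction per group => 32 evictions */
    uint32_t       idx       = idxStart % cEntries;
    for (uint32_t i = 0; i < cToScan; i += cPerGroup)
    {
        /* Pick the entry in this group with the highest age, breaking ties on the lower use count. */
        uint32_t idxOldest = idx;
        for (uint32_t j = 1; j < cPerGroup; j++)
        {
            uint32_t const idxCur = (idx + j) % cEntries;
            if (   msNow - paEntries[idxCur].msLastUsed >  msNow - paEntries[idxOldest].msLastUsed
                || (   msNow - paEntries[idxCur].msLastUsed == msNow - paEntries[idxOldest].msLastUsed
                    && paEntries[idxCur].cUsed < paEntries[idxOldest].cUsed))
                idxOldest = idxCur;
        }
        myEvict(&paEntries[idxOldest]);
        idx = (idx + cPerGroup) % cEntries;
    }
    return idx;                                         /* the next scan resumes here */
}
#endif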
1257
1258
1259/**
1260 * Allocate a translation block.
1261 *
1262 * @returns Pointer to block on success, NULL if we're out and unable to
1263 * free up an existing one (very unlikely once implemented).
1264 * @param pVCpu The cross context virtual CPU structure of the calling
1265 * thread.
1266 * @param fThreaded Set if threaded TB being allocated, clear if native TB.
1267 * For statistics.
1268 */
1269DECL_FORCE_INLINE(PIEMTB) iemTbAllocatorAlloc(PVMCPUCC pVCpu, bool fThreaded)
1270{
1271 PIEMTBALLOCATOR const pTbAllocator = pVCpu->iem.s.pTbAllocatorR3;
1272 Assert(pTbAllocator && pTbAllocator->uMagic == IEMTBALLOCATOR_MAGIC);
1273
1274 /* Free any pending TBs before we proceed. */
1275 if (!pTbAllocator->pDelayedFreeHead)
1276 { /* probably likely */ }
1277 else
1278 iemTbAllocatorProcessDelayedFrees(pVCpu, pTbAllocator);
1279
1280 /* If the allocator is full, take the slow code path. */
1281 if (RT_LIKELY(pTbAllocator->cInUseTbs < pTbAllocator->cTotalTbs))
1282 return iemTbAllocatorAllocCore(pTbAllocator, fThreaded);
1283 return iemTbAllocatorAllocSlow(pVCpu, pTbAllocator, fThreaded);
1284}
1285
1286
1287/**
1288 * This is called when we're out of space for native TBs.
1289 *
1290 * This uses a variation on the pruning in iemTbAllocatorAllocSlow.
1291 * The difference is that we only prune native TBs and will only free any if
1292 * there are at least two in a group. The conditions under which we're called are
1293 * different - there will probably be free TBs in the table when we're called.
1294 * Therefore we increase the group size and max scan length, though we'll stop
1295 * scanning once we've reached the requested size (@a cNeededInstrs) and freed
1296 * up at least 8 TBs.
1297 */
1298void iemTbAllocatorFreeupNativeSpace(PVMCPUCC pVCpu, uint32_t cNeededInstrs)
1299{
1300 PIEMTBALLOCATOR const pTbAllocator = pVCpu->iem.s.pTbAllocatorR3;
1301 AssertReturnVoid(pTbAllocator && pTbAllocator->uMagic == IEMTBALLOCATOR_MAGIC);
1302
1303 STAM_REL_PROFILE_START(&pTbAllocator->StatPruneNative, a);
1304
1305 /*
1306 * Flush the delayed free list before we start freeing TBs indiscriminately.
1307 */
1308 iemTbAllocatorProcessDelayedFrees(pVCpu, pTbAllocator);
1309
1310 /*
1311 * Scan and free TBs.
1312 */
1313 uint32_t const msNow = pVCpu->iem.s.msRecompilerPollNow;
1314 uint32_t const cTbsToPrune = 128 * 8;
1315 uint32_t const cTbsPerGroup = 4 * 4;
1316 uint32_t cFreedTbs = 0;
1317 uint32_t cMaxInstrs = 0;
1318 uint32_t idxTbPruneFrom = pTbAllocator->iPruneNativeFrom & ~(uint32_t)(cTbsPerGroup - 1);
1319 for (uint32_t i = 0; i < cTbsToPrune; i += cTbsPerGroup, idxTbPruneFrom += cTbsPerGroup)
1320 {
1321 if (idxTbPruneFrom >= pTbAllocator->cTotalTbs)
1322 idxTbPruneFrom = 0;
1323 uint32_t idxChunk = IEMTBALLOC_IDX_TO_CHUNK(pTbAllocator, idxTbPruneFrom);
1324 uint32_t idxInChunk = IEMTBALLOC_IDX_TO_INDEX_IN_CHUNK(pTbAllocator, idxTbPruneFrom, idxChunk);
1325 PIEMTB pTb = &pTbAllocator->aChunks[idxChunk].paTbs[idxInChunk];
1326 uint32_t cMsAge = pTb->fFlags & IEMTB_F_TYPE_NATIVE ? msNow - pTb->msLastUsed : msNow;
1327 uint8_t cNativeTbs = (pTb->fFlags & IEMTB_F_TYPE_NATIVE) != 0;
1328
1329 for (uint32_t j = 1, idxChunk2 = idxChunk, idxInChunk2 = idxInChunk + 1; j < cTbsPerGroup; j++, idxInChunk2++)
1330 {
1331 if (idxInChunk2 < pTbAllocator->cTbsPerChunk)
1332 { /* likely */ }
1333 else
1334 {
1335 idxInChunk2 = 0;
1336 idxChunk2 += 1;
1337 if (idxChunk2 >= pTbAllocator->cAllocatedChunks)
1338 idxChunk2 = 0;
1339 }
1340 PIEMTB const pTb2 = &pTbAllocator->aChunks[idxChunk2].paTbs[idxInChunk2];
1341 if (pTb2->fFlags & IEMTB_F_TYPE_NATIVE)
1342 {
1343 cNativeTbs += 1;
1344 uint32_t const cMsAge2 = msNow - pTb2->msLastUsed;
1345 if ( cMsAge2 > cMsAge
1346 || ( cMsAge2 == cMsAge
1347 && ( pTb2->cUsed < pTb->cUsed
1348 || ( pTb2->cUsed == pTb->cUsed
1349 && pTb2->Native.cInstructions > pTb->Native.cInstructions)))
1350 || !(pTb->fFlags & IEMTB_F_TYPE_NATIVE))
1351 {
1352 pTb = pTb2;
1353 idxChunk = idxChunk2;
1354 idxInChunk = idxInChunk2;
1355 cMsAge = cMsAge2;
1356 }
1357 }
1358 }
1359
1360 /* Free the TB if we found at least two native ones in this group. */
1361 if (cNativeTbs >= 2)
1362 {
1363 cMaxInstrs = RT_MAX(cMaxInstrs, pTb->Native.cInstructions);
1364 iemTbAllocatorFreeInner(pVCpu, pTbAllocator, pTb, idxChunk, idxInChunk);
1365 cFreedTbs++;
1366 if (cFreedTbs >= 8 && cMaxInstrs >= cNeededInstrs)
1367 break;
1368 }
1369 }
1370 pTbAllocator->iPruneNativeFrom = idxTbPruneFrom;
1371
1372 STAM_REL_PROFILE_STOP(&pTbAllocator->StatPruneNative, a);
1373}
1374
1375
1376/*********************************************************************************************************************************
1377* Threaded Recompiler Core *
1378*********************************************************************************************************************************/
1379/**
1380 * Formats TB flags (IEM_F_XXX and IEMTB_F_XXX) to string.
1381 * @returns pszBuf.
1382 * @param fFlags The flags.
1383 * @param pszBuf The output buffer.
1384 * @param cbBuf The output buffer size. At least 32 bytes.
1385 */
1386DECLHIDDEN(const char *) iemTbFlagsToString(uint32_t fFlags, char *pszBuf, size_t cbBuf) RT_NOEXCEPT
1387{
1388 Assert(cbBuf >= 32);
1389 static RTSTRTUPLE const s_aModes[] =
1390 {
1391 /* [00] = */ { RT_STR_TUPLE("16BIT") },
1392 /* [01] = */ { RT_STR_TUPLE("32BIT") },
1393 /* [02] = */ { RT_STR_TUPLE("!2!") },
1394 /* [03] = */ { RT_STR_TUPLE("!3!") },
1395 /* [04] = */ { RT_STR_TUPLE("16BIT_PRE_386") },
1396 /* [05] = */ { RT_STR_TUPLE("32BIT_FLAT") },
1397 /* [06] = */ { RT_STR_TUPLE("!6!") },
1398 /* [07] = */ { RT_STR_TUPLE("!7!") },
1399 /* [08] = */ { RT_STR_TUPLE("16BIT_PROT") },
1400 /* [09] = */ { RT_STR_TUPLE("32BIT_PROT") },
1401 /* [0a] = */ { RT_STR_TUPLE("64BIT") },
1402 /* [0b] = */ { RT_STR_TUPLE("!b!") },
1403 /* [0c] = */ { RT_STR_TUPLE("16BIT_PROT_PRE_386") },
1404 /* [0d] = */ { RT_STR_TUPLE("32BIT_PROT_FLAT") },
1405 /* [0e] = */ { RT_STR_TUPLE("!e!") },
1406 /* [0f] = */ { RT_STR_TUPLE("!f!") },
1407 /* [10] = */ { RT_STR_TUPLE("!10!") },
1408 /* [11] = */ { RT_STR_TUPLE("!11!") },
1409 /* [12] = */ { RT_STR_TUPLE("!12!") },
1410 /* [13] = */ { RT_STR_TUPLE("!13!") },
1411 /* [14] = */ { RT_STR_TUPLE("!14!") },
1412 /* [15] = */ { RT_STR_TUPLE("!15!") },
1413 /* [16] = */ { RT_STR_TUPLE("!16!") },
1414 /* [17] = */ { RT_STR_TUPLE("!17!") },
1415 /* [18] = */ { RT_STR_TUPLE("16BIT_PROT_V86") },
1416 /* [19] = */ { RT_STR_TUPLE("32BIT_PROT_V86") },
1417 /* [1a] = */ { RT_STR_TUPLE("!1a!") },
1418 /* [1b] = */ { RT_STR_TUPLE("!1b!") },
1419 /* [1c] = */ { RT_STR_TUPLE("!1c!") },
1420 /* [1d] = */ { RT_STR_TUPLE("!1d!") },
1421 /* [1e] = */ { RT_STR_TUPLE("!1e!") },
1422 /* [1f] = */ { RT_STR_TUPLE("!1f!") },
1423 };
1424 AssertCompile(RT_ELEMENTS(s_aModes) == IEM_F_MODE_MASK + 1);
1425 memcpy(pszBuf, s_aModes[fFlags & IEM_F_MODE_MASK].psz, s_aModes[fFlags & IEM_F_MODE_MASK].cch);
1426 size_t off = s_aModes[fFlags & IEM_F_MODE_MASK].cch;
1427
1428 pszBuf[off++] = ' ';
1429 pszBuf[off++] = 'C';
1430 pszBuf[off++] = 'P';
1431 pszBuf[off++] = 'L';
1432 pszBuf[off++] = '0' + ((fFlags >> IEM_F_X86_CPL_SHIFT) & IEM_F_X86_CPL_SMASK);
1433 Assert(off < 32);
1434
1435 fFlags &= ~(IEM_F_MODE_MASK | IEM_F_X86_CPL_SMASK);
1436
1437 static struct { const char *pszName; uint32_t cchName; uint32_t fFlag; } const s_aFlags[] =
1438 {
1439 { RT_STR_TUPLE("BYPASS_HANDLERS"), IEM_F_BYPASS_HANDLERS },
1440 { RT_STR_TUPLE("PENDING_BRK_INSTR"), IEM_F_PENDING_BRK_INSTR },
1441 { RT_STR_TUPLE("PENDING_BRK_DATA"), IEM_F_PENDING_BRK_DATA },
1442 { RT_STR_TUPLE("PENDING_BRK_X86_IO"), IEM_F_PENDING_BRK_X86_IO },
1443 { RT_STR_TUPLE("X86_DISREGARD_LOCK"), IEM_F_X86_DISREGARD_LOCK },
1444 { RT_STR_TUPLE("X86_CTX_VMX"), IEM_F_X86_CTX_VMX },
1445 { RT_STR_TUPLE("X86_CTX_SVM"), IEM_F_X86_CTX_SVM },
1446 { RT_STR_TUPLE("X86_CTX_IN_GUEST"), IEM_F_X86_CTX_IN_GUEST },
1447 { RT_STR_TUPLE("X86_CTX_SMM"), IEM_F_X86_CTX_SMM },
1448 { RT_STR_TUPLE("INHIBIT_SHADOW"), IEMTB_F_INHIBIT_SHADOW },
1449 { RT_STR_TUPLE("INHIBIT_NMI"), IEMTB_F_INHIBIT_NMI },
1450 { RT_STR_TUPLE("CS_LIM_CHECKS"), IEMTB_F_CS_LIM_CHECKS },
1451 { RT_STR_TUPLE("TYPE_THREADED"), IEMTB_F_TYPE_THREADED },
1452 { RT_STR_TUPLE("TYPE_NATIVE"), IEMTB_F_TYPE_NATIVE },
1453 };
1454 if (fFlags)
1455 for (unsigned i = 0; i < RT_ELEMENTS(s_aFlags); i++)
1456 if (s_aFlags[i].fFlag & fFlags)
1457 {
1458 AssertReturnStmt(off + 1 + s_aFlags[i].cchName + 1 <= cbBuf, pszBuf[off] = '\0', pszBuf);
1459 pszBuf[off++] = ' ';
1460 memcpy(&pszBuf[off], s_aFlags[i].pszName, s_aFlags[i].cchName);
1461 off += s_aFlags[i].cchName;
1462 fFlags &= ~s_aFlags[i].fFlag;
1463 if (!fFlags)
1464 break;
1465 }
1466 pszBuf[off] = '\0';
1467
1468 return pszBuf;
1469}
1470
1471
1472/** @callback_method_impl{FNDISREADBYTES, Dummy.} */
1473static DECLCALLBACK(int) iemThreadedDisasReadBytesDummy(PDISSTATE pDis, uint8_t offInstr, uint8_t cbMinRead, uint8_t cbMaxRead)
1474{
1475 RT_BZERO(&pDis->Instr.ab[offInstr], cbMaxRead);
1476 pDis->cbCachedInstr += cbMaxRead;
1477 RT_NOREF(cbMinRead);
1478 return VERR_NO_DATA;
1479}
1480
1481
1482/**
1483 * Worker for iemThreadedDisassembleTb.
1484 */
1485static void iemThreadedDumpLookupTable(PCIEMTB pTb, PCDBGFINFOHLP pHlp, unsigned idxFirst, unsigned cEntries,
1486 const char *pszLeadText = " TB Lookup:") RT_NOEXCEPT
1487{
1488 if (idxFirst + cEntries <= pTb->cTbLookupEntries)
1489 {
1490 PIEMTB * const papTbLookup = IEMTB_GET_TB_LOOKUP_TAB_ENTRY(pTb, idxFirst);
1491 pHlp->pfnPrintf(pHlp, "%s", pszLeadText);
1492 for (uint8_t iLookup = 0; iLookup < cEntries; iLookup++)
1493 {
1494 PIEMTB pLookupTb = papTbLookup[iLookup];
1495 if (pLookupTb)
1496 pHlp->pfnPrintf(pHlp, "%c%p (%s)", iLookup ? ',' : ' ', pLookupTb,
1497 (pLookupTb->fFlags & IEMTB_F_TYPE_MASK) == IEMTB_F_TYPE_THREADED ? "threaded"
1498 : (pLookupTb->fFlags & IEMTB_F_TYPE_MASK) == IEMTB_F_TYPE_NATIVE ? "native"
1499 : "invalid");
1500 else
1501 pHlp->pfnPrintf(pHlp, "%cNULL", iLookup ? ',' : ' ');
1502 }
1503 pHlp->pfnPrintf(pHlp, "\n");
1504 }
1505 else
1506 {
1507 pHlp->pfnPrintf(pHlp, " !!Bogus TB lookup info: idxFirst=%#x L %u > cTbLookupEntries=%#x!!\n",
1508 idxFirst, cEntries, pTb->cTbLookupEntries);
1509 AssertMsgFailed(("idxFirst=%#x L %u > cTbLookupEntries=%#x\n", idxFirst, cEntries, pTb->cTbLookupEntries));
1510 }
1511}
1512
1513
1514DECLHIDDEN(void) iemThreadedDisassembleTb(PCIEMTB pTb, PCDBGFINFOHLP pHlp) RT_NOEXCEPT
1515{
1516 AssertReturnVoid((pTb->fFlags & IEMTB_F_TYPE_MASK) == IEMTB_F_TYPE_THREADED);
1517
1518 char szDisBuf[512];
1519
1520 /*
1521 * Print TB info.
1522 */
1523 pHlp->pfnPrintf(pHlp,
1524 "pTb=%p: GCPhysPc=%RGp (%RGv) cInstructions=%u LB %#x cRanges=%u cTbLookupEntries=%u\n"
1525 "pTb=%p: cUsed=%u msLastUsed=%u fFlags=%#010x %s\n",
1526 pTb, pTb->GCPhysPc, pTb->FlatPc, pTb->cInstructions, pTb->cbOpcodes, pTb->cRanges, pTb->cTbLookupEntries,
1527 pTb, pTb->cUsed, pTb->msLastUsed, pTb->fFlags, iemTbFlagsToString(pTb->fFlags, szDisBuf, sizeof(szDisBuf)));
1528
1529 /*
1530 * This disassembly is driven by the debug info which follows the native
1531 * code and indicates when it starts with the next guest instructions,
1532 * where labels are and such things.
1533 */
1534 DISSTATE Dis;
1535 PCIEMTHRDEDCALLENTRY const paCalls = pTb->Thrd.paCalls;
1536 uint32_t const cCalls = pTb->Thrd.cCalls;
1537 DISCPUMODE enmGstCpuMode = (pTb->fFlags & IEM_F_MODE_CPUMODE_MASK) == IEMMODE_16BIT ? DISCPUMODE_16BIT
1538 : (pTb->fFlags & IEM_F_MODE_CPUMODE_MASK) == IEMMODE_32BIT ? DISCPUMODE_32BIT
1539 : DISCPUMODE_64BIT;
1540 uint32_t fExec = pTb->fFlags & UINT32_C(0x00ffffff);
1541 uint8_t idxRange = UINT8_MAX;
1542 uint8_t const cRanges = RT_MIN(pTb->cRanges, RT_ELEMENTS(pTb->aRanges));
1543 uint32_t offRange = 0;
1544 uint32_t offOpcodes = 0;
1545 uint32_t const cbOpcodes = pTb->cbOpcodes;
1546 RTGCPHYS GCPhysPc = pTb->GCPhysPc;
1547 bool fTbLookupSeen0 = false;
1548
1549 for (uint32_t iCall = 0; iCall < cCalls; iCall++)
1550 {
1551 /*
1552 * New opcode range?
1553 */
1554 if ( idxRange == UINT8_MAX
1555 || idxRange >= cRanges
1556 || offRange >= pTb->aRanges[idxRange].cbOpcodes)
1557 {
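            /* Advance to the next opcode range: offRange keeps the bytes consumed past
               the previous range so it stays an offset within the new range, and the
               flat physical PC is recomputed from the range's physical page below. */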
1558 idxRange += 1;
1559 if (idxRange < cRanges)
1560 offRange = !idxRange ? 0 : offRange - pTb->aRanges[idxRange - 1].cbOpcodes;
1561 else
1562 continue;
1563 GCPhysPc = pTb->aRanges[idxRange].offPhysPage
1564 + (pTb->aRanges[idxRange].idxPhysPage == 0
1565 ? pTb->GCPhysPc & ~(RTGCPHYS)GUEST_PAGE_OFFSET_MASK
1566 : pTb->aGCPhysPages[pTb->aRanges[idxRange].idxPhysPage - 1]);
1567 pHlp->pfnPrintf(pHlp, " Range #%u: GCPhysPc=%RGp LB %#x [idxPg=%d]\n",
1568 idxRange, GCPhysPc, pTb->aRanges[idxRange].cbOpcodes,
1569 pTb->aRanges[idxRange].idxPhysPage);
1570 GCPhysPc += offRange;
1571 }
1572
1573 /*
1574 * Disassemble another guest instruction?
1575 */
1576 if ( paCalls[iCall].offOpcode != offOpcodes
1577 && paCalls[iCall].cbOpcode > 0
1578 && (uint32_t)(cbOpcodes - paCalls[iCall].offOpcode) <= cbOpcodes /* paranoia^2 */ )
1579 {
1580 offOpcodes = paCalls[iCall].offOpcode;
1581 uint8_t const cbInstrMax = RT_MIN(cbOpcodes - offOpcodes, 15);
1582 uint32_t cbInstr = 1;
1583 int rc = DISInstrWithPrefetchedBytes(GCPhysPc, enmGstCpuMode, DISOPTYPE_ALL,
1584 &pTb->pabOpcodes[offOpcodes], cbInstrMax,
1585 iemThreadedDisasReadBytesDummy, NULL, &Dis, &cbInstr);
1586 if (RT_SUCCESS(rc))
1587 {
1588 DISFormatYasmEx(&Dis, szDisBuf, sizeof(szDisBuf),
1589 DIS_FMT_FLAGS_BYTES_WIDTH_MAKE(10) | DIS_FMT_FLAGS_BYTES_LEFT
1590 | DIS_FMT_FLAGS_RELATIVE_BRANCH | DIS_FMT_FLAGS_C_HEX,
1591 NULL /*pfnGetSymbol*/, NULL /*pvUser*/);
1592 pHlp->pfnPrintf(pHlp, " %%%%%RGp: %s\n", GCPhysPc, szDisBuf);
1593 }
1594 else
1595 {
1596 pHlp->pfnPrintf(pHlp, " %%%%%RGp: %.*Rhxs - guest disassembly failure %Rrc\n",
1597 GCPhysPc, cbInstrMax, &pTb->pabOpcodes[offOpcodes], rc);
1598 cbInstr = paCalls[iCall].cbOpcode;
1599 }
1600 GCPhysPc += cbInstr;
1601 offRange += cbInstr;
1602 }
1603
1604 /*
1605 * Dump call details.
1606 */
1607 pHlp->pfnPrintf(pHlp,
1608 " Call #%u to %s (%u args)\n",
1609 iCall, g_apszIemThreadedFunctions[paCalls[iCall].enmFunction],
1610 g_acIemThreadedFunctionUsedArgs[paCalls[iCall].enmFunction]);
1611 if (paCalls[iCall].uTbLookup != 0)
1612 {
1613 uint8_t const idxFirst = IEM_TB_LOOKUP_TAB_GET_IDX(paCalls[iCall].uTbLookup);
1614 fTbLookupSeen0 = idxFirst == 0;
1615 iemThreadedDumpLookupTable(pTb, pHlp, idxFirst, IEM_TB_LOOKUP_TAB_GET_SIZE(paCalls[iCall].uTbLookup));
1616 }
1617
1618 /*
1619 * Snoop fExec.
1620 */
1621 switch (paCalls[iCall].enmFunction)
1622 {
1623 default:
1624 break;
1625 case kIemThreadedFunc_BltIn_CheckMode:
1626 fExec = paCalls[iCall].auParams[0];
1627 break;
1628 }
1629 }
1630
1631 if (!fTbLookupSeen0)
1632 iemThreadedDumpLookupTable(pTb, pHlp, 0, 1, " Fallback TB Lookup:");
1633}
1634
1635
1636
1637/**
1638 * Allocate a translation block for threaded recompilation.
1639 *
1640 * This is allocated with maxed out call table and storage for opcode bytes,
1641 * because it's only supposed to be called once per EMT to allocate the TB
1642 * pointed to by IEMCPU::pThrdCompileTbR3.
1643 *
1644 * @returns Pointer to the translation block on success, NULL on failure.
1645 * @param pVM The cross context virtual machine structure.
1646 * @param pVCpu The cross context virtual CPU structure of the calling
1647 * thread.
1648 * @param GCPhysPc The physical address corresponding to RIP + CS.BASE.
1649 * @param fExtraFlags Extra flags (IEMTB_F_XXX).
1650 */
1651static PIEMTB iemThreadedTbAlloc(PVMCC pVM, PVMCPUCC pVCpu, RTGCPHYS GCPhysPc, uint32_t fExtraFlags)
1652{
1653 PIEMTB pTb = (PIEMTB)RTMemAllocZ(sizeof(IEMTB));
1654 if (pTb)
1655 {
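        /* Maxed-out sizing for the per-EMT compile-time TB: 256 threaded call entries
           and 16 opcode bytes per entry (an x86 instruction is at most 15 bytes long).
           The finished TB is later copied into a right-sized one by
           iemThreadedTbDuplicate(). */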
1656 unsigned const cCalls = 256;
1657 pTb->Thrd.paCalls = (PIEMTHRDEDCALLENTRY)RTMemAlloc(sizeof(IEMTHRDEDCALLENTRY) * cCalls);
1658 if (pTb->Thrd.paCalls)
1659 {
1660 pTb->pabOpcodes = (uint8_t *)RTMemAlloc(cCalls * 16);
1661 if (pTb->pabOpcodes)
1662 {
1663 pVCpu->iem.s.cbOpcodesAllocated = cCalls * 16;
1664 pTb->Thrd.cAllocated = cCalls;
1665 pTb->Thrd.cCalls = 0;
1666 pTb->cbOpcodes = 0;
1667 pTb->pNext = NULL;
1668 pTb->cUsed = 0;
1669 pTb->msLastUsed = pVCpu->iem.s.msRecompilerPollNow;
1670 pTb->idxAllocChunk = UINT8_MAX;
1671 pTb->GCPhysPc = GCPhysPc;
1672 pTb->x86.fAttr = (uint16_t)pVCpu->cpum.GstCtx.cs.Attr.u;
1673 pTb->fFlags = (pVCpu->iem.s.fExec & IEMTB_F_IEM_F_MASK) | fExtraFlags;
1674 pTb->cInstructions = 0;
1675 pTb->cTbLookupEntries = 1; /* Entry zero is for anything w/o a specific entry. */
1676
1677 /* Init the first opcode range. */
1678 pTb->cRanges = 1;
1679 pTb->aRanges[0].cbOpcodes = 0;
1680 pTb->aRanges[0].offOpcodes = 0;
1681 pTb->aRanges[0].offPhysPage = GCPhysPc & GUEST_PAGE_OFFSET_MASK;
1682 pTb->aRanges[0].u2Unused = 0;
1683 pTb->aRanges[0].idxPhysPage = 0;
1684 pTb->aGCPhysPages[0] = NIL_RTGCPHYS;
1685 pTb->aGCPhysPages[1] = NIL_RTGCPHYS;
1686
1687 return pTb;
1688 }
1689 RTMemFree(pTb->Thrd.paCalls);
1690 }
1691 RTMemFree(pTb);
1692 }
1693 RT_NOREF(pVM);
1694 return NULL;
1695}
1696
1697
1698/**
1699 * Called on the TB that is dedicated for recompilation, before it's reused.
1700 *
1701 * @param pVCpu The cross context virtual CPU structure of the calling
1702 * thread.
1703 * @param pTb The translation block to reuse.
1704 * @param GCPhysPc The physical address corresponding to RIP + CS.BASE.
1705 * @param fExtraFlags Extra flags (IEMTB_F_XXX).
1706 */
1707static void iemThreadedTbReuse(PVMCPUCC pVCpu, PIEMTB pTb, RTGCPHYS GCPhysPc, uint32_t fExtraFlags)
1708{
1709 pTb->GCPhysPc = GCPhysPc;
1710 pTb->fFlags = (pVCpu->iem.s.fExec & IEMTB_F_IEM_F_MASK) | fExtraFlags;
1711 pTb->x86.fAttr = (uint16_t)pVCpu->cpum.GstCtx.cs.Attr.u;
1712 pTb->Thrd.cCalls = 0;
1713 pTb->cbOpcodes = 0;
1714 pTb->cInstructions = 0;
1715 pTb->cTbLookupEntries = 1; /* Entry zero is for anything w/o a specific entry. */
1716
1717 /* Init the first opcode range. */
1718 pTb->cRanges = 1;
1719 pTb->aRanges[0].cbOpcodes = 0;
1720 pTb->aRanges[0].offOpcodes = 0;
1721 pTb->aRanges[0].offPhysPage = GCPhysPc & GUEST_PAGE_OFFSET_MASK;
1722 pTb->aRanges[0].u2Unused = 0;
1723 pTb->aRanges[0].idxPhysPage = 0;
1724 pTb->aGCPhysPages[0] = NIL_RTGCPHYS;
1725 pTb->aGCPhysPages[1] = NIL_RTGCPHYS;
1726}
1727
1728
1729/**
1730 * Used to duplicate a threaded translation block after recompilation is done.
1731 *
1732 * @returns Pointer to the translation block on success, NULL on failure.
1733 * @param pVM The cross context virtual machine structure.
1734 * @param pVCpu The cross context virtual CPU structure of the calling
1735 * thread.
1736 * @param pTbSrc The TB to duplicate.
1737 */
1738static PIEMTB iemThreadedTbDuplicate(PVMCC pVM, PVMCPUCC pVCpu, PCIEMTB pTbSrc)
1739{
1740 /*
1741 * Just using the heap for now. Will make this more efficient and
1742 * complicated later, don't worry. :-)
1743 */
1744 PIEMTB pTb = iemTbAllocatorAlloc(pVCpu, true /*fThreaded*/);
1745 if (pTb)
1746 {
1747 uint8_t const idxAllocChunk = pTb->idxAllocChunk;
1748 memcpy(pTb, pTbSrc, sizeof(*pTb));
1749 pTb->idxAllocChunk = idxAllocChunk;
1750
1751 unsigned const cCalls = pTbSrc->Thrd.cCalls;
1752 Assert(cCalls > 0);
1753 pTb->Thrd.paCalls = (PIEMTHRDEDCALLENTRY)RTMemDup(pTbSrc->Thrd.paCalls, sizeof(IEMTHRDEDCALLENTRY) * cCalls);
1754 if (pTb->Thrd.paCalls)
1755 {
1756 size_t const cbTbLookup = pTbSrc->cTbLookupEntries * sizeof(PIEMTB);
1757 Assert(cbTbLookup > 0);
1758 size_t const cbOpcodes = pTbSrc->cbOpcodes;
1759 Assert(cbOpcodes > 0);
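            /* The TB lookup table and the opcode bytes share a single allocation: the
               zero-initialized lookup table (an array of PIEMTB pointers) comes first
               and pabOpcodes points just past it. */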
1760 size_t const cbBoth = cbTbLookup + RT_ALIGN_Z(cbOpcodes, sizeof(PIEMTB));
1761 uint8_t * const pbBoth = (uint8_t *)RTMemAlloc(cbBoth);
1762 if (pbBoth)
1763 {
1764 RT_BZERO(pbBoth, cbTbLookup);
1765 pTb->pabOpcodes = (uint8_t *)memcpy(&pbBoth[cbTbLookup], pTbSrc->pabOpcodes, cbOpcodes);
1766 pTb->Thrd.cAllocated = cCalls;
1767 pTb->pNext = NULL;
1768 pTb->cUsed = 0;
1769 pTb->msLastUsed = pVCpu->iem.s.msRecompilerPollNow;
1770 pTb->fFlags = pTbSrc->fFlags;
1771
1772 return pTb;
1773 }
1774 RTMemFree(pTb->Thrd.paCalls);
1775 }
1776 iemTbAllocatorFree(pVCpu, pTb);
1777 }
1778 RT_NOREF(pVM);
1779 return NULL;
1780
1781}
1782
1783
1784/**
1785 * Adds the given TB to the hash table.
1786 *
1787 * @param pVCpu The cross context virtual CPU structure of the calling
1788 * thread.
1789 * @param pTbCache The cache to add it to.
1790 * @param pTb The translation block to add.
1791 */
1792static void iemThreadedTbAdd(PVMCPUCC pVCpu, PIEMTBCACHE pTbCache, PIEMTB pTb)
1793{
1794 iemTbCacheAdd(pVCpu, pTbCache, pTb);
1795
1796 STAM_REL_PROFILE_ADD_PERIOD(&pVCpu->iem.s.StatTbInstr, pTb->cInstructions);
1797 STAM_REL_PROFILE_ADD_PERIOD(&pVCpu->iem.s.StatTbLookupEntries, pTb->cTbLookupEntries);
1798 STAM_REL_PROFILE_ADD_PERIOD(&pVCpu->iem.s.StatTbThreadedCalls, pTb->Thrd.cCalls);
1799 if (LogIs12Enabled())
1800 {
1801 Log12(("TB added: %p %RGp LB %#x fl=%#x idxHash=%#x cRanges=%u cInstr=%u cCalls=%u\n",
1802 pTb, pTb->GCPhysPc, pTb->cbOpcodes, pTb->fFlags, IEMTBCACHE_HASH(pTbCache, pTb->fFlags, pTb->GCPhysPc),
1803 pTb->cRanges, pTb->cInstructions, pTb->Thrd.cCalls));
1804 for (uint8_t idxRange = 0; idxRange < pTb->cRanges; idxRange++)
1805 Log12((" range#%u: offPg=%#05x offOp=%#04x LB %#04x pg#%u=%RGp\n", idxRange, pTb->aRanges[idxRange].offPhysPage,
1806 pTb->aRanges[idxRange].offOpcodes, pTb->aRanges[idxRange].cbOpcodes, pTb->aRanges[idxRange].idxPhysPage,
1807 pTb->aRanges[idxRange].idxPhysPage == 0
1808 ? pTb->GCPhysPc & ~(RTGCPHYS)GUEST_PAGE_OFFSET_MASK
1809 : pTb->aGCPhysPages[pTb->aRanges[idxRange].idxPhysPage - 1]));
1810 }
1811}
1812
1813
1814/**
1815 * Called by opcode verifier functions when they detect a problem.
1816 */
1817void iemThreadedTbObsolete(PVMCPUCC pVCpu, PIEMTB pTb, bool fSafeToFree)
1818{
1819 /* We cannot free the current TB (indicated by fSafeToFree) because:
1820 - A threaded TB will have its current call entry accessed
1821 to update pVCpu->iem.s.cInstructions.
1822 - A native TB will have code left to execute. */
1823 if (fSafeToFree)
1824 iemTbAllocatorFree(pVCpu, pTb);
1825 else
1826 iemTbAlloctorScheduleForFree(pVCpu, pTb);
1827}
1828
1829
1830/*
1831 * Real code.
1832 */
1833
1834#ifdef LOG_ENABLED
1835/**
1836 * Logs the current instruction.
1837 * @param pVCpu The cross context virtual CPU structure of the calling EMT.
1838 * @param pszFunction The IEM function doing the execution.
1839 * @param idxInstr The instruction number in the block.
1840 */
1841static void iemThreadedLogCurInstr(PVMCPUCC pVCpu, const char *pszFunction, uint32_t idxInstr) RT_NOEXCEPT
1842{
1843# ifdef IN_RING3
1844 if (LogIs2Enabled())
1845 {
1846 char szInstr[256];
1847 uint32_t cbInstr = 0;
1848 DBGFR3DisasInstrEx(pVCpu->pVMR3->pUVM, pVCpu->idCpu, 0, 0,
1849 DBGF_DISAS_FLAGS_CURRENT_GUEST | DBGF_DISAS_FLAGS_DEFAULT_MODE,
1850 szInstr, sizeof(szInstr), &cbInstr);
1851
1852 PCX86FXSTATE pFpuCtx = &pVCpu->cpum.GstCtx.XState.x87;
1853 Log2(("**** %s fExec=%x pTb=%p cUsed=%u #%u\n"
1854 " eax=%08x ebx=%08x ecx=%08x edx=%08x esi=%08x edi=%08x\n"
1855 " eip=%08x esp=%08x ebp=%08x iopl=%d tr=%04x\n"
1856 " cs=%04x ss=%04x ds=%04x es=%04x fs=%04x gs=%04x efl=%08x\n"
1857 " fsw=%04x fcw=%04x ftw=%02x mxcsr=%04x/%04x\n"
1858 " %s\n"
1859 , pszFunction, pVCpu->iem.s.fExec, pVCpu->iem.s.pCurTbR3, pVCpu->iem.s.pCurTbR3 ? pVCpu->iem.s.pCurTbR3->cUsed : 0, idxInstr,
1860 pVCpu->cpum.GstCtx.eax, pVCpu->cpum.GstCtx.ebx, pVCpu->cpum.GstCtx.ecx, pVCpu->cpum.GstCtx.edx, pVCpu->cpum.GstCtx.esi, pVCpu->cpum.GstCtx.edi,
1861 pVCpu->cpum.GstCtx.eip, pVCpu->cpum.GstCtx.esp, pVCpu->cpum.GstCtx.ebp, pVCpu->cpum.GstCtx.eflags.Bits.u2IOPL, pVCpu->cpum.GstCtx.tr.Sel,
1862 pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.ss.Sel, pVCpu->cpum.GstCtx.ds.Sel, pVCpu->cpum.GstCtx.es.Sel,
1863 pVCpu->cpum.GstCtx.fs.Sel, pVCpu->cpum.GstCtx.gs.Sel, pVCpu->cpum.GstCtx.eflags.u,
1864 pFpuCtx->FSW, pFpuCtx->FCW, pFpuCtx->FTW, pFpuCtx->MXCSR, pFpuCtx->MXCSR_MASK,
1865 szInstr));
1866
1867 /*if (LogIs3Enabled()) - this outputs an insane amount of stuff, so disabled.
1868 DBGFR3InfoEx(pVCpu->pVMR3->pUVM, pVCpu->idCpu, "cpumguest", "verbose", NULL); */
1869 }
1870 else
1871# endif
1872 LogFlow(("%s: cs:rip=%04x:%08RX64 ss:rsp=%04x:%08RX64 EFL=%06x\n", pszFunction, pVCpu->cpum.GstCtx.cs.Sel,
1873 pVCpu->cpum.GstCtx.rip, pVCpu->cpum.GstCtx.ss.Sel, pVCpu->cpum.GstCtx.rsp, pVCpu->cpum.GstCtx.eflags.u));
1874}
1875#endif /* LOG_ENABLED */
1876
1877
1878#if 0
1879static VBOXSTRICTRC iemThreadedCompileLongJumped(PVMCC pVM, PVMCPUCC pVCpu, VBOXSTRICTRC rcStrict)
1880{
1881 RT_NOREF(pVM, pVCpu);
1882 return rcStrict;
1883}
1884#endif
1885
1886
1887/**
1888 * Initializes the decoder state when compiling TBs.
1889 *
1890 * This presumes that fExec has already been initialized.
1891 *
1892 * This is very similar to iemInitDecoder() and iemReInitDecoder(), so may need
1893 * to apply fixes to them as well.
1894 *
1895 * @param pVCpu The cross context virtual CPU structure of the calling
1896 * thread.
1897 * @param fReInit Clear for the first call for a TB, set for subsequent
1898 * calls from inside the compile loop where we can skip a
1899 * couple of things.
1900 * @param fExtraFlags The extra translation block flags when @a fReInit is
1901 * true, otherwise ignored. Only IEMTB_F_INHIBIT_SHADOW is
1902 * checked.
1903 */
1904DECL_FORCE_INLINE(void) iemThreadedCompileInitDecoder(PVMCPUCC pVCpu, bool const fReInit, uint32_t const fExtraFlags)
1905{
1906 /* ASSUMES: That iemInitExec was already called and that anyone changing
1907 CPU state affecting the fExec bits since then will have updated fExec! */
1908 AssertMsg((pVCpu->iem.s.fExec & ~IEM_F_USER_OPTS) == iemCalcExecFlags(pVCpu),
1909 ("fExec=%#x iemCalcExecModeFlags=%#x\n", pVCpu->iem.s.fExec, iemCalcExecFlags(pVCpu)));
1910
1911 IEMMODE const enmMode = IEM_GET_CPU_MODE(pVCpu);
1912
1913 /* Decoder state: */
1914 pVCpu->iem.s.enmDefAddrMode = enmMode; /** @todo check if this is correct... */
1915 pVCpu->iem.s.enmEffAddrMode = enmMode;
1916 if (enmMode != IEMMODE_64BIT)
1917 {
1918 pVCpu->iem.s.enmDefOpSize = enmMode; /** @todo check if this is correct... */
1919 pVCpu->iem.s.enmEffOpSize = enmMode;
1920 }
1921 else
1922 {
1923 pVCpu->iem.s.enmDefOpSize = IEMMODE_32BIT;
1924 pVCpu->iem.s.enmEffOpSize = IEMMODE_32BIT;
1925 }
1926 pVCpu->iem.s.fPrefixes = 0;
1927 pVCpu->iem.s.uRexReg = 0;
1928 pVCpu->iem.s.uRexB = 0;
1929 pVCpu->iem.s.uRexIndex = 0;
1930 pVCpu->iem.s.idxPrefix = 0;
1931 pVCpu->iem.s.uVex3rdReg = 0;
1932 pVCpu->iem.s.uVexLength = 0;
1933 pVCpu->iem.s.fEvexStuff = 0;
1934 pVCpu->iem.s.iEffSeg = X86_SREG_DS;
1935 pVCpu->iem.s.offModRm = 0;
1936 pVCpu->iem.s.iNextMapping = 0;
1937
1938 if (!fReInit)
1939 {
1940 pVCpu->iem.s.cActiveMappings = 0;
1941 pVCpu->iem.s.rcPassUp = VINF_SUCCESS;
1942 pVCpu->iem.s.fEndTb = false;
1943        pVCpu->iem.s.fTbCheckOpcodes       = true; /* (check opcodes before executing the first instruction) */
1944 pVCpu->iem.s.fTbBranched = IEMBRANCHED_F_NO;
1945 pVCpu->iem.s.fTbCrossedPage = false;
1946 pVCpu->iem.s.cInstrTillIrqCheck = !(fExtraFlags & IEMTB_F_INHIBIT_SHADOW) ? 32 : 0;
1947 pVCpu->iem.s.idxLastCheckIrqCallNo = UINT16_MAX;
1948 pVCpu->iem.s.fTbCurInstrIsSti = false;
1949 /* Force RF clearing and TF checking on first instruction in the block
1950 as we don't really know what came before and should assume the worst: */
1951 pVCpu->iem.s.fTbPrevInstr = IEM_CIMPL_F_RFLAGS | IEM_CIMPL_F_END_TB;
1952 }
1953 else
1954 {
1955 Assert(pVCpu->iem.s.cActiveMappings == 0);
1956 Assert(pVCpu->iem.s.rcPassUp == VINF_SUCCESS);
1957 Assert(pVCpu->iem.s.fEndTb == false);
1958 Assert(pVCpu->iem.s.fTbCrossedPage == false);
1959 pVCpu->iem.s.fTbPrevInstr = pVCpu->iem.s.fTbCurInstr;
1960 }
1961 pVCpu->iem.s.fTbCurInstr = 0;
1962
1963#ifdef DBGFTRACE_ENABLED
1964 switch (IEM_GET_CPU_MODE(pVCpu))
1965 {
1966 case IEMMODE_64BIT:
1967 RTTraceBufAddMsgF(pVCpu->CTX_SUFF(pVM)->CTX_SUFF(hTraceBuf), "I64/%u %08llx", IEM_GET_CPL(pVCpu), pVCpu->cpum.GstCtx.rip);
1968 break;
1969 case IEMMODE_32BIT:
1970 RTTraceBufAddMsgF(pVCpu->CTX_SUFF(pVM)->CTX_SUFF(hTraceBuf), "I32/%u %04x:%08x", IEM_GET_CPL(pVCpu), pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.eip);
1971 break;
1972 case IEMMODE_16BIT:
1973 RTTraceBufAddMsgF(pVCpu->CTX_SUFF(pVM)->CTX_SUFF(hTraceBuf), "I16/%u %04x:%04x", IEM_GET_CPL(pVCpu), pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.eip);
1974 break;
1975 }
1976#endif
1977}
1978
1979
1980/**
1981 * Initializes the opcode fetcher when starting the compilation.
1982 *
1983 * @param pVCpu The cross context virtual CPU structure of the calling
1984 * thread.
1985 */
1986DECL_FORCE_INLINE(void) iemThreadedCompileInitOpcodeFetching(PVMCPUCC pVCpu)
1987{
1988 /* Almost everything is done by iemGetPcWithPhysAndCode() already. We just need to initialize the index into abOpcode. */
1989#ifdef IEM_WITH_CODE_TLB_AND_OPCODE_BUF
1990 pVCpu->iem.s.offOpcode = 0;
1991#else
1992 RT_NOREF(pVCpu);
1993#endif
1994}
1995
1996
1997/**
1998 * Re-initializes the opcode fetcher between instructions while compiling.
1999 *
2000 * @param pVCpu The cross context virtual CPU structure of the calling
2001 * thread.
2002 */
2003DECL_FORCE_INLINE(void) iemThreadedCompileReInitOpcodeFetching(PVMCPUCC pVCpu)
2004{
2005 if (pVCpu->iem.s.pbInstrBuf)
2006 {
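        /* If the new RIP still falls within the previously mapped instruction buffer,
           just rewind the fetch offsets; otherwise clear the buffer state so the next
           fetch goes through the code TLB again. */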
2007 uint64_t off = pVCpu->cpum.GstCtx.rip;
2008 Assert(pVCpu->cpum.GstCtx.cs.u64Base == 0 || !IEM_IS_64BIT_CODE(pVCpu));
2009 off += pVCpu->cpum.GstCtx.cs.u64Base;
2010 off -= pVCpu->iem.s.uInstrBufPc;
2011 if (off < pVCpu->iem.s.cbInstrBufTotal)
2012 {
2013 pVCpu->iem.s.offInstrNextByte = (uint32_t)off;
2014 pVCpu->iem.s.offCurInstrStart = (uint16_t)off;
2015 if ((uint16_t)off + 15 <= pVCpu->iem.s.cbInstrBufTotal)
2016 pVCpu->iem.s.cbInstrBuf = (uint16_t)off + 15;
2017 else
2018 pVCpu->iem.s.cbInstrBuf = pVCpu->iem.s.cbInstrBufTotal;
2019 }
2020 else
2021 {
2022 pVCpu->iem.s.pbInstrBuf = NULL;
2023 pVCpu->iem.s.offInstrNextByte = 0;
2024 pVCpu->iem.s.offCurInstrStart = 0;
2025 pVCpu->iem.s.cbInstrBuf = 0;
2026 pVCpu->iem.s.cbInstrBufTotal = 0;
2027 pVCpu->iem.s.GCPhysInstrBuf = NIL_RTGCPHYS;
2028 }
2029 }
2030 else
2031 {
2032 pVCpu->iem.s.offInstrNextByte = 0;
2033 pVCpu->iem.s.offCurInstrStart = 0;
2034 pVCpu->iem.s.cbInstrBuf = 0;
2035 pVCpu->iem.s.cbInstrBufTotal = 0;
2036#ifdef VBOX_STRICT
2037 pVCpu->iem.s.GCPhysInstrBuf = NIL_RTGCPHYS;
2038#endif
2039 }
2040#ifdef IEM_WITH_CODE_TLB_AND_OPCODE_BUF
2041 pVCpu->iem.s.offOpcode = 0;
2042#endif
2043}
2044
2045#ifdef LOG_ENABLED
2046
2047/**
2048 * Inserts a NOP call.
2049 *
2050 * This is for debugging.
2051 *
2052 * @returns true on success, false if we're out of call entries.
2053 * @param pTb The translation block being compiled.
2054 */
2055bool iemThreadedCompileEmitNop(PIEMTB pTb)
2056{
2057 /* Emit the call. */
2058 uint32_t const idxCall = pTb->Thrd.cCalls;
2059 AssertReturn(idxCall < pTb->Thrd.cAllocated, false);
2060 PIEMTHRDEDCALLENTRY pCall = &pTb->Thrd.paCalls[idxCall];
2061 pTb->Thrd.cCalls = (uint16_t)(idxCall + 1);
2062 pCall->enmFunction = kIemThreadedFunc_BltIn_Nop;
2063 pCall->idxInstr = pTb->cInstructions - 1;
2064 pCall->cbOpcode = 0;
2065 pCall->offOpcode = 0;
2066 pCall->uTbLookup = 0;
2067 pCall->fFlags = 0;
2068 pCall->auParams[0] = 0;
2069 pCall->auParams[1] = 0;
2070 pCall->auParams[2] = 0;
2071 return true;
2072}
2073
2074
2075/**
2076 * Called by iemThreadedCompile if CPU state logging is desired.
2077 *
2078 * @returns true on success, false if we're out of call entries.
2079 * @param pTb The translation block being compiled.
2080 */
2081bool iemThreadedCompileEmitLogCpuState(PIEMTB pTb)
2082{
2083 /* Emit the call. */
2084 uint32_t const idxCall = pTb->Thrd.cCalls;
2085 AssertReturn(idxCall < pTb->Thrd.cAllocated, false);
2086 PIEMTHRDEDCALLENTRY pCall = &pTb->Thrd.paCalls[idxCall];
2087 pTb->Thrd.cCalls = (uint16_t)(idxCall + 1);
2088 pCall->enmFunction = kIemThreadedFunc_BltIn_LogCpuState;
2089 pCall->idxInstr = pTb->cInstructions - 1;
2090 pCall->cbOpcode = 0;
2091 pCall->offOpcode = 0;
2092 pCall->uTbLookup = 0;
2093 pCall->fFlags = 0;
2094 pCall->auParams[0] = RT_MAKE_U16(pCall->idxInstr, idxCall); /* currently not used, but whatever */
2095 pCall->auParams[1] = 0;
2096 pCall->auParams[2] = 0;
2097 return true;
2098}
2099
2100#endif /* LOG_ENABLED */
2101
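/**
 * Copies the (at most 15) opcode bytes of the current instruction from the
 * decoder buffer into the TB opcode storage; unrolled so it can be inlined.
 */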
2102DECLINLINE(void) iemThreadedCopyOpcodeBytesInline(PCVMCPUCC pVCpu, uint8_t *pbDst, uint8_t cbInstr)
2103{
2104 switch (cbInstr)
2105 {
2106 default: AssertMsgFailed(("%#x\n", cbInstr)); RT_FALL_THROUGH();
2107 case 15: pbDst[14] = pVCpu->iem.s.abOpcode[14]; RT_FALL_THROUGH();
2108 case 14: pbDst[13] = pVCpu->iem.s.abOpcode[13]; RT_FALL_THROUGH();
2109 case 13: pbDst[12] = pVCpu->iem.s.abOpcode[12]; RT_FALL_THROUGH();
2110 case 12: pbDst[11] = pVCpu->iem.s.abOpcode[11]; RT_FALL_THROUGH();
2111 case 11: pbDst[10] = pVCpu->iem.s.abOpcode[10]; RT_FALL_THROUGH();
2112 case 10: pbDst[9] = pVCpu->iem.s.abOpcode[9]; RT_FALL_THROUGH();
2113 case 9: pbDst[8] = pVCpu->iem.s.abOpcode[8]; RT_FALL_THROUGH();
2114 case 8: pbDst[7] = pVCpu->iem.s.abOpcode[7]; RT_FALL_THROUGH();
2115 case 7: pbDst[6] = pVCpu->iem.s.abOpcode[6]; RT_FALL_THROUGH();
2116 case 6: pbDst[5] = pVCpu->iem.s.abOpcode[5]; RT_FALL_THROUGH();
2117 case 5: pbDst[4] = pVCpu->iem.s.abOpcode[4]; RT_FALL_THROUGH();
2118 case 4: pbDst[3] = pVCpu->iem.s.abOpcode[3]; RT_FALL_THROUGH();
2119 case 3: pbDst[2] = pVCpu->iem.s.abOpcode[2]; RT_FALL_THROUGH();
2120 case 2: pbDst[1] = pVCpu->iem.s.abOpcode[1]; RT_FALL_THROUGH();
2121 case 1: pbDst[0] = pVCpu->iem.s.abOpcode[0]; break;
2122 }
2123}
2124
2125#ifdef IEM_WITH_INTRA_TB_JUMPS
2126
2127/**
2128 * Emits the necessary tail calls for a full TB loop-jump.
2129 */
2130static bool iemThreadedCompileFullTbJump(PVMCPUCC pVCpu, PIEMTB pTb)
2131{
2132 /*
2133 * We need a timer and maybe IRQ check before jumping, so make sure
2134 * we've got sufficient call entries left before emitting anything.
2135 */
2136 uint32_t idxCall = pTb->Thrd.cCalls;
2137 if (idxCall + 1U <= pTb->Thrd.cAllocated)
2138 {
2139 /*
2140 * We're good, emit the calls.
2141 */
2142 PIEMTHRDEDCALLENTRY pCall = &pTb->Thrd.paCalls[idxCall];
2143 pTb->Thrd.cCalls = (uint16_t)(idxCall + 2);
2144
2145 /* Always check timers as we risk getting stuck in a loop otherwise. We
2146 combine it with an IRQ check if that's not performed in the TB already. */
2147 pCall->enmFunction = pVCpu->iem.s.idxLastCheckIrqCallNo < idxCall
2148 ? kIemThreadedFunc_BltIn_CheckTimers
2149 : kIemThreadedFunc_BltIn_CheckTimersAndIrq;
2150 pCall->idxInstr = 0;
2151 pCall->offOpcode = 0;
2152 pCall->cbOpcode = 0;
2153 pCall->uTbLookup = 0;
2154 pCall->fFlags = 0;
2155 pCall->auParams[0] = 0;
2156 pCall->auParams[1] = 0;
2157 pCall->auParams[2] = 0;
2158 pCall++;
2159
2160 /* The jump callentry[0]. */
2161 pCall->enmFunction = kIemThreadedFunc_BltIn_Jump;
2162 pCall->idxInstr = 0;
2163 pCall->offOpcode = 0;
2164 pCall->cbOpcode = 0;
2165 pCall->uTbLookup = 0;
2166 pCall->fFlags = 0;
2167 pCall->auParams[0] = 0; /* jump target is call zero */
2168 pCall->auParams[1] = 0;
2169 pCall->auParams[2] = 0;
2170
2171 /* Mark callentry #0 as a jump target. */
2172 pTb->Thrd.paCalls[0].fFlags |= IEMTHREADEDCALLENTRY_F_JUMP_TARGET;
2173 }
2174
2175 return false;
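    /* Returning false ends the TB compilation here; when the tail calls were emitted
       above, they will do the timer/IRQ check at execution time and then jump back to
       call entry #0. */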
2176}
2177
2178/**
2179 * Called by IEM_MC2_BEGIN_EMIT_CALLS when it detects that we're back at the
2180 * first instruction and we didn't just branch to it (that's handled below).
2181 *
2182 * This will emit a loop iff everything is compatible with that.
2183 */
2184DECLHIDDEN(int) iemThreadedCompileBackAtFirstInstruction(PVMCPU pVCpu, PIEMTB pTb) RT_NOEXCEPT
2185{
2186 /* Check if the mode matches. */
2187 if ( (pVCpu->iem.s.fExec & IEMTB_F_IEM_F_MASK & IEMTB_F_KEY_MASK)
2188 == (pTb->fFlags & IEMTB_F_KEY_MASK & ~IEMTB_F_CS_LIM_CHECKS))
2189 {
2190 STAM_REL_COUNTER_INC(&pVCpu->iem.s.StatTbLoopFullTbDetected2);
2191 iemThreadedCompileFullTbJump(pVCpu, pTb);
2192 }
2193 return VINF_IEM_RECOMPILE_END_TB;
2194}
2195
2196#endif /* IEM_WITH_INTRA_TB_JUMPS */
2197
2198
2199/**
2200 * Called by IEM_MC2_BEGIN_EMIT_CALLS() under one of these conditions:
2201 *
2202 * - CS LIM check required.
2203 * - Must recheck opcode bytes.
2204 * - Previous instruction branched.
2205 * - TLB load detected, probably due to page crossing.
2206 *
2207 * @returns true if everything went well, false if we're out of space in the TB
2208 * (e.g. opcode ranges) or needs to start doing CS.LIM checks.
2209 * @param pVCpu The cross context virtual CPU structure of the calling
2210 * thread.
2211 * @param pTb The translation block being compiled.
2212 */
2213bool iemThreadedCompileBeginEmitCallsComplications(PVMCPUCC pVCpu, PIEMTB pTb)
2214{
2215 Log6(("%04x:%08RX64: iemThreadedCompileBeginEmitCallsComplications\n", pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip));
2216 Assert((pVCpu->iem.s.GCPhysInstrBuf & GUEST_PAGE_OFFSET_MASK) == 0);
2217#if 0
2218 if (pVCpu->cpum.GstCtx.rip >= 0xc0000000 && !LogIsEnabled())
2219 RTLogChangeFlags(NULL, 0, RTLOGFLAGS_DISABLED);
2220#endif
2221
2222 /*
2223     * If we're not in 64-bit mode and not already checking CS.LIM, we need to
2224     * determine whether we have to start checking.
2225 */
2226 bool fConsiderCsLimChecking;
2227 uint32_t const fMode = pVCpu->iem.s.fExec & IEM_F_MODE_MASK;
2228 if ( fMode == IEM_F_MODE_X86_64BIT
2229 || (pTb->fFlags & IEMTB_F_CS_LIM_CHECKS)
2230 || fMode == IEM_F_MODE_X86_32BIT_PROT_FLAT
2231 || fMode == IEM_F_MODE_X86_32BIT_FLAT)
2232 fConsiderCsLimChecking = false; /* already enabled or not needed */
2233 else
2234 {
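        /* If there's at least a page plus a maximum-length instruction of headroom
           below CS.LIM we can defer per-instruction limit checking; otherwise bail
           out so the block gets compiled with CS.LIM checks (see the function docs). */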
2235 int64_t const offFromLim = (int64_t)pVCpu->cpum.GstCtx.cs.u32Limit - (int64_t)pVCpu->cpum.GstCtx.eip;
2236 if (offFromLim >= GUEST_PAGE_SIZE + 16 - (int32_t)(pVCpu->cpum.GstCtx.cs.u64Base & GUEST_PAGE_OFFSET_MASK))
2237 fConsiderCsLimChecking = true; /* likely */
2238 else
2239 {
2240 Log8(("%04x:%08RX64: Needs CS.LIM checks (%#RX64)\n", pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip, offFromLim));
2241 return false;
2242 }
2243 }
2244
2245 /*
2246     * Prepare the call now, even before we know if we can accept the instruction in this TB.
2247     * This allows us to amend parameters without making every case suffer.
2248 */
2249 uint8_t const cbInstr = IEM_GET_INSTR_LEN(pVCpu);
2250 uint16_t const offOpcode = pTb->cbOpcodes;
2251 uint8_t idxRange = pTb->cRanges - 1;
2252
2253 PIEMTHRDEDCALLENTRY const pCall = &pTb->Thrd.paCalls[pTb->Thrd.cCalls];
2254 pCall->idxInstr = pTb->cInstructions;
2255 pCall->cbOpcode = cbInstr;
2256 pCall->offOpcode = offOpcode;
2257 pCall->uTbLookup = 0;
2258 pCall->fFlags = 0;
2259 pCall->auParams[0] = (uint32_t)cbInstr
2260 | (uint32_t)(pVCpu->iem.s.fExec << 8) /* liveness: Enough of fExec for IEM_F_MODE_X86_IS_FLAT. */
2261 /* The upper dword is sometimes used for cbStartPage. */;
2262 pCall->auParams[1] = idxRange;
2263 pCall->auParams[2] = offOpcode - pTb->aRanges[idxRange].offOpcodes;
2264
2265/** @todo check if we require IEMTB_F_CS_LIM_CHECKS for any new page we've
2266 * gotten onto. If we do, stop */
2267
2268 /*
2269 * Case 1: We've branched (RIP changed).
2270 *
2271 * Loop check: If the new PC (GCPhysPC) is within a opcode range of this
2272 * TB, end the TB here as it is most likely a loop and if it
2273 * made sense to unroll it, the guest code compiler should've
2274 * done it already.
2275 *
2276 * Sub-case 1a: Same page, no TLB load (fTbCrossedPage is false).
2277 * Req: 1 extra range, no extra phys.
2278 *
2279     * Sub-case 1b: Different page but no page boundary crossing, so TLB load
2280 * necessary (fTbCrossedPage is true).
2281 * Req: 1 extra range, probably 1 extra phys page entry.
2282 *
2283 * Sub-case 1c: Different page, so TLB load necessary (fTbCrossedPage is true),
2284 * but in addition we cross into the following page and require
2285 * another TLB load.
2286 * Req: 2 extra ranges, probably 2 extra phys page entries.
2287 *
2288 * Sub-case 1d: Same page, so no initial TLB load necessary, but we cross into
2289 * the following page (thus fTbCrossedPage is true).
2290 * Req: 2 extra ranges, probably 1 extra phys page entry.
2291 *
2292     * Note! fTbCrossedPage is set by iemOpcodeFetchBytesJmp, but it may trigger
2293     *       "spuriously" from the CPU's point of view because of physical page
2294     *       changes that invalidate the physical TLB and trigger a call to the
2295     *       function.  In theory this shouldn't be a big deal, just a bit of a
2296     *       performance loss as we'll pick the LoadingTlb variants.
2297 *
2298 * Note! We do not currently optimize branching to the next instruction (sorry
2299 * 32-bit PIC code). We could maybe do that in the branching code that
2300 * sets (or not) fTbBranched.
2301 */
2302 /** @todo Optimize 'jmp .next_instr' and 'call .next_instr'. Seen the jmp
2303 * variant in win 3.1 code and the call variant in 32-bit linux PIC
2304 * code. This'll require filtering out far jmps and calls, as they
2305 * load CS which should technically be considered indirect since the
2306 * GDT/LDT entry's base address can be modified independently from
2307 * the code. */
2308 if (pVCpu->iem.s.fTbBranched != IEMBRANCHED_F_NO)
2309 {
2310 if ( !pVCpu->iem.s.fTbCrossedPage /* 1a */
2311 || pVCpu->iem.s.offCurInstrStart >= 0 /* 1b */ )
2312 {
2313 /* 1a + 1b - instruction fully within the branched to page. */
2314 Assert(pVCpu->iem.s.offCurInstrStart >= 0);
2315 Assert(pVCpu->iem.s.offCurInstrStart + cbInstr <= GUEST_PAGE_SIZE);
2316
2317 if (!(pVCpu->iem.s.fTbBranched & IEMBRANCHED_F_ZERO))
2318 {
2319 /* Check that we've got a free range. */
2320 idxRange += 1;
2321 if (idxRange < RT_ELEMENTS(pTb->aRanges))
2322 { /* likely */ }
2323 else
2324 {
2325 Log8(("%04x:%08RX64: out of ranges after branch\n", pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip));
2326 return false;
2327 }
2328 pCall->auParams[1] = idxRange;
2329 pCall->auParams[2] = 0;
2330
2331 /* Check that we've got a free page slot. */
2332 AssertCompile(RT_ELEMENTS(pTb->aGCPhysPages) == 2);
2333 RTGCPHYS const GCPhysNew = pVCpu->iem.s.GCPhysInstrBuf & ~(RTGCPHYS)GUEST_PAGE_OFFSET_MASK;
2334 uint8_t idxPhysPage;
2335 if ((pTb->GCPhysPc & ~(RTGCPHYS)GUEST_PAGE_OFFSET_MASK) == GCPhysNew)
2336 pTb->aRanges[idxRange].idxPhysPage = idxPhysPage = 0;
2337 else if (pTb->aGCPhysPages[0] == NIL_RTGCPHYS)
2338 {
2339 pTb->aGCPhysPages[0] = GCPhysNew;
2340 pTb->aRanges[idxRange].idxPhysPage = 1;
2341 idxPhysPage = UINT8_MAX;
2342 }
2343 else if (pTb->aGCPhysPages[0] == GCPhysNew)
2344 pTb->aRanges[idxRange].idxPhysPage = idxPhysPage = 1;
2345 else if (pTb->aGCPhysPages[1] == NIL_RTGCPHYS)
2346 {
2347 pTb->aGCPhysPages[1] = GCPhysNew;
2348 pTb->aRanges[idxRange].idxPhysPage = 2;
2349 idxPhysPage = UINT8_MAX;
2350 }
2351 else if (pTb->aGCPhysPages[1] == GCPhysNew)
2352 pTb->aRanges[idxRange].idxPhysPage = idxPhysPage = 2;
2353 else
2354 {
2355                     Log8(("%04x:%08RX64: out of aGCPhysPages entries after branch\n", pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip));
2356 return false;
2357 }
2358
2359 /* Loop check: We weave the loop check in here to optimize the lookup. */
2360 if (idxPhysPage != UINT8_MAX)
2361 {
2362 uint32_t const offPhysPc = pVCpu->iem.s.offCurInstrStart;
2363 for (uint8_t idxLoopRange = 0; idxLoopRange < idxRange; idxLoopRange++)
2364 if ( pTb->aRanges[idxLoopRange].idxPhysPage == idxPhysPage
2365 && offPhysPc - (uint32_t)pTb->aRanges[idxLoopRange].offPhysPage
2366 < (uint32_t)pTb->aRanges[idxLoopRange].cbOpcodes)
2367 {
2368 Log8(("%04x:%08RX64: loop detected after branch\n", pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip));
2369#ifdef IEM_WITH_INTRA_TB_JUMPS
2370 /* If we're looping back to the start of the TB and the mode is still the same,
2371 we could emit a jump optimization. For now we don't do page transitions
2372 as that implies TLB loading and such. */
2373 if ( idxLoopRange == 0
2374 && offPhysPc == pTb->aRanges[0].offPhysPage
2375 && (pVCpu->iem.s.fExec & IEMTB_F_IEM_F_MASK & IEMTB_F_KEY_MASK)
2376 == (pTb->fFlags & IEMTB_F_KEY_MASK & ~IEMTB_F_CS_LIM_CHECKS)
2377 && (pVCpu->iem.s.fTbBranched & ( IEMBRANCHED_F_INDIRECT | IEMBRANCHED_F_FAR
2378 | IEMBRANCHED_F_STACK | IEMBRANCHED_F_RELATIVE))
2379 == IEMBRANCHED_F_RELATIVE)
2380 {
2381 STAM_REL_COUNTER_INC(&pVCpu->iem.s.StatTbLoopFullTbDetected);
2382 return iemThreadedCompileFullTbJump(pVCpu, pTb);
2383 }
2384#endif
2385 STAM_REL_COUNTER_INC(&pVCpu->iem.s.StatTbLoopInTbDetected);
2386 return false;
2387 }
2388 }
2389
2390 /* Finish setting up the new range. */
2391 pTb->aRanges[idxRange].offPhysPage = pVCpu->iem.s.offCurInstrStart;
2392 pTb->aRanges[idxRange].offOpcodes = offOpcode;
2393 pTb->aRanges[idxRange].cbOpcodes = cbInstr;
2394 pTb->aRanges[idxRange].u2Unused = 0;
2395 pTb->cRanges++;
2396 Log6(("%04x:%08RX64: new range #%u same page: offPhysPage=%#x offOpcodes=%#x\n",
2397 pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip, idxRange, pTb->aRanges[idxRange].offPhysPage,
2398 pTb->aRanges[idxRange].offOpcodes));
2399 }
2400 else
2401 {
2402 Log8(("%04x:%08RX64: zero byte jump\n", pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip));
2403 pTb->aRanges[idxRange].cbOpcodes += cbInstr;
2404 }
2405
2406                 /* Determine which function we need to load & check.
2407                    Note! For jumps to a new page, we'll set both fTbBranched and
2408                          fTbCrossedPage to avoid unnecessary TLB work for intra-page
2409                          branching. */
2410 if ( (pVCpu->iem.s.fTbBranched & (IEMBRANCHED_F_INDIRECT | IEMBRANCHED_F_FAR)) /* Far is basically indirect. */
2411 || pVCpu->iem.s.fTbCrossedPage)
2412 pCall->enmFunction = pTb->fFlags & IEMTB_F_CS_LIM_CHECKS
2413 ? kIemThreadedFunc_BltIn_CheckCsLimAndOpcodesLoadingTlb
2414 : !fConsiderCsLimChecking
2415 ? kIemThreadedFunc_BltIn_CheckOpcodesLoadingTlb
2416 : kIemThreadedFunc_BltIn_CheckOpcodesLoadingTlbConsiderCsLim;
2417 else if (pVCpu->iem.s.fTbBranched & (IEMBRANCHED_F_CONDITIONAL | /* paranoia: */ IEMBRANCHED_F_DIRECT))
2418 pCall->enmFunction = pTb->fFlags & IEMTB_F_CS_LIM_CHECKS
2419 ? kIemThreadedFunc_BltIn_CheckCsLimAndPcAndOpcodes
2420 : !fConsiderCsLimChecking
2421 ? kIemThreadedFunc_BltIn_CheckPcAndOpcodes
2422 : kIemThreadedFunc_BltIn_CheckPcAndOpcodesConsiderCsLim;
2423 else
2424 {
2425 Assert(pVCpu->iem.s.fTbBranched & IEMBRANCHED_F_RELATIVE);
2426 pCall->enmFunction = pTb->fFlags & IEMTB_F_CS_LIM_CHECKS
2427 ? kIemThreadedFunc_BltIn_CheckCsLimAndOpcodes
2428 : !fConsiderCsLimChecking
2429 ? kIemThreadedFunc_BltIn_CheckOpcodes
2430 : kIemThreadedFunc_BltIn_CheckOpcodesConsiderCsLim;
2431 }
2432 }
2433 else
2434 {
2435 /* 1c + 1d - instruction crosses pages. */
2436 Assert(pVCpu->iem.s.offCurInstrStart < 0);
2437 Assert(pVCpu->iem.s.offCurInstrStart + cbInstr > 0);
2438
2439             /* Lazy bird: Check that this isn't case 1c, since we've already
2440                           loaded the first physical address.  End the TB and
2441                           make it a case 2b instead.
2442
2443                           Hmm. Too much bother to detect, so just do the same
2444                           with case 1d as well. */
2445#if 0 /** @todo get back to this later when we've got the actual branch code in
2446 * place. */
2447 uint8_t const cbStartPage = (uint8_t)-pVCpu->iem.s.offCurInstrStart;
2448
2449 /* Check that we've got two free ranges. */
2450 if (idxRange + 2 < RT_ELEMENTS(pTb->aRanges))
2451 { /* likely */ }
2452 else
2453 return false;
2454 idxRange += 1;
2455 pCall->auParams[1] = idxRange;
2456 pCall->auParams[2] = 0;
2457
2458 /* ... */
2459
2460#else
2461 Log8(("%04x:%08RX64: complicated post-branch condition, ending TB.\n", pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip));
2462 return false;
2463#endif
2464 }
2465 }
2466
2467 /*
2468 * Case 2: Page crossing.
2469 *
2470 * Sub-case 2a: The instruction starts on the first byte in the next page.
2471 *
2472 * Sub-case 2b: The instruction has opcode bytes in both the current and
2473 * following page.
2474 *
2475 * Both cases requires a new range table entry and probably a new physical
2476 * page entry. The difference is in which functions to emit and whether to
2477 * add bytes to the current range.
2478 */
2479 else if (pVCpu->iem.s.fTbCrossedPage)
2480 {
2481 /* Check that we've got a free range. */
2482 idxRange += 1;
2483 if (idxRange < RT_ELEMENTS(pTb->aRanges))
2484 { /* likely */ }
2485 else
2486 {
2487 Log8(("%04x:%08RX64: out of ranges while crossing page\n", pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip));
2488 return false;
2489 }
2490
2491 /* Check that we've got a free page slot. */
2492 AssertCompile(RT_ELEMENTS(pTb->aGCPhysPages) == 2);
2493 RTGCPHYS const GCPhysNew = pVCpu->iem.s.GCPhysInstrBuf & ~(RTGCPHYS)GUEST_PAGE_OFFSET_MASK;
2494 if ((pTb->GCPhysPc & ~(RTGCPHYS)GUEST_PAGE_OFFSET_MASK) == GCPhysNew)
2495 pTb->aRanges[idxRange].idxPhysPage = 0;
2496 else if ( pTb->aGCPhysPages[0] == NIL_RTGCPHYS
2497 || pTb->aGCPhysPages[0] == GCPhysNew)
2498 {
2499 pTb->aGCPhysPages[0] = GCPhysNew;
2500 pTb->aRanges[idxRange].idxPhysPage = 1;
2501 }
2502 else if ( pTb->aGCPhysPages[1] == NIL_RTGCPHYS
2503 || pTb->aGCPhysPages[1] == GCPhysNew)
2504 {
2505 pTb->aGCPhysPages[1] = GCPhysNew;
2506 pTb->aRanges[idxRange].idxPhysPage = 2;
2507 }
2508 else
2509 {
2510             Log8(("%04x:%08RX64: out of aGCPhysPages entries while crossing page\n", pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip));
2511 return false;
2512 }
2513
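        /* Sub-case 2a: the previous range ended exactly on a page boundary, so the new
           instruction starts at offset zero of the new page.  Otherwise it's sub-case
           2b and the instruction must be split across the two range entries. */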
2514 if (((pTb->aRanges[idxRange - 1].offPhysPage + pTb->aRanges[idxRange - 1].cbOpcodes) & GUEST_PAGE_OFFSET_MASK) == 0)
2515 {
2516 Assert(pVCpu->iem.s.offCurInstrStart == 0);
2517 pCall->auParams[1] = idxRange;
2518 pCall->auParams[2] = 0;
2519
2520 /* Finish setting up the new range. */
2521 pTb->aRanges[idxRange].offPhysPage = pVCpu->iem.s.offCurInstrStart;
2522 pTb->aRanges[idxRange].offOpcodes = offOpcode;
2523 pTb->aRanges[idxRange].cbOpcodes = cbInstr;
2524 pTb->aRanges[idxRange].u2Unused = 0;
2525 pTb->cRanges++;
2526 Log6(("%04x:%08RX64: new range #%u new page (a) %u/%RGp: offPhysPage=%#x offOpcodes=%#x\n",
2527 pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip, idxRange, pTb->aRanges[idxRange].idxPhysPage, GCPhysNew,
2528 pTb->aRanges[idxRange].offPhysPage, pTb->aRanges[idxRange].offOpcodes));
2529
2530             /* Determine which function we need to load & check. */
2531 pCall->enmFunction = pTb->fFlags & IEMTB_F_CS_LIM_CHECKS
2532 ? kIemThreadedFunc_BltIn_CheckCsLimAndOpcodesOnNewPageLoadingTlb
2533 : !fConsiderCsLimChecking
2534 ? kIemThreadedFunc_BltIn_CheckOpcodesOnNewPageLoadingTlb
2535 : kIemThreadedFunc_BltIn_CheckOpcodesOnNewPageLoadingTlbConsiderCsLim;
2536 }
2537 else
2538 {
2539 Assert(pVCpu->iem.s.offCurInstrStart < 0);
2540 Assert(pVCpu->iem.s.offCurInstrStart + cbInstr > 0);
2541 uint8_t const cbStartPage = (uint8_t)-pVCpu->iem.s.offCurInstrStart;
2542 pCall->auParams[0] |= (uint64_t)cbStartPage << 32;
2543
2544             /* We're good.  Split the instruction over the old and new range table entries. */
2545 pTb->aRanges[idxRange - 1].cbOpcodes += cbStartPage;
2546
2547 pTb->aRanges[idxRange].offPhysPage = 0;
2548 pTb->aRanges[idxRange].offOpcodes = offOpcode + cbStartPage;
2549 pTb->aRanges[idxRange].cbOpcodes = cbInstr - cbStartPage;
2550 pTb->aRanges[idxRange].u2Unused = 0;
2551 pTb->cRanges++;
2552 Log6(("%04x:%08RX64: new range #%u new page (b) %u/%RGp: offPhysPage=%#x offOpcodes=%#x\n",
2553 pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip, idxRange, pTb->aRanges[idxRange].idxPhysPage, GCPhysNew,
2554 pTb->aRanges[idxRange].offPhysPage, pTb->aRanges[idxRange].offOpcodes));
2555
2556             /* Determine which function we need to load & check. */
2557 if (pVCpu->iem.s.fTbCheckOpcodes)
2558 pCall->enmFunction = pTb->fFlags & IEMTB_F_CS_LIM_CHECKS
2559 ? kIemThreadedFunc_BltIn_CheckCsLimAndOpcodesAcrossPageLoadingTlb
2560 : !fConsiderCsLimChecking
2561 ? kIemThreadedFunc_BltIn_CheckOpcodesAcrossPageLoadingTlb
2562 : kIemThreadedFunc_BltIn_CheckOpcodesAcrossPageLoadingTlbConsiderCsLim;
2563 else
2564 pCall->enmFunction = pTb->fFlags & IEMTB_F_CS_LIM_CHECKS
2565 ? kIemThreadedFunc_BltIn_CheckCsLimAndOpcodesOnNextPageLoadingTlb
2566 : !fConsiderCsLimChecking
2567 ? kIemThreadedFunc_BltIn_CheckOpcodesOnNextPageLoadingTlb
2568 : kIemThreadedFunc_BltIn_CheckOpcodesOnNextPageLoadingTlbConsiderCsLim;
2569 }
2570 }
2571
2572 /*
2573 * Regular case: No new range required.
2574 */
2575 else
2576 {
2577 Assert(pVCpu->iem.s.fTbCheckOpcodes || (pTb->fFlags & IEMTB_F_CS_LIM_CHECKS));
2578 if (pVCpu->iem.s.fTbCheckOpcodes)
2579 pCall->enmFunction = pTb->fFlags & IEMTB_F_CS_LIM_CHECKS
2580 ? kIemThreadedFunc_BltIn_CheckCsLimAndOpcodes
2581 : kIemThreadedFunc_BltIn_CheckOpcodes;
2582 else
2583 pCall->enmFunction = kIemThreadedFunc_BltIn_CheckCsLim;
2584
2585 iemThreadedCopyOpcodeBytesInline(pVCpu, &pTb->pabOpcodes[offOpcode], cbInstr);
2586 pTb->cbOpcodes = offOpcode + cbInstr;
2587 pTb->aRanges[idxRange].cbOpcodes += cbInstr;
2588 Assert(pTb->cbOpcodes <= pVCpu->iem.s.cbOpcodesAllocated);
2589 }
2590
2591 /*
2592 * Commit the call.
2593 */
2594 pTb->Thrd.cCalls++;
2595
2596 /*
2597 * Clear state.
2598 */
2599 pVCpu->iem.s.fTbBranched = IEMBRANCHED_F_NO;
2600 pVCpu->iem.s.fTbCrossedPage = false;
2601 pVCpu->iem.s.fTbCheckOpcodes = false;
2602
2603 /*
2604 * Copy opcode bytes.
2605 */
2606 iemThreadedCopyOpcodeBytesInline(pVCpu, &pTb->pabOpcodes[offOpcode], cbInstr);
2607 pTb->cbOpcodes = offOpcode + cbInstr;
2608 Assert(pTb->cbOpcodes <= pVCpu->iem.s.cbOpcodesAllocated);
2609
2610 return true;
2611}
2612
2613
2614/**
2615 * Worker for iemThreadedCompileBeginEmitCallsComplications and
2616 * iemThreadedCompileCheckIrq that checks for pending deliverable events.
2617 *
2618 * @returns true if anything is pending, false if not.
2619 * @param pVCpu The cross context virtual CPU structure of the calling
2620 * thread.
2621 */
2622DECL_FORCE_INLINE(bool) iemThreadedCompileIsIrqOrForceFlagPending(PVMCPUCC pVCpu)
2623{
2624 uint64_t fCpu = pVCpu->fLocalForcedActions;
2625 fCpu &= VMCPU_FF_INTERRUPT_APIC | VMCPU_FF_INTERRUPT_PIC | VMCPU_FF_INTERRUPT_NMI | VMCPU_FF_INTERRUPT_SMI;
2626#if 1
2627 /** @todo this isn't even close to the NMI/IRQ conditions in EM. */
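    /* Report 'nothing pending' when no relevant force flag is set, or when only
       PIC/APIC interrupts are pending but they cannot be delivered right now
       (IF clear or interrupt shadow). */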
2628 if (RT_LIKELY( !fCpu
2629 || ( !(fCpu & ~(VMCPU_FF_INTERRUPT_APIC | VMCPU_FF_INTERRUPT_PIC))
2630 && ( !pVCpu->cpum.GstCtx.rflags.Bits.u1IF
2631 || CPUMIsInInterruptShadow(&pVCpu->cpum.GstCtx))) ))
2632 return false;
2633 return true;
2634#else
2635 return false;
2636#endif
2637
2638}
2639
2640
2641/**
2642 * Called by iemThreadedCompile when a block requires a mode check.
2643 *
2644 * @returns true if we should continue, false if we're out of call entries.
2645 * @param pVCpu The cross context virtual CPU structure of the calling
2646 * thread.
2647 * @param pTb The translation block being compiled.
2648 */
2649static bool iemThreadedCompileEmitCheckMode(PVMCPUCC pVCpu, PIEMTB pTb)
2650{
2651 /* Emit the call. */
2652 uint32_t const idxCall = pTb->Thrd.cCalls;
2653 AssertReturn(idxCall < pTb->Thrd.cAllocated, false);
2654 PIEMTHRDEDCALLENTRY pCall = &pTb->Thrd.paCalls[idxCall];
2655 pTb->Thrd.cCalls = (uint16_t)(idxCall + 1);
2656 pCall->enmFunction = kIemThreadedFunc_BltIn_CheckMode;
2657 pCall->idxInstr = pTb->cInstructions - 1;
2658 pCall->cbOpcode = 0;
2659 pCall->offOpcode = 0;
2660 pCall->uTbLookup = 0;
2661 pCall->fFlags = 0;
2662 pCall->auParams[0] = pVCpu->iem.s.fExec;
2663 pCall->auParams[1] = 0;
2664 pCall->auParams[2] = 0;
2665 LogFunc(("%04x:%08RX64 fExec=%#x\n", pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip, pVCpu->iem.s.fExec));
2666 return true;
2667}
2668
2669
2670/**
2671 * Called by IEM_MC2_BEGIN_EMIT_CALLS() when IEM_CIMPL_F_CHECK_IRQ_BEFORE is
2672 * set.
2673 *
2674 * @returns true if we should continue, false if an IRQ is deliverable or a
2675 * relevant force flag is pending.
2676 * @param pVCpu The cross context virtual CPU structure of the calling
2677 * thread.
2678 * @param pTb The translation block being compiled.
2679 * @sa iemThreadedCompileCheckIrq
2680 */
2681bool iemThreadedCompileEmitIrqCheckBefore(PVMCPUCC pVCpu, PIEMTB pTb)
2682{
2683 /*
2684     * Skip this if we've already emitted a call after the previous instruction
2685 * or if it's the first call, as we're always checking FFs between blocks.
2686 */
2687 uint32_t const idxCall = pTb->Thrd.cCalls;
2688 if ( idxCall > 0
2689 && pTb->Thrd.paCalls[idxCall - 1].enmFunction != kIemThreadedFunc_BltIn_CheckIrq)
2690 {
2691 /* Emit the call. */
2692 AssertReturn(idxCall < pTb->Thrd.cAllocated, false);
2693 pVCpu->iem.s.idxLastCheckIrqCallNo = (uint16_t)idxCall;
2694 pTb->Thrd.cCalls = (uint16_t)(idxCall + 1);
2695 PIEMTHRDEDCALLENTRY pCall = &pTb->Thrd.paCalls[idxCall];
2696 pCall->enmFunction = kIemThreadedFunc_BltIn_CheckIrq;
2697 pCall->idxInstr = pTb->cInstructions;
2698 pCall->offOpcode = 0;
2699 pCall->cbOpcode = 0;
2700 pCall->uTbLookup = 0;
2701 pCall->fFlags = 0;
2702 pCall->auParams[0] = 0;
2703 pCall->auParams[1] = 0;
2704 pCall->auParams[2] = 0;
2705 LogFunc(("%04x:%08RX64\n", pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip));
2706
2707 /* Reset the IRQ check value. */
2708 pVCpu->iem.s.cInstrTillIrqCheck = !CPUMIsInInterruptShadow(&pVCpu->cpum.GstCtx) ? 32 : 0;
2709
2710 /*
2711 * Check for deliverable IRQs and pending force flags.
2712 */
2713 return !iemThreadedCompileIsIrqOrForceFlagPending(pVCpu);
2714 }
2715 return true; /* continue */
2716}
2717
2718
2719/**
2720 * Emits an IRQ check call and checks for pending IRQs.
2721 *
2722 * @returns true if we should continue, false if an IRQ is deliverable or a
2723 * relevant force flag is pending.
2724 * @param pVCpu The cross context virtual CPU structure of the calling
2725 * thread.
2726 * @param   pTb     The translation block.
2727 * @sa iemThreadedCompileBeginEmitCallsComplications
2728 */
2729static bool iemThreadedCompileCheckIrqAfter(PVMCPUCC pVCpu, PIEMTB pTb)
2730{
2731 /* Check again in a little bit, unless it is immediately following an STI
2732 in which case we *must* check immediately after the next instruction
2733 as well in case it's executed with interrupt inhibition. We could
2734       otherwise miss the interrupt window.  See the irq2 wait2 variant in
2735 bs3-timers-1 which is doing sti + sti + cli. */
2736 if (!pVCpu->iem.s.fTbCurInstrIsSti)
2737 pVCpu->iem.s.cInstrTillIrqCheck = 32;
2738 else
2739 {
2740 pVCpu->iem.s.fTbCurInstrIsSti = false;
2741 pVCpu->iem.s.cInstrTillIrqCheck = 0;
2742 }
2743 LogFunc(("%04x:%08RX64\n", pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip));
2744
2745 /*
2746 * Emit the call.
2747 */
2748 uint32_t const idxCall = pTb->Thrd.cCalls;
2749 AssertReturn(idxCall < pTb->Thrd.cAllocated, false);
2750 pVCpu->iem.s.idxLastCheckIrqCallNo = (uint16_t)idxCall;
2751 pTb->Thrd.cCalls = (uint16_t)(idxCall + 1);
2752 PIEMTHRDEDCALLENTRY pCall = &pTb->Thrd.paCalls[idxCall];
2753 pCall->enmFunction = kIemThreadedFunc_BltIn_CheckIrq;
2754 pCall->idxInstr = pTb->cInstructions;
2755 pCall->offOpcode = 0;
2756 pCall->cbOpcode = 0;
2757 pCall->uTbLookup = 0;
2758 pCall->fFlags = 0;
2759 pCall->auParams[0] = 0;
2760 pCall->auParams[1] = 0;
2761 pCall->auParams[2] = 0;
2762
2763 /*
2764 * Check for deliverable IRQs and pending force flags.
2765 */
2766 return !iemThreadedCompileIsIrqOrForceFlagPending(pVCpu);
2767}
2768
2769
2770/**
2771 * Compiles a new TB and executes it.
2772 *
2773 * We combine compilation and execution here as it makes it simpler code flow
2774 * in the main loop and it allows interpreting while compiling if we want to
2775 * explore that option.
2776 *
2777 * @returns Strict VBox status code.
2778 * @param pVM The cross context virtual machine structure.
2779 * @param pVCpu The cross context virtual CPU structure of the calling
2780 * thread.
2781 * @param GCPhysPc The physical address corresponding to the current
2782 * RIP+CS.BASE.
2783 * @param fExtraFlags Extra translation block flags: IEMTB_F_INHIBIT_SHADOW,
2784 * IEMTB_F_INHIBIT_NMI, IEMTB_F_CS_LIM_CHECKS.
2785 */
2786static VBOXSTRICTRC iemThreadedCompile(PVMCC pVM, PVMCPUCC pVCpu, RTGCPHYS GCPhysPc, uint32_t fExtraFlags) IEM_NOEXCEPT_MAY_LONGJMP
2787{
2788 IEMTLBTRACE_TB_COMPILE(pVCpu, GCPhysPc);
2789 Assert(!(fExtraFlags & IEMTB_F_TYPE_MASK));
2790 fExtraFlags |= IEMTB_F_TYPE_THREADED;
2791
2792 /*
2793     * Get the TB we use for the recompiling.  This is a maxed-out TB, of which
2794     * we'll make a more efficient copy when we're done compiling.
2795 */
2796 PIEMTB pTb = pVCpu->iem.s.pThrdCompileTbR3;
2797 if (pTb)
2798 iemThreadedTbReuse(pVCpu, pTb, GCPhysPc, fExtraFlags);
2799 else
2800 {
2801 pTb = iemThreadedTbAlloc(pVM, pVCpu, GCPhysPc, fExtraFlags);
2802 AssertReturn(pTb, VERR_IEM_TB_ALLOC_FAILED);
2803 pVCpu->iem.s.pThrdCompileTbR3 = pTb;
2804 }
2805 pTb->FlatPc = pVCpu->iem.s.uInstrBufPc | (GCPhysPc & GUEST_PAGE_OFFSET_MASK);
2806
2807 /* Set the current TB so iemThreadedCompileLongJumped and the CIMPL
2808 functions may get at it. */
2809 pVCpu->iem.s.pCurTbR3 = pTb;
2810
2811#if 0
2812 /* Make sure the CheckIrq condition matches the one in EM. */
2813 iemThreadedCompileCheckIrqAfter(pVCpu, pTb);
2814 const uint32_t cZeroCalls = 1;
2815#else
2816 const uint32_t cZeroCalls = 0;
2817#endif
2818
2819 /*
2820     * Now for the recompilation. (This mimics IEMExecLots in many ways.)
2821 */
2822 iemThreadedCompileInitDecoder(pVCpu, false /*fReInit*/, fExtraFlags);
2823 iemThreadedCompileInitOpcodeFetching(pVCpu);
2824 VBOXSTRICTRC rcStrict;
2825 for (;;)
2826 {
2827 /* Process the next instruction. */
2828#ifdef LOG_ENABLED
2829 iemThreadedLogCurInstr(pVCpu, "CC", pTb->cInstructions);
2830 uint16_t const uCsLog = pVCpu->cpum.GstCtx.cs.Sel;
2831 uint64_t const uRipLog = pVCpu->cpum.GstCtx.rip;
2832 Assert(uCsLog != 0 || uRipLog > 0x400 || !IEM_IS_REAL_OR_V86_MODE(pVCpu)); /* Detect executing RM interrupt table. */
2833#endif
2834 uint8_t b; IEM_OPCODE_GET_FIRST_U8(&b);
2835 uint16_t const cCallsPrev = pTb->Thrd.cCalls;
2836
2837 rcStrict = FNIEMOP_CALL(g_apfnIemThreadedRecompilerOneByteMap[b]);
2838#if 0
2839 for (unsigned i = cCallsPrev; i < pTb->Thrd.cCalls; i++)
2840 Log8(("-> %#u/%u - %d %s\n", i, pTb->Thrd.paCalls[i].idxInstr, pTb->Thrd.paCalls[i].enmFunction,
2841 g_apszIemThreadedFunctions[pTb->Thrd.paCalls[i].enmFunction]));
2842#endif
2843 if ( rcStrict == VINF_SUCCESS
2844 && pVCpu->iem.s.rcPassUp == VINF_SUCCESS
2845 && !pVCpu->iem.s.fEndTb)
2846 {
2847 Assert(pTb->Thrd.cCalls > cCallsPrev);
2848 Assert(cCallsPrev - pTb->Thrd.cCalls < 5);
2849
2850 pVCpu->iem.s.cInstructions++;
2851
2852 /* Check for mode change _after_ certain CIMPL calls, so check that
2853 we continue executing with the same mode value. */
2854 if (!(pVCpu->iem.s.fTbCurInstr & (IEM_CIMPL_F_MODE | IEM_CIMPL_F_XCPT | IEM_CIMPL_F_VMEXIT)))
2855 { /* probable */ }
2856 else if (RT_LIKELY(iemThreadedCompileEmitCheckMode(pVCpu, pTb)))
2857 { /* extremely likely */ }
2858 else
2859 break;
2860
2861#if defined(LOG_ENABLED) && 0 /* for debugging */
2862 //iemThreadedCompileEmitNop(pTb);
2863 iemThreadedCompileEmitLogCpuState(pTb);
2864#endif
2865 }
2866 else
2867 {
2868 Log8(("%04x:%08RX64: End TB - %u instr, %u calls, rc=%d\n",
2869 uCsLog, uRipLog, pTb->cInstructions, pTb->Thrd.cCalls, VBOXSTRICTRC_VAL(rcStrict)));
2870 if (rcStrict == VINF_IEM_RECOMPILE_END_TB)
2871 rcStrict = VINF_SUCCESS;
2872
2873 if (pTb->Thrd.cCalls > cZeroCalls)
2874 {
2875 if (cCallsPrev != pTb->Thrd.cCalls)
2876 pVCpu->iem.s.cInstructions++;
2877 break;
2878 }
2879
2880 pVCpu->iem.s.pCurTbR3 = NULL;
2881 return iemExecStatusCodeFiddling(pVCpu, rcStrict);
2882 }
2883
2884 /* Check for IRQs? */
2885 if (pVCpu->iem.s.cInstrTillIrqCheck > 0)
2886 pVCpu->iem.s.cInstrTillIrqCheck--;
2887 else if (!iemThreadedCompileCheckIrqAfter(pVCpu, pTb))
2888 break;
2889
2890 /* Still space in the TB? */
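        /* (Headroom: a single instruction may add a handful of call entries and up to
            16 opcode bytes; the lookup-entry limit presumably matches the width of the
            index field in the call entry's uTbLookup member.) */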
2891 if ( pTb->Thrd.cCalls + 5 < pTb->Thrd.cAllocated
2892 && pTb->cbOpcodes + 16 <= pVCpu->iem.s.cbOpcodesAllocated
2893 && pTb->cTbLookupEntries < 127)
2894 iemThreadedCompileInitDecoder(pVCpu, true /*fReInit*/, 0);
2895 else
2896 {
2897 Log8(("%04x:%08RX64: End TB - %u instr, %u calls, %u opcode bytes, %u TB lookup entries - full\n",
2898 uCsLog, uRipLog, pTb->cInstructions, pTb->Thrd.cCalls, pTb->cbOpcodes, pTb->cTbLookupEntries));
2899 break;
2900 }
2901 iemThreadedCompileReInitOpcodeFetching(pVCpu);
2902 }
2903
2904 /*
2905 * Reserve lookup space for the final call entry if necessary.
2906 */
2907 PIEMTHRDEDCALLENTRY pFinalCall = &pTb->Thrd.paCalls[pTb->Thrd.cCalls - 1];
2908 if (pTb->Thrd.cCalls > 1)
2909 {
2910 if (pFinalCall->uTbLookup == 0)
2911 {
2912 pFinalCall->uTbLookup = IEM_TB_LOOKUP_TAB_MAKE(pTb->cTbLookupEntries, 0);
2913 pTb->cTbLookupEntries += 1;
2914 }
2915 }
2916 else if (pFinalCall->uTbLookup != 0)
2917 {
2918 Assert(pTb->cTbLookupEntries > 1);
2919 pFinalCall->uTbLookup -= 1;
2920 pTb->cTbLookupEntries -= 1;
2921 }
2922
2923 /*
2924 * Duplicate the TB into a completed one and link it.
2925 */
2926 pTb = iemThreadedTbDuplicate(pVM, pVCpu, pTb);
2927 AssertReturn(pTb, VERR_IEM_TB_ALLOC_FAILED);
2928
2929 iemThreadedTbAdd(pVCpu, pVCpu->iem.s.pTbCacheR3, pTb);
2930
2931#ifdef IEM_COMPILE_ONLY_MODE
2932 /*
2933 * Execute the translation block.
2934 */
2935#endif
2936
2937 return iemExecStatusCodeFiddling(pVCpu, rcStrict);
2938}
2939
2940
2941
2942/*********************************************************************************************************************************
2943* Threaded Translation Block Saving and Restoring for Profiling the Native Recompiler *
2944*********************************************************************************************************************************/
2945#if defined(VBOX_WITH_IEM_NATIVE_RECOMPILER) && defined(VBOX_WITH_SAVE_THREADED_TBS_FOR_PROFILING)
2946# include <iprt/message.h>
2947
2948static const SSMFIELD g_aIemThreadedTbFields[] =
2949{
2950 SSMFIELD_ENTRY( IEMTB, cUsed),
2951 SSMFIELD_ENTRY( IEMTB, msLastUsed),
2952 SSMFIELD_ENTRY_GCPHYS(IEMTB, GCPhysPc),
2953 SSMFIELD_ENTRY( IEMTB, fFlags),
2954 SSMFIELD_ENTRY( IEMTB, x86.fAttr),
2955 SSMFIELD_ENTRY( IEMTB, cRanges),
2956 SSMFIELD_ENTRY( IEMTB, cInstructions),
2957 SSMFIELD_ENTRY( IEMTB, Thrd.cCalls),
2958 SSMFIELD_ENTRY( IEMTB, cTbLookupEntries),
2959 SSMFIELD_ENTRY( IEMTB, cbOpcodes),
2960 SSMFIELD_ENTRY( IEMTB, FlatPc),
2961 SSMFIELD_ENTRY_GCPHYS(IEMTB, aGCPhysPages[0]),
2962 SSMFIELD_ENTRY_GCPHYS(IEMTB, aGCPhysPages[1]),
2963 SSMFIELD_ENTRY_TERM()
2964};
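
/*
 * On-disk layout per saved TB: a u32 marker (0 = another TB follows, UINT32_MAX =
 * end of stream), the IEMTB fields listed above, three u16 values per opcode range
 * (offOpcodes, cbOpcodes, and offPhysPage packed with idxPhysPage), the raw opcode
 * bytes, and finally the threaded call table.
 */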
2965
2966/**
2967 * Saves a threaded TB to a dedicated saved state file.
2968 */
2969static void iemThreadedSaveTbForProfiling(PVMCPU pVCpu, PCIEMTB pTb)
2970{
2971 /* Only VCPU #0 for now. */
2972 if (pVCpu->idCpu != 0)
2973 return;
2974
2975 /*
2976 * Get the SSM handle, lazily opening the output file.
2977 */
2978 PSSMHANDLE const pNil = (PSSMHANDLE)~(uintptr_t)0; Assert(!RT_VALID_PTR(pNil));
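    /* pNil is a sentinel meaning "opening or writing the file failed earlier,
       don't try again". */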
2979 PSSMHANDLE pSSM = pVCpu->iem.s.pSsmThreadedTbsForProfiling;
2980 if (pSSM && pSSM != pNil)
2981 { /* likely */ }
2982 else if (pSSM)
2983 return;
2984 else
2985 {
2986 pVCpu->iem.s.pSsmThreadedTbsForProfiling = pNil;
2987 int rc = SSMR3Open("ThreadedTBsForRecompilerProfiling.sav", NULL, NULL, SSM_OPEN_F_FOR_WRITING, &pSSM);
2988 AssertLogRelRCReturnVoid(rc);
2989
2990 rc = SSMR3WriteFileHeader(pSSM, 1);
2991 AssertLogRelRCReturnVoid(rc); /* leaks SSM handle, but whatever. */
2992
2993 rc = SSMR3WriteUnitBegin(pSSM, "threaded-tbs", 1, 0);
2994 AssertLogRelRCReturnVoid(rc); /* leaks SSM handle, but whatever. */
2995 pVCpu->iem.s.pSsmThreadedTbsForProfiling = pSSM;
2996 }
2997
2998 /*
2999 * Do the actual saving.
3000 */
3001 SSMR3PutU32(pSSM, 0); /* Indicates that another TB follows. */
3002
3003 /* The basic structure. */
3004 SSMR3PutStructEx(pSSM, pTb, sizeof(*pTb), 0 /*fFlags*/, g_aIemThreadedTbFields, NULL);
3005
3006 /* The ranges. */
3007 for (uint32_t iRange = 0; iRange < pTb->cRanges; iRange++)
3008 {
3009 SSMR3PutU16(pSSM, pTb->aRanges[iRange].offOpcodes);
3010 SSMR3PutU16(pSSM, pTb->aRanges[iRange].cbOpcodes);
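        /* Pack the page offset (at most 12 bits for 4 KiB guest pages) and the
           physical page index (0..2) into one u16; the loader splits them again
           using GUEST_PAGE_OFFSET_MASK and a shift by 14. */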
3011 SSMR3PutU16(pSSM, pTb->aRanges[iRange].offPhysPage | (pTb->aRanges[iRange].idxPhysPage << 14));
3012 }
3013
3014 /* The opcodes. */
3015 SSMR3PutMem(pSSM, pTb->pabOpcodes, pTb->cbOpcodes);
3016
3017 /* The threaded call table. */
3018 int rc = SSMR3PutMem(pSSM, pTb->Thrd.paCalls, sizeof(*pTb->Thrd.paCalls) * pTb->Thrd.cCalls);
3019 AssertLogRelMsgStmt(RT_SUCCESS(rc), ("rc=%Rrc\n", rc), pVCpu->iem.s.pSsmThreadedTbsForProfiling = pNil);
3020}
3021
3022
3023/**
3024 * Called by IEMR3Term to finish any open profile files.
3025 *
3026 * @note This is not called on the EMT for @a pVCpu, but rather on the thread
3027 * driving the VM termination.
3028 */
3029DECLHIDDEN(void) iemThreadedSaveTbForProfilingCleanup(PVMCPU pVCpu)
3030{
3031 PSSMHANDLE const pSSM = pVCpu->iem.s.pSsmThreadedTbsForProfiling;
3032 pVCpu->iem.s.pSsmThreadedTbsForProfiling = NULL;
3033 if (RT_VALID_PTR(pSSM))
3034 {
3035 /* Indicate that this is the end. */
3036 SSMR3PutU32(pSSM, UINT32_MAX);
3037
3038 int rc = SSMR3WriteUnitComplete(pSSM);
3039 AssertLogRelRC(rc);
3040 rc = SSMR3WriteFileFooter(pSSM);
3041 AssertLogRelRC(rc);
3042 rc = SSMR3Close(pSSM);
3043 AssertLogRelRC(rc);
3044 }
3045}
3046
3047#endif /* VBOX_WITH_IEM_NATIVE_RECOMPILER && VBOX_WITH_SAVE_THREADED_TBS_FOR_PROFILING */
3048
3049#ifdef IN_RING3
3050/**
3051 * API used to process what iemThreadedSaveTbForProfiling() saved.
3052 *
3053 * @note Do not mix build types or revisions. Local changes between saving the
3054 * TBs and calling this API may cause unexpected trouble.
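 *
 * Hypothetical usage sketch (the caller context and the cMinTbs value are
 * assumptions; the filename matches what iemThreadedSaveTbForProfiling()
 * writes):
 * @code
 *     rc = IEMR3ThreadedProfileRecompilingSavedTbs(pVM, "ThreadedTBsForRecompilerProfiling.sav", 4096);
 * @endcode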
3055 */
3056VMMR3DECL(int) IEMR3ThreadedProfileRecompilingSavedTbs(PVM pVM, const char *pszFilename, uint32_t cMinTbs)
3057{
3058# if defined(VBOX_WITH_IEM_NATIVE_RECOMPILER) && defined(VBOX_WITH_SAVE_THREADED_TBS_FOR_PROFILING)
3059 PVMCPU const pVCpu = pVM->apCpusR3[0];
3060
3061 /* We need to keep an eye on the TB allocator. */
3062 PIEMTBALLOCATOR const pTbAllocator = pVCpu->iem.s.pTbAllocatorR3;
3063
3064 /*
3065 * Load the TBs from the file.
3066 */
3067 PSSMHANDLE pSSM = NULL;
3068 int rc = SSMR3Open(pszFilename, NULL, NULL, 0, &pSSM);
3069 if (RT_SUCCESS(rc))
3070 {
3071 uint32_t cTbs = 0;
3072 PIEMTB pTbHead = NULL;
3073 PIEMTB *ppTbTail = &pTbHead;
3074 uint32_t uVersion;
3075 rc = SSMR3Seek(pSSM, "threaded-tbs", 0, &uVersion);
3076 if (RT_SUCCESS(rc))
3077 {
3078 for (;; cTbs++)
3079 {
3080 /* Check for the end tag. */
3081 uint32_t uTag = 0;
3082 rc = SSMR3GetU32(pSSM, &uTag);
3083 AssertRCBreak(rc);
3084 if (uTag == UINT32_MAX)
3085 break;
3086 AssertBreakStmt(uTag == 0, rc = VERR_SSM_DATA_UNIT_FORMAT_CHANGED);
3087
3088 /* Do we have room for another TB? */
3089 if (pTbAllocator->cInUseTbs + 2 >= pTbAllocator->cMaxTbs)
3090 {
3091 RTMsgInfo("Too many TBs to load, stopping loading early.\n");
3092 break;
3093 }
3094
3095 /* Allocate a new TB. */
3096 PIEMTB pTb = iemTbAllocatorAlloc(pVCpu, true /*fThreaded*/);
3097 AssertBreakStmt(pTb, rc = VERR_OUT_OF_RESOURCES);
3098
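            /* Zero the TB but preserve the allocation chunk index so the block
               can still be returned to its allocator chunk. */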
3099 uint8_t const idxAllocChunk = pTb->idxAllocChunk;
3100 RT_ZERO(*pTb);
3101 pTb->idxAllocChunk = idxAllocChunk;
3102
3103 rc = SSMR3GetStructEx(pSSM, pTb, sizeof(*pTb), 0, g_aIemThreadedTbFields, NULL);
3104 if (RT_SUCCESS(rc))
3105 {
3106 AssertStmt(pTb->Thrd.cCalls > 0 && pTb->Thrd.cCalls <= _8K, rc = VERR_SSM_DATA_UNIT_FORMAT_CHANGED);
3107 AssertStmt(pTb->cbOpcodes > 0 && pTb->cbOpcodes <= _8K, rc = VERR_SSM_DATA_UNIT_FORMAT_CHANGED);
3108 AssertStmt(pTb->cRanges > 0 && pTb->cRanges <= RT_ELEMENTS(pTb->aRanges), rc = VERR_SSM_DATA_UNIT_FORMAT_CHANGED);
3109 AssertStmt(pTb->cTbLookupEntries > 0 && pTb->cTbLookupEntries <= _1K, rc = VERR_SSM_DATA_UNIT_FORMAT_CHANGED);
3110
3111 if (RT_SUCCESS(rc))
3112 for (uint32_t iRange = 0; iRange < pTb->cRanges; iRange++)
3113 {
3114 SSMR3GetU16(pSSM, &pTb->aRanges[iRange].offOpcodes);
3115 SSMR3GetU16(pSSM, &pTb->aRanges[iRange].cbOpcodes);
3116 uint16_t uTmp = 0;
3117 rc = SSMR3GetU16(pSSM, &uTmp);
3118 AssertRCBreak(rc);
3119 pTb->aRanges[iRange].offPhysPage = uTmp & GUEST_PAGE_OFFSET_MASK;
3120 pTb->aRanges[iRange].idxPhysPage = uTmp >> 14;
3121
3122 AssertBreakStmt(pTb->aRanges[iRange].idxPhysPage <= RT_ELEMENTS(pTb->aGCPhysPages),
3123 rc = VERR_SSM_DATA_UNIT_FORMAT_CHANGED);
3124 AssertBreakStmt(pTb->aRanges[iRange].offOpcodes < pTb->cbOpcodes,
3125 rc = VERR_SSM_DATA_UNIT_FORMAT_CHANGED);
3126 AssertBreakStmt(pTb->aRanges[iRange].offOpcodes + pTb->aRanges[iRange].cbOpcodes <= pTb->cbOpcodes,
3127 rc = VERR_SSM_DATA_UNIT_FORMAT_CHANGED);
3128 }
3129
3130 if (RT_SUCCESS(rc))
3131 {
3132 pTb->Thrd.paCalls = (PIEMTHRDEDCALLENTRY)RTMemAllocZ(sizeof(IEMTHRDEDCALLENTRY) * pTb->Thrd.cCalls);
3133 if (pTb->Thrd.paCalls)
3134 {
3135 size_t const cbTbLookup = pTb->cTbLookupEntries * sizeof(PIEMTB);
3136 Assert(cbTbLookup > 0);
3137 size_t const cbOpcodes = pTb->cbOpcodes;
3138 Assert(cbOpcodes > 0);
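                        /* One allocation holds both the TB lookup table (placed first)
                           and the opcode bytes that follow it; the opcode size is
                           rounded up to pointer alignment. */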
3139 size_t const cbBoth = cbTbLookup + RT_ALIGN_Z(cbOpcodes, sizeof(PIEMTB));
3140 uint8_t * const pbBoth = (uint8_t *)RTMemAllocZ(cbBoth);
3141 if (pbBoth)
3142 {
3143 pTb->pabOpcodes = &pbBoth[cbTbLookup];
3144 SSMR3GetMem(pSSM, pTb->pabOpcodes, pTb->cbOpcodes);
3145 rc = SSMR3GetMem(pSSM, pTb->Thrd.paCalls, sizeof(IEMTHRDEDCALLENTRY) * pTb->Thrd.cCalls);
3146 if (RT_SUCCESS(rc))
3147 {
3148 *ppTbTail = pTb;
3149 ppTbTail = &pTb->pNext;
3150 continue;
3151 }
3152 }
3153 else
3154 rc = VERR_NO_MEMORY;
3155 RTMemFree(pTb->Thrd.paCalls);
3156 }
3157 else
3158 rc = VERR_NO_MEMORY;
3159 }
3160 }
3161 iemTbAllocatorFree(pVCpu, pTb);
3162 break;
3163 }
3164 if (RT_FAILURE(rc))
3165 RTMsgError("Load error: %Rrc (cTbs=%u)", rc, cTbs);
3166 }
3167 else
3168 RTMsgError("SSMR3Seek failed on '%s': %Rrc", pszFilename, rc);
3169 SSMR3Close(pSSM);
3170 if (RT_SUCCESS(rc))
3171 {
3172 /*
3173 * Recompile the TBs.
3174 */
3175 if (pTbHead)
3176 {
3177 RTMsgInfo("Loaded %u TBs\n", cTbs);
3178 if (cTbs < cMinTbs)
3179 {
3180 RTMsgInfo("Duplicating TBs to reach %u TB target\n", cMinTbs);
3181 for (PIEMTB pTb = pTbHead;
3182 cTbs < cMinTbs && pTbAllocator->cInUseTbs + 2 <= pTbAllocator->cMaxTbs;
3183 pTb = pTb->pNext)
3184 {
3185 PIEMTB pTbCopy = iemThreadedTbDuplicate(pVM, pVCpu, pTb);
3186 if (!pTbCopy)
3187 break;
3188 *ppTbTail = pTbCopy;
3189 ppTbTail = &pTbCopy->pNext;
3190 cTbs++;
3191 }
3192 }
3193
3194 PIEMTB pTbWarmup = iemThreadedTbDuplicate(pVM, pVCpu, pTbHead);
3195 if (pTbWarmup)
3196 {
3197 iemNativeRecompile(pVCpu, pTbWarmup);
3198 RTThreadSleep(512); /* to make the start visible in the profiler. */
3199 RTMsgInfo("Ready, set, go!\n");
3200
3201 if ((pTbWarmup->fFlags & IEMTB_F_TYPE_MASK) == IEMTB_F_TYPE_NATIVE)
3202 {
3203 uint32_t cFailed = 0;
3204 uint64_t const nsStart = RTTimeNanoTS();
3205 for (PIEMTB pTb = pTbHead; pTb; pTb = pTb->pNext)
3206 {
3207 iemNativeRecompile(pVCpu, pTb);
3208 if ((pTb->fFlags & IEMTB_F_TYPE_MASK) != IEMTB_F_TYPE_NATIVE)
3209 cFailed++;
3210 }
3211 uint64_t const cNsElapsed = RTTimeNanoTS() - nsStart;
3212 RTMsgInfo("Recompiled %u TBs in %'RU64 ns - averaging %'RU64 ns/TB\n",
3213 cTbs, cNsElapsed, (cNsElapsed + cTbs - 1) / cTbs);
3214 if (cFailed)
3215 {
3216 RTMsgError("Unfortunately %u TBs failed!", cFailed);
3217 rc = VERR_GENERAL_FAILURE;
3218 }
3219 RTThreadSleep(128); /* Another gap in the profiler timeline. */
3220 }
3221 else
3222 {
3223 RTMsgError("Failed to recompile the first TB!");
3224 rc = VERR_GENERAL_FAILURE;
3225 }
3226 }
3227 else
3228 rc = VERR_NO_MEMORY;
3229 }
3230 else
3231 {
3232 RTMsgError("'%s' contains no TBs!", pszFilename);
3233 rc = VERR_NO_DATA;
3234 }
3235 }
3236 }
3237 else
3238 RTMsgError("SSMR3Open failed on '%s': %Rrc", pszFilename, rc);
3239 return rc;
3240
3241# else
3242 RT_NOREF(pVM, pszFilename, cMinTbs);
3243 return VERR_NOT_IMPLEMENTED;
3244# endif
3245}
3246#endif /* IN_RING3 */
3247
3248
3249/*********************************************************************************************************************************
3250* Recompiled Execution Core *
3251*********************************************************************************************************************************/
3252
3253/** Default TB factor.
3254 * This is basically the number of nanoseconds we guess executing a TB takes
3255 * on average. We estimate it on the high side when we can.
3256 * @note Best if this is a power of two so it can be translated to a shift. */
3257#define IEM_TIMER_POLL_DEFAULT_FACTOR UINT32_C(64)
3258/** The minimum number of nanoseconds we can allow between timer pollings.
3259 * This must take the cost of TMTimerPollBoolWithNanoTS into account. We put
3260 * that cost at 104 ns now, thus this constant is set to 256 ns. */
3261#define IEM_TIMER_POLL_MIN_NS UINT32_C(256)
3262/** The IEM_TIMER_POLL_MIN_NS value roughly translated to TBs, with some grains
3263 * of salt thrown in.
3264 * The idea is that we will be able to make progress with guest code execution
3265 * before polling timers and between running timers. */
3266#define IEM_TIMER_POLL_MIN_ITER UINT32_C(12)
3267/** The maximum number of nanoseconds we can allow between timer pollings.
3268 * This probably shouldn't be too high, as we don't have any timer
3269 * reprogramming feedback in the polling code. So, when a device reschedules a
3270 * timer for an earlier delivery, we won't know about it. */
3271#define IEM_TIMER_POLL_MAX_NS UINT32_C(8388608) /* 0x800000 ns = 8.4 ms */
3272/** The IEM_TIMER_POLL_MAX_NS value roughly translated to TBs, with some grains
3273 * of salt thrown in.
3274 * This helps control fluctuations in the NU benchmark. */
3275#define IEM_TIMER_POLL_MAX_ITER _512K
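/* Note: with the 64 ns default factor the 8.4 ms ceiling corresponds to
   8388608 / 64 = 131072 (128K) iterations, so the default countdown calculation
   never reaches IEM_TIMER_POLL_MAX_ITER; the latter caps the adaptive scaling
   and the non-timer EMT path in iemPollTimers. */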
3276
3277#ifdef IEM_WITH_ADAPTIVE_TIMER_POLLING
3278/**
3279 * Calculates the number of TBs till the next timer polling using defaults.
3280 *
3281 * This is used when the previous run wasn't long enough to provide sufficient
3282 * data and when coming back from the HALT state and we haven't actually
3283 * executed anything for a while.
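 *
 * Example: a 100 us delta is first rounded down to the power of two below it,
 * 65536 ns, and then divided by the 64 ns default factor, yielding a countdown
 * of 1024 TBs.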
3284 */
3285DECL_FORCE_INLINE(uint32_t) iemPollTimersCalcDefaultCountdown(uint64_t cNsDelta) RT_NOEXCEPT
3286{
3287 if (cNsDelta >= IEM_TIMER_POLL_MAX_NS)
3288 return RT_MIN(IEM_TIMER_POLL_MAX_NS / IEM_TIMER_POLL_DEFAULT_FACTOR, IEM_TIMER_POLL_MAX_ITER);
3289
3290 cNsDelta = RT_BIT_64(ASMBitLastSetU32((uint32_t)cNsDelta) - 1); /* round down to power of 2 */
3291 uint32_t const cRet = cNsDelta / IEM_TIMER_POLL_DEFAULT_FACTOR;
3292 if (cRet >= IEM_TIMER_POLL_MIN_ITER)
3293 {
3294 if (cRet <= IEM_TIMER_POLL_MAX_ITER)
3295 return cRet;
3296 return IEM_TIMER_POLL_MAX_ITER;
3297 }
3298 return IEM_TIMER_POLL_MIN_ITER;
3299}
3300#endif
3301
3302
3303/**
3304 * Helper for polling timers.
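 *
 * @returns VINF_SUCCESS if TB execution can continue, or VINF_IEM_REEXEC_BREAK_FF
 *          if pending timers, interrupts or other force flags require returning
 *          to the outer execution loop.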
3305 */
3306DECLHIDDEN(int) iemPollTimers(PVMCC pVM, PVMCPUCC pVCpu) RT_NOEXCEPT
3307{
3308 STAM_PROFILE_START(&pVCpu->iem.s.StatTimerPoll, a);
3309
3310 /*
3311 * Check for VM_FF_TM_VIRTUAL_SYNC and call TMR3VirtualSyncFF if set.
3312 * This is something all EMTs can do.
3313 */
3314 /* If the virtual sync FF is set, respond to it. */
3315 bool fRanTimers = VM_FF_IS_SET(pVM, VM_FF_TM_VIRTUAL_SYNC);
3316 if (!fRanTimers)
3317 { /* likely */ }
3318 else
3319 {
3320 STAM_PROFILE_START(&pVCpu->iem.s.StatTimerPollRun, b);
3321 TMR3VirtualSyncFF(pVM, pVCpu);
3322 STAM_PROFILE_STOP(&pVCpu->iem.s.StatTimerPollRun, b);
3323 }
3324
3325 /*
3326 * Poll timers.
3327 *
3328 * On the 10980xe the polling averages 314 ticks, with a min of 201, while
3329 * running a Norton Utilities DOS benchmark program. TSC runs at 3GHz,
3330 * translating that to 104 ns and 67 ns respectively. (An M2 booting win11
3331 * has an average of 2 ticks / 84 ns.)
3332 *
3333 * With the same setup the TMR3VirtualSyncFF and else branch here profiles
3334 * to 79751 ticks / 26583 ns on average, with a min of 1194 ticks / 398 ns.
3335 * (An M2 booting win11 has an average of 24 ticks / 1008 ns, with a min of
3336 * 8 ticks / 336 ns.)
3337 *
3338 * If we get a zero return value we run timers. Non-timer EMTs shouldn't
3339 * ever see a zero value here, so we just call TMR3TimerQueuesDo. However,
3340 * we do not re-run timers if we already called TMR3VirtualSyncFF above;
3341 * instead we try to make sure some guest code gets executed first.
3342 */
3343 uint64_t nsNow = 0;
3344 uint64_t cNsDelta = TMTimerPollBoolWithNanoTS(pVM, pVCpu, &nsNow);
3345 if (cNsDelta >= 1) /* It is okay to run virtual sync timers a little early. */
3346 { /* likely */ }
3347 else if (!fRanTimers || VM_FF_IS_SET(pVM, VM_FF_TM_VIRTUAL_SYNC))
3348 {
3349 STAM_PROFILE_START(&pVCpu->iem.s.StatTimerPollRun, b);
3350 TMR3TimerQueuesDo(pVM);
3351 fRanTimers = true;
3352 nsNow = 0;
3353 cNsDelta = TMTimerPollBoolWithNanoTS(pVM, pVCpu, &nsNow);
3354 STAM_PROFILE_STOP(&pVCpu->iem.s.StatTimerPollRun, b);
3355 }
3356 else
3357 cNsDelta = 33;
3358
3359 /*
3360 * Calc interval and update the timestamps.
3361 */
3362 uint64_t const cNsSinceLast = nsNow - pVCpu->iem.s.nsRecompilerPollNow;
3363 pVCpu->iem.s.nsRecompilerPollNow = nsNow;
3364 pVCpu->iem.s.msRecompilerPollNow = (uint32_t)(nsNow / RT_NS_1MS);
3365
3366 /*
3367 * Set the next polling count down value.
3368 *
3369 * We take the previous value and adjust it according to the cNsSinceLast
3370 * value, if it's not within reason. This can't be too accurate since the
3371 * CheckIrq and intra-TB checks aren't evenly spaced; they depend highly
3372 * on the guest code.
3373 */
3374#ifdef IEM_WITH_ADAPTIVE_TIMER_POLLING
3375 uint32_t cItersTillNextPoll = pVCpu->iem.s.cTbsTillNextTimerPollPrev;
3376 if (cNsDelta >= RT_NS_1SEC / 4)
3377 {
3378 /*
3379 * Non-timer EMTs should end up here with a fixed 500ms delta; just return
3380 * the max and keep the polling overhead on the dedicated timer EMT.
3381 */
3382 AssertCompile(IEM_TIMER_POLL_MAX_ITER * IEM_TIMER_POLL_DEFAULT_FACTOR <= RT_NS_100MS);
3383 cItersTillNextPoll = IEM_TIMER_POLL_MAX_ITER;
3384 }
3385 else
3386 {
3387 /*
3388 * This is the timer EMT.
3389 */
3390 if (cNsDelta <= IEM_TIMER_POLL_MIN_NS)
3391 {
3392 STAM_COUNTER_INC(&pVCpu->iem.s.StatTimerPollTiny);
3393 cItersTillNextPoll = IEM_TIMER_POLL_MIN_ITER;
3394 }
3395 else
3396 {
3397 uint32_t const cNsDeltaAdj = cNsDelta >= IEM_TIMER_POLL_MAX_NS ? IEM_TIMER_POLL_MAX_NS : (uint32_t)cNsDelta;
3398 uint32_t const cNsDeltaSlack = cNsDelta >= IEM_TIMER_POLL_MAX_NS ? IEM_TIMER_POLL_MAX_NS / 2 : cNsDeltaAdj / 4;
3399 if ( cNsSinceLast < RT_MAX(IEM_TIMER_POLL_MIN_NS, 64)
3400 || cItersTillNextPoll < IEM_TIMER_POLL_MIN_ITER /* paranoia */)
3401 {
3402 STAM_COUNTER_INC(&pVCpu->iem.s.StatTimerPollDefaultCalc);
3403 cItersTillNextPoll = iemPollTimersCalcDefaultCountdown(cNsDeltaAdj);
3404 }
3405 else if ( cNsSinceLast >= cNsDeltaAdj + cNsDeltaSlack
3406 || cNsSinceLast <= cNsDeltaAdj - cNsDeltaSlack)
3407 {
3408 if (cNsSinceLast >= cItersTillNextPoll)
3409 {
3410 uint32_t uFactor = (uint32_t)(cNsSinceLast + cItersTillNextPoll - 1) / cItersTillNextPoll;
3411 cItersTillNextPoll = cNsDeltaAdj / uFactor;
3412 STAM_PROFILE_ADD_PERIOD(&pVCpu->iem.s.StatTimerPollFactorDivision, uFactor);
3413 }
3414 else
3415 {
3416 uint32_t uFactor = cItersTillNextPoll / (uint32_t)cNsSinceLast;
3417 cItersTillNextPoll = cNsDeltaAdj * uFactor;
3418 STAM_PROFILE_ADD_PERIOD(&pVCpu->iem.s.StatTimerPollFactorMultiplication, uFactor);
3419 }
3420
3421 if (cItersTillNextPoll >= IEM_TIMER_POLL_MIN_ITER)
3422 {
3423 if (cItersTillNextPoll <= IEM_TIMER_POLL_MAX_ITER)
3424 { /* likely */ }
3425 else
3426 {
3427 STAM_COUNTER_INC(&pVCpu->iem.s.StatTimerPollMax);
3428 cItersTillNextPoll = IEM_TIMER_POLL_MAX_ITER;
3429 }
3430 }
3431 else
3432 cItersTillNextPoll = IEM_TIMER_POLL_MIN_ITER;
3433 }
3434 else
3435 STAM_COUNTER_INC(&pVCpu->iem.s.StatTimerPollUnchanged);
3436 }
3437 pVCpu->iem.s.cTbsTillNextTimerPollPrev = cItersTillNextPoll;
3438 }
3439#else
3440/** Poll timers every 400 us / 2500 Hz. (source: thin air) */
3441# define IEM_TIMER_POLL_IDEAL_NS (400U * RT_NS_1US)
3442 uint32_t cItersTillNextPoll = pVCpu->iem.s.cTbsTillNextTimerPollPrev;
3443 uint32_t const cNsIdealPollInterval = IEM_TIMER_POLL_IDEAL_NS;
3444 int64_t const nsFromIdeal = cNsSinceLast - cNsIdealPollInterval;
3445 if (nsFromIdeal < 0)
3446 {
3447 if ((uint64_t)-nsFromIdeal > cNsIdealPollInterval / 8 && cItersTillNextPoll < _64K)
3448 {
3449 cItersTillNextPoll += cItersTillNextPoll / 8;
3450 pVCpu->iem.s.cTbsTillNextTimerPollPrev = cItersTillNextPoll;
3451 }
3452 }
3453 else
3454 {
3455 if ((uint64_t)nsFromIdeal > cNsIdealPollInterval / 8 && cItersTillNextPoll > 256)
3456 {
3457 cItersTillNextPoll -= cItersTillNextPoll / 8;
3458 pVCpu->iem.s.cTbsTillNextTimerPollPrev = cItersTillNextPoll;
3459 }
3460 }
3461#endif
3462 pVCpu->iem.s.cTbsTillNextTimerPoll = cItersTillNextPoll;
3463
3464 /*
3465 * Repeat the IRQ and FF checks.
3466 */
3467 if (cNsDelta > 0)
3468 {
3469 uint32_t fCpu = pVCpu->fLocalForcedActions;
3470 fCpu &= VMCPU_FF_ALL_MASK & ~( VMCPU_FF_PGM_SYNC_CR3
3471 | VMCPU_FF_PGM_SYNC_CR3_NON_GLOBAL
3472 | VMCPU_FF_TLB_FLUSH
3473 | VMCPU_FF_UNHALT );
3474 if (RT_LIKELY( ( !fCpu
3475 || ( !(fCpu & ~(VMCPU_FF_INTERRUPT_APIC | VMCPU_FF_INTERRUPT_PIC))
3476 && ( !pVCpu->cpum.GstCtx.rflags.Bits.u1IF
3477 || CPUMIsInInterruptShadow(&pVCpu->cpum.GstCtx)) ) )
3478 && !VM_FF_IS_ANY_SET(pVCpu->CTX_SUFF(pVM), VM_FF_ALL_MASK) ))
3479 {
3480 STAM_PROFILE_STOP(&pVCpu->iem.s.StatTimerPoll, a);
3481 return VINF_SUCCESS;
3482 }
3483 }
3484 STAM_PROFILE_STOP(&pVCpu->iem.s.StatTimerPoll, a);
3485 return VINF_IEM_REEXEC_BREAK_FF;
3486}
3487
3488
3489/** Helper for iemTbExec. */
3490DECL_FORCE_INLINE(PIEMTB *) iemTbGetTbLookupEntryWithRip(PCIEMTB pTb, uint8_t uTbLookup, uint64_t uRip)
3491{
3492 uint8_t const idx = IEM_TB_LOOKUP_TAB_GET_IDX_WITH_RIP(uTbLookup, uRip);
3493 Assert(idx < pTb->cTbLookupEntries);
3494 return IEMTB_GET_TB_LOOKUP_TAB_ENTRY(pTb, idx);
3495}
3496
3497
3498/**
3499 * Executes a translation block.
3500 *
3501 * @returns Strict VBox status code.
3502 * @param pVCpu The cross context virtual CPU structure of the calling
3503 * thread.
3504 * @param pTb The translation block to execute.
3505 */
3506static VBOXSTRICTRC iemTbExec(PVMCPUCC pVCpu, PIEMTB pTb) IEM_NOEXCEPT_MAY_LONGJMP
3507{
3508 Assert(!(pVCpu->iem.s.GCPhysInstrBuf & (RTGCPHYS)GUEST_PAGE_OFFSET_MASK));
3509
3510 /*
3511 * Set the current TB so CIMPL functions may get at it.
3512 */
3513 pVCpu->iem.s.pCurTbR3 = pTb;
3514 pVCpu->iem.s.ppTbLookupEntryR3 = IEMTB_GET_TB_LOOKUP_TAB_ENTRY(pTb, 0);
3515
3516 /*
3517 * Execute the block.
3518 */
3519#ifdef VBOX_WITH_IEM_NATIVE_RECOMPILER
3520 if (pTb->fFlags & IEMTB_F_TYPE_NATIVE)
3521 {
3522 pVCpu->iem.s.cTbExecNative++;
3523 IEMTLBTRACE_TB_EXEC_N8VE(pVCpu, pTb);
3524# ifdef LOG_ENABLED
3525 iemThreadedLogCurInstr(pVCpu, "EXn", 0);
3526# endif
3527
3528# ifdef VBOX_WITH_IEM_NATIVE_RECOMPILER_LONGJMP
3529 AssertCompileMemberOffset(VMCPUCC, iem.s.pvTbFramePointerR3, 0x7c8); /* This is assumed in iemNativeTbEntry */
3530# endif
3531# ifdef RT_ARCH_AMD64
3532 VBOXSTRICTRC const rcStrict = iemNativeTbEntry(pVCpu, (uintptr_t)pTb->Native.paInstructions);
3533# else
3534 VBOXSTRICTRC const rcStrict = iemNativeTbEntry(pVCpu, &pVCpu->cpum.GstCtx, (uintptr_t)pTb->Native.paInstructions);
3535# endif
3536
3537# ifdef VBOX_WITH_IEM_NATIVE_RECOMPILER_LONGJMP
3538 pVCpu->iem.s.pvTbFramePointerR3 = NULL;
3539# endif
3540# ifdef IEMNATIVE_WITH_SIMD_FP_NATIVE_EMITTERS
3541 /* Restore FPCR/MXCSR if the TB modified it. */
3542 if (pVCpu->iem.s.uRegFpCtrl != IEMNATIVE_SIMD_FP_CTRL_REG_NOT_MODIFIED)
3543 {
3544 iemNativeFpCtrlRegRestore(pVCpu->iem.s.uRegFpCtrl);
3545 /* Reset for the next round saving us an unconditional instruction on next TB entry. */
3546 pVCpu->iem.s.uRegFpCtrl = IEMNATIVE_SIMD_FP_CTRL_REG_NOT_MODIFIED;
3547 }
3548# endif
3549# ifdef IEMNATIVE_STRICT_EFLAGS_SKIPPING
3550 Assert(pVCpu->iem.s.fSkippingEFlags == 0);
3551# endif
3552 if (RT_LIKELY( rcStrict == VINF_SUCCESS
3553 && pVCpu->iem.s.rcPassUp == VINF_SUCCESS /** @todo this isn't great. */))
3554 { /* likely */ }
3555 else
3556 {
3557 /* pVCpu->iem.s.cInstructions is incremented by iemNativeHlpExecStatusCodeFiddling. */
3558 pVCpu->iem.s.pCurTbR3 = NULL;
3559
3560 /* VINF_IEM_REEXEC_BREAK should be treated as VINF_SUCCESS as it's
3561 only to break out of TB execution early. */
3562 if (rcStrict == VINF_IEM_REEXEC_BREAK)
3563 {
3564 STAM_REL_COUNTER_INC(&pVCpu->iem.s.StatNativeTbExitReturnBreak);
3565 return iemExecStatusCodeFiddling(pVCpu, VINF_SUCCESS);
3566 }
3567
3568 /* VINF_IEM_REEXEC_BREAK_FF should be treated as VINF_SUCCESS as it's
3569 only to break out of TB execution early due to pending FFs. */
3570 if (rcStrict == VINF_IEM_REEXEC_BREAK_FF)
3571 {
3572 STAM_REL_COUNTER_INC(&pVCpu->iem.s.StatNativeTbExitReturnBreakFF);
3573 return iemExecStatusCodeFiddling(pVCpu, VINF_SUCCESS);
3574 }
3575
3576 /* VINF_IEM_REEXEC_WITH_FLAGS needs to receive special treatment
3577 and converted to VINF_SUCCESS or whatever is appropriate. */
3578 if (rcStrict == VINF_IEM_REEXEC_FINISH_WITH_FLAGS)
3579 {
3580 STAM_REL_COUNTER_INC(&pVCpu->iem.s.StatNativeTbExitReturnWithFlags);
3581 return iemExecStatusCodeFiddling(pVCpu, iemFinishInstructionWithFlagsSet(pVCpu, VINF_SUCCESS));
3582 }
3583
3584 STAM_REL_COUNTER_INC(&pVCpu->iem.s.StatNativeTbExitReturnOtherStatus);
3585 return iemExecStatusCodeFiddling(pVCpu, rcStrict);
3586 }
3587 }
3588 else
3589#endif /* VBOX_WITH_IEM_NATIVE_RECOMPILER */
3590 {
3591 /*
3592 * The threaded execution loop.
3593 */
3594 pVCpu->iem.s.cTbExecThreaded++;
3595 IEMTLBTRACE_TB_EXEC_THRD(pVCpu, pTb);
3596#ifdef LOG_ENABLED
3597 uint64_t uRipPrev = UINT64_MAX;
3598#endif
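        /* Dispatch each call entry through g_apfnIemThreadedFunctions. A
           VINF_IEM_REEXEC_JUMP status restarts execution at another entry within
           this TB; any other non-successful status breaks out of the TB. */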
3599 PCIEMTHRDEDCALLENTRY pCallEntry = pTb->Thrd.paCalls;
3600 uint32_t cCallsLeft = pTb->Thrd.cCalls;
3601 while (cCallsLeft-- > 0)
3602 {
3603#ifdef LOG_ENABLED
3604 if (pVCpu->cpum.GstCtx.rip != uRipPrev)
3605 {
3606 uRipPrev = pVCpu->cpum.GstCtx.rip;
3607 iemThreadedLogCurInstr(pVCpu, "EXt", pTb->Thrd.cCalls - cCallsLeft - 1);
3608 }
3609 Log9(("%04x:%08RX64: #%d/%d - %d %s\n", pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip,
3610 pTb->Thrd.cCalls - cCallsLeft - 1, pCallEntry->idxInstr, pCallEntry->enmFunction,
3611 g_apszIemThreadedFunctions[pCallEntry->enmFunction]));
3612#endif
3613#ifdef VBOX_WITH_STATISTICS
3614 AssertCompile(RT_ELEMENTS(pVCpu->iem.s.acThreadedFuncStats) >= kIemThreadedFunc_End);
3615 pVCpu->iem.s.acThreadedFuncStats[pCallEntry->enmFunction] += 1;
3616#endif
3617 VBOXSTRICTRC const rcStrict = g_apfnIemThreadedFunctions[pCallEntry->enmFunction](pVCpu,
3618 pCallEntry->auParams[0],
3619 pCallEntry->auParams[1],
3620 pCallEntry->auParams[2]);
3621 if (RT_LIKELY( rcStrict == VINF_SUCCESS
3622 && pVCpu->iem.s.rcPassUp == VINF_SUCCESS /** @todo this isn't great. */))
3623 pCallEntry++;
3624 else if (rcStrict == VINF_IEM_REEXEC_JUMP)
3625 {
3626 Assert(pVCpu->iem.s.rcPassUp == VINF_SUCCESS);
3627 Assert(cCallsLeft == 0);
3628 uint32_t const idxTarget = (uint32_t)pCallEntry->auParams[0];
3629 cCallsLeft = pTb->Thrd.cCalls;
3630 AssertBreak(idxTarget < cCallsLeft - 1);
3631 cCallsLeft -= idxTarget;
3632 pCallEntry = &pTb->Thrd.paCalls[idxTarget];
3633 AssertBreak(pCallEntry->fFlags & IEMTHREADEDCALLENTRY_F_JUMP_TARGET);
3634 }
3635 else
3636 {
3637 pVCpu->iem.s.cInstructions += pCallEntry->idxInstr; /* This may be one short, but better than zero. */
3638 pVCpu->iem.s.pCurTbR3 = NULL;
3639 STAM_REL_COUNTER_INC(&pVCpu->iem.s.StatTbThreadedExecBreaks);
3640 pVCpu->iem.s.ppTbLookupEntryR3 = iemTbGetTbLookupEntryWithRip(pTb, pCallEntry->uTbLookup, pVCpu->cpum.GstCtx.rip);
3641
3642 /* VINF_IEM_REEXEC_BREAK should be treated as VINF_SUCCESS as it's
3643 only to break out of TB execution early. */
3644 if (rcStrict == VINF_IEM_REEXEC_BREAK)
3645 {
3646#ifdef VBOX_WITH_STATISTICS
3647 if (pCallEntry->uTbLookup)
3648 STAM_COUNTER_INC(&pVCpu->iem.s.StatTbThreadedExecBreaksWithLookup);
3649 else
3650 STAM_COUNTER_INC(&pVCpu->iem.s.StatTbThreadedExecBreaksWithoutLookup);
3651#endif
3652 return iemExecStatusCodeFiddling(pVCpu, VINF_SUCCESS);
3653 }
3654 return iemExecStatusCodeFiddling(pVCpu, rcStrict);
3655 }
3656 }
3657
3658 /* Update the lookup entry. */
3659 pVCpu->iem.s.ppTbLookupEntryR3 = iemTbGetTbLookupEntryWithRip(pTb, pCallEntry[-1].uTbLookup, pVCpu->cpum.GstCtx.rip);
3660 }
3661
3662 pVCpu->iem.s.cInstructions += pTb->cInstructions;
3663 pVCpu->iem.s.pCurTbR3 = NULL;
3664 return VINF_SUCCESS;
3665}
3666
3667
3668/**
3669 * This is called when the PC doesn't match the current pbInstrBuf.
3670 *
3671 * Upon return, we're ready for opcode fetching. But please note that
3672 * pbInstrBuf can be NULL iff the memory doesn't have readable backing (i.e.
3673 * MMIO or unassigned).
3674 */
3675static RTGCPHYS iemGetPcWithPhysAndCodeMissed(PVMCPUCC pVCpu)
3676{
3677 pVCpu->iem.s.pbInstrBuf = NULL;
3678 pVCpu->iem.s.offCurInstrStart = 0;
3679 pVCpu->iem.s.offInstrNextByte = 0;
3680 iemOpcodeFetchBytesJmp(pVCpu, 0, NULL);
3681 return pVCpu->iem.s.GCPhysInstrBuf + pVCpu->iem.s.offCurInstrStart;
3682}
3683
3684
3685/** @todo need private inline decl for throw/nothrow matching IEM_WITH_SETJMP? */
3686DECL_FORCE_INLINE_THROW(RTGCPHYS) iemGetPcWithPhysAndCode(PVMCPUCC pVCpu)
3687{
3688 /*
3689 * Set uCurTbStartPc to RIP and calc the effective PC.
3690 */
3691 uint64_t uPc = pVCpu->cpum.GstCtx.rip;
3692#if 0 /* unused */
3693 pVCpu->iem.s.uCurTbStartPc = uPc;
3694#endif
3695 Assert(pVCpu->cpum.GstCtx.cs.u64Base == 0 || !IEM_IS_64BIT_CODE(pVCpu));
3696 uPc += pVCpu->cpum.GstCtx.cs.u64Base;
3697
3698 /*
3699 * Advance within the current buffer (PAGE) when possible.
3700 */
3701 if (pVCpu->iem.s.pbInstrBuf)
3702 {
3703 uint64_t off = uPc - pVCpu->iem.s.uInstrBufPc;
3704 if (off < pVCpu->iem.s.cbInstrBufTotal)
3705 {
3706 pVCpu->iem.s.offInstrNextByte = (uint32_t)off;
3707 pVCpu->iem.s.offCurInstrStart = (uint16_t)off;
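            /* Cap the opcode window at 15 bytes (the maximum x86 instruction
               length) past the instruction start, without exceeding the buffer. */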
3708 if ((uint16_t)off + 15 <= pVCpu->iem.s.cbInstrBufTotal)
3709 pVCpu->iem.s.cbInstrBuf = (uint16_t)off + 15;
3710 else
3711 pVCpu->iem.s.cbInstrBuf = pVCpu->iem.s.cbInstrBufTotal;
3712
3713 return pVCpu->iem.s.GCPhysInstrBuf + off;
3714 }
3715 }
3716 return iemGetPcWithPhysAndCodeMissed(pVCpu);
3717}
3718
3719
3720/**
3721 * Determines the extra IEMTB_F_XXX flags.
3722 *
3723 * @returns A mix of IEMTB_F_INHIBIT_SHADOW, IEMTB_F_INHIBIT_NMI and
3724 * IEMTB_F_CS_LIM_CHECKS (or zero).
3725 * @param pVCpu The cross context virtual CPU structure of the calling
3726 * thread.
3727 */
3728DECL_FORCE_INLINE(uint32_t) iemGetTbFlagsForCurrentPc(PVMCPUCC pVCpu)
3729{
3730 uint32_t fRet = 0;
3731
3732 /*
3733 * Determine the inhibit bits.
3734 */
3735 if (!(pVCpu->cpum.GstCtx.rflags.uBoth & (CPUMCTX_INHIBIT_SHADOW | CPUMCTX_INHIBIT_NMI)))
3736 { /* typical */ }
3737 else
3738 {
3739 if (CPUMIsInInterruptShadow(&pVCpu->cpum.GstCtx))
3740 fRet |= IEMTB_F_INHIBIT_SHADOW;
3741 if (CPUMAreInterruptsInhibitedByNmiEx(&pVCpu->cpum.GstCtx))
3742 fRet |= IEMTB_F_INHIBIT_NMI;
3743 }
3744
3745 /*
3746 * Return IEMTB_F_CS_LIM_CHECKS if the current PC is invalid or if it is
3747 * likely to go invalid before the end of the translation block.
3748 */
3749 if (IEM_F_MODE_X86_IS_FLAT(pVCpu->iem.s.fExec))
3750 return fRet;
3751
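    /* Skip the extra checks when there is at least a page plus 16 bytes of
       headroom between EIP and the CS limit (adjusted for the page offset of
       the CS base), since a TB should not be able to run past the limit then. */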
3752 int64_t const offFromLim = (int64_t)pVCpu->cpum.GstCtx.cs.u32Limit - (int64_t)pVCpu->cpum.GstCtx.eip;
3753 if (offFromLim >= X86_PAGE_SIZE + 16 - (int32_t)(pVCpu->cpum.GstCtx.cs.u64Base & GUEST_PAGE_OFFSET_MASK))
3754 return fRet;
3755 return fRet | IEMTB_F_CS_LIM_CHECKS;
3756}
3757
3758
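/**
 * Executes guest code via the threaded and native recompilers.
 *
 * This is the outer run loop: it looks up or compiles a TB for the current PC,
 * executes it, and periodically services timers and force flags.
 *
 * @returns Strict VBox status code.
 * @param   pVM         The cross context VM structure.
 * @param   pVCpu       The cross context virtual CPU structure of the calling EMT.
 * @param   fWasHalted  Set if resuming after a HALT, so the timer polling state
 *                      gets reinitialized.
 */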
3759VMM_INT_DECL(VBOXSTRICTRC) IEMExecRecompiler(PVMCC pVM, PVMCPUCC pVCpu, bool fWasHalted)
3760{
3761 /*
3762 * See if there is an interrupt pending in TRPM, inject it if we can.
3763 */
3764 if (!TRPMHasTrap(pVCpu))
3765 { /* likely */ }
3766 else
3767 {
3768 VBOXSTRICTRC rcStrict = iemExecInjectPendingTrap(pVCpu);
3769 if (RT_LIKELY(rcStrict == VINF_SUCCESS))
3770 { /*likely */ }
3771 else
3772 return rcStrict;
3773 }
3774
3775 /*
3776 * Init the execution environment.
3777 */
3778#if 1 /** @todo this seems like a good idea, however if we ever share memory
3779 * directly with other threads on the host, it isn't necessarily... */
3780 if (pVM->cCpus == 1)
3781 iemInitExec(pVCpu, IEM_F_X86_DISREGARD_LOCK /*fExecOpts*/);
3782 else
3783#endif
3784 iemInitExec(pVCpu, 0 /*fExecOpts*/);
3785
3786 if (RT_LIKELY(!fWasHalted && pVCpu->iem.s.msRecompilerPollNow != 0))
3787 { }
3788 else
3789 {
3790 /* Do polling after halt and the first time we get here. */
3791#ifdef IEM_WITH_ADAPTIVE_TIMER_POLLING
3792 uint64_t nsNow = 0;
3793 uint32_t const cItersTillPoll = iemPollTimersCalcDefaultCountdown(TMTimerPollBoolWithNanoTS(pVM, pVCpu, &nsNow));
3794 pVCpu->iem.s.cTbsTillNextTimerPollPrev = cItersTillPoll;
3795 pVCpu->iem.s.cTbsTillNextTimerPoll = cItersTillPoll;
3796#else
3797 uint64_t const nsNow = TMVirtualGetNoCheck(pVM);
3798#endif
3799 pVCpu->iem.s.nsRecompilerPollNow = nsNow;
3800 pVCpu->iem.s.msRecompilerPollNow = (uint32_t)(nsNow / RT_NS_1MS);
3801 }
3802 pVCpu->iem.s.ppTbLookupEntryR3 = &pVCpu->iem.s.pTbLookupEntryDummyR3;
3803
3804 /*
3805 * Run-loop.
3806 *
3807 * If we're using setjmp/longjmp we combine all the catching here to avoid
3808 * having to call setjmp for each block we're executing.
3809 */
3810 PIEMTBCACHE const pTbCache = pVCpu->iem.s.pTbCacheR3;
3811 for (;;)
3812 {
3813 VBOXSTRICTRC rcStrict;
3814 IEM_TRY_SETJMP(pVCpu, rcStrict)
3815 {
3816 for (;;)
3817 {
3818 /* Translate PC to physical address, we'll need this for both lookup and compilation. */
3819 RTGCPHYS const GCPhysPc = iemGetPcWithPhysAndCode(pVCpu);
3820 if (RT_LIKELY(pVCpu->iem.s.pbInstrBuf != NULL))
3821 {
3822 uint32_t const fExtraFlags = iemGetTbFlagsForCurrentPc(pVCpu);
3823 PIEMTB const pTb = iemTbCacheLookup(pVCpu, pTbCache, GCPhysPc, fExtraFlags);
3824 if (pTb)
3825 rcStrict = iemTbExec(pVCpu, pTb);
3826 else
3827 rcStrict = iemThreadedCompile(pVM, pVCpu, GCPhysPc, fExtraFlags);
3828 }
3829 else
3830 {
3831 /* This can only happen if the current PC cannot be translated into a
3832 host pointer, which means we're in MMIO or unmapped memory... */
3833#if defined(VBOX_STRICT) && defined(IN_RING3)
3834 rcStrict = DBGFSTOP(pVM);
3835 if (rcStrict != VINF_SUCCESS && rcStrict != VERR_DBGF_NOT_ATTACHED)
3836 return rcStrict;
3837#endif
3838 rcStrict = IEMExecLots(pVCpu, 2048, 511, NULL);
3839 }
3840 if (rcStrict == VINF_SUCCESS)
3841 {
3842 Assert(pVCpu->iem.s.cActiveMappings == 0);
3843
3844 /* Note! This IRQ/FF check is repeated in iemPollTimers, iemThreadedFunc_BltIn_CheckIrq
3845 and emitted by iemNativeRecompFunc_BltIn_CheckIrq. */
3846 uint64_t fCpu = pVCpu->fLocalForcedActions;
3847 fCpu &= VMCPU_FF_ALL_MASK & ~( VMCPU_FF_PGM_SYNC_CR3
3848 | VMCPU_FF_PGM_SYNC_CR3_NON_GLOBAL
3849 | VMCPU_FF_TLB_FLUSH
3850 | VMCPU_FF_UNHALT );
3851 /** @todo this isn't even close to the NMI/IRQ conditions in EM. */
3852 if (RT_LIKELY( ( !fCpu
3853 || ( !(fCpu & ~(VMCPU_FF_INTERRUPT_APIC | VMCPU_FF_INTERRUPT_PIC))
3854 && ( !pVCpu->cpum.GstCtx.rflags.Bits.u1IF
3855 || CPUMIsInInterruptShadow(&pVCpu->cpum.GstCtx) )) )
3856 && !VM_FF_IS_ANY_SET(pVM, VM_FF_ALL_MASK) ))
3857 {
3858 /* Once in a while we need to poll timers here. */
3859 if ((int32_t)--pVCpu->iem.s.cTbsTillNextTimerPoll > 0)
3860 { /* likely */ }
3861 else
3862 {
3863 int rc = iemPollTimers(pVM, pVCpu);
3864 if (rc != VINF_SUCCESS)
3865 return VINF_SUCCESS;
3866 }
3867 }
3868 else
3869 return VINF_SUCCESS;
3870 }
3871 else
3872 return rcStrict;
3873 }
3874 }
3875 IEM_CATCH_LONGJMP_BEGIN(pVCpu, rcStrict);
3876 {
3877 Assert(rcStrict != VINF_IEM_REEXEC_BREAK);
3878 pVCpu->iem.s.cLongJumps++;
3879#ifdef VBOX_WITH_IEM_NATIVE_RECOMPILER_LONGJMP
3880 pVCpu->iem.s.pvTbFramePointerR3 = NULL;
3881#endif
3882 if (pVCpu->iem.s.cActiveMappings > 0)
3883 iemMemRollback(pVCpu);
3884
3885#ifdef VBOX_WITH_IEM_NATIVE_RECOMPILER
3886 PIEMTB const pTb = pVCpu->iem.s.pCurTbR3;
3887 if (pTb && (pTb->fFlags & IEMTB_F_TYPE_MASK) == IEMTB_F_TYPE_NATIVE)
3888 {
3889 STAM_REL_COUNTER_INC(&pVCpu->iem.s.StatNativeTbExitLongJump);
3890# ifdef IEMNATIVE_WITH_INSTRUCTION_COUNTING
3891 Assert(pVCpu->iem.s.idxTbCurInstr < pTb->cInstructions);
3892 pVCpu->iem.s.cInstructions += pVCpu->iem.s.idxTbCurInstr;
3893# endif
3894
3895#ifdef IEMNATIVE_WITH_SIMD_FP_NATIVE_EMITTERS
3896 /* Restore FPCR/MXCSR if the TB modified it. */
3897 if (pVCpu->iem.s.uRegFpCtrl != IEMNATIVE_SIMD_FP_CTRL_REG_NOT_MODIFIED)
3898 {
3899 iemNativeFpCtrlRegRestore(pVCpu->iem.s.uRegFpCtrl);
3900 /* Reset for the next round saving us an unconditional instruction on next TB entry. */
3901 pVCpu->iem.s.uRegFpCtrl = IEMNATIVE_SIMD_FP_CTRL_REG_NOT_MODIFIED;
3902 }
3903#endif
3904 }
3905#endif
3906
3907#if 0 /** @todo do we need to clean up anything? If not, we can drop the pTb = NULL some lines up and change the scope. */
3908 /* If pTb isn't NULL we're in iemTbExec. */
3909 if (!pTb)
3910 {
3911 /* If pCurTbR3 is NULL, we're in iemGetPcWithPhysAndCode.*/
3912 pTb = pVCpu->iem.s.pCurTbR3;
3913 if (pTb)
3914 {
3915 if (pTb == pVCpu->iem.s.pThrdCompileTbR3)
3916 return iemThreadedCompileLongJumped(pVM, pVCpu, rcStrict);
3917 Assert(pTb != pVCpu->iem.s.pNativeCompileTbR3);
3918 }
3919 }
3920#endif
3921 pVCpu->iem.s.pCurTbR3 = NULL;
3922 return rcStrict;
3923 }
3924 IEM_CATCH_LONGJMP_END(pVCpu);
3925 }
3926}
3927