VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/PGMAllPool.cpp@ 43030

Last change on this file since 43030 was 42700, checked in by vboxsync, 12 years ago

pgmPoolAccessHandlerFlush: Deal with VINF_EM_RESCHEDULE.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id Revision
File size: 210.3 KB
Line 
1/* $Id: PGMAllPool.cpp 42700 2012-08-09 00:50:39Z vboxsync $ */
2/** @file
3 * PGM Shadow Page Pool.
4 */
5
6/*
7 * Copyright (C) 2006-2012 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 */
17
18
19/*******************************************************************************
20* Header Files *
21*******************************************************************************/
22#define LOG_GROUP LOG_GROUP_PGM_POOL
23#include <VBox/vmm/pgm.h>
24#include <VBox/vmm/mm.h>
25#include <VBox/vmm/em.h>
26#include <VBox/vmm/cpum.h>
27#ifdef IN_RC
28# include <VBox/vmm/patm.h>
29#endif
30#include "PGMInternal.h"
31#include <VBox/vmm/vm.h>
32#include "PGMInline.h"
33#include <VBox/disopcode.h>
34#include <VBox/vmm/hwacc_vmx.h>
35
36#include <VBox/log.h>
37#include <VBox/err.h>
38#include <iprt/asm.h>
39#include <iprt/asm-amd64-x86.h>
40#include <iprt/string.h>
41
42
43/*******************************************************************************
44* Internal Functions *
45*******************************************************************************/
46RT_C_DECLS_BEGIN
47DECLINLINE(unsigned) pgmPoolTrackGetShadowEntrySize(PGMPOOLKIND enmKind);
48DECLINLINE(unsigned) pgmPoolTrackGetGuestEntrySize(PGMPOOLKIND enmKind);
49static void pgmPoolTrackDeref(PPGMPOOL pPool, PPGMPOOLPAGE pPage);
50static int pgmPoolTrackAddUser(PPGMPOOL pPool, PPGMPOOLPAGE pPage, uint16_t iUser, uint32_t iUserTable);
51static void pgmPoolMonitorModifiedRemove(PPGMPOOL pPool, PPGMPOOLPAGE pPage);
52#ifndef IN_RING3
53DECLEXPORT(int) pgmPoolAccessHandler(PVM pVM, RTGCUINT uErrorCode, PCPUMCTXCORE pRegFrame, RTGCPTR pvFault, RTGCPHYS GCPhysFault, void *pvUser);
54#endif
55#ifdef LOG_ENABLED
56static const char *pgmPoolPoolKindToStr(uint8_t enmKind);
57#endif
58#if 0 /*defined(VBOX_STRICT) && defined(PGMPOOL_WITH_OPTIMIZED_DIRTY_PT)*/
59static void pgmPoolTrackCheckPTPaePae(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PPGMSHWPTPAE pShwPT, PCX86PTPAE pGstPT);
60#endif
61
62int pgmPoolTrackFlushGCPhysPTsSlow(PVM pVM, PPGMPAGE pPhysPage);
63PPGMPOOLPHYSEXT pgmPoolTrackPhysExtAlloc(PVM pVM, uint16_t *piPhysExt);
64void pgmPoolTrackPhysExtFree(PVM pVM, uint16_t iPhysExt);
65void pgmPoolTrackPhysExtFreeList(PVM pVM, uint16_t iPhysExt);
66
67RT_C_DECLS_END
68
69
70/**
71 * Checks if the specified page pool kind is for a 4MB or 2MB guest page.
72 *
73 * @returns true if it's the shadow of a 4MB or 2MB guest page, otherwise false.
74 * @param enmKind The page kind.
75 */
76DECLINLINE(bool) pgmPoolIsBigPage(PGMPOOLKIND enmKind)
77{
78 switch (enmKind)
79 {
80 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
81 case PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB:
82 case PGMPOOLKIND_PAE_PT_FOR_PAE_2MB:
83 return true;
84 default:
85 return false;
86 }
87}
88
89
90/**
91 * Flushes a chain of pages sharing the same access monitor.
92 *
93 * @returns VBox status code suitable for scheduling.
94 * @param pPool The pool.
95 * @param pPage A page in the chain.
96 * @todo VBOXSTRICTRC
97 */
98int pgmPoolMonitorChainFlush(PPGMPOOL pPool, PPGMPOOLPAGE pPage)
99{
100 LogFlow(("pgmPoolMonitorChainFlush: Flush page %RGp type=%d\n", pPage->GCPhys, pPage->enmKind));
101
102 /*
103 * Find the list head.
104 */
105 uint16_t idx = pPage->idx;
106 if (pPage->iMonitoredPrev != NIL_PGMPOOL_IDX)
107 {
108 while (pPage->iMonitoredPrev != NIL_PGMPOOL_IDX)
109 {
110 idx = pPage->iMonitoredPrev;
111 Assert(idx != pPage->idx);
112 pPage = &pPool->aPages[idx];
113 }
114 }
115
116 /*
117 * Iterate the list flushing each shadow page.
118 */
119 int rc = VINF_SUCCESS;
120 for (;;)
121 {
122 idx = pPage->iMonitoredNext;
123 Assert(idx != pPage->idx);
124 if (pPage->idx >= PGMPOOL_IDX_FIRST)
125 {
126 int rc2 = pgmPoolFlushPage(pPool, pPage);
127 AssertRC(rc2);
128 }
129 /* next */
130 if (idx == NIL_PGMPOOL_IDX)
131 break;
132 pPage = &pPool->aPages[idx];
133 }
134 return rc;
135}
136
137
138/**
139 * Wrapper for getting the current context pointer to the entry being modified.
140 *
141 * @returns VBox status code suitable for scheduling.
142 * @param pVM Pointer to the VM.
143 * @param pvDst Destination address
144 * @param pvSrc Source guest virtual address.
145 * @param GCPhysSrc The source guest physical address.
146 * @param cb Size of data to read
147 */
148DECLINLINE(int) pgmPoolPhysSimpleReadGCPhys(PVM pVM, void *pvDst, CTXTYPE(RTGCPTR, RTHCPTR, RTGCPTR) pvSrc,
149 RTGCPHYS GCPhysSrc, size_t cb)
150{
151#if defined(IN_RING3)
152 NOREF(pVM); NOREF(GCPhysSrc);
153 memcpy(pvDst, (RTHCPTR)((uintptr_t)pvSrc & ~(RTHCUINTPTR)(cb - 1)), cb);
154 return VINF_SUCCESS;
155#else
156 /* @todo in RC we could attempt to use the virtual address, although this can cause many faults (PAE Windows XP guest). */
157 NOREF(pvSrc);
158 return PGMPhysSimpleReadGCPhys(pVM, pvDst, GCPhysSrc & ~(RTGCPHYS)(cb - 1), cb);
159#endif
160}
161
162
163/**
164 * Process shadow entries before they are changed by the guest.
165 *
166 * For PT entries we will clear them. For PD entries, we'll simply check
167 * for mapping conflicts and set the SyncCR3 FF if found.
168 *
169 * @param pVCpu Pointer to the VMCPU.
170 * @param pPool The pool.
171 * @param pPage The head page.
172 * @param GCPhysFault The guest physical fault address.
173 * @param uAddress In R0 and GC this is the guest context fault address (flat).
174 * In R3 this is the host context 'fault' address.
175 * @param cbWrite Write size; might be zero if the caller knows we're not crossing entry boundaries
176 */
177void pgmPoolMonitorChainChanging(PVMCPU pVCpu, PPGMPOOL pPool, PPGMPOOLPAGE pPage, RTGCPHYS GCPhysFault,
178 CTXTYPE(RTGCPTR, RTHCPTR, RTGCPTR) pvAddress, unsigned cbWrite)
179{
180 AssertMsg(pPage->iMonitoredPrev == NIL_PGMPOOL_IDX, ("%u (idx=%u)\n", pPage->iMonitoredPrev, pPage->idx));
181 const unsigned off = GCPhysFault & PAGE_OFFSET_MASK;
182 PVM pVM = pPool->CTX_SUFF(pVM);
183 NOREF(pVCpu);
184
185 LogFlow(("pgmPoolMonitorChainChanging: %RGv phys=%RGp cbWrite=%d\n", (RTGCPTR)(CTXTYPE(RTGCPTR, uintptr_t, RTGCPTR))pvAddress, GCPhysFault, cbWrite));
186
187 for (;;)
188 {
189 union
190 {
191 void *pv;
192 PX86PT pPT;
193 PPGMSHWPTPAE pPTPae;
194 PX86PD pPD;
195 PX86PDPAE pPDPae;
196 PX86PDPT pPDPT;
197 PX86PML4 pPML4;
198 } uShw;
199
200 LogFlow(("pgmPoolMonitorChainChanging: page idx=%d phys=%RGp (next=%d) kind=%s\n", pPage->idx, pPage->GCPhys, pPage->iMonitoredNext, pgmPoolPoolKindToStr(pPage->enmKind), cbWrite));
201
202 uShw.pv = NULL;
203 switch (pPage->enmKind)
204 {
205 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
206 {
207 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,FaultPT));
208 uShw.pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
209 const unsigned iShw = off / sizeof(X86PTE);
210 LogFlow(("PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT iShw=%x\n", iShw));
211 if (uShw.pPT->a[iShw].n.u1Present)
212 {
213 X86PTE GstPte;
214
215 int rc = pgmPoolPhysSimpleReadGCPhys(pVM, &GstPte, pvAddress, GCPhysFault, sizeof(GstPte));
216 AssertRC(rc);
217 Log4(("pgmPoolMonitorChainChanging 32_32: deref %016RX64 GCPhys %08RX32\n", uShw.pPT->a[iShw].u & X86_PTE_PAE_PG_MASK, GstPte.u & X86_PTE_PG_MASK));
218 pgmPoolTracDerefGCPhysHint(pPool, pPage,
219 uShw.pPT->a[iShw].u & X86_PTE_PAE_PG_MASK,
220 GstPte.u & X86_PTE_PG_MASK,
221 iShw);
222 ASMAtomicWriteU32(&uShw.pPT->a[iShw].u, 0);
223 }
224 break;
225 }
226
227 /* page/2 sized */
228 case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
229 {
230 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,FaultPT));
231 uShw.pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
232 if (!((off ^ pPage->GCPhys) & (PAGE_SIZE / 2)))
233 {
234 const unsigned iShw = (off / sizeof(X86PTE)) & (X86_PG_PAE_ENTRIES - 1);
235 LogFlow(("PGMPOOLKIND_PAE_PT_FOR_32BIT_PT iShw=%x\n", iShw));
236 if (PGMSHWPTEPAE_IS_P(uShw.pPTPae->a[iShw]))
237 {
238 X86PTE GstPte;
239 int rc = pgmPoolPhysSimpleReadGCPhys(pVM, &GstPte, pvAddress, GCPhysFault, sizeof(GstPte));
240 AssertRC(rc);
241
242 Log4(("pgmPoolMonitorChainChanging pae_32: deref %016RX64 GCPhys %08RX32\n", uShw.pPT->a[iShw].u & X86_PTE_PAE_PG_MASK, GstPte.u & X86_PTE_PG_MASK));
243 pgmPoolTracDerefGCPhysHint(pPool, pPage,
244 PGMSHWPTEPAE_GET_HCPHYS(uShw.pPTPae->a[iShw]),
245 GstPte.u & X86_PTE_PG_MASK,
246 iShw);
247 PGMSHWPTEPAE_ATOMIC_SET(uShw.pPTPae->a[iShw], 0);
248 }
249 }
250 break;
251 }
252
253 case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
254 case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
255 case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
256 case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
257 {
258 unsigned iGst = off / sizeof(X86PDE);
259 unsigned iShwPdpt = iGst / 256;
260 unsigned iShw = (iGst % 256) * 2;
261 uShw.pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
262
263 LogFlow(("pgmPoolMonitorChainChanging PAE for 32 bits: iGst=%x iShw=%x idx = %d page idx=%d\n", iGst, iShw, iShwPdpt, pPage->enmKind - PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD));
264 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,FaultPD));
265 if (iShwPdpt == pPage->enmKind - (unsigned)PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD)
266 {
267 for (unsigned i = 0; i < 2; i++)
268 {
269# ifdef VBOX_WITH_RAW_MODE_NOT_R0
270 if ((uShw.pPDPae->a[iShw + i].u & (PGM_PDFLAGS_MAPPING | X86_PDE_P)) == (PGM_PDFLAGS_MAPPING | X86_PDE_P))
271 {
272 Assert(pgmMapAreMappingsEnabled(pVM));
273 VMCPU_FF_SET(pVCpu, VMCPU_FF_PGM_SYNC_CR3);
274 LogFlow(("pgmPoolMonitorChainChanging: Detected conflict at iShwPdpt=%#x iShw=%#x!\n", iShwPdpt, iShw+i));
275 break;
276 }
277# endif /* VBOX_WITH_RAW_MODE_NOT_R0 */
278 if (uShw.pPDPae->a[iShw+i].n.u1Present)
279 {
280 LogFlow(("pgmPoolMonitorChainChanging: pae pd iShw=%#x: %RX64 -> freeing it!\n", iShw+i, uShw.pPDPae->a[iShw+i].u));
281 pgmPoolFree(pVM,
282 uShw.pPDPae->a[iShw+i].u & X86_PDE_PAE_PG_MASK,
283 pPage->idx,
284 iShw + i);
285 ASMAtomicWriteU64(&uShw.pPDPae->a[iShw+i].u, 0);
286 }
287
288 /* paranoia / a bit assumptive. */
289 if ( (off & 3)
290 && (off & 3) + cbWrite > 4)
291 {
292 const unsigned iShw2 = iShw + 2 + i;
293 if (iShw2 < RT_ELEMENTS(uShw.pPDPae->a))
294 {
295# ifdef VBOX_WITH_RAW_MODE_NOT_R0
296 if ((uShw.pPDPae->a[iShw2].u & (PGM_PDFLAGS_MAPPING | X86_PDE_P)) == (PGM_PDFLAGS_MAPPING | X86_PDE_P))
297 {
298 Assert(pgmMapAreMappingsEnabled(pVM));
299 VMCPU_FF_SET(pVCpu, VMCPU_FF_PGM_SYNC_CR3);
300 LogFlow(("pgmPoolMonitorChainChanging: Detected conflict at iShwPdpt=%#x iShw2=%#x!\n", iShwPdpt, iShw2));
301 break;
302 }
303# endif /* VBOX_WITH_RAW_MODE_NOT_R0 */
304 if (uShw.pPDPae->a[iShw2].n.u1Present)
305 {
306 LogFlow(("pgmPoolMonitorChainChanging: pae pd iShw=%#x: %RX64 -> freeing it!\n", iShw2, uShw.pPDPae->a[iShw2].u));
307 pgmPoolFree(pVM,
308 uShw.pPDPae->a[iShw2].u & X86_PDE_PAE_PG_MASK,
309 pPage->idx,
310 iShw2);
311 ASMAtomicWriteU64(&uShw.pPDPae->a[iShw2].u, 0);
312 }
313 }
314 }
315 }
316 }
317 break;
318 }
319
320 case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
321 {
322 uShw.pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
323 const unsigned iShw = off / sizeof(X86PTEPAE);
324 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,FaultPT));
325 if (PGMSHWPTEPAE_IS_P(uShw.pPTPae->a[iShw]))
326 {
327 X86PTEPAE GstPte;
328 int rc = pgmPoolPhysSimpleReadGCPhys(pVM, &GstPte, pvAddress, GCPhysFault, sizeof(GstPte));
329 AssertRC(rc);
330
331 Log4(("pgmPoolMonitorChainChanging pae: deref %016RX64 GCPhys %016RX64\n", PGMSHWPTEPAE_GET_HCPHYS(uShw.pPTPae->a[iShw]), GstPte.u & X86_PTE_PAE_PG_MASK));
332 pgmPoolTracDerefGCPhysHint(pPool, pPage,
333 PGMSHWPTEPAE_GET_HCPHYS(uShw.pPTPae->a[iShw]),
334 GstPte.u & X86_PTE_PAE_PG_MASK,
335 iShw);
336 PGMSHWPTEPAE_ATOMIC_SET(uShw.pPTPae->a[iShw], 0);
337 }
338
339 /* paranoia / a bit assumptive. */
340 if ( (off & 7)
341 && (off & 7) + cbWrite > sizeof(X86PTEPAE))
342 {
343 const unsigned iShw2 = (off + cbWrite - 1) / sizeof(X86PTEPAE);
344 AssertBreak(iShw2 < RT_ELEMENTS(uShw.pPTPae->a));
345
346 if (PGMSHWPTEPAE_IS_P(uShw.pPTPae->a[iShw2]))
347 {
348 X86PTEPAE GstPte;
349# ifdef IN_RING3
350 int rc = pgmPoolPhysSimpleReadGCPhys(pVM, &GstPte, (RTHCPTR)((RTHCUINTPTR)pvAddress + sizeof(GstPte)), GCPhysFault + sizeof(GstPte), sizeof(GstPte));
351# else
352 int rc = pgmPoolPhysSimpleReadGCPhys(pVM, &GstPte, pvAddress + sizeof(GstPte), GCPhysFault + sizeof(GstPte), sizeof(GstPte));
353# endif
354 AssertRC(rc);
355 Log4(("pgmPoolMonitorChainChanging pae: deref %016RX64 GCPhys %016RX64\n", PGMSHWPTEPAE_GET_HCPHYS(uShw.pPTPae->a[iShw2]), GstPte.u & X86_PTE_PAE_PG_MASK));
356 pgmPoolTracDerefGCPhysHint(pPool, pPage,
357 PGMSHWPTEPAE_GET_HCPHYS(uShw.pPTPae->a[iShw2]),
358 GstPte.u & X86_PTE_PAE_PG_MASK,
359 iShw2);
360 PGMSHWPTEPAE_ATOMIC_SET(uShw.pPTPae->a[iShw2], 0);
361 }
362 }
363 break;
364 }
365
366 case PGMPOOLKIND_32BIT_PD:
367 {
368 uShw.pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
369 const unsigned iShw = off / sizeof(X86PTE); // ASSUMING 32-bit guest paging!
370
371 LogFlow(("pgmPoolMonitorChainChanging: PGMPOOLKIND_32BIT_PD %x\n", iShw));
372 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,FaultPD));
373# ifdef VBOX_WITH_RAW_MODE_NOT_R0
374 if (uShw.pPD->a[iShw].u & PGM_PDFLAGS_MAPPING)
375 {
376 Assert(pgmMapAreMappingsEnabled(pVM));
377 VMCPU_FF_SET(pVCpu, VMCPU_FF_PGM_SYNC_CR3);
378 STAM_COUNTER_INC(&(pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZGuestCR3WriteConflict));
379 LogFlow(("pgmPoolMonitorChainChanging: Detected conflict at iShw=%#x!\n", iShw));
380 break;
381 }
382 else
383# endif /* VBOX_WITH_RAW_MODE_NOT_R0 */
384 {
385 if (uShw.pPD->a[iShw].n.u1Present)
386 {
387 LogFlow(("pgmPoolMonitorChainChanging: 32 bit pd iShw=%#x: %RX64 -> freeing it!\n", iShw, uShw.pPD->a[iShw].u));
388 pgmPoolFree(pVM,
389 uShw.pPD->a[iShw].u & X86_PDE_PAE_PG_MASK,
390 pPage->idx,
391 iShw);
392 ASMAtomicWriteU32(&uShw.pPD->a[iShw].u, 0);
393 }
394 }
395 /* paranoia / a bit assumptive. */
396 if ( (off & 3)
397 && (off & 3) + cbWrite > sizeof(X86PTE))
398 {
399 const unsigned iShw2 = (off + cbWrite - 1) / sizeof(X86PTE);
400 if ( iShw2 != iShw
401 && iShw2 < RT_ELEMENTS(uShw.pPD->a))
402 {
403# ifdef VBOX_WITH_RAW_MODE_NOT_R0
404 if (uShw.pPD->a[iShw2].u & PGM_PDFLAGS_MAPPING)
405 {
406 Assert(pgmMapAreMappingsEnabled(pVM));
407 STAM_COUNTER_INC(&(pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZGuestCR3WriteConflict));
408 VMCPU_FF_SET(pVCpu, VMCPU_FF_PGM_SYNC_CR3);
409 LogFlow(("pgmPoolMonitorChainChanging: Detected conflict at iShw2=%#x!\n", iShw2));
410 break;
411 }
412# endif /* VBOX_WITH_RAW_MODE_NOT_R0 */
413 if (uShw.pPD->a[iShw2].n.u1Present)
414 {
415 LogFlow(("pgmPoolMonitorChainChanging: 32 bit pd iShw=%#x: %RX64 -> freeing it!\n", iShw2, uShw.pPD->a[iShw2].u));
416 pgmPoolFree(pVM,
417 uShw.pPD->a[iShw2].u & X86_PDE_PAE_PG_MASK,
418 pPage->idx,
419 iShw2);
420 ASMAtomicWriteU32(&uShw.pPD->a[iShw2].u, 0);
421 }
422 }
423 }
424#if 0 /* useful when running PGMAssertCR3(), a bit too troublesome for general use (TLBs). */
425 if ( uShw.pPD->a[iShw].n.u1Present
426 && !VMCPU_FF_ISSET(pVCpu, VMCPU_FF_PGM_SYNC_CR3))
427 {
428 LogFlow(("pgmPoolMonitorChainChanging: iShw=%#x: %RX32 -> freeing it!\n", iShw, uShw.pPD->a[iShw].u));
429# ifdef IN_RC /* TLB load - we're pushing things a bit... */
430 ASMProbeReadByte(pvAddress);
431# endif
432 pgmPoolFree(pVM, uShw.pPD->a[iShw].u & X86_PDE_PG_MASK, pPage->idx, iShw);
433 ASMAtomicWriteU32(&uShw.pPD->a[iShw].u, 0);
434 }
435#endif
436 break;
437 }
438
439 case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
440 {
441 uShw.pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
442 const unsigned iShw = off / sizeof(X86PDEPAE);
443 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,FaultPD));
444#ifdef VBOX_WITH_RAW_MODE_NOT_R0
445 if (uShw.pPDPae->a[iShw].u & PGM_PDFLAGS_MAPPING)
446 {
447 Assert(pgmMapAreMappingsEnabled(pVM));
448 VMCPU_FF_SET(pVCpu, VMCPU_FF_PGM_SYNC_CR3);
449 STAM_COUNTER_INC(&(pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZGuestCR3WriteConflict));
450 LogFlow(("pgmPoolMonitorChainChanging: Detected conflict at iShw=%#x!\n", iShw));
451 break;
452 }
453#endif /* VBOX_WITH_RAW_MODE_NOT_R0 */
454 /*
455 * Causes trouble when the guest uses a PDE to refer to the whole page table level
456 * structure. (Invalidate here; faults later on when it tries to change the page
457 * table entries -> recheck; probably only applies to the RC case.)
458 */
459#ifdef VBOX_WITH_RAW_MODE_NOT_R0
460 else
461#endif
462 {
463 if (uShw.pPDPae->a[iShw].n.u1Present)
464 {
465 LogFlow(("pgmPoolMonitorChainChanging: pae pd iShw=%#x: %RX64 -> freeing it!\n", iShw, uShw.pPDPae->a[iShw].u));
466 pgmPoolFree(pVM,
467 uShw.pPDPae->a[iShw].u & X86_PDE_PAE_PG_MASK,
468 pPage->idx,
469 iShw);
470 ASMAtomicWriteU64(&uShw.pPDPae->a[iShw].u, 0);
471 }
472 }
473 /* paranoia / a bit assumptive. */
474 if ( (off & 7)
475 && (off & 7) + cbWrite > sizeof(X86PDEPAE))
476 {
477 const unsigned iShw2 = (off + cbWrite - 1) / sizeof(X86PDEPAE);
478 AssertBreak(iShw2 < RT_ELEMENTS(uShw.pPDPae->a));
479
480#ifdef VBOX_WITH_RAW_MODE_NOT_R0
481 if ( iShw2 != iShw
482 && uShw.pPDPae->a[iShw2].u & PGM_PDFLAGS_MAPPING)
483 {
484 Assert(pgmMapAreMappingsEnabled(pVM));
485 VMCPU_FF_SET(pVCpu, VMCPU_FF_PGM_SYNC_CR3);
486 STAM_COUNTER_INC(&(pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZGuestCR3WriteConflict));
487 LogFlow(("pgmPoolMonitorChainChanging: Detected conflict at iShw2=%#x!\n", iShw2));
488 break;
489 }
490 else
491#endif /* VBOX_WITH_RAW_MODE_NOT_R0 */
492 if (uShw.pPDPae->a[iShw2].n.u1Present)
493 {
494 LogFlow(("pgmPoolMonitorChainChanging: pae pd iShw2=%#x: %RX64 -> freeing it!\n", iShw2, uShw.pPDPae->a[iShw2].u));
495 pgmPoolFree(pVM,
496 uShw.pPDPae->a[iShw2].u & X86_PDE_PAE_PG_MASK,
497 pPage->idx,
498 iShw2);
499 ASMAtomicWriteU64(&uShw.pPDPae->a[iShw2].u, 0);
500 }
501 }
502 break;
503 }
504
505 case PGMPOOLKIND_PAE_PDPT:
506 {
507 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,FaultPDPT));
508 /*
509 * Hopefully this doesn't happen very often:
510 * - touching unused parts of the page
511 * - messing with the bits of pd pointers without changing the physical address
512 */
513 /* PDPT roots are not page aligned; 32 byte only! */
514 const unsigned offPdpt = GCPhysFault - pPage->GCPhys;
515
516 uShw.pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
517 const unsigned iShw = offPdpt / sizeof(X86PDPE);
518 if (iShw < X86_PG_PAE_PDPE_ENTRIES) /* don't use RT_ELEMENTS(uShw.pPDPT->a), because that's for long mode only */
519 {
520# ifdef VBOX_WITH_RAW_MODE_NOT_R0
521 if (uShw.pPDPT->a[iShw].u & PGM_PLXFLAGS_MAPPING)
522 {
523 Assert(pgmMapAreMappingsEnabled(pVM));
524 STAM_COUNTER_INC(&(pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZGuestCR3WriteConflict));
525 VMCPU_FF_SET(pVCpu, VMCPU_FF_PGM_SYNC_CR3);
526 LogFlow(("pgmPoolMonitorChainChanging: Detected pdpt conflict at iShw=%#x!\n", iShw));
527 break;
528 }
529 else
530# endif /* VBOX_WITH_RAW_MODE_NOT_R0 */
531 if (uShw.pPDPT->a[iShw].n.u1Present)
532 {
533 LogFlow(("pgmPoolMonitorChainChanging: pae pdpt iShw=%#x: %RX64 -> freeing it!\n", iShw, uShw.pPDPT->a[iShw].u));
534 pgmPoolFree(pVM,
535 uShw.pPDPT->a[iShw].u & X86_PDPE_PG_MASK,
536 pPage->idx,
537 iShw);
538 ASMAtomicWriteU64(&uShw.pPDPT->a[iShw].u, 0);
539 }
540
541 /* paranoia / a bit assumptive. */
542 if ( (offPdpt & 7)
543 && (offPdpt & 7) + cbWrite > sizeof(X86PDPE))
544 {
545 const unsigned iShw2 = (offPdpt + cbWrite - 1) / sizeof(X86PDPE);
546 if ( iShw2 != iShw
547 && iShw2 < X86_PG_PAE_PDPE_ENTRIES)
548 {
549# ifdef VBOX_WITH_RAW_MODE_NOT_R0
550 if (uShw.pPDPT->a[iShw2].u & PGM_PLXFLAGS_MAPPING)
551 {
552 Assert(pgmMapAreMappingsEnabled(pVM));
553 STAM_COUNTER_INC(&(pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZGuestCR3WriteConflict));
554 VMCPU_FF_SET(pVCpu, VMCPU_FF_PGM_SYNC_CR3);
555 LogFlow(("pgmPoolMonitorChainChanging: Detected conflict at iShw2=%#x!\n", iShw2));
556 break;
557 }
558 else
559# endif /* VBOX_WITH_RAW_MODE_NOT_R0 */
560 if (uShw.pPDPT->a[iShw2].n.u1Present)
561 {
562 LogFlow(("pgmPoolMonitorChainChanging: pae pdpt iShw=%#x: %RX64 -> freeing it!\n", iShw2, uShw.pPDPT->a[iShw2].u));
563 pgmPoolFree(pVM,
564 uShw.pPDPT->a[iShw2].u & X86_PDPE_PG_MASK,
565 pPage->idx,
566 iShw2);
567 ASMAtomicWriteU64(&uShw.pPDPT->a[iShw2].u, 0);
568 }
569 }
570 }
571 }
572 break;
573 }
574
575#ifndef IN_RC
576 case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
577 {
578 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,FaultPD));
579 uShw.pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
580 const unsigned iShw = off / sizeof(X86PDEPAE);
581 Assert(!(uShw.pPDPae->a[iShw].u & PGM_PDFLAGS_MAPPING));
582 if (uShw.pPDPae->a[iShw].n.u1Present)
583 {
584 LogFlow(("pgmPoolMonitorChainChanging: pae pd iShw=%#x: %RX64 -> freeing it!\n", iShw, uShw.pPDPae->a[iShw].u));
585 pgmPoolFree(pVM,
586 uShw.pPDPae->a[iShw].u & X86_PDE_PAE_PG_MASK,
587 pPage->idx,
588 iShw);
589 ASMAtomicWriteU64(&uShw.pPDPae->a[iShw].u, 0);
590 }
591 /* paranoia / a bit assumptive. */
592 if ( (off & 7)
593 && (off & 7) + cbWrite > sizeof(X86PDEPAE))
594 {
595 const unsigned iShw2 = (off + cbWrite - 1) / sizeof(X86PDEPAE);
596 AssertBreak(iShw2 < RT_ELEMENTS(uShw.pPDPae->a));
597
598 Assert(!(uShw.pPDPae->a[iShw2].u & PGM_PDFLAGS_MAPPING));
599 if (uShw.pPDPae->a[iShw2].n.u1Present)
600 {
601 LogFlow(("pgmPoolMonitorChainChanging: pae pd iShw2=%#x: %RX64 -> freeing it!\n", iShw2, uShw.pPDPae->a[iShw2].u));
602 pgmPoolFree(pVM,
603 uShw.pPDPae->a[iShw2].u & X86_PDE_PAE_PG_MASK,
604 pPage->idx,
605 iShw2);
606 ASMAtomicWriteU64(&uShw.pPDPae->a[iShw2].u, 0);
607 }
608 }
609 break;
610 }
611
612 case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
613 {
614 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,FaultPDPT));
615 /*
616 * Hopefully this doesn't happen very often:
617 * - messing with the bits of pd pointers without changing the physical address
618 */
619 uShw.pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
620 const unsigned iShw = off / sizeof(X86PDPE);
621 if (uShw.pPDPT->a[iShw].n.u1Present)
622 {
623 LogFlow(("pgmPoolMonitorChainChanging: pdpt iShw=%#x: %RX64 -> freeing it!\n", iShw, uShw.pPDPT->a[iShw].u));
624 pgmPoolFree(pVM, uShw.pPDPT->a[iShw].u & X86_PDPE_PG_MASK, pPage->idx, iShw);
625 ASMAtomicWriteU64(&uShw.pPDPT->a[iShw].u, 0);
626 }
627 /* paranoia / a bit assumptive. */
628 if ( (off & 7)
629 && (off & 7) + cbWrite > sizeof(X86PDPE))
630 {
631 const unsigned iShw2 = (off + cbWrite - 1) / sizeof(X86PDPE);
632 if (uShw.pPDPT->a[iShw2].n.u1Present)
633 {
634 LogFlow(("pgmPoolMonitorChainChanging: pdpt iShw2=%#x: %RX64 -> freeing it!\n", iShw2, uShw.pPDPT->a[iShw2].u));
635 pgmPoolFree(pVM, uShw.pPDPT->a[iShw2].u & X86_PDPE_PG_MASK, pPage->idx, iShw2);
636 ASMAtomicWriteU64(&uShw.pPDPT->a[iShw2].u, 0);
637 }
638 }
639 break;
640 }
641
642 case PGMPOOLKIND_64BIT_PML4:
643 {
644 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,FaultPML4));
645 /*
646 * Hopefully this doesn't happen very often:
647 * - messing with the bits of pd pointers without changing the physical address
648 */
649 uShw.pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
650 const unsigned iShw = off / sizeof(X86PDPE);
651 if (uShw.pPML4->a[iShw].n.u1Present)
652 {
653 LogFlow(("pgmPoolMonitorChainChanging: pml4 iShw=%#x: %RX64 -> freeing it!\n", iShw, uShw.pPML4->a[iShw].u));
654 pgmPoolFree(pVM, uShw.pPML4->a[iShw].u & X86_PML4E_PG_MASK, pPage->idx, iShw);
655 ASMAtomicWriteU64(&uShw.pPML4->a[iShw].u, 0);
656 }
657 /* paranoia / a bit assumptive. */
658 if ( (off & 7)
659 && (off & 7) + cbWrite > sizeof(X86PDPE))
660 {
661 const unsigned iShw2 = (off + cbWrite - 1) / sizeof(X86PML4E);
662 if (uShw.pPML4->a[iShw2].n.u1Present)
663 {
664 LogFlow(("pgmPoolMonitorChainChanging: pml4 iShw2=%#x: %RX64 -> freeing it!\n", iShw2, uShw.pPML4->a[iShw2].u));
665 pgmPoolFree(pVM, uShw.pPML4->a[iShw2].u & X86_PML4E_PG_MASK, pPage->idx, iShw2);
666 ASMAtomicWriteU64(&uShw.pPML4->a[iShw2].u, 0);
667 }
668 }
669 break;
670 }
671#endif /* IN_RING0 */
672
673 default:
674 AssertFatalMsgFailed(("enmKind=%d\n", pPage->enmKind));
675 }
676 PGM_DYNMAP_UNUSED_HINT_VM(pVM, uShw.pv);
677
678 /* next */
679 if (pPage->iMonitoredNext == NIL_PGMPOOL_IDX)
680 return;
681 pPage = &pPool->aPages[pPage->iMonitoredNext];
682 }
683}
684
685# ifndef IN_RING3
686
687/**
688 * Checks if a access could be a fork operation in progress.
689 *
690 * Meaning, that the guest is setting up the parent process for Copy-On-Write.
691 *
692 * @returns true if it's likely that we're forking, otherwise false.
693 * @param pPool The pool.
694 * @param pDis The disassembled instruction.
695 * @param offFault The access offset.
696 */
697DECLINLINE(bool) pgmPoolMonitorIsForking(PPGMPOOL pPool, PDISCPUSTATE pDis, unsigned offFault)
698{
699 /*
700 * i386 linux is using btr to clear X86_PTE_RW.
701 * The functions involved are (2.6.16 source inspection):
702 * clear_bit
703 * ptep_set_wrprotect
704 * copy_one_pte
705 * copy_pte_range
706 * copy_pmd_range
707 * copy_pud_range
708 * copy_page_range
709 * dup_mmap
710 * dup_mm
711 * copy_mm
712 * copy_process
713 * do_fork
714 */
715 if ( pDis->pCurInstr->uOpcode == OP_BTR
716 && !(offFault & 4)
717 /** @todo Validate that the bit index is X86_PTE_RW. */
718 )
719 {
720 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,Fork));
721 return true;
722 }
723 return false;
724}
725
726
727/**
728 * Determine whether the page is likely to have been reused.
729 *
730 * @returns true if we consider the page as being reused for a different purpose.
731 * @returns false if we consider it to still be a paging page.
732 * @param pVM Pointer to the VM.
733 * @param pVCpu Pointer to the VMCPU.
734 * @param pRegFrame Trap register frame.
735 * @param pDis The disassembly info for the faulting instruction.
736 * @param pvFault The fault address.
737 *
738 * @remark The REP prefix check is left to the caller because of STOSD/W.
739 */
740DECLINLINE(bool) pgmPoolMonitorIsReused(PVM pVM, PVMCPU pVCpu, PCPUMCTXCORE pRegFrame, PDISCPUSTATE pDis, RTGCPTR pvFault)
741{
742#ifndef IN_RC
743 /** @todo could make this general, faulting close to rsp should be a safe reuse heuristic. */
744 if ( HWACCMHasPendingIrq(pVM)
745 && (pRegFrame->rsp - pvFault) < 32)
746 {
747 /* Fault caused by stack writes while trying to inject an interrupt event. */
748 Log(("pgmPoolMonitorIsReused: reused %RGv for interrupt stack (rsp=%RGv).\n", pvFault, pRegFrame->rsp));
749 return true;
750 }
751#else
752 NOREF(pVM); NOREF(pvFault);
753#endif
754
755 LogFlow(("Reused instr %RGv %d at %RGv param1.fUse=%llx param1.reg=%d\n", pRegFrame->rip, pDis->pCurInstr->uOpcode, pvFault, pDis->Param1.fUse, pDis->Param1.Base.idxGenReg));
756
757 /* Non-supervisor mode write means it's used for something else. */
758 if (CPUMGetGuestCPL(pVCpu) != 0)
759 return true;
760
761 switch (pDis->pCurInstr->uOpcode)
762 {
763 /* call implies the actual push of the return address faulted */
764 case OP_CALL:
765 Log4(("pgmPoolMonitorIsReused: CALL\n"));
766 return true;
767 case OP_PUSH:
768 Log4(("pgmPoolMonitorIsReused: PUSH\n"));
769 return true;
770 case OP_PUSHF:
771 Log4(("pgmPoolMonitorIsReused: PUSHF\n"));
772 return true;
773 case OP_PUSHA:
774 Log4(("pgmPoolMonitorIsReused: PUSHA\n"));
775 return true;
776 case OP_FXSAVE:
777 Log4(("pgmPoolMonitorIsReused: FXSAVE\n"));
778 return true;
779 case OP_MOVNTI: /* solaris - block_zero_no_xmm */
780 Log4(("pgmPoolMonitorIsReused: MOVNTI\n"));
781 return true;
782 case OP_MOVNTDQ: /* solaris - hwblkclr & hwblkpagecopy */
783 Log4(("pgmPoolMonitorIsReused: MOVNTDQ\n"));
784 return true;
785 case OP_MOVSWD:
786 case OP_STOSWD:
787 if ( pDis->fPrefix == (DISPREFIX_REP|DISPREFIX_REX)
788 && pRegFrame->rcx >= 0x40
789 )
790 {
791 Assert(pDis->uCpuMode == DISCPUMODE_64BIT);
792
793 Log(("pgmPoolMonitorIsReused: OP_STOSQ\n"));
794 return true;
795 }
796 return false;
797 }
798 if ( ( (pDis->Param1.fUse & DISUSE_REG_GEN32)
799 || (pDis->Param1.fUse & DISUSE_REG_GEN64))
800 && (pDis->Param1.Base.idxGenReg == DISGREG_ESP))
801 {
802 Log4(("pgmPoolMonitorIsReused: ESP\n"));
803 return true;
804 }
805
806 return false;
807}
808
809
810/**
811 * Flushes the page being accessed.
812 *
813 * @returns VBox status code suitable for scheduling.
814 * @param pVM Pointer to the VM.
815 * @param pVCpu Pointer to the VMCPU.
816 * @param pPool The pool.
817 * @param pPage The pool page (head).
818 * @param pDis The disassembly of the write instruction.
819 * @param pRegFrame The trap register frame.
820 * @param GCPhysFault The fault address as guest physical address.
821 * @param pvFault The fault address.
822 * @todo VBOXSTRICTRC
823 */
824static int pgmPoolAccessHandlerFlush(PVM pVM, PVMCPU pVCpu, PPGMPOOL pPool, PPGMPOOLPAGE pPage, PDISCPUSTATE pDis,
825 PCPUMCTXCORE pRegFrame, RTGCPHYS GCPhysFault, RTGCPTR pvFault)
826{
827 NOREF(GCPhysFault);
828
829 /*
830 * First, do the flushing.
831 */
832 int rc = pgmPoolMonitorChainFlush(pPool, pPage);
833
834 /*
835 * Emulate the instruction (xp/w2k problem, requires pc/cr2/sp detection).
836 * Must do this in raw mode (!); XP boot will fail otherwise.
837 */
838 VBOXSTRICTRC rc2 = EMInterpretInstructionDisasState(pVCpu, pDis, pRegFrame, pvFault, EMCODETYPE_ALL);
839 if (rc2 == VINF_SUCCESS)
840 { /* do nothing */ }
841#ifdef VBOX_WITH_IEM
842 else if (rc2 == VINF_EM_RESCHEDULE)
843 {
844 if (rc == VINF_SUCCESS)
845 rc = rc2;
846# ifndef IN_RING3
847 VMCPU_FF_SET(pVCpu, VMCPU_FF_TO_R3);
848# endif
849 }
850#endif
851 else if (rc2 == VERR_EM_INTERPRETER)
852 {
853#ifdef IN_RC
854 if (PATMIsPatchGCAddr(pVM, pRegFrame->eip))
855 {
856 LogFlow(("pgmPoolAccessHandlerPTWorker: Interpretation failed for patch code %04x:%RGv, ignoring.\n",
857 pRegFrame->cs.Sel, (RTGCPTR)pRegFrame->eip));
858 rc = VINF_SUCCESS;
859 STAM_COUNTER_INC(&pPool->StatMonitorRZIntrFailPatch2);
860 }
861 else
862#endif
863 {
864 rc = VINF_EM_RAW_EMULATE_INSTR;
865 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,EmulateInstr));
866 }
867 }
868 else if (RT_FAILURE_NP(rc2))
869 rc = VBOXSTRICTRC_VAL(rc2);
870 else
871 AssertMsgFailed(("%Rrc\n", VBOXSTRICTRC_VAL(rc2))); /* ASSUMES no complicated stuff here. */
872
873 LogFlow(("pgmPoolAccessHandlerPT: returns %Rrc (flushed)\n", rc));
874 return rc;
875}
876
877
878/**
879 * Handles the STOSD write accesses.
880 *
881 * @returns VBox status code suitable for scheduling.
882 * @param pVM Pointer to the VM.
883 * @param pPool The pool.
884 * @param pPage The pool page (head).
885 * @param pDis The disassembly of the write instruction.
886 * @param pRegFrame The trap register frame.
887 * @param GCPhysFault The fault address as guest physical address.
888 * @param pvFault The fault address.
889 */
890DECLINLINE(int) pgmPoolAccessHandlerSTOSD(PVM pVM, PPGMPOOL pPool, PPGMPOOLPAGE pPage, PDISCPUSTATE pDis,
891 PCPUMCTXCORE pRegFrame, RTGCPHYS GCPhysFault, RTGCPTR pvFault)
892{
893 unsigned uIncrement = pDis->Param1.cb;
894 NOREF(pVM);
895
896 Assert(pDis->uCpuMode == DISCPUMODE_32BIT || pDis->uCpuMode == DISCPUMODE_64BIT);
897 Assert(pRegFrame->rcx <= 0x20);
898
899#ifdef VBOX_STRICT
900 if (pDis->uOpMode == DISCPUMODE_32BIT)
901 Assert(uIncrement == 4);
902 else
903 Assert(uIncrement == 8);
904#endif
905
906 Log3(("pgmPoolAccessHandlerSTOSD\n"));
907
908 /*
909 * Increment the modification counter and insert it into the list
910 * of modified pages the first time.
911 */
912 if (!pPage->cModifications++)
913 pgmPoolMonitorModifiedInsert(pPool, pPage);
914
915 /*
916 * Execute REP STOSD.
917 *
918 * This ASSUMES that we're not invoked by Trap0e on in a out-of-sync
919 * write situation, meaning that it's safe to write here.
920 */
921 PVMCPU pVCpu = VMMGetCpu(pPool->CTX_SUFF(pVM));
922 RTGCUINTPTR pu32 = (RTGCUINTPTR)pvFault;
923 while (pRegFrame->rcx)
924 {
925#if defined(VBOX_WITH_2X_4GB_ADDR_SPACE_IN_R0) || defined(IN_RC)
926 uint32_t iPrevSubset = PGMRZDynMapPushAutoSubset(pVCpu);
927 pgmPoolMonitorChainChanging(pVCpu, pPool, pPage, GCPhysFault, (RTGCPTR)pu32, uIncrement);
928 PGMRZDynMapPopAutoSubset(pVCpu, iPrevSubset);
929#else
930 pgmPoolMonitorChainChanging(pVCpu, pPool, pPage, GCPhysFault, (RTGCPTR)pu32, uIncrement);
931#endif
932#ifdef IN_RC
933 *(uint32_t *)(uintptr_t)pu32 = pRegFrame->eax;
934#else
935 PGMPhysSimpleWriteGCPhys(pVM, GCPhysFault, &pRegFrame->rax, uIncrement);
936#endif
937 pu32 += uIncrement;
938 GCPhysFault += uIncrement;
939 pRegFrame->rdi += uIncrement;
940 pRegFrame->rcx--;
941 }
942 pRegFrame->rip += pDis->cbInstr;
943
944 LogFlow(("pgmPoolAccessHandlerSTOSD: returns\n"));
945 return VINF_SUCCESS;
946}
947
948
949/**
950 * Handles the simple write accesses.
951 *
952 * @returns VBox status code suitable for scheduling.
953 * @param pVM Pointer to the VM.
954 * @param pVCpu Pointer to the VMCPU.
955 * @param pPool The pool.
956 * @param pPage The pool page (head).
957 * @param pDis The disassembly of the write instruction.
958 * @param pRegFrame The trap register frame.
959 * @param GCPhysFault The fault address as guest physical address.
960 * @param pvFault The fault address.
961 * @param pfReused Reused state (in/out)
962 */
963DECLINLINE(int) pgmPoolAccessHandlerSimple(PVM pVM, PVMCPU pVCpu, PPGMPOOL pPool, PPGMPOOLPAGE pPage, PDISCPUSTATE pDis,
964 PCPUMCTXCORE pRegFrame, RTGCPHYS GCPhysFault, RTGCPTR pvFault, bool *pfReused)
965{
966 Log3(("pgmPoolAccessHandlerSimple\n"));
967 NOREF(pfReused); /* initialized by caller */
968
969 /*
970 * Increment the modification counter and insert it into the list
971 * of modified pages the first time.
972 */
973 if (!pPage->cModifications++)
974 pgmPoolMonitorModifiedInsert(pPool, pPage);
975
976 /*
977 * Clear all the pages. ASSUMES that pvFault is readable.
978 */
979#if defined(VBOX_WITH_2X_4GB_ADDR_SPACE_IN_R0) || defined(IN_RC)
980 uint32_t iPrevSubset = PGMRZDynMapPushAutoSubset(pVCpu);
981 pgmPoolMonitorChainChanging(pVCpu, pPool, pPage, GCPhysFault, pvFault, DISGetParamSize(pDis, &pDis->Param1));
982 PGMRZDynMapPopAutoSubset(pVCpu, iPrevSubset);
983#else
984 pgmPoolMonitorChainChanging(pVCpu, pPool, pPage, GCPhysFault, pvFault, DISGetParamSize(pDis, &pDis->Param1));
985#endif
986
987 /*
988 * Interpret the instruction.
989 */
990 VBOXSTRICTRC rc = EMInterpretInstructionDisasState(pVCpu, pDis, pRegFrame, pvFault, EMCODETYPE_ALL);
991 if (RT_SUCCESS(rc))
992 AssertMsg(rc == VINF_SUCCESS, ("%Rrc\n", VBOXSTRICTRC_VAL(rc))); /* ASSUMES no complicated stuff here. */
993 else if (rc == VERR_EM_INTERPRETER)
994 {
995 LogFlow(("pgmPoolAccessHandlerPTWorker: Interpretation failed for %04x:%RGv - opcode=%d\n",
996 pRegFrame->cs.Sel, (RTGCPTR)pRegFrame->rip, pDis->pCurInstr->uOpcode));
997 rc = VINF_EM_RAW_EMULATE_INSTR;
998 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,EmulateInstr));
999 }
1000
1001#if 0 /* experimental code */
1002 if (rc == VINF_SUCCESS)
1003 {
1004 switch (pPage->enmKind)
1005 {
1006 case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
1007 {
1008 X86PTEPAE GstPte;
1009 int rc = pgmPoolPhysSimpleReadGCPhys(pVM, &GstPte, pvFault, GCPhysFault, sizeof(GstPte));
1010 AssertRC(rc);
1011
1012 /* Check the new value written by the guest. If present and with a bogus physical address, then
1013 * it's fairly safe to assume the guest is reusing the PT.
1014 */
1015 if (GstPte.n.u1Present)
1016 {
1017 RTHCPHYS HCPhys = -1;
1018 int rc = PGMPhysGCPhys2HCPhys(pVM, GstPte.u & X86_PTE_PAE_PG_MASK, &HCPhys);
1019 if (rc != VINF_SUCCESS)
1020 {
1021 *pfReused = true;
1022 STAM_COUNTER_INC(&pPool->StatForceFlushReused);
1023 }
1024 }
1025 break;
1026 }
1027 }
1028 }
1029#endif
1030
1031 LogFlow(("pgmPoolAccessHandlerSimple: returns %Rrc\n", VBOXSTRICTRC_VAL(rc)));
1032 return VBOXSTRICTRC_VAL(rc);
1033}
1034
1035
1036/**
1037 * \#PF Handler callback for PT write accesses.
1038 *
 * Takes the PGM lock, disassembles the faulting instruction and then picks
 * one of the following strategies:
 *   - emulate the write (pgmPoolAccessHandlerSimple for instructions without
 *     a REP prefix, pgmPoolAccessHandlerSTOSD for small REP STOS fills),
 *   - stop write-monitoring the page temporarily and track it as dirty
 *     (only compiled for IN_RING0 with PGMPOOL_WITH_OPTIMIZED_DIRTY_PT), or
 *   - flush the pool page through pgmPoolAccessHandlerFlush.
 * The PGM lock is released again on every return path.
 *
 * NOTE(review): pRegFrame is documented as possibly NULL, but the body
 * dereferences it (e.g. pRegFrame->rip) without a check.
 *
1039 * @returns VBox status code (appropriate for GC return).
1040 * @param pVM Pointer to the VM.
1041 * @param uErrorCode CPU Error code.
 *                   Not used by this function (NOREF).
1042 * @param pRegFrame Trap register frame.
1043 * NULL on DMA and other non CPU access.
1044 * @param pvFault The fault address (cr2).
1045 * @param GCPhysFault The GC physical address corresponding to pvFault.
1046 * @param pvUser User argument.
 *               Cast to PPGMPOOLPAGE; asserted below to be the head of its
 *               monitoring chain.
1047 */
1048DECLEXPORT(int) pgmPoolAccessHandler(PVM pVM, RTGCUINT uErrorCode, PCPUMCTXCORE pRegFrame, RTGCPTR pvFault,
1049 RTGCPHYS GCPhysFault, void *pvUser)
1050{
1051 STAM_PROFILE_START(&pVM->pgm.s.CTX_SUFF(pPool)->CTX_SUFF_Z(StatMonitor), a);
1052 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
1053 PPGMPOOLPAGE pPage = (PPGMPOOLPAGE)pvUser;
1054 PVMCPU pVCpu = VMMGetCpu(pVM);
1055 unsigned cMaxModifications;
1056 bool fForcedFlush = false;
1057 NOREF(uErrorCode);
1058
1059 LogFlow(("pgmPoolAccessHandler: pvFault=%RGv pPage=%p:{.idx=%d} GCPhysFault=%RGp\n", pvFault, pPage, pPage->idx, GCPhysFault));
1060
1061 pgmLock(pVM);
1062 if (PHYS_PAGE_ADDRESS(GCPhysFault) != PHYS_PAGE_ADDRESS(pPage->GCPhys))
1063 {
1064 /* Pool page changed while we were waiting for the lock; ignore. */
1065 Log(("CPU%d: pgmPoolAccessHandler pgm pool page for %RGp changed (to %RGp) while waiting!\n", pVCpu->idCpu, PHYS_PAGE_ADDRESS(GCPhysFault), PHYS_PAGE_ADDRESS(pPage->GCPhys)));
1066 STAM_PROFILE_STOP_EX(&pVM->pgm.s.CTX_SUFF(pPool)->CTX_SUFF_Z(StatMonitor), &pPool->CTX_MID_Z(StatMonitor,Handled), a);
1067 pgmUnlock(pVM);
1068 return VINF_SUCCESS;
1069 }
1070#ifdef PGMPOOL_WITH_OPTIMIZED_DIRTY_PT
1071 if (pPage->fDirty)
1072 {
 /* NOTE(review): this return path does not stop the StatMonitor profile sample started at the top. */
1073 Assert(VMCPU_FF_ISSET(pVCpu, VMCPU_FF_TLB_FLUSH));
1074 pgmUnlock(pVM);
1075 return VINF_SUCCESS; /* SMP guest case where we were blocking on the pgm lock while the same page was being marked dirty. */
1076 }
1077#endif
1078
1079#if 0 /* test code defined(VBOX_STRICT) && defined(PGMPOOL_WITH_OPTIMIZED_DIRTY_PT) */
1080 if (pPage->enmKind == PGMPOOLKIND_PAE_PT_FOR_PAE_PT)
1081 {
1082 void *pvShw = PGMPOOL_PAGE_2_PTR(pPool->CTX_SUFF(pVM), pPage);
1083 void *pvGst;
1084 int rc = PGM_GCPHYS_2_PTR(pPool->CTX_SUFF(pVM), pPage->GCPhys, &pvGst); AssertReleaseRC(rc);
1085 pgmPoolTrackCheckPTPaePae(pPool, pPage, (PPGMSHWPTPAE)pvShw, (PCX86PTPAE)pvGst);
1086 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pvGst);
1087 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pvShw);
1088 }
1089#endif
1090
1091 /*
1092 * Disassemble the faulting instruction.
1093 */
1094 PDISCPUSTATE pDis = &pVCpu->pgm.s.DisState;
1095 int rc = EMInterpretDisasCurrent(pVM, pVCpu, pDis, NULL);
1096 if (RT_UNLIKELY(rc != VINF_SUCCESS))
1097 {
 /* Hand the disassembly status back to the caller unchanged. */
 /* NOTE(review): this return path does not stop the StatMonitor profile sample either. */
1098 AssertMsg(rc == VERR_PAGE_NOT_PRESENT || rc == VERR_PAGE_TABLE_NOT_PRESENT, ("Unexpected rc %d\n", rc));
1099 pgmUnlock(pVM);
1100 return rc;
1101 }
1102
1103 Assert(pPage->enmKind != PGMPOOLKIND_FREE);
1104
1105 /*
1106 * We should ALWAYS have the list head as user parameter. This
1107 * is because we use that page to record the changes.
1108 */
1109 Assert(pPage->iMonitoredPrev == NIL_PGMPOOL_IDX);
1110
1111#ifdef IN_RING0
1112 /* Maximum nr of modifications depends on the page type. */
1113 if ( pPage->enmKind == PGMPOOLKIND_PAE_PT_FOR_PAE_PT
1114 || pPage->enmKind == PGMPOOLKIND_PAE_PT_FOR_32BIT_PT)
1115 cMaxModifications = 4;
1116 else
1117 cMaxModifications = 24;
1118#else
 /* Builds without IN_RING0 use a single limit for every page kind. */
1119 cMaxModifications = 48;
1120#endif
1121
1122 /*
1123 * Incremental page table updates should weigh more than random ones.
1124 * (Only applies when started from offset 0)
 *
 * The test below matches when the RIP recorded for this page lies within
 * 0x40 bytes of the current RIP, the fault address is exactly one operand
 * (Param1.cb) beyond the recorded fault address, and this is the very next
 * invocation of this handler on this VCPU (counter == recorded value + 1).
 * On a match the modification count is doubled.
1125 */
1126 pVCpu->pgm.s.cPoolAccessHandler++;
1127 if ( pPage->GCPtrLastAccessHandlerRip >= pRegFrame->rip - 0x40 /* observed loops in Windows 7 x64 */
1128 && pPage->GCPtrLastAccessHandlerRip < pRegFrame->rip + 0x40
1129 && pvFault == (pPage->GCPtrLastAccessHandlerFault + pDis->Param1.cb)
1130 && pVCpu->pgm.s.cPoolAccessHandler == pPage->cLastAccessHandler + 1)
1131 {
1132 Log(("Possible page reuse cMods=%d -> %d (locked=%d type=%s)\n", pPage->cModifications, pPage->cModifications * 2, pgmPoolIsPageLocked(pPage), pgmPoolPoolKindToStr(pPage->enmKind)));
1133 Assert(pPage->cModifications < 32000);
1134 pPage->cModifications = pPage->cModifications * 2;
1135 pPage->GCPtrLastAccessHandlerFault = pvFault;
1136 pPage->cLastAccessHandler = pVCpu->pgm.s.cPoolAccessHandler;
1137 if (pPage->cModifications >= cMaxModifications)
1138 {
 /* fForcedFlush keeps the dirty-page path further down from being taken. */
1139 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,FlushReinit));
1140 fForcedFlush = true;
1141 }
1142 }
1143
1144 if (pPage->cModifications >= cMaxModifications)
1145 Log(("Mod overflow %RGv cMods=%d (locked=%d type=%s)\n", pvFault, pPage->cModifications, pgmPoolIsPageLocked(pPage), pgmPoolPoolKindToStr(pPage->enmKind)));
1146
1147 /*
1148 * Check if it's worth dealing with.
 *
 * Emulation is attempted when the page is below the modification limit or
 * is locked, and neither the reuse nor the forking heuristic triggers.
 * Note that fReused is assigned inside the condition, and only when the
 * first sub-condition holds.
1149 */
1150 bool fReused = false;
1151 bool fNotReusedNotForking = false;
1152 if ( ( pPage->cModifications < cMaxModifications /** @todo #define */ /** @todo need to check that it's not mapping EIP. */ /** @todo adjust this! */
1153 || pgmPoolIsPageLocked(pPage)
1154 )
1155 && !(fReused = pgmPoolMonitorIsReused(pVM, pVCpu, pRegFrame, pDis, pvFault))
1156 && !pgmPoolMonitorIsForking(pPool, pDis, GCPhysFault & PAGE_OFFSET_MASK))
1157 {
1158 /*
1159 * Simple instructions, no REP prefix.
1160 */
1161 if (!(pDis->fPrefix & (DISPREFIX_REP | DISPREFIX_REPNE)))
1162 {
1163 rc = pgmPoolAccessHandlerSimple(pVM, pVCpu, pPool, pPage, pDis, pRegFrame, GCPhysFault, pvFault, &fReused);
 /* fReused is false when entering this branch, so this only fires if the call above set it. */
1164 if (fReused)
1165 goto flushPage;
1166
1167 /* A mov instruction to change the first page table entry will be remembered so we can detect
1168 * full page table changes early on. This will reduce the amount of unnecessary traps we'll take.
1169 */
1170 if ( rc == VINF_SUCCESS
1171 && !pPage->cLocked /* only applies to unlocked pages as we can't free locked ones (e.g. cr3 root). */
1172 && pDis->pCurInstr->uOpcode == OP_MOV
1173 && (pvFault & PAGE_OFFSET_MASK) == 0)
1174 {
1175 pPage->GCPtrLastAccessHandlerFault = pvFault;
1176 pPage->cLastAccessHandler = pVCpu->pgm.s.cPoolAccessHandler;
1177 pPage->GCPtrLastAccessHandlerRip = pRegFrame->rip;
1178 /* Make sure we don't kick out a page too quickly. */
1179 if (pPage->cModifications > 8)
1180 pPage->cModifications = 2;
1181 }
1182 else if (pPage->GCPtrLastAccessHandlerFault == pvFault)
1183 {
1184 /* ignore the 2nd write to this page table entry. */
1185 pPage->cLastAccessHandler = pVCpu->pgm.s.cPoolAccessHandler;
1186 }
1187 else
1188 {
 /* Anything else resets the recorded fault address and RIP. */
1189 pPage->GCPtrLastAccessHandlerFault = NIL_RTGCPTR;
1190 pPage->GCPtrLastAccessHandlerRip = 0;
1191 }
1192
1193 STAM_PROFILE_STOP_EX(&pVM->pgm.s.CTX_SUFF(pPool)->CTX_SUFF_Z(StatMonitor), &pPool->CTX_MID_Z(StatMonitor,Handled), a);
1194 pgmUnlock(pVM);
1195 return rc;
1196 }
1197
1198 /*
1199 * Windows is frequently doing small memset() operations (netio test 4k+).
1200 * We have to deal with these or we'll kill the cache and performance.
 *
 * Accepted here: forward (DF clear) STOSWD with operand and address mode
 * equal to the CPU mode, a count of at most 0x20 elements, a fill that
 * stays within the faulting page, a fault address aligned to the element
 * size, and a fill value of 0 or 0x80.
1201 */
1202 if ( pDis->pCurInstr->uOpcode == OP_STOSWD
1203 && !pRegFrame->eflags.Bits.u1DF
1204 && pDis->uOpMode == pDis->uCpuMode
1205 && pDis->uAddrMode == pDis->uCpuMode)
1206 {
1207 bool fValidStosd = false;
1208
1209 if ( pDis->uCpuMode == DISCPUMODE_32BIT
1210 && pDis->fPrefix == DISPREFIX_REP
1211 && pRegFrame->ecx <= 0x20
1212 && pRegFrame->ecx * 4 <= PAGE_SIZE - ((uintptr_t)pvFault & PAGE_OFFSET_MASK)
1213 && !((uintptr_t)pvFault & 3)
1214 && (pRegFrame->eax == 0 || pRegFrame->eax == 0x80) /* the two values observed. */
1215 )
1216 {
1217 fValidStosd = true;
1218 pRegFrame->rcx &= 0xffffffff; /* paranoia */
1219 }
1220 else
1221 if ( pDis->uCpuMode == DISCPUMODE_64BIT
1222 && pDis->fPrefix == (DISPREFIX_REP | DISPREFIX_REX)
1223 && pRegFrame->rcx <= 0x20
1224 && pRegFrame->rcx * 8 <= PAGE_SIZE - ((uintptr_t)pvFault & PAGE_OFFSET_MASK)
1225 && !((uintptr_t)pvFault & 7)
1226 && (pRegFrame->rax == 0 || pRegFrame->rax == 0x80) /* the two values observed. */
1227 )
1228 {
1229 fValidStosd = true;
1230 }
1231
1232 if (fValidStosd)
1233 {
1234 rc = pgmPoolAccessHandlerSTOSD(pVM, pPool, pPage, pDis, pRegFrame, GCPhysFault, pvFault);
1235 STAM_PROFILE_STOP_EX(&pVM->pgm.s.CTX_SUFF(pPool)->CTX_SUFF_Z(StatMonitor), &pPool->CTX_MID_Z(StatMonitor,RepStosd), a);
1236 pgmUnlock(pVM);
1237 return rc;
1238 }
1239 }
1240
1241 /* REP prefix, don't bother. */
1242 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,RepPrefix));
1243 Log4(("pgmPoolAccessHandler: eax=%#x ecx=%#x edi=%#x esi=%#x rip=%RGv opcode=%d prefix=%#x\n",
1244 pRegFrame->eax, pRegFrame->ecx, pRegFrame->edi, pRegFrame->esi, (RTGCPTR)pRegFrame->rip, pDis->pCurInstr->uOpcode, pDis->fPrefix));
 /* Both heuristics were negative to get here; remember that so the dirty-page check below can skip them. */
1245 fNotReusedNotForking = true;
1246 }
1247
1248#if defined(PGMPOOL_WITH_OPTIMIZED_DIRTY_PT) && defined(IN_RING0)
1249 /* E.g. Windows 7 x64 initializes page tables and touches some pages in the table during the process. This
1250 * leads to pgm pool thrashing and an excessive amount of write faults due to page monitoring.
 *
 * So for PAE shadow page tables that hit the modification limit without a
 * forced flush, and which look neither reused nor forked, the page is added
 * to the dirty page list and write access is temporarily re-enabled instead
 * of flushing it.
1251 */
1252 if ( pPage->cModifications >= cMaxModifications
1253 && !fForcedFlush
1254 && (pPage->enmKind == PGMPOOLKIND_PAE_PT_FOR_PAE_PT || pPage->enmKind == PGMPOOLKIND_PAE_PT_FOR_32BIT_PT)
1255 && ( fNotReusedNotForking
1256 || ( !pgmPoolMonitorIsReused(pVM, pVCpu, pRegFrame, pDis, pvFault)
1257 && !pgmPoolMonitorIsForking(pPool, pDis, GCPhysFault & PAGE_OFFSET_MASK))
1258 )
1259 )
1260 {
1261 Assert(!pgmPoolIsPageLocked(pPage));
1262 Assert(pPage->fDirty == false);
1263
1264 /* Flush any monitored duplicates as we will disable write protection. */
1265 if ( pPage->iMonitoredNext != NIL_PGMPOOL_IDX
1266 || pPage->iMonitoredPrev != NIL_PGMPOOL_IDX)
1267 {
1268 PPGMPOOLPAGE pPageHead = pPage;
1269
1270 /* Find the monitor head. */
1271 while (pPageHead->iMonitoredPrev != NIL_PGMPOOL_IDX)
1272 pPageHead = &pPool->aPages[pPageHead->iMonitoredPrev];
1273
 /* Walk the chain from the head and flush every page except pPage; the loop ends via the break at the chain tail. */
1274 while (pPageHead)
1275 {
 /* Fetch the successor before the page is possibly flushed. */
1276 unsigned idxNext = pPageHead->iMonitoredNext;
1277
1278 if (pPageHead != pPage)
1279 {
1280 STAM_COUNTER_INC(&pPool->StatDirtyPageDupFlush);
1281 Log(("Flush duplicate page idx=%d GCPhys=%RGp type=%s\n", pPageHead->idx, pPageHead->GCPhys, pgmPoolPoolKindToStr(pPageHead->enmKind)));
1282 int rc2 = pgmPoolFlushPage(pPool, pPageHead);
1283 AssertRC(rc2);
1284 }
1285
1286 if (idxNext == NIL_PGMPOOL_IDX)
1287 break;
1288
1289 pPageHead = &pPool->aPages[idxNext];
1290 }
1291 }
1292
1293 /* The flushing above might fail for locked pages, so double check. */
1294 if ( pPage->iMonitoredNext == NIL_PGMPOOL_IDX
1295 && pPage->iMonitoredPrev == NIL_PGMPOOL_IDX)
1296 {
1297 pgmPoolAddDirtyPage(pVM, pPool, pPage);
1298
1299 /* Temporarily allow write access to the page table again. */
1300 rc = PGMHandlerPhysicalPageTempOff(pVM, pPage->GCPhys & PAGE_BASE_GC_MASK, pPage->GCPhys & PAGE_BASE_GC_MASK);
1301 if (rc == VINF_SUCCESS)
1302 {
1303 rc = PGMShwMakePageWritable(pVCpu, pvFault, PGM_MK_PG_IS_WRITE_FAULT);
1304 AssertMsg(rc == VINF_SUCCESS
1305 /* In the SMP case the page table might be removed while we wait for the PGM lock in the trap handler. */
1306 || rc == VERR_PAGE_TABLE_NOT_PRESENT
1307 || rc == VERR_PAGE_NOT_PRESENT,
1308 ("PGMShwModifyPage -> GCPtr=%RGv rc=%d\n", pvFault, rc));
1309# ifdef VBOX_STRICT
1310 pPage->GCPtrDirtyFault = pvFault;
1311# endif
1312
1313 STAM_PROFILE_STOP(&pVM->pgm.s.CTX_SUFF(pPool)->CTX_SUFF_Z(StatMonitor), a);
1314 pgmUnlock(pVM);
1315 return rc;
1316 }
 /* NOTE(review): if PGMHandlerPhysicalPageTempOff fails, execution falls through to the flush below with the page already added as dirty. */
1317 }
1318 }
1319#endif /* PGMPOOL_WITH_OPTIMIZED_DIRTY_PT */
1320
1321 STAM_COUNTER_INC(&pPool->CTX_MID_Z(StatMonitor,FlushModOverflow));
1322flushPage:
1323 /*
1324 * Not worth it, so flush it.
1325 *
1326 * If we considered it to be reused, don't go back to ring-3
1327 * to emulate failed instructions since we usually cannot
1328 * interpret them. This may be a bit risky, in which case
1329 * the reuse detection must be fixed.
1330 */
1331 rc = pgmPoolAccessHandlerFlush(pVM, pVCpu, pPool, pPage, pDis, pRegFrame, GCPhysFault, pvFault);
1332 if ( rc == VINF_EM_RAW_EMULATE_INSTR
1333 && fReused)
1334 {
1335 /* Make sure that the current instruction still has shadow page backing, otherwise we'll end up in a loop. */
1336 if (PGMShwGetPage(pVCpu, pRegFrame->rip, NULL, NULL) == VINF_SUCCESS)
1337 rc = VINF_SUCCESS; /* safe to restart the instruction. */
1338 }
1339 STAM_PROFILE_STOP_EX(&pVM->pgm.s.CTX_SUFF(pPool)->CTX_SUFF_Z(StatMonitor), &pPool->CTX_MID_Z(StatMonitor,FlushPage), a);
1340 pgmUnlock(pVM);
1341 return rc;
1342}
1343
1344# endif /* !IN_RING3 */
1345
1346# ifdef PGMPOOL_WITH_OPTIMIZED_DIRTY_PT
1347
1348# if defined(VBOX_STRICT) && !defined(IN_RING3)
1349
1350/**
1351 * Check references to guest physical memory in a PAE / PAE page table.
1352 *
1353 * @param pPool The pool.
1354 * @param pPage The page.
1355 * @param pShwPT The shadow page table (mapping of the page).
1356 * @param pGstPT The guest page table.
1357 */
1358static void pgmPoolTrackCheckPTPaePae(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PPGMSHWPTPAE pShwPT, PCX86PTPAE pGstPT)
1359{
1360 unsigned cErrors = 0;
1361 int LastRc = -1; /* initialized to shut up gcc */
1362 unsigned LastPTE = ~0U; /* initialized to shut up gcc */
1363 RTHCPHYS LastHCPhys = NIL_RTHCPHYS; /* initialized to shut up gcc */
1364 PVM pVM = pPool->CTX_SUFF(pVM);
1365
1366#ifdef VBOX_STRICT
1367 for (unsigned i = 0; i < RT_MIN(RT_ELEMENTS(pShwPT->a), pPage->iFirstPresent); i++)
1368 AssertMsg(!PGMSHWPTEPAE_IS_P(pShwPT->a[i]), ("Unexpected PTE: idx=%d %RX64 (first=%d)\n", i, PGMSHWPTEPAE_GET_LOG(pShwPT->a[i]), pPage->iFirstPresent));
1369#endif
1370 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pShwPT->a); i++)
1371 {
1372 if (PGMSHWPTEPAE_IS_P(pShwPT->a[i]))
1373 {
1374 RTHCPHYS HCPhys = NIL_RTHCPHYS;
1375 int rc = PGMPhysGCPhys2HCPhys(pVM, pGstPT->a[i].u & X86_PTE_PAE_PG_MASK, &HCPhys);
1376 if ( rc != VINF_SUCCESS
1377 || PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]) != HCPhys)
1378 {
1379 Log(("rc=%d idx=%d guest %RX64 shw=%RX64 vs %RHp\n", rc, i, pGstPT->a[i].u, PGMSHWPTEPAE_GET_LOG(pShwPT->a[i]), HCPhys));
1380 LastPTE = i;
1381 LastRc = rc;
1382 LastHCPhys = HCPhys;
1383 cErrors++;
1384
1385 RTHCPHYS HCPhysPT = NIL_RTHCPHYS;
1386 rc = PGMPhysGCPhys2HCPhys(pVM, pPage->GCPhys, &HCPhysPT);
1387 AssertRC(rc);
1388
1389 for (unsigned iPage = 0; iPage < pPool->cCurPages; iPage++)
1390 {
1391 PPGMPOOLPAGE pTempPage = &pPool->aPages[iPage];
1392
1393 if (pTempPage->enmKind == PGMPOOLKIND_PAE_PT_FOR_PAE_PT)
1394 {
1395 PPGMSHWPTPAE pShwPT2 = (PPGMSHWPTPAE)PGMPOOL_PAGE_2_PTR(pVM, pTempPage);
1396
1397 for (unsigned j = 0; j < RT_ELEMENTS(pShwPT->a); j++)
1398 {
1399 if ( PGMSHWPTEPAE_IS_P_RW(pShwPT2->a[j])
1400 && PGMSHWPTEPAE_GET_HCPHYS(pShwPT2->a[j]) == HCPhysPT)
1401 {
1402 Log(("GCPhys=%RGp idx=%d %RX64 vs %RX64\n", pTempPage->GCPhys, j, PGMSHWPTEPAE_GET_LOG(pShwPT->a[j]), PGMSHWPTEPAE_GET_LOG(pShwPT2->a[j])));
1403 }
1404 }
1405
1406 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pShwPT2);
1407 }
1408 }
1409 }
1410 }
1411 }
1412 AssertMsg(!cErrors, ("cErrors=%d: last rc=%d idx=%d guest %RX64 shw=%RX64 vs %RHp\n", cErrors, LastRc, LastPTE, pGstPT->a[LastPTE].u, PGMSHWPTEPAE_GET_LOG(pShwPT->a[LastPTE]), LastHCPhys));
1413}
1414
1415
1416/**
1417 * Check references to guest physical memory in a PAE / 32-bit page table.
1418 *
1419 * @param pPool The pool.
1420 * @param pPage The page.
1421 * @param pShwPT The shadow page table (mapping of the page).
1422 * @param pGstPT The guest page table.
1423 */
1424static void pgmPoolTrackCheckPTPae32Bit(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PPGMSHWPTPAE pShwPT, PCX86PT pGstPT)
1425{
1426 unsigned cErrors = 0;
1427 int LastRc = -1; /* initialized to shut up gcc */
1428 unsigned LastPTE = ~0U; /* initialized to shut up gcc */
1429 RTHCPHYS LastHCPhys = NIL_RTHCPHYS; /* initialized to shut up gcc */
1430 PVM pVM = pPool->CTX_SUFF(pVM);
1431
1432#ifdef VBOX_STRICT
1433 for (unsigned i = 0; i < RT_MIN(RT_ELEMENTS(pShwPT->a), pPage->iFirstPresent); i++)
1434 AssertMsg(!PGMSHWPTEPAE_IS_P(pShwPT->a[i]), ("Unexpected PTE: idx=%d %RX64 (first=%d)\n", i, PGMSHWPTEPAE_GET_LOG(pShwPT->a[i]), pPage->iFirstPresent));
1435#endif
1436 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pShwPT->a); i++)
1437 {
1438 if (PGMSHWPTEPAE_IS_P(pShwPT->a[i]))
1439 {
1440 RTHCPHYS HCPhys = NIL_RTHCPHYS;
1441 int rc = PGMPhysGCPhys2HCPhys(pVM, pGstPT->a[i].u & X86_PTE_PG_MASK, &HCPhys);
1442 if ( rc != VINF_SUCCESS
1443 || PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]) != HCPhys)
1444 {
1445 Log(("rc=%d idx=%d guest %x shw=%RX64 vs %RHp\n", rc, i, pGstPT->a[i].u, PGMSHWPTEPAE_GET_LOG(pShwPT->a[i]), HCPhys));
1446 LastPTE = i;
1447 LastRc = rc;
1448 LastHCPhys = HCPhys;
1449 cErrors++;
1450
1451 RTHCPHYS HCPhysPT = NIL_RTHCPHYS;
1452 rc = PGMPhysGCPhys2HCPhys(pVM, pPage->GCPhys, &HCPhysPT);
1453 AssertRC(rc);
1454
1455 for (unsigned iPage = 0; iPage < pPool->cCurPages; iPage++)
1456 {
1457 PPGMPOOLPAGE pTempPage = &pPool->aPages[iPage];
1458
1459 if (pTempPage->enmKind == PGMPOOLKIND_PAE_PT_FOR_32BIT_PT)
1460 {
1461 PPGMSHWPTPAE pShwPT2 = (PPGMSHWPTPAE)PGMPOOL_PAGE_2_PTR(pVM, pTempPage);
1462
1463 for (unsigned j = 0; j < RT_ELEMENTS(pShwPT->a); j++)
1464 {
1465 if ( PGMSHWPTEPAE_IS_P_RW(pShwPT2->a[j])
1466 && PGMSHWPTEPAE_GET_HCPHYS(pShwPT2->a[j]) == HCPhysPT)
1467 {
1468 Log(("GCPhys=%RGp idx=%d %RX64 vs %RX64\n", pTempPage->GCPhys, j, PGMSHWPTEPAE_GET_LOG(pShwPT->a[j]), PGMSHWPTEPAE_GET_LOG(pShwPT2->a[j])));
1469 }
1470 }
1471
1472 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pShwPT2);
1473 }
1474 }
1475 }
1476 }
1477 }
1478 AssertMsg(!cErrors, ("cErrors=%d: last rc=%d idx=%d guest %x shw=%RX64 vs %RHp\n", cErrors, LastRc, LastPTE, pGstPT->a[LastPTE].u, PGMSHWPTEPAE_GET_LOG(pShwPT->a[LastPTE]), LastHCPhys));
1479}
1480
1481# endif /* VBOX_STRICT && !IN_RING3 */
1482
1483/**
1484 * Clear references to guest physical memory in a PAE / PAE page table.
1485 *
1486 * @returns nr of changed PTEs
1487 * @param pPool The pool.
1488 * @param pPage The page.
1489 * @param pShwPT The shadow page table (mapping of the page).
1490 * @param pGstPT The guest page table.
1491 * @param pOldGstPT The old cached guest page table.
1492 * @param fAllowRemoval Bail out as soon as we encounter an invalid PTE
1493 * @param pfFlush Flush reused page table (out)
1494 */
1495DECLINLINE(unsigned) pgmPoolTrackFlushPTPaePae(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PPGMSHWPTPAE pShwPT, PCX86PTPAE pGstPT,
1496 PCX86PTPAE pOldGstPT, bool fAllowRemoval, bool *pfFlush)
1497{
1498 unsigned cChanged = 0;
1499
1500#ifdef VBOX_STRICT
1501 for (unsigned i = 0; i < RT_MIN(RT_ELEMENTS(pShwPT->a), pPage->iFirstPresent); i++)
1502 AssertMsg(!PGMSHWPTEPAE_IS_P(pShwPT->a[i]), ("Unexpected PTE: idx=%d %RX64 (first=%d)\n", i, PGMSHWPTEPAE_GET_LOG(pShwPT->a[i]), pPage->iFirstPresent));
1503#endif
1504 *pfFlush = false;
1505
1506 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pShwPT->a); i++)
1507 {
1508 /* Check the new value written by the guest. If present and with a bogus physical address, then
1509 * it's fairly safe to assume the guest is reusing the PT.
1510 */
1511 if ( fAllowRemoval
1512 && pGstPT->a[i].n.u1Present)
1513 {
1514 if (!PGMPhysIsGCPhysValid(pPool->CTX_SUFF(pVM), pGstPT->a[i].u & X86_PTE_PAE_PG_MASK))
1515 {
1516 *pfFlush = true;
1517 return ++cChanged;
1518 }
1519 }
1520 if (PGMSHWPTEPAE_IS_P(pShwPT->a[i]))
1521 {
1522 /* If the old cached PTE is identical, then there's no need to flush the shadow copy. */
1523 if ((pGstPT->a[i].u & X86_PTE_PAE_PG_MASK) == (pOldGstPT->a[i].u & X86_PTE_PAE_PG_MASK))
1524 {
1525#ifdef VBOX_STRICT
1526 RTHCPHYS HCPhys = NIL_RTGCPHYS;
1527 int rc = PGMPhysGCPhys2HCPhys(pPool->CTX_SUFF(pVM), pGstPT->a[i].u & X86_PTE_PAE_PG_MASK, &HCPhys);
1528 AssertMsg(rc == VINF_SUCCESS && PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]) == HCPhys, ("rc=%d guest %RX64 old %RX64 shw=%RX64 vs %RHp\n", rc, pGstPT->a[i].u, pOldGstPT->a[i].u, PGMSHWPTEPAE_GET_LOG(pShwPT->a[i]), HCPhys));
1529#endif
1530 uint64_t uHostAttr = PGMSHWPTEPAE_GET_U(pShwPT->a[i]) & (X86_PTE_P | X86_PTE_US | X86_PTE_A | X86_PTE_D | X86_PTE_G | X86_PTE_PAE_NX);
1531 bool fHostRW = !!(PGMSHWPTEPAE_GET_U(pShwPT->a[i]) & X86_PTE_RW);
1532 uint64_t uGuestAttr = pGstPT->a[i].u & (X86_PTE_P | X86_PTE_US | X86_PTE_A | X86_PTE_D | X86_PTE_G | X86_PTE_PAE_NX);
1533 bool fGuestRW = !!(pGstPT->a[i].u & X86_PTE_RW);
1534
1535 if ( uHostAttr == uGuestAttr
1536 && fHostRW <= fGuestRW)
1537 continue;
1538 }
1539 cChanged++;
1540 /* Something was changed, so flush it. */
1541 Log4(("pgmPoolTrackDerefPTPaePae: i=%d pte=%RX64 hint=%RX64\n",
1542 i, PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]), pOldGstPT->a[i].u & X86_PTE_PAE_PG_MASK));
1543 pgmPoolTracDerefGCPhysHint(pPool, pPage, PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]), pOldGstPT->a[i].u & X86_PTE_PAE_PG_MASK, i);
1544 PGMSHWPTEPAE_ATOMIC_SET(pShwPT->a[i], 0);
1545 }
1546 }
1547 return cChanged;
1548}
1549
1550
1551/**
1552 * Clear references to guest physical memory in a PAE / PAE page table.
1553 *
1554 * @returns nr of changed PTEs
1555 * @param pPool The pool.
1556 * @param pPage The page.
1557 * @param pShwPT The shadow page table (mapping of the page).
1558 * @param pGstPT The guest page table.
1559 * @param pOldGstPT The old cached guest page table.
1560 * @param fAllowRemoval Bail out as soon as we encounter an invalid PTE
1561 * @param pfFlush Flush reused page table (out)
1562 */
1563DECLINLINE(unsigned) pgmPoolTrackFlushPTPae32Bit(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PPGMSHWPTPAE pShwPT, PCX86PT pGstPT,
1564 PCX86PT pOldGstPT, bool fAllowRemoval, bool *pfFlush)
1565{
1566 unsigned cChanged = 0;
1567
1568#ifdef VBOX_STRICT
1569 for (unsigned i = 0; i < RT_MIN(RT_ELEMENTS(pShwPT->a), pPage->iFirstPresent); i++)
1570 AssertMsg(!PGMSHWPTEPAE_IS_P(pShwPT->a[i]), ("Unexpected PTE: idx=%d %RX64 (first=%d)\n", i, PGMSHWPTEPAE_GET_LOG(pShwPT->a[i]), pPage->iFirstPresent));
1571#endif
1572 *pfFlush = false;
1573
1574 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pShwPT->a); i++)
1575 {
1576 /* Check the new value written by the guest. If present and with a bogus physical address, then
1577 * it's fairly safe to assume the guest is reusing the PT.
1578 */
1579 if ( fAllowRemoval
1580 && pGstPT->a[i].n.u1Present)
1581 {
1582 if (!PGMPhysIsGCPhysValid(pPool->CTX_SUFF(pVM), pGstPT->a[i].u & X86_PTE_PG_MASK))
1583 {
1584 *pfFlush = true;
1585 return ++cChanged;
1586 }
1587 }
1588 if (PGMSHWPTEPAE_IS_P(pShwPT->a[i]))
1589 {
1590 /* If the old cached PTE is identical, then there's no need to flush the shadow copy. */
1591 if ((pGstPT->a[i].u & X86_PTE_PG_MASK) == (pOldGstPT->a[i].u & X86_PTE_PG_MASK))
1592 {
1593#ifdef VBOX_STRICT
1594 RTHCPHYS HCPhys = NIL_RTGCPHYS;
1595 int rc = PGMPhysGCPhys2HCPhys(pPool->CTX_SUFF(pVM), pGstPT->a[i].u & X86_PTE_PG_MASK, &HCPhys);
1596 AssertMsg(rc == VINF_SUCCESS && PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]) == HCPhys, ("rc=%d guest %x old %x shw=%RX64 vs %RHp\n", rc, pGstPT->a[i].u, pOldGstPT->a[i].u, PGMSHWPTEPAE_GET_LOG(pShwPT->a[i]), HCPhys));
1597#endif
1598 uint64_t uHostAttr = PGMSHWPTEPAE_GET_U(pShwPT->a[i]) & (X86_PTE_P | X86_PTE_US | X86_PTE_A | X86_PTE_D | X86_PTE_G);
1599 bool fHostRW = !!(PGMSHWPTEPAE_GET_U(pShwPT->a[i]) & X86_PTE_RW);
1600 uint64_t uGuestAttr = pGstPT->a[i].u & (X86_PTE_P | X86_PTE_US | X86_PTE_A | X86_PTE_D | X86_PTE_G);
1601 bool fGuestRW = !!(pGstPT->a[i].u & X86_PTE_RW);
1602
1603 if ( uHostAttr == uGuestAttr
1604 && fHostRW <= fGuestRW)
1605 continue;
1606 }
1607 cChanged++;
1608 /* Something was changed, so flush it. */
1609 Log4(("pgmPoolTrackDerefPTPaePae: i=%d pte=%RX64 hint=%x\n",
1610 i, PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]), pOldGstPT->a[i].u & X86_PTE_PG_MASK));
1611 pgmPoolTracDerefGCPhysHint(pPool, pPage, PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]), pOldGstPT->a[i].u & X86_PTE_PG_MASK, i);
1612 PGMSHWPTEPAE_ATOMIC_SET(pShwPT->a[i], 0);
1613 }
1614 }
1615 return cChanged;
1616}
1617
1618
1619/**
1620 * Flush a dirty page
1621 *
1622 * @param pVM Pointer to the VM.
1623 * @param pPool The pool.
1624 * @param idxSlot Dirty array slot index
1625 * @param fAllowRemoval Allow a reused page table to be removed
1626 */
1627static void pgmPoolFlushDirtyPage(PVM pVM, PPGMPOOL pPool, unsigned idxSlot, bool fAllowRemoval = false)
1628{
1629 PPGMPOOLPAGE pPage;
1630 unsigned idxPage;
1631
1632 Assert(idxSlot < RT_ELEMENTS(pPool->aDirtyPages));
1633 if (pPool->aDirtyPages[idxSlot].uIdx == NIL_PGMPOOL_IDX)
1634 return;
1635
1636 idxPage = pPool->aDirtyPages[idxSlot].uIdx;
1637 AssertRelease(idxPage != NIL_PGMPOOL_IDX);
1638 pPage = &pPool->aPages[idxPage];
1639 Assert(pPage->idx == idxPage);
1640 Assert(pPage->iMonitoredNext == NIL_PGMPOOL_IDX && pPage->iMonitoredPrev == NIL_PGMPOOL_IDX);
1641
1642 AssertMsg(pPage->fDirty, ("Page %RGp (slot=%d) not marked dirty!", pPage->GCPhys, idxSlot));
1643 Log(("Flush dirty page %RGp cMods=%d\n", pPage->GCPhys, pPage->cModifications));
1644
1645#if defined(VBOX_WITH_2X_4GB_ADDR_SPACE_IN_R0) || defined(IN_RC)
1646 PVMCPU pVCpu = VMMGetCpu(pVM);
1647 uint32_t iPrevSubset = PGMRZDynMapPushAutoSubset(pVCpu);
1648#endif
1649
1650 /* First write protect the page again to catch all write accesses. (before checking for changes -> SMP) */
1651 int rc = PGMHandlerPhysicalReset(pVM, pPage->GCPhys & PAGE_BASE_GC_MASK);
1652 Assert(rc == VINF_SUCCESS);
1653 pPage->fDirty = false;
1654
1655#ifdef VBOX_STRICT
1656 uint64_t fFlags = 0;
1657 RTHCPHYS HCPhys;
1658 rc = PGMShwGetPage(VMMGetCpu(pVM), pPage->GCPtrDirtyFault, &fFlags, &HCPhys);
1659 AssertMsg( ( rc == VINF_SUCCESS
1660 && (!(fFlags & X86_PTE_RW) || HCPhys != pPage->Core.Key))
1661 /* In the SMP case the page table might be removed while we wait for the PGM lock in the trap handler. */
1662 || rc == VERR_PAGE_TABLE_NOT_PRESENT
1663 || rc == VERR_PAGE_NOT_PRESENT,
1664 ("PGMShwGetPage -> GCPtr=%RGv rc=%d flags=%RX64\n", pPage->GCPtrDirtyFault, rc, fFlags));
1665#endif
1666
1667 /* Flush those PTEs that have changed. */
1668 STAM_PROFILE_START(&pPool->StatTrackDeref,a);
1669 void *pvShw = PGMPOOL_PAGE_2_PTR(pVM, pPage);
1670 void *pvGst;
1671 rc = PGM_GCPHYS_2_PTR_EX(pVM, pPage->GCPhys, &pvGst); AssertReleaseRC(rc);
1672 bool fFlush;
1673 unsigned cChanges;
1674
1675 if (pPage->enmKind == PGMPOOLKIND_PAE_PT_FOR_PAE_PT)
1676 cChanges = pgmPoolTrackFlushPTPaePae(pPool, pPage, (PPGMSHWPTPAE)pvShw, (PCX86PTPAE)pvGst,
1677 (PCX86PTPAE)&pPool->aDirtyPages[idxSlot].aPage[0], fAllowRemoval, &fFlush);
1678 else
1679 cChanges = pgmPoolTrackFlushPTPae32Bit(pPool, pPage, (PPGMSHWPTPAE)pvShw, (PCX86PT)pvGst,
1680 (PCX86PT)&pPool->aDirtyPages[idxSlot].aPage[0], fAllowRemoval, &fFlush);
1681
1682 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pvGst);
1683 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pvShw);
1684 STAM_PROFILE_STOP(&pPool->StatTrackDeref,a);
1685 /* Note: we might want to consider keeping the dirty page active in case there were many changes. */
1686
1687 /* This page is likely to be modified again, so reduce the nr of modifications just a bit here. */
1688 Assert(pPage->cModifications);
1689 if (cChanges < 4)
1690 pPage->cModifications = 1; /* must use > 0 here */
1691 else
1692 pPage->cModifications = RT_MAX(1, pPage->cModifications / 2);
1693
1694 STAM_COUNTER_INC(&pPool->StatResetDirtyPages);
1695 if (pPool->cDirtyPages == RT_ELEMENTS(pPool->aDirtyPages))
1696 pPool->idxFreeDirtyPage = idxSlot;
1697
1698 pPool->cDirtyPages--;
1699 pPool->aDirtyPages[idxSlot].uIdx = NIL_PGMPOOL_IDX;
1700 Assert(pPool->cDirtyPages <= RT_ELEMENTS(pPool->aDirtyPages));
1701 if (fFlush)
1702 {
1703 Assert(fAllowRemoval);
1704 Log(("Flush reused page table!\n"));
1705 pgmPoolFlushPage(pPool, pPage);
1706 STAM_COUNTER_INC(&pPool->StatForceFlushReused);
1707 }
1708 else
1709 Log(("Removed dirty page %RGp cMods=%d cChanges=%d\n", pPage->GCPhys, pPage->cModifications, cChanges));
1710
1711#if defined(VBOX_WITH_2X_4GB_ADDR_SPACE_IN_R0) || defined(IN_RC)
1712 PGMRZDynMapPopAutoSubset(pVCpu, iPrevSubset);
1713#endif
1714}
1715
1716
1717# ifndef IN_RING3
1718/**
1719 * Add a new dirty page
1720 *
1721 * @param pVM Pointer to the VM.
1722 * @param pPool The pool.
1723 * @param pPage The page.
1724 */
1725void pgmPoolAddDirtyPage(PVM pVM, PPGMPOOL pPool, PPGMPOOLPAGE pPage)
1726{
1727 unsigned idxFree;
1728
1729 PGM_LOCK_ASSERT_OWNER(pVM);
1730 AssertCompile(RT_ELEMENTS(pPool->aDirtyPages) == 8 || RT_ELEMENTS(pPool->aDirtyPages) == 16);
1731 Assert(!pPage->fDirty);
1732
1733 idxFree = pPool->idxFreeDirtyPage;
1734 Assert(idxFree < RT_ELEMENTS(pPool->aDirtyPages));
1735 Assert(pPage->iMonitoredNext == NIL_PGMPOOL_IDX && pPage->iMonitoredPrev == NIL_PGMPOOL_IDX);
1736
1737 if (pPool->cDirtyPages >= RT_ELEMENTS(pPool->aDirtyPages))
1738 {
1739 STAM_COUNTER_INC(&pPool->StatDirtyPageOverFlowFlush);
1740 pgmPoolFlushDirtyPage(pVM, pPool, idxFree, true /* allow removal of reused page tables*/);
1741 }
1742 Assert(pPool->cDirtyPages < RT_ELEMENTS(pPool->aDirtyPages));
1743 AssertMsg(pPool->aDirtyPages[idxFree].uIdx == NIL_PGMPOOL_IDX, ("idxFree=%d cDirtyPages=%d\n", idxFree, pPool->cDirtyPages));
1744
1745 Log(("Add dirty page %RGp (slot=%d)\n", pPage->GCPhys, idxFree));
1746
1747 /*
1748 * Make a copy of the guest page table as we require valid GCPhys addresses
1749 * when removing references to physical pages.
1750 * (The HCPhys linear lookup is *extremely* expensive!)
1751 */
1752 void *pvGst;
1753 int rc = PGM_GCPHYS_2_PTR_EX(pVM, pPage->GCPhys, &pvGst); AssertReleaseRC(rc);
1754 memcpy(&pPool->aDirtyPages[idxFree].aPage[0], pvGst, (pPage->enmKind == PGMPOOLKIND_PAE_PT_FOR_PAE_PT) ? PAGE_SIZE : PAGE_SIZE/2);
1755# ifdef VBOX_STRICT
1756 void *pvShw = PGMPOOL_PAGE_2_PTR(pVM, pPage);
1757 if (pPage->enmKind == PGMPOOLKIND_PAE_PT_FOR_PAE_PT)
1758 pgmPoolTrackCheckPTPaePae(pPool, pPage, (PPGMSHWPTPAE)pvShw, (PCX86PTPAE)pvGst);
1759 else
1760 pgmPoolTrackCheckPTPae32Bit(pPool, pPage, (PPGMSHWPTPAE)pvShw, (PCX86PT)pvGst);
1761 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pvShw);
1762# endif
1763 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pvGst);
1764
1765 STAM_COUNTER_INC(&pPool->StatDirtyPage);
1766 pPage->fDirty = true;
1767 pPage->idxDirtyEntry = (uint8_t)idxFree; Assert(pPage->idxDirtyEntry == idxFree);
1768 pPool->aDirtyPages[idxFree].uIdx = pPage->idx;
1769 pPool->cDirtyPages++;
1770
1771 pPool->idxFreeDirtyPage = (pPool->idxFreeDirtyPage + 1) & (RT_ELEMENTS(pPool->aDirtyPages) - 1);
1772 if ( pPool->cDirtyPages < RT_ELEMENTS(pPool->aDirtyPages)
1773 && pPool->aDirtyPages[pPool->idxFreeDirtyPage].uIdx != NIL_PGMPOOL_IDX)
1774 {
1775 unsigned i;
1776 for (i = 1; i < RT_ELEMENTS(pPool->aDirtyPages); i++)
1777 {
1778 idxFree = (pPool->idxFreeDirtyPage + i) & (RT_ELEMENTS(pPool->aDirtyPages) - 1);
1779 if (pPool->aDirtyPages[idxFree].uIdx == NIL_PGMPOOL_IDX)
1780 {
1781 pPool->idxFreeDirtyPage = idxFree;
1782 break;
1783 }
1784 }
1785 Assert(i != RT_ELEMENTS(pPool->aDirtyPages));
1786 }
1787
1788 Assert(pPool->cDirtyPages == RT_ELEMENTS(pPool->aDirtyPages) || pPool->aDirtyPages[pPool->idxFreeDirtyPage].uIdx == NIL_PGMPOOL_IDX);
1789 return;
1790}
1791# endif /* !IN_RING3 */
1792
1793
1794/**
1795 * Check if the specified page is dirty (not write monitored)
1796 *
1797 * @return dirty or not
1798 * @param pVM Pointer to the VM.
1799 * @param GCPhys Guest physical address
1800 */
1801bool pgmPoolIsDirtyPage(PVM pVM, RTGCPHYS GCPhys)
1802{
1803 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
1804 PGM_LOCK_ASSERT_OWNER(pVM);
1805 if (!pPool->cDirtyPages)
1806 return false;
1807
1808 GCPhys = GCPhys & ~(RTGCPHYS)PAGE_OFFSET_MASK;
1809
1810 for (unsigned i = 0; i < RT_ELEMENTS(pPool->aDirtyPages); i++)
1811 {
1812 if (pPool->aDirtyPages[i].uIdx != NIL_PGMPOOL_IDX)
1813 {
1814 PPGMPOOLPAGE pPage;
1815 unsigned idxPage = pPool->aDirtyPages[i].uIdx;
1816
1817 pPage = &pPool->aPages[idxPage];
1818 if (pPage->GCPhys == GCPhys)
1819 return true;
1820 }
1821 }
1822 return false;
1823}
1824
1825
1826/**
1827 * Reset all dirty pages by reinstating page monitoring.
1828 *
1829 * @param pVM Pointer to the VM.
1830 */
1831void pgmPoolResetDirtyPages(PVM pVM)
1832{
1833 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
1834 PGM_LOCK_ASSERT_OWNER(pVM);
1835 Assert(pPool->cDirtyPages <= RT_ELEMENTS(pPool->aDirtyPages));
1836
1837 if (!pPool->cDirtyPages)
1838 return;
1839
1840 Log(("pgmPoolResetDirtyPages\n"));
1841 for (unsigned i = 0; i < RT_ELEMENTS(pPool->aDirtyPages); i++)
1842 pgmPoolFlushDirtyPage(pVM, pPool, i, true /* allow removal of reused page tables*/);
1843
1844 pPool->idxFreeDirtyPage = 0;
1845 if ( pPool->cDirtyPages != RT_ELEMENTS(pPool->aDirtyPages)
1846 && pPool->aDirtyPages[pPool->idxFreeDirtyPage].uIdx != NIL_PGMPOOL_IDX)
1847 {
1848 unsigned i;
1849 for (i = 1; i < RT_ELEMENTS(pPool->aDirtyPages); i++)
1850 {
1851 if (pPool->aDirtyPages[i].uIdx == NIL_PGMPOOL_IDX)
1852 {
1853 pPool->idxFreeDirtyPage = i;
1854 break;
1855 }
1856 }
1857 AssertMsg(i != RT_ELEMENTS(pPool->aDirtyPages), ("cDirtyPages %d", pPool->cDirtyPages));
1858 }
1859
1860 Assert(pPool->aDirtyPages[pPool->idxFreeDirtyPage].uIdx == NIL_PGMPOOL_IDX || pPool->cDirtyPages == RT_ELEMENTS(pPool->aDirtyPages));
1861 return;
1862}
1863
1864
1865/**
1866 * Invalidate the PT entry for the specified page
1867 *
1868 * @param pVM Pointer to the VM.
1869 * @param GCPtrPage Guest page to invalidate
1870 */
1871void pgmPoolResetDirtyPage(PVM pVM, RTGCPTR GCPtrPage)
1872{
1873 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
1874 PGM_LOCK_ASSERT_OWNER(pVM);
1875 Assert(pPool->cDirtyPages <= RT_ELEMENTS(pPool->aDirtyPages));
1876
1877 if (!pPool->cDirtyPages)
1878 return;
1879
1880 Log(("pgmPoolResetDirtyPage %RGv\n", GCPtrPage));
1881 for (unsigned i = 0; i < RT_ELEMENTS(pPool->aDirtyPages); i++)
1882 {
1883 }
1884}
1885
1886
1887/**
1888 * Reset all dirty pages by reinstating page monitoring.
1889 *
1890 * @param pVM Pointer to the VM.
1891 * @param GCPhysPT Physical address of the page table
1892 */
1893void pgmPoolInvalidateDirtyPage(PVM pVM, RTGCPHYS GCPhysPT)
1894{
1895 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
1896 PGM_LOCK_ASSERT_OWNER(pVM);
1897 Assert(pPool->cDirtyPages <= RT_ELEMENTS(pPool->aDirtyPages));
1898 unsigned idxDirtyPage = RT_ELEMENTS(pPool->aDirtyPages);
1899
1900 if (!pPool->cDirtyPages)
1901 return;
1902
1903 GCPhysPT = GCPhysPT & ~(RTGCPHYS)PAGE_OFFSET_MASK;
1904
1905 for (unsigned i = 0; i < RT_ELEMENTS(pPool->aDirtyPages); i++)
1906 {
1907 if (pPool->aDirtyPages[i].uIdx != NIL_PGMPOOL_IDX)
1908 {
1909 unsigned idxPage = pPool->aDirtyPages[i].uIdx;
1910
1911 PPGMPOOLPAGE pPage = &pPool->aPages[idxPage];
1912 if (pPage->GCPhys == GCPhysPT)
1913 {
1914 idxDirtyPage = i;
1915 break;
1916 }
1917 }
1918 }
1919
1920 if (idxDirtyPage != RT_ELEMENTS(pPool->aDirtyPages))
1921 {
1922 pgmPoolFlushDirtyPage(pVM, pPool, idxDirtyPage, true /* allow removal of reused page tables*/);
1923 if ( pPool->cDirtyPages != RT_ELEMENTS(pPool->aDirtyPages)
1924 && pPool->aDirtyPages[pPool->idxFreeDirtyPage].uIdx != NIL_PGMPOOL_IDX)
1925 {
1926 unsigned i;
1927 for (i = 0; i < RT_ELEMENTS(pPool->aDirtyPages); i++)
1928 {
1929 if (pPool->aDirtyPages[i].uIdx == NIL_PGMPOOL_IDX)
1930 {
1931 pPool->idxFreeDirtyPage = i;
1932 break;
1933 }
1934 }
1935 AssertMsg(i != RT_ELEMENTS(pPool->aDirtyPages), ("cDirtyPages %d", pPool->cDirtyPages));
1936 }
1937 }
1938}
1939
1940# endif /* PGMPOOL_WITH_OPTIMIZED_DIRTY_PT */
1941
1942/**
1943 * Inserts a page into the GCPhys hash table.
1944 *
1945 * @param pPool The pool.
1946 * @param pPage The page.
1947 */
1948DECLINLINE(void) pgmPoolHashInsert(PPGMPOOL pPool, PPGMPOOLPAGE pPage)
1949{
1950 Log3(("pgmPoolHashInsert: %RGp\n", pPage->GCPhys));
1951 Assert(pPage->GCPhys != NIL_RTGCPHYS); Assert(pPage->iNext == NIL_PGMPOOL_IDX);
1952 uint16_t iHash = PGMPOOL_HASH(pPage->GCPhys);
1953 pPage->iNext = pPool->aiHash[iHash];
1954 pPool->aiHash[iHash] = pPage->idx;
1955}
1956
1957
1958/**
1959 * Removes a page from the GCPhys hash table.
1960 *
1961 * @param pPool The pool.
1962 * @param pPage The page.
1963 */
1964DECLINLINE(void) pgmPoolHashRemove(PPGMPOOL pPool, PPGMPOOLPAGE pPage)
1965{
1966 Log3(("pgmPoolHashRemove: %RGp\n", pPage->GCPhys));
1967 uint16_t iHash = PGMPOOL_HASH(pPage->GCPhys);
1968 if (pPool->aiHash[iHash] == pPage->idx)
1969 pPool->aiHash[iHash] = pPage->iNext;
1970 else
1971 {
1972 uint16_t iPrev = pPool->aiHash[iHash];
1973 for (;;)
1974 {
1975 const int16_t i = pPool->aPages[iPrev].iNext;
1976 if (i == pPage->idx)
1977 {
1978 pPool->aPages[iPrev].iNext = pPage->iNext;
1979 break;
1980 }
1981 if (i == NIL_PGMPOOL_IDX)
1982 {
1983 AssertReleaseMsgFailed(("GCPhys=%RGp idx=%d\n", pPage->GCPhys, pPage->idx));
1984 break;
1985 }
1986 iPrev = i;
1987 }
1988 }
1989 pPage->iNext = NIL_PGMPOOL_IDX;
1990}
1991
1992
1993/**
1994 * Frees up one cache page.
1995 *
1996 * @returns VBox status code.
1997 * @retval VINF_SUCCESS on success.
1998 * @param pPool The pool.
1999 * @param iUser The user index.
2000 */
2001static int pgmPoolCacheFreeOne(PPGMPOOL pPool, uint16_t iUser)
2002{
2003#ifndef IN_RC
2004 const PVM pVM = pPool->CTX_SUFF(pVM);
2005#endif
2006 Assert(pPool->iAgeHead != pPool->iAgeTail); /* We shouldn't be here if there < 2 cached entries! */
2007 STAM_COUNTER_INC(&pPool->StatCacheFreeUpOne);
2008
2009 /*
2010 * Select one page from the tail of the age list.
2011 */
2012 PPGMPOOLPAGE pPage;
2013 for (unsigned iLoop = 0; ; iLoop++)
2014 {
2015 uint16_t iToFree = pPool->iAgeTail;
2016 if (iToFree == iUser)
2017 iToFree = pPool->aPages[iToFree].iAgePrev;
2018/* This is the alternative to the SyncCR3 pgmPoolCacheUsed calls.
2019 if (pPool->aPages[iToFree].iUserHead != NIL_PGMPOOL_USER_INDEX)
2020 {
2021 uint16_t i = pPool->aPages[iToFree].iAgePrev;
2022 for (unsigned j = 0; j < 10 && i != NIL_PGMPOOL_USER_INDEX; j++, i = pPool->aPages[i].iAgePrev)
2023 {
2024 if (pPool->aPages[iToFree].iUserHead == NIL_PGMPOOL_USER_INDEX)
2025 continue;
2026 iToFree = i;
2027 break;
2028 }
2029 }
2030*/
2031 Assert(iToFree != iUser);
2032 AssertRelease(iToFree != NIL_PGMPOOL_IDX);
2033 pPage = &pPool->aPages[iToFree];
2034
2035 /*
2036 * Reject any attempts at flushing the currently active shadow CR3 mapping.
2037 * Call pgmPoolCacheUsed to move the page to the head of the age list.
2038 */
2039 if (!pgmPoolIsPageLocked(pPage))
2040 break;
2041 LogFlow(("pgmPoolCacheFreeOne: refuse CR3 mapping\n"));
2042 pgmPoolCacheUsed(pPool, pPage);
2043 AssertLogRelReturn(iLoop < 8192, VERR_PGM_POOL_TOO_MANY_LOOPS);
2044 }
2045
2046 /*
2047 * Found a usable page, flush it and return.
2048 */
2049 int rc = pgmPoolFlushPage(pPool, pPage);
2050 /* This flush was initiated by us and not the guest, so explicitly flush the TLB. */
2051 /* todo: find out why this is necessary; pgmPoolFlushPage should trigger a flush if one is really needed. */
2052 if (rc == VINF_SUCCESS)
2053 PGM_INVL_ALL_VCPU_TLBS(pVM);
2054 return rc;
2055}
2056
2057
2058/**
2059 * Checks if a kind mismatch is really a page being reused
2060 * or if it's just normal remappings.
2061 *
2062 * @returns true if reused and the cached page (enmKind1) should be flushed
2063 * @returns false if not reused.
2064 * @param enmKind1 The kind of the cached page.
2065 * @param enmKind2 The kind of the requested page.
2066 */
2067static bool pgmPoolCacheReusedByKind(PGMPOOLKIND enmKind1, PGMPOOLKIND enmKind2)
2068{
2069 switch (enmKind1)
2070 {
2071 /*
2072 * Never reuse them. There is no remapping in non-paging mode.
2073 */
2074 case PGMPOOLKIND_32BIT_PT_FOR_PHYS:
2075 case PGMPOOLKIND_32BIT_PD_PHYS:
2076 case PGMPOOLKIND_PAE_PT_FOR_PHYS:
2077 case PGMPOOLKIND_PAE_PD_PHYS:
2078 case PGMPOOLKIND_PAE_PDPT_PHYS:
2079 case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
2080 case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
2081 case PGMPOOLKIND_EPT_PT_FOR_PHYS:
2082 case PGMPOOLKIND_EPT_PD_FOR_PHYS:
2083 case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
2084 case PGMPOOLKIND_PAE_PDPT_FOR_32BIT: /* never reuse them for other types */
2085 return false;
2086
2087 /*
2088 * It's perfectly fine to reuse these, except for PAE and non-paging stuff.
2089 */
2090 case PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB:
2091 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
2092 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
2093 case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
2094 case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
2095 case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
2096 case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
2097 case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
2098 case PGMPOOLKIND_32BIT_PD:
2099 case PGMPOOLKIND_PAE_PDPT:
2100 switch (enmKind2)
2101 {
2102 case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
2103 case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
2104 case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
2105 case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
2106 case PGMPOOLKIND_64BIT_PML4:
2107 case PGMPOOLKIND_PAE_PT_FOR_PAE_2MB:
2108 case PGMPOOLKIND_32BIT_PT_FOR_PHYS:
2109 case PGMPOOLKIND_PAE_PT_FOR_PHYS:
2110 case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
2111 case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
2112 case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
2113 case PGMPOOLKIND_EPT_PD_FOR_PHYS:
2114 case PGMPOOLKIND_EPT_PT_FOR_PHYS:
2115 return true;
2116 default:
2117 return false;
2118 }
2119
2120 /*
2121 * It's perfectly fine to reuse these, except for PAE and non-paging stuff.
2122 */
2123 case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
2124 case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
2125 case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
2126 case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
2127 case PGMPOOLKIND_64BIT_PML4:
2128 case PGMPOOLKIND_PAE_PT_FOR_PAE_2MB:
2129 switch (enmKind2)
2130 {
2131 case PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB:
2132 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
2133 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
2134 case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
2135 case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
2136 case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
2137 case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
2138 case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
2139 case PGMPOOLKIND_32BIT_PT_FOR_PHYS:
2140 case PGMPOOLKIND_PAE_PT_FOR_PHYS:
2141 case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
2142 case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
2143 case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
2144 case PGMPOOLKIND_EPT_PD_FOR_PHYS:
2145 case PGMPOOLKIND_EPT_PT_FOR_PHYS:
2146 return true;
2147 default:
2148 return false;
2149 }
2150
2151 /*
2152 * These cannot be flushed, and it's common to reuse the PDs as PTs.
2153 */
2154 case PGMPOOLKIND_ROOT_NESTED:
2155 return false;
2156
2157 default:
2158 AssertFatalMsgFailed(("enmKind1=%d\n", enmKind1));
2159 }
2160}
2161
2162
2163/**
2164 * Attempts to satisfy a pgmPoolAlloc request from the cache.
2165 *
2166 * @returns VBox status code.
2167 * @retval VINF_PGM_CACHED_PAGE on success.
2168 * @retval VERR_FILE_NOT_FOUND if not found.
2169 * @param pPool The pool.
2170 * @param GCPhys The GC physical address of the page we're gonna shadow.
2171 * @param enmKind The kind of mapping.
2172 * @param enmAccess Access type for the mapping (only relevant for big pages)
2173 * @param fA20Enabled Whether the CPU has the A20 gate enabled.
2174 * @param iUser The shadow page pool index of the user table.
2175 * @param iUserTable The index into the user table (shadowed).
2176 * @param ppPage Where to store the pointer to the page.
2177 */
2178static int pgmPoolCacheAlloc(PPGMPOOL pPool, RTGCPHYS GCPhys, PGMPOOLKIND enmKind, PGMPOOLACCESS enmAccess, bool fA20Enabled,
2179 uint16_t iUser, uint32_t iUserTable, PPPGMPOOLPAGE ppPage)
2180{
2181 /*
2182 * Look up the GCPhys in the hash.
2183 */
2184 unsigned i = pPool->aiHash[PGMPOOL_HASH(GCPhys)];
2185 Log3(("pgmPoolCacheAlloc: %RGp kind %s iUser=%d iUserTable=%x SLOT=%d\n", GCPhys, pgmPoolPoolKindToStr(enmKind), iUser, iUserTable, i));
2186 if (i != NIL_PGMPOOL_IDX)
2187 {
2188 do
2189 {
2190 PPGMPOOLPAGE pPage = &pPool->aPages[i];
2191 Log4(("pgmPoolCacheAlloc: slot %d found page %RGp\n", i, pPage->GCPhys));
2192 if (pPage->GCPhys == GCPhys)
2193 {
2194 if ( (PGMPOOLKIND)pPage->enmKind == enmKind
2195 && (PGMPOOLACCESS)pPage->enmAccess == enmAccess
2196 && pPage->fA20Enabled == fA20Enabled)
2197 {
2198 /* Put it at the start of the use list to make sure pgmPoolTrackAddUser
2199 * doesn't flush it in case there are no more free use records.
2200 */
2201 pgmPoolCacheUsed(pPool, pPage);
2202
2203 int rc = pgmPoolTrackAddUser(pPool, pPage, iUser, iUserTable);
2204 if (RT_SUCCESS(rc))
2205 {
2206 Assert((PGMPOOLKIND)pPage->enmKind == enmKind);
2207 *ppPage = pPage;
2208 if (pPage->cModifications)
2209 pPage->cModifications = 1; /* reset counter (can't use 0, or else it will be reinserted in the modified list) */
2210 STAM_COUNTER_INC(&pPool->StatCacheHits);
2211 return VINF_PGM_CACHED_PAGE;
2212 }
2213 return rc;
2214 }
2215
2216 if ((PGMPOOLKIND)pPage->enmKind != enmKind)
2217 {
2218 /*
2219 * The kind is different. In some cases we should now flush the page
2220 * as it has been reused, but in most cases this is normal remapping
2221 * of PDs as PT or big pages using the GCPhys field in a slightly
2222 * different way than the other kinds.
2223 */
2224 if (pgmPoolCacheReusedByKind((PGMPOOLKIND)pPage->enmKind, enmKind))
2225 {
2226 STAM_COUNTER_INC(&pPool->StatCacheKindMismatches);
2227 pgmPoolFlushPage(pPool, pPage);
2228 break;
2229 }
2230 }
2231 }
2232
2233 /* next */
2234 i = pPage->iNext;
2235 } while (i != NIL_PGMPOOL_IDX);
2236 }
2237
2238 Log3(("pgmPoolCacheAlloc: Missed GCPhys=%RGp enmKind=%s\n", GCPhys, pgmPoolPoolKindToStr(enmKind)));
2239 STAM_COUNTER_INC(&pPool->StatCacheMisses);
2240 return VERR_FILE_NOT_FOUND;
2241}
2242
2243
2244/**
2245 * Inserts a page into the cache.
2246 *
2247 * @param pPool The pool.
2248 * @param pPage The cached page.
2249 * @param fCanBeCached Set if the page is fit for caching from the caller's point of view.
2250 */
2251static void pgmPoolCacheInsert(PPGMPOOL pPool, PPGMPOOLPAGE pPage, bool fCanBeCached)
2252{
2253 /*
2254 * Insert into the GCPhys hash if the page is fit for that.
2255 */
2256 Assert(!pPage->fCached);
2257 if (fCanBeCached)
2258 {
2259 pPage->fCached = true;
2260 pgmPoolHashInsert(pPool, pPage);
2261 Log3(("pgmPoolCacheInsert: Caching %p:{.Core=%RHp, .idx=%d, .enmKind=%s, GCPhys=%RGp}\n",
2262 pPage, pPage->Core.Key, pPage->idx, pgmPoolPoolKindToStr(pPage->enmKind), pPage->GCPhys));
2263 STAM_COUNTER_INC(&pPool->StatCacheCacheable);
2264 }
2265 else
2266 {
2267 Log3(("pgmPoolCacheInsert: Not caching %p:{.Core=%RHp, .idx=%d, .enmKind=%s, GCPhys=%RGp}\n",
2268 pPage, pPage->Core.Key, pPage->idx, pgmPoolPoolKindToStr(pPage->enmKind), pPage->GCPhys));
2269 STAM_COUNTER_INC(&pPool->StatCacheUncacheable);
2270 }
2271
2272 /*
2273 * Insert at the head of the age list.
2274 */
2275 pPage->iAgePrev = NIL_PGMPOOL_IDX;
2276 pPage->iAgeNext = pPool->iAgeHead;
2277 if (pPool->iAgeHead != NIL_PGMPOOL_IDX)
2278 pPool->aPages[pPool->iAgeHead].iAgePrev = pPage->idx;
2279 else
2280 pPool->iAgeTail = pPage->idx;
2281 pPool->iAgeHead = pPage->idx;
2282}
2283
2284
2285/**
2286 * Flushes a cached page.
2287 *
2288 * @param pPool The pool.
2289 * @param pPage The cached page.
2290 */
2291static void pgmPoolCacheFlushPage(PPGMPOOL pPool, PPGMPOOLPAGE pPage)
2292{
2293 Log3(("pgmPoolCacheFlushPage: %RGp\n", pPage->GCPhys));
2294
2295 /*
2296 * Remove the page from the hash.
2297 */
2298 if (pPage->fCached)
2299 {
2300 pPage->fCached = false;
2301 pgmPoolHashRemove(pPool, pPage);
2302 }
2303 else
2304 Assert(pPage->iNext == NIL_PGMPOOL_IDX);
2305
2306 /*
2307 * Remove it from the age list.
2308 */
2309 if (pPage->iAgeNext != NIL_PGMPOOL_IDX)
2310 pPool->aPages[pPage->iAgeNext].iAgePrev = pPage->iAgePrev;
2311 else
2312 pPool->iAgeTail = pPage->iAgePrev;
2313 if (pPage->iAgePrev != NIL_PGMPOOL_IDX)
2314 pPool->aPages[pPage->iAgePrev].iAgeNext = pPage->iAgeNext;
2315 else
2316 pPool->iAgeHead = pPage->iAgeNext;
2317 pPage->iAgeNext = NIL_PGMPOOL_IDX;
2318 pPage->iAgePrev = NIL_PGMPOOL_IDX;
2319}
2320
2321
2322/**
2323 * Looks for pages sharing the monitor.
2324 *
2325 * @returns Pointer to the head page.
2326 * @returns NULL if not found.
2327 * @param pPool The Pool
2328 * @param pNewPage The page which is going to be monitored.
2329 */
2330static PPGMPOOLPAGE pgmPoolMonitorGetPageByGCPhys(PPGMPOOL pPool, PPGMPOOLPAGE pNewPage)
2331{
2332 /*
2333 * Look up the GCPhys in the hash.
2334 */
2335 RTGCPHYS GCPhys = pNewPage->GCPhys & ~(RTGCPHYS)PAGE_OFFSET_MASK;
2336 unsigned i = pPool->aiHash[PGMPOOL_HASH(GCPhys)];
2337 if (i == NIL_PGMPOOL_IDX)
2338 return NULL;
2339 do
2340 {
2341 PPGMPOOLPAGE pPage = &pPool->aPages[i];
2342 if ( pPage->GCPhys - GCPhys < PAGE_SIZE
2343 && pPage != pNewPage)
2344 {
2345 switch (pPage->enmKind)
2346 {
2347 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
2348 case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
2349 case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
2350 case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
2351 case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
2352 case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
2353 case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
2354 case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
2355 case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
2356 case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
2357 case PGMPOOLKIND_64BIT_PML4:
2358 case PGMPOOLKIND_32BIT_PD:
2359 case PGMPOOLKIND_PAE_PDPT:
2360 {
2361 /* find the head */
2362 while (pPage->iMonitoredPrev != NIL_PGMPOOL_IDX)
2363 {
2364 Assert(pPage->iMonitoredPrev != pPage->idx);
2365 pPage = &pPool->aPages[pPage->iMonitoredPrev];
2366 }
2367 return pPage;
2368 }
2369
2370 /* ignore, no monitoring. */
2371 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
2372 case PGMPOOLKIND_PAE_PT_FOR_PAE_2MB:
2373 case PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB:
2374 case PGMPOOLKIND_32BIT_PT_FOR_PHYS:
2375 case PGMPOOLKIND_PAE_PT_FOR_PHYS:
2376 case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
2377 case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
2378 case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
2379 case PGMPOOLKIND_EPT_PD_FOR_PHYS:
2380 case PGMPOOLKIND_EPT_PT_FOR_PHYS:
2381 case PGMPOOLKIND_ROOT_NESTED:
2382 case PGMPOOLKIND_PAE_PD_PHYS:
2383 case PGMPOOLKIND_PAE_PDPT_PHYS:
2384 case PGMPOOLKIND_32BIT_PD_PHYS:
2385 case PGMPOOLKIND_PAE_PDPT_FOR_32BIT:
2386 break;
2387 default:
2388 AssertFatalMsgFailed(("enmKind=%d idx=%d\n", pPage->enmKind, pPage->idx));
2389 }
2390 }
2391
2392 /* next */
2393 i = pPage->iNext;
2394 } while (i != NIL_PGMPOOL_IDX);
2395 return NULL;
2396}
2397
2398
2399/**
2400 * Enabled write monitoring of a guest page.
2401 *
2402 * @returns VBox status code.
2403 * @retval VINF_SUCCESS on success.
2404 * @param pPool The pool.
2405 * @param pPage The cached page.
2406 */
2407static int pgmPoolMonitorInsert(PPGMPOOL pPool, PPGMPOOLPAGE pPage)
2408{
2409 LogFlow(("pgmPoolMonitorInsert %RGp\n", pPage->GCPhys & ~(RTGCPHYS)PAGE_OFFSET_MASK));
2410
2411 /*
2412 * Filter out the relevant kinds.
2413 */
2414 switch (pPage->enmKind)
2415 {
2416 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
2417 case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
2418 case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
2419 case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
2420 case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
2421 case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
2422 case PGMPOOLKIND_64BIT_PML4:
2423 case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
2424 case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
2425 case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
2426 case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
2427 case PGMPOOLKIND_32BIT_PD:
2428 case PGMPOOLKIND_PAE_PDPT:
2429 break;
2430
2431 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
2432 case PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB:
2433 case PGMPOOLKIND_PAE_PT_FOR_PAE_2MB:
2434 case PGMPOOLKIND_32BIT_PT_FOR_PHYS:
2435 case PGMPOOLKIND_PAE_PT_FOR_PHYS:
2436 case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
2437 case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
2438 case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
2439 case PGMPOOLKIND_EPT_PD_FOR_PHYS:
2440 case PGMPOOLKIND_EPT_PT_FOR_PHYS:
2441 case PGMPOOLKIND_ROOT_NESTED:
2442 /* Nothing to monitor here. */
2443 return VINF_SUCCESS;
2444
2445 case PGMPOOLKIND_32BIT_PD_PHYS:
2446 case PGMPOOLKIND_PAE_PDPT_PHYS:
2447 case PGMPOOLKIND_PAE_PD_PHYS:
2448 case PGMPOOLKIND_PAE_PDPT_FOR_32BIT:
2449 /* Nothing to monitor here. */
2450 return VINF_SUCCESS;
2451 default:
2452 AssertFatalMsgFailed(("This can't happen! enmKind=%d\n", pPage->enmKind));
2453 }
2454
2455 /*
2456 * Install handler.
2457 */
2458 int rc;
2459 PPGMPOOLPAGE pPageHead = pgmPoolMonitorGetPageByGCPhys(pPool, pPage);
2460 if (pPageHead)
2461 {
2462 Assert(pPageHead != pPage); Assert(pPageHead->iMonitoredNext != pPage->idx);
2463 Assert(pPageHead->iMonitoredPrev != pPage->idx);
2464
2465#ifdef PGMPOOL_WITH_OPTIMIZED_DIRTY_PT
2466 if (pPageHead->fDirty)
2467 pgmPoolFlushDirtyPage(pPool->CTX_SUFF(pVM), pPool, pPageHead->idxDirtyEntry, false /* do not remove */);
2468#endif
2469
2470 pPage->iMonitoredPrev = pPageHead->idx;
2471 pPage->iMonitoredNext = pPageHead->iMonitoredNext;
2472 if (pPageHead->iMonitoredNext != NIL_PGMPOOL_IDX)
2473 pPool->aPages[pPageHead->iMonitoredNext].iMonitoredPrev = pPage->idx;
2474 pPageHead->iMonitoredNext = pPage->idx;
2475 rc = VINF_SUCCESS;
2476 }
2477 else
2478 {
2479 Assert(pPage->iMonitoredNext == NIL_PGMPOOL_IDX); Assert(pPage->iMonitoredPrev == NIL_PGMPOOL_IDX);
2480 PVM pVM = pPool->CTX_SUFF(pVM);
2481 const RTGCPHYS GCPhysPage = pPage->GCPhys & ~(RTGCPHYS)PAGE_OFFSET_MASK;
2482 rc = PGMHandlerPhysicalRegisterEx(pVM, PGMPHYSHANDLERTYPE_PHYSICAL_WRITE,
2483 GCPhysPage, GCPhysPage + PAGE_OFFSET_MASK,
2484 pPool->pfnAccessHandlerR3, MMHyperCCToR3(pVM, pPage),
2485 pPool->pfnAccessHandlerR0, MMHyperCCToR0(pVM, pPage),
2486 pPool->pfnAccessHandlerRC, MMHyperCCToRC(pVM, pPage),
2487 pPool->pszAccessHandler);
2488 /** @todo we should probably deal with out-of-memory conditions here, but for now increasing
2489 * the heap size should suffice. */
2490 AssertFatalMsgRC(rc, ("PGMHandlerPhysicalRegisterEx %RGp failed with %Rrc\n", GCPhysPage, rc));
2491 PVMCPU pVCpu = VMMGetCpu(pVM);
2492 AssertFatalMsg(!(pVCpu->pgm.s.fSyncFlags & PGM_SYNC_CLEAR_PGM_POOL) || VMCPU_FF_ISSET(pVCpu, VMCPU_FF_PGM_SYNC_CR3), ("fSyncFlags=%x syncff=%d\n", pVCpu->pgm.s.fSyncFlags, VMCPU_FF_ISSET(pVCpu, VMCPU_FF_PGM_SYNC_CR3)));
2493 }
2494 pPage->fMonitored = true;
2495 return rc;
2496}
2497
2498
2499/**
2500 * Disables write monitoring of a guest page.
2501 *
2502 * @returns VBox status code.
2503 * @retval VINF_SUCCESS on success.
2504 * @param pPool The pool.
2505 * @param pPage The cached page.
2506 */
2507static int pgmPoolMonitorFlush(PPGMPOOL pPool, PPGMPOOLPAGE pPage)
2508{
2509 /*
2510 * Filter out the relevant kinds.
2511 */
2512 switch (pPage->enmKind)
2513 {
2514 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
2515 case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
2516 case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
2517 case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
2518 case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
2519 case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
2520 case PGMPOOLKIND_64BIT_PML4:
2521 case PGMPOOLKIND_32BIT_PD:
2522 case PGMPOOLKIND_PAE_PDPT:
2523 case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
2524 case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
2525 case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
2526 case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
2527 break;
2528
2529 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
2530 case PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB:
2531 case PGMPOOLKIND_PAE_PT_FOR_PAE_2MB:
2532 case PGMPOOLKIND_32BIT_PT_FOR_PHYS:
2533 case PGMPOOLKIND_PAE_PT_FOR_PHYS:
2534 case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
2535 case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
2536 case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
2537 case PGMPOOLKIND_EPT_PD_FOR_PHYS:
2538 case PGMPOOLKIND_EPT_PT_FOR_PHYS:
2539 case PGMPOOLKIND_ROOT_NESTED:
2540 case PGMPOOLKIND_PAE_PD_PHYS:
2541 case PGMPOOLKIND_PAE_PDPT_PHYS:
2542 case PGMPOOLKIND_32BIT_PD_PHYS:
2543 /* Nothing to monitor here. */
2544 Assert(!pPage->fMonitored);
2545 return VINF_SUCCESS;
2546
2547 default:
2548 AssertFatalMsgFailed(("This can't happen! enmKind=%d\n", pPage->enmKind));
2549 }
2550 Assert(pPage->fMonitored);
2551
2552 /*
2553 * Remove the page from the monitored list or uninstall it if last.
2554 */
2555 const PVM pVM = pPool->CTX_SUFF(pVM);
2556 int rc;
2557 if ( pPage->iMonitoredNext != NIL_PGMPOOL_IDX
2558 || pPage->iMonitoredPrev != NIL_PGMPOOL_IDX)
2559 {
2560 if (pPage->iMonitoredPrev == NIL_PGMPOOL_IDX)
2561 {
2562 PPGMPOOLPAGE pNewHead = &pPool->aPages[pPage->iMonitoredNext];
2563 pNewHead->iMonitoredPrev = NIL_PGMPOOL_IDX;
2564 rc = PGMHandlerPhysicalChangeCallbacks(pVM, pPage->GCPhys & ~(RTGCPHYS)PAGE_OFFSET_MASK,
2565 pPool->pfnAccessHandlerR3, MMHyperCCToR3(pVM, pNewHead),
2566 pPool->pfnAccessHandlerR0, MMHyperCCToR0(pVM, pNewHead),
2567 pPool->pfnAccessHandlerRC, MMHyperCCToRC(pVM, pNewHead),
2568 pPool->pszAccessHandler);
2569 AssertFatalRCSuccess(rc);
2570 pPage->iMonitoredNext = NIL_PGMPOOL_IDX;
2571 }
2572 else
2573 {
2574 pPool->aPages[pPage->iMonitoredPrev].iMonitoredNext = pPage->iMonitoredNext;
2575 if (pPage->iMonitoredNext != NIL_PGMPOOL_IDX)
2576 {
2577 pPool->aPages[pPage->iMonitoredNext].iMonitoredPrev = pPage->iMonitoredPrev;
2578 pPage->iMonitoredNext = NIL_PGMPOOL_IDX;
2579 }
2580 pPage->iMonitoredPrev = NIL_PGMPOOL_IDX;
2581 rc = VINF_SUCCESS;
2582 }
2583 }
2584 else
2585 {
2586 rc = PGMHandlerPhysicalDeregister(pVM, pPage->GCPhys & ~(RTGCPHYS)PAGE_OFFSET_MASK);
2587 AssertFatalRC(rc);
2588 PVMCPU pVCpu = VMMGetCpu(pVM);
2589 AssertFatalMsg(!(pVCpu->pgm.s.fSyncFlags & PGM_SYNC_CLEAR_PGM_POOL) || VMCPU_FF_ISSET(pVCpu, VMCPU_FF_PGM_SYNC_CR3),
2590 ("%#x %#x\n", pVCpu->pgm.s.fSyncFlags, pVM->fGlobalForcedActions));
2591 }
2592 pPage->fMonitored = false;
2593
2594 /*
2595 * Remove it from the list of modified pages (if in it).
2596 */
2597 pgmPoolMonitorModifiedRemove(pPool, pPage);
2598
2599 return rc;
2600}
2601
2602
2603/**
2604 * Inserts the page into the list of modified pages.
2605 *
2606 * @param pPool The pool.
2607 * @param pPage The page.
2608 */
2609void pgmPoolMonitorModifiedInsert(PPGMPOOL pPool, PPGMPOOLPAGE pPage)
2610{
2611 Log3(("pgmPoolMonitorModifiedInsert: idx=%d\n", pPage->idx));
2612 AssertMsg( pPage->iModifiedNext == NIL_PGMPOOL_IDX
2613 && pPage->iModifiedPrev == NIL_PGMPOOL_IDX
2614 && pPool->iModifiedHead != pPage->idx,
2615 ("Next=%d Prev=%d idx=%d cModifications=%d Head=%d cModifiedPages=%d\n",
2616 pPage->iModifiedNext, pPage->iModifiedPrev, pPage->idx, pPage->cModifications,
2617 pPool->iModifiedHead, pPool->cModifiedPages));
2618
2619 pPage->iModifiedNext = pPool->iModifiedHead;
2620 if (pPool->iModifiedHead != NIL_PGMPOOL_IDX)
2621 pPool->aPages[pPool->iModifiedHead].iModifiedPrev = pPage->idx;
2622 pPool->iModifiedHead = pPage->idx;
2623 pPool->cModifiedPages++;
2624#ifdef VBOX_WITH_STATISTICS
2625 if (pPool->cModifiedPages > pPool->cModifiedPagesHigh)
2626 pPool->cModifiedPagesHigh = pPool->cModifiedPages;
2627#endif
2628}
2629
2630
2631/**
2632 * Removes the page from the list of modified pages and resets the
2633 * modification counter.
2634 *
2635 * @param pPool The pool.
2636 * @param pPage The page which is believed to be in the list of modified pages.
2637 */
2638static void pgmPoolMonitorModifiedRemove(PPGMPOOL pPool, PPGMPOOLPAGE pPage)
2639{
2640 Log3(("pgmPoolMonitorModifiedRemove: idx=%d cModifications=%d\n", pPage->idx, pPage->cModifications));
2641 if (pPool->iModifiedHead == pPage->idx)
2642 {
2643 Assert(pPage->iModifiedPrev == NIL_PGMPOOL_IDX);
2644 pPool->iModifiedHead = pPage->iModifiedNext;
2645 if (pPage->iModifiedNext != NIL_PGMPOOL_IDX)
2646 {
2647 pPool->aPages[pPage->iModifiedNext].iModifiedPrev = NIL_PGMPOOL_IDX;
2648 pPage->iModifiedNext = NIL_PGMPOOL_IDX;
2649 }
2650 pPool->cModifiedPages--;
2651 }
2652 else if (pPage->iModifiedPrev != NIL_PGMPOOL_IDX)
2653 {
2654 pPool->aPages[pPage->iModifiedPrev].iModifiedNext = pPage->iModifiedNext;
2655 if (pPage->iModifiedNext != NIL_PGMPOOL_IDX)
2656 {
2657 pPool->aPages[pPage->iModifiedNext].iModifiedPrev = pPage->iModifiedPrev;
2658 pPage->iModifiedNext = NIL_PGMPOOL_IDX;
2659 }
2660 pPage->iModifiedPrev = NIL_PGMPOOL_IDX;
2661 pPool->cModifiedPages--;
2662 }
2663 else
2664 Assert(pPage->iModifiedPrev == NIL_PGMPOOL_IDX);
2665 pPage->cModifications = 0;
2666}
2667
2668
2669/**
2670 * Zaps the list of modified pages, resetting their modification counters in the process.
2671 *
2672 * @param pVM Pointer to the VM.
2673 */
2674static void pgmPoolMonitorModifiedClearAll(PVM pVM)
2675{
2676 pgmLock(pVM);
2677 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
2678 LogFlow(("pgmPoolMonitorModifiedClearAll: cModifiedPages=%d\n", pPool->cModifiedPages));
2679
2680 unsigned cPages = 0; NOREF(cPages);
2681
2682#ifdef PGMPOOL_WITH_OPTIMIZED_DIRTY_PT
2683 pgmPoolResetDirtyPages(pVM);
2684#endif
2685
2686 uint16_t idx = pPool->iModifiedHead;
2687 pPool->iModifiedHead = NIL_PGMPOOL_IDX;
2688 while (idx != NIL_PGMPOOL_IDX)
2689 {
2690 PPGMPOOLPAGE pPage = &pPool->aPages[idx];
2691 idx = pPage->iModifiedNext;
2692 pPage->iModifiedNext = NIL_PGMPOOL_IDX;
2693 pPage->iModifiedPrev = NIL_PGMPOOL_IDX;
2694 pPage->cModifications = 0;
2695 Assert(++cPages);
2696 }
2697 AssertMsg(cPages == pPool->cModifiedPages, ("%d != %d\n", cPages, pPool->cModifiedPages));
2698 pPool->cModifiedPages = 0;
2699 pgmUnlock(pVM);
2700}
2701
2702
2703/**
2704 * Handle SyncCR3 pool tasks
2705 *
2706 * @returns VBox status code.
2707 * @retval VINF_SUCCESS if successfully added.
2708 * @retval VINF_PGM_SYNC_CR3 is it needs to be deferred to ring 3 (GC only)
2709 * @param pVCpu Pointer to the VMCPU.
2710 * @remark Should only be used when monitoring is available, thus placed in
2711 * the PGMPOOL_WITH_MONITORING #ifdef.
2712 */
2713int pgmPoolSyncCR3(PVMCPU pVCpu)
2714{
2715 PVM pVM = pVCpu->CTX_SUFF(pVM);
2716 LogFlow(("pgmPoolSyncCR3 fSyncFlags=%x\n", pVCpu->pgm.s.fSyncFlags));
2717
2718 /*
2719 * When monitoring shadowed pages, we reset the modification counters on CR3 sync.
2720 * Occasionally we will have to clear all the shadow page tables because we wanted
2721 * to monitor a page which was mapped by too many shadowed page tables. This operation
2722 * sometimes referred to as a 'lightweight flush'.
2723 */
2724# ifdef IN_RING3 /* Don't flush in ring-0 or raw mode, it's taking too long. */
2725 if (pVCpu->pgm.s.fSyncFlags & PGM_SYNC_CLEAR_PGM_POOL)
2726 pgmR3PoolClearAll(pVM, false /*fFlushRemTlb*/);
2727# else /* !IN_RING3 */
2728 if (pVCpu->pgm.s.fSyncFlags & PGM_SYNC_CLEAR_PGM_POOL)
2729 {
2730 Log(("SyncCR3: PGM_SYNC_CLEAR_PGM_POOL is set -> VINF_PGM_SYNC_CR3\n"));
2731 VMCPU_FF_SET(pVCpu, VMCPU_FF_PGM_SYNC_CR3); /** @todo no need to do global sync, right? */
2732
2733 /* Make sure all other VCPUs return to ring 3. */
2734 if (pVM->cCpus > 1)
2735 {
2736 VM_FF_SET(pVM, VM_FF_PGM_POOL_FLUSH_PENDING);
2737 PGM_INVL_ALL_VCPU_TLBS(pVM);
2738 }
2739 return VINF_PGM_SYNC_CR3;
2740 }
2741# endif /* !IN_RING3 */
2742 else
2743 {
2744 pgmPoolMonitorModifiedClearAll(pVM);
2745
2746 /* pgmPoolMonitorModifiedClearAll can cause a pgm pool flush (dirty page clearing), so make sure we handle this! */
2747 if (pVCpu->pgm.s.fSyncFlags & PGM_SYNC_CLEAR_PGM_POOL)
2748 {
2749 Log(("pgmPoolMonitorModifiedClearAll caused a pgm flush -> call pgmPoolSyncCR3 again!\n"));
2750 return pgmPoolSyncCR3(pVCpu);
2751 }
2752 }
2753 return VINF_SUCCESS;
2754}
2755
2756
2757/**
2758 * Frees up at least one user entry.
2759 *
2760 * @returns VBox status code.
2761 * @retval VINF_SUCCESS if successfully added.
2762 * @retval VERR_PGM_POOL_FLUSHED if the pool was flushed.
2763 * @param pPool The pool.
2764 * @param iUser The user index.
2765 */
2766static int pgmPoolTrackFreeOneUser(PPGMPOOL pPool, uint16_t iUser)
2767{
2768 STAM_COUNTER_INC(&pPool->StatTrackFreeUpOneUser);
2769 /*
2770 * Just free cached pages in a braindead fashion.
2771 */
2772 /** @todo walk the age list backwards and free the first with usage. */
2773 int rc = VINF_SUCCESS;
2774 do
2775 {
2776 int rc2 = pgmPoolCacheFreeOne(pPool, iUser);
2777 if (RT_FAILURE(rc2) && rc == VINF_SUCCESS)
2778 rc = rc2;
2779 } while (pPool->iUserFreeHead == NIL_PGMPOOL_USER_INDEX);
2780 return rc;
2781}
2782
2783
2784/**
2785 * Inserts a page into the cache.
2786 *
2787 * This will create user node for the page, insert it into the GCPhys
2788 * hash, and insert it into the age list.
2789 *
2790 * @returns VBox status code.
2791 * @retval VINF_SUCCESS if successfully added.
2792 * @retval VERR_PGM_POOL_FLUSHED if the pool was flushed.
2793 * @param pPool The pool.
2794 * @param pPage The cached page.
2795 * @param GCPhys The GC physical address of the page we're gonna shadow.
2796 * @param iUser The user index.
2797 * @param iUserTable The user table index.
2798 */
2799DECLINLINE(int) pgmPoolTrackInsert(PPGMPOOL pPool, PPGMPOOLPAGE pPage, RTGCPHYS GCPhys, uint16_t iUser, uint32_t iUserTable)
2800{
2801 int rc = VINF_SUCCESS;
2802 PPGMPOOLUSER paUsers = pPool->CTX_SUFF(paUsers);
2803
2804 LogFlow(("pgmPoolTrackInsert GCPhys=%RGp iUser=%d iUserTable=%x\n", GCPhys, iUser, iUserTable));
2805
2806#ifdef VBOX_STRICT
2807 /*
2808 * Check that the entry doesn't already exists.
2809 */
2810 if (pPage->iUserHead != NIL_PGMPOOL_USER_INDEX)
2811 {
2812 uint16_t i = pPage->iUserHead;
2813 do
2814 {
2815 Assert(i < pPool->cMaxUsers);
2816 AssertMsg(paUsers[i].iUser != iUser || paUsers[i].iUserTable != iUserTable, ("%x %x vs new %x %x\n", paUsers[i].iUser, paUsers[i].iUserTable, iUser, iUserTable));
2817 i = paUsers[i].iNext;
2818 } while (i != NIL_PGMPOOL_USER_INDEX);
2819 }
2820#endif
2821
2822 /*
2823 * Find free a user node.
2824 */
2825 uint16_t i = pPool->iUserFreeHead;
2826 if (i == NIL_PGMPOOL_USER_INDEX)
2827 {
2828 rc = pgmPoolTrackFreeOneUser(pPool, iUser);
2829 if (RT_FAILURE(rc))
2830 return rc;
2831 i = pPool->iUserFreeHead;
2832 }
2833
2834 /*
2835 * Unlink the user node from the free list,
2836 * initialize and insert it into the user list.
2837 */
2838 pPool->iUserFreeHead = paUsers[i].iNext;
2839 paUsers[i].iNext = NIL_PGMPOOL_USER_INDEX;
2840 paUsers[i].iUser = iUser;
2841 paUsers[i].iUserTable = iUserTable;
2842 pPage->iUserHead = i;
2843
2844 /*
2845 * Insert into cache and enable monitoring of the guest page if enabled.
2846 *
2847 * Until we implement caching of all levels, including the CR3 one, we'll
2848 * have to make sure we don't try monitor & cache any recursive reuse of
2849 * a monitored CR3 page. Because all windows versions are doing this we'll
2850 * have to be able to do combined access monitoring, CR3 + PT and
2851 * PD + PT (guest PAE).
2852 *
2853 * Update:
2854 * We're now cooperating with the CR3 monitor if an uncachable page is found.
2855 */
2856 const bool fCanBeMonitored = true;
2857 pgmPoolCacheInsert(pPool, pPage, fCanBeMonitored); /* This can be expanded. */
2858 if (fCanBeMonitored)
2859 {
2860 rc = pgmPoolMonitorInsert(pPool, pPage);
2861 AssertRC(rc);
2862 }
2863 return rc;
2864}
2865
2866
2867/**
2868 * Adds a user reference to a page.
2869 *
2870 * This will move the page to the head of the
2871 *
2872 * @returns VBox status code.
2873 * @retval VINF_SUCCESS if successfully added.
2874 * @retval VERR_PGM_POOL_FLUSHED if the pool was flushed.
2875 * @param pPool The pool.
2876 * @param pPage The cached page.
2877 * @param iUser The user index.
2878 * @param iUserTable The user table.
2879 */
2880static int pgmPoolTrackAddUser(PPGMPOOL pPool, PPGMPOOLPAGE pPage, uint16_t iUser, uint32_t iUserTable)
2881{
2882 PPGMPOOLUSER paUsers = pPool->CTX_SUFF(paUsers);
2883
2884 Log3(("pgmPoolTrackAddUser GCPhys = %RGp iUser %x iUserTable %x\n", pPage->GCPhys, iUser, iUserTable));
2885
2886# ifdef VBOX_STRICT
2887 /*
2888 * Check that the entry doesn't already exists. We only allow multiple
2889 * users of top-level paging structures (SHW_POOL_ROOT_IDX).
2890 */
2891 if (pPage->iUserHead != NIL_PGMPOOL_USER_INDEX)
2892 {
2893 uint16_t i = pPage->iUserHead;
2894 do
2895 {
2896 Assert(i < pPool->cMaxUsers);
2897 AssertMsg(iUser != PGMPOOL_IDX_PD || iUser != PGMPOOL_IDX_PDPT || iUser != PGMPOOL_IDX_NESTED_ROOT || iUser != PGMPOOL_IDX_AMD64_CR3 ||
2898 paUsers[i].iUser != iUser || paUsers[i].iUserTable != iUserTable, ("%x %x vs new %x %x\n", paUsers[i].iUser, paUsers[i].iUserTable, iUser, iUserTable));
2899 i = paUsers[i].iNext;
2900 } while (i != NIL_PGMPOOL_USER_INDEX);
2901 }
2902# endif
2903
2904 /*
2905 * Allocate a user node.
2906 */
2907 uint16_t i = pPool->iUserFreeHead;
2908 if (i == NIL_PGMPOOL_USER_INDEX)
2909 {
2910 int rc = pgmPoolTrackFreeOneUser(pPool, iUser);
2911 if (RT_FAILURE(rc))
2912 return rc;
2913 i = pPool->iUserFreeHead;
2914 }
2915 pPool->iUserFreeHead = paUsers[i].iNext;
2916
2917 /*
2918 * Initialize the user node and insert it.
2919 */
2920 paUsers[i].iNext = pPage->iUserHead;
2921 paUsers[i].iUser = iUser;
2922 paUsers[i].iUserTable = iUserTable;
2923 pPage->iUserHead = i;
2924
2925# ifdef PGMPOOL_WITH_OPTIMIZED_DIRTY_PT
2926 if (pPage->fDirty)
2927 pgmPoolFlushDirtyPage(pPool->CTX_SUFF(pVM), pPool, pPage->idxDirtyEntry, false /* do not remove */);
2928# endif
2929
2930 /*
2931 * Tell the cache to update its replacement stats for this page.
2932 */
2933 pgmPoolCacheUsed(pPool, pPage);
2934 return VINF_SUCCESS;
2935}
2936
2937
2938/**
2939 * Frees a user record associated with a page.
2940 *
2941 * This does not clear the entry in the user table, it simply replaces the
2942 * user record to the chain of free records.
2943 *
2944 * @param pPool The pool.
2945 * @param HCPhys The HC physical address of the shadow page.
2946 * @param iUser The shadow page pool index of the user table.
2947 * @param iUserTable The index into the user table (shadowed).
2948 */
2949static void pgmPoolTrackFreeUser(PPGMPOOL pPool, PPGMPOOLPAGE pPage, uint16_t iUser, uint32_t iUserTable)
2950{
2951 /*
2952 * Unlink and free the specified user entry.
2953 */
2954 PPGMPOOLUSER paUsers = pPool->CTX_SUFF(paUsers);
2955
2956 Log3(("pgmPoolTrackFreeUser %RGp %x %x\n", pPage->GCPhys, iUser, iUserTable));
2957 /* Special: For PAE and 32-bit paging, there is usually no more than one user. */
2958 uint16_t i = pPage->iUserHead;
2959 if ( i != NIL_PGMPOOL_USER_INDEX
2960 && paUsers[i].iUser == iUser
2961 && paUsers[i].iUserTable == iUserTable)
2962 {
2963 pPage->iUserHead = paUsers[i].iNext;
2964
2965 paUsers[i].iUser = NIL_PGMPOOL_IDX;
2966 paUsers[i].iNext = pPool->iUserFreeHead;
2967 pPool->iUserFreeHead = i;
2968 return;
2969 }
2970
2971 /* General: Linear search. */
2972 uint16_t iPrev = NIL_PGMPOOL_USER_INDEX;
2973 while (i != NIL_PGMPOOL_USER_INDEX)
2974 {
2975 if ( paUsers[i].iUser == iUser
2976 && paUsers[i].iUserTable == iUserTable)
2977 {
2978 if (iPrev != NIL_PGMPOOL_USER_INDEX)
2979 paUsers[iPrev].iNext = paUsers[i].iNext;
2980 else
2981 pPage->iUserHead = paUsers[i].iNext;
2982
2983 paUsers[i].iUser = NIL_PGMPOOL_IDX;
2984 paUsers[i].iNext = pPool->iUserFreeHead;
2985 pPool->iUserFreeHead = i;
2986 return;
2987 }
2988 iPrev = i;
2989 i = paUsers[i].iNext;
2990 }
2991
2992 /* Fatal: didn't find it */
2993 AssertFatalMsgFailed(("Didn't find the user entry! iUser=%d iUserTable=%#x GCPhys=%RGp\n",
2994 iUser, iUserTable, pPage->GCPhys));
2995}
2996
2997
2998/**
2999 * Gets the entry size of a shadow table.
3000 *
3001 * @param enmKind The kind of page.
3002 *
3003 * @returns The size of the entry in bytes. That is, 4 or 8.
3004 * @returns If the kind is not for a table, an assertion is raised and 0 is
3005 * returned.
3006 */
3007DECLINLINE(unsigned) pgmPoolTrackGetShadowEntrySize(PGMPOOLKIND enmKind)
3008{
3009 switch (enmKind)
3010 {
3011 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
3012 case PGMPOOLKIND_32BIT_PT_FOR_PHYS:
3013 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
3014 case PGMPOOLKIND_32BIT_PD:
3015 case PGMPOOLKIND_32BIT_PD_PHYS:
3016 return 4;
3017
3018 case PGMPOOLKIND_PAE_PT_FOR_PHYS:
3019 case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
3020 case PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB:
3021 case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
3022 case PGMPOOLKIND_PAE_PT_FOR_PAE_2MB:
3023 case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
3024 case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
3025 case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
3026 case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
3027 case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
3028 case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
3029 case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
3030 case PGMPOOLKIND_64BIT_PML4:
3031 case PGMPOOLKIND_PAE_PDPT:
3032 case PGMPOOLKIND_ROOT_NESTED:
3033 case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
3034 case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
3035 case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
3036 case PGMPOOLKIND_EPT_PD_FOR_PHYS:
3037 case PGMPOOLKIND_EPT_PT_FOR_PHYS:
3038 case PGMPOOLKIND_PAE_PD_PHYS:
3039 case PGMPOOLKIND_PAE_PDPT_PHYS:
3040 return 8;
3041
3042 default:
3043 AssertFatalMsgFailed(("enmKind=%d\n", enmKind));
3044 }
3045}
3046
3047
3048/**
3049 * Gets the entry size of a guest table.
3050 *
3051 * @param enmKind The kind of page.
3052 *
3053 * @returns The size of the entry in bytes. That is, 0, 4 or 8.
3054 * @returns If the kind is not for a table, an assertion is raised and 0 is
3055 * returned.
3056 */
3057DECLINLINE(unsigned) pgmPoolTrackGetGuestEntrySize(PGMPOOLKIND enmKind)
3058{
3059 switch (enmKind)
3060 {
3061 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
3062 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
3063 case PGMPOOLKIND_32BIT_PD:
3064 case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
3065 case PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB:
3066 case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
3067 case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
3068 case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
3069 case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
3070 return 4;
3071
3072 case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
3073 case PGMPOOLKIND_PAE_PT_FOR_PAE_2MB:
3074 case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
3075 case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
3076 case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
3077 case PGMPOOLKIND_64BIT_PML4:
3078 case PGMPOOLKIND_PAE_PDPT:
3079 return 8;
3080
3081 case PGMPOOLKIND_32BIT_PT_FOR_PHYS:
3082 case PGMPOOLKIND_PAE_PT_FOR_PHYS:
3083 case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
3084 case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
3085 case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
3086 case PGMPOOLKIND_EPT_PD_FOR_PHYS:
3087 case PGMPOOLKIND_EPT_PT_FOR_PHYS:
3088 case PGMPOOLKIND_ROOT_NESTED:
3089 case PGMPOOLKIND_PAE_PD_PHYS:
3090 case PGMPOOLKIND_PAE_PDPT_PHYS:
3091 case PGMPOOLKIND_32BIT_PD_PHYS:
3092 /** @todo can we return 0? (nobody is calling this...) */
3093 AssertFailed();
3094 return 0;
3095
3096 default:
3097 AssertFatalMsgFailed(("enmKind=%d\n", enmKind));
3098 }
3099}
3100
3101
3102/**
3103 * Checks one shadow page table entry for a mapping of a physical page.
3104 *
3105 * @returns true / false indicating removal of all relevant PTEs
3106 *
3107 * @param pVM Pointer to the VM.
3108 * @param pPhysPage The guest page in question.
3109 * @param fFlushPTEs Flush PTEs or allow them to be updated (e.g. in case of an RW bit change)
3110 * @param iShw The shadow page table.
3111 * @param iPte Page table entry or NIL_PGMPOOL_PHYSEXT_IDX_PTE if unknown
3112 */
3113static bool pgmPoolTrackFlushGCPhysPTInt(PVM pVM, PCPGMPAGE pPhysPage, bool fFlushPTEs, uint16_t iShw, uint16_t iPte)
3114{
3115 LogFlow(("pgmPoolTrackFlushGCPhysPTInt: pPhysPage=%RHp iShw=%d iPte=%d\n", PGM_PAGE_GET_HCPHYS(pPhysPage), iShw, iPte));
3116 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
3117 bool fRet = false;
3118
3119 /*
3120 * Assert sanity.
3121 */
3122 Assert(iPte != NIL_PGMPOOL_PHYSEXT_IDX_PTE);
3123 AssertFatalMsg(iShw < pPool->cCurPages && iShw != NIL_PGMPOOL_IDX, ("iShw=%d\n", iShw));
3124 PPGMPOOLPAGE pPage = &pPool->aPages[iShw];
3125
3126 /*
3127 * Then, clear the actual mappings to the page in the shadow PT.
3128 */
3129 switch (pPage->enmKind)
3130 {
3131 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
3132 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
3133 case PGMPOOLKIND_32BIT_PT_FOR_PHYS:
3134 {
3135 const uint32_t u32 = PGM_PAGE_GET_HCPHYS(pPhysPage) | X86_PTE_P;
3136 PX86PT pPT = (PX86PT)PGMPOOL_PAGE_2_PTR(pVM, pPage);
3137 uint32_t u32AndMask = 0;
3138 uint32_t u32OrMask = 0;
3139
3140 if (!fFlushPTEs)
3141 {
3142 switch (PGM_PAGE_GET_HNDL_PHYS_STATE(pPhysPage))
3143 {
3144 case PGM_PAGE_HNDL_PHYS_STATE_NONE: /** No handler installed. */
3145 case PGM_PAGE_HNDL_PHYS_STATE_DISABLED: /** Monitoring is temporarily disabled. */
3146 u32OrMask = X86_PTE_RW;
3147 u32AndMask = UINT32_MAX;
3148 fRet = true;
3149 STAM_COUNTER_INC(&pPool->StatTrackFlushEntryKeep);
3150 break;
3151
3152 case PGM_PAGE_HNDL_PHYS_STATE_WRITE: /** Write access is monitored. */
3153 u32OrMask = 0;
3154 u32AndMask = ~X86_PTE_RW;
3155 fRet = true;
3156 STAM_COUNTER_INC(&pPool->StatTrackFlushEntryKeep);
3157 break;
3158 default:
3159 /* (shouldn't be here, will assert below) */
3160 STAM_COUNTER_INC(&pPool->StatTrackFlushEntry);
3161 break;
3162 }
3163 }
3164 else
3165 STAM_COUNTER_INC(&pPool->StatTrackFlushEntry);
3166
3167 /* Update the counter if we're removing references. */
3168 if (!u32AndMask)
3169 {
3170 Assert(pPage->cPresent);
3171 Assert(pPool->cPresent);
3172 pPage->cPresent--;
3173 pPool->cPresent--;
3174 }
3175
3176 if ((pPT->a[iPte].u & (X86_PTE_PG_MASK | X86_PTE_P)) == u32)
3177 {
3178 X86PTE Pte;
3179
3180 Log4(("pgmPoolTrackFlushGCPhysPTs: i=%d pte=%RX32\n", iPte, pPT->a[iPte]));
3181 Pte.u = (pPT->a[iPte].u & u32AndMask) | u32OrMask;
3182 if (Pte.u & PGM_PTFLAGS_TRACK_DIRTY)
3183 Pte.n.u1Write = 0; /* need to disallow writes when dirty bit tracking is still active. */
3184
3185 ASMAtomicWriteU32(&pPT->a[iPte].u, Pte.u);
3186 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pPT);
3187 return fRet;
3188 }
3189#ifdef LOG_ENABLED
3190 Log(("iFirstPresent=%d cPresent=%d\n", pPage->iFirstPresent, pPage->cPresent));
3191 for (unsigned i = 0, cFound = 0; i < RT_ELEMENTS(pPT->a); i++)
3192 if ((pPT->a[i].u & (X86_PTE_PG_MASK | X86_PTE_P)) == u32)
3193 {
3194 Log(("i=%d cFound=%d\n", i, ++cFound));
3195 }
3196#endif
3197 AssertFatalMsgFailed(("iFirstPresent=%d cPresent=%d u32=%RX32 poolkind=%x\n", pPage->iFirstPresent, pPage->cPresent, u32, pPage->enmKind));
3198 /*PGM_DYNMAP_UNUSED_HINT_VM(pVM, pPT);*/
3199 break;
3200 }
3201
3202 case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
3203 case PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB:
3204 case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
3205 case PGMPOOLKIND_PAE_PT_FOR_PAE_2MB:
3206 case PGMPOOLKIND_PAE_PT_FOR_PHYS:
3207 case PGMPOOLKIND_EPT_PT_FOR_PHYS: /* physical mask the same as PAE; RW bit as well; be careful! */
3208 {
3209 const uint64_t u64 = PGM_PAGE_GET_HCPHYS(pPhysPage) | X86_PTE_P;
3210 PPGMSHWPTPAE pPT = (PPGMSHWPTPAE)PGMPOOL_PAGE_2_PTR(pVM, pPage);
3211 uint64_t u64OrMask = 0;
3212 uint64_t u64AndMask = 0;
3213
3214 if (!fFlushPTEs)
3215 {
3216 switch (PGM_PAGE_GET_HNDL_PHYS_STATE(pPhysPage))
3217 {
3218 case PGM_PAGE_HNDL_PHYS_STATE_NONE: /* No handler installed. */
3219 case PGM_PAGE_HNDL_PHYS_STATE_DISABLED: /* Monitoring is temporarily disabled. */
3220 u64OrMask = X86_PTE_RW;
3221 u64AndMask = UINT64_MAX;
3222 fRet = true;
3223 STAM_COUNTER_INC(&pPool->StatTrackFlushEntryKeep);
3224 break;
3225
3226 case PGM_PAGE_HNDL_PHYS_STATE_WRITE: /* Write access is monitored. */
3227 u64OrMask = 0;
3228 u64AndMask = ~(uint64_t)X86_PTE_RW;
3229 fRet = true;
3230 STAM_COUNTER_INC(&pPool->StatTrackFlushEntryKeep);
3231 break;
3232
3233 default:
3234 /* (shouldn't be here, will assert below) */
3235 STAM_COUNTER_INC(&pPool->StatTrackFlushEntry);
3236 break;
3237 }
3238 }
3239 else
3240 STAM_COUNTER_INC(&pPool->StatTrackFlushEntry);
3241
3242 /* Update the counter if we're removing references. */
3243 if (!u64AndMask)
3244 {
3245 Assert(pPage->cPresent);
3246 Assert(pPool->cPresent);
3247 pPage->cPresent--;
3248 pPool->cPresent--;
3249 }
3250
3251 if ((PGMSHWPTEPAE_GET_U(pPT->a[iPte]) & (X86_PTE_PAE_PG_MASK | X86_PTE_P | X86_PTE_PAE_MBZ_MASK_NX)) == u64)
3252 {
3253 X86PTEPAE Pte;
3254
3255 Log4(("pgmPoolTrackFlushGCPhysPTs: i=%d pte=%RX64\n", iPte, PGMSHWPTEPAE_GET_LOG(pPT->a[iPte])));
3256 Pte.u = (PGMSHWPTEPAE_GET_U(pPT->a[iPte]) & u64AndMask) | u64OrMask;
3257 if (Pte.u & PGM_PTFLAGS_TRACK_DIRTY)
3258 Pte.n.u1Write = 0; /* need to disallow writes when dirty bit tracking is still active. */
3259
3260 PGMSHWPTEPAE_ATOMIC_SET(pPT->a[iPte], Pte.u);
3261 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pPT);
3262 return fRet;
3263 }
3264#ifdef LOG_ENABLED
3265 Log(("iFirstPresent=%d cPresent=%d\n", pPage->iFirstPresent, pPage->cPresent));
3266 Log(("Found %RX64 expected %RX64\n", PGMSHWPTEPAE_GET_U(pPT->a[iPte]) & (X86_PTE_PAE_PG_MASK | X86_PTE_P | X86_PTE_PAE_MBZ_MASK_NX), u64));
3267 for (unsigned i = 0, cFound = 0; i < RT_ELEMENTS(pPT->a); i++)
3268 if ((PGMSHWPTEPAE_GET_U(pPT->a[i]) & (X86_PTE_PAE_PG_MASK | X86_PTE_P | X86_PTE_PAE_MBZ_MASK_NX)) == u64)
3269 Log(("i=%d cFound=%d\n", i, ++cFound));
3270#endif
3271 AssertFatalMsgFailed(("iFirstPresent=%d cPresent=%d u64=%RX64 poolkind=%x iPte=%d PT=%RX64\n", pPage->iFirstPresent, pPage->cPresent, u64, pPage->enmKind, iPte, PGMSHWPTEPAE_GET_LOG(pPT->a[iPte])));
3272 /*PGM_DYNMAP_UNUSED_HINT_VM(pVM, pPT);*/
3273 break;
3274 }
3275
3276#ifdef PGM_WITH_LARGE_PAGES
3277 /* Large page case only. */
3278 case PGMPOOLKIND_EPT_PD_FOR_PHYS:
3279 {
3280 Assert(pVM->pgm.s.fNestedPaging);
3281
3282 const uint64_t u64 = PGM_PAGE_GET_HCPHYS(pPhysPage) | X86_PDE4M_P | X86_PDE4M_PS;
3283 PEPTPD pPD = (PEPTPD)PGMPOOL_PAGE_2_PTR(pVM, pPage);
3284
3285 if ((pPD->a[iPte].u & (EPT_PDE2M_PG_MASK | X86_PDE4M_P | X86_PDE4M_PS)) == u64)
3286 {
3287 Log4(("pgmPoolTrackFlushGCPhysPTs: i=%d pde=%RX64\n", iPte, pPD->a[iPte]));
3288 STAM_COUNTER_INC(&pPool->StatTrackFlushEntry);
3289 pPD->a[iPte].u = 0;
3290 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pPD);
3291
3292 /* Update the counter as we're removing references. */
3293 Assert(pPage->cPresent);
3294 Assert(pPool->cPresent);
3295 pPage->cPresent--;
3296 pPool->cPresent--;
3297
3298 return fRet;
3299 }
3300# ifdef LOG_ENABLED
3301 Log(("iFirstPresent=%d cPresent=%d\n", pPage->iFirstPresent, pPage->cPresent));
3302 for (unsigned i = 0, cFound = 0; i < RT_ELEMENTS(pPD->a); i++)
3303 if ((pPD->a[i].u & (EPT_PDE2M_PG_MASK | X86_PDE4M_P | X86_PDE4M_PS)) == u64)
3304 Log(("i=%d cFound=%d\n", i, ++cFound));
3305# endif
3306 AssertFatalMsgFailed(("iFirstPresent=%d cPresent=%d\n", pPage->iFirstPresent, pPage->cPresent));
3307 /*PGM_DYNMAP_UNUSED_HINT_VM(pVM, pPD);*/
3308 break;
3309 }
3310
3311 /* AMD-V nested paging */ /** @todo merge with EPT as we only check the parts that are identical. */
3312 case PGMPOOLKIND_PAE_PD_PHYS:
3313 {
3314 Assert(pVM->pgm.s.fNestedPaging);
3315
3316 const uint64_t u64 = PGM_PAGE_GET_HCPHYS(pPhysPage) | X86_PDE4M_P | X86_PDE4M_PS;
3317 PX86PD pPD = (PX86PD)PGMPOOL_PAGE_2_PTR(pVM, pPage);
3318
3319 if ((pPD->a[iPte].u & (X86_PDE2M_PAE_PG_MASK | X86_PDE4M_P | X86_PDE4M_PS)) == u64)
3320 {
3321 Log4(("pgmPoolTrackFlushGCPhysPTs: i=%d pde=%RX64\n", iPte, pPD->a[iPte]));
3322 STAM_COUNTER_INC(&pPool->StatTrackFlushEntry);
3323 pPD->a[iPte].u = 0;
3324 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pPD);
3325
3326 /* Update the counter as we're removing references. */
3327 Assert(pPage->cPresent);
3328 Assert(pPool->cPresent);
3329 pPage->cPresent--;
3330 pPool->cPresent--;
3331 return fRet;
3332 }
3333# ifdef LOG_ENABLED
3334 Log(("iFirstPresent=%d cPresent=%d\n", pPage->iFirstPresent, pPage->cPresent));
3335 for (unsigned i = 0, cFound = 0; i < RT_ELEMENTS(pPD->a); i++)
3336 if ((pPD->a[i].u & (X86_PDE2M_PAE_PG_MASK | X86_PDE4M_P | X86_PDE4M_PS)) == u64)
3337 Log(("i=%d cFound=%d\n", i, ++cFound));
3338# endif
3339 AssertFatalMsgFailed(("iFirstPresent=%d cPresent=%d\n", pPage->iFirstPresent, pPage->cPresent));
3340 /*PGM_DYNMAP_UNUSED_HINT_VM(pVM, pPD);*/
3341 break;
3342 }
3343#endif /* PGM_WITH_LARGE_PAGES */
3344
3345 default:
3346 AssertFatalMsgFailed(("enmKind=%d iShw=%d\n", pPage->enmKind, iShw));
3347 }
3348
3349 /* not reached. */
3350#ifndef _MSC_VER
3351 return fRet;
3352#endif
3353}
3354
3355
3356/**
3357 * Scans one shadow page table for mappings of a physical page.
3358 *
3359 * @param pVM Pointer to the VM.
3360 * @param pPhysPage The guest page in question.
3361 * @param fFlushPTEs Flush PTEs or allow them to be updated (e.g. in case of an RW bit change)
3362 * @param iShw The shadow page table.
3363 */
3364static void pgmPoolTrackFlushGCPhysPT(PVM pVM, PPGMPAGE pPhysPage, bool fFlushPTEs, uint16_t iShw)
3365{
3366 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool); NOREF(pPool);
3367
3368 /* We should only come here with when there's only one reference to this physical page. */
3369 Assert(PGMPOOL_TD_GET_CREFS(PGM_PAGE_GET_TRACKING(pPhysPage)) == 1);
3370
3371 Log2(("pgmPoolTrackFlushGCPhysPT: pPhysPage=%RHp iShw=%d\n", PGM_PAGE_GET_HCPHYS(pPhysPage), iShw));
3372 STAM_PROFILE_START(&pPool->StatTrackFlushGCPhysPT, f);
3373 bool fKeptPTEs = pgmPoolTrackFlushGCPhysPTInt(pVM, pPhysPage, fFlushPTEs, iShw, PGM_PAGE_GET_PTE_INDEX(pPhysPage));
3374 if (!fKeptPTEs)
3375 PGM_PAGE_SET_TRACKING(pVM, pPhysPage, 0);
3376 STAM_PROFILE_STOP(&pPool->StatTrackFlushGCPhysPT, f);
3377}
3378
3379
3380/**
3381 * Flushes a list of shadow page tables mapping the same physical page.
3382 *
3383 * @param pVM Pointer to the VM.
3384 * @param pPhysPage The guest page in question.
3385 * @param fFlushPTEs Flush PTEs or allow them to be updated (e.g. in case of an RW bit change)
3386 * @param iPhysExt The physical cross reference extent list to flush.
3387 */
3388static void pgmPoolTrackFlushGCPhysPTs(PVM pVM, PPGMPAGE pPhysPage, bool fFlushPTEs, uint16_t iPhysExt)
3389{
3390 PGM_LOCK_ASSERT_OWNER(pVM);
3391 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
3392 bool fKeepList = false;
3393
3394 STAM_PROFILE_START(&pPool->StatTrackFlushGCPhysPTs, f);
3395 Log2(("pgmPoolTrackFlushGCPhysPTs: pPhysPage=%RHp iPhysExt\n", PGM_PAGE_GET_HCPHYS(pPhysPage), iPhysExt));
3396
3397 const uint16_t iPhysExtStart = iPhysExt;
3398 PPGMPOOLPHYSEXT pPhysExt;
3399 do
3400 {
3401 Assert(iPhysExt < pPool->cMaxPhysExts);
3402 pPhysExt = &pPool->CTX_SUFF(paPhysExts)[iPhysExt];
3403 for (unsigned i = 0; i < RT_ELEMENTS(pPhysExt->aidx); i++)
3404 {
3405 if (pPhysExt->aidx[i] != NIL_PGMPOOL_IDX)
3406 {
3407 bool fKeptPTEs = pgmPoolTrackFlushGCPhysPTInt(pVM, pPhysPage, fFlushPTEs, pPhysExt->aidx[i], pPhysExt->apte[i]);
3408 if (!fKeptPTEs)
3409 {
3410 pPhysExt->aidx[i] = NIL_PGMPOOL_IDX;
3411 pPhysExt->apte[i] = NIL_PGMPOOL_PHYSEXT_IDX_PTE;
3412 }
3413 else
3414 fKeepList = true;
3415 }
3416 }
3417 /* next */
3418 iPhysExt = pPhysExt->iNext;
3419 } while (iPhysExt != NIL_PGMPOOL_PHYSEXT_INDEX);
3420
3421 if (!fKeepList)
3422 {
3423 /* insert the list into the free list and clear the ram range entry. */
3424 pPhysExt->iNext = pPool->iPhysExtFreeHead;
3425 pPool->iPhysExtFreeHead = iPhysExtStart;
3426 /* Invalidate the tracking data. */
3427 PGM_PAGE_SET_TRACKING(pVM, pPhysPage, 0);
3428 }
3429
3430 STAM_PROFILE_STOP(&pPool->StatTrackFlushGCPhysPTs, f);
3431}
3432
3433
3434/**
3435 * Flushes all shadow page table mappings of the given guest page.
3436 *
3437 * This is typically called when the host page backing the guest one has been
3438 * replaced or when the page protection was changed due to a guest access
3439 * caught by the monitoring.
3440 *
3441 * @returns VBox status code.
3442 * @retval VINF_SUCCESS if all references has been successfully cleared.
3443 * @retval VINF_PGM_SYNC_CR3 if we're better off with a CR3 sync and a page
3444 * pool cleaning. FF and sync flags are set.
3445 *
3446 * @param pVM Pointer to the VM.
3447 * @param GCPhysPage GC physical address of the page in question
3448 * @param pPhysPage The guest page in question.
3449 * @param fFlushPTEs Flush PTEs or allow them to be updated (e.g. in case of an RW bit change)
3450 * @param pfFlushTLBs This is set to @a true if the shadow TLBs should be
3451 * flushed, it is NOT touched if this isn't necessary.
3452 * The caller MUST initialized this to @a false.
3453 */
3454int pgmPoolTrackUpdateGCPhys(PVM pVM, RTGCPHYS GCPhysPage, PPGMPAGE pPhysPage, bool fFlushPTEs, bool *pfFlushTLBs)
3455{
3456 PVMCPU pVCpu = VMMGetCpu(pVM);
3457 pgmLock(pVM);
3458 int rc = VINF_SUCCESS;
3459
3460#ifdef PGM_WITH_LARGE_PAGES
3461 /* Is this page part of a large page? */
3462 if (PGM_PAGE_GET_PDE_TYPE(pPhysPage) == PGM_PAGE_PDE_TYPE_PDE)
3463 {
3464 RTGCPHYS GCPhysBase = GCPhysPage & X86_PDE2M_PAE_PG_MASK;
3465 GCPhysPage &= X86_PDE_PAE_PG_MASK;
3466
3467 /* Fetch the large page base. */
3468 PPGMPAGE pLargePage;
3469 if (GCPhysBase != GCPhysPage)
3470 {
3471 pLargePage = pgmPhysGetPage(pVM, GCPhysBase);
3472 AssertFatal(pLargePage);
3473 }
3474 else
3475 pLargePage = pPhysPage;
3476
3477 Log(("pgmPoolTrackUpdateGCPhys: update large page PDE for %RGp (%RGp)\n", GCPhysBase, GCPhysPage));
3478
3479 if (PGM_PAGE_GET_PDE_TYPE(pLargePage) == PGM_PAGE_PDE_TYPE_PDE)
3480 {
3481 /* Mark the large page as disabled as we need to break it up to change a single page in the 2 MB range. */
3482 PGM_PAGE_SET_PDE_TYPE(pVM, pLargePage, PGM_PAGE_PDE_TYPE_PDE_DISABLED);
3483 pVM->pgm.s.cLargePagesDisabled++;
3484
3485 /* Update the base as that *only* that one has a reference and there's only one PDE to clear. */
3486 rc = pgmPoolTrackUpdateGCPhys(pVM, GCPhysBase, pLargePage, fFlushPTEs, pfFlushTLBs);
3487
3488 *pfFlushTLBs = true;
3489 pgmUnlock(pVM);
3490 return rc;
3491 }
3492 }
3493#else
3494 NOREF(GCPhysPage);
3495#endif /* PGM_WITH_LARGE_PAGES */
3496
3497 const uint16_t u16 = PGM_PAGE_GET_TRACKING(pPhysPage);
3498 if (u16)
3499 {
3500 /*
3501 * The zero page is currently screwing up the tracking and we'll
3502 * have to flush the whole shebang. Unless VBOX_WITH_NEW_LAZY_PAGE_ALLOC
3503 * is defined, zero pages won't normally be mapped. Some kind of solution
3504 * will be needed for this problem of course, but it will have to wait...
3505 */
3506 if ( PGM_PAGE_IS_ZERO(pPhysPage)
3507 || PGM_PAGE_IS_BALLOONED(pPhysPage))
3508 rc = VINF_PGM_GCPHYS_ALIASED;
3509 else
3510 {
3511# if defined(VBOX_WITH_2X_4GB_ADDR_SPACE_IN_R0) || defined(IN_RC) /** @todo we can drop this now. */
3512 /* Start a subset here because pgmPoolTrackFlushGCPhysPTsSlow and
3513 pgmPoolTrackFlushGCPhysPTs will/may kill the pool otherwise. */
3514 uint32_t iPrevSubset = PGMRZDynMapPushAutoSubset(pVCpu);
3515# endif
3516
3517 if (PGMPOOL_TD_GET_CREFS(u16) != PGMPOOL_TD_CREFS_PHYSEXT)
3518 {
3519 Assert(PGMPOOL_TD_GET_CREFS(u16) == 1);
3520 pgmPoolTrackFlushGCPhysPT(pVM,
3521 pPhysPage,
3522 fFlushPTEs,
3523 PGMPOOL_TD_GET_IDX(u16));
3524 }
3525 else if (u16 != PGMPOOL_TD_MAKE(PGMPOOL_TD_CREFS_PHYSEXT, PGMPOOL_TD_IDX_OVERFLOWED))
3526 pgmPoolTrackFlushGCPhysPTs(pVM, pPhysPage, fFlushPTEs, PGMPOOL_TD_GET_IDX(u16));
3527 else
3528 rc = pgmPoolTrackFlushGCPhysPTsSlow(pVM, pPhysPage);
3529 *pfFlushTLBs = true;
3530
3531# if defined(VBOX_WITH_2X_4GB_ADDR_SPACE_IN_R0) || defined(IN_RC)
3532 PGMRZDynMapPopAutoSubset(pVCpu, iPrevSubset);
3533# endif
3534 }
3535 }
3536
3537 if (rc == VINF_PGM_GCPHYS_ALIASED)
3538 {
3539 pVCpu->pgm.s.fSyncFlags |= PGM_SYNC_CLEAR_PGM_POOL;
3540 VMCPU_FF_SET(pVCpu, VMCPU_FF_PGM_SYNC_CR3);
3541 rc = VINF_PGM_SYNC_CR3;
3542 }
3543 pgmUnlock(pVM);
3544 return rc;
3545}
3546
3547
3548/**
3549 * Scans all shadow page tables for mappings of a physical page.
3550 *
3551 * This may be slow, but it's most likely more efficient than cleaning
3552 * out the entire page pool / cache.
3553 *
3554 * @returns VBox status code.
3555 * @retval VINF_SUCCESS if all references has been successfully cleared.
3556 * @retval VINF_PGM_GCPHYS_ALIASED if we're better off with a CR3 sync and
3557 * a page pool cleaning.
3558 *
3559 * @param pVM Pointer to the VM.
3560 * @param pPhysPage The guest page in question.
3561 */
3562int pgmPoolTrackFlushGCPhysPTsSlow(PVM pVM, PPGMPAGE pPhysPage)
3563{
3564 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
3565 STAM_PROFILE_START(&pPool->StatTrackFlushGCPhysPTsSlow, s);
3566 LogFlow(("pgmPoolTrackFlushGCPhysPTsSlow: cUsedPages=%d cPresent=%d pPhysPage=%R[pgmpage]\n",
3567 pPool->cUsedPages, pPool->cPresent, pPhysPage));
3568
3569 /*
3570 * There is a limit to what makes sense.
3571 */
3572 if ( pPool->cPresent > 1024
3573 && pVM->cCpus == 1)
3574 {
3575 LogFlow(("pgmPoolTrackFlushGCPhysPTsSlow: giving up... (cPresent=%d)\n", pPool->cPresent));
3576 STAM_PROFILE_STOP(&pPool->StatTrackFlushGCPhysPTsSlow, s);
3577 return VINF_PGM_GCPHYS_ALIASED;
3578 }
3579
3580 /*
3581 * Iterate all the pages until we've encountered all that in use.
3582 * This is simple but not quite optimal solution.
3583 */
3584 const uint64_t u64 = PGM_PAGE_GET_HCPHYS(pPhysPage) | X86_PTE_P; /** @todo drop X86_PTE_P here as we always test if present separately, anyway. */
3585 const uint32_t u32 = u64; /** @todo move into the 32BIT_PT_xx case */
3586 unsigned cLeft = pPool->cUsedPages;
3587 unsigned iPage = pPool->cCurPages;
3588 while (--iPage >= PGMPOOL_IDX_FIRST)
3589 {
3590 PPGMPOOLPAGE pPage = &pPool->aPages[iPage];
3591 if ( pPage->GCPhys != NIL_RTGCPHYS
3592 && pPage->cPresent)
3593 {
3594 switch (pPage->enmKind)
3595 {
3596 /*
3597 * We only care about shadow page tables.
3598 */
3599 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
3600 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
3601 case PGMPOOLKIND_32BIT_PT_FOR_PHYS:
3602 {
3603 unsigned cPresent = pPage->cPresent;
3604 PX86PT pPT = (PX86PT)PGMPOOL_PAGE_2_PTR(pVM, pPage);
3605 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pPT->a); i++)
3606 if (pPT->a[i].n.u1Present)
3607 {
3608 if ((pPT->a[i].u & (X86_PTE_PG_MASK | X86_PTE_P)) == u32)
3609 {
3610 //Log4(("pgmPoolTrackFlushGCPhysPTsSlow: idx=%d i=%d pte=%RX32\n", iPage, i, pPT->a[i]));
3611 pPT->a[i].u = 0;
3612
3613 /* Update the counter as we're removing references. */
3614 Assert(pPage->cPresent);
3615 Assert(pPool->cPresent);
3616 pPage->cPresent--;
3617 pPool->cPresent--;
3618 }
3619 if (!--cPresent)
3620 break;
3621 }
3622 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pPT);
3623 break;
3624 }
3625
3626 case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
3627 case PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB:
3628 case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
3629 case PGMPOOLKIND_PAE_PT_FOR_PAE_2MB:
3630 case PGMPOOLKIND_PAE_PT_FOR_PHYS:
3631 {
3632 unsigned cPresent = pPage->cPresent;
3633 PPGMSHWPTPAE pPT = (PPGMSHWPTPAE)PGMPOOL_PAGE_2_PTR(pVM, pPage);
3634 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pPT->a); i++)
3635 if (PGMSHWPTEPAE_IS_P(pPT->a[i]))
3636 {
3637 if ((PGMSHWPTEPAE_GET_U(pPT->a[i]) & (X86_PTE_PAE_PG_MASK | X86_PTE_P)) == u64)
3638 {
3639 //Log4(("pgmPoolTrackFlushGCPhysPTsSlow: idx=%d i=%d pte=%RX64\n", iPage, i, pPT->a[i]));
3640 PGMSHWPTEPAE_SET(pPT->a[i], 0); /// @todo why not atomic?
3641
3642 /* Update the counter as we're removing references. */
3643 Assert(pPage->cPresent);
3644 Assert(pPool->cPresent);
3645 pPage->cPresent--;
3646 pPool->cPresent--;
3647 }
3648 if (!--cPresent)
3649 break;
3650 }
3651 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pPT);
3652 break;
3653 }
3654#ifndef IN_RC
3655 case PGMPOOLKIND_EPT_PT_FOR_PHYS:
3656 {
3657 unsigned cPresent = pPage->cPresent;
3658 PEPTPT pPT = (PEPTPT)PGMPOOL_PAGE_2_PTR(pVM, pPage);
3659 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pPT->a); i++)
3660 if (pPT->a[i].n.u1Present)
3661 {
3662 if ((pPT->a[i].u & (EPT_PTE_PG_MASK | X86_PTE_P)) == u64)
3663 {
3664 //Log4(("pgmPoolTrackFlushGCPhysPTsSlow: idx=%d i=%d pte=%RX64\n", iPage, i, pPT->a[i]));
3665 pPT->a[i].u = 0;
3666
3667 /* Update the counter as we're removing references. */
3668 Assert(pPage->cPresent);
3669 Assert(pPool->cPresent);
3670 pPage->cPresent--;
3671 pPool->cPresent--;
3672 }
3673 if (!--cPresent)
3674 break;
3675 }
3676 PGM_DYNMAP_UNUSED_HINT_VM(pVM, pPT);
3677 break;
3678 }
3679#endif
3680 }
3681 if (!--cLeft)
3682 break;
3683 }
3684 }
3685
3686 PGM_PAGE_SET_TRACKING(pVM, pPhysPage, 0);
3687 STAM_PROFILE_STOP(&pPool->StatTrackFlushGCPhysPTsSlow, s);
3688
3689 /*
3690 * There is a limit to what makes sense. The above search is very expensive, so force a pgm pool flush.
3691 */
3692 if (pPool->cPresent > 1024)
3693 {
3694 LogFlow(("pgmPoolTrackFlushGCPhysPTsSlow: giving up... (cPresent=%d)\n", pPool->cPresent));
3695 return VINF_PGM_GCPHYS_ALIASED;
3696 }
3697
3698 return VINF_SUCCESS;
3699}
3700
3701
3702/**
3703 * Clears the user entry in a user table.
3704 *
3705 * This is used to remove all references to a page when flushing it.
3706 */
3707static void pgmPoolTrackClearPageUser(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PCPGMPOOLUSER pUser)
3708{
3709 Assert(pUser->iUser != NIL_PGMPOOL_IDX);
3710 Assert(pUser->iUser < pPool->cCurPages);
3711 uint32_t iUserTable = pUser->iUserTable;
3712
3713 /*
3714 * Map the user page.
3715 */
3716 PPGMPOOLPAGE pUserPage = &pPool->aPages[pUser->iUser];
3717 union
3718 {
3719 uint64_t *pau64;
3720 uint32_t *pau32;
3721 } u;
3722 u.pau64 = (uint64_t *)PGMPOOL_PAGE_2_PTR(pPool->CTX_SUFF(pVM), pUserPage);
3723
3724 LogFlow(("pgmPoolTrackClearPageUser: clear %x in %s (%RGp) (flushing %s)\n", iUserTable, pgmPoolPoolKindToStr(pUserPage->enmKind), pUserPage->Core.Key, pgmPoolPoolKindToStr(pPage->enmKind)));
3725
3726 /* Safety precaution in case we change the paging for other modes too in the future. */
3727 Assert(!pgmPoolIsPageLocked(pPage));
3728
3729#ifdef VBOX_STRICT
3730 /*
3731 * Some sanity checks.
3732 */
3733 switch (pUserPage->enmKind)
3734 {
3735 case PGMPOOLKIND_32BIT_PD:
3736 case PGMPOOLKIND_32BIT_PD_PHYS:
3737 Assert(iUserTable < X86_PG_ENTRIES);
3738 break;
3739 case PGMPOOLKIND_PAE_PDPT:
3740 case PGMPOOLKIND_PAE_PDPT_FOR_32BIT:
3741 case PGMPOOLKIND_PAE_PDPT_PHYS:
3742 Assert(iUserTable < 4);
3743 Assert(!(u.pau64[iUserTable] & PGM_PLXFLAGS_PERMANENT));
3744 break;
3745 case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
3746 case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
3747 case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
3748 case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
3749 case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
3750 case PGMPOOLKIND_PAE_PD_PHYS:
3751 Assert(iUserTable < X86_PG_PAE_ENTRIES);
3752 break;
3753 case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
3754 Assert(iUserTable < X86_PG_PAE_ENTRIES);
3755 Assert(!(u.pau64[iUserTable] & PGM_PDFLAGS_MAPPING));
3756 break;
3757 case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
3758 Assert(iUserTable < X86_PG_PAE_ENTRIES);
3759 Assert(!(u.pau64[iUserTable] & PGM_PLXFLAGS_PERMANENT));
3760 break;
3761 case PGMPOOLKIND_64BIT_PML4:
3762 Assert(!(u.pau64[iUserTable] & PGM_PLXFLAGS_PERMANENT));
3763 /* GCPhys >> PAGE_SHIFT is the index here */
3764 break;
3765 case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
3766 case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
3767 Assert(iUserTable < X86_PG_PAE_ENTRIES);
3768 break;
3769
3770 case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
3771 case PGMPOOLKIND_EPT_PD_FOR_PHYS:
3772 Assert(iUserTable < X86_PG_PAE_ENTRIES);
3773 break;
3774
3775 case PGMPOOLKIND_ROOT_NESTED:
3776 Assert(iUserTable < X86_PG_PAE_ENTRIES);
3777 break;
3778
3779 default:
3780 AssertMsgFailed(("enmKind=%d\n", pUserPage->enmKind));
3781 break;
3782 }
3783#endif /* VBOX_STRICT */
3784
3785 /*
3786 * Clear the entry in the user page.
3787 */
3788 switch (pUserPage->enmKind)
3789 {
3790 /* 32-bit entries */
3791 case PGMPOOLKIND_32BIT_PD:
3792 case PGMPOOLKIND_32BIT_PD_PHYS:
3793 ASMAtomicWriteU32(&u.pau32[iUserTable], 0);
3794 break;
3795
3796 /* 64-bit entries */
3797 case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
3798 case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
3799 case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
3800 case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
3801 case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
3802#ifdef IN_RC
3803 /*
3804 * In 32 bits PAE mode we *must* invalidate the TLB when changing a
3805 * PDPT entry; the CPU fetches them only during cr3 load, so any
3806 * non-present PDPT will continue to cause page faults.
3807 */
3808 ASMReloadCR3();
3809 /* no break */
3810#endif
3811 case PGMPOOLKIND_PAE_PD_PHYS:
3812 case PGMPOOLKIND_PAE_PDPT_PHYS:
3813 case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
3814 case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
3815 case PGMPOOLKIND_64BIT_PML4:
3816 case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
3817 case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
3818 case PGMPOOLKIND_PAE_PDPT:
3819 case PGMPOOLKIND_PAE_PDPT_FOR_32BIT:
3820 case PGMPOOLKIND_ROOT_NESTED:
3821 case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
3822 case PGMPOOLKIND_EPT_PD_FOR_PHYS:
3823 ASMAtomicWriteU64(&u.pau64[iUserTable], 0);
3824 break;
3825
3826 default:
3827 AssertFatalMsgFailed(("enmKind=%d iUser=%d iUserTable=%#x\n", pUserPage->enmKind, pUser->iUser, pUser->iUserTable));
3828 }
3829 PGM_DYNMAP_UNUSED_HINT_VM(pPool->CTX_SUFF(pVM), u.pau64);
3830}
3831
3832
3833/**
3834 * Clears all users of a page.
3835 */
3836static void pgmPoolTrackClearPageUsers(PPGMPOOL pPool, PPGMPOOLPAGE pPage)
3837{
3838 /*
3839 * Free all the user records.
3840 */
3841 LogFlow(("pgmPoolTrackClearPageUsers %RGp\n", pPage->GCPhys));
3842
3843 PPGMPOOLUSER paUsers = pPool->CTX_SUFF(paUsers);
3844 uint16_t i = pPage->iUserHead;
3845 while (i != NIL_PGMPOOL_USER_INDEX)
3846 {
3847 /* Clear enter in user table. */
3848 pgmPoolTrackClearPageUser(pPool, pPage, &paUsers[i]);
3849
3850 /* Free it. */
3851 const uint16_t iNext = paUsers[i].iNext;
3852 paUsers[i].iUser = NIL_PGMPOOL_IDX;
3853 paUsers[i].iNext = pPool->iUserFreeHead;
3854 pPool->iUserFreeHead = i;
3855
3856 /* Next. */
3857 i = iNext;
3858 }
3859 pPage->iUserHead = NIL_PGMPOOL_USER_INDEX;
3860}
3861
3862
3863/**
3864 * Allocates a new physical cross reference extent.
3865 *
3866 * @returns Pointer to the allocated extent on success. NULL if we're out of them.
3867 * @param pVM Pointer to the VM.
3868 * @param piPhysExt Where to store the phys ext index.
3869 */
3870PPGMPOOLPHYSEXT pgmPoolTrackPhysExtAlloc(PVM pVM, uint16_t *piPhysExt)
3871{
3872 PGM_LOCK_ASSERT_OWNER(pVM);
3873 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
3874 uint16_t iPhysExt = pPool->iPhysExtFreeHead;
3875 if (iPhysExt == NIL_PGMPOOL_PHYSEXT_INDEX)
3876 {
3877 STAM_COUNTER_INC(&pPool->StamTrackPhysExtAllocFailures);
3878 return NULL;
3879 }
3880 PPGMPOOLPHYSEXT pPhysExt = &pPool->CTX_SUFF(paPhysExts)[iPhysExt];
3881 pPool->iPhysExtFreeHead = pPhysExt->iNext;
3882 pPhysExt->iNext = NIL_PGMPOOL_PHYSEXT_INDEX;
3883 *piPhysExt = iPhysExt;
3884 return pPhysExt;
3885}
3886
3887
3888/**
3889 * Frees a physical cross reference extent.
3890 *
3891 * @param pVM Pointer to the VM.
3892 * @param iPhysExt The extent to free.
3893 */
3894void pgmPoolTrackPhysExtFree(PVM pVM, uint16_t iPhysExt)
3895{
3896 PGM_LOCK_ASSERT_OWNER(pVM);
3897 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
3898 Assert(iPhysExt < pPool->cMaxPhysExts);
3899 PPGMPOOLPHYSEXT pPhysExt = &pPool->CTX_SUFF(paPhysExts)[iPhysExt];
3900 for (unsigned i = 0; i < RT_ELEMENTS(pPhysExt->aidx); i++)
3901 {
3902 pPhysExt->aidx[i] = NIL_PGMPOOL_IDX;
3903 pPhysExt->apte[i] = NIL_PGMPOOL_PHYSEXT_IDX_PTE;
3904 }
3905 pPhysExt->iNext = pPool->iPhysExtFreeHead;
3906 pPool->iPhysExtFreeHead = iPhysExt;
3907}
3908
3909
3910/**
3911 * Frees a physical cross reference extent.
3912 *
3913 * @param pVM Pointer to the VM.
3914 * @param iPhysExt The extent to free.
3915 */
3916void pgmPoolTrackPhysExtFreeList(PVM pVM, uint16_t iPhysExt)
3917{
3918 PGM_LOCK_ASSERT_OWNER(pVM);
3919 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
3920
3921 const uint16_t iPhysExtStart = iPhysExt;
3922 PPGMPOOLPHYSEXT pPhysExt;
3923 do
3924 {
3925 Assert(iPhysExt < pPool->cMaxPhysExts);
3926 pPhysExt = &pPool->CTX_SUFF(paPhysExts)[iPhysExt];
3927 for (unsigned i = 0; i < RT_ELEMENTS(pPhysExt->aidx); i++)
3928 {
3929 pPhysExt->aidx[i] = NIL_PGMPOOL_IDX;
3930 pPhysExt->apte[i] = NIL_PGMPOOL_PHYSEXT_IDX_PTE;
3931 }
3932
3933 /* next */
3934 iPhysExt = pPhysExt->iNext;
3935 } while (iPhysExt != NIL_PGMPOOL_PHYSEXT_INDEX);
3936
3937 pPhysExt->iNext = pPool->iPhysExtFreeHead;
3938 pPool->iPhysExtFreeHead = iPhysExtStart;
3939}
3940
3941
3942/**
3943 * Insert a reference into a list of physical cross reference extents.
3944 *
3945 * @returns The new tracking data for PGMPAGE.
3946 *
3947 * @param pVM Pointer to the VM.
3948 * @param iPhysExt The physical extent index of the list head.
3949 * @param iShwPT The shadow page table index.
3950 * @param iPte Page table entry
3951 *
3952 */
3953static uint16_t pgmPoolTrackPhysExtInsert(PVM pVM, uint16_t iPhysExt, uint16_t iShwPT, uint16_t iPte)
3954{
3955 PGM_LOCK_ASSERT_OWNER(pVM);
3956 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
3957 PPGMPOOLPHYSEXT paPhysExts = pPool->CTX_SUFF(paPhysExts);
3958
3959 /*
3960 * Special common cases.
3961 */
3962 if (paPhysExts[iPhysExt].aidx[1] == NIL_PGMPOOL_IDX)
3963 {
3964 paPhysExts[iPhysExt].aidx[1] = iShwPT;
3965 paPhysExts[iPhysExt].apte[1] = iPte;
3966 STAM_COUNTER_INC(&pVM->pgm.s.CTX_SUFF(pStats)->StatTrackAliasedMany);
3967 LogFlow(("pgmPoolTrackPhysExtInsert: %d:{,%d pte %d,}\n", iPhysExt, iShwPT, iPte));
3968 return PGMPOOL_TD_MAKE(PGMPOOL_TD_CREFS_PHYSEXT, iPhysExt);
3969 }
3970 if (paPhysExts[iPhysExt].aidx[2] == NIL_PGMPOOL_IDX)
3971 {
3972 paPhysExts[iPhysExt].aidx[2] = iShwPT;
3973 paPhysExts[iPhysExt].apte[2] = iPte;
3974 STAM_COUNTER_INC(&pVM->pgm.s.CTX_SUFF(pStats)->StatTrackAliasedMany);
3975 LogFlow(("pgmPoolTrackPhysExtInsert: %d:{,,%d pte %d}\n", iPhysExt, iShwPT, iPte));
3976 return PGMPOOL_TD_MAKE(PGMPOOL_TD_CREFS_PHYSEXT, iPhysExt);
3977 }
3978 AssertCompile(RT_ELEMENTS(paPhysExts[iPhysExt].aidx) == 3);
3979
3980 /*
3981 * General treatment.
3982 */
3983 const uint16_t iPhysExtStart = iPhysExt;
3984 unsigned cMax = 15;
3985 for (;;)
3986 {
3987 Assert(iPhysExt < pPool->cMaxPhysExts);
3988 for (unsigned i = 0; i < RT_ELEMENTS(paPhysExts[iPhysExt].aidx); i++)
3989 if (paPhysExts[iPhysExt].aidx[i] == NIL_PGMPOOL_IDX)
3990 {
3991 paPhysExts[iPhysExt].aidx[i] = iShwPT;
3992 paPhysExts[iPhysExt].apte[i] = iPte;
3993 STAM_COUNTER_INC(&pVM->pgm.s.CTX_SUFF(pStats)->StatTrackAliasedMany);
3994 LogFlow(("pgmPoolTrackPhysExtInsert: %d:{%d pte %d} i=%d cMax=%d\n", iPhysExt, iShwPT, iPte, i, cMax));
3995 return PGMPOOL_TD_MAKE(PGMPOOL_TD_CREFS_PHYSEXT, iPhysExtStart);
3996 }
3997 if (!--cMax)
3998 {
3999 STAM_COUNTER_INC(&pVM->pgm.s.CTX_SUFF(pStats)->StatTrackOverflows);
4000 pgmPoolTrackPhysExtFreeList(pVM, iPhysExtStart);
4001 LogFlow(("pgmPoolTrackPhysExtInsert: overflow (1) iShwPT=%d\n", iShwPT));
4002 return PGMPOOL_TD_MAKE(PGMPOOL_TD_CREFS_PHYSEXT, PGMPOOL_TD_IDX_OVERFLOWED);
4003 }
4004
4005 /* advance */
4006 iPhysExt = paPhysExts[iPhysExt].iNext;
4007 if (iPhysExt == NIL_PGMPOOL_PHYSEXT_INDEX)
4008 break;
4009 }
4010
4011 /*
4012 * Add another extent to the list.
4013 */
4014 PPGMPOOLPHYSEXT pNew = pgmPoolTrackPhysExtAlloc(pVM, &iPhysExt);
4015 if (!pNew)
4016 {
4017 STAM_COUNTER_INC(&pVM->pgm.s.CTX_SUFF(pStats)->StatTrackNoExtentsLeft);
4018 pgmPoolTrackPhysExtFreeList(pVM, iPhysExtStart);
4019 LogFlow(("pgmPoolTrackPhysExtInsert: pgmPoolTrackPhysExtAlloc failed iShwPT=%d\n", iShwPT));
4020 return PGMPOOL_TD_MAKE(PGMPOOL_TD_CREFS_PHYSEXT, PGMPOOL_TD_IDX_OVERFLOWED);
4021 }
4022 pNew->iNext = iPhysExtStart;
4023 pNew->aidx[0] = iShwPT;
4024 pNew->apte[0] = iPte;
4025 LogFlow(("pgmPoolTrackPhysExtInsert: added new extent %d:{%d pte %d}->%d\n", iPhysExt, iShwPT, iPte, iPhysExtStart));
4026 return PGMPOOL_TD_MAKE(PGMPOOL_TD_CREFS_PHYSEXT, iPhysExt);
4027}
4028
4029
4030/**
4031 * Add a reference to guest physical page where extents are in use.
4032 *
4033 * @returns The new tracking data for PGMPAGE.
4034 *
4035 * @param pVM Pointer to the VM.
4036 * @param pPhysPage Pointer to the aPages entry in the ram range.
4037 * @param u16 The ram range flags (top 16-bits).
4038 * @param iShwPT The shadow page table index.
4039 * @param iPte Page table entry
4040 */
4041uint16_t pgmPoolTrackPhysExtAddref(PVM pVM, PPGMPAGE pPhysPage, uint16_t u16, uint16_t iShwPT, uint16_t iPte)
4042{
4043 pgmLock(pVM);
4044 if (PGMPOOL_TD_GET_CREFS(u16) != PGMPOOL_TD_CREFS_PHYSEXT)
4045 {
4046 /*
4047 * Convert to extent list.
4048 */
4049 Assert(PGMPOOL_TD_GET_CREFS(u16) == 1);
4050 uint16_t iPhysExt;
4051 PPGMPOOLPHYSEXT pPhysExt = pgmPoolTrackPhysExtAlloc(pVM, &iPhysExt);
4052 if (pPhysExt)
4053 {
4054 LogFlow(("pgmPoolTrackPhysExtAddref: new extent: %d:{%d, %d}\n", iPhysExt, PGMPOOL_TD_GET_IDX(u16), iShwPT));
4055 STAM_COUNTER_INC(&pVM->pgm.s.CTX_SUFF(pStats)->StatTrackAliased);
4056 pPhysExt->aidx[0] = PGMPOOL_TD_GET_IDX(u16);
4057 pPhysExt->apte[0] = PGM_PAGE_GET_PTE_INDEX(pPhysPage);
4058 pPhysExt->aidx[1] = iShwPT;
4059 pPhysExt->apte[1] = iPte;
4060 u16 = PGMPOOL_TD_MAKE(PGMPOOL_TD_CREFS_PHYSEXT, iPhysExt);
4061 }
4062 else
4063 u16 = PGMPOOL_TD_MAKE(PGMPOOL_TD_CREFS_PHYSEXT, PGMPOOL_TD_IDX_OVERFLOWED);
4064 }
4065 else if (u16 != PGMPOOL_TD_MAKE(PGMPOOL_TD_CREFS_PHYSEXT, PGMPOOL_TD_IDX_OVERFLOWED))
4066 {
4067 /*
4068 * Insert into the extent list.
4069 */
4070 u16 = pgmPoolTrackPhysExtInsert(pVM, PGMPOOL_TD_GET_IDX(u16), iShwPT, iPte);
4071 }
4072 else
4073 STAM_COUNTER_INC(&pVM->pgm.s.CTX_SUFF(pStats)->StatTrackAliasedLots);
4074 pgmUnlock(pVM);
4075 return u16;
4076}
4077
4078
4079/**
4080 * Clear references to guest physical memory.
4081 *
4082 * @param pPool The pool.
4083 * @param pPage The page.
4084 * @param pPhysPage Pointer to the aPages entry in the ram range.
4085 * @param iPte Shadow PTE index
4086 */
4087void pgmPoolTrackPhysExtDerefGCPhys(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PPGMPAGE pPhysPage, uint16_t iPte)
4088{
4089 PVM pVM = pPool->CTX_SUFF(pVM);
4090 const unsigned cRefs = PGM_PAGE_GET_TD_CREFS(pPhysPage);
4091 AssertFatalMsg(cRefs == PGMPOOL_TD_CREFS_PHYSEXT, ("cRefs=%d pPhysPage=%R[pgmpage] pPage=%p:{.idx=%d}\n", cRefs, pPhysPage, pPage, pPage->idx));
4092
4093 uint16_t iPhysExt = PGM_PAGE_GET_TD_IDX(pPhysPage);
4094 if (iPhysExt != PGMPOOL_TD_IDX_OVERFLOWED)
4095 {
4096 pgmLock(pVM);
4097
4098 uint16_t iPhysExtPrev = NIL_PGMPOOL_PHYSEXT_INDEX;
4099 PPGMPOOLPHYSEXT paPhysExts = pPool->CTX_SUFF(paPhysExts);
4100 do
4101 {
4102 Assert(iPhysExt < pPool->cMaxPhysExts);
4103
4104 /*
4105 * Look for the shadow page and check if it's all freed.
4106 */
4107 for (unsigned i = 0; i < RT_ELEMENTS(paPhysExts[iPhysExt].aidx); i++)
4108 {
4109 if ( paPhysExts[iPhysExt].aidx[i] == pPage->idx
4110 && paPhysExts[iPhysExt].apte[i] == iPte)
4111 {
4112 paPhysExts[iPhysExt].aidx[i] = NIL_PGMPOOL_IDX;
4113 paPhysExts[iPhysExt].apte[i] = NIL_PGMPOOL_PHYSEXT_IDX_PTE;
4114
4115 for (i = 0; i < RT_ELEMENTS(paPhysExts[iPhysExt].aidx); i++)
4116 if (paPhysExts[iPhysExt].aidx[i] != NIL_PGMPOOL_IDX)
4117 {
4118 Log2(("pgmPoolTrackPhysExtDerefGCPhys: pPhysPage=%R[pgmpage] idx=%d\n", pPhysPage, pPage->idx));
4119 pgmUnlock(pVM);
4120 return;
4121 }
4122
4123 /* we can free the node. */
4124 const uint16_t iPhysExtNext = paPhysExts[iPhysExt].iNext;
4125 if ( iPhysExtPrev == NIL_PGMPOOL_PHYSEXT_INDEX
4126 && iPhysExtNext == NIL_PGMPOOL_PHYSEXT_INDEX)
4127 {
4128 /* lonely node */
4129 pgmPoolTrackPhysExtFree(pVM, iPhysExt);
4130 Log2(("pgmPoolTrackPhysExtDerefGCPhys: pPhysPage=%R[pgmpage] idx=%d lonely\n", pPhysPage, pPage->idx));
4131 PGM_PAGE_SET_TRACKING(pVM, pPhysPage, 0);
4132 }
4133 else if (iPhysExtPrev == NIL_PGMPOOL_PHYSEXT_INDEX)
4134 {
4135 /* head */
4136 Log2(("pgmPoolTrackPhysExtDerefGCPhys: pPhysPage=%R[pgmpage] idx=%d head\n", pPhysPage, pPage->idx));
4137 PGM_PAGE_SET_TRACKING(pVM, pPhysPage, PGMPOOL_TD_MAKE(PGMPOOL_TD_CREFS_PHYSEXT, iPhysExtNext));
4138 pgmPoolTrackPhysExtFree(pVM, iPhysExt);
4139 }
4140 else
4141 {
4142 /* in list */
4143 Log2(("pgmPoolTrackPhysExtDerefGCPhys: pPhysPage=%R[pgmpage] idx=%d in list\n", pPhysPage, pPage->idx));
4144 paPhysExts[iPhysExtPrev].iNext = iPhysExtNext;
4145 pgmPoolTrackPhysExtFree(pVM, iPhysExt);
4146 }
4147 iPhysExt = iPhysExtNext;
4148 pgmUnlock(pVM);
4149 return;
4150 }
4151 }
4152
4153 /* next */
4154 iPhysExtPrev = iPhysExt;
4155 iPhysExt = paPhysExts[iPhysExt].iNext;
4156 } while (iPhysExt != NIL_PGMPOOL_PHYSEXT_INDEX);
4157
4158 pgmUnlock(pVM);
4159 AssertFatalMsgFailed(("not-found! cRefs=%d pPhysPage=%R[pgmpage] pPage=%p:{.idx=%d}\n", cRefs, pPhysPage, pPage, pPage->idx));
4160 }
4161 else /* nothing to do */
4162 Log2(("pgmPoolTrackPhysExtDerefGCPhys: pPhysPage=%R[pgmpage]\n", pPhysPage));
4163}
4164
4165/**
4166 * Clear references to guest physical memory.
4167 *
4168 * This is the same as pgmPoolTracDerefGCPhysHint except that the guest
4169 * physical address is assumed to be correct, so the linear search can be
4170 * skipped and we can assert at an earlier point.
4171 *
4172 * @param pPool The pool.
4173 * @param pPage The page.
4174 * @param HCPhys The host physical address corresponding to the guest page.
4175 * @param GCPhys The guest physical address corresponding to HCPhys.
4176 * @param iPte Shadow PTE index
4177 */
4178static void pgmPoolTracDerefGCPhys(PPGMPOOL pPool, PPGMPOOLPAGE pPage, RTHCPHYS HCPhys, RTGCPHYS GCPhys, uint16_t iPte)
4179{
4180 /*
4181 * Lookup the page and check if it checks out before derefing it.
4182 */
4183 PVM pVM = pPool->CTX_SUFF(pVM);
4184 PPGMPAGE pPhysPage = pgmPhysGetPage(pVM, GCPhys);
4185 if (pPhysPage)
4186 {
4187 Assert(PGM_PAGE_GET_HCPHYS(pPhysPage));
4188#ifdef LOG_ENABLED
4189 RTHCPHYS HCPhysPage = PGM_PAGE_GET_HCPHYS(pPhysPage);
4190 Log2(("pgmPoolTracDerefGCPhys %RHp vs %RHp\n", HCPhysPage, HCPhys));
4191#endif
4192 if (PGM_PAGE_GET_HCPHYS(pPhysPage) == HCPhys)
4193 {
4194 Assert(pPage->cPresent);
4195 Assert(pPool->cPresent);
4196 pPage->cPresent--;
4197 pPool->cPresent--;
4198 pgmTrackDerefGCPhys(pPool, pPage, pPhysPage, iPte);
4199 return;
4200 }
4201
4202 AssertFatalMsgFailed(("HCPhys=%RHp GCPhys=%RGp; found page has HCPhys=%RHp\n",
4203 HCPhys, GCPhys, PGM_PAGE_GET_HCPHYS(pPhysPage)));
4204 }
4205 AssertFatalMsgFailed(("HCPhys=%RHp GCPhys=%RGp\n", HCPhys, GCPhys));
4206}
4207
4208
4209/**
4210 * Clear references to guest physical memory.
4211 *
4212 * @param pPool The pool.
4213 * @param pPage The page.
4214 * @param HCPhys The host physical address corresponding to the guest page.
4215 * @param GCPhysHint The guest physical address which may corresponding to HCPhys.
4216 * @param iPte Shadow pte index
4217 */
4218void pgmPoolTracDerefGCPhysHint(PPGMPOOL pPool, PPGMPOOLPAGE pPage, RTHCPHYS HCPhys, RTGCPHYS GCPhysHint, uint16_t iPte)
4219{
4220 Log4(("pgmPoolTracDerefGCPhysHint %RHp %RGp\n", HCPhys, GCPhysHint));
4221
4222 /*
4223 * Try the hint first.
4224 */
4225 RTHCPHYS HCPhysHinted;
4226 PVM pVM = pPool->CTX_SUFF(pVM);
4227 PPGMPAGE pPhysPage = pgmPhysGetPage(pVM, GCPhysHint);
4228 if (pPhysPage)
4229 {
4230 HCPhysHinted = PGM_PAGE_GET_HCPHYS(pPhysPage);
4231 Assert(HCPhysHinted);
4232 if (HCPhysHinted == HCPhys)
4233 {
4234 Assert(pPage->cPresent);
4235 Assert(pPool->cPresent);
4236 pPage->cPresent--;
4237 pPool->cPresent--;
4238 pgmTrackDerefGCPhys(pPool, pPage, pPhysPage, iPte);
4239 return;
4240 }
4241 }
4242 else
4243 HCPhysHinted = UINT64_C(0xdeadbeefdeadbeef);
4244
4245 /*
4246 * Damn, the hint didn't work. We'll have to do an expensive linear search.
4247 */
4248 STAM_COUNTER_INC(&pPool->StatTrackLinearRamSearches);
4249 PPGMRAMRANGE pRam = pPool->CTX_SUFF(pVM)->pgm.s.CTX_SUFF(pRamRangesX);
4250 while (pRam)
4251 {
4252 unsigned iPage = pRam->cb >> PAGE_SHIFT;
4253 while (iPage-- > 0)
4254 {
4255 if (PGM_PAGE_GET_HCPHYS(&pRam->aPages[iPage]) == HCPhys)
4256 {
4257 Log4(("pgmPoolTracDerefGCPhysHint: Linear HCPhys=%RHp GCPhysHint=%RGp GCPhysReal=%RGp\n",
4258 HCPhys, GCPhysHint, pRam->GCPhys + (iPage << PAGE_SHIFT)));
4259 Assert(pPage->cPresent);
4260 Assert(pPool->cPresent);
4261 pPage->cPresent--;
4262 pPool->cPresent--;
4263 pgmTrackDerefGCPhys(pPool, pPage, &pRam->aPages[iPage], iPte);
4264 return;
4265 }
4266 }
4267 pRam = pRam->CTX_SUFF(pNext);
4268 }
4269
4270 AssertFatalMsgFailed(("HCPhys=%RHp GCPhysHint=%RGp (Hinted page has HCPhys = %RHp)\n", HCPhys, GCPhysHint, HCPhysHinted));
4271}
4272
4273
4274/**
4275 * Clear references to guest physical memory in a 32-bit / 32-bit page table.
4276 *
4277 * @param pPool The pool.
4278 * @param pPage The page.
4279 * @param pShwPT The shadow page table (mapping of the page).
4280 * @param pGstPT The guest page table.
4281 */
4282DECLINLINE(void) pgmPoolTrackDerefPT32Bit32Bit(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PX86PT pShwPT, PCX86PT pGstPT)
4283{
4284 RTGCPHYS32 const fPgMask = pPage->fA20Enabled ? X86_PTE_PG_MASK : X86_PTE_PG_MASK & ~RT_BIT_32(20);
4285 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pShwPT->a); i++)
4286 {
4287 Assert(!(pShwPT->a[i].u & RT_BIT_32(10)));
4288 if (pShwPT->a[i].n.u1Present)
4289 {
4290 Log4(("pgmPoolTrackDerefPT32Bit32Bit: i=%d pte=%RX32 hint=%RX32\n",
4291 i, pShwPT->a[i].u & X86_PTE_PG_MASK, pGstPT->a[i].u & X86_PTE_PG_MASK));
4292 pgmPoolTracDerefGCPhysHint(pPool, pPage, pShwPT->a[i].u & X86_PTE_PG_MASK, pGstPT->a[i].u & fPgMask, i);
4293 if (!pPage->cPresent)
4294 break;
4295 }
4296 }
4297}
4298
4299
4300/**
4301 * Clear references to guest physical memory in a PAE / 32-bit page table.
4302 *
4303 * @param pPool The pool.
4304 * @param pPage The page.
4305 * @param pShwPT The shadow page table (mapping of the page).
4306 * @param pGstPT The guest page table (just a half one).
4307 */
4308DECLINLINE(void) pgmPoolTrackDerefPTPae32Bit(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PPGMSHWPTPAE pShwPT, PCX86PT pGstPT)
4309{
4310 RTGCPHYS32 const fPgMask = pPage->fA20Enabled ? X86_PTE_PG_MASK : X86_PTE_PG_MASK & ~RT_BIT_32(20);
4311 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pShwPT->a); i++)
4312 {
4313 Assert( (PGMSHWPTEPAE_GET_U(pShwPT->a[i]) & UINT64_C(0x7ff0000000000400)) == 0
4314 || (PGMSHWPTEPAE_GET_U(pShwPT->a[i]) & UINT64_C(0x7ff0000000000400)) == UINT64_C(0x7ff0000000000000));
4315 if (PGMSHWPTEPAE_IS_P(pShwPT->a[i]))
4316 {
4317 Log4(("pgmPoolTrackDerefPTPae32Bit: i=%d pte=%RX64 hint=%RX32\n",
4318 i, PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]), pGstPT->a[i].u & X86_PTE_PG_MASK));
4319 pgmPoolTracDerefGCPhysHint(pPool, pPage, PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]), pGstPT->a[i].u & fPgMask, i);
4320 if (!pPage->cPresent)
4321 break;
4322 }
4323 }
4324}
4325
4326
4327/**
4328 * Clear references to guest physical memory in a PAE / PAE page table.
4329 *
4330 * @param pPool The pool.
4331 * @param pPage The page.
4332 * @param pShwPT The shadow page table (mapping of the page).
4333 * @param pGstPT The guest page table.
4334 */
4335DECLINLINE(void) pgmPoolTrackDerefPTPaePae(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PPGMSHWPTPAE pShwPT, PCX86PTPAE pGstPT)
4336{
4337 RTGCPHYS const fPgMask = pPage->fA20Enabled ? X86_PTE_PAE_PG_MASK : X86_PTE_PAE_PG_MASK & ~RT_BIT_64(20);
4338 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pShwPT->a); i++)
4339 {
4340 Assert( (PGMSHWPTEPAE_GET_U(pShwPT->a[i]) & UINT64_C(0x7ff0000000000400)) == 0
4341 || (PGMSHWPTEPAE_GET_U(pShwPT->a[i]) & UINT64_C(0x7ff0000000000400)) == UINT64_C(0x7ff0000000000000));
4342 if (PGMSHWPTEPAE_IS_P(pShwPT->a[i]))
4343 {
4344 Log4(("pgmPoolTrackDerefPTPaePae: i=%d pte=%RX32 hint=%RX32\n",
4345 i, PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]), pGstPT->a[i].u & X86_PTE_PAE_PG_MASK));
4346 pgmPoolTracDerefGCPhysHint(pPool, pPage, PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]), pGstPT->a[i].u & fPgMask, i);
4347 if (!pPage->cPresent)
4348 break;
4349 }
4350 }
4351}
4352
4353
4354/**
4355 * Clear references to guest physical memory in a 32-bit / 4MB page table.
4356 *
4357 * @param pPool The pool.
4358 * @param pPage The page.
4359 * @param pShwPT The shadow page table (mapping of the page).
4360 */
4361DECLINLINE(void) pgmPoolTrackDerefPT32Bit4MB(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PX86PT pShwPT)
4362{
4363 RTGCPHYS const GCPhysA20Mask = pPage->fA20Enabled ? UINT64_MAX : ~RT_BIT_64(20);
4364 RTGCPHYS GCPhys = pPage->GCPhys + PAGE_SIZE * pPage->iFirstPresent;
4365 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pShwPT->a); i++, GCPhys += PAGE_SIZE)
4366 {
4367 Assert(!(pShwPT->a[i].u & RT_BIT_32(10)));
4368 if (pShwPT->a[i].n.u1Present)
4369 {
4370 Log4(("pgmPoolTrackDerefPT32Bit4MB: i=%d pte=%RX32 GCPhys=%RGp\n",
4371 i, pShwPT->a[i].u & X86_PTE_PG_MASK, GCPhys));
4372 pgmPoolTracDerefGCPhys(pPool, pPage, pShwPT->a[i].u & X86_PTE_PG_MASK, GCPhys & GCPhysA20Mask, i);
4373 if (!pPage->cPresent)
4374 break;
4375 }
4376 }
4377}
4378
4379
4380/**
4381 * Clear references to guest physical memory in a PAE / 2/4MB page table.
4382 *
4383 * @param pPool The pool.
4384 * @param pPage The page.
4385 * @param pShwPT The shadow page table (mapping of the page).
4386 */
4387DECLINLINE(void) pgmPoolTrackDerefPTPaeBig(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PPGMSHWPTPAE pShwPT)
4388{
4389 RTGCPHYS const GCPhysA20Mask = pPage->fA20Enabled ? UINT64_MAX : ~RT_BIT_64(20);
4390 RTGCPHYS GCPhys = pPage->GCPhys + PAGE_SIZE * pPage->iFirstPresent;
4391 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pShwPT->a); i++, GCPhys += PAGE_SIZE)
4392 {
4393 Assert( (PGMSHWPTEPAE_GET_U(pShwPT->a[i]) & UINT64_C(0x7ff0000000000400)) == 0
4394 || (PGMSHWPTEPAE_GET_U(pShwPT->a[i]) & UINT64_C(0x7ff0000000000400)) == UINT64_C(0x7ff0000000000000));
4395 if (PGMSHWPTEPAE_IS_P(pShwPT->a[i]))
4396 {
4397 Log4(("pgmPoolTrackDerefPTPaeBig: i=%d pte=%RX64 hint=%RGp\n",
4398 i, PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]), GCPhys));
4399 pgmPoolTracDerefGCPhys(pPool, pPage, PGMSHWPTEPAE_GET_HCPHYS(pShwPT->a[i]), GCPhys & GCPhysA20Mask, i);
4400 if (!pPage->cPresent)
4401 break;
4402 }
4403 }
4404}
4405
4406
4407/**
4408 * Clear references to shadowed pages in an EPT page table.
4409 *
4410 * @param pPool The pool.
4411 * @param pPage The page.
4412 * @param pShwPML4 The shadow page directory pointer table (mapping of the page).
4413 */
4414DECLINLINE(void) pgmPoolTrackDerefPTEPT(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PEPTPT pShwPT)
4415{
4416 RTGCPHYS const GCPhysA20Mask = pPage->fA20Enabled ? UINT64_MAX : ~RT_BIT_64(20);
4417 RTGCPHYS GCPhys = pPage->GCPhys + PAGE_SIZE * pPage->iFirstPresent;
4418 for (unsigned i = pPage->iFirstPresent; i < RT_ELEMENTS(pShwPT->a); i++, GCPhys += PAGE_SIZE)
4419 {
4420 Assert((pShwPT->a[i].u & UINT64_C(0xfff0000000000f80)) == 0);
4421 if (pShwPT->a[i].n.u1Present)
4422 {
4423 Log4(("pgmPoolTrackDerefPTEPT: i=%d pte=%RX64 GCPhys=%RX64\n",
4424 i, pShwPT->a[i].u & EPT_PTE_PG_MASK, pPage->GCPhys));
4425 pgmPoolTracDerefGCPhys(pPool, pPage, pShwPT->a[i].u & EPT_PTE_PG_MASK, GCPhys & GCPhysA20Mask, i);
4426 if (!pPage->cPresent)
4427 break;
4428 }
4429 }
4430}
4431
4432
4433/**
4434 * Clear references to shadowed pages in a 32 bits page directory.
4435 *
4436 * @param pPool The pool.
4437 * @param pPage The page.
4438 * @param pShwPD The shadow page directory (mapping of the page).
4439 */
4440DECLINLINE(void) pgmPoolTrackDerefPD(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PX86PD pShwPD)
4441{
4442 for (unsigned i = 0; i < RT_ELEMENTS(pShwPD->a); i++)
4443 {
4444 Assert(!(pShwPD->a[i].u & RT_BIT_32(9)));
4445 if ( pShwPD->a[i].n.u1Present
4446 && !(pShwPD->a[i].u & PGM_PDFLAGS_MAPPING)
4447 )
4448 {
4449 PPGMPOOLPAGE pSubPage = (PPGMPOOLPAGE)RTAvloHCPhysGet(&pPool->HCPhysTree, pShwPD->a[i].u & X86_PDE_PG_MASK);
4450 if (pSubPage)
4451 pgmPoolTrackFreeUser(pPool, pSubPage, pPage->idx, i);
4452 else
4453 AssertFatalMsgFailed(("%x\n", pShwPD->a[i].u & X86_PDE_PG_MASK));
4454 }
4455 }
4456}
4457
4458
4459/**
4460 * Clear references to shadowed pages in a PAE (legacy or 64 bits) page directory.
4461 *
4462 * @param pPool The pool.
4463 * @param pPage The page.
4464 * @param pShwPD The shadow page directory (mapping of the page).
4465 */
4466DECLINLINE(void) pgmPoolTrackDerefPDPae(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PX86PDPAE pShwPD)
4467{
4468 for (unsigned i = 0; i < RT_ELEMENTS(pShwPD->a); i++)
4469 {
4470 if ( pShwPD->a[i].n.u1Present
4471 && !(pShwPD->a[i].u & PGM_PDFLAGS_MAPPING))
4472 {
4473#ifdef PGM_WITH_LARGE_PAGES
4474 if (pShwPD->a[i].b.u1Size)
4475 {
4476 Log4(("pgmPoolTrackDerefPDPae: i=%d pde=%RX64 GCPhys=%RX64\n",
4477 i, pShwPD->a[i].u & X86_PDE2M_PAE_PG_MASK, pPage->GCPhys));
4478 pgmPoolTracDerefGCPhys(pPool, pPage, pShwPD->a[i].u & X86_PDE2M_PAE_PG_MASK,
4479 pPage->GCPhys + i * 2 * _1M /* pPage->GCPhys = base address of the memory described by the PD */,
4480 i);
4481 }
4482 else
4483#endif
4484 {
4485 Assert((pShwPD->a[i].u & (X86_PDE_PAE_MBZ_MASK_NX | UINT64_C(0x7ff0000000000200))) == 0);
4486 PPGMPOOLPAGE pSubPage = (PPGMPOOLPAGE)RTAvloHCPhysGet(&pPool->HCPhysTree, pShwPD->a[i].u & X86_PDE_PAE_PG_MASK);
4487 if (pSubPage)
4488 pgmPoolTrackFreeUser(pPool, pSubPage, pPage->idx, i);
4489 else
4490 AssertFatalMsgFailed(("%RX64\n", pShwPD->a[i].u & X86_PDE_PAE_PG_MASK));
4491 /** @todo 64-bit guests: have to ensure that we're not exhausting the dynamic mappings! */
4492 }
4493 }
4494 }
4495}
4496
4497
4498/**
4499 * Clear references to shadowed pages in a PAE page directory pointer table.
4500 *
4501 * @param pPool The pool.
4502 * @param pPage The page.
4503 * @param pShwPDPT The shadow page directory pointer table (mapping of the page).
4504 */
4505DECLINLINE(void) pgmPoolTrackDerefPDPTPae(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PX86PDPT pShwPDPT)
4506{
4507 for (unsigned i = 0; i < X86_PG_PAE_PDPE_ENTRIES; i++)
4508 {
4509 Assert((pShwPDPT->a[i].u & (X86_PDPE_PAE_MBZ_MASK | UINT64_C(0x7ff0000000000200))) == 0);
4510 if ( pShwPDPT->a[i].n.u1Present
4511 && !(pShwPDPT->a[i].u & PGM_PLXFLAGS_MAPPING)
4512 )
4513 {
4514 PPGMPOOLPAGE pSubPage = (PPGMPOOLPAGE)RTAvloHCPhysGet(&pPool->HCPhysTree, pShwPDPT->a[i].u & X86_PDPE_PG_MASK);
4515 if (pSubPage)
4516 pgmPoolTrackFreeUser(pPool, pSubPage, pPage->idx, i);
4517 else
4518 AssertFatalMsgFailed(("%RX64\n", pShwPDPT->a[i].u & X86_PDPE_PG_MASK));
4519 }
4520 }
4521}
4522
4523
4524/**
4525 * Clear references to shadowed pages in a 64-bit page directory pointer table.
4526 *
4527 * @param pPool The pool.
4528 * @param pPage The page.
4529 * @param pShwPDPT The shadow page directory pointer table (mapping of the page).
4530 */
4531DECLINLINE(void) pgmPoolTrackDerefPDPT64Bit(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PX86PDPT pShwPDPT)
4532{
4533 for (unsigned i = 0; i < RT_ELEMENTS(pShwPDPT->a); i++)
4534 {
4535 Assert((pShwPDPT->a[i].u & (X86_PDPE_LM_MBZ_MASK_NX | UINT64_C(0x7ff0000000000200))) == 0);
4536 if (pShwPDPT->a[i].n.u1Present)
4537 {
4538 PPGMPOOLPAGE pSubPage = (PPGMPOOLPAGE)RTAvloHCPhysGet(&pPool->HCPhysTree, pShwPDPT->a[i].u & X86_PDPE_PG_MASK);
4539 if (pSubPage)
4540 pgmPoolTrackFreeUser(pPool, pSubPage, pPage->idx, i);
4541 else
4542 AssertFatalMsgFailed(("%RX64\n", pShwPDPT->a[i].u & X86_PDPE_PG_MASK));
4543 /** @todo 64-bit guests: have to ensure that we're not exhausting the dynamic mappings! */
4544 }
4545 }
4546}
4547
4548
4549/**
4550 * Clear references to shadowed pages in a 64-bit level 4 page table.
4551 *
4552 * @param pPool The pool.
4553 * @param pPage The page.
4554 * @param pShwPML4 The shadow page directory pointer table (mapping of the page).
4555 */
4556DECLINLINE(void) pgmPoolTrackDerefPML464Bit(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PX86PML4 pShwPML4)
4557{
4558 for (unsigned i = 0; i < RT_ELEMENTS(pShwPML4->a); i++)
4559 {
4560 Assert((pShwPML4->a[i].u & (X86_PML4E_MBZ_MASK_NX | UINT64_C(0x7ff0000000000200))) == 0);
4561 if (pShwPML4->a[i].n.u1Present)
4562 {
4563 PPGMPOOLPAGE pSubPage = (PPGMPOOLPAGE)RTAvloHCPhysGet(&pPool->HCPhysTree, pShwPML4->a[i].u & X86_PDPE_PG_MASK);
4564 if (pSubPage)
4565 pgmPoolTrackFreeUser(pPool, pSubPage, pPage->idx, i);
4566 else
4567 AssertFatalMsgFailed(("%RX64\n", pShwPML4->a[i].u & X86_PML4E_PG_MASK));
4568 /** @todo 64-bit guests: have to ensure that we're not exhausting the dynamic mappings! */
4569 }
4570 }
4571}
4572
4573
4574/**
4575 * Clear references to shadowed pages in an EPT page directory.
4576 *
4577 * @param pPool The pool.
4578 * @param pPage The page.
4579 * @param pShwPD The shadow page directory (mapping of the page).
4580 */
4581DECLINLINE(void) pgmPoolTrackDerefPDEPT(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PEPTPD pShwPD)
4582{
4583 for (unsigned i = 0; i < RT_ELEMENTS(pShwPD->a); i++)
4584 {
4585 Assert((pShwPD->a[i].u & UINT64_C(0xfff0000000000f80)) == 0);
4586 if (pShwPD->a[i].n.u1Present)
4587 {
4588#ifdef PGM_WITH_LARGE_PAGES
4589 if (pShwPD->a[i].b.u1Size)
4590 {
4591 Log4(("pgmPoolTrackDerefPDEPT: i=%d pde=%RX64 GCPhys=%RX64\n",
4592 i, pShwPD->a[i].u & X86_PDE2M_PAE_PG_MASK, pPage->GCPhys));
4593 pgmPoolTracDerefGCPhys(pPool, pPage, pShwPD->a[i].u & X86_PDE2M_PAE_PG_MASK,
4594 pPage->GCPhys + i * 2 * _1M /* pPage->GCPhys = base address of the memory described by the PD */,
4595 i);
4596 }
4597 else
4598#endif
4599 {
4600 PPGMPOOLPAGE pSubPage = (PPGMPOOLPAGE)RTAvloHCPhysGet(&pPool->HCPhysTree, pShwPD->a[i].u & EPT_PDE_PG_MASK);
4601 if (pSubPage)
4602 pgmPoolTrackFreeUser(pPool, pSubPage, pPage->idx, i);
4603 else
4604 AssertFatalMsgFailed(("%RX64\n", pShwPD->a[i].u & EPT_PDE_PG_MASK));
4605 }
4606 /** @todo 64-bit guests: have to ensure that we're not exhausting the dynamic mappings! */
4607 }
4608 }
4609}
4610
4611
4612/**
4613 * Clear references to shadowed pages in an EPT page directory pointer table.
4614 *
4615 * @param pPool The pool.
4616 * @param pPage The page.
4617 * @param pShwPDPT The shadow page directory pointer table (mapping of the page).
4618 */
4619DECLINLINE(void) pgmPoolTrackDerefPDPTEPT(PPGMPOOL pPool, PPGMPOOLPAGE pPage, PEPTPDPT pShwPDPT)
4620{
4621 for (unsigned i = 0; i < RT_ELEMENTS(pShwPDPT->a); i++)
4622 {
4623 Assert((pShwPDPT->a[i].u & UINT64_C(0xfff0000000000f80)) == 0);
4624 if (pShwPDPT->a[i].n.u1Present)
4625 {
4626 PPGMPOOLPAGE pSubPage = (PPGMPOOLPAGE)RTAvloHCPhysGet(&pPool->HCPhysTree, pShwPDPT->a[i].u & EPT_PDPTE_PG_MASK);
4627 if (pSubPage)
4628 pgmPoolTrackFreeUser(pPool, pSubPage, pPage->idx, i);
4629 else
4630 AssertFatalMsgFailed(("%RX64\n", pShwPDPT->a[i].u & EPT_PDPTE_PG_MASK));
4631 /** @todo 64-bit guests: have to ensure that we're not exhausting the dynamic mappings! */
4632 }
4633 }
4634}
4635
4636
/**
 * Clears all references made by this page.
 *
 * This includes other shadow pages and GC physical addresses.  The shadow
 * page is mapped, handed to the dereferencing worker that matches its kind,
 * and afterwards zeroed and flagged as zeroed.
 *
 * @param   pPool       The pool.
 * @param   pPage       The page.
 */
static void pgmPoolTrackDeref(PPGMPOOL pPool, PPGMPOOLPAGE pPage)
{
    /*
     * Map the shadow page and take action according to the page kind.
     */
    PVM pVM = pPool->CTX_SUFF(pVM);
    void *pvShw = PGMPOOL_PAGE_2_PTR(pVM, pPage);
    switch (pPage->enmKind)
    {
        /* Page tables shadowing a guest page table: the guest table is mapped too and passed to the worker. */
        case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
        {
            STAM_PROFILE_START(&pPool->StatTrackDerefGCPhys, g);
            void *pvGst;
            int rc = PGM_GCPHYS_2_PTR(pVM, pPage->GCPhys, &pvGst); AssertReleaseRC(rc);
            pgmPoolTrackDerefPT32Bit32Bit(pPool, pPage, (PX86PT)pvShw, (PCX86PT)pvGst);
            PGM_DYNMAP_UNUSED_HINT_VM(pVM, pvGst);
            STAM_PROFILE_STOP(&pPool->StatTrackDerefGCPhys, g);
            break;
        }

        case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
        {
            STAM_PROFILE_START(&pPool->StatTrackDerefGCPhys, g);
            void *pvGst;
            /* NOTE(review): this case maps with PGM_GCPHYS_2_PTR_EX while its siblings use
               PGM_GCPHYS_2_PTR - presumably intentional, confirm against the macro definitions. */
            int rc = PGM_GCPHYS_2_PTR_EX(pVM, pPage->GCPhys, &pvGst); AssertReleaseRC(rc);
            pgmPoolTrackDerefPTPae32Bit(pPool, pPage, (PPGMSHWPTPAE)pvShw, (PCX86PT)pvGst);
            PGM_DYNMAP_UNUSED_HINT_VM(pVM, pvGst);
            STAM_PROFILE_STOP(&pPool->StatTrackDerefGCPhys, g);
            break;
        }

        case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
        {
            STAM_PROFILE_START(&pPool->StatTrackDerefGCPhys, g);
            void *pvGst;
            int rc = PGM_GCPHYS_2_PTR(pVM, pPage->GCPhys, &pvGst); AssertReleaseRC(rc);
            pgmPoolTrackDerefPTPaePae(pPool, pPage, (PPGMSHWPTPAE)pvShw, (PCX86PTPAE)pvGst);
            PGM_DYNMAP_UNUSED_HINT_VM(pVM, pvGst);
            STAM_PROFILE_STOP(&pPool->StatTrackDerefGCPhys, g);
            break;
        }

        /* Page tables without a guest page table behind them: only the shadow table is needed. */
        case PGMPOOLKIND_32BIT_PT_FOR_PHYS:   /* treat it like a 4 MB page */
        case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
        {
            STAM_PROFILE_START(&pPool->StatTrackDerefGCPhys, g);
            pgmPoolTrackDerefPT32Bit4MB(pPool, pPage, (PX86PT)pvShw);
            STAM_PROFILE_STOP(&pPool->StatTrackDerefGCPhys, g);
            break;
        }

        case PGMPOOLKIND_PAE_PT_FOR_PHYS:     /* treat it like a 2 MB page */
        case PGMPOOLKIND_PAE_PT_FOR_PAE_2MB:
        case PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB:
        {
            STAM_PROFILE_START(&pPool->StatTrackDerefGCPhys, g);
            pgmPoolTrackDerefPTPaeBig(pPool, pPage, (PPGMSHWPTPAE)pvShw);
            STAM_PROFILE_STOP(&pPool->StatTrackDerefGCPhys, g);
            break;
        }

        /* All PAE-format page directories share one worker. */
        case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
        case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
        case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
        case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
        case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
        case PGMPOOLKIND_PAE_PD_PHYS:
        case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
        case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
            pgmPoolTrackDerefPDPae(pPool, pPage, (PX86PDPAE)pvShw);
            break;

        case PGMPOOLKIND_32BIT_PD_PHYS:
        case PGMPOOLKIND_32BIT_PD:
            pgmPoolTrackDerefPD(pPool, pPage, (PX86PD)pvShw);
            break;

        case PGMPOOLKIND_PAE_PDPT_FOR_32BIT:
        case PGMPOOLKIND_PAE_PDPT:
        case PGMPOOLKIND_PAE_PDPT_PHYS:
            pgmPoolTrackDerefPDPTPae(pPool, pPage, (PX86PDPT)pvShw);
            break;

        case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
        case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
            pgmPoolTrackDerefPDPT64Bit(pPool, pPage, (PX86PDPT)pvShw);
            break;

        case PGMPOOLKIND_64BIT_PML4:
            pgmPoolTrackDerefPML464Bit(pPool, pPage, (PX86PML4)pvShw);
            break;

        /* EPT tables. */
        case PGMPOOLKIND_EPT_PT_FOR_PHYS:
            pgmPoolTrackDerefPTEPT(pPool, pPage, (PEPTPT)pvShw);
            break;

        case PGMPOOLKIND_EPT_PD_FOR_PHYS:
            pgmPoolTrackDerefPDEPT(pPool, pPage, (PEPTPD)pvShw);
            break;

        case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
            pgmPoolTrackDerefPDPTEPT(pPool, pPage, (PEPTPDPT)pvShw);
            break;

        /* Any other kind reaching this function is treated as a fatal error. */
        default:
            AssertFatalMsgFailed(("enmKind=%d\n", pPage->enmKind));
    }

    /* paranoia, clear the shadow page. Remove this later (i.e. let Alloc and ClearAll do it). */
    STAM_PROFILE_START(&pPool->StatZeroPage, z);
    ASMMemZeroPage(pvShw);
    STAM_PROFILE_STOP(&pPool->StatZeroPage, z);
    pPage->fZeroed = true;
    Assert(!pPage->cPresent);
    PGM_DYNMAP_UNUSED_HINT_VM(pVM, pvShw);
}
4761
4762
/**
 * Flushes a pool page.
 *
 * This moves the page to the free list after removing all user references to it.
 * Special root pages (index below PGMPOOL_IDX_FIRST) and locked pages are
 * rejected and VINF_SUCCESS is returned without touching them.
 *
 * @returns VBox status code.  VINF_SUCCESS unless pgmPoolMonitorFlush is
 *          called for a monitored page, in which case its status is returned.
 * @retval  VINF_SUCCESS on success.
 * @param   pPool       The pool.
 * @param   pPage       The shadow page to flush.
 * @param   fFlush      Flush the TLBS when required (should only be false in very specific use cases!!)
 */
int pgmPoolFlushPage(PPGMPOOL pPool, PPGMPOOLPAGE pPage, bool fFlush)
{
    PVM pVM = pPool->CTX_SUFF(pVM);
    bool fFlushRequired = false;

    int rc = VINF_SUCCESS;
    STAM_PROFILE_START(&pPool->StatFlushPage, f);
    LogFlow(("pgmPoolFlushPage: pPage=%p:{.Key=%RHp, .idx=%d, .enmKind=%s, .GCPhys=%RGp}\n",
             pPage, pPage->Core.Key, pPage->idx, pgmPoolPoolKindToStr(pPage->enmKind), pPage->GCPhys));

    /*
     * Quietly reject any attempts at flushing any of the special root pages.
     */
    /* NOTE(review): both early returns below leave without a STAM_PROFILE_STOP for StatFlushPage. */
    if (pPage->idx < PGMPOOL_IDX_FIRST)
    {
        AssertFailed(); /* can no longer happen */
        Log(("pgmPoolFlushPage: special root page, rejected. enmKind=%s idx=%d\n", pgmPoolPoolKindToStr(pPage->enmKind), pPage->idx));
        return VINF_SUCCESS;
    }

    pgmLock(pVM);

    /*
     * Quietly reject any attempts at flushing the currently active shadow CR3 mapping
     */
    if (pgmPoolIsPageLocked(pPage))
    {
        AssertMsg(   pPage->enmKind == PGMPOOLKIND_64BIT_PML4
                  || pPage->enmKind == PGMPOOLKIND_PAE_PDPT
                  || pPage->enmKind == PGMPOOLKIND_PAE_PDPT_FOR_32BIT
                  || pPage->enmKind == PGMPOOLKIND_32BIT_PD
                  || pPage->enmKind == PGMPOOLKIND_PAE_PD_FOR_PAE_PD
                  || pPage->enmKind == PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD
                  || pPage->enmKind == PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD
                  || pPage->enmKind == PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD
                  || pPage->enmKind == PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD
                  || pPage->enmKind == PGMPOOLKIND_ROOT_NESTED,
                  ("Can't free the shadow CR3! (%RHp vs %RHp kind=%d\n", PGMGetHyperCR3(VMMGetCpu(pVM)), pPage->Core.Key, pPage->enmKind));
        Log(("pgmPoolFlushPage: current active shadow CR3, rejected. enmKind=%s idx=%d\n", pgmPoolPoolKindToStr(pPage->enmKind), pPage->idx));
        pgmUnlock(pVM);
        return VINF_SUCCESS;
    }

#if defined(VBOX_WITH_2X_4GB_ADDR_SPACE_IN_R0) || defined(IN_RC)
    /* Start a subset so we won't run out of mapping space. */
    PVMCPU pVCpu = VMMGetCpu(pVM);
    uint32_t iPrevSubset = PGMRZDynMapPushAutoSubset(pVCpu);
#endif

    /*
     * Mark the page as being in need of an ASMMemZeroPage().
     */
    pPage->fZeroed = false;

#ifdef PGMPOOL_WITH_OPTIMIZED_DIRTY_PT
    if (pPage->fDirty)
        pgmPoolFlushDirtyPage(pVM, pPool, pPage->idxDirtyEntry, false /* do not remove */);
#endif

    /* If there are any users of this table, then we *must* issue a tlb flush on all VCPUs. */
    /* This must be sampled before pgmPoolTrackClearPageUsers() runs below. */
    if (pPage->iUserHead != NIL_PGMPOOL_USER_INDEX)
        fFlushRequired = true;

    /*
     * Clear the page.
     */
    pgmPoolTrackClearPageUsers(pPool, pPage);
    STAM_PROFILE_START(&pPool->StatTrackDeref,a);
    pgmPoolTrackDeref(pPool, pPage);
    STAM_PROFILE_STOP(&pPool->StatTrackDeref,a);

    /*
     * Flush it from the cache.
     */
    pgmPoolCacheFlushPage(pPool, pPage);

#if defined(VBOX_WITH_2X_4GB_ADDR_SPACE_IN_R0) || defined(IN_RC)
    /* Heavy stuff done. */
    PGMRZDynMapPopAutoSubset(pVCpu, iPrevSubset);
#endif

    /*
     * Deregistering the monitoring.
     */
    if (pPage->fMonitored)
        rc = pgmPoolMonitorFlush(pPool, pPage);

    /*
     * Free the page.
     */
    /* The page is pushed onto the head of the pool's free list and reset to the FREE kind. */
    Assert(pPage->iNext == NIL_PGMPOOL_IDX);
    pPage->iNext = pPool->iFreeHead;
    pPool->iFreeHead = pPage->idx;
    pPage->enmKind = PGMPOOLKIND_FREE;
    pPage->enmAccess = PGMPOOLACCESS_DONTCARE;
    pPage->GCPhys = NIL_RTGCPHYS;
    pPage->fReusedFlushPending = false;

    pPool->cUsedPages--;

    /* Flush the TLBs of all VCPUs if required. */
    if (   fFlushRequired
        && fFlush)
    {
        PGM_INVL_ALL_VCPU_TLBS(pVM);
    }

    pgmUnlock(pVM);
    STAM_PROFILE_STOP(&pPool->StatFlushPage, f);
    return rc;
}
4885
4886
/**
 * Frees a usage of a pool page.
 *
 * The caller is responsible to updating the user table so that it no longer
 * references the shadow page.  The user record is removed under the PGM lock
 * and, if the page is not cached, the page itself is flushed.
 *
 * @param   pPool       The pool.
 * @param   pPage       The shadow page whose usage is being freed.
 * @param   iUser       The shadow page pool index of the user table.
 * @param   iUserTable  The index into the user table (shadowed).
 */
void pgmPoolFreeByPage(PPGMPOOL pPool, PPGMPOOLPAGE pPage, uint16_t iUser, uint32_t iUserTable)
{
    PVM pVM = pPool->CTX_SUFF(pVM);

    STAM_PROFILE_START(&pPool->StatFree, a);
    LogFlow(("pgmPoolFreeByPage: pPage=%p:{.Key=%RHp, .idx=%d, enmKind=%s} iUser=%d iUserTable=%#x\n",
             pPage, pPage->Core.Key, pPage->idx, pgmPoolPoolKindToStr(pPage->enmKind), iUser, iUserTable));
    Assert(pPage->idx >= PGMPOOL_IDX_FIRST);
    pgmLock(pVM);
    pgmPoolTrackFreeUser(pPool, pPage, iUser, iUserTable);
    /* Cached pages are kept; the third pgmPoolFlushPage argument is omitted here,
       presumably relying on a default in its declaration - TODO confirm. */
    if (!pPage->fCached)
        pgmPoolFlushPage(pPool, pPage);
    pgmUnlock(pVM);
    STAM_PROFILE_STOP(&pPool->StatFree, a);
}
4913
4914
4915/**
4916 * Makes one or more free page free.
4917 *
4918 * @returns VBox status code.
4919 * @retval VINF_SUCCESS on success.
4920 * @retval VERR_PGM_POOL_FLUSHED if the pool was flushed.
4921 *
4922 * @param pPool The pool.
4923 * @param enmKind Page table kind
4924 * @param iUser The user of the page.
4925 */
4926static int pgmPoolMakeMoreFreePages(PPGMPOOL pPool, PGMPOOLKIND enmKind, uint16_t iUser)
4927{
4928 PVM pVM = pPool->CTX_SUFF(pVM);
4929 LogFlow(("pgmPoolMakeMoreFreePages: iUser=%d\n", iUser));
4930 NOREF(enmKind);
4931
4932 /*
4933 * If the pool isn't full grown yet, expand it.
4934 */
4935 if ( pPool->cCurPages < pPool->cMaxPages
4936#if defined(IN_RC)
4937 /* Hack alert: we can't deal with jumps to ring 3 when called from MapCR3 and allocating pages for PAE PDs. */
4938 && enmKind != PGMPOOLKIND_PAE_PD_FOR_PAE_PD
4939 && (enmKind < PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD || enmKind > PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD)
4940#endif
4941 )
4942 {
4943 STAM_PROFILE_ADV_SUSPEND(&pPool->StatAlloc, a);
4944#ifdef IN_RING3
4945 int rc = PGMR3PoolGrow(pVM);
4946#else
4947 int rc = VMMRZCallRing3NoCpu(pVM, VMMCALLRING3_PGM_POOL_GROW, 0);
4948#endif
4949 if (RT_FAILURE(rc))
4950 return rc;
4951 STAM_PROFILE_ADV_RESUME(&pPool->StatAlloc, a);
4952 if (pPool->iFreeHead != NIL_PGMPOOL_IDX)
4953 return VINF_SUCCESS;
4954 }
4955
4956 /*
4957 * Free one cached page.
4958 */
4959 return pgmPoolCacheFreeOne(pPool, iUser);
4960}
4961
4962
/**
 * Allocates a page from the pool.
 *
 * This page may actually be a cached page and not in need of any processing
 * on the callers part.
 *
 * The whole operation runs under the PGM lock: the cache is consulted first
 * (when enabled), then a page is taken from the free list, which is
 * replenished via pgmPoolMakeMoreFreePages when empty.
 *
 * @returns VBox status code.
 * @retval  VINF_SUCCESS if a NEW page was allocated.
 * @retval  VINF_PGM_CACHED_PAGE if a CACHED page was returned.
 * @retval  VERR_PGM_POOL_FLUSHED if the pool was flushed.
 *
 * @param   pVM         Pointer to the VM.
 * @param   GCPhys      The GC physical address of the page we're gonna shadow.
 *                      For 4MB and 2MB PD entries, it's the first address the
 *                      shadow PT is covering.
 * @param   enmKind     The kind of mapping.
 * @param   enmAccess   Access type for the mapping (only relevant for big pages)
 * @param   fA20Enabled Whether the A20 gate is enabled or not.
 * @param   iUser       The shadow page pool index of the user table.
 * @param   iUserTable  The index into the user table (shadowed).
 * @param   fLockPage   Lock the page
 * @param   ppPage      Where to store the pointer to the page. NULL is stored here on failure.
 */
int pgmPoolAlloc(PVM pVM, RTGCPHYS GCPhys, PGMPOOLKIND enmKind, PGMPOOLACCESS enmAccess, bool fA20Enabled,
                 uint16_t iUser, uint32_t iUserTable, bool fLockPage, PPPGMPOOLPAGE ppPage)
{
    PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
    STAM_PROFILE_ADV_START(&pPool->StatAlloc, a);
    LogFlow(("pgmPoolAlloc: GCPhys=%RGp enmKind=%s iUser=%d iUserTable=%#x\n", GCPhys, pgmPoolPoolKindToStr(enmKind), iUser, iUserTable));
    *ppPage = NULL;
    /** @todo CSAM/PGMPrefetchPage messes up here during CSAMR3CheckGates
     *  (TRPMR3SyncIDT) because of FF priority. Try fix that?
     *  Assert(!(pVM->pgm.s.fGlobalSyncFlags & PGM_SYNC_CLEAR_PGM_POOL)); */

    pgmLock(pVM);

    /* Try the cache first; any success status from it is returned as-is. */
    if (pPool->fCacheEnabled)
    {
        int rc2 = pgmPoolCacheAlloc(pPool, GCPhys, enmKind, enmAccess, fA20Enabled, iUser, iUserTable, ppPage);
        if (RT_SUCCESS(rc2))
        {
            if (fLockPage)
                pgmPoolLockPage(pPool, *ppPage);
            pgmUnlock(pVM);
            STAM_PROFILE_ADV_STOP(&pPool->StatAlloc, a);
            LogFlow(("pgmPoolAlloc: cached returns %Rrc *ppPage=%p:{.Key=%RHp, .idx=%d}\n", rc2, *ppPage, (*ppPage)->Core.Key, (*ppPage)->idx));
            return rc2;
        }
    }

    /*
     * Allocate a new one.
     */
    int rc = VINF_SUCCESS;
    uint16_t iNew = pPool->iFreeHead;
    if (iNew == NIL_PGMPOOL_IDX)
    {
        rc = pgmPoolMakeMoreFreePages(pPool, enmKind, iUser);
        if (RT_FAILURE(rc))
        {
            pgmUnlock(pVM);
            Log(("pgmPoolAlloc: returns %Rrc (Free)\n", rc));
            STAM_PROFILE_ADV_STOP(&pPool->StatAlloc, a);
            return rc;
        }
        iNew = pPool->iFreeHead;
        /* NOTE(review): if this assertion ever returns, it does so without pgmUnlock()
           and without stopping the StatAlloc profile - verify whether that matters. */
        AssertReleaseReturn(iNew != NIL_PGMPOOL_IDX, VERR_PGM_POOL_IPE);
    }

    /* unlink the free head */
    PPGMPOOLPAGE pPage = &pPool->aPages[iNew];
    pPool->iFreeHead = pPage->iNext;
    pPage->iNext = NIL_PGMPOOL_IDX;

    /*
     * Initialize it.
     */
    pPool->cUsedPages++;                /* physical handler registration / pgmPoolTrackFlushGCPhysPTsSlow requirement. */
    pPage->enmKind = enmKind;
    pPage->enmAccess = enmAccess;
    pPage->GCPhys = GCPhys;
    pPage->fA20Enabled = fA20Enabled;
    pPage->fSeenNonGlobal = false;      /* Set this to 'true' to disable this feature. */
    pPage->fMonitored = false;
    pPage->fCached = false;
    pPage->fDirty = false;
    pPage->fReusedFlushPending = false;
    pPage->cModifications = 0;
    pPage->iModifiedNext = NIL_PGMPOOL_IDX;
    pPage->iModifiedPrev = NIL_PGMPOOL_IDX;
    pPage->cPresent = 0;
    pPage->iFirstPresent = NIL_PGMPOOL_PRESENT_INDEX;
    pPage->idxDirtyEntry = 0;
    pPage->GCPtrLastAccessHandlerFault = NIL_RTGCPTR;
    pPage->GCPtrLastAccessHandlerRip = NIL_RTGCPTR;
    pPage->cLastAccessHandler = 0;
    pPage->cLocked = 0;
# ifdef VBOX_STRICT
    pPage->GCPtrDirtyFault = NIL_RTGCPTR;
# endif

    /*
     * Insert into the tracking and cache. If this fails, free the page.
     */
    int rc3 = pgmPoolTrackInsert(pPool, pPage, GCPhys, iUser, iUserTable);
    if (RT_FAILURE(rc3))
    {
        /* Undo the allocation: reset the page to FREE and push it back onto the free list. */
        pPool->cUsedPages--;
        pPage->enmKind = PGMPOOLKIND_FREE;
        pPage->enmAccess = PGMPOOLACCESS_DONTCARE;
        pPage->GCPhys = NIL_RTGCPHYS;
        pPage->iNext = pPool->iFreeHead;
        pPool->iFreeHead = pPage->idx;
        pgmUnlock(pVM);
        STAM_PROFILE_ADV_STOP(&pPool->StatAlloc, a);
        Log(("pgmPoolAlloc: returns %Rrc (Insert)\n", rc3));
        return rc3;
    }

    /*
     * Commit the allocation, clear the page and return.
     */
#ifdef VBOX_WITH_STATISTICS
    if (pPool->cUsedPages > pPool->cUsedPagesHigh)
        pPool->cUsedPagesHigh = pPool->cUsedPages;
#endif

    /* Only pages not already flagged as zeroed need clearing here. */
    if (!pPage->fZeroed)
    {
        STAM_PROFILE_START(&pPool->StatZeroPage, z);
        void *pv = PGMPOOL_PAGE_2_PTR(pVM, pPage);
        ASMMemZeroPage(pv);
        STAM_PROFILE_STOP(&pPool->StatZeroPage, z);
    }

    *ppPage = pPage;
    if (fLockPage)
        pgmPoolLockPage(pPool, pPage);
    pgmUnlock(pVM);
    LogFlow(("pgmPoolAlloc: returns %Rrc *ppPage=%p:{.Key=%RHp, .idx=%d, .fCached=%RTbool, .fMonitored=%RTbool}\n",
             rc, pPage, pPage->Core.Key, pPage->idx, pPage->fCached, pPage->fMonitored));
    STAM_PROFILE_ADV_STOP(&pPool->StatAlloc, a);
    return rc;
}
5107
5108
5109/**
5110 * Frees a usage of a pool page.
5111 *
5112 * @param pVM Pointer to the VM.
5113 * @param HCPhys The HC physical address of the shadow page.
5114 * @param iUser The shadow page pool index of the user table.
5115 * @param iUserTable The index into the user table (shadowed).
5116 */
5117void pgmPoolFree(PVM pVM, RTHCPHYS HCPhys, uint16_t iUser, uint32_t iUserTable)
5118{
5119 LogFlow(("pgmPoolFree: HCPhys=%RHp iUser=%d iUserTable=%#x\n", HCPhys, iUser, iUserTable));
5120 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
5121 pgmPoolFreeByPage(pPool, pgmPoolGetPage(pPool, HCPhys), iUser, iUserTable);
5122}
5123
5124
/**
 * Internal worker for finding an 'in-use' shadow page given by its physical address.
 *
 * The caller must own the PGM lock.  Not finding the page, or finding one
 * whose kind is PGMPOOLKIND_FREE, triggers a fatal assertion.
 *
 * @returns Pointer to the shadow page structure.
 * @param   pPool       The pool.
 * @param   HCPhys      The HC physical address of the shadow page.
 */
PPGMPOOLPAGE pgmPoolGetPage(PPGMPOOL pPool, RTHCPHYS HCPhys)
{
    PGM_LOCK_ASSERT_OWNER(pPool->CTX_SUFF(pVM));

    /*
     * Look up the page.
     */
    /* The address is masked with X86_PTE_PAE_PG_MASK before being used as the tree key. */
    PPGMPOOLPAGE pPage = (PPGMPOOLPAGE)RTAvloHCPhysGet(&pPool->HCPhysTree, HCPhys & X86_PTE_PAE_PG_MASK);

    AssertFatalMsg(pPage && pPage->enmKind != PGMPOOLKIND_FREE, ("HCPhys=%RHp pPage=%p idx=%d\n", HCPhys, pPage, (pPage) ? pPage->idx : 0));
    return pPage;
}
5144
5145
/**
 * Internal worker for finding a page for debugging purposes, no assertions
 * on the lookup result.
 *
 * The caller must own the PGM lock.
 *
 * @returns Pointer to the shadow page structure. NULL if not found.
 * @param   pPool       The pool.
 * @param   HCPhys      The HC physical address of the shadow page.
 */
PPGMPOOLPAGE pgmPoolQueryPageForDbg(PPGMPOOL pPool, RTHCPHYS HCPhys)
{
    PGM_LOCK_ASSERT_OWNER(pPool->CTX_SUFF(pVM));
    /* The address is masked with X86_PTE_PAE_PG_MASK before being used as the tree key. */
    return (PPGMPOOLPAGE)RTAvloHCPhysGet(&pPool->HCPhysTree, HCPhys & X86_PTE_PAE_PG_MASK);
}
5158
5159#ifdef IN_RING3 /* currently only used in ring 3; save some space in the R0 & GC modules (left it here as we might need it elsewhere later on) */
5160
5161/**
5162 * Flush the specified page if present
5163 *
5164 * @param pVM Pointer to the VM.
5165 * @param GCPhys Guest physical address of the page to flush
5166 */
5167void pgmPoolFlushPageByGCPhys(PVM pVM, RTGCPHYS GCPhys)
5168{
5169 PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);
5170
5171 VM_ASSERT_EMT(pVM);
5172
5173 /*
5174 * Look up the GCPhys in the hash.
5175 */
5176 GCPhys = GCPhys & ~(RTGCPHYS)PAGE_OFFSET_MASK;
5177 unsigned i = pPool->aiHash[PGMPOOL_HASH(GCPhys)];
5178 if (i == NIL_PGMPOOL_IDX)
5179 return;
5180
5181 do
5182 {
5183 PPGMPOOLPAGE pPage = &pPool->aPages[i];
5184 if (pPage->GCPhys - GCPhys < PAGE_SIZE)
5185 {
5186 switch (pPage->enmKind)
5187 {
5188 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
5189 case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
5190 case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
5191 case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
5192 case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
5193 case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
5194 case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
5195 case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
5196 case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
5197 case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
5198 case PGMPOOLKIND_64BIT_PML4:
5199 case PGMPOOLKIND_32BIT_PD:
5200 case PGMPOOLKIND_PAE_PDPT:
5201 {
5202 Log(("PGMPoolFlushPage: found pgm pool pages for %RGp\n", GCPhys));
5203#ifdef PGMPOOL_WITH_OPTIMIZED_DIRTY_PT
5204 if (pPage->fDirty)
5205 STAM_COUNTER_INC(&pPool->StatForceFlushDirtyPage);
5206 else
5207#endif
5208 STAM_COUNTER_INC(&pPool->StatForceFlushPage);
5209 Assert(!pgmPoolIsPageLocked(pPage));
5210 pgmPoolMonitorChainFlush(pPool, pPage);
5211 return;
5212 }
5213
5214 /* ignore, no monitoring. */
5215 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
5216 case PGMPOOLKIND_PAE_PT_FOR_PAE_2MB:
5217 case PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB:
5218 case PGMPOOLKIND_32BIT_PT_FOR_PHYS:
5219 case PGMPOOLKIND_PAE_PT_FOR_PHYS:
5220 case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
5221 case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
5222 case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
5223 case PGMPOOLKIND_EPT_PD_FOR_PHYS:
5224 case PGMPOOLKIND_EPT_PT_FOR_PHYS:
5225 case PGMPOOLKIND_ROOT_NESTED:
5226 case PGMPOOLKIND_PAE_PD_PHYS:
5227 case PGMPOOLKIND_PAE_PDPT_PHYS:
5228 case PGMPOOLKIND_32BIT_PD_PHYS:
5229 case PGMPOOLKIND_PAE_PDPT_FOR_32BIT:
5230 break;
5231
5232 default:
5233 AssertFatalMsgFailed(("enmKind=%d idx=%d\n", pPage->enmKind, pPage->idx));
5234 }
5235 }
5236
5237 /* next */
5238 i = pPage->iNext;
5239 } while (i != NIL_PGMPOOL_IDX);
5240 return;
5241}
5242
5243#endif /* IN_RING3 */
5244#ifdef IN_RING3
5245
/**
 * Reset CPU on hot plugging.
 *
 * Takes the virtual CPU out of shadow mode and back in again, then raises the
 * VMCPU_FF_PGM_SYNC_CR3 and VMCPU_FF_TLB_FLUSH force flags on it.
 *
 * @param   pVM     Pointer to the VM.
 * @param   pVCpu   The virtual CPU.
 */
void pgmR3PoolResetUnpluggedCpu(PVM pVM, PVMCPU pVCpu)
{
    pgmR3ExitShadowModeBeforePoolFlush(pVCpu);

    pgmR3ReEnterShadowModeAfterPoolFlush(pVM, pVCpu);
    VMCPU_FF_SET(pVCpu, VMCPU_FF_PGM_SYNC_CR3);
    VMCPU_FF_SET(pVCpu, VMCPU_FF_TLB_FLUSH);
}
5260
5261
/**
 * Flushes the entire cache.
 *
 * It will assert a global CR3 flush (FF) and assumes the caller is aware of
 * this and execute this CR3 flush.
 *
 * The caller must own the PGM lock.  All non-special pool pages are returned
 * to the free list, the user records, physical extent records, GCPhys
 * tracking, modified list, hash, age list and (when compiled in) dirty page
 * table are reinitialized, and every virtual CPU re-enters shadow mode.
 *
 * @param   pVM     Pointer to the VM.
 */
void pgmR3PoolReset(PVM pVM)
{
    PPGMPOOL pPool = pVM->pgm.s.CTX_SUFF(pPool);

    PGM_LOCK_ASSERT_OWNER(pVM);
    STAM_PROFILE_START(&pPool->StatR3Reset, a);
    LogFlow(("pgmR3PoolReset:\n"));

    /*
     * If there are no pages in the pool, there is nothing to do.
     */
    if (pPool->cCurPages <= PGMPOOL_IDX_FIRST)
    {
        STAM_PROFILE_STOP(&pPool->StatR3Reset, a);
        return;
    }

    /*
     * Exit the shadow mode since we're going to clear everything,
     * including the root page.
     */
    for (VMCPUID i = 0; i < pVM->cCpus; i++)
        pgmR3ExitShadowModeBeforePoolFlush(&pVM->aCpus[i]);

    /*
     * Nuke the free list and reinsert all pages into it.
     */
    /* Walks from the last page down to PGMPOOL_IDX_FIRST, chaining each page to i + 1. */
    for (unsigned i = pPool->cCurPages - 1; i >= PGMPOOL_IDX_FIRST; i--)
    {
        PPGMPOOLPAGE pPage = &pPool->aPages[i];

        Assert(pPage->Core.Key == MMPage2Phys(pVM, pPage->pvPageR3));
        if (pPage->fMonitored)
            pgmPoolMonitorFlush(pPool, pPage);
        pPage->iModifiedNext = NIL_PGMPOOL_IDX;
        pPage->iModifiedPrev = NIL_PGMPOOL_IDX;
        pPage->iMonitoredNext = NIL_PGMPOOL_IDX;
        pPage->iMonitoredPrev = NIL_PGMPOOL_IDX;
        pPage->cModifications = 0;
        pPage->GCPhys = NIL_RTGCPHYS;
        pPage->enmKind = PGMPOOLKIND_FREE;
        pPage->enmAccess = PGMPOOLACCESS_DONTCARE;
        Assert(pPage->idx == i);
        pPage->iNext = i + 1;
        pPage->fA20Enabled = true;
        pPage->fZeroed = false;       /* This could probably be optimized, but better safe than sorry. */
        pPage->fSeenNonGlobal = false;
        pPage->fMonitored = false;
        pPage->fDirty = false;
        pPage->fCached = false;
        pPage->fReusedFlushPending = false;
        pPage->iUserHead = NIL_PGMPOOL_USER_INDEX;
        pPage->iAgeNext = NIL_PGMPOOL_IDX;
        pPage->iAgePrev = NIL_PGMPOOL_IDX;
        pPage->GCPtrLastAccessHandlerRip = NIL_RTGCPTR;
        pPage->GCPtrLastAccessHandlerFault = NIL_RTGCPTR;
        pPage->cLastAccessHandler = 0;
        pPage->cLocked = 0;
#ifdef VBOX_STRICT
        pPage->GCPtrDirtyFault = NIL_RTGCPTR;
#endif
    }
    /* Terminate the chain at the last page and point the free head at the first regular page. */
    pPool->aPages[pPool->cCurPages - 1].iNext = NIL_PGMPOOL_IDX;
    pPool->iFreeHead = PGMPOOL_IDX_FIRST;
    pPool->cUsedPages = 0;

    /*
     * Zap and reinitialize the user records.
     */
    pPool->cPresent = 0;
    pPool->iUserFreeHead = 0;
    PPGMPOOLUSER paUsers = pPool->CTX_SUFF(paUsers);
    const unsigned cMaxUsers = pPool->cMaxUsers;
    for (unsigned i = 0; i < cMaxUsers; i++)
    {
        paUsers[i].iNext = i + 1;
        paUsers[i].iUser = NIL_PGMPOOL_IDX;
        paUsers[i].iUserTable = 0xfffffffe;
    }
    /* NOTE(review): assumes cMaxUsers is non-zero; same for cMaxPhysExts further down. */
    paUsers[cMaxUsers - 1].iNext = NIL_PGMPOOL_USER_INDEX;

    /*
     * Clear all the GCPhys links and rebuild the phys ext free list.
     */
    for (PPGMRAMRANGE pRam = pVM->pgm.s.CTX_SUFF(pRamRangesX);
         pRam;
         pRam = pRam->CTX_SUFF(pNext))
    {
        unsigned iPage = pRam->cb >> PAGE_SHIFT;
        while (iPage-- > 0)
            PGM_PAGE_SET_TRACKING(pVM, &pRam->aPages[iPage], 0);
    }

    pPool->iPhysExtFreeHead = 0;
    PPGMPOOLPHYSEXT paPhysExts = pPool->CTX_SUFF(paPhysExts);
    const unsigned cMaxPhysExts = pPool->cMaxPhysExts;
    for (unsigned i = 0; i < cMaxPhysExts; i++)
    {
        paPhysExts[i].iNext = i + 1;
        paPhysExts[i].aidx[0] = NIL_PGMPOOL_IDX;
        paPhysExts[i].apte[0] = NIL_PGMPOOL_PHYSEXT_IDX_PTE;
        paPhysExts[i].aidx[1] = NIL_PGMPOOL_IDX;
        paPhysExts[i].apte[1] = NIL_PGMPOOL_PHYSEXT_IDX_PTE;
        paPhysExts[i].aidx[2] = NIL_PGMPOOL_IDX;
        paPhysExts[i].apte[2] = NIL_PGMPOOL_PHYSEXT_IDX_PTE;
    }
    paPhysExts[cMaxPhysExts - 1].iNext = NIL_PGMPOOL_PHYSEXT_INDEX;

    /*
     * Just zap the modified list.
     */
    pPool->cModifiedPages = 0;
    pPool->iModifiedHead = NIL_PGMPOOL_IDX;

    /*
     * Clear the GCPhys hash and the age list.
     */
    for (unsigned i = 0; i < RT_ELEMENTS(pPool->aiHash); i++)
        pPool->aiHash[i] = NIL_PGMPOOL_IDX;
    pPool->iAgeHead = NIL_PGMPOOL_IDX;
    pPool->iAgeTail = NIL_PGMPOOL_IDX;

#ifdef PGMPOOL_WITH_OPTIMIZED_DIRTY_PT
    /* Clear all dirty pages. */
    pPool->idxFreeDirtyPage = 0;
    pPool->cDirtyPages = 0;
    for (unsigned i = 0; i < RT_ELEMENTS(pPool->aDirtyPages); i++)
        pPool->aDirtyPages[i].uIdx = NIL_PGMPOOL_IDX;
#endif

    /*
     * Reinsert active pages into the hash and ensure monitoring chains are correct.
     */
    /* Only the special pages in [PGMPOOL_IDX_FIRST_SPECIAL, PGMPOOL_IDX_FIRST) are visited here. */
    for (unsigned i = PGMPOOL_IDX_FIRST_SPECIAL; i < PGMPOOL_IDX_FIRST; i++)
    {
        PPGMPOOLPAGE pPage = &pPool->aPages[i];
        pPage->iNext = NIL_PGMPOOL_IDX;
        pPage->iModifiedNext = NIL_PGMPOOL_IDX;
        pPage->iModifiedPrev = NIL_PGMPOOL_IDX;
        pPage->cModifications = 0;
        /* ASSUMES that we're not sharing with any of the other special pages (safe for now). */
        pPage->iMonitoredNext = NIL_PGMPOOL_IDX;
        pPage->iMonitoredPrev = NIL_PGMPOOL_IDX;
        if (pPage->fMonitored)
        {
            int rc = PGMHandlerPhysicalChangeCallbacks(pVM, pPage->GCPhys & ~(RTGCPHYS)PAGE_OFFSET_MASK,
                                                       pPool->pfnAccessHandlerR3, MMHyperCCToR3(pVM, pPage),
                                                       pPool->pfnAccessHandlerR0, MMHyperCCToR0(pVM, pPage),
                                                       pPool->pfnAccessHandlerRC, MMHyperCCToRC(pVM, pPage),
                                                       pPool->pszAccessHandler);
            AssertFatalRCSuccess(rc);
            pgmPoolHashInsert(pPool, pPage);
        }
        Assert(pPage->iUserHead == NIL_PGMPOOL_USER_INDEX); /* for now */
        Assert(pPage->iAgeNext == NIL_PGMPOOL_IDX);
        Assert(pPage->iAgePrev == NIL_PGMPOOL_IDX);
    }

    for (VMCPUID i = 0; i < pVM->cCpus; i++)
    {
        /*
         * Re-enter the shadowing mode and assert Sync CR3 FF.
         */
        PVMCPU pVCpu = &pVM->aCpus[i];
        pgmR3ReEnterShadowModeAfterPoolFlush(pVM, pVCpu);
        VMCPU_FF_SET(pVCpu, VMCPU_FF_PGM_SYNC_CR3);
        VMCPU_FF_SET(pVCpu, VMCPU_FF_TLB_FLUSH);
    }

    STAM_PROFILE_STOP(&pPool->StatR3Reset, a);
}
5441
5442#endif /* IN_RING3 */
5443
5444#ifdef LOG_ENABLED
5445/**
5446 * Stringifies a PGMPOOLKIND value.
5447 */
5448static const char *pgmPoolPoolKindToStr(uint8_t enmKind)
5449{
5450 switch ((PGMPOOLKIND)enmKind)
5451 {
5452 case PGMPOOLKIND_INVALID:
5453 return "PGMPOOLKIND_INVALID";
5454 case PGMPOOLKIND_FREE:
5455 return "PGMPOOLKIND_FREE";
5456 case PGMPOOLKIND_32BIT_PT_FOR_PHYS:
5457 return "PGMPOOLKIND_32BIT_PT_FOR_PHYS";
5458 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT:
5459 return "PGMPOOLKIND_32BIT_PT_FOR_32BIT_PT";
5460 case PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB:
5461 return "PGMPOOLKIND_32BIT_PT_FOR_32BIT_4MB";
5462 case PGMPOOLKIND_PAE_PT_FOR_PHYS:
5463 return "PGMPOOLKIND_PAE_PT_FOR_PHYS";
5464 case PGMPOOLKIND_PAE_PT_FOR_32BIT_PT:
5465 return "PGMPOOLKIND_PAE_PT_FOR_32BIT_PT";
5466 case PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB:
5467 return "PGMPOOLKIND_PAE_PT_FOR_32BIT_4MB";
5468 case PGMPOOLKIND_PAE_PT_FOR_PAE_PT:
5469 return "PGMPOOLKIND_PAE_PT_FOR_PAE_PT";
5470 case PGMPOOLKIND_PAE_PT_FOR_PAE_2MB:
5471 return "PGMPOOLKIND_PAE_PT_FOR_PAE_2MB";
5472 case PGMPOOLKIND_32BIT_PD:
5473 return "PGMPOOLKIND_32BIT_PD";
5474 case PGMPOOLKIND_32BIT_PD_PHYS:
5475 return "PGMPOOLKIND_32BIT_PD_PHYS";
5476 case PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD:
5477 return "PGMPOOLKIND_PAE_PD0_FOR_32BIT_PD";
5478 case PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD:
5479 return "PGMPOOLKIND_PAE_PD1_FOR_32BIT_PD";
5480 case PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD:
5481 return "PGMPOOLKIND_PAE_PD2_FOR_32BIT_PD";
5482 case PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD:
5483 return "PGMPOOLKIND_PAE_PD3_FOR_32BIT_PD";
5484 case PGMPOOLKIND_PAE_PD_FOR_PAE_PD:
5485 return "PGMPOOLKIND_PAE_PD_FOR_PAE_PD";
5486 case PGMPOOLKIND_PAE_PD_PHYS:
5487 return "PGMPOOLKIND_PAE_PD_PHYS";
5488 case PGMPOOLKIND_PAE_PDPT_FOR_32BIT:
5489 return "PGMPOOLKIND_PAE_PDPT_FOR_32BIT";
5490 case PGMPOOLKIND_PAE_PDPT:
5491 return "PGMPOOLKIND_PAE_PDPT";
5492 case PGMPOOLKIND_PAE_PDPT_PHYS:
5493 return "PGMPOOLKIND_PAE_PDPT_PHYS";
5494 case PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT:
5495 return "PGMPOOLKIND_64BIT_PDPT_FOR_64BIT_PDPT";
5496 case PGMPOOLKIND_64BIT_PDPT_FOR_PHYS:
5497 return "PGMPOOLKIND_64BIT_PDPT_FOR_PHYS";
5498 case PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD:
5499 return "PGMPOOLKIND_64BIT_PD_FOR_64BIT_PD";
5500 case PGMPOOLKIND_64BIT_PD_FOR_PHYS:
5501 return "PGMPOOLKIND_64BIT_PD_FOR_PHYS";
5502 case PGMPOOLKIND_64BIT_PML4:
5503 return "PGMPOOLKIND_64BIT_PML4";
5504 case PGMPOOLKIND_EPT_PDPT_FOR_PHYS:
5505 return "PGMPOOLKIND_EPT_PDPT_FOR_PHYS";
5506 case PGMPOOLKIND_EPT_PD_FOR_PHYS:
5507 return "PGMPOOLKIND_EPT_PD_FOR_PHYS";
5508 case PGMPOOLKIND_EPT_PT_FOR_PHYS:
5509 return "PGMPOOLKIND_EPT_PT_FOR_PHYS";
5510 case PGMPOOLKIND_ROOT_NESTED:
5511 return "PGMPOOLKIND_ROOT_NESTED";
5512 }
5513 return "Unknown kind!";
5514}
5515#endif /* LOG_ENABLED*/
5516
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette