VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMR0/GVMMR0.cpp@85972

Last change on this file since 85972 was 82989, checked in by vboxsync, 5 years ago

VMM/GMMR0: Added a per-VM chunk TLB to avoid having everyone hammer the global spinlock. bugref:9627

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id Revision
File size: 106.3 KB
1/* $Id: GVMMR0.cpp 82989 2020-02-05 11:16:44Z vboxsync $ */
2/** @file
3 * GVMM - Global VM Manager.
4 */
5
6/*
7 * Copyright (C) 2007-2020 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 */
17
18
19/** @page pg_gvmm GVMM - The Global VM Manager
20 *
21 * The Global VM Manager lives in ring-0. Its main function at the moment is
22 * to manage a list of all running VMs, keep a ring-0 only structure (GVM) for
23 * each of them, and assign them unique identifiers (so GMM can track page
24 * owners). The GVMM also manages some of the host CPU resources, like the
25 * periodic preemption timer.
26 *
27 * The GVMM will create a ring-0 object for each VM when it is registered; this
28 * is both for session cleanup purposes and for having a point where it is
29 * possible to implement usage policies later (in SUPR0ObjRegister).
30 *
31 *
32 * @section sec_gvmm_ppt Periodic Preemption Timer (PPT)
33 *
34 * On systems that sport a high resolution kernel timer API, we use per-CPU
35 * timers to generate interrupts that preempt VT-x, AMD-V and raw-mode guest
36 * execution. The timer frequency is calculated by taking the max
37 * TMCalcHostTimerFrequency for all VMs running on a CPU for the last ~160 ms
38 * (RT_ELEMENTS((PGVMMHOSTCPU)0, Ppt.aHzHistory) *
39 * GVMMHOSTCPU_PPT_HIST_INTERVAL_NS).
40 *
41 * The TMCalcHostTimerFrequency() part of things takes the max
42 * TMTimerSetFrequencyHint() value and adjusts it by the current catch-up percent,
43 * warp drive percent and some fudge factors. VMMR0.cpp reports the result via
44 * GVMMR0SchedUpdatePeriodicPreemptionTimer() before switching to the VT-x,
45 * AMD-V and raw-mode execution environments.
46 */
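/*
 * Illustrative flow only (not part of the original file; the exact signatures
 * below are assumptions): per the description above, each EMT reports its
 * desired rate before switching to guest execution, roughly like this (the
 * real call site lives in VMMR0.cpp):
 *
 *     uint32_t uHz = TMCalcHostTimerFrequency(pGVM, pGVCpu);
 *     GVMMR0SchedUpdatePeriodicPreemptionTimer(pGVM, pGVCpu->idHostCpu, uHz);
 */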
47
48
49/*********************************************************************************************************************************
50* Header Files *
51*********************************************************************************************************************************/
52#define LOG_GROUP LOG_GROUP_GVMM
53#include <VBox/vmm/gvmm.h>
54#include <VBox/vmm/gmm.h>
55#include "GVMMR0Internal.h"
56#include <VBox/vmm/iom.h>
57#include <VBox/vmm/pdm.h>
58#include <VBox/vmm/pgm.h>
59#include <VBox/vmm/vmm.h>
60#ifdef VBOX_WITH_NEM_R0
61# include <VBox/vmm/nem.h>
62#endif
63#include <VBox/vmm/vmcpuset.h>
64#include <VBox/vmm/vmcc.h>
65#include <VBox/param.h>
66#include <VBox/err.h>
67
68#include <iprt/asm.h>
69#include <iprt/asm-amd64-x86.h>
70#include <iprt/critsect.h>
71#include <iprt/mem.h>
72#include <iprt/semaphore.h>
73#include <iprt/time.h>
74#include <VBox/log.h>
75#include <iprt/thread.h>
76#include <iprt/process.h>
77#include <iprt/param.h>
78#include <iprt/string.h>
79#include <iprt/assert.h>
80#include <iprt/mem.h>
81#include <iprt/memobj.h>
82#include <iprt/mp.h>
83#include <iprt/cpuset.h>
84#include <iprt/spinlock.h>
85#include <iprt/timer.h>
86
87#include "dtrace/VBoxVMM.h"
88
89
90/*********************************************************************************************************************************
91* Defined Constants And Macros *
92*********************************************************************************************************************************/
93#if defined(RT_OS_LINUX) || defined(RT_OS_SOLARIS) || defined(DOXYGEN_RUNNING)
94/** Define this to enable the periodic preemption timer. */
95# define GVMM_SCHED_WITH_PPT
96#endif
97
98
99/** @def GVMM_CHECK_SMAP_SETUP
100 * SMAP check setup. */
101/** @def GVMM_CHECK_SMAP_CHECK
102 * Checks that the AC flag is set if SMAP is enabled. If AC is not set,
103 * it will be logged and @a a_BadExpr is executed. */
104/** @def GVMM_CHECK_SMAP_CHECK2
105 * Checks that the AC flag is set if SMAP is enabled. If AC is not set, it will
106 * be logged, written to the VM's assertion text buffer, and @a a_BadExpr is
107 * executed. */
108#if (defined(VBOX_STRICT) || 1) && !defined(VBOX_WITH_RAM_IN_KERNEL)
109# define GVMM_CHECK_SMAP_SETUP() uint32_t const fKernelFeatures = SUPR0GetKernelFeatures()
110# define GVMM_CHECK_SMAP_CHECK(a_BadExpr) \
111 do { \
112 if (fKernelFeatures & SUPKERNELFEATURES_SMAP) \
113 { \
114 RTCCUINTREG fEflCheck = ASMGetFlags(); \
115 if (RT_LIKELY(fEflCheck & X86_EFL_AC)) \
116 { /* likely */ } \
117 else \
118 { \
119 SUPR0Printf("%s, line %d: EFLAGS.AC is clear! (%#x)\n", __FUNCTION__, __LINE__, (uint32_t)fEflCheck); \
120 a_BadExpr; \
121 } \
122 } \
123 } while (0)
124# define GVMM_CHECK_SMAP_CHECK2(a_pGVM, a_BadExpr) \
125 do { \
126 if (fKernelFeatures & SUPKERNELFEATURES_SMAP) \
127 { \
128 RTCCUINTREG fEflCheck = ASMGetFlags(); \
129 if (RT_LIKELY(fEflCheck & X86_EFL_AC)) \
130 { /* likely */ } \
131 else \
132 { \
133 SUPR0BadContext((a_pGVM) ? (a_pGVM)->pSession : NULL, __FILE__, __LINE__, "EFLAGS.AC is zero!"); \
134 a_BadExpr; \
135 } \
136 } \
137 } while (0)
138#else
139# define GVMM_CHECK_SMAP_SETUP() uint32_t const fKernelFeatures = 0
140# define GVMM_CHECK_SMAP_CHECK(a_BadExpr) NOREF(fKernelFeatures)
141# define GVMM_CHECK_SMAP_CHECK2(a_pGVM, a_BadExpr) NOREF(fKernelFeatures)
142#endif
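/*
 * Usage sketch (illustrative, not part of the original file): code later in
 * the file typically pairs the setup macro with checks on entry and after
 * calls that may have clobbered EFLAGS.AC, e.g.:
 *
 *     GVMM_CHECK_SMAP_SETUP();
 *     GVMM_CHECK_SMAP_CHECK2(pGVM, RT_NOTHING);  // log the violation and carry on
 */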
143
144
145
146/*********************************************************************************************************************************
147* Structures and Typedefs *
148*********************************************************************************************************************************/
149
150/**
151 * Global VM handle.
152 */
153typedef struct GVMHANDLE
154{
155 /** The index of the next handle in the list (free or used). (0 is nil.) */
156 uint16_t volatile iNext;
157 /** Our own index / handle value. */
158 uint16_t iSelf;
159 /** The process ID of the handle owner.
160 * This is used for access checks. */
161 RTPROCESS ProcId;
162 /** The pointer to the ring-0 only (aka global) VM structure. */
163 PGVM pGVM;
164 /** The virtual machine object. */
165 void *pvObj;
166 /** The session this VM is associated with. */
167 PSUPDRVSESSION pSession;
168 /** The ring-0 handle of the EMT0 thread.
169 * This is used for ownership checks as well as looking up a VM handle by thread
170 * at times like assertions. */
171 RTNATIVETHREAD hEMT0;
172} GVMHANDLE;
173/** Pointer to a global VM handle. */
174typedef GVMHANDLE *PGVMHANDLE;
175
176/** Number of GVM handles (including the NIL handle). */
177#if HC_ARCH_BITS == 64
178# define GVMM_MAX_HANDLES 8192
179#else
180# define GVMM_MAX_HANDLES 128
181#endif
182
183/**
184 * Per host CPU GVMM data.
185 */
186typedef struct GVMMHOSTCPU
187{
188 /** Magic number (GVMMHOSTCPU_MAGIC). */
189 uint32_t volatile u32Magic;
190 /** The CPU ID. */
191 RTCPUID idCpu;
192 /** The CPU set index. */
193 uint32_t idxCpuSet;
194
195#ifdef GVMM_SCHED_WITH_PPT
196 /** Periodic preemption timer data. */
197 struct
198 {
199 /** The handle to the periodic preemption timer. */
200 PRTTIMER pTimer;
201 /** Spinlock protecting the data below. */
202 RTSPINLOCK hSpinlock;
203 /** The smallest Hz that we need to care about. (static) */
204 uint32_t uMinHz;
205 /** The number of ticks between each historization. */
206 uint32_t cTicksHistoriziationInterval;
207 /** The current historization tick (counting up to
208 * cTicksHistoriziationInterval and then resetting). */
209 uint32_t iTickHistorization;
210 /** The current timer interval. This is set to 0 when inactive. */
211 uint32_t cNsInterval;
212 /** The current timer frequency. This is set to 0 when inactive. */
213 uint32_t uTimerHz;
214 /** The current max frequency reported by the EMTs.
215 * This gets historicized and reset by the timer callback. This is
216 * read without holding the spinlock, so needs atomic updating. */
217 uint32_t volatile uDesiredHz;
218 /** Whether the timer was started or not. */
219 bool volatile fStarted;
220 /** Set if we're starting the timer. */
221 bool volatile fStarting;
222 /** The index of the next history entry (mod it). */
223 uint32_t iHzHistory;
224 /** Historicized uDesiredHz values. The array wraps around, new entries
225 * are added at iHzHistory. This is updated approximately every
226 * GVMMHOSTCPU_PPT_HIST_INTERVAL_NS by the timer callback. */
227 uint32_t aHzHistory[8];
228 /** Statistics counter for recording the number of interval changes. */
229 uint32_t cChanges;
230 /** Statistics counter for recording the number of timer starts. */
231 uint32_t cStarts;
232 } Ppt;
233#endif /* GVMM_SCHED_WITH_PPT */
234
235} GVMMHOSTCPU;
236/** Pointer to the per host CPU GVMM data. */
237typedef GVMMHOSTCPU *PGVMMHOSTCPU;
238/** The GVMMHOSTCPU::u32Magic value (Petra, Tanya & Rachel Haden). */
239#define GVMMHOSTCPU_MAGIC UINT32_C(0x19711011)
240/** The interval one history entry should cover (approximately), given in
241 * nanoseconds. */
242#define GVMMHOSTCPU_PPT_HIST_INTERVAL_NS UINT32_C(20000000)
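/*
 * Illustrative sketch only (not part of the original file, assuming the
 * structure above): the history spans RT_ELEMENTS(aHzHistory) (8) slots of
 * GVMMHOSTCPU_PPT_HIST_INTERVAL_NS (20 ms) each, i.e. roughly 160 ms, and the
 * effective timer rate is the highest frequency requested during that window.
 */
#if 0 /* example */
static uint32_t gvmmR0ExamplePickTimerHz(PGVMMHOSTCPU pCpu)
{
    uint32_t uHz = ASMAtomicReadU32(&pCpu->Ppt.uDesiredHz);
    for (uint32_t i = 0; i < RT_ELEMENTS(pCpu->Ppt.aHzHistory); i++)
        uHz = RT_MAX(uHz, pCpu->Ppt.aHzHistory[i]);
    return uHz > pCpu->Ppt.uMinHz ? uHz : 0; /* 0 = leave the timer stopped */
}
#endif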
243
244
245/**
246 * The GVMM instance data.
247 */
248typedef struct GVMM
249{
250 /** Eyecatcher / magic. */
251 uint32_t u32Magic;
252 /** The index of the head of the free handle chain. (0 is nil.) */
253 uint16_t volatile iFreeHead;
254 /** The index of the head of the active handle chain. (0 is nil.) */
255 uint16_t volatile iUsedHead;
256 /** The number of VMs. */
257 uint16_t volatile cVMs;
258 /** Alignment padding. */
259 uint16_t u16Reserved;
260 /** The number of EMTs. */
261 uint32_t volatile cEMTs;
262 /** The number of EMTs that have halted in GVMMR0SchedHalt. */
263 uint32_t volatile cHaltedEMTs;
264 /** Mini lock for restricting early wake-ups to one thread. */
265 bool volatile fDoingEarlyWakeUps;
266 bool afPadding[3]; /**< explicit alignment padding. */
267 /** When the next halted or sleeping EMT will wake up.
268 * This is set to 0 when it needs recalculating and to UINT64_MAX when
269 * there are no halted or sleeping EMTs in the GVMM. */
270 uint64_t uNsNextEmtWakeup;
271 /** The lock used to serialize VM creation, destruction and associated events that
272 * aren't performance critical. Owners may acquire the list lock. */
273 RTCRITSECT CreateDestroyLock;
274 /** The lock used to serialize used list updates and accesses.
275 * This indirectly includes scheduling since the scheduler will have to walk the
276 * used list to examine running VMs. Owners may not acquire any other locks. */
277 RTCRITSECTRW UsedLock;
278 /** The handle array.
279 * The size of this array defines the maximum number of currently running VMs.
280 * The first entry is unused as it represents the NIL handle. */
281 GVMHANDLE aHandles[GVMM_MAX_HANDLES];
282
283 /** @gcfgm{/GVMM/cEMTsMeansCompany, 32-bit, 0, UINT32_MAX, 1}
284 * The number of EMTs that means we no longer consider ourselves alone on a
285 * CPU/Core.
286 */
287 uint32_t cEMTsMeansCompany;
288 /** @gcfgm{/GVMM/MinSleepAlone, 32-bit, 0, 100000000, 750000, ns}
289 * The minimum sleep time for when we're alone, in nanoseconds.
290 */
291 uint32_t nsMinSleepAlone;
292 /** @gcfgm{/GVMM/MinSleepCompany, 32-bit, 0, 100000000, 15000, ns}
293 * The minimum sleep time for when we've got company, in nanoseconds.
294 */
295 uint32_t nsMinSleepCompany;
296 /** @gcfgm{/GVMM/EarlyWakeUp1, 32-bit, 0, 100000000, 25000, ns}
297 * The limit for the first round of early wake-ups, given in nanoseconds.
298 */
299 uint32_t nsEarlyWakeUp1;
300 /** @gcfgm{/GVMM/EarlyWakeUp2, 32-bit, 0, 100000000, 50000, ns}
301 * The limit for the second round of early wake-ups, given in nanoseconds.
302 */
303 uint32_t nsEarlyWakeUp2;
304
305 /** Set if we're doing early wake-ups.
306 * This reflects nsEarlyWakeUp1 and nsEarlyWakeUp2. */
307 bool volatile fDoEarlyWakeUps;
308
309 /** The number of entries in the host CPU array (aHostCpus). */
310 uint32_t cHostCpus;
311 /** Per host CPU data (variable length). */
312 GVMMHOSTCPU aHostCpus[1];
313} GVMM;
314AssertCompileMemberAlignment(GVMM, CreateDestroyLock, 8);
315AssertCompileMemberAlignment(GVMM, UsedLock, 8);
316AssertCompileMemberAlignment(GVMM, uNsNextEmtWakeup, 8);
317/** Pointer to the GVMM instance data. */
318typedef GVMM *PGVMM;
319
320/** The GVMM::u32Magic value (Charlie Haden). */
321#define GVMM_MAGIC UINT32_C(0x19370806)
322
323
324
325/*********************************************************************************************************************************
326* Global Variables *
327*********************************************************************************************************************************/
328/** Pointer to the GVMM instance data.
329 * (Just my general dislike for global variables.) */
330static PGVMM g_pGVMM = NULL;
331
332/** Macro for obtaining and validating the g_pGVMM pointer.
333 * On failure it will return from the invoking function with the specified return value.
334 *
335 * @param pGVMM The name of the pGVMM variable.
336 * @param rc The return value on failure. Use VERR_GVMM_INSTANCE for VBox
337 * status codes.
338 */
339#define GVMM_GET_VALID_INSTANCE(pGVMM, rc) \
340 do { \
341 (pGVMM) = g_pGVMM;\
342 AssertPtrReturn((pGVMM), (rc)); \
343 AssertMsgReturn((pGVMM)->u32Magic == GVMM_MAGIC, ("%p - %#x\n", (pGVMM), (pGVMM)->u32Magic), (rc)); \
344 } while (0)
345
346/** Macro for obtaining and validating the g_pGVMM pointer, void function variant.
347 * On failure it will return from the invoking function.
348 *
349 * @param pGVMM The name of the pGVMM variable.
350 */
351#define GVMM_GET_VALID_INSTANCE_VOID(pGVMM) \
352 do { \
353 (pGVMM) = g_pGVMM;\
354 AssertPtrReturnVoid((pGVMM)); \
355 AssertMsgReturnVoid((pGVMM)->u32Magic == GVMM_MAGIC, ("%p - %#x\n", (pGVMM), (pGVMM)->u32Magic)); \
356 } while (0)
357
358
359/*********************************************************************************************************************************
360* Internal Functions *
361*********************************************************************************************************************************/
362static void gvmmR0InitPerVMData(PGVM pGVM, int16_t hSelf, VMCPUID cCpus, PSUPDRVSESSION pSession);
363static DECLCALLBACK(void) gvmmR0HandleObjDestructor(void *pvObj, void *pvGVMM, void *pvHandle);
364static int gvmmR0ByGVM(PGVM pGVM, PGVMM *ppGVMM, bool fTakeUsedLock);
365static int gvmmR0ByGVMandEMT(PGVM pGVM, VMCPUID idCpu, PGVMM *ppGVMM);
366
367#ifdef GVMM_SCHED_WITH_PPT
368static DECLCALLBACK(void) gvmmR0SchedPeriodicPreemptionTimerCallback(PRTTIMER pTimer, void *pvUser, uint64_t iTick);
369#endif
370
371
372/**
373 * Initializes the GVMM.
374 *
375 * This is called while owning the loader semaphore (see supdrvIOCtl_LdrLoad()).
376 *
377 * @returns VBox status code.
378 */
379GVMMR0DECL(int) GVMMR0Init(void)
380{
381 LogFlow(("GVMMR0Init:\n"));
382
383 /*
384 * Allocate and initialize the instance data.
385 */
386 uint32_t cHostCpus = RTMpGetArraySize();
387 AssertMsgReturn(cHostCpus > 0 && cHostCpus < _64K, ("%d", (int)cHostCpus), VERR_GVMM_HOST_CPU_RANGE);
388
389 PGVMM pGVMM = (PGVMM)RTMemAllocZ(RT_UOFFSETOF_DYN(GVMM, aHostCpus[cHostCpus]));
390 if (!pGVMM)
391 return VERR_NO_MEMORY;
392 int rc = RTCritSectInitEx(&pGVMM->CreateDestroyLock, 0, NIL_RTLOCKVALCLASS, RTLOCKVAL_SUB_CLASS_NONE,
393 "GVMM-CreateDestroyLock");
394 if (RT_SUCCESS(rc))
395 {
396 rc = RTCritSectRwInitEx(&pGVMM->UsedLock, 0, NIL_RTLOCKVALCLASS, RTLOCKVAL_SUB_CLASS_NONE, "GVMM-UsedLock");
397 if (RT_SUCCESS(rc))
398 {
399 pGVMM->u32Magic = GVMM_MAGIC;
400 pGVMM->iUsedHead = 0;
401 pGVMM->iFreeHead = 1;
402
403 /* the nil handle */
404 pGVMM->aHandles[0].iSelf = 0;
405 pGVMM->aHandles[0].iNext = 0;
406
407 /* the tail */
408 unsigned i = RT_ELEMENTS(pGVMM->aHandles) - 1;
409 pGVMM->aHandles[i].iSelf = i;
410 pGVMM->aHandles[i].iNext = 0; /* nil */
411
412 /* the rest */
413 while (i-- > 1)
414 {
415 pGVMM->aHandles[i].iSelf = i;
416 pGVMM->aHandles[i].iNext = i + 1;
417 }
418
419 /* The default configuration values. */
420 uint32_t cNsResolution = RTSemEventMultiGetResolution();
421 pGVMM->cEMTsMeansCompany = 1; /** @todo should be adjusted relative to the cpu count or something... */
422 if (cNsResolution >= 5*RT_NS_100US)
423 {
424 pGVMM->nsMinSleepAlone = 750000 /* ns (0.750 ms) */; /** @todo this should be adjusted to be 75% (or something) of the scheduler granularity... */
425 pGVMM->nsMinSleepCompany = 15000 /* ns (0.015 ms) */;
426 pGVMM->nsEarlyWakeUp1 = 25000 /* ns (0.025 ms) */;
427 pGVMM->nsEarlyWakeUp2 = 50000 /* ns (0.050 ms) */;
428 }
429 else if (cNsResolution > RT_NS_100US)
430 {
431 pGVMM->nsMinSleepAlone = cNsResolution / 2;
432 pGVMM->nsMinSleepCompany = cNsResolution / 4;
433 pGVMM->nsEarlyWakeUp1 = 0;
434 pGVMM->nsEarlyWakeUp2 = 0;
435 }
436 else
437 {
438 pGVMM->nsMinSleepAlone = 2000;
439 pGVMM->nsMinSleepCompany = 2000;
440 pGVMM->nsEarlyWakeUp1 = 0;
441 pGVMM->nsEarlyWakeUp2 = 0;
442 }
443 pGVMM->fDoEarlyWakeUps = pGVMM->nsEarlyWakeUp1 > 0 && pGVMM->nsEarlyWakeUp2 > 0;
444
445 /* The host CPU data. */
446 pGVMM->cHostCpus = cHostCpus;
447 uint32_t iCpu = cHostCpus;
448 RTCPUSET PossibleSet;
449 RTMpGetSet(&PossibleSet);
450 while (iCpu-- > 0)
451 {
452 pGVMM->aHostCpus[iCpu].idxCpuSet = iCpu;
453#ifdef GVMM_SCHED_WITH_PPT
454 pGVMM->aHostCpus[iCpu].Ppt.pTimer = NULL;
455 pGVMM->aHostCpus[iCpu].Ppt.hSpinlock = NIL_RTSPINLOCK;
456 pGVMM->aHostCpus[iCpu].Ppt.uMinHz = 5; /** @todo Add some API which figures this one out. (not *that* important) */
457 pGVMM->aHostCpus[iCpu].Ppt.cTicksHistoriziationInterval = 1;
458 //pGVMM->aHostCpus[iCpu].Ppt.iTickHistorization = 0;
459 //pGVMM->aHostCpus[iCpu].Ppt.cNsInterval = 0;
460 //pGVMM->aHostCpus[iCpu].Ppt.uTimerHz = 0;
461 //pGVMM->aHostCpus[iCpu].Ppt.uDesiredHz = 0;
462 //pGVMM->aHostCpus[iCpu].Ppt.fStarted = false;
463 //pGVMM->aHostCpus[iCpu].Ppt.fStarting = false;
464 //pGVMM->aHostCpus[iCpu].Ppt.iHzHistory = 0;
465 //pGVMM->aHostCpus[iCpu].Ppt.aHzHistory = {0};
466#endif
467
468 if (RTCpuSetIsMember(&PossibleSet, iCpu))
469 {
470 pGVMM->aHostCpus[iCpu].idCpu = RTMpCpuIdFromSetIndex(iCpu);
471 pGVMM->aHostCpus[iCpu].u32Magic = GVMMHOSTCPU_MAGIC;
472
473#ifdef GVMM_SCHED_WITH_PPT
474 rc = RTTimerCreateEx(&pGVMM->aHostCpus[iCpu].Ppt.pTimer,
475 50*1000*1000 /* whatever */,
476 RTTIMER_FLAGS_CPU(iCpu) | RTTIMER_FLAGS_HIGH_RES,
477 gvmmR0SchedPeriodicPreemptionTimerCallback,
478 &pGVMM->aHostCpus[iCpu]);
479 if (RT_SUCCESS(rc))
480 rc = RTSpinlockCreate(&pGVMM->aHostCpus[iCpu].Ppt.hSpinlock, RTSPINLOCK_FLAGS_INTERRUPT_SAFE, "GVMM/CPU");
481 if (RT_FAILURE(rc))
482 {
483 while (iCpu < cHostCpus)
484 {
485 RTTimerDestroy(pGVMM->aHostCpus[iCpu].Ppt.pTimer);
486 RTSpinlockDestroy(pGVMM->aHostCpus[iCpu].Ppt.hSpinlock);
487 pGVMM->aHostCpus[iCpu].Ppt.hSpinlock = NIL_RTSPINLOCK;
488 iCpu++;
489 }
490 break;
491 }
492#endif
493 }
494 else
495 {
496 pGVMM->aHostCpus[iCpu].idCpu = NIL_RTCPUID;
497 pGVMM->aHostCpus[iCpu].u32Magic = 0;
498 }
499 }
500 if (RT_SUCCESS(rc))
501 {
502 g_pGVMM = pGVMM;
503 LogFlow(("GVMMR0Init: pGVMM=%p cHostCpus=%u\n", pGVMM, cHostCpus));
504 return VINF_SUCCESS;
505 }
506
507 /* bail out. */
508 RTCritSectRwDelete(&pGVMM->UsedLock);
509 }
510 RTCritSectDelete(&pGVMM->CreateDestroyLock);
511 }
512
513 RTMemFree(pGVMM);
514 return rc;
515}
516
517
518/**
519 * Terminates the GVMM.
520 *
521 * This is called while owning the loader semaphore (see supdrvLdrFree()).
522 * And unless something is wrong, there should be absolutely no VMs
523 * registered at this point.
524 */
525GVMMR0DECL(void) GVMMR0Term(void)
526{
527 LogFlow(("GVMMR0Term:\n"));
528
529 PGVMM pGVMM = g_pGVMM;
530 g_pGVMM = NULL;
531 if (RT_UNLIKELY(!RT_VALID_PTR(pGVMM)))
532 {
533 SUPR0Printf("GVMMR0Term: pGVMM=%RKv\n", pGVMM);
534 return;
535 }
536
537 /*
538 * First of all, stop all active timers.
539 */
540 uint32_t cActiveTimers = 0;
541 uint32_t iCpu = pGVMM->cHostCpus;
542 while (iCpu-- > 0)
543 {
544 ASMAtomicWriteU32(&pGVMM->aHostCpus[iCpu].u32Magic, ~GVMMHOSTCPU_MAGIC);
545#ifdef GVMM_SCHED_WITH_PPT
546 if ( pGVMM->aHostCpus[iCpu].Ppt.pTimer != NULL
547 && RT_SUCCESS(RTTimerStop(pGVMM->aHostCpus[iCpu].Ppt.pTimer)))
548 cActiveTimers++;
549#endif
550 }
551 if (cActiveTimers)
552 RTThreadSleep(1); /* fudge */
553
554 /*
555 * Invalidate the instance data and free resources.
556 */
557 pGVMM->u32Magic = ~GVMM_MAGIC;
558 RTCritSectRwDelete(&pGVMM->UsedLock);
559 RTCritSectDelete(&pGVMM->CreateDestroyLock);
560
561 pGVMM->iFreeHead = 0;
562 if (pGVMM->iUsedHead)
563 {
564 SUPR0Printf("GVMMR0Term: iUsedHead=%#x! (cVMs=%#x cEMTs=%#x)\n", pGVMM->iUsedHead, pGVMM->cVMs, pGVMM->cEMTs);
565 pGVMM->iUsedHead = 0;
566 }
567
568#ifdef GVMM_SCHED_WITH_PPT
569 iCpu = pGVMM->cHostCpus;
570 while (iCpu-- > 0)
571 {
572 RTTimerDestroy(pGVMM->aHostCpus[iCpu].Ppt.pTimer);
573 pGVMM->aHostCpus[iCpu].Ppt.pTimer = NULL;
574 RTSpinlockDestroy(pGVMM->aHostCpus[iCpu].Ppt.hSpinlock);
575 pGVMM->aHostCpus[iCpu].Ppt.hSpinlock = NIL_RTSPINLOCK;
576 }
577#endif
578
579 RTMemFree(pGVMM);
580}
581
582
583/**
584 * A quick hack for setting global config values.
585 *
586 * @returns VBox status code.
587 *
588 * @param pSession The session handle. Used for authentication.
589 * @param pszName The variable name.
590 * @param u64Value The new value.
591 */
592GVMMR0DECL(int) GVMMR0SetConfig(PSUPDRVSESSION pSession, const char *pszName, uint64_t u64Value)
593{
594 /*
595 * Validate input.
596 */
597 PGVMM pGVMM;
598 GVMM_GET_VALID_INSTANCE(pGVMM, VERR_GVMM_INSTANCE);
599 AssertPtrReturn(pSession, VERR_INVALID_HANDLE);
600 AssertPtrReturn(pszName, VERR_INVALID_POINTER);
601
602 /*
603 * String switch time!
604 */
605 if (strncmp(pszName, RT_STR_TUPLE("/GVMM/")))
606 return VERR_CFGM_VALUE_NOT_FOUND; /* borrow status codes from CFGM... */
607 int rc = VINF_SUCCESS;
608 pszName += sizeof("/GVMM/") - 1;
609 if (!strcmp(pszName, "cEMTsMeansCompany"))
610 {
611 if (u64Value <= UINT32_MAX)
612 pGVMM->cEMTsMeansCompany = u64Value;
613 else
614 rc = VERR_OUT_OF_RANGE;
615 }
616 else if (!strcmp(pszName, "MinSleepAlone"))
617 {
618 if (u64Value <= RT_NS_100MS)
619 pGVMM->nsMinSleepAlone = u64Value;
620 else
621 rc = VERR_OUT_OF_RANGE;
622 }
623 else if (!strcmp(pszName, "MinSleepCompany"))
624 {
625 if (u64Value <= RT_NS_100MS)
626 pGVMM->nsMinSleepCompany = u64Value;
627 else
628 rc = VERR_OUT_OF_RANGE;
629 }
630 else if (!strcmp(pszName, "EarlyWakeUp1"))
631 {
632 if (u64Value <= RT_NS_100MS)
633 {
634 pGVMM->nsEarlyWakeUp1 = u64Value;
635 pGVMM->fDoEarlyWakeUps = pGVMM->nsEarlyWakeUp1 > 0 && pGVMM->nsEarlyWakeUp2 > 0;
636 }
637 else
638 rc = VERR_OUT_OF_RANGE;
639 }
640 else if (!strcmp(pszName, "EarlyWakeUp2"))
641 {
642 if (u64Value <= RT_NS_100MS)
643 {
644 pGVMM->nsEarlyWakeUp2 = u64Value;
645 pGVMM->fDoEarlyWakeUps = pGVMM->nsEarlyWakeUp1 > 0 && pGVMM->nsEarlyWakeUp2 > 0;
646 }
647 else
648 rc = VERR_OUT_OF_RANGE;
649 }
650 else
651 rc = VERR_CFGM_VALUE_NOT_FOUND;
652 return rc;
653}
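/*
 * Illustrative call only (not part of the original file): a ring-0 caller
 * holding a valid session could tweak the scheduler like this; the name must
 * carry the "/GVMM/" prefix checked above and the value must be in range.
 */
#if 0 /* example */
    int rc = GVMMR0SetConfig(pSession, "/GVMM/MinSleepAlone", 500000 /* ns = 0.5 ms */);
    AssertRC(rc);
#endif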
654
655
656/**
657 * A quick hack for getting global config values.
658 *
659 * @returns VBox status code.
660 *
661 * @param pSession The session handle. Used for authentication.
662 * @param pszName The variable name.
663 * @param pu64Value Where to return the value.
664 */
665GVMMR0DECL(int) GVMMR0QueryConfig(PSUPDRVSESSION pSession, const char *pszName, uint64_t *pu64Value)
666{
667 /*
668 * Validate input.
669 */
670 PGVMM pGVMM;
671 GVMM_GET_VALID_INSTANCE(pGVMM, VERR_GVMM_INSTANCE);
672 AssertPtrReturn(pSession, VERR_INVALID_HANDLE);
673 AssertPtrReturn(pszName, VERR_INVALID_POINTER);
674 AssertPtrReturn(pu64Value, VERR_INVALID_POINTER);
675
676 /*
677 * String switch time!
678 */
679 if (strncmp(pszName, RT_STR_TUPLE("/GVMM/")))
680 return VERR_CFGM_VALUE_NOT_FOUND; /* borrow status codes from CFGM... */
681 int rc = VINF_SUCCESS;
682 pszName += sizeof("/GVMM/") - 1;
683 if (!strcmp(pszName, "cEMTsMeansCompany"))
684 *pu64Value = pGVMM->cEMTsMeansCompany;
685 else if (!strcmp(pszName, "MinSleepAlone"))
686 *pu64Value = pGVMM->nsMinSleepAlone;
687 else if (!strcmp(pszName, "MinSleepCompany"))
688 *pu64Value = pGVMM->nsMinSleepCompany;
689 else if (!strcmp(pszName, "EarlyWakeUp1"))
690 *pu64Value = pGVMM->nsEarlyWakeUp1;
691 else if (!strcmp(pszName, "EarlyWakeUp2"))
692 *pu64Value = pGVMM->nsEarlyWakeUp2;
693 else
694 rc = VERR_CFGM_VALUE_NOT_FOUND;
695 return rc;
696}
697
698
699/**
700 * Acquire the 'used' lock in shared mode.
701 *
702 * This prevents destruction of the VM while we're in ring-0.
703 *
704 * @returns IPRT status code, see RTCritSectRwEnterShared.
705 * @param a_pGVMM The GVMM instance data.
706 * @sa GVMMR0_USED_SHARED_UNLOCK, GVMMR0_USED_EXCLUSIVE_LOCK
707 */
708#define GVMMR0_USED_SHARED_LOCK(a_pGVMM) RTCritSectRwEnterShared(&(a_pGVMM)->UsedLock)
709
710/**
711 * Release the 'used' lock when owning it in shared mode.
712 *
713 * @returns IPRT status code, see RTCritSectRwLeaveShared.
714 * @param a_pGVMM The GVMM instance data.
715 * @sa GVMMR0_USED_SHARED_LOCK
716 */
717#define GVMMR0_USED_SHARED_UNLOCK(a_pGVMM) RTCritSectRwLeaveShared(&(a_pGVMM)->UsedLock)
718
719/**
720 * Acquire the 'used' lock in exclusive mode.
721 *
722 * Only use this function when making changes to the used list.
723 *
724 * @returns IPRT status code, see RTCritSectRwEnterExcl.
725 * @param a_pGVMM The GVMM instance data.
726 * @sa GVMMR0_USED_EXCLUSIVE_UNLOCK
727 */
728#define GVMMR0_USED_EXCLUSIVE_LOCK(a_pGVMM) RTCritSectRwEnterExcl(&(a_pGVMM)->UsedLock)
729
730/**
731 * Release the 'used' lock when owning it in exclusive mode.
732 *
733 * @returns IPRT status code, see RTCritSectRwLeaveExcl.
734 * @param a_pGVMM The GVMM instance data.
735 * @sa GVMMR0_USED_EXCLUSIVE_LOCK, GVMMR0_USED_SHARED_UNLOCK
736 */
737#define GVMMR0_USED_EXCLUSIVE_UNLOCK(a_pGVMM) RTCritSectRwLeaveExcl(&(a_pGVMM)->UsedLock)
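/*
 * Usage sketch (illustrative, not part of the original file): readers of the
 * used list take the shared variant, while code that links or unlinks handles
 * takes the exclusive one, e.g.:
 *
 *     int rc = GVMMR0_USED_SHARED_LOCK(pGVMM);
 *     AssertRCReturn(rc, rc);
 *     ... walk pGVMM->iUsedHead / aHandles ...
 *     GVMMR0_USED_SHARED_UNLOCK(pGVMM);
 */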
738
739
740/**
741 * Try acquire the 'create & destroy' lock.
742 *
743 * @returns IPRT status code, see RTCritSectEnter.
744 * @param pGVMM The GVMM instance data.
745 */
746DECLINLINE(int) gvmmR0CreateDestroyLock(PGVMM pGVMM)
747{
748 LogFlow(("++gvmmR0CreateDestroyLock(%p)\n", pGVMM));
749 int rc = RTCritSectEnter(&pGVMM->CreateDestroyLock);
750 LogFlow(("gvmmR0CreateDestroyLock(%p)->%Rrc\n", pGVMM, rc));
751 return rc;
752}
753
754
755/**
756 * Release the 'create & destroy' lock.
757 *
758 * @returns IPRT status code, see RTCritSectLeave.
759 * @param pGVMM The GVMM instance data.
760 */
761DECLINLINE(int) gvmmR0CreateDestroyUnlock(PGVMM pGVMM)
762{
763 LogFlow(("--gvmmR0CreateDestroyUnlock(%p)\n", pGVMM));
764 int rc = RTCritSectLeave(&pGVMM->CreateDestroyLock);
765 AssertRC(rc);
766 return rc;
767}
768
769
770/**
771 * Request wrapper for the GVMMR0CreateVM API.
772 *
773 * @returns VBox status code.
774 * @param pReq The request buffer.
775 * @param pSession The session handle. The VM will be associated with this.
776 */
777GVMMR0DECL(int) GVMMR0CreateVMReq(PGVMMCREATEVMREQ pReq, PSUPDRVSESSION pSession)
778{
779 /*
780 * Validate the request.
781 */
782 if (!RT_VALID_PTR(pReq))
783 return VERR_INVALID_POINTER;
784 if (pReq->Hdr.cbReq != sizeof(*pReq))
785 return VERR_INVALID_PARAMETER;
786 if (pReq->pSession != pSession)
787 return VERR_INVALID_POINTER;
788
789 /*
790 * Execute it.
791 */
792 PGVM pGVM;
793 pReq->pVMR0 = NULL;
794 pReq->pVMR3 = NIL_RTR3PTR;
795 int rc = GVMMR0CreateVM(pSession, pReq->cCpus, &pGVM);
796 if (RT_SUCCESS(rc))
797 {
798 pReq->pVMR0 = pGVM; /** @todo don't expose this to ring-3, use a unique random number instead. */
799 pReq->pVMR3 = pGVM->pVMR3;
800 }
801 return rc;
802}
803
804
805/**
806 * Allocates the VM structure and registers it with GVM.
807 *
808 * The caller will become the VM owner and thereby the EMT.
809 *
810 * @returns VBox status code.
811 * @param pSession The support driver session.
812 * @param cCpus Number of virtual CPUs for the new VM.
813 * @param ppGVM Where to store the pointer to the VM structure.
814 *
815 * @thread EMT.
816 */
817GVMMR0DECL(int) GVMMR0CreateVM(PSUPDRVSESSION pSession, uint32_t cCpus, PGVM *ppGVM)
818{
819 LogFlow(("GVMMR0CreateVM: pSession=%p\n", pSession));
820 PGVMM pGVMM;
821 GVMM_GET_VALID_INSTANCE(pGVMM, VERR_GVMM_INSTANCE);
822
823 AssertPtrReturn(ppGVM, VERR_INVALID_POINTER);
824 *ppGVM = NULL;
825
826 if ( cCpus == 0
827 || cCpus > VMM_MAX_CPU_COUNT)
828 return VERR_INVALID_PARAMETER;
829
830 RTNATIVETHREAD hEMT0 = RTThreadNativeSelf();
831 AssertReturn(hEMT0 != NIL_RTNATIVETHREAD, VERR_GVMM_BROKEN_IPRT);
832 RTPROCESS ProcId = RTProcSelf();
833 AssertReturn(ProcId != NIL_RTPROCESS, VERR_GVMM_BROKEN_IPRT);
834
835 /*
836 * The whole allocation process is protected by the lock.
837 */
838 int rc = gvmmR0CreateDestroyLock(pGVMM);
839 AssertRCReturn(rc, rc);
840
841 /*
842 * Only one VM per session.
843 */
844 if (SUPR0GetSessionVM(pSession) != NULL)
845 {
846 gvmmR0CreateDestroyUnlock(pGVMM);
847 SUPR0Printf("GVMMR0CreateVM: The session %p already got a VM: %p\n", pSession, SUPR0GetSessionVM(pSession));
848 return VERR_ALREADY_EXISTS;
849 }
850
851 /*
852 * Allocate a handle first so we don't waste resources unnecessarily.
853 */
854 uint16_t iHandle = pGVMM->iFreeHead;
855 if (iHandle)
856 {
857 PGVMHANDLE pHandle = &pGVMM->aHandles[iHandle];
858
859 /* consistency checks, a bit paranoid as always. */
860 if ( !pHandle->pGVM
861 && !pHandle->pvObj
862 && pHandle->iSelf == iHandle)
863 {
864 pHandle->pvObj = SUPR0ObjRegister(pSession, SUPDRVOBJTYPE_VM, gvmmR0HandleObjDestructor, pGVMM, pHandle);
865 if (pHandle->pvObj)
866 {
867 /*
868 * Move the handle from the free to used list and perform permission checks.
869 */
870 rc = GVMMR0_USED_EXCLUSIVE_LOCK(pGVMM);
871 AssertRC(rc);
872
873 pGVMM->iFreeHead = pHandle->iNext;
874 pHandle->iNext = pGVMM->iUsedHead;
875 pGVMM->iUsedHead = iHandle;
876 pGVMM->cVMs++;
877
878 pHandle->pGVM = NULL;
879 pHandle->pSession = pSession;
880 pHandle->hEMT0 = NIL_RTNATIVETHREAD;
881 pHandle->ProcId = NIL_RTPROCESS;
882
883 GVMMR0_USED_EXCLUSIVE_UNLOCK(pGVMM);
884
885 rc = SUPR0ObjVerifyAccess(pHandle->pvObj, pSession, NULL);
886 if (RT_SUCCESS(rc))
887 {
888 /*
889 * Allocate memory for the VM structure (combined VM + GVM).
890 */
891 const uint32_t cbVM = RT_UOFFSETOF_DYN(GVM, aCpus[cCpus]);
892 const uint32_t cPages = RT_ALIGN_32(cbVM, PAGE_SIZE) >> PAGE_SHIFT;
893 RTR0MEMOBJ hVMMemObj = NIL_RTR0MEMOBJ;
894 rc = RTR0MemObjAllocPage(&hVMMemObj, cPages << PAGE_SHIFT, false /* fExecutable */);
895 if (RT_SUCCESS(rc))
896 {
897 PGVM pGVM = (PGVM)RTR0MemObjAddress(hVMMemObj);
898 AssertPtr(pGVM);
899
900 /*
901 * Initialise the structure.
902 */
903 RT_BZERO(pGVM, cPages << PAGE_SHIFT);
904 gvmmR0InitPerVMData(pGVM, iHandle, cCpus, pSession);
905 pGVM->gvmm.s.VMMemObj = hVMMemObj;
906 rc = GMMR0InitPerVMData(pGVM);
907 int rc2 = PGMR0InitPerVMData(pGVM);
908 PDMR0InitPerVMData(pGVM);
909 IOMR0InitPerVMData(pGVM);
910 if (RT_SUCCESS(rc) && RT_SUCCESS(rc2))
911 {
912 /*
913 * Allocate page array.
914 * This currently has to be made available to ring-3, but this should change eventually.
915 */
916 rc = RTR0MemObjAllocPage(&pGVM->gvmm.s.VMPagesMemObj, cPages * sizeof(SUPPAGE), false /* fExecutable */);
917 if (RT_SUCCESS(rc))
918 {
919 PSUPPAGE paPages = (PSUPPAGE)RTR0MemObjAddress(pGVM->gvmm.s.VMPagesMemObj); AssertPtr(paPages);
920 for (uint32_t iPage = 0; iPage < cPages; iPage++)
921 {
922 paPages[iPage].uReserved = 0;
923 paPages[iPage].Phys = RTR0MemObjGetPagePhysAddr(pGVM->gvmm.s.VMMemObj, iPage);
924 Assert(paPages[iPage].Phys != NIL_RTHCPHYS);
925 }
926
927 /*
928 * Map the page array, VM and VMCPU structures into ring-3.
929 */
930 AssertCompileSizeAlignment(VM, PAGE_SIZE);
931 rc = RTR0MemObjMapUserEx(&pGVM->gvmm.s.VMMapObj, pGVM->gvmm.s.VMMemObj, (RTR3PTR)-1, 0,
932 RTMEM_PROT_READ | RTMEM_PROT_WRITE, NIL_RTR0PROCESS,
933 0 /*offSub*/, sizeof(VM));
934 for (VMCPUID i = 0; i < cCpus && RT_SUCCESS(rc); i++)
935 {
936 AssertCompileSizeAlignment(VMCPU, PAGE_SIZE);
937 rc = RTR0MemObjMapUserEx(&pGVM->aCpus[i].gvmm.s.VMCpuMapObj, pGVM->gvmm.s.VMMemObj,
938 (RTR3PTR)-1, 0, RTMEM_PROT_READ | RTMEM_PROT_WRITE, NIL_RTR0PROCESS,
939 RT_UOFFSETOF_DYN(GVM, aCpus[i]), sizeof(VMCPU));
940 }
941 if (RT_SUCCESS(rc))
942 rc = RTR0MemObjMapUser(&pGVM->gvmm.s.VMPagesMapObj, pGVM->gvmm.s.VMPagesMemObj, (RTR3PTR)-1,
943 0 /* uAlignment */, RTMEM_PROT_READ | RTMEM_PROT_WRITE,
944 NIL_RTR0PROCESS);
945 if (RT_SUCCESS(rc))
946 {
947 /*
948 * Initialize all the VM pointers.
949 */
950 PVMR3 pVMR3 = RTR0MemObjAddressR3(pGVM->gvmm.s.VMMapObj);
951 AssertPtr((void *)pVMR3);
952
953 for (VMCPUID i = 0; i < cCpus; i++)
954 {
955 pGVM->aCpus[i].pVMR0 = pGVM;
956 pGVM->aCpus[i].pVMR3 = pVMR3;
957 pGVM->apCpusR3[i] = RTR0MemObjAddressR3(pGVM->aCpus[i].gvmm.s.VMCpuMapObj);
958 pGVM->aCpus[i].pVCpuR3 = pGVM->apCpusR3[i];
959 pGVM->apCpusR0[i] = &pGVM->aCpus[i];
960 AssertPtr((void *)pGVM->apCpusR3[i]);
961 }
962
963 pGVM->paVMPagesR3 = RTR0MemObjAddressR3(pGVM->gvmm.s.VMPagesMapObj);
964 AssertPtr((void *)pGVM->paVMPagesR3);
965
966 /*
967 * Complete the handle - take the UsedLock just to be careful.
968 */
969 rc = GVMMR0_USED_EXCLUSIVE_LOCK(pGVMM);
970 AssertRC(rc);
971
972 pHandle->pGVM = pGVM;
973 pHandle->hEMT0 = hEMT0;
974 pHandle->ProcId = ProcId;
975 pGVM->pVMR3 = pVMR3;
976 pGVM->pVMR3Unsafe = pVMR3;
977 pGVM->aCpus[0].hEMT = hEMT0;
978 pGVM->aCpus[0].hNativeThreadR0 = hEMT0;
979 pGVMM->cEMTs += cCpus;
980
981 /* Associate it with the session and create the context hook for EMT0. */
982 rc = SUPR0SetSessionVM(pSession, pGVM, pGVM);
983 if (RT_SUCCESS(rc))
984 {
985 rc = VMMR0ThreadCtxHookCreateForEmt(&pGVM->aCpus[0]);
986 if (RT_SUCCESS(rc))
987 {
988 /*
989 * Done!
990 */
991 VBOXVMM_R0_GVMM_VM_CREATED(pGVM, pGVM, ProcId, (void *)hEMT0, cCpus);
992
993 GVMMR0_USED_EXCLUSIVE_UNLOCK(pGVMM);
994 gvmmR0CreateDestroyUnlock(pGVMM);
995
996 CPUMR0RegisterVCpuThread(&pGVM->aCpus[0]);
997
998 *ppGVM = pGVM;
999 Log(("GVMMR0CreateVM: pVMR3=%p pGVM=%p hGVM=%d\n", pVMR3, pGVM, iHandle));
1000 return VINF_SUCCESS;
1001 }
1002
1003 SUPR0SetSessionVM(pSession, NULL, NULL);
1004 }
1005 GVMMR0_USED_EXCLUSIVE_UNLOCK(pGVMM);
1006 }
1007
1008 /* Cleanup mappings. */
1009 if (pGVM->gvmm.s.VMMapObj != NIL_RTR0MEMOBJ)
1010 {
1011 RTR0MemObjFree(pGVM->gvmm.s.VMMapObj, false /* fFreeMappings */);
1012 pGVM->gvmm.s.VMMapObj = NIL_RTR0MEMOBJ;
1013 }
1014 for (VMCPUID i = 0; i < cCpus; i++)
1015 if (pGVM->aCpus[i].gvmm.s.VMCpuMapObj != NIL_RTR0MEMOBJ)
1016 {
1017 RTR0MemObjFree(pGVM->aCpus[i].gvmm.s.VMCpuMapObj, false /* fFreeMappings */);
1018 pGVM->aCpus[i].gvmm.s.VMCpuMapObj = NIL_RTR0MEMOBJ;
1019 }
1020 if (pGVM->gvmm.s.VMPagesMapObj != NIL_RTR0MEMOBJ)
1021 {
1022 RTR0MemObjFree(pGVM->gvmm.s.VMPagesMapObj, false /* fFreeMappings */);
1023 pGVM->gvmm.s.VMPagesMapObj = NIL_RTR0MEMOBJ;
1024 }
1025 }
1026 }
1027 else if (RT_SUCCESS(rc))
1028 rc = rc2;
1029 }
1030 }
1031 /* else: The user wasn't permitted to create this VM. */
1032
1033 /*
1034 * The handle will be freed by gvmmR0HandleObjDestructor as we release the
1035 * object reference here. A little extra mess because of the non-recursive lock.
1036 */
1037 void *pvObj = pHandle->pvObj;
1038 pHandle->pvObj = NULL;
1039 gvmmR0CreateDestroyUnlock(pGVMM);
1040
1041 SUPR0ObjRelease(pvObj, pSession);
1042
1043 SUPR0Printf("GVMMR0CreateVM: failed, rc=%Rrc\n", rc);
1044 return rc;
1045 }
1046
1047 rc = VERR_NO_MEMORY;
1048 }
1049 else
1050 rc = VERR_GVMM_IPE_1;
1051 }
1052 else
1053 rc = VERR_GVM_TOO_MANY_VMS;
1054
1055 gvmmR0CreateDestroyUnlock(pGVMM);
1056 return rc;
1057}
1058
1059
1060/**
1061 * Initializes the per VM data belonging to GVMM.
1062 *
1063 * @param pGVM Pointer to the global VM structure.
1064 * @param hSelf The handle.
1065 * @param cCpus The CPU count.
1066 * @param pSession The session this VM is associated with.
1067 */
1068static void gvmmR0InitPerVMData(PGVM pGVM, int16_t hSelf, VMCPUID cCpus, PSUPDRVSESSION pSession)
1069{
1070 AssertCompile(RT_SIZEOFMEMB(GVM,gvmm.s) <= RT_SIZEOFMEMB(GVM,gvmm.padding));
1071 AssertCompile(RT_SIZEOFMEMB(GVMCPU,gvmm.s) <= RT_SIZEOFMEMB(GVMCPU,gvmm.padding));
1072 AssertCompileMemberAlignment(VM, cpum, 64);
1073 AssertCompileMemberAlignment(VM, tm, 64);
1074
1075 /* GVM: */
1076 pGVM->u32Magic = GVM_MAGIC;
1077 pGVM->hSelf = hSelf;
1078 pGVM->cCpus = cCpus;
1079 pGVM->pSession = pSession;
1080 pGVM->pSelf = pGVM;
1081
1082 /* VM: */
1083 pGVM->enmVMState = VMSTATE_CREATING;
1084 pGVM->hSelfUnsafe = hSelf;
1085 pGVM->pSessionUnsafe = pSession;
1086 pGVM->pVMR0ForCall = pGVM;
1087 pGVM->cCpusUnsafe = cCpus;
1088 pGVM->uCpuExecutionCap = 100; /* default is no cap. */
1089 pGVM->uStructVersion = 1;
1090 pGVM->cbSelf = sizeof(VM);
1091 pGVM->cbVCpu = sizeof(VMCPU);
1092
1093 /* GVMM: */
1094 pGVM->gvmm.s.VMMemObj = NIL_RTR0MEMOBJ;
1095 pGVM->gvmm.s.VMMapObj = NIL_RTR0MEMOBJ;
1096 pGVM->gvmm.s.VMPagesMemObj = NIL_RTR0MEMOBJ;
1097 pGVM->gvmm.s.VMPagesMapObj = NIL_RTR0MEMOBJ;
1098 pGVM->gvmm.s.fDoneVMMR0Init = false;
1099 pGVM->gvmm.s.fDoneVMMR0Term = false;
1100
1101 /*
1102 * Per virtual CPU.
1103 */
1104 for (VMCPUID i = 0; i < pGVM->cCpus; i++)
1105 {
1106 pGVM->aCpus[i].idCpu = i;
1107 pGVM->aCpus[i].idCpuUnsafe = i;
1108 pGVM->aCpus[i].gvmm.s.HaltEventMulti = NIL_RTSEMEVENTMULTI;
1109 pGVM->aCpus[i].gvmm.s.VMCpuMapObj = NIL_RTR0MEMOBJ;
1110 pGVM->aCpus[i].hEMT = NIL_RTNATIVETHREAD;
1111 pGVM->aCpus[i].pGVM = pGVM;
1112 pGVM->aCpus[i].idHostCpu = NIL_RTCPUID;
1113 pGVM->aCpus[i].iHostCpuSet = UINT32_MAX;
1114 pGVM->aCpus[i].hNativeThread = NIL_RTNATIVETHREAD;
1115 pGVM->aCpus[i].hNativeThreadR0 = NIL_RTNATIVETHREAD;
1116 pGVM->aCpus[i].enmState = VMCPUSTATE_STOPPED;
1117 pGVM->aCpus[i].pVCpuR0ForVtg = &pGVM->aCpus[i];
1118 }
1119}
1120
1121
1122/**
1123 * Does the VM initialization.
1124 *
1125 * @returns VBox status code.
1126 * @param pGVM The global (ring-0) VM structure.
1127 */
1128GVMMR0DECL(int) GVMMR0InitVM(PGVM pGVM)
1129{
1130 LogFlow(("GVMMR0InitVM: pGVM=%p\n", pGVM));
1131
1132 int rc = VERR_INTERNAL_ERROR_3;
1133 if ( !pGVM->gvmm.s.fDoneVMMR0Init
1134 && pGVM->aCpus[0].gvmm.s.HaltEventMulti == NIL_RTSEMEVENTMULTI)
1135 {
1136 for (VMCPUID i = 0; i < pGVM->cCpus; i++)
1137 {
1138 rc = RTSemEventMultiCreate(&pGVM->aCpus[i].gvmm.s.HaltEventMulti);
1139 if (RT_FAILURE(rc))
1140 {
1141 pGVM->aCpus[i].gvmm.s.HaltEventMulti = NIL_RTSEMEVENTMULTI;
1142 break;
1143 }
1144 }
1145 }
1146 else
1147 rc = VERR_WRONG_ORDER;
1148
1149 LogFlow(("GVMMR0InitVM: returns %Rrc\n", rc));
1150 return rc;
1151}
1152
1153
1154/**
1155 * Indicates that we're done with the ring-0 initialization
1156 * of the VM.
1157 *
1158 * @param pGVM The global (ring-0) VM structure.
1159 * @thread EMT(0)
1160 */
1161GVMMR0DECL(void) GVMMR0DoneInitVM(PGVM pGVM)
1162{
1163 /* Set the indicator. */
1164 pGVM->gvmm.s.fDoneVMMR0Init = true;
1165}
1166
1167
1168/**
1169 * Indicates that we're doing the ring-0 termination of the VM.
1170 *
1171 * @returns true if termination hasn't been done already, false if it has.
1172 * @param pGVM Pointer to the global VM structure. Optional.
1173 * @thread EMT(0) or session cleanup thread.
1174 */
1175GVMMR0DECL(bool) GVMMR0DoingTermVM(PGVM pGVM)
1176{
1177 /* Validate the VM structure, state and handle. */
1178 AssertPtrReturn(pGVM, false);
1179
1180 /* Set the indicator. */
1181 if (pGVM->gvmm.s.fDoneVMMR0Term)
1182 return false;
1183 pGVM->gvmm.s.fDoneVMMR0Term = true;
1184 return true;
1185}
1186
1187
1188/**
1189 * Destroys the VM, freeing all associated resources (the ring-0 ones anyway).
1190 *
1191 * This is called from vmR3DestroyFinalBit and from an error path in VMR3Create,
1192 * and the caller is not the EMT thread, unfortunately. For security reasons, it
1193 * would've been nice if the caller was actually the EMT thread or that we somehow
1194 * could've associated the calling thread with the VM up front.
1195 *
1196 * @returns VBox status code.
1197 * @param pGVM The global (ring-0) VM structure.
1198 *
1199 * @thread EMT(0) if it's associated with the VM, otherwise any thread.
1200 */
1201GVMMR0DECL(int) GVMMR0DestroyVM(PGVM pGVM)
1202{
1203 LogFlow(("GVMMR0DestroyVM: pGVM=%p\n", pGVM));
1204 PGVMM pGVMM;
1205 GVMM_GET_VALID_INSTANCE(pGVMM, VERR_GVMM_INSTANCE);
1206
1207 /*
1208 * Validate the VM structure, state and caller.
1209 */
1210 AssertPtrReturn(pGVM, VERR_INVALID_POINTER);
1211 AssertReturn(!((uintptr_t)pGVM & PAGE_OFFSET_MASK), VERR_INVALID_POINTER);
1212 AssertMsgReturn(pGVM->enmVMState >= VMSTATE_CREATING && pGVM->enmVMState <= VMSTATE_TERMINATED, ("%d\n", pGVM->enmVMState),
1213 VERR_WRONG_ORDER);
1214
1215 uint32_t hGVM = pGVM->hSelf;
1216 ASMCompilerBarrier();
1217 AssertReturn(hGVM != NIL_GVM_HANDLE, VERR_INVALID_VM_HANDLE);
1218 AssertReturn(hGVM < RT_ELEMENTS(pGVMM->aHandles), VERR_INVALID_VM_HANDLE);
1219
1220 PGVMHANDLE pHandle = &pGVMM->aHandles[hGVM];
1221 AssertReturn(pHandle->pGVM == pGVM, VERR_NOT_OWNER);
1222
1223 RTPROCESS ProcId = RTProcSelf();
1224 RTNATIVETHREAD hSelf = RTThreadNativeSelf();
1225 AssertReturn( ( pHandle->hEMT0 == hSelf
1226 && pHandle->ProcId == ProcId)
1227 || pHandle->hEMT0 == NIL_RTNATIVETHREAD, VERR_NOT_OWNER);
1228
1229 /*
1230 * Lookup the handle and destroy the object.
1231 * Since the lock isn't recursive and we'll have to leave it before dereferencing the
1232 * object, we take some precautions against racing callers just in case...
1233 */
1234 int rc = gvmmR0CreateDestroyLock(pGVMM);
1235 AssertRC(rc);
1236
1237 /* Be careful here because we might theoretically be racing someone else cleaning up. */
1238 if ( pHandle->pGVM == pGVM
1239 && ( ( pHandle->hEMT0 == hSelf
1240 && pHandle->ProcId == ProcId)
1241 || pHandle->hEMT0 == NIL_RTNATIVETHREAD)
1242 && RT_VALID_PTR(pHandle->pvObj)
1243 && RT_VALID_PTR(pHandle->pSession)
1244 && RT_VALID_PTR(pHandle->pGVM)
1245 && pHandle->pGVM->u32Magic == GVM_MAGIC)
1246 {
1247 /* Check that other EMTs have deregistered. */
1248 uint32_t cNotDeregistered = 0;
1249 for (VMCPUID idCpu = 1; idCpu < pGVM->cCpus; idCpu++)
1250 cNotDeregistered += pGVM->aCpus[idCpu].hEMT != ~(RTNATIVETHREAD)1; /* see GVMMR0DeregisterVCpu for the value */
1251 if (cNotDeregistered == 0)
1252 {
1253 /* Grab the object pointer. */
1254 void *pvObj = pHandle->pvObj;
1255 pHandle->pvObj = NULL;
1256 gvmmR0CreateDestroyUnlock(pGVMM);
1257
1258 SUPR0ObjRelease(pvObj, pHandle->pSession);
1259 }
1260 else
1261 {
1262 gvmmR0CreateDestroyUnlock(pGVMM);
1263 rc = VERR_GVMM_NOT_ALL_EMTS_DEREGISTERED;
1264 }
1265 }
1266 else
1267 {
1268 SUPR0Printf("GVMMR0DestroyVM: pHandle=%RKv:{.pGVM=%p, .hEMT0=%p, .ProcId=%u, .pvObj=%p} pGVM=%p hSelf=%p\n",
1269 pHandle, pHandle->pGVM, pHandle->hEMT0, pHandle->ProcId, pHandle->pvObj, pGVM, hSelf);
1270 gvmmR0CreateDestroyUnlock(pGVMM);
1271 rc = VERR_GVMM_IPE_2;
1272 }
1273
1274 return rc;
1275}
1276
1277
1278/**
1279 * Performs VM cleanup tasks as part of object destruction.
1280 *
1281 * @param pGVM The GVM pointer.
1282 */
1283static void gvmmR0CleanupVM(PGVM pGVM)
1284{
1285 if ( pGVM->gvmm.s.fDoneVMMR0Init
1286 && !pGVM->gvmm.s.fDoneVMMR0Term)
1287 {
1288 if ( pGVM->gvmm.s.VMMemObj != NIL_RTR0MEMOBJ
1289 && RTR0MemObjAddress(pGVM->gvmm.s.VMMemObj) == pGVM)
1290 {
1291 LogFlow(("gvmmR0CleanupVM: Calling VMMR0TermVM\n"));
1292 VMMR0TermVM(pGVM, NIL_VMCPUID);
1293 }
1294 else
1295 AssertMsgFailed(("gvmmR0CleanupVM: VMMemObj=%p pGVM=%p\n", pGVM->gvmm.s.VMMemObj, pGVM));
1296 }
1297
1298 GMMR0CleanupVM(pGVM);
1299#ifdef VBOX_WITH_NEM_R0
1300 NEMR0CleanupVM(pGVM);
1301#endif
1302 PDMR0CleanupVM(pGVM);
1303 IOMR0CleanupVM(pGVM);
1304 PGMR0CleanupVM(pGVM);
1305
1306 AssertCompile(NIL_RTTHREADCTXHOOK == (RTTHREADCTXHOOK)0); /* Depends on zero initialized memory working for NIL at the moment. */
1307 for (VMCPUID idCpu = 0; idCpu < pGVM->cCpus; idCpu++)
1308 {
1309 /** @todo Can we busy wait here for all thread-context hooks to be
1310 * deregistered before releasing (destroying) it? Only until we find a
1311 * solution for not deregistering hooks every time we're leaving HMR0
1312 * context. */
1313 VMMR0ThreadCtxHookDestroyForEmt(&pGVM->aCpus[idCpu]);
1314 }
1315}
1316
1317
1318/**
1319 * @callback_method_impl{FNSUPDRVDESTRUCTOR,VM handle destructor}
1320 *
1321 * pvUser1 is the GVM instance pointer.
1322 * pvUser2 is the handle pointer.
1323 */
1324static DECLCALLBACK(void) gvmmR0HandleObjDestructor(void *pvObj, void *pvUser1, void *pvUser2)
1325{
1326 LogFlow(("gvmmR0HandleObjDestructor: %p %p %p\n", pvObj, pvUser1, pvUser2));
1327
1328 NOREF(pvObj);
1329
1330 /*
1331 * Some quick, paranoid, input validation.
1332 */
1333 PGVMHANDLE pHandle = (PGVMHANDLE)pvUser2;
1334 AssertPtr(pHandle);
1335 PGVMM pGVMM = (PGVMM)pvUser1;
1336 Assert(pGVMM == g_pGVMM);
1337 const uint16_t iHandle = pHandle - &pGVMM->aHandles[0];
1338 if ( !iHandle
1339 || iHandle >= RT_ELEMENTS(pGVMM->aHandles)
1340 || iHandle != pHandle->iSelf)
1341 {
1342 SUPR0Printf("GVM: handle %d is out of range or corrupt (iSelf=%d)!\n", iHandle, pHandle->iSelf);
1343 return;
1344 }
1345
1346 int rc = gvmmR0CreateDestroyLock(pGVMM);
1347 AssertRC(rc);
1348 rc = GVMMR0_USED_EXCLUSIVE_LOCK(pGVMM);
1349 AssertRC(rc);
1350
1351 /*
1352 * This is a tad slow but a doubly linked list is too much hassle.
1353 */
1354 if (RT_UNLIKELY(pHandle->iNext >= RT_ELEMENTS(pGVMM->aHandles)))
1355 {
1356 SUPR0Printf("GVM: used list index %d is out of range!\n", pHandle->iNext);
1357 GVMMR0_USED_EXCLUSIVE_UNLOCK(pGVMM);
1358 gvmmR0CreateDestroyUnlock(pGVMM);
1359 return;
1360 }
1361
1362 if (pGVMM->iUsedHead == iHandle)
1363 pGVMM->iUsedHead = pHandle->iNext;
1364 else
1365 {
1366 uint16_t iPrev = pGVMM->iUsedHead;
1367 int c = RT_ELEMENTS(pGVMM->aHandles) + 2;
1368 while (iPrev)
1369 {
1370 if (RT_UNLIKELY(iPrev >= RT_ELEMENTS(pGVMM->aHandles)))
1371 {
1372 SUPR0Printf("GVM: used list index %d is out of range!\n", iPrev);
1373 GVMMR0_USED_EXCLUSIVE_UNLOCK(pGVMM);
1374 gvmmR0CreateDestroyUnlock(pGVMM);
1375 return;
1376 }
1377 if (RT_UNLIKELY(c-- <= 0))
1378 {
1379 iPrev = 0;
1380 break;
1381 }
1382
1383 if (pGVMM->aHandles[iPrev].iNext == iHandle)
1384 break;
1385 iPrev = pGVMM->aHandles[iPrev].iNext;
1386 }
1387 if (!iPrev)
1388 {
1389 SUPR0Printf("GVM: can't find the previous handle of %d!\n", pHandle->iSelf);
1390 GVMMR0_USED_EXCLUSIVE_UNLOCK(pGVMM);
1391 gvmmR0CreateDestroyUnlock(pGVMM);
1392 return;
1393 }
1394
1395 Assert(pGVMM->aHandles[iPrev].iNext == iHandle);
1396 pGVMM->aHandles[iPrev].iNext = pHandle->iNext;
1397 }
1398 pHandle->iNext = 0;
1399 pGVMM->cVMs--;
1400
1401 /*
1402 * Do the global cleanup round.
1403 */
1404 PGVM pGVM = pHandle->pGVM;
1405 if ( RT_VALID_PTR(pGVM)
1406 && pGVM->u32Magic == GVM_MAGIC)
1407 {
1408 pGVMM->cEMTs -= pGVM->cCpus;
1409
1410 if (pGVM->pSession)
1411 SUPR0SetSessionVM(pGVM->pSession, NULL, NULL);
1412
1413 GVMMR0_USED_EXCLUSIVE_UNLOCK(pGVMM);
1414
1415 gvmmR0CleanupVM(pGVM);
1416
1417 /*
1418 * Do the GVMM cleanup - must be done last.
1419 */
1420 /* The VM and VM pages mappings/allocations. */
1421 if (pGVM->gvmm.s.VMPagesMapObj != NIL_RTR0MEMOBJ)
1422 {
1423 rc = RTR0MemObjFree(pGVM->gvmm.s.VMPagesMapObj, false /* fFreeMappings */); AssertRC(rc);
1424 pGVM->gvmm.s.VMPagesMapObj = NIL_RTR0MEMOBJ;
1425 }
1426
1427 if (pGVM->gvmm.s.VMMapObj != NIL_RTR0MEMOBJ)
1428 {
1429 rc = RTR0MemObjFree(pGVM->gvmm.s.VMMapObj, false /* fFreeMappings */); AssertRC(rc);
1430 pGVM->gvmm.s.VMMapObj = NIL_RTR0MEMOBJ;
1431 }
1432
1433 if (pGVM->gvmm.s.VMPagesMemObj != NIL_RTR0MEMOBJ)
1434 {
1435 rc = RTR0MemObjFree(pGVM->gvmm.s.VMPagesMemObj, false /* fFreeMappings */); AssertRC(rc);
1436 pGVM->gvmm.s.VMPagesMemObj = NIL_RTR0MEMOBJ;
1437 }
1438
1439 for (VMCPUID i = 0; i < pGVM->cCpus; i++)
1440 {
1441 if (pGVM->aCpus[i].gvmm.s.HaltEventMulti != NIL_RTSEMEVENTMULTI)
1442 {
1443 rc = RTSemEventMultiDestroy(pGVM->aCpus[i].gvmm.s.HaltEventMulti); AssertRC(rc);
1444 pGVM->aCpus[i].gvmm.s.HaltEventMulti = NIL_RTSEMEVENTMULTI;
1445 }
1446 if (pGVM->aCpus[i].gvmm.s.VMCpuMapObj != NIL_RTR0MEMOBJ)
1447 {
1448 rc = RTR0MemObjFree(pGVM->aCpus[i].gvmm.s.VMCpuMapObj, false /* fFreeMappings */); AssertRC(rc);
1449 pGVM->aCpus[i].gvmm.s.VMCpuMapObj = NIL_RTR0MEMOBJ;
1450 }
1451 }
1452
1453 /* the GVM structure itself. */
1454 pGVM->u32Magic |= UINT32_C(0x80000000);
1455 Assert(pGVM->gvmm.s.VMMemObj != NIL_RTR0MEMOBJ);
1456 rc = RTR0MemObjFree(pGVM->gvmm.s.VMMemObj, true /*fFreeMappings*/); AssertRC(rc);
1457 pGVM = NULL;
1458
1459 /* Re-acquire the UsedLock before freeing the handle since we're updating handle fields. */
1460 rc = GVMMR0_USED_EXCLUSIVE_LOCK(pGVMM);
1461 AssertRC(rc);
1462 }
1463 /* else: GVMMR0CreateVM cleanup. */
1464
1465 /*
1466 * Free the handle.
1467 */
1468 pHandle->iNext = pGVMM->iFreeHead;
1469 pGVMM->iFreeHead = iHandle;
1470 ASMAtomicWriteNullPtr(&pHandle->pGVM);
1471 ASMAtomicWriteNullPtr(&pHandle->pvObj);
1472 ASMAtomicWriteNullPtr(&pHandle->pSession);
1473 ASMAtomicWriteHandle(&pHandle->hEMT0, NIL_RTNATIVETHREAD);
1474 ASMAtomicWriteU32(&pHandle->ProcId, NIL_RTPROCESS);
1475
1476 GVMMR0_USED_EXCLUSIVE_UNLOCK(pGVMM);
1477 gvmmR0CreateDestroyUnlock(pGVMM);
1478 LogFlow(("gvmmR0HandleObjDestructor: returns\n"));
1479}
1480
1481
1482/**
1483 * Registers the calling thread as the EMT of a Virtual CPU.
1484 *
1485 * Note that VCPU 0 is automatically registered during VM creation.
1486 *
1487 * @returns VBox status code
1488 * @param pGVM The global (ring-0) VM structure.
1489 * @param idCpu VCPU id to register the current thread as.
1490 */
1491GVMMR0DECL(int) GVMMR0RegisterVCpu(PGVM pGVM, VMCPUID idCpu)
1492{
1493 AssertReturn(idCpu != 0, VERR_INVALID_FUNCTION);
1494
1495 /*
1496 * Validate the VM structure, state and handle.
1497 */
1498 PGVMM pGVMM;
1499 int rc = gvmmR0ByGVM(pGVM, &pGVMM, false /* fTakeUsedLock */); /** @todo take lock here. */
1500 if (RT_SUCCESS(rc))
1501 {
1502 if (idCpu < pGVM->cCpus)
1503 {
1504 /* Check that the EMT isn't already assigned to a thread. */
1505 if (pGVM->aCpus[idCpu].hEMT == NIL_RTNATIVETHREAD)
1506 {
1507 Assert(pGVM->aCpus[idCpu].hNativeThreadR0 == NIL_RTNATIVETHREAD);
1508
1509 /* A thread may only be one EMT. */
1510 RTNATIVETHREAD const hNativeSelf = RTThreadNativeSelf();
1511 for (VMCPUID iCpu = 0; iCpu < pGVM->cCpus; iCpu++)
1512 AssertBreakStmt(pGVM->aCpus[iCpu].hEMT != hNativeSelf, rc = VERR_INVALID_PARAMETER);
1513 if (RT_SUCCESS(rc))
1514 {
1515 /*
1516 * Do the assignment, then try setup the hook. Undo if that fails.
1517 */
1518 pGVM->aCpus[idCpu].hNativeThreadR0 = pGVM->aCpus[idCpu].hEMT = RTThreadNativeSelf();
1519
1520 rc = VMMR0ThreadCtxHookCreateForEmt(&pGVM->aCpus[idCpu]);
1521 if (RT_SUCCESS(rc))
1522 CPUMR0RegisterVCpuThread(&pGVM->aCpus[idCpu]);
1523 else
1524 pGVM->aCpus[idCpu].hNativeThreadR0 = pGVM->aCpus[idCpu].hEMT = NIL_RTNATIVETHREAD;
1525 }
1526 }
1527 else
1528 rc = VERR_ACCESS_DENIED;
1529 }
1530 else
1531 rc = VERR_INVALID_CPU_ID;
1532 }
1533 return rc;
1534}
1535
1536
1537/**
1538 * Deregisters the calling thread as the EMT of a Virtual CPU.
1539 *
1540 * Note that VCPU 0 shall call GVMMR0DestroyVM instead of this API.
1541 *
1542 * @returns VBox status code
1543 * @param pGVM The global (ring-0) VM structure.
1544 * @param idCpu VCPU id to deregister the current thread from.
1545 */
1546GVMMR0DECL(int) GVMMR0DeregisterVCpu(PGVM pGVM, VMCPUID idCpu)
1547{
1548 AssertReturn(idCpu != 0, VERR_INVALID_FUNCTION);
1549
1550 /*
1551 * Validate the VM structure, state and handle.
1552 */
1553 PGVMM pGVMM;
1554 int rc = gvmmR0ByGVMandEMT(pGVM, idCpu, &pGVMM);
1555 if (RT_SUCCESS(rc))
1556 {
1557 /*
1558 * Take the destruction lock and recheck the handle state to
1559 * prevent racing GVMMR0DestroyVM.
1560 */
1561 gvmmR0CreateDestroyLock(pGVMM);
1562 uint32_t hSelf = pGVM->hSelf;
1563 ASMCompilerBarrier();
1564 if ( hSelf < RT_ELEMENTS(pGVMM->aHandles)
1565 && pGVMM->aHandles[hSelf].pvObj != NULL
1566 && pGVMM->aHandles[hSelf].pGVM == pGVM)
1567 {
1568 /*
1569 * Do per-EMT cleanups.
1570 */
1571 VMMR0ThreadCtxHookDestroyForEmt(&pGVM->aCpus[idCpu]);
1572
1573 /*
1574 * Invalidate hEMT. We don't use NIL here as that would allow
1575 * GVMMR0RegisterVCpu to be called again, and we don't want that.
1576 */
1577 AssertCompile(~(RTNATIVETHREAD)1 != NIL_RTNATIVETHREAD);
1578 pGVM->aCpus[idCpu].hEMT = ~(RTNATIVETHREAD)1;
1579 pGVM->aCpus[idCpu].hNativeThreadR0 = NIL_RTNATIVETHREAD;
1580 }
1581
1582 gvmmR0CreateDestroyUnlock(pGVMM);
1583 }
1584 return rc;
1585}
1586
1587
1588/**
1589 * Lookup a GVM structure by its handle.
1590 *
1591 * @returns The GVM pointer on success, NULL on failure.
1592 * @param hGVM The global VM handle. Asserts on bad handle.
1593 */
1594GVMMR0DECL(PGVM) GVMMR0ByHandle(uint32_t hGVM)
1595{
1596 PGVMM pGVMM;
1597 GVMM_GET_VALID_INSTANCE(pGVMM, NULL);
1598
1599 /*
1600 * Validate.
1601 */
1602 AssertReturn(hGVM != NIL_GVM_HANDLE, NULL);
1603 AssertReturn(hGVM < RT_ELEMENTS(pGVMM->aHandles), NULL);
1604
1605 /*
1606 * Look it up.
1607 */
1608 PGVMHANDLE pHandle = &pGVMM->aHandles[hGVM];
1609 AssertPtrReturn(pHandle->pvObj, NULL);
1610 PGVM pGVM = pHandle->pGVM;
1611 AssertPtrReturn(pGVM, NULL);
1612
1613 return pGVM;
1614}
1615
1616
1617/**
1618 * Check that the given GVM and VM structures match up.
1619 *
1620 * The calling thread must be in the same process as the VM. All current lookups
1621 * are by threads inside the same process, so this will not be an issue.
1622 *
1623 * @returns VBox status code.
1624 * @param pGVM The global (ring-0) VM structure.
1625 * @param ppGVMM Where to store the pointer to the GVMM instance data.
1626 * @param fTakeUsedLock Whether to take the used lock or not. We take it in
1627 * shared mode when requested.
1628 *
1629 * Be very careful if not taking the lock as it's
1630 * possible that the VM will disappear then!
1631 *
1632 * @remark This will not assert on an invalid pGVM but try to return silently.
1633 */
1634static int gvmmR0ByGVM(PGVM pGVM, PGVMM *ppGVMM, bool fTakeUsedLock)
1635{
1636 /*
1637 * Check the pointers.
1638 */
1639 int rc;
1640 if (RT_LIKELY( RT_VALID_PTR(pGVM)
1641 && ((uintptr_t)pGVM & PAGE_OFFSET_MASK) == 0 ))
1642 {
1643 /*
1644 * Get the pGVMM instance and check the VM handle.
1645 */
1646 PGVMM pGVMM;
1647 GVMM_GET_VALID_INSTANCE(pGVMM, VERR_GVMM_INSTANCE);
1648
1649 uint16_t hGVM = pGVM->hSelf;
1650 if (RT_LIKELY( hGVM != NIL_GVM_HANDLE
1651 && hGVM < RT_ELEMENTS(pGVMM->aHandles)))
1652 {
1653 RTPROCESS const pidSelf = RTProcSelf();
1654 PGVMHANDLE pHandle = &pGVMM->aHandles[hGVM];
1655 if (fTakeUsedLock)
1656 {
1657 rc = GVMMR0_USED_SHARED_LOCK(pGVMM);
1658 AssertRCReturn(rc, rc);
1659 }
1660
1661 if (RT_LIKELY( pHandle->pGVM == pGVM
1662 && pHandle->ProcId == pidSelf
1663 && RT_VALID_PTR(pHandle->pvObj)))
1664 {
1665 /*
1666 * Some more VM data consistency checks.
1667 */
1668 if (RT_LIKELY( pGVM->cCpusUnsafe == pGVM->cCpus
1669 && pGVM->hSelfUnsafe == hGVM
1670 && pGVM->pSelf == pGVM))
1671 {
1672 if (RT_LIKELY( pGVM->enmVMState >= VMSTATE_CREATING
1673 && pGVM->enmVMState <= VMSTATE_TERMINATED))
1674 {
1675 *ppGVMM = pGVMM;
1676 return VINF_SUCCESS;
1677 }
1678 rc = VERR_INCONSISTENT_VM_HANDLE;
1679 }
1680 else
1681 rc = VERR_INCONSISTENT_VM_HANDLE;
1682 }
1683 else
1684 rc = VERR_INVALID_VM_HANDLE;
1685
1686 if (fTakeUsedLock)
1687 GVMMR0_USED_SHARED_UNLOCK(pGVMM);
1688 }
1689 else
1690 rc = VERR_INVALID_VM_HANDLE;
1691 }
1692 else
1693 rc = VERR_INVALID_POINTER;
1694 return rc;
1695}
1696
1697
1698/**
1699 * Validates a GVM/VM pair.
1700 *
1701 * @returns VBox status code.
1702 * @param pGVM The global (ring-0) VM structure.
1703 */
1704GVMMR0DECL(int) GVMMR0ValidateGVM(PGVM pGVM)
1705{
1706 PGVMM pGVMM;
1707 return gvmmR0ByGVM(pGVM, &pGVMM, false /*fTakeUsedLock*/);
1708}
1709
1710
1711/**
1712 * Check that the given GVM and VM structures match up.
1713 *
1714 * The calling thread must be in the same process as the VM. All current lookups
1715 * are by threads inside the same process, so this will not be an issue.
1716 *
1717 * @returns VBox status code.
1718 * @param pGVM The global (ring-0) VM structure.
1719 * @param idCpu The (alleged) Virtual CPU ID of the calling EMT.
1720 * @param ppGVMM Where to store the pointer to the GVMM instance data.
1721 * @thread EMT
1722 *
1723 * @remarks This will assert in all failure paths.
1724 */
1725static int gvmmR0ByGVMandEMT(PGVM pGVM, VMCPUID idCpu, PGVMM *ppGVMM)
1726{
1727 /*
1728 * Check the pointers.
1729 */
1730 AssertPtrReturn(pGVM, VERR_INVALID_POINTER);
1731 AssertReturn(((uintptr_t)pGVM & PAGE_OFFSET_MASK) == 0, VERR_INVALID_POINTER);
1732
1733 /*
1734 * Get the pGVMM instance and check the VM handle.
1735 */
1736 PGVMM pGVMM;
1737 GVMM_GET_VALID_INSTANCE(pGVMM, VERR_GVMM_INSTANCE);
1738
1739 uint16_t hGVM = pGVM->hSelf;
1740 ASMCompilerBarrier();
1741 AssertReturn( hGVM != NIL_GVM_HANDLE
1742 && hGVM < RT_ELEMENTS(pGVMM->aHandles), VERR_INVALID_VM_HANDLE);
1743
1744 RTPROCESS const pidSelf = RTProcSelf();
1745 PGVMHANDLE pHandle = &pGVMM->aHandles[hGVM];
1746 AssertReturn( pHandle->pGVM == pGVM
1747 && pHandle->ProcId == pidSelf
1748 && RT_VALID_PTR(pHandle->pvObj),
1749 VERR_INVALID_HANDLE);
1750
1751 /*
1752 * Check the EMT claim.
1753 */
1754 RTNATIVETHREAD const hAllegedEMT = RTThreadNativeSelf();
1755 AssertReturn(idCpu < pGVM->cCpus, VERR_INVALID_CPU_ID);
1756 AssertReturn(pGVM->aCpus[idCpu].hEMT == hAllegedEMT, VERR_NOT_OWNER);
1757
1758 /*
1759 * Some more VM data consistency checks.
1760 */
1761 AssertReturn(pGVM->cCpusUnsafe == pGVM->cCpus, VERR_INCONSISTENT_VM_HANDLE);
1762 AssertReturn(pGVM->hSelfUnsafe == hGVM, VERR_INCONSISTENT_VM_HANDLE);
1763 AssertReturn( pGVM->enmVMState >= VMSTATE_CREATING
1764 && pGVM->enmVMState <= VMSTATE_TERMINATED, VERR_INCONSISTENT_VM_HANDLE);
1765
1766 *ppGVMM = pGVMM;
1767 return VINF_SUCCESS;
1768}
1769
1770
1771/**
1772 * Validates a GVM/EMT pair.
1773 *
1774 * @returns VBox status code.
1775 * @param pGVM The global (ring-0) VM structure.
1776 * @param idCpu The Virtual CPU ID of the calling EMT.
1777 * @thread EMT(idCpu)
1778 */
1779GVMMR0DECL(int) GVMMR0ValidateGVMandEMT(PGVM pGVM, VMCPUID idCpu)
1780{
1781 PGVMM pGVMM;
1782 return gvmmR0ByGVMandEMT(pGVM, idCpu, &pGVMM);
1783}
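
/*
 * Editor's illustrative sketch, not part of the original file: a hypothetical
 * ring-0 helper showing the call pattern the validation routines above are
 * meant for. The function name and the work it pretends to do are invented.
 */
static int gvmmR0SampleEmtOperation(PGVM pGVM, VMCPUID idCpu)
{
    /* Validate the GVM/EMT pair before touching any per-VCPU state. */
    PGVMM pGVMM;
    int rc = gvmmR0ByGVMandEMT(pGVM, idCpu, &pGVMM);
    if (RT_SUCCESS(rc))
    {
        /* Only now is it safe to assume aCpus[idCpu] is owned by this thread. */
        PGVMCPU pGVCpu = &pGVM->aCpus[idCpu];
        NOREF(pGVCpu);
    }
    return rc;
}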
1784
1785
1786/**
1787 * Looks up the VM belonging to the specified EMT thread.
1788 *
1789 * This is used by the assertion machinery in VMMR0.cpp to avoid causing
1790 * unnecessary kernel panics when the EMT thread hits an assertion. The
1791 * caller may or may not be an EMT thread.
1792 *
1793 * @returns Pointer to the VM on success, NULL on failure.
1794 * @param hEMT The native thread handle of the EMT.
1795 * NIL_RTNATIVETHREAD means the current thread
1796 */
1797GVMMR0DECL(PVMCC) GVMMR0GetVMByEMT(RTNATIVETHREAD hEMT)
1798{
1799 /*
1800 * No Assertions here as we're usually called in an AssertMsgN or
1801 * RTAssert* context.
1802 */
1803 PGVMM pGVMM = g_pGVMM;
1804 if ( !RT_VALID_PTR(pGVMM)
1805 || pGVMM->u32Magic != GVMM_MAGIC)
1806 return NULL;
1807
1808 if (hEMT == NIL_RTNATIVETHREAD)
1809 hEMT = RTThreadNativeSelf();
1810 RTPROCESS ProcId = RTProcSelf();
1811
1812 /*
1813 * Search the handles in a linear fashion as we don't dare to take the lock (assert).
1814 */
1815/** @todo introduce some pid hash table here, please. */
1816 for (unsigned i = 1; i < RT_ELEMENTS(pGVMM->aHandles); i++)
1817 {
1818 if ( pGVMM->aHandles[i].iSelf == i
1819 && pGVMM->aHandles[i].ProcId == ProcId
1820 && RT_VALID_PTR(pGVMM->aHandles[i].pvObj)
1821 && RT_VALID_PTR(pGVMM->aHandles[i].pGVM))
1822 {
1823 if (pGVMM->aHandles[i].hEMT0 == hEMT)
1824 return pGVMM->aHandles[i].pGVM;
1825
1826            /* This is fairly safe with the current process-per-VM approach. */
1827 PGVM pGVM = pGVMM->aHandles[i].pGVM;
1828 VMCPUID const cCpus = pGVM->cCpus;
1829 ASMCompilerBarrier();
1830 if ( cCpus < 1
1831 || cCpus > VMM_MAX_CPU_COUNT)
1832 continue;
1833 for (VMCPUID idCpu = 1; idCpu < cCpus; idCpu++)
1834 if (pGVM->aCpus[idCpu].hEMT == hEMT)
1835 return pGVMM->aHandles[i].pGVM;
1836 }
1837 }
1838 return NULL;
1839}
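
/*
 * Editor's illustrative sketch, not part of the original file: how assertion
 * or logging code might use the lock-free lookup above. SUPR0Printf is assumed
 * to be available in this context; the message text is made up.
 */
static void gvmmR0SampleAssertReport(void)
{
    PVMCC pVM = GVMMR0GetVMByEMT(NIL_RTNATIVETHREAD); /* current thread */
    if (pVM)
        SUPR0Printf("Assertion hit on an EMT of VM %p\n", pVM);
    else
        SUPR0Printf("Assertion hit on a non-EMT thread\n");
}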
1840
1841
1842/**
1843 * Looks up the GVMCPU belonging to the specified EMT thread.
1844 *
1845 * This is used by the assertion machinery in VMMR0.cpp to avoid causing
1846 * unnecessary kernel panics when the EMT thread hits an assertion. The
1847 * caller may or may not be an EMT thread.
1848 *
1849 * @returns Pointer to the VCPU structure on success, NULL on failure.
1850 * @param hEMT The native thread handle of the EMT.
1851 * NIL_RTNATIVETHREAD means the current thread
1852 */
1853GVMMR0DECL(PGVMCPU) GVMMR0GetGVCpuByEMT(RTNATIVETHREAD hEMT)
1854{
1855 /*
1856 * No Assertions here as we're usually called in an AssertMsgN,
1857 * RTAssert*, Log and LogRel contexts.
1858 */
1859 PGVMM pGVMM = g_pGVMM;
1860 if ( !RT_VALID_PTR(pGVMM)
1861 || pGVMM->u32Magic != GVMM_MAGIC)
1862 return NULL;
1863
1864 if (hEMT == NIL_RTNATIVETHREAD)
1865 hEMT = RTThreadNativeSelf();
1866 RTPROCESS ProcId = RTProcSelf();
1867
1868 /*
1869 * Search the handles in a linear fashion as we don't dare to take the lock (assert).
1870 */
1871/** @todo introduce some pid hash table here, please. */
1872 for (unsigned i = 1; i < RT_ELEMENTS(pGVMM->aHandles); i++)
1873 {
1874 if ( pGVMM->aHandles[i].iSelf == i
1875 && pGVMM->aHandles[i].ProcId == ProcId
1876 && RT_VALID_PTR(pGVMM->aHandles[i].pvObj)
1877 && RT_VALID_PTR(pGVMM->aHandles[i].pGVM))
1878 {
1879 PGVM pGVM = pGVMM->aHandles[i].pGVM;
1880 if (pGVMM->aHandles[i].hEMT0 == hEMT)
1881 return &pGVM->aCpus[0];
1882
1883            /* This is fairly safe with the current process-per-VM approach. */
1884 VMCPUID const cCpus = pGVM->cCpus;
1885 ASMCompilerBarrier();
1886 ASMCompilerBarrier();
1887 if ( cCpus < 1
1888 || cCpus > VMM_MAX_CPU_COUNT)
1889 continue;
1890 for (VMCPUID idCpu = 1; idCpu < cCpus; idCpu++)
1891 if (pGVM->aCpus[idCpu].hEMT == hEMT)
1892 return &pGVM->aCpus[idCpu];
1893 }
1894 }
1895 return NULL;
1896}
1897
1898
1899/**
1900 * This will wake up expired and soon-to-be-expired VMs.
1901 *
1902 * @returns The number of EMTs that have been woken up.
1903 * @param pGVMM Pointer to the GVMM instance data.
1904 * @param u64Now The current time.
1905 */
1906static unsigned gvmmR0SchedDoWakeUps(PGVMM pGVMM, uint64_t u64Now)
1907{
1908 /*
1909     * Skip this if we've been disabled, either because of high resolution
1910     * wake-ups or by the user.
1911 */
1912 if (!pGVMM->fDoEarlyWakeUps)
1913 return 0;
1914
1915/** @todo Rewrite this algorithm. See performance defect XYZ. */
1916
1917 /*
1918 * A cheap optimization to stop wasting so much time here on big setups.
1919 */
1920 const uint64_t uNsEarlyWakeUp2 = u64Now + pGVMM->nsEarlyWakeUp2;
1921 if ( pGVMM->cHaltedEMTs == 0
1922 || uNsEarlyWakeUp2 > pGVMM->uNsNextEmtWakeup)
1923 return 0;
1924
1925 /*
1926 * Only one thread doing this at a time.
1927 */
1928 if (!ASMAtomicCmpXchgBool(&pGVMM->fDoingEarlyWakeUps, true, false))
1929 return 0;
1930
1931 /*
1932 * The first pass will wake up VMs which have actually expired
1933 * and look for VMs that should be woken up in the 2nd and 3rd passes.
1934 */
1935 const uint64_t uNsEarlyWakeUp1 = u64Now + pGVMM->nsEarlyWakeUp1;
1936 uint64_t u64Min = UINT64_MAX;
1937 unsigned cWoken = 0;
1938 unsigned cHalted = 0;
1939 unsigned cTodo2nd = 0;
1940 unsigned cTodo3rd = 0;
1941 for (unsigned i = pGVMM->iUsedHead, cGuard = 0;
1942 i != NIL_GVM_HANDLE && i < RT_ELEMENTS(pGVMM->aHandles);
1943 i = pGVMM->aHandles[i].iNext)
1944 {
1945 PGVM pCurGVM = pGVMM->aHandles[i].pGVM;
1946 if ( RT_VALID_PTR(pCurGVM)
1947 && pCurGVM->u32Magic == GVM_MAGIC)
1948 {
1949 for (VMCPUID idCpu = 0; idCpu < pCurGVM->cCpus; idCpu++)
1950 {
1951 PGVMCPU pCurGVCpu = &pCurGVM->aCpus[idCpu];
1952 uint64_t u64 = ASMAtomicUoReadU64(&pCurGVCpu->gvmm.s.u64HaltExpire);
1953 if (u64)
1954 {
1955 if (u64 <= u64Now)
1956 {
1957 if (ASMAtomicXchgU64(&pCurGVCpu->gvmm.s.u64HaltExpire, 0))
1958 {
1959 int rc = RTSemEventMultiSignal(pCurGVCpu->gvmm.s.HaltEventMulti);
1960 AssertRC(rc);
1961 cWoken++;
1962 }
1963 }
1964 else
1965 {
1966 cHalted++;
1967 if (u64 <= uNsEarlyWakeUp1)
1968 cTodo2nd++;
1969 else if (u64 <= uNsEarlyWakeUp2)
1970 cTodo3rd++;
1971 else if (u64 < u64Min)
1972                        u64Min = u64; /* track the earliest remaining expiry */
1973 }
1974 }
1975 }
1976 }
1977 AssertLogRelBreak(cGuard++ < RT_ELEMENTS(pGVMM->aHandles));
1978 }
1979
1980 if (cTodo2nd)
1981 {
1982 for (unsigned i = pGVMM->iUsedHead, cGuard = 0;
1983 i != NIL_GVM_HANDLE && i < RT_ELEMENTS(pGVMM->aHandles);
1984 i = pGVMM->aHandles[i].iNext)
1985 {
1986 PGVM pCurGVM = pGVMM->aHandles[i].pGVM;
1987 if ( RT_VALID_PTR(pCurGVM)
1988 && pCurGVM->u32Magic == GVM_MAGIC)
1989 {
1990 for (VMCPUID idCpu = 0; idCpu < pCurGVM->cCpus; idCpu++)
1991 {
1992 PGVMCPU pCurGVCpu = &pCurGVM->aCpus[idCpu];
1993 uint64_t u64 = ASMAtomicUoReadU64(&pCurGVCpu->gvmm.s.u64HaltExpire);
1994 if ( u64
1995 && u64 <= uNsEarlyWakeUp1)
1996 {
1997 if (ASMAtomicXchgU64(&pCurGVCpu->gvmm.s.u64HaltExpire, 0))
1998 {
1999 int rc = RTSemEventMultiSignal(pCurGVCpu->gvmm.s.HaltEventMulti);
2000 AssertRC(rc);
2001 cWoken++;
2002 }
2003 }
2004 }
2005 }
2006 AssertLogRelBreak(cGuard++ < RT_ELEMENTS(pGVMM->aHandles));
2007 }
2008 }
2009
2010 if (cTodo3rd)
2011 {
2012 for (unsigned i = pGVMM->iUsedHead, cGuard = 0;
2013 i != NIL_GVM_HANDLE && i < RT_ELEMENTS(pGVMM->aHandles);
2014 i = pGVMM->aHandles[i].iNext)
2015 {
2016 PGVM pCurGVM = pGVMM->aHandles[i].pGVM;
2017 if ( RT_VALID_PTR(pCurGVM)
2018 && pCurGVM->u32Magic == GVM_MAGIC)
2019 {
2020 for (VMCPUID idCpu = 0; idCpu < pCurGVM->cCpus; idCpu++)
2021 {
2022 PGVMCPU pCurGVCpu = &pCurGVM->aCpus[idCpu];
2023 uint64_t u64 = ASMAtomicUoReadU64(&pCurGVCpu->gvmm.s.u64HaltExpire);
2024 if ( u64
2025 && u64 <= uNsEarlyWakeUp2)
2026 {
2027 if (ASMAtomicXchgU64(&pCurGVCpu->gvmm.s.u64HaltExpire, 0))
2028 {
2029 int rc = RTSemEventMultiSignal(pCurGVCpu->gvmm.s.HaltEventMulti);
2030 AssertRC(rc);
2031 cWoken++;
2032 }
2033 }
2034 }
2035 }
2036 AssertLogRelBreak(cGuard++ < RT_ELEMENTS(pGVMM->aHandles));
2037 }
2038 }
2039
2040 /*
2041 * Set the minimum value.
2042 */
2043 pGVMM->uNsNextEmtWakeup = u64Min;
2044
2045 ASMAtomicWriteBool(&pGVMM->fDoingEarlyWakeUps, false);
2046 return cWoken;
2047}
2048
2049
2050/**
2051 * Halt the EMT thread.
2052 *
2053 * @returns VINF_SUCCESS normal wakeup (timeout or kicked by other thread).
2054 * VERR_INTERRUPTED if a signal was scheduled for the thread.
2055 * @param pGVM The global (ring-0) VM structure.
2056 * @param pGVCpu The global (ring-0) CPU structure of the calling
2057 * EMT.
2058 * @param u64ExpireGipTime The time for the sleep to expire expressed as GIP time.
2059 * @thread EMT(pGVCpu).
2060 */
2061GVMMR0DECL(int) GVMMR0SchedHalt(PGVM pGVM, PGVMCPU pGVCpu, uint64_t u64ExpireGipTime)
2062{
2063 LogFlow(("GVMMR0SchedHalt: pGVM=%p pGVCpu=%p(%d) u64ExpireGipTime=%#RX64\n",
2064 pGVM, pGVCpu, pGVCpu->idCpu, u64ExpireGipTime));
2065 GVMM_CHECK_SMAP_SETUP();
2066 GVMM_CHECK_SMAP_CHECK2(pGVM, RT_NOTHING);
2067
2068 PGVMM pGVMM;
2069 GVMM_GET_VALID_INSTANCE(pGVMM, VERR_GVMM_INSTANCE);
2070
2071 pGVM->gvmm.s.StatsSched.cHaltCalls++;
2072 Assert(!pGVCpu->gvmm.s.u64HaltExpire);
2073
2074 /*
2075 * If we're doing early wake-ups, we must take the UsedList lock before we
2076 * start querying the current time.
2077 * Note! Interrupts must NOT be disabled at this point because we ask for GIP time!
2078 */
2079 bool const fDoEarlyWakeUps = pGVMM->fDoEarlyWakeUps;
2080 if (fDoEarlyWakeUps)
2081 {
2082 int rc2 = GVMMR0_USED_SHARED_LOCK(pGVMM); AssertRC(rc2);
2083 GVMM_CHECK_SMAP_CHECK2(pGVM, RT_NOTHING);
2084 }
2085
2086 pGVCpu->gvmm.s.iCpuEmt = ASMGetApicId();
2087
2088    /* GIP hack: We may frequently be sleeping for short intervals where the
2089       difference between GIP and system time matters on systems with high resolution
2090       system time. So, convert the input from GIP to system time in that case. */
2091 Assert(ASMGetFlags() & X86_EFL_IF);
2092 const uint64_t u64NowSys = RTTimeSystemNanoTS();
2093 const uint64_t u64NowGip = RTTimeNanoTS();
2094 GVMM_CHECK_SMAP_CHECK2(pGVM, RT_NOTHING);
2095
2096 if (fDoEarlyWakeUps)
2097 {
2098 pGVM->gvmm.s.StatsSched.cHaltWakeUps += gvmmR0SchedDoWakeUps(pGVMM, u64NowGip);
2099 GVMM_CHECK_SMAP_CHECK2(pGVM, RT_NOTHING);
2100 }
2101
2102 /*
2103 * Go to sleep if we must...
2104 * Cap the sleep time to 1 second to be on the safe side.
2105 */
2106 int rc;
2107 uint64_t cNsInterval = u64ExpireGipTime - u64NowGip;
2108 if ( u64NowGip < u64ExpireGipTime
2109 && cNsInterval >= (pGVMM->cEMTs > pGVMM->cEMTsMeansCompany
2110 ? pGVMM->nsMinSleepCompany
2111 : pGVMM->nsMinSleepAlone))
2112 {
2113 pGVM->gvmm.s.StatsSched.cHaltBlocking++;
2114 if (cNsInterval > RT_NS_1SEC)
2115 u64ExpireGipTime = u64NowGip + RT_NS_1SEC;
2116 ASMAtomicWriteU64(&pGVCpu->gvmm.s.u64HaltExpire, u64ExpireGipTime);
2117 ASMAtomicIncU32(&pGVMM->cHaltedEMTs);
2118 if (fDoEarlyWakeUps)
2119 {
2120 if (u64ExpireGipTime < pGVMM->uNsNextEmtWakeup)
2121 pGVMM->uNsNextEmtWakeup = u64ExpireGipTime;
2122 GVMMR0_USED_SHARED_UNLOCK(pGVMM);
2123 }
2124 GVMM_CHECK_SMAP_CHECK2(pGVM, RT_NOTHING);
2125
2126 rc = RTSemEventMultiWaitEx(pGVCpu->gvmm.s.HaltEventMulti,
2127 RTSEMWAIT_FLAGS_ABSOLUTE | RTSEMWAIT_FLAGS_NANOSECS | RTSEMWAIT_FLAGS_INTERRUPTIBLE,
2128 u64NowGip > u64NowSys ? u64ExpireGipTime : u64NowSys + cNsInterval);
2129 GVMM_CHECK_SMAP_CHECK2(pGVM, RT_NOTHING);
2130
2131 ASMAtomicWriteU64(&pGVCpu->gvmm.s.u64HaltExpire, 0);
2132 ASMAtomicDecU32(&pGVMM->cHaltedEMTs);
2133
2134        /* Reset the semaphore to try to prevent a few false wake-ups. */
2135 if (rc == VINF_SUCCESS)
2136 {
2137 RTSemEventMultiReset(pGVCpu->gvmm.s.HaltEventMulti);
2138 GVMM_CHECK_SMAP_CHECK2(pGVM, RT_NOTHING);
2139 }
2140 else if (rc == VERR_TIMEOUT)
2141 {
2142 pGVM->gvmm.s.StatsSched.cHaltTimeouts++;
2143 rc = VINF_SUCCESS;
2144 }
2145 }
2146 else
2147 {
2148 pGVM->gvmm.s.StatsSched.cHaltNotBlocking++;
2149 if (fDoEarlyWakeUps)
2150 GVMMR0_USED_SHARED_UNLOCK(pGVMM);
2151 GVMM_CHECK_SMAP_CHECK2(pGVM, RT_NOTHING);
2152 RTSemEventMultiReset(pGVCpu->gvmm.s.HaltEventMulti);
2153 GVMM_CHECK_SMAP_CHECK2(pGVM, RT_NOTHING);
2154 rc = VINF_SUCCESS;
2155 }
2156
2157 return rc;
2158}
2159
2160
2161/**
2162 * Halt the EMT thread.
2163 *
2164 * @returns VINF_SUCCESS normal wakeup (timeout or kicked by other thread).
2165 * VERR_INTERRUPTED if a signal was scheduled for the thread.
2166 * @param pGVM The global (ring-0) VM structure.
2167 * @param idCpu The Virtual CPU ID of the calling EMT.
2168 * @param u64ExpireGipTime The time for the sleep to expire expressed as GIP time.
2169 * @thread EMT(idCpu).
2170 */
2171GVMMR0DECL(int) GVMMR0SchedHaltReq(PGVM pGVM, VMCPUID idCpu, uint64_t u64ExpireGipTime)
2172{
2173 GVMM_CHECK_SMAP_SETUP();
2174 GVMM_CHECK_SMAP_CHECK2(pGVM, RT_NOTHING);
2175 PGVMM pGVMM;
2176 int rc = gvmmR0ByGVMandEMT(pGVM, idCpu, &pGVMM);
2177 if (RT_SUCCESS(rc))
2178 {
2179 GVMM_CHECK_SMAP_CHECK2(pGVM, RT_NOTHING);
2180 rc = GVMMR0SchedHalt(pGVM, &pGVM->aCpus[idCpu], u64ExpireGipTime);
2181 }
2182 GVMM_CHECK_SMAP_CHECK2(pGVM, RT_NOTHING);
2183 return rc;
2184}
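
/*
 * Editor's illustrative sketch, not part of the original file: a hypothetical
 * EMT halting for roughly one millisecond of GIP time unless it is woken up
 * earlier. The one millisecond figure is arbitrary.
 */
static int gvmmR0SampleHaltBriefly(PGVM pGVM, VMCPUID idCpu)
{
    uint64_t const u64ExpireGipTime = RTTimeNanoTS() + RT_NS_1MS;
    return GVMMR0SchedHaltReq(pGVM, idCpu, u64ExpireGipTime);
}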
2185
2186
2187
2188/**
2189 * Worker for GVMMR0SchedWakeUp and GVMMR0SchedWakeUpAndPokeCpus that wakes up
2190 * a sleeping EMT.
2191 *
2192 * @retval VINF_SUCCESS if successfully woken up.
2193 * @retval VINF_GVM_NOT_BLOCKED if the EMT wasn't blocked.
2194 *
2195 * @param pGVM The global (ring-0) VM structure.
2196 * @param pGVCpu The global (ring-0) VCPU structure.
2197 */
2198DECLINLINE(int) gvmmR0SchedWakeUpOne(PGVM pGVM, PGVMCPU pGVCpu)
2199{
2200 pGVM->gvmm.s.StatsSched.cWakeUpCalls++;
2201
2202 /*
2203     * Signal the semaphore regardless of whether it's currently blocked on it.
2204     *
2205     * The reason for this is that there is absolutely no way we can be 100%
2206     * certain that it isn't *about* to go to sleep on it and just got
2207     * delayed a bit en route. So, we will always signal the semaphore when
2208     * it is flagged as halted in the VMM.
2209 */
2210/** @todo we can optimize some of that by means of the pVCpu->enmState now. */
2211 int rc;
2212 if (pGVCpu->gvmm.s.u64HaltExpire)
2213 {
2214 rc = VINF_SUCCESS;
2215 ASMAtomicWriteU64(&pGVCpu->gvmm.s.u64HaltExpire, 0);
2216 }
2217 else
2218 {
2219 rc = VINF_GVM_NOT_BLOCKED;
2220 pGVM->gvmm.s.StatsSched.cWakeUpNotHalted++;
2221 }
2222
2223 int rc2 = RTSemEventMultiSignal(pGVCpu->gvmm.s.HaltEventMulti);
2224 AssertRC(rc2);
2225
2226 return rc;
2227}
2228
2229
2230/**
2231 * Wakes up the halted EMT thread so it can service a pending request.
2232 *
2233 * @returns VBox status code.
2234 * @retval VINF_SUCCESS if successfully woken up.
2235 * @retval VINF_GVM_NOT_BLOCKED if the EMT wasn't blocked.
2236 *
2237 * @param pGVM The global (ring-0) VM structure.
2238 * @param idCpu The Virtual CPU ID of the EMT to wake up.
2239 * @param fTakeUsedLock Take the used lock or not
2240 * @thread Any but EMT(idCpu).
2241 */
2242GVMMR0DECL(int) GVMMR0SchedWakeUpEx(PGVM pGVM, VMCPUID idCpu, bool fTakeUsedLock)
2243{
2244 GVMM_CHECK_SMAP_SETUP();
2245 GVMM_CHECK_SMAP_CHECK2(pGVM, RT_NOTHING);
2246
2247 /*
2248 * Validate input and take the UsedLock.
2249 */
2250 PGVMM pGVMM;
2251 int rc = gvmmR0ByGVM(pGVM, &pGVMM, fTakeUsedLock);
2252 GVMM_CHECK_SMAP_CHECK2(pGVM, RT_NOTHING);
2253 if (RT_SUCCESS(rc))
2254 {
2255 if (idCpu < pGVM->cCpus)
2256 {
2257 /*
2258 * Do the actual job.
2259 */
2260 rc = gvmmR0SchedWakeUpOne(pGVM, &pGVM->aCpus[idCpu]);
2261 GVMM_CHECK_SMAP_CHECK2(pGVM, RT_NOTHING);
2262
2263 if (fTakeUsedLock && pGVMM->fDoEarlyWakeUps)
2264 {
2265 /*
2266 * While we're here, do a round of scheduling.
2267 */
2268 Assert(ASMGetFlags() & X86_EFL_IF);
2269 const uint64_t u64Now = RTTimeNanoTS(); /* (GIP time) */
2270 pGVM->gvmm.s.StatsSched.cWakeUpWakeUps += gvmmR0SchedDoWakeUps(pGVMM, u64Now);
2271 GVMM_CHECK_SMAP_CHECK2(pGVM, RT_NOTHING);
2272 }
2273 }
2274 else
2275 rc = VERR_INVALID_CPU_ID;
2276
2277 if (fTakeUsedLock)
2278 {
2279 int rc2 = GVMMR0_USED_SHARED_UNLOCK(pGVMM);
2280 AssertRC(rc2);
2281 GVMM_CHECK_SMAP_CHECK2(pGVM, RT_NOTHING);
2282 }
2283 }
2284
2285 LogFlow(("GVMMR0SchedWakeUpEx: returns %Rrc\n", rc));
2286 return rc;
2287}
2288
2289
2290/**
2291 * Wakes up the halted EMT thread so it can service a pending request.
2292 *
2293 * @returns VBox status code.
2294 * @retval VINF_SUCCESS if successfully woken up.
2295 * @retval VINF_GVM_NOT_BLOCKED if the EMT wasn't blocked.
2296 *
2297 * @param pGVM The global (ring-0) VM structure.
2298 * @param idCpu The Virtual CPU ID of the EMT to wake up.
2299 * @thread Any but EMT(idCpu).
2300 */
2301GVMMR0DECL(int) GVMMR0SchedWakeUp(PGVM pGVM, VMCPUID idCpu)
2302{
2303 return GVMMR0SchedWakeUpEx(pGVM, idCpu, true /* fTakeUsedLock */);
2304}
2305
2306
2307/**
2308 * Wakes up the halted EMT thread so it can service a pending request,
2309 * without taking the used lock.
2310 *
2311 * @returns VBox status code.
2312 * @retval VINF_SUCCESS if successfully woken up.
2313 * @retval VINF_GVM_NOT_BLOCKED if the EMT wasn't blocked.
2314 *
2315 * @param pGVM The global (ring-0) VM structure.
2316 * @param idCpu The Virtual CPU ID of the EMT to wake up.
2317 * @thread Any but EMT(idCpu).
2318 * @deprecated Don't use in new code if possible! Use the GVM variant.
2319 */
2320GVMMR0DECL(int) GVMMR0SchedWakeUpNoGVMNoLock(PGVM pGVM, VMCPUID idCpu)
2321{
2322 GVMM_CHECK_SMAP_SETUP();
2323 GVMM_CHECK_SMAP_CHECK2(pGVM, RT_NOTHING);
2324 PGVMM pGVMM;
2325 int rc = gvmmR0ByGVM(pGVM, &pGVMM, false /*fTakeUsedLock*/);
2326 GVMM_CHECK_SMAP_CHECK2(pGVM, RT_NOTHING);
2327 if (RT_SUCCESS(rc))
2328 rc = GVMMR0SchedWakeUpEx(pGVM, idCpu, false /*fTakeUsedLock*/);
2329 return rc;
2330}
2331
2332
2333/**
2334 * Worker common to GVMMR0SchedPoke and GVMMR0SchedWakeUpAndPokeCpus that pokes
2335 * the Virtual CPU if it's still busy executing guest code.
2336 *
2337 * @returns VBox status code.
2338 * @retval VINF_SUCCESS if poked successfully.
2339 * @retval VINF_GVM_NOT_BUSY_IN_GC if the EMT wasn't busy in GC.
2340 *
2341 * @param pGVM The global (ring-0) VM structure.
2342 * @param pVCpu The cross context virtual CPU structure.
2343 */
2344DECLINLINE(int) gvmmR0SchedPokeOne(PGVM pGVM, PVMCPUCC pVCpu)
2345{
2346 pGVM->gvmm.s.StatsSched.cPokeCalls++;
2347
2348 RTCPUID idHostCpu = pVCpu->idHostCpu;
2349 if ( idHostCpu == NIL_RTCPUID
2350 || VMCPU_GET_STATE(pVCpu) != VMCPUSTATE_STARTED_EXEC)
2351 {
2352 pGVM->gvmm.s.StatsSched.cPokeNotBusy++;
2353 return VINF_GVM_NOT_BUSY_IN_GC;
2354 }
2355
2356 /* Note: this function is not implemented on Darwin and Linux (kernel < 2.6.19) */
2357 RTMpPokeCpu(idHostCpu);
2358 return VINF_SUCCESS;
2359}
2360
2361
2362/**
2363 * Pokes an EMT if it's still busy running guest code.
2364 *
2365 * @returns VBox status code.
2366 * @retval VINF_SUCCESS if poked successfully.
2367 * @retval VINF_GVM_NOT_BUSY_IN_GC if the EMT wasn't busy in GC.
2368 *
2369 * @param pGVM The global (ring-0) VM structure.
2370 * @param idCpu The ID of the virtual CPU to poke.
2371 * @param fTakeUsedLock Take the used lock or not
2372 */
2373GVMMR0DECL(int) GVMMR0SchedPokeEx(PGVM pGVM, VMCPUID idCpu, bool fTakeUsedLock)
2374{
2375 /*
2376 * Validate input and take the UsedLock.
2377 */
2378 PGVMM pGVMM;
2379 int rc = gvmmR0ByGVM(pGVM, &pGVMM, fTakeUsedLock);
2380 if (RT_SUCCESS(rc))
2381 {
2382 if (idCpu < pGVM->cCpus)
2383 rc = gvmmR0SchedPokeOne(pGVM, &pGVM->aCpus[idCpu]);
2384 else
2385 rc = VERR_INVALID_CPU_ID;
2386
2387 if (fTakeUsedLock)
2388 {
2389 int rc2 = GVMMR0_USED_SHARED_UNLOCK(pGVMM);
2390 AssertRC(rc2);
2391 }
2392 }
2393
2394    LogFlow(("GVMMR0SchedPokeEx: returns %Rrc\n", rc));
2395 return rc;
2396}
2397
2398
2399/**
2400 * Pokes an EMT if it's still busy running guest code.
2401 *
2402 * @returns VBox status code.
2403 * @retval VINF_SUCCESS if poked successfully.
2404 * @retval VINF_GVM_NOT_BUSY_IN_GC if the EMT wasn't busy in GC.
2405 *
2406 * @param pGVM The global (ring-0) VM structure.
2407 * @param idCpu The ID of the virtual CPU to poke.
2408 */
2409GVMMR0DECL(int) GVMMR0SchedPoke(PGVM pGVM, VMCPUID idCpu)
2410{
2411 return GVMMR0SchedPokeEx(pGVM, idCpu, true /* fTakeUsedLock */);
2412}
2413
2414
2415/**
2416 * Pokes an EMT if it's still busy running guest code, without taking the
2417 * used lock.
2418 *
2419 * @returns VBox status code.
2420 * @retval VINF_SUCCESS if poked successfully.
2421 * @retval VINF_GVM_NOT_BUSY_IN_GC if the EMT wasn't busy in GC.
2422 *
2423 * @param pGVM The global (ring-0) VM structure.
2424 * @param idCpu The ID of the virtual CPU to poke.
2425 *
2426 * @deprecated Don't use in new code if possible! Use the GVM variant.
2427 */
2428GVMMR0DECL(int) GVMMR0SchedPokeNoGVMNoLock(PGVM pGVM, VMCPUID idCpu)
2429{
2430 PGVMM pGVMM;
2431 int rc = gvmmR0ByGVM(pGVM, &pGVMM, false /*fTakeUsedLock*/);
2432 if (RT_SUCCESS(rc))
2433 {
2434 if (idCpu < pGVM->cCpus)
2435 rc = gvmmR0SchedPokeOne(pGVM, &pGVM->aCpus[idCpu]);
2436 else
2437 rc = VERR_INVALID_CPU_ID;
2438 }
2439 return rc;
2440}
2441
2442
2443/**
2444 * Wakes up a set of halted EMT threads so they can service pending requests.
2445 *
2446 * @returns VBox status code, no informational stuff.
2447 *
2448 * @param pGVM The global (ring-0) VM structure.
2449 * @param pSleepSet The set of sleepers to wake up.
2450 * @param pPokeSet The set of CPUs to poke.
2451 */
2452GVMMR0DECL(int) GVMMR0SchedWakeUpAndPokeCpus(PGVM pGVM, PCVMCPUSET pSleepSet, PCVMCPUSET pPokeSet)
2453{
2454 AssertPtrReturn(pSleepSet, VERR_INVALID_POINTER);
2455 AssertPtrReturn(pPokeSet, VERR_INVALID_POINTER);
2456 GVMM_CHECK_SMAP_SETUP();
2457 GVMM_CHECK_SMAP_CHECK2(pGVM, RT_NOTHING);
2458 RTNATIVETHREAD hSelf = RTThreadNativeSelf();
2459
2460 /*
2461 * Validate input and take the UsedLock.
2462 */
2463 PGVMM pGVMM;
2464 int rc = gvmmR0ByGVM(pGVM, &pGVMM, true /* fTakeUsedLock */);
2465 GVMM_CHECK_SMAP_CHECK2(pGVM, RT_NOTHING);
2466 if (RT_SUCCESS(rc))
2467 {
2468 rc = VINF_SUCCESS;
2469 VMCPUID idCpu = pGVM->cCpus;
2470 while (idCpu-- > 0)
2471 {
2472            /* Don't try to poke or wake up ourselves. */
2473 if (pGVM->aCpus[idCpu].hEMT == hSelf)
2474 continue;
2475
2476 /* just ignore errors for now. */
2477 if (VMCPUSET_IS_PRESENT(pSleepSet, idCpu))
2478 {
2479 gvmmR0SchedWakeUpOne(pGVM, &pGVM->aCpus[idCpu]);
2480 GVMM_CHECK_SMAP_CHECK2(pGVM, RT_NOTHING);
2481 }
2482 else if (VMCPUSET_IS_PRESENT(pPokeSet, idCpu))
2483 {
2484 gvmmR0SchedPokeOne(pGVM, &pGVM->aCpus[idCpu]);
2485 GVMM_CHECK_SMAP_CHECK2(pGVM, RT_NOTHING);
2486 }
2487 }
2488
2489 int rc2 = GVMMR0_USED_SHARED_UNLOCK(pGVMM);
2490 AssertRC(rc2);
2491 GVMM_CHECK_SMAP_CHECK2(pGVM, RT_NOTHING);
2492 }
2493
2494 LogFlow(("GVMMR0SchedWakeUpAndPokeCpus: returns %Rrc\n", rc));
2495 return rc;
2496}
2497
2498
2499/**
2500 * VMMR0 request wrapper for GVMMR0SchedWakeUpAndPokeCpus.
2501 *
2502 * @returns see GVMMR0SchedWakeUpAndPokeCpus.
2503 * @param pGVM The global (ring-0) VM structure.
2504 * @param pReq Pointer to the request packet.
2505 */
2506GVMMR0DECL(int) GVMMR0SchedWakeUpAndPokeCpusReq(PGVM pGVM, PGVMMSCHEDWAKEUPANDPOKECPUSREQ pReq)
2507{
2508 /*
2509 * Validate input and pass it on.
2510 */
2511 AssertPtrReturn(pReq, VERR_INVALID_POINTER);
2512 AssertMsgReturn(pReq->Hdr.cbReq == sizeof(*pReq), ("%#x != %#x\n", pReq->Hdr.cbReq, sizeof(*pReq)), VERR_INVALID_PARAMETER);
2513
2514 return GVMMR0SchedWakeUpAndPokeCpus(pGVM, &pReq->SleepSet, &pReq->PokeSet);
2515}
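
/*
 * Editor's illustrative sketch, not part of the original file: filling in the
 * request packet consumed by the wrapper above. VMCPUSET_EMPTY and VMCPUSET_ADD
 * are assumed to be the usual CPU set helpers from VBox/vmm/vmcpuset.h; only
 * the fields this wrapper itself checks and forwards are shown.
 */
static void gvmmR0SampleFillWakeUpAndPokeReq(PGVMMSCHEDWAKEUPANDPOKECPUSREQ pReq,
                                             VMCPUID idCpuToWake, VMCPUID idCpuToPoke)
{
    pReq->Hdr.cbReq = sizeof(*pReq);
    VMCPUSET_EMPTY(&pReq->SleepSet);
    VMCPUSET_EMPTY(&pReq->PokeSet);
    VMCPUSET_ADD(&pReq->SleepSet, idCpuToWake);
    VMCPUSET_ADD(&pReq->PokeSet,  idCpuToPoke);
}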
2516
2517
2518
2519/**
2520 * Poll the schedule to see if someone else should get a chance to run.
2521 *
2522 * This is a bit hackish and will not work too well if the machine is
2523 * under heavy load from non-VM processes.
2524 *
2525 * @returns VINF_SUCCESS if not yielded.
2526 * VINF_GVM_YIELDED if an attempt to switch to a different VM task was made.
2527 * @param pGVM The global (ring-0) VM structure.
2528 * @param idCpu The Virtual CPU ID of the calling EMT.
2529 * @param fYield Whether to yield or not.
2530 * This is for when we're spinning in the halt loop.
2531 * @thread EMT(idCpu).
2532 */
2533GVMMR0DECL(int) GVMMR0SchedPoll(PGVM pGVM, VMCPUID idCpu, bool fYield)
2534{
2535 /*
2536 * Validate input.
2537 */
2538 PGVMM pGVMM;
2539 int rc = gvmmR0ByGVMandEMT(pGVM, idCpu, &pGVMM);
2540 if (RT_SUCCESS(rc))
2541 {
2542 /*
2543         * We currently only implement helping with wake-ups (fYield = false), so don't
2544         * bother taking the lock if gvmmR0SchedDoWakeUps is not going to do anything.
2545 */
2546 if (!fYield && pGVMM->fDoEarlyWakeUps)
2547 {
2548 rc = GVMMR0_USED_SHARED_LOCK(pGVMM); AssertRC(rc);
2549 pGVM->gvmm.s.StatsSched.cPollCalls++;
2550
2551 Assert(ASMGetFlags() & X86_EFL_IF);
2552 const uint64_t u64Now = RTTimeNanoTS(); /* (GIP time) */
2553
2554 pGVM->gvmm.s.StatsSched.cPollWakeUps += gvmmR0SchedDoWakeUps(pGVMM, u64Now);
2555
2556 GVMMR0_USED_SHARED_UNLOCK(pGVMM);
2557 }
2558 /*
2559 * Not quite sure what we could do here...
2560 */
2561 else if (fYield)
2562 rc = VERR_NOT_IMPLEMENTED; /** @todo implement this... */
2563 else
2564 rc = VINF_SUCCESS;
2565 }
2566
2567    LogFlow(("GVMMR0SchedPoll: returns %Rrc\n", rc));
2568 return rc;
2569}
2570
2571
2572#ifdef GVMM_SCHED_WITH_PPT
2573/**
2574 * Timer callback for the periodic preemption timer.
2575 *
2576 * @param pTimer The timer handle.
2577 * @param pvUser Pointer to the per cpu structure.
2578 * @param iTick The current tick.
2579 */
2580static DECLCALLBACK(void) gvmmR0SchedPeriodicPreemptionTimerCallback(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
2581{
2582 PGVMMHOSTCPU pCpu = (PGVMMHOSTCPU)pvUser;
2583 NOREF(pTimer); NOREF(iTick);
2584
2585 /*
2586 * Termination check
2587 */
2588 if (pCpu->u32Magic != GVMMHOSTCPU_MAGIC)
2589 return;
2590
2591 /*
2592 * Do the house keeping.
2593 */
2594 RTSpinlockAcquire(pCpu->Ppt.hSpinlock);
2595
2596 if (++pCpu->Ppt.iTickHistorization >= pCpu->Ppt.cTicksHistoriziationInterval)
2597 {
2598 /*
2599 * Historicize the max frequency.
2600 */
2601 uint32_t iHzHistory = ++pCpu->Ppt.iHzHistory % RT_ELEMENTS(pCpu->Ppt.aHzHistory);
2602 pCpu->Ppt.aHzHistory[iHzHistory] = pCpu->Ppt.uDesiredHz;
2603 pCpu->Ppt.iTickHistorization = 0;
2604 pCpu->Ppt.uDesiredHz = 0;
2605
2606 /*
2607         * Check whether the current timer frequency needs adjusting.
2608 */
2609 uint32_t uHistMaxHz = 0;
2610 for (uint32_t i = 0; i < RT_ELEMENTS(pCpu->Ppt.aHzHistory); i++)
2611 if (pCpu->Ppt.aHzHistory[i] > uHistMaxHz)
2612 uHistMaxHz = pCpu->Ppt.aHzHistory[i];
2613 if (uHistMaxHz == pCpu->Ppt.uTimerHz)
2614 RTSpinlockRelease(pCpu->Ppt.hSpinlock);
2615 else if (uHistMaxHz)
2616 {
2617 /*
2618 * Reprogram it.
2619 */
2620 pCpu->Ppt.cChanges++;
2621 pCpu->Ppt.iTickHistorization = 0;
2622 pCpu->Ppt.uTimerHz = uHistMaxHz;
2623 uint32_t const cNsInterval = RT_NS_1SEC / uHistMaxHz;
2624 pCpu->Ppt.cNsInterval = cNsInterval;
2625 if (cNsInterval < GVMMHOSTCPU_PPT_HIST_INTERVAL_NS)
2626 pCpu->Ppt.cTicksHistoriziationInterval = ( GVMMHOSTCPU_PPT_HIST_INTERVAL_NS
2627 + GVMMHOSTCPU_PPT_HIST_INTERVAL_NS / 2 - 1)
2628 / cNsInterval;
2629 else
2630 pCpu->Ppt.cTicksHistoriziationInterval = 1;
2631 RTSpinlockRelease(pCpu->Ppt.hSpinlock);
2632
2633 /*SUPR0Printf("Cpu%u: change to %u Hz / %u ns\n", pCpu->idxCpuSet, uHistMaxHz, cNsInterval);*/
2634 RTTimerChangeInterval(pTimer, cNsInterval);
2635 }
2636 else
2637 {
2638 /*
2639 * Stop it.
2640 */
2641 pCpu->Ppt.fStarted = false;
2642 pCpu->Ppt.uTimerHz = 0;
2643 pCpu->Ppt.cNsInterval = 0;
2644 RTSpinlockRelease(pCpu->Ppt.hSpinlock);
2645
2646 /*SUPR0Printf("Cpu%u: stopping (%u Hz)\n", pCpu->idxCpuSet, uHistMaxHz);*/
2647 RTTimerStop(pTimer);
2648 }
2649 }
2650 else
2651 RTSpinlockRelease(pCpu->Ppt.hSpinlock);
2652}
2653#endif /* GVMM_SCHED_WITH_PPT */
2654
2655
2656/**
2657 * Updates the periodic preemption timer for the calling CPU.
2658 *
2659 * The caller must have disabled preemption!
2660 * The caller must check that the host can do high resolution timers.
2661 *
2662 * @param pGVM The global (ring-0) VM structure.
2663 * @param idHostCpu The current host CPU id.
2664 * @param uHz The desired frequency.
2665 */
2666GVMMR0DECL(void) GVMMR0SchedUpdatePeriodicPreemptionTimer(PGVM pGVM, RTCPUID idHostCpu, uint32_t uHz)
2667{
2668 NOREF(pGVM);
2669#ifdef GVMM_SCHED_WITH_PPT
2670 Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD));
2671 Assert(RTTimerCanDoHighResolution());
2672
2673 /*
2674 * Resolve the per CPU data.
2675 */
2676 uint32_t iCpu = RTMpCpuIdToSetIndex(idHostCpu);
2677 PGVMM pGVMM = g_pGVMM;
2678 if ( !RT_VALID_PTR(pGVMM)
2679 || pGVMM->u32Magic != GVMM_MAGIC)
2680 return;
2681 AssertMsgReturnVoid(iCpu < pGVMM->cHostCpus, ("iCpu=%d cHostCpus=%d\n", iCpu, pGVMM->cHostCpus));
2682 PGVMMHOSTCPU pCpu = &pGVMM->aHostCpus[iCpu];
2683 AssertMsgReturnVoid( pCpu->u32Magic == GVMMHOSTCPU_MAGIC
2684 && pCpu->idCpu == idHostCpu,
2685                         ("u32Magic=%#x idCpu=%d idHostCpu=%d\n", pCpu->u32Magic, pCpu->idCpu, idHostCpu));
2686
2687 /*
2688 * Check whether we need to do anything about the timer.
2689     * We have to be a little bit careful since we might be racing the timer
2690 * callback here.
2691 */
2692 if (uHz > 16384)
2693 uHz = 16384; /** @todo add a query method for this! */
2694 if (RT_UNLIKELY( uHz > ASMAtomicReadU32(&pCpu->Ppt.uDesiredHz)
2695 && uHz >= pCpu->Ppt.uMinHz
2696 && !pCpu->Ppt.fStarting /* solaris paranoia */))
2697 {
2698 RTSpinlockAcquire(pCpu->Ppt.hSpinlock);
2699
2700 pCpu->Ppt.uDesiredHz = uHz;
2701 uint32_t cNsInterval = 0;
2702 if (!pCpu->Ppt.fStarted)
2703 {
2704 pCpu->Ppt.cStarts++;
2705 pCpu->Ppt.fStarted = true;
2706 pCpu->Ppt.fStarting = true;
2707 pCpu->Ppt.iTickHistorization = 0;
2708 pCpu->Ppt.uTimerHz = uHz;
2709 pCpu->Ppt.cNsInterval = cNsInterval = RT_NS_1SEC / uHz;
2710 if (cNsInterval < GVMMHOSTCPU_PPT_HIST_INTERVAL_NS)
2711 pCpu->Ppt.cTicksHistoriziationInterval = ( GVMMHOSTCPU_PPT_HIST_INTERVAL_NS
2712 + GVMMHOSTCPU_PPT_HIST_INTERVAL_NS / 2 - 1)
2713 / cNsInterval;
2714 else
2715 pCpu->Ppt.cTicksHistoriziationInterval = 1;
2716 }
2717
2718 RTSpinlockRelease(pCpu->Ppt.hSpinlock);
2719
2720 if (cNsInterval)
2721 {
2722 RTTimerChangeInterval(pCpu->Ppt.pTimer, cNsInterval);
2723 int rc = RTTimerStart(pCpu->Ppt.pTimer, cNsInterval);
2724 AssertRC(rc);
2725
2726 RTSpinlockAcquire(pCpu->Ppt.hSpinlock);
2727 if (RT_FAILURE(rc))
2728 pCpu->Ppt.fStarted = false;
2729 pCpu->Ppt.fStarting = false;
2730 RTSpinlockRelease(pCpu->Ppt.hSpinlock);
2731 }
2732 }
2733#else /* !GVMM_SCHED_WITH_PPT */
2734 NOREF(idHostCpu); NOREF(uHz);
2735#endif /* !GVMM_SCHED_WITH_PPT */
2736}
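
/*
 * Editor's illustrative sketch, not part of the original file: a hypothetical
 * caller honouring the documented preconditions, i.e. preemption disabled and
 * a host capable of high resolution timers. The 1000 Hz value stands in for a
 * frequency hint that would normally come from the timer code.
 */
static void gvmmR0SampleReportTimerHint(PGVM pGVM)
{
    if (RTTimerCanDoHighResolution())
    {
        RTTHREADPREEMPTSTATE PreemptState = RTTHREADPREEMPTSTATE_INITIALIZER;
        RTThreadPreemptDisable(&PreemptState);
        GVMMR0SchedUpdatePeriodicPreemptionTimer(pGVM, RTMpCpuId(), 1000 /* made-up Hz */);
        RTThreadPreemptRestore(&PreemptState);
    }
}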
2737
2738
2739/**
2740 * Calls @a pfnCallback for each VM in the system.
2741 *
2742 * This will enumerate the VMs while holding the global VM used list lock in
2743 * shared mode. So, only suitable for simple work. If more expensive work
2744 * shared mode, so it is only suitable for simple work. If more expensive work
2745 * otherwise block VM creation and destruction.
2746 *
2747 * @returns VBox status code.
2748 * @param pfnCallback The callback function.
2749 * @param pvUser User argument to the callback.
2750 */
2751GVMMR0DECL(int) GVMMR0EnumVMs(PFNGVMMR0ENUMCALLBACK pfnCallback, void *pvUser)
2752{
2753 PGVMM pGVMM;
2754 GVMM_GET_VALID_INSTANCE(pGVMM, VERR_GVMM_INSTANCE);
2755
2756 int rc = VINF_SUCCESS;
2757 GVMMR0_USED_SHARED_LOCK(pGVMM);
2758 for (unsigned i = pGVMM->iUsedHead, cLoops = 0;
2759 i != NIL_GVM_HANDLE && i < RT_ELEMENTS(pGVMM->aHandles);
2760 i = pGVMM->aHandles[i].iNext, cLoops++)
2761 {
2762 PGVM pGVM = pGVMM->aHandles[i].pGVM;
2763 if ( RT_VALID_PTR(pGVM)
2764 && RT_VALID_PTR(pGVMM->aHandles[i].pvObj)
2765 && pGVM->u32Magic == GVM_MAGIC)
2766 {
2767 rc = pfnCallback(pGVM, pvUser);
2768 if (rc != VINF_SUCCESS)
2769 break;
2770 }
2771
2772 AssertBreak(cLoops < RT_ELEMENTS(pGVMM->aHandles) * 4); /* paranoia */
2773 }
2774 GVMMR0_USED_SHARED_UNLOCK(pGVMM);
2775 return rc;
2776}
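
/*
 * Editor's illustrative sketch, not part of the original file: a minimal
 * enumeration callback that counts the registered VMs. The signature is
 * inferred from how pfnCallback is invoked above; the real typedef lives in
 * VBox/vmm/gvmm.h.
 */
static DECLCALLBACK(int) gvmmR0SampleCountVMs(PGVM pGVM, void *pvUser)
{
    NOREF(pGVM);
    *(uint32_t *)pvUser += 1;
    return VINF_SUCCESS; /* any other status stops the enumeration */
}
/* Usage: uint32_t cVMs = 0; int rc = GVMMR0EnumVMs(gvmmR0SampleCountVMs, &cVMs); */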
2777
2778
2779/**
2780 * Retrieves the GVMM statistics visible to the caller.
2781 *
2782 * @returns VBox status code.
2783 *
2784 * @param pStats Where to put the statistics.
2785 * @param pSession The current session.
2786 * @param pGVM The GVM to obtain statistics for. Optional.
2787 */
2788GVMMR0DECL(int) GVMMR0QueryStatistics(PGVMMSTATS pStats, PSUPDRVSESSION pSession, PGVM pGVM)
2789{
2790 LogFlow(("GVMMR0QueryStatistics: pStats=%p pSession=%p pGVM=%p\n", pStats, pSession, pGVM));
2791
2792 /*
2793 * Validate input.
2794 */
2795 AssertPtrReturn(pSession, VERR_INVALID_POINTER);
2796 AssertPtrReturn(pStats, VERR_INVALID_POINTER);
2797 pStats->cVMs = 0; /* (crash before taking the sem...) */
2798
2799 /*
2800 * Take the lock and get the VM statistics.
2801 */
2802 PGVMM pGVMM;
2803 if (pGVM)
2804 {
2805 int rc = gvmmR0ByGVM(pGVM, &pGVMM, true /*fTakeUsedLock*/);
2806 if (RT_FAILURE(rc))
2807 return rc;
2808 pStats->SchedVM = pGVM->gvmm.s.StatsSched;
2809 }
2810 else
2811 {
2812 GVMM_GET_VALID_INSTANCE(pGVMM, VERR_GVMM_INSTANCE);
2813 memset(&pStats->SchedVM, 0, sizeof(pStats->SchedVM));
2814
2815 int rc = GVMMR0_USED_SHARED_LOCK(pGVMM);
2816 AssertRCReturn(rc, rc);
2817 }
2818
2819 /*
2820 * Enumerate the VMs and add the ones visible to the statistics.
2821 */
2822 pStats->cVMs = 0;
2823 pStats->cEMTs = 0;
2824 memset(&pStats->SchedSum, 0, sizeof(pStats->SchedSum));
2825
2826 for (unsigned i = pGVMM->iUsedHead;
2827 i != NIL_GVM_HANDLE && i < RT_ELEMENTS(pGVMM->aHandles);
2828 i = pGVMM->aHandles[i].iNext)
2829 {
2830 PGVM pOtherGVM = pGVMM->aHandles[i].pGVM;
2831 void *pvObj = pGVMM->aHandles[i].pvObj;
2832 if ( RT_VALID_PTR(pvObj)
2833 && RT_VALID_PTR(pOtherGVM)
2834 && pOtherGVM->u32Magic == GVM_MAGIC
2835 && RT_SUCCESS(SUPR0ObjVerifyAccess(pvObj, pSession, NULL)))
2836 {
2837 pStats->cVMs++;
2838 pStats->cEMTs += pOtherGVM->cCpus;
2839
2840 pStats->SchedSum.cHaltCalls += pOtherGVM->gvmm.s.StatsSched.cHaltCalls;
2841 pStats->SchedSum.cHaltBlocking += pOtherGVM->gvmm.s.StatsSched.cHaltBlocking;
2842 pStats->SchedSum.cHaltTimeouts += pOtherGVM->gvmm.s.StatsSched.cHaltTimeouts;
2843 pStats->SchedSum.cHaltNotBlocking += pOtherGVM->gvmm.s.StatsSched.cHaltNotBlocking;
2844 pStats->SchedSum.cHaltWakeUps += pOtherGVM->gvmm.s.StatsSched.cHaltWakeUps;
2845
2846 pStats->SchedSum.cWakeUpCalls += pOtherGVM->gvmm.s.StatsSched.cWakeUpCalls;
2847 pStats->SchedSum.cWakeUpNotHalted += pOtherGVM->gvmm.s.StatsSched.cWakeUpNotHalted;
2848 pStats->SchedSum.cWakeUpWakeUps += pOtherGVM->gvmm.s.StatsSched.cWakeUpWakeUps;
2849
2850 pStats->SchedSum.cPokeCalls += pOtherGVM->gvmm.s.StatsSched.cPokeCalls;
2851 pStats->SchedSum.cPokeNotBusy += pOtherGVM->gvmm.s.StatsSched.cPokeNotBusy;
2852
2853 pStats->SchedSum.cPollCalls += pOtherGVM->gvmm.s.StatsSched.cPollCalls;
2854 pStats->SchedSum.cPollHalts += pOtherGVM->gvmm.s.StatsSched.cPollHalts;
2855 pStats->SchedSum.cPollWakeUps += pOtherGVM->gvmm.s.StatsSched.cPollWakeUps;
2856 }
2857 }
2858
2859 /*
2860 * Copy out the per host CPU statistics.
2861 */
2862 uint32_t iDstCpu = 0;
2863 uint32_t cSrcCpus = pGVMM->cHostCpus;
2864 for (uint32_t iSrcCpu = 0; iSrcCpu < cSrcCpus; iSrcCpu++)
2865 {
2866 if (pGVMM->aHostCpus[iSrcCpu].idCpu != NIL_RTCPUID)
2867 {
2868 pStats->aHostCpus[iDstCpu].idCpu = pGVMM->aHostCpus[iSrcCpu].idCpu;
2869 pStats->aHostCpus[iDstCpu].idxCpuSet = pGVMM->aHostCpus[iSrcCpu].idxCpuSet;
2870#ifdef GVMM_SCHED_WITH_PPT
2871 pStats->aHostCpus[iDstCpu].uDesiredHz = pGVMM->aHostCpus[iSrcCpu].Ppt.uDesiredHz;
2872 pStats->aHostCpus[iDstCpu].uTimerHz = pGVMM->aHostCpus[iSrcCpu].Ppt.uTimerHz;
2873 pStats->aHostCpus[iDstCpu].cChanges = pGVMM->aHostCpus[iSrcCpu].Ppt.cChanges;
2874 pStats->aHostCpus[iDstCpu].cStarts = pGVMM->aHostCpus[iSrcCpu].Ppt.cStarts;
2875#else
2876 pStats->aHostCpus[iDstCpu].uDesiredHz = 0;
2877 pStats->aHostCpus[iDstCpu].uTimerHz = 0;
2878 pStats->aHostCpus[iDstCpu].cChanges = 0;
2879 pStats->aHostCpus[iDstCpu].cStarts = 0;
2880#endif
2881 iDstCpu++;
2882 if (iDstCpu >= RT_ELEMENTS(pStats->aHostCpus))
2883 break;
2884 }
2885 }
2886 pStats->cHostCpus = iDstCpu;
2887
2888 GVMMR0_USED_SHARED_UNLOCK(pGVMM);
2889
2890 return VINF_SUCCESS;
2891}
2892
2893
2894/**
2895 * VMMR0 request wrapper for GVMMR0QueryStatistics.
2896 *
2897 * @returns see GVMMR0QueryStatistics.
2898 * @param pGVM The global (ring-0) VM structure. Optional.
2899 * @param pReq Pointer to the request packet.
2900 * @param pSession The current session.
2901 */
2902GVMMR0DECL(int) GVMMR0QueryStatisticsReq(PGVM pGVM, PGVMMQUERYSTATISTICSSREQ pReq, PSUPDRVSESSION pSession)
2903{
2904 /*
2905 * Validate input and pass it on.
2906 */
2907 AssertPtrReturn(pReq, VERR_INVALID_POINTER);
2908 AssertMsgReturn(pReq->Hdr.cbReq == sizeof(*pReq), ("%#x != %#x\n", pReq->Hdr.cbReq, sizeof(*pReq)), VERR_INVALID_PARAMETER);
2909 AssertReturn(pReq->pSession == pSession, VERR_INVALID_PARAMETER);
2910
2911 return GVMMR0QueryStatistics(&pReq->Stats, pSession, pGVM);
2912}
2913
2914
2915/**
2916 * Resets the specified GVMM statistics.
2917 *
2918 * @returns VBox status code.
2919 *
2920 * @param pStats        Which statistics to reset; non-zero fields indicate the ones to reset.
2921 * @param pSession The current session.
2922 * @param pGVM The GVM to reset statistics for. Optional.
2923 */
2924GVMMR0DECL(int) GVMMR0ResetStatistics(PCGVMMSTATS pStats, PSUPDRVSESSION pSession, PGVM pGVM)
2925{
2926 LogFlow(("GVMMR0ResetStatistics: pStats=%p pSession=%p pGVM=%p\n", pStats, pSession, pGVM));
2927
2928 /*
2929 * Validate input.
2930 */
2931 AssertPtrReturn(pSession, VERR_INVALID_POINTER);
2932 AssertPtrReturn(pStats, VERR_INVALID_POINTER);
2933
2934 /*
2935 * Take the lock and get the VM statistics.
2936 */
2937 PGVMM pGVMM;
2938 if (pGVM)
2939 {
2940 int rc = gvmmR0ByGVM(pGVM, &pGVMM, true /*fTakeUsedLock*/);
2941 if (RT_FAILURE(rc))
2942 return rc;
2943# define MAYBE_RESET_FIELD(field) \
2944 do { if (pStats->SchedVM. field ) { pGVM->gvmm.s.StatsSched. field = 0; } } while (0)
2945 MAYBE_RESET_FIELD(cHaltCalls);
2946 MAYBE_RESET_FIELD(cHaltBlocking);
2947 MAYBE_RESET_FIELD(cHaltTimeouts);
2948 MAYBE_RESET_FIELD(cHaltNotBlocking);
2949 MAYBE_RESET_FIELD(cHaltWakeUps);
2950 MAYBE_RESET_FIELD(cWakeUpCalls);
2951 MAYBE_RESET_FIELD(cWakeUpNotHalted);
2952 MAYBE_RESET_FIELD(cWakeUpWakeUps);
2953 MAYBE_RESET_FIELD(cPokeCalls);
2954 MAYBE_RESET_FIELD(cPokeNotBusy);
2955 MAYBE_RESET_FIELD(cPollCalls);
2956 MAYBE_RESET_FIELD(cPollHalts);
2957 MAYBE_RESET_FIELD(cPollWakeUps);
2958# undef MAYBE_RESET_FIELD
2959 }
2960 else
2961 {
2962 GVMM_GET_VALID_INSTANCE(pGVMM, VERR_GVMM_INSTANCE);
2963
2964 int rc = GVMMR0_USED_SHARED_LOCK(pGVMM);
2965 AssertRCReturn(rc, rc);
2966 }
2967
2968 /*
2969 * Enumerate the VMs and add the ones visible to the statistics.
2970 */
2971 if (!ASMMemIsZero(&pStats->SchedSum, sizeof(pStats->SchedSum)))
2972 {
2973 for (unsigned i = pGVMM->iUsedHead;
2974 i != NIL_GVM_HANDLE && i < RT_ELEMENTS(pGVMM->aHandles);
2975 i = pGVMM->aHandles[i].iNext)
2976 {
2977 PGVM pOtherGVM = pGVMM->aHandles[i].pGVM;
2978 void *pvObj = pGVMM->aHandles[i].pvObj;
2979 if ( RT_VALID_PTR(pvObj)
2980 && RT_VALID_PTR(pOtherGVM)
2981 && pOtherGVM->u32Magic == GVM_MAGIC
2982 && RT_SUCCESS(SUPR0ObjVerifyAccess(pvObj, pSession, NULL)))
2983 {
2984# define MAYBE_RESET_FIELD(field) \
2985 do { if (pStats->SchedSum. field ) { pOtherGVM->gvmm.s.StatsSched. field = 0; } } while (0)
2986 MAYBE_RESET_FIELD(cHaltCalls);
2987 MAYBE_RESET_FIELD(cHaltBlocking);
2988 MAYBE_RESET_FIELD(cHaltTimeouts);
2989 MAYBE_RESET_FIELD(cHaltNotBlocking);
2990 MAYBE_RESET_FIELD(cHaltWakeUps);
2991 MAYBE_RESET_FIELD(cWakeUpCalls);
2992 MAYBE_RESET_FIELD(cWakeUpNotHalted);
2993 MAYBE_RESET_FIELD(cWakeUpWakeUps);
2994 MAYBE_RESET_FIELD(cPokeCalls);
2995 MAYBE_RESET_FIELD(cPokeNotBusy);
2996 MAYBE_RESET_FIELD(cPollCalls);
2997 MAYBE_RESET_FIELD(cPollHalts);
2998 MAYBE_RESET_FIELD(cPollWakeUps);
2999# undef MAYBE_RESET_FIELD
3000 }
3001 }
3002 }
3003
3004 GVMMR0_USED_SHARED_UNLOCK(pGVMM);
3005
3006 return VINF_SUCCESS;
3007}
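
/*
 * Editor's illustrative sketch, not part of the original file: resetting only
 * the halt counters of a single VM by flagging them with non-zero values, which
 * is what the MAYBE_RESET_FIELD logic above keys off. RT_ZERO is assumed to be
 * the usual IPRT zero-initialization helper; in real ring-0 code a structure of
 * this size would probably not live on the stack.
 */
static int gvmmR0SampleResetHaltStats(PGVM pGVM, PSUPDRVSESSION pSession)
{
    GVMMSTATS Stats;
    RT_ZERO(Stats);
    Stats.SchedVM.cHaltCalls    = 1; /* non-zero selects the field for resetting */
    Stats.SchedVM.cHaltTimeouts = 1;
    return GVMMR0ResetStatistics(&Stats, pSession, pGVM);
}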
3008
3009
3010/**
3011 * VMMR0 request wrapper for GVMMR0ResetStatistics.
3012 *
3013 * @returns see GVMMR0ResetStatistics.
3014 * @param pGVM The global (ring-0) VM structure. Optional.
3015 * @param pReq Pointer to the request packet.
3016 * @param pSession The current session.
3017 */
3018GVMMR0DECL(int) GVMMR0ResetStatisticsReq(PGVM pGVM, PGVMMRESETSTATISTICSSREQ pReq, PSUPDRVSESSION pSession)
3019{
3020 /*
3021 * Validate input and pass it on.
3022 */
3023 AssertPtrReturn(pReq, VERR_INVALID_POINTER);
3024 AssertMsgReturn(pReq->Hdr.cbReq == sizeof(*pReq), ("%#x != %#x\n", pReq->Hdr.cbReq, sizeof(*pReq)), VERR_INVALID_PARAMETER);
3025 AssertReturn(pReq->pSession == pSession, VERR_INVALID_PARAMETER);
3026
3027 return GVMMR0ResetStatistics(&pReq->Stats, pSession, pGVM);
3028}
3029