VirtualBox

source: vbox/trunk/src/VBox/HostDrivers/Support/SUPDrvGip.cpp@ 92710

Last change on this file since 92710 was 87700, checked in by vboxsync, 4 years ago

SUPDrv,++: Experimental support for wrapping .r0 modules in native kernel modules on linux, so that perf and similar tools work better. Minor IOC version increase as SUP_IOCTL_LDR_OPEN now support just opening a module w/o preparing the loading. SUPDrv must export all the symbols in g_aFunctions the linux way now, or linux won't see them, so introduced a SUPR0_EXPORT_SYMBOL macro similar to RT_EXPORT_SYMBOL. bugref:9937

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 186.4 KB
Line 
1/* $Id: SUPDrvGip.cpp 87700 2021-02-10 20:21:04Z vboxsync $ */
2/** @file
3 * VBoxDrv - The VirtualBox Support Driver - Common code for GIP.
4 */
5
6/*
7 * Copyright (C) 2006-2020 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*********************************************************************************************************************************
29* Header Files *
30*********************************************************************************************************************************/
31#define LOG_GROUP LOG_GROUP_SUP_DRV
32#define SUPDRV_AGNOSTIC
33#include "SUPDrvInternal.h"
34#ifndef PAGE_SHIFT
35# include <iprt/param.h>
36#endif
37#include <iprt/asm.h>
38#include <iprt/asm-amd64-x86.h>
39#include <iprt/asm-math.h>
40#include <iprt/cpuset.h>
41#include <iprt/handletable.h>
42#include <iprt/mem.h>
43#include <iprt/mp.h>
44#include <iprt/power.h>
45#include <iprt/process.h>
46#include <iprt/semaphore.h>
47#include <iprt/spinlock.h>
48#include <iprt/thread.h>
49#include <iprt/uuid.h>
50#include <iprt/net.h>
51#include <iprt/crc.h>
52#include <iprt/string.h>
53#include <iprt/timer.h>
54#if defined(RT_OS_DARWIN) || defined(RT_OS_SOLARIS) || defined(RT_OS_FREEBSD)
55# include <iprt/rand.h>
56# include <iprt/path.h>
57#endif
58#include <iprt/uint128.h>
59#include <iprt/x86.h>
60
61#include <VBox/param.h>
62#include <VBox/log.h>
63#include <VBox/err.h>
64
65#if defined(RT_OS_SOLARIS) || defined(RT_OS_DARWIN)
66# include "dtrace/SUPDrv.h"
67#else
68/* ... */
69#endif
70
71
72/*********************************************************************************************************************************
73* Defined Constants And Macros *
74*********************************************************************************************************************************/
75/** The frequency by which we recalculate the u32UpdateHz and
76 * u32UpdateIntervalNS GIP members. The value must be a power of 2.
77 *
78 * Warning: Bumping this too high might overflow u32UpdateIntervalNS.
79 */
80#define GIP_UPDATEHZ_RECALC_FREQ 0x800
81
82/** A reserved TSC value used for synchronization as well as measurement of
83 * TSC deltas. */
84#define GIP_TSC_DELTA_RSVD UINT64_MAX
85/** The number of TSC delta measurement loops in total (includes primer and
86 * read-time loops). */
87#define GIP_TSC_DELTA_LOOPS 96
88/** The number of cache primer loops. */
89#define GIP_TSC_DELTA_PRIMER_LOOPS 4
90/** The number of loops until we keep computing the minimum read time. */
91#define GIP_TSC_DELTA_READ_TIME_LOOPS 24
92
93/** The TSC frequency refinement period in seconds.
94 * The timer fires after 200ms, then every second, this value just says when
95 * to stop it after that. */
96#define GIP_TSC_REFINE_PERIOD_IN_SECS 12
97/** The TSC-delta threshold for the SUPGIPUSETSCDELTA_PRACTICALLY_ZERO rating */
98#define GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO 32
99/** The TSC-delta threshold for the SUPGIPUSETSCDELTA_ROUGHLY_ZERO rating */
100#define GIP_TSC_DELTA_THRESHOLD_ROUGHLY_ZERO 448
101/** The TSC delta value for the initial GIP master - 0 in regular builds.
102 * To test the delta code this can be set to a non-zero value. */
103#if 0
104# define GIP_TSC_DELTA_INITIAL_MASTER_VALUE INT64_C(170139095182512) /* 0x00009abd9854acb0 */
105#else
106# define GIP_TSC_DELTA_INITIAL_MASTER_VALUE INT64_C(0)
107#endif
108
/* Compile-time sanity checks on the loop counts above: there must be fewer
 * primer loops than read-time loops, and the two together must be fewer than
 * the total number of measurement loops. */
109AssertCompile(GIP_TSC_DELTA_PRIMER_LOOPS < GIP_TSC_DELTA_READ_TIME_LOOPS);
110AssertCompile(GIP_TSC_DELTA_PRIMER_LOOPS + GIP_TSC_DELTA_READ_TIME_LOOPS < GIP_TSC_DELTA_LOOPS);
111
112/** @def VBOX_SVN_REV
113 * The makefile should define this if it can. */
114#ifndef VBOX_SVN_REV
115# define VBOX_SVN_REV 0
116#endif
117
118#if 0 /* Don't start the GIP timers. Useful when debugging the IPRT timer code. */
119# define DO_NOT_START_GIP
120#endif
121
122
123/*********************************************************************************************************************************
124* Internal Functions *
125*********************************************************************************************************************************/
/* Forward declarations of file-local helpers.  Which set of TSC-delta helpers
 * is declared depends on whether SUPDRV_USE_TSC_DELTA_THREAD is defined. */
126static DECLCALLBACK(void) supdrvGipSyncAndInvariantTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick);
127static DECLCALLBACK(void) supdrvGipAsyncTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick);
128static int supdrvGipSetFlags(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, uint32_t fOrMask, uint32_t fAndMask);
129static void supdrvGipInitCpu(PSUPGLOBALINFOPAGE pGip, PSUPGIPCPU pCpu, uint64_t u64NanoTS, uint64_t uCpuHz);
130static void supdrvTscResetSamples(PSUPDRVDEVEXT pDevExt, bool fClearDeltas);
131#ifdef SUPDRV_USE_TSC_DELTA_THREAD
132static int supdrvTscDeltaThreadInit(PSUPDRVDEVEXT pDevExt);
133static void supdrvTscDeltaTerm(PSUPDRVDEVEXT pDevExt);
134static void supdrvTscDeltaThreadStartMeasurement(PSUPDRVDEVEXT pDevExt, bool fForceAll);
135#else
136static int supdrvTscMeasureInitialDeltas(PSUPDRVDEVEXT pDevExt);
137static int supdrvTscMeasureDeltaOne(PSUPDRVDEVEXT pDevExt, uint32_t idxWorker);
138#endif
139
140
141/*********************************************************************************************************************************
142* Global Variables *
143*********************************************************************************************************************************/
/** The exported global GIP pointer; starts out as NULL and is what SUPGetGIP()
 * returns. */
144DECLEXPORT(PSUPGLOBALINFOPAGE) g_pSUPGlobalInfoPage = NULL;
145SUPR0_EXPORT_SYMBOL(g_pSUPGlobalInfoPage);
146
147
148
149/*
150 *
151 * Misc Common GIP Code
152 * Misc Common GIP Code
153 * Misc Common GIP Code
154 *
155 *
156 */
157
158
159/**
160 * Finds the GIP CPU index corresponding to @a idCpu.
161 *
162 * @returns GIP CPU array index, UINT32_MAX if not found.
163 * @param pGip The GIP.
164 * @param idCpu The CPU ID.
165 */
166static uint32_t supdrvGipFindCpuIndexForCpuId(PSUPGLOBALINFOPAGE pGip, RTCPUID idCpu)
167{
168 uint32_t i;
169 for (i = 0; i < pGip->cCpus; i++)
170 if (pGip->aCPUs[i].idCpu == idCpu)
171 return i;
172 return UINT32_MAX;
173}
174
175
176/**
177 * Gets the APIC ID using the best available method.
178 *
179 * @returns APIC ID.
180 * @param pGip The GIP, for SUPGIPGETCPU_XXX.
181 */
182DECLINLINE(uint32_t) supdrvGipGetApicId(PSUPGLOBALINFOPAGE pGip)
183{
184 if (pGip->fGetGipCpu & SUPGIPGETCPU_APIC_ID_EXT_0B)
185 return ASMGetApicIdExt0B();
186 if (pGip->fGetGipCpu & SUPGIPGETCPU_APIC_ID_EXT_8000001E)
187 return ASMGetApicIdExt8000001E();
188 return ASMGetApicId();
189}
190
191
192/**
193 * Gets the APIC ID using the best available method, slow version.
194 */
195static uint32_t supdrvGipGetApicIdSlow(void)
196{
197 uint32_t const idApic = ASMGetApicId();
198
199 /* The Intel CPU topology leaf: */
200 uint32_t uOther = ASMCpuId_EAX(0);
201 if (uOther >= UINT32_C(0xb) && ASMIsValidStdRange(uOther))
202 {
203 uint32_t uEax = 0;
204 uint32_t uEbx = 0;
205 uint32_t uEcx = 0;
206 uint32_t uEdx = 0;
207#if defined(RT_OS_LINUX) || defined(RT_OS_FREEBSD)
208 ASMCpuId_Idx_ECX(0xb, 0, &uEax, &uEbx, &uEcx, &uEdx);
209#else
210 ASMCpuIdExSlow(0xb, 0, 0, 0, &uEax, &uEbx, &uEcx, &uEdx);
211#endif
212 if ((uEcx >> 8) != 0) /* level type != invalid */
213 {
214 if ((uEdx & 0xff) == idApic)
215 return uEdx;
216 AssertMsgFailed(("ASMGetApicIdExt0B=>%#x idApic=%#x\n", uEdx, idApic));
217 }
218 }
219
220 /* The AMD leaf: */
221 uOther = ASMCpuId_EAX(UINT32_C(0x80000000));
222 if (uOther >= UINT32_C(0x8000001e) && ASMIsValidExtRange(uOther))
223 {
224 uOther = ASMGetApicIdExt8000001E();
225 if ((uOther & 0xff) == idApic)
226 return uOther;
227 AssertMsgFailed(("ASMGetApicIdExt8000001E=>%#x idApic=%#x\n", uOther, idApic));
228 }
229 return idApic;
230}
231
232
233/*
234 *
235 * GIP Mapping and Unmapping Related Code.
236 * GIP Mapping and Unmapping Related Code.
237 * GIP Mapping and Unmapping Related Code.
238 *
239 *
240 */
241
242
243/**
244 * (Re-)initializes the per-cpu structure prior to starting or resuming the GIP
245 * updating.
246 *
247 * @param pGipCpu The per CPU structure for this CPU.
248 * @param u64NanoTS The current time.
249 */
250static void supdrvGipReInitCpu(PSUPGIPCPU pGipCpu, uint64_t u64NanoTS)
251{
252 /*
253 * Here we don't really care about applying the TSC delta. The re-initialization of this
254 * value is not relevant especially while (re)starting the GIP as the first few ones will
255 * be ignored anyway, see supdrvGipDoUpdateCpu().
256 */
257 pGipCpu->u64TSC = ASMReadTSC() - pGipCpu->u32UpdateIntervalTSC;
258 pGipCpu->u64NanoTS = u64NanoTS;
259}
260
261
262/**
263 * Set the current TSC and NanoTS value for the CPU.
264 *
265 * @param idCpu The CPU ID. Unused - we have to use the APIC ID.
266 * @param pvUser1 Pointer to the ring-0 GIP mapping.
267 * @param pvUser2 Pointer to the variable holding the current time.
268 */
269static DECLCALLBACK(void) supdrvGipReInitCpuCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
270{
271 PSUPGLOBALINFOPAGE pGip = (PSUPGLOBALINFOPAGE)pvUser1;
272 uint32_t const idApic = supdrvGipGetApicId(pGip);
273 if (idApic < RT_ELEMENTS(pGip->aiCpuFromApicId))
274 {
275 unsigned const iCpu = pGip->aiCpuFromApicId[idApic];
276
277 if (RT_LIKELY(iCpu < pGip->cCpus && pGip->aCPUs[iCpu].idCpu == idCpu))
278 supdrvGipReInitCpu(&pGip->aCPUs[iCpu], *(uint64_t *)pvUser2);
279 else
280 LogRelMax(64, ("supdrvGipReInitCpuCallback: iCpu=%#x out of bounds (%#zx, idApic=%#x)\n",
281 iCpu, RT_ELEMENTS(pGip->aiCpuFromApicId), idApic));
282 }
283 else
284 LogRelMax(64, ("supdrvGipReInitCpuCallback: idApic=%#x out of bounds (%#zx)\n",
285 idApic, RT_ELEMENTS(pGip->aiCpuFromApicId)));
286
287 NOREF(pvUser2);
288}
289
290
291/**
292 * State structure for supdrvGipDetectGetGipCpuCallback.
293 */
294typedef struct SUPDRVGIPDETECTGETCPU
295{
296 /** Bitmap of APIC IDs that has been seen (initialized to zero).
297 * Used to detect duplicate APIC IDs (paranoia). */
298 uint8_t volatile bmApicId[4096 / 8];
299 /** Mask of supported GIP CPU getter methods (SUPGIPGETCPU_XXX) (all bits set
300 * initially). The callback clears the methods not detected. */
301 uint32_t volatile fSupported;
302 /** The first callback detecting any kind of range issues (initialized to
303 * NIL_RTCPUID). */
304 RTCPUID volatile idCpuProblem;
305} SUPDRVGIPDETECTGETCPU;
306/** Pointer to state structure for supdrvGipDetectGetGipCpuCallback. */
307typedef SUPDRVGIPDETECTGETCPU *PSUPDRVGIPDETECTGETCPU;
308
309
310/**
311 * Checks for alternative ways of getting the CPU ID.
312 *
313 * This also checks the APIC ID, CPU ID and CPU set index values against the
314 * GIP tables.
315 *
316 * @param idCpu The CPU ID. Unused - we have to use the APIC ID.
317 * @param pvUser1 Pointer to the state structure.
318 * @param pvUser2 Pointer to the GIP.
319 */
320static DECLCALLBACK(void) supdrvGipDetectGetGipCpuCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
321{
322 PSUPDRVGIPDETECTGETCPU pState = (PSUPDRVGIPDETECTGETCPU)pvUser1;
323 PSUPGLOBALINFOPAGE pGip = (PSUPGLOBALINFOPAGE)pvUser2;
324 uint32_t fSupported = 0;
325 uint32_t idApic;
326 uint32_t uEax, uEbx, uEcx, uEdx;
327 int iCpuSet;
328 NOREF(pGip);
329
330 AssertMsg(idCpu == RTMpCpuId(), ("idCpu=%#x RTMpCpuId()=%#x\n", idCpu, RTMpCpuId())); /* paranoia^3 */
331
332 /*
333 * Check that the CPU ID and CPU set index are interchangable.
334 */
335 iCpuSet = RTMpCpuIdToSetIndex(idCpu);
336 if ((RTCPUID)iCpuSet == idCpu)
337 {
338 AssertCompile(RT_IS_POWER_OF_TWO(RTCPUSET_MAX_CPUS));
339 if ( iCpuSet >= 0
340 && iCpuSet < RTCPUSET_MAX_CPUS
341 && RT_IS_POWER_OF_TWO(RTCPUSET_MAX_CPUS))
342 {
343 PSUPGIPCPU pGipCpu = SUPGetGipCpuBySetIndex(pGip, iCpuSet);
344
345 /*
346 * Check whether the IDTR.LIMIT contains a CPU number.
347 */
348#ifdef RT_ARCH_X86
349 uint16_t const cbIdt = sizeof(X86DESC64SYSTEM) * 256;
350#else
351 uint16_t const cbIdt = sizeof(X86DESCGATE) * 256;
352#endif
353 RTIDTR Idtr;
354 ASMGetIDTR(&Idtr);
355 if (Idtr.cbIdt >= cbIdt)
356 {
357 uint32_t uTmp = Idtr.cbIdt - cbIdt;
358 uTmp &= RTCPUSET_MAX_CPUS - 1;
359 if (uTmp == idCpu)
360 {
361 RTIDTR Idtr2;
362 ASMGetIDTR(&Idtr2);
363 if (Idtr2.cbIdt == Idtr.cbIdt)
364 fSupported |= SUPGIPGETCPU_IDTR_LIMIT_MASK_MAX_SET_CPUS;
365 }
366 }
367
368 /*
369 * Check whether RDTSCP is an option.
370 */
371 if (ASMHasCpuId())
372 {
373 if ( ASMIsValidExtRange(ASMCpuId_EAX(UINT32_C(0x80000000)))
374 && (ASMCpuId_EDX(UINT32_C(0x80000001)) & X86_CPUID_EXT_FEATURE_EDX_RDTSCP) )
375 {
376 uint32_t uAux;
377 ASMReadTscWithAux(&uAux);
378 if ((uAux & (RTCPUSET_MAX_CPUS - 1)) == idCpu)
379 {
380 ASMNopPause();
381 ASMReadTscWithAux(&uAux);
382 if ((uAux & (RTCPUSET_MAX_CPUS - 1)) == idCpu)
383 fSupported |= SUPGIPGETCPU_RDTSCP_MASK_MAX_SET_CPUS;
384 }
385
386 if (pGipCpu)
387 {
388 uint32_t const uGroupedAux = (uint8_t)pGipCpu->iCpuGroupMember | ((uint32_t)pGipCpu->iCpuGroup << 8);
389 if ( (uAux & UINT16_MAX) == uGroupedAux
390 && pGipCpu->iCpuGroupMember <= UINT8_MAX)
391 {
392 ASMNopPause();
393 ASMReadTscWithAux(&uAux);
394 if ((uAux & UINT16_MAX) == uGroupedAux)
395 fSupported |= SUPGIPGETCPU_RDTSCP_GROUP_IN_CH_NUMBER_IN_CL;
396 }
397 }
398 }
399 }
400 }
401 }
402
403 /*
404 * Check for extended APIC ID methods.
405 */
406 idApic = UINT32_MAX;
407 uEax = ASMCpuId_EAX(0);
408 if (uEax >= UINT32_C(0xb) && ASMIsValidStdRange(uEax))
409 {
410#if defined(RT_OS_LINUX) || defined(RT_OS_FREEBSD)
411 ASMCpuId_Idx_ECX(0xb, 0, &uEax, &uEbx, &uEcx, &uEdx);
412#else
413 ASMCpuIdExSlow(0xb, 0, 0, 0, &uEax, &uEbx, &uEcx, &uEdx);
414#endif
415 if ((uEcx >> 8) != 0) /* level type != invalid */
416 {
417 if (RT_LIKELY( uEdx < RT_ELEMENTS(pGip->aiCpuFromApicId)
418 && !ASMBitTest(pState->bmApicId, uEdx)))
419 {
420 if (uEdx == ASMGetApicIdExt0B())
421 {
422 idApic = uEdx;
423 fSupported |= SUPGIPGETCPU_APIC_ID_EXT_0B;
424 }
425 else
426 AssertMsgFailed(("%#x vs %#x\n", uEdx, ASMGetApicIdExt0B()));
427 }
428 }
429 }
430
431 uEax = ASMCpuId_EAX(UINT32_C(0x80000000));
432 if (uEax >= UINT32_C(0x8000001e) && ASMIsValidExtRange(uEax))
433 {
434#if defined(RT_OS_LINUX) || defined(RT_OS_FREEBSD)
435 ASMCpuId_Idx_ECX(UINT32_C(0x8000001e), 0, &uEax, &uEbx, &uEcx, &uEdx);
436#else
437 ASMCpuIdExSlow(UINT32_C(0x8000001e), 0, 0, 0, &uEax, &uEbx, &uEcx, &uEdx);
438#endif
439 if (uEax || uEbx || uEcx || uEdx)
440 {
441 if (RT_LIKELY( uEax < RT_ELEMENTS(pGip->aiCpuFromApicId)
442 && ( idApic == UINT32_MAX
443 || idApic == uEax)
444 && !ASMBitTest(pState->bmApicId, uEax)))
445 {
446 if (uEax == ASMGetApicIdExt8000001E())
447 {
448 idApic = uEax;
449 fSupported |= SUPGIPGETCPU_APIC_ID_EXT_8000001E;
450 }
451 else
452 AssertMsgFailed(("%#x vs %#x\n", uEax, ASMGetApicIdExt8000001E()));
453 }
454 }
455 }
456
457 /*
458 * Check that the APIC ID is unique.
459 */
460 uEax = ASMGetApicId();
461 if (RT_LIKELY( uEax < RT_ELEMENTS(pGip->aiCpuFromApicId)
462 && ( idApic == UINT32_MAX
463 || idApic == uEax)
464 && !ASMAtomicBitTestAndSet(pState->bmApicId, uEax)))
465 {
466 idApic = uEax;
467 fSupported |= SUPGIPGETCPU_APIC_ID;
468 }
469 else if ( idApic == UINT32_MAX
470 || idApic >= RT_ELEMENTS(pGip->aiCpuFromApicId) /* parnaoia */
471 || ASMAtomicBitTestAndSet(pState->bmApicId, idApic))
472 {
473 AssertCompile(sizeof(pState->bmApicId) * 8 == RT_ELEMENTS(pGip->aiCpuFromApicId));
474 ASMAtomicCmpXchgU32(&pState->idCpuProblem, idCpu, NIL_RTCPUID);
475 LogRel(("supdrvGipDetectGetGipCpuCallback: idCpu=%#x iCpuSet=%d idApic=%#x/%#x - duplicate APIC ID.\n",
476 idCpu, iCpuSet, uEax, idApic));
477 }
478
479 /*
480 * Check that the iCpuSet is within the expected range.
481 */
482 if (RT_UNLIKELY( iCpuSet < 0
483 || (unsigned)iCpuSet >= RTCPUSET_MAX_CPUS
484 || (unsigned)iCpuSet >= RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)))
485 {
486 ASMAtomicCmpXchgU32(&pState->idCpuProblem, idCpu, NIL_RTCPUID);
487 LogRel(("supdrvGipDetectGetGipCpuCallback: idCpu=%#x iCpuSet=%d idApic=%#x - CPU set index is out of range.\n",
488 idCpu, iCpuSet, idApic));
489 }
490 else
491 {
492 RTCPUID idCpu2 = RTMpCpuIdFromSetIndex(iCpuSet);
493 if (RT_UNLIKELY(idCpu2 != idCpu))
494 {
495 ASMAtomicCmpXchgU32(&pState->idCpuProblem, idCpu, NIL_RTCPUID);
496 LogRel(("supdrvGipDetectGetGipCpuCallback: idCpu=%#x iCpuSet=%d idApic=%#x - CPU id/index roundtrip problem: %#x\n",
497 idCpu, iCpuSet, idApic, idCpu2));
498 }
499 }
500
501 /*
502 * Update the supported feature mask before we return.
503 */
504 ASMAtomicAndU32(&pState->fSupported, fSupported);
505
506 NOREF(pvUser2);
507}
508
509
510/**
511 * Increase the timer freqency on hosts where this is possible (NT).
512 *
513 * The idea is that more interrupts is better for us... Also, it's better than
514 * we increase the timer frequence, because we might end up getting inaccurate
515 * callbacks if someone else does it.
516 *
517 * @param pDevExt Sets u32SystemTimerGranularityGrant if increased.
518 */
519static void supdrvGipRequestHigherTimerFrequencyFromSystem(PSUPDRVDEVEXT pDevExt)
520{
521 if (pDevExt->u32SystemTimerGranularityGrant == 0)
522 {
523 uint32_t u32SystemResolution;
524 if ( RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 976563 /* 1024 HZ */, &u32SystemResolution))
525 || RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 1000000 /* 1000 HZ */, &u32SystemResolution))
526 || RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 1953125 /* 512 HZ */, &u32SystemResolution))
527 || RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 2000000 /* 500 HZ */, &u32SystemResolution))
528 )
529 {
530#if 0 /* def VBOX_STRICT - this is somehow triggers bogus assertions on windows 10 */
531 uint32_t u32After = RTTimerGetSystemGranularity();
532 AssertMsg(u32After <= u32SystemResolution, ("u32After=%u u32SystemResolution=%u\n", u32After, u32SystemResolution));
533#endif
534 pDevExt->u32SystemTimerGranularityGrant = u32SystemResolution;
535 }
536 }
537}
538
539
540/**
541 * Undoes supdrvGipRequestHigherTimerFrequencyFromSystem.
542 *
543 * @param pDevExt Clears u32SystemTimerGranularityGrant.
544 */
545static void supdrvGipReleaseHigherTimerFrequencyFromSystem(PSUPDRVDEVEXT pDevExt)
546{
547 if (pDevExt->u32SystemTimerGranularityGrant)
548 {
549 int rc2 = RTTimerReleaseSystemGranularity(pDevExt->u32SystemTimerGranularityGrant);
550 AssertRC(rc2);
551 pDevExt->u32SystemTimerGranularityGrant = 0;
552 }
553}
554
555
556/**
557 * Maps the GIP into userspace and/or get the physical address of the GIP.
558 *
559 * @returns IPRT status code.
560 * @param pSession Session to which the GIP mapping should belong.
561 * @param ppGipR3 Where to store the address of the ring-3 mapping. (optional)
562 * @param pHCPhysGip Where to store the physical address. (optional)
563 *
564 * @remark There is no reference counting on the mapping, so one call to this function
565 * count globally as one reference. One call to SUPR0GipUnmap() is will unmap GIP
566 * and remove the session as a GIP user.
567 */
568SUPR0DECL(int) SUPR0GipMap(PSUPDRVSESSION pSession, PRTR3PTR ppGipR3, PRTHCPHYS pHCPhysGip)
569{
570 int rc;
571 PSUPDRVDEVEXT pDevExt = pSession->pDevExt;
572 RTR3PTR pGipR3 = NIL_RTR3PTR;
573 RTHCPHYS HCPhys = NIL_RTHCPHYS;
574 LogFlow(("SUPR0GipMap: pSession=%p ppGipR3=%p pHCPhysGip=%p\n", pSession, ppGipR3, pHCPhysGip));
575
576 /*
577 * Validate
578 */
579 AssertReturn(SUP_IS_SESSION_VALID(pSession), VERR_INVALID_PARAMETER);
580 AssertPtrNullReturn(ppGipR3, VERR_INVALID_POINTER);
581 AssertPtrNullReturn(pHCPhysGip, VERR_INVALID_POINTER);
582
583#ifdef SUPDRV_USE_MUTEX_FOR_GIP
584 RTSemMutexRequest(pDevExt->mtxGip, RT_INDEFINITE_WAIT);
585#else
586 RTSemFastMutexRequest(pDevExt->mtxGip);
587#endif
588 if (pDevExt->pGip)
589 {
590 /*
591 * Map it?
592 */
593 rc = VINF_SUCCESS;
594 if (ppGipR3)
595 {
596 if (pSession->GipMapObjR3 == NIL_RTR0MEMOBJ)
597 rc = RTR0MemObjMapUser(&pSession->GipMapObjR3, pDevExt->GipMemObj, (RTR3PTR)-1, 0,
598 RTMEM_PROT_READ, NIL_RTR0PROCESS);
599 if (RT_SUCCESS(rc))
600 pGipR3 = RTR0MemObjAddressR3(pSession->GipMapObjR3);
601 }
602
603 /*
604 * Get physical address.
605 */
606 if (pHCPhysGip && RT_SUCCESS(rc))
607 HCPhys = pDevExt->HCPhysGip;
608
609 /*
610 * Reference globally.
611 */
612 if (!pSession->fGipReferenced && RT_SUCCESS(rc))
613 {
614 pSession->fGipReferenced = 1;
615 pDevExt->cGipUsers++;
616 if (pDevExt->cGipUsers == 1)
617 {
618 PSUPGLOBALINFOPAGE pGipR0 = pDevExt->pGip;
619 uint64_t u64NanoTS;
620
621 /*
622 * GIP starts/resumes updating again. On windows we bump the
623 * host timer frequency to make sure we don't get stuck in guest
624 * mode and to get better timer (and possibly clock) accuracy.
625 */
626 LogFlow(("SUPR0GipMap: Resumes GIP updating\n"));
627
628 supdrvGipRequestHigherTimerFrequencyFromSystem(pDevExt);
629
630 /*
631 * document me
632 */
633 if (pGipR0->aCPUs[0].u32TransactionId != 2 /* not the first time */)
634 {
635 unsigned i;
636 for (i = 0; i < pGipR0->cCpus; i++)
637 ASMAtomicUoWriteU32(&pGipR0->aCPUs[i].u32TransactionId,
638 (pGipR0->aCPUs[i].u32TransactionId + GIP_UPDATEHZ_RECALC_FREQ * 2)
639 & ~(GIP_UPDATEHZ_RECALC_FREQ * 2 - 1));
640 ASMAtomicWriteU64(&pGipR0->u64NanoTSLastUpdateHz, 0);
641 }
642
643 /*
644 * document me
645 */
646 u64NanoTS = RTTimeSystemNanoTS() - pGipR0->u32UpdateIntervalNS;
647 if ( pGipR0->u32Mode == SUPGIPMODE_INVARIANT_TSC
648 || pGipR0->u32Mode == SUPGIPMODE_SYNC_TSC
649 || RTMpGetOnlineCount() == 1)
650 supdrvGipReInitCpu(&pGipR0->aCPUs[0], u64NanoTS);
651 else
652 RTMpOnAll(supdrvGipReInitCpuCallback, pGipR0, &u64NanoTS);
653
654 /*
655 * Detect alternative ways to figure the CPU ID in ring-3 and
656 * raw-mode context. Check the sanity of the APIC IDs, CPU IDs,
657 * and CPU set indexes while we're at it.
658 */
659 if (RT_SUCCESS(rc))
660 {
661 PSUPDRVGIPDETECTGETCPU pDetectState = (PSUPDRVGIPDETECTGETCPU)RTMemTmpAllocZ(sizeof(*pDetectState));
662 if (pDetectState)
663 {
664 pDetectState->fSupported = UINT32_MAX;
665 pDetectState->idCpuProblem = NIL_RTCPUID;
666 rc = RTMpOnAll(supdrvGipDetectGetGipCpuCallback, pDetectState, pGipR0);
667 if (pDetectState->idCpuProblem == NIL_RTCPUID)
668 {
669 if ( pDetectState->fSupported != UINT32_MAX
670 && pDetectState->fSupported != 0)
671 {
672 if (pGipR0->fGetGipCpu != pDetectState->fSupported)
673 {
674 pGipR0->fGetGipCpu = pDetectState->fSupported;
675 LogRel(("SUPR0GipMap: fGetGipCpu=%#x\n", pDetectState->fSupported));
676 }
677 }
678 else
679 {
680 LogRel(("SUPR0GipMap: No supported ways of getting the APIC ID or CPU number in ring-3! (%#x)\n",
681 pDetectState->fSupported));
682 rc = VERR_UNSUPPORTED_CPU;
683 }
684 }
685 else
686 {
687 LogRel(("SUPR0GipMap: APIC ID, CPU ID or CPU set index problem detected on CPU #%u (%#x)!\n",
688 pDetectState->idCpuProblem, pDetectState->idCpuProblem));
689 rc = VERR_INVALID_CPU_ID;
690 }
691 RTMemTmpFree(pDetectState);
692 }
693 else
694 rc = VERR_NO_TMP_MEMORY;
695 }
696
697 /*
698 * Start the GIP timer if all is well..
699 */
700 if (RT_SUCCESS(rc))
701 {
702#ifndef DO_NOT_START_GIP
703 rc = RTTimerStart(pDevExt->pGipTimer, 0 /* fire ASAP */); AssertRC(rc);
704#endif
705 rc = VINF_SUCCESS;
706 }
707
708 /*
709 * Bail out on error.
710 */
711 if (RT_FAILURE(rc))
712 {
713 LogRel(("SUPR0GipMap: failed rc=%Rrc\n", rc));
714 pDevExt->cGipUsers = 0;
715 pSession->fGipReferenced = 0;
716 if (pSession->GipMapObjR3 != NIL_RTR0MEMOBJ)
717 {
718 int rc2 = RTR0MemObjFree(pSession->GipMapObjR3, false); AssertRC(rc2);
719 if (RT_SUCCESS(rc2))
720 pSession->GipMapObjR3 = NIL_RTR0MEMOBJ;
721 }
722 HCPhys = NIL_RTHCPHYS;
723 pGipR3 = NIL_RTR3PTR;
724 }
725 }
726 }
727 }
728 else
729 {
730 rc = VERR_GENERAL_FAILURE;
731 Log(("SUPR0GipMap: GIP is not available!\n"));
732 }
733#ifdef SUPDRV_USE_MUTEX_FOR_GIP
734 RTSemMutexRelease(pDevExt->mtxGip);
735#else
736 RTSemFastMutexRelease(pDevExt->mtxGip);
737#endif
738
739 /*
740 * Write returns.
741 */
742 if (pHCPhysGip)
743 *pHCPhysGip = HCPhys;
744 if (ppGipR3)
745 *ppGipR3 = pGipR3;
746
747#ifdef DEBUG_DARWIN_GIP
748 OSDBGPRINT(("SUPR0GipMap: returns %d *pHCPhysGip=%lx pGipR3=%p\n", rc, (unsigned long)HCPhys, (void *)pGipR3));
749#else
750 LogFlow(( "SUPR0GipMap: returns %d *pHCPhysGip=%lx pGipR3=%p\n", rc, (unsigned long)HCPhys, (void *)pGipR3));
751#endif
752 return rc;
753}
754SUPR0_EXPORT_SYMBOL(SUPR0GipMap);
755
756
757/**
758 * Unmaps any user mapping of the GIP and terminates all GIP access
759 * from this session.
760 *
761 * @returns IPRT status code.
762 * @param pSession Session to which the GIP mapping should belong.
763 */
764SUPR0DECL(int) SUPR0GipUnmap(PSUPDRVSESSION pSession)
765{
766 int rc = VINF_SUCCESS;
767 PSUPDRVDEVEXT pDevExt = pSession->pDevExt;
768#ifdef DEBUG_DARWIN_GIP
769 OSDBGPRINT(("SUPR0GipUnmap: pSession=%p pGip=%p GipMapObjR3=%p\n",
770 pSession,
771 pSession->GipMapObjR3 != NIL_RTR0MEMOBJ ? RTR0MemObjAddress(pSession->GipMapObjR3) : NULL,
772 pSession->GipMapObjR3));
773#else
774 LogFlow(("SUPR0GipUnmap: pSession=%p\n", pSession));
775#endif
776 AssertReturn(SUP_IS_SESSION_VALID(pSession), VERR_INVALID_PARAMETER);
777
778#ifdef SUPDRV_USE_MUTEX_FOR_GIP
779 RTSemMutexRequest(pDevExt->mtxGip, RT_INDEFINITE_WAIT);
780#else
781 RTSemFastMutexRequest(pDevExt->mtxGip);
782#endif
783
784 /*
785 * GIP test-mode session?
786 */
787 if ( pSession->fGipTestMode
788 && pDevExt->pGip)
789 {
790 supdrvGipSetFlags(pDevExt, pSession, 0, ~SUPGIP_FLAGS_TESTING_ENABLE);
791 Assert(!pSession->fGipTestMode);
792 }
793
794 /*
795 * Unmap anything?
796 */
797 if (pSession->GipMapObjR3 != NIL_RTR0MEMOBJ)
798 {
799 rc = RTR0MemObjFree(pSession->GipMapObjR3, false);
800 AssertRC(rc);
801 if (RT_SUCCESS(rc))
802 pSession->GipMapObjR3 = NIL_RTR0MEMOBJ;
803 }
804
805 /*
806 * Dereference global GIP.
807 */
808 if (pSession->fGipReferenced && !rc)
809 {
810 pSession->fGipReferenced = 0;
811 if ( pDevExt->cGipUsers > 0
812 && !--pDevExt->cGipUsers)
813 {
814 LogFlow(("SUPR0GipUnmap: Suspends GIP updating\n"));
815#ifndef DO_NOT_START_GIP
816 rc = RTTimerStop(pDevExt->pGipTimer); AssertRC(rc); rc = VINF_SUCCESS;
817#endif
818 supdrvGipReleaseHigherTimerFrequencyFromSystem(pDevExt);
819 }
820 }
821
822#ifdef SUPDRV_USE_MUTEX_FOR_GIP
823 RTSemMutexRelease(pDevExt->mtxGip);
824#else
825 RTSemFastMutexRelease(pDevExt->mtxGip);
826#endif
827
828 return rc;
829}
830SUPR0_EXPORT_SYMBOL(SUPR0GipUnmap);
831
832
833/**
834 * Gets the GIP pointer.
835 *
836 * @returns Pointer to the GIP or NULL.
837 */
838SUPDECL(PSUPGLOBALINFOPAGE) SUPGetGIP(void)
839{
840 return g_pSUPGlobalInfoPage;
841}
842
843
844
845
846
847/*
848 *
849 *
850 * GIP Initialization, Termination and CPU Offline / Online Related Code.
851 * GIP Initialization, Termination and CPU Offline / Online Related Code.
852 * GIP Initialization, Termination and CPU Offline / Online Related Code.
853 *
854 *
855 */
856
857/**
858 * Used by supdrvGipInitRefineInvariantTscFreqTimer and supdrvGipInitMeasureTscFreq
859 * to update the TSC frequency related GIP variables.
860 *
861 * @param pGip The GIP.
862 * @param nsElapsed The number of nanoseconds elapsed.
863 * @param cElapsedTscTicks The corresponding number of TSC ticks.
864 * @param iTick The tick number for debugging.
865 */
866static void supdrvGipInitSetCpuFreq(PSUPGLOBALINFOPAGE pGip, uint64_t nsElapsed, uint64_t cElapsedTscTicks, uint32_t iTick)
867{
868 /*
869 * Calculate the frequency.
870 */
871 uint64_t uCpuHz;
872 if ( cElapsedTscTicks < UINT64_MAX / RT_NS_1SEC
873 && nsElapsed < UINT32_MAX)
874 uCpuHz = ASMMultU64ByU32DivByU32(cElapsedTscTicks, RT_NS_1SEC, (uint32_t)nsElapsed);
875 else
876 {
877 RTUINT128U CpuHz, Tmp, Divisor;
878 CpuHz.s.Lo = CpuHz.s.Hi = 0;
879 RTUInt128MulU64ByU64(&Tmp, cElapsedTscTicks, RT_NS_1SEC_64);
880 RTUInt128Div(&CpuHz, &Tmp, RTUInt128AssignU64(&Divisor, nsElapsed));
881 uCpuHz = CpuHz.s.Lo;
882 }
883
884 /*
885 * Update the GIP.
886 */
887 ASMAtomicWriteU64(&pGip->u64CpuHz, uCpuHz);
888 if (pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
889 {
890 ASMAtomicWriteU64(&pGip->aCPUs[0].u64CpuHz, uCpuHz);
891
892 /* For inspecting the frequency calcs using tstGIP-2, debugger or similar. */
893 if (iTick + 1 < pGip->cCpus)
894 ASMAtomicWriteU64(&pGip->aCPUs[iTick + 1].u64CpuHz, uCpuHz);
895 }
896}
897
898
899/**
900 * Timer callback function for TSC frequency refinement in invariant GIP mode.
901 *
902 * This is started during driver init and fires once
903 * GIP_TSC_REFINE_PERIOD_IN_SECS seconds later.
904 *
905 * @param pTimer The timer.
906 * @param pvUser Opaque pointer to the device instance data.
907 * @param iTick The timer tick.
908 */
909static DECLCALLBACK(void) supdrvGipInitRefineInvariantTscFreqTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
910{
911 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
912 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
913 RTCPUID idCpu;
914 uint64_t cNsElapsed;
915 uint64_t cTscTicksElapsed;
916 uint64_t nsNow;
917 uint64_t uTsc;
918 RTCCUINTREG fEFlags;
919
920 /* Paranoia. */
921 AssertReturnVoid(pGip);
922 AssertReturnVoid(pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC);
923
924 /*
925 * If we got a power event, stop the refinement process.
926 */
927 if (pDevExt->fInvTscRefinePowerEvent)
928 {
929 int rc = RTTimerStop(pTimer); AssertRC(rc);
930 return;
931 }
932
933 /*
934 * Read the TSC and time, noting which CPU we are on.
935 *
936 * Don't bother spinning until RTTimeSystemNanoTS changes, since on
937 * systems where it matters we're in a context where we cannot waste that
938 * much time (DPC watchdog, called from clock interrupt).
939 */
940 fEFlags = ASMIntDisableFlags();
941 uTsc = ASMReadTSC();
942 nsNow = RTTimeSystemNanoTS();
943 idCpu = RTMpCpuId();
944 ASMSetFlags(fEFlags);
945
946 cNsElapsed = nsNow - pDevExt->nsStartInvarTscRefine;
947 cTscTicksElapsed = uTsc - pDevExt->uTscStartInvarTscRefine;
948
949 /*
950 * If the above measurement was taken on a different CPU than the one we
951 * started the process on, cTscTicksElapsed will need to be adjusted with
952 * the TSC deltas of both the CPUs.
953 *
954 * We ASSUME that the delta calculation process takes less time than the
955 * TSC frequency refinement timer. If it doesn't, we'll complain and
956 * drop the frequency refinement.
957 *
958 * Note! We cannot entirely trust enmUseTscDelta here because it's
959 * downgraded after each delta calculation.
960 */
961 if ( idCpu != pDevExt->idCpuInvarTscRefine
962 && pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
963 {
964 uint32_t iStartCpuSet = RTMpCpuIdToSetIndex(pDevExt->idCpuInvarTscRefine);
965 uint32_t iStopCpuSet = RTMpCpuIdToSetIndex(idCpu);
966 uint16_t iStartGipCpu = iStartCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
967 ? pGip->aiCpuFromCpuSetIdx[iStartCpuSet] : UINT16_MAX;
968 uint16_t iStopGipCpu = iStopCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
969 ? pGip->aiCpuFromCpuSetIdx[iStopCpuSet] : UINT16_MAX;
970 int64_t iStartTscDelta = iStartGipCpu < pGip->cCpus ? pGip->aCPUs[iStartGipCpu].i64TSCDelta : INT64_MAX;
971 int64_t iStopTscDelta = iStopGipCpu < pGip->cCpus ? pGip->aCPUs[iStopGipCpu].i64TSCDelta : INT64_MAX;
972 if (RT_LIKELY(iStartTscDelta != INT64_MAX && iStopTscDelta != INT64_MAX))
973 {
974 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_PRACTICALLY_ZERO)
975 {
976 /* cTscTicksElapsed = (uTsc - iStopTscDelta) - (pDevExt->uTscStartInvarTscRefine - iStartTscDelta); */
977 cTscTicksElapsed += iStartTscDelta - iStopTscDelta;
978 }
979 }
980 /*
981 * Allow 5 times the refinement period to elapse before we give up on the TSC delta
982 * calculations.
983 */
984 else if (cNsElapsed > GIP_TSC_REFINE_PERIOD_IN_SECS * 5 * RT_NS_1SEC_64)
985 {
986 SUPR0Printf("vboxdrv: Failed to refine invariant TSC frequency because deltas are unavailable after %u (%u) seconds\n",
987 (uint32_t)(cNsElapsed / RT_NS_1SEC), GIP_TSC_REFINE_PERIOD_IN_SECS);
988 SUPR0Printf("vboxdrv: start: %u, %u, %#llx stop: %u, %u, %#llx\n",
989 iStartCpuSet, iStartGipCpu, iStartTscDelta, iStopCpuSet, iStopGipCpu, iStopTscDelta);
990 int rc = RTTimerStop(pTimer); AssertRC(rc);
991 return;
992 }
993 }
994
995 /*
996 * Calculate and update the CPU frequency variables in GIP.
997 *
998 * If there is a GIP user already and we've already refined the frequency
999 * a couple of times, don't update it as we want a stable frequency value
1000 * for all VMs.
1001 */
1002 if ( pDevExt->cGipUsers == 0
1003 || cNsElapsed < RT_NS_1SEC * 2)
1004 {
1005 supdrvGipInitSetCpuFreq(pGip, cNsElapsed, cTscTicksElapsed, (uint32_t)iTick);
1006
1007 /*
1008 * Stop the timer once we've reached the defined refinement period.
1009 */
1010 if (cNsElapsed > GIP_TSC_REFINE_PERIOD_IN_SECS * RT_NS_1SEC_64)
1011 {
1012 int rc = RTTimerStop(pTimer);
1013 AssertRC(rc);
1014 }
1015 }
1016 else
1017 {
1018 int rc = RTTimerStop(pTimer);
1019 AssertRC(rc);
1020 }
1021}
1022
1023
1024/**
1025 * @callback_method_impl{FNRTPOWERNOTIFICATION}
1026 */
1027static DECLCALLBACK(void) supdrvGipPowerNotificationCallback(RTPOWEREVENT enmEvent, void *pvUser)
1028{
1029 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
1030 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
1031
1032 /*
1033 * If the TSC frequency refinement timer is running, we need to cancel it so it
1034 * doesn't screw up the frequency after a long suspend.
1035 *
1036 * Recalculate all TSC-deltas on host resume as it may have changed, seen
1037 * on Windows 7 running on the Dell Optiplex Intel Core i5-3570.
1038 */
1039 if (enmEvent == RTPOWEREVENT_RESUME)
1040 {
1041 ASMAtomicWriteBool(&pDevExt->fInvTscRefinePowerEvent, true);
1042 if ( RT_LIKELY(pGip)
1043 && pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED
1044 && !supdrvOSAreCpusOfflinedOnSuspend())
1045 {
1046#ifdef SUPDRV_USE_TSC_DELTA_THREAD
1047 supdrvTscDeltaThreadStartMeasurement(pDevExt, true /* fForceAll */);
1048#else
1049 RTCpuSetCopy(&pDevExt->TscDeltaCpuSet, &pGip->OnlineCpuSet);
1050 supdrvTscMeasureInitialDeltas(pDevExt);
1051#endif
1052 }
1053 }
1054 else if (enmEvent == RTPOWEREVENT_SUSPEND)
1055 ASMAtomicWriteBool(&pDevExt->fInvTscRefinePowerEvent, true);
1056}
1057
1058
1059/**
1060 * Start the TSC-frequency refinment timer for the invariant TSC GIP mode.
1061 *
1062 * We cannot use this in the synchronous and asynchronous tsc GIP modes because
1063 * the CPU may change the TSC frequence between now and when the timer fires
1064 * (supdrvInitAsyncRefineTscTimer).
1065 *
1066 * @param pDevExt Pointer to the device instance data.
1067 */
1068static void supdrvGipInitStartTimerForRefiningInvariantTscFreq(PSUPDRVDEVEXT pDevExt)
1069{
1070 uint64_t u64NanoTS;
1071 RTCCUINTREG fEFlags;
1072 int rc;
1073
1074 /*
1075 * Register a power management callback.
1076 */
1077 pDevExt->fInvTscRefinePowerEvent = false;
1078 rc = RTPowerNotificationRegister(supdrvGipPowerNotificationCallback, pDevExt);
1079 AssertRC(rc); /* ignore */
1080
1081 /*
1082 * Record the TSC and NanoTS as the starting anchor point for refinement
1083 * of the TSC. We try get as close to a clock tick as possible on systems
1084 * which does not provide high resolution time.
1085 */
1086 u64NanoTS = RTTimeSystemNanoTS();
1087 while (RTTimeSystemNanoTS() == u64NanoTS)
1088 ASMNopPause();
1089
1090 fEFlags = ASMIntDisableFlags();
1091 pDevExt->uTscStartInvarTscRefine = ASMReadTSC();
1092 pDevExt->nsStartInvarTscRefine = RTTimeSystemNanoTS();
1093 pDevExt->idCpuInvarTscRefine = RTMpCpuId();
1094 ASMSetFlags(fEFlags);
1095
1096 /*
1097 * Create a timer that runs on the same CPU so we won't have a depencency
1098 * on the TSC-delta and can run in parallel to it. On systems that does not
1099 * implement CPU specific timers we'll apply deltas in the timer callback,
1100 * just like we do for CPUs going offline.
1101 *
1102 * The longer the refinement interval the better the accuracy, at least in
1103 * theory. If it's too long though, ring-3 may already be starting its
1104 * first VMs before we're done. On most systems we will be loading the
1105 * support driver during boot and VMs won't be started for a while yet,
1106 * it is really only a problem during development (especially with
1107 * on-demand driver starting on windows).
1108 *
1109 * To avoid wasting time doing a long supdrvGipInitMeasureTscFreq() call
1110 * to calculate the frequency during driver loading, the timer is set
1111 * to fire after 200 ms the first time. It will then reschedule itself
1112 * to fire every second until GIP_TSC_REFINE_PERIOD_IN_SECS has been
1113 * reached or it notices that there is a user land client with GIP
1114 * mapped (we want a stable frequency for all VMs).
1115 */
1116 rc = RTTimerCreateEx(&pDevExt->pInvarTscRefineTimer, RT_NS_1SEC,
1117 RTTIMER_FLAGS_CPU(RTMpCpuIdToSetIndex(pDevExt->idCpuInvarTscRefine)),
1118 supdrvGipInitRefineInvariantTscFreqTimer, pDevExt);
1119 if (RT_SUCCESS(rc))
1120 {
1121 rc = RTTimerStart(pDevExt->pInvarTscRefineTimer, 2*RT_NS_100MS);
1122 if (RT_SUCCESS(rc))
1123 return;
1124 RTTimerDestroy(pDevExt->pInvarTscRefineTimer);
1125 }
1126
1127 if (rc == VERR_CPU_OFFLINE || rc == VERR_NOT_SUPPORTED)
1128 {
1129 rc = RTTimerCreateEx(&pDevExt->pInvarTscRefineTimer, RT_NS_1SEC, RTTIMER_FLAGS_CPU_ANY,
1130 supdrvGipInitRefineInvariantTscFreqTimer, pDevExt);
1131 if (RT_SUCCESS(rc))
1132 {
1133 rc = RTTimerStart(pDevExt->pInvarTscRefineTimer, 2*RT_NS_100MS);
1134 if (RT_SUCCESS(rc))
1135 return;
1136 RTTimerDestroy(pDevExt->pInvarTscRefineTimer);
1137 }
1138 }
1139
1140 pDevExt->pInvarTscRefineTimer = NULL;
1141 OSDBGPRINT(("vboxdrv: Failed to create or start TSC frequency refinement timer: rc=%Rrc\n", rc));
1142}
1143
1144
1145/**
1146 * @callback_method_impl{PFNRTMPWORKER,
1147 * RTMpOnSpecific callback for reading TSC and time on the CPU we started
1148 * the measurements on.}
1149 */
1150static DECLCALLBACK(void) supdrvGipInitReadTscAndNanoTsOnCpu(RTCPUID idCpu, void *pvUser1, void *pvUser2)
1151{
1152 RTCCUINTREG fEFlags = ASMIntDisableFlags();
1153 uint64_t *puTscStop = (uint64_t *)pvUser1;
1154 uint64_t *pnsStop = (uint64_t *)pvUser2;
1155 RT_NOREF1(idCpu);
1156
1157 *puTscStop = ASMReadTSC();
1158 *pnsStop = RTTimeSystemNanoTS();
1159
1160 ASMSetFlags(fEFlags);
1161}
1162
1163
1164/**
1165 * Measures the TSC frequency of the system.
1166 *
1167 * The TSC frequency can vary on systems which are not reported as invariant.
1168 * On such systems the object of this function is to find out what the nominal,
1169 * maximum TSC frequency under 'normal' CPU operation.
1170 *
1171 * @returns VBox status code.
1172 * @param pGip Pointer to the GIP.
1173 * @param fRough Set if we're doing the rough calculation that the
1174 * TSC measuring code needs, where accuracy isn't all
1175 * that important (too high is better than too low).
1176 * When clear we try for best accuracy that we can
1177 * achieve in reasonably short time.
1178 */
1179static int supdrvGipInitMeasureTscFreq(PSUPGLOBALINFOPAGE pGip, bool fRough)
1180{
1181 uint32_t nsTimerIncr = RTTimerGetSystemGranularity();
1182 int cTriesLeft = fRough ? 4 : 2;
1183 while (cTriesLeft-- > 0)
1184 {
1185 RTCCUINTREG fEFlags;
1186 uint64_t nsStart;
1187 uint64_t nsStop;
1188 uint64_t uTscStart;
1189 uint64_t uTscStop;
1190 RTCPUID idCpuStart;
1191 RTCPUID idCpuStop;
1192
1193 /*
1194 * Synchronize with the host OS clock tick on systems without high
1195 * resolution time API (older Windows version for example).
1196 */
1197 nsStart = RTTimeSystemNanoTS();
1198 while (RTTimeSystemNanoTS() == nsStart)
1199 ASMNopPause();
1200
1201 /*
1202 * Read the TSC and current time, noting which CPU we're on.
1203 */
1204 fEFlags = ASMIntDisableFlags();
1205 uTscStart = ASMReadTSC();
1206 nsStart = RTTimeSystemNanoTS();
1207 idCpuStart = RTMpCpuId();
1208 ASMSetFlags(fEFlags);
1209
1210 /*
1211 * Delay for a while.
1212 */
1213 if (pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC)
1214 {
1215 /*
1216 * Sleep-wait since the TSC frequency is constant, it eases host load.
1217 * Shorter interval produces more variance in the frequency (esp. Windows).
1218 */
1219 uint64_t msElapsed = 0;
1220 uint64_t msDelay = ( ((fRough ? 16 : 200) * RT_NS_1MS + nsTimerIncr - 1) / nsTimerIncr * nsTimerIncr - RT_NS_100US )
1221 / RT_NS_1MS;
1222 do
1223 {
1224 RTThreadSleep((RTMSINTERVAL)(msDelay - msElapsed));
1225 nsStop = RTTimeSystemNanoTS();
1226 msElapsed = (nsStop - nsStart) / RT_NS_1MS;
1227 } while (msElapsed < msDelay);
1228
1229 while (RTTimeSystemNanoTS() == nsStop)
1230 ASMNopPause();
1231 }
1232 else
1233 {
1234 /*
1235 * Busy-wait keeping the frequency up.
1236 */
1237 do
1238 {
1239 ASMNopPause();
1240 nsStop = RTTimeSystemNanoTS();
1241 } while (nsStop - nsStart < RT_NS_100MS);
1242 }
1243
1244 /*
1245 * Read the TSC and time again.
1246 */
1247 fEFlags = ASMIntDisableFlags();
1248 uTscStop = ASMReadTSC();
1249 nsStop = RTTimeSystemNanoTS();
1250 idCpuStop = RTMpCpuId();
1251 ASMSetFlags(fEFlags);
1252
1253 /*
1254 * If the CPU changes, things get a bit complicated and what we
1255 * can get away with depends on the GIP mode / TSC reliability.
1256 */
1257 if (idCpuStop != idCpuStart)
1258 {
1259 bool fDoXCall = false;
1260
1261 /*
1262 * Synchronous TSC mode: we're probably fine as it's unlikely
1263 * that we were rescheduled because of TSC throttling or power
1264 * management reasons, so just go ahead.
1265 */
1266 if (pGip->u32Mode == SUPGIPMODE_SYNC_TSC)
1267 {
1268 /* Probably ok, maybe we should retry once?. */
1269 Assert(pGip->enmUseTscDelta == SUPGIPUSETSCDELTA_NOT_APPLICABLE);
1270 }
1271 /*
1272 * If we're just doing the rough measurement, do the cross call and
1273 * get on with things (we don't have deltas!).
1274 */
1275 else if (fRough)
1276 fDoXCall = true;
1277 /*
1278 * Invariant TSC mode: It doesn't matter if we have delta available
1279 * for both CPUs. That is not something we can assume at this point.
1280 *
1281 * Note! We cannot necessarily trust enmUseTscDelta here because it's
1282 * downgraded after each delta calculation and the delta
1283 * calculations may not be complete yet.
1284 */
1285 else if (pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC)
1286 {
1287/** @todo This section of code is never reached atm, consider dropping it later on... */
1288 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
1289 {
1290 uint32_t iStartCpuSet = RTMpCpuIdToSetIndex(idCpuStart);
1291 uint32_t iStopCpuSet = RTMpCpuIdToSetIndex(idCpuStop);
1292 uint16_t iStartGipCpu = iStartCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
1293 ? pGip->aiCpuFromCpuSetIdx[iStartCpuSet] : UINT16_MAX;
1294 uint16_t iStopGipCpu = iStopCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
1295 ? pGip->aiCpuFromCpuSetIdx[iStopCpuSet] : UINT16_MAX;
1296 int64_t iStartTscDelta = iStartGipCpu < pGip->cCpus ? pGip->aCPUs[iStartGipCpu].i64TSCDelta : INT64_MAX;
1297 int64_t iStopTscDelta = iStopGipCpu < pGip->cCpus ? pGip->aCPUs[iStopGipCpu].i64TSCDelta : INT64_MAX;
1298 if (RT_LIKELY(iStartTscDelta != INT64_MAX && iStopTscDelta != INT64_MAX))
1299 {
1300 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_PRACTICALLY_ZERO)
1301 {
1302 uTscStart -= iStartTscDelta;
1303 uTscStop -= iStopTscDelta;
1304 }
1305 }
1306 /*
1307 * Invalid CPU indexes are not caused by online/offline races, so
1308 * we have to trigger driver load failure if that happens as GIP
1309 * and IPRT assumptions are busted on this system.
1310 */
1311 else if (iStopGipCpu >= pGip->cCpus || iStartGipCpu >= pGip->cCpus)
1312 {
1313 SUPR0Printf("vboxdrv: Unexpected CPU index in supdrvGipInitMeasureTscFreq.\n");
1314 SUPR0Printf("vboxdrv: start: %u, %u, %#llx stop: %u, %u, %#llx\n",
1315 iStartCpuSet, iStartGipCpu, iStartTscDelta, iStopCpuSet, iStopGipCpu, iStopTscDelta);
1316 return VERR_INVALID_CPU_INDEX;
1317 }
1318 /*
1319 * No valid deltas. We retry, if we're on our last retry
1320 * we do the cross call instead just to get a result. The
1321 * frequency will be refined in a few seconds anyway.
1322 */
1323 else if (cTriesLeft > 0)
1324 continue;
1325 else
1326 fDoXCall = true;
1327 }
1328 }
1329 /*
1330 * Asynchronous TSC mode: This is bad, as the reason we usually
1331 * use this mode is to deal with variable TSC frequencies and
1332 * deltas. So, we need to get the TSC from the same CPU as
1333 * started it, we also need to keep that CPU busy. So, retry
1334 * and fall back to the cross call on the last attempt.
1335 */
1336 else
1337 {
1338 Assert(pGip->u32Mode == SUPGIPMODE_ASYNC_TSC);
1339 if (cTriesLeft > 0)
1340 continue;
1341 fDoXCall = true;
1342 }
1343
1344 if (fDoXCall)
1345 {
1346 /*
1347 * Try read the TSC and timestamp on the start CPU.
1348 */
1349 int rc = RTMpOnSpecific(idCpuStart, supdrvGipInitReadTscAndNanoTsOnCpu, &uTscStop, &nsStop);
1350 if (RT_FAILURE(rc) && (!fRough || cTriesLeft > 0))
1351 continue;
1352 }
1353 }
1354
1355 /*
1356 * Calculate the TSC frequency and update it (shared with the refinement timer).
1357 */
1358 supdrvGipInitSetCpuFreq(pGip, nsStop - nsStart, uTscStop - uTscStart, 0);
1359 return VINF_SUCCESS;
1360 }
1361
1362 Assert(!fRough);
1363 return VERR_SUPDRV_TSC_FREQ_MEASUREMENT_FAILED;
1364}
1365
1366
1367/**
1368 * Finds our (@a idCpu) entry, or allocates a new one if not found.
1369 *
1370 * @returns Index of the CPU in the cache set.
1371 * @param pGip The GIP.
1372 * @param idCpu The CPU ID.
1373 */
1374static uint32_t supdrvGipFindOrAllocCpuIndexForCpuId(PSUPGLOBALINFOPAGE pGip, RTCPUID idCpu)
1375{
1376 uint32_t i, cTries;
1377
1378 /*
1379 * ASSUMES that CPU IDs are constant.
1380 */
1381 for (i = 0; i < pGip->cCpus; i++)
1382 if (pGip->aCPUs[i].idCpu == idCpu)
1383 return i;
1384
1385 cTries = 0;
1386 do
1387 {
1388 for (i = 0; i < pGip->cCpus; i++)
1389 {
1390 bool fRc;
1391 ASMAtomicCmpXchgSize(&pGip->aCPUs[i].idCpu, idCpu, NIL_RTCPUID, fRc);
1392 if (fRc)
1393 return i;
1394 }
1395 } while (cTries++ < 32);
1396 AssertReleaseFailed();
1397 return i - 1;
1398}
1399
1400
1401/**
1402 * The calling CPU should be accounted as online, update GIP accordingly.
1403 *
1404 * This is used by supdrvGipCreate() as well as supdrvGipMpEvent().
1405 *
1406 * @param pDevExt The device extension.
1407 * @param idCpu The CPU ID.
1408 */
1409static void supdrvGipMpEventOnlineOrInitOnCpu(PSUPDRVDEVEXT pDevExt, RTCPUID idCpu)
1410{
1411 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
1412 int iCpuSet = 0;
1413 uint32_t idApic;
1414 uint32_t i = 0;
1415 uint64_t u64NanoTS = 0;
1416
1417 AssertPtrReturnVoid(pGip);
1418 Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD));
1419 AssertRelease(idCpu == RTMpCpuId());
1420 Assert(pGip->cPossibleCpus == RTMpGetCount());
1421
1422 /*
1423 * Do this behind a spinlock with interrupts disabled as this can fire
1424 * on all CPUs simultaneously, see @bugref{6110}.
1425 */
1426 RTSpinlockAcquire(pDevExt->hGipSpinlock);
1427
1428 /*
1429 * Update the globals.
1430 */
1431 ASMAtomicWriteU16(&pGip->cPresentCpus, RTMpGetPresentCount());
1432 ASMAtomicWriteU16(&pGip->cOnlineCpus, RTMpGetOnlineCount());
1433 iCpuSet = RTMpCpuIdToSetIndex(idCpu);
1434 if (iCpuSet >= 0)
1435 {
1436 Assert(RTCpuSetIsMemberByIndex(&pGip->PossibleCpuSet, iCpuSet));
1437 RTCpuSetAddByIndex(&pGip->OnlineCpuSet, iCpuSet);
1438 RTCpuSetAddByIndex(&pGip->PresentCpuSet, iCpuSet);
1439 }
1440
1441 /*
1442 * Update the entry.
1443 */
1444 u64NanoTS = RTTimeSystemNanoTS() - pGip->u32UpdateIntervalNS;
1445 i = supdrvGipFindOrAllocCpuIndexForCpuId(pGip, idCpu);
1446
1447 supdrvGipInitCpu(pGip, &pGip->aCPUs[i], u64NanoTS, pGip->u64CpuHz);
1448
1449 idApic = supdrvGipGetApicIdSlow();
1450 ASMAtomicWriteU16(&pGip->aCPUs[i].idApic, idApic);
1451 ASMAtomicWriteS16(&pGip->aCPUs[i].iCpuSet, (int16_t)iCpuSet);
1452 ASMAtomicWriteSize(&pGip->aCPUs[i].idCpu, idCpu);
1453
1454 pGip->aCPUs[i].iCpuGroup = 0;
1455 pGip->aCPUs[i].iCpuGroupMember = iCpuSet;
1456#ifdef RT_OS_WINDOWS
1457 supdrvOSGipInitGroupBitsForCpu(pDevExt, pGip, &pGip->aCPUs[i]);
1458#endif
1459
1460 /*
1461 * Update the APIC ID and CPU set index mappings.
1462 */
1463 if (idApic < RT_ELEMENTS(pGip->aiCpuFromApicId))
1464 ASMAtomicWriteU16(&pGip->aiCpuFromApicId[idApic], i);
1465 else
1466 LogRelMax(64, ("supdrvGipMpEventOnlineOrInitOnCpu: idApic=%#x is out of bounds (%#zx, i=%u, iCpuSet=%d)\n",
1467 idApic, RT_ELEMENTS(pGip->aiCpuFromApicId), i, iCpuSet));
1468 if ((unsigned)iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx))
1469 ASMAtomicWriteU16(&pGip->aiCpuFromCpuSetIdx[iCpuSet], i);
1470 else
1471 LogRelMax(64, ("supdrvGipMpEventOnlineOrInitOnCpu: iCpuSet=%d is out of bounds (%#zx, i=%u, idApic=%d)\n",
1472 iCpuSet, RT_ELEMENTS(pGip->aiCpuFromApicId), i, idApic));
1473
1474 /* Add this CPU to this set of CPUs we need to calculate the TSC-delta for. */
1475 RTCpuSetAddByIndex(&pDevExt->TscDeltaCpuSet, RTMpCpuIdToSetIndex(idCpu));
1476
1477 /* Update the Mp online/offline counter. */
1478 ASMAtomicIncU32(&pDevExt->cMpOnOffEvents);
1479
1480 /* Commit it. */
1481 ASMAtomicWriteSize(&pGip->aCPUs[i].enmState, SUPGIPCPUSTATE_ONLINE);
1482
1483 RTSpinlockRelease(pDevExt->hGipSpinlock);
1484}
1485
1486
1487/**
1488 * RTMpOnSpecific callback wrapper for supdrvGipMpEventOnlineOrInitOnCpu().
1489 *
1490 * @param idCpu The CPU ID we are running on.
1491 * @param pvUser1 Opaque pointer to the device instance data.
1492 * @param pvUser2 Not used.
1493 */
1494static DECLCALLBACK(void) supdrvGipMpEventOnlineCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
1495{
1496 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser1;
1497 NOREF(pvUser2);
1498 supdrvGipMpEventOnlineOrInitOnCpu(pDevExt, idCpu);
1499}
1500
1501
1502/**
1503 * The CPU should be accounted as offline, update the GIP accordingly.
1504 *
1505 * This is used by supdrvGipMpEvent.
1506 *
1507 * @param pDevExt The device extension.
1508 * @param idCpu The CPU ID.
1509 */
1510static void supdrvGipMpEventOffline(PSUPDRVDEVEXT pDevExt, RTCPUID idCpu)
1511{
1512 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
1513 int iCpuSet;
1514 unsigned i;
1515
1516 AssertPtrReturnVoid(pGip);
1517 RTSpinlockAcquire(pDevExt->hGipSpinlock);
1518
1519 iCpuSet = RTMpCpuIdToSetIndex(idCpu);
1520 AssertReturnVoid(iCpuSet >= 0);
1521
1522 i = pGip->aiCpuFromCpuSetIdx[iCpuSet];
1523 AssertReturnVoid(i < pGip->cCpus);
1524 AssertReturnVoid(pGip->aCPUs[i].idCpu == idCpu);
1525
1526 Assert(RTCpuSetIsMemberByIndex(&pGip->PossibleCpuSet, iCpuSet));
1527 RTCpuSetDelByIndex(&pGip->OnlineCpuSet, iCpuSet);
1528
1529 /* Update the Mp online/offline counter. */
1530 ASMAtomicIncU32(&pDevExt->cMpOnOffEvents);
1531
1532 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
1533 {
1534 /* Reset the TSC delta, we will recalculate it lazily. */
1535 ASMAtomicWriteS64(&pGip->aCPUs[i].i64TSCDelta, INT64_MAX);
1536 /* Remove this CPU from the set of CPUs that we have obtained the TSC deltas. */
1537 RTCpuSetDelByIndex(&pDevExt->TscDeltaObtainedCpuSet, iCpuSet);
1538 }
1539
1540 /* Commit it. */
1541 ASMAtomicWriteSize(&pGip->aCPUs[i].enmState, SUPGIPCPUSTATE_OFFLINE);
1542
1543 RTSpinlockRelease(pDevExt->hGipSpinlock);
1544}
1545
1546
1547/**
1548 * Multiprocessor event notification callback.
1549 *
1550 * This is used to make sure that the GIP master gets passed on to
1551 * another CPU. It also updates the associated CPU data.
1552 *
1553 * @param enmEvent The event.
1554 * @param idCpu The cpu it applies to.
1555 * @param pvUser Pointer to the device extension.
1556 */
1557static DECLCALLBACK(void) supdrvGipMpEvent(RTMPEVENT enmEvent, RTCPUID idCpu, void *pvUser)
1558{
1559 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
1560 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
1561
1562 if (pGip)
1563 {
1564 RTTHREADPREEMPTSTATE PreemptState = RTTHREADPREEMPTSTATE_INITIALIZER;
1565 switch (enmEvent)
1566 {
1567 case RTMPEVENT_ONLINE:
1568 {
1569 RTThreadPreemptDisable(&PreemptState);
1570 if (idCpu == RTMpCpuId())
1571 {
1572 supdrvGipMpEventOnlineOrInitOnCpu(pDevExt, idCpu);
1573 RTThreadPreemptRestore(&PreemptState);
1574 }
1575 else
1576 {
1577 RTThreadPreemptRestore(&PreemptState);
1578 RTMpOnSpecific(idCpu, supdrvGipMpEventOnlineCallback, pDevExt, NULL /* pvUser2 */);
1579 }
1580
1581 /*
1582 * Recompute TSC-delta for the newly online'd CPU.
1583 */
1584 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
1585 {
1586#ifdef SUPDRV_USE_TSC_DELTA_THREAD
1587 supdrvTscDeltaThreadStartMeasurement(pDevExt, false /* fForceAll */);
1588#else
1589 uint32_t iCpu = supdrvGipFindOrAllocCpuIndexForCpuId(pGip, idCpu);
1590 supdrvTscMeasureDeltaOne(pDevExt, iCpu);
1591#endif
1592 }
1593 break;
1594 }
1595
1596 case RTMPEVENT_OFFLINE:
1597 supdrvGipMpEventOffline(pDevExt, idCpu);
1598 break;
1599 }
1600 }
1601
1602 /*
1603 * Make sure there is a master GIP.
1604 */
1605 if (enmEvent == RTMPEVENT_OFFLINE)
1606 {
1607 RTCPUID idGipMaster = ASMAtomicReadU32(&pDevExt->idGipMaster);
1608 if (idGipMaster == idCpu)
1609 {
1610 /*
1611 * The GIP master is going offline, find a new one.
1612 */
1613 bool fIgnored;
1614 unsigned i;
1615 RTCPUID idNewGipMaster = NIL_RTCPUID;
1616 RTCPUSET OnlineCpus;
1617 RTMpGetOnlineSet(&OnlineCpus);
1618
1619 for (i = 0; i < RTCPUSET_MAX_CPUS; i++)
1620 if (RTCpuSetIsMemberByIndex(&OnlineCpus, i))
1621 {
1622 RTCPUID idCurCpu = RTMpCpuIdFromSetIndex(i);
1623 if (idCurCpu != idGipMaster)
1624 {
1625 idNewGipMaster = idCurCpu;
1626 break;
1627 }
1628 }
1629
1630 Log(("supdrvGipMpEvent: Gip master %#lx -> %#lx\n", (long)idGipMaster, (long)idNewGipMaster));
1631 ASMAtomicCmpXchgSize(&pDevExt->idGipMaster, idNewGipMaster, idGipMaster, fIgnored);
1632 NOREF(fIgnored);
1633 }
1634 }
1635}
1636
1637
1638/**
1639 * On CPU initialization callback for RTMpOnAll.
1640 *
1641 * @param idCpu The CPU ID.
1642 * @param pvUser1 The device extension.
1643 * @param pvUser2 The GIP.
1644 */
1645static DECLCALLBACK(void) supdrvGipInitOnCpu(RTCPUID idCpu, void *pvUser1, void *pvUser2)
1646{
1647 /* This is good enough, even though it will update some of the globals a
1648 bit to much. */
1649 supdrvGipMpEventOnlineOrInitOnCpu((PSUPDRVDEVEXT)pvUser1, idCpu);
1650 NOREF(pvUser2);
1651}
1652
1653
1654/**
1655 * Callback used by supdrvDetermineAsyncTSC to read the TSC on a CPU.
1656 *
1657 * @param idCpu Ignored.
1658 * @param pvUser1 Where to put the TSC.
1659 * @param pvUser2 Ignored.
1660 */
1661static DECLCALLBACK(void) supdrvGipInitDetermineAsyncTscWorker(RTCPUID idCpu, void *pvUser1, void *pvUser2)
1662{
1663 Assert(RTMpCpuIdToSetIndex(idCpu) == (intptr_t)pvUser2);
1664 ASMAtomicWriteU64((uint64_t volatile *)pvUser1, ASMReadTSC());
1665 RT_NOREF2(idCpu, pvUser2);
1666}
1667
1668
1669/**
1670 * Determine if Async GIP mode is required because of TSC drift.
1671 *
1672 * When using the default/normal timer code it is essential that the time stamp counter
1673 * (TSC) runs never backwards, that is, a read operation to the counter should return
1674 * a bigger value than any previous read operation. This is guaranteed by the latest
1675 * AMD CPUs and by newer Intel CPUs which never enter the C2 state (P4). In any other
1676 * case we have to choose the asynchronous timer mode.
1677 *
1678 * @param poffMin Pointer to the determined difference between different
1679 * cores (optional, can be NULL).
1680 * @return false if the time stamp counters appear to be synchronized, true otherwise.
1681 */
1682static bool supdrvGipInitDetermineAsyncTsc(uint64_t *poffMin)
1683{
1684 /*
1685 * Just iterate all the cpus 8 times and make sure that the TSC is
1686 * ever increasing. We don't bother taking TSC rollover into account.
1687 */
1688 int iEndCpu = RTMpGetArraySize();
1689 int iCpu;
1690 int cLoops = 8;
1691 bool fAsync = false;
1692 int rc = VINF_SUCCESS;
1693 uint64_t offMax = 0;
1694 uint64_t offMin = ~(uint64_t)0;
1695 uint64_t PrevTsc = ASMReadTSC();
1696
1697 while (cLoops-- > 0)
1698 {
1699 for (iCpu = 0; iCpu < iEndCpu; iCpu++)
1700 {
1701 uint64_t CurTsc;
1702 rc = RTMpOnSpecific(RTMpCpuIdFromSetIndex(iCpu), supdrvGipInitDetermineAsyncTscWorker,
1703 &CurTsc, (void *)(uintptr_t)iCpu);
1704 if (RT_SUCCESS(rc))
1705 {
1706 if (CurTsc <= PrevTsc)
1707 {
1708 fAsync = true;
1709 offMin = offMax = PrevTsc - CurTsc;
1710 Log(("supdrvGipInitDetermineAsyncTsc: iCpu=%d cLoops=%d CurTsc=%llx PrevTsc=%llx\n",
1711 iCpu, cLoops, CurTsc, PrevTsc));
1712 break;
1713 }
1714
1715 /* Gather statistics (except the first time). */
1716 if (iCpu != 0 || cLoops != 7)
1717 {
1718 uint64_t off = CurTsc - PrevTsc;
1719 if (off < offMin)
1720 offMin = off;
1721 if (off > offMax)
1722 offMax = off;
1723 Log2(("%d/%d: off=%llx\n", cLoops, iCpu, off));
1724 }
1725
1726 /* Next */
1727 PrevTsc = CurTsc;
1728 }
1729 else if (rc == VERR_NOT_SUPPORTED)
1730 break;
1731 else
1732 AssertMsg(rc == VERR_CPU_NOT_FOUND || rc == VERR_CPU_OFFLINE, ("%d\n", rc));
1733 }
1734
1735 /* broke out of the loop. */
1736 if (iCpu < iEndCpu)
1737 break;
1738 }
1739
1740 if (poffMin)
1741 *poffMin = offMin; /* Almost RTMpOnSpecific profiling. */
1742 Log(("supdrvGipInitDetermineAsyncTsc: returns %d; iEndCpu=%d rc=%d offMin=%llx offMax=%llx\n",
1743 fAsync, iEndCpu, rc, offMin, offMax));
1744#if !defined(RT_OS_SOLARIS) && !defined(RT_OS_OS2) && !defined(RT_OS_WINDOWS)
1745 OSDBGPRINT(("vboxdrv: fAsync=%d offMin=%#lx offMax=%#lx\n", fAsync, (long)offMin, (long)offMax));
1746#endif
1747 return fAsync;
1748}
1749
1750
1751/**
1752 * supdrvGipInit() worker that determines the GIP TSC mode.
1753 *
1754 * @returns The most suitable TSC mode.
1755 * @param pDevExt Pointer to the device instance data.
1756 */
1757static SUPGIPMODE supdrvGipInitDetermineTscMode(PSUPDRVDEVEXT pDevExt)
1758{
1759 uint64_t u64DiffCoresIgnored;
1760 uint32_t uEAX, uEBX, uECX, uEDX;
1761
1762 /*
1763 * Establish whether the CPU advertises TSC as invariant, we need that in
1764 * a couple of places below.
1765 */
1766 bool fInvariantTsc = false;
1767 if (ASMHasCpuId())
1768 {
1769 uEAX = ASMCpuId_EAX(0x80000000);
1770 if (ASMIsValidExtRange(uEAX) && uEAX >= 0x80000007)
1771 {
1772 uEDX = ASMCpuId_EDX(0x80000007);
1773 if (uEDX & X86_CPUID_AMD_ADVPOWER_EDX_TSCINVAR)
1774 fInvariantTsc = true;
1775 }
1776 }
1777
1778 /*
1779 * On single CPU systems, we don't need to consider ASYNC mode.
1780 */
1781 if (RTMpGetCount() <= 1)
1782 return fInvariantTsc ? SUPGIPMODE_INVARIANT_TSC : SUPGIPMODE_SYNC_TSC;
1783
1784 /*
1785 * Allow the user and/or OS specific bits to force async mode.
1786 */
1787 if (supdrvOSGetForcedAsyncTscMode(pDevExt))
1788 return SUPGIPMODE_ASYNC_TSC;
1789
1790 /*
1791 * Use invariant mode if the CPU says TSC is invariant.
1792 */
1793 if (fInvariantTsc)
1794 return SUPGIPMODE_INVARIANT_TSC;
1795
1796 /*
1797 * TSC is not invariant and we're on SMP, this presents two problems:
1798 *
1799 * (1) There might be a skew between the CPU, so that cpu0
1800 * returns a TSC that is slightly different from cpu1.
1801 * This screw may be due to (2), bad TSC initialization
1802 * or slightly different TSC rates.
1803 *
1804 * (2) Power management (and other things) may cause the TSC
1805 * to run at a non-constant speed, and cause the speed
1806 * to be different on the cpus. This will result in (1).
1807 *
1808 * If any of the above is detected, we will have to use ASYNC mode.
1809 */
1810 /* (1). Try check for current differences between the cpus. */
1811 if (supdrvGipInitDetermineAsyncTsc(&u64DiffCoresIgnored))
1812 return SUPGIPMODE_ASYNC_TSC;
1813
1814 /* (2) If it's an AMD CPU with power management, we won't trust its TSC. */
1815 ASMCpuId(0, &uEAX, &uEBX, &uECX, &uEDX);
1816 if ( ASMIsValidStdRange(uEAX)
1817 && (ASMIsAmdCpuEx(uEBX, uECX, uEDX) || ASMIsHygonCpuEx(uEBX, uECX, uEDX)) )
1818 {
1819 /* Check for APM support. */
1820 uEAX = ASMCpuId_EAX(0x80000000);
1821 if (ASMIsValidExtRange(uEAX) && uEAX >= 0x80000007)
1822 {
1823 uEDX = ASMCpuId_EDX(0x80000007);
1824 if (uEDX & 0x3e) /* STC|TM|THERMTRIP|VID|FID. Ignore TS. */
1825 return SUPGIPMODE_ASYNC_TSC;
1826 }
1827 }
1828
1829 return SUPGIPMODE_SYNC_TSC;
1830}
1831
1832
1833/**
1834 * Initializes per-CPU GIP information.
1835 *
1836 * @param pGip Pointer to the GIP.
1837 * @param pCpu Pointer to which GIP CPU to initialize.
1838 * @param u64NanoTS The current nanosecond timestamp.
1839 * @param uCpuHz The CPU frequency to set, 0 if the caller doesn't know.
1840 */
1841static void supdrvGipInitCpu(PSUPGLOBALINFOPAGE pGip, PSUPGIPCPU pCpu, uint64_t u64NanoTS, uint64_t uCpuHz)
1842{
1843 pCpu->u32TransactionId = 2;
1844 pCpu->u64NanoTS = u64NanoTS;
1845 pCpu->u64TSC = ASMReadTSC();
1846 pCpu->u64TSCSample = GIP_TSC_DELTA_RSVD;
1847 pCpu->i64TSCDelta = pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED ? INT64_MAX : 0;
1848
1849 ASMAtomicWriteSize(&pCpu->enmState, SUPGIPCPUSTATE_INVALID);
1850 ASMAtomicWriteU32(&pCpu->idCpu, NIL_RTCPUID);
1851 ASMAtomicWriteS16(&pCpu->iCpuSet, -1);
1852 ASMAtomicWriteU16(&pCpu->iCpuGroup, 0);
1853 ASMAtomicWriteU16(&pCpu->iCpuGroupMember, UINT16_MAX);
1854 ASMAtomicWriteU16(&pCpu->idApic, UINT16_MAX);
1855 ASMAtomicWriteU32(&pCpu->iReservedForNumaNode, 0);
1856
1857 /*
1858 * The first time we're called, we don't have a CPU frequency handy,
1859 * so pretend it's a 4 GHz CPU. On CPUs that are online, we'll get
1860 * called again and at that point we have a more plausible CPU frequency
1861 * value handy. The frequency history will also be adjusted again on
1862 * the 2nd timer callout (maybe we can skip that now?).
1863 */
1864 if (!uCpuHz)
1865 {
1866 pCpu->u64CpuHz = _4G - 1;
1867 pCpu->u32UpdateIntervalTSC = (uint32_t)((_4G - 1) / pGip->u32UpdateHz);
1868 }
1869 else
1870 {
1871 pCpu->u64CpuHz = uCpuHz;
1872 pCpu->u32UpdateIntervalTSC = (uint32_t)(uCpuHz / pGip->u32UpdateHz);
1873 }
1874 pCpu->au32TSCHistory[0]
1875 = pCpu->au32TSCHistory[1]
1876 = pCpu->au32TSCHistory[2]
1877 = pCpu->au32TSCHistory[3]
1878 = pCpu->au32TSCHistory[4]
1879 = pCpu->au32TSCHistory[5]
1880 = pCpu->au32TSCHistory[6]
1881 = pCpu->au32TSCHistory[7]
1882 = pCpu->u32UpdateIntervalTSC;
1883}
1884
1885
1886/**
1887 * Initializes the GIP data.
1888 *
1889 * @returns VBox status code.
1890 * @param pDevExt Pointer to the device instance data.
1891 * @param pGip Pointer to the read-write kernel mapping of the GIP.
1892 * @param HCPhys The physical address of the GIP.
1893 * @param u64NanoTS The current nanosecond timestamp.
1894 * @param uUpdateHz The update frequency.
1895 * @param uUpdateIntervalNS The update interval in nanoseconds.
1896 * @param cCpus The CPU count.
1897 * @param cbGipCpuGroups The supdrvOSGipGetGroupTableSize return value we
1898 * used when allocating the GIP structure.
1899 */
1900static int supdrvGipInit(PSUPDRVDEVEXT pDevExt, PSUPGLOBALINFOPAGE pGip, RTHCPHYS HCPhys,
1901 uint64_t u64NanoTS, unsigned uUpdateHz, unsigned uUpdateIntervalNS,
1902 unsigned cCpus, size_t cbGipCpuGroups)
1903{
1904 size_t const cbGip = RT_ALIGN_Z(RT_UOFFSETOF_DYN(SUPGLOBALINFOPAGE, aCPUs[cCpus]) + cbGipCpuGroups, PAGE_SIZE);
1905 unsigned i;
1906#ifdef DEBUG_DARWIN_GIP
1907 OSDBGPRINT(("supdrvGipInit: pGip=%p HCPhys=%lx u64NanoTS=%llu uUpdateHz=%d cCpus=%u\n", pGip, (long)HCPhys, u64NanoTS, uUpdateHz, cCpus));
1908#else
1909 LogFlow(("supdrvGipInit: pGip=%p HCPhys=%lx u64NanoTS=%llu uUpdateHz=%d cCpus=%u\n", pGip, (long)HCPhys, u64NanoTS, uUpdateHz, cCpus));
1910#endif
1911
1912 /*
1913 * Initialize the structure.
1914 */
1915 memset(pGip, 0, cbGip);
1916
1917 pGip->u32Magic = SUPGLOBALINFOPAGE_MAGIC;
1918 pGip->u32Version = SUPGLOBALINFOPAGE_VERSION;
1919 pGip->u32Mode = supdrvGipInitDetermineTscMode(pDevExt);
1920 if ( pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC
1921 /*|| pGip->u32Mode == SUPGIPMODE_SYNC_TSC */)
1922 pGip->enmUseTscDelta = supdrvOSAreTscDeltasInSync() /* Allow OS override (windows). */
1923 ? SUPGIPUSETSCDELTA_ZERO_CLAIMED : SUPGIPUSETSCDELTA_PRACTICALLY_ZERO /* downgrade later */;
1924 else
1925 pGip->enmUseTscDelta = SUPGIPUSETSCDELTA_NOT_APPLICABLE;
1926 pGip->cCpus = (uint16_t)cCpus;
1927 pGip->cPages = (uint16_t)(cbGip / PAGE_SIZE);
1928 pGip->u32UpdateHz = uUpdateHz;
1929 pGip->u32UpdateIntervalNS = uUpdateIntervalNS;
1930 pGip->fGetGipCpu = SUPGIPGETCPU_APIC_ID;
1931 RTCpuSetEmpty(&pGip->OnlineCpuSet);
1932 RTCpuSetEmpty(&pGip->PresentCpuSet);
1933 RTMpGetSet(&pGip->PossibleCpuSet);
1934 pGip->cOnlineCpus = RTMpGetOnlineCount();
1935 pGip->cPresentCpus = RTMpGetPresentCount();
1936 pGip->cPossibleCpus = RTMpGetCount();
1937 pGip->cPossibleCpuGroups = 1;
1938 pGip->idCpuMax = RTMpGetMaxCpuId();
1939 for (i = 0; i < RT_ELEMENTS(pGip->aiCpuFromApicId); i++)
1940 pGip->aiCpuFromApicId[i] = UINT16_MAX;
1941 for (i = 0; i < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx); i++)
1942 pGip->aiCpuFromCpuSetIdx[i] = UINT16_MAX;
1943 for (i = 0; i < RT_ELEMENTS(pGip->aoffCpuGroup); i++)
1944 pGip->aoffCpuGroup[i] = UINT32_MAX;
1945 for (i = 0; i < cCpus; i++)
1946 supdrvGipInitCpu(pGip, &pGip->aCPUs[i], u64NanoTS, 0 /*uCpuHz*/);
1947#ifdef RT_OS_WINDOWS
1948 int rc = supdrvOSInitGipGroupTable(pDevExt, pGip, cbGipCpuGroups);
1949 AssertRCReturn(rc, rc);
1950#endif
1951
1952 /*
1953 * Link it to the device extension.
1954 */
1955 pDevExt->pGip = pGip;
1956 pDevExt->HCPhysGip = HCPhys;
1957 pDevExt->cGipUsers = 0;
1958
1959 return VINF_SUCCESS;
1960}
1961
1962
1963/**
1964 * Creates the GIP.
1965 *
1966 * @returns VBox status code.
1967 * @param pDevExt Instance data. GIP stuff may be updated.
1968 */
1969int VBOXCALL supdrvGipCreate(PSUPDRVDEVEXT pDevExt)
1970{
1971 PSUPGLOBALINFOPAGE pGip;
1972 size_t cbGip;
1973 size_t cbGipCpuGroups;
1974 RTHCPHYS HCPhysGip;
1975 uint32_t u32SystemResolution;
1976 uint32_t u32Interval;
1977 uint32_t u32MinInterval;
1978 uint32_t uMod;
1979 unsigned cCpus;
1980 int rc;
1981
1982 LogFlow(("supdrvGipCreate:\n"));
1983
1984 /*
1985 * Assert order.
1986 */
1987 Assert(pDevExt->u32SystemTimerGranularityGrant == 0);
1988 Assert(pDevExt->GipMemObj == NIL_RTR0MEMOBJ);
1989 Assert(!pDevExt->pGipTimer);
1990#ifdef SUPDRV_USE_MUTEX_FOR_GIP
1991 Assert(pDevExt->mtxGip != NIL_RTSEMMUTEX);
1992 Assert(pDevExt->mtxTscDelta != NIL_RTSEMMUTEX);
1993#else
1994 Assert(pDevExt->mtxGip != NIL_RTSEMFASTMUTEX);
1995 Assert(pDevExt->mtxTscDelta != NIL_RTSEMFASTMUTEX);
1996#endif
1997
1998 /*
1999 * Check the CPU count.
2000 */
2001 cCpus = RTMpGetArraySize();
2002 if (cCpus > RT_MIN(RTCPUSET_MAX_CPUS, RT_ELEMENTS(pGip->aiCpuFromApicId)))
2003 {
2004 SUPR0Printf("VBoxDrv: Too many CPUs (%u) for the GIP (max %u)\n", cCpus, RT_MIN(RTCPUSET_MAX_CPUS, RT_ELEMENTS(pGip->aiCpuFromApicId)));
2005 return VERR_TOO_MANY_CPUS;
2006 }
2007
2008 /*
2009 * Allocate a contiguous set of pages with a default kernel mapping.
2010 */
2011#ifdef RT_OS_WINDOWS
2012 cbGipCpuGroups = supdrvOSGipGetGroupTableSize(pDevExt);
2013#else
2014 cbGipCpuGroups = 0;
2015#endif
2016 cbGip = RT_UOFFSETOF_DYN(SUPGLOBALINFOPAGE, aCPUs[cCpus]) + cbGipCpuGroups;
2017 rc = RTR0MemObjAllocCont(&pDevExt->GipMemObj, cbGip, false /*fExecutable*/);
2018 if (RT_FAILURE(rc))
2019 {
2020 OSDBGPRINT(("supdrvGipCreate: failed to allocate the GIP page. rc=%d\n", rc));
2021 return rc;
2022 }
2023 pGip = (PSUPGLOBALINFOPAGE)RTR0MemObjAddress(pDevExt->GipMemObj); AssertPtr(pGip);
2024 HCPhysGip = RTR0MemObjGetPagePhysAddr(pDevExt->GipMemObj, 0); Assert(HCPhysGip != NIL_RTHCPHYS);
2025
2026 /*
2027 * Find a reasonable update interval and initialize the structure.
2028 */
2029 supdrvGipRequestHigherTimerFrequencyFromSystem(pDevExt);
2030 /** @todo figure out why using a 100Ms interval upsets timekeeping in VMs.
2031 * See @bugref{6710}. */
2032 u32MinInterval = RT_NS_10MS;
2033 u32SystemResolution = RTTimerGetSystemGranularity();
2034 u32Interval = u32MinInterval;
2035 uMod = u32MinInterval % u32SystemResolution;
2036 if (uMod)
2037 u32Interval += u32SystemResolution - uMod;
2038
2039 rc = supdrvGipInit(pDevExt, pGip, HCPhysGip, RTTimeSystemNanoTS(), RT_NS_1SEC / u32Interval /*=Hz*/, u32Interval,
2040 cCpus, cbGipCpuGroups);
2041
2042 /*
2043 * Important sanity check... (Sets rc)
2044 */
2045 if (RT_UNLIKELY( pGip->enmUseTscDelta == SUPGIPUSETSCDELTA_ZERO_CLAIMED
2046 && pGip->u32Mode == SUPGIPMODE_ASYNC_TSC
2047 && !supdrvOSGetForcedAsyncTscMode(pDevExt)))
2048 {
2049 OSDBGPRINT(("supdrvGipCreate: Host-OS/user claims the TSC-deltas are zero but we detected async. TSC! Bad.\n"));
2050 rc = VERR_INTERNAL_ERROR_2;
2051 }
2052
2053 /* It doesn't make sense to do TSC-delta detection on systems we detect as async. */
2054 AssertStmt( pGip->u32Mode != SUPGIPMODE_ASYNC_TSC
2055 || pGip->enmUseTscDelta <= SUPGIPUSETSCDELTA_ZERO_CLAIMED,
2056 rc = VERR_INTERNAL_ERROR_3);
2057
2058 /*
2059 * Do the TSC frequency measurements.
2060 *
2061 * If we're in invariant TSC mode, just to a quick preliminary measurement
2062 * that the TSC-delta measurement code can use to yield cross calls.
2063 *
2064 * If we're in any of the other two modes, neither which require MP init,
2065 * notifications or deltas for the job, do the full measurement now so
2066 * that supdrvGipInitOnCpu() can populate the TSC interval and history
2067 * array with more reasonable values.
2068 */
2069 if (RT_SUCCESS(rc))
2070 {
2071 if (pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC)
2072 {
2073 rc = supdrvGipInitMeasureTscFreq(pGip, true /*fRough*/); /* cannot fail */
2074 supdrvGipInitStartTimerForRefiningInvariantTscFreq(pDevExt);
2075 }
2076 else
2077 rc = supdrvGipInitMeasureTscFreq(pGip, false /*fRough*/);
2078 if (RT_SUCCESS(rc))
2079 {
2080 /*
2081 * Start TSC-delta measurement thread before we start getting MP
2082 * events that will try kick it into action (includes the
2083 * RTMpOnAll/supdrvGipInitOnCpu call below).
2084 */
2085 RTCpuSetEmpty(&pDevExt->TscDeltaCpuSet);
2086 RTCpuSetEmpty(&pDevExt->TscDeltaObtainedCpuSet);
2087#ifdef SUPDRV_USE_TSC_DELTA_THREAD
2088 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
2089 rc = supdrvTscDeltaThreadInit(pDevExt);
2090#endif
2091 if (RT_SUCCESS(rc))
2092 {
2093 rc = RTMpNotificationRegister(supdrvGipMpEvent, pDevExt);
2094 if (RT_SUCCESS(rc))
2095 {
2096 /*
2097 * Do GIP initialization on all online CPUs. Wake up the
2098 * TSC-delta thread afterwards.
2099 */
2100 rc = RTMpOnAll(supdrvGipInitOnCpu, pDevExt, pGip);
2101 if (RT_SUCCESS(rc))
2102 {
2103#ifdef SUPDRV_USE_TSC_DELTA_THREAD
2104 supdrvTscDeltaThreadStartMeasurement(pDevExt, true /* fForceAll */);
2105#else
2106 uint16_t iCpu;
2107 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
2108 {
2109 /*
2110 * Measure the TSC deltas now that we have MP notifications.
2111 */
2112 int cTries = 5;
2113 do
2114 {
2115 rc = supdrvTscMeasureInitialDeltas(pDevExt);
2116 if ( rc != VERR_TRY_AGAIN
2117 && rc != VERR_CPU_OFFLINE)
2118 break;
2119 } while (--cTries > 0);
2120 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
2121 Log(("supdrvTscDeltaInit: cpu[%u] delta %lld\n", iCpu, pGip->aCPUs[iCpu].i64TSCDelta));
2122 }
2123 else
2124 {
2125 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
2126 AssertMsg(!pGip->aCPUs[iCpu].i64TSCDelta, ("iCpu=%u %lld mode=%d\n", iCpu, pGip->aCPUs[iCpu].i64TSCDelta, pGip->u32Mode));
2127 }
2128 if (RT_SUCCESS(rc))
2129#endif
2130 {
2131 /*
2132 * Create the timer.
2133 * If CPU_ALL isn't supported we'll have to fall back to synchronous mode.
2134 */
2135 if (pGip->u32Mode == SUPGIPMODE_ASYNC_TSC)
2136 {
2137 rc = RTTimerCreateEx(&pDevExt->pGipTimer, u32Interval, RTTIMER_FLAGS_CPU_ALL,
2138 supdrvGipAsyncTimer, pDevExt);
2139 if (rc == VERR_NOT_SUPPORTED)
2140 {
2141 OSDBGPRINT(("supdrvGipCreate: omni timer not supported, falling back to synchronous mode\n"));
2142 pGip->u32Mode = SUPGIPMODE_SYNC_TSC;
2143 }
2144 }
2145 if (pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
2146 rc = RTTimerCreateEx(&pDevExt->pGipTimer, u32Interval, 0 /* fFlags */,
2147 supdrvGipSyncAndInvariantTimer, pDevExt);
2148 if (RT_SUCCESS(rc))
2149 {
2150 /*
2151 * We're good.
2152 */
2153 Log(("supdrvGipCreate: %u ns interval.\n", u32Interval));
2154 supdrvGipReleaseHigherTimerFrequencyFromSystem(pDevExt);
2155
2156 g_pSUPGlobalInfoPage = pGip;
2157 return VINF_SUCCESS;
2158 }
2159
2160 OSDBGPRINT(("supdrvGipCreate: failed create GIP timer at %u ns interval. rc=%Rrc\n", u32Interval, rc));
2161 Assert(!pDevExt->pGipTimer);
2162 }
2163 }
2164 else
2165 OSDBGPRINT(("supdrvGipCreate: RTMpOnAll failed. rc=%Rrc\n", rc));
2166 }
2167 else
2168 OSDBGPRINT(("supdrvGipCreate: failed to register MP event notfication. rc=%Rrc\n", rc));
2169 }
2170 else
2171 OSDBGPRINT(("supdrvGipCreate: supdrvTscDeltaInit failed. rc=%Rrc\n", rc));
2172 }
2173 else
2174 OSDBGPRINT(("supdrvGipCreate: supdrvTscMeasureInitialDeltas failed. rc=%Rrc\n", rc));
2175 }
2176
2177 /* Releases timer frequency increase too. */
2178 supdrvGipDestroy(pDevExt);
2179 return rc;
2180}
2181
2182
2183/**
2184 * Invalidates the GIP data upon termination.
2185 *
2186 * @param pGip Pointer to the read-write kernel mapping of the GIP.
2187 */
2188static void supdrvGipTerm(PSUPGLOBALINFOPAGE pGip)
2189{
2190 unsigned i;
2191 pGip->u32Magic = 0;
2192 for (i = 0; i < pGip->cCpus; i++)
2193 {
2194 pGip->aCPUs[i].u64NanoTS = 0;
2195 pGip->aCPUs[i].u64TSC = 0;
2196 pGip->aCPUs[i].iTSCHistoryHead = 0;
2197 pGip->aCPUs[i].u64TSCSample = 0;
2198 pGip->aCPUs[i].i64TSCDelta = INT64_MAX;
2199 }
2200}
2201
2202
/**
 * Terminates the GIP.
 *
 * Tears down everything supdrvGipCreate() may have set up.  Each resource is
 * checked before it is released, which is what allows supdrvGipCreate() to
 * call this on its failure path after only partial setup.
 *
 * @param   pDevExt     Instance data. GIP stuff may be updated.
 */
void VBOXCALL supdrvGipDestroy(PSUPDRVDEVEXT pDevExt)
{
    int rc;
#ifdef DEBUG_DARWIN_GIP
    OSDBGPRINT(("supdrvGipDestroy: pDevExt=%p pGip=%p pGipTimer=%p GipMemObj=%p\n", pDevExt,
                pDevExt->GipMemObj != NIL_RTR0MEMOBJ ? RTR0MemObjAddress(pDevExt->GipMemObj) : NULL,
                pDevExt->pGipTimer, pDevExt->GipMemObj));
#endif

    /*
     * Stop receiving MP notifications before tearing anything else down.
     */
    RTMpNotificationDeregister(supdrvGipMpEvent, pDevExt);

#ifdef SUPDRV_USE_TSC_DELTA_THREAD
    /*
     * Terminate the TSC-delta measurement thread and resources.
     */
    supdrvTscDeltaTerm(pDevExt);
#endif

    /*
     * Destroy the TSC-refinement timer.
     */
    if (pDevExt->pInvarTscRefineTimer)
    {
        RTTimerDestroy(pDevExt->pInvarTscRefineTimer);
        pDevExt->pInvarTscRefineTimer = NULL;
    }

    /*
     * Invalidate the GIP data and unpublish the pointers to it.
     */
    if (pDevExt->pGip)
    {
        supdrvGipTerm(pDevExt->pGip);
        pDevExt->pGip = NULL;
    }
    g_pSUPGlobalInfoPage = NULL;

    /*
     * Destroy the timer and free the GIP memory object.
     */
    if (pDevExt->pGipTimer)
    {
        rc = RTTimerDestroy(pDevExt->pGipTimer); AssertRC(rc);
        pDevExt->pGipTimer = NULL;
    }

    if (pDevExt->GipMemObj != NIL_RTR0MEMOBJ)
    {
        rc = RTR0MemObjFree(pDevExt->GipMemObj, true /* free mappings */); AssertRC(rc);
        pDevExt->GipMemObj = NIL_RTR0MEMOBJ;
    }

    /*
     * Finally, make sure we've released the system timer resolution request
     * if one actually succeeded and is still pending.
     */
    supdrvGipReleaseHigherTimerFrequencyFromSystem(pDevExt);
}
2269
2270
2271
2272
2273/*
2274 *
2275 *
2276 * GIP Update Timer Related Code
2277 * GIP Update Timer Related Code
2278 * GIP Update Timer Related Code
2279 *
2280 *
2281 */
2282
2283
/**
 * Worker routine for supdrvGipUpdate() and supdrvGipUpdatePerCpu() that
 * updates all the per cpu data except the transaction id.
 *
 * Stores the new timestamp and TSC value and, unless the GIP is in invariant
 * TSC mode without the testing flags set, recalculates the TSC interval and
 * CPU frequency from the TSC history.
 *
 * @param   pDevExt         The device extension.
 * @param   pGipCpu         Pointer to the per cpu data.
 * @param   u64NanoTS       The current time stamp.
 * @param   u64TSC          The current TSC.
 * @param   iTick           The current timer tick.
 *
 * @remarks Can be called with interrupts disabled!
 */
static void supdrvGipDoUpdateCpu(PSUPDRVDEVEXT pDevExt, PSUPGIPCPU pGipCpu, uint64_t u64NanoTS, uint64_t u64TSC, uint64_t iTick)
{
    uint64_t    u64TSCDelta;
    bool        fUpdateCpuHz;
    PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
    AssertPtrReturnVoid(pGip);

    /* Delta between this and the previous update (must be taken before u64NanoTS is overwritten below). */
    ASMAtomicUoWriteU32(&pGipCpu->u32PrevUpdateIntervalNS, (uint32_t)(u64NanoTS - pGipCpu->u64NanoTS));

    /*
     * Update the NanoTS.
     */
    ASMAtomicWriteU64(&pGipCpu->u64NanoTS, u64NanoTS);

    /*
     * Calc TSC delta (ticks since the previous update) and store the new TSC.
     */
    u64TSCDelta = u64TSC - pGipCpu->u64TSC;
    ASMAtomicWriteU64(&pGipCpu->u64TSC, u64TSC);

    /*
     * Determine if we need to update the CPU (TSC) frequency calculation.
     *
     * We don't need to keep recalculating the frequency when it's invariant,
     * unless the special tstGIP-2 testing mode is enabled.
     */
    fUpdateCpuHz = pGip->u32Mode != SUPGIPMODE_INVARIANT_TSC;
    if (!(pGip->fFlags & SUPGIP_FLAGS_TESTING))
    { /* likely*/ }
    else
    {
        /* Work on a snapshot of the flags; the bits handled here are cleared atomically below. */
        uint32_t fGipFlags = pGip->fFlags;
        if (fGipFlags & (SUPGIP_FLAGS_TESTING_ENABLE | SUPGIP_FLAGS_TESTING_START))
        {
            if (fGipFlags & SUPGIP_FLAGS_TESTING_START)
            {
                /* Cache the TSC frequency before forcing updates due to test mode. */
                if (!fUpdateCpuHz)
                    pDevExt->uGipTestModeInvariantCpuHz = pGip->aCPUs[0].u64CpuHz;
                ASMAtomicAndU32(&pGip->fFlags, ~SUPGIP_FLAGS_TESTING_START);
            }
            fUpdateCpuHz = true;
        }
        else if (fGipFlags & SUPGIP_FLAGS_TESTING_STOP)
        {
            /* Restore the cached TSC frequency if any. */
            if (!fUpdateCpuHz)
            {
                Assert(pDevExt->uGipTestModeInvariantCpuHz);
                ASMAtomicWriteU64(&pGip->aCPUs[0].u64CpuHz, pDevExt->uGipTestModeInvariantCpuHz);
            }
            ASMAtomicAndU32(&pGip->fFlags, ~(SUPGIP_FLAGS_TESTING_STOP | SUPGIP_FLAGS_TESTING));
        }
    }

    /*
     * Calculate the CPU (TSC) frequency if necessary.
     */
    if (fUpdateCpuHz)
    {
        uint64_t    u64CpuHz;
        uint32_t    u32UpdateIntervalTSC;
        uint32_t    u32UpdateIntervalTSCSlack;
        uint32_t    u32TransactionId;
        unsigned    iTSCHistoryHead;

        /* A delta that doesn't fit in 32 bits is counted as an error and
           replaced by the previously calculated interval. */
        if (u64TSCDelta >> 32)
        {
            u64TSCDelta = pGipCpu->u32UpdateIntervalTSC;
            pGipCpu->cErrors++;
        }

        /*
         * On the 2nd and 3rd callout, reset the history with the current TSC
         * interval since the values entered by supdrvGipInit are totally off.
         * The interval on the 1st callout is completely unreliable, the 2nd is
         * a bit better, while the 3rd should be most reliable.
         */
        /** @todo Could we drop this now that we initialize the history
         *        with nominal TSC frequency values? */
        u32TransactionId = pGipCpu->u32TransactionId;
        if (RT_UNLIKELY(   (   u32TransactionId == 5
                            || u32TransactionId == 7)
                        && (   iTick == 2
                            || iTick == 3) ))
        {
            unsigned i;
            for (i = 0; i < RT_ELEMENTS(pGipCpu->au32TSCHistory); i++)
                ASMAtomicUoWriteU32(&pGipCpu->au32TSCHistory[i], (uint32_t)u64TSCDelta);
        }

        /*
         * Validate the NanoTS deltas between timer fires with an arbitrary threshold of 0.5%.
         * Wait until we have at least one full history since the above history reset. The
         * assumption is that the majority of the previous history values will be tolerable.
         * See @bugref{6710#c67}.
         */
        /** @todo Could we drop the fudging there now that we initialize the history
         *        with nominal TSC frequency values? */
        if (   u32TransactionId > 23 /* 7 + (8 * 2) */
            && pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
        {
            uint32_t uNanoTsThreshold = pGip->u32UpdateIntervalNS / 200;
            if (   pGipCpu->u32PrevUpdateIntervalNS > pGip->u32UpdateIntervalNS + uNanoTsThreshold
                || pGipCpu->u32PrevUpdateIntervalNS < pGip->u32UpdateIntervalNS - uNanoTsThreshold)
            {
                /* The timer fired too early or too late: use the average of
                   the eight history entries instead of the measured delta. */
                uint32_t u32;
                u32  = pGipCpu->au32TSCHistory[0];
                u32 += pGipCpu->au32TSCHistory[1];
                u32 += pGipCpu->au32TSCHistory[2];
                u32 += pGipCpu->au32TSCHistory[3];
                u32 >>= 2;
                u64TSCDelta  = pGipCpu->au32TSCHistory[4];
                u64TSCDelta += pGipCpu->au32TSCHistory[5];
                u64TSCDelta += pGipCpu->au32TSCHistory[6];
                u64TSCDelta += pGipCpu->au32TSCHistory[7];
                u64TSCDelta >>= 2;
                u64TSCDelta += u32;
                u64TSCDelta >>= 1;
            }
        }

        /*
         * TSC History: advance the head of the 8 entry ring and store the delta there.
         */
        Assert(RT_ELEMENTS(pGipCpu->au32TSCHistory) == 8);
        iTSCHistoryHead = (pGipCpu->iTSCHistoryHead + 1) & 7;
        ASMAtomicWriteU32(&pGipCpu->iTSCHistoryHead, iTSCHistoryHead);
        ASMAtomicWriteU32(&pGipCpu->au32TSCHistory[iTSCHistoryHead], (uint32_t)u64TSCDelta);

        /*
         * UpdateIntervalTSC = average of last 8,2,1 intervals depending on update HZ.
         *
         * On Windows, we have an occasional (but recurring) sour value that messed up
         * the history but taking only 1 interval reduces the precision overall.
         */
        if (   pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC
            || pGip->u32UpdateHz >= 1000)
        {
            /* Average of all eight entries, computed as the mean of two four-entry means. */
            uint32_t u32;
            u32  = pGipCpu->au32TSCHistory[0];
            u32 += pGipCpu->au32TSCHistory[1];
            u32 += pGipCpu->au32TSCHistory[2];
            u32 += pGipCpu->au32TSCHistory[3];
            u32 >>= 2;
            u32UpdateIntervalTSC  = pGipCpu->au32TSCHistory[4];
            u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[5];
            u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[6];
            u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[7];
            u32UpdateIntervalTSC >>= 2;
            u32UpdateIntervalTSC += u32;
            u32UpdateIntervalTSC >>= 1;

            /* Value chosen for a 2GHz Athlon64 running linux 2.6.10/11. */
            u32UpdateIntervalTSCSlack = u32UpdateIntervalTSC >> 14;
        }
        else if (pGip->u32UpdateHz >= 90)
        {
            /* Average of the current and the previous interval. */
            u32UpdateIntervalTSC  = (uint32_t)u64TSCDelta;
            u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[(iTSCHistoryHead - 1) & 7];
            u32UpdateIntervalTSC >>= 1;

            /* value chosen on a 2GHz thinkpad running windows */
            u32UpdateIntervalTSCSlack = u32UpdateIntervalTSC >> 7;
        }
        else
        {
            u32UpdateIntervalTSC  = (uint32_t)u64TSCDelta;

            /* This value hasn't been checked yet.. waiting for OS/2 and 33Hz timers.. :-) */
            u32UpdateIntervalTSCSlack = u32UpdateIntervalTSC >> 6;
        }
        /* The published interval includes the slack; the frequency below is computed without it. */
        ASMAtomicWriteU32(&pGipCpu->u32UpdateIntervalTSC, u32UpdateIntervalTSC + u32UpdateIntervalTSCSlack);

        /*
         * CpuHz = TSC ticks per interval * 1e9 / interval length in nanoseconds.
         */
        u64CpuHz = ASMMult2xU32RetU64(u32UpdateIntervalTSC, RT_NS_1SEC);
        u64CpuHz /= pGip->u32UpdateIntervalNS;
        ASMAtomicWriteU64(&pGipCpu->u64CpuHz, u64CpuHz);
    }
}
2479
2480
2481/**
2482 * Updates the GIP.
2483 *
2484 * @param pDevExt The device extension.
2485 * @param u64NanoTS The current nanosecond timestamp.
2486 * @param u64TSC The current TSC timestamp.
2487 * @param idCpu The CPU ID.
2488 * @param iTick The current timer tick.
2489 *
2490 * @remarks Can be called with interrupts disabled!
2491 */
2492static void supdrvGipUpdate(PSUPDRVDEVEXT pDevExt, uint64_t u64NanoTS, uint64_t u64TSC, RTCPUID idCpu, uint64_t iTick)
2493{
2494 /*
2495 * Determine the relevant CPU data.
2496 */
2497 PSUPGIPCPU pGipCpu;
2498 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
2499 AssertPtrReturnVoid(pGip);
2500
2501 if (pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
2502 pGipCpu = &pGip->aCPUs[0];
2503 else
2504 {
2505 unsigned iCpu;
2506 uint32_t idApic = supdrvGipGetApicId(pGip);
2507 if (RT_LIKELY(idApic < RT_ELEMENTS(pGip->aiCpuFromApicId)))
2508 { /* likely */ }
2509 else
2510 return;
2511 iCpu = pGip->aiCpuFromApicId[idApic];
2512 if (RT_LIKELY(iCpu < pGip->cCpus))
2513 { /* likely */ }
2514 else
2515 return;
2516 pGipCpu = &pGip->aCPUs[iCpu];
2517 if (RT_LIKELY(pGipCpu->idCpu == idCpu))
2518 { /* likely */ }
2519 else
2520 return;
2521 }
2522
2523 /*
2524 * Start update transaction.
2525 */
2526 if (!(ASMAtomicIncU32(&pGipCpu->u32TransactionId) & 1))
2527 {
2528 /* this can happen on win32 if we're taking to long and there are more CPUs around. shouldn't happen though. */
2529 AssertMsgFailed(("Invalid transaction id, %#x, not odd!\n", pGipCpu->u32TransactionId));
2530 ASMAtomicIncU32(&pGipCpu->u32TransactionId);
2531 pGipCpu->cErrors++;
2532 return;
2533 }
2534
2535 /*
2536 * Recalc the update frequency every 0x800th time.
2537 */
2538 if ( pGip->u32Mode != SUPGIPMODE_INVARIANT_TSC /* cuz we're not recalculating the frequency on invariant hosts. */
2539 && !(pGipCpu->u32TransactionId & (GIP_UPDATEHZ_RECALC_FREQ * 2 - 2)))
2540 {
2541 if (pGip->u64NanoTSLastUpdateHz)
2542 {
2543#ifdef RT_ARCH_AMD64 /** @todo fix 64-bit div here to work on x86 linux. */
2544 uint64_t u64Delta = u64NanoTS - pGip->u64NanoTSLastUpdateHz;
2545 uint32_t u32UpdateHz = (uint32_t)((RT_NS_1SEC_64 * GIP_UPDATEHZ_RECALC_FREQ) / u64Delta);
2546 if (u32UpdateHz <= 2000 && u32UpdateHz >= 30)
2547 {
2548 /** @todo r=ramshankar: Changing u32UpdateHz might screw up TSC frequency
2549 * calculation on non-invariant hosts if it changes the history decision
2550 * taken in supdrvGipDoUpdateCpu(). */
2551 uint64_t u64Interval = u64Delta / GIP_UPDATEHZ_RECALC_FREQ;
2552 ASMAtomicWriteU32(&pGip->u32UpdateHz, u32UpdateHz);
2553 ASMAtomicWriteU32(&pGip->u32UpdateIntervalNS, (uint32_t)u64Interval);
2554 }
2555#endif
2556 }
2557 ASMAtomicWriteU64(&pGip->u64NanoTSLastUpdateHz, u64NanoTS | 1);
2558 }
2559
2560 /*
2561 * Update the data.
2562 */
2563 supdrvGipDoUpdateCpu(pDevExt, pGipCpu, u64NanoTS, u64TSC, iTick);
2564
2565 /*
2566 * Complete transaction.
2567 */
2568 ASMAtomicIncU32(&pGipCpu->u32TransactionId);
2569}
2570
2571
2572/**
2573 * Updates the per cpu GIP data for the calling cpu.
2574 *
2575 * @param pDevExt The device extension.
2576 * @param u64NanoTS The current nanosecond timestamp.
2577 * @param u64TSC The current TSC timesaver.
2578 * @param idCpu The CPU ID.
2579 * @param idApic The APIC id for the CPU index.
2580 * @param iTick The current timer tick.
2581 *
2582 * @remarks Can be called with interrupts disabled!
2583 */
2584static void supdrvGipUpdatePerCpu(PSUPDRVDEVEXT pDevExt, uint64_t u64NanoTS, uint64_t u64TSC,
2585 RTCPUID idCpu, uint8_t idApic, uint64_t iTick)
2586{
2587 uint32_t iCpu;
2588 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
2589
2590 /*
2591 * Avoid a potential race when a CPU online notification doesn't fire on
2592 * the onlined CPU but the tick creeps in before the event notification is
2593 * run.
2594 */
2595 if (RT_LIKELY(iTick != 1))
2596 { /* likely*/ }
2597 else
2598 {
2599 iCpu = supdrvGipFindOrAllocCpuIndexForCpuId(pGip, idCpu);
2600 if (pGip->aCPUs[iCpu].enmState == SUPGIPCPUSTATE_OFFLINE)
2601 supdrvGipMpEventOnlineOrInitOnCpu(pDevExt, idCpu);
2602 }
2603
2604 iCpu = pGip->aiCpuFromApicId[idApic];
2605 if (RT_LIKELY(iCpu < pGip->cCpus))
2606 {
2607 PSUPGIPCPU pGipCpu = &pGip->aCPUs[iCpu];
2608 if (pGipCpu->idCpu == idCpu)
2609 {
2610 /*
2611 * Start update transaction.
2612 */
2613 if (!(ASMAtomicIncU32(&pGipCpu->u32TransactionId) & 1))
2614 {
2615 AssertMsgFailed(("Invalid transaction id, %#x, not odd!\n", pGipCpu->u32TransactionId));
2616 ASMAtomicIncU32(&pGipCpu->u32TransactionId);
2617 pGipCpu->cErrors++;
2618 return;
2619 }
2620
2621 /*
2622 * Update the data.
2623 */
2624 supdrvGipDoUpdateCpu(pDevExt, pGipCpu, u64NanoTS, u64TSC, iTick);
2625
2626 /*
2627 * Complete transaction.
2628 */
2629 ASMAtomicIncU32(&pGipCpu->u32TransactionId);
2630 }
2631 }
2632}
2633
2634
2635/**
2636 * Timer callback function for the sync and invariant GIP modes.
2637 *
2638 * @param pTimer The timer.
2639 * @param pvUser Opaque pointer to the device extension.
2640 * @param iTick The timer tick.
2641 */
2642static DECLCALLBACK(void) supdrvGipSyncAndInvariantTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
2643{
2644 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
2645 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
2646 RTCCUINTREG fEFlags = ASMIntDisableFlags(); /* No interruptions please (real problem on S10). */
2647 uint64_t u64TSC = ASMReadTSC();
2648 uint64_t u64NanoTS = RTTimeSystemNanoTS();
2649 RT_NOREF1(pTimer);
2650
2651 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_PRACTICALLY_ZERO)
2652 {
2653 /*
2654 * The calculations in supdrvGipUpdate() is somewhat timing sensitive,
2655 * missing timer ticks is not an option for GIP because the GIP users
2656 * will end up incrementing the time in 1ns per time getter call until
2657 * there is a complete timer update. So, if the delta has yet to be
2658 * calculated, we just pretend it is zero for now (the GIP users
2659 * probably won't have it for a wee while either and will do the same).
2660 *
2661 * We could maybe on some platforms try cross calling a CPU with a
2662 * working delta here, but it's not worth the hassle since the
2663 * likelihood of this happening is really low. On Windows, Linux, and
2664 * Solaris timers fire on the CPU they were registered/started on.
2665 * Darwin timers doesn't necessarily (they are high priority threads).
2666 */
2667 uint32_t iCpuSet = RTMpCpuIdToSetIndex(RTMpCpuId());
2668 uint16_t iGipCpu = RT_LIKELY(iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx))
2669 ? pGip->aiCpuFromCpuSetIdx[iCpuSet] : UINT16_MAX;
2670 Assert(!ASMIntAreEnabled());
2671 if (RT_LIKELY(iGipCpu < pGip->cCpus))
2672 {
2673 int64_t iTscDelta = pGip->aCPUs[iGipCpu].i64TSCDelta;
2674 if (iTscDelta != INT64_MAX)
2675 u64TSC -= iTscDelta;
2676 }
2677 }
2678
2679 supdrvGipUpdate(pDevExt, u64NanoTS, u64TSC, NIL_RTCPUID, iTick);
2680
2681 ASMSetFlags(fEFlags);
2682}
2683
2684
2685/**
2686 * Timer callback function for async GIP mode.
2687 * @param pTimer The timer.
2688 * @param pvUser Opaque pointer to the device extension.
2689 * @param iTick The timer tick.
2690 */
2691static DECLCALLBACK(void) supdrvGipAsyncTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
2692{
2693 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
2694 RTCCUINTREG fEFlags = ASMIntDisableFlags(); /* No interruptions please (real problem on S10). */
2695 RTCPUID idCpu = RTMpCpuId();
2696 uint64_t u64TSC = ASMReadTSC();
2697 uint64_t NanoTS = RTTimeSystemNanoTS();
2698 RT_NOREF1(pTimer);
2699
2700 /** @todo reset the transaction number and whatnot when iTick == 1. */
2701 if (pDevExt->idGipMaster == idCpu)
2702 supdrvGipUpdate(pDevExt, NanoTS, u64TSC, idCpu, iTick);
2703 else
2704 supdrvGipUpdatePerCpu(pDevExt, NanoTS, u64TSC, idCpu, supdrvGipGetApicId(pDevExt->pGip), iTick);
2705
2706 ASMSetFlags(fEFlags);
2707}
2708
2709
2710
2711
2712/*
2713 *
2714 *
2715 * TSC Delta Measurements And Related Code
2716 * TSC Delta Measurements And Related Code
2717 * TSC Delta Measurements And Related Code
2718 *
2719 *
2720 */
2721
2722
/*
 * Select TSC delta measurement algorithm.
 *
 * Exactly one of GIP_TSC_DELTA_METHOD_1 and GIP_TSC_DELTA_METHOD_2 gets
 * defined.  With the condition hardcoded to 0, method 2 is the one selected;
 * change it to 1 to build with method 1 instead.
 */
#if 0
# define GIP_TSC_DELTA_METHOD_1
#else
# define GIP_TSC_DELTA_METHOD_2
#endif
2731
/** For padding variables to keep them away from other cache lines.  Better too
 * large than too small!  The value is in bytes.
 * @remarks Current AMD64 and x86 CPUs seem to use 64 bytes.  There are claims
 *          that NetBurst had 128 byte cache lines while the 486 thru Pentium
 *          III had 32 byte cache lines. */
#define GIP_TSC_DELTA_CACHE_LINE_SIZE 128
2738
2739
/**
 * TSC delta measurement algorithm \#2 result entry.
 *
 * One element of SUPDRVTSCDELTAMETHOD2::aResults.
 */
typedef struct SUPDRVTSCDELTAMETHOD2ENTRY
{
    /** Presumably the sequence number of the CPU recording this entry -
     *  TODO confirm against the measurement code. */
    uint32_t    iSeqMine;
    /** Presumably the sequence number observed for the other CPU -
     *  TODO confirm against the measurement code. */
    uint32_t    iSeqOther;
    /** Presumably the TSC value read for this entry - TODO confirm. */
    uint64_t    uTsc;
} SUPDRVTSCDELTAMETHOD2ENTRY;
2749
/**
 * TSC delta measurement algorithm \#2 Data.
 *
 * The sequence number is surrounded by padding so that it does not share a
 * cache line with the other members (given that the
 * GIP_TSC_DELTA_CACHE_LINE_SIZE estimate is large enough).
 */
typedef struct SUPDRVTSCDELTAMETHOD2
{
    /** Padding to make sure the iCurSeqNo is in its own cache line. */
    uint64_t                    au64CacheLinePaddingBefore[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t)];
    /** The current sequence number of this worker. */
    uint32_t volatile           iCurSeqNo;
    /** Padding to make sure the iCurSeqNo is in its own cache line.
     * @note Despite the 'au64' prefix this is an array of uint32_t; together
     *       with iCurSeqNo it fills exactly GIP_TSC_DELTA_CACHE_LINE_SIZE
     *       bytes. */
    uint32_t                    au64CacheLinePaddingAfter[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint32_t) - 1];
    /** Result table. */
    SUPDRVTSCDELTAMETHOD2ENTRY  aResults[64];
} SUPDRVTSCDELTAMETHOD2;
/** Pointer to the data for TSC delta measurement algorithm \#2 .*/
typedef SUPDRVTSCDELTAMETHOD2 *PSUPDRVTSCDELTAMETHOD2;
2766
2767
/**
 * The TSC delta synchronization struct, version 2.
 *
 * The synchronization variables are kept away from other data by padding on
 * both sides (provided our max cache line size estimate is correct).
 */
typedef struct SUPTSCDELTASYNC2
{
    /** Padding to keep uSyncVar and uSyncSeq away from whatever precedes the
     *  structure. */
    uint64_t                    au64CacheLinePaddingBefore[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t)];

    /** The synchronization variable, holds values GIP_TSC_DELTA_SYNC_*. */
    volatile uint32_t           uSyncVar;
    /** Sequence synchronizing variable used for post 'GO' synchronization. */
    volatile uint32_t           uSyncSeq;

    /** Padding following uSyncVar and uSyncSeq.  It is two uint64_t entries
     *  short of a full cache line estimate. */
    uint64_t                    au64CacheLinePaddingAfter[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t) - 2];

    /** Start RDTSC value.  Put here mainly to save stack space. */
    uint64_t                    uTscStart;
    /** Copy of SUPDRVGIPTSCDELTARGS::cMaxTscTicks. */
    uint64_t                    cMaxTscTicks;
} SUPTSCDELTASYNC2;
/* The layout must come to exactly two cache line estimates plus one uint64_t. */
AssertCompileSize(SUPTSCDELTASYNC2, GIP_TSC_DELTA_CACHE_LINE_SIZE * 2 + sizeof(uint64_t));
/** Pointer to a TSC delta synchronization struct, version 2. */
typedef SUPTSCDELTASYNC2 *PSUPTSCDELTASYNC2;
2794
/*
 * Synchronization state values, GIP_TSC_DELTA_SYNC2_XXX.
 * Presumably these are the values stored in SUPTSCDELTASYNC2::uSyncVar -
 * TODO confirm against the synchronization code.
 */
/** Prestart wait. */
#define GIP_TSC_DELTA_SYNC2_PRESTART_WAIT    UINT32_C(0x0ffe)
/** Prestart aborted. */
#define GIP_TSC_DELTA_SYNC2_PRESTART_ABORT   UINT32_C(0x0fff)
/** Ready (on your mark). */
#define GIP_TSC_DELTA_SYNC2_READY            UINT32_C(0x1000)
/** Steady (get set). */
#define GIP_TSC_DELTA_SYNC2_STEADY           UINT32_C(0x1001)
/** Go! */
#define GIP_TSC_DELTA_SYNC2_GO               UINT32_C(0x1002)
/** Used by the verification test. */
#define GIP_TSC_DELTA_SYNC2_GO_GO            UINT32_C(0x1003)

/** We reached the time limit. */
#define GIP_TSC_DELTA_SYNC2_TIMEOUT          UINT32_C(0x1ffe)
/** The other party won't touch the sync struct ever again. */
#define GIP_TSC_DELTA_SYNC2_FINAL            UINT32_C(0x1fff)
2812
2813
2814/**
2815 * Argument package/state passed by supdrvTscMeasureDeltaOne() to the RTMpOn
2816 * callback worker.
 *
 * The structure consists of a shared head, a master section and a worker
 * ("Proletarian") section.  Cache line sized padding arrays separate the
 * sections so that the master and worker data live in different cache lines.
 *
2817 * @todo add
2818 */
2819typedef struct SUPDRVGIPTSCDELTARGS
2820{
2821 /** The device extension. */
2822 PSUPDRVDEVEXT pDevExt;
2823 /** Pointer to the GIP CPU array entry for the worker. */
2824 PSUPGIPCPU pWorker;
2825 /** Pointer to the GIP CPU array entry for the master. */
2826 PSUPGIPCPU pMaster;
2827 /** The maximum number of ticks to spend in supdrvTscMeasureDeltaCallback.
2828 * (This is what we need a rough TSC frequency for.) */
2829 uint64_t cMaxTscTicks;
2830 /** Used to abort synchronization setup.  Set by
 * supdrvTscMeasureDeltaCallbackAbortSyncSetup() and polled by the setup wait
 * loops in supdrvTscMeasureDeltaCallbackUnwrapped(). */
2831 bool volatile fAbortSetup;
2832
2833 /** Padding to make sure the master variables live in its own cache lines. */
2834 uint64_t au64CacheLinePaddingBefore[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t)];
2835
2836 /** @name Master
2837 * @{ */
2838 /** The time the master spent in the MP worker. */
2839 uint64_t cElapsedMasterTscTicks;
2840 /** The iTry value when stopped at. */
2841 uint32_t iTry;
2842 /** Set if the run timed out.  Written by supdrvTscDeltaSync2_Before() and by
 * supdrvTscMeasureDeltaCallbackAbortSyncSetup() (when called with fTimeout). */
2843 bool volatile fTimedOut;
2844 /** Pointer to the master's synchronization struct (on stack). */
2845 PSUPTSCDELTASYNC2 volatile pSyncMaster;
2846 /** Master data union. */
2847 union
2848 {
2849 /** Data (master) for delta verification. */
2850 struct
2851 {
2852 /** Verification test TSC values for the master. */
2853 uint64_t volatile auTscs[32];
2854 } Verify;
2855 /** Data (master) for measurement method \#2. */
2856 struct
2857 {
2858 /** Data and sequence number. */
2859 SUPDRVTSCDELTAMETHOD2 Data;
2860 /** The lag setting for the next run. */
2861 bool fLag;
2862 /** Number of hits (accumulated by supdrvTscDeltaMethod2ProcessDataOnMaster). */
2863 uint32_t cHits;
2864 } M2;
2865 } uMaster;
2866 /** The verifier verdict, VINF_SUCCESS if ok, VERR_OUT_OF_RANGE if not,
2867 * VERR_TRY_AGAIN on timeout. */
2868 int32_t rcVerify;
2869#ifdef TSCDELTA_VERIFY_WITH_STATS
2870 /** The maximum difference between TSC read during delta verification. */
2871 int64_t cMaxVerifyTscTicks;
2872 /** The minimum difference between two TSC reads during verification. */
2873 int64_t cMinVerifyTscTicks;
2874 /** The bad TSC diff, worker relative to master (= worker - master).
2875 * Negative value means the worker is behind the master. */
2876 int64_t iVerifyBadTscDiff;
2877#endif
2878 /** @} */
2879
2880 /** Padding to make sure the worker variables live in their own cache line. */
2881 uint64_t au64CacheLinePaddingBetween[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t)];
2882
2883 /** @name Proletarian
2884 * @{ */
2885 /** Pointer to the worker's synchronization struct (on stack). */
2886 PSUPTSCDELTASYNC2 volatile pSyncWorker;
2887 /** The time the worker spent in the MP worker. */
2888 uint64_t cElapsedWorkerTscTicks;
2889 /** Worker data union. */
2890 union
2891 {
2892 /** Data (worker) for delta verification. */
2893 struct
2894 {
2895 /** Verification test TSC values for the worker. */
2896 uint64_t volatile auTscs[32];
2897 } Verify;
2898 /** Data (worker) for measurement method \#2. */
2899 struct
2900 {
2901 /** Data and sequence number. */
2902 SUPDRVTSCDELTAMETHOD2 Data;
2903 /** The lag setting for the next run (set by master). */
2904 bool fLag;
2905 } M2;
2906 } uWorker;
2907 /** @} */
2908
2909 /** Padding to make sure the above is in its own cache line. */
2910 uint64_t au64CacheLinePaddingAfter[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t)];
2911} SUPDRVGIPTSCDELTARGS;
/** Pointer to a TSC delta measurement argument package. */
2912typedef SUPDRVGIPTSCDELTARGS *PSUPDRVGIPTSCDELTARGS;
2913
2914
2915/** @name Macros that implement the basic synchronization steps common to
2916 * the algorithms.
2917 *
2918 * Must be used from loop as the timeouts are implemented via 'break' statements
2919 * at the moment.
2920 *
2921 * @{
2922 */
/* Spin loop watchdog helpers.  With DEBUG_bird defined they declare, reset and
   increment a loop counter and hit a breakpoint each time its low 25 bits wrap
   to zero; in all other builds they expand to no-ops. */
2923#if defined(DEBUG_bird) /* || defined(VBOX_STRICT) */
2924# define TSCDELTA_DBG_VARS() uint32_t iDbgCounter
2925# define TSCDELTA_DBG_START_LOOP() do { iDbgCounter = 0; } while (0)
2926# define TSCDELTA_DBG_CHECK_LOOP() \
2927 do { iDbgCounter++; if ((iDbgCounter & UINT32_C(0x01ffffff)) == 0) RT_BREAKPOINT(); } while (0)
2928#else
2929# define TSCDELTA_DBG_VARS() ((void)0)
2930# define TSCDELTA_DBG_START_LOOP() ((void)0)
2931# define TSCDELTA_DBG_CHECK_LOOP() ((void)0)
2932#endif
/* Debug message macros.  Each takes a parenthesized SUPR0Printf argument list
   and is compiled out unless the corresponding '#if 0' is flipped to '#if 1'. */
2933#if 0
2934# define TSCDELTA_DBG_SYNC_MSG(a_Args) SUPR0Printf a_Args
2935#else
2936# define TSCDELTA_DBG_SYNC_MSG(a_Args) ((void)0)
2937#endif
2938#if 0
2939# define TSCDELTA_DBG_SYNC_MSG2(a_Args) SUPR0Printf a_Args
2940#else
2941# define TSCDELTA_DBG_SYNC_MSG2(a_Args) ((void)0)
2942#endif
2943#if 0
2944# define TSCDELTA_DBG_SYNC_MSG9(a_Args) SUPR0Printf a_Args
2945#else
2946# define TSCDELTA_DBG_SYNC_MSG9(a_Args) ((void)0)
2947#endif
2948
2949
/**
 * Performs the ready -> steady -> go handshake with the partner CPU and then
 * tries to get the two CPUs into (mostly) lockstep execution.
 *
 * The master drives each step by updating the partner's sync variable with a
 * compare-and-exchange, and the worker acknowledges by doing the same to the
 * master's variable.  Timeouts are only checked while waiting for the 'steady'
 * state.
 *
 * @returns true if the handshake completed; interrupts are then disabled and
 *          the previous flags are stored in @a *pfEFlags for the matching
 *          'after' step.  false if an unexpected state was seen or the wait
 *          timed out; in that case the flags saved by this function have
 *          already been restored.
 * @param   pMySync     My synchronization structure.
 * @param   pOtherSync  My partner's synchronization structure.
 * @param   fIsMaster   Set if master, clear if worker.
 * @param   pfEFlags    Where to return the saved flags register value.
 * @param   pArgs       The argument/state data (fTimedOut is set on timeout).
 */
2950static bool supdrvTscDeltaSync2_Before(PSUPTSCDELTASYNC2 pMySync, PSUPTSCDELTASYNC2 pOtherSync,
2951 bool fIsMaster, PRTCCUINTREG pfEFlags, PSUPDRVGIPTSCDELTARGS pArgs)
2952{
 /* The two parties use disjoint sequence number ranges (master from 0,
    worker from 256) and make at most 16 lockstep attempts each. */
2953 uint32_t iMySeq = fIsMaster ? 0 : 256;
2954 uint32_t const iMaxSeq = iMySeq + 16; /* For the last loop, darn linux/freebsd C-ishness. */
2955 uint32_t u32Tmp;
2956 uint32_t iSync2Loops = 0;
2957 RTCCUINTREG fEFlags;
2958 TSCDELTA_DBG_VARS();
2959
2960 *pfEFlags = X86_EFL_IF | X86_EFL_1; /* should shut up most nagging compilers. */
2961
2962 /*
2963 * The master tells the worker to get on its mark.
2964 */
2965 if (fIsMaster)
2966 {
2967 if (RT_LIKELY(ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_STEADY, GIP_TSC_DELTA_SYNC2_READY)))
2968 { /* likely*/ }
2969 else
2970 {
2971 TSCDELTA_DBG_SYNC_MSG(("sync/before/%s: #1 uSyncVar=%#x\n", fIsMaster ? "master" : "worker", pOtherSync->uSyncVar));
2972 return false;
2973 }
2974 }
2975
2976 /*
2977 * Wait for the on your mark signal (ack in the master case). We process timeouts here.
2978 */
2979 ASMAtomicWriteU32(&(pMySync)->uSyncSeq, 0);
2980 for (;;)
2981 {
 /* The state is sampled with interrupts disabled so that we leave the
    loop with them still off once 'steady' is seen; otherwise the saved
    flags are restored again before spinning. */
2982 fEFlags = ASMIntDisableFlags();
2983 u32Tmp = ASMAtomicReadU32(&pMySync->uSyncVar);
2984 if (u32Tmp == GIP_TSC_DELTA_SYNC2_STEADY)
2985 break;
2986 ASMSetFlags(fEFlags);
2987 ASMNopPause();
2988
2989 /* Abort? */
2990 if (u32Tmp != GIP_TSC_DELTA_SYNC2_READY)
2991 {
2992 TSCDELTA_DBG_SYNC_MSG(("sync/before/%s: #2 u32Tmp=%#x\n", fIsMaster ? "master" : "worker", u32Tmp));
2993 return false;
2994 }
2995
2996 /* Check for timeouts every so often (not every loop in case RDTSC is
2997 trapping or something). Must check the first time around. */
2998#if 0 /* For debugging the timeout paths. */
2999 static uint32_t volatile xxx;
3000#endif
3001 if ( ( (iSync2Loops & 0x3ff) == 0
3002 && ASMReadTSC() - pMySync->uTscStart > pMySync->cMaxTscTicks)
3003#if 0 /* This is crazy, I know, but enable this code and the results are markedly better when enabled on the 1.4GHz AMD (debug). */
3004 || (!fIsMaster && (++xxx & 0xf) == 0)
3005#endif
3006 )
3007 {
3008 /* Try switch our own state into timeout mode so the master cannot tell us to 'GO',
3009 ignore the timeout if we've got the go ahead already (simpler). */
3010 if (ASMAtomicCmpXchgU32(&pMySync->uSyncVar, GIP_TSC_DELTA_SYNC2_TIMEOUT, GIP_TSC_DELTA_SYNC2_READY))
3011 {
3012 TSCDELTA_DBG_SYNC_MSG(("sync/before/%s: timeout\n", fIsMaster ? "master" : "worker"));
 /* Also flag the partner (if it is in the 'steady' state) and record the timeout. */
3013 ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_TIMEOUT, GIP_TSC_DELTA_SYNC2_STEADY);
3014 ASMAtomicWriteBool(&pArgs->fTimedOut, true);
3015 return false;
3016 }
3017 }
3018 iSync2Loops++;
3019 }
3020
3021 /*
3022 * Interrupts are now disabled and will remain disabled until we do
3023 * TSCDELTA_MASTER_SYNC_AFTER / TSCDELTA_OTHER_SYNC_AFTER.
3024 */
3025 *pfEFlags = fEFlags;
3026
3027 /*
3028 * The worker tells the master that it is on its mark and that the master
3029 * need to get into position as well.
3030 */
3031 if (!fIsMaster)
3032 {
3033 if (RT_LIKELY(ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_STEADY, GIP_TSC_DELTA_SYNC2_READY)))
3034 { /* likely */ }
3035 else
3036 {
3037 ASMSetFlags(fEFlags);
3038 TSCDELTA_DBG_SYNC_MSG(("sync/before/%s: #3 uSyncVar=%#x\n", fIsMaster ? "master" : "worker", pOtherSync->uSyncVar));
3039 return false;
3040 }
3041 }
3042
3043 /*
3044 * The master sends the 'go' to the worker and wait for ACK.
3045 */
3046 if (fIsMaster)
3047 {
3048 if (RT_LIKELY(ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO, GIP_TSC_DELTA_SYNC2_STEADY)))
3049 { /* likely */ }
3050 else
3051 {
3052 ASMSetFlags(fEFlags);
3053 TSCDELTA_DBG_SYNC_MSG(("sync/before/%s: #4 uSyncVar=%#x\n", fIsMaster ? "master" : "worker", pOtherSync->uSyncVar));
3054 return false;
3055 }
3056 }
3057
3058 /*
3059 * Wait for the 'go' signal (ack in the master case).
 * No timeout check here; any state other than 'steady' or 'go' aborts.
3060 */
3061 TSCDELTA_DBG_START_LOOP();
3062 for (;;)
3063 {
3064 u32Tmp = ASMAtomicReadU32(&pMySync->uSyncVar);
3065 if (u32Tmp == GIP_TSC_DELTA_SYNC2_GO)
3066 break;
3067 if (RT_LIKELY(u32Tmp == GIP_TSC_DELTA_SYNC2_STEADY))
3068 { /* likely */ }
3069 else
3070 {
3071 ASMSetFlags(fEFlags);
3072 TSCDELTA_DBG_SYNC_MSG(("sync/before/%s: #5 u32Tmp=%#x\n", fIsMaster ? "master" : "worker", u32Tmp));
3073 return false;
3074 }
3075
3076 TSCDELTA_DBG_CHECK_LOOP();
3077 ASMNopPause();
3078 }
3079
3080 /*
3081 * The worker acks the 'go' (shouldn't fail).
3082 */
3083 if (!fIsMaster)
3084 {
3085 if (RT_LIKELY(ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO, GIP_TSC_DELTA_SYNC2_STEADY)))
3086 { /* likely */ }
3087 else
3088 {
3089 ASMSetFlags(fEFlags);
3090 TSCDELTA_DBG_SYNC_MSG(("sync/before/%s: #6 uSyncVar=%#x\n", fIsMaster ? "master" : "worker", pOtherSync->uSyncVar));
3091 return false;
3092 }
3093 }
3094
3095 /*
3096 * Try enter mostly lockstep execution with it.
 *
 * Each round publishes our sequence number in both sync structures and
 * compares what the exchange on the partner's structure returned with
 * what is now found in our own.  Every exit from this loop returns true,
 * also when one of the parties gives up (UINT32_MAX marker).
3097 */
3098 for (;;)
3099 {
3100 uint32_t iOtherSeq1, iOtherSeq2;
3101 ASMCompilerBarrier();
3102 ASMSerializeInstruction();
3103
3104 ASMAtomicWriteU32(&pMySync->uSyncSeq, iMySeq);
3105 ASMNopPause();
3106 iOtherSeq1 = ASMAtomicXchgU32(&pOtherSync->uSyncSeq, iMySeq);
3107 ASMNopPause();
3108 iOtherSeq2 = ASMAtomicReadU32(&pMySync->uSyncSeq);
3109
3110 ASMCompilerBarrier();
3111 if (iOtherSeq1 == iOtherSeq2)
3112 return true;
3113
3114 /* Did the other guy give up? Should we give up? */
3115 if ( iOtherSeq1 == UINT32_MAX
3116 || iOtherSeq2 == UINT32_MAX)
3117 return true;
3118 if (++iMySeq >= iMaxSeq)
3119 {
3120 ASMAtomicWriteU32(&pMySync->uSyncSeq, UINT32_MAX);
3121 return true;
3122 }
3123 ASMNopPause();
3124 }
3125}
3126
/** Master side wrapper around supdrvTscDeltaSync2_Before(); executes 'break'
 * when the synchronization fails, so it must be used inside a loop.  The
 * trailing 'else do {} while (0)' makes the macro require a semicolon. */
3127#define TSCDELTA_MASTER_SYNC_BEFORE(a_pMySync, a_pOtherSync, a_pfEFlags, a_pArgs) \
3128 if (RT_LIKELY(supdrvTscDeltaSync2_Before(a_pMySync, a_pOtherSync, true /*fIsMaster*/, a_pfEFlags, a_pArgs))) \
3129 { /*likely*/ } \
3130 else if (true) \
3131 { \
3132 TSCDELTA_DBG_SYNC_MSG9(("sync/before/master: #89\n")); \
3133 break; \
3134 } else do {} while (0)
/** Worker side counterpart of TSCDELTA_MASTER_SYNC_BEFORE (fIsMaster=false);
 * also leaves the enclosing loop via 'break' on failure. */
3135#define TSCDELTA_OTHER_SYNC_BEFORE(a_pMySync, a_pOtherSync, a_pfEFlags, a_pArgs) \
3136 if (RT_LIKELY(supdrvTscDeltaSync2_Before(a_pMySync, a_pOtherSync, false /*fIsMaster*/, a_pfEFlags, a_pArgs))) \
3137 { /*likely*/ } \
3138 else if (true) \
3139 { \
3140 TSCDELTA_DBG_SYNC_MSG9(("sync/before/other: #89\n")); \
3141 break; \
3142 } else do {} while (0)
3143
3144
3145static bool supdrvTscDeltaSync2_After(PSUPTSCDELTASYNC2 pMySync, PSUPTSCDELTASYNC2 pOtherSync,
3146 bool fIsMaster, RTCCUINTREG fEFlags)
3147{
3148 TSCDELTA_DBG_VARS();
3149 RT_NOREF1(pOtherSync);
3150
3151 /*
3152 * Wait for the 'ready' signal. In the master's case, this means the
3153 * worker has completed its data collection, while in the worker's case it
3154 * means the master is done processing the data and it's time for the next
3155 * loop iteration (or whatever).
3156 */
3157 ASMSetFlags(fEFlags);
3158 TSCDELTA_DBG_START_LOOP();
3159 for (;;)
3160 {
3161 uint32_t u32Tmp = ASMAtomicReadU32(&pMySync->uSyncVar);
3162 if ( u32Tmp == GIP_TSC_DELTA_SYNC2_READY
3163 || (u32Tmp == GIP_TSC_DELTA_SYNC2_STEADY && !fIsMaster) /* kicked twice => race */ )
3164 return true;
3165 ASMNopPause();
3166 if (RT_LIKELY(u32Tmp == GIP_TSC_DELTA_SYNC2_GO))
3167 { /* likely */}
3168 else
3169 {
3170 TSCDELTA_DBG_SYNC_MSG(("sync/after/other: #1 u32Tmp=%#x\n", u32Tmp));
3171 return false; /* shouldn't ever happen! */
3172 }
3173 TSCDELTA_DBG_CHECK_LOOP();
3174 ASMNopPause();
3175 }
3176}
3177
/** Master side wrapper around supdrvTscDeltaSync2_After(); executes 'break'
 * on failure, so it must be used inside a loop. */
3178#define TSCDELTA_MASTER_SYNC_AFTER(a_pMySync, a_pOtherSync, a_fEFlags) \
3179 if (RT_LIKELY(supdrvTscDeltaSync2_After(a_pMySync, a_pOtherSync, true /*fIsMaster*/, a_fEFlags))) \
3180 { /* likely */ } \
3181 else if (true) \
3182 { \
3183 TSCDELTA_DBG_SYNC_MSG9(("sync/after/master: #97\n")); \
3184 break; \
3185 } else do {} while (0)
3186
/** Used by the master to switch the worker's sync variable from 'go' back to
 * 'ready'; executes 'break' if the variable was not in the 'go' state. */
3187#define TSCDELTA_MASTER_KICK_OTHER_OUT_OF_AFTER(a_pMySync, a_pOtherSync) \
3188 /* \
3189 * Tell the worker that we're done processing the data and ready for the next round. \
3190 */ \
3191 if (RT_LIKELY(ASMAtomicCmpXchgU32(&(a_pOtherSync)->uSyncVar, GIP_TSC_DELTA_SYNC2_READY, GIP_TSC_DELTA_SYNC2_GO))) \
3192 { /* likely */ } \
3193 else if (true)\
3194 { \
3195 TSCDELTA_DBG_SYNC_MSG(("sync/after/master: #99 uSyncVar=%#x\n", (a_pOtherSync)->uSyncVar)); \
3196 break; \
3197 } else do {} while (0)
3198
/** Worker side 'after' step: switches the master's sync variable from 'go' to
 * 'ready' and then waits in supdrvTscDeltaSync2_After().  Executes 'break' on
 * either failure; when the compare-exchange fails the flags are restored here
 * because supdrvTscDeltaSync2_After() is not reached. */
3199#define TSCDELTA_OTHER_SYNC_AFTER(a_pMySync, a_pOtherSync, a_fEFlags) \
3200 if (true) { \
3201 /* \
3202 * Tell the master that we're done collecting data and wait for the next round to start. \
3203 */ \
3204 if (RT_LIKELY(ASMAtomicCmpXchgU32(&(a_pOtherSync)->uSyncVar, GIP_TSC_DELTA_SYNC2_READY, GIP_TSC_DELTA_SYNC2_GO))) \
3205 { /* likely */ } \
3206 else \
3207 { \
3208 ASMSetFlags(a_fEFlags); \
3209 TSCDELTA_DBG_SYNC_MSG(("sync/after/other: #0 uSyncVar=%#x\n", (a_pOtherSync)->uSyncVar)); \
3210 break; \
3211 } \
3212 if (RT_LIKELY(supdrvTscDeltaSync2_After(a_pMySync, a_pOtherSync, false /*fIsMaster*/, a_fEFlags))) \
3213 { /* likely */ } \
3214 else \
3215 { \
3216 TSCDELTA_DBG_SYNC_MSG9(("sync/after/other: #98\n")); \
3217 break; \
3218 } \
3219 } else do {} while (0)
3220/** @} */
3221
3222
3223#ifdef GIP_TSC_DELTA_METHOD_1
3224/**
3225 * TSC delta measurement algorithm \#1 (GIP_TSC_DELTA_METHOD_1).
3226 *
3227 *
3228 * We ignore the first few runs of the loop in order to prime the
3229 * cache. Also, we need to be careful about using 'pause' instruction
3230 * in critical busy-wait loops in this code - it can cause undesired
3231 * behaviour with hyperthreading.
3232 *
3233 * We try to minimize the measurement error by computing the minimum
3234 * read time of the compare statement in the worker by taking TSC
3235 * measurements across it.
3236 *
3237 * It must be noted that the computed minimum read time is mostly to
3238 * eliminate huge deltas when the worker is too early and doesn't by
3239 * itself help produce more accurate deltas. We allow two times the
3240 * computed minimum as an arbitrary acceptable threshold. Therefore,
3241 * it is still possible to get negative deltas where there are none
3242 * when the worker is earlier. As long as these occasional negative
3243 * deltas are lower than the time it takes to exit guest-context and
3244 * the OS to reschedule EMT on a different CPU, we won't expose a TSC
3245 * that jumped backwards. It is due to the existence of the negative
3246 * deltas that we don't recompute the delta with the master and
3247 * worker interchanged to eliminate the remaining measurement error.
3248 *
 * The result is stored in pArgs->pWorker->i64TSCDelta by the master. The
 * loop is left early (via 'break' in the sync macros) if synchronization
 * with the partner fails.
3249 *
3250 * @param pArgs The argument/state data.
3251 * @param pMySync My synchronization structure.
3252 * @param pOtherSync My partner's synchronization structure.
3253 * @param fIsMaster Set if master, clear if worker.
3254 * @param iTry The attempt number (unused).
3255 */
3256static void supdrvTscDeltaMethod1Loop(PSUPDRVGIPTSCDELTARGS pArgs, PSUPTSCDELTASYNC2 pMySync, PSUPTSCDELTASYNC2 pOtherSync,
3257 bool fIsMaster, uint32_t iTry)
3258{
3259 PSUPGIPCPU pGipCpuWorker = pArgs->pWorker;
3260 PSUPGIPCPU pGipCpuMaster = pArgs->pMaster;
3261 uint64_t uMinCmpReadTime = UINT64_MAX;
3262 unsigned iLoop;
3263 NOREF(iTry);
3264
3265 for (iLoop = 0; iLoop < GIP_TSC_DELTA_LOOPS; iLoop++)
3266 {
3267 RTCCUINTREG fEFlags;
3268 if (fIsMaster)
3269 {
3270 /*
3271 * The master.
3272 */
3273 AssertMsg(pGipCpuMaster->u64TSCSample == GIP_TSC_DELTA_RSVD,
3274 ("%#llx idMaster=%#x idWorker=%#x (idGipMaster=%#x)\n",
3275 pGipCpuMaster->u64TSCSample, pGipCpuMaster->idCpu, pGipCpuWorker->idCpu, pArgs->pDevExt->idGipMaster));
3276 TSCDELTA_MASTER_SYNC_BEFORE(pMySync, pOtherSync, &fEFlags, pArgs);
3277
 /* Publish our TSC; repeat in the (unlikely) case the value read
    equals the reserved marker the worker is polling for. */
3278 do
3279 {
3280 ASMSerializeInstruction();
3281 ASMAtomicWriteU64(&pGipCpuMaster->u64TSCSample, ASMReadTSC());
3282 } while (pGipCpuMaster->u64TSCSample == GIP_TSC_DELTA_RSVD);
3283
3284 TSCDELTA_MASTER_SYNC_AFTER(pMySync, pOtherSync, fEFlags);
3285
3286 /* Process the data. */
3287 if (iLoop > GIP_TSC_DELTA_PRIMER_LOOPS + GIP_TSC_DELTA_READ_TIME_LOOPS)
3288 {
 /* The worker leaves the reserved marker in place when it rejected its sample. */
3289 if (pGipCpuWorker->u64TSCSample != GIP_TSC_DELTA_RSVD)
3290 {
3291 int64_t iDelta = pGipCpuWorker->u64TSCSample
3292 - (pGipCpuMaster->u64TSCSample - pGipCpuMaster->i64TSCDelta);
 /* At or above the initial master value keep the smaller delta;
    below it keep the larger one, or take the first sample while
    the stored delta is still INT64_MAX. */
3293 if ( iDelta >= GIP_TSC_DELTA_INITIAL_MASTER_VALUE
3294 ? iDelta < pGipCpuWorker->i64TSCDelta
3295 : iDelta > pGipCpuWorker->i64TSCDelta || pGipCpuWorker->i64TSCDelta == INT64_MAX)
3296 pGipCpuWorker->i64TSCDelta = iDelta;
3297 }
3298 }
3299
3300 /* Reset our TSC sample and tell the worker to move on. */
3301 ASMAtomicWriteU64(&pGipCpuMaster->u64TSCSample, GIP_TSC_DELTA_RSVD);
3302 TSCDELTA_MASTER_KICK_OTHER_OUT_OF_AFTER(pMySync, pOtherSync);
3303 }
3304 else
3305 {
3306 /*
3307 * The worker.
3308 */
3309 uint64_t uTscWorker;
3310 uint64_t uTscWorkerFlushed;
3311 uint64_t uCmpReadTime;
3312
3313 ASMAtomicReadU64(&pGipCpuMaster->u64TSCSample); /* Warm the cache line. */
3314 TSCDELTA_OTHER_SYNC_BEFORE(pMySync, pOtherSync, &fEFlags, pArgs);
3315
3316 /*
3317 * Keep reading the TSC until we notice that the master has read his. Reading
3318 * the TSC -after- the master has updated the memory is way too late. We thus
3319 * compensate by trying to measure how long it took for the worker to notice
3320 * the memory flushed from the master.
3321 */
3322 do
3323 {
3324 ASMSerializeInstruction();
3325 uTscWorker = ASMReadTSC();
3326 } while (pGipCpuMaster->u64TSCSample == GIP_TSC_DELTA_RSVD);
3327 ASMSerializeInstruction();
3328 uTscWorkerFlushed = ASMReadTSC();
3329
3330 uCmpReadTime = uTscWorkerFlushed - uTscWorker;
3331 if (iLoop > GIP_TSC_DELTA_PRIMER_LOOPS + GIP_TSC_DELTA_READ_TIME_LOOPS)
3332 {
3333 /* This is totally arbitrary a.k.a I don't like it but I have no better ideas for now. */
3334 if (uCmpReadTime < (uMinCmpReadTime << 1))
3335 {
3336 ASMAtomicWriteU64(&pGipCpuWorker->u64TSCSample, uTscWorker);
3337 if (uCmpReadTime < uMinCmpReadTime)
3338 uMinCmpReadTime = uCmpReadTime;
3339 }
3340 else
3341 ASMAtomicWriteU64(&pGipCpuWorker->u64TSCSample, GIP_TSC_DELTA_RSVD);
3342 }
3343 else if (iLoop > GIP_TSC_DELTA_PRIMER_LOOPS)
3344 {
 /* Read-time calibration rounds: only track the minimum. */
3345 if (uCmpReadTime < uMinCmpReadTime)
3346 uMinCmpReadTime = uCmpReadTime;
3347 }
3348
3349 TSCDELTA_OTHER_SYNC_AFTER(pMySync, pOtherSync, fEFlags);
3350 }
3351 }
3352
3353 TSCDELTA_DBG_SYNC_MSG9(("sync/method1loop/%s: #92 iLoop=%u MyState=%#x\n", fIsMaster ? "master" : "worker", iLoop,
3354 pMySync->uSyncVar));
3355
3356 /*
3357 * We must reset the worker TSC sample value in case it gets picked as a
3358 * GIP master later on (it's trashed above, naturally).
3359 */
3360 if (!fIsMaster)
3361 ASMAtomicWriteU64(&pGipCpuWorker->u64TSCSample, GIP_TSC_DELTA_RSVD);
3362}
3363#endif /* GIP_TSC_DELTA_METHOD_1 */
3364
3365
3366#ifdef GIP_TSC_DELTA_METHOD_2
3367/*
3368 * TSC delta measurement algorithm \#2 configuration and code - Experimental!!
3369 */
3370
/* Total number of iterations done by supdrvTscDeltaMethod2Loop(). */
3371# define GIP_TSC_DELTA_M2_LOOPS (7 + GIP_TSC_DELTA_M2_PRIMER_LOOPS)
/* Number of leading iterations whose data is collected but not processed;
   the priming code paths are compiled out while this is zero. */
3372# define GIP_TSC_DELTA_M2_PRIMER_LOOPS 0
3373
3374
3375static void supdrvTscDeltaMethod2ProcessDataOnMaster(PSUPDRVGIPTSCDELTARGS pArgs)
3376{
3377 int64_t iMasterTscDelta = pArgs->pMaster->i64TSCDelta;
3378 int64_t iBestDelta = pArgs->pWorker->i64TSCDelta;
3379 uint32_t idxResult;
3380 uint32_t cHits = 0;
3381
3382 /*
3383 * Look for matching entries in the master and worker tables.
3384 */
3385 for (idxResult = 0; idxResult < RT_ELEMENTS(pArgs->uMaster.M2.Data.aResults); idxResult++)
3386 {
3387 uint32_t idxOther = pArgs->uMaster.M2.Data.aResults[idxResult].iSeqOther;
3388 if (idxOther & 1)
3389 {
3390 idxOther >>= 1;
3391 if (idxOther < RT_ELEMENTS(pArgs->uWorker.M2.Data.aResults))
3392 {
3393 if (pArgs->uWorker.M2.Data.aResults[idxOther].iSeqOther == pArgs->uMaster.M2.Data.aResults[idxResult].iSeqMine)
3394 {
3395 int64_t iDelta;
3396 iDelta = pArgs->uWorker.M2.Data.aResults[idxOther].uTsc
3397 - (pArgs->uMaster.M2.Data.aResults[idxResult].uTsc - iMasterTscDelta);
3398 if ( iDelta >= GIP_TSC_DELTA_INITIAL_MASTER_VALUE
3399 ? iDelta < iBestDelta
3400 : iDelta > iBestDelta || iBestDelta == INT64_MAX)
3401 iBestDelta = iDelta;
3402 cHits++;
3403 }
3404 }
3405 }
3406 }
3407
3408 /*
3409 * Save the results.
3410 */
3411 if (cHits > 2)
3412 pArgs->pWorker->i64TSCDelta = iBestDelta;
3413 pArgs->uMaster.M2.cHits += cHits;
3414}
3415
3416
3417/**
3418 * The core function of the 2nd TSC delta measurement algorithm.
3419 *
3420 * The idea here is that we have the two CPUs execute the exact same code
3421 * collecting a largish set of TSC samples. The code has one data dependency on
3422 * the other CPU which intention it is to synchronize the execution as well as
3423 * help cross references the two sets of TSC samples (the sequence numbers).
3424 *
3425 * The @a fLag parameter is used to modify the execution a tiny bit on one or
3426 * both of the CPUs. When @a fLag differs between the CPUs, it is thought that
3427 * it will help with making the CPUs enter lock step execution occasionally.
3428 *
 * @param pMyData Where to store the samples; every entry of aResults is
 * filled and iCurSeqNo is restarted from zero.
 * @param piOtherSeqNo The other CPU's current sequence number, sampled
 * once per entry.
 * @param fLag Whether to execute a pause instruction after each sample.
3429 */
3430static void supdrvTscDeltaMethod2CollectData(PSUPDRVTSCDELTAMETHOD2 pMyData, uint32_t volatile *piOtherSeqNo, bool fLag)
3431{
3432 SUPDRVTSCDELTAMETHOD2ENTRY *pEntry = &pMyData->aResults[0];
3433 uint32_t cLeft = RT_ELEMENTS(pMyData->aResults);
3434
3435 ASMAtomicWriteU32(&pMyData->iCurSeqNo, 0);
3436 ASMSerializeInstruction();
3437 while (cLeft-- > 0)
3438 {
3439 uint64_t uTsc;
 /* The sequence number is incremented once before and once after the
    TSC read, two increments per sample in total. */
3440 uint32_t iSeqMine = ASMAtomicIncU32(&pMyData->iCurSeqNo);
3441 uint32_t iSeqOther = ASMAtomicReadU32(piOtherSeqNo);
3442 ASMCompilerBarrier();
3443 ASMSerializeInstruction(); /* Way better result than with ASMMemoryFenceSSE2() in this position! */
3444 uTsc = ASMReadTSC();
3445 ASMAtomicIncU32(&pMyData->iCurSeqNo);
3446 ASMCompilerBarrier();
3447 ASMSerializeInstruction();
 /* The results are stored only after the timing critical part above. */
3448 pEntry->iSeqMine = iSeqMine;
3449 pEntry->iSeqOther = iSeqOther;
3450 pEntry->uTsc = uTsc;
3451 pEntry++;
3452 ASMSerializeInstruction();
3453 if (fLag)
3454 ASMNopPause();
3455 }
3456}
3457
3458
3459/**
3460 * TSC delta measurement algorithm \#2 (GIP_TSC_DELTA_METHOD_2).
3461 *
3462 * See supdrvTscDeltaMethod2CollectData for algorithm details.
3463 *
 * Runs GIP_TSC_DELTA_M2_LOOPS rounds of sync, collect, sync. The master
 * additionally picks the lag settings for both CPUs before each round and
 * processes the collected data afterwards. The loop is left early (via
 * 'break' in the sync macros) if synchronization with the partner fails.
 *
3464 * @param pArgs The argument/state data.
3465 * @param pMySync My synchronization structure.
3466 * @param pOtherSync My partner's synchronization structure.
3467 * @param fIsMaster Set if master, clear if worker.
3468 * @param iTry The attempt number (unused).
3469 */
3470static void supdrvTscDeltaMethod2Loop(PSUPDRVGIPTSCDELTARGS pArgs, PSUPTSCDELTASYNC2 pMySync, PSUPTSCDELTASYNC2 pOtherSync,
3471 bool fIsMaster, uint32_t iTry)
3472{
3473 unsigned iLoop;
3474 RT_NOREF1(iTry);
3475
3476 for (iLoop = 0; iLoop < GIP_TSC_DELTA_M2_LOOPS; iLoop++)
3477 {
3478 RTCCUINTREG fEFlags;
3479 if (fIsMaster)
3480 {
3481 /*
3482 * Adjust the loop lag fudge.
 * (The master writes the worker's setting too, before syncing up.)
3483 */
3484# if GIP_TSC_DELTA_M2_PRIMER_LOOPS > 0
3485 if (iLoop < GIP_TSC_DELTA_M2_PRIMER_LOOPS)
3486 {
3487 /* Lag during the priming to be nice to everyone.. */
3488 pArgs->uMaster.M2.fLag = true;
3489 pArgs->uWorker.M2.fLag = true;
3490 }
3491 else
3492# endif
3493 if (iLoop < (GIP_TSC_DELTA_M2_LOOPS - GIP_TSC_DELTA_M2_PRIMER_LOOPS) / 4)
3494 {
3495 /* 25 % of the body without lagging. */
3496 pArgs->uMaster.M2.fLag = false;
3497 pArgs->uWorker.M2.fLag = false;
3498 }
3499 else if (iLoop < (GIP_TSC_DELTA_M2_LOOPS - GIP_TSC_DELTA_M2_PRIMER_LOOPS) / 4 * 2)
3500 {
3501 /* 25 % of the body with both lagging. */
3502 pArgs->uMaster.M2.fLag = true;
3503 pArgs->uWorker.M2.fLag = true;
3504 }
3505 else
3506 {
3507 /* 50% of the body with alternating lag. */
3508 pArgs->uMaster.M2.fLag = (iLoop & 1) == 0;
3509 pArgs->uWorker.M2.fLag= (iLoop & 1) == 1;
3510 }
3511
3512 /*
3513 * Sync up with the worker and collect data.
3514 */
3515 TSCDELTA_MASTER_SYNC_BEFORE(pMySync, pOtherSync, &fEFlags, pArgs);
3516 supdrvTscDeltaMethod2CollectData(&pArgs->uMaster.M2.Data, &pArgs->uWorker.M2.Data.iCurSeqNo, pArgs->uMaster.M2.fLag);
3517 TSCDELTA_MASTER_SYNC_AFTER(pMySync, pOtherSync, fEFlags);
3518
3519 /*
3520 * Process the data.
3521 */
3522# if GIP_TSC_DELTA_M2_PRIMER_LOOPS > 0
3523 if (iLoop >= GIP_TSC_DELTA_M2_PRIMER_LOOPS)
3524# endif
3525 supdrvTscDeltaMethod2ProcessDataOnMaster(pArgs);
3526
 /* Release the worker for the next round. */
3527 TSCDELTA_MASTER_KICK_OTHER_OUT_OF_AFTER(pMySync, pOtherSync);
3528 }
3529 else
3530 {
3531 /*
3532 * The worker.
3533 */
3534 TSCDELTA_OTHER_SYNC_BEFORE(pMySync, pOtherSync, &fEFlags, pArgs);
3535 supdrvTscDeltaMethod2CollectData(&pArgs->uWorker.M2.Data, &pArgs->uMaster.M2.Data.iCurSeqNo, pArgs->uWorker.M2.fLag);
3536 TSCDELTA_OTHER_SYNC_AFTER(pMySync, pOtherSync, fEFlags);
3537 }
3538 }
3539}
3540
3541#endif /* GIP_TSC_DELTA_METHOD_2 */
3542
3543
3544
/**
 * Verifies a worker TSC delta by having the master and worker take turns
 * reading their TSCs and checking that the adjusted values never go backwards.
 *
 * The two CPUs ping-pong via the sync variables (alternating between the GO
 * and GO_GO states), each recording its TSC before kicking the other. The
 * master then walks the two tables, subtracting the respective deltas, and
 * checks that each master value is not below the preceding worker value and
 * that each worker value is not below the master value with the same index.
 *
 * @returns The value of pArgs->rcVerify when the exchange completed (set by
 *          the master to VINF_SUCCESS or VERR_OUT_OF_RANGE), or VERR_TIMEOUT
 *          when synchronization failed, in which case pArgs->rcVerify is set
 *          to VERR_TRY_AGAIN.
 * @param   pArgs            The argument/state data.
 * @param   pMySync          My synchronization structure.
 * @param   pOtherSync       My partner's synchronization structure.
 * @param   fIsMaster        Set if master, clear if worker.
 * @param   iWorkerTscDelta  The worker TSC delta to verify.
 */
3545static int supdrvTscDeltaVerify(PSUPDRVGIPTSCDELTARGS pArgs, PSUPTSCDELTASYNC2 pMySync,
3546 PSUPTSCDELTASYNC2 pOtherSync, bool fIsMaster, int64_t iWorkerTscDelta)
3547{
3548 /*PSUPGIPCPU pGipCpuWorker = pArgs->pWorker; - unused */
3549 PSUPGIPCPU pGipCpuMaster = pArgs->pMaster;
3550 uint32_t i;
3551 TSCDELTA_DBG_VARS();
3552
 /* Not a real loop: the body either returns or is left via a 'break' in
    one of the sync macros. */
3553 for (;;)
3554 {
3555 RTCCUINTREG fEFlags;
3556 AssertCompile((RT_ELEMENTS(pArgs->uMaster.Verify.auTscs) & 1) == 0);
3557 AssertCompile(RT_ELEMENTS(pArgs->uMaster.Verify.auTscs) == RT_ELEMENTS(pArgs->uWorker.Verify.auTscs));
3558
3559 if (fIsMaster)
3560 {
3561 uint64_t uTscWorker;
3562 TSCDELTA_MASTER_SYNC_BEFORE(pMySync, pOtherSync, &fEFlags, pArgs);
3563
3564 /*
3565 * Collect TSC, master goes first.
3566 */
3567 for (i = 0; i < RT_ELEMENTS(pArgs->uMaster.Verify.auTscs); i += 2)
3568 {
3569 /* Read, kick & wait #1. */
3570 uint64_t uTsc = ASMReadTSC();
3571 ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO_GO);
3572 ASMSerializeInstruction();
3573 pArgs->uMaster.Verify.auTscs[i] = uTsc;
3574 TSCDELTA_DBG_START_LOOP();
3575 while (ASMAtomicReadU32(&pMySync->uSyncVar) == GIP_TSC_DELTA_SYNC2_GO)
3576 {
3577 TSCDELTA_DBG_CHECK_LOOP();
3578 ASMNopPause();
3579 }
3580
3581 /* Read, kick & wait #2. */
3582 uTsc = ASMReadTSC();
3583 ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO);
3584 ASMSerializeInstruction();
3585 pArgs->uMaster.Verify.auTscs[i + 1] = uTsc;
3586 TSCDELTA_DBG_START_LOOP();
3587 while (ASMAtomicReadU32(&pMySync->uSyncVar) == GIP_TSC_DELTA_SYNC2_GO_GO)
3588 {
3589 TSCDELTA_DBG_CHECK_LOOP();
3590 ASMNopPause();
3591 }
3592 }
3593
3594 TSCDELTA_MASTER_SYNC_AFTER(pMySync, pOtherSync, fEFlags);
3595
3596 /*
3597 * Process the data.
3598 */
3599#ifdef TSCDELTA_VERIFY_WITH_STATS
3600 pArgs->cMaxVerifyTscTicks = INT64_MIN;
3601 pArgs->cMinVerifyTscTicks = INT64_MAX;
3602 pArgs->iVerifyBadTscDiff = 0;
3603#endif
3604 ASMAtomicWriteS32(&pArgs->rcVerify, VINF_SUCCESS);
3605 uTscWorker = 0;
3606 for (i = 0; i < RT_ELEMENTS(pArgs->uMaster.Verify.auTscs); i++)
3607 {
3608 /* Master vs previous worker entry. */
3609 uint64_t uTscMaster = pArgs->uMaster.Verify.auTscs[i] - pGipCpuMaster->i64TSCDelta;
3610 int64_t iDiff;
3611 if (i > 0)
3612 {
3613 iDiff = uTscMaster - uTscWorker;
3614#ifdef TSCDELTA_VERIFY_WITH_STATS
3615 if (iDiff > pArgs->cMaxVerifyTscTicks)
3616 pArgs->cMaxVerifyTscTicks = iDiff;
3617 if (iDiff < pArgs->cMinVerifyTscTicks)
3618 pArgs->cMinVerifyTscTicks = iDiff;
3619#endif
3620 if (iDiff < 0)
3621 {
3622#ifdef TSCDELTA_VERIFY_WITH_STATS
 /* Negated since the statistic is worker relative to master. */
3623 pArgs->iVerifyBadTscDiff = -iDiff;
3624#endif
3625 ASMAtomicWriteS32(&pArgs->rcVerify, VERR_OUT_OF_RANGE);
3626 break;
3627 }
3628 }
3629
3630 /* Worker vs master. */
3631 uTscWorker = pArgs->uWorker.Verify.auTscs[i] - iWorkerTscDelta;
3632 iDiff = uTscWorker - uTscMaster;
3633#ifdef TSCDELTA_VERIFY_WITH_STATS
3634 if (iDiff > pArgs->cMaxVerifyTscTicks)
3635 pArgs->cMaxVerifyTscTicks = iDiff;
3636 if (iDiff < pArgs->cMinVerifyTscTicks)
3637 pArgs->cMinVerifyTscTicks = iDiff;
3638#endif
3639 if (iDiff < 0)
3640 {
3641#ifdef TSCDELTA_VERIFY_WITH_STATS
3642 pArgs->iVerifyBadTscDiff = iDiff;
3643#endif
3644 ASMAtomicWriteS32(&pArgs->rcVerify, VERR_OUT_OF_RANGE);
3645 break;
3646 }
3647 }
3648
3649 /* Done. */
3650 TSCDELTA_MASTER_KICK_OTHER_OUT_OF_AFTER(pMySync, pOtherSync);
3651 }
3652 else
3653 {
3654 /*
3655 * The worker, master leads.
3656 */
3657 TSCDELTA_OTHER_SYNC_BEFORE(pMySync, pOtherSync, &fEFlags, pArgs);
3658
3659 for (i = 0; i < RT_ELEMENTS(pArgs->uWorker.Verify.auTscs); i += 2)
3660 {
3661 uint64_t uTsc;
3662
3663 /* Wait, Read and Kick #1. */
3664 TSCDELTA_DBG_START_LOOP();
3665 while (ASMAtomicReadU32(&pMySync->uSyncVar) == GIP_TSC_DELTA_SYNC2_GO)
3666 {
3667 TSCDELTA_DBG_CHECK_LOOP();
3668 ASMNopPause();
3669 }
3670 uTsc = ASMReadTSC();
3671 ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO_GO);
3672 ASMSerializeInstruction();
3673 pArgs->uWorker.Verify.auTscs[i] = uTsc;
3674
3675 /* Wait, Read and Kick #2. */
3676 TSCDELTA_DBG_START_LOOP();
3677 while (ASMAtomicReadU32(&pMySync->uSyncVar) == GIP_TSC_DELTA_SYNC2_GO_GO)
3678 {
3679 TSCDELTA_DBG_CHECK_LOOP();
3680 ASMNopPause();
3681 }
3682 uTsc = ASMReadTSC();
3683 ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO);
3684 ASMSerializeInstruction();
3685 pArgs->uWorker.Verify.auTscs[i + 1] = uTsc;
3686 }
3687
 /* This returns once the master has kicked us, which it does after
    processing the data. */
3688 TSCDELTA_OTHER_SYNC_AFTER(pMySync, pOtherSync, fEFlags);
3689 }
3690 return pArgs->rcVerify;
3691 }
3692
3693 /*
3694 * Timed out, please retry.
3695 */
3696 ASMAtomicWriteS32(&pArgs->rcVerify, VERR_TRY_AGAIN);
3697 return VERR_TIMEOUT;
3698}
3699
3700
3701
3702/**
3703 * Handles the special abort procedure during synchronization setup in
3704 * supdrvTscMeasureDeltaCallbackUnwrapped().
3705 *
3706 * @returns 0 (dummy, ignored)
3707 * @param pArgs Pointer to argument/state data.
3708 * @param pMySync Pointer to my sync structure.
3709 * @param fIsMaster Set if we're the master, clear if worker.
3710 * @param fTimeout Set if it's a timeout.
3711 */
3712DECL_NO_INLINE(static, int)
3713supdrvTscMeasureDeltaCallbackAbortSyncSetup(PSUPDRVGIPTSCDELTARGS pArgs, PSUPTSCDELTASYNC2 pMySync, bool fIsMaster, bool fTimeout)
3714{
3715 PSUPTSCDELTASYNC2 volatile *ppMySync = fIsMaster ? &pArgs->pSyncMaster : &pArgs->pSyncWorker;
3716 PSUPTSCDELTASYNC2 volatile *ppOtherSync = fIsMaster ? &pArgs->pSyncWorker : &pArgs->pSyncMaster;
3717 TSCDELTA_DBG_VARS();
3718 RT_NOREF1(pMySync);
3719
3720 /*
3721 * Clear our sync pointer and make sure the abort flag is set.
3722 */
3723 ASMAtomicWriteNullPtr(ppMySync);
3724 ASMAtomicWriteBool(&pArgs->fAbortSetup, true);
3725 if (fTimeout)
3726 ASMAtomicWriteBool(&pArgs->fTimedOut, true);
3727
3728 /*
3729 * Make sure the other party is out of there and won't be touching our
3730 * sync state again (would cause stack corruption).
3731 */
3732 TSCDELTA_DBG_START_LOOP();
3733 while (ASMAtomicReadPtrT(ppOtherSync, PSUPTSCDELTASYNC2) != NULL)
3734 {
3735 ASMNopPause();
3736 ASMNopPause();
3737 ASMNopPause();
3738 TSCDELTA_DBG_CHECK_LOOP();
3739 }
3740
3741 return 0;
3742}
3743
3744
3745/**
3746 * This is used by supdrvTscMeasureInitialDeltas() to read the TSC on two CPUs
3747 * and compute the delta between them.
3748 *
3749 * To reduce code size a good deal when timeout handling was added, a dummy return
3750 * value had to be added (saves 1-3 lines per timeout case), thus this
3751 * 'Unwrapped' function and the dummy 0 return value.
3752 *
3753 * @returns 0 (dummy, ignored)
3754 * @param idCpu The CPU we are current scheduled on.
3755 * @param pArgs Pointer to a parameter package.
3756 *
3757 * @remarks Measuring TSC deltas between the CPUs is tricky because we need to
3758 * read the TSC at exactly the same time on both the master and the
3759 * worker CPUs. Due to DMA, bus arbitration, cache locality,
3760 * contention, SMI, pipelining etc. there is no guaranteed way of
3761 * doing this on x86 CPUs.
3762 */
3763static int supdrvTscMeasureDeltaCallbackUnwrapped(RTCPUID idCpu, PSUPDRVGIPTSCDELTARGS pArgs)
3764{
3765 PSUPDRVDEVEXT pDevExt = pArgs->pDevExt;
3766 PSUPGIPCPU pGipCpuWorker = pArgs->pWorker;
3767 PSUPGIPCPU pGipCpuMaster = pArgs->pMaster;
3768 bool const fIsMaster = idCpu == pGipCpuMaster->idCpu;
3769 uint32_t iTry;
3770 PSUPTSCDELTASYNC2 volatile *ppMySync = fIsMaster ? &pArgs->pSyncMaster : &pArgs->pSyncWorker;
3771 PSUPTSCDELTASYNC2 volatile *ppOtherSync = fIsMaster ? &pArgs->pSyncWorker : &pArgs->pSyncMaster;
3772 SUPTSCDELTASYNC2 MySync;
3773 PSUPTSCDELTASYNC2 pOtherSync;
3774 int rc;
3775 TSCDELTA_DBG_VARS();
3776
3777 /* A bit of paranoia first. */
3778 if (!pGipCpuMaster || !pGipCpuWorker)
3779 return 0;
3780
3781 /*
3782 * If the CPU isn't part of the measurement, return immediately.
3783 */
3784 if ( !fIsMaster
3785 && idCpu != pGipCpuWorker->idCpu)
3786 return 0;
3787
3788 /*
3789 * Set up my synchronization stuff and wait for the other party to show up.
3790 *
3791 * We don't wait forever since the other party may be off fishing (offline,
3792 * spinning with ints disables, whatever), we must play nice to the rest of
3793 * the system as this context generally isn't one in which we will get
3794 * preempted and we may hold up a number of lower priority interrupts.
3795 */
3796 ASMAtomicWriteU32(&MySync.uSyncVar, GIP_TSC_DELTA_SYNC2_PRESTART_WAIT);
3797 ASMAtomicWritePtr(ppMySync, &MySync);
3798 MySync.uTscStart = ASMReadTSC();
3799 MySync.cMaxTscTicks = pArgs->cMaxTscTicks;
3800
3801 /* Look for the partner, might not be here yet... Special abort considerations. */
3802 iTry = 0;
3803 TSCDELTA_DBG_START_LOOP();
3804 while ((pOtherSync = ASMAtomicReadPtrT(ppOtherSync, PSUPTSCDELTASYNC2)) == NULL)
3805 {
3806 ASMNopPause();
3807 if ( ASMAtomicReadBool(&pArgs->fAbortSetup)
3808 || !RTMpIsCpuOnline(fIsMaster ? pGipCpuWorker->idCpu : pGipCpuMaster->idCpu) )
3809 return supdrvTscMeasureDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, false /*fTimeout*/);
3810 if ( (iTry++ & 0xff) == 0
3811 && ASMReadTSC() - MySync.uTscStart > pArgs->cMaxTscTicks)
3812 return supdrvTscMeasureDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, true /*fTimeout*/);
3813 TSCDELTA_DBG_CHECK_LOOP();
3814 ASMNopPause();
3815 }
3816
3817 /* I found my partner, waiting to be found... Special abort considerations. */
3818 if (fIsMaster)
3819 if (!ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_READY, GIP_TSC_DELTA_SYNC2_PRESTART_WAIT)) /* parnaoia */
3820 return supdrvTscMeasureDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, false /*fTimeout*/);
3821
3822 iTry = 0;
3823 TSCDELTA_DBG_START_LOOP();
3824 while (ASMAtomicReadU32(&MySync.uSyncVar) == GIP_TSC_DELTA_SYNC2_PRESTART_WAIT)
3825 {
3826 ASMNopPause();
3827 if (ASMAtomicReadBool(&pArgs->fAbortSetup))
3828 return supdrvTscMeasureDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, false /*fTimeout*/);
3829 if ( (iTry++ & 0xff) == 0
3830 && ASMReadTSC() - MySync.uTscStart > pArgs->cMaxTscTicks)
3831 {
3832 if ( fIsMaster
3833 && !ASMAtomicCmpXchgU32(&MySync.uSyncVar, GIP_TSC_DELTA_SYNC2_PRESTART_ABORT, GIP_TSC_DELTA_SYNC2_PRESTART_WAIT))
3834 break; /* race #1: slave has moved on, handle timeout in loop instead. */
3835 return supdrvTscMeasureDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, true /*fTimeout*/);
3836 }
3837 TSCDELTA_DBG_CHECK_LOOP();
3838 }
3839
3840 if (!fIsMaster)
3841 if (!ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_READY, GIP_TSC_DELTA_SYNC2_PRESTART_WAIT)) /* race #1 */
3842 return supdrvTscMeasureDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, false /*fTimeout*/);
3843
3844/** @todo Add a resumable state to pArgs so we don't waste time if we time
3845 * out or something. Timeouts are legit, any of the two CPUs may get
3846 * interrupted. */
3847
3848 /*
3849 * Start by seeing if we have a zero delta between the two CPUs.
3850 * This should normally be the case.
3851 */
3852 rc = supdrvTscDeltaVerify(pArgs, &MySync, pOtherSync, fIsMaster, GIP_TSC_DELTA_INITIAL_MASTER_VALUE);
3853 if (RT_SUCCESS(rc))
3854 {
3855 if (fIsMaster)
3856 {
3857 ASMAtomicWriteS64(&pGipCpuWorker->i64TSCDelta, GIP_TSC_DELTA_INITIAL_MASTER_VALUE);
3858 RTCpuSetDelByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet);
3859 RTCpuSetAddByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpuWorker->iCpuSet);
3860 }
3861 }
3862 /*
3863 * If the verification didn't time out, do regular delta measurements.
3864 * We retry this until we get a reasonable value.
3865 */
3866 else if (rc != VERR_TIMEOUT)
3867 {
3868 Assert(pGipCpuWorker->i64TSCDelta == INT64_MAX);
3869 for (iTry = 0; iTry < 12; iTry++)
3870 {
3871 /*
3872 * Check the state before we start.
3873 */
3874 uint32_t u32Tmp = ASMAtomicReadU32(&MySync.uSyncVar);
3875 if ( u32Tmp != GIP_TSC_DELTA_SYNC2_READY
3876 && (fIsMaster || u32Tmp != GIP_TSC_DELTA_SYNC2_STEADY) /* worker may be late prepping for the next round */ )
3877 {
3878 TSCDELTA_DBG_SYNC_MSG(("sync/loop/%s: #0 iTry=%u MyState=%#x\n", fIsMaster ? "master" : "worker", iTry, u32Tmp));
3879 break;
3880 }
3881
3882 /*
3883 * Do the measurements.
3884 */
3885#ifdef GIP_TSC_DELTA_METHOD_1
3886 supdrvTscDeltaMethod1Loop(pArgs, &MySync, pOtherSync, fIsMaster, iTry);
3887#elif defined(GIP_TSC_DELTA_METHOD_2)
3888 supdrvTscDeltaMethod2Loop(pArgs, &MySync, pOtherSync, fIsMaster, iTry);
3889#else
3890# error "huh??"
3891#endif
3892
3893 /*
3894 * Check the state.
3895 */
3896 u32Tmp = ASMAtomicReadU32(&MySync.uSyncVar);
3897 if ( u32Tmp != GIP_TSC_DELTA_SYNC2_READY
3898 && (fIsMaster || u32Tmp != GIP_TSC_DELTA_SYNC2_STEADY) /* worker may be late prepping for the next round */ )
3899 {
3900 if (fIsMaster)
3901 TSCDELTA_DBG_SYNC_MSG(("sync/loop/master: #1 iTry=%u MyState=%#x\n", iTry, u32Tmp));
3902 else
3903 TSCDELTA_DBG_SYNC_MSG2(("sync/loop/worker: #1 iTry=%u MyState=%#x\n", iTry, u32Tmp));
3904 break;
3905 }
3906
3907 /*
3908 * Success? If so, stop trying. Master decides.
3909 */
3910 if (fIsMaster)
3911 {
3912 if (pGipCpuWorker->i64TSCDelta != INT64_MAX)
3913 {
3914 RTCpuSetDelByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet);
3915 RTCpuSetAddByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpuWorker->iCpuSet);
3916 TSCDELTA_DBG_SYNC_MSG2(("sync/loop/master: #9 iTry=%u MyState=%#x\n", iTry, MySync.uSyncVar));
3917 break;
3918 }
3919 }
3920 }
3921 if (fIsMaster)
3922 pArgs->iTry = iTry;
3923 }
3924
3925 /*
3926 * End the synchronization dance. We tell the other that we're done,
3927 * then wait for the same kind of reply.
3928 */
3929 ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_FINAL);
3930 ASMAtomicWriteNullPtr(ppMySync);
3931 iTry = 0;
3932 TSCDELTA_DBG_START_LOOP();
3933 while (ASMAtomicReadU32(&MySync.uSyncVar) != GIP_TSC_DELTA_SYNC2_FINAL)
3934 {
3935 iTry++;
3936 if ( iTry == 0
3937 && !RTMpIsCpuOnline(fIsMaster ? pGipCpuWorker->idCpu : pGipCpuMaster->idCpu))
3938 break; /* this really shouldn't happen. */
3939 TSCDELTA_DBG_CHECK_LOOP();
3940 ASMNopPause();
3941 }
3942
3943 /*
3944 * Collect some runtime stats.
3945 */
3946 if (fIsMaster)
3947 pArgs->cElapsedMasterTscTicks = ASMReadTSC() - MySync.uTscStart;
3948 else
3949 pArgs->cElapsedWorkerTscTicks = ASMReadTSC() - MySync.uTscStart;
3950 return 0;
3951}
3952
3953/**
3954 * Callback used by supdrvTscMeasureInitialDeltas() to read the TSC on two CPUs
3955 * and compute the delta between them.
3956 *
3957 * @param idCpu The CPU we are current scheduled on.
3958 * @param pvUser1 Pointer to a parameter package (SUPDRVGIPTSCDELTARGS).
3959 * @param pvUser2 Unused.
3960 */
3961static DECLCALLBACK(void) supdrvTscMeasureDeltaCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
3962{
3963 supdrvTscMeasureDeltaCallbackUnwrapped(idCpu, (PSUPDRVGIPTSCDELTARGS)pvUser1);
3964 RT_NOREF1(pvUser2);
3965}
3966
3967
3968/**
3969 * Measures the TSC delta between the master GIP CPU and one specified worker
3970 * CPU.
3971 *
3972 * @returns VBox status code.
3973 * @retval VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED on pure measurement
3974 * failure.
3975 * @param pDevExt Pointer to the device instance data.
3976 * @param idxWorker The index of the worker CPU from the GIP's array of
3977 * CPUs.
3978 *
3979 * @remarks This must be called with preemption enabled!
3980 */
3981static int supdrvTscMeasureDeltaOne(PSUPDRVDEVEXT pDevExt, uint32_t idxWorker)
3982{
3983 int rc;
3984 int rc2;
3985 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
3986 RTCPUID idMaster = pDevExt->idGipMaster;
3987 PSUPGIPCPU pGipCpuWorker = &pGip->aCPUs[idxWorker];
3988 PSUPGIPCPU pGipCpuMaster;
3989 uint32_t iGipCpuMaster;
3990 uint32_t u32Tmp;
3991
3992 /* Validate input a bit. */
3993 AssertReturn(pGip, VERR_INVALID_PARAMETER);
3994 Assert(pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED);
3995 Assert(RTThreadPreemptIsEnabled(NIL_RTTHREAD));
3996
3997 /*
3998 * Don't attempt measuring the delta for the GIP master.
3999 */
4000 if (pGipCpuWorker->idCpu == idMaster)
4001 {
4002 if (pGipCpuWorker->i64TSCDelta == INT64_MAX) /* This shouldn't happen, but just in case. */
4003 ASMAtomicWriteS64(&pGipCpuWorker->i64TSCDelta, GIP_TSC_DELTA_INITIAL_MASTER_VALUE);
4004 return VINF_SUCCESS;
4005 }
4006
4007 /*
4008 * One measurement at a time, at least for now. We might be using
4009 * broadcast IPIs so, so be nice to the rest of the system.
4010 */
4011#ifdef SUPDRV_USE_MUTEX_FOR_GIP
4012 rc = RTSemMutexRequest(pDevExt->mtxTscDelta, RT_INDEFINITE_WAIT);
4013#else
4014 rc = RTSemFastMutexRequest(pDevExt->mtxTscDelta);
4015#endif
4016 if (RT_FAILURE(rc))
4017 return rc;
4018
4019 /*
4020 * If the CPU has hyper-threading and the APIC IDs of the master and worker are adjacent,
4021 * try pick a different master. (This fudge only works with multi core systems.)
4022 * ASSUMES related threads have adjacent APIC IDs. ASSUMES two threads per core.
4023 *
4024 * We skip this on AMDs for now as their HTT is different from Intel's and
4025 * it doesn't seem to have any favorable effect on the results.
4026 *
4027 * If the master is offline, we need a new master too, so share the code.
4028 */
4029 iGipCpuMaster = supdrvGipFindCpuIndexForCpuId(pGip, idMaster);
4030 AssertReturn(iGipCpuMaster < pGip->cCpus, VERR_INVALID_CPU_ID);
4031 pGipCpuMaster = &pGip->aCPUs[iGipCpuMaster];
4032 if ( ( (pGipCpuMaster->idApic & ~1) == (pGipCpuWorker->idApic & ~1)
4033 && pGip->cOnlineCpus > 2
4034 && ASMHasCpuId()
4035 && ASMIsValidStdRange(ASMCpuId_EAX(0))
4036 && (ASMCpuId_EDX(1) & X86_CPUID_FEATURE_EDX_HTT)
4037 && ( !ASMIsAmdCpu()
4038 || ASMGetCpuFamily(u32Tmp = ASMCpuId_EAX(1)) > 0x15
4039 || ( ASMGetCpuFamily(u32Tmp) == 0x15 /* Piledriver+, not bulldozer (FX-4150 didn't like it). */
4040 && ASMGetCpuModelAMD(u32Tmp) >= 0x02) ) )
4041 || !RTMpIsCpuOnline(idMaster) )
4042 {
4043 uint32_t i;
4044 for (i = 0; i < pGip->cCpus; i++)
4045 if ( i != iGipCpuMaster
4046 && i != idxWorker
4047 && pGip->aCPUs[i].enmState == SUPGIPCPUSTATE_ONLINE
4048 && pGip->aCPUs[i].i64TSCDelta != INT64_MAX
4049 && pGip->aCPUs[i].idCpu != NIL_RTCPUID
4050 && pGip->aCPUs[i].idCpu != idMaster /* paranoia starts here... */
4051 && pGip->aCPUs[i].idCpu != pGipCpuWorker->idCpu
4052 && pGip->aCPUs[i].idApic != pGipCpuWorker->idApic
4053 && pGip->aCPUs[i].idApic != pGipCpuMaster->idApic
4054 && RTMpIsCpuOnline(pGip->aCPUs[i].idCpu))
4055 {
4056 iGipCpuMaster = i;
4057 pGipCpuMaster = &pGip->aCPUs[i];
4058 idMaster = pGipCpuMaster->idCpu;
4059 break;
4060 }
4061 }
4062
4063 if (RTCpuSetIsMemberByIndex(&pGip->OnlineCpuSet, pGipCpuWorker->iCpuSet))
4064 {
4065 /*
4066 * Initialize data package for the RTMpOnPair callback.
4067 */
4068 PSUPDRVGIPTSCDELTARGS pArgs = (PSUPDRVGIPTSCDELTARGS)RTMemAllocZ(sizeof(*pArgs));
4069 if (pArgs)
4070 {
4071 pArgs->pWorker = pGipCpuWorker;
4072 pArgs->pMaster = pGipCpuMaster;
4073 pArgs->pDevExt = pDevExt;
4074 pArgs->pSyncMaster = NULL;
4075 pArgs->pSyncWorker = NULL;
4076 pArgs->cMaxTscTicks = ASMAtomicReadU64(&pGip->u64CpuHz) / 512; /* 1953 us */
4077
4078 /*
4079 * Do the RTMpOnPair call. We reset i64TSCDelta first so we
4080 * and supdrvTscMeasureDeltaCallback can use it as a success check.
4081 */
4082 /** @todo Store the i64TSCDelta result in pArgs first? Perhaps deals with
4083 * that when doing the restart loop reorg. */
4084 ASMAtomicWriteS64(&pGipCpuWorker->i64TSCDelta, INT64_MAX);
4085 rc = RTMpOnPair(pGipCpuMaster->idCpu, pGipCpuWorker->idCpu, RTMPON_F_CONCURRENT_EXEC,
4086 supdrvTscMeasureDeltaCallback, pArgs, NULL);
4087 if (RT_SUCCESS(rc))
4088 {
4089#if 0
4090 SUPR0Printf("mponpair ticks: %9llu %9llu max: %9llu iTry: %u%s\n", pArgs->cElapsedMasterTscTicks,
4091 pArgs->cElapsedWorkerTscTicks, pArgs->cMaxTscTicks, pArgs->iTry,
4092 pArgs->fTimedOut ? " timed out" :"");
4093#endif
4094#if 0
4095 SUPR0Printf("rcVerify=%d iVerifyBadTscDiff=%lld cMinVerifyTscTicks=%lld cMaxVerifyTscTicks=%lld\n",
4096 pArgs->rcVerify, pArgs->iVerifyBadTscDiff, pArgs->cMinVerifyTscTicks, pArgs->cMaxVerifyTscTicks);
4097#endif
4098 if (RT_LIKELY(pGipCpuWorker->i64TSCDelta != INT64_MAX))
4099 {
4100 /*
4101 * Work the TSC delta applicability rating. It starts
4102 * optimistic in supdrvGipInit, we downgrade it here.
4103 */
4104 SUPGIPUSETSCDELTA enmRating;
4105 if ( pGipCpuWorker->i64TSCDelta > GIP_TSC_DELTA_THRESHOLD_ROUGHLY_ZERO
4106 || pGipCpuWorker->i64TSCDelta < -GIP_TSC_DELTA_THRESHOLD_ROUGHLY_ZERO)
4107 enmRating = SUPGIPUSETSCDELTA_NOT_ZERO;
4108 else if ( pGipCpuWorker->i64TSCDelta > GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO
4109 || pGipCpuWorker->i64TSCDelta < -GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO)
4110 enmRating = SUPGIPUSETSCDELTA_ROUGHLY_ZERO;
4111 else
4112 enmRating = SUPGIPUSETSCDELTA_PRACTICALLY_ZERO;
4113 if (pGip->enmUseTscDelta < enmRating)
4114 {
4115 AssertCompile(sizeof(pGip->enmUseTscDelta) == sizeof(uint32_t));
4116 ASMAtomicWriteU32((uint32_t volatile *)&pGip->enmUseTscDelta, enmRating);
4117 }
4118 }
4119 else
4120 rc = VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED;
4121 }
4122 /** @todo return try-again if we get an offline CPU error. */
4123
4124 RTMemFree(pArgs);
4125 }
4126 else
4127 rc = VERR_NO_MEMORY;
4128 }
4129 else
4130 rc = VERR_CPU_OFFLINE;
4131
4132 /*
4133 * We're done now.
4134 */
4135#ifdef SUPDRV_USE_MUTEX_FOR_GIP
4136 rc2 = RTSemMutexRelease(pDevExt->mtxTscDelta); AssertRC(rc2);
4137#else
4138 rc2 = RTSemFastMutexRelease(pDevExt->mtxTscDelta); AssertRC(rc2);
4139#endif
4140 return rc;
4141}
4142
4143
4144/**
4145 * Resets the TSC-delta related TSC samples and optionally the deltas
4146 * themselves.
4147 *
4148 * @param pDevExt Pointer to the device instance data.
4149 * @param fResetTscDeltas Whether the TSC-deltas are also to be reset.
4150 *
4151 * @remarks This might be called while holding a spinlock!
4152 */
4153static void supdrvTscResetSamples(PSUPDRVDEVEXT pDevExt, bool fResetTscDeltas)
4154{
4155 unsigned iCpu;
4156 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
4157 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
4158 {
4159 PSUPGIPCPU pGipCpu = &pGip->aCPUs[iCpu];
4160 ASMAtomicWriteU64(&pGipCpu->u64TSCSample, GIP_TSC_DELTA_RSVD);
4161 if (fResetTscDeltas)
4162 {
4163 RTCpuSetDelByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpu->iCpuSet);
4164 ASMAtomicWriteS64(&pGipCpu->i64TSCDelta, INT64_MAX);
4165 }
4166 }
4167}
4168
4169
4170/**
4171 * Picks an online CPU as the master TSC for TSC-delta computations.
4172 *
4173 * @returns VBox status code.
4174 * @param pDevExt Pointer to the device instance data.
4175 * @param pidxMaster Where to store the CPU array index of the chosen
4176 * master. Optional, can be NULL.
4177 */
4178static int supdrvTscPickMaster(PSUPDRVDEVEXT pDevExt, uint32_t *pidxMaster)
4179{
4180 /*
4181 * Pick the first CPU online as the master TSC and make it the new GIP master based
4182 * on the APIC ID.
4183 *
4184 * Technically we can simply use "idGipMaster" but doing this gives us master as CPU 0
4185 * in most cases making it nicer/easier for comparisons. It is safe to update the GIP
4186 * master as this point since the sync/async timer isn't created yet.
4187 */
4188 unsigned iCpu;
4189 uint32_t idxMaster = UINT32_MAX;
4190 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
4191 for (iCpu = 0; iCpu < RT_ELEMENTS(pGip->aiCpuFromApicId); iCpu++)
4192 {
4193 uint16_t idxCpu = pGip->aiCpuFromApicId[iCpu];
4194 if (idxCpu != UINT16_MAX)
4195 {
4196 PSUPGIPCPU pGipCpu = &pGip->aCPUs[idxCpu];
4197 if (RTCpuSetIsMemberByIndex(&pGip->OnlineCpuSet, pGipCpu->iCpuSet))
4198 {
4199 idxMaster = idxCpu;
4200 pGipCpu->i64TSCDelta = GIP_TSC_DELTA_INITIAL_MASTER_VALUE;
4201 ASMAtomicWriteSize(&pDevExt->idGipMaster, pGipCpu->idCpu);
4202 if (pidxMaster)
4203 *pidxMaster = idxMaster;
4204 return VINF_SUCCESS;
4205 }
4206 }
4207 }
4208 return VERR_CPU_OFFLINE;
4209}
4210
4211
4212/**
4213 * Performs the initial measurements of the TSC deltas between CPUs.
4214 *
4215 * This is called by supdrvGipCreate(), supdrvGipPowerNotificationCallback() or
4216 * triggered by it if threaded.
4217 *
4218 * @returns VBox status code.
4219 * @param pDevExt Pointer to the device instance data.
4220 *
4221 * @remarks Must be called only after supdrvGipInitOnCpu() as this function uses
4222 * idCpu, GIP's online CPU set which are populated in
4223 * supdrvGipInitOnCpu().
4224 */
4225static int supdrvTscMeasureInitialDeltas(PSUPDRVDEVEXT pDevExt)
4226{
4227 PSUPGIPCPU pGipCpuMaster;
4228 unsigned iCpu;
4229 unsigned iOddEven;
4230 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
4231 uint32_t idxMaster = UINT32_MAX;
4232 uint32_t cMpOnOffEvents = ASMAtomicReadU32(&pDevExt->cMpOnOffEvents);
4233
4234 Assert(pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED);
4235 supdrvTscResetSamples(pDevExt, true /* fClearDeltas */);
4236 int rc = supdrvTscPickMaster(pDevExt, &idxMaster);
4237 if (RT_FAILURE(rc))
4238 {
4239 SUPR0Printf("Failed to pick a CPU master for TSC-delta measurements rc=%Rrc\n", rc);
4240 return rc;
4241 }
4242 AssertReturn(idxMaster < pGip->cCpus, VERR_INVALID_CPU_INDEX);
4243 pGipCpuMaster = &pGip->aCPUs[idxMaster];
4244 Assert(pDevExt->idGipMaster == pGipCpuMaster->idCpu);
4245
4246 /*
4247 * If there is only a single CPU online we have nothing to do.
4248 */
4249 if (pGip->cOnlineCpus <= 1)
4250 {
4251 AssertReturn(pGip->cOnlineCpus > 0, VERR_INTERNAL_ERROR_5);
4252 return VINF_SUCCESS;
4253 }
4254
4255 /*
4256 * Loop thru the GIP CPU array and get deltas for each CPU (except the
4257 * master). We do the CPUs with the even numbered APIC IDs first so that
4258 * we've got alternative master CPUs to pick from on hyper-threaded systems.
4259 */
4260 for (iOddEven = 0; iOddEven < 2; iOddEven++)
4261 {
4262 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
4263 {
4264 PSUPGIPCPU pGipCpuWorker = &pGip->aCPUs[iCpu];
4265 if ( iCpu != idxMaster
4266 && (iOddEven > 0 || (pGipCpuWorker->idApic & 1) == 0)
4267 && RTCpuSetIsMemberByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet))
4268 {
4269 rc = supdrvTscMeasureDeltaOne(pDevExt, iCpu);
4270 if (RT_FAILURE(rc))
4271 {
4272 SUPR0Printf("supdrvTscMeasureDeltaOne failed. rc=%d CPU[%u].idCpu=%u Master[%u].idCpu=%u\n", rc, iCpu,
4273 pGipCpuWorker->idCpu, idxMaster, pDevExt->idGipMaster, pGipCpuMaster->idCpu);
4274 break;
4275 }
4276
4277 if (ASMAtomicReadU32(&pDevExt->cMpOnOffEvents) != cMpOnOffEvents)
4278 {
4279 SUPR0Printf("One or more CPUs transitioned between online & offline states. I'm confused, retry...\n");
4280 rc = VERR_TRY_AGAIN;
4281 break;
4282 }
4283 }
4284 }
4285 }
4286
4287 return rc;
4288}
4289
4290
4291#ifdef SUPDRV_USE_TSC_DELTA_THREAD
4292
4293/**
4294 * Switches the TSC-delta measurement thread into the butchered state.
4295 *
4296 * @returns VBox status code.
4297 * @param pDevExt Pointer to the device instance data.
4298 * @param fSpinlockHeld Whether the TSC-delta spinlock is held or not.
4299 * @param pszFailed An error message to log.
4300 * @param rcFailed The error code to exit the thread with.
4301 */
4302static int supdrvTscDeltaThreadButchered(PSUPDRVDEVEXT pDevExt, bool fSpinlockHeld, const char *pszFailed, int rcFailed)
4303{
4304 if (!fSpinlockHeld)
4305 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4306
4307 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Butchered;
4308 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4309 OSDBGPRINT(("supdrvTscDeltaThreadButchered: %s. rc=%Rrc\n", pszFailed, rcFailed));
4310 return rcFailed;
4311}
4312
4313
4314/**
4315 * The TSC-delta measurement thread.
4316 *
4317 * @returns VBox status code.
4318 * @param hThread The thread handle.
4319 * @param pvUser Opaque pointer to the device instance data.
4320 */
4321static DECLCALLBACK(int) supdrvTscDeltaThread(RTTHREAD hThread, void *pvUser)
4322{
4323 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
4324 int rc = VERR_INTERNAL_ERROR_2;
4325 for (;;)
4326 {
4327 /*
4328 * Switch on the current state.
4329 */
4330 SUPDRVTSCDELTATHREADSTATE enmState;
4331 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4332 enmState = pDevExt->enmTscDeltaThreadState;
4333 switch (enmState)
4334 {
4335 case kTscDeltaThreadState_Creating:
4336 {
4337 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Listening;
4338 rc = RTSemEventSignal(pDevExt->hTscDeltaEvent);
4339 if (RT_FAILURE(rc))
4340 return supdrvTscDeltaThreadButchered(pDevExt, true /* fSpinlockHeld */, "RTSemEventSignal", rc);
4341 RT_FALL_THRU();
4342 }
4343
4344 case kTscDeltaThreadState_Listening:
4345 {
4346 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4347
4348 /*
4349 * Linux counts uninterruptible sleeps as load, hence we shall do a
4350 * regular, interruptible sleep here and ignore wake ups due to signals.
4351 * See task_contributes_to_load() in include/linux/sched.h in the Linux sources.
4352 */
4353 rc = RTThreadUserWaitNoResume(hThread, pDevExt->cMsTscDeltaTimeout);
4354 if ( RT_FAILURE(rc)
4355 && rc != VERR_TIMEOUT
4356 && rc != VERR_INTERRUPTED)
4357 return supdrvTscDeltaThreadButchered(pDevExt, false /* fSpinlockHeld */, "RTThreadUserWait", rc);
4358 RTThreadUserReset(hThread);
4359 break;
4360 }
4361
4362 case kTscDeltaThreadState_WaitAndMeasure:
4363 {
4364 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Measuring;
4365 rc = RTSemEventSignal(pDevExt->hTscDeltaEvent); /* (Safe on windows as long as spinlock isn't IRQ safe.) */
4366 if (RT_FAILURE(rc))
4367 return supdrvTscDeltaThreadButchered(pDevExt, true /* fSpinlockHeld */, "RTSemEventSignal", rc);
4368 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4369 RTThreadSleep(1);
4370 RT_FALL_THRU();
4371 }
4372
4373 case kTscDeltaThreadState_Measuring:
4374 {
4375 if (pDevExt->fTscThreadRecomputeAllDeltas)
4376 {
4377 int cTries = 8;
4378 int cMsWaitPerTry = 10;
4379 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
4380 Assert(pGip);
4381 do
4382 {
4383 RTCpuSetCopy(&pDevExt->TscDeltaCpuSet, &pGip->OnlineCpuSet);
4384 rc = supdrvTscMeasureInitialDeltas(pDevExt);
4385 if ( RT_SUCCESS(rc)
4386 || ( RT_FAILURE(rc)
4387 && rc != VERR_TRY_AGAIN
4388 && rc != VERR_CPU_OFFLINE))
4389 {
4390 break;
4391 }
4392 RTThreadSleep(cMsWaitPerTry);
4393 } while (cTries-- > 0);
4394 pDevExt->fTscThreadRecomputeAllDeltas = false;
4395 }
4396 else
4397 {
4398 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
4399 unsigned iCpu;
4400
4401 /* Measure TSC-deltas only for the CPUs that are in the set. */
4402 rc = VINF_SUCCESS;
4403 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
4404 {
4405 PSUPGIPCPU pGipCpuWorker = &pGip->aCPUs[iCpu];
4406 if (RTCpuSetIsMemberByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet))
4407 {
4408 if (pGipCpuWorker->i64TSCDelta == INT64_MAX)
4409 {
4410 int rc2 = supdrvTscMeasureDeltaOne(pDevExt, iCpu);
4411 if (RT_FAILURE(rc2) && RT_SUCCESS(rc))
4412 rc = rc2;
4413 }
4414 else
4415 {
4416 /*
4417 * The thread/someone must've called SUPR0TscDeltaMeasureBySetIndex(),
4418 * mark the delta as fine to get the timer thread off our back.
4419 */
4420 RTCpuSetDelByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet);
4421 RTCpuSetAddByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpuWorker->iCpuSet);
4422 }
4423 }
4424 }
4425 }
4426 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4427 if (pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Measuring)
4428 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Listening;
4429 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4430 Assert(rc != VERR_NOT_AVAILABLE); /* VERR_NOT_AVAILABLE is used as init value, see supdrvTscDeltaThreadInit(). */
4431 ASMAtomicWriteS32(&pDevExt->rcTscDelta, rc);
4432 break;
4433 }
4434
4435 case kTscDeltaThreadState_Terminating:
4436 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Destroyed;
4437 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4438 return VINF_SUCCESS;
4439
4440 case kTscDeltaThreadState_Butchered:
4441 default:
4442 return supdrvTscDeltaThreadButchered(pDevExt, true /* fSpinlockHeld */, "Invalid state", VERR_INVALID_STATE);
4443 }
4444 }
4445 /* not reached */
4446}
4447
4448
4449/**
4450 * Waits for the TSC-delta measurement thread to respond to a state change.
4451 *
4452 * @returns VINF_SUCCESS on success, VERR_TIMEOUT if it doesn't respond in time,
4453 * other error code on internal error.
4454 *
4455 * @param pDevExt The device instance data.
4456 * @param enmCurState The current state.
4457 * @param enmNewState The new state we're waiting for it to enter.
4458 */
4459static int supdrvTscDeltaThreadWait(PSUPDRVDEVEXT pDevExt, SUPDRVTSCDELTATHREADSTATE enmCurState,
4460 SUPDRVTSCDELTATHREADSTATE enmNewState)
4461{
4462 SUPDRVTSCDELTATHREADSTATE enmActualState;
4463 int rc;
4464
4465 /*
4466 * Wait a short while for the expected state transition.
4467 */
4468 RTSemEventWait(pDevExt->hTscDeltaEvent, RT_MS_1SEC);
4469 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4470 enmActualState = pDevExt->enmTscDeltaThreadState;
4471 if (enmActualState == enmNewState)
4472 {
4473 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4474 rc = VINF_SUCCESS;
4475 }
4476 else if (enmActualState == enmCurState)
4477 {
4478 /*
4479 * Wait longer if the state has not yet transitioned to the one we want.
4480 */
4481 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4482 rc = RTSemEventWait(pDevExt->hTscDeltaEvent, 50 * RT_MS_1SEC);
4483 if ( RT_SUCCESS(rc)
4484 || rc == VERR_TIMEOUT)
4485 {
4486 /*
4487 * Check the state whether we've succeeded.
4488 */
4489 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4490 enmActualState = pDevExt->enmTscDeltaThreadState;
4491 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4492 if (enmActualState == enmNewState)
4493 rc = VINF_SUCCESS;
4494 else if (enmActualState == enmCurState)
4495 {
4496 rc = VERR_TIMEOUT;
4497 OSDBGPRINT(("supdrvTscDeltaThreadWait: timed out state transition. enmActualState=%d enmNewState=%d\n",
4498 enmActualState, enmNewState));
4499 }
4500 else
4501 {
4502 rc = VERR_INTERNAL_ERROR;
4503 OSDBGPRINT(("supdrvTscDeltaThreadWait: invalid state transition from %d to %d, expected %d\n", enmCurState,
4504 enmActualState, enmNewState));
4505 }
4506 }
4507 else
4508 OSDBGPRINT(("supdrvTscDeltaThreadWait: RTSemEventWait failed. rc=%Rrc\n", rc));
4509 }
4510 else
4511 {
4512 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4513 OSDBGPRINT(("supdrvTscDeltaThreadWait: invalid state %d when transitioning from %d to %d\n",
4514 enmActualState, enmCurState, enmNewState));
4515 rc = VERR_INTERNAL_ERROR;
4516 }
4517
4518 return rc;
4519}
4520
4521
4522/**
4523 * Signals the TSC-delta thread to start measuring TSC-deltas.
4524 *
4525 * @param pDevExt Pointer to the device instance data.
4526 * @param fForceAll Force re-calculating TSC-deltas on all CPUs.
4527 */
4528static void supdrvTscDeltaThreadStartMeasurement(PSUPDRVDEVEXT pDevExt, bool fForceAll)
4529{
4530 if (pDevExt->hTscDeltaThread != NIL_RTTHREAD)
4531 {
4532 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4533 if ( pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Listening
4534 || pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Measuring)
4535 {
4536 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_WaitAndMeasure;
4537 if (fForceAll)
4538 pDevExt->fTscThreadRecomputeAllDeltas = true;
4539 }
4540 else if ( pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_WaitAndMeasure
4541 && fForceAll)
4542 pDevExt->fTscThreadRecomputeAllDeltas = true;
4543 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4544 RTThreadUserSignal(pDevExt->hTscDeltaThread);
4545 }
4546}
4547
4548
4549/**
4550 * Terminates the actual thread running supdrvTscDeltaThread().
4551 *
4552 * This is an internal worker function for supdrvTscDeltaThreadInit() and
4553 * supdrvTscDeltaTerm().
4554 *
4555 * @param pDevExt Pointer to the device instance data.
4556 */
4557static void supdrvTscDeltaThreadTerminate(PSUPDRVDEVEXT pDevExt)
4558{
4559 int rc;
4560 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4561 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Terminating;
4562 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4563 RTThreadUserSignal(pDevExt->hTscDeltaThread);
4564 rc = RTThreadWait(pDevExt->hTscDeltaThread, 50 * RT_MS_1SEC, NULL /* prc */);
4565 if (RT_FAILURE(rc))
4566 {
4567 /* Signal a few more times before giving up. */
4568 int cTriesLeft = 5;
4569 while (--cTriesLeft > 0)
4570 {
4571 RTThreadUserSignal(pDevExt->hTscDeltaThread);
4572 rc = RTThreadWait(pDevExt->hTscDeltaThread, 2 * RT_MS_1SEC, NULL /* prc */);
4573 if (rc != VERR_TIMEOUT)
4574 break;
4575 }
4576 }
4577}
4578
4579
4580/**
4581 * Initializes and spawns the TSC-delta measurement thread.
4582 *
4583 * A thread is required for servicing re-measurement requests from events like
4584 * CPUs coming online, suspend/resume etc. as it cannot be done synchronously
4585 * under all contexts on all OSs.
4586 *
4587 * @returns VBox status code.
4588 * @param pDevExt Pointer to the device instance data.
4589 *
4590 * @remarks Must only be called -after- initializing GIP and setting up MP
4591 * notifications!
4592 */
4593static int supdrvTscDeltaThreadInit(PSUPDRVDEVEXT pDevExt)
4594{
4595 int rc;
4596 Assert(pDevExt->pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED);
4597 rc = RTSpinlockCreate(&pDevExt->hTscDeltaSpinlock, RTSPINLOCK_FLAGS_INTERRUPT_UNSAFE, "VBoxTscSpnLck");
4598 if (RT_SUCCESS(rc))
4599 {
4600 rc = RTSemEventCreate(&pDevExt->hTscDeltaEvent);
4601 if (RT_SUCCESS(rc))
4602 {
4603 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Creating;
4604 pDevExt->cMsTscDeltaTimeout = 60000;
4605 rc = RTThreadCreate(&pDevExt->hTscDeltaThread, supdrvTscDeltaThread, pDevExt, 0 /* cbStack */,
4606 RTTHREADTYPE_DEFAULT, RTTHREADFLAGS_WAITABLE, "VBoxTscThread");
4607 if (RT_SUCCESS(rc))
4608 {
4609 rc = supdrvTscDeltaThreadWait(pDevExt, kTscDeltaThreadState_Creating, kTscDeltaThreadState_Listening);
4610 if (RT_SUCCESS(rc))
4611 {
4612 ASMAtomicWriteS32(&pDevExt->rcTscDelta, VERR_NOT_AVAILABLE);
4613 return rc;
4614 }
4615
4616 OSDBGPRINT(("supdrvTscDeltaInit: supdrvTscDeltaThreadWait failed. rc=%Rrc\n", rc));
4617 supdrvTscDeltaThreadTerminate(pDevExt);
4618 }
4619 else
4620 OSDBGPRINT(("supdrvTscDeltaInit: RTThreadCreate failed. rc=%Rrc\n", rc));
4621 RTSemEventDestroy(pDevExt->hTscDeltaEvent);
4622 pDevExt->hTscDeltaEvent = NIL_RTSEMEVENT;
4623 }
4624 else
4625 OSDBGPRINT(("supdrvTscDeltaInit: RTSemEventCreate failed. rc=%Rrc\n", rc));
4626 RTSpinlockDestroy(pDevExt->hTscDeltaSpinlock);
4627 pDevExt->hTscDeltaSpinlock = NIL_RTSPINLOCK;
4628 }
4629 else
4630 OSDBGPRINT(("supdrvTscDeltaInit: RTSpinlockCreate failed. rc=%Rrc\n", rc));
4631
4632 return rc;
4633}
4634
4635
4636/**
4637 * Terminates the TSC-delta measurement thread and cleanup.
4638 *
4639 * @param pDevExt Pointer to the device instance data.
4640 */
4641static void supdrvTscDeltaTerm(PSUPDRVDEVEXT pDevExt)
4642{
4643 if ( pDevExt->hTscDeltaSpinlock != NIL_RTSPINLOCK
4644 && pDevExt->hTscDeltaEvent != NIL_RTSEMEVENT)
4645 {
4646 supdrvTscDeltaThreadTerminate(pDevExt);
4647 }
4648
4649 if (pDevExt->hTscDeltaSpinlock != NIL_RTSPINLOCK)
4650 {
4651 RTSpinlockDestroy(pDevExt->hTscDeltaSpinlock);
4652 pDevExt->hTscDeltaSpinlock = NIL_RTSPINLOCK;
4653 }
4654
4655 if (pDevExt->hTscDeltaEvent != NIL_RTSEMEVENT)
4656 {
4657 RTSemEventDestroy(pDevExt->hTscDeltaEvent);
4658 pDevExt->hTscDeltaEvent = NIL_RTSEMEVENT;
4659 }
4660
4661 ASMAtomicWriteS32(&pDevExt->rcTscDelta, VERR_NOT_AVAILABLE);
4662}
4663
4664#endif /* SUPDRV_USE_TSC_DELTA_THREAD */
4665
4666/**
4667 * Measure the TSC delta for the CPU given by its CPU set index.
4668 *
4669 * @returns VBox status code.
4670 * @retval VERR_INTERRUPTED if interrupted while waiting.
4671 * @retval VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED if we were unable to get a
4672 * measurement.
4673 * @retval VERR_CPU_OFFLINE if the specified CPU is offline.
4674 *
4675 * @param pSession The caller's session. GIP must've been mapped.
4676 * @param iCpuSet The CPU set index of the CPU to measure.
4677 * @param fFlags Flags, SUP_TSCDELTA_MEASURE_F_XXX.
4678 * @param cMsWaitRetry Number of milliseconds to wait between each retry.
4679 * @param cMsWaitThread Number of milliseconds to wait for the thread to get
4680 * ready.
4681 * @param cTries Number of times to try, pass 0 for the default.
4682 */
4683SUPR0DECL(int) SUPR0TscDeltaMeasureBySetIndex(PSUPDRVSESSION pSession, uint32_t iCpuSet, uint32_t fFlags,
4684 RTMSINTERVAL cMsWaitRetry, RTMSINTERVAL cMsWaitThread, uint32_t cTries)
4685{
4686 PSUPDRVDEVEXT pDevExt;
4687 PSUPGLOBALINFOPAGE pGip;
4688 uint16_t iGipCpu;
4689 int rc;
4690#ifdef SUPDRV_USE_TSC_DELTA_THREAD
4691 uint64_t msTsStartWait;
4692 uint32_t iWaitLoop;
4693#endif
4694
4695 /*
4696 * Validate and adjust the input.
4697 */
4698 AssertReturn(SUP_IS_SESSION_VALID(pSession), VERR_INVALID_PARAMETER);
4699 if (!pSession->fGipReferenced)
4700 return VERR_WRONG_ORDER;
4701
4702 pDevExt = pSession->pDevExt;
4703 AssertReturn(SUP_IS_DEVEXT_VALID(pDevExt), VERR_INVALID_PARAMETER);
4704
4705 pGip = pDevExt->pGip;
4706 AssertPtrReturn(pGip, VERR_INTERNAL_ERROR_2);
4707
4708 AssertReturn(iCpuSet < RTCPUSET_MAX_CPUS, VERR_INVALID_CPU_INDEX);
4709 AssertReturn(iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx), VERR_INVALID_CPU_INDEX);
4710 iGipCpu = pGip->aiCpuFromCpuSetIdx[iCpuSet];
4711 AssertReturn(iGipCpu < pGip->cCpus, VERR_INVALID_CPU_INDEX);
4712
4713 if (fFlags & ~SUP_TSCDELTA_MEASURE_F_VALID_MASK)
4714 return VERR_INVALID_FLAGS;
4715
4716 /*
4717 * The request is a noop if the TSC delta isn't being used.
4718 */
4719 if (pGip->enmUseTscDelta <= SUPGIPUSETSCDELTA_ZERO_CLAIMED)
4720 return VINF_SUCCESS;
4721
4722 if (cTries == 0)
4723 cTries = 12;
4724 else if (cTries > 256)
4725 cTries = 256;
4726
4727 if (cMsWaitRetry == 0)
4728 cMsWaitRetry = 2;
4729 else if (cMsWaitRetry > 1000)
4730 cMsWaitRetry = 1000;
4731
4732#ifdef SUPDRV_USE_TSC_DELTA_THREAD
4733 /*
4734 * Has the TSC already been measured and we're not forced to redo it?
4735 */
4736 if ( pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX
4737 && !(fFlags & SUP_TSCDELTA_MEASURE_F_FORCE))
4738 return VINF_SUCCESS;
4739
4740 /*
4741 * Asynchronous request? Forward it to the thread, no waiting.
4742 */
4743 if (fFlags & SUP_TSCDELTA_MEASURE_F_ASYNC)
4744 {
4745 /** @todo Async. doesn't implement options like retries, waiting. We'll need
4746 * to pass those options to the thread somehow and implement it in the
4747 * thread. Check if anyone uses/needs fAsync before implementing this. */
4748 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4749 RTCpuSetAddByIndex(&pDevExt->TscDeltaCpuSet, iCpuSet);
4750 if ( pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Listening
4751 || pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Measuring)
4752 {
4753 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_WaitAndMeasure;
4754 rc = VINF_SUCCESS;
4755 }
4756 else if (pDevExt->enmTscDeltaThreadState != kTscDeltaThreadState_WaitAndMeasure)
4757 rc = VERR_THREAD_IS_DEAD;
4758 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4759 RTThreadUserSignal(pDevExt->hTscDeltaThread);
4760 return VINF_SUCCESS;
4761 }
4762
4763 /*
4764 * If a TSC-delta measurement request is already being serviced by the thread,
4765 * wait 'cTries' times if a retry-timeout is provided, otherwise bail as busy.
4766 */
4767 msTsStartWait = RTTimeSystemMilliTS();
4768 for (iWaitLoop = 0;; iWaitLoop++)
4769 {
4770 uint64_t cMsElapsed;
4771 SUPDRVTSCDELTATHREADSTATE enmState;
4772 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4773 enmState = pDevExt->enmTscDeltaThreadState;
4774 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4775
4776 if (enmState == kTscDeltaThreadState_Measuring)
4777 { /* Must wait, the thread is busy. */ }
4778 else if (enmState == kTscDeltaThreadState_WaitAndMeasure)
4779 { /* Must wait, this state only says what will happen next. */ }
4780 else if (enmState == kTscDeltaThreadState_Terminating)
4781 { /* Must wait, this state only says what should happen next. */ }
4782 else
4783 break; /* All other states, the thread is either idly listening or dead. */
4784
4785 /* Wait or fail. */
4786 if (cMsWaitThread == 0)
4787 return VERR_SUPDRV_TSC_DELTA_MEASUREMENT_BUSY;
4788 cMsElapsed = RTTimeSystemMilliTS() - msTsStartWait;
4789 if (cMsElapsed >= cMsWaitThread)
4790 return VERR_SUPDRV_TSC_DELTA_MEASUREMENT_BUSY;
4791
4792 rc = RTThreadSleep(RT_MIN((RTMSINTERVAL)(cMsWaitThread - cMsElapsed), RT_MIN(iWaitLoop + 1, 10)));
4793 if (rc == VERR_INTERRUPTED)
4794 return rc;
4795 }
4796#endif /* SUPDRV_USE_TSC_DELTA_THREAD */
4797
4798 /*
4799 * Try measure the TSC delta the given number of times.
4800 */
4801 for (;;)
4802 {
4803 /* Unless we're forced to measure the delta, check whether it's done already. */
4804 if ( !(fFlags & SUP_TSCDELTA_MEASURE_F_FORCE)
4805 && pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX)
4806 {
4807 rc = VINF_SUCCESS;
4808 break;
4809 }
4810
4811 /* Measure it. */
4812 rc = supdrvTscMeasureDeltaOne(pDevExt, iGipCpu);
4813 if (rc != VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED)
4814 {
4815 Assert(pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX || RT_FAILURE_NP(rc));
4816 break;
4817 }
4818
4819 /* Retry? */
4820 if (cTries <= 1)
4821 break;
4822 cTries--;
4823
4824 /* Always delay between retries (be nice to the rest of the system
4825 and avoid the BSOD hounds). */
4826 rc = RTThreadSleep(cMsWaitRetry);
4827 if (rc == VERR_INTERRUPTED)
4828 break;
4829 }
4830
4831 return rc;
4832}
4833SUPR0_EXPORT_SYMBOL(SUPR0TscDeltaMeasureBySetIndex);
4834
4835
4836/**
4837 * Service a TSC-delta measurement request.
4838 *
4839 * @returns VBox status code.
4840 * @param pDevExt Pointer to the device instance data.
4841 * @param pSession The support driver session.
4842 * @param pReq Pointer to the TSC-delta measurement request.
4843 */
4844int VBOXCALL supdrvIOCtl_TscDeltaMeasure(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, PSUPTSCDELTAMEASURE pReq)
4845{
4846 uint32_t cTries;
4847 uint32_t iCpuSet;
4848 uint32_t fFlags;
4849 RTMSINTERVAL cMsWaitRetry;
4850 RT_NOREF1(pDevExt);
4851
4852 /*
4853 * Validate and adjust/resolve the input so they can be passed onto SUPR0TscDeltaMeasureBySetIndex.
4854 */
4855 AssertPtr(pDevExt); AssertPtr(pSession); AssertPtr(pReq); /* paranoia^2 */
4856
4857 if (pReq->u.In.idCpu == NIL_RTCPUID)
4858 return VERR_INVALID_CPU_ID;
4859 iCpuSet = RTMpCpuIdToSetIndex(pReq->u.In.idCpu);
4860 if (iCpuSet >= RTCPUSET_MAX_CPUS)
4861 return VERR_INVALID_CPU_ID;
4862
4863 cTries = pReq->u.In.cRetries == 0 ? 0 : (uint32_t)pReq->u.In.cRetries + 1;
4864
4865 cMsWaitRetry = RT_MAX(pReq->u.In.cMsWaitRetry, 5);
4866
4867 fFlags = 0;
4868 if (pReq->u.In.fAsync)
4869 fFlags |= SUP_TSCDELTA_MEASURE_F_ASYNC;
4870 if (pReq->u.In.fForce)
4871 fFlags |= SUP_TSCDELTA_MEASURE_F_FORCE;
4872
4873 return SUPR0TscDeltaMeasureBySetIndex(pSession, iCpuSet, fFlags, cMsWaitRetry,
4874 cTries == 0 ? 5 * RT_MS_1SEC : cMsWaitRetry * cTries /*cMsWaitThread*/,
4875 cTries);
4876}
4877
4878
4879/**
4880 * Reads TSC with delta applied.
4881 *
4882 * Will try to resolve delta value INT64_MAX before applying it. This is the
4883 * main purpose of this function, to handle the case where the delta needs to be
4884 * determined.
4885 *
4886 * @returns VBox status code.
4887 * @param pDevExt Pointer to the device instance data.
4888 * @param pSession The support driver session.
4889 * @param pReq Pointer to the TSC-read request.
4890 */
4891int VBOXCALL supdrvIOCtl_TscRead(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, PSUPTSCREAD pReq)
4892{
4893 PSUPGLOBALINFOPAGE pGip;
4894 int rc;
4895
4896 /*
4897 * Validate. We require the client to have mapped GIP (no asserting on
4898 * ring-3 preconditions).
4899 */
4900 AssertPtr(pDevExt); AssertPtr(pReq); AssertPtr(pSession); /* paranoia^2 */
4901 if (pSession->GipMapObjR3 == NIL_RTR0MEMOBJ)
4902 return VERR_WRONG_ORDER;
4903 pGip = pDevExt->pGip;
4904 AssertReturn(pGip, VERR_INTERNAL_ERROR_2);
4905
4906 /*
4907 * We're usually here because we need to apply delta, but we shouldn't be
4908 * upset if the GIP is some different mode.
4909 */
4910 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
4911 {
4912 uint32_t cTries = 0;
4913 for (;;)
4914 {
4915 /*
4916 * Start by gathering the data, using CLI for disabling preemption
4917 * while we do that.
4918 */
4919 RTCCUINTREG fEFlags = ASMIntDisableFlags();
4920 int iCpuSet = RTMpCpuIdToSetIndex(RTMpCpuId());
4921 int iGipCpu = 0; /* gcc maybe used uninitialized */
4922 if (RT_LIKELY( (unsigned)iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
4923 && (iGipCpu = pGip->aiCpuFromCpuSetIdx[iCpuSet]) < pGip->cCpus ))
4924 {
4925 int64_t i64Delta = pGip->aCPUs[iGipCpu].i64TSCDelta;
4926 pReq->u.Out.idApic = pGip->aCPUs[iGipCpu].idApic;
4927 pReq->u.Out.u64AdjustedTsc = ASMReadTSC();
4928 ASMSetFlags(fEFlags);
4929
4930 /*
4931 * If we're lucky we've got a delta, but no predictions here
4932 * as this I/O control is normally only used when the TSC delta
4933 * is set to INT64_MAX.
4934 */
4935 if (i64Delta != INT64_MAX)
4936 {
4937 pReq->u.Out.u64AdjustedTsc -= i64Delta;
4938 rc = VINF_SUCCESS;
4939 break;
4940 }
4941
4942 /* Give up after a few times. */
4943 if (cTries >= 4)
4944 {
4945 rc = VWRN_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED;
4946 break;
4947 }
4948
4949 /* Need to measure the delta an try again. */
4950 rc = supdrvTscMeasureDeltaOne(pDevExt, iGipCpu);
4951 Assert(pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX || RT_FAILURE_NP(rc));
4952 /** @todo should probably delay on failure... dpc watchdogs */
4953 }
4954 else
4955 {
4956 /* This really shouldn't happen. */
4957 AssertMsgFailed(("idCpu=%#x iCpuSet=%#x (%d)\n", RTMpCpuId(), iCpuSet, iCpuSet));
4958 pReq->u.Out.idApic = supdrvGipGetApicIdSlow();
4959 pReq->u.Out.u64AdjustedTsc = ASMReadTSC();
4960 ASMSetFlags(fEFlags);
4961 rc = VERR_INTERNAL_ERROR_5; /** @todo change to warning. */
4962 break;
4963 }
4964 }
4965 }
4966 else
4967 {
4968 /*
4969 * No delta to apply. Easy. Deal with preemption the lazy way.
4970 */
4971 RTCCUINTREG fEFlags = ASMIntDisableFlags();
4972 int iCpuSet = RTMpCpuIdToSetIndex(RTMpCpuId());
4973 int iGipCpu = 0; /* gcc may be used uninitialized */
4974 if (RT_LIKELY( (unsigned)iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
4975 && (iGipCpu = pGip->aiCpuFromCpuSetIdx[iCpuSet]) < pGip->cCpus ))
4976 pReq->u.Out.idApic = pGip->aCPUs[iGipCpu].idApic;
4977 else
4978 pReq->u.Out.idApic = supdrvGipGetApicIdSlow();
4979 pReq->u.Out.u64AdjustedTsc = ASMReadTSC();
4980 ASMSetFlags(fEFlags);
4981 rc = VINF_SUCCESS;
4982 }
4983
4984 return rc;
4985}
4986
4987
4988/**
4989 * Worker for supdrvIOCtl_GipSetFlags.
4990 *
4991 * @returns VBox status code.
4992 * @retval VERR_WRONG_ORDER if an enable-once-per-session flag is set again for
4993 * a session.
4994 *
4995 * @param pDevExt Pointer to the device instance data.
4996 * @param pSession The support driver session.
4997 * @param fOrMask The OR mask of the GIP flags, see SUPGIP_FLAGS_XXX.
4998 * @param fAndMask The AND mask of the GIP flags, see SUPGIP_FLAGS_XXX.
4999 *
5000 * @remarks Caller must own the GIP mutex.
5001 *
5002 * @remarks This function doesn't validate any of the flags.
5003 */
5004static int supdrvGipSetFlags(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, uint32_t fOrMask, uint32_t fAndMask)
5005{
5006 uint32_t cRefs;
5007 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
5008 AssertMsg((fOrMask & fAndMask) == fOrMask, ("%#x & %#x\n", fOrMask, fAndMask)); /* ASSUMED by code below */
5009
5010 /*
5011 * Compute GIP test-mode flags.
5012 */
5013 if (fOrMask & SUPGIP_FLAGS_TESTING_ENABLE)
5014 {
5015 if (!pSession->fGipTestMode)
5016 {
5017 Assert(pDevExt->cGipTestModeRefs < _64K);
5018 pSession->fGipTestMode = true;
5019 cRefs = ++pDevExt->cGipTestModeRefs;
5020 if (cRefs == 1)
5021 {
5022 fOrMask |= SUPGIP_FLAGS_TESTING | SUPGIP_FLAGS_TESTING_START;
5023 fAndMask &= ~SUPGIP_FLAGS_TESTING_STOP;
5024 }
5025 }
5026 else
5027 {
5028 LogRelMax(10, ("supdrvGipSetFlags: SUPGIP_FLAGS_TESTING_ENABLE already set for this session\n"));
5029 return VERR_WRONG_ORDER;
5030 }
5031 }
5032 else if ( !(fAndMask & SUPGIP_FLAGS_TESTING_ENABLE)
5033 && pSession->fGipTestMode)
5034 {
5035 Assert(pDevExt->cGipTestModeRefs > 0);
5036 Assert(pDevExt->cGipTestModeRefs < _64K);
5037 pSession->fGipTestMode = false;
5038 cRefs = --pDevExt->cGipTestModeRefs;
5039 if (!cRefs)
5040 fOrMask |= SUPGIP_FLAGS_TESTING_STOP;
5041 else
5042 fAndMask |= SUPGIP_FLAGS_TESTING_ENABLE;
5043 }
5044
5045 /*
5046 * Commit the flags. This should be done as atomically as possible
5047 * since the flag consumers won't be holding the GIP mutex.
5048 */
5049 ASMAtomicOrU32(&pGip->fFlags, fOrMask);
5050 ASMAtomicAndU32(&pGip->fFlags, fAndMask);
5051
5052 return VINF_SUCCESS;
5053}
5054
5055
5056/**
5057 * Sets GIP test mode parameters.
5058 *
5059 * @returns VBox status code.
5060 * @param pDevExt Pointer to the device instance data.
5061 * @param pSession The support driver session.
5062 * @param fOrMask The OR mask of the GIP flags, see SUPGIP_FLAGS_XXX.
5063 * @param fAndMask The AND mask of the GIP flags, see SUPGIP_FLAGS_XXX.
5064 */
5065int VBOXCALL supdrvIOCtl_GipSetFlags(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, uint32_t fOrMask, uint32_t fAndMask)
5066{
5067 PSUPGLOBALINFOPAGE pGip;
5068 int rc;
5069
5070 /*
5071 * Validate. We require the client to have mapped GIP (no asserting on
5072 * ring-3 preconditions).
5073 */
5074 AssertPtr(pDevExt); AssertPtr(pSession); /* paranoia^2 */
5075 if (pSession->GipMapObjR3 == NIL_RTR0MEMOBJ)
5076 return VERR_WRONG_ORDER;
5077 pGip = pDevExt->pGip;
5078 AssertReturn(pGip, VERR_INTERNAL_ERROR_3);
5079
5080 if (fOrMask & ~SUPGIP_FLAGS_VALID_MASK)
5081 return VERR_INVALID_PARAMETER;
5082 if ((fAndMask & ~SUPGIP_FLAGS_VALID_MASK) != ~SUPGIP_FLAGS_VALID_MASK)
5083 return VERR_INVALID_PARAMETER;
5084
5085 /*
5086 * Don't confuse supdrvGipSetFlags or anyone else by both setting
5087 * and clearing the same flags. AND takes precedence.
5088 */
5089 fOrMask &= fAndMask;
5090
5091 /*
5092 * Take the loader lock to avoid having to think about races between two
5093 * clients changing the flags at the same time (state is not simple).
5094 */
5095#ifdef SUPDRV_USE_MUTEX_FOR_GIP
5096 RTSemMutexRequest(pDevExt->mtxGip, RT_INDEFINITE_WAIT);
5097#else
5098 RTSemFastMutexRequest(pDevExt->mtxGip);
5099#endif
5100
5101 rc = supdrvGipSetFlags(pDevExt, pSession, fOrMask, fAndMask);
5102
5103#ifdef SUPDRV_USE_MUTEX_FOR_GIP
5104 RTSemMutexRelease(pDevExt->mtxGip);
5105#else
5106 RTSemFastMutexRelease(pDevExt->mtxGip);
5107#endif
5108 return rc;
5109}
5110
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette