VirtualBox

source: vbox/trunk/src/VBox/HostDrivers/Support/SUPDrvGip.cpp@ 81605

Last change on this file since 81605 was 81605, checked in by vboxsync, 5 years ago

VMM (and related changes): Add support for Hygon Dhyana CPUs. Modified and improved contribution by Hongyong Zang submitted under MIT license. Thank you!

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 185.9 KB
Line 
1/* $Id: SUPDrvGip.cpp 81605 2019-10-31 14:29:46Z vboxsync $ */
2/** @file
3 * VBoxDrv - The VirtualBox Support Driver - Common code for GIP.
4 */
5
6/*
7 * Copyright (C) 2006-2019 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*********************************************************************************************************************************
29* Header Files *
30*********************************************************************************************************************************/
31#define LOG_GROUP LOG_GROUP_SUP_DRV
32#define SUPDRV_AGNOSTIC
33#include "SUPDrvInternal.h"
34#ifndef PAGE_SHIFT
35# include <iprt/param.h>
36#endif
37#include <iprt/asm.h>
38#include <iprt/asm-amd64-x86.h>
39#include <iprt/asm-math.h>
40#include <iprt/cpuset.h>
41#include <iprt/handletable.h>
42#include <iprt/mem.h>
43#include <iprt/mp.h>
44#include <iprt/power.h>
45#include <iprt/process.h>
46#include <iprt/semaphore.h>
47#include <iprt/spinlock.h>
48#include <iprt/thread.h>
49#include <iprt/uuid.h>
50#include <iprt/net.h>
51#include <iprt/crc.h>
52#include <iprt/string.h>
53#include <iprt/timer.h>
54#if defined(RT_OS_DARWIN) || defined(RT_OS_SOLARIS) || defined(RT_OS_FREEBSD)
55# include <iprt/rand.h>
56# include <iprt/path.h>
57#endif
58#include <iprt/uint128.h>
59#include <iprt/x86.h>
60
61#include <VBox/param.h>
62#include <VBox/log.h>
63#include <VBox/err.h>
64
65#if defined(RT_OS_SOLARIS) || defined(RT_OS_DARWIN)
66# include "dtrace/SUPDrv.h"
67#else
68/* ... */
69#endif
70
71
72/*********************************************************************************************************************************
73* Defined Constants And Macros *
74*********************************************************************************************************************************/
75/** The frequency by which we recalculate the u32UpdateHz and
76 * u32UpdateIntervalNS GIP members. The value must be a power of 2.
 * (SUPR0GipMap relies on this when it rounds the transaction IDs up using
 * GIP_UPDATEHZ_RECALC_FREQ * 2 - 1 as a bit mask.)
77 *
78 * Warning: Bumping this too high might overflow u32UpdateIntervalNS.
79 */
80#define GIP_UPDATEHZ_RECALC_FREQ 0x800
81
82/** A reserved TSC value used for synchronization as well as measurement of
83 * TSC deltas. */
84#define GIP_TSC_DELTA_RSVD UINT64_MAX
85/** The number of TSC delta measurement loops in total (includes primer and
86 * read-time loops). */
87#define GIP_TSC_DELTA_LOOPS 96
88/** The number of cache primer loops. */
89#define GIP_TSC_DELTA_PRIMER_LOOPS 4
90/** The number of loops until we keep computing the minimum read time. */
91#define GIP_TSC_DELTA_READ_TIME_LOOPS 24
92
93/** The TSC frequency refinement period in seconds.
94 * The timer fires after 200ms, then every second, this value just says when
95 * to stop it after that. */
96#define GIP_TSC_REFINE_PERIOD_IN_SECS 12
97/** The TSC-delta threshold for the SUPGIPUSETSCDELTA_PRACTICALLY_ZERO rating */
98#define GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO 32
99/** The TSC-delta threshold for the SUPGIPUSETSCDELTA_ROUGHLY_ZERO rating */
100#define GIP_TSC_DELTA_THRESHOLD_ROUGHLY_ZERO 448
101/** The TSC delta value for the initial GIP master - 0 in regular builds.
102 * To test the delta code this can be set to a non-zero value. */
103#if 0
104# define GIP_TSC_DELTA_INITIAL_MASTER_VALUE INT64_C(170139095182512) /* 0x00009abd9854acb0 */
105#else
106# define GIP_TSC_DELTA_INITIAL_MASTER_VALUE INT64_C(0)
107#endif
108
/* Compile-time sanity: the primer loops must be fewer than the read-time loops,
 * and the two together must be fewer than the total number of loops. */
109AssertCompile(GIP_TSC_DELTA_PRIMER_LOOPS < GIP_TSC_DELTA_READ_TIME_LOOPS);
110AssertCompile(GIP_TSC_DELTA_PRIMER_LOOPS + GIP_TSC_DELTA_READ_TIME_LOOPS < GIP_TSC_DELTA_LOOPS);
111
112/** @def VBOX_SVN_REV
113 * The makefile should define this if it can. Falls back to 0 otherwise. */
114#ifndef VBOX_SVN_REV
115# define VBOX_SVN_REV 0
116#endif
117
118#if 0 /* Don't start the GIP timers. Useful when debugging the IPRT timer code. */
119# define DO_NOT_START_GIP
120#endif
121
122
123/*********************************************************************************************************************************
124* Internal Functions *
125*********************************************************************************************************************************/
/* Forward declarations of file-local functions.  Which set of TSC-delta
 * functions is declared depends on whether SUPDRV_USE_TSC_DELTA_THREAD is
 * defined. */
126static DECLCALLBACK(void) supdrvGipSyncAndInvariantTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick);
127static DECLCALLBACK(void) supdrvGipAsyncTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick);
128static int supdrvGipSetFlags(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, uint32_t fOrMask, uint32_t fAndMask);
129static void supdrvGipInitCpu(PSUPGLOBALINFOPAGE pGip, PSUPGIPCPU pCpu, uint64_t u64NanoTS, uint64_t uCpuHz);
130static void supdrvTscResetSamples(PSUPDRVDEVEXT pDevExt, bool fClearDeltas);
131#ifdef SUPDRV_USE_TSC_DELTA_THREAD
132static int supdrvTscDeltaThreadInit(PSUPDRVDEVEXT pDevExt);
133static void supdrvTscDeltaTerm(PSUPDRVDEVEXT pDevExt);
134static void supdrvTscDeltaThreadStartMeasurement(PSUPDRVDEVEXT pDevExt, bool fForceAll);
135#else
136static int supdrvTscMeasureInitialDeltas(PSUPDRVDEVEXT pDevExt);
137static int supdrvTscMeasureDeltaOne(PSUPDRVDEVEXT pDevExt, uint32_t idxWorker);
138#endif
139
140
141/*********************************************************************************************************************************
142* Global Variables *
143*********************************************************************************************************************************/
/** Exported pointer to the global info page (GIP).  Initialized to NULL here;
 * SUPGetGIP() returns this value as-is. */
144DECLEXPORT(PSUPGLOBALINFOPAGE) g_pSUPGlobalInfoPage = NULL;
145
146
147
148/*
149 *
150 * Misc Common GIP Code
151 * Misc Common GIP Code
152 * Misc Common GIP Code
153 *
154 *
155 */
156
157
158/**
159 * Finds the GIP CPU index corresponding to @a idCpu.
160 *
161 * @returns GIP CPU array index, UINT32_MAX if not found.
162 * @param pGip The GIP.
163 * @param idCpu The CPU ID.
164 */
165static uint32_t supdrvGipFindCpuIndexForCpuId(PSUPGLOBALINFOPAGE pGip, RTCPUID idCpu)
166{
167 uint32_t i;
168 for (i = 0; i < pGip->cCpus; i++)
169 if (pGip->aCPUs[i].idCpu == idCpu)
170 return i;
171 return UINT32_MAX;
172}
173
174
175/**
176 * Gets the APIC ID using the best available method.
177 *
178 * @returns APIC ID.
179 * @param pGip The GIP, for SUPGIPGETCPU_XXX.
180 */
181DECLINLINE(uint32_t) supdrvGipGetApicId(PSUPGLOBALINFOPAGE pGip)
182{
183 if (pGip->fGetGipCpu & SUPGIPGETCPU_APIC_ID_EXT_0B)
184 return ASMGetApicIdExt0B();
185 if (pGip->fGetGipCpu & SUPGIPGETCPU_APIC_ID_EXT_8000001E)
186 return ASMGetApicIdExt8000001E();
187 return ASMGetApicId();
188}
189
190
191/**
192 * Gets the APIC ID using the best available method, slow version.
193 */
194static uint32_t supdrvGipGetApicIdSlow(void)
195{
196 uint32_t const idApic = ASMGetApicId();
197
198 /* The Intel CPU topology leaf: */
199 uint32_t uOther = ASMCpuId_EAX(0);
200 if (uOther >= UINT32_C(0xb) && ASMIsValidStdRange(uOther))
201 {
202 uOther = ASMGetApicIdExt0B();
203 if ((uOther & 0xff) == idApic)
204 return uOther;
205 AssertMsgFailed(("ASMGetApicIdExt0B=>%#x idApic=%#x\n", uOther, idApic));
206 }
207
208 /* The AMD leaf: */
209 uOther = ASMCpuId_EAX(UINT32_C(0x80000000));
210 if (uOther >= UINT32_C(0x8000001e) && ASMIsValidExtRange(uOther))
211 {
212 uOther = ASMGetApicIdExt8000001E();
213 if ((uOther & 0xff) == idApic)
214 return uOther;
215 AssertMsgFailed(("ASMGetApicIdExt8000001E=>%#x idApic=%#x\n", uOther, idApic));
216 }
217 return idApic;
218}
219
220
221/*
222 *
223 * GIP Mapping and Unmapping Related Code.
224 * GIP Mapping and Unmapping Related Code.
225 * GIP Mapping and Unmapping Related Code.
226 *
227 *
228 */
229
230
231/**
232 * (Re-)initializes the per-cpu structure prior to starting or resuming the GIP
233 * updating.
234 *
235 * @param pGipCpu The per CPU structure for this CPU.
236 * @param u64NanoTS The current time.
237 */
238static void supdrvGipReInitCpu(PSUPGIPCPU pGipCpu, uint64_t u64NanoTS)
239{
240 /*
241 * Here we don't really care about applying the TSC delta. The re-initialization of this
242 * value is not relevant especially while (re)starting the GIP as the first few ones will
243 * be ignored anyway, see supdrvGipDoUpdateCpu().
244 */
245 pGipCpu->u64TSC = ASMReadTSC() - pGipCpu->u32UpdateIntervalTSC;
246 pGipCpu->u64NanoTS = u64NanoTS;
247}
248
249
250/**
251 * Set the current TSC and NanoTS value for the CPU.
252 *
253 * @param idCpu The CPU ID. Unused - we have to use the APIC ID.
254 * @param pvUser1 Pointer to the ring-0 GIP mapping.
255 * @param pvUser2 Pointer to the variable holding the current time.
256 */
257static DECLCALLBACK(void) supdrvGipReInitCpuCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
258{
259 PSUPGLOBALINFOPAGE pGip = (PSUPGLOBALINFOPAGE)pvUser1;
260 uint32_t const idApic = supdrvGipGetApicId(pGip);
261 if (idApic < RT_ELEMENTS(pGip->aiCpuFromApicId))
262 {
263 unsigned const iCpu = pGip->aiCpuFromApicId[idApic];
264
265 if (RT_LIKELY(iCpu < pGip->cCpus && pGip->aCPUs[iCpu].idCpu == idCpu))
266 supdrvGipReInitCpu(&pGip->aCPUs[iCpu], *(uint64_t *)pvUser2);
267 else
268 LogRelMax(64, ("supdrvGipReInitCpuCallback: iCpu=%#x out of bounds (%#zx, idApic=%#x)\n",
269 iCpu, RT_ELEMENTS(pGip->aiCpuFromApicId), idApic));
270 }
271 else
272 LogRelMax(64, ("supdrvGipReInitCpuCallback: idApic=%#x out of bounds (%#zx)\n",
273 idApic, RT_ELEMENTS(pGip->aiCpuFromApicId)));
274
275 NOREF(pvUser2);
276}
277
278
279/**
280 * State structure for supdrvGipDetectGetGipCpuCallback.
281 */
282typedef struct SUPDRVGIPDETECTGETCPU
283{
284 /** Bitmap of APIC IDs that has been seen (initialized to zero).
285 * Used to detect duplicate APIC IDs (paranoia). */
286 uint8_t volatile bmApicId[4096 / 8];
287 /** Mask of supported GIP CPU getter methods (SUPGIPGETCPU_XXX) (all bits set
288 * initially). The callback clears the methods not detected. */
289 uint32_t volatile fSupported;
290 /** The first callback detecting any kind of range issues (initialized to
291 * NIL_RTCPUID). */
292 RTCPUID volatile idCpuProblem;
293} SUPDRVGIPDETECTGETCPU;
294/** Pointer to state structure for supdrvGipDetectGetGipCpuCallback. */
295typedef SUPDRVGIPDETECTGETCPU *PSUPDRVGIPDETECTGETCPU;
296
297
298/**
299 * Checks for alternative ways of getting the CPU ID.
300 *
301 * This also checks the APIC ID, CPU ID and CPU set index values against the
302 * GIP tables.
 *
 * Each invocation collects the SUPGIPGETCPU_XXX methods it could verify on the
 * calling CPU in a local mask and ANDs that mask into pState->fSupported, so
 * only methods verified on every CPU that ran the callback remain set.  The
 * first CPU detecting an ID problem records itself in pState->idCpuProblem.
303 *
304 * @param idCpu The CPU ID of the calling CPU (asserted to equal RTMpCpuId()).
305 * @param pvUser1 Pointer to the state structure.
306 * @param pvUser2 Pointer to the GIP.
307 */
308static DECLCALLBACK(void) supdrvGipDetectGetGipCpuCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
309{
310 PSUPDRVGIPDETECTGETCPU pState = (PSUPDRVGIPDETECTGETCPU)pvUser1;
311 PSUPGLOBALINFOPAGE pGip = (PSUPGLOBALINFOPAGE)pvUser2;
312 uint32_t fSupported = 0; /* Methods verified on this CPU. */
313 uint32_t idApic;
314 uint32_t uEax, uEbx, uEcx, uEdx;
315 int iCpuSet;
316 NOREF(pGip);
317
318 AssertMsg(idCpu == RTMpCpuId(), ("idCpu=%#x RTMpCpuId()=%#x\n", idCpu, RTMpCpuId())); /* paranoia^3 */
319
320 /*
321 * Check that the CPU ID and CPU set index are interchangeable.
 * The IDTR and RDTSCP based methods below are only probed when they are.
322 */
323 iCpuSet = RTMpCpuIdToSetIndex(idCpu);
324 if ((RTCPUID)iCpuSet == idCpu)
325 {
326 AssertCompile(RT_IS_POWER_OF_TWO(RTCPUSET_MAX_CPUS));
327 if ( iCpuSet >= 0
328 && iCpuSet < RTCPUSET_MAX_CPUS
329 && RT_IS_POWER_OF_TWO(RTCPUSET_MAX_CPUS))
330 {
331 PSUPGIPCPU pGipCpu = SUPGetGipCpuBySetIndex(pGip, iCpuSet);
332
333 /*
334 * Check whether the IDTR.LIMIT contains a CPU number.
 * The limit minus cbIdt, masked by RTCPUSET_MAX_CPUS - 1, must equal
 * idCpu, and a second IDTR read must return the same limit.
335 */
336#ifdef RT_ARCH_X86
337 uint16_t const cbIdt = sizeof(X86DESC64SYSTEM) * 256;
338#else
339 uint16_t const cbIdt = sizeof(X86DESCGATE) * 256;
340#endif
341 RTIDTR Idtr;
342 ASMGetIDTR(&Idtr);
343 if (Idtr.cbIdt >= cbIdt)
344 {
345 uint32_t uTmp = Idtr.cbIdt - cbIdt;
346 uTmp &= RTCPUSET_MAX_CPUS - 1;
347 if (uTmp == idCpu)
348 {
349 RTIDTR Idtr2;
350 ASMGetIDTR(&Idtr2);
351 if (Idtr2.cbIdt == Idtr.cbIdt)
352 fSupported |= SUPGIPGETCPU_IDTR_LIMIT_MASK_MAX_SET_CPUS;
353 }
354 }
355
356 /*
357 * Check whether RDTSCP is an option.
 * Two encodings of the auxiliary value are probed: the CPU number in
 * the low bits, and group member in bits 0-7 with the group number
 * from bit 8 up (compared over the low 16 bits).  Each must be
 * observed on two reads.
358 */
359 if (ASMHasCpuId())
360 {
361 if ( ASMIsValidExtRange(ASMCpuId_EAX(UINT32_C(0x80000000)))
362 && (ASMCpuId_EDX(UINT32_C(0x80000001)) & X86_CPUID_EXT_FEATURE_EDX_RDTSCP) )
363 {
364 uint32_t uAux;
365 ASMReadTscWithAux(&uAux);
366 if ((uAux & (RTCPUSET_MAX_CPUS - 1)) == idCpu)
367 {
368 ASMNopPause();
369 ASMReadTscWithAux(&uAux);
370 if ((uAux & (RTCPUSET_MAX_CPUS - 1)) == idCpu)
371 fSupported |= SUPGIPGETCPU_RDTSCP_MASK_MAX_SET_CPUS;
372 }
373
374 if (pGipCpu)
375 {
376 uint32_t const uGroupedAux = (uint8_t)pGipCpu->iCpuGroupMember | ((uint32_t)pGipCpu->iCpuGroup << 8);
377 if ( (uAux & UINT16_MAX) == uGroupedAux
378 && pGipCpu->iCpuGroupMember <= UINT8_MAX)
379 {
380 ASMNopPause();
381 ASMReadTscWithAux(&uAux);
382 if ((uAux & UINT16_MAX) == uGroupedAux)
383 fSupported |= SUPGIPGETCPU_RDTSCP_GROUP_IN_CH_NUMBER_IN_CL;
384 }
385 }
386 }
387 }
388 }
389 }
390
391 /*
392 * Check for extended APIC ID methods.
 * idApic stays UINT32_MAX until one of the methods yields an ID.  First
 * CPUID leaf 0xb (ID taken from EDX), then leaf 0x8000001e (ID taken
 * from EAX); an all-zero leaf is treated as not present.
393 */
394 idApic = UINT32_MAX;
395 uEax = ASMCpuId_EAX(0);
396 if (uEax >= UINT32_C(0xb) && ASMIsValidStdRange(uEax))
397 {
398#if defined(RT_OS_LINUX) || defined(RT_OS_FREEBSD)
399 ASMCpuId_Idx_ECX(0xb, 0, &uEax, &uEbx, &uEcx, &uEdx);
400#else
401 ASMCpuIdExSlow(0xb, 0, 0, 0, &uEax, &uEbx, &uEcx, &uEdx);
402#endif
403 if (uEax || uEbx || uEcx || uEdx)
404 {
405 if (RT_LIKELY( uEdx < RT_ELEMENTS(pGip->aiCpuFromApicId)
406 && !ASMBitTest(pState->bmApicId, uEdx)))
407 {
408 if (uEdx == ASMGetApicIdExt0B())
409 {
410 idApic = uEdx;
411 fSupported |= SUPGIPGETCPU_APIC_ID_EXT_0B;
412 }
413 else
414 AssertMsgFailed(("%#x vs %#x\n", uEdx, ASMGetApicIdExt0B()));
415 }
416 }
417 }
418
419 uEax = ASMCpuId_EAX(UINT32_C(0x80000000));
420 if (uEax >= UINT32_C(0x8000001e) && ASMIsValidExtRange(uEax))
421 {
422#if defined(RT_OS_LINUX) || defined(RT_OS_FREEBSD)
423 ASMCpuId_Idx_ECX(UINT32_C(0x8000001e), 0, &uEax, &uEbx, &uEcx, &uEdx);
424#else
425 ASMCpuIdExSlow(UINT32_C(0x8000001e), 0, 0, 0, &uEax, &uEbx, &uEcx, &uEdx);
426#endif
427 if (uEax || uEbx || uEcx || uEdx)
428 {
429 if (RT_LIKELY( uEax < RT_ELEMENTS(pGip->aiCpuFromApicId)
430 && ( idApic == UINT32_MAX
431 || idApic == uEax)
432 && !ASMBitTest(pState->bmApicId, uEax)))
433 {
434 if (uEax == ASMGetApicIdExt8000001E())
435 {
436 idApic = uEax;
437 fSupported |= SUPGIPGETCPU_APIC_ID_EXT_8000001E;
438 }
439 else
440 AssertMsgFailed(("%#x vs %#x\n", uEax, ASMGetApicIdExt8000001E()));
441 }
442 }
443 }
444
445 /*
446 * Check that the APIC ID is unique.
 * The plain APIC ID must be in range, agree with any extended ID found
 * above and not be marked in the bitmap yet; it is marked here.  If it
 * does not qualify, the extended ID is marked instead.  Note that the
 * problem branch below is also taken when no usable ID exists or the
 * ID is out of range, although the log text only mentions duplicates.
447 */
448 uEax = ASMGetApicId();
449 if (RT_LIKELY( uEax < RT_ELEMENTS(pGip->aiCpuFromApicId)
450 && ( idApic == UINT32_MAX
451 || idApic == uEax)
452 && !ASMAtomicBitTestAndSet(pState->bmApicId, uEax)))
453 {
454 idApic = uEax;
455 fSupported |= SUPGIPGETCPU_APIC_ID;
456 }
457 else if ( idApic == UINT32_MAX
458 || idApic >= RT_ELEMENTS(pGip->aiCpuFromApicId) /* paranoia */
459 || ASMAtomicBitTestAndSet(pState->bmApicId, idApic))
460 {
461 AssertCompile(sizeof(pState->bmApicId) * 8 == RT_ELEMENTS(pGip->aiCpuFromApicId));
462 ASMAtomicCmpXchgU32(&pState->idCpuProblem, idCpu, NIL_RTCPUID);
463 LogRel(("supdrvGipDetectGetGipCpuCallback: idCpu=%#x iCpuSet=%d idApic=%#x/%#x - duplicate APIC ID.\n",
464 idCpu, iCpuSet, uEax, idApic));
465 }
466
467 /*
468 * Check that the iCpuSet is within the expected range.
 * If it is, also check that converting it back yields idCpu again.
469 */
470 if (RT_UNLIKELY( iCpuSet < 0
471 || (unsigned)iCpuSet >= RTCPUSET_MAX_CPUS
472 || (unsigned)iCpuSet >= RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)))
473 {
474 ASMAtomicCmpXchgU32(&pState->idCpuProblem, idCpu, NIL_RTCPUID);
475 LogRel(("supdrvGipDetectGetGipCpuCallback: idCpu=%#x iCpuSet=%d idApic=%#x - CPU set index is out of range.\n",
476 idCpu, iCpuSet, idApic));
477 }
478 else
479 {
480 RTCPUID idCpu2 = RTMpCpuIdFromSetIndex(iCpuSet);
481 if (RT_UNLIKELY(idCpu2 != idCpu))
482 {
483 ASMAtomicCmpXchgU32(&pState->idCpuProblem, idCpu, NIL_RTCPUID);
484 LogRel(("supdrvGipDetectGetGipCpuCallback: idCpu=%#x iCpuSet=%d idApic=%#x - CPU id/index roundtrip problem: %#x\n",
485 idCpu, iCpuSet, idApic, idCpu2));
486 }
487 }
488
489 /*
490 * Update the supported feature mask before we return.
 * ANDing clears every method this CPU could not verify.
491 */
492 ASMAtomicAndU32(&pState->fSupported, fSupported);
493
494 NOREF(pvUser2);
495}
496
497
498/**
499 * Increase the timer freqency on hosts where this is possible (NT).
500 *
501 * The idea is that more interrupts is better for us... Also, it's better than
502 * we increase the timer frequence, because we might end up getting inaccurate
503 * callbacks if someone else does it.
504 *
505 * @param pDevExt Sets u32SystemTimerGranularityGrant if increased.
506 */
507static void supdrvGipRequestHigherTimerFrequencyFromSystem(PSUPDRVDEVEXT pDevExt)
508{
509 if (pDevExt->u32SystemTimerGranularityGrant == 0)
510 {
511 uint32_t u32SystemResolution;
512 if ( RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 976563 /* 1024 HZ */, &u32SystemResolution))
513 || RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 1000000 /* 1000 HZ */, &u32SystemResolution))
514 || RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 1953125 /* 512 HZ */, &u32SystemResolution))
515 || RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 2000000 /* 500 HZ */, &u32SystemResolution))
516 )
517 {
518#if 0 /* def VBOX_STRICT - this is somehow triggers bogus assertions on windows 10 */
519 uint32_t u32After = RTTimerGetSystemGranularity();
520 AssertMsg(u32After <= u32SystemResolution, ("u32After=%u u32SystemResolution=%u\n", u32After, u32SystemResolution));
521#endif
522 pDevExt->u32SystemTimerGranularityGrant = u32SystemResolution;
523 }
524 }
525}
526
527
528/**
529 * Undoes supdrvGipRequestHigherTimerFrequencyFromSystem.
530 *
531 * @param pDevExt Clears u32SystemTimerGranularityGrant.
532 */
533static void supdrvGipReleaseHigherTimerFrequencyFromSystem(PSUPDRVDEVEXT pDevExt)
534{
535 if (pDevExt->u32SystemTimerGranularityGrant)
536 {
537 int rc2 = RTTimerReleaseSystemGranularity(pDevExt->u32SystemTimerGranularityGrant);
538 AssertRC(rc2);
539 pDevExt->u32SystemTimerGranularityGrant = 0;
540 }
541}
542
543
544/**
545 * Maps the GIP into userspace and/or get the physical address of the GIP.
546 *
547 * @returns IPRT status code.
548 * @param pSession Session to which the GIP mapping should belong.
549 * @param ppGipR3 Where to store the address of the ring-3 mapping. (optional)
550 * @param pHCPhysGip Where to store the physical address. (optional)
551 *
552 * @remark There is no reference counting on the mapping, so one call to this function
553 * count globally as one reference. One call to SUPR0GipUnmap() is will unmap GIP
554 * and remove the session as a GIP user.
555 */
556SUPR0DECL(int) SUPR0GipMap(PSUPDRVSESSION pSession, PRTR3PTR ppGipR3, PRTHCPHYS pHCPhysGip)
557{
558 int rc;
559 PSUPDRVDEVEXT pDevExt = pSession->pDevExt;
560 RTR3PTR pGipR3 = NIL_RTR3PTR;
561 RTHCPHYS HCPhys = NIL_RTHCPHYS;
562 LogFlow(("SUPR0GipMap: pSession=%p ppGipR3=%p pHCPhysGip=%p\n", pSession, ppGipR3, pHCPhysGip));
563
564 /*
565 * Validate
566 */
567 AssertReturn(SUP_IS_SESSION_VALID(pSession), VERR_INVALID_PARAMETER);
568 AssertPtrNullReturn(ppGipR3, VERR_INVALID_POINTER);
569 AssertPtrNullReturn(pHCPhysGip, VERR_INVALID_POINTER);
570
571#ifdef SUPDRV_USE_MUTEX_FOR_GIP
572 RTSemMutexRequest(pDevExt->mtxGip, RT_INDEFINITE_WAIT);
573#else
574 RTSemFastMutexRequest(pDevExt->mtxGip);
575#endif
576 if (pDevExt->pGip)
577 {
578 /*
579 * Map it?
580 */
581 rc = VINF_SUCCESS;
582 if (ppGipR3)
583 {
584 if (pSession->GipMapObjR3 == NIL_RTR0MEMOBJ)
585 rc = RTR0MemObjMapUser(&pSession->GipMapObjR3, pDevExt->GipMemObj, (RTR3PTR)-1, 0,
586 RTMEM_PROT_READ, NIL_RTR0PROCESS);
587 if (RT_SUCCESS(rc))
588 pGipR3 = RTR0MemObjAddressR3(pSession->GipMapObjR3);
589 }
590
591 /*
592 * Get physical address.
593 */
594 if (pHCPhysGip && RT_SUCCESS(rc))
595 HCPhys = pDevExt->HCPhysGip;
596
597 /*
598 * Reference globally.
599 */
600 if (!pSession->fGipReferenced && RT_SUCCESS(rc))
601 {
602 pSession->fGipReferenced = 1;
603 pDevExt->cGipUsers++;
604 if (pDevExt->cGipUsers == 1)
605 {
606 PSUPGLOBALINFOPAGE pGipR0 = pDevExt->pGip;
607 uint64_t u64NanoTS;
608
609 /*
610 * GIP starts/resumes updating again. On windows we bump the
611 * host timer frequency to make sure we don't get stuck in guest
612 * mode and to get better timer (and possibly clock) accuracy.
613 */
614 LogFlow(("SUPR0GipMap: Resumes GIP updating\n"));
615
616 supdrvGipRequestHigherTimerFrequencyFromSystem(pDevExt);
617
618 /*
619 * document me
620 */
621 if (pGipR0->aCPUs[0].u32TransactionId != 2 /* not the first time */)
622 {
623 unsigned i;
624 for (i = 0; i < pGipR0->cCpus; i++)
625 ASMAtomicUoWriteU32(&pGipR0->aCPUs[i].u32TransactionId,
626 (pGipR0->aCPUs[i].u32TransactionId + GIP_UPDATEHZ_RECALC_FREQ * 2)
627 & ~(GIP_UPDATEHZ_RECALC_FREQ * 2 - 1));
628 ASMAtomicWriteU64(&pGipR0->u64NanoTSLastUpdateHz, 0);
629 }
630
631 /*
632 * document me
633 */
634 u64NanoTS = RTTimeSystemNanoTS() - pGipR0->u32UpdateIntervalNS;
635 if ( pGipR0->u32Mode == SUPGIPMODE_INVARIANT_TSC
636 || pGipR0->u32Mode == SUPGIPMODE_SYNC_TSC
637 || RTMpGetOnlineCount() == 1)
638 supdrvGipReInitCpu(&pGipR0->aCPUs[0], u64NanoTS);
639 else
640 RTMpOnAll(supdrvGipReInitCpuCallback, pGipR0, &u64NanoTS);
641
642 /*
643 * Detect alternative ways to figure the CPU ID in ring-3 and
644 * raw-mode context. Check the sanity of the APIC IDs, CPU IDs,
645 * and CPU set indexes while we're at it.
646 */
647 if (RT_SUCCESS(rc))
648 {
649 PSUPDRVGIPDETECTGETCPU pDetectState = (PSUPDRVGIPDETECTGETCPU)RTMemTmpAllocZ(sizeof(*pDetectState));
650 if (pDetectState)
651 {
652 pDetectState->fSupported = UINT32_MAX;
653 pDetectState->idCpuProblem = NIL_RTCPUID;
654 rc = RTMpOnAll(supdrvGipDetectGetGipCpuCallback, pDetectState, pGipR0);
655 if (pDetectState->idCpuProblem == NIL_RTCPUID)
656 {
657 if ( pDetectState->fSupported != UINT32_MAX
658 && pDetectState->fSupported != 0)
659 {
660 if (pGipR0->fGetGipCpu != pDetectState->fSupported)
661 {
662 pGipR0->fGetGipCpu = pDetectState->fSupported;
663 LogRel(("SUPR0GipMap: fGetGipCpu=%#x\n", pDetectState->fSupported));
664 }
665 }
666 else
667 {
668 LogRel(("SUPR0GipMap: No supported ways of getting the APIC ID or CPU number in ring-3! (%#x)\n",
669 pDetectState->fSupported));
670 rc = VERR_UNSUPPORTED_CPU;
671 }
672 }
673 else
674 {
675 LogRel(("SUPR0GipMap: APIC ID, CPU ID or CPU set index problem detected on CPU #%u (%#x)!\n",
676 pDetectState->idCpuProblem, pDetectState->idCpuProblem));
677 rc = VERR_INVALID_CPU_ID;
678 }
679 RTMemTmpFree(pDetectState);
680 }
681 else
682 rc = VERR_NO_TMP_MEMORY;
683 }
684
685 /*
686 * Start the GIP timer if all is well..
687 */
688 if (RT_SUCCESS(rc))
689 {
690#ifndef DO_NOT_START_GIP
691 rc = RTTimerStart(pDevExt->pGipTimer, 0 /* fire ASAP */); AssertRC(rc);
692#endif
693 rc = VINF_SUCCESS;
694 }
695
696 /*
697 * Bail out on error.
698 */
699 if (RT_FAILURE(rc))
700 {
701 LogRel(("SUPR0GipMap: failed rc=%Rrc\n", rc));
702 pDevExt->cGipUsers = 0;
703 pSession->fGipReferenced = 0;
704 if (pSession->GipMapObjR3 != NIL_RTR0MEMOBJ)
705 {
706 int rc2 = RTR0MemObjFree(pSession->GipMapObjR3, false); AssertRC(rc2);
707 if (RT_SUCCESS(rc2))
708 pSession->GipMapObjR3 = NIL_RTR0MEMOBJ;
709 }
710 HCPhys = NIL_RTHCPHYS;
711 pGipR3 = NIL_RTR3PTR;
712 }
713 }
714 }
715 }
716 else
717 {
718 rc = VERR_GENERAL_FAILURE;
719 Log(("SUPR0GipMap: GIP is not available!\n"));
720 }
721#ifdef SUPDRV_USE_MUTEX_FOR_GIP
722 RTSemMutexRelease(pDevExt->mtxGip);
723#else
724 RTSemFastMutexRelease(pDevExt->mtxGip);
725#endif
726
727 /*
728 * Write returns.
729 */
730 if (pHCPhysGip)
731 *pHCPhysGip = HCPhys;
732 if (ppGipR3)
733 *ppGipR3 = pGipR3;
734
735#ifdef DEBUG_DARWIN_GIP
736 OSDBGPRINT(("SUPR0GipMap: returns %d *pHCPhysGip=%lx pGipR3=%p\n", rc, (unsigned long)HCPhys, (void *)pGipR3));
737#else
738 LogFlow(( "SUPR0GipMap: returns %d *pHCPhysGip=%lx pGipR3=%p\n", rc, (unsigned long)HCPhys, (void *)pGipR3));
739#endif
740 return rc;
741}
742
743
744/**
745 * Unmaps any user mapping of the GIP and terminates all GIP access
746 * from this session.
747 *
748 * @returns IPRT status code.
749 * @param pSession Session to which the GIP mapping should belong.
750 */
751SUPR0DECL(int) SUPR0GipUnmap(PSUPDRVSESSION pSession)
752{
753 int rc = VINF_SUCCESS;
754 PSUPDRVDEVEXT pDevExt = pSession->pDevExt;
755#ifdef DEBUG_DARWIN_GIP
756 OSDBGPRINT(("SUPR0GipUnmap: pSession=%p pGip=%p GipMapObjR3=%p\n",
757 pSession,
758 pSession->GipMapObjR3 != NIL_RTR0MEMOBJ ? RTR0MemObjAddress(pSession->GipMapObjR3) : NULL,
759 pSession->GipMapObjR3));
760#else
761 LogFlow(("SUPR0GipUnmap: pSession=%p\n", pSession));
762#endif
763 AssertReturn(SUP_IS_SESSION_VALID(pSession), VERR_INVALID_PARAMETER);
764
765#ifdef SUPDRV_USE_MUTEX_FOR_GIP
766 RTSemMutexRequest(pDevExt->mtxGip, RT_INDEFINITE_WAIT);
767#else
768 RTSemFastMutexRequest(pDevExt->mtxGip);
769#endif
770
771 /*
772 * GIP test-mode session?
773 */
774 if ( pSession->fGipTestMode
775 && pDevExt->pGip)
776 {
777 supdrvGipSetFlags(pDevExt, pSession, 0, ~SUPGIP_FLAGS_TESTING_ENABLE);
778 Assert(!pSession->fGipTestMode);
779 }
780
781 /*
782 * Unmap anything?
783 */
784 if (pSession->GipMapObjR3 != NIL_RTR0MEMOBJ)
785 {
786 rc = RTR0MemObjFree(pSession->GipMapObjR3, false);
787 AssertRC(rc);
788 if (RT_SUCCESS(rc))
789 pSession->GipMapObjR3 = NIL_RTR0MEMOBJ;
790 }
791
792 /*
793 * Dereference global GIP.
794 */
795 if (pSession->fGipReferenced && !rc)
796 {
797 pSession->fGipReferenced = 0;
798 if ( pDevExt->cGipUsers > 0
799 && !--pDevExt->cGipUsers)
800 {
801 LogFlow(("SUPR0GipUnmap: Suspends GIP updating\n"));
802#ifndef DO_NOT_START_GIP
803 rc = RTTimerStop(pDevExt->pGipTimer); AssertRC(rc); rc = VINF_SUCCESS;
804#endif
805 supdrvGipReleaseHigherTimerFrequencyFromSystem(pDevExt);
806 }
807 }
808
809#ifdef SUPDRV_USE_MUTEX_FOR_GIP
810 RTSemMutexRelease(pDevExt->mtxGip);
811#else
812 RTSemFastMutexRelease(pDevExt->mtxGip);
813#endif
814
815 return rc;
816}
817
818
819/**
820 * Gets the GIP pointer.
821 *
822 * @returns Pointer to the GIP or NULL.
823 */
824SUPDECL(PSUPGLOBALINFOPAGE) SUPGetGIP(void)
825{
826 return g_pSUPGlobalInfoPage;
827}
828
829
830
831
832
833/*
834 *
835 *
836 * GIP Initialization, Termination and CPU Offline / Online Related Code.
837 * GIP Initialization, Termination and CPU Offline / Online Related Code.
838 * GIP Initialization, Termination and CPU Offline / Online Related Code.
839 *
840 *
841 */
842
843/**
844 * Used by supdrvGipInitRefineInvariantTscFreqTimer and supdrvGipInitMeasureTscFreq
845 * to update the TSC frequency related GIP variables.
846 *
847 * @param pGip The GIP.
848 * @param nsElapsed The number of nanoseconds elapsed.
849 * @param cElapsedTscTicks The corresponding number of TSC ticks.
850 * @param iTick The tick number for debugging.
851 */
852static void supdrvGipInitSetCpuFreq(PSUPGLOBALINFOPAGE pGip, uint64_t nsElapsed, uint64_t cElapsedTscTicks, uint32_t iTick)
853{
854 /*
855 * Calculate the frequency.
856 */
857 uint64_t uCpuHz;
858 if ( cElapsedTscTicks < UINT64_MAX / RT_NS_1SEC
859 && nsElapsed < UINT32_MAX)
860 uCpuHz = ASMMultU64ByU32DivByU32(cElapsedTscTicks, RT_NS_1SEC, (uint32_t)nsElapsed);
861 else
862 {
863 RTUINT128U CpuHz, Tmp, Divisor;
864 CpuHz.s.Lo = CpuHz.s.Hi = 0;
865 RTUInt128MulU64ByU64(&Tmp, cElapsedTscTicks, RT_NS_1SEC_64);
866 RTUInt128Div(&CpuHz, &Tmp, RTUInt128AssignU64(&Divisor, nsElapsed));
867 uCpuHz = CpuHz.s.Lo;
868 }
869
870 /*
871 * Update the GIP.
872 */
873 ASMAtomicWriteU64(&pGip->u64CpuHz, uCpuHz);
874 if (pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
875 {
876 ASMAtomicWriteU64(&pGip->aCPUs[0].u64CpuHz, uCpuHz);
877
878 /* For inspecting the frequency calcs using tstGIP-2, debugger or similar. */
879 if (iTick + 1 < pGip->cCpus)
880 ASMAtomicWriteU64(&pGip->aCPUs[iTick + 1].u64CpuHz, uCpuHz);
881 }
882}
883
884
885/**
886 * Timer callback function for TSC frequency refinement in invariant GIP mode.
887 *
888 * Each tick measures the time and TSC ticks elapsed since the start values
889 * stored in the device extension and updates the GIP frequency from them.
 *
 * The callback stops the timer itself: when a power event was flagged, when
 * TSC deltas are still unavailable after five refinement periods, once
 * GIP_TSC_REFINE_PERIOD_IN_SECS has elapsed, or when there is a GIP user and
 * at least two seconds have elapsed.
890 *
891 * @param pTimer The timer.
892 * @param pvUser Opaque pointer to the device instance data.
893 * @param iTick The timer tick.
894 */
895static DECLCALLBACK(void) supdrvGipInitRefineInvariantTscFreqTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
896{
897 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
898 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
899 RTCPUID idCpu;
900 uint64_t cNsElapsed;
901 uint64_t cTscTicksElapsed;
902 uint64_t nsNow;
903 uint64_t uTsc;
904 RTCCUINTREG fEFlags;
905
906 /* Paranoia. */
907 AssertReturnVoid(pGip);
908 AssertReturnVoid(pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC);
909
910 /*
911 * If we got a power event, stop the refinement process.
912 */
913 if (pDevExt->fInvTscRefinePowerEvent)
914 {
915 int rc = RTTimerStop(pTimer); AssertRC(rc);
916 return;
917 }
918
919 /*
920 * Read the TSC and time, noting which CPU we are on.
 * Interrupts are disabled around the three reads.
921 *
922 * Don't bother spinning until RTTimeSystemNanoTS changes, since on
923 * systems where it matters we're in a context where we cannot waste that
924 * much time (DPC watchdog, called from clock interrupt).
925 */
926 fEFlags = ASMIntDisableFlags();
927 uTsc = ASMReadTSC();
928 nsNow = RTTimeSystemNanoTS();
929 idCpu = RTMpCpuId();
930 ASMSetFlags(fEFlags);
931
932 cNsElapsed = nsNow - pDevExt->nsStartInvarTscRefine;
933 cTscTicksElapsed = uTsc - pDevExt->uTscStartInvarTscRefine;
934
935 /*
936 * If the above measurement was taken on a different CPU than the one we
937 * started the process on, cTscTicksElapsed will need to be adjusted with
938 * the TSC deltas of both the CPUs.
939 *
940 * We ASSUME that the delta calculation process takes less time than the
941 * TSC frequency refinement timer. If it doesn't, we'll complain and
942 * drop the frequency refinement.
943 *
944 * Note! We cannot entirely trust enmUseTscDelta here because it's
945 * downgraded after each delta calculation.
 *
 * A delta of INT64_MAX is treated as not available.  While a delta is
 * unavailable and the give-up time has not been reached, the code falls
 * through and uses the unadjusted tick count.
946 */
947 if ( idCpu != pDevExt->idCpuInvarTscRefine
948 && pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
949 {
950 uint32_t iStartCpuSet = RTMpCpuIdToSetIndex(pDevExt->idCpuInvarTscRefine);
951 uint32_t iStopCpuSet = RTMpCpuIdToSetIndex(idCpu);
952 uint16_t iStartGipCpu = iStartCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
953 ? pGip->aiCpuFromCpuSetIdx[iStartCpuSet] : UINT16_MAX;
954 uint16_t iStopGipCpu = iStopCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
955 ? pGip->aiCpuFromCpuSetIdx[iStopCpuSet] : UINT16_MAX;
956 int64_t iStartTscDelta = iStartGipCpu < pGip->cCpus ? pGip->aCPUs[iStartGipCpu].i64TSCDelta : INT64_MAX;
957 int64_t iStopTscDelta = iStopGipCpu < pGip->cCpus ? pGip->aCPUs[iStopGipCpu].i64TSCDelta : INT64_MAX;
958 if (RT_LIKELY(iStartTscDelta != INT64_MAX && iStopTscDelta != INT64_MAX))
959 {
960 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_PRACTICALLY_ZERO)
961 {
962 /* cTscTicksElapsed = (uTsc - iStopTscDelta) - (pDevExt->uTscStartInvarTscRefine - iStartTscDelta); */
963 cTscTicksElapsed += iStartTscDelta - iStopTscDelta;
964 }
965 }
966 /*
967 * Allow 5 times the refinement period to elapse before we give up on the TSC delta
968 * calculations.
969 */
970 else if (cNsElapsed > GIP_TSC_REFINE_PERIOD_IN_SECS * 5 * RT_NS_1SEC_64)
971 {
972 SUPR0Printf("vboxdrv: Failed to refine invariant TSC frequency because deltas are unavailable after %u (%u) seconds\n",
973 (uint32_t)(cNsElapsed / RT_NS_1SEC), GIP_TSC_REFINE_PERIOD_IN_SECS);
974 SUPR0Printf("vboxdrv: start: %u, %u, %#llx stop: %u, %u, %#llx\n",
975 iStartCpuSet, iStartGipCpu, iStartTscDelta, iStopCpuSet, iStopGipCpu, iStopTscDelta);
976 int rc = RTTimerStop(pTimer); AssertRC(rc);
977 return;
978 }
979 }
980
981 /*
982 * Calculate and update the CPU frequency variables in GIP.
983 *
984 * If there is a GIP user already and we've already refined the frequency
985 * a couple of times, don't update it as we want a stable frequency value
986 * for all VMs.
 * In that case (a GIP user and two or more seconds elapsed) the timer
 * is stopped without updating.
987 */
988 if ( pDevExt->cGipUsers == 0
989 || cNsElapsed < RT_NS_1SEC * 2)
990 {
991 supdrvGipInitSetCpuFreq(pGip, cNsElapsed, cTscTicksElapsed, (uint32_t)iTick);
992
993 /*
994 * Stop the timer once we've reached the defined refinement period.
995 */
996 if (cNsElapsed > GIP_TSC_REFINE_PERIOD_IN_SECS * RT_NS_1SEC_64)
997 {
998 int rc = RTTimerStop(pTimer);
999 AssertRC(rc);
1000 }
1001 }
1002 else
1003 {
1004 int rc = RTTimerStop(pTimer);
1005 AssertRC(rc);
1006 }
1007}
1008
1009
1010/**
1011 * @callback_method_impl{FNRTPOWERNOTIFICATION}
1012 */
1013static DECLCALLBACK(void) supdrvGipPowerNotificationCallback(RTPOWEREVENT enmEvent, void *pvUser)
1014{
1015 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
1016 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
1017
1018 /*
1019 * If the TSC frequency refinement timer is running, we need to cancel it so it
1020 * doesn't screw up the frequency after a long suspend.
1021 *
1022 * Recalculate all TSC-deltas on host resume as it may have changed, seen
1023 * on Windows 7 running on the Dell Optiplex Intel Core i5-3570.
1024 */
1025 if (enmEvent == RTPOWEREVENT_RESUME)
1026 {
1027 ASMAtomicWriteBool(&pDevExt->fInvTscRefinePowerEvent, true);
1028 if ( RT_LIKELY(pGip)
1029 && pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED
1030 && !supdrvOSAreCpusOfflinedOnSuspend())
1031 {
1032#ifdef SUPDRV_USE_TSC_DELTA_THREAD
1033 supdrvTscDeltaThreadStartMeasurement(pDevExt, true /* fForceAll */);
1034#else
1035 RTCpuSetCopy(&pDevExt->TscDeltaCpuSet, &pGip->OnlineCpuSet);
1036 supdrvTscMeasureInitialDeltas(pDevExt);
1037#endif
1038 }
1039 }
1040 else if (enmEvent == RTPOWEREVENT_SUSPEND)
1041 ASMAtomicWriteBool(&pDevExt->fInvTscRefinePowerEvent, true);
1042}
1043
1044
1045/**
1046 * Start the TSC-frequency refinment timer for the invariant TSC GIP mode.
1047 *
1048 * We cannot use this in the synchronous and asynchronous tsc GIP modes because
1049 * the CPU may change the TSC frequence between now and when the timer fires
1050 * (supdrvInitAsyncRefineTscTimer).
1051 *
1052 * @param pDevExt Pointer to the device instance data.
1053 */
1054static void supdrvGipInitStartTimerForRefiningInvariantTscFreq(PSUPDRVDEVEXT pDevExt)
1055{
1056 uint64_t u64NanoTS;
1057 RTCCUINTREG fEFlags;
1058 int rc;
1059
1060 /*
1061 * Register a power management callback.
1062 */
1063 pDevExt->fInvTscRefinePowerEvent = false;
1064 rc = RTPowerNotificationRegister(supdrvGipPowerNotificationCallback, pDevExt);
1065 AssertRC(rc); /* ignore */
1066
1067 /*
1068 * Record the TSC and NanoTS as the starting anchor point for refinement
1069 * of the TSC. We try get as close to a clock tick as possible on systems
1070 * which does not provide high resolution time.
1071 */
1072 u64NanoTS = RTTimeSystemNanoTS();
1073 while (RTTimeSystemNanoTS() == u64NanoTS)
1074 ASMNopPause();
1075
1076 fEFlags = ASMIntDisableFlags();
1077 pDevExt->uTscStartInvarTscRefine = ASMReadTSC();
1078 pDevExt->nsStartInvarTscRefine = RTTimeSystemNanoTS();
1079 pDevExt->idCpuInvarTscRefine = RTMpCpuId();
1080 ASMSetFlags(fEFlags);
1081
1082 /*
1083 * Create a timer that runs on the same CPU so we won't have a depencency
1084 * on the TSC-delta and can run in parallel to it. On systems that does not
1085 * implement CPU specific timers we'll apply deltas in the timer callback,
1086 * just like we do for CPUs going offline.
1087 *
1088 * The longer the refinement interval the better the accuracy, at least in
1089 * theory. If it's too long though, ring-3 may already be starting its
1090 * first VMs before we're done. On most systems we will be loading the
1091 * support driver during boot and VMs won't be started for a while yet,
1092 * it is really only a problem during development (especially with
1093 * on-demand driver starting on windows).
1094 *
1095 * To avoid wasting time doing a long supdrvGipInitMeasureTscFreq() call
1096 * to calculate the frequency during driver loading, the timer is set
1097 * to fire after 200 ms the first time. It will then reschedule itself
1098 * to fire every second until GIP_TSC_REFINE_PERIOD_IN_SECS has been
1099 * reached or it notices that there is a user land client with GIP
1100 * mapped (we want a stable frequency for all VMs).
1101 */
1102 rc = RTTimerCreateEx(&pDevExt->pInvarTscRefineTimer, RT_NS_1SEC,
1103 RTTIMER_FLAGS_CPU(RTMpCpuIdToSetIndex(pDevExt->idCpuInvarTscRefine)),
1104 supdrvGipInitRefineInvariantTscFreqTimer, pDevExt);
1105 if (RT_SUCCESS(rc))
1106 {
1107 rc = RTTimerStart(pDevExt->pInvarTscRefineTimer, 2*RT_NS_100MS);
1108 if (RT_SUCCESS(rc))
1109 return;
1110 RTTimerDestroy(pDevExt->pInvarTscRefineTimer);
1111 }
1112
1113 if (rc == VERR_CPU_OFFLINE || rc == VERR_NOT_SUPPORTED)
1114 {
1115 rc = RTTimerCreateEx(&pDevExt->pInvarTscRefineTimer, RT_NS_1SEC, RTTIMER_FLAGS_CPU_ANY,
1116 supdrvGipInitRefineInvariantTscFreqTimer, pDevExt);
1117 if (RT_SUCCESS(rc))
1118 {
1119 rc = RTTimerStart(pDevExt->pInvarTscRefineTimer, 2*RT_NS_100MS);
1120 if (RT_SUCCESS(rc))
1121 return;
1122 RTTimerDestroy(pDevExt->pInvarTscRefineTimer);
1123 }
1124 }
1125
1126 pDevExt->pInvarTscRefineTimer = NULL;
1127 OSDBGPRINT(("vboxdrv: Failed to create or start TSC frequency refinement timer: rc=%Rrc\n", rc));
1128}
1129
1130
1131/**
1132 * @callback_method_impl{PFNRTMPWORKER,
1133 * RTMpOnSpecific callback for reading TSC and time on the CPU we started
1134 * the measurements on.}
1135 */
1136static DECLCALLBACK(void) supdrvGipInitReadTscAndNanoTsOnCpu(RTCPUID idCpu, void *pvUser1, void *pvUser2)
1137{
1138 RTCCUINTREG fEFlags = ASMIntDisableFlags();
1139 uint64_t *puTscStop = (uint64_t *)pvUser1;
1140 uint64_t *pnsStop = (uint64_t *)pvUser2;
1141 RT_NOREF1(idCpu);
1142
1143 *puTscStop = ASMReadTSC();
1144 *pnsStop = RTTimeSystemNanoTS();
1145
1146 ASMSetFlags(fEFlags);
1147}
1148
1149
1150/**
1151 * Measures the TSC frequency of the system.
1152 *
1153 * The TSC frequency can vary on systems which are not reported as invariant.
1154 * On such systems the object of this function is to find out what the nominal,
1155 * maximum TSC frequency under 'normal' CPU operation.
1156 *
1157 * @returns VBox status code.
1158 * @param pGip Pointer to the GIP.
1159 * @param fRough Set if we're doing the rough calculation that the
1160 * TSC measuring code needs, where accuracy isn't all
1161 * that important (too high is better than too low).
1162 * When clear we try for best accuracy that we can
1163 * achieve in reasonably short time.
1164 */
1165static int supdrvGipInitMeasureTscFreq(PSUPGLOBALINFOPAGE pGip, bool fRough)
1166{
1167 uint32_t nsTimerIncr = RTTimerGetSystemGranularity();
1168 int cTriesLeft = fRough ? 4 : 2;
1169 while (cTriesLeft-- > 0)
1170 {
1171 RTCCUINTREG fEFlags;
1172 uint64_t nsStart;
1173 uint64_t nsStop;
1174 uint64_t uTscStart;
1175 uint64_t uTscStop;
1176 RTCPUID idCpuStart;
1177 RTCPUID idCpuStop;
1178
1179 /*
1180 * Synchronize with the host OS clock tick on systems without high
1181 * resolution time API (older Windows version for example).
1182 */
1183 nsStart = RTTimeSystemNanoTS();
1184 while (RTTimeSystemNanoTS() == nsStart)
1185 ASMNopPause();
1186
1187 /*
1188 * Read the TSC and current time, noting which CPU we're on.
1189 */
1190 fEFlags = ASMIntDisableFlags();
1191 uTscStart = ASMReadTSC();
1192 nsStart = RTTimeSystemNanoTS();
1193 idCpuStart = RTMpCpuId();
1194 ASMSetFlags(fEFlags);
1195
1196 /*
1197 * Delay for a while.
1198 */
1199 if (pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC)
1200 {
1201 /*
1202 * Sleep-wait since the TSC frequency is constant, it eases host load.
1203 * Shorter interval produces more variance in the frequency (esp. Windows).
1204 */
1205 uint64_t msElapsed = 0;
1206 uint64_t msDelay = ( ((fRough ? 16 : 200) * RT_NS_1MS + nsTimerIncr - 1) / nsTimerIncr * nsTimerIncr - RT_NS_100US )
1207 / RT_NS_1MS;
1208 do
1209 {
1210 RTThreadSleep((RTMSINTERVAL)(msDelay - msElapsed));
1211 nsStop = RTTimeSystemNanoTS();
1212 msElapsed = (nsStop - nsStart) / RT_NS_1MS;
1213 } while (msElapsed < msDelay);
1214
1215 while (RTTimeSystemNanoTS() == nsStop)
1216 ASMNopPause();
1217 }
1218 else
1219 {
1220 /*
1221 * Busy-wait keeping the frequency up.
1222 */
1223 do
1224 {
1225 ASMNopPause();
1226 nsStop = RTTimeSystemNanoTS();
1227 } while (nsStop - nsStart < RT_NS_100MS);
1228 }
1229
1230 /*
1231 * Read the TSC and time again.
1232 */
1233 fEFlags = ASMIntDisableFlags();
1234 uTscStop = ASMReadTSC();
1235 nsStop = RTTimeSystemNanoTS();
1236 idCpuStop = RTMpCpuId();
1237 ASMSetFlags(fEFlags);
1238
1239 /*
1240 * If the CPU changes, things get a bit complicated and what we
1241 * can get away with depends on the GIP mode / TSC reliability.
1242 */
1243 if (idCpuStop != idCpuStart)
1244 {
1245 bool fDoXCall = false;
1246
1247 /*
1248 * Synchronous TSC mode: we're probably fine as it's unlikely
1249 * that we were rescheduled because of TSC throttling or power
1250 * management reasons, so just go ahead.
1251 */
1252 if (pGip->u32Mode == SUPGIPMODE_SYNC_TSC)
1253 {
1254 /* Probably ok, maybe we should retry once?. */
1255 Assert(pGip->enmUseTscDelta == SUPGIPUSETSCDELTA_NOT_APPLICABLE);
1256 }
1257 /*
1258 * If we're just doing the rough measurement, do the cross call and
1259 * get on with things (we don't have deltas!).
1260 */
1261 else if (fRough)
1262 fDoXCall = true;
1263 /*
1264 * Invariant TSC mode: It doesn't matter if we have delta available
1265 * for both CPUs. That is not something we can assume at this point.
1266 *
1267 * Note! We cannot necessarily trust enmUseTscDelta here because it's
1268 * downgraded after each delta calculation and the delta
1269 * calculations may not be complete yet.
1270 */
1271 else if (pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC)
1272 {
1273/** @todo This section of code is never reached atm, consider dropping it later on... */
1274 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
1275 {
1276 uint32_t iStartCpuSet = RTMpCpuIdToSetIndex(idCpuStart);
1277 uint32_t iStopCpuSet = RTMpCpuIdToSetIndex(idCpuStop);
1278 uint16_t iStartGipCpu = iStartCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
1279 ? pGip->aiCpuFromCpuSetIdx[iStartCpuSet] : UINT16_MAX;
1280 uint16_t iStopGipCpu = iStopCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
1281 ? pGip->aiCpuFromCpuSetIdx[iStopCpuSet] : UINT16_MAX;
1282 int64_t iStartTscDelta = iStartGipCpu < pGip->cCpus ? pGip->aCPUs[iStartGipCpu].i64TSCDelta : INT64_MAX;
1283 int64_t iStopTscDelta = iStopGipCpu < pGip->cCpus ? pGip->aCPUs[iStopGipCpu].i64TSCDelta : INT64_MAX;
1284 if (RT_LIKELY(iStartTscDelta != INT64_MAX && iStopTscDelta != INT64_MAX))
1285 {
1286 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_PRACTICALLY_ZERO)
1287 {
1288 uTscStart -= iStartTscDelta;
1289 uTscStop -= iStopTscDelta;
1290 }
1291 }
1292 /*
1293 * Invalid CPU indexes are not caused by online/offline races, so
1294 * we have to trigger driver load failure if that happens as GIP
1295 * and IPRT assumptions are busted on this system.
1296 */
1297 else if (iStopGipCpu >= pGip->cCpus || iStartGipCpu >= pGip->cCpus)
1298 {
1299 SUPR0Printf("vboxdrv: Unexpected CPU index in supdrvGipInitMeasureTscFreq.\n");
1300 SUPR0Printf("vboxdrv: start: %u, %u, %#llx stop: %u, %u, %#llx\n",
1301 iStartCpuSet, iStartGipCpu, iStartTscDelta, iStopCpuSet, iStopGipCpu, iStopTscDelta);
1302 return VERR_INVALID_CPU_INDEX;
1303 }
1304 /*
1305 * No valid deltas. We retry, if we're on our last retry
1306 * we do the cross call instead just to get a result. The
1307 * frequency will be refined in a few seconds anyway.
1308 */
1309 else if (cTriesLeft > 0)
1310 continue;
1311 else
1312 fDoXCall = true;
1313 }
1314 }
1315 /*
1316 * Asynchronous TSC mode: This is bad, as the reason we usually
1317 * use this mode is to deal with variable TSC frequencies and
1318 * deltas. So, we need to get the TSC from the same CPU as
1319 * started it, we also need to keep that CPU busy. So, retry
1320 * and fall back to the cross call on the last attempt.
1321 */
1322 else
1323 {
1324 Assert(pGip->u32Mode == SUPGIPMODE_ASYNC_TSC);
1325 if (cTriesLeft > 0)
1326 continue;
1327 fDoXCall = true;
1328 }
1329
1330 if (fDoXCall)
1331 {
1332 /*
1333 * Try read the TSC and timestamp on the start CPU.
1334 */
1335 int rc = RTMpOnSpecific(idCpuStart, supdrvGipInitReadTscAndNanoTsOnCpu, &uTscStop, &nsStop);
1336 if (RT_FAILURE(rc) && (!fRough || cTriesLeft > 0))
1337 continue;
1338 }
1339 }
1340
1341 /*
1342 * Calculate the TSC frequency and update it (shared with the refinement timer).
1343 */
1344 supdrvGipInitSetCpuFreq(pGip, nsStop - nsStart, uTscStop - uTscStart, 0);
1345 return VINF_SUCCESS;
1346 }
1347
1348 Assert(!fRough);
1349 return VERR_SUPDRV_TSC_FREQ_MEASUREMENT_FAILED;
1350}
1351
1352
1353/**
1354 * Finds our (@a idCpu) entry, or allocates a new one if not found.
1355 *
1356 * @returns Index of the CPU in the cache set.
1357 * @param pGip The GIP.
1358 * @param idCpu The CPU ID.
1359 */
1360static uint32_t supdrvGipFindOrAllocCpuIndexForCpuId(PSUPGLOBALINFOPAGE pGip, RTCPUID idCpu)
1361{
1362 uint32_t i, cTries;
1363
1364 /*
1365 * ASSUMES that CPU IDs are constant.
1366 */
1367 for (i = 0; i < pGip->cCpus; i++)
1368 if (pGip->aCPUs[i].idCpu == idCpu)
1369 return i;
1370
1371 cTries = 0;
1372 do
1373 {
1374 for (i = 0; i < pGip->cCpus; i++)
1375 {
1376 bool fRc;
1377 ASMAtomicCmpXchgSize(&pGip->aCPUs[i].idCpu, idCpu, NIL_RTCPUID, fRc);
1378 if (fRc)
1379 return i;
1380 }
1381 } while (cTries++ < 32);
1382 AssertReleaseFailed();
1383 return i - 1;
1384}
1385
1386
1387/**
1388 * The calling CPU should be accounted as online, update GIP accordingly.
1389 *
1390 * This is used by supdrvGipCreate() as well as supdrvGipMpEvent().
1391 *
1392 * @param pDevExt The device extension.
1393 * @param idCpu The CPU ID.
1394 */
1395static void supdrvGipMpEventOnlineOrInitOnCpu(PSUPDRVDEVEXT pDevExt, RTCPUID idCpu)
1396{
1397 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
1398 int iCpuSet = 0;
1399 uint32_t idApic;
1400 uint32_t i = 0;
1401 uint64_t u64NanoTS = 0;
1402
1403 AssertPtrReturnVoid(pGip);
1404 Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD));
1405 AssertRelease(idCpu == RTMpCpuId());
1406 Assert(pGip->cPossibleCpus == RTMpGetCount());
1407
1408 /*
1409 * Do this behind a spinlock with interrupts disabled as this can fire
1410 * on all CPUs simultaneously, see @bugref{6110}.
1411 */
1412 RTSpinlockAcquire(pDevExt->hGipSpinlock);
1413
1414 /*
1415 * Update the globals.
1416 */
1417 ASMAtomicWriteU16(&pGip->cPresentCpus, RTMpGetPresentCount());
1418 ASMAtomicWriteU16(&pGip->cOnlineCpus, RTMpGetOnlineCount());
1419 iCpuSet = RTMpCpuIdToSetIndex(idCpu);
1420 if (iCpuSet >= 0)
1421 {
1422 Assert(RTCpuSetIsMemberByIndex(&pGip->PossibleCpuSet, iCpuSet));
1423 RTCpuSetAddByIndex(&pGip->OnlineCpuSet, iCpuSet);
1424 RTCpuSetAddByIndex(&pGip->PresentCpuSet, iCpuSet);
1425 }
1426
1427 /*
1428 * Update the entry.
1429 */
1430 u64NanoTS = RTTimeSystemNanoTS() - pGip->u32UpdateIntervalNS;
1431 i = supdrvGipFindOrAllocCpuIndexForCpuId(pGip, idCpu);
1432
1433 supdrvGipInitCpu(pGip, &pGip->aCPUs[i], u64NanoTS, pGip->u64CpuHz);
1434
1435 idApic = supdrvGipGetApicIdSlow();
1436 ASMAtomicWriteU16(&pGip->aCPUs[i].idApic, idApic);
1437 ASMAtomicWriteS16(&pGip->aCPUs[i].iCpuSet, (int16_t)iCpuSet);
1438 ASMAtomicWriteSize(&pGip->aCPUs[i].idCpu, idCpu);
1439
1440 pGip->aCPUs[i].iCpuGroup = 0;
1441 pGip->aCPUs[i].iCpuGroupMember = iCpuSet;
1442#ifdef RT_OS_WINDOWS
1443 supdrvOSGipInitGroupBitsForCpu(pDevExt, pGip, &pGip->aCPUs[i]);
1444#endif
1445
1446 /*
1447 * Update the APIC ID and CPU set index mappings.
1448 */
1449 if (idApic < RT_ELEMENTS(pGip->aiCpuFromApicId))
1450 ASMAtomicWriteU16(&pGip->aiCpuFromApicId[idApic], i);
1451 else
1452 LogRelMax(64, ("supdrvGipMpEventOnlineOrInitOnCpu: idApic=%#x is out of bounds (%#zx, i=%u, iCpuSet=%d)\n",
1453 idApic, RT_ELEMENTS(pGip->aiCpuFromApicId), i, iCpuSet));
1454 if ((unsigned)iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx))
1455 ASMAtomicWriteU16(&pGip->aiCpuFromCpuSetIdx[iCpuSet], i);
1456 else
1457 LogRelMax(64, ("supdrvGipMpEventOnlineOrInitOnCpu: iCpuSet=%d is out of bounds (%#zx, i=%u, idApic=%d)\n",
1458 iCpuSet, RT_ELEMENTS(pGip->aiCpuFromApicId), i, idApic));
1459
1460 /* Add this CPU to this set of CPUs we need to calculate the TSC-delta for. */
1461 RTCpuSetAddByIndex(&pDevExt->TscDeltaCpuSet, RTMpCpuIdToSetIndex(idCpu));
1462
1463 /* Update the Mp online/offline counter. */
1464 ASMAtomicIncU32(&pDevExt->cMpOnOffEvents);
1465
1466 /* Commit it. */
1467 ASMAtomicWriteSize(&pGip->aCPUs[i].enmState, SUPGIPCPUSTATE_ONLINE);
1468
1469 RTSpinlockRelease(pDevExt->hGipSpinlock);
1470}
1471
1472
1473/**
1474 * RTMpOnSpecific callback wrapper for supdrvGipMpEventOnlineOrInitOnCpu().
1475 *
1476 * @param idCpu The CPU ID we are running on.
1477 * @param pvUser1 Opaque pointer to the device instance data.
1478 * @param pvUser2 Not used.
1479 */
1480static DECLCALLBACK(void) supdrvGipMpEventOnlineCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
1481{
1482 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser1;
1483 NOREF(pvUser2);
1484 supdrvGipMpEventOnlineOrInitOnCpu(pDevExt, idCpu);
1485}
1486
1487
1488/**
1489 * The CPU should be accounted as offline, update the GIP accordingly.
1490 *
1491 * This is used by supdrvGipMpEvent.
1492 *
1493 * @param pDevExt The device extension.
1494 * @param idCpu The CPU ID.
1495 */
1496static void supdrvGipMpEventOffline(PSUPDRVDEVEXT pDevExt, RTCPUID idCpu)
1497{
1498 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
1499 int iCpuSet;
1500 unsigned i;
1501
1502 AssertPtrReturnVoid(pGip);
1503 RTSpinlockAcquire(pDevExt->hGipSpinlock);
1504
1505 iCpuSet = RTMpCpuIdToSetIndex(idCpu);
1506 AssertReturnVoid(iCpuSet >= 0);
1507
1508 i = pGip->aiCpuFromCpuSetIdx[iCpuSet];
1509 AssertReturnVoid(i < pGip->cCpus);
1510 AssertReturnVoid(pGip->aCPUs[i].idCpu == idCpu);
1511
1512 Assert(RTCpuSetIsMemberByIndex(&pGip->PossibleCpuSet, iCpuSet));
1513 RTCpuSetDelByIndex(&pGip->OnlineCpuSet, iCpuSet);
1514
1515 /* Update the Mp online/offline counter. */
1516 ASMAtomicIncU32(&pDevExt->cMpOnOffEvents);
1517
1518 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
1519 {
1520 /* Reset the TSC delta, we will recalculate it lazily. */
1521 ASMAtomicWriteS64(&pGip->aCPUs[i].i64TSCDelta, INT64_MAX);
1522 /* Remove this CPU from the set of CPUs that we have obtained the TSC deltas. */
1523 RTCpuSetDelByIndex(&pDevExt->TscDeltaObtainedCpuSet, iCpuSet);
1524 }
1525
1526 /* Commit it. */
1527 ASMAtomicWriteSize(&pGip->aCPUs[i].enmState, SUPGIPCPUSTATE_OFFLINE);
1528
1529 RTSpinlockRelease(pDevExt->hGipSpinlock);
1530}
1531
1532
1533/**
1534 * Multiprocessor event notification callback.
1535 *
1536 * This is used to make sure that the GIP master gets passed on to
1537 * another CPU. It also updates the associated CPU data.
1538 *
1539 * @param enmEvent The event.
1540 * @param idCpu The cpu it applies to.
1541 * @param pvUser Pointer to the device extension.
1542 */
1543static DECLCALLBACK(void) supdrvGipMpEvent(RTMPEVENT enmEvent, RTCPUID idCpu, void *pvUser)
1544{
1545 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
1546 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
1547
1548 if (pGip)
1549 {
1550 RTTHREADPREEMPTSTATE PreemptState = RTTHREADPREEMPTSTATE_INITIALIZER;
1551 switch (enmEvent)
1552 {
1553 case RTMPEVENT_ONLINE:
1554 {
1555 RTThreadPreemptDisable(&PreemptState);
1556 if (idCpu == RTMpCpuId())
1557 {
1558 supdrvGipMpEventOnlineOrInitOnCpu(pDevExt, idCpu);
1559 RTThreadPreemptRestore(&PreemptState);
1560 }
1561 else
1562 {
1563 RTThreadPreemptRestore(&PreemptState);
1564 RTMpOnSpecific(idCpu, supdrvGipMpEventOnlineCallback, pDevExt, NULL /* pvUser2 */);
1565 }
1566
1567 /*
1568 * Recompute TSC-delta for the newly online'd CPU.
1569 */
1570 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
1571 {
1572#ifdef SUPDRV_USE_TSC_DELTA_THREAD
1573 supdrvTscDeltaThreadStartMeasurement(pDevExt, false /* fForceAll */);
1574#else
1575 uint32_t iCpu = supdrvGipFindOrAllocCpuIndexForCpuId(pGip, idCpu);
1576 supdrvTscMeasureDeltaOne(pDevExt, iCpu);
1577#endif
1578 }
1579 break;
1580 }
1581
1582 case RTMPEVENT_OFFLINE:
1583 supdrvGipMpEventOffline(pDevExt, idCpu);
1584 break;
1585 }
1586 }
1587
1588 /*
1589 * Make sure there is a master GIP.
1590 */
1591 if (enmEvent == RTMPEVENT_OFFLINE)
1592 {
1593 RTCPUID idGipMaster = ASMAtomicReadU32(&pDevExt->idGipMaster);
1594 if (idGipMaster == idCpu)
1595 {
1596 /*
1597 * The GIP master is going offline, find a new one.
1598 */
1599 bool fIgnored;
1600 unsigned i;
1601 RTCPUID idNewGipMaster = NIL_RTCPUID;
1602 RTCPUSET OnlineCpus;
1603 RTMpGetOnlineSet(&OnlineCpus);
1604
1605 for (i = 0; i < RTCPUSET_MAX_CPUS; i++)
1606 if (RTCpuSetIsMemberByIndex(&OnlineCpus, i))
1607 {
1608 RTCPUID idCurCpu = RTMpCpuIdFromSetIndex(i);
1609 if (idCurCpu != idGipMaster)
1610 {
1611 idNewGipMaster = idCurCpu;
1612 break;
1613 }
1614 }
1615
1616 Log(("supdrvGipMpEvent: Gip master %#lx -> %#lx\n", (long)idGipMaster, (long)idNewGipMaster));
1617 ASMAtomicCmpXchgSize(&pDevExt->idGipMaster, idNewGipMaster, idGipMaster, fIgnored);
1618 NOREF(fIgnored);
1619 }
1620 }
1621}
1622
1623
1624/**
1625 * On CPU initialization callback for RTMpOnAll.
1626 *
1627 * @param idCpu The CPU ID.
1628 * @param pvUser1 The device extension.
1629 * @param pvUser2 The GIP.
1630 */
1631static DECLCALLBACK(void) supdrvGipInitOnCpu(RTCPUID idCpu, void *pvUser1, void *pvUser2)
1632{
1633 /* This is good enough, even though it will update some of the globals a
1634 bit to much. */
1635 supdrvGipMpEventOnlineOrInitOnCpu((PSUPDRVDEVEXT)pvUser1, idCpu);
1636 NOREF(pvUser2);
1637}
1638
1639
1640/**
1641 * Callback used by supdrvDetermineAsyncTSC to read the TSC on a CPU.
1642 *
1643 * @param idCpu Ignored.
1644 * @param pvUser1 Where to put the TSC.
1645 * @param pvUser2 Ignored.
1646 */
1647static DECLCALLBACK(void) supdrvGipInitDetermineAsyncTscWorker(RTCPUID idCpu, void *pvUser1, void *pvUser2)
1648{
1649 Assert(RTMpCpuIdToSetIndex(idCpu) == (intptr_t)pvUser2);
1650 ASMAtomicWriteU64((uint64_t volatile *)pvUser1, ASMReadTSC());
1651 RT_NOREF2(idCpu, pvUser2);
1652}
1653
1654
1655/**
1656 * Determine if Async GIP mode is required because of TSC drift.
1657 *
1658 * When using the default/normal timer code it is essential that the time stamp counter
1659 * (TSC) runs never backwards, that is, a read operation to the counter should return
1660 * a bigger value than any previous read operation. This is guaranteed by the latest
1661 * AMD CPUs and by newer Intel CPUs which never enter the C2 state (P4). In any other
1662 * case we have to choose the asynchronous timer mode.
1663 *
1664 * @param poffMin Pointer to the determined difference between different
1665 * cores (optional, can be NULL).
1666 * @return false if the time stamp counters appear to be synchronized, true otherwise.
1667 */
1668static bool supdrvGipInitDetermineAsyncTsc(uint64_t *poffMin)
1669{
1670 /*
1671 * Just iterate all the cpus 8 times and make sure that the TSC is
1672 * ever increasing. We don't bother taking TSC rollover into account.
1673 */
1674 int iEndCpu = RTMpGetArraySize();
1675 int iCpu;
1676 int cLoops = 8;
1677 bool fAsync = false;
1678 int rc = VINF_SUCCESS;
1679 uint64_t offMax = 0;
1680 uint64_t offMin = ~(uint64_t)0;
1681 uint64_t PrevTsc = ASMReadTSC();
1682
1683 while (cLoops-- > 0)
1684 {
1685 for (iCpu = 0; iCpu < iEndCpu; iCpu++)
1686 {
1687 uint64_t CurTsc;
1688 rc = RTMpOnSpecific(RTMpCpuIdFromSetIndex(iCpu), supdrvGipInitDetermineAsyncTscWorker,
1689 &CurTsc, (void *)(uintptr_t)iCpu);
1690 if (RT_SUCCESS(rc))
1691 {
1692 if (CurTsc <= PrevTsc)
1693 {
1694 fAsync = true;
1695 offMin = offMax = PrevTsc - CurTsc;
1696 Log(("supdrvGipInitDetermineAsyncTsc: iCpu=%d cLoops=%d CurTsc=%llx PrevTsc=%llx\n",
1697 iCpu, cLoops, CurTsc, PrevTsc));
1698 break;
1699 }
1700
1701 /* Gather statistics (except the first time). */
1702 if (iCpu != 0 || cLoops != 7)
1703 {
1704 uint64_t off = CurTsc - PrevTsc;
1705 if (off < offMin)
1706 offMin = off;
1707 if (off > offMax)
1708 offMax = off;
1709 Log2(("%d/%d: off=%llx\n", cLoops, iCpu, off));
1710 }
1711
1712 /* Next */
1713 PrevTsc = CurTsc;
1714 }
1715 else if (rc == VERR_NOT_SUPPORTED)
1716 break;
1717 else
1718 AssertMsg(rc == VERR_CPU_NOT_FOUND || rc == VERR_CPU_OFFLINE, ("%d\n", rc));
1719 }
1720
1721 /* broke out of the loop. */
1722 if (iCpu < iEndCpu)
1723 break;
1724 }
1725
1726 if (poffMin)
1727 *poffMin = offMin; /* Almost RTMpOnSpecific profiling. */
1728 Log(("supdrvGipInitDetermineAsyncTsc: returns %d; iEndCpu=%d rc=%d offMin=%llx offMax=%llx\n",
1729 fAsync, iEndCpu, rc, offMin, offMax));
1730#if !defined(RT_OS_SOLARIS) && !defined(RT_OS_OS2) && !defined(RT_OS_WINDOWS)
1731 OSDBGPRINT(("vboxdrv: fAsync=%d offMin=%#lx offMax=%#lx\n", fAsync, (long)offMin, (long)offMax));
1732#endif
1733 return fAsync;
1734}
1735
1736
1737/**
1738 * supdrvGipInit() worker that determines the GIP TSC mode.
1739 *
1740 * @returns The most suitable TSC mode.
1741 * @param pDevExt Pointer to the device instance data.
1742 */
1743static SUPGIPMODE supdrvGipInitDetermineTscMode(PSUPDRVDEVEXT pDevExt)
1744{
1745 uint64_t u64DiffCoresIgnored;
1746 uint32_t uEAX, uEBX, uECX, uEDX;
1747
1748 /*
1749 * Establish whether the CPU advertises TSC as invariant, we need that in
1750 * a couple of places below.
1751 */
1752 bool fInvariantTsc = false;
1753 if (ASMHasCpuId())
1754 {
1755 uEAX = ASMCpuId_EAX(0x80000000);
1756 if (ASMIsValidExtRange(uEAX) && uEAX >= 0x80000007)
1757 {
1758 uEDX = ASMCpuId_EDX(0x80000007);
1759 if (uEDX & X86_CPUID_AMD_ADVPOWER_EDX_TSCINVAR)
1760 fInvariantTsc = true;
1761 }
1762 }
1763
1764 /*
1765 * On single CPU systems, we don't need to consider ASYNC mode.
1766 */
1767 if (RTMpGetCount() <= 1)
1768 return fInvariantTsc ? SUPGIPMODE_INVARIANT_TSC : SUPGIPMODE_SYNC_TSC;
1769
1770 /*
1771 * Allow the user and/or OS specific bits to force async mode.
1772 */
1773 if (supdrvOSGetForcedAsyncTscMode(pDevExt))
1774 return SUPGIPMODE_ASYNC_TSC;
1775
1776 /*
1777 * Use invariant mode if the CPU says TSC is invariant.
1778 */
1779 if (fInvariantTsc)
1780 return SUPGIPMODE_INVARIANT_TSC;
1781
1782 /*
1783 * TSC is not invariant and we're on SMP, this presents two problems:
1784 *
1785 * (1) There might be a skew between the CPU, so that cpu0
1786 * returns a TSC that is slightly different from cpu1.
1787 * This screw may be due to (2), bad TSC initialization
1788 * or slightly different TSC rates.
1789 *
1790 * (2) Power management (and other things) may cause the TSC
1791 * to run at a non-constant speed, and cause the speed
1792 * to be different on the cpus. This will result in (1).
1793 *
1794 * If any of the above is detected, we will have to use ASYNC mode.
1795 */
1796 /* (1). Try check for current differences between the cpus. */
1797 if (supdrvGipInitDetermineAsyncTsc(&u64DiffCoresIgnored))
1798 return SUPGIPMODE_ASYNC_TSC;
1799
1800 /* (2) If it's an AMD CPU with power management, we won't trust its TSC. */
1801 ASMCpuId(0, &uEAX, &uEBX, &uECX, &uEDX);
1802 if ( ASMIsValidStdRange(uEAX)
1803 && (ASMIsAmdCpuEx(uEBX, uECX, uEDX) || ASMIsHygonCpuEx(uEBX, uECX, uEDX)) )
1804 {
1805 /* Check for APM support. */
1806 uEAX = ASMCpuId_EAX(0x80000000);
1807 if (ASMIsValidExtRange(uEAX) && uEAX >= 0x80000007)
1808 {
1809 uEDX = ASMCpuId_EDX(0x80000007);
1810 if (uEDX & 0x3e) /* STC|TM|THERMTRIP|VID|FID. Ignore TS. */
1811 return SUPGIPMODE_ASYNC_TSC;
1812 }
1813 }
1814
1815 return SUPGIPMODE_SYNC_TSC;
1816}
1817
1818
1819/**
1820 * Initializes per-CPU GIP information.
1821 *
1822 * @param pGip Pointer to the GIP.
1823 * @param pCpu Pointer to which GIP CPU to initialize.
1824 * @param u64NanoTS The current nanosecond timestamp.
1825 * @param uCpuHz The CPU frequency to set, 0 if the caller doesn't know.
1826 */
1827static void supdrvGipInitCpu(PSUPGLOBALINFOPAGE pGip, PSUPGIPCPU pCpu, uint64_t u64NanoTS, uint64_t uCpuHz)
1828{
1829 pCpu->u32TransactionId = 2;
1830 pCpu->u64NanoTS = u64NanoTS;
1831 pCpu->u64TSC = ASMReadTSC();
1832 pCpu->u64TSCSample = GIP_TSC_DELTA_RSVD;
1833 pCpu->i64TSCDelta = pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED ? INT64_MAX : 0;
1834
1835 ASMAtomicWriteSize(&pCpu->enmState, SUPGIPCPUSTATE_INVALID);
1836 ASMAtomicWriteU32(&pCpu->idCpu, NIL_RTCPUID);
1837 ASMAtomicWriteS16(&pCpu->iCpuSet, -1);
1838 ASMAtomicWriteU16(&pCpu->iCpuGroup, 0);
1839 ASMAtomicWriteU16(&pCpu->iCpuGroupMember, UINT16_MAX);
1840 ASMAtomicWriteU16(&pCpu->idApic, UINT16_MAX);
1841 ASMAtomicWriteU32(&pCpu->iReservedForNumaNode, 0);
1842
1843 /*
1844 * The first time we're called, we don't have a CPU frequency handy,
1845 * so pretend it's a 4 GHz CPU. On CPUs that are online, we'll get
1846 * called again and at that point we have a more plausible CPU frequency
1847 * value handy. The frequency history will also be adjusted again on
1848 * the 2nd timer callout (maybe we can skip that now?).
1849 */
1850 if (!uCpuHz)
1851 {
1852 pCpu->u64CpuHz = _4G - 1;
1853 pCpu->u32UpdateIntervalTSC = (uint32_t)((_4G - 1) / pGip->u32UpdateHz);
1854 }
1855 else
1856 {
1857 pCpu->u64CpuHz = uCpuHz;
1858 pCpu->u32UpdateIntervalTSC = (uint32_t)(uCpuHz / pGip->u32UpdateHz);
1859 }
1860 pCpu->au32TSCHistory[0]
1861 = pCpu->au32TSCHistory[1]
1862 = pCpu->au32TSCHistory[2]
1863 = pCpu->au32TSCHistory[3]
1864 = pCpu->au32TSCHistory[4]
1865 = pCpu->au32TSCHistory[5]
1866 = pCpu->au32TSCHistory[6]
1867 = pCpu->au32TSCHistory[7]
1868 = pCpu->u32UpdateIntervalTSC;
1869}
1870
1871
1872/**
1873 * Initializes the GIP data.
1874 *
1875 * @returns VBox status code.
1876 * @param pDevExt Pointer to the device instance data.
1877 * @param pGip Pointer to the read-write kernel mapping of the GIP.
1878 * @param HCPhys The physical address of the GIP.
1879 * @param u64NanoTS The current nanosecond timestamp.
1880 * @param uUpdateHz The update frequency.
1881 * @param uUpdateIntervalNS The update interval in nanoseconds.
1882 * @param cCpus The CPU count.
1883 * @param cbGipCpuGroups The supdrvOSGipGetGroupTableSize return value we
1884 * used when allocating the GIP structure.
1885 */
1886static int supdrvGipInit(PSUPDRVDEVEXT pDevExt, PSUPGLOBALINFOPAGE pGip, RTHCPHYS HCPhys,
1887 uint64_t u64NanoTS, unsigned uUpdateHz, unsigned uUpdateIntervalNS,
1888 unsigned cCpus, size_t cbGipCpuGroups)
1889{
1890 size_t const cbGip = RT_ALIGN_Z(RT_UOFFSETOF_DYN(SUPGLOBALINFOPAGE, aCPUs[cCpus]) + cbGipCpuGroups, PAGE_SIZE);
1891 unsigned i;
1892#ifdef DEBUG_DARWIN_GIP
1893 OSDBGPRINT(("supdrvGipInit: pGip=%p HCPhys=%lx u64NanoTS=%llu uUpdateHz=%d cCpus=%u\n", pGip, (long)HCPhys, u64NanoTS, uUpdateHz, cCpus));
1894#else
1895 LogFlow(("supdrvGipInit: pGip=%p HCPhys=%lx u64NanoTS=%llu uUpdateHz=%d cCpus=%u\n", pGip, (long)HCPhys, u64NanoTS, uUpdateHz, cCpus));
1896#endif
1897
1898 /*
1899 * Initialize the structure.
1900 */
1901 memset(pGip, 0, cbGip);
1902
1903 pGip->u32Magic = SUPGLOBALINFOPAGE_MAGIC;
1904 pGip->u32Version = SUPGLOBALINFOPAGE_VERSION;
1905 pGip->u32Mode = supdrvGipInitDetermineTscMode(pDevExt);
1906 if ( pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC
1907 /*|| pGip->u32Mode == SUPGIPMODE_SYNC_TSC */)
1908 pGip->enmUseTscDelta = supdrvOSAreTscDeltasInSync() /* Allow OS override (windows). */
1909 ? SUPGIPUSETSCDELTA_ZERO_CLAIMED : SUPGIPUSETSCDELTA_PRACTICALLY_ZERO /* downgrade later */;
1910 else
1911 pGip->enmUseTscDelta = SUPGIPUSETSCDELTA_NOT_APPLICABLE;
1912 pGip->cCpus = (uint16_t)cCpus;
1913 pGip->cPages = (uint16_t)(cbGip / PAGE_SIZE);
1914 pGip->u32UpdateHz = uUpdateHz;
1915 pGip->u32UpdateIntervalNS = uUpdateIntervalNS;
1916 pGip->fGetGipCpu = SUPGIPGETCPU_APIC_ID;
1917 RTCpuSetEmpty(&pGip->OnlineCpuSet);
1918 RTCpuSetEmpty(&pGip->PresentCpuSet);
1919 RTMpGetSet(&pGip->PossibleCpuSet);
1920 pGip->cOnlineCpus = RTMpGetOnlineCount();
1921 pGip->cPresentCpus = RTMpGetPresentCount();
1922 pGip->cPossibleCpus = RTMpGetCount();
1923 pGip->cPossibleCpuGroups = 1;
1924 pGip->idCpuMax = RTMpGetMaxCpuId();
1925 for (i = 0; i < RT_ELEMENTS(pGip->aiCpuFromApicId); i++)
1926 pGip->aiCpuFromApicId[i] = UINT16_MAX;
1927 for (i = 0; i < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx); i++)
1928 pGip->aiCpuFromCpuSetIdx[i] = UINT16_MAX;
1929 for (i = 0; i < RT_ELEMENTS(pGip->aoffCpuGroup); i++)
1930 pGip->aoffCpuGroup[i] = UINT32_MAX;
1931 for (i = 0; i < cCpus; i++)
1932 supdrvGipInitCpu(pGip, &pGip->aCPUs[i], u64NanoTS, 0 /*uCpuHz*/);
1933#ifdef RT_OS_WINDOWS
1934 int rc = supdrvOSInitGipGroupTable(pDevExt, pGip, cbGipCpuGroups);
1935 AssertRCReturn(rc, rc);
1936#endif
1937
1938 /*
1939 * Link it to the device extension.
1940 */
1941 pDevExt->pGip = pGip;
1942 pDevExt->HCPhysGip = HCPhys;
1943 pDevExt->cGipUsers = 0;
1944
1945 return VINF_SUCCESS;
1946}
1947
1948
1949/**
1950 * Creates the GIP.
1951 *
1952 * @returns VBox status code.
1953 * @param pDevExt Instance data. GIP stuff may be updated.
1954 */
1955int VBOXCALL supdrvGipCreate(PSUPDRVDEVEXT pDevExt)
1956{
1957 PSUPGLOBALINFOPAGE pGip;
1958 size_t cbGip;
1959 size_t cbGipCpuGroups;
1960 RTHCPHYS HCPhysGip;
1961 uint32_t u32SystemResolution;
1962 uint32_t u32Interval;
1963 uint32_t u32MinInterval;
1964 uint32_t uMod;
1965 unsigned cCpus;
1966 int rc;
1967
1968 LogFlow(("supdrvGipCreate:\n"));
1969
1970 /*
1971 * Assert order.
1972 */
1973 Assert(pDevExt->u32SystemTimerGranularityGrant == 0);
1974 Assert(pDevExt->GipMemObj == NIL_RTR0MEMOBJ);
1975 Assert(!pDevExt->pGipTimer);
1976#ifdef SUPDRV_USE_MUTEX_FOR_GIP
1977 Assert(pDevExt->mtxGip != NIL_RTSEMMUTEX);
1978 Assert(pDevExt->mtxTscDelta != NIL_RTSEMMUTEX);
1979#else
1980 Assert(pDevExt->mtxGip != NIL_RTSEMFASTMUTEX);
1981 Assert(pDevExt->mtxTscDelta != NIL_RTSEMFASTMUTEX);
1982#endif
1983
1984 /*
1985 * Check the CPU count.
1986 */
1987 cCpus = RTMpGetArraySize();
1988 if (cCpus > RT_MIN(RTCPUSET_MAX_CPUS, RT_ELEMENTS(pGip->aiCpuFromApicId)))
1989 {
1990 SUPR0Printf("VBoxDrv: Too many CPUs (%u) for the GIP (max %u)\n", cCpus, RT_MIN(RTCPUSET_MAX_CPUS, RT_ELEMENTS(pGip->aiCpuFromApicId)));
1991 return VERR_TOO_MANY_CPUS;
1992 }
1993
1994 /*
1995 * Allocate a contiguous set of pages with a default kernel mapping.
1996 */
1997#ifdef RT_OS_WINDOWS
1998 cbGipCpuGroups = supdrvOSGipGetGroupTableSize(pDevExt);
1999#else
2000 cbGipCpuGroups = 0;
2001#endif
2002 cbGip = RT_UOFFSETOF_DYN(SUPGLOBALINFOPAGE, aCPUs[cCpus]) + cbGipCpuGroups;
2003 rc = RTR0MemObjAllocCont(&pDevExt->GipMemObj, cbGip, false /*fExecutable*/);
2004 if (RT_FAILURE(rc))
2005 {
2006 OSDBGPRINT(("supdrvGipCreate: failed to allocate the GIP page. rc=%d\n", rc));
2007 return rc;
2008 }
2009 pGip = (PSUPGLOBALINFOPAGE)RTR0MemObjAddress(pDevExt->GipMemObj); AssertPtr(pGip);
2010 HCPhysGip = RTR0MemObjGetPagePhysAddr(pDevExt->GipMemObj, 0); Assert(HCPhysGip != NIL_RTHCPHYS);
2011
2012 /*
2013 * Find a reasonable update interval and initialize the structure.
2014 */
2015 supdrvGipRequestHigherTimerFrequencyFromSystem(pDevExt);
2016 /** @todo figure out why using a 100Ms interval upsets timekeeping in VMs.
2017 * See @bugref{6710}. */
2018 u32MinInterval = RT_NS_10MS;
2019 u32SystemResolution = RTTimerGetSystemGranularity();
2020 u32Interval = u32MinInterval;
2021 uMod = u32MinInterval % u32SystemResolution;
2022 if (uMod)
2023 u32Interval += u32SystemResolution - uMod;
2024
2025 rc = supdrvGipInit(pDevExt, pGip, HCPhysGip, RTTimeSystemNanoTS(), RT_NS_1SEC / u32Interval /*=Hz*/, u32Interval,
2026 cCpus, cbGipCpuGroups);
2027
2028 /*
2029 * Important sanity check... (Sets rc)
2030 */
2031 if (RT_UNLIKELY( pGip->enmUseTscDelta == SUPGIPUSETSCDELTA_ZERO_CLAIMED
2032 && pGip->u32Mode == SUPGIPMODE_ASYNC_TSC
2033 && !supdrvOSGetForcedAsyncTscMode(pDevExt)))
2034 {
2035 OSDBGPRINT(("supdrvGipCreate: Host-OS/user claims the TSC-deltas are zero but we detected async. TSC! Bad.\n"));
2036 rc = VERR_INTERNAL_ERROR_2;
2037 }
2038
2039 /* It doesn't make sense to do TSC-delta detection on systems we detect as async. */
2040 AssertStmt( pGip->u32Mode != SUPGIPMODE_ASYNC_TSC
2041 || pGip->enmUseTscDelta <= SUPGIPUSETSCDELTA_ZERO_CLAIMED,
2042 rc = VERR_INTERNAL_ERROR_3);
2043
2044 /*
2045 * Do the TSC frequency measurements.
2046 *
2047 * If we're in invariant TSC mode, just to a quick preliminary measurement
2048 * that the TSC-delta measurement code can use to yield cross calls.
2049 *
2050 * If we're in any of the other two modes, neither which require MP init,
2051 * notifications or deltas for the job, do the full measurement now so
2052 * that supdrvGipInitOnCpu() can populate the TSC interval and history
2053 * array with more reasonable values.
2054 */
2055 if (RT_SUCCESS(rc))
2056 {
2057 if (pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC)
2058 {
2059 rc = supdrvGipInitMeasureTscFreq(pGip, true /*fRough*/); /* cannot fail */
2060 supdrvGipInitStartTimerForRefiningInvariantTscFreq(pDevExt);
2061 }
2062 else
2063 rc = supdrvGipInitMeasureTscFreq(pGip, false /*fRough*/);
2064 if (RT_SUCCESS(rc))
2065 {
2066 /*
2067 * Start TSC-delta measurement thread before we start getting MP
2068 * events that will try kick it into action (includes the
2069 * RTMpOnAll/supdrvGipInitOnCpu call below).
2070 */
2071 RTCpuSetEmpty(&pDevExt->TscDeltaCpuSet);
2072 RTCpuSetEmpty(&pDevExt->TscDeltaObtainedCpuSet);
2073#ifdef SUPDRV_USE_TSC_DELTA_THREAD
2074 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
2075 rc = supdrvTscDeltaThreadInit(pDevExt);
2076#endif
2077 if (RT_SUCCESS(rc))
2078 {
2079 rc = RTMpNotificationRegister(supdrvGipMpEvent, pDevExt);
2080 if (RT_SUCCESS(rc))
2081 {
2082 /*
2083 * Do GIP initialization on all online CPUs. Wake up the
2084 * TSC-delta thread afterwards.
2085 */
2086 rc = RTMpOnAll(supdrvGipInitOnCpu, pDevExt, pGip);
2087 if (RT_SUCCESS(rc))
2088 {
2089#ifdef SUPDRV_USE_TSC_DELTA_THREAD
2090 supdrvTscDeltaThreadStartMeasurement(pDevExt, true /* fForceAll */);
2091#else
2092 uint16_t iCpu;
2093 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
2094 {
2095 /*
2096 * Measure the TSC deltas now that we have MP notifications.
2097 */
2098 int cTries = 5;
2099 do
2100 {
2101 rc = supdrvTscMeasureInitialDeltas(pDevExt);
2102 if ( rc != VERR_TRY_AGAIN
2103 && rc != VERR_CPU_OFFLINE)
2104 break;
2105 } while (--cTries > 0);
2106 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
2107 Log(("supdrvTscDeltaInit: cpu[%u] delta %lld\n", iCpu, pGip->aCPUs[iCpu].i64TSCDelta));
2108 }
2109 else
2110 {
2111 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
2112 AssertMsg(!pGip->aCPUs[iCpu].i64TSCDelta, ("iCpu=%u %lld mode=%d\n", iCpu, pGip->aCPUs[iCpu].i64TSCDelta, pGip->u32Mode));
2113 }
2114 if (RT_SUCCESS(rc))
2115#endif
2116 {
2117 /*
2118 * Create the timer.
2119 * If CPU_ALL isn't supported we'll have to fall back to synchronous mode.
2120 */
2121 if (pGip->u32Mode == SUPGIPMODE_ASYNC_TSC)
2122 {
2123 rc = RTTimerCreateEx(&pDevExt->pGipTimer, u32Interval, RTTIMER_FLAGS_CPU_ALL,
2124 supdrvGipAsyncTimer, pDevExt);
2125 if (rc == VERR_NOT_SUPPORTED)
2126 {
2127 OSDBGPRINT(("supdrvGipCreate: omni timer not supported, falling back to synchronous mode\n"));
2128 pGip->u32Mode = SUPGIPMODE_SYNC_TSC;
2129 }
2130 }
2131 if (pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
2132 rc = RTTimerCreateEx(&pDevExt->pGipTimer, u32Interval, 0 /* fFlags */,
2133 supdrvGipSyncAndInvariantTimer, pDevExt);
2134 if (RT_SUCCESS(rc))
2135 {
2136 /*
2137 * We're good.
2138 */
2139 Log(("supdrvGipCreate: %u ns interval.\n", u32Interval));
2140 supdrvGipReleaseHigherTimerFrequencyFromSystem(pDevExt);
2141
2142 g_pSUPGlobalInfoPage = pGip;
2143 return VINF_SUCCESS;
2144 }
2145
2146 OSDBGPRINT(("supdrvGipCreate: failed create GIP timer at %u ns interval. rc=%Rrc\n", u32Interval, rc));
2147 Assert(!pDevExt->pGipTimer);
2148 }
2149 }
2150 else
2151 OSDBGPRINT(("supdrvGipCreate: RTMpOnAll failed. rc=%Rrc\n", rc));
2152 }
2153 else
2154 OSDBGPRINT(("supdrvGipCreate: failed to register MP event notfication. rc=%Rrc\n", rc));
2155 }
2156 else
2157 OSDBGPRINT(("supdrvGipCreate: supdrvTscDeltaInit failed. rc=%Rrc\n", rc));
2158 }
2159 else
2160 OSDBGPRINT(("supdrvGipCreate: supdrvTscMeasureInitialDeltas failed. rc=%Rrc\n", rc));
2161 }
2162
2163 /* Releases timer frequency increase too. */
2164 supdrvGipDestroy(pDevExt);
2165 return rc;
2166}
2167
2168
2169/**
2170 * Invalidates the GIP data upon termination.
2171 *
2172 * @param pGip Pointer to the read-write kernel mapping of the GIP.
2173 */
2174static void supdrvGipTerm(PSUPGLOBALINFOPAGE pGip)
2175{
2176 unsigned i;
2177 pGip->u32Magic = 0;
2178 for (i = 0; i < pGip->cCpus; i++)
2179 {
2180 pGip->aCPUs[i].u64NanoTS = 0;
2181 pGip->aCPUs[i].u64TSC = 0;
2182 pGip->aCPUs[i].iTSCHistoryHead = 0;
2183 pGip->aCPUs[i].u64TSCSample = 0;
2184 pGip->aCPUs[i].i64TSCDelta = INT64_MAX;
2185 }
2186}
2187
2188
2189/**
2190 * Terminates the GIP.
2191 *
2192 * @param pDevExt Instance data. GIP stuff may be updated.
2193 */
2194void VBOXCALL supdrvGipDestroy(PSUPDRVDEVEXT pDevExt)
2195{
2196 int rc;
2197#ifdef DEBUG_DARWIN_GIP
2198 OSDBGPRINT(("supdrvGipDestroy: pDevExt=%p pGip=%p pGipTimer=%p GipMemObj=%p\n", pDevExt,
2199 pDevExt->GipMemObj != NIL_RTR0MEMOBJ ? RTR0MemObjAddress(pDevExt->GipMemObj) : NULL,
2200 pDevExt->pGipTimer, pDevExt->GipMemObj));
2201#endif
2202
2203 /*
2204 * Stop receiving MP notifications before tearing anything else down.
2205 */
2206 RTMpNotificationDeregister(supdrvGipMpEvent, pDevExt);
2207
2208#ifdef SUPDRV_USE_TSC_DELTA_THREAD
2209 /*
2210 * Terminate the TSC-delta measurement thread and resources.
2211 */
2212 supdrvTscDeltaTerm(pDevExt);
2213#endif
2214
2215 /*
2216 * Destroy the TSC-refinement timer.
2217 */
2218 if (pDevExt->pInvarTscRefineTimer)
2219 {
2220 RTTimerDestroy(pDevExt->pInvarTscRefineTimer);
2221 pDevExt->pInvarTscRefineTimer = NULL;
2222 }
2223
2224 /*
2225 * Invalid the GIP data.
2226 */
2227 if (pDevExt->pGip)
2228 {
2229 supdrvGipTerm(pDevExt->pGip);
2230 pDevExt->pGip = NULL;
2231 }
2232 g_pSUPGlobalInfoPage = NULL;
2233
2234 /*
2235 * Destroy the timer and free the GIP memory object.
2236 */
2237 if (pDevExt->pGipTimer)
2238 {
2239 rc = RTTimerDestroy(pDevExt->pGipTimer); AssertRC(rc);
2240 pDevExt->pGipTimer = NULL;
2241 }
2242
2243 if (pDevExt->GipMemObj != NIL_RTR0MEMOBJ)
2244 {
2245 rc = RTR0MemObjFree(pDevExt->GipMemObj, true /* free mappings */); AssertRC(rc);
2246 pDevExt->GipMemObj = NIL_RTR0MEMOBJ;
2247 }
2248
2249 /*
2250 * Finally, make sure we've release the system timer resolution request
2251 * if one actually succeeded and is still pending.
2252 */
2253 supdrvGipReleaseHigherTimerFrequencyFromSystem(pDevExt);
2254}
2255
2256
2257
2258
2259/*
2260 *
2261 *
2262 * GIP Update Timer Related Code
2263 * GIP Update Timer Related Code
2264 * GIP Update Timer Related Code
2265 *
2266 *
2267 */
2268
2269
2270/**
2271 * Worker routine for supdrvGipUpdate() and supdrvGipUpdatePerCpu() that
2272 * updates all the per cpu data except the transaction id.
2273 *
2274 * @param pDevExt The device extension.
2275 * @param pGipCpu Pointer to the per cpu data.
2276 * @param u64NanoTS The current time stamp.
2277 * @param u64TSC The current TSC.
2278 * @param iTick The current timer tick.
2279 *
2280 * @remarks Can be called with interrupts disabled!
2281 */
2282static void supdrvGipDoUpdateCpu(PSUPDRVDEVEXT pDevExt, PSUPGIPCPU pGipCpu, uint64_t u64NanoTS, uint64_t u64TSC, uint64_t iTick)
2283{
2284 uint64_t u64TSCDelta;
2285 bool fUpdateCpuHz;
2286 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
2287 AssertPtrReturnVoid(pGip);
2288
2289 /* Delta between this and the previous update. */
2290 ASMAtomicUoWriteU32(&pGipCpu->u32PrevUpdateIntervalNS, (uint32_t)(u64NanoTS - pGipCpu->u64NanoTS));
2291
2292 /*
2293 * Update the NanoTS.
2294 */
2295 ASMAtomicWriteU64(&pGipCpu->u64NanoTS, u64NanoTS);
2296
2297 /*
2298 * Calc TSC delta.
2299 */
2300 u64TSCDelta = u64TSC - pGipCpu->u64TSC;
2301 ASMAtomicWriteU64(&pGipCpu->u64TSC, u64TSC);
2302
2303 /*
2304 * Determine if we need to update the CPU (TSC) frequency calculation.
2305 *
2306 * We don't need to keep recalculating the frequency when it's invariant,
2307 * unless the special tstGIP-2 testing mode is enabled.
2308 */
2309 fUpdateCpuHz = pGip->u32Mode != SUPGIPMODE_INVARIANT_TSC;
2310 if (!(pGip->fFlags & SUPGIP_FLAGS_TESTING))
2311 { /* likely*/ }
2312 else
2313 {
2314 uint32_t fGipFlags = pGip->fFlags;
2315 if (fGipFlags & (SUPGIP_FLAGS_TESTING_ENABLE | SUPGIP_FLAGS_TESTING_START))
2316 {
2317 if (fGipFlags & SUPGIP_FLAGS_TESTING_START)
2318 {
2319 /* Cache the TSC frequency before forcing updates due to test mode. */
2320 if (!fUpdateCpuHz)
2321 pDevExt->uGipTestModeInvariantCpuHz = pGip->aCPUs[0].u64CpuHz;
2322 ASMAtomicAndU32(&pGip->fFlags, ~SUPGIP_FLAGS_TESTING_START);
2323 }
2324 fUpdateCpuHz = true;
2325 }
2326 else if (fGipFlags & SUPGIP_FLAGS_TESTING_STOP)
2327 {
2328 /* Restore the cached TSC frequency if any. */
2329 if (!fUpdateCpuHz)
2330 {
2331 Assert(pDevExt->uGipTestModeInvariantCpuHz);
2332 ASMAtomicWriteU64(&pGip->aCPUs[0].u64CpuHz, pDevExt->uGipTestModeInvariantCpuHz);
2333 }
2334 ASMAtomicAndU32(&pGip->fFlags, ~(SUPGIP_FLAGS_TESTING_STOP | SUPGIP_FLAGS_TESTING));
2335 }
2336 }
2337
2338 /*
2339 * Calculate the CPU (TSC) frequency if necessary.
2340 */
2341 if (fUpdateCpuHz)
2342 {
2343 uint64_t u64CpuHz;
2344 uint32_t u32UpdateIntervalTSC;
2345 uint32_t u32UpdateIntervalTSCSlack;
2346 uint32_t u32TransactionId;
2347 unsigned iTSCHistoryHead;
2348
2349 if (u64TSCDelta >> 32)
2350 {
2351 u64TSCDelta = pGipCpu->u32UpdateIntervalTSC;
2352 pGipCpu->cErrors++;
2353 }
2354
2355 /*
2356 * On the 2nd and 3rd callout, reset the history with the current TSC
2357 * interval since the values entered by supdrvGipInit are totally off.
2358 * The interval on the 1st callout completely unreliable, the 2nd is a bit
2359 * better, while the 3rd should be most reliable.
2360 */
2361 /** @todo Could we drop this now that we initializes the history
2362 * with nominal TSC frequency values? */
2363 u32TransactionId = pGipCpu->u32TransactionId;
2364 if (RT_UNLIKELY( ( u32TransactionId == 5
2365 || u32TransactionId == 7)
2366 && ( iTick == 2
2367 || iTick == 3) ))
2368 {
2369 unsigned i;
2370 for (i = 0; i < RT_ELEMENTS(pGipCpu->au32TSCHistory); i++)
2371 ASMAtomicUoWriteU32(&pGipCpu->au32TSCHistory[i], (uint32_t)u64TSCDelta);
2372 }
2373
2374 /*
2375 * Validate the NanoTS deltas between timer fires with an arbitrary threshold of 0.5%.
2376 * Wait until we have at least one full history since the above history reset. The
2377 * assumption is that the majority of the previous history values will be tolerable.
2378 * See @bugref{6710#c67}.
2379 */
2380 /** @todo Could we drop the fudging there now that we initializes the history
2381 * with nominal TSC frequency values? */
2382 if ( u32TransactionId > 23 /* 7 + (8 * 2) */
2383 && pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
2384 {
2385 uint32_t uNanoTsThreshold = pGip->u32UpdateIntervalNS / 200;
2386 if ( pGipCpu->u32PrevUpdateIntervalNS > pGip->u32UpdateIntervalNS + uNanoTsThreshold
2387 || pGipCpu->u32PrevUpdateIntervalNS < pGip->u32UpdateIntervalNS - uNanoTsThreshold)
2388 {
2389 uint32_t u32;
2390 u32 = pGipCpu->au32TSCHistory[0];
2391 u32 += pGipCpu->au32TSCHistory[1];
2392 u32 += pGipCpu->au32TSCHistory[2];
2393 u32 += pGipCpu->au32TSCHistory[3];
2394 u32 >>= 2;
2395 u64TSCDelta = pGipCpu->au32TSCHistory[4];
2396 u64TSCDelta += pGipCpu->au32TSCHistory[5];
2397 u64TSCDelta += pGipCpu->au32TSCHistory[6];
2398 u64TSCDelta += pGipCpu->au32TSCHistory[7];
2399 u64TSCDelta >>= 2;
2400 u64TSCDelta += u32;
2401 u64TSCDelta >>= 1;
2402 }
2403 }
2404
2405 /*
2406 * TSC History.
2407 */
2408 Assert(RT_ELEMENTS(pGipCpu->au32TSCHistory) == 8);
2409 iTSCHistoryHead = (pGipCpu->iTSCHistoryHead + 1) & 7;
2410 ASMAtomicWriteU32(&pGipCpu->iTSCHistoryHead, iTSCHistoryHead);
2411 ASMAtomicWriteU32(&pGipCpu->au32TSCHistory[iTSCHistoryHead], (uint32_t)u64TSCDelta);
2412
2413 /*
2414 * UpdateIntervalTSC = average of last 8,2,1 intervals depending on update HZ.
2415 *
2416 * On Windows, we have an occasional (but recurring) sour value that messed up
2417 * the history but taking only 1 interval reduces the precision overall.
2418 */
2419 if ( pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC
2420 || pGip->u32UpdateHz >= 1000)
2421 {
2422 uint32_t u32;
2423 u32 = pGipCpu->au32TSCHistory[0];
2424 u32 += pGipCpu->au32TSCHistory[1];
2425 u32 += pGipCpu->au32TSCHistory[2];
2426 u32 += pGipCpu->au32TSCHistory[3];
2427 u32 >>= 2;
2428 u32UpdateIntervalTSC = pGipCpu->au32TSCHistory[4];
2429 u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[5];
2430 u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[6];
2431 u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[7];
2432 u32UpdateIntervalTSC >>= 2;
2433 u32UpdateIntervalTSC += u32;
2434 u32UpdateIntervalTSC >>= 1;
2435
2436 /* Value chosen for a 2GHz Athlon64 running linux 2.6.10/11. */
2437 u32UpdateIntervalTSCSlack = u32UpdateIntervalTSC >> 14;
2438 }
2439 else if (pGip->u32UpdateHz >= 90)
2440 {
2441 u32UpdateIntervalTSC = (uint32_t)u64TSCDelta;
2442 u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[(iTSCHistoryHead - 1) & 7];
2443 u32UpdateIntervalTSC >>= 1;
2444
2445 /* value chosen on a 2GHz thinkpad running windows */
2446 u32UpdateIntervalTSCSlack = u32UpdateIntervalTSC >> 7;
2447 }
2448 else
2449 {
2450 u32UpdateIntervalTSC = (uint32_t)u64TSCDelta;
2451
2452 /* This value hasn't be checked yet.. waiting for OS/2 and 33Hz timers.. :-) */
2453 u32UpdateIntervalTSCSlack = u32UpdateIntervalTSC >> 6;
2454 }
2455 ASMAtomicWriteU32(&pGipCpu->u32UpdateIntervalTSC, u32UpdateIntervalTSC + u32UpdateIntervalTSCSlack);
2456
2457 /*
2458 * CpuHz.
2459 */
2460 u64CpuHz = ASMMult2xU32RetU64(u32UpdateIntervalTSC, RT_NS_1SEC);
2461 u64CpuHz /= pGip->u32UpdateIntervalNS;
2462 ASMAtomicWriteU64(&pGipCpu->u64CpuHz, u64CpuHz);
2463 }
2464}
2465
2466
2467/**
2468 * Updates the GIP.
2469 *
2470 * @param pDevExt The device extension.
2471 * @param u64NanoTS The current nanosecond timestamp.
2472 * @param u64TSC The current TSC timestamp.
2473 * @param idCpu The CPU ID.
2474 * @param iTick The current timer tick.
2475 *
2476 * @remarks Can be called with interrupts disabled!
2477 */
2478static void supdrvGipUpdate(PSUPDRVDEVEXT pDevExt, uint64_t u64NanoTS, uint64_t u64TSC, RTCPUID idCpu, uint64_t iTick)
2479{
2480 /*
2481 * Determine the relevant CPU data.
2482 */
2483 PSUPGIPCPU pGipCpu;
2484 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
2485 AssertPtrReturnVoid(pGip);
2486
2487 if (pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
2488 pGipCpu = &pGip->aCPUs[0];
2489 else
2490 {
2491 unsigned iCpu;
2492 uint32_t idApic = supdrvGipGetApicId(pGip);
2493 if (RT_LIKELY(idApic < RT_ELEMENTS(pGip->aiCpuFromApicId)))
2494 { /* likely */ }
2495 else
2496 return;
2497 iCpu = pGip->aiCpuFromApicId[idApic];
2498 if (RT_LIKELY(iCpu < pGip->cCpus))
2499 { /* likely */ }
2500 else
2501 return;
2502 pGipCpu = &pGip->aCPUs[iCpu];
2503 if (RT_LIKELY(pGipCpu->idCpu == idCpu))
2504 { /* likely */ }
2505 else
2506 return;
2507 }
2508
2509 /*
2510 * Start update transaction.
2511 */
2512 if (!(ASMAtomicIncU32(&pGipCpu->u32TransactionId) & 1))
2513 {
2514 /* this can happen on win32 if we're taking to long and there are more CPUs around. shouldn't happen though. */
2515 AssertMsgFailed(("Invalid transaction id, %#x, not odd!\n", pGipCpu->u32TransactionId));
2516 ASMAtomicIncU32(&pGipCpu->u32TransactionId);
2517 pGipCpu->cErrors++;
2518 return;
2519 }
2520
2521 /*
2522 * Recalc the update frequency every 0x800th time.
2523 */
2524 if ( pGip->u32Mode != SUPGIPMODE_INVARIANT_TSC /* cuz we're not recalculating the frequency on invariant hosts. */
2525 && !(pGipCpu->u32TransactionId & (GIP_UPDATEHZ_RECALC_FREQ * 2 - 2)))
2526 {
2527 if (pGip->u64NanoTSLastUpdateHz)
2528 {
2529#ifdef RT_ARCH_AMD64 /** @todo fix 64-bit div here to work on x86 linux. */
2530 uint64_t u64Delta = u64NanoTS - pGip->u64NanoTSLastUpdateHz;
2531 uint32_t u32UpdateHz = (uint32_t)((RT_NS_1SEC_64 * GIP_UPDATEHZ_RECALC_FREQ) / u64Delta);
2532 if (u32UpdateHz <= 2000 && u32UpdateHz >= 30)
2533 {
2534 /** @todo r=ramshankar: Changing u32UpdateHz might screw up TSC frequency
2535 * calculation on non-invariant hosts if it changes the history decision
2536 * taken in supdrvGipDoUpdateCpu(). */
2537 uint64_t u64Interval = u64Delta / GIP_UPDATEHZ_RECALC_FREQ;
2538 ASMAtomicWriteU32(&pGip->u32UpdateHz, u32UpdateHz);
2539 ASMAtomicWriteU32(&pGip->u32UpdateIntervalNS, (uint32_t)u64Interval);
2540 }
2541#endif
2542 }
2543 ASMAtomicWriteU64(&pGip->u64NanoTSLastUpdateHz, u64NanoTS | 1);
2544 }
2545
2546 /*
2547 * Update the data.
2548 */
2549 supdrvGipDoUpdateCpu(pDevExt, pGipCpu, u64NanoTS, u64TSC, iTick);
2550
2551 /*
2552 * Complete transaction.
2553 */
2554 ASMAtomicIncU32(&pGipCpu->u32TransactionId);
2555}
2556
2557
2558/**
2559 * Updates the per cpu GIP data for the calling cpu.
2560 *
2561 * @param pDevExt The device extension.
2562 * @param u64NanoTS The current nanosecond timestamp.
2563 * @param u64TSC The current TSC timesaver.
2564 * @param idCpu The CPU ID.
2565 * @param idApic The APIC id for the CPU index.
2566 * @param iTick The current timer tick.
2567 *
2568 * @remarks Can be called with interrupts disabled!
2569 */
2570static void supdrvGipUpdatePerCpu(PSUPDRVDEVEXT pDevExt, uint64_t u64NanoTS, uint64_t u64TSC,
2571 RTCPUID idCpu, uint8_t idApic, uint64_t iTick)
2572{
2573 uint32_t iCpu;
2574 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
2575
2576 /*
2577 * Avoid a potential race when a CPU online notification doesn't fire on
2578 * the onlined CPU but the tick creeps in before the event notification is
2579 * run.
2580 */
2581 if (RT_LIKELY(iTick != 1))
2582 { /* likely*/ }
2583 else
2584 {
2585 iCpu = supdrvGipFindOrAllocCpuIndexForCpuId(pGip, idCpu);
2586 if (pGip->aCPUs[iCpu].enmState == SUPGIPCPUSTATE_OFFLINE)
2587 supdrvGipMpEventOnlineOrInitOnCpu(pDevExt, idCpu);
2588 }
2589
2590 iCpu = pGip->aiCpuFromApicId[idApic];
2591 if (RT_LIKELY(iCpu < pGip->cCpus))
2592 {
2593 PSUPGIPCPU pGipCpu = &pGip->aCPUs[iCpu];
2594 if (pGipCpu->idCpu == idCpu)
2595 {
2596 /*
2597 * Start update transaction.
2598 */
2599 if (!(ASMAtomicIncU32(&pGipCpu->u32TransactionId) & 1))
2600 {
2601 AssertMsgFailed(("Invalid transaction id, %#x, not odd!\n", pGipCpu->u32TransactionId));
2602 ASMAtomicIncU32(&pGipCpu->u32TransactionId);
2603 pGipCpu->cErrors++;
2604 return;
2605 }
2606
2607 /*
2608 * Update the data.
2609 */
2610 supdrvGipDoUpdateCpu(pDevExt, pGipCpu, u64NanoTS, u64TSC, iTick);
2611
2612 /*
2613 * Complete transaction.
2614 */
2615 ASMAtomicIncU32(&pGipCpu->u32TransactionId);
2616 }
2617 }
2618}
2619
2620
2621/**
2622 * Timer callback function for the sync and invariant GIP modes.
2623 *
2624 * @param pTimer The timer.
2625 * @param pvUser Opaque pointer to the device extension.
2626 * @param iTick The timer tick.
2627 */
2628static DECLCALLBACK(void) supdrvGipSyncAndInvariantTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
2629{
2630 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
2631 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
2632 RTCCUINTREG fEFlags = ASMIntDisableFlags(); /* No interruptions please (real problem on S10). */
2633 uint64_t u64TSC = ASMReadTSC();
2634 uint64_t u64NanoTS = RTTimeSystemNanoTS();
2635 RT_NOREF1(pTimer);
2636
2637 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_PRACTICALLY_ZERO)
2638 {
2639 /*
2640 * The calculations in supdrvGipUpdate() is somewhat timing sensitive,
2641 * missing timer ticks is not an option for GIP because the GIP users
2642 * will end up incrementing the time in 1ns per time getter call until
2643 * there is a complete timer update. So, if the delta has yet to be
2644 * calculated, we just pretend it is zero for now (the GIP users
2645 * probably won't have it for a wee while either and will do the same).
2646 *
2647 * We could maybe on some platforms try cross calling a CPU with a
2648 * working delta here, but it's not worth the hassle since the
2649 * likelihood of this happening is really low. On Windows, Linux, and
2650 * Solaris timers fire on the CPU they were registered/started on.
2651 * Darwin timers doesn't necessarily (they are high priority threads).
2652 */
2653 uint32_t iCpuSet = RTMpCpuIdToSetIndex(RTMpCpuId());
2654 uint16_t iGipCpu = RT_LIKELY(iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx))
2655 ? pGip->aiCpuFromCpuSetIdx[iCpuSet] : UINT16_MAX;
2656 Assert(!ASMIntAreEnabled());
2657 if (RT_LIKELY(iGipCpu < pGip->cCpus))
2658 {
2659 int64_t iTscDelta = pGip->aCPUs[iGipCpu].i64TSCDelta;
2660 if (iTscDelta != INT64_MAX)
2661 u64TSC -= iTscDelta;
2662 }
2663 }
2664
2665 supdrvGipUpdate(pDevExt, u64NanoTS, u64TSC, NIL_RTCPUID, iTick);
2666
2667 ASMSetFlags(fEFlags);
2668}
2669
2670
2671/**
2672 * Timer callback function for async GIP mode.
2673 * @param pTimer The timer.
2674 * @param pvUser Opaque pointer to the device extension.
2675 * @param iTick The timer tick.
2676 */
2677static DECLCALLBACK(void) supdrvGipAsyncTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
2678{
2679 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
2680 RTCCUINTREG fEFlags = ASMIntDisableFlags(); /* No interruptions please (real problem on S10). */
2681 RTCPUID idCpu = RTMpCpuId();
2682 uint64_t u64TSC = ASMReadTSC();
2683 uint64_t NanoTS = RTTimeSystemNanoTS();
2684 RT_NOREF1(pTimer);
2685
2686 /** @todo reset the transaction number and whatnot when iTick == 1. */
2687 if (pDevExt->idGipMaster == idCpu)
2688 supdrvGipUpdate(pDevExt, NanoTS, u64TSC, idCpu, iTick);
2689 else
2690 supdrvGipUpdatePerCpu(pDevExt, NanoTS, u64TSC, idCpu, supdrvGipGetApicId(pDevExt->pGip), iTick);
2691
2692 ASMSetFlags(fEFlags);
2693}
2694
2695
2696
2697
2698/*
2699 *
2700 *
2701 * TSC Delta Measurements And Related Code
2702 * TSC Delta Measurements And Related Code
2703 * TSC Delta Measurements And Related Code
2704 *
2705 *
2706 */
2707
2708
/*
 * Select the TSC delta measurement algorithm.
 *
 * Method #2 is the one that is compiled in; change the condition to zero to
 * build with method #1 instead.
 */
#if 1
# define GIP_TSC_DELTA_METHOD_2
#else
# define GIP_TSC_DELTA_METHOD_1
#endif
2717
/** Used for padding variables to keep them away from other cache lines.
 * Better too large than too small!
 * @remarks Current AMD64 and x86 CPUs seem to use 64 bytes.  There are claims
 *          that NetBurst had 128 byte cache lines, while the 486 thru Pentium
 *          III had 32 byte cache lines. */
#define GIP_TSC_DELTA_CACHE_LINE_SIZE           128
2724
2725
/**
 * TSC delta measurement algorithm \#2 result entry.
 *
 * One entry pairs two sequence numbers with a TSC value.
 */
typedef struct SUPDRVTSCDELTAMETHOD2ENTRY
{
    /** Sequence number; presumably that of the CPU taking the sample - confirm
     *  against the method \#2 code. */
    uint32_t    iSeqMine;
    /** Sequence number; presumably the one observed for the other CPU - confirm
     *  against the method \#2 code. */
    uint32_t    iSeqOther;
    /** The TSC value. */
    uint64_t    uTsc;
} SUPDRVTSCDELTAMETHOD2ENTRY;
2735
2736/**
2737 * TSC delta measurement algorithm \#2 Data.
2738 */
2739typedef struct SUPDRVTSCDELTAMETHOD2
2740{
2741 /** Padding to make sure the iCurSeqNo is in its own cache line. */
2742 uint64_t au64CacheLinePaddingBefore[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t)];
2743 /** The current sequence number of this worker. */
2744 uint32_t volatile iCurSeqNo;
2745 /** Padding to make sure the iCurSeqNo is in its own cache line. */
2746 uint32_t au64CacheLinePaddingAfter[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint32_t) - 1];
2747 /** Result table. */
2748 SUPDRVTSCDELTAMETHOD2ENTRY aResults[64];
2749} SUPDRVTSCDELTAMETHOD2;
2750/** Pointer to the data for TSC delta measurement algorithm \#2 .*/
2751typedef SUPDRVTSCDELTAMETHOD2 *PSUPDRVTSCDELTAMETHOD2;
2752
2753
2754/**
2755 * The TSC delta synchronization struct, version 2.
2756 *
2757 * The synchronization variable is completely isolated in its own cache line
2758 * (provided our max cache line size estimate is correct).
2759 */
2760typedef struct SUPTSCDELTASYNC2
2761{
2762 /** Padding to make sure the uVar1 is in its own cache line. */
2763 uint64_t au64CacheLinePaddingBefore[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t)];
2764
2765 /** The synchronization variable, holds values GIP_TSC_DELTA_SYNC_*. */
2766 volatile uint32_t uSyncVar;
2767 /** Sequence synchronizing variable used for post 'GO' synchronization. */
2768 volatile uint32_t uSyncSeq;
2769
2770 /** Padding to make sure the uVar1 is in its own cache line. */
2771 uint64_t au64CacheLinePaddingAfter[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t) - 2];
2772
2773 /** Start RDTSC value. Put here mainly to save stack space. */
2774 uint64_t uTscStart;
2775 /** Copy of SUPDRVGIPTSCDELTARGS::cMaxTscTicks. */
2776 uint64_t cMaxTscTicks;
2777} SUPTSCDELTASYNC2;
2778AssertCompileSize(SUPTSCDELTASYNC2, GIP_TSC_DELTA_CACHE_LINE_SIZE * 2 + sizeof(uint64_t));
2779typedef SUPTSCDELTASYNC2 *PSUPTSCDELTASYNC2;
2780
/** Prestart: waiting. */
#define GIP_TSC_DELTA_SYNC2_PRESTART_WAIT    UINT32_C(0x0ffe)
/** Prestart: aborted. */
#define GIP_TSC_DELTA_SYNC2_PRESTART_ABORT   UINT32_C(0x0fff)
/** Ready (on your mark). */
#define GIP_TSC_DELTA_SYNC2_READY            UINT32_C(0x1000)
/** Steady (get set). */
#define GIP_TSC_DELTA_SYNC2_STEADY           UINT32_C(0x1001)
/** Go! */
#define GIP_TSC_DELTA_SYNC2_GO               UINT32_C(0x1002)
/** Used by the verification test. */
#define GIP_TSC_DELTA_SYNC2_GO_GO            UINT32_C(0x1003)

/** We reached the time limit. */
#define GIP_TSC_DELTA_SYNC2_TIMEOUT          UINT32_C(0x1ffe)
/** The other party won't touch the sync struct ever again. */
#define GIP_TSC_DELTA_SYNC2_FINAL            UINT32_C(0x1fff)
2798
2799
2800/**
2801 * Argument package/state passed by supdrvTscMeasureDeltaOne() to the RTMpOn
2802 * callback worker.
2803 * @todo add
2804 */
2805typedef struct SUPDRVGIPTSCDELTARGS
2806{
2807 /** The device extension. */
2808 PSUPDRVDEVEXT pDevExt;
2809 /** Pointer to the GIP CPU array entry for the worker. */
2810 PSUPGIPCPU pWorker;
2811 /** Pointer to the GIP CPU array entry for the master. */
2812 PSUPGIPCPU pMaster;
2813 /** The maximum number of ticks to spend in supdrvTscMeasureDeltaCallback.
2814 * (This is what we need a rough TSC frequency for.) */
2815 uint64_t cMaxTscTicks;
2816 /** Used to abort synchronization setup.  Set to true by
 * supdrvTscMeasureDeltaCallbackAbortSyncSetup() and polled by the party
 * that is still waiting for its partner during setup. */
2817 bool volatile fAbortSetup;
2818
2819 /** Padding to make sure the master variables live in their own cache lines. */
2820 uint64_t au64CacheLinePaddingBefore[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t)];
2821
2822 /** @name Master
2823 * @{ */
2824 /** The time the master spent in the MP worker. */
2825 uint64_t cElapsedMasterTscTicks;
2826 /** The iTry value when stopped at. */
2827 uint32_t iTry;
2828 /** Set if the run timed out.  Written by supdrvTscDeltaSync2_Before() when
 * its wait times out and by the abort-setup helper when called with
 * fTimeout set. */
2829 bool volatile fTimedOut;
2830 /** Pointer to the master's synchronization struct (on stack). */
2831 PSUPTSCDELTASYNC2 volatile pSyncMaster;
2832 /** Master data union. */
2833 union
2834 {
2835 /** Data (master) for delta verification. */
2836 struct
2837 {
2838 /** Verification test TSC values for the master. */
2839 uint64_t volatile auTscs[32];
2840 } Verify;
2841 /** Data (master) for measurement method \#2. */
2842 struct
2843 {
2844 /** Data and sequence number. */
2845 SUPDRVTSCDELTAMETHOD2 Data;
2846 /** The lag setting for the next run. */
2847 bool fLag;
2848 /** Number of hits, i.e. matching master/worker table entries; added up
 * by supdrvTscDeltaMethod2ProcessDataOnMaster() on each call. */
2849 uint32_t cHits;
2850 } M2;
2851 } uMaster;
2852 /** The verifier verdict, VINF_SUCCESS if ok, VERR_OUT_OF_RANGE if not,
2853 * VERR_TRY_AGAIN on timeout. */
2854 int32_t rcVerify;
2855#ifdef TSCDELTA_VERIFY_WITH_STATS
2856 /** The maximum difference between TSC read during delta verification. */
2857 int64_t cMaxVerifyTscTicks;
2858 /** The minimum difference between two TSC reads during verification. */
2859 int64_t cMinVerifyTscTicks;
2860 /** The bad TSC diff, worker relative to master (= worker - master).
2861 * Negative value means the worker is behind the master. */
2862 int64_t iVerifyBadTscDiff;
2863#endif
2864 /** @} */
2865
2866 /** Padding to make sure the worker variables live in their own cache line. */
2867 uint64_t au64CacheLinePaddingBetween[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t)];
2868
2869 /** @name Proletarian
2870 * @{ */
2871 /** Pointer to the worker's synchronization struct (on stack). */
2872 PSUPTSCDELTASYNC2 volatile pSyncWorker;
2873 /** The time the worker spent in the MP worker. */
2874 uint64_t cElapsedWorkerTscTicks;
2875 /** Worker data union. */
2876 union
2877 {
2878 /** Data (worker) for delta verification. */
2879 struct
2880 {
2881 /** Verification test TSC values for the worker. */
2882 uint64_t volatile auTscs[32];
2883 } Verify;
2884 /** Data (worker) for measurement method \#2. */
2885 struct
2886 {
2887 /** Data and sequence number. */
2888 SUPDRVTSCDELTAMETHOD2 Data;
2889 /** The lag setting for the next run (set by master). */
2890 bool fLag;
2891 } M2;
2892 } uWorker;
2893 /** @} */
2894
2895 /** Padding to make sure the above is in its own cache line. */
2896 uint64_t au64CacheLinePaddingAfter[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t)];
2897} SUPDRVGIPTSCDELTARGS;
/** Pointer to a SUPDRVGIPTSCDELTARGS structure. */
2898typedef SUPDRVGIPTSCDELTARGS *PSUPDRVGIPTSCDELTARGS;
2899
2900
2901/** @name Macros that implement the basic synchronization steps common to
2902 * the algorithms.
2903 *
2904 * Must be used from a loop as the timeouts are implemented via 'break' statements
2905 * at the moment.
2906 *
2907 * @{
2908 */
/*
 * Spin-loop debugging aids.  When DEBUG_bird is defined, TSCDELTA_DBG_VARS
 * declares an iteration counter, TSCDELTA_DBG_START_LOOP resets it and
 * TSCDELTA_DBG_CHECK_LOOP hits a breakpoint each time the low 25 bits of the
 * counter wrap to zero.  Otherwise all three expand to a no-op expression.
 */
2909#if defined(DEBUG_bird) /* || defined(VBOX_STRICT) */
2910# define TSCDELTA_DBG_VARS() uint32_t iDbgCounter
2911# define TSCDELTA_DBG_START_LOOP() do { iDbgCounter = 0; } while (0)
2912# define TSCDELTA_DBG_CHECK_LOOP() \
2913 do { iDbgCounter++; if ((iDbgCounter & UINT32_C(0x01ffffff)) == 0) RT_BREAKPOINT(); } while (0)
2914#else
2915# define TSCDELTA_DBG_VARS() ((void)0)
2916# define TSCDELTA_DBG_START_LOOP() ((void)0)
2917# define TSCDELTA_DBG_CHECK_LOOP() ((void)0)
2918#endif
/*
 * Three independently switchable trace levels.  Each takes a parenthesized
 * SUPR0Printf argument list; flip the corresponding '#if 0' to '#if 1' to
 * enable it.  All are compiled out as the code stands.
 */
2919#if 0
2920# define TSCDELTA_DBG_SYNC_MSG(a_Args) SUPR0Printf a_Args
2921#else
2922# define TSCDELTA_DBG_SYNC_MSG(a_Args) ((void)0)
2923#endif
2924#if 0
2925# define TSCDELTA_DBG_SYNC_MSG2(a_Args) SUPR0Printf a_Args
2926#else
2927# define TSCDELTA_DBG_SYNC_MSG2(a_Args) ((void)0)
2928#endif
2929#if 0
2930# define TSCDELTA_DBG_SYNC_MSG9(a_Args) SUPR0Printf a_Args
2931#else
2932# define TSCDELTA_DBG_SYNC_MSG9(a_Args) ((void)0)
2933#endif
2934
2935
/**
 * Performs the handshake that precedes a measurement/verification round.
 *
 * Both parties walk their uSyncVar through READY -> STEADY -> GO.  Timeouts
 * are only processed while waiting for STEADY.  Once GO has been exchanged
 * the two sides try, for a bounded number of attempts, to enter roughly
 * lockstep execution by exchanging uSyncSeq values.
 *
 * @returns true when the handshake completed; the wait for STEADY was left
 *          with interrupts disabled and the flags to restore are stored in
 *          @a *pfEFlags.  false on abort or timeout; any flags saved by this
 *          function have been restored before returning.
 * @param   pMySync     My synchronization structure.
 * @param   pOtherSync  My partner's synchronization structure.
 * @param   fIsMaster   Set if master, clear if worker.
 * @param   pfEFlags    Where to return the saved flags.  Always initialized,
 *                      also on failure.
 * @param   pArgs       The argument/state data; only fTimedOut is written
 *                      here (on timeout).
 */
2936static bool supdrvTscDeltaSync2_Before(PSUPTSCDELTASYNC2 pMySync, PSUPTSCDELTASYNC2 pOtherSync,
2937 bool fIsMaster, PRTCCUINTREG pfEFlags, PSUPDRVGIPTSCDELTARGS pArgs)
2938{
/* Master and worker use disjoint sequence number ranges (0.. and 256..). */
2939 uint32_t iMySeq = fIsMaster ? 0 : 256;
2940 uint32_t const iMaxSeq = iMySeq + 16; /* For the last loop, darn linux/freebsd C-ishness. */
2941 uint32_t u32Tmp;
2942 uint32_t iSync2Loops = 0;
2943 RTCCUINTREG fEFlags;
2944 TSCDELTA_DBG_VARS();
2945
2946 *pfEFlags = X86_EFL_IF | X86_EFL_1; /* should shut up most nagging compilers. */
2947
2948 /*
2949 * The master tells the worker to get on its mark.
2950 */
2951 if (fIsMaster)
2952 {
2953 if (RT_LIKELY(ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_STEADY, GIP_TSC_DELTA_SYNC2_READY)))
2954 { /* likely*/ }
2955 else
2956 {
2957 TSCDELTA_DBG_SYNC_MSG(("sync/before/%s: #1 uSyncVar=%#x\n", fIsMaster ? "master" : "worker", pOtherSync->uSyncVar));
2958 return false;
2959 }
2960 }
2961
2962 /*
2963 * Wait for the on your mark signal (ack in the master case). We process timeouts here.
2964 */
2965 ASMAtomicWriteU32(&(pMySync)->uSyncSeq, 0);
2966 for (;;)
2967 {
/* Flags are saved on every iteration; on the STEADY break below they are
   deliberately NOT restored, so the loop is left with interrupts off. */
2968 fEFlags = ASMIntDisableFlags();
2969 u32Tmp = ASMAtomicReadU32(&pMySync->uSyncVar);
2970 if (u32Tmp == GIP_TSC_DELTA_SYNC2_STEADY)
2971 break;
2972 ASMSetFlags(fEFlags);
2973 ASMNopPause();
2974
2975 /* Abort? */
2976 if (u32Tmp != GIP_TSC_DELTA_SYNC2_READY)
2977 {
2978 TSCDELTA_DBG_SYNC_MSG(("sync/before/%s: #2 u32Tmp=%#x\n", fIsMaster ? "master" : "worker", u32Tmp));
2979 return false;
2980 }
2981
2982 /* Check for timeouts every so often (not every loop in case RDTSC is
2983 trapping or something). Must check the first time around. */
2984#if 0 /* For debugging the timeout paths. */
2985 static uint32_t volatile xxx;
2986#endif
2987 if ( ( (iSync2Loops & 0x3ff) == 0
2988 && ASMReadTSC() - pMySync->uTscStart > pMySync->cMaxTscTicks)
2989#if 0 /* This is crazy, I know, but enable this code and the results are markedly better when enabled on the 1.4GHz AMD (debug). */
2990 || (!fIsMaster && (++xxx & 0xf) == 0)
2991#endif
2992 )
2993 {
2994 /* Try switch our own state into timeout mode so the master cannot tell us to 'GO',
2995 ignore the timeout if we've got the go ahead already (simpler). */
2996 if (ASMAtomicCmpXchgU32(&pMySync->uSyncVar, GIP_TSC_DELTA_SYNC2_TIMEOUT, GIP_TSC_DELTA_SYNC2_READY))
2997 {
2998 TSCDELTA_DBG_SYNC_MSG(("sync/before/%s: timeout\n", fIsMaster ? "master" : "worker"));
/* Also flag the partner as timed out, but only if it is still in STEADY. */
2999 ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_TIMEOUT, GIP_TSC_DELTA_SYNC2_STEADY);
3000 ASMAtomicWriteBool(&pArgs->fTimedOut, true);
3001 return false;
3002 }
3003 }
3004 iSync2Loops++;
3005 }
3006
3007 /*
3008 * Interrupts are now disabled and will remain disabled until we do
3009 * TSCDELTA_MASTER_SYNC_AFTER / TSCDELTA_OTHER_SYNC_AFTER.
3010 */
3011 *pfEFlags = fEFlags;
3012
3013 /*
3014 * The worker tells the master that it is on its mark and that the master
3015 * need to get into position as well.
3016 */
3017 if (!fIsMaster)
3018 {
3019 if (RT_LIKELY(ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_STEADY, GIP_TSC_DELTA_SYNC2_READY)))
3020 { /* likely */ }
3021 else
3022 {
3023 ASMSetFlags(fEFlags);
3024 TSCDELTA_DBG_SYNC_MSG(("sync/before/%s: #3 uSyncVar=%#x\n", fIsMaster ? "master" : "worker", pOtherSync->uSyncVar));
3025 return false;
3026 }
3027 }
3028
3029 /*
3030 * The master sends the 'go' to the worker and wait for ACK.
3031 */
3032 if (fIsMaster)
3033 {
3034 if (RT_LIKELY(ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO, GIP_TSC_DELTA_SYNC2_STEADY)))
3035 { /* likely */ }
3036 else
3037 {
3038 ASMSetFlags(fEFlags);
3039 TSCDELTA_DBG_SYNC_MSG(("sync/before/%s: #4 uSyncVar=%#x\n", fIsMaster ? "master" : "worker", pOtherSync->uSyncVar));
3040 return false;
3041 }
3042 }
3043
3044 /*
3045 * Wait for the 'go' signal (ack in the master case).
3046 */
/* No timeout check in this loop; anything but STEADY/GO aborts. */
3047 TSCDELTA_DBG_START_LOOP();
3048 for (;;)
3049 {
3050 u32Tmp = ASMAtomicReadU32(&pMySync->uSyncVar);
3051 if (u32Tmp == GIP_TSC_DELTA_SYNC2_GO)
3052 break;
3053 if (RT_LIKELY(u32Tmp == GIP_TSC_DELTA_SYNC2_STEADY))
3054 { /* likely */ }
3055 else
3056 {
3057 ASMSetFlags(fEFlags);
3058 TSCDELTA_DBG_SYNC_MSG(("sync/before/%s: #5 u32Tmp=%#x\n", fIsMaster ? "master" : "worker", u32Tmp));
3059 return false;
3060 }
3061
3062 TSCDELTA_DBG_CHECK_LOOP();
3063 ASMNopPause();
3064 }
3065
3066 /*
3067 * The worker acks the 'go' (shouldn't fail).
3068 */
3069 if (!fIsMaster)
3070 {
3071 if (RT_LIKELY(ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO, GIP_TSC_DELTA_SYNC2_STEADY)))
3072 { /* likely */ }
3073 else
3074 {
3075 ASMSetFlags(fEFlags);
3076 TSCDELTA_DBG_SYNC_MSG(("sync/before/%s: #6 uSyncVar=%#x\n", fIsMaster ? "master" : "worker", pOtherSync->uSyncVar));
3077 return false;
3078 }
3079 }
3080
3081 /*
3082 * Try enter mostly lockstep execution with it.
3083 */
/* Every exit from this loop returns true, including the give-up cases, so a
   failed lockstep attempt does not fail the handshake. */
3084 for (;;)
3085 {
3086 uint32_t iOtherSeq1, iOtherSeq2;
3087 ASMCompilerBarrier();
3088 ASMSerializeInstruction();
3089
/* Publish my sequence number in my own slot, swap it into the partner's
   slot, then re-read my own slot to see what the partner put there. */
3090 ASMAtomicWriteU32(&pMySync->uSyncSeq, iMySeq);
3091 ASMNopPause();
3092 iOtherSeq1 = ASMAtomicXchgU32(&pOtherSync->uSyncSeq, iMySeq);
3093 ASMNopPause();
3094 iOtherSeq2 = ASMAtomicReadU32(&pMySync->uSyncSeq);
3095
3096 ASMCompilerBarrier();
3097 if (iOtherSeq1 == iOtherSeq2)
3098 return true;
3099
3100 /* Did the other guy give up? Should we give up? */
3101 if ( iOtherSeq1 == UINT32_MAX
3102 || iOtherSeq2 == UINT32_MAX)
3103 return true;
3104 if (++iMySeq >= iMaxSeq)
3105 {
/* UINT32_MAX in my slot is the "I gave up" marker tested for above. */
3106 ASMAtomicWriteU32(&pMySync->uSyncSeq, UINT32_MAX);
3107 return true;
3108 }
3109 ASMNopPause();
3110 }
3111}
3112
/*
 * Master side wrapper for supdrvTscDeltaSync2_Before().  On failure it
 * executes 'break', so it must be expanded directly inside the caller's loop.
 * The 'if ... else if (true) ... else do {} while (0)' shape makes the
 * expansion a single statement that swallows the trailing semicolon.
 */
3113#define TSCDELTA_MASTER_SYNC_BEFORE(a_pMySync, a_pOtherSync, a_pfEFlags, a_pArgs) \
3114 if (RT_LIKELY(supdrvTscDeltaSync2_Before(a_pMySync, a_pOtherSync, true /*fIsMaster*/, a_pfEFlags, a_pArgs))) \
3115 { /*likely*/ } \
3116 else if (true) \
3117 { \
3118 TSCDELTA_DBG_SYNC_MSG9(("sync/before/master: #89\n")); \
3119 break; \
3120 } else do {} while (0)
/*
 * Worker side counterpart of TSCDELTA_MASTER_SYNC_BEFORE; identical except
 * that fIsMaster is false.  Also breaks out of the caller's loop on failure.
 */
3121#define TSCDELTA_OTHER_SYNC_BEFORE(a_pMySync, a_pOtherSync, a_pfEFlags, a_pArgs) \
3122 if (RT_LIKELY(supdrvTscDeltaSync2_Before(a_pMySync, a_pOtherSync, false /*fIsMaster*/, a_pfEFlags, a_pArgs))) \
3123 { /*likely*/ } \
3124 else if (true) \
3125 { \
3126 TSCDELTA_DBG_SYNC_MSG9(("sync/before/other: #89\n")); \
3127 break; \
3128 } else do {} while (0)
3129
3130
/**
 * Completes a round: restores the flags saved by the 'before' step and spins
 * until this party's uSyncVar goes back to READY.
 *
 * @returns true when READY is seen (or, for the worker only, STEADY), false
 *          if uSyncVar holds anything other than GO while waiting.
 * @param   pMySync     My synchronization structure.
 * @param   pOtherSync  My partner's synchronization structure (unused).
 * @param   fIsMaster   Set if master, clear if worker.
 * @param   fEFlags     The flags to restore, as returned via pfEFlags by
 *                      supdrvTscDeltaSync2_Before().
 */
3131static bool supdrvTscDeltaSync2_After(PSUPTSCDELTASYNC2 pMySync, PSUPTSCDELTASYNC2 pOtherSync,
3132 bool fIsMaster, RTCCUINTREG fEFlags)
3133{
3134 TSCDELTA_DBG_VARS();
3135 RT_NOREF1(pOtherSync);
3136
3137 /*
3138 * Wait for the 'ready' signal. In the master's case, this means the
3139 * worker has completed its data collection, while in the worker's case it
3140 * means the master is done processing the data and it's time for the next
3141 * loop iteration (or whatever).
3142 */
/* Flags are restored before the wait, so the spin below runs with whatever
   interrupt state the caller had before the 'before' step. */
3143 ASMSetFlags(fEFlags);
3144 TSCDELTA_DBG_START_LOOP();
3145 for (;;)
3146 {
3147 uint32_t u32Tmp = ASMAtomicReadU32(&pMySync->uSyncVar);
3148 if ( u32Tmp == GIP_TSC_DELTA_SYNC2_READY
3149 || (u32Tmp == GIP_TSC_DELTA_SYNC2_STEADY && !fIsMaster) /* kicked twice => race */ )
3150 return true;
3151 ASMNopPause();
3152 if (RT_LIKELY(u32Tmp == GIP_TSC_DELTA_SYNC2_GO))
3153 { /* likely */}
3154 else
3155 {
/* NOTE(review): the message says "other" although this path is also
   reachable with fIsMaster set. */
3156 TSCDELTA_DBG_SYNC_MSG(("sync/after/other: #1 u32Tmp=%#x\n", u32Tmp));
3157 return false; /* shouldn't ever happen! */
3158 }
3159 TSCDELTA_DBG_CHECK_LOOP();
3160 ASMNopPause();
3161 }
3162}
3163
/*
 * Master side wrapper for supdrvTscDeltaSync2_After(); breaks out of the
 * caller's loop if it fails.
 */
3164#define TSCDELTA_MASTER_SYNC_AFTER(a_pMySync, a_pOtherSync, a_fEFlags) \
3165 if (RT_LIKELY(supdrvTscDeltaSync2_After(a_pMySync, a_pOtherSync, true /*fIsMaster*/, a_fEFlags))) \
3166 { /* likely */ } \
3167 else if (true) \
3168 { \
3169 TSCDELTA_DBG_SYNC_MSG9(("sync/after/master: #97\n")); \
3170 break; \
3171 } else do {} while (0)
3172
/*
 * Used by the master after it has processed a round's data: flips the
 * worker's uSyncVar from GO back to READY, which is what the worker waits for
 * in supdrvTscDeltaSync2_After().  Breaks out of the caller's loop if the
 * worker's state was not GO.
 */
3173#define TSCDELTA_MASTER_KICK_OTHER_OUT_OF_AFTER(a_pMySync, a_pOtherSync) \
3174 /* \
3175 * Tell the worker that we're done processing the data and ready for the next round. \
3176 */ \
3177 if (RT_LIKELY(ASMAtomicCmpXchgU32(&(a_pOtherSync)->uSyncVar, GIP_TSC_DELTA_SYNC2_READY, GIP_TSC_DELTA_SYNC2_GO))) \
3178 { /* likely */ } \
3179 else if (true)\
3180 { \
3181 TSCDELTA_DBG_SYNC_MSG(("sync/after/master: #99 uSyncVar=%#x\n", (a_pOtherSync)->uSyncVar)); \
3182 break; \
3183 } else do {} while (0)
3184
/*
 * Worker side end-of-round step: first flips the master's uSyncVar from GO to
 * READY, then waits in supdrvTscDeltaSync2_After().  If the first step fails
 * the flags are restored here, since supdrvTscDeltaSync2_After() (which
 * normally restores them) is not reached.  Either failure breaks out of the
 * caller's loop.
 */
3185#define TSCDELTA_OTHER_SYNC_AFTER(a_pMySync, a_pOtherSync, a_fEFlags) \
3186 if (true) { \
3187 /* \
3188 * Tell the master that we're done collecting data and wait for the next round to start. \
3189 */ \
3190 if (RT_LIKELY(ASMAtomicCmpXchgU32(&(a_pOtherSync)->uSyncVar, GIP_TSC_DELTA_SYNC2_READY, GIP_TSC_DELTA_SYNC2_GO))) \
3191 { /* likely */ } \
3192 else \
3193 { \
3194 ASMSetFlags(a_fEFlags); \
3195 TSCDELTA_DBG_SYNC_MSG(("sync/after/other: #0 uSyncVar=%#x\n", (a_pOtherSync)->uSyncVar)); \
3196 break; \
3197 } \
3198 if (RT_LIKELY(supdrvTscDeltaSync2_After(a_pMySync, a_pOtherSync, false /*fIsMaster*/, a_fEFlags))) \
3199 { /* likely */ } \
3200 else \
3201 { \
3202 TSCDELTA_DBG_SYNC_MSG9(("sync/after/other: #98\n")); \
3203 break; \
3204 } \
3205 } else do {} while (0)
3206/** @} */
3207
3208
3209#ifdef GIP_TSC_DELTA_METHOD_1
3210/**
3211 * TSC delta measurement algorithm \#1 (GIP_TSC_DELTA_METHOD_1).
3212 *
3213 *
3214 * We ignore the first few runs of the loop in order to prime the
3215 * cache. Also, we need to be careful about using 'pause' instruction
3216 * in critical busy-wait loops in this code - it can cause undesired
3217 * behaviour with hyperthreading.
3218 *
3219 * We try to minimize the measurement error by computing the minimum
3220 * read time of the compare statement in the worker by taking TSC
3221 * measurements across it.
3222 *
3223 * It must be noted that the computed minimum read time is mostly to
3224 * eliminate huge deltas when the worker is too early and doesn't by
3225 * itself help produce more accurate deltas. We allow two times the
3226 * computed minimum as an arbitrary acceptable threshold. Therefore,
3227 * it is still possible to get negative deltas where there are none
3228 * when the worker is earlier. As long as these occasional negative
3229 * deltas are lower than the time it takes to exit guest-context and
3230 * the OS to reschedule EMT on a different CPU, we won't expose a TSC
3231 * that jumped backwards. It is due to the existence of the negative
3232 * deltas that we don't recompute the delta with the master and
3233 * worker interchanged to eliminate the remaining measurement error.
3234 *
3235 *
3236 * @param pArgs The argument/state data.
3237 * @param pMySync My synchronization structure.
3238 * @param pOtherSync My partner's synchronization structure.
3239 * @param fIsMaster Set if master, clear if worker.
3240 * @param iTry The attempt number (currently unused).
3241 */
3242static void supdrvTscDeltaMethod1Loop(PSUPDRVGIPTSCDELTARGS pArgs, PSUPTSCDELTASYNC2 pMySync, PSUPTSCDELTASYNC2 pOtherSync,
3243 bool fIsMaster, uint32_t iTry)
3244{
3245 PSUPGIPCPU pGipCpuWorker = pArgs->pWorker;
3246 PSUPGIPCPU pGipCpuMaster = pArgs->pMaster;
3247 uint64_t uMinCmpReadTime = UINT64_MAX;
3248 unsigned iLoop;
3249 NOREF(iTry);
3250
/* The TSCDELTA_*_SYNC_* macros used below 'break' out of this loop when
   synchronization fails or times out. */
3251 for (iLoop = 0; iLoop < GIP_TSC_DELTA_LOOPS; iLoop++)
3252 {
3253 RTCCUINTREG fEFlags;
3254 if (fIsMaster)
3255 {
3256 /*
3257 * The master.
3258 */
3259 AssertMsg(pGipCpuMaster->u64TSCSample == GIP_TSC_DELTA_RSVD,
3260 ("%#llx idMaster=%#x idWorker=%#x (idGipMaster=%#x)\n",
3261 pGipCpuMaster->u64TSCSample, pGipCpuMaster->idCpu, pGipCpuWorker->idCpu, pArgs->pDevExt->idGipMaster));
3262 TSCDELTA_MASTER_SYNC_BEFORE(pMySync, pOtherSync, &fEFlags, pArgs);
3263
/* Publish the master's TSC; retry in the unlikely case the value read
   equals the reserved marker the worker is polling for. */
3264 do
3265 {
3266 ASMSerializeInstruction();
3267 ASMAtomicWriteU64(&pGipCpuMaster->u64TSCSample, ASMReadTSC());
3268 } while (pGipCpuMaster->u64TSCSample == GIP_TSC_DELTA_RSVD);
3269
3270 TSCDELTA_MASTER_SYNC_AFTER(pMySync, pOtherSync, fEFlags);
3271
3272 /* Process the data. */
3273 if (iLoop > GIP_TSC_DELTA_PRIMER_LOOPS + GIP_TSC_DELTA_READ_TIME_LOOPS)
3274 {
/* The worker leaves GIP_TSC_DELTA_RSVD in its sample when it rejected
   this round's reading. */
3275 if (pGipCpuWorker->u64TSCSample != GIP_TSC_DELTA_RSVD)
3276 {
3277 int64_t iDelta = pGipCpuWorker->u64TSCSample
3278 - (pGipCpuMaster->u64TSCSample - pGipCpuMaster->i64TSCDelta);
/* Candidates at or above GIP_TSC_DELTA_INITIAL_MASTER_VALUE replace the
   stored delta when smaller; candidates below it replace it when larger
   or when the stored delta is still INT64_MAX. */
3279 if ( iDelta >= GIP_TSC_DELTA_INITIAL_MASTER_VALUE
3280 ? iDelta < pGipCpuWorker->i64TSCDelta
3281 : iDelta > pGipCpuWorker->i64TSCDelta || pGipCpuWorker->i64TSCDelta == INT64_MAX)
3282 pGipCpuWorker->i64TSCDelta = iDelta;
3283 }
3284 }
3285
3286 /* Reset our TSC sample and tell the worker to move on. */
3287 ASMAtomicWriteU64(&pGipCpuMaster->u64TSCSample, GIP_TSC_DELTA_RSVD);
3288 TSCDELTA_MASTER_KICK_OTHER_OUT_OF_AFTER(pMySync, pOtherSync);
3289 }
3290 else
3291 {
3292 /*
3293 * The worker.
3294 */
3295 uint64_t uTscWorker;
3296 uint64_t uTscWorkerFlushed;
3297 uint64_t uCmpReadTime;
3298
3299 ASMAtomicReadU64(&pGipCpuMaster->u64TSCSample); /* Warm the cache line. */
3300 TSCDELTA_OTHER_SYNC_BEFORE(pMySync, pOtherSync, &fEFlags, pArgs);
3301
3302 /*
3303 * Keep reading the TSC until we notice that the master has read his. Reading
3304 * the TSC -after- the master has updated the memory is way too late. We thus
3305 * compensate by trying to measure how long it took for the worker to notice
3306 * the memory flushed from the master.
3307 */
3308 do
3309 {
3310 ASMSerializeInstruction();
3311 uTscWorker = ASMReadTSC();
3312 } while (pGipCpuMaster->u64TSCSample == GIP_TSC_DELTA_RSVD);
3313 ASMSerializeInstruction();
3314 uTscWorkerFlushed = ASMReadTSC();
3315
/* Time between the last TSC read before noticing the master's write and
   the first one after it. */
3316 uCmpReadTime = uTscWorkerFlushed - uTscWorker;
3317 if (iLoop > GIP_TSC_DELTA_PRIMER_LOOPS + GIP_TSC_DELTA_READ_TIME_LOOPS)
3318 {
3319 /* This is totally arbitrary a.k.a I don't like it but I have no better ideas for now. */
3320 if (uCmpReadTime < (uMinCmpReadTime << 1))
3321 {
3322 ASMAtomicWriteU64(&pGipCpuWorker->u64TSCSample, uTscWorker);
3323 if (uCmpReadTime < uMinCmpReadTime)
3324 uMinCmpReadTime = uCmpReadTime;
3325 }
3326 else
3327 ASMAtomicWriteU64(&pGipCpuWorker->u64TSCSample, GIP_TSC_DELTA_RSVD);
3328 }
3329 else if (iLoop > GIP_TSC_DELTA_PRIMER_LOOPS)
3330 {
/* Read-time calibration rounds: only track the minimum, publish nothing. */
3331 if (uCmpReadTime < uMinCmpReadTime)
3332 uMinCmpReadTime = uCmpReadTime;
3333 }
3334
3335 TSCDELTA_OTHER_SYNC_AFTER(pMySync, pOtherSync, fEFlags);
3336 }
3337 }
3338
3339 TSCDELTA_DBG_SYNC_MSG9(("sync/method1loop/%s: #92 iLoop=%u MyState=%#x\n", fIsMaster ? "master" : "worker", iLoop,
3340 pMySync->uSyncVar));
3341
3342 /*
3343 * We must reset the worker TSC sample value in case it gets picked as a
3344 * GIP master later on (it's trashed above, naturally).
3345 */
3346 if (!fIsMaster)
3347 ASMAtomicWriteU64(&pGipCpuWorker->u64TSCSample, GIP_TSC_DELTA_RSVD);
3348}
3349#endif /* GIP_TSC_DELTA_METHOD_1 */
3350
3351
3352#ifdef GIP_TSC_DELTA_METHOD_2
3353/*
3354 * TSC delta measurement algorithm \#2 configuration and code - Experimental!!
3355 */
3356
/** Total number of method \#2 rounds: 7 measurement rounds plus the primer
 * rounds.  (Referencing GIP_TSC_DELTA_M2_PRIMER_LOOPS before its definition
 * is fine since macros are expanded at the point of use.) */
3357# define GIP_TSC_DELTA_M2_LOOPS (7 + GIP_TSC_DELTA_M2_PRIMER_LOOPS)
/** Number of initial method \#2 rounds whose data is not processed;
 * currently zero, which compiles out the primer handling in the loop. */
3358# define GIP_TSC_DELTA_M2_PRIMER_LOOPS 0
3359
3360
3361static void supdrvTscDeltaMethod2ProcessDataOnMaster(PSUPDRVGIPTSCDELTARGS pArgs)
3362{
3363 int64_t iMasterTscDelta = pArgs->pMaster->i64TSCDelta;
3364 int64_t iBestDelta = pArgs->pWorker->i64TSCDelta;
3365 uint32_t idxResult;
3366 uint32_t cHits = 0;
3367
3368 /*
3369 * Look for matching entries in the master and worker tables.
3370 */
3371 for (idxResult = 0; idxResult < RT_ELEMENTS(pArgs->uMaster.M2.Data.aResults); idxResult++)
3372 {
3373 uint32_t idxOther = pArgs->uMaster.M2.Data.aResults[idxResult].iSeqOther;
3374 if (idxOther & 1)
3375 {
3376 idxOther >>= 1;
3377 if (idxOther < RT_ELEMENTS(pArgs->uWorker.M2.Data.aResults))
3378 {
3379 if (pArgs->uWorker.M2.Data.aResults[idxOther].iSeqOther == pArgs->uMaster.M2.Data.aResults[idxResult].iSeqMine)
3380 {
3381 int64_t iDelta;
3382 iDelta = pArgs->uWorker.M2.Data.aResults[idxOther].uTsc
3383 - (pArgs->uMaster.M2.Data.aResults[idxResult].uTsc - iMasterTscDelta);
3384 if ( iDelta >= GIP_TSC_DELTA_INITIAL_MASTER_VALUE
3385 ? iDelta < iBestDelta
3386 : iDelta > iBestDelta || iBestDelta == INT64_MAX)
3387 iBestDelta = iDelta;
3388 cHits++;
3389 }
3390 }
3391 }
3392 }
3393
3394 /*
3395 * Save the results.
3396 */
3397 if (cHits > 2)
3398 pArgs->pWorker->i64TSCDelta = iBestDelta;
3399 pArgs->uMaster.M2.cHits += cHits;
3400}
3401
3402
3403/**
3404 * The core function of the 2nd TSC delta measurement algorithm.
3405 *
3406 * The idea here is that we have the two CPUs execute the exact same code
3407 * collecting a largish set of TSC samples. The code has one data dependency on
3408 * the other CPU which intention it is to synchronize the execution as well as
3409 * help cross references the two sets of TSC samples (the sequence numbers).
3410 *
3411 * The @a fLag parameter is used to modify the execution a tiny bit on one or
3412 * both of the CPUs. When @a fLag differs between the CPUs, it is thought that
3413 * it will help with making the CPUs enter lock step execution occasionally.
3414 *
 * @param   pMyData         My sample table and sequence counter; the counter
 *                          is reset to zero and every table entry is filled.
 * @param   piOtherSeqNo    Pointer to the partner's sequence counter, sampled
 *                          once per entry.
 * @param   fLag            When set, a pause instruction is executed after
 *                          each sample.
3415 */
3416static void supdrvTscDeltaMethod2CollectData(PSUPDRVTSCDELTAMETHOD2 pMyData, uint32_t volatile *piOtherSeqNo, bool fLag)
3417{
3418 SUPDRVTSCDELTAMETHOD2ENTRY *pEntry = &pMyData->aResults[0];
3419 uint32_t cLeft = RT_ELEMENTS(pMyData->aResults);
3420
3421 ASMAtomicWriteU32(&pMyData->iCurSeqNo, 0);
3422 ASMSerializeInstruction();
3423 while (cLeft-- > 0)
3424 {
3425 uint64_t uTsc;
/* The counter is bumped once before and once after the TSC read, so it is
   odd exactly while a sample is being taken; the odd value is what gets
   recorded as iSeqMine. */
3426 uint32_t iSeqMine = ASMAtomicIncU32(&pMyData->iCurSeqNo);
3427 uint32_t iSeqOther = ASMAtomicReadU32(piOtherSeqNo);
3428 ASMCompilerBarrier();
3429 ASMSerializeInstruction(); /* Way better result than with ASMMemoryFenceSSE2() in this position! */
3430 uTsc = ASMReadTSC();
3431 ASMAtomicIncU32(&pMyData->iCurSeqNo);
3432 ASMCompilerBarrier();
3433 ASMSerializeInstruction();
/* The table is only written after the second increment, i.e. outside the
   window in which the counter is odd. */
3434 pEntry->iSeqMine = iSeqMine;
3435 pEntry->iSeqOther = iSeqOther;
3436 pEntry->uTsc = uTsc;
3437 pEntry++;
3438 ASMSerializeInstruction();
3439 if (fLag)
3440 ASMNopPause();
3441 }
3442}
3443
3444
3445/**
3446 * TSC delta measurement algorithm \#2 (GIP_TSC_DELTA_METHOD_2).
3447 *
3448 * See supdrvTscDeltaMethod2CollectData for algorithm details.
3449 *
3450 * @param pArgs The argument/state data.
3451 * @param pMySync My synchronization structure.
3452 * @param pOtherSync My partner's synchronization structure.
3453 * @param fIsMaster Set if master, clear if worker.
3454 * @param iTry The attempt number (currently unused).
3455 */
3456static void supdrvTscDeltaMethod2Loop(PSUPDRVGIPTSCDELTARGS pArgs, PSUPTSCDELTASYNC2 pMySync, PSUPTSCDELTASYNC2 pOtherSync,
3457 bool fIsMaster, uint32_t iTry)
3458{
3459 unsigned iLoop;
3460 RT_NOREF1(iTry);
3461
/* The TSCDELTA_*_SYNC_* macros used below 'break' out of this loop when
   synchronization fails or times out. */
3462 for (iLoop = 0; iLoop < GIP_TSC_DELTA_M2_LOOPS; iLoop++)
3463 {
3464 RTCCUINTREG fEFlags;
3465 if (fIsMaster)
3466 {
3467 /*
3468 * Adjust the loop lag fudge.
3469 */
/* The master picks the lag setting for both sides before syncing up.  With
   the current constants (7 loops, no primer) the integer divisions below
   make round 0 lag-free, round 1 lag on both sides and rounds 2-6
   alternating, so the percentages in the comments are approximate. */
3470# if GIP_TSC_DELTA_M2_PRIMER_LOOPS > 0
3471 if (iLoop < GIP_TSC_DELTA_M2_PRIMER_LOOPS)
3472 {
3473 /* Lag during the priming to be nice to everyone.. */
3474 pArgs->uMaster.M2.fLag = true;
3475 pArgs->uWorker.M2.fLag = true;
3476 }
3477 else
3478# endif
3479 if (iLoop < (GIP_TSC_DELTA_M2_LOOPS - GIP_TSC_DELTA_M2_PRIMER_LOOPS) / 4)
3480 {
3481 /* 25 % of the body without lagging. */
3482 pArgs->uMaster.M2.fLag = false;
3483 pArgs->uWorker.M2.fLag = false;
3484 }
3485 else if (iLoop < (GIP_TSC_DELTA_M2_LOOPS - GIP_TSC_DELTA_M2_PRIMER_LOOPS) / 4 * 2)
3486 {
3487 /* 25 % of the body with both lagging. */
3488 pArgs->uMaster.M2.fLag = true;
3489 pArgs->uWorker.M2.fLag = true;
3490 }
3491 else
3492 {
3493 /* 50% of the body with alternating lag. */
3494 pArgs->uMaster.M2.fLag = (iLoop & 1) == 0;
3495 pArgs->uWorker.M2.fLag= (iLoop & 1) == 1;
3496 }
3497
3498 /*
3499 * Sync up with the worker and collect data.
3500 */
3501 TSCDELTA_MASTER_SYNC_BEFORE(pMySync, pOtherSync, &fEFlags, pArgs);
3502 supdrvTscDeltaMethod2CollectData(&pArgs->uMaster.M2.Data, &pArgs->uWorker.M2.Data.iCurSeqNo, pArgs->uMaster.M2.fLag);
3503 TSCDELTA_MASTER_SYNC_AFTER(pMySync, pOtherSync, fEFlags);
3504
3505 /*
3506 * Process the data.
3507 */
3508# if GIP_TSC_DELTA_M2_PRIMER_LOOPS > 0
3509 if (iLoop >= GIP_TSC_DELTA_M2_PRIMER_LOOPS)
3510# endif
3511 supdrvTscDeltaMethod2ProcessDataOnMaster(pArgs);
3512
/* Release the worker from its end-of-round wait. */
3513 TSCDELTA_MASTER_KICK_OTHER_OUT_OF_AFTER(pMySync, pOtherSync);
3514 }
3515 else
3516 {
3517 /*
3518 * The worker.
3519 */
/* Same collection code as the master, with the data/counter roles swapped. */
3520 TSCDELTA_OTHER_SYNC_BEFORE(pMySync, pOtherSync, &fEFlags, pArgs);
3521 supdrvTscDeltaMethod2CollectData(&pArgs->uWorker.M2.Data, &pArgs->uMaster.M2.Data.iCurSeqNo, pArgs->uWorker.M2.fLag);
3522 TSCDELTA_OTHER_SYNC_AFTER(pMySync, pOtherSync, fEFlags);
3523 }
3524 }
3525}
3526
3527#endif /* GIP_TSC_DELTA_METHOD_2 */
3526
3527#endif /* GIP_TSC_DELTA_METHOD_2 */
3528
3529
3530
/**
 * Verifies a TSC delta by having master and worker take turns reading their
 * TSCs and checking that the delta-adjusted readings never go backwards.
 *
 * The two sides ping-pong via the uSyncVar members (alternating between
 * GIP_TSC_DELTA_SYNC2_GO_GO and GIP_TSC_DELTA_SYNC2_GO), the master going
 * first, each filling its Verify.auTscs table.  The master then evaluates
 * both tables and stores the verdict in pArgs->rcVerify.
 *
 * @returns pArgs->rcVerify if the round completed, VERR_TIMEOUT if one of
 *          the synchronization macros bailed out (pArgs->rcVerify is then
 *          set to VERR_TRY_AGAIN).
 * @param   pArgs           The argument/state data.
 * @param   pMySync         My synchronization structure.
 * @param   pOtherSync      My partner's synchronization structure.
 * @param   fIsMaster       Set if master, clear if worker.
 * @param   iWorkerTscDelta The worker TSC delta to verify; subtracted from
 *                          the worker's readings.
 */
3531static int supdrvTscDeltaVerify(PSUPDRVGIPTSCDELTARGS pArgs, PSUPTSCDELTASYNC2 pMySync,
3532 PSUPTSCDELTASYNC2 pOtherSync, bool fIsMaster, int64_t iWorkerTscDelta)
3533{
3534 /*PSUPGIPCPU pGipCpuWorker = pArgs->pWorker; - unused */
3535 PSUPGIPCPU pGipCpuMaster = pArgs->pMaster;
3536 uint32_t i;
3537 TSCDELTA_DBG_VARS();
3538
/* Not a real loop: it runs at most once and only exists so the sync macros
   have something to 'break' out of on failure. */
3539 for (;;)
3540 {
3541 RTCCUINTREG fEFlags;
3542 AssertCompile((RT_ELEMENTS(pArgs->uMaster.Verify.auTscs) & 1) == 0);
3543 AssertCompile(RT_ELEMENTS(pArgs->uMaster.Verify.auTscs) == RT_ELEMENTS(pArgs->uWorker.Verify.auTscs));
3544
3545 if (fIsMaster)
3546 {
3547 uint64_t uTscWorker;
3548 TSCDELTA_MASTER_SYNC_BEFORE(pMySync, pOtherSync, &fEFlags, pArgs);
3549
3550 /*
3551 * Collect TSC, master goes first.
3552 */
/* Two samples per iteration; the table entry is stored only after the
   partner has been kicked. */
3553 for (i = 0; i < RT_ELEMENTS(pArgs->uMaster.Verify.auTscs); i += 2)
3554 {
3555 /* Read, kick & wait #1. */
3556 uint64_t uTsc = ASMReadTSC();
3557 ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO_GO);
3558 ASMSerializeInstruction();
3559 pArgs->uMaster.Verify.auTscs[i] = uTsc;
3560 TSCDELTA_DBG_START_LOOP();
3561 while (ASMAtomicReadU32(&pMySync->uSyncVar) == GIP_TSC_DELTA_SYNC2_GO)
3562 {
3563 TSCDELTA_DBG_CHECK_LOOP();
3564 ASMNopPause();
3565 }
3566
3567 /* Read, kick & wait #2. */
3568 uTsc = ASMReadTSC();
3569 ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO);
3570 ASMSerializeInstruction();
3571 pArgs->uMaster.Verify.auTscs[i + 1] = uTsc;
3572 TSCDELTA_DBG_START_LOOP();
3573 while (ASMAtomicReadU32(&pMySync->uSyncVar) == GIP_TSC_DELTA_SYNC2_GO_GO)
3574 {
3575 TSCDELTA_DBG_CHECK_LOOP();
3576 ASMNopPause();
3577 }
3578 }
3579
3580 TSCDELTA_MASTER_SYNC_AFTER(pMySync, pOtherSync, fEFlags);
3581
3582 /*
3583 * Process the data.
3584 */
3585#ifdef TSCDELTA_VERIFY_WITH_STATS
3586 pArgs->cMaxVerifyTscTicks = INT64_MIN;
3587 pArgs->cMinVerifyTscTicks = INT64_MAX;
3588 pArgs->iVerifyBadTscDiff = 0;
3589#endif
3590 ASMAtomicWriteS32(&pArgs->rcVerify, VINF_SUCCESS);
3591 uTscWorker = 0;
/* Expected ordering: master[0] <= worker[0] <= master[1] <= worker[1] ...
   The 'break' statements in this loop only leave this inner for loop. */
3592 for (i = 0; i < RT_ELEMENTS(pArgs->uMaster.Verify.auTscs); i++)
3593 {
3594 /* Master vs previous worker entry. */
3595 uint64_t uTscMaster = pArgs->uMaster.Verify.auTscs[i] - pGipCpuMaster->i64TSCDelta;
3596 int64_t iDiff;
3597 if (i > 0)
3598 {
3599 iDiff = uTscMaster - uTscWorker;
3600#ifdef TSCDELTA_VERIFY_WITH_STATS
3601 if (iDiff > pArgs->cMaxVerifyTscTicks)
3602 pArgs->cMaxVerifyTscTicks = iDiff;
3603 if (iDiff < pArgs->cMinVerifyTscTicks)
3604 pArgs->cMinVerifyTscTicks = iDiff;
3605#endif
3606 if (iDiff < 0)
3607 {
3608#ifdef TSCDELTA_VERIFY_WITH_STATS
/* Negated because iDiff is master - worker here while the statistic
   is kept as worker - master. */
3609 pArgs->iVerifyBadTscDiff = -iDiff;
3610#endif
3611 ASMAtomicWriteS32(&pArgs->rcVerify, VERR_OUT_OF_RANGE);
3612 break;
3613 }
3614 }
3615
3616 /* Worker vs master. */
3617 uTscWorker = pArgs->uWorker.Verify.auTscs[i] - iWorkerTscDelta;
3618 iDiff = uTscWorker - uTscMaster;
3619#ifdef TSCDELTA_VERIFY_WITH_STATS
3620 if (iDiff > pArgs->cMaxVerifyTscTicks)
3621 pArgs->cMaxVerifyTscTicks = iDiff;
3622 if (iDiff < pArgs->cMinVerifyTscTicks)
3623 pArgs->cMinVerifyTscTicks = iDiff;
3624#endif
3625 if (iDiff < 0)
3626 {
3627#ifdef TSCDELTA_VERIFY_WITH_STATS
3628 pArgs->iVerifyBadTscDiff = iDiff;
3629#endif
3630 ASMAtomicWriteS32(&pArgs->rcVerify, VERR_OUT_OF_RANGE);
3631 break;
3632 }
3633 }
3634
3635 /* Done. */
/* The verdict is written before the worker is released from its wait. */
3636 TSCDELTA_MASTER_KICK_OTHER_OUT_OF_AFTER(pMySync, pOtherSync);
3637 }
3638 else
3639 {
3640 /*
3641 * The worker, master leads.
3642 */
3643 TSCDELTA_OTHER_SYNC_BEFORE(pMySync, pOtherSync, &fEFlags, pArgs);
3644
3645 for (i = 0; i < RT_ELEMENTS(pArgs->uWorker.Verify.auTscs); i += 2)
3646 {
3647 uint64_t uTsc;
3648
3649 /* Wait, Read and Kick #1. */
3650 TSCDELTA_DBG_START_LOOP();
3651 while (ASMAtomicReadU32(&pMySync->uSyncVar) == GIP_TSC_DELTA_SYNC2_GO)
3652 {
3653 TSCDELTA_DBG_CHECK_LOOP();
3654 ASMNopPause();
3655 }
3656 uTsc = ASMReadTSC();
3657 ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO_GO);
3658 ASMSerializeInstruction();
3659 pArgs->uWorker.Verify.auTscs[i] = uTsc;
3660
3661 /* Wait, Read and Kick #2. */
3662 TSCDELTA_DBG_START_LOOP();
3663 while (ASMAtomicReadU32(&pMySync->uSyncVar) == GIP_TSC_DELTA_SYNC2_GO_GO)
3664 {
3665 TSCDELTA_DBG_CHECK_LOOP();
3666 ASMNopPause();
3667 }
3668 uTsc = ASMReadTSC();
3669 ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO);
3670 ASMSerializeInstruction();
3671 pArgs->uWorker.Verify.auTscs[i + 1] = uTsc;
3672 }
3673
3674 TSCDELTA_OTHER_SYNC_AFTER(pMySync, pOtherSync, fEFlags);
3675 }
/* Reached by both parties when none of the sync macros bailed out. */
3676 return pArgs->rcVerify;
3677 }
3678
3679 /*
3680 * Timed out, please retry.
3681 */
/* Only reached via a 'break' from one of the sync macros above. */
3682 ASMAtomicWriteS32(&pArgs->rcVerify, VERR_TRY_AGAIN);
3683 return VERR_TIMEOUT;
3684}
3685
3686
3687
3688/**
3689 * Handles the special abort procedure during synchronization setup in
3690 * supdrvTscMeasureDeltaCallbackUnwrapped().
3691 *
3692 * @returns 0 (dummy, ignored)
3693 * @param pArgs Pointer to argument/state data.
3694 * @param pMySync Pointer to my sync structure.
3695 * @param fIsMaster Set if we're the master, clear if worker.
3696 * @param fTimeout Set if it's a timeout.
3697 */
3698DECL_NO_INLINE(static, int)
3699supdrvTscMeasureDeltaCallbackAbortSyncSetup(PSUPDRVGIPTSCDELTARGS pArgs, PSUPTSCDELTASYNC2 pMySync, bool fIsMaster, bool fTimeout)
3700{
3701 PSUPTSCDELTASYNC2 volatile *ppMySync = fIsMaster ? &pArgs->pSyncMaster : &pArgs->pSyncWorker;
3702 PSUPTSCDELTASYNC2 volatile *ppOtherSync = fIsMaster ? &pArgs->pSyncWorker : &pArgs->pSyncMaster;
3703 TSCDELTA_DBG_VARS();
3704 RT_NOREF1(pMySync);
3705
3706 /*
3707 * Clear our sync pointer and make sure the abort flag is set.
3708 */
3709 ASMAtomicWriteNullPtr(ppMySync);
3710 ASMAtomicWriteBool(&pArgs->fAbortSetup, true);
3711 if (fTimeout)
3712 ASMAtomicWriteBool(&pArgs->fTimedOut, true);
3713
3714 /*
3715 * Make sure the other party is out of there and won't be touching our
3716 * sync state again (would cause stack corruption).
3717 */
3718 TSCDELTA_DBG_START_LOOP();
3719 while (ASMAtomicReadPtrT(ppOtherSync, PSUPTSCDELTASYNC2) != NULL)
3720 {
3721 ASMNopPause();
3722 ASMNopPause();
3723 ASMNopPause();
3724 TSCDELTA_DBG_CHECK_LOOP();
3725 }
3726
3727 return 0;
3728}
3729
3730
3731/**
3732 * This is used by supdrvTscMeasureInitialDeltas() to read the TSC on two CPUs
3733 * and compute the delta between them.
3734 *
3735 * To reduce code size a good deal when timeout handling was added, a dummy return
3736 * value had to be added (saves 1-3 lines per timeout case), thus this
3737 * 'Unwrapped' function and the dummy 0 return value.
3738 *
3739 * @returns 0 (dummy, ignored)
3740 * @param idCpu The CPU we are current scheduled on.
3741 * @param pArgs Pointer to a parameter package.
3742 *
3743 * @remarks Measuring TSC deltas between the CPUs is tricky because we need to
3744 * read the TSC at exactly the same time on both the master and the
3745 * worker CPUs. Due to DMA, bus arbitration, cache locality,
3746 * contention, SMI, pipelining etc. there is no guaranteed way of
3747 * doing this on x86 CPUs.
3748 */
3749static int supdrvTscMeasureDeltaCallbackUnwrapped(RTCPUID idCpu, PSUPDRVGIPTSCDELTARGS pArgs)
3750{
3751 PSUPDRVDEVEXT pDevExt = pArgs->pDevExt;
3752 PSUPGIPCPU pGipCpuWorker = pArgs->pWorker;
3753 PSUPGIPCPU pGipCpuMaster = pArgs->pMaster;
3754 bool const fIsMaster = idCpu == pGipCpuMaster->idCpu;
3755 uint32_t iTry;
3756 PSUPTSCDELTASYNC2 volatile *ppMySync = fIsMaster ? &pArgs->pSyncMaster : &pArgs->pSyncWorker;
3757 PSUPTSCDELTASYNC2 volatile *ppOtherSync = fIsMaster ? &pArgs->pSyncWorker : &pArgs->pSyncMaster;
3758 SUPTSCDELTASYNC2 MySync;
3759 PSUPTSCDELTASYNC2 pOtherSync;
3760 int rc;
3761 TSCDELTA_DBG_VARS();
3762
3763 /* A bit of paranoia first. */
3764 if (!pGipCpuMaster || !pGipCpuWorker)
3765 return 0;
3766
3767 /*
3768 * If the CPU isn't part of the measurement, return immediately.
3769 */
3770 if ( !fIsMaster
3771 && idCpu != pGipCpuWorker->idCpu)
3772 return 0;
3773
3774 /*
3775 * Set up my synchronization stuff and wait for the other party to show up.
3776 *
3777 * We don't wait forever since the other party may be off fishing (offline,
3778 * spinning with ints disables, whatever), we must play nice to the rest of
3779 * the system as this context generally isn't one in which we will get
3780 * preempted and we may hold up a number of lower priority interrupts.
3781 */
3782 ASMAtomicWriteU32(&MySync.uSyncVar, GIP_TSC_DELTA_SYNC2_PRESTART_WAIT);
3783 ASMAtomicWritePtr(ppMySync, &MySync);
3784 MySync.uTscStart = ASMReadTSC();
3785 MySync.cMaxTscTicks = pArgs->cMaxTscTicks;
3786
3787 /* Look for the partner, might not be here yet... Special abort considerations. */
3788 iTry = 0;
3789 TSCDELTA_DBG_START_LOOP();
3790 while ((pOtherSync = ASMAtomicReadPtrT(ppOtherSync, PSUPTSCDELTASYNC2)) == NULL)
3791 {
3792 ASMNopPause();
3793 if ( ASMAtomicReadBool(&pArgs->fAbortSetup)
3794 || !RTMpIsCpuOnline(fIsMaster ? pGipCpuWorker->idCpu : pGipCpuMaster->idCpu) )
3795 return supdrvTscMeasureDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, false /*fTimeout*/);
3796 if ( (iTry++ & 0xff) == 0
3797 && ASMReadTSC() - MySync.uTscStart > pArgs->cMaxTscTicks)
3798 return supdrvTscMeasureDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, true /*fTimeout*/);
3799 TSCDELTA_DBG_CHECK_LOOP();
3800 ASMNopPause();
3801 }
3802
3803 /* I found my partner, waiting to be found... Special abort considerations. */
3804 if (fIsMaster)
3805 if (!ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_READY, GIP_TSC_DELTA_SYNC2_PRESTART_WAIT)) /* parnaoia */
3806 return supdrvTscMeasureDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, false /*fTimeout*/);
3807
3808 iTry = 0;
3809 TSCDELTA_DBG_START_LOOP();
3810 while (ASMAtomicReadU32(&MySync.uSyncVar) == GIP_TSC_DELTA_SYNC2_PRESTART_WAIT)
3811 {
3812 ASMNopPause();
3813 if (ASMAtomicReadBool(&pArgs->fAbortSetup))
3814 return supdrvTscMeasureDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, false /*fTimeout*/);
3815 if ( (iTry++ & 0xff) == 0
3816 && ASMReadTSC() - MySync.uTscStart > pArgs->cMaxTscTicks)
3817 {
3818 if ( fIsMaster
3819 && !ASMAtomicCmpXchgU32(&MySync.uSyncVar, GIP_TSC_DELTA_SYNC2_PRESTART_ABORT, GIP_TSC_DELTA_SYNC2_PRESTART_WAIT))
3820 break; /* race #1: slave has moved on, handle timeout in loop instead. */
3821 return supdrvTscMeasureDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, true /*fTimeout*/);
3822 }
3823 TSCDELTA_DBG_CHECK_LOOP();
3824 }
3825
3826 if (!fIsMaster)
3827 if (!ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_READY, GIP_TSC_DELTA_SYNC2_PRESTART_WAIT)) /* race #1 */
3828 return supdrvTscMeasureDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, false /*fTimeout*/);
3829
3830/** @todo Add a resumable state to pArgs so we don't waste time if we time
3831 * out or something. Timeouts are legit, any of the two CPUs may get
3832 * interrupted. */
3833
3834 /*
3835 * Start by seeing if we have a zero delta between the two CPUs.
3836 * This should normally be the case.
3837 */
3838 rc = supdrvTscDeltaVerify(pArgs, &MySync, pOtherSync, fIsMaster, GIP_TSC_DELTA_INITIAL_MASTER_VALUE);
3839 if (RT_SUCCESS(rc))
3840 {
3841 if (fIsMaster)
3842 {
3843 ASMAtomicWriteS64(&pGipCpuWorker->i64TSCDelta, GIP_TSC_DELTA_INITIAL_MASTER_VALUE);
3844 RTCpuSetDelByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet);
3845 RTCpuSetAddByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpuWorker->iCpuSet);
3846 }
3847 }
3848 /*
3849 * If the verification didn't time out, do regular delta measurements.
3850 * We retry this until we get a reasonable value.
3851 */
3852 else if (rc != VERR_TIMEOUT)
3853 {
3854 Assert(pGipCpuWorker->i64TSCDelta == INT64_MAX);
3855 for (iTry = 0; iTry < 12; iTry++)
3856 {
3857 /*
3858 * Check the state before we start.
3859 */
3860 uint32_t u32Tmp = ASMAtomicReadU32(&MySync.uSyncVar);
3861 if ( u32Tmp != GIP_TSC_DELTA_SYNC2_READY
3862 && (fIsMaster || u32Tmp != GIP_TSC_DELTA_SYNC2_STEADY) /* worker may be late prepping for the next round */ )
3863 {
3864 TSCDELTA_DBG_SYNC_MSG(("sync/loop/%s: #0 iTry=%u MyState=%#x\n", fIsMaster ? "master" : "worker", iTry, u32Tmp));
3865 break;
3866 }
3867
3868 /*
3869 * Do the measurements.
3870 */
3871#ifdef GIP_TSC_DELTA_METHOD_1
3872 supdrvTscDeltaMethod1Loop(pArgs, &MySync, pOtherSync, fIsMaster, iTry);
3873#elif defined(GIP_TSC_DELTA_METHOD_2)
3874 supdrvTscDeltaMethod2Loop(pArgs, &MySync, pOtherSync, fIsMaster, iTry);
3875#else
3876# error "huh??"
3877#endif
3878
3879 /*
3880 * Check the state.
3881 */
3882 u32Tmp = ASMAtomicReadU32(&MySync.uSyncVar);
3883 if ( u32Tmp != GIP_TSC_DELTA_SYNC2_READY
3884 && (fIsMaster || u32Tmp != GIP_TSC_DELTA_SYNC2_STEADY) /* worker may be late prepping for the next round */ )
3885 {
3886 if (fIsMaster)
3887 TSCDELTA_DBG_SYNC_MSG(("sync/loop/master: #1 iTry=%u MyState=%#x\n", iTry, u32Tmp));
3888 else
3889 TSCDELTA_DBG_SYNC_MSG2(("sync/loop/worker: #1 iTry=%u MyState=%#x\n", iTry, u32Tmp));
3890 break;
3891 }
3892
3893 /*
3894 * Success? If so, stop trying. Master decides.
3895 */
3896 if (fIsMaster)
3897 {
3898 if (pGipCpuWorker->i64TSCDelta != INT64_MAX)
3899 {
3900 RTCpuSetDelByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet);
3901 RTCpuSetAddByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpuWorker->iCpuSet);
3902 TSCDELTA_DBG_SYNC_MSG2(("sync/loop/master: #9 iTry=%u MyState=%#x\n", iTry, MySync.uSyncVar));
3903 break;
3904 }
3905 }
3906 }
3907 if (fIsMaster)
3908 pArgs->iTry = iTry;
3909 }
3910
3911 /*
3912 * End the synchronization dance. We tell the other that we're done,
3913 * then wait for the same kind of reply.
3914 */
3915 ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_FINAL);
3916 ASMAtomicWriteNullPtr(ppMySync);
3917 iTry = 0;
3918 TSCDELTA_DBG_START_LOOP();
3919 while (ASMAtomicReadU32(&MySync.uSyncVar) != GIP_TSC_DELTA_SYNC2_FINAL)
3920 {
3921 iTry++;
3922 if ( iTry == 0
3923 && !RTMpIsCpuOnline(fIsMaster ? pGipCpuWorker->idCpu : pGipCpuMaster->idCpu))
3924 break; /* this really shouldn't happen. */
3925 TSCDELTA_DBG_CHECK_LOOP();
3926 ASMNopPause();
3927 }
3928
3929 /*
3930 * Collect some runtime stats.
3931 */
3932 if (fIsMaster)
3933 pArgs->cElapsedMasterTscTicks = ASMReadTSC() - MySync.uTscStart;
3934 else
3935 pArgs->cElapsedWorkerTscTicks = ASMReadTSC() - MySync.uTscStart;
3936 return 0;
3937}
3938
3939/**
3940 * Callback used by supdrvTscMeasureInitialDeltas() to read the TSC on two CPUs
3941 * and compute the delta between them.
3942 *
3943 * @param idCpu The CPU we are current scheduled on.
3944 * @param pvUser1 Pointer to a parameter package (SUPDRVGIPTSCDELTARGS).
3945 * @param pvUser2 Unused.
3946 */
3947static DECLCALLBACK(void) supdrvTscMeasureDeltaCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
3948{
3949 supdrvTscMeasureDeltaCallbackUnwrapped(idCpu, (PSUPDRVGIPTSCDELTARGS)pvUser1);
3950 RT_NOREF1(pvUser2);
3951}
3952
3953
3954/**
3955 * Measures the TSC delta between the master GIP CPU and one specified worker
3956 * CPU.
3957 *
3958 * @returns VBox status code.
3959 * @retval VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED on pure measurement
3960 * failure.
3961 * @param pDevExt Pointer to the device instance data.
3962 * @param idxWorker The index of the worker CPU from the GIP's array of
3963 * CPUs.
3964 *
3965 * @remarks This must be called with preemption enabled!
3966 */
3967static int supdrvTscMeasureDeltaOne(PSUPDRVDEVEXT pDevExt, uint32_t idxWorker)
3968{
3969 int rc;
3970 int rc2;
3971 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
3972 RTCPUID idMaster = pDevExt->idGipMaster;
3973 PSUPGIPCPU pGipCpuWorker = &pGip->aCPUs[idxWorker];
3974 PSUPGIPCPU pGipCpuMaster;
3975 uint32_t iGipCpuMaster;
3976 uint32_t u32Tmp;
3977
3978 /* Validate input a bit. */
3979 AssertReturn(pGip, VERR_INVALID_PARAMETER);
3980 Assert(pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED);
3981 Assert(RTThreadPreemptIsEnabled(NIL_RTTHREAD));
3982
3983 /*
3984 * Don't attempt measuring the delta for the GIP master.
3985 */
3986 if (pGipCpuWorker->idCpu == idMaster)
3987 {
3988 if (pGipCpuWorker->i64TSCDelta == INT64_MAX) /* This shouldn't happen, but just in case. */
3989 ASMAtomicWriteS64(&pGipCpuWorker->i64TSCDelta, GIP_TSC_DELTA_INITIAL_MASTER_VALUE);
3990 return VINF_SUCCESS;
3991 }
3992
3993 /*
3994 * One measurement at a time, at least for now. We might be using
3995 * broadcast IPIs so, so be nice to the rest of the system.
3996 */
3997#ifdef SUPDRV_USE_MUTEX_FOR_GIP
3998 rc = RTSemMutexRequest(pDevExt->mtxTscDelta, RT_INDEFINITE_WAIT);
3999#else
4000 rc = RTSemFastMutexRequest(pDevExt->mtxTscDelta);
4001#endif
4002 if (RT_FAILURE(rc))
4003 return rc;
4004
4005 /*
4006 * If the CPU has hyper-threading and the APIC IDs of the master and worker are adjacent,
4007 * try pick a different master. (This fudge only works with multi core systems.)
4008 * ASSUMES related threads have adjacent APIC IDs. ASSUMES two threads per core.
4009 *
4010 * We skip this on AMDs for now as their HTT is different from Intel's and
4011 * it doesn't seem to have any favorable effect on the results.
4012 *
4013 * If the master is offline, we need a new master too, so share the code.
4014 */
4015 iGipCpuMaster = supdrvGipFindCpuIndexForCpuId(pGip, idMaster);
4016 AssertReturn(iGipCpuMaster < pGip->cCpus, VERR_INVALID_CPU_ID);
4017 pGipCpuMaster = &pGip->aCPUs[iGipCpuMaster];
4018 if ( ( (pGipCpuMaster->idApic & ~1) == (pGipCpuWorker->idApic & ~1)
4019 && pGip->cOnlineCpus > 2
4020 && ASMHasCpuId()
4021 && ASMIsValidStdRange(ASMCpuId_EAX(0))
4022 && (ASMCpuId_EDX(1) & X86_CPUID_FEATURE_EDX_HTT)
4023 && ( (!ASMIsAmdCpu() && !ASMIsHygonCpu())
4024 || ASMGetCpuFamily(u32Tmp = ASMCpuId_EAX(1)) > 0x15
4025 || ( ASMGetCpuFamily(u32Tmp) == 0x15 /* Piledriver+, not bulldozer (FX-4150 didn't like it). */
4026 && ASMGetCpuModelAMD(u32Tmp) >= 0x02) ) )
4027 || !RTMpIsCpuOnline(idMaster) )
4028 {
4029 uint32_t i;
4030 for (i = 0; i < pGip->cCpus; i++)
4031 if ( i != iGipCpuMaster
4032 && i != idxWorker
4033 && pGip->aCPUs[i].enmState == SUPGIPCPUSTATE_ONLINE
4034 && pGip->aCPUs[i].i64TSCDelta != INT64_MAX
4035 && pGip->aCPUs[i].idCpu != NIL_RTCPUID
4036 && pGip->aCPUs[i].idCpu != idMaster /* paranoia starts here... */
4037 && pGip->aCPUs[i].idCpu != pGipCpuWorker->idCpu
4038 && pGip->aCPUs[i].idApic != pGipCpuWorker->idApic
4039 && pGip->aCPUs[i].idApic != pGipCpuMaster->idApic
4040 && RTMpIsCpuOnline(pGip->aCPUs[i].idCpu))
4041 {
4042 iGipCpuMaster = i;
4043 pGipCpuMaster = &pGip->aCPUs[i];
4044 idMaster = pGipCpuMaster->idCpu;
4045 break;
4046 }
4047 }
4048
4049 if (RTCpuSetIsMemberByIndex(&pGip->OnlineCpuSet, pGipCpuWorker->iCpuSet))
4050 {
4051 /*
4052 * Initialize data package for the RTMpOnPair callback.
4053 */
4054 PSUPDRVGIPTSCDELTARGS pArgs = (PSUPDRVGIPTSCDELTARGS)RTMemAllocZ(sizeof(*pArgs));
4055 if (pArgs)
4056 {
4057 pArgs->pWorker = pGipCpuWorker;
4058 pArgs->pMaster = pGipCpuMaster;
4059 pArgs->pDevExt = pDevExt;
4060 pArgs->pSyncMaster = NULL;
4061 pArgs->pSyncWorker = NULL;
4062 pArgs->cMaxTscTicks = ASMAtomicReadU64(&pGip->u64CpuHz) / 512; /* 1953 us */
4063
4064 /*
4065 * Do the RTMpOnPair call. We reset i64TSCDelta first so we
4066 * and supdrvTscMeasureDeltaCallback can use it as a success check.
4067 */
4068 /** @todo Store the i64TSCDelta result in pArgs first? Perhaps deals with
4069 * that when doing the restart loop reorg. */
4070 ASMAtomicWriteS64(&pGipCpuWorker->i64TSCDelta, INT64_MAX);
4071 rc = RTMpOnPair(pGipCpuMaster->idCpu, pGipCpuWorker->idCpu, RTMPON_F_CONCURRENT_EXEC,
4072 supdrvTscMeasureDeltaCallback, pArgs, NULL);
4073 if (RT_SUCCESS(rc))
4074 {
4075#if 0
4076 SUPR0Printf("mponpair ticks: %9llu %9llu max: %9llu iTry: %u%s\n", pArgs->cElapsedMasterTscTicks,
4077 pArgs->cElapsedWorkerTscTicks, pArgs->cMaxTscTicks, pArgs->iTry,
4078 pArgs->fTimedOut ? " timed out" :"");
4079#endif
4080#if 0
4081 SUPR0Printf("rcVerify=%d iVerifyBadTscDiff=%lld cMinVerifyTscTicks=%lld cMaxVerifyTscTicks=%lld\n",
4082 pArgs->rcVerify, pArgs->iVerifyBadTscDiff, pArgs->cMinVerifyTscTicks, pArgs->cMaxVerifyTscTicks);
4083#endif
4084 if (RT_LIKELY(pGipCpuWorker->i64TSCDelta != INT64_MAX))
4085 {
4086 /*
4087 * Work the TSC delta applicability rating. It starts
4088 * optimistic in supdrvGipInit, we downgrade it here.
4089 */
4090 SUPGIPUSETSCDELTA enmRating;
4091 if ( pGipCpuWorker->i64TSCDelta > GIP_TSC_DELTA_THRESHOLD_ROUGHLY_ZERO
4092 || pGipCpuWorker->i64TSCDelta < -GIP_TSC_DELTA_THRESHOLD_ROUGHLY_ZERO)
4093 enmRating = SUPGIPUSETSCDELTA_NOT_ZERO;
4094 else if ( pGipCpuWorker->i64TSCDelta > GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO
4095 || pGipCpuWorker->i64TSCDelta < -GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO)
4096 enmRating = SUPGIPUSETSCDELTA_ROUGHLY_ZERO;
4097 else
4098 enmRating = SUPGIPUSETSCDELTA_PRACTICALLY_ZERO;
4099 if (pGip->enmUseTscDelta < enmRating)
4100 {
4101 AssertCompile(sizeof(pGip->enmUseTscDelta) == sizeof(uint32_t));
4102 ASMAtomicWriteU32((uint32_t volatile *)&pGip->enmUseTscDelta, enmRating);
4103 }
4104 }
4105 else
4106 rc = VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED;
4107 }
4108 /** @todo return try-again if we get an offline CPU error. */
4109
4110 RTMemFree(pArgs);
4111 }
4112 else
4113 rc = VERR_NO_MEMORY;
4114 }
4115 else
4116 rc = VERR_CPU_OFFLINE;
4117
4118 /*
4119 * We're done now.
4120 */
4121#ifdef SUPDRV_USE_MUTEX_FOR_GIP
4122 rc2 = RTSemMutexRelease(pDevExt->mtxTscDelta); AssertRC(rc2);
4123#else
4124 rc2 = RTSemFastMutexRelease(pDevExt->mtxTscDelta); AssertRC(rc2);
4125#endif
4126 return rc;
4127}
4128
4129
4130/**
4131 * Resets the TSC-delta related TSC samples and optionally the deltas
4132 * themselves.
4133 *
4134 * @param pDevExt Pointer to the device instance data.
4135 * @param fResetTscDeltas Whether the TSC-deltas are also to be reset.
4136 *
4137 * @remarks This might be called while holding a spinlock!
4138 */
4139static void supdrvTscResetSamples(PSUPDRVDEVEXT pDevExt, bool fResetTscDeltas)
4140{
4141 unsigned iCpu;
4142 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
4143 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
4144 {
4145 PSUPGIPCPU pGipCpu = &pGip->aCPUs[iCpu];
4146 ASMAtomicWriteU64(&pGipCpu->u64TSCSample, GIP_TSC_DELTA_RSVD);
4147 if (fResetTscDeltas)
4148 {
4149 RTCpuSetDelByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpu->iCpuSet);
4150 ASMAtomicWriteS64(&pGipCpu->i64TSCDelta, INT64_MAX);
4151 }
4152 }
4153}
4154
4155
4156/**
4157 * Picks an online CPU as the master TSC for TSC-delta computations.
4158 *
4159 * @returns VBox status code.
4160 * @param pDevExt Pointer to the device instance data.
4161 * @param pidxMaster Where to store the CPU array index of the chosen
4162 * master. Optional, can be NULL.
4163 */
4164static int supdrvTscPickMaster(PSUPDRVDEVEXT pDevExt, uint32_t *pidxMaster)
4165{
4166 /*
4167 * Pick the first CPU online as the master TSC and make it the new GIP master based
4168 * on the APIC ID.
4169 *
4170 * Technically we can simply use "idGipMaster" but doing this gives us master as CPU 0
4171 * in most cases making it nicer/easier for comparisons. It is safe to update the GIP
4172 * master as this point since the sync/async timer isn't created yet.
4173 */
4174 unsigned iCpu;
4175 uint32_t idxMaster = UINT32_MAX;
4176 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
4177 for (iCpu = 0; iCpu < RT_ELEMENTS(pGip->aiCpuFromApicId); iCpu++)
4178 {
4179 uint16_t idxCpu = pGip->aiCpuFromApicId[iCpu];
4180 if (idxCpu != UINT16_MAX)
4181 {
4182 PSUPGIPCPU pGipCpu = &pGip->aCPUs[idxCpu];
4183 if (RTCpuSetIsMemberByIndex(&pGip->OnlineCpuSet, pGipCpu->iCpuSet))
4184 {
4185 idxMaster = idxCpu;
4186 pGipCpu->i64TSCDelta = GIP_TSC_DELTA_INITIAL_MASTER_VALUE;
4187 ASMAtomicWriteSize(&pDevExt->idGipMaster, pGipCpu->idCpu);
4188 if (pidxMaster)
4189 *pidxMaster = idxMaster;
4190 return VINF_SUCCESS;
4191 }
4192 }
4193 }
4194 return VERR_CPU_OFFLINE;
4195}
4196
4197
4198/**
4199 * Performs the initial measurements of the TSC deltas between CPUs.
4200 *
4201 * This is called by supdrvGipCreate(), supdrvGipPowerNotificationCallback() or
4202 * triggered by it if threaded.
4203 *
4204 * @returns VBox status code.
4205 * @param pDevExt Pointer to the device instance data.
4206 *
4207 * @remarks Must be called only after supdrvGipInitOnCpu() as this function uses
4208 * idCpu, GIP's online CPU set which are populated in
4209 * supdrvGipInitOnCpu().
4210 */
4211static int supdrvTscMeasureInitialDeltas(PSUPDRVDEVEXT pDevExt)
4212{
4213 PSUPGIPCPU pGipCpuMaster;
4214 unsigned iCpu;
4215 unsigned iOddEven;
4216 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
4217 uint32_t idxMaster = UINT32_MAX;
4218 uint32_t cMpOnOffEvents = ASMAtomicReadU32(&pDevExt->cMpOnOffEvents);
4219
4220 Assert(pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED);
4221 supdrvTscResetSamples(pDevExt, true /* fClearDeltas */);
4222 int rc = supdrvTscPickMaster(pDevExt, &idxMaster);
4223 if (RT_FAILURE(rc))
4224 {
4225 SUPR0Printf("Failed to pick a CPU master for TSC-delta measurements rc=%Rrc\n", rc);
4226 return rc;
4227 }
4228 AssertReturn(idxMaster < pGip->cCpus, VERR_INVALID_CPU_INDEX);
4229 pGipCpuMaster = &pGip->aCPUs[idxMaster];
4230 Assert(pDevExt->idGipMaster == pGipCpuMaster->idCpu);
4231
4232 /*
4233 * If there is only a single CPU online we have nothing to do.
4234 */
4235 if (pGip->cOnlineCpus <= 1)
4236 {
4237 AssertReturn(pGip->cOnlineCpus > 0, VERR_INTERNAL_ERROR_5);
4238 return VINF_SUCCESS;
4239 }
4240
4241 /*
4242 * Loop thru the GIP CPU array and get deltas for each CPU (except the
4243 * master). We do the CPUs with the even numbered APIC IDs first so that
4244 * we've got alternative master CPUs to pick from on hyper-threaded systems.
4245 */
4246 for (iOddEven = 0; iOddEven < 2; iOddEven++)
4247 {
4248 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
4249 {
4250 PSUPGIPCPU pGipCpuWorker = &pGip->aCPUs[iCpu];
4251 if ( iCpu != idxMaster
4252 && (iOddEven > 0 || (pGipCpuWorker->idApic & 1) == 0)
4253 && RTCpuSetIsMemberByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet))
4254 {
4255 rc = supdrvTscMeasureDeltaOne(pDevExt, iCpu);
4256 if (RT_FAILURE(rc))
4257 {
4258 SUPR0Printf("supdrvTscMeasureDeltaOne failed. rc=%d CPU[%u].idCpu=%u Master[%u].idCpu=%u\n", rc, iCpu,
4259 pGipCpuWorker->idCpu, idxMaster, pDevExt->idGipMaster, pGipCpuMaster->idCpu);
4260 break;
4261 }
4262
4263 if (ASMAtomicReadU32(&pDevExt->cMpOnOffEvents) != cMpOnOffEvents)
4264 {
4265 SUPR0Printf("One or more CPUs transitioned between online & offline states. I'm confused, retry...\n");
4266 rc = VERR_TRY_AGAIN;
4267 break;
4268 }
4269 }
4270 }
4271 }
4272
4273 return rc;
4274}
4275
4276
4277#ifdef SUPDRV_USE_TSC_DELTA_THREAD
4278
4279/**
4280 * Switches the TSC-delta measurement thread into the butchered state.
4281 *
4282 * @returns VBox status code.
4283 * @param pDevExt Pointer to the device instance data.
4284 * @param fSpinlockHeld Whether the TSC-delta spinlock is held or not.
4285 * @param pszFailed An error message to log.
4286 * @param rcFailed The error code to exit the thread with.
4287 */
4288static int supdrvTscDeltaThreadButchered(PSUPDRVDEVEXT pDevExt, bool fSpinlockHeld, const char *pszFailed, int rcFailed)
4289{
4290 if (!fSpinlockHeld)
4291 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4292
4293 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Butchered;
4294 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4295 OSDBGPRINT(("supdrvTscDeltaThreadButchered: %s. rc=%Rrc\n", pszFailed, rcFailed));
4296 return rcFailed;
4297}
4298
4299
4300/**
4301 * The TSC-delta measurement thread.
4302 *
4303 * @returns VBox status code.
4304 * @param hThread The thread handle.
4305 * @param pvUser Opaque pointer to the device instance data.
4306 */
4307static DECLCALLBACK(int) supdrvTscDeltaThread(RTTHREAD hThread, void *pvUser)
4308{
4309 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
4310 int rc = VERR_INTERNAL_ERROR_2;
4311 for (;;)
4312 {
4313 /*
4314 * Switch on the current state.
4315 */
4316 SUPDRVTSCDELTATHREADSTATE enmState;
4317 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4318 enmState = pDevExt->enmTscDeltaThreadState;
4319 switch (enmState)
4320 {
4321 case kTscDeltaThreadState_Creating:
4322 {
4323 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Listening;
4324 rc = RTSemEventSignal(pDevExt->hTscDeltaEvent);
4325 if (RT_FAILURE(rc))
4326 return supdrvTscDeltaThreadButchered(pDevExt, true /* fSpinlockHeld */, "RTSemEventSignal", rc);
4327 RT_FALL_THRU();
4328 }
4329
4330 case kTscDeltaThreadState_Listening:
4331 {
4332 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4333
4334 /*
4335 * Linux counts uninterruptible sleeps as load, hence we shall do a
4336 * regular, interruptible sleep here and ignore wake ups due to signals.
4337 * See task_contributes_to_load() in include/linux/sched.h in the Linux sources.
4338 */
4339 rc = RTThreadUserWaitNoResume(hThread, pDevExt->cMsTscDeltaTimeout);
4340 if ( RT_FAILURE(rc)
4341 && rc != VERR_TIMEOUT
4342 && rc != VERR_INTERRUPTED)
4343 return supdrvTscDeltaThreadButchered(pDevExt, false /* fSpinlockHeld */, "RTThreadUserWait", rc);
4344 RTThreadUserReset(hThread);
4345 break;
4346 }
4347
4348 case kTscDeltaThreadState_WaitAndMeasure:
4349 {
4350 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Measuring;
4351 rc = RTSemEventSignal(pDevExt->hTscDeltaEvent); /* (Safe on windows as long as spinlock isn't IRQ safe.) */
4352 if (RT_FAILURE(rc))
4353 return supdrvTscDeltaThreadButchered(pDevExt, true /* fSpinlockHeld */, "RTSemEventSignal", rc);
4354 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4355 RTThreadSleep(1);
4356 RT_FALL_THRU();
4357 }
4358
4359 case kTscDeltaThreadState_Measuring:
4360 {
4361 if (pDevExt->fTscThreadRecomputeAllDeltas)
4362 {
4363 int cTries = 8;
4364 int cMsWaitPerTry = 10;
4365 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
4366 Assert(pGip);
4367 do
4368 {
4369 RTCpuSetCopy(&pDevExt->TscDeltaCpuSet, &pGip->OnlineCpuSet);
4370 rc = supdrvTscMeasureInitialDeltas(pDevExt);
4371 if ( RT_SUCCESS(rc)
4372 || ( RT_FAILURE(rc)
4373 && rc != VERR_TRY_AGAIN
4374 && rc != VERR_CPU_OFFLINE))
4375 {
4376 break;
4377 }
4378 RTThreadSleep(cMsWaitPerTry);
4379 } while (cTries-- > 0);
4380 pDevExt->fTscThreadRecomputeAllDeltas = false;
4381 }
4382 else
4383 {
4384 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
4385 unsigned iCpu;
4386
4387 /* Measure TSC-deltas only for the CPUs that are in the set. */
4388 rc = VINF_SUCCESS;
4389 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
4390 {
4391 PSUPGIPCPU pGipCpuWorker = &pGip->aCPUs[iCpu];
4392 if (RTCpuSetIsMemberByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet))
4393 {
4394 if (pGipCpuWorker->i64TSCDelta == INT64_MAX)
4395 {
4396 int rc2 = supdrvTscMeasureDeltaOne(pDevExt, iCpu);
4397 if (RT_FAILURE(rc2) && RT_SUCCESS(rc))
4398 rc = rc2;
4399 }
4400 else
4401 {
4402 /*
4403 * The thread/someone must've called SUPR0TscDeltaMeasureBySetIndex(),
4404 * mark the delta as fine to get the timer thread off our back.
4405 */
4406 RTCpuSetDelByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet);
4407 RTCpuSetAddByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpuWorker->iCpuSet);
4408 }
4409 }
4410 }
4411 }
4412 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4413 if (pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Measuring)
4414 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Listening;
4415 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4416 Assert(rc != VERR_NOT_AVAILABLE); /* VERR_NOT_AVAILABLE is used as init value, see supdrvTscDeltaThreadInit(). */
4417 ASMAtomicWriteS32(&pDevExt->rcTscDelta, rc);
4418 break;
4419 }
4420
4421 case kTscDeltaThreadState_Terminating:
4422 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Destroyed;
4423 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4424 return VINF_SUCCESS;
4425
4426 case kTscDeltaThreadState_Butchered:
4427 default:
4428 return supdrvTscDeltaThreadButchered(pDevExt, true /* fSpinlockHeld */, "Invalid state", VERR_INVALID_STATE);
4429 }
4430 }
4431 /* not reached */
4432}
4433
4434
4435/**
4436 * Waits for the TSC-delta measurement thread to respond to a state change.
4437 *
4438 * @returns VINF_SUCCESS on success, VERR_TIMEOUT if it doesn't respond in time,
4439 * other error code on internal error.
4440 *
4441 * @param pDevExt The device instance data.
4442 * @param enmCurState The current state.
4443 * @param enmNewState The new state we're waiting for it to enter.
4444 */
4445static int supdrvTscDeltaThreadWait(PSUPDRVDEVEXT pDevExt, SUPDRVTSCDELTATHREADSTATE enmCurState,
4446 SUPDRVTSCDELTATHREADSTATE enmNewState)
4447{
4448 SUPDRVTSCDELTATHREADSTATE enmActualState;
4449 int rc;
4450
4451 /*
4452 * Wait a short while for the expected state transition.
4453 */
4454 RTSemEventWait(pDevExt->hTscDeltaEvent, RT_MS_1SEC);
4455 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4456 enmActualState = pDevExt->enmTscDeltaThreadState;
4457 if (enmActualState == enmNewState)
4458 {
4459 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4460 rc = VINF_SUCCESS;
4461 }
4462 else if (enmActualState == enmCurState)
4463 {
4464 /*
4465 * Wait longer if the state has not yet transitioned to the one we want.
4466 */
4467 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4468 rc = RTSemEventWait(pDevExt->hTscDeltaEvent, 50 * RT_MS_1SEC);
4469 if ( RT_SUCCESS(rc)
4470 || rc == VERR_TIMEOUT)
4471 {
4472 /*
4473 * Check the state whether we've succeeded.
4474 */
4475 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4476 enmActualState = pDevExt->enmTscDeltaThreadState;
4477 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4478 if (enmActualState == enmNewState)
4479 rc = VINF_SUCCESS;
4480 else if (enmActualState == enmCurState)
4481 {
4482 rc = VERR_TIMEOUT;
4483 OSDBGPRINT(("supdrvTscDeltaThreadWait: timed out state transition. enmActualState=%d enmNewState=%d\n",
4484 enmActualState, enmNewState));
4485 }
4486 else
4487 {
4488 rc = VERR_INTERNAL_ERROR;
4489 OSDBGPRINT(("supdrvTscDeltaThreadWait: invalid state transition from %d to %d, expected %d\n", enmCurState,
4490 enmActualState, enmNewState));
4491 }
4492 }
4493 else
4494 OSDBGPRINT(("supdrvTscDeltaThreadWait: RTSemEventWait failed. rc=%Rrc\n", rc));
4495 }
4496 else
4497 {
4498 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4499 OSDBGPRINT(("supdrvTscDeltaThreadWait: invalid state %d when transitioning from %d to %d\n",
4500 enmActualState, enmCurState, enmNewState));
4501 rc = VERR_INTERNAL_ERROR;
4502 }
4503
4504 return rc;
4505}
4506
4507
4508/**
4509 * Signals the TSC-delta thread to start measuring TSC-deltas.
4510 *
4511 * @param pDevExt Pointer to the device instance data.
4512 * @param fForceAll Force re-calculating TSC-deltas on all CPUs.
4513 */
4514static void supdrvTscDeltaThreadStartMeasurement(PSUPDRVDEVEXT pDevExt, bool fForceAll)
4515{
4516 if (pDevExt->hTscDeltaThread != NIL_RTTHREAD)
4517 {
4518 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4519 if ( pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Listening
4520 || pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Measuring)
4521 {
4522 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_WaitAndMeasure;
4523 if (fForceAll)
4524 pDevExt->fTscThreadRecomputeAllDeltas = true;
4525 }
4526 else if ( pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_WaitAndMeasure
4527 && fForceAll)
4528 pDevExt->fTscThreadRecomputeAllDeltas = true;
4529 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4530 RTThreadUserSignal(pDevExt->hTscDeltaThread);
4531 }
4532}
4533
4534
4535/**
4536 * Terminates the actual thread running supdrvTscDeltaThread().
4537 *
4538 * This is an internal worker function for supdrvTscDeltaThreadInit() and
4539 * supdrvTscDeltaTerm().
4540 *
4541 * @param pDevExt Pointer to the device instance data.
4542 */
4543static void supdrvTscDeltaThreadTerminate(PSUPDRVDEVEXT pDevExt)
4544{
4545 int rc;
4546 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4547 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Terminating;
4548 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4549 RTThreadUserSignal(pDevExt->hTscDeltaThread);
4550 rc = RTThreadWait(pDevExt->hTscDeltaThread, 50 * RT_MS_1SEC, NULL /* prc */);
4551 if (RT_FAILURE(rc))
4552 {
4553 /* Signal a few more times before giving up. */
4554 int cTriesLeft = 5;
4555 while (--cTriesLeft > 0)
4556 {
4557 RTThreadUserSignal(pDevExt->hTscDeltaThread);
4558 rc = RTThreadWait(pDevExt->hTscDeltaThread, 2 * RT_MS_1SEC, NULL /* prc */);
4559 if (rc != VERR_TIMEOUT)
4560 break;
4561 }
4562 }
4563}
4564
4565
4566/**
4567 * Initializes and spawns the TSC-delta measurement thread.
4568 *
4569 * A thread is required for servicing re-measurement requests from events like
4570 * CPUs coming online, suspend/resume etc. as it cannot be done synchronously
4571 * under all contexts on all OSs.
4572 *
4573 * @returns VBox status code.
4574 * @param pDevExt Pointer to the device instance data.
4575 *
4576 * @remarks Must only be called -after- initializing GIP and setting up MP
4577 * notifications!
4578 */
4579static int supdrvTscDeltaThreadInit(PSUPDRVDEVEXT pDevExt)
4580{
4581 int rc;
4582 Assert(pDevExt->pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED);
4583 rc = RTSpinlockCreate(&pDevExt->hTscDeltaSpinlock, RTSPINLOCK_FLAGS_INTERRUPT_UNSAFE, "VBoxTscSpnLck");
4584 if (RT_SUCCESS(rc))
4585 {
4586 rc = RTSemEventCreate(&pDevExt->hTscDeltaEvent);
4587 if (RT_SUCCESS(rc))
4588 {
4589 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Creating;
4590 pDevExt->cMsTscDeltaTimeout = 60000;
4591 rc = RTThreadCreate(&pDevExt->hTscDeltaThread, supdrvTscDeltaThread, pDevExt, 0 /* cbStack */,
4592 RTTHREADTYPE_DEFAULT, RTTHREADFLAGS_WAITABLE, "VBoxTscThread");
4593 if (RT_SUCCESS(rc))
4594 {
4595 rc = supdrvTscDeltaThreadWait(pDevExt, kTscDeltaThreadState_Creating, kTscDeltaThreadState_Listening);
4596 if (RT_SUCCESS(rc))
4597 {
4598 ASMAtomicWriteS32(&pDevExt->rcTscDelta, VERR_NOT_AVAILABLE);
4599 return rc;
4600 }
4601
4602 OSDBGPRINT(("supdrvTscDeltaInit: supdrvTscDeltaThreadWait failed. rc=%Rrc\n", rc));
4603 supdrvTscDeltaThreadTerminate(pDevExt);
4604 }
4605 else
4606 OSDBGPRINT(("supdrvTscDeltaInit: RTThreadCreate failed. rc=%Rrc\n", rc));
4607 RTSemEventDestroy(pDevExt->hTscDeltaEvent);
4608 pDevExt->hTscDeltaEvent = NIL_RTSEMEVENT;
4609 }
4610 else
4611 OSDBGPRINT(("supdrvTscDeltaInit: RTSemEventCreate failed. rc=%Rrc\n", rc));
4612 RTSpinlockDestroy(pDevExt->hTscDeltaSpinlock);
4613 pDevExt->hTscDeltaSpinlock = NIL_RTSPINLOCK;
4614 }
4615 else
4616 OSDBGPRINT(("supdrvTscDeltaInit: RTSpinlockCreate failed. rc=%Rrc\n", rc));
4617
4618 return rc;
4619}
4620
4621
4622/**
4623 * Terminates the TSC-delta measurement thread and cleanup.
4624 *
4625 * @param pDevExt Pointer to the device instance data.
4626 */
4627static void supdrvTscDeltaTerm(PSUPDRVDEVEXT pDevExt)
4628{
4629 if ( pDevExt->hTscDeltaSpinlock != NIL_RTSPINLOCK
4630 && pDevExt->hTscDeltaEvent != NIL_RTSEMEVENT)
4631 {
4632 supdrvTscDeltaThreadTerminate(pDevExt);
4633 }
4634
4635 if (pDevExt->hTscDeltaSpinlock != NIL_RTSPINLOCK)
4636 {
4637 RTSpinlockDestroy(pDevExt->hTscDeltaSpinlock);
4638 pDevExt->hTscDeltaSpinlock = NIL_RTSPINLOCK;
4639 }
4640
4641 if (pDevExt->hTscDeltaEvent != NIL_RTSEMEVENT)
4642 {
4643 RTSemEventDestroy(pDevExt->hTscDeltaEvent);
4644 pDevExt->hTscDeltaEvent = NIL_RTSEMEVENT;
4645 }
4646
4647 ASMAtomicWriteS32(&pDevExt->rcTscDelta, VERR_NOT_AVAILABLE);
4648}
4649
4650#endif /* SUPDRV_USE_TSC_DELTA_THREAD */
4651
4652/**
4653 * Measure the TSC delta for the CPU given by its CPU set index.
4654 *
4655 * @returns VBox status code.
4656 * @retval VERR_INTERRUPTED if interrupted while waiting.
4657 * @retval VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED if we were unable to get a
4658 * measurement.
4659 * @retval VERR_CPU_OFFLINE if the specified CPU is offline.
4660 *
4661 * @param pSession The caller's session. GIP must've been mapped.
4662 * @param iCpuSet The CPU set index of the CPU to measure.
4663 * @param fFlags Flags, SUP_TSCDELTA_MEASURE_F_XXX.
4664 * @param cMsWaitRetry Number of milliseconds to wait between each retry.
4665 * @param cMsWaitThread Number of milliseconds to wait for the thread to get
4666 * ready.
4667 * @param cTries Number of times to try, pass 0 for the default.
4668 */
4669SUPR0DECL(int) SUPR0TscDeltaMeasureBySetIndex(PSUPDRVSESSION pSession, uint32_t iCpuSet, uint32_t fFlags,
4670 RTMSINTERVAL cMsWaitRetry, RTMSINTERVAL cMsWaitThread, uint32_t cTries)
4671{
4672 PSUPDRVDEVEXT pDevExt;
4673 PSUPGLOBALINFOPAGE pGip;
4674 uint16_t iGipCpu;
4675 int rc;
4676#ifdef SUPDRV_USE_TSC_DELTA_THREAD
4677 uint64_t msTsStartWait;
4678 uint32_t iWaitLoop;
4679#endif
4680
4681 /*
4682 * Validate and adjust the input.
4683 */
4684 AssertReturn(SUP_IS_SESSION_VALID(pSession), VERR_INVALID_PARAMETER);
4685 if (!pSession->fGipReferenced)
4686 return VERR_WRONG_ORDER;
4687
4688 pDevExt = pSession->pDevExt;
4689 AssertReturn(SUP_IS_DEVEXT_VALID(pDevExt), VERR_INVALID_PARAMETER);
4690
4691 pGip = pDevExt->pGip;
4692 AssertPtrReturn(pGip, VERR_INTERNAL_ERROR_2);
4693
4694 AssertReturn(iCpuSet < RTCPUSET_MAX_CPUS, VERR_INVALID_CPU_INDEX);
4695 AssertReturn(iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx), VERR_INVALID_CPU_INDEX);
4696 iGipCpu = pGip->aiCpuFromCpuSetIdx[iCpuSet];
4697 AssertReturn(iGipCpu < pGip->cCpus, VERR_INVALID_CPU_INDEX);
4698
4699 if (fFlags & ~SUP_TSCDELTA_MEASURE_F_VALID_MASK)
4700 return VERR_INVALID_FLAGS;
4701
4702 /*
4703 * The request is a noop if the TSC delta isn't being used.
4704 */
4705 if (pGip->enmUseTscDelta <= SUPGIPUSETSCDELTA_ZERO_CLAIMED)
4706 return VINF_SUCCESS;
4707
4708 if (cTries == 0)
4709 cTries = 12;
4710 else if (cTries > 256)
4711 cTries = 256;
4712
4713 if (cMsWaitRetry == 0)
4714 cMsWaitRetry = 2;
4715 else if (cMsWaitRetry > 1000)
4716 cMsWaitRetry = 1000;
4717
4718#ifdef SUPDRV_USE_TSC_DELTA_THREAD
4719 /*
4720 * Has the TSC already been measured and we're not forced to redo it?
4721 */
4722 if ( pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX
4723 && !(fFlags & SUP_TSCDELTA_MEASURE_F_FORCE))
4724 return VINF_SUCCESS;
4725
4726 /*
4727 * Asynchronous request? Forward it to the thread, no waiting.
4728 */
4729 if (fFlags & SUP_TSCDELTA_MEASURE_F_ASYNC)
4730 {
4731 /** @todo Async. doesn't implement options like retries, waiting. We'll need
4732 * to pass those options to the thread somehow and implement it in the
4733 * thread. Check if anyone uses/needs fAsync before implementing this. */
4734 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4735 RTCpuSetAddByIndex(&pDevExt->TscDeltaCpuSet, iCpuSet);
4736 if ( pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Listening
4737 || pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Measuring)
4738 {
4739 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_WaitAndMeasure;
4740 rc = VINF_SUCCESS;
4741 }
4742 else if (pDevExt->enmTscDeltaThreadState != kTscDeltaThreadState_WaitAndMeasure)
4743 rc = VERR_THREAD_IS_DEAD;
4744 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4745 RTThreadUserSignal(pDevExt->hTscDeltaThread);
4746 return VINF_SUCCESS;
4747 }
4748
4749 /*
4750 * If a TSC-delta measurement request is already being serviced by the thread,
4751 * wait 'cTries' times if a retry-timeout is provided, otherwise bail as busy.
4752 */
4753 msTsStartWait = RTTimeSystemMilliTS();
4754 for (iWaitLoop = 0;; iWaitLoop++)
4755 {
4756 uint64_t cMsElapsed;
4757 SUPDRVTSCDELTATHREADSTATE enmState;
4758 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4759 enmState = pDevExt->enmTscDeltaThreadState;
4760 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4761
4762 if (enmState == kTscDeltaThreadState_Measuring)
4763 { /* Must wait, the thread is busy. */ }
4764 else if (enmState == kTscDeltaThreadState_WaitAndMeasure)
4765 { /* Must wait, this state only says what will happen next. */ }
4766 else if (enmState == kTscDeltaThreadState_Terminating)
4767 { /* Must wait, this state only says what should happen next. */ }
4768 else
4769 break; /* All other states, the thread is either idly listening or dead. */
4770
4771 /* Wait or fail. */
4772 if (cMsWaitThread == 0)
4773 return VERR_SUPDRV_TSC_DELTA_MEASUREMENT_BUSY;
4774 cMsElapsed = RTTimeSystemMilliTS() - msTsStartWait;
4775 if (cMsElapsed >= cMsWaitThread)
4776 return VERR_SUPDRV_TSC_DELTA_MEASUREMENT_BUSY;
4777
4778 rc = RTThreadSleep(RT_MIN((RTMSINTERVAL)(cMsWaitThread - cMsElapsed), RT_MIN(iWaitLoop + 1, 10)));
4779 if (rc == VERR_INTERRUPTED)
4780 return rc;
4781 }
4782#endif /* SUPDRV_USE_TSC_DELTA_THREAD */
4783
4784 /*
4785 * Try measure the TSC delta the given number of times.
4786 */
4787 for (;;)
4788 {
4789 /* Unless we're forced to measure the delta, check whether it's done already. */
4790 if ( !(fFlags & SUP_TSCDELTA_MEASURE_F_FORCE)
4791 && pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX)
4792 {
4793 rc = VINF_SUCCESS;
4794 break;
4795 }
4796
4797 /* Measure it. */
4798 rc = supdrvTscMeasureDeltaOne(pDevExt, iGipCpu);
4799 if (rc != VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED)
4800 {
4801 Assert(pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX || RT_FAILURE_NP(rc));
4802 break;
4803 }
4804
4805 /* Retry? */
4806 if (cTries <= 1)
4807 break;
4808 cTries--;
4809
4810 /* Always delay between retries (be nice to the rest of the system
4811 and avoid the BSOD hounds). */
4812 rc = RTThreadSleep(cMsWaitRetry);
4813 if (rc == VERR_INTERRUPTED)
4814 break;
4815 }
4816
4817 return rc;
4818}
4819
4820
4821/**
4822 * Service a TSC-delta measurement request.
4823 *
4824 * @returns VBox status code.
4825 * @param pDevExt Pointer to the device instance data.
4826 * @param pSession The support driver session.
4827 * @param pReq Pointer to the TSC-delta measurement request.
4828 */
4829int VBOXCALL supdrvIOCtl_TscDeltaMeasure(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, PSUPTSCDELTAMEASURE pReq)
4830{
4831 uint32_t cTries;
4832 uint32_t iCpuSet;
4833 uint32_t fFlags;
4834 RTMSINTERVAL cMsWaitRetry;
4835 RT_NOREF1(pDevExt);
4836
4837 /*
4838 * Validate and adjust/resolve the input so they can be passed onto SUPR0TscDeltaMeasureBySetIndex.
4839 */
4840 AssertPtr(pDevExt); AssertPtr(pSession); AssertPtr(pReq); /* paranoia^2 */
4841
4842 if (pReq->u.In.idCpu == NIL_RTCPUID)
4843 return VERR_INVALID_CPU_ID;
4844 iCpuSet = RTMpCpuIdToSetIndex(pReq->u.In.idCpu);
4845 if (iCpuSet >= RTCPUSET_MAX_CPUS)
4846 return VERR_INVALID_CPU_ID;
4847
4848 cTries = pReq->u.In.cRetries == 0 ? 0 : (uint32_t)pReq->u.In.cRetries + 1;
4849
4850 cMsWaitRetry = RT_MAX(pReq->u.In.cMsWaitRetry, 5);
4851
4852 fFlags = 0;
4853 if (pReq->u.In.fAsync)
4854 fFlags |= SUP_TSCDELTA_MEASURE_F_ASYNC;
4855 if (pReq->u.In.fForce)
4856 fFlags |= SUP_TSCDELTA_MEASURE_F_FORCE;
4857
4858 return SUPR0TscDeltaMeasureBySetIndex(pSession, iCpuSet, fFlags, cMsWaitRetry,
4859 cTries == 0 ? 5 * RT_MS_1SEC : cMsWaitRetry * cTries /*cMsWaitThread*/,
4860 cTries);
4861}
4862
4863
4864/**
4865 * Reads TSC with delta applied.
4866 *
4867 * Will try to resolve delta value INT64_MAX before applying it. This is the
4868 * main purpose of this function, to handle the case where the delta needs to be
4869 * determined.
4870 *
4871 * @returns VBox status code.
4872 * @param pDevExt Pointer to the device instance data.
4873 * @param pSession The support driver session.
4874 * @param pReq Pointer to the TSC-read request.
4875 */
4876int VBOXCALL supdrvIOCtl_TscRead(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, PSUPTSCREAD pReq)
4877{
4878 PSUPGLOBALINFOPAGE pGip;
4879 int rc;
4880
4881 /*
4882 * Validate. We require the client to have mapped GIP (no asserting on
4883 * ring-3 preconditions).
4884 */
4885 AssertPtr(pDevExt); AssertPtr(pReq); AssertPtr(pSession); /* paranoia^2 */
4886 if (pSession->GipMapObjR3 == NIL_RTR0MEMOBJ)
4887 return VERR_WRONG_ORDER;
4888 pGip = pDevExt->pGip;
4889 AssertReturn(pGip, VERR_INTERNAL_ERROR_2);
4890
4891 /*
4892 * We're usually here because we need to apply delta, but we shouldn't be
4893 * upset if the GIP is some different mode.
4894 */
4895 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
4896 {
4897 uint32_t cTries = 0;
4898 for (;;)
4899 {
4900 /*
4901 * Start by gathering the data, using CLI for disabling preemption
4902 * while we do that.
4903 */
4904 RTCCUINTREG fEFlags = ASMIntDisableFlags();
4905 int iCpuSet = RTMpCpuIdToSetIndex(RTMpCpuId());
4906 int iGipCpu = 0; /* gcc maybe used uninitialized */
4907 if (RT_LIKELY( (unsigned)iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
4908 && (iGipCpu = pGip->aiCpuFromCpuSetIdx[iCpuSet]) < pGip->cCpus ))
4909 {
4910 int64_t i64Delta = pGip->aCPUs[iGipCpu].i64TSCDelta;
4911 pReq->u.Out.idApic = pGip->aCPUs[iGipCpu].idApic;
4912 pReq->u.Out.u64AdjustedTsc = ASMReadTSC();
4913 ASMSetFlags(fEFlags);
4914
4915 /*
4916 * If we're lucky we've got a delta, but no predictions here
4917 * as this I/O control is normally only used when the TSC delta
4918 * is set to INT64_MAX.
4919 */
4920 if (i64Delta != INT64_MAX)
4921 {
4922 pReq->u.Out.u64AdjustedTsc -= i64Delta;
4923 rc = VINF_SUCCESS;
4924 break;
4925 }
4926
4927 /* Give up after a few times. */
4928 if (cTries >= 4)
4929 {
4930 rc = VWRN_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED;
4931 break;
4932 }
4933
4934 /* Need to measure the delta an try again. */
4935 rc = supdrvTscMeasureDeltaOne(pDevExt, iGipCpu);
4936 Assert(pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX || RT_FAILURE_NP(rc));
4937 /** @todo should probably delay on failure... dpc watchdogs */
4938 }
4939 else
4940 {
4941 /* This really shouldn't happen. */
4942 AssertMsgFailed(("idCpu=%#x iCpuSet=%#x (%d)\n", RTMpCpuId(), iCpuSet, iCpuSet));
4943 pReq->u.Out.idApic = supdrvGipGetApicIdSlow();
4944 pReq->u.Out.u64AdjustedTsc = ASMReadTSC();
4945 ASMSetFlags(fEFlags);
4946 rc = VERR_INTERNAL_ERROR_5; /** @todo change to warning. */
4947 break;
4948 }
4949 }
4950 }
4951 else
4952 {
4953 /*
4954 * No delta to apply. Easy. Deal with preemption the lazy way.
4955 */
4956 RTCCUINTREG fEFlags = ASMIntDisableFlags();
4957 int iCpuSet = RTMpCpuIdToSetIndex(RTMpCpuId());
4958 int iGipCpu = 0; /* gcc may be used uninitialized */
4959 if (RT_LIKELY( (unsigned)iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
4960 && (iGipCpu = pGip->aiCpuFromCpuSetIdx[iCpuSet]) < pGip->cCpus ))
4961 pReq->u.Out.idApic = pGip->aCPUs[iGipCpu].idApic;
4962 else
4963 pReq->u.Out.idApic = supdrvGipGetApicIdSlow();
4964 pReq->u.Out.u64AdjustedTsc = ASMReadTSC();
4965 ASMSetFlags(fEFlags);
4966 rc = VINF_SUCCESS;
4967 }
4968
4969 return rc;
4970}
4971
4972
4973/**
4974 * Worker for supdrvIOCtl_GipSetFlags.
4975 *
4976 * @returns VBox status code.
4977 * @retval VERR_WRONG_ORDER if an enable-once-per-session flag is set again for
4978 * a session.
4979 *
4980 * @param pDevExt Pointer to the device instance data.
4981 * @param pSession The support driver session.
4982 * @param fOrMask The OR mask of the GIP flags, see SUPGIP_FLAGS_XXX.
4983 * @param fAndMask The AND mask of the GIP flags, see SUPGIP_FLAGS_XXX.
4984 *
4985 * @remarks Caller must own the GIP mutex.
4986 *
4987 * @remarks This function doesn't validate any of the flags.
4988 */
4989static int supdrvGipSetFlags(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, uint32_t fOrMask, uint32_t fAndMask)
4990{
4991 uint32_t cRefs;
4992 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
4993 AssertMsg((fOrMask & fAndMask) == fOrMask, ("%#x & %#x\n", fOrMask, fAndMask)); /* ASSUMED by code below */
4994
4995 /*
4996 * Compute GIP test-mode flags.
4997 */
4998 if (fOrMask & SUPGIP_FLAGS_TESTING_ENABLE)
4999 {
5000 if (!pSession->fGipTestMode)
5001 {
5002 Assert(pDevExt->cGipTestModeRefs < _64K);
5003 pSession->fGipTestMode = true;
5004 cRefs = ++pDevExt->cGipTestModeRefs;
5005 if (cRefs == 1)
5006 {
5007 fOrMask |= SUPGIP_FLAGS_TESTING | SUPGIP_FLAGS_TESTING_START;
5008 fAndMask &= ~SUPGIP_FLAGS_TESTING_STOP;
5009 }
5010 }
5011 else
5012 {
5013 LogRelMax(10, ("supdrvGipSetFlags: SUPGIP_FLAGS_TESTING_ENABLE already set for this session\n"));
5014 return VERR_WRONG_ORDER;
5015 }
5016 }
5017 else if ( !(fAndMask & SUPGIP_FLAGS_TESTING_ENABLE)
5018 && pSession->fGipTestMode)
5019 {
5020 Assert(pDevExt->cGipTestModeRefs > 0);
5021 Assert(pDevExt->cGipTestModeRefs < _64K);
5022 pSession->fGipTestMode = false;
5023 cRefs = --pDevExt->cGipTestModeRefs;
5024 if (!cRefs)
5025 fOrMask |= SUPGIP_FLAGS_TESTING_STOP;
5026 else
5027 fAndMask |= SUPGIP_FLAGS_TESTING_ENABLE;
5028 }
5029
5030 /*
5031 * Commit the flags. This should be done as atomically as possible
5032 * since the flag consumers won't be holding the GIP mutex.
5033 */
5034 ASMAtomicOrU32(&pGip->fFlags, fOrMask);
5035 ASMAtomicAndU32(&pGip->fFlags, fAndMask);
5036
5037 return VINF_SUCCESS;
5038}
5039
5040
5041/**
5042 * Sets GIP test mode parameters.
5043 *
5044 * @returns VBox status code.
5045 * @param pDevExt Pointer to the device instance data.
5046 * @param pSession The support driver session.
5047 * @param fOrMask The OR mask of the GIP flags, see SUPGIP_FLAGS_XXX.
5048 * @param fAndMask The AND mask of the GIP flags, see SUPGIP_FLAGS_XXX.
5049 */
5050int VBOXCALL supdrvIOCtl_GipSetFlags(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, uint32_t fOrMask, uint32_t fAndMask)
5051{
5052 PSUPGLOBALINFOPAGE pGip;
5053 int rc;
5054
5055 /*
5056 * Validate. We require the client to have mapped GIP (no asserting on
5057 * ring-3 preconditions).
5058 */
5059 AssertPtr(pDevExt); AssertPtr(pSession); /* paranoia^2 */
5060 if (pSession->GipMapObjR3 == NIL_RTR0MEMOBJ)
5061 return VERR_WRONG_ORDER;
5062 pGip = pDevExt->pGip;
5063 AssertReturn(pGip, VERR_INTERNAL_ERROR_3);
5064
5065 if (fOrMask & ~SUPGIP_FLAGS_VALID_MASK)
5066 return VERR_INVALID_PARAMETER;
5067 if ((fAndMask & ~SUPGIP_FLAGS_VALID_MASK) != ~SUPGIP_FLAGS_VALID_MASK)
5068 return VERR_INVALID_PARAMETER;
5069
5070 /*
5071 * Don't confuse supdrvGipSetFlags or anyone else by both setting
5072 * and clearing the same flags. AND takes precedence.
5073 */
5074 fOrMask &= fAndMask;
5075
5076 /*
5077 * Take the loader lock to avoid having to think about races between two
5078 * clients changing the flags at the same time (state is not simple).
5079 */
5080#ifdef SUPDRV_USE_MUTEX_FOR_GIP
5081 RTSemMutexRequest(pDevExt->mtxGip, RT_INDEFINITE_WAIT);
5082#else
5083 RTSemFastMutexRequest(pDevExt->mtxGip);
5084#endif
5085
5086 rc = supdrvGipSetFlags(pDevExt, pSession, fOrMask, fAndMask);
5087
5088#ifdef SUPDRV_USE_MUTEX_FOR_GIP
5089 RTSemMutexRelease(pDevExt->mtxGip);
5090#else
5091 RTSemFastMutexRelease(pDevExt->mtxGip);
5092#endif
5093 return rc;
5094}
5095
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette