VirtualBox

source: vbox/trunk/src/VBox/HostDrivers/Support/SUPDrvGip.cpp@ 86001

Last change on this file since 86001 was 82968, checked in by vboxsync, 5 years ago

Copyright year updates by scm.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 186.2 KB
Line 
1/* $Id: SUPDrvGip.cpp 82968 2020-02-04 10:35:17Z vboxsync $ */
2/** @file
3 * VBoxDrv - The VirtualBox Support Driver - Common code for GIP.
4 */
5
6/*
7 * Copyright (C) 2006-2020 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*********************************************************************************************************************************
29* Header Files *
30*********************************************************************************************************************************/
31#define LOG_GROUP LOG_GROUP_SUP_DRV
32#define SUPDRV_AGNOSTIC
33#include "SUPDrvInternal.h"
34#ifndef PAGE_SHIFT
35# include <iprt/param.h>
36#endif
37#include <iprt/asm.h>
38#include <iprt/asm-amd64-x86.h>
39#include <iprt/asm-math.h>
40#include <iprt/cpuset.h>
41#include <iprt/handletable.h>
42#include <iprt/mem.h>
43#include <iprt/mp.h>
44#include <iprt/power.h>
45#include <iprt/process.h>
46#include <iprt/semaphore.h>
47#include <iprt/spinlock.h>
48#include <iprt/thread.h>
49#include <iprt/uuid.h>
50#include <iprt/net.h>
51#include <iprt/crc.h>
52#include <iprt/string.h>
53#include <iprt/timer.h>
54#if defined(RT_OS_DARWIN) || defined(RT_OS_SOLARIS) || defined(RT_OS_FREEBSD)
55# include <iprt/rand.h>
56# include <iprt/path.h>
57#endif
58#include <iprt/uint128.h>
59#include <iprt/x86.h>
60
61#include <VBox/param.h>
62#include <VBox/log.h>
63#include <VBox/err.h>
64
65#if defined(RT_OS_SOLARIS) || defined(RT_OS_DARWIN)
66# include "dtrace/SUPDrv.h"
67#else
68/* ... */
69#endif
70
71
72/*********************************************************************************************************************************
73* Defined Constants And Macros *
74*********************************************************************************************************************************/
75/** The frequency by which we recalculate the u32UpdateHz and
76 * u32UpdateIntervalNS GIP members. The value must be a power of 2.
77 *
78 * Warning: Bumping this too high might overflow u32UpdateIntervalNS.
79 */
80#define GIP_UPDATEHZ_RECALC_FREQ 0x800
81
82/** A reserved TSC value used for synchronization as well as measurement of
83 * TSC deltas. */
84#define GIP_TSC_DELTA_RSVD UINT64_MAX
85/** The number of TSC delta measurement loops in total (includes primer and
86 * read-time loops). */
87#define GIP_TSC_DELTA_LOOPS 96
88/** The number of cache primer loops. */
89#define GIP_TSC_DELTA_PRIMER_LOOPS 4
90/** The number of loops during which we keep computing the minimum read time. */
91#define GIP_TSC_DELTA_READ_TIME_LOOPS 24
92
93/** The TSC frequency refinement period in seconds.
94 * The timer fires after 200ms, then every second, this value just says when
95 * to stop it after that. */
96#define GIP_TSC_REFINE_PERIOD_IN_SECS 12
97/** The TSC-delta threshold for the SUPGIPUSETSCDELTA_PRACTICALLY_ZERO rating */
98#define GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO 32
99/** The TSC-delta threshold for the SUPGIPUSETSCDELTA_ROUGHLY_ZERO rating */
100#define GIP_TSC_DELTA_THRESHOLD_ROUGHLY_ZERO 448
101/** The TSC delta value for the initial GIP master - 0 in regular builds.
102 * To test the delta code this can be set to a non-zero value. */
103#if 0
104# define GIP_TSC_DELTA_INITIAL_MASTER_VALUE INT64_C(170139095182512) /* 0x00009abd9854acb0 */
105#else
106# define GIP_TSC_DELTA_INITIAL_MASTER_VALUE INT64_C(0)
107#endif
108
109AssertCompile(GIP_TSC_DELTA_PRIMER_LOOPS < GIP_TSC_DELTA_READ_TIME_LOOPS);
110AssertCompile(GIP_TSC_DELTA_PRIMER_LOOPS + GIP_TSC_DELTA_READ_TIME_LOOPS < GIP_TSC_DELTA_LOOPS);
111
112/** @def VBOX_SVN_REV
113 * The makefile should define this if it can. */
114#ifndef VBOX_SVN_REV
115# define VBOX_SVN_REV 0
116#endif
117
118#if 0 /* Don't start the GIP timers. Useful when debugging the IPRT timer code. */
119# define DO_NOT_START_GIP
120#endif
121
122
123/*********************************************************************************************************************************
124* Internal Functions *
125*********************************************************************************************************************************/
126static DECLCALLBACK(void) supdrvGipSyncAndInvariantTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick);
127static DECLCALLBACK(void) supdrvGipAsyncTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick);
128static int supdrvGipSetFlags(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, uint32_t fOrMask, uint32_t fAndMask);
129static void supdrvGipInitCpu(PSUPGLOBALINFOPAGE pGip, PSUPGIPCPU pCpu, uint64_t u64NanoTS, uint64_t uCpuHz);
130static void supdrvTscResetSamples(PSUPDRVDEVEXT pDevExt, bool fClearDeltas);
131#ifdef SUPDRV_USE_TSC_DELTA_THREAD
132static int supdrvTscDeltaThreadInit(PSUPDRVDEVEXT pDevExt);
133static void supdrvTscDeltaTerm(PSUPDRVDEVEXT pDevExt);
134static void supdrvTscDeltaThreadStartMeasurement(PSUPDRVDEVEXT pDevExt, bool fForceAll);
135#else
136static int supdrvTscMeasureInitialDeltas(PSUPDRVDEVEXT pDevExt);
137static int supdrvTscMeasureDeltaOne(PSUPDRVDEVEXT pDevExt, uint32_t idxWorker);
138#endif
139
140
141/*********************************************************************************************************************************
142* Global Variables *
143*********************************************************************************************************************************/
144DECLEXPORT(PSUPGLOBALINFOPAGE) g_pSUPGlobalInfoPage = NULL;
145
146
147
148/*
149 *
150 * Misc Common GIP Code
151 * Misc Common GIP Code
152 * Misc Common GIP Code
153 *
154 *
155 */
156
157
158/**
159 * Finds the GIP CPU index corresponding to @a idCpu.
160 *
161 * @returns GIP CPU array index, UINT32_MAX if not found.
162 * @param pGip The GIP.
163 * @param idCpu The CPU ID.
164 */
165static uint32_t supdrvGipFindCpuIndexForCpuId(PSUPGLOBALINFOPAGE pGip, RTCPUID idCpu)
166{
167 uint32_t i;
168 for (i = 0; i < pGip->cCpus; i++)
169 if (pGip->aCPUs[i].idCpu == idCpu)
170 return i;
171 return UINT32_MAX;
172}
173
174
175/**
176 * Gets the APIC ID using the best available method.
177 *
178 * @returns APIC ID.
179 * @param pGip The GIP, for SUPGIPGETCPU_XXX.
180 */
181DECLINLINE(uint32_t) supdrvGipGetApicId(PSUPGLOBALINFOPAGE pGip)
182{
183 if (pGip->fGetGipCpu & SUPGIPGETCPU_APIC_ID_EXT_0B)
184 return ASMGetApicIdExt0B();
185 if (pGip->fGetGipCpu & SUPGIPGETCPU_APIC_ID_EXT_8000001E)
186 return ASMGetApicIdExt8000001E();
187 return ASMGetApicId();
188}
189
190
191/**
192 * Gets the APIC ID using the best available method, slow version.
193 */
194static uint32_t supdrvGipGetApicIdSlow(void)
195{
196 uint32_t const idApic = ASMGetApicId();
197
198 /* The Intel CPU topology leaf: */
199 uint32_t uOther = ASMCpuId_EAX(0);
200 if (uOther >= UINT32_C(0xb) && ASMIsValidStdRange(uOther))
201 {
202 uint32_t uEax = 0;
203 uint32_t uEbx = 0;
204 uint32_t uEcx = 0;
205 uint32_t uEdx = 0;
206#if defined(RT_OS_LINUX) || defined(RT_OS_FREEBSD)
207 ASMCpuId_Idx_ECX(0xb, 0, &uEax, &uEbx, &uEcx, &uEdx);
208#else
209 ASMCpuIdExSlow(0xb, 0, 0, 0, &uEax, &uEbx, &uEcx, &uEdx);
210#endif
211 if ((uEcx >> 8) != 0) /* level type != invalid */
212 {
213 if ((uEdx & 0xff) == idApic)
214 return uEdx;
215 AssertMsgFailed(("ASMGetApicIdExt0B=>%#x idApic=%#x\n", uEdx, idApic));
216 }
217 }
218
219 /* The AMD leaf: */
220 uOther = ASMCpuId_EAX(UINT32_C(0x80000000));
221 if (uOther >= UINT32_C(0x8000001e) && ASMIsValidExtRange(uOther))
222 {
223 uOther = ASMGetApicIdExt8000001E();
224 if ((uOther & 0xff) == idApic)
225 return uOther;
226 AssertMsgFailed(("ASMGetApicIdExt8000001E=>%#x idApic=%#x\n", uOther, idApic));
227 }
228 return idApic;
229}
230
231
232/*
233 *
234 * GIP Mapping and Unmapping Related Code.
235 * GIP Mapping and Unmapping Related Code.
236 * GIP Mapping and Unmapping Related Code.
237 *
238 *
239 */
240
241
242/**
243 * (Re-)initializes the per-cpu structure prior to starting or resuming the GIP
244 * updating.
245 *
246 * @param pGipCpu The per CPU structure for this CPU.
247 * @param u64NanoTS The current time.
248 */
249static void supdrvGipReInitCpu(PSUPGIPCPU pGipCpu, uint64_t u64NanoTS)
250{
251 /*
252 * Here we don't really care about applying the TSC delta. The re-initialization of this
253 * value is not relevant especially while (re)starting the GIP as the first few ones will
254 * be ignored anyway, see supdrvGipDoUpdateCpu().
255 */
256 pGipCpu->u64TSC = ASMReadTSC() - pGipCpu->u32UpdateIntervalTSC;
257 pGipCpu->u64NanoTS = u64NanoTS;
258}
259
260
261/**
262 * Set the current TSC and NanoTS value for the CPU.
263 *
264 * @param idCpu The CPU ID. Unused - we have to use the APIC ID.
265 * @param pvUser1 Pointer to the ring-0 GIP mapping.
266 * @param pvUser2 Pointer to the variable holding the current time.
267 */
268static DECLCALLBACK(void) supdrvGipReInitCpuCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
269{
270 PSUPGLOBALINFOPAGE pGip = (PSUPGLOBALINFOPAGE)pvUser1;
271 uint32_t const idApic = supdrvGipGetApicId(pGip);
272 if (idApic < RT_ELEMENTS(pGip->aiCpuFromApicId))
273 {
274 unsigned const iCpu = pGip->aiCpuFromApicId[idApic];
275
276 if (RT_LIKELY(iCpu < pGip->cCpus && pGip->aCPUs[iCpu].idCpu == idCpu))
277 supdrvGipReInitCpu(&pGip->aCPUs[iCpu], *(uint64_t *)pvUser2);
278 else
279 LogRelMax(64, ("supdrvGipReInitCpuCallback: iCpu=%#x out of bounds (%#zx, idApic=%#x)\n",
280 iCpu, RT_ELEMENTS(pGip->aiCpuFromApicId), idApic));
281 }
282 else
283 LogRelMax(64, ("supdrvGipReInitCpuCallback: idApic=%#x out of bounds (%#zx)\n",
284 idApic, RT_ELEMENTS(pGip->aiCpuFromApicId)));
285
286 NOREF(pvUser2);
287}
288
289
290/**
291 * State structure for supdrvGipDetectGetGipCpuCallback.
292 */
293typedef struct SUPDRVGIPDETECTGETCPU
294{
295 /** Bitmap of APIC IDs that has been seen (initialized to zero).
296 * Used to detect duplicate APIC IDs (paranoia). */
297 uint8_t volatile bmApicId[4096 / 8];
298 /** Mask of supported GIP CPU getter methods (SUPGIPGETCPU_XXX) (all bits set
299 * initially). The callback clears the methods not detected. */
300 uint32_t volatile fSupported;
301 /** The first callback detecting any kind of range issues (initialized to
302 * NIL_RTCPUID). */
303 RTCPUID volatile idCpuProblem;
304} SUPDRVGIPDETECTGETCPU;
305/** Pointer to state structure for supdrvGipDetectGetGipCpuCallback. */
306typedef SUPDRVGIPDETECTGETCPU *PSUPDRVGIPDETECTGETCPU;
307
308
309/**
310 * Checks for alternative ways of getting the CPU ID.
311 *
312 * This also checks the APIC ID, CPU ID and CPU set index values against the
313 * GIP tables.
314 *
315 * @param idCpu The CPU ID. Unused - we have to use the APIC ID.
316 * @param pvUser1 Pointer to the state structure.
317 * @param pvUser2 Pointer to the GIP.
318 */
319static DECLCALLBACK(void) supdrvGipDetectGetGipCpuCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
320{
321 PSUPDRVGIPDETECTGETCPU pState = (PSUPDRVGIPDETECTGETCPU)pvUser1;
322 PSUPGLOBALINFOPAGE pGip = (PSUPGLOBALINFOPAGE)pvUser2;
323 uint32_t fSupported = 0;
324 uint32_t idApic;
325 uint32_t uEax, uEbx, uEcx, uEdx;
326 int iCpuSet;
327 NOREF(pGip);
328
329 AssertMsg(idCpu == RTMpCpuId(), ("idCpu=%#x RTMpCpuId()=%#x\n", idCpu, RTMpCpuId())); /* paranoia^3 */
330
331 /*
332 * Check that the CPU ID and CPU set index are interchangable.
333 */
334 iCpuSet = RTMpCpuIdToSetIndex(idCpu);
335 if ((RTCPUID)iCpuSet == idCpu)
336 {
337 AssertCompile(RT_IS_POWER_OF_TWO(RTCPUSET_MAX_CPUS));
338 if ( iCpuSet >= 0
339 && iCpuSet < RTCPUSET_MAX_CPUS
340 && RT_IS_POWER_OF_TWO(RTCPUSET_MAX_CPUS))
341 {
342 PSUPGIPCPU pGipCpu = SUPGetGipCpuBySetIndex(pGip, iCpuSet);
343
344 /*
345 * Check whether the IDTR.LIMIT contains a CPU number.
346 */
347#ifdef RT_ARCH_X86
348 uint16_t const cbIdt = sizeof(X86DESC64SYSTEM) * 256;
349#else
350 uint16_t const cbIdt = sizeof(X86DESCGATE) * 256;
351#endif
352 RTIDTR Idtr;
353 ASMGetIDTR(&Idtr);
354 if (Idtr.cbIdt >= cbIdt)
355 {
356 uint32_t uTmp = Idtr.cbIdt - cbIdt;
357 uTmp &= RTCPUSET_MAX_CPUS - 1;
358 if (uTmp == idCpu)
359 {
360 RTIDTR Idtr2;
361 ASMGetIDTR(&Idtr2);
362 if (Idtr2.cbIdt == Idtr.cbIdt)
363 fSupported |= SUPGIPGETCPU_IDTR_LIMIT_MASK_MAX_SET_CPUS;
364 }
365 }
366
367 /*
368 * Check whether RDTSCP is an option.
369 */
370 if (ASMHasCpuId())
371 {
372 if ( ASMIsValidExtRange(ASMCpuId_EAX(UINT32_C(0x80000000)))
373 && (ASMCpuId_EDX(UINT32_C(0x80000001)) & X86_CPUID_EXT_FEATURE_EDX_RDTSCP) )
374 {
375 uint32_t uAux;
376 ASMReadTscWithAux(&uAux);
377 if ((uAux & (RTCPUSET_MAX_CPUS - 1)) == idCpu)
378 {
379 ASMNopPause();
380 ASMReadTscWithAux(&uAux);
381 if ((uAux & (RTCPUSET_MAX_CPUS - 1)) == idCpu)
382 fSupported |= SUPGIPGETCPU_RDTSCP_MASK_MAX_SET_CPUS;
383 }
384
385 if (pGipCpu)
386 {
387 uint32_t const uGroupedAux = (uint8_t)pGipCpu->iCpuGroupMember | ((uint32_t)pGipCpu->iCpuGroup << 8);
388 if ( (uAux & UINT16_MAX) == uGroupedAux
389 && pGipCpu->iCpuGroupMember <= UINT8_MAX)
390 {
391 ASMNopPause();
392 ASMReadTscWithAux(&uAux);
393 if ((uAux & UINT16_MAX) == uGroupedAux)
394 fSupported |= SUPGIPGETCPU_RDTSCP_GROUP_IN_CH_NUMBER_IN_CL;
395 }
396 }
397 }
398 }
399 }
400 }
401
402 /*
403 * Check for extended APIC ID methods.
404 */
405 idApic = UINT32_MAX;
406 uEax = ASMCpuId_EAX(0);
407 if (uEax >= UINT32_C(0xb) && ASMIsValidStdRange(uEax))
408 {
409#if defined(RT_OS_LINUX) || defined(RT_OS_FREEBSD)
410 ASMCpuId_Idx_ECX(0xb, 0, &uEax, &uEbx, &uEcx, &uEdx);
411#else
412 ASMCpuIdExSlow(0xb, 0, 0, 0, &uEax, &uEbx, &uEcx, &uEdx);
413#endif
414 if ((uEcx >> 8) != 0) /* level type != invalid */
415 {
416 if (RT_LIKELY( uEdx < RT_ELEMENTS(pGip->aiCpuFromApicId)
417 && !ASMBitTest(pState->bmApicId, uEdx)))
418 {
419 if (uEdx == ASMGetApicIdExt0B())
420 {
421 idApic = uEdx;
422 fSupported |= SUPGIPGETCPU_APIC_ID_EXT_0B;
423 }
424 else
425 AssertMsgFailed(("%#x vs %#x\n", uEdx, ASMGetApicIdExt0B()));
426 }
427 }
428 }
429
430 uEax = ASMCpuId_EAX(UINT32_C(0x80000000));
431 if (uEax >= UINT32_C(0x8000001e) && ASMIsValidExtRange(uEax))
432 {
433#if defined(RT_OS_LINUX) || defined(RT_OS_FREEBSD)
434 ASMCpuId_Idx_ECX(UINT32_C(0x8000001e), 0, &uEax, &uEbx, &uEcx, &uEdx);
435#else
436 ASMCpuIdExSlow(UINT32_C(0x8000001e), 0, 0, 0, &uEax, &uEbx, &uEcx, &uEdx);
437#endif
438 if (uEax || uEbx || uEcx || uEdx)
439 {
440 if (RT_LIKELY( uEax < RT_ELEMENTS(pGip->aiCpuFromApicId)
441 && ( idApic == UINT32_MAX
442 || idApic == uEax)
443 && !ASMBitTest(pState->bmApicId, uEax)))
444 {
445 if (uEax == ASMGetApicIdExt8000001E())
446 {
447 idApic = uEax;
448 fSupported |= SUPGIPGETCPU_APIC_ID_EXT_8000001E;
449 }
450 else
451 AssertMsgFailed(("%#x vs %#x\n", uEax, ASMGetApicIdExt8000001E()));
452 }
453 }
454 }
455
456 /*
457 * Check that the APIC ID is unique.
458 */
459 uEax = ASMGetApicId();
460 if (RT_LIKELY( uEax < RT_ELEMENTS(pGip->aiCpuFromApicId)
461 && ( idApic == UINT32_MAX
462 || idApic == uEax)
463 && !ASMAtomicBitTestAndSet(pState->bmApicId, uEax)))
464 {
465 idApic = uEax;
466 fSupported |= SUPGIPGETCPU_APIC_ID;
467 }
468 else if ( idApic == UINT32_MAX
469 || idApic >= RT_ELEMENTS(pGip->aiCpuFromApicId) /* parnaoia */
470 || ASMAtomicBitTestAndSet(pState->bmApicId, idApic))
471 {
472 AssertCompile(sizeof(pState->bmApicId) * 8 == RT_ELEMENTS(pGip->aiCpuFromApicId));
473 ASMAtomicCmpXchgU32(&pState->idCpuProblem, idCpu, NIL_RTCPUID);
474 LogRel(("supdrvGipDetectGetGipCpuCallback: idCpu=%#x iCpuSet=%d idApic=%#x/%#x - duplicate APIC ID.\n",
475 idCpu, iCpuSet, uEax, idApic));
476 }
477
478 /*
479 * Check that the iCpuSet is within the expected range.
480 */
481 if (RT_UNLIKELY( iCpuSet < 0
482 || (unsigned)iCpuSet >= RTCPUSET_MAX_CPUS
483 || (unsigned)iCpuSet >= RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)))
484 {
485 ASMAtomicCmpXchgU32(&pState->idCpuProblem, idCpu, NIL_RTCPUID);
486 LogRel(("supdrvGipDetectGetGipCpuCallback: idCpu=%#x iCpuSet=%d idApic=%#x - CPU set index is out of range.\n",
487 idCpu, iCpuSet, idApic));
488 }
489 else
490 {
491 RTCPUID idCpu2 = RTMpCpuIdFromSetIndex(iCpuSet);
492 if (RT_UNLIKELY(idCpu2 != idCpu))
493 {
494 ASMAtomicCmpXchgU32(&pState->idCpuProblem, idCpu, NIL_RTCPUID);
495 LogRel(("supdrvGipDetectGetGipCpuCallback: idCpu=%#x iCpuSet=%d idApic=%#x - CPU id/index roundtrip problem: %#x\n",
496 idCpu, iCpuSet, idApic, idCpu2));
497 }
498 }
499
500 /*
501 * Update the supported feature mask before we return.
502 */
503 ASMAtomicAndU32(&pState->fSupported, fSupported);
504
505 NOREF(pvUser2);
506}
507
508
509/**
510 * Increase the timer freqency on hosts where this is possible (NT).
511 *
512 * The idea is that more interrupts is better for us... Also, it's better than
513 * we increase the timer frequence, because we might end up getting inaccurate
514 * callbacks if someone else does it.
515 *
516 * @param pDevExt Sets u32SystemTimerGranularityGrant if increased.
517 */
518static void supdrvGipRequestHigherTimerFrequencyFromSystem(PSUPDRVDEVEXT pDevExt)
519{
520 if (pDevExt->u32SystemTimerGranularityGrant == 0)
521 {
522 uint32_t u32SystemResolution;
523 if ( RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 976563 /* 1024 HZ */, &u32SystemResolution))
524 || RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 1000000 /* 1000 HZ */, &u32SystemResolution))
525 || RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 1953125 /* 512 HZ */, &u32SystemResolution))
526 || RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 2000000 /* 500 HZ */, &u32SystemResolution))
527 )
528 {
529#if 0 /* def VBOX_STRICT - this is somehow triggers bogus assertions on windows 10 */
530 uint32_t u32After = RTTimerGetSystemGranularity();
531 AssertMsg(u32After <= u32SystemResolution, ("u32After=%u u32SystemResolution=%u\n", u32After, u32SystemResolution));
532#endif
533 pDevExt->u32SystemTimerGranularityGrant = u32SystemResolution;
534 }
535 }
536}
537
538
539/**
540 * Undoes supdrvGipRequestHigherTimerFrequencyFromSystem.
541 *
542 * @param pDevExt Clears u32SystemTimerGranularityGrant.
543 */
544static void supdrvGipReleaseHigherTimerFrequencyFromSystem(PSUPDRVDEVEXT pDevExt)
545{
546 if (pDevExt->u32SystemTimerGranularityGrant)
547 {
548 int rc2 = RTTimerReleaseSystemGranularity(pDevExt->u32SystemTimerGranularityGrant);
549 AssertRC(rc2);
550 pDevExt->u32SystemTimerGranularityGrant = 0;
551 }
552}
553
554
555/**
556 * Maps the GIP into userspace and/or get the physical address of the GIP.
557 *
558 * @returns IPRT status code.
559 * @param pSession Session to which the GIP mapping should belong.
560 * @param ppGipR3 Where to store the address of the ring-3 mapping. (optional)
561 * @param pHCPhysGip Where to store the physical address. (optional)
562 *
563 * @remark There is no reference counting on the mapping, so one call to this function
564 * count globally as one reference. One call to SUPR0GipUnmap() is will unmap GIP
565 * and remove the session as a GIP user.
566 */
567SUPR0DECL(int) SUPR0GipMap(PSUPDRVSESSION pSession, PRTR3PTR ppGipR3, PRTHCPHYS pHCPhysGip)
568{
569 int rc;
570 PSUPDRVDEVEXT pDevExt = pSession->pDevExt;
571 RTR3PTR pGipR3 = NIL_RTR3PTR;
572 RTHCPHYS HCPhys = NIL_RTHCPHYS;
573 LogFlow(("SUPR0GipMap: pSession=%p ppGipR3=%p pHCPhysGip=%p\n", pSession, ppGipR3, pHCPhysGip));
574
575 /*
576 * Validate
577 */
578 AssertReturn(SUP_IS_SESSION_VALID(pSession), VERR_INVALID_PARAMETER);
579 AssertPtrNullReturn(ppGipR3, VERR_INVALID_POINTER);
580 AssertPtrNullReturn(pHCPhysGip, VERR_INVALID_POINTER);
581
582#ifdef SUPDRV_USE_MUTEX_FOR_GIP
583 RTSemMutexRequest(pDevExt->mtxGip, RT_INDEFINITE_WAIT);
584#else
585 RTSemFastMutexRequest(pDevExt->mtxGip);
586#endif
587 if (pDevExt->pGip)
588 {
589 /*
590 * Map it?
591 */
592 rc = VINF_SUCCESS;
593 if (ppGipR3)
594 {
595 if (pSession->GipMapObjR3 == NIL_RTR0MEMOBJ)
596 rc = RTR0MemObjMapUser(&pSession->GipMapObjR3, pDevExt->GipMemObj, (RTR3PTR)-1, 0,
597 RTMEM_PROT_READ, NIL_RTR0PROCESS);
598 if (RT_SUCCESS(rc))
599 pGipR3 = RTR0MemObjAddressR3(pSession->GipMapObjR3);
600 }
601
602 /*
603 * Get physical address.
604 */
605 if (pHCPhysGip && RT_SUCCESS(rc))
606 HCPhys = pDevExt->HCPhysGip;
607
608 /*
609 * Reference globally.
610 */
611 if (!pSession->fGipReferenced && RT_SUCCESS(rc))
612 {
613 pSession->fGipReferenced = 1;
614 pDevExt->cGipUsers++;
615 if (pDevExt->cGipUsers == 1)
616 {
617 PSUPGLOBALINFOPAGE pGipR0 = pDevExt->pGip;
618 uint64_t u64NanoTS;
619
620 /*
621 * GIP starts/resumes updating again. On windows we bump the
622 * host timer frequency to make sure we don't get stuck in guest
623 * mode and to get better timer (and possibly clock) accuracy.
624 */
625 LogFlow(("SUPR0GipMap: Resumes GIP updating\n"));
626
627 supdrvGipRequestHigherTimerFrequencyFromSystem(pDevExt);
628
629 /*
630 * document me
631 */
632 if (pGipR0->aCPUs[0].u32TransactionId != 2 /* not the first time */)
633 {
634 unsigned i;
635 for (i = 0; i < pGipR0->cCpus; i++)
636 ASMAtomicUoWriteU32(&pGipR0->aCPUs[i].u32TransactionId,
637 (pGipR0->aCPUs[i].u32TransactionId + GIP_UPDATEHZ_RECALC_FREQ * 2)
638 & ~(GIP_UPDATEHZ_RECALC_FREQ * 2 - 1));
639 ASMAtomicWriteU64(&pGipR0->u64NanoTSLastUpdateHz, 0);
640 }
641
642 /*
643 * document me
644 */
645 u64NanoTS = RTTimeSystemNanoTS() - pGipR0->u32UpdateIntervalNS;
646 if ( pGipR0->u32Mode == SUPGIPMODE_INVARIANT_TSC
647 || pGipR0->u32Mode == SUPGIPMODE_SYNC_TSC
648 || RTMpGetOnlineCount() == 1)
649 supdrvGipReInitCpu(&pGipR0->aCPUs[0], u64NanoTS);
650 else
651 RTMpOnAll(supdrvGipReInitCpuCallback, pGipR0, &u64NanoTS);
652
653 /*
654 * Detect alternative ways to figure the CPU ID in ring-3 and
655 * raw-mode context. Check the sanity of the APIC IDs, CPU IDs,
656 * and CPU set indexes while we're at it.
657 */
658 if (RT_SUCCESS(rc))
659 {
660 PSUPDRVGIPDETECTGETCPU pDetectState = (PSUPDRVGIPDETECTGETCPU)RTMemTmpAllocZ(sizeof(*pDetectState));
661 if (pDetectState)
662 {
663 pDetectState->fSupported = UINT32_MAX;
664 pDetectState->idCpuProblem = NIL_RTCPUID;
665 rc = RTMpOnAll(supdrvGipDetectGetGipCpuCallback, pDetectState, pGipR0);
666 if (pDetectState->idCpuProblem == NIL_RTCPUID)
667 {
668 if ( pDetectState->fSupported != UINT32_MAX
669 && pDetectState->fSupported != 0)
670 {
671 if (pGipR0->fGetGipCpu != pDetectState->fSupported)
672 {
673 pGipR0->fGetGipCpu = pDetectState->fSupported;
674 LogRel(("SUPR0GipMap: fGetGipCpu=%#x\n", pDetectState->fSupported));
675 }
676 }
677 else
678 {
679 LogRel(("SUPR0GipMap: No supported ways of getting the APIC ID or CPU number in ring-3! (%#x)\n",
680 pDetectState->fSupported));
681 rc = VERR_UNSUPPORTED_CPU;
682 }
683 }
684 else
685 {
686 LogRel(("SUPR0GipMap: APIC ID, CPU ID or CPU set index problem detected on CPU #%u (%#x)!\n",
687 pDetectState->idCpuProblem, pDetectState->idCpuProblem));
688 rc = VERR_INVALID_CPU_ID;
689 }
690 RTMemTmpFree(pDetectState);
691 }
692 else
693 rc = VERR_NO_TMP_MEMORY;
694 }
695
696 /*
697 * Start the GIP timer if all is well..
698 */
699 if (RT_SUCCESS(rc))
700 {
701#ifndef DO_NOT_START_GIP
702 rc = RTTimerStart(pDevExt->pGipTimer, 0 /* fire ASAP */); AssertRC(rc);
703#endif
704 rc = VINF_SUCCESS;
705 }
706
707 /*
708 * Bail out on error.
709 */
710 if (RT_FAILURE(rc))
711 {
712 LogRel(("SUPR0GipMap: failed rc=%Rrc\n", rc));
713 pDevExt->cGipUsers = 0;
714 pSession->fGipReferenced = 0;
715 if (pSession->GipMapObjR3 != NIL_RTR0MEMOBJ)
716 {
717 int rc2 = RTR0MemObjFree(pSession->GipMapObjR3, false); AssertRC(rc2);
718 if (RT_SUCCESS(rc2))
719 pSession->GipMapObjR3 = NIL_RTR0MEMOBJ;
720 }
721 HCPhys = NIL_RTHCPHYS;
722 pGipR3 = NIL_RTR3PTR;
723 }
724 }
725 }
726 }
727 else
728 {
729 rc = VERR_GENERAL_FAILURE;
730 Log(("SUPR0GipMap: GIP is not available!\n"));
731 }
732#ifdef SUPDRV_USE_MUTEX_FOR_GIP
733 RTSemMutexRelease(pDevExt->mtxGip);
734#else
735 RTSemFastMutexRelease(pDevExt->mtxGip);
736#endif
737
738 /*
739 * Write returns.
740 */
741 if (pHCPhysGip)
742 *pHCPhysGip = HCPhys;
743 if (ppGipR3)
744 *ppGipR3 = pGipR3;
745
746#ifdef DEBUG_DARWIN_GIP
747 OSDBGPRINT(("SUPR0GipMap: returns %d *pHCPhysGip=%lx pGipR3=%p\n", rc, (unsigned long)HCPhys, (void *)pGipR3));
748#else
749 LogFlow(( "SUPR0GipMap: returns %d *pHCPhysGip=%lx pGipR3=%p\n", rc, (unsigned long)HCPhys, (void *)pGipR3));
750#endif
751 return rc;
752}
753
754
755/**
756 * Unmaps any user mapping of the GIP and terminates all GIP access
757 * from this session.
758 *
759 * @returns IPRT status code.
760 * @param pSession Session to which the GIP mapping should belong.
761 */
762SUPR0DECL(int) SUPR0GipUnmap(PSUPDRVSESSION pSession)
763{
764 int rc = VINF_SUCCESS;
765 PSUPDRVDEVEXT pDevExt = pSession->pDevExt;
766#ifdef DEBUG_DARWIN_GIP
767 OSDBGPRINT(("SUPR0GipUnmap: pSession=%p pGip=%p GipMapObjR3=%p\n",
768 pSession,
769 pSession->GipMapObjR3 != NIL_RTR0MEMOBJ ? RTR0MemObjAddress(pSession->GipMapObjR3) : NULL,
770 pSession->GipMapObjR3));
771#else
772 LogFlow(("SUPR0GipUnmap: pSession=%p\n", pSession));
773#endif
774 AssertReturn(SUP_IS_SESSION_VALID(pSession), VERR_INVALID_PARAMETER);
775
776#ifdef SUPDRV_USE_MUTEX_FOR_GIP
777 RTSemMutexRequest(pDevExt->mtxGip, RT_INDEFINITE_WAIT);
778#else
779 RTSemFastMutexRequest(pDevExt->mtxGip);
780#endif
781
782 /*
783 * GIP test-mode session?
784 */
785 if ( pSession->fGipTestMode
786 && pDevExt->pGip)
787 {
788 supdrvGipSetFlags(pDevExt, pSession, 0, ~SUPGIP_FLAGS_TESTING_ENABLE);
789 Assert(!pSession->fGipTestMode);
790 }
791
792 /*
793 * Unmap anything?
794 */
795 if (pSession->GipMapObjR3 != NIL_RTR0MEMOBJ)
796 {
797 rc = RTR0MemObjFree(pSession->GipMapObjR3, false);
798 AssertRC(rc);
799 if (RT_SUCCESS(rc))
800 pSession->GipMapObjR3 = NIL_RTR0MEMOBJ;
801 }
802
803 /*
804 * Dereference global GIP.
805 */
806 if (pSession->fGipReferenced && !rc)
807 {
808 pSession->fGipReferenced = 0;
809 if ( pDevExt->cGipUsers > 0
810 && !--pDevExt->cGipUsers)
811 {
812 LogFlow(("SUPR0GipUnmap: Suspends GIP updating\n"));
813#ifndef DO_NOT_START_GIP
814 rc = RTTimerStop(pDevExt->pGipTimer); AssertRC(rc); rc = VINF_SUCCESS;
815#endif
816 supdrvGipReleaseHigherTimerFrequencyFromSystem(pDevExt);
817 }
818 }
819
820#ifdef SUPDRV_USE_MUTEX_FOR_GIP
821 RTSemMutexRelease(pDevExt->mtxGip);
822#else
823 RTSemFastMutexRelease(pDevExt->mtxGip);
824#endif
825
826 return rc;
827}
828
829
830/**
831 * Gets the GIP pointer.
832 *
833 * @returns Pointer to the GIP or NULL.
834 */
835SUPDECL(PSUPGLOBALINFOPAGE) SUPGetGIP(void)
836{
837 return g_pSUPGlobalInfoPage;
838}
839
840
841
842
843
844/*
845 *
846 *
847 * GIP Initialization, Termination and CPU Offline / Online Related Code.
848 * GIP Initialization, Termination and CPU Offline / Online Related Code.
849 * GIP Initialization, Termination and CPU Offline / Online Related Code.
850 *
851 *
852 */
853
854/**
855 * Used by supdrvGipInitRefineInvariantTscFreqTimer and supdrvGipInitMeasureTscFreq
856 * to update the TSC frequency related GIP variables.
857 *
858 * @param pGip The GIP.
859 * @param nsElapsed The number of nanoseconds elapsed.
860 * @param cElapsedTscTicks The corresponding number of TSC ticks.
861 * @param iTick The tick number for debugging.
862 */
863static void supdrvGipInitSetCpuFreq(PSUPGLOBALINFOPAGE pGip, uint64_t nsElapsed, uint64_t cElapsedTscTicks, uint32_t iTick)
864{
865 /*
866 * Calculate the frequency.
867 */
868 uint64_t uCpuHz;
869 if ( cElapsedTscTicks < UINT64_MAX / RT_NS_1SEC
870 && nsElapsed < UINT32_MAX)
871 uCpuHz = ASMMultU64ByU32DivByU32(cElapsedTscTicks, RT_NS_1SEC, (uint32_t)nsElapsed);
872 else
873 {
874 RTUINT128U CpuHz, Tmp, Divisor;
875 CpuHz.s.Lo = CpuHz.s.Hi = 0;
876 RTUInt128MulU64ByU64(&Tmp, cElapsedTscTicks, RT_NS_1SEC_64);
877 RTUInt128Div(&CpuHz, &Tmp, RTUInt128AssignU64(&Divisor, nsElapsed));
878 uCpuHz = CpuHz.s.Lo;
879 }
880
881 /*
882 * Update the GIP.
883 */
884 ASMAtomicWriteU64(&pGip->u64CpuHz, uCpuHz);
885 if (pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
886 {
887 ASMAtomicWriteU64(&pGip->aCPUs[0].u64CpuHz, uCpuHz);
888
889 /* For inspecting the frequency calcs using tstGIP-2, debugger or similar. */
890 if (iTick + 1 < pGip->cCpus)
891 ASMAtomicWriteU64(&pGip->aCPUs[iTick + 1].u64CpuHz, uCpuHz);
892 }
893}
894
895
896/**
897 * Timer callback function for TSC frequency refinement in invariant GIP mode.
898 *
899 * This is started during driver init and fires once
900 * GIP_TSC_REFINE_PERIOD_IN_SECS seconds later.
901 *
902 * @param pTimer The timer.
903 * @param pvUser Opaque pointer to the device instance data.
904 * @param iTick The timer tick.
905 */
906static DECLCALLBACK(void) supdrvGipInitRefineInvariantTscFreqTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
907{
908 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
909 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
910 RTCPUID idCpu;
911 uint64_t cNsElapsed;
912 uint64_t cTscTicksElapsed;
913 uint64_t nsNow;
914 uint64_t uTsc;
915 RTCCUINTREG fEFlags;
916
917 /* Paranoia. */
918 AssertReturnVoid(pGip);
919 AssertReturnVoid(pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC);
920
921 /*
922 * If we got a power event, stop the refinement process.
923 */
924 if (pDevExt->fInvTscRefinePowerEvent)
925 {
926 int rc = RTTimerStop(pTimer); AssertRC(rc);
927 return;
928 }
929
930 /*
931 * Read the TSC and time, noting which CPU we are on.
932 *
933 * Don't bother spinning until RTTimeSystemNanoTS changes, since on
934 * systems where it matters we're in a context where we cannot waste that
935 * much time (DPC watchdog, called from clock interrupt).
936 */
937 fEFlags = ASMIntDisableFlags();
938 uTsc = ASMReadTSC();
939 nsNow = RTTimeSystemNanoTS();
940 idCpu = RTMpCpuId();
941 ASMSetFlags(fEFlags);
942
943 cNsElapsed = nsNow - pDevExt->nsStartInvarTscRefine;
944 cTscTicksElapsed = uTsc - pDevExt->uTscStartInvarTscRefine;
945
946 /*
947 * If the above measurement was taken on a different CPU than the one we
948 * started the process on, cTscTicksElapsed will need to be adjusted with
949 * the TSC deltas of both the CPUs.
950 *
951 * We ASSUME that the delta calculation process takes less time than the
952 * TSC frequency refinement timer. If it doesn't, we'll complain and
953 * drop the frequency refinement.
954 *
955 * Note! We cannot entirely trust enmUseTscDelta here because it's
956 * downgraded after each delta calculation.
957 */
958 if ( idCpu != pDevExt->idCpuInvarTscRefine
959 && pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
960 {
961 uint32_t iStartCpuSet = RTMpCpuIdToSetIndex(pDevExt->idCpuInvarTscRefine);
962 uint32_t iStopCpuSet = RTMpCpuIdToSetIndex(idCpu);
963 uint16_t iStartGipCpu = iStartCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
964 ? pGip->aiCpuFromCpuSetIdx[iStartCpuSet] : UINT16_MAX;
965 uint16_t iStopGipCpu = iStopCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
966 ? pGip->aiCpuFromCpuSetIdx[iStopCpuSet] : UINT16_MAX;
967 int64_t iStartTscDelta = iStartGipCpu < pGip->cCpus ? pGip->aCPUs[iStartGipCpu].i64TSCDelta : INT64_MAX;
968 int64_t iStopTscDelta = iStopGipCpu < pGip->cCpus ? pGip->aCPUs[iStopGipCpu].i64TSCDelta : INT64_MAX;
969 if (RT_LIKELY(iStartTscDelta != INT64_MAX && iStopTscDelta != INT64_MAX))
970 {
971 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_PRACTICALLY_ZERO)
972 {
973 /* cTscTicksElapsed = (uTsc - iStopTscDelta) - (pDevExt->uTscStartInvarTscRefine - iStartTscDelta); */
974 cTscTicksElapsed += iStartTscDelta - iStopTscDelta;
975 }
976 }
977 /*
978 * Allow 5 times the refinement period to elapse before we give up on the TSC delta
979 * calculations.
980 */
981 else if (cNsElapsed > GIP_TSC_REFINE_PERIOD_IN_SECS * 5 * RT_NS_1SEC_64)
982 {
983 SUPR0Printf("vboxdrv: Failed to refine invariant TSC frequency because deltas are unavailable after %u (%u) seconds\n",
984 (uint32_t)(cNsElapsed / RT_NS_1SEC), GIP_TSC_REFINE_PERIOD_IN_SECS);
985 SUPR0Printf("vboxdrv: start: %u, %u, %#llx stop: %u, %u, %#llx\n",
986 iStartCpuSet, iStartGipCpu, iStartTscDelta, iStopCpuSet, iStopGipCpu, iStopTscDelta);
987 int rc = RTTimerStop(pTimer); AssertRC(rc);
988 return;
989 }
990 }
991
992 /*
993 * Calculate and update the CPU frequency variables in GIP.
994 *
995 * If there is a GIP user already and we've already refined the frequency
996 * a couple of times, don't update it as we want a stable frequency value
997 * for all VMs.
998 */
999 if ( pDevExt->cGipUsers == 0
1000 || cNsElapsed < RT_NS_1SEC * 2)
1001 {
1002 supdrvGipInitSetCpuFreq(pGip, cNsElapsed, cTscTicksElapsed, (uint32_t)iTick);
1003
1004 /*
1005 * Stop the timer once we've reached the defined refinement period.
1006 */
1007 if (cNsElapsed > GIP_TSC_REFINE_PERIOD_IN_SECS * RT_NS_1SEC_64)
1008 {
1009 int rc = RTTimerStop(pTimer);
1010 AssertRC(rc);
1011 }
1012 }
1013 else
1014 {
1015 int rc = RTTimerStop(pTimer);
1016 AssertRC(rc);
1017 }
1018}
1019
1020
1021/**
1022 * @callback_method_impl{FNRTPOWERNOTIFICATION}
1023 */
1024static DECLCALLBACK(void) supdrvGipPowerNotificationCallback(RTPOWEREVENT enmEvent, void *pvUser)
1025{
1026 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
1027 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
1028
1029 /*
1030 * If the TSC frequency refinement timer is running, we need to cancel it so it
1031 * doesn't screw up the frequency after a long suspend.
1032 *
1033 * Recalculate all TSC-deltas on host resume as it may have changed, seen
1034 * on Windows 7 running on the Dell Optiplex Intel Core i5-3570.
1035 */
1036 if (enmEvent == RTPOWEREVENT_RESUME)
1037 {
1038 ASMAtomicWriteBool(&pDevExt->fInvTscRefinePowerEvent, true);
1039 if ( RT_LIKELY(pGip)
1040 && pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED
1041 && !supdrvOSAreCpusOfflinedOnSuspend())
1042 {
1043#ifdef SUPDRV_USE_TSC_DELTA_THREAD
1044 supdrvTscDeltaThreadStartMeasurement(pDevExt, true /* fForceAll */);
1045#else
1046 RTCpuSetCopy(&pDevExt->TscDeltaCpuSet, &pGip->OnlineCpuSet);
1047 supdrvTscMeasureInitialDeltas(pDevExt);
1048#endif
1049 }
1050 }
1051 else if (enmEvent == RTPOWEREVENT_SUSPEND)
1052 ASMAtomicWriteBool(&pDevExt->fInvTscRefinePowerEvent, true);
1053}
1054
1055
1056/**
1057 * Start the TSC-frequency refinment timer for the invariant TSC GIP mode.
1058 *
1059 * We cannot use this in the synchronous and asynchronous tsc GIP modes because
1060 * the CPU may change the TSC frequence between now and when the timer fires
1061 * (supdrvInitAsyncRefineTscTimer).
1062 *
1063 * @param pDevExt Pointer to the device instance data.
1064 */
1065static void supdrvGipInitStartTimerForRefiningInvariantTscFreq(PSUPDRVDEVEXT pDevExt)
1066{
1067 uint64_t u64NanoTS;
1068 RTCCUINTREG fEFlags;
1069 int rc;
1070
1071 /*
1072 * Register a power management callback.
1073 */
1074 pDevExt->fInvTscRefinePowerEvent = false;
1075 rc = RTPowerNotificationRegister(supdrvGipPowerNotificationCallback, pDevExt);
1076 AssertRC(rc); /* ignore */
1077
1078 /*
1079 * Record the TSC and NanoTS as the starting anchor point for refinement
1080 * of the TSC. We try get as close to a clock tick as possible on systems
1081 * which does not provide high resolution time.
1082 */
1083 u64NanoTS = RTTimeSystemNanoTS();
1084 while (RTTimeSystemNanoTS() == u64NanoTS)
1085 ASMNopPause();
1086
1087 fEFlags = ASMIntDisableFlags();
1088 pDevExt->uTscStartInvarTscRefine = ASMReadTSC();
1089 pDevExt->nsStartInvarTscRefine = RTTimeSystemNanoTS();
1090 pDevExt->idCpuInvarTscRefine = RTMpCpuId();
1091 ASMSetFlags(fEFlags);
1092
1093 /*
1094 * Create a timer that runs on the same CPU so we won't have a depencency
1095 * on the TSC-delta and can run in parallel to it. On systems that does not
1096 * implement CPU specific timers we'll apply deltas in the timer callback,
1097 * just like we do for CPUs going offline.
1098 *
1099 * The longer the refinement interval the better the accuracy, at least in
1100 * theory. If it's too long though, ring-3 may already be starting its
1101 * first VMs before we're done. On most systems we will be loading the
1102 * support driver during boot and VMs won't be started for a while yet,
1103 * it is really only a problem during development (especially with
1104 * on-demand driver starting on windows).
1105 *
1106 * To avoid wasting time doing a long supdrvGipInitMeasureTscFreq() call
1107 * to calculate the frequency during driver loading, the timer is set
1108 * to fire after 200 ms the first time. It will then reschedule itself
1109 * to fire every second until GIP_TSC_REFINE_PERIOD_IN_SECS has been
1110 * reached or it notices that there is a user land client with GIP
1111 * mapped (we want a stable frequency for all VMs).
1112 */
1113 rc = RTTimerCreateEx(&pDevExt->pInvarTscRefineTimer, RT_NS_1SEC,
1114 RTTIMER_FLAGS_CPU(RTMpCpuIdToSetIndex(pDevExt->idCpuInvarTscRefine)),
1115 supdrvGipInitRefineInvariantTscFreqTimer, pDevExt);
1116 if (RT_SUCCESS(rc))
1117 {
1118 rc = RTTimerStart(pDevExt->pInvarTscRefineTimer, 2*RT_NS_100MS);
1119 if (RT_SUCCESS(rc))
1120 return;
1121 RTTimerDestroy(pDevExt->pInvarTscRefineTimer);
1122 }
1123
1124 if (rc == VERR_CPU_OFFLINE || rc == VERR_NOT_SUPPORTED)
1125 {
1126 rc = RTTimerCreateEx(&pDevExt->pInvarTscRefineTimer, RT_NS_1SEC, RTTIMER_FLAGS_CPU_ANY,
1127 supdrvGipInitRefineInvariantTscFreqTimer, pDevExt);
1128 if (RT_SUCCESS(rc))
1129 {
1130 rc = RTTimerStart(pDevExt->pInvarTscRefineTimer, 2*RT_NS_100MS);
1131 if (RT_SUCCESS(rc))
1132 return;
1133 RTTimerDestroy(pDevExt->pInvarTscRefineTimer);
1134 }
1135 }
1136
1137 pDevExt->pInvarTscRefineTimer = NULL;
1138 OSDBGPRINT(("vboxdrv: Failed to create or start TSC frequency refinement timer: rc=%Rrc\n", rc));
1139}
1140
1141
1142/**
1143 * @callback_method_impl{PFNRTMPWORKER,
1144 * RTMpOnSpecific callback for reading TSC and time on the CPU we started
1145 * the measurements on.}
1146 */
1147static DECLCALLBACK(void) supdrvGipInitReadTscAndNanoTsOnCpu(RTCPUID idCpu, void *pvUser1, void *pvUser2)
1148{
1149 RTCCUINTREG fEFlags = ASMIntDisableFlags();
1150 uint64_t *puTscStop = (uint64_t *)pvUser1;
1151 uint64_t *pnsStop = (uint64_t *)pvUser2;
1152 RT_NOREF1(idCpu);
1153
1154 *puTscStop = ASMReadTSC();
1155 *pnsStop = RTTimeSystemNanoTS();
1156
1157 ASMSetFlags(fEFlags);
1158}
1159
1160
1161/**
1162 * Measures the TSC frequency of the system.
1163 *
1164 * The TSC frequency can vary on systems which are not reported as invariant.
1165 * On such systems the object of this function is to find out what the nominal,
1166 * maximum TSC frequency under 'normal' CPU operation.
1167 *
1168 * @returns VBox status code.
1169 * @param pGip Pointer to the GIP.
1170 * @param fRough Set if we're doing the rough calculation that the
1171 * TSC measuring code needs, where accuracy isn't all
1172 * that important (too high is better than too low).
1173 * When clear we try for best accuracy that we can
1174 * achieve in reasonably short time.
1175 */
1176static int supdrvGipInitMeasureTscFreq(PSUPGLOBALINFOPAGE pGip, bool fRough)
1177{
1178 uint32_t nsTimerIncr = RTTimerGetSystemGranularity();
1179 int cTriesLeft = fRough ? 4 : 2;
1180 while (cTriesLeft-- > 0)
1181 {
1182 RTCCUINTREG fEFlags;
1183 uint64_t nsStart;
1184 uint64_t nsStop;
1185 uint64_t uTscStart;
1186 uint64_t uTscStop;
1187 RTCPUID idCpuStart;
1188 RTCPUID idCpuStop;
1189
1190 /*
1191 * Synchronize with the host OS clock tick on systems without high
1192 * resolution time API (older Windows version for example).
1193 */
1194 nsStart = RTTimeSystemNanoTS();
1195 while (RTTimeSystemNanoTS() == nsStart)
1196 ASMNopPause();
1197
1198 /*
1199 * Read the TSC and current time, noting which CPU we're on.
1200 */
1201 fEFlags = ASMIntDisableFlags();
1202 uTscStart = ASMReadTSC();
1203 nsStart = RTTimeSystemNanoTS();
1204 idCpuStart = RTMpCpuId();
1205 ASMSetFlags(fEFlags);
1206
1207 /*
1208 * Delay for a while.
1209 */
1210 if (pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC)
1211 {
1212 /*
1213 * Sleep-wait since the TSC frequency is constant, it eases host load.
1214 * Shorter interval produces more variance in the frequency (esp. Windows).
1215 */
1216 uint64_t msElapsed = 0;
1217 uint64_t msDelay = ( ((fRough ? 16 : 200) * RT_NS_1MS + nsTimerIncr - 1) / nsTimerIncr * nsTimerIncr - RT_NS_100US )
1218 / RT_NS_1MS;
1219 do
1220 {
1221 RTThreadSleep((RTMSINTERVAL)(msDelay - msElapsed));
1222 nsStop = RTTimeSystemNanoTS();
1223 msElapsed = (nsStop - nsStart) / RT_NS_1MS;
1224 } while (msElapsed < msDelay);
1225
1226 while (RTTimeSystemNanoTS() == nsStop)
1227 ASMNopPause();
1228 }
1229 else
1230 {
1231 /*
1232 * Busy-wait keeping the frequency up.
1233 */
1234 do
1235 {
1236 ASMNopPause();
1237 nsStop = RTTimeSystemNanoTS();
1238 } while (nsStop - nsStart < RT_NS_100MS);
1239 }
1240
1241 /*
1242 * Read the TSC and time again.
1243 */
1244 fEFlags = ASMIntDisableFlags();
1245 uTscStop = ASMReadTSC();
1246 nsStop = RTTimeSystemNanoTS();
1247 idCpuStop = RTMpCpuId();
1248 ASMSetFlags(fEFlags);
1249
1250 /*
1251 * If the CPU changes, things get a bit complicated and what we
1252 * can get away with depends on the GIP mode / TSC reliability.
1253 */
1254 if (idCpuStop != idCpuStart)
1255 {
1256 bool fDoXCall = false;
1257
1258 /*
1259 * Synchronous TSC mode: we're probably fine as it's unlikely
1260 * that we were rescheduled because of TSC throttling or power
1261 * management reasons, so just go ahead.
1262 */
1263 if (pGip->u32Mode == SUPGIPMODE_SYNC_TSC)
1264 {
1265 /* Probably ok, maybe we should retry once?. */
1266 Assert(pGip->enmUseTscDelta == SUPGIPUSETSCDELTA_NOT_APPLICABLE);
1267 }
1268 /*
1269 * If we're just doing the rough measurement, do the cross call and
1270 * get on with things (we don't have deltas!).
1271 */
1272 else if (fRough)
1273 fDoXCall = true;
1274 /*
1275 * Invariant TSC mode: It doesn't matter if we have delta available
1276 * for both CPUs. That is not something we can assume at this point.
1277 *
1278 * Note! We cannot necessarily trust enmUseTscDelta here because it's
1279 * downgraded after each delta calculation and the delta
1280 * calculations may not be complete yet.
1281 */
1282 else if (pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC)
1283 {
1284/** @todo This section of code is never reached atm, consider dropping it later on... */
1285 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
1286 {
1287 uint32_t iStartCpuSet = RTMpCpuIdToSetIndex(idCpuStart);
1288 uint32_t iStopCpuSet = RTMpCpuIdToSetIndex(idCpuStop);
1289 uint16_t iStartGipCpu = iStartCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
1290 ? pGip->aiCpuFromCpuSetIdx[iStartCpuSet] : UINT16_MAX;
1291 uint16_t iStopGipCpu = iStopCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
1292 ? pGip->aiCpuFromCpuSetIdx[iStopCpuSet] : UINT16_MAX;
1293 int64_t iStartTscDelta = iStartGipCpu < pGip->cCpus ? pGip->aCPUs[iStartGipCpu].i64TSCDelta : INT64_MAX;
1294 int64_t iStopTscDelta = iStopGipCpu < pGip->cCpus ? pGip->aCPUs[iStopGipCpu].i64TSCDelta : INT64_MAX;
1295 if (RT_LIKELY(iStartTscDelta != INT64_MAX && iStopTscDelta != INT64_MAX))
1296 {
1297 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_PRACTICALLY_ZERO)
1298 {
1299 uTscStart -= iStartTscDelta;
1300 uTscStop -= iStopTscDelta;
1301 }
1302 }
1303 /*
1304 * Invalid CPU indexes are not caused by online/offline races, so
1305 * we have to trigger driver load failure if that happens as GIP
1306 * and IPRT assumptions are busted on this system.
1307 */
1308 else if (iStopGipCpu >= pGip->cCpus || iStartGipCpu >= pGip->cCpus)
1309 {
1310 SUPR0Printf("vboxdrv: Unexpected CPU index in supdrvGipInitMeasureTscFreq.\n");
1311 SUPR0Printf("vboxdrv: start: %u, %u, %#llx stop: %u, %u, %#llx\n",
1312 iStartCpuSet, iStartGipCpu, iStartTscDelta, iStopCpuSet, iStopGipCpu, iStopTscDelta);
1313 return VERR_INVALID_CPU_INDEX;
1314 }
1315 /*
1316 * No valid deltas. We retry, if we're on our last retry
1317 * we do the cross call instead just to get a result. The
1318 * frequency will be refined in a few seconds anyway.
1319 */
1320 else if (cTriesLeft > 0)
1321 continue;
1322 else
1323 fDoXCall = true;
1324 }
1325 }
1326 /*
1327 * Asynchronous TSC mode: This is bad, as the reason we usually
1328 * use this mode is to deal with variable TSC frequencies and
1329 * deltas. So, we need to get the TSC from the same CPU as
1330 * started it, we also need to keep that CPU busy. So, retry
1331 * and fall back to the cross call on the last attempt.
1332 */
1333 else
1334 {
1335 Assert(pGip->u32Mode == SUPGIPMODE_ASYNC_TSC);
1336 if (cTriesLeft > 0)
1337 continue;
1338 fDoXCall = true;
1339 }
1340
1341 if (fDoXCall)
1342 {
1343 /*
1344 * Try read the TSC and timestamp on the start CPU.
1345 */
1346 int rc = RTMpOnSpecific(idCpuStart, supdrvGipInitReadTscAndNanoTsOnCpu, &uTscStop, &nsStop);
1347 if (RT_FAILURE(rc) && (!fRough || cTriesLeft > 0))
1348 continue;
1349 }
1350 }
1351
1352 /*
1353 * Calculate the TSC frequency and update it (shared with the refinement timer).
1354 */
1355 supdrvGipInitSetCpuFreq(pGip, nsStop - nsStart, uTscStop - uTscStart, 0);
1356 return VINF_SUCCESS;
1357 }
1358
1359 Assert(!fRough);
1360 return VERR_SUPDRV_TSC_FREQ_MEASUREMENT_FAILED;
1361}
1362
1363
1364/**
1365 * Finds our (@a idCpu) entry, or allocates a new one if not found.
1366 *
1367 * @returns Index of the CPU in the cache set.
1368 * @param pGip The GIP.
1369 * @param idCpu The CPU ID.
1370 */
1371static uint32_t supdrvGipFindOrAllocCpuIndexForCpuId(PSUPGLOBALINFOPAGE pGip, RTCPUID idCpu)
1372{
1373 uint32_t i, cTries;
1374
1375 /*
1376 * ASSUMES that CPU IDs are constant.
1377 */
1378 for (i = 0; i < pGip->cCpus; i++)
1379 if (pGip->aCPUs[i].idCpu == idCpu)
1380 return i;
1381
1382 cTries = 0;
1383 do
1384 {
1385 for (i = 0; i < pGip->cCpus; i++)
1386 {
1387 bool fRc;
1388 ASMAtomicCmpXchgSize(&pGip->aCPUs[i].idCpu, idCpu, NIL_RTCPUID, fRc);
1389 if (fRc)
1390 return i;
1391 }
1392 } while (cTries++ < 32);
1393 AssertReleaseFailed();
1394 return i - 1;
1395}
1396
1397
1398/**
1399 * The calling CPU should be accounted as online, update GIP accordingly.
1400 *
1401 * This is used by supdrvGipCreate() as well as supdrvGipMpEvent().
1402 *
1403 * @param pDevExt The device extension.
1404 * @param idCpu The CPU ID.
1405 */
1406static void supdrvGipMpEventOnlineOrInitOnCpu(PSUPDRVDEVEXT pDevExt, RTCPUID idCpu)
1407{
1408 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
1409 int iCpuSet = 0;
1410 uint32_t idApic;
1411 uint32_t i = 0;
1412 uint64_t u64NanoTS = 0;
1413
1414 AssertPtrReturnVoid(pGip);
1415 Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD));
1416 AssertRelease(idCpu == RTMpCpuId());
1417 Assert(pGip->cPossibleCpus == RTMpGetCount());
1418
1419 /*
1420 * Do this behind a spinlock with interrupts disabled as this can fire
1421 * on all CPUs simultaneously, see @bugref{6110}.
1422 */
1423 RTSpinlockAcquire(pDevExt->hGipSpinlock);
1424
1425 /*
1426 * Update the globals.
1427 */
1428 ASMAtomicWriteU16(&pGip->cPresentCpus, RTMpGetPresentCount());
1429 ASMAtomicWriteU16(&pGip->cOnlineCpus, RTMpGetOnlineCount());
1430 iCpuSet = RTMpCpuIdToSetIndex(idCpu);
1431 if (iCpuSet >= 0)
1432 {
1433 Assert(RTCpuSetIsMemberByIndex(&pGip->PossibleCpuSet, iCpuSet));
1434 RTCpuSetAddByIndex(&pGip->OnlineCpuSet, iCpuSet);
1435 RTCpuSetAddByIndex(&pGip->PresentCpuSet, iCpuSet);
1436 }
1437
1438 /*
1439 * Update the entry.
1440 */
1441 u64NanoTS = RTTimeSystemNanoTS() - pGip->u32UpdateIntervalNS;
1442 i = supdrvGipFindOrAllocCpuIndexForCpuId(pGip, idCpu);
1443
1444 supdrvGipInitCpu(pGip, &pGip->aCPUs[i], u64NanoTS, pGip->u64CpuHz);
1445
1446 idApic = supdrvGipGetApicIdSlow();
1447 ASMAtomicWriteU16(&pGip->aCPUs[i].idApic, idApic);
1448 ASMAtomicWriteS16(&pGip->aCPUs[i].iCpuSet, (int16_t)iCpuSet);
1449 ASMAtomicWriteSize(&pGip->aCPUs[i].idCpu, idCpu);
1450
1451 pGip->aCPUs[i].iCpuGroup = 0;
1452 pGip->aCPUs[i].iCpuGroupMember = iCpuSet;
1453#ifdef RT_OS_WINDOWS
1454 supdrvOSGipInitGroupBitsForCpu(pDevExt, pGip, &pGip->aCPUs[i]);
1455#endif
1456
1457 /*
1458 * Update the APIC ID and CPU set index mappings.
1459 */
1460 if (idApic < RT_ELEMENTS(pGip->aiCpuFromApicId))
1461 ASMAtomicWriteU16(&pGip->aiCpuFromApicId[idApic], i);
1462 else
1463 LogRelMax(64, ("supdrvGipMpEventOnlineOrInitOnCpu: idApic=%#x is out of bounds (%#zx, i=%u, iCpuSet=%d)\n",
1464 idApic, RT_ELEMENTS(pGip->aiCpuFromApicId), i, iCpuSet));
1465 if ((unsigned)iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx))
1466 ASMAtomicWriteU16(&pGip->aiCpuFromCpuSetIdx[iCpuSet], i);
1467 else
1468 LogRelMax(64, ("supdrvGipMpEventOnlineOrInitOnCpu: iCpuSet=%d is out of bounds (%#zx, i=%u, idApic=%d)\n",
1469 iCpuSet, RT_ELEMENTS(pGip->aiCpuFromApicId), i, idApic));
1470
1471 /* Add this CPU to this set of CPUs we need to calculate the TSC-delta for. */
1472 RTCpuSetAddByIndex(&pDevExt->TscDeltaCpuSet, RTMpCpuIdToSetIndex(idCpu));
1473
1474 /* Update the Mp online/offline counter. */
1475 ASMAtomicIncU32(&pDevExt->cMpOnOffEvents);
1476
1477 /* Commit it. */
1478 ASMAtomicWriteSize(&pGip->aCPUs[i].enmState, SUPGIPCPUSTATE_ONLINE);
1479
1480 RTSpinlockRelease(pDevExt->hGipSpinlock);
1481}
1482
1483
1484/**
1485 * RTMpOnSpecific callback wrapper for supdrvGipMpEventOnlineOrInitOnCpu().
1486 *
1487 * @param idCpu The CPU ID we are running on.
1488 * @param pvUser1 Opaque pointer to the device instance data.
1489 * @param pvUser2 Not used.
1490 */
1491static DECLCALLBACK(void) supdrvGipMpEventOnlineCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
1492{
1493 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser1;
1494 NOREF(pvUser2);
1495 supdrvGipMpEventOnlineOrInitOnCpu(pDevExt, idCpu);
1496}
1497
1498
1499/**
1500 * The CPU should be accounted as offline, update the GIP accordingly.
1501 *
1502 * This is used by supdrvGipMpEvent.
1503 *
1504 * @param pDevExt The device extension.
1505 * @param idCpu The CPU ID.
1506 */
1507static void supdrvGipMpEventOffline(PSUPDRVDEVEXT pDevExt, RTCPUID idCpu)
1508{
1509 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
1510 int iCpuSet;
1511 unsigned i;
1512
1513 AssertPtrReturnVoid(pGip);
1514 RTSpinlockAcquire(pDevExt->hGipSpinlock);
1515
1516 iCpuSet = RTMpCpuIdToSetIndex(idCpu);
1517 AssertReturnVoid(iCpuSet >= 0);
1518
1519 i = pGip->aiCpuFromCpuSetIdx[iCpuSet];
1520 AssertReturnVoid(i < pGip->cCpus);
1521 AssertReturnVoid(pGip->aCPUs[i].idCpu == idCpu);
1522
1523 Assert(RTCpuSetIsMemberByIndex(&pGip->PossibleCpuSet, iCpuSet));
1524 RTCpuSetDelByIndex(&pGip->OnlineCpuSet, iCpuSet);
1525
1526 /* Update the Mp online/offline counter. */
1527 ASMAtomicIncU32(&pDevExt->cMpOnOffEvents);
1528
1529 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
1530 {
1531 /* Reset the TSC delta, we will recalculate it lazily. */
1532 ASMAtomicWriteS64(&pGip->aCPUs[i].i64TSCDelta, INT64_MAX);
1533 /* Remove this CPU from the set of CPUs that we have obtained the TSC deltas. */
1534 RTCpuSetDelByIndex(&pDevExt->TscDeltaObtainedCpuSet, iCpuSet);
1535 }
1536
1537 /* Commit it. */
1538 ASMAtomicWriteSize(&pGip->aCPUs[i].enmState, SUPGIPCPUSTATE_OFFLINE);
1539
1540 RTSpinlockRelease(pDevExt->hGipSpinlock);
1541}
1542
1543
1544/**
1545 * Multiprocessor event notification callback.
1546 *
1547 * This is used to make sure that the GIP master gets passed on to
1548 * another CPU. It also updates the associated CPU data.
1549 *
1550 * @param enmEvent The event.
1551 * @param idCpu The cpu it applies to.
1552 * @param pvUser Pointer to the device extension.
1553 */
1554static DECLCALLBACK(void) supdrvGipMpEvent(RTMPEVENT enmEvent, RTCPUID idCpu, void *pvUser)
1555{
1556 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
1557 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
1558
1559 if (pGip)
1560 {
1561 RTTHREADPREEMPTSTATE PreemptState = RTTHREADPREEMPTSTATE_INITIALIZER;
1562 switch (enmEvent)
1563 {
1564 case RTMPEVENT_ONLINE:
1565 {
1566 RTThreadPreemptDisable(&PreemptState);
1567 if (idCpu == RTMpCpuId())
1568 {
1569 supdrvGipMpEventOnlineOrInitOnCpu(pDevExt, idCpu);
1570 RTThreadPreemptRestore(&PreemptState);
1571 }
1572 else
1573 {
1574 RTThreadPreemptRestore(&PreemptState);
1575 RTMpOnSpecific(idCpu, supdrvGipMpEventOnlineCallback, pDevExt, NULL /* pvUser2 */);
1576 }
1577
1578 /*
1579 * Recompute TSC-delta for the newly online'd CPU.
1580 */
1581 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
1582 {
1583#ifdef SUPDRV_USE_TSC_DELTA_THREAD
1584 supdrvTscDeltaThreadStartMeasurement(pDevExt, false /* fForceAll */);
1585#else
1586 uint32_t iCpu = supdrvGipFindOrAllocCpuIndexForCpuId(pGip, idCpu);
1587 supdrvTscMeasureDeltaOne(pDevExt, iCpu);
1588#endif
1589 }
1590 break;
1591 }
1592
1593 case RTMPEVENT_OFFLINE:
1594 supdrvGipMpEventOffline(pDevExt, idCpu);
1595 break;
1596 }
1597 }
1598
1599 /*
1600 * Make sure there is a master GIP.
1601 */
1602 if (enmEvent == RTMPEVENT_OFFLINE)
1603 {
1604 RTCPUID idGipMaster = ASMAtomicReadU32(&pDevExt->idGipMaster);
1605 if (idGipMaster == idCpu)
1606 {
1607 /*
1608 * The GIP master is going offline, find a new one.
1609 */
1610 bool fIgnored;
1611 unsigned i;
1612 RTCPUID idNewGipMaster = NIL_RTCPUID;
1613 RTCPUSET OnlineCpus;
1614 RTMpGetOnlineSet(&OnlineCpus);
1615
1616 for (i = 0; i < RTCPUSET_MAX_CPUS; i++)
1617 if (RTCpuSetIsMemberByIndex(&OnlineCpus, i))
1618 {
1619 RTCPUID idCurCpu = RTMpCpuIdFromSetIndex(i);
1620 if (idCurCpu != idGipMaster)
1621 {
1622 idNewGipMaster = idCurCpu;
1623 break;
1624 }
1625 }
1626
1627 Log(("supdrvGipMpEvent: Gip master %#lx -> %#lx\n", (long)idGipMaster, (long)idNewGipMaster));
1628 ASMAtomicCmpXchgSize(&pDevExt->idGipMaster, idNewGipMaster, idGipMaster, fIgnored);
1629 NOREF(fIgnored);
1630 }
1631 }
1632}
1633
1634
1635/**
1636 * On CPU initialization callback for RTMpOnAll.
1637 *
1638 * @param idCpu The CPU ID.
1639 * @param pvUser1 The device extension.
1640 * @param pvUser2 The GIP.
1641 */
1642static DECLCALLBACK(void) supdrvGipInitOnCpu(RTCPUID idCpu, void *pvUser1, void *pvUser2)
1643{
1644 /* This is good enough, even though it will update some of the globals a
1645 bit to much. */
1646 supdrvGipMpEventOnlineOrInitOnCpu((PSUPDRVDEVEXT)pvUser1, idCpu);
1647 NOREF(pvUser2);
1648}
1649
1650
1651/**
1652 * Callback used by supdrvDetermineAsyncTSC to read the TSC on a CPU.
1653 *
1654 * @param idCpu Ignored.
1655 * @param pvUser1 Where to put the TSC.
1656 * @param pvUser2 Ignored.
1657 */
1658static DECLCALLBACK(void) supdrvGipInitDetermineAsyncTscWorker(RTCPUID idCpu, void *pvUser1, void *pvUser2)
1659{
1660 Assert(RTMpCpuIdToSetIndex(idCpu) == (intptr_t)pvUser2);
1661 ASMAtomicWriteU64((uint64_t volatile *)pvUser1, ASMReadTSC());
1662 RT_NOREF2(idCpu, pvUser2);
1663}
1664
1665
1666/**
1667 * Determine if Async GIP mode is required because of TSC drift.
1668 *
1669 * When using the default/normal timer code it is essential that the time stamp counter
1670 * (TSC) runs never backwards, that is, a read operation to the counter should return
1671 * a bigger value than any previous read operation. This is guaranteed by the latest
1672 * AMD CPUs and by newer Intel CPUs which never enter the C2 state (P4). In any other
1673 * case we have to choose the asynchronous timer mode.
1674 *
1675 * @param poffMin Pointer to the determined difference between different
1676 * cores (optional, can be NULL).
1677 * @return false if the time stamp counters appear to be synchronized, true otherwise.
1678 */
1679static bool supdrvGipInitDetermineAsyncTsc(uint64_t *poffMin)
1680{
1681 /*
1682 * Just iterate all the cpus 8 times and make sure that the TSC is
1683 * ever increasing. We don't bother taking TSC rollover into account.
1684 */
1685 int iEndCpu = RTMpGetArraySize();
1686 int iCpu;
1687 int cLoops = 8;
1688 bool fAsync = false;
1689 int rc = VINF_SUCCESS;
1690 uint64_t offMax = 0;
1691 uint64_t offMin = ~(uint64_t)0;
1692 uint64_t PrevTsc = ASMReadTSC();
1693
1694 while (cLoops-- > 0)
1695 {
1696 for (iCpu = 0; iCpu < iEndCpu; iCpu++)
1697 {
1698 uint64_t CurTsc;
1699 rc = RTMpOnSpecific(RTMpCpuIdFromSetIndex(iCpu), supdrvGipInitDetermineAsyncTscWorker,
1700 &CurTsc, (void *)(uintptr_t)iCpu);
1701 if (RT_SUCCESS(rc))
1702 {
1703 if (CurTsc <= PrevTsc)
1704 {
1705 fAsync = true;
1706 offMin = offMax = PrevTsc - CurTsc;
1707 Log(("supdrvGipInitDetermineAsyncTsc: iCpu=%d cLoops=%d CurTsc=%llx PrevTsc=%llx\n",
1708 iCpu, cLoops, CurTsc, PrevTsc));
1709 break;
1710 }
1711
1712 /* Gather statistics (except the first time). */
1713 if (iCpu != 0 || cLoops != 7)
1714 {
1715 uint64_t off = CurTsc - PrevTsc;
1716 if (off < offMin)
1717 offMin = off;
1718 if (off > offMax)
1719 offMax = off;
1720 Log2(("%d/%d: off=%llx\n", cLoops, iCpu, off));
1721 }
1722
1723 /* Next */
1724 PrevTsc = CurTsc;
1725 }
1726 else if (rc == VERR_NOT_SUPPORTED)
1727 break;
1728 else
1729 AssertMsg(rc == VERR_CPU_NOT_FOUND || rc == VERR_CPU_OFFLINE, ("%d\n", rc));
1730 }
1731
1732 /* broke out of the loop. */
1733 if (iCpu < iEndCpu)
1734 break;
1735 }
1736
1737 if (poffMin)
1738 *poffMin = offMin; /* Almost RTMpOnSpecific profiling. */
1739 Log(("supdrvGipInitDetermineAsyncTsc: returns %d; iEndCpu=%d rc=%d offMin=%llx offMax=%llx\n",
1740 fAsync, iEndCpu, rc, offMin, offMax));
1741#if !defined(RT_OS_SOLARIS) && !defined(RT_OS_OS2) && !defined(RT_OS_WINDOWS)
1742 OSDBGPRINT(("vboxdrv: fAsync=%d offMin=%#lx offMax=%#lx\n", fAsync, (long)offMin, (long)offMax));
1743#endif
1744 return fAsync;
1745}
1746
1747
1748/**
1749 * supdrvGipInit() worker that determines the GIP TSC mode.
1750 *
1751 * @returns The most suitable TSC mode.
1752 * @param pDevExt Pointer to the device instance data.
1753 */
1754static SUPGIPMODE supdrvGipInitDetermineTscMode(PSUPDRVDEVEXT pDevExt)
1755{
1756 uint64_t u64DiffCoresIgnored;
1757 uint32_t uEAX, uEBX, uECX, uEDX;
1758
1759 /*
1760 * Establish whether the CPU advertises TSC as invariant, we need that in
1761 * a couple of places below.
1762 */
1763 bool fInvariantTsc = false;
1764 if (ASMHasCpuId())
1765 {
1766 uEAX = ASMCpuId_EAX(0x80000000);
1767 if (ASMIsValidExtRange(uEAX) && uEAX >= 0x80000007)
1768 {
1769 uEDX = ASMCpuId_EDX(0x80000007);
1770 if (uEDX & X86_CPUID_AMD_ADVPOWER_EDX_TSCINVAR)
1771 fInvariantTsc = true;
1772 }
1773 }
1774
1775 /*
1776 * On single CPU systems, we don't need to consider ASYNC mode.
1777 */
1778 if (RTMpGetCount() <= 1)
1779 return fInvariantTsc ? SUPGIPMODE_INVARIANT_TSC : SUPGIPMODE_SYNC_TSC;
1780
1781 /*
1782 * Allow the user and/or OS specific bits to force async mode.
1783 */
1784 if (supdrvOSGetForcedAsyncTscMode(pDevExt))
1785 return SUPGIPMODE_ASYNC_TSC;
1786
1787 /*
1788 * Use invariant mode if the CPU says TSC is invariant.
1789 */
1790 if (fInvariantTsc)
1791 return SUPGIPMODE_INVARIANT_TSC;
1792
1793 /*
1794 * TSC is not invariant and we're on SMP, this presents two problems:
1795 *
1796 * (1) There might be a skew between the CPU, so that cpu0
1797 * returns a TSC that is slightly different from cpu1.
1798 * This screw may be due to (2), bad TSC initialization
1799 * or slightly different TSC rates.
1800 *
1801 * (2) Power management (and other things) may cause the TSC
1802 * to run at a non-constant speed, and cause the speed
1803 * to be different on the cpus. This will result in (1).
1804 *
1805 * If any of the above is detected, we will have to use ASYNC mode.
1806 */
1807 /* (1). Try check for current differences between the cpus. */
1808 if (supdrvGipInitDetermineAsyncTsc(&u64DiffCoresIgnored))
1809 return SUPGIPMODE_ASYNC_TSC;
1810
1811 /* (2) If it's an AMD CPU with power management, we won't trust its TSC. */
1812 ASMCpuId(0, &uEAX, &uEBX, &uECX, &uEDX);
1813 if ( ASMIsValidStdRange(uEAX)
1814 && (ASMIsAmdCpuEx(uEBX, uECX, uEDX) || ASMIsHygonCpuEx(uEBX, uECX, uEDX)) )
1815 {
1816 /* Check for APM support. */
1817 uEAX = ASMCpuId_EAX(0x80000000);
1818 if (ASMIsValidExtRange(uEAX) && uEAX >= 0x80000007)
1819 {
1820 uEDX = ASMCpuId_EDX(0x80000007);
1821 if (uEDX & 0x3e) /* STC|TM|THERMTRIP|VID|FID. Ignore TS. */
1822 return SUPGIPMODE_ASYNC_TSC;
1823 }
1824 }
1825
1826 return SUPGIPMODE_SYNC_TSC;
1827}
1828
1829
1830/**
1831 * Initializes per-CPU GIP information.
1832 *
1833 * @param pGip Pointer to the GIP.
1834 * @param pCpu Pointer to which GIP CPU to initialize.
1835 * @param u64NanoTS The current nanosecond timestamp.
1836 * @param uCpuHz The CPU frequency to set, 0 if the caller doesn't know.
1837 */
1838static void supdrvGipInitCpu(PSUPGLOBALINFOPAGE pGip, PSUPGIPCPU pCpu, uint64_t u64NanoTS, uint64_t uCpuHz)
1839{
1840 pCpu->u32TransactionId = 2;
1841 pCpu->u64NanoTS = u64NanoTS;
1842 pCpu->u64TSC = ASMReadTSC();
1843 pCpu->u64TSCSample = GIP_TSC_DELTA_RSVD;
1844 pCpu->i64TSCDelta = pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED ? INT64_MAX : 0;
1845
1846 ASMAtomicWriteSize(&pCpu->enmState, SUPGIPCPUSTATE_INVALID);
1847 ASMAtomicWriteU32(&pCpu->idCpu, NIL_RTCPUID);
1848 ASMAtomicWriteS16(&pCpu->iCpuSet, -1);
1849 ASMAtomicWriteU16(&pCpu->iCpuGroup, 0);
1850 ASMAtomicWriteU16(&pCpu->iCpuGroupMember, UINT16_MAX);
1851 ASMAtomicWriteU16(&pCpu->idApic, UINT16_MAX);
1852 ASMAtomicWriteU32(&pCpu->iReservedForNumaNode, 0);
1853
1854 /*
1855 * The first time we're called, we don't have a CPU frequency handy,
1856 * so pretend it's a 4 GHz CPU. On CPUs that are online, we'll get
1857 * called again and at that point we have a more plausible CPU frequency
1858 * value handy. The frequency history will also be adjusted again on
1859 * the 2nd timer callout (maybe we can skip that now?).
1860 */
1861 if (!uCpuHz)
1862 {
1863 pCpu->u64CpuHz = _4G - 1;
1864 pCpu->u32UpdateIntervalTSC = (uint32_t)((_4G - 1) / pGip->u32UpdateHz);
1865 }
1866 else
1867 {
1868 pCpu->u64CpuHz = uCpuHz;
1869 pCpu->u32UpdateIntervalTSC = (uint32_t)(uCpuHz / pGip->u32UpdateHz);
1870 }
1871 pCpu->au32TSCHistory[0]
1872 = pCpu->au32TSCHistory[1]
1873 = pCpu->au32TSCHistory[2]
1874 = pCpu->au32TSCHistory[3]
1875 = pCpu->au32TSCHistory[4]
1876 = pCpu->au32TSCHistory[5]
1877 = pCpu->au32TSCHistory[6]
1878 = pCpu->au32TSCHistory[7]
1879 = pCpu->u32UpdateIntervalTSC;
1880}
1881
1882
1883/**
1884 * Initializes the GIP data.
1885 *
1886 * @returns VBox status code.
1887 * @param pDevExt Pointer to the device instance data.
1888 * @param pGip Pointer to the read-write kernel mapping of the GIP.
1889 * @param HCPhys The physical address of the GIP.
1890 * @param u64NanoTS The current nanosecond timestamp.
1891 * @param uUpdateHz The update frequency.
1892 * @param uUpdateIntervalNS The update interval in nanoseconds.
1893 * @param cCpus The CPU count.
1894 * @param cbGipCpuGroups The supdrvOSGipGetGroupTableSize return value we
1895 * used when allocating the GIP structure.
1896 */
1897static int supdrvGipInit(PSUPDRVDEVEXT pDevExt, PSUPGLOBALINFOPAGE pGip, RTHCPHYS HCPhys,
1898 uint64_t u64NanoTS, unsigned uUpdateHz, unsigned uUpdateIntervalNS,
1899 unsigned cCpus, size_t cbGipCpuGroups)
1900{
1901 size_t const cbGip = RT_ALIGN_Z(RT_UOFFSETOF_DYN(SUPGLOBALINFOPAGE, aCPUs[cCpus]) + cbGipCpuGroups, PAGE_SIZE);
1902 unsigned i;
1903#ifdef DEBUG_DARWIN_GIP
1904 OSDBGPRINT(("supdrvGipInit: pGip=%p HCPhys=%lx u64NanoTS=%llu uUpdateHz=%d cCpus=%u\n", pGip, (long)HCPhys, u64NanoTS, uUpdateHz, cCpus));
1905#else
1906 LogFlow(("supdrvGipInit: pGip=%p HCPhys=%lx u64NanoTS=%llu uUpdateHz=%d cCpus=%u\n", pGip, (long)HCPhys, u64NanoTS, uUpdateHz, cCpus));
1907#endif
1908
1909 /*
1910 * Initialize the structure.
1911 */
1912 memset(pGip, 0, cbGip);
1913
1914 pGip->u32Magic = SUPGLOBALINFOPAGE_MAGIC;
1915 pGip->u32Version = SUPGLOBALINFOPAGE_VERSION;
1916 pGip->u32Mode = supdrvGipInitDetermineTscMode(pDevExt);
1917 if ( pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC
1918 /*|| pGip->u32Mode == SUPGIPMODE_SYNC_TSC */)
1919 pGip->enmUseTscDelta = supdrvOSAreTscDeltasInSync() /* Allow OS override (windows). */
1920 ? SUPGIPUSETSCDELTA_ZERO_CLAIMED : SUPGIPUSETSCDELTA_PRACTICALLY_ZERO /* downgrade later */;
1921 else
1922 pGip->enmUseTscDelta = SUPGIPUSETSCDELTA_NOT_APPLICABLE;
1923 pGip->cCpus = (uint16_t)cCpus;
1924 pGip->cPages = (uint16_t)(cbGip / PAGE_SIZE);
1925 pGip->u32UpdateHz = uUpdateHz;
1926 pGip->u32UpdateIntervalNS = uUpdateIntervalNS;
1927 pGip->fGetGipCpu = SUPGIPGETCPU_APIC_ID;
1928 RTCpuSetEmpty(&pGip->OnlineCpuSet);
1929 RTCpuSetEmpty(&pGip->PresentCpuSet);
1930 RTMpGetSet(&pGip->PossibleCpuSet);
1931 pGip->cOnlineCpus = RTMpGetOnlineCount();
1932 pGip->cPresentCpus = RTMpGetPresentCount();
1933 pGip->cPossibleCpus = RTMpGetCount();
1934 pGip->cPossibleCpuGroups = 1;
1935 pGip->idCpuMax = RTMpGetMaxCpuId();
1936 for (i = 0; i < RT_ELEMENTS(pGip->aiCpuFromApicId); i++)
1937 pGip->aiCpuFromApicId[i] = UINT16_MAX;
1938 for (i = 0; i < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx); i++)
1939 pGip->aiCpuFromCpuSetIdx[i] = UINT16_MAX;
1940 for (i = 0; i < RT_ELEMENTS(pGip->aoffCpuGroup); i++)
1941 pGip->aoffCpuGroup[i] = UINT32_MAX;
1942 for (i = 0; i < cCpus; i++)
1943 supdrvGipInitCpu(pGip, &pGip->aCPUs[i], u64NanoTS, 0 /*uCpuHz*/);
1944#ifdef RT_OS_WINDOWS
1945 int rc = supdrvOSInitGipGroupTable(pDevExt, pGip, cbGipCpuGroups);
1946 AssertRCReturn(rc, rc);
1947#endif
1948
1949 /*
1950 * Link it to the device extension.
1951 */
1952 pDevExt->pGip = pGip;
1953 pDevExt->HCPhysGip = HCPhys;
1954 pDevExt->cGipUsers = 0;
1955
1956 return VINF_SUCCESS;
1957}
1958
1959
1960/**
1961 * Creates the GIP.
1962 *
1963 * @returns VBox status code.
1964 * @param pDevExt Instance data. GIP stuff may be updated.
1965 */
1966int VBOXCALL supdrvGipCreate(PSUPDRVDEVEXT pDevExt)
1967{
1968 PSUPGLOBALINFOPAGE pGip;
1969 size_t cbGip;
1970 size_t cbGipCpuGroups;
1971 RTHCPHYS HCPhysGip;
1972 uint32_t u32SystemResolution;
1973 uint32_t u32Interval;
1974 uint32_t u32MinInterval;
1975 uint32_t uMod;
1976 unsigned cCpus;
1977 int rc;
1978
1979 LogFlow(("supdrvGipCreate:\n"));
1980
1981 /*
1982 * Assert order.
1983 */
1984 Assert(pDevExt->u32SystemTimerGranularityGrant == 0);
1985 Assert(pDevExt->GipMemObj == NIL_RTR0MEMOBJ);
1986 Assert(!pDevExt->pGipTimer);
1987#ifdef SUPDRV_USE_MUTEX_FOR_GIP
1988 Assert(pDevExt->mtxGip != NIL_RTSEMMUTEX);
1989 Assert(pDevExt->mtxTscDelta != NIL_RTSEMMUTEX);
1990#else
1991 Assert(pDevExt->mtxGip != NIL_RTSEMFASTMUTEX);
1992 Assert(pDevExt->mtxTscDelta != NIL_RTSEMFASTMUTEX);
1993#endif
1994
1995 /*
1996 * Check the CPU count.
1997 */
1998 cCpus = RTMpGetArraySize();
1999 if (cCpus > RT_MIN(RTCPUSET_MAX_CPUS, RT_ELEMENTS(pGip->aiCpuFromApicId)))
2000 {
2001 SUPR0Printf("VBoxDrv: Too many CPUs (%u) for the GIP (max %u)\n", cCpus, RT_MIN(RTCPUSET_MAX_CPUS, RT_ELEMENTS(pGip->aiCpuFromApicId)));
2002 return VERR_TOO_MANY_CPUS;
2003 }
2004
2005 /*
2006 * Allocate a contiguous set of pages with a default kernel mapping.
2007 */
2008#ifdef RT_OS_WINDOWS
2009 cbGipCpuGroups = supdrvOSGipGetGroupTableSize(pDevExt);
2010#else
2011 cbGipCpuGroups = 0;
2012#endif
2013 cbGip = RT_UOFFSETOF_DYN(SUPGLOBALINFOPAGE, aCPUs[cCpus]) + cbGipCpuGroups;
2014 rc = RTR0MemObjAllocCont(&pDevExt->GipMemObj, cbGip, false /*fExecutable*/);
2015 if (RT_FAILURE(rc))
2016 {
2017 OSDBGPRINT(("supdrvGipCreate: failed to allocate the GIP page. rc=%d\n", rc));
2018 return rc;
2019 }
2020 pGip = (PSUPGLOBALINFOPAGE)RTR0MemObjAddress(pDevExt->GipMemObj); AssertPtr(pGip);
2021 HCPhysGip = RTR0MemObjGetPagePhysAddr(pDevExt->GipMemObj, 0); Assert(HCPhysGip != NIL_RTHCPHYS);
2022
2023 /*
2024 * Find a reasonable update interval and initialize the structure.
2025 */
2026 supdrvGipRequestHigherTimerFrequencyFromSystem(pDevExt);
2027 /** @todo figure out why using a 100Ms interval upsets timekeeping in VMs.
2028 * See @bugref{6710}. */
2029 u32MinInterval = RT_NS_10MS;
2030 u32SystemResolution = RTTimerGetSystemGranularity();
2031 u32Interval = u32MinInterval;
2032 uMod = u32MinInterval % u32SystemResolution;
2033 if (uMod)
2034 u32Interval += u32SystemResolution - uMod;
2035
2036 rc = supdrvGipInit(pDevExt, pGip, HCPhysGip, RTTimeSystemNanoTS(), RT_NS_1SEC / u32Interval /*=Hz*/, u32Interval,
2037 cCpus, cbGipCpuGroups);
2038
2039 /*
2040 * Important sanity check... (Sets rc)
2041 */
2042 if (RT_UNLIKELY( pGip->enmUseTscDelta == SUPGIPUSETSCDELTA_ZERO_CLAIMED
2043 && pGip->u32Mode == SUPGIPMODE_ASYNC_TSC
2044 && !supdrvOSGetForcedAsyncTscMode(pDevExt)))
2045 {
2046 OSDBGPRINT(("supdrvGipCreate: Host-OS/user claims the TSC-deltas are zero but we detected async. TSC! Bad.\n"));
2047 rc = VERR_INTERNAL_ERROR_2;
2048 }
2049
2050 /* It doesn't make sense to do TSC-delta detection on systems we detect as async. */
2051 AssertStmt( pGip->u32Mode != SUPGIPMODE_ASYNC_TSC
2052 || pGip->enmUseTscDelta <= SUPGIPUSETSCDELTA_ZERO_CLAIMED,
2053 rc = VERR_INTERNAL_ERROR_3);
2054
2055 /*
2056 * Do the TSC frequency measurements.
2057 *
2058 * If we're in invariant TSC mode, just to a quick preliminary measurement
2059 * that the TSC-delta measurement code can use to yield cross calls.
2060 *
2061 * If we're in any of the other two modes, neither which require MP init,
2062 * notifications or deltas for the job, do the full measurement now so
2063 * that supdrvGipInitOnCpu() can populate the TSC interval and history
2064 * array with more reasonable values.
2065 */
2066 if (RT_SUCCESS(rc))
2067 {
2068 if (pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC)
2069 {
2070 rc = supdrvGipInitMeasureTscFreq(pGip, true /*fRough*/); /* cannot fail */
2071 supdrvGipInitStartTimerForRefiningInvariantTscFreq(pDevExt);
2072 }
2073 else
2074 rc = supdrvGipInitMeasureTscFreq(pGip, false /*fRough*/);
2075 if (RT_SUCCESS(rc))
2076 {
2077 /*
2078 * Start TSC-delta measurement thread before we start getting MP
2079 * events that will try kick it into action (includes the
2080 * RTMpOnAll/supdrvGipInitOnCpu call below).
2081 */
2082 RTCpuSetEmpty(&pDevExt->TscDeltaCpuSet);
2083 RTCpuSetEmpty(&pDevExt->TscDeltaObtainedCpuSet);
2084#ifdef SUPDRV_USE_TSC_DELTA_THREAD
2085 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
2086 rc = supdrvTscDeltaThreadInit(pDevExt);
2087#endif
2088 if (RT_SUCCESS(rc))
2089 {
2090 rc = RTMpNotificationRegister(supdrvGipMpEvent, pDevExt);
2091 if (RT_SUCCESS(rc))
2092 {
2093 /*
2094 * Do GIP initialization on all online CPUs. Wake up the
2095 * TSC-delta thread afterwards.
2096 */
2097 rc = RTMpOnAll(supdrvGipInitOnCpu, pDevExt, pGip);
2098 if (RT_SUCCESS(rc))
2099 {
2100#ifdef SUPDRV_USE_TSC_DELTA_THREAD
2101 supdrvTscDeltaThreadStartMeasurement(pDevExt, true /* fForceAll */);
2102#else
2103 uint16_t iCpu;
2104 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
2105 {
2106 /*
2107 * Measure the TSC deltas now that we have MP notifications.
2108 */
2109 int cTries = 5;
2110 do
2111 {
2112 rc = supdrvTscMeasureInitialDeltas(pDevExt);
2113 if ( rc != VERR_TRY_AGAIN
2114 && rc != VERR_CPU_OFFLINE)
2115 break;
2116 } while (--cTries > 0);
2117 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
2118 Log(("supdrvTscDeltaInit: cpu[%u] delta %lld\n", iCpu, pGip->aCPUs[iCpu].i64TSCDelta));
2119 }
2120 else
2121 {
2122 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
2123 AssertMsg(!pGip->aCPUs[iCpu].i64TSCDelta, ("iCpu=%u %lld mode=%d\n", iCpu, pGip->aCPUs[iCpu].i64TSCDelta, pGip->u32Mode));
2124 }
2125 if (RT_SUCCESS(rc))
2126#endif
2127 {
2128 /*
2129 * Create the timer.
2130 * If CPU_ALL isn't supported we'll have to fall back to synchronous mode.
2131 */
2132 if (pGip->u32Mode == SUPGIPMODE_ASYNC_TSC)
2133 {
2134 rc = RTTimerCreateEx(&pDevExt->pGipTimer, u32Interval, RTTIMER_FLAGS_CPU_ALL,
2135 supdrvGipAsyncTimer, pDevExt);
2136 if (rc == VERR_NOT_SUPPORTED)
2137 {
2138 OSDBGPRINT(("supdrvGipCreate: omni timer not supported, falling back to synchronous mode\n"));
2139 pGip->u32Mode = SUPGIPMODE_SYNC_TSC;
2140 }
2141 }
2142 if (pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
2143 rc = RTTimerCreateEx(&pDevExt->pGipTimer, u32Interval, 0 /* fFlags */,
2144 supdrvGipSyncAndInvariantTimer, pDevExt);
2145 if (RT_SUCCESS(rc))
2146 {
2147 /*
2148 * We're good.
2149 */
2150 Log(("supdrvGipCreate: %u ns interval.\n", u32Interval));
2151 supdrvGipReleaseHigherTimerFrequencyFromSystem(pDevExt);
2152
2153 g_pSUPGlobalInfoPage = pGip;
2154 return VINF_SUCCESS;
2155 }
2156
2157 OSDBGPRINT(("supdrvGipCreate: failed create GIP timer at %u ns interval. rc=%Rrc\n", u32Interval, rc));
2158 Assert(!pDevExt->pGipTimer);
2159 }
2160 }
2161 else
2162 OSDBGPRINT(("supdrvGipCreate: RTMpOnAll failed. rc=%Rrc\n", rc));
2163 }
2164 else
2165 OSDBGPRINT(("supdrvGipCreate: failed to register MP event notfication. rc=%Rrc\n", rc));
2166 }
2167 else
2168 OSDBGPRINT(("supdrvGipCreate: supdrvTscDeltaInit failed. rc=%Rrc\n", rc));
2169 }
2170 else
2171 OSDBGPRINT(("supdrvGipCreate: supdrvTscMeasureInitialDeltas failed. rc=%Rrc\n", rc));
2172 }
2173
2174 /* Releases timer frequency increase too. */
2175 supdrvGipDestroy(pDevExt);
2176 return rc;
2177}
2178
2179
2180/**
2181 * Invalidates the GIP data upon termination.
2182 *
2183 * @param pGip Pointer to the read-write kernel mapping of the GIP.
2184 */
2185static void supdrvGipTerm(PSUPGLOBALINFOPAGE pGip)
2186{
2187 unsigned i;
2188 pGip->u32Magic = 0;
2189 for (i = 0; i < pGip->cCpus; i++)
2190 {
2191 pGip->aCPUs[i].u64NanoTS = 0;
2192 pGip->aCPUs[i].u64TSC = 0;
2193 pGip->aCPUs[i].iTSCHistoryHead = 0;
2194 pGip->aCPUs[i].u64TSCSample = 0;
2195 pGip->aCPUs[i].i64TSCDelta = INT64_MAX;
2196 }
2197}
2198
2199
2200/**
2201 * Terminates the GIP.
2202 *
2203 * @param pDevExt Instance data. GIP stuff may be updated.
2204 */
2205void VBOXCALL supdrvGipDestroy(PSUPDRVDEVEXT pDevExt)
2206{
2207 int rc;
2208#ifdef DEBUG_DARWIN_GIP
2209 OSDBGPRINT(("supdrvGipDestroy: pDevExt=%p pGip=%p pGipTimer=%p GipMemObj=%p\n", pDevExt,
2210 pDevExt->GipMemObj != NIL_RTR0MEMOBJ ? RTR0MemObjAddress(pDevExt->GipMemObj) : NULL,
2211 pDevExt->pGipTimer, pDevExt->GipMemObj));
2212#endif
2213
2214 /*
2215 * Stop receiving MP notifications before tearing anything else down.
2216 */
2217 RTMpNotificationDeregister(supdrvGipMpEvent, pDevExt);
2218
2219#ifdef SUPDRV_USE_TSC_DELTA_THREAD
2220 /*
2221 * Terminate the TSC-delta measurement thread and resources.
2222 */
2223 supdrvTscDeltaTerm(pDevExt);
2224#endif
2225
2226 /*
2227 * Destroy the TSC-refinement timer.
2228 */
2229 if (pDevExt->pInvarTscRefineTimer)
2230 {
2231 RTTimerDestroy(pDevExt->pInvarTscRefineTimer);
2232 pDevExt->pInvarTscRefineTimer = NULL;
2233 }
2234
2235 /*
2236 * Invalid the GIP data.
2237 */
2238 if (pDevExt->pGip)
2239 {
2240 supdrvGipTerm(pDevExt->pGip);
2241 pDevExt->pGip = NULL;
2242 }
2243 g_pSUPGlobalInfoPage = NULL;
2244
2245 /*
2246 * Destroy the timer and free the GIP memory object.
2247 */
2248 if (pDevExt->pGipTimer)
2249 {
2250 rc = RTTimerDestroy(pDevExt->pGipTimer); AssertRC(rc);
2251 pDevExt->pGipTimer = NULL;
2252 }
2253
2254 if (pDevExt->GipMemObj != NIL_RTR0MEMOBJ)
2255 {
2256 rc = RTR0MemObjFree(pDevExt->GipMemObj, true /* free mappings */); AssertRC(rc);
2257 pDevExt->GipMemObj = NIL_RTR0MEMOBJ;
2258 }
2259
2260 /*
2261 * Finally, make sure we've release the system timer resolution request
2262 * if one actually succeeded and is still pending.
2263 */
2264 supdrvGipReleaseHigherTimerFrequencyFromSystem(pDevExt);
2265}
2266
2267
2268
2269
2270/*
2271 *
2272 *
2273 * GIP Update Timer Related Code
2274 * GIP Update Timer Related Code
2275 * GIP Update Timer Related Code
2276 *
2277 *
2278 */
2279
2280
/**
 * Worker routine for supdrvGipUpdate() and supdrvGipUpdatePerCpu() that
 * updates all the per cpu data except the transaction id.
 *
 * It records the nanosecond timestamp and TSC value of this update and, when
 * the frequency needs recalculating (non-invariant TSC mode or GIP testing
 * mode), pushes the TSC interval onto the history ring and derives
 * u32UpdateIntervalTSC and u64CpuHz from it.
 *
 * @param   pDevExt     The device extension.
 * @param   pGipCpu     Pointer to the per cpu data.
 * @param   u64NanoTS   The current time stamp.
 * @param   u64TSC      The current TSC.
 * @param   iTick       The current timer tick.
 *
 * @remarks Can be called with interrupts disabled!
 */
static void supdrvGipDoUpdateCpu(PSUPDRVDEVEXT pDevExt, PSUPGIPCPU pGipCpu, uint64_t u64NanoTS, uint64_t u64TSC, uint64_t iTick)
{
    uint64_t    u64TSCDelta;
    bool        fUpdateCpuHz;
    PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
    AssertPtrReturnVoid(pGip);

    /* Delta between this and the previous update (must be computed before u64NanoTS is overwritten below). */
    ASMAtomicUoWriteU32(&pGipCpu->u32PrevUpdateIntervalNS, (uint32_t)(u64NanoTS - pGipCpu->u64NanoTS));

    /*
     * Update the NanoTS.
     */
    ASMAtomicWriteU64(&pGipCpu->u64NanoTS, u64NanoTS);

    /*
     * Calc TSC delta (again before the stored value is replaced).
     */
    u64TSCDelta = u64TSC - pGipCpu->u64TSC;
    ASMAtomicWriteU64(&pGipCpu->u64TSC, u64TSC);

    /*
     * Determine if we need to update the CPU (TSC) frequency calculation.
     *
     * We don't need to keep recalculating the frequency when it's invariant,
     * unless the special tstGIP-2 testing mode is enabled.
     */
    fUpdateCpuHz = pGip->u32Mode != SUPGIPMODE_INVARIANT_TSC;
    if (!(pGip->fFlags & SUPGIP_FLAGS_TESTING))
    { /* likely */ }
    else
    {
        /* Work on a snapshot of the flags; individual bits are cleared atomically below. */
        uint32_t fGipFlags = pGip->fFlags;
        if (fGipFlags & (SUPGIP_FLAGS_TESTING_ENABLE | SUPGIP_FLAGS_TESTING_START))
        {
            if (fGipFlags & SUPGIP_FLAGS_TESTING_START)
            {
                /* Cache the TSC frequency before forcing updates due to test mode. */
                if (!fUpdateCpuHz)
                    pDevExt->uGipTestModeInvariantCpuHz = pGip->aCPUs[0].u64CpuHz;
                ASMAtomicAndU32(&pGip->fFlags, ~SUPGIP_FLAGS_TESTING_START);
            }
            fUpdateCpuHz = true;
        }
        else if (fGipFlags & SUPGIP_FLAGS_TESTING_STOP)
        {
            /* Restore the cached TSC frequency if any (only relevant in invariant mode). */
            if (!fUpdateCpuHz)
            {
                Assert(pDevExt->uGipTestModeInvariantCpuHz);
                ASMAtomicWriteU64(&pGip->aCPUs[0].u64CpuHz, pDevExt->uGipTestModeInvariantCpuHz);
            }
            ASMAtomicAndU32(&pGip->fFlags, ~(SUPGIP_FLAGS_TESTING_STOP | SUPGIP_FLAGS_TESTING));
        }
    }

    /*
     * Calculate the CPU (TSC) frequency if necessary.
     */
    if (fUpdateCpuHz)
    {
        uint64_t    u64CpuHz;
        uint32_t    u32UpdateIntervalTSC;
        uint32_t    u32UpdateIntervalTSCSlack;
        uint32_t    u32TransactionId;
        unsigned    iTSCHistoryHead;

        /* A delta that doesn't fit in 32 bits is treated as bogus: reuse the
           previous interval and count it as an error. */
        if (u64TSCDelta >> 32)
        {
            u64TSCDelta = pGipCpu->u32UpdateIntervalTSC;
            pGipCpu->cErrors++;
        }

        /*
         * On the 2nd and 3rd callout, reset the history with the current TSC
         * interval since the values entered by supdrvGipInit are totally off.
         * The interval on the 1st callout is completely unreliable, the 2nd is
         * a bit better, while the 3rd should be most reliable.
         */
        /** @todo Could we drop this now that we initialize the history
         *        with nominal TSC frequency values? */
        u32TransactionId = pGipCpu->u32TransactionId;
        if (RT_UNLIKELY(   (   u32TransactionId == 5
                            || u32TransactionId == 7)
                        && (   iTick == 2
                            || iTick == 3) ))
        {
            unsigned i;
            for (i = 0; i < RT_ELEMENTS(pGipCpu->au32TSCHistory); i++)
                ASMAtomicUoWriteU32(&pGipCpu->au32TSCHistory[i], (uint32_t)u64TSCDelta);
        }

        /*
         * Validate the NanoTS deltas between timer fires with an arbitrary threshold of 0.5%.
         * Wait until we have at least one full history since the above history reset. The
         * assumption is that the majority of the previous history values will be tolerable.
         * See @bugref{6710#c67}.
         *
         * When the interval is out of tolerance, the measured TSC delta is
         * replaced by the average of the eight history entries.
         */
        /** @todo Could we drop the fudging there now that we initialize the history
         *        with nominal TSC frequency values? */
        if (   u32TransactionId > 23 /* 7 + (8 * 2) */
            && pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
        {
            uint32_t uNanoTsThreshold = pGip->u32UpdateIntervalNS / 200;
            if (   pGipCpu->u32PrevUpdateIntervalNS > pGip->u32UpdateIntervalNS + uNanoTsThreshold
                || pGipCpu->u32PrevUpdateIntervalNS < pGip->u32UpdateIntervalNS - uNanoTsThreshold)
            {
                /* Average in two halves of four entries each, then average the halves. */
                uint32_t u32;
                u32  = pGipCpu->au32TSCHistory[0];
                u32 += pGipCpu->au32TSCHistory[1];
                u32 += pGipCpu->au32TSCHistory[2];
                u32 += pGipCpu->au32TSCHistory[3];
                u32 >>= 2;
                u64TSCDelta  = pGipCpu->au32TSCHistory[4];
                u64TSCDelta += pGipCpu->au32TSCHistory[5];
                u64TSCDelta += pGipCpu->au32TSCHistory[6];
                u64TSCDelta += pGipCpu->au32TSCHistory[7];
                u64TSCDelta >>= 2;
                u64TSCDelta += u32;
                u64TSCDelta >>= 1;
            }
        }

        /*
         * TSC History: advance the ring head (8 entries, hence the '& 7') and
         * store the new interval there.
         */
        Assert(RT_ELEMENTS(pGipCpu->au32TSCHistory) == 8);
        iTSCHistoryHead = (pGipCpu->iTSCHistoryHead + 1) & 7;
        ASMAtomicWriteU32(&pGipCpu->iTSCHistoryHead, iTSCHistoryHead);
        ASMAtomicWriteU32(&pGipCpu->au32TSCHistory[iTSCHistoryHead], (uint32_t)u64TSCDelta);

        /*
         * UpdateIntervalTSC = average of last 8,2,1 intervals depending on update HZ.
         *
         * On Windows, we have an occasional (but recurring) sour value that messed up
         * the history but taking only 1 interval reduces the precision overall.
         */
        if (   pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC
            || pGip->u32UpdateHz >= 1000)
        {
            /* Average of all eight history entries (two halves of four). */
            uint32_t u32;
            u32  = pGipCpu->au32TSCHistory[0];
            u32 += pGipCpu->au32TSCHistory[1];
            u32 += pGipCpu->au32TSCHistory[2];
            u32 += pGipCpu->au32TSCHistory[3];
            u32 >>= 2;
            u32UpdateIntervalTSC  = pGipCpu->au32TSCHistory[4];
            u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[5];
            u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[6];
            u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[7];
            u32UpdateIntervalTSC >>= 2;
            u32UpdateIntervalTSC += u32;
            u32UpdateIntervalTSC >>= 1;

            /* Value chosen for a 2GHz Athlon64 running linux 2.6.10/11. */
            u32UpdateIntervalTSCSlack = u32UpdateIntervalTSC >> 14;
        }
        else if (pGip->u32UpdateHz >= 90)
        {
            /* Average of the current interval and the previous history entry. */
            u32UpdateIntervalTSC  = (uint32_t)u64TSCDelta;
            u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[(iTSCHistoryHead - 1) & 7];
            u32UpdateIntervalTSC >>= 1;

            /* value chosen on a 2GHz thinkpad running windows */
            u32UpdateIntervalTSCSlack = u32UpdateIntervalTSC >> 7;
        }
        else
        {
            /* Low update rates: use the current interval as-is. */
            u32UpdateIntervalTSC  = (uint32_t)u64TSCDelta;

            /* This value hasn't been checked yet.. waiting for OS/2 and 33Hz timers.. :-) */
            u32UpdateIntervalTSCSlack = u32UpdateIntervalTSC >> 6;
        }
        /* The published interval includes the slack; the CpuHz below is computed without it. */
        ASMAtomicWriteU32(&pGipCpu->u32UpdateIntervalTSC, u32UpdateIntervalTSC + u32UpdateIntervalTSCSlack);

        /*
         * CpuHz = TSC ticks per update interval * 1e9 / update interval in ns.
         */
        u64CpuHz = ASMMult2xU32RetU64(u32UpdateIntervalTSC, RT_NS_1SEC);
        u64CpuHz /= pGip->u32UpdateIntervalNS;
        ASMAtomicWriteU64(&pGipCpu->u64CpuHz, u64CpuHz);
    }
}
2476
2477
2478/**
2479 * Updates the GIP.
2480 *
2481 * @param pDevExt The device extension.
2482 * @param u64NanoTS The current nanosecond timestamp.
2483 * @param u64TSC The current TSC timestamp.
2484 * @param idCpu The CPU ID.
2485 * @param iTick The current timer tick.
2486 *
2487 * @remarks Can be called with interrupts disabled!
2488 */
2489static void supdrvGipUpdate(PSUPDRVDEVEXT pDevExt, uint64_t u64NanoTS, uint64_t u64TSC, RTCPUID idCpu, uint64_t iTick)
2490{
2491 /*
2492 * Determine the relevant CPU data.
2493 */
2494 PSUPGIPCPU pGipCpu;
2495 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
2496 AssertPtrReturnVoid(pGip);
2497
2498 if (pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
2499 pGipCpu = &pGip->aCPUs[0];
2500 else
2501 {
2502 unsigned iCpu;
2503 uint32_t idApic = supdrvGipGetApicId(pGip);
2504 if (RT_LIKELY(idApic < RT_ELEMENTS(pGip->aiCpuFromApicId)))
2505 { /* likely */ }
2506 else
2507 return;
2508 iCpu = pGip->aiCpuFromApicId[idApic];
2509 if (RT_LIKELY(iCpu < pGip->cCpus))
2510 { /* likely */ }
2511 else
2512 return;
2513 pGipCpu = &pGip->aCPUs[iCpu];
2514 if (RT_LIKELY(pGipCpu->idCpu == idCpu))
2515 { /* likely */ }
2516 else
2517 return;
2518 }
2519
2520 /*
2521 * Start update transaction.
2522 */
2523 if (!(ASMAtomicIncU32(&pGipCpu->u32TransactionId) & 1))
2524 {
2525 /* this can happen on win32 if we're taking to long and there are more CPUs around. shouldn't happen though. */
2526 AssertMsgFailed(("Invalid transaction id, %#x, not odd!\n", pGipCpu->u32TransactionId));
2527 ASMAtomicIncU32(&pGipCpu->u32TransactionId);
2528 pGipCpu->cErrors++;
2529 return;
2530 }
2531
2532 /*
2533 * Recalc the update frequency every 0x800th time.
2534 */
2535 if ( pGip->u32Mode != SUPGIPMODE_INVARIANT_TSC /* cuz we're not recalculating the frequency on invariant hosts. */
2536 && !(pGipCpu->u32TransactionId & (GIP_UPDATEHZ_RECALC_FREQ * 2 - 2)))
2537 {
2538 if (pGip->u64NanoTSLastUpdateHz)
2539 {
2540#ifdef RT_ARCH_AMD64 /** @todo fix 64-bit div here to work on x86 linux. */
2541 uint64_t u64Delta = u64NanoTS - pGip->u64NanoTSLastUpdateHz;
2542 uint32_t u32UpdateHz = (uint32_t)((RT_NS_1SEC_64 * GIP_UPDATEHZ_RECALC_FREQ) / u64Delta);
2543 if (u32UpdateHz <= 2000 && u32UpdateHz >= 30)
2544 {
2545 /** @todo r=ramshankar: Changing u32UpdateHz might screw up TSC frequency
2546 * calculation on non-invariant hosts if it changes the history decision
2547 * taken in supdrvGipDoUpdateCpu(). */
2548 uint64_t u64Interval = u64Delta / GIP_UPDATEHZ_RECALC_FREQ;
2549 ASMAtomicWriteU32(&pGip->u32UpdateHz, u32UpdateHz);
2550 ASMAtomicWriteU32(&pGip->u32UpdateIntervalNS, (uint32_t)u64Interval);
2551 }
2552#endif
2553 }
2554 ASMAtomicWriteU64(&pGip->u64NanoTSLastUpdateHz, u64NanoTS | 1);
2555 }
2556
2557 /*
2558 * Update the data.
2559 */
2560 supdrvGipDoUpdateCpu(pDevExt, pGipCpu, u64NanoTS, u64TSC, iTick);
2561
2562 /*
2563 * Complete transaction.
2564 */
2565 ASMAtomicIncU32(&pGipCpu->u32TransactionId);
2566}
2567
2568
2569/**
2570 * Updates the per cpu GIP data for the calling cpu.
2571 *
2572 * @param pDevExt The device extension.
2573 * @param u64NanoTS The current nanosecond timestamp.
2574 * @param u64TSC The current TSC timesaver.
2575 * @param idCpu The CPU ID.
2576 * @param idApic The APIC id for the CPU index.
2577 * @param iTick The current timer tick.
2578 *
2579 * @remarks Can be called with interrupts disabled!
2580 */
2581static void supdrvGipUpdatePerCpu(PSUPDRVDEVEXT pDevExt, uint64_t u64NanoTS, uint64_t u64TSC,
2582 RTCPUID idCpu, uint8_t idApic, uint64_t iTick)
2583{
2584 uint32_t iCpu;
2585 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
2586
2587 /*
2588 * Avoid a potential race when a CPU online notification doesn't fire on
2589 * the onlined CPU but the tick creeps in before the event notification is
2590 * run.
2591 */
2592 if (RT_LIKELY(iTick != 1))
2593 { /* likely*/ }
2594 else
2595 {
2596 iCpu = supdrvGipFindOrAllocCpuIndexForCpuId(pGip, idCpu);
2597 if (pGip->aCPUs[iCpu].enmState == SUPGIPCPUSTATE_OFFLINE)
2598 supdrvGipMpEventOnlineOrInitOnCpu(pDevExt, idCpu);
2599 }
2600
2601 iCpu = pGip->aiCpuFromApicId[idApic];
2602 if (RT_LIKELY(iCpu < pGip->cCpus))
2603 {
2604 PSUPGIPCPU pGipCpu = &pGip->aCPUs[iCpu];
2605 if (pGipCpu->idCpu == idCpu)
2606 {
2607 /*
2608 * Start update transaction.
2609 */
2610 if (!(ASMAtomicIncU32(&pGipCpu->u32TransactionId) & 1))
2611 {
2612 AssertMsgFailed(("Invalid transaction id, %#x, not odd!\n", pGipCpu->u32TransactionId));
2613 ASMAtomicIncU32(&pGipCpu->u32TransactionId);
2614 pGipCpu->cErrors++;
2615 return;
2616 }
2617
2618 /*
2619 * Update the data.
2620 */
2621 supdrvGipDoUpdateCpu(pDevExt, pGipCpu, u64NanoTS, u64TSC, iTick);
2622
2623 /*
2624 * Complete transaction.
2625 */
2626 ASMAtomicIncU32(&pGipCpu->u32TransactionId);
2627 }
2628 }
2629}
2630
2631
2632/**
2633 * Timer callback function for the sync and invariant GIP modes.
2634 *
2635 * @param pTimer The timer.
2636 * @param pvUser Opaque pointer to the device extension.
2637 * @param iTick The timer tick.
2638 */
2639static DECLCALLBACK(void) supdrvGipSyncAndInvariantTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
2640{
2641 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
2642 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
2643 RTCCUINTREG fEFlags = ASMIntDisableFlags(); /* No interruptions please (real problem on S10). */
2644 uint64_t u64TSC = ASMReadTSC();
2645 uint64_t u64NanoTS = RTTimeSystemNanoTS();
2646 RT_NOREF1(pTimer);
2647
2648 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_PRACTICALLY_ZERO)
2649 {
2650 /*
2651 * The calculations in supdrvGipUpdate() is somewhat timing sensitive,
2652 * missing timer ticks is not an option for GIP because the GIP users
2653 * will end up incrementing the time in 1ns per time getter call until
2654 * there is a complete timer update. So, if the delta has yet to be
2655 * calculated, we just pretend it is zero for now (the GIP users
2656 * probably won't have it for a wee while either and will do the same).
2657 *
2658 * We could maybe on some platforms try cross calling a CPU with a
2659 * working delta here, but it's not worth the hassle since the
2660 * likelihood of this happening is really low. On Windows, Linux, and
2661 * Solaris timers fire on the CPU they were registered/started on.
2662 * Darwin timers doesn't necessarily (they are high priority threads).
2663 */
2664 uint32_t iCpuSet = RTMpCpuIdToSetIndex(RTMpCpuId());
2665 uint16_t iGipCpu = RT_LIKELY(iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx))
2666 ? pGip->aiCpuFromCpuSetIdx[iCpuSet] : UINT16_MAX;
2667 Assert(!ASMIntAreEnabled());
2668 if (RT_LIKELY(iGipCpu < pGip->cCpus))
2669 {
2670 int64_t iTscDelta = pGip->aCPUs[iGipCpu].i64TSCDelta;
2671 if (iTscDelta != INT64_MAX)
2672 u64TSC -= iTscDelta;
2673 }
2674 }
2675
2676 supdrvGipUpdate(pDevExt, u64NanoTS, u64TSC, NIL_RTCPUID, iTick);
2677
2678 ASMSetFlags(fEFlags);
2679}
2680
2681
2682/**
2683 * Timer callback function for async GIP mode.
2684 * @param pTimer The timer.
2685 * @param pvUser Opaque pointer to the device extension.
2686 * @param iTick The timer tick.
2687 */
2688static DECLCALLBACK(void) supdrvGipAsyncTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
2689{
2690 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
2691 RTCCUINTREG fEFlags = ASMIntDisableFlags(); /* No interruptions please (real problem on S10). */
2692 RTCPUID idCpu = RTMpCpuId();
2693 uint64_t u64TSC = ASMReadTSC();
2694 uint64_t NanoTS = RTTimeSystemNanoTS();
2695 RT_NOREF1(pTimer);
2696
2697 /** @todo reset the transaction number and whatnot when iTick == 1. */
2698 if (pDevExt->idGipMaster == idCpu)
2699 supdrvGipUpdate(pDevExt, NanoTS, u64TSC, idCpu, iTick);
2700 else
2701 supdrvGipUpdatePerCpu(pDevExt, NanoTS, u64TSC, idCpu, supdrvGipGetApicId(pDevExt->pGip), iTick);
2702
2703 ASMSetFlags(fEFlags);
2704}
2705
2706
2707
2708
2709/*
2710 *
2711 *
2712 * TSC Delta Measurements And Related Code
2713 * TSC Delta Measurements And Related Code
2714 * TSC Delta Measurements And Related Code
2715 *
2716 *
2717 */
2718
2719
/*
 * Select TSC delta measurement algorithm.
 *
 * Exactly one of GIP_TSC_DELTA_METHOD_1 and GIP_TSC_DELTA_METHOD_2 gets
 * defined; flip the '#if 0' to switch.  Method 2 is currently selected.
 */
#if 0
# define GIP_TSC_DELTA_METHOD_1
#else
# define GIP_TSC_DELTA_METHOD_2
#endif

/** For padding variables to keep them away from other cache lines.  Better too
 * large than too small!
 * @remarks Current AMD64 and x86 CPUs seem to use 64 bytes.  There are claims
 *          that NetBurst had 128 byte cache lines while the 486 thru Pentium
 *          III had 32 bytes cache lines. */
#define GIP_TSC_DELTA_CACHE_LINE_SIZE           128
2735
2736
/**
 * TSC delta measurement algorithm \#2 result entry.
 *
 * One entry of the SUPDRVTSCDELTAMETHOD2::aResults table.
 */
typedef struct SUPDRVTSCDELTAMETHOD2ENTRY
{
    /** Presumably the sequence number of the CPU owning the entry at the time
     *  of the reading - TODO confirm against the collection code. */
    uint32_t    iSeqMine;
    /** Presumably the sequence number observed for the other CPU at the time
     *  of the reading - TODO confirm against the collection code. */
    uint32_t    iSeqOther;
    /** The TSC value belonging to this entry. */
    uint64_t    uTsc;
} SUPDRVTSCDELTAMETHOD2ENTRY;
2746
/**
 * TSC delta measurement algorithm \#2 Data.
 *
 * The sequence number is surrounded by padding so that it and the padding
 * after it together occupy exactly GIP_TSC_DELTA_CACHE_LINE_SIZE bytes.
 */
typedef struct SUPDRVTSCDELTAMETHOD2
{
    /** Padding to make sure the iCurSeqNo is in its own cache line. */
    uint64_t                    au64CacheLinePaddingBefore[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t)];
    /** The current sequence number of this worker. */
    uint32_t volatile           iCurSeqNo;
    /** Padding to make sure the iCurSeqNo is in its own cache line.
     * @note Despite the 'au64' prefix this is an array of uint32_t; the
     *       element count (minus one for iCurSeqNo) is computed accordingly. */
    uint32_t                    au64CacheLinePaddingAfter[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint32_t) - 1];
    /** Result table. */
    SUPDRVTSCDELTAMETHOD2ENTRY  aResults[64];
} SUPDRVTSCDELTAMETHOD2;
/** Pointer to the data for TSC delta measurement algorithm \#2 .*/
typedef SUPDRVTSCDELTAMETHOD2 *PSUPDRVTSCDELTAMETHOD2;
2763
2764
/**
 * The TSC delta synchronization struct, version 2.
 *
 * The synchronization variables are kept away from unrelated data by
 * surrounding padding (provided our max cache line size estimate is
 * correct).  The total size is checked at compile time below.
 */
typedef struct SUPTSCDELTASYNC2
{
    /** Padding to make sure the uSyncVar is in its own cache line. */
    uint64_t                    au64CacheLinePaddingBefore[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t)];

    /** The synchronization variable, holds values GIP_TSC_DELTA_SYNC_*. */
    volatile uint32_t           uSyncVar;
    /** Sequence synchronizing variable used for post 'GO' synchronization. */
    volatile uint32_t           uSyncSeq;

    /** Padding to make sure the uSyncVar is in its own cache line.
     * (Two uint64_t entries short of a full line; uSyncVar and uSyncSeq
     * together account for one of them.) */
    uint64_t                    au64CacheLinePaddingAfter[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t) - 2];

    /** Start RDTSC value.  Put here mainly to save stack space. */
    uint64_t                    uTscStart;
    /** Copy of SUPDRVGIPTSCDELTARGS::cMaxTscTicks. */
    uint64_t                    cMaxTscTicks;
} SUPTSCDELTASYNC2;
AssertCompileSize(SUPTSCDELTASYNC2, GIP_TSC_DELTA_CACHE_LINE_SIZE * 2 + sizeof(uint64_t));
/** Pointer to a TSC delta synchronization struct, version 2. */
typedef SUPTSCDELTASYNC2 *PSUPTSCDELTASYNC2;
2791
/** @name GIP_TSC_DELTA_SYNC2_XXX - values for the synchronization variable.
 * @{ */
/** Prestart wait. */
#define GIP_TSC_DELTA_SYNC2_PRESTART_WAIT    UINT32_C(0x0ffe)
/** Prestart aborted. */
#define GIP_TSC_DELTA_SYNC2_PRESTART_ABORT   UINT32_C(0x0fff)
/** Ready (on your mark). */
#define GIP_TSC_DELTA_SYNC2_READY            UINT32_C(0x1000)
/** Steady (get set). */
#define GIP_TSC_DELTA_SYNC2_STEADY           UINT32_C(0x1001)
/** Go! */
#define GIP_TSC_DELTA_SYNC2_GO               UINT32_C(0x1002)
/** Used by the verification test. */
#define GIP_TSC_DELTA_SYNC2_GO_GO            UINT32_C(0x1003)

/** We reached the time limit. */
#define GIP_TSC_DELTA_SYNC2_TIMEOUT          UINT32_C(0x1ffe)
/** The other party won't touch the sync struct ever again. */
#define GIP_TSC_DELTA_SYNC2_FINAL            UINT32_C(0x1fff)
/** @} */
2809
2810
/**
 * Argument package/state passed by supdrvTscMeasureDeltaOne() to the RTMpOn
 * callback worker.
 *
 * The structure is split into a shared header, a master section and a worker
 * ("proletarian") section, with cache line sized padding between and around
 * the sections.
 *
 * @todo The original note here was left unfinished ("add"); complete or
 *       remove it.
 */
typedef struct SUPDRVGIPTSCDELTARGS
{
    /** The device extension.  */
    PSUPDRVDEVEXT               pDevExt;
    /** Pointer to the GIP CPU array entry for the worker. */
    PSUPGIPCPU                  pWorker;
    /** Pointer to the GIP CPU array entry for the master. */
    PSUPGIPCPU                  pMaster;
    /** The maximum number of ticks to spend in supdrvTscMeasureDeltaCallback.
     * (This is what we need a rough TSC frequency for.)  */
    uint64_t                    cMaxTscTicks;
    /** Used to abort synchronization setup. */
    bool volatile               fAbortSetup;

    /** Padding to make sure the master variables live in their own cache lines. */
    uint64_t                    au64CacheLinePaddingBefore[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t)];

    /** @name Master
     * @{ */
    /** The time the master spent in the MP worker.  */
    uint64_t                    cElapsedMasterTscTicks;
    /** The iTry value when stopped at. */
    uint32_t                    iTry;
    /** Set if the run timed out. */
    bool volatile               fTimedOut;
    /** Pointer to the master's synchronization struct (on stack). */
    PSUPTSCDELTASYNC2 volatile  pSyncMaster;
    /** Master data union. */
    union
    {
        /** Data (master) for delta verification. */
        struct
        {
            /** Verification test TSC values for the master. */
            uint64_t volatile   auTscs[32];
        } Verify;
        /** Data (master) for measurement method \#2. */
        struct
        {
            /** Data and sequence number. */
            SUPDRVTSCDELTAMETHOD2 Data;
            /** The lag setting for the next run. */
            bool                fLag;
            /** Number of hits. */
            uint32_t            cHits;
        } M2;
    } uMaster;
    /** The verifier verdict, VINF_SUCCESS if ok, VERR_OUT_OF_RANGE if not,
     * VERR_TRY_AGAIN on timeout. */
    int32_t                     rcVerify;
#ifdef TSCDELTA_VERIFY_WITH_STATS
    /** The maximum difference between TSC read during delta verification. */
    int64_t                     cMaxVerifyTscTicks;
    /** The minimum difference between two TSC reads during verification. */
    int64_t                     cMinVerifyTscTicks;
    /** The bad TSC diff, worker relative to master (= worker - master).
     * Negative value means the worker is behind the master.  */
    int64_t                     iVerifyBadTscDiff;
#endif
    /** @} */

    /** Padding to make sure the worker variables live in their own cache line. */
    uint64_t                    au64CacheLinePaddingBetween[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t)];

    /** @name Proletarian
     * @{ */
    /** Pointer to the worker's synchronization struct (on stack). */
    PSUPTSCDELTASYNC2 volatile  pSyncWorker;
    /** The time the worker spent in the MP worker.  */
    uint64_t                    cElapsedWorkerTscTicks;
    /** Worker data union. */
    union
    {
        /** Data (worker) for delta verification. */
        struct
        {
            /** Verification test TSC values for the worker. */
            uint64_t volatile   auTscs[32];
        } Verify;
        /** Data (worker) for measurement method \#2. */
        struct
        {
            /** Data and sequence number. */
            SUPDRVTSCDELTAMETHOD2 Data;
            /** The lag setting for the next run (set by master). */
            bool                fLag;
        } M2;
    } uWorker;
    /** @} */

    /** Padding to make sure the above is in its own cache line. */
    uint64_t                    au64CacheLinePaddingAfter[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t)];
} SUPDRVGIPTSCDELTARGS;
/** Pointer to a TSC delta measurement argument package. */
typedef SUPDRVGIPTSCDELTARGS *PSUPDRVGIPTSCDELTARGS;
2910
2911
/** @name Macros that implement the basic synchronization steps common to
 *        the algorithms.
 *
 * Must be used from within a loop as the timeouts are implemented via 'break'
 * statements at the moment.
 *
 * @{
 */
/*
 * Busy-wait loop debugging aids.  When enabled, TSCDELTA_DBG_VARS declares a
 * local iteration counter, TSCDELTA_DBG_START_LOOP resets it and
 * TSCDELTA_DBG_CHECK_LOOP increments it, invoking RT_BREAKPOINT() each time
 * the low 25 bits of the counter are zero.  Otherwise all three expand to
 * no-op expressions.
 */
#if defined(DEBUG_bird) /* || defined(VBOX_STRICT) */
# define TSCDELTA_DBG_VARS()            uint32_t iDbgCounter
# define TSCDELTA_DBG_START_LOOP()      do { iDbgCounter = 0; } while (0)
# define TSCDELTA_DBG_CHECK_LOOP() \
    do { iDbgCounter++; if ((iDbgCounter & UINT32_C(0x01ffffff)) == 0) RT_BREAKPOINT(); } while (0)
#else
# define TSCDELTA_DBG_VARS()            ((void)0)
# define TSCDELTA_DBG_START_LOOP()      ((void)0)
# define TSCDELTA_DBG_CHECK_LOOP()      ((void)0)
#endif
/* Sync failure messages; change the '#if 0' to '#if 1' to send them to SUPR0Printf.
   a_Args is a complete parenthesized argument list. */
#if 0
# define TSCDELTA_DBG_SYNC_MSG(a_Args)  SUPR0Printf a_Args
#else
# define TSCDELTA_DBG_SYNC_MSG(a_Args)  ((void)0)
#endif
/* Second message channel, disabled by default like the one above. */
#if 0
# define TSCDELTA_DBG_SYNC_MSG2(a_Args) SUPR0Printf a_Args
#else
# define TSCDELTA_DBG_SYNC_MSG2(a_Args) ((void)0)
#endif
/* Third message channel (used by the sync wrapper macros and the method loops),
   disabled by default. */
#if 0
# define TSCDELTA_DBG_SYNC_MSG9(a_Args) SUPR0Printf a_Args
#else
# define TSCDELTA_DBG_SYNC_MSG9(a_Args) ((void)0)
#endif
2945
2946
/**
 * First half of the per-round master/worker handshake.
 *
 * Each party's uSyncVar is stepped READY -> STEADY -> GO by the other party
 * using compare-and-exchange, after which both try to enter lockstep
 * execution by swapping sequence numbers.
 *
 * Timeouts are only processed while waiting for STEADY.  On timeout the
 * caller's own uSyncVar is switched READY -> TIMEOUT, the other party's is
 * switched STEADY -> TIMEOUT if it is in that state, and pArgs->fTimedOut is
 * set.
 *
 * @returns true if the handshake completed; interrupts are then disabled and
 *          the saved flags are in @a *pfEFlags.  This is also returned when the
 *          lockstep attempt is given up.
 *          false if the handshake was aborted or timed out; interrupts have
 *          then been restored.
 * @param   pMySync     My synchronization structure.
 * @param   pOtherSync  My partner's synchronization structure.
 * @param   fIsMaster   Set if master, clear if worker.
 * @param   pfEFlags    Where to return the flags saved by ASMIntDisableFlags().
 *                      Preset to X86_EFL_IF | X86_EFL_1 on entry.
 * @param   pArgs       The argument/state data; only fTimedOut is touched here.
 */
static bool supdrvTscDeltaSync2_Before(PSUPTSCDELTASYNC2 pMySync, PSUPTSCDELTASYNC2 pOtherSync,
                                       bool fIsMaster, PRTCCUINTREG pfEFlags, PSUPDRVGIPTSCDELTARGS pArgs)
{
    /* The master and the worker use disjoint sequence number ranges (0.. and 256..). */
    uint32_t        iMySeq  = fIsMaster ? 0 : 256;
    uint32_t const  iMaxSeq = iMySeq + 16; /* For the last loop, darn linux/freebsd C-ishness. */
    uint32_t        u32Tmp;
    uint32_t        iSync2Loops = 0;
    RTCCUINTREG     fEFlags;
    TSCDELTA_DBG_VARS();

    *pfEFlags = X86_EFL_IF | X86_EFL_1; /* should shut up most nagging compilers. */

    /*
     * The master tells the worker to get on its mark.
     */
    if (fIsMaster)
    {
        if (RT_LIKELY(ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_STEADY, GIP_TSC_DELTA_SYNC2_READY)))
        { /* likely*/ }
        else
        {
            TSCDELTA_DBG_SYNC_MSG(("sync/before/%s: #1 uSyncVar=%#x\n", fIsMaster ? "master" : "worker", pOtherSync->uSyncVar));
            return false;
        }
    }

    /*
     * Wait for the on your mark signal (ack in the master case). We process timeouts here.
     */
    ASMAtomicWriteU32(&(pMySync)->uSyncSeq, 0);
    for (;;)
    {
        /* Interrupts are disabled before each check so that they are already
           off when STEADY is observed; they are re-enabled if it is not. */
        fEFlags = ASMIntDisableFlags();
        u32Tmp = ASMAtomicReadU32(&pMySync->uSyncVar);
        if (u32Tmp == GIP_TSC_DELTA_SYNC2_STEADY)
            break;
        ASMSetFlags(fEFlags);
        ASMNopPause();

        /* Abort? */
        if (u32Tmp != GIP_TSC_DELTA_SYNC2_READY)
        {
            TSCDELTA_DBG_SYNC_MSG(("sync/before/%s: #2 u32Tmp=%#x\n", fIsMaster ? "master" : "worker", u32Tmp));
            return false;
        }

        /* Check for timeouts every so often (not every loop in case RDTSC is
           trapping or something).  Must check the first time around. */
#if 0 /* For debugging the timeout paths. */
        static uint32_t volatile xxx;
#endif
        if (   (   (iSync2Loops & 0x3ff) == 0
                && ASMReadTSC() - pMySync->uTscStart > pMySync->cMaxTscTicks)
#if 0 /* This is crazy, I know, but enable this code and the results are markedly better when enabled on the 1.4GHz AMD (debug). */
            || (!fIsMaster && (++xxx & 0xf) == 0)
#endif
           )
        {
            /* Try switch our own state into timeout mode so the master cannot tell us to 'GO',
               ignore the timeout if we've got the go ahead already (simpler). */
            if (ASMAtomicCmpXchgU32(&pMySync->uSyncVar, GIP_TSC_DELTA_SYNC2_TIMEOUT, GIP_TSC_DELTA_SYNC2_READY))
            {
                TSCDELTA_DBG_SYNC_MSG(("sync/before/%s: timeout\n", fIsMaster ? "master" : "worker"));
                ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_TIMEOUT, GIP_TSC_DELTA_SYNC2_STEADY);
                ASMAtomicWriteBool(&pArgs->fTimedOut, true);
                return false;
            }
        }
        iSync2Loops++;
    }

    /*
     * Interrupts are now disabled and will remain disabled until we do
     * TSCDELTA_MASTER_SYNC_AFTER / TSCDELTA_OTHER_SYNC_AFTER.
     */
    *pfEFlags = fEFlags;

    /*
     * The worker tells the master that it is on its mark and that the master
     * needs to get into position as well.
     */
    if (!fIsMaster)
    {
        if (RT_LIKELY(ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_STEADY, GIP_TSC_DELTA_SYNC2_READY)))
        { /* likely */ }
        else
        {
            ASMSetFlags(fEFlags);
            TSCDELTA_DBG_SYNC_MSG(("sync/before/%s: #3 uSyncVar=%#x\n", fIsMaster ? "master" : "worker", pOtherSync->uSyncVar));
            return false;
        }
    }

    /*
     * The master sends the 'go' to the worker and waits for the ACK.
     */
    if (fIsMaster)
    {
        if (RT_LIKELY(ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO, GIP_TSC_DELTA_SYNC2_STEADY)))
        { /* likely */ }
        else
        {
            ASMSetFlags(fEFlags);
            TSCDELTA_DBG_SYNC_MSG(("sync/before/%s: #4 uSyncVar=%#x\n", fIsMaster ? "master" : "worker", pOtherSync->uSyncVar));
            return false;
        }
    }

    /*
     * Wait for the 'go' signal (ack in the master case).
     * Note: no timeout check in this loop; any state other than STEADY/GO aborts.
     */
    TSCDELTA_DBG_START_LOOP();
    for (;;)
    {
        u32Tmp = ASMAtomicReadU32(&pMySync->uSyncVar);
        if (u32Tmp == GIP_TSC_DELTA_SYNC2_GO)
            break;
        if (RT_LIKELY(u32Tmp == GIP_TSC_DELTA_SYNC2_STEADY))
        { /* likely */ }
        else
        {
            ASMSetFlags(fEFlags);
            TSCDELTA_DBG_SYNC_MSG(("sync/before/%s: #5 u32Tmp=%#x\n", fIsMaster ? "master" : "worker", u32Tmp));
            return false;
        }

        TSCDELTA_DBG_CHECK_LOOP();
        ASMNopPause();
    }

    /*
     * The worker acks the 'go' (shouldn't fail).
     */
    if (!fIsMaster)
    {
        if (RT_LIKELY(ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO, GIP_TSC_DELTA_SYNC2_STEADY)))
        { /* likely */ }
        else
        {
            ASMSetFlags(fEFlags);
            TSCDELTA_DBG_SYNC_MSG(("sync/before/%s: #6 uSyncVar=%#x\n", fIsMaster ? "master" : "worker", pOtherSync->uSyncVar));
            return false;
        }
    }

    /*
     * Try to enter mostly lockstep execution with it.
     *
     * Each attempt publishes our sequence number in both structures and
     * compares what the other party had in its own with what it put in ours.
     * Every exit from this loop returns true, also when giving up.
     */
    for (;;)
    {
        uint32_t iOtherSeq1, iOtherSeq2;
        ASMCompilerBarrier();
        ASMSerializeInstruction();

        ASMAtomicWriteU32(&pMySync->uSyncSeq, iMySeq);
        ASMNopPause();
        iOtherSeq1 = ASMAtomicXchgU32(&pOtherSync->uSyncSeq, iMySeq);
        ASMNopPause();
        iOtherSeq2 = ASMAtomicReadU32(&pMySync->uSyncSeq);

        ASMCompilerBarrier();
        if (iOtherSeq1 == iOtherSeq2)
            return true;

        /* Did the other guy give up? Should we give up? */
        if (   iOtherSeq1 == UINT32_MAX
            || iOtherSeq2 == UINT32_MAX)
            return true;
        if (++iMySeq >= iMaxSeq)
        {
            /* UINT32_MAX tells the other party that we have given up. */
            ASMAtomicWriteU32(&pMySync->uSyncSeq, UINT32_MAX);
            return true;
        }
        ASMNopPause();
    }
}
3123
/*
 * Statement wrappers around supdrvTscDeltaSync2_Before() for the master and
 * the worker ("other") respectively.  When the function returns false they
 * 'break' out of the enclosing loop.  The 'else if (true) { } else do {} while (0)'
 * tail makes the macro a single statement that consumes the trailing semicolon.
 */
#define TSCDELTA_MASTER_SYNC_BEFORE(a_pMySync, a_pOtherSync, a_pfEFlags, a_pArgs) \
    if (RT_LIKELY(supdrvTscDeltaSync2_Before(a_pMySync, a_pOtherSync, true /*fIsMaster*/, a_pfEFlags, a_pArgs))) \
    { /*likely*/ } \
    else if (true) \
    { \
        TSCDELTA_DBG_SYNC_MSG9(("sync/before/master: #89\n")); \
        break; \
    } else do {} while (0)
#define TSCDELTA_OTHER_SYNC_BEFORE(a_pMySync, a_pOtherSync, a_pfEFlags, a_pArgs) \
    if (RT_LIKELY(supdrvTscDeltaSync2_Before(a_pMySync, a_pOtherSync, false /*fIsMaster*/, a_pfEFlags, a_pArgs))) \
    { /*likely*/ } \
    else if (true) \
    { \
        TSCDELTA_DBG_SYNC_MSG9(("sync/before/other: #89\n")); \
        break; \
    } else do {} while (0)
3140
3141
3142static bool supdrvTscDeltaSync2_After(PSUPTSCDELTASYNC2 pMySync, PSUPTSCDELTASYNC2 pOtherSync,
3143 bool fIsMaster, RTCCUINTREG fEFlags)
3144{
3145 TSCDELTA_DBG_VARS();
3146 RT_NOREF1(pOtherSync);
3147
3148 /*
3149 * Wait for the 'ready' signal. In the master's case, this means the
3150 * worker has completed its data collection, while in the worker's case it
3151 * means the master is done processing the data and it's time for the next
3152 * loop iteration (or whatever).
3153 */
3154 ASMSetFlags(fEFlags);
3155 TSCDELTA_DBG_START_LOOP();
3156 for (;;)
3157 {
3158 uint32_t u32Tmp = ASMAtomicReadU32(&pMySync->uSyncVar);
3159 if ( u32Tmp == GIP_TSC_DELTA_SYNC2_READY
3160 || (u32Tmp == GIP_TSC_DELTA_SYNC2_STEADY && !fIsMaster) /* kicked twice => race */ )
3161 return true;
3162 ASMNopPause();
3163 if (RT_LIKELY(u32Tmp == GIP_TSC_DELTA_SYNC2_GO))
3164 { /* likely */}
3165 else
3166 {
3167 TSCDELTA_DBG_SYNC_MSG(("sync/after/other: #1 u32Tmp=%#x\n", u32Tmp));
3168 return false; /* shouldn't ever happen! */
3169 }
3170 TSCDELTA_DBG_CHECK_LOOP();
3171 ASMNopPause();
3172 }
3173}
3174
/*
 * Master: statement wrapper around supdrvTscDeltaSync2_After(); 'break's out
 * of the enclosing loop when it returns false.
 */
#define TSCDELTA_MASTER_SYNC_AFTER(a_pMySync, a_pOtherSync, a_fEFlags) \
    if (RT_LIKELY(supdrvTscDeltaSync2_After(a_pMySync, a_pOtherSync, true /*fIsMaster*/, a_fEFlags))) \
    { /* likely */ } \
    else if (true) \
    { \
        TSCDELTA_DBG_SYNC_MSG9(("sync/after/master: #97\n")); \
        break; \
    } else do {} while (0)

/*
 * Master: switches the worker's sync variable from GO to READY; 'break's out
 * of the enclosing loop if the worker's variable was not in the GO state.
 */
#define TSCDELTA_MASTER_KICK_OTHER_OUT_OF_AFTER(a_pMySync, a_pOtherSync) \
    /* \
     * Tell the worker that we're done processing the data and ready for the next round. \
     */ \
    if (RT_LIKELY(ASMAtomicCmpXchgU32(&(a_pOtherSync)->uSyncVar, GIP_TSC_DELTA_SYNC2_READY, GIP_TSC_DELTA_SYNC2_GO))) \
    { /* likely */ } \
    else if (true)\
    { \
        TSCDELTA_DBG_SYNC_MSG(("sync/after/master: #99 uSyncVar=%#x\n", (a_pOtherSync)->uSyncVar)); \
        break; \
    } else do {} while (0)

/*
 * Worker: switches the master's sync variable from GO to READY and then waits
 * in supdrvTscDeltaSync2_After().  'break's out of the enclosing loop if either
 * step fails; the flags are restored here when the first step fails (the
 * function restores them itself otherwise).
 */
#define TSCDELTA_OTHER_SYNC_AFTER(a_pMySync, a_pOtherSync, a_fEFlags) \
    if (true) { \
        /* \
         * Tell the master that we're done collecting data and wait for the next round to start. \
         */ \
        if (RT_LIKELY(ASMAtomicCmpXchgU32(&(a_pOtherSync)->uSyncVar, GIP_TSC_DELTA_SYNC2_READY, GIP_TSC_DELTA_SYNC2_GO))) \
        { /* likely */ } \
        else \
        { \
            ASMSetFlags(a_fEFlags); \
            TSCDELTA_DBG_SYNC_MSG(("sync/after/other: #0 uSyncVar=%#x\n", (a_pOtherSync)->uSyncVar)); \
            break; \
        } \
        if (RT_LIKELY(supdrvTscDeltaSync2_After(a_pMySync, a_pOtherSync, false /*fIsMaster*/, a_fEFlags))) \
        { /* likely */ } \
        else \
        { \
            TSCDELTA_DBG_SYNC_MSG9(("sync/after/other: #98\n")); \
            break; \
        } \
    } else do {} while (0)
3217/** @} */
3218
3219
#ifdef GIP_TSC_DELTA_METHOD_1
/**
 * TSC delta measurement algorithm \#1 (GIP_TSC_DELTA_METHOD_1).
 *
 *
 * We ignore the first few runs of the loop in order to prime the
 * cache. Also, we need to be careful about using 'pause' instruction
 * in critical busy-wait loops in this code - it can cause undesired
 * behaviour with hyperthreading.
 *
 * We try to minimize the measurement error by computing the minimum
 * read time of the compare statement in the worker by taking TSC
 * measurements across it.
 *
 * It must be noted that the computed minimum read time is mostly to
 * eliminate huge deltas when the worker is too early and doesn't by
 * itself help produce more accurate deltas. We allow two times the
 * computed minimum as an arbitrary acceptable threshold. Therefore,
 * it is still possible to get negative deltas where there are none
 * when the worker is earlier. As long as these occasional negative
 * deltas are lower than the time it takes to exit guest-context and
 * the OS to reschedule EMT on a different CPU, we won't expose a TSC
 * that jumped backwards. It is due to the existence of the negative
 * deltas that we don't recompute the delta with the master and
 * worker interchanged to eliminate the remaining measurement error.
 *
 * The result is stored in the worker's GIP CPU entry (i64TSCDelta) by the
 * master; the function itself returns nothing.  A failed sync step ends the
 * loop early.
 *
 * @param   pArgs               The argument/state data.
 * @param   pMySync             My synchronization structure.
 * @param   pOtherSync          My partner's synchronization structure.
 * @param   fIsMaster           Set if master, clear if worker.
 * @param   iTry                The attempt number (not used by this method).
 */
static void supdrvTscDeltaMethod1Loop(PSUPDRVGIPTSCDELTARGS pArgs, PSUPTSCDELTASYNC2 pMySync, PSUPTSCDELTASYNC2 pOtherSync,
                                      bool fIsMaster, uint32_t iTry)
{
    PSUPGIPCPU  pGipCpuWorker   = pArgs->pWorker;
    PSUPGIPCPU  pGipCpuMaster   = pArgs->pMaster;
    uint64_t    uMinCmpReadTime = UINT64_MAX;
    unsigned    iLoop;
    NOREF(iTry);

    /* Note: the TSCDELTA_*_SYNC_* macros 'break' out of this loop on failure. */
    for (iLoop = 0; iLoop < GIP_TSC_DELTA_LOOPS; iLoop++)
    {
        RTCCUINTREG fEFlags;
        if (fIsMaster)
        {
            /*
             * The master.
             */
            AssertMsg(pGipCpuMaster->u64TSCSample == GIP_TSC_DELTA_RSVD,
                      ("%#llx idMaster=%#x idWorker=%#x (idGipMaster=%#x)\n",
                       pGipCpuMaster->u64TSCSample, pGipCpuMaster->idCpu, pGipCpuWorker->idCpu, pArgs->pDevExt->idGipMaster));
            TSCDELTA_MASTER_SYNC_BEFORE(pMySync, pOtherSync, &fEFlags, pArgs);

            /* Publish our TSC; retry should the reading collide with the reserved marker value. */
            do
            {
                ASMSerializeInstruction();
                ASMAtomicWriteU64(&pGipCpuMaster->u64TSCSample, ASMReadTSC());
            } while (pGipCpuMaster->u64TSCSample == GIP_TSC_DELTA_RSVD);

            TSCDELTA_MASTER_SYNC_AFTER(pMySync, pOtherSync, fEFlags);

            /* Process the data. */
            if (iLoop > GIP_TSC_DELTA_PRIMER_LOOPS + GIP_TSC_DELTA_READ_TIME_LOOPS)
            {
                /* The worker leaves GIP_TSC_DELTA_RSVD in its sample when it rejected the round. */
                if (pGipCpuWorker->u64TSCSample != GIP_TSC_DELTA_RSVD)
                {
                    int64_t iDelta = pGipCpuWorker->u64TSCSample
                                   - (pGipCpuMaster->u64TSCSample - pGipCpuMaster->i64TSCDelta);
                    /* At or above the initial master value the smaller delta wins, below it
                       the larger one wins or anything replaces a stored INT64_MAX. */
                    if (  iDelta >= GIP_TSC_DELTA_INITIAL_MASTER_VALUE
                        ? iDelta < pGipCpuWorker->i64TSCDelta
                        : iDelta > pGipCpuWorker->i64TSCDelta || pGipCpuWorker->i64TSCDelta == INT64_MAX)
                        pGipCpuWorker->i64TSCDelta = iDelta;
                }
            }

            /* Reset our TSC sample and tell the worker to move on. */
            ASMAtomicWriteU64(&pGipCpuMaster->u64TSCSample, GIP_TSC_DELTA_RSVD);
            TSCDELTA_MASTER_KICK_OTHER_OUT_OF_AFTER(pMySync, pOtherSync);
        }
        else
        {
            /*
             * The worker.
             */
            uint64_t uTscWorker;
            uint64_t uTscWorkerFlushed;
            uint64_t uCmpReadTime;

            ASMAtomicReadU64(&pGipCpuMaster->u64TSCSample);     /* Warm the cache line. */
            TSCDELTA_OTHER_SYNC_BEFORE(pMySync, pOtherSync, &fEFlags, pArgs);

            /*
             * Keep reading the TSC until we notice that the master has read his. Reading
             * the TSC -after- the master has updated the memory is way too late. We thus
             * compensate by trying to measure how long it took for the worker to notice
             * the memory flushed from the master.
             */
            do
            {
                ASMSerializeInstruction();
                uTscWorker = ASMReadTSC();
            } while (pGipCpuMaster->u64TSCSample == GIP_TSC_DELTA_RSVD);
            ASMSerializeInstruction();
            uTscWorkerFlushed = ASMReadTSC();

            uCmpReadTime = uTscWorkerFlushed - uTscWorker;
            if (iLoop > GIP_TSC_DELTA_PRIMER_LOOPS + GIP_TSC_DELTA_READ_TIME_LOOPS)
            {
                /* This is totally arbitrary a.k.a I don't like it but I have no better ideas for now. */
                if (uCmpReadTime < (uMinCmpReadTime << 1))
                {
                    ASMAtomicWriteU64(&pGipCpuWorker->u64TSCSample, uTscWorker);
                    if (uCmpReadTime < uMinCmpReadTime)
                        uMinCmpReadTime = uCmpReadTime;
                }
                else
                    ASMAtomicWriteU64(&pGipCpuWorker->u64TSCSample, GIP_TSC_DELTA_RSVD);
            }
            else if (iLoop > GIP_TSC_DELTA_PRIMER_LOOPS)
            {
                /* Read-time calibration rounds: only track the minimum, publish no sample. */
                if (uCmpReadTime < uMinCmpReadTime)
                    uMinCmpReadTime = uCmpReadTime;
            }

            TSCDELTA_OTHER_SYNC_AFTER(pMySync, pOtherSync, fEFlags);
        }
    }

    TSCDELTA_DBG_SYNC_MSG9(("sync/method1loop/%s: #92 iLoop=%u MyState=%#x\n", fIsMaster ? "master" : "worker", iLoop,
                            pMySync->uSyncVar));

    /*
     * We must reset the worker TSC sample value in case it gets picked as a
     * GIP master later on (it's trashed above, naturally).
     */
    if (!fIsMaster)
        ASMAtomicWriteU64(&pGipCpuWorker->u64TSCSample, GIP_TSC_DELTA_RSVD);
}
#endif /* GIP_TSC_DELTA_METHOD_1 */
3361
3362
3363#ifdef GIP_TSC_DELTA_METHOD_2
/*
 * TSC delta measurement algorithm \#2 configuration and code - Experimental!!
 */

/** Total number of rounds in supdrvTscDeltaMethod2Loop: 7 body rounds plus the
 * primer rounds.  (GIP_TSC_DELTA_M2_PRIMER_LOOPS is only expanded at the point
 * of use, so defining it on the next line is fine.) */
# define GIP_TSC_DELTA_M2_LOOPS             (7 + GIP_TSC_DELTA_M2_PRIMER_LOOPS)
/** Number of initial rounds whose data is not processed (currently none). */
# define GIP_TSC_DELTA_M2_PRIMER_LOOPS      0
3370
3371
3372static void supdrvTscDeltaMethod2ProcessDataOnMaster(PSUPDRVGIPTSCDELTARGS pArgs)
3373{
3374 int64_t iMasterTscDelta = pArgs->pMaster->i64TSCDelta;
3375 int64_t iBestDelta = pArgs->pWorker->i64TSCDelta;
3376 uint32_t idxResult;
3377 uint32_t cHits = 0;
3378
3379 /*
3380 * Look for matching entries in the master and worker tables.
3381 */
3382 for (idxResult = 0; idxResult < RT_ELEMENTS(pArgs->uMaster.M2.Data.aResults); idxResult++)
3383 {
3384 uint32_t idxOther = pArgs->uMaster.M2.Data.aResults[idxResult].iSeqOther;
3385 if (idxOther & 1)
3386 {
3387 idxOther >>= 1;
3388 if (idxOther < RT_ELEMENTS(pArgs->uWorker.M2.Data.aResults))
3389 {
3390 if (pArgs->uWorker.M2.Data.aResults[idxOther].iSeqOther == pArgs->uMaster.M2.Data.aResults[idxResult].iSeqMine)
3391 {
3392 int64_t iDelta;
3393 iDelta = pArgs->uWorker.M2.Data.aResults[idxOther].uTsc
3394 - (pArgs->uMaster.M2.Data.aResults[idxResult].uTsc - iMasterTscDelta);
3395 if ( iDelta >= GIP_TSC_DELTA_INITIAL_MASTER_VALUE
3396 ? iDelta < iBestDelta
3397 : iDelta > iBestDelta || iBestDelta == INT64_MAX)
3398 iBestDelta = iDelta;
3399 cHits++;
3400 }
3401 }
3402 }
3403 }
3404
3405 /*
3406 * Save the results.
3407 */
3408 if (cHits > 2)
3409 pArgs->pWorker->i64TSCDelta = iBestDelta;
3410 pArgs->uMaster.M2.cHits += cHits;
3411}
3412
3413
/**
 * The core function of the 2nd TSC delta measurement algorithm.
 *
 * The idea here is that we have the two CPUs execute the exact same code
 * collecting a largish set of TSC samples.  The code has one data dependency on
 * the other CPU, the intention of which is to synchronize the execution as well
 * as help cross reference the two sets of TSC samples (the sequence numbers).
 *
 * The @a fLag parameter is used to modify the execution a tiny bit on one or
 * both of the CPUs.  When @a fLag differs between the CPUs, it is thought that
 * it will help with making the CPUs enter lock step execution occasionally.
 *
 * @param   pMyData         My sample table; every entry of aResults is filled
 *                          and iCurSeqNo is restarted from zero.
 * @param   piOtherSeqNo    The partner's current sequence number, sampled once
 *                          per entry.
 * @param   fLag            Whether to execute a pause instruction after each
 *                          sample.
 */
static void supdrvTscDeltaMethod2CollectData(PSUPDRVTSCDELTAMETHOD2 pMyData, uint32_t volatile *piOtherSeqNo, bool fLag)
{
    SUPDRVTSCDELTAMETHOD2ENTRY *pEntry = &pMyData->aResults[0];
    uint32_t                    cLeft  = RT_ELEMENTS(pMyData->aResults);

    ASMAtomicWriteU32(&pMyData->iCurSeqNo, 0);
    ASMSerializeInstruction();
    while (cLeft-- > 0)
    {
        /* The sequence number is bumped both before and after the TSC read.
           NOTE(review): presumably ASMAtomicIncU32 returns the new value, making
           iSeqMine odd - which is what the master side processing checks for. */
        uint64_t uTsc;
        uint32_t iSeqMine  = ASMAtomicIncU32(&pMyData->iCurSeqNo);
        uint32_t iSeqOther = ASMAtomicReadU32(piOtherSeqNo);
        ASMCompilerBarrier();
        ASMSerializeInstruction(); /* Way better result than with ASMMemoryFenceSSE2() in this position! */
        uTsc = ASMReadTSC();
        ASMAtomicIncU32(&pMyData->iCurSeqNo);
        ASMCompilerBarrier();
        ASMSerializeInstruction();
        /* The entry is only written once the sample has been taken. */
        pEntry->iSeqMine  = iSeqMine;
        pEntry->iSeqOther = iSeqOther;
        pEntry->uTsc      = uTsc;
        pEntry++;
        ASMSerializeInstruction();
        if (fLag)
            ASMNopPause();
    }
}
3454
3455
3456/**
3457 * TSC delta measurement algorithm \#2 (GIP_TSC_DELTA_METHOD_2).
3458 *
3459 * See supdrvTscDeltaMethod2CollectData for algorithm details.
3460 *
3461 * @param pArgs The argument/state data.
3462 * @param pMySync My synchronization structure.
3463 * @param pOtherSync My partner's synchronization structure.
3464 * @param fIsMaster Set if master, clear if worker.
3465 * @param iTry The attempt number.
3466 */
3467static void supdrvTscDeltaMethod2Loop(PSUPDRVGIPTSCDELTARGS pArgs, PSUPTSCDELTASYNC2 pMySync, PSUPTSCDELTASYNC2 pOtherSync,
3468 bool fIsMaster, uint32_t iTry)
3469{
3470 unsigned iLoop;
3471 RT_NOREF1(iTry);
3472
3473 for (iLoop = 0; iLoop < GIP_TSC_DELTA_M2_LOOPS; iLoop++)
3474 {
3475 RTCCUINTREG fEFlags;
3476 if (fIsMaster)
3477 {
3478 /*
3479 * Adjust the loop lag fudge.
3480 */
3481# if GIP_TSC_DELTA_M2_PRIMER_LOOPS > 0
3482 if (iLoop < GIP_TSC_DELTA_M2_PRIMER_LOOPS)
3483 {
3484 /* Lag during the priming to be nice to everyone.. */
3485 pArgs->uMaster.M2.fLag = true;
3486 pArgs->uWorker.M2.fLag = true;
3487 }
3488 else
3489# endif
3490 if (iLoop < (GIP_TSC_DELTA_M2_LOOPS - GIP_TSC_DELTA_M2_PRIMER_LOOPS) / 4)
3491 {
3492 /* 25 % of the body without lagging. */
3493 pArgs->uMaster.M2.fLag = false;
3494 pArgs->uWorker.M2.fLag = false;
3495 }
3496 else if (iLoop < (GIP_TSC_DELTA_M2_LOOPS - GIP_TSC_DELTA_M2_PRIMER_LOOPS) / 4 * 2)
3497 {
3498 /* 25 % of the body with both lagging. */
3499 pArgs->uMaster.M2.fLag = true;
3500 pArgs->uWorker.M2.fLag = true;
3501 }
3502 else
3503 {
3504 /* 50% of the body with alternating lag. */
3505 pArgs->uMaster.M2.fLag = (iLoop & 1) == 0;
3506 pArgs->uWorker.M2.fLag= (iLoop & 1) == 1;
3507 }
3508
3509 /*
3510 * Sync up with the worker and collect data.
3511 */
3512 TSCDELTA_MASTER_SYNC_BEFORE(pMySync, pOtherSync, &fEFlags, pArgs);
3513 supdrvTscDeltaMethod2CollectData(&pArgs->uMaster.M2.Data, &pArgs->uWorker.M2.Data.iCurSeqNo, pArgs->uMaster.M2.fLag);
3514 TSCDELTA_MASTER_SYNC_AFTER(pMySync, pOtherSync, fEFlags);
3515
3516 /*
3517 * Process the data.
3518 */
3519# if GIP_TSC_DELTA_M2_PRIMER_LOOPS > 0
3520 if (iLoop >= GIP_TSC_DELTA_M2_PRIMER_LOOPS)
3521# endif
3522 supdrvTscDeltaMethod2ProcessDataOnMaster(pArgs);
3523
3524 TSCDELTA_MASTER_KICK_OTHER_OUT_OF_AFTER(pMySync, pOtherSync);
3525 }
3526 else
3527 {
3528 /*
3529 * The worker.
3530 */
3531 TSCDELTA_OTHER_SYNC_BEFORE(pMySync, pOtherSync, &fEFlags, pArgs);
3532 supdrvTscDeltaMethod2CollectData(&pArgs->uWorker.M2.Data, &pArgs->uMaster.M2.Data.iCurSeqNo, pArgs->uWorker.M2.fLag);
3533 TSCDELTA_OTHER_SYNC_AFTER(pMySync, pOtherSync, fEFlags);
3534 }
3535 }
3536}
3537
3538#endif /* GIP_TSC_DELTA_METHOD_2 */
3539
3540
3541
/**
 * Verifies a TSC delta by letting the master and the worker take turns reading
 * their TSCs and checking that the delta adjusted values never go backwards.
 *
 * The two parties hand the turn back and forth by flipping each other's sync
 * variable between GIP_TSC_DELTA_SYNC2_GO_GO and GIP_TSC_DELTA_SYNC2_GO, with
 * the master reading first.  Afterwards the master walks the interleaved
 * samples and stores the verdict in pArgs->rcVerify before releasing the
 * worker.
 *
 * @returns The verdict in pArgs->rcVerify, which the master sets to
 *          VINF_SUCCESS or VERR_OUT_OF_RANGE.  VERR_TIMEOUT if one of the sync
 *          steps failed, in which case pArgs->rcVerify is set to
 *          VERR_TRY_AGAIN.
 * @param   pArgs               The argument/state data.
 * @param   pMySync             My synchronization structure.
 * @param   pOtherSync          My partner's synchronization structure.
 * @param   fIsMaster           Set if master, clear if worker.
 * @param   iWorkerTscDelta     The worker TSC delta to verify; subtracted from
 *                              the worker's samples (used by the master only).
 */
static int supdrvTscDeltaVerify(PSUPDRVGIPTSCDELTARGS pArgs, PSUPTSCDELTASYNC2 pMySync,
                                PSUPTSCDELTASYNC2 pOtherSync, bool fIsMaster, int64_t iWorkerTscDelta)
{
    /*PSUPGIPCPU              pGipCpuWorker = pArgs->pWorker; - unused */
    PSUPGIPCPU              pGipCpuMaster = pArgs->pMaster;
    uint32_t                i;
    TSCDELTA_DBG_VARS();

    /* Not a real loop: it only exists so the sync macros have something to
       'break' out of.  The normal path returns at the end of the body. */
    for (;;)
    {
        RTCCUINTREG fEFlags;
        AssertCompile((RT_ELEMENTS(pArgs->uMaster.Verify.auTscs) & 1) == 0);
        AssertCompile(RT_ELEMENTS(pArgs->uMaster.Verify.auTscs) == RT_ELEMENTS(pArgs->uWorker.Verify.auTscs));

        if (fIsMaster)
        {
            uint64_t uTscWorker;
            TSCDELTA_MASTER_SYNC_BEFORE(pMySync, pOtherSync, &fEFlags, pArgs);

            /*
             * Collect TSC, master goes first.
             * (The waits in here have no timeout.)
             */
            for (i = 0; i < RT_ELEMENTS(pArgs->uMaster.Verify.auTscs); i += 2)
            {
                /* Read, kick & wait #1. */
                uint64_t uTsc = ASMReadTSC();
                ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO_GO);
                ASMSerializeInstruction();
                pArgs->uMaster.Verify.auTscs[i] = uTsc;
                TSCDELTA_DBG_START_LOOP();
                while (ASMAtomicReadU32(&pMySync->uSyncVar) == GIP_TSC_DELTA_SYNC2_GO)
                {
                    TSCDELTA_DBG_CHECK_LOOP();
                    ASMNopPause();
                }

                /* Read, kick & wait #2. */
                uTsc = ASMReadTSC();
                ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO);
                ASMSerializeInstruction();
                pArgs->uMaster.Verify.auTscs[i + 1] = uTsc;
                TSCDELTA_DBG_START_LOOP();
                while (ASMAtomicReadU32(&pMySync->uSyncVar) == GIP_TSC_DELTA_SYNC2_GO_GO)
                {
                    TSCDELTA_DBG_CHECK_LOOP();
                    ASMNopPause();
                }
            }

            TSCDELTA_MASTER_SYNC_AFTER(pMySync, pOtherSync, fEFlags);

            /*
             * Process the data.  The samples were taken in the order
             * master[0], worker[0], master[1], worker[1], ... so each adjusted
             * value must be no smaller than the one before it.
             */
#ifdef TSCDELTA_VERIFY_WITH_STATS
            pArgs->cMaxVerifyTscTicks = INT64_MIN;
            pArgs->cMinVerifyTscTicks = INT64_MAX;
            pArgs->iVerifyBadTscDiff  = 0;
#endif
            ASMAtomicWriteS32(&pArgs->rcVerify, VINF_SUCCESS);
            uTscWorker = 0;
            for (i = 0; i < RT_ELEMENTS(pArgs->uMaster.Verify.auTscs); i++)
            {
                /* Master vs previous worker entry. */
                uint64_t uTscMaster = pArgs->uMaster.Verify.auTscs[i] - pGipCpuMaster->i64TSCDelta;
                int64_t  iDiff;
                if (i > 0)
                {
                    iDiff = uTscMaster - uTscWorker;
#ifdef TSCDELTA_VERIFY_WITH_STATS
                    if (iDiff > pArgs->cMaxVerifyTscTicks)
                        pArgs->cMaxVerifyTscTicks = iDiff;
                    if (iDiff < pArgs->cMinVerifyTscTicks)
                        pArgs->cMinVerifyTscTicks = iDiff;
#endif
                    if (iDiff < 0)
                    {
#ifdef TSCDELTA_VERIFY_WITH_STATS
                        /* Negated since the recorded value is worker relative to master. */
                        pArgs->iVerifyBadTscDiff = -iDiff;
#endif
                        ASMAtomicWriteS32(&pArgs->rcVerify, VERR_OUT_OF_RANGE);
                        break;
                    }
                }

                /* Worker vs master. */
                uTscWorker = pArgs->uWorker.Verify.auTscs[i] - iWorkerTscDelta;
                iDiff = uTscWorker - uTscMaster;
#ifdef TSCDELTA_VERIFY_WITH_STATS
                if (iDiff > pArgs->cMaxVerifyTscTicks)
                    pArgs->cMaxVerifyTscTicks = iDiff;
                if (iDiff < pArgs->cMinVerifyTscTicks)
                    pArgs->cMinVerifyTscTicks = iDiff;
#endif
                if (iDiff < 0)
                {
#ifdef TSCDELTA_VERIFY_WITH_STATS
                    pArgs->iVerifyBadTscDiff = iDiff;
#endif
                    ASMAtomicWriteS32(&pArgs->rcVerify, VERR_OUT_OF_RANGE);
                    break;
                }
            }

            /* Done. */
            TSCDELTA_MASTER_KICK_OTHER_OUT_OF_AFTER(pMySync, pOtherSync);
        }
        else
        {
            /*
             * The worker, master leads.
             */
            TSCDELTA_OTHER_SYNC_BEFORE(pMySync, pOtherSync, &fEFlags, pArgs);

            for (i = 0; i < RT_ELEMENTS(pArgs->uWorker.Verify.auTscs); i += 2)
            {
                uint64_t uTsc;

                /* Wait, Read and Kick #1. */
                TSCDELTA_DBG_START_LOOP();
                while (ASMAtomicReadU32(&pMySync->uSyncVar) == GIP_TSC_DELTA_SYNC2_GO)
                {
                    TSCDELTA_DBG_CHECK_LOOP();
                    ASMNopPause();
                }
                uTsc = ASMReadTSC();
                ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO_GO);
                ASMSerializeInstruction();
                pArgs->uWorker.Verify.auTscs[i] = uTsc;

                /* Wait, Read and Kick #2. */
                TSCDELTA_DBG_START_LOOP();
                while (ASMAtomicReadU32(&pMySync->uSyncVar) == GIP_TSC_DELTA_SYNC2_GO_GO)
                {
                    TSCDELTA_DBG_CHECK_LOOP();
                    ASMNopPause();
                }
                uTsc = ASMReadTSC();
                ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO);
                ASMSerializeInstruction();
                pArgs->uWorker.Verify.auTscs[i + 1] = uTsc;
            }

            /* The worker is held in here until the master has written the verdict and kicked it. */
            TSCDELTA_OTHER_SYNC_AFTER(pMySync, pOtherSync, fEFlags);
        }
        return pArgs->rcVerify;
    }

    /*
     * Timed out, please retry.
     * (Only reached when one of the sync macros above did a 'break'.)
     */
    ASMAtomicWriteS32(&pArgs->rcVerify, VERR_TRY_AGAIN);
    return VERR_TIMEOUT;
}
3696
3697
3698
3699/**
3700 * Handles the special abort procedure during synchronization setup in
3701 * supdrvTscMeasureDeltaCallbackUnwrapped().
3702 *
3703 * @returns 0 (dummy, ignored)
3704 * @param pArgs Pointer to argument/state data.
3705 * @param pMySync Pointer to my sync structure.
3706 * @param fIsMaster Set if we're the master, clear if worker.
3707 * @param fTimeout Set if it's a timeout.
3708 */
3709DECL_NO_INLINE(static, int)
3710supdrvTscMeasureDeltaCallbackAbortSyncSetup(PSUPDRVGIPTSCDELTARGS pArgs, PSUPTSCDELTASYNC2 pMySync, bool fIsMaster, bool fTimeout)
3711{
3712 PSUPTSCDELTASYNC2 volatile *ppMySync = fIsMaster ? &pArgs->pSyncMaster : &pArgs->pSyncWorker;
3713 PSUPTSCDELTASYNC2 volatile *ppOtherSync = fIsMaster ? &pArgs->pSyncWorker : &pArgs->pSyncMaster;
3714 TSCDELTA_DBG_VARS();
3715 RT_NOREF1(pMySync);
3716
3717 /*
3718 * Clear our sync pointer and make sure the abort flag is set.
3719 */
3720 ASMAtomicWriteNullPtr(ppMySync);
3721 ASMAtomicWriteBool(&pArgs->fAbortSetup, true);
3722 if (fTimeout)
3723 ASMAtomicWriteBool(&pArgs->fTimedOut, true);
3724
3725 /*
3726 * Make sure the other party is out of there and won't be touching our
3727 * sync state again (would cause stack corruption).
3728 */
3729 TSCDELTA_DBG_START_LOOP();
3730 while (ASMAtomicReadPtrT(ppOtherSync, PSUPTSCDELTASYNC2) != NULL)
3731 {
3732 ASMNopPause();
3733 ASMNopPause();
3734 ASMNopPause();
3735 TSCDELTA_DBG_CHECK_LOOP();
3736 }
3737
3738 return 0;
3739}
3740
3741
3742/**
3743 * This is used by supdrvTscMeasureInitialDeltas() to read the TSC on two CPUs
3744 * and compute the delta between them.
3745 *
3746 * To reduce code size a good when timeout handling was added, a dummy return
3747 * value had to be added (saves 1-3 lines per timeout case), thus this
3748 * 'Unwrapped' function and the dummy 0 return value.
3749 *
3750 * @returns 0 (dummy, ignored)
3751 * @param idCpu The CPU we are current scheduled on.
3752 * @param pArgs Pointer to a parameter package.
3753 *
3754 * @remarks Measuring TSC deltas between the CPUs is tricky because we need to
3755 * read the TSC at exactly the same time on both the master and the
3756 * worker CPUs. Due to DMA, bus arbitration, cache locality,
3757 * contention, SMI, pipelining etc. there is no guaranteed way of
3758 * doing this on x86 CPUs.
3759 */
3760static int supdrvTscMeasureDeltaCallbackUnwrapped(RTCPUID idCpu, PSUPDRVGIPTSCDELTARGS pArgs)
3761{
3762 PSUPDRVDEVEXT pDevExt = pArgs->pDevExt;
3763 PSUPGIPCPU pGipCpuWorker = pArgs->pWorker;
3764 PSUPGIPCPU pGipCpuMaster = pArgs->pMaster;
3765 bool const fIsMaster = idCpu == pGipCpuMaster->idCpu;
3766 uint32_t iTry;
3767 PSUPTSCDELTASYNC2 volatile *ppMySync = fIsMaster ? &pArgs->pSyncMaster : &pArgs->pSyncWorker;
3768 PSUPTSCDELTASYNC2 volatile *ppOtherSync = fIsMaster ? &pArgs->pSyncWorker : &pArgs->pSyncMaster;
3769 SUPTSCDELTASYNC2 MySync;
3770 PSUPTSCDELTASYNC2 pOtherSync;
3771 int rc;
3772 TSCDELTA_DBG_VARS();
3773
3774 /* A bit of paranoia first. */
3775 if (!pGipCpuMaster || !pGipCpuWorker)
3776 return 0;
3777
3778 /*
3779 * If the CPU isn't part of the measurement, return immediately.
3780 */
3781 if ( !fIsMaster
3782 && idCpu != pGipCpuWorker->idCpu)
3783 return 0;
3784
3785 /*
3786 * Set up my synchronization stuff and wait for the other party to show up.
3787 *
3788 * We don't wait forever since the other party may be off fishing (offline,
3789 * spinning with ints disables, whatever), we must play nice to the rest of
3790 * the system as this context generally isn't one in which we will get
3791 * preempted and we may hold up a number of lower priority interrupts.
3792 */
3793 ASMAtomicWriteU32(&MySync.uSyncVar, GIP_TSC_DELTA_SYNC2_PRESTART_WAIT);
3794 ASMAtomicWritePtr(ppMySync, &MySync);
3795 MySync.uTscStart = ASMReadTSC();
3796 MySync.cMaxTscTicks = pArgs->cMaxTscTicks;
3797
3798 /* Look for the partner, might not be here yet... Special abort considerations. */
3799 iTry = 0;
3800 TSCDELTA_DBG_START_LOOP();
3801 while ((pOtherSync = ASMAtomicReadPtrT(ppOtherSync, PSUPTSCDELTASYNC2)) == NULL)
3802 {
3803 ASMNopPause();
3804 if ( ASMAtomicReadBool(&pArgs->fAbortSetup)
3805 || !RTMpIsCpuOnline(fIsMaster ? pGipCpuWorker->idCpu : pGipCpuMaster->idCpu) )
3806 return supdrvTscMeasureDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, false /*fTimeout*/);
3807 if ( (iTry++ & 0xff) == 0
3808 && ASMReadTSC() - MySync.uTscStart > pArgs->cMaxTscTicks)
3809 return supdrvTscMeasureDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, true /*fTimeout*/);
3810 TSCDELTA_DBG_CHECK_LOOP();
3811 ASMNopPause();
3812 }
3813
3814 /* I found my partner, waiting to be found... Special abort considerations. */
3815 if (fIsMaster)
3816 if (!ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_READY, GIP_TSC_DELTA_SYNC2_PRESTART_WAIT)) /* parnaoia */
3817 return supdrvTscMeasureDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, false /*fTimeout*/);
3818
3819 iTry = 0;
3820 TSCDELTA_DBG_START_LOOP();
3821 while (ASMAtomicReadU32(&MySync.uSyncVar) == GIP_TSC_DELTA_SYNC2_PRESTART_WAIT)
3822 {
3823 ASMNopPause();
3824 if (ASMAtomicReadBool(&pArgs->fAbortSetup))
3825 return supdrvTscMeasureDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, false /*fTimeout*/);
3826 if ( (iTry++ & 0xff) == 0
3827 && ASMReadTSC() - MySync.uTscStart > pArgs->cMaxTscTicks)
3828 {
3829 if ( fIsMaster
3830 && !ASMAtomicCmpXchgU32(&MySync.uSyncVar, GIP_TSC_DELTA_SYNC2_PRESTART_ABORT, GIP_TSC_DELTA_SYNC2_PRESTART_WAIT))
3831 break; /* race #1: slave has moved on, handle timeout in loop instead. */
3832 return supdrvTscMeasureDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, true /*fTimeout*/);
3833 }
3834 TSCDELTA_DBG_CHECK_LOOP();
3835 }
3836
3837 if (!fIsMaster)
3838 if (!ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_READY, GIP_TSC_DELTA_SYNC2_PRESTART_WAIT)) /* race #1 */
3839 return supdrvTscMeasureDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, false /*fTimeout*/);
3840
3841/** @todo Add a resumable state to pArgs so we don't waste time if we time
3842 * out or something. Timeouts are legit, any of the two CPUs may get
3843 * interrupted. */
3844
3845 /*
3846 * Start by seeing if we have a zero delta between the two CPUs.
3847 * This should normally be the case.
3848 */
3849 rc = supdrvTscDeltaVerify(pArgs, &MySync, pOtherSync, fIsMaster, GIP_TSC_DELTA_INITIAL_MASTER_VALUE);
3850 if (RT_SUCCESS(rc))
3851 {
3852 if (fIsMaster)
3853 {
3854 ASMAtomicWriteS64(&pGipCpuWorker->i64TSCDelta, GIP_TSC_DELTA_INITIAL_MASTER_VALUE);
3855 RTCpuSetDelByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet);
3856 RTCpuSetAddByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpuWorker->iCpuSet);
3857 }
3858 }
3859 /*
3860 * If the verification didn't time out, do regular delta measurements.
3861 * We retry this until we get a reasonable value.
3862 */
3863 else if (rc != VERR_TIMEOUT)
3864 {
3865 Assert(pGipCpuWorker->i64TSCDelta == INT64_MAX);
3866 for (iTry = 0; iTry < 12; iTry++)
3867 {
3868 /*
3869 * Check the state before we start.
3870 */
3871 uint32_t u32Tmp = ASMAtomicReadU32(&MySync.uSyncVar);
3872 if ( u32Tmp != GIP_TSC_DELTA_SYNC2_READY
3873 && (fIsMaster || u32Tmp != GIP_TSC_DELTA_SYNC2_STEADY) /* worker may be late prepping for the next round */ )
3874 {
3875 TSCDELTA_DBG_SYNC_MSG(("sync/loop/%s: #0 iTry=%u MyState=%#x\n", fIsMaster ? "master" : "worker", iTry, u32Tmp));
3876 break;
3877 }
3878
3879 /*
3880 * Do the measurements.
3881 */
3882#ifdef GIP_TSC_DELTA_METHOD_1
3883 supdrvTscDeltaMethod1Loop(pArgs, &MySync, pOtherSync, fIsMaster, iTry);
3884#elif defined(GIP_TSC_DELTA_METHOD_2)
3885 supdrvTscDeltaMethod2Loop(pArgs, &MySync, pOtherSync, fIsMaster, iTry);
3886#else
3887# error "huh??"
3888#endif
3889
3890 /*
3891 * Check the state.
3892 */
3893 u32Tmp = ASMAtomicReadU32(&MySync.uSyncVar);
3894 if ( u32Tmp != GIP_TSC_DELTA_SYNC2_READY
3895 && (fIsMaster || u32Tmp != GIP_TSC_DELTA_SYNC2_STEADY) /* worker may be late prepping for the next round */ )
3896 {
3897 if (fIsMaster)
3898 TSCDELTA_DBG_SYNC_MSG(("sync/loop/master: #1 iTry=%u MyState=%#x\n", iTry, u32Tmp));
3899 else
3900 TSCDELTA_DBG_SYNC_MSG2(("sync/loop/worker: #1 iTry=%u MyState=%#x\n", iTry, u32Tmp));
3901 break;
3902 }
3903
3904 /*
3905 * Success? If so, stop trying. Master decides.
3906 */
3907 if (fIsMaster)
3908 {
3909 if (pGipCpuWorker->i64TSCDelta != INT64_MAX)
3910 {
3911 RTCpuSetDelByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet);
3912 RTCpuSetAddByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpuWorker->iCpuSet);
3913 TSCDELTA_DBG_SYNC_MSG2(("sync/loop/master: #9 iTry=%u MyState=%#x\n", iTry, MySync.uSyncVar));
3914 break;
3915 }
3916 }
3917 }
3918 if (fIsMaster)
3919 pArgs->iTry = iTry;
3920 }
3921
3922 /*
3923 * End the synchronization dance. We tell the other that we're done,
3924 * then wait for the same kind of reply.
3925 */
3926 ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_FINAL);
3927 ASMAtomicWriteNullPtr(ppMySync);
3928 iTry = 0;
3929 TSCDELTA_DBG_START_LOOP();
3930 while (ASMAtomicReadU32(&MySync.uSyncVar) != GIP_TSC_DELTA_SYNC2_FINAL)
3931 {
3932 iTry++;
3933 if ( iTry == 0
3934 && !RTMpIsCpuOnline(fIsMaster ? pGipCpuWorker->idCpu : pGipCpuMaster->idCpu))
3935 break; /* this really shouldn't happen. */
3936 TSCDELTA_DBG_CHECK_LOOP();
3937 ASMNopPause();
3938 }
3939
3940 /*
3941 * Collect some runtime stats.
3942 */
3943 if (fIsMaster)
3944 pArgs->cElapsedMasterTscTicks = ASMReadTSC() - MySync.uTscStart;
3945 else
3946 pArgs->cElapsedWorkerTscTicks = ASMReadTSC() - MySync.uTscStart;
3947 return 0;
3948}
3949
3950/**
3951 * Callback used by supdrvTscMeasureInitialDeltas() to read the TSC on two CPUs
3952 * and compute the delta between them.
3953 *
3954 * @param idCpu The CPU we are current scheduled on.
3955 * @param pvUser1 Pointer to a parameter package (SUPDRVGIPTSCDELTARGS).
3956 * @param pvUser2 Unused.
3957 */
3958static DECLCALLBACK(void) supdrvTscMeasureDeltaCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
3959{
3960 supdrvTscMeasureDeltaCallbackUnwrapped(idCpu, (PSUPDRVGIPTSCDELTARGS)pvUser1);
3961 RT_NOREF1(pvUser2);
3962}
3963
3964
3965/**
3966 * Measures the TSC delta between the master GIP CPU and one specified worker
3967 * CPU.
3968 *
3969 * @returns VBox status code.
3970 * @retval VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED on pure measurement
3971 * failure.
3972 * @param pDevExt Pointer to the device instance data.
3973 * @param idxWorker The index of the worker CPU from the GIP's array of
3974 * CPUs.
3975 *
3976 * @remarks This must be called with preemption enabled!
3977 */
3978static int supdrvTscMeasureDeltaOne(PSUPDRVDEVEXT pDevExt, uint32_t idxWorker)
3979{
3980 int rc;
3981 int rc2;
3982 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
3983 RTCPUID idMaster = pDevExt->idGipMaster;
3984 PSUPGIPCPU pGipCpuWorker = &pGip->aCPUs[idxWorker];
3985 PSUPGIPCPU pGipCpuMaster;
3986 uint32_t iGipCpuMaster;
3987 uint32_t u32Tmp;
3988
3989 /* Validate input a bit. */
3990 AssertReturn(pGip, VERR_INVALID_PARAMETER);
3991 Assert(pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED);
3992 Assert(RTThreadPreemptIsEnabled(NIL_RTTHREAD));
3993
3994 /*
3995 * Don't attempt measuring the delta for the GIP master.
3996 */
3997 if (pGipCpuWorker->idCpu == idMaster)
3998 {
3999 if (pGipCpuWorker->i64TSCDelta == INT64_MAX) /* This shouldn't happen, but just in case. */
4000 ASMAtomicWriteS64(&pGipCpuWorker->i64TSCDelta, GIP_TSC_DELTA_INITIAL_MASTER_VALUE);
4001 return VINF_SUCCESS;
4002 }
4003
4004 /*
4005 * One measurement at a time, at least for now. We might be using
4006 * broadcast IPIs so, so be nice to the rest of the system.
4007 */
4008#ifdef SUPDRV_USE_MUTEX_FOR_GIP
4009 rc = RTSemMutexRequest(pDevExt->mtxTscDelta, RT_INDEFINITE_WAIT);
4010#else
4011 rc = RTSemFastMutexRequest(pDevExt->mtxTscDelta);
4012#endif
4013 if (RT_FAILURE(rc))
4014 return rc;
4015
4016 /*
4017 * If the CPU has hyper-threading and the APIC IDs of the master and worker are adjacent,
4018 * try pick a different master. (This fudge only works with multi core systems.)
4019 * ASSUMES related threads have adjacent APIC IDs. ASSUMES two threads per core.
4020 *
4021 * We skip this on AMDs for now as their HTT is different from Intel's and
4022 * it doesn't seem to have any favorable effect on the results.
4023 *
4024 * If the master is offline, we need a new master too, so share the code.
4025 */
4026 iGipCpuMaster = supdrvGipFindCpuIndexForCpuId(pGip, idMaster);
4027 AssertReturn(iGipCpuMaster < pGip->cCpus, VERR_INVALID_CPU_ID);
4028 pGipCpuMaster = &pGip->aCPUs[iGipCpuMaster];
4029 if ( ( (pGipCpuMaster->idApic & ~1) == (pGipCpuWorker->idApic & ~1)
4030 && pGip->cOnlineCpus > 2
4031 && ASMHasCpuId()
4032 && ASMIsValidStdRange(ASMCpuId_EAX(0))
4033 && (ASMCpuId_EDX(1) & X86_CPUID_FEATURE_EDX_HTT)
4034 && ( !ASMIsAmdCpu()
4035 || ASMGetCpuFamily(u32Tmp = ASMCpuId_EAX(1)) > 0x15
4036 || ( ASMGetCpuFamily(u32Tmp) == 0x15 /* Piledriver+, not bulldozer (FX-4150 didn't like it). */
4037 && ASMGetCpuModelAMD(u32Tmp) >= 0x02) ) )
4038 || !RTMpIsCpuOnline(idMaster) )
4039 {
4040 uint32_t i;
4041 for (i = 0; i < pGip->cCpus; i++)
4042 if ( i != iGipCpuMaster
4043 && i != idxWorker
4044 && pGip->aCPUs[i].enmState == SUPGIPCPUSTATE_ONLINE
4045 && pGip->aCPUs[i].i64TSCDelta != INT64_MAX
4046 && pGip->aCPUs[i].idCpu != NIL_RTCPUID
4047 && pGip->aCPUs[i].idCpu != idMaster /* paranoia starts here... */
4048 && pGip->aCPUs[i].idCpu != pGipCpuWorker->idCpu
4049 && pGip->aCPUs[i].idApic != pGipCpuWorker->idApic
4050 && pGip->aCPUs[i].idApic != pGipCpuMaster->idApic
4051 && RTMpIsCpuOnline(pGip->aCPUs[i].idCpu))
4052 {
4053 iGipCpuMaster = i;
4054 pGipCpuMaster = &pGip->aCPUs[i];
4055 idMaster = pGipCpuMaster->idCpu;
4056 break;
4057 }
4058 }
4059
4060 if (RTCpuSetIsMemberByIndex(&pGip->OnlineCpuSet, pGipCpuWorker->iCpuSet))
4061 {
4062 /*
4063 * Initialize data package for the RTMpOnPair callback.
4064 */
4065 PSUPDRVGIPTSCDELTARGS pArgs = (PSUPDRVGIPTSCDELTARGS)RTMemAllocZ(sizeof(*pArgs));
4066 if (pArgs)
4067 {
4068 pArgs->pWorker = pGipCpuWorker;
4069 pArgs->pMaster = pGipCpuMaster;
4070 pArgs->pDevExt = pDevExt;
4071 pArgs->pSyncMaster = NULL;
4072 pArgs->pSyncWorker = NULL;
4073 pArgs->cMaxTscTicks = ASMAtomicReadU64(&pGip->u64CpuHz) / 512; /* 1953 us */
4074
4075 /*
4076 * Do the RTMpOnPair call. We reset i64TSCDelta first so we
4077 * and supdrvTscMeasureDeltaCallback can use it as a success check.
4078 */
4079 /** @todo Store the i64TSCDelta result in pArgs first? Perhaps deals with
4080 * that when doing the restart loop reorg. */
4081 ASMAtomicWriteS64(&pGipCpuWorker->i64TSCDelta, INT64_MAX);
4082 rc = RTMpOnPair(pGipCpuMaster->idCpu, pGipCpuWorker->idCpu, RTMPON_F_CONCURRENT_EXEC,
4083 supdrvTscMeasureDeltaCallback, pArgs, NULL);
4084 if (RT_SUCCESS(rc))
4085 {
4086#if 0
4087 SUPR0Printf("mponpair ticks: %9llu %9llu max: %9llu iTry: %u%s\n", pArgs->cElapsedMasterTscTicks,
4088 pArgs->cElapsedWorkerTscTicks, pArgs->cMaxTscTicks, pArgs->iTry,
4089 pArgs->fTimedOut ? " timed out" :"");
4090#endif
4091#if 0
4092 SUPR0Printf("rcVerify=%d iVerifyBadTscDiff=%lld cMinVerifyTscTicks=%lld cMaxVerifyTscTicks=%lld\n",
4093 pArgs->rcVerify, pArgs->iVerifyBadTscDiff, pArgs->cMinVerifyTscTicks, pArgs->cMaxVerifyTscTicks);
4094#endif
4095 if (RT_LIKELY(pGipCpuWorker->i64TSCDelta != INT64_MAX))
4096 {
4097 /*
4098 * Work the TSC delta applicability rating. It starts
4099 * optimistic in supdrvGipInit, we downgrade it here.
4100 */
4101 SUPGIPUSETSCDELTA enmRating;
4102 if ( pGipCpuWorker->i64TSCDelta > GIP_TSC_DELTA_THRESHOLD_ROUGHLY_ZERO
4103 || pGipCpuWorker->i64TSCDelta < -GIP_TSC_DELTA_THRESHOLD_ROUGHLY_ZERO)
4104 enmRating = SUPGIPUSETSCDELTA_NOT_ZERO;
4105 else if ( pGipCpuWorker->i64TSCDelta > GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO
4106 || pGipCpuWorker->i64TSCDelta < -GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO)
4107 enmRating = SUPGIPUSETSCDELTA_ROUGHLY_ZERO;
4108 else
4109 enmRating = SUPGIPUSETSCDELTA_PRACTICALLY_ZERO;
4110 if (pGip->enmUseTscDelta < enmRating)
4111 {
4112 AssertCompile(sizeof(pGip->enmUseTscDelta) == sizeof(uint32_t));
4113 ASMAtomicWriteU32((uint32_t volatile *)&pGip->enmUseTscDelta, enmRating);
4114 }
4115 }
4116 else
4117 rc = VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED;
4118 }
4119 /** @todo return try-again if we get an offline CPU error. */
4120
4121 RTMemFree(pArgs);
4122 }
4123 else
4124 rc = VERR_NO_MEMORY;
4125 }
4126 else
4127 rc = VERR_CPU_OFFLINE;
4128
4129 /*
4130 * We're done now.
4131 */
4132#ifdef SUPDRV_USE_MUTEX_FOR_GIP
4133 rc2 = RTSemMutexRelease(pDevExt->mtxTscDelta); AssertRC(rc2);
4134#else
4135 rc2 = RTSemFastMutexRelease(pDevExt->mtxTscDelta); AssertRC(rc2);
4136#endif
4137 return rc;
4138}
4139
4140
4141/**
4142 * Resets the TSC-delta related TSC samples and optionally the deltas
4143 * themselves.
4144 *
4145 * @param pDevExt Pointer to the device instance data.
4146 * @param fResetTscDeltas Whether the TSC-deltas are also to be reset.
4147 *
4148 * @remarks This might be called while holding a spinlock!
4149 */
4150static void supdrvTscResetSamples(PSUPDRVDEVEXT pDevExt, bool fResetTscDeltas)
4151{
4152 unsigned iCpu;
4153 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
4154 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
4155 {
4156 PSUPGIPCPU pGipCpu = &pGip->aCPUs[iCpu];
4157 ASMAtomicWriteU64(&pGipCpu->u64TSCSample, GIP_TSC_DELTA_RSVD);
4158 if (fResetTscDeltas)
4159 {
4160 RTCpuSetDelByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpu->iCpuSet);
4161 ASMAtomicWriteS64(&pGipCpu->i64TSCDelta, INT64_MAX);
4162 }
4163 }
4164}
4165
4166
4167/**
4168 * Picks an online CPU as the master TSC for TSC-delta computations.
4169 *
4170 * @returns VBox status code.
4171 * @param pDevExt Pointer to the device instance data.
4172 * @param pidxMaster Where to store the CPU array index of the chosen
4173 * master. Optional, can be NULL.
4174 */
4175static int supdrvTscPickMaster(PSUPDRVDEVEXT pDevExt, uint32_t *pidxMaster)
4176{
4177 /*
4178 * Pick the first CPU online as the master TSC and make it the new GIP master based
4179 * on the APIC ID.
4180 *
4181 * Technically we can simply use "idGipMaster" but doing this gives us master as CPU 0
4182 * in most cases making it nicer/easier for comparisons. It is safe to update the GIP
4183 * master as this point since the sync/async timer isn't created yet.
4184 */
4185 unsigned iCpu;
4186 uint32_t idxMaster = UINT32_MAX;
4187 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
4188 for (iCpu = 0; iCpu < RT_ELEMENTS(pGip->aiCpuFromApicId); iCpu++)
4189 {
4190 uint16_t idxCpu = pGip->aiCpuFromApicId[iCpu];
4191 if (idxCpu != UINT16_MAX)
4192 {
4193 PSUPGIPCPU pGipCpu = &pGip->aCPUs[idxCpu];
4194 if (RTCpuSetIsMemberByIndex(&pGip->OnlineCpuSet, pGipCpu->iCpuSet))
4195 {
4196 idxMaster = idxCpu;
4197 pGipCpu->i64TSCDelta = GIP_TSC_DELTA_INITIAL_MASTER_VALUE;
4198 ASMAtomicWriteSize(&pDevExt->idGipMaster, pGipCpu->idCpu);
4199 if (pidxMaster)
4200 *pidxMaster = idxMaster;
4201 return VINF_SUCCESS;
4202 }
4203 }
4204 }
4205 return VERR_CPU_OFFLINE;
4206}
4207
4208
4209/**
4210 * Performs the initial measurements of the TSC deltas between CPUs.
4211 *
4212 * This is called by supdrvGipCreate(), supdrvGipPowerNotificationCallback() or
4213 * triggered by it if threaded.
4214 *
4215 * @returns VBox status code.
4216 * @param pDevExt Pointer to the device instance data.
4217 *
4218 * @remarks Must be called only after supdrvGipInitOnCpu() as this function uses
4219 * idCpu, GIP's online CPU set which are populated in
4220 * supdrvGipInitOnCpu().
4221 */
4222static int supdrvTscMeasureInitialDeltas(PSUPDRVDEVEXT pDevExt)
4223{
4224 PSUPGIPCPU pGipCpuMaster;
4225 unsigned iCpu;
4226 unsigned iOddEven;
4227 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
4228 uint32_t idxMaster = UINT32_MAX;
4229 uint32_t cMpOnOffEvents = ASMAtomicReadU32(&pDevExt->cMpOnOffEvents);
4230
4231 Assert(pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED);
4232 supdrvTscResetSamples(pDevExt, true /* fClearDeltas */);
4233 int rc = supdrvTscPickMaster(pDevExt, &idxMaster);
4234 if (RT_FAILURE(rc))
4235 {
4236 SUPR0Printf("Failed to pick a CPU master for TSC-delta measurements rc=%Rrc\n", rc);
4237 return rc;
4238 }
4239 AssertReturn(idxMaster < pGip->cCpus, VERR_INVALID_CPU_INDEX);
4240 pGipCpuMaster = &pGip->aCPUs[idxMaster];
4241 Assert(pDevExt->idGipMaster == pGipCpuMaster->idCpu);
4242
4243 /*
4244 * If there is only a single CPU online we have nothing to do.
4245 */
4246 if (pGip->cOnlineCpus <= 1)
4247 {
4248 AssertReturn(pGip->cOnlineCpus > 0, VERR_INTERNAL_ERROR_5);
4249 return VINF_SUCCESS;
4250 }
4251
4252 /*
4253 * Loop thru the GIP CPU array and get deltas for each CPU (except the
4254 * master). We do the CPUs with the even numbered APIC IDs first so that
4255 * we've got alternative master CPUs to pick from on hyper-threaded systems.
4256 */
4257 for (iOddEven = 0; iOddEven < 2; iOddEven++)
4258 {
4259 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
4260 {
4261 PSUPGIPCPU pGipCpuWorker = &pGip->aCPUs[iCpu];
4262 if ( iCpu != idxMaster
4263 && (iOddEven > 0 || (pGipCpuWorker->idApic & 1) == 0)
4264 && RTCpuSetIsMemberByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet))
4265 {
4266 rc = supdrvTscMeasureDeltaOne(pDevExt, iCpu);
4267 if (RT_FAILURE(rc))
4268 {
4269 SUPR0Printf("supdrvTscMeasureDeltaOne failed. rc=%d CPU[%u].idCpu=%u Master[%u].idCpu=%u\n", rc, iCpu,
4270 pGipCpuWorker->idCpu, idxMaster, pDevExt->idGipMaster, pGipCpuMaster->idCpu);
4271 break;
4272 }
4273
4274 if (ASMAtomicReadU32(&pDevExt->cMpOnOffEvents) != cMpOnOffEvents)
4275 {
4276 SUPR0Printf("One or more CPUs transitioned between online & offline states. I'm confused, retry...\n");
4277 rc = VERR_TRY_AGAIN;
4278 break;
4279 }
4280 }
4281 }
4282 }
4283
4284 return rc;
4285}
4286
4287
4288#ifdef SUPDRV_USE_TSC_DELTA_THREAD
4289
4290/**
4291 * Switches the TSC-delta measurement thread into the butchered state.
4292 *
4293 * @returns VBox status code.
4294 * @param pDevExt Pointer to the device instance data.
4295 * @param fSpinlockHeld Whether the TSC-delta spinlock is held or not.
4296 * @param pszFailed An error message to log.
4297 * @param rcFailed The error code to exit the thread with.
4298 */
4299static int supdrvTscDeltaThreadButchered(PSUPDRVDEVEXT pDevExt, bool fSpinlockHeld, const char *pszFailed, int rcFailed)
4300{
4301 if (!fSpinlockHeld)
4302 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4303
4304 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Butchered;
4305 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4306 OSDBGPRINT(("supdrvTscDeltaThreadButchered: %s. rc=%Rrc\n", pszFailed, rcFailed));
4307 return rcFailed;
4308}
4309
4310
4311/**
 * The TSC-delta measurement thread.
 *
 * Runs an endless loop which takes the TSC-delta spinlock, reads
 * pDevExt->enmTscDeltaThreadState and acts on it:
 *   - Creating:        Switches to Listening, signals hTscDeltaEvent and falls
 *                      through to Listening.
 *   - Listening:       Drops the spinlock and waits on the thread's user event
 *                      for up to cMsTscDeltaTimeout milliseconds.
 *   - WaitAndMeasure:  Switches to Measuring, signals hTscDeltaEvent, drops
 *                      the spinlock, sleeps 1 ms and falls through to
 *                      Measuring.
 *   - Measuring:       Measures either all deltas or only those of the CPUs
 *                      in TscDeltaCpuSet, publishes the status in rcTscDelta
 *                      and returns to Listening unless the state was changed
 *                      in the meantime.
 *   - Terminating:     Switches to Destroyed and exits with VINF_SUCCESS.
 *   - Anything else:   Butchers the thread with VERR_INVALID_STATE.
 *
 * @returns VBox status code.
 * @param hThread The thread handle.
 * @param pvUser Opaque pointer to the device instance data.
 */
4318static DECLCALLBACK(int) supdrvTscDeltaThread(RTTHREAD hThread, void *pvUser)
4319{
4320 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
4321 int rc = VERR_INTERNAL_ERROR_2;
4322 for (;;)
4323 {
4324 /*
 * Switch on the current state.  The spinlock is taken here and each
 * case is responsible for releasing it.
 */
4327 SUPDRVTSCDELTATHREADSTATE enmState;
4328 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4329 enmState = pDevExt->enmTscDeltaThreadState;
4330 switch (enmState)
4331 {
4332 case kTscDeltaThreadState_Creating:
4333 {
 /* Tell whoever waits on hTscDeltaEvent that we are up and listening. */
4334 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Listening;
4335 rc = RTSemEventSignal(pDevExt->hTscDeltaEvent);
4336 if (RT_FAILURE(rc))
4337 return supdrvTscDeltaThreadButchered(pDevExt, true /* fSpinlockHeld */, "RTSemEventSignal", rc);
4338 RT_FALL_THRU();
4339 }
4340
4341 case kTscDeltaThreadState_Listening:
4342 {
4343 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4344
4345 /*
 * Linux counts uninterruptible sleeps as load, hence we shall do a
 * regular, interruptible sleep here and ignore wake ups due to signals.
 * See task_contributes_to_load() in include/linux/sched.h in the Linux sources.
 */
4350 rc = RTThreadUserWaitNoResume(hThread, pDevExt->cMsTscDeltaTimeout);
 /* Timeouts and interruptions just send us around the loop again. */
4351 if ( RT_FAILURE(rc)
4352 && rc != VERR_TIMEOUT
4353 && rc != VERR_INTERRUPTED)
4354 return supdrvTscDeltaThreadButchered(pDevExt, false /* fSpinlockHeld */, "RTThreadUserWait", rc);
4355 RTThreadUserReset(hThread);
4356 break;
4357 }
4358
4359 case kTscDeltaThreadState_WaitAndMeasure:
4360 {
4361 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Measuring;
4362 rc = RTSemEventSignal(pDevExt->hTscDeltaEvent); /* (Safe on windows as long as spinlock isn't IRQ safe.) */
4363 if (RT_FAILURE(rc))
4364 return supdrvTscDeltaThreadButchered(pDevExt, true /* fSpinlockHeld */, "RTSemEventSignal", rc);
4365 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4366 RTThreadSleep(1);
4367 RT_FALL_THRU();
4368 }
4369
 /* NOTE(review): when this case is entered by falling through from
    WaitAndMeasure the spinlock has already been released.  If the switch
    ever dispatched here directly, the lock taken at the top of the loop
    would still be held and be acquired a second time further down.  In
    the code visible here only the WaitAndMeasure case sets this state, and
    it is left again before the next iteration - confirm no other writer
    exists. */
4370 case kTscDeltaThreadState_Measuring:
4371 {
4372 if (pDevExt->fTscThreadRecomputeAllDeltas)
4373 {
4374 int cTries = 8;
4375 int cMsWaitPerTry = 10;
4376 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
4377 Assert(pGip);
 /* Redo everything: retry on VERR_TRY_AGAIN and VERR_CPU_OFFLINE,
    stop on success or any other failure.  With the post-decrement
    test below this makes at most nine attempts. */
4378 do
4379 {
4380 RTCpuSetCopy(&pDevExt->TscDeltaCpuSet, &pGip->OnlineCpuSet);
4381 rc = supdrvTscMeasureInitialDeltas(pDevExt);
4382 if ( RT_SUCCESS(rc)
4383 || ( RT_FAILURE(rc)
4384 && rc != VERR_TRY_AGAIN
4385 && rc != VERR_CPU_OFFLINE))
4386 {
4387 break;
4388 }
4389 RTThreadSleep(cMsWaitPerTry);
4390 } while (cTries-- > 0);
4391 pDevExt->fTscThreadRecomputeAllDeltas = false;
4392 }
4393 else
4394 {
4395 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
4396 unsigned iCpu;
4397
4398 /* Measure TSC-deltas only for the CPUs that are in the set. */
4399 rc = VINF_SUCCESS;
4400 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
4401 {
4402 PSUPGIPCPU pGipCpuWorker = &pGip->aCPUs[iCpu];
4403 if (RTCpuSetIsMemberByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet))
4404 {
4405 if (pGipCpuWorker->i64TSCDelta == INT64_MAX)
4406 {
 /* Keep going on failure, but remember the first error. */
4407 int rc2 = supdrvTscMeasureDeltaOne(pDevExt, iCpu);
4408 if (RT_FAILURE(rc2) && RT_SUCCESS(rc))
4409 rc = rc2;
4410 }
4411 else
4412 {
4413 /*
 * The thread/someone must've called SUPR0TscDeltaMeasureBySetIndex(),
 * mark the delta as fine to get the timer thread off our back.
 */
4417 RTCpuSetDelByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet);
4418 RTCpuSetAddByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpuWorker->iCpuSet);
4419 }
4420 }
4421 }
4422 }
 /* Go back to listening unless a new request changed the state while
    we were busy measuring. */
4423 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4424 if (pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Measuring)
4425 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Listening;
4426 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4427 Assert(rc != VERR_NOT_AVAILABLE); /* VERR_NOT_AVAILABLE is used as init value, see supdrvTscDeltaThreadInit(). */
4428 ASMAtomicWriteS32(&pDevExt->rcTscDelta, rc);
4429 break;
4430 }
4431
4432 case kTscDeltaThreadState_Terminating:
4433 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Destroyed;
4434 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4435 return VINF_SUCCESS;
4436
4437 case kTscDeltaThreadState_Butchered:
4438 default:
4439 return supdrvTscDeltaThreadButchered(pDevExt, true /* fSpinlockHeld */, "Invalid state", VERR_INVALID_STATE);
4440 }
4441 }
4442 /* not reached */
4443}
4444
4445
4446/**
4447 * Waits for the TSC-delta measurement thread to respond to a state change.
4448 *
4449 * @returns VINF_SUCCESS on success, VERR_TIMEOUT if it doesn't respond in time,
4450 * other error code on internal error.
4451 *
4452 * @param pDevExt The device instance data.
4453 * @param enmCurState The current state.
4454 * @param enmNewState The new state we're waiting for it to enter.
4455 */
4456static int supdrvTscDeltaThreadWait(PSUPDRVDEVEXT pDevExt, SUPDRVTSCDELTATHREADSTATE enmCurState,
4457 SUPDRVTSCDELTATHREADSTATE enmNewState)
4458{
4459 SUPDRVTSCDELTATHREADSTATE enmActualState;
4460 int rc;
4461
4462 /*
4463 * Wait a short while for the expected state transition.
4464 */
4465 RTSemEventWait(pDevExt->hTscDeltaEvent, RT_MS_1SEC);
4466 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4467 enmActualState = pDevExt->enmTscDeltaThreadState;
4468 if (enmActualState == enmNewState)
4469 {
4470 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4471 rc = VINF_SUCCESS;
4472 }
4473 else if (enmActualState == enmCurState)
4474 {
4475 /*
4476 * Wait longer if the state has not yet transitioned to the one we want.
4477 */
4478 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4479 rc = RTSemEventWait(pDevExt->hTscDeltaEvent, 50 * RT_MS_1SEC);
4480 if ( RT_SUCCESS(rc)
4481 || rc == VERR_TIMEOUT)
4482 {
4483 /*
4484 * Check the state whether we've succeeded.
4485 */
4486 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4487 enmActualState = pDevExt->enmTscDeltaThreadState;
4488 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4489 if (enmActualState == enmNewState)
4490 rc = VINF_SUCCESS;
4491 else if (enmActualState == enmCurState)
4492 {
4493 rc = VERR_TIMEOUT;
4494 OSDBGPRINT(("supdrvTscDeltaThreadWait: timed out state transition. enmActualState=%d enmNewState=%d\n",
4495 enmActualState, enmNewState));
4496 }
4497 else
4498 {
4499 rc = VERR_INTERNAL_ERROR;
4500 OSDBGPRINT(("supdrvTscDeltaThreadWait: invalid state transition from %d to %d, expected %d\n", enmCurState,
4501 enmActualState, enmNewState));
4502 }
4503 }
4504 else
4505 OSDBGPRINT(("supdrvTscDeltaThreadWait: RTSemEventWait failed. rc=%Rrc\n", rc));
4506 }
4507 else
4508 {
4509 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4510 OSDBGPRINT(("supdrvTscDeltaThreadWait: invalid state %d when transitioning from %d to %d\n",
4511 enmActualState, enmCurState, enmNewState));
4512 rc = VERR_INTERNAL_ERROR;
4513 }
4514
4515 return rc;
4516}
4517
4518
4519/**
4520 * Signals the TSC-delta thread to start measuring TSC-deltas.
4521 *
4522 * @param pDevExt Pointer to the device instance data.
4523 * @param fForceAll Force re-calculating TSC-deltas on all CPUs.
4524 */
4525static void supdrvTscDeltaThreadStartMeasurement(PSUPDRVDEVEXT pDevExt, bool fForceAll)
4526{
4527 if (pDevExt->hTscDeltaThread != NIL_RTTHREAD)
4528 {
4529 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4530 if ( pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Listening
4531 || pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Measuring)
4532 {
4533 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_WaitAndMeasure;
4534 if (fForceAll)
4535 pDevExt->fTscThreadRecomputeAllDeltas = true;
4536 }
4537 else if ( pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_WaitAndMeasure
4538 && fForceAll)
4539 pDevExt->fTscThreadRecomputeAllDeltas = true;
4540 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4541 RTThreadUserSignal(pDevExt->hTscDeltaThread);
4542 }
4543}
4544
4545
4546/**
4547 * Terminates the actual thread running supdrvTscDeltaThread().
4548 *
4549 * This is an internal worker function for supdrvTscDeltaThreadInit() and
4550 * supdrvTscDeltaTerm().
4551 *
4552 * @param pDevExt Pointer to the device instance data.
4553 */
4554static void supdrvTscDeltaThreadTerminate(PSUPDRVDEVEXT pDevExt)
4555{
4556 int rc;
4557 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4558 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Terminating;
4559 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4560 RTThreadUserSignal(pDevExt->hTscDeltaThread);
4561 rc = RTThreadWait(pDevExt->hTscDeltaThread, 50 * RT_MS_1SEC, NULL /* prc */);
4562 if (RT_FAILURE(rc))
4563 {
4564 /* Signal a few more times before giving up. */
4565 int cTriesLeft = 5;
4566 while (--cTriesLeft > 0)
4567 {
4568 RTThreadUserSignal(pDevExt->hTscDeltaThread);
4569 rc = RTThreadWait(pDevExt->hTscDeltaThread, 2 * RT_MS_1SEC, NULL /* prc */);
4570 if (rc != VERR_TIMEOUT)
4571 break;
4572 }
4573 }
4574}
4575
4576
4577/**
4578 * Initializes and spawns the TSC-delta measurement thread.
4579 *
4580 * A thread is required for servicing re-measurement requests from events like
4581 * CPUs coming online, suspend/resume etc. as it cannot be done synchronously
4582 * under all contexts on all OSs.
4583 *
4584 * @returns VBox status code.
4585 * @param pDevExt Pointer to the device instance data.
4586 *
4587 * @remarks Must only be called -after- initializing GIP and setting up MP
4588 * notifications!
4589 */
4590static int supdrvTscDeltaThreadInit(PSUPDRVDEVEXT pDevExt)
4591{
4592 int rc;
4593 Assert(pDevExt->pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED);
4594 rc = RTSpinlockCreate(&pDevExt->hTscDeltaSpinlock, RTSPINLOCK_FLAGS_INTERRUPT_UNSAFE, "VBoxTscSpnLck");
4595 if (RT_SUCCESS(rc))
4596 {
4597 rc = RTSemEventCreate(&pDevExt->hTscDeltaEvent);
4598 if (RT_SUCCESS(rc))
4599 {
4600 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Creating;
4601 pDevExt->cMsTscDeltaTimeout = 60000;
4602 rc = RTThreadCreate(&pDevExt->hTscDeltaThread, supdrvTscDeltaThread, pDevExt, 0 /* cbStack */,
4603 RTTHREADTYPE_DEFAULT, RTTHREADFLAGS_WAITABLE, "VBoxTscThread");
4604 if (RT_SUCCESS(rc))
4605 {
4606 rc = supdrvTscDeltaThreadWait(pDevExt, kTscDeltaThreadState_Creating, kTscDeltaThreadState_Listening);
4607 if (RT_SUCCESS(rc))
4608 {
4609 ASMAtomicWriteS32(&pDevExt->rcTscDelta, VERR_NOT_AVAILABLE);
4610 return rc;
4611 }
4612
4613 OSDBGPRINT(("supdrvTscDeltaInit: supdrvTscDeltaThreadWait failed. rc=%Rrc\n", rc));
4614 supdrvTscDeltaThreadTerminate(pDevExt);
4615 }
4616 else
4617 OSDBGPRINT(("supdrvTscDeltaInit: RTThreadCreate failed. rc=%Rrc\n", rc));
4618 RTSemEventDestroy(pDevExt->hTscDeltaEvent);
4619 pDevExt->hTscDeltaEvent = NIL_RTSEMEVENT;
4620 }
4621 else
4622 OSDBGPRINT(("supdrvTscDeltaInit: RTSemEventCreate failed. rc=%Rrc\n", rc));
4623 RTSpinlockDestroy(pDevExt->hTscDeltaSpinlock);
4624 pDevExt->hTscDeltaSpinlock = NIL_RTSPINLOCK;
4625 }
4626 else
4627 OSDBGPRINT(("supdrvTscDeltaInit: RTSpinlockCreate failed. rc=%Rrc\n", rc));
4628
4629 return rc;
4630}
4631
4632
4633/**
4634 * Terminates the TSC-delta measurement thread and cleanup.
4635 *
4636 * @param pDevExt Pointer to the device instance data.
4637 */
4638static void supdrvTscDeltaTerm(PSUPDRVDEVEXT pDevExt)
4639{
4640 if ( pDevExt->hTscDeltaSpinlock != NIL_RTSPINLOCK
4641 && pDevExt->hTscDeltaEvent != NIL_RTSEMEVENT)
4642 {
4643 supdrvTscDeltaThreadTerminate(pDevExt);
4644 }
4645
4646 if (pDevExt->hTscDeltaSpinlock != NIL_RTSPINLOCK)
4647 {
4648 RTSpinlockDestroy(pDevExt->hTscDeltaSpinlock);
4649 pDevExt->hTscDeltaSpinlock = NIL_RTSPINLOCK;
4650 }
4651
4652 if (pDevExt->hTscDeltaEvent != NIL_RTSEMEVENT)
4653 {
4654 RTSemEventDestroy(pDevExt->hTscDeltaEvent);
4655 pDevExt->hTscDeltaEvent = NIL_RTSEMEVENT;
4656 }
4657
4658 ASMAtomicWriteS32(&pDevExt->rcTscDelta, VERR_NOT_AVAILABLE);
4659}
4660
4661#endif /* SUPDRV_USE_TSC_DELTA_THREAD */
4662
4663/**
4664 * Measure the TSC delta for the CPU given by its CPU set index.
4665 *
4666 * @returns VBox status code.
4667 * @retval VERR_INTERRUPTED if interrupted while waiting.
4668 * @retval VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED if we were unable to get a
4669 * measurement.
4670 * @retval VERR_CPU_OFFLINE if the specified CPU is offline.
4671 *
4672 * @param pSession The caller's session. GIP must've been mapped.
4673 * @param iCpuSet The CPU set index of the CPU to measure.
4674 * @param fFlags Flags, SUP_TSCDELTA_MEASURE_F_XXX.
4675 * @param cMsWaitRetry Number of milliseconds to wait between each retry.
4676 * @param cMsWaitThread Number of milliseconds to wait for the thread to get
4677 * ready.
4678 * @param cTries Number of times to try, pass 0 for the default.
4679 */
4680SUPR0DECL(int) SUPR0TscDeltaMeasureBySetIndex(PSUPDRVSESSION pSession, uint32_t iCpuSet, uint32_t fFlags,
4681 RTMSINTERVAL cMsWaitRetry, RTMSINTERVAL cMsWaitThread, uint32_t cTries)
4682{
4683 PSUPDRVDEVEXT pDevExt;
4684 PSUPGLOBALINFOPAGE pGip;
4685 uint16_t iGipCpu;
4686 int rc;
4687#ifdef SUPDRV_USE_TSC_DELTA_THREAD
4688 uint64_t msTsStartWait;
4689 uint32_t iWaitLoop;
4690#endif
4691
4692 /*
4693 * Validate and adjust the input.
4694 */
4695 AssertReturn(SUP_IS_SESSION_VALID(pSession), VERR_INVALID_PARAMETER);
4696 if (!pSession->fGipReferenced)
4697 return VERR_WRONG_ORDER;
4698
4699 pDevExt = pSession->pDevExt;
4700 AssertReturn(SUP_IS_DEVEXT_VALID(pDevExt), VERR_INVALID_PARAMETER);
4701
4702 pGip = pDevExt->pGip;
4703 AssertPtrReturn(pGip, VERR_INTERNAL_ERROR_2);
4704
4705 AssertReturn(iCpuSet < RTCPUSET_MAX_CPUS, VERR_INVALID_CPU_INDEX);
4706 AssertReturn(iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx), VERR_INVALID_CPU_INDEX);
4707 iGipCpu = pGip->aiCpuFromCpuSetIdx[iCpuSet];
4708 AssertReturn(iGipCpu < pGip->cCpus, VERR_INVALID_CPU_INDEX);
4709
4710 if (fFlags & ~SUP_TSCDELTA_MEASURE_F_VALID_MASK)
4711 return VERR_INVALID_FLAGS;
4712
4713 /*
4714 * The request is a noop if the TSC delta isn't being used.
4715 */
4716 if (pGip->enmUseTscDelta <= SUPGIPUSETSCDELTA_ZERO_CLAIMED)
4717 return VINF_SUCCESS;
4718
4719 if (cTries == 0)
4720 cTries = 12;
4721 else if (cTries > 256)
4722 cTries = 256;
4723
4724 if (cMsWaitRetry == 0)
4725 cMsWaitRetry = 2;
4726 else if (cMsWaitRetry > 1000)
4727 cMsWaitRetry = 1000;
4728
4729#ifdef SUPDRV_USE_TSC_DELTA_THREAD
4730 /*
4731 * Has the TSC already been measured and we're not forced to redo it?
4732 */
4733 if ( pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX
4734 && !(fFlags & SUP_TSCDELTA_MEASURE_F_FORCE))
4735 return VINF_SUCCESS;
4736
4737 /*
4738 * Asynchronous request? Forward it to the thread, no waiting.
4739 */
4740 if (fFlags & SUP_TSCDELTA_MEASURE_F_ASYNC)
4741 {
4742 /** @todo Async. doesn't implement options like retries, waiting. We'll need
4743 * to pass those options to the thread somehow and implement it in the
4744 * thread. Check if anyone uses/needs fAsync before implementing this. */
4745 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4746 RTCpuSetAddByIndex(&pDevExt->TscDeltaCpuSet, iCpuSet);
4747 if ( pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Listening
4748 || pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Measuring)
4749 {
4750 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_WaitAndMeasure;
4751 rc = VINF_SUCCESS;
4752 }
4753 else if (pDevExt->enmTscDeltaThreadState != kTscDeltaThreadState_WaitAndMeasure)
4754 rc = VERR_THREAD_IS_DEAD;
4755 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4756 RTThreadUserSignal(pDevExt->hTscDeltaThread);
4757 return VINF_SUCCESS;
4758 }
4759
4760 /*
4761 * If a TSC-delta measurement request is already being serviced by the thread,
4762 * wait 'cTries' times if a retry-timeout is provided, otherwise bail as busy.
4763 */
4764 msTsStartWait = RTTimeSystemMilliTS();
4765 for (iWaitLoop = 0;; iWaitLoop++)
4766 {
4767 uint64_t cMsElapsed;
4768 SUPDRVTSCDELTATHREADSTATE enmState;
4769 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4770 enmState = pDevExt->enmTscDeltaThreadState;
4771 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4772
4773 if (enmState == kTscDeltaThreadState_Measuring)
4774 { /* Must wait, the thread is busy. */ }
4775 else if (enmState == kTscDeltaThreadState_WaitAndMeasure)
4776 { /* Must wait, this state only says what will happen next. */ }
4777 else if (enmState == kTscDeltaThreadState_Terminating)
4778 { /* Must wait, this state only says what should happen next. */ }
4779 else
4780 break; /* All other states, the thread is either idly listening or dead. */
4781
4782 /* Wait or fail. */
4783 if (cMsWaitThread == 0)
4784 return VERR_SUPDRV_TSC_DELTA_MEASUREMENT_BUSY;
4785 cMsElapsed = RTTimeSystemMilliTS() - msTsStartWait;
4786 if (cMsElapsed >= cMsWaitThread)
4787 return VERR_SUPDRV_TSC_DELTA_MEASUREMENT_BUSY;
4788
4789 rc = RTThreadSleep(RT_MIN((RTMSINTERVAL)(cMsWaitThread - cMsElapsed), RT_MIN(iWaitLoop + 1, 10)));
4790 if (rc == VERR_INTERRUPTED)
4791 return rc;
4792 }
4793#endif /* SUPDRV_USE_TSC_DELTA_THREAD */
4794
4795 /*
4796 * Try measure the TSC delta the given number of times.
4797 */
4798 for (;;)
4799 {
4800 /* Unless we're forced to measure the delta, check whether it's done already. */
4801 if ( !(fFlags & SUP_TSCDELTA_MEASURE_F_FORCE)
4802 && pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX)
4803 {
4804 rc = VINF_SUCCESS;
4805 break;
4806 }
4807
4808 /* Measure it. */
4809 rc = supdrvTscMeasureDeltaOne(pDevExt, iGipCpu);
4810 if (rc != VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED)
4811 {
4812 Assert(pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX || RT_FAILURE_NP(rc));
4813 break;
4814 }
4815
4816 /* Retry? */
4817 if (cTries <= 1)
4818 break;
4819 cTries--;
4820
4821 /* Always delay between retries (be nice to the rest of the system
4822 and avoid the BSOD hounds). */
4823 rc = RTThreadSleep(cMsWaitRetry);
4824 if (rc == VERR_INTERRUPTED)
4825 break;
4826 }
4827
4828 return rc;
4829}
4830
4831
4832/**
4833 * Service a TSC-delta measurement request.
4834 *
4835 * @returns VBox status code.
4836 * @param pDevExt Pointer to the device instance data.
4837 * @param pSession The support driver session.
4838 * @param pReq Pointer to the TSC-delta measurement request.
4839 */
4840int VBOXCALL supdrvIOCtl_TscDeltaMeasure(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, PSUPTSCDELTAMEASURE pReq)
4841{
4842 uint32_t cTries;
4843 uint32_t iCpuSet;
4844 uint32_t fFlags;
4845 RTMSINTERVAL cMsWaitRetry;
4846 RT_NOREF1(pDevExt);
4847
4848 /*
4849 * Validate and adjust/resolve the input so they can be passed onto SUPR0TscDeltaMeasureBySetIndex.
4850 */
4851 AssertPtr(pDevExt); AssertPtr(pSession); AssertPtr(pReq); /* paranoia^2 */
4852
4853 if (pReq->u.In.idCpu == NIL_RTCPUID)
4854 return VERR_INVALID_CPU_ID;
4855 iCpuSet = RTMpCpuIdToSetIndex(pReq->u.In.idCpu);
4856 if (iCpuSet >= RTCPUSET_MAX_CPUS)
4857 return VERR_INVALID_CPU_ID;
4858
4859 cTries = pReq->u.In.cRetries == 0 ? 0 : (uint32_t)pReq->u.In.cRetries + 1;
4860
4861 cMsWaitRetry = RT_MAX(pReq->u.In.cMsWaitRetry, 5);
4862
4863 fFlags = 0;
4864 if (pReq->u.In.fAsync)
4865 fFlags |= SUP_TSCDELTA_MEASURE_F_ASYNC;
4866 if (pReq->u.In.fForce)
4867 fFlags |= SUP_TSCDELTA_MEASURE_F_FORCE;
4868
4869 return SUPR0TscDeltaMeasureBySetIndex(pSession, iCpuSet, fFlags, cMsWaitRetry,
4870 cTries == 0 ? 5 * RT_MS_1SEC : cMsWaitRetry * cTries /*cMsWaitThread*/,
4871 cTries);
4872}
4873
4874
4875/**
4876 * Reads TSC with delta applied.
4877 *
4878 * Will try to resolve delta value INT64_MAX before applying it. This is the
4879 * main purpose of this function, to handle the case where the delta needs to be
4880 * determined.
4881 *
4882 * @returns VBox status code.
4883 * @param pDevExt Pointer to the device instance data.
4884 * @param pSession The support driver session.
4885 * @param pReq Pointer to the TSC-read request.
4886 */
4887int VBOXCALL supdrvIOCtl_TscRead(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, PSUPTSCREAD pReq)
4888{
4889 PSUPGLOBALINFOPAGE pGip;
4890 int rc;
4891
4892 /*
4893 * Validate. We require the client to have mapped GIP (no asserting on
4894 * ring-3 preconditions).
4895 */
4896 AssertPtr(pDevExt); AssertPtr(pReq); AssertPtr(pSession); /* paranoia^2 */
4897 if (pSession->GipMapObjR3 == NIL_RTR0MEMOBJ)
4898 return VERR_WRONG_ORDER;
4899 pGip = pDevExt->pGip;
4900 AssertReturn(pGip, VERR_INTERNAL_ERROR_2);
4901
4902 /*
4903 * We're usually here because we need to apply delta, but we shouldn't be
4904 * upset if the GIP is some different mode.
4905 */
4906 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
4907 {
4908 uint32_t cTries = 0;
4909 for (;;)
4910 {
4911 /*
4912 * Start by gathering the data, using CLI for disabling preemption
4913 * while we do that.
4914 */
4915 RTCCUINTREG fEFlags = ASMIntDisableFlags();
4916 int iCpuSet = RTMpCpuIdToSetIndex(RTMpCpuId());
4917 int iGipCpu = 0; /* gcc maybe used uninitialized */
4918 if (RT_LIKELY( (unsigned)iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
4919 && (iGipCpu = pGip->aiCpuFromCpuSetIdx[iCpuSet]) < pGip->cCpus ))
4920 {
4921 int64_t i64Delta = pGip->aCPUs[iGipCpu].i64TSCDelta;
4922 pReq->u.Out.idApic = pGip->aCPUs[iGipCpu].idApic;
4923 pReq->u.Out.u64AdjustedTsc = ASMReadTSC();
4924 ASMSetFlags(fEFlags);
4925
4926 /*
4927 * If we're lucky we've got a delta, but no predictions here
4928 * as this I/O control is normally only used when the TSC delta
4929 * is set to INT64_MAX.
4930 */
4931 if (i64Delta != INT64_MAX)
4932 {
4933 pReq->u.Out.u64AdjustedTsc -= i64Delta;
4934 rc = VINF_SUCCESS;
4935 break;
4936 }
4937
4938 /* Give up after a few times. */
4939 if (cTries >= 4)
4940 {
4941 rc = VWRN_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED;
4942 break;
4943 }
4944
4945 /* Need to measure the delta an try again. */
4946 rc = supdrvTscMeasureDeltaOne(pDevExt, iGipCpu);
4947 Assert(pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX || RT_FAILURE_NP(rc));
4948 /** @todo should probably delay on failure... dpc watchdogs */
4949 }
4950 else
4951 {
4952 /* This really shouldn't happen. */
4953 AssertMsgFailed(("idCpu=%#x iCpuSet=%#x (%d)\n", RTMpCpuId(), iCpuSet, iCpuSet));
4954 pReq->u.Out.idApic = supdrvGipGetApicIdSlow();
4955 pReq->u.Out.u64AdjustedTsc = ASMReadTSC();
4956 ASMSetFlags(fEFlags);
4957 rc = VERR_INTERNAL_ERROR_5; /** @todo change to warning. */
4958 break;
4959 }
4960 }
4961 }
4962 else
4963 {
4964 /*
4965 * No delta to apply. Easy. Deal with preemption the lazy way.
4966 */
4967 RTCCUINTREG fEFlags = ASMIntDisableFlags();
4968 int iCpuSet = RTMpCpuIdToSetIndex(RTMpCpuId());
4969 int iGipCpu = 0; /* gcc may be used uninitialized */
4970 if (RT_LIKELY( (unsigned)iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
4971 && (iGipCpu = pGip->aiCpuFromCpuSetIdx[iCpuSet]) < pGip->cCpus ))
4972 pReq->u.Out.idApic = pGip->aCPUs[iGipCpu].idApic;
4973 else
4974 pReq->u.Out.idApic = supdrvGipGetApicIdSlow();
4975 pReq->u.Out.u64AdjustedTsc = ASMReadTSC();
4976 ASMSetFlags(fEFlags);
4977 rc = VINF_SUCCESS;
4978 }
4979
4980 return rc;
4981}
4982
4983
4984/**
4985 * Worker for supdrvIOCtl_GipSetFlags.
4986 *
4987 * @returns VBox status code.
4988 * @retval VERR_WRONG_ORDER if an enable-once-per-session flag is set again for
4989 * a session.
4990 *
4991 * @param pDevExt Pointer to the device instance data.
4992 * @param pSession The support driver session.
4993 * @param fOrMask The OR mask of the GIP flags, see SUPGIP_FLAGS_XXX.
4994 * @param fAndMask The AND mask of the GIP flags, see SUPGIP_FLAGS_XXX.
4995 *
4996 * @remarks Caller must own the GIP mutex.
4997 *
4998 * @remarks This function doesn't validate any of the flags.
4999 */
5000static int supdrvGipSetFlags(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, uint32_t fOrMask, uint32_t fAndMask)
5001{
5002 uint32_t cRefs;
5003 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
5004 AssertMsg((fOrMask & fAndMask) == fOrMask, ("%#x & %#x\n", fOrMask, fAndMask)); /* ASSUMED by code below */
5005
5006 /*
5007 * Compute GIP test-mode flags.
5008 */
5009 if (fOrMask & SUPGIP_FLAGS_TESTING_ENABLE)
5010 {
5011 if (!pSession->fGipTestMode)
5012 {
5013 Assert(pDevExt->cGipTestModeRefs < _64K);
5014 pSession->fGipTestMode = true;
5015 cRefs = ++pDevExt->cGipTestModeRefs;
5016 if (cRefs == 1)
5017 {
5018 fOrMask |= SUPGIP_FLAGS_TESTING | SUPGIP_FLAGS_TESTING_START;
5019 fAndMask &= ~SUPGIP_FLAGS_TESTING_STOP;
5020 }
5021 }
5022 else
5023 {
5024 LogRelMax(10, ("supdrvGipSetFlags: SUPGIP_FLAGS_TESTING_ENABLE already set for this session\n"));
5025 return VERR_WRONG_ORDER;
5026 }
5027 }
5028 else if ( !(fAndMask & SUPGIP_FLAGS_TESTING_ENABLE)
5029 && pSession->fGipTestMode)
5030 {
5031 Assert(pDevExt->cGipTestModeRefs > 0);
5032 Assert(pDevExt->cGipTestModeRefs < _64K);
5033 pSession->fGipTestMode = false;
5034 cRefs = --pDevExt->cGipTestModeRefs;
5035 if (!cRefs)
5036 fOrMask |= SUPGIP_FLAGS_TESTING_STOP;
5037 else
5038 fAndMask |= SUPGIP_FLAGS_TESTING_ENABLE;
5039 }
5040
5041 /*
5042 * Commit the flags. This should be done as atomically as possible
5043 * since the flag consumers won't be holding the GIP mutex.
5044 */
5045 ASMAtomicOrU32(&pGip->fFlags, fOrMask);
5046 ASMAtomicAndU32(&pGip->fFlags, fAndMask);
5047
5048 return VINF_SUCCESS;
5049}
5050
5051
5052/**
5053 * Sets GIP test mode parameters.
5054 *
5055 * @returns VBox status code.
5056 * @param pDevExt Pointer to the device instance data.
5057 * @param pSession The support driver session.
5058 * @param fOrMask The OR mask of the GIP flags, see SUPGIP_FLAGS_XXX.
5059 * @param fAndMask The AND mask of the GIP flags, see SUPGIP_FLAGS_XXX.
5060 */
5061int VBOXCALL supdrvIOCtl_GipSetFlags(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, uint32_t fOrMask, uint32_t fAndMask)
5062{
5063 PSUPGLOBALINFOPAGE pGip;
5064 int rc;
5065
5066 /*
5067 * Validate. We require the client to have mapped GIP (no asserting on
5068 * ring-3 preconditions).
5069 */
5070 AssertPtr(pDevExt); AssertPtr(pSession); /* paranoia^2 */
5071 if (pSession->GipMapObjR3 == NIL_RTR0MEMOBJ)
5072 return VERR_WRONG_ORDER;
5073 pGip = pDevExt->pGip;
5074 AssertReturn(pGip, VERR_INTERNAL_ERROR_3);
5075
5076 if (fOrMask & ~SUPGIP_FLAGS_VALID_MASK)
5077 return VERR_INVALID_PARAMETER;
5078 if ((fAndMask & ~SUPGIP_FLAGS_VALID_MASK) != ~SUPGIP_FLAGS_VALID_MASK)
5079 return VERR_INVALID_PARAMETER;
5080
5081 /*
5082 * Don't confuse supdrvGipSetFlags or anyone else by both setting
5083 * and clearing the same flags. AND takes precedence.
5084 */
5085 fOrMask &= fAndMask;
5086
5087 /*
5088 * Take the loader lock to avoid having to think about races between two
5089 * clients changing the flags at the same time (state is not simple).
5090 */
5091#ifdef SUPDRV_USE_MUTEX_FOR_GIP
5092 RTSemMutexRequest(pDevExt->mtxGip, RT_INDEFINITE_WAIT);
5093#else
5094 RTSemFastMutexRequest(pDevExt->mtxGip);
5095#endif
5096
5097 rc = supdrvGipSetFlags(pDevExt, pSession, fOrMask, fAndMask);
5098
5099#ifdef SUPDRV_USE_MUTEX_FOR_GIP
5100 RTSemMutexRelease(pDevExt->mtxGip);
5101#else
5102 RTSemFastMutexRelease(pDevExt->mtxGip);
5103#endif
5104 return rc;
5105}
5106
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette