VirtualBox

source: vbox/trunk/src/VBox/HostDrivers/Support/SUPDrvGip.cpp@ 100838

Last change on this file since 100838 was 100357, checked in by vboxsync, 19 months ago

Runtime/RTR0MemObj*: Add PhysHighest parameter to RTR0MemObjAllocCont to indicate the maximum allowed physical address for an allocation, bugref:10457 [second attempt]

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 186.7 KB
Line 
1/* $Id: SUPDrvGip.cpp 100357 2023-07-04 07:00:26Z vboxsync $ */
2/** @file
3 * VBoxDrv - The VirtualBox Support Driver - Common code for GIP.
4 */
5
6/*
7 * Copyright (C) 2006-2023 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * The contents of this file may alternatively be used under the terms
26 * of the Common Development and Distribution License Version 1.0
27 * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
28 * in the VirtualBox distribution, in which case the provisions of the
29 * CDDL are applicable instead of those of the GPL.
30 *
31 * You may elect to license modified versions of this file under the
32 * terms and conditions of either the GPL or the CDDL or both.
33 *
34 * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
35 */
36
37
38/*********************************************************************************************************************************
39* Header Files *
40*********************************************************************************************************************************/
41#define LOG_GROUP LOG_GROUP_SUP_DRV
42#define SUPDRV_AGNOSTIC
43#include "SUPDrvInternal.h"
44#ifndef PAGE_SHIFT
45# include <iprt/param.h>
46#endif
47#include <iprt/asm.h>
48#include <iprt/asm-amd64-x86.h>
49#include <iprt/asm-math.h>
50#include <iprt/cpuset.h>
51#include <iprt/handletable.h>
52#include <iprt/mem.h>
53#include <iprt/mp.h>
54#include <iprt/power.h>
55#include <iprt/process.h>
56#include <iprt/semaphore.h>
57#include <iprt/spinlock.h>
58#include <iprt/thread.h>
59#include <iprt/uuid.h>
60#include <iprt/net.h>
61#include <iprt/crc.h>
62#include <iprt/string.h>
63#include <iprt/timer.h>
64#if defined(RT_OS_DARWIN) || defined(RT_OS_SOLARIS) || defined(RT_OS_FREEBSD)
65# include <iprt/rand.h>
66# include <iprt/path.h>
67#endif
68#include <iprt/uint128.h>
69#include <iprt/x86.h>
70
71#include <VBox/param.h>
72#include <VBox/log.h>
73#include <VBox/err.h>
74
75#if defined(RT_OS_SOLARIS) || defined(RT_OS_DARWIN)
76# include "dtrace/SUPDrv.h"
77#else
78/* ... */
79#endif
80
81
82/*********************************************************************************************************************************
83* Defined Constants And Macros *
84*********************************************************************************************************************************/
85/** The frequency by which we recalculate the u32UpdateHz and
86 * u32UpdateIntervalNS GIP members. The value must be a power of 2.
87 *
88 * Warning: Bumping this too high might overflow u32UpdateIntervalNS.
89 */
90#define GIP_UPDATEHZ_RECALC_FREQ 0x800
91
92/** A reserved TSC value used for synchronization as well as measurement of
93 * TSC deltas. */
94#define GIP_TSC_DELTA_RSVD UINT64_MAX
95/** The number of TSC delta measurement loops in total (includes primer and
96 * read-time loops). */
97#define GIP_TSC_DELTA_LOOPS 96
98/** The number of cache primer loops. */
99#define GIP_TSC_DELTA_PRIMER_LOOPS 4
100/** The number of loops until we keep computing the minimum read time. */
101#define GIP_TSC_DELTA_READ_TIME_LOOPS 24
102
103/** The TSC frequency refinement period in seconds.
104 * The timer fires after 200ms, then every second, this value just says when
105 * to stop it after that. */
106#define GIP_TSC_REFINE_PERIOD_IN_SECS 12
107/** The TSC-delta threshold for the SUPGIPUSETSCDELTA_PRACTICALLY_ZERO rating */
108#define GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO 32
109/** The TSC-delta threshold for the SUPGIPUSETSCDELTA_ROUGHLY_ZERO rating */
110#define GIP_TSC_DELTA_THRESHOLD_ROUGHLY_ZERO 448
111/** The TSC delta value for the initial GIP master - 0 in regular builds.
112 * To test the delta code this can be set to a non-zero value. */
113#if 0
114# define GIP_TSC_DELTA_INITIAL_MASTER_VALUE INT64_C(170139095182512) /* 0x00009abd9854acb0 */
115#else
116# define GIP_TSC_DELTA_INITIAL_MASTER_VALUE INT64_C(0)
117#endif
118
119AssertCompile(GIP_TSC_DELTA_PRIMER_LOOPS < GIP_TSC_DELTA_READ_TIME_LOOPS);
120AssertCompile(GIP_TSC_DELTA_PRIMER_LOOPS + GIP_TSC_DELTA_READ_TIME_LOOPS < GIP_TSC_DELTA_LOOPS);
121
122/** @def VBOX_SVN_REV
123 * The makefile should define this if it can. */
124#ifndef VBOX_SVN_REV
125# define VBOX_SVN_REV 0
126#endif
127
128#if 0 /* Don't start the GIP timers. Useful when debugging the IPRT timer code. */
129# define DO_NOT_START_GIP
130#endif
131
132
133/*********************************************************************************************************************************
134* Internal Functions *
135*********************************************************************************************************************************/
136static DECLCALLBACK(void) supdrvGipSyncAndInvariantTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick);
137static DECLCALLBACK(void) supdrvGipAsyncTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick);
138static int supdrvGipSetFlags(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, uint32_t fOrMask, uint32_t fAndMask);
139static void supdrvGipInitCpu(PSUPGLOBALINFOPAGE pGip, PSUPGIPCPU pCpu, uint64_t u64NanoTS, uint64_t uCpuHz);
140static void supdrvTscResetSamples(PSUPDRVDEVEXT pDevExt, bool fClearDeltas);
141#ifdef SUPDRV_USE_TSC_DELTA_THREAD
142static int supdrvTscDeltaThreadInit(PSUPDRVDEVEXT pDevExt);
143static void supdrvTscDeltaTerm(PSUPDRVDEVEXT pDevExt);
144static void supdrvTscDeltaThreadStartMeasurement(PSUPDRVDEVEXT pDevExt, bool fForceAll);
145#else
146static int supdrvTscMeasureInitialDeltas(PSUPDRVDEVEXT pDevExt);
147static int supdrvTscMeasureDeltaOne(PSUPDRVDEVEXT pDevExt, uint32_t idxWorker);
148#endif
149
150
151/*********************************************************************************************************************************
152* Global Variables *
153*********************************************************************************************************************************/
154DECLEXPORT(PSUPGLOBALINFOPAGE) g_pSUPGlobalInfoPage = NULL;
155SUPR0_EXPORT_SYMBOL(g_pSUPGlobalInfoPage);
156
157
158
159/*
160 *
161 * Misc Common GIP Code
162 * Misc Common GIP Code
163 * Misc Common GIP Code
164 *
165 *
166 */
167
168
169/**
170 * Finds the GIP CPU index corresponding to @a idCpu.
171 *
172 * @returns GIP CPU array index, UINT32_MAX if not found.
173 * @param pGip The GIP.
174 * @param idCpu The CPU ID.
175 */
176static uint32_t supdrvGipFindCpuIndexForCpuId(PSUPGLOBALINFOPAGE pGip, RTCPUID idCpu)
177{
178 uint32_t i;
179 for (i = 0; i < pGip->cCpus; i++)
180 if (pGip->aCPUs[i].idCpu == idCpu)
181 return i;
182 return UINT32_MAX;
183}
184
185
186/**
187 * Gets the APIC ID using the best available method.
188 *
189 * @returns APIC ID.
190 * @param pGip The GIP, for SUPGIPGETCPU_XXX.
191 */
192DECLINLINE(uint32_t) supdrvGipGetApicId(PSUPGLOBALINFOPAGE pGip)
193{
194 if (pGip->fGetGipCpu & SUPGIPGETCPU_APIC_ID_EXT_0B)
195 return ASMGetApicIdExt0B();
196 if (pGip->fGetGipCpu & SUPGIPGETCPU_APIC_ID_EXT_8000001E)
197 return ASMGetApicIdExt8000001E();
198 return ASMGetApicId();
199}
200
201
202/**
203 * Gets the APIC ID using the best available method, slow version.
204 */
205static uint32_t supdrvGipGetApicIdSlow(void)
206{
207 uint32_t const idApic = ASMGetApicId();
208
209 /* The Intel CPU topology leaf: */
210 uint32_t uOther = ASMCpuId_EAX(0);
211 if (uOther >= UINT32_C(0xb) && RTX86IsValidStdRange(uOther))
212 {
213 uint32_t uEax = 0;
214 uint32_t uEbx = 0;
215 uint32_t uEcx = 0;
216 uint32_t uEdx = 0;
217#if defined(RT_OS_LINUX) || defined(RT_OS_FREEBSD)
218 ASMCpuId_Idx_ECX(0xb, 0, &uEax, &uEbx, &uEcx, &uEdx);
219#else
220 ASMCpuIdExSlow(0xb, 0, 0, 0, &uEax, &uEbx, &uEcx, &uEdx);
221#endif
222 if ((uEcx >> 8) != 0) /* level type != invalid */
223 {
224 if ((uEdx & 0xff) == idApic)
225 return uEdx;
226 AssertMsgFailed(("ASMGetApicIdExt0B=>%#x idApic=%#x\n", uEdx, idApic));
227 }
228 }
229
230 /* The AMD leaf: */
231 uOther = ASMCpuId_EAX(UINT32_C(0x80000000));
232 if (uOther >= UINT32_C(0x8000001e) && RTX86IsValidExtRange(uOther))
233 {
234 uOther = ASMGetApicIdExt8000001E();
235 if ((uOther & 0xff) == idApic)
236 return uOther;
237 AssertMsgFailed(("ASMGetApicIdExt8000001E=>%#x idApic=%#x\n", uOther, idApic));
238 }
239 return idApic;
240}
241
242
243/*
244 *
245 * GIP Mapping and Unmapping Related Code.
246 * GIP Mapping and Unmapping Related Code.
247 * GIP Mapping and Unmapping Related Code.
248 *
249 *
250 */
251
252
253/**
254 * (Re-)initializes the per-cpu structure prior to starting or resuming the GIP
255 * updating.
256 *
257 * @param pGipCpu The per CPU structure for this CPU.
258 * @param u64NanoTS The current time.
259 */
260static void supdrvGipReInitCpu(PSUPGIPCPU pGipCpu, uint64_t u64NanoTS)
261{
262 /*
263 * Here we don't really care about applying the TSC delta. The re-initialization of this
264 * value is not relevant especially while (re)starting the GIP as the first few ones will
265 * be ignored anyway, see supdrvGipDoUpdateCpu().
266 */
267 pGipCpu->u64TSC = ASMReadTSC() - pGipCpu->u32UpdateIntervalTSC;
268 pGipCpu->u64NanoTS = u64NanoTS;
269}
270
271
272/**
273 * Set the current TSC and NanoTS value for the CPU.
274 *
275 * @param idCpu The CPU ID. Unused - we have to use the APIC ID.
276 * @param pvUser1 Pointer to the ring-0 GIP mapping.
277 * @param pvUser2 Pointer to the variable holding the current time.
278 */
279static DECLCALLBACK(void) supdrvGipReInitCpuCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
280{
281 PSUPGLOBALINFOPAGE pGip = (PSUPGLOBALINFOPAGE)pvUser1;
282 uint32_t const idApic = supdrvGipGetApicId(pGip);
283 if (idApic < RT_ELEMENTS(pGip->aiCpuFromApicId))
284 {
285 unsigned const iCpu = pGip->aiCpuFromApicId[idApic];
286
287 if (RT_LIKELY(iCpu < pGip->cCpus && pGip->aCPUs[iCpu].idCpu == idCpu))
288 supdrvGipReInitCpu(&pGip->aCPUs[iCpu], *(uint64_t *)pvUser2);
289 else
290 LogRelMax(64, ("supdrvGipReInitCpuCallback: iCpu=%#x out of bounds (%#zx, idApic=%#x)\n",
291 iCpu, RT_ELEMENTS(pGip->aiCpuFromApicId), idApic));
292 }
293 else
294 LogRelMax(64, ("supdrvGipReInitCpuCallback: idApic=%#x out of bounds (%#zx)\n",
295 idApic, RT_ELEMENTS(pGip->aiCpuFromApicId)));
296
297 NOREF(pvUser2);
298}
299
300
301/**
302 * State structure for supdrvGipDetectGetGipCpuCallback.
303 */
304typedef struct SUPDRVGIPDETECTGETCPU
305{
306 /** Bitmap of APIC IDs that has been seen (initialized to zero).
307 * Used to detect duplicate APIC IDs (paranoia). */
308 uint8_t volatile bmApicId[4096 / 8];
309 /** Mask of supported GIP CPU getter methods (SUPGIPGETCPU_XXX) (all bits set
310 * initially). The callback clears the methods not detected. */
311 uint32_t volatile fSupported;
312 /** The first callback detecting any kind of range issues (initialized to
313 * NIL_RTCPUID). */
314 RTCPUID volatile idCpuProblem;
315} SUPDRVGIPDETECTGETCPU;
316/** Pointer to state structure for supdrvGipDetectGetGipCpuCallback. */
317typedef SUPDRVGIPDETECTGETCPU *PSUPDRVGIPDETECTGETCPU;
318
319
320/**
321 * Checks for alternative ways of getting the CPU ID.
322 *
323 * This also checks the APIC ID, CPU ID and CPU set index values against the
324 * GIP tables.
325 *
326 * @param idCpu The CPU ID. Unused - we have to use the APIC ID.
327 * @param pvUser1 Pointer to the state structure.
328 * @param pvUser2 Pointer to the GIP.
329 */
330static DECLCALLBACK(void) supdrvGipDetectGetGipCpuCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
331{
332 PSUPDRVGIPDETECTGETCPU pState = (PSUPDRVGIPDETECTGETCPU)pvUser1;
333 PSUPGLOBALINFOPAGE pGip = (PSUPGLOBALINFOPAGE)pvUser2;
334 uint32_t fSupported = 0;
335 uint32_t idApic;
336 uint32_t uEax, uEbx, uEcx, uEdx;
337 int iCpuSet;
338 NOREF(pGip);
339
340 AssertMsg(idCpu == RTMpCpuId(), ("idCpu=%#x RTMpCpuId()=%#x\n", idCpu, RTMpCpuId())); /* paranoia^3 */
341
342 /*
343 * Check that the CPU ID and CPU set index are interchangable.
344 */
345 iCpuSet = RTMpCpuIdToSetIndex(idCpu);
346 if ((RTCPUID)iCpuSet == idCpu)
347 {
348 AssertCompile(RT_IS_POWER_OF_TWO(RTCPUSET_MAX_CPUS));
349 if ( iCpuSet >= 0
350 && iCpuSet < RTCPUSET_MAX_CPUS
351 && RT_IS_POWER_OF_TWO(RTCPUSET_MAX_CPUS))
352 {
353 PSUPGIPCPU pGipCpu = SUPGetGipCpuBySetIndex(pGip, iCpuSet);
354
355 /*
356 * Check whether the IDTR.LIMIT contains a CPU number.
357 */
358#ifdef RT_ARCH_X86
359 uint16_t const cbIdt = sizeof(X86DESC64SYSTEM) * 256;
360#else
361 uint16_t const cbIdt = sizeof(X86DESCGATE) * 256;
362#endif
363 RTIDTR Idtr;
364 ASMGetIDTR(&Idtr);
365 if (Idtr.cbIdt >= cbIdt)
366 {
367 uint32_t uTmp = Idtr.cbIdt - cbIdt;
368 uTmp &= RTCPUSET_MAX_CPUS - 1;
369 if (uTmp == idCpu)
370 {
371 RTIDTR Idtr2;
372 ASMGetIDTR(&Idtr2);
373 if (Idtr2.cbIdt == Idtr.cbIdt)
374 fSupported |= SUPGIPGETCPU_IDTR_LIMIT_MASK_MAX_SET_CPUS;
375 }
376 }
377
378 /*
379 * Check whether RDTSCP is an option.
380 */
381 if (ASMHasCpuId())
382 {
383 if ( RTX86IsValidExtRange(ASMCpuId_EAX(UINT32_C(0x80000000)))
384 && (ASMCpuId_EDX(UINT32_C(0x80000001)) & X86_CPUID_EXT_FEATURE_EDX_RDTSCP) )
385 {
386 uint32_t uAux;
387 ASMReadTscWithAux(&uAux);
388 if ((uAux & (RTCPUSET_MAX_CPUS - 1)) == idCpu)
389 {
390 ASMNopPause();
391 ASMReadTscWithAux(&uAux);
392 if ((uAux & (RTCPUSET_MAX_CPUS - 1)) == idCpu)
393 fSupported |= SUPGIPGETCPU_RDTSCP_MASK_MAX_SET_CPUS;
394 }
395
396 if (pGipCpu)
397 {
398 uint32_t const uGroupedAux = (uint8_t)pGipCpu->iCpuGroupMember | ((uint32_t)pGipCpu->iCpuGroup << 8);
399 if ( (uAux & UINT16_MAX) == uGroupedAux
400 && pGipCpu->iCpuGroupMember <= UINT8_MAX)
401 {
402 ASMNopPause();
403 ASMReadTscWithAux(&uAux);
404 if ((uAux & UINT16_MAX) == uGroupedAux)
405 fSupported |= SUPGIPGETCPU_RDTSCP_GROUP_IN_CH_NUMBER_IN_CL;
406 }
407 }
408 }
409 }
410 }
411 }
412
413 /*
414 * Check for extended APIC ID methods.
415 */
416 idApic = UINT32_MAX;
417 uEax = ASMCpuId_EAX(0);
418 if (uEax >= UINT32_C(0xb) && RTX86IsValidStdRange(uEax))
419 {
420#if defined(RT_OS_LINUX) || defined(RT_OS_FREEBSD)
421 ASMCpuId_Idx_ECX(0xb, 0, &uEax, &uEbx, &uEcx, &uEdx);
422#else
423 ASMCpuIdExSlow(0xb, 0, 0, 0, &uEax, &uEbx, &uEcx, &uEdx);
424#endif
425 if ((uEcx >> 8) != 0) /* level type != invalid */
426 {
427 if (RT_LIKELY( uEdx < RT_ELEMENTS(pGip->aiCpuFromApicId)
428 && !ASMBitTest(pState->bmApicId, uEdx)))
429 {
430 if (uEdx == ASMGetApicIdExt0B())
431 {
432 idApic = uEdx;
433 fSupported |= SUPGIPGETCPU_APIC_ID_EXT_0B;
434 }
435 else
436 AssertMsgFailed(("%#x vs %#x\n", uEdx, ASMGetApicIdExt0B()));
437 }
438 }
439 }
440
441 uEax = ASMCpuId_EAX(UINT32_C(0x80000000));
442 if (uEax >= UINT32_C(0x8000001e) && RTX86IsValidExtRange(uEax))
443 {
444#if defined(RT_OS_LINUX) || defined(RT_OS_FREEBSD)
445 ASMCpuId_Idx_ECX(UINT32_C(0x8000001e), 0, &uEax, &uEbx, &uEcx, &uEdx);
446#else
447 ASMCpuIdExSlow(UINT32_C(0x8000001e), 0, 0, 0, &uEax, &uEbx, &uEcx, &uEdx);
448#endif
449 if (uEax || uEbx || uEcx || uEdx)
450 {
451 if (RT_LIKELY( uEax < RT_ELEMENTS(pGip->aiCpuFromApicId)
452 && ( idApic == UINT32_MAX
453 || idApic == uEax)
454 && !ASMBitTest(pState->bmApicId, uEax)))
455 {
456 if (uEax == ASMGetApicIdExt8000001E())
457 {
458 idApic = uEax;
459 fSupported |= SUPGIPGETCPU_APIC_ID_EXT_8000001E;
460 }
461 else
462 AssertMsgFailed(("%#x vs %#x\n", uEax, ASMGetApicIdExt8000001E()));
463 }
464 }
465 }
466
467 /*
468 * Check that the APIC ID is unique.
469 */
470 uEax = ASMGetApicId();
471 if (RT_LIKELY( uEax < RT_ELEMENTS(pGip->aiCpuFromApicId)
472 && ( idApic == UINT32_MAX
473 || idApic == uEax)
474 && !ASMAtomicBitTestAndSet(pState->bmApicId, uEax)))
475 {
476 idApic = uEax;
477 fSupported |= SUPGIPGETCPU_APIC_ID;
478 }
479 else if ( idApic == UINT32_MAX
480 || idApic >= RT_ELEMENTS(pGip->aiCpuFromApicId) /* parnaoia */
481 || ASMAtomicBitTestAndSet(pState->bmApicId, idApic))
482 {
483 AssertCompile(sizeof(pState->bmApicId) * 8 == RT_ELEMENTS(pGip->aiCpuFromApicId));
484 ASMAtomicCmpXchgU32(&pState->idCpuProblem, idCpu, NIL_RTCPUID);
485 LogRel(("supdrvGipDetectGetGipCpuCallback: idCpu=%#x iCpuSet=%d idApic=%#x/%#x - duplicate APIC ID.\n",
486 idCpu, iCpuSet, uEax, idApic));
487 }
488
489 /*
490 * Check that the iCpuSet is within the expected range.
491 */
492 if (RT_UNLIKELY( iCpuSet < 0
493 || (unsigned)iCpuSet >= RTCPUSET_MAX_CPUS
494 || (unsigned)iCpuSet >= RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)))
495 {
496 ASMAtomicCmpXchgU32(&pState->idCpuProblem, idCpu, NIL_RTCPUID);
497 LogRel(("supdrvGipDetectGetGipCpuCallback: idCpu=%#x iCpuSet=%d idApic=%#x - CPU set index is out of range.\n",
498 idCpu, iCpuSet, idApic));
499 }
500 else
501 {
502 RTCPUID idCpu2 = RTMpCpuIdFromSetIndex(iCpuSet);
503 if (RT_UNLIKELY(idCpu2 != idCpu))
504 {
505 ASMAtomicCmpXchgU32(&pState->idCpuProblem, idCpu, NIL_RTCPUID);
506 LogRel(("supdrvGipDetectGetGipCpuCallback: idCpu=%#x iCpuSet=%d idApic=%#x - CPU id/index roundtrip problem: %#x\n",
507 idCpu, iCpuSet, idApic, idCpu2));
508 }
509 }
510
511 /*
512 * Update the supported feature mask before we return.
513 */
514 ASMAtomicAndU32(&pState->fSupported, fSupported);
515
516 NOREF(pvUser2);
517}
518
519
520/**
521 * Increase the timer freqency on hosts where this is possible (NT).
522 *
523 * The idea is that more interrupts is better for us... Also, it's better than
524 * we increase the timer frequence, because we might end up getting inaccurate
525 * callbacks if someone else does it.
526 *
527 * @param pDevExt Sets u32SystemTimerGranularityGrant if increased.
528 */
529static void supdrvGipRequestHigherTimerFrequencyFromSystem(PSUPDRVDEVEXT pDevExt)
530{
531 if (pDevExt->u32SystemTimerGranularityGrant == 0)
532 {
533 uint32_t u32SystemResolution;
534 if ( RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 976563 /* 1024 HZ */, &u32SystemResolution))
535 || RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 1000000 /* 1000 HZ */, &u32SystemResolution))
536 || RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 1953125 /* 512 HZ */, &u32SystemResolution))
537 || RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 2000000 /* 500 HZ */, &u32SystemResolution))
538 )
539 {
540#if 0 /* def VBOX_STRICT - this is somehow triggers bogus assertions on windows 10 */
541 uint32_t u32After = RTTimerGetSystemGranularity();
542 AssertMsg(u32After <= u32SystemResolution, ("u32After=%u u32SystemResolution=%u\n", u32After, u32SystemResolution));
543#endif
544 pDevExt->u32SystemTimerGranularityGrant = u32SystemResolution;
545 }
546 }
547}
548
549
550/**
551 * Undoes supdrvGipRequestHigherTimerFrequencyFromSystem.
552 *
553 * @param pDevExt Clears u32SystemTimerGranularityGrant.
554 */
555static void supdrvGipReleaseHigherTimerFrequencyFromSystem(PSUPDRVDEVEXT pDevExt)
556{
557 if (pDevExt->u32SystemTimerGranularityGrant)
558 {
559 int rc2 = RTTimerReleaseSystemGranularity(pDevExt->u32SystemTimerGranularityGrant);
560 AssertRC(rc2);
561 pDevExt->u32SystemTimerGranularityGrant = 0;
562 }
563}
564
565
566/**
567 * Maps the GIP into userspace and/or get the physical address of the GIP.
568 *
569 * @returns IPRT status code.
570 * @param pSession Session to which the GIP mapping should belong.
571 * @param ppGipR3 Where to store the address of the ring-3 mapping. (optional)
572 * @param pHCPhysGip Where to store the physical address. (optional)
573 *
574 * @remark There is no reference counting on the mapping, so one call to this function
575 * count globally as one reference. One call to SUPR0GipUnmap() is will unmap GIP
576 * and remove the session as a GIP user.
577 */
578SUPR0DECL(int) SUPR0GipMap(PSUPDRVSESSION pSession, PRTR3PTR ppGipR3, PRTHCPHYS pHCPhysGip)
579{
580 int rc;
581 PSUPDRVDEVEXT pDevExt = pSession->pDevExt;
582 RTR3PTR pGipR3 = NIL_RTR3PTR;
583 RTHCPHYS HCPhys = NIL_RTHCPHYS;
584 LogFlow(("SUPR0GipMap: pSession=%p ppGipR3=%p pHCPhysGip=%p\n", pSession, ppGipR3, pHCPhysGip));
585
586 /*
587 * Validate
588 */
589 AssertReturn(SUP_IS_SESSION_VALID(pSession), VERR_INVALID_PARAMETER);
590 AssertPtrNullReturn(ppGipR3, VERR_INVALID_POINTER);
591 AssertPtrNullReturn(pHCPhysGip, VERR_INVALID_POINTER);
592
593#ifdef SUPDRV_USE_MUTEX_FOR_GIP
594 RTSemMutexRequest(pDevExt->mtxGip, RT_INDEFINITE_WAIT);
595#else
596 RTSemFastMutexRequest(pDevExt->mtxGip);
597#endif
598 if (pDevExt->pGip)
599 {
600 /*
601 * Map it?
602 */
603 rc = VINF_SUCCESS;
604 if (ppGipR3)
605 {
606 if (pSession->GipMapObjR3 == NIL_RTR0MEMOBJ)
607 rc = RTR0MemObjMapUser(&pSession->GipMapObjR3, pDevExt->GipMemObj, (RTR3PTR)-1, 0,
608 RTMEM_PROT_READ, NIL_RTR0PROCESS);
609 if (RT_SUCCESS(rc))
610 pGipR3 = RTR0MemObjAddressR3(pSession->GipMapObjR3);
611 }
612
613 /*
614 * Get physical address.
615 */
616 if (pHCPhysGip && RT_SUCCESS(rc))
617 HCPhys = pDevExt->HCPhysGip;
618
619 /*
620 * Reference globally.
621 */
622 if (!pSession->fGipReferenced && RT_SUCCESS(rc))
623 {
624 pSession->fGipReferenced = 1;
625 pDevExt->cGipUsers++;
626 if (pDevExt->cGipUsers == 1)
627 {
628 PSUPGLOBALINFOPAGE pGipR0 = pDevExt->pGip;
629 uint64_t u64NanoTS;
630
631 /*
632 * GIP starts/resumes updating again. On windows we bump the
633 * host timer frequency to make sure we don't get stuck in guest
634 * mode and to get better timer (and possibly clock) accuracy.
635 */
636 LogFlow(("SUPR0GipMap: Resumes GIP updating\n"));
637
638 supdrvGipRequestHigherTimerFrequencyFromSystem(pDevExt);
639
640 /*
641 * document me
642 */
643 if (pGipR0->aCPUs[0].u32TransactionId != 2 /* not the first time */)
644 {
645 unsigned i;
646 for (i = 0; i < pGipR0->cCpus; i++)
647 ASMAtomicUoWriteU32(&pGipR0->aCPUs[i].u32TransactionId,
648 (pGipR0->aCPUs[i].u32TransactionId + GIP_UPDATEHZ_RECALC_FREQ * 2)
649 & ~(GIP_UPDATEHZ_RECALC_FREQ * 2 - 1));
650 ASMAtomicWriteU64(&pGipR0->u64NanoTSLastUpdateHz, 0);
651 }
652
653 /*
654 * document me
655 */
656 u64NanoTS = RTTimeSystemNanoTS() - pGipR0->u32UpdateIntervalNS;
657 if ( pGipR0->u32Mode == SUPGIPMODE_INVARIANT_TSC
658 || pGipR0->u32Mode == SUPGIPMODE_SYNC_TSC
659 || RTMpGetOnlineCount() == 1)
660 supdrvGipReInitCpu(&pGipR0->aCPUs[0], u64NanoTS);
661 else
662 RTMpOnAll(supdrvGipReInitCpuCallback, pGipR0, &u64NanoTS);
663
664 /*
665 * Detect alternative ways to figure the CPU ID in ring-3 and
666 * raw-mode context. Check the sanity of the APIC IDs, CPU IDs,
667 * and CPU set indexes while we're at it.
668 */
669 if (RT_SUCCESS(rc))
670 {
671 PSUPDRVGIPDETECTGETCPU pDetectState = (PSUPDRVGIPDETECTGETCPU)RTMemTmpAllocZ(sizeof(*pDetectState));
672 if (pDetectState)
673 {
674 pDetectState->fSupported = UINT32_MAX;
675 pDetectState->idCpuProblem = NIL_RTCPUID;
676 rc = RTMpOnAll(supdrvGipDetectGetGipCpuCallback, pDetectState, pGipR0);
677 if (pDetectState->idCpuProblem == NIL_RTCPUID)
678 {
679 if ( pDetectState->fSupported != UINT32_MAX
680 && pDetectState->fSupported != 0)
681 {
682 if (pGipR0->fGetGipCpu != pDetectState->fSupported)
683 {
684 pGipR0->fGetGipCpu = pDetectState->fSupported;
685 LogRel(("SUPR0GipMap: fGetGipCpu=%#x\n", pDetectState->fSupported));
686 }
687 }
688 else
689 {
690 LogRel(("SUPR0GipMap: No supported ways of getting the APIC ID or CPU number in ring-3! (%#x)\n",
691 pDetectState->fSupported));
692 rc = VERR_UNSUPPORTED_CPU;
693 }
694 }
695 else
696 {
697 LogRel(("SUPR0GipMap: APIC ID, CPU ID or CPU set index problem detected on CPU #%u (%#x)!\n",
698 pDetectState->idCpuProblem, pDetectState->idCpuProblem));
699 rc = VERR_INVALID_CPU_ID;
700 }
701 RTMemTmpFree(pDetectState);
702 }
703 else
704 rc = VERR_NO_TMP_MEMORY;
705 }
706
707 /*
708 * Start the GIP timer if all is well..
709 */
710 if (RT_SUCCESS(rc))
711 {
712#ifndef DO_NOT_START_GIP
713 rc = RTTimerStart(pDevExt->pGipTimer, 0 /* fire ASAP */); AssertRC(rc);
714#endif
715 rc = VINF_SUCCESS;
716 }
717
718 /*
719 * Bail out on error.
720 */
721 if (RT_FAILURE(rc))
722 {
723 LogRel(("SUPR0GipMap: failed rc=%Rrc\n", rc));
724 pDevExt->cGipUsers = 0;
725 pSession->fGipReferenced = 0;
726 if (pSession->GipMapObjR3 != NIL_RTR0MEMOBJ)
727 {
728 int rc2 = RTR0MemObjFree(pSession->GipMapObjR3, false); AssertRC(rc2);
729 if (RT_SUCCESS(rc2))
730 pSession->GipMapObjR3 = NIL_RTR0MEMOBJ;
731 }
732 HCPhys = NIL_RTHCPHYS;
733 pGipR3 = NIL_RTR3PTR;
734 }
735 }
736 }
737 }
738 else
739 {
740 rc = VERR_GENERAL_FAILURE;
741 Log(("SUPR0GipMap: GIP is not available!\n"));
742 }
743#ifdef SUPDRV_USE_MUTEX_FOR_GIP
744 RTSemMutexRelease(pDevExt->mtxGip);
745#else
746 RTSemFastMutexRelease(pDevExt->mtxGip);
747#endif
748
749 /*
750 * Write returns.
751 */
752 if (pHCPhysGip)
753 *pHCPhysGip = HCPhys;
754 if (ppGipR3)
755 *ppGipR3 = pGipR3;
756
757#ifdef DEBUG_DARWIN_GIP
758 OSDBGPRINT(("SUPR0GipMap: returns %d *pHCPhysGip=%lx pGipR3=%p\n", rc, (unsigned long)HCPhys, (void *)pGipR3));
759#else
760 LogFlow(( "SUPR0GipMap: returns %d *pHCPhysGip=%lx pGipR3=%p\n", rc, (unsigned long)HCPhys, (void *)pGipR3));
761#endif
762 return rc;
763}
764SUPR0_EXPORT_SYMBOL(SUPR0GipMap);
765
766
767/**
768 * Unmaps any user mapping of the GIP and terminates all GIP access
769 * from this session.
770 *
771 * @returns IPRT status code.
772 * @param pSession Session to which the GIP mapping should belong.
773 */
774SUPR0DECL(int) SUPR0GipUnmap(PSUPDRVSESSION pSession)
775{
776 int rc = VINF_SUCCESS;
777 PSUPDRVDEVEXT pDevExt = pSession->pDevExt;
778#ifdef DEBUG_DARWIN_GIP
779 OSDBGPRINT(("SUPR0GipUnmap: pSession=%p pGip=%p GipMapObjR3=%p\n",
780 pSession,
781 pSession->GipMapObjR3 != NIL_RTR0MEMOBJ ? RTR0MemObjAddress(pSession->GipMapObjR3) : NULL,
782 pSession->GipMapObjR3));
783#else
784 LogFlow(("SUPR0GipUnmap: pSession=%p\n", pSession));
785#endif
786 AssertReturn(SUP_IS_SESSION_VALID(pSession), VERR_INVALID_PARAMETER);
787
788#ifdef SUPDRV_USE_MUTEX_FOR_GIP
789 RTSemMutexRequest(pDevExt->mtxGip, RT_INDEFINITE_WAIT);
790#else
791 RTSemFastMutexRequest(pDevExt->mtxGip);
792#endif
793
794 /*
795 * GIP test-mode session?
796 */
797 if ( pSession->fGipTestMode
798 && pDevExt->pGip)
799 {
800 supdrvGipSetFlags(pDevExt, pSession, 0, ~SUPGIP_FLAGS_TESTING_ENABLE);
801 Assert(!pSession->fGipTestMode);
802 }
803
804 /*
805 * Unmap anything?
806 */
807 if (pSession->GipMapObjR3 != NIL_RTR0MEMOBJ)
808 {
809 rc = RTR0MemObjFree(pSession->GipMapObjR3, false);
810 AssertRC(rc);
811 if (RT_SUCCESS(rc))
812 pSession->GipMapObjR3 = NIL_RTR0MEMOBJ;
813 }
814
815 /*
816 * Dereference global GIP.
817 */
818 if (pSession->fGipReferenced && !rc)
819 {
820 pSession->fGipReferenced = 0;
821 if ( pDevExt->cGipUsers > 0
822 && !--pDevExt->cGipUsers)
823 {
824 LogFlow(("SUPR0GipUnmap: Suspends GIP updating\n"));
825#ifndef DO_NOT_START_GIP
826 rc = RTTimerStop(pDevExt->pGipTimer); AssertRC(rc); rc = VINF_SUCCESS;
827#endif
828 supdrvGipReleaseHigherTimerFrequencyFromSystem(pDevExt);
829 }
830 }
831
832#ifdef SUPDRV_USE_MUTEX_FOR_GIP
833 RTSemMutexRelease(pDevExt->mtxGip);
834#else
835 RTSemFastMutexRelease(pDevExt->mtxGip);
836#endif
837
838 return rc;
839}
840SUPR0_EXPORT_SYMBOL(SUPR0GipUnmap);
841
842
843/**
844 * Gets the GIP pointer.
845 *
846 * @returns Pointer to the GIP or NULL.
847 */
848SUPDECL(PSUPGLOBALINFOPAGE) SUPGetGIP(void)
849{
850 return g_pSUPGlobalInfoPage;
851}
852
853
854
855
856
857/*
858 *
859 *
860 * GIP Initialization, Termination and CPU Offline / Online Related Code.
861 * GIP Initialization, Termination and CPU Offline / Online Related Code.
862 * GIP Initialization, Termination and CPU Offline / Online Related Code.
863 *
864 *
865 */
866
867/**
868 * Used by supdrvGipInitRefineInvariantTscFreqTimer and supdrvGipInitMeasureTscFreq
869 * to update the TSC frequency related GIP variables.
870 *
871 * @param pGip The GIP.
872 * @param nsElapsed The number of nanoseconds elapsed.
873 * @param cElapsedTscTicks The corresponding number of TSC ticks.
874 * @param iTick The tick number for debugging.
875 */
876static void supdrvGipInitSetCpuFreq(PSUPGLOBALINFOPAGE pGip, uint64_t nsElapsed, uint64_t cElapsedTscTicks, uint32_t iTick)
877{
878 /*
879 * Calculate the frequency.
880 */
881 uint64_t uCpuHz;
882 if ( cElapsedTscTicks < UINT64_MAX / RT_NS_1SEC
883 && nsElapsed < UINT32_MAX)
884 uCpuHz = ASMMultU64ByU32DivByU32(cElapsedTscTicks, RT_NS_1SEC, (uint32_t)nsElapsed);
885 else
886 {
887 RTUINT128U CpuHz, Tmp, Divisor;
888 CpuHz.s.Lo = CpuHz.s.Hi = 0;
889 RTUInt128MulU64ByU64(&Tmp, cElapsedTscTicks, RT_NS_1SEC_64);
890 RTUInt128Div(&CpuHz, &Tmp, RTUInt128AssignU64(&Divisor, nsElapsed));
891 uCpuHz = CpuHz.s.Lo;
892 }
893
894 /*
895 * Update the GIP.
896 */
897 ASMAtomicWriteU64(&pGip->u64CpuHz, uCpuHz);
898 if (pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
899 {
900 ASMAtomicWriteU64(&pGip->aCPUs[0].u64CpuHz, uCpuHz);
901
902 /* For inspecting the frequency calcs using tstGIP-2, debugger or similar. */
903 if (iTick + 1 < pGip->cCpus)
904 ASMAtomicWriteU64(&pGip->aCPUs[iTick + 1].u64CpuHz, uCpuHz);
905 }
906}
907
908
909/**
910 * Timer callback function for TSC frequency refinement in invariant GIP mode.
911 *
912 * This is started during driver init and fires once
913 * GIP_TSC_REFINE_PERIOD_IN_SECS seconds later.
914 *
915 * @param pTimer The timer.
916 * @param pvUser Opaque pointer to the device instance data.
917 * @param iTick The timer tick.
918 */
919static DECLCALLBACK(void) supdrvGipInitRefineInvariantTscFreqTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
920{
921 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
922 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
923 RTCPUID idCpu;
924 uint64_t cNsElapsed;
925 uint64_t cTscTicksElapsed;
926 uint64_t nsNow;
927 uint64_t uTsc;
928 RTCCUINTREG fEFlags;
929
930 /* Paranoia. */
931 AssertReturnVoid(pGip);
932 AssertReturnVoid(pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC);
933
934 /*
935 * If we got a power event, stop the refinement process.
936 */
937 if (pDevExt->fInvTscRefinePowerEvent)
938 {
939 int rc = RTTimerStop(pTimer); AssertRC(rc);
940 return;
941 }
942
943 /*
944 * Read the TSC and time, noting which CPU we are on.
945 *
946 * Don't bother spinning until RTTimeSystemNanoTS changes, since on
947 * systems where it matters we're in a context where we cannot waste that
948 * much time (DPC watchdog, called from clock interrupt).
949 */
950 fEFlags = ASMIntDisableFlags();
951 uTsc = ASMReadTSC();
952 nsNow = RTTimeSystemNanoTS();
953 idCpu = RTMpCpuId();
954 ASMSetFlags(fEFlags);
955
956 cNsElapsed = nsNow - pDevExt->nsStartInvarTscRefine;
957 cTscTicksElapsed = uTsc - pDevExt->uTscStartInvarTscRefine;
958
959 /*
960 * If the above measurement was taken on a different CPU than the one we
961 * started the process on, cTscTicksElapsed will need to be adjusted with
962 * the TSC deltas of both the CPUs.
963 *
964 * We ASSUME that the delta calculation process takes less time than the
965 * TSC frequency refinement timer. If it doesn't, we'll complain and
966 * drop the frequency refinement.
967 *
968 * Note! We cannot entirely trust enmUseTscDelta here because it's
969 * downgraded after each delta calculation.
970 */
971 if ( idCpu != pDevExt->idCpuInvarTscRefine
972 && pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
973 {
974 uint32_t iStartCpuSet = RTMpCpuIdToSetIndex(pDevExt->idCpuInvarTscRefine);
975 uint32_t iStopCpuSet = RTMpCpuIdToSetIndex(idCpu);
976 uint16_t iStartGipCpu = iStartCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
977 ? pGip->aiCpuFromCpuSetIdx[iStartCpuSet] : UINT16_MAX;
978 uint16_t iStopGipCpu = iStopCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
979 ? pGip->aiCpuFromCpuSetIdx[iStopCpuSet] : UINT16_MAX;
980 int64_t iStartTscDelta = iStartGipCpu < pGip->cCpus ? pGip->aCPUs[iStartGipCpu].i64TSCDelta : INT64_MAX;
981 int64_t iStopTscDelta = iStopGipCpu < pGip->cCpus ? pGip->aCPUs[iStopGipCpu].i64TSCDelta : INT64_MAX;
982 if (RT_LIKELY(iStartTscDelta != INT64_MAX && iStopTscDelta != INT64_MAX))
983 {
984 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_PRACTICALLY_ZERO)
985 {
986 /* cTscTicksElapsed = (uTsc - iStopTscDelta) - (pDevExt->uTscStartInvarTscRefine - iStartTscDelta); */
987 cTscTicksElapsed += iStartTscDelta - iStopTscDelta;
988 }
989 }
990 /*
991 * Allow 5 times the refinement period to elapse before we give up on the TSC delta
992 * calculations.
993 */
994 else if (cNsElapsed > GIP_TSC_REFINE_PERIOD_IN_SECS * 5 * RT_NS_1SEC_64)
995 {
996 SUPR0Printf("vboxdrv: Failed to refine invariant TSC frequency because deltas are unavailable after %u (%u) seconds\n",
997 (uint32_t)(cNsElapsed / RT_NS_1SEC), GIP_TSC_REFINE_PERIOD_IN_SECS);
998 SUPR0Printf("vboxdrv: start: %u, %u, %#llx stop: %u, %u, %#llx\n",
999 iStartCpuSet, iStartGipCpu, iStartTscDelta, iStopCpuSet, iStopGipCpu, iStopTscDelta);
1000 int rc = RTTimerStop(pTimer); AssertRC(rc);
1001 return;
1002 }
1003 }
1004
1005 /*
1006 * Calculate and update the CPU frequency variables in GIP.
1007 *
1008 * If there is a GIP user already and we've already refined the frequency
1009 * a couple of times, don't update it as we want a stable frequency value
1010 * for all VMs.
1011 */
1012 if ( pDevExt->cGipUsers == 0
1013 || cNsElapsed < RT_NS_1SEC * 2)
1014 {
1015 supdrvGipInitSetCpuFreq(pGip, cNsElapsed, cTscTicksElapsed, (uint32_t)iTick);
1016
1017 /*
1018 * Stop the timer once we've reached the defined refinement period.
1019 */
1020 if (cNsElapsed > GIP_TSC_REFINE_PERIOD_IN_SECS * RT_NS_1SEC_64)
1021 {
1022 int rc = RTTimerStop(pTimer);
1023 AssertRC(rc);
1024 }
1025 }
1026 else
1027 {
1028 int rc = RTTimerStop(pTimer);
1029 AssertRC(rc);
1030 }
1031}
1032
1033
1034/**
1035 * @callback_method_impl{FNRTPOWERNOTIFICATION}
1036 */
1037static DECLCALLBACK(void) supdrvGipPowerNotificationCallback(RTPOWEREVENT enmEvent, void *pvUser)
1038{
1039 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
1040 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
1041
1042 /*
1043 * If the TSC frequency refinement timer is running, we need to cancel it so it
1044 * doesn't screw up the frequency after a long suspend.
1045 *
1046 * Recalculate all TSC-deltas on host resume as it may have changed, seen
1047 * on Windows 7 running on the Dell Optiplex Intel Core i5-3570.
1048 */
1049 if (enmEvent == RTPOWEREVENT_RESUME)
1050 {
1051 ASMAtomicWriteBool(&pDevExt->fInvTscRefinePowerEvent, true);
1052 if ( RT_LIKELY(pGip)
1053 && pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED
1054 && !supdrvOSAreCpusOfflinedOnSuspend())
1055 {
1056#ifdef SUPDRV_USE_TSC_DELTA_THREAD
1057 supdrvTscDeltaThreadStartMeasurement(pDevExt, true /* fForceAll */);
1058#else
1059 RTCpuSetCopy(&pDevExt->TscDeltaCpuSet, &pGip->OnlineCpuSet);
1060 supdrvTscMeasureInitialDeltas(pDevExt);
1061#endif
1062 }
1063 }
1064 else if (enmEvent == RTPOWEREVENT_SUSPEND)
1065 ASMAtomicWriteBool(&pDevExt->fInvTscRefinePowerEvent, true);
1066}
1067
1068
1069/**
1070 * Start the TSC-frequency refinment timer for the invariant TSC GIP mode.
1071 *
1072 * We cannot use this in the synchronous and asynchronous tsc GIP modes because
1073 * the CPU may change the TSC frequence between now and when the timer fires
1074 * (supdrvInitAsyncRefineTscTimer).
1075 *
1076 * @param pDevExt Pointer to the device instance data.
1077 */
1078static void supdrvGipInitStartTimerForRefiningInvariantTscFreq(PSUPDRVDEVEXT pDevExt)
1079{
1080 uint64_t u64NanoTS;
1081 RTCCUINTREG fEFlags;
1082 int rc;
1083
1084 /*
1085 * Register a power management callback.
1086 */
1087 pDevExt->fInvTscRefinePowerEvent = false;
1088 rc = RTPowerNotificationRegister(supdrvGipPowerNotificationCallback, pDevExt);
1089 AssertRC(rc); /* ignore */
1090
1091 /*
1092 * Record the TSC and NanoTS as the starting anchor point for refinement
1093 * of the TSC. We try get as close to a clock tick as possible on systems
1094 * which does not provide high resolution time.
1095 */
1096 u64NanoTS = RTTimeSystemNanoTS();
1097 while (RTTimeSystemNanoTS() == u64NanoTS)
1098 ASMNopPause();
1099
1100 fEFlags = ASMIntDisableFlags();
1101 pDevExt->uTscStartInvarTscRefine = ASMReadTSC();
1102 pDevExt->nsStartInvarTscRefine = RTTimeSystemNanoTS();
1103 pDevExt->idCpuInvarTscRefine = RTMpCpuId();
1104 ASMSetFlags(fEFlags);
1105
1106 /*
1107 * Create a timer that runs on the same CPU so we won't have a depencency
1108 * on the TSC-delta and can run in parallel to it. On systems that does not
1109 * implement CPU specific timers we'll apply deltas in the timer callback,
1110 * just like we do for CPUs going offline.
1111 *
1112 * The longer the refinement interval the better the accuracy, at least in
1113 * theory. If it's too long though, ring-3 may already be starting its
1114 * first VMs before we're done. On most systems we will be loading the
1115 * support driver during boot and VMs won't be started for a while yet,
1116 * it is really only a problem during development (especially with
1117 * on-demand driver starting on windows).
1118 *
1119 * To avoid wasting time doing a long supdrvGipInitMeasureTscFreq() call
1120 * to calculate the frequency during driver loading, the timer is set
1121 * to fire after 200 ms the first time. It will then reschedule itself
1122 * to fire every second until GIP_TSC_REFINE_PERIOD_IN_SECS has been
1123 * reached or it notices that there is a user land client with GIP
1124 * mapped (we want a stable frequency for all VMs).
1125 */
1126 rc = RTTimerCreateEx(&pDevExt->pInvarTscRefineTimer, RT_NS_1SEC,
1127 RTTIMER_FLAGS_CPU(RTMpCpuIdToSetIndex(pDevExt->idCpuInvarTscRefine)),
1128 supdrvGipInitRefineInvariantTscFreqTimer, pDevExt);
1129 if (RT_SUCCESS(rc))
1130 {
1131 rc = RTTimerStart(pDevExt->pInvarTscRefineTimer, 2*RT_NS_100MS);
1132 if (RT_SUCCESS(rc))
1133 return;
1134 RTTimerDestroy(pDevExt->pInvarTscRefineTimer);
1135 }
1136
1137 if (rc == VERR_CPU_OFFLINE || rc == VERR_NOT_SUPPORTED)
1138 {
1139 rc = RTTimerCreateEx(&pDevExt->pInvarTscRefineTimer, RT_NS_1SEC, RTTIMER_FLAGS_CPU_ANY,
1140 supdrvGipInitRefineInvariantTscFreqTimer, pDevExt);
1141 if (RT_SUCCESS(rc))
1142 {
1143 rc = RTTimerStart(pDevExt->pInvarTscRefineTimer, 2*RT_NS_100MS);
1144 if (RT_SUCCESS(rc))
1145 return;
1146 RTTimerDestroy(pDevExt->pInvarTscRefineTimer);
1147 }
1148 }
1149
1150 pDevExt->pInvarTscRefineTimer = NULL;
1151 OSDBGPRINT(("vboxdrv: Failed to create or start TSC frequency refinement timer: rc=%Rrc\n", rc));
1152}
1153
1154
1155/**
1156 * @callback_method_impl{PFNRTMPWORKER,
1157 * RTMpOnSpecific callback for reading TSC and time on the CPU we started
1158 * the measurements on.}
1159 */
1160static DECLCALLBACK(void) supdrvGipInitReadTscAndNanoTsOnCpu(RTCPUID idCpu, void *pvUser1, void *pvUser2)
1161{
1162 RTCCUINTREG fEFlags = ASMIntDisableFlags();
1163 uint64_t *puTscStop = (uint64_t *)pvUser1;
1164 uint64_t *pnsStop = (uint64_t *)pvUser2;
1165 RT_NOREF1(idCpu);
1166
1167 *puTscStop = ASMReadTSC();
1168 *pnsStop = RTTimeSystemNanoTS();
1169
1170 ASMSetFlags(fEFlags);
1171}
1172
1173
1174/**
1175 * Measures the TSC frequency of the system.
1176 *
1177 * The TSC frequency can vary on systems which are not reported as invariant.
1178 * On such systems the object of this function is to find out what the nominal,
1179 * maximum TSC frequency under 'normal' CPU operation.
1180 *
1181 * @returns VBox status code.
1182 * @param pGip Pointer to the GIP.
1183 * @param fRough Set if we're doing the rough calculation that the
1184 * TSC measuring code needs, where accuracy isn't all
1185 * that important (too high is better than too low).
1186 * When clear we try for best accuracy that we can
1187 * achieve in reasonably short time.
1188 */
1189static int supdrvGipInitMeasureTscFreq(PSUPGLOBALINFOPAGE pGip, bool fRough)
1190{
1191 uint32_t nsTimerIncr = RTTimerGetSystemGranularity();
1192 int cTriesLeft = fRough ? 4 : 2;
1193 while (cTriesLeft-- > 0)
1194 {
1195 RTCCUINTREG fEFlags;
1196 uint64_t nsStart;
1197 uint64_t nsStop;
1198 uint64_t uTscStart;
1199 uint64_t uTscStop;
1200 RTCPUID idCpuStart;
1201 RTCPUID idCpuStop;
1202
1203 /*
1204 * Synchronize with the host OS clock tick on systems without high
1205 * resolution time API (older Windows version for example).
1206 */
1207 nsStart = RTTimeSystemNanoTS();
1208 while (RTTimeSystemNanoTS() == nsStart)
1209 ASMNopPause();
1210
1211 /*
1212 * Read the TSC and current time, noting which CPU we're on.
1213 */
1214 fEFlags = ASMIntDisableFlags();
1215 uTscStart = ASMReadTSC();
1216 nsStart = RTTimeSystemNanoTS();
1217 idCpuStart = RTMpCpuId();
1218 ASMSetFlags(fEFlags);
1219
1220 /*
1221 * Delay for a while.
1222 */
1223 if (pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC)
1224 {
1225 /*
1226 * Sleep-wait since the TSC frequency is constant, it eases host load.
1227 * Shorter interval produces more variance in the frequency (esp. Windows).
1228 */
1229 uint64_t msElapsed = 0;
1230 uint64_t msDelay = ( ((fRough ? 16 : 200) * RT_NS_1MS + nsTimerIncr - 1) / nsTimerIncr * nsTimerIncr - RT_NS_100US )
1231 / RT_NS_1MS;
1232 do
1233 {
1234 RTThreadSleep((RTMSINTERVAL)(msDelay - msElapsed));
1235 nsStop = RTTimeSystemNanoTS();
1236 msElapsed = (nsStop - nsStart) / RT_NS_1MS;
1237 } while (msElapsed < msDelay);
1238
1239 while (RTTimeSystemNanoTS() == nsStop)
1240 ASMNopPause();
1241 }
1242 else
1243 {
1244 /*
1245 * Busy-wait keeping the frequency up.
1246 */
1247 do
1248 {
1249 ASMNopPause();
1250 nsStop = RTTimeSystemNanoTS();
1251 } while (nsStop - nsStart < RT_NS_100MS);
1252 }
1253
1254 /*
1255 * Read the TSC and time again.
1256 */
1257 fEFlags = ASMIntDisableFlags();
1258 uTscStop = ASMReadTSC();
1259 nsStop = RTTimeSystemNanoTS();
1260 idCpuStop = RTMpCpuId();
1261 ASMSetFlags(fEFlags);
1262
1263 /*
1264 * If the CPU changes, things get a bit complicated and what we
1265 * can get away with depends on the GIP mode / TSC reliability.
1266 */
1267 if (idCpuStop != idCpuStart)
1268 {
1269 bool fDoXCall = false;
1270
1271 /*
1272 * Synchronous TSC mode: we're probably fine as it's unlikely
1273 * that we were rescheduled because of TSC throttling or power
1274 * management reasons, so just go ahead.
1275 */
1276 if (pGip->u32Mode == SUPGIPMODE_SYNC_TSC)
1277 {
1278 /* Probably ok, maybe we should retry once?. */
1279 Assert(pGip->enmUseTscDelta == SUPGIPUSETSCDELTA_NOT_APPLICABLE);
1280 }
1281 /*
1282 * If we're just doing the rough measurement, do the cross call and
1283 * get on with things (we don't have deltas!).
1284 */
1285 else if (fRough)
1286 fDoXCall = true;
1287 /*
1288 * Invariant TSC mode: It doesn't matter if we have delta available
1289 * for both CPUs. That is not something we can assume at this point.
1290 *
1291 * Note! We cannot necessarily trust enmUseTscDelta here because it's
1292 * downgraded after each delta calculation and the delta
1293 * calculations may not be complete yet.
1294 */
1295 else if (pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC)
1296 {
1297/** @todo This section of code is never reached atm, consider dropping it later on... */
1298 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
1299 {
1300 uint32_t iStartCpuSet = RTMpCpuIdToSetIndex(idCpuStart);
1301 uint32_t iStopCpuSet = RTMpCpuIdToSetIndex(idCpuStop);
1302 uint16_t iStartGipCpu = iStartCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
1303 ? pGip->aiCpuFromCpuSetIdx[iStartCpuSet] : UINT16_MAX;
1304 uint16_t iStopGipCpu = iStopCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
1305 ? pGip->aiCpuFromCpuSetIdx[iStopCpuSet] : UINT16_MAX;
1306 int64_t iStartTscDelta = iStartGipCpu < pGip->cCpus ? pGip->aCPUs[iStartGipCpu].i64TSCDelta : INT64_MAX;
1307 int64_t iStopTscDelta = iStopGipCpu < pGip->cCpus ? pGip->aCPUs[iStopGipCpu].i64TSCDelta : INT64_MAX;
1308 if (RT_LIKELY(iStartTscDelta != INT64_MAX && iStopTscDelta != INT64_MAX))
1309 {
1310 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_PRACTICALLY_ZERO)
1311 {
1312 uTscStart -= iStartTscDelta;
1313 uTscStop -= iStopTscDelta;
1314 }
1315 }
1316 /*
1317 * Invalid CPU indexes are not caused by online/offline races, so
1318 * we have to trigger driver load failure if that happens as GIP
1319 * and IPRT assumptions are busted on this system.
1320 */
1321 else if (iStopGipCpu >= pGip->cCpus || iStartGipCpu >= pGip->cCpus)
1322 {
1323 SUPR0Printf("vboxdrv: Unexpected CPU index in supdrvGipInitMeasureTscFreq.\n");
1324 SUPR0Printf("vboxdrv: start: %u, %u, %#llx stop: %u, %u, %#llx\n",
1325 iStartCpuSet, iStartGipCpu, iStartTscDelta, iStopCpuSet, iStopGipCpu, iStopTscDelta);
1326 return VERR_INVALID_CPU_INDEX;
1327 }
1328 /*
1329 * No valid deltas. We retry, if we're on our last retry
1330 * we do the cross call instead just to get a result. The
1331 * frequency will be refined in a few seconds anyway.
1332 */
1333 else if (cTriesLeft > 0)
1334 continue;
1335 else
1336 fDoXCall = true;
1337 }
1338 }
1339 /*
1340 * Asynchronous TSC mode: This is bad, as the reason we usually
1341 * use this mode is to deal with variable TSC frequencies and
1342 * deltas. So, we need to get the TSC from the same CPU as
1343 * started it, we also need to keep that CPU busy. So, retry
1344 * and fall back to the cross call on the last attempt.
1345 */
1346 else
1347 {
1348 Assert(pGip->u32Mode == SUPGIPMODE_ASYNC_TSC);
1349 if (cTriesLeft > 0)
1350 continue;
1351 fDoXCall = true;
1352 }
1353
1354 if (fDoXCall)
1355 {
1356 /*
1357 * Try read the TSC and timestamp on the start CPU.
1358 */
1359 int rc = RTMpOnSpecific(idCpuStart, supdrvGipInitReadTscAndNanoTsOnCpu, &uTscStop, &nsStop);
1360 if (RT_FAILURE(rc) && (!fRough || cTriesLeft > 0))
1361 continue;
1362 }
1363 }
1364
1365 /*
1366 * Calculate the TSC frequency and update it (shared with the refinement timer).
1367 */
1368 supdrvGipInitSetCpuFreq(pGip, nsStop - nsStart, uTscStop - uTscStart, 0);
1369 return VINF_SUCCESS;
1370 }
1371
1372 Assert(!fRough);
1373 return VERR_SUPDRV_TSC_FREQ_MEASUREMENT_FAILED;
1374}
1375
1376
1377/**
1378 * Finds our (@a idCpu) entry, or allocates a new one if not found.
1379 *
1380 * @returns Index of the CPU in the cache set.
1381 * @param pGip The GIP.
1382 * @param idCpu The CPU ID.
1383 */
1384static uint32_t supdrvGipFindOrAllocCpuIndexForCpuId(PSUPGLOBALINFOPAGE pGip, RTCPUID idCpu)
1385{
1386 uint32_t i, cTries;
1387
1388 /*
1389 * ASSUMES that CPU IDs are constant.
1390 */
1391 for (i = 0; i < pGip->cCpus; i++)
1392 if (pGip->aCPUs[i].idCpu == idCpu)
1393 return i;
1394
1395 cTries = 0;
1396 do
1397 {
1398 for (i = 0; i < pGip->cCpus; i++)
1399 {
1400 bool fRc;
1401 ASMAtomicCmpXchgSize(&pGip->aCPUs[i].idCpu, idCpu, NIL_RTCPUID, fRc);
1402 if (fRc)
1403 return i;
1404 }
1405 } while (cTries++ < 32);
1406 AssertReleaseFailed();
1407 return i - 1;
1408}
1409
1410
1411/**
1412 * The calling CPU should be accounted as online, update GIP accordingly.
1413 *
1414 * This is used by supdrvGipCreate() as well as supdrvGipMpEvent().
1415 *
1416 * @param pDevExt The device extension.
1417 * @param idCpu The CPU ID.
1418 */
1419static void supdrvGipMpEventOnlineOrInitOnCpu(PSUPDRVDEVEXT pDevExt, RTCPUID idCpu)
1420{
1421 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
1422 int iCpuSet = 0;
1423 uint32_t idApic;
1424 uint32_t i = 0;
1425 uint64_t u64NanoTS = 0;
1426
1427 AssertPtrReturnVoid(pGip);
1428 Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD));
1429 AssertRelease(idCpu == RTMpCpuId());
1430 Assert(pGip->cPossibleCpus == RTMpGetCount());
1431
1432 /*
1433 * Do this behind a spinlock with interrupts disabled as this can fire
1434 * on all CPUs simultaneously, see @bugref{6110}.
1435 */
1436 RTSpinlockAcquire(pDevExt->hGipSpinlock);
1437
1438 /*
1439 * Update the globals.
1440 */
1441 ASMAtomicWriteU16(&pGip->cPresentCpus, RTMpGetPresentCount());
1442 ASMAtomicWriteU16(&pGip->cOnlineCpus, RTMpGetOnlineCount());
1443 iCpuSet = RTMpCpuIdToSetIndex(idCpu);
1444 if (iCpuSet >= 0)
1445 {
1446 Assert(RTCpuSetIsMemberByIndex(&pGip->PossibleCpuSet, iCpuSet));
1447 RTCpuSetAddByIndex(&pGip->OnlineCpuSet, iCpuSet);
1448 RTCpuSetAddByIndex(&pGip->PresentCpuSet, iCpuSet);
1449 }
1450
1451 /*
1452 * Update the entry.
1453 */
1454 u64NanoTS = RTTimeSystemNanoTS() - pGip->u32UpdateIntervalNS;
1455 i = supdrvGipFindOrAllocCpuIndexForCpuId(pGip, idCpu);
1456
1457 supdrvGipInitCpu(pGip, &pGip->aCPUs[i], u64NanoTS, pGip->u64CpuHz);
1458
1459 idApic = supdrvGipGetApicIdSlow();
1460 ASMAtomicWriteU16(&pGip->aCPUs[i].idApic, idApic);
1461 ASMAtomicWriteS16(&pGip->aCPUs[i].iCpuSet, (int16_t)iCpuSet);
1462 ASMAtomicWriteSize(&pGip->aCPUs[i].idCpu, idCpu);
1463
1464 pGip->aCPUs[i].iCpuGroup = 0;
1465 pGip->aCPUs[i].iCpuGroupMember = iCpuSet;
1466#ifdef RT_OS_WINDOWS
1467 supdrvOSGipInitGroupBitsForCpu(pDevExt, pGip, &pGip->aCPUs[i]);
1468#endif
1469
1470 /*
1471 * Update the APIC ID and CPU set index mappings.
1472 */
1473 if (idApic < RT_ELEMENTS(pGip->aiCpuFromApicId))
1474 ASMAtomicWriteU16(&pGip->aiCpuFromApicId[idApic], i);
1475 else
1476 LogRelMax(64, ("supdrvGipMpEventOnlineOrInitOnCpu: idApic=%#x is out of bounds (%#zx, i=%u, iCpuSet=%d)\n",
1477 idApic, RT_ELEMENTS(pGip->aiCpuFromApicId), i, iCpuSet));
1478 if ((unsigned)iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx))
1479 ASMAtomicWriteU16(&pGip->aiCpuFromCpuSetIdx[iCpuSet], i);
1480 else
1481 LogRelMax(64, ("supdrvGipMpEventOnlineOrInitOnCpu: iCpuSet=%d is out of bounds (%#zx, i=%u, idApic=%d)\n",
1482 iCpuSet, RT_ELEMENTS(pGip->aiCpuFromApicId), i, idApic));
1483
1484 /* Add this CPU to this set of CPUs we need to calculate the TSC-delta for. */
1485 RTCpuSetAddByIndex(&pDevExt->TscDeltaCpuSet, RTMpCpuIdToSetIndex(idCpu));
1486
1487 /* Update the Mp online/offline counter. */
1488 ASMAtomicIncU32(&pDevExt->cMpOnOffEvents);
1489
1490 /* Commit it. */
1491 ASMAtomicWriteSize(&pGip->aCPUs[i].enmState, SUPGIPCPUSTATE_ONLINE);
1492
1493 RTSpinlockRelease(pDevExt->hGipSpinlock);
1494}
1495
1496
1497/**
1498 * RTMpOnSpecific callback wrapper for supdrvGipMpEventOnlineOrInitOnCpu().
1499 *
1500 * @param idCpu The CPU ID we are running on.
1501 * @param pvUser1 Opaque pointer to the device instance data.
1502 * @param pvUser2 Not used.
1503 */
1504static DECLCALLBACK(void) supdrvGipMpEventOnlineCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
1505{
1506 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser1;
1507 NOREF(pvUser2);
1508 supdrvGipMpEventOnlineOrInitOnCpu(pDevExt, idCpu);
1509}
1510
1511
1512/**
1513 * The CPU should be accounted as offline, update the GIP accordingly.
1514 *
1515 * This is used by supdrvGipMpEvent.
1516 *
1517 * @param pDevExt The device extension.
1518 * @param idCpu The CPU ID.
1519 */
1520static void supdrvGipMpEventOffline(PSUPDRVDEVEXT pDevExt, RTCPUID idCpu)
1521{
1522 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
1523 int iCpuSet;
1524 unsigned i;
1525
1526 AssertPtrReturnVoid(pGip);
1527 RTSpinlockAcquire(pDevExt->hGipSpinlock);
1528
1529 iCpuSet = RTMpCpuIdToSetIndex(idCpu);
1530 AssertReturnVoid(iCpuSet >= 0);
1531
1532 i = pGip->aiCpuFromCpuSetIdx[iCpuSet];
1533 AssertReturnVoid(i < pGip->cCpus);
1534 AssertReturnVoid(pGip->aCPUs[i].idCpu == idCpu);
1535
1536 Assert(RTCpuSetIsMemberByIndex(&pGip->PossibleCpuSet, iCpuSet));
1537 RTCpuSetDelByIndex(&pGip->OnlineCpuSet, iCpuSet);
1538
1539 /* Update the Mp online/offline counter. */
1540 ASMAtomicIncU32(&pDevExt->cMpOnOffEvents);
1541
1542 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
1543 {
1544 /* Reset the TSC delta, we will recalculate it lazily. */
1545 ASMAtomicWriteS64(&pGip->aCPUs[i].i64TSCDelta, INT64_MAX);
1546 /* Remove this CPU from the set of CPUs that we have obtained the TSC deltas. */
1547 RTCpuSetDelByIndex(&pDevExt->TscDeltaObtainedCpuSet, iCpuSet);
1548 }
1549
1550 /* Commit it. */
1551 ASMAtomicWriteSize(&pGip->aCPUs[i].enmState, SUPGIPCPUSTATE_OFFLINE);
1552
1553 RTSpinlockRelease(pDevExt->hGipSpinlock);
1554}
1555
1556
1557/**
1558 * Multiprocessor event notification callback.
1559 *
1560 * This is used to make sure that the GIP master gets passed on to
1561 * another CPU. It also updates the associated CPU data.
1562 *
1563 * @param enmEvent The event.
1564 * @param idCpu The cpu it applies to.
1565 * @param pvUser Pointer to the device extension.
1566 */
1567static DECLCALLBACK(void) supdrvGipMpEvent(RTMPEVENT enmEvent, RTCPUID idCpu, void *pvUser)
1568{
1569 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
1570 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
1571
1572 if (pGip)
1573 {
1574 RTTHREADPREEMPTSTATE PreemptState = RTTHREADPREEMPTSTATE_INITIALIZER;
1575 switch (enmEvent)
1576 {
1577 case RTMPEVENT_ONLINE:
1578 {
1579 RTThreadPreemptDisable(&PreemptState);
1580 if (idCpu == RTMpCpuId())
1581 {
1582 supdrvGipMpEventOnlineOrInitOnCpu(pDevExt, idCpu);
1583 RTThreadPreemptRestore(&PreemptState);
1584 }
1585 else
1586 {
1587 RTThreadPreemptRestore(&PreemptState);
1588 RTMpOnSpecific(idCpu, supdrvGipMpEventOnlineCallback, pDevExt, NULL /* pvUser2 */);
1589 }
1590
1591 /*
1592 * Recompute TSC-delta for the newly online'd CPU.
1593 */
1594 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
1595 {
1596#ifdef SUPDRV_USE_TSC_DELTA_THREAD
1597 supdrvTscDeltaThreadStartMeasurement(pDevExt, false /* fForceAll */);
1598#else
1599 uint32_t iCpu = supdrvGipFindOrAllocCpuIndexForCpuId(pGip, idCpu);
1600 supdrvTscMeasureDeltaOne(pDevExt, iCpu);
1601#endif
1602 }
1603 break;
1604 }
1605
1606 case RTMPEVENT_OFFLINE:
1607 supdrvGipMpEventOffline(pDevExt, idCpu);
1608 break;
1609 }
1610 }
1611
1612 /*
1613 * Make sure there is a master GIP.
1614 */
1615 if (enmEvent == RTMPEVENT_OFFLINE)
1616 {
1617 RTCPUID idGipMaster = ASMAtomicReadU32(&pDevExt->idGipMaster);
1618 if (idGipMaster == idCpu)
1619 {
1620 /*
1621 * The GIP master is going offline, find a new one.
1622 */
1623 bool fIgnored;
1624 unsigned i;
1625 RTCPUID idNewGipMaster = NIL_RTCPUID;
1626 RTCPUSET OnlineCpus;
1627 RTMpGetOnlineSet(&OnlineCpus);
1628
1629 for (i = 0; i < RTCPUSET_MAX_CPUS; i++)
1630 if (RTCpuSetIsMemberByIndex(&OnlineCpus, i))
1631 {
1632 RTCPUID idCurCpu = RTMpCpuIdFromSetIndex(i);
1633 if (idCurCpu != idGipMaster)
1634 {
1635 idNewGipMaster = idCurCpu;
1636 break;
1637 }
1638 }
1639
1640 Log(("supdrvGipMpEvent: Gip master %#lx -> %#lx\n", (long)idGipMaster, (long)idNewGipMaster));
1641 ASMAtomicCmpXchgSize(&pDevExt->idGipMaster, idNewGipMaster, idGipMaster, fIgnored);
1642 NOREF(fIgnored);
1643 }
1644 }
1645}
1646
1647
1648/**
1649 * On CPU initialization callback for RTMpOnAll.
1650 *
1651 * @param idCpu The CPU ID.
1652 * @param pvUser1 The device extension.
1653 * @param pvUser2 The GIP.
1654 */
1655static DECLCALLBACK(void) supdrvGipInitOnCpu(RTCPUID idCpu, void *pvUser1, void *pvUser2)
1656{
1657 /* This is good enough, even though it will update some of the globals a
1658 bit to much. */
1659 supdrvGipMpEventOnlineOrInitOnCpu((PSUPDRVDEVEXT)pvUser1, idCpu);
1660 NOREF(pvUser2);
1661}
1662
1663
1664/**
1665 * Callback used by supdrvDetermineAsyncTSC to read the TSC on a CPU.
1666 *
1667 * @param idCpu Ignored.
1668 * @param pvUser1 Where to put the TSC.
1669 * @param pvUser2 Ignored.
1670 */
1671static DECLCALLBACK(void) supdrvGipInitDetermineAsyncTscWorker(RTCPUID idCpu, void *pvUser1, void *pvUser2)
1672{
1673 Assert(RTMpCpuIdToSetIndex(idCpu) == (intptr_t)pvUser2);
1674 ASMAtomicWriteU64((uint64_t volatile *)pvUser1, ASMReadTSC());
1675 RT_NOREF2(idCpu, pvUser2);
1676}
1677
1678
1679/**
1680 * Determine if Async GIP mode is required because of TSC drift.
1681 *
1682 * When using the default/normal timer code it is essential that the time stamp counter
1683 * (TSC) runs never backwards, that is, a read operation to the counter should return
1684 * a bigger value than any previous read operation. This is guaranteed by the latest
1685 * AMD CPUs and by newer Intel CPUs which never enter the C2 state (P4). In any other
1686 * case we have to choose the asynchronous timer mode.
1687 *
1688 * @param poffMin Pointer to the determined difference between different
1689 * cores (optional, can be NULL).
1690 * @return false if the time stamp counters appear to be synchronized, true otherwise.
1691 */
1692static bool supdrvGipInitDetermineAsyncTsc(uint64_t *poffMin)
1693{
1694 /*
1695 * Just iterate all the cpus 8 times and make sure that the TSC is
1696 * ever increasing. We don't bother taking TSC rollover into account.
1697 */
1698 int iEndCpu = RTMpGetArraySize();
1699 int iCpu;
1700 int cLoops = 8;
1701 bool fAsync = false;
1702 int rc = VINF_SUCCESS;
1703 uint64_t offMax = 0;
1704 uint64_t offMin = ~(uint64_t)0;
1705 uint64_t PrevTsc = ASMReadTSC();
1706
1707 while (cLoops-- > 0)
1708 {
1709 for (iCpu = 0; iCpu < iEndCpu; iCpu++)
1710 {
1711 uint64_t CurTsc;
1712 rc = RTMpOnSpecific(RTMpCpuIdFromSetIndex(iCpu), supdrvGipInitDetermineAsyncTscWorker,
1713 &CurTsc, (void *)(uintptr_t)iCpu);
1714 if (RT_SUCCESS(rc))
1715 {
1716 if (CurTsc <= PrevTsc)
1717 {
1718 fAsync = true;
1719 offMin = offMax = PrevTsc - CurTsc;
1720 Log(("supdrvGipInitDetermineAsyncTsc: iCpu=%d cLoops=%d CurTsc=%llx PrevTsc=%llx\n",
1721 iCpu, cLoops, CurTsc, PrevTsc));
1722 break;
1723 }
1724
1725 /* Gather statistics (except the first time). */
1726 if (iCpu != 0 || cLoops != 7)
1727 {
1728 uint64_t off = CurTsc - PrevTsc;
1729 if (off < offMin)
1730 offMin = off;
1731 if (off > offMax)
1732 offMax = off;
1733 Log2(("%d/%d: off=%llx\n", cLoops, iCpu, off));
1734 }
1735
1736 /* Next */
1737 PrevTsc = CurTsc;
1738 }
1739 else if (rc == VERR_NOT_SUPPORTED)
1740 break;
1741 else
1742 AssertMsg(rc == VERR_CPU_NOT_FOUND || rc == VERR_CPU_OFFLINE, ("%d\n", rc));
1743 }
1744
1745 /* broke out of the loop. */
1746 if (iCpu < iEndCpu)
1747 break;
1748 }
1749
1750 if (poffMin)
1751 *poffMin = offMin; /* Almost RTMpOnSpecific profiling. */
1752 Log(("supdrvGipInitDetermineAsyncTsc: returns %d; iEndCpu=%d rc=%d offMin=%llx offMax=%llx\n",
1753 fAsync, iEndCpu, rc, offMin, offMax));
1754#if !defined(RT_OS_SOLARIS) && !defined(RT_OS_OS2) && !defined(RT_OS_WINDOWS)
1755 OSDBGPRINT(("vboxdrv: fAsync=%d offMin=%#lx offMax=%#lx\n", fAsync, (long)offMin, (long)offMax));
1756#endif
1757 return fAsync;
1758}
1759
1760
1761/**
1762 * supdrvGipInit() worker that determines the GIP TSC mode.
1763 *
1764 * @returns The most suitable TSC mode.
1765 * @param pDevExt Pointer to the device instance data.
1766 */
1767static SUPGIPMODE supdrvGipInitDetermineTscMode(PSUPDRVDEVEXT pDevExt)
1768{
1769 uint64_t u64DiffCoresIgnored;
1770 uint32_t uEAX, uEBX, uECX, uEDX;
1771
1772 /*
1773 * Establish whether the CPU advertises TSC as invariant, we need that in
1774 * a couple of places below.
1775 */
1776 bool fInvariantTsc = false;
1777 if (ASMHasCpuId())
1778 {
1779 uEAX = ASMCpuId_EAX(0x80000000);
1780 if (RTX86IsValidExtRange(uEAX) && uEAX >= 0x80000007)
1781 {
1782 uEDX = ASMCpuId_EDX(0x80000007);
1783 if (uEDX & X86_CPUID_AMD_ADVPOWER_EDX_TSCINVAR)
1784 fInvariantTsc = true;
1785 }
1786 }
1787
1788 /*
1789 * On single CPU systems, we don't need to consider ASYNC mode.
1790 */
1791 if (RTMpGetCount() <= 1)
1792 return fInvariantTsc ? SUPGIPMODE_INVARIANT_TSC : SUPGIPMODE_SYNC_TSC;
1793
1794 /*
1795 * Allow the user and/or OS specific bits to force async mode.
1796 */
1797 if (supdrvOSGetForcedAsyncTscMode(pDevExt))
1798 return SUPGIPMODE_ASYNC_TSC;
1799
1800 /*
1801 * Use invariant mode if the CPU says TSC is invariant.
1802 */
1803 if (fInvariantTsc)
1804 return SUPGIPMODE_INVARIANT_TSC;
1805
1806 /*
1807 * TSC is not invariant and we're on SMP, this presents two problems:
1808 *
1809 * (1) There might be a skew between the CPU, so that cpu0
1810 * returns a TSC that is slightly different from cpu1.
1811 * This screw may be due to (2), bad TSC initialization
1812 * or slightly different TSC rates.
1813 *
1814 * (2) Power management (and other things) may cause the TSC
1815 * to run at a non-constant speed, and cause the speed
1816 * to be different on the cpus. This will result in (1).
1817 *
1818 * If any of the above is detected, we will have to use ASYNC mode.
1819 */
1820 /* (1). Try check for current differences between the cpus. */
1821 if (supdrvGipInitDetermineAsyncTsc(&u64DiffCoresIgnored))
1822 return SUPGIPMODE_ASYNC_TSC;
1823
1824 /* (2) If it's an AMD CPU with power management, we won't trust its TSC. */
1825 ASMCpuId(0, &uEAX, &uEBX, &uECX, &uEDX);
1826 if ( RTX86IsValidStdRange(uEAX)
1827 && (RTX86IsAmdCpu(uEBX, uECX, uEDX) || RTX86IsHygonCpu(uEBX, uECX, uEDX)) )
1828 {
1829 /* Check for APM support. */
1830 uEAX = ASMCpuId_EAX(0x80000000);
1831 if (RTX86IsValidExtRange(uEAX) && uEAX >= 0x80000007)
1832 {
1833 uEDX = ASMCpuId_EDX(0x80000007);
1834 if (uEDX & 0x3e) /* STC|TM|THERMTRIP|VID|FID. Ignore TS. */
1835 return SUPGIPMODE_ASYNC_TSC;
1836 }
1837 }
1838
1839 return SUPGIPMODE_SYNC_TSC;
1840}
1841
1842
1843/**
1844 * Initializes per-CPU GIP information.
1845 *
1846 * @param pGip Pointer to the GIP.
1847 * @param pCpu Pointer to which GIP CPU to initialize.
1848 * @param u64NanoTS The current nanosecond timestamp.
1849 * @param uCpuHz The CPU frequency to set, 0 if the caller doesn't know.
1850 */
1851static void supdrvGipInitCpu(PSUPGLOBALINFOPAGE pGip, PSUPGIPCPU pCpu, uint64_t u64NanoTS, uint64_t uCpuHz)
1852{
1853 pCpu->u32TransactionId = 2;
1854 pCpu->u64NanoTS = u64NanoTS;
1855 pCpu->u64TSC = ASMReadTSC();
1856 pCpu->u64TSCSample = GIP_TSC_DELTA_RSVD;
1857 pCpu->i64TSCDelta = pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED ? INT64_MAX : 0;
1858
1859 ASMAtomicWriteSize(&pCpu->enmState, SUPGIPCPUSTATE_INVALID);
1860 ASMAtomicWriteU32(&pCpu->idCpu, NIL_RTCPUID);
1861 ASMAtomicWriteS16(&pCpu->iCpuSet, -1);
1862 ASMAtomicWriteU16(&pCpu->iCpuGroup, 0);
1863 ASMAtomicWriteU16(&pCpu->iCpuGroupMember, UINT16_MAX);
1864 ASMAtomicWriteU16(&pCpu->idApic, UINT16_MAX);
1865 ASMAtomicWriteU32(&pCpu->iReservedForNumaNode, 0);
1866
1867 /*
1868 * The first time we're called, we don't have a CPU frequency handy,
1869 * so pretend it's a 4 GHz CPU. On CPUs that are online, we'll get
1870 * called again and at that point we have a more plausible CPU frequency
1871 * value handy. The frequency history will also be adjusted again on
1872 * the 2nd timer callout (maybe we can skip that now?).
1873 */
1874 if (!uCpuHz)
1875 {
1876 pCpu->u64CpuHz = _4G - 1;
1877 pCpu->u32UpdateIntervalTSC = (uint32_t)((_4G - 1) / pGip->u32UpdateHz);
1878 }
1879 else
1880 {
1881 pCpu->u64CpuHz = uCpuHz;
1882 pCpu->u32UpdateIntervalTSC = (uint32_t)(uCpuHz / pGip->u32UpdateHz);
1883 }
1884 pCpu->au32TSCHistory[0]
1885 = pCpu->au32TSCHistory[1]
1886 = pCpu->au32TSCHistory[2]
1887 = pCpu->au32TSCHistory[3]
1888 = pCpu->au32TSCHistory[4]
1889 = pCpu->au32TSCHistory[5]
1890 = pCpu->au32TSCHistory[6]
1891 = pCpu->au32TSCHistory[7]
1892 = pCpu->u32UpdateIntervalTSC;
1893}
1894
1895
1896/**
1897 * Initializes the GIP data.
1898 *
1899 * @returns VBox status code.
1900 * @param pDevExt Pointer to the device instance data.
1901 * @param pGip Pointer to the read-write kernel mapping of the GIP.
1902 * @param HCPhys The physical address of the GIP.
1903 * @param u64NanoTS The current nanosecond timestamp.
1904 * @param uUpdateHz The update frequency.
1905 * @param uUpdateIntervalNS The update interval in nanoseconds.
1906 * @param cCpus The CPU count.
1907 * @param cbGipCpuGroups The supdrvOSGipGetGroupTableSize return value we
1908 * used when allocating the GIP structure.
1909 */
1910static int supdrvGipInit(PSUPDRVDEVEXT pDevExt, PSUPGLOBALINFOPAGE pGip, RTHCPHYS HCPhys,
1911 uint64_t u64NanoTS, unsigned uUpdateHz, unsigned uUpdateIntervalNS,
1912 unsigned cCpus, size_t cbGipCpuGroups)
1913{
1914 size_t const cbGip = RT_ALIGN_Z(RT_UOFFSETOF_DYN(SUPGLOBALINFOPAGE, aCPUs[cCpus]) + cbGipCpuGroups, PAGE_SIZE);
1915 unsigned i;
1916#ifdef DEBUG_DARWIN_GIP
1917 OSDBGPRINT(("supdrvGipInit: pGip=%p HCPhys=%lx u64NanoTS=%llu uUpdateHz=%d cCpus=%u\n", pGip, (long)HCPhys, u64NanoTS, uUpdateHz, cCpus));
1918#else
1919 LogFlow(("supdrvGipInit: pGip=%p HCPhys=%lx u64NanoTS=%llu uUpdateHz=%d cCpus=%u\n", pGip, (long)HCPhys, u64NanoTS, uUpdateHz, cCpus));
1920#endif
1921
1922 /*
1923 * Initialize the structure.
1924 */
1925 memset(pGip, 0, cbGip);
1926
1927 pGip->u32Magic = SUPGLOBALINFOPAGE_MAGIC;
1928 pGip->u32Version = SUPGLOBALINFOPAGE_VERSION;
1929 pGip->u32Mode = supdrvGipInitDetermineTscMode(pDevExt);
1930 if ( pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC
1931 /*|| pGip->u32Mode == SUPGIPMODE_SYNC_TSC */)
1932 pGip->enmUseTscDelta = supdrvOSAreTscDeltasInSync() /* Allow OS override (windows). */
1933 ? SUPGIPUSETSCDELTA_ZERO_CLAIMED : SUPGIPUSETSCDELTA_PRACTICALLY_ZERO /* downgrade later */;
1934 else
1935 pGip->enmUseTscDelta = SUPGIPUSETSCDELTA_NOT_APPLICABLE;
1936 pGip->cCpus = (uint16_t)cCpus;
1937 pGip->cPages = (uint16_t)(cbGip / PAGE_SIZE);
1938 pGip->u32UpdateHz = uUpdateHz;
1939 pGip->u32UpdateIntervalNS = uUpdateIntervalNS;
1940 pGip->fGetGipCpu = SUPGIPGETCPU_APIC_ID;
1941 RTCpuSetEmpty(&pGip->OnlineCpuSet);
1942 RTCpuSetEmpty(&pGip->PresentCpuSet);
1943 RTMpGetSet(&pGip->PossibleCpuSet);
1944 pGip->cOnlineCpus = RTMpGetOnlineCount();
1945 pGip->cPresentCpus = RTMpGetPresentCount();
1946 pGip->cPossibleCpus = RTMpGetCount();
1947 pGip->cPossibleCpuGroups = 1;
1948 pGip->idCpuMax = RTMpGetMaxCpuId();
1949 for (i = 0; i < RT_ELEMENTS(pGip->aiCpuFromApicId); i++)
1950 pGip->aiCpuFromApicId[i] = UINT16_MAX;
1951 for (i = 0; i < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx); i++)
1952 pGip->aiCpuFromCpuSetIdx[i] = UINT16_MAX;
1953 for (i = 0; i < RT_ELEMENTS(pGip->aoffCpuGroup); i++)
1954 pGip->aoffCpuGroup[i] = UINT32_MAX;
1955 for (i = 0; i < cCpus; i++)
1956 supdrvGipInitCpu(pGip, &pGip->aCPUs[i], u64NanoTS, 0 /*uCpuHz*/);
1957#ifdef RT_OS_WINDOWS
1958 int rc = supdrvOSInitGipGroupTable(pDevExt, pGip, cbGipCpuGroups);
1959 AssertRCReturn(rc, rc);
1960#endif
1961
1962 /*
1963 * Link it to the device extension.
1964 */
1965 pDevExt->pGip = pGip;
1966 pDevExt->HCPhysGip = HCPhys;
1967 pDevExt->cGipUsers = 0;
1968
1969 return VINF_SUCCESS;
1970}
1971
1972
1973/**
1974 * Creates the GIP.
1975 *
1976 * @returns VBox status code.
1977 * @param pDevExt Instance data. GIP stuff may be updated.
1978 */
1979int VBOXCALL supdrvGipCreate(PSUPDRVDEVEXT pDevExt)
1980{
1981 PSUPGLOBALINFOPAGE pGip;
1982 size_t cbGip;
1983 size_t cbGipCpuGroups;
1984 RTHCPHYS HCPhysGip;
1985 uint32_t u32SystemResolution;
1986 uint32_t u32Interval;
1987 uint32_t u32MinInterval;
1988 uint32_t uMod;
1989 unsigned cCpus;
1990 int rc;
1991
1992 LogFlow(("supdrvGipCreate:\n"));
1993
1994 /*
1995 * Assert order.
1996 */
1997 Assert(pDevExt->u32SystemTimerGranularityGrant == 0);
1998 Assert(pDevExt->GipMemObj == NIL_RTR0MEMOBJ);
1999 Assert(!pDevExt->pGipTimer);
2000#ifdef SUPDRV_USE_MUTEX_FOR_GIP
2001 Assert(pDevExt->mtxGip != NIL_RTSEMMUTEX);
2002 Assert(pDevExt->mtxTscDelta != NIL_RTSEMMUTEX);
2003#else
2004 Assert(pDevExt->mtxGip != NIL_RTSEMFASTMUTEX);
2005 Assert(pDevExt->mtxTscDelta != NIL_RTSEMFASTMUTEX);
2006#endif
2007
2008 /*
2009 * Check the CPU count.
2010 */
2011 cCpus = RTMpGetArraySize();
2012 if (cCpus > RT_MIN(RTCPUSET_MAX_CPUS, RT_ELEMENTS(pGip->aiCpuFromApicId)))
2013 {
2014 SUPR0Printf("VBoxDrv: Too many CPUs (%u) for the GIP (max %u)\n", cCpus, RT_MIN(RTCPUSET_MAX_CPUS, RT_ELEMENTS(pGip->aiCpuFromApicId)));
2015 return VERR_TOO_MANY_CPUS;
2016 }
2017
2018 /*
2019 * Allocate a contiguous set of pages with a default kernel mapping.
2020 */
2021#ifdef RT_OS_WINDOWS
2022 cbGipCpuGroups = supdrvOSGipGetGroupTableSize(pDevExt);
2023#else
2024 cbGipCpuGroups = 0;
2025#endif
2026 cbGip = RT_UOFFSETOF_DYN(SUPGLOBALINFOPAGE, aCPUs[cCpus]) + cbGipCpuGroups;
2027 rc = RTR0MemObjAllocCont(&pDevExt->GipMemObj, cbGip, NIL_RTHCPHYS /*PhysHighest*/, false /*fExecutable*/);
2028 if (RT_FAILURE(rc))
2029 {
2030 OSDBGPRINT(("supdrvGipCreate: failed to allocate the GIP page. rc=%d\n", rc));
2031 return rc;
2032 }
2033 pGip = (PSUPGLOBALINFOPAGE)RTR0MemObjAddress(pDevExt->GipMemObj); AssertPtr(pGip);
2034 HCPhysGip = RTR0MemObjGetPagePhysAddr(pDevExt->GipMemObj, 0); Assert(HCPhysGip != NIL_RTHCPHYS);
2035
2036 /*
2037 * Find a reasonable update interval and initialize the structure.
2038 */
2039 supdrvGipRequestHigherTimerFrequencyFromSystem(pDevExt);
2040 /** @todo figure out why using a 100Ms interval upsets timekeeping in VMs.
2041 * See @bugref{6710}. */
2042 u32MinInterval = RT_NS_10MS;
2043 u32SystemResolution = RTTimerGetSystemGranularity();
2044 u32Interval = u32MinInterval;
2045 uMod = u32MinInterval % u32SystemResolution;
2046 if (uMod)
2047 u32Interval += u32SystemResolution - uMod;
2048
2049 rc = supdrvGipInit(pDevExt, pGip, HCPhysGip, RTTimeSystemNanoTS(), RT_NS_1SEC / u32Interval /*=Hz*/, u32Interval,
2050 cCpus, cbGipCpuGroups);
2051
2052 /*
2053 * Important sanity check... (Sets rc)
2054 */
2055 if (RT_UNLIKELY( pGip->enmUseTscDelta == SUPGIPUSETSCDELTA_ZERO_CLAIMED
2056 && pGip->u32Mode == SUPGIPMODE_ASYNC_TSC
2057 && !supdrvOSGetForcedAsyncTscMode(pDevExt)))
2058 {
2059 OSDBGPRINT(("supdrvGipCreate: Host-OS/user claims the TSC-deltas are zero but we detected async. TSC! Bad.\n"));
2060 rc = VERR_INTERNAL_ERROR_2;
2061 }
2062
2063 /* It doesn't make sense to do TSC-delta detection on systems we detect as async. */
2064 AssertStmt( pGip->u32Mode != SUPGIPMODE_ASYNC_TSC
2065 || pGip->enmUseTscDelta <= SUPGIPUSETSCDELTA_ZERO_CLAIMED,
2066 rc = VERR_INTERNAL_ERROR_3);
2067
2068 /*
2069 * Do the TSC frequency measurements.
2070 *
2071 * If we're in invariant TSC mode, just to a quick preliminary measurement
2072 * that the TSC-delta measurement code can use to yield cross calls.
2073 *
2074 * If we're in any of the other two modes, neither which require MP init,
2075 * notifications or deltas for the job, do the full measurement now so
2076 * that supdrvGipInitOnCpu() can populate the TSC interval and history
2077 * array with more reasonable values.
2078 */
2079 if (RT_SUCCESS(rc))
2080 {
2081 if (pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC)
2082 {
2083 rc = supdrvGipInitMeasureTscFreq(pGip, true /*fRough*/); /* cannot fail */
2084 supdrvGipInitStartTimerForRefiningInvariantTscFreq(pDevExt);
2085 }
2086 else
2087 rc = supdrvGipInitMeasureTscFreq(pGip, false /*fRough*/);
2088 if (RT_SUCCESS(rc))
2089 {
2090 /*
2091 * Start TSC-delta measurement thread before we start getting MP
2092 * events that will try kick it into action (includes the
2093 * RTMpOnAll/supdrvGipInitOnCpu call below).
2094 */
2095 RTCpuSetEmpty(&pDevExt->TscDeltaCpuSet);
2096 RTCpuSetEmpty(&pDevExt->TscDeltaObtainedCpuSet);
2097#ifdef SUPDRV_USE_TSC_DELTA_THREAD
2098 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
2099 rc = supdrvTscDeltaThreadInit(pDevExt);
2100#endif
2101 if (RT_SUCCESS(rc))
2102 {
2103 rc = RTMpNotificationRegister(supdrvGipMpEvent, pDevExt);
2104 if (RT_SUCCESS(rc))
2105 {
2106 /*
2107 * Do GIP initialization on all online CPUs. Wake up the
2108 * TSC-delta thread afterwards.
2109 */
2110 rc = RTMpOnAll(supdrvGipInitOnCpu, pDevExt, pGip);
2111 if (RT_SUCCESS(rc))
2112 {
2113#ifdef SUPDRV_USE_TSC_DELTA_THREAD
2114 supdrvTscDeltaThreadStartMeasurement(pDevExt, true /* fForceAll */);
2115#else
2116 uint16_t iCpu;
2117 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
2118 {
2119 /*
2120 * Measure the TSC deltas now that we have MP notifications.
2121 */
2122 int cTries = 5;
2123 do
2124 {
2125 rc = supdrvTscMeasureInitialDeltas(pDevExt);
2126 if ( rc != VERR_TRY_AGAIN
2127 && rc != VERR_CPU_OFFLINE)
2128 break;
2129 } while (--cTries > 0);
2130 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
2131 Log(("supdrvTscDeltaInit: cpu[%u] delta %lld\n", iCpu, pGip->aCPUs[iCpu].i64TSCDelta));
2132 }
2133 else
2134 {
2135 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
2136 AssertMsg(!pGip->aCPUs[iCpu].i64TSCDelta, ("iCpu=%u %lld mode=%d\n", iCpu, pGip->aCPUs[iCpu].i64TSCDelta, pGip->u32Mode));
2137 }
2138 if (RT_SUCCESS(rc))
2139#endif
2140 {
2141 /*
2142 * Create the timer.
2143 * If CPU_ALL isn't supported we'll have to fall back to synchronous mode.
2144 */
2145 if (pGip->u32Mode == SUPGIPMODE_ASYNC_TSC)
2146 {
2147 rc = RTTimerCreateEx(&pDevExt->pGipTimer, u32Interval, RTTIMER_FLAGS_CPU_ALL,
2148 supdrvGipAsyncTimer, pDevExt);
2149 if (rc == VERR_NOT_SUPPORTED)
2150 {
2151 OSDBGPRINT(("supdrvGipCreate: omni timer not supported, falling back to synchronous mode\n"));
2152 pGip->u32Mode = SUPGIPMODE_SYNC_TSC;
2153 }
2154 }
2155 if (pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
2156 rc = RTTimerCreateEx(&pDevExt->pGipTimer, u32Interval, 0 /* fFlags */,
2157 supdrvGipSyncAndInvariantTimer, pDevExt);
2158 if (RT_SUCCESS(rc))
2159 {
2160 /*
2161 * We're good.
2162 */
2163 Log(("supdrvGipCreate: %u ns interval.\n", u32Interval));
2164 supdrvGipReleaseHigherTimerFrequencyFromSystem(pDevExt);
2165
2166 g_pSUPGlobalInfoPage = pGip;
2167 return VINF_SUCCESS;
2168 }
2169
2170 OSDBGPRINT(("supdrvGipCreate: failed create GIP timer at %u ns interval. rc=%Rrc\n", u32Interval, rc));
2171 Assert(!pDevExt->pGipTimer);
2172 }
2173 }
2174 else
2175 OSDBGPRINT(("supdrvGipCreate: RTMpOnAll failed. rc=%Rrc\n", rc));
2176 }
2177 else
2178 OSDBGPRINT(("supdrvGipCreate: failed to register MP event notfication. rc=%Rrc\n", rc));
2179 }
2180 else
2181 OSDBGPRINT(("supdrvGipCreate: supdrvTscDeltaInit failed. rc=%Rrc\n", rc));
2182 }
2183 else
2184 OSDBGPRINT(("supdrvGipCreate: supdrvTscMeasureInitialDeltas failed. rc=%Rrc\n", rc));
2185 }
2186
2187 /* Releases timer frequency increase too. */
2188 supdrvGipDestroy(pDevExt);
2189 return rc;
2190}
2191
2192
2193/**
2194 * Invalidates the GIP data upon termination.
2195 *
2196 * @param pGip Pointer to the read-write kernel mapping of the GIP.
2197 */
2198static void supdrvGipTerm(PSUPGLOBALINFOPAGE pGip)
2199{
2200 unsigned i;
2201 pGip->u32Magic = 0;
2202 for (i = 0; i < pGip->cCpus; i++)
2203 {
2204 pGip->aCPUs[i].u64NanoTS = 0;
2205 pGip->aCPUs[i].u64TSC = 0;
2206 pGip->aCPUs[i].iTSCHistoryHead = 0;
2207 pGip->aCPUs[i].u64TSCSample = 0;
2208 pGip->aCPUs[i].i64TSCDelta = INT64_MAX;
2209 }
2210}
2211
2212
2213/**
2214 * Terminates the GIP.
2215 *
2216 * @param pDevExt Instance data. GIP stuff may be updated.
2217 */
2218void VBOXCALL supdrvGipDestroy(PSUPDRVDEVEXT pDevExt)
2219{
2220 int rc;
2221#ifdef DEBUG_DARWIN_GIP
2222 OSDBGPRINT(("supdrvGipDestroy: pDevExt=%p pGip=%p pGipTimer=%p GipMemObj=%p\n", pDevExt,
2223 pDevExt->GipMemObj != NIL_RTR0MEMOBJ ? RTR0MemObjAddress(pDevExt->GipMemObj) : NULL,
2224 pDevExt->pGipTimer, pDevExt->GipMemObj));
2225#endif
2226
2227 /*
2228 * Stop receiving MP notifications before tearing anything else down.
2229 */
2230 RTMpNotificationDeregister(supdrvGipMpEvent, pDevExt);
2231
2232#ifdef SUPDRV_USE_TSC_DELTA_THREAD
2233 /*
2234 * Terminate the TSC-delta measurement thread and resources.
2235 */
2236 supdrvTscDeltaTerm(pDevExt);
2237#endif
2238
2239 /*
2240 * Destroy the TSC-refinement timer.
2241 */
2242 if (pDevExt->pInvarTscRefineTimer)
2243 {
2244 RTTimerDestroy(pDevExt->pInvarTscRefineTimer);
2245 pDevExt->pInvarTscRefineTimer = NULL;
2246 }
2247
2248 /*
2249 * Invalid the GIP data.
2250 */
2251 if (pDevExt->pGip)
2252 {
2253 supdrvGipTerm(pDevExt->pGip);
2254 pDevExt->pGip = NULL;
2255 }
2256 g_pSUPGlobalInfoPage = NULL;
2257
2258 /*
2259 * Destroy the timer and free the GIP memory object.
2260 */
2261 if (pDevExt->pGipTimer)
2262 {
2263 rc = RTTimerDestroy(pDevExt->pGipTimer); AssertRC(rc);
2264 pDevExt->pGipTimer = NULL;
2265 }
2266
2267 if (pDevExt->GipMemObj != NIL_RTR0MEMOBJ)
2268 {
2269 rc = RTR0MemObjFree(pDevExt->GipMemObj, true /* free mappings */); AssertRC(rc);
2270 pDevExt->GipMemObj = NIL_RTR0MEMOBJ;
2271 }
2272
2273 /*
2274 * Finally, make sure we've release the system timer resolution request
2275 * if one actually succeeded and is still pending.
2276 */
2277 supdrvGipReleaseHigherTimerFrequencyFromSystem(pDevExt);
2278}
2279
2280
2281
2282
2283/*
2284 *
2285 *
2286 * GIP Update Timer Related Code
2287 * GIP Update Timer Related Code
2288 * GIP Update Timer Related Code
2289 *
2290 *
2291 */
2292
2293
2294/**
2295 * Worker routine for supdrvGipUpdate() and supdrvGipUpdatePerCpu() that
2296 * updates all the per cpu data except the transaction id.
2297 *
2298 * @param pDevExt The device extension.
2299 * @param pGipCpu Pointer to the per cpu data.
2300 * @param u64NanoTS The current time stamp.
2301 * @param u64TSC The current TSC.
2302 * @param iTick The current timer tick.
2303 *
2304 * @remarks Can be called with interrupts disabled!
2305 */
2306static void supdrvGipDoUpdateCpu(PSUPDRVDEVEXT pDevExt, PSUPGIPCPU pGipCpu, uint64_t u64NanoTS, uint64_t u64TSC, uint64_t iTick)
2307{
2308 uint64_t u64TSCDelta;
2309 bool fUpdateCpuHz;
2310 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
2311 AssertPtrReturnVoid(pGip);
2312
2313 /* Delta between this and the previous update. */
2314 ASMAtomicUoWriteU32(&pGipCpu->u32PrevUpdateIntervalNS, (uint32_t)(u64NanoTS - pGipCpu->u64NanoTS));
2315
2316 /*
2317 * Update the NanoTS.
2318 */
2319 ASMAtomicWriteU64(&pGipCpu->u64NanoTS, u64NanoTS);
2320
2321 /*
2322 * Calc TSC delta.
2323 */
2324 u64TSCDelta = u64TSC - pGipCpu->u64TSC;
2325 ASMAtomicWriteU64(&pGipCpu->u64TSC, u64TSC);
2326
2327 /*
2328 * Determine if we need to update the CPU (TSC) frequency calculation.
2329 *
2330 * We don't need to keep recalculating the frequency when it's invariant,
2331 * unless the special tstGIP-2 testing mode is enabled.
2332 */
2333 fUpdateCpuHz = pGip->u32Mode != SUPGIPMODE_INVARIANT_TSC;
2334 if (!(pGip->fFlags & SUPGIP_FLAGS_TESTING))
2335 { /* likely*/ }
2336 else
2337 {
2338 uint32_t fGipFlags = pGip->fFlags;
2339 if (fGipFlags & (SUPGIP_FLAGS_TESTING_ENABLE | SUPGIP_FLAGS_TESTING_START))
2340 {
2341 if (fGipFlags & SUPGIP_FLAGS_TESTING_START)
2342 {
2343 /* Cache the TSC frequency before forcing updates due to test mode. */
2344 if (!fUpdateCpuHz)
2345 pDevExt->uGipTestModeInvariantCpuHz = pGip->aCPUs[0].u64CpuHz;
2346 ASMAtomicAndU32(&pGip->fFlags, ~SUPGIP_FLAGS_TESTING_START);
2347 }
2348 fUpdateCpuHz = true;
2349 }
2350 else if (fGipFlags & SUPGIP_FLAGS_TESTING_STOP)
2351 {
2352 /* Restore the cached TSC frequency if any. */
2353 if (!fUpdateCpuHz)
2354 {
2355 Assert(pDevExt->uGipTestModeInvariantCpuHz);
2356 ASMAtomicWriteU64(&pGip->aCPUs[0].u64CpuHz, pDevExt->uGipTestModeInvariantCpuHz);
2357 }
2358 ASMAtomicAndU32(&pGip->fFlags, ~(SUPGIP_FLAGS_TESTING_STOP | SUPGIP_FLAGS_TESTING));
2359 }
2360 }
2361
2362 /*
2363 * Calculate the CPU (TSC) frequency if necessary.
2364 */
2365 if (fUpdateCpuHz)
2366 {
2367 uint64_t u64CpuHz;
2368 uint32_t u32UpdateIntervalTSC;
2369 uint32_t u32UpdateIntervalTSCSlack;
2370 uint32_t u32TransactionId;
2371 unsigned iTSCHistoryHead;
2372
2373 if (u64TSCDelta >> 32)
2374 {
2375 u64TSCDelta = pGipCpu->u32UpdateIntervalTSC;
2376 pGipCpu->cErrors++;
2377 }
2378
2379 /*
2380 * On the 2nd and 3rd callout, reset the history with the current TSC
2381 * interval since the values entered by supdrvGipInit are totally off.
2382 * The interval on the 1st callout completely unreliable, the 2nd is a bit
2383 * better, while the 3rd should be most reliable.
2384 */
2385 /** @todo Could we drop this now that we initializes the history
2386 * with nominal TSC frequency values? */
2387 u32TransactionId = pGipCpu->u32TransactionId;
2388 if (RT_UNLIKELY( ( u32TransactionId == 5
2389 || u32TransactionId == 7)
2390 && ( iTick == 2
2391 || iTick == 3) ))
2392 {
2393 unsigned i;
2394 for (i = 0; i < RT_ELEMENTS(pGipCpu->au32TSCHistory); i++)
2395 ASMAtomicUoWriteU32(&pGipCpu->au32TSCHistory[i], (uint32_t)u64TSCDelta);
2396 }
2397
2398 /*
2399 * Validate the NanoTS deltas between timer fires with an arbitrary threshold of 0.5%.
2400 * Wait until we have at least one full history since the above history reset. The
2401 * assumption is that the majority of the previous history values will be tolerable.
2402 * See @bugref{6710#c67}.
2403 */
2404 /** @todo Could we drop the fudging there now that we initializes the history
2405 * with nominal TSC frequency values? */
2406 if ( u32TransactionId > 23 /* 7 + (8 * 2) */
2407 && pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
2408 {
2409 uint32_t uNanoTsThreshold = pGip->u32UpdateIntervalNS / 200;
2410 if ( pGipCpu->u32PrevUpdateIntervalNS > pGip->u32UpdateIntervalNS + uNanoTsThreshold
2411 || pGipCpu->u32PrevUpdateIntervalNS < pGip->u32UpdateIntervalNS - uNanoTsThreshold)
2412 {
2413 uint32_t u32;
2414 u32 = pGipCpu->au32TSCHistory[0];
2415 u32 += pGipCpu->au32TSCHistory[1];
2416 u32 += pGipCpu->au32TSCHistory[2];
2417 u32 += pGipCpu->au32TSCHistory[3];
2418 u32 >>= 2;
2419 u64TSCDelta = pGipCpu->au32TSCHistory[4];
2420 u64TSCDelta += pGipCpu->au32TSCHistory[5];
2421 u64TSCDelta += pGipCpu->au32TSCHistory[6];
2422 u64TSCDelta += pGipCpu->au32TSCHistory[7];
2423 u64TSCDelta >>= 2;
2424 u64TSCDelta += u32;
2425 u64TSCDelta >>= 1;
2426 }
2427 }
2428
2429 /*
2430 * TSC History.
2431 */
2432 Assert(RT_ELEMENTS(pGipCpu->au32TSCHistory) == 8);
2433 iTSCHistoryHead = (pGipCpu->iTSCHistoryHead + 1) & 7;
2434 ASMAtomicWriteU32(&pGipCpu->iTSCHistoryHead, iTSCHistoryHead);
2435 ASMAtomicWriteU32(&pGipCpu->au32TSCHistory[iTSCHistoryHead], (uint32_t)u64TSCDelta);
2436
2437 /*
2438 * UpdateIntervalTSC = average of last 8,2,1 intervals depending on update HZ.
2439 *
2440 * On Windows, we have an occasional (but recurring) sour value that messed up
2441 * the history but taking only 1 interval reduces the precision overall.
2442 */
2443 if ( pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC
2444 || pGip->u32UpdateHz >= 1000)
2445 {
2446 uint32_t u32;
2447 u32 = pGipCpu->au32TSCHistory[0];
2448 u32 += pGipCpu->au32TSCHistory[1];
2449 u32 += pGipCpu->au32TSCHistory[2];
2450 u32 += pGipCpu->au32TSCHistory[3];
2451 u32 >>= 2;
2452 u32UpdateIntervalTSC = pGipCpu->au32TSCHistory[4];
2453 u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[5];
2454 u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[6];
2455 u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[7];
2456 u32UpdateIntervalTSC >>= 2;
2457 u32UpdateIntervalTSC += u32;
2458 u32UpdateIntervalTSC >>= 1;
2459
2460 /* Value chosen for a 2GHz Athlon64 running linux 2.6.10/11. */
2461 u32UpdateIntervalTSCSlack = u32UpdateIntervalTSC >> 14;
2462 }
2463 else if (pGip->u32UpdateHz >= 90)
2464 {
2465 u32UpdateIntervalTSC = (uint32_t)u64TSCDelta;
2466 u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[(iTSCHistoryHead - 1) & 7];
2467 u32UpdateIntervalTSC >>= 1;
2468
2469 /* value chosen on a 2GHz thinkpad running windows */
2470 u32UpdateIntervalTSCSlack = u32UpdateIntervalTSC >> 7;
2471 }
2472 else
2473 {
2474 u32UpdateIntervalTSC = (uint32_t)u64TSCDelta;
2475
2476 /* This value hasn't be checked yet.. waiting for OS/2 and 33Hz timers.. :-) */
2477 u32UpdateIntervalTSCSlack = u32UpdateIntervalTSC >> 6;
2478 }
2479 ASMAtomicWriteU32(&pGipCpu->u32UpdateIntervalTSC, u32UpdateIntervalTSC + u32UpdateIntervalTSCSlack);
2480
2481 /*
2482 * CpuHz.
2483 */
2484 u64CpuHz = ASMMult2xU32RetU64(u32UpdateIntervalTSC, RT_NS_1SEC);
2485 u64CpuHz /= pGip->u32UpdateIntervalNS;
2486 ASMAtomicWriteU64(&pGipCpu->u64CpuHz, u64CpuHz);
2487 }
2488}
2489
2490
2491/**
2492 * Updates the GIP.
2493 *
2494 * @param pDevExt The device extension.
2495 * @param u64NanoTS The current nanosecond timestamp.
2496 * @param u64TSC The current TSC timestamp.
2497 * @param idCpu The CPU ID.
2498 * @param iTick The current timer tick.
2499 *
2500 * @remarks Can be called with interrupts disabled!
2501 */
2502static void supdrvGipUpdate(PSUPDRVDEVEXT pDevExt, uint64_t u64NanoTS, uint64_t u64TSC, RTCPUID idCpu, uint64_t iTick)
2503{
2504 /*
2505 * Determine the relevant CPU data.
2506 */
2507 PSUPGIPCPU pGipCpu;
2508 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
2509 AssertPtrReturnVoid(pGip);
2510
2511 if (pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
2512 pGipCpu = &pGip->aCPUs[0];
2513 else
2514 {
2515 unsigned iCpu;
2516 uint32_t idApic = supdrvGipGetApicId(pGip);
2517 if (RT_LIKELY(idApic < RT_ELEMENTS(pGip->aiCpuFromApicId)))
2518 { /* likely */ }
2519 else
2520 return;
2521 iCpu = pGip->aiCpuFromApicId[idApic];
2522 if (RT_LIKELY(iCpu < pGip->cCpus))
2523 { /* likely */ }
2524 else
2525 return;
2526 pGipCpu = &pGip->aCPUs[iCpu];
2527 if (RT_LIKELY(pGipCpu->idCpu == idCpu))
2528 { /* likely */ }
2529 else
2530 return;
2531 }
2532
2533 /*
2534 * Start update transaction.
2535 */
2536 if (!(ASMAtomicIncU32(&pGipCpu->u32TransactionId) & 1))
2537 {
2538 /* this can happen on win32 if we're taking to long and there are more CPUs around. shouldn't happen though. */
2539 AssertMsgFailed(("Invalid transaction id, %#x, not odd!\n", pGipCpu->u32TransactionId));
2540 ASMAtomicIncU32(&pGipCpu->u32TransactionId);
2541 pGipCpu->cErrors++;
2542 return;
2543 }
2544
2545 /*
2546 * Recalc the update frequency every 0x800th time.
2547 */
2548 if ( pGip->u32Mode != SUPGIPMODE_INVARIANT_TSC /* cuz we're not recalculating the frequency on invariant hosts. */
2549 && !(pGipCpu->u32TransactionId & (GIP_UPDATEHZ_RECALC_FREQ * 2 - 2)))
2550 {
2551 if (pGip->u64NanoTSLastUpdateHz)
2552 {
2553#ifdef RT_ARCH_AMD64 /** @todo fix 64-bit div here to work on x86 linux. */
2554 uint64_t u64Delta = u64NanoTS - pGip->u64NanoTSLastUpdateHz;
2555 uint32_t u32UpdateHz = (uint32_t)((RT_NS_1SEC_64 * GIP_UPDATEHZ_RECALC_FREQ) / u64Delta);
2556 if (u32UpdateHz <= 2000 && u32UpdateHz >= 30)
2557 {
2558 /** @todo r=ramshankar: Changing u32UpdateHz might screw up TSC frequency
2559 * calculation on non-invariant hosts if it changes the history decision
2560 * taken in supdrvGipDoUpdateCpu(). */
2561 uint64_t u64Interval = u64Delta / GIP_UPDATEHZ_RECALC_FREQ;
2562 ASMAtomicWriteU32(&pGip->u32UpdateHz, u32UpdateHz);
2563 ASMAtomicWriteU32(&pGip->u32UpdateIntervalNS, (uint32_t)u64Interval);
2564 }
2565#endif
2566 }
2567 ASMAtomicWriteU64(&pGip->u64NanoTSLastUpdateHz, u64NanoTS | 1);
2568 }
2569
2570 /*
2571 * Update the data.
2572 */
2573 supdrvGipDoUpdateCpu(pDevExt, pGipCpu, u64NanoTS, u64TSC, iTick);
2574
2575 /*
2576 * Complete transaction.
2577 */
2578 ASMAtomicIncU32(&pGipCpu->u32TransactionId);
2579}
2580
2581
2582/**
2583 * Updates the per cpu GIP data for the calling cpu.
2584 *
2585 * @param pDevExt The device extension.
2586 * @param u64NanoTS The current nanosecond timestamp.
2587 * @param u64TSC The current TSC timesaver.
2588 * @param idCpu The CPU ID.
2589 * @param idApic The APIC id for the CPU index.
2590 * @param iTick The current timer tick.
2591 *
2592 * @remarks Can be called with interrupts disabled!
2593 */
2594static void supdrvGipUpdatePerCpu(PSUPDRVDEVEXT pDevExt, uint64_t u64NanoTS, uint64_t u64TSC,
2595 RTCPUID idCpu, uint8_t idApic, uint64_t iTick)
2596{
2597 uint32_t iCpu;
2598 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
2599
2600 /*
2601 * Avoid a potential race when a CPU online notification doesn't fire on
2602 * the onlined CPU but the tick creeps in before the event notification is
2603 * run.
2604 */
2605 if (RT_LIKELY(iTick != 1))
2606 { /* likely*/ }
2607 else
2608 {
2609 iCpu = supdrvGipFindOrAllocCpuIndexForCpuId(pGip, idCpu);
2610 if (pGip->aCPUs[iCpu].enmState == SUPGIPCPUSTATE_OFFLINE)
2611 supdrvGipMpEventOnlineOrInitOnCpu(pDevExt, idCpu);
2612 }
2613
2614 iCpu = pGip->aiCpuFromApicId[idApic];
2615 if (RT_LIKELY(iCpu < pGip->cCpus))
2616 {
2617 PSUPGIPCPU pGipCpu = &pGip->aCPUs[iCpu];
2618 if (pGipCpu->idCpu == idCpu)
2619 {
2620 /*
2621 * Start update transaction.
2622 */
2623 if (!(ASMAtomicIncU32(&pGipCpu->u32TransactionId) & 1))
2624 {
2625 AssertMsgFailed(("Invalid transaction id, %#x, not odd!\n", pGipCpu->u32TransactionId));
2626 ASMAtomicIncU32(&pGipCpu->u32TransactionId);
2627 pGipCpu->cErrors++;
2628 return;
2629 }
2630
2631 /*
2632 * Update the data.
2633 */
2634 supdrvGipDoUpdateCpu(pDevExt, pGipCpu, u64NanoTS, u64TSC, iTick);
2635
2636 /*
2637 * Complete transaction.
2638 */
2639 ASMAtomicIncU32(&pGipCpu->u32TransactionId);
2640 }
2641 }
2642}
2643
2644
2645/**
2646 * Timer callback function for the sync and invariant GIP modes.
2647 *
2648 * @param pTimer The timer.
2649 * @param pvUser Opaque pointer to the device extension.
2650 * @param iTick The timer tick.
2651 */
2652static DECLCALLBACK(void) supdrvGipSyncAndInvariantTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
2653{
2654 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
2655 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
2656 RTCCUINTREG fEFlags = ASMIntDisableFlags(); /* No interruptions please (real problem on S10). */
2657 uint64_t u64TSC = ASMReadTSC();
2658 uint64_t u64NanoTS = RTTimeSystemNanoTS();
2659 RT_NOREF1(pTimer);
2660
2661 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_PRACTICALLY_ZERO)
2662 {
2663 /*
2664 * The calculations in supdrvGipUpdate() is somewhat timing sensitive,
2665 * missing timer ticks is not an option for GIP because the GIP users
2666 * will end up incrementing the time in 1ns per time getter call until
2667 * there is a complete timer update. So, if the delta has yet to be
2668 * calculated, we just pretend it is zero for now (the GIP users
2669 * probably won't have it for a wee while either and will do the same).
2670 *
2671 * We could maybe on some platforms try cross calling a CPU with a
2672 * working delta here, but it's not worth the hassle since the
2673 * likelihood of this happening is really low. On Windows, Linux, and
2674 * Solaris timers fire on the CPU they were registered/started on.
2675 * Darwin timers doesn't necessarily (they are high priority threads).
2676 */
2677 uint32_t iCpuSet = RTMpCpuIdToSetIndex(RTMpCpuId());
2678 uint16_t iGipCpu = RT_LIKELY(iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx))
2679 ? pGip->aiCpuFromCpuSetIdx[iCpuSet] : UINT16_MAX;
2680 Assert(!ASMIntAreEnabled());
2681 if (RT_LIKELY(iGipCpu < pGip->cCpus))
2682 {
2683 int64_t iTscDelta = pGip->aCPUs[iGipCpu].i64TSCDelta;
2684 if (iTscDelta != INT64_MAX)
2685 u64TSC -= iTscDelta;
2686 }
2687 }
2688
2689 supdrvGipUpdate(pDevExt, u64NanoTS, u64TSC, NIL_RTCPUID, iTick);
2690
2691 ASMSetFlags(fEFlags);
2692}
2693
2694
2695/**
2696 * Timer callback function for async GIP mode.
2697 * @param pTimer The timer.
2698 * @param pvUser Opaque pointer to the device extension.
2699 * @param iTick The timer tick.
2700 */
2701static DECLCALLBACK(void) supdrvGipAsyncTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
2702{
2703 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
2704 RTCCUINTREG fEFlags = ASMIntDisableFlags(); /* No interruptions please (real problem on S10). */
2705 RTCPUID idCpu = RTMpCpuId();
2706 uint64_t u64TSC = ASMReadTSC();
2707 uint64_t NanoTS = RTTimeSystemNanoTS();
2708 RT_NOREF1(pTimer);
2709
2710 /** @todo reset the transaction number and whatnot when iTick == 1. */
2711 if (pDevExt->idGipMaster == idCpu)
2712 supdrvGipUpdate(pDevExt, NanoTS, u64TSC, idCpu, iTick);
2713 else
2714 supdrvGipUpdatePerCpu(pDevExt, NanoTS, u64TSC, idCpu, supdrvGipGetApicId(pDevExt->pGip), iTick);
2715
2716 ASMSetFlags(fEFlags);
2717}
2718
2719
2720
2721
2722/*
2723 *
2724 *
2725 * TSC Delta Measurements And Related Code
2726 * TSC Delta Measurements And Related Code
2727 * TSC Delta Measurements And Related Code
2728 *
2729 *
2730 */
2731
2732
/*
 * TSC delta measurement algorithm selection.
 *
 * Exactly one of GIP_TSC_DELTA_METHOD_1 and GIP_TSC_DELTA_METHOD_2 gets
 * defined; method #2 is the one currently selected.
 */
#if 1
# define GIP_TSC_DELTA_METHOD_2
#else
# define GIP_TSC_DELTA_METHOD_1
#endif

/** Cache line size estimate (in bytes) used for padding variables so they
 * stay clear of other cache lines.  Better too large than too small!
 * @remarks Current AMD64 and x86 CPUs seems to use 64 bytes.  There are claims
 *          that NetBurst had 128 byte cache lines while the 486 thru Pentium
 *          III had 32 bytes cache lines. */
#define GIP_TSC_DELTA_CACHE_LINE_SIZE           128
2748
2749
/**
 * One result table entry for TSC delta measurement algorithm \#2.
 *
 * Pairs a TSC reading with two sequence numbers.
 */
typedef struct SUPDRVTSCDELTAMETHOD2ENTRY
{
    /** Sequence number - presumably that of the CPU recording the entry
     *  (TODO confirm against the measurement code). */
    uint32_t    iSeqMine;
    /** Sequence number - presumably the one observed for the other CPU
     *  (TODO confirm against the measurement code). */
    uint32_t    iSeqOther;
    /** The TSC value. */
    uint64_t    uTsc;
} SUPDRVTSCDELTAMETHOD2ENTRY;
2759
2760/**
2761 * TSC delta measurement algorithm \#2 Data.
2762 */
2763typedef struct SUPDRVTSCDELTAMETHOD2
2764{
2765 /** Padding to make sure the iCurSeqNo is in its own cache line. */
2766 uint64_t au64CacheLinePaddingBefore[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t)];
2767 /** The current sequence number of this worker. */
2768 uint32_t volatile iCurSeqNo;
2769 /** Padding to make sure the iCurSeqNo is in its own cache line. */
2770 uint32_t au64CacheLinePaddingAfter[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint32_t) - 1];
2771 /** Result table. */
2772 SUPDRVTSCDELTAMETHOD2ENTRY aResults[64];
2773} SUPDRVTSCDELTAMETHOD2;
2774/** Pointer to the data for TSC delta measurement algorithm \#2 .*/
2775typedef SUPDRVTSCDELTAMETHOD2 *PSUPDRVTSCDELTAMETHOD2;
2776
2777
2778/**
2779 * The TSC delta synchronization struct, version 2.
2780 *
2781 * The synchronization variable is completely isolated in its own cache line
2782 * (provided our max cache line size estimate is correct).
2783 */
2784typedef struct SUPTSCDELTASYNC2
2785{
2786 /** Padding to make sure the uVar1 is in its own cache line. */
2787 uint64_t au64CacheLinePaddingBefore[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t)];
2788
2789 /** The synchronization variable, holds values GIP_TSC_DELTA_SYNC_*. */
2790 volatile uint32_t uSyncVar;
2791 /** Sequence synchronizing variable used for post 'GO' synchronization. */
2792 volatile uint32_t uSyncSeq;
2793
2794 /** Padding to make sure the uVar1 is in its own cache line. */
2795 uint64_t au64CacheLinePaddingAfter[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t) - 2];
2796
2797 /** Start RDTSC value. Put here mainly to save stack space. */
2798 uint64_t uTscStart;
2799 /** Copy of SUPDRVGIPTSCDELTARGS::cMaxTscTicks. */
2800 uint64_t cMaxTscTicks;
2801} SUPTSCDELTASYNC2;
2802AssertCompileSize(SUPTSCDELTASYNC2, GIP_TSC_DELTA_CACHE_LINE_SIZE * 2 + sizeof(uint64_t));
2803typedef SUPTSCDELTASYNC2 *PSUPTSCDELTASYNC2;
2804
/** @name GIP_TSC_DELTA_SYNC2_XXX - Synchronization variable values.
 * @{ */
/** Prestart wait. */
#define GIP_TSC_DELTA_SYNC2_PRESTART_WAIT    UINT32_C(0x0ffe)
/** Prestart aborted. */
#define GIP_TSC_DELTA_SYNC2_PRESTART_ABORT   UINT32_C(0x0fff)
/** Ready (on your mark). */
#define GIP_TSC_DELTA_SYNC2_READY            UINT32_C(0x1000)
/** Steady (get set). */
#define GIP_TSC_DELTA_SYNC2_STEADY           UINT32_C(0x1001)
/** Go! */
#define GIP_TSC_DELTA_SYNC2_GO               UINT32_C(0x1002)
/** Used by the verification test. */
#define GIP_TSC_DELTA_SYNC2_GO_GO            UINT32_C(0x1003)

/** We reached the time limit. */
#define GIP_TSC_DELTA_SYNC2_TIMEOUT          UINT32_C(0x1ffe)
/** The other party won't touch the sync struct ever again. */
#define GIP_TSC_DELTA_SYNC2_FINAL            UINT32_C(0x1fff)
/** @} */
2822
2823
2824/**
2825 * Argument package/state passed by supdrvTscMeasureDeltaOne() to the RTMpOn
2826 * callback worker.
 *
 * The structure is laid out in three parts separated by cache-line sized
 * padding arrays: the shared input, the master's variables and the worker's
 * variables.
 *
2827 * @todo add
2828 */
2829typedef struct SUPDRVGIPTSCDELTARGS
2830{
2831 /** The device extension. */
2832 PSUPDRVDEVEXT pDevExt;
2833 /** Pointer to the GIP CPU array entry for the worker. */
2834 PSUPGIPCPU pWorker;
2835 /** Pointer to the GIP CPU array entry for the master. */
2836 PSUPGIPCPU pMaster;
2837 /** The maximum number of ticks to spend in supdrvTscMeasureDeltaCallback.
2838 * (This is what we need a rough TSC frequency for.) */
2839 uint64_t cMaxTscTicks;
2840 /** Used to abort synchronization setup. */
2841 bool volatile fAbortSetup;
2842
2843 /** Padding to make sure the master variables live in their own cache lines. */
2844 uint64_t au64CacheLinePaddingBefore[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t)];
2845
2846 /** @name Master
2847 * @{ */
2848 /** The time the master spent in the MP worker. */
2849 uint64_t cElapsedMasterTscTicks;
2850 /** The iTry value when stopped at. */
2851 uint32_t iTry;
2852 /** Set if the run timed out. */
2853 bool volatile fTimedOut;
2854 /** Pointer to the master's synchronization struct (on stack). */
2855 PSUPTSCDELTASYNC2 volatile pSyncMaster;
2856 /** Master data union. */
2857 union
2858 {
2859 /** Data (master) for delta verification. */
2860 struct
2861 {
2862 /** Verification test TSC values for the master. */
2863 uint64_t volatile auTscs[32];
2864 } Verify;
2865 /** Data (master) for measurement method \#2. */
2866 struct
2867 {
2868 /** Data and sequence number. */
2869 SUPDRVTSCDELTAMETHOD2 Data;
2870 /** The lag setting for the next run. */
2871 bool fLag;
2872 /** Number of hits. */
2873 uint32_t cHits;
2874 } M2;
2875 } uMaster;
2876 /** The verifier verdict, VINF_SUCCESS if ok, VERR_OUT_OF_RANGE if not,
2877 * VERR_TRY_AGAIN on timeout. */
2878 int32_t rcVerify;
2879#ifdef TSCDELTA_VERIFY_WITH_STATS
2880 /** The maximum difference between TSC read during delta verification. */
2881 int64_t cMaxVerifyTscTicks;
2882 /** The minimum difference between two TSC reads during verification. */
2883 int64_t cMinVerifyTscTicks;
2884 /** The bad TSC diff, worker relative to master (= worker - master).
2885 * Negative value means the worker is behind the master. */
2886 int64_t iVerifyBadTscDiff;
2887#endif
2888 /** @} */
2889
2890 /** Padding to make sure the worker variables live in their own cache line. */
2891 uint64_t au64CacheLinePaddingBetween[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t)];
2892
2893 /** @name Proletarian
2894 * @{ */
2895 /** Pointer to the worker's synchronization struct (on stack). */
2896 PSUPTSCDELTASYNC2 volatile pSyncWorker;
2897 /** The time the worker spent in the MP worker. */
2898 uint64_t cElapsedWorkerTscTicks;
2899 /** Worker data union. */
2900 union
2901 {
2902 /** Data (worker) for delta verification. */
2903 struct
2904 {
2905 /** Verification test TSC values for the worker. */
2906 uint64_t volatile auTscs[32];
2907 } Verify;
2908 /** Data (worker) for measurement method \#2. */
2909 struct
2910 {
2911 /** Data and sequence number. */
2912 SUPDRVTSCDELTAMETHOD2 Data;
2913 /** The lag setting for the next run (set by master). */
2914 bool fLag;
2915 } M2;
2916 } uWorker;
2917 /** @} */
2918
2919 /** Padding to make sure the above is in its own cache line. */
2920 uint64_t au64CacheLinePaddingAfter[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t)];
2921} SUPDRVGIPTSCDELTARGS;
/** Pointer to a TSC delta measurement argument package. */
2922typedef SUPDRVGIPTSCDELTARGS *PSUPDRVGIPTSCDELTARGS;
2923
2924
2925/** @name Macros that implement the basic synchronization steps common to
2926 * the algorithms.
2927 *
2928 * Must be used from a loop as the timeouts are implemented via 'break'
2929 * statements at the moment.
2930 *
2931 * @{
2932 */
/* Busy-loop watchdog helpers.  In DEBUG_bird builds they count loop iterations
   and hit a breakpoint each time the low 25 bits of the counter wrap to zero;
   in all other builds they expand to no-op expressions. */
2933#if defined(DEBUG_bird) /* || defined(VBOX_STRICT) */
2934# define TSCDELTA_DBG_VARS() uint32_t iDbgCounter
2935# define TSCDELTA_DBG_START_LOOP() do { iDbgCounter = 0; } while (0)
2936# define TSCDELTA_DBG_CHECK_LOOP() \
2937 do { iDbgCounter++; if ((iDbgCounter & UINT32_C(0x01ffffff)) == 0) RT_BREAKPOINT(); } while (0)
2938#else
2939# define TSCDELTA_DBG_VARS() ((void)0)
2940# define TSCDELTA_DBG_START_LOOP() ((void)0)
2941# define TSCDELTA_DBG_CHECK_LOOP() ((void)0)
2942#endif
/* Debug message macros, all compiled out.  Change the '#if 0' to enable one;
   the argument is a parenthesized SUPR0Printf argument list. */
2943#if 0
2944# define TSCDELTA_DBG_SYNC_MSG(a_Args) SUPR0Printf a_Args
2945#else
2946# define TSCDELTA_DBG_SYNC_MSG(a_Args) ((void)0)
2947#endif
2948#if 0
2949# define TSCDELTA_DBG_SYNC_MSG2(a_Args) SUPR0Printf a_Args
2950#else
2951# define TSCDELTA_DBG_SYNC_MSG2(a_Args) ((void)0)
2952#endif
2953#if 0
2954# define TSCDELTA_DBG_SYNC_MSG9(a_Args) SUPR0Printf a_Args
2955#else
2956# define TSCDELTA_DBG_SYNC_MSG9(a_Args) ((void)0)
2957#endif
2958
2959
/**
 * Performs the 'before' half of a synchronization round: the ready -> steady
 * -> go handshake, followed by an attempt at entering lockstep execution with
 * the partner.
 *
 * @returns true if the handshake completed, in which case interrupts are
 *          disabled and the flags to restore are stored in @a *pfEFlags.
 *          Note that true is returned whether or not lockstep was achieved.
 *          false if the handshake was aborted or timed out; every such path
 *          restores the interrupt flag before returning.
 * @param   pMySync     The caller's own synchronization structure.
 * @param   pOtherSync  The partner's synchronization structure.
 * @param   fIsMaster   Set if called by the master, clear if worker.
 * @param   pfEFlags    Where to return the saved flags value.  Preset to
 *                      X86_EFL_IF | X86_EFL_1 on entry to this function.
 * @param   pArgs       The argument/state data; only fTimedOut is written
 *                      here, and only on timeout.
 */
2960static bool supdrvTscDeltaSync2_Before(PSUPTSCDELTASYNC2 pMySync, PSUPTSCDELTASYNC2 pOtherSync,
2961 bool fIsMaster, PRTCCUINTREG pfEFlags, PSUPDRVGIPTSCDELTARGS pArgs)
2962{
 /* The two parties use disjoint sequence number ranges (master 0..15,
    worker 256..271) in the lockstep loop at the end. */
2963 uint32_t iMySeq = fIsMaster ? 0 : 256;
2964 uint32_t const iMaxSeq = iMySeq + 16; /* For the last loop, darn linux/freebsd C-ishness. */
2965 uint32_t u32Tmp;
2966 uint32_t iSync2Loops = 0;
2967 RTCCUINTREG fEFlags;
2968 TSCDELTA_DBG_VARS();
2969
2970 *pfEFlags = X86_EFL_IF | X86_EFL_1; /* should shut up most nagging compilers. */
2971
2972 /*
2973 * The master tells the worker to get on its mark.
2974 */
2975 if (fIsMaster)
2976 {
2977 if (RT_LIKELY(ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_STEADY, GIP_TSC_DELTA_SYNC2_READY)))
2978 { /* likely*/ }
2979 else
2980 {
2981 TSCDELTA_DBG_SYNC_MSG(("sync/before/%s: #1 uSyncVar=%#x\n", fIsMaster ? "master" : "worker", pOtherSync->uSyncVar));
2982 return false;
2983 }
2984 }
2985
2986 /*
2987 * Wait for the on your mark signal (ack in the master case). We process timeouts here.
2988 */
2989 ASMAtomicWriteU32(&(pMySync)->uSyncSeq, 0);
2990 for (;;)
2991 {
 /* Interrupts are disabled before sampling the state so that we leave
    the loop with them off; they are re-enabled again below whenever we
    have to keep waiting. */
2992 fEFlags = ASMIntDisableFlags();
2993 u32Tmp = ASMAtomicReadU32(&pMySync->uSyncVar);
2994 if (u32Tmp == GIP_TSC_DELTA_SYNC2_STEADY)
2995 break;
2996 ASMSetFlags(fEFlags);
2997 ASMNopPause();
2998
2999 /* Abort? */
3000 if (u32Tmp != GIP_TSC_DELTA_SYNC2_READY)
3001 {
3002 TSCDELTA_DBG_SYNC_MSG(("sync/before/%s: #2 u32Tmp=%#x\n", fIsMaster ? "master" : "worker", u32Tmp));
3003 return false;
3004 }
3005
3006 /* Check for timeouts every so often (not every loop in case RDTSC is
3007 trapping or something). Must check the first time around. */
3008#if 0 /* For debugging the timeout paths. */
3009 static uint32_t volatile xxx;
3010#endif
3011 if ( ( (iSync2Loops & 0x3ff) == 0
3012 && ASMReadTSC() - pMySync->uTscStart > pMySync->cMaxTscTicks)
3013#if 0 /* This is crazy, I know, but enable this code and the results are markedly better when enabled on the 1.4GHz AMD (debug). */
3014 || (!fIsMaster && (++xxx & 0xf) == 0)
3015#endif
3016 )
3017 {
3018 /* Try switch our own state into timeout mode so the master cannot tell us to 'GO',
3019 ignore the timeout if we've got the go ahead already (simpler). */
3020 if (ASMAtomicCmpXchgU32(&pMySync->uSyncVar, GIP_TSC_DELTA_SYNC2_TIMEOUT, GIP_TSC_DELTA_SYNC2_READY))
3021 {
3022 TSCDELTA_DBG_SYNC_MSG(("sync/before/%s: timeout\n", fIsMaster ? "master" : "worker"));
3023 ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_TIMEOUT, GIP_TSC_DELTA_SYNC2_STEADY);
3024 ASMAtomicWriteBool(&pArgs->fTimedOut, true);
3025 return false;
3026 }
3027 }
3028 iSync2Loops++;
3029 }
3030
3031 /*
3032 * Interrupts are now disabled and will remain disabled until we do
3033 * TSCDELTA_MASTER_SYNC_AFTER / TSCDELTA_OTHER_SYNC_AFTER.
3034 */
3035 *pfEFlags = fEFlags;
3036
3037 /*
3038 * The worker tells the master that it is on its mark and that the master
3039 * need to get into position as well.
3040 */
3041 if (!fIsMaster)
3042 {
3043 if (RT_LIKELY(ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_STEADY, GIP_TSC_DELTA_SYNC2_READY)))
3044 { /* likely */ }
3045 else
3046 {
3047 ASMSetFlags(fEFlags);
3048 TSCDELTA_DBG_SYNC_MSG(("sync/before/%s: #3 uSyncVar=%#x\n", fIsMaster ? "master" : "worker", pOtherSync->uSyncVar));
3049 return false;
3050 }
3051 }
3052
3053 /*
3054 * The master sends the 'go' to the worker and wait for ACK.
3055 */
3056 if (fIsMaster)
3057 {
3058 if (RT_LIKELY(ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO, GIP_TSC_DELTA_SYNC2_STEADY)))
3059 { /* likely */ }
3060 else
3061 {
3062 ASMSetFlags(fEFlags);
3063 TSCDELTA_DBG_SYNC_MSG(("sync/before/%s: #4 uSyncVar=%#x\n", fIsMaster ? "master" : "worker", pOtherSync->uSyncVar));
3064 return false;
3065 }
3066 }
3067
3068 /*
3069 * Wait for the 'go' signal (ack in the master case).
 * No timeout check in this loop; any state other than STEADY or GO
 * aborts the round.
3070 */
3071 TSCDELTA_DBG_START_LOOP();
3072 for (;;)
3073 {
3074 u32Tmp = ASMAtomicReadU32(&pMySync->uSyncVar);
3075 if (u32Tmp == GIP_TSC_DELTA_SYNC2_GO)
3076 break;
3077 if (RT_LIKELY(u32Tmp == GIP_TSC_DELTA_SYNC2_STEADY))
3078 { /* likely */ }
3079 else
3080 {
3081 ASMSetFlags(fEFlags);
3082 TSCDELTA_DBG_SYNC_MSG(("sync/before/%s: #5 u32Tmp=%#x\n", fIsMaster ? "master" : "worker", u32Tmp));
3083 return false;
3084 }
3085
3086 TSCDELTA_DBG_CHECK_LOOP();
3087 ASMNopPause();
3088 }
3089
3090 /*
3091 * The worker acks the 'go' (shouldn't fail).
3092 */
3093 if (!fIsMaster)
3094 {
3095 if (RT_LIKELY(ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO, GIP_TSC_DELTA_SYNC2_STEADY)))
3096 { /* likely */ }
3097 else
3098 {
3099 ASMSetFlags(fEFlags);
3100 TSCDELTA_DBG_SYNC_MSG(("sync/before/%s: #6 uSyncVar=%#x\n", fIsMaster ? "master" : "worker", pOtherSync->uSyncVar));
3101 return false;
3102 }
3103 }
3104
3105 /*
3106 * Try enter mostly lockstep execution with it.
 *
 * Each party publishes its sequence number in its own uSyncSeq, swaps it
 * into the partner's uSyncSeq, and then re-reads its own.  Every exit
 * from this loop returns true, so failing to reach lockstep within the
 * 16 attempts does not fail the round.
3107 */
3108 for (;;)
3109 {
3110 uint32_t iOtherSeq1, iOtherSeq2;
3111 ASMCompilerBarrier();
3112 ASMSerializeInstruction();
3113
3114 ASMAtomicWriteU32(&pMySync->uSyncSeq, iMySeq);
3115 ASMNopPause();
3116 iOtherSeq1 = ASMAtomicXchgU32(&pOtherSync->uSyncSeq, iMySeq);
3117 ASMNopPause();
3118 iOtherSeq2 = ASMAtomicReadU32(&pMySync->uSyncSeq);
3119
3120 ASMCompilerBarrier();
3121 if (iOtherSeq1 == iOtherSeq2)
3122 return true;
3123
3124 /* Did the other guy give up? Should we give up? */
3125 if ( iOtherSeq1 == UINT32_MAX
3126 || iOtherSeq2 == UINT32_MAX)
3127 return true;
3128 if (++iMySeq >= iMaxSeq)
3129 {
 /* UINT32_MAX is the 'gave up' marker checked for above. */
3130 ASMAtomicWriteU32(&pMySync->uSyncSeq, UINT32_MAX);
3131 return true;
3132 }
3133 ASMNopPause();
3134 }
3135}
3136
/** Master side wrapper around supdrvTscDeltaSync2_Before() that executes a
 * 'break' statement when the call returns false, so it must be used directly
 * inside the loop that is to be left.  The trailing 'else do {} while (0)'
 * makes the macro swallow the semicolon following its invocation. */
3137#define TSCDELTA_MASTER_SYNC_BEFORE(a_pMySync, a_pOtherSync, a_pfEFlags, a_pArgs) \
3138 if (RT_LIKELY(supdrvTscDeltaSync2_Before(a_pMySync, a_pOtherSync, true /*fIsMaster*/, a_pfEFlags, a_pArgs))) \
3139 { /*likely*/ } \
3140 else if (true) \
3141 { \
3142 TSCDELTA_DBG_SYNC_MSG9(("sync/before/master: #89\n")); \
3143 break; \
3144 } else do {} while (0)
/** Worker side counterpart of TSCDELTA_MASTER_SYNC_BEFORE; identical except
 * that it passes fIsMaster=false and uses a different debug message. */
3145#define TSCDELTA_OTHER_SYNC_BEFORE(a_pMySync, a_pOtherSync, a_pfEFlags, a_pArgs) \
3146 if (RT_LIKELY(supdrvTscDeltaSync2_Before(a_pMySync, a_pOtherSync, false /*fIsMaster*/, a_pfEFlags, a_pArgs))) \
3147 { /*likely*/ } \
3148 else if (true) \
3149 { \
3150 TSCDELTA_DBG_SYNC_MSG9(("sync/before/other: #89\n")); \
3151 break; \
3152 } else do {} while (0)
3153
3154
3155static bool supdrvTscDeltaSync2_After(PSUPTSCDELTASYNC2 pMySync, PSUPTSCDELTASYNC2 pOtherSync,
3156 bool fIsMaster, RTCCUINTREG fEFlags)
3157{
3158 TSCDELTA_DBG_VARS();
3159 RT_NOREF1(pOtherSync);
3160
3161 /*
3162 * Wait for the 'ready' signal. In the master's case, this means the
3163 * worker has completed its data collection, while in the worker's case it
3164 * means the master is done processing the data and it's time for the next
3165 * loop iteration (or whatever).
3166 */
3167 ASMSetFlags(fEFlags);
3168 TSCDELTA_DBG_START_LOOP();
3169 for (;;)
3170 {
3171 uint32_t u32Tmp = ASMAtomicReadU32(&pMySync->uSyncVar);
3172 if ( u32Tmp == GIP_TSC_DELTA_SYNC2_READY
3173 || (u32Tmp == GIP_TSC_DELTA_SYNC2_STEADY && !fIsMaster) /* kicked twice => race */ )
3174 return true;
3175 ASMNopPause();
3176 if (RT_LIKELY(u32Tmp == GIP_TSC_DELTA_SYNC2_GO))
3177 { /* likely */}
3178 else
3179 {
3180 TSCDELTA_DBG_SYNC_MSG(("sync/after/other: #1 u32Tmp=%#x\n", u32Tmp));
3181 return false; /* shouldn't ever happen! */
3182 }
3183 TSCDELTA_DBG_CHECK_LOOP();
3184 ASMNopPause();
3185 }
3186}
3187
/** Master side wrapper around supdrvTscDeltaSync2_After() that executes a
 * 'break' statement when the call returns false; must be used directly inside
 * the loop that is to be left. */
3188#define TSCDELTA_MASTER_SYNC_AFTER(a_pMySync, a_pOtherSync, a_fEFlags) \
3189 if (RT_LIKELY(supdrvTscDeltaSync2_After(a_pMySync, a_pOtherSync, true /*fIsMaster*/, a_fEFlags))) \
3190 { /* likely */ } \
3191 else if (true) \
3192 { \
3193 TSCDELTA_DBG_SYNC_MSG9(("sync/after/master: #97\n")); \
3194 break; \
3195 } else do {} while (0)
3196
/** Used by the master to flip the worker's sync variable from 'go' back to
 * 'ready'; executes a 'break' statement if the worker's variable is not in
 * the 'go' state.  The a_pMySync argument is not used by the expansion. */
3197#define TSCDELTA_MASTER_KICK_OTHER_OUT_OF_AFTER(a_pMySync, a_pOtherSync) \
3198 /* \
3199 * Tell the worker that we're done processing the data and ready for the next round. \
3200 */ \
3201 if (RT_LIKELY(ASMAtomicCmpXchgU32(&(a_pOtherSync)->uSyncVar, GIP_TSC_DELTA_SYNC2_READY, GIP_TSC_DELTA_SYNC2_GO))) \
3202 { /* likely */ } \
3203 else if (true)\
3204 { \
3205 TSCDELTA_DBG_SYNC_MSG(("sync/after/master: #99 uSyncVar=%#x\n", (a_pOtherSync)->uSyncVar)); \
3206 break; \
3207 } else do {} while (0)
3208
/** Worker side 'after' step: flips the master's sync variable from 'go' to
 * 'ready' and then waits in supdrvTscDeltaSync2_After().  Executes a 'break'
 * statement if either step fails; when the first step fails the flags are
 * restored here because supdrvTscDeltaSync2_After() is never reached. */
3209#define TSCDELTA_OTHER_SYNC_AFTER(a_pMySync, a_pOtherSync, a_fEFlags) \
3210 if (true) { \
3211 /* \
3212 * Tell the master that we're done collecting data and wait for the next round to start. \
3213 */ \
3214 if (RT_LIKELY(ASMAtomicCmpXchgU32(&(a_pOtherSync)->uSyncVar, GIP_TSC_DELTA_SYNC2_READY, GIP_TSC_DELTA_SYNC2_GO))) \
3215 { /* likely */ } \
3216 else \
3217 { \
3218 ASMSetFlags(a_fEFlags); \
3219 TSCDELTA_DBG_SYNC_MSG(("sync/after/other: #0 uSyncVar=%#x\n", (a_pOtherSync)->uSyncVar)); \
3220 break; \
3221 } \
3222 if (RT_LIKELY(supdrvTscDeltaSync2_After(a_pMySync, a_pOtherSync, false /*fIsMaster*/, a_fEFlags))) \
3223 { /* likely */ } \
3224 else \
3225 { \
3226 TSCDELTA_DBG_SYNC_MSG9(("sync/after/other: #98\n")); \
3227 break; \
3228 } \
3229 } else do {} while (0)
3230/** @} */
3231
3232
3233#ifdef GIP_TSC_DELTA_METHOD_1
3234/**
3235 * TSC delta measurement algorithm \#1 (GIP_TSC_DELTA_METHOD_1).
3236 *
3237 *
3238 * We ignore the first few runs of the loop in order to prime the
3239 * cache. Also, we need to be careful about using 'pause' instruction
3240 * in critical busy-wait loops in this code - it can cause undesired
3241 * behaviour with hyperthreading.
3242 *
3243 * We try to minimize the measurement error by computing the minimum
3244 * read time of the compare statement in the worker by taking TSC
3245 * measurements across it.
3246 *
3247 * It must be noted that the computed minimum read time is mostly to
3248 * eliminate huge deltas when the worker is too early and doesn't by
3249 * itself help produce more accurate deltas. We allow two times the
3250 * computed minimum as an arbitrary acceptable threshold. Therefore,
3251 * it is still possible to get negative deltas where there are none
3252 * when the worker is earlier. As long as these occasional negative
3253 * deltas are lower than the time it takes to exit guest-context and
3254 * the OS to reschedule EMT on a different CPU, we won't expose a TSC
3255 * that jumped backwards. It is due to the existence of the negative
3256 * deltas that we don't recompute the delta with the master and
3257 * worker interchanged to eliminate the remaining measurement error.
3258 *
 * The result is stored in the worker's i64TSCDelta GIP CPU entry member by
 * the master.  The loop is left early (via 'break') when one of the
 * synchronization macros fails.
 *
3259 *
3260 * @param pArgs The argument/state data.
3261 * @param pMySync My synchronization structure.
3262 * @param pOtherSync My partner's synchronization structure.
3263 * @param fIsMaster Set if master, clear if worker.
3264 * @param iTry The attempt number (unused).
3265 */
3266static void supdrvTscDeltaMethod1Loop(PSUPDRVGIPTSCDELTARGS pArgs, PSUPTSCDELTASYNC2 pMySync, PSUPTSCDELTASYNC2 pOtherSync,
3267 bool fIsMaster, uint32_t iTry)
3268{
3269 PSUPGIPCPU pGipCpuWorker = pArgs->pWorker;
3270 PSUPGIPCPU pGipCpuMaster = pArgs->pMaster;
3271 uint64_t uMinCmpReadTime = UINT64_MAX;
3272 unsigned iLoop;
3273 NOREF(iTry);
3274
3275 for (iLoop = 0; iLoop < GIP_TSC_DELTA_LOOPS; iLoop++)
3276 {
3277 RTCCUINTREG fEFlags;
3278 if (fIsMaster)
3279 {
3280 /*
3281 * The master.
3282 */
3283 AssertMsg(pGipCpuMaster->u64TSCSample == GIP_TSC_DELTA_RSVD,
3284 ("%#llx idMaster=%#x idWorker=%#x (idGipMaster=%#x)\n",
3285 pGipCpuMaster->u64TSCSample, pGipCpuMaster->idCpu, pGipCpuWorker->idCpu, pArgs->pDevExt->idGipMaster));
3286 TSCDELTA_MASTER_SYNC_BEFORE(pMySync, pOtherSync, &fEFlags, pArgs);
3287
 /* Publish our TSC reading.  GIP_TSC_DELTA_RSVD means 'no sample',
    so retry in the event the TSC value happens to equal it. */
3288 do
3289 {
3290 ASMSerializeInstruction();
3291 ASMAtomicWriteU64(&pGipCpuMaster->u64TSCSample, ASMReadTSC());
3292 } while (pGipCpuMaster->u64TSCSample == GIP_TSC_DELTA_RSVD);
3293
3294 TSCDELTA_MASTER_SYNC_AFTER(pMySync, pOtherSync, fEFlags);
3295
3296 /* Process the data. */
3297 if (iLoop > GIP_TSC_DELTA_PRIMER_LOOPS + GIP_TSC_DELTA_READ_TIME_LOOPS)
3298 {
 /* The worker leaves GIP_TSC_DELTA_RSVD in its sample when it
    rejected the reading for this round. */
3299 if (pGipCpuWorker->u64TSCSample != GIP_TSC_DELTA_RSVD)
3300 {
3301 int64_t iDelta = pGipCpuWorker->u64TSCSample
3302 - (pGipCpuMaster->u64TSCSample - pGipCpuMaster->i64TSCDelta);
 /* Keep the candidate closest to GIP_TSC_DELTA_INITIAL_MASTER_VALUE:
    the smallest one at or above it, otherwise the largest one
    below it.  INT64_MAX means no delta has been stored yet. */
3303 if ( iDelta >= GIP_TSC_DELTA_INITIAL_MASTER_VALUE
3304 ? iDelta < pGipCpuWorker->i64TSCDelta
3305 : iDelta > pGipCpuWorker->i64TSCDelta || pGipCpuWorker->i64TSCDelta == INT64_MAX)
3306 pGipCpuWorker->i64TSCDelta = iDelta;
3307 }
3308 }
3309
3310 /* Reset our TSC sample and tell the worker to move on. */
3311 ASMAtomicWriteU64(&pGipCpuMaster->u64TSCSample, GIP_TSC_DELTA_RSVD);
3312 TSCDELTA_MASTER_KICK_OTHER_OUT_OF_AFTER(pMySync, pOtherSync);
3313 }
3314 else
3315 {
3316 /*
3317 * The worker.
3318 */
3319 uint64_t uTscWorker;
3320 uint64_t uTscWorkerFlushed;
3321 uint64_t uCmpReadTime;
3322
3323 ASMAtomicReadU64(&pGipCpuMaster->u64TSCSample); /* Warm the cache line. */
3324 TSCDELTA_OTHER_SYNC_BEFORE(pMySync, pOtherSync, &fEFlags, pArgs);
3325
3326 /*
3327 * Keep reading the TSC until we notice that the master has read his. Reading
3328 * the TSC -after- the master has updated the memory is way too late. We thus
3329 * compensate by trying to measure how long it took for the worker to notice
3330 * the memory flushed from the master.
3331 */
3332 do
3333 {
3334 ASMSerializeInstruction();
3335 uTscWorker = ASMReadTSC();
3336 } while (pGipCpuMaster->u64TSCSample == GIP_TSC_DELTA_RSVD);
3337 ASMSerializeInstruction();
3338 uTscWorkerFlushed = ASMReadTSC();
3339
 /* Three phases: primer rounds are ignored, the following rounds only
    establish the minimum compare/read time, and the remaining rounds
    publish samples for the master to process. */
3340 uCmpReadTime = uTscWorkerFlushed - uTscWorker;
3341 if (iLoop > GIP_TSC_DELTA_PRIMER_LOOPS + GIP_TSC_DELTA_READ_TIME_LOOPS)
3342 {
3343 /* This is totally arbitrary a.k.a I don't like it but I have no better ideas for now. */
3344 if (uCmpReadTime < (uMinCmpReadTime << 1))
3345 {
3346 ASMAtomicWriteU64(&pGipCpuWorker->u64TSCSample, uTscWorker);
3347 if (uCmpReadTime < uMinCmpReadTime)
3348 uMinCmpReadTime = uCmpReadTime;
3349 }
3350 else
3351 ASMAtomicWriteU64(&pGipCpuWorker->u64TSCSample, GIP_TSC_DELTA_RSVD);
3352 }
3353 else if (iLoop > GIP_TSC_DELTA_PRIMER_LOOPS)
3354 {
3355 if (uCmpReadTime < uMinCmpReadTime)
3356 uMinCmpReadTime = uCmpReadTime;
3357 }
3358
3359 TSCDELTA_OTHER_SYNC_AFTER(pMySync, pOtherSync, fEFlags);
3360 }
3361 }
3362
3363 TSCDELTA_DBG_SYNC_MSG9(("sync/method1loop/%s: #92 iLoop=%u MyState=%#x\n", fIsMaster ? "master" : "worker", iLoop,
3364 pMySync->uSyncVar));
3365
3366 /*
3367 * We must reset the worker TSC sample value in case it gets picked as a
3368 * GIP master later on (it's trashed above, naturally).
3369 */
3370 if (!fIsMaster)
3371 ASMAtomicWriteU64(&pGipCpuWorker->u64TSCSample, GIP_TSC_DELTA_RSVD);
3372}
3373#endif /* GIP_TSC_DELTA_METHOD_1 */
3374
3375
3376#ifdef GIP_TSC_DELTA_METHOD_2
3377/*
3378 * TSC delta measurement algorithm \#2 configuration and code - Experimental!!
3379 */
3380
/** Total number of rounds executed by supdrvTscDeltaMethod2Loop(): seven body
 * rounds plus the primer rounds.  (Referring to the primer macro before its
 * definition is fine as macros are expanded at the point of use.) */
3381# define GIP_TSC_DELTA_M2_LOOPS (7 + GIP_TSC_DELTA_M2_PRIMER_LOOPS)
/** Number of initial rounds whose data the master does not process. */
3382# define GIP_TSC_DELTA_M2_PRIMER_LOOPS 0
3383
3384
3385static void supdrvTscDeltaMethod2ProcessDataOnMaster(PSUPDRVGIPTSCDELTARGS pArgs)
3386{
3387 int64_t iMasterTscDelta = pArgs->pMaster->i64TSCDelta;
3388 int64_t iBestDelta = pArgs->pWorker->i64TSCDelta;
3389 uint32_t idxResult;
3390 uint32_t cHits = 0;
3391
3392 /*
3393 * Look for matching entries in the master and worker tables.
3394 */
3395 for (idxResult = 0; idxResult < RT_ELEMENTS(pArgs->uMaster.M2.Data.aResults); idxResult++)
3396 {
3397 uint32_t idxOther = pArgs->uMaster.M2.Data.aResults[idxResult].iSeqOther;
3398 if (idxOther & 1)
3399 {
3400 idxOther >>= 1;
3401 if (idxOther < RT_ELEMENTS(pArgs->uWorker.M2.Data.aResults))
3402 {
3403 if (pArgs->uWorker.M2.Data.aResults[idxOther].iSeqOther == pArgs->uMaster.M2.Data.aResults[idxResult].iSeqMine)
3404 {
3405 int64_t iDelta;
3406 iDelta = pArgs->uWorker.M2.Data.aResults[idxOther].uTsc
3407 - (pArgs->uMaster.M2.Data.aResults[idxResult].uTsc - iMasterTscDelta);
3408 if ( iDelta >= GIP_TSC_DELTA_INITIAL_MASTER_VALUE
3409 ? iDelta < iBestDelta
3410 : iDelta > iBestDelta || iBestDelta == INT64_MAX)
3411 iBestDelta = iDelta;
3412 cHits++;
3413 }
3414 }
3415 }
3416 }
3417
3418 /*
3419 * Save the results.
3420 */
3421 if (cHits > 2)
3422 pArgs->pWorker->i64TSCDelta = iBestDelta;
3423 pArgs->uMaster.M2.cHits += cHits;
3424}
3425
3426
3427/**
3428 * The core function of the 2nd TSC delta measurement algorithm.
3429 *
3430 * The idea here is that we have the two CPUs execute the exact same code
3431 * collecting a largish set of TSC samples. The code has one data dependency on
3432 * the other CPU, the intention of which is to synchronize the execution as
3433 * well as to help cross reference the two sets of TSC samples (the sequence
 * numbers).
3434 *
3435 * The @a fLag parameter is used to modify the execution a tiny bit on one or
3436 * both of the CPUs. When @a fLag differs between the CPUs, it is thought that
3437 * it will help with making the CPUs enter lock step execution occasionally.
3438 *
 * @param pMyData The caller's result table and sequence number; every
 *        entry of aResults is filled in.
 * @param piOtherSeqNo Pointer to the partner's current sequence number.
 * @param fLag Whether to execute a pause instruction after each sample.
3439 */
3440static void supdrvTscDeltaMethod2CollectData(PSUPDRVTSCDELTAMETHOD2 pMyData, uint32_t volatile *piOtherSeqNo, bool fLag)
3441{
3442 SUPDRVTSCDELTAMETHOD2ENTRY *pEntry = &pMyData->aResults[0];
3443 uint32_t cLeft = RT_ELEMENTS(pMyData->aResults);
3444
3445 ASMAtomicWriteU32(&pMyData->iCurSeqNo, 0);
3446 ASMSerializeInstruction();
3447 while (cLeft-- > 0)
3448 {
 /* The sequence number is incremented once before and once after the
    TSC read, so it is odd exactly while a read is in progress and the
    value recorded in iSeqMine is always odd. */
3449 uint64_t uTsc;
3450 uint32_t iSeqMine = ASMAtomicIncU32(&pMyData->iCurSeqNo);
3451 uint32_t iSeqOther = ASMAtomicReadU32(piOtherSeqNo);
3452 ASMCompilerBarrier();
3453 ASMSerializeInstruction(); /* Way better result than with ASMMemoryFenceSSE2() in this position! */
3454 uTsc = ASMReadTSC();
3455 ASMAtomicIncU32(&pMyData->iCurSeqNo);
3456 ASMCompilerBarrier();
3457 ASMSerializeInstruction();
 /* The result is stored only after the sample has been taken. */
3458 pEntry->iSeqMine = iSeqMine;
3459 pEntry->iSeqOther = iSeqOther;
3460 pEntry->uTsc = uTsc;
3461 pEntry++;
3462 ASMSerializeInstruction();
3463 if (fLag)
3464 ASMNopPause();
3465 }
3466}
3467
3468
3469/**
3470 * TSC delta measurement algorithm \#2 (GIP_TSC_DELTA_METHOD_2).
3471 *
3472 * See supdrvTscDeltaMethod2CollectData for algorithm details.
3473 *
3474 * @param pArgs The argument/state data.
3475 * @param pMySync My synchronization structure.
3476 * @param pOtherSync My partner's synchronization structure.
3477 * @param fIsMaster Set if master, clear if worker.
3478 * @param iTry The attempt number.
3479 */
3480static void supdrvTscDeltaMethod2Loop(PSUPDRVGIPTSCDELTARGS pArgs, PSUPTSCDELTASYNC2 pMySync, PSUPTSCDELTASYNC2 pOtherSync,
3481 bool fIsMaster, uint32_t iTry)
3482{
3483 unsigned iLoop;
3484 RT_NOREF1(iTry);
3485
3486 for (iLoop = 0; iLoop < GIP_TSC_DELTA_M2_LOOPS; iLoop++)
3487 {
3488 RTCCUINTREG fEFlags;
3489 if (fIsMaster)
3490 {
3491 /*
3492 * Adjust the loop lag fudge.
3493 */
3494# if GIP_TSC_DELTA_M2_PRIMER_LOOPS > 0
3495 if (iLoop < GIP_TSC_DELTA_M2_PRIMER_LOOPS)
3496 {
3497 /* Lag during the priming to be nice to everyone.. */
3498 pArgs->uMaster.M2.fLag = true;
3499 pArgs->uWorker.M2.fLag = true;
3500 }
3501 else
3502# endif
3503 if (iLoop < (GIP_TSC_DELTA_M2_LOOPS - GIP_TSC_DELTA_M2_PRIMER_LOOPS) / 4)
3504 {
3505 /* 25 % of the body without lagging. */
3506 pArgs->uMaster.M2.fLag = false;
3507 pArgs->uWorker.M2.fLag = false;
3508 }
3509 else if (iLoop < (GIP_TSC_DELTA_M2_LOOPS - GIP_TSC_DELTA_M2_PRIMER_LOOPS) / 4 * 2)
3510 {
3511 /* 25 % of the body with both lagging. */
3512 pArgs->uMaster.M2.fLag = true;
3513 pArgs->uWorker.M2.fLag = true;
3514 }
3515 else
3516 {
3517 /* 50% of the body with alternating lag. */
3518 pArgs->uMaster.M2.fLag = (iLoop & 1) == 0;
3519 pArgs->uWorker.M2.fLag= (iLoop & 1) == 1;
3520 }
3521
3522 /*
3523 * Sync up with the worker and collect data.
3524 */
3525 TSCDELTA_MASTER_SYNC_BEFORE(pMySync, pOtherSync, &fEFlags, pArgs);
3526 supdrvTscDeltaMethod2CollectData(&pArgs->uMaster.M2.Data, &pArgs->uWorker.M2.Data.iCurSeqNo, pArgs->uMaster.M2.fLag);
3527 TSCDELTA_MASTER_SYNC_AFTER(pMySync, pOtherSync, fEFlags);
3528
3529 /*
3530 * Process the data.
3531 */
3532# if GIP_TSC_DELTA_M2_PRIMER_LOOPS > 0
3533 if (iLoop >= GIP_TSC_DELTA_M2_PRIMER_LOOPS)
3534# endif
3535 supdrvTscDeltaMethod2ProcessDataOnMaster(pArgs);
3536
3537 TSCDELTA_MASTER_KICK_OTHER_OUT_OF_AFTER(pMySync, pOtherSync);
3538 }
3539 else
3540 {
3541 /*
3542 * The worker.
3543 */
3544 TSCDELTA_OTHER_SYNC_BEFORE(pMySync, pOtherSync, &fEFlags, pArgs);
3545 supdrvTscDeltaMethod2CollectData(&pArgs->uWorker.M2.Data, &pArgs->uMaster.M2.Data.iCurSeqNo, pArgs->uWorker.M2.fLag);
3546 TSCDELTA_OTHER_SYNC_AFTER(pMySync, pOtherSync, fEFlags);
3547 }
3548 }
3549}
3550
3551#endif /* GIP_TSC_DELTA_METHOD_2 */
3552
3553
3554
/**
 * Verifies a worker TSC delta.
 *
 * The two CPUs take turns reading their TSCs, master first, handing the turn
 * over via the GO / GO_GO states of the sync variables.  The master then
 * walks the two tables and checks that the delta adjusted readings never go
 * backwards from one reading to the next.
 *
 * @returns The value of pArgs->rcVerify when the round completes (the master
 *          sets it to VINF_SUCCESS or VERR_OUT_OF_RANGE), or VERR_TIMEOUT if
 *          one of the synchronization macros bailed out, in which case
 *          pArgs->rcVerify is set to VERR_TRY_AGAIN.
 * @param   pArgs           The argument/state data.
 * @param   pMySync         My synchronization structure.
 * @param   pOtherSync      My partner's synchronization structure.
 * @param   fIsMaster       Set if master, clear if worker.
 * @param   iWorkerTscDelta The worker TSC delta to verify; it is subtracted
 *                          from each worker reading.
 */
3555static int supdrvTscDeltaVerify(PSUPDRVGIPTSCDELTARGS pArgs, PSUPTSCDELTASYNC2 pMySync,
3556 PSUPTSCDELTASYNC2 pOtherSync, bool fIsMaster, int64_t iWorkerTscDelta)
3557{
3558 /*PSUPGIPCPU pGipCpuWorker = pArgs->pWorker; - unused */
3559 PSUPGIPCPU pGipCpuMaster = pArgs->pMaster;
3560 uint32_t i;
3561 TSCDELTA_DBG_VARS();
3562
 /* Not a real loop: the body always returns.  It only exists so that the
    sync macros have something to 'break' out of on failure. */
3563 for (;;)
3564 {
3565 RTCCUINTREG fEFlags;
3566 AssertCompile((RT_ELEMENTS(pArgs->uMaster.Verify.auTscs) & 1) == 0);
3567 AssertCompile(RT_ELEMENTS(pArgs->uMaster.Verify.auTscs) == RT_ELEMENTS(pArgs->uWorker.Verify.auTscs));
3568
3569 if (fIsMaster)
3570 {
3571 uint64_t uTscWorker;
3572 TSCDELTA_MASTER_SYNC_BEFORE(pMySync, pOtherSync, &fEFlags, pArgs);
3573
3574 /*
3575 * Collect TSC, master goes first.
 * Two readings are taken per iteration, which is why the table
 * size is asserted to be even above.
3576 */
3577 for (i = 0; i < RT_ELEMENTS(pArgs->uMaster.Verify.auTscs); i += 2)
3578 {
3579 /* Read, kick & wait #1. */
3580 uint64_t uTsc = ASMReadTSC();
3581 ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO_GO);
3582 ASMSerializeInstruction();
3583 pArgs->uMaster.Verify.auTscs[i] = uTsc;
3584 TSCDELTA_DBG_START_LOOP();
3585 while (ASMAtomicReadU32(&pMySync->uSyncVar) == GIP_TSC_DELTA_SYNC2_GO)
3586 {
3587 TSCDELTA_DBG_CHECK_LOOP();
3588 ASMNopPause();
3589 }
3590
3591 /* Read, kick & wait #2. */
3592 uTsc = ASMReadTSC();
3593 ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO);
3594 ASMSerializeInstruction();
3595 pArgs->uMaster.Verify.auTscs[i + 1] = uTsc;
3596 TSCDELTA_DBG_START_LOOP();
3597 while (ASMAtomicReadU32(&pMySync->uSyncVar) == GIP_TSC_DELTA_SYNC2_GO_GO)
3598 {
3599 TSCDELTA_DBG_CHECK_LOOP();
3600 ASMNopPause();
3601 }
3602 }
3603
3604 TSCDELTA_MASTER_SYNC_AFTER(pMySync, pOtherSync, fEFlags);
3605
3606 /*
3607 * Process the data.
 * The readings were taken in the order M[0], W[0], M[1], W[1], ...
 * so each adjusted value must not be lower than the one before it.
3608 */
3609#ifdef TSCDELTA_VERIFY_WITH_STATS
3610 pArgs->cMaxVerifyTscTicks = INT64_MIN;
3611 pArgs->cMinVerifyTscTicks = INT64_MAX;
3612 pArgs->iVerifyBadTscDiff = 0;
3613#endif
3614 ASMAtomicWriteS32(&pArgs->rcVerify, VINF_SUCCESS);
3615 uTscWorker = 0;
3616 for (i = 0; i < RT_ELEMENTS(pArgs->uMaster.Verify.auTscs); i++)
3617 {
3618 /* Master vs previous worker entry. */
3619 uint64_t uTscMaster = pArgs->uMaster.Verify.auTscs[i] - pGipCpuMaster->i64TSCDelta;
3620 int64_t iDiff;
3621 if (i > 0)
3622 {
3623 iDiff = uTscMaster - uTscWorker;
3624#ifdef TSCDELTA_VERIFY_WITH_STATS
3625 if (iDiff > pArgs->cMaxVerifyTscTicks)
3626 pArgs->cMaxVerifyTscTicks = iDiff;
3627 if (iDiff < pArgs->cMinVerifyTscTicks)
3628 pArgs->cMinVerifyTscTicks = iDiff;
3629#endif
3630 if (iDiff < 0)
3631 {
3632#ifdef TSCDELTA_VERIFY_WITH_STATS
 /* Negated because iVerifyBadTscDiff is worker relative to
    master, while iDiff here is master minus worker. */
3633 pArgs->iVerifyBadTscDiff = -iDiff;
3634#endif
3635 ASMAtomicWriteS32(&pArgs->rcVerify, VERR_OUT_OF_RANGE);
3636 break;
3637 }
3638 }
3639
3640 /* Worker vs master. */
3641 uTscWorker = pArgs->uWorker.Verify.auTscs[i] - iWorkerTscDelta;
3642 iDiff = uTscWorker - uTscMaster;
3643#ifdef TSCDELTA_VERIFY_WITH_STATS
3644 if (iDiff > pArgs->cMaxVerifyTscTicks)
3645 pArgs->cMaxVerifyTscTicks = iDiff;
3646 if (iDiff < pArgs->cMinVerifyTscTicks)
3647 pArgs->cMinVerifyTscTicks = iDiff;
3648#endif
3649 if (iDiff < 0)
3650 {
3651#ifdef TSCDELTA_VERIFY_WITH_STATS
3652 pArgs->iVerifyBadTscDiff = iDiff;
3653#endif
3654 ASMAtomicWriteS32(&pArgs->rcVerify, VERR_OUT_OF_RANGE);
3655 break;
3656 }
3657 }
3658
3659 /* Done. */
3660 TSCDELTA_MASTER_KICK_OTHER_OUT_OF_AFTER(pMySync, pOtherSync);
3661 }
3662 else
3663 {
3664 /*
3665 * The worker, master leads.
3666 */
3667 TSCDELTA_OTHER_SYNC_BEFORE(pMySync, pOtherSync, &fEFlags, pArgs);
3668
3669 for (i = 0; i < RT_ELEMENTS(pArgs->uWorker.Verify.auTscs); i += 2)
3670 {
3671 uint64_t uTsc;
3672
3673 /* Wait, Read and Kick #1. */
3674 TSCDELTA_DBG_START_LOOP();
3675 while (ASMAtomicReadU32(&pMySync->uSyncVar) == GIP_TSC_DELTA_SYNC2_GO)
3676 {
3677 TSCDELTA_DBG_CHECK_LOOP();
3678 ASMNopPause();
3679 }
3680 uTsc = ASMReadTSC();
3681 ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO_GO);
3682 ASMSerializeInstruction();
3683 pArgs->uWorker.Verify.auTscs[i] = uTsc;
3684
3685 /* Wait, Read and Kick #2. */
3686 TSCDELTA_DBG_START_LOOP();
3687 while (ASMAtomicReadU32(&pMySync->uSyncVar) == GIP_TSC_DELTA_SYNC2_GO_GO)
3688 {
3689 TSCDELTA_DBG_CHECK_LOOP();
3690 ASMNopPause();
3691 }
3692 uTsc = ASMReadTSC();
3693 ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO);
3694 ASMSerializeInstruction();
3695 pArgs->uWorker.Verify.auTscs[i + 1] = uTsc;
3696 }
3697
3698 TSCDELTA_OTHER_SYNC_AFTER(pMySync, pOtherSync, fEFlags);
3699 }
3700 return pArgs->rcVerify;
3701 }
3702
3703 /*
3704 * Timed out, please retry.
 * (We only get here when one of the sync macros above did a 'break'.)
3705 */
3706 ASMAtomicWriteS32(&pArgs->rcVerify, VERR_TRY_AGAIN);
3707 return VERR_TIMEOUT;
3708}
3709
3710
3711
3712/**
3713 * Handles the special abort procedure during synchronization setup in
3714 * supdrvTscMeasureDeltaCallbackUnwrapped().
3715 *
3716 * @returns 0 (dummy, ignored)
3717 * @param pArgs Pointer to argument/state data.
3718 * @param pMySync Pointer to my sync structure.
3719 * @param fIsMaster Set if we're the master, clear if worker.
3720 * @param fTimeout Set if it's a timeout.
3721 */
3722DECL_NO_INLINE(static, int)
3723supdrvTscMeasureDeltaCallbackAbortSyncSetup(PSUPDRVGIPTSCDELTARGS pArgs, PSUPTSCDELTASYNC2 pMySync, bool fIsMaster, bool fTimeout)
3724{
3725 PSUPTSCDELTASYNC2 volatile *ppMySync = fIsMaster ? &pArgs->pSyncMaster : &pArgs->pSyncWorker;
3726 PSUPTSCDELTASYNC2 volatile *ppOtherSync = fIsMaster ? &pArgs->pSyncWorker : &pArgs->pSyncMaster;
3727 TSCDELTA_DBG_VARS();
3728 RT_NOREF1(pMySync);
3729
3730 /*
3731 * Clear our sync pointer and make sure the abort flag is set.
3732 */
3733 ASMAtomicWriteNullPtr(ppMySync);
3734 ASMAtomicWriteBool(&pArgs->fAbortSetup, true);
3735 if (fTimeout)
3736 ASMAtomicWriteBool(&pArgs->fTimedOut, true);
3737
3738 /*
3739 * Make sure the other party is out of there and won't be touching our
3740 * sync state again (would cause stack corruption).
3741 */
3742 TSCDELTA_DBG_START_LOOP();
3743 while (ASMAtomicReadPtrT(ppOtherSync, PSUPTSCDELTASYNC2) != NULL)
3744 {
3745 ASMNopPause();
3746 ASMNopPause();
3747 ASMNopPause();
3748 TSCDELTA_DBG_CHECK_LOOP();
3749 }
3750
3751 return 0;
3752}
3753
3754
3755/**
3756 * This is used by supdrvTscMeasureInitialDeltas() to read the TSC on two CPUs
3757 * and compute the delta between them.
3758 *
3759 * To reduce code size a good deal when timeout handling was added, a dummy return
3760 * value had to be added (saves 1-3 lines per timeout case), thus this
3761 * 'Unwrapped' function and the dummy 0 return value.
3762 *
3763 * @returns 0 (dummy, ignored)
3764 * @param idCpu The CPU we are current scheduled on.
3765 * @param pArgs Pointer to a parameter package.
3766 *
3767 * @remarks Measuring TSC deltas between the CPUs is tricky because we need to
3768 * read the TSC at exactly the same time on both the master and the
3769 * worker CPUs. Due to DMA, bus arbitration, cache locality,
3770 * contention, SMI, pipelining etc. there is no guaranteed way of
3771 * doing this on x86 CPUs.
3772 */
3773static int supdrvTscMeasureDeltaCallbackUnwrapped(RTCPUID idCpu, PSUPDRVGIPTSCDELTARGS pArgs)
3774{
3775 PSUPDRVDEVEXT pDevExt = pArgs->pDevExt;
3776 PSUPGIPCPU pGipCpuWorker = pArgs->pWorker;
3777 PSUPGIPCPU pGipCpuMaster = pArgs->pMaster;
3778 bool const fIsMaster = idCpu == pGipCpuMaster->idCpu;
3779 uint32_t iTry;
3780 PSUPTSCDELTASYNC2 volatile *ppMySync = fIsMaster ? &pArgs->pSyncMaster : &pArgs->pSyncWorker;
3781 PSUPTSCDELTASYNC2 volatile *ppOtherSync = fIsMaster ? &pArgs->pSyncWorker : &pArgs->pSyncMaster;
3782 SUPTSCDELTASYNC2 MySync;
3783 PSUPTSCDELTASYNC2 pOtherSync;
3784 int rc;
3785 TSCDELTA_DBG_VARS();
3786
3787 /* A bit of paranoia first. */
3788 if (!pGipCpuMaster || !pGipCpuWorker)
3789 return 0;
3790
3791 /*
3792 * If the CPU isn't part of the measurement, return immediately.
3793 */
3794 if ( !fIsMaster
3795 && idCpu != pGipCpuWorker->idCpu)
3796 return 0;
3797
3798 /*
3799 * Set up my synchronization stuff and wait for the other party to show up.
3800 *
3801 * We don't wait forever since the other party may be off fishing (offline,
3802 * spinning with ints disables, whatever), we must play nice to the rest of
3803 * the system as this context generally isn't one in which we will get
3804 * preempted and we may hold up a number of lower priority interrupts.
3805 */
3806 ASMAtomicWriteU32(&MySync.uSyncVar, GIP_TSC_DELTA_SYNC2_PRESTART_WAIT);
3807 ASMAtomicWritePtr(ppMySync, &MySync);
3808 MySync.uTscStart = ASMReadTSC();
3809 MySync.cMaxTscTicks = pArgs->cMaxTscTicks;
3810
3811 /* Look for the partner, might not be here yet... Special abort considerations. */
3812 iTry = 0;
3813 TSCDELTA_DBG_START_LOOP();
3814 while ((pOtherSync = ASMAtomicReadPtrT(ppOtherSync, PSUPTSCDELTASYNC2)) == NULL)
3815 {
3816 ASMNopPause();
3817 if ( ASMAtomicReadBool(&pArgs->fAbortSetup)
3818 || !RTMpIsCpuOnline(fIsMaster ? pGipCpuWorker->idCpu : pGipCpuMaster->idCpu) )
3819 return supdrvTscMeasureDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, false /*fTimeout*/);
3820 if ( (iTry++ & 0xff) == 0
3821 && ASMReadTSC() - MySync.uTscStart > pArgs->cMaxTscTicks)
3822 return supdrvTscMeasureDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, true /*fTimeout*/);
3823 TSCDELTA_DBG_CHECK_LOOP();
3824 ASMNopPause();
3825 }
3826
3827 /* I found my partner, waiting to be found... Special abort considerations. */
3828 if (fIsMaster)
3829 if (!ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_READY, GIP_TSC_DELTA_SYNC2_PRESTART_WAIT)) /* parnaoia */
3830 return supdrvTscMeasureDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, false /*fTimeout*/);
3831
3832 iTry = 0;
3833 TSCDELTA_DBG_START_LOOP();
3834 while (ASMAtomicReadU32(&MySync.uSyncVar) == GIP_TSC_DELTA_SYNC2_PRESTART_WAIT)
3835 {
3836 ASMNopPause();
3837 if (ASMAtomicReadBool(&pArgs->fAbortSetup))
3838 return supdrvTscMeasureDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, false /*fTimeout*/);
3839 if ( (iTry++ & 0xff) == 0
3840 && ASMReadTSC() - MySync.uTscStart > pArgs->cMaxTscTicks)
3841 {
3842 if ( fIsMaster
3843 && !ASMAtomicCmpXchgU32(&MySync.uSyncVar, GIP_TSC_DELTA_SYNC2_PRESTART_ABORT, GIP_TSC_DELTA_SYNC2_PRESTART_WAIT))
3844 break; /* race #1: slave has moved on, handle timeout in loop instead. */
3845 return supdrvTscMeasureDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, true /*fTimeout*/);
3846 }
3847 TSCDELTA_DBG_CHECK_LOOP();
3848 }
3849
3850 if (!fIsMaster)
3851 if (!ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_READY, GIP_TSC_DELTA_SYNC2_PRESTART_WAIT)) /* race #1 */
3852 return supdrvTscMeasureDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, false /*fTimeout*/);
3853
3854/** @todo Add a resumable state to pArgs so we don't waste time if we time
3855 * out or something. Timeouts are legit, any of the two CPUs may get
3856 * interrupted. */
3857
3858 /*
3859 * Start by seeing if we have a zero delta between the two CPUs.
3860 * This should normally be the case.
3861 */
3862 rc = supdrvTscDeltaVerify(pArgs, &MySync, pOtherSync, fIsMaster, GIP_TSC_DELTA_INITIAL_MASTER_VALUE);
3863 if (RT_SUCCESS(rc))
3864 {
3865 if (fIsMaster)
3866 {
3867 ASMAtomicWriteS64(&pGipCpuWorker->i64TSCDelta, GIP_TSC_DELTA_INITIAL_MASTER_VALUE);
3868 RTCpuSetDelByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet);
3869 RTCpuSetAddByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpuWorker->iCpuSet);
3870 }
3871 }
3872 /*
3873 * If the verification didn't time out, do regular delta measurements.
3874 * We retry this until we get a reasonable value.
3875 */
3876 else if (rc != VERR_TIMEOUT)
3877 {
3878 Assert(pGipCpuWorker->i64TSCDelta == INT64_MAX);
3879 for (iTry = 0; iTry < 12; iTry++)
3880 {
3881 /*
3882 * Check the state before we start.
3883 */
3884 uint32_t u32Tmp = ASMAtomicReadU32(&MySync.uSyncVar);
3885 if ( u32Tmp != GIP_TSC_DELTA_SYNC2_READY
3886 && (fIsMaster || u32Tmp != GIP_TSC_DELTA_SYNC2_STEADY) /* worker may be late prepping for the next round */ )
3887 {
3888 TSCDELTA_DBG_SYNC_MSG(("sync/loop/%s: #0 iTry=%u MyState=%#x\n", fIsMaster ? "master" : "worker", iTry, u32Tmp));
3889 break;
3890 }
3891
3892 /*
3893 * Do the measurements.
3894 */
3895#ifdef GIP_TSC_DELTA_METHOD_1
3896 supdrvTscDeltaMethod1Loop(pArgs, &MySync, pOtherSync, fIsMaster, iTry);
3897#elif defined(GIP_TSC_DELTA_METHOD_2)
3898 supdrvTscDeltaMethod2Loop(pArgs, &MySync, pOtherSync, fIsMaster, iTry);
3899#else
3900# error "huh??"
3901#endif
3902
3903 /*
3904 * Check the state.
3905 */
3906 u32Tmp = ASMAtomicReadU32(&MySync.uSyncVar);
3907 if ( u32Tmp != GIP_TSC_DELTA_SYNC2_READY
3908 && (fIsMaster || u32Tmp != GIP_TSC_DELTA_SYNC2_STEADY) /* worker may be late prepping for the next round */ )
3909 {
3910 if (fIsMaster)
3911 TSCDELTA_DBG_SYNC_MSG(("sync/loop/master: #1 iTry=%u MyState=%#x\n", iTry, u32Tmp));
3912 else
3913 TSCDELTA_DBG_SYNC_MSG2(("sync/loop/worker: #1 iTry=%u MyState=%#x\n", iTry, u32Tmp));
3914 break;
3915 }
3916
3917 /*
3918 * Success? If so, stop trying. Master decides.
3919 */
3920 if (fIsMaster)
3921 {
3922 if (pGipCpuWorker->i64TSCDelta != INT64_MAX)
3923 {
3924 RTCpuSetDelByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet);
3925 RTCpuSetAddByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpuWorker->iCpuSet);
3926 TSCDELTA_DBG_SYNC_MSG2(("sync/loop/master: #9 iTry=%u MyState=%#x\n", iTry, MySync.uSyncVar));
3927 break;
3928 }
3929 }
3930 }
3931 if (fIsMaster)
3932 pArgs->iTry = iTry;
3933 }
3934
3935 /*
3936 * End the synchronization dance. We tell the other that we're done,
3937 * then wait for the same kind of reply.
3938 */
3939 ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_FINAL);
3940 ASMAtomicWriteNullPtr(ppMySync);
3941 iTry = 0;
3942 TSCDELTA_DBG_START_LOOP();
3943 while (ASMAtomicReadU32(&MySync.uSyncVar) != GIP_TSC_DELTA_SYNC2_FINAL)
3944 {
3945 iTry++;
3946 if ( iTry == 0
3947 && !RTMpIsCpuOnline(fIsMaster ? pGipCpuWorker->idCpu : pGipCpuMaster->idCpu))
3948 break; /* this really shouldn't happen. */
3949 TSCDELTA_DBG_CHECK_LOOP();
3950 ASMNopPause();
3951 }
3952
3953 /*
3954 * Collect some runtime stats.
3955 */
3956 if (fIsMaster)
3957 pArgs->cElapsedMasterTscTicks = ASMReadTSC() - MySync.uTscStart;
3958 else
3959 pArgs->cElapsedWorkerTscTicks = ASMReadTSC() - MySync.uTscStart;
3960 return 0;
3961}
3962
3963/**
3964 * Callback used by supdrvTscMeasureInitialDeltas() to read the TSC on two CPUs
3965 * and compute the delta between them.
3966 *
3967 * @param idCpu The CPU we are current scheduled on.
3968 * @param pvUser1 Pointer to a parameter package (SUPDRVGIPTSCDELTARGS).
3969 * @param pvUser2 Unused.
3970 */
3971static DECLCALLBACK(void) supdrvTscMeasureDeltaCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
3972{
3973 supdrvTscMeasureDeltaCallbackUnwrapped(idCpu, (PSUPDRVGIPTSCDELTARGS)pvUser1);
3974 RT_NOREF1(pvUser2);
3975}
3976
3977
3978/**
3979 * Measures the TSC delta between the master GIP CPU and one specified worker
3980 * CPU.
3981 *
3982 * @returns VBox status code.
3983 * @retval VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED on pure measurement
3984 * failure.
3985 * @param pDevExt Pointer to the device instance data.
3986 * @param idxWorker The index of the worker CPU from the GIP's array of
3987 * CPUs.
3988 *
3989 * @remarks This must be called with preemption enabled!
3990 */
3991static int supdrvTscMeasureDeltaOne(PSUPDRVDEVEXT pDevExt, uint32_t idxWorker)
3992{
3993 int rc;
3994 int rc2;
3995 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
3996 RTCPUID idMaster = pDevExt->idGipMaster;
3997 PSUPGIPCPU pGipCpuWorker = &pGip->aCPUs[idxWorker];
3998 PSUPGIPCPU pGipCpuMaster;
3999 uint32_t iGipCpuMaster;
4000 uint32_t u32Tmp;
4001
4002 /* Validate input a bit. */
4003 AssertReturn(pGip, VERR_INVALID_PARAMETER);
4004 Assert(pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED);
4005 Assert(RTThreadPreemptIsEnabled(NIL_RTTHREAD));
4006
4007 /*
4008 * Don't attempt measuring the delta for the GIP master.
4009 */
4010 if (pGipCpuWorker->idCpu == idMaster)
4011 {
4012 if (pGipCpuWorker->i64TSCDelta == INT64_MAX) /* This shouldn't happen, but just in case. */
4013 ASMAtomicWriteS64(&pGipCpuWorker->i64TSCDelta, GIP_TSC_DELTA_INITIAL_MASTER_VALUE);
4014 return VINF_SUCCESS;
4015 }
4016
4017 /*
4018 * One measurement at a time, at least for now. We might be using
4019 * broadcast IPIs so, so be nice to the rest of the system.
4020 */
4021#ifdef SUPDRV_USE_MUTEX_FOR_GIP
4022 rc = RTSemMutexRequest(pDevExt->mtxTscDelta, RT_INDEFINITE_WAIT);
4023#else
4024 rc = RTSemFastMutexRequest(pDevExt->mtxTscDelta);
4025#endif
4026 if (RT_FAILURE(rc))
4027 return rc;
4028
4029 /*
4030 * If the CPU has hyper-threading and the APIC IDs of the master and worker are adjacent,
4031 * try pick a different master. (This fudge only works with multi core systems.)
4032 * ASSUMES related threads have adjacent APIC IDs. ASSUMES two threads per core.
4033 *
4034 * We skip this on AMDs for now as their HTT is different from Intel's and
4035 * it doesn't seem to have any favorable effect on the results.
4036 *
4037 * If the master is offline, we need a new master too, so share the code.
4038 */
4039 iGipCpuMaster = supdrvGipFindCpuIndexForCpuId(pGip, idMaster);
4040 AssertReturn(iGipCpuMaster < pGip->cCpus, VERR_INVALID_CPU_ID);
4041 pGipCpuMaster = &pGip->aCPUs[iGipCpuMaster];
4042 if ( ( (pGipCpuMaster->idApic & ~1) == (pGipCpuWorker->idApic & ~1)
4043 && pGip->cOnlineCpus > 2
4044 && ASMHasCpuId()
4045 && RTX86IsValidStdRange(ASMCpuId_EAX(0))
4046 && (ASMCpuId_EDX(1) & X86_CPUID_FEATURE_EDX_HTT)
4047 && ( !ASMIsAmdCpu()
4048 || RTX86GetCpuFamily(u32Tmp = ASMCpuId_EAX(1)) > 0x15
4049 || ( RTX86GetCpuFamily(u32Tmp) == 0x15 /* Piledriver+, not bulldozer (FX-4150 didn't like it). */
4050 && RTX86GetCpuModelAMD(u32Tmp) >= 0x02) ) )
4051 || !RTMpIsCpuOnline(idMaster) )
4052 {
4053 uint32_t i;
4054 for (i = 0; i < pGip->cCpus; i++)
4055 if ( i != iGipCpuMaster
4056 && i != idxWorker
4057 && pGip->aCPUs[i].enmState == SUPGIPCPUSTATE_ONLINE
4058 && pGip->aCPUs[i].i64TSCDelta != INT64_MAX
4059 && pGip->aCPUs[i].idCpu != NIL_RTCPUID
4060 && pGip->aCPUs[i].idCpu != idMaster /* paranoia starts here... */
4061 && pGip->aCPUs[i].idCpu != pGipCpuWorker->idCpu
4062 && pGip->aCPUs[i].idApic != pGipCpuWorker->idApic
4063 && pGip->aCPUs[i].idApic != pGipCpuMaster->idApic
4064 && RTMpIsCpuOnline(pGip->aCPUs[i].idCpu))
4065 {
4066 iGipCpuMaster = i;
4067 pGipCpuMaster = &pGip->aCPUs[i];
4068 idMaster = pGipCpuMaster->idCpu;
4069 break;
4070 }
4071 }
4072
4073 if (RTCpuSetIsMemberByIndex(&pGip->OnlineCpuSet, pGipCpuWorker->iCpuSet))
4074 {
4075 /*
4076 * Initialize data package for the RTMpOnPair callback.
4077 */
4078 PSUPDRVGIPTSCDELTARGS pArgs = (PSUPDRVGIPTSCDELTARGS)RTMemAllocZ(sizeof(*pArgs));
4079 if (pArgs)
4080 {
4081 pArgs->pWorker = pGipCpuWorker;
4082 pArgs->pMaster = pGipCpuMaster;
4083 pArgs->pDevExt = pDevExt;
4084 pArgs->pSyncMaster = NULL;
4085 pArgs->pSyncWorker = NULL;
4086 pArgs->cMaxTscTicks = ASMAtomicReadU64(&pGip->u64CpuHz) / 512; /* 1953 us */
4087
4088 /*
4089 * Do the RTMpOnPair call. We reset i64TSCDelta first so we
4090 * and supdrvTscMeasureDeltaCallback can use it as a success check.
4091 */
4092 /** @todo Store the i64TSCDelta result in pArgs first? Perhaps deals with
4093 * that when doing the restart loop reorg. */
4094 ASMAtomicWriteS64(&pGipCpuWorker->i64TSCDelta, INT64_MAX);
4095 rc = RTMpOnPair(pGipCpuMaster->idCpu, pGipCpuWorker->idCpu, RTMPON_F_CONCURRENT_EXEC,
4096 supdrvTscMeasureDeltaCallback, pArgs, NULL);
4097 if (RT_SUCCESS(rc))
4098 {
4099#if 0
4100 SUPR0Printf("mponpair ticks: %9llu %9llu max: %9llu iTry: %u%s\n", pArgs->cElapsedMasterTscTicks,
4101 pArgs->cElapsedWorkerTscTicks, pArgs->cMaxTscTicks, pArgs->iTry,
4102 pArgs->fTimedOut ? " timed out" :"");
4103#endif
4104#if 0
4105 SUPR0Printf("rcVerify=%d iVerifyBadTscDiff=%lld cMinVerifyTscTicks=%lld cMaxVerifyTscTicks=%lld\n",
4106 pArgs->rcVerify, pArgs->iVerifyBadTscDiff, pArgs->cMinVerifyTscTicks, pArgs->cMaxVerifyTscTicks);
4107#endif
4108 if (RT_LIKELY(pGipCpuWorker->i64TSCDelta != INT64_MAX))
4109 {
4110 /*
4111 * Work the TSC delta applicability rating. It starts
4112 * optimistic in supdrvGipInit, we downgrade it here.
4113 */
4114 SUPGIPUSETSCDELTA enmRating;
4115 if ( pGipCpuWorker->i64TSCDelta > GIP_TSC_DELTA_THRESHOLD_ROUGHLY_ZERO
4116 || pGipCpuWorker->i64TSCDelta < -GIP_TSC_DELTA_THRESHOLD_ROUGHLY_ZERO)
4117 enmRating = SUPGIPUSETSCDELTA_NOT_ZERO;
4118 else if ( pGipCpuWorker->i64TSCDelta > GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO
4119 || pGipCpuWorker->i64TSCDelta < -GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO)
4120 enmRating = SUPGIPUSETSCDELTA_ROUGHLY_ZERO;
4121 else
4122 enmRating = SUPGIPUSETSCDELTA_PRACTICALLY_ZERO;
4123 if (pGip->enmUseTscDelta < enmRating)
4124 {
4125 AssertCompile(sizeof(pGip->enmUseTscDelta) == sizeof(uint32_t));
4126 ASMAtomicWriteU32((uint32_t volatile *)&pGip->enmUseTscDelta, enmRating);
4127 }
4128 }
4129 else
4130 rc = VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED;
4131 }
4132 /** @todo return try-again if we get an offline CPU error. */
4133
4134 RTMemFree(pArgs);
4135 }
4136 else
4137 rc = VERR_NO_MEMORY;
4138 }
4139 else
4140 rc = VERR_CPU_OFFLINE;
4141
4142 /*
4143 * We're done now.
4144 */
4145#ifdef SUPDRV_USE_MUTEX_FOR_GIP
4146 rc2 = RTSemMutexRelease(pDevExt->mtxTscDelta); AssertRC(rc2);
4147#else
4148 rc2 = RTSemFastMutexRelease(pDevExt->mtxTscDelta); AssertRC(rc2);
4149#endif
4150 return rc;
4151}
4152
4153
4154/**
4155 * Resets the TSC-delta related TSC samples and optionally the deltas
4156 * themselves.
4157 *
4158 * @param pDevExt Pointer to the device instance data.
4159 * @param fResetTscDeltas Whether the TSC-deltas are also to be reset.
4160 *
4161 * @remarks This might be called while holding a spinlock!
4162 */
4163static void supdrvTscResetSamples(PSUPDRVDEVEXT pDevExt, bool fResetTscDeltas)
4164{
4165 unsigned iCpu;
4166 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
4167 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
4168 {
4169 PSUPGIPCPU pGipCpu = &pGip->aCPUs[iCpu];
4170 ASMAtomicWriteU64(&pGipCpu->u64TSCSample, GIP_TSC_DELTA_RSVD);
4171 if (fResetTscDeltas)
4172 {
4173 RTCpuSetDelByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpu->iCpuSet);
4174 ASMAtomicWriteS64(&pGipCpu->i64TSCDelta, INT64_MAX);
4175 }
4176 }
4177}
4178
4179
4180/**
4181 * Picks an online CPU as the master TSC for TSC-delta computations.
4182 *
4183 * @returns VBox status code.
4184 * @param pDevExt Pointer to the device instance data.
4185 * @param pidxMaster Where to store the CPU array index of the chosen
4186 * master. Optional, can be NULL.
4187 */
4188static int supdrvTscPickMaster(PSUPDRVDEVEXT pDevExt, uint32_t *pidxMaster)
4189{
4190 /*
4191 * Pick the first CPU online as the master TSC and make it the new GIP master based
4192 * on the APIC ID.
4193 *
4194 * Technically we can simply use "idGipMaster" but doing this gives us master as CPU 0
4195 * in most cases making it nicer/easier for comparisons. It is safe to update the GIP
4196 * master as this point since the sync/async timer isn't created yet.
4197 */
4198 unsigned iCpu;
4199 uint32_t idxMaster = UINT32_MAX;
4200 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
4201 for (iCpu = 0; iCpu < RT_ELEMENTS(pGip->aiCpuFromApicId); iCpu++)
4202 {
4203 uint16_t idxCpu = pGip->aiCpuFromApicId[iCpu];
4204 if (idxCpu != UINT16_MAX)
4205 {
4206 PSUPGIPCPU pGipCpu = &pGip->aCPUs[idxCpu];
4207 if (RTCpuSetIsMemberByIndex(&pGip->OnlineCpuSet, pGipCpu->iCpuSet))
4208 {
4209 idxMaster = idxCpu;
4210 pGipCpu->i64TSCDelta = GIP_TSC_DELTA_INITIAL_MASTER_VALUE;
4211 ASMAtomicWriteSize(&pDevExt->idGipMaster, pGipCpu->idCpu);
4212 if (pidxMaster)
4213 *pidxMaster = idxMaster;
4214 return VINF_SUCCESS;
4215 }
4216 }
4217 }
4218 return VERR_CPU_OFFLINE;
4219}
4220
4221
4222/**
4223 * Performs the initial measurements of the TSC deltas between CPUs.
4224 *
4225 * This is called by supdrvGipCreate(), supdrvGipPowerNotificationCallback() or
4226 * triggered by it if threaded.
4227 *
4228 * @returns VBox status code.
4229 * @param pDevExt Pointer to the device instance data.
4230 *
4231 * @remarks Must be called only after supdrvGipInitOnCpu() as this function uses
4232 * idCpu, GIP's online CPU set which are populated in
4233 * supdrvGipInitOnCpu().
4234 */
4235static int supdrvTscMeasureInitialDeltas(PSUPDRVDEVEXT pDevExt)
4236{
4237 PSUPGIPCPU pGipCpuMaster;
4238 unsigned iCpu;
4239 unsigned iOddEven;
4240 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
4241 uint32_t idxMaster = UINT32_MAX;
4242 uint32_t cMpOnOffEvents = ASMAtomicReadU32(&pDevExt->cMpOnOffEvents);
4243
4244 Assert(pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED);
4245 supdrvTscResetSamples(pDevExt, true /* fClearDeltas */);
4246 int rc = supdrvTscPickMaster(pDevExt, &idxMaster);
4247 if (RT_FAILURE(rc))
4248 {
4249 SUPR0Printf("Failed to pick a CPU master for TSC-delta measurements rc=%Rrc\n", rc);
4250 return rc;
4251 }
4252 AssertReturn(idxMaster < pGip->cCpus, VERR_INVALID_CPU_INDEX);
4253 pGipCpuMaster = &pGip->aCPUs[idxMaster];
4254 Assert(pDevExt->idGipMaster == pGipCpuMaster->idCpu);
4255
4256 /*
4257 * If there is only a single CPU online we have nothing to do.
4258 */
4259 if (pGip->cOnlineCpus <= 1)
4260 {
4261 AssertReturn(pGip->cOnlineCpus > 0, VERR_INTERNAL_ERROR_5);
4262 return VINF_SUCCESS;
4263 }
4264
4265 /*
4266 * Loop thru the GIP CPU array and get deltas for each CPU (except the
4267 * master). We do the CPUs with the even numbered APIC IDs first so that
4268 * we've got alternative master CPUs to pick from on hyper-threaded systems.
4269 */
4270 for (iOddEven = 0; iOddEven < 2; iOddEven++)
4271 {
4272 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
4273 {
4274 PSUPGIPCPU pGipCpuWorker = &pGip->aCPUs[iCpu];
4275 if ( iCpu != idxMaster
4276 && (iOddEven > 0 || (pGipCpuWorker->idApic & 1) == 0)
4277 && RTCpuSetIsMemberByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet))
4278 {
4279 rc = supdrvTscMeasureDeltaOne(pDevExt, iCpu);
4280 if (RT_FAILURE(rc))
4281 {
4282 SUPR0Printf("supdrvTscMeasureDeltaOne failed. rc=%d CPU[%u].idCpu=%u Master[%u].idCpu=%u\n", rc, iCpu,
4283 pGipCpuWorker->idCpu, idxMaster, pDevExt->idGipMaster, pGipCpuMaster->idCpu);
4284 break;
4285 }
4286
4287 if (ASMAtomicReadU32(&pDevExt->cMpOnOffEvents) != cMpOnOffEvents)
4288 {
4289 SUPR0Printf("One or more CPUs transitioned between online & offline states. I'm confused, retry...\n");
4290 rc = VERR_TRY_AGAIN;
4291 break;
4292 }
4293 }
4294 }
4295 }
4296
4297 return rc;
4298}
4299
4300
4301#ifdef SUPDRV_USE_TSC_DELTA_THREAD
4302
4303/**
4304 * Switches the TSC-delta measurement thread into the butchered state.
4305 *
4306 * @returns VBox status code.
4307 * @param pDevExt Pointer to the device instance data.
4308 * @param fSpinlockHeld Whether the TSC-delta spinlock is held or not.
4309 * @param pszFailed An error message to log.
4310 * @param rcFailed The error code to exit the thread with.
4311 */
4312static int supdrvTscDeltaThreadButchered(PSUPDRVDEVEXT pDevExt, bool fSpinlockHeld, const char *pszFailed, int rcFailed)
4313{
4314 if (!fSpinlockHeld)
4315 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4316
4317 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Butchered;
4318 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4319 OSDBGPRINT(("supdrvTscDeltaThreadButchered: %s. rc=%Rrc\n", pszFailed, rcFailed));
4320 return rcFailed;
4321}
4322
4323
4324/**
4325 * The TSC-delta measurement thread.
4326 *
4327 * @returns VBox status code.
4328 * @param hThread The thread handle.
4329 * @param pvUser Opaque pointer to the device instance data.
4330 */
4331static DECLCALLBACK(int) supdrvTscDeltaThread(RTTHREAD hThread, void *pvUser)
4332{
4333 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
4334 int rc = VERR_INTERNAL_ERROR_2;
4335 for (;;)
4336 {
4337 /*
4338 * Switch on the current state.
4339 */
4340 SUPDRVTSCDELTATHREADSTATE enmState;
4341 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4342 enmState = pDevExt->enmTscDeltaThreadState;
4343 switch (enmState)
4344 {
4345 case kTscDeltaThreadState_Creating:
4346 {
4347 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Listening;
4348 rc = RTSemEventSignal(pDevExt->hTscDeltaEvent);
4349 if (RT_FAILURE(rc))
4350 return supdrvTscDeltaThreadButchered(pDevExt, true /* fSpinlockHeld */, "RTSemEventSignal", rc);
4351 RT_FALL_THRU();
4352 }
4353
4354 case kTscDeltaThreadState_Listening:
4355 {
4356 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4357
4358 /*
4359 * Linux counts uninterruptible sleeps as load, hence we shall do a
4360 * regular, interruptible sleep here and ignore wake ups due to signals.
4361 * See task_contributes_to_load() in include/linux/sched.h in the Linux sources.
4362 */
4363 rc = RTThreadUserWaitNoResume(hThread, pDevExt->cMsTscDeltaTimeout);
4364 if ( RT_FAILURE(rc)
4365 && rc != VERR_TIMEOUT
4366 && rc != VERR_INTERRUPTED)
4367 return supdrvTscDeltaThreadButchered(pDevExt, false /* fSpinlockHeld */, "RTThreadUserWait", rc);
4368 RTThreadUserReset(hThread);
4369 break;
4370 }
4371
4372 case kTscDeltaThreadState_WaitAndMeasure:
4373 {
4374 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Measuring;
4375 rc = RTSemEventSignal(pDevExt->hTscDeltaEvent); /* (Safe on windows as long as spinlock isn't IRQ safe.) */
4376 if (RT_FAILURE(rc))
4377 return supdrvTscDeltaThreadButchered(pDevExt, true /* fSpinlockHeld */, "RTSemEventSignal", rc);
4378 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4379 RTThreadSleep(1);
4380 RT_FALL_THRU();
4381 }
4382
4383 case kTscDeltaThreadState_Measuring:
4384 {
4385 if (pDevExt->fTscThreadRecomputeAllDeltas)
4386 {
4387 int cTries = 8;
4388 int cMsWaitPerTry = 10;
4389 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
4390 Assert(pGip);
4391 do
4392 {
4393 RTCpuSetCopy(&pDevExt->TscDeltaCpuSet, &pGip->OnlineCpuSet);
4394 rc = supdrvTscMeasureInitialDeltas(pDevExt);
4395 if ( RT_SUCCESS(rc)
4396 || ( RT_FAILURE(rc)
4397 && rc != VERR_TRY_AGAIN
4398 && rc != VERR_CPU_OFFLINE))
4399 {
4400 break;
4401 }
4402 RTThreadSleep(cMsWaitPerTry);
4403 } while (cTries-- > 0);
4404 pDevExt->fTscThreadRecomputeAllDeltas = false;
4405 }
4406 else
4407 {
4408 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
4409 unsigned iCpu;
4410
4411 /* Measure TSC-deltas only for the CPUs that are in the set. */
4412 rc = VINF_SUCCESS;
4413 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
4414 {
4415 PSUPGIPCPU pGipCpuWorker = &pGip->aCPUs[iCpu];
4416 if (RTCpuSetIsMemberByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet))
4417 {
4418 if (pGipCpuWorker->i64TSCDelta == INT64_MAX)
4419 {
4420 int rc2 = supdrvTscMeasureDeltaOne(pDevExt, iCpu);
4421 if (RT_FAILURE(rc2) && RT_SUCCESS(rc))
4422 rc = rc2;
4423 }
4424 else
4425 {
4426 /*
4427 * The thread/someone must've called SUPR0TscDeltaMeasureBySetIndex(),
4428 * mark the delta as fine to get the timer thread off our back.
4429 */
4430 RTCpuSetDelByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet);
4431 RTCpuSetAddByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpuWorker->iCpuSet);
4432 }
4433 }
4434 }
4435 }
4436 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4437 if (pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Measuring)
4438 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Listening;
4439 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4440 Assert(rc != VERR_NOT_AVAILABLE); /* VERR_NOT_AVAILABLE is used as init value, see supdrvTscDeltaThreadInit(). */
4441 ASMAtomicWriteS32(&pDevExt->rcTscDelta, rc);
4442 break;
4443 }
4444
4445 case kTscDeltaThreadState_Terminating:
4446 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Destroyed;
4447 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4448 return VINF_SUCCESS;
4449
4450 case kTscDeltaThreadState_Butchered:
4451 default:
4452 return supdrvTscDeltaThreadButchered(pDevExt, true /* fSpinlockHeld */, "Invalid state", VERR_INVALID_STATE);
4453 }
4454 }
4455 /* not reached */
4456}
4457
4458
4459/**
4460 * Waits for the TSC-delta measurement thread to respond to a state change.
4461 *
4462 * @returns VINF_SUCCESS on success, VERR_TIMEOUT if it doesn't respond in time,
4463 * other error code on internal error.
4464 *
4465 * @param pDevExt The device instance data.
4466 * @param enmCurState The current state.
4467 * @param enmNewState The new state we're waiting for it to enter.
4468 */
4469static int supdrvTscDeltaThreadWait(PSUPDRVDEVEXT pDevExt, SUPDRVTSCDELTATHREADSTATE enmCurState,
4470 SUPDRVTSCDELTATHREADSTATE enmNewState)
4471{
4472 SUPDRVTSCDELTATHREADSTATE enmActualState;
4473 int rc;
4474
4475 /*
4476 * Wait a short while for the expected state transition.
4477 */
4478 RTSemEventWait(pDevExt->hTscDeltaEvent, RT_MS_1SEC);
4479 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4480 enmActualState = pDevExt->enmTscDeltaThreadState;
4481 if (enmActualState == enmNewState)
4482 {
4483 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4484 rc = VINF_SUCCESS;
4485 }
4486 else if (enmActualState == enmCurState)
4487 {
4488 /*
4489 * Wait longer if the state has not yet transitioned to the one we want.
4490 */
4491 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4492 rc = RTSemEventWait(pDevExt->hTscDeltaEvent, 50 * RT_MS_1SEC);
4493 if ( RT_SUCCESS(rc)
4494 || rc == VERR_TIMEOUT)
4495 {
4496 /*
4497 * Check the state whether we've succeeded.
4498 */
4499 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4500 enmActualState = pDevExt->enmTscDeltaThreadState;
4501 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4502 if (enmActualState == enmNewState)
4503 rc = VINF_SUCCESS;
4504 else if (enmActualState == enmCurState)
4505 {
4506 rc = VERR_TIMEOUT;
4507 OSDBGPRINT(("supdrvTscDeltaThreadWait: timed out state transition. enmActualState=%d enmNewState=%d\n",
4508 enmActualState, enmNewState));
4509 }
4510 else
4511 {
4512 rc = VERR_INTERNAL_ERROR;
4513 OSDBGPRINT(("supdrvTscDeltaThreadWait: invalid state transition from %d to %d, expected %d\n", enmCurState,
4514 enmActualState, enmNewState));
4515 }
4516 }
4517 else
4518 OSDBGPRINT(("supdrvTscDeltaThreadWait: RTSemEventWait failed. rc=%Rrc\n", rc));
4519 }
4520 else
4521 {
4522 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4523 OSDBGPRINT(("supdrvTscDeltaThreadWait: invalid state %d when transitioning from %d to %d\n",
4524 enmActualState, enmCurState, enmNewState));
4525 rc = VERR_INTERNAL_ERROR;
4526 }
4527
4528 return rc;
4529}
4530
4531
4532/**
4533 * Signals the TSC-delta thread to start measuring TSC-deltas.
4534 *
4535 * @param pDevExt Pointer to the device instance data.
4536 * @param fForceAll Force re-calculating TSC-deltas on all CPUs.
4537 */
4538static void supdrvTscDeltaThreadStartMeasurement(PSUPDRVDEVEXT pDevExt, bool fForceAll)
4539{
4540 if (pDevExt->hTscDeltaThread != NIL_RTTHREAD)
4541 {
4542 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4543 if ( pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Listening
4544 || pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Measuring)
4545 {
4546 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_WaitAndMeasure;
4547 if (fForceAll)
4548 pDevExt->fTscThreadRecomputeAllDeltas = true;
4549 }
4550 else if ( pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_WaitAndMeasure
4551 && fForceAll)
4552 pDevExt->fTscThreadRecomputeAllDeltas = true;
4553 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4554 RTThreadUserSignal(pDevExt->hTscDeltaThread);
4555 }
4556}
4557
4558
4559/**
4560 * Terminates the actual thread running supdrvTscDeltaThread().
4561 *
4562 * This is an internal worker function for supdrvTscDeltaThreadInit() and
4563 * supdrvTscDeltaTerm().
4564 *
4565 * @param pDevExt Pointer to the device instance data.
4566 */
4567static void supdrvTscDeltaThreadTerminate(PSUPDRVDEVEXT pDevExt)
4568{
4569 int rc;
4570 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4571 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Terminating;
4572 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4573 RTThreadUserSignal(pDevExt->hTscDeltaThread);
4574 rc = RTThreadWait(pDevExt->hTscDeltaThread, 50 * RT_MS_1SEC, NULL /* prc */);
4575 if (RT_FAILURE(rc))
4576 {
4577 /* Signal a few more times before giving up. */
4578 int cTriesLeft = 5;
4579 while (--cTriesLeft > 0)
4580 {
4581 RTThreadUserSignal(pDevExt->hTscDeltaThread);
4582 rc = RTThreadWait(pDevExt->hTscDeltaThread, 2 * RT_MS_1SEC, NULL /* prc */);
4583 if (rc != VERR_TIMEOUT)
4584 break;
4585 }
4586 }
4587}
4588
4589
4590/**
4591 * Initializes and spawns the TSC-delta measurement thread.
4592 *
4593 * A thread is required for servicing re-measurement requests from events like
4594 * CPUs coming online, suspend/resume etc. as it cannot be done synchronously
4595 * under all contexts on all OSs.
4596 *
4597 * @returns VBox status code.
4598 * @param pDevExt Pointer to the device instance data.
4599 *
4600 * @remarks Must only be called -after- initializing GIP and setting up MP
4601 * notifications!
4602 */
4603static int supdrvTscDeltaThreadInit(PSUPDRVDEVEXT pDevExt)
4604{
4605 int rc;
4606 Assert(pDevExt->pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED);
4607 rc = RTSpinlockCreate(&pDevExt->hTscDeltaSpinlock, RTSPINLOCK_FLAGS_INTERRUPT_UNSAFE, "VBoxTscSpnLck");
4608 if (RT_SUCCESS(rc))
4609 {
4610 rc = RTSemEventCreate(&pDevExt->hTscDeltaEvent);
4611 if (RT_SUCCESS(rc))
4612 {
4613 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Creating;
4614 pDevExt->cMsTscDeltaTimeout = 60000;
4615 rc = RTThreadCreate(&pDevExt->hTscDeltaThread, supdrvTscDeltaThread, pDevExt, 0 /* cbStack */,
4616 RTTHREADTYPE_DEFAULT, RTTHREADFLAGS_WAITABLE, "VBoxTscThread");
4617 if (RT_SUCCESS(rc))
4618 {
4619 rc = supdrvTscDeltaThreadWait(pDevExt, kTscDeltaThreadState_Creating, kTscDeltaThreadState_Listening);
4620 if (RT_SUCCESS(rc))
4621 {
4622 ASMAtomicWriteS32(&pDevExt->rcTscDelta, VERR_NOT_AVAILABLE);
4623 return rc;
4624 }
4625
4626 OSDBGPRINT(("supdrvTscDeltaInit: supdrvTscDeltaThreadWait failed. rc=%Rrc\n", rc));
4627 supdrvTscDeltaThreadTerminate(pDevExt);
4628 }
4629 else
4630 OSDBGPRINT(("supdrvTscDeltaInit: RTThreadCreate failed. rc=%Rrc\n", rc));
4631 RTSemEventDestroy(pDevExt->hTscDeltaEvent);
4632 pDevExt->hTscDeltaEvent = NIL_RTSEMEVENT;
4633 }
4634 else
4635 OSDBGPRINT(("supdrvTscDeltaInit: RTSemEventCreate failed. rc=%Rrc\n", rc));
4636 RTSpinlockDestroy(pDevExt->hTscDeltaSpinlock);
4637 pDevExt->hTscDeltaSpinlock = NIL_RTSPINLOCK;
4638 }
4639 else
4640 OSDBGPRINT(("supdrvTscDeltaInit: RTSpinlockCreate failed. rc=%Rrc\n", rc));
4641
4642 return rc;
4643}
4644
4645
4646/**
4647 * Terminates the TSC-delta measurement thread and cleanup.
4648 *
4649 * @param pDevExt Pointer to the device instance data.
4650 */
4651static void supdrvTscDeltaTerm(PSUPDRVDEVEXT pDevExt)
4652{
4653 if ( pDevExt->hTscDeltaSpinlock != NIL_RTSPINLOCK
4654 && pDevExt->hTscDeltaEvent != NIL_RTSEMEVENT)
4655 {
4656 supdrvTscDeltaThreadTerminate(pDevExt);
4657 }
4658
4659 if (pDevExt->hTscDeltaSpinlock != NIL_RTSPINLOCK)
4660 {
4661 RTSpinlockDestroy(pDevExt->hTscDeltaSpinlock);
4662 pDevExt->hTscDeltaSpinlock = NIL_RTSPINLOCK;
4663 }
4664
4665 if (pDevExt->hTscDeltaEvent != NIL_RTSEMEVENT)
4666 {
4667 RTSemEventDestroy(pDevExt->hTscDeltaEvent);
4668 pDevExt->hTscDeltaEvent = NIL_RTSEMEVENT;
4669 }
4670
4671 ASMAtomicWriteS32(&pDevExt->rcTscDelta, VERR_NOT_AVAILABLE);
4672}
4673
4674#endif /* SUPDRV_USE_TSC_DELTA_THREAD */
4675
4676/**
4677 * Measure the TSC delta for the CPU given by its CPU set index.
4678 *
4679 * @returns VBox status code.
4680 * @retval VERR_INTERRUPTED if interrupted while waiting.
4681 * @retval VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED if we were unable to get a
4682 * measurement.
4683 * @retval VERR_CPU_OFFLINE if the specified CPU is offline.
4684 *
4685 * @param pSession The caller's session. GIP must've been mapped.
4686 * @param iCpuSet The CPU set index of the CPU to measure.
4687 * @param fFlags Flags, SUP_TSCDELTA_MEASURE_F_XXX.
4688 * @param cMsWaitRetry Number of milliseconds to wait between each retry.
4689 * @param cMsWaitThread Number of milliseconds to wait for the thread to get
4690 * ready.
4691 * @param cTries Number of times to try, pass 0 for the default.
4692 */
4693SUPR0DECL(int) SUPR0TscDeltaMeasureBySetIndex(PSUPDRVSESSION pSession, uint32_t iCpuSet, uint32_t fFlags,
4694 RTMSINTERVAL cMsWaitRetry, RTMSINTERVAL cMsWaitThread, uint32_t cTries)
4695{
4696 PSUPDRVDEVEXT pDevExt;
4697 PSUPGLOBALINFOPAGE pGip;
4698 uint16_t iGipCpu;
4699 int rc;
4700#ifdef SUPDRV_USE_TSC_DELTA_THREAD
4701 uint64_t msTsStartWait;
4702 uint32_t iWaitLoop;
4703#endif
4704
4705 /*
4706 * Validate and adjust the input.
4707 */
4708 AssertReturn(SUP_IS_SESSION_VALID(pSession), VERR_INVALID_PARAMETER);
4709 if (!pSession->fGipReferenced)
4710 return VERR_WRONG_ORDER;
4711
4712 pDevExt = pSession->pDevExt;
4713 AssertReturn(SUP_IS_DEVEXT_VALID(pDevExt), VERR_INVALID_PARAMETER);
4714
4715 pGip = pDevExt->pGip;
4716 AssertPtrReturn(pGip, VERR_INTERNAL_ERROR_2);
4717
4718 AssertReturn(iCpuSet < RTCPUSET_MAX_CPUS, VERR_INVALID_CPU_INDEX);
4719 AssertReturn(iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx), VERR_INVALID_CPU_INDEX);
4720 iGipCpu = pGip->aiCpuFromCpuSetIdx[iCpuSet];
4721 AssertReturn(iGipCpu < pGip->cCpus, VERR_INVALID_CPU_INDEX);
4722
4723 if (fFlags & ~SUP_TSCDELTA_MEASURE_F_VALID_MASK)
4724 return VERR_INVALID_FLAGS;
4725
4726 /*
4727 * The request is a noop if the TSC delta isn't being used.
4728 */
4729 if (pGip->enmUseTscDelta <= SUPGIPUSETSCDELTA_ZERO_CLAIMED)
4730 return VINF_SUCCESS;
4731
4732 if (cTries == 0)
4733 cTries = 12;
4734 else if (cTries > 256)
4735 cTries = 256;
4736
4737 if (cMsWaitRetry == 0)
4738 cMsWaitRetry = 2;
4739 else if (cMsWaitRetry > 1000)
4740 cMsWaitRetry = 1000;
4741
4742#ifdef SUPDRV_USE_TSC_DELTA_THREAD
4743 /*
4744 * Has the TSC already been measured and we're not forced to redo it?
4745 */
4746 if ( pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX
4747 && !(fFlags & SUP_TSCDELTA_MEASURE_F_FORCE))
4748 return VINF_SUCCESS;
4749
4750 /*
4751 * Asynchronous request? Forward it to the thread, no waiting.
4752 */
4753 if (fFlags & SUP_TSCDELTA_MEASURE_F_ASYNC)
4754 {
4755 /** @todo Async. doesn't implement options like retries, waiting. We'll need
4756 * to pass those options to the thread somehow and implement it in the
4757 * thread. Check if anyone uses/needs fAsync before implementing this. */
4758 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4759 RTCpuSetAddByIndex(&pDevExt->TscDeltaCpuSet, iCpuSet);
4760 if ( pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Listening
4761 || pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Measuring)
4762 {
4763 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_WaitAndMeasure;
4764 rc = VINF_SUCCESS;
4765 }
4766 else if (pDevExt->enmTscDeltaThreadState != kTscDeltaThreadState_WaitAndMeasure)
4767 rc = VERR_THREAD_IS_DEAD;
4768 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4769 RTThreadUserSignal(pDevExt->hTscDeltaThread);
4770 return VINF_SUCCESS;
4771 }
4772
4773 /*
4774 * If a TSC-delta measurement request is already being serviced by the thread,
4775 * wait 'cTries' times if a retry-timeout is provided, otherwise bail as busy.
4776 */
4777 msTsStartWait = RTTimeSystemMilliTS();
4778 for (iWaitLoop = 0;; iWaitLoop++)
4779 {
4780 uint64_t cMsElapsed;
4781 SUPDRVTSCDELTATHREADSTATE enmState;
4782 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4783 enmState = pDevExt->enmTscDeltaThreadState;
4784 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4785
4786 if (enmState == kTscDeltaThreadState_Measuring)
4787 { /* Must wait, the thread is busy. */ }
4788 else if (enmState == kTscDeltaThreadState_WaitAndMeasure)
4789 { /* Must wait, this state only says what will happen next. */ }
4790 else if (enmState == kTscDeltaThreadState_Terminating)
4791 { /* Must wait, this state only says what should happen next. */ }
4792 else
4793 break; /* All other states, the thread is either idly listening or dead. */
4794
4795 /* Wait or fail. */
4796 if (cMsWaitThread == 0)
4797 return VERR_SUPDRV_TSC_DELTA_MEASUREMENT_BUSY;
4798 cMsElapsed = RTTimeSystemMilliTS() - msTsStartWait;
4799 if (cMsElapsed >= cMsWaitThread)
4800 return VERR_SUPDRV_TSC_DELTA_MEASUREMENT_BUSY;
4801
4802 rc = RTThreadSleep(RT_MIN((RTMSINTERVAL)(cMsWaitThread - cMsElapsed), RT_MIN(iWaitLoop + 1, 10)));
4803 if (rc == VERR_INTERRUPTED)
4804 return rc;
4805 }
4806#endif /* SUPDRV_USE_TSC_DELTA_THREAD */
4807
4808 /*
4809 * Try measure the TSC delta the given number of times.
4810 */
4811 for (;;)
4812 {
4813 /* Unless we're forced to measure the delta, check whether it's done already. */
4814 if ( !(fFlags & SUP_TSCDELTA_MEASURE_F_FORCE)
4815 && pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX)
4816 {
4817 rc = VINF_SUCCESS;
4818 break;
4819 }
4820
4821 /* Measure it. */
4822 rc = supdrvTscMeasureDeltaOne(pDevExt, iGipCpu);
4823 if (rc != VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED)
4824 {
4825 Assert(pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX || RT_FAILURE_NP(rc));
4826 break;
4827 }
4828
4829 /* Retry? */
4830 if (cTries <= 1)
4831 break;
4832 cTries--;
4833
4834 /* Always delay between retries (be nice to the rest of the system
4835 and avoid the BSOD hounds). */
4836 rc = RTThreadSleep(cMsWaitRetry);
4837 if (rc == VERR_INTERRUPTED)
4838 break;
4839 }
4840
4841 return rc;
4842}
4843SUPR0_EXPORT_SYMBOL(SUPR0TscDeltaMeasureBySetIndex);
4844
4845
4846/**
4847 * Service a TSC-delta measurement request.
4848 *
4849 * @returns VBox status code.
4850 * @param pDevExt Pointer to the device instance data.
4851 * @param pSession The support driver session.
4852 * @param pReq Pointer to the TSC-delta measurement request.
4853 */
4854int VBOXCALL supdrvIOCtl_TscDeltaMeasure(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, PSUPTSCDELTAMEASURE pReq)
4855{
4856 uint32_t cTries;
4857 uint32_t iCpuSet;
4858 uint32_t fFlags;
4859 RTMSINTERVAL cMsWaitRetry;
4860 RT_NOREF1(pDevExt);
4861
4862 /*
4863 * Validate and adjust/resolve the input so they can be passed onto SUPR0TscDeltaMeasureBySetIndex.
4864 */
4865 AssertPtr(pDevExt); AssertPtr(pSession); AssertPtr(pReq); /* paranoia^2 */
4866
4867 if (pReq->u.In.idCpu == NIL_RTCPUID)
4868 return VERR_INVALID_CPU_ID;
4869 iCpuSet = RTMpCpuIdToSetIndex(pReq->u.In.idCpu);
4870 if (iCpuSet >= RTCPUSET_MAX_CPUS)
4871 return VERR_INVALID_CPU_ID;
4872
4873 cTries = pReq->u.In.cRetries == 0 ? 0 : (uint32_t)pReq->u.In.cRetries + 1;
4874
4875 cMsWaitRetry = RT_MAX(pReq->u.In.cMsWaitRetry, 5);
4876
4877 fFlags = 0;
4878 if (pReq->u.In.fAsync)
4879 fFlags |= SUP_TSCDELTA_MEASURE_F_ASYNC;
4880 if (pReq->u.In.fForce)
4881 fFlags |= SUP_TSCDELTA_MEASURE_F_FORCE;
4882
4883 return SUPR0TscDeltaMeasureBySetIndex(pSession, iCpuSet, fFlags, cMsWaitRetry,
4884 cTries == 0 ? 5 * RT_MS_1SEC : cMsWaitRetry * cTries /*cMsWaitThread*/,
4885 cTries);
4886}
4887
4888
4889/**
4890 * Reads TSC with delta applied.
4891 *
4892 * Will try to resolve delta value INT64_MAX before applying it. This is the
4893 * main purpose of this function, to handle the case where the delta needs to be
4894 * determined.
4895 *
4896 * @returns VBox status code.
4897 * @param pDevExt Pointer to the device instance data.
4898 * @param pSession The support driver session.
4899 * @param pReq Pointer to the TSC-read request.
4900 */
4901int VBOXCALL supdrvIOCtl_TscRead(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, PSUPTSCREAD pReq)
4902{
4903 PSUPGLOBALINFOPAGE pGip;
4904 int rc;
4905
4906 /*
4907 * Validate. We require the client to have mapped GIP (no asserting on
4908 * ring-3 preconditions).
4909 */
4910 AssertPtr(pDevExt); AssertPtr(pReq); AssertPtr(pSession); /* paranoia^2 */
4911 if (pSession->GipMapObjR3 == NIL_RTR0MEMOBJ)
4912 return VERR_WRONG_ORDER;
4913 pGip = pDevExt->pGip;
4914 AssertReturn(pGip, VERR_INTERNAL_ERROR_2);
4915
4916 /*
4917 * We're usually here because we need to apply delta, but we shouldn't be
4918 * upset if the GIP is some different mode.
4919 */
4920 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
4921 {
4922 uint32_t cTries = 0;
4923 for (;;)
4924 {
4925 /*
4926 * Start by gathering the data, using CLI for disabling preemption
4927 * while we do that.
4928 */
4929 RTCCUINTREG fEFlags = ASMIntDisableFlags();
4930 int iCpuSet = RTMpCpuIdToSetIndex(RTMpCpuId());
4931 int iGipCpu = 0; /* gcc maybe used uninitialized */
4932 if (RT_LIKELY( (unsigned)iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
4933 && (iGipCpu = pGip->aiCpuFromCpuSetIdx[iCpuSet]) < pGip->cCpus ))
4934 {
4935 int64_t i64Delta = pGip->aCPUs[iGipCpu].i64TSCDelta;
4936 pReq->u.Out.idApic = pGip->aCPUs[iGipCpu].idApic;
4937 pReq->u.Out.u64AdjustedTsc = ASMReadTSC();
4938 ASMSetFlags(fEFlags);
4939
4940 /*
4941 * If we're lucky we've got a delta, but no predictions here
4942 * as this I/O control is normally only used when the TSC delta
4943 * is set to INT64_MAX.
4944 */
4945 if (i64Delta != INT64_MAX)
4946 {
4947 pReq->u.Out.u64AdjustedTsc -= i64Delta;
4948 rc = VINF_SUCCESS;
4949 break;
4950 }
4951
4952 /* Give up after a few times. */
4953 if (cTries >= 4)
4954 {
4955 rc = VWRN_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED;
4956 break;
4957 }
4958
4959 /* Need to measure the delta an try again. */
4960 rc = supdrvTscMeasureDeltaOne(pDevExt, iGipCpu);
4961 Assert(pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX || RT_FAILURE_NP(rc));
4962 /** @todo should probably delay on failure... dpc watchdogs */
4963 }
4964 else
4965 {
4966 /* This really shouldn't happen. */
4967 AssertMsgFailed(("idCpu=%#x iCpuSet=%#x (%d)\n", RTMpCpuId(), iCpuSet, iCpuSet));
4968 pReq->u.Out.idApic = supdrvGipGetApicIdSlow();
4969 pReq->u.Out.u64AdjustedTsc = ASMReadTSC();
4970 ASMSetFlags(fEFlags);
4971 rc = VERR_INTERNAL_ERROR_5; /** @todo change to warning. */
4972 break;
4973 }
4974 }
4975 }
4976 else
4977 {
4978 /*
4979 * No delta to apply. Easy. Deal with preemption the lazy way.
4980 */
4981 RTCCUINTREG fEFlags = ASMIntDisableFlags();
4982 int iCpuSet = RTMpCpuIdToSetIndex(RTMpCpuId());
4983 int iGipCpu = 0; /* gcc may be used uninitialized */
4984 if (RT_LIKELY( (unsigned)iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
4985 && (iGipCpu = pGip->aiCpuFromCpuSetIdx[iCpuSet]) < pGip->cCpus ))
4986 pReq->u.Out.idApic = pGip->aCPUs[iGipCpu].idApic;
4987 else
4988 pReq->u.Out.idApic = supdrvGipGetApicIdSlow();
4989 pReq->u.Out.u64AdjustedTsc = ASMReadTSC();
4990 ASMSetFlags(fEFlags);
4991 rc = VINF_SUCCESS;
4992 }
4993
4994 return rc;
4995}
4996
4997
4998/**
4999 * Worker for supdrvIOCtl_GipSetFlags.
5000 *
5001 * @returns VBox status code.
5002 * @retval VERR_WRONG_ORDER if an enable-once-per-session flag is set again for
5003 * a session.
5004 *
5005 * @param pDevExt Pointer to the device instance data.
5006 * @param pSession The support driver session.
5007 * @param fOrMask The OR mask of the GIP flags, see SUPGIP_FLAGS_XXX.
5008 * @param fAndMask The AND mask of the GIP flags, see SUPGIP_FLAGS_XXX.
5009 *
5010 * @remarks Caller must own the GIP mutex.
5011 *
5012 * @remarks This function doesn't validate any of the flags.
5013 */
5014static int supdrvGipSetFlags(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, uint32_t fOrMask, uint32_t fAndMask)
5015{
5016 uint32_t cRefs;
5017 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
5018 AssertMsg((fOrMask & fAndMask) == fOrMask, ("%#x & %#x\n", fOrMask, fAndMask)); /* ASSUMED by code below */
5019
5020 /*
5021 * Compute GIP test-mode flags.
5022 */
5023 if (fOrMask & SUPGIP_FLAGS_TESTING_ENABLE)
5024 {
5025 if (!pSession->fGipTestMode)
5026 {
5027 Assert(pDevExt->cGipTestModeRefs < _64K);
5028 pSession->fGipTestMode = true;
5029 cRefs = ++pDevExt->cGipTestModeRefs;
5030 if (cRefs == 1)
5031 {
5032 fOrMask |= SUPGIP_FLAGS_TESTING | SUPGIP_FLAGS_TESTING_START;
5033 fAndMask &= ~SUPGIP_FLAGS_TESTING_STOP;
5034 }
5035 }
5036 else
5037 {
5038 LogRelMax(10, ("supdrvGipSetFlags: SUPGIP_FLAGS_TESTING_ENABLE already set for this session\n"));
5039 return VERR_WRONG_ORDER;
5040 }
5041 }
5042 else if ( !(fAndMask & SUPGIP_FLAGS_TESTING_ENABLE)
5043 && pSession->fGipTestMode)
5044 {
5045 Assert(pDevExt->cGipTestModeRefs > 0);
5046 Assert(pDevExt->cGipTestModeRefs < _64K);
5047 pSession->fGipTestMode = false;
5048 cRefs = --pDevExt->cGipTestModeRefs;
5049 if (!cRefs)
5050 fOrMask |= SUPGIP_FLAGS_TESTING_STOP;
5051 else
5052 fAndMask |= SUPGIP_FLAGS_TESTING_ENABLE;
5053 }
5054
5055 /*
5056 * Commit the flags. This should be done as atomically as possible
5057 * since the flag consumers won't be holding the GIP mutex.
5058 */
5059 ASMAtomicOrU32(&pGip->fFlags, fOrMask);
5060 ASMAtomicAndU32(&pGip->fFlags, fAndMask);
5061
5062 return VINF_SUCCESS;
5063}
5064
5065
5066/**
5067 * Sets GIP test mode parameters.
5068 *
5069 * @returns VBox status code.
5070 * @param pDevExt Pointer to the device instance data.
5071 * @param pSession The support driver session.
5072 * @param fOrMask The OR mask of the GIP flags, see SUPGIP_FLAGS_XXX.
5073 * @param fAndMask The AND mask of the GIP flags, see SUPGIP_FLAGS_XXX.
5074 */
5075int VBOXCALL supdrvIOCtl_GipSetFlags(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, uint32_t fOrMask, uint32_t fAndMask)
5076{
5077 PSUPGLOBALINFOPAGE pGip;
5078 int rc;
5079
5080 /*
5081 * Validate. We require the client to have mapped GIP (no asserting on
5082 * ring-3 preconditions).
5083 */
5084 AssertPtr(pDevExt); AssertPtr(pSession); /* paranoia^2 */
5085 if (pSession->GipMapObjR3 == NIL_RTR0MEMOBJ)
5086 return VERR_WRONG_ORDER;
5087 pGip = pDevExt->pGip;
5088 AssertReturn(pGip, VERR_INTERNAL_ERROR_3);
5089
5090 if (fOrMask & ~SUPGIP_FLAGS_VALID_MASK)
5091 return VERR_INVALID_PARAMETER;
5092 if ((fAndMask & ~SUPGIP_FLAGS_VALID_MASK) != ~SUPGIP_FLAGS_VALID_MASK)
5093 return VERR_INVALID_PARAMETER;
5094
5095 /*
5096 * Don't confuse supdrvGipSetFlags or anyone else by both setting
5097 * and clearing the same flags. AND takes precedence.
5098 */
5099 fOrMask &= fAndMask;
5100
5101 /*
5102 * Take the loader lock to avoid having to think about races between two
5103 * clients changing the flags at the same time (state is not simple).
5104 */
5105#ifdef SUPDRV_USE_MUTEX_FOR_GIP
5106 RTSemMutexRequest(pDevExt->mtxGip, RT_INDEFINITE_WAIT);
5107#else
5108 RTSemFastMutexRequest(pDevExt->mtxGip);
5109#endif
5110
5111 rc = supdrvGipSetFlags(pDevExt, pSession, fOrMask, fAndMask);
5112
5113#ifdef SUPDRV_USE_MUTEX_FOR_GIP
5114 RTSemMutexRelease(pDevExt->mtxGip);
5115#else
5116 RTSemFastMutexRelease(pDevExt->mtxGip);
5117#endif
5118 return rc;
5119}
5120
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette