VirtualBox

source: vbox/trunk/src/VBox/HostDrivers/Support/SUPDrvGip.cpp@ 54387

Last change on this file since 54387 was 54387, checked in by vboxsync, 10 years ago

SUPDrvGip.cpp: Start the TSC measurement by testing for zero delta. We do alternate TSC reads on the two CPUs and check that the TSC value is ever increasing.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 169.6 KB
Line 
1/* $Id: SUPDrvGip.cpp 54387 2015-02-23 16:31:58Z vboxsync $ */
2/** @file
3 * VBoxDrv - The VirtualBox Support Driver - Common code for GIP.
4 */
5
6/*
7 * Copyright (C) 2006-2015 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27/*******************************************************************************
28* Header Files *
29*******************************************************************************/
30#define LOG_GROUP LOG_GROUP_SUP_DRV
31#define SUPDRV_AGNOSTIC
32#include "SUPDrvInternal.h"
33#ifndef PAGE_SHIFT
34# include <iprt/param.h>
35#endif
36#include <iprt/asm.h>
37#include <iprt/asm-amd64-x86.h>
38#include <iprt/asm-math.h>
39#include <iprt/cpuset.h>
40#include <iprt/handletable.h>
41#include <iprt/mem.h>
42#include <iprt/mp.h>
43#include <iprt/power.h>
44#include <iprt/process.h>
45#include <iprt/semaphore.h>
46#include <iprt/spinlock.h>
47#include <iprt/thread.h>
48#include <iprt/uuid.h>
49#include <iprt/net.h>
50#include <iprt/crc.h>
51#include <iprt/string.h>
52#include <iprt/timer.h>
53#if defined(RT_OS_DARWIN) || defined(RT_OS_SOLARIS) || defined(RT_OS_FREEBSD)
54# include <iprt/rand.h>
55# include <iprt/path.h>
56#endif
57#include <iprt/uint128.h>
58#include <iprt/x86.h>
59
60#include <VBox/param.h>
61#include <VBox/log.h>
62#include <VBox/err.h>
63
64#if defined(RT_OS_SOLARIS) || defined(RT_OS_DARWIN)
65# include "dtrace/SUPDrv.h"
66#else
67/* ... */
68#endif
69
70
/*******************************************************************************
*   Defined Constants And Macros                                               *
*******************************************************************************/
/** The frequency by which we recalculate the u32UpdateHz and
 * u32UpdateIntervalNS GIP members. The value must be a power of 2.
 *
 * Warning: Bumping this too high might overflow u32UpdateIntervalNS.
 */
#define GIP_UPDATEHZ_RECALC_FREQ            0x800

/** A reserved TSC value used for synchronization as well as measurement of
 * TSC deltas. */
#define GIP_TSC_DELTA_RSVD                  UINT64_MAX
/** The number of TSC delta measurement loops in total (includes primer and
 * read-time loops). */
#define GIP_TSC_DELTA_LOOPS                 96
/** The number of cache primer loops. */
#define GIP_TSC_DELTA_PRIMER_LOOPS          4
/** The number of loops until we keep computing the minimum read time. */
#define GIP_TSC_DELTA_READ_TIME_LOOPS       24

/** @name Master / worker synchronization values.
 * @{ */
/** Stop measurement of TSC delta. */
#define GIP_TSC_DELTA_SYNC_STOP             UINT32_C(0)
/** Start measurement of TSC delta. */
#define GIP_TSC_DELTA_SYNC_START            UINT32_C(1)
/** Worker thread is ready for reading the TSC. */
#define GIP_TSC_DELTA_SYNC_WORKER_READY     UINT32_C(2)
/** Worker thread is done updating TSC delta info. */
#define GIP_TSC_DELTA_SYNC_WORKER_DONE      UINT32_C(3)
/** When IPRT isn't concurrent safe: Master is ready and will wait for worker
 * with a timeout. */
#define GIP_TSC_DELTA_SYNC_PRESTART_MASTER  UINT32_C(4)
/** @} */

/** When IPRT isn't concurrent safe: Worker is ready after waiting for
 * master with a timeout. */
#define GIP_TSC_DELTA_SYNC_PRESTART_WORKER  5
/** The TSC-refinement interval in seconds.
 * @note The identifier misspells "PERIOD"; kept as-is here because it is
 *       referenced throughout this file. */
#define GIP_TSC_REFINE_PREIOD_IN_SECS       5
/** The TSC-delta threshold for the SUPGIPUSETSCDELTA_PRACTICALLY_ZERO rating */
#define GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO    32
/** The TSC-delta threshold for the SUPGIPUSETSCDELTA_ROUGHLY_ZERO rating */
#define GIP_TSC_DELTA_THRESHOLD_ROUGHLY_ZERO        448
/** The TSC delta value for the initial GIP master - 0 in regular builds.
 * To test the delta code this can be set to a non-zero value. */
#if 0
# define GIP_TSC_DELTA_INITIAL_MASTER_VALUE INT64_C(170139095182512) /* 0x00009abd9854acb0 */
#else
# define GIP_TSC_DELTA_INITIAL_MASTER_VALUE INT64_C(0)
#endif

/* Sanity: primer loops must leave room for the read-time loops, and both
   together must fit inside the total loop count. */
AssertCompile(GIP_TSC_DELTA_PRIMER_LOOPS < GIP_TSC_DELTA_READ_TIME_LOOPS);
AssertCompile(GIP_TSC_DELTA_PRIMER_LOOPS + GIP_TSC_DELTA_READ_TIME_LOOPS < GIP_TSC_DELTA_LOOPS);

/** @def VBOX_SVN_REV
 * The makefile should define this if it can. */
#ifndef VBOX_SVN_REV
# define VBOX_SVN_REV 0
#endif

#if 0 /* Don't start the GIP timers. Useful when debugging the IPRT timer code. */
# define DO_NOT_START_GIP
#endif
136
137
138/*******************************************************************************
139* Internal Functions *
140*******************************************************************************/
141static DECLCALLBACK(void) supdrvGipSyncAndInvariantTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick);
142static DECLCALLBACK(void) supdrvGipAsyncTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick);
143static void supdrvGipInitCpu(PSUPGLOBALINFOPAGE pGip, PSUPGIPCPU pCpu, uint64_t u64NanoTS, uint64_t uCpuHz);
144#ifdef SUPDRV_USE_TSC_DELTA_THREAD
145static int supdrvTscDeltaThreadInit(PSUPDRVDEVEXT pDevExt);
146static void supdrvTscDeltaTerm(PSUPDRVDEVEXT pDevExt);
147static int supdrvTscDeltaThreadWaitForOnlineCpus(PSUPDRVDEVEXT pDevExt);
148#endif
149
150
151/*******************************************************************************
152* Global Variables *
153*******************************************************************************/
154DECLEXPORT(PSUPGLOBALINFOPAGE) g_pSUPGlobalInfoPage = NULL;
155
156
157
158/*
159 *
160 * Misc Common GIP Code
161 * Misc Common GIP Code
162 * Misc Common GIP Code
163 *
164 *
165 */
166
167
168/**
169 * Finds the GIP CPU index corresponding to @a idCpu.
170 *
171 * @returns GIP CPU array index, UINT32_MAX if not found.
172 * @param pGip The GIP.
173 * @param idCpu The CPU ID.
174 */
175static uint32_t supdrvGipFindCpuIndexForCpuId(PSUPGLOBALINFOPAGE pGip, RTCPUID idCpu)
176{
177 uint32_t i;
178 for (i = 0; i < pGip->cCpus; i++)
179 if (pGip->aCPUs[i].idCpu == idCpu)
180 return i;
181 return UINT32_MAX;
182}
183
184
185/**
186 * Applies the TSC delta to the supplied raw TSC value.
187 *
188 * @returns VBox status code. (Ignored by all users, just FYI.)
189 * @param pGip Pointer to the GIP.
190 * @param puTsc Pointer to a valid TSC value before the TSC delta has been applied.
191 * @param idApic The APIC ID of the CPU @c puTsc corresponds to.
192 * @param fDeltaApplied Where to store whether the TSC delta was succesfully
193 * applied or not (optional, can be NULL).
194 *
195 * @remarks Maybe called with interrupts disabled in ring-0!
196 *
197 * @note Don't you dare change the delta calculation. If you really do, make
198 * sure you update all places where it's used (IPRT, SUPLibAll.cpp,
199 * SUPDrv.c, supdrvGipMpEvent, and more).
200 */
201DECLINLINE(int) supdrvTscDeltaApply(PSUPGLOBALINFOPAGE pGip, uint64_t *puTsc, uint16_t idApic, bool *pfDeltaApplied)
202{
203 int rc;
204
205 /*
206 * Validate input.
207 */
208 AssertPtr(puTsc);
209 AssertPtr(pGip);
210 Assert(pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED);
211
212 /*
213 * Carefully convert the idApic into a GIPCPU entry.
214 */
215 if (RT_LIKELY(idApic < RT_ELEMENTS(pGip->aiCpuFromApicId)))
216 {
217 uint16_t iCpu = pGip->aiCpuFromApicId[idApic];
218 if (RT_LIKELY(iCpu < pGip->cCpus))
219 {
220 PSUPGIPCPU pGipCpu = &pGip->aCPUs[iCpu];
221
222 /*
223 * Apply the delta if valid.
224 */
225 if (RT_LIKELY(pGipCpu->i64TSCDelta != INT64_MAX))
226 {
227 *puTsc -= pGipCpu->i64TSCDelta;
228 if (pfDeltaApplied)
229 *pfDeltaApplied = true;
230 return VINF_SUCCESS;
231 }
232
233 rc = VINF_SUCCESS;
234 }
235 else
236 {
237 AssertMsgFailed(("iCpu=%u cCpus=%u\n", iCpu, pGip->cCpus));
238 rc = VERR_INVALID_CPU_INDEX;
239 }
240 }
241 else
242 {
243 AssertMsgFailed(("idApic=%u\n", idApic));
244 rc = VERR_INVALID_CPU_ID;
245 }
246 if (pfDeltaApplied)
247 *pfDeltaApplied = false;
248 return rc;
249}
250
251
252/*
253 *
254 * GIP Mapping and Unmapping Related Code.
255 * GIP Mapping and Unmapping Related Code.
256 * GIP Mapping and Unmapping Related Code.
257 *
258 *
259 */
260
261
262/**
263 * (Re-)initializes the per-cpu structure prior to starting or resuming the GIP
264 * updating.
265 *
266 * @param pGip Pointer to the GIP.
267 * @param pGipCpu The per CPU structure for this CPU.
268 * @param u64NanoTS The current time.
269 */
270static void supdrvGipReInitCpu(PSUPGLOBALINFOPAGE pGip, PSUPGIPCPU pGipCpu, uint64_t u64NanoTS)
271{
272 /*
273 * Here we don't really care about applying the TSC delta. The re-initialization of this
274 * value is not relevant especially while (re)starting the GIP as the first few ones will
275 * be ignored anyway, see supdrvGipDoUpdateCpu().
276 */
277 pGipCpu->u64TSC = ASMReadTSC() - pGipCpu->u32UpdateIntervalTSC;
278 pGipCpu->u64NanoTS = u64NanoTS;
279}
280
281
282/**
283 * Set the current TSC and NanoTS value for the CPU.
284 *
285 * @param idCpu The CPU ID. Unused - we have to use the APIC ID.
286 * @param pvUser1 Pointer to the ring-0 GIP mapping.
287 * @param pvUser2 Pointer to the variable holding the current time.
288 */
289static DECLCALLBACK(void) supdrvGipReInitCpuCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
290{
291 PSUPGLOBALINFOPAGE pGip = (PSUPGLOBALINFOPAGE)pvUser1;
292 unsigned iCpu = pGip->aiCpuFromApicId[ASMGetApicId()];
293
294 if (RT_LIKELY(iCpu < pGip->cCpus && pGip->aCPUs[iCpu].idCpu == idCpu))
295 supdrvGipReInitCpu(pGip, &pGip->aCPUs[iCpu], *(uint64_t *)pvUser2);
296
297 NOREF(pvUser2);
298 NOREF(idCpu);
299}
300
301
/**
 * State structure for supdrvGipDetectGetGipCpuCallback.
 *
 * A single instance is shared by all CPUs running the callback, which updates
 * the fields with atomic operations (hence the volatile qualifiers).
 */
typedef struct SUPDRVGIPDETECTGETCPU
{
    /** Bitmap of APIC IDs that has been seen (initialized to zero).
     * Used to detect duplicate APIC IDs (paranoia). */
    uint8_t volatile    bmApicId[256 / 8];
    /** Mask of supported GIP CPU getter methods (SUPGIPGETCPU_XXX) (all bits set
     * initially). The callback clears the methods not detected. */
    uint32_t volatile   fSupported;
    /** The first callback detecting any kind of range issues (initialized to
     * NIL_RTCPUID). */
    RTCPUID volatile    idCpuProblem;
} SUPDRVGIPDETECTGETCPU;
/** Pointer to state structure for supdrvGipDetectGetGipCpuCallback. */
typedef SUPDRVGIPDETECTGETCPU *PSUPDRVGIPDETECTGETCPU;
319
320
321/**
322 * Checks for alternative ways of getting the CPU ID.
323 *
324 * This also checks the APIC ID, CPU ID and CPU set index values against the
325 * GIP tables.
326 *
327 * @param idCpu The CPU ID. Unused - we have to use the APIC ID.
328 * @param pvUser1 Pointer to the state structure.
329 * @param pvUser2 Pointer to the GIP.
330 */
331static DECLCALLBACK(void) supdrvGipDetectGetGipCpuCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
332{
333 PSUPDRVGIPDETECTGETCPU pState = (PSUPDRVGIPDETECTGETCPU)pvUser1;
334 PSUPGLOBALINFOPAGE pGip = (PSUPGLOBALINFOPAGE)pvUser2;
335 uint32_t fSupported = 0;
336 uint16_t idApic;
337 int iCpuSet;
338
339 AssertMsg(idCpu == RTMpCpuId(), ("idCpu=%#x RTMpCpuId()=%#x\n", idCpu, RTMpCpuId())); /* paranoia^3 */
340
341 /*
342 * Check that the CPU ID and CPU set index are interchangable.
343 */
344 iCpuSet = RTMpCpuIdToSetIndex(idCpu);
345 if ((RTCPUID)iCpuSet == idCpu)
346 {
347 AssertCompile(RT_IS_POWER_OF_TWO(RTCPUSET_MAX_CPUS));
348 if ( iCpuSet >= 0
349 && iCpuSet < RTCPUSET_MAX_CPUS
350 && RT_IS_POWER_OF_TWO(RTCPUSET_MAX_CPUS))
351 {
352 /*
353 * Check whether the IDTR.LIMIT contains a CPU number.
354 */
355#ifdef RT_ARCH_X86
356 uint16_t const cbIdt = sizeof(X86DESC64SYSTEM) * 256;
357#else
358 uint16_t const cbIdt = sizeof(X86DESCGATE) * 256;
359#endif
360 RTIDTR Idtr;
361 ASMGetIDTR(&Idtr);
362 if (Idtr.cbIdt >= cbIdt)
363 {
364 uint32_t uTmp = Idtr.cbIdt - cbIdt;
365 uTmp &= RTCPUSET_MAX_CPUS - 1;
366 if (uTmp == idCpu)
367 {
368 RTIDTR Idtr2;
369 ASMGetIDTR(&Idtr2);
370 if (Idtr2.cbIdt == Idtr.cbIdt)
371 fSupported |= SUPGIPGETCPU_IDTR_LIMIT_MASK_MAX_SET_CPUS;
372 }
373 }
374
375 /*
376 * Check whether RDTSCP is an option.
377 */
378 if (ASMHasCpuId())
379 {
380 if ( ASMIsValidExtRange(ASMCpuId_EAX(UINT32_C(0x80000000)))
381 && (ASMCpuId_EDX(UINT32_C(0x80000001)) & X86_CPUID_EXT_FEATURE_EDX_RDTSCP) )
382 {
383 uint32_t uAux;
384 ASMReadTscWithAux(&uAux);
385 if ((uAux & (RTCPUSET_MAX_CPUS - 1)) == idCpu)
386 {
387 ASMNopPause();
388 ASMReadTscWithAux(&uAux);
389 if ((uAux & (RTCPUSET_MAX_CPUS - 1)) == idCpu)
390 fSupported |= SUPGIPGETCPU_RDTSCP_MASK_MAX_SET_CPUS;
391 }
392 }
393 }
394 }
395 }
396
397 /*
398 * Check that the APIC ID is unique.
399 */
400 idApic = ASMGetApicId();
401 if (RT_LIKELY( idApic < RT_ELEMENTS(pGip->aiCpuFromApicId)
402 && !ASMAtomicBitTestAndSet(pState->bmApicId, idApic)))
403 fSupported |= SUPGIPGETCPU_APIC_ID;
404 else
405 {
406 AssertCompile(sizeof(pState->bmApicId) * 8 == RT_ELEMENTS(pGip->aiCpuFromApicId));
407 ASMAtomicCmpXchgU32(&pState->idCpuProblem, idCpu, NIL_RTCPUID);
408 LogRel(("supdrvGipDetectGetGipCpuCallback: idCpu=%#x iCpuSet=%d idApic=%#x - duplicate APIC ID.\n",
409 idCpu, iCpuSet, idApic));
410 }
411
412 /*
413 * Check that the iCpuSet is within the expected range.
414 */
415 if (RT_UNLIKELY( iCpuSet < 0
416 || (unsigned)iCpuSet >= RTCPUSET_MAX_CPUS
417 || (unsigned)iCpuSet >= RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)))
418 {
419 ASMAtomicCmpXchgU32(&pState->idCpuProblem, idCpu, NIL_RTCPUID);
420 LogRel(("supdrvGipDetectGetGipCpuCallback: idCpu=%#x iCpuSet=%d idApic=%#x - CPU set index is out of range.\n",
421 idCpu, iCpuSet, idApic));
422 }
423 else
424 {
425 RTCPUID idCpu2 = RTMpCpuIdFromSetIndex(iCpuSet);
426 if (RT_UNLIKELY(idCpu2 != idCpu))
427 {
428 ASMAtomicCmpXchgU32(&pState->idCpuProblem, idCpu, NIL_RTCPUID);
429 LogRel(("supdrvGipDetectGetGipCpuCallback: idCpu=%#x iCpuSet=%d idApic=%#x - CPU id/index roundtrip problem: %#x\n",
430 idCpu, iCpuSet, idApic, idCpu2));
431 }
432 }
433
434 /*
435 * Update the supported feature mask before we return.
436 */
437 ASMAtomicAndU32(&pState->fSupported, fSupported);
438
439 NOREF(pvUser2);
440}
441
442
443/**
444 * Increase the timer freqency on hosts where this is possible (NT).
445 *
446 * The idea is that more interrupts is better for us... Also, it's better than
447 * we increase the timer frequence, because we might end up getting inaccurate
448 * callbacks if someone else does it.
449 *
450 * @param pDevExt Sets u32SystemTimerGranularityGrant if increased.
451 */
452static void supdrvGipRequestHigherTimerFrequencyFromSystem(PSUPDRVDEVEXT pDevExt)
453{
454 if (pDevExt->u32SystemTimerGranularityGrant == 0)
455 {
456 uint32_t u32SystemResolution;
457 if ( RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 976563 /* 1024 HZ */, &u32SystemResolution))
458 || RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 1000000 /* 1000 HZ */, &u32SystemResolution))
459 || RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 1953125 /* 512 HZ */, &u32SystemResolution))
460 || RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 2000000 /* 500 HZ */, &u32SystemResolution))
461 )
462 {
463 Assert(RTTimerGetSystemGranularity() <= u32SystemResolution);
464 pDevExt->u32SystemTimerGranularityGrant = u32SystemResolution;
465 }
466 }
467}
468
469
470/**
471 * Undoes supdrvGipRequestHigherTimerFrequencyFromSystem.
472 *
473 * @param pDevExt Clears u32SystemTimerGranularityGrant.
474 */
475static void supdrvGipReleaseHigherTimerFrequencyFromSystem(PSUPDRVDEVEXT pDevExt)
476{
477 if (pDevExt->u32SystemTimerGranularityGrant)
478 {
479 int rc2 = RTTimerReleaseSystemGranularity(pDevExt->u32SystemTimerGranularityGrant);
480 AssertRC(rc2);
481 pDevExt->u32SystemTimerGranularityGrant = 0;
482 }
483}
484
485
/**
 * Maps the GIP into userspace and/or get the physical address of the GIP.
 *
 * @returns IPRT status code.
 * @param   pSession    Session to which the GIP mapping should belong.
 * @param   ppGipR3     Where to store the address of the ring-3 mapping. (optional)
 * @param   pHCPhysGip  Where to store the physical address. (optional)
 *
 * @remark  There is no reference counting on the mapping, so one call to this
 *          function counts globally as one reference. One call to
 *          SUPR0GipUnmap() will unmap GIP and remove the session as a GIP user.
 */
SUPR0DECL(int) SUPR0GipMap(PSUPDRVSESSION pSession, PRTR3PTR ppGipR3, PRTHCPHYS pHCPhysGip)
{
    int             rc;
    PSUPDRVDEVEXT   pDevExt = pSession->pDevExt;
    RTR3PTR         pGipR3  = NIL_RTR3PTR;
    RTHCPHYS        HCPhys  = NIL_RTHCPHYS;
    LogFlow(("SUPR0GipMap: pSession=%p ppGipR3=%p pHCPhysGip=%p\n", pSession, ppGipR3, pHCPhysGip));

    /*
     * Validate
     */
    AssertReturn(SUP_IS_SESSION_VALID(pSession), VERR_INVALID_PARAMETER);
    AssertPtrNullReturn(ppGipR3, VERR_INVALID_POINTER);
    AssertPtrNullReturn(pHCPhysGip, VERR_INVALID_POINTER);

    /* Everything below is done under the GIP lock. */
#ifdef SUPDRV_USE_MUTEX_FOR_GIP
    RTSemMutexRequest(pDevExt->mtxGip, RT_INDEFINITE_WAIT);
#else
    RTSemFastMutexRequest(pDevExt->mtxGip);
#endif
    if (pDevExt->pGip)
    {
        /*
         * Map it?
         */
        rc = VINF_SUCCESS;
        if (ppGipR3)
        {
            /* Create the ring-3 mapping lazily and reuse it thereafter. */
            if (pSession->GipMapObjR3 == NIL_RTR0MEMOBJ)
                rc = RTR0MemObjMapUser(&pSession->GipMapObjR3, pDevExt->GipMemObj, (RTR3PTR)-1, 0,
                                       RTMEM_PROT_READ, RTR0ProcHandleSelf());
            if (RT_SUCCESS(rc))
                pGipR3 = RTR0MemObjAddressR3(pSession->GipMapObjR3);
        }

        /*
         * Get physical address.
         */
        if (pHCPhysGip && RT_SUCCESS(rc))
            HCPhys = pDevExt->HCPhysGip;

        /*
         * Reference globally.
         */
        if (!pSession->fGipReferenced && RT_SUCCESS(rc))
        {
            pSession->fGipReferenced = 1;
            pDevExt->cGipUsers++;
            if (pDevExt->cGipUsers == 1)
            {
                PSUPGLOBALINFOPAGE pGipR0 = pDevExt->pGip;
                uint64_t u64NanoTS;

                /*
                 * GIP starts/resumes updating again. On windows we bump the
                 * host timer frequency to make sure we don't get stuck in guest
                 * mode and to get better timer (and possibly clock) accuracy.
                 */
                LogFlow(("SUPR0GipMap: Resumes GIP updating\n"));

                supdrvGipRequestHigherTimerFrequencyFromSystem(pDevExt);

                /*
                 * Unless this is the very first use (u32TransactionId is 2
                 * right after init), round each per-CPU transaction id up to
                 * the next GIP_UPDATEHZ_RECALC_FREQ*2 boundary and reset the
                 * last-update-Hz timestamp.
                 * NOTE(review): intent inferred from the arithmetic; the
                 * original comment was a "document me" placeholder --
                 * presumably this restarts the update-rate recalculation
                 * cleanly after a pause.
                 */
                if (pGipR0->aCPUs[0].u32TransactionId != 2 /* not the first time */)
                {
                    unsigned i;
                    for (i = 0; i < pGipR0->cCpus; i++)
                        ASMAtomicUoWriteU32(&pGipR0->aCPUs[i].u32TransactionId,
                                            (pGipR0->aCPUs[i].u32TransactionId + GIP_UPDATEHZ_RECALC_FREQ * 2)
                                            & ~(GIP_UPDATEHZ_RECALC_FREQ * 2 - 1));
                    ASMAtomicWriteU64(&pGipR0->u64NanoTSLastUpdateHz, 0);
                }

                /*
                 * Re-seed the per-CPU TSC/NanoTS anchors, backdated by one
                 * update interval.  For invariant/sync mode (or one online
                 * CPU) only aCPUs[0] is used; otherwise do it on every CPU.
                 * NOTE(review): original comment was a "document me"
                 * placeholder.
                 */
                u64NanoTS = RTTimeSystemNanoTS() - pGipR0->u32UpdateIntervalNS;
                if (   pGipR0->u32Mode == SUPGIPMODE_INVARIANT_TSC
                    || pGipR0->u32Mode == SUPGIPMODE_SYNC_TSC
                    || RTMpGetOnlineCount() == 1)
                    supdrvGipReInitCpu(pGipR0, &pGipR0->aCPUs[0], u64NanoTS);
                else
                    RTMpOnAll(supdrvGipReInitCpuCallback, pGipR0, &u64NanoTS);

                /*
                 * Detect alternative ways to figure the CPU ID in ring-3 and
                 * raw-mode context. Check the sanity of the APIC IDs, CPU IDs,
                 * and CPU set indexes while we're at it.
                 */
                if (RT_SUCCESS(rc))
                {
                    SUPDRVGIPDETECTGETCPU DetectState;
                    RT_BZERO((void *)&DetectState.bmApicId, sizeof(DetectState.bmApicId));
                    DetectState.fSupported = UINT32_MAX;
                    DetectState.idCpuProblem = NIL_RTCPUID;
                    rc = RTMpOnAll(supdrvGipDetectGetGipCpuCallback, &DetectState, pGipR0);
                    if (DetectState.idCpuProblem == NIL_RTCPUID)
                    {
                        if (   DetectState.fSupported != UINT32_MAX
                            && DetectState.fSupported != 0)
                        {
                            /* Publish the intersection of methods supported on all CPUs. */
                            if (pGipR0->fGetGipCpu != DetectState.fSupported)
                            {
                                pGipR0->fGetGipCpu = DetectState.fSupported;
                                LogRel(("SUPR0GipMap: fGetGipCpu=%#x\n", DetectState.fSupported));
                            }
                        }
                        else
                        {
                            LogRel(("SUPR0GipMap: No supported ways of getting the APIC ID or CPU number in ring-3! (%#x)\n",
                                    DetectState.fSupported));
                            rc = VERR_UNSUPPORTED_CPU;
                        }
                    }
                    else
                    {
                        LogRel(("SUPR0GipMap: APIC ID, CPU ID or CPU set index problem detected on CPU #%u (%#x)!\n",
                                DetectState.idCpuProblem, DetectState.idCpuProblem));
                        rc = VERR_INVALID_CPU_ID;
                    }
                }

                /*
                 * Start the GIP timer if all is well..
                 */
                if (RT_SUCCESS(rc))
                {
#ifndef DO_NOT_START_GIP
                    rc = RTTimerStart(pDevExt->pGipTimer, 0 /* fire ASAP */); AssertRC(rc);
#endif
                    rc = VINF_SUCCESS;
                }

                /*
                 * Bail out on error.  Roll back the reference and the ring-3
                 * mapping so the session is left in its pre-call state.
                 */
                if (RT_FAILURE(rc))
                {
                    LogRel(("SUPR0GipMap: failed rc=%Rrc\n", rc));
                    pDevExt->cGipUsers = 0;
                    pSession->fGipReferenced = 0;
                    if (pSession->GipMapObjR3 != NIL_RTR0MEMOBJ)
                    {
                        int rc2 = RTR0MemObjFree(pSession->GipMapObjR3, false); AssertRC(rc2);
                        if (RT_SUCCESS(rc2))
                            pSession->GipMapObjR3 = NIL_RTR0MEMOBJ;
                    }
                    HCPhys = NIL_RTHCPHYS;
                    pGipR3 = NIL_RTR3PTR;
                }
            }
        }
    }
    else
    {
        rc = VERR_GENERAL_FAILURE;
        Log(("SUPR0GipMap: GIP is not available!\n"));
    }
#ifdef SUPDRV_USE_MUTEX_FOR_GIP
    RTSemMutexRelease(pDevExt->mtxGip);
#else
    RTSemFastMutexRelease(pDevExt->mtxGip);
#endif

    /*
     * Write returns.
     */
    if (pHCPhysGip)
        *pHCPhysGip = HCPhys;
    if (ppGipR3)
        *ppGipR3 = pGipR3;

#ifdef DEBUG_DARWIN_GIP
    OSDBGPRINT(("SUPR0GipMap: returns %d *pHCPhysGip=%lx pGipR3=%p\n", rc, (unsigned long)HCPhys, (void *)pGipR3));
#else
    LogFlow(( "SUPR0GipMap: returns %d *pHCPhysGip=%lx pGipR3=%p\n", rc, (unsigned long)HCPhys, (void *)pGipR3));
#endif
    return rc;
}
679
680
/**
 * Unmaps any user mapping of the GIP and terminates all GIP access
 * from this session.
 *
 * @returns IPRT status code.
 * @param   pSession    Session to which the GIP mapping should belong.
 */
SUPR0DECL(int) SUPR0GipUnmap(PSUPDRVSESSION pSession)
{
    int             rc = VINF_SUCCESS;
    PSUPDRVDEVEXT   pDevExt = pSession->pDevExt;
#ifdef DEBUG_DARWIN_GIP
    OSDBGPRINT(("SUPR0GipUnmap: pSession=%p pGip=%p GipMapObjR3=%p\n",
                pSession,
                pSession->GipMapObjR3 != NIL_RTR0MEMOBJ ? RTR0MemObjAddress(pSession->GipMapObjR3) : NULL,
                pSession->GipMapObjR3));
#else
    LogFlow(("SUPR0GipUnmap: pSession=%p\n", pSession));
#endif
    AssertReturn(SUP_IS_SESSION_VALID(pSession), VERR_INVALID_PARAMETER);

    /* Serialize against SUPR0GipMap and other unmappers. */
#ifdef SUPDRV_USE_MUTEX_FOR_GIP
    RTSemMutexRequest(pDevExt->mtxGip, RT_INDEFINITE_WAIT);
#else
    RTSemFastMutexRequest(pDevExt->mtxGip);
#endif

    /*
     * Unmap anything?
     */
    if (pSession->GipMapObjR3 != NIL_RTR0MEMOBJ)
    {
        rc = RTR0MemObjFree(pSession->GipMapObjR3, false);
        AssertRC(rc);
        if (RT_SUCCESS(rc))
            pSession->GipMapObjR3 = NIL_RTR0MEMOBJ;
    }

    /*
     * Dereference global GIP.  Only proceed when the unmap above succeeded
     * (rc is still VINF_SUCCESS).
     */
    if (pSession->fGipReferenced && !rc)
    {
        pSession->fGipReferenced = 0;
        if (   pDevExt->cGipUsers > 0
            && !--pDevExt->cGipUsers)
        {
            /* Last user gone: stop the update timer and return the elevated
               system timer resolution requested in SUPR0GipMap. */
            LogFlow(("SUPR0GipUnmap: Suspends GIP updating\n"));
#ifndef DO_NOT_START_GIP
            rc = RTTimerStop(pDevExt->pGipTimer); AssertRC(rc); rc = VINF_SUCCESS;
#endif
            supdrvGipReleaseHigherTimerFrequencyFromSystem(pDevExt);
        }
    }

#ifdef SUPDRV_USE_MUTEX_FOR_GIP
    RTSemMutexRelease(pDevExt->mtxGip);
#else
    RTSemFastMutexRelease(pDevExt->mtxGip);
#endif

    return rc;
}
744
745
/**
 * Gets the GIP pointer.
 *
 * @returns Pointer to the GIP, or NULL (the g_pSUPGlobalInfoPage default)
 *          when it has not been set yet.
 */
SUPDECL(PSUPGLOBALINFOPAGE) SUPGetGIP(void)
{
    return g_pSUPGlobalInfoPage;
}
755
756
757
758
759
760/*
761 *
762 *
763 * GIP Initialization, Termination and CPU Offline / Online Related Code.
764 * GIP Initialization, Termination and CPU Offline / Online Related Code.
765 * GIP Initialization, Termination and CPU Offline / Online Related Code.
766 *
767 *
768 */
769
770/**
771 * Used by supdrvInitRefineInvariantTscFreqTimer and supdrvGipInitMeasureTscFreq
772 * to update the TSC frequency related GIP variables.
773 *
774 * @param pGip The GIP.
775 * @param nsElapsed The number of nano seconds elapsed.
776 * @param cElapsedTscTicks The corresponding number of TSC ticks.
777 */
778static void supdrvGipInitSetCpuFreq(PSUPGLOBALINFOPAGE pGip, uint64_t nsElapsed, uint64_t cElapsedTscTicks)
779{
780 /*
781 * Calculate the frequency.
782 */
783 uint64_t uCpuHz;
784 if ( cElapsedTscTicks < UINT64_MAX / RT_NS_1SEC
785 && nsElapsed < UINT32_MAX)
786 uCpuHz = ASMMultU64ByU32DivByU32(cElapsedTscTicks, RT_NS_1SEC, (uint32_t)nsElapsed);
787 else
788 {
789 RTUINT128U CpuHz, Tmp, Divisor;
790 CpuHz.s.Lo = CpuHz.s.Hi = 0;
791 RTUInt128MulU64ByU64(&Tmp, cElapsedTscTicks, RT_NS_1SEC_64);
792 RTUInt128Div(&CpuHz, &Tmp, RTUInt128AssignU64(&Divisor, nsElapsed));
793 uCpuHz = CpuHz.s.Lo;
794 }
795
796 /*
797 * Update the GIP.
798 */
799 ASMAtomicWriteU64(&pGip->u64CpuHz, uCpuHz);
800 if (pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
801 ASMAtomicWriteU64(&pGip->aCPUs[0].u64CpuHz, uCpuHz);
802}
803
804
/**
 * Timer callback function for TSC frequency refinement in invariant GIP mode.
 *
 * This is started during driver init and fires once
 * GIP_TSC_REFINE_PREIOD_IN_SECS seconds later.
 *
 * @param   pTimer      The timer.
 * @param   pvUser      Opaque pointer to the device instance data.
 * @param   iTick       The timer tick.
 */
static DECLCALLBACK(void) supdrvInitRefineInvariantTscFreqTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
{
    PSUPDRVDEVEXT       pDevExt = (PSUPDRVDEVEXT)pvUser;
    PSUPGLOBALINFOPAGE  pGip = pDevExt->pGip;
    RTCPUID             idCpu;
    uint64_t            cNsElapsed;
    uint64_t            cTscTicksElapsed;
    uint64_t            nsNow;
    uint64_t            uTsc;
    RTCCUINTREG         uFlags;

    /* Paranoia. */
    AssertReturnVoid(pGip);
    AssertReturnVoid(pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC);

    /*
     * Try get close to the next clock tick as usual.
     *
     * PORTME: If timers are called from the clock interrupt handler, or
     *         an interrupt handler with higher priority than the clock
     *         interrupt, or spinning for ages in timer handlers is frowned
     *         upon, this loop must be disabled!
     *
     * Darwin, FreeBSD, Linux, Solaris, Windows 8.1+:
     *      High RTTimeSystemNanoTS resolution should prevent any noticeable
     *      spinning here.
     *
     * Windows 8.0 and earlier:
     *      We're running in a DPC here, so we may trigger the DPC watchdog?
     *
     * OS/2:
     *      Timer callbacks are done in the clock interrupt, so skip it.
     */
#if !defined(RT_OS_OS2)
    nsNow = RTTimeSystemNanoTS();
    while (RTTimeSystemNanoTS() == nsNow)
        ASMNopPause();
#endif

    /* Sample the TSC, clock and CPU ID back-to-back with interrupts disabled
       so the three values are taken as close together as possible. */
    uFlags = ASMIntDisableFlags();
    uTsc   = ASMReadTSC();
    nsNow  = RTTimeSystemNanoTS();
    idCpu  = RTMpCpuId();
    ASMSetFlags(uFlags);

    cNsElapsed          = nsNow - pDevExt->nsStartInvarTscRefine;
    cTscTicksElapsed    = uTsc  - pDevExt->uTscStartInvarTscRefine;

    /*
     * If the above measurement was taken on a different CPU than the one we
     * started the process on, cTscTicksElapsed will need to be adjusted with
     * the TSC deltas of both the CPUs.
     *
     * We ASSUME that the delta calculation process takes less time than the
     * TSC frequency refinement timer.  If it doesn't, we'll complain and
     * drop the frequency refinement.
     *
     * Note! We cannot entirely trust enmUseTscDelta here because it's
     *       downgraded after each delta calculation.
     */
    if (   idCpu != pDevExt->idCpuInvarTscRefine
        && pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
    {
        uint32_t iStartCpuSet   = RTMpCpuIdToSetIndex(pDevExt->idCpuInvarTscRefine);
        uint32_t iStopCpuSet    = RTMpCpuIdToSetIndex(idCpu);
        uint16_t iStartGipCpu   = iStartCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
                                ? pGip->aiCpuFromCpuSetIdx[iStartCpuSet] : UINT16_MAX;
        uint16_t iStopGipCpu    = iStopCpuSet  < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
                                ? pGip->aiCpuFromCpuSetIdx[iStopCpuSet]  : UINT16_MAX;
        int64_t  iStartTscDelta = iStartGipCpu < pGip->cCpus ? pGip->aCPUs[iStartGipCpu].i64TSCDelta : INT64_MAX;
        int64_t  iStopTscDelta  = iStopGipCpu  < pGip->cCpus ? pGip->aCPUs[iStopGipCpu].i64TSCDelta  : INT64_MAX;
        if (RT_LIKELY(iStartTscDelta != INT64_MAX && iStopTscDelta != INT64_MAX))
        {
            if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_PRACTICALLY_ZERO)
            {
                /* cTscTicksElapsed = (uTsc - iStopTscDelta) - (pDevExt->uTscStartInvarTscRefine - iStartTscDelta); */
                cTscTicksElapsed += iStartTscDelta - iStopTscDelta;
            }
        }
        /*
         * Allow 5 times the refinement period to elapse before we give up on the TSC delta
         * calculations.
         */
        else if (cNsElapsed <= GIP_TSC_REFINE_PREIOD_IN_SECS * 5 * RT_NS_1SEC_64)
        {
            /* Deltas not measured yet: retry in one second. */
            int rc = RTTimerStart(pTimer, RT_NS_1SEC);
            AssertRC(rc);
            return;
        }
        else
        {
            SUPR0Printf("vboxdrv: Failed to refine invariant TSC frequency because deltas are unavailable after %u (%u) seconds\n",
                        (uint32_t)(cNsElapsed / RT_NS_1SEC), GIP_TSC_REFINE_PREIOD_IN_SECS);
            SUPR0Printf("vboxdrv: start: %u, %u, %#llx stop: %u, %u, %#llx\n",
                        iStartCpuSet, iStartGipCpu, iStartTscDelta, iStopCpuSet, iStopGipCpu, iStopTscDelta);
            return;
        }
    }

    /*
     * Calculate and update the CPU frequency variables in GIP.
     *
     * If there is a GIP user already and we've already refined the frequency
     * a couple of times, don't update it as we want a stable frequency value
     * for all VMs.
     */
    if (   pDevExt->cGipUsers == 0
        || cNsElapsed < RT_NS_1SEC * 2)
    {
        supdrvGipInitSetCpuFreq(pGip, cNsElapsed, cTscTicksElapsed);

        /*
         * Reschedule the timer if we haven't yet reached the defined refinement period.
         */
        if (cNsElapsed < GIP_TSC_REFINE_PREIOD_IN_SECS * RT_NS_1SEC_64)
        {
            int rc = RTTimerStart(pTimer, RT_NS_1SEC);
            AssertRC(rc);
        }
    }
}
936
937
938/**
939 * Start the TSC-frequency refinment timer for the invariant TSC GIP mode.
940 *
941 * We cannot use this in the synchronous and asynchronous tsc GIP modes because
942 * the CPU may change the TSC frequence between now and when the timer fires
943 * (supdrvInitAsyncRefineTscTimer).
944 *
945 * @param pDevExt Pointer to the device instance data.
946 * @param pGip Pointer to the GIP.
947 */
948static void supdrvGipInitStartTimerForRefiningInvariantTscFreq(PSUPDRVDEVEXT pDevExt, PSUPGLOBALINFOPAGE pGip)
949{
950 uint64_t u64NanoTS;
951 RTCCUINTREG uFlags;
952 int rc;
953
954 /*
955 * Record the TSC and NanoTS as the starting anchor point for refinement
956 * of the TSC. We try get as close to a clock tick as possible on systems
957 * which does not provide high resolution time.
958 */
959 u64NanoTS = RTTimeSystemNanoTS();
960 while (RTTimeSystemNanoTS() == u64NanoTS)
961 ASMNopPause();
962
963 uFlags = ASMIntDisableFlags();
964 pDevExt->uTscStartInvarTscRefine = ASMReadTSC();
965 pDevExt->nsStartInvarTscRefine = RTTimeSystemNanoTS();
966 pDevExt->idCpuInvarTscRefine = RTMpCpuId();
967 ASMSetFlags(uFlags);
968
969/** @todo we need a power management callback that disables the timer if the
970 * system suspends/resumes. */
971
972 /*
973 * Create a timer that runs on the same CPU so we won't have a depencency
974 * on the TSC-delta and can run in parallel to it. On systems that does not
975 * implement CPU specific timers we'll apply deltas in the timer callback,
976 * just like we do for CPUs going offline.
977 *
978 * The longer the refinement interval the better the accuracy, at least in
979 * theory. If it's too long though, ring-3 may already be starting its
980 * first VMs before we're done. On most systems we will be loading the
981 * support driver during boot and VMs won't be started for a while yet,
982 * it is really only a problem during development (especiall with
983 * on-demand driver starting on windows).
984 *
985 * To avoid wasting time doing a long supdrvGipInitMeasureTscFreq call
986 * to calculate the frequencey during driver loading, the timer is set
987 * to fire after 200 ms the first time. It will then reschedule itself
988 * to fire every second until GIP_TSC_REFINE_PREIOD_IN_SECS has been
989 * reached or it notices that there is a user land client with GIP
990 * mapped (we want a stable frequency for all VMs).
991 */
992 rc = RTTimerCreateEx(&pDevExt->pInvarTscRefineTimer, 0 /* one-shot */,
993 RTTIMER_FLAGS_CPU(RTMpCpuIdToSetIndex(pDevExt->idCpuInvarTscRefine)),
994 supdrvInitRefineInvariantTscFreqTimer, pDevExt);
995 if (RT_SUCCESS(rc))
996 {
997 rc = RTTimerStart(pDevExt->pInvarTscRefineTimer, 2*RT_NS_100MS);
998 if (RT_SUCCESS(rc))
999 return;
1000 RTTimerDestroy(pDevExt->pInvarTscRefineTimer);
1001 }
1002
1003 if (rc == VERR_CPU_OFFLINE || rc == VERR_NOT_SUPPORTED)
1004 {
1005 rc = RTTimerCreateEx(&pDevExt->pInvarTscRefineTimer, 0 /* one-shot */, RTTIMER_FLAGS_CPU_ANY,
1006 supdrvInitRefineInvariantTscFreqTimer, pDevExt);
1007 if (RT_SUCCESS(rc))
1008 {
1009 rc = RTTimerStart(pDevExt->pInvarTscRefineTimer, 2*RT_NS_100MS);
1010 if (RT_SUCCESS(rc))
1011 return;
1012 RTTimerDestroy(pDevExt->pInvarTscRefineTimer);
1013 }
1014 }
1015
1016 pDevExt->pInvarTscRefineTimer = NULL;
1017 OSDBGPRINT(("vboxdrv: Failed to create or start TSC frequency refinement timer: rc=%Rrc\n", rc));
1018}
1019
1020
1021/**
1022 * @callback_method_impl{PFNRTMPWORKER,
1023 * RTMpOnSpecific callback for reading TSC and time on the CPU we started
1024 * the measurements on.}
1025 */
1026DECLCALLBACK(void) supdrvGipInitReadTscAndNanoTsOnCpu(RTCPUID idCpu, void *pvUser1, void *pvUser2)
1027{
1028 RTCCUINTREG uFlags = ASMIntDisableFlags();
1029 uint64_t *puTscStop = (uint64_t *)pvUser1;
1030 uint64_t *pnsStop = (uint64_t *)pvUser2;
1031
1032 *puTscStop = ASMReadTSC();
1033 *pnsStop = RTTimeSystemNanoTS();
1034
1035 ASMSetFlags(uFlags);
1036}
1037
1038
/**
 * Measures the TSC frequency of the system.
 *
 * The TSC frequency can vary on systems which are not reported as invariant.
 * On such systems the object of this function is to find out what the nominal,
 * maximum TSC frequency under 'normal' CPU operation is.
 *
 * @returns VBox status code.
 * @retval  VERR_SUPDRV_TSC_FREQ_MEASUREMENT_FAILED when all retries are
 *          exhausted (only possible when @a fRough is false; the rough mode
 *          always produces a result, see the Assert at the bottom).
 * @param   pDevExt        Pointer to the device instance.
 * @param   pGip           Pointer to the GIP.
 * @param   fRough         Set if we're doing the rough calculation that the
 *                         TSC measuring code needs, where accuracy isn't all
 *                         that important (too high is better than too low).
 *                         When clear we try for best accuracy that we can
 *                         achieve in reasonably short time.
 */
static int supdrvGipInitMeasureTscFreq(PSUPDRVDEVEXT pDevExt, PSUPGLOBALINFOPAGE pGip, bool fRough)
{
    uint32_t nsTimerIncr = RTTimerGetSystemGranularity();
    int cTriesLeft = fRough ? 4 : 2;
    while (cTriesLeft-- > 0)
    {
        RTCCUINTREG uFlags;
        uint64_t nsStart;
        uint64_t nsStop;
        uint64_t uTscStart;
        uint64_t uTscStop;
        RTCPUID idCpuStart;
        RTCPUID idCpuStop;

        /*
         * Synchronize with the host OS clock tick on systems without high
         * resolution time API (older Windows version for example).
         */
        nsStart = RTTimeSystemNanoTS();
        while (RTTimeSystemNanoTS() == nsStart)
            ASMNopPause();

        /*
         * Read the TSC and current time, noting which CPU we're on.
         * Interrupts are disabled so the three reads stay close together.
         */
        uFlags = ASMIntDisableFlags();
        uTscStart = ASMReadTSC();
        nsStart = RTTimeSystemNanoTS();
        idCpuStart = RTMpCpuId();
        ASMSetFlags(uFlags);

        /*
         * Delay for a while.
         */
        if (pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC)
        {
            /*
             * Sleep-wait since the TSC frequency is constant, it eases host load.
             * Shorter interval produces more variance in the frequency (esp. Windows).
             * The delay is rounded up to a whole number of timer ticks, minus a
             * small margin, then converted to milliseconds.
             */
            uint64_t msElapsed = 0;
            uint64_t msDelay = ( ((fRough ? 16 : 200) * RT_NS_1MS + nsTimerIncr - 1) / nsTimerIncr * nsTimerIncr - RT_NS_100US )
                             / RT_NS_1MS;
            do
            {
                RTThreadSleep((RTMSINTERVAL)(msDelay - msElapsed));
                nsStop = RTTimeSystemNanoTS();
                msElapsed = (nsStop - nsStart) / RT_NS_1MS;
            } while (msElapsed < msDelay);

            /* Re-align with the clock tick before the stop sample. */
            while (RTTimeSystemNanoTS() == nsStop)
                ASMNopPause();
        }
        else
        {
            /*
             * Busy-wait keeping the frequency up.
             */
            do
            {
                ASMNopPause();
                nsStop = RTTimeSystemNanoTS();
            } while (nsStop - nsStart < RT_NS_100MS);
        }

        /*
         * Read the TSC and time again.
         */
        uFlags = ASMIntDisableFlags();
        uTscStop = ASMReadTSC();
        nsStop = RTTimeSystemNanoTS();
        idCpuStop = RTMpCpuId();
        ASMSetFlags(uFlags);

        /*
         * If the CPU changes, things get a bit complicated and what we
         * can get away with depends on the GIP mode / TSC reliability.
         */
        if (idCpuStop != idCpuStart)
        {
            bool fDoXCall = false;

            /*
             * Synchronous TSC mode: we're probably fine as it's unlikely
             * that we were rescheduled because of TSC throttling or power
             * management reasons, so just go ahead.
             */
            if (pGip->u32Mode == SUPGIPMODE_SYNC_TSC)
            {
                /* Probably ok, maybe we should retry once?. */
                Assert(pGip->enmUseTscDelta == SUPGIPUSETSCDELTA_NOT_APPLICABLE);
            }
            /*
             * If we're just doing the rough measurement, do the cross call and
             * get on with things (we don't have deltas!).
             */
            else if (fRough)
                fDoXCall = true;
            /*
             * Invariant TSC mode: It doesn't matter if we have delta available
             * for both CPUs. That is not something we can assume at this point.
             *
             * Note! We cannot necessarily trust enmUseTscDelta here because it's
             *       downgraded after each delta calculation and the delta
             *       calculations may not be complete yet.
             */
            else if (pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC)
            {
/** @todo This section of code is never reached atm, consider dropping it later on... */
                if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
                {
                    uint32_t iStartCpuSet = RTMpCpuIdToSetIndex(idCpuStart);
                    uint32_t iStopCpuSet = RTMpCpuIdToSetIndex(idCpuStop);
                    uint16_t iStartGipCpu = iStartCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
                                          ? pGip->aiCpuFromCpuSetIdx[iStartCpuSet] : UINT16_MAX;
                    uint16_t iStopGipCpu = iStopCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
                                         ? pGip->aiCpuFromCpuSetIdx[iStopCpuSet] : UINT16_MAX;
                    int64_t iStartTscDelta = iStartGipCpu < pGip->cCpus ? pGip->aCPUs[iStartGipCpu].i64TSCDelta : INT64_MAX;
                    int64_t iStopTscDelta = iStopGipCpu < pGip->cCpus ? pGip->aCPUs[iStopGipCpu].i64TSCDelta : INT64_MAX;
                    if (RT_LIKELY(iStartTscDelta != INT64_MAX && iStopTscDelta != INT64_MAX))
                    {
                        /* Both deltas known: normalize the two TSC reads onto the master time base. */
                        if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_PRACTICALLY_ZERO)
                        {
                            uTscStart -= iStartTscDelta;
                            uTscStop -= iStopTscDelta;
                        }
                    }
                    /*
                     * Invalid CPU indexes are not caused by online/offline races, so
                     * we have to trigger driver load failure if that happens as GIP
                     * and IPRT assumptions are busted on this system.
                     */
                    else if (iStopGipCpu >= pGip->cCpus || iStartGipCpu >= pGip->cCpus)
                    {
                        SUPR0Printf("vboxdrv: Unexpected CPU index in supdrvGipInitMeasureTscFreq.\n");
                        SUPR0Printf("vboxdrv: start: %u, %u, %#llx stop: %u, %u, %#llx\n",
                                    iStartCpuSet, iStartGipCpu, iStartTscDelta, iStopCpuSet, iStopGipCpu, iStopTscDelta);
                        return VERR_INVALID_CPU_INDEX;
                    }
                    /*
                     * No valid deltas. We retry, if we're on our last retry
                     * we do the cross call instead just to get a result. The
                     * frequency will be refined in a few seconds anyway.
                     */
                    else if (cTriesLeft > 0)
                        continue;
                    else
                        fDoXCall = true;
                }
            }
            /*
             * Asynchronous TSC mode: This is bad as the reason we usually
             * use this mode is to deal with variable TSC frequencies and
             * deltas. So, we need to get the TSC from the same CPU as
             * started it, we also need to keep that CPU busy. So, retry
             * and fall back to the cross call on the last attempt.
             */
            else
            {
                Assert(pGip->u32Mode == SUPGIPMODE_ASYNC_TSC);
                if (cTriesLeft > 0)
                    continue;
                fDoXCall = true;
            }

            if (fDoXCall)
            {
                /*
                 * Try read the TSC and timestamp on the start CPU.
                 * Note: in rough mode on the very last try, a cross-call
                 * failure is tolerated and the stale stop values are used.
                 */
                int rc = RTMpOnSpecific(idCpuStart, supdrvGipInitReadTscAndNanoTsOnCpu, &uTscStop, &nsStop);
                if (RT_FAILURE(rc) && (!fRough || cTriesLeft > 0))
                    continue;
            }
        }

        /*
         * Calculate the TSC frequency and update it (shared with the refinement timer).
         */
        supdrvGipInitSetCpuFreq(pGip, nsStop - nsStart, uTscStop - uTscStart);
        return VINF_SUCCESS;
    }

    Assert(!fRough);
    return VERR_SUPDRV_TSC_FREQ_MEASUREMENT_FAILED;
}
1241
1242
1243/**
1244 * Finds our (@a idCpu) entry, or allocates a new one if not found.
1245 *
1246 * @returns Index of the CPU in the cache set.
1247 * @param pGip The GIP.
1248 * @param idCpu The CPU ID.
1249 */
1250static uint32_t supdrvGipFindOrAllocCpuIndexForCpuId(PSUPGLOBALINFOPAGE pGip, RTCPUID idCpu)
1251{
1252 uint32_t i, cTries;
1253
1254 /*
1255 * ASSUMES that CPU IDs are constant.
1256 */
1257 for (i = 0; i < pGip->cCpus; i++)
1258 if (pGip->aCPUs[i].idCpu == idCpu)
1259 return i;
1260
1261 cTries = 0;
1262 do
1263 {
1264 for (i = 0; i < pGip->cCpus; i++)
1265 {
1266 bool fRc;
1267 ASMAtomicCmpXchgSize(&pGip->aCPUs[i].idCpu, idCpu, NIL_RTCPUID, fRc);
1268 if (fRc)
1269 return i;
1270 }
1271 } while (cTries++ < 32);
1272 AssertReleaseFailed();
1273 return i - 1;
1274}
1275
1276
/**
 * The calling CPU should be accounted as online, update GIP accordingly.
 *
 * This is used by supdrvGipCreate() as well as supdrvGipMpEvent().
 *
 * Must be called on the CPU being onlined (see the AssertRelease below);
 * runs under the GIP spinlock with interrupts disabled since it can fire
 * on all CPUs simultaneously.
 *
 * @param   pDevExt             The device extension.
 * @param   idCpu               The CPU ID.
 */
static void supdrvGipMpEventOnlineOrInitOnCpu(PSUPDRVDEVEXT pDevExt, RTCPUID idCpu)
{
    int iCpuSet = 0;
    uint16_t idApic = UINT16_MAX;
    uint32_t i = 0;
    uint64_t u64NanoTS = 0;
    PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;

    AssertPtrReturnVoid(pGip);
    AssertRelease(idCpu == RTMpCpuId());
    Assert(pGip->cPossibleCpus == RTMpGetCount());

    /*
     * Do this behind a spinlock with interrupts disabled as this can fire
     * on all CPUs simultaneously, see @bugref{6110}.
     */
    RTSpinlockAcquire(pDevExt->hGipSpinlock);

    /*
     * Update the globals.
     */
    ASMAtomicWriteU16(&pGip->cPresentCpus, RTMpGetPresentCount());
    ASMAtomicWriteU16(&pGip->cOnlineCpus, RTMpGetOnlineCount());
    iCpuSet = RTMpCpuIdToSetIndex(idCpu);
    if (iCpuSet >= 0)
    {
        Assert(RTCpuSetIsMemberByIndex(&pGip->PossibleCpuSet, iCpuSet));
        RTCpuSetAddByIndex(&pGip->OnlineCpuSet, iCpuSet);
        RTCpuSetAddByIndex(&pGip->PresentCpuSet, iCpuSet);
    }

    /*
     * Update the entry.
     * NOTE(review): the NanoTS is back-dated by one update interval -
     * presumably so the entry starts out as if an interval has already
     * elapsed; confirm against the GIP update code.
     */
    u64NanoTS = RTTimeSystemNanoTS() - pGip->u32UpdateIntervalNS;
    i = supdrvGipFindOrAllocCpuIndexForCpuId(pGip, idCpu);

    supdrvGipInitCpu(pGip, &pGip->aCPUs[i], u64NanoTS, pGip->u64CpuHz);

    idApic = ASMGetApicId();
    ASMAtomicWriteU16(&pGip->aCPUs[i].idApic, idApic);
    ASMAtomicWriteS16(&pGip->aCPUs[i].iCpuSet, (int16_t)iCpuSet);
    ASMAtomicWriteSize(&pGip->aCPUs[i].idCpu, idCpu);

    /*
     * Update the APIC ID and CPU set index mappings.
     */
    ASMAtomicWriteU16(&pGip->aiCpuFromApicId[idApic], i);
    ASMAtomicWriteU16(&pGip->aiCpuFromCpuSetIdx[iCpuSet], i);

    /* Update the Mp online/offline counter. */
    ASMAtomicIncU32(&pDevExt->cMpOnOffEvents);

    /* Add this CPU to the set of CPUs for which we need to calculate their TSC-deltas. */
    if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
    {
        RTCpuSetAddByIndex(&pDevExt->TscDeltaCpuSet, iCpuSet);
#ifdef SUPDRV_USE_TSC_DELTA_THREAD
        /* Kick the TSC-delta thread out of its listening/measuring state so it
           picks up the new CPU. */
        RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
        if (   pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Listening
            || pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Measuring)
        {
            pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_WaitAndMeasure;
        }
        RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
#endif
    }

    /* commit it - the state write is last so readers only see fully initialized entries */
    ASMAtomicWriteSize(&pGip->aCPUs[i].enmState, SUPGIPCPUSTATE_ONLINE);

    RTSpinlockRelease(pDevExt->hGipSpinlock);
}
1358
1359
1360/**
1361 * The CPU should be accounted as offline, update the GIP accordingly.
1362 *
1363 * This is used by supdrvGipMpEvent.
1364 *
1365 * @param pDevExt The device extension.
1366 * @param idCpu The CPU ID.
1367 */
1368static void supdrvGipMpEventOffline(PSUPDRVDEVEXT pDevExt, RTCPUID idCpu)
1369{
1370 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
1371 int iCpuSet;
1372 unsigned i;
1373
1374 AssertPtrReturnVoid(pGip);
1375 RTSpinlockAcquire(pDevExt->hGipSpinlock);
1376
1377 iCpuSet = RTMpCpuIdToSetIndex(idCpu);
1378 AssertReturnVoid(iCpuSet >= 0);
1379
1380 i = pGip->aiCpuFromCpuSetIdx[iCpuSet];
1381 AssertReturnVoid(i < pGip->cCpus);
1382 AssertReturnVoid(pGip->aCPUs[i].idCpu == idCpu);
1383
1384 Assert(RTCpuSetIsMemberByIndex(&pGip->PossibleCpuSet, iCpuSet));
1385 RTCpuSetDelByIndex(&pGip->OnlineCpuSet, iCpuSet);
1386
1387 /* Update the Mp online/offline counter. */
1388 ASMAtomicIncU32(&pDevExt->cMpOnOffEvents);
1389
1390 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
1391 {
1392 /* Reset the TSC delta, we will recalculate it lazily. */
1393 ASMAtomicWriteS64(&pGip->aCPUs[i].i64TSCDelta, INT64_MAX);
1394 /* Remove this CPU from the set of CPUs that we have obtained the TSC deltas. */
1395 RTCpuSetDelByIndex(&pDevExt->TscDeltaObtainedCpuSet, iCpuSet);
1396 }
1397
1398 /* commit it */
1399 ASMAtomicWriteSize(&pGip->aCPUs[i].enmState, SUPGIPCPUSTATE_OFFLINE);
1400
1401 RTSpinlockRelease(pDevExt->hGipSpinlock);
1402}
1403
1404
/**
 * Multiprocessor event notification callback.
 *
 * This is used to make sure that the GIP master gets passed on to
 * another CPU. It also updates the associated CPU data.
 *
 * @param   enmEvent    The event.
 * @param   idCpu       The cpu it applies to.
 * @param   pvUser      Pointer to the device extension.
 *
 * @remarks This function -must- fire on the newly online'd CPU for the
 *          RTMPEVENT_ONLINE case and can fire on any CPU for the
 *          RTMPEVENT_OFFLINE case.
 */
static DECLCALLBACK(void) supdrvGipMpEvent(RTMPEVENT enmEvent, RTCPUID idCpu, void *pvUser)
{
    PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
    PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;

    AssertRelease(!RTThreadPreemptIsEnabled(NIL_RTTHREAD));

    /*
     * Update the GIP CPU data.
     */
    if (pGip)
    {
        switch (enmEvent)
        {
            case RTMPEVENT_ONLINE:
                AssertRelease(idCpu == RTMpCpuId());
                supdrvGipMpEventOnlineOrInitOnCpu(pDevExt, idCpu);
                break;
            case RTMPEVENT_OFFLINE:
                supdrvGipMpEventOffline(pDevExt, idCpu);
                break;
        }
    }

    /*
     * Make sure there is a master GIP.
     */
    if (enmEvent == RTMPEVENT_OFFLINE)
    {
        RTCPUID idGipMaster = ASMAtomicReadU32(&pDevExt->idGipMaster);
        if (idGipMaster == idCpu)
        {
            /*
             * The GIP master is going offline, find a new one.
             */
            bool fIgnored;
            unsigned i;
            RTCPUID idNewGipMaster = NIL_RTCPUID;
            RTCPUSET OnlineCpus;
            RTMpGetOnlineSet(&OnlineCpus);

            /* Pick the first online CPU that isn't the outgoing master. */
            for (i = 0; i < RTCPUSET_MAX_CPUS; i++)
                if (RTCpuSetIsMemberByIndex(&OnlineCpus, i))
                {
                    RTCPUID idCurCpu = RTMpCpuIdFromSetIndex(i);
                    if (idCurCpu != idGipMaster)
                    {
                        idNewGipMaster = idCurCpu;
                        break;
                    }
                }

            Log(("supdrvGipMpEvent: Gip master %#lx -> %#lx\n", (long)idGipMaster, (long)idNewGipMaster));
            /* cmpxchg so we only hand over if we are still the master (racing
               offline events may already have switched it). */
            ASMAtomicCmpXchgSize(&pDevExt->idGipMaster, idNewGipMaster, idGipMaster, fIgnored);
            NOREF(fIgnored);
        }
    }
}
1477
1478
1479/**
1480 * On CPU initialization callback for RTMpOnAll.
1481 *
1482 * @param idCpu The CPU ID.
1483 * @param pvUser1 The device extension.
1484 * @param pvUser2 The GIP.
1485 */
1486static DECLCALLBACK(void) supdrvGipInitOnCpu(RTCPUID idCpu, void *pvUser1, void *pvUser2)
1487{
1488 /* This is good enough, even though it will update some of the globals a
1489 bit to much. */
1490 supdrvGipMpEventOnlineOrInitOnCpu((PSUPDRVDEVEXT)pvUser1, idCpu);
1491}
1492
1493
1494/**
1495 * Callback used by supdrvDetermineAsyncTSC to read the TSC on a CPU.
1496 *
1497 * @param idCpu Ignored.
1498 * @param pvUser1 Where to put the TSC.
1499 * @param pvUser2 Ignored.
1500 */
1501static DECLCALLBACK(void) supdrvGipInitDetermineAsyncTscWorker(RTCPUID idCpu, void *pvUser1, void *pvUser2)
1502{
1503 ASMAtomicWriteU64((uint64_t volatile *)pvUser1, ASMReadTSC());
1504}
1505
1506
/**
 * Determine if Async GIP mode is required because of TSC drift.
 *
 * When using the default/normal timer code it is essential that the time stamp counter
 * (TSC) runs never backwards, that is, a read operation to the counter should return
 * a bigger value than any previous read operation. This is guaranteed by the latest
 * AMD CPUs and by newer Intel CPUs which never enter the C2 state (P4). In any other
 * case we have to choose the asynchronous timer mode.
 *
 * @param   poffMin     Pointer to the determined difference between different
 *                      cores (optional, can be NULL).
 * @return  false if the time stamp counters appear to be synchronized, true otherwise.
 */
static bool supdrvGipInitDetermineAsyncTsc(uint64_t *poffMin)
{
    /*
     * Just iterate all the cpus 8 times and make sure that the TSC is
     * ever increasing. We don't bother taking TSC rollover into account.
     */
    int iEndCpu = RTMpGetArraySize();
    int iCpu;
    int cLoops = 8;
    bool fAsync = false;
    int rc = VINF_SUCCESS;
    uint64_t offMax = 0;
    uint64_t offMin = ~(uint64_t)0;
    uint64_t PrevTsc = ASMReadTSC();

    while (cLoops-- > 0)
    {
        for (iCpu = 0; iCpu < iEndCpu; iCpu++)
        {
            uint64_t CurTsc;
            rc = RTMpOnSpecific(RTMpCpuIdFromSetIndex(iCpu), supdrvGipInitDetermineAsyncTscWorker, &CurTsc, NULL);
            if (RT_SUCCESS(rc))
            {
                /* A non-increasing TSC across CPUs means async mode is required. */
                if (CurTsc <= PrevTsc)
                {
                    fAsync = true;
                    offMin = offMax = PrevTsc - CurTsc;
                    Log(("supdrvGipInitDetermineAsyncTsc: iCpu=%d cLoops=%d CurTsc=%llx PrevTsc=%llx\n",
                         iCpu, cLoops, CurTsc, PrevTsc));
                    break;
                }

                /* Gather statistics (except the first time, i.e. iCpu==0 on the
                   first pass, where PrevTsc was read on an arbitrary CPU). */
                if (iCpu != 0 || cLoops != 7)
                {
                    uint64_t off = CurTsc - PrevTsc;
                    if (off < offMin)
                        offMin = off;
                    if (off > offMax)
                        offMax = off;
                    Log2(("%d/%d: off=%llx\n", cLoops, iCpu, off));
                }

                /* Next */
                PrevTsc = CurTsc;
            }
            else if (rc == VERR_NOT_SUPPORTED)
                break;
            else
                AssertMsg(rc == VERR_CPU_NOT_FOUND || rc == VERR_CPU_OFFLINE, ("%d\n", rc));
        }

        /* broke out of the inner loop (async detected or not supported). */
        if (iCpu < iEndCpu)
            break;
    }

    if (poffMin)
        *poffMin = offMin; /* Almost RTMpOnSpecific profiling. */
    Log(("supdrvGipInitDetermineAsyncTsc: returns %d; iEndCpu=%d rc=%d offMin=%llx offMax=%llx\n",
         fAsync, iEndCpu, rc, offMin, offMax));
#if !defined(RT_OS_SOLARIS) && !defined(RT_OS_OS2) && !defined(RT_OS_WINDOWS)
    OSDBGPRINT(("vboxdrv: fAsync=%d offMin=%#lx offMax=%#lx\n", fAsync, (long)offMin, (long)offMax));
#endif
    return fAsync;
}
1586
1587
/**
 * supdrvGipInit() worker that determines the GIP TSC mode.
 *
 * @returns The most suitable TSC mode.
 * @param   pDevExt     Pointer to the device instance data.
 */
static SUPGIPMODE supdrvGipInitDetermineTscMode(PSUPDRVDEVEXT pDevExt)
{
    uint64_t u64DiffCoresIgnored;
    uint32_t uEAX, uEBX, uECX, uEDX;

    /*
     * Establish whether the CPU advertises TSC as invariant, we need that in
     * a couple of places below.
     */
    bool fInvariantTsc = false;
    if (ASMHasCpuId())
    {
        uEAX = ASMCpuId_EAX(0x80000000);
        if (ASMIsValidExtRange(uEAX) && uEAX >= 0x80000007)
        {
            uEDX = ASMCpuId_EDX(0x80000007);
            if (uEDX & X86_CPUID_AMD_ADVPOWER_EDX_TSCINVAR)
                fInvariantTsc = true;
        }
    }

    /*
     * On single CPU systems, we don't need to consider ASYNC mode.
     */
    if (RTMpGetCount() <= 1)
        return fInvariantTsc ? SUPGIPMODE_INVARIANT_TSC : SUPGIPMODE_SYNC_TSC;

    /*
     * Allow the user and/or OS specific bits to force async mode.
     */
    if (supdrvOSGetForcedAsyncTscMode(pDevExt))
        return SUPGIPMODE_ASYNC_TSC;

    /*
     * Use invariant mode if the CPU says TSC is invariant.
     */
    if (fInvariantTsc)
        return SUPGIPMODE_INVARIANT_TSC;

    /*
     * TSC is not invariant and we're on SMP, this presents two problems:
     *
     *      (1) There might be a skew between the CPUs, so that cpu0
     *          returns a TSC that is slightly different from cpu1.
     *          This skew may be due to (2), bad TSC initialization
     *          or slightly different TSC rates.
     *
     *      (2) Power management (and other things) may cause the TSC
     *          to run at a non-constant speed, and cause the speed
     *          to be different on the cpus. This will result in (1).
     *
     * If any of the above is detected, we will have to use ASYNC mode.
     */
    /* (1). Try check for current differences between the cpus. */
    if (supdrvGipInitDetermineAsyncTsc(&u64DiffCoresIgnored))
        return SUPGIPMODE_ASYNC_TSC;

    /* (2) If it's an AMD CPU with power management, we won't trust its TSC. */
    ASMCpuId(0, &uEAX, &uEBX, &uECX, &uEDX);
    if (   ASMIsValidStdRange(uEAX)
        && ASMIsAmdCpuEx(uEBX, uECX, uEDX))
    {
        /* Check for APM support. */
        uEAX = ASMCpuId_EAX(0x80000000);
        if (ASMIsValidExtRange(uEAX) && uEAX >= 0x80000007)
        {
            uEDX = ASMCpuId_EDX(0x80000007);
            if (uEDX & 0x3e) /* STC|TM|THERMTRIP|VID|FID. Ignore TS. */
                return SUPGIPMODE_ASYNC_TSC;
        }
    }

    return SUPGIPMODE_SYNC_TSC;
}
1668
1669
1670/**
1671 * Initializes per-CPU GIP information.
1672 *
1673 * @param pGip Pointer to the GIP.
1674 * @param pCpu Pointer to which GIP CPU to initalize.
1675 * @param u64NanoTS The current nanosecond timestamp.
1676 * @param uCpuHz The CPU frequency to set, 0 if the caller doesn't know.
1677 */
1678static void supdrvGipInitCpu(PSUPGLOBALINFOPAGE pGip, PSUPGIPCPU pCpu, uint64_t u64NanoTS, uint64_t uCpuHz)
1679{
1680 pCpu->u32TransactionId = 2;
1681 pCpu->u64NanoTS = u64NanoTS;
1682 pCpu->u64TSC = ASMReadTSC();
1683 pCpu->u64TSCSample = GIP_TSC_DELTA_RSVD;
1684 pCpu->i64TSCDelta = pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED ? INT64_MAX : 0;
1685
1686 ASMAtomicWriteSize(&pCpu->enmState, SUPGIPCPUSTATE_INVALID);
1687 ASMAtomicWriteSize(&pCpu->idCpu, NIL_RTCPUID);
1688 ASMAtomicWriteS16(&pCpu->iCpuSet, -1);
1689 ASMAtomicWriteU16(&pCpu->idApic, UINT16_MAX);
1690
1691 /*
1692 * The first time we're called, we don't have a CPU frequency handy,
1693 * so pretend it's a 4 GHz CPU. On CPUs that are online, we'll get
1694 * called again and at that point we have a more plausible CPU frequency
1695 * value handy. The frequency history will also be adjusted again on
1696 * the 2nd timer callout (maybe we can skip that now?).
1697 */
1698 if (!uCpuHz)
1699 {
1700 pCpu->u64CpuHz = _4G - 1;
1701 pCpu->u32UpdateIntervalTSC = (uint32_t)((_4G - 1) / pGip->u32UpdateHz);
1702 }
1703 else
1704 {
1705 pCpu->u64CpuHz = uCpuHz;
1706 pCpu->u32UpdateIntervalTSC = (uint32_t)(uCpuHz / pGip->u32UpdateHz);
1707 }
1708 pCpu->au32TSCHistory[0]
1709 = pCpu->au32TSCHistory[1]
1710 = pCpu->au32TSCHistory[2]
1711 = pCpu->au32TSCHistory[3]
1712 = pCpu->au32TSCHistory[4]
1713 = pCpu->au32TSCHistory[5]
1714 = pCpu->au32TSCHistory[6]
1715 = pCpu->au32TSCHistory[7]
1716 = pCpu->u32UpdateIntervalTSC;
1717}
1718
1719
/**
 * Initializes the GIP data.
 *
 * Zeroes the whole mapping first, then determines the TSC mode (which may do
 * cross-CPU calls), fills in the static fields and initializes every per-CPU
 * entry, and finally links the page into the device extension.
 *
 * @param   pDevExt             Pointer to the device instance data.
 * @param   pGip                Pointer to the read-write kernel mapping of the GIP.
 * @param   HCPhys              The physical address of the GIP.
 * @param   u64NanoTS           The current nanosecond timestamp.
 * @param   uUpdateHz           The update frequency.
 * @param   uUpdateIntervalNS   The update interval in nanoseconds.
 * @param   cCpus               The CPU count.
 */
static void supdrvGipInit(PSUPDRVDEVEXT pDevExt, PSUPGLOBALINFOPAGE pGip, RTHCPHYS HCPhys,
                          uint64_t u64NanoTS, unsigned uUpdateHz, unsigned uUpdateIntervalNS, unsigned cCpus)
{
    /* Size of the mapping, rounded up to whole pages. */
    size_t const cbGip = RT_ALIGN_Z(RT_OFFSETOF(SUPGLOBALINFOPAGE, aCPUs[cCpus]), PAGE_SIZE);
    unsigned i;
#ifdef DEBUG_DARWIN_GIP
    OSDBGPRINT(("supdrvGipInit: pGip=%p HCPhys=%lx u64NanoTS=%llu uUpdateHz=%d cCpus=%u\n", pGip, (long)HCPhys, u64NanoTS, uUpdateHz, cCpus));
#else
    LogFlow(("supdrvGipInit: pGip=%p HCPhys=%lx u64NanoTS=%llu uUpdateHz=%d cCpus=%u\n", pGip, (long)HCPhys, u64NanoTS, uUpdateHz, cCpus));
#endif

    /*
     * Initialize the structure.
     */
    memset(pGip, 0, cbGip);

    pGip->u32Magic = SUPGLOBALINFOPAGE_MAGIC;
    pGip->u32Version = SUPGLOBALINFOPAGE_VERSION;
    pGip->u32Mode = supdrvGipInitDetermineTscMode(pDevExt);
    /* TSC deltas are only meaningful for invariant mode; start out claiming
       practically-zero deltas and let later measurements downgrade that. */
    if (   pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC
        /*|| pGip->u32Mode == SUPGIPMODE_SYNC_TSC */)
        pGip->enmUseTscDelta = supdrvOSAreTscDeltasInSync() /* Allow OS override (windows). */
                             ? SUPGIPUSETSCDELTA_ZERO_CLAIMED : SUPGIPUSETSCDELTA_PRACTICALLY_ZERO /* downgrade later */;
    else
        pGip->enmUseTscDelta = SUPGIPUSETSCDELTA_NOT_APPLICABLE;
    pGip->cCpus = (uint16_t)cCpus;
    pGip->cPages = (uint16_t)(cbGip / PAGE_SIZE);
    pGip->u32UpdateHz = uUpdateHz;
    pGip->u32UpdateIntervalNS = uUpdateIntervalNS;
    pGip->fGetGipCpu = SUPGIPGETCPU_APIC_ID;
    RTCpuSetEmpty(&pGip->OnlineCpuSet);
    RTCpuSetEmpty(&pGip->PresentCpuSet);
    RTMpGetSet(&pGip->PossibleCpuSet);
    pGip->cOnlineCpus = RTMpGetOnlineCount();
    pGip->cPresentCpus = RTMpGetPresentCount();
    pGip->cPossibleCpus = RTMpGetCount();
    pGip->idCpuMax = RTMpGetMaxCpuId();
    /* All APIC-ID and CPU-set-index mapping slots start out unassigned. */
    for (i = 0; i < RT_ELEMENTS(pGip->aiCpuFromApicId); i++)
        pGip->aiCpuFromApicId[i] = UINT16_MAX;
    for (i = 0; i < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx); i++)
        pGip->aiCpuFromCpuSetIdx[i] = UINT16_MAX;
    for (i = 0; i < cCpus; i++)
        supdrvGipInitCpu(pGip, &pGip->aCPUs[i], u64NanoTS, 0 /*uCpuHz*/);

    /*
     * Link it to the device extension.
     */
    pDevExt->pGip = pGip;
    pDevExt->HCPhysGip = HCPhys;
    pDevExt->cGipUsers = 0;
}
1782
1783
1784/**
1785 * Creates the GIP.
1786 *
1787 * @returns VBox status code.
1788 * @param pDevExt Instance data. GIP stuff may be updated.
1789 */
1790int VBOXCALL supdrvGipCreate(PSUPDRVDEVEXT pDevExt)
1791{
1792 PSUPGLOBALINFOPAGE pGip;
1793 RTHCPHYS HCPhysGip;
1794 uint32_t u32SystemResolution;
1795 uint32_t u32Interval;
1796 uint32_t u32MinInterval;
1797 uint32_t uMod;
1798 unsigned cCpus;
1799 int rc;
1800
1801 LogFlow(("supdrvGipCreate:\n"));
1802
1803 /*
1804 * Assert order.
1805 */
1806 Assert(pDevExt->u32SystemTimerGranularityGrant == 0);
1807 Assert(pDevExt->GipMemObj == NIL_RTR0MEMOBJ);
1808 Assert(!pDevExt->pGipTimer);
1809#ifdef SUPDRV_USE_MUTEX_FOR_GIP
1810 Assert(pDevExt->mtxGip != NIL_RTSEMMUTEX);
1811 Assert(pDevExt->mtxTscDelta != NIL_RTSEMMUTEX);
1812#else
1813 Assert(pDevExt->mtxGip != NIL_RTSEMFASTMUTEX);
1814 Assert(pDevExt->mtxTscDelta != NIL_RTSEMFASTMUTEX);
1815#endif
1816
1817 /*
1818 * Check the CPU count.
1819 */
1820 cCpus = RTMpGetArraySize();
1821 if ( cCpus > RTCPUSET_MAX_CPUS
1822 || cCpus > 256 /* ApicId is used for the mappings */)
1823 {
1824 SUPR0Printf("VBoxDrv: Too many CPUs (%u) for the GIP (max %u)\n", cCpus, RT_MIN(RTCPUSET_MAX_CPUS, 256));
1825 return VERR_TOO_MANY_CPUS;
1826 }
1827
1828 /*
1829 * Allocate a contiguous set of pages with a default kernel mapping.
1830 */
1831 rc = RTR0MemObjAllocCont(&pDevExt->GipMemObj, RT_UOFFSETOF(SUPGLOBALINFOPAGE, aCPUs[cCpus]), false /*fExecutable*/);
1832 if (RT_FAILURE(rc))
1833 {
1834 OSDBGPRINT(("supdrvGipCreate: failed to allocate the GIP page. rc=%d\n", rc));
1835 return rc;
1836 }
1837 pGip = (PSUPGLOBALINFOPAGE)RTR0MemObjAddress(pDevExt->GipMemObj); AssertPtr(pGip);
1838 HCPhysGip = RTR0MemObjGetPagePhysAddr(pDevExt->GipMemObj, 0); Assert(HCPhysGip != NIL_RTHCPHYS);
1839
1840 /*
1841 * Find a reasonable update interval and initialize the structure.
1842 */
1843 supdrvGipRequestHigherTimerFrequencyFromSystem(pDevExt);
1844 /** @todo figure out why using a 100Ms interval upsets timekeeping in VMs.
1845 * See @bugref{6710}. */
1846 u32MinInterval = RT_NS_10MS;
1847 u32SystemResolution = RTTimerGetSystemGranularity();
1848 u32Interval = u32MinInterval;
1849 uMod = u32MinInterval % u32SystemResolution;
1850 if (uMod)
1851 u32Interval += u32SystemResolution - uMod;
1852
1853 supdrvGipInit(pDevExt, pGip, HCPhysGip, RTTimeSystemNanoTS(), RT_NS_1SEC / u32Interval /*=Hz*/, u32Interval, cCpus);
1854
1855 /*
1856 * Important sanity check...
1857 */
1858 if (RT_UNLIKELY( pGip->enmUseTscDelta == SUPGIPUSETSCDELTA_ZERO_CLAIMED
1859 && pGip->u32Mode == SUPGIPMODE_ASYNC_TSC
1860 && !supdrvOSGetForcedAsyncTscMode(pDevExt)))
1861 {
1862 /* Basically, invariant Windows boxes, should never be detected as async (i.e. TSC-deltas should be 0). */
1863 OSDBGPRINT(("supdrvGipCreate: The TSC-deltas should be normalized by the host OS, but verifying shows it's not!\n"));
1864 return VERR_INTERNAL_ERROR_2;
1865 }
1866
1867 /*
1868 * Do the TSC frequency measurements.
1869 *
1870 * If we're in invariant TSC mode, just to a quick preliminary measurement
1871 * that the TSC-delta measurement code can use to yield cross calls.
1872 *
1873 * If we're in any of the other two modes, neither which require MP init,
1874 * notifications or deltas for the job, do the full measurement now so
1875 * that supdrvGipInitOnCpu can populate the TSC interval and history
1876 * array with more reasonable values.
1877 */
1878 if (pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC)
1879 {
1880 rc = supdrvGipInitMeasureTscFreq(pDevExt, pGip, true /*fRough*/); /* cannot fail */
1881 supdrvGipInitStartTimerForRefiningInvariantTscFreq(pDevExt, pGip);
1882 }
1883 else
1884 rc = supdrvGipInitMeasureTscFreq(pDevExt, pGip, false /*fRough*/);
1885 if (RT_SUCCESS(rc))
1886 {
1887 /*
1888 * Start TSC-delta measurement thread before we start getting MP
1889 * events that will try kick it into action (includes the
1890 * RTMpOnAll/supdrvGipInitOnCpu call below).
1891 */
1892 RTCpuSetEmpty(&pDevExt->TscDeltaCpuSet);
1893 RTCpuSetEmpty(&pDevExt->TscDeltaObtainedCpuSet);
1894#ifdef SUPDRV_USE_TSC_DELTA_THREAD
1895 if ( pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED
1896 && pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC)
1897 rc = supdrvTscDeltaThreadInit(pDevExt);
1898#endif
1899 if (RT_SUCCESS(rc))
1900 {
1901 rc = RTMpNotificationRegister(supdrvGipMpEvent, pDevExt);
1902 if (RT_SUCCESS(rc))
1903 {
1904 /*
1905 * Do GIP initialization on all online CPUs. Wake up the
1906 * TSC-delta thread afterwards.
1907 */
1908 rc = RTMpOnAll(supdrvGipInitOnCpu, pDevExt, pGip);
1909 if (RT_SUCCESS(rc))
1910 {
1911#ifdef SUPDRV_USE_TSC_DELTA_THREAD
1912 if (pDevExt->hTscDeltaThread != NIL_RTTHREAD)
1913 RTThreadUserSignal(pDevExt->hTscDeltaThread);
1914#else
1915 uint16_t iCpu;
1916 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
1917 {
1918 /*
1919 * Measure the TSC deltas now that we have MP notifications.
1920 */
1921 int cTries = 5;
1922 do
1923 {
1924 rc = supdrvMeasureInitialTscDeltas(pDevExt);
1925 if ( rc != VERR_TRY_AGAIN
1926 && rc != VERR_CPU_OFFLINE)
1927 break;
1928 } while (--cTries > 0);
1929 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
1930 Log(("supdrvTscDeltaInit: cpu[%u] delta %lld\n", iCpu, pGip->aCPUs[iCpu].i64TSCDelta));
1931 }
1932 else
1933 {
1934 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
1935 AssertMsg(!pGip->aCPUs[iCpu].i64TSCDelta, ("iCpu=%u %lld mode=%d\n", iCpu, pGip->aCPUs[iCpu].i64TSCDelta, pGip->u32Mode));
1936 }
1937 if (RT_SUCCESS(rc))
1938#endif
1939 {
1940 /*
1941 * Create the timer.
1942 * If CPU_ALL isn't supported we'll have to fall back to synchronous mode.
1943 */
1944 if (pGip->u32Mode == SUPGIPMODE_ASYNC_TSC)
1945 {
1946 rc = RTTimerCreateEx(&pDevExt->pGipTimer, u32Interval, RTTIMER_FLAGS_CPU_ALL,
1947 supdrvGipAsyncTimer, pDevExt);
1948 if (rc == VERR_NOT_SUPPORTED)
1949 {
1950 OSDBGPRINT(("supdrvGipCreate: omni timer not supported, falling back to synchronous mode\n"));
1951 pGip->u32Mode = SUPGIPMODE_SYNC_TSC;
1952 }
1953 }
1954 if (pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
1955 rc = RTTimerCreateEx(&pDevExt->pGipTimer, u32Interval, 0 /* fFlags */,
1956 supdrvGipSyncAndInvariantTimer, pDevExt);
1957 if (RT_SUCCESS(rc))
1958 {
1959 /*
1960 * We're good.
1961 */
1962 Log(("supdrvGipCreate: %u ns interval.\n", u32Interval));
1963 supdrvGipReleaseHigherTimerFrequencyFromSystem(pDevExt);
1964
1965 g_pSUPGlobalInfoPage = pGip;
1966 return VINF_SUCCESS;
1967 }
1968
1969 OSDBGPRINT(("supdrvGipCreate: failed create GIP timer at %u ns interval. rc=%Rrc\n", u32Interval, rc));
1970 Assert(!pDevExt->pGipTimer);
1971 }
1972 }
1973 else
1974 OSDBGPRINT(("supdrvGipCreate: RTMpOnAll failed. rc=%Rrc\n", rc));
1975 }
1976 else
1977 OSDBGPRINT(("supdrvGipCreate: failed to register MP event notfication. rc=%Rrc\n", rc));
1978 }
1979 else
1980 OSDBGPRINT(("supdrvGipCreate: supdrvTscDeltaInit failed. rc=%Rrc\n", rc));
1981 }
1982 else
1983 OSDBGPRINT(("supdrvGipCreate: supdrvMeasureInitialTscDeltas failed. rc=%Rrc\n", rc));
1984
1985 /* Releases timer frequency increase too. */
1986 supdrvGipDestroy(pDevExt);
1987 return rc;
1988}
1989
1990
1991/**
1992 * Invalidates the GIP data upon termination.
1993 *
1994 * @param pGip Pointer to the read-write kernel mapping of the GIP.
1995 */
1996static void supdrvGipTerm(PSUPGLOBALINFOPAGE pGip)
1997{
1998 unsigned i;
1999 pGip->u32Magic = 0;
2000 for (i = 0; i < pGip->cCpus; i++)
2001 {
2002 pGip->aCPUs[i].u64NanoTS = 0;
2003 pGip->aCPUs[i].u64TSC = 0;
2004 pGip->aCPUs[i].iTSCHistoryHead = 0;
2005 pGip->aCPUs[i].u64TSCSample = 0;
2006 pGip->aCPUs[i].i64TSCDelta = INT64_MAX;
2007 }
2008}
2009
2010
/**
 * Terminates the GIP.
 *
 * Teardown happens in the reverse order of creation: MP notifications first
 * (so nothing new kicks the delta machinery), then the TSC-delta thread, the
 * refinement timer, the GIP data itself, the update timer, and finally the
 * backing memory object and the timer frequency request.
 *
 * @param   pDevExt     Instance data. GIP stuff may be updated.
 */
void VBOXCALL supdrvGipDestroy(PSUPDRVDEVEXT pDevExt)
{
    int rc;
#ifdef DEBUG_DARWIN_GIP
    OSDBGPRINT(("supdrvGipDestroy: pDevExt=%p pGip=%p pGipTimer=%p GipMemObj=%p\n", pDevExt,
                pDevExt->GipMemObj != NIL_RTR0MEMOBJ ? RTR0MemObjAddress(pDevExt->GipMemObj) : NULL,
                pDevExt->pGipTimer, pDevExt->GipMemObj));
#endif

    /*
     * Stop receiving MP notifications before tearing anything else down.
     */
    RTMpNotificationDeregister(supdrvGipMpEvent, pDevExt);

#ifdef SUPDRV_USE_TSC_DELTA_THREAD
    /*
     * Terminate the TSC-delta measurement thread and resources.
     */
    supdrvTscDeltaTerm(pDevExt);
#endif

    /*
     * Destroy the TSC-refinement timer.
     */
    if (pDevExt->pInvarTscRefineTimer)
    {
        RTTimerDestroy(pDevExt->pInvarTscRefineTimer);
        pDevExt->pInvarTscRefineTimer = NULL;
    }

    /*
     * Invalidate the GIP data.
     */
    if (pDevExt->pGip)
    {
        supdrvGipTerm(pDevExt->pGip);
        pDevExt->pGip = NULL;
    }
    g_pSUPGlobalInfoPage = NULL;

    /*
     * Destroy the timer and free the GIP memory object.
     */
    if (pDevExt->pGipTimer)
    {
        rc = RTTimerDestroy(pDevExt->pGipTimer); AssertRC(rc);
        pDevExt->pGipTimer = NULL;
    }

    if (pDevExt->GipMemObj != NIL_RTR0MEMOBJ)
    {
        rc = RTR0MemObjFree(pDevExt->GipMemObj, true /* free mappings */); AssertRC(rc);
        pDevExt->GipMemObj = NIL_RTR0MEMOBJ;
    }

    /*
     * Finally, make sure we've released the system timer resolution request
     * if one actually succeeded and is still pending.
     */
    supdrvGipReleaseHigherTimerFrequencyFromSystem(pDevExt);
}
2077
2078
2079
2080
2081/*
2082 *
2083 *
2084 * GIP Update Timer Related Code
2085 * GIP Update Timer Related Code
2086 * GIP Update Timer Related Code
2087 *
2088 *
2089 */
2090
2091
/**
 * Worker routine for supdrvGipUpdate() and supdrvGipUpdatePerCpu() that
 * updates all the per cpu data except the transaction id.
 *
 * @param   pDevExt     The device extension.
 * @param   pGipCpu     Pointer to the per cpu data.
 * @param   u64NanoTS   The current time stamp.
 * @param   u64TSC      The current TSC.
 * @param   iTick       The current timer tick.
 *
 * @remarks Can be called with interrupts disabled!
 */
static void supdrvGipDoUpdateCpu(PSUPDRVDEVEXT pDevExt, PSUPGIPCPU pGipCpu, uint64_t u64NanoTS, uint64_t u64TSC, uint64_t iTick)
{
    uint64_t u64TSCDelta;
    uint32_t u32UpdateIntervalTSC;
    uint32_t u32UpdateIntervalTSCSlack;
    unsigned iTSCHistoryHead;
    uint64_t u64CpuHz;
    uint32_t u32TransactionId;

    PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
    AssertPtrReturnVoid(pGip);

    /* Delta between this and the previous update. */
    ASMAtomicUoWriteU32(&pGipCpu->u32PrevUpdateIntervalNS, (uint32_t)(u64NanoTS - pGipCpu->u64NanoTS));

    /*
     * Update the NanoTS.
     */
    ASMAtomicWriteU64(&pGipCpu->u64NanoTS, u64NanoTS);

    /*
     * Calc TSC delta.
     */
    u64TSCDelta = u64TSC - pGipCpu->u64TSC;
    ASMAtomicWriteU64(&pGipCpu->u64TSC, u64TSC);

    /*
     * We don't need to keep recalculating the frequency when it's invariant, so
     * the remainder of this function is only for the sync and async TSC modes.
     */
    if (pGip->u32Mode != SUPGIPMODE_INVARIANT_TSC)
    {
        /* A delta that doesn't fit in 32 bits is bogus (missed ticks or TSC jump);
           substitute the last computed interval and count it as an error. */
        if (u64TSCDelta >> 32)
        {
            u64TSCDelta = pGipCpu->u32UpdateIntervalTSC;
            pGipCpu->cErrors++;
        }

        /*
         * On the 2nd and 3rd callout, reset the history with the current TSC
         * interval since the values entered by supdrvGipInit are totally off.
         * The interval on the 1st callout is completely unreliable, the 2nd is a bit
         * better, while the 3rd should be most reliable.
         */
        /** @todo Could we drop this now that we initialize the history
         *        with nominal TSC frequency values? */
        u32TransactionId = pGipCpu->u32TransactionId;
        if (RT_UNLIKELY(   (   u32TransactionId == 5
                            || u32TransactionId == 7)
                        && (   iTick == 2
                            || iTick == 3) ))
        {
            unsigned i;
            for (i = 0; i < RT_ELEMENTS(pGipCpu->au32TSCHistory); i++)
                ASMAtomicUoWriteU32(&pGipCpu->au32TSCHistory[i], (uint32_t)u64TSCDelta);
        }

        /*
         * Validate the NanoTS deltas between timer fires with an arbitrary threshold of 0.5%.
         * Wait until we have at least one full history since the above history reset. The
         * assumption is that the majority of the previous history values will be tolerable.
         * See @bugref{6710} comment #67.
         */
        /** @todo Could we drop the fudging here now that we initialize the history
         *        with nominal TSC frequency values? */
        if (   u32TransactionId > 23 /* 7 + (8 * 2) */
            && pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
        {
            uint32_t uNanoTsThreshold = pGip->u32UpdateIntervalNS / 200;
            if (   pGipCpu->u32PrevUpdateIntervalNS > pGip->u32UpdateIntervalNS + uNanoTsThreshold
                || pGipCpu->u32PrevUpdateIntervalNS < pGip->u32UpdateIntervalNS - uNanoTsThreshold)
            {
                /* Interval is outside the tolerance; replace the measured delta with
                   an 8-entry history average (two 4-entry averages to stay in 32 bits). */
                uint32_t u32;
                u32 = pGipCpu->au32TSCHistory[0];
                u32 += pGipCpu->au32TSCHistory[1];
                u32 += pGipCpu->au32TSCHistory[2];
                u32 += pGipCpu->au32TSCHistory[3];
                u32 >>= 2;
                u64TSCDelta = pGipCpu->au32TSCHistory[4];
                u64TSCDelta += pGipCpu->au32TSCHistory[5];
                u64TSCDelta += pGipCpu->au32TSCHistory[6];
                u64TSCDelta += pGipCpu->au32TSCHistory[7];
                u64TSCDelta >>= 2;
                u64TSCDelta += u32;
                u64TSCDelta >>= 1;
            }
        }

        /*
         * TSC History.
         */
        Assert(RT_ELEMENTS(pGipCpu->au32TSCHistory) == 8);
        iTSCHistoryHead = (pGipCpu->iTSCHistoryHead + 1) & 7;
        ASMAtomicWriteU32(&pGipCpu->iTSCHistoryHead, iTSCHistoryHead);
        ASMAtomicWriteU32(&pGipCpu->au32TSCHistory[iTSCHistoryHead], (uint32_t)u64TSCDelta);

        /*
         * UpdateIntervalTSC = average of last 8,2,1 intervals depending on update HZ.
         *
         * On Windows, we have an occasional (but recurring) sour value that messed up
         * the history but taking only 1 interval reduces the precision overall.
         */
        if (   pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC
            || pGip->u32UpdateHz >= 1000)
        {
            uint32_t u32;
            u32 = pGipCpu->au32TSCHistory[0];
            u32 += pGipCpu->au32TSCHistory[1];
            u32 += pGipCpu->au32TSCHistory[2];
            u32 += pGipCpu->au32TSCHistory[3];
            u32 >>= 2;
            u32UpdateIntervalTSC = pGipCpu->au32TSCHistory[4];
            u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[5];
            u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[6];
            u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[7];
            u32UpdateIntervalTSC >>= 2;
            u32UpdateIntervalTSC += u32;
            u32UpdateIntervalTSC >>= 1;

            /* Value chosen for a 2GHz Athlon64 running linux 2.6.10/11. */
            u32UpdateIntervalTSCSlack = u32UpdateIntervalTSC >> 14;
        }
        else if (pGip->u32UpdateHz >= 90)
        {
            u32UpdateIntervalTSC = (uint32_t)u64TSCDelta;
            u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[(iTSCHistoryHead - 1) & 7];
            u32UpdateIntervalTSC >>= 1;

            /* value chosen on a 2GHz thinkpad running windows */
            u32UpdateIntervalTSCSlack = u32UpdateIntervalTSC >> 7;
        }
        else
        {
            u32UpdateIntervalTSC = (uint32_t)u64TSCDelta;

            /* This value hasn't be checked yet.. waiting for OS/2 and 33Hz timers.. :-) */
            u32UpdateIntervalTSCSlack = u32UpdateIntervalTSC >> 6;
        }
        ASMAtomicWriteU32(&pGipCpu->u32UpdateIntervalTSC, u32UpdateIntervalTSC + u32UpdateIntervalTSCSlack);

        /*
         * CpuHz.
         */
        u64CpuHz = ASMMult2xU32RetU64(u32UpdateIntervalTSC, RT_NS_1SEC);
        u64CpuHz /= pGip->u32UpdateIntervalNS;
        ASMAtomicWriteU64(&pGipCpu->u64CpuHz, u64CpuHz);
    }
}
2252
2253
/**
 * Updates the GIP.
 *
 * @param   pDevExt     The device extension.
 * @param   u64NanoTS   The current nanosecond timestamp.
 * @param   u64TSC      The current TSC timestamp.
 * @param   idCpu       The CPU ID.
 * @param   iTick       The current timer tick.
 *
 * @remarks Can be called with interrupts disabled!
 */
static void supdrvGipUpdate(PSUPDRVDEVEXT pDevExt, uint64_t u64NanoTS, uint64_t u64TSC, RTCPUID idCpu, uint64_t iTick)
{
    /*
     * Determine the relevant CPU data.
     */
    PSUPGIPCPU pGipCpu;
    PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
    AssertPtrReturnVoid(pGip);

    if (pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
        pGipCpu = &pGip->aCPUs[0];
    else
    {
        unsigned iCpu = pGip->aiCpuFromApicId[ASMGetApicId()];
        if (RT_UNLIKELY(iCpu >= pGip->cCpus))
            return;
        pGipCpu = &pGip->aCPUs[iCpu];
        if (RT_UNLIKELY(pGipCpu->idCpu != idCpu))
            return;
    }

    /*
     * Start update transaction.  An odd transaction id signals "update in
     * progress" to lock-free readers of the GIP.
     */
    if (!(ASMAtomicIncU32(&pGipCpu->u32TransactionId) & 1))
    {
        /* this can happen on win32 if we're taking too long and there are more CPUs around. shouldn't happen though. */
        AssertMsgFailed(("Invalid transaction id, %#x, not odd!\n", pGipCpu->u32TransactionId));
        ASMAtomicIncU32(&pGipCpu->u32TransactionId);
        pGipCpu->cErrors++;
        return;
    }

    /*
     * Recalc the update frequency every 0x800th time.
     */
    if (   pGip->u32Mode != SUPGIPMODE_INVARIANT_TSC /* cuz we're not recalculating the frequency on invariant hosts. */
        && !(pGipCpu->u32TransactionId & (GIP_UPDATEHZ_RECALC_FREQ * 2 - 2)))
    {
        if (pGip->u64NanoTSLastUpdateHz)
        {
#ifdef RT_ARCH_AMD64 /** @todo fix 64-bit div here to work on x86 linux. */
            uint64_t u64Delta = u64NanoTS - pGip->u64NanoTSLastUpdateHz;
            uint32_t u32UpdateHz = (uint32_t)((RT_NS_1SEC_64 * GIP_UPDATEHZ_RECALC_FREQ) / u64Delta);
            if (u32UpdateHz <= 2000 && u32UpdateHz >= 30)
            {
                /** @todo r=ramshankar: Changing u32UpdateHz might screw up TSC frequency
                 *        calculation on non-invariant hosts if it changes the history decision
                 *        taken in supdrvGipDoUpdateCpu(). */
                uint64_t u64Interval = u64Delta / GIP_UPDATEHZ_RECALC_FREQ;
                ASMAtomicWriteU32(&pGip->u32UpdateHz, u32UpdateHz);
                ASMAtomicWriteU32(&pGip->u32UpdateIntervalNS, (uint32_t)u64Interval);
            }
#endif
        }
        /* The '| 1' makes sure the stored value is never zero, so the
           "first time" check above keeps working. */
        ASMAtomicWriteU64(&pGip->u64NanoTSLastUpdateHz, u64NanoTS | 1);
    }

    /*
     * Update the data.
     */
    supdrvGipDoUpdateCpu(pDevExt, pGipCpu, u64NanoTS, u64TSC, iTick);

    /*
     * Complete transaction.
     */
    ASMAtomicIncU32(&pGipCpu->u32TransactionId);
}
2333
2334
/**
 * Updates the per cpu GIP data for the calling cpu.
 *
 * @param   pDevExt     The device extension.
 * @param   u64NanoTS   The current nanosecond timestamp.
 * @param   u64TSC      The current TSC timestamp.
 * @param   idCpu       The CPU ID.
 * @param   idApic      The APIC id for the CPU index.
 * @param   iTick       The current timer tick.
 *
 * @remarks Can be called with interrupts disabled!
 */
static void supdrvGipUpdatePerCpu(PSUPDRVDEVEXT pDevExt, uint64_t u64NanoTS, uint64_t u64TSC,
                                  RTCPUID idCpu, uint8_t idApic, uint64_t iTick)
{
    uint32_t iCpu;
    PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;

    /*
     * Avoid a potential race when a CPU online notification doesn't fire on
     * the onlined CPU but the tick creeps in before the event notification is
     * run.
     */
    if (RT_UNLIKELY(iTick == 1))
    {
        iCpu = supdrvGipFindOrAllocCpuIndexForCpuId(pGip, idCpu);
        if (pGip->aCPUs[iCpu].enmState == SUPGIPCPUSTATE_OFFLINE)
            supdrvGipMpEventOnlineOrInitOnCpu(pDevExt, idCpu);
    }

    iCpu = pGip->aiCpuFromApicId[idApic];
    if (RT_LIKELY(iCpu < pGip->cCpus))
    {
        PSUPGIPCPU pGipCpu = &pGip->aCPUs[iCpu];
        /* Silently skip the update if the entry doesn't match the caller
           (e.g. stale APIC id mapping during CPU offlining/onlining). */
        if (pGipCpu->idCpu == idCpu)
        {
            /*
             * Start update transaction.  An odd transaction id signals
             * "update in progress" to lock-free readers of the GIP.
             */
            if (!(ASMAtomicIncU32(&pGipCpu->u32TransactionId) & 1))
            {
                AssertMsgFailed(("Invalid transaction id, %#x, not odd!\n", pGipCpu->u32TransactionId));
                ASMAtomicIncU32(&pGipCpu->u32TransactionId);
                pGipCpu->cErrors++;
                return;
            }

            /*
             * Update the data.
             */
            supdrvGipDoUpdateCpu(pDevExt, pGipCpu, u64NanoTS, u64TSC, iTick);

            /*
             * Complete transaction.
             */
            ASMAtomicIncU32(&pGipCpu->u32TransactionId);
        }
    }
}
2394
2395
/**
 * Timer callback function for the sync and invariant GIP modes.
 *
 * @param   pTimer      The timer.
 * @param   pvUser      Opaque pointer to the device extension.
 * @param   iTick       The timer tick.
 */
static DECLCALLBACK(void) supdrvGipSyncAndInvariantTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
{
    RTCCUINTREG uFlags;
    uint64_t u64TSC;
    uint64_t u64NanoTS;
    PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
    PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;

    /* Sample TSC and nano time back-to-back with interrupts disabled. */
    uFlags = ASMIntDisableFlags(); /* No interruptions please (real problem on S10). */
    u64TSC = ASMReadTSC();
    u64NanoTS = RTTimeSystemNanoTS();

    if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_PRACTICALLY_ZERO)
    {
        /*
         * The calculations in supdrvGipUpdate() are very timing sensitive and don't handle
         * missed timer ticks. So for now it is better to use a delta of 0 and have the TSC rate
         * affected a bit until we get proper TSC deltas than implementing options like
         * rescheduling the tick to be delivered on the right CPU or missing the tick entirely.
         *
         * The likelihood of this happening is really low. On Windows, Linux, and Solaris
         * timers fire on the CPU they were registered/started on. Darwin timers don't
         * necessarily (they are high priority threads waiting).
         */
        Assert(!ASMIntAreEnabled());
        supdrvTscDeltaApply(pGip, &u64TSC, ASMGetApicId(), NULL /* pfDeltaApplied */);
    }

    supdrvGipUpdate(pDevExt, u64NanoTS, u64TSC, NIL_RTCPUID, iTick);

    ASMSetFlags(uFlags);

#ifdef SUPDRV_USE_TSC_DELTA_THREAD
    /* If there are still CPUs needing their TSC-delta measured, nudge the
       delta thread from listening/measuring into wait-and-measure state. */
    if (   pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED
        && !RTCpuSetIsEmpty(&pDevExt->TscDeltaCpuSet))
    {
        RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
        if (   pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Listening
            || pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Measuring)
            pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_WaitAndMeasure;
        RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
        /** @todo Do the actual poking using -- RTThreadUserSignal() */
    }
#endif
}
2448
2449
2450/**
2451 * Timer callback function for async GIP mode.
2452 * @param pTimer The timer.
2453 * @param pvUser Opaque pointer to the device extension.
2454 * @param iTick The timer tick.
2455 */
2456static DECLCALLBACK(void) supdrvGipAsyncTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
2457{
2458 RTCCUINTREG fOldFlags = ASMIntDisableFlags(); /* No interruptions please (real problem on S10). */
2459 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
2460 RTCPUID idCpu = RTMpCpuId();
2461 uint64_t u64TSC = ASMReadTSC();
2462 uint64_t NanoTS = RTTimeSystemNanoTS();
2463
2464 /** @todo reset the transaction number and whatnot when iTick == 1. */
2465 if (pDevExt->idGipMaster == idCpu)
2466 supdrvGipUpdate(pDevExt, NanoTS, u64TSC, idCpu, iTick);
2467 else
2468 supdrvGipUpdatePerCpu(pDevExt, NanoTS, u64TSC, idCpu, ASMGetApicId(), iTick);
2469
2470 ASMSetFlags(fOldFlags);
2471}
2472
2473
2474
2475
2476/*
2477 *
2478 *
2479 * TSC Delta Measurements And Related Code
2480 * TSC Delta Measurements And Related Code
2481 * TSC Delta Measurements And Related Code
2482 *
2483 *
2484 */
2485
2486
/*
 * Select TSC delta measurement algorithm.
 */
#if 1
# define GIP_TSC_DELTA_METHOD_1
#else
# define GIP_TSC_DELTA_METHOD_2
#endif

/** For padding variables to keep them away from other cache lines.  Better too
 * large than too small!
 * @remarks Current AMD64 and x86 CPUs seem to use 64 bytes.  There are claims
 *          that NetBurst had 128 byte cache lines while the 486 thru Pentium
 *          III had 32 bytes cache lines. */
#define GIP_TSC_DELTA_CACHE_LINE_SIZE 128
2502
2503
/**
 * TSC delta measurement algorithm \#2 result entry.
 */
typedef struct SUPDRVTSCDELTAMETHOD2ENTRY
{
    /** Our own sequence number at the time of the sample. */
    uint32_t iSeqMine;
    /** The sequence number observed from the other CPU. */
    uint32_t iSeqOther;
    /** The TSC value sampled. */
    uint64_t uTsc;
} SUPDRVTSCDELTAMETHOD2ENTRY;
2513
/**
 * TSC delta measurement algorithm \#2 Data.
 */
typedef struct SUPDRVTSCDELTAMETHOD2
{
    /** Padding to make sure the iCurSeqNo is in its own cache line. */
    uint64_t au64CacheLinePaddingBefore[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t) - 1];
    /** The current sequence number of this worker. */
    uint32_t volatile iCurSeqNo;
    /** Padding to make sure the iCurSeqNo is in its own cache line.
     * NOTE(review): the 'au64' prefix is a misnomer here, the element type is
     * uint32_t (the element count is scaled accordingly). */
    uint32_t au64CacheLinePaddingAfter[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint32_t) - 1];
    /** Result table. */
    SUPDRVTSCDELTAMETHOD2ENTRY aResults[96];
} SUPDRVTSCDELTAMETHOD2;
/** Pointer to the data for TSC delta measurement algorithm \#2 .*/
typedef SUPDRVTSCDELTAMETHOD2 *PSUPDRVTSCDELTAMETHOD2;
2530
2531
/**
 * The TSC delta synchronization struct, version 2.
 *
 * The synchronization variable is completely isolated in its own cache line
 * (provided our max cache line size estimate is correct).
 */
typedef struct SUPTSCDELTASYNC2
{
    /** Padding to make sure uSyncVar is in its own cache line. */
    uint64_t au64CacheLinePaddingBefore[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t)];

    /** The synchronization variable, holds values GIP_TSC_DELTA_SYNC2_*. */
    volatile uint32_t uSyncVar;
    /** Sequence synchronizing variable used for post 'GO' synchronization. */
    volatile uint32_t uSyncSeq;

    /** Padding to make sure uSyncVar/uSyncSeq stay in their own cache line. */
    uint64_t au64CacheLinePaddingAfter[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t) - 2];

    /** Start RDTSC value.  Put here mainly to save stack space. */
    uint64_t uTscStart;
    /** Copy of SUPDRVGIPTSCDELTARGS::cMaxTscTicks. */
    uint64_t cMaxTscTicks;
} SUPTSCDELTASYNC2;
AssertCompileSize(SUPTSCDELTASYNC2, GIP_TSC_DELTA_CACHE_LINE_SIZE * 2 + sizeof(uint64_t));
typedef SUPTSCDELTASYNC2 *PSUPTSCDELTASYNC2;
2558
/** Prestart wait. */
#define GIP_TSC_DELTA_SYNC2_PRESTART_WAIT UINT32_C(0x0ffe)
/** Prestart aborted. */
#define GIP_TSC_DELTA_SYNC2_PRESTART_ABORT UINT32_C(0x0fff)
/** Ready (on your mark). */
#define GIP_TSC_DELTA_SYNC2_READY UINT32_C(0x1000)
/** Steady (get set). */
#define GIP_TSC_DELTA_SYNC2_STEADY UINT32_C(0x1001)
/** Go! */
#define GIP_TSC_DELTA_SYNC2_GO UINT32_C(0x1002)
/** Used by the verification test. */
#define GIP_TSC_DELTA_SYNC2_GO_GO UINT32_C(0x1003)

/** We reached the time limit. */
#define GIP_TSC_DELTA_SYNC2_TIMEOUT UINT32_C(0x1ffe)
/** The other party won't touch the sync struct ever again. */
#define GIP_TSC_DELTA_SYNC2_FINAL UINT32_C(0x1fff)
2576
2577
/**
 * Argument package/state passed by supdrvMeasureTscDeltaOne to the RTMpOn
 * callback worker.
 */
typedef struct SUPDRVGIPTSCDELTARGS
{
    /** The device extension. */
    PSUPDRVDEVEXT pDevExt;
    /** Pointer to the GIP CPU array entry for the worker. */
    PSUPGIPCPU pWorker;
    /** Pointer to the GIP CPU array entry for the master. */
    PSUPGIPCPU pMaster;
    /** The maximum number of ticks to spend in supdrvMeasureTscDeltaCallback.
     * (This is what we need a rough TSC frequency for.)  */
    uint64_t cMaxTscTicks;
    /** Used to abort synchronization setup. */
    bool volatile fAbortSetup;

#if 0
    /** Method 1 data. */
    struct
    {
    } M1;
#endif

#ifdef GIP_TSC_DELTA_METHOD_2
    /** Method 2 data. */
    struct
    {
        PSUPDRVTSCDELTAMETHOD2 pMasterData;
        PSUPDRVTSCDELTAMETHOD2 pWorkerData;
        uint32_t cHits;
        bool fLagMaster;
        bool fLagWorker;
        bool volatile fQuitEarly;
    } M2;
#endif


    /** Padding to make sure the master variables live in their own cache lines. */
    uint64_t au64CacheLinePaddingBefore[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t)];
    /** Pointer to the master's synchronization struct (on stack). */
    PSUPTSCDELTASYNC2 volatile pSyncMaster;
    /** Verification test TSC values for the master. */
    uint64_t volatile auVerifyMasterTscs[32];
    /** The verifier verdict, VINF_SUCCESS if ok, VERR_OUT_OF_RANGE if not,
     * VERR_TRY_AGAIN on timeout. */
    int32_t rcVerify;
    /** The maximum difference between TSC reads during delta verification. */
    int64_t cMaxVerifyTscTicks;
    /** The minimum difference between two TSC reads during verification. */
    int64_t cMinVerifyTscTicks;
    /** The bad TSC diff, worker relative to master (= worker - master).
     * Negative value means the worker is behind the master. */
    int64_t iVerifyBadTscDiff;

    /** Padding to make sure the worker variables live in their own cache lines. */
    uint64_t au64CacheLinePaddingBetween[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t)];
    /** Pointer to the worker's synchronization struct (on stack). */
    PSUPTSCDELTASYNC2 volatile pSyncWorker;
    /** Verification test TSC values for the worker. */
    uint64_t volatile auVerifyWorkerTscs[32];

    /** Padding to make sure the above is in its own cache line. */
    uint64_t au64CacheLinePaddingAfter[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t)];
} SUPDRVGIPTSCDELTARGS;
typedef SUPDRVGIPTSCDELTARGS *PSUPDRVGIPTSCDELTARGS;
2644
2645
/** @name Macros that implement the basic synchronization steps common to
 * the algorithms.
 *
 * Must be used from a loop as the timeouts are implemented via 'break'
 * statements at the moment.
 *
 * @{
 */
#if defined(DEBUG_bird) /* || defined(VBOX_STRICT) */
# define TSCDELTA_DBG_VARS() uint32_t iDbgCounter
# define TSCDELTA_DBG_START_LOOP() do { iDbgCounter = 0; } while (0)
/* Hits a breakpoint roughly every 32M iterations to catch stuck sync loops. */
# define TSCDELTA_DBG_CHECK_LOOP() \
    do { iDbgCounter++; if ((iDbgCounter & UINT32_C(0x01ffffff)) == 0) RT_BREAKPOINT(); } while (0)
#else
# define TSCDELTA_DBG_VARS() ((void)0)
# define TSCDELTA_DBG_START_LOOP() ((void)0)
# define TSCDELTA_DBG_CHECK_LOOP() ((void)0)
#endif
2664
2665
/**
 * Pre-measurement synchronization handshake between the master and worker CPU.
 *
 * Takes both parties through READY -> STEADY -> GO and then attempts to get
 * them into mostly lockstep execution via the uSyncSeq exchange.  On success
 * interrupts are left DISABLED and the EFLAGS to restore are returned via
 * @a pfEFlags (they are restored by the corresponding SYNC_AFTER step).
 *
 * @returns true on success (proceed with measurement), false on
 *          timeout/abort (caller is expected to 'break' out of its loop).
 * @param   pMySync     This CPU's synchronization structure.
 * @param   pOtherSync  The other CPU's synchronization structure.
 * @param   fIsMaster   Set if called on the master, clear on the worker.
 * @param   pfEFlags    Where to return the saved EFLAGS (valid on success).
 */
static bool supdrvTscDeltaSync2_Before(PSUPTSCDELTASYNC2 pMySync, PSUPTSCDELTASYNC2 pOtherSync,
                                       bool fIsMaster, PRTCCUINTREG pfEFlags)
{
    uint32_t iMySeq = fIsMaster ? 0 : 256;
    uint32_t const iMaxSeq = iMySeq + 16; /* For the last loop, darn linux/freebsd C-ishness. */
    uint32_t u32Tmp;
    uint32_t iSync2Loops = 0;
    RTCCUINTREG fEFlags;
    TSCDELTA_DBG_VARS();

    *pfEFlags = X86_EFL_IF | X86_EFL_1; /* should shut up most nagging compilers. */

    /*
     * The master tells the worker to get on its mark.
     */
    if (fIsMaster)
    {
        if (RT_LIKELY(ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_STEADY, GIP_TSC_DELTA_SYNC2_READY)))
        { /* likely */ }
        else
            return false;
    }

    /*
     * Wait for the on your mark signal (ack in the master case). We process timeouts here.
     */
    ASMAtomicWriteU32(&(pMySync)->uSyncSeq, 0);
    for (;;)
    {
        fEFlags = ASMIntDisableFlags();
        u32Tmp = ASMAtomicReadU32(&pMySync->uSyncVar);
        if (u32Tmp == GIP_TSC_DELTA_SYNC2_STEADY)
            break;

        ASMSetFlags(fEFlags);
        ASMNopPause();

        /* Abort? */
        if (u32Tmp != GIP_TSC_DELTA_SYNC2_READY)
            break;

        /* Check for timeouts every so often (not every loop in case RDTSC is
           trapping or something).  Must check the first time around. */
#if 0 /* For debugging the timeout paths. */
        static uint32_t volatile xxx;
#endif
        if (   (   (iSync2Loops & 0x3ff) == 0
                && ASMReadTSC() - pMySync->uTscStart > pMySync->cMaxTscTicks)
#if 0 /* This is crazy, I know, but enable this code and the results are markedly better when enabled on the 1.4GHz AMD (debug). */
            || (!fIsMaster && (++xxx & 0xf) == 0)
#endif
           )
        {
            /* Try switch our own state into timeout mode so the master cannot tell us to 'GO',
               ignore the timeout if we've got the go ahead already (simpler). */
            if (ASMAtomicCmpXchgU32(&pMySync->uSyncVar, GIP_TSC_DELTA_SYNC2_TIMEOUT, GIP_TSC_DELTA_SYNC2_READY))
            {
                ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_TIMEOUT, GIP_TSC_DELTA_SYNC2_STEADY);
                return false;
            }
        }
        iSync2Loops++;
    }

    /*
     * Interrupts are now disabled and will remain disabled until we do
     * TSCDELTA_MASTER_SYNC_AFTER / TSCDELTA_OTHER_SYNC_AFTER.
     */
    *pfEFlags = fEFlags;

    /*
     * The worker tells the master that it is on its mark and that the master
     * needs to get into position as well.
     */
    if (!fIsMaster)
    {
        if (RT_LIKELY(ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_STEADY, GIP_TSC_DELTA_SYNC2_READY)))
        { /* likely */ }
        else
        {
            ASMSetFlags(fEFlags);
            return false;
        }
    }

    /*
     * The master sends the 'go' to the worker and waits for ACK.
     */
    if (fIsMaster)
    {
        if (RT_LIKELY(ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO, GIP_TSC_DELTA_SYNC2_STEADY)))
        { /* likely */ }
        else
        {
            ASMSetFlags(fEFlags);
            return false;
        }
    }

    /*
     * Wait for the 'go' signal (ack in the master case).
     */
    TSCDELTA_DBG_START_LOOP();
    for (;;)
    {
        u32Tmp = ASMAtomicReadU32(&pMySync->uSyncVar);
        if (u32Tmp == GIP_TSC_DELTA_SYNC2_GO)
            break;
        if (RT_LIKELY(u32Tmp == GIP_TSC_DELTA_SYNC2_STEADY))
        { /* likely */ }
        else
        {
            ASMSetFlags(fEFlags);
            return false;
        }

        TSCDELTA_DBG_CHECK_LOOP();
        ASMNopPause();
    }

    /*
     * The worker acks the 'go' (shouldn't fail).
     */
    if (!fIsMaster)
    {
        if (RT_LIKELY(ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO, GIP_TSC_DELTA_SYNC2_STEADY)))
        { /* likely */ }
        else
        {
            ASMSetFlags(fEFlags);
            return false;
        }
    }

    /*
     * Try enter mostly lockstep execution with it.
     */
    for (;;)
    {
        uint32_t iOtherSeq1, iOtherSeq2;
        ASMCompilerBarrier();
        ASMSerializeInstruction();

        ASMAtomicWriteU32(&pMySync->uSyncSeq, iMySeq);
        ASMNopPause();
        iOtherSeq1 = ASMAtomicXchgU32(&pOtherSync->uSyncSeq, iMySeq);
        ASMNopPause();
        iOtherSeq2 = ASMAtomicReadU32(&pMySync->uSyncSeq);

        ASMCompilerBarrier();
        /* If the other CPU echoed our sequence number back, we're in step. */
        if (iOtherSeq1 == iOtherSeq2)
            return true;

        /* Did the other guy give up? Should we give up? */
        if (   iOtherSeq1 == UINT32_MAX
            || iOtherSeq2 == UINT32_MAX)
            return true;
        if (++iMySeq >= iMaxSeq)
        {
            ASMAtomicWriteU32(&pMySync->uSyncSeq, UINT32_MAX);
            return true;
        }
        ASMNopPause();
    }
}
2831
/** Master side of the pre-measurement handshake; issues 'break' on failure.
 * @note Relies on a local variable 'uFlags' (RTCCUINTREG) in the caller. */
#define TSCDELTA_MASTER_SYNC_BEFORE(a_pMySync, a_pOtherSync) \
    do { \
        if (RT_LIKELY(supdrvTscDeltaSync2_Before(a_pMySync, a_pOtherSync, true /*fMaster*/, &uFlags))) \
        { /*likely*/ } \
        else break; \
    } while (0)
/** Worker side of the pre-measurement handshake; issues 'break' on failure.
 * @note Relies on a local variable 'uFlags' (RTCCUINTREG) in the caller. */
#define TSCDELTA_OTHER_SYNC_BEFORE(a_pMySync, a_pOtherSync) \
    do { \
        if (RT_LIKELY(supdrvTscDeltaSync2_Before(a_pMySync, a_pOtherSync, false /*fMaster*/, &uFlags))) \
        { /*likely*/ } \
        else break; \
    } while (0)
2844
2845
/**
 * Post-measurement synchronization step: restores EFLAGS (re-enabling
 * interrupts if they were on) and waits for the other party to signal 'ready'
 * for the next round.
 *
 * @returns true when the 'ready' state is reached, false if the sync
 *          variable holds anything other than GO/READY (shouldn't happen).
 * @param   pMySync     This CPU's synchronization structure.
 * @param   pOtherSync  The other CPU's synchronization structure (currently
 *                      unused here, kept for symmetry with the _Before call).
 * @param   fEFlags     The EFLAGS value saved by supdrvTscDeltaSync2_Before.
 */
static bool supdrvTscDeltaSync2_After(PSUPTSCDELTASYNC2 pMySync, PSUPTSCDELTASYNC2 pOtherSync, RTCCUINTREG fEFlags)
{
    TSCDELTA_DBG_VARS();

    /*
     * Wait for the 'ready' signal.  In the master's case, this means the
     * worker has completed its data collection, while in the worker's case it
     * means the master is done processing the data and it's time for the next
     * loop iteration (or whatever).
     */
    ASMSetFlags(fEFlags);
    TSCDELTA_DBG_START_LOOP();
    for (;;)
    {
        uint32_t u32Tmp = ASMAtomicReadU32(&pMySync->uSyncVar);
        if (u32Tmp == GIP_TSC_DELTA_SYNC2_READY)
            return true;
        ASMNopPause();
        if (u32Tmp != GIP_TSC_DELTA_SYNC2_GO)
            return false; /* shouldn't ever happen! */
        TSCDELTA_DBG_CHECK_LOOP();
        ASMNopPause();
    }
}
2870
/** Master side of the post-measurement step; issues 'break' on failure.
 * @note Relies on a local variable 'uFlags' (RTCCUINTREG) in the caller. */
#define TSCDELTA_MASTER_SYNC_AFTER(a_pMySync, a_pOtherSync) \
    do { \
        if (supdrvTscDeltaSync2_After(a_pMySync, a_pOtherSync, uFlags)) \
        { /* likely */ } \
        else break; \
    } while (0)

/** Master side: release the worker for the next round; issues 'break' (after
 * restoring EFLAGS) if the worker is no longer in the 'GO' state.
 * @note Relies on a local variable 'uFlags' (RTCCUINTREG) in the caller. */
#define TSCDELTA_MASTER_KICK_OTHER_OUT_OF_AFTER(a_pMySync, a_pOtherSync) \
    do {\
        /* \
         * Tell the worker that we're done processing the data and ready for the next round. \
         */ \
        if (!ASMAtomicCmpXchgU32(&(a_pOtherSync)->uSyncVar, GIP_TSC_DELTA_SYNC2_READY, GIP_TSC_DELTA_SYNC2_GO)) \
        { \
            ASMSetFlags(uFlags); \
            break; \
        } \
    } while (0)

/** Worker side of the post-measurement step; issues 'break' on failure.
 * @note Relies on a local variable 'uFlags' (RTCCUINTREG) in the caller. */
#define TSCDELTA_OTHER_SYNC_AFTER(a_pMySync, a_pOtherSync) \
    do { \
        /* \
         * Tell the master that we're done collecting data and wait for the next round to start. \
         */ \
        if (!ASMAtomicCmpXchgU32(&(a_pOtherSync)->uSyncVar, GIP_TSC_DELTA_SYNC2_READY, GIP_TSC_DELTA_SYNC2_GO)) \
        { \
            ASMSetFlags(uFlags); \
            break; \
        } \
        if (supdrvTscDeltaSync2_After(a_pMySync, a_pOtherSync, uFlags)) \
        { /* likely */ } \
        else break; \
    } while (0)
/** @} */
2905
2906#ifdef GIP_TSC_DELTA_METHOD_1
2907
2908/**
2909 * TSC delta measurment algorithm \#1 (GIP_TSC_DELTA_METHOD_1).
2910 *
2911 *
2912 * We ignore the first few runs of the loop in order to prime the
2913 * cache. Also, we need to be careful about using 'pause' instruction
2914 * in critical busy-wait loops in this code - it can cause undesired
2915 * behaviour with hyperthreading.
2916 *
2917 * We try to minimize the measurement error by computing the minimum
2918 * read time of the compare statement in the worker by taking TSC
2919 * measurements across it.
2920 *
2921 * It must be noted that the computed minimum read time is mostly to
2922 * eliminate huge deltas when the worker is too early and doesn't by
2923 * itself help produce more accurate deltas. We allow two times the
2924 * computed minimum as an arbibtrary acceptable threshold. Therefore,
2925 * it is still possible to get negative deltas where there are none
2926 * when the worker is earlier. As long as these occasional negative
2927 * deltas are lower than the time it takes to exit guest-context and
2928 * the OS to reschedule EMT on a different CPU we won't expose a TSC
2929 * that jumped backwards. It is because of the existence of the
2930 * negative deltas we don't recompute the delta with the master and
2931 * worker interchanged to eliminate the remaining measurement error.
2932 *
2933 *
2934 * @param pArgs The argument/state data.
2935 * @param pMySync My synchronization structure.
2936 * @param pOtherSync My partner's synchronization structure.
2937 * @param fIsMaster Set if master, clear if worker.
2938 * @param iTry The attempt number.
2939 */
2940static void supdrvTscDeltaMethod1Loop(PSUPDRVGIPTSCDELTARGS pArgs, PSUPTSCDELTASYNC2 pMySync, PSUPTSCDELTASYNC2 pOtherSync,
2941 bool fIsMaster, uint32_t iTry)
2942{
2943 PSUPGIPCPU pGipCpuWorker = pArgs->pWorker;
2944 PSUPGIPCPU pGipCpuMaster = pArgs->pMaster;
2945 uint64_t uMinCmpReadTime = UINT64_MAX;
2946 unsigned iLoop;
2947 NOREF(iTry);
2948
2949 for (iLoop = 0; iLoop < GIP_TSC_DELTA_LOOPS; iLoop++)
2950 {
2951 RTCCUINTREG uFlags;
2952 if (fIsMaster)
2953 {
2954 /*
2955 * The master.
2956 */
2957 AssertMsg(pGipCpuMaster->u64TSCSample == GIP_TSC_DELTA_RSVD,
2958 ("%#llx idMaster=%#x idWorker=%#x (idGipMaster=%#x)\n",
2959 pGipCpuMaster->u64TSCSample, pGipCpuMaster->idCpu, pGipCpuWorker->idCpu, pArgs->pDevExt->idGipMaster));
2960 TSCDELTA_MASTER_SYNC_BEFORE(pMySync, pOtherSync);
2961
2962 do
2963 {
2964 ASMSerializeInstruction();
2965 ASMAtomicWriteU64(&pGipCpuMaster->u64TSCSample, ASMReadTSC());
2966 } while (pGipCpuMaster->u64TSCSample == GIP_TSC_DELTA_RSVD);
2967
2968 TSCDELTA_MASTER_SYNC_AFTER(pMySync, pOtherSync);
2969
2970 /* Process the data. */
2971 if (iLoop > GIP_TSC_DELTA_PRIMER_LOOPS + GIP_TSC_DELTA_READ_TIME_LOOPS)
2972 {
2973 if (pGipCpuWorker->u64TSCSample != GIP_TSC_DELTA_RSVD)
2974 {
2975 int64_t iDelta = pGipCpuWorker->u64TSCSample
2976 - (pGipCpuMaster->u64TSCSample - pGipCpuMaster->i64TSCDelta);
2977 if ( iDelta >= GIP_TSC_DELTA_INITIAL_MASTER_VALUE
2978 ? iDelta < pGipCpuWorker->i64TSCDelta
2979 : iDelta > pGipCpuWorker->i64TSCDelta || pGipCpuWorker->i64TSCDelta == INT64_MAX)
2980 pGipCpuWorker->i64TSCDelta = iDelta;
2981 }
2982 }
2983
2984 /* Reset our TSC sample and tell the worker to move on. */
2985 ASMAtomicWriteU64(&pGipCpuMaster->u64TSCSample, GIP_TSC_DELTA_RSVD);
2986 TSCDELTA_MASTER_KICK_OTHER_OUT_OF_AFTER(pMySync, pOtherSync);
2987 }
2988 else
2989 {
2990 /*
2991 * The worker.
2992 */
2993 uint64_t uTscWorker;
2994 uint64_t uTscWorkerFlushed;
2995 uint64_t uCmpReadTime;
2996
2997 ASMAtomicReadU64(&pGipCpuMaster->u64TSCSample); /* Warm the cache line. */
2998 TSCDELTA_OTHER_SYNC_BEFORE(pMySync, pOtherSync);
2999
3000 /*
3001 * Keep reading the TSC until we notice that the master has read his. Reading
3002 * the TSC -after- the master has updated the memory is way too late. We thus
3003 * compensate by trying to measure how long it took for the worker to notice
3004 * the memory flushed from the master.
3005 */
3006 do
3007 {
3008 ASMSerializeInstruction();
3009 uTscWorker = ASMReadTSC();
3010 } while (pGipCpuMaster->u64TSCSample == GIP_TSC_DELTA_RSVD);
3011 ASMSerializeInstruction();
3012 uTscWorkerFlushed = ASMReadTSC();
3013
3014 uCmpReadTime = uTscWorkerFlushed - uTscWorker;
3015 if (iLoop > GIP_TSC_DELTA_PRIMER_LOOPS + GIP_TSC_DELTA_READ_TIME_LOOPS)
3016 {
3017 /* This is totally arbitrary a.k.a I don't like it but I have no better ideas for now. */
3018 if (uCmpReadTime < (uMinCmpReadTime << 1))
3019 {
3020 ASMAtomicWriteU64(&pGipCpuWorker->u64TSCSample, uTscWorker);
3021 if (uCmpReadTime < uMinCmpReadTime)
3022 uMinCmpReadTime = uCmpReadTime;
3023 }
3024 else
3025 ASMAtomicWriteU64(&pGipCpuWorker->u64TSCSample, GIP_TSC_DELTA_RSVD);
3026 }
3027 else if (iLoop > GIP_TSC_DELTA_PRIMER_LOOPS)
3028 {
3029 if (uCmpReadTime < uMinCmpReadTime)
3030 uMinCmpReadTime = uCmpReadTime;
3031 }
3032
3033 TSCDELTA_OTHER_SYNC_AFTER(pMySync, pOtherSync);
3034 }
3035 }
3036
3037 /*
3038 * We must reset the worker TSC sample value in case it gets picked as a
3039 * GIP master later on (it's trashed above, naturally).
3040 */
3041 if (!fIsMaster)
3042 ASMAtomicWriteU64(&pGipCpuWorker->u64TSCSample, GIP_TSC_DELTA_RSVD);
3043}
3044
3045
3046/**
3047 * Initializes the argument/state data belonging to algorithm \#1.
3048 *
3049 * @returns VBox status code.
3050 * @param pArgs The argument/state data.
3051 */
3052static int supdrvTscDeltaMethod1Init(PSUPDRVGIPTSCDELTARGS pArgs)
3053{
3054 NOREF(pArgs);
3055 return VINF_SUCCESS;
3056}
3057
3058
3059/**
3060 * Undoes what supdrvTscDeltaMethod1Init() did.
3061 *
3062 * @param pArgs The argument/state data.
3063 */
3064static void supdrvTscDeltaMethod1Delete(PSUPDRVGIPTSCDELTARGS pArgs)
3065{
3066 NOREF(pArgs);
3067}
3068
3069#endif /* GIP_TSC_DELTA_METHOD_1 */
3070
3071
3072#ifdef GIP_TSC_DELTA_METHOD_2
3073/*
3074 * TSC delta measurement algorithm \#2 configuration and code - Experimental!!
3075 */
3076
/** Total number of sync/collect rounds for method \#2, including priming. */
# define GIP_TSC_DELTA_M2_LOOPS (12 + GIP_TSC_DELTA_M2_PRIMER_LOOPS)
/** Number of initial priming rounds whose data is not processed (see the
 *  iLoop >= GIP_TSC_DELTA_M2_PRIMER_LOOPS check in supdrvTscDeltaMethod2Loop). */
# define GIP_TSC_DELTA_M2_PRIMER_LOOPS 1
3079
3080
/**
 * Processes the data collected by supdrvTscDeltaMethod2CollectData on the
 * master, cross-referencing the master and worker sample tables and updating
 * the worker's TSC delta with the best (smallest magnitude) match found.
 *
 * May request early termination of the measurement via pArgs->M2.fQuitEarly
 * when the delta looks good enough.
 *
 * @param   pArgs               The argument/state data.
 * @param   iLoop               The current (master) loop iteration number.
 */
static void supdrvTscDeltaMethod2ProcessDataOnMaster(PSUPDRVGIPTSCDELTARGS pArgs, uint32_t iLoop)
{
    PSUPDRVTSCDELTAMETHOD2 pMasterData = pArgs->M2.pMasterData;
    PSUPDRVTSCDELTAMETHOD2 pOtherData = pArgs->M2.pWorkerData;
    int64_t iMasterTscDelta = pArgs->pMaster->i64TSCDelta;
    int64_t iBestDelta = pArgs->pWorker->i64TSCDelta;
    uint32_t idxResult;
    uint32_t cHits = 0;

    /*
     * Look for matching entries in the master and worker tables.
     */
    for (idxResult = 0; idxResult < RT_ELEMENTS(pMasterData->aResults); idxResult++)
    {
        uint32_t idxOther = pMasterData->aResults[idxResult].iSeqOther;
        if (idxOther & 1) /* odd = the other CPU was mid-sample when we read its seq no (see CollectData) */
        {
            idxOther >>= 1; /* map sequence number back to a table index */
            if (idxOther < RT_ELEMENTS(pOtherData->aResults))
            {
                /* Require the cross-reference to hold in both directions. */
                if (pOtherData->aResults[idxOther].iSeqOther == pMasterData->aResults[idxResult].iSeqMine)
                {
                    int64_t iDelta;
                    iDelta = pOtherData->aResults[idxOther].uTsc
                           - (pMasterData->aResults[idxResult].uTsc - iMasterTscDelta);
                    /* Keep the delta closest to the initial (zero) master value. */
                    if (  iDelta >= GIP_TSC_DELTA_INITIAL_MASTER_VALUE
                        ? iDelta < iBestDelta
                        : iDelta > iBestDelta || iBestDelta == INT64_MAX)
                        iBestDelta = iDelta;
                    cHits++;
                }
            }
        }
    }

    /*
     * Save the results.
     */
    if (cHits > 2) /* require a few corroborating matches before trusting the value */
        pArgs->pWorker->i64TSCDelta = iBestDelta;
    pArgs->M2.cHits += cHits;

    /*
     * Check and see if we can quit a little early.  If the result is already
     * extremely good (+/-16 ticks seems reasonable), just stop.
     */
    if (  iBestDelta >= 0 + GIP_TSC_DELTA_INITIAL_MASTER_VALUE
        ? iBestDelta <= 16 + GIP_TSC_DELTA_INITIAL_MASTER_VALUE
        : iBestDelta >= -16 + GIP_TSC_DELTA_INITIAL_MASTER_VALUE)
    {
        /*SUPR0Printf("quitting early #1: hits=%#x iLoop=%d iBestDelta=%lld\n", cHits, iLoop, iBestDelta);*/
        ASMAtomicWriteBool(&pArgs->M2.fQuitEarly, true);
    }
    /*
     * After a while, just stop if we get sufficient hits.
     */
    else if (  iLoop >= GIP_TSC_DELTA_M2_LOOPS / 3
            && cHits > 8)
    {
        uint32_t const cHitsNeeded = GIP_TSC_DELTA_M2_LOOPS * RT_ELEMENTS(pArgs->M2.pMasterData->aResults) / 4; /* 25% */
        if (  pArgs->M2.cHits >= cHitsNeeded
            && (  iBestDelta >= 0 + GIP_TSC_DELTA_INITIAL_MASTER_VALUE
                ? iBestDelta <= GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO + GIP_TSC_DELTA_INITIAL_MASTER_VALUE
                : iBestDelta >= -GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO + GIP_TSC_DELTA_INITIAL_MASTER_VALUE) )
        {
            /*SUPR0Printf("quitting early hits=%#x (%#x) needed=%#x iLoop=%d iBestDelta=%lld\n",
                        pArgs->M2.cHits, cHits, cHitsNeeded, iLoop, iBestDelta);*/
            ASMAtomicWriteBool(&pArgs->M2.fQuitEarly, true);
        }
    }
}
3152
3153
3154/**
3155 * The core function of the 2nd TSC delta mesurment algorithm.
3156 *
3157 * The idea here is that we have the two CPUs execute the exact same code
3158 * collecting a largish set of TSC samples. The code has one data dependency on
3159 * the other CPU which intention it is to synchronize the execution as well as
3160 * help cross references the two sets of TSC samples (the sequence numbers).
3161 *
3162 * The @a fLag parameter is used to modify the execution a tiny bit on one or
3163 * both of the CPUs. When @a fLag differs between the CPUs, it is thought that
3164 * it will help with making the CPUs enter lock step execution occationally.
3165 *
3166 */
3167static void supdrvTscDeltaMethod2CollectData(PSUPDRVTSCDELTAMETHOD2 pMyData, uint32_t volatile *piOtherSeqNo, bool fLag)
3168{
3169 SUPDRVTSCDELTAMETHOD2ENTRY *pEntry = &pMyData->aResults[0];
3170 uint32_t cLeft = RT_ELEMENTS(pMyData->aResults);
3171
3172 ASMAtomicWriteU32(&pMyData->iCurSeqNo, 0);
3173 ASMSerializeInstruction();
3174 while (cLeft-- > 0)
3175 {
3176 uint64_t uTsc;
3177 uint32_t iSeqMine = ASMAtomicIncU32(&pMyData->iCurSeqNo);
3178 uint32_t iSeqOther = ASMAtomicReadU32(piOtherSeqNo);
3179 ASMCompilerBarrier();
3180 ASMSerializeInstruction(); /* Way better result than with ASMMemoryFenceSSE2() in this position! */
3181 uTsc = ASMReadTSC();
3182 ASMAtomicIncU32(&pMyData->iCurSeqNo);
3183 ASMCompilerBarrier();
3184 ASMSerializeInstruction();
3185 pEntry->iSeqMine = iSeqMine;
3186 pEntry->iSeqOther = iSeqOther;
3187 pEntry->uTsc = uTsc;
3188 pEntry++;
3189 ASMSerializeInstruction();
3190 if (fLag)
3191 ASMNopPause();
3192 }
3193}
3194
3195
3196/**
3197 * TSC delta measurment algorithm \#2 (GIP_TSC_DELTA_METHOD_2).
3198 *
3199 * See supdrvTscDeltaMethod2CollectData for algorithm details.
3200 *
3201 * @param pArgs The argument/state data.
3202 * @param pMySync My synchronization structure.
3203 * @param pOtherSync My partner's synchronization structure.
3204 * @param fIsMaster Set if master, clear if worker.
3205 * @param iTry The attempt number.
3206 */
3207static void supdrvTscDeltaMethod2Loop(PSUPDRVGIPTSCDELTARGS pArgs, PSUPTSCDELTASYNC2 pMySync, PSUPTSCDELTASYNC2 pOtherSync,
3208 bool fIsMaster, uint32_t iTry)
3209{
3210 unsigned iLoop;
3211
3212 if (fIsMaster)
3213 ASMAtomicWriteBool(&pArgs->M2.fQuitEarly, false);
3214
3215 for (iLoop = 0; iLoop < GIP_TSC_DELTA_M2_LOOPS; iLoop++)
3216 {
3217 RTCCUINTREG uFlags;
3218 if (fIsMaster)
3219 {
3220 /*
3221 * Adjust the loop lag fudge.
3222 */
3223# if GIP_TSC_DELTA_M2_PRIMER_LOOPS > 0
3224 if (iLoop < GIP_TSC_DELTA_M2_PRIMER_LOOPS)
3225 {
3226 /* Lag during the priming to be nice to everyone.. */
3227 pArgs->M2.fLagMaster = true;
3228 pArgs->M2.fLagWorker = true;
3229 }
3230 else
3231# endif
3232 if (iLoop < (GIP_TSC_DELTA_M2_LOOPS - GIP_TSC_DELTA_M2_PRIMER_LOOPS) / 4)
3233 {
3234 /* 25 % of the body without lagging. */
3235 pArgs->M2.fLagMaster = false;
3236 pArgs->M2.fLagWorker = false;
3237 }
3238 else if (iLoop < (GIP_TSC_DELTA_M2_LOOPS - GIP_TSC_DELTA_M2_PRIMER_LOOPS) / 4 * 2)
3239 {
3240 /* 25 % of the body with both lagging. */
3241 pArgs->M2.fLagMaster = true;
3242 pArgs->M2.fLagWorker = true;
3243 }
3244 else
3245 {
3246 /* 50% of the body with alternating lag. */
3247 pArgs->M2.fLagMaster = (iLoop & 1) == 0;
3248 pArgs->M2.fLagWorker = (iLoop & 1) == 1;
3249 }
3250
3251 /*
3252 * Sync up with the worker and collect data.
3253 */
3254 TSCDELTA_MASTER_SYNC_BEFORE(pMySync, pOtherSync);
3255 supdrvTscDeltaMethod2CollectData(pArgs->M2.pMasterData, &pArgs->M2.pWorkerData->iCurSeqNo, pArgs->M2.fLagMaster);
3256 TSCDELTA_MASTER_SYNC_AFTER(pMySync, pOtherSync);
3257
3258 /*
3259 * Process the data.
3260 */
3261# if GIP_TSC_DELTA_M2_PRIMER_LOOPS > 0
3262 if (iLoop >= GIP_TSC_DELTA_M2_PRIMER_LOOPS)
3263# endif
3264 supdrvTscDeltaMethod2ProcessDataOnMaster(pArgs, iLoop);
3265
3266 TSCDELTA_MASTER_KICK_OTHER_OUT_OF_AFTER(pMySync, pOtherSync);
3267 }
3268 else
3269 {
3270 /*
3271 * The worker.
3272 */
3273 TSCDELTA_OTHER_SYNC_BEFORE(pMySync, pOtherSync);
3274 supdrvTscDeltaMethod2CollectData(pArgs->M2.pWorkerData, &pArgs->M2.pMasterData->iCurSeqNo, pArgs->M2.fLagWorker);
3275 TSCDELTA_OTHER_SYNC_AFTER(pMySync, pOtherSync);
3276 }
3277
3278 if (ASMAtomicReadBool(&pArgs->M2.fQuitEarly))
3279 break;
3280
3281 }
3282}
3283
3284
3285/**
3286 * Initializes the argument/state data belonging to algorithm \#2.
3287 *
3288 * @returns VBox status code.
3289 * @param pArgs The argument/state data.
3290 */
3291static int supdrvTscDeltaMethod2Init(PSUPDRVGIPTSCDELTARGS pArgs)
3292{
3293 pArgs->M2.pMasterData = NULL;
3294 pArgs->M2.pWorkerData = NULL;
3295
3296 uint32_t const fFlags = /*RTMEMALLOCEX_FLAGS_ANY_CTX |*/ RTMEMALLOCEX_FLAGS_ZEROED;
3297 int rc = RTMemAllocEx(sizeof(*pArgs->M2.pWorkerData), 0, fFlags, (void **)&pArgs->M2.pWorkerData);
3298 if (RT_SUCCESS(rc))
3299 rc = RTMemAllocEx(sizeof(*pArgs->M2.pMasterData), 0, fFlags, (void **)&pArgs->M2.pMasterData);
3300 return rc;
3301}
3302
3303
3304/**
3305 * Undoes what supdrvTscDeltaMethod2Init() did.
3306 *
3307 * @param pArgs The argument/state data.
3308 */
3309static void supdrvTscDeltaMethod2Delete(PSUPDRVGIPTSCDELTARGS pArgs)
3310{
3311 RTMemFreeEx(pArgs->M2.pMasterData, sizeof(*pArgs->M2.pMasterData));
3312 RTMemFreeEx(pArgs->M2.pWorkerData, sizeof(*pArgs->M2.pWorkerData));
3313# if 0
3314 SUPR0Printf("cHits=%d m=%d w=%d\n", pArgs->M2.cHits, pArgs->pMaster->idApic, pArgs->pWorker->idApic);
3315# endif
3316}
3317
3318
3319#endif /* GIP_TSC_DELTA_METHOD_2 */
3320
3321
3322
/**
 * Verification run: master and worker take turns reading the TSC in a tight
 * ping-pong, then the master checks that the delta-adjusted readings are
 * monotonically increasing across the two CPUs.
 *
 * Note! The TSCDELTA_*_SYNC_* macros contain a hidden 'break' which, on sync
 *       failure/timeout, falls out of the for(;;) to the VERR_TIMEOUT path.
 *
 * @returns pArgs->rcVerify on a completed run (VINF_SUCCESS or
 *          VERR_OUT_OF_RANGE, computed by the master), VERR_TIMEOUT if the
 *          synchronization dance failed (rcVerify set to VERR_TRY_AGAIN).
 * @param   pArgs               The argument/state data.
 * @param   pMySync             My synchronization structure.
 * @param   pOtherSync          My partner's synchronization structure.
 * @param   fIsMaster           Set if master, clear if worker.
 * @param   iWorkerTscDelta     The worker TSC delta to verify (e.g. zero for
 *                              the initial zero-delta probe).
 */
static int supdrvTscDeltaVerify(PSUPDRVGIPTSCDELTARGS pArgs, PSUPTSCDELTASYNC2 pMySync,
                                PSUPTSCDELTASYNC2 pOtherSync, bool fIsMaster, int64_t iWorkerTscDelta)
{
    PSUPGIPCPU pGipCpuWorker = pArgs->pWorker;
    PSUPGIPCPU pGipCpuMaster = pArgs->pMaster;
    uint32_t i;
    TSCDELTA_DBG_VARS();

    for (;;)
    {
        RTCCUINTREG uFlags; /* filled in by the *_SYNC_BEFORE macros, consumed by *_SYNC_AFTER */
        AssertCompile((RT_ELEMENTS(pArgs->auVerifyMasterTscs) & 1) == 0);
        AssertCompile(RT_ELEMENTS(pArgs->auVerifyWorkerTscs) == RT_ELEMENTS(pArgs->auVerifyMasterTscs));

        if (fIsMaster)
        {
            uint64_t uTscWorker;
            TSCDELTA_MASTER_SYNC_BEFORE(pMySync, pOtherSync);

            /*
             * Collect TSC, master goes first.  GO_GO/GO on uSyncVar act as the
             * ping-pong token between the two CPUs.
             */
            for (i = 0; i < RT_ELEMENTS(pArgs->auVerifyMasterTscs); i += 2)
            {
                /* Read, kick & wait #1. */
                uint64_t register uTsc = ASMReadTSC();
                ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO_GO);
                ASMSerializeInstruction();
                pArgs->auVerifyMasterTscs[i] = uTsc;
                TSCDELTA_DBG_START_LOOP();
                while (ASMAtomicReadU32(&pMySync->uSyncVar) == GIP_TSC_DELTA_SYNC2_GO)
                {
                    TSCDELTA_DBG_CHECK_LOOP();
                    ASMNopPause();
                }

                /* Read, kick & wait #2. */
                uTsc = ASMReadTSC();
                ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO);
                ASMSerializeInstruction();
                pArgs->auVerifyMasterTscs[i + 1] = uTsc;
                TSCDELTA_DBG_START_LOOP();
                while (ASMAtomicReadU32(&pMySync->uSyncVar) == GIP_TSC_DELTA_SYNC2_GO_GO)
                {
                    TSCDELTA_DBG_CHECK_LOOP();
                    ASMNopPause();
                }
            }

            TSCDELTA_MASTER_SYNC_AFTER(pMySync, pOtherSync);

            /*
             * Process the data: the readings alternated master/worker, so each
             * delta-adjusted pair must be increasing; any negative difference
             * means the assumed worker delta is wrong.
             */
            pArgs->cMaxVerifyTscTicks = INT64_MIN;
            pArgs->cMinVerifyTscTicks = INT64_MAX;
            pArgs->iVerifyBadTscDiff = 0;
            ASMAtomicWriteS32(&pArgs->rcVerify, VINF_SUCCESS);
            uTscWorker = 0;
            for (i = 0; i < RT_ELEMENTS(pArgs->auVerifyMasterTscs); i++)
            {
                /* Master vs previous worker entry. */
                uint64_t uTscMaster = pArgs->auVerifyMasterTscs[i] - pGipCpuMaster->i64TSCDelta;
                int64_t iDiff;
                if (i > 0)
                {
                    iDiff = uTscMaster - uTscWorker;
                    if (iDiff > pArgs->cMaxVerifyTscTicks)
                        pArgs->cMaxVerifyTscTicks = iDiff;
                    if (iDiff < pArgs->cMinVerifyTscTicks)
                        pArgs->cMinVerifyTscTicks = iDiff;
                    if (iDiff < 0)
                    {
                        pArgs->iVerifyBadTscDiff = -iDiff;
                        ASMAtomicWriteS32(&pArgs->rcVerify, VERR_OUT_OF_RANGE);
                        break;
                    }
                }

                /* Worker vs master. */
                uTscWorker = pArgs->auVerifyWorkerTscs[i] - iWorkerTscDelta;
                iDiff = uTscWorker - uTscMaster;
                if (iDiff > pArgs->cMaxVerifyTscTicks)
                    pArgs->cMaxVerifyTscTicks = iDiff;
                if (iDiff < pArgs->cMinVerifyTscTicks)
                    pArgs->cMinVerifyTscTicks = iDiff;
                if (iDiff < 0)
                {
                    pArgs->iVerifyBadTscDiff = iDiff;
                    ASMAtomicWriteS32(&pArgs->rcVerify, VERR_OUT_OF_RANGE);
                    break;
                }
            }

            /* Done. */
            TSCDELTA_MASTER_KICK_OTHER_OUT_OF_AFTER(pMySync, pOtherSync);
        }
        else
        {
            /*
             * The worker, master leads.
             */
            TSCDELTA_OTHER_SYNC_BEFORE(pMySync, pOtherSync);

            for (i = 0; i < RT_ELEMENTS(pArgs->auVerifyWorkerTscs); i += 2)
            {
                uint64_t register uTsc;

                /* Wait, Read and Kick #1. */
                TSCDELTA_DBG_START_LOOP();
                while (ASMAtomicReadU32(&pMySync->uSyncVar) == GIP_TSC_DELTA_SYNC2_GO)
                {
                    TSCDELTA_DBG_CHECK_LOOP();
                    ASMNopPause();
                }
                uTsc = ASMReadTSC();
                ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO_GO);
                ASMSerializeInstruction();
                pArgs->auVerifyWorkerTscs[i] = uTsc;

                /* Wait, Read and Kick #2. */
                TSCDELTA_DBG_START_LOOP();
                while (ASMAtomicReadU32(&pMySync->uSyncVar) == GIP_TSC_DELTA_SYNC2_GO_GO)
                {
                    TSCDELTA_DBG_CHECK_LOOP();
                    ASMNopPause();
                }
                uTsc = ASMReadTSC();
                ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO);
                ASMSerializeInstruction();
                pArgs->auVerifyWorkerTscs[i + 1] = uTsc;
            }

            TSCDELTA_OTHER_SYNC_AFTER(pMySync, pOtherSync);
        }
        return pArgs->rcVerify;
    }

    /*
     * Timed out, please retry.
     */
    ASMAtomicWriteS32(&pArgs->rcVerify, VERR_TRY_AGAIN);
    return VERR_TIMEOUT;
}
3467
3468
3469
3470/**
3471 * Handles the special abort procedure during synchronization setup in
3472 * supdrvMeasureTscDeltaCallbackUnwrapped().
3473 *
3474 * @returns 0 (dummy, ignored)
3475 * @param pArgs Pointer to argument/state data.
3476 * @param pMySync Pointer to my sync structure.
3477 * @param fIsMaster Set if we're the master, clear if worker.
3478 * @param fTimeout Set if it's a timeout.
3479 */
DECL_NO_INLINE(static, int)
supdrvMeasureTscDeltaCallbackAbortSyncSetup(PSUPDRVGIPTSCDELTARGS pArgs, PSUPTSCDELTASYNC2 pMySync, bool fIsMaster, bool fTimeout)
{
    /* Select my/other sync-pointer slots based on role.  Note: fTimeout is
       currently unused; it only documents the abort reason at the call site. */
    PSUPTSCDELTASYNC2 volatile *ppMySync = fIsMaster ? &pArgs->pSyncMaster : &pArgs->pSyncWorker;
    PSUPTSCDELTASYNC2 volatile *ppOtherSync = fIsMaster ? &pArgs->pSyncWorker : &pArgs->pSyncMaster;
    TSCDELTA_DBG_VARS();

    /*
     * Clear our sync pointer and make sure the abort flag is set.
     */
    ASMAtomicWriteNullPtr(ppMySync);
    ASMAtomicWriteBool(&pArgs->fAbortSetup, true);

    /*
     * Make sure the other party is out of there and won't be touching our
     * sync state again (would cause stack corruption).
     */
    TSCDELTA_DBG_START_LOOP();
    while (ASMAtomicReadPtrT(ppOtherSync, PSUPTSCDELTASYNC2) != NULL)
    {
        ASMNopPause();
        ASMNopPause();
        ASMNopPause();
        TSCDELTA_DBG_CHECK_LOOP();
    }

    return 0;
}
3508
3509
3510/**
3511 * This is used by supdrvMeasureInitialTscDeltas() to read the TSC on two CPUs
3512 * and compute the delta between them.
3513 *
3514 * To reduce code size a good when timeout handling was added, a dummy return
3515 * value had to be added (saves 1-3 lines per timeout case), thus this
3516 * 'Unwrapped' function and the dummy 0 return value.
3517 *
3518 * @returns 0 (dummy, ignored)
3519 * @param idCpu The CPU we are current scheduled on.
3520 * @param pArgs Pointer to a parameter package.
3521 *
3522 * @remarks Measuring TSC deltas between the CPUs is tricky because we need to
3523 * read the TSC at exactly the same time on both the master and the
3524 * worker CPUs. Due to DMA, bus arbitration, cache locality,
3525 * contention, SMI, pipelining etc. there is no guaranteed way of
3526 * doing this on x86 CPUs.
3527 */
3528static int supdrvMeasureTscDeltaCallbackUnwrapped(RTCPUID idCpu, PSUPDRVGIPTSCDELTARGS pArgs)
3529{
3530 PSUPDRVDEVEXT pDevExt = pArgs->pDevExt;
3531 PSUPGIPCPU pGipCpuWorker = pArgs->pWorker;
3532 PSUPGIPCPU pGipCpuMaster = pArgs->pMaster;
3533 bool const fIsMaster = idCpu == pGipCpuMaster->idCpu;
3534 uint32_t iTry;
3535 PSUPTSCDELTASYNC2 volatile *ppMySync = fIsMaster ? &pArgs->pSyncMaster : &pArgs->pSyncWorker;
3536 PSUPTSCDELTASYNC2 volatile *ppOtherSync = fIsMaster ? &pArgs->pSyncWorker : &pArgs->pSyncMaster;
3537 SUPTSCDELTASYNC2 MySync;
3538 PSUPTSCDELTASYNC2 pOtherSync;
3539 int rc;
3540 TSCDELTA_DBG_VARS();
3541
3542 /* A bit of paranoia first. */
3543 if (!pGipCpuMaster || !pGipCpuWorker)
3544 return 0;
3545
3546 /*
3547 * If the CPU isn't part of the measurement, return immediately.
3548 */
3549 if ( !fIsMaster
3550 && idCpu != pGipCpuWorker->idCpu)
3551 return 0;
3552
3553 /*
3554 * Set up my synchronization stuff and wait for the other party to show up.
3555 *
3556 * We don't wait forever since the other party may be off fishing (offline,
3557 * spinning with ints disables, whatever), we must play nice to the rest of
3558 * the system as this context generally isn't one in which we will get
3559 * preempted and we may hold up a number of lower priority interrupts.
3560 */
3561 ASMAtomicWriteU32(&MySync.uSyncVar, GIP_TSC_DELTA_SYNC2_PRESTART_WAIT);
3562 ASMAtomicWritePtr(ppMySync, &MySync);
3563 MySync.uTscStart = ASMReadTSC();
3564 MySync.cMaxTscTicks = pArgs->cMaxTscTicks;
3565
3566 /* Look for the partner, might not be here yet... Special abort considerations. */
3567 iTry = 0;
3568 TSCDELTA_DBG_START_LOOP();
3569 while ((pOtherSync = ASMAtomicReadPtrT(ppOtherSync, PSUPTSCDELTASYNC2)) == NULL)
3570 {
3571 ASMNopPause();
3572 if ( ASMAtomicReadBool(&pArgs->fAbortSetup)
3573 || !RTMpIsCpuOnline(fIsMaster ? pGipCpuWorker->idCpu : pGipCpuWorker->idCpu) )
3574 return supdrvMeasureTscDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, false /*fTimeout*/);
3575 if ( (iTry++ & 0xff) == 0
3576 && ASMReadTSC() - MySync.uTscStart > pArgs->cMaxTscTicks)
3577 return supdrvMeasureTscDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, true /*fTimeout*/);
3578 TSCDELTA_DBG_CHECK_LOOP();
3579 ASMNopPause();
3580 }
3581
3582 /* I found my partner, waiting to be found... Special abort considerations. */
3583 if (fIsMaster)
3584 if (!ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_READY, GIP_TSC_DELTA_SYNC2_PRESTART_WAIT)) /* parnaoia */
3585 return supdrvMeasureTscDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, false /*fTimeout*/);
3586
3587 iTry = 0;
3588 TSCDELTA_DBG_START_LOOP();
3589 while (ASMAtomicReadU32(&MySync.uSyncVar) == GIP_TSC_DELTA_SYNC2_PRESTART_WAIT)
3590 {
3591 ASMNopPause();
3592 if (ASMAtomicReadBool(&pArgs->fAbortSetup))
3593 return supdrvMeasureTscDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, false /*fTimeout*/);
3594 if ( (iTry++ & 0xff) == 0
3595 && ASMReadTSC() - MySync.uTscStart > pArgs->cMaxTscTicks)
3596 {
3597 if ( fIsMaster
3598 && !ASMAtomicCmpXchgU32(&MySync.uSyncVar, GIP_TSC_DELTA_SYNC2_PRESTART_ABORT, GIP_TSC_DELTA_SYNC2_PRESTART_WAIT))
3599 break; /* race #1: slave has moved on, handle timeout in loop instead. */
3600 return supdrvMeasureTscDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, true /*fTimeout*/);
3601 }
3602 TSCDELTA_DBG_CHECK_LOOP();
3603 }
3604
3605 if (!fIsMaster)
3606 if (!ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_READY, GIP_TSC_DELTA_SYNC2_PRESTART_WAIT)) /* race #1 */
3607 return supdrvMeasureTscDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, false /*fTimeout*/);
3608
3609/** @todo Add a resumable state to pArgs so we don't waste time if we time
3610 * out or something. Timeouts are legit, any of the two CPUs may get
3611 * interrupted. */
3612
3613 /*
3614 * Start by seeing if we have a zero delta between the two CPUs.
3615 * This should normally be the case.
3616 */
3617 rc = supdrvTscDeltaVerify(pArgs, &MySync, pOtherSync, fIsMaster, GIP_TSC_DELTA_INITIAL_MASTER_VALUE);
3618 if (RT_SUCCESS(rc))
3619 {
3620 if (fIsMaster)
3621 {
3622 ASMAtomicWriteS64(&pGipCpuWorker->i64TSCDelta, GIP_TSC_DELTA_INITIAL_MASTER_VALUE);
3623 RTCpuSetDelByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuMaster->iCpuSet);
3624 RTCpuSetAddByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpuMaster->iCpuSet);
3625 }
3626 else
3627 {
3628 RTCpuSetDelByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet);
3629 RTCpuSetAddByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpuWorker->iCpuSet);
3630 }
3631 }
3632 /*
3633 * If the verification didn't time out, do regular delta measurements.
3634 * We retry this until we get a reasonable value.
3635 */
3636 else if (rc != VERR_TIMEOUT)
3637 {
3638 Assert(pGipCpuWorker->i64TSCDelta == INT64_MAX);
3639 for (iTry = 0; iTry < 12; iTry++)
3640 {
3641 if (ASMAtomicReadU32(&MySync.uSyncVar) != GIP_TSC_DELTA_SYNC2_READY)
3642 break;
3643
3644 /*
3645 * Do the measurements.
3646 */
3647#ifdef GIP_TSC_DELTA_METHOD_1
3648 supdrvTscDeltaMethod1Loop(pArgs, &MySync, pOtherSync, fIsMaster, iTry);
3649#elif defined(GIP_TSC_DELTA_METHOD_2)
3650 supdrvTscDeltaMethod2Loop(pArgs, &MySync, pOtherSync, fIsMaster, iTry);
3651#else
3652# error "huh??"
3653#endif
3654 if (ASMAtomicReadU32(&MySync.uSyncVar) != GIP_TSC_DELTA_SYNC2_READY)
3655 break;
3656
3657 /*
3658 * Success? If so, stop trying.
3659 */
3660 if (pGipCpuWorker->i64TSCDelta != INT64_MAX)
3661 {
3662 if (fIsMaster)
3663 {
3664 RTCpuSetDelByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuMaster->iCpuSet);
3665 RTCpuSetAddByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpuMaster->iCpuSet);
3666 }
3667 else
3668 {
3669 RTCpuSetDelByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet);
3670 RTCpuSetAddByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpuWorker->iCpuSet);
3671 }
3672 break;
3673 }
3674 }
3675 }
3676
3677 /*
3678 * End the synchroniziation dance. We tell the other that we're done,
3679 * then wait for the same kind of reply.
3680 */
3681 ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_FINAL);
3682 ASMAtomicWriteNullPtr(ppMySync);
3683 iTry = 0;
3684 TSCDELTA_DBG_START_LOOP();
3685 while (ASMAtomicReadU32(&MySync.uSyncVar) != GIP_TSC_DELTA_SYNC2_FINAL)
3686 {
3687 iTry++;
3688 if ( iTry == 0
3689 && !RTMpIsCpuOnline(fIsMaster ? pGipCpuWorker->idCpu : pGipCpuWorker->idCpu))
3690 break; /* this really shouldn't happen. */
3691 TSCDELTA_DBG_CHECK_LOOP();
3692 ASMNopPause();
3693 }
3694
3695 return 0;
3696}
3697
3698/**
3699 * Callback used by supdrvMeasureInitialTscDeltas() to read the TSC on two CPUs
3700 * and compute the delta between them.
3701 *
3702 * @param idCpu The CPU we are current scheduled on.
3703 * @param pvUser1 Pointer to a parameter package (SUPDRVGIPTSCDELTARGS).
3704 * @param pvUser2 Unused.
3705 */
3706static DECLCALLBACK(void) supdrvMeasureTscDeltaCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
3707{
3708 supdrvMeasureTscDeltaCallbackUnwrapped(idCpu, (PSUPDRVGIPTSCDELTARGS)pvUser1);
3709}
3710
3711
3712/**
3713 * Measures the TSC delta between the master GIP CPU and one specified worker
3714 * CPU.
3715 *
3716 * @returns VBox status code.
3717 * @retval VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED on pure measurement
3718 * failure.
3719 * @param pDevExt Pointer to the device instance data.
3720 * @param idxWorker The index of the worker CPU from the GIP's array of
3721 * CPUs.
3722 *
3723 * @remarks This must be called with preemption enabled!
3724 */
3725static int supdrvMeasureTscDeltaOne(PSUPDRVDEVEXT pDevExt, uint32_t idxWorker)
3726{
3727 int rc;
3728 int rc2;
3729 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
3730 RTCPUID idMaster = pDevExt->idGipMaster;
3731 PSUPGIPCPU pGipCpuWorker = &pGip->aCPUs[idxWorker];
3732 PSUPGIPCPU pGipCpuMaster;
3733 uint32_t iGipCpuMaster;
3734
3735 /* Validate input a bit. */
3736 AssertReturn(pGip, VERR_INVALID_PARAMETER);
3737 Assert(pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED);
3738 Assert(RTThreadPreemptIsEnabled(NIL_RTTHREAD));
3739
3740 /*
3741 * Don't attempt measuring the delta for the GIP master.
3742 */
3743 if (pGipCpuWorker->idCpu == idMaster)
3744 {
3745 if (pGipCpuWorker->i64TSCDelta == INT64_MAX) /* This shouldn't happen, but just in case. */
3746 ASMAtomicWriteS64(&pGipCpuWorker->i64TSCDelta, GIP_TSC_DELTA_INITIAL_MASTER_VALUE);
3747 return VINF_SUCCESS;
3748 }
3749
3750 /*
3751 * One measurement at at time, at least for now. We might be using
3752 * broadcast IPIs so, so be nice to the rest of the system.
3753 */
3754#ifdef SUPDRV_USE_MUTEX_FOR_GIP
3755 rc = RTSemMutexRequest(pDevExt->mtxTscDelta, RT_INDEFINITE_WAIT);
3756#else
3757 rc = RTSemFastMutexRequest(pDevExt->mtxTscDelta);
3758#endif
3759 if (RT_FAILURE(rc))
3760 return rc;
3761
3762 /*
3763 * If the CPU has hyper-threading and the APIC IDs of the master and worker are adjacent,
3764 * try pick a different master. (This fudge only works with multi core systems.)
3765 * ASSUMES related threads have adjacent APIC IDs. ASSUMES two threads per core.
3766 *
3767 * We skip this on AMDs for now as their HTT is different from intel's and
3768 * it doesn't seem to have any favorable effect on the results.
3769 *
3770 * If the master is offline, we need a new master too, so share the code.
3771 */
3772 iGipCpuMaster = supdrvGipFindCpuIndexForCpuId(pGip, idMaster);
3773 AssertReturn(iGipCpuMaster < pGip->cCpus, VERR_INVALID_CPU_ID);
3774 pGipCpuMaster = &pGip->aCPUs[iGipCpuMaster];
3775 if ( ( (pGipCpuMaster->idApic & ~1) == (pGipCpuWorker->idApic & ~1)
3776 && ASMHasCpuId()
3777 && ASMIsValidStdRange(ASMCpuId_EAX(0))
3778 && (ASMCpuId_EDX(1) & X86_CPUID_FEATURE_EDX_HTT)
3779 && !ASMIsAmdCpu()
3780 && pGip->cOnlineCpus > 2)
3781 || !RTMpIsCpuOnline(idMaster) )
3782 {
3783 uint32_t i;
3784 for (i = 0; i < pGip->cCpus; i++)
3785 if ( i != iGipCpuMaster
3786 && i != idxWorker
3787 && pGip->aCPUs[i].enmState == SUPGIPCPUSTATE_ONLINE
3788 && pGip->aCPUs[i].i64TSCDelta != INT64_MAX
3789 && pGip->aCPUs[i].idCpu != NIL_RTCPUID
3790 && pGip->aCPUs[i].idCpu != idMaster /* paranoia starts here... */
3791 && pGip->aCPUs[i].idCpu != pGipCpuWorker->idCpu
3792 && pGip->aCPUs[i].idApic != pGipCpuWorker->idApic
3793 && pGip->aCPUs[i].idApic != pGipCpuMaster->idApic
3794 && RTMpIsCpuOnline(pGip->aCPUs[i].idCpu))
3795 {
3796 iGipCpuMaster = i;
3797 pGipCpuMaster = &pGip->aCPUs[i];
3798 idMaster = pGipCpuMaster->idCpu;
3799 break;
3800 }
3801 }
3802
3803 if (RTCpuSetIsMemberByIndex(&pGip->OnlineCpuSet, pGipCpuWorker->iCpuSet))
3804 {
3805 /*
3806 * Initialize data package for the RTMpOnAll callback.
3807 */
3808 PSUPDRVGIPTSCDELTARGS pArgs = (PSUPDRVGIPTSCDELTARGS)RTMemAllocZ(sizeof(*pArgs));
3809 if (pArgs)
3810 {
3811 pArgs->pWorker = pGipCpuWorker;
3812 pArgs->pMaster = pGipCpuMaster;
3813 pArgs->pDevExt = pDevExt;
3814 pArgs->pSyncMaster = NULL;
3815 pArgs->pSyncWorker = NULL;
3816#if 0 /* later */
3817 pArgs->cMaxTscTicks = ASMAtomicReadU64(&pGip->u64CpuHz) / 2048; /* 488 us */
3818#else
3819 pArgs->cMaxTscTicks = ASMAtomicReadU64(&pGip->u64CpuHz) / 1024; /* 976 us */
3820#endif
3821
3822#ifdef GIP_TSC_DELTA_METHOD_1
3823 rc = supdrvTscDeltaMethod1Init(pArgs);
3824#elif defined(GIP_TSC_DELTA_METHOD_2)
3825 rc = supdrvTscDeltaMethod2Init(pArgs);
3826#else
3827# error "huh?"
3828#endif
3829 if (RT_SUCCESS(rc))
3830 {
3831 /*
3832 * Fire TSC-read workers on all CPUs but only synchronize between master
3833 * and one worker to ease memory contention.
3834 */
3835 ASMAtomicWriteS64(&pGipCpuWorker->i64TSCDelta, INT64_MAX);
3836
3837 /** @todo Add RTMpOnPair and replace this ineffecient broadcast IPI. */
3838 rc = RTMpOnAll(supdrvMeasureTscDeltaCallback, pArgs, NULL);
3839 if (RT_SUCCESS(rc))
3840 {
3841#if 0
3842 SUPR0Printf("rcVerify=%d iVerifyBadTscDiff=%lld cMinVerifyTscTicks=%lld cMaxVerifyTscTicks=%lld\n",
3843 pArgs->rcVerify, pArgs->iVerifyBadTscDiff, pArgs->cMinVerifyTscTicks, pArgs->cMaxVerifyTscTicks);
3844#endif
3845 if (RT_LIKELY(pGipCpuWorker->i64TSCDelta != INT64_MAX))
3846 {
3847 /*
3848 * Work the TSC delta applicability rating. It starts
3849 * optimistic in supdrvGipInit, we downgrade it here.
3850 */
3851 SUPGIPUSETSCDELTA enmRating;
3852 if ( pGipCpuWorker->i64TSCDelta > GIP_TSC_DELTA_THRESHOLD_ROUGHLY_ZERO
3853 || pGipCpuWorker->i64TSCDelta < -GIP_TSC_DELTA_THRESHOLD_ROUGHLY_ZERO)
3854 enmRating = SUPGIPUSETSCDELTA_NOT_ZERO;
3855 else if ( pGipCpuWorker->i64TSCDelta > GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO
3856 || pGipCpuWorker->i64TSCDelta < -GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO)
3857 enmRating = SUPGIPUSETSCDELTA_ROUGHLY_ZERO;
3858 else
3859 enmRating = SUPGIPUSETSCDELTA_PRACTICALLY_ZERO;
3860 if (pGip->enmUseTscDelta < enmRating)
3861 {
3862 AssertCompile(sizeof(pGip->enmUseTscDelta) == sizeof(uint32_t));
3863 ASMAtomicWriteU32((uint32_t volatile *)&pGip->enmUseTscDelta, enmRating);
3864 }
3865 }
3866 else
3867 rc = VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED;
3868 }
3869 /** @todo return try-again if we get an offline CPU error. */
3870 }
3871
3872#ifdef GIP_TSC_DELTA_METHOD_1
3873 supdrvTscDeltaMethod1Delete(pArgs);
3874#elif defined(GIP_TSC_DELTA_METHOD_2)
3875 supdrvTscDeltaMethod2Delete(pArgs);
3876#else
3877# error "huh?"
3878#endif
3879 RTMemFree(pArgs);
3880 }
3881 else
3882 rc = VERR_NO_MEMORY;
3883 }
3884 else
3885 rc = VERR_CPU_OFFLINE;
3886
3887 /*
3888 * We're done now.
3889 */
3890#ifdef SUPDRV_USE_MUTEX_FOR_GIP
3891 rc2 = RTSemMutexRelease(pDevExt->mtxTscDelta); AssertRC(rc2);
3892#else
3893 rc2 = RTSemFastMutexRelease(pDevExt->mtxTscDelta); AssertRC(rc2);
3894#endif
3895 return rc;
3896}
3897
3898
3899/**
3900 * Clears TSC delta related variables.
3901 *
3902 * Clears all TSC samples as well as the delta synchronization variable on the
3903 * all the per-CPU structs. Optionally also clears the per-cpu deltas too.
3904 *
3905 * @param pDevExt Pointer to the device instance data.
3906 * @param fClearDeltas Whether the deltas are also to be cleared.
3907 */
3908static void supdrvClearTscSamples(PSUPDRVDEVEXT pDevExt, bool fClearDeltas)
3909{
3910 unsigned iCpu;
3911 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
3912 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
3913 {
3914 PSUPGIPCPU pGipCpu = &pGip->aCPUs[iCpu];
3915 ASMAtomicWriteU64(&pGipCpu->u64TSCSample, GIP_TSC_DELTA_RSVD);
3916 if (fClearDeltas)
3917 ASMAtomicWriteS64(&pGipCpu->i64TSCDelta, INT64_MAX);
3918 }
3919}
3920
3921
3922/**
3923 * Performs the initial measurements of the TSC deltas between CPUs.
3924 *
3925 * This is called by supdrvGipCreate or triggered by it if threaded.
3926 *
3927 * @returns VBox status code.
3928 * @param pDevExt Pointer to the device instance data.
3929 *
3930 * @remarks Must be called only after supdrvGipInitOnCpu() as this function uses
3931 * idCpu, GIP's online CPU set which are populated in
3932 * supdrvGipInitOnCpu().
3933 */
3934static int supdrvMeasureInitialTscDeltas(PSUPDRVDEVEXT pDevExt)
3935{
3936 PSUPGIPCPU pGipCpuMaster;
3937 unsigned iCpu;
3938 unsigned iOddEven;
3939 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
3940 uint32_t idxMaster = UINT32_MAX;
3941 int rc = VINF_SUCCESS;
3942 uint32_t cMpOnOffEvents = ASMAtomicReadU32(&pDevExt->cMpOnOffEvents);
3943
3944 Assert(pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED);
3945
3946 /*
3947 * Pick the first CPU online as the master TSC and make it the new GIP master based
3948 * on the APIC ID.
3949 *
3950 * Technically we can simply use "idGipMaster" but doing this gives us master as CPU 0
3951 * in most cases making it nicer/easier for comparisons. It is safe to update the GIP
3952 * master as this point since the sync/async timer isn't created yet.
3953 */
3954 supdrvClearTscSamples(pDevExt, true /* fClearDeltas */);
3955 for (iCpu = 0; iCpu < RT_ELEMENTS(pGip->aiCpuFromApicId); iCpu++)
3956 {
3957 uint16_t idxCpu = pGip->aiCpuFromApicId[iCpu];
3958 if (idxCpu != UINT16_MAX)
3959 {
3960 PSUPGIPCPU pGipCpu = &pGip->aCPUs[idxCpu];
3961 if (RTCpuSetIsMemberByIndex(&pGip->OnlineCpuSet, pGipCpu->iCpuSet))
3962 {
3963 idxMaster = idxCpu;
3964 pGipCpu->i64TSCDelta = GIP_TSC_DELTA_INITIAL_MASTER_VALUE;
3965 break;
3966 }
3967 }
3968 }
3969 AssertReturn(idxMaster != UINT32_MAX, VERR_CPU_NOT_FOUND);
3970 pGipCpuMaster = &pGip->aCPUs[idxMaster];
3971 ASMAtomicWriteSize(&pDevExt->idGipMaster, pGipCpuMaster->idCpu);
3972
3973 /*
3974 * If there is only a single CPU online we have nothing to do.
3975 */
3976 if (pGip->cOnlineCpus <= 1)
3977 {
3978 AssertReturn(pGip->cOnlineCpus > 0, VERR_INTERNAL_ERROR_5);
3979 return VINF_SUCCESS;
3980 }
3981
3982 /*
3983 * Loop thru the GIP CPU array and get deltas for each CPU (except the
3984 * master). We do the CPUs with the even numbered APIC IDs first so that
3985 * we've got alternative master CPUs to pick from on hyper-threaded systems.
3986 */
3987 for (iOddEven = 0; iOddEven < 2; iOddEven++)
3988 {
3989 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
3990 {
3991 PSUPGIPCPU pGipCpuWorker = &pGip->aCPUs[iCpu];
3992 if ( iCpu != idxMaster
3993 && (iOddEven > 0 || (pGipCpuWorker->idApic & 1) == 0)
3994 && RTCpuSetIsMemberByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet))
3995 {
3996 rc = supdrvMeasureTscDeltaOne(pDevExt, iCpu);
3997 if (RT_FAILURE(rc))
3998 {
3999 SUPR0Printf("supdrvMeasureTscDeltaOne failed. rc=%d CPU[%u].idCpu=%u Master[%u].idCpu=%u\n", rc, iCpu,
4000 pGipCpuWorker->idCpu, idxMaster, pDevExt->idGipMaster, pGipCpuMaster->idCpu);
4001 break;
4002 }
4003
4004 if (ASMAtomicReadU32(&pDevExt->cMpOnOffEvents) != cMpOnOffEvents)
4005 {
4006 SUPR0Printf("One or more CPUs transitioned between online & offline states. I'm confused, retry...\n");
4007 rc = VERR_TRY_AGAIN;
4008 break;
4009 }
4010 }
4011 }
4012 }
4013
4014 return rc;
4015}
4016
4017
4018#ifdef SUPDRV_USE_TSC_DELTA_THREAD
4019
4020/**
4021 * Switches the TSC-delta measurement thread into the butchered state.
4022 *
4023 * @returns VBox status code.
4024 * @param pDevExt Pointer to the device instance data.
4025 * @param fSpinlockHeld Whether the TSC-delta spinlock is held or not.
4026 * @param pszFailed An error message to log.
4027 * @param rcFailed The error code to exit the thread with.
4028 */
4029static int supdrvTscDeltaThreadButchered(PSUPDRVDEVEXT pDevExt, bool fSpinlockHeld, const char *pszFailed, int rcFailed)
4030{
4031 if (!fSpinlockHeld)
4032 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4033
4034 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Butchered;
4035 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4036 OSDBGPRINT(("supdrvTscDeltaThreadButchered: %s. rc=%Rrc\n", rcFailed));
4037 return rcFailed;
4038}
4039
4040
/**
 * The TSC-delta measurement thread.
 *
 * Sits in a listening state waiting for measurement requests, performs the
 * initial all-CPU measurement once, and afterwards re-measures deltas for the
 * CPUs flagged in pDevExt->TscDeltaCpuSet.  All state transitions are guarded
 * by pDevExt->hTscDeltaSpinlock; note that the Creating and WaitAndMeasure
 * cases enter with the spinlock held and deliberately fall through into the
 * next case.
 *
 * @returns VBox status code.
 * @param   hThread     The thread handle.
 * @param   pvUser      Opaque pointer to the device instance data (PSUPDRVDEVEXT).
 */
static DECLCALLBACK(int) supdrvTscDeltaThread(RTTHREAD hThread, void *pvUser)
{
    PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
    bool fInitialMeasurement = true;  /* First Measuring pass does the full initial sweep. */
    uint32_t cConsecutiveTimeouts = 0;
    int rc = VERR_INTERNAL_ERROR_2;
    for (;;)
    {
        /*
         * Switch on the current state.
         */
        SUPDRVTSCDELTATHREADSTATE enmState;
        RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
        enmState = pDevExt->enmTscDeltaThreadState;
        switch (enmState)
        {
            case kTscDeltaThreadState_Creating:
            {
                /* Tell supdrvTscDeltaThreadInit we're up; then drop into Listening. */
                pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Listening;
                rc = RTSemEventSignal(pDevExt->hTscDeltaEvent);
                if (RT_FAILURE(rc))
                    return supdrvTscDeltaThreadButchered(pDevExt, true /* fSpinlockHeld */, "RTSemEventSignal", rc);
                /* fall thru */
            }

            case kTscDeltaThreadState_Listening:
            {
                RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);

                /* Simple adaptive timeout: after 10 consecutive timeouts, back
                   off to the next wait interval (1 -> 10 -> 100 -> 500 ms). */
                if (cConsecutiveTimeouts++ == 10)
                {
                    if (pDevExt->cMsTscDeltaTimeout == 1)        /* -> 10 ms */
                        pDevExt->cMsTscDeltaTimeout = 10;
                    else if (pDevExt->cMsTscDeltaTimeout == 10)  /* -> 100 ms */
                        pDevExt->cMsTscDeltaTimeout = 100;
                    else if (pDevExt->cMsTscDeltaTimeout == 100) /* -> 500 ms */
                        pDevExt->cMsTscDeltaTimeout = 500;
                    cConsecutiveTimeouts = 0;
                }
                rc = RTThreadUserWait(pDevExt->hTscDeltaThread, pDevExt->cMsTscDeltaTimeout);
                if (   RT_FAILURE(rc)
                    && rc != VERR_TIMEOUT)
                    return supdrvTscDeltaThreadButchered(pDevExt, false /* fSpinlockHeld */, "RTThreadUserWait", rc);
                RTThreadUserReset(pDevExt->hTscDeltaThread);
                break;
            }

            case kTscDeltaThreadState_WaitAndMeasure:
            {
                /* Acknowledge the requester, then give it a moment before measuring. */
                pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Measuring;
                rc = RTSemEventSignal(pDevExt->hTscDeltaEvent); /* (Safe on windows as long as spinlock isn't IRQ safe.) */
                if (RT_FAILURE(rc))
                    return supdrvTscDeltaThreadButchered(pDevExt, true /* fSpinlockHeld */, "RTSemEventSignal", rc);
                RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
                pDevExt->cMsTscDeltaTimeout = 1;  /* Reset the adaptive timeout to its shortest interval. */
                RTThreadSleep(10);
                /* fall thru */
            }

            case kTscDeltaThreadState_Measuring:
            {
                cConsecutiveTimeouts = 0;
                if (fInitialMeasurement)
                {
                    /* First pass: measure all CPUs, retrying a few times on
                       transient failures (CPU hotplug races). */
                    int cTries = 8;
                    int cMsWaitPerTry = 10;
                    fInitialMeasurement = false;
                    do
                    {
                        rc = supdrvMeasureInitialTscDeltas(pDevExt);
                        if (   RT_SUCCESS(rc)
                            || (   RT_FAILURE(rc)
                                && rc != VERR_TRY_AGAIN
                                && rc != VERR_CPU_OFFLINE))
                        {
                            break;
                        }
                        RTThreadSleep(cMsWaitPerTry);
                    } while (cTries-- > 0);
                }
                else
                {
                    PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
                    unsigned iCpu;

                    /* Measure TSC-deltas only for the CPUs that are in the set. */
                    rc = VINF_SUCCESS;
                    for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
                    {
                        PSUPGIPCPU pGipCpuWorker = &pGip->aCPUs[iCpu];
                        if (RTCpuSetIsMemberByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet))
                        {
                            if (pGipCpuWorker->i64TSCDelta == INT64_MAX)
                            {
                                /* Keep the first failure but continue with the remaining CPUs. */
                                int rc2 = supdrvMeasureTscDeltaOne(pDevExt, iCpu);
                                if (RT_FAILURE(rc2) && RT_SUCCESS(rc))
                                    rc = rc2;
                            }
                            else
                            {
                                /*
                                 * The thread/someone must've called SUPR0TscDeltaMeasureBySetIndex,
                                 * mark the delta as fine to get the timer thread off our back.
                                 */
                                RTCpuSetDelByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet);
                                RTCpuSetAddByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpuWorker->iCpuSet);
                            }
                        }
                    }
                }
                /* Back to Listening unless someone changed the state while we measured. */
                RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
                if (pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Measuring)
                    pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Listening;
                RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
                Assert(rc != VERR_NOT_AVAILABLE); /* VERR_NOT_AVAILABLE is used as the initial value. */
                ASMAtomicWriteS32(&pDevExt->rcTscDelta, rc);
                break;
            }

            case kTscDeltaThreadState_Terminating:
                pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Destroyed;
                RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
                return VINF_SUCCESS;

            case kTscDeltaThreadState_Butchered:
            default:
                return supdrvTscDeltaThreadButchered(pDevExt, true /* fSpinlockHeld */, "Invalid state", VERR_INVALID_STATE);
        }
    }

    /* Not reached: all exits from the loop are via return statements above. */
    return rc;
}
4181
4182
/**
 * Waits for the TSC-delta measurement thread to respond to a state change.
 *
 * First waits up to one second on the event semaphore; if the thread is still
 * in @a enmCurState after that, waits up to another 50 seconds before giving
 * up with VERR_TIMEOUT.  Any state other than @a enmCurState or
 * @a enmNewState is treated as an internal error.
 *
 * @returns VINF_SUCCESS on success, VERR_TIMEOUT if it doesn't respond in time,
 *          other error code on internal error.
 *
 * @param   pDevExt         Pointer to the device instance data.
 * @param   enmCurState     The current state.
 * @param   enmNewState     The new state we're waiting for it to enter.
 */
static int supdrvTscDeltaThreadWait(PSUPDRVDEVEXT pDevExt, SUPDRVTSCDELTATHREADSTATE enmCurState,
                                    SUPDRVTSCDELTATHREADSTATE enmNewState)
{
    /*
     * Wait a short while for the expected state transition.
     */
    int rc;
    RTSemEventWait(pDevExt->hTscDeltaEvent, RT_MS_1SEC);
    RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
    if (pDevExt->enmTscDeltaThreadState == enmNewState)
    {
        RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
        rc = VINF_SUCCESS;
    }
    else if (pDevExt->enmTscDeltaThreadState == enmCurState)
    {
        /*
         * Wait longer if the state has not yet transitioned to the one we want.
         */
        RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
        rc = RTSemEventWait(pDevExt->hTscDeltaEvent, 50 * RT_MS_1SEC);
        if (   RT_SUCCESS(rc)
            || rc == VERR_TIMEOUT)
        {
            /*
             * Check the state whether we've succeeded.
             */
            SUPDRVTSCDELTATHREADSTATE enmState;
            RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
            enmState = pDevExt->enmTscDeltaThreadState;
            RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
            if (enmState == enmNewState)
                rc = VINF_SUCCESS;
            else if (enmState == enmCurState)
            {
                rc = VERR_TIMEOUT;
                OSDBGPRINT(("supdrvTscDeltaThreadWait: timed out state transition. enmState=%d enmNewState=%d\n", enmState,
                            enmNewState));
            }
            else
            {
                rc = VERR_INTERNAL_ERROR;
                OSDBGPRINT(("supdrvTscDeltaThreadWait: invalid state transition from %d to %d, expected %d\n", enmCurState,
                            enmState, enmNewState));
            }
        }
        else
            OSDBGPRINT(("supdrvTscDeltaThreadWait: RTSemEventWait failed. rc=%Rrc\n", rc));
    }
    else
    {
        /* Neither the current nor the expected new state: someone else interfered. */
        RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
        OSDBGPRINT(("supdrvTscDeltaThreadWait: invalid state transition from %d to %d\n", enmCurState, enmNewState));
        rc = VERR_INTERNAL_ERROR;
    }

    return rc;
}
4251
4252
/**
 * Waits for TSC-delta measurements to be completed for all online CPUs.
 *
 * Polls pDevExt->TscDeltaObtainedCpuSet until it equals the GIP online CPU
 * set, sleeping 1 ms between checks.
 *
 * @returns VBox status code (VINF_SUCCESS, VERR_TIMEOUT, or
 *          VERR_THREAD_NOT_WAITABLE if the thread was never created).
 * @param   pDevExt     Pointer to the device instance data.
 */
static int supdrvTscDeltaThreadWaitForOnlineCpus(PSUPDRVDEVEXT pDevExt)
{
    int cTriesLeft = 5;
    int cMsTotalWait;
    int cMsWaited = 0;
    int cMsWaitGranularity = 1;

    PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
    AssertReturn(pGip, VERR_INVALID_POINTER);

    if (RT_UNLIKELY(pDevExt->hTscDeltaThread == NIL_RTTHREAD))
        return VERR_THREAD_NOT_WAITABLE;

    /* Scale the total wait with the number of present CPUs, capped at 200 ms. */
    cMsTotalWait = RT_MIN(pGip->cPresentCpus + 10, 200);
    /* NOTE(review): cTriesLeft caps this loop at 5 iterations (~5 ms at 1 ms
       granularity), so the cMsWaited >= cMsTotalWait bound below is effectively
       never reached.  Looks unintentional — confirm the intended retry count. */
    while (cTriesLeft-- > 0)
    {
        if (RTCpuSetIsEqual(&pDevExt->TscDeltaObtainedCpuSet, &pGip->OnlineCpuSet))
            return VINF_SUCCESS;
        RTThreadSleep(cMsWaitGranularity);
        cMsWaited += cMsWaitGranularity;
        if (cMsWaited >= cMsTotalWait)
            break;
    }

    return VERR_TIMEOUT;
}
4285
4286
4287/**
4288 * Terminates the actual thread running supdrvTscDeltaThread().
4289 *
4290 * This is an internal worker function for supdrvTscDeltaThreadInit() and
4291 * supdrvTscDeltaTerm().
4292 *
4293 * @param pDevExt Pointer to the device instance data.
4294 */
4295static void supdrvTscDeltaThreadTerminate(PSUPDRVDEVEXT pDevExt)
4296{
4297 int rc;
4298 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4299 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Terminating;
4300 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4301 RTThreadUserSignal(pDevExt->hTscDeltaThread);
4302 rc = RTThreadWait(pDevExt->hTscDeltaThread, 50 * RT_MS_1SEC, NULL /* prc */);
4303 if (RT_FAILURE(rc))
4304 {
4305 /* Signal a few more times before giving up. */
4306 int cTriesLeft = 5;
4307 while (--cTriesLeft > 0)
4308 {
4309 RTThreadUserSignal(pDevExt->hTscDeltaThread);
4310 rc = RTThreadWait(pDevExt->hTscDeltaThread, 2 * RT_MS_1SEC, NULL /* prc */);
4311 if (rc != VERR_TIMEOUT)
4312 break;
4313 }
4314 }
4315}
4316
4317
/**
 * Initializes and spawns the TSC-delta measurement thread.
 *
 * A thread is required for servicing re-measurement requests from events like
 * CPUs coming online, suspend/resume etc. as it cannot be done synchronously
 * under all contexts on all OSs.
 *
 * Creates the spinlock and event semaphore first, then the thread, and waits
 * for it to reach the Listening state.  On any failure, everything created so
 * far is torn down in reverse order before returning.
 *
 * @returns VBox status code.
 * @param   pDevExt     Pointer to the device instance data.
 *
 * @remarks Must only be called -after- initializing GIP and setting up MP
 *          notifications!
 */
static int supdrvTscDeltaThreadInit(PSUPDRVDEVEXT pDevExt)
{
    int rc;
    Assert(pDevExt->pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED);
    rc = RTSpinlockCreate(&pDevExt->hTscDeltaSpinlock, RTSPINLOCK_FLAGS_INTERRUPT_UNSAFE, "VBoxTscSpnLck");
    if (RT_SUCCESS(rc))
    {
        rc = RTSemEventCreate(&pDevExt->hTscDeltaEvent);
        if (RT_SUCCESS(rc))
        {
            /* State and timeout must be set before the thread starts looking at them. */
            pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Creating;
            pDevExt->cMsTscDeltaTimeout = 1;
            rc = RTThreadCreate(&pDevExt->hTscDeltaThread, supdrvTscDeltaThread, pDevExt, 0 /* cbStack */,
                                RTTHREADTYPE_DEFAULT, RTTHREADFLAGS_WAITABLE, "VBoxTscThread");
            if (RT_SUCCESS(rc))
            {
                rc = supdrvTscDeltaThreadWait(pDevExt, kTscDeltaThreadState_Creating, kTscDeltaThreadState_Listening);
                if (RT_SUCCESS(rc))
                {
                    /* Success: rcTscDelta stays VERR_NOT_AVAILABLE until the first measurement completes. */
                    ASMAtomicWriteS32(&pDevExt->rcTscDelta, VERR_NOT_AVAILABLE);
                    return rc;
                }

                OSDBGPRINT(("supdrvTscDeltaInit: supdrvTscDeltaThreadWait failed. rc=%Rrc\n", rc));
                supdrvTscDeltaThreadTerminate(pDevExt);
            }
            else
                OSDBGPRINT(("supdrvTscDeltaInit: RTThreadCreate failed. rc=%Rrc\n", rc));
            /* Unwind: destroy the event semaphore created above. */
            RTSemEventDestroy(pDevExt->hTscDeltaEvent);
            pDevExt->hTscDeltaEvent = NIL_RTSEMEVENT;
        }
        else
            OSDBGPRINT(("supdrvTscDeltaInit: RTSemEventCreate failed. rc=%Rrc\n", rc));
        /* Unwind: destroy the spinlock created above. */
        RTSpinlockDestroy(pDevExt->hTscDeltaSpinlock);
        pDevExt->hTscDeltaSpinlock = NIL_RTSPINLOCK;
    }
    else
        OSDBGPRINT(("supdrvTscDeltaInit: RTSpinlockCreate failed. rc=%Rrc\n", rc));

    return rc;
}
4372
4373
4374/**
4375 * Terminates the TSC-delta measurement thread and cleanup.
4376 *
4377 * @param pDevExt Pointer to the device instance data.
4378 */
4379static void supdrvTscDeltaTerm(PSUPDRVDEVEXT pDevExt)
4380{
4381 if ( pDevExt->hTscDeltaSpinlock != NIL_RTSPINLOCK
4382 && pDevExt->hTscDeltaEvent != NIL_RTSEMEVENT)
4383 {
4384 supdrvTscDeltaThreadTerminate(pDevExt);
4385 }
4386
4387 if (pDevExt->hTscDeltaSpinlock != NIL_RTSPINLOCK)
4388 {
4389 RTSpinlockDestroy(pDevExt->hTscDeltaSpinlock);
4390 pDevExt->hTscDeltaSpinlock = NIL_RTSPINLOCK;
4391 }
4392
4393 if (pDevExt->hTscDeltaEvent != NIL_RTSEMEVENT)
4394 {
4395 RTSemEventDestroy(pDevExt->hTscDeltaEvent);
4396 pDevExt->hTscDeltaEvent = NIL_RTSEMEVENT;
4397 }
4398
4399 ASMAtomicWriteS32(&pDevExt->rcTscDelta, VERR_NOT_AVAILABLE);
4400}
4401
4402#endif /* SUPDRV_USE_TSC_DELTA_THREAD */
4403
4404/**
4405 * Measure the TSC delta for the CPU given by its CPU set index.
4406 *
4407 * @returns VBox status code.
4408 * @retval VERR_INTERRUPTED if interrupted while waiting.
4409 * @retval VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED if we were unable to get a
4410 * measurment.
4411 * @retval VERR_CPU_OFFLINE if the specified CPU is offline.
4412 * @retval VERR_CPU_OFFLINE if the specified CPU is offline.
4413 *
4414 * @param pSession The caller's session. GIP must've been mapped.
4415 * @param iCpuSet The CPU set index of the CPU to measure.
4416 * @param fFlags Flags, SUP_TSCDELTA_MEASURE_F_XXX.
4417 * @param cMsWaitRetry Number of milliseconds to wait between each retry.
4418 * @param cMsWaitThread Number of milliseconds to wait for the thread to get
4419 * ready.
4420 * @param cTries Number of times to try, pass 0 for the default.
4421 */
4422SUPR0DECL(int) SUPR0TscDeltaMeasureBySetIndex(PSUPDRVSESSION pSession, uint32_t iCpuSet, uint32_t fFlags,
4423 RTMSINTERVAL cMsWaitRetry, RTMSINTERVAL cMsWaitThread, uint32_t cTries)
4424{
4425 PSUPDRVDEVEXT pDevExt;
4426 PSUPGLOBALINFOPAGE pGip;
4427 uint16_t iGipCpu;
4428 int rc;
4429#ifdef SUPDRV_USE_TSC_DELTA_THREAD
4430 uint64_t msTsStartWait;
4431 uint32_t iWaitLoop;
4432#endif
4433
4434 /*
4435 * Validate and adjust the input.
4436 */
4437 AssertReturn(SUP_IS_SESSION_VALID(pSession), VERR_INVALID_PARAMETER);
4438 if (!pSession->fGipReferenced)
4439 return VERR_WRONG_ORDER;
4440
4441 pDevExt = pSession->pDevExt;
4442 AssertReturn(SUP_IS_DEVEXT_VALID(pDevExt), VERR_INVALID_PARAMETER);
4443
4444 pGip = pDevExt->pGip;
4445 AssertPtrReturn(pGip, VERR_INTERNAL_ERROR_2);
4446
4447 AssertReturn(iCpuSet < RTCPUSET_MAX_CPUS, VERR_INVALID_CPU_INDEX);
4448 AssertReturn(iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx), VERR_INVALID_CPU_INDEX);
4449 iGipCpu = pGip->aiCpuFromCpuSetIdx[iCpuSet];
4450 AssertReturn(iGipCpu < pGip->cCpus, VERR_INVALID_CPU_INDEX);
4451
4452 if (fFlags & ~SUP_TSCDELTA_MEASURE_F_VALID_MASK)
4453 return VERR_INVALID_FLAGS;
4454
4455 if (cTries == 0)
4456 cTries = 12;
4457 else if (cTries > 256)
4458 cTries = 256;
4459
4460 if (cMsWaitRetry == 0)
4461 cMsWaitRetry = 2;
4462 else if (cMsWaitRetry > 1000)
4463 cMsWaitRetry = 1000;
4464
4465 /*
4466 * The request is a noop if the TSC delta isn't being used.
4467 */
4468 if (pGip->enmUseTscDelta <= SUPGIPUSETSCDELTA_ZERO_CLAIMED)
4469 return VINF_SUCCESS;
4470
4471#ifdef SUPDRV_USE_TSC_DELTA_THREAD
4472 /*
4473 * Has the TSC already been measured and we're not forced to redo it?
4474 */
4475 if ( pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX
4476 && !(fFlags & SUP_TSCDELTA_MEASURE_F_FORCE))
4477 return VINF_SUCCESS;
4478
4479 /*
4480 * Asynchronous request? Forward it to the thread, no waiting.
4481 */
4482 if (fFlags & SUP_TSCDELTA_MEASURE_F_ASYNC)
4483 {
4484 /** @todo Async. doesn't implement options like retries, waiting. We'll need
4485 * to pass those options to the thread somehow and implement it in the
4486 * thread. Check if anyone uses/needs fAsync before implementing this. */
4487 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4488 RTCpuSetAddByIndex(&pDevExt->TscDeltaCpuSet, iCpuSet);
4489 if ( pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Listening
4490 || pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Measuring)
4491 {
4492 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_WaitAndMeasure;
4493 rc = VINF_SUCCESS;
4494 }
4495 else
4496 rc = VERR_THREAD_IS_DEAD;
4497 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4498 RTThreadUserSignal(pDevExt->hTscDeltaThread);
4499 return VINF_SUCCESS;
4500 }
4501
4502 /*
4503 * If a TSC-delta measurement request is already being serviced by the thread,
4504 * wait 'cTries' times if a retry-timeout is provided, otherwise bail as busy.
4505 */
4506 msTsStartWait = RTTimeSystemMilliTS();
4507 for (iWaitLoop = 0;; iWaitLoop++)
4508 {
4509 uint64_t cMsElapsed;
4510 SUPDRVTSCDELTATHREADSTATE enmState;
4511 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4512 enmState = pDevExt->enmTscDeltaThreadState;
4513 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4514
4515 if (enmState == kTscDeltaThreadState_Measuring)
4516 { /* Must wait, the thread is busy. */ }
4517 else if (enmState == kTscDeltaThreadState_WaitAndMeasure)
4518 { /* Must wait, this state only says what will happen next. */ }
4519 else if (enmState == kTscDeltaThreadState_Terminating)
4520 { /* Must wait, this state only says what should happen next. */ }
4521 else
4522 break; /* All other states, the thread is either idly listening or dead. */
4523
4524 /* Wait or fail. */
4525 if (cMsWaitThread == 0)
4526 return VERR_SUPDRV_TSC_DELTA_MEASUREMENT_BUSY;
4527 cMsElapsed = RTTimeSystemMilliTS() - msTsStartWait;
4528 if (cMsElapsed >= cMsWaitThread)
4529 return VERR_SUPDRV_TSC_DELTA_MEASUREMENT_BUSY;
4530
4531 rc = RTThreadSleep(RT_MIN((RTMSINTERVAL)(cMsWaitThread - cMsElapsed), RT_MIN(iWaitLoop + 1, 10)));
4532 if (rc == VERR_INTERRUPTED)
4533 return rc;
4534 }
4535#endif /* SUPDRV_USE_TSC_DELTA_THREAD */
4536
4537 /*
4538 * Try measure the TSC delta the given number of times.
4539 */
4540 for (;;)
4541 {
4542 /* Unless we're forced to measure the delta, check whether it's done already. */
4543 if ( !(fFlags & SUP_TSCDELTA_MEASURE_F_FORCE)
4544 && pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX)
4545 {
4546 rc = VINF_SUCCESS;
4547 break;
4548 }
4549
4550 /* Measure it. */
4551 rc = supdrvMeasureTscDeltaOne(pDevExt, iGipCpu);
4552 if (rc != VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED)
4553 {
4554 Assert(pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX || RT_FAILURE_NP(rc));
4555 break;
4556 }
4557
4558 /* Retry? */
4559 if (cTries <= 1)
4560 break;
4561 cTries--;
4562
4563 /* Always delay between retries (be nice to the rest of the system
4564 and avoid the BSOD hounds). */
4565 rc = RTThreadSleep(cMsWaitRetry);
4566 if (rc == VERR_INTERRUPTED)
4567 break;
4568 }
4569
4570 return rc;
4571}
4572
4573
4574/**
4575 * Service a TSC-delta measurement request.
4576 *
4577 * @returns VBox status code.
4578 * @param pDevExt Pointer to the device instance data.
4579 * @param pSession The support driver session.
4580 * @param pReq Pointer to the TSC-delta measurement request.
4581 */
4582int VBOXCALL supdrvIOCtl_TscDeltaMeasure(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, PSUPTSCDELTAMEASURE pReq)
4583{
4584 uint32_t cTries;
4585 uint32_t iCpuSet;
4586 uint32_t fFlags;
4587 RTMSINTERVAL cMsWaitRetry;
4588
4589 /*
4590 * Validate and adjust/resolve the input so they can be passed onto SUPR0TscDeltaMeasureBySetIndex.
4591 */
4592 AssertPtr(pDevExt); AssertPtr(pSession); AssertPtr(pReq); /* paranoia^2 */
4593
4594 if (pReq->u.In.idCpu == NIL_RTCPUID)
4595 return VERR_INVALID_CPU_ID;
4596 iCpuSet = RTMpCpuIdToSetIndex(pReq->u.In.idCpu);
4597 if (iCpuSet >= RTCPUSET_MAX_CPUS)
4598 return VERR_INVALID_CPU_ID;
4599
4600 cTries = pReq->u.In.cRetries == 0 ? 0 : (uint32_t)pReq->u.In.cRetries + 1;
4601
4602 cMsWaitRetry = RT_MAX(pReq->u.In.cMsWaitRetry, 5);
4603
4604 fFlags = 0;
4605 if (pReq->u.In.fAsync)
4606 fFlags |= SUP_TSCDELTA_MEASURE_F_ASYNC;
4607 if (pReq->u.In.fForce)
4608 fFlags |= SUP_TSCDELTA_MEASURE_F_FORCE;
4609
4610 return SUPR0TscDeltaMeasureBySetIndex(pSession, iCpuSet, fFlags, cMsWaitRetry,
4611 cTries == 0 ? 5*RT_MS_1SEC : cMsWaitRetry * cTries /*cMsWaitThread*/,
4612 cTries);
4613}
4614
4615
/**
 * Reads TSC with delta applied.
 *
 * Will try to resolve delta value INT64_MAX before applying it.  This is the
 * main purpose of this function, to handle the case where the delta needs to be
 * determined.
 *
 * @returns VBox status code.
 * @param   pDevExt     Pointer to the device instance data.
 * @param   pSession    The support driver session.
 * @param   pReq        Pointer to the TSC-read request.
 */
int VBOXCALL supdrvIOCtl_TscRead(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, PSUPTSCREAD pReq)
{
    PSUPGLOBALINFOPAGE pGip;
    int rc;

    /*
     * Validate.  We require the client to have mapped GIP (no asserting on
     * ring-3 preconditions).
     */
    AssertPtr(pDevExt); AssertPtr(pReq); AssertPtr(pSession); /* paranoia^2 */
    if (pSession->GipMapObjR3 == NIL_RTR0MEMOBJ)
        return VERR_WRONG_ORDER;
    pGip = pDevExt->pGip;
    AssertReturn(pGip, VERR_INTERNAL_ERROR_2);

    /*
     * We're usually here because we need to apply delta, but we shouldn't be
     * upset if the GIP is some different mode.
     */
    if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
    {
        uint32_t cTries = 0;
        for (;;)
        {
            /*
             * Start by gathering the data, using CLI for disabling preemption
             * while we do that.
             */
            RTCCUINTREG uFlags = ASMIntDisableFlags();
            int iCpuSet = RTMpCpuIdToSetIndex(RTMpCpuId());
            int iGipCpu;
            if (RT_LIKELY(   (unsigned)iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
                          && (iGipCpu = pGip->aiCpuFromCpuSetIdx[iCpuSet]) < pGip->cCpus ))
            {
                int64_t i64Delta = pGip->aCPUs[iGipCpu].i64TSCDelta;
                pReq->u.Out.idApic = pGip->aCPUs[iGipCpu].idApic;
                pReq->u.Out.u64AdjustedTsc = ASMReadTSC();
                ASMSetFlags(uFlags);

                /*
                 * If we're lucky we've got a delta, but no predictions here
                 * as this I/O control is normally only used when the TSC delta
                 * is set to INT64_MAX.
                 */
                if (i64Delta != INT64_MAX)
                {
                    pReq->u.Out.u64AdjustedTsc -= i64Delta;
                    rc = VINF_SUCCESS;
                    break;
                }

                /* Give up after a few times. */
                if (cTries >= 4)
                {
                    rc = VWRN_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED;
                    break;
                }

                /* Need to measure the delta and try again. */
                rc = supdrvMeasureTscDeltaOne(pDevExt, iGipCpu);
                Assert(pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX || RT_FAILURE_NP(rc));
                /** @todo should probably delay on failure... dpc watchdogs */
            }
            else
            {
                /* This really shouldn't happen. */
                AssertMsgFailed(("idCpu=%#x iCpuSet=%#x (%d)\n", RTMpCpuId(), iCpuSet, iCpuSet));
                pReq->u.Out.idApic = ASMGetApicId();
                pReq->u.Out.u64AdjustedTsc = ASMReadTSC();
                ASMSetFlags(uFlags);
                rc = VERR_INTERNAL_ERROR_5; /** @todo change to warning. */
                break;
            }
        }
    }
    else
    {
        /*
         * No delta to apply.  Easy.  Deal with preemption the lazy way.
         */
        RTCCUINTREG uFlags = ASMIntDisableFlags();
        int iCpuSet = RTMpCpuIdToSetIndex(RTMpCpuId());
        int iGipCpu;
        if (RT_LIKELY(   (unsigned)iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
                      && (iGipCpu = pGip->aiCpuFromCpuSetIdx[iCpuSet]) < pGip->cCpus ))
            pReq->u.Out.idApic = pGip->aCPUs[iGipCpu].idApic;
        else
            pReq->u.Out.idApic = ASMGetApicId();
        pReq->u.Out.u64AdjustedTsc = ASMReadTSC();
        ASMSetFlags(uFlags);
        rc = VINF_SUCCESS;
    }

    return rc;
}
4723
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette