VirtualBox

source: vbox/trunk/src/VBox/HostDrivers/Support/SUPDrvGip.cpp@ 54373

Last change on this file since 54373 was 54371, checked in by vboxsync, 10 years ago

SUPDrvGip.cpp: Enabled new tsc-delta measurement sync code w/ timeout checks.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 163.5 KB
Line 
1/* $Id: SUPDrvGip.cpp 54371 2015-02-23 10:00:01Z vboxsync $ */
2/** @file
3 * VBoxDrv - The VirtualBox Support Driver - Common code for GIP.
4 */
5
6/*
7 * Copyright (C) 2006-2015 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27/*******************************************************************************
28* Header Files *
29*******************************************************************************/
30#define LOG_GROUP LOG_GROUP_SUP_DRV
31#define SUPDRV_AGNOSTIC
32#include "SUPDrvInternal.h"
33#ifndef PAGE_SHIFT
34# include <iprt/param.h>
35#endif
36#include <iprt/asm.h>
37#include <iprt/asm-amd64-x86.h>
38#include <iprt/asm-math.h>
39#include <iprt/cpuset.h>
40#include <iprt/handletable.h>
41#include <iprt/mem.h>
42#include <iprt/mp.h>
43#include <iprt/power.h>
44#include <iprt/process.h>
45#include <iprt/semaphore.h>
46#include <iprt/spinlock.h>
47#include <iprt/thread.h>
48#include <iprt/uuid.h>
49#include <iprt/net.h>
50#include <iprt/crc.h>
51#include <iprt/string.h>
52#include <iprt/timer.h>
53#if defined(RT_OS_DARWIN) || defined(RT_OS_SOLARIS) || defined(RT_OS_FREEBSD)
54# include <iprt/rand.h>
55# include <iprt/path.h>
56#endif
57#include <iprt/uint128.h>
58#include <iprt/x86.h>
59
60#include <VBox/param.h>
61#include <VBox/log.h>
62#include <VBox/err.h>
63
64#if defined(RT_OS_SOLARIS) || defined(RT_OS_DARWIN)
65# include "dtrace/SUPDrv.h"
66#else
67/* ... */
68#endif
69
70
71/*******************************************************************************
72* Defined Constants And Macros *
73*******************************************************************************/
74/** The frequency by which we recalculate the u32UpdateHz and
75 * u32UpdateIntervalNS GIP members. The value must be a power of 2.
76 *
77 * Warning: Bumping this too high might overflow u32UpdateIntervalNS.
78 */
79#define GIP_UPDATEHZ_RECALC_FREQ 0x800
80
81/** A reserved TSC value used for synchronization as well as measurement of
82 * TSC deltas. */
83#define GIP_TSC_DELTA_RSVD UINT64_MAX
84/** The number of TSC delta measurement loops in total (includes primer and
85 * read-time loops). */
86#define GIP_TSC_DELTA_LOOPS 96
87/** The number of cache primer loops. */
88#define GIP_TSC_DELTA_PRIMER_LOOPS 4
89/** The number of loops until we keep computing the minimum read time. */
90#define GIP_TSC_DELTA_READ_TIME_LOOPS 24
91
/** @name Master / worker synchronization values.
 * All values are written to / compared against a 32-bit synchronization
 * variable, so every member of the group is an UINT32_C() constant.
 * @{ */
/** Stop measurement of TSC delta. */
#define GIP_TSC_DELTA_SYNC_STOP             UINT32_C(0)
/** Start measurement of TSC delta. */
#define GIP_TSC_DELTA_SYNC_START            UINT32_C(1)
/** Worker thread is ready for reading the TSC. */
#define GIP_TSC_DELTA_SYNC_WORKER_READY     UINT32_C(2)
/** Worker thread is done updating TSC delta info. */
#define GIP_TSC_DELTA_SYNC_WORKER_DONE      UINT32_C(3)
/** When IPRT isn't concurrent safe: Master is ready and will wait for worker
 *  with a timeout. */
#define GIP_TSC_DELTA_SYNC_PRESTART_MASTER  UINT32_C(4)
/** When IPRT isn't concurrent safe: Worker is ready after waiting for
 *  master with a timeout. */
#define GIP_TSC_DELTA_SYNC_PRESTART_WORKER  UINT32_C(5)
/** @} */
110/** The TSC-refinement interval in seconds. */
111#define GIP_TSC_REFINE_PREIOD_IN_SECS 5
112/** The TSC-delta threshold for the SUPGIPUSETSCDELTA_PRACTICALLY_ZERO rating */
113#define GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO 32
114/** The TSC-delta threshold for the SUPGIPUSETSCDELTA_ROUGHLY_ZERO rating */
115#define GIP_TSC_DELTA_THRESHOLD_ROUGHLY_ZERO 448
116/** The TSC delta value for the initial GIP master - 0 in regular builds.
117 * To test the delta code this can be set to a non-zero value. */
118#if 0
119# define GIP_TSC_DELTA_INITIAL_MASTER_VALUE INT64_C(170139095182512) /* 0x00009abd9854acb0 */
120#else
121# define GIP_TSC_DELTA_INITIAL_MASTER_VALUE INT64_C(0)
122#endif
123
124AssertCompile(GIP_TSC_DELTA_PRIMER_LOOPS < GIP_TSC_DELTA_READ_TIME_LOOPS);
125AssertCompile(GIP_TSC_DELTA_PRIMER_LOOPS + GIP_TSC_DELTA_READ_TIME_LOOPS < GIP_TSC_DELTA_LOOPS);
126
127/** @def VBOX_SVN_REV
128 * The makefile should define this if it can. */
129#ifndef VBOX_SVN_REV
130# define VBOX_SVN_REV 0
131#endif
132
133#if 0 /* Don't start the GIP timers. Useful when debugging the IPRT timer code. */
134# define DO_NOT_START_GIP
135#endif
136
137
138/*******************************************************************************
139* Internal Functions *
140*******************************************************************************/
141static DECLCALLBACK(void) supdrvGipSyncAndInvariantTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick);
142static DECLCALLBACK(void) supdrvGipAsyncTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick);
143static void supdrvGipInitCpu(PSUPGLOBALINFOPAGE pGip, PSUPGIPCPU pCpu, uint64_t u64NanoTS, uint64_t uCpuHz);
144#ifdef SUPDRV_USE_TSC_DELTA_THREAD
145static int supdrvTscDeltaThreadInit(PSUPDRVDEVEXT pDevExt);
146static void supdrvTscDeltaTerm(PSUPDRVDEVEXT pDevExt);
147static int supdrvTscDeltaThreadWaitForOnlineCpus(PSUPDRVDEVEXT pDevExt);
148#endif
149
150
151/*******************************************************************************
152* Global Variables *
153*******************************************************************************/
154DECLEXPORT(PSUPGLOBALINFOPAGE) g_pSUPGlobalInfoPage = NULL;
155
156
157
158/*
159 *
160 * Misc Common GIP Code
161 * Misc Common GIP Code
162 * Misc Common GIP Code
163 *
164 *
165 */
166
167
168/**
169 * Finds the GIP CPU index corresponding to @a idCpu.
170 *
171 * @returns GIP CPU array index, UINT32_MAX if not found.
172 * @param pGip The GIP.
173 * @param idCpu The CPU ID.
174 */
175static uint32_t supdrvGipFindCpuIndexForCpuId(PSUPGLOBALINFOPAGE pGip, RTCPUID idCpu)
176{
177 uint32_t i;
178 for (i = 0; i < pGip->cCpus; i++)
179 if (pGip->aCPUs[i].idCpu == idCpu)
180 return i;
181 return UINT32_MAX;
182}
183
184
185/**
186 * Applies the TSC delta to the supplied raw TSC value.
187 *
188 * @returns VBox status code. (Ignored by all users, just FYI.)
189 * @param pGip Pointer to the GIP.
190 * @param puTsc Pointer to a valid TSC value before the TSC delta has been applied.
191 * @param idApic The APIC ID of the CPU @c puTsc corresponds to.
192 * @param fDeltaApplied Where to store whether the TSC delta was succesfully
193 * applied or not (optional, can be NULL).
194 *
195 * @remarks Maybe called with interrupts disabled in ring-0!
196 *
197 * @note Don't you dare change the delta calculation. If you really do, make
198 * sure you update all places where it's used (IPRT, SUPLibAll.cpp,
199 * SUPDrv.c, supdrvGipMpEvent, and more).
200 */
201DECLINLINE(int) supdrvTscDeltaApply(PSUPGLOBALINFOPAGE pGip, uint64_t *puTsc, uint16_t idApic, bool *pfDeltaApplied)
202{
203 int rc;
204
205 /*
206 * Validate input.
207 */
208 AssertPtr(puTsc);
209 AssertPtr(pGip);
210 Assert(pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED);
211
212 /*
213 * Carefully convert the idApic into a GIPCPU entry.
214 */
215 if (RT_LIKELY(idApic < RT_ELEMENTS(pGip->aiCpuFromApicId)))
216 {
217 uint16_t iCpu = pGip->aiCpuFromApicId[idApic];
218 if (RT_LIKELY(iCpu < pGip->cCpus))
219 {
220 PSUPGIPCPU pGipCpu = &pGip->aCPUs[iCpu];
221
222 /*
223 * Apply the delta if valid.
224 */
225 if (RT_LIKELY(pGipCpu->i64TSCDelta != INT64_MAX))
226 {
227 *puTsc -= pGipCpu->i64TSCDelta;
228 if (pfDeltaApplied)
229 *pfDeltaApplied = true;
230 return VINF_SUCCESS;
231 }
232
233 rc = VINF_SUCCESS;
234 }
235 else
236 {
237 AssertMsgFailed(("iCpu=%u cCpus=%u\n", iCpu, pGip->cCpus));
238 rc = VERR_INVALID_CPU_INDEX;
239 }
240 }
241 else
242 {
243 AssertMsgFailed(("idApic=%u\n", idApic));
244 rc = VERR_INVALID_CPU_ID;
245 }
246 if (pfDeltaApplied)
247 *pfDeltaApplied = false;
248 return rc;
249}
250
251
252/*
253 *
254 * GIP Mapping and Unmapping Related Code.
255 * GIP Mapping and Unmapping Related Code.
256 * GIP Mapping and Unmapping Related Code.
257 *
258 *
259 */
260
261
262/**
263 * (Re-)initializes the per-cpu structure prior to starting or resuming the GIP
264 * updating.
265 *
266 * @param pGip Pointer to the GIP.
267 * @param pGipCpu The per CPU structure for this CPU.
268 * @param u64NanoTS The current time.
269 */
270static void supdrvGipReInitCpu(PSUPGLOBALINFOPAGE pGip, PSUPGIPCPU pGipCpu, uint64_t u64NanoTS)
271{
272 /*
273 * Here we don't really care about applying the TSC delta. The re-initialization of this
274 * value is not relevant especially while (re)starting the GIP as the first few ones will
275 * be ignored anyway, see supdrvGipDoUpdateCpu().
276 */
277 pGipCpu->u64TSC = ASMReadTSC() - pGipCpu->u32UpdateIntervalTSC;
278 pGipCpu->u64NanoTS = u64NanoTS;
279}
280
281
282/**
283 * Set the current TSC and NanoTS value for the CPU.
284 *
285 * @param idCpu The CPU ID. Unused - we have to use the APIC ID.
286 * @param pvUser1 Pointer to the ring-0 GIP mapping.
287 * @param pvUser2 Pointer to the variable holding the current time.
288 */
289static DECLCALLBACK(void) supdrvGipReInitCpuCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
290{
291 PSUPGLOBALINFOPAGE pGip = (PSUPGLOBALINFOPAGE)pvUser1;
292 unsigned iCpu = pGip->aiCpuFromApicId[ASMGetApicId()];
293
294 if (RT_LIKELY(iCpu < pGip->cCpus && pGip->aCPUs[iCpu].idCpu == idCpu))
295 supdrvGipReInitCpu(pGip, &pGip->aCPUs[iCpu], *(uint64_t *)pvUser2);
296
297 NOREF(pvUser2);
298 NOREF(idCpu);
299}
300
301
302/**
303 * State structure for supdrvGipDetectGetGipCpuCallback.
304 */
305typedef struct SUPDRVGIPDETECTGETCPU
306{
307 /** Bitmap of APIC IDs that has been seen (initialized to zero).
308 * Used to detect duplicate APIC IDs (paranoia). */
309 uint8_t volatile bmApicId[256 / 8];
310 /** Mask of supported GIP CPU getter methods (SUPGIPGETCPU_XXX) (all bits set
311 * initially). The callback clears the methods not detected. */
312 uint32_t volatile fSupported;
313 /** The first callback detecting any kind of range issues (initialized to
314 * NIL_RTCPUID). */
315 RTCPUID volatile idCpuProblem;
316} SUPDRVGIPDETECTGETCPU;
317/** Pointer to state structure for supdrvGipDetectGetGipCpuCallback. */
318typedef SUPDRVGIPDETECTGETCPU *PSUPDRVGIPDETECTGETCPU;
319
320
321/**
322 * Checks for alternative ways of getting the CPU ID.
323 *
324 * This also checks the APIC ID, CPU ID and CPU set index values against the
325 * GIP tables.
326 *
327 * @param idCpu The CPU ID. Unused - we have to use the APIC ID.
328 * @param pvUser1 Pointer to the state structure.
329 * @param pvUser2 Pointer to the GIP.
330 */
331static DECLCALLBACK(void) supdrvGipDetectGetGipCpuCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
332{
333 PSUPDRVGIPDETECTGETCPU pState = (PSUPDRVGIPDETECTGETCPU)pvUser1;
334 PSUPGLOBALINFOPAGE pGip = (PSUPGLOBALINFOPAGE)pvUser2;
335 uint32_t fSupported = 0;
336 uint16_t idApic;
337 int iCpuSet;
338
339 AssertMsg(idCpu == RTMpCpuId(), ("idCpu=%#x RTMpCpuId()=%#x\n", idCpu, RTMpCpuId())); /* paranoia^3 */
340
341 /*
342 * Check that the CPU ID and CPU set index are interchangable.
343 */
344 iCpuSet = RTMpCpuIdToSetIndex(idCpu);
345 if ((RTCPUID)iCpuSet == idCpu)
346 {
347 AssertCompile(RT_IS_POWER_OF_TWO(RTCPUSET_MAX_CPUS));
348 if ( iCpuSet >= 0
349 && iCpuSet < RTCPUSET_MAX_CPUS
350 && RT_IS_POWER_OF_TWO(RTCPUSET_MAX_CPUS))
351 {
352 /*
353 * Check whether the IDTR.LIMIT contains a CPU number.
354 */
355#ifdef RT_ARCH_X86
356 uint16_t const cbIdt = sizeof(X86DESC64SYSTEM) * 256;
357#else
358 uint16_t const cbIdt = sizeof(X86DESCGATE) * 256;
359#endif
360 RTIDTR Idtr;
361 ASMGetIDTR(&Idtr);
362 if (Idtr.cbIdt >= cbIdt)
363 {
364 uint32_t uTmp = Idtr.cbIdt - cbIdt;
365 uTmp &= RTCPUSET_MAX_CPUS - 1;
366 if (uTmp == idCpu)
367 {
368 RTIDTR Idtr2;
369 ASMGetIDTR(&Idtr2);
370 if (Idtr2.cbIdt == Idtr.cbIdt)
371 fSupported |= SUPGIPGETCPU_IDTR_LIMIT_MASK_MAX_SET_CPUS;
372 }
373 }
374
375 /*
376 * Check whether RDTSCP is an option.
377 */
378 if (ASMHasCpuId())
379 {
380 if ( ASMIsValidExtRange(ASMCpuId_EAX(UINT32_C(0x80000000)))
381 && (ASMCpuId_EDX(UINT32_C(0x80000001)) & X86_CPUID_EXT_FEATURE_EDX_RDTSCP) )
382 {
383 uint32_t uAux;
384 ASMReadTscWithAux(&uAux);
385 if ((uAux & (RTCPUSET_MAX_CPUS - 1)) == idCpu)
386 {
387 ASMNopPause();
388 ASMReadTscWithAux(&uAux);
389 if ((uAux & (RTCPUSET_MAX_CPUS - 1)) == idCpu)
390 fSupported |= SUPGIPGETCPU_RDTSCP_MASK_MAX_SET_CPUS;
391 }
392 }
393 }
394 }
395 }
396
397 /*
398 * Check that the APIC ID is unique.
399 */
400 idApic = ASMGetApicId();
401 if (RT_LIKELY( idApic < RT_ELEMENTS(pGip->aiCpuFromApicId)
402 && !ASMAtomicBitTestAndSet(pState->bmApicId, idApic)))
403 fSupported |= SUPGIPGETCPU_APIC_ID;
404 else
405 {
406 AssertCompile(sizeof(pState->bmApicId) * 8 == RT_ELEMENTS(pGip->aiCpuFromApicId));
407 ASMAtomicCmpXchgU32(&pState->idCpuProblem, idCpu, NIL_RTCPUID);
408 LogRel(("supdrvGipDetectGetGipCpuCallback: idCpu=%#x iCpuSet=%d idApic=%#x - duplicate APIC ID.\n",
409 idCpu, iCpuSet, idApic));
410 }
411
412 /*
413 * Check that the iCpuSet is within the expected range.
414 */
415 if (RT_UNLIKELY( iCpuSet < 0
416 || (unsigned)iCpuSet >= RTCPUSET_MAX_CPUS
417 || (unsigned)iCpuSet >= RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)))
418 {
419 ASMAtomicCmpXchgU32(&pState->idCpuProblem, idCpu, NIL_RTCPUID);
420 LogRel(("supdrvGipDetectGetGipCpuCallback: idCpu=%#x iCpuSet=%d idApic=%#x - CPU set index is out of range.\n",
421 idCpu, iCpuSet, idApic));
422 }
423 else
424 {
425 RTCPUID idCpu2 = RTMpCpuIdFromSetIndex(iCpuSet);
426 if (RT_UNLIKELY(idCpu2 != idCpu))
427 {
428 ASMAtomicCmpXchgU32(&pState->idCpuProblem, idCpu, NIL_RTCPUID);
429 LogRel(("supdrvGipDetectGetGipCpuCallback: idCpu=%#x iCpuSet=%d idApic=%#x - CPU id/index roundtrip problem: %#x\n",
430 idCpu, iCpuSet, idApic, idCpu2));
431 }
432 }
433
434 /*
435 * Update the supported feature mask before we return.
436 */
437 ASMAtomicAndU32(&pState->fSupported, fSupported);
438
439 NOREF(pvUser2);
440}
441
442
443/**
444 * Increase the timer freqency on hosts where this is possible (NT).
445 *
446 * The idea is that more interrupts is better for us... Also, it's better than
447 * we increase the timer frequence, because we might end up getting inaccurate
448 * callbacks if someone else does it.
449 *
450 * @param pDevExt Sets u32SystemTimerGranularityGrant if increased.
451 */
452static void supdrvGipRequestHigherTimerFrequencyFromSystem(PSUPDRVDEVEXT pDevExt)
453{
454 if (pDevExt->u32SystemTimerGranularityGrant == 0)
455 {
456 uint32_t u32SystemResolution;
457 if ( RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 976563 /* 1024 HZ */, &u32SystemResolution))
458 || RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 1000000 /* 1000 HZ */, &u32SystemResolution))
459 || RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 1953125 /* 512 HZ */, &u32SystemResolution))
460 || RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 2000000 /* 500 HZ */, &u32SystemResolution))
461 )
462 {
463 Assert(RTTimerGetSystemGranularity() <= u32SystemResolution);
464 pDevExt->u32SystemTimerGranularityGrant = u32SystemResolution;
465 }
466 }
467}
468
469
470/**
471 * Undoes supdrvGipRequestHigherTimerFrequencyFromSystem.
472 *
473 * @param pDevExt Clears u32SystemTimerGranularityGrant.
474 */
475static void supdrvGipReleaseHigherTimerFrequencyFromSystem(PSUPDRVDEVEXT pDevExt)
476{
477 if (pDevExt->u32SystemTimerGranularityGrant)
478 {
479 int rc2 = RTTimerReleaseSystemGranularity(pDevExt->u32SystemTimerGranularityGrant);
480 AssertRC(rc2);
481 pDevExt->u32SystemTimerGranularityGrant = 0;
482 }
483}
484
485
486/**
487 * Maps the GIP into userspace and/or get the physical address of the GIP.
488 *
489 * @returns IPRT status code.
490 * @param pSession Session to which the GIP mapping should belong.
491 * @param ppGipR3 Where to store the address of the ring-3 mapping. (optional)
492 * @param pHCPhysGip Where to store the physical address. (optional)
493 *
494 * @remark There is no reference counting on the mapping, so one call to this function
495 * count globally as one reference. One call to SUPR0GipUnmap() is will unmap GIP
496 * and remove the session as a GIP user.
497 */
498SUPR0DECL(int) SUPR0GipMap(PSUPDRVSESSION pSession, PRTR3PTR ppGipR3, PRTHCPHYS pHCPhysGip)
499{
500 int rc;
501 PSUPDRVDEVEXT pDevExt = pSession->pDevExt;
502 RTR3PTR pGipR3 = NIL_RTR3PTR;
503 RTHCPHYS HCPhys = NIL_RTHCPHYS;
504 LogFlow(("SUPR0GipMap: pSession=%p ppGipR3=%p pHCPhysGip=%p\n", pSession, ppGipR3, pHCPhysGip));
505
506 /*
507 * Validate
508 */
509 AssertReturn(SUP_IS_SESSION_VALID(pSession), VERR_INVALID_PARAMETER);
510 AssertPtrNullReturn(ppGipR3, VERR_INVALID_POINTER);
511 AssertPtrNullReturn(pHCPhysGip, VERR_INVALID_POINTER);
512
513#ifdef SUPDRV_USE_MUTEX_FOR_GIP
514 RTSemMutexRequest(pDevExt->mtxGip, RT_INDEFINITE_WAIT);
515#else
516 RTSemFastMutexRequest(pDevExt->mtxGip);
517#endif
518 if (pDevExt->pGip)
519 {
520 /*
521 * Map it?
522 */
523 rc = VINF_SUCCESS;
524 if (ppGipR3)
525 {
526 if (pSession->GipMapObjR3 == NIL_RTR0MEMOBJ)
527 rc = RTR0MemObjMapUser(&pSession->GipMapObjR3, pDevExt->GipMemObj, (RTR3PTR)-1, 0,
528 RTMEM_PROT_READ, RTR0ProcHandleSelf());
529 if (RT_SUCCESS(rc))
530 pGipR3 = RTR0MemObjAddressR3(pSession->GipMapObjR3);
531 }
532
533 /*
534 * Get physical address.
535 */
536 if (pHCPhysGip && RT_SUCCESS(rc))
537 HCPhys = pDevExt->HCPhysGip;
538
539 /*
540 * Reference globally.
541 */
542 if (!pSession->fGipReferenced && RT_SUCCESS(rc))
543 {
544 pSession->fGipReferenced = 1;
545 pDevExt->cGipUsers++;
546 if (pDevExt->cGipUsers == 1)
547 {
548 PSUPGLOBALINFOPAGE pGipR0 = pDevExt->pGip;
549 uint64_t u64NanoTS;
550
551 /*
552 * GIP starts/resumes updating again. On windows we bump the
553 * host timer frequency to make sure we don't get stuck in guest
554 * mode and to get better timer (and possibly clock) accuracy.
555 */
556 LogFlow(("SUPR0GipMap: Resumes GIP updating\n"));
557
558 supdrvGipRequestHigherTimerFrequencyFromSystem(pDevExt);
559
560 /*
561 * document me
562 */
563 if (pGipR0->aCPUs[0].u32TransactionId != 2 /* not the first time */)
564 {
565 unsigned i;
566 for (i = 0; i < pGipR0->cCpus; i++)
567 ASMAtomicUoWriteU32(&pGipR0->aCPUs[i].u32TransactionId,
568 (pGipR0->aCPUs[i].u32TransactionId + GIP_UPDATEHZ_RECALC_FREQ * 2)
569 & ~(GIP_UPDATEHZ_RECALC_FREQ * 2 - 1));
570 ASMAtomicWriteU64(&pGipR0->u64NanoTSLastUpdateHz, 0);
571 }
572
573 /*
574 * document me
575 */
576 u64NanoTS = RTTimeSystemNanoTS() - pGipR0->u32UpdateIntervalNS;
577 if ( pGipR0->u32Mode == SUPGIPMODE_INVARIANT_TSC
578 || pGipR0->u32Mode == SUPGIPMODE_SYNC_TSC
579 || RTMpGetOnlineCount() == 1)
580 supdrvGipReInitCpu(pGipR0, &pGipR0->aCPUs[0], u64NanoTS);
581 else
582 RTMpOnAll(supdrvGipReInitCpuCallback, pGipR0, &u64NanoTS);
583
584 /*
585 * Detect alternative ways to figure the CPU ID in ring-3 and
586 * raw-mode context. Check the sanity of the APIC IDs, CPU IDs,
587 * and CPU set indexes while we're at it.
588 */
589 if (RT_SUCCESS(rc))
590 {
591 SUPDRVGIPDETECTGETCPU DetectState;
592 RT_BZERO((void *)&DetectState.bmApicId, sizeof(DetectState.bmApicId));
593 DetectState.fSupported = UINT32_MAX;
594 DetectState.idCpuProblem = NIL_RTCPUID;
595 rc = RTMpOnAll(supdrvGipDetectGetGipCpuCallback, &DetectState, pGipR0);
596 if (DetectState.idCpuProblem == NIL_RTCPUID)
597 {
598 if ( DetectState.fSupported != UINT32_MAX
599 && DetectState.fSupported != 0)
600 {
601 if (pGipR0->fGetGipCpu != DetectState.fSupported)
602 {
603 pGipR0->fGetGipCpu = DetectState.fSupported;
604 LogRel(("SUPR0GipMap: fGetGipCpu=%#x\n", DetectState.fSupported));
605 }
606 }
607 else
608 {
609 LogRel(("SUPR0GipMap: No supported ways of getting the APIC ID or CPU number in ring-3! (%#x)\n",
610 DetectState.fSupported));
611 rc = VERR_UNSUPPORTED_CPU;
612 }
613 }
614 else
615 {
616 LogRel(("SUPR0GipMap: APIC ID, CPU ID or CPU set index problem detected on CPU #%u (%#x)!\n",
617 DetectState.idCpuProblem, DetectState.idCpuProblem));
618 rc = VERR_INVALID_CPU_ID;
619 }
620 }
621
622 /*
623 * Start the GIP timer if all is well..
624 */
625 if (RT_SUCCESS(rc))
626 {
627#ifndef DO_NOT_START_GIP
628 rc = RTTimerStart(pDevExt->pGipTimer, 0 /* fire ASAP */); AssertRC(rc);
629#endif
630 rc = VINF_SUCCESS;
631 }
632
633 /*
634 * Bail out on error.
635 */
636 if (RT_FAILURE(rc))
637 {
638 LogRel(("SUPR0GipMap: failed rc=%Rrc\n", rc));
639 pDevExt->cGipUsers = 0;
640 pSession->fGipReferenced = 0;
641 if (pSession->GipMapObjR3 != NIL_RTR0MEMOBJ)
642 {
643 int rc2 = RTR0MemObjFree(pSession->GipMapObjR3, false); AssertRC(rc2);
644 if (RT_SUCCESS(rc2))
645 pSession->GipMapObjR3 = NIL_RTR0MEMOBJ;
646 }
647 HCPhys = NIL_RTHCPHYS;
648 pGipR3 = NIL_RTR3PTR;
649 }
650 }
651 }
652 }
653 else
654 {
655 rc = VERR_GENERAL_FAILURE;
656 Log(("SUPR0GipMap: GIP is not available!\n"));
657 }
658#ifdef SUPDRV_USE_MUTEX_FOR_GIP
659 RTSemMutexRelease(pDevExt->mtxGip);
660#else
661 RTSemFastMutexRelease(pDevExt->mtxGip);
662#endif
663
664 /*
665 * Write returns.
666 */
667 if (pHCPhysGip)
668 *pHCPhysGip = HCPhys;
669 if (ppGipR3)
670 *ppGipR3 = pGipR3;
671
672#ifdef DEBUG_DARWIN_GIP
673 OSDBGPRINT(("SUPR0GipMap: returns %d *pHCPhysGip=%lx pGipR3=%p\n", rc, (unsigned long)HCPhys, (void *)pGipR3));
674#else
675 LogFlow(( "SUPR0GipMap: returns %d *pHCPhysGip=%lx pGipR3=%p\n", rc, (unsigned long)HCPhys, (void *)pGipR3));
676#endif
677 return rc;
678}
679
680
681/**
682 * Unmaps any user mapping of the GIP and terminates all GIP access
683 * from this session.
684 *
685 * @returns IPRT status code.
686 * @param pSession Session to which the GIP mapping should belong.
687 */
688SUPR0DECL(int) SUPR0GipUnmap(PSUPDRVSESSION pSession)
689{
690 int rc = VINF_SUCCESS;
691 PSUPDRVDEVEXT pDevExt = pSession->pDevExt;
692#ifdef DEBUG_DARWIN_GIP
693 OSDBGPRINT(("SUPR0GipUnmap: pSession=%p pGip=%p GipMapObjR3=%p\n",
694 pSession,
695 pSession->GipMapObjR3 != NIL_RTR0MEMOBJ ? RTR0MemObjAddress(pSession->GipMapObjR3) : NULL,
696 pSession->GipMapObjR3));
697#else
698 LogFlow(("SUPR0GipUnmap: pSession=%p\n", pSession));
699#endif
700 AssertReturn(SUP_IS_SESSION_VALID(pSession), VERR_INVALID_PARAMETER);
701
702#ifdef SUPDRV_USE_MUTEX_FOR_GIP
703 RTSemMutexRequest(pDevExt->mtxGip, RT_INDEFINITE_WAIT);
704#else
705 RTSemFastMutexRequest(pDevExt->mtxGip);
706#endif
707
708 /*
709 * Unmap anything?
710 */
711 if (pSession->GipMapObjR3 != NIL_RTR0MEMOBJ)
712 {
713 rc = RTR0MemObjFree(pSession->GipMapObjR3, false);
714 AssertRC(rc);
715 if (RT_SUCCESS(rc))
716 pSession->GipMapObjR3 = NIL_RTR0MEMOBJ;
717 }
718
719 /*
720 * Dereference global GIP.
721 */
722 if (pSession->fGipReferenced && !rc)
723 {
724 pSession->fGipReferenced = 0;
725 if ( pDevExt->cGipUsers > 0
726 && !--pDevExt->cGipUsers)
727 {
728 LogFlow(("SUPR0GipUnmap: Suspends GIP updating\n"));
729#ifndef DO_NOT_START_GIP
730 rc = RTTimerStop(pDevExt->pGipTimer); AssertRC(rc); rc = VINF_SUCCESS;
731#endif
732 supdrvGipReleaseHigherTimerFrequencyFromSystem(pDevExt);
733 }
734 }
735
736#ifdef SUPDRV_USE_MUTEX_FOR_GIP
737 RTSemMutexRelease(pDevExt->mtxGip);
738#else
739 RTSemFastMutexRelease(pDevExt->mtxGip);
740#endif
741
742 return rc;
743}
744
745
746/**
747 * Gets the GIP pointer.
748 *
749 * @returns Pointer to the GIP or NULL.
750 */
751SUPDECL(PSUPGLOBALINFOPAGE) SUPGetGIP(void)
752{
753 return g_pSUPGlobalInfoPage;
754}
755
756
757
758
759
760/*
761 *
762 *
763 * GIP Initialization, Termination and CPU Offline / Online Related Code.
764 * GIP Initialization, Termination and CPU Offline / Online Related Code.
765 * GIP Initialization, Termination and CPU Offline / Online Related Code.
766 *
767 *
768 */
769
770/**
771 * Used by supdrvInitRefineInvariantTscFreqTimer and supdrvGipInitMeasureTscFreq
772 * to update the TSC frequency related GIP variables.
773 *
774 * @param pGip The GIP.
775 * @param nsElapsed The number of nano seconds elapsed.
776 * @param cElapsedTscTicks The corresponding number of TSC ticks.
777 */
778static void supdrvGipInitSetCpuFreq(PSUPGLOBALINFOPAGE pGip, uint64_t nsElapsed, uint64_t cElapsedTscTicks)
779{
780 /*
781 * Calculate the frequency.
782 */
783 uint64_t uCpuHz;
784 if ( cElapsedTscTicks < UINT64_MAX / RT_NS_1SEC
785 && nsElapsed < UINT32_MAX)
786 uCpuHz = ASMMultU64ByU32DivByU32(cElapsedTscTicks, RT_NS_1SEC, (uint32_t)nsElapsed);
787 else
788 {
789 RTUINT128U CpuHz, Tmp, Divisor;
790 CpuHz.s.Lo = CpuHz.s.Hi = 0;
791 RTUInt128MulU64ByU64(&Tmp, cElapsedTscTicks, RT_NS_1SEC_64);
792 RTUInt128Div(&CpuHz, &Tmp, RTUInt128AssignU64(&Divisor, nsElapsed));
793 uCpuHz = CpuHz.s.Lo;
794 }
795
796 /*
797 * Update the GIP.
798 */
799 ASMAtomicWriteU64(&pGip->u64CpuHz, uCpuHz);
800 if (pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
801 ASMAtomicWriteU64(&pGip->aCPUs[0].u64CpuHz, uCpuHz);
802}
803
804
805/**
806 * Timer callback function for TSC frequency refinement in invariant GIP mode.
807 *
808 * This is started during driver init and fires once
809 * GIP_TSC_REFINE_PREIOD_IN_SECS seconds later.
810 *
811 * @param pTimer The timer.
812 * @param pvUser Opaque pointer to the device instance data.
813 * @param iTick The timer tick.
814 */
815static DECLCALLBACK(void) supdrvInitRefineInvariantTscFreqTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
816{
817 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
818 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
819 RTCPUID idCpu;
820 uint64_t cNsElapsed;
821 uint64_t cTscTicksElapsed;
822 uint64_t nsNow;
823 uint64_t uTsc;
824 RTCCUINTREG uFlags;
825
826 /* Paranoia. */
827 AssertReturnVoid(pGip);
828 AssertReturnVoid(pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC);
829
830 /*
831 * Try get close to the next clock tick as usual.
832 *
833 * PORTME: If timers are called from the clock interrupt handler, or
834 * an interrupt handler with higher priority than the clock
835 * interrupt, or spinning for ages in timer handlers is frowned
836 * upon, this look must be disabled!
837 *
838 * Darwin, FreeBSD, Linux, Solaris, Windows 8.1+:
839 * High RTTimeSystemNanoTS resolution should prevent any noticable
840 * spinning her.
841 *
842 * Windows 8.0 and earlier:
843 * We're running in a DPC here, so we may trigger the DPC watchdog?
844 *
845 * OS/2:
846 * Timer callbacks are done in the clock interrupt, so skip it.
847 */
848#if !defined(RT_OS_OS2)
849 nsNow = RTTimeSystemNanoTS();
850 while (RTTimeSystemNanoTS() == nsNow)
851 ASMNopPause();
852#endif
853
854 uFlags = ASMIntDisableFlags();
855 uTsc = ASMReadTSC();
856 nsNow = RTTimeSystemNanoTS();
857 idCpu = RTMpCpuId();
858 ASMSetFlags(uFlags);
859
860 cNsElapsed = nsNow - pDevExt->nsStartInvarTscRefine;
861 cTscTicksElapsed = uTsc - pDevExt->uTscStartInvarTscRefine;
862
863 /*
864 * If the above measurement was taken on a different CPU than the one we
865 * started the rprocess on, cTscTicksElapsed will need to be adjusted with
866 * the TSC deltas of both the CPUs.
867 *
868 * We ASSUME that the delta calculation process takes less time than the
869 * TSC frequency refinement timer. If it doesn't, we'll complain and
870 * drop the frequency refinement.
871 *
872 * Note! We cannot entirely trust enmUseTscDelta here because it's
873 * downgraded after each delta calculation.
874 */
875 if ( idCpu != pDevExt->idCpuInvarTscRefine
876 && pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
877 {
878 uint32_t iStartCpuSet = RTMpCpuIdToSetIndex(pDevExt->idCpuInvarTscRefine);
879 uint32_t iStopCpuSet = RTMpCpuIdToSetIndex(idCpu);
880 uint16_t iStartGipCpu = iStartCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
881 ? pGip->aiCpuFromCpuSetIdx[iStartCpuSet] : UINT16_MAX;
882 uint16_t iStopGipCpu = iStopCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
883 ? pGip->aiCpuFromCpuSetIdx[iStopCpuSet] : UINT16_MAX;
884 int64_t iStartTscDelta = iStartGipCpu < pGip->cCpus ? pGip->aCPUs[iStartGipCpu].i64TSCDelta : INT64_MAX;
885 int64_t iStopTscDelta = iStopGipCpu < pGip->cCpus ? pGip->aCPUs[iStopGipCpu].i64TSCDelta : INT64_MAX;
886 if (RT_LIKELY(iStartTscDelta != INT64_MAX && iStopGipCpu != INT64_MAX))
887 {
888 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_PRACTICALLY_ZERO)
889 {
890 /* cTscTicksElapsed = (uTsc - iStopTscDelta) - (pDevExt->uTscStartInvarTscRefine - iStartTscDelta); */
891 cTscTicksElapsed += iStartTscDelta - iStopTscDelta;
892 }
893 }
894 /*
895 * Allow 5 times the refinement period to elapse before we give up on the TSC delta
896 * calculations.
897 */
898 else if (cNsElapsed <= GIP_TSC_REFINE_PREIOD_IN_SECS * 5 * RT_NS_1SEC_64)
899 {
900 int rc = RTTimerStart(pTimer, RT_NS_1SEC);
901 AssertRC(rc);
902 return;
903 }
904 else
905 {
906 SUPR0Printf("vboxdrv: Failed to refine invariant TSC frequency because deltas are unavailable after %u (%u) seconds\n",
907 (uint32_t)(cNsElapsed / RT_NS_1SEC), GIP_TSC_REFINE_PREIOD_IN_SECS);
908 SUPR0Printf("vboxdrv: start: %u, %u, %#llx stop: %u, %u, %#llx\n",
909 iStartCpuSet, iStartGipCpu, iStartTscDelta, iStopCpuSet, iStopGipCpu, iStopTscDelta);
910 return;
911 }
912 }
913
914 /*
915 * Calculate and update the CPU frequency variables in GIP.
916 *
917 * If there is a GIP user already and we've already refined the frequency
918 * a couple of times, don't update it as we want a stable frequency value
919 * for all VMs.
920 */
921 if ( pDevExt->cGipUsers == 0
922 || cNsElapsed < RT_NS_1SEC * 2)
923 {
924 supdrvGipInitSetCpuFreq(pGip, cNsElapsed, cTscTicksElapsed);
925
926 /*
927 * Reschedule the timer if we haven't yet reached the defined refinement period.
928 */
929 if (cNsElapsed < GIP_TSC_REFINE_PREIOD_IN_SECS * RT_NS_1SEC_64)
930 {
931 int rc = RTTimerStart(pTimer, RT_NS_1SEC);
932 AssertRC(rc);
933 }
934 }
935}
936
937
938/**
939 * Start the TSC-frequency refinment timer for the invariant TSC GIP mode.
940 *
941 * We cannot use this in the synchronous and asynchronous tsc GIP modes because
942 * the CPU may change the TSC frequence between now and when the timer fires
943 * (supdrvInitAsyncRefineTscTimer).
944 *
945 * @param pDevExt Pointer to the device instance data.
946 * @param pGip Pointer to the GIP.
947 */
948static void supdrvGipInitStartTimerForRefiningInvariantTscFreq(PSUPDRVDEVEXT pDevExt, PSUPGLOBALINFOPAGE pGip)
949{
950 uint64_t u64NanoTS;
951 RTCCUINTREG uFlags;
952 int rc;
953
954 /*
955 * Record the TSC and NanoTS as the starting anchor point for refinement
956 * of the TSC. We try get as close to a clock tick as possible on systems
957 * which does not provide high resolution time.
958 */
959 u64NanoTS = RTTimeSystemNanoTS();
960 while (RTTimeSystemNanoTS() == u64NanoTS)
961 ASMNopPause();
962
963 uFlags = ASMIntDisableFlags();
964 pDevExt->uTscStartInvarTscRefine = ASMReadTSC();
965 pDevExt->nsStartInvarTscRefine = RTTimeSystemNanoTS();
966 pDevExt->idCpuInvarTscRefine = RTMpCpuId();
967 ASMSetFlags(uFlags);
968
969 /*
970 * Create a timer that runs on the same CPU so we won't have a depencency
971 * on the TSC-delta and can run in parallel to it. On systems that does not
972 * implement CPU specific timers we'll apply deltas in the timer callback,
973 * just like we do for CPUs going offline.
974 *
975 * The longer the refinement interval the better the accuracy, at least in
976 * theory. If it's too long though, ring-3 may already be starting its
977 * first VMs before we're done. On most systems we will be loading the
978 * support driver during boot and VMs won't be started for a while yet,
979 * it is really only a problem during development (especiall with
980 * on-demand driver starting on windows).
981 *
982 * To avoid wasting time doing a long supdrvGipInitMeasureTscFreq call
983 * to calculate the frequencey during driver loading, the timer is set
984 * to fire after 200 ms the first time. It will then reschedule itself
985 * to fire every second until GIP_TSC_REFINE_PREIOD_IN_SECS has been
986 * reached or it notices that there is a user land client with GIP
987 * mapped (we want a stable frequency for all VMs).
988 */
989 rc = RTTimerCreateEx(&pDevExt->pInvarTscRefineTimer, 0 /* one-shot */,
990 RTTIMER_FLAGS_CPU(RTMpCpuIdToSetIndex(pDevExt->idCpuInvarTscRefine)),
991 supdrvInitRefineInvariantTscFreqTimer, pDevExt);
992 if (RT_SUCCESS(rc))
993 {
994 rc = RTTimerStart(pDevExt->pInvarTscRefineTimer, 2*RT_NS_100MS);
995 if (RT_SUCCESS(rc))
996 return;
997 RTTimerDestroy(pDevExt->pInvarTscRefineTimer);
998 }
999
1000 if (rc == VERR_CPU_OFFLINE || rc == VERR_NOT_SUPPORTED)
1001 {
1002 rc = RTTimerCreateEx(&pDevExt->pInvarTscRefineTimer, 0 /* one-shot */, RTTIMER_FLAGS_CPU_ANY,
1003 supdrvInitRefineInvariantTscFreqTimer, pDevExt);
1004 if (RT_SUCCESS(rc))
1005 {
1006 rc = RTTimerStart(pDevExt->pInvarTscRefineTimer, 2*RT_NS_100MS);
1007 if (RT_SUCCESS(rc))
1008 return;
1009 RTTimerDestroy(pDevExt->pInvarTscRefineTimer);
1010 }
1011 }
1012
1013 pDevExt->pInvarTscRefineTimer = NULL;
1014 OSDBGPRINT(("vboxdrv: Failed to create or start TSC frequency refinement timer: rc=%Rrc\n", rc));
1015}
1016
1017
1018/**
1019 * @callback_method_impl{PFNRTMPWORKER,
1020 * RTMpOnSpecific callback for reading TSC and time on the CPU we started
1021 * the measurements on.}
1022 */
1023DECLCALLBACK(void) supdrvGipInitReadTscAndNanoTsOnCpu(RTCPUID idCpu, void *pvUser1, void *pvUser2)
1024{
1025 RTCCUINTREG uFlags = ASMIntDisableFlags();
1026 uint64_t *puTscStop = (uint64_t *)pvUser1;
1027 uint64_t *pnsStop = (uint64_t *)pvUser2;
1028
1029 *puTscStop = ASMReadTSC();
1030 *pnsStop = RTTimeSystemNanoTS();
1031
1032 ASMSetFlags(uFlags);
1033}
1034
1035
1036/**
1037 * Measures the TSC frequency of the system.
1038 *
1039 * The TSC frequency can vary on systems which are not reported as invariant.
1040 * On such systems the object of this function is to find out what the nominal,
1041 * maximum TSC frequency under 'normal' CPU operation.
1042 *
1043 * @returns VBox status code.
1044 * @param pDevExt Pointer to the device instance.
1045 * @param pGip Pointer to the GIP.
1046 * @param fRough Set if we're doing the rough calculation that the
1047 * TSC measuring code needs, where accuracy isn't all
1048 * that important (too high is better than to low).
1049 * When clear we try for best accuracy that we can
1050 * achieve in reasonably short time.
1051 */
1052static int supdrvGipInitMeasureTscFreq(PSUPDRVDEVEXT pDevExt, PSUPGLOBALINFOPAGE pGip, bool fRough)
1053{
1054 uint32_t nsTimerIncr = RTTimerGetSystemGranularity();
1055 int cTriesLeft = fRough ? 4 : 2;
1056 while (cTriesLeft-- > 0)
1057 {
1058 RTCCUINTREG uFlags;
1059 uint64_t nsStart;
1060 uint64_t nsStop;
1061 uint64_t uTscStart;
1062 uint64_t uTscStop;
1063 RTCPUID idCpuStart;
1064 RTCPUID idCpuStop;
1065
1066 /*
1067 * Synchronize with the host OS clock tick on systems without high
1068 * resolution time API (older Windows version for example).
1069 */
1070 nsStart = RTTimeSystemNanoTS();
1071 while (RTTimeSystemNanoTS() == nsStart)
1072 ASMNopPause();
1073
1074 /*
1075 * Read the TSC and current time, noting which CPU we're on.
1076 */
1077 uFlags = ASMIntDisableFlags();
1078 uTscStart = ASMReadTSC();
1079 nsStart = RTTimeSystemNanoTS();
1080 idCpuStart = RTMpCpuId();
1081 ASMSetFlags(uFlags);
1082
1083 /*
1084 * Delay for a while.
1085 */
1086 if (pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC)
1087 {
1088 /*
1089 * Sleep-wait since the TSC frequency is constant, it eases host load.
1090 * Shorter interval produces more variance in the frequency (esp. Windows).
1091 */
1092 uint64_t msElapsed = 0;
1093 uint64_t msDelay = ( ((fRough ? 16 : 200) * RT_NS_1MS + nsTimerIncr - 1) / nsTimerIncr * nsTimerIncr - RT_NS_100US )
1094 / RT_NS_1MS;
1095 do
1096 {
1097 RTThreadSleep((RTMSINTERVAL)(msDelay - msElapsed));
1098 nsStop = RTTimeSystemNanoTS();
1099 msElapsed = (nsStop - nsStart) / RT_NS_1MS;
1100 } while (msElapsed < msDelay);
1101
1102 while (RTTimeSystemNanoTS() == nsStop)
1103 ASMNopPause();
1104 }
1105 else
1106 {
1107 /*
1108 * Busy-wait keeping the frequency up.
1109 */
1110 do
1111 {
1112 ASMNopPause();
1113 nsStop = RTTimeSystemNanoTS();
1114 } while (nsStop - nsStart < RT_NS_100MS);
1115 }
1116
1117 /*
1118 * Read the TSC and time again.
1119 */
1120 uFlags = ASMIntDisableFlags();
1121 uTscStop = ASMReadTSC();
1122 nsStop = RTTimeSystemNanoTS();
1123 idCpuStop = RTMpCpuId();
1124 ASMSetFlags(uFlags);
1125
1126 /*
1127 * If the CPU changes things get a bit complicated and what we
1128 * can get away with depends on the GIP mode / TSC reliablity.
1129 */
1130 if (idCpuStop != idCpuStart)
1131 {
1132 bool fDoXCall = false;
1133
1134 /*
1135 * Synchronous TSC mode: we're probably fine as it's unlikely
1136 * that we were rescheduled because of TSC throttling or power
1137 * management reasons, so just go ahead.
1138 */
1139 if (pGip->u32Mode == SUPGIPMODE_SYNC_TSC)
1140 {
1141 /* Probably ok, maybe we should retry once?. */
1142 Assert(pGip->enmUseTscDelta == SUPGIPUSETSCDELTA_NOT_APPLICABLE);
1143 }
1144 /*
1145 * If we're just doing the rough measurement, do the cross call and
1146 * get on with things (we don't have deltas!).
1147 */
1148 else if (fRough)
1149 fDoXCall = true;
1150 /*
1151 * Invariant TSC mode: It doesn't matter if we have delta available
1152 * for both CPUs. That is not something we can assume at this point.
1153 *
1154 * Note! We cannot necessarily trust enmUseTscDelta here because it's
1155 * downgraded after each delta calculation and the delta
1156 * calculations may not be complete yet.
1157 */
1158 else if (pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC)
1159 {
1160/** @todo This section of code is never reached atm, consider dropping it later on... */
1161 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
1162 {
1163 uint32_t iStartCpuSet = RTMpCpuIdToSetIndex(idCpuStart);
1164 uint32_t iStopCpuSet = RTMpCpuIdToSetIndex(idCpuStop);
1165 uint16_t iStartGipCpu = iStartCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
1166 ? pGip->aiCpuFromCpuSetIdx[iStartCpuSet] : UINT16_MAX;
1167 uint16_t iStopGipCpu = iStopCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
1168 ? pGip->aiCpuFromCpuSetIdx[iStopCpuSet] : UINT16_MAX;
1169 int64_t iStartTscDelta = iStartGipCpu < pGip->cCpus ? pGip->aCPUs[iStartGipCpu].i64TSCDelta : INT64_MAX;
1170 int64_t iStopTscDelta = iStopGipCpu < pGip->cCpus ? pGip->aCPUs[iStopGipCpu].i64TSCDelta : INT64_MAX;
1171 if (RT_LIKELY(iStartTscDelta != INT64_MAX && iStopGipCpu != INT64_MAX))
1172 {
1173 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_PRACTICALLY_ZERO)
1174 {
1175 uTscStart -= iStartTscDelta;
1176 uTscStop -= iStopTscDelta;
1177 }
1178 }
1179 /*
1180 * Invalid CPU indexes are not caused by online/offline races, so
1181 * we have to trigger driver load failure if that happens as GIP
1182 * and IPRT assumptions are busted on this system.
1183 */
1184 else if (iStopGipCpu >= pGip->cCpus || iStartGipCpu >= pGip->cCpus)
1185 {
1186 SUPR0Printf("vboxdrv: Unexpected CPU index in supdrvGipInitMeasureTscFreq.\n");
1187 SUPR0Printf("vboxdrv: start: %u, %u, %#llx stop: %u, %u, %#llx\n",
1188 iStartCpuSet, iStartGipCpu, iStartTscDelta, iStopCpuSet, iStopGipCpu, iStopTscDelta);
1189 return VERR_INVALID_CPU_INDEX;
1190 }
1191 /*
1192 * No valid deltas. We retry, if we're on our last retry
1193 * we do the cross call instead just to get a result. The
1194 * frequency will be refined in a few seconds anyways.
1195 */
1196 else if (cTriesLeft > 0)
1197 continue;
1198 else
1199 fDoXCall = true;
1200 }
1201 }
1202 /*
1203 * Asynchronous TSC mode: This is bad as the reason we usually
1204 * use this mode is to deal with variable TSC frequencies and
1205 * deltas. So, we need to get the TSC from the same CPU as
1206 * started it, we also need to keep that CPU busy. So, retry
1207 * and fall back to the cross call on the last attempt.
1208 */
1209 else
1210 {
1211 Assert(pGip->u32Mode == SUPGIPMODE_ASYNC_TSC);
1212 if (cTriesLeft > 0)
1213 continue;
1214 fDoXCall = true;
1215 }
1216
1217 if (fDoXCall)
1218 {
1219 /*
1220 * Try read the TSC and timestamp on the start CPU.
1221 */
1222 int rc = RTMpOnSpecific(idCpuStart, supdrvGipInitReadTscAndNanoTsOnCpu, &uTscStop, &nsStop);
1223 if (RT_FAILURE(rc) && (!fRough || cTriesLeft > 0))
1224 continue;
1225 }
1226 }
1227
1228 /*
1229 * Calculate the TSC frequency and update it (shared with the refinement timer).
1230 */
1231 supdrvGipInitSetCpuFreq(pGip, nsStop - nsStart, uTscStop - uTscStart);
1232 return VINF_SUCCESS;
1233 }
1234
1235 Assert(!fRough);
1236 return VERR_SUPDRV_TSC_FREQ_MEASUREMENT_FAILED;
1237}
1238
1239
1240/**
1241 * Finds our (@a idCpu) entry, or allocates a new one if not found.
1242 *
1243 * @returns Index of the CPU in the cache set.
1244 * @param pGip The GIP.
1245 * @param idCpu The CPU ID.
1246 */
1247static uint32_t supdrvGipFindOrAllocCpuIndexForCpuId(PSUPGLOBALINFOPAGE pGip, RTCPUID idCpu)
1248{
1249 uint32_t i, cTries;
1250
1251 /*
1252 * ASSUMES that CPU IDs are constant.
1253 */
1254 for (i = 0; i < pGip->cCpus; i++)
1255 if (pGip->aCPUs[i].idCpu == idCpu)
1256 return i;
1257
1258 cTries = 0;
1259 do
1260 {
1261 for (i = 0; i < pGip->cCpus; i++)
1262 {
1263 bool fRc;
1264 ASMAtomicCmpXchgSize(&pGip->aCPUs[i].idCpu, idCpu, NIL_RTCPUID, fRc);
1265 if (fRc)
1266 return i;
1267 }
1268 } while (cTries++ < 32);
1269 AssertReleaseFailed();
1270 return i - 1;
1271}
1272
1273
1274/**
1275 * The calling CPU should be accounted as online, update GIP accordingly.
1276 *
1277 * This is used by supdrvGipCreate() as well as supdrvGipMpEvent().
1278 *
1279 * @param pDevExt The device extension.
1280 * @param idCpu The CPU ID.
1281 */
1282static void supdrvGipMpEventOnlineOrInitOnCpu(PSUPDRVDEVEXT pDevExt, RTCPUID idCpu)
1283{
1284 int iCpuSet = 0;
1285 uint16_t idApic = UINT16_MAX;
1286 uint32_t i = 0;
1287 uint64_t u64NanoTS = 0;
1288 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
1289
1290 AssertPtrReturnVoid(pGip);
1291 AssertRelease(idCpu == RTMpCpuId());
1292 Assert(pGip->cPossibleCpus == RTMpGetCount());
1293
1294 /*
1295 * Do this behind a spinlock with interrupts disabled as this can fire
1296 * on all CPUs simultaneously, see @bugref{6110}.
1297 */
1298 RTSpinlockAcquire(pDevExt->hGipSpinlock);
1299
1300 /*
1301 * Update the globals.
1302 */
1303 ASMAtomicWriteU16(&pGip->cPresentCpus, RTMpGetPresentCount());
1304 ASMAtomicWriteU16(&pGip->cOnlineCpus, RTMpGetOnlineCount());
1305 iCpuSet = RTMpCpuIdToSetIndex(idCpu);
1306 if (iCpuSet >= 0)
1307 {
1308 Assert(RTCpuSetIsMemberByIndex(&pGip->PossibleCpuSet, iCpuSet));
1309 RTCpuSetAddByIndex(&pGip->OnlineCpuSet, iCpuSet);
1310 RTCpuSetAddByIndex(&pGip->PresentCpuSet, iCpuSet);
1311 }
1312
1313 /*
1314 * Update the entry.
1315 */
1316 u64NanoTS = RTTimeSystemNanoTS() - pGip->u32UpdateIntervalNS;
1317 i = supdrvGipFindOrAllocCpuIndexForCpuId(pGip, idCpu);
1318
1319 supdrvGipInitCpu(pGip, &pGip->aCPUs[i], u64NanoTS, pGip->u64CpuHz);
1320
1321 idApic = ASMGetApicId();
1322 ASMAtomicWriteU16(&pGip->aCPUs[i].idApic, idApic);
1323 ASMAtomicWriteS16(&pGip->aCPUs[i].iCpuSet, (int16_t)iCpuSet);
1324 ASMAtomicWriteSize(&pGip->aCPUs[i].idCpu, idCpu);
1325
1326 /*
1327 * Update the APIC ID and CPU set index mappings.
1328 */
1329 ASMAtomicWriteU16(&pGip->aiCpuFromApicId[idApic], i);
1330 ASMAtomicWriteU16(&pGip->aiCpuFromCpuSetIdx[iCpuSet], i);
1331
1332 /* Update the Mp online/offline counter. */
1333 ASMAtomicIncU32(&pDevExt->cMpOnOffEvents);
1334
1335 /* Add this CPU to the set of CPUs for which we need to calculate their TSC-deltas. */
1336 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
1337 {
1338 RTCpuSetAddByIndex(&pDevExt->TscDeltaCpuSet, iCpuSet);
1339#ifdef SUPDRV_USE_TSC_DELTA_THREAD
1340 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
1341 if ( pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Listening
1342 || pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Measuring)
1343 {
1344 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_WaitAndMeasure;
1345 }
1346 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
1347#endif
1348 }
1349
1350 /* commit it */
1351 ASMAtomicWriteSize(&pGip->aCPUs[i].enmState, SUPGIPCPUSTATE_ONLINE);
1352
1353 RTSpinlockRelease(pDevExt->hGipSpinlock);
1354}
1355
1356
1357/**
1358 * The CPU should be accounted as offline, update the GIP accordingly.
1359 *
1360 * This is used by supdrvGipMpEvent.
1361 *
1362 * @param pDevExt The device extension.
1363 * @param idCpu The CPU ID.
1364 */
1365static void supdrvGipMpEventOffline(PSUPDRVDEVEXT pDevExt, RTCPUID idCpu)
1366{
1367 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
1368 int iCpuSet;
1369 unsigned i;
1370
1371 AssertPtrReturnVoid(pGip);
1372 RTSpinlockAcquire(pDevExt->hGipSpinlock);
1373
1374 iCpuSet = RTMpCpuIdToSetIndex(idCpu);
1375 AssertReturnVoid(iCpuSet >= 0);
1376
1377 i = pGip->aiCpuFromCpuSetIdx[iCpuSet];
1378 AssertReturnVoid(i < pGip->cCpus);
1379 AssertReturnVoid(pGip->aCPUs[i].idCpu == idCpu);
1380
1381 Assert(RTCpuSetIsMemberByIndex(&pGip->PossibleCpuSet, iCpuSet));
1382 RTCpuSetDelByIndex(&pGip->OnlineCpuSet, iCpuSet);
1383
1384 /* Update the Mp online/offline counter. */
1385 ASMAtomicIncU32(&pDevExt->cMpOnOffEvents);
1386
1387 /* If we are the initiator going offline while measuring the TSC delta, unspin other waiting CPUs! */
1388 if (ASMAtomicReadU32(&pDevExt->idTscDeltaInitiator) == idCpu)
1389 {
1390 ASMAtomicWriteU32(&pDevExt->pTscDeltaSync->u, GIP_TSC_DELTA_SYNC_START);
1391 ASMAtomicWriteU64(&pGip->aCPUs[i].u64TSCSample, ~GIP_TSC_DELTA_RSVD);
1392 }
1393
1394 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
1395 {
1396 /* Reset the TSC delta, we will recalculate it lazily. */
1397 ASMAtomicWriteS64(&pGip->aCPUs[i].i64TSCDelta, INT64_MAX);
1398 /* Remove this CPU from the set of CPUs that we have obtained the TSC deltas. */
1399 RTCpuSetDelByIndex(&pDevExt->TscDeltaObtainedCpuSet, iCpuSet);
1400 }
1401
1402 /* commit it */
1403 ASMAtomicWriteSize(&pGip->aCPUs[i].enmState, SUPGIPCPUSTATE_OFFLINE);
1404
1405 RTSpinlockRelease(pDevExt->hGipSpinlock);
1406}
1407
1408
1409/**
1410 * Multiprocessor event notification callback.
1411 *
1412 * This is used to make sure that the GIP master gets passed on to
1413 * another CPU. It also updates the associated CPU data.
1414 *
1415 * @param enmEvent The event.
1416 * @param idCpu The cpu it applies to.
1417 * @param pvUser Pointer to the device extension.
1418 *
1419 * @remarks This function -must- fire on the newly online'd CPU for the
1420 * RTMPEVENT_ONLINE case and can fire on any CPU for the
1421 * RTMPEVENT_OFFLINE case.
1422 */
1423static DECLCALLBACK(void) supdrvGipMpEvent(RTMPEVENT enmEvent, RTCPUID idCpu, void *pvUser)
1424{
1425 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
1426 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
1427
1428 AssertRelease(!RTThreadPreemptIsEnabled(NIL_RTTHREAD));
1429
1430 /*
1431 * Update the GIP CPU data.
1432 */
1433 if (pGip)
1434 {
1435 switch (enmEvent)
1436 {
1437 case RTMPEVENT_ONLINE:
1438 AssertRelease(idCpu == RTMpCpuId());
1439 supdrvGipMpEventOnlineOrInitOnCpu(pDevExt, idCpu);
1440 break;
1441 case RTMPEVENT_OFFLINE:
1442 supdrvGipMpEventOffline(pDevExt, idCpu);
1443 break;
1444 }
1445 }
1446
1447 /*
1448 * Make sure there is a master GIP.
1449 */
1450 if (enmEvent == RTMPEVENT_OFFLINE)
1451 {
1452 RTCPUID idGipMaster = ASMAtomicReadU32(&pDevExt->idGipMaster);
1453 if (idGipMaster == idCpu)
1454 {
1455 /*
1456 * The GIP master is going offline, find a new one.
1457 */
1458 bool fIgnored;
1459 unsigned i;
1460 RTCPUID idNewGipMaster = NIL_RTCPUID;
1461 RTCPUSET OnlineCpus;
1462 RTMpGetOnlineSet(&OnlineCpus);
1463
1464 for (i = 0; i < RTCPUSET_MAX_CPUS; i++)
1465 if (RTCpuSetIsMemberByIndex(&OnlineCpus, i))
1466 {
1467 RTCPUID idCurCpu = RTMpCpuIdFromSetIndex(i);
1468 if (idCurCpu != idGipMaster)
1469 {
1470 idNewGipMaster = idCurCpu;
1471 break;
1472 }
1473 }
1474
1475 Log(("supdrvGipMpEvent: Gip master %#lx -> %#lx\n", (long)idGipMaster, (long)idNewGipMaster));
1476 ASMAtomicCmpXchgSize(&pDevExt->idGipMaster, idNewGipMaster, idGipMaster, fIgnored);
1477 NOREF(fIgnored);
1478 }
1479 }
1480}
1481
1482
1483/**
1484 * On CPU initialization callback for RTMpOnAll.
1485 *
1486 * @param idCpu The CPU ID.
1487 * @param pvUser1 The device extension.
1488 * @param pvUser2 The GIP.
1489 */
1490static DECLCALLBACK(void) supdrvGipInitOnCpu(RTCPUID idCpu, void *pvUser1, void *pvUser2)
1491{
1492 /* This is good enough, even though it will update some of the globals a
1493 bit to much. */
1494 supdrvGipMpEventOnlineOrInitOnCpu((PSUPDRVDEVEXT)pvUser1, idCpu);
1495}
1496
1497
1498/**
1499 * Callback used by supdrvDetermineAsyncTSC to read the TSC on a CPU.
1500 *
1501 * @param idCpu Ignored.
1502 * @param pvUser1 Where to put the TSC.
1503 * @param pvUser2 Ignored.
1504 */
1505static DECLCALLBACK(void) supdrvGipInitDetermineAsyncTscWorker(RTCPUID idCpu, void *pvUser1, void *pvUser2)
1506{
1507 ASMAtomicWriteU64((uint64_t volatile *)pvUser1, ASMReadTSC());
1508}
1509
1510
1511/**
1512 * Determine if Async GIP mode is required because of TSC drift.
1513 *
1514 * When using the default/normal timer code it is essential that the time stamp counter
1515 * (TSC) runs never backwards, that is, a read operation to the counter should return
1516 * a bigger value than any previous read operation. This is guaranteed by the latest
1517 * AMD CPUs and by newer Intel CPUs which never enter the C2 state (P4). In any other
1518 * case we have to choose the asynchronous timer mode.
1519 *
1520 * @param poffMin Pointer to the determined difference between different
1521 * cores (optional, can be NULL).
1522 * @return false if the time stamp counters appear to be synchronized, true otherwise.
1523 */
1524static bool supdrvGipInitDetermineAsyncTsc(uint64_t *poffMin)
1525{
1526 /*
1527 * Just iterate all the cpus 8 times and make sure that the TSC is
1528 * ever increasing. We don't bother taking TSC rollover into account.
1529 */
1530 int iEndCpu = RTMpGetArraySize();
1531 int iCpu;
1532 int cLoops = 8;
1533 bool fAsync = false;
1534 int rc = VINF_SUCCESS;
1535 uint64_t offMax = 0;
1536 uint64_t offMin = ~(uint64_t)0;
1537 uint64_t PrevTsc = ASMReadTSC();
1538
1539 while (cLoops-- > 0)
1540 {
1541 for (iCpu = 0; iCpu < iEndCpu; iCpu++)
1542 {
1543 uint64_t CurTsc;
1544 rc = RTMpOnSpecific(RTMpCpuIdFromSetIndex(iCpu), supdrvGipInitDetermineAsyncTscWorker, &CurTsc, NULL);
1545 if (RT_SUCCESS(rc))
1546 {
1547 if (CurTsc <= PrevTsc)
1548 {
1549 fAsync = true;
1550 offMin = offMax = PrevTsc - CurTsc;
1551 Log(("supdrvGipInitDetermineAsyncTsc: iCpu=%d cLoops=%d CurTsc=%llx PrevTsc=%llx\n",
1552 iCpu, cLoops, CurTsc, PrevTsc));
1553 break;
1554 }
1555
1556 /* Gather statistics (except the first time). */
1557 if (iCpu != 0 || cLoops != 7)
1558 {
1559 uint64_t off = CurTsc - PrevTsc;
1560 if (off < offMin)
1561 offMin = off;
1562 if (off > offMax)
1563 offMax = off;
1564 Log2(("%d/%d: off=%llx\n", cLoops, iCpu, off));
1565 }
1566
1567 /* Next */
1568 PrevTsc = CurTsc;
1569 }
1570 else if (rc == VERR_NOT_SUPPORTED)
1571 break;
1572 else
1573 AssertMsg(rc == VERR_CPU_NOT_FOUND || rc == VERR_CPU_OFFLINE, ("%d\n", rc));
1574 }
1575
1576 /* broke out of the loop. */
1577 if (iCpu < iEndCpu)
1578 break;
1579 }
1580
1581 if (poffMin)
1582 *poffMin = offMin; /* Almost RTMpOnSpecific profiling. */
1583 Log(("supdrvGipInitDetermineAsyncTsc: returns %d; iEndCpu=%d rc=%d offMin=%llx offMax=%llx\n",
1584 fAsync, iEndCpu, rc, offMin, offMax));
1585#if !defined(RT_OS_SOLARIS) && !defined(RT_OS_OS2) && !defined(RT_OS_WINDOWS)
1586 OSDBGPRINT(("vboxdrv: fAsync=%d offMin=%#lx offMax=%#lx\n", fAsync, (long)offMin, (long)offMax));
1587#endif
1588 return fAsync;
1589}
1590
1591
1592/**
1593 * supdrvGipInit() worker that determines the GIP TSC mode.
1594 *
1595 * @returns The most suitable TSC mode.
1596 * @param pDevExt Pointer to the device instance data.
1597 */
1598static SUPGIPMODE supdrvGipInitDetermineTscMode(PSUPDRVDEVEXT pDevExt)
1599{
1600 uint64_t u64DiffCoresIgnored;
1601 uint32_t uEAX, uEBX, uECX, uEDX;
1602
1603 /*
1604 * Establish whether the CPU advertises TSC as invariant, we need that in
1605 * a couple of places below.
1606 */
1607 bool fInvariantTsc = false;
1608 if (ASMHasCpuId())
1609 {
1610 uEAX = ASMCpuId_EAX(0x80000000);
1611 if (ASMIsValidExtRange(uEAX) && uEAX >= 0x80000007)
1612 {
1613 uEDX = ASMCpuId_EDX(0x80000007);
1614 if (uEDX & X86_CPUID_AMD_ADVPOWER_EDX_TSCINVAR)
1615 fInvariantTsc = true;
1616 }
1617 }
1618
1619 /*
1620 * On single CPU systems, we don't need to consider ASYNC mode.
1621 */
1622 if (RTMpGetCount() <= 1)
1623 return fInvariantTsc ? SUPGIPMODE_INVARIANT_TSC : SUPGIPMODE_SYNC_TSC;
1624
1625 /*
1626 * Allow the user and/or OS specific bits to force async mode.
1627 */
1628 if (supdrvOSGetForcedAsyncTscMode(pDevExt))
1629 return SUPGIPMODE_ASYNC_TSC;
1630
1631 /*
1632 * Use invariant mode if the CPU says TSC is invariant.
1633 */
1634 if (fInvariantTsc)
1635 return SUPGIPMODE_INVARIANT_TSC;
1636
1637 /*
1638 * TSC is not invariant and we're on SMP, this presents two problems:
1639 *
1640 * (1) There might be a skew between the CPU, so that cpu0
1641 * returns a TSC that is slightly different from cpu1.
1642 * This screw may be due to (2), bad TSC initialization
1643 * or slightly different TSC rates.
1644 *
1645 * (2) Power management (and other things) may cause the TSC
1646 * to run at a non-constant speed, and cause the speed
1647 * to be different on the cpus. This will result in (1).
1648 *
1649 * If any of the above is detected, we will have to use ASYNC mode.
1650 */
1651 /* (1). Try check for current differences between the cpus. */
1652 if (supdrvGipInitDetermineAsyncTsc(&u64DiffCoresIgnored))
1653 return SUPGIPMODE_ASYNC_TSC;
1654
1655 /* (2) If it's an AMD CPU with power management, we won't trust its TSC. */
1656 ASMCpuId(0, &uEAX, &uEBX, &uECX, &uEDX);
1657 if ( ASMIsValidStdRange(uEAX)
1658 && ASMIsAmdCpuEx(uEBX, uECX, uEDX))
1659 {
1660 /* Check for APM support. */
1661 uEAX = ASMCpuId_EAX(0x80000000);
1662 if (ASMIsValidExtRange(uEAX) && uEAX >= 0x80000007)
1663 {
1664 uEDX = ASMCpuId_EDX(0x80000007);
1665 if (uEDX & 0x3e) /* STC|TM|THERMTRIP|VID|FID. Ignore TS. */
1666 return SUPGIPMODE_ASYNC_TSC;
1667 }
1668 }
1669
1670 return SUPGIPMODE_SYNC_TSC;
1671}
1672
1673
1674/**
1675 * Initializes per-CPU GIP information.
1676 *
1677 * @param pGip Pointer to the GIP.
1678 * @param pCpu Pointer to which GIP CPU to initalize.
1679 * @param u64NanoTS The current nanosecond timestamp.
1680 * @param uCpuHz The CPU frequency to set, 0 if the caller doesn't know.
1681 */
1682static void supdrvGipInitCpu(PSUPGLOBALINFOPAGE pGip, PSUPGIPCPU pCpu, uint64_t u64NanoTS, uint64_t uCpuHz)
1683{
1684 pCpu->u32TransactionId = 2;
1685 pCpu->u64NanoTS = u64NanoTS;
1686 pCpu->u64TSC = ASMReadTSC();
1687 pCpu->u64TSCSample = GIP_TSC_DELTA_RSVD;
1688 pCpu->i64TSCDelta = pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED ? INT64_MAX : 0;
1689
1690 ASMAtomicWriteSize(&pCpu->enmState, SUPGIPCPUSTATE_INVALID);
1691 ASMAtomicWriteSize(&pCpu->idCpu, NIL_RTCPUID);
1692 ASMAtomicWriteS16(&pCpu->iCpuSet, -1);
1693 ASMAtomicWriteU16(&pCpu->idApic, UINT16_MAX);
1694
1695 /*
1696 * The first time we're called, we don't have a CPU frequency handy,
1697 * so pretend it's a 4 GHz CPU. On CPUs that are online, we'll get
1698 * called again and at that point we have a more plausible CPU frequency
1699 * value handy. The frequency history will also be adjusted again on
1700 * the 2nd timer callout (maybe we can skip that now?).
1701 */
1702 if (!uCpuHz)
1703 {
1704 pCpu->u64CpuHz = _4G - 1;
1705 pCpu->u32UpdateIntervalTSC = (uint32_t)((_4G - 1) / pGip->u32UpdateHz);
1706 }
1707 else
1708 {
1709 pCpu->u64CpuHz = uCpuHz;
1710 pCpu->u32UpdateIntervalTSC = (uint32_t)(uCpuHz / pGip->u32UpdateHz);
1711 }
1712 pCpu->au32TSCHistory[0]
1713 = pCpu->au32TSCHistory[1]
1714 = pCpu->au32TSCHistory[2]
1715 = pCpu->au32TSCHistory[3]
1716 = pCpu->au32TSCHistory[4]
1717 = pCpu->au32TSCHistory[5]
1718 = pCpu->au32TSCHistory[6]
1719 = pCpu->au32TSCHistory[7]
1720 = pCpu->u32UpdateIntervalTSC;
1721}
1722
1723
1724/**
1725 * Initializes the GIP data.
1726 *
1727 * @param pDevExt Pointer to the device instance data.
1728 * @param pGip Pointer to the read-write kernel mapping of the GIP.
1729 * @param HCPhys The physical address of the GIP.
1730 * @param u64NanoTS The current nanosecond timestamp.
1731 * @param uUpdateHz The update frequency.
1732 * @param uUpdateIntervalNS The update interval in nanoseconds.
1733 * @param cCpus The CPU count.
1734 */
1735static void supdrvGipInit(PSUPDRVDEVEXT pDevExt, PSUPGLOBALINFOPAGE pGip, RTHCPHYS HCPhys,
1736 uint64_t u64NanoTS, unsigned uUpdateHz, unsigned uUpdateIntervalNS, unsigned cCpus)
1737{
1738 size_t const cbGip = RT_ALIGN_Z(RT_OFFSETOF(SUPGLOBALINFOPAGE, aCPUs[cCpus]), PAGE_SIZE);
1739 unsigned i;
1740#ifdef DEBUG_DARWIN_GIP
1741 OSDBGPRINT(("supdrvGipInit: pGip=%p HCPhys=%lx u64NanoTS=%llu uUpdateHz=%d cCpus=%u\n", pGip, (long)HCPhys, u64NanoTS, uUpdateHz, cCpus));
1742#else
1743 LogFlow(("supdrvGipInit: pGip=%p HCPhys=%lx u64NanoTS=%llu uUpdateHz=%d cCpus=%u\n", pGip, (long)HCPhys, u64NanoTS, uUpdateHz, cCpus));
1744#endif
1745
1746 /*
1747 * Initialize the structure.
1748 */
1749 memset(pGip, 0, cbGip);
1750
1751 pGip->u32Magic = SUPGLOBALINFOPAGE_MAGIC;
1752 pGip->u32Version = SUPGLOBALINFOPAGE_VERSION;
1753 pGip->u32Mode = supdrvGipInitDetermineTscMode(pDevExt);
1754 if ( pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC
1755 /*|| pGip->u32Mode == SUPGIPMODE_SYNC_TSC */)
1756 pGip->enmUseTscDelta = supdrvOSAreTscDeltasInSync() /* Allow OS override (windows). */
1757 ? SUPGIPUSETSCDELTA_ZERO_CLAIMED : SUPGIPUSETSCDELTA_PRACTICALLY_ZERO /* downgrade later */;
1758 else
1759 pGip->enmUseTscDelta = SUPGIPUSETSCDELTA_NOT_APPLICABLE;
1760 pGip->cCpus = (uint16_t)cCpus;
1761 pGip->cPages = (uint16_t)(cbGip / PAGE_SIZE);
1762 pGip->u32UpdateHz = uUpdateHz;
1763 pGip->u32UpdateIntervalNS = uUpdateIntervalNS;
1764 pGip->fGetGipCpu = SUPGIPGETCPU_APIC_ID;
1765 RTCpuSetEmpty(&pGip->OnlineCpuSet);
1766 RTCpuSetEmpty(&pGip->PresentCpuSet);
1767 RTMpGetSet(&pGip->PossibleCpuSet);
1768 pGip->cOnlineCpus = RTMpGetOnlineCount();
1769 pGip->cPresentCpus = RTMpGetPresentCount();
1770 pGip->cPossibleCpus = RTMpGetCount();
1771 pGip->idCpuMax = RTMpGetMaxCpuId();
1772 for (i = 0; i < RT_ELEMENTS(pGip->aiCpuFromApicId); i++)
1773 pGip->aiCpuFromApicId[i] = UINT16_MAX;
1774 for (i = 0; i < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx); i++)
1775 pGip->aiCpuFromCpuSetIdx[i] = UINT16_MAX;
1776 for (i = 0; i < cCpus; i++)
1777 supdrvGipInitCpu(pGip, &pGip->aCPUs[i], u64NanoTS, 0 /*uCpuHz*/);
1778
1779 /*
1780 * Link it to the device extension.
1781 */
1782 pDevExt->pGip = pGip;
1783 pDevExt->HCPhysGip = HCPhys;
1784 pDevExt->cGipUsers = 0;
1785}
1786
1787
1788/**
1789 * Creates the GIP.
1790 *
1791 * @returns VBox status code.
1792 * @param pDevExt Instance data. GIP stuff may be updated.
1793 */
1794int VBOXCALL supdrvGipCreate(PSUPDRVDEVEXT pDevExt)
1795{
1796 PSUPGLOBALINFOPAGE pGip;
1797 RTHCPHYS HCPhysGip;
1798 uint32_t u32SystemResolution;
1799 uint32_t u32Interval;
1800 uint32_t u32MinInterval;
1801 uint32_t uMod;
1802 unsigned cCpus;
1803 int rc;
1804
1805 LogFlow(("supdrvGipCreate:\n"));
1806
1807 /* Assert order. */
1808 Assert(pDevExt->u32SystemTimerGranularityGrant == 0);
1809 Assert(pDevExt->GipMemObj == NIL_RTR0MEMOBJ);
1810 Assert(!pDevExt->pGipTimer);
1811
1812 /*
1813 * Check the CPU count.
1814 */
1815 cCpus = RTMpGetArraySize();
1816 if ( cCpus > RTCPUSET_MAX_CPUS
1817 || cCpus > 256 /* ApicId is used for the mappings */)
1818 {
1819 SUPR0Printf("VBoxDrv: Too many CPUs (%u) for the GIP (max %u)\n", cCpus, RT_MIN(RTCPUSET_MAX_CPUS, 256));
1820 return VERR_TOO_MANY_CPUS;
1821 }
1822
1823 /*
1824 * Allocate a contiguous set of pages with a default kernel mapping.
1825 */
1826 rc = RTR0MemObjAllocCont(&pDevExt->GipMemObj, RT_UOFFSETOF(SUPGLOBALINFOPAGE, aCPUs[cCpus]), false /*fExecutable*/);
1827 if (RT_FAILURE(rc))
1828 {
1829 OSDBGPRINT(("supdrvGipCreate: failed to allocate the GIP page. rc=%d\n", rc));
1830 return rc;
1831 }
1832 pGip = (PSUPGLOBALINFOPAGE)RTR0MemObjAddress(pDevExt->GipMemObj); AssertPtr(pGip);
1833 HCPhysGip = RTR0MemObjGetPagePhysAddr(pDevExt->GipMemObj, 0); Assert(HCPhysGip != NIL_RTHCPHYS);
1834
1835 /*
1836 * Allocate the TSC-delta sync struct on a separate cache line.
1837 */
1838 pDevExt->pvTscDeltaSync = RTMemAllocZ(sizeof(SUPTSCDELTASYNC) + 63);
1839 pDevExt->pTscDeltaSync = RT_ALIGN_PT(pDevExt->pvTscDeltaSync, 64, PSUPTSCDELTASYNC);
1840 Assert(RT_ALIGN_PT(pDevExt->pTscDeltaSync, 64, PSUPTSCDELTASYNC) == pDevExt->pTscDeltaSync);
1841
1842 /*
1843 * Find a reasonable update interval and initialize the structure.
1844 */
1845 supdrvGipRequestHigherTimerFrequencyFromSystem(pDevExt);
1846 /** @todo figure out why using a 100Ms interval upsets timekeeping in VMs.
1847 * See @bugref{6710}. */
1848 u32MinInterval = RT_NS_10MS;
1849 u32SystemResolution = RTTimerGetSystemGranularity();
1850 u32Interval = u32MinInterval;
1851 uMod = u32MinInterval % u32SystemResolution;
1852 if (uMod)
1853 u32Interval += u32SystemResolution - uMod;
1854
1855 supdrvGipInit(pDevExt, pGip, HCPhysGip, RTTimeSystemNanoTS(), RT_NS_1SEC / u32Interval /*=Hz*/, u32Interval, cCpus);
1856
1857 /*
1858 * Important sanity check...
1859 */
1860 if (RT_UNLIKELY( pGip->enmUseTscDelta == SUPGIPUSETSCDELTA_ZERO_CLAIMED
1861 && pGip->u32Mode == SUPGIPMODE_ASYNC_TSC
1862 && !supdrvOSGetForcedAsyncTscMode(pDevExt)))
1863 {
1864 /* Basically, invariant Windows boxes, should never be detected as async (i.e. TSC-deltas should be 0). */
1865 OSDBGPRINT(("supdrvGipCreate: The TSC-deltas should be normalized by the host OS, but verifying shows it's not!\n"));
1866 return VERR_INTERNAL_ERROR_2;
1867 }
1868
1869 /*
1870 * Do the TSC frequency measurements.
1871 *
1872 * If we're in invariant TSC mode, just to a quick preliminary measurement
1873 * that the TSC-delta measurement code can use to yield cross calls.
1874 *
1875 * If we're in any of the other two modes, neither which require MP init,
1876 * notifications or deltas for the job, do the full measurement now so
1877 * that supdrvGipInitOnCpu can populate the TSC interval and history
1878 * array with more reasonable values.
1879 */
1880 if (pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC)
1881 {
1882 rc = supdrvGipInitMeasureTscFreq(pDevExt, pGip, true /*fRough*/); /* cannot fail */
1883 supdrvGipInitStartTimerForRefiningInvariantTscFreq(pDevExt, pGip);
1884 }
1885 else
1886 rc = supdrvGipInitMeasureTscFreq(pDevExt, pGip, false /*fRough*/);
1887 if (RT_SUCCESS(rc))
1888 {
1889 /*
1890 * Start TSC-delta measurement thread before we start getting MP
1891 * events that will try kick it into action (includes the
1892 * RTMpOnAll/supdrvGipInitOnCpu call below).
1893 */
1894 RTCpuSetEmpty(&pDevExt->TscDeltaCpuSet);
1895 RTCpuSetEmpty(&pDevExt->TscDeltaObtainedCpuSet);
1896#ifdef SUPDRV_USE_TSC_DELTA_THREAD
1897 if ( pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED
1898 && pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC)
1899 rc = supdrvTscDeltaThreadInit(pDevExt);
1900#endif
1901 if (RT_SUCCESS(rc))
1902 {
1903 rc = RTMpNotificationRegister(supdrvGipMpEvent, pDevExt);
1904 if (RT_SUCCESS(rc))
1905 {
1906 /*
1907 * Do GIP initialization on all online CPUs. Wake up the
1908 * TSC-delta thread afterwards.
1909 */
1910 rc = RTMpOnAll(supdrvGipInitOnCpu, pDevExt, pGip);
1911 if (RT_SUCCESS(rc))
1912 {
1913#ifdef SUPDRV_USE_TSC_DELTA_THREAD
1914 if (pDevExt->hTscDeltaThread != NIL_RTTHREAD)
1915 RTThreadUserSignal(pDevExt->hTscDeltaThread);
1916#else
1917 uint16_t iCpu;
1918 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
1919 {
1920 /*
1921 * Measure the TSC deltas now that we have MP notifications.
1922 */
1923 int cTries = 5;
1924 do
1925 {
1926 rc = supdrvMeasureInitialTscDeltas(pDevExt);
1927 if ( rc != VERR_TRY_AGAIN
1928 && rc != VERR_CPU_OFFLINE)
1929 break;
1930 } while (--cTries > 0);
1931 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
1932 Log(("supdrvTscDeltaInit: cpu[%u] delta %lld\n", iCpu, pGip->aCPUs[iCpu].i64TSCDelta));
1933 }
1934 else
1935 {
1936 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
1937 AssertMsg(!pGip->aCPUs[iCpu].i64TSCDelta, ("iCpu=%u %lld mode=%d\n", iCpu, pGip->aCPUs[iCpu].i64TSCDelta, pGip->u32Mode));
1938 }
1939 if (RT_SUCCESS(rc))
1940#endif
1941 {
1942 /*
1943 * Create the timer.
1944 * If CPU_ALL isn't supported we'll have to fall back to synchronous mode.
1945 */
1946 if (pGip->u32Mode == SUPGIPMODE_ASYNC_TSC)
1947 {
1948 rc = RTTimerCreateEx(&pDevExt->pGipTimer, u32Interval, RTTIMER_FLAGS_CPU_ALL,
1949 supdrvGipAsyncTimer, pDevExt);
1950 if (rc == VERR_NOT_SUPPORTED)
1951 {
1952 OSDBGPRINT(("supdrvGipCreate: omni timer not supported, falling back to synchronous mode\n"));
1953 pGip->u32Mode = SUPGIPMODE_SYNC_TSC;
1954 }
1955 }
1956 if (pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
1957 rc = RTTimerCreateEx(&pDevExt->pGipTimer, u32Interval, 0 /* fFlags */,
1958 supdrvGipSyncAndInvariantTimer, pDevExt);
1959 if (RT_SUCCESS(rc))
1960 {
1961 /*
1962 * We're good.
1963 */
1964 Log(("supdrvGipCreate: %u ns interval.\n", u32Interval));
1965 supdrvGipReleaseHigherTimerFrequencyFromSystem(pDevExt);
1966
1967 g_pSUPGlobalInfoPage = pGip;
1968 return VINF_SUCCESS;
1969 }
1970
1971 OSDBGPRINT(("supdrvGipCreate: failed create GIP timer at %u ns interval. rc=%Rrc\n", u32Interval, rc));
1972 Assert(!pDevExt->pGipTimer);
1973 }
1974 }
1975 else
1976 OSDBGPRINT(("supdrvGipCreate: RTMpOnAll failed. rc=%Rrc\n", rc));
1977 }
1978 else
1979 OSDBGPRINT(("supdrvGipCreate: failed to register MP event notfication. rc=%Rrc\n", rc));
1980 }
1981 else
1982 OSDBGPRINT(("supdrvGipCreate: supdrvTscDeltaInit failed. rc=%Rrc\n", rc));
1983 }
1984 else
1985 OSDBGPRINT(("supdrvGipCreate: supdrvMeasureInitialTscDeltas failed. rc=%Rrc\n", rc));
1986
1987 /* Releases timer frequency increase too. */
1988 supdrvGipDestroy(pDevExt);
1989 return rc;
1990}
1991
1992
1993/**
1994 * Invalidates the GIP data upon termination.
1995 *
1996 * @param pGip Pointer to the read-write kernel mapping of the GIP.
1997 */
1998static void supdrvGipTerm(PSUPGLOBALINFOPAGE pGip)
1999{
2000 unsigned i;
2001 pGip->u32Magic = 0;
2002 for (i = 0; i < pGip->cCpus; i++)
2003 {
2004 pGip->aCPUs[i].u64NanoTS = 0;
2005 pGip->aCPUs[i].u64TSC = 0;
2006 pGip->aCPUs[i].iTSCHistoryHead = 0;
2007 pGip->aCPUs[i].u64TSCSample = 0;
2008 pGip->aCPUs[i].i64TSCDelta = INT64_MAX;
2009 }
2010}
2011
2012
/**
 * Terminates the GIP.
 *
 * Teardown order: MP notification registration, TSC-delta thread (only when
 * SUPDRV_USE_TSC_DELTA_THREAD is defined), invariant TSC refinement timer,
 * TSC-delta sync buffer, GIP data, GIP update timer, GIP memory object and
 * finally the system timer resolution request.
 *
 * The timers, the sync buffer, the GIP pointer and the memory object are only
 * released when they are set, and each is reset after release.  The function
 * is also invoked on the failure path at the end of the GIP creation code
 * preceding it in this file.
 *
 * @param   pDevExt     Instance data. GIP stuff may be updated.
 */
void VBOXCALL supdrvGipDestroy(PSUPDRVDEVEXT pDevExt)
{
    int rc; /* Only ever inspected through AssertRC below. */
#ifdef DEBUG_DARWIN_GIP
    OSDBGPRINT(("supdrvGipDestroy: pDevExt=%p pGip=%p pGipTimer=%p GipMemObj=%p\n", pDevExt,
                pDevExt->GipMemObj != NIL_RTR0MEMOBJ ? RTR0MemObjAddress(pDevExt->GipMemObj) : NULL,
                pDevExt->pGipTimer, pDevExt->GipMemObj));
#endif

    /*
     * Stop receiving MP notifications before tearing anything else down.
     */
    RTMpNotificationDeregister(supdrvGipMpEvent, pDevExt);

#ifdef SUPDRV_USE_TSC_DELTA_THREAD
    /*
     * Terminate the TSC-delta measurement thread and resources.
     */
    supdrvTscDeltaTerm(pDevExt);
#endif

    /*
     * Destroy the TSC-refinement timer.
     */
    if (pDevExt->pInvarTscRefineTimer)
    {
        RTTimerDestroy(pDevExt->pInvarTscRefineTimer);
        pDevExt->pInvarTscRefineTimer = NULL;
    }

    /*
     * Free the TSC-delta sync buffer.  Only pvTscDeltaSync is handed to
     * RTMemFree; pTscDeltaSync is merely reset (presumably it points into the
     * same allocation - confirm against the allocation site).
     */
    if (pDevExt->pvTscDeltaSync)
    {
        RTMemFree(pDevExt->pvTscDeltaSync);
        pDevExt->pTscDeltaSync = NULL;
        pDevExt->pvTscDeltaSync = NULL;
    }

    /*
     * Invalidate the GIP data.
     *
     * NOTE(review): pDevExt->pGip is cleared here, before the GIP update timer
     * is destroyed further down, while the timer callbacks in this file read
     * pDevExt->pGip.  Whether a callback can still fire at this point depends
     * on the timer state at call time - confirm.
     */
    if (pDevExt->pGip)
    {
        supdrvGipTerm(pDevExt->pGip);
        pDevExt->pGip = NULL;
    }
    g_pSUPGlobalInfoPage = NULL;

    /*
     * Destroy the timer and free the GIP memory object.
     */
    if (pDevExt->pGipTimer)
    {
        rc = RTTimerDestroy(pDevExt->pGipTimer); AssertRC(rc);
        pDevExt->pGipTimer = NULL;
    }

    if (pDevExt->GipMemObj != NIL_RTR0MEMOBJ)
    {
        rc = RTR0MemObjFree(pDevExt->GipMemObj, true /* free mappings */); AssertRC(rc);
        pDevExt->GipMemObj = NIL_RTR0MEMOBJ;
    }

    /*
     * Finally, make sure we've released the system timer resolution request
     * if one actually succeeded and is still pending.
     */
    supdrvGipReleaseHigherTimerFrequencyFromSystem(pDevExt);
}
2086
2087
2088
2089
2090/*
2091 *
2092 *
2093 * GIP Update Timer Related Code
2094 * GIP Update Timer Related Code
2095 * GIP Update Timer Related Code
2096 *
2097 *
2098 */
2099
2100
/**
 * Worker routine for supdrvGipUpdate() and supdrvGipUpdatePerCpu() that
 * updates all the per cpu data except the transaction id.
 *
 * The nanosecond timestamp, the previous update interval and the TSC value
 * are always refreshed.  In the sync and async TSC modes the TSC history,
 * the TSC update interval (plus slack) and the CPU frequency are recomputed
 * as well; in invariant TSC mode that part is skipped.
 *
 * @param   pDevExt     The device extension.
 * @param   pGipCpu     Pointer to the per cpu data.
 * @param   u64NanoTS   The current time stamp.
 * @param   u64TSC      The current TSC.
 * @param   iTick       The current timer tick.
 *
 * @remarks Can be called with interrupts disabled!
 */
static void supdrvGipDoUpdateCpu(PSUPDRVDEVEXT pDevExt, PSUPGIPCPU pGipCpu, uint64_t u64NanoTS, uint64_t u64TSC, uint64_t iTick)
{
    uint64_t    u64TSCDelta;
    uint32_t    u32UpdateIntervalTSC;
    uint32_t    u32UpdateIntervalTSCSlack;
    unsigned    iTSCHistoryHead;
    uint64_t    u64CpuHz;
    uint32_t    u32TransactionId;

    PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
    AssertPtrReturnVoid(pGip);

    /* Delta between this and the previous update (truncated to 32 bits). */
    ASMAtomicUoWriteU32(&pGipCpu->u32PrevUpdateIntervalNS, (uint32_t)(u64NanoTS - pGipCpu->u64NanoTS));

    /*
     * Update the NanoTS.
     */
    ASMAtomicWriteU64(&pGipCpu->u64NanoTS, u64NanoTS);

    /*
     * Calc TSC delta.
     */
    u64TSCDelta = u64TSC - pGipCpu->u64TSC;
    ASMAtomicWriteU64(&pGipCpu->u64TSC, u64TSC);

    /*
     * We don't need to keep recalculating the frequency when it's invariant, so
     * the remainder of this function is only for the sync and async TSC modes.
     */
    if (pGip->u32Mode != SUPGIPMODE_INVARIANT_TSC)
    {
        /* A delta that doesn't fit in 32 bits is replaced by the last published
           TSC interval and counted as an error. */
        if (u64TSCDelta >> 32)
        {
            u64TSCDelta = pGipCpu->u32UpdateIntervalTSC;
            pGipCpu->cErrors++;
        }

        /*
         * On the 2nd and 3rd callout, reset the history with the current TSC
         * interval since the values entered by supdrvGipInit are totally off.
         * The interval on the 1st callout is completely unreliable, the 2nd is
         * a bit better, while the 3rd should be most reliable.
         */
        /** @todo Could we drop this now that we initialize the history
         *        with nominal TSC frequency values? */
        u32TransactionId = pGipCpu->u32TransactionId;
        if (RT_UNLIKELY(   (   u32TransactionId == 5
                            || u32TransactionId == 7)
                        && (   iTick == 2
                            || iTick == 3) ))
        {
            unsigned i;
            for (i = 0; i < RT_ELEMENTS(pGipCpu->au32TSCHistory); i++)
                ASMAtomicUoWriteU32(&pGipCpu->au32TSCHistory[i], (uint32_t)u64TSCDelta);
        }

        /*
         * Validate the NanoTS deltas between timer fires with an arbitrary threshold of 0.5%.
         * Wait until we have at least one full history since the above history reset. The
         * assumption is that the majority of the previous history values will be tolerable.
         * See @bugref{6710} comment #67.
         *
         * When the last interval is outside the threshold, the measured TSC
         * delta is replaced by the average of the eight history entries.
         */
        /** @todo Could we drop the fudging there now that we initialize the history
         *        with nominal TSC frequency values? */
        if (   u32TransactionId > 23 /* 7 + (8 * 2) */
            && pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
        {
            uint32_t uNanoTsThreshold = pGip->u32UpdateIntervalNS / 200; /* 1/200 = 0.5% */
            if (   pGipCpu->u32PrevUpdateIntervalNS > pGip->u32UpdateIntervalNS + uNanoTsThreshold
                || pGipCpu->u32PrevUpdateIntervalNS < pGip->u32UpdateIntervalNS - uNanoTsThreshold)
            {
                /* Average entries 0..3 and 4..7 separately, then average the two halves. */
                uint32_t u32;
                u32  = pGipCpu->au32TSCHistory[0];
                u32 += pGipCpu->au32TSCHistory[1];
                u32 += pGipCpu->au32TSCHistory[2];
                u32 += pGipCpu->au32TSCHistory[3];
                u32 >>= 2;
                u64TSCDelta  = pGipCpu->au32TSCHistory[4];
                u64TSCDelta += pGipCpu->au32TSCHistory[5];
                u64TSCDelta += pGipCpu->au32TSCHistory[6];
                u64TSCDelta += pGipCpu->au32TSCHistory[7];
                u64TSCDelta >>= 2;
                u64TSCDelta += u32;
                u64TSCDelta >>= 1;
            }
        }

        /*
         * TSC History.  The head index is advanced (modulo 8) and the new
         * delta is stored in the slot it now designates.
         */
        Assert(RT_ELEMENTS(pGipCpu->au32TSCHistory) == 8);
        iTSCHistoryHead = (pGipCpu->iTSCHistoryHead + 1) & 7;
        ASMAtomicWriteU32(&pGipCpu->iTSCHistoryHead, iTSCHistoryHead);
        ASMAtomicWriteU32(&pGipCpu->au32TSCHistory[iTSCHistoryHead], (uint32_t)u64TSCDelta);

        /*
         * UpdateIntervalTSC = average of last 8,2,1 intervals depending on update HZ.
         *
         * On Windows, we have an occasional (but recurring) sour value that messed up
         * the history but taking only 1 interval reduces the precision overall.
         *
         * Note: the SUPGIPMODE_INVARIANT_TSC test below sits inside the
         * "not invariant" branch entered above, so it can only be true if
         * u32Mode changes between the two reads.
         */
        if (   pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC
            || pGip->u32UpdateHz >= 1000)
        {
            uint32_t u32;
            u32  = pGipCpu->au32TSCHistory[0];
            u32 += pGipCpu->au32TSCHistory[1];
            u32 += pGipCpu->au32TSCHistory[2];
            u32 += pGipCpu->au32TSCHistory[3];
            u32 >>= 2;
            u32UpdateIntervalTSC  = pGipCpu->au32TSCHistory[4];
            u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[5];
            u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[6];
            u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[7];
            u32UpdateIntervalTSC >>= 2;
            u32UpdateIntervalTSC += u32;
            u32UpdateIntervalTSC >>= 1;

            /* Value chosen for a 2GHz Athlon64 running linux 2.6.10/11. */
            u32UpdateIntervalTSCSlack = u32UpdateIntervalTSC >> 14;
        }
        else if (pGip->u32UpdateHz >= 90)
        {
            /* Average of the current delta and the previous history slot. */
            u32UpdateIntervalTSC  = (uint32_t)u64TSCDelta;
            u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[(iTSCHistoryHead - 1) & 7];
            u32UpdateIntervalTSC >>= 1;

            /* value chosen on a 2GHz thinkpad running windows */
            u32UpdateIntervalTSCSlack = u32UpdateIntervalTSC >> 7;
        }
        else
        {
            u32UpdateIntervalTSC  = (uint32_t)u64TSCDelta;

            /* This value hasn't been checked yet.. waiting for OS/2 and 33Hz timers.. :-) */
            u32UpdateIntervalTSCSlack = u32UpdateIntervalTSC >> 6;
        }
        /* The published interval includes the slack... */
        ASMAtomicWriteU32(&pGipCpu->u32UpdateIntervalTSC, u32UpdateIntervalTSC + u32UpdateIntervalTSCSlack);

        /*
         * CpuHz.  ...whereas the frequency is derived from the interval
         * without slack: ticks per interval * 10^9 / interval in ns.
         */
        u64CpuHz = ASMMult2xU32RetU64(u32UpdateIntervalTSC, RT_NS_1SEC);
        u64CpuHz /= pGip->u32UpdateIntervalNS;
        ASMAtomicWriteU64(&pGipCpu->u64CpuHz, u64CpuHz);
    }
}
2261
2262
/**
 * Updates the GIP.
 *
 * Picks the per-CPU entry to update (entry 0 unless in async TSC mode, where
 * the entry is looked up via the APIC ID of the calling CPU), brackets the
 * update with two transaction ID increments, and periodically recalculates
 * the global update frequency.
 *
 * @param   pDevExt     The device extension.
 * @param   u64NanoTS   The current nanosecond timestamp.
 * @param   u64TSC      The current TSC timestamp.
 * @param   idCpu       The CPU ID.  Only checked in async TSC mode.
 * @param   iTick       The current timer tick.
 *
 * @remarks Can be called with interrupts disabled!
 */
static void supdrvGipUpdate(PSUPDRVDEVEXT pDevExt, uint64_t u64NanoTS, uint64_t u64TSC, RTCPUID idCpu, uint64_t iTick)
{
    /*
     * Determine the relevant CPU data.
     */
    PSUPGIPCPU pGipCpu;
    PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
    AssertPtrReturnVoid(pGip);

    if (pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
        pGipCpu = &pGip->aCPUs[0];
    else
    {
        /* Silently skip the update if the APIC ID doesn't map to a valid
           entry or the entry belongs to a different CPU ID. */
        unsigned iCpu = pGip->aiCpuFromApicId[ASMGetApicId()];
        if (RT_UNLIKELY(iCpu >= pGip->cCpus))
            return;
        pGipCpu = &pGip->aCPUs[iCpu];
        if (RT_UNLIKELY(pGipCpu->idCpu != idCpu))
            return;
    }

    /*
     * Start update transaction.  The ID must be odd after this increment;
     * if it isn't, it is incremented once more, the error is counted and the
     * update is skipped.
     */
    if (!(ASMAtomicIncU32(&pGipCpu->u32TransactionId) & 1))
    {
        /* this can happen on win32 if we're taking too long and there are more CPUs around. shouldn't happen though. */
        AssertMsgFailed(("Invalid transaction id, %#x, not odd!\n", pGipCpu->u32TransactionId));
        ASMAtomicIncU32(&pGipCpu->u32TransactionId);
        pGipCpu->cErrors++;
        return;
    }

    /*
     * Recalc the update frequency every 0x800th time.
     *
     * The transaction ID advances by two per completed update, hence the
     * mask is built from GIP_UPDATEHZ_RECALC_FREQ * 2 with bit 0 excluded
     * (assumes GIP_UPDATEHZ_RECALC_FREQ is a power of two - confirm at its
     * definition).
     */
    if (   pGip->u32Mode != SUPGIPMODE_INVARIANT_TSC   /* cuz we're not recalculating the frequency on invariants hosts. */
        && !(pGipCpu->u32TransactionId & (GIP_UPDATEHZ_RECALC_FREQ * 2 - 2)))
    {
        if (pGip->u64NanoTSLastUpdateHz)
        {
#ifdef RT_ARCH_AMD64 /** @todo fix 64-bit div here to work on x86 linux. */
            uint64_t u64Delta = u64NanoTS - pGip->u64NanoTSLastUpdateHz;
            uint32_t u32UpdateHz = (uint32_t)((RT_NS_1SEC_64 * GIP_UPDATEHZ_RECALC_FREQ) / u64Delta);
            /* Only accept plausible frequencies (30..2000 Hz). */
            if (u32UpdateHz <= 2000 && u32UpdateHz >= 30)
            {
                /** @todo r=ramshankar: Changing u32UpdateHz might screw up TSC frequency
                 *        calculation on non-invariant hosts if it changes the history decision
                 *        taken in supdrvGipDoUpdateCpu(). */
                uint64_t u64Interval = u64Delta / GIP_UPDATEHZ_RECALC_FREQ;
                ASMAtomicWriteU32(&pGip->u32UpdateHz, u32UpdateHz);
                ASMAtomicWriteU32(&pGip->u32UpdateIntervalNS, (uint32_t)u64Interval);
            }
#endif
        }
        /* Bit 0 is forced on so the stored value is never zero, which keeps
           the "have a previous sample" test above working. */
        ASMAtomicWriteU64(&pGip->u64NanoTSLastUpdateHz, u64NanoTS | 1);
    }

    /*
     * Update the data.
     */
    supdrvGipDoUpdateCpu(pDevExt, pGipCpu, u64NanoTS, u64TSC, iTick);

    /*
     * Complete transaction.
     */
    ASMAtomicIncU32(&pGipCpu->u32TransactionId);
}
2342
2343
2344/**
2345 * Updates the per cpu GIP data for the calling cpu.
2346 *
2347 * @param pDevExt The device extension.
2348 * @param u64NanoTS The current nanosecond timesamp.
2349 * @param u64TSC The current TSC timesamp.
2350 * @param idCpu The CPU ID.
2351 * @param idApic The APIC id for the CPU index.
2352 * @param iTick The current timer tick.
2353 *
2354 * @remarks Can be called with interrupts disabled!
2355 */
2356static void supdrvGipUpdatePerCpu(PSUPDRVDEVEXT pDevExt, uint64_t u64NanoTS, uint64_t u64TSC,
2357 RTCPUID idCpu, uint8_t idApic, uint64_t iTick)
2358{
2359 uint32_t iCpu;
2360 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
2361
2362 /*
2363 * Avoid a potential race when a CPU online notification doesn't fire on
2364 * the onlined CPU but the tick creeps in before the event notification is
2365 * run.
2366 */
2367 if (RT_UNLIKELY(iTick == 1))
2368 {
2369 iCpu = supdrvGipFindOrAllocCpuIndexForCpuId(pGip, idCpu);
2370 if (pGip->aCPUs[iCpu].enmState == SUPGIPCPUSTATE_OFFLINE)
2371 supdrvGipMpEventOnlineOrInitOnCpu(pDevExt, idCpu);
2372 }
2373
2374 iCpu = pGip->aiCpuFromApicId[idApic];
2375 if (RT_LIKELY(iCpu < pGip->cCpus))
2376 {
2377 PSUPGIPCPU pGipCpu = &pGip->aCPUs[iCpu];
2378 if (pGipCpu->idCpu == idCpu)
2379 {
2380 /*
2381 * Start update transaction.
2382 */
2383 if (!(ASMAtomicIncU32(&pGipCpu->u32TransactionId) & 1))
2384 {
2385 AssertMsgFailed(("Invalid transaction id, %#x, not odd!\n", pGipCpu->u32TransactionId));
2386 ASMAtomicIncU32(&pGipCpu->u32TransactionId);
2387 pGipCpu->cErrors++;
2388 return;
2389 }
2390
2391 /*
2392 * Update the data.
2393 */
2394 supdrvGipDoUpdateCpu(pDevExt, pGipCpu, u64NanoTS, u64TSC, iTick);
2395
2396 /*
2397 * Complete transaction.
2398 */
2399 ASMAtomicIncU32(&pGipCpu->u32TransactionId);
2400 }
2401 }
2402}
2403
2404
2405/**
2406 * Timer callback function for the sync and invariant GIP modes.
2407 *
2408 * @param pTimer The timer.
2409 * @param pvUser Opaque pointer to the device extension.
2410 * @param iTick The timer tick.
2411 */
2412static DECLCALLBACK(void) supdrvGipSyncAndInvariantTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
2413{
2414 RTCCUINTREG uFlags;
2415 uint64_t u64TSC;
2416 uint64_t u64NanoTS;
2417 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
2418 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
2419
2420 uFlags = ASMIntDisableFlags(); /* No interruptions please (real problem on S10). */
2421 u64TSC = ASMReadTSC();
2422 u64NanoTS = RTTimeSystemNanoTS();
2423
2424 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_PRACTICALLY_ZERO)
2425 {
2426 /*
2427 * The calculations in supdrvGipUpdate() is very timing sensitive and doesn't handle
2428 * missed timer ticks. So for now it is better to use a delta of 0 and have the TSC rate
2429 * affected a bit until we get proper TSC deltas than implementing options like
2430 * rescheduling the tick to be delivered on the right CPU or missing the tick entirely.
2431 *
2432 * The likely hood of this happening is really low. On Windows, Linux, and Solaris
2433 * timers fire on the CPU they were registered/started on. Darwin timers doesn't
2434 * necessarily (they are high priority threads waiting).
2435 */
2436 Assert(!ASMIntAreEnabled());
2437 supdrvTscDeltaApply(pGip, &u64TSC, ASMGetApicId(), NULL /* pfDeltaApplied */);
2438 }
2439
2440 supdrvGipUpdate(pDevExt, u64NanoTS, u64TSC, NIL_RTCPUID, iTick);
2441
2442 ASMSetFlags(uFlags);
2443
2444#ifdef SUPDRV_USE_TSC_DELTA_THREAD
2445 if ( pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED
2446 && !RTCpuSetIsEmpty(&pDevExt->TscDeltaCpuSet))
2447 {
2448 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
2449 if ( pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Listening
2450 || pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Measuring)
2451 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_WaitAndMeasure;
2452 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
2453 /** @todo Do the actual poking using -- RTThreadUserSignal() */
2454 }
2455#endif
2456}
2457
2458
2459/**
2460 * Timer callback function for async GIP mode.
2461 * @param pTimer The timer.
2462 * @param pvUser Opaque pointer to the device extension.
2463 * @param iTick The timer tick.
2464 */
2465static DECLCALLBACK(void) supdrvGipAsyncTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
2466{
2467 RTCCUINTREG fOldFlags = ASMIntDisableFlags(); /* No interruptions please (real problem on S10). */
2468 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
2469 RTCPUID idCpu = RTMpCpuId();
2470 uint64_t u64TSC = ASMReadTSC();
2471 uint64_t NanoTS = RTTimeSystemNanoTS();
2472
2473 /** @todo reset the transaction number and whatnot when iTick == 1. */
2474 if (pDevExt->idGipMaster == idCpu)
2475 supdrvGipUpdate(pDevExt, NanoTS, u64TSC, idCpu, iTick);
2476 else
2477 supdrvGipUpdatePerCpu(pDevExt, NanoTS, u64TSC, idCpu, ASMGetApicId(), iTick);
2478
2479 ASMSetFlags(fOldFlags);
2480}
2481
2482
2483
2484
2485/*
2486 *
2487 *
2488 * TSC Delta Measurements And Related Code
2489 * TSC Delta Measurements And Related Code
2490 * TSC Delta Measurements And Related Code
2491 *
2492 *
2493 */
2494
2495
/*
 * Select TSC delta measurement algorithm.
 *
 * Exactly one of GIP_TSC_DELTA_METHOD_1 and GIP_TSC_DELTA_METHOD_2 gets
 * defined; flip the '#if 1' below to switch.  Code and data specific to an
 * algorithm is guarded by the corresponding #ifdef further down.
 */
#if 1
# define GIP_TSC_DELTA_METHOD_1
#else
# define GIP_TSC_DELTA_METHOD_2
#endif
2504
/** For padding variables to keep them away from other cache lines.  Better too
 * large than too small!
 *
 * Used to size the padding arrays of SUPDRVTSCDELTAMETHOD2 and
 * SUPTSCDELTASYNC2.
 *
 * @remarks Current AMD64 and x86 CPUs seem to use 64 bytes.  There are claims
 *          that NetBurst had 128 byte cache lines while the 486 thru Pentium
 *          III had 32 byte cache lines. */
#define GIP_TSC_DELTA_CACHE_LINE_SIZE           128
2511
2512
/**
 * TSC delta measurement algorithm \#2 result entry.
 *
 * One element of SUPDRVTSCDELTAMETHOD2::aResults.
 */
typedef struct SUPDRVTSCDELTAMETHOD2ENTRY
{
    /** Presumably the recording CPU's own sequence number for this sample -
     *  TODO confirm against the method 2 collection code. */
    uint32_t    iSeqMine;
    /** Presumably the other CPU's sequence number observed for this sample -
     *  TODO confirm against the method 2 collection code. */
    uint32_t    iSeqOther;
    /** Presumably the TSC value read for this sample - TODO confirm. */
    uint64_t    uTsc;
} SUPDRVTSCDELTAMETHOD2ENTRY;
2522
/**
 * TSC delta measurement algorithm \#2 Data.
 *
 * The sequence number is surrounded by padding sized from
 * GIP_TSC_DELTA_CACHE_LINE_SIZE, followed by the result table.
 */
typedef struct SUPDRVTSCDELTAMETHOD2
{
    /** Padding to make sure the iCurSeqNo is in its own cache line. */
    uint64_t                    au64CacheLinePaddingBefore[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t) - 1];
    /** The current sequence number of this worker. */
    uint32_t volatile           iCurSeqNo;
    /** Padding to make sure the iCurSeqNo is in its own cache line.
     * @note  Despite the 'au64' prefix this is an array of uint32_t, and the
     *        element count is computed using sizeof(uint32_t). */
    uint32_t                    au64CacheLinePaddingAfter[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint32_t) - 1];
    /** Result table. */
    SUPDRVTSCDELTAMETHOD2ENTRY  aResults[96];
} SUPDRVTSCDELTAMETHOD2;
/** Pointer to the data for TSC delta measurement algorithm \#2. */
typedef SUPDRVTSCDELTAMETHOD2 *PSUPDRVTSCDELTAMETHOD2;
2539
2540
/**
 * The TSC delta synchronization struct, version 2.
 *
 * The synchronization variables (uSyncVar and uSyncSeq) are completely
 * isolated in their own cache line (provided our max cache line size estimate
 * is correct).
 *
 * Layout: one cache line of leading padding, then a cache line holding the
 * two 32-bit sync variables plus trailing padding, then uTscStart and
 * cMaxTscTicks.  The compile time assertion following the structure checks
 * the resulting size.
 */
typedef struct SUPTSCDELTASYNC2
{
    /** Padding to make sure uSyncVar and uSyncSeq are in their own cache line. */
    uint64_t                    au64CacheLinePaddingBefore[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t)];

    /** The synchronization variable, holds values GIP_TSC_DELTA_SYNC_*. */
    volatile uint32_t           uSyncVar;
    /** Sequence synchronizing variable used for post 'GO' synchronization. */
    volatile uint32_t           uSyncSeq;

    /** Padding to make sure uSyncVar and uSyncSeq are in their own cache line.
     * The array is two uint64_t entries short of a full cache line. */
    uint64_t                    au64CacheLinePaddingAfter[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t) - 2];

    /** Start RDTSC value.  Put here mainly to save stack space. */
    uint64_t                    uTscStart;
    /** Copy of SUPDRVGIPTSCDELTARGS::cMaxTscTicks. */
    uint64_t                    cMaxTscTicks;
} SUPTSCDELTASYNC2;
AssertCompileSize(SUPTSCDELTASYNC2, GIP_TSC_DELTA_CACHE_LINE_SIZE * 2 + sizeof(uint64_t));
/** Pointer to a TSC delta synchronization struct, version 2. */
typedef SUPTSCDELTASYNC2 *PSUPTSCDELTASYNC2;
2567
/*
 * Values for SUPTSCDELTASYNC2::uSyncVar.
 *
 * Transitions performed by supdrvTscDeltaSync2_Before and the
 * TSCDELTA_xxx_SYNC_AFTER / KICK macros:  READY -> STEADY -> GO while entering
 * a round, GO -> READY when finishing one, and READY/STEADY -> TIMEOUT when
 * the time limit is hit while waiting for STEADY.
 *
 * The PRESTART and FINAL values are not touched by those helpers; presumably
 * they belong to the setup/teardown code elsewhere in this file - confirm.
 */
/** Prestart wait. */
#define GIP_TSC_DELTA_SYNC2_PRESTART_WAIT    UINT32_C(0x0ffe)
/** Prestart aborted. */
#define GIP_TSC_DELTA_SYNC2_PRESTART_ABORT   UINT32_C(0x0fff)
/** Ready (on your mark). */
#define GIP_TSC_DELTA_SYNC2_READY            UINT32_C(0x1000)
/** Steady (get set). */
#define GIP_TSC_DELTA_SYNC2_STEADY           UINT32_C(0x1001)
/** Go! */
#define GIP_TSC_DELTA_SYNC2_GO               UINT32_C(0x1002)

/** We reached the time limit. */
#define GIP_TSC_DELTA_SYNC2_TIMEOUT          UINT32_C(0x1ffe)
/** The other party won't touch the sync struct ever again. */
#define GIP_TSC_DELTA_SYNC2_FINAL            UINT32_C(0x1fff)
2583
2584
/**
 * Argument package/state passed by supdrvMeasureTscDeltaOne to the RTMpOn
 * callback worker.
 */
typedef struct SUPDRVGIPTSCDELTARGS
{
    /** The device extension. */
    PSUPDRVDEVEXT               pDevExt;
    /** Pointer to the GIP CPU array entry for the worker. */
    PSUPGIPCPU                  pWorker;
    /** Pointer to the GIP CPU array entry for the master. */
    PSUPGIPCPU                  pMaster;
    /** Pointer to the master's synchronization struct (on stack). */
    PSUPTSCDELTASYNC2 volatile  pSyncMaster;
    /** Pointer to the worker's synchronization struct (on stack). */
    PSUPTSCDELTASYNC2 volatile  pSyncWorker;
    /** The maximum number of ticks to spend in supdrvMeasureTscDeltaCallback.
     * (This is what we need a rough TSC frequency for.) */
    uint64_t                    cMaxTscTicks;
    /** Used to abort synchronization setup. */
    bool volatile               fAbortSetup;

#if 0
    /** Method 1 data.  (Disabled: method 1 currently has no per-call state.) */
    struct
    {
    } M1;
#endif

#ifdef GIP_TSC_DELTA_METHOD_2
    /** Method 2 data, only present when GIP_TSC_DELTA_METHOD_2 is defined.
     * The exact semantics of the members are defined by the method 2 code,
     * which is not adjacent to this structure - see there. */
    struct
    {
        /** The master's method 2 data. */
        PSUPDRVTSCDELTAMETHOD2  pMasterData;
        /** The worker's method 2 data. */
        PSUPDRVTSCDELTAMETHOD2  pWorkerData;
        /** Hit counter - TODO document what counts as a hit. */
        uint32_t                cHits;
        /** Presumably whether to delay the master - TODO confirm. */
        bool                    fLagMaster;
        /** Presumably whether to delay the worker - TODO confirm. */
        bool                    fLagWorker;
        /** Presumably set to make the other side stop early - TODO confirm. */
        bool volatile           fQuitEarly;
    } M2;
#endif
} SUPDRVGIPTSCDELTARGS;
/** Pointer to a TSC-delta measurement argument package. */
typedef SUPDRVGIPTSCDELTARGS *PSUPDRVGIPTSCDELTARGS;
2627
2628
/** @name Macros that implement the basic synchronization steps common to
 *        the algorithms.
 *
 * Must be used from within a loop, as the timeouts are implemented via
 * 'break' statements at the moment.
 *
 * @{
 */
2637#if 0
/*
 * Legacy variants of the synchronization macros.  This whole branch is
 * compiled out by the enclosing '#if 0'.
 *
 * They operate on a single shared variable ((a_pSync1)->u) using the
 * GIP_TSC_DELTA_SYNC_xxx values and spin unconditionally, i.e. there is no
 * timeout or abort handling in any of the wait loops.
 */
#define TSCDELTA_MASTER_SYNC_BEFORE(a_pSync1, a_pMySync, a_pOtherSync) \
    do {\
        ASMAtomicWriteU32(&(a_pSync1)->u, GIP_TSC_DELTA_SYNC_START); \
        \
        /* Disable interrupts only in the master for as short a period \
           as possible, thanks again to Windows. See @bugref{6710} comment #73. */ \
        uFlags = ASMIntDisableFlags(); \
        \
        while (ASMAtomicReadU32(&(a_pSync1)->u) == GIP_TSC_DELTA_SYNC_START) \
        { /* nothing */ } \
    } while (0)
#define TSCDELTA_MASTER_SYNC_AFTER(a_pSync1, a_pMySync, a_pOtherSync) \
    do {\
        /* Sync up with worker. */ \
        ASMSetFlags(uFlags); \
        \
        while (ASMAtomicReadU32(&(a_pSync1)->u) != GIP_TSC_DELTA_SYNC_WORKER_DONE) \
        { /* nothing */ } \
    } while (0)
#define TSCDELTA_MASTER_KICK_OTHER_OUT_OF_AFTER(a_pSync1, a_pMySync, a_pOtherSync) \
    do {\
        ASMAtomicWriteU32(&(a_pSync1)->u, GIP_TSC_DELTA_SYNC_STOP); \
    } while (0)

#define TSCDELTA_OTHER_SYNC_BEFORE(a_pSync1, a_pMySync, a_pOtherSync, a_MidSyncExpr) \
    do { \
        while (ASMAtomicReadU32(&(a_pSync1)->u) != GIP_TSC_DELTA_SYNC_START) \
        { /* nothing */ } \
        a_MidSyncExpr; \
        ASMAtomicWriteU32(&(a_pSync1)->u, GIP_TSC_DELTA_SYNC_WORKER_READY); \
    } while (0)
#define TSCDELTA_OTHER_SYNC_AFTER(a_pSync1, a_pMySync, a_pOtherSync) \
    do { \
        /* Tell master we're done collecting our data. */ \
        ASMAtomicWriteU32(&(a_pSync1)->u, GIP_TSC_DELTA_SYNC_WORKER_DONE); \
        \
        /* Wait for the master to process the data. */ \
        while (ASMAtomicReadU32(&(a_pSync1)->u) == GIP_TSC_DELTA_SYNC_WORKER_DONE) \
            ASMNopPause(); \
    } while (0)
2678#else
2679
/*
 * Debug helpers for the busy-wait loops.
 *
 * In DEBUG_bird builds on Windows a 32-bit counter is declared, reset before
 * a loop and incremented on every check; a debugger breakpoint is triggered
 * when the counter wraps around to zero.  In all other builds the three
 * macros expand to no-ops.
 */
#if defined(DEBUG_bird) && defined(RT_OS_WINDOWS)
# define TSCDELTA_DBG_VARS()            uint32_t iDbgCounter
# define TSCDELTA_DBG_START_LOOP()      do {iDbgCounter = 0;} while (0)
# define TSCDELTA_DBG_CHECK_LOOP()      do { if (++iDbgCounter == 0) __debugbreak(); } while (0)
#else
# define TSCDELTA_DBG_VARS()            ((void)0)
# define TSCDELTA_DBG_START_LOOP()      ((void)0)
# define TSCDELTA_DBG_CHECK_LOOP()      ((void)0)
#endif
2689
2690
2691static bool supdrvTscDeltaSync2_Before(PSUPTSCDELTASYNC2 pMySync, PSUPTSCDELTASYNC2 pOtherSync,
2692 bool fIsMaster, PRTCCUINTREG pfEFlags)
2693{
2694 uint32_t iMySeq = fIsMaster ? 0 : 256;
2695 uint32_t const iMaxSeq = iMySeq + 16; /* For the last loop, darn linux/freebsd C-ishness. */
2696 uint32_t u32Tmp;
2697 uint32_t iSync2Loops = 0;
2698 RTCCUINTREG fEFlags;
2699 TSCDELTA_DBG_VARS();
2700
2701 *pfEFlags = X86_EFL_IF | X86_EFL_1; /* should shut up most nagging compilers. */
2702
2703 /*
2704 * The master tells the worker to get on it's mark.
2705 */
2706 if (fIsMaster)
2707 if (RT_LIKELY(ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_STEADY, GIP_TSC_DELTA_SYNC2_READY)))
2708 { /* likely*/ }
2709 else
2710 return false;
2711
2712 /*
2713 * Wait for the on your mark signal (ack in the master case). We process timeouts here.
2714 */
2715 ASMAtomicWriteU32(&(pMySync)->uSyncSeq, 0);
2716 for (;;)
2717 {
2718 fEFlags = ASMIntDisableFlags();
2719 u32Tmp = ASMAtomicReadU32(&pMySync->uSyncVar);
2720 if (u32Tmp == GIP_TSC_DELTA_SYNC2_STEADY)
2721 break;
2722
2723 ASMSetFlags(fEFlags);
2724 ASMNopPause();
2725
2726 /* Abort? */
2727 if (u32Tmp != GIP_TSC_DELTA_SYNC2_READY)
2728 break;
2729
2730 /* Check for timeouts every so often (not every loop in case RDTSC is
2731 trapping or something). Must check the first time around. */
2732#if 0 /* For debugging the timeout paths. */
2733 static uint32_t volatile xxx;
2734#endif
2735 if ( ( (iSync2Loops & 0x3ff) == 0
2736 && ASMReadTSC() - pMySync->uTscStart > pMySync->cMaxTscTicks)
2737#if 0 /* This is crazy, I know, but enable this code and the results are markedly better when enabled on the 1.4GHz AMD (debug). */
2738 || (!fIsMaster && (++xxx & 0xf) == 0)
2739#endif
2740 )
2741 {
2742 /* Try switch our own state into timeout mode so the master cannot tell us to 'GO',
2743 ignore the timeout if we've got the go ahead already (simpler). */
2744 if (ASMAtomicCmpXchgU32(&pMySync->uSyncVar, GIP_TSC_DELTA_SYNC2_TIMEOUT, GIP_TSC_DELTA_SYNC2_READY))
2745 {
2746 ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_TIMEOUT, GIP_TSC_DELTA_SYNC2_STEADY);
2747 return false;
2748 }
2749 }
2750 iSync2Loops++;
2751 }
2752
2753 /*
2754 * Interrupts are now disabled and will remain disabled until we do
2755 * TSCDELTA_MASTER_SYNC_AFTER / TSCDELTA_OTHER_SYNC_AFTER.
2756 */
2757 *pfEFlags = fEFlags;
2758
2759 /*
2760 * The worker tells the master that it is on its mark and that the master
2761 * need to get into position as well.
2762 */
2763 if (!fIsMaster)
2764 if (RT_LIKELY(ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_STEADY, GIP_TSC_DELTA_SYNC2_READY)))
2765 { /* likely */ }
2766 else
2767 {
2768 ASMSetFlags(fEFlags);
2769 return false;
2770 }
2771
2772 /*
2773 * The master sends the 'go' to the worker and wait for ACK.
2774 */
2775 if (fIsMaster)
2776 if (RT_LIKELY(ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO, GIP_TSC_DELTA_SYNC2_STEADY)))
2777 { /* likely */ }
2778 else
2779 {
2780 ASMSetFlags(fEFlags);
2781 return false;
2782 }
2783
2784 /*
2785 * Wait for the 'go' signal (ack in the master case).
2786 */
2787 TSCDELTA_DBG_START_LOOP();
2788 for (;;)
2789 {
2790 u32Tmp = ASMAtomicReadU32(&pMySync->uSyncVar);
2791 if (u32Tmp == GIP_TSC_DELTA_SYNC2_GO)
2792 break;
2793 if (RT_LIKELY(u32Tmp == GIP_TSC_DELTA_SYNC2_STEADY))
2794 { /* likely */ }
2795 else
2796 {
2797 ASMSetFlags(fEFlags);
2798 return false;
2799 }
2800
2801 TSCDELTA_DBG_CHECK_LOOP();
2802 ASMNopPause();
2803 }
2804
2805 /*
2806 * The worker acks the 'go' (shouldn't fail).
2807 */
2808 if (!fIsMaster)
2809 if (RT_LIKELY(ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO, GIP_TSC_DELTA_SYNC2_STEADY)))
2810 { /* likely */ }
2811 else
2812 {
2813 ASMSetFlags(fEFlags);
2814 return false;
2815 }
2816
2817 /*
2818 * Try enter mostly lockstep execution with it.
2819 */
2820 for (;;)
2821 {
2822 uint32_t iOtherSeq1, iOtherSeq2;
2823 ASMCompilerBarrier();
2824 ASMSerializeInstruction();
2825
2826 ASMAtomicWriteU32(&pMySync->uSyncSeq, iMySeq);
2827 ASMNopPause();
2828 iOtherSeq1 = ASMAtomicXchgU32(&pOtherSync->uSyncSeq, iMySeq);
2829 ASMNopPause();
2830 iOtherSeq2 = ASMAtomicReadU32(&pMySync->uSyncSeq);
2831
2832 ASMCompilerBarrier();
2833 if (iOtherSeq1 == iOtherSeq2)
2834 return true;
2835
2836 /* Did the other guy give up? Should we give up? */
2837 if ( iOtherSeq1 == UINT32_MAX
2838 || iOtherSeq2 == UINT32_MAX)
2839 return true;
2840 if (++iMySeq >= iMaxSeq)
2841 {
2842 ASMAtomicWriteU32(&pMySync->uSyncSeq, UINT32_MAX);
2843 return true;
2844 }
2845 ASMNopPause();
2846 }
2847}
2848
/**
 * Master side: enters a synchronization round via supdrvTscDeltaSync2_Before.
 *
 * On failure this executes 'break', which leaves the loop enclosing the macro
 * invocation.  The macro is therefore an if/else chain and not a
 * do { } while (0) wrapper, as the latter would swallow the 'break'.  The
 * trailing 'else do { } while (0)' consumes the semicolon of the invocation.
 *
 * Expects a variable named 'uFlags' in the invoking scope; it receives the
 * saved flags.  @a a_pSync1 is unused.
 */
#define TSCDELTA_MASTER_SYNC_BEFORE(a_pSync1, a_pMySync, a_pOtherSync) \
    if (RT_LIKELY(supdrvTscDeltaSync2_Before(a_pMySync, a_pOtherSync, true /*fMaster*/, &uFlags))) \
    { /*likely*/ } \
    else if (1) \
        break; \
    else do { } while (0)
/**
 * Worker side: enters a synchronization round via supdrvTscDeltaSync2_Before.
 *
 * On failure this executes 'break', which leaves the loop enclosing the macro
 * invocation.  The macro is therefore an if/else chain and not a
 * do { } while (0) wrapper, as the latter would swallow the 'break'.  The
 * trailing 'else do { } while (0)' consumes the semicolon of the invocation.
 *
 * Expects a variable named 'uFlags' in the invoking scope; it receives the
 * saved flags.  @a a_pSync1 and @a a_MidSyncExpr are unused by this variant.
 */
#define TSCDELTA_OTHER_SYNC_BEFORE(a_pSync1, a_pMySync, a_pOtherSync, a_MidSyncExpr) \
    if (RT_LIKELY(supdrvTscDeltaSync2_Before(a_pMySync, a_pOtherSync, false /*fMaster*/, &uFlags))) \
    { /*likely*/ } \
    else if (1) \
        break; \
    else do { } while (0)
2861
/**
 * Master side: restores the flags saved in 'uFlags' and waits for the worker
 * to set the master's sync variable back to READY.
 *
 * If the variable holds anything but GO or READY, 'break' is executed, which
 * leaves the loop enclosing the macro invocation.  The wait loop is done with
 * a label and goto, and the macro is wrapped in 'if (1) { } else' rather than
 * do { } while (0), so that no loop of the macro's own swallows the 'break'.
 *
 * Because of the label the macro can only be used once per function.
 * @a a_pSync1 and @a a_pOtherSync are unused.
 */
#define TSCDELTA_MASTER_SYNC_AFTER(a_pSync1, a_pMySync, a_pOtherSync) \
    if (1) \
    { \
        /* \
         * Wait for the worker to give us the 'ready' signal. \
         */ \
        uint32_t u32Tmp; \
        TSCDELTA_DBG_VARS(); \
        ASMSetFlags(uFlags); \
        TSCDELTA_DBG_START_LOOP(); \
    l_master_wait_done: \
        u32Tmp = ASMAtomicReadU32(&(a_pMySync)->uSyncVar); \
        if (u32Tmp != GIP_TSC_DELTA_SYNC2_READY) \
        { \
            ASMNopPause(); \
            if (u32Tmp != GIP_TSC_DELTA_SYNC2_GO) \
                break; /* shouldn't ever happen! */ \
            TSCDELTA_DBG_CHECK_LOOP(); \
            ASMNopPause(); \
            goto l_master_wait_done; \
        } \
    } \
    else do { } while (0)
2883
/*
 * Master side: tells the worker that we are done processing the data and
 * ready for the next round.  This is a compare-and-exchange on the worker's
 * sync variable with GIP_TSC_DELTA_SYNC2_READY as the new value and
 * GIP_TSC_DELTA_SYNC2_GO as the expected one.  Should the exchange fail, the
 * invoker's 'uFlags' is passed to ASMSetFlags().
 *
 * Nothing in here affects the control flow of the invoking code.  a_pSync1
 * and a_pMySync are not evaluated.
 */
#define TSCDELTA_MASTER_KICK_OTHER_OUT_OF_AFTER(a_pSync1, a_pMySync, a_pOtherSync) \
    do { \
        if (ASMAtomicCmpXchgU32(&(a_pOtherSync)->uSyncVar, GIP_TSC_DELTA_SYNC2_READY, GIP_TSC_DELTA_SYNC2_GO)) \
        { /* likely */ } \
        else \
            ASMSetFlags(uFlags); \
    } while (0)
2895
2896
/*
 * Worker side, post-measurement: tells the master that we are done (compare
 * and exchange on the master's sync variable, new value
 * GIP_TSC_DELTA_SYNC2_READY, expected value GIP_TSC_DELTA_SYNC2_GO), passes
 * the invoker's 'uFlags' to ASMSetFlags() and then waits for our own sync
 * variable to become GIP_TSC_DELTA_SYNC2_READY, i.e. for the data to be
 * processed and the next round to start.
 *
 * NOTE(review): every 'break' below only leaves this macro's own
 * do { } while (0) wrapper, it does not terminate any loop in the invoking
 * code.
 *
 * The macro defines the label 'l_other_wait_done', so it can be used at most
 * once per function.  a_pSync1 is not evaluated.
 */
#define TSCDELTA_OTHER_SYNC_AFTER(a_pSync1, a_pMySync, a_pOtherSync) \
    do { \
        uint32_t u32Tmp; \
        TSCDELTA_DBG_VARS(); \
        if (ASMAtomicCmpXchgU32(&(a_pOtherSync)->uSyncVar, GIP_TSC_DELTA_SYNC2_READY, GIP_TSC_DELTA_SYNC2_GO)) \
        { /* likely */ } \
        else \
        { \
            ASMSetFlags(uFlags); \
            break; /* the master isn't in the expected state, skip the wait */ \
        } \
        ASMSetFlags(uFlags); \
        TSCDELTA_DBG_START_LOOP(); \
    l_other_wait_done: \
        u32Tmp = ASMAtomicReadU32(&(a_pMySync)->uSyncVar); \
        if (u32Tmp == GIP_TSC_DELTA_SYNC2_READY) \
            break; /* next round may start */ \
        ASMNopPause(); \
        if (u32Tmp != GIP_TSC_DELTA_SYNC2_GO) \
            break; /* unexpected state - shouldn't ever happen! */ \
        TSCDELTA_DBG_CHECK_LOOP(); \
        ASMNopPause(); \
        goto l_other_wait_done; \
    } while (0)
2923#endif
2924/** @} */
2925
2926#ifdef GIP_TSC_DELTA_METHOD_1
2927
2928/**
2929 * TSC delta measurment algorithm \#1 (GIP_TSC_DELTA_METHOD_1).
2930 *
2931 *
2932 * We ignore the first few runs of the loop in order to prime the
2933 * cache. Also, we need to be careful about using 'pause' instruction
2934 * in critical busy-wait loops in this code - it can cause undesired
2935 * behaviour with hyperthreading.
2936 *
2937 * We try to minimize the measurement error by computing the minimum
2938 * read time of the compare statement in the worker by taking TSC
2939 * measurements across it.
2940 *
2941 * It must be noted that the computed minimum read time is mostly to
2942 * eliminate huge deltas when the worker is too early and doesn't by
2943 * itself help produce more accurate deltas. We allow two times the
2944 * computed minimum as an arbibtrary acceptable threshold. Therefore,
2945 * it is still possible to get negative deltas where there are none
2946 * when the worker is earlier. As long as these occasional negative
2947 * deltas are lower than the time it takes to exit guest-context and
2948 * the OS to reschedule EMT on a different CPU we won't expose a TSC
2949 * that jumped backwards. It is because of the existence of the
2950 * negative deltas we don't recompute the delta with the master and
2951 * worker interchanged to eliminate the remaining measurement error.
2952 *
2953 *
2954 * @param pArgs The argument/state data.
2955 * @param pSync1 The synchronization structure
2956 * (pDevExt->pTscDeltaSync).
2957 * @param fIsMaster Set if master, clear if worker.
2958 * @param iTry The attempt number.
2959 */
2960static void supdrvTscDeltaMethod1Loop(PSUPDRVGIPTSCDELTARGS pArgs, PSUPTSCDELTASYNC pSync1,
2961 PSUPTSCDELTASYNC2 pMySync, PSUPTSCDELTASYNC2 pOtherSync, bool fIsMaster, uint32_t iTry)
2962{
2963 PSUPGIPCPU pGipCpuWorker = pArgs->pWorker;
2964 PSUPGIPCPU pGipCpuMaster = pArgs->pMaster;
2965 uint64_t uMinCmpReadTime = UINT64_MAX;
2966 unsigned iLoop;
2967 NOREF(iTry);
2968
2969 for (iLoop = 0; iLoop < GIP_TSC_DELTA_LOOPS; iLoop++)
2970 {
2971 RTCCUINTREG uFlags;
2972 if (fIsMaster)
2973 {
2974 /*
2975 * The master.
2976 */
2977 AssertMsg(pGipCpuMaster->u64TSCSample == GIP_TSC_DELTA_RSVD,
2978 ("%#llx idMaster=%#x idWorker=%#x (idGipMaster=%#x)\n",
2979 pGipCpuMaster->u64TSCSample, pGipCpuMaster->idCpu, pGipCpuWorker->idCpu, pArgs->pDevExt->idGipMaster));
2980 TSCDELTA_MASTER_SYNC_BEFORE(pSync1, pMySync, pOtherSync);
2981
2982 do
2983 {
2984 ASMSerializeInstruction();
2985 ASMAtomicWriteU64(&pGipCpuMaster->u64TSCSample, ASMReadTSC());
2986 } while (pGipCpuMaster->u64TSCSample == GIP_TSC_DELTA_RSVD);
2987
2988 TSCDELTA_MASTER_SYNC_AFTER(pSync1, pMySync, pOtherSync);
2989
2990 /* Process the data. */
2991 if (iLoop > GIP_TSC_DELTA_PRIMER_LOOPS + GIP_TSC_DELTA_READ_TIME_LOOPS)
2992 {
2993 if (pGipCpuWorker->u64TSCSample != GIP_TSC_DELTA_RSVD)
2994 {
2995 int64_t iDelta = pGipCpuWorker->u64TSCSample
2996 - (pGipCpuMaster->u64TSCSample - pGipCpuMaster->i64TSCDelta);
2997 if ( iDelta >= GIP_TSC_DELTA_INITIAL_MASTER_VALUE
2998 ? iDelta < pGipCpuWorker->i64TSCDelta
2999 : iDelta > pGipCpuWorker->i64TSCDelta || pGipCpuWorker->i64TSCDelta == INT64_MAX)
3000 pGipCpuWorker->i64TSCDelta = iDelta;
3001 }
3002 }
3003
3004 /* Reset our TSC sample and tell the worker to move on. */
3005 ASMAtomicWriteU64(&pGipCpuMaster->u64TSCSample, GIP_TSC_DELTA_RSVD);
3006 TSCDELTA_MASTER_KICK_OTHER_OUT_OF_AFTER(pSync1, pMySync, pOtherSync);
3007 }
3008 else
3009 {
3010 /*
3011 * The worker.
3012 */
3013 uint64_t uTscWorker;
3014 uint64_t uTscWorkerFlushed;
3015 uint64_t uCmpReadTime;
3016
3017 ASMAtomicReadU64(&pGipCpuMaster->u64TSCSample); /* Warm the cache line. */
3018 TSCDELTA_OTHER_SYNC_BEFORE(pSync1, pMySync, pOtherSync, Assert(pGipCpuMaster->u64TSCSample == GIP_TSC_DELTA_RSVD));
3019
3020 /*
3021 * Keep reading the TSC until we notice that the master has read his. Reading
3022 * the TSC -after- the master has updated the memory is way too late. We thus
3023 * compensate by trying to measure how long it took for the worker to notice
3024 * the memory flushed from the master.
3025 */
3026 do
3027 {
3028 ASMSerializeInstruction();
3029 uTscWorker = ASMReadTSC();
3030 } while (pGipCpuMaster->u64TSCSample == GIP_TSC_DELTA_RSVD);
3031 ASMSerializeInstruction();
3032 uTscWorkerFlushed = ASMReadTSC();
3033
3034 uCmpReadTime = uTscWorkerFlushed - uTscWorker;
3035 if (iLoop > GIP_TSC_DELTA_PRIMER_LOOPS + GIP_TSC_DELTA_READ_TIME_LOOPS)
3036 {
3037 /* This is totally arbitrary a.k.a I don't like it but I have no better ideas for now. */
3038 if (uCmpReadTime < (uMinCmpReadTime << 1))
3039 {
3040 ASMAtomicWriteU64(&pGipCpuWorker->u64TSCSample, uTscWorker);
3041 if (uCmpReadTime < uMinCmpReadTime)
3042 uMinCmpReadTime = uCmpReadTime;
3043 }
3044 else
3045 ASMAtomicWriteU64(&pGipCpuWorker->u64TSCSample, GIP_TSC_DELTA_RSVD);
3046 }
3047 else if (iLoop > GIP_TSC_DELTA_PRIMER_LOOPS)
3048 {
3049 if (uCmpReadTime < uMinCmpReadTime)
3050 uMinCmpReadTime = uCmpReadTime;
3051 }
3052
3053 TSCDELTA_OTHER_SYNC_AFTER(pSync1, pMySync, pOtherSync);
3054 }
3055 }
3056
3057 /*
3058 * We must reset the worker TSC sample value in case it gets picked as a
3059 * GIP master later on (it's trashed above, naturally).
3060 */
3061 if (!fIsMaster)
3062 ASMAtomicWriteU64(&pGipCpuWorker->u64TSCSample, GIP_TSC_DELTA_RSVD);
3063}
3064
3065
3066/**
3067 * Initializes the argument/state data belonging to algorithm \#1.
3068 *
3069 * @returns VBox status code.
3070 * @param pArgs The argument/state data.
3071 */
3072static int supdrvTscDeltaMethod1Init(PSUPDRVGIPTSCDELTARGS pArgs)
3073{
3074 NOREF(pArgs);
3075 return VINF_SUCCESS;
3076}
3077
3078
3079/**
3080 * Undoes what supdrvTscDeltaMethod1Init() did.
3081 *
3082 * @param pArgs The argument/state data.
3083 */
3084static void supdrvTscDeltaMethod1Delete(PSUPDRVGIPTSCDELTARGS pArgs)
3085{
3086 NOREF(pArgs);
3087}
3088
3089#endif /* GIP_TSC_DELTA_METHOD_1 */
3090
3091
3092#ifdef GIP_TSC_DELTA_METHOD_2
/*
 * TSC delta measurement algorithm \#2 configuration and code - Experimental!!
 */

/** Number of initial priming rounds of algorithm \#2. */
# define GIP_TSC_DELTA_M2_PRIMER_LOOPS  1
/** Total number of rounds of algorithm \#2: 12 plus the priming rounds. */
# define GIP_TSC_DELTA_M2_LOOPS         (12 + GIP_TSC_DELTA_M2_PRIMER_LOOPS)
3099
3100
3101static void supdrvTscDeltaMethod2ProcessDataOnMaster(PSUPDRVGIPTSCDELTARGS pArgs, uint32_t iLoop)
3102{
3103 PSUPDRVTSCDELTAMETHOD2 pMasterData = pArgs->M2.pMasterData;
3104 PSUPDRVTSCDELTAMETHOD2 pOtherData = pArgs->M2.pWorkerData;
3105 int64_t iMasterTscDelta = pArgs->pMaster->i64TSCDelta;
3106 int64_t iBestDelta = pArgs->pWorker->i64TSCDelta;
3107 uint32_t idxResult;
3108 uint32_t cHits = 0;
3109
3110 /*
3111 * Look for matching entries in the master and worker tables.
3112 */
3113 for (idxResult = 0; idxResult < RT_ELEMENTS(pMasterData->aResults); idxResult++)
3114 {
3115 uint32_t idxOther = pMasterData->aResults[idxResult].iSeqOther;
3116 if (idxOther & 1)
3117 {
3118 idxOther >>= 1;
3119 if (idxOther < RT_ELEMENTS(pOtherData->aResults))
3120 {
3121 if (pOtherData->aResults[idxOther].iSeqOther == pMasterData->aResults[idxResult].iSeqMine)
3122 {
3123 int64_t iDelta;
3124 iDelta = pOtherData->aResults[idxOther].uTsc
3125 - (pMasterData->aResults[idxResult].uTsc - iMasterTscDelta);
3126 if ( iDelta >= GIP_TSC_DELTA_INITIAL_MASTER_VALUE
3127 ? iDelta < iBestDelta
3128 : iDelta > iBestDelta || iBestDelta == INT64_MAX)
3129 iBestDelta = iDelta;
3130 cHits++;
3131 }
3132 }
3133 }
3134 }
3135
3136 /*
3137 * Save the results.
3138 */
3139 if (cHits > 2)
3140 pArgs->pWorker->i64TSCDelta = iBestDelta;
3141 pArgs->M2.cHits += cHits;
3142
3143 /*
3144 * Check and see if we can quit a little early. If the result is already
3145 * extremely good (+/-16 ticks seems reasonable), just stop.
3146 */
3147 if ( iBestDelta >= 0 + GIP_TSC_DELTA_INITIAL_MASTER_VALUE
3148 ? iBestDelta <= 16 + GIP_TSC_DELTA_INITIAL_MASTER_VALUE
3149 : iBestDelta >= -16 + GIP_TSC_DELTA_INITIAL_MASTER_VALUE)
3150 {
3151 /*SUPR0Printf("quitting early #1: hits=%#x iLoop=%d iBestDelta=%lld\n", cHits, iLoop, iBestDelta);*/
3152 ASMAtomicWriteBool(&pArgs->M2.fQuitEarly, true);
3153 }
3154 /*
3155 * After a while, just stop if we get sufficent hits.
3156 */
3157 else if ( iLoop >= GIP_TSC_DELTA_M2_LOOPS / 3
3158 && cHits > 8)
3159 {
3160 uint32_t const cHitsNeeded = GIP_TSC_DELTA_M2_LOOPS * RT_ELEMENTS(pArgs->M2.pMasterData->aResults) / 4; /* 25% */
3161 if ( pArgs->M2.cHits >= cHitsNeeded
3162 && ( iBestDelta >= 0 + GIP_TSC_DELTA_INITIAL_MASTER_VALUE
3163 ? iBestDelta <= GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO + GIP_TSC_DELTA_INITIAL_MASTER_VALUE
3164 : iBestDelta >= -GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO + GIP_TSC_DELTA_INITIAL_MASTER_VALUE) )
3165 {
3166 /*SUPR0Printf("quitting early hits=%#x (%#x) needed=%#x iLoop=%d iBestDelta=%lld\n",
3167 pArgs->M2.cHits, cHits, cHitsNeeded, iLoop, iBestDelta);*/
3168 ASMAtomicWriteBool(&pArgs->M2.fQuitEarly, true);
3169 }
3170 }
3171}
3172
3173
3174/**
3175 * The core function of the 2nd TSC delta mesurment algorithm.
3176 *
3177 * The idea here is that we have the two CPUs execute the exact same code
3178 * collecting a largish set of TSC samples. The code has one data dependency on
3179 * the other CPU which intention it is to synchronize the execution as well as
3180 * help cross references the two sets of TSC samples (the sequence numbers).
3181 *
3182 * The @a fLag parameter is used to modify the execution a tiny bit on one or
3183 * both of the CPUs. When @a fLag differs between the CPUs, it is thought that
3184 * it will help with making the CPUs enter lock step execution occationally.
3185 *
3186 */
3187static void supdrvTscDeltaMethod2CollectData(PSUPDRVTSCDELTAMETHOD2 pMyData, uint32_t volatile *piOtherSeqNo, bool fLag)
3188{
3189 SUPDRVTSCDELTAMETHOD2ENTRY *pEntry = &pMyData->aResults[0];
3190 uint32_t cLeft = RT_ELEMENTS(pMyData->aResults);
3191
3192 ASMAtomicWriteU32(&pMyData->iCurSeqNo, 0);
3193 ASMSerializeInstruction();
3194 while (cLeft-- > 0)
3195 {
3196 uint64_t uTsc;
3197 uint32_t iSeqMine = ASMAtomicIncU32(&pMyData->iCurSeqNo);
3198 uint32_t iSeqOther = ASMAtomicReadU32(piOtherSeqNo);
3199 ASMCompilerBarrier();
3200 ASMSerializeInstruction(); /* Way better result than with ASMMemoryFenceSSE2() in this position! */
3201 uTsc = ASMReadTSC();
3202 ASMAtomicIncU32(&pMyData->iCurSeqNo);
3203 ASMCompilerBarrier();
3204 ASMSerializeInstruction();
3205 pEntry->iSeqMine = iSeqMine;
3206 pEntry->iSeqOther = iSeqOther;
3207 pEntry->uTsc = uTsc;
3208 pEntry++;
3209 ASMSerializeInstruction();
3210 if (fLag)
3211 ASMNopPause();
3212 }
3213}
3214
3215
3216/**
3217 * TSC delta measurment algorithm \#2 (GIP_TSC_DELTA_METHOD_2).
3218 *
3219 * See supdrvTscDeltaMethod2CollectData for algorithm details.
3220 *
3221 * @param pArgs The argument/state data.
3222 * @param pSync1 The synchronization structure
3223 * (pDevExt->pTscDeltaSync).
3224 * @param fIsMaster Set if master, clear if worker.
3225 * @param iTry The attempt number.
3226 */
3227static void supdrvTscDeltaMethod2Loop(PSUPDRVGIPTSCDELTARGS pArgs, PSUPTSCDELTASYNC pSync1,
3228 PSUPTSCDELTASYNC2 pMySync, PSUPTSCDELTASYNC2 pOtherSync, bool fIsMaster, uint32_t iTry)
3229{
3230 unsigned iLoop;
3231
3232 if (fIsMaster)
3233 ASMAtomicWriteBool(&pArgs->M2.fQuitEarly, false);
3234
3235 for (iLoop = 0; iLoop < GIP_TSC_DELTA_M2_LOOPS; iLoop++)
3236 {
3237 RTCCUINTREG uFlags;
3238 if (fIsMaster)
3239 {
3240 /*
3241 * Adjust the loop lag fudge.
3242 */
3243# if GIP_TSC_DELTA_M2_PRIMER_LOOPS > 0
3244 if (iLoop < GIP_TSC_DELTA_M2_PRIMER_LOOPS)
3245 {
3246 /* Lag during the priming to be nice to everyone.. */
3247 pArgs->M2.fLagMaster = true;
3248 pArgs->M2.fLagWorker = true;
3249 }
3250 else
3251# endif
3252 if (iLoop < (GIP_TSC_DELTA_M2_LOOPS - GIP_TSC_DELTA_M2_PRIMER_LOOPS) / 4)
3253 {
3254 /* 25 % of the body without lagging. */
3255 pArgs->M2.fLagMaster = false;
3256 pArgs->M2.fLagWorker = false;
3257 }
3258 else if (iLoop < (GIP_TSC_DELTA_M2_LOOPS - GIP_TSC_DELTA_M2_PRIMER_LOOPS) / 4 * 2)
3259 {
3260 /* 25 % of the body with both lagging. */
3261 pArgs->M2.fLagMaster = true;
3262 pArgs->M2.fLagWorker = true;
3263 }
3264 else
3265 {
3266 /* 50% of the body with alternating lag. */
3267 pArgs->M2.fLagMaster = (iLoop & 1) == 0;
3268 pArgs->M2.fLagWorker = (iLoop & 1) == 1;
3269 }
3270
3271 /*
3272 * Sync up with the worker and collect data.
3273 */
3274 TSCDELTA_MASTER_SYNC_BEFORE(pSync1, pMySync, pOtherSync);
3275 supdrvTscDeltaMethod2CollectData(pArgs->M2.pMasterData, &pArgs->M2.pWorkerData->iCurSeqNo, pArgs->M2.fLagMaster);
3276 TSCDELTA_MASTER_SYNC_AFTER(pSync1, pMySync, pOtherSync);
3277
3278 /*
3279 * Process the data.
3280 */
3281# if GIP_TSC_DELTA_M2_PRIMER_LOOPS > 0
3282 if (iLoop >= GIP_TSC_DELTA_M2_PRIMER_LOOPS)
3283# endif
3284 supdrvTscDeltaMethod2ProcessDataOnMaster(pArgs, iLoop);
3285
3286 TSCDELTA_MASTER_KICK_OTHER_OUT_OF_AFTER(pSync1, pMySync, pOtherSync);
3287 }
3288 else
3289 {
3290 /*
3291 * The worker.
3292 */
3293 TSCDELTA_OTHER_SYNC_BEFORE(pSync1, pMySync, pOtherSync, (void)0);
3294 supdrvTscDeltaMethod2CollectData(pArgs->M2.pWorkerData, &pArgs->M2.pMasterData->iCurSeqNo, pArgs->M2.fLagWorker);
3295 TSCDELTA_OTHER_SYNC_AFTER(pSync1, pMySync, pOtherSync);
3296 }
3297
3298 if (ASMAtomicReadBool(&pArgs->M2.fQuitEarly))
3299 break;
3300
3301 }
3302}
3303
3304
3305/**
3306 * Initializes the argument/state data belonging to algorithm \#2.
3307 *
3308 * @returns VBox status code.
3309 * @param pArgs The argument/state data.
3310 */
3311static int supdrvTscDeltaMethod2Init(PSUPDRVGIPTSCDELTARGS pArgs)
3312{
3313 pArgs->M2.pMasterData = NULL;
3314 pArgs->M2.pWorkerData = NULL;
3315
3316 uint32_t const fFlags = /*RTMEMALLOCEX_FLAGS_ANY_CTX |*/ RTMEMALLOCEX_FLAGS_ZEROED;
3317 int rc = RTMemAllocEx(sizeof(*pArgs->M2.pWorkerData), 0, fFlags, (void **)&pArgs->M2.pWorkerData);
3318 if (RT_SUCCESS(rc))
3319 rc = RTMemAllocEx(sizeof(*pArgs->M2.pMasterData), 0, fFlags, (void **)&pArgs->M2.pMasterData);
3320 return rc;
3321}
3322
3323
3324/**
3325 * Undoes what supdrvTscDeltaMethod2Init() did.
3326 *
3327 * @param pArgs The argument/state data.
3328 */
3329static void supdrvTscDeltaMethod2Delete(PSUPDRVGIPTSCDELTARGS pArgs)
3330{
3331 RTMemFreeEx(pArgs->M2.pMasterData, sizeof(*pArgs->M2.pMasterData));
3332 RTMemFreeEx(pArgs->M2.pWorkerData, sizeof(*pArgs->M2.pWorkerData));
3333# if 0
3334 SUPR0Printf("cHits=%d m=%d w=%d\n", pArgs->M2.cHits, pArgs->pMaster->idApic, pArgs->pWorker->idApic);
3335# endif
3336}
3337
3338
3339#endif /* GIP_TSC_DELTA_METHOD_2 */
3340
3341
3342
3343static int supdrvMeasureTscDeltaCallbackAbortSyncSetup(PSUPDRVGIPTSCDELTARGS pArgs, PSUPTSCDELTASYNC2 pMySync,
3344 bool fIsMaster, bool fTimeout)
3345{
3346 PSUPTSCDELTASYNC2 volatile *ppMySync = fIsMaster ? &pArgs->pSyncMaster : &pArgs->pSyncWorker;
3347 PSUPTSCDELTASYNC2 volatile *ppOtherSync = fIsMaster ? &pArgs->pSyncWorker : &pArgs->pSyncMaster;
3348#if defined(DEBUG_bird) && defined(RT_OS_WINDOWS)
3349 uint32_t iTry = 0;
3350#endif
3351
3352 /*
3353 * Clear our sync pointer and make sure the abort flag is set.
3354 */
3355 ASMAtomicWriteNullPtr(ppMySync);
3356 ASMAtomicWriteBool(&pArgs->fAbortSetup, true);
3357
3358 /*
3359 * Make sure the other party is out of there and won't be touching our
3360 * sync state again (would cause stack corruption).
3361 */
3362 while (ASMAtomicReadPtrT(ppOtherSync, PSUPTSCDELTASYNC2) != NULL)
3363 {
3364 ASMNopPause();
3365 ASMNopPause();
3366 ASMNopPause();
3367#if defined(DEBUG_bird) && defined(RT_OS_WINDOWS)
3368if (++iTry == 0) __debugbreak();
3369#endif
3370 }
3371
3372 return 0;
3373}
3374
3375
3376/**
3377 * This is used by supdrvMeasureInitialTscDeltas() to read the TSC on two CPUs
3378 * and compute the delta between them.
3379 *
3380 * To reduce code size a good when timeout handling was added, a dummy return
3381 * value had to be added (saves 1-3 lines per timeout case), thus this
3382 * 'Unwrapped' function and the dummy 0 return value.
3383 *
3384 * @returns 0 (dummy, ignored)
3385 * @param idCpu The CPU we are current scheduled on.
3386 * @param pArgs Pointer to a parameter package.
3387 *
3388 * @remarks Measuring TSC deltas between the CPUs is tricky because we need to
3389 * read the TSC at exactly the same time on both the master and the
3390 * worker CPUs. Due to DMA, bus arbitration, cache locality,
3391 * contention, SMI, pipelining etc. there is no guaranteed way of
3392 * doing this on x86 CPUs.
3393 */
3394static int supdrvMeasureTscDeltaCallbackUnwrapped(RTCPUID idCpu, PSUPDRVGIPTSCDELTARGS pArgs)
3395{
3396 PSUPDRVDEVEXT pDevExt = pArgs->pDevExt;
3397 PSUPTSCDELTASYNC pSync1 = pDevExt->pTscDeltaSync;
3398 PSUPGIPCPU pGipCpuWorker = pArgs->pWorker;
3399 PSUPGIPCPU pGipCpuMaster = pArgs->pMaster;
3400 bool const fIsMaster = idCpu == pGipCpuMaster->idCpu;
3401 uint32_t iTry;
3402 PSUPTSCDELTASYNC2 volatile *ppMySync = fIsMaster ? &pArgs->pSyncMaster : &pArgs->pSyncWorker;
3403 PSUPTSCDELTASYNC2 volatile *ppOtherSync = fIsMaster ? &pArgs->pSyncWorker : &pArgs->pSyncMaster;
3404 SUPTSCDELTASYNC2 MySync;
3405 PSUPTSCDELTASYNC2 pOtherSync;
3406
3407 /* A bit of paranoia first. */
3408 if (!pGipCpuMaster || !pGipCpuWorker)
3409 return 0;
3410
3411 /*
3412 * If the CPU isn't part of the measurement, return immediately.
3413 */
3414 if ( !fIsMaster
3415 && idCpu != pGipCpuWorker->idCpu)
3416 return 0;
3417
3418 /*
3419 * Set up my synchronization stuff and wait for the other party to show up.
3420 *
3421 * We don't wait forever since the other party may be off fishing (offline,
3422 * spinning with ints disables, whatever), we must play nice to the rest of
3423 * the system as this context generally isn't one in which we will get
3424 * preempted and we may hold up a number of lower priority interrupts.
3425 */
3426 ASMAtomicWriteU32(&MySync.uSyncVar, GIP_TSC_DELTA_SYNC2_PRESTART_WAIT);
3427 ASMAtomicWritePtr(ppMySync, &MySync);
3428 MySync.uTscStart = ASMReadTSC();
3429 MySync.cMaxTscTicks = pArgs->cMaxTscTicks;
3430
3431 /* Look for the partner, might not be here yet... Special abort considerations. */
3432 iTry = 0;
3433 while ((pOtherSync = ASMAtomicReadPtrT(ppOtherSync, PSUPTSCDELTASYNC2)) == NULL)
3434 {
3435 ASMNopPause();
3436 if ( ASMAtomicReadBool(&pArgs->fAbortSetup)
3437 || !RTMpIsCpuOnline(fIsMaster ? pGipCpuWorker->idCpu : pGipCpuWorker->idCpu) )
3438 return supdrvMeasureTscDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, false /*fTimeout*/);
3439 if ( (iTry++ & 0xff) == 0
3440 && ASMReadTSC() - MySync.uTscStart > pArgs->cMaxTscTicks)
3441 return supdrvMeasureTscDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, true /*fTimeout*/);
3442#if defined(DEBUG_bird) && defined(RT_OS_WINDOWS)
3443if (iTry == 0) __debugbreak();
3444#endif
3445 ASMNopPause();
3446 }
3447
3448 /* I found my partner, waiting to be found... Special abort considerations. */
3449 if (fIsMaster)
3450 if (!ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_READY, GIP_TSC_DELTA_SYNC2_PRESTART_WAIT)) /* parnaoia */
3451 return supdrvMeasureTscDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, false /*fTimeout*/);
3452
3453 iTry = 0;
3454 while (ASMAtomicReadU32(&MySync.uSyncVar) == GIP_TSC_DELTA_SYNC2_PRESTART_WAIT)
3455 {
3456 ASMNopPause();
3457 if (ASMAtomicReadBool(&pArgs->fAbortSetup))
3458 return supdrvMeasureTscDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, false /*fTimeout*/);
3459 if ( (iTry++ & 0xff) == 0
3460 && ASMReadTSC() - MySync.uTscStart > pArgs->cMaxTscTicks)
3461 {
3462 if ( fIsMaster
3463 && !ASMAtomicCmpXchgU32(&MySync.uSyncVar, GIP_TSC_DELTA_SYNC2_PRESTART_ABORT, GIP_TSC_DELTA_SYNC2_PRESTART_WAIT))
3464 break; /* race #1: slave has moved on, handle timeout in loop instead. */
3465 return supdrvMeasureTscDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, true /*fTimeout*/);
3466 }
3467#if defined(DEBUG_bird) && defined(RT_OS_WINDOWS)
3468if (iTry == 0) __debugbreak();
3469#endif
3470 }
3471
3472 if (!fIsMaster)
3473 if (!ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_READY, GIP_TSC_DELTA_SYNC2_PRESTART_WAIT)) /* race #1 */
3474 return supdrvMeasureTscDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, false /*fTimeout*/);
3475
3476 /*
3477 * Retry loop.
3478 */
3479 Assert(pGipCpuWorker->i64TSCDelta == INT64_MAX);
3480 for (iTry = 0; iTry < 12; iTry++)
3481 {
3482 if (ASMAtomicReadU32(&MySync.uSyncVar) != GIP_TSC_DELTA_SYNC2_READY)
3483 break;
3484
3485 /*
3486 * Do the measurements.
3487 */
3488#ifdef GIP_TSC_DELTA_METHOD_1
3489 supdrvTscDeltaMethod1Loop(pArgs, pSync1, &MySync, pOtherSync, fIsMaster, iTry);
3490#elif defined(GIP_TSC_DELTA_METHOD_2)
3491 supdrvTscDeltaMethod2Loop(pArgs, pSync1, &MySync, pOtherSync, fIsMaster, iTry);
3492#else
3493# error "huh??"
3494#endif
3495 if (ASMAtomicReadU32(&MySync.uSyncVar) != GIP_TSC_DELTA_SYNC2_READY)
3496 break;
3497
3498 /*
3499 * Success? If so, stop trying.
3500 */
3501 if (pGipCpuWorker->i64TSCDelta != INT64_MAX)
3502 {
3503 if (fIsMaster)
3504 {
3505 RTCpuSetDelByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuMaster->iCpuSet);
3506 RTCpuSetAddByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpuMaster->iCpuSet);
3507 }
3508 else
3509 {
3510 RTCpuSetDelByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet);
3511 RTCpuSetAddByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpuWorker->iCpuSet);
3512 }
3513 break;
3514 }
3515 }
3516
3517 /*
3518 * End the synchroniziation dance. We tell the other that we're done,
3519 * then wait for the same kind of reply.
3520 */
3521 ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_FINAL);
3522 ASMAtomicWriteNullPtr(ppMySync);
3523 iTry = 0;
3524 while (ASMAtomicReadU32(&MySync.uSyncVar) != GIP_TSC_DELTA_SYNC2_FINAL)
3525 {
3526 iTry++;
3527 if ( iTry == 0
3528 && !RTMpIsCpuOnline(fIsMaster ? pGipCpuWorker->idCpu : pGipCpuWorker->idCpu))
3529 break; /* this really shouldn't happen. */
3530#if defined(DEBUG_bird) && defined(RT_OS_WINDOWS)
3531if (iTry == 0) __debugbreak();
3532#endif
3533 ASMNopPause();
3534 }
3535
3536 return 0;
3537}
3538
3539/**
3540 * Callback used by supdrvMeasureInitialTscDeltas() to read the TSC on two CPUs
3541 * and compute the delta between them.
3542 *
3543 * @param idCpu The CPU we are current scheduled on.
3544 * @param pvUser1 Pointer to a parameter package (SUPDRVGIPTSCDELTARGS).
3545 * @param pvUser2 Unused.
3546 */
3547static DECLCALLBACK(void) supdrvMeasureTscDeltaCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
3548{
3549 supdrvMeasureTscDeltaCallbackUnwrapped(idCpu, (PSUPDRVGIPTSCDELTARGS)pvUser1);
3550}
3551
3552
3553/**
3554 * Measures the TSC delta between the master GIP CPU and one specified worker
3555 * CPU.
3556 *
3557 * @returns VBox status code.
3558 * @retval VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED on pure measurement
3559 * failure.
3560 * @param pDevExt Pointer to the device instance data.
3561 * @param idxWorker The index of the worker CPU from the GIP's array of
3562 * CPUs.
3563 *
3564 * @remarks This must be called with preemption enabled!
3565 */
3566static int supdrvMeasureTscDeltaOne(PSUPDRVDEVEXT pDevExt, uint32_t idxWorker)
3567{
3568 int rc;
3569 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
3570 RTCPUID idMaster = pDevExt->idGipMaster;
3571 PSUPGIPCPU pGipCpuWorker = &pGip->aCPUs[idxWorker];
3572 PSUPGIPCPU pGipCpuMaster;
3573 uint32_t iGipCpuMaster;
3574
3575 /* Validate input a bit. */
3576 AssertReturn(pGip, VERR_INVALID_PARAMETER);
3577 Assert(pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED);
3578 Assert(RTThreadPreemptIsEnabled(NIL_RTTHREAD));
3579
3580 /*
3581 * Don't attempt measuring the delta for the GIP master.
3582 */
3583 if (pGipCpuWorker->idCpu == idMaster)
3584 {
3585 if (pGipCpuWorker->i64TSCDelta == INT64_MAX) /* This shouldn't happen, but just in case. */
3586 ASMAtomicWriteS64(&pGipCpuWorker->i64TSCDelta, GIP_TSC_DELTA_INITIAL_MASTER_VALUE);
3587 return VINF_SUCCESS;
3588 }
3589
3590 /*
3591 * If the CPU has hyper-threading and the APIC IDs of the master and worker are adjacent,
3592 * try pick a different master. (This fudge only works with multi core systems.)
3593 * ASSUMES related threads have adjacent APIC IDs. ASSUMES two threads per core.
3594 *
3595 * We skip this on AMDs for now as their HTT is different from intel's and
3596 * it doesn't seem to have any favorable effect on the results.
3597 *
3598 * If the master is offline, we need a new master too, so share the code.
3599 */
3600 iGipCpuMaster = supdrvGipFindCpuIndexForCpuId(pGip, idMaster);
3601 AssertReturn(iGipCpuMaster < pGip->cCpus, VERR_INVALID_CPU_ID);
3602 pGipCpuMaster = &pGip->aCPUs[iGipCpuMaster];
3603 if ( ( (pGipCpuMaster->idApic & ~1) == (pGipCpuWorker->idApic & ~1)
3604 && ASMHasCpuId()
3605 && ASMIsValidStdRange(ASMCpuId_EAX(0))
3606 && (ASMCpuId_EDX(1) & X86_CPUID_FEATURE_EDX_HTT)
3607 && !ASMIsAmdCpu()
3608 && pGip->cOnlineCpus > 2)
3609 || !RTMpIsCpuOnline(idMaster) )
3610 {
3611 uint32_t i;
3612 for (i = 0; i < pGip->cCpus; i++)
3613 if ( i != iGipCpuMaster
3614 && i != idxWorker
3615 && pGip->aCPUs[i].enmState == SUPGIPCPUSTATE_ONLINE
3616 && pGip->aCPUs[i].i64TSCDelta != INT64_MAX
3617 && pGip->aCPUs[i].idCpu != NIL_RTCPUID
3618 && pGip->aCPUs[i].idCpu != idMaster /* paranoia starts here... */
3619 && pGip->aCPUs[i].idCpu != pGipCpuWorker->idCpu
3620 && pGip->aCPUs[i].idApic != pGipCpuWorker->idApic
3621 && pGip->aCPUs[i].idApic != pGipCpuMaster->idApic
3622 && RTMpIsCpuOnline(pGip->aCPUs[i].idCpu))
3623 {
3624 iGipCpuMaster = i;
3625 pGipCpuMaster = &pGip->aCPUs[i];
3626 idMaster = pGipCpuMaster->idCpu;
3627 break;
3628 }
3629 }
3630
3631 /*
3632 * Set the master TSC as the initiator. This serializes delta measurments.
3633 */
3634 /** @todo We can use a mutex or five for this now, and move it up before we
3635 * do the HTT/offline-master stuff. */
3636 while (!ASMAtomicCmpXchgU32(&pDevExt->idTscDeltaInitiator, idMaster, NIL_RTCPUID))
3637 {
3638 /*
3639 * Sleep here rather than spin as there is a parallel measurement
3640 * being executed and that can take a good while to be done.
3641 */
3642 RTThreadSleep(1);
3643 }
3644
3645 if (RTCpuSetIsMemberByIndex(&pGip->OnlineCpuSet, pGipCpuWorker->iCpuSet))
3646 {
3647 /** @todo we need to check that the master is online...
3648 * The old supdrvMeasureTscDeltaCallback code would spin forever. */
3649 /*
3650 * Initialize data package for the RTMpOnAll callback.
3651 */
3652 /** @todo this must be allocated, not residing on the stack. */
3653 SUPDRVGIPTSCDELTARGS Args;
3654 RT_ZERO(Args);
3655 Args.pWorker = pGipCpuWorker;
3656 Args.pMaster = pGipCpuMaster;
3657 Args.pDevExt = pDevExt;
3658 Args.pSyncMaster = NULL;
3659 Args.pSyncWorker = NULL;
3660#if 0 /* later */
3661 Args.cMaxTscTicks = ASMAtomicReadU64(&pGip->u64CpuHz) / 2048; /* 488 us */
3662#else
3663 Args.cMaxTscTicks = ASMAtomicReadU64(&pGip->u64CpuHz) / 1024; /* 976 us */
3664#endif
3665
3666#ifdef GIP_TSC_DELTA_METHOD_1
3667 rc = supdrvTscDeltaMethod1Init(&Args);
3668#elif defined(GIP_TSC_DELTA_METHOD_2)
3669 rc = supdrvTscDeltaMethod2Init(&Args);
3670#else
3671# error "huh?"
3672#endif
3673 if (RT_SUCCESS(rc))
3674 {
3675 /*
3676 * Fire TSC-read workers on all CPUs but only synchronize between master
3677 * and one worker to ease memory contention.
3678 */
3679 ASMAtomicWriteS64(&pGipCpuWorker->i64TSCDelta, INT64_MAX);
3680 ASMAtomicWriteU32(&pDevExt->pTscDeltaSync->u, GIP_TSC_DELTA_SYNC_STOP);
3681
3682 /** @todo Add RTMpOnPair and replace this ineffecient broadcast IPI. */
3683 rc = RTMpOnAll(supdrvMeasureTscDeltaCallback, &Args, NULL);
3684 if (RT_SUCCESS(rc))
3685 {
3686 if (RT_LIKELY(pGipCpuWorker->i64TSCDelta != INT64_MAX))
3687 {
3688 /*
3689 * Work the TSC delta applicability rating. It starts
3690 * optimistic in supdrvGipInit, we downgrade it here.
3691 */
3692 SUPGIPUSETSCDELTA enmRating;
3693 if ( pGipCpuWorker->i64TSCDelta > GIP_TSC_DELTA_THRESHOLD_ROUGHLY_ZERO
3694 || pGipCpuWorker->i64TSCDelta < -GIP_TSC_DELTA_THRESHOLD_ROUGHLY_ZERO)
3695 enmRating = SUPGIPUSETSCDELTA_NOT_ZERO;
3696 else if ( pGipCpuWorker->i64TSCDelta > GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO
3697 || pGipCpuWorker->i64TSCDelta < -GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO)
3698 enmRating = SUPGIPUSETSCDELTA_ROUGHLY_ZERO;
3699 else
3700 enmRating = SUPGIPUSETSCDELTA_PRACTICALLY_ZERO;
3701 if (pGip->enmUseTscDelta < enmRating)
3702 {
3703 AssertCompile(sizeof(pGip->enmUseTscDelta) == sizeof(uint32_t));
3704 ASMAtomicWriteU32((uint32_t volatile *)&pGip->enmUseTscDelta, enmRating);
3705 }
3706 }
3707 else
3708 rc = VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED;
3709 }
3710 /** @todo return try-again if we get an offline CPU error. */
3711 }
3712
3713#ifdef GIP_TSC_DELTA_METHOD_1
3714 supdrvTscDeltaMethod1Delete(&Args);
3715#elif defined(GIP_TSC_DELTA_METHOD_2)
3716 supdrvTscDeltaMethod2Delete(&Args);
3717#else
3718# error "huh?"
3719#endif
3720 }
3721 else
3722 rc = VERR_CPU_OFFLINE;
3723
3724 ASMAtomicWriteU32(&pDevExt->idTscDeltaInitiator, NIL_RTCPUID);
3725 return rc;
3726}
3727
3728
3729/**
3730 * Clears TSC delta related variables.
3731 *
3732 * Clears all TSC samples as well as the delta synchronization variable on the
3733 * all the per-CPU structs. Optionally also clears the per-cpu deltas too.
3734 *
3735 * @param pDevExt Pointer to the device instance data.
3736 * @param fClearDeltas Whether the deltas are also to be cleared.
3737 */
3738static void supdrvClearTscSamples(PSUPDRVDEVEXT pDevExt, bool fClearDeltas)
3739{
3740 unsigned iCpu;
3741 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
3742 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
3743 {
3744 PSUPGIPCPU pGipCpu = &pGip->aCPUs[iCpu];
3745 ASMAtomicWriteU64(&pGipCpu->u64TSCSample, GIP_TSC_DELTA_RSVD);
3746 if (fClearDeltas)
3747 ASMAtomicWriteS64(&pGipCpu->i64TSCDelta, INT64_MAX);
3748 }
3749 ASMAtomicWriteU32(&pDevExt->pTscDeltaSync->u, GIP_TSC_DELTA_SYNC_STOP);
3750}
3751
3752
3753/**
3754 * Performs the initial measurements of the TSC deltas between CPUs.
3755 *
3756 * This is called by supdrvGipCreate or triggered by it if threaded.
3757 *
3758 * @returns VBox status code.
3759 * @param pDevExt Pointer to the device instance data.
3760 *
3761 * @remarks Must be called only after supdrvGipInitOnCpu() as this function uses
3762 * idCpu, GIP's online CPU set which are populated in
3763 * supdrvGipInitOnCpu().
3764 */
3765static int supdrvMeasureInitialTscDeltas(PSUPDRVDEVEXT pDevExt)
3766{
3767 PSUPGIPCPU pGipCpuMaster;
3768 unsigned iCpu;
3769 unsigned iOddEven;
3770 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
3771 uint32_t idxMaster = UINT32_MAX;
3772 int rc = VINF_SUCCESS;
3773 uint32_t cMpOnOffEvents = ASMAtomicReadU32(&pDevExt->cMpOnOffEvents);
3774
3775 Assert(pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED);
3776
3777 /*
3778 * Pick the first CPU online as the master TSC and make it the new GIP master based
3779 * on the APIC ID.
3780 *
3781 * Technically we can simply use "idGipMaster" but doing this gives us master as CPU 0
3782 * in most cases making it nicer/easier for comparisons. It is safe to update the GIP
3783 * master as this point since the sync/async timer isn't created yet.
3784 */
3785 supdrvClearTscSamples(pDevExt, true /* fClearDeltas */);
3786 for (iCpu = 0; iCpu < RT_ELEMENTS(pGip->aiCpuFromApicId); iCpu++)
3787 {
3788 uint16_t idxCpu = pGip->aiCpuFromApicId[iCpu];
3789 if (idxCpu != UINT16_MAX)
3790 {
3791 PSUPGIPCPU pGipCpu = &pGip->aCPUs[idxCpu];
3792 if (RTCpuSetIsMemberByIndex(&pGip->OnlineCpuSet, pGipCpu->iCpuSet))
3793 {
3794 idxMaster = idxCpu;
3795 pGipCpu->i64TSCDelta = GIP_TSC_DELTA_INITIAL_MASTER_VALUE;
3796 break;
3797 }
3798 }
3799 }
3800 AssertReturn(idxMaster != UINT32_MAX, VERR_CPU_NOT_FOUND);
3801 pGipCpuMaster = &pGip->aCPUs[idxMaster];
3802 ASMAtomicWriteSize(&pDevExt->idGipMaster, pGipCpuMaster->idCpu);
3803
3804 /*
3805 * If there is only a single CPU online we have nothing to do.
3806 */
3807 if (pGip->cOnlineCpus <= 1)
3808 {
3809 AssertReturn(pGip->cOnlineCpus > 0, VERR_INTERNAL_ERROR_5);
3810 return VINF_SUCCESS;
3811 }
3812
3813 /*
3814 * Loop thru the GIP CPU array and get deltas for each CPU (except the
3815 * master). We do the CPUs with the even numbered APIC IDs first so that
3816 * we've got alternative master CPUs to pick from on hyper-threaded systems.
3817 */
3818 for (iOddEven = 0; iOddEven < 2; iOddEven++)
3819 {
3820 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
3821 {
3822 PSUPGIPCPU pGipCpuWorker = &pGip->aCPUs[iCpu];
3823 if ( iCpu != idxMaster
3824 && (iOddEven > 0 || (pGipCpuWorker->idApic & 1) == 0)
3825 && RTCpuSetIsMemberByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet))
3826 {
3827 rc = supdrvMeasureTscDeltaOne(pDevExt, iCpu);
3828 if (RT_FAILURE(rc))
3829 {
3830 SUPR0Printf("supdrvMeasureTscDeltaOne failed. rc=%d CPU[%u].idCpu=%u Master[%u].idCpu=%u\n", rc, iCpu,
3831 pGipCpuWorker->idCpu, idxMaster, pDevExt->idGipMaster, pGipCpuMaster->idCpu);
3832 break;
3833 }
3834
3835 if (ASMAtomicReadU32(&pDevExt->cMpOnOffEvents) != cMpOnOffEvents)
3836 {
3837 SUPR0Printf("One or more CPUs transitioned between online & offline states. I'm confused, retry...\n");
3838 rc = VERR_TRY_AGAIN;
3839 break;
3840 }
3841 }
3842 }
3843 }
3844
3845 return rc;
3846}
3847
3848
3849#ifdef SUPDRV_USE_TSC_DELTA_THREAD
3850
3851/**
3852 * Switches the TSC-delta measurement thread into the butchered state.
3853 *
3854 * @returns VBox status code.
3855 * @param pDevExt Pointer to the device instance data.
3856 * @param fSpinlockHeld Whether the TSC-delta spinlock is held or not.
3857 * @param pszFailed An error message to log.
3858 * @param rcFailed The error code to exit the thread with.
3859 */
3860static int supdrvTscDeltaThreadButchered(PSUPDRVDEVEXT pDevExt, bool fSpinlockHeld, const char *pszFailed, int rcFailed)
3861{
3862 if (!fSpinlockHeld)
3863 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
3864
3865 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Butchered;
3866 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
3867 OSDBGPRINT(("supdrvTscDeltaThreadButchered: %s. rc=%Rrc\n", rcFailed));
3868 return rcFailed;
3869}
3870
3871
3872/**
3873 * The TSC-delta measurement thread.
3874 *
3875 * @returns VBox status code.
3876 * @param hThread The thread handle.
3877 * @param pvUser Opaque pointer to the device instance data.
3878 */
3879static DECLCALLBACK(int) supdrvTscDeltaThread(RTTHREAD hThread, void *pvUser)
3880{
3881 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
3882 bool fInitialMeasurement = true;
3883 uint32_t cConsecutiveTimeouts = 0;
3884 int rc = VERR_INTERNAL_ERROR_2;
3885 for (;;)
3886 {
3887 /*
3888 * Switch on the current state.
3889 */
3890 SUPDRVTSCDELTATHREADSTATE enmState;
3891 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
3892 enmState = pDevExt->enmTscDeltaThreadState;
3893 switch (enmState)
3894 {
3895 case kTscDeltaThreadState_Creating:
3896 {
3897 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Listening;
3898 rc = RTSemEventSignal(pDevExt->hTscDeltaEvent);
3899 if (RT_FAILURE(rc))
3900 return supdrvTscDeltaThreadButchered(pDevExt, true /* fSpinlockHeld */, "RTSemEventSignal", rc);
3901 /* fall thru */
3902 }
3903
3904 case kTscDeltaThreadState_Listening:
3905 {
3906 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
3907
3908 /* Simple adaptive timeout. */
3909 if (cConsecutiveTimeouts++ == 10)
3910 {
3911 if (pDevExt->cMsTscDeltaTimeout == 1) /* 10 ms */
3912 pDevExt->cMsTscDeltaTimeout = 10;
3913 else if (pDevExt->cMsTscDeltaTimeout == 10) /* +100 ms */
3914 pDevExt->cMsTscDeltaTimeout = 100;
3915 else if (pDevExt->cMsTscDeltaTimeout == 100) /* +1000 ms */
3916 pDevExt->cMsTscDeltaTimeout = 500;
3917 cConsecutiveTimeouts = 0;
3918 }
3919 rc = RTThreadUserWait(pDevExt->hTscDeltaThread, pDevExt->cMsTscDeltaTimeout);
3920 if ( RT_FAILURE(rc)
3921 && rc != VERR_TIMEOUT)
3922 return supdrvTscDeltaThreadButchered(pDevExt, false /* fSpinlockHeld */, "RTThreadUserWait", rc);
3923 RTThreadUserReset(pDevExt->hTscDeltaThread);
3924 break;
3925 }
3926
3927 case kTscDeltaThreadState_WaitAndMeasure:
3928 {
3929 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Measuring;
3930 rc = RTSemEventSignal(pDevExt->hTscDeltaEvent); /* (Safe on windows as long as spinlock isn't IRQ safe.) */
3931 if (RT_FAILURE(rc))
3932 return supdrvTscDeltaThreadButchered(pDevExt, true /* fSpinlockHeld */, "RTSemEventSignal", rc);
3933 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
3934 pDevExt->cMsTscDeltaTimeout = 1;
3935 RTThreadSleep(10);
3936 /* fall thru */
3937 }
3938
3939 case kTscDeltaThreadState_Measuring:
3940 {
3941 cConsecutiveTimeouts = 0;
3942 if (fInitialMeasurement)
3943 {
3944 int cTries = 8;
3945 int cMsWaitPerTry = 10;
3946 fInitialMeasurement = false;
3947 do
3948 {
3949 rc = supdrvMeasureInitialTscDeltas(pDevExt);
3950 if ( RT_SUCCESS(rc)
3951 || ( RT_FAILURE(rc)
3952 && rc != VERR_TRY_AGAIN
3953 && rc != VERR_CPU_OFFLINE))
3954 {
3955 break;
3956 }
3957 RTThreadSleep(cMsWaitPerTry);
3958 } while (cTries-- > 0);
3959 }
3960 else
3961 {
3962 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
3963 unsigned iCpu;
3964
3965 /* Measure TSC-deltas only for the CPUs that are in the set. */
3966 rc = VINF_SUCCESS;
3967 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
3968 {
3969 PSUPGIPCPU pGipCpuWorker = &pGip->aCPUs[iCpu];
3970 if (RTCpuSetIsMemberByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet))
3971 {
3972 if (pGipCpuWorker->i64TSCDelta == INT64_MAX)
3973 {
3974 int rc2 = supdrvMeasureTscDeltaOne(pDevExt, iCpu);
3975 if (RT_FAILURE(rc2) && RT_SUCCESS(rc))
3976 rc = rc2;
3977 }
3978 else
3979 {
3980 /*
3981 * The thread/someone must've called SUPR0TscDeltaMeasureBySetIndex,
3982 * mark the delta as fine to get the timer thread off our back.
3983 */
3984 RTCpuSetDelByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet);
3985 RTCpuSetAddByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpuWorker->iCpuSet);
3986 }
3987 }
3988 }
3989 }
3990 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
3991 if (pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Measuring)
3992 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Listening;
3993 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
3994 Assert(rc != VERR_NOT_AVAILABLE); /* VERR_NOT_AVAILABLE is used as the initial value. */
3995 ASMAtomicWriteS32(&pDevExt->rcTscDelta, rc);
3996 break;
3997 }
3998
3999 case kTscDeltaThreadState_Terminating:
4000 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Destroyed;
4001 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4002 return VINF_SUCCESS;
4003
4004 case kTscDeltaThreadState_Butchered:
4005 default:
4006 return supdrvTscDeltaThreadButchered(pDevExt, true /* fSpinlockHeld */, "Invalid state", VERR_INVALID_STATE);
4007 }
4008 }
4009
4010 return rc;
4011}
4012
4013
4014/**
4015 * Waits for the TSC-delta measurement thread to respond to a state change.
4016 *
4017 * @returns VINF_SUCCESS on success, VERR_TIMEOUT if it doesn't respond in time,
4018 * other error code on internal error.
4019 *
4020 * @param pThis Pointer to the grant service instance data.
4021 * @param enmCurState The current state.
4022 * @param enmNewState The new state we're waiting for it to enter.
4023 */
4024static int supdrvTscDeltaThreadWait(PSUPDRVDEVEXT pDevExt, SUPDRVTSCDELTATHREADSTATE enmCurState,
4025 SUPDRVTSCDELTATHREADSTATE enmNewState)
4026{
4027 /*
4028 * Wait a short while for the expected state transition.
4029 */
4030 int rc;
4031 RTSemEventWait(pDevExt->hTscDeltaEvent, RT_MS_1SEC);
4032 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4033 if (pDevExt->enmTscDeltaThreadState == enmNewState)
4034 {
4035 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4036 rc = VINF_SUCCESS;
4037 }
4038 else if (pDevExt->enmTscDeltaThreadState == enmCurState)
4039 {
4040 /*
4041 * Wait longer if the state has not yet transitioned to the one we want.
4042 */
4043 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4044 rc = RTSemEventWait(pDevExt->hTscDeltaEvent, 50 * RT_MS_1SEC);
4045 if ( RT_SUCCESS(rc)
4046 || rc == VERR_TIMEOUT)
4047 {
4048 /*
4049 * Check the state whether we've succeeded.
4050 */
4051 SUPDRVTSCDELTATHREADSTATE enmState;
4052 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4053 enmState = pDevExt->enmTscDeltaThreadState;
4054 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4055 if (enmState == enmNewState)
4056 rc = VINF_SUCCESS;
4057 else if (enmState == enmCurState)
4058 {
4059 rc = VERR_TIMEOUT;
4060 OSDBGPRINT(("supdrvTscDeltaThreadWait: timed out state transition. enmState=%d enmNewState=%d\n", enmState,
4061 enmNewState));
4062 }
4063 else
4064 {
4065 rc = VERR_INTERNAL_ERROR;
4066 OSDBGPRINT(("supdrvTscDeltaThreadWait: invalid state transition from %d to %d, expected %d\n", enmCurState,
4067 enmState, enmNewState));
4068 }
4069 }
4070 else
4071 OSDBGPRINT(("supdrvTscDeltaThreadWait: RTSemEventWait failed. rc=%Rrc\n", rc));
4072 }
4073 else
4074 {
4075 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4076 OSDBGPRINT(("supdrvTscDeltaThreadWait: invalid state transition from %d to %d\n", enmCurState, enmNewState));
4077 rc = VERR_INTERNAL_ERROR;
4078 }
4079
4080 return rc;
4081}
4082
4083
4084/**
4085 * Waits for TSC-delta measurements to be completed for all online CPUs.
4086 *
4087 * @returns VBox status code.
4088 * @param pDevExt Pointer to the device instance data.
4089 */
4090static int supdrvTscDeltaThreadWaitForOnlineCpus(PSUPDRVDEVEXT pDevExt)
4091{
4092 int cTriesLeft = 5;
4093 int cMsTotalWait;
4094 int cMsWaited = 0;
4095 int cMsWaitGranularity = 1;
4096
4097 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
4098 AssertReturn(pGip, VERR_INVALID_POINTER);
4099
4100 if (RT_UNLIKELY(pDevExt->hTscDeltaThread == NIL_RTTHREAD))
4101 return VERR_THREAD_NOT_WAITABLE;
4102
4103 cMsTotalWait = RT_MIN(pGip->cPresentCpus + 10, 200);
4104 while (cTriesLeft-- > 0)
4105 {
4106 if (RTCpuSetIsEqual(&pDevExt->TscDeltaObtainedCpuSet, &pGip->OnlineCpuSet))
4107 return VINF_SUCCESS;
4108 RTThreadSleep(cMsWaitGranularity);
4109 cMsWaited += cMsWaitGranularity;
4110 if (cMsWaited >= cMsTotalWait)
4111 break;
4112 }
4113
4114 return VERR_TIMEOUT;
4115}
4116
4117
4118/**
4119 * Terminates the actual thread running supdrvTscDeltaThread().
4120 *
4121 * This is an internal worker function for supdrvTscDeltaThreadInit() and
4122 * supdrvTscDeltaTerm().
4123 *
4124 * @param pDevExt Pointer to the device instance data.
4125 */
4126static void supdrvTscDeltaThreadTerminate(PSUPDRVDEVEXT pDevExt)
4127{
4128 int rc;
4129 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4130 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Terminating;
4131 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4132 RTThreadUserSignal(pDevExt->hTscDeltaThread);
4133 rc = RTThreadWait(pDevExt->hTscDeltaThread, 50 * RT_MS_1SEC, NULL /* prc */);
4134 if (RT_FAILURE(rc))
4135 {
4136 /* Signal a few more times before giving up. */
4137 int cTriesLeft = 5;
4138 while (--cTriesLeft > 0)
4139 {
4140 RTThreadUserSignal(pDevExt->hTscDeltaThread);
4141 rc = RTThreadWait(pDevExt->hTscDeltaThread, 2 * RT_MS_1SEC, NULL /* prc */);
4142 if (rc != VERR_TIMEOUT)
4143 break;
4144 }
4145 }
4146}
4147
4148
4149/**
4150 * Initializes and spawns the TSC-delta measurement thread.
4151 *
4152 * A thread is required for servicing re-measurement requests from events like
4153 * CPUs coming online, suspend/resume etc. as it cannot be done synchronously
4154 * under all contexts on all OSs.
4155 *
4156 * @returns VBox status code.
4157 * @param pDevExt Pointer to the device instance data.
4158 *
4159 * @remarks Must only be called -after- initializing GIP and setting up MP
4160 * notifications!
4161 */
4162static int supdrvTscDeltaThreadInit(PSUPDRVDEVEXT pDevExt)
4163{
4164 int rc;
4165 Assert(pDevExt->pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED);
4166 rc = RTSpinlockCreate(&pDevExt->hTscDeltaSpinlock, RTSPINLOCK_FLAGS_INTERRUPT_UNSAFE, "VBoxTscSpnLck");
4167 if (RT_SUCCESS(rc))
4168 {
4169 rc = RTSemEventCreate(&pDevExt->hTscDeltaEvent);
4170 if (RT_SUCCESS(rc))
4171 {
4172 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Creating;
4173 pDevExt->cMsTscDeltaTimeout = 1;
4174 rc = RTThreadCreate(&pDevExt->hTscDeltaThread, supdrvTscDeltaThread, pDevExt, 0 /* cbStack */,
4175 RTTHREADTYPE_DEFAULT, RTTHREADFLAGS_WAITABLE, "VBoxTscThread");
4176 if (RT_SUCCESS(rc))
4177 {
4178 rc = supdrvTscDeltaThreadWait(pDevExt, kTscDeltaThreadState_Creating, kTscDeltaThreadState_Listening);
4179 if (RT_SUCCESS(rc))
4180 {
4181 ASMAtomicWriteS32(&pDevExt->rcTscDelta, VERR_NOT_AVAILABLE);
4182 return rc;
4183 }
4184
4185 OSDBGPRINT(("supdrvTscDeltaInit: supdrvTscDeltaThreadWait failed. rc=%Rrc\n", rc));
4186 supdrvTscDeltaThreadTerminate(pDevExt);
4187 }
4188 else
4189 OSDBGPRINT(("supdrvTscDeltaInit: RTThreadCreate failed. rc=%Rrc\n", rc));
4190 RTSemEventDestroy(pDevExt->hTscDeltaEvent);
4191 pDevExt->hTscDeltaEvent = NIL_RTSEMEVENT;
4192 }
4193 else
4194 OSDBGPRINT(("supdrvTscDeltaInit: RTSemEventCreate failed. rc=%Rrc\n", rc));
4195 RTSpinlockDestroy(pDevExt->hTscDeltaSpinlock);
4196 pDevExt->hTscDeltaSpinlock = NIL_RTSPINLOCK;
4197 }
4198 else
4199 OSDBGPRINT(("supdrvTscDeltaInit: RTSpinlockCreate failed. rc=%Rrc\n", rc));
4200
4201 return rc;
4202}
4203
4204
4205/**
4206 * Terminates the TSC-delta measurement thread and cleanup.
4207 *
4208 * @param pDevExt Pointer to the device instance data.
4209 */
4210static void supdrvTscDeltaTerm(PSUPDRVDEVEXT pDevExt)
4211{
4212 if ( pDevExt->hTscDeltaSpinlock != NIL_RTSPINLOCK
4213 && pDevExt->hTscDeltaEvent != NIL_RTSEMEVENT)
4214 {
4215 supdrvTscDeltaThreadTerminate(pDevExt);
4216 }
4217
4218 if (pDevExt->hTscDeltaSpinlock != NIL_RTSPINLOCK)
4219 {
4220 RTSpinlockDestroy(pDevExt->hTscDeltaSpinlock);
4221 pDevExt->hTscDeltaSpinlock = NIL_RTSPINLOCK;
4222 }
4223
4224 if (pDevExt->hTscDeltaEvent != NIL_RTSEMEVENT)
4225 {
4226 RTSemEventDestroy(pDevExt->hTscDeltaEvent);
4227 pDevExt->hTscDeltaEvent = NIL_RTSEMEVENT;
4228 }
4229
4230 ASMAtomicWriteS32(&pDevExt->rcTscDelta, VERR_NOT_AVAILABLE);
4231}
4232
4233#endif /* SUPDRV_USE_TSC_DELTA_THREAD */
4234
4235/**
4236 * Measure the TSC delta for the CPU given by its CPU set index.
4237 *
4238 * @returns VBox status code.
4239 * @retval VERR_INTERRUPTED if interrupted while waiting.
4240 * @retval VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED if we were unable to get a
4241 * measurment.
4242 * @retval VERR_CPU_OFFLINE if the specified CPU is offline.
4243 * @retval VERR_CPU_OFFLINE if the specified CPU is offline.
4244 *
4245 * @param pSession The caller's session. GIP must've been mapped.
4246 * @param iCpuSet The CPU set index of the CPU to measure.
4247 * @param fFlags Flags, SUP_TSCDELTA_MEASURE_F_XXX.
4248 * @param cMsWaitRetry Number of milliseconds to wait between each retry.
4249 * @param cMsWaitThread Number of milliseconds to wait for the thread to get
4250 * ready.
4251 * @param cTries Number of times to try, pass 0 for the default.
4252 */
4253SUPR0DECL(int) SUPR0TscDeltaMeasureBySetIndex(PSUPDRVSESSION pSession, uint32_t iCpuSet, uint32_t fFlags,
4254 RTMSINTERVAL cMsWaitRetry, RTMSINTERVAL cMsWaitThread, uint32_t cTries)
4255{
4256 PSUPDRVDEVEXT pDevExt;
4257 PSUPGLOBALINFOPAGE pGip;
4258 uint16_t iGipCpu;
4259 int rc;
4260#ifdef SUPDRV_USE_TSC_DELTA_THREAD
4261 uint64_t msTsStartWait;
4262 uint32_t iWaitLoop;
4263#endif
4264
4265 /*
4266 * Validate and adjust the input.
4267 */
4268 AssertReturn(SUP_IS_SESSION_VALID(pSession), VERR_INVALID_PARAMETER);
4269 if (!pSession->fGipReferenced)
4270 return VERR_WRONG_ORDER;
4271
4272 pDevExt = pSession->pDevExt;
4273 AssertReturn(SUP_IS_DEVEXT_VALID(pDevExt), VERR_INVALID_PARAMETER);
4274
4275 pGip = pDevExt->pGip;
4276 AssertPtrReturn(pGip, VERR_INTERNAL_ERROR_2);
4277
4278 AssertReturn(iCpuSet < RTCPUSET_MAX_CPUS, VERR_INVALID_CPU_INDEX);
4279 AssertReturn(iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx), VERR_INVALID_CPU_INDEX);
4280 iGipCpu = pGip->aiCpuFromCpuSetIdx[iCpuSet];
4281 AssertReturn(iGipCpu < pGip->cCpus, VERR_INVALID_CPU_INDEX);
4282
4283 if (fFlags & ~SUP_TSCDELTA_MEASURE_F_VALID_MASK)
4284 return VERR_INVALID_FLAGS;
4285
4286 if (cTries == 0)
4287 cTries = 12;
4288 else if (cTries > 256)
4289 cTries = 256;
4290
4291 if (cMsWaitRetry == 0)
4292 cMsWaitRetry = 2;
4293 else if (cMsWaitRetry > 1000)
4294 cMsWaitRetry = 1000;
4295
4296 /*
4297 * The request is a noop if the TSC delta isn't being used.
4298 */
4299 if (pGip->enmUseTscDelta <= SUPGIPUSETSCDELTA_ZERO_CLAIMED)
4300 return VINF_SUCCESS;
4301
4302#ifdef SUPDRV_USE_TSC_DELTA_THREAD
4303 /*
4304 * Has the TSC already been measured and we're not forced to redo it?
4305 */
4306 if ( pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX
4307 && !(fFlags & SUP_TSCDELTA_MEASURE_F_FORCE))
4308 return VINF_SUCCESS;
4309
4310 /*
4311 * Asynchronous request? Forward it to the thread, no waiting.
4312 */
4313 if (fFlags & SUP_TSCDELTA_MEASURE_F_ASYNC)
4314 {
4315 /** @todo Async. doesn't implement options like retries, waiting. We'll need
4316 * to pass those options to the thread somehow and implement it in the
4317 * thread. Check if anyone uses/needs fAsync before implementing this. */
4318 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4319 RTCpuSetAddByIndex(&pDevExt->TscDeltaCpuSet, iCpuSet);
4320 if ( pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Listening
4321 || pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Measuring)
4322 {
4323 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_WaitAndMeasure;
4324 rc = VINF_SUCCESS;
4325 }
4326 else
4327 rc = VERR_THREAD_IS_DEAD;
4328 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4329 RTThreadUserSignal(pDevExt->hTscDeltaThread);
4330 return VINF_SUCCESS;
4331 }
4332
4333 /*
4334 * If a TSC-delta measurement request is already being serviced by the thread,
4335 * wait 'cTries' times if a retry-timeout is provided, otherwise bail as busy.
4336 */
4337 msTsStartWait = RTTimeSystemMilliTS();
4338 for (iWaitLoop = 0;; iWaitLoop++)
4339 {
4340 uint64_t cMsElapsed;
4341 SUPDRVTSCDELTATHREADSTATE enmState;
4342 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4343 enmState = pDevExt->enmTscDeltaThreadState;
4344 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4345
4346 if (enmState == kTscDeltaThreadState_Measuring)
4347 { /* Must wait, the thread is busy. */ }
4348 else if (enmState == kTscDeltaThreadState_WaitAndMeasure)
4349 { /* Must wait, this state only says what will happen next. */ }
4350 else if (enmState == kTscDeltaThreadState_Terminating)
4351 { /* Must wait, this state only says what should happen next. */ }
4352 else
4353 break; /* All other states, the thread is either idly listening or dead. */
4354
4355 /* Wait or fail. */
4356 if (cMsWaitThread == 0)
4357 return VERR_SUPDRV_TSC_DELTA_MEASUREMENT_BUSY;
4358 cMsElapsed = RTTimeSystemMilliTS() - msTsStartWait;
4359 if (cMsElapsed >= cMsWaitThread)
4360 return VERR_SUPDRV_TSC_DELTA_MEASUREMENT_BUSY;
4361
4362 rc = RTThreadSleep(RT_MIN((RTMSINTERVAL)(cMsWaitThread - cMsElapsed), RT_MIN(iWaitLoop + 1, 10)));
4363 if (rc == VERR_INTERRUPTED)
4364 return rc;
4365 }
4366#endif /* SUPDRV_USE_TSC_DELTA_THREAD */
4367
4368 /*
4369 * Try measure the TSC delta the given number of times.
4370 */
4371 for (;;)
4372 {
4373 /* Unless we're forced to measure the delta, check whether it's done already. */
4374 if ( !(fFlags & SUP_TSCDELTA_MEASURE_F_FORCE)
4375 && pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX)
4376 {
4377 rc = VINF_SUCCESS;
4378 break;
4379 }
4380
4381 /* Measure it. */
4382 rc = supdrvMeasureTscDeltaOne(pDevExt, iGipCpu);
4383 if (rc != VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED)
4384 {
4385 Assert(pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX || RT_FAILURE_NP(rc));
4386 break;
4387 }
4388
4389 /* Retry? */
4390 if (cTries <= 1)
4391 break;
4392 cTries--;
4393
4394 /* Always delay between retries (be nice to the rest of the system
4395 and avoid the BSOD hounds). */
4396 rc = RTThreadSleep(cMsWaitRetry);
4397 if (rc == VERR_INTERRUPTED)
4398 break;
4399 }
4400
4401 return rc;
4402}
4403
4404
4405/**
4406 * Service a TSC-delta measurement request.
4407 *
4408 * @returns VBox status code.
4409 * @param pDevExt Pointer to the device instance data.
4410 * @param pSession The support driver session.
4411 * @param pReq Pointer to the TSC-delta measurement request.
4412 */
4413int VBOXCALL supdrvIOCtl_TscDeltaMeasure(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, PSUPTSCDELTAMEASURE pReq)
4414{
4415 uint32_t cTries;
4416 uint32_t iCpuSet;
4417 uint32_t fFlags;
4418 RTMSINTERVAL cMsWaitRetry;
4419
4420 /*
4421 * Validate and adjust/resolve the input so they can be passed onto SUPR0TscDeltaMeasureBySetIndex.
4422 */
4423 AssertPtr(pDevExt); AssertPtr(pSession); AssertPtr(pReq); /* paranoia^2 */
4424
4425 if (pReq->u.In.idCpu == NIL_RTCPUID)
4426 return VERR_INVALID_CPU_ID;
4427 iCpuSet = RTMpCpuIdToSetIndex(pReq->u.In.idCpu);
4428 if (iCpuSet >= RTCPUSET_MAX_CPUS)
4429 return VERR_INVALID_CPU_ID;
4430
4431 cTries = pReq->u.In.cRetries == 0 ? 0 : (uint32_t)pReq->u.In.cRetries + 1;
4432
4433 cMsWaitRetry = RT_MAX(pReq->u.In.cMsWaitRetry, 5);
4434
4435 fFlags = 0;
4436 if (pReq->u.In.fAsync)
4437 fFlags |= SUP_TSCDELTA_MEASURE_F_ASYNC;
4438 if (pReq->u.In.fForce)
4439 fFlags |= SUP_TSCDELTA_MEASURE_F_FORCE;
4440
4441 return SUPR0TscDeltaMeasureBySetIndex(pSession, iCpuSet, fFlags, cMsWaitRetry,
4442 cTries == 0 ? 5*RT_MS_1SEC : cMsWaitRetry * cTries /*cMsWaitThread*/,
4443 cTries);
4444}
4445
4446
4447/**
4448 * Reads TSC with delta applied.
4449 *
4450 * Will try to resolve delta value INT64_MAX before applying it. This is the
4451 * main purpose of this function, to handle the case where the delta needs to be
4452 * determined.
4453 *
4454 * @returns VBox status code.
4455 * @param pDevExt Pointer to the device instance data.
4456 * @param pSession The support driver session.
4457 * @param pReq Pointer to the TSC-read request.
4458 */
4459int VBOXCALL supdrvIOCtl_TscRead(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, PSUPTSCREAD pReq)
4460{
4461 PSUPGLOBALINFOPAGE pGip;
4462 int rc;
4463
4464 /*
4465 * Validate. We require the client to have mapped GIP (no asserting on
4466 * ring-3 preconditions).
4467 */
4468 AssertPtr(pDevExt); AssertPtr(pReq); AssertPtr(pSession); /* paranoia^2 */
4469 if (pSession->GipMapObjR3 == NIL_RTR0MEMOBJ)
4470 return VERR_WRONG_ORDER;
4471 pGip = pDevExt->pGip;
4472 AssertReturn(pGip, VERR_INTERNAL_ERROR_2);
4473
4474 /*
4475 * We're usually here because we need to apply delta, but we shouldn't be
4476 * upset if the GIP is some different mode.
4477 */
4478 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
4479 {
4480 uint32_t cTries = 0;
4481 for (;;)
4482 {
4483 /*
4484 * Start by gathering the data, using CLI for disabling preemption
4485 * while we do that.
4486 */
4487 RTCCUINTREG uFlags = ASMIntDisableFlags();
4488 int iCpuSet = RTMpCpuIdToSetIndex(RTMpCpuId());
4489 int iGipCpu;
4490 if (RT_LIKELY( (unsigned)iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
4491 && (iGipCpu = pGip->aiCpuFromCpuSetIdx[iCpuSet]) < pGip->cCpus ))
4492 {
4493 int64_t i64Delta = pGip->aCPUs[iGipCpu].i64TSCDelta;
4494 pReq->u.Out.idApic = pGip->aCPUs[iGipCpu].idApic;
4495 pReq->u.Out.u64AdjustedTsc = ASMReadTSC();
4496 ASMSetFlags(uFlags);
4497
4498 /*
4499 * If we're lucky we've got a delta, but no predicitions here
4500 * as this I/O control is normally only used when the TSC delta
4501 * is set to INT64_MAX.
4502 */
4503 if (i64Delta != INT64_MAX)
4504 {
4505 pReq->u.Out.u64AdjustedTsc -= i64Delta;
4506 rc = VINF_SUCCESS;
4507 break;
4508 }
4509
4510 /* Give up after a few times. */
4511 if (cTries >= 4)
4512 {
4513 rc = VWRN_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED;
4514 break;
4515 }
4516
4517 /* Need to measure the delta an try again. */
4518 rc = supdrvMeasureTscDeltaOne(pDevExt, iGipCpu);
4519 Assert(pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX || RT_FAILURE_NP(rc));
4520 /** @todo should probably delay on failure... dpc watchdogs */
4521 }
4522 else
4523 {
4524 /* This really shouldn't happen. */
4525 AssertMsgFailed(("idCpu=%#x iCpuSet=%#x (%d)\n", RTMpCpuId(), iCpuSet, iCpuSet));
4526 pReq->u.Out.idApic = ASMGetApicId();
4527 pReq->u.Out.u64AdjustedTsc = ASMReadTSC();
4528 ASMSetFlags(uFlags);
4529 rc = VERR_INTERNAL_ERROR_5; /** @todo change to warning. */
4530 break;
4531 }
4532 }
4533 }
4534 else
4535 {
4536 /*
4537 * No delta to apply. Easy. Deal with preemption the lazy way.
4538 */
4539 RTCCUINTREG uFlags = ASMIntDisableFlags();
4540 int iCpuSet = RTMpCpuIdToSetIndex(RTMpCpuId());
4541 int iGipCpu;
4542 if (RT_LIKELY( (unsigned)iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
4543 && (iGipCpu = pGip->aiCpuFromCpuSetIdx[iCpuSet]) < pGip->cCpus ))
4544 pReq->u.Out.idApic = pGip->aCPUs[iGipCpu].idApic;
4545 else
4546 pReq->u.Out.idApic = ASMGetApicId();
4547 pReq->u.Out.u64AdjustedTsc = ASMReadTSC();
4548 ASMSetFlags(uFlags);
4549 rc = VINF_SUCCESS;
4550 }
4551
4552 return rc;
4553}
4554
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette