VirtualBox

source: vbox/trunk/src/VBox/HostDrivers/Support/SUPDrvGip.cpp@ 74944

Last change on this file since 74944 was 73097, checked in by vboxsync, 7 years ago

*: Made RT_UOFFSETOF, RT_OFFSETOF, RT_UOFFSETOF_ADD and RT_OFFSETOF_ADD work like builtin_offsetof() and require compile time resolvable requests, adding RT_UOFFSETOF_DYN for the dynamic questions that can only be answered at runtime.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 180.9 KB
Line 
1/* $Id: SUPDrvGip.cpp 73097 2018-07-12 21:06:33Z vboxsync $ */
2/** @file
3 * VBoxDrv - The VirtualBox Support Driver - Common code for GIP.
4 */
5
6/*
7 * Copyright (C) 2006-2017 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*********************************************************************************************************************************
29* Header Files *
30*********************************************************************************************************************************/
31#define LOG_GROUP LOG_GROUP_SUP_DRV
32#define SUPDRV_AGNOSTIC
33#include "SUPDrvInternal.h"
34#ifndef PAGE_SHIFT
35# include <iprt/param.h>
36#endif
37#include <iprt/asm.h>
38#include <iprt/asm-amd64-x86.h>
39#include <iprt/asm-math.h>
40#include <iprt/cpuset.h>
41#include <iprt/handletable.h>
42#include <iprt/mem.h>
43#include <iprt/mp.h>
44#include <iprt/power.h>
45#include <iprt/process.h>
46#include <iprt/semaphore.h>
47#include <iprt/spinlock.h>
48#include <iprt/thread.h>
49#include <iprt/uuid.h>
50#include <iprt/net.h>
51#include <iprt/crc.h>
52#include <iprt/string.h>
53#include <iprt/timer.h>
54#if defined(RT_OS_DARWIN) || defined(RT_OS_SOLARIS) || defined(RT_OS_FREEBSD)
55# include <iprt/rand.h>
56# include <iprt/path.h>
57#endif
58#include <iprt/uint128.h>
59#include <iprt/x86.h>
60
61#include <VBox/param.h>
62#include <VBox/log.h>
63#include <VBox/err.h>
64
65#if defined(RT_OS_SOLARIS) || defined(RT_OS_DARWIN)
66# include "dtrace/SUPDrv.h"
67#else
68/* ... */
69#endif
70
71
72/*********************************************************************************************************************************
73* Defined Constants And Macros *
74*********************************************************************************************************************************/
75/** The frequency by which we recalculate the u32UpdateHz and
76 * u32UpdateIntervalNS GIP members. The value must be a power of 2.
77 *
78 * Warning: Bumping this too high might overflow u32UpdateIntervalNS.
79 */
80#define GIP_UPDATEHZ_RECALC_FREQ 0x800
81
82/** A reserved TSC value used for synchronization as well as measurement of
83 * TSC deltas. */
84#define GIP_TSC_DELTA_RSVD UINT64_MAX
85/** The number of TSC delta measurement loops in total (includes primer and
86 * read-time loops). */
87#define GIP_TSC_DELTA_LOOPS 96
88/** The number of cache primer loops. */
89#define GIP_TSC_DELTA_PRIMER_LOOPS 4
90/** The number of loops until we keep computing the minimum read time. */
91#define GIP_TSC_DELTA_READ_TIME_LOOPS 24
92
93/** The TSC frequency refinement period in seconds.
94 * The timer fires after 200ms, then every second, this value just says when
95 * to stop it after that. */
96#define GIP_TSC_REFINE_PERIOD_IN_SECS 12
97/** The TSC-delta threshold for the SUPGIPUSETSCDELTA_PRACTICALLY_ZERO rating */
98#define GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO 32
99/** The TSC-delta threshold for the SUPGIPUSETSCDELTA_ROUGHLY_ZERO rating */
100#define GIP_TSC_DELTA_THRESHOLD_ROUGHLY_ZERO 448
101/** The TSC delta value for the initial GIP master - 0 in regular builds.
102 * To test the delta code this can be set to a non-zero value. */
103#if 0
104# define GIP_TSC_DELTA_INITIAL_MASTER_VALUE INT64_C(170139095182512) /* 0x00009abd9854acb0 */
105#else
106# define GIP_TSC_DELTA_INITIAL_MASTER_VALUE INT64_C(0)
107#endif
108
109AssertCompile(GIP_TSC_DELTA_PRIMER_LOOPS < GIP_TSC_DELTA_READ_TIME_LOOPS);
110AssertCompile(GIP_TSC_DELTA_PRIMER_LOOPS + GIP_TSC_DELTA_READ_TIME_LOOPS < GIP_TSC_DELTA_LOOPS);
111
112/** @def VBOX_SVN_REV
113 * The makefile should define this if it can. */
114#ifndef VBOX_SVN_REV
115# define VBOX_SVN_REV 0
116#endif
117
118#if 0 /* Don't start the GIP timers. Useful when debugging the IPRT timer code. */
119# define DO_NOT_START_GIP
120#endif
121
122
123/*********************************************************************************************************************************
124* Internal Functions *
125*********************************************************************************************************************************/
126static DECLCALLBACK(void) supdrvGipSyncAndInvariantTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick);
127static DECLCALLBACK(void) supdrvGipAsyncTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick);
128static int supdrvGipSetFlags(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, uint32_t fOrMask, uint32_t fAndMask);
129static void supdrvGipInitCpu(PSUPGLOBALINFOPAGE pGip, PSUPGIPCPU pCpu, uint64_t u64NanoTS, uint64_t uCpuHz);
130static void supdrvTscResetSamples(PSUPDRVDEVEXT pDevExt, bool fClearDeltas);
131#ifdef SUPDRV_USE_TSC_DELTA_THREAD
132static int supdrvTscDeltaThreadInit(PSUPDRVDEVEXT pDevExt);
133static void supdrvTscDeltaTerm(PSUPDRVDEVEXT pDevExt);
134static void supdrvTscDeltaThreadStartMeasurement(PSUPDRVDEVEXT pDevExt, bool fForceAll);
135#else
136static int supdrvTscMeasureInitialDeltas(PSUPDRVDEVEXT pDevExt);
137static int supdrvTscMeasureDeltaOne(PSUPDRVDEVEXT pDevExt, uint32_t idxWorker);
138#endif
139
140
141/*********************************************************************************************************************************
142* Global Variables *
143*********************************************************************************************************************************/
144DECLEXPORT(PSUPGLOBALINFOPAGE) g_pSUPGlobalInfoPage = NULL;
145
146
147
148/*
149 *
150 * Misc Common GIP Code
151 * Misc Common GIP Code
152 * Misc Common GIP Code
153 *
154 *
155 */
156
157
158/**
159 * Finds the GIP CPU index corresponding to @a idCpu.
160 *
161 * @returns GIP CPU array index, UINT32_MAX if not found.
162 * @param pGip The GIP.
163 * @param idCpu The CPU ID.
164 */
165static uint32_t supdrvGipFindCpuIndexForCpuId(PSUPGLOBALINFOPAGE pGip, RTCPUID idCpu)
166{
167 uint32_t i;
168 for (i = 0; i < pGip->cCpus; i++)
169 if (pGip->aCPUs[i].idCpu == idCpu)
170 return i;
171 return UINT32_MAX;
172}
173
174
175
176/*
177 *
178 * GIP Mapping and Unmapping Related Code.
179 * GIP Mapping and Unmapping Related Code.
180 * GIP Mapping and Unmapping Related Code.
181 *
182 *
183 */
184
185
186/**
187 * (Re-)initializes the per-cpu structure prior to starting or resuming the GIP
188 * updating.
189 *
190 * @param pGipCpu The per CPU structure for this CPU.
191 * @param u64NanoTS The current time.
192 */
193static void supdrvGipReInitCpu(PSUPGIPCPU pGipCpu, uint64_t u64NanoTS)
194{
195 /*
196 * Here we don't really care about applying the TSC delta. The re-initialization of this
197 * value is not relevant especially while (re)starting the GIP as the first few ones will
198 * be ignored anyway, see supdrvGipDoUpdateCpu().
199 */
200 pGipCpu->u64TSC = ASMReadTSC() - pGipCpu->u32UpdateIntervalTSC;
201 pGipCpu->u64NanoTS = u64NanoTS;
202}
203
204
205/**
206 * Set the current TSC and NanoTS value for the CPU.
207 *
208 * @param idCpu The CPU ID. Unused - we have to use the APIC ID.
209 * @param pvUser1 Pointer to the ring-0 GIP mapping.
210 * @param pvUser2 Pointer to the variable holding the current time.
211 */
212static DECLCALLBACK(void) supdrvGipReInitCpuCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
213{
214 PSUPGLOBALINFOPAGE pGip = (PSUPGLOBALINFOPAGE)pvUser1;
215 unsigned iCpu = pGip->aiCpuFromApicId[ASMGetApicId()];
216
217 if (RT_LIKELY(iCpu < pGip->cCpus && pGip->aCPUs[iCpu].idCpu == idCpu))
218 supdrvGipReInitCpu(&pGip->aCPUs[iCpu], *(uint64_t *)pvUser2);
219
220 NOREF(pvUser2);
221 NOREF(idCpu);
222}
223
224
225/**
226 * State structure for supdrvGipDetectGetGipCpuCallback.
227 */
228typedef struct SUPDRVGIPDETECTGETCPU
229{
230 /** Bitmap of APIC IDs that has been seen (initialized to zero).
231 * Used to detect duplicate APIC IDs (paranoia). */
232 uint8_t volatile bmApicId[256 / 8];
233 /** Mask of supported GIP CPU getter methods (SUPGIPGETCPU_XXX) (all bits set
234 * initially). The callback clears the methods not detected. */
235 uint32_t volatile fSupported;
236 /** The first callback detecting any kind of range issues (initialized to
237 * NIL_RTCPUID). */
238 RTCPUID volatile idCpuProblem;
239} SUPDRVGIPDETECTGETCPU;
240/** Pointer to state structure for supdrvGipDetectGetGipCpuCallback. */
241typedef SUPDRVGIPDETECTGETCPU *PSUPDRVGIPDETECTGETCPU;
242
243
244/**
245 * Checks for alternative ways of getting the CPU ID.
246 *
247 * This also checks the APIC ID, CPU ID and CPU set index values against the
248 * GIP tables.
249 *
250 * @param idCpu The CPU ID. Unused - we have to use the APIC ID.
251 * @param pvUser1 Pointer to the state structure.
252 * @param pvUser2 Pointer to the GIP.
253 */
254static DECLCALLBACK(void) supdrvGipDetectGetGipCpuCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
255{
256 PSUPDRVGIPDETECTGETCPU pState = (PSUPDRVGIPDETECTGETCPU)pvUser1;
257 PSUPGLOBALINFOPAGE pGip = (PSUPGLOBALINFOPAGE)pvUser2;
258 uint32_t fSupported = 0;
259 uint16_t idApic;
260 int iCpuSet;
261 NOREF(pGip);
262
263 AssertMsg(idCpu == RTMpCpuId(), ("idCpu=%#x RTMpCpuId()=%#x\n", idCpu, RTMpCpuId())); /* paranoia^3 */
264
265 /*
266 * Check that the CPU ID and CPU set index are interchangable.
267 */
268 iCpuSet = RTMpCpuIdToSetIndex(idCpu);
269 if ((RTCPUID)iCpuSet == idCpu)
270 {
271 AssertCompile(RT_IS_POWER_OF_TWO(RTCPUSET_MAX_CPUS));
272 if ( iCpuSet >= 0
273 && iCpuSet < RTCPUSET_MAX_CPUS
274 && RT_IS_POWER_OF_TWO(RTCPUSET_MAX_CPUS))
275 {
276 PSUPGIPCPU pGipCpu = SUPGetGipCpuBySetIndex(pGip, iCpuSet);
277
278 /*
279 * Check whether the IDTR.LIMIT contains a CPU number.
280 */
281#ifdef RT_ARCH_X86
282 uint16_t const cbIdt = sizeof(X86DESC64SYSTEM) * 256;
283#else
284 uint16_t const cbIdt = sizeof(X86DESCGATE) * 256;
285#endif
286 RTIDTR Idtr;
287 ASMGetIDTR(&Idtr);
288 if (Idtr.cbIdt >= cbIdt)
289 {
290 uint32_t uTmp = Idtr.cbIdt - cbIdt;
291 uTmp &= RTCPUSET_MAX_CPUS - 1;
292 if (uTmp == idCpu)
293 {
294 RTIDTR Idtr2;
295 ASMGetIDTR(&Idtr2);
296 if (Idtr2.cbIdt == Idtr.cbIdt)
297 fSupported |= SUPGIPGETCPU_IDTR_LIMIT_MASK_MAX_SET_CPUS;
298 }
299 }
300
301 /*
302 * Check whether RDTSCP is an option.
303 */
304 if (ASMHasCpuId())
305 {
306 if ( ASMIsValidExtRange(ASMCpuId_EAX(UINT32_C(0x80000000)))
307 && (ASMCpuId_EDX(UINT32_C(0x80000001)) & X86_CPUID_EXT_FEATURE_EDX_RDTSCP) )
308 {
309 uint32_t uAux;
310 ASMReadTscWithAux(&uAux);
311 if ((uAux & (RTCPUSET_MAX_CPUS - 1)) == idCpu)
312 {
313 ASMNopPause();
314 ASMReadTscWithAux(&uAux);
315 if ((uAux & (RTCPUSET_MAX_CPUS - 1)) == idCpu)
316 fSupported |= SUPGIPGETCPU_RDTSCP_MASK_MAX_SET_CPUS;
317 }
318
319 if (pGipCpu)
320 {
321 uint32_t const uGroupedAux = (uint8_t)pGipCpu->iCpuGroupMember | ((uint32_t)pGipCpu->iCpuGroup << 8);
322 if ( (uAux & UINT16_MAX) == uGroupedAux
323 && pGipCpu->iCpuGroupMember <= UINT8_MAX)
324 {
325 ASMNopPause();
326 ASMReadTscWithAux(&uAux);
327 if ((uAux & UINT16_MAX) == uGroupedAux)
328 fSupported |= SUPGIPGETCPU_RDTSCP_GROUP_IN_CH_NUMBER_IN_CL;
329 }
330 }
331 }
332 }
333 }
334 }
335
336 /*
337 * Check that the APIC ID is unique.
338 */
339 idApic = ASMGetApicId();
340 if (RT_LIKELY( idApic < RT_ELEMENTS(pGip->aiCpuFromApicId)
341 && !ASMAtomicBitTestAndSet(pState->bmApicId, idApic)))
342 fSupported |= SUPGIPGETCPU_APIC_ID;
343 else
344 {
345 AssertCompile(sizeof(pState->bmApicId) * 8 == RT_ELEMENTS(pGip->aiCpuFromApicId));
346 ASMAtomicCmpXchgU32(&pState->idCpuProblem, idCpu, NIL_RTCPUID);
347 LogRel(("supdrvGipDetectGetGipCpuCallback: idCpu=%#x iCpuSet=%d idApic=%#x - duplicate APIC ID.\n",
348 idCpu, iCpuSet, idApic));
349 }
350
351 /*
352 * Check that the iCpuSet is within the expected range.
353 */
354 if (RT_UNLIKELY( iCpuSet < 0
355 || (unsigned)iCpuSet >= RTCPUSET_MAX_CPUS
356 || (unsigned)iCpuSet >= RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)))
357 {
358 ASMAtomicCmpXchgU32(&pState->idCpuProblem, idCpu, NIL_RTCPUID);
359 LogRel(("supdrvGipDetectGetGipCpuCallback: idCpu=%#x iCpuSet=%d idApic=%#x - CPU set index is out of range.\n",
360 idCpu, iCpuSet, idApic));
361 }
362 else
363 {
364 RTCPUID idCpu2 = RTMpCpuIdFromSetIndex(iCpuSet);
365 if (RT_UNLIKELY(idCpu2 != idCpu))
366 {
367 ASMAtomicCmpXchgU32(&pState->idCpuProblem, idCpu, NIL_RTCPUID);
368 LogRel(("supdrvGipDetectGetGipCpuCallback: idCpu=%#x iCpuSet=%d idApic=%#x - CPU id/index roundtrip problem: %#x\n",
369 idCpu, iCpuSet, idApic, idCpu2));
370 }
371 }
372
373 /*
374 * Update the supported feature mask before we return.
375 */
376 ASMAtomicAndU32(&pState->fSupported, fSupported);
377
378 NOREF(pvUser2);
379}
380
381
382/**
383 * Increase the timer freqency on hosts where this is possible (NT).
384 *
385 * The idea is that more interrupts is better for us... Also, it's better than
386 * we increase the timer frequence, because we might end up getting inaccurate
387 * callbacks if someone else does it.
388 *
389 * @param pDevExt Sets u32SystemTimerGranularityGrant if increased.
390 */
391static void supdrvGipRequestHigherTimerFrequencyFromSystem(PSUPDRVDEVEXT pDevExt)
392{
393 if (pDevExt->u32SystemTimerGranularityGrant == 0)
394 {
395 uint32_t u32SystemResolution;
396 if ( RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 976563 /* 1024 HZ */, &u32SystemResolution))
397 || RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 1000000 /* 1000 HZ */, &u32SystemResolution))
398 || RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 1953125 /* 512 HZ */, &u32SystemResolution))
399 || RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 2000000 /* 500 HZ */, &u32SystemResolution))
400 )
401 {
402#if 0 /* def VBOX_STRICT - this is somehow triggers bogus assertions on windows 10 */
403 uint32_t u32After = RTTimerGetSystemGranularity();
404 AssertMsg(u32After <= u32SystemResolution, ("u32After=%u u32SystemResolution=%u\n", u32After, u32SystemResolution));
405#endif
406 pDevExt->u32SystemTimerGranularityGrant = u32SystemResolution;
407 }
408 }
409}
410
411
412/**
413 * Undoes supdrvGipRequestHigherTimerFrequencyFromSystem.
414 *
415 * @param pDevExt Clears u32SystemTimerGranularityGrant.
416 */
417static void supdrvGipReleaseHigherTimerFrequencyFromSystem(PSUPDRVDEVEXT pDevExt)
418{
419 if (pDevExt->u32SystemTimerGranularityGrant)
420 {
421 int rc2 = RTTimerReleaseSystemGranularity(pDevExt->u32SystemTimerGranularityGrant);
422 AssertRC(rc2);
423 pDevExt->u32SystemTimerGranularityGrant = 0;
424 }
425}
426
427
428/**
429 * Maps the GIP into userspace and/or get the physical address of the GIP.
430 *
431 * @returns IPRT status code.
432 * @param pSession Session to which the GIP mapping should belong.
433 * @param ppGipR3 Where to store the address of the ring-3 mapping. (optional)
434 * @param pHCPhysGip Where to store the physical address. (optional)
435 *
436 * @remark There is no reference counting on the mapping, so one call to this function
437 * count globally as one reference. One call to SUPR0GipUnmap() is will unmap GIP
438 * and remove the session as a GIP user.
439 */
440SUPR0DECL(int) SUPR0GipMap(PSUPDRVSESSION pSession, PRTR3PTR ppGipR3, PRTHCPHYS pHCPhysGip)
441{
442 int rc;
443 PSUPDRVDEVEXT pDevExt = pSession->pDevExt;
444 RTR3PTR pGipR3 = NIL_RTR3PTR;
445 RTHCPHYS HCPhys = NIL_RTHCPHYS;
446 LogFlow(("SUPR0GipMap: pSession=%p ppGipR3=%p pHCPhysGip=%p\n", pSession, ppGipR3, pHCPhysGip));
447
448 /*
449 * Validate
450 */
451 AssertReturn(SUP_IS_SESSION_VALID(pSession), VERR_INVALID_PARAMETER);
452 AssertPtrNullReturn(ppGipR3, VERR_INVALID_POINTER);
453 AssertPtrNullReturn(pHCPhysGip, VERR_INVALID_POINTER);
454
455#ifdef SUPDRV_USE_MUTEX_FOR_GIP
456 RTSemMutexRequest(pDevExt->mtxGip, RT_INDEFINITE_WAIT);
457#else
458 RTSemFastMutexRequest(pDevExt->mtxGip);
459#endif
460 if (pDevExt->pGip)
461 {
462 /*
463 * Map it?
464 */
465 rc = VINF_SUCCESS;
466 if (ppGipR3)
467 {
468 if (pSession->GipMapObjR3 == NIL_RTR0MEMOBJ)
469 rc = RTR0MemObjMapUser(&pSession->GipMapObjR3, pDevExt->GipMemObj, (RTR3PTR)-1, 0,
470 RTMEM_PROT_READ, NIL_RTR0PROCESS);
471 if (RT_SUCCESS(rc))
472 pGipR3 = RTR0MemObjAddressR3(pSession->GipMapObjR3);
473 }
474
475 /*
476 * Get physical address.
477 */
478 if (pHCPhysGip && RT_SUCCESS(rc))
479 HCPhys = pDevExt->HCPhysGip;
480
481 /*
482 * Reference globally.
483 */
484 if (!pSession->fGipReferenced && RT_SUCCESS(rc))
485 {
486 pSession->fGipReferenced = 1;
487 pDevExt->cGipUsers++;
488 if (pDevExt->cGipUsers == 1)
489 {
490 PSUPGLOBALINFOPAGE pGipR0 = pDevExt->pGip;
491 uint64_t u64NanoTS;
492
493 /*
494 * GIP starts/resumes updating again. On windows we bump the
495 * host timer frequency to make sure we don't get stuck in guest
496 * mode and to get better timer (and possibly clock) accuracy.
497 */
498 LogFlow(("SUPR0GipMap: Resumes GIP updating\n"));
499
500 supdrvGipRequestHigherTimerFrequencyFromSystem(pDevExt);
501
502 /*
503 * document me
504 */
505 if (pGipR0->aCPUs[0].u32TransactionId != 2 /* not the first time */)
506 {
507 unsigned i;
508 for (i = 0; i < pGipR0->cCpus; i++)
509 ASMAtomicUoWriteU32(&pGipR0->aCPUs[i].u32TransactionId,
510 (pGipR0->aCPUs[i].u32TransactionId + GIP_UPDATEHZ_RECALC_FREQ * 2)
511 & ~(GIP_UPDATEHZ_RECALC_FREQ * 2 - 1));
512 ASMAtomicWriteU64(&pGipR0->u64NanoTSLastUpdateHz, 0);
513 }
514
515 /*
516 * document me
517 */
518 u64NanoTS = RTTimeSystemNanoTS() - pGipR0->u32UpdateIntervalNS;
519 if ( pGipR0->u32Mode == SUPGIPMODE_INVARIANT_TSC
520 || pGipR0->u32Mode == SUPGIPMODE_SYNC_TSC
521 || RTMpGetOnlineCount() == 1)
522 supdrvGipReInitCpu(&pGipR0->aCPUs[0], u64NanoTS);
523 else
524 RTMpOnAll(supdrvGipReInitCpuCallback, pGipR0, &u64NanoTS);
525
526 /*
527 * Detect alternative ways to figure the CPU ID in ring-3 and
528 * raw-mode context. Check the sanity of the APIC IDs, CPU IDs,
529 * and CPU set indexes while we're at it.
530 */
531 if (RT_SUCCESS(rc))
532 {
533 SUPDRVGIPDETECTGETCPU DetectState;
534 RT_BZERO((void *)&DetectState.bmApicId, sizeof(DetectState.bmApicId));
535 DetectState.fSupported = UINT32_MAX;
536 DetectState.idCpuProblem = NIL_RTCPUID;
537 rc = RTMpOnAll(supdrvGipDetectGetGipCpuCallback, &DetectState, pGipR0);
538 if (DetectState.idCpuProblem == NIL_RTCPUID)
539 {
540 if ( DetectState.fSupported != UINT32_MAX
541 && DetectState.fSupported != 0)
542 {
543 if (pGipR0->fGetGipCpu != DetectState.fSupported)
544 {
545 pGipR0->fGetGipCpu = DetectState.fSupported;
546 LogRel(("SUPR0GipMap: fGetGipCpu=%#x\n", DetectState.fSupported));
547 }
548 }
549 else
550 {
551 LogRel(("SUPR0GipMap: No supported ways of getting the APIC ID or CPU number in ring-3! (%#x)\n",
552 DetectState.fSupported));
553 rc = VERR_UNSUPPORTED_CPU;
554 }
555 }
556 else
557 {
558 LogRel(("SUPR0GipMap: APIC ID, CPU ID or CPU set index problem detected on CPU #%u (%#x)!\n",
559 DetectState.idCpuProblem, DetectState.idCpuProblem));
560 rc = VERR_INVALID_CPU_ID;
561 }
562 }
563
564 /*
565 * Start the GIP timer if all is well..
566 */
567 if (RT_SUCCESS(rc))
568 {
569#ifndef DO_NOT_START_GIP
570 rc = RTTimerStart(pDevExt->pGipTimer, 0 /* fire ASAP */); AssertRC(rc);
571#endif
572 rc = VINF_SUCCESS;
573 }
574
575 /*
576 * Bail out on error.
577 */
578 if (RT_FAILURE(rc))
579 {
580 LogRel(("SUPR0GipMap: failed rc=%Rrc\n", rc));
581 pDevExt->cGipUsers = 0;
582 pSession->fGipReferenced = 0;
583 if (pSession->GipMapObjR3 != NIL_RTR0MEMOBJ)
584 {
585 int rc2 = RTR0MemObjFree(pSession->GipMapObjR3, false); AssertRC(rc2);
586 if (RT_SUCCESS(rc2))
587 pSession->GipMapObjR3 = NIL_RTR0MEMOBJ;
588 }
589 HCPhys = NIL_RTHCPHYS;
590 pGipR3 = NIL_RTR3PTR;
591 }
592 }
593 }
594 }
595 else
596 {
597 rc = VERR_GENERAL_FAILURE;
598 Log(("SUPR0GipMap: GIP is not available!\n"));
599 }
600#ifdef SUPDRV_USE_MUTEX_FOR_GIP
601 RTSemMutexRelease(pDevExt->mtxGip);
602#else
603 RTSemFastMutexRelease(pDevExt->mtxGip);
604#endif
605
606 /*
607 * Write returns.
608 */
609 if (pHCPhysGip)
610 *pHCPhysGip = HCPhys;
611 if (ppGipR3)
612 *ppGipR3 = pGipR3;
613
614#ifdef DEBUG_DARWIN_GIP
615 OSDBGPRINT(("SUPR0GipMap: returns %d *pHCPhysGip=%lx pGipR3=%p\n", rc, (unsigned long)HCPhys, (void *)pGipR3));
616#else
617 LogFlow(( "SUPR0GipMap: returns %d *pHCPhysGip=%lx pGipR3=%p\n", rc, (unsigned long)HCPhys, (void *)pGipR3));
618#endif
619 return rc;
620}
621
622
623/**
624 * Unmaps any user mapping of the GIP and terminates all GIP access
625 * from this session.
626 *
627 * @returns IPRT status code.
628 * @param pSession Session to which the GIP mapping should belong.
629 */
630SUPR0DECL(int) SUPR0GipUnmap(PSUPDRVSESSION pSession)
631{
632 int rc = VINF_SUCCESS;
633 PSUPDRVDEVEXT pDevExt = pSession->pDevExt;
634#ifdef DEBUG_DARWIN_GIP
635 OSDBGPRINT(("SUPR0GipUnmap: pSession=%p pGip=%p GipMapObjR3=%p\n",
636 pSession,
637 pSession->GipMapObjR3 != NIL_RTR0MEMOBJ ? RTR0MemObjAddress(pSession->GipMapObjR3) : NULL,
638 pSession->GipMapObjR3));
639#else
640 LogFlow(("SUPR0GipUnmap: pSession=%p\n", pSession));
641#endif
642 AssertReturn(SUP_IS_SESSION_VALID(pSession), VERR_INVALID_PARAMETER);
643
644#ifdef SUPDRV_USE_MUTEX_FOR_GIP
645 RTSemMutexRequest(pDevExt->mtxGip, RT_INDEFINITE_WAIT);
646#else
647 RTSemFastMutexRequest(pDevExt->mtxGip);
648#endif
649
650 /*
651 * GIP test-mode session?
652 */
653 if ( pSession->fGipTestMode
654 && pDevExt->pGip)
655 {
656 supdrvGipSetFlags(pDevExt, pSession, 0, ~SUPGIP_FLAGS_TESTING_ENABLE);
657 Assert(!pSession->fGipTestMode);
658 }
659
660 /*
661 * Unmap anything?
662 */
663 if (pSession->GipMapObjR3 != NIL_RTR0MEMOBJ)
664 {
665 rc = RTR0MemObjFree(pSession->GipMapObjR3, false);
666 AssertRC(rc);
667 if (RT_SUCCESS(rc))
668 pSession->GipMapObjR3 = NIL_RTR0MEMOBJ;
669 }
670
671 /*
672 * Dereference global GIP.
673 */
674 if (pSession->fGipReferenced && !rc)
675 {
676 pSession->fGipReferenced = 0;
677 if ( pDevExt->cGipUsers > 0
678 && !--pDevExt->cGipUsers)
679 {
680 LogFlow(("SUPR0GipUnmap: Suspends GIP updating\n"));
681#ifndef DO_NOT_START_GIP
682 rc = RTTimerStop(pDevExt->pGipTimer); AssertRC(rc); rc = VINF_SUCCESS;
683#endif
684 supdrvGipReleaseHigherTimerFrequencyFromSystem(pDevExt);
685 }
686 }
687
688#ifdef SUPDRV_USE_MUTEX_FOR_GIP
689 RTSemMutexRelease(pDevExt->mtxGip);
690#else
691 RTSemFastMutexRelease(pDevExt->mtxGip);
692#endif
693
694 return rc;
695}
696
697
698/**
699 * Gets the GIP pointer.
700 *
701 * @returns Pointer to the GIP or NULL.
702 */
703SUPDECL(PSUPGLOBALINFOPAGE) SUPGetGIP(void)
704{
705 return g_pSUPGlobalInfoPage;
706}
707
708
709
710
711
712/*
713 *
714 *
715 * GIP Initialization, Termination and CPU Offline / Online Related Code.
716 * GIP Initialization, Termination and CPU Offline / Online Related Code.
717 * GIP Initialization, Termination and CPU Offline / Online Related Code.
718 *
719 *
720 */
721
722/**
723 * Used by supdrvGipInitRefineInvariantTscFreqTimer and supdrvGipInitMeasureTscFreq
724 * to update the TSC frequency related GIP variables.
725 *
726 * @param pGip The GIP.
727 * @param nsElapsed The number of nanoseconds elapsed.
728 * @param cElapsedTscTicks The corresponding number of TSC ticks.
729 * @param iTick The tick number for debugging.
730 */
731static void supdrvGipInitSetCpuFreq(PSUPGLOBALINFOPAGE pGip, uint64_t nsElapsed, uint64_t cElapsedTscTicks, uint32_t iTick)
732{
733 /*
734 * Calculate the frequency.
735 */
736 uint64_t uCpuHz;
737 if ( cElapsedTscTicks < UINT64_MAX / RT_NS_1SEC
738 && nsElapsed < UINT32_MAX)
739 uCpuHz = ASMMultU64ByU32DivByU32(cElapsedTscTicks, RT_NS_1SEC, (uint32_t)nsElapsed);
740 else
741 {
742 RTUINT128U CpuHz, Tmp, Divisor;
743 CpuHz.s.Lo = CpuHz.s.Hi = 0;
744 RTUInt128MulU64ByU64(&Tmp, cElapsedTscTicks, RT_NS_1SEC_64);
745 RTUInt128Div(&CpuHz, &Tmp, RTUInt128AssignU64(&Divisor, nsElapsed));
746 uCpuHz = CpuHz.s.Lo;
747 }
748
749 /*
750 * Update the GIP.
751 */
752 ASMAtomicWriteU64(&pGip->u64CpuHz, uCpuHz);
753 if (pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
754 {
755 ASMAtomicWriteU64(&pGip->aCPUs[0].u64CpuHz, uCpuHz);
756
757 /* For inspecting the frequency calcs using tstGIP-2, debugger or similar. */
758 if (iTick + 1 < pGip->cCpus)
759 ASMAtomicWriteU64(&pGip->aCPUs[iTick + 1].u64CpuHz, uCpuHz);
760 }
761}
762
763
764/**
765 * Timer callback function for TSC frequency refinement in invariant GIP mode.
766 *
767 * This is started during driver init and fires once
768 * GIP_TSC_REFINE_PERIOD_IN_SECS seconds later.
769 *
770 * @param pTimer The timer.
771 * @param pvUser Opaque pointer to the device instance data.
772 * @param iTick The timer tick.
773 */
774static DECLCALLBACK(void) supdrvGipInitRefineInvariantTscFreqTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
775{
776 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
777 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
778 RTCPUID idCpu;
779 uint64_t cNsElapsed;
780 uint64_t cTscTicksElapsed;
781 uint64_t nsNow;
782 uint64_t uTsc;
783 RTCCUINTREG fEFlags;
784
785 /* Paranoia. */
786 AssertReturnVoid(pGip);
787 AssertReturnVoid(pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC);
788
789 /*
790 * If we got a power event, stop the refinement process.
791 */
792 if (pDevExt->fInvTscRefinePowerEvent)
793 {
794 int rc = RTTimerStop(pTimer); AssertRC(rc);
795 return;
796 }
797
798 /*
799 * Read the TSC and time, noting which CPU we are on.
800 *
801 * Don't bother spinning until RTTimeSystemNanoTS changes, since on
802 * systems where it matters we're in a context where we cannot waste that
803 * much time (DPC watchdog, called from clock interrupt).
804 */
805 fEFlags = ASMIntDisableFlags();
806 uTsc = ASMReadTSC();
807 nsNow = RTTimeSystemNanoTS();
808 idCpu = RTMpCpuId();
809 ASMSetFlags(fEFlags);
810
811 cNsElapsed = nsNow - pDevExt->nsStartInvarTscRefine;
812 cTscTicksElapsed = uTsc - pDevExt->uTscStartInvarTscRefine;
813
814 /*
815 * If the above measurement was taken on a different CPU than the one we
816 * started the process on, cTscTicksElapsed will need to be adjusted with
817 * the TSC deltas of both the CPUs.
818 *
819 * We ASSUME that the delta calculation process takes less time than the
820 * TSC frequency refinement timer. If it doesn't, we'll complain and
821 * drop the frequency refinement.
822 *
823 * Note! We cannot entirely trust enmUseTscDelta here because it's
824 * downgraded after each delta calculation.
825 */
826 if ( idCpu != pDevExt->idCpuInvarTscRefine
827 && pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
828 {
829 uint32_t iStartCpuSet = RTMpCpuIdToSetIndex(pDevExt->idCpuInvarTscRefine);
830 uint32_t iStopCpuSet = RTMpCpuIdToSetIndex(idCpu);
831 uint16_t iStartGipCpu = iStartCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
832 ? pGip->aiCpuFromCpuSetIdx[iStartCpuSet] : UINT16_MAX;
833 uint16_t iStopGipCpu = iStopCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
834 ? pGip->aiCpuFromCpuSetIdx[iStopCpuSet] : UINT16_MAX;
835 int64_t iStartTscDelta = iStartGipCpu < pGip->cCpus ? pGip->aCPUs[iStartGipCpu].i64TSCDelta : INT64_MAX;
836 int64_t iStopTscDelta = iStopGipCpu < pGip->cCpus ? pGip->aCPUs[iStopGipCpu].i64TSCDelta : INT64_MAX;
837 if (RT_LIKELY(iStartTscDelta != INT64_MAX && iStopTscDelta != INT64_MAX))
838 {
839 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_PRACTICALLY_ZERO)
840 {
841 /* cTscTicksElapsed = (uTsc - iStopTscDelta) - (pDevExt->uTscStartInvarTscRefine - iStartTscDelta); */
842 cTscTicksElapsed += iStartTscDelta - iStopTscDelta;
843 }
844 }
845 /*
846 * Allow 5 times the refinement period to elapse before we give up on the TSC delta
847 * calculations.
848 */
849 else if (cNsElapsed > GIP_TSC_REFINE_PERIOD_IN_SECS * 5 * RT_NS_1SEC_64)
850 {
851 SUPR0Printf("vboxdrv: Failed to refine invariant TSC frequency because deltas are unavailable after %u (%u) seconds\n",
852 (uint32_t)(cNsElapsed / RT_NS_1SEC), GIP_TSC_REFINE_PERIOD_IN_SECS);
853 SUPR0Printf("vboxdrv: start: %u, %u, %#llx stop: %u, %u, %#llx\n",
854 iStartCpuSet, iStartGipCpu, iStartTscDelta, iStopCpuSet, iStopGipCpu, iStopTscDelta);
855 int rc = RTTimerStop(pTimer); AssertRC(rc);
856 return;
857 }
858 }
859
860 /*
861 * Calculate and update the CPU frequency variables in GIP.
862 *
863 * If there is a GIP user already and we've already refined the frequency
864 * a couple of times, don't update it as we want a stable frequency value
865 * for all VMs.
866 */
867 if ( pDevExt->cGipUsers == 0
868 || cNsElapsed < RT_NS_1SEC * 2)
869 {
870 supdrvGipInitSetCpuFreq(pGip, cNsElapsed, cTscTicksElapsed, (uint32_t)iTick);
871
872 /*
873 * Stop the timer once we've reached the defined refinement period.
874 */
875 if (cNsElapsed > GIP_TSC_REFINE_PERIOD_IN_SECS * RT_NS_1SEC_64)
876 {
877 int rc = RTTimerStop(pTimer);
878 AssertRC(rc);
879 }
880 }
881 else
882 {
883 int rc = RTTimerStop(pTimer);
884 AssertRC(rc);
885 }
886}
887
888
889/**
890 * @callback_method_impl{FNRTPOWERNOTIFICATION}
891 */
892static DECLCALLBACK(void) supdrvGipPowerNotificationCallback(RTPOWEREVENT enmEvent, void *pvUser)
893{
894 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
895 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
896
897 /*
898 * If the TSC frequency refinement timer is running, we need to cancel it so it
899 * doesn't screw up the frequency after a long suspend.
900 *
901 * Recalculate all TSC-deltas on host resume as it may have changed, seen
902 * on Windows 7 running on the Dell Optiplex Intel Core i5-3570.
903 */
904 if (enmEvent == RTPOWEREVENT_RESUME)
905 {
906 ASMAtomicWriteBool(&pDevExt->fInvTscRefinePowerEvent, true);
907 if ( RT_LIKELY(pGip)
908 && pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED
909 && !supdrvOSAreCpusOfflinedOnSuspend())
910 {
911#ifdef SUPDRV_USE_TSC_DELTA_THREAD
912 supdrvTscDeltaThreadStartMeasurement(pDevExt, true /* fForceAll */);
913#else
914 RTCpuSetCopy(&pDevExt->TscDeltaCpuSet, &pGip->OnlineCpuSet);
915 supdrvTscMeasureInitialDeltas(pDevExt);
916#endif
917 }
918 }
919 else if (enmEvent == RTPOWEREVENT_SUSPEND)
920 ASMAtomicWriteBool(&pDevExt->fInvTscRefinePowerEvent, true);
921}
922
923
924/**
925 * Start the TSC-frequency refinement timer for the invariant TSC GIP mode.
926 *
927 * We cannot use this in the synchronous and asynchronous TSC GIP modes because
928 * the CPU may change the TSC frequency between now and when the timer fires
929 * (supdrvInitAsyncRefineTscTimer).
930 *
931 * @param pDevExt Pointer to the device instance data.
932 */
933static void supdrvGipInitStartTimerForRefiningInvariantTscFreq(PSUPDRVDEVEXT pDevExt)
934{
935 uint64_t u64NanoTS;
936 RTCCUINTREG fEFlags;
937 int rc;
938
939 /*
940 * Register a power management callback.
941 */
942 pDevExt->fInvTscRefinePowerEvent = false;
943 rc = RTPowerNotificationRegister(supdrvGipPowerNotificationCallback, pDevExt);
944 AssertRC(rc); /* ignore */
945
946 /*
947 * Record the TSC and NanoTS as the starting anchor point for refinement
948 * of the TSC. We try get as close to a clock tick as possible on systems
949 * which does not provide high resolution time.
950 */
951 u64NanoTS = RTTimeSystemNanoTS();
952 while (RTTimeSystemNanoTS() == u64NanoTS)
953 ASMNopPause();
954
955 fEFlags = ASMIntDisableFlags();
956 pDevExt->uTscStartInvarTscRefine = ASMReadTSC();
957 pDevExt->nsStartInvarTscRefine = RTTimeSystemNanoTS();
958 pDevExt->idCpuInvarTscRefine = RTMpCpuId();
959 ASMSetFlags(fEFlags);
960
961 /*
962 * Create a timer that runs on the same CPU so we won't have a depencency
963 * on the TSC-delta and can run in parallel to it. On systems that does not
964 * implement CPU specific timers we'll apply deltas in the timer callback,
965 * just like we do for CPUs going offline.
966 *
967 * The longer the refinement interval the better the accuracy, at least in
968 * theory. If it's too long though, ring-3 may already be starting its
969 * first VMs before we're done. On most systems we will be loading the
970 * support driver during boot and VMs won't be started for a while yet,
971 * it is really only a problem during development (especially with
972 * on-demand driver starting on windows).
973 *
974 * To avoid wasting time doing a long supdrvGipInitMeasureTscFreq() call
975 * to calculate the frequency during driver loading, the timer is set
976 * to fire after 200 ms the first time. It will then reschedule itself
977 * to fire every second until GIP_TSC_REFINE_PERIOD_IN_SECS has been
978 * reached or it notices that there is a user land client with GIP
979 * mapped (we want a stable frequency for all VMs).
980 */
981 rc = RTTimerCreateEx(&pDevExt->pInvarTscRefineTimer, RT_NS_1SEC,
982 RTTIMER_FLAGS_CPU(RTMpCpuIdToSetIndex(pDevExt->idCpuInvarTscRefine)),
983 supdrvGipInitRefineInvariantTscFreqTimer, pDevExt);
984 if (RT_SUCCESS(rc))
985 {
986 rc = RTTimerStart(pDevExt->pInvarTscRefineTimer, 2*RT_NS_100MS);
987 if (RT_SUCCESS(rc))
988 return;
989 RTTimerDestroy(pDevExt->pInvarTscRefineTimer);
990 }
991
992 if (rc == VERR_CPU_OFFLINE || rc == VERR_NOT_SUPPORTED)
993 {
994 rc = RTTimerCreateEx(&pDevExt->pInvarTscRefineTimer, RT_NS_1SEC, RTTIMER_FLAGS_CPU_ANY,
995 supdrvGipInitRefineInvariantTscFreqTimer, pDevExt);
996 if (RT_SUCCESS(rc))
997 {
998 rc = RTTimerStart(pDevExt->pInvarTscRefineTimer, 2*RT_NS_100MS);
999 if (RT_SUCCESS(rc))
1000 return;
1001 RTTimerDestroy(pDevExt->pInvarTscRefineTimer);
1002 }
1003 }
1004
1005 pDevExt->pInvarTscRefineTimer = NULL;
1006 OSDBGPRINT(("vboxdrv: Failed to create or start TSC frequency refinement timer: rc=%Rrc\n", rc));
1007}
1008
1009
1010/**
1011 * @callback_method_impl{PFNRTMPWORKER,
1012 * RTMpOnSpecific callback for reading TSC and time on the CPU we started
1013 * the measurements on.}
1014 */
1015DECLCALLBACK(void) supdrvGipInitReadTscAndNanoTsOnCpu(RTCPUID idCpu, void *pvUser1, void *pvUser2)
1016{
1017 RTCCUINTREG fEFlags = ASMIntDisableFlags();
1018 uint64_t *puTscStop = (uint64_t *)pvUser1;
1019 uint64_t *pnsStop = (uint64_t *)pvUser2;
1020 RT_NOREF1(idCpu);
1021
1022 *puTscStop = ASMReadTSC();
1023 *pnsStop = RTTimeSystemNanoTS();
1024
1025 ASMSetFlags(fEFlags);
1026}
1027
1028
/**
 * Measures the TSC frequency of the system.
 *
 * The TSC frequency can vary on systems which are not reported as invariant.
 * On such systems the object of this function is to find out what the nominal,
 * maximum TSC frequency under 'normal' CPU operation.
 *
 * The measurement samples (TSC, nanosecond timestamp, CPU id) twice with a
 * delay in between and hands the two differences to supdrvGipInitSetCpuFreq().
 * If the thread ended up on a different CPU for the second sample, the GIP
 * mode decides whether to accept the result, apply TSC deltas, retry, or
 * re-sample on the start CPU via a cross call.
 *
 * @returns VBox status code.
 * @retval  VINF_SUCCESS when a frequency was calculated and set.
 * @retval  VERR_INVALID_CPU_INDEX if a CPU set index didn't map to a valid
 *          GIP CPU entry (invariant mode with deltas only).
 * @retval  VERR_SUPDRV_TSC_FREQ_MEASUREMENT_FAILED if all tries were used up.
 * @param   pGip        Pointer to the GIP.
 * @param   fRough      Set if we're doing the rough calculation that the
 *                      TSC measuring code needs, where accuracy isn't all
 *                      that important (too high is better than too low).
 *                      When clear we try for best accuracy that we can
 *                      achieve in reasonably short time.
 */
static int supdrvGipInitMeasureTscFreq(PSUPGLOBALINFOPAGE pGip, bool fRough)
{
    uint32_t nsTimerIncr = RTTimerGetSystemGranularity();
    int      cTriesLeft  = fRough ? 4 : 2;  /* Note: decremented by the loop condition, so it reads 0 during the last try. */
    while (cTriesLeft-- > 0)
    {
        RTCCUINTREG fEFlags;
        uint64_t    nsStart;
        uint64_t    nsStop;
        uint64_t    uTscStart;
        uint64_t    uTscStop;
        RTCPUID     idCpuStart;
        RTCPUID     idCpuStop;

        /*
         * Synchronize with the host OS clock tick on systems without high
         * resolution time API (older Windows version for example).
         */
        nsStart = RTTimeSystemNanoTS();
        while (RTTimeSystemNanoTS() == nsStart)
            ASMNopPause();

        /*
         * Read the TSC and current time, noting which CPU we're on.
         * Interrupts are disabled so the three reads happen back to back.
         */
        fEFlags    = ASMIntDisableFlags();
        uTscStart  = ASMReadTSC();
        nsStart    = RTTimeSystemNanoTS();
        idCpuStart = RTMpCpuId();
        ASMSetFlags(fEFlags);

        /*
         * Delay for a while.
         */
        if (pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC)
        {
            /*
             * Sleep-wait since the TSC frequency is constant, it eases host load.
             * Shorter interval produces more variance in the frequency (esp. Windows).
             *
             * The delay is the target (16 ms rough, 200 ms otherwise) rounded
             * up to a multiple of the system timer granularity, minus 100 us,
             * converted to whole milliseconds.
             */
            uint64_t msElapsed = 0;
            uint64_t msDelay   = ( ((fRough ? 16 : 200) * RT_NS_1MS + nsTimerIncr - 1) / nsTimerIncr * nsTimerIncr - RT_NS_100US )
                               / RT_NS_1MS;
            do
            {
                /* Sleep for whatever remains of the delay and re-check, in case we woke up early. */
                RTThreadSleep((RTMSINTERVAL)(msDelay - msElapsed));
                nsStop    = RTTimeSystemNanoTS();
                msElapsed = (nsStop - nsStart) / RT_NS_1MS;
            } while (msElapsed < msDelay);

            /* Spin until the system timestamp changes again before taking the stop sample. */
            while (RTTimeSystemNanoTS() == nsStop)
                ASMNopPause();
        }
        else
        {
            /*
             * Busy-wait keeping the frequency up.
             */
            do
            {
                ASMNopPause();
                nsStop = RTTimeSystemNanoTS();
            } while (nsStop - nsStart < RT_NS_100MS);
        }

        /*
         * Read the TSC and time again.
         */
        fEFlags   = ASMIntDisableFlags();
        uTscStop  = ASMReadTSC();
        nsStop    = RTTimeSystemNanoTS();
        idCpuStop = RTMpCpuId();
        ASMSetFlags(fEFlags);

        /*
         * If the CPU changes, things get a bit complicated and what we
         * can get away with depends on the GIP mode / TSC reliability.
         */
        if (idCpuStop != idCpuStart)
        {
            bool fDoXCall = false;

            /*
             * Synchronous TSC mode: we're probably fine as it's unlikely
             * that we were rescheduled because of TSC throttling or power
             * management reasons, so just go ahead.
             */
            if (pGip->u32Mode == SUPGIPMODE_SYNC_TSC)
            {
                /* Probably ok, maybe we should retry once?. */
                Assert(pGip->enmUseTscDelta == SUPGIPUSETSCDELTA_NOT_APPLICABLE);
            }
            /*
             * If we're just doing the rough measurement, do the cross call and
             * get on with things (we don't have deltas!).
             */
            else if (fRough)
                fDoXCall = true;
            /*
             * Invariant TSC mode: It doesn't matter if we have delta available
             * for both CPUs.  That is not something we can assume at this point.
             *
             * Note! We cannot necessarily trust enmUseTscDelta here because it's
             *       downgraded after each delta calculation and the delta
             *       calculations may not be complete yet.
             */
            else if (pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC)
            {
/** @todo This section of code is never reached atm, consider dropping it later on... */
                if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
                {
                    /* Map CPU id -> set index -> GIP CPU index -> TSC delta, using
                       UINT16_MAX / INT64_MAX as the "not available" markers. */
                    uint32_t iStartCpuSet   = RTMpCpuIdToSetIndex(idCpuStart);
                    uint32_t iStopCpuSet    = RTMpCpuIdToSetIndex(idCpuStop);
                    uint16_t iStartGipCpu   = iStartCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
                                            ? pGip->aiCpuFromCpuSetIdx[iStartCpuSet] : UINT16_MAX;
                    uint16_t iStopGipCpu    = iStopCpuSet  < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
                                            ? pGip->aiCpuFromCpuSetIdx[iStopCpuSet]  : UINT16_MAX;
                    int64_t  iStartTscDelta = iStartGipCpu < pGip->cCpus ? pGip->aCPUs[iStartGipCpu].i64TSCDelta : INT64_MAX;
                    int64_t  iStopTscDelta  = iStopGipCpu  < pGip->cCpus ? pGip->aCPUs[iStopGipCpu].i64TSCDelta  : INT64_MAX;
                    if (RT_LIKELY(iStartTscDelta != INT64_MAX && iStopTscDelta != INT64_MAX))
                    {
                        /* Both deltas are known; only apply them when they are considered significant. */
                        if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_PRACTICALLY_ZERO)
                        {
                            uTscStart -= iStartTscDelta;
                            uTscStop  -= iStopTscDelta;
                        }
                    }
                    /*
                     * Invalid CPU indexes are not caused by online/offline races, so
                     * we have to trigger driver load failure if that happens as GIP
                     * and IPRT assumptions are busted on this system.
                     */
                    else if (iStopGipCpu >= pGip->cCpus || iStartGipCpu >= pGip->cCpus)
                    {
                        SUPR0Printf("vboxdrv: Unexpected CPU index in supdrvGipInitMeasureTscFreq.\n");
                        SUPR0Printf("vboxdrv: start: %u, %u, %#llx  stop: %u, %u, %#llx\n",
                                    iStartCpuSet, iStartGipCpu, iStartTscDelta, iStopCpuSet, iStopGipCpu, iStopTscDelta);
                        return VERR_INVALID_CPU_INDEX;
                    }
                    /*
                     * No valid deltas.  We retry, if we're on our last retry
                     * we do the cross call instead just to get a result.  The
                     * frequency will be refined in a few seconds anyway.
                     */
                    else if (cTriesLeft > 0)
                        continue;
                    else
                        fDoXCall = true;
                }
            }
            /*
             * Asynchronous TSC mode: This is bad, as the reason we usually
             * use this mode is to deal with variable TSC frequencies and
             * deltas.  So, we need to get the TSC from the same CPU as
             * started it, we also need to keep that CPU busy.  So, retry
             * and fall back to the cross call on the last attempt.
             */
            else
            {
                Assert(pGip->u32Mode == SUPGIPMODE_ASYNC_TSC);
                if (cTriesLeft > 0)
                    continue;
                fDoXCall = true;
            }

            if (fDoXCall)
            {
                /*
                 * Try read the TSC and timestamp on the start CPU; the worker
                 * overwrites uTscStop and nsStop.  A failed cross call leads to
                 * a retry, except on the last try of a rough measurement where
                 * we carry on with the samples we already have.
                 */
                int rc = RTMpOnSpecific(idCpuStart, supdrvGipInitReadTscAndNanoTsOnCpu, &uTscStop, &nsStop);
                if (RT_FAILURE(rc) && (!fRough || cTriesLeft > 0))
                    continue;
            }
        }

        /*
         * Calculate the TSC frequency and update it (shared with the refinement timer).
         */
        supdrvGipInitSetCpuFreq(pGip, nsStop - nsStart, uTscStop - uTscStart, 0);
        return VINF_SUCCESS;
    }

    /* Only reachable when every try ended in a 'continue'; a rough measurement
       never does that on its last try, hence the assertion. */
    Assert(!fRough);
    return VERR_SUPDRV_TSC_FREQ_MEASUREMENT_FAILED;
}
1230
1231
1232/**
1233 * Finds our (@a idCpu) entry, or allocates a new one if not found.
1234 *
1235 * @returns Index of the CPU in the cache set.
1236 * @param pGip The GIP.
1237 * @param idCpu The CPU ID.
1238 */
1239static uint32_t supdrvGipFindOrAllocCpuIndexForCpuId(PSUPGLOBALINFOPAGE pGip, RTCPUID idCpu)
1240{
1241 uint32_t i, cTries;
1242
1243 /*
1244 * ASSUMES that CPU IDs are constant.
1245 */
1246 for (i = 0; i < pGip->cCpus; i++)
1247 if (pGip->aCPUs[i].idCpu == idCpu)
1248 return i;
1249
1250 cTries = 0;
1251 do
1252 {
1253 for (i = 0; i < pGip->cCpus; i++)
1254 {
1255 bool fRc;
1256 ASMAtomicCmpXchgSize(&pGip->aCPUs[i].idCpu, idCpu, NIL_RTCPUID, fRc);
1257 if (fRc)
1258 return i;
1259 }
1260 } while (cTries++ < 32);
1261 AssertReleaseFailed();
1262 return i - 1;
1263}
1264
1265
1266/**
1267 * The calling CPU should be accounted as online, update GIP accordingly.
1268 *
1269 * This is used by supdrvGipCreate() as well as supdrvGipMpEvent().
1270 *
1271 * @param pDevExt The device extension.
1272 * @param idCpu The CPU ID.
1273 */
1274static void supdrvGipMpEventOnlineOrInitOnCpu(PSUPDRVDEVEXT pDevExt, RTCPUID idCpu)
1275{
1276 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
1277 int iCpuSet = 0;
1278 uint16_t idApic = UINT16_MAX;
1279 uint32_t i = 0;
1280 uint64_t u64NanoTS = 0;
1281
1282 AssertPtrReturnVoid(pGip);
1283 Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD));
1284 AssertRelease(idCpu == RTMpCpuId());
1285 Assert(pGip->cPossibleCpus == RTMpGetCount());
1286
1287 /*
1288 * Do this behind a spinlock with interrupts disabled as this can fire
1289 * on all CPUs simultaneously, see @bugref{6110}.
1290 */
1291 RTSpinlockAcquire(pDevExt->hGipSpinlock);
1292
1293 /*
1294 * Update the globals.
1295 */
1296 ASMAtomicWriteU16(&pGip->cPresentCpus, RTMpGetPresentCount());
1297 ASMAtomicWriteU16(&pGip->cOnlineCpus, RTMpGetOnlineCount());
1298 iCpuSet = RTMpCpuIdToSetIndex(idCpu);
1299 if (iCpuSet >= 0)
1300 {
1301 Assert(RTCpuSetIsMemberByIndex(&pGip->PossibleCpuSet, iCpuSet));
1302 RTCpuSetAddByIndex(&pGip->OnlineCpuSet, iCpuSet);
1303 RTCpuSetAddByIndex(&pGip->PresentCpuSet, iCpuSet);
1304 }
1305
1306 /*
1307 * Update the entry.
1308 */
1309 u64NanoTS = RTTimeSystemNanoTS() - pGip->u32UpdateIntervalNS;
1310 i = supdrvGipFindOrAllocCpuIndexForCpuId(pGip, idCpu);
1311
1312 supdrvGipInitCpu(pGip, &pGip->aCPUs[i], u64NanoTS, pGip->u64CpuHz);
1313
1314 idApic = ASMGetApicId();
1315 ASMAtomicWriteU16(&pGip->aCPUs[i].idApic, idApic);
1316 ASMAtomicWriteS16(&pGip->aCPUs[i].iCpuSet, (int16_t)iCpuSet);
1317 ASMAtomicWriteSize(&pGip->aCPUs[i].idCpu, idCpu);
1318
1319 pGip->aCPUs[i].iCpuGroup = 0;
1320 pGip->aCPUs[i].iCpuGroupMember = iCpuSet;
1321#ifdef RT_OS_WINDOWS
1322 supdrvOSGipInitGroupBitsForCpu(pDevExt, pGip, &pGip->aCPUs[i]);
1323#endif
1324
1325 /*
1326 * Update the APIC ID and CPU set index mappings.
1327 */
1328 ASMAtomicWriteU16(&pGip->aiCpuFromApicId[idApic], i);
1329 ASMAtomicWriteU16(&pGip->aiCpuFromCpuSetIdx[iCpuSet], i);
1330
1331 /* Add this CPU to this set of CPUs we need to calculate the TSC-delta for. */
1332 RTCpuSetAddByIndex(&pDevExt->TscDeltaCpuSet, RTMpCpuIdToSetIndex(idCpu));
1333
1334 /* Update the Mp online/offline counter. */
1335 ASMAtomicIncU32(&pDevExt->cMpOnOffEvents);
1336
1337 /* Commit it. */
1338 ASMAtomicWriteSize(&pGip->aCPUs[i].enmState, SUPGIPCPUSTATE_ONLINE);
1339
1340 RTSpinlockRelease(pDevExt->hGipSpinlock);
1341}
1342
1343
1344/**
1345 * RTMpOnSpecific callback wrapper for supdrvGipMpEventOnlineOrInitOnCpu().
1346 *
1347 * @param idCpu The CPU ID we are running on.
1348 * @param pvUser1 Opaque pointer to the device instance data.
1349 * @param pvUser2 Not used.
1350 */
1351static DECLCALLBACK(void) supdrvGipMpEventOnlineCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
1352{
1353 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser1;
1354 NOREF(pvUser2);
1355 supdrvGipMpEventOnlineOrInitOnCpu(pDevExt, idCpu);
1356}
1357
1358
1359/**
1360 * The CPU should be accounted as offline, update the GIP accordingly.
1361 *
1362 * This is used by supdrvGipMpEvent.
1363 *
1364 * @param pDevExt The device extension.
1365 * @param idCpu The CPU ID.
1366 */
1367static void supdrvGipMpEventOffline(PSUPDRVDEVEXT pDevExt, RTCPUID idCpu)
1368{
1369 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
1370 int iCpuSet;
1371 unsigned i;
1372
1373 AssertPtrReturnVoid(pGip);
1374 RTSpinlockAcquire(pDevExt->hGipSpinlock);
1375
1376 iCpuSet = RTMpCpuIdToSetIndex(idCpu);
1377 AssertReturnVoid(iCpuSet >= 0);
1378
1379 i = pGip->aiCpuFromCpuSetIdx[iCpuSet];
1380 AssertReturnVoid(i < pGip->cCpus);
1381 AssertReturnVoid(pGip->aCPUs[i].idCpu == idCpu);
1382
1383 Assert(RTCpuSetIsMemberByIndex(&pGip->PossibleCpuSet, iCpuSet));
1384 RTCpuSetDelByIndex(&pGip->OnlineCpuSet, iCpuSet);
1385
1386 /* Update the Mp online/offline counter. */
1387 ASMAtomicIncU32(&pDevExt->cMpOnOffEvents);
1388
1389 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
1390 {
1391 /* Reset the TSC delta, we will recalculate it lazily. */
1392 ASMAtomicWriteS64(&pGip->aCPUs[i].i64TSCDelta, INT64_MAX);
1393 /* Remove this CPU from the set of CPUs that we have obtained the TSC deltas. */
1394 RTCpuSetDelByIndex(&pDevExt->TscDeltaObtainedCpuSet, iCpuSet);
1395 }
1396
1397 /* Commit it. */
1398 ASMAtomicWriteSize(&pGip->aCPUs[i].enmState, SUPGIPCPUSTATE_OFFLINE);
1399
1400 RTSpinlockRelease(pDevExt->hGipSpinlock);
1401}
1402
1403
1404/**
1405 * Multiprocessor event notification callback.
1406 *
1407 * This is used to make sure that the GIP master gets passed on to
1408 * another CPU. It also updates the associated CPU data.
1409 *
1410 * @param enmEvent The event.
1411 * @param idCpu The cpu it applies to.
1412 * @param pvUser Pointer to the device extension.
1413 */
1414static DECLCALLBACK(void) supdrvGipMpEvent(RTMPEVENT enmEvent, RTCPUID idCpu, void *pvUser)
1415{
1416 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
1417 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
1418
1419 if (pGip)
1420 {
1421 RTTHREADPREEMPTSTATE PreemptState = RTTHREADPREEMPTSTATE_INITIALIZER;
1422 switch (enmEvent)
1423 {
1424 case RTMPEVENT_ONLINE:
1425 {
1426 RTThreadPreemptDisable(&PreemptState);
1427 if (idCpu == RTMpCpuId())
1428 {
1429 supdrvGipMpEventOnlineOrInitOnCpu(pDevExt, idCpu);
1430 RTThreadPreemptRestore(&PreemptState);
1431 }
1432 else
1433 {
1434 RTThreadPreemptRestore(&PreemptState);
1435 RTMpOnSpecific(idCpu, supdrvGipMpEventOnlineCallback, pDevExt, NULL /* pvUser2 */);
1436 }
1437
1438 /*
1439 * Recompute TSC-delta for the newly online'd CPU.
1440 */
1441 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
1442 {
1443#ifdef SUPDRV_USE_TSC_DELTA_THREAD
1444 supdrvTscDeltaThreadStartMeasurement(pDevExt, false /* fForceAll */);
1445#else
1446 uint32_t iCpu = supdrvGipFindOrAllocCpuIndexForCpuId(pGip, idCpu);
1447 supdrvTscMeasureDeltaOne(pDevExt, iCpu);
1448#endif
1449 }
1450 break;
1451 }
1452
1453 case RTMPEVENT_OFFLINE:
1454 supdrvGipMpEventOffline(pDevExt, idCpu);
1455 break;
1456 }
1457 }
1458
1459 /*
1460 * Make sure there is a master GIP.
1461 */
1462 if (enmEvent == RTMPEVENT_OFFLINE)
1463 {
1464 RTCPUID idGipMaster = ASMAtomicReadU32(&pDevExt->idGipMaster);
1465 if (idGipMaster == idCpu)
1466 {
1467 /*
1468 * The GIP master is going offline, find a new one.
1469 */
1470 bool fIgnored;
1471 unsigned i;
1472 RTCPUID idNewGipMaster = NIL_RTCPUID;
1473 RTCPUSET OnlineCpus;
1474 RTMpGetOnlineSet(&OnlineCpus);
1475
1476 for (i = 0; i < RTCPUSET_MAX_CPUS; i++)
1477 if (RTCpuSetIsMemberByIndex(&OnlineCpus, i))
1478 {
1479 RTCPUID idCurCpu = RTMpCpuIdFromSetIndex(i);
1480 if (idCurCpu != idGipMaster)
1481 {
1482 idNewGipMaster = idCurCpu;
1483 break;
1484 }
1485 }
1486
1487 Log(("supdrvGipMpEvent: Gip master %#lx -> %#lx\n", (long)idGipMaster, (long)idNewGipMaster));
1488 ASMAtomicCmpXchgSize(&pDevExt->idGipMaster, idNewGipMaster, idGipMaster, fIgnored);
1489 NOREF(fIgnored);
1490 }
1491 }
1492}
1493
1494
1495/**
1496 * On CPU initialization callback for RTMpOnAll.
1497 *
1498 * @param idCpu The CPU ID.
1499 * @param pvUser1 The device extension.
1500 * @param pvUser2 The GIP.
1501 */
1502static DECLCALLBACK(void) supdrvGipInitOnCpu(RTCPUID idCpu, void *pvUser1, void *pvUser2)
1503{
1504 /* This is good enough, even though it will update some of the globals a
1505 bit to much. */
1506 supdrvGipMpEventOnlineOrInitOnCpu((PSUPDRVDEVEXT)pvUser1, idCpu);
1507 NOREF(pvUser2);
1508}
1509
1510
1511/**
1512 * Callback used by supdrvDetermineAsyncTSC to read the TSC on a CPU.
1513 *
1514 * @param idCpu Ignored.
1515 * @param pvUser1 Where to put the TSC.
1516 * @param pvUser2 Ignored.
1517 */
1518static DECLCALLBACK(void) supdrvGipInitDetermineAsyncTscWorker(RTCPUID idCpu, void *pvUser1, void *pvUser2)
1519{
1520 Assert(RTMpCpuIdToSetIndex(idCpu) == (intptr_t)pvUser2);
1521 ASMAtomicWriteU64((uint64_t volatile *)pvUser1, ASMReadTSC());
1522 RT_NOREF2(idCpu, pvUser2);
1523}
1524
1525
1526/**
1527 * Determine if Async GIP mode is required because of TSC drift.
1528 *
1529 * When using the default/normal timer code it is essential that the time stamp counter
1530 * (TSC) runs never backwards, that is, a read operation to the counter should return
1531 * a bigger value than any previous read operation. This is guaranteed by the latest
1532 * AMD CPUs and by newer Intel CPUs which never enter the C2 state (P4). In any other
1533 * case we have to choose the asynchronous timer mode.
1534 *
1535 * @param poffMin Pointer to the determined difference between different
1536 * cores (optional, can be NULL).
1537 * @return false if the time stamp counters appear to be synchronized, true otherwise.
1538 */
1539static bool supdrvGipInitDetermineAsyncTsc(uint64_t *poffMin)
1540{
1541 /*
1542 * Just iterate all the cpus 8 times and make sure that the TSC is
1543 * ever increasing. We don't bother taking TSC rollover into account.
1544 */
1545 int iEndCpu = RTMpGetArraySize();
1546 int iCpu;
1547 int cLoops = 8;
1548 bool fAsync = false;
1549 int rc = VINF_SUCCESS;
1550 uint64_t offMax = 0;
1551 uint64_t offMin = ~(uint64_t)0;
1552 uint64_t PrevTsc = ASMReadTSC();
1553
1554 while (cLoops-- > 0)
1555 {
1556 for (iCpu = 0; iCpu < iEndCpu; iCpu++)
1557 {
1558 uint64_t CurTsc;
1559 rc = RTMpOnSpecific(RTMpCpuIdFromSetIndex(iCpu), supdrvGipInitDetermineAsyncTscWorker,
1560 &CurTsc, (void *)(uintptr_t)iCpu);
1561 if (RT_SUCCESS(rc))
1562 {
1563 if (CurTsc <= PrevTsc)
1564 {
1565 fAsync = true;
1566 offMin = offMax = PrevTsc - CurTsc;
1567 Log(("supdrvGipInitDetermineAsyncTsc: iCpu=%d cLoops=%d CurTsc=%llx PrevTsc=%llx\n",
1568 iCpu, cLoops, CurTsc, PrevTsc));
1569 break;
1570 }
1571
1572 /* Gather statistics (except the first time). */
1573 if (iCpu != 0 || cLoops != 7)
1574 {
1575 uint64_t off = CurTsc - PrevTsc;
1576 if (off < offMin)
1577 offMin = off;
1578 if (off > offMax)
1579 offMax = off;
1580 Log2(("%d/%d: off=%llx\n", cLoops, iCpu, off));
1581 }
1582
1583 /* Next */
1584 PrevTsc = CurTsc;
1585 }
1586 else if (rc == VERR_NOT_SUPPORTED)
1587 break;
1588 else
1589 AssertMsg(rc == VERR_CPU_NOT_FOUND || rc == VERR_CPU_OFFLINE, ("%d\n", rc));
1590 }
1591
1592 /* broke out of the loop. */
1593 if (iCpu < iEndCpu)
1594 break;
1595 }
1596
1597 if (poffMin)
1598 *poffMin = offMin; /* Almost RTMpOnSpecific profiling. */
1599 Log(("supdrvGipInitDetermineAsyncTsc: returns %d; iEndCpu=%d rc=%d offMin=%llx offMax=%llx\n",
1600 fAsync, iEndCpu, rc, offMin, offMax));
1601#if !defined(RT_OS_SOLARIS) && !defined(RT_OS_OS2) && !defined(RT_OS_WINDOWS)
1602 OSDBGPRINT(("vboxdrv: fAsync=%d offMin=%#lx offMax=%#lx\n", fAsync, (long)offMin, (long)offMax));
1603#endif
1604 return fAsync;
1605}
1606
1607
1608/**
1609 * supdrvGipInit() worker that determines the GIP TSC mode.
1610 *
1611 * @returns The most suitable TSC mode.
1612 * @param pDevExt Pointer to the device instance data.
1613 */
1614static SUPGIPMODE supdrvGipInitDetermineTscMode(PSUPDRVDEVEXT pDevExt)
1615{
1616 uint64_t u64DiffCoresIgnored;
1617 uint32_t uEAX, uEBX, uECX, uEDX;
1618
1619 /*
1620 * Establish whether the CPU advertises TSC as invariant, we need that in
1621 * a couple of places below.
1622 */
1623 bool fInvariantTsc = false;
1624 if (ASMHasCpuId())
1625 {
1626 uEAX = ASMCpuId_EAX(0x80000000);
1627 if (ASMIsValidExtRange(uEAX) && uEAX >= 0x80000007)
1628 {
1629 uEDX = ASMCpuId_EDX(0x80000007);
1630 if (uEDX & X86_CPUID_AMD_ADVPOWER_EDX_TSCINVAR)
1631 fInvariantTsc = true;
1632 }
1633 }
1634
1635 /*
1636 * On single CPU systems, we don't need to consider ASYNC mode.
1637 */
1638 if (RTMpGetCount() <= 1)
1639 return fInvariantTsc ? SUPGIPMODE_INVARIANT_TSC : SUPGIPMODE_SYNC_TSC;
1640
1641 /*
1642 * Allow the user and/or OS specific bits to force async mode.
1643 */
1644 if (supdrvOSGetForcedAsyncTscMode(pDevExt))
1645 return SUPGIPMODE_ASYNC_TSC;
1646
1647 /*
1648 * Use invariant mode if the CPU says TSC is invariant.
1649 */
1650 if (fInvariantTsc)
1651 return SUPGIPMODE_INVARIANT_TSC;
1652
1653 /*
1654 * TSC is not invariant and we're on SMP, this presents two problems:
1655 *
1656 * (1) There might be a skew between the CPU, so that cpu0
1657 * returns a TSC that is slightly different from cpu1.
1658 * This screw may be due to (2), bad TSC initialization
1659 * or slightly different TSC rates.
1660 *
1661 * (2) Power management (and other things) may cause the TSC
1662 * to run at a non-constant speed, and cause the speed
1663 * to be different on the cpus. This will result in (1).
1664 *
1665 * If any of the above is detected, we will have to use ASYNC mode.
1666 */
1667 /* (1). Try check for current differences between the cpus. */
1668 if (supdrvGipInitDetermineAsyncTsc(&u64DiffCoresIgnored))
1669 return SUPGIPMODE_ASYNC_TSC;
1670
1671 /* (2) If it's an AMD CPU with power management, we won't trust its TSC. */
1672 ASMCpuId(0, &uEAX, &uEBX, &uECX, &uEDX);
1673 if ( ASMIsValidStdRange(uEAX)
1674 && ASMIsAmdCpuEx(uEBX, uECX, uEDX))
1675 {
1676 /* Check for APM support. */
1677 uEAX = ASMCpuId_EAX(0x80000000);
1678 if (ASMIsValidExtRange(uEAX) && uEAX >= 0x80000007)
1679 {
1680 uEDX = ASMCpuId_EDX(0x80000007);
1681 if (uEDX & 0x3e) /* STC|TM|THERMTRIP|VID|FID. Ignore TS. */
1682 return SUPGIPMODE_ASYNC_TSC;
1683 }
1684 }
1685
1686 return SUPGIPMODE_SYNC_TSC;
1687}
1688
1689
1690/**
1691 * Initializes per-CPU GIP information.
1692 *
1693 * @param pGip Pointer to the GIP.
1694 * @param pCpu Pointer to which GIP CPU to initialize.
1695 * @param u64NanoTS The current nanosecond timestamp.
1696 * @param uCpuHz The CPU frequency to set, 0 if the caller doesn't know.
1697 */
1698static void supdrvGipInitCpu(PSUPGLOBALINFOPAGE pGip, PSUPGIPCPU pCpu, uint64_t u64NanoTS, uint64_t uCpuHz)
1699{
1700 pCpu->u32TransactionId = 2;
1701 pCpu->u64NanoTS = u64NanoTS;
1702 pCpu->u64TSC = ASMReadTSC();
1703 pCpu->u64TSCSample = GIP_TSC_DELTA_RSVD;
1704 pCpu->i64TSCDelta = pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED ? INT64_MAX : 0;
1705
1706 ASMAtomicWriteSize(&pCpu->enmState, SUPGIPCPUSTATE_INVALID);
1707 ASMAtomicWriteU32(&pCpu->idCpu, NIL_RTCPUID);
1708 ASMAtomicWriteS16(&pCpu->iCpuSet, -1);
1709 ASMAtomicWriteU16(&pCpu->iCpuGroup, 0);
1710 ASMAtomicWriteU16(&pCpu->iCpuGroupMember, UINT16_MAX);
1711 ASMAtomicWriteU16(&pCpu->idApic, UINT16_MAX);
1712 ASMAtomicWriteU32(&pCpu->iReservedForNumaNode, 0);
1713
1714 /*
1715 * The first time we're called, we don't have a CPU frequency handy,
1716 * so pretend it's a 4 GHz CPU. On CPUs that are online, we'll get
1717 * called again and at that point we have a more plausible CPU frequency
1718 * value handy. The frequency history will also be adjusted again on
1719 * the 2nd timer callout (maybe we can skip that now?).
1720 */
1721 if (!uCpuHz)
1722 {
1723 pCpu->u64CpuHz = _4G - 1;
1724 pCpu->u32UpdateIntervalTSC = (uint32_t)((_4G - 1) / pGip->u32UpdateHz);
1725 }
1726 else
1727 {
1728 pCpu->u64CpuHz = uCpuHz;
1729 pCpu->u32UpdateIntervalTSC = (uint32_t)(uCpuHz / pGip->u32UpdateHz);
1730 }
1731 pCpu->au32TSCHistory[0]
1732 = pCpu->au32TSCHistory[1]
1733 = pCpu->au32TSCHistory[2]
1734 = pCpu->au32TSCHistory[3]
1735 = pCpu->au32TSCHistory[4]
1736 = pCpu->au32TSCHistory[5]
1737 = pCpu->au32TSCHistory[6]
1738 = pCpu->au32TSCHistory[7]
1739 = pCpu->u32UpdateIntervalTSC;
1740}
1741
1742
1743/**
1744 * Initializes the GIP data.
1745 *
1746 * @returns VBox status code.
1747 * @param pDevExt Pointer to the device instance data.
1748 * @param pGip Pointer to the read-write kernel mapping of the GIP.
1749 * @param HCPhys The physical address of the GIP.
1750 * @param u64NanoTS The current nanosecond timestamp.
1751 * @param uUpdateHz The update frequency.
1752 * @param uUpdateIntervalNS The update interval in nanoseconds.
1753 * @param cCpus The CPU count.
1754 * @param cbGipCpuGroups The supdrvOSGipGetGroupTableSize return value we
1755 * used when allocating the GIP structure.
1756 */
1757static int supdrvGipInit(PSUPDRVDEVEXT pDevExt, PSUPGLOBALINFOPAGE pGip, RTHCPHYS HCPhys,
1758 uint64_t u64NanoTS, unsigned uUpdateHz, unsigned uUpdateIntervalNS,
1759 unsigned cCpus, size_t cbGipCpuGroups)
1760{
1761 size_t const cbGip = RT_ALIGN_Z(RT_UOFFSETOF_DYN(SUPGLOBALINFOPAGE, aCPUs[cCpus]) + cbGipCpuGroups, PAGE_SIZE);
1762 unsigned i;
1763#ifdef DEBUG_DARWIN_GIP
1764 OSDBGPRINT(("supdrvGipInit: pGip=%p HCPhys=%lx u64NanoTS=%llu uUpdateHz=%d cCpus=%u\n", pGip, (long)HCPhys, u64NanoTS, uUpdateHz, cCpus));
1765#else
1766 LogFlow(("supdrvGipInit: pGip=%p HCPhys=%lx u64NanoTS=%llu uUpdateHz=%d cCpus=%u\n", pGip, (long)HCPhys, u64NanoTS, uUpdateHz, cCpus));
1767#endif
1768
1769 /*
1770 * Initialize the structure.
1771 */
1772 memset(pGip, 0, cbGip);
1773
1774 pGip->u32Magic = SUPGLOBALINFOPAGE_MAGIC;
1775 pGip->u32Version = SUPGLOBALINFOPAGE_VERSION;
1776 pGip->u32Mode = supdrvGipInitDetermineTscMode(pDevExt);
1777 if ( pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC
1778 /*|| pGip->u32Mode == SUPGIPMODE_SYNC_TSC */)
1779 pGip->enmUseTscDelta = supdrvOSAreTscDeltasInSync() /* Allow OS override (windows). */
1780 ? SUPGIPUSETSCDELTA_ZERO_CLAIMED : SUPGIPUSETSCDELTA_PRACTICALLY_ZERO /* downgrade later */;
1781 else
1782 pGip->enmUseTscDelta = SUPGIPUSETSCDELTA_NOT_APPLICABLE;
1783 pGip->cCpus = (uint16_t)cCpus;
1784 pGip->cPages = (uint16_t)(cbGip / PAGE_SIZE);
1785 pGip->u32UpdateHz = uUpdateHz;
1786 pGip->u32UpdateIntervalNS = uUpdateIntervalNS;
1787 pGip->fGetGipCpu = SUPGIPGETCPU_APIC_ID;
1788 RTCpuSetEmpty(&pGip->OnlineCpuSet);
1789 RTCpuSetEmpty(&pGip->PresentCpuSet);
1790 RTMpGetSet(&pGip->PossibleCpuSet);
1791 pGip->cOnlineCpus = RTMpGetOnlineCount();
1792 pGip->cPresentCpus = RTMpGetPresentCount();
1793 pGip->cPossibleCpus = RTMpGetCount();
1794 pGip->cPossibleCpuGroups = 1;
1795 pGip->idCpuMax = RTMpGetMaxCpuId();
1796 for (i = 0; i < RT_ELEMENTS(pGip->aiCpuFromApicId); i++)
1797 pGip->aiCpuFromApicId[i] = UINT16_MAX;
1798 for (i = 0; i < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx); i++)
1799 pGip->aiCpuFromCpuSetIdx[i] = UINT16_MAX;
1800 for (i = 0; i < RT_ELEMENTS(pGip->aoffCpuGroup); i++)
1801 pGip->aoffCpuGroup[i] = UINT16_MAX;
1802 for (i = 0; i < cCpus; i++)
1803 supdrvGipInitCpu(pGip, &pGip->aCPUs[i], u64NanoTS, 0 /*uCpuHz*/);
1804#ifdef RT_OS_WINDOWS
1805 int rc = supdrvOSInitGipGroupTable(pDevExt, pGip, cbGipCpuGroups);
1806 AssertRCReturn(rc, rc);
1807#endif
1808
1809 /*
1810 * Link it to the device extension.
1811 */
1812 pDevExt->pGip = pGip;
1813 pDevExt->HCPhysGip = HCPhys;
1814 pDevExt->cGipUsers = 0;
1815
1816 return VINF_SUCCESS;
1817}
1818
1819
1820/**
1821 * Creates the GIP.
1822 *
1823 * @returns VBox status code.
1824 * @param pDevExt Instance data. GIP stuff may be updated.
1825 */
1826int VBOXCALL supdrvGipCreate(PSUPDRVDEVEXT pDevExt)
1827{
1828 PSUPGLOBALINFOPAGE pGip;
1829 size_t cbGip;
1830 size_t cbGipCpuGroups;
1831 RTHCPHYS HCPhysGip;
1832 uint32_t u32SystemResolution;
1833 uint32_t u32Interval;
1834 uint32_t u32MinInterval;
1835 uint32_t uMod;
1836 unsigned cCpus;
1837 int rc;
1838
1839 LogFlow(("supdrvGipCreate:\n"));
1840
1841 /*
1842 * Assert order.
1843 */
1844 Assert(pDevExt->u32SystemTimerGranularityGrant == 0);
1845 Assert(pDevExt->GipMemObj == NIL_RTR0MEMOBJ);
1846 Assert(!pDevExt->pGipTimer);
1847#ifdef SUPDRV_USE_MUTEX_FOR_GIP
1848 Assert(pDevExt->mtxGip != NIL_RTSEMMUTEX);
1849 Assert(pDevExt->mtxTscDelta != NIL_RTSEMMUTEX);
1850#else
1851 Assert(pDevExt->mtxGip != NIL_RTSEMFASTMUTEX);
1852 Assert(pDevExt->mtxTscDelta != NIL_RTSEMFASTMUTEX);
1853#endif
1854
1855 /*
1856 * Check the CPU count.
1857 */
1858 cCpus = RTMpGetArraySize();
1859 if ( cCpus > RTCPUSET_MAX_CPUS
1860#if RTCPUSET_MAX_CPUS != 256
1861 || cCpus > 256 /* ApicId is used for the mappings */
1862#endif
1863 )
1864 {
1865 SUPR0Printf("VBoxDrv: Too many CPUs (%u) for the GIP (max %u)\n", cCpus, RT_MIN(RTCPUSET_MAX_CPUS, 256));
1866 return VERR_TOO_MANY_CPUS;
1867 }
1868
1869 /*
1870 * Allocate a contiguous set of pages with a default kernel mapping.
1871 */
1872#ifdef RT_OS_WINDOWS
1873 cbGipCpuGroups = supdrvOSGipGetGroupTableSize(pDevExt);
1874#else
1875 cbGipCpuGroups = 0;
1876#endif
1877 cbGip = RT_UOFFSETOF_DYN(SUPGLOBALINFOPAGE, aCPUs[cCpus]) + cbGipCpuGroups;
1878 rc = RTR0MemObjAllocCont(&pDevExt->GipMemObj, cbGip, false /*fExecutable*/);
1879 if (RT_FAILURE(rc))
1880 {
1881 OSDBGPRINT(("supdrvGipCreate: failed to allocate the GIP page. rc=%d\n", rc));
1882 return rc;
1883 }
1884 pGip = (PSUPGLOBALINFOPAGE)RTR0MemObjAddress(pDevExt->GipMemObj); AssertPtr(pGip);
1885 HCPhysGip = RTR0MemObjGetPagePhysAddr(pDevExt->GipMemObj, 0); Assert(HCPhysGip != NIL_RTHCPHYS);
1886
1887 /*
1888 * Find a reasonable update interval and initialize the structure.
1889 */
1890 supdrvGipRequestHigherTimerFrequencyFromSystem(pDevExt);
1891 /** @todo figure out why using a 100Ms interval upsets timekeeping in VMs.
1892 * See @bugref{6710}. */
1893 u32MinInterval = RT_NS_10MS;
1894 u32SystemResolution = RTTimerGetSystemGranularity();
1895 u32Interval = u32MinInterval;
1896 uMod = u32MinInterval % u32SystemResolution;
1897 if (uMod)
1898 u32Interval += u32SystemResolution - uMod;
1899
1900 rc = supdrvGipInit(pDevExt, pGip, HCPhysGip, RTTimeSystemNanoTS(), RT_NS_1SEC / u32Interval /*=Hz*/, u32Interval,
1901 cCpus, cbGipCpuGroups);
1902
1903 /*
1904 * Important sanity check... (Sets rc)
1905 */
1906 if (RT_UNLIKELY( pGip->enmUseTscDelta == SUPGIPUSETSCDELTA_ZERO_CLAIMED
1907 && pGip->u32Mode == SUPGIPMODE_ASYNC_TSC
1908 && !supdrvOSGetForcedAsyncTscMode(pDevExt)))
1909 {
1910 OSDBGPRINT(("supdrvGipCreate: Host-OS/user claims the TSC-deltas are zero but we detected async. TSC! Bad.\n"));
1911 rc = VERR_INTERNAL_ERROR_2;
1912 }
1913
1914 /* It doesn't make sense to do TSC-delta detection on systems we detect as async. */
1915 AssertStmt( pGip->u32Mode != SUPGIPMODE_ASYNC_TSC
1916 || pGip->enmUseTscDelta <= SUPGIPUSETSCDELTA_ZERO_CLAIMED,
1917 rc = VERR_INTERNAL_ERROR_3);
1918
1919 /*
1920 * Do the TSC frequency measurements.
1921 *
1922 * If we're in invariant TSC mode, just do a quick preliminary measurement
1923 * that the TSC-delta measurement code can use to yield cross calls.
1924 *
1925 * If we're in any of the other two modes, neither which require MP init,
1926 * notifications or deltas for the job, do the full measurement now so
1927 * that supdrvGipInitOnCpu() can populate the TSC interval and history
1928 * array with more reasonable values.
1929 */
1930 if (RT_SUCCESS(rc))
1931 {
1932 if (pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC)
1933 {
1934 rc = supdrvGipInitMeasureTscFreq(pGip, true /*fRough*/); /* cannot fail */
1935 supdrvGipInitStartTimerForRefiningInvariantTscFreq(pDevExt);
1936 }
1937 else
1938 rc = supdrvGipInitMeasureTscFreq(pGip, false /*fRough*/);
1939 if (RT_SUCCESS(rc))
1940 {
1941 /*
1942 * Start TSC-delta measurement thread before we start getting MP
1943 * events that will try kick it into action (includes the
1944 * RTMpOnAll/supdrvGipInitOnCpu call below).
1945 */
1946 RTCpuSetEmpty(&pDevExt->TscDeltaCpuSet);
1947 RTCpuSetEmpty(&pDevExt->TscDeltaObtainedCpuSet);
1948 #ifdef SUPDRV_USE_TSC_DELTA_THREAD
1949 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
1950 rc = supdrvTscDeltaThreadInit(pDevExt);
1951 #endif
1952 if (RT_SUCCESS(rc))
1953 {
1954 rc = RTMpNotificationRegister(supdrvGipMpEvent, pDevExt);
1955 if (RT_SUCCESS(rc))
1956 {
1957 /*
1958 * Do GIP initialization on all online CPUs. Wake up the
1959 * TSC-delta thread afterwards.
1960 */
1961 rc = RTMpOnAll(supdrvGipInitOnCpu, pDevExt, pGip);
1962 if (RT_SUCCESS(rc))
1963 {
1964 #ifdef SUPDRV_USE_TSC_DELTA_THREAD
1965 supdrvTscDeltaThreadStartMeasurement(pDevExt, true /* fForceAll */);
1966 #else
1967 uint16_t iCpu;
1968 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
1969 {
1970 /*
1971 * Measure the TSC deltas now that we have MP notifications.
1972 */
1973 int cTries = 5;
1974 do
1975 {
1976 rc = supdrvTscMeasureInitialDeltas(pDevExt);
1977 if ( rc != VERR_TRY_AGAIN
1978 && rc != VERR_CPU_OFFLINE)
1979 break;
1980 } while (--cTries > 0);
1981 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
1982 Log(("supdrvTscDeltaInit: cpu[%u] delta %lld\n", iCpu, pGip->aCPUs[iCpu].i64TSCDelta));
1983 }
1984 else
1985 {
1986 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
1987 AssertMsg(!pGip->aCPUs[iCpu].i64TSCDelta, ("iCpu=%u %lld mode=%d\n", iCpu, pGip->aCPUs[iCpu].i64TSCDelta, pGip->u32Mode));
1988 }
1989 if (RT_SUCCESS(rc))
1990 #endif
1991 {
1992 /*
1993 * Create the timer.
1994 * If CPU_ALL isn't supported we'll have to fall back to synchronous mode.
1995 */
1996 if (pGip->u32Mode == SUPGIPMODE_ASYNC_TSC)
1997 {
1998 rc = RTTimerCreateEx(&pDevExt->pGipTimer, u32Interval, RTTIMER_FLAGS_CPU_ALL,
1999 supdrvGipAsyncTimer, pDevExt);
2000 if (rc == VERR_NOT_SUPPORTED)
2001 {
2002 OSDBGPRINT(("supdrvGipCreate: omni timer not supported, falling back to synchronous mode\n"));
2003 pGip->u32Mode = SUPGIPMODE_SYNC_TSC;
2004 }
2005 }
2006 if (pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
2007 rc = RTTimerCreateEx(&pDevExt->pGipTimer, u32Interval, 0 /* fFlags */,
2008 supdrvGipSyncAndInvariantTimer, pDevExt);
2009 if (RT_SUCCESS(rc))
2010 {
2011 /*
2012 * We're good.
2013 */
2014 Log(("supdrvGipCreate: %u ns interval.\n", u32Interval));
2015 supdrvGipReleaseHigherTimerFrequencyFromSystem(pDevExt);
2016
2017 g_pSUPGlobalInfoPage = pGip;
2018 return VINF_SUCCESS;
2019 }
2020
2021 OSDBGPRINT(("supdrvGipCreate: failed create GIP timer at %u ns interval. rc=%Rrc\n", u32Interval, rc));
2022 Assert(!pDevExt->pGipTimer);
2023 }
2024 }
2025 else
2026 OSDBGPRINT(("supdrvGipCreate: RTMpOnAll failed. rc=%Rrc\n", rc));
2027 }
2028 else
2029 OSDBGPRINT(("supdrvGipCreate: failed to register MP event notfication. rc=%Rrc\n", rc));
2030 }
2031 else
2032 OSDBGPRINT(("supdrvGipCreate: supdrvTscDeltaInit failed. rc=%Rrc\n", rc));
2033 }
2034 else
2035 OSDBGPRINT(("supdrvGipCreate: supdrvTscMeasureInitialDeltas failed. rc=%Rrc\n", rc));
2036 }
2037
2038 /* Releases timer frequency increase too. */
2039 supdrvGipDestroy(pDevExt);
2040 return rc;
2041}
2042
2043
2044/**
2045 * Invalidates the GIP data upon termination.
2046 *
2047 * @param pGip Pointer to the read-write kernel mapping of the GIP.
2048 */
2049static void supdrvGipTerm(PSUPGLOBALINFOPAGE pGip)
2050{
2051 unsigned i;
2052 pGip->u32Magic = 0;
2053 for (i = 0; i < pGip->cCpus; i++)
2054 {
2055 pGip->aCPUs[i].u64NanoTS = 0;
2056 pGip->aCPUs[i].u64TSC = 0;
2057 pGip->aCPUs[i].iTSCHistoryHead = 0;
2058 pGip->aCPUs[i].u64TSCSample = 0;
2059 pGip->aCPUs[i].i64TSCDelta = INT64_MAX;
2060 }
2061}
2062
2063
2064/**
2065 * Terminates the GIP.
2066 *
2067 * @param pDevExt Instance data. GIP stuff may be updated.
2068 */
2069void VBOXCALL supdrvGipDestroy(PSUPDRVDEVEXT pDevExt)
2070{
2071 int rc;
2072#ifdef DEBUG_DARWIN_GIP
2073 OSDBGPRINT(("supdrvGipDestroy: pDevExt=%p pGip=%p pGipTimer=%p GipMemObj=%p\n", pDevExt,
2074 pDevExt->GipMemObj != NIL_RTR0MEMOBJ ? RTR0MemObjAddress(pDevExt->GipMemObj) : NULL,
2075 pDevExt->pGipTimer, pDevExt->GipMemObj));
2076#endif
2077
2078 /*
2079 * Stop receiving MP notifications before tearing anything else down.
2080 */
2081 RTMpNotificationDeregister(supdrvGipMpEvent, pDevExt);
2082
2083#ifdef SUPDRV_USE_TSC_DELTA_THREAD
2084 /*
2085 * Terminate the TSC-delta measurement thread and resources.
2086 */
2087 supdrvTscDeltaTerm(pDevExt);
2088#endif
2089
2090 /*
2091 * Destroy the TSC-refinement timer.
2092 */
2093 if (pDevExt->pInvarTscRefineTimer)
2094 {
2095 RTTimerDestroy(pDevExt->pInvarTscRefineTimer);
2096 pDevExt->pInvarTscRefineTimer = NULL;
2097 }
2098
2099 /*
2100 * Invalid the GIP data.
2101 */
2102 if (pDevExt->pGip)
2103 {
2104 supdrvGipTerm(pDevExt->pGip);
2105 pDevExt->pGip = NULL;
2106 }
2107 g_pSUPGlobalInfoPage = NULL;
2108
2109 /*
2110 * Destroy the timer and free the GIP memory object.
2111 */
2112 if (pDevExt->pGipTimer)
2113 {
2114 rc = RTTimerDestroy(pDevExt->pGipTimer); AssertRC(rc);
2115 pDevExt->pGipTimer = NULL;
2116 }
2117
2118 if (pDevExt->GipMemObj != NIL_RTR0MEMOBJ)
2119 {
2120 rc = RTR0MemObjFree(pDevExt->GipMemObj, true /* free mappings */); AssertRC(rc);
2121 pDevExt->GipMemObj = NIL_RTR0MEMOBJ;
2122 }
2123
2124 /*
2125 * Finally, make sure we've release the system timer resolution request
2126 * if one actually succeeded and is still pending.
2127 */
2128 supdrvGipReleaseHigherTimerFrequencyFromSystem(pDevExt);
2129}
2130
2131
2132
2133
2134/*
2135 *
2136 *
2137 * GIP Update Timer Related Code
2138 * GIP Update Timer Related Code
2139 * GIP Update Timer Related Code
2140 *
2141 *
2142 */
2143
2144
2145/**
2146 * Worker routine for supdrvGipUpdate() and supdrvGipUpdatePerCpu() that
2147 * updates all the per cpu data except the transaction id.
2148 *
 * The nanosecond and TSC timestamps are always stored. The TSC history, the
 * TSC update interval and the CPU frequency are only recalculated when the
 * GIP is not in invariant TSC mode or when the GIP test mode forces it.
 *
2149 * @param pDevExt The device extension.
2150 * @param pGipCpu Pointer to the per cpu data.
2151 * @param u64NanoTS The current time stamp.
2152 * @param u64TSC The current TSC.
2153 * @param iTick The current timer tick.
2154 *
2155 * @remarks Can be called with interrupts disabled!
2156 */
2157static void supdrvGipDoUpdateCpu(PSUPDRVDEVEXT pDevExt, PSUPGIPCPU pGipCpu, uint64_t u64NanoTS, uint64_t u64TSC, uint64_t iTick)
2158{
2159 uint64_t u64TSCDelta;
2160 bool fUpdateCpuHz;
2161 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
2162 AssertPtrReturnVoid(pGip);
2163
2164 /* Nanosecond delta between this and the previous update (truncated to 32 bits). */
2165 ASMAtomicUoWriteU32(&pGipCpu->u32PrevUpdateIntervalNS, (uint32_t)(u64NanoTS - pGipCpu->u64NanoTS));
2166
2167 /*
2168 * Update the NanoTS.
2169 */
2170 ASMAtomicWriteU64(&pGipCpu->u64NanoTS, u64NanoTS);
2171
2172 /*
2173 * Calc the TSC delta since the previous update, then store the new TSC.
2174 */
2175 u64TSCDelta = u64TSC - pGipCpu->u64TSC;
2176 ASMAtomicWriteU64(&pGipCpu->u64TSC, u64TSC);
2177
2178 /*
2179 * Determine if we need to update the CPU (TSC) frequency calculation.
2180 *
2181 * We don't need to keep recalculating the frequency when it's invariant,
2182 * unless the special tstGIP-2 testing mode is enabled.
2183 */
2184 fUpdateCpuHz = pGip->u32Mode != SUPGIPMODE_INVARIANT_TSC;
2185 if (!(pGip->fFlags & SUPGIP_FLAGS_TESTING))
2186 { /* likely*/ }
2187 else
2188 {
2189 uint32_t fGipFlags = pGip->fFlags;
2190 if (fGipFlags & (SUPGIP_FLAGS_TESTING_ENABLE | SUPGIP_FLAGS_TESTING_START))
2191 {
2192 if (fGipFlags & SUPGIP_FLAGS_TESTING_START)
2193 {
2194 /* Cache the TSC frequency before forcing updates due to test mode. */
2195 if (!fUpdateCpuHz)
2196 pDevExt->uGipTestModeInvariantCpuHz = pGip->aCPUs[0].u64CpuHz;
2197 ASMAtomicAndU32(&pGip->fFlags, ~SUPGIP_FLAGS_TESTING_START);
2198 }
2199 fUpdateCpuHz = true;
2200 }
2201 else if (fGipFlags & SUPGIP_FLAGS_TESTING_STOP)
2202 {
2203 /* Restore the cached TSC frequency if any (only done in invariant mode). */
2204 if (!fUpdateCpuHz)
2205 {
2206 Assert(pDevExt->uGipTestModeInvariantCpuHz);
2207 ASMAtomicWriteU64(&pGip->aCPUs[0].u64CpuHz, pDevExt->uGipTestModeInvariantCpuHz);
2208 }
2209 ASMAtomicAndU32(&pGip->fFlags, ~(SUPGIP_FLAGS_TESTING_STOP | SUPGIP_FLAGS_TESTING));
2210 }
2211 }
2212
2213 /*
2214 * Calculate the CPU (TSC) frequency if necessary.
2215 */
2216 if (fUpdateCpuHz)
2217 {
2218 uint64_t u64CpuHz;
2219 uint32_t u32UpdateIntervalTSC;
2220 uint32_t u32UpdateIntervalTSCSlack;
2221 uint32_t u32TransactionId;
2222 unsigned iTSCHistoryHead;
2223
2224 if (u64TSCDelta >> 32) /* Delta does not fit in 32 bits: reuse the stored interval and count an error. */
2225 {
2226 u64TSCDelta = pGipCpu->u32UpdateIntervalTSC;
2227 pGipCpu->cErrors++;
2228 }
2229
2230 /*
2231 * On the 2nd and 3rd callout, reset the history with the current TSC
2232 * interval since the values entered by supdrvGipInit are totally off.
2233 * The interval on the 1st callout is completely unreliable, the 2nd is a bit
2234 * better, while the 3rd should be most reliable.
2235 */
2236 /** @todo Could we drop this now that we initialize the history
2237 * with nominal TSC frequency values? */
2238 u32TransactionId = pGipCpu->u32TransactionId;
2239 if (RT_UNLIKELY( ( u32TransactionId == 5
2240 || u32TransactionId == 7)
2241 && ( iTick == 2
2242 || iTick == 3) ))
2243 {
2244 unsigned i;
2245 for (i = 0; i < RT_ELEMENTS(pGipCpu->au32TSCHistory); i++)
2246 ASMAtomicUoWriteU32(&pGipCpu->au32TSCHistory[i], (uint32_t)u64TSCDelta);
2247 }
2248
2249 /*
2250 * Validate the NanoTS deltas between timer fires with an arbitrary threshold of 0.5%.
2251 * Wait until we have at least one full history since the above history reset. The
2252 * assumption is that the majority of the previous history values will be tolerable.
2253 * When the interval is out of tolerance, the measured TSC delta is replaced by the
2254 * average of the eight history entries. See @bugref{6710#c67}.
2255 */
2256 /** @todo Could we drop the fudging there now that we initialize the history
2257 * with nominal TSC frequency values? */
2258 if ( u32TransactionId > 23 /* 7 + (8 * 2) */
2259 && pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
2260 {
2261 uint32_t uNanoTsThreshold = pGip->u32UpdateIntervalNS / 200;
2262 if ( pGipCpu->u32PrevUpdateIntervalNS > pGip->u32UpdateIntervalNS + uNanoTsThreshold
2263 || pGipCpu->u32PrevUpdateIntervalNS < pGip->u32UpdateIntervalNS - uNanoTsThreshold)
2264 {
2265 uint32_t u32;
2266 u32 = pGipCpu->au32TSCHistory[0];
2267 u32 += pGipCpu->au32TSCHistory[1];
2268 u32 += pGipCpu->au32TSCHistory[2];
2269 u32 += pGipCpu->au32TSCHistory[3];
2270 u32 >>= 2;
2271 u64TSCDelta = pGipCpu->au32TSCHistory[4];
2272 u64TSCDelta += pGipCpu->au32TSCHistory[5];
2273 u64TSCDelta += pGipCpu->au32TSCHistory[6];
2274 u64TSCDelta += pGipCpu->au32TSCHistory[7];
2275 u64TSCDelta >>= 2;
2276 u64TSCDelta += u32;
2277 u64TSCDelta >>= 1;
2278 }
2279 }
2280
2281 /*
2282 * TSC History: advance the head (eight entry ring) and store the delta there.
2283 */
2284 Assert(RT_ELEMENTS(pGipCpu->au32TSCHistory) == 8);
2285 iTSCHistoryHead = (pGipCpu->iTSCHistoryHead + 1) & 7;
2286 ASMAtomicWriteU32(&pGipCpu->iTSCHistoryHead, iTSCHistoryHead);
2287 ASMAtomicWriteU32(&pGipCpu->au32TSCHistory[iTSCHistoryHead], (uint32_t)u64TSCDelta);
2288
2289 /*
2290 * UpdateIntervalTSC = average of last 8,2,1 intervals depending on update HZ
2291 * (8 in invariant TSC mode or at >= 1000 Hz, 2 at >= 90 Hz, otherwise 1).
2292 *
2293 * On Windows, we have an occasional (but recurring) sour value that messed up
2294 * the history but taking only 1 interval reduces the precision overall.
2295 */
2296 if ( pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC
2297 || pGip->u32UpdateHz >= 1000)
2298 {
2299 uint32_t u32;
2300 u32 = pGipCpu->au32TSCHistory[0];
2301 u32 += pGipCpu->au32TSCHistory[1];
2302 u32 += pGipCpu->au32TSCHistory[2];
2303 u32 += pGipCpu->au32TSCHistory[3];
2304 u32 >>= 2;
2305 u32UpdateIntervalTSC = pGipCpu->au32TSCHistory[4];
2306 u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[5];
2307 u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[6];
2308 u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[7];
2309 u32UpdateIntervalTSC >>= 2;
2310 u32UpdateIntervalTSC += u32;
2311 u32UpdateIntervalTSC >>= 1;
2312
2313 /* Value chosen for a 2GHz Athlon64 running linux 2.6.10/11. */
2314 u32UpdateIntervalTSCSlack = u32UpdateIntervalTSC >> 14;
2315 }
2316 else if (pGip->u32UpdateHz >= 90)
2317 {
2318 u32UpdateIntervalTSC = (uint32_t)u64TSCDelta;
2319 u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[(iTSCHistoryHead - 1) & 7];
2320 u32UpdateIntervalTSC >>= 1;
2321
2322 /* value chosen on a 2GHz thinkpad running windows */
2323 u32UpdateIntervalTSCSlack = u32UpdateIntervalTSC >> 7;
2324 }
2325 else
2326 {
2327 u32UpdateIntervalTSC = (uint32_t)u64TSCDelta;
2328
2329 /* This value hasn't been checked yet.. waiting for OS/2 and 33Hz timers.. :-) */
2330 u32UpdateIntervalTSCSlack = u32UpdateIntervalTSC >> 6;
2331 }
2332 ASMAtomicWriteU32(&pGipCpu->u32UpdateIntervalTSC, u32UpdateIntervalTSC + u32UpdateIntervalTSCSlack); /* The stored value includes the slack. */
2333
2334 /*
2335 * CpuHz = TSC ticks per interval (without the slack) scaled up to one second.
2336 */
2337 u64CpuHz = ASMMult2xU32RetU64(u32UpdateIntervalTSC, RT_NS_1SEC);
2338 u64CpuHz /= pGip->u32UpdateIntervalNS;
2339 ASMAtomicWriteU64(&pGipCpu->u64CpuHz, u64CpuHz);
2340 }
2341}
2340
2341
2342/**
2343 * Updates the GIP.
2344 *
2345 * @param pDevExt The device extension.
2346 * @param u64NanoTS The current nanosecond timestamp.
2347 * @param u64TSC The current TSC timestamp.
2348 * @param idCpu The CPU ID.
2349 * @param iTick The current timer tick.
2350 *
2351 * @remarks Can be called with interrupts disabled!
2352 */
2353static void supdrvGipUpdate(PSUPDRVDEVEXT pDevExt, uint64_t u64NanoTS, uint64_t u64TSC, RTCPUID idCpu, uint64_t iTick)
2354{
2355 /*
2356 * Determine the relevant CPU data.
2357 */
2358 PSUPGIPCPU pGipCpu;
2359 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
2360 AssertPtrReturnVoid(pGip);
2361
2362 if (pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
2363 pGipCpu = &pGip->aCPUs[0];
2364 else
2365 {
2366 unsigned iCpu = pGip->aiCpuFromApicId[ASMGetApicId()];
2367 if (RT_UNLIKELY(iCpu >= pGip->cCpus))
2368 return;
2369 pGipCpu = &pGip->aCPUs[iCpu];
2370 if (RT_UNLIKELY(pGipCpu->idCpu != idCpu))
2371 return;
2372 }
2373
2374 /*
2375 * Start update transaction.
2376 */
2377 if (!(ASMAtomicIncU32(&pGipCpu->u32TransactionId) & 1))
2378 {
2379 /* this can happen on win32 if we're taking to long and there are more CPUs around. shouldn't happen though. */
2380 AssertMsgFailed(("Invalid transaction id, %#x, not odd!\n", pGipCpu->u32TransactionId));
2381 ASMAtomicIncU32(&pGipCpu->u32TransactionId);
2382 pGipCpu->cErrors++;
2383 return;
2384 }
2385
2386 /*
2387 * Recalc the update frequency every 0x800th time.
2388 */
2389 if ( pGip->u32Mode != SUPGIPMODE_INVARIANT_TSC /* cuz we're not recalculating the frequency on invariant hosts. */
2390 && !(pGipCpu->u32TransactionId & (GIP_UPDATEHZ_RECALC_FREQ * 2 - 2)))
2391 {
2392 if (pGip->u64NanoTSLastUpdateHz)
2393 {
2394#ifdef RT_ARCH_AMD64 /** @todo fix 64-bit div here to work on x86 linux. */
2395 uint64_t u64Delta = u64NanoTS - pGip->u64NanoTSLastUpdateHz;
2396 uint32_t u32UpdateHz = (uint32_t)((RT_NS_1SEC_64 * GIP_UPDATEHZ_RECALC_FREQ) / u64Delta);
2397 if (u32UpdateHz <= 2000 && u32UpdateHz >= 30)
2398 {
2399 /** @todo r=ramshankar: Changing u32UpdateHz might screw up TSC frequency
2400 * calculation on non-invariant hosts if it changes the history decision
2401 * taken in supdrvGipDoUpdateCpu(). */
2402 uint64_t u64Interval = u64Delta / GIP_UPDATEHZ_RECALC_FREQ;
2403 ASMAtomicWriteU32(&pGip->u32UpdateHz, u32UpdateHz);
2404 ASMAtomicWriteU32(&pGip->u32UpdateIntervalNS, (uint32_t)u64Interval);
2405 }
2406#endif
2407 }
2408 ASMAtomicWriteU64(&pGip->u64NanoTSLastUpdateHz, u64NanoTS | 1);
2409 }
2410
2411 /*
2412 * Update the data.
2413 */
2414 supdrvGipDoUpdateCpu(pDevExt, pGipCpu, u64NanoTS, u64TSC, iTick);
2415
2416 /*
2417 * Complete transaction.
2418 */
2419 ASMAtomicIncU32(&pGipCpu->u32TransactionId);
2420}
2421
2422
2423/**
2424 * Updates the per cpu GIP data for the calling cpu.
2425 *
2426 * @param pDevExt The device extension.
2427 * @param u64NanoTS The current nanosecond timestamp.
2428 * @param u64TSC The current TSC timesaver.
2429 * @param idCpu The CPU ID.
2430 * @param idApic The APIC id for the CPU index.
2431 * @param iTick The current timer tick.
2432 *
2433 * @remarks Can be called with interrupts disabled!
2434 */
2435static void supdrvGipUpdatePerCpu(PSUPDRVDEVEXT pDevExt, uint64_t u64NanoTS, uint64_t u64TSC,
2436 RTCPUID idCpu, uint8_t idApic, uint64_t iTick)
2437{
2438 uint32_t iCpu;
2439 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
2440
2441 /*
2442 * Avoid a potential race when a CPU online notification doesn't fire on
2443 * the onlined CPU but the tick creeps in before the event notification is
2444 * run.
2445 */
2446 if (RT_LIKELY(iTick != 1))
2447 { /* likely*/ }
2448 else
2449 {
2450 iCpu = supdrvGipFindOrAllocCpuIndexForCpuId(pGip, idCpu);
2451 if (pGip->aCPUs[iCpu].enmState == SUPGIPCPUSTATE_OFFLINE)
2452 supdrvGipMpEventOnlineOrInitOnCpu(pDevExt, idCpu);
2453 }
2454
2455 iCpu = pGip->aiCpuFromApicId[idApic];
2456 if (RT_LIKELY(iCpu < pGip->cCpus))
2457 {
2458 PSUPGIPCPU pGipCpu = &pGip->aCPUs[iCpu];
2459 if (pGipCpu->idCpu == idCpu)
2460 {
2461 /*
2462 * Start update transaction.
2463 */
2464 if (!(ASMAtomicIncU32(&pGipCpu->u32TransactionId) & 1))
2465 {
2466 AssertMsgFailed(("Invalid transaction id, %#x, not odd!\n", pGipCpu->u32TransactionId));
2467 ASMAtomicIncU32(&pGipCpu->u32TransactionId);
2468 pGipCpu->cErrors++;
2469 return;
2470 }
2471
2472 /*
2473 * Update the data.
2474 */
2475 supdrvGipDoUpdateCpu(pDevExt, pGipCpu, u64NanoTS, u64TSC, iTick);
2476
2477 /*
2478 * Complete transaction.
2479 */
2480 ASMAtomicIncU32(&pGipCpu->u32TransactionId);
2481 }
2482 }
2483}
2484
2485
2486/**
2487 * Timer callback function for the sync and invariant GIP modes.
2488 *
2489 * @param pTimer The timer.
2490 * @param pvUser Opaque pointer to the device extension.
2491 * @param iTick The timer tick.
2492 */
2493static DECLCALLBACK(void) supdrvGipSyncAndInvariantTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
2494{
2495 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
2496 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
2497 RTCCUINTREG fEFlags = ASMIntDisableFlags(); /* No interruptions please (real problem on S10). */
2498 uint64_t u64TSC = ASMReadTSC();
2499 uint64_t u64NanoTS = RTTimeSystemNanoTS();
2500 RT_NOREF1(pTimer);
2501
2502 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_PRACTICALLY_ZERO)
2503 {
2504 /*
2505 * The calculations in supdrvGipUpdate() is somewhat timing sensitive,
2506 * missing timer ticks is not an option for GIP because the GIP users
2507 * will end up incrementing the time in 1ns per time getter call until
2508 * there is a complete timer update. So, if the delta has yet to be
2509 * calculated, we just pretend it is zero for now (the GIP users
2510 * probably won't have it for a wee while either and will do the same).
2511 *
2512 * We could maybe on some platforms try cross calling a CPU with a
2513 * working delta here, but it's not worth the hassle since the
2514 * likelihood of this happening is really low. On Windows, Linux, and
2515 * Solaris timers fire on the CPU they were registered/started on.
2516 * Darwin timers doesn't necessarily (they are high priority threads).
2517 */
2518 uint32_t iCpuSet = RTMpCpuIdToSetIndex(RTMpCpuId());
2519 uint16_t iGipCpu = RT_LIKELY(iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx))
2520 ? pGip->aiCpuFromCpuSetIdx[iCpuSet] : UINT16_MAX;
2521 Assert(!ASMIntAreEnabled());
2522 if (RT_LIKELY(iGipCpu < pGip->cCpus))
2523 {
2524 int64_t iTscDelta = pGip->aCPUs[iGipCpu].i64TSCDelta;
2525 if (iTscDelta != INT64_MAX)
2526 u64TSC -= iTscDelta;
2527 }
2528 }
2529
2530 supdrvGipUpdate(pDevExt, u64NanoTS, u64TSC, NIL_RTCPUID, iTick);
2531
2532 ASMSetFlags(fEFlags);
2533}
2534
2535
2536/**
2537 * Timer callback function for async GIP mode.
2538 * @param pTimer The timer.
2539 * @param pvUser Opaque pointer to the device extension.
2540 * @param iTick The timer tick.
2541 */
2542static DECLCALLBACK(void) supdrvGipAsyncTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
2543{
2544 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
2545 RTCCUINTREG fEFlags = ASMIntDisableFlags(); /* No interruptions please (real problem on S10). */
2546 RTCPUID idCpu = RTMpCpuId();
2547 uint64_t u64TSC = ASMReadTSC();
2548 uint64_t NanoTS = RTTimeSystemNanoTS();
2549 RT_NOREF1(pTimer);
2550
2551 /** @todo reset the transaction number and whatnot when iTick == 1. */
2552 if (pDevExt->idGipMaster == idCpu)
2553 supdrvGipUpdate(pDevExt, NanoTS, u64TSC, idCpu, iTick);
2554 else
2555 supdrvGipUpdatePerCpu(pDevExt, NanoTS, u64TSC, idCpu, ASMGetApicId(), iTick);
2556
2557 ASMSetFlags(fEFlags);
2558}
2559
2560
2561
2562
2563/*
2564 *
2565 *
2566 * TSC Delta Measurements And Related Code
2567 * TSC Delta Measurements And Related Code
2568 * TSC Delta Measurements And Related Code
2569 *
2570 *
2571 */
2572
2573
2574/*
2575 * Select the TSC delta measurement algorithm. The '#if 0' below means that
2576 * GIP_TSC_DELTA_METHOD_2 is the one being defined; change it to 1 to define
 * GIP_TSC_DELTA_METHOD_1 instead.
 */
2577#if 0
2578# define GIP_TSC_DELTA_METHOD_1
2579#else
2580# define GIP_TSC_DELTA_METHOD_2
2581#endif
2582
2583/** Size (in bytes) used for padding variables to keep them away from other
2584 * cache lines. Better too large than too small!
2585 * @remarks Current AMD64 and x86 CPUs seem to use 64 bytes. There are claims
2586 * that NetBurst had 128 byte cache lines while the 486 thru Pentium
2587 * III had 32 byte cache lines. */
2588#define GIP_TSC_DELTA_CACHE_LINE_SIZE 128
2589
2590
2591/**
2592 * TSC delta measurement algorithm \#2 result entry.
2593 *
 * One element of the SUPDRVTSCDELTAMETHOD2::aResults table.
 */
2594typedef struct SUPDRVTSCDELTAMETHOD2ENTRY
2595{
2596 uint32_t iSeqMine; /**< Presumably this CPU's own sequence number for the sample - TODO confirm against the collection code. */
2597 uint32_t iSeqOther; /**< Presumably the other CPU's sequence number seen for the sample - TODO confirm. */
2598 uint64_t uTsc; /**< Presumably the TSC value read for the sample - TODO confirm. */
2599} SUPDRVTSCDELTAMETHOD2ENTRY;
2600
2601/**
2602 * TSC delta measurement algorithm \#2 Data.
2603 */
2604typedef struct SUPDRVTSCDELTAMETHOD2
2605{
2606 /** Padding to make sure iCurSeqNo is in its own cache line. */
2607 uint64_t au64CacheLinePaddingBefore[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t)];
2608 /** The current sequence number of this worker. */
2609 uint32_t volatile iCurSeqNo;
2610 /** Padding to make sure iCurSeqNo is in its own cache line. Note that the
 * elements are uint32_t despite the au64 prefix; the '- 1' makes room for
 * iCurSeqNo so that the two together fill GIP_TSC_DELTA_CACHE_LINE_SIZE bytes. */
2611 uint32_t au64CacheLinePaddingAfter[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint32_t) - 1];
2612 /** Result table (64 entries). */
2613 SUPDRVTSCDELTAMETHOD2ENTRY aResults[64];
2614} SUPDRVTSCDELTAMETHOD2;
2615/** Pointer to the data for TSC delta measurement algorithm \#2 .*/
2616typedef SUPDRVTSCDELTAMETHOD2 *PSUPDRVTSCDELTAMETHOD2;
2617
2618
2619/**
2620 * The TSC delta synchronization struct, version 2.
2621 *
2622 * The synchronization variable is completely isolated in its own cache line
2623 * (provided our max cache line size estimate is correct).
2624 */
2625typedef struct SUPTSCDELTASYNC2
2626{
2627 /** Padding to make sure uSyncVar is in its own cache line. */
2628 uint64_t au64CacheLinePaddingBefore[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t)];
2629
2630 /** The synchronization variable, holds values GIP_TSC_DELTA_SYNC2_*. */
2631 volatile uint32_t uSyncVar;
2632 /** Sequence synchronizing variable used for post 'GO' synchronization. */
2633 volatile uint32_t uSyncSeq;
2634
2635 /** Padding to make sure uSyncVar is in its own cache line. */
2636 uint64_t au64CacheLinePaddingAfter[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t) - 2];
2637
2638 /** Start RDTSC value. Put here mainly to save stack space. The timeout
 * check in supdrvTscDeltaSync2_Before() measures elapsed ticks from it. */
2639 uint64_t uTscStart;
2640 /** Copy of SUPDRVGIPTSCDELTARGS::cMaxTscTicks. */
2641 uint64_t cMaxTscTicks;
2642} SUPTSCDELTASYNC2;
2643AssertCompileSize(SUPTSCDELTASYNC2, GIP_TSC_DELTA_CACHE_LINE_SIZE * 2 + sizeof(uint64_t)); /* Pins the layout at compile time. */
2644typedef SUPTSCDELTASYNC2 *PSUPTSCDELTASYNC2;
2645
2646/** Prestart wait. */
2647#define GIP_TSC_DELTA_SYNC2_PRESTART_WAIT UINT32_C(0x0ffe)
2648/** Prestart aborted. */
2649#define GIP_TSC_DELTA_SYNC2_PRESTART_ABORT UINT32_C(0x0fff)
2650/** Ready (on your mark). The state supdrvTscDeltaSync2_Before() expects to
 * find in a party's uSyncVar before switching it to STEADY. */
2651#define GIP_TSC_DELTA_SYNC2_READY UINT32_C(0x1000)
2652/** Steady (get set). Set in the other party's uSyncVar by
 * supdrvTscDeltaSync2_Before() once the setting party is in position. */
2653#define GIP_TSC_DELTA_SYNC2_STEADY UINT32_C(0x1001)
2654/** Go! Sent by the master to the worker in supdrvTscDeltaSync2_Before(). */
2655#define GIP_TSC_DELTA_SYNC2_GO UINT32_C(0x1002)
2656/** Used by the verification test. */
2657#define GIP_TSC_DELTA_SYNC2_GO_GO UINT32_C(0x1003)
2658
2659/** We reached the time limit (set by a waiting party that gave up). */
2660#define GIP_TSC_DELTA_SYNC2_TIMEOUT UINT32_C(0x1ffe)
2661/** The other party won't touch the sync struct ever again. */
2662#define GIP_TSC_DELTA_SYNC2_FINAL UINT32_C(0x1fff)
2663
2664
2665/**
2666 * Argument package/state passed by supdrvTscMeasureDeltaOne() to the RTMpOn
2667 * callback worker.
2668 *
 * The master and worker ("proletarian") sections are separated from each
 * other and from the shared header fields by cache line sized padding.
 *
 * @todo add
2669 */
2670typedef struct SUPDRVGIPTSCDELTARGS
2671{
2672 /** The device extension. */
2673 PSUPDRVDEVEXT pDevExt;
2674 /** Pointer to the GIP CPU array entry for the worker. */
2675 PSUPGIPCPU pWorker;
2676 /** Pointer to the GIP CPU array entry for the master. */
2677 PSUPGIPCPU pMaster;
2678 /** The maximum number of ticks to spend in supdrvTscMeasureDeltaCallback.
2679 * (This is what we need a rough TSC frequency for.) */
2680 uint64_t cMaxTscTicks;
2681 /** Used to abort synchronization setup. */
2682 bool volatile fAbortSetup;
2683
2684 /** Padding to make sure the master variables live in their own cache lines. */
2685 uint64_t au64CacheLinePaddingBefore[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t)];
2686
2687 /** @name Master
2688 * @{ */
2689 /** The time the master spent in the MP worker. */
2690 uint64_t cElapsedMasterTscTicks;
2691 /** The iTry value when stopped at. */
2692 uint32_t iTry;
2693 /** Set if the run timed out (set by supdrvTscDeltaSync2_Before() when it gives up waiting). */
2694 bool volatile fTimedOut;
2695 /** Pointer to the master's synchronization struct (on stack). */
2696 PSUPTSCDELTASYNC2 volatile pSyncMaster;
2697 /** Master data union. */
2698 union
2699 {
2700 /** Data (master) for delta verification. */
2701 struct
2702 {
2703 /** Verification test TSC values for the master. */
2704 uint64_t volatile auTscs[32];
2705 } Verify;
2706 /** Data (master) for measurement method \#2. */
2707 struct
2708 {
2709 /** Data and sequence number. */
2710 SUPDRVTSCDELTAMETHOD2 Data;
2711 /** The lag setting for the next run. */
2712 bool fLag;
2713 /** Number of hits. */
2714 uint32_t cHits;
2715 } M2;
2716 } uMaster;
2717 /** The verifier verdict, VINF_SUCCESS if ok, VERR_OUT_OF_RANGE if not,
2718 * VERR_TRY_AGAIN on timeout. */
2719 int32_t rcVerify;
2720#ifdef TSCDELTA_VERIFY_WITH_STATS
2721 /** The maximum difference between TSC read during delta verification. */
2722 int64_t cMaxVerifyTscTicks;
2723 /** The minimum difference between two TSC reads during verification. */
2724 int64_t cMinVerifyTscTicks;
2725 /** The bad TSC diff, worker relative to master (= worker - master).
2726 * Negative value means the worker is behind the master. */
2727 int64_t iVerifyBadTscDiff;
2728#endif
2729 /** @} */
2730
2731 /** Padding to make sure the worker variables live in their own cache line. */
2732 uint64_t au64CacheLinePaddingBetween[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t)];
2733
2734 /** @name Proletarian
2735 * @{ */
2736 /** Pointer to the worker's synchronization struct (on stack). */
2737 PSUPTSCDELTASYNC2 volatile pSyncWorker;
2738 /** The time the worker spent in the MP worker. */
2739 uint64_t cElapsedWorkerTscTicks;
2740 /** Worker data union. */
2741 union
2742 {
2743 /** Data (worker) for delta verification. */
2744 struct
2745 {
2746 /** Verification test TSC values for the worker. */
2747 uint64_t volatile auTscs[32];
2748 } Verify;
2749 /** Data (worker) for measurement method \#2. */
2750 struct
2751 {
2752 /** Data and sequence number. */
2753 SUPDRVTSCDELTAMETHOD2 Data;
2754 /** The lag setting for the next run (set by master). */
2755 bool fLag;
2756 } M2;
2757 } uWorker;
2758 /** @} */
2759
2760 /** Padding to make sure the above is in its own cache line. */
2761 uint64_t au64CacheLinePaddingAfter[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t)];
2762} SUPDRVGIPTSCDELTARGS;
/** Pointer to a SUPDRVGIPTSCDELTARGS argument package. */
2763typedef SUPDRVGIPTSCDELTARGS *PSUPDRVGIPTSCDELTARGS;
2764
2765
2766/** @name Macros that implement the basic synchronization steps common to
2767 * the algorithms.
2768 *
2769 * Must be used from a loop as the timeouts are implemented via 'break' statements
2770 * at the moment.
2771 *
2772 * @{
2773 */
/* Loop debugging aids. With DEBUG_bird defined, TSCDELTA_DBG_CHECK_LOOP()
   counts iterations and hits RT_BREAKPOINT() whenever the low 25 bits of the
   counter are all zero; otherwise all three macros expand to no-ops. */
2774#if defined(DEBUG_bird) /* || defined(VBOX_STRICT) */
2775# define TSCDELTA_DBG_VARS() uint32_t iDbgCounter
2776# define TSCDELTA_DBG_START_LOOP() do { iDbgCounter = 0; } while (0)
2777# define TSCDELTA_DBG_CHECK_LOOP() \
2778 do { iDbgCounter++; if ((iDbgCounter & UINT32_C(0x01ffffff)) == 0) RT_BREAKPOINT(); } while (0)
2779#else
2780# define TSCDELTA_DBG_VARS() ((void)0)
2781# define TSCDELTA_DBG_START_LOOP() ((void)0)
2782# define TSCDELTA_DBG_CHECK_LOOP() ((void)0)
2783#endif
/* Sync tracing macros. Each takes a parenthesized printf-style argument list
   and is compiled out by default; change the corresponding '#if 0' to 1 to
   route that message class to SUPR0Printf. */
2784#if 0
2785# define TSCDELTA_DBG_SYNC_MSG(a_Args) SUPR0Printf a_Args
2786#else
2787# define TSCDELTA_DBG_SYNC_MSG(a_Args) ((void)0)
2788#endif
2789#if 0
2790# define TSCDELTA_DBG_SYNC_MSG2(a_Args) SUPR0Printf a_Args
2791#else
2792# define TSCDELTA_DBG_SYNC_MSG2(a_Args) ((void)0)
2793#endif
2794#if 0
2795# define TSCDELTA_DBG_SYNC_MSG9(a_Args) SUPR0Printf a_Args
2796#else
2797# define TSCDELTA_DBG_SYNC_MSG9(a_Args) ((void)0)
2798#endif
2799
2800
2801static bool supdrvTscDeltaSync2_Before(PSUPTSCDELTASYNC2 pMySync, PSUPTSCDELTASYNC2 pOtherSync,
2802 bool fIsMaster, PRTCCUINTREG pfEFlags, PSUPDRVGIPTSCDELTARGS pArgs)
2803{
2804 uint32_t iMySeq = fIsMaster ? 0 : 256;
2805 uint32_t const iMaxSeq = iMySeq + 16; /* For the last loop, darn linux/freebsd C-ishness. */
2806 uint32_t u32Tmp;
2807 uint32_t iSync2Loops = 0;
2808 RTCCUINTREG fEFlags;
2809 TSCDELTA_DBG_VARS();
2810
2811 *pfEFlags = X86_EFL_IF | X86_EFL_1; /* should shut up most nagging compilers. */
2812
2813 /*
2814 * The master tells the worker to get on it's mark.
2815 */
2816 if (fIsMaster)
2817 {
2818 if (RT_LIKELY(ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_STEADY, GIP_TSC_DELTA_SYNC2_READY)))
2819 { /* likely*/ }
2820 else
2821 {
2822 TSCDELTA_DBG_SYNC_MSG(("sync/before/%s: #1 uSyncVar=%#x\n", fIsMaster ? "master" : "worker", pOtherSync->uSyncVar));
2823 return false;
2824 }
2825 }
2826
2827 /*
2828 * Wait for the on your mark signal (ack in the master case). We process timeouts here.
2829 */
2830 ASMAtomicWriteU32(&(pMySync)->uSyncSeq, 0);
2831 for (;;)
2832 {
2833 fEFlags = ASMIntDisableFlags();
2834 u32Tmp = ASMAtomicReadU32(&pMySync->uSyncVar);
2835 if (u32Tmp == GIP_TSC_DELTA_SYNC2_STEADY)
2836 break;
2837 ASMSetFlags(fEFlags);
2838 ASMNopPause();
2839
2840 /* Abort? */
2841 if (u32Tmp != GIP_TSC_DELTA_SYNC2_READY)
2842 {
2843 TSCDELTA_DBG_SYNC_MSG(("sync/before/%s: #2 u32Tmp=%#x\n", fIsMaster ? "master" : "worker", u32Tmp));
2844 return false;
2845 }
2846
2847 /* Check for timeouts every so often (not every loop in case RDTSC is
2848 trapping or something). Must check the first time around. */
2849#if 0 /* For debugging the timeout paths. */
2850 static uint32_t volatile xxx;
2851#endif
2852 if ( ( (iSync2Loops & 0x3ff) == 0
2853 && ASMReadTSC() - pMySync->uTscStart > pMySync->cMaxTscTicks)
2854#if 0 /* This is crazy, I know, but enable this code and the results are markedly better when enabled on the 1.4GHz AMD (debug). */
2855 || (!fIsMaster && (++xxx & 0xf) == 0)
2856#endif
2857 )
2858 {
2859 /* Try switch our own state into timeout mode so the master cannot tell us to 'GO',
2860 ignore the timeout if we've got the go ahead already (simpler). */
2861 if (ASMAtomicCmpXchgU32(&pMySync->uSyncVar, GIP_TSC_DELTA_SYNC2_TIMEOUT, GIP_TSC_DELTA_SYNC2_READY))
2862 {
2863 TSCDELTA_DBG_SYNC_MSG(("sync/before/%s: timeout\n", fIsMaster ? "master" : "worker"));
2864 ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_TIMEOUT, GIP_TSC_DELTA_SYNC2_STEADY);
2865 ASMAtomicWriteBool(&pArgs->fTimedOut, true);
2866 return false;
2867 }
2868 }
2869 iSync2Loops++;
2870 }
2871
2872 /*
2873 * Interrupts are now disabled and will remain disabled until we do
2874 * TSCDELTA_MASTER_SYNC_AFTER / TSCDELTA_OTHER_SYNC_AFTER.
2875 */
2876 *pfEFlags = fEFlags;
2877
2878 /*
2879 * The worker tells the master that it is on its mark and that the master
2880 * need to get into position as well.
2881 */
2882 if (!fIsMaster)
2883 {
2884 if (RT_LIKELY(ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_STEADY, GIP_TSC_DELTA_SYNC2_READY)))
2885 { /* likely */ }
2886 else
2887 {
2888 ASMSetFlags(fEFlags);
2889 TSCDELTA_DBG_SYNC_MSG(("sync/before/%s: #3 uSyncVar=%#x\n", fIsMaster ? "master" : "worker", pOtherSync->uSyncVar));
2890 return false;
2891 }
2892 }
2893
2894 /*
2895 * The master sends the 'go' to the worker and wait for ACK.
2896 */
2897 if (fIsMaster)
2898 {
2899 if (RT_LIKELY(ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO, GIP_TSC_DELTA_SYNC2_STEADY)))
2900 { /* likely */ }
2901 else
2902 {
2903 ASMSetFlags(fEFlags);
2904 TSCDELTA_DBG_SYNC_MSG(("sync/before/%s: #4 uSyncVar=%#x\n", fIsMaster ? "master" : "worker", pOtherSync->uSyncVar));
2905 return false;
2906 }
2907 }
2908
2909 /*
2910 * Wait for the 'go' signal (ack in the master case).
2911 */
2912 TSCDELTA_DBG_START_LOOP();
2913 for (;;)
2914 {
2915 u32Tmp = ASMAtomicReadU32(&pMySync->uSyncVar);
2916 if (u32Tmp == GIP_TSC_DELTA_SYNC2_GO)
2917 break;
2918 if (RT_LIKELY(u32Tmp == GIP_TSC_DELTA_SYNC2_STEADY))
2919 { /* likely */ }
2920 else
2921 {
2922 ASMSetFlags(fEFlags);
2923 TSCDELTA_DBG_SYNC_MSG(("sync/before/%s: #5 u32Tmp=%#x\n", fIsMaster ? "master" : "worker", u32Tmp));
2924 return false;
2925 }
2926
2927 TSCDELTA_DBG_CHECK_LOOP();
2928 ASMNopPause();
2929 }
2930
2931 /*
2932 * The worker acks the 'go' (shouldn't fail).
2933 */
2934 if (!fIsMaster)
2935 {
2936 if (RT_LIKELY(ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO, GIP_TSC_DELTA_SYNC2_STEADY)))
2937 { /* likely */ }
2938 else
2939 {
2940 ASMSetFlags(fEFlags);
2941 TSCDELTA_DBG_SYNC_MSG(("sync/before/%s: #6 uSyncVar=%#x\n", fIsMaster ? "master" : "worker", pOtherSync->uSyncVar));
2942 return false;
2943 }
2944 }
2945
2946 /*
2947 * Try enter mostly lockstep execution with it.
2948 */
2949 for (;;)
2950 {
2951 uint32_t iOtherSeq1, iOtherSeq2;
2952 ASMCompilerBarrier();
2953 ASMSerializeInstruction();
2954
2955 ASMAtomicWriteU32(&pMySync->uSyncSeq, iMySeq);
2956 ASMNopPause();
2957 iOtherSeq1 = ASMAtomicXchgU32(&pOtherSync->uSyncSeq, iMySeq);
2958 ASMNopPause();
2959 iOtherSeq2 = ASMAtomicReadU32(&pMySync->uSyncSeq);
2960
2961 ASMCompilerBarrier();
2962 if (iOtherSeq1 == iOtherSeq2)
2963 return true;
2964
2965 /* Did the other guy give up? Should we give up? */
2966 if ( iOtherSeq1 == UINT32_MAX
2967 || iOtherSeq2 == UINT32_MAX)
2968 return true;
2969 if (++iMySeq >= iMaxSeq)
2970 {
2971 ASMAtomicWriteU32(&pMySync->uSyncSeq, UINT32_MAX);
2972 return true;
2973 }
2974 ASMNopPause();
2975 }
2976}
2977
/**
 * Master side wrapper around supdrvTscDeltaSync2_Before().
 *
 * Executes a 'break' statement when the handshake fails, so it must be placed
 * where breaking out of the enclosing loop is the desired failure handling.
 * The trailing 'else do {} while (0)' swallows the semicolon at the use site.
 */
#define TSCDELTA_MASTER_SYNC_BEFORE(a_pMySync, a_pOtherSync, a_pfEFlags, a_pArgs) \
    if (!RT_LIKELY(supdrvTscDeltaSync2_Before(a_pMySync, a_pOtherSync, true /*fIsMaster*/, a_pfEFlags, a_pArgs))) \
    { \
        TSCDELTA_DBG_SYNC_MSG9(("sync/before/master: #89\n")); \
        break; \
    } else do {} while (0)

/**
 * Worker side wrapper around supdrvTscDeltaSync2_Before().
 *
 * Same contract as TSCDELTA_MASTER_SYNC_BEFORE: does a 'break' on failure.
 */
#define TSCDELTA_OTHER_SYNC_BEFORE(a_pMySync, a_pOtherSync, a_pfEFlags, a_pArgs) \
    if (!RT_LIKELY(supdrvTscDeltaSync2_Before(a_pMySync, a_pOtherSync, false /*fIsMaster*/, a_pfEFlags, a_pArgs))) \
    { \
        TSCDELTA_DBG_SYNC_MSG9(("sync/before/other: #89\n")); \
        break; \
    } else do {} while (0)
2994
2995
2996static bool supdrvTscDeltaSync2_After(PSUPTSCDELTASYNC2 pMySync, PSUPTSCDELTASYNC2 pOtherSync,
2997 bool fIsMaster, RTCCUINTREG fEFlags)
2998{
2999 TSCDELTA_DBG_VARS();
3000 RT_NOREF1(pOtherSync);
3001
3002 /*
3003 * Wait for the 'ready' signal. In the master's case, this means the
3004 * worker has completed its data collection, while in the worker's case it
3005 * means the master is done processing the data and it's time for the next
3006 * loop iteration (or whatever).
3007 */
3008 ASMSetFlags(fEFlags);
3009 TSCDELTA_DBG_START_LOOP();
3010 for (;;)
3011 {
3012 uint32_t u32Tmp = ASMAtomicReadU32(&pMySync->uSyncVar);
3013 if ( u32Tmp == GIP_TSC_DELTA_SYNC2_READY
3014 || (u32Tmp == GIP_TSC_DELTA_SYNC2_STEADY && !fIsMaster) /* kicked twice => race */ )
3015 return true;
3016 ASMNopPause();
3017 if (RT_LIKELY(u32Tmp == GIP_TSC_DELTA_SYNC2_GO))
3018 { /* likely */}
3019 else
3020 {
3021 TSCDELTA_DBG_SYNC_MSG(("sync/after/other: #1 u32Tmp=%#x\n", u32Tmp));
3022 return false; /* shouldn't ever happen! */
3023 }
3024 TSCDELTA_DBG_CHECK_LOOP();
3025 ASMNopPause();
3026 }
3027}
3028
/**
 * Master side wrapper around supdrvTscDeltaSync2_After(); does a 'break' on
 * failure.  The trailing 'else do {} while (0)' swallows the semicolon at the
 * use site.
 */
#define TSCDELTA_MASTER_SYNC_AFTER(a_pMySync, a_pOtherSync, a_fEFlags) \
    if (!RT_LIKELY(supdrvTscDeltaSync2_After(a_pMySync, a_pOtherSync, true /*fIsMaster*/, a_fEFlags))) \
    { \
        TSCDELTA_DBG_SYNC_MSG9(("sync/after/master: #97\n")); \
        break; \
    } else do {} while (0)

/**
 * Master side: tells the worker that we're done processing the data and are
 * ready for the next round by switching its sync variable from 'go' to
 * 'ready'.  Does a 'break' if the worker's sync variable wasn't in the 'go'
 * state.
 */
#define TSCDELTA_MASTER_KICK_OTHER_OUT_OF_AFTER(a_pMySync, a_pOtherSync) \
    if (!RT_LIKELY(ASMAtomicCmpXchgU32(&(a_pOtherSync)->uSyncVar, GIP_TSC_DELTA_SYNC2_READY, GIP_TSC_DELTA_SYNC2_GO))) \
    { \
        TSCDELTA_DBG_SYNC_MSG(("sync/after/master: #99 uSyncVar=%#x\n", (a_pOtherSync)->uSyncVar)); \
        break; \
    } else do {} while (0)

/**
 * Worker side: tells the master that we're done collecting data (switching
 * its sync variable from 'go' to 'ready') and then waits for the next round
 * to start via supdrvTscDeltaSync2_After().  Does a 'break' on failure; if
 * the first step fails the flags are restored here, since
 * supdrvTscDeltaSync2_After() is then never reached.
 */
#define TSCDELTA_OTHER_SYNC_AFTER(a_pMySync, a_pOtherSync, a_fEFlags) \
    if (true) { \
        if (!RT_LIKELY(ASMAtomicCmpXchgU32(&(a_pOtherSync)->uSyncVar, GIP_TSC_DELTA_SYNC2_READY, GIP_TSC_DELTA_SYNC2_GO))) \
        { \
            ASMSetFlags(a_fEFlags); \
            TSCDELTA_DBG_SYNC_MSG(("sync/after/other: #0 uSyncVar=%#x\n", (a_pOtherSync)->uSyncVar)); \
            break; \
        } \
        if (!RT_LIKELY(supdrvTscDeltaSync2_After(a_pMySync, a_pOtherSync, false /*fIsMaster*/, a_fEFlags))) \
        { \
            TSCDELTA_DBG_SYNC_MSG9(("sync/after/other: #98\n")); \
            break; \
        } \
    } else do {} while (0)
3071/** @} */
3072
3073
3074#ifdef GIP_TSC_DELTA_METHOD_1
3075/**
3076 * TSC delta measurement algorithm \#1 (GIP_TSC_DELTA_METHOD_1).
3077 *
3078 *
3079 * We ignore the first few runs of the loop in order to prime the
3080 * cache. Also, we need to be careful about using 'pause' instruction
3081 * in critical busy-wait loops in this code - it can cause undesired
3082 * behaviour with hyperthreading.
3083 *
3084 * We try to minimize the measurement error by computing the minimum
3085 * read time of the compare statement in the worker by taking TSC
3086 * measurements across it.
3087 *
3088 * It must be noted that the computed minimum read time is mostly to
3089 * eliminate huge deltas when the worker is too early and doesn't by
3090 * itself help produce more accurate deltas. We allow two times the
3091 * computed minimum as an arbitrary acceptable threshold. Therefore,
3092 * it is still possible to get negative deltas where there are none
3093 * when the worker is earlier. As long as these occasional negative
3094 * deltas are lower than the time it takes to exit guest-context and
3095 * the OS to reschedule EMT on a different CPU, we won't expose a TSC
3096 * that jumped backwards. It is due to the existence of the negative
3097 * deltas that we don't recompute the delta with the master and
3098 * worker interchanged to eliminate the remaining measurement error.
3099 *
3100 *
3101 * @param pArgs The argument/state data.
3102 * @param pMySync My synchronization structure.
3103 * @param pOtherSync My partner's synchronization structure.
3104 * @param fIsMaster Set if master, clear if worker.
3105 * @param iTry The attempt number.
3106 */
3107static void supdrvTscDeltaMethod1Loop(PSUPDRVGIPTSCDELTARGS pArgs, PSUPTSCDELTASYNC2 pMySync, PSUPTSCDELTASYNC2 pOtherSync,
3108 bool fIsMaster, uint32_t iTry)
3109{
3110 PSUPGIPCPU pGipCpuWorker = pArgs->pWorker;
3111 PSUPGIPCPU pGipCpuMaster = pArgs->pMaster;
3112 uint64_t uMinCmpReadTime = UINT64_MAX;
3113 unsigned iLoop;
3114 NOREF(iTry);
3115
3116 for (iLoop = 0; iLoop < GIP_TSC_DELTA_LOOPS; iLoop++)
3117 {
3118 RTCCUINTREG fEFlags;
3119 if (fIsMaster)
3120 {
3121 /*
3122 * The master.
3123 */
3124 AssertMsg(pGipCpuMaster->u64TSCSample == GIP_TSC_DELTA_RSVD,
3125 ("%#llx idMaster=%#x idWorker=%#x (idGipMaster=%#x)\n",
3126 pGipCpuMaster->u64TSCSample, pGipCpuMaster->idCpu, pGipCpuWorker->idCpu, pArgs->pDevExt->idGipMaster));
3127 TSCDELTA_MASTER_SYNC_BEFORE(pMySync, pOtherSync, &fEFlags, pArgs);
3128
3129 do
3130 {
3131 ASMSerializeInstruction();
3132 ASMAtomicWriteU64(&pGipCpuMaster->u64TSCSample, ASMReadTSC());
3133 } while (pGipCpuMaster->u64TSCSample == GIP_TSC_DELTA_RSVD);
3134
3135 TSCDELTA_MASTER_SYNC_AFTER(pMySync, pOtherSync, fEFlags);
3136
3137 /* Process the data. */
3138 if (iLoop > GIP_TSC_DELTA_PRIMER_LOOPS + GIP_TSC_DELTA_READ_TIME_LOOPS)
3139 {
3140 if (pGipCpuWorker->u64TSCSample != GIP_TSC_DELTA_RSVD)
3141 {
3142 int64_t iDelta = pGipCpuWorker->u64TSCSample
3143 - (pGipCpuMaster->u64TSCSample - pGipCpuMaster->i64TSCDelta);
3144 if ( iDelta >= GIP_TSC_DELTA_INITIAL_MASTER_VALUE
3145 ? iDelta < pGipCpuWorker->i64TSCDelta
3146 : iDelta > pGipCpuWorker->i64TSCDelta || pGipCpuWorker->i64TSCDelta == INT64_MAX)
3147 pGipCpuWorker->i64TSCDelta = iDelta;
3148 }
3149 }
3150
3151 /* Reset our TSC sample and tell the worker to move on. */
3152 ASMAtomicWriteU64(&pGipCpuMaster->u64TSCSample, GIP_TSC_DELTA_RSVD);
3153 TSCDELTA_MASTER_KICK_OTHER_OUT_OF_AFTER(pMySync, pOtherSync);
3154 }
3155 else
3156 {
3157 /*
3158 * The worker.
3159 */
3160 uint64_t uTscWorker;
3161 uint64_t uTscWorkerFlushed;
3162 uint64_t uCmpReadTime;
3163
3164 ASMAtomicReadU64(&pGipCpuMaster->u64TSCSample); /* Warm the cache line. */
3165 TSCDELTA_OTHER_SYNC_BEFORE(pMySync, pOtherSync, &fEFlags, pArgs);
3166
3167 /*
3168 * Keep reading the TSC until we notice that the master has read his. Reading
3169 * the TSC -after- the master has updated the memory is way too late. We thus
3170 * compensate by trying to measure how long it took for the worker to notice
3171 * the memory flushed from the master.
3172 */
3173 do
3174 {
3175 ASMSerializeInstruction();
3176 uTscWorker = ASMReadTSC();
3177 } while (pGipCpuMaster->u64TSCSample == GIP_TSC_DELTA_RSVD);
3178 ASMSerializeInstruction();
3179 uTscWorkerFlushed = ASMReadTSC();
3180
3181 uCmpReadTime = uTscWorkerFlushed - uTscWorker;
3182 if (iLoop > GIP_TSC_DELTA_PRIMER_LOOPS + GIP_TSC_DELTA_READ_TIME_LOOPS)
3183 {
3184 /* This is totally arbitrary a.k.a I don't like it but I have no better ideas for now. */
3185 if (uCmpReadTime < (uMinCmpReadTime << 1))
3186 {
3187 ASMAtomicWriteU64(&pGipCpuWorker->u64TSCSample, uTscWorker);
3188 if (uCmpReadTime < uMinCmpReadTime)
3189 uMinCmpReadTime = uCmpReadTime;
3190 }
3191 else
3192 ASMAtomicWriteU64(&pGipCpuWorker->u64TSCSample, GIP_TSC_DELTA_RSVD);
3193 }
3194 else if (iLoop > GIP_TSC_DELTA_PRIMER_LOOPS)
3195 {
3196 if (uCmpReadTime < uMinCmpReadTime)
3197 uMinCmpReadTime = uCmpReadTime;
3198 }
3199
3200 TSCDELTA_OTHER_SYNC_AFTER(pMySync, pOtherSync, fEFlags);
3201 }
3202 }
3203
3204 TSCDELTA_DBG_SYNC_MSG9(("sync/method1loop/%s: #92 iLoop=%u MyState=%#x\n", fIsMaster ? "master" : "worker", iLoop,
3205 pMySync->uSyncVar));
3206
3207 /*
3208 * We must reset the worker TSC sample value in case it gets picked as a
3209 * GIP master later on (it's trashed above, naturally).
3210 */
3211 if (!fIsMaster)
3212 ASMAtomicWriteU64(&pGipCpuWorker->u64TSCSample, GIP_TSC_DELTA_RSVD);
3213}
3214#endif /* GIP_TSC_DELTA_METHOD_1 */
3215
3216
3217#ifdef GIP_TSC_DELTA_METHOD_2
3218/*
3219 * TSC delta measurement algorithm \#2 configuration and code - Experimental!!
3220 */
3221
3222# define GIP_TSC_DELTA_M2_LOOPS (7 + GIP_TSC_DELTA_M2_PRIMER_LOOPS)
3223# define GIP_TSC_DELTA_M2_PRIMER_LOOPS 0
3224
3225
3226static void supdrvTscDeltaMethod2ProcessDataOnMaster(PSUPDRVGIPTSCDELTARGS pArgs)
3227{
3228 int64_t iMasterTscDelta = pArgs->pMaster->i64TSCDelta;
3229 int64_t iBestDelta = pArgs->pWorker->i64TSCDelta;
3230 uint32_t idxResult;
3231 uint32_t cHits = 0;
3232
3233 /*
3234 * Look for matching entries in the master and worker tables.
3235 */
3236 for (idxResult = 0; idxResult < RT_ELEMENTS(pArgs->uMaster.M2.Data.aResults); idxResult++)
3237 {
3238 uint32_t idxOther = pArgs->uMaster.M2.Data.aResults[idxResult].iSeqOther;
3239 if (idxOther & 1)
3240 {
3241 idxOther >>= 1;
3242 if (idxOther < RT_ELEMENTS(pArgs->uWorker.M2.Data.aResults))
3243 {
3244 if (pArgs->uWorker.M2.Data.aResults[idxOther].iSeqOther == pArgs->uMaster.M2.Data.aResults[idxResult].iSeqMine)
3245 {
3246 int64_t iDelta;
3247 iDelta = pArgs->uWorker.M2.Data.aResults[idxOther].uTsc
3248 - (pArgs->uMaster.M2.Data.aResults[idxResult].uTsc - iMasterTscDelta);
3249 if ( iDelta >= GIP_TSC_DELTA_INITIAL_MASTER_VALUE
3250 ? iDelta < iBestDelta
3251 : iDelta > iBestDelta || iBestDelta == INT64_MAX)
3252 iBestDelta = iDelta;
3253 cHits++;
3254 }
3255 }
3256 }
3257 }
3258
3259 /*
3260 * Save the results.
3261 */
3262 if (cHits > 2)
3263 pArgs->pWorker->i64TSCDelta = iBestDelta;
3264 pArgs->uMaster.M2.cHits += cHits;
3265}
3266
3267
3268/**
3269 * The core function of the 2nd TSC delta measurement algorithm.
3270 *
3271 * The idea here is that we have the two CPUs execute the exact same code
3272 * collecting a largish set of TSC samples. The code has one data dependency on
3273 * the other CPU which intention it is to synchronize the execution as well as
3274 * help cross references the two sets of TSC samples (the sequence numbers).
3275 *
3276 * The @a fLag parameter is used to modify the execution a tiny bit on one or
3277 * both of the CPUs. When @a fLag differs between the CPUs, it is thought that
3278 * it will help with making the CPUs enter lock step execution occasionally.
3279 *
3280 */
3281static void supdrvTscDeltaMethod2CollectData(PSUPDRVTSCDELTAMETHOD2 pMyData, uint32_t volatile *piOtherSeqNo, bool fLag)
3282{
3283 SUPDRVTSCDELTAMETHOD2ENTRY *pEntry = &pMyData->aResults[0];
3284 uint32_t cLeft = RT_ELEMENTS(pMyData->aResults);
3285
3286 ASMAtomicWriteU32(&pMyData->iCurSeqNo, 0);
3287 ASMSerializeInstruction();
3288 while (cLeft-- > 0)
3289 {
3290 uint64_t uTsc;
3291 uint32_t iSeqMine = ASMAtomicIncU32(&pMyData->iCurSeqNo);
3292 uint32_t iSeqOther = ASMAtomicReadU32(piOtherSeqNo);
3293 ASMCompilerBarrier();
3294 ASMSerializeInstruction(); /* Way better result than with ASMMemoryFenceSSE2() in this position! */
3295 uTsc = ASMReadTSC();
3296 ASMAtomicIncU32(&pMyData->iCurSeqNo);
3297 ASMCompilerBarrier();
3298 ASMSerializeInstruction();
3299 pEntry->iSeqMine = iSeqMine;
3300 pEntry->iSeqOther = iSeqOther;
3301 pEntry->uTsc = uTsc;
3302 pEntry++;
3303 ASMSerializeInstruction();
3304 if (fLag)
3305 ASMNopPause();
3306 }
3307}
3308
3309
3310/**
3311 * TSC delta measurement algorithm \#2 (GIP_TSC_DELTA_METHOD_2).
3312 *
3313 * See supdrvTscDeltaMethod2CollectData for algorithm details.
3314 *
3315 * @param pArgs The argument/state data.
3316 * @param pMySync My synchronization structure.
3317 * @param pOtherSync My partner's synchronization structure.
3318 * @param fIsMaster Set if master, clear if worker.
3319 * @param iTry The attempt number.
3320 */
3321static void supdrvTscDeltaMethod2Loop(PSUPDRVGIPTSCDELTARGS pArgs, PSUPTSCDELTASYNC2 pMySync, PSUPTSCDELTASYNC2 pOtherSync,
3322 bool fIsMaster, uint32_t iTry)
3323{
3324 unsigned iLoop;
3325 RT_NOREF1(iTry);
3326
3327 for (iLoop = 0; iLoop < GIP_TSC_DELTA_M2_LOOPS; iLoop++)
3328 {
3329 RTCCUINTREG fEFlags;
3330 if (fIsMaster)
3331 {
3332 /*
3333 * Adjust the loop lag fudge.
3334 */
3335# if GIP_TSC_DELTA_M2_PRIMER_LOOPS > 0
3336 if (iLoop < GIP_TSC_DELTA_M2_PRIMER_LOOPS)
3337 {
3338 /* Lag during the priming to be nice to everyone.. */
3339 pArgs->uMaster.M2.fLag = true;
3340 pArgs->uWorker.M2.fLag = true;
3341 }
3342 else
3343# endif
3344 if (iLoop < (GIP_TSC_DELTA_M2_LOOPS - GIP_TSC_DELTA_M2_PRIMER_LOOPS) / 4)
3345 {
3346 /* 25 % of the body without lagging. */
3347 pArgs->uMaster.M2.fLag = false;
3348 pArgs->uWorker.M2.fLag = false;
3349 }
3350 else if (iLoop < (GIP_TSC_DELTA_M2_LOOPS - GIP_TSC_DELTA_M2_PRIMER_LOOPS) / 4 * 2)
3351 {
3352 /* 25 % of the body with both lagging. */
3353 pArgs->uMaster.M2.fLag = true;
3354 pArgs->uWorker.M2.fLag = true;
3355 }
3356 else
3357 {
3358 /* 50% of the body with alternating lag. */
3359 pArgs->uMaster.M2.fLag = (iLoop & 1) == 0;
3360 pArgs->uWorker.M2.fLag= (iLoop & 1) == 1;
3361 }
3362
3363 /*
3364 * Sync up with the worker and collect data.
3365 */
3366 TSCDELTA_MASTER_SYNC_BEFORE(pMySync, pOtherSync, &fEFlags, pArgs);
3367 supdrvTscDeltaMethod2CollectData(&pArgs->uMaster.M2.Data, &pArgs->uWorker.M2.Data.iCurSeqNo, pArgs->uMaster.M2.fLag);
3368 TSCDELTA_MASTER_SYNC_AFTER(pMySync, pOtherSync, fEFlags);
3369
3370 /*
3371 * Process the data.
3372 */
3373# if GIP_TSC_DELTA_M2_PRIMER_LOOPS > 0
3374 if (iLoop >= GIP_TSC_DELTA_M2_PRIMER_LOOPS)
3375# endif
3376 supdrvTscDeltaMethod2ProcessDataOnMaster(pArgs);
3377
3378 TSCDELTA_MASTER_KICK_OTHER_OUT_OF_AFTER(pMySync, pOtherSync);
3379 }
3380 else
3381 {
3382 /*
3383 * The worker.
3384 */
3385 TSCDELTA_OTHER_SYNC_BEFORE(pMySync, pOtherSync, &fEFlags, pArgs);
3386 supdrvTscDeltaMethod2CollectData(&pArgs->uWorker.M2.Data, &pArgs->uMaster.M2.Data.iCurSeqNo, pArgs->uWorker.M2.fLag);
3387 TSCDELTA_OTHER_SYNC_AFTER(pMySync, pOtherSync, fEFlags);
3388 }
3389 }
3390}
3391
3392#endif /* GIP_TSC_DELTA_METHOD_2 */
3393
3394
3395
3396static int supdrvTscDeltaVerify(PSUPDRVGIPTSCDELTARGS pArgs, PSUPTSCDELTASYNC2 pMySync,
3397 PSUPTSCDELTASYNC2 pOtherSync, bool fIsMaster, int64_t iWorkerTscDelta)
3398{
3399 /*PSUPGIPCPU pGipCpuWorker = pArgs->pWorker; - unused */
3400 PSUPGIPCPU pGipCpuMaster = pArgs->pMaster;
3401 uint32_t i;
3402 TSCDELTA_DBG_VARS();
3403
3404 for (;;)
3405 {
3406 RTCCUINTREG fEFlags;
3407 AssertCompile((RT_ELEMENTS(pArgs->uMaster.Verify.auTscs) & 1) == 0);
3408 AssertCompile(RT_ELEMENTS(pArgs->uMaster.Verify.auTscs) == RT_ELEMENTS(pArgs->uWorker.Verify.auTscs));
3409
3410 if (fIsMaster)
3411 {
3412 uint64_t uTscWorker;
3413 TSCDELTA_MASTER_SYNC_BEFORE(pMySync, pOtherSync, &fEFlags, pArgs);
3414
3415 /*
3416 * Collect TSC, master goes first.
3417 */
3418 for (i = 0; i < RT_ELEMENTS(pArgs->uMaster.Verify.auTscs); i += 2)
3419 {
3420 /* Read, kick & wait #1. */
3421 uint64_t register uTsc = ASMReadTSC();
3422 ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO_GO);
3423 ASMSerializeInstruction();
3424 pArgs->uMaster.Verify.auTscs[i] = uTsc;
3425 TSCDELTA_DBG_START_LOOP();
3426 while (ASMAtomicReadU32(&pMySync->uSyncVar) == GIP_TSC_DELTA_SYNC2_GO)
3427 {
3428 TSCDELTA_DBG_CHECK_LOOP();
3429 ASMNopPause();
3430 }
3431
3432 /* Read, kick & wait #2. */
3433 uTsc = ASMReadTSC();
3434 ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO);
3435 ASMSerializeInstruction();
3436 pArgs->uMaster.Verify.auTscs[i + 1] = uTsc;
3437 TSCDELTA_DBG_START_LOOP();
3438 while (ASMAtomicReadU32(&pMySync->uSyncVar) == GIP_TSC_DELTA_SYNC2_GO_GO)
3439 {
3440 TSCDELTA_DBG_CHECK_LOOP();
3441 ASMNopPause();
3442 }
3443 }
3444
3445 TSCDELTA_MASTER_SYNC_AFTER(pMySync, pOtherSync, fEFlags);
3446
3447 /*
3448 * Process the data.
3449 */
3450#ifdef TSCDELTA_VERIFY_WITH_STATS
3451 pArgs->cMaxVerifyTscTicks = INT64_MIN;
3452 pArgs->cMinVerifyTscTicks = INT64_MAX;
3453 pArgs->iVerifyBadTscDiff = 0;
3454#endif
3455 ASMAtomicWriteS32(&pArgs->rcVerify, VINF_SUCCESS);
3456 uTscWorker = 0;
3457 for (i = 0; i < RT_ELEMENTS(pArgs->uMaster.Verify.auTscs); i++)
3458 {
3459 /* Master vs previous worker entry. */
3460 uint64_t uTscMaster = pArgs->uMaster.Verify.auTscs[i] - pGipCpuMaster->i64TSCDelta;
3461 int64_t iDiff;
3462 if (i > 0)
3463 {
3464 iDiff = uTscMaster - uTscWorker;
3465#ifdef TSCDELTA_VERIFY_WITH_STATS
3466 if (iDiff > pArgs->cMaxVerifyTscTicks)
3467 pArgs->cMaxVerifyTscTicks = iDiff;
3468 if (iDiff < pArgs->cMinVerifyTscTicks)
3469 pArgs->cMinVerifyTscTicks = iDiff;
3470#endif
3471 if (iDiff < 0)
3472 {
3473#ifdef TSCDELTA_VERIFY_WITH_STATS
3474 pArgs->iVerifyBadTscDiff = -iDiff;
3475#endif
3476 ASMAtomicWriteS32(&pArgs->rcVerify, VERR_OUT_OF_RANGE);
3477 break;
3478 }
3479 }
3480
3481 /* Worker vs master. */
3482 uTscWorker = pArgs->uWorker.Verify.auTscs[i] - iWorkerTscDelta;
3483 iDiff = uTscWorker - uTscMaster;
3484#ifdef TSCDELTA_VERIFY_WITH_STATS
3485 if (iDiff > pArgs->cMaxVerifyTscTicks)
3486 pArgs->cMaxVerifyTscTicks = iDiff;
3487 if (iDiff < pArgs->cMinVerifyTscTicks)
3488 pArgs->cMinVerifyTscTicks = iDiff;
3489#endif
3490 if (iDiff < 0)
3491 {
3492#ifdef TSCDELTA_VERIFY_WITH_STATS
3493 pArgs->iVerifyBadTscDiff = iDiff;
3494#endif
3495 ASMAtomicWriteS32(&pArgs->rcVerify, VERR_OUT_OF_RANGE);
3496 break;
3497 }
3498 }
3499
3500 /* Done. */
3501 TSCDELTA_MASTER_KICK_OTHER_OUT_OF_AFTER(pMySync, pOtherSync);
3502 }
3503 else
3504 {
3505 /*
3506 * The worker, master leads.
3507 */
3508 TSCDELTA_OTHER_SYNC_BEFORE(pMySync, pOtherSync, &fEFlags, pArgs);
3509
3510 for (i = 0; i < RT_ELEMENTS(pArgs->uWorker.Verify.auTscs); i += 2)
3511 {
3512 uint64_t register uTsc;
3513
3514 /* Wait, Read and Kick #1. */
3515 TSCDELTA_DBG_START_LOOP();
3516 while (ASMAtomicReadU32(&pMySync->uSyncVar) == GIP_TSC_DELTA_SYNC2_GO)
3517 {
3518 TSCDELTA_DBG_CHECK_LOOP();
3519 ASMNopPause();
3520 }
3521 uTsc = ASMReadTSC();
3522 ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO_GO);
3523 ASMSerializeInstruction();
3524 pArgs->uWorker.Verify.auTscs[i] = uTsc;
3525
3526 /* Wait, Read and Kick #2. */
3527 TSCDELTA_DBG_START_LOOP();
3528 while (ASMAtomicReadU32(&pMySync->uSyncVar) == GIP_TSC_DELTA_SYNC2_GO_GO)
3529 {
3530 TSCDELTA_DBG_CHECK_LOOP();
3531 ASMNopPause();
3532 }
3533 uTsc = ASMReadTSC();
3534 ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO);
3535 ASMSerializeInstruction();
3536 pArgs->uWorker.Verify.auTscs[i + 1] = uTsc;
3537 }
3538
3539 TSCDELTA_OTHER_SYNC_AFTER(pMySync, pOtherSync, fEFlags);
3540 }
3541 return pArgs->rcVerify;
3542 }
3543
3544 /*
3545 * Timed out, please retry.
3546 */
3547 ASMAtomicWriteS32(&pArgs->rcVerify, VERR_TRY_AGAIN);
3548 return VERR_TIMEOUT;
3549}
3550
3551
3552
3553/**
3554 * Handles the special abort procedure during synchronization setup in
3555 * supdrvTscMeasureDeltaCallbackUnwrapped().
3556 *
3557 * @returns 0 (dummy, ignored)
3558 * @param pArgs Pointer to argument/state data.
3559 * @param pMySync Pointer to my sync structure.
3560 * @param fIsMaster Set if we're the master, clear if worker.
3561 * @param fTimeout Set if it's a timeout.
3562 */
3563DECL_NO_INLINE(static, int)
3564supdrvTscMeasureDeltaCallbackAbortSyncSetup(PSUPDRVGIPTSCDELTARGS pArgs, PSUPTSCDELTASYNC2 pMySync, bool fIsMaster, bool fTimeout)
3565{
3566 PSUPTSCDELTASYNC2 volatile *ppMySync = fIsMaster ? &pArgs->pSyncMaster : &pArgs->pSyncWorker;
3567 PSUPTSCDELTASYNC2 volatile *ppOtherSync = fIsMaster ? &pArgs->pSyncWorker : &pArgs->pSyncMaster;
3568 TSCDELTA_DBG_VARS();
3569 RT_NOREF1(pMySync);
3570
3571 /*
3572 * Clear our sync pointer and make sure the abort flag is set.
3573 */
3574 ASMAtomicWriteNullPtr(ppMySync);
3575 ASMAtomicWriteBool(&pArgs->fAbortSetup, true);
3576 if (fTimeout)
3577 ASMAtomicWriteBool(&pArgs->fTimedOut, true);
3578
3579 /*
3580 * Make sure the other party is out of there and won't be touching our
3581 * sync state again (would cause stack corruption).
3582 */
3583 TSCDELTA_DBG_START_LOOP();
3584 while (ASMAtomicReadPtrT(ppOtherSync, PSUPTSCDELTASYNC2) != NULL)
3585 {
3586 ASMNopPause();
3587 ASMNopPause();
3588 ASMNopPause();
3589 TSCDELTA_DBG_CHECK_LOOP();
3590 }
3591
3592 return 0;
3593}
3594
3595
3596/**
3597 * This is used by supdrvTscMeasureInitialDeltas() to read the TSC on two CPUs
3598 * and compute the delta between them.
3599 *
3600 * To reduce code size a good when timeout handling was added, a dummy return
3601 * value had to be added (saves 1-3 lines per timeout case), thus this
3602 * 'Unwrapped' function and the dummy 0 return value.
3603 *
3604 * @returns 0 (dummy, ignored)
3605 * @param idCpu The CPU we are current scheduled on.
3606 * @param pArgs Pointer to a parameter package.
3607 *
3608 * @remarks Measuring TSC deltas between the CPUs is tricky because we need to
3609 * read the TSC at exactly the same time on both the master and the
3610 * worker CPUs. Due to DMA, bus arbitration, cache locality,
3611 * contention, SMI, pipelining etc. there is no guaranteed way of
3612 * doing this on x86 CPUs.
3613 */
3614static int supdrvTscMeasureDeltaCallbackUnwrapped(RTCPUID idCpu, PSUPDRVGIPTSCDELTARGS pArgs)
3615{
3616 PSUPDRVDEVEXT pDevExt = pArgs->pDevExt;
3617 PSUPGIPCPU pGipCpuWorker = pArgs->pWorker;
3618 PSUPGIPCPU pGipCpuMaster = pArgs->pMaster;
3619 bool const fIsMaster = idCpu == pGipCpuMaster->idCpu;
3620 uint32_t iTry;
3621 PSUPTSCDELTASYNC2 volatile *ppMySync = fIsMaster ? &pArgs->pSyncMaster : &pArgs->pSyncWorker;
3622 PSUPTSCDELTASYNC2 volatile *ppOtherSync = fIsMaster ? &pArgs->pSyncWorker : &pArgs->pSyncMaster;
3623 SUPTSCDELTASYNC2 MySync;
3624 PSUPTSCDELTASYNC2 pOtherSync;
3625 int rc;
3626 TSCDELTA_DBG_VARS();
3627
3628 /* A bit of paranoia first. */
3629 if (!pGipCpuMaster || !pGipCpuWorker)
3630 return 0;
3631
3632 /*
3633 * If the CPU isn't part of the measurement, return immediately.
3634 */
3635 if ( !fIsMaster
3636 && idCpu != pGipCpuWorker->idCpu)
3637 return 0;
3638
3639 /*
3640 * Set up my synchronization stuff and wait for the other party to show up.
3641 *
3642 * We don't wait forever since the other party may be off fishing (offline,
3643 * spinning with ints disables, whatever), we must play nice to the rest of
3644 * the system as this context generally isn't one in which we will get
3645 * preempted and we may hold up a number of lower priority interrupts.
3646 */
3647 ASMAtomicWriteU32(&MySync.uSyncVar, GIP_TSC_DELTA_SYNC2_PRESTART_WAIT);
3648 ASMAtomicWritePtr(ppMySync, &MySync);
3649 MySync.uTscStart = ASMReadTSC();
3650 MySync.cMaxTscTicks = pArgs->cMaxTscTicks;
3651
3652 /* Look for the partner, might not be here yet... Special abort considerations. */
3653 iTry = 0;
3654 TSCDELTA_DBG_START_LOOP();
3655 while ((pOtherSync = ASMAtomicReadPtrT(ppOtherSync, PSUPTSCDELTASYNC2)) == NULL)
3656 {
3657 ASMNopPause();
3658 if ( ASMAtomicReadBool(&pArgs->fAbortSetup)
3659 || !RTMpIsCpuOnline(fIsMaster ? pGipCpuWorker->idCpu : pGipCpuMaster->idCpu) )
3660 return supdrvTscMeasureDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, false /*fTimeout*/);
3661 if ( (iTry++ & 0xff) == 0
3662 && ASMReadTSC() - MySync.uTscStart > pArgs->cMaxTscTicks)
3663 return supdrvTscMeasureDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, true /*fTimeout*/);
3664 TSCDELTA_DBG_CHECK_LOOP();
3665 ASMNopPause();
3666 }
3667
3668 /* I found my partner, waiting to be found... Special abort considerations. */
3669 if (fIsMaster)
3670 if (!ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_READY, GIP_TSC_DELTA_SYNC2_PRESTART_WAIT)) /* parnaoia */
3671 return supdrvTscMeasureDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, false /*fTimeout*/);
3672
3673 iTry = 0;
3674 TSCDELTA_DBG_START_LOOP();
3675 while (ASMAtomicReadU32(&MySync.uSyncVar) == GIP_TSC_DELTA_SYNC2_PRESTART_WAIT)
3676 {
3677 ASMNopPause();
3678 if (ASMAtomicReadBool(&pArgs->fAbortSetup))
3679 return supdrvTscMeasureDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, false /*fTimeout*/);
3680 if ( (iTry++ & 0xff) == 0
3681 && ASMReadTSC() - MySync.uTscStart > pArgs->cMaxTscTicks)
3682 {
3683 if ( fIsMaster
3684 && !ASMAtomicCmpXchgU32(&MySync.uSyncVar, GIP_TSC_DELTA_SYNC2_PRESTART_ABORT, GIP_TSC_DELTA_SYNC2_PRESTART_WAIT))
3685 break; /* race #1: slave has moved on, handle timeout in loop instead. */
3686 return supdrvTscMeasureDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, true /*fTimeout*/);
3687 }
3688 TSCDELTA_DBG_CHECK_LOOP();
3689 }
3690
3691 if (!fIsMaster)
3692 if (!ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_READY, GIP_TSC_DELTA_SYNC2_PRESTART_WAIT)) /* race #1 */
3693 return supdrvTscMeasureDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, false /*fTimeout*/);
3694
3695/** @todo Add a resumable state to pArgs so we don't waste time if we time
3696 * out or something. Timeouts are legit, any of the two CPUs may get
3697 * interrupted. */
3698
3699 /*
3700 * Start by seeing if we have a zero delta between the two CPUs.
3701 * This should normally be the case.
3702 */
3703 rc = supdrvTscDeltaVerify(pArgs, &MySync, pOtherSync, fIsMaster, GIP_TSC_DELTA_INITIAL_MASTER_VALUE);
3704 if (RT_SUCCESS(rc))
3705 {
3706 if (fIsMaster)
3707 {
3708 ASMAtomicWriteS64(&pGipCpuWorker->i64TSCDelta, GIP_TSC_DELTA_INITIAL_MASTER_VALUE);
3709 RTCpuSetDelByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet);
3710 RTCpuSetAddByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpuWorker->iCpuSet);
3711 }
3712 }
3713 /*
3714 * If the verification didn't time out, do regular delta measurements.
3715 * We retry this until we get a reasonable value.
3716 */
3717 else if (rc != VERR_TIMEOUT)
3718 {
3719 Assert(pGipCpuWorker->i64TSCDelta == INT64_MAX);
3720 for (iTry = 0; iTry < 12; iTry++)
3721 {
3722 /*
3723 * Check the state before we start.
3724 */
3725 uint32_t u32Tmp = ASMAtomicReadU32(&MySync.uSyncVar);
3726 if ( u32Tmp != GIP_TSC_DELTA_SYNC2_READY
3727 && (fIsMaster || u32Tmp != GIP_TSC_DELTA_SYNC2_STEADY) /* worker may be late prepping for the next round */ )
3728 {
3729 TSCDELTA_DBG_SYNC_MSG(("sync/loop/%s: #0 iTry=%u MyState=%#x\n", fIsMaster ? "master" : "worker", iTry, u32Tmp));
3730 break;
3731 }
3732
3733 /*
3734 * Do the measurements.
3735 */
3736#ifdef GIP_TSC_DELTA_METHOD_1
3737 supdrvTscDeltaMethod1Loop(pArgs, &MySync, pOtherSync, fIsMaster, iTry);
3738#elif defined(GIP_TSC_DELTA_METHOD_2)
3739 supdrvTscDeltaMethod2Loop(pArgs, &MySync, pOtherSync, fIsMaster, iTry);
3740#else
3741# error "huh??"
3742#endif
3743
3744 /*
3745 * Check the state.
3746 */
3747 u32Tmp = ASMAtomicReadU32(&MySync.uSyncVar);
3748 if ( u32Tmp != GIP_TSC_DELTA_SYNC2_READY
3749 && (fIsMaster || u32Tmp != GIP_TSC_DELTA_SYNC2_STEADY) /* worker may be late prepping for the next round */ )
3750 {
3751 if (fIsMaster)
3752 TSCDELTA_DBG_SYNC_MSG(("sync/loop/master: #1 iTry=%u MyState=%#x\n", iTry, u32Tmp));
3753 else
3754 TSCDELTA_DBG_SYNC_MSG2(("sync/loop/worker: #1 iTry=%u MyState=%#x\n", iTry, u32Tmp));
3755 break;
3756 }
3757
3758 /*
3759 * Success? If so, stop trying. Master decides.
3760 */
3761 if (fIsMaster)
3762 {
3763 if (pGipCpuWorker->i64TSCDelta != INT64_MAX)
3764 {
3765 RTCpuSetDelByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet);
3766 RTCpuSetAddByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpuWorker->iCpuSet);
3767 TSCDELTA_DBG_SYNC_MSG2(("sync/loop/master: #9 iTry=%u MyState=%#x\n", iTry, MySync.uSyncVar));
3768 break;
3769 }
3770 }
3771 }
3772 if (fIsMaster)
3773 pArgs->iTry = iTry;
3774 }
3775
3776 /*
3777 * End the synchronization dance. We tell the other that we're done,
3778 * then wait for the same kind of reply.
3779 */
3780 ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_FINAL);
3781 ASMAtomicWriteNullPtr(ppMySync);
3782 iTry = 0;
3783 TSCDELTA_DBG_START_LOOP();
3784 while (ASMAtomicReadU32(&MySync.uSyncVar) != GIP_TSC_DELTA_SYNC2_FINAL)
3785 {
3786 iTry++;
3787 if ( iTry == 0
3788 && !RTMpIsCpuOnline(fIsMaster ? pGipCpuWorker->idCpu : pGipCpuMaster->idCpu))
3789 break; /* this really shouldn't happen. */
3790 TSCDELTA_DBG_CHECK_LOOP();
3791 ASMNopPause();
3792 }
3793
3794 /*
3795 * Collect some runtime stats.
3796 */
3797 if (fIsMaster)
3798 pArgs->cElapsedMasterTscTicks = ASMReadTSC() - MySync.uTscStart;
3799 else
3800 pArgs->cElapsedWorkerTscTicks = ASMReadTSC() - MySync.uTscStart;
3801 return 0;
3802}
3803
3804/**
3805 * Callback used by supdrvTscMeasureInitialDeltas() to read the TSC on two CPUs
3806 * and compute the delta between them.
3807 *
3808 * @param idCpu The CPU we are current scheduled on.
3809 * @param pvUser1 Pointer to a parameter package (SUPDRVGIPTSCDELTARGS).
3810 * @param pvUser2 Unused.
3811 */
3812static DECLCALLBACK(void) supdrvTscMeasureDeltaCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
3813{
3814 supdrvTscMeasureDeltaCallbackUnwrapped(idCpu, (PSUPDRVGIPTSCDELTARGS)pvUser1);
3815 RT_NOREF1(pvUser2);
3816}
3817
3818
3819/**
3820 * Measures the TSC delta between the master GIP CPU and one specified worker
3821 * CPU.
3822 *
3823 * @returns VBox status code.
3824 * @retval VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED on pure measurement
3825 * failure.
3826 * @param pDevExt Pointer to the device instance data.
3827 * @param idxWorker The index of the worker CPU from the GIP's array of
3828 * CPUs.
3829 *
3830 * @remarks This must be called with preemption enabled!
3831 */
3832static int supdrvTscMeasureDeltaOne(PSUPDRVDEVEXT pDevExt, uint32_t idxWorker)
3833{
3834 int rc;
3835 int rc2;
3836 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
3837 RTCPUID idMaster = pDevExt->idGipMaster;
3838 PSUPGIPCPU pGipCpuWorker = &pGip->aCPUs[idxWorker];
3839 PSUPGIPCPU pGipCpuMaster;
3840 uint32_t iGipCpuMaster;
3841 uint32_t u32Tmp;
3842
3843 /* Validate input a bit. */
3844 AssertReturn(pGip, VERR_INVALID_PARAMETER);
3845 Assert(pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED);
3846 Assert(RTThreadPreemptIsEnabled(NIL_RTTHREAD));
3847
3848 /*
3849 * Don't attempt measuring the delta for the GIP master.
3850 */
3851 if (pGipCpuWorker->idCpu == idMaster)
3852 {
3853 if (pGipCpuWorker->i64TSCDelta == INT64_MAX) /* This shouldn't happen, but just in case. */
3854 ASMAtomicWriteS64(&pGipCpuWorker->i64TSCDelta, GIP_TSC_DELTA_INITIAL_MASTER_VALUE);
3855 return VINF_SUCCESS;
3856 }
3857
3858 /*
3859 * One measurement at a time, at least for now. We might be using
3860 * broadcast IPIs so, so be nice to the rest of the system.
3861 */
3862#ifdef SUPDRV_USE_MUTEX_FOR_GIP
3863 rc = RTSemMutexRequest(pDevExt->mtxTscDelta, RT_INDEFINITE_WAIT);
3864#else
3865 rc = RTSemFastMutexRequest(pDevExt->mtxTscDelta);
3866#endif
3867 if (RT_FAILURE(rc))
3868 return rc;
3869
3870 /*
3871 * If the CPU has hyper-threading and the APIC IDs of the master and worker are adjacent,
3872 * try pick a different master. (This fudge only works with multi core systems.)
3873 * ASSUMES related threads have adjacent APIC IDs. ASSUMES two threads per core.
3874 *
3875 * We skip this on AMDs for now as their HTT is different from Intel's and
3876 * it doesn't seem to have any favorable effect on the results.
3877 *
3878 * If the master is offline, we need a new master too, so share the code.
3879 */
3880 iGipCpuMaster = supdrvGipFindCpuIndexForCpuId(pGip, idMaster);
3881 AssertReturn(iGipCpuMaster < pGip->cCpus, VERR_INVALID_CPU_ID);
3882 pGipCpuMaster = &pGip->aCPUs[iGipCpuMaster];
3883 if ( ( (pGipCpuMaster->idApic & ~1) == (pGipCpuWorker->idApic & ~1)
3884 && pGip->cOnlineCpus > 2
3885 && ASMHasCpuId()
3886 && ASMIsValidStdRange(ASMCpuId_EAX(0))
3887 && (ASMCpuId_EDX(1) & X86_CPUID_FEATURE_EDX_HTT)
3888 && ( !ASMIsAmdCpu()
3889 || ASMGetCpuFamily(u32Tmp = ASMCpuId_EAX(1)) > 0x15
3890 || ( ASMGetCpuFamily(u32Tmp) == 0x15 /* Piledriver+, not bulldozer (FX-4150 didn't like it). */
3891 && ASMGetCpuModelAMD(u32Tmp) >= 0x02) ) )
3892 || !RTMpIsCpuOnline(idMaster) )
3893 {
3894 uint32_t i;
3895 for (i = 0; i < pGip->cCpus; i++)
3896 if ( i != iGipCpuMaster
3897 && i != idxWorker
3898 && pGip->aCPUs[i].enmState == SUPGIPCPUSTATE_ONLINE
3899 && pGip->aCPUs[i].i64TSCDelta != INT64_MAX
3900 && pGip->aCPUs[i].idCpu != NIL_RTCPUID
3901 && pGip->aCPUs[i].idCpu != idMaster /* paranoia starts here... */
3902 && pGip->aCPUs[i].idCpu != pGipCpuWorker->idCpu
3903 && pGip->aCPUs[i].idApic != pGipCpuWorker->idApic
3904 && pGip->aCPUs[i].idApic != pGipCpuMaster->idApic
3905 && RTMpIsCpuOnline(pGip->aCPUs[i].idCpu))
3906 {
3907 iGipCpuMaster = i;
3908 pGipCpuMaster = &pGip->aCPUs[i];
3909 idMaster = pGipCpuMaster->idCpu;
3910 break;
3911 }
3912 }
3913
3914 if (RTCpuSetIsMemberByIndex(&pGip->OnlineCpuSet, pGipCpuWorker->iCpuSet))
3915 {
3916 /*
3917 * Initialize data package for the RTMpOnPair callback.
3918 */
3919 PSUPDRVGIPTSCDELTARGS pArgs = (PSUPDRVGIPTSCDELTARGS)RTMemAllocZ(sizeof(*pArgs));
3920 if (pArgs)
3921 {
3922 pArgs->pWorker = pGipCpuWorker;
3923 pArgs->pMaster = pGipCpuMaster;
3924 pArgs->pDevExt = pDevExt;
3925 pArgs->pSyncMaster = NULL;
3926 pArgs->pSyncWorker = NULL;
3927 pArgs->cMaxTscTicks = ASMAtomicReadU64(&pGip->u64CpuHz) / 512; /* 1953 us */
3928
3929 /*
3930 * Do the RTMpOnPair call. We reset i64TSCDelta first so we
3931 * and supdrvTscMeasureDeltaCallback can use it as a success check.
3932 */
3933 /** @todo Store the i64TSCDelta result in pArgs first? Perhaps deals with
3934 * that when doing the restart loop reorg. */
3935 ASMAtomicWriteS64(&pGipCpuWorker->i64TSCDelta, INT64_MAX);
3936 rc = RTMpOnPair(pGipCpuMaster->idCpu, pGipCpuWorker->idCpu, RTMPON_F_CONCURRENT_EXEC,
3937 supdrvTscMeasureDeltaCallback, pArgs, NULL);
3938 if (RT_SUCCESS(rc))
3939 {
3940#if 0
3941 SUPR0Printf("mponpair ticks: %9llu %9llu max: %9llu iTry: %u%s\n", pArgs->cElapsedMasterTscTicks,
3942 pArgs->cElapsedWorkerTscTicks, pArgs->cMaxTscTicks, pArgs->iTry,
3943 pArgs->fTimedOut ? " timed out" :"");
3944#endif
3945#if 0
3946 SUPR0Printf("rcVerify=%d iVerifyBadTscDiff=%lld cMinVerifyTscTicks=%lld cMaxVerifyTscTicks=%lld\n",
3947 pArgs->rcVerify, pArgs->iVerifyBadTscDiff, pArgs->cMinVerifyTscTicks, pArgs->cMaxVerifyTscTicks);
3948#endif
3949 if (RT_LIKELY(pGipCpuWorker->i64TSCDelta != INT64_MAX))
3950 {
3951 /*
3952 * Work the TSC delta applicability rating. It starts
3953 * optimistic in supdrvGipInit, we downgrade it here.
3954 */
3955 SUPGIPUSETSCDELTA enmRating;
3956 if ( pGipCpuWorker->i64TSCDelta > GIP_TSC_DELTA_THRESHOLD_ROUGHLY_ZERO
3957 || pGipCpuWorker->i64TSCDelta < -GIP_TSC_DELTA_THRESHOLD_ROUGHLY_ZERO)
3958 enmRating = SUPGIPUSETSCDELTA_NOT_ZERO;
3959 else if ( pGipCpuWorker->i64TSCDelta > GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO
3960 || pGipCpuWorker->i64TSCDelta < -GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO)
3961 enmRating = SUPGIPUSETSCDELTA_ROUGHLY_ZERO;
3962 else
3963 enmRating = SUPGIPUSETSCDELTA_PRACTICALLY_ZERO;
3964 if (pGip->enmUseTscDelta < enmRating)
3965 {
3966 AssertCompile(sizeof(pGip->enmUseTscDelta) == sizeof(uint32_t));
3967 ASMAtomicWriteU32((uint32_t volatile *)&pGip->enmUseTscDelta, enmRating);
3968 }
3969 }
3970 else
3971 rc = VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED;
3972 }
3973 /** @todo return try-again if we get an offline CPU error. */
3974
3975 RTMemFree(pArgs);
3976 }
3977 else
3978 rc = VERR_NO_MEMORY;
3979 }
3980 else
3981 rc = VERR_CPU_OFFLINE;
3982
3983 /*
3984 * We're done now.
3985 */
3986#ifdef SUPDRV_USE_MUTEX_FOR_GIP
3987 rc2 = RTSemMutexRelease(pDevExt->mtxTscDelta); AssertRC(rc2);
3988#else
3989 rc2 = RTSemFastMutexRelease(pDevExt->mtxTscDelta); AssertRC(rc2);
3990#endif
3991 return rc;
3992}
3993
3994
3995/**
3996 * Resets the TSC-delta related TSC samples and optionally the deltas
3997 * themselves.
3998 *
3999 * @param pDevExt Pointer to the device instance data.
4000 * @param fResetTscDeltas Whether the TSC-deltas are also to be reset.
4001 *
4002 * @remarks This might be called while holding a spinlock!
4003 */
4004static void supdrvTscResetSamples(PSUPDRVDEVEXT pDevExt, bool fResetTscDeltas)
4005{
4006 unsigned iCpu;
4007 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
4008 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
4009 {
4010 PSUPGIPCPU pGipCpu = &pGip->aCPUs[iCpu];
4011 ASMAtomicWriteU64(&pGipCpu->u64TSCSample, GIP_TSC_DELTA_RSVD);
4012 if (fResetTscDeltas)
4013 {
4014 RTCpuSetDelByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpu->iCpuSet);
4015 ASMAtomicWriteS64(&pGipCpu->i64TSCDelta, INT64_MAX);
4016 }
4017 }
4018}
4019
4020
4021/**
4022 * Picks an online CPU as the master TSC for TSC-delta computations.
4023 *
4024 * @returns VBox status code.
4025 * @param pDevExt Pointer to the device instance data.
4026 * @param pidxMaster Where to store the CPU array index of the chosen
4027 * master. Optional, can be NULL.
4028 */
4029static int supdrvTscPickMaster(PSUPDRVDEVEXT pDevExt, uint32_t *pidxMaster)
4030{
4031 /*
4032 * Pick the first CPU online as the master TSC and make it the new GIP master based
4033 * on the APIC ID.
4034 *
4035 * Technically we can simply use "idGipMaster" but doing this gives us master as CPU 0
4036 * in most cases making it nicer/easier for comparisons. It is safe to update the GIP
4037 * master as this point since the sync/async timer isn't created yet.
4038 */
4039 unsigned iCpu;
4040 uint32_t idxMaster = UINT32_MAX;
4041 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
4042 for (iCpu = 0; iCpu < RT_ELEMENTS(pGip->aiCpuFromApicId); iCpu++)
4043 {
4044 uint16_t idxCpu = pGip->aiCpuFromApicId[iCpu];
4045 if (idxCpu != UINT16_MAX)
4046 {
4047 PSUPGIPCPU pGipCpu = &pGip->aCPUs[idxCpu];
4048 if (RTCpuSetIsMemberByIndex(&pGip->OnlineCpuSet, pGipCpu->iCpuSet))
4049 {
4050 idxMaster = idxCpu;
4051 pGipCpu->i64TSCDelta = GIP_TSC_DELTA_INITIAL_MASTER_VALUE;
4052 ASMAtomicWriteSize(&pDevExt->idGipMaster, pGipCpu->idCpu);
4053 if (pidxMaster)
4054 *pidxMaster = idxMaster;
4055 return VINF_SUCCESS;
4056 }
4057 }
4058 }
4059 return VERR_CPU_OFFLINE;
4060}
4061
4062
4063/**
4064 * Performs the initial measurements of the TSC deltas between CPUs.
4065 *
4066 * This is called by supdrvGipCreate(), supdrvGipPowerNotificationCallback() or
4067 * triggered by it if threaded.
4068 *
4069 * @returns VBox status code.
4070 * @param pDevExt Pointer to the device instance data.
4071 *
4072 * @remarks Must be called only after supdrvGipInitOnCpu() as this function uses
4073 * idCpu, GIP's online CPU set which are populated in
4074 * supdrvGipInitOnCpu().
4075 */
4076static int supdrvTscMeasureInitialDeltas(PSUPDRVDEVEXT pDevExt)
4077{
4078 PSUPGIPCPU pGipCpuMaster;
4079 unsigned iCpu;
4080 unsigned iOddEven;
4081 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
4082 uint32_t idxMaster = UINT32_MAX;
4083 uint32_t cMpOnOffEvents = ASMAtomicReadU32(&pDevExt->cMpOnOffEvents);
4084
4085 Assert(pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED);
4086 supdrvTscResetSamples(pDevExt, true /* fClearDeltas */);
4087 int rc = supdrvTscPickMaster(pDevExt, &idxMaster);
4088 if (RT_FAILURE(rc))
4089 {
4090 SUPR0Printf("Failed to pick a CPU master for TSC-delta measurements rc=%Rrc\n", rc);
4091 return rc;
4092 }
4093 AssertReturn(idxMaster < pGip->cCpus, VERR_INVALID_CPU_INDEX);
4094 pGipCpuMaster = &pGip->aCPUs[idxMaster];
4095 Assert(pDevExt->idGipMaster == pGipCpuMaster->idCpu);
4096
4097 /*
4098 * If there is only a single CPU online we have nothing to do.
4099 */
4100 if (pGip->cOnlineCpus <= 1)
4101 {
4102 AssertReturn(pGip->cOnlineCpus > 0, VERR_INTERNAL_ERROR_5);
4103 return VINF_SUCCESS;
4104 }
4105
4106 /*
4107 * Loop thru the GIP CPU array and get deltas for each CPU (except the
4108 * master). We do the CPUs with the even numbered APIC IDs first so that
4109 * we've got alternative master CPUs to pick from on hyper-threaded systems.
4110 */
4111 for (iOddEven = 0; iOddEven < 2; iOddEven++)
4112 {
4113 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
4114 {
4115 PSUPGIPCPU pGipCpuWorker = &pGip->aCPUs[iCpu];
4116 if ( iCpu != idxMaster
4117 && (iOddEven > 0 || (pGipCpuWorker->idApic & 1) == 0)
4118 && RTCpuSetIsMemberByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet))
4119 {
4120 rc = supdrvTscMeasureDeltaOne(pDevExt, iCpu);
4121 if (RT_FAILURE(rc))
4122 {
4123 SUPR0Printf("supdrvTscMeasureDeltaOne failed. rc=%d CPU[%u].idCpu=%u Master[%u].idCpu=%u\n", rc, iCpu,
4124 pGipCpuWorker->idCpu, idxMaster, pDevExt->idGipMaster, pGipCpuMaster->idCpu);
4125 break;
4126 }
4127
4128 if (ASMAtomicReadU32(&pDevExt->cMpOnOffEvents) != cMpOnOffEvents)
4129 {
4130 SUPR0Printf("One or more CPUs transitioned between online & offline states. I'm confused, retry...\n");
4131 rc = VERR_TRY_AGAIN;
4132 break;
4133 }
4134 }
4135 }
4136 }
4137
4138 return rc;
4139}
4140
4141
4142#ifdef SUPDRV_USE_TSC_DELTA_THREAD
4143
4144/**
4145 * Switches the TSC-delta measurement thread into the butchered state.
4146 *
4147 * @returns VBox status code.
4148 * @param pDevExt Pointer to the device instance data.
4149 * @param fSpinlockHeld Whether the TSC-delta spinlock is held or not.
4150 * @param pszFailed An error message to log.
4151 * @param rcFailed The error code to exit the thread with.
4152 */
4153static int supdrvTscDeltaThreadButchered(PSUPDRVDEVEXT pDevExt, bool fSpinlockHeld, const char *pszFailed, int rcFailed)
4154{
4155 if (!fSpinlockHeld)
4156 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4157
4158 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Butchered;
4159 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4160 OSDBGPRINT(("supdrvTscDeltaThreadButchered: %s. rc=%Rrc\n", pszFailed, rcFailed));
4161 return rcFailed;
4162}
4163
4164
4165/**
4166 * The TSC-delta measurement thread.
4167 *
4168 * @returns VBox status code.
4169 * @param hThread The thread handle.
4170 * @param pvUser Opaque pointer to the device instance data.
4171 */
4172static DECLCALLBACK(int) supdrvTscDeltaThread(RTTHREAD hThread, void *pvUser)
4173{
4174 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
4175 uint32_t cConsecutiveTimeouts = 0;
4176 int rc = VERR_INTERNAL_ERROR_2;
4177 for (;;)
4178 {
4179 /*
4180 * Switch on the current state.
4181 */
4182 SUPDRVTSCDELTATHREADSTATE enmState;
4183 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4184 enmState = pDevExt->enmTscDeltaThreadState;
4185 switch (enmState)
4186 {
4187 case kTscDeltaThreadState_Creating:
4188 {
4189 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Listening;
4190 rc = RTSemEventSignal(pDevExt->hTscDeltaEvent);
4191 if (RT_FAILURE(rc))
4192 return supdrvTscDeltaThreadButchered(pDevExt, true /* fSpinlockHeld */, "RTSemEventSignal", rc);
4193 RT_FALL_THRU();
4194 }
4195
4196 case kTscDeltaThreadState_Listening:
4197 {
4198 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4199
4200 /*
4201 * Linux counts uninterruptible sleeps as load, hence we shall do a
4202 * regular, interruptible sleep here and ignore wake ups due to signals.
4203 * See task_contributes_to_load() in include/linux/sched.h in the Linux sources.
4204 */
4205 rc = RTThreadUserWaitNoResume(hThread, pDevExt->cMsTscDeltaTimeout);
4206 if ( RT_FAILURE(rc)
4207 && rc != VERR_TIMEOUT
4208 && rc != VERR_INTERRUPTED)
4209 return supdrvTscDeltaThreadButchered(pDevExt, false /* fSpinlockHeld */, "RTThreadUserWait", rc);
4210 RTThreadUserReset(hThread);
4211 break;
4212 }
4213
4214 case kTscDeltaThreadState_WaitAndMeasure:
4215 {
4216 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Measuring;
4217 rc = RTSemEventSignal(pDevExt->hTscDeltaEvent); /* (Safe on windows as long as spinlock isn't IRQ safe.) */
4218 if (RT_FAILURE(rc))
4219 return supdrvTscDeltaThreadButchered(pDevExt, true /* fSpinlockHeld */, "RTSemEventSignal", rc);
4220 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4221 RTThreadSleep(1);
4222 RT_FALL_THRU();
4223 }
4224
4225 case kTscDeltaThreadState_Measuring:
4226 {
4227 cConsecutiveTimeouts = 0;
4228 if (pDevExt->fTscThreadRecomputeAllDeltas)
4229 {
4230 int cTries = 8;
4231 int cMsWaitPerTry = 10;
4232 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
4233 Assert(pGip);
4234 do
4235 {
4236 RTCpuSetCopy(&pDevExt->TscDeltaCpuSet, &pGip->OnlineCpuSet);
4237 rc = supdrvTscMeasureInitialDeltas(pDevExt);
4238 if ( RT_SUCCESS(rc)
4239 || ( RT_FAILURE(rc)
4240 && rc != VERR_TRY_AGAIN
4241 && rc != VERR_CPU_OFFLINE))
4242 {
4243 break;
4244 }
4245 RTThreadSleep(cMsWaitPerTry);
4246 } while (cTries-- > 0);
4247 pDevExt->fTscThreadRecomputeAllDeltas = false;
4248 }
4249 else
4250 {
4251 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
4252 unsigned iCpu;
4253
4254 /* Measure TSC-deltas only for the CPUs that are in the set. */
4255 rc = VINF_SUCCESS;
4256 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
4257 {
4258 PSUPGIPCPU pGipCpuWorker = &pGip->aCPUs[iCpu];
4259 if (RTCpuSetIsMemberByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet))
4260 {
4261 if (pGipCpuWorker->i64TSCDelta == INT64_MAX)
4262 {
4263 int rc2 = supdrvTscMeasureDeltaOne(pDevExt, iCpu);
4264 if (RT_FAILURE(rc2) && RT_SUCCESS(rc))
4265 rc = rc2;
4266 }
4267 else
4268 {
4269 /*
4270 * The thread/someone must've called SUPR0TscDeltaMeasureBySetIndex(),
4271 * mark the delta as fine to get the timer thread off our back.
4272 */
4273 RTCpuSetDelByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet);
4274 RTCpuSetAddByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpuWorker->iCpuSet);
4275 }
4276 }
4277 }
4278 }
4279 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4280 if (pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Measuring)
4281 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Listening;
4282 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4283 Assert(rc != VERR_NOT_AVAILABLE); /* VERR_NOT_AVAILABLE is used as init value, see supdrvTscDeltaThreadInit(). */
4284 ASMAtomicWriteS32(&pDevExt->rcTscDelta, rc);
4285 break;
4286 }
4287
4288 case kTscDeltaThreadState_Terminating:
4289 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Destroyed;
4290 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4291 return VINF_SUCCESS;
4292
4293 case kTscDeltaThreadState_Butchered:
4294 default:
4295 return supdrvTscDeltaThreadButchered(pDevExt, true /* fSpinlockHeld */, "Invalid state", VERR_INVALID_STATE);
4296 }
4297 }
4298 /* not reached */
4299}
4300
4301
4302/**
4303 * Waits for the TSC-delta measurement thread to respond to a state change.
4304 *
4305 * @returns VINF_SUCCESS on success, VERR_TIMEOUT if it doesn't respond in time,
4306 * other error code on internal error.
4307 *
4308 * @param pDevExt The device instance data.
4309 * @param enmCurState The current state.
4310 * @param enmNewState The new state we're waiting for it to enter.
4311 */
4312static int supdrvTscDeltaThreadWait(PSUPDRVDEVEXT pDevExt, SUPDRVTSCDELTATHREADSTATE enmCurState,
4313 SUPDRVTSCDELTATHREADSTATE enmNewState)
4314{
4315 SUPDRVTSCDELTATHREADSTATE enmActualState;
4316 int rc;
4317
4318 /*
4319 * Wait a short while for the expected state transition.
4320 */
4321 RTSemEventWait(pDevExt->hTscDeltaEvent, RT_MS_1SEC);
4322 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4323 enmActualState = pDevExt->enmTscDeltaThreadState;
4324 if (enmActualState == enmNewState)
4325 {
4326 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4327 rc = VINF_SUCCESS;
4328 }
4329 else if (enmActualState == enmCurState)
4330 {
4331 /*
4332 * Wait longer if the state has not yet transitioned to the one we want.
4333 */
4334 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4335 rc = RTSemEventWait(pDevExt->hTscDeltaEvent, 50 * RT_MS_1SEC);
4336 if ( RT_SUCCESS(rc)
4337 || rc == VERR_TIMEOUT)
4338 {
4339 /*
4340 * Check the state whether we've succeeded.
4341 */
4342 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4343 enmActualState = pDevExt->enmTscDeltaThreadState;
4344 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4345 if (enmActualState == enmNewState)
4346 rc = VINF_SUCCESS;
4347 else if (enmActualState == enmCurState)
4348 {
4349 rc = VERR_TIMEOUT;
4350 OSDBGPRINT(("supdrvTscDeltaThreadWait: timed out state transition. enmActualState=%d enmNewState=%d\n",
4351 enmActualState, enmNewState));
4352 }
4353 else
4354 {
4355 rc = VERR_INTERNAL_ERROR;
4356 OSDBGPRINT(("supdrvTscDeltaThreadWait: invalid state transition from %d to %d, expected %d\n", enmCurState,
4357 enmActualState, enmNewState));
4358 }
4359 }
4360 else
4361 OSDBGPRINT(("supdrvTscDeltaThreadWait: RTSemEventWait failed. rc=%Rrc\n", rc));
4362 }
4363 else
4364 {
4365 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4366 OSDBGPRINT(("supdrvTscDeltaThreadWait: invalid state %d when transitioning from %d to %d\n",
4367 enmActualState, enmCurState, enmNewState));
4368 rc = VERR_INTERNAL_ERROR;
4369 }
4370
4371 return rc;
4372}
4373
4374
4375/**
4376 * Signals the TSC-delta thread to start measuring TSC-deltas.
4377 *
4378 * @param pDevExt Pointer to the device instance data.
4379 * @param fForceAll Force re-calculating TSC-deltas on all CPUs.
4380 */
4381static void supdrvTscDeltaThreadStartMeasurement(PSUPDRVDEVEXT pDevExt, bool fForceAll)
4382{
4383 if (pDevExt->hTscDeltaThread != NIL_RTTHREAD)
4384 {
4385 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4386 if ( pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Listening
4387 || pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Measuring)
4388 {
4389 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_WaitAndMeasure;
4390 if (fForceAll)
4391 pDevExt->fTscThreadRecomputeAllDeltas = true;
4392 }
4393 else if ( pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_WaitAndMeasure
4394 && fForceAll)
4395 pDevExt->fTscThreadRecomputeAllDeltas = true;
4396 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4397 RTThreadUserSignal(pDevExt->hTscDeltaThread);
4398 }
4399}
4400
4401
4402/**
4403 * Terminates the actual thread running supdrvTscDeltaThread().
4404 *
4405 * This is an internal worker function for supdrvTscDeltaThreadInit() and
4406 * supdrvTscDeltaTerm().
4407 *
4408 * @param pDevExt Pointer to the device instance data.
4409 */
4410static void supdrvTscDeltaThreadTerminate(PSUPDRVDEVEXT pDevExt)
4411{
4412 int rc;
4413 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4414 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Terminating;
4415 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4416 RTThreadUserSignal(pDevExt->hTscDeltaThread);
4417 rc = RTThreadWait(pDevExt->hTscDeltaThread, 50 * RT_MS_1SEC, NULL /* prc */);
4418 if (RT_FAILURE(rc))
4419 {
4420 /* Signal a few more times before giving up. */
4421 int cTriesLeft = 5;
4422 while (--cTriesLeft > 0)
4423 {
4424 RTThreadUserSignal(pDevExt->hTscDeltaThread);
4425 rc = RTThreadWait(pDevExt->hTscDeltaThread, 2 * RT_MS_1SEC, NULL /* prc */);
4426 if (rc != VERR_TIMEOUT)
4427 break;
4428 }
4429 }
4430}
4431
4432
4433/**
4434 * Initializes and spawns the TSC-delta measurement thread.
4435 *
4436 * A thread is required for servicing re-measurement requests from events like
4437 * CPUs coming online, suspend/resume etc. as it cannot be done synchronously
4438 * under all contexts on all OSs.
4439 *
4440 * @returns VBox status code.
4441 * @param pDevExt Pointer to the device instance data.
4442 *
4443 * @remarks Must only be called -after- initializing GIP and setting up MP
4444 * notifications!
4445 */
4446static int supdrvTscDeltaThreadInit(PSUPDRVDEVEXT pDevExt)
4447{
4448 int rc;
4449 Assert(pDevExt->pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED);
4450 rc = RTSpinlockCreate(&pDevExt->hTscDeltaSpinlock, RTSPINLOCK_FLAGS_INTERRUPT_UNSAFE, "VBoxTscSpnLck");
4451 if (RT_SUCCESS(rc))
4452 {
4453 rc = RTSemEventCreate(&pDevExt->hTscDeltaEvent);
4454 if (RT_SUCCESS(rc))
4455 {
4456 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Creating;
4457 pDevExt->cMsTscDeltaTimeout = 60000;
4458 rc = RTThreadCreate(&pDevExt->hTscDeltaThread, supdrvTscDeltaThread, pDevExt, 0 /* cbStack */,
4459 RTTHREADTYPE_DEFAULT, RTTHREADFLAGS_WAITABLE, "VBoxTscThread");
4460 if (RT_SUCCESS(rc))
4461 {
4462 rc = supdrvTscDeltaThreadWait(pDevExt, kTscDeltaThreadState_Creating, kTscDeltaThreadState_Listening);
4463 if (RT_SUCCESS(rc))
4464 {
4465 ASMAtomicWriteS32(&pDevExt->rcTscDelta, VERR_NOT_AVAILABLE);
4466 return rc;
4467 }
4468
4469 OSDBGPRINT(("supdrvTscDeltaInit: supdrvTscDeltaThreadWait failed. rc=%Rrc\n", rc));
4470 supdrvTscDeltaThreadTerminate(pDevExt);
4471 }
4472 else
4473 OSDBGPRINT(("supdrvTscDeltaInit: RTThreadCreate failed. rc=%Rrc\n", rc));
4474 RTSemEventDestroy(pDevExt->hTscDeltaEvent);
4475 pDevExt->hTscDeltaEvent = NIL_RTSEMEVENT;
4476 }
4477 else
4478 OSDBGPRINT(("supdrvTscDeltaInit: RTSemEventCreate failed. rc=%Rrc\n", rc));
4479 RTSpinlockDestroy(pDevExt->hTscDeltaSpinlock);
4480 pDevExt->hTscDeltaSpinlock = NIL_RTSPINLOCK;
4481 }
4482 else
4483 OSDBGPRINT(("supdrvTscDeltaInit: RTSpinlockCreate failed. rc=%Rrc\n", rc));
4484
4485 return rc;
4486}
4487
4488
4489/**
4490 * Terminates the TSC-delta measurement thread and cleanup.
4491 *
4492 * @param pDevExt Pointer to the device instance data.
4493 */
4494static void supdrvTscDeltaTerm(PSUPDRVDEVEXT pDevExt)
4495{
4496 if ( pDevExt->hTscDeltaSpinlock != NIL_RTSPINLOCK
4497 && pDevExt->hTscDeltaEvent != NIL_RTSEMEVENT)
4498 {
4499 supdrvTscDeltaThreadTerminate(pDevExt);
4500 }
4501
4502 if (pDevExt->hTscDeltaSpinlock != NIL_RTSPINLOCK)
4503 {
4504 RTSpinlockDestroy(pDevExt->hTscDeltaSpinlock);
4505 pDevExt->hTscDeltaSpinlock = NIL_RTSPINLOCK;
4506 }
4507
4508 if (pDevExt->hTscDeltaEvent != NIL_RTSEMEVENT)
4509 {
4510 RTSemEventDestroy(pDevExt->hTscDeltaEvent);
4511 pDevExt->hTscDeltaEvent = NIL_RTSEMEVENT;
4512 }
4513
4514 ASMAtomicWriteS32(&pDevExt->rcTscDelta, VERR_NOT_AVAILABLE);
4515}
4516
4517#endif /* SUPDRV_USE_TSC_DELTA_THREAD */
4518
4519/**
4520 * Measure the TSC delta for the CPU given by its CPU set index.
4521 *
4522 * @returns VBox status code.
4523 * @retval VERR_INTERRUPTED if interrupted while waiting.
4524 * @retval VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED if we were unable to get a
4525 * measurement.
4526 * @retval VERR_CPU_OFFLINE if the specified CPU is offline.
4527 *
4528 * @param pSession The caller's session. GIP must've been mapped.
4529 * @param iCpuSet The CPU set index of the CPU to measure.
4530 * @param fFlags Flags, SUP_TSCDELTA_MEASURE_F_XXX.
4531 * @param cMsWaitRetry Number of milliseconds to wait between each retry.
4532 * @param cMsWaitThread Number of milliseconds to wait for the thread to get
4533 * ready.
4534 * @param cTries Number of times to try, pass 0 for the default.
4535 */
4536SUPR0DECL(int) SUPR0TscDeltaMeasureBySetIndex(PSUPDRVSESSION pSession, uint32_t iCpuSet, uint32_t fFlags,
4537 RTMSINTERVAL cMsWaitRetry, RTMSINTERVAL cMsWaitThread, uint32_t cTries)
4538{
4539 PSUPDRVDEVEXT pDevExt;
4540 PSUPGLOBALINFOPAGE pGip;
4541 uint16_t iGipCpu;
4542 int rc;
4543#ifdef SUPDRV_USE_TSC_DELTA_THREAD
4544 uint64_t msTsStartWait;
4545 uint32_t iWaitLoop;
4546#endif
4547
4548 /*
4549 * Validate and adjust the input.
4550 */
4551 AssertReturn(SUP_IS_SESSION_VALID(pSession), VERR_INVALID_PARAMETER);
4552 if (!pSession->fGipReferenced)
4553 return VERR_WRONG_ORDER;
4554
4555 pDevExt = pSession->pDevExt;
4556 AssertReturn(SUP_IS_DEVEXT_VALID(pDevExt), VERR_INVALID_PARAMETER);
4557
4558 pGip = pDevExt->pGip;
4559 AssertPtrReturn(pGip, VERR_INTERNAL_ERROR_2);
4560
4561 AssertReturn(iCpuSet < RTCPUSET_MAX_CPUS, VERR_INVALID_CPU_INDEX);
4562 AssertReturn(iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx), VERR_INVALID_CPU_INDEX);
4563 iGipCpu = pGip->aiCpuFromCpuSetIdx[iCpuSet];
4564 AssertReturn(iGipCpu < pGip->cCpus, VERR_INVALID_CPU_INDEX);
4565
4566 if (fFlags & ~SUP_TSCDELTA_MEASURE_F_VALID_MASK)
4567 return VERR_INVALID_FLAGS;
4568
4569 /*
4570 * The request is a noop if the TSC delta isn't being used.
4571 */
4572 if (pGip->enmUseTscDelta <= SUPGIPUSETSCDELTA_ZERO_CLAIMED)
4573 return VINF_SUCCESS;
4574
4575 if (cTries == 0)
4576 cTries = 12;
4577 else if (cTries > 256)
4578 cTries = 256;
4579
4580 if (cMsWaitRetry == 0)
4581 cMsWaitRetry = 2;
4582 else if (cMsWaitRetry > 1000)
4583 cMsWaitRetry = 1000;
4584
4585#ifdef SUPDRV_USE_TSC_DELTA_THREAD
4586 /*
4587 * Has the TSC already been measured and we're not forced to redo it?
4588 */
4589 if ( pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX
4590 && !(fFlags & SUP_TSCDELTA_MEASURE_F_FORCE))
4591 return VINF_SUCCESS;
4592
4593 /*
4594 * Asynchronous request? Forward it to the thread, no waiting.
4595 */
4596 if (fFlags & SUP_TSCDELTA_MEASURE_F_ASYNC)
4597 {
4598 /** @todo Async. doesn't implement options like retries, waiting. We'll need
4599 * to pass those options to the thread somehow and implement it in the
4600 * thread. Check if anyone uses/needs fAsync before implementing this. */
4601 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4602 RTCpuSetAddByIndex(&pDevExt->TscDeltaCpuSet, iCpuSet);
4603 if ( pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Listening
4604 || pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Measuring)
4605 {
4606 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_WaitAndMeasure;
4607 rc = VINF_SUCCESS;
4608 }
4609 else if (pDevExt->enmTscDeltaThreadState != kTscDeltaThreadState_WaitAndMeasure)
4610 rc = VERR_THREAD_IS_DEAD;
4611 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4612 RTThreadUserSignal(pDevExt->hTscDeltaThread);
4613 return VINF_SUCCESS;
4614 }
4615
4616 /*
4617 * If a TSC-delta measurement request is already being serviced by the thread,
4618 * wait 'cTries' times if a retry-timeout is provided, otherwise bail as busy.
4619 */
4620 msTsStartWait = RTTimeSystemMilliTS();
4621 for (iWaitLoop = 0;; iWaitLoop++)
4622 {
4623 uint64_t cMsElapsed;
4624 SUPDRVTSCDELTATHREADSTATE enmState;
4625 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4626 enmState = pDevExt->enmTscDeltaThreadState;
4627 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4628
4629 if (enmState == kTscDeltaThreadState_Measuring)
4630 { /* Must wait, the thread is busy. */ }
4631 else if (enmState == kTscDeltaThreadState_WaitAndMeasure)
4632 { /* Must wait, this state only says what will happen next. */ }
4633 else if (enmState == kTscDeltaThreadState_Terminating)
4634 { /* Must wait, this state only says what should happen next. */ }
4635 else
4636 break; /* All other states, the thread is either idly listening or dead. */
4637
4638 /* Wait or fail. */
4639 if (cMsWaitThread == 0)
4640 return VERR_SUPDRV_TSC_DELTA_MEASUREMENT_BUSY;
4641 cMsElapsed = RTTimeSystemMilliTS() - msTsStartWait;
4642 if (cMsElapsed >= cMsWaitThread)
4643 return VERR_SUPDRV_TSC_DELTA_MEASUREMENT_BUSY;
4644
4645 rc = RTThreadSleep(RT_MIN((RTMSINTERVAL)(cMsWaitThread - cMsElapsed), RT_MIN(iWaitLoop + 1, 10)));
4646 if (rc == VERR_INTERRUPTED)
4647 return rc;
4648 }
4649#endif /* SUPDRV_USE_TSC_DELTA_THREAD */
4650
4651 /*
4652 * Try measure the TSC delta the given number of times.
4653 */
4654 for (;;)
4655 {
4656 /* Unless we're forced to measure the delta, check whether it's done already. */
4657 if ( !(fFlags & SUP_TSCDELTA_MEASURE_F_FORCE)
4658 && pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX)
4659 {
4660 rc = VINF_SUCCESS;
4661 break;
4662 }
4663
4664 /* Measure it. */
4665 rc = supdrvTscMeasureDeltaOne(pDevExt, iGipCpu);
4666 if (rc != VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED)
4667 {
4668 Assert(pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX || RT_FAILURE_NP(rc));
4669 break;
4670 }
4671
4672 /* Retry? */
4673 if (cTries <= 1)
4674 break;
4675 cTries--;
4676
4677 /* Always delay between retries (be nice to the rest of the system
4678 and avoid the BSOD hounds). */
4679 rc = RTThreadSleep(cMsWaitRetry);
4680 if (rc == VERR_INTERRUPTED)
4681 break;
4682 }
4683
4684 return rc;
4685}
4686
4687
4688/**
4689 * Service a TSC-delta measurement request.
4690 *
4691 * @returns VBox status code.
4692 * @param pDevExt Pointer to the device instance data.
4693 * @param pSession The support driver session.
4694 * @param pReq Pointer to the TSC-delta measurement request.
4695 */
4696int VBOXCALL supdrvIOCtl_TscDeltaMeasure(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, PSUPTSCDELTAMEASURE pReq)
4697{
4698 uint32_t cTries;
4699 uint32_t iCpuSet;
4700 uint32_t fFlags;
4701 RTMSINTERVAL cMsWaitRetry;
4702 RT_NOREF1(pDevExt);
4703
4704 /*
4705 * Validate and adjust/resolve the input so they can be passed onto SUPR0TscDeltaMeasureBySetIndex.
4706 */
4707 AssertPtr(pDevExt); AssertPtr(pSession); AssertPtr(pReq); /* paranoia^2 */
4708
4709 if (pReq->u.In.idCpu == NIL_RTCPUID)
4710 return VERR_INVALID_CPU_ID;
4711 iCpuSet = RTMpCpuIdToSetIndex(pReq->u.In.idCpu);
4712 if (iCpuSet >= RTCPUSET_MAX_CPUS)
4713 return VERR_INVALID_CPU_ID;
4714
4715 cTries = pReq->u.In.cRetries == 0 ? 0 : (uint32_t)pReq->u.In.cRetries + 1;
4716
4717 cMsWaitRetry = RT_MAX(pReq->u.In.cMsWaitRetry, 5);
4718
4719 fFlags = 0;
4720 if (pReq->u.In.fAsync)
4721 fFlags |= SUP_TSCDELTA_MEASURE_F_ASYNC;
4722 if (pReq->u.In.fForce)
4723 fFlags |= SUP_TSCDELTA_MEASURE_F_FORCE;
4724
4725 return SUPR0TscDeltaMeasureBySetIndex(pSession, iCpuSet, fFlags, cMsWaitRetry,
4726 cTries == 0 ? 5 * RT_MS_1SEC : cMsWaitRetry * cTries /*cMsWaitThread*/,
4727 cTries);
4728}
4729
4730
4731/**
4732 * Reads TSC with delta applied.
4733 *
4734 * Will try to resolve delta value INT64_MAX before applying it. This is the
4735 * main purpose of this function, to handle the case where the delta needs to be
4736 * determined.
4737 *
4738 * @returns VBox status code.
4739 * @param pDevExt Pointer to the device instance data.
4740 * @param pSession The support driver session.
4741 * @param pReq Pointer to the TSC-read request.
4742 */
4743int VBOXCALL supdrvIOCtl_TscRead(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, PSUPTSCREAD pReq)
4744{
4745 PSUPGLOBALINFOPAGE pGip;
4746 int rc;
4747
4748 /*
4749 * Validate. We require the client to have mapped GIP (no asserting on
4750 * ring-3 preconditions).
4751 */
4752 AssertPtr(pDevExt); AssertPtr(pReq); AssertPtr(pSession); /* paranoia^2 */
4753 if (pSession->GipMapObjR3 == NIL_RTR0MEMOBJ)
4754 return VERR_WRONG_ORDER;
4755 pGip = pDevExt->pGip;
4756 AssertReturn(pGip, VERR_INTERNAL_ERROR_2);
4757
4758 /*
4759 * We're usually here because we need to apply delta, but we shouldn't be
4760 * upset if the GIP is some different mode.
4761 */
4762 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
4763 {
4764 uint32_t cTries = 0;
4765 for (;;)
4766 {
4767 /*
4768 * Start by gathering the data, using CLI for disabling preemption
4769 * while we do that.
4770 */
4771 RTCCUINTREG fEFlags = ASMIntDisableFlags();
4772 int iCpuSet = RTMpCpuIdToSetIndex(RTMpCpuId());
4773 int iGipCpu;
4774 if (RT_LIKELY( (unsigned)iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
4775 && (iGipCpu = pGip->aiCpuFromCpuSetIdx[iCpuSet]) < pGip->cCpus ))
4776 {
4777 int64_t i64Delta = pGip->aCPUs[iGipCpu].i64TSCDelta;
4778 pReq->u.Out.idApic = pGip->aCPUs[iGipCpu].idApic;
4779 pReq->u.Out.u64AdjustedTsc = ASMReadTSC();
4780 ASMSetFlags(fEFlags);
4781
4782 /*
4783 * If we're lucky we've got a delta, but no predictions here
4784 * as this I/O control is normally only used when the TSC delta
4785 * is set to INT64_MAX.
4786 */
4787 if (i64Delta != INT64_MAX)
4788 {
4789 pReq->u.Out.u64AdjustedTsc -= i64Delta;
4790 rc = VINF_SUCCESS;
4791 break;
4792 }
4793
4794 /* Give up after a few times. */
4795 if (cTries >= 4)
4796 {
4797 rc = VWRN_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED;
4798 break;
4799 }
4800
4801 /* Need to measure the delta an try again. */
4802 rc = supdrvTscMeasureDeltaOne(pDevExt, iGipCpu);
4803 Assert(pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX || RT_FAILURE_NP(rc));
4804 /** @todo should probably delay on failure... dpc watchdogs */
4805 }
4806 else
4807 {
4808 /* This really shouldn't happen. */
4809 AssertMsgFailed(("idCpu=%#x iCpuSet=%#x (%d)\n", RTMpCpuId(), iCpuSet, iCpuSet));
4810 pReq->u.Out.idApic = ASMGetApicId();
4811 pReq->u.Out.u64AdjustedTsc = ASMReadTSC();
4812 ASMSetFlags(fEFlags);
4813 rc = VERR_INTERNAL_ERROR_5; /** @todo change to warning. */
4814 break;
4815 }
4816 }
4817 }
4818 else
4819 {
4820 /*
4821 * No delta to apply. Easy. Deal with preemption the lazy way.
4822 */
4823 RTCCUINTREG fEFlags = ASMIntDisableFlags();
4824 int iCpuSet = RTMpCpuIdToSetIndex(RTMpCpuId());
4825 int iGipCpu;
4826 if (RT_LIKELY( (unsigned)iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
4827 && (iGipCpu = pGip->aiCpuFromCpuSetIdx[iCpuSet]) < pGip->cCpus ))
4828 pReq->u.Out.idApic = pGip->aCPUs[iGipCpu].idApic;
4829 else
4830 pReq->u.Out.idApic = ASMGetApicId();
4831 pReq->u.Out.u64AdjustedTsc = ASMReadTSC();
4832 ASMSetFlags(fEFlags);
4833 rc = VINF_SUCCESS;
4834 }
4835
4836 return rc;
4837}
4838
4839
4840/**
4841 * Worker for supdrvIOCtl_GipSetFlags.
4842 *
4843 * @returns VBox status code.
4844 * @retval VERR_WRONG_ORDER if an enable-once-per-session flag is set again for
4845 * a session.
4846 *
4847 * @param pDevExt Pointer to the device instance data.
4848 * @param pSession The support driver session.
4849 * @param fOrMask The OR mask of the GIP flags, see SUPGIP_FLAGS_XXX.
4850 * @param fAndMask The AND mask of the GIP flags, see SUPGIP_FLAGS_XXX.
4851 *
4852 * @remarks Caller must own the GIP mutex.
4853 *
4854 * @remarks This function doesn't validate any of the flags.
4855 */
4856static int supdrvGipSetFlags(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, uint32_t fOrMask, uint32_t fAndMask)
4857{
4858 uint32_t cRefs;
4859 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
4860 AssertMsg((fOrMask & fAndMask) == fOrMask, ("%#x & %#x\n", fOrMask, fAndMask)); /* ASSUMED by code below */
4861
4862 /*
4863 * Compute GIP test-mode flags.
4864 */
4865 if (fOrMask & SUPGIP_FLAGS_TESTING_ENABLE)
4866 {
4867 if (!pSession->fGipTestMode)
4868 {
4869 Assert(pDevExt->cGipTestModeRefs < _64K);
4870 pSession->fGipTestMode = true;
4871 cRefs = ++pDevExt->cGipTestModeRefs;
4872 if (cRefs == 1)
4873 {
4874 fOrMask |= SUPGIP_FLAGS_TESTING | SUPGIP_FLAGS_TESTING_START;
4875 fAndMask &= ~SUPGIP_FLAGS_TESTING_STOP;
4876 }
4877 }
4878 else
4879 {
4880 LogRelMax(10, ("supdrvGipSetFlags: SUPGIP_FLAGS_TESTING_ENABLE already set for this session\n"));
4881 return VERR_WRONG_ORDER;
4882 }
4883 }
4884 else if ( !(fAndMask & SUPGIP_FLAGS_TESTING_ENABLE)
4885 && pSession->fGipTestMode)
4886 {
4887 Assert(pDevExt->cGipTestModeRefs > 0);
4888 Assert(pDevExt->cGipTestModeRefs < _64K);
4889 pSession->fGipTestMode = false;
4890 cRefs = --pDevExt->cGipTestModeRefs;
4891 if (!cRefs)
4892 fOrMask |= SUPGIP_FLAGS_TESTING_STOP;
4893 else
4894 fAndMask |= SUPGIP_FLAGS_TESTING_ENABLE;
4895 }
4896
4897 /*
4898 * Commit the flags. This should be done as atomically as possible
4899 * since the flag consumers won't be holding the GIP mutex.
4900 */
4901 ASMAtomicOrU32(&pGip->fFlags, fOrMask);
4902 ASMAtomicAndU32(&pGip->fFlags, fAndMask);
4903
4904 return VINF_SUCCESS;
4905}
4906
4907
4908/**
4909 * Sets GIP test mode parameters.
4910 *
4911 * @returns VBox status code.
4912 * @param pDevExt Pointer to the device instance data.
4913 * @param pSession The support driver session.
4914 * @param fOrMask The OR mask of the GIP flags, see SUPGIP_FLAGS_XXX.
4915 * @param fAndMask The AND mask of the GIP flags, see SUPGIP_FLAGS_XXX.
4916 */
4917int VBOXCALL supdrvIOCtl_GipSetFlags(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, uint32_t fOrMask, uint32_t fAndMask)
4918{
4919 PSUPGLOBALINFOPAGE pGip;
4920 int rc;
4921
4922 /*
4923 * Validate. We require the client to have mapped GIP (no asserting on
4924 * ring-3 preconditions).
4925 */
4926 AssertPtr(pDevExt); AssertPtr(pSession); /* paranoia^2 */
4927 if (pSession->GipMapObjR3 == NIL_RTR0MEMOBJ)
4928 return VERR_WRONG_ORDER;
4929 pGip = pDevExt->pGip;
4930 AssertReturn(pGip, VERR_INTERNAL_ERROR_3);
4931
4932 if (fOrMask & ~SUPGIP_FLAGS_VALID_MASK)
4933 return VERR_INVALID_PARAMETER;
4934 if ((fAndMask & ~SUPGIP_FLAGS_VALID_MASK) != ~SUPGIP_FLAGS_VALID_MASK)
4935 return VERR_INVALID_PARAMETER;
4936
4937 /*
4938 * Don't confuse supdrvGipSetFlags or anyone else by both setting
4939 * and clearing the same flags. AND takes precedence.
4940 */
4941 fOrMask &= fAndMask;
4942
4943 /*
4944 * Take the loader lock to avoid having to think about races between two
4945 * clients changing the flags at the same time (state is not simple).
4946 */
4947#ifdef SUPDRV_USE_MUTEX_FOR_GIP
4948 RTSemMutexRequest(pDevExt->mtxGip, RT_INDEFINITE_WAIT);
4949#else
4950 RTSemFastMutexRequest(pDevExt->mtxGip);
4951#endif
4952
4953 rc = supdrvGipSetFlags(pDevExt, pSession, fOrMask, fAndMask);
4954
4955#ifdef SUPDRV_USE_MUTEX_FOR_GIP
4956 RTSemMutexRelease(pDevExt->mtxGip);
4957#else
4958 RTSemFastMutexRelease(pDevExt->mtxGip);
4959#endif
4960 return rc;
4961}
4962
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette