VirtualBox

source: vbox/trunk/src/VBox/HostDrivers/Support/SUPDrvGip.cpp@ 54412

Last change on this file since 54412 was 54409, checked in by vboxsync, 10 years ago

SUPDrvGip.cpp: Use RTMpOnPair.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 168.5 KB
Line 
1/* $Id: SUPDrvGip.cpp 54409 2015-02-24 02:07:23Z vboxsync $ */
2/** @file
3 * VBoxDrv - The VirtualBox Support Driver - Common code for GIP.
4 */
5
6/*
7 * Copyright (C) 2006-2015 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27/*******************************************************************************
28* Header Files *
29*******************************************************************************/
30#define LOG_GROUP LOG_GROUP_SUP_DRV
31#define SUPDRV_AGNOSTIC
32#include "SUPDrvInternal.h"
33#ifndef PAGE_SHIFT
34# include <iprt/param.h>
35#endif
36#include <iprt/asm.h>
37#include <iprt/asm-amd64-x86.h>
38#include <iprt/asm-math.h>
39#include <iprt/cpuset.h>
40#include <iprt/handletable.h>
41#include <iprt/mem.h>
42#include <iprt/mp.h>
43#include <iprt/power.h>
44#include <iprt/process.h>
45#include <iprt/semaphore.h>
46#include <iprt/spinlock.h>
47#include <iprt/thread.h>
48#include <iprt/uuid.h>
49#include <iprt/net.h>
50#include <iprt/crc.h>
51#include <iprt/string.h>
52#include <iprt/timer.h>
53#if defined(RT_OS_DARWIN) || defined(RT_OS_SOLARIS) || defined(RT_OS_FREEBSD)
54# include <iprt/rand.h>
55# include <iprt/path.h>
56#endif
57#include <iprt/uint128.h>
58#include <iprt/x86.h>
59
60#include <VBox/param.h>
61#include <VBox/log.h>
62#include <VBox/err.h>
63
64#if defined(RT_OS_SOLARIS) || defined(RT_OS_DARWIN)
65# include "dtrace/SUPDrv.h"
66#else
67/* ... */
68#endif
69
70
71/*******************************************************************************
72* Defined Constants And Macros *
73*******************************************************************************/
74/** The frequency by which we recalculate the u32UpdateHz and
75 * u32UpdateIntervalNS GIP members. The value must be a power of 2.
76 *
77 * Warning: Bumping this too high might overflow u32UpdateIntervalNS.
78 */
79#define GIP_UPDATEHZ_RECALC_FREQ 0x800
80
81/** A reserved TSC value used for synchronization as well as measurement of
82 * TSC deltas. */
83#define GIP_TSC_DELTA_RSVD UINT64_MAX
84/** The number of TSC delta measurement loops in total (includes primer and
85 * read-time loops). */
86#define GIP_TSC_DELTA_LOOPS 96
87/** The number of cache primer loops. */
88#define GIP_TSC_DELTA_PRIMER_LOOPS 4
89/** The number of loops during which we keep computing the minimum read time. */
90#define GIP_TSC_DELTA_READ_TIME_LOOPS 24
91
92/** @name Master / worker synchronization values.
93 * @{ */
94/** Stop measurement of TSC delta. */
95#define GIP_TSC_DELTA_SYNC_STOP UINT32_C(0)
96/** Start measurement of TSC delta. */
97#define GIP_TSC_DELTA_SYNC_START UINT32_C(1)
98/** Worker thread is ready for reading the TSC. */
99#define GIP_TSC_DELTA_SYNC_WORKER_READY UINT32_C(2)
100/** Worker thread is done updating TSC delta info. */
101#define GIP_TSC_DELTA_SYNC_WORKER_DONE UINT32_C(3)
102/** When IPRT isn't concurrency-safe: Master is ready and will wait for worker
103 * with a timeout. */
104#define GIP_TSC_DELTA_SYNC_PRESTART_MASTER UINT32_C(4)
105/** @} */
106
107/** When IPRT isn't concurrency-safe: Worker is ready after waiting for
108 * master with a timeout. */
109#define GIP_TSC_DELTA_SYNC_PRESTART_WORKER 5
110/** The TSC-refinement interval in seconds. */
111#define GIP_TSC_REFINE_PERIOD_IN_SECS 5
112/** The TSC-delta threshold for the SUPGIPUSETSCDELTA_PRACTICALLY_ZERO rating */
113#define GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO 32
114/** The TSC-delta threshold for the SUPGIPUSETSCDELTA_ROUGHLY_ZERO rating */
115#define GIP_TSC_DELTA_THRESHOLD_ROUGHLY_ZERO 448
116/** The TSC delta value for the initial GIP master - 0 in regular builds.
117 * To test the delta code this can be set to a non-zero value. */
118#if 0
119# define GIP_TSC_DELTA_INITIAL_MASTER_VALUE INT64_C(170139095182512) /* 0x00009abd9854acb0 */
120#else
121# define GIP_TSC_DELTA_INITIAL_MASTER_VALUE INT64_C(0)
122#endif
123
124AssertCompile(GIP_TSC_DELTA_PRIMER_LOOPS < GIP_TSC_DELTA_READ_TIME_LOOPS);
125AssertCompile(GIP_TSC_DELTA_PRIMER_LOOPS + GIP_TSC_DELTA_READ_TIME_LOOPS < GIP_TSC_DELTA_LOOPS);
126
127/** @def VBOX_SVN_REV
128 * The makefile should define this if it can. */
129#ifndef VBOX_SVN_REV
130# define VBOX_SVN_REV 0
131#endif
132
133#if 0 /* Don't start the GIP timers. Useful when debugging the IPRT timer code. */
134# define DO_NOT_START_GIP
135#endif
136
137
138/*******************************************************************************
139* Internal Functions *
140*******************************************************************************/
141static DECLCALLBACK(void) supdrvGipSyncAndInvariantTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick);
142static DECLCALLBACK(void) supdrvGipAsyncTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick);
143static void supdrvGipInitCpu(PSUPGLOBALINFOPAGE pGip, PSUPGIPCPU pCpu, uint64_t u64NanoTS, uint64_t uCpuHz);
144#ifdef SUPDRV_USE_TSC_DELTA_THREAD
145static int supdrvTscDeltaThreadInit(PSUPDRVDEVEXT pDevExt);
146static void supdrvTscDeltaTerm(PSUPDRVDEVEXT pDevExt);
147static void supdrvTscDeltaThreadStartMeasurement(PSUPDRVDEVEXT pDevExt);
148#else
149static int supdrvMeasureInitialTscDeltas(PSUPDRVDEVEXT pDevExt);
150static int supdrvMeasureTscDeltaOne(PSUPDRVDEVEXT pDevExt, uint32_t idxWorker);
151#endif
152
153
154/*******************************************************************************
155* Global Variables *
156*******************************************************************************/
157DECLEXPORT(PSUPGLOBALINFOPAGE) g_pSUPGlobalInfoPage = NULL;
158
159
160
161/*
162 *
163 * Misc Common GIP Code
164 * Misc Common GIP Code
165 * Misc Common GIP Code
166 *
167 *
168 */
169
170
171/**
172 * Finds the GIP CPU index corresponding to @a idCpu.
173 *
174 * @returns GIP CPU array index, UINT32_MAX if not found.
175 * @param pGip The GIP.
176 * @param idCpu The CPU ID.
177 */
178static uint32_t supdrvGipFindCpuIndexForCpuId(PSUPGLOBALINFOPAGE pGip, RTCPUID idCpu)
179{
180 uint32_t i;
181 for (i = 0; i < pGip->cCpus; i++)
182 if (pGip->aCPUs[i].idCpu == idCpu)
183 return i;
184 return UINT32_MAX;
185}
186
187
188
189/*
190 *
191 * GIP Mapping and Unmapping Related Code.
192 * GIP Mapping and Unmapping Related Code.
193 * GIP Mapping and Unmapping Related Code.
194 *
195 *
196 */
197
198
199/**
200 * (Re-)initializes the per-cpu structure prior to starting or resuming the GIP
201 * updating.
202 *
203 * @param pGip Pointer to the GIP.
204 * @param pGipCpu The per CPU structure for this CPU.
205 * @param u64NanoTS The current time.
206 */
207static void supdrvGipReInitCpu(PSUPGLOBALINFOPAGE pGip, PSUPGIPCPU pGipCpu, uint64_t u64NanoTS)
208{
209 /*
210 * Here we don't really care about applying the TSC delta. The re-initialization of this
211 * value is not relevant especially while (re)starting the GIP as the first few ones will
212 * be ignored anyway, see supdrvGipDoUpdateCpu().
213 */
214 pGipCpu->u64TSC = ASMReadTSC() - pGipCpu->u32UpdateIntervalTSC;
215 pGipCpu->u64NanoTS = u64NanoTS;
216}
217
218
219/**
220 * Set the current TSC and NanoTS value for the CPU.
221 *
222 * @param idCpu The CPU ID. Unused - we have to use the APIC ID.
223 * @param pvUser1 Pointer to the ring-0 GIP mapping.
224 * @param pvUser2 Pointer to the variable holding the current time.
225 */
226static DECLCALLBACK(void) supdrvGipReInitCpuCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
227{
228 PSUPGLOBALINFOPAGE pGip = (PSUPGLOBALINFOPAGE)pvUser1;
229 unsigned iCpu = pGip->aiCpuFromApicId[ASMGetApicId()];
230
231 if (RT_LIKELY(iCpu < pGip->cCpus && pGip->aCPUs[iCpu].idCpu == idCpu))
232 supdrvGipReInitCpu(pGip, &pGip->aCPUs[iCpu], *(uint64_t *)pvUser2);
233
234 NOREF(pvUser2);
235 NOREF(idCpu);
236}
237
238
/**
 * State structure for supdrvGipDetectGetGipCpuCallback.
 *
 * One instance is initialized by the caller (see SUPR0GipMap) and then updated
 * concurrently by the callback running on every online CPU, hence the volatile
 * qualifiers and the atomic accesses used on these members.
 */
typedef struct SUPDRVGIPDETECTGETCPU
{
    /** Bitmap of APIC IDs that has been seen (initialized to zero).
     * Used to detect duplicate APIC IDs (paranoia). */
    uint8_t volatile bmApicId[256 / 8];
    /** Mask of supported GIP CPU getter methods (SUPGIPGETCPU_XXX) (all bits set
     * initially). The callback clears the methods not detected, so only methods
     * working on every CPU remain set. */
    uint32_t volatile fSupported;
    /** The first callback detecting any kind of range issues (initialized to
     * NIL_RTCPUID).  First reporter wins via compare-and-exchange. */
    RTCPUID volatile idCpuProblem;
} SUPDRVGIPDETECTGETCPU;
/** Pointer to state structure for supdrvGipDetectGetGipCpuCallback. */
typedef SUPDRVGIPDETECTGETCPU *PSUPDRVGIPDETECTGETCPU;
256
257
/**
 * Checks for alternative ways of getting the CPU ID.
 *
 * This also checks the APIC ID, CPU ID and CPU set index values against the
 * GIP tables.
 *
 * Runs on every online CPU (via RTMpOnAll).  Detection results are ANDed into
 * SUPDRVGIPDETECTGETCPU::fSupported, so a method survives only if it works on
 * ALL CPUs.  Any table inconsistency is recorded in ::idCpuProblem (the first
 * reporting CPU wins).
 *
 * @param   idCpu       The CPU ID.  Unused - we have to use the APIC ID.
 * @param   pvUser1     Pointer to the state structure.
 * @param   pvUser2     Pointer to the GIP.
 */
static DECLCALLBACK(void) supdrvGipDetectGetGipCpuCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
{
    PSUPDRVGIPDETECTGETCPU  pState = (PSUPDRVGIPDETECTGETCPU)pvUser1;
    PSUPGLOBALINFOPAGE      pGip   = (PSUPGLOBALINFOPAGE)pvUser2;
    uint32_t                fSupported = 0;
    uint16_t                idApic;
    int                     iCpuSet;

    AssertMsg(idCpu == RTMpCpuId(), ("idCpu=%#x RTMpCpuId()=%#x\n", idCpu, RTMpCpuId())); /* paranoia^3 */

    /*
     * Check that the CPU ID and CPU set index are interchangable.
     */
    iCpuSet = RTMpCpuIdToSetIndex(idCpu);
    if ((RTCPUID)iCpuSet == idCpu)
    {
        AssertCompile(RT_IS_POWER_OF_TWO(RTCPUSET_MAX_CPUS));
        if (   iCpuSet >= 0
            && iCpuSet < RTCPUSET_MAX_CPUS
            && RT_IS_POWER_OF_TWO(RTCPUSET_MAX_CPUS))
        {
            /*
             * Check whether the IDTR.LIMIT contains a CPU number.
             * (Some hosts encode the CPU number in the IDT limit; if the limit
             * minus the nominal IDT size, masked by the max CPU count, equals
             * this CPU's ID on two consecutive reads, accept the method.)
             *
             * NOTE(review): the 64-bit system descriptor size is used for
             * RT_ARCH_X86 and the 8-byte gate size otherwise - this looks
             * inverted relative to the architectures' IDT entry sizes; verify
             * against the host-specific code that sets up this encoding.
             */
#ifdef RT_ARCH_X86
            uint16_t const cbIdt = sizeof(X86DESC64SYSTEM) * 256;
#else
            uint16_t const cbIdt = sizeof(X86DESCGATE) * 256;
#endif
            RTIDTR Idtr;
            ASMGetIDTR(&Idtr);
            if (Idtr.cbIdt >= cbIdt)
            {
                uint32_t uTmp = Idtr.cbIdt - cbIdt;
                uTmp &= RTCPUSET_MAX_CPUS - 1;
                if (uTmp == idCpu)
                {
                    /* Read it again to guard against a racing IDT change. */
                    RTIDTR Idtr2;
                    ASMGetIDTR(&Idtr2);
                    if (Idtr2.cbIdt == Idtr.cbIdt)
                        fSupported |= SUPGIPGETCPU_IDTR_LIMIT_MASK_MAX_SET_CPUS;
                }
            }

            /*
             * Check whether RDTSCP is an option.
             * (Requires the extended CPUID leaf 0x80000001 to advertise RDTSCP
             * and the IA32_TSC_AUX value, masked by the max CPU count, to match
             * this CPU's ID on two consecutive reads.)
             */
            if (ASMHasCpuId())
            {
                if (   ASMIsValidExtRange(ASMCpuId_EAX(UINT32_C(0x80000000)))
                    && (ASMCpuId_EDX(UINT32_C(0x80000001)) & X86_CPUID_EXT_FEATURE_EDX_RDTSCP) )
                {
                    uint32_t uAux;
                    ASMReadTscWithAux(&uAux);
                    if ((uAux & (RTCPUSET_MAX_CPUS - 1)) == idCpu)
                    {
                        ASMNopPause();
                        ASMReadTscWithAux(&uAux);
                        if ((uAux & (RTCPUSET_MAX_CPUS - 1)) == idCpu)
                            fSupported |= SUPGIPGETCPU_RDTSCP_MASK_MAX_SET_CPUS;
                    }
                }
            }
        }
    }

    /*
     * Check that the APIC ID is unique.
     */
    idApic = ASMGetApicId();
    if (RT_LIKELY(   idApic < RT_ELEMENTS(pGip->aiCpuFromApicId)
                  && !ASMAtomicBitTestAndSet(pState->bmApicId, idApic)))
        fSupported |= SUPGIPGETCPU_APIC_ID;
    else
    {
        AssertCompile(sizeof(pState->bmApicId) * 8 == RT_ELEMENTS(pGip->aiCpuFromApicId));
        /* Only the first problem CPU is recorded (cmpxchg from NIL). */
        ASMAtomicCmpXchgU32(&pState->idCpuProblem, idCpu, NIL_RTCPUID);
        LogRel(("supdrvGipDetectGetGipCpuCallback: idCpu=%#x iCpuSet=%d idApic=%#x - duplicate APIC ID.\n",
                idCpu, iCpuSet, idApic));
    }

    /*
     * Check that the iCpuSet is within the expected range.
     */
    if (RT_UNLIKELY(   iCpuSet < 0
                    || (unsigned)iCpuSet >= RTCPUSET_MAX_CPUS
                    || (unsigned)iCpuSet >= RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)))
    {
        ASMAtomicCmpXchgU32(&pState->idCpuProblem, idCpu, NIL_RTCPUID);
        LogRel(("supdrvGipDetectGetGipCpuCallback: idCpu=%#x iCpuSet=%d idApic=%#x - CPU set index is out of range.\n",
                idCpu, iCpuSet, idApic));
    }
    else
    {
        /* Verify that the set index maps back to this CPU ID. */
        RTCPUID idCpu2 = RTMpCpuIdFromSetIndex(iCpuSet);
        if (RT_UNLIKELY(idCpu2 != idCpu))
        {
            ASMAtomicCmpXchgU32(&pState->idCpuProblem, idCpu, NIL_RTCPUID);
            LogRel(("supdrvGipDetectGetGipCpuCallback: idCpu=%#x iCpuSet=%d idApic=%#x - CPU id/index roundtrip problem: %#x\n",
                    idCpu, iCpuSet, idApic, idCpu2));
        }
    }

    /*
     * Update the supported feature mask before we return.
     * AND, not OR: a method must be detected on every CPU to remain supported.
     */
    ASMAtomicAndU32(&pState->fSupported, fSupported);

    NOREF(pvUser2);
}
378
379
380/**
381 * Increase the timer freqency on hosts where this is possible (NT).
382 *
383 * The idea is that more interrupts is better for us... Also, it's better than
384 * we increase the timer frequence, because we might end up getting inaccurate
385 * callbacks if someone else does it.
386 *
387 * @param pDevExt Sets u32SystemTimerGranularityGrant if increased.
388 */
389static void supdrvGipRequestHigherTimerFrequencyFromSystem(PSUPDRVDEVEXT pDevExt)
390{
391 if (pDevExt->u32SystemTimerGranularityGrant == 0)
392 {
393 uint32_t u32SystemResolution;
394 if ( RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 976563 /* 1024 HZ */, &u32SystemResolution))
395 || RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 1000000 /* 1000 HZ */, &u32SystemResolution))
396 || RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 1953125 /* 512 HZ */, &u32SystemResolution))
397 || RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 2000000 /* 500 HZ */, &u32SystemResolution))
398 )
399 {
400 Assert(RTTimerGetSystemGranularity() <= u32SystemResolution);
401 pDevExt->u32SystemTimerGranularityGrant = u32SystemResolution;
402 }
403 }
404}
405
406
407/**
408 * Undoes supdrvGipRequestHigherTimerFrequencyFromSystem.
409 *
410 * @param pDevExt Clears u32SystemTimerGranularityGrant.
411 */
412static void supdrvGipReleaseHigherTimerFrequencyFromSystem(PSUPDRVDEVEXT pDevExt)
413{
414 if (pDevExt->u32SystemTimerGranularityGrant)
415 {
416 int rc2 = RTTimerReleaseSystemGranularity(pDevExt->u32SystemTimerGranularityGrant);
417 AssertRC(rc2);
418 pDevExt->u32SystemTimerGranularityGrant = 0;
419 }
420}
421
422
/**
 * Maps the GIP into userspace and/or get the physical address of the GIP.
 *
 * @returns IPRT status code.
 * @param   pSession    Session to which the GIP mapping should belong.
 * @param   ppGipR3     Where to store the address of the ring-3 mapping. (optional)
 * @param   pHCPhysGip  Where to store the physical address. (optional)
 *
 * @remark  There is no reference counting on the mapping, so one call to this
 *          function counts globally as one reference.  One call to
 *          SUPR0GipUnmap() will unmap the GIP and remove the session as a GIP
 *          user.
 */
SUPR0DECL(int) SUPR0GipMap(PSUPDRVSESSION pSession, PRTR3PTR ppGipR3, PRTHCPHYS pHCPhysGip)
{
    int rc;
    PSUPDRVDEVEXT pDevExt = pSession->pDevExt;
    RTR3PTR pGipR3 = NIL_RTR3PTR;
    RTHCPHYS HCPhys = NIL_RTHCPHYS;
    LogFlow(("SUPR0GipMap: pSession=%p ppGipR3=%p pHCPhysGip=%p\n", pSession, ppGipR3, pHCPhysGip));

    /*
     * Validate
     */
    AssertReturn(SUP_IS_SESSION_VALID(pSession), VERR_INVALID_PARAMETER);
    AssertPtrNullReturn(ppGipR3, VERR_INVALID_POINTER);
    AssertPtrNullReturn(pHCPhysGip, VERR_INVALID_POINTER);

    /* Serialize against other map/unmap calls. */
#ifdef SUPDRV_USE_MUTEX_FOR_GIP
    RTSemMutexRequest(pDevExt->mtxGip, RT_INDEFINITE_WAIT);
#else
    RTSemFastMutexRequest(pDevExt->mtxGip);
#endif
    if (pDevExt->pGip)
    {
        /*
         * Map it?  (Read-only ring-3 mapping, created once per session.)
         */
        rc = VINF_SUCCESS;
        if (ppGipR3)
        {
            if (pSession->GipMapObjR3 == NIL_RTR0MEMOBJ)
                rc = RTR0MemObjMapUser(&pSession->GipMapObjR3, pDevExt->GipMemObj, (RTR3PTR)-1, 0,
                                       RTMEM_PROT_READ, RTR0ProcHandleSelf());
            if (RT_SUCCESS(rc))
                pGipR3 = RTR0MemObjAddressR3(pSession->GipMapObjR3);
        }

        /*
         * Get physical address.
         */
        if (pHCPhysGip && RT_SUCCESS(rc))
            HCPhys = pDevExt->HCPhysGip;

        /*
         * Reference globally.  The first global user (re)starts GIP updating.
         */
        if (!pSession->fGipReferenced && RT_SUCCESS(rc))
        {
            pSession->fGipReferenced = 1;
            pDevExt->cGipUsers++;
            if (pDevExt->cGipUsers == 1)
            {
                PSUPGLOBALINFOPAGE pGipR0 = pDevExt->pGip;
                uint64_t u64NanoTS;

                /*
                 * GIP starts/resumes updating again. On windows we bump the
                 * host timer frequency to make sure we don't get stuck in guest
                 * mode and to get better timer (and possibly clock) accuracy.
                 */
                LogFlow(("SUPR0GipMap: Resumes GIP updating\n"));

                supdrvGipRequestHigherTimerFrequencyFromSystem(pDevExt);

                /*
                 * When resuming (i.e. aCPUs[0].u32TransactionId isn't at its
                 * initial value of 2 any more), round every CPU's transaction
                 * id up to the next multiple of GIP_UPDATEHZ_RECALC_FREQ * 2
                 * and clear the last update-Hz timestamp - presumably so the
                 * update-interval/Hz recalculation restarts cleanly instead of
                 * using stale history; confirm against supdrvGipUpdate.
                 */
                if (pGipR0->aCPUs[0].u32TransactionId != 2 /* not the first time */)
                {
                    unsigned i;
                    for (i = 0; i < pGipR0->cCpus; i++)
                        ASMAtomicUoWriteU32(&pGipR0->aCPUs[i].u32TransactionId,
                                            (pGipR0->aCPUs[i].u32TransactionId + GIP_UPDATEHZ_RECALC_FREQ * 2)
                                            & ~(GIP_UPDATEHZ_RECALC_FREQ * 2 - 1));
                    ASMAtomicWriteU64(&pGipR0->u64NanoTSLastUpdateHz, 0);
                }

                /*
                 * Re-initialize the TSC/NanoTS anchor points, back-dating the
                 * nano timestamp by one update interval.  In invariant/sync
                 * TSC modes (or with a single online CPU) only aCPUs[0] is
                 * updated by the timer, so re-initializing that entry is
                 * enough; otherwise every CPU runs the callback for its own
                 * entry.
                 */
                u64NanoTS = RTTimeSystemNanoTS() - pGipR0->u32UpdateIntervalNS;
                if (   pGipR0->u32Mode == SUPGIPMODE_INVARIANT_TSC
                    || pGipR0->u32Mode == SUPGIPMODE_SYNC_TSC
                    || RTMpGetOnlineCount() == 1)
                    supdrvGipReInitCpu(pGipR0, &pGipR0->aCPUs[0], u64NanoTS);
                else
                    RTMpOnAll(supdrvGipReInitCpuCallback, pGipR0, &u64NanoTS);

                /*
                 * Detect alternative ways to figure the CPU ID in ring-3 and
                 * raw-mode context. Check the sanity of the APIC IDs, CPU IDs,
                 * and CPU set indexes while we're at it.
                 */
                if (RT_SUCCESS(rc))
                {
                    SUPDRVGIPDETECTGETCPU DetectState;
                    RT_BZERO((void *)&DetectState.bmApicId, sizeof(DetectState.bmApicId));
                    DetectState.fSupported = UINT32_MAX;
                    DetectState.idCpuProblem = NIL_RTCPUID;
                    rc = RTMpOnAll(supdrvGipDetectGetGipCpuCallback, &DetectState, pGipR0);
                    if (DetectState.idCpuProblem == NIL_RTCPUID)
                    {
                        if (   DetectState.fSupported != UINT32_MAX
                            && DetectState.fSupported != 0)
                        {
                            /* Publish the detected methods if they changed. */
                            if (pGipR0->fGetGipCpu != DetectState.fSupported)
                            {
                                pGipR0->fGetGipCpu = DetectState.fSupported;
                                LogRel(("SUPR0GipMap: fGetGipCpu=%#x\n", DetectState.fSupported));
                            }
                        }
                        else
                        {
                            LogRel(("SUPR0GipMap: No supported ways of getting the APIC ID or CPU number in ring-3! (%#x)\n",
                                    DetectState.fSupported));
                            rc = VERR_UNSUPPORTED_CPU;
                        }
                    }
                    else
                    {
                        LogRel(("SUPR0GipMap: APIC ID, CPU ID or CPU set index problem detected on CPU #%u (%#x)!\n",
                                DetectState.idCpuProblem, DetectState.idCpuProblem));
                        rc = VERR_INVALID_CPU_ID;
                    }
                }

                /*
                 * Start the GIP timer if all is well..
                 */
                if (RT_SUCCESS(rc))
                {
#ifndef DO_NOT_START_GIP
                    rc = RTTimerStart(pDevExt->pGipTimer, 0 /* fire ASAP */); AssertRC(rc);
#endif
                    rc = VINF_SUCCESS;
                }

                /*
                 * Bail out on error: undo the reference and the mapping so the
                 * session is left in its pre-call state.
                 */
                if (RT_FAILURE(rc))
                {
                    LogRel(("SUPR0GipMap: failed rc=%Rrc\n", rc));
                    pDevExt->cGipUsers = 0;
                    pSession->fGipReferenced = 0;
                    if (pSession->GipMapObjR3 != NIL_RTR0MEMOBJ)
                    {
                        int rc2 = RTR0MemObjFree(pSession->GipMapObjR3, false); AssertRC(rc2);
                        if (RT_SUCCESS(rc2))
                            pSession->GipMapObjR3 = NIL_RTR0MEMOBJ;
                    }
                    HCPhys = NIL_RTHCPHYS;
                    pGipR3 = NIL_RTR3PTR;
                }
            }
        }
    }
    else
    {
        rc = VERR_GENERAL_FAILURE;
        Log(("SUPR0GipMap: GIP is not available!\n"));
    }
#ifdef SUPDRV_USE_MUTEX_FOR_GIP
    RTSemMutexRelease(pDevExt->mtxGip);
#else
    RTSemFastMutexRelease(pDevExt->mtxGip);
#endif

    /*
     * Write returns.
     */
    if (pHCPhysGip)
        *pHCPhysGip = HCPhys;
    if (ppGipR3)
        *ppGipR3 = pGipR3;

#ifdef DEBUG_DARWIN_GIP
    OSDBGPRINT(("SUPR0GipMap: returns %d *pHCPhysGip=%lx pGipR3=%p\n", rc, (unsigned long)HCPhys, (void *)pGipR3));
#else
    LogFlow(( "SUPR0GipMap: returns %d *pHCPhysGip=%lx pGipR3=%p\n", rc, (unsigned long)HCPhys, (void *)pGipR3));
#endif
    return rc;
}
616
617
/**
 * Unmaps any user mapping of the GIP and terminates all GIP access
 * from this session.
 *
 * Counterpart of SUPR0GipMap.  The last global user also stops the GIP timer
 * and releases the raised system timer granularity.
 *
 * @returns IPRT status code.
 * @param   pSession    Session to which the GIP mapping should belong.
 */
SUPR0DECL(int) SUPR0GipUnmap(PSUPDRVSESSION pSession)
{
    int rc = VINF_SUCCESS;
    PSUPDRVDEVEXT pDevExt = pSession->pDevExt;
#ifdef DEBUG_DARWIN_GIP
    OSDBGPRINT(("SUPR0GipUnmap: pSession=%p pGip=%p GipMapObjR3=%p\n",
                pSession,
                pSession->GipMapObjR3 != NIL_RTR0MEMOBJ ? RTR0MemObjAddress(pSession->GipMapObjR3) : NULL,
                pSession->GipMapObjR3));
#else
    LogFlow(("SUPR0GipUnmap: pSession=%p\n", pSession));
#endif
    AssertReturn(SUP_IS_SESSION_VALID(pSession), VERR_INVALID_PARAMETER);

    /* Serialize against other map/unmap calls. */
#ifdef SUPDRV_USE_MUTEX_FOR_GIP
    RTSemMutexRequest(pDevExt->mtxGip, RT_INDEFINITE_WAIT);
#else
    RTSemFastMutexRequest(pDevExt->mtxGip);
#endif

    /*
     * Unmap anything?
     */
    if (pSession->GipMapObjR3 != NIL_RTR0MEMOBJ)
    {
        rc = RTR0MemObjFree(pSession->GipMapObjR3, false);
        AssertRC(rc);
        if (RT_SUCCESS(rc))
            pSession->GipMapObjR3 = NIL_RTR0MEMOBJ;
    }

    /*
     * Dereference global GIP.  Only done when the unmapping (above) succeeded,
     * and only the transition to zero users suspends the updating.
     */
    if (pSession->fGipReferenced && !rc)
    {
        pSession->fGipReferenced = 0;
        if (   pDevExt->cGipUsers > 0
            && !--pDevExt->cGipUsers)
        {
            LogFlow(("SUPR0GipUnmap: Suspends GIP updating\n"));
#ifndef DO_NOT_START_GIP
            rc = RTTimerStop(pDevExt->pGipTimer); AssertRC(rc); rc = VINF_SUCCESS;
#endif
            supdrvGipReleaseHigherTimerFrequencyFromSystem(pDevExt);
        }
    }

#ifdef SUPDRV_USE_MUTEX_FOR_GIP
    RTSemMutexRelease(pDevExt->mtxGip);
#else
    RTSemFastMutexRelease(pDevExt->mtxGip);
#endif

    return rc;
}
681
682
683/**
684 * Gets the GIP pointer.
685 *
686 * @returns Pointer to the GIP or NULL.
687 */
688SUPDECL(PSUPGLOBALINFOPAGE) SUPGetGIP(void)
689{
690 return g_pSUPGlobalInfoPage;
691}
692
693
694
695
696
697/*
698 *
699 *
700 * GIP Initialization, Termination and CPU Offline / Online Related Code.
701 * GIP Initialization, Termination and CPU Offline / Online Related Code.
702 * GIP Initialization, Termination and CPU Offline / Online Related Code.
703 *
704 *
705 */
706
707/**
708 * Used by supdrvInitRefineInvariantTscFreqTimer and supdrvGipInitMeasureTscFreq
709 * to update the TSC frequency related GIP variables.
710 *
711 * @param pGip The GIP.
712 * @param nsElapsed The number of nano seconds elapsed.
713 * @param cElapsedTscTicks The corresponding number of TSC ticks.
714 */
715static void supdrvGipInitSetCpuFreq(PSUPGLOBALINFOPAGE pGip, uint64_t nsElapsed, uint64_t cElapsedTscTicks)
716{
717 /*
718 * Calculate the frequency.
719 */
720 uint64_t uCpuHz;
721 if ( cElapsedTscTicks < UINT64_MAX / RT_NS_1SEC
722 && nsElapsed < UINT32_MAX)
723 uCpuHz = ASMMultU64ByU32DivByU32(cElapsedTscTicks, RT_NS_1SEC, (uint32_t)nsElapsed);
724 else
725 {
726 RTUINT128U CpuHz, Tmp, Divisor;
727 CpuHz.s.Lo = CpuHz.s.Hi = 0;
728 RTUInt128MulU64ByU64(&Tmp, cElapsedTscTicks, RT_NS_1SEC_64);
729 RTUInt128Div(&CpuHz, &Tmp, RTUInt128AssignU64(&Divisor, nsElapsed));
730 uCpuHz = CpuHz.s.Lo;
731 }
732
733 /*
734 * Update the GIP.
735 */
736 ASMAtomicWriteU64(&pGip->u64CpuHz, uCpuHz);
737 if (pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
738 ASMAtomicWriteU64(&pGip->aCPUs[0].u64CpuHz, uCpuHz);
739}
740
741
/**
 * Timer callback function for TSC frequency refinement in invariant GIP mode.
 *
 * This is started during driver init and fires once
 * GIP_TSC_REFINE_PERIOD_IN_SECS seconds later.
 *
 * It takes a second TSC + nanosecond timestamp sample, computes the counts
 * elapsed since the anchor recorded at start, and derives a refined CPU
 * frequency from the ratio.  It may reschedule itself at one second intervals
 * until the refinement period has passed.
 *
 * @param   pTimer      The timer.
 * @param   pvUser      Opaque pointer to the device instance data.
 * @param   iTick       The timer tick.  Unused here.
 */
static DECLCALLBACK(void) supdrvInitRefineInvariantTscFreqTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
{
    PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
    PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
    RTCPUID idCpu;
    uint64_t cNsElapsed;
    uint64_t cTscTicksElapsed;
    uint64_t nsNow;
    uint64_t uTsc;
    RTCCUINTREG uFlags;

    /* Paranoia. */
    AssertReturnVoid(pGip);
    AssertReturnVoid(pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC);

    /*
     * Try get close to the next clock tick as usual.
     *
     * PORTME: If timers are called from the clock interrupt handler, or
     *         an interrupt handler with higher priority than the clock
     *         interrupt, or spinning for ages in timer handlers is frowned
     *         upon, this loop must be disabled!
     *
     * Darwin, FreeBSD, Linux, Solaris, Windows 8.1+:
     *      High RTTimeSystemNanoTS resolution should prevent any noticeable
     *      spinning here.
     *
     * Windows 8.0 and earlier:
     *      We're running in a DPC here, so we may trigger the DPC watchdog?
     *
     * OS/2:
     *      Timer callbacks are done in the clock interrupt, so skip it.
     */
#if !defined(RT_OS_OS2)
    nsNow = RTTimeSystemNanoTS();
    while (RTTimeSystemNanoTS() == nsNow)
        ASMNopPause();
#endif

    /* Sample TSC, clock and CPU ID back-to-back with interrupts disabled. */
    uFlags = ASMIntDisableFlags();
    uTsc = ASMReadTSC();
    nsNow = RTTimeSystemNanoTS();
    idCpu = RTMpCpuId();
    ASMSetFlags(uFlags);

    cNsElapsed = nsNow - pDevExt->nsStartInvarTscRefine;
    cTscTicksElapsed = uTsc - pDevExt->uTscStartInvarTscRefine;

    /*
     * If the above measurement was taken on a different CPU than the one we
     * started the process on, cTscTicksElapsed will need to be adjusted with
     * the TSC deltas of both the CPUs.
     *
     * We ASSUME that the delta calculation process takes less time than the
     * TSC frequency refinement timer.  If it doesn't, we'll complain and
     * drop the frequency refinement.
     *
     * Note! We cannot entirely trust enmUseTscDelta here because it's
     *       downgraded after each delta calculation.
     */
    if (   idCpu != pDevExt->idCpuInvarTscRefine
        && pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
    {
        uint32_t iStartCpuSet = RTMpCpuIdToSetIndex(pDevExt->idCpuInvarTscRefine);
        uint32_t iStopCpuSet = RTMpCpuIdToSetIndex(idCpu);
        uint16_t iStartGipCpu = iStartCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
                              ? pGip->aiCpuFromCpuSetIdx[iStartCpuSet] : UINT16_MAX;
        uint16_t iStopGipCpu = iStopCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
                             ? pGip->aiCpuFromCpuSetIdx[iStopCpuSet] : UINT16_MAX;
        /* INT64_MAX is used as the "delta not yet measured" marker here. */
        int64_t iStartTscDelta = iStartGipCpu < pGip->cCpus ? pGip->aCPUs[iStartGipCpu].i64TSCDelta : INT64_MAX;
        int64_t iStopTscDelta = iStopGipCpu < pGip->cCpus ? pGip->aCPUs[iStopGipCpu].i64TSCDelta : INT64_MAX;
        if (RT_LIKELY(iStartTscDelta != INT64_MAX && iStopTscDelta != INT64_MAX))
        {
            if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_PRACTICALLY_ZERO)
            {
                /* cTscTicksElapsed = (uTsc - iStopTscDelta) - (pDevExt->uTscStartInvarTscRefine - iStartTscDelta); */
                cTscTicksElapsed += iStartTscDelta - iStopTscDelta;
            }
        }
        /*
         * Allow 5 times the refinement period to elapse before we give up on the TSC delta
         * calculations.
         */
        else if (cNsElapsed <= GIP_TSC_REFINE_PERIOD_IN_SECS * 5 * RT_NS_1SEC_64)
        {
            /* Deltas not available yet - retry in a second. */
            int rc = RTTimerStart(pTimer, RT_NS_1SEC);
            AssertRC(rc);
            return;
        }
        else
        {
            SUPR0Printf("vboxdrv: Failed to refine invariant TSC frequency because deltas are unavailable after %u (%u) seconds\n",
                        (uint32_t)(cNsElapsed / RT_NS_1SEC), GIP_TSC_REFINE_PERIOD_IN_SECS);
            SUPR0Printf("vboxdrv: start: %u, %u, %#llx stop: %u, %u, %#llx\n",
                        iStartCpuSet, iStartGipCpu, iStartTscDelta, iStopCpuSet, iStopGipCpu, iStopTscDelta);
            return;
        }
    }

    /*
     * Calculate and update the CPU frequency variables in GIP.
     *
     * If there is a GIP user already and we've already refined the frequency
     * a couple of times, don't update it as we want a stable frequency value
     * for all VMs.
     */
    if (   pDevExt->cGipUsers == 0
        || cNsElapsed < RT_NS_1SEC * 2)
    {
        supdrvGipInitSetCpuFreq(pGip, cNsElapsed, cTscTicksElapsed);

        /*
         * Reschedule the timer if we haven't yet reached the defined refinement period.
         */
        if (cNsElapsed < GIP_TSC_REFINE_PERIOD_IN_SECS * RT_NS_1SEC_64)
        {
            int rc = RTTimerStart(pTimer, RT_NS_1SEC);
            AssertRC(rc);
        }
    }
}
873
874
875/**
876 * Start the TSC-frequency refinment timer for the invariant TSC GIP mode.
877 *
878 * We cannot use this in the synchronous and asynchronous tsc GIP modes because
879 * the CPU may change the TSC frequence between now and when the timer fires
880 * (supdrvInitAsyncRefineTscTimer).
881 *
882 * @param pDevExt Pointer to the device instance data.
883 * @param pGip Pointer to the GIP.
884 */
885static void supdrvGipInitStartTimerForRefiningInvariantTscFreq(PSUPDRVDEVEXT pDevExt, PSUPGLOBALINFOPAGE pGip)
886{
887 uint64_t u64NanoTS;
888 RTCCUINTREG uFlags;
889 int rc;
890
891 /*
892 * Record the TSC and NanoTS as the starting anchor point for refinement
893 * of the TSC. We try get as close to a clock tick as possible on systems
894 * which does not provide high resolution time.
895 */
896 u64NanoTS = RTTimeSystemNanoTS();
897 while (RTTimeSystemNanoTS() == u64NanoTS)
898 ASMNopPause();
899
900 uFlags = ASMIntDisableFlags();
901 pDevExt->uTscStartInvarTscRefine = ASMReadTSC();
902 pDevExt->nsStartInvarTscRefine = RTTimeSystemNanoTS();
903 pDevExt->idCpuInvarTscRefine = RTMpCpuId();
904 ASMSetFlags(uFlags);
905
906/** @todo we need a power management callback that disables the timer if the
907 * system suspends/resumes. */
908
909 /*
910 * Create a timer that runs on the same CPU so we won't have a depencency
911 * on the TSC-delta and can run in parallel to it. On systems that does not
912 * implement CPU specific timers we'll apply deltas in the timer callback,
913 * just like we do for CPUs going offline.
914 *
915 * The longer the refinement interval the better the accuracy, at least in
916 * theory. If it's too long though, ring-3 may already be starting its
917 * first VMs before we're done. On most systems we will be loading the
918 * support driver during boot and VMs won't be started for a while yet,
919 * it is really only a problem during development (especially with
920 * on-demand driver starting on windows).
921 *
922 * To avoid wasting time doing a long supdrvGipInitMeasureTscFreq() call
923 * to calculate the frequency during driver loading, the timer is set
924 * to fire after 200 ms the first time. It will then reschedule itself
925 * to fire every second until GIP_TSC_REFINE_PERIOD_IN_SECS has been
926 * reached or it notices that there is a user land client with GIP
927 * mapped (we want a stable frequency for all VMs).
928 */
929 rc = RTTimerCreateEx(&pDevExt->pInvarTscRefineTimer, 0 /* one-shot */,
930 RTTIMER_FLAGS_CPU(RTMpCpuIdToSetIndex(pDevExt->idCpuInvarTscRefine)),
931 supdrvInitRefineInvariantTscFreqTimer, pDevExt);
932 if (RT_SUCCESS(rc))
933 {
934 rc = RTTimerStart(pDevExt->pInvarTscRefineTimer, 2*RT_NS_100MS);
935 if (RT_SUCCESS(rc))
936 return;
937 RTTimerDestroy(pDevExt->pInvarTscRefineTimer);
938 }
939
940 if (rc == VERR_CPU_OFFLINE || rc == VERR_NOT_SUPPORTED)
941 {
942 rc = RTTimerCreateEx(&pDevExt->pInvarTscRefineTimer, 0 /* one-shot */, RTTIMER_FLAGS_CPU_ANY,
943 supdrvInitRefineInvariantTscFreqTimer, pDevExt);
944 if (RT_SUCCESS(rc))
945 {
946 rc = RTTimerStart(pDevExt->pInvarTscRefineTimer, 2*RT_NS_100MS);
947 if (RT_SUCCESS(rc))
948 return;
949 RTTimerDestroy(pDevExt->pInvarTscRefineTimer);
950 }
951 }
952
953 pDevExt->pInvarTscRefineTimer = NULL;
954 OSDBGPRINT(("vboxdrv: Failed to create or start TSC frequency refinement timer: rc=%Rrc\n", rc));
955}
956
957
958/**
959 * @callback_method_impl{PFNRTMPWORKER,
960 * RTMpOnSpecific callback for reading TSC and time on the CPU we started
961 * the measurements on.}
962 */
963DECLCALLBACK(void) supdrvGipInitReadTscAndNanoTsOnCpu(RTCPUID idCpu, void *pvUser1, void *pvUser2)
964{
965 RTCCUINTREG uFlags = ASMIntDisableFlags();
966 uint64_t *puTscStop = (uint64_t *)pvUser1;
967 uint64_t *pnsStop = (uint64_t *)pvUser2;
968
969 *puTscStop = ASMReadTSC();
970 *pnsStop = RTTimeSystemNanoTS();
971
972 ASMSetFlags(uFlags);
973}
974
975
/**
 * Measures the TSC frequency of the system.
 *
 * The TSC frequency can vary on systems which are not reported as invariant.
 * On such systems the object of this function is to find out what the nominal,
 * maximum TSC frequency under 'normal' CPU operation.
 *
 * The method is to sample TSC and system time at the start and end of a delay
 * period, detecting and (where possible) compensating for being rescheduled
 * onto a different CPU in between.
 *
 * @returns VBox status code.
 * @retval  VINF_SUCCESS on success; the result is stored via
 *          supdrvGipInitSetCpuFreq().
 * @retval  VERR_SUPDRV_TSC_FREQ_MEASUREMENT_FAILED when all retries are
 *          exhausted (non-rough mode only, see the final assertion).
 * @retval  VERR_INVALID_CPU_INDEX when the GIP CPU mapping tables are broken.
 *
 * @param   pDevExt        Pointer to the device instance.
 * @param   pGip           Pointer to the GIP.
 * @param   fRough         Set if we're doing the rough calculation that the
 *                         TSC measuring code needs, where accuracy isn't all
 *                         that important (too high is better than too low).
 *                         When clear we try for best accuracy that we can
 *                         achieve in reasonably short time.
 */
static int supdrvGipInitMeasureTscFreq(PSUPDRVDEVEXT pDevExt, PSUPGLOBALINFOPAGE pGip, bool fRough)
{
    uint32_t nsTimerIncr = RTTimerGetSystemGranularity();
    int cTriesLeft = fRough ? 4 : 2; /* rough mode uses a shorter delay, so allow more retries */
    while (cTriesLeft-- > 0)
    {
        RTCCUINTREG uFlags;
        uint64_t    nsStart;
        uint64_t    nsStop;
        uint64_t    uTscStart;
        uint64_t    uTscStop;
        RTCPUID     idCpuStart;
        RTCPUID     idCpuStop;

        /*
         * Synchronize with the host OS clock tick on systems without high
         * resolution time API (older Windows version for example).
         */
        nsStart = RTTimeSystemNanoTS();
        while (RTTimeSystemNanoTS() == nsStart)
            ASMNopPause();

        /*
         * Read the TSC and current time, noting which CPU we're on.
         * Interrupts are masked so the three reads are taken close together.
         */
        uFlags = ASMIntDisableFlags();
        uTscStart = ASMReadTSC();
        nsStart = RTTimeSystemNanoTS();
        idCpuStart = RTMpCpuId();
        ASMSetFlags(uFlags);

        /*
         * Delay for a while.
         */
        if (pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC)
        {
            /*
             * Sleep-wait since the TSC frequency is constant, it eases host load.
             * Shorter interval produces more variance in the frequency (esp. Windows).
             */
            uint64_t msElapsed = 0;
            /* Round the target delay up to a whole number of timer ticks, minus a
               100us fudge, then convert to milliseconds for RTThreadSleep. */
            uint64_t msDelay = ( ((fRough ? 16 : 200) * RT_NS_1MS + nsTimerIncr - 1) / nsTimerIncr * nsTimerIncr - RT_NS_100US )
                             / RT_NS_1MS;
            do
            {
                RTThreadSleep((RTMSINTERVAL)(msDelay - msElapsed));
                nsStop = RTTimeSystemNanoTS();
                msElapsed = (nsStop - nsStart) / RT_NS_1MS;
            } while (msElapsed < msDelay);

            /* Align the stop sample with the next clock tick, mirroring the start. */
            while (RTTimeSystemNanoTS() == nsStop)
                ASMNopPause();
        }
        else
        {
            /*
             * Busy-wait keeping the frequency up.
             */
            do
            {
                ASMNopPause();
                nsStop = RTTimeSystemNanoTS();
            } while (nsStop - nsStart < RT_NS_100MS);
        }

        /*
         * Read the TSC and time again.
         */
        uFlags = ASMIntDisableFlags();
        uTscStop = ASMReadTSC();
        nsStop = RTTimeSystemNanoTS();
        idCpuStop = RTMpCpuId();
        ASMSetFlags(uFlags);

        /*
         * If the CPU changes, things get a bit complicated and what we
         * can get away with depends on the GIP mode / TSC reliability.
         */
        if (idCpuStop != idCpuStart)
        {
            bool fDoXCall = false;

            /*
             * Synchronous TSC mode: we're probably fine as it's unlikely
             * that we were rescheduled because of TSC throttling or power
             * management reasons, so just go ahead.
             */
            if (pGip->u32Mode == SUPGIPMODE_SYNC_TSC)
            {
                /* Probably ok, maybe we should retry once?. */
                Assert(pGip->enmUseTscDelta == SUPGIPUSETSCDELTA_NOT_APPLICABLE);
            }
            /*
             * If we're just doing the rough measurement, do the cross call and
             * get on with things (we don't have deltas!).
             */
            else if (fRough)
                fDoXCall = true;
            /*
             * Invariant TSC mode: It doesn't matter if we have delta available
             * for both CPUs.  That is not something we can assume at this point.
             *
             * Note! We cannot necessarily trust enmUseTscDelta here because it's
             *       downgraded after each delta calculation and the delta
             *       calculations may not be complete yet.
             */
            else if (pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC)
            {
/** @todo This section of code is never reached atm, consider dropping it later on... */
                if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
                {
                    /* Look up the recorded TSC deltas of both CPUs involved. */
                    uint32_t iStartCpuSet = RTMpCpuIdToSetIndex(idCpuStart);
                    uint32_t iStopCpuSet = RTMpCpuIdToSetIndex(idCpuStop);
                    uint16_t iStartGipCpu = iStartCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
                                          ? pGip->aiCpuFromCpuSetIdx[iStartCpuSet] : UINT16_MAX;
                    uint16_t iStopGipCpu = iStopCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
                                         ? pGip->aiCpuFromCpuSetIdx[iStopCpuSet] : UINT16_MAX;
                    int64_t iStartTscDelta = iStartGipCpu < pGip->cCpus ? pGip->aCPUs[iStartGipCpu].i64TSCDelta : INT64_MAX;
                    int64_t iStopTscDelta = iStopGipCpu < pGip->cCpus ? pGip->aCPUs[iStopGipCpu].i64TSCDelta : INT64_MAX;
                    if (RT_LIKELY(iStartTscDelta != INT64_MAX && iStopTscDelta != INT64_MAX))
                    {
                        /* Both deltas known: adjust the samples so they refer to
                           the same (master) TSC reference. */
                        if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_PRACTICALLY_ZERO)
                        {
                            uTscStart -= iStartTscDelta;
                            uTscStop -= iStopTscDelta;
                        }
                    }
                    /*
                     * Invalid CPU indexes are not caused by online/offline races, so
                     * we have to trigger driver load failure if that happens as GIP
                     * and IPRT assumptions are busted on this system.
                     */
                    else if (iStopGipCpu >= pGip->cCpus || iStartGipCpu >= pGip->cCpus)
                    {
                        SUPR0Printf("vboxdrv: Unexpected CPU index in supdrvGipInitMeasureTscFreq.\n");
                        SUPR0Printf("vboxdrv: start: %u, %u, %#llx stop: %u, %u, %#llx\n",
                                    iStartCpuSet, iStartGipCpu, iStartTscDelta, iStopCpuSet, iStopGipCpu, iStopTscDelta);
                        return VERR_INVALID_CPU_INDEX;
                    }
                    /*
                     * No valid deltas.  We retry, if we're on our last retry
                     * we do the cross call instead just to get a result.  The
                     * frequency will be refined in a few seconds anyway.
                     */
                    else if (cTriesLeft > 0)
                        continue;
                    else
                        fDoXCall = true;
                }
            }
            /*
             * Asynchronous TSC mode: This is bad as the reason we usually
             * use this mode is to deal with variable TSC frequencies and
             * deltas.  So, we need to get the TSC from the same CPU as
             * started it, we also need to keep that CPU busy.  So, retry
             * and fall back to the cross call on the last attempt.
             */
            else
            {
                Assert(pGip->u32Mode == SUPGIPMODE_ASYNC_TSC);
                if (cTriesLeft > 0)
                    continue;
                fDoXCall = true;
            }

            if (fDoXCall)
            {
                /*
                 * Try read the TSC and timestamp on the start CPU.
                 */
                int rc = RTMpOnSpecific(idCpuStart, supdrvGipInitReadTscAndNanoTsOnCpu, &uTscStop, &nsStop);
                if (RT_FAILURE(rc) && (!fRough || cTriesLeft > 0))
                    continue;
            }
        }

        /*
         * Calculate the TSC frequency and update it (shared with the refinement timer).
         */
        supdrvGipInitSetCpuFreq(pGip, nsStop - nsStart, uTscStop - uTscStart);
        return VINF_SUCCESS;
    }

    Assert(!fRough);
    return VERR_SUPDRV_TSC_FREQ_MEASUREMENT_FAILED;
}
1178
1179
1180/**
1181 * Finds our (@a idCpu) entry, or allocates a new one if not found.
1182 *
1183 * @returns Index of the CPU in the cache set.
1184 * @param pGip The GIP.
1185 * @param idCpu The CPU ID.
1186 */
1187static uint32_t supdrvGipFindOrAllocCpuIndexForCpuId(PSUPGLOBALINFOPAGE pGip, RTCPUID idCpu)
1188{
1189 uint32_t i, cTries;
1190
1191 /*
1192 * ASSUMES that CPU IDs are constant.
1193 */
1194 for (i = 0; i < pGip->cCpus; i++)
1195 if (pGip->aCPUs[i].idCpu == idCpu)
1196 return i;
1197
1198 cTries = 0;
1199 do
1200 {
1201 for (i = 0; i < pGip->cCpus; i++)
1202 {
1203 bool fRc;
1204 ASMAtomicCmpXchgSize(&pGip->aCPUs[i].idCpu, idCpu, NIL_RTCPUID, fRc);
1205 if (fRc)
1206 return i;
1207 }
1208 } while (cTries++ < 32);
1209 AssertReleaseFailed();
1210 return i - 1;
1211}
1212
1213
/**
 * The calling CPU should be accounted as online, update GIP accordingly.
 *
 * This is used by supdrvGipCreate() as well as supdrvGipMpEvent().
 *
 * Must be called on the CPU being onlined (asserted below) since it reads
 * the local APIC ID and TSC of that CPU.
 *
 * @param   pDevExt             The device extension.
 * @param   idCpu               The CPU ID.
 */
static void supdrvGipMpEventOnlineOrInitOnCpu(PSUPDRVDEVEXT pDevExt, RTCPUID idCpu)
{
    int iCpuSet = 0;
    uint16_t idApic = UINT16_MAX;
    uint32_t i = 0;
    uint64_t u64NanoTS = 0;
    PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;

    AssertPtrReturnVoid(pGip);
    Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD));
    AssertRelease(idCpu == RTMpCpuId()); /* must execute on the CPU in question */
    Assert(pGip->cPossibleCpus == RTMpGetCount());

    /*
     * Do this behind a spinlock with interrupts disabled as this can fire
     * on all CPUs simultaneously, see @bugref{6110}.
     */
    RTSpinlockAcquire(pDevExt->hGipSpinlock);

    /*
     * Update the globals.
     */
    ASMAtomicWriteU16(&pGip->cPresentCpus, RTMpGetPresentCount());
    ASMAtomicWriteU16(&pGip->cOnlineCpus, RTMpGetOnlineCount());
    iCpuSet = RTMpCpuIdToSetIndex(idCpu);
    if (iCpuSet >= 0)
    {
        Assert(RTCpuSetIsMemberByIndex(&pGip->PossibleCpuSet, iCpuSet));
        RTCpuSetAddByIndex(&pGip->OnlineCpuSet, iCpuSet);
        RTCpuSetAddByIndex(&pGip->PresentCpuSet, iCpuSet);
    }

    /*
     * Update the entry.
     */
    /* Timestamp is backdated by one update interval (NOTE(review): presumably
       so the CPU's first timer update looks like a regular interval - confirm
       against the GIP update code). */
    u64NanoTS = RTTimeSystemNanoTS() - pGip->u32UpdateIntervalNS;
    i = supdrvGipFindOrAllocCpuIndexForCpuId(pGip, idCpu);

    supdrvGipInitCpu(pGip, &pGip->aCPUs[i], u64NanoTS, pGip->u64CpuHz);

    idApic = ASMGetApicId();
    ASMAtomicWriteU16(&pGip->aCPUs[i].idApic, idApic);
    ASMAtomicWriteS16(&pGip->aCPUs[i].iCpuSet, (int16_t)iCpuSet);
    ASMAtomicWriteSize(&pGip->aCPUs[i].idCpu, idCpu);

    /*
     * Update the APIC ID and CPU set index mappings.
     */
    ASMAtomicWriteU16(&pGip->aiCpuFromApicId[idApic], i);
    ASMAtomicWriteU16(&pGip->aiCpuFromCpuSetIdx[iCpuSet], i);

    /* Add this CPU to this set of CPUs we need to calculate the TSC-delta for. */
    RTCpuSetAddByIndex(&pDevExt->TscDeltaCpuSet, RTMpCpuIdToSetIndex(idCpu));

    /* Update the Mp online/offline counter. */
    ASMAtomicIncU32(&pDevExt->cMpOnOffEvents);

    /* Commit it. */
    ASMAtomicWriteSize(&pGip->aCPUs[i].enmState, SUPGIPCPUSTATE_ONLINE);

    RTSpinlockRelease(pDevExt->hGipSpinlock);
}
1284
1285
1286/**
1287 * RTMpOnSpecific callback wrapper for supdrvGipMpEventOnlineOrInitOnCpu().
1288 *
1289 * @param idCpu The CPU ID we are running on.
1290 * @param pvUser1 Opaque pointer to the device instance data.
1291 * @param pvUser2 Not used.
1292 */
1293static DECLCALLBACK(void) supdrvGipMpEventOnlineCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
1294{
1295 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser1;
1296 NOREF(pvUser2);
1297 supdrvGipMpEventOnlineOrInitOnCpu(pDevExt, idCpu);
1298}
1299
1300
/**
 * The CPU should be accounted as offline, update the GIP accordingly.
 *
 * This is used by supdrvGipMpEvent.
 *
 * @param   pDevExt             The device extension.
 * @param   idCpu               The CPU ID.
 */
static void supdrvGipMpEventOffline(PSUPDRVDEVEXT pDevExt, RTCPUID idCpu)
{
    PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
    int iCpuSet;
    unsigned i;

    AssertPtrReturnVoid(pGip);
    RTSpinlockAcquire(pDevExt->hGipSpinlock); /* serialize with online events, see @bugref{6110} */

    iCpuSet = RTMpCpuIdToSetIndex(idCpu);
    AssertReturnVoid(iCpuSet >= 0);

    /* Translate the CPU set index to our GIP CPU array index and sanity check it. */
    i = pGip->aiCpuFromCpuSetIdx[iCpuSet];
    AssertReturnVoid(i < pGip->cCpus);
    AssertReturnVoid(pGip->aCPUs[i].idCpu == idCpu);

    Assert(RTCpuSetIsMemberByIndex(&pGip->PossibleCpuSet, iCpuSet));
    RTCpuSetDelByIndex(&pGip->OnlineCpuSet, iCpuSet);

    /* Update the Mp online/offline counter. */
    ASMAtomicIncU32(&pDevExt->cMpOnOffEvents);

    if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
    {
        /* Reset the TSC delta, we will recalculate it lazily. */
        ASMAtomicWriteS64(&pGip->aCPUs[i].i64TSCDelta, INT64_MAX);
        /* Remove this CPU from the set of CPUs that we have obtained the TSC deltas. */
        RTCpuSetDelByIndex(&pDevExt->TscDeltaObtainedCpuSet, iCpuSet);
    }

    /* Commit it. */
    ASMAtomicWriteSize(&pGip->aCPUs[i].enmState, SUPGIPCPUSTATE_OFFLINE);

    RTSpinlockRelease(pDevExt->hGipSpinlock);
}
1344
1345
/**
 * Multiprocessor event notification callback.
 *
 * This is used to make sure that the GIP master gets passed on to
 * another CPU.  It also updates the associated CPU data.
 *
 * @param   enmEvent    The event.
 * @param   idCpu       The cpu it applies to.
 * @param   pvUser      Pointer to the device extension.
 */
static DECLCALLBACK(void) supdrvGipMpEvent(RTMPEVENT enmEvent, RTCPUID idCpu, void *pvUser)
{
    PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
    PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;

    if (pGip)
    {
        RTTHREADPREEMPTSTATE PreemptState = RTTHREADPREEMPTSTATE_INITIALIZER;
        switch (enmEvent)
        {
            case RTMPEVENT_ONLINE:
            {
                /* The online worker must run on the CPU that came online, so
                   disable preemption while checking whether we're already there. */
                RTThreadPreemptDisable(&PreemptState);
                if (idCpu == RTMpCpuId())
                {
                    supdrvGipMpEventOnlineOrInitOnCpu(pDevExt, idCpu);
                    RTThreadPreemptRestore(&PreemptState);
                }
                else
                {
                    /* Wrong CPU: re-enable preemption and cross call onto it. */
                    RTThreadPreemptRestore(&PreemptState);
                    RTMpOnSpecific(idCpu, supdrvGipMpEventOnlineCallback, pDevExt, NULL /* pvUser2 */);
                }

                /*
                 * Recompute TSC-delta for the newly online'd CPU.
                 */
                if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
                {
#ifdef SUPDRV_USE_TSC_DELTA_THREAD
                    supdrvTscDeltaThreadStartMeasurement(pDevExt);
#else
                    uint32_t iCpu = supdrvGipFindOrAllocCpuIndexForCpuId(pGip, idCpu);
                    supdrvMeasureTscDeltaOne(pDevExt, iCpu);
#endif
                }
                break;
            }

            case RTMPEVENT_OFFLINE:
                supdrvGipMpEventOffline(pDevExt, idCpu);
                break;
            /* Other event values (if any) are intentionally ignored here. */
        }
    }

    /*
     * Make sure there is a master GIP.
     */
    if (enmEvent == RTMPEVENT_OFFLINE)
    {
        RTCPUID idGipMaster = ASMAtomicReadU32(&pDevExt->idGipMaster);
        if (idGipMaster == idCpu)
        {
            /*
             * The GIP master is going offline, find a new one.
             */
            bool fIgnored;
            unsigned i;
            RTCPUID idNewGipMaster = NIL_RTCPUID;
            RTCPUSET OnlineCpus;
            RTMpGetOnlineSet(&OnlineCpus);

            /* Pick the first online CPU that isn't the outgoing master. */
            for (i = 0; i < RTCPUSET_MAX_CPUS; i++)
                if (RTCpuSetIsMemberByIndex(&OnlineCpus, i))
                {
                    RTCPUID idCurCpu = RTMpCpuIdFromSetIndex(i);
                    if (idCurCpu != idGipMaster)
                    {
                        idNewGipMaster = idCurCpu;
                        break;
                    }
                }

            Log(("supdrvGipMpEvent: Gip master %#lx -> %#lx\n", (long)idGipMaster, (long)idNewGipMaster));
            /* CmpXchg: only install the new master if nobody beat us to it. */
            ASMAtomicCmpXchgSize(&pDevExt->idGipMaster, idNewGipMaster, idGipMaster, fIgnored);
            NOREF(fIgnored);
        }
    }
}
1435
1436
1437/**
1438 * On CPU initialization callback for RTMpOnAll.
1439 *
1440 * @param idCpu The CPU ID.
1441 * @param pvUser1 The device extension.
1442 * @param pvUser2 The GIP.
1443 */
1444static DECLCALLBACK(void) supdrvGipInitOnCpu(RTCPUID idCpu, void *pvUser1, void *pvUser2)
1445{
1446 /* This is good enough, even though it will update some of the globals a
1447 bit to much. */
1448 supdrvGipMpEventOnlineOrInitOnCpu((PSUPDRVDEVEXT)pvUser1, idCpu);
1449}
1450
1451
1452/**
1453 * Callback used by supdrvDetermineAsyncTSC to read the TSC on a CPU.
1454 *
1455 * @param idCpu Ignored.
1456 * @param pvUser1 Where to put the TSC.
1457 * @param pvUser2 Ignored.
1458 */
1459static DECLCALLBACK(void) supdrvGipInitDetermineAsyncTscWorker(RTCPUID idCpu, void *pvUser1, void *pvUser2)
1460{
1461 ASMAtomicWriteU64((uint64_t volatile *)pvUser1, ASMReadTSC());
1462}
1463
1464
/**
 * Determine if Async GIP mode is required because of TSC drift.
 *
 * When using the default/normal timer code it is essential that the time stamp counter
 * (TSC) runs never backwards, that is, a read operation to the counter should return
 * a bigger value than any previous read operation.  This is guaranteed by the latest
 * AMD CPUs and by newer Intel CPUs which never enter the C2 state (P4).  In any other
 * case we have to choose the asynchronous timer mode.
 *
 * @param   poffMin     Pointer to the determined difference between different
 *                      cores (optional, can be NULL).
 * @return  false if the time stamp counters appear to be synchronized, true otherwise.
 */
static bool supdrvGipInitDetermineAsyncTsc(uint64_t *poffMin)
{
    /*
     * Just iterate all the cpus 8 times and make sure that the TSC is
     * ever increasing.  We don't bother taking TSC rollover into account.
     */
    int iEndCpu = RTMpGetArraySize();
    int iCpu;
    int cLoops = 8;
    bool fAsync = false;
    int rc = VINF_SUCCESS;
    uint64_t offMax = 0;
    uint64_t offMin = ~(uint64_t)0;
    uint64_t PrevTsc = ASMReadTSC();

    while (cLoops-- > 0)
    {
        for (iCpu = 0; iCpu < iEndCpu; iCpu++)
        {
            uint64_t CurTsc;
            /* Hop from CPU to CPU reading the TSC; a cross-CPU read that is
               not strictly increasing exposes unsynchronized TSCs. */
            rc = RTMpOnSpecific(RTMpCpuIdFromSetIndex(iCpu), supdrvGipInitDetermineAsyncTscWorker, &CurTsc, NULL);
            if (RT_SUCCESS(rc))
            {
                if (CurTsc <= PrevTsc)
                {
                    /* TSC went backwards relative to the previous CPU -> async. */
                    fAsync = true;
                    offMin = offMax = PrevTsc - CurTsc;
                    Log(("supdrvGipInitDetermineAsyncTsc: iCpu=%d cLoops=%d CurTsc=%llx PrevTsc=%llx\n",
                         iCpu, cLoops, CurTsc, PrevTsc));
                    break;
                }

                /* Gather statistics (except the first time). */
                if (iCpu != 0 || cLoops != 7)
                {
                    uint64_t off = CurTsc - PrevTsc;
                    if (off < offMin)
                        offMin = off;
                    if (off > offMax)
                        offMax = off;
                    Log2(("%d/%d: off=%llx\n", cLoops, iCpu, off));
                }

                /* Next */
                PrevTsc = CurTsc;
            }
            else if (rc == VERR_NOT_SUPPORTED)
                break;
            else
                AssertMsg(rc == VERR_CPU_NOT_FOUND || rc == VERR_CPU_OFFLINE, ("%d\n", rc));
        }

        /* broke out of the loop. */
        if (iCpu < iEndCpu)
            break;
    }

    if (poffMin)
        *poffMin = offMin; /* Almost RTMpOnSpecific profiling. */
    Log(("supdrvGipInitDetermineAsyncTsc: returns %d; iEndCpu=%d rc=%d offMin=%llx offMax=%llx\n",
         fAsync, iEndCpu, rc, offMin, offMax));
#if !defined(RT_OS_SOLARIS) && !defined(RT_OS_OS2) && !defined(RT_OS_WINDOWS)
    OSDBGPRINT(("vboxdrv: fAsync=%d offMin=%#lx offMax=%#lx\n", fAsync, (long)offMin, (long)offMax));
#endif
    return fAsync;
}
1544
1545
/**
 * supdrvGipInit() worker that determines the GIP TSC mode.
 *
 * @returns The most suitable TSC mode.
 * @param   pDevExt     Pointer to the device instance data.
 */
static SUPGIPMODE supdrvGipInitDetermineTscMode(PSUPDRVDEVEXT pDevExt)
{
    uint64_t u64DiffCoresIgnored;
    uint32_t uEAX, uEBX, uECX, uEDX;

    /*
     * Establish whether the CPU advertises TSC as invariant, we need that in
     * a couple of places below.
     */
    bool fInvariantTsc = false;
    if (ASMHasCpuId())
    {
        uEAX = ASMCpuId_EAX(0x80000000);
        if (ASMIsValidExtRange(uEAX) && uEAX >= 0x80000007)
        {
            /* CPUID leaf 0x80000007 EDX carries the invariant-TSC flag. */
            uEDX = ASMCpuId_EDX(0x80000007);
            if (uEDX & X86_CPUID_AMD_ADVPOWER_EDX_TSCINVAR)
                fInvariantTsc = true;
        }
    }

    /*
     * On single CPU systems, we don't need to consider ASYNC mode.
     */
    if (RTMpGetCount() <= 1)
        return fInvariantTsc ? SUPGIPMODE_INVARIANT_TSC : SUPGIPMODE_SYNC_TSC;

    /*
     * Allow the user and/or OS specific bits to force async mode.
     */
    if (supdrvOSGetForcedAsyncTscMode(pDevExt))
        return SUPGIPMODE_ASYNC_TSC;

    /*
     * Use invariant mode if the CPU says TSC is invariant.
     */
    if (fInvariantTsc)
        return SUPGIPMODE_INVARIANT_TSC;

    /*
     * TSC is not invariant and we're on SMP, this presents two problems:
     *
     * (1) There might be a skew between the CPU, so that cpu0
     *     returns a TSC that is slightly different from cpu1.
     *     This skew may be due to (2), bad TSC initialization
     *     or slightly different TSC rates.
     *
     * (2) Power management (and other things) may cause the TSC
     *     to run at a non-constant speed, and cause the speed
     *     to be different on the cpus. This will result in (1).
     *
     * If any of the above is detected, we will have to use ASYNC mode.
     */
    /* (1). Try check for current differences between the cpus. */
    if (supdrvGipInitDetermineAsyncTsc(&u64DiffCoresIgnored))
        return SUPGIPMODE_ASYNC_TSC;

    /* (2) If it's an AMD CPU with power management, we won't trust its TSC. */
    ASMCpuId(0, &uEAX, &uEBX, &uECX, &uEDX);
    if (   ASMIsValidStdRange(uEAX)
        && ASMIsAmdCpuEx(uEBX, uECX, uEDX))
    {
        /* Check for APM support. */
        uEAX = ASMCpuId_EAX(0x80000000);
        if (ASMIsValidExtRange(uEAX) && uEAX >= 0x80000007)
        {
            uEDX = ASMCpuId_EDX(0x80000007);
            if (uEDX & 0x3e) /* STC|TM|THERMTRIP|VID|FID. Ignore TS. */
                return SUPGIPMODE_ASYNC_TSC;
        }
    }

    return SUPGIPMODE_SYNC_TSC;
}
1626
1627
1628/**
1629 * Initializes per-CPU GIP information.
1630 *
1631 * @param pGip Pointer to the GIP.
1632 * @param pCpu Pointer to which GIP CPU to initalize.
1633 * @param u64NanoTS The current nanosecond timestamp.
1634 * @param uCpuHz The CPU frequency to set, 0 if the caller doesn't know.
1635 */
1636static void supdrvGipInitCpu(PSUPGLOBALINFOPAGE pGip, PSUPGIPCPU pCpu, uint64_t u64NanoTS, uint64_t uCpuHz)
1637{
1638 pCpu->u32TransactionId = 2;
1639 pCpu->u64NanoTS = u64NanoTS;
1640 pCpu->u64TSC = ASMReadTSC();
1641 pCpu->u64TSCSample = GIP_TSC_DELTA_RSVD;
1642 pCpu->i64TSCDelta = pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED ? INT64_MAX : 0;
1643
1644 ASMAtomicWriteSize(&pCpu->enmState, SUPGIPCPUSTATE_INVALID);
1645 ASMAtomicWriteSize(&pCpu->idCpu, NIL_RTCPUID);
1646 ASMAtomicWriteS16(&pCpu->iCpuSet, -1);
1647 ASMAtomicWriteU16(&pCpu->idApic, UINT16_MAX);
1648
1649 /*
1650 * The first time we're called, we don't have a CPU frequency handy,
1651 * so pretend it's a 4 GHz CPU. On CPUs that are online, we'll get
1652 * called again and at that point we have a more plausible CPU frequency
1653 * value handy. The frequency history will also be adjusted again on
1654 * the 2nd timer callout (maybe we can skip that now?).
1655 */
1656 if (!uCpuHz)
1657 {
1658 pCpu->u64CpuHz = _4G - 1;
1659 pCpu->u32UpdateIntervalTSC = (uint32_t)((_4G - 1) / pGip->u32UpdateHz);
1660 }
1661 else
1662 {
1663 pCpu->u64CpuHz = uCpuHz;
1664 pCpu->u32UpdateIntervalTSC = (uint32_t)(uCpuHz / pGip->u32UpdateHz);
1665 }
1666 pCpu->au32TSCHistory[0]
1667 = pCpu->au32TSCHistory[1]
1668 = pCpu->au32TSCHistory[2]
1669 = pCpu->au32TSCHistory[3]
1670 = pCpu->au32TSCHistory[4]
1671 = pCpu->au32TSCHistory[5]
1672 = pCpu->au32TSCHistory[6]
1673 = pCpu->au32TSCHistory[7]
1674 = pCpu->u32UpdateIntervalTSC;
1675}
1676
1677
/**
 * Initializes the GIP data.
 *
 * @param   pDevExt             Pointer to the device instance data.
 * @param   pGip                Pointer to the read-write kernel mapping of the GIP.
 * @param   HCPhys              The physical address of the GIP.
 * @param   u64NanoTS           The current nanosecond timestamp.
 * @param   uUpdateHz           The update frequency.
 * @param   uUpdateIntervalNS   The update interval in nanoseconds.
 * @param   cCpus               The CPU count.
 */
static void supdrvGipInit(PSUPDRVDEVEXT pDevExt, PSUPGLOBALINFOPAGE pGip, RTHCPHYS HCPhys,
                          uint64_t u64NanoTS, unsigned uUpdateHz, unsigned uUpdateIntervalNS, unsigned cCpus)
{
    size_t const cbGip = RT_ALIGN_Z(RT_OFFSETOF(SUPGLOBALINFOPAGE, aCPUs[cCpus]), PAGE_SIZE);
    unsigned i;
#ifdef DEBUG_DARWIN_GIP
    OSDBGPRINT(("supdrvGipInit: pGip=%p HCPhys=%lx u64NanoTS=%llu uUpdateHz=%d cCpus=%u\n", pGip, (long)HCPhys, u64NanoTS, uUpdateHz, cCpus));
#else
    LogFlow(("supdrvGipInit: pGip=%p HCPhys=%lx u64NanoTS=%llu uUpdateHz=%d cCpus=%u\n", pGip, (long)HCPhys, u64NanoTS, uUpdateHz, cCpus));
#endif

    /*
     * Initialize the structure.
     */
    memset(pGip, 0, cbGip);

    pGip->u32Magic = SUPGLOBALINFOPAGE_MAGIC;
    pGip->u32Version = SUPGLOBALINFOPAGE_VERSION;
    pGip->u32Mode = supdrvGipInitDetermineTscMode(pDevExt);
    if (   pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC
        /*|| pGip->u32Mode == SUPGIPMODE_SYNC_TSC */)
        pGip->enmUseTscDelta = supdrvOSAreTscDeltasInSync() /* Allow OS override (windows). */
                             ? SUPGIPUSETSCDELTA_ZERO_CLAIMED : SUPGIPUSETSCDELTA_PRACTICALLY_ZERO /* downgrade later */;
    else
        pGip->enmUseTscDelta = SUPGIPUSETSCDELTA_NOT_APPLICABLE;
    pGip->cCpus = (uint16_t)cCpus;
    pGip->cPages = (uint16_t)(cbGip / PAGE_SIZE);
    pGip->u32UpdateHz = uUpdateHz;
    pGip->u32UpdateIntervalNS = uUpdateIntervalNS;
    pGip->fGetGipCpu = SUPGIPGETCPU_APIC_ID;
    RTCpuSetEmpty(&pGip->OnlineCpuSet);
    RTCpuSetEmpty(&pGip->PresentCpuSet);
    RTMpGetSet(&pGip->PossibleCpuSet);
    pGip->cOnlineCpus = RTMpGetOnlineCount();
    pGip->cPresentCpus = RTMpGetPresentCount();
    pGip->cPossibleCpus = RTMpGetCount();
    pGip->idCpuMax = RTMpGetMaxCpuId();
    /* Mark all APIC-ID and CPU-set-index mapping entries as unused. */
    for (i = 0; i < RT_ELEMENTS(pGip->aiCpuFromApicId); i++)
        pGip->aiCpuFromApicId[i] = UINT16_MAX;
    for (i = 0; i < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx); i++)
        pGip->aiCpuFromCpuSetIdx[i] = UINT16_MAX;
    /* Per-CPU init; CPU frequency is not known yet (0), filled in later. */
    for (i = 0; i < cCpus; i++)
        supdrvGipInitCpu(pGip, &pGip->aCPUs[i], u64NanoTS, 0 /*uCpuHz*/);

    /*
     * Link it to the device extension.
     */
    pDevExt->pGip = pGip;
    pDevExt->HCPhysGip = HCPhys;
    pDevExt->cGipUsers = 0;
}
1740
1741
1742/**
1743 * Creates the GIP.
1744 *
1745 * @returns VBox status code.
1746 * @param pDevExt Instance data. GIP stuff may be updated.
1747 */
1748int VBOXCALL supdrvGipCreate(PSUPDRVDEVEXT pDevExt)
1749{
1750 PSUPGLOBALINFOPAGE pGip;
1751 RTHCPHYS HCPhysGip;
1752 uint32_t u32SystemResolution;
1753 uint32_t u32Interval;
1754 uint32_t u32MinInterval;
1755 uint32_t uMod;
1756 unsigned cCpus;
1757 int rc;
1758
1759 LogFlow(("supdrvGipCreate:\n"));
1760
1761 /*
1762 * Assert order.
1763 */
1764 Assert(pDevExt->u32SystemTimerGranularityGrant == 0);
1765 Assert(pDevExt->GipMemObj == NIL_RTR0MEMOBJ);
1766 Assert(!pDevExt->pGipTimer);
1767#ifdef SUPDRV_USE_MUTEX_FOR_GIP
1768 Assert(pDevExt->mtxGip != NIL_RTSEMMUTEX);
1769 Assert(pDevExt->mtxTscDelta != NIL_RTSEMMUTEX);
1770#else
1771 Assert(pDevExt->mtxGip != NIL_RTSEMFASTMUTEX);
1772 Assert(pDevExt->mtxTscDelta != NIL_RTSEMFASTMUTEX);
1773#endif
1774
1775 /*
1776 * Check the CPU count.
1777 */
1778 cCpus = RTMpGetArraySize();
1779 if ( cCpus > RTCPUSET_MAX_CPUS
1780 || cCpus > 256 /* ApicId is used for the mappings */)
1781 {
1782 SUPR0Printf("VBoxDrv: Too many CPUs (%u) for the GIP (max %u)\n", cCpus, RT_MIN(RTCPUSET_MAX_CPUS, 256));
1783 return VERR_TOO_MANY_CPUS;
1784 }
1785
1786 /*
1787 * Allocate a contiguous set of pages with a default kernel mapping.
1788 */
1789 rc = RTR0MemObjAllocCont(&pDevExt->GipMemObj, RT_UOFFSETOF(SUPGLOBALINFOPAGE, aCPUs[cCpus]), false /*fExecutable*/);
1790 if (RT_FAILURE(rc))
1791 {
1792 OSDBGPRINT(("supdrvGipCreate: failed to allocate the GIP page. rc=%d\n", rc));
1793 return rc;
1794 }
1795 pGip = (PSUPGLOBALINFOPAGE)RTR0MemObjAddress(pDevExt->GipMemObj); AssertPtr(pGip);
1796 HCPhysGip = RTR0MemObjGetPagePhysAddr(pDevExt->GipMemObj, 0); Assert(HCPhysGip != NIL_RTHCPHYS);
1797
1798 /*
1799 * Find a reasonable update interval and initialize the structure.
1800 */
1801 supdrvGipRequestHigherTimerFrequencyFromSystem(pDevExt);
1802 /** @todo figure out why using a 100Ms interval upsets timekeeping in VMs.
1803 * See @bugref{6710}. */
1804 u32MinInterval = RT_NS_10MS;
1805 u32SystemResolution = RTTimerGetSystemGranularity();
1806 u32Interval = u32MinInterval;
1807 uMod = u32MinInterval % u32SystemResolution;
1808 if (uMod)
1809 u32Interval += u32SystemResolution - uMod;
1810
1811 supdrvGipInit(pDevExt, pGip, HCPhysGip, RTTimeSystemNanoTS(), RT_NS_1SEC / u32Interval /*=Hz*/, u32Interval, cCpus);
1812
1813 /*
1814 * Important sanity check...
1815 */
1816 if (RT_UNLIKELY( pGip->enmUseTscDelta == SUPGIPUSETSCDELTA_ZERO_CLAIMED
1817 && pGip->u32Mode == SUPGIPMODE_ASYNC_TSC
1818 && !supdrvOSGetForcedAsyncTscMode(pDevExt)))
1819 {
1820 /* Basically, invariant Windows boxes, should never be detected as async (i.e. TSC-deltas should be 0). */
1821 OSDBGPRINT(("supdrvGipCreate: The TSC-deltas should be normalized by the host OS, but verifying shows it's not!\n"));
1822 return VERR_INTERNAL_ERROR_2;
1823 }
1824
1825 /*
1826 * Do the TSC frequency measurements.
1827 *
1828 * If we're in invariant TSC mode, just to a quick preliminary measurement
1829 * that the TSC-delta measurement code can use to yield cross calls.
1830 *
1831 * If we're in any of the other two modes, neither which require MP init,
1832 * notifications or deltas for the job, do the full measurement now so
1833 * that supdrvGipInitOnCpu() can populate the TSC interval and history
1834 * array with more reasonable values.
1835 */
1836 if (pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC)
1837 {
1838 rc = supdrvGipInitMeasureTscFreq(pDevExt, pGip, true /*fRough*/); /* cannot fail */
1839 supdrvGipInitStartTimerForRefiningInvariantTscFreq(pDevExt, pGip);
1840 }
1841 else
1842 rc = supdrvGipInitMeasureTscFreq(pDevExt, pGip, false /*fRough*/);
1843 if (RT_SUCCESS(rc))
1844 {
1845 /*
1846 * Start TSC-delta measurement thread before we start getting MP
1847 * events that will try kick it into action (includes the
1848 * RTMpOnAll/supdrvGipInitOnCpu call below).
1849 */
1850 RTCpuSetEmpty(&pDevExt->TscDeltaCpuSet);
1851 RTCpuSetEmpty(&pDevExt->TscDeltaObtainedCpuSet);
1852#ifdef SUPDRV_USE_TSC_DELTA_THREAD
1853 if ( pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED
1854 && pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC)
1855 rc = supdrvTscDeltaThreadInit(pDevExt);
1856#endif
1857 if (RT_SUCCESS(rc))
1858 {
1859 rc = RTMpNotificationRegister(supdrvGipMpEvent, pDevExt);
1860 if (RT_SUCCESS(rc))
1861 {
1862 /*
1863 * Do GIP initialization on all online CPUs. Wake up the
1864 * TSC-delta thread afterwards.
1865 */
1866 rc = RTMpOnAll(supdrvGipInitOnCpu, pDevExt, pGip);
1867 if (RT_SUCCESS(rc))
1868 {
1869#ifdef SUPDRV_USE_TSC_DELTA_THREAD
1870 supdrvTscDeltaThreadStartMeasurement(pDevExt);
1871#else
1872 uint16_t iCpu;
1873 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
1874 {
1875 /*
1876 * Measure the TSC deltas now that we have MP notifications.
1877 */
1878 int cTries = 5;
1879 do
1880 {
1881 rc = supdrvMeasureInitialTscDeltas(pDevExt);
1882 if ( rc != VERR_TRY_AGAIN
1883 && rc != VERR_CPU_OFFLINE)
1884 break;
1885 } while (--cTries > 0);
1886 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
1887 Log(("supdrvTscDeltaInit: cpu[%u] delta %lld\n", iCpu, pGip->aCPUs[iCpu].i64TSCDelta));
1888 }
1889 else
1890 {
1891 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
1892 AssertMsg(!pGip->aCPUs[iCpu].i64TSCDelta, ("iCpu=%u %lld mode=%d\n", iCpu, pGip->aCPUs[iCpu].i64TSCDelta, pGip->u32Mode));
1893 }
1894 if (RT_SUCCESS(rc))
1895#endif
1896 {
1897 /*
1898 * Create the timer.
1899 * If CPU_ALL isn't supported we'll have to fall back to synchronous mode.
1900 */
1901 if (pGip->u32Mode == SUPGIPMODE_ASYNC_TSC)
1902 {
1903 rc = RTTimerCreateEx(&pDevExt->pGipTimer, u32Interval, RTTIMER_FLAGS_CPU_ALL,
1904 supdrvGipAsyncTimer, pDevExt);
1905 if (rc == VERR_NOT_SUPPORTED)
1906 {
1907 OSDBGPRINT(("supdrvGipCreate: omni timer not supported, falling back to synchronous mode\n"));
1908 pGip->u32Mode = SUPGIPMODE_SYNC_TSC;
1909 }
1910 }
1911 if (pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
1912 rc = RTTimerCreateEx(&pDevExt->pGipTimer, u32Interval, 0 /* fFlags */,
1913 supdrvGipSyncAndInvariantTimer, pDevExt);
1914 if (RT_SUCCESS(rc))
1915 {
1916 /*
1917 * We're good.
1918 */
1919 Log(("supdrvGipCreate: %u ns interval.\n", u32Interval));
1920 supdrvGipReleaseHigherTimerFrequencyFromSystem(pDevExt);
1921
1922 g_pSUPGlobalInfoPage = pGip;
1923 return VINF_SUCCESS;
1924 }
1925
1926 OSDBGPRINT(("supdrvGipCreate: failed create GIP timer at %u ns interval. rc=%Rrc\n", u32Interval, rc));
1927 Assert(!pDevExt->pGipTimer);
1928 }
1929 }
1930 else
1931 OSDBGPRINT(("supdrvGipCreate: RTMpOnAll failed. rc=%Rrc\n", rc));
1932 }
1933 else
1934 OSDBGPRINT(("supdrvGipCreate: failed to register MP event notfication. rc=%Rrc\n", rc));
1935 }
1936 else
1937 OSDBGPRINT(("supdrvGipCreate: supdrvTscDeltaInit failed. rc=%Rrc\n", rc));
1938 }
1939 else
1940 OSDBGPRINT(("supdrvGipCreate: supdrvMeasureInitialTscDeltas failed. rc=%Rrc\n", rc));
1941
1942 /* Releases timer frequency increase too. */
1943 supdrvGipDestroy(pDevExt);
1944 return rc;
1945}
1946
1947
1948/**
1949 * Invalidates the GIP data upon termination.
1950 *
1951 * @param pGip Pointer to the read-write kernel mapping of the GIP.
1952 */
1953static void supdrvGipTerm(PSUPGLOBALINFOPAGE pGip)
1954{
1955 unsigned i;
1956 pGip->u32Magic = 0;
1957 for (i = 0; i < pGip->cCpus; i++)
1958 {
1959 pGip->aCPUs[i].u64NanoTS = 0;
1960 pGip->aCPUs[i].u64TSC = 0;
1961 pGip->aCPUs[i].iTSCHistoryHead = 0;
1962 pGip->aCPUs[i].u64TSCSample = 0;
1963 pGip->aCPUs[i].i64TSCDelta = INT64_MAX;
1964 }
1965}
1966
1967
/**
 * Terminates the GIP.
 *
 * Tear-down order matters: MP notifications are deregistered first so no new
 * CPU events race the destruction, then the TSC-delta thread (if used), the
 * refinement timer, the GIP data, the GIP timer, and finally the backing
 * memory object and the timer resolution request.
 *
 * @param pDevExt Instance data. GIP stuff may be updated.
 */
void VBOXCALL supdrvGipDestroy(PSUPDRVDEVEXT pDevExt)
{
    int rc;
#ifdef DEBUG_DARWIN_GIP
    OSDBGPRINT(("supdrvGipDestroy: pDevExt=%p pGip=%p pGipTimer=%p GipMemObj=%p\n", pDevExt,
                pDevExt->GipMemObj != NIL_RTR0MEMOBJ ? RTR0MemObjAddress(pDevExt->GipMemObj) : NULL,
                pDevExt->pGipTimer, pDevExt->GipMemObj));
#endif

    /*
     * Stop receiving MP notifications before tearing anything else down.
     */
    RTMpNotificationDeregister(supdrvGipMpEvent, pDevExt);

#ifdef SUPDRV_USE_TSC_DELTA_THREAD
    /*
     * Terminate the TSC-delta measurement thread and resources.
     */
    supdrvTscDeltaTerm(pDevExt);
#endif

    /*
     * Destroy the TSC-refinement timer.
     */
    if (pDevExt->pInvarTscRefineTimer)
    {
        RTTimerDestroy(pDevExt->pInvarTscRefineTimer);
        pDevExt->pInvarTscRefineTimer = NULL;
    }

    /*
     * Invalidate the GIP data.
     */
    if (pDevExt->pGip)
    {
        supdrvGipTerm(pDevExt->pGip);
        pDevExt->pGip = NULL;
    }
    g_pSUPGlobalInfoPage = NULL;

    /*
     * Destroy the timer and free the GIP memory object.
     */
    if (pDevExt->pGipTimer)
    {
        rc = RTTimerDestroy(pDevExt->pGipTimer); AssertRC(rc);
        pDevExt->pGipTimer = NULL;
    }

    if (pDevExt->GipMemObj != NIL_RTR0MEMOBJ)
    {
        rc = RTR0MemObjFree(pDevExt->GipMemObj, true /* free mappings */); AssertRC(rc);
        pDevExt->GipMemObj = NIL_RTR0MEMOBJ;
    }

    /*
     * Finally, make sure we've released the system timer resolution request
     * if one actually succeeded and is still pending.
     */
    supdrvGipReleaseHigherTimerFrequencyFromSystem(pDevExt);
}
2034
2035
2036
2037
2038/*
2039 *
2040 *
2041 * GIP Update Timer Related Code
2042 * GIP Update Timer Related Code
2043 * GIP Update Timer Related Code
2044 *
2045 *
2046 */
2047
2048
/**
 * Worker routine for supdrvGipUpdate() and supdrvGipUpdatePerCpu() that
 * updates all the per cpu data except the transaction id.
 *
 * @param   pDevExt         The device extension.
 * @param   pGipCpu         Pointer to the per cpu data.
 * @param   u64NanoTS       The current time stamp.
 * @param   u64TSC          The current TSC.
 * @param   iTick           The current timer tick.
 *
 * @remarks Can be called with interrupts disabled!
 */
static void supdrvGipDoUpdateCpu(PSUPDRVDEVEXT pDevExt, PSUPGIPCPU pGipCpu, uint64_t u64NanoTS, uint64_t u64TSC, uint64_t iTick)
{
    uint64_t    u64TSCDelta;
    uint32_t    u32UpdateIntervalTSC;
    uint32_t    u32UpdateIntervalTSCSlack;
    unsigned    iTSCHistoryHead;
    uint64_t    u64CpuHz;
    uint32_t    u32TransactionId;

    PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
    AssertPtrReturnVoid(pGip);

    /* Delta between this and the previous update. */
    ASMAtomicUoWriteU32(&pGipCpu->u32PrevUpdateIntervalNS, (uint32_t)(u64NanoTS - pGipCpu->u64NanoTS));

    /*
     * Update the NanoTS.
     */
    ASMAtomicWriteU64(&pGipCpu->u64NanoTS, u64NanoTS);

    /*
     * Calc TSC delta.
     */
    u64TSCDelta = u64TSC - pGipCpu->u64TSC;
    ASMAtomicWriteU64(&pGipCpu->u64TSC, u64TSC);

    /*
     * We don't need to keep recalculating the frequency when it's invariant, so
     * the remainder of this function is only for the sync and async TSC modes.
     */
    if (pGip->u32Mode != SUPGIPMODE_INVARIANT_TSC)
    {
        /* A delta that doesn't fit in 32 bits is bogus (e.g. suspend/resume or
           a missed tick); substitute the last known interval and count it. */
        if (u64TSCDelta >> 32)
        {
            u64TSCDelta = pGipCpu->u32UpdateIntervalTSC;
            pGipCpu->cErrors++;
        }

        /*
         * On the 2nd and 3rd callout, reset the history with the current TSC
         * interval since the values entered by supdrvGipInit are totally off.
         * The interval on the 1st callout is completely unreliable, the 2nd is a bit
         * better, while the 3rd should be most reliable.
         */
        /** @todo Could we drop this now that we initialize the history
         *        with nominal TSC frequency values? */
        u32TransactionId = pGipCpu->u32TransactionId;
        if (RT_UNLIKELY(   (   u32TransactionId == 5
                            || u32TransactionId == 7)
                        && (   iTick == 2
                            || iTick == 3) ))
        {
            unsigned i;
            for (i = 0; i < RT_ELEMENTS(pGipCpu->au32TSCHistory); i++)
                ASMAtomicUoWriteU32(&pGipCpu->au32TSCHistory[i], (uint32_t)u64TSCDelta);
        }

        /*
         * Validate the NanoTS deltas between timer fires with an arbitrary threshold of 0.5%.
         * Wait until we have at least one full history since the above history reset. The
         * assumption is that the majority of the previous history values will be tolerable.
         * See @bugref{6710} comment #67.
         */
        /** @todo Could we drop the fudging here now that we initialize the history
         *        with nominal TSC frequency values? */
        if (   u32TransactionId > 23 /* 7 + (8 * 2) */
            && pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
        {
            uint32_t uNanoTsThreshold = pGip->u32UpdateIntervalNS / 200;
            if (   pGipCpu->u32PrevUpdateIntervalNS > pGip->u32UpdateIntervalNS + uNanoTsThreshold
                || pGipCpu->u32PrevUpdateIntervalNS < pGip->u32UpdateIntervalNS - uNanoTsThreshold)
            {
                /* The timing of this fire is off; replace the measured delta with
                   the average of the 8 history entries (computed as two averages
                   of 4, then averaged, to stay within 32-bit intermediates). */
                uint32_t u32;
                u32  = pGipCpu->au32TSCHistory[0];
                u32 += pGipCpu->au32TSCHistory[1];
                u32 += pGipCpu->au32TSCHistory[2];
                u32 += pGipCpu->au32TSCHistory[3];
                u32 >>= 2;
                u64TSCDelta  = pGipCpu->au32TSCHistory[4];
                u64TSCDelta += pGipCpu->au32TSCHistory[5];
                u64TSCDelta += pGipCpu->au32TSCHistory[6];
                u64TSCDelta += pGipCpu->au32TSCHistory[7];
                u64TSCDelta >>= 2;
                u64TSCDelta += u32;
                u64TSCDelta >>= 1;
            }
        }

        /*
         * TSC History.
         */
        Assert(RT_ELEMENTS(pGipCpu->au32TSCHistory) == 8);
        iTSCHistoryHead = (pGipCpu->iTSCHistoryHead + 1) & 7;
        ASMAtomicWriteU32(&pGipCpu->iTSCHistoryHead, iTSCHistoryHead);
        ASMAtomicWriteU32(&pGipCpu->au32TSCHistory[iTSCHistoryHead], (uint32_t)u64TSCDelta);

        /*
         * UpdateIntervalTSC = average of last 8,2,1 intervals depending on update HZ.
         *
         * On Windows, we have an occasional (but recurring) sour value that messed up
         * the history but taking only 1 interval reduces the precision overall.
         */
        if (   pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC
            || pGip->u32UpdateHz >= 1000)
        {
            uint32_t u32;
            u32  = pGipCpu->au32TSCHistory[0];
            u32 += pGipCpu->au32TSCHistory[1];
            u32 += pGipCpu->au32TSCHistory[2];
            u32 += pGipCpu->au32TSCHistory[3];
            u32 >>= 2;
            u32UpdateIntervalTSC  = pGipCpu->au32TSCHistory[4];
            u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[5];
            u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[6];
            u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[7];
            u32UpdateIntervalTSC >>= 2;
            u32UpdateIntervalTSC += u32;
            u32UpdateIntervalTSC >>= 1;

            /* Value chosen for a 2GHz Athlon64 running linux 2.6.10/11. */
            u32UpdateIntervalTSCSlack = u32UpdateIntervalTSC >> 14;
        }
        else if (pGip->u32UpdateHz >= 90)
        {
            u32UpdateIntervalTSC  = (uint32_t)u64TSCDelta;
            u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[(iTSCHistoryHead - 1) & 7];
            u32UpdateIntervalTSC >>= 1;

            /* value chosen on a 2GHz thinkpad running windows */
            u32UpdateIntervalTSCSlack = u32UpdateIntervalTSC >> 7;
        }
        else
        {
            u32UpdateIntervalTSC = (uint32_t)u64TSCDelta;

            /* This value hasn't be checked yet.. waiting for OS/2 and 33Hz timers.. :-) */
            u32UpdateIntervalTSCSlack = u32UpdateIntervalTSC >> 6;
        }
        ASMAtomicWriteU32(&pGipCpu->u32UpdateIntervalTSC, u32UpdateIntervalTSC + u32UpdateIntervalTSCSlack);

        /*
         * CpuHz.
         */
        u64CpuHz = ASMMult2xU32RetU64(u32UpdateIntervalTSC, RT_NS_1SEC);
        u64CpuHz /= pGip->u32UpdateIntervalNS;
        ASMAtomicWriteU64(&pGipCpu->u64CpuHz, u64CpuHz);
    }
}
2209
2210
/**
 * Updates the GIP.
 *
 * @param   pDevExt         The device extension.
 * @param   u64NanoTS       The current nanosecond timestamp.
 * @param   u64TSC          The current TSC timestamp.
 * @param   idCpu           The CPU ID.
 * @param   iTick           The current timer tick.
 *
 * @remarks Can be called with interrupts disabled!
 */
static void supdrvGipUpdate(PSUPDRVDEVEXT pDevExt, uint64_t u64NanoTS, uint64_t u64TSC, RTCPUID idCpu, uint64_t iTick)
{
    /*
     * Determine the relevant CPU data.
     */
    PSUPGIPCPU pGipCpu;
    PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
    AssertPtrReturnVoid(pGip);

    if (pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
        pGipCpu = &pGip->aCPUs[0];
    else
    {
        unsigned iCpu = pGip->aiCpuFromApicId[ASMGetApicId()];
        if (RT_UNLIKELY(iCpu >= pGip->cCpus))
            return;
        pGipCpu = &pGip->aCPUs[iCpu];
        if (RT_UNLIKELY(pGipCpu->idCpu != idCpu))
            return;
    }

    /*
     * Start update transaction.  An odd transaction id marks the update as
     * in progress for lock-free readers.
     */
    if (!(ASMAtomicIncU32(&pGipCpu->u32TransactionId) & 1))
    {
        /* this can happen on win32 if we're taking too long and there are more CPUs around. shouldn't happen though. */
        AssertMsgFailed(("Invalid transaction id, %#x, not odd!\n", pGipCpu->u32TransactionId));
        ASMAtomicIncU32(&pGipCpu->u32TransactionId);
        pGipCpu->cErrors++;
        return;
    }

    /*
     * Recalc the update frequency every 0x800th time.
     */
    if (   pGip->u32Mode != SUPGIPMODE_INVARIANT_TSC   /* cuz we're not recalculating the frequency on invariant hosts. */
        && !(pGipCpu->u32TransactionId & (GIP_UPDATEHZ_RECALC_FREQ * 2 - 2)))
    {
        if (pGip->u64NanoTSLastUpdateHz)
        {
#ifdef RT_ARCH_AMD64 /** @todo fix 64-bit div here to work on x86 linux. */
            uint64_t u64Delta = u64NanoTS - pGip->u64NanoTSLastUpdateHz;
            uint32_t u32UpdateHz = (uint32_t)((RT_NS_1SEC_64 * GIP_UPDATEHZ_RECALC_FREQ) / u64Delta);
            if (u32UpdateHz <= 2000 && u32UpdateHz >= 30)
            {
                /** @todo r=ramshankar: Changing u32UpdateHz might screw up TSC frequency
                 *        calculation on non-invariant hosts if it changes the history decision
                 *        taken in supdrvGipDoUpdateCpu(). */
                uint64_t u64Interval = u64Delta / GIP_UPDATEHZ_RECALC_FREQ;
                ASMAtomicWriteU32(&pGip->u32UpdateHz, u32UpdateHz);
                ASMAtomicWriteU32(&pGip->u32UpdateIntervalNS, (uint32_t)u64Interval);
            }
#endif
        }
        /* The |1 ensures the value is never zero, so the recalc path is taken again. */
        ASMAtomicWriteU64(&pGip->u64NanoTSLastUpdateHz, u64NanoTS | 1);
    }

    /*
     * Update the data.
     */
    supdrvGipDoUpdateCpu(pDevExt, pGipCpu, u64NanoTS, u64TSC, iTick);

    /*
     * Complete transaction.
     */
    ASMAtomicIncU32(&pGipCpu->u32TransactionId);
}
2290
2291
/**
 * Updates the per cpu GIP data for the calling cpu.
 *
 * @param   pDevExt         The device extension.
 * @param   u64NanoTS       The current nanosecond timestamp.
 * @param   u64TSC          The current TSC timestamp.
 * @param   idCpu           The CPU ID.
 * @param   idApic          The APIC id for the CPU index.
 * @param   iTick           The current timer tick.
 *
 * @remarks Can be called with interrupts disabled!
 */
static void supdrvGipUpdatePerCpu(PSUPDRVDEVEXT pDevExt, uint64_t u64NanoTS, uint64_t u64TSC,
                                  RTCPUID idCpu, uint8_t idApic, uint64_t iTick)
{
    uint32_t iCpu;
    PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;

    /*
     * Avoid a potential race when a CPU online notification doesn't fire on
     * the onlined CPU but the tick creeps in before the event notification is
     * run.
     */
    if (RT_UNLIKELY(iTick == 1))
    {
        iCpu = supdrvGipFindOrAllocCpuIndexForCpuId(pGip, idCpu);
        if (pGip->aCPUs[iCpu].enmState == SUPGIPCPUSTATE_OFFLINE)
            supdrvGipMpEventOnlineOrInitOnCpu(pDevExt, idCpu);
    }

    iCpu = pGip->aiCpuFromApicId[idApic];
    if (RT_LIKELY(iCpu < pGip->cCpus))
    {
        PSUPGIPCPU pGipCpu = &pGip->aCPUs[iCpu];
        if (pGipCpu->idCpu == idCpu)
        {
            /*
             * Start update transaction.  An odd transaction id marks the
             * update as in progress for lock-free readers.
             */
            if (!(ASMAtomicIncU32(&pGipCpu->u32TransactionId) & 1))
            {
                AssertMsgFailed(("Invalid transaction id, %#x, not odd!\n", pGipCpu->u32TransactionId));
                ASMAtomicIncU32(&pGipCpu->u32TransactionId);
                pGipCpu->cErrors++;
                return;
            }

            /*
             * Update the data.
             */
            supdrvGipDoUpdateCpu(pDevExt, pGipCpu, u64NanoTS, u64TSC, iTick);

            /*
             * Complete transaction.
             */
            ASMAtomicIncU32(&pGipCpu->u32TransactionId);
        }
    }
}
2351
2352
/**
 * Timer callback function for the sync and invariant GIP modes.
 *
 * Samples TSC and system time with interrupts disabled, optionally applies the
 * per-CPU TSC delta, and hands the values to supdrvGipUpdate().
 *
 * @param   pTimer      The timer.
 * @param   pvUser      Opaque pointer to the device extension.
 * @param   iTick       The timer tick.
 */
static DECLCALLBACK(void) supdrvGipSyncAndInvariantTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
{
    RTCCUINTREG        uFlags;
    uint64_t           u64TSC;
    uint64_t           u64NanoTS;
    PSUPDRVDEVEXT      pDevExt = (PSUPDRVDEVEXT)pvUser;
    PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;

    uFlags    = ASMIntDisableFlags(); /* No interruptions please (real problem on S10). */
    u64TSC    = ASMReadTSC();
    u64NanoTS = RTTimeSystemNanoTS();

    if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_PRACTICALLY_ZERO)
    {
        /*
         * The calculations in supdrvGipUpdate() is somewhat timing sensitive,
         * missing timer ticks is not an option for GIP because the GIP users
         * will end up incrementing the time in 1ns per time getter call until
         * there is a complete timer update. So, if the delta has yet to be
         * calculated, we just pretend it is zero for now (the GIP users
         * probably won't have it for a wee while either and will do the same).
         *
         * We could maybe on some platforms try cross calling a CPU with a
         * working delta here, but it's not worth the hassle since the
         * likelihood of this happening is really low. On Windows, Linux, and
         * Solaris timers fire on the CPU they were registered/started on.
         * Darwin timers don't necessarily (they are high priority threads).
         */
        uint32_t iCpuSet = RTMpCpuIdToSetIndex(RTMpCpuId());
        uint16_t iGipCpu = RT_LIKELY(iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx))
                         ? pGip->aiCpuFromCpuSetIdx[iCpuSet] : UINT16_MAX;
        Assert(!ASMIntAreEnabled());
        if (RT_LIKELY(iGipCpu < pGip->cCpus))
        {
            /* INT64_MAX means "delta not measured yet" - treat it as zero. */
            int64_t iTscDelta = pGip->aCPUs[iGipCpu].i64TSCDelta;
            if (iTscDelta != INT64_MAX)
                u64TSC -= iTscDelta;
        }
    }

    supdrvGipUpdate(pDevExt, u64NanoTS, u64TSC, NIL_RTCPUID, iTick);

    ASMSetFlags(uFlags);
}
2404
2405
2406/**
2407 * Timer callback function for async GIP mode.
2408 * @param pTimer The timer.
2409 * @param pvUser Opaque pointer to the device extension.
2410 * @param iTick The timer tick.
2411 */
2412static DECLCALLBACK(void) supdrvGipAsyncTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
2413{
2414 RTCCUINTREG fOldFlags = ASMIntDisableFlags(); /* No interruptions please (real problem on S10). */
2415 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
2416 RTCPUID idCpu = RTMpCpuId();
2417 uint64_t u64TSC = ASMReadTSC();
2418 uint64_t NanoTS = RTTimeSystemNanoTS();
2419
2420 /** @todo reset the transaction number and whatnot when iTick == 1. */
2421 if (pDevExt->idGipMaster == idCpu)
2422 supdrvGipUpdate(pDevExt, NanoTS, u64TSC, idCpu, iTick);
2423 else
2424 supdrvGipUpdatePerCpu(pDevExt, NanoTS, u64TSC, idCpu, ASMGetApicId(), iTick);
2425
2426 ASMSetFlags(fOldFlags);
2427}
2428
2429
2430
2431
2432/*
2433 *
2434 *
2435 * TSC Delta Measurements And Related Code
2436 * TSC Delta Measurements And Related Code
2437 * TSC Delta Measurements And Related Code
2438 *
2439 *
2440 */
2441
2442
/*
 * Select TSC delta measurement algorithm.
 */
#if 1
# define GIP_TSC_DELTA_METHOD_1
#else
# define GIP_TSC_DELTA_METHOD_2
#endif

/** For padding variables to keep them away from other cache lines.  Better too
 * large than too small!
 * @remarks Current AMD64 and x86 CPUs seems to use 64 bytes.  There are claims
 *          that NetBurst had 128 byte cache lines while the 486 thru Pentium
 *          III had 32 bytes cache lines. */
#define GIP_TSC_DELTA_CACHE_LINE_SIZE           128


/**
 * TSC delta measurement algorithm \#2 result entry.
 */
typedef struct SUPDRVTSCDELTAMETHOD2ENTRY
{
    uint32_t    iSeqMine;
    uint32_t    iSeqOther;
    uint64_t    uTsc;
} SUPDRVTSCDELTAMETHOD2ENTRY;

/**
 * TSC delta measurement algorithm \#2 Data.
 */
typedef struct SUPDRVTSCDELTAMETHOD2
{
    /** Padding to make sure the iCurSeqNo is in its own cache line. */
    uint64_t                    au64CacheLinePaddingBefore[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t) - 1];
    /** The current sequence number of this worker. */
    uint32_t volatile           iCurSeqNo;
    /** Padding to make sure the iCurSeqNo is in its own cache line. */
    uint32_t                    au64CacheLinePaddingAfter[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint32_t) - 1];
    /** Result table. */
    SUPDRVTSCDELTAMETHOD2ENTRY  aResults[96];
} SUPDRVTSCDELTAMETHOD2;
/** Pointer to the data for TSC delta measurement algorithm \#2 .*/
typedef SUPDRVTSCDELTAMETHOD2 *PSUPDRVTSCDELTAMETHOD2;


/**
 * The TSC delta synchronization struct, version 2.
 *
 * The synchronization variable is completely isolated in its own cache line
 * (provided our max cache line size estimate is correct).
 */
typedef struct SUPTSCDELTASYNC2
{
    /** Padding to make sure the uVar1 is in its own cache line. */
    uint64_t            au64CacheLinePaddingBefore[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t)];

    /** The synchronization variable, holds values GIP_TSC_DELTA_SYNC_*. */
    volatile uint32_t   uSyncVar;
    /** Sequence synchronizing variable used for post 'GO' synchronization. */
    volatile uint32_t   uSyncSeq;

    /** Padding to make sure the uVar1 is in its own cache line. */
    uint64_t            au64CacheLinePaddingAfter[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t) - 2];

    /** Start RDTSC value.  Put here mainly to save stack space. */
    uint64_t            uTscStart;
    /** Copy of SUPDRVGIPTSCDELTARGS::cMaxTscTicks. */
    uint64_t            cMaxTscTicks;
} SUPTSCDELTASYNC2;
AssertCompileSize(SUPTSCDELTASYNC2, GIP_TSC_DELTA_CACHE_LINE_SIZE * 2 + sizeof(uint64_t));
typedef SUPTSCDELTASYNC2 *PSUPTSCDELTASYNC2;

/** Prestart wait. */
#define GIP_TSC_DELTA_SYNC2_PRESTART_WAIT    UINT32_C(0x0ffe)
/** Prestart aborted. */
#define GIP_TSC_DELTA_SYNC2_PRESTART_ABORT   UINT32_C(0x0fff)
/** Ready (on your mark). */
#define GIP_TSC_DELTA_SYNC2_READY            UINT32_C(0x1000)
/** Steady (get set). */
#define GIP_TSC_DELTA_SYNC2_STEADY           UINT32_C(0x1001)
/** Go! */
#define GIP_TSC_DELTA_SYNC2_GO               UINT32_C(0x1002)
/** Used by the verification test. */
#define GIP_TSC_DELTA_SYNC2_GO_GO            UINT32_C(0x1003)

/** We reached the time limit. */
#define GIP_TSC_DELTA_SYNC2_TIMEOUT          UINT32_C(0x1ffe)
/** The other party won't touch the sync struct ever again. */
#define GIP_TSC_DELTA_SYNC2_FINAL            UINT32_C(0x1fff)
2532
2533
/**
 * Argument package/state passed by supdrvMeasureTscDeltaOne() to the RTMpOn
 * callback worker.
 *
 * The master and worker state variables are kept in separate cache lines
 * (see the padding members) so the two CPUs don't false-share during the
 * measurement.
 */
typedef struct SUPDRVGIPTSCDELTARGS
{
    /** The device extension.   */
    PSUPDRVDEVEXT               pDevExt;
    /** Pointer to the GIP CPU array entry for the worker. */
    PSUPGIPCPU                  pWorker;
    /** Pointer to the GIP CPU array entry for the master. */
    PSUPGIPCPU                  pMaster;
    /** The maximum number of ticks to spend in supdrvMeasureTscDeltaCallback.
     * (This is what we need a rough TSC frequency for.)  */
    uint64_t                    cMaxTscTicks;
    /** Used to abort synchronization setup. */
    bool volatile               fAbortSetup;

#if 0
    /** Method 1 data. */
    struct
    {
    } M1;
#endif

#ifdef GIP_TSC_DELTA_METHOD_2
    struct
    {
        PSUPDRVTSCDELTAMETHOD2  pMasterData;
        PSUPDRVTSCDELTAMETHOD2  pWorkerData;
        uint32_t                cHits;
        bool                    fLagMaster;
        bool                    fLagWorker;
        bool volatile           fQuitEarly;
    } M2;
#endif


    /** Padding to make sure the master variables live in its own cache lines. */
    uint64_t                    au64CacheLinePaddingBefore[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t)];
    /** Pointer to the master's synchronization struct (on stack). */
    PSUPTSCDELTASYNC2 volatile  pSyncMaster;
    /** Verification test TSC values for the master. */
    uint64_t volatile           auVerifyMasterTscs[32];
    /** The verifier verdict, VINF_SUCCESS if ok, VERR_OUT_OF_RANGE if not,
     * VERR_TRY_AGAIN on timeout. */
    int32_t                     rcVerify;
    /** The maximum difference between TSC read during delta verification. */
    int64_t                     cMaxVerifyTscTicks;
    /** The minimum difference between two TSC reads during verification. */
    int64_t                     cMinVerifyTscTicks;
    /** The bad TSC diff, worker relative to master (= worker - master).
     * Negative value means the worker is behind the master.  */
    int64_t                     iVerifyBadTscDiff;

    /** Padding to make sure the uVar1 is in its own cache line. */
    uint64_t                    au64CacheLinePaddingBetween[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t)];
    /** Pointer to the worker's synchronization struct (on stack). */
    PSUPTSCDELTASYNC2 volatile  pSyncWorker;
    /** Verification test TSC values for the worker. */
    uint64_t volatile           auVerifyWorkerTscs[32];

    /** Padding to make sure the above is in its own cache line. */
    uint64_t                    au64CacheLinePaddingAfter[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t)];
} SUPDRVGIPTSCDELTARGS;
typedef SUPDRVGIPTSCDELTARGS *PSUPDRVGIPTSCDELTARGS;
2600
2601
/** @name Macros that implement the basic synchronization steps common to
 *        the algorithms.
 *
 * Must be used from loop as the timeouts are implemented via 'break' statements
 * at the moment.
 *
 * @{
 */
/* Debug-build loop watchdog: counts iterations and breaks into the debugger
   if a sync loop spins for suspiciously long.  No-ops in normal builds. */
#if defined(DEBUG_bird) /* || defined(VBOX_STRICT) */
# define TSCDELTA_DBG_VARS()            uint32_t iDbgCounter
# define TSCDELTA_DBG_START_LOOP()      do { iDbgCounter = 0; } while (0)
# define TSCDELTA_DBG_CHECK_LOOP() \
    do { iDbgCounter++; if ((iDbgCounter & UINT32_C(0x01ffffff)) == 0) RT_BREAKPOINT(); } while (0)
#else
# define TSCDELTA_DBG_VARS()            ((void)0)
# define TSCDELTA_DBG_START_LOOP()      ((void)0)
# define TSCDELTA_DBG_CHECK_LOOP()      ((void)0)
#endif
2620
2621
/**
 * Performs the pre-measurement rendezvous between the master and worker CPU.
 *
 * Implements the ready/steady/go handshake via the uSyncVar states and then
 * attempts to bring both CPUs into mostly lockstep execution using uSyncSeq.
 * On success, interrupts are left DISABLED on the calling CPU and the caller
 * must restore them via the corresponding SYNC_AFTER step.
 *
 * @returns true on success (EFLAGS saved in *pfEFlags, interrupts disabled),
 *          false on abort/timeout (interrupts restored).
 * @param   pMySync     My synchronization structure.
 * @param   pOtherSync  The partner CPU's synchronization structure.
 * @param   fIsMaster   Set if we are the master, clear if the worker.
 * @param   pfEFlags    Where to store the saved EFLAGS for the _After step.
 */
static bool supdrvTscDeltaSync2_Before(PSUPTSCDELTASYNC2 pMySync, PSUPTSCDELTASYNC2 pOtherSync,
                                       bool fIsMaster, PRTCCUINTREG pfEFlags)
{
    uint32_t        iMySeq  = fIsMaster ? 0 : 256;
    uint32_t const  iMaxSeq = iMySeq + 16;  /* For the last loop, darn linux/freebsd C-ishness. */
    uint32_t        u32Tmp;
    uint32_t        iSync2Loops = 0;
    RTCCUINTREG     fEFlags;
    TSCDELTA_DBG_VARS();

    *pfEFlags = X86_EFL_IF | X86_EFL_1; /* should shut up most nagging compilers. */

    /*
     * The master tells the worker to get on its mark.
     */
    if (fIsMaster)
    {
        if (RT_LIKELY(ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_STEADY, GIP_TSC_DELTA_SYNC2_READY)))
        { /* likely*/ }
        else
            return false;
    }

    /*
     * Wait for the on your mark signal (ack in the master case). We process timeouts here.
     */
    ASMAtomicWriteU32(&(pMySync)->uSyncSeq, 0);
    for (;;)
    {
        fEFlags = ASMIntDisableFlags();
        u32Tmp = ASMAtomicReadU32(&pMySync->uSyncVar);
        if (u32Tmp == GIP_TSC_DELTA_SYNC2_STEADY)
            break;

        ASMSetFlags(fEFlags);
        ASMNopPause();

        /* Abort? */
        if (u32Tmp != GIP_TSC_DELTA_SYNC2_READY)
            break;

        /* Check for timeouts every so often (not every loop in case RDTSC is
           trapping or something).  Must check the first time around. */
#if 0 /* For debugging the timeout paths. */
        static uint32_t volatile xxx;
#endif
        if (   (   (iSync2Loops & 0x3ff) == 0
                && ASMReadTSC() - pMySync->uTscStart > pMySync->cMaxTscTicks)
#if 0 /* This is crazy, I know, but enable this code and the results are markedly better when enabled on the 1.4GHz AMD (debug). */
            || (!fIsMaster && (++xxx & 0xf) == 0)
#endif
           )
        {
            /* Try switch our own state into timeout mode so the master cannot tell us to 'GO',
               ignore the timeout if we've got the go ahead already (simpler). */
            if (ASMAtomicCmpXchgU32(&pMySync->uSyncVar, GIP_TSC_DELTA_SYNC2_TIMEOUT, GIP_TSC_DELTA_SYNC2_READY))
            {
                ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_TIMEOUT, GIP_TSC_DELTA_SYNC2_STEADY);
                return false;
            }
        }
        iSync2Loops++;
    }

    /*
     * Interrupts are now disabled and will remain disabled until we do
     * TSCDELTA_MASTER_SYNC_AFTER / TSCDELTA_OTHER_SYNC_AFTER.
     */
    *pfEFlags = fEFlags;

    /*
     * The worker tells the master that it is on its mark and that the master
     * need to get into position as well.
     */
    if (!fIsMaster)
    {
        if (RT_LIKELY(ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_STEADY, GIP_TSC_DELTA_SYNC2_READY)))
        { /* likely */ }
        else
        {
            ASMSetFlags(fEFlags);
            return false;
        }
    }

    /*
     * The master sends the 'go' to the worker and wait for ACK.
     */
    if (fIsMaster)
    {
        if (RT_LIKELY(ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO, GIP_TSC_DELTA_SYNC2_STEADY)))
        { /* likely */ }
        else
        {
            ASMSetFlags(fEFlags);
            return false;
        }
    }

    /*
     * Wait for the 'go' signal (ack in the master case).
     */
    TSCDELTA_DBG_START_LOOP();
    for (;;)
    {
        u32Tmp = ASMAtomicReadU32(&pMySync->uSyncVar);
        if (u32Tmp == GIP_TSC_DELTA_SYNC2_GO)
            break;
        if (RT_LIKELY(u32Tmp == GIP_TSC_DELTA_SYNC2_STEADY))
        { /* likely */ }
        else
        {
            ASMSetFlags(fEFlags);
            return false;
        }

        TSCDELTA_DBG_CHECK_LOOP();
        ASMNopPause();
    }

    /*
     * The worker acks the 'go' (shouldn't fail).
     */
    if (!fIsMaster)
    {
        if (RT_LIKELY(ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO, GIP_TSC_DELTA_SYNC2_STEADY)))
        { /* likely */ }
        else
        {
            ASMSetFlags(fEFlags);
            return false;
        }
    }

    /*
     * Try enter mostly lockstep execution with it.
     */
    for (;;)
    {
        uint32_t iOtherSeq1, iOtherSeq2;
        ASMCompilerBarrier();
        ASMSerializeInstruction();

        ASMAtomicWriteU32(&pMySync->uSyncSeq, iMySeq);
        ASMNopPause();
        iOtherSeq1 = ASMAtomicXchgU32(&pOtherSync->uSyncSeq, iMySeq);
        ASMNopPause();
        iOtherSeq2 = ASMAtomicReadU32(&pMySync->uSyncSeq);

        ASMCompilerBarrier();
        /* Both sides observed the same sequence value -> we're in lockstep. */
        if (iOtherSeq1 == iOtherSeq2)
            return true;

        /* Did the other guy give up? Should we give up? */
        if (   iOtherSeq1 == UINT32_MAX
            || iOtherSeq2 == UINT32_MAX)
            return true;
        if (++iMySeq >= iMaxSeq)
        {
            ASMAtomicWriteU32(&pMySync->uSyncSeq, UINT32_MAX);
            return true;
        }
        ASMNopPause();
    }
}
2787
/** Master side of the pre-measurement rendezvous.
 * @note Relies on a local 'uFlags' variable and times out via 'break', so it
 *       must be used from inside a loop. */
#define TSCDELTA_MASTER_SYNC_BEFORE(a_pMySync, a_pOtherSync) \
    do { \
        if (RT_LIKELY(supdrvTscDeltaSync2_Before(a_pMySync, a_pOtherSync, true /*fMaster*/, &uFlags))) \
        { /*likely*/ } \
        else break; \
    } while (0)
/** Worker side of the pre-measurement rendezvous.
 * @note Relies on a local 'uFlags' variable and times out via 'break', so it
 *       must be used from inside a loop. */
#define TSCDELTA_OTHER_SYNC_BEFORE(a_pMySync, a_pOtherSync) \
    do { \
        if (RT_LIKELY(supdrvTscDeltaSync2_Before(a_pMySync, a_pOtherSync, false /*fMaster*/, &uFlags))) \
        { /*likely*/ } \
        else break; \
    } while (0)
2800
2801
/**
 * Waits for the partner CPU to signal 'ready' after a measurement round.
 *
 * Re-enables interrupts (restoring fEFlags saved by the _Before step) and
 * then spins until uSyncVar leaves the GO state.
 *
 * @returns true when the 'ready' state is reached, false if an unexpected
 *          state is observed (shouldn't happen).
 * @param   pMySync     My synchronization structure.
 * @param   pOtherSync  The partner CPU's synchronization structure (unused here,
 *                      kept for symmetry with the _Before step).
 * @param   fEFlags     The saved EFLAGS to restore.
 */
static bool supdrvTscDeltaSync2_After(PSUPTSCDELTASYNC2 pMySync, PSUPTSCDELTASYNC2 pOtherSync, RTCCUINTREG fEFlags)
{
    TSCDELTA_DBG_VARS();

    /*
     * Wait for the 'ready' signal.  In the master's case, this means the
     * worker has completed its data collection, while in the worker's case it
     * means the master is done processing the data and it's time for the next
     * loop iteration (or whatever).
     */
    ASMSetFlags(fEFlags);
    TSCDELTA_DBG_START_LOOP();
    for (;;)
    {
        uint32_t u32Tmp = ASMAtomicReadU32(&pMySync->uSyncVar);
        if (u32Tmp == GIP_TSC_DELTA_SYNC2_READY)
            return true;
        ASMNopPause();
        if (u32Tmp != GIP_TSC_DELTA_SYNC2_GO)
            return false; /* shouldn't ever happen! */
        TSCDELTA_DBG_CHECK_LOOP();
        ASMNopPause();
    }
}
2826
/** Master side of the post-measurement synchronization.
 * @note Relies on a local 'uFlags' variable and exits via 'break', so it must
 *       be used from inside a loop. */
#define TSCDELTA_MASTER_SYNC_AFTER(a_pMySync, a_pOtherSync) \
    do { \
        if (supdrvTscDeltaSync2_After(a_pMySync, a_pOtherSync, uFlags)) \
        { /* likely */ } \
        else break; \
    } while (0)

#define TSCDELTA_MASTER_KICK_OTHER_OUT_OF_AFTER(a_pMySync, a_pOtherSync) \
    do {\
        /* \
         * Tell the worker that we're done processing the data and ready for the next round. \
         */ \
        if (!ASMAtomicCmpXchgU32(&(a_pOtherSync)->uSyncVar, GIP_TSC_DELTA_SYNC2_READY, GIP_TSC_DELTA_SYNC2_GO)) \
        { \
            ASMSetFlags(uFlags); \
            break; \
        } \
    } while (0)

#define TSCDELTA_OTHER_SYNC_AFTER(a_pMySync, a_pOtherSync) \
    do { \
        /* \
         * Tell the master that we're done collecting data and wait for the next round to start. \
         */ \
        if (!ASMAtomicCmpXchgU32(&(a_pOtherSync)->uSyncVar, GIP_TSC_DELTA_SYNC2_READY, GIP_TSC_DELTA_SYNC2_GO)) \
        { \
            ASMSetFlags(uFlags); \
            break; \
        } \
        if (supdrvTscDeltaSync2_After(a_pMySync, a_pOtherSync, uFlags)) \
        { /* likely */ } \
        else break; \
    } while (0)
/** @} */
2861
2862#ifdef GIP_TSC_DELTA_METHOD_1
2863
/**
 * TSC delta measurement algorithm \#1 (GIP_TSC_DELTA_METHOD_1).
 *
 *
 * We ignore the first few runs of the loop in order to prime the
 * cache. Also, we need to be careful about using 'pause' instruction
 * in critical busy-wait loops in this code - it can cause undesired
 * behaviour with hyperthreading.
 *
 * We try to minimize the measurement error by computing the minimum
 * read time of the compare statement in the worker by taking TSC
 * measurements across it.
 *
 * It must be noted that the computed minimum read time is mostly to
 * eliminate huge deltas when the worker is too early and doesn't by
 * itself help produce more accurate deltas. We allow two times the
 * computed minimum as an arbitrary acceptable threshold. Therefore,
 * it is still possible to get negative deltas where there are none
 * when the worker is earlier. As long as these occasional negative
 * deltas are lower than the time it takes to exit guest-context and
 * the OS to reschedule EMT on a different CPU we won't expose a TSC
 * that jumped backwards. It is because of the existence of the
 * negative deltas that we don't recompute the delta with the master and
 * worker interchanged to eliminate the remaining measurement error.
 *
 *
 * @param   pArgs           The argument/state data.
 * @param   pMySync         My synchronization structure.
 * @param   pOtherSync      My partner's synchronization structure.
 * @param   fIsMaster       Set if master, clear if worker.
 * @param   iTry            The attempt number.
 */
2896static void supdrvTscDeltaMethod1Loop(PSUPDRVGIPTSCDELTARGS pArgs, PSUPTSCDELTASYNC2 pMySync, PSUPTSCDELTASYNC2 pOtherSync,
2897 bool fIsMaster, uint32_t iTry)
2898{
2899 PSUPGIPCPU pGipCpuWorker = pArgs->pWorker;
2900 PSUPGIPCPU pGipCpuMaster = pArgs->pMaster;
2901 uint64_t uMinCmpReadTime = UINT64_MAX;
2902 unsigned iLoop;
2903 NOREF(iTry);
2904
2905 for (iLoop = 0; iLoop < GIP_TSC_DELTA_LOOPS; iLoop++)
2906 {
2907 RTCCUINTREG uFlags;
2908 if (fIsMaster)
2909 {
2910 /*
2911 * The master.
2912 */
2913 AssertMsg(pGipCpuMaster->u64TSCSample == GIP_TSC_DELTA_RSVD,
2914 ("%#llx idMaster=%#x idWorker=%#x (idGipMaster=%#x)\n",
2915 pGipCpuMaster->u64TSCSample, pGipCpuMaster->idCpu, pGipCpuWorker->idCpu, pArgs->pDevExt->idGipMaster));
2916 TSCDELTA_MASTER_SYNC_BEFORE(pMySync, pOtherSync);
2917
2918 do
2919 {
2920 ASMSerializeInstruction();
2921 ASMAtomicWriteU64(&pGipCpuMaster->u64TSCSample, ASMReadTSC());
2922 } while (pGipCpuMaster->u64TSCSample == GIP_TSC_DELTA_RSVD);
2923
2924 TSCDELTA_MASTER_SYNC_AFTER(pMySync, pOtherSync);
2925
2926 /* Process the data. */
2927 if (iLoop > GIP_TSC_DELTA_PRIMER_LOOPS + GIP_TSC_DELTA_READ_TIME_LOOPS)
2928 {
2929 if (pGipCpuWorker->u64TSCSample != GIP_TSC_DELTA_RSVD)
2930 {
2931 int64_t iDelta = pGipCpuWorker->u64TSCSample
2932 - (pGipCpuMaster->u64TSCSample - pGipCpuMaster->i64TSCDelta);
2933 if ( iDelta >= GIP_TSC_DELTA_INITIAL_MASTER_VALUE
2934 ? iDelta < pGipCpuWorker->i64TSCDelta
2935 : iDelta > pGipCpuWorker->i64TSCDelta || pGipCpuWorker->i64TSCDelta == INT64_MAX)
2936 pGipCpuWorker->i64TSCDelta = iDelta;
2937 }
2938 }
2939
2940 /* Reset our TSC sample and tell the worker to move on. */
2941 ASMAtomicWriteU64(&pGipCpuMaster->u64TSCSample, GIP_TSC_DELTA_RSVD);
2942 TSCDELTA_MASTER_KICK_OTHER_OUT_OF_AFTER(pMySync, pOtherSync);
2943 }
2944 else
2945 {
2946 /*
2947 * The worker.
2948 */
2949 uint64_t uTscWorker;
2950 uint64_t uTscWorkerFlushed;
2951 uint64_t uCmpReadTime;
2952
2953 ASMAtomicReadU64(&pGipCpuMaster->u64TSCSample); /* Warm the cache line. */
2954 TSCDELTA_OTHER_SYNC_BEFORE(pMySync, pOtherSync);
2955
2956 /*
2957 * Keep reading the TSC until we notice that the master has read his. Reading
2958 * the TSC -after- the master has updated the memory is way too late. We thus
2959 * compensate by trying to measure how long it took for the worker to notice
2960 * the memory flushed from the master.
2961 */
2962 do
2963 {
2964 ASMSerializeInstruction();
2965 uTscWorker = ASMReadTSC();
2966 } while (pGipCpuMaster->u64TSCSample == GIP_TSC_DELTA_RSVD);
2967 ASMSerializeInstruction();
2968 uTscWorkerFlushed = ASMReadTSC();
2969
2970 uCmpReadTime = uTscWorkerFlushed - uTscWorker;
2971 if (iLoop > GIP_TSC_DELTA_PRIMER_LOOPS + GIP_TSC_DELTA_READ_TIME_LOOPS)
2972 {
2973 /* This is totally arbitrary a.k.a I don't like it but I have no better ideas for now. */
2974 if (uCmpReadTime < (uMinCmpReadTime << 1))
2975 {
2976 ASMAtomicWriteU64(&pGipCpuWorker->u64TSCSample, uTscWorker);
2977 if (uCmpReadTime < uMinCmpReadTime)
2978 uMinCmpReadTime = uCmpReadTime;
2979 }
2980 else
2981 ASMAtomicWriteU64(&pGipCpuWorker->u64TSCSample, GIP_TSC_DELTA_RSVD);
2982 }
2983 else if (iLoop > GIP_TSC_DELTA_PRIMER_LOOPS)
2984 {
2985 if (uCmpReadTime < uMinCmpReadTime)
2986 uMinCmpReadTime = uCmpReadTime;
2987 }
2988
2989 TSCDELTA_OTHER_SYNC_AFTER(pMySync, pOtherSync);
2990 }
2991 }
2992
2993 /*
2994 * We must reset the worker TSC sample value in case it gets picked as a
2995 * GIP master later on (it's trashed above, naturally).
2996 */
2997 if (!fIsMaster)
2998 ASMAtomicWriteU64(&pGipCpuWorker->u64TSCSample, GIP_TSC_DELTA_RSVD);
2999}
3000
3001
3002/**
3003 * Initializes the argument/state data belonging to algorithm \#1.
3004 *
3005 * @returns VBox status code.
3006 * @param pArgs The argument/state data.
3007 */
3008static int supdrvTscDeltaMethod1Init(PSUPDRVGIPTSCDELTARGS pArgs)
3009{
3010 NOREF(pArgs);
3011 return VINF_SUCCESS;
3012}
3013
3014
3015/**
3016 * Undoes what supdrvTscDeltaMethod1Init() did.
3017 *
3018 * @param pArgs The argument/state data.
3019 */
3020static void supdrvTscDeltaMethod1Delete(PSUPDRVGIPTSCDELTARGS pArgs)
3021{
3022 NOREF(pArgs);
3023}
3024
3025#endif /* GIP_TSC_DELTA_METHOD_1 */
3026
3027
3028#ifdef GIP_TSC_DELTA_METHOD_2
3029/*
3030 * TSC delta measurement algorithm \#2 configuration and code - Experimental!!
3031 */
3032
3033# define GIP_TSC_DELTA_M2_LOOPS (12 + GIP_TSC_DELTA_M2_PRIMER_LOOPS)
3034# define GIP_TSC_DELTA_M2_PRIMER_LOOPS 1
3035
3036
/**
 * Digests one round of method \#2 samples on the master CPU.
 *
 * Cross-references the master and worker sample tables via the recorded
 * sequence numbers: an entry whose partner sequence number has the low bit
 * set indicates the partner was mid-sample (between its two iCurSeqNo
 * increments, i.e. right around its TSC read - see
 * supdrvTscDeltaMethod2CollectData).  Matching pairs yield delta candidates;
 * the candidate of smallest magnitude seen so far is stored in the worker's
 * i64TSCDelta.  May request early termination of the measurement loop via
 * pArgs->M2.fQuitEarly when the result is good enough.
 *
 * @param   pArgs           The argument/state data.
 * @param   iLoop           The current measurement loop iteration, used by
 *                          the quit-early heuristics.
 */
static void supdrvTscDeltaMethod2ProcessDataOnMaster(PSUPDRVGIPTSCDELTARGS pArgs, uint32_t iLoop)
{
    PSUPDRVTSCDELTAMETHOD2  pMasterData = pArgs->M2.pMasterData;
    PSUPDRVTSCDELTAMETHOD2  pOtherData  = pArgs->M2.pWorkerData;
    int64_t                 iMasterTscDelta = pArgs->pMaster->i64TSCDelta;
    int64_t                 iBestDelta  = pArgs->pWorker->i64TSCDelta;
    uint32_t                idxResult;
    uint32_t                cHits = 0;

    /*
     * Look for matching entries in the master and worker tables.
     */
    for (idxResult = 0; idxResult < RT_ELEMENTS(pMasterData->aResults); idxResult++)
    {
        uint32_t idxOther = pMasterData->aResults[idxResult].iSeqOther;
        if (idxOther & 1)  /* Low bit set => the worker was mid-sample when we looked. */
        {
            idxOther >>= 1; /* The remaining bits index the worker's table. */
            if (idxOther < RT_ELEMENTS(pOtherData->aResults))
            {
                /* Only a hit if the worker saw us mid-sample at the same time. */
                if (pOtherData->aResults[idxOther].iSeqOther == pMasterData->aResults[idxResult].iSeqMine)
                {
                    /* Delta = worker TSC - (master TSC adjusted by the master's own delta). */
                    int64_t iDelta;
                    iDelta = pOtherData->aResults[idxOther].uTsc
                           - (pMasterData->aResults[idxResult].uTsc - iMasterTscDelta);
                    /* Keep the candidate of smallest magnitude (INT64_MAX = "none yet"). */
                    if (  iDelta >= GIP_TSC_DELTA_INITIAL_MASTER_VALUE
                        ? iDelta < iBestDelta
                        : iDelta > iBestDelta || iBestDelta == INT64_MAX)
                        iBestDelta = iDelta;
                    cHits++;
                }
            }
        }
    }

    /*
     * Save the results.
     */
    if (cHits > 2) /* Demand a few corroborating hits before trusting the round. */
        pArgs->pWorker->i64TSCDelta = iBestDelta;
    pArgs->M2.cHits += cHits;

    /*
     * Check and see if we can quit a little early.  If the result is already
     * extremely good (+/-16 ticks seems reasonable), just stop.
     */
    if (  iBestDelta >= 0 + GIP_TSC_DELTA_INITIAL_MASTER_VALUE
        ? iBestDelta <=  16 + GIP_TSC_DELTA_INITIAL_MASTER_VALUE
        : iBestDelta >= -16 + GIP_TSC_DELTA_INITIAL_MASTER_VALUE)
    {
        /*SUPR0Printf("quitting early #1: hits=%#x iLoop=%d iBestDelta=%lld\n", cHits, iLoop, iBestDelta);*/
        ASMAtomicWriteBool(&pArgs->M2.fQuitEarly, true);
    }
    /*
     * After a while, just stop if we get sufficient hits.
     */
    else if (   iLoop >= GIP_TSC_DELTA_M2_LOOPS / 3
             && cHits > 8)
    {
        uint32_t const cHitsNeeded = GIP_TSC_DELTA_M2_LOOPS * RT_ELEMENTS(pArgs->M2.pMasterData->aResults) / 4; /* 25% */
        if (   pArgs->M2.cHits >= cHitsNeeded
            && (  iBestDelta >= 0 + GIP_TSC_DELTA_INITIAL_MASTER_VALUE
                ? iBestDelta <=  GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO + GIP_TSC_DELTA_INITIAL_MASTER_VALUE
                : iBestDelta >= -GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO + GIP_TSC_DELTA_INITIAL_MASTER_VALUE) )
        {
            /*SUPR0Printf("quitting early hits=%#x (%#x) needed=%#x iLoop=%d iBestDelta=%lld\n",
                          pArgs->M2.cHits, cHits, cHitsNeeded, iLoop, iBestDelta);*/
            ASMAtomicWriteBool(&pArgs->M2.fQuitEarly, true);
        }
    }
}
3108
3109
3110/**
3111 * The core function of the 2nd TSC delta mesurment algorithm.
3112 *
3113 * The idea here is that we have the two CPUs execute the exact same code
3114 * collecting a largish set of TSC samples. The code has one data dependency on
3115 * the other CPU which intention it is to synchronize the execution as well as
3116 * help cross references the two sets of TSC samples (the sequence numbers).
3117 *
3118 * The @a fLag parameter is used to modify the execution a tiny bit on one or
3119 * both of the CPUs. When @a fLag differs between the CPUs, it is thought that
3120 * it will help with making the CPUs enter lock step execution occationally.
3121 *
3122 */
3123static void supdrvTscDeltaMethod2CollectData(PSUPDRVTSCDELTAMETHOD2 pMyData, uint32_t volatile *piOtherSeqNo, bool fLag)
3124{
3125 SUPDRVTSCDELTAMETHOD2ENTRY *pEntry = &pMyData->aResults[0];
3126 uint32_t cLeft = RT_ELEMENTS(pMyData->aResults);
3127
3128 ASMAtomicWriteU32(&pMyData->iCurSeqNo, 0);
3129 ASMSerializeInstruction();
3130 while (cLeft-- > 0)
3131 {
3132 uint64_t uTsc;
3133 uint32_t iSeqMine = ASMAtomicIncU32(&pMyData->iCurSeqNo);
3134 uint32_t iSeqOther = ASMAtomicReadU32(piOtherSeqNo);
3135 ASMCompilerBarrier();
3136 ASMSerializeInstruction(); /* Way better result than with ASMMemoryFenceSSE2() in this position! */
3137 uTsc = ASMReadTSC();
3138 ASMAtomicIncU32(&pMyData->iCurSeqNo);
3139 ASMCompilerBarrier();
3140 ASMSerializeInstruction();
3141 pEntry->iSeqMine = iSeqMine;
3142 pEntry->iSeqOther = iSeqOther;
3143 pEntry->uTsc = uTsc;
3144 pEntry++;
3145 ASMSerializeInstruction();
3146 if (fLag)
3147 ASMNopPause();
3148 }
3149}
3150
3151
3152/**
3153 * TSC delta measurment algorithm \#2 (GIP_TSC_DELTA_METHOD_2).
3154 *
3155 * See supdrvTscDeltaMethod2CollectData for algorithm details.
3156 *
3157 * @param pArgs The argument/state data.
3158 * @param pMySync My synchronization structure.
3159 * @param pOtherSync My partner's synchronization structure.
3160 * @param fIsMaster Set if master, clear if worker.
3161 * @param iTry The attempt number.
3162 */
3163static void supdrvTscDeltaMethod2Loop(PSUPDRVGIPTSCDELTARGS pArgs, PSUPTSCDELTASYNC2 pMySync, PSUPTSCDELTASYNC2 pOtherSync,
3164 bool fIsMaster, uint32_t iTry)
3165{
3166 unsigned iLoop;
3167
3168 if (fIsMaster)
3169 ASMAtomicWriteBool(&pArgs->M2.fQuitEarly, false);
3170
3171 for (iLoop = 0; iLoop < GIP_TSC_DELTA_M2_LOOPS; iLoop++)
3172 {
3173 RTCCUINTREG uFlags;
3174 if (fIsMaster)
3175 {
3176 /*
3177 * Adjust the loop lag fudge.
3178 */
3179# if GIP_TSC_DELTA_M2_PRIMER_LOOPS > 0
3180 if (iLoop < GIP_TSC_DELTA_M2_PRIMER_LOOPS)
3181 {
3182 /* Lag during the priming to be nice to everyone.. */
3183 pArgs->M2.fLagMaster = true;
3184 pArgs->M2.fLagWorker = true;
3185 }
3186 else
3187# endif
3188 if (iLoop < (GIP_TSC_DELTA_M2_LOOPS - GIP_TSC_DELTA_M2_PRIMER_LOOPS) / 4)
3189 {
3190 /* 25 % of the body without lagging. */
3191 pArgs->M2.fLagMaster = false;
3192 pArgs->M2.fLagWorker = false;
3193 }
3194 else if (iLoop < (GIP_TSC_DELTA_M2_LOOPS - GIP_TSC_DELTA_M2_PRIMER_LOOPS) / 4 * 2)
3195 {
3196 /* 25 % of the body with both lagging. */
3197 pArgs->M2.fLagMaster = true;
3198 pArgs->M2.fLagWorker = true;
3199 }
3200 else
3201 {
3202 /* 50% of the body with alternating lag. */
3203 pArgs->M2.fLagMaster = (iLoop & 1) == 0;
3204 pArgs->M2.fLagWorker = (iLoop & 1) == 1;
3205 }
3206
3207 /*
3208 * Sync up with the worker and collect data.
3209 */
3210 TSCDELTA_MASTER_SYNC_BEFORE(pMySync, pOtherSync);
3211 supdrvTscDeltaMethod2CollectData(pArgs->M2.pMasterData, &pArgs->M2.pWorkerData->iCurSeqNo, pArgs->M2.fLagMaster);
3212 TSCDELTA_MASTER_SYNC_AFTER(pMySync, pOtherSync);
3213
3214 /*
3215 * Process the data.
3216 */
3217# if GIP_TSC_DELTA_M2_PRIMER_LOOPS > 0
3218 if (iLoop >= GIP_TSC_DELTA_M2_PRIMER_LOOPS)
3219# endif
3220 supdrvTscDeltaMethod2ProcessDataOnMaster(pArgs, iLoop);
3221
3222 TSCDELTA_MASTER_KICK_OTHER_OUT_OF_AFTER(pMySync, pOtherSync);
3223 }
3224 else
3225 {
3226 /*
3227 * The worker.
3228 */
3229 TSCDELTA_OTHER_SYNC_BEFORE(pMySync, pOtherSync);
3230 supdrvTscDeltaMethod2CollectData(pArgs->M2.pWorkerData, &pArgs->M2.pMasterData->iCurSeqNo, pArgs->M2.fLagWorker);
3231 TSCDELTA_OTHER_SYNC_AFTER(pMySync, pOtherSync);
3232 }
3233
3234 if (ASMAtomicReadBool(&pArgs->M2.fQuitEarly))
3235 break;
3236
3237 }
3238}
3239
3240
3241/**
3242 * Initializes the argument/state data belonging to algorithm \#2.
3243 *
3244 * @returns VBox status code.
3245 * @param pArgs The argument/state data.
3246 */
3247static int supdrvTscDeltaMethod2Init(PSUPDRVGIPTSCDELTARGS pArgs)
3248{
3249 pArgs->M2.pMasterData = NULL;
3250 pArgs->M2.pWorkerData = NULL;
3251
3252 uint32_t const fFlags = /*RTMEMALLOCEX_FLAGS_ANY_CTX |*/ RTMEMALLOCEX_FLAGS_ZEROED;
3253 int rc = RTMemAllocEx(sizeof(*pArgs->M2.pWorkerData), 0, fFlags, (void **)&pArgs->M2.pWorkerData);
3254 if (RT_SUCCESS(rc))
3255 rc = RTMemAllocEx(sizeof(*pArgs->M2.pMasterData), 0, fFlags, (void **)&pArgs->M2.pMasterData);
3256 return rc;
3257}
3258
3259
3260/**
3261 * Undoes what supdrvTscDeltaMethod2Init() did.
3262 *
3263 * @param pArgs The argument/state data.
3264 */
3265static void supdrvTscDeltaMethod2Delete(PSUPDRVGIPTSCDELTARGS pArgs)
3266{
3267 RTMemFreeEx(pArgs->M2.pMasterData, sizeof(*pArgs->M2.pMasterData));
3268 RTMemFreeEx(pArgs->M2.pWorkerData, sizeof(*pArgs->M2.pWorkerData));
3269# if 0
3270 SUPR0Printf("cHits=%d m=%d w=%d\n", pArgs->M2.cHits, pArgs->pMaster->idApic, pArgs->pWorker->idApic);
3271# endif
3272}
3273
3274
3275#endif /* GIP_TSC_DELTA_METHOD_2 */
3276
3277
3278
/**
 * Verifies a TSC delta value (typically zero) by taking tightly interleaved
 * TSC readings on the master and worker and checking that the delta-adjusted
 * values never move backwards.
 *
 * The two CPUs alternate TSC reads in lock step, handshaking through the
 * partner's uSyncVar (GO / GO_GO ping-pong).  The master then post-processes
 * both sample arrays; any negative adjusted difference fails the verification
 * with VERR_OUT_OF_RANGE in pArgs->rcVerify.  The worker returns
 * pArgs->rcVerify too - by the time the master kicks it out of the AFTER
 * state, the processing is complete.
 *
 * Note: the for(;;) never iterates on the success path (both branches reach
 * the return); it exists so the sync macros' 'break' on failure lands in the
 * timeout code at the bottom.
 *
 * @returns pArgs->rcVerify (VINF_SUCCESS or VERR_OUT_OF_RANGE) on completion,
 *          VERR_TIMEOUT if a synchronization step bailed out (pArgs->rcVerify
 *          is then set to VERR_TRY_AGAIN).
 * @param   pArgs               The argument/state data.
 * @param   pMySync             My synchronization structure.
 * @param   pOtherSync          My partner's synchronization structure.
 * @param   fIsMaster           Set if master, clear if worker.
 * @param   iWorkerTscDelta     The worker TSC delta to verify against.
 */
static int supdrvTscDeltaVerify(PSUPDRVGIPTSCDELTARGS pArgs, PSUPTSCDELTASYNC2 pMySync,
                                PSUPTSCDELTASYNC2 pOtherSync, bool fIsMaster, int64_t iWorkerTscDelta)
{
    PSUPGIPCPU pGipCpuWorker = pArgs->pWorker; /* NOTE(review): currently unused in this function. */
    PSUPGIPCPU pGipCpuMaster = pArgs->pMaster;
    uint32_t   i;
    TSCDELTA_DBG_VARS();

    for (;;)
    {
        /* Saved interrupt flags; written by the SYNC_BEFORE macros and
           consumed by the SYNC_AFTER / KICK macros. */
        RTCCUINTREG uFlags;
        AssertCompile((RT_ELEMENTS(pArgs->auVerifyMasterTscs) & 1) == 0);
        AssertCompile(RT_ELEMENTS(pArgs->auVerifyWorkerTscs) == RT_ELEMENTS(pArgs->auVerifyMasterTscs));

        if (fIsMaster)
        {
            uint64_t uTscWorker;
            TSCDELTA_MASTER_SYNC_BEFORE(pMySync, pOtherSync);

            /*
             * Collect TSC, master goes first.
             */
            for (i = 0; i < RT_ELEMENTS(pArgs->auVerifyMasterTscs); i += 2)
            {
                /* Read, kick & wait #1. */
                uint64_t register uTsc = ASMReadTSC();
                ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO_GO);
                ASMSerializeInstruction();
                pArgs->auVerifyMasterTscs[i] = uTsc;
                TSCDELTA_DBG_START_LOOP();
                /* Wait for the worker to read and flip us back to GO_GO... */
                while (ASMAtomicReadU32(&pMySync->uSyncVar) == GIP_TSC_DELTA_SYNC2_GO)
                {
                    TSCDELTA_DBG_CHECK_LOOP();
                    ASMNopPause();
                }

                /* Read, kick & wait #2. */
                uTsc = ASMReadTSC();
                ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO);
                ASMSerializeInstruction();
                pArgs->auVerifyMasterTscs[i + 1] = uTsc;
                TSCDELTA_DBG_START_LOOP();
                while (ASMAtomicReadU32(&pMySync->uSyncVar) == GIP_TSC_DELTA_SYNC2_GO_GO)
                {
                    TSCDELTA_DBG_CHECK_LOOP();
                    ASMNopPause();
                }
            }

            TSCDELTA_MASTER_SYNC_AFTER(pMySync, pOtherSync);

            /*
             * Process the data: the adjusted readings must be monotonically
             * increasing when interleaved master/worker/master/...
             */
            pArgs->cMaxVerifyTscTicks = INT64_MIN;
            pArgs->cMinVerifyTscTicks = INT64_MAX;
            pArgs->iVerifyBadTscDiff  = 0;
            ASMAtomicWriteS32(&pArgs->rcVerify, VINF_SUCCESS);
            uTscWorker = 0;
            for (i = 0; i < RT_ELEMENTS(pArgs->auVerifyMasterTscs); i++)
            {
                /* Master vs previous worker entry. */
                uint64_t uTscMaster = pArgs->auVerifyMasterTscs[i] - pGipCpuMaster->i64TSCDelta;
                int64_t  iDiff;
                if (i > 0)
                {
                    iDiff = uTscMaster - uTscWorker;
                    if (iDiff > pArgs->cMaxVerifyTscTicks)
                        pArgs->cMaxVerifyTscTicks = iDiff;
                    if (iDiff < pArgs->cMinVerifyTscTicks)
                        pArgs->cMinVerifyTscTicks = iDiff;
                    if (iDiff < 0) /* Time went backwards across the CPU pair => bad delta. */
                    {
                        /* NOTE(review): stored negated here but un-negated in the
                           worker-vs-master case below - presumably to tell the two
                           failure directions apart; confirm before relying on sign. */
                        pArgs->iVerifyBadTscDiff = -iDiff;
                        ASMAtomicWriteS32(&pArgs->rcVerify, VERR_OUT_OF_RANGE);
                        break;
                    }
                }

                /* Worker vs master. */
                uTscWorker = pArgs->auVerifyWorkerTscs[i] - iWorkerTscDelta;
                iDiff = uTscWorker - uTscMaster;
                if (iDiff > pArgs->cMaxVerifyTscTicks)
                    pArgs->cMaxVerifyTscTicks = iDiff;
                if (iDiff < pArgs->cMinVerifyTscTicks)
                    pArgs->cMinVerifyTscTicks = iDiff;
                if (iDiff < 0)
                {
                    pArgs->iVerifyBadTscDiff = iDiff;
                    ASMAtomicWriteS32(&pArgs->rcVerify, VERR_OUT_OF_RANGE);
                    break;
                }
            }

            /* Done. */
            TSCDELTA_MASTER_KICK_OTHER_OUT_OF_AFTER(pMySync, pOtherSync);
        }
        else
        {
            /*
             * The worker, master leads.
             */
            TSCDELTA_OTHER_SYNC_BEFORE(pMySync, pOtherSync);

            for (i = 0; i < RT_ELEMENTS(pArgs->auVerifyWorkerTscs); i += 2)
            {
                uint64_t register uTsc;

                /* Wait, Read and Kick #1. */
                TSCDELTA_DBG_START_LOOP();
                while (ASMAtomicReadU32(&pMySync->uSyncVar) == GIP_TSC_DELTA_SYNC2_GO)
                {
                    TSCDELTA_DBG_CHECK_LOOP();
                    ASMNopPause();
                }
                uTsc = ASMReadTSC();
                ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO_GO);
                ASMSerializeInstruction();
                pArgs->auVerifyWorkerTscs[i] = uTsc;

                /* Wait, Read and Kick #2. */
                TSCDELTA_DBG_START_LOOP();
                while (ASMAtomicReadU32(&pMySync->uSyncVar) == GIP_TSC_DELTA_SYNC2_GO_GO)
                {
                    TSCDELTA_DBG_CHECK_LOOP();
                    ASMNopPause();
                }
                uTsc = ASMReadTSC();
                ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_GO);
                ASMSerializeInstruction();
                pArgs->auVerifyWorkerTscs[i + 1] = uTsc;
            }

            TSCDELTA_OTHER_SYNC_AFTER(pMySync, pOtherSync);
        }
        return pArgs->rcVerify;
    }

    /*
     * Timed out, please retry.  (Reached when a sync macro breaks out of the
     * for(;;) above.)
     */
    ASMAtomicWriteS32(&pArgs->rcVerify, VERR_TRY_AGAIN);
    return VERR_TIMEOUT;
}
3423
3424
3425
3426/**
3427 * Handles the special abort procedure during synchronization setup in
3428 * supdrvMeasureTscDeltaCallbackUnwrapped().
3429 *
3430 * @returns 0 (dummy, ignored)
3431 * @param pArgs Pointer to argument/state data.
3432 * @param pMySync Pointer to my sync structure.
3433 * @param fIsMaster Set if we're the master, clear if worker.
3434 * @param fTimeout Set if it's a timeout.
3435 */
3436
3437/* XXX See linux-3.14/include/linux/compiler-gcc.h:
3438 * #define noinline __attribute__((noinline)) */
3439#undef noinline
3440DECL_NO_INLINE(static, int)
3441supdrvMeasureTscDeltaCallbackAbortSyncSetup(PSUPDRVGIPTSCDELTARGS pArgs, PSUPTSCDELTASYNC2 pMySync, bool fIsMaster, bool fTimeout)
3442{
3443 PSUPTSCDELTASYNC2 volatile *ppMySync = fIsMaster ? &pArgs->pSyncMaster : &pArgs->pSyncWorker;
3444 PSUPTSCDELTASYNC2 volatile *ppOtherSync = fIsMaster ? &pArgs->pSyncWorker : &pArgs->pSyncMaster;
3445 TSCDELTA_DBG_VARS();
3446
3447 /*
3448 * Clear our sync pointer and make sure the abort flag is set.
3449 */
3450 ASMAtomicWriteNullPtr(ppMySync);
3451 ASMAtomicWriteBool(&pArgs->fAbortSetup, true);
3452
3453 /*
3454 * Make sure the other party is out of there and won't be touching our
3455 * sync state again (would cause stack corruption).
3456 */
3457 TSCDELTA_DBG_START_LOOP();
3458 while (ASMAtomicReadPtrT(ppOtherSync, PSUPTSCDELTASYNC2) != NULL)
3459 {
3460 ASMNopPause();
3461 ASMNopPause();
3462 ASMNopPause();
3463 TSCDELTA_DBG_CHECK_LOOP();
3464 }
3465
3466 return 0;
3467}
3468
3469
3470/**
3471 * This is used by supdrvMeasureInitialTscDeltas() to read the TSC on two CPUs
3472 * and compute the delta between them.
3473 *
3474 * To reduce code size a good when timeout handling was added, a dummy return
3475 * value had to be added (saves 1-3 lines per timeout case), thus this
3476 * 'Unwrapped' function and the dummy 0 return value.
3477 *
3478 * @returns 0 (dummy, ignored)
3479 * @param idCpu The CPU we are current scheduled on.
3480 * @param pArgs Pointer to a parameter package.
3481 *
3482 * @remarks Measuring TSC deltas between the CPUs is tricky because we need to
3483 * read the TSC at exactly the same time on both the master and the
3484 * worker CPUs. Due to DMA, bus arbitration, cache locality,
3485 * contention, SMI, pipelining etc. there is no guaranteed way of
3486 * doing this on x86 CPUs.
3487 */
3488static int supdrvMeasureTscDeltaCallbackUnwrapped(RTCPUID idCpu, PSUPDRVGIPTSCDELTARGS pArgs)
3489{
3490 PSUPDRVDEVEXT pDevExt = pArgs->pDevExt;
3491 PSUPGIPCPU pGipCpuWorker = pArgs->pWorker;
3492 PSUPGIPCPU pGipCpuMaster = pArgs->pMaster;
3493 bool const fIsMaster = idCpu == pGipCpuMaster->idCpu;
3494 uint32_t iTry;
3495 PSUPTSCDELTASYNC2 volatile *ppMySync = fIsMaster ? &pArgs->pSyncMaster : &pArgs->pSyncWorker;
3496 PSUPTSCDELTASYNC2 volatile *ppOtherSync = fIsMaster ? &pArgs->pSyncWorker : &pArgs->pSyncMaster;
3497 SUPTSCDELTASYNC2 MySync;
3498 PSUPTSCDELTASYNC2 pOtherSync;
3499 int rc;
3500 TSCDELTA_DBG_VARS();
3501
3502 /* A bit of paranoia first. */
3503 if (!pGipCpuMaster || !pGipCpuWorker)
3504 return 0;
3505
3506 /*
3507 * If the CPU isn't part of the measurement, return immediately.
3508 */
3509 if ( !fIsMaster
3510 && idCpu != pGipCpuWorker->idCpu)
3511 return 0;
3512
3513 /*
3514 * Set up my synchronization stuff and wait for the other party to show up.
3515 *
3516 * We don't wait forever since the other party may be off fishing (offline,
3517 * spinning with ints disables, whatever), we must play nice to the rest of
3518 * the system as this context generally isn't one in which we will get
3519 * preempted and we may hold up a number of lower priority interrupts.
3520 */
3521 ASMAtomicWriteU32(&MySync.uSyncVar, GIP_TSC_DELTA_SYNC2_PRESTART_WAIT);
3522 ASMAtomicWritePtr(ppMySync, &MySync);
3523 MySync.uTscStart = ASMReadTSC();
3524 MySync.cMaxTscTicks = pArgs->cMaxTscTicks;
3525
3526 /* Look for the partner, might not be here yet... Special abort considerations. */
3527 iTry = 0;
3528 TSCDELTA_DBG_START_LOOP();
3529 while ((pOtherSync = ASMAtomicReadPtrT(ppOtherSync, PSUPTSCDELTASYNC2)) == NULL)
3530 {
3531 ASMNopPause();
3532 if ( ASMAtomicReadBool(&pArgs->fAbortSetup)
3533 || !RTMpIsCpuOnline(fIsMaster ? pGipCpuWorker->idCpu : pGipCpuWorker->idCpu) )
3534 return supdrvMeasureTscDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, false /*fTimeout*/);
3535 if ( (iTry++ & 0xff) == 0
3536 && ASMReadTSC() - MySync.uTscStart > pArgs->cMaxTscTicks)
3537 return supdrvMeasureTscDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, true /*fTimeout*/);
3538 TSCDELTA_DBG_CHECK_LOOP();
3539 ASMNopPause();
3540 }
3541
3542 /* I found my partner, waiting to be found... Special abort considerations. */
3543 if (fIsMaster)
3544 if (!ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_READY, GIP_TSC_DELTA_SYNC2_PRESTART_WAIT)) /* parnaoia */
3545 return supdrvMeasureTscDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, false /*fTimeout*/);
3546
3547 iTry = 0;
3548 TSCDELTA_DBG_START_LOOP();
3549 while (ASMAtomicReadU32(&MySync.uSyncVar) == GIP_TSC_DELTA_SYNC2_PRESTART_WAIT)
3550 {
3551 ASMNopPause();
3552 if (ASMAtomicReadBool(&pArgs->fAbortSetup))
3553 return supdrvMeasureTscDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, false /*fTimeout*/);
3554 if ( (iTry++ & 0xff) == 0
3555 && ASMReadTSC() - MySync.uTscStart > pArgs->cMaxTscTicks)
3556 {
3557 if ( fIsMaster
3558 && !ASMAtomicCmpXchgU32(&MySync.uSyncVar, GIP_TSC_DELTA_SYNC2_PRESTART_ABORT, GIP_TSC_DELTA_SYNC2_PRESTART_WAIT))
3559 break; /* race #1: slave has moved on, handle timeout in loop instead. */
3560 return supdrvMeasureTscDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, true /*fTimeout*/);
3561 }
3562 TSCDELTA_DBG_CHECK_LOOP();
3563 }
3564
3565 if (!fIsMaster)
3566 if (!ASMAtomicCmpXchgU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_READY, GIP_TSC_DELTA_SYNC2_PRESTART_WAIT)) /* race #1 */
3567 return supdrvMeasureTscDeltaCallbackAbortSyncSetup(pArgs, &MySync, fIsMaster, false /*fTimeout*/);
3568
3569/** @todo Add a resumable state to pArgs so we don't waste time if we time
3570 * out or something. Timeouts are legit, any of the two CPUs may get
3571 * interrupted. */
3572
3573 /*
3574 * Start by seeing if we have a zero delta between the two CPUs.
3575 * This should normally be the case.
3576 */
3577 rc = supdrvTscDeltaVerify(pArgs, &MySync, pOtherSync, fIsMaster, GIP_TSC_DELTA_INITIAL_MASTER_VALUE);
3578 if (RT_SUCCESS(rc))
3579 {
3580 if (fIsMaster)
3581 {
3582 ASMAtomicWriteS64(&pGipCpuWorker->i64TSCDelta, GIP_TSC_DELTA_INITIAL_MASTER_VALUE);
3583 RTCpuSetDelByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuMaster->iCpuSet);
3584 RTCpuSetAddByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpuMaster->iCpuSet);
3585 }
3586 else
3587 {
3588 RTCpuSetDelByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet);
3589 RTCpuSetAddByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpuWorker->iCpuSet);
3590 }
3591 }
3592 /*
3593 * If the verification didn't time out, do regular delta measurements.
3594 * We retry this until we get a reasonable value.
3595 */
3596 else if (rc != VERR_TIMEOUT)
3597 {
3598 Assert(pGipCpuWorker->i64TSCDelta == INT64_MAX);
3599 for (iTry = 0; iTry < 12; iTry++)
3600 {
3601 if (ASMAtomicReadU32(&MySync.uSyncVar) != GIP_TSC_DELTA_SYNC2_READY)
3602 break;
3603
3604 /*
3605 * Do the measurements.
3606 */
3607#ifdef GIP_TSC_DELTA_METHOD_1
3608 supdrvTscDeltaMethod1Loop(pArgs, &MySync, pOtherSync, fIsMaster, iTry);
3609#elif defined(GIP_TSC_DELTA_METHOD_2)
3610 supdrvTscDeltaMethod2Loop(pArgs, &MySync, pOtherSync, fIsMaster, iTry);
3611#else
3612# error "huh??"
3613#endif
3614 if (ASMAtomicReadU32(&MySync.uSyncVar) != GIP_TSC_DELTA_SYNC2_READY)
3615 break;
3616
3617 /*
3618 * Success? If so, stop trying.
3619 */
3620 if (pGipCpuWorker->i64TSCDelta != INT64_MAX)
3621 {
3622 if (fIsMaster)
3623 {
3624 RTCpuSetDelByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuMaster->iCpuSet);
3625 RTCpuSetAddByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpuMaster->iCpuSet);
3626 }
3627 else
3628 {
3629 RTCpuSetDelByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet);
3630 RTCpuSetAddByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpuWorker->iCpuSet);
3631 }
3632 break;
3633 }
3634 }
3635 }
3636
3637 /*
3638 * End the synchroniziation dance. We tell the other that we're done,
3639 * then wait for the same kind of reply.
3640 */
3641 ASMAtomicWriteU32(&pOtherSync->uSyncVar, GIP_TSC_DELTA_SYNC2_FINAL);
3642 ASMAtomicWriteNullPtr(ppMySync);
3643 iTry = 0;
3644 TSCDELTA_DBG_START_LOOP();
3645 while (ASMAtomicReadU32(&MySync.uSyncVar) != GIP_TSC_DELTA_SYNC2_FINAL)
3646 {
3647 iTry++;
3648 if ( iTry == 0
3649 && !RTMpIsCpuOnline(fIsMaster ? pGipCpuWorker->idCpu : pGipCpuWorker->idCpu))
3650 break; /* this really shouldn't happen. */
3651 TSCDELTA_DBG_CHECK_LOOP();
3652 ASMNopPause();
3653 }
3654
3655 return 0;
3656}
3657
3658/**
3659 * Callback used by supdrvMeasureInitialTscDeltas() to read the TSC on two CPUs
3660 * and compute the delta between them.
3661 *
3662 * @param idCpu The CPU we are current scheduled on.
3663 * @param pvUser1 Pointer to a parameter package (SUPDRVGIPTSCDELTARGS).
3664 * @param pvUser2 Unused.
3665 */
3666static DECLCALLBACK(void) supdrvMeasureTscDeltaCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
3667{
3668 supdrvMeasureTscDeltaCallbackUnwrapped(idCpu, (PSUPDRVGIPTSCDELTARGS)pvUser1);
3669}
3670
3671
3672/**
3673 * Measures the TSC delta between the master GIP CPU and one specified worker
3674 * CPU.
3675 *
3676 * @returns VBox status code.
3677 * @retval VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED on pure measurement
3678 * failure.
3679 * @param pDevExt Pointer to the device instance data.
3680 * @param idxWorker The index of the worker CPU from the GIP's array of
3681 * CPUs.
3682 *
3683 * @remarks This must be called with preemption enabled!
3684 */
3685static int supdrvMeasureTscDeltaOne(PSUPDRVDEVEXT pDevExt, uint32_t idxWorker)
3686{
3687 int rc;
3688 int rc2;
3689 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
3690 RTCPUID idMaster = pDevExt->idGipMaster;
3691 PSUPGIPCPU pGipCpuWorker = &pGip->aCPUs[idxWorker];
3692 PSUPGIPCPU pGipCpuMaster;
3693 uint32_t iGipCpuMaster;
3694
3695 /* Validate input a bit. */
3696 AssertReturn(pGip, VERR_INVALID_PARAMETER);
3697 Assert(pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED);
3698 Assert(RTThreadPreemptIsEnabled(NIL_RTTHREAD));
3699
3700 /*
3701 * Don't attempt measuring the delta for the GIP master.
3702 */
3703 if (pGipCpuWorker->idCpu == idMaster)
3704 {
3705 if (pGipCpuWorker->i64TSCDelta == INT64_MAX) /* This shouldn't happen, but just in case. */
3706 ASMAtomicWriteS64(&pGipCpuWorker->i64TSCDelta, GIP_TSC_DELTA_INITIAL_MASTER_VALUE);
3707 return VINF_SUCCESS;
3708 }
3709
3710 /*
3711 * One measurement at at time, at least for now. We might be using
3712 * broadcast IPIs so, so be nice to the rest of the system.
3713 */
3714#ifdef SUPDRV_USE_MUTEX_FOR_GIP
3715 rc = RTSemMutexRequest(pDevExt->mtxTscDelta, RT_INDEFINITE_WAIT);
3716#else
3717 rc = RTSemFastMutexRequest(pDevExt->mtxTscDelta);
3718#endif
3719 if (RT_FAILURE(rc))
3720 return rc;
3721
3722 /*
3723 * If the CPU has hyper-threading and the APIC IDs of the master and worker are adjacent,
3724 * try pick a different master. (This fudge only works with multi core systems.)
3725 * ASSUMES related threads have adjacent APIC IDs. ASSUMES two threads per core.
3726 *
3727 * We skip this on AMDs for now as their HTT is different from intel's and
3728 * it doesn't seem to have any favorable effect on the results.
3729 *
3730 * If the master is offline, we need a new master too, so share the code.
3731 */
3732 iGipCpuMaster = supdrvGipFindCpuIndexForCpuId(pGip, idMaster);
3733 AssertReturn(iGipCpuMaster < pGip->cCpus, VERR_INVALID_CPU_ID);
3734 pGipCpuMaster = &pGip->aCPUs[iGipCpuMaster];
3735 if ( ( (pGipCpuMaster->idApic & ~1) == (pGipCpuWorker->idApic & ~1)
3736 && ASMHasCpuId()
3737 && ASMIsValidStdRange(ASMCpuId_EAX(0))
3738 && (ASMCpuId_EDX(1) & X86_CPUID_FEATURE_EDX_HTT)
3739 && !ASMIsAmdCpu()
3740 && pGip->cOnlineCpus > 2)
3741 || !RTMpIsCpuOnline(idMaster) )
3742 {
3743 uint32_t i;
3744 for (i = 0; i < pGip->cCpus; i++)
3745 if ( i != iGipCpuMaster
3746 && i != idxWorker
3747 && pGip->aCPUs[i].enmState == SUPGIPCPUSTATE_ONLINE
3748 && pGip->aCPUs[i].i64TSCDelta != INT64_MAX
3749 && pGip->aCPUs[i].idCpu != NIL_RTCPUID
3750 && pGip->aCPUs[i].idCpu != idMaster /* paranoia starts here... */
3751 && pGip->aCPUs[i].idCpu != pGipCpuWorker->idCpu
3752 && pGip->aCPUs[i].idApic != pGipCpuWorker->idApic
3753 && pGip->aCPUs[i].idApic != pGipCpuMaster->idApic
3754 && RTMpIsCpuOnline(pGip->aCPUs[i].idCpu))
3755 {
3756 iGipCpuMaster = i;
3757 pGipCpuMaster = &pGip->aCPUs[i];
3758 idMaster = pGipCpuMaster->idCpu;
3759 break;
3760 }
3761 }
3762
3763 if (RTCpuSetIsMemberByIndex(&pGip->OnlineCpuSet, pGipCpuWorker->iCpuSet))
3764 {
3765 /*
3766 * Initialize data package for the RTMpOnPair callback.
3767 */
3768 PSUPDRVGIPTSCDELTARGS pArgs = (PSUPDRVGIPTSCDELTARGS)RTMemAllocZ(sizeof(*pArgs));
3769 if (pArgs)
3770 {
3771 pArgs->pWorker = pGipCpuWorker;
3772 pArgs->pMaster = pGipCpuMaster;
3773 pArgs->pDevExt = pDevExt;
3774 pArgs->pSyncMaster = NULL;
3775 pArgs->pSyncWorker = NULL;
3776#if 0 /* later */
3777 pArgs->cMaxTscTicks = ASMAtomicReadU64(&pGip->u64CpuHz) / 2048; /* 488 us */
3778#else
3779 pArgs->cMaxTscTicks = ASMAtomicReadU64(&pGip->u64CpuHz) / 1024; /* 976 us */
3780#endif
3781
3782#ifdef GIP_TSC_DELTA_METHOD_1
3783 rc = supdrvTscDeltaMethod1Init(pArgs);
3784#elif defined(GIP_TSC_DELTA_METHOD_2)
3785 rc = supdrvTscDeltaMethod2Init(pArgs);
3786#else
3787# error "huh?"
3788#endif
3789 if (RT_SUCCESS(rc))
3790 {
3791 /*
3792 * Do the RTMpOnPair call. We reset i64TSCDelta first so we
3793 * and supdrvMeasureTscDeltaCallback can use it as a success check.
3794 */
3795 /** @todo Store the i64TSCDelta result in pArgs first? Perhaps deals with
3796 * that when doing the restart loop reorg. */
3797 ASMAtomicWriteS64(&pGipCpuWorker->i64TSCDelta, INT64_MAX);
3798 rc = RTMpOnPair(pGipCpuMaster->idCpu, pGipCpuWorker->idCpu, RTMPON_F_CONCURRENT_EXEC,
3799 supdrvMeasureTscDeltaCallback, pArgs, NULL);
3800 if (RT_SUCCESS(rc))
3801 {
3802#if 0
3803 SUPR0Printf("rcVerify=%d iVerifyBadTscDiff=%lld cMinVerifyTscTicks=%lld cMaxVerifyTscTicks=%lld\n",
3804 pArgs->rcVerify, pArgs->iVerifyBadTscDiff, pArgs->cMinVerifyTscTicks, pArgs->cMaxVerifyTscTicks);
3805#endif
3806 if (RT_LIKELY(pGipCpuWorker->i64TSCDelta != INT64_MAX))
3807 {
3808 /*
3809 * Work the TSC delta applicability rating. It starts
3810 * optimistic in supdrvGipInit, we downgrade it here.
3811 */
3812 SUPGIPUSETSCDELTA enmRating;
3813 if ( pGipCpuWorker->i64TSCDelta > GIP_TSC_DELTA_THRESHOLD_ROUGHLY_ZERO
3814 || pGipCpuWorker->i64TSCDelta < -GIP_TSC_DELTA_THRESHOLD_ROUGHLY_ZERO)
3815 enmRating = SUPGIPUSETSCDELTA_NOT_ZERO;
3816 else if ( pGipCpuWorker->i64TSCDelta > GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO
3817 || pGipCpuWorker->i64TSCDelta < -GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO)
3818 enmRating = SUPGIPUSETSCDELTA_ROUGHLY_ZERO;
3819 else
3820 enmRating = SUPGIPUSETSCDELTA_PRACTICALLY_ZERO;
3821 if (pGip->enmUseTscDelta < enmRating)
3822 {
3823 AssertCompile(sizeof(pGip->enmUseTscDelta) == sizeof(uint32_t));
3824 ASMAtomicWriteU32((uint32_t volatile *)&pGip->enmUseTscDelta, enmRating);
3825 }
3826 }
3827 else
3828 rc = VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED;
3829 }
3830 /** @todo return try-again if we get an offline CPU error. */
3831 }
3832
3833#ifdef GIP_TSC_DELTA_METHOD_1
3834 supdrvTscDeltaMethod1Delete(pArgs);
3835#elif defined(GIP_TSC_DELTA_METHOD_2)
3836 supdrvTscDeltaMethod2Delete(pArgs);
3837#else
3838# error "huh?"
3839#endif
3840 RTMemFree(pArgs);
3841 }
3842 else
3843 rc = VERR_NO_MEMORY;
3844 }
3845 else
3846 rc = VERR_CPU_OFFLINE;
3847
3848 /*
3849 * We're done now.
3850 */
3851#ifdef SUPDRV_USE_MUTEX_FOR_GIP
3852 rc2 = RTSemMutexRelease(pDevExt->mtxTscDelta); AssertRC(rc2);
3853#else
3854 rc2 = RTSemFastMutexRelease(pDevExt->mtxTscDelta); AssertRC(rc2);
3855#endif
3856 return rc;
3857}
3858
3859
3860/**
3861 * Clears TSC delta related variables.
3862 *
3863 * Clears all TSC samples as well as the delta synchronization variable on the
3864 * all the per-CPU structs. Optionally also clears the per-cpu deltas too.
3865 *
3866 * @param pDevExt Pointer to the device instance data.
3867 * @param fClearDeltas Whether the deltas are also to be cleared.
3868 */
3869static void supdrvClearTscSamples(PSUPDRVDEVEXT pDevExt, bool fClearDeltas)
3870{
3871 unsigned iCpu;
3872 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
3873 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
3874 {
3875 PSUPGIPCPU pGipCpu = &pGip->aCPUs[iCpu];
3876 ASMAtomicWriteU64(&pGipCpu->u64TSCSample, GIP_TSC_DELTA_RSVD);
3877 if (fClearDeltas)
3878 ASMAtomicWriteS64(&pGipCpu->i64TSCDelta, INT64_MAX);
3879 }
3880}
3881
3882
3883/**
3884 * Performs the initial measurements of the TSC deltas between CPUs.
3885 *
3886 * This is called by supdrvGipCreate or triggered by it if threaded.
3887 *
3888 * @returns VBox status code.
3889 * @param pDevExt Pointer to the device instance data.
3890 *
3891 * @remarks Must be called only after supdrvGipInitOnCpu() as this function uses
3892 * idCpu, GIP's online CPU set which are populated in
3893 * supdrvGipInitOnCpu().
3894 */
3895static int supdrvMeasureInitialTscDeltas(PSUPDRVDEVEXT pDevExt)
3896{
3897 PSUPGIPCPU pGipCpuMaster;
3898 unsigned iCpu;
3899 unsigned iOddEven;
3900 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
3901 uint32_t idxMaster = UINT32_MAX;
3902 int rc = VINF_SUCCESS;
3903 uint32_t cMpOnOffEvents = ASMAtomicReadU32(&pDevExt->cMpOnOffEvents);
3904
3905 Assert(pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED);
3906
3907 /*
3908 * Pick the first CPU online as the master TSC and make it the new GIP master based
3909 * on the APIC ID.
3910 *
3911 * Technically we can simply use "idGipMaster" but doing this gives us master as CPU 0
3912 * in most cases making it nicer/easier for comparisons. It is safe to update the GIP
3913 * master as this point since the sync/async timer isn't created yet.
3914 */
3915 supdrvClearTscSamples(pDevExt, true /* fClearDeltas */);
3916 for (iCpu = 0; iCpu < RT_ELEMENTS(pGip->aiCpuFromApicId); iCpu++)
3917 {
3918 uint16_t idxCpu = pGip->aiCpuFromApicId[iCpu];
3919 if (idxCpu != UINT16_MAX)
3920 {
3921 PSUPGIPCPU pGipCpu = &pGip->aCPUs[idxCpu];
3922 if (RTCpuSetIsMemberByIndex(&pGip->OnlineCpuSet, pGipCpu->iCpuSet))
3923 {
3924 idxMaster = idxCpu;
3925 pGipCpu->i64TSCDelta = GIP_TSC_DELTA_INITIAL_MASTER_VALUE;
3926 break;
3927 }
3928 }
3929 }
3930 AssertReturn(idxMaster != UINT32_MAX, VERR_CPU_NOT_FOUND);
3931 pGipCpuMaster = &pGip->aCPUs[idxMaster];
3932 ASMAtomicWriteSize(&pDevExt->idGipMaster, pGipCpuMaster->idCpu);
3933
3934 /*
3935 * If there is only a single CPU online we have nothing to do.
3936 */
3937 if (pGip->cOnlineCpus <= 1)
3938 {
3939 AssertReturn(pGip->cOnlineCpus > 0, VERR_INTERNAL_ERROR_5);
3940 return VINF_SUCCESS;
3941 }
3942
3943 /*
3944 * Loop thru the GIP CPU array and get deltas for each CPU (except the
3945 * master). We do the CPUs with the even numbered APIC IDs first so that
3946 * we've got alternative master CPUs to pick from on hyper-threaded systems.
3947 */
3948 for (iOddEven = 0; iOddEven < 2; iOddEven++)
3949 {
3950 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
3951 {
3952 PSUPGIPCPU pGipCpuWorker = &pGip->aCPUs[iCpu];
3953 if ( iCpu != idxMaster
3954 && (iOddEven > 0 || (pGipCpuWorker->idApic & 1) == 0)
3955 && RTCpuSetIsMemberByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet))
3956 {
3957 rc = supdrvMeasureTscDeltaOne(pDevExt, iCpu);
3958 if (RT_FAILURE(rc))
3959 {
3960 SUPR0Printf("supdrvMeasureTscDeltaOne failed. rc=%d CPU[%u].idCpu=%u Master[%u].idCpu=%u\n", rc, iCpu,
3961 pGipCpuWorker->idCpu, idxMaster, pDevExt->idGipMaster, pGipCpuMaster->idCpu);
3962 break;
3963 }
3964
3965 if (ASMAtomicReadU32(&pDevExt->cMpOnOffEvents) != cMpOnOffEvents)
3966 {
3967 SUPR0Printf("One or more CPUs transitioned between online & offline states. I'm confused, retry...\n");
3968 rc = VERR_TRY_AGAIN;
3969 break;
3970 }
3971 }
3972 }
3973 }
3974
3975 return rc;
3976}
3977
3978
3979#ifdef SUPDRV_USE_TSC_DELTA_THREAD
3980
3981/**
3982 * Switches the TSC-delta measurement thread into the butchered state.
3983 *
3984 * @returns VBox status code.
3985 * @param pDevExt Pointer to the device instance data.
3986 * @param fSpinlockHeld Whether the TSC-delta spinlock is held or not.
3987 * @param pszFailed An error message to log.
3988 * @param rcFailed The error code to exit the thread with.
3989 */
3990static int supdrvTscDeltaThreadButchered(PSUPDRVDEVEXT pDevExt, bool fSpinlockHeld, const char *pszFailed, int rcFailed)
3991{
3992 if (!fSpinlockHeld)
3993 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
3994
3995 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Butchered;
3996 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
3997 OSDBGPRINT(("supdrvTscDeltaThreadButchered: %s. rc=%Rrc\n", rcFailed));
3998 return rcFailed;
3999}
4000
4001
/**
 * The TSC-delta measurement thread.
 *
 * Services TSC-delta measurement requests (CPUs coming online, explicit
 * requests via SUPR0TscDeltaMeasureBySetIndex) that cannot always be done
 * synchronously.  Runs a small state machine; the current state lives in
 * pDevExt->enmTscDeltaThreadState and is protected by
 * pDevExt->hTscDeltaSpinlock.  Note that the spinlock is held when entering
 * each switch case and must be released on every path out of it.
 *
 * @returns VBox status code.
 * @param   hThread     The thread handle.
 * @param   pvUser      Opaque pointer to the device instance data.
 */
static DECLCALLBACK(int) supdrvTscDeltaThread(RTTHREAD hThread, void *pvUser)
{
    PSUPDRVDEVEXT     pDevExt = (PSUPDRVDEVEXT)pvUser;
    bool              fInitialMeasurement = true;   /* First pass measures all deltas with retries. */
    uint32_t          cConsecutiveTimeouts = 0;
    int               rc = VERR_INTERNAL_ERROR_2;
    for (;;)
    {
        /*
         * Switch on the current state.
         */
        SUPDRVTSCDELTATHREADSTATE enmState;
        RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
        enmState = pDevExt->enmTscDeltaThreadState;
        switch (enmState)
        {
            case kTscDeltaThreadState_Creating:
            {
                /* Tell the creator (supdrvTscDeltaThreadInit) we're up, then start listening. */
                pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Listening;
                rc = RTSemEventSignal(pDevExt->hTscDeltaEvent);
                if (RT_FAILURE(rc))
                    return supdrvTscDeltaThreadButchered(pDevExt, true /* fSpinlockHeld */, "RTSemEventSignal", rc);
                /* fall thru */
            }

            case kTscDeltaThreadState_Listening:
            {
                RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);

                /* Simple adaptive timeout: back off 1 -> 10 -> 100 -> 500 ms
                   after every 10 consecutive timeouts. */
                if (cConsecutiveTimeouts++ == 10)
                {
                    if (pDevExt->cMsTscDeltaTimeout == 1)       /* -> 10 ms */
                        pDevExt->cMsTscDeltaTimeout = 10;
                    else if (pDevExt->cMsTscDeltaTimeout == 10) /* -> 100 ms */
                        pDevExt->cMsTscDeltaTimeout = 100;
                    else if (pDevExt->cMsTscDeltaTimeout == 100) /* -> 500 ms */
                        pDevExt->cMsTscDeltaTimeout = 500;
                    cConsecutiveTimeouts = 0;
                }
                rc = RTThreadUserWait(pDevExt->hTscDeltaThread, pDevExt->cMsTscDeltaTimeout);
                if (   RT_FAILURE(rc)
                    && rc != VERR_TIMEOUT)
                    return supdrvTscDeltaThreadButchered(pDevExt, false /* fSpinlockHeld */, "RTThreadUserWait", rc);
                RTThreadUserReset(pDevExt->hTscDeltaThread);
                break;
            }

            case kTscDeltaThreadState_WaitAndMeasure:
            {
                /* Acknowledge the request, then give the requester a moment before measuring. */
                pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Measuring;
                rc = RTSemEventSignal(pDevExt->hTscDeltaEvent); /* (Safe on windows as long as spinlock isn't IRQ safe.) */
                if (RT_FAILURE(rc))
                    return supdrvTscDeltaThreadButchered(pDevExt, true /* fSpinlockHeld */, "RTSemEventSignal", rc);
                RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
                pDevExt->cMsTscDeltaTimeout = 1;    /* Reset the adaptive timeout to its fastest setting. */
                RTThreadSleep(1);
                /* fall thru */
            }

            case kTscDeltaThreadState_Measuring:
            {
                cConsecutiveTimeouts = 0;
                if (fInitialMeasurement)
                {
                    /* Initial round: measure everything, retrying on transient
                       failures (CPU hotplug races, offline CPUs). */
                    int cTries = 8;
                    int cMsWaitPerTry = 10;
                    fInitialMeasurement = false;
                    do
                    {
                        rc = supdrvMeasureInitialTscDeltas(pDevExt);
                        if (   RT_SUCCESS(rc)
                            || (   RT_FAILURE(rc)
                                && rc != VERR_TRY_AGAIN
                                && rc != VERR_CPU_OFFLINE))
                        {
                            break;
                        }
                        RTThreadSleep(cMsWaitPerTry);
                    } while (cTries-- > 0);
                }
                else
                {
                    PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
                    unsigned iCpu;

                    /* Measure TSC-deltas only for the CPUs that are in the set. */
                    rc = VINF_SUCCESS;
                    for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
                    {
                        PSUPGIPCPU pGipCpuWorker = &pGip->aCPUs[iCpu];
                        if (RTCpuSetIsMemberByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet))
                        {
                            if (pGipCpuWorker->i64TSCDelta == INT64_MAX)
                            {
                                /* Keep the first failure status but continue with the remaining CPUs. */
                                int rc2 = supdrvMeasureTscDeltaOne(pDevExt, iCpu);
                                if (RT_FAILURE(rc2) && RT_SUCCESS(rc))
                                    rc = rc2;
                            }
                            else
                            {
                                /*
                                 * The thread/someone must've called SUPR0TscDeltaMeasureBySetIndex,
                                 * mark the delta as fine to get the timer thread off our back.
                                 */
                                RTCpuSetDelByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet);
                                RTCpuSetAddByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpuWorker->iCpuSet);
                            }
                        }
                    }
                }
                /* Go back to listening unless someone changed the state while we measured. */
                RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
                if (pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Measuring)
                    pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Listening;
                RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
                Assert(rc != VERR_NOT_AVAILABLE); /* VERR_NOT_AVAILABLE is used as the initial value. */
                ASMAtomicWriteS32(&pDevExt->rcTscDelta, rc);
                break;
            }

            case kTscDeltaThreadState_Terminating:
                pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Destroyed;
                RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
                return VINF_SUCCESS;

            case kTscDeltaThreadState_Butchered:
            default:
                return supdrvTscDeltaThreadButchered(pDevExt, true /* fSpinlockHeld */, "Invalid state", VERR_INVALID_STATE);
        }
    }

    /* Not reached: the infinite loop above only exits via return statements. */
    return rc;
}
4142
4143
/**
 * Waits for the TSC-delta measurement thread to respond to a state change.
 *
 * @returns VINF_SUCCESS on success, VERR_TIMEOUT if it doesn't respond in time,
 *          other error code on internal error.
 *
 * @param   pDevExt         Pointer to the device instance data.
 * @param   enmCurState     The current state.
 * @param   enmNewState     The new state we're waiting for it to enter.
 */
static int supdrvTscDeltaThreadWait(PSUPDRVDEVEXT pDevExt, SUPDRVTSCDELTATHREADSTATE enmCurState,
                                    SUPDRVTSCDELTATHREADSTATE enmNewState)
{
    /*
     * Wait a short while for the expected state transition.
     */
    int rc;
    RTSemEventWait(pDevExt->hTscDeltaEvent, RT_MS_1SEC);
    RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
    if (pDevExt->enmTscDeltaThreadState == enmNewState)
    {
        /* Already transitioned; nothing more to wait for. */
        RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
        rc = VINF_SUCCESS;
    }
    else if (pDevExt->enmTscDeltaThreadState == enmCurState)
    {
        /*
         * Wait longer if the state has not yet transitioned to the one we want.
         */
        RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
        rc = RTSemEventWait(pDevExt->hTscDeltaEvent, 50 * RT_MS_1SEC);
        if (   RT_SUCCESS(rc)
            || rc == VERR_TIMEOUT)
        {
            /*
             * Check the state whether we've succeeded.
             */
            SUPDRVTSCDELTATHREADSTATE enmState;
            RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
            enmState = pDevExt->enmTscDeltaThreadState;
            RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
            if (enmState == enmNewState)
                rc = VINF_SUCCESS;
            else if (enmState == enmCurState)
            {
                /* Still in the old state after the long wait: give up. */
                rc = VERR_TIMEOUT;
                OSDBGPRINT(("supdrvTscDeltaThreadWait: timed out state transition. enmState=%d enmNewState=%d\n", enmState,
                            enmNewState));
            }
            else
            {
                /* Thread moved to some third state we didn't expect. */
                rc = VERR_INTERNAL_ERROR;
                OSDBGPRINT(("supdrvTscDeltaThreadWait: invalid state transition from %d to %d, expected %d\n", enmCurState,
                            enmState, enmNewState));
            }
        }
        else
            OSDBGPRINT(("supdrvTscDeltaThreadWait: RTSemEventWait failed. rc=%Rrc\n", rc));
    }
    else
    {
        /* The thread was in neither the current nor the expected new state. */
        RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
        OSDBGPRINT(("supdrvTscDeltaThreadWait: invalid state transition from %d to %d\n", enmCurState, enmNewState));
        rc = VERR_INTERNAL_ERROR;
    }

    return rc;
}
4212
4213
4214/**
4215 * Signals the TSC-delta thread to start measuring TSC-deltas.
4216 *
4217 * @param pDevExt Pointer to the device instance data.
4218 */
4219static void supdrvTscDeltaThreadStartMeasurement(PSUPDRVDEVEXT pDevExt)
4220{
4221 if (RT_LIKELY(pDevExt->hTscDeltaThread != NIL_RTTHREAD))
4222 {
4223 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4224 if ( pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Listening
4225 || pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Measuring)
4226 {
4227 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_WaitAndMeasure;
4228 }
4229 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4230 RTThreadUserSignal(pDevExt->hTscDeltaThread);
4231 }
4232}
4233
4234
4235/**
4236 * Terminates the actual thread running supdrvTscDeltaThread().
4237 *
4238 * This is an internal worker function for supdrvTscDeltaThreadInit() and
4239 * supdrvTscDeltaTerm().
4240 *
4241 * @param pDevExt Pointer to the device instance data.
4242 */
4243static void supdrvTscDeltaThreadTerminate(PSUPDRVDEVEXT pDevExt)
4244{
4245 int rc;
4246 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4247 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Terminating;
4248 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4249 RTThreadUserSignal(pDevExt->hTscDeltaThread);
4250 rc = RTThreadWait(pDevExt->hTscDeltaThread, 50 * RT_MS_1SEC, NULL /* prc */);
4251 if (RT_FAILURE(rc))
4252 {
4253 /* Signal a few more times before giving up. */
4254 int cTriesLeft = 5;
4255 while (--cTriesLeft > 0)
4256 {
4257 RTThreadUserSignal(pDevExt->hTscDeltaThread);
4258 rc = RTThreadWait(pDevExt->hTscDeltaThread, 2 * RT_MS_1SEC, NULL /* prc */);
4259 if (rc != VERR_TIMEOUT)
4260 break;
4261 }
4262 }
4263}
4264
4265
/**
 * Initializes and spawns the TSC-delta measurement thread.
 *
 * A thread is required for servicing re-measurement requests from events like
 * CPUs coming online, suspend/resume etc. as it cannot be done synchronously
 * under all contexts on all OSs.
 *
 * On success the spinlock, event semaphore and thread are all created and
 * pDevExt->rcTscDelta is initialized to VERR_NOT_AVAILABLE; on any failure
 * everything created so far is torn down again in reverse order.
 *
 * @returns VBox status code.
 * @param   pDevExt     Pointer to the device instance data.
 *
 * @remarks Must only be called -after- initializing GIP and setting up MP
 *          notifications!
 */
static int supdrvTscDeltaThreadInit(PSUPDRVDEVEXT pDevExt)
{
    int rc;
    Assert(pDevExt->pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED);
    rc = RTSpinlockCreate(&pDevExt->hTscDeltaSpinlock, RTSPINLOCK_FLAGS_INTERRUPT_UNSAFE, "VBoxTscSpnLck");
    if (RT_SUCCESS(rc))
    {
        rc = RTSemEventCreate(&pDevExt->hTscDeltaEvent);
        if (RT_SUCCESS(rc))
        {
            /* State/timeout must be set before the thread starts and reads them. */
            pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Creating;
            pDevExt->cMsTscDeltaTimeout = 1;
            rc = RTThreadCreate(&pDevExt->hTscDeltaThread, supdrvTscDeltaThread, pDevExt, 0 /* cbStack */,
                                RTTHREADTYPE_DEFAULT, RTTHREADFLAGS_WAITABLE, "VBoxTscThread");
            if (RT_SUCCESS(rc))
            {
                /* Wait for the thread to confirm it is up and listening. */
                rc = supdrvTscDeltaThreadWait(pDevExt, kTscDeltaThreadState_Creating, kTscDeltaThreadState_Listening);
                if (RT_SUCCESS(rc))
                {
                    ASMAtomicWriteS32(&pDevExt->rcTscDelta, VERR_NOT_AVAILABLE);
                    return rc;
                }

                OSDBGPRINT(("supdrvTscDeltaInit: supdrvTscDeltaThreadWait failed. rc=%Rrc\n", rc));
                supdrvTscDeltaThreadTerminate(pDevExt);
            }
            else
                OSDBGPRINT(("supdrvTscDeltaInit: RTThreadCreate failed. rc=%Rrc\n", rc));
            RTSemEventDestroy(pDevExt->hTscDeltaEvent);
            pDevExt->hTscDeltaEvent = NIL_RTSEMEVENT;
        }
        else
            OSDBGPRINT(("supdrvTscDeltaInit: RTSemEventCreate failed. rc=%Rrc\n", rc));
        RTSpinlockDestroy(pDevExt->hTscDeltaSpinlock);
        pDevExt->hTscDeltaSpinlock = NIL_RTSPINLOCK;
    }
    else
        OSDBGPRINT(("supdrvTscDeltaInit: RTSpinlockCreate failed. rc=%Rrc\n", rc));

    return rc;
}
4320
4321
4322/**
4323 * Terminates the TSC-delta measurement thread and cleanup.
4324 *
4325 * @param pDevExt Pointer to the device instance data.
4326 */
4327static void supdrvTscDeltaTerm(PSUPDRVDEVEXT pDevExt)
4328{
4329 if ( pDevExt->hTscDeltaSpinlock != NIL_RTSPINLOCK
4330 && pDevExt->hTscDeltaEvent != NIL_RTSEMEVENT)
4331 {
4332 supdrvTscDeltaThreadTerminate(pDevExt);
4333 }
4334
4335 if (pDevExt->hTscDeltaSpinlock != NIL_RTSPINLOCK)
4336 {
4337 RTSpinlockDestroy(pDevExt->hTscDeltaSpinlock);
4338 pDevExt->hTscDeltaSpinlock = NIL_RTSPINLOCK;
4339 }
4340
4341 if (pDevExt->hTscDeltaEvent != NIL_RTSEMEVENT)
4342 {
4343 RTSemEventDestroy(pDevExt->hTscDeltaEvent);
4344 pDevExt->hTscDeltaEvent = NIL_RTSEMEVENT;
4345 }
4346
4347 ASMAtomicWriteS32(&pDevExt->rcTscDelta, VERR_NOT_AVAILABLE);
4348}
4349
4350#endif /* SUPDRV_USE_TSC_DELTA_THREAD */
4351
4352/**
4353 * Measure the TSC delta for the CPU given by its CPU set index.
4354 *
4355 * @returns VBox status code.
4356 * @retval VERR_INTERRUPTED if interrupted while waiting.
4357 * @retval VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED if we were unable to get a
4358 * measurment.
4359 * @retval VERR_CPU_OFFLINE if the specified CPU is offline.
4360 * @retval VERR_CPU_OFFLINE if the specified CPU is offline.
4361 *
4362 * @param pSession The caller's session. GIP must've been mapped.
4363 * @param iCpuSet The CPU set index of the CPU to measure.
4364 * @param fFlags Flags, SUP_TSCDELTA_MEASURE_F_XXX.
4365 * @param cMsWaitRetry Number of milliseconds to wait between each retry.
4366 * @param cMsWaitThread Number of milliseconds to wait for the thread to get
4367 * ready.
4368 * @param cTries Number of times to try, pass 0 for the default.
4369 */
4370SUPR0DECL(int) SUPR0TscDeltaMeasureBySetIndex(PSUPDRVSESSION pSession, uint32_t iCpuSet, uint32_t fFlags,
4371 RTMSINTERVAL cMsWaitRetry, RTMSINTERVAL cMsWaitThread, uint32_t cTries)
4372{
4373 PSUPDRVDEVEXT pDevExt;
4374 PSUPGLOBALINFOPAGE pGip;
4375 uint16_t iGipCpu;
4376 int rc;
4377#ifdef SUPDRV_USE_TSC_DELTA_THREAD
4378 uint64_t msTsStartWait;
4379 uint32_t iWaitLoop;
4380#endif
4381
4382 /*
4383 * Validate and adjust the input.
4384 */
4385 AssertReturn(SUP_IS_SESSION_VALID(pSession), VERR_INVALID_PARAMETER);
4386 if (!pSession->fGipReferenced)
4387 return VERR_WRONG_ORDER;
4388
4389 pDevExt = pSession->pDevExt;
4390 AssertReturn(SUP_IS_DEVEXT_VALID(pDevExt), VERR_INVALID_PARAMETER);
4391
4392 pGip = pDevExt->pGip;
4393 AssertPtrReturn(pGip, VERR_INTERNAL_ERROR_2);
4394
4395 AssertReturn(iCpuSet < RTCPUSET_MAX_CPUS, VERR_INVALID_CPU_INDEX);
4396 AssertReturn(iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx), VERR_INVALID_CPU_INDEX);
4397 iGipCpu = pGip->aiCpuFromCpuSetIdx[iCpuSet];
4398 AssertReturn(iGipCpu < pGip->cCpus, VERR_INVALID_CPU_INDEX);
4399
4400 if (fFlags & ~SUP_TSCDELTA_MEASURE_F_VALID_MASK)
4401 return VERR_INVALID_FLAGS;
4402
4403 /*
4404 * The request is a noop if the TSC delta isn't being used.
4405 */
4406 if (pGip->enmUseTscDelta <= SUPGIPUSETSCDELTA_ZERO_CLAIMED)
4407 return VINF_SUCCESS;
4408
4409 if (cTries == 0)
4410 cTries = 12;
4411 else if (cTries > 256)
4412 cTries = 256;
4413
4414 if (cMsWaitRetry == 0)
4415 cMsWaitRetry = 2;
4416 else if (cMsWaitRetry > 1000)
4417 cMsWaitRetry = 1000;
4418
4419#ifdef SUPDRV_USE_TSC_DELTA_THREAD
4420 /*
4421 * Has the TSC already been measured and we're not forced to redo it?
4422 */
4423 if ( pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX
4424 && !(fFlags & SUP_TSCDELTA_MEASURE_F_FORCE))
4425 return VINF_SUCCESS;
4426
4427 /*
4428 * Asynchronous request? Forward it to the thread, no waiting.
4429 */
4430 if (fFlags & SUP_TSCDELTA_MEASURE_F_ASYNC)
4431 {
4432 /** @todo Async. doesn't implement options like retries, waiting. We'll need
4433 * to pass those options to the thread somehow and implement it in the
4434 * thread. Check if anyone uses/needs fAsync before implementing this. */
4435 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4436 RTCpuSetAddByIndex(&pDevExt->TscDeltaCpuSet, iCpuSet);
4437 if ( pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Listening
4438 || pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Measuring)
4439 {
4440 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_WaitAndMeasure;
4441 rc = VINF_SUCCESS;
4442 }
4443 else if (pDevExt->enmTscDeltaThreadState != kTscDeltaThreadState_WaitAndMeasure)
4444 rc = VERR_THREAD_IS_DEAD;
4445 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4446 RTThreadUserSignal(pDevExt->hTscDeltaThread);
4447 return VINF_SUCCESS;
4448 }
4449
4450 /*
4451 * If a TSC-delta measurement request is already being serviced by the thread,
4452 * wait 'cTries' times if a retry-timeout is provided, otherwise bail as busy.
4453 */
4454 msTsStartWait = RTTimeSystemMilliTS();
4455 for (iWaitLoop = 0;; iWaitLoop++)
4456 {
4457 uint64_t cMsElapsed;
4458 SUPDRVTSCDELTATHREADSTATE enmState;
4459 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
4460 enmState = pDevExt->enmTscDeltaThreadState;
4461 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
4462
4463 if (enmState == kTscDeltaThreadState_Measuring)
4464 { /* Must wait, the thread is busy. */ }
4465 else if (enmState == kTscDeltaThreadState_WaitAndMeasure)
4466 { /* Must wait, this state only says what will happen next. */ }
4467 else if (enmState == kTscDeltaThreadState_Terminating)
4468 { /* Must wait, this state only says what should happen next. */ }
4469 else
4470 break; /* All other states, the thread is either idly listening or dead. */
4471
4472 /* Wait or fail. */
4473 if (cMsWaitThread == 0)
4474 return VERR_SUPDRV_TSC_DELTA_MEASUREMENT_BUSY;
4475 cMsElapsed = RTTimeSystemMilliTS() - msTsStartWait;
4476 if (cMsElapsed >= cMsWaitThread)
4477 return VERR_SUPDRV_TSC_DELTA_MEASUREMENT_BUSY;
4478
4479 rc = RTThreadSleep(RT_MIN((RTMSINTERVAL)(cMsWaitThread - cMsElapsed), RT_MIN(iWaitLoop + 1, 10)));
4480 if (rc == VERR_INTERRUPTED)
4481 return rc;
4482 }
4483#endif /* SUPDRV_USE_TSC_DELTA_THREAD */
4484
4485 /*
4486 * Try measure the TSC delta the given number of times.
4487 */
4488 for (;;)
4489 {
4490 /* Unless we're forced to measure the delta, check whether it's done already. */
4491 if ( !(fFlags & SUP_TSCDELTA_MEASURE_F_FORCE)
4492 && pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX)
4493 {
4494 rc = VINF_SUCCESS;
4495 break;
4496 }
4497
4498 /* Measure it. */
4499 rc = supdrvMeasureTscDeltaOne(pDevExt, iGipCpu);
4500 if (rc != VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED)
4501 {
4502 Assert(pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX || RT_FAILURE_NP(rc));
4503 break;
4504 }
4505
4506 /* Retry? */
4507 if (cTries <= 1)
4508 break;
4509 cTries--;
4510
4511 /* Always delay between retries (be nice to the rest of the system
4512 and avoid the BSOD hounds). */
4513 rc = RTThreadSleep(cMsWaitRetry);
4514 if (rc == VERR_INTERRUPTED)
4515 break;
4516 }
4517
4518 return rc;
4519}
4520
4521
4522/**
4523 * Service a TSC-delta measurement request.
4524 *
4525 * @returns VBox status code.
4526 * @param pDevExt Pointer to the device instance data.
4527 * @param pSession The support driver session.
4528 * @param pReq Pointer to the TSC-delta measurement request.
4529 */
4530int VBOXCALL supdrvIOCtl_TscDeltaMeasure(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, PSUPTSCDELTAMEASURE pReq)
4531{
4532 uint32_t cTries;
4533 uint32_t iCpuSet;
4534 uint32_t fFlags;
4535 RTMSINTERVAL cMsWaitRetry;
4536
4537 /*
4538 * Validate and adjust/resolve the input so they can be passed onto SUPR0TscDeltaMeasureBySetIndex.
4539 */
4540 AssertPtr(pDevExt); AssertPtr(pSession); AssertPtr(pReq); /* paranoia^2 */
4541
4542 if (pReq->u.In.idCpu == NIL_RTCPUID)
4543 return VERR_INVALID_CPU_ID;
4544 iCpuSet = RTMpCpuIdToSetIndex(pReq->u.In.idCpu);
4545 if (iCpuSet >= RTCPUSET_MAX_CPUS)
4546 return VERR_INVALID_CPU_ID;
4547
4548 cTries = pReq->u.In.cRetries == 0 ? 0 : (uint32_t)pReq->u.In.cRetries + 1;
4549
4550 cMsWaitRetry = RT_MAX(pReq->u.In.cMsWaitRetry, 5);
4551
4552 fFlags = 0;
4553 if (pReq->u.In.fAsync)
4554 fFlags |= SUP_TSCDELTA_MEASURE_F_ASYNC;
4555 if (pReq->u.In.fForce)
4556 fFlags |= SUP_TSCDELTA_MEASURE_F_FORCE;
4557
4558 return SUPR0TscDeltaMeasureBySetIndex(pSession, iCpuSet, fFlags, cMsWaitRetry,
4559 cTries == 0 ? 5 * RT_MS_1SEC : cMsWaitRetry * cTries /*cMsWaitThread*/,
4560 cTries);
4561}
4562
4563
4564/**
4565 * Reads TSC with delta applied.
4566 *
4567 * Will try to resolve delta value INT64_MAX before applying it. This is the
4568 * main purpose of this function, to handle the case where the delta needs to be
4569 * determined.
4570 *
4571 * @returns VBox status code.
4572 * @param pDevExt Pointer to the device instance data.
4573 * @param pSession The support driver session.
4574 * @param pReq Pointer to the TSC-read request.
4575 */
4576int VBOXCALL supdrvIOCtl_TscRead(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, PSUPTSCREAD pReq)
4577{
4578 PSUPGLOBALINFOPAGE pGip;
4579 int rc;
4580
4581 /*
4582 * Validate. We require the client to have mapped GIP (no asserting on
4583 * ring-3 preconditions).
4584 */
4585 AssertPtr(pDevExt); AssertPtr(pReq); AssertPtr(pSession); /* paranoia^2 */
4586 if (pSession->GipMapObjR3 == NIL_RTR0MEMOBJ)
4587 return VERR_WRONG_ORDER;
4588 pGip = pDevExt->pGip;
4589 AssertReturn(pGip, VERR_INTERNAL_ERROR_2);
4590
4591 /*
4592 * We're usually here because we need to apply delta, but we shouldn't be
4593 * upset if the GIP is some different mode.
4594 */
4595 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
4596 {
4597 uint32_t cTries = 0;
4598 for (;;)
4599 {
4600 /*
4601 * Start by gathering the data, using CLI for disabling preemption
4602 * while we do that.
4603 */
4604 RTCCUINTREG uFlags = ASMIntDisableFlags();
4605 int iCpuSet = RTMpCpuIdToSetIndex(RTMpCpuId());
4606 int iGipCpu;
4607 if (RT_LIKELY( (unsigned)iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
4608 && (iGipCpu = pGip->aiCpuFromCpuSetIdx[iCpuSet]) < pGip->cCpus ))
4609 {
4610 int64_t i64Delta = pGip->aCPUs[iGipCpu].i64TSCDelta;
4611 pReq->u.Out.idApic = pGip->aCPUs[iGipCpu].idApic;
4612 pReq->u.Out.u64AdjustedTsc = ASMReadTSC();
4613 ASMSetFlags(uFlags);
4614
4615 /*
4616 * If we're lucky we've got a delta, but no predicitions here
4617 * as this I/O control is normally only used when the TSC delta
4618 * is set to INT64_MAX.
4619 */
4620 if (i64Delta != INT64_MAX)
4621 {
4622 pReq->u.Out.u64AdjustedTsc -= i64Delta;
4623 rc = VINF_SUCCESS;
4624 break;
4625 }
4626
4627 /* Give up after a few times. */
4628 if (cTries >= 4)
4629 {
4630 rc = VWRN_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED;
4631 break;
4632 }
4633
4634 /* Need to measure the delta an try again. */
4635 rc = supdrvMeasureTscDeltaOne(pDevExt, iGipCpu);
4636 Assert(pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX || RT_FAILURE_NP(rc));
4637 /** @todo should probably delay on failure... dpc watchdogs */
4638 }
4639 else
4640 {
4641 /* This really shouldn't happen. */
4642 AssertMsgFailed(("idCpu=%#x iCpuSet=%#x (%d)\n", RTMpCpuId(), iCpuSet, iCpuSet));
4643 pReq->u.Out.idApic = ASMGetApicId();
4644 pReq->u.Out.u64AdjustedTsc = ASMReadTSC();
4645 ASMSetFlags(uFlags);
4646 rc = VERR_INTERNAL_ERROR_5; /** @todo change to warning. */
4647 break;
4648 }
4649 }
4650 }
4651 else
4652 {
4653 /*
4654 * No delta to apply. Easy. Deal with preemption the lazy way.
4655 */
4656 RTCCUINTREG uFlags = ASMIntDisableFlags();
4657 int iCpuSet = RTMpCpuIdToSetIndex(RTMpCpuId());
4658 int iGipCpu;
4659 if (RT_LIKELY( (unsigned)iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
4660 && (iGipCpu = pGip->aiCpuFromCpuSetIdx[iCpuSet]) < pGip->cCpus ))
4661 pReq->u.Out.idApic = pGip->aCPUs[iGipCpu].idApic;
4662 else
4663 pReq->u.Out.idApic = ASMGetApicId();
4664 pReq->u.Out.u64AdjustedTsc = ASMReadTSC();
4665 ASMSetFlags(uFlags);
4666 rc = VINF_SUCCESS;
4667 }
4668
4669 return rc;
4670}
4671
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette