VirtualBox

source: vbox/trunk/src/VBox/ExtPacks/VBoxDTrace/onnv/uts/common/dtrace/dtrace.c @ 62432

Last change on this file since 62432 was 62432, checked in by vboxsync, 8 years ago

dtrace: MSC level 4 warnings.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 407.2 KB
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * DTrace - Dynamic Tracing for Solaris
 *
 * This is the implementation of the Solaris Dynamic Tracing framework
 * (DTrace). The user-visible interface to DTrace is described at length in
 * the "Solaris Dynamic Tracing Guide". The interfaces between the libdtrace
 * library, the in-kernel DTrace framework, and the DTrace providers are
 * described in the block comments in the <sys/dtrace.h> header file. The
 * internal architecture of DTrace is described in the block comments in the
 * <sys/dtrace_impl.h> header file. The comments contained within the DTrace
 * implementation very much assume mastery of all of these sources; if one has
 * an unanswered question about the implementation, one should consult them
 * first.
 *
 * The functions here are ordered roughly as follows:
 *
 *   - Probe context functions
 *   - Probe hashing functions
 *   - Non-probe context utility functions
 *   - Matching functions
 *   - Provider-to-Framework API functions
 *   - Probe management functions
 *   - DIF object functions
 *   - Format functions
 *   - Predicate functions
 *   - ECB functions
 *   - Buffer functions
 *   - Enabling functions
 *   - DOF functions
 *   - Anonymous enabling functions
 *   - Consumer state functions
 *   - Helper functions
 *   - Hook functions
 *   - Driver cookbook functions
 *
 * Each group of functions begins with a block comment labelled the "DTrace
 * [Group] Functions", allowing one to find each block by searching forward
 * on capital-f functions.
 */
#ifndef VBOX
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/systm.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cpuvar.h>
#include <sys/kmem.h>
#include <sys/strsubr.h>
#include <sys/sysmacros.h>
#include <sys/dtrace_impl.h>
#include <sys/atomic.h>
#include <sys/cmn_err.h>
#include <sys/mutex_impl.h>
#include <sys/rwlock_impl.h>
#include <sys/ctf_api.h>
#include <sys/panic.h>
#include <sys/priv_impl.h>
#include <sys/policy.h>
#include <sys/cred_impl.h>
#include <sys/procfs_isa.h>
#include <sys/taskq.h>
#include <sys/mkdev.h>
#include <sys/kdi.h>
#include <sys/zone.h>
#include <sys/socket.h>
#include <netinet/in.h>

#else /* VBOX */
# include <sys/dtrace_impl.h>
# include <VBox/sup.h>
# include <iprt/assert.h>
# include <iprt/cpuset.h>
# include <iprt/mem.h>
# include <iprt/mp.h>
# include <iprt/string.h>
# include <iprt/process.h>
# include <iprt/thread.h>
# include <iprt/timer.h>
# include <limits.h>

/*
 * Use asm.h to implement some of the simple stuff in dtrace_asm.s.
 */
# include <iprt/asm.h>
# include <iprt/asm-amd64-x86.h>
# define dtrace_casptr(a_ppvDst, a_pvOld, a_pvNew) \
    VBoxDtCompareAndSwapPtr((void * volatile *)a_ppvDst, a_pvOld, a_pvNew)
DECLINLINE(void *) VBoxDtCompareAndSwapPtr(void * volatile *ppvDst, void *pvOld, void *pvNew)
{
    void *pvRet;
    ASMAtomicCmpXchgExPtrVoid(ppvDst, pvNew, pvOld, &pvRet);
    return pvRet;
}

# define dtrace_cas32(a_pu32Dst, a_pu32Old, a_pu32New) \
    VBoxDtCompareAndSwapU32(a_pu32Dst, a_pu32Old, a_pu32New)
DECLINLINE(uint32_t) VBoxDtCompareAndSwapU32(uint32_t volatile *pu32Dst, uint32_t u32Old, uint32_t u32New)
{
    uint32_t u32Ret;
    ASMAtomicCmpXchgExU32(pu32Dst, u32New, u32Old, &u32Ret);
    return u32Ret;
}
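
/*
 * Illustrative sketch (added note, not part of the original sources):
 * dtrace_casptr() returns the *previous* value of the target, so the
 * canonical lock-free idiom used throughout this file is to snapshot,
 * prepare, and retry until the returned old value matches the snapshot.
 * VBoxDtExampleNode and vboxDtExamplePush are made-up names for
 * illustration only.
 */
#if 0 /* example only */
typedef struct VBoxDtExampleNode { struct VBoxDtExampleNode *pNext; } VBoxDtExampleNode;
static void vboxDtExamplePush(VBoxDtExampleNode * volatile *ppHead, VBoxDtExampleNode *pNode)
{
    VBoxDtExampleNode *pOld;
    do {
        pOld = *ppHead;         /* snapshot the current head */
        pNode->pNext = pOld;    /* link the new node in front of it */
    } while (dtrace_casptr(ppHead, pOld, pNode) != pOld); /* retry on race */
}
#endif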

#define dtrace_membar_consumer()   ASMReadFence()
#define dtrace_membar_producer()   ASMWriteFence()
#define dtrace_interrupt_disable() ASMIntDisableFlags()
#define dtrace_interrupt_enable(a_EFL) ASMSetFlags(a_EFL)

/*
 * NULL must be set to 0 or we'll end up with a billion warnings (= errors).
 */
# undef NULL
# define NULL (0)

# ifdef _MSC_VER
//#  pragma warning(disable: 4389) /* signed/unsigned mismatch */
# endif

#endif /* VBOX */

/** Check if the given address is a valid kernel address.
 * The value can be uintptr_t or uint64_t. */
#ifndef VBOX
# define VBDT_IS_VALID_KRNL_ADDR(a_uAddr) ((a_uAddr) >= KERNELBASE)
#else
# define VBDT_IS_VALID_KRNL_ADDR(a_uAddr) \
    (  (sizeof(a_uAddr) == sizeof(uintptr_t) || (uintptr_t)(a_uAddr) == (a_uAddr)) \
    && RTR0MemKernelIsValidAddr((void *)(uintptr_t)(a_uAddr)) )
#endif
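
/*
 * Illustrative note (added, not in the original sources): the sizeof/cast
 * dance in the VBox variant guards against 64-bit values that do not fit in
 * a 32-bit uintptr_t. E.g. on a 32-bit host, a_uAddr = UINT64_C(0x100000000)
 * casts to 0, fails the (uintptr_t)(a_uAddr) == (a_uAddr) test, and is
 * rejected before RTR0MemKernelIsValidAddr() is ever consulted.
 */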


/*
 * DTrace Tunable Variables
 *
 * The following variables may be tuned by adding a line to /etc/system that
 * includes both the name of the DTrace module ("dtrace") and the name of the
 * variable. For example:
 *
 *   set dtrace:dtrace_destructive_disallow = 1
 *
 * In general, the only variables that one should be tuning this way are those
 * that affect system-wide DTrace behavior, and for which the default behavior
 * is undesirable. Most of these variables are tunable on a per-consumer
 * basis using DTrace options, and need not be tuned on a system-wide basis.
 * When tuning these variables, avoid pathological values; while some attempt
 * is made to verify the integrity of these variables, they are not considered
 * part of the supported interface to DTrace, and they are therefore not
 * checked comprehensively. Further, these variables should not be tuned
 * dynamically via "mdb -kw" or other means; they should only be tuned via
 * /etc/system.
 */
int dtrace_destructive_disallow = 0;
dtrace_optval_t dtrace_nonroot_maxsize = (16 * 1024 * 1024);
size_t dtrace_difo_maxsize = (256 * 1024);
dtrace_optval_t dtrace_dof_maxsize = (256 * 1024);
size_t dtrace_global_maxsize = (16 * 1024);
size_t dtrace_actions_max = (16 * 1024);
size_t dtrace_retain_max = 1024;
dtrace_optval_t dtrace_helper_actions_max = 32;
dtrace_optval_t dtrace_helper_providers_max = 32;
dtrace_optval_t dtrace_dstate_defsize = (1 * 1024 * 1024);
size_t dtrace_strsize_default = 256;
dtrace_optval_t dtrace_cleanrate_default = 9900990; /* 101 hz */
dtrace_optval_t dtrace_cleanrate_min = 200000; /* 5000 hz */
dtrace_optval_t dtrace_cleanrate_max = (uint64_t)60 * NANOSEC; /* 1/minute */
dtrace_optval_t dtrace_aggrate_default = NANOSEC; /* 1 hz */
dtrace_optval_t dtrace_statusrate_default = NANOSEC; /* 1 hz */
dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC; /* 6/minute */
dtrace_optval_t dtrace_switchrate_default = NANOSEC; /* 1 hz */
dtrace_optval_t dtrace_nspec_default = 1;
dtrace_optval_t dtrace_specsize_default = 32 * 1024;
dtrace_optval_t dtrace_stackframes_default = 20;
dtrace_optval_t dtrace_ustackframes_default = 20;
dtrace_optval_t dtrace_jstackframes_default = 50;
dtrace_optval_t dtrace_jstackstrsize_default = 512;
int dtrace_msgdsize_max = 128;
hrtime_t dtrace_chill_max = 500 * (NANOSEC / MILLISEC); /* 500 ms */
hrtime_t dtrace_chill_interval = NANOSEC; /* 1000 ms */
int dtrace_devdepth_max = 32;
int dtrace_err_verbose;
hrtime_t dtrace_deadman_interval = NANOSEC;
hrtime_t dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
hrtime_t dtrace_deadman_user = (hrtime_t)30 * NANOSEC;

/*
 * DTrace External Variables
 *
 * As dtrace(7D) is a kernel module, any DTrace variables are obviously
 * available to DTrace consumers via the backtick (`) syntax. One of these,
 * dtrace_zero, is made deliberately so: it is provided as a source of
 * well-known, zero-filled memory. While this variable is not documented,
 * it is used by some translators as an implementation detail.
 */
const char dtrace_zero[256] = { 0 }; /* zero-filled memory */

/*
 * DTrace Internal Variables
 */
#ifndef VBOX
static dev_info_t *dtrace_devi; /* device info */
#endif
static vmem_t *dtrace_arena; /* probe ID arena */
#ifndef VBOX
static vmem_t *dtrace_minor; /* minor number arena */
static taskq_t *dtrace_taskq; /* task queue */
#endif
static dtrace_probe_t **dtrace_probes; /* array of all probes */
static VBDTTYPE(uint32_t,int) dtrace_nprobes; /* number of probes */
static dtrace_provider_t *dtrace_provider; /* provider list */
static dtrace_meta_t *dtrace_meta_pid; /* user-land meta provider */
static int dtrace_opens; /* number of opens */
static int dtrace_helpers; /* number of helpers */
#ifndef VBOX
static void *dtrace_softstate; /* softstate pointer */
#endif
static dtrace_hash_t *dtrace_bymod; /* probes hashed by module */
static dtrace_hash_t *dtrace_byfunc; /* probes hashed by function */
static dtrace_hash_t *dtrace_byname; /* probes hashed by name */
static dtrace_toxrange_t *dtrace_toxrange; /* toxic range array */
static int dtrace_toxranges; /* number of toxic ranges */
static int dtrace_toxranges_max; /* size of toxic range array */
static dtrace_anon_t dtrace_anon; /* anonymous enabling */
static kmem_cache_t *dtrace_state_cache; /* cache for dynamic state */
static uint64_t dtrace_vtime_references; /* number of vtimestamp refs */
#ifndef VBOX
static kthread_t *dtrace_panicked; /* panicking thread */
#endif
static dtrace_ecb_t *dtrace_ecb_create_cache; /* cached created ECB */
static dtrace_genid_t dtrace_probegen; /* current probe generation */
static dtrace_helpers_t *dtrace_deferred_pid; /* deferred helper list */
static dtrace_enabling_t *dtrace_retained; /* list of retained enablings */
static dtrace_genid_t dtrace_retained_gen; /* current retained enab gen */
static dtrace_dynvar_t dtrace_dynhash_sink; /* end of dynamic hash chains */
static int dtrace_dynvar_failclean; /* dynvars failed to clean */

/*
 * DTrace Locking
 * DTrace is protected by three (relatively coarse-grained) locks:
 *
 * (1) dtrace_lock is required to manipulate essentially any DTrace state,
 *     including enabling state, probes, ECBs, consumer state, helper state,
 *     etc. Importantly, dtrace_lock is _not_ required when in probe context;
 *     probe context is lock-free -- synchronization is handled via the
 *     dtrace_sync() cross call mechanism.
 *
 * (2) dtrace_provider_lock is required when manipulating provider state, or
 *     when provider state must be held constant.
 *
 * (3) dtrace_meta_lock is required when manipulating meta provider state, or
 *     when meta provider state must be held constant.
 *
 * The lock ordering between these three locks is dtrace_meta_lock before
 * dtrace_provider_lock before dtrace_lock. (In particular, there are
 * several places where dtrace_provider_lock is held by the framework as it
 * calls into the providers -- which then call back into the framework,
 * grabbing dtrace_lock.)
 *
 * There are two other locks in the mix: mod_lock and cpu_lock. With respect
 * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
 * role as a coarse-grained lock; it is acquired before both of these locks.
 * With respect to dtrace_meta_lock, its behavior is stranger: cpu_lock must
 * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
 * mod_lock is similar with respect to dtrace_provider_lock in that it must be
 * acquired _between_ dtrace_provider_lock and dtrace_lock.
 */
static kmutex_t dtrace_lock; /* probe state lock */
static kmutex_t dtrace_provider_lock; /* provider state lock */
static kmutex_t dtrace_meta_lock; /* meta-provider state lock */

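/*
 * Illustrative sketch (added note, not from the original sources): the
 * documented global ordering works out to dtrace_meta_lock -> cpu_lock ->
 * dtrace_provider_lock -> mod_lock -> dtrace_lock. A hypothetical path that
 * needed all of them would therefore acquire and release in this shape:
 */
#if 0 /* example only */
static void vboxDtExampleLockOrder(void)
{
    mutex_enter(&dtrace_meta_lock);     /* outermost DTrace lock */
    mutex_enter(&cpu_lock);             /* between meta and the others */
    mutex_enter(&dtrace_provider_lock);
    mutex_enter(&mod_lock);             /* between provider and dtrace_lock */
    mutex_enter(&dtrace_lock);          /* innermost */
    /* ... manipulate framework state ... */
    mutex_exit(&dtrace_lock);
    mutex_exit(&mod_lock);
    mutex_exit(&dtrace_provider_lock);
    mutex_exit(&cpu_lock);
    mutex_exit(&dtrace_meta_lock);
}
#endif
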
/*
 * DTrace Provider Variables
 *
 * These are the variables relating to DTrace as a provider (that is, the
 * provider of the BEGIN, END, and ERROR probes).
 */
static dtrace_pattr_t dtrace_provider_attr = {
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
};

static void
dtrace_nullop(void)
{}

static int
dtrace_enable_nullop(void)
{
    return (0);
}

static dtrace_pops_t dtrace_provider_ops = {
    (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop,
    (void (*)(void *, struct modctl *))dtrace_nullop,
    (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop,
    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
    NULL,
    NULL,
    NULL,
    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop
};

static dtrace_id_t dtrace_probeid_begin; /* special BEGIN probe */
static dtrace_id_t dtrace_probeid_end; /* special END probe */
dtrace_id_t dtrace_probeid_error; /* special ERROR probe */

/*
 * DTrace Helper Tracing Variables
 */
uint32_t dtrace_helptrace_next = 0;
uint32_t dtrace_helptrace_nlocals;
char *dtrace_helptrace_buffer;
int dtrace_helptrace_bufsize = 512 * 1024;

#ifdef DEBUG
int dtrace_helptrace_enabled = 1;
#else
int dtrace_helptrace_enabled = 0;
#endif

/*
 * DTrace Error Hashing
 *
 * On DEBUG kernels, DTrace will track the errors that it has seen in a hash
 * table. This is very useful for checking coverage of tests that are
 * expected to induce DIF or DOF processing errors, and may be useful for
 * debugging problems in the DIF code generator or in DOF generation. The
 * error hash may be examined with the ::dtrace_errhash MDB dcmd.
 */
#ifdef DEBUG
static dtrace_errhash_t dtrace_errhash[DTRACE_ERRHASHSZ];
static const char *dtrace_errlast;
static kthread_t *dtrace_errthread;
static kmutex_t dtrace_errlock;
#endif

/*
 * DTrace Macros and Constants
 *
 * These are various macros that are useful in various spots in the
 * implementation, along with a few random constants that have no meaning
 * outside of the implementation. There is no real structure to this cpp
 * mishmash -- but is there ever?
 */
#define DTRACE_HASHSTR(hash, probe) \
    dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs)))

#define DTRACE_HASHNEXT(hash, probe) \
    (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs)

#define DTRACE_HASHPREV(hash, probe) \
    (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs)

#define DTRACE_HASHEQ(hash, lhs, rhs) \
    (strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \
        *((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0)

#define DTRACE_AGGHASHSIZE_SLEW 17

#define DTRACE_V4MAPPED_OFFSET (sizeof (uint32_t) * 3)

/*
 * The key for a thread-local variable consists of the lower 61 bits of the
 * t_did, plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
 * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
 * equal to a variable identifier. This is necessary (but not sufficient) to
 * assure that global associative arrays never collide with thread-local
 * variables. To guarantee that they cannot collide, we must also define the
 * order for keying dynamic variables. That order is:
 *
 * [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
 *
 * Because the variable-key and the tls-key are in orthogonal spaces, there is
 * no way for a global variable key signature to match a thread-local key
 * signature.
 */
#ifndef VBOX
#define DTRACE_TLS_THRKEY(where) { \
    uint_t intr = 0; \
    uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \
    for (; actv; actv >>= 1) \
        intr++; \
    ASSERT(intr < (1 << 3)); \
    (where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \
        (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
}
#else
#define DTRACE_TLS_THRKEY(where) do { \
    (where) = (((uintptr_t)RTThreadNativeSelf() + DIF_VARIABLE_MAX) & (RT_BIT_64(61) - 1)) \
        | (RTThreadIsInInterrupt(NIL_RTTHREAD) ? RT_BIT_64(61) : 0); \
} while (0)
#endif

#define DT_BSWAP_8(x) ((x) & 0xff)
#define DT_BSWAP_16(x) ((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
#define DT_BSWAP_32(x) ((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
#define DT_BSWAP_64(x) ((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))
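
/*
 * Worked example (added note): DT_BSWAP_16(0x1234) yields
 * ((0x34 << 8) | 0x12) == 0x3412; the wider variants recurse on the halves,
 * so DT_BSWAP_32(0x11223344) == 0x44332211. Note that the 32- and 64-bit
 * forms assume an operand wide enough for the shifts to be defined.
 */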

#define DT_MASK_LO 0x00000000FFFFFFFFULL

#define DTRACE_STORE(type, tomax, offset, what) \
    *((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);

#ifndef __i386
#define DTRACE_ALIGNCHECK(addr, size, flags) \
    if (addr & (size - 1)) { \
        *flags |= CPU_DTRACE_BADALIGN; \
        cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_illval = addr; \
        return (0); \
    }
#else
#define DTRACE_ALIGNCHECK(addr, size, flags)
#endif

/*
 * Test whether a range of memory starting at testaddr of size testsz falls
 * within the range of memory described by addr, sz. We take care to avoid
 * problems with overflow and underflow of the unsigned quantities, and
 * disallow all negative sizes. Ranges of size 0 are allowed.
 */
#define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
    ((testaddr) - (baseaddr) < (basesz) && \
    (testaddr) + (testsz) - (baseaddr) <= (basesz) && \
    (testaddr) + (testsz) >= (testaddr))

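/*
 * Worked example (added note): a naive check like
 * testaddr >= baseaddr && testaddr + testsz <= baseaddr + basesz would pass
 * for testaddr = UINTPTR_MAX and testsz = 2, because testaddr + testsz wraps
 * around to 1. The last clause above, (testaddr) + (testsz) >= (testaddr),
 * rejects exactly that wrap, and the leading subtraction handles a testaddr
 * below baseaddr without a separate signed comparison.
 */
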
/*
 * Test whether alloc_sz bytes will fit in the scratch region. We isolate
 * alloc_sz on the righthand side of the comparison in order to avoid overflow
 * or underflow in the comparison with it. This is simpler than the INRANGE
 * check above, because we know that the dtms_scratch_ptr is valid in the
 * range. Allocations of size zero are allowed.
 */
#define DTRACE_INSCRATCH(mstate, alloc_sz) \
    ((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
    (mstate)->dtms_scratch_ptr >= (alloc_sz))

#ifndef VBOX
#define DTRACE_LOADFUNC(bits) \
/*CSTYLED*/ \
VBDTSTATIC uint##bits##_t \
dtrace_load##bits(uintptr_t addr) \
{ \
    size_t size = bits / NBBY; \
    /*CSTYLED*/ \
    uint##bits##_t rval; \
    int i; \
    processorid_t me = VBDT_GET_CPUID(); \
    volatile uint16_t *flags = (volatile uint16_t *) \
        &cpu_core[me].cpuc_dtrace_flags; \
\
    DTRACE_ALIGNCHECK(addr, size, flags); \
\
    for (i = 0; i < dtrace_toxranges; i++) { \
        if (addr >= dtrace_toxrange[i].dtt_limit) \
            continue; \
\
        if (addr + size <= dtrace_toxrange[i].dtt_base) \
            continue; \
\
        /* \
         * This address falls within a toxic region; return 0. \
         */ \
        *flags |= CPU_DTRACE_BADADDR; \
        cpu_core[me].cpuc_dtrace_illval = addr; \
        return (0); \
    } \
\
    *flags |= CPU_DTRACE_NOFAULT; \
    /*CSTYLED*/ \
    rval = *((volatile uint##bits##_t *)addr); \
    *flags &= ~CPU_DTRACE_NOFAULT; \
\
    return (!(*flags & CPU_DTRACE_FAULT) ? rval : 0); \
}
#else /* VBOX */
# define DTRACE_LOADFUNC(bits) \
VBDTSTATIC uint##bits##_t \
dtrace_load##bits(uintptr_t addr) \
{ \
    size_t const size = bits / NBBY; \
    uint##bits##_t rval; \
    processorid_t me; \
    int i, rc; \
\
    /*DTRACE_ALIGNCHECK(addr, size, flags);*/ \
\
    for (i = 0; i < dtrace_toxranges; i++) { \
        if (addr >= dtrace_toxrange[i].dtt_limit) \
            continue; \
\
        if (addr + size <= dtrace_toxrange[i].dtt_base) \
            continue; \
\
        /* \
         * This address falls within a toxic region; return 0. \
         */ \
        me = VBDT_GET_CPUID(); \
        cpu_core[me].cpuc_dtrace_flags |= CPU_DTRACE_BADADDR; \
        cpu_core[me].cpuc_dtrace_illval = addr; \
        return (0); \
    } \
\
    rc = RTR0MemKernelCopyFrom(&rval, (void const *)addr, size); \
    if (RT_SUCCESS(rc)) \
        return rval; \
\
    /* \
     * If not supported, pray it won't fault... \
     */ \
    if (rc == VERR_NOT_SUPPORTED) \
        return *(uint##bits##_t const *)addr; \
\
    me = VBDT_GET_CPUID(); \
    cpu_core[me].cpuc_dtrace_flags |= CPU_DTRACE_BADADDR; \
    cpu_core[me].cpuc_dtrace_illval = addr; \
    return (0); \
}

#endif /* VBOX */

#ifdef _LP64
#define dtrace_loadptr dtrace_load64
#else
#define dtrace_loadptr dtrace_load32
#endif

#define DTRACE_DYNHASH_FREE 0
#define DTRACE_DYNHASH_SINK 1
#define DTRACE_DYNHASH_VALID 2

#define DTRACE_MATCH_FAIL -1
#define DTRACE_MATCH_NEXT 0
#define DTRACE_MATCH_DONE 1
#define DTRACE_ANCHORED(probe) ((probe)->dtpr_func[0] != '\0')
#define DTRACE_STATE_ALIGN 64

#define DTRACE_FLAGS2FLT(flags) \
    (((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR : \
    ((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP : \
    ((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO : \
    ((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV : \
    ((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV : \
    ((flags) & CPU_DTRACE_TUPOFLOW) ? DTRACEFLT_TUPOFLOW : \
    ((flags) & CPU_DTRACE_BADALIGN) ? DTRACEFLT_BADALIGN : \
    ((flags) & CPU_DTRACE_NOSCRATCH) ? DTRACEFLT_NOSCRATCH : \
    ((flags) & CPU_DTRACE_BADSTACK) ? DTRACEFLT_BADSTACK : \
    DTRACEFLT_UNKNOWN)

#define DTRACEACT_ISSTRING(act) \
    ((act)->dta_kind == DTRACEACT_DIFEXPR && \
    (act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)

static size_t dtrace_strlen(const char *, size_t);
static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
static void dtrace_enabling_provide(dtrace_provider_t *);
static int dtrace_enabling_match(dtrace_enabling_t *, int *);
static void dtrace_enabling_matchall(void);
static dtrace_state_t *dtrace_anon_grab(void);
#ifndef VBOX
static uint64_t dtrace_helper(int, dtrace_mstate_t *,
    dtrace_state_t *, uint64_t, uint64_t);
static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
#endif
static void dtrace_buffer_drop(dtrace_buffer_t *);
static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
    dtrace_state_t *, dtrace_mstate_t *);
static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
    dtrace_optval_t);
static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);
static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);

/*
 * DTrace Probe Context Functions
 *
 * These functions are called from probe context. Because probe context is
 * any context in which C may be called, arbitrary locks may be held,
 * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
 * As a result, functions called from probe context may only call other DTrace
 * support functions -- they may not interact at all with the system at large.
 * (Note that the ASSERT macro is made probe-context safe by redefining it in
 * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
 * loads are to be performed from probe context, they _must_ be in terms of
 * the safe dtrace_load*() variants.
 *
 * Some functions in this block are not actually called from probe context;
 * for these functions, there will be a comment above the function reading
 * "Note: not called from probe context."
 */
void
dtrace_panic(const char *format, ...)
{
    va_list alist;

    va_start(alist, format);
    dtrace_vpanic(format, alist);
    va_end(alist);
}

#ifndef VBOX /* We have our own assertion machinery. */
int
dtrace_assfail(const char *a, const char *f, int l)
{
    dtrace_panic("assertion failed: %s, file: %s, line: %d", a, f, l);

    /*
     * We just need something here that even the most clever compiler
     * cannot optimize away.
     */
    return (a[(uintptr_t)f]);
}
#endif

/*
 * Atomically increment a specified error counter from probe context.
 */
static void
dtrace_error(uint32_t *counter)
{
    /*
     * Most counters stored to in probe context are per-CPU counters.
     * However, there are some error conditions that are sufficiently
     * arcane that they don't merit per-CPU storage. If these counters
     * are incremented concurrently on different CPUs, scalability will be
     * adversely affected -- but we don't expect them to be white-hot in a
     * correctly constructed enabling...
     */
    uint32_t oval, nval;

    do {
        oval = *counter;

        if ((nval = oval + 1) == 0) {
            /*
             * If the counter would wrap, set it to 1 -- assuring
             * that the counter is never zero when we have seen
             * errors. (The counter must be 32-bits because we
             * aren't guaranteed a 64-bit compare&swap operation.)
             * To save this code both the infamy of being fingered
             * by a priggish news story and the indignity of being
             * the target of a neo-puritan witch trial, we're
             * carefully avoiding any colorful description of the
             * likelihood of this condition -- but suffice it to
             * say that it is only slightly more likely than the
             * overflow of predicate cache IDs, as discussed in
             * dtrace_predicate_create().
             */
            nval = 1;
        }
    } while (dtrace_cas32(counter, oval, nval) != oval);
}

/*
 * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
 * uint8_t, a uint16_t, a uint32_t and a uint64_t.
 */
DTRACE_LOADFUNC(8)
DTRACE_LOADFUNC(16)
DTRACE_LOADFUNC(32)
DTRACE_LOADFUNC(64)

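/*
 * Illustrative sketch (added note, not part of the original sources): the
 * expansions above provide dtrace_load8/16/32/64(), and dtrace_loadptr maps
 * to the pointer-sized one. Probe-context code walks untrusted memory only
 * through these; a hypothetical chase of a pointer to a 32-bit field at
 * offset 8 might look like this:
 */
#if 0 /* example only */
static uint32_t vboxDtExampleChase(uintptr_t addr)
{
    uintptr_t p = (uintptr_t)dtrace_loadptr(addr);  /* safe pointer fetch */
    if (p == 0)
        return 0;               /* NULL or fault: the CPU flags are set */
    return dtrace_load32(p + 8);                    /* safe field load */
}
#endif
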
static int
dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
{
    if (dest < mstate->dtms_scratch_base)
        return (0);

    if (dest + size < dest)
        return (0);

    if (dest + size > mstate->dtms_scratch_ptr)
        return (0);

    return (1);
}

static int
dtrace_canstore_statvar(uint64_t addr, size_t sz,
    dtrace_statvar_t **svars, int nsvars)
{
    int i;

    for (i = 0; i < nsvars; i++) {
        dtrace_statvar_t *svar = svars[i];

        if (svar == NULL || svar->dtsv_size == 0)
            continue;

        if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size))
            return (1);
    }

    return (0);
}

/*
 * Check to see if the address is within a memory region to which a store may
 * be issued. This includes the DTrace scratch areas, and any DTrace variable
 * region. The caller of dtrace_canstore() is responsible for performing any
 * alignment checks that are needed before stores are actually executed.
 */
static int
dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
    /*
     * First, check to see if the address is in scratch space...
     */
    if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
        mstate->dtms_scratch_size))
        return (1);

    /*
     * Now check to see if it's a dynamic variable. This check will pick
     * up both thread-local variables and any global dynamically-allocated
     * variables.
     */
    if (DTRACE_INRANGE(addr, sz, (uintptr_t)vstate->dtvs_dynvars.dtds_base,
        vstate->dtvs_dynvars.dtds_size)) {
        dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
        uintptr_t base = (uintptr_t)dstate->dtds_base +
            (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
        uintptr_t chunkoffs;

        /*
         * Before we assume that we can store here, we need to make
         * sure that it isn't in our metadata -- storing to our
         * dynamic variable metadata would corrupt our state. For
         * the range to not include any dynamic variable metadata,
         * it must:
         *
         * (1) Start above the hash table that is at the base of
         *     the dynamic variable space
         *
         * (2) Have a starting chunk offset that is beyond the
         *     dtrace_dynvar_t that is at the base of every chunk
         *
         * (3) Not span a chunk boundary
         *
         */
        if (addr < base)
            return (0);

        chunkoffs = (addr - base) % dstate->dtds_chunksize;

        if (chunkoffs < sizeof (dtrace_dynvar_t))
            return (0);

        if (chunkoffs + sz > dstate->dtds_chunksize)
            return (0);

        return (1);
    }

    /*
     * Finally, check the static local and global variables. These checks
     * take the longest, so we perform them last.
     */
    if (dtrace_canstore_statvar(addr, sz,
        vstate->dtvs_locals, vstate->dtvs_nlocals))
        return (1);

    if (dtrace_canstore_statvar(addr, sz,
        vstate->dtvs_globals, vstate->dtvs_nglobals))
        return (1);

    return (0);
}


/*
 * Convenience routine to check to see if the address is within a memory
 * region in which a load may be issued given the user's privilege level;
 * if not, it sets the appropriate error flags and loads 'addr' into the
 * illegal value slot.
 *
 * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
 * appropriate memory access protection.
 */
static int
dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
    volatile uintptr_t *illval = &cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_illval;

    /*
     * If we hold the privilege to read from kernel memory, then
     * everything is readable.
     */
    if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
        return (1);

    /*
     * You can obviously read that which you can store.
     */
    if (dtrace_canstore(addr, sz, mstate, vstate))
        return (1);

    /*
     * We're allowed to read from our own string table.
     */
    if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab,
        mstate->dtms_difo->dtdo_strlen))
        return (1);

    DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
    *illval = addr;
    return (0);
}

/*
 * Convenience routine to check to see if a given string is within a memory
 * region in which a load may be issued given the user's privilege level;
 * this exists so that we don't need to issue unnecessary dtrace_strlen()
 * calls in the event that the user has all privileges.
 */
static int
dtrace_strcanload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
    size_t strsz;

    /*
     * If we hold the privilege to read from kernel memory, then
     * everything is readable.
     */
    if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
        return (1);

    strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr, sz);
    if (dtrace_canload(addr, strsz, mstate, vstate))
        return (1);

    return (0);
}

/*
 * Convenience routine to check to see if a given variable is within a memory
 * region in which a load may be issued given the user's privilege level.
 */
static int
dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
    size_t sz;
    ASSERT(type->dtdt_flags & DIF_TF_BYREF);

    /*
     * If we hold the privilege to read from kernel memory, then
     * everything is readable.
     */
    if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
        return (1);

    if (type->dtdt_kind == DIF_TYPE_STRING)
        sz = dtrace_strlen(src,
            vstate->dtvs_state->dts_options[DTRACEOPT_STRSIZE]) + 1;
    else
        sz = type->dtdt_size;

    return (dtrace_canload((uintptr_t)src, sz, mstate, vstate));
}

/*
 * Compare two strings using safe loads.
 */
static int
dtrace_strncmp(char *s1, char *s2, size_t limit)
{
    uint8_t c1, c2;
    volatile uint16_t *flags;

    if (s1 == s2 || limit == 0)
        return (0);

    flags = (volatile uint16_t *)&cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags;

    do {
        if (s1 == NULL) {
            c1 = '\0';
        } else {
            c1 = dtrace_load8((uintptr_t)s1++);
        }

        if (s2 == NULL) {
            c2 = '\0';
        } else {
            c2 = dtrace_load8((uintptr_t)s2++);
        }

        if (c1 != c2)
            return (c1 - c2);
    } while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));

    return (0);
}

/*
 * Compute strlen(s) for a string using safe memory accesses. The additional
 * lim parameter is used to specify a maximum length to ensure completion.
 */
static size_t
dtrace_strlen(const char *s, size_t lim)
{
    uint_t len;

    for (len = 0; len != lim; len++) {
        if (dtrace_load8((uintptr_t)s++) == '\0')
            break;
    }

    return (len);
}

/*
 * Check if an address falls within a toxic region.
 */
static int
dtrace_istoxic(uintptr_t kaddr, size_t size)
{
    uintptr_t taddr, tsize;
    int i;

    for (i = 0; i < dtrace_toxranges; i++) {
        taddr = dtrace_toxrange[i].dtt_base;
        tsize = dtrace_toxrange[i].dtt_limit - taddr;

        if (kaddr - taddr < tsize) {
            DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
            cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_illval = kaddr;
            return (1);
        }

        if (taddr - kaddr < size) {
            DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
            cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_illval = taddr;
            return (1);
        }
    }

    return (0);
}

/*
 * Copy src to dst using safe memory accesses. The src is assumed to be unsafe
 * memory specified by the DIF program. The dst is assumed to be safe memory
 * that we can store to directly because it is managed by DTrace. As with
 * standard bcopy, overlapping copies are handled properly.
 */
static void
dtrace_bcopy(const void *src, void *dst, size_t len)
{
    if (len != 0) {
        uint8_t *s1 = dst;
        const uint8_t *s2 = src;

        if (s1 <= s2) {
            do {
                *s1++ = dtrace_load8((uintptr_t)s2++);
            } while (--len != 0);
        } else {
            s2 += len;
            s1 += len;

            do {
                *--s1 = dtrace_load8((uintptr_t)--s2);
            } while (--len != 0);
        }
    }
}

/*
 * Copy src to dst using safe memory accesses, up to either the specified
 * length, or the point that a nul byte is encountered. The src is assumed to
 * be unsafe memory specified by the DIF program. The dst is assumed to be
 * safe memory that we can store to directly because it is managed by DTrace.
 * Unlike dtrace_bcopy(), overlapping regions are not handled.
 */
static void
dtrace_strcpy(const void *src, void *dst, size_t len)
{
    if (len != 0) {
        uint8_t *s1 = dst, c;
        const uint8_t *s2 = src;

        do {
            *s1++ = c = dtrace_load8((uintptr_t)s2++);
        } while (--len != 0 && c != '\0');
    }
}

/*
 * Copy src to dst, deriving the size and type from the specified (BYREF)
 * variable type. The src is assumed to be unsafe memory specified by the DIF
 * program. The dst is assumed to be DTrace variable memory that is of the
 * specified type; we assume that we can store to directly.
 */
static void
dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type)
{
    ASSERT(type->dtdt_flags & DIF_TF_BYREF);

    if (type->dtdt_kind == DIF_TYPE_STRING) {
        dtrace_strcpy(src, dst, type->dtdt_size);
    } else {
        dtrace_bcopy(src, dst, type->dtdt_size);
    }
}

/*
 * Compare s1 to s2 using safe memory accesses. The s1 data is assumed to be
 * unsafe memory specified by the DIF program. The s2 data is assumed to be
 * safe memory that we can access directly because it is managed by DTrace.
 */
static int
dtrace_bcmp(const void *s1, const void *s2, size_t len)
{
    volatile uint16_t *flags;

    flags = (volatile uint16_t *)&cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags;

    if (s1 == s2)
        return (0);

    if (s1 == NULL || s2 == NULL)
        return (1);

    if (s1 != s2 && len != 0) {
        const uint8_t *ps1 = s1;
        const uint8_t *ps2 = s2;

        do {
            if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
                return (1);
        } while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
    }
    return (0);
}

/*
 * Zero the specified region using a simple byte-by-byte loop. Note that this
 * is for safe DTrace-managed memory only.
 */
static void
dtrace_bzero(void *dst, size_t len)
{
    uchar_t *cp;

    for (cp = dst; len != 0; len--)
        *cp++ = 0;
}

static void
dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
{
    uint64_t result[2];

    result[0] = addend1[0] + addend2[0];
    result[1] = addend1[1] + addend2[1] +
        (result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0);

    sum[0] = result[0];
    sum[1] = result[1];
}

/*
 * Shift the 128-bit value in a by b. If b is positive, shift left.
 * If b is negative, shift right.
 */
static void
dtrace_shift_128(uint64_t *a, int b)
{
    uint64_t mask;

    if (b == 0)
        return;

    if (b < 0) {
        b = -b;
        if (b >= 64) {
            a[0] = a[1] >> (b - 64);
            a[1] = 0;
        } else {
            a[0] >>= b;
            mask = 1LL << (64 - b);
            mask -= 1;
            a[0] |= ((a[1] & mask) << (64 - b));
            a[1] >>= b;
        }
    } else {
        if (b >= 64) {
            a[1] = a[0] << (b - 64);
            a[0] = 0;
        } else {
            a[1] <<= b;
            mask = a[0] >> (64 - b);
            a[1] |= mask;
            a[0] <<= b;
        }
    }
}

/*
 * The basic idea is to break the 2 64-bit values into 4 32-bit values,
 * use native multiplication on those, and then re-combine into the
 * resulting 128-bit value.
 *
 * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
 *     hi1 * hi2 << 64 +
 *     hi1 * lo2 << 32 +
 *     hi2 * lo1 << 32 +
 *     lo1 * lo2
 */
static void
dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
{
    uint64_t hi1, hi2, lo1, lo2;
    uint64_t tmp[2];

    hi1 = factor1 >> 32;
    hi2 = factor2 >> 32;

    lo1 = factor1 & DT_MASK_LO;
    lo2 = factor2 & DT_MASK_LO;

    product[0] = lo1 * lo2;
    product[1] = hi1 * hi2;

    tmp[0] = hi1 * lo2;
    tmp[1] = 0;
    dtrace_shift_128(tmp, 32);
    dtrace_add_128(product, tmp, product);

    tmp[0] = hi2 * lo1;
    tmp[1] = 0;
    dtrace_shift_128(tmp, 32);
    dtrace_add_128(product, tmp, product);
}
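
/*
 * Illustrative self-check (added note, not from the original sources): with
 * factor1 = UINT64_MAX and factor2 = 2, the decomposition above yields
 * (2^64 - 1) * 2 = 2^65 - 2, i.e. product[1] == 1 (high word) and
 * product[0] == 0xFFFFFFFFFFFFFFFE (low word).
 */
#if 0 /* example only */
static void vboxDtExampleMul128Check(void)
{
    uint64_t product[2];
    dtrace_multiply_128(UINT64_MAX, 2, product);
    ASSERT(product[0] == UINT64_C(0xFFFFFFFFFFFFFFFE));
    ASSERT(product[1] == 1);
}
#endif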

/*
 * This privilege check should be used by actions and subroutines to
 * verify that the user credentials of the process that enabled the
 * invoking ECB match the target credentials
 */
static int
dtrace_priv_proc_common_user(dtrace_state_t *state)
{
    cred_t *cr, *s_cr = state->dts_cred.dcr_cred;

    /*
     * We should always have a non-NULL state cred here, since if cred
     * is null (anonymous tracing), we fast-path bypass this routine.
     */
    ASSERT(s_cr != NULL);

    if ((cr = CRED()) != NULL &&
        s_cr->cr_uid == cr->cr_uid &&
        s_cr->cr_uid == cr->cr_ruid &&
        s_cr->cr_uid == cr->cr_suid &&
        s_cr->cr_gid == cr->cr_gid &&
        s_cr->cr_gid == cr->cr_rgid &&
        s_cr->cr_gid == cr->cr_sgid)
        return (1);

    return (0);
}

/*
 * This privilege check should be used by actions and subroutines to
 * verify that the zone of the process that enabled the invoking ECB
 * matches the target credentials
 */
static int
dtrace_priv_proc_common_zone(dtrace_state_t *state)
{
    cred_t *cr, *s_cr = state->dts_cred.dcr_cred;

    /*
     * We should always have a non-NULL state cred here, since if cred
     * is null (anonymous tracing), we fast-path bypass this routine.
     */
    ASSERT(s_cr != NULL);

    if ((cr = CRED()) != NULL &&
        s_cr->cr_zone == cr->cr_zone)
        return (1);

    return (0);
}

/*
 * This privilege check should be used by actions and subroutines to
 * verify that the process has not setuid or changed credentials.
 */
static int
dtrace_priv_proc_common_nocd(VBDTVOID)
{
#ifndef VBOX
    proc_t *proc;

    if ((proc = VBDT_GET_PROC()) != NULL &&
        !(proc->p_flag & SNOCD))
        return (1);

    return (0);
#else
    return (1);
#endif
}

static int
dtrace_priv_proc_destructive(dtrace_state_t *state)
{
    int action = state->dts_cred.dcr_action;

    if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
        dtrace_priv_proc_common_zone(state) == 0)
        goto bad;

    if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
        dtrace_priv_proc_common_user(state) == 0)
        goto bad;

    if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
        dtrace_priv_proc_common_nocd() == 0)
        goto bad;

    return (1);

bad:
    cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;

    return (0);
}

static int
dtrace_priv_proc_control(dtrace_state_t *state)
{
    if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
        return (1);

    if (dtrace_priv_proc_common_zone(state) &&
        dtrace_priv_proc_common_user(state) &&
        dtrace_priv_proc_common_nocd())
        return (1);

    cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;

    return (0);
}

static int
dtrace_priv_proc(dtrace_state_t *state)
{
    if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
        return (1);

    cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;

    return (0);
}

static int
dtrace_priv_kernel(dtrace_state_t *state)
{
    if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
        return (1);

    cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;

    return (0);
}

static int
dtrace_priv_kernel_destructive(dtrace_state_t *state)
{
    if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
        return (1);

    cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;

    return (0);
}

/*
 * Note: not called from probe context. This function is called
 * asynchronously (and at a regular interval) from outside of probe context to
 * clean the dirty dynamic variable lists on all CPUs. Dynamic variable
 * cleaning is explained in detail in <sys/dtrace_impl.h>.
 */
VBDTSTATIC void
dtrace_dynvar_clean(dtrace_dstate_t *dstate)
{
    dtrace_dynvar_t *dirty;
    dtrace_dstate_percpu_t *dcpu;
    dtrace_dynvar_t **rinsep;
    int i, j, work = 0;

    for (i = 0; i < NCPU; i++) {
        dcpu = &dstate->dtds_percpu[i];
        rinsep = &dcpu->dtdsc_rinsing;

        /*
         * If the dirty list is NULL, there is no dirty work to do.
         */
        if (dcpu->dtdsc_dirty == NULL)
            continue;

        if (dcpu->dtdsc_rinsing != NULL) {
            /*
             * If the rinsing list is non-NULL, then it is because
             * this CPU was selected to accept another CPU's
             * dirty list -- and since that time, dirty buffers
             * have accumulated. This is a highly unlikely
             * condition, but we choose to ignore the dirty
             * buffers -- they'll be picked up in a future cleanse.
             */
            continue;
        }

        if (dcpu->dtdsc_clean != NULL) {
            /*
             * If the clean list is non-NULL, then we're in a
             * situation where a CPU has done deallocations (we
             * have a non-NULL dirty list) but no allocations (we
             * also have a non-NULL clean list). We can't simply
             * move the dirty list into the clean list on this
             * CPU, yet we also don't want to allow this condition
             * to persist, lest a short clean list prevent a
             * massive dirty list from being cleaned (which in
             * turn could lead to otherwise avoidable dynamic
             * drops). To deal with this, we look for some CPU
             * with a NULL clean list, NULL dirty list, and NULL
             * rinsing list -- and then we borrow this CPU to
             * rinse our dirty list.
             */
            for (j = 0; j < NCPU; j++) {
                dtrace_dstate_percpu_t *rinser;

                rinser = &dstate->dtds_percpu[j];

                if (rinser->dtdsc_rinsing != NULL)
                    continue;

                if (rinser->dtdsc_dirty != NULL)
                    continue;

                if (rinser->dtdsc_clean != NULL)
                    continue;

                rinsep = &rinser->dtdsc_rinsing;
                break;
            }

            if (j == NCPU) {
                /*
                 * We were unable to find another CPU that
                 * could accept this dirty list -- we are
                 * therefore unable to clean it now.
                 */
                dtrace_dynvar_failclean++;
                continue;
            }
        }

        work = 1;

        /*
         * Atomically move the dirty list aside.
         */
        do {
            dirty = dcpu->dtdsc_dirty;

            /*
             * Before we zap the dirty list, set the rinsing list.
             * (This allows for a potential assertion in
             * dtrace_dynvar(): if a free dynamic variable appears
             * on a hash chain, either the dirty list or the
             * rinsing list for some CPU must be non-NULL.)
             */
            *rinsep = dirty;
            dtrace_membar_producer();
        } while (dtrace_casptr(&dcpu->dtdsc_dirty,
            dirty, NULL) != dirty);
    }

    if (!work) {
        /*
         * We have no work to do; we can simply return.
         */
        return;
    }

    dtrace_sync();

    for (i = 0; i < NCPU; i++) {
        dcpu = &dstate->dtds_percpu[i];

        if (dcpu->dtdsc_rinsing == NULL)
            continue;

        /*
         * We are now guaranteed that no hash chain contains a pointer
         * into this dirty list; we can make it clean.
         */
        ASSERT(dcpu->dtdsc_clean == NULL);
        dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
        dcpu->dtdsc_rinsing = NULL;
    }

    /*
     * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
     * sure that all CPUs have seen all of the dtdsc_clean pointers.
     * This prevents a race whereby a CPU incorrectly decides that
     * the state should be something other than DTRACE_DSTATE_CLEAN
     * after dtrace_dynvar_clean() has completed.
     */
    dtrace_sync();

    dstate->dtds_state = DTRACE_DSTATE_CLEAN;
}

/*
 * Depending on the value of the op parameter, this function looks up,
 * allocates or deallocates an arbitrarily-keyed dynamic variable. If an
 * allocation is requested, this function will return a pointer to a
 * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
 * variable can be allocated. If NULL is returned, the appropriate counter
 * will be incremented.
 */
VBDTSTATIC dtrace_dynvar_t *
dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
    dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
    uint64_t hashval = DTRACE_DYNHASH_VALID;
    dtrace_dynhash_t *hash = dstate->dtds_hash;
    dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
    processorid_t me = VBDT_GET_CPUID(), cpu = me;
    dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
    size_t bucket, ksize;
    size_t chunksize = dstate->dtds_chunksize;
    uintptr_t kdata, lock, nstate;
    uint_t i;

    ASSERT(nkeys != 0);

    /*
     * Hash the key. As with aggregations, we use Jenkins' "One-at-a-time"
     * algorithm. For the by-value portions, we perform the algorithm in
     * 16-bit chunks (as opposed to 8-bit chunks). This speeds things up a
     * bit, and seems to have only a minute effect on distribution. For
     * the by-reference data, we perform "One-at-a-time" iterating (safely)
     * over each referenced byte. It's painful to do this, but it's much
     * better than pathological hash distribution. The efficacy of the
     * hashing algorithm (and a comparison with other algorithms) may be
     * found by running the ::dtrace_dynstat MDB dcmd.
     */
    for (i = 0; i < nkeys; i++) {
        if (key[i].dttk_size == 0) {
            uint64_t val = key[i].dttk_value;

            hashval += (val >> 48) & 0xffff;
            hashval += (hashval << 10);
            hashval ^= (hashval >> 6);

            hashval += (val >> 32) & 0xffff;
            hashval += (hashval << 10);
            hashval ^= (hashval >> 6);

            hashval += (val >> 16) & 0xffff;
            hashval += (hashval << 10);
            hashval ^= (hashval >> 6);

            hashval += val & 0xffff;
            hashval += (hashval << 10);
            hashval ^= (hashval >> 6);
        } else {
            /*
             * This is incredibly painful, but it beats the hell
             * out of the alternative.
             */
            uint64_t j, size = key[i].dttk_size;
            uintptr_t base = (uintptr_t)key[i].dttk_value;

            if (!dtrace_canload(base, size, mstate, vstate))
                break;

            for (j = 0; j < size; j++) {
                hashval += dtrace_load8(base + j);
                hashval += (hashval << 10);
                hashval ^= (hashval >> 6);
            }
        }
    }

    if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
        return (NULL);

    hashval += (hashval << 3);
    hashval ^= (hashval >> 11);
    hashval += (hashval << 15);

    /*
     * There is a remote chance (ideally, 1 in 2^31) that our hashval
     * comes out to be one of our two sentinel hash values. If this
     * actually happens, we set the hashval to be a value known to be a
     * non-sentinel value.
     */
    if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
        hashval = DTRACE_DYNHASH_VALID;

    /*
     * Yes, it's painful to do a divide here. If the cycle count becomes
     * important here, tricks can be pulled to reduce it. (However, it's
     * critical that hash collisions be kept to an absolute minimum;
     * they're much more painful than a divide.) It's better to have a
     * solution that generates few collisions and still keeps things
     * relatively simple.
     */
    bucket = hashval % dstate->dtds_hashsize;

    if (op == DTRACE_DYNVAR_DEALLOC) {
        volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;

        for (;;) {
            while ((lock = *lockp) & 1)
                continue;

            if (dtrace_casptr((void *)lockp,
                (void *)lock, (void *)(lock + 1)) == (void *)lock)
                break;
        }

        dtrace_membar_producer();
    }

top:
    prev = NULL;
    lock = hash[bucket].dtdh_lock;

    dtrace_membar_consumer();

    start = hash[bucket].dtdh_chain;
    ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
        start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
        op != DTRACE_DYNVAR_DEALLOC));

    for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
        dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
        dtrace_key_t *dkey = &dtuple->dtt_key[0];

        if (dvar->dtdv_hashval != hashval) {
            if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
                /*
                 * We've reached the sink, and therefore the
                 * end of the hash chain; we can kick out of
                 * the loop knowing that we have seen a valid
                 * snapshot of state.
                 */
                ASSERT(dvar->dtdv_next == NULL);
                ASSERT(dvar == &dtrace_dynhash_sink);
                break;
            }

            if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
                /*
                 * We've gone off the rails: somewhere along
                 * the line, one of the members of this hash
                 * chain was deleted. Note that we could also
                 * detect this by simply letting this loop run
                 * to completion, as we would eventually hit
                 * the end of the dirty list. However, we
                 * want to avoid running the length of the
                 * dirty list unnecessarily (it might be quite
                 * long), so we catch this as early as
                 * possible by detecting the hash marker. In
                 * this case, we simply set dvar to NULL and
                 * break; the conditional after the loop will
                 * send us back to top.
                 */
                dvar = NULL;
                break;
            }

            goto next;
        }

        if (dtuple->dtt_nkeys != nkeys)
            goto next;

        for (i = 0; i < nkeys; i++, dkey++) {
            if (dkey->dttk_size != key[i].dttk_size)
                goto next; /* size or type mismatch */

            if (dkey->dttk_size != 0) {
                if (dtrace_bcmp(
                    (void *)(uintptr_t)key[i].dttk_value,
                    (void *)(uintptr_t)dkey->dttk_value,
                    dkey->dttk_size))
                    goto next;
            } else {
                if (dkey->dttk_value != key[i].dttk_value)
                    goto next;
            }
        }

        if (op != DTRACE_DYNVAR_DEALLOC)
            return (dvar);

        ASSERT(dvar->dtdv_next == NULL ||
            dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);

        if (prev != NULL) {
            ASSERT(hash[bucket].dtdh_chain != dvar);
            ASSERT(start != dvar);
            ASSERT(prev->dtdv_next == dvar);
            prev->dtdv_next = dvar->dtdv_next;
        } else {
            if (dtrace_casptr(&hash[bucket].dtdh_chain,
                start, dvar->dtdv_next) != start) {
                /*
                 * We have failed to atomically swing the
                 * hash table head pointer, presumably because
                 * of a conflicting allocation on another CPU.
                 * We need to reread the hash chain and try
                 * again.
                 */
                goto top;
            }
        }

        dtrace_membar_producer();

        /*
         * Now set the hash value to indicate that it's free.
         */
        ASSERT(hash[bucket].dtdh_chain != dvar);
        dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;

        dtrace_membar_producer();

        /*
         * Set the next pointer to point at the dirty list, and
         * atomically swing the dirty pointer to the newly freed dvar.
         */
        do {
            next = dcpu->dtdsc_dirty;
            dvar->dtdv_next = next;
        } while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);

        /*
         * Finally, unlock this hash bucket.
         */
        ASSERT(hash[bucket].dtdh_lock == lock);
        ASSERT(lock & 1);
        hash[bucket].dtdh_lock++;

        return (NULL);
next:
        prev = dvar;
        continue;
    }

    if (dvar == NULL) {
        /*
         * If dvar is NULL, it is because we went off the rails:
         * one of the elements that we traversed in the hash chain
         * was deleted while we were traversing it. In this case,
         * we assert that we aren't doing a dealloc (deallocs lock
         * the hash bucket to prevent themselves from racing with
         * one another), and retry the hash chain traversal.
         */
        ASSERT(op != DTRACE_DYNVAR_DEALLOC);
        goto top;
    }

    if (op != DTRACE_DYNVAR_ALLOC) {
        /*
         * If we are not to allocate a new variable, we want to
         * return NULL now. Before we return, check that the value
         * of the lock word hasn't changed. If it has, we may have
         * seen an inconsistent snapshot.
         */
        if (op == DTRACE_DYNVAR_NOALLOC) {
            if (hash[bucket].dtdh_lock != lock)
                goto top;
        } else {
            ASSERT(op == DTRACE_DYNVAR_DEALLOC);
            ASSERT(hash[bucket].dtdh_lock == lock);
            ASSERT(lock & 1);
            hash[bucket].dtdh_lock++;
        }

        return (NULL);
    }

    /*
     * We need to allocate a new dynamic variable. The size we need is the
     * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
     * size of any auxiliary key data (rounded up to 8-byte alignment) plus
     * the size of any referred-to data (dsize). We then round the final
     * size up to the chunksize for allocation.
     */
    for (ksize = 0, i = 0; i < nkeys; i++)
        ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));

    /*
     * This should be pretty much impossible, but could happen if, say,
     * strange DIF specified the tuple. Ideally, this should be an
     * assertion and not an error condition -- but that requires that the
     * chunksize calculation in dtrace_difo_chunksize() be absolutely
     * bullet-proof. (That is, it must not be able to be fooled by
     * malicious DIF.) Given the lack of backwards branches in DIF,
     * solving this would presumably not amount to solving the Halting
     * Problem -- but it still seems awfully hard.
     */
    if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
        ksize + dsize > chunksize) {
        dcpu->dtdsc_drops++;
        return (NULL);
    }
1753
1754 nstate = DTRACE_DSTATE_EMPTY;
1755
1756 do {
1757retry:
1758 free = dcpu->dtdsc_free;
1759
1760 if (free == NULL) {
1761 dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
1762 void *rval;
1763
1764 if (clean == NULL) {
1765 /*
1766 * We're out of dynamic variable space on
1767 * this CPU. Unless we have tried all CPUs,
1768 * we'll try to allocate from a different
1769 * CPU.
1770 */
1771 switch (dstate->dtds_state) {
1772 case DTRACE_DSTATE_CLEAN: {
1773 void *sp = &dstate->dtds_state;
1774
1775 if (++cpu >= NCPU)
1776 cpu = 0;
1777
1778 if (dcpu->dtdsc_dirty != NULL &&
1779 nstate == DTRACE_DSTATE_EMPTY)
1780 nstate = DTRACE_DSTATE_DIRTY;
1781
1782 if (dcpu->dtdsc_rinsing != NULL)
1783 nstate = DTRACE_DSTATE_RINSING;
1784
1785 dcpu = &dstate->dtds_percpu[cpu];
1786
1787 if (cpu != me)
1788 goto retry;
1789
1790 (void) dtrace_cas32(sp,
1791 DTRACE_DSTATE_CLEAN, nstate);
1792
1793 /*
1794 * To increment the correct bean
1795 * counter, take another lap.
1796 */
1797 goto retry;
1798 }
1799
1800 case DTRACE_DSTATE_DIRTY:
1801 dcpu->dtdsc_dirty_drops++;
1802 break;
1803
1804 case DTRACE_DSTATE_RINSING:
1805 dcpu->dtdsc_rinsing_drops++;
1806 break;
1807
1808 case DTRACE_DSTATE_EMPTY:
1809 dcpu->dtdsc_drops++;
1810 break;
1811 }
1812
1813 DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
1814 return (NULL);
1815 }
1816
1817 /*
1818 * The clean list appears to be non-empty. We want to
1819 * move the clean list to the free list; we start by
1820 * moving the clean pointer aside.
1821 */
1822 if (dtrace_casptr(&dcpu->dtdsc_clean,
1823 clean, NULL) != clean) {
1824 /*
1825 * We are in one of two situations:
1826 *
1827 * (a) The clean list was switched to the
1828 * free list by another CPU.
1829 *
1830 * (b) The clean list was added to by the
1831 * cleansing cyclic.
1832 *
1833 * In either of these situations, we can
1834 * just reattempt the free list allocation.
1835 */
1836 goto retry;
1837 }
1838
1839 ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
1840
1841 /*
1842 * Now we'll move the clean list to our free list.
1843 * It's impossible for this to fail: the only way
1844 * the free list can be updated is through this
1845 * code path, and only one CPU can own the clean list.
1846 * Thus, it would only be possible for this to fail if
1847 * this code were racing with dtrace_dynvar_clean().
1848 * (That is, if dtrace_dynvar_clean() updated the clean
1849 * list, and we ended up racing to update the free
1850 * list.) This race is prevented by the dtrace_sync()
1851 * in dtrace_dynvar_clean() -- which flushes the
1852 * owners of the clean lists out before resetting
1853 * the clean lists.
1854 */
1855 dcpu = &dstate->dtds_percpu[me];
1856 rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
1857 ASSERT(rval == NULL);
1858 goto retry;
1859 }
1860
1861 dvar = free;
1862 new_free = dvar->dtdv_next;
1863 } while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
1864
1865 /*
1866 * We have now allocated a new chunk. We copy the tuple keys into the
1867 * tuple array and copy any referenced key data into the data space
1868 * following the tuple array. As we do this, we relocate dttk_value
1869 * in the final tuple to point to the key data address in the chunk.
1870 */
1871 kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
1872 dvar->dtdv_data = (void *)(kdata + ksize);
1873 dvar->dtdv_tuple.dtt_nkeys = nkeys;
1874
1875 for (i = 0; i < nkeys; i++) {
1876 dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
1877 size_t kesize = key[i].dttk_size;
1878
1879 if (kesize != 0) {
1880 dtrace_bcopy(
1881 (const void *)(uintptr_t)key[i].dttk_value,
1882 (void *)kdata, kesize);
1883 dkey->dttk_value = kdata;
1884 kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
1885 } else {
1886 dkey->dttk_value = key[i].dttk_value;
1887 }
1888
1889 dkey->dttk_size = kesize;
1890 }
1891
1892 ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
1893 dvar->dtdv_hashval = hashval;
1894 dvar->dtdv_next = start;
1895
1896 if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
1897 return (dvar);
1898
1899 /*
1900 * The cas has failed. Either another CPU is adding an element to
1901 * this hash chain, or another CPU is deleting an element from this
1902 * hash chain. The simplest way to deal with both of these cases
1903 * (though not necessarily the most efficient) is to free our
1904 * allocated block and tail-call ourselves. Note that the free is
1905 * to the dirty list and _not_ to the free list. This is to prevent
1906 * races with allocators, above.
1907 */
1908 dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
1909
1910 dtrace_membar_producer();
1911
1912 do {
1913 free = dcpu->dtdsc_dirty;
1914 dvar->dtdv_next = free;
1915 } while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
1916
1917 return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate));
1918}
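
/*
 * Illustrative aside (not part of the original source): the chunk-size
 * arithmetic above, in a standalone form. P2ROUNDUP is reproduced as it
 * is commonly defined in <sys/sysmacros.h>; the helper name and stand-in
 * parameters are hypothetical.
 */
#include <stddef.h>
#include <stdint.h>

#define	P2ROUNDUP(x, align)	(-(-(x) & -(align)))	/* align: power of two */

static size_t
dynvar_chunk_needed(size_t hdrsize, size_t keyslot, unsigned nkeys,
    const size_t *keylens, size_t dsize)
{
	size_t ksize = 0;
	unsigned i;

	/* Round each key's auxiliary data up to 8-byte alignment. */
	for (i = 0; i < nkeys; i++)
		ksize += P2ROUNDUP(keylens[i], sizeof (uint64_t));

	/*
	 * The dynvar header already embeds one key slot, hence the
	 * (nkeys - 1); the caller compares the result to the chunksize
	 * and drops the allocation if it does not fit.
	 */
	return (hdrsize + keyslot * (nkeys - 1) + ksize + dsize);
}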
1919
1920/*ARGSUSED*/
1921static void
1922dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
1923{
1924 if ((int64_t)nval < (int64_t)*oval)
1925 *oval = nval;
1926}
1927
1928/*ARGSUSED*/
1929static void
1930dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
1931{
1932 if ((int64_t)nval > (int64_t)*oval)
1933 *oval = nval;
1934}
1935
1936static void
1937dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
1938{
1939 int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
1940 int64_t val = (int64_t)nval;
1941
1942 if (val < 0) {
1943 for (i = 0; i < zero; i++) {
1944 if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
1945 quanta[i] += incr;
1946 return;
1947 }
1948 }
1949 } else {
1950 for (i = zero + 1; i < VBDTCAST(int)DTRACE_QUANTIZE_NBUCKETS; i++) {
1951 if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
1952 quanta[i - 1] += incr;
1953 return;
1954 }
1955 }
1956
1957 quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
1958 return;
1959 }
1960
1961#ifndef VBOX
1962 ASSERT(0);
1963#else
1964 AssertFatalFailed();
1965#endif
1966}
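
/*
 * Illustrative aside: a self-contained model of the power-of-two
 * bucketing above. Buckets are bounded by ..., -4, -2, -1, 0, 1, 2, 4, ...
 * with the zero bucket in the middle of the quanta array. This sketches
 * the idea behind DTRACE_QUANTIZE_BUCKETVAL(), not the macro itself;
 * quantize_bucket() is a hypothetical name.
 */
#include <stdint.h>

static int
quantize_bucket(int64_t val, int zero)	/* zero: index of the 0 bucket */
{
	int i;

	if (val < 0) {
		/* Walk the negative boundaries from most negative upward. */
		for (i = 0; i < zero; i++) {
			if (val <= -(1LL << (zero - 1 - i)))
				return (i);
		}
	}

	/* Zero and positive values: find the first boundary above val. */
	for (i = 0; i < zero; i++) {
		if (val < (1LL << i))
			return (zero + i);
	}

	return (2 * zero);	/* the top bucket catches everything else */
}

/* E.g. with zero == 63: quantize_bucket(5, 63) == 66, i.e. the [4,8) bucket. */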
1967
1968static void
1969dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
1970{
1971 uint64_t arg = *lquanta++;
1972 int32_t base = DTRACE_LQUANTIZE_BASE(arg);
1973 uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
1974 uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
1975 int32_t val = (int32_t)nval, level;
1976
1977 ASSERT(step != 0);
1978 ASSERT(levels != 0);
1979
1980 if (val < base) {
1981 /*
1982 * This is an underflow.
1983 */
1984 lquanta[0] += incr;
1985 return;
1986 }
1987
1988 level = (val - base) / step;
1989
1990 if (level < levels) {
1991 lquanta[level + 1] += incr;
1992 return;
1993 }
1994
1995 /*
1996 * This is an overflow.
1997 */
1998 lquanta[levels + 1] += incr;
1999}
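
/*
 * Illustrative aside: the lquantize() bucket math above with concrete
 * numbers. With base = 0, step = 10 and levels = 10, the value 37 lands
 * in level (37 - 0) / 10 = 3 and is counted in lquanta[4]; -1 underflows
 * into lquanta[0] and 100 overflows into lquanta[11]. The helper name is
 * hypothetical.
 */
#include <stdint.h>

static int
lquantize_bucket(int32_t val, int32_t base, uint16_t step, uint16_t levels)
{
	int32_t level;

	if (val < base)
		return (0);		/* underflow bucket */

	level = (val - base) / step;

	if (level < levels)
		return (level + 1);	/* one of the linear buckets */

	return (levels + 1);		/* overflow bucket */
}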
2000
2001/*ARGSUSED*/
2002static void
2003dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
2004{
2005 data[0]++;
2006 data[1] += nval;
2007}
2008
2009/*ARGSUSED*/
2010static void
2011dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
2012{
2013 int64_t snval = (int64_t)nval;
2014 uint64_t tmp[2];
2015
2016 data[0]++;
2017 data[1] += nval;
2018
2019 /*
2020 * What we want to say here is:
2021 *
2022 * data[2] += nval * nval;
2023 *
2024 * But given that nval is 64-bit, we could easily overflow, so
2025 * we do this as 128-bit arithmetic.
2026 */
2027 if (snval < 0)
2028 snval = -snval;
2029
2030 dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp);
2031 dtrace_add_128(data + 2, tmp, data + 2);
2032}
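
/*
 * Illustrative aside: the same 128-bit accumulation expressed with the
 * GCC/Clang unsigned __int128 extension instead of dtrace_multiply_128()
 * and dtrace_add_128(). This assumes that the low 64 bits of the running
 * sum of squares live at data[2] and the high 64 bits at data[3].
 */
#include <stdint.h>

static void
stddev_accumulate(uint64_t *data, uint64_t nval)
{
	int64_t snval = (int64_t)nval;
	unsigned __int128 sum;

	data[0]++;		/* count */
	data[1] += nval;	/* sum */

	if (snval < 0)
		snval = -snval;	/* square via |nval| to avoid sign issues */

	sum = ((unsigned __int128)data[3] << 64) | data[2];
	sum += (unsigned __int128)(uint64_t)snval * (uint64_t)snval;

	data[2] = (uint64_t)sum;
	data[3] = (uint64_t)(sum >> 64);
}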
2033
2034/*ARGSUSED*/
2035static void
2036dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
2037{
2038 *oval = *oval + 1;
2039}
2040
2041/*ARGSUSED*/
2042static void
2043dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
2044{
2045 *oval += nval;
2046}
2047
2048/*
2049 * Aggregate given the tuple in the principal data buffer, and the aggregating
2050 * action denoted by the specified dtrace_aggregation_t. The aggregation
2051 * buffer is specified as the buf parameter. This routine does not return
2052 * failure; if there is no space in the aggregation buffer, the data will be
2053 * dropped, and a corresponding counter incremented.
2054 */
2055static void
2056dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
2057 intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
2058{
2059 dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
2060 uint32_t i, ndx, size, fsize;
2061 uint32_t align = sizeof (uint64_t) - 1;
2062 dtrace_aggbuffer_t *agb;
2063 dtrace_aggkey_t *key;
2064 uint32_t hashval = 0, limit, isstr;
2065 caddr_t tomax, data, kdata;
2066 dtrace_actkind_t action;
2067 dtrace_action_t *act;
2068 uintptr_t offs;
2069
2070 if (buf == NULL)
2071 return;
2072
2073 if (!agg->dtag_hasarg) {
2074 /*
2075 * Currently, only quantize() and lquantize() take additional
2076 * arguments, and they have the same semantics: an increment
2077 * value that defaults to 1 when not present. If additional
2078 * aggregating actions take arguments, the setting of the
2079 * default argument value will presumably have to become more
2080 * sophisticated...
2081 */
2082 arg = 1;
2083 }
2084
2085 action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
2086 size = rec->dtrd_offset - agg->dtag_base;
2087 fsize = size + rec->dtrd_size;
2088
2089 ASSERT(dbuf->dtb_tomax != NULL);
2090 data = dbuf->dtb_tomax + offset + agg->dtag_base;
2091
2092 if ((tomax = buf->dtb_tomax) == NULL) {
2093 dtrace_buffer_drop(buf);
2094 return;
2095 }
2096
2097 /*
2098 * The metastructure is always at the bottom of the buffer.
2099 */
2100 agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
2101 sizeof (dtrace_aggbuffer_t));
2102
2103 if (buf->dtb_offset == 0) {
2104 /*
2105 * We just kludge up approximately 1/8th of the size to be
2106 * buckets. If this guess ends up being routinely
2107 * off-the-mark, we may need to dynamically readjust this
2108 * based on past performance.
2109 */
2110 uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
2111
2112 if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
2113 (uintptr_t)tomax || hashsize == 0) {
2114 /*
2115 * We've been given a ludicrously small buffer;
2116 * increment our drop count and leave.
2117 */
2118 dtrace_buffer_drop(buf);
2119 return;
2120 }
2121
2122 /*
2123	 * And now, a pathetic attempt to get an odd (or
2124 * perchance, a prime) hash size for better hash distribution.
2125 */
2126 if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
2127 hashsize -= DTRACE_AGGHASHSIZE_SLEW;
2128
2129 agb->dtagb_hashsize = hashsize;
2130 agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
2131 agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
2132 agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
2133
2134 for (i = 0; i < agb->dtagb_hashsize; i++)
2135 agb->dtagb_hash[i] = NULL;
2136 }
2137
2138 ASSERT(agg->dtag_first != NULL);
2139 ASSERT(agg->dtag_first->dta_intuple);
2140
2141 /*
2142 * Calculate the hash value based on the key. Note that we _don't_
2143 * include the aggid in the hashing (but we will store it as part of
2144 * the key). The hashing algorithm is Bob Jenkins' "One-at-a-time"
2145 * algorithm: a simple, quick algorithm that has no known funnels, and
2146 * gets good distribution in practice. The efficacy of the hashing
2147 * algorithm (and a comparison with other algorithms) may be found by
2148 * running the ::dtrace_aggstat MDB dcmd.
2149 */
2150 for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2151 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2152 limit = i + act->dta_rec.dtrd_size;
2153 ASSERT(limit <= size);
2154 isstr = DTRACEACT_ISSTRING(act);
2155
2156 for (; i < limit; i++) {
2157 hashval += data[i];
2158 hashval += (hashval << 10);
2159 hashval ^= (hashval >> 6);
2160
2161 if (isstr && data[i] == '\0')
2162 break;
2163 }
2164 }
2165
2166 hashval += (hashval << 3);
2167 hashval ^= (hashval >> 11);
2168 hashval += (hashval << 15);
2169
2170 /*
2171 * Yes, the divide here is expensive -- but it's generally the least
2172 * of the performance issues given the amount of data that we iterate
2173 * over to compute hash values, compare data, etc.
2174 */
2175 ndx = hashval % agb->dtagb_hashsize;
2176
2177 for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
2178 ASSERT((caddr_t)key >= tomax);
2179 ASSERT((caddr_t)key < tomax + buf->dtb_size);
2180
2181 if (hashval != key->dtak_hashval || key->dtak_size != size)
2182 continue;
2183
2184 kdata = key->dtak_data;
2185 ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
2186
2187 for (act = agg->dtag_first; act->dta_intuple;
2188 act = act->dta_next) {
2189 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2190 limit = i + act->dta_rec.dtrd_size;
2191 ASSERT(limit <= size);
2192 isstr = DTRACEACT_ISSTRING(act);
2193
2194 for (; i < limit; i++) {
2195 if (kdata[i] != data[i])
2196 goto next;
2197
2198 if (isstr && data[i] == '\0')
2199 break;
2200 }
2201 }
2202
2203 if (action != key->dtak_action) {
2204 /*
2205 * We are aggregating on the same value in the same
2206 * aggregation with two different aggregating actions.
2207 * (This should have been picked up in the compiler,
2208 * so we may be dealing with errant or devious DIF.)
2209 * This is an error condition; we indicate as much,
2210 * and return.
2211 */
2212 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
2213 return;
2214 }
2215
2216 /*
2217 * This is a hit: we need to apply the aggregator to
2218 * the value at this key.
2219 */
2220 agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
2221 return;
2222next:
2223 continue;
2224 }
2225
2226 /*
2227 * We didn't find it. We need to allocate some zero-filled space,
2228 * link it into the hash table appropriately, and apply the aggregator
2229 * to the (zero-filled) value.
2230 */
2231 offs = buf->dtb_offset;
2232 while (offs & (align - 1))
2233 offs += sizeof (uint32_t);
2234
2235 /*
2236 * If we don't have enough room to both allocate a new key _and_
2237 * its associated data, increment the drop count and return.
2238 */
2239 if ((uintptr_t)tomax + offs + fsize >
2240 agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
2241 dtrace_buffer_drop(buf);
2242 return;
2243 }
2244
2245 /*CONSTCOND*/
2246 ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
2247 key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
2248 agb->dtagb_free -= sizeof (dtrace_aggkey_t);
2249
2250 key->dtak_data = kdata = tomax + offs;
2251 buf->dtb_offset = offs + fsize;
2252
2253 /*
2254 * Now copy the data across.
2255 */
2256 *((dtrace_aggid_t *)kdata) = agg->dtag_id;
2257
2258 for (i = sizeof (dtrace_aggid_t); i < size; i++)
2259 kdata[i] = data[i];
2260
2261 /*
2262 * Because strings are not zeroed out by default, we need to iterate
2263 * looking for actions that store strings, and we need to explicitly
2264 * pad these strings out with zeroes.
2265 */
2266 for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2267 int nul;
2268
2269 if (!DTRACEACT_ISSTRING(act))
2270 continue;
2271
2272 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2273 limit = i + act->dta_rec.dtrd_size;
2274 ASSERT(limit <= size);
2275
2276 for (nul = 0; i < limit; i++) {
2277 if (nul) {
2278 kdata[i] = '\0';
2279 continue;
2280 }
2281
2282 if (data[i] != '\0')
2283 continue;
2284
2285 nul = 1;
2286 }
2287 }
2288
2289 for (i = size; i < fsize; i++)
2290 kdata[i] = 0;
2291
2292 key->dtak_hashval = hashval;
2293 key->dtak_size = size;
2294 key->dtak_action = action;
2295 key->dtak_next = agb->dtagb_hash[ndx];
2296 agb->dtagb_hash[ndx] = key;
2297
2298 /*
2299 * Finally, apply the aggregator.
2300 */
2301 *((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
2302 agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
2303}
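
/*
 * Illustrative aside: the Bob Jenkins "one-at-a-time" hash used on the
 * tuple data above, as a standalone routine over a plain byte buffer
 * (the mixing and finalization steps are identical to the inline code).
 */
#include <stddef.h>
#include <stdint.h>

static uint32_t
jenkins_one_at_a_time(const uint8_t *key, size_t len)
{
	uint32_t hash = 0;
	size_t i;

	for (i = 0; i < len; i++) {
		hash += key[i];
		hash += (hash << 10);
		hash ^= (hash >> 6);
	}

	hash += (hash << 3);
	hash ^= (hash >> 11);
	hash += (hash << 15);

	return (hash);
}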
2304
2305/*
2306 * Given consumer state, this routine finds a speculation in the INACTIVE
2307 * state and transitions it into the ACTIVE state. If there is no speculation
2308 * in the INACTIVE state, 0 is returned. In this case, no error counter is
2309 * incremented -- it is up to the caller to take appropriate action.
2310 */
2311static int
2312dtrace_speculation(dtrace_state_t *state)
2313{
2314 int i = 0;
2315 dtrace_speculation_state_t current;
2316 uint32_t *stat = &state->dts_speculations_unavail, count;
2317
2318 while (i < state->dts_nspeculations) {
2319 dtrace_speculation_t *spec = &state->dts_speculations[i];
2320
2321 current = spec->dtsp_state;
2322
2323 if (current != DTRACESPEC_INACTIVE) {
2324 if (current == DTRACESPEC_COMMITTINGMANY ||
2325 current == DTRACESPEC_COMMITTING ||
2326 current == DTRACESPEC_DISCARDING)
2327 stat = &state->dts_speculations_busy;
2328 i++;
2329 continue;
2330 }
2331
2332 if ( (dtrace_speculation_state_t)dtrace_cas32((uint32_t *)&spec->dtsp_state, current, DTRACESPEC_ACTIVE)
2333 == current)
2334 return (i + 1);
2335 }
2336
2337 /*
2338 * We couldn't find a speculation. If we found as much as a single
2339	 * busy speculation buffer, we'll classify this failure as "busy"
2340 * instead of "unavail".
2341 */
2342 do {
2343 count = *stat;
2344 } while (dtrace_cas32(stat, count, count + 1) != count);
2345
2346 return (0);
2347}
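
/*
 * Illustrative aside: the lock-free counter bump that closes
 * dtrace_speculation() above, restated with C11 atomics instead of
 * dtrace_cas32(). The reread-and-retry loop is the same idiom used for
 * the speculation state transitions throughout this file.
 */
#include <stdatomic.h>
#include <stdint.h>

static void
cas_increment(_Atomic uint32_t *stat)
{
	uint32_t count;

	do {
		count = atomic_load(stat);
		/* Retry if another CPU changed *stat since the load. */
	} while (!atomic_compare_exchange_weak(stat, &count, count + 1));
}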
2348
2349/*
2350 * This routine commits an active speculation. If the specified speculation
2351 * is not in a valid state to perform a commit(), this routine will silently do
2352 * nothing. The state of the specified speculation is transitioned according
2353	 * to the state transition diagram outlined in <sys/dtrace_impl.h>.
2354 */
2355static void
2356dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
2357 dtrace_specid_t which)
2358{
2359 dtrace_speculation_t *spec;
2360 dtrace_buffer_t *src, *dest;
2361 uintptr_t daddr, saddr, dlimit;
2362 dtrace_speculation_state_t current, new VBDTUNASS(-1);
2363 intptr_t offs;
2364
2365 if (which == 0)
2366 return;
2367
2368 if (which > VBDTCAST(unsigned)state->dts_nspeculations) {
2369 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2370 return;
2371 }
2372
2373 spec = &state->dts_speculations[which - 1];
2374 src = &spec->dtsp_buffer[cpu];
2375 dest = &state->dts_buffer[cpu];
2376
2377 do {
2378 current = spec->dtsp_state;
2379
2380 if (current == DTRACESPEC_COMMITTINGMANY)
2381 break;
2382
2383 switch (current) {
2384 case DTRACESPEC_INACTIVE:
2385 case DTRACESPEC_DISCARDING:
2386 return;
2387
2388 case DTRACESPEC_COMMITTING:
2389 /*
2390 * This is only possible if we are (a) commit()'ing
2391 * without having done a prior speculate() on this CPU
2392 * and (b) racing with another commit() on a different
2393 * CPU. There's nothing to do -- we just assert that
2394 * our offset is 0.
2395 */
2396 ASSERT(src->dtb_offset == 0);
2397 return;
2398
2399 case DTRACESPEC_ACTIVE:
2400 new = DTRACESPEC_COMMITTING;
2401 break;
2402
2403 case DTRACESPEC_ACTIVEONE:
2404 /*
2405 * This speculation is active on one CPU. If our
2406 * buffer offset is non-zero, we know that the one CPU
2407 * must be us. Otherwise, we are committing on a
2408 * different CPU from the speculate(), and we must
2409 * rely on being asynchronously cleaned.
2410 */
2411 if (src->dtb_offset != 0) {
2412 new = DTRACESPEC_COMMITTING;
2413 break;
2414 }
2415 /*FALLTHROUGH*/
2416
2417 case DTRACESPEC_ACTIVEMANY:
2418 new = DTRACESPEC_COMMITTINGMANY;
2419 break;
2420
2421 default:
2422#ifndef VBOX
2423 ASSERT(0);
2424#else
2425 AssertFatalMsgFailed(("%d\n", current));
2426#endif
2427 }
2428 } while ((dtrace_speculation_state_t)dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new) != current);
2429
2430 /*
2431 * We have set the state to indicate that we are committing this
2432 * speculation. Now reserve the necessary space in the destination
2433 * buffer.
2434 */
2435 if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
2436 sizeof (uint64_t), state, NULL)) < 0) {
2437 dtrace_buffer_drop(dest);
2438 goto out;
2439 }
2440
2441 /*
2442 * We have the space; copy the buffer across. (Note that this is a
2443	 * highly suboptimal bcopy(); in the unlikely event that this becomes
2444 * a serious performance issue, a high-performance DTrace-specific
2445 * bcopy() should obviously be invented.)
2446 */
2447 daddr = (uintptr_t)dest->dtb_tomax + offs;
2448 dlimit = daddr + src->dtb_offset;
2449 saddr = (uintptr_t)src->dtb_tomax;
2450
2451 /*
2452 * First, the aligned portion.
2453 */
2454 while (dlimit - daddr >= sizeof (uint64_t)) {
2455 *((uint64_t *)daddr) = *((uint64_t *)saddr);
2456
2457 daddr += sizeof (uint64_t);
2458 saddr += sizeof (uint64_t);
2459 }
2460
2461 /*
2462 * Now any left-over bit...
2463 */
2464 while (dlimit - daddr)
2465 *((uint8_t *)daddr++) = *((uint8_t *)saddr++);
2466
2467 /*
2468 * Finally, commit the reserved space in the destination buffer.
2469 */
2470 dest->dtb_offset = offs + src->dtb_offset;
2471
2472out:
2473 /*
2474 * If we're lucky enough to be the only active CPU on this speculation
2475 * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
2476 */
2477 if (current == DTRACESPEC_ACTIVE ||
2478 (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
2479 uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
2480 DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
2481
2482 ASSERT(rval == DTRACESPEC_COMMITTING);
2483 }
2484
2485 src->dtb_offset = 0;
2486 src->dtb_xamot_drops += src->dtb_drops;
2487 src->dtb_drops = 0;
2488}
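
/*
 * Illustrative aside: the copy loop above in miniature -- move the bulk
 * of the data a uint64_t at a time and finish any remainder bytewise.
 * Like the code above, this assumes the destination region is suitably
 * aligned; the helper name is hypothetical.
 */
#include <stddef.h>
#include <stdint.h>

static void
copy_aligned_then_tail(void *dst, const void *src, size_t len)
{
	uintptr_t daddr = (uintptr_t)dst;
	uintptr_t saddr = (uintptr_t)src;
	uintptr_t dlimit = daddr + len;

	/* First, the aligned portion, eight bytes at a time. */
	while (dlimit - daddr >= sizeof (uint64_t)) {
		*(uint64_t *)daddr = *(const uint64_t *)saddr;
		daddr += sizeof (uint64_t);
		saddr += sizeof (uint64_t);
	}

	/* Now any left-over bytes. */
	while (dlimit - daddr)
		*(uint8_t *)daddr++ = *(const uint8_t *)saddr++;
}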
2489
2490/*
2491 * This routine discards an active speculation. If the specified speculation
2492 * is not in a valid state to perform a discard(), this routine will silently
2493 * do nothing. The state of the specified speculation is transitioned
2494	 * according to the state transition diagram outlined in <sys/dtrace_impl.h>.
2495 */
2496static void
2497dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
2498 dtrace_specid_t which)
2499{
2500 dtrace_speculation_t *spec;
2501 dtrace_speculation_state_t current, new;
2502 dtrace_buffer_t *buf;
2503
2504 if (which == 0)
2505 return;
2506
2507 if (which > VBDTCAST(unsigned)state->dts_nspeculations) {
2508 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2509 return;
2510 }
2511
2512 spec = &state->dts_speculations[which - 1];
2513 buf = &spec->dtsp_buffer[cpu];
2514
2515 do {
2516 current = spec->dtsp_state;
2517
2518 switch (current) {
2519 case DTRACESPEC_INACTIVE:
2520 case DTRACESPEC_COMMITTINGMANY:
2521 case DTRACESPEC_COMMITTING:
2522 case DTRACESPEC_DISCARDING:
2523 return;
2524
2525 case DTRACESPEC_ACTIVE:
2526 case DTRACESPEC_ACTIVEMANY:
2527 new = DTRACESPEC_DISCARDING;
2528 break;
2529
2530 case DTRACESPEC_ACTIVEONE:
2531 if (buf->dtb_offset != 0) {
2532 new = DTRACESPEC_INACTIVE;
2533 } else {
2534 new = DTRACESPEC_DISCARDING;
2535 }
2536 break;
2537
2538 default:
2539#ifndef VBOX
2540 ASSERT(0);
2541#else
2542 AssertFatalMsgFailed(("%d\n", current));
2543#endif
2544 }
2545 } while ((dtrace_speculation_state_t)dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new) != current);
2546
2547 buf->dtb_offset = 0;
2548 buf->dtb_drops = 0;
2549}
2550
2551/*
2552 * Note: not called from probe context. This function is called
2553 * asynchronously from cross call context to clean any speculations that are
2554 * in the COMMITTINGMANY or DISCARDING states. These speculations may not be
2555 * transitioned back to the INACTIVE state until all CPUs have cleaned the
2556 * speculation.
2557 */
2558static void
2559dtrace_speculation_clean_here(dtrace_state_t *state)
2560{
2561 dtrace_icookie_t cookie;
2562 processorid_t cpu = VBDT_GET_CPUID();
2563 dtrace_buffer_t *dest = &state->dts_buffer[cpu];
2564 dtrace_specid_t i;
2565
2566 cookie = dtrace_interrupt_disable();
2567
2568 if (dest->dtb_tomax == NULL) {
2569 dtrace_interrupt_enable(cookie);
2570 return;
2571 }
2572
2573 for (i = 0; i < VBDTCAST(unsigned)state->dts_nspeculations; i++) {
2574 dtrace_speculation_t *spec = &state->dts_speculations[i];
2575 dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
2576
2577 if (src->dtb_tomax == NULL)
2578 continue;
2579
2580 if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
2581 src->dtb_offset = 0;
2582 continue;
2583 }
2584
2585 if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2586 continue;
2587
2588 if (src->dtb_offset == 0)
2589 continue;
2590
2591 dtrace_speculation_commit(state, cpu, i + 1);
2592 }
2593
2594 dtrace_interrupt_enable(cookie);
2595}
2596
2597#ifdef VBOX
2598	/** Wrapper used with RTMpOnAll() to run dtrace_speculation_clean_here() on each CPU. */
2599static DECLCALLBACK(void) dtrace_speculation_clean_here_wrapper(RTCPUID idCpu, void *pvUser1, void *pvUser2)
2600{
2601 dtrace_speculation_clean_here((dtrace_state_t *)pvUser1);
2602 NOREF(pvUser2); NOREF(idCpu);
2603}
2604#endif
2605
2606/*
2607 * Note: not called from probe context. This function is called
2608 * asynchronously (and at a regular interval) to clean any speculations that
2609 * are in the COMMITTINGMANY or DISCARDING states. If it discovers that there
2610 * is work to be done, it cross calls all CPUs to perform that work;
2611	 * COMMITTINGMANY and DISCARDING speculations may not be transitioned back to the
2612 * INACTIVE state until they have been cleaned by all CPUs.
2613 */
2614static void
2615dtrace_speculation_clean(dtrace_state_t *state)
2616{
2617 int work = 0, rv;
2618 dtrace_specid_t i;
2619
2620 for (i = 0; i < VBDTCAST(unsigned)state->dts_nspeculations; i++) {
2621 dtrace_speculation_t *spec = &state->dts_speculations[i];
2622
2623 ASSERT(!spec->dtsp_cleaning);
2624
2625 if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
2626 spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2627 continue;
2628
2629 work++;
2630 spec->dtsp_cleaning = 1;
2631 }
2632
2633 if (!work)
2634 return;
2635
2636#ifndef VBOX
2637 dtrace_xcall(DTRACE_CPUALL,
2638 (dtrace_xcall_t)dtrace_speculation_clean_here, state);
2639#else
2640 RTMpOnAll(dtrace_speculation_clean_here_wrapper, state, NULL);
2641#endif
2642
2643 /*
2644 * We now know that all CPUs have committed or discarded their
2645 * speculation buffers, as appropriate. We can now set the state
2646 * to inactive.
2647 */
2648 for (i = 0; i < VBDTCAST(unsigned)state->dts_nspeculations; i++) {
2649 dtrace_speculation_t *spec = &state->dts_speculations[i];
2650 dtrace_speculation_state_t current, new;
2651
2652 if (!spec->dtsp_cleaning)
2653 continue;
2654
2655 current = spec->dtsp_state;
2656 ASSERT(current == DTRACESPEC_DISCARDING ||
2657 current == DTRACESPEC_COMMITTINGMANY);
2658
2659 new = DTRACESPEC_INACTIVE;
2660
2661 rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
2662 ASSERT(VBDTCAST(dtrace_speculation_state_t)rv == current);
2663 spec->dtsp_cleaning = 0;
2664 }
2665}
2666
2667/*
2668 * Called as part of a speculate() to get the speculative buffer associated
2669 * with a given speculation. Returns NULL if the specified speculation is not
2670 * in an ACTIVE state. If the speculation is in the ACTIVEONE state -- and
2671 * the active CPU is not the specified CPU -- the speculation will be
2672 * atomically transitioned into the ACTIVEMANY state.
2673 */
2674static dtrace_buffer_t *
2675dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
2676 dtrace_specid_t which)
2677{
2678 dtrace_speculation_t *spec;
2679 dtrace_speculation_state_t current, new VBDTUNASS(-1);
2680 dtrace_buffer_t *buf;
2681
2682 if (which == 0)
2683 return (NULL);
2684
2685 if (which > VBDTCAST(unsigned)state->dts_nspeculations) {
2686 cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2687 return (NULL);
2688 }
2689
2690 spec = &state->dts_speculations[which - 1];
2691 buf = &spec->dtsp_buffer[cpuid];
2692
2693 do {
2694 current = spec->dtsp_state;
2695
2696 switch (current) {
2697 case DTRACESPEC_INACTIVE:
2698 case DTRACESPEC_COMMITTINGMANY:
2699 case DTRACESPEC_DISCARDING:
2700 return (NULL);
2701
2702 case DTRACESPEC_COMMITTING:
2703 ASSERT(buf->dtb_offset == 0);
2704 return (NULL);
2705
2706 case DTRACESPEC_ACTIVEONE:
2707 /*
2708 * This speculation is currently active on one CPU.
2709 * Check the offset in the buffer; if it's non-zero,
2710 * that CPU must be us (and we leave the state alone).
2711 * If it's zero, assume that we're starting on a new
2712 * CPU -- and change the state to indicate that the
2713 * speculation is active on more than one CPU.
2714 */
2715 if (buf->dtb_offset != 0)
2716 return (buf);
2717
2718 new = DTRACESPEC_ACTIVEMANY;
2719 break;
2720
2721 case DTRACESPEC_ACTIVEMANY:
2722 return (buf);
2723
2724 case DTRACESPEC_ACTIVE:
2725 new = DTRACESPEC_ACTIVEONE;
2726 break;
2727
2728 default:
2729#ifndef VBOX
2730 ASSERT(0);
2731#else
2732 AssertFatalMsgFailed(("%d\n", current));
2733#endif
2734 }
2735 } while ((dtrace_speculation_state_t)dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new) != current);
2736
2737 ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
2738 return (buf);
2739}
2740
2741/*
2742 * Return a string. In the event that the user lacks the privilege to access
2743 * arbitrary kernel memory, we copy the string out to scratch memory so that we
2744 * don't fail access checking.
2745 *
2746 * dtrace_dif_variable() uses this routine as a helper for various
2747 * builtin values such as 'execname' and 'probefunc.'
2748 */
2749VBDTSTATIC uintptr_t
2750dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
2751 dtrace_mstate_t *mstate)
2752{
2753 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
2754 uintptr_t ret;
2755 size_t strsz;
2756
2757 /*
2758 * The easy case: this probe is allowed to read all of memory, so
2759 * we can just return this as a vanilla pointer.
2760 */
2761 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
2762 return (addr);
2763
2764 /*
2765 * This is the tougher case: we copy the string in question from
2766 * kernel memory into scratch memory and return it that way: this
2767 * ensures that we won't trip up when access checking tests the
2768 * BYREF return value.
2769 */
2770 strsz = dtrace_strlen((char *)addr, size) + 1;
2771
2772 if (mstate->dtms_scratch_ptr + strsz >
2773 mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
2774 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
2775 return (NULL);
2776 }
2777
2778 dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
2779 strsz);
2780 ret = mstate->dtms_scratch_ptr;
2781 mstate->dtms_scratch_ptr += strsz;
2782 return (ret);
2783}
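
/*
 * Illustrative aside: the scratch-copy pattern above in a standalone
 * form -- bounds-check against the scratch limit, copy, and advance the
 * cursor. The parameter names stand in for the dtms_scratch_* fields of
 * the mstate; the helper name is hypothetical.
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static char *
scratch_strdup(const char *src, size_t strsz, uintptr_t *scratch_ptr,
    uintptr_t scratch_base, size_t scratch_size)
{
	char *ret;

	/* The caller would flag CPU_DTRACE_NOSCRATCH on this failure. */
	if (*scratch_ptr + strsz > scratch_base + scratch_size)
		return (NULL);

	ret = (char *)*scratch_ptr;
	memcpy(ret, src, strsz);	/* strsz includes the terminating NUL */
	*scratch_ptr += strsz;
	return (ret);
}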
2784
2785/*
2786 * This function implements the DIF emulator's variable lookups. The emulator
2787 * passes a reserved variable identifier and optional built-in array index.
2788 */
2789static uint64_t
2790dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
2791 uint64_t ndx)
2792{
2793 /*
2794 * If we're accessing one of the uncached arguments, we'll turn this
2795 * into a reference in the args array.
2796 */
2797 if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
2798 ndx = v - DIF_VAR_ARG0;
2799 v = DIF_VAR_ARGS;
2800 }
2801
2802 switch (v) {
2803 case DIF_VAR_ARGS:
2804 ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
2805 if (ndx >= sizeof (mstate->dtms_arg) /
2806 sizeof (mstate->dtms_arg[0])) {
2807 int aframes = mstate->dtms_probe->dtpr_aframes + 2;
2808 dtrace_provider_t *pv;
2809 uint64_t val;
2810
2811 pv = mstate->dtms_probe->dtpr_provider;
2812 if (pv->dtpv_pops.dtps_getargval != NULL)
2813 val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
2814 mstate->dtms_probe->dtpr_id,
2815 mstate->dtms_probe->dtpr_arg, ndx, aframes);
2816 else
2817 val = dtrace_getarg(ndx, aframes);
2818
2819 /*
2820 * This is regrettably required to keep the compiler
2821 * from tail-optimizing the call to dtrace_getarg().
2822 * The condition always evaluates to true, but the
2823 * compiler has no way of figuring that out a priori.
2824 * (None of this would be necessary if the compiler
2825 * could be relied upon to _always_ tail-optimize
2826 * the call to dtrace_getarg() -- but it can't.)
2827 */
2828 if (mstate->dtms_probe != NULL)
2829 return (val);
2830
2831#ifndef VBOX
2832 ASSERT(0);
2833#else
2834 AssertFatalFailed();
2835#endif
2836 }
2837
2838 return (mstate->dtms_arg[ndx]);
2839
2840 case DIF_VAR_UREGS: {
2841#ifndef VBOX
2842 klwp_t *lwp;
2843
2844 if (!dtrace_priv_proc(state))
2845 return (0);
2846
2847 if ((lwp = curthread->t_lwp) == NULL) {
2848 DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
2849 cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_illval = NULL;
2850 return (0);
2851 }
2852
2853 return (dtrace_getreg(lwp->lwp_regs, ndx));
2854#else
2855 cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2856 return (0);
2857#endif
2858 }
2859
2860 case DIF_VAR_CURTHREAD:
2861 if (!dtrace_priv_kernel(state))
2862 return (0);
2863#ifndef VBOX
2864 return ((uint64_t)(uintptr_t)curthread);
2865#else
2866 return ((uintptr_t)RTThreadNativeSelf());
2867#endif
2868
2869 case DIF_VAR_TIMESTAMP:
2870 if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
2871 mstate->dtms_timestamp = dtrace_gethrtime();
2872 mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
2873 }
2874 return (mstate->dtms_timestamp);
2875
2876 case DIF_VAR_VTIMESTAMP:
2877#ifndef VBOX
2878 ASSERT(dtrace_vtime_references != 0);
2879 return (curthread->t_dtrace_vtime);
2880#else
2881 cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2882 return (0);
2883#endif
2884
2885 case DIF_VAR_WALLTIMESTAMP:
2886 if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
2887 mstate->dtms_walltimestamp = dtrace_gethrestime();
2888 mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
2889 }
2890 return (mstate->dtms_walltimestamp);
2891
2892 case DIF_VAR_IPL:
2893 if (!dtrace_priv_kernel(state))
2894 return (0);
2895 if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
2896 mstate->dtms_ipl = dtrace_getipl();
2897 mstate->dtms_present |= DTRACE_MSTATE_IPL;
2898 }
2899 return (mstate->dtms_ipl);
2900
2901 case DIF_VAR_EPID:
2902 ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
2903 return (mstate->dtms_epid);
2904
2905 case DIF_VAR_ID:
2906 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
2907 return (mstate->dtms_probe->dtpr_id);
2908
2909 case DIF_VAR_STACKDEPTH:
2910 if (!dtrace_priv_kernel(state))
2911 return (0);
2912 if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
2913 int aframes = mstate->dtms_probe->dtpr_aframes + 2;
2914
2915 mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
2916 mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
2917 }
2918 return (mstate->dtms_stackdepth);
2919
2920 case DIF_VAR_USTACKDEPTH:
2921 if (!dtrace_priv_proc(state))
2922 return (0);
2923 if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
2924 /*
2925 * See comment in DIF_VAR_PID.
2926 */
2927 if (DTRACE_ANCHORED(mstate->dtms_probe) &&
2928 CPU_ON_INTR(CPU)) {
2929 mstate->dtms_ustackdepth = 0;
2930 } else {
2931 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
2932 mstate->dtms_ustackdepth =
2933 dtrace_getustackdepth();
2934 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
2935 }
2936 mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
2937 }
2938 return (mstate->dtms_ustackdepth);
2939
2940 case DIF_VAR_CALLER:
2941 if (!dtrace_priv_kernel(state))
2942 return (0);
2943 if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
2944 int aframes = mstate->dtms_probe->dtpr_aframes + 2;
2945
2946 if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
2947 /*
2948 * If this is an unanchored probe, we are
2949 * required to go through the slow path:
2950 * dtrace_caller() only guarantees correct
2951 * results for anchored probes.
2952 */
2953 pc_t caller[2];
2954
2955 dtrace_getpcstack(caller, 2, aframes,
2956 (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
2957 mstate->dtms_caller = caller[1];
2958 } else if ((mstate->dtms_caller =
2959 dtrace_caller(aframes)) == VBDTCAST(uintptr_t)-1) {
2960 /*
2961 * We have failed to do this the quick way;
2962 * we must resort to the slower approach of
2963 * calling dtrace_getpcstack().
2964 */
2965 pc_t caller;
2966
2967 dtrace_getpcstack(&caller, 1, aframes, NULL);
2968 mstate->dtms_caller = caller;
2969 }
2970
2971 mstate->dtms_present |= DTRACE_MSTATE_CALLER;
2972 }
2973 return (mstate->dtms_caller);
2974
2975 case DIF_VAR_UCALLER:
2976 if (!dtrace_priv_proc(state))
2977 return (0);
2978
2979 if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
2980 uint64_t ustack[3];
2981
2982 /*
2983 * dtrace_getupcstack() fills in the first uint64_t
2984 * with the current PID. The second uint64_t will
2985 * be the program counter at user-level. The third
2986 * uint64_t will contain the caller, which is what
2987 * we're after.
2988 */
2989 ustack[2] = NULL;
2990 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
2991 dtrace_getupcstack(ustack, 3);
2992 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
2993 mstate->dtms_ucaller = ustack[2];
2994 mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
2995 }
2996
2997 return (mstate->dtms_ucaller);
2998
2999 case DIF_VAR_PROBEPROV:
3000 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3001 return (dtrace_dif_varstr(
3002 (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
3003 state, mstate));
3004
3005 case DIF_VAR_PROBEMOD:
3006 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3007 return (dtrace_dif_varstr(
3008 (uintptr_t)mstate->dtms_probe->dtpr_mod,
3009 state, mstate));
3010
3011 case DIF_VAR_PROBEFUNC:
3012 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3013 return (dtrace_dif_varstr(
3014 (uintptr_t)mstate->dtms_probe->dtpr_func,
3015 state, mstate));
3016
3017 case DIF_VAR_PROBENAME:
3018 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3019 return (dtrace_dif_varstr(
3020 (uintptr_t)mstate->dtms_probe->dtpr_name,
3021 state, mstate));
3022
3023 case DIF_VAR_PID:
3024 if (!dtrace_priv_proc(state))
3025 return (0);
3026
3027#ifndef VBOX
3028 /*
3029 * Note that we are assuming that an unanchored probe is
3030 * always due to a high-level interrupt. (And we're assuming
3031	 * that there is only a single high-level interrupt.)
3032 */
3033 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3034 return (pid0.pid_id);
3035
3036 /*
3037 * It is always safe to dereference one's own t_procp pointer:
3038 * it always points to a valid, allocated proc structure.
3039 * Further, it is always safe to dereference the p_pidp member
3040	 * of one's own proc structure. (These are truisms because
3041 * threads and processes don't clean up their own state --
3042	 * they leave that task to whoever reaps them.)
3043 */
3044 return ((uint64_t)curthread->t_procp->p_pidp->pid_id);
3045#else
3046 return (RTProcSelf());
3047#endif
3048
3049 case DIF_VAR_PPID:
3050 if (!dtrace_priv_proc(state))
3051 return (0);
3052
3053#ifndef VBOX
3054 /*
3055 * See comment in DIF_VAR_PID.
3056 */
3057 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3058 return (pid0.pid_id);
3059
3060 /*
3061 * It is always safe to dereference one's own t_procp pointer:
3062 * it always points to a valid, allocated proc structure.
3063 * (This is true because threads don't clean up their own
3064	 * state -- they leave that task to whoever reaps them.)
3065 */
3066 return ((uint64_t)curthread->t_procp->p_ppid);
3067#else
3068 cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3069 return (0); /** @todo parent pid? */
3070#endif
3071
3072 case DIF_VAR_TID:
3073#ifndef VBOX
3074 /*
3075 * See comment in DIF_VAR_PID.
3076 */
3077 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3078 return (0);
3079
3080 return ((uint64_t)curthread->t_tid);
3081#else
3082 return (RTThreadNativeSelf()); /** @todo proper tid? */
3083#endif
3084
3085 case DIF_VAR_EXECNAME:
3086 if (!dtrace_priv_proc(state))
3087 return (0);
3088
3089#ifndef VBOX
3090 /*
3091 * See comment in DIF_VAR_PID.
3092 */
3093 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3094 return ((uint64_t)(uintptr_t)p0.p_user.u_comm);
3095
3096 /*
3097 * It is always safe to dereference one's own t_procp pointer:
3098 * it always points to a valid, allocated proc structure.
3099 * (This is true because threads don't clean up their own
3100	 * state -- they leave that task to whoever reaps them.)
3101 */
3102 return (dtrace_dif_varstr(
3103 (uintptr_t)curthread->t_procp->p_user.u_comm,
3104 state, mstate));
3105#else
3106 cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3107 return (0); /** @todo execname */
3108#endif
3109
3110 case DIF_VAR_ZONENAME:
3111 if (!dtrace_priv_proc(state))
3112 return (0);
3113
3114#ifndef VBOX
3115 /*
3116 * See comment in DIF_VAR_PID.
3117 */
3118 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3119 return ((uint64_t)(uintptr_t)p0.p_zone->zone_name);
3120
3121 /*
3122 * It is always safe to dereference one's own t_procp pointer:
3123 * it always points to a valid, allocated proc structure.
3124 * (This is true because threads don't clean up their own
3125	 * state -- they leave that task to whoever reaps them.)
3126 */
3127 return (dtrace_dif_varstr(
3128 (uintptr_t)curthread->t_procp->p_zone->zone_name,
3129 state, mstate));
3130#else
3131 cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3132 return (0);
3133#endif
3134
3135 case DIF_VAR_UID:
3136 if (!dtrace_priv_proc(state))
3137 return (0);
3138
3139#ifndef VBOX
3140 /*
3141 * See comment in DIF_VAR_PID.
3142 */
3143 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3144 return ((uint64_t)p0.p_cred->cr_uid);
3145
3146 /*
3147 * It is always safe to dereference one's own t_procp pointer:
3148 * it always points to a valid, allocated proc structure.
3149 * (This is true because threads don't clean up their own
3150	 * state -- they leave that task to whoever reaps them.)
3151 *
3152 * Additionally, it is safe to dereference one's own process
3153 * credential, since this is never NULL after process birth.
3154 */
3155 return ((uint64_t)curthread->t_procp->p_cred->cr_uid);
3156#else
3157 cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3158 return (0);
3159#endif
3160
3161 case DIF_VAR_GID:
3162 if (!dtrace_priv_proc(state))
3163 return (0);
3164
3165#ifndef VBOX
3166 /*
3167 * See comment in DIF_VAR_PID.
3168 */
3169 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3170 return ((uint64_t)p0.p_cred->cr_gid);
3171
3172 /*
3173 * It is always safe to dereference one's own t_procp pointer:
3174 * it always points to a valid, allocated proc structure.
3175 * (This is true because threads don't clean up their own
3176	 * state -- they leave that task to whoever reaps them.)
3177 *
3178 * Additionally, it is safe to dereference one's own process
3179 * credential, since this is never NULL after process birth.
3180 */
3181 return ((uint64_t)curthread->t_procp->p_cred->cr_gid);
3182#else
3183 cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3184 return (0);
3185#endif
3186
3187 case DIF_VAR_ERRNO: {
3188#ifndef VBOX
3189 klwp_t *lwp;
3190#endif
3191 if (!dtrace_priv_proc(state))
3192 return (0);
3193
3194#ifndef VBOX
3195 /*
3196 * See comment in DIF_VAR_PID.
3197 */
3198 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3199 return (0);
3200
3201 /*
3202 * It is always safe to dereference one's own t_lwp pointer in
3203 * the event that this pointer is non-NULL. (This is true
3204 * because threads and lwps don't clean up their own state --
3205	 * they leave that task to whoever reaps them.)
3206 */
3207 if ((lwp = curthread->t_lwp) == NULL)
3208 return (0);
3209
3210 return ((uint64_t)lwp->lwp_errno);
3211#else
3212 cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3213 return (0);
3214#endif
3215 }
3216 default:
3217 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3218 return (0);
3219 }
3220}
3221
3222/*
3223 * Emulate the execution of DTrace ID subroutines invoked by the call opcode.
3224 * Notice that we don't bother validating the proper number of arguments or
3225	 * their types in the tuple stack. This isn't needed: all argument
3226	 * interpretation is safe because of our load safety -- the worst that can
3227 * happen is that a bogus program can obtain bogus results.
3228 */
3229static void
3230dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
3231 dtrace_key_t *tupregs, int nargs,
3232 dtrace_mstate_t *mstate, dtrace_state_t *state)
3233{
3234 volatile uint16_t *flags = &cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags;
3235 volatile uintptr_t *illval = &cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_illval;
3236 dtrace_vstate_t *vstate = &state->dts_vstate;
3237
3238#ifndef VBOX
3239 union {
3240 mutex_impl_t mi;
3241 uint64_t mx;
3242 } m;
3243
3244 union {
3245 krwlock_t ri;
3246 uintptr_t rw;
3247 } r;
3248#endif
3249
3250 switch (subr) {
3251 case DIF_SUBR_RAND:
3252 regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875;
3253 break;
3254
3255 case DIF_SUBR_MUTEX_OWNED:
3256#ifndef VBOX
3257 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3258 mstate, vstate)) {
3259 regs[rd] = NULL;
3260 break;
3261 }
3262
3263 m.mx = dtrace_load64(tupregs[0].dttk_value);
3264 if (MUTEX_TYPE_ADAPTIVE(&m.mi))
3265 regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
3266 else
3267 regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
3268#else
3269 regs[rd] = 0;
3270 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3271#endif
3272 break;
3273
3274 case DIF_SUBR_MUTEX_OWNER:
3275#ifndef VBOX
3276 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3277 mstate, vstate)) {
3278 regs[rd] = NULL;
3279 break;
3280 }
3281
3282 m.mx = dtrace_load64(tupregs[0].dttk_value);
3283 if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
3284 MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
3285 regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
3286 else
3287 regs[rd] = 0;
3288#else
3289 regs[rd] = 0;
3290 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3291#endif
3292 break;
3293
3294 case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
3295#ifndef VBOX
3296 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3297 mstate, vstate)) {
3298 regs[rd] = NULL;
3299 break;
3300 }
3301
3302 m.mx = dtrace_load64(tupregs[0].dttk_value);
3303 regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
3304#else
3305 regs[rd] = 0;
3306 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3307#endif
3308 break;
3309
3310 case DIF_SUBR_MUTEX_TYPE_SPIN:
3311#ifndef VBOX
3312 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3313 mstate, vstate)) {
3314 regs[rd] = NULL;
3315 break;
3316 }
3317
3318 m.mx = dtrace_load64(tupregs[0].dttk_value);
3319 regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
3320#else
3321 regs[rd] = 0;
3322 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3323#endif
3324 break;
3325
3326 case DIF_SUBR_RW_READ_HELD: {
3327#ifndef VBOX
3328 uintptr_t tmp;
3329
3330 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
3331 mstate, vstate)) {
3332 regs[rd] = NULL;
3333 break;
3334 }
3335
3336 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3337 regs[rd] = _RW_READ_HELD(&r.ri, tmp);
3338#else
3339 regs[rd] = 0;
3340 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3341#endif
3342 break;
3343 }
3344
3345 case DIF_SUBR_RW_WRITE_HELD:
3346#ifndef VBOX
3347 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
3348 mstate, vstate)) {
3349 regs[rd] = NULL;
3350 break;
3351 }
3352
3353 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3354 regs[rd] = _RW_WRITE_HELD(&r.ri);
3355#else
3356 regs[rd] = 0;
3357 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3358#endif
3359 break;
3360
3361 case DIF_SUBR_RW_ISWRITER:
3362#ifndef VBOX
3363 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
3364 mstate, vstate)) {
3365 regs[rd] = NULL;
3366 break;
3367 }
3368
3369 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3370 regs[rd] = _RW_ISWRITER(&r.ri);
3371#else
3372 regs[rd] = 0;
3373 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3374#endif
3375 break;
3376
3377 case DIF_SUBR_BCOPY: {
3378 /*
3379 * We need to be sure that the destination is in the scratch
3380 * region -- no other region is allowed.
3381 */
3382 uintptr_t src = tupregs[0].dttk_value;
3383 uintptr_t dest = tupregs[1].dttk_value;
3384 size_t size = tupregs[2].dttk_value;
3385
3386 if (!dtrace_inscratch(dest, size, mstate)) {
3387 *flags |= CPU_DTRACE_BADADDR;
3388 *illval = regs[rd];
3389 break;
3390 }
3391
3392 if (!dtrace_canload(src, size, mstate, vstate)) {
3393 regs[rd] = NULL;
3394 break;
3395 }
3396
3397 dtrace_bcopy((void *)src, (void *)dest, size);
3398 break;
3399 }
3400
3401 case DIF_SUBR_ALLOCA:
3402 case DIF_SUBR_COPYIN: {
3403 uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
3404 uint64_t size =
3405 tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
3406 size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
3407
3408 /*
3409 * This action doesn't require any credential checks since
3410 * probes will not activate in user contexts to which the
3411 * enabling user does not have permissions.
3412 */
3413
3414 /*
3415 * Rounding up the user allocation size could have overflowed
3416 * a large, bogus allocation (like -1ULL) to 0.
3417 */
3418 if (scratch_size < size ||
3419 !DTRACE_INSCRATCH(mstate, scratch_size)) {
3420 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3421 regs[rd] = NULL;
3422 break;
3423 }
3424
3425 if (subr == DIF_SUBR_COPYIN) {
3426 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3427 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3428 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3429 }
3430
3431 mstate->dtms_scratch_ptr += scratch_size;
3432 regs[rd] = dest;
3433 break;
3434 }
3435
3436 case DIF_SUBR_COPYINTO: {
3437 uint64_t size = tupregs[1].dttk_value;
3438 uintptr_t dest = tupregs[2].dttk_value;
3439
3440 /*
3441 * This action doesn't require any credential checks since
3442 * probes will not activate in user contexts to which the
3443 * enabling user does not have permissions.
3444 */
3445 if (!dtrace_inscratch(dest, size, mstate)) {
3446 *flags |= CPU_DTRACE_BADADDR;
3447 *illval = regs[rd];
3448 break;
3449 }
3450
3451 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3452 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3453 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3454 break;
3455 }
3456
3457 case DIF_SUBR_COPYINSTR: {
3458 uintptr_t dest = mstate->dtms_scratch_ptr;
3459 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3460
3461 if (nargs > 1 && tupregs[1].dttk_value < size)
3462 size = tupregs[1].dttk_value + 1;
3463
3464 /*
3465 * This action doesn't require any credential checks since
3466 * probes will not activate in user contexts to which the
3467 * enabling user does not have permissions.
3468 */
3469 if (!DTRACE_INSCRATCH(mstate, size)) {
3470 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3471 regs[rd] = NULL;
3472 break;
3473 }
3474
3475 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3476 dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
3477 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3478
3479 ((char *)dest)[size - 1] = '\0';
3480 mstate->dtms_scratch_ptr += size;
3481 regs[rd] = dest;
3482 break;
3483 }
3484
3485 case DIF_SUBR_MSGSIZE:
3486 case DIF_SUBR_MSGDSIZE: {
3487#ifndef VBOX
3488 uintptr_t baddr = tupregs[0].dttk_value, daddr;
3489 uintptr_t wptr, rptr;
3490 size_t count = 0;
3491 int cont = 0;
3492
3493 while (baddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
3494
3495 if (!dtrace_canload(baddr, sizeof (mblk_t), mstate,
3496 vstate)) {
3497 regs[rd] = NULL;
3498 break;
3499 }
3500
3501 wptr = dtrace_loadptr(baddr +
3502 offsetof(mblk_t, b_wptr));
3503
3504 rptr = dtrace_loadptr(baddr +
3505 offsetof(mblk_t, b_rptr));
3506
3507 if (wptr < rptr) {
3508 *flags |= CPU_DTRACE_BADADDR;
3509 *illval = tupregs[0].dttk_value;
3510 break;
3511 }
3512
3513 daddr = dtrace_loadptr(baddr +
3514 offsetof(mblk_t, b_datap));
3515
3516 baddr = dtrace_loadptr(baddr +
3517 offsetof(mblk_t, b_cont));
3518
3519 /*
3520	 * We want to guard against denial-of-service here,
3521 * so we're only going to search the list for
3522 * dtrace_msgdsize_max mblks.
3523 */
3524 if (cont++ > dtrace_msgdsize_max) {
3525 *flags |= CPU_DTRACE_ILLOP;
3526 break;
3527 }
3528
3529 if (subr == DIF_SUBR_MSGDSIZE) {
3530 if (dtrace_load8(daddr +
3531 offsetof(dblk_t, db_type)) != M_DATA)
3532 continue;
3533 }
3534
3535 count += wptr - rptr;
3536 }
3537
3538 if (!(*flags & CPU_DTRACE_FAULT))
3539 regs[rd] = count;
3540
3541#else
3542 regs[rd] = 0;
3543 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3544#endif
3545 break;
3546 }
3547
3548 case DIF_SUBR_PROGENYOF: {
3549#ifndef VBOX
3550 pid_t pid = tupregs[0].dttk_value;
3551 proc_t *p;
3552 int rval = 0;
3553
3554 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3555
3556 for (p = curthread->t_procp; p != NULL; p = p->p_parent) {
3557 if (p->p_pidp->pid_id == pid) {
3558 rval = 1;
3559 break;
3560 }
3561 }
3562
3563 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3564
3565 regs[rd] = rval;
3566#else
3567 regs[rd] = 0;
3568 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3569#endif
3570 break;
3571 }
3572
3573 case DIF_SUBR_SPECULATION:
3574 regs[rd] = dtrace_speculation(state);
3575 break;
3576
3577 case DIF_SUBR_COPYOUT: {
3578 uintptr_t kaddr = tupregs[0].dttk_value;
3579 uintptr_t uaddr = tupregs[1].dttk_value;
3580 uint64_t size = tupregs[2].dttk_value;
3581
3582 if (!dtrace_destructive_disallow &&
3583 dtrace_priv_proc_control(state) &&
3584 !dtrace_istoxic(kaddr, size)) {
3585 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3586 dtrace_copyout(kaddr, uaddr, size, flags);
3587 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3588 }
3589 break;
3590 }
3591
3592 case DIF_SUBR_COPYOUTSTR: {
3593 uintptr_t kaddr = tupregs[0].dttk_value;
3594 uintptr_t uaddr = tupregs[1].dttk_value;
3595 uint64_t size = tupregs[2].dttk_value;
3596
3597 if (!dtrace_destructive_disallow &&
3598 dtrace_priv_proc_control(state) &&
3599 !dtrace_istoxic(kaddr, size)) {
3600 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3601 dtrace_copyoutstr(kaddr, uaddr, size, flags);
3602 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3603 }
3604 break;
3605 }
3606
3607 case DIF_SUBR_STRLEN: {
3608 size_t sz;
3609 uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
3610 sz = dtrace_strlen((char *)addr,
3611 state->dts_options[DTRACEOPT_STRSIZE]);
3612
3613 if (!dtrace_canload(addr, sz + 1, mstate, vstate)) {
3614 regs[rd] = NULL;
3615 break;
3616 }
3617
3618 regs[rd] = sz;
3619
3620 break;
3621 }
3622
3623 case DIF_SUBR_STRCHR:
3624 case DIF_SUBR_STRRCHR: {
3625 /*
3626 * We're going to iterate over the string looking for the
3627 * specified character. We will iterate until we have reached
3628 * the string length or we have found the character. If this
3629 * is DIF_SUBR_STRRCHR, we will look for the last occurrence
3630 * of the specified character instead of the first.
3631 */
3632 uintptr_t saddr = tupregs[0].dttk_value;
3633 uintptr_t addr = tupregs[0].dttk_value;
3634 uintptr_t limit = addr + state->dts_options[DTRACEOPT_STRSIZE];
3635 char c, target = (char)tupregs[1].dttk_value;
3636
3637 for (regs[rd] = NULL; addr < limit; addr++) {
3638 if ((c = dtrace_load8(addr)) == target) {
3639 regs[rd] = addr;
3640
3641 if (subr == DIF_SUBR_STRCHR)
3642 break;
3643 }
3644
3645 if (c == '\0')
3646 break;
3647 }
3648
3649 if (!dtrace_canload(saddr, addr - saddr, mstate, vstate)) {
3650 regs[rd] = NULL;
3651 break;
3652 }
3653
3654 break;
3655 }
3656
3657 case DIF_SUBR_STRSTR:
3658 case DIF_SUBR_INDEX:
3659 case DIF_SUBR_RINDEX: {
3660 /*
3661 * We're going to iterate over the string looking for the
3662 * specified string. We will iterate until we have reached
3663 * the string length or we have found the string. (Yes, this
3664 * is done in the most naive way possible -- but considering
3665 * that the string we're searching for is likely to be
3666 * relatively short, the complexity of Rabin-Karp or similar
3667 * hardly seems merited.)
3668 */
3669 char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
3670 char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
3671 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3672 size_t len = dtrace_strlen(addr, size);
3673 size_t sublen = dtrace_strlen(substr, size);
3674 char *limit = addr + len, *orig = addr;
3675 int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
3676 int inc = 1;
3677
3678 regs[rd] = notfound;
3679
3680 if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
3681 regs[rd] = NULL;
3682 break;
3683 }
3684
3685 if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
3686 vstate)) {
3687 regs[rd] = NULL;
3688 break;
3689 }
3690
3691 /*
3692 * strstr() and index()/rindex() have similar semantics if
3693 * both strings are the empty string: strstr() returns a
3694 * pointer to the (empty) string, and index() and rindex()
3695 * both return index 0 (regardless of any position argument).
3696 */
3697 if (sublen == 0 && len == 0) {
3698 if (subr == DIF_SUBR_STRSTR)
3699 regs[rd] = (uintptr_t)addr;
3700 else
3701 regs[rd] = 0;
3702 break;
3703 }
3704
3705 if (subr != DIF_SUBR_STRSTR) {
3706 if (subr == DIF_SUBR_RINDEX) {
3707 limit = orig - 1;
3708 addr += len;
3709 inc = -1;
3710 }
3711
3712 /*
3713 * Both index() and rindex() take an optional position
3714 * argument that denotes the starting position.
3715 */
3716 if (nargs == 3) {
3717 int64_t pos = (int64_t)tupregs[2].dttk_value;
3718
3719 /*
3720 * If the position argument to index() is
3721 * negative, Perl implicitly clamps it at
3722 * zero. This semantic is a little surprising
3723 * given the special meaning of negative
3724 * positions to similar Perl functions like
3725 * substr(), but it appears to reflect a
3726 * notion that index() can start from a
3727 * negative index and increment its way up to
3728 * the string. Given this notion, Perl's
3729 * rindex() is at least self-consistent in
3730 * that it implicitly clamps positions greater
3731 * than the string length to be the string
3732 * length. Where Perl completely loses
3733 * coherence, however, is when the specified
3734 * substring is the empty string (""). In
3735 * this case, even if the position is
3736 * negative, rindex() returns 0 -- and even if
3737 * the position is greater than the length,
3738 * index() returns the string length. These
3739 * semantics violate the notion that index()
3740 * should never return a value less than the
3741 * specified position and that rindex() should
3742 * never return a value greater than the
3743 * specified position. (One assumes that
3744 * these semantics are artifacts of Perl's
3745 * implementation and not the results of
3746 * deliberate design -- it beggars belief that
3747 * even Larry Wall could desire such oddness.)
3748 * While in the abstract one would wish for
3749 * consistent position semantics across
3750 * substr(), index() and rindex() -- or at the
3751 * very least self-consistent position
3752 * semantics for index() and rindex() -- we
3753 * instead opt to keep with the extant Perl
3754 * semantics, in all their broken glory. (Do
3755 * we have more desire to maintain Perl's
3756 * semantics than Perl does? Probably.)
3757 */
3758 if (subr == DIF_SUBR_RINDEX) {
3759 if (pos < 0) {
3760 if (sublen == 0)
3761 regs[rd] = 0;
3762 break;
3763 }
3764
3765 if (VBDTCAST(uint64_t)pos > len)
3766 pos = len;
3767 } else {
3768 if (pos < 0)
3769 pos = 0;
3770
3771 if (VBDTCAST(uint64_t)pos >= len) {
3772 if (sublen == 0)
3773 regs[rd] = len;
3774 break;
3775 }
3776 }
3777
3778 addr = orig + pos;
3779 }
3780 }
3781
3782 for (regs[rd] = notfound; addr != limit; addr += inc) {
3783 if (dtrace_strncmp(addr, substr, sublen) == 0) {
3784 if (subr != DIF_SUBR_STRSTR) {
3785 /*
3786 * As D index() and rindex() are
3787 * modeled on Perl (and not on awk),
3788 * we return a zero-based (and not a
3789 * one-based) index. (For you Perl
3790 * weenies: no, we're not going to add
3791 * $[ -- and shouldn't you be at a con
3792 * or something?)
3793 */
3794 regs[rd] = (uintptr_t)(addr - orig);
3795 break;
3796 }
3797
3798 ASSERT(subr == DIF_SUBR_STRSTR);
3799 regs[rd] = (uintptr_t)addr;
3800 break;
3801 }
3802 }
3803
3804 break;
3805 }
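
	/*
	 * Worked examples of the (Perl-derived) semantics implemented above,
	 * with s = "barbaz" (matches at offsets 0 and 3):
	 *
	 *	index(s, "ba")		=> 0	first match
	 *	index(s, "ba", 1)	=> 3	first match at or after 1
	 *	rindex(s, "ba")		=> 3	last match
	 *	rindex(s, "ba", 2)	=> 0	last match at or before 2
	 *	index(s, "", 9)		=> 6	pos clamped to strlen(s)
	 *	rindex(s, "", -5)	=> 0	empty substring, negative pos
	 */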
3806
3807 case DIF_SUBR_STRTOK: {
3808 uintptr_t addr = tupregs[0].dttk_value;
3809 uintptr_t tokaddr = tupregs[1].dttk_value;
3810 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3811 uintptr_t limit, toklimit = tokaddr + size;
3812 uint8_t c VBDTUNASS(0), tokmap[32]; /* 256 / 8 */
3813 char *dest = (char *)mstate->dtms_scratch_ptr;
3814 VBDTTYPE(unsigned,int) i;
3815
3816 /*
3817 * Check both the token buffer and (later) the input buffer,
3818 * since both could be non-scratch addresses.
3819 */
3820 if (!dtrace_strcanload(tokaddr, size, mstate, vstate)) {
3821 regs[rd] = NULL;
3822 break;
3823 }
3824
3825 if (!DTRACE_INSCRATCH(mstate, size)) {
3826 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3827 regs[rd] = NULL;
3828 break;
3829 }
3830
3831 if (addr == NULL) {
3832 /*
3833 * If the address specified is NULL, we use our saved
3834 * strtok pointer from the mstate. Note that this
3835 * means that the saved strtok pointer is _only_
3836 * valid within multiple enablings of the same probe --
3837 * it behaves like an implicit clause-local variable.
3838 */
3839 addr = mstate->dtms_strtok;
3840 } else {
3841 /*
3842 * If the user-specified address is non-NULL we must
3843 * access check it. This is the only time we have
3844 * a chance to do so, since this address may reside
3845			 * in the string table of this clause -- future calls
3846 * (when we fetch addr from mstate->dtms_strtok)
3847 * would fail this access check.
3848 */
3849 if (!dtrace_strcanload(addr, size, mstate, vstate)) {
3850 regs[rd] = NULL;
3851 break;
3852 }
3853 }
3854
3855 /*
3856 * First, zero the token map, and then process the token
3857 * string -- setting a bit in the map for every character
3858 * found in the token string.
3859 */
3860 for (i = 0; i < sizeof (tokmap); i++)
3861 tokmap[i] = 0;
3862
3863 for (; tokaddr < toklimit; tokaddr++) {
3864 if ((c = dtrace_load8(tokaddr)) == '\0')
3865 break;
3866
3867 ASSERT((c >> 3) < sizeof (tokmap));
3868 tokmap[c >> 3] |= (1 << (c & 0x7));
3869 }
3870
3871 for (limit = addr + size; addr < limit; addr++) {
3872 /*
3873 * We're looking for a character that is _not_ contained
3874 * in the token string.
3875 */
3876 if ((c = dtrace_load8(addr)) == '\0')
3877 break;
3878
3879 if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
3880 break;
3881 }
3882
3883 if (c == '\0') {
3884 /*
3885 * We reached the end of the string without finding
3886 * any character that was not in the token string.
3887 * We return NULL in this case, and we set the saved
3888 * address to NULL as well.
3889 */
3890 regs[rd] = NULL;
3891 mstate->dtms_strtok = NULL;
3892 break;
3893 }
3894
3895 /*
3896 * From here on, we're copying into the destination string.
3897 */
3898 for (i = 0; addr < limit && i < size - 1; addr++) {
3899 if ((c = dtrace_load8(addr)) == '\0')
3900 break;
3901
3902 if (tokmap[c >> 3] & (1 << (c & 0x7)))
3903 break;
3904
3905 ASSERT(i < size);
3906 dest[i++] = c;
3907 }
3908
3909 ASSERT(i < size);
3910 dest[i] = '\0';
3911 regs[rd] = (uintptr_t)dest;
3912 mstate->dtms_scratch_ptr += size;
3913 mstate->dtms_strtok = addr;
3914 break;
3915 }
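
	/*
	 * The token map used above is a 256-bit membership bitmap -- 32
	 * bytes, one bit per possible byte value.  For an unsigned byte c,
	 * the two idioms in play are:
	 *
	 *	tokmap[c >> 3] |= (1 << (c & 0x7));	mark c as a delimiter
	 *	(tokmap[c >> 3] & (1 << (c & 0x7)))	test: is c a delimiter?
	 *
	 * c >> 3 selects one of the 32 bytes and c & 0x7 selects the bit
	 * within it, giving constant-time membership tests during the scan.
	 */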
3916
3917 case DIF_SUBR_SUBSTR: {
3918 uintptr_t s = tupregs[0].dttk_value;
3919 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3920 char *d = (char *)mstate->dtms_scratch_ptr;
3921 int64_t index = (int64_t)tupregs[1].dttk_value;
3922 int64_t remaining = (int64_t)tupregs[2].dttk_value;
3923 size_t len = dtrace_strlen((char *)s, size);
3924 int64_t i;
3925
3926 if (!dtrace_canload(s, len + 1, mstate, vstate)) {
3927 regs[rd] = NULL;
3928 break;
3929 }
3930
3931 if (!DTRACE_INSCRATCH(mstate, size)) {
3932 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3933 regs[rd] = NULL;
3934 break;
3935 }
3936
3937 if (nargs <= 2)
3938 remaining = (int64_t)size;
3939
3940 if (index < 0) {
3941 index += len;
3942
3943 if (index < 0 && index + remaining > 0) {
3944 remaining += index;
3945 index = 0;
3946 }
3947 }
3948
3949 if (VBDTCAST(uint64_t)index >= len || index < 0) {
3950 remaining = 0;
3951 } else if (remaining < 0) {
3952 remaining += len - index;
3953 } else if (VBDTCAST(uint64_t)index + remaining > size) {
3954 remaining = size - index;
3955 }
3956
3957 for (i = 0; i < remaining; i++) {
3958 if ((d[i] = dtrace_load8(s + index + i)) == '\0')
3959 break;
3960 }
3961
3962 d[i] = '\0';
3963
3964 mstate->dtms_scratch_ptr += size;
3965 regs[rd] = (uintptr_t)d;
3966 break;
3967 }
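
	/*
	 * Worked examples of the index/remaining normalization above, with
	 * s = "hello" (len = 5):
	 *
	 *	substr(s, 1, 3)		=> "ell"
	 *	substr(s, -3, 2)	=> "ll"		index += len => 2
	 *	substr(s, 2)		=> "llo"	remaining defaults to size
	 *	substr(s, 1, -2)	=> "el"		remaining += len - index
	 */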
3968
3969 case DIF_SUBR_GETMAJOR:
3970#ifndef VBOX
3971#ifdef _LP64
3972 regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR64) & MAXMAJ64;
3973#else
3974 regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR) & MAXMAJ;
3975#endif
3976#else
3977 regs[rd] = 0;
3978 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3979#endif
3980 break;
3981
3982 case DIF_SUBR_GETMINOR:
3983#ifndef VBOX
3984#ifdef _LP64
3985 regs[rd] = tupregs[0].dttk_value & MAXMIN64;
3986#else
3987 regs[rd] = tupregs[0].dttk_value & MAXMIN;
3988#endif
3989#else
3990 regs[rd] = 0;
3991 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3992#endif
3993 break;
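
	/*
	 * Both of the cases above decompose a Solaris dev_t, which packs the
	 * major and minor numbers into a single integer.  As a sketch, the
	 * 64-bit encoding that these shifts and masks assume is:
	 *
	 *	dev   = (major << NBITSMINOR64) | (minor & MAXMIN64);
	 *	major = (dev >> NBITSMINOR64) & MAXMAJ64;
	 *	minor = dev & MAXMIN64;
	 *
	 * The VBox build has no dev_t to decompose, so the operations are
	 * flagged as illegal instead.
	 */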
3994
3995 case DIF_SUBR_DDI_PATHNAME: {
3996#ifndef VBOX
3997 /*
3998 * This one is a galactic mess. We are going to roughly
3999 * emulate ddi_pathname(), but it's made more complicated
4000 * by the fact that we (a) want to include the minor name and
4001 * (b) must proceed iteratively instead of recursively.
4002 */
4003 uintptr_t dest = mstate->dtms_scratch_ptr;
4004 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4005 char *start = (char *)dest, *end = start + size - 1;
4006 uintptr_t daddr = tupregs[0].dttk_value;
4007 int64_t minor = (int64_t)tupregs[1].dttk_value;
4008 char *s;
4009 int i, len, depth = 0;
4010
4011 /*
4012 * Due to all the pointer jumping we do and context we must
4013 * rely upon, we just mandate that the user must have kernel
4014 * read privileges to use this routine.
4015 */
4016 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) == 0) {
4017 *flags |= CPU_DTRACE_KPRIV;
4018 *illval = daddr;
4019 regs[rd] = NULL;
4020 }
4021
4022 if (!DTRACE_INSCRATCH(mstate, size)) {
4023 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4024 regs[rd] = NULL;
4025 break;
4026 }
4027
4028 *end = '\0';
4029
4030 /*
4031 * We want to have a name for the minor. In order to do this,
4032 * we need to walk the minor list from the devinfo. We want
4033 * to be sure that we don't infinitely walk a circular list,
4034 * so we check for circularity by sending a scout pointer
4035 * ahead two elements for every element that we iterate over;
4036 * if the list is circular, these will ultimately point to the
4037 * same element. You may recognize this little trick as the
4038 * answer to a stupid interview question -- one that always
4039 * seems to be asked by those who had to have it laboriously
4040 * explained to them, and who can't even concisely describe
4041 * the conditions under which one would be forced to resort to
4042 * this technique. Needless to say, those conditions are
4043 * found here -- and probably only here. Is this the only use
4044 * of this infamous trick in shipping, production code? If it
4045 * isn't, it probably should be...
4046 */
4047 if (minor != -1) {
4048 uintptr_t maddr = dtrace_loadptr(daddr +
4049 offsetof(struct dev_info, devi_minor));
4050
4051 uintptr_t next = offsetof(struct ddi_minor_data, next);
4052 uintptr_t name = offsetof(struct ddi_minor_data,
4053 d_minor) + offsetof(struct ddi_minor, name);
4054 uintptr_t dev = offsetof(struct ddi_minor_data,
4055 d_minor) + offsetof(struct ddi_minor, dev);
4056 uintptr_t scout;
4057
4058 if (maddr != NULL)
4059 scout = dtrace_loadptr(maddr + next);
4060
4061 while (maddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
4062 uint64_t m;
4063#ifdef _LP64
4064 m = dtrace_load64(maddr + dev) & MAXMIN64;
4065#else
4066 m = dtrace_load32(maddr + dev) & MAXMIN;
4067#endif
4068 if (m != minor) {
4069 maddr = dtrace_loadptr(maddr + next);
4070
4071 if (scout == NULL)
4072 continue;
4073
4074 scout = dtrace_loadptr(scout + next);
4075
4076 if (scout == NULL)
4077 continue;
4078
4079 scout = dtrace_loadptr(scout + next);
4080
4081 if (scout == NULL)
4082 continue;
4083
4084 if (scout == maddr) {
4085 *flags |= CPU_DTRACE_ILLOP;
4086 break;
4087 }
4088
4089 continue;
4090 }
4091
4092 /*
4093 * We have the minor data. Now we need to
4094 * copy the minor's name into the end of the
4095 * pathname.
4096 */
4097 s = (char *)dtrace_loadptr(maddr + name);
4098 len = dtrace_strlen(s, size);
4099
4100 if (*flags & CPU_DTRACE_FAULT)
4101 break;
4102
4103 if (len != 0) {
4104 if ((end -= (len + 1)) < start)
4105 break;
4106
4107 *end = ':';
4108 }
4109
4110 for (i = 1; i <= len; i++)
4111 end[i] = dtrace_load8((uintptr_t)s++);
4112 break;
4113 }
4114 }
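
		/*
		 * In the abstract, the scout-pointer check above reduces to
		 * the classic sketch below, where next() stands in for the
		 * dtrace_loadptr(... + next) loads (it is not a real
		 * function):
		 *
		 *	slow = head; fast = next(head);
		 *	while (fast != NULL && next(fast) != NULL) {
		 *		slow = next(slow);		(one hop)
		 *		fast = next(next(fast));	(two hops)
		 *		if (slow == fast)
		 *			return (circular);
		 *	}
		 *	return (terminates);
		 */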
4115
4116 while (daddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
4117 ddi_node_state_t devi_state;
4118
4119 devi_state = dtrace_load32(daddr +
4120 offsetof(struct dev_info, devi_node_state));
4121
4122 if (*flags & CPU_DTRACE_FAULT)
4123 break;
4124
4125 if (devi_state >= DS_INITIALIZED) {
4126 s = (char *)dtrace_loadptr(daddr +
4127 offsetof(struct dev_info, devi_addr));
4128 len = dtrace_strlen(s, size);
4129
4130 if (*flags & CPU_DTRACE_FAULT)
4131 break;
4132
4133 if (len != 0) {
4134 if ((end -= (len + 1)) < start)
4135 break;
4136
4137 *end = '@';
4138 }
4139
4140 for (i = 1; i <= len; i++)
4141 end[i] = dtrace_load8((uintptr_t)s++);
4142 }
4143
4144 /*
4145 * Now for the node name...
4146 */
4147 s = (char *)dtrace_loadptr(daddr +
4148 offsetof(struct dev_info, devi_node_name));
4149
4150 daddr = dtrace_loadptr(daddr +
4151 offsetof(struct dev_info, devi_parent));
4152
4153 /*
4154 * If our parent is NULL (that is, if we're the root
4155 * node), we're going to use the special path
4156 * "devices".
4157 */
4158 if (daddr == NULL)
4159 s = "devices";
4160
4161 len = dtrace_strlen(s, size);
4162 if (*flags & CPU_DTRACE_FAULT)
4163 break;
4164
4165 if ((end -= (len + 1)) < start)
4166 break;
4167
4168 for (i = 1; i <= len; i++)
4169 end[i] = dtrace_load8((uintptr_t)s++);
4170 *end = '/';
4171
4172 if (depth++ > dtrace_devdepth_max) {
4173 *flags |= CPU_DTRACE_ILLOP;
4174 break;
4175 }
4176 }
4177
4178 if (end < start)
4179 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4180
4181 if (daddr == NULL) {
4182 regs[rd] = (uintptr_t)end;
4183 mstate->dtms_scratch_ptr += size;
4184 }
4185
4186#else
4187 regs[rd] = 0;
4188 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4189#endif
4190 break;
4191 }
4192
4193 case DIF_SUBR_STRJOIN: {
4194 char *d = (char *)mstate->dtms_scratch_ptr;
4195 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4196 uintptr_t s1 = tupregs[0].dttk_value;
4197 uintptr_t s2 = tupregs[1].dttk_value;
4198 VBDTTYPE(unsigned,int) i = 0;
4199
4200 if (!dtrace_strcanload(s1, size, mstate, vstate) ||
4201 !dtrace_strcanload(s2, size, mstate, vstate)) {
4202 regs[rd] = NULL;
4203 break;
4204 }
4205
4206 if (!DTRACE_INSCRATCH(mstate, size)) {
4207 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4208 regs[rd] = NULL;
4209 break;
4210 }
4211
4212 for (;;) {
4213 if (i >= size) {
4214 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4215 regs[rd] = NULL;
4216 break;
4217 }
4218
4219 if ((d[i++] = dtrace_load8(s1++)) == '\0') {
4220 i--;
4221 break;
4222 }
4223 }
4224
4225 for (;;) {
4226 if (i >= size) {
4227 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4228 regs[rd] = NULL;
4229 break;
4230 }
4231
4232 if ((d[i++] = dtrace_load8(s2++)) == '\0')
4233 break;
4234 }
4235
4236 if (i < size) {
4237 mstate->dtms_scratch_ptr += i;
4238 regs[rd] = (uintptr_t)d;
4239 }
4240
4241 break;
4242 }
4243
4244 case DIF_SUBR_LLTOSTR: {
4245 int64_t i = (int64_t)tupregs[0].dttk_value;
4246 int64_t val = i < 0 ? i * -1 : i;
4247 uint64_t size = 22; /* enough room for 2^64 in decimal */
4248 char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
4249
4250 if (!DTRACE_INSCRATCH(mstate, size)) {
4251 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4252 regs[rd] = NULL;
4253 break;
4254 }
4255
4256 for (*end-- = '\0'; val; val /= 10)
4257 *end-- = '0' + (val % 10);
4258
4259 if (i == 0)
4260 *end-- = '0';
4261
4262 if (i < 0)
4263 *end-- = '-';
4264
4265 regs[rd] = (uintptr_t)end + 1;
4266 mstate->dtms_scratch_ptr += size;
4267 break;
4268 }
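
	/*
	 * The conversion above fills the scratch buffer from the end, one
	 * decimal digit per division.  For i = -305, the buffer evolves as
	 * follows (end walking leftward):
	 *
	 *	".......\0"	terminator stored first
	 *	"......5\0"	305 % 10
	 *	".....05\0"	 30 % 10
	 *	"....305\0"	  3 % 10
	 *	"...-305\0"	sign, since i < 0
	 *
	 * leaving regs[rd] pointing at the '-'.  val was negated up front so
	 * that the modulus arithmetic only ever sees a non-negative value.
	 */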
4269
4270 case DIF_SUBR_HTONS:
4271 case DIF_SUBR_NTOHS:
4272#ifdef _BIG_ENDIAN
4273 regs[rd] = (uint16_t)tupregs[0].dttk_value;
4274#else
4275 regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);
4276#endif
4277 break;
4278
4279
4280 case DIF_SUBR_HTONL:
4281 case DIF_SUBR_NTOHL:
4282#ifdef _BIG_ENDIAN
4283 regs[rd] = (uint32_t)tupregs[0].dttk_value;
4284#else
4285 regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);
4286#endif
4287 break;
4288
4289
4290 case DIF_SUBR_HTONLL:
4291 case DIF_SUBR_NTOHLL:
4292#ifdef _BIG_ENDIAN
4293 regs[rd] = (uint64_t)tupregs[0].dttk_value;
4294#else
4295 regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);
4296#endif
4297 break;
4298
4299
4300 case DIF_SUBR_DIRNAME:
4301 case DIF_SUBR_BASENAME: {
4302 char *dest = (char *)mstate->dtms_scratch_ptr;
4303 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4304 uintptr_t src = tupregs[0].dttk_value;
4305 int i, j, len = VBDTCAST(int)dtrace_strlen((char *)src, size);
4306 int lastbase = -1, firstbase = -1, lastdir = -1;
4307 int start, end;
4308
4309 if (!dtrace_canload(src, len + 1, mstate, vstate)) {
4310 regs[rd] = NULL;
4311 break;
4312 }
4313
4314 if (!DTRACE_INSCRATCH(mstate, size)) {
4315 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4316 regs[rd] = NULL;
4317 break;
4318 }
4319
4320 /*
4321 * The basename and dirname for a zero-length string is
4322 * defined to be "."
4323 */
4324 if (len == 0) {
4325 len = 1;
4326 src = (uintptr_t)".";
4327 }
4328
4329 /*
4330 * Start from the back of the string, moving back toward the
4331 * front until we see a character that isn't a slash. That
4332 * character is the last character in the basename.
4333 */
4334 for (i = len - 1; i >= 0; i--) {
4335 if (dtrace_load8(src + i) != '/')
4336 break;
4337 }
4338
4339 if (i >= 0)
4340 lastbase = i;
4341
4342 /*
4343 * Starting from the last character in the basename, move
4344 * towards the front until we find a slash. The character
4345 * that we processed immediately before that is the first
4346 * character in the basename.
4347 */
4348 for (; i >= 0; i--) {
4349 if (dtrace_load8(src + i) == '/')
4350 break;
4351 }
4352
4353 if (i >= 0)
4354 firstbase = i + 1;
4355
4356 /*
4357 * Now keep going until we find a non-slash character. That
4358 * character is the last character in the dirname.
4359 */
4360 for (; i >= 0; i--) {
4361 if (dtrace_load8(src + i) != '/')
4362 break;
4363 }
4364
4365 if (i >= 0)
4366 lastdir = i;
4367
4368 ASSERT(!(lastbase == -1 && firstbase != -1));
4369 ASSERT(!(firstbase == -1 && lastdir != -1));
4370
4371 if (lastbase == -1) {
4372 /*
4373 * We didn't find a non-slash character. We know that
4374 * the length is non-zero, so the whole string must be
4375 * slashes. In either the dirname or the basename
4376 * case, we return '/'.
4377 */
4378 ASSERT(firstbase == -1);
4379 firstbase = lastbase = lastdir = 0;
4380 }
4381
4382 if (firstbase == -1) {
4383 /*
4384 * The entire string consists only of a basename
4385 * component. If we're looking for dirname, we need
4386 * to change our string to be just "."; if we're
4387 * looking for a basename, we'll just set the first
4388 * character of the basename to be 0.
4389 */
4390 if (subr == DIF_SUBR_DIRNAME) {
4391 ASSERT(lastdir == -1);
4392 src = (uintptr_t)".";
4393 lastdir = 0;
4394 } else {
4395 firstbase = 0;
4396 }
4397 }
4398
4399 if (subr == DIF_SUBR_DIRNAME) {
4400 if (lastdir == -1) {
4401 /*
4402 * We know that we have a slash in the name --
4403 * or lastdir would be set to 0, above. And
4404 * because lastdir is -1, we know that this
4405 * slash must be the first character. (That
4406 * is, the full string must be of the form
4407 * "/basename".) In this case, the last
4408 * character of the directory name is 0.
4409 */
4410 lastdir = 0;
4411 }
4412
4413 start = 0;
4414 end = lastdir;
4415 } else {
4416 ASSERT(subr == DIF_SUBR_BASENAME);
4417 ASSERT(firstbase != -1 && lastbase != -1);
4418 start = firstbase;
4419 end = lastbase;
4420 }
4421
4422 for (i = start, j = 0; i <= end && VBDTCAST(unsigned)j < size - 1; i++, j++)
4423 dest[j] = dtrace_load8(src + i);
4424
4425 dest[j] = '\0';
4426 regs[rd] = (uintptr_t)dest;
4427 mstate->dtms_scratch_ptr += size;
4428 break;
4429 }
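
	/*
	 * Worked examples of the three markers computed above, for
	 * src = "/usr/lib/" (len = 9):
	 *
	 *	lastbase  = 7	the 'b' -- last character of the basename
	 *	firstbase = 5	the 'l' -- first character of the basename
	 *	lastdir   = 3	the 'r' -- last character of the dirname
	 *
	 * so basename() copies [5, 7] => "lib" and dirname() copies
	 * [0, 3] => "/usr", matching the behavior of the /usr/bin utilities.
	 */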
4430
4431 case DIF_SUBR_CLEANPATH: {
4432 char *dest = (char *)mstate->dtms_scratch_ptr, c;
4433 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4434 uintptr_t src = tupregs[0].dttk_value;
4435 int i = 0, j = 0;
4436
4437 if (!dtrace_strcanload(src, size, mstate, vstate)) {
4438 regs[rd] = NULL;
4439 break;
4440 }
4441
4442 if (!DTRACE_INSCRATCH(mstate, size)) {
4443 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4444 regs[rd] = NULL;
4445 break;
4446 }
4447
4448 /*
4449 * Move forward, loading each character.
4450 */
4451 do {
4452 c = dtrace_load8(src + i++);
4453next:
4454			if (j + 5 >= VBDTCAST(int64_t)size)	/* 5 = sizeof("/..c") */
4455 break;
4456
4457 if (c != '/') {
4458 dest[j++] = c;
4459 continue;
4460 }
4461
4462 c = dtrace_load8(src + i++);
4463
4464 if (c == '/') {
4465 /*
4466 * We have two slashes -- we can just advance
4467 * to the next character.
4468 */
4469 goto next;
4470 }
4471
4472 if (c != '.') {
4473 /*
4474 * This is not "." and it's not ".." -- we can
4475 * just store the "/" and this character and
4476 * drive on.
4477 */
4478 dest[j++] = '/';
4479 dest[j++] = c;
4480 continue;
4481 }
4482
4483 c = dtrace_load8(src + i++);
4484
4485 if (c == '/') {
4486 /*
4487 * This is a "/./" component. We're not going
4488 * to store anything in the destination buffer;
4489 * we're just going to go to the next component.
4490 */
4491 goto next;
4492 }
4493
4494 if (c != '.') {
4495 /*
4496 * This is not ".." -- we can just store the
4497 * "/." and this character and continue
4498 * processing.
4499 */
4500 dest[j++] = '/';
4501 dest[j++] = '.';
4502 dest[j++] = c;
4503 continue;
4504 }
4505
4506 c = dtrace_load8(src + i++);
4507
4508 if (c != '/' && c != '\0') {
4509 /*
4510 * This is not ".." -- it's "..[mumble]".
4511 * We'll store the "/.." and this character
4512 * and continue processing.
4513 */
4514 dest[j++] = '/';
4515 dest[j++] = '.';
4516 dest[j++] = '.';
4517 dest[j++] = c;
4518 continue;
4519 }
4520
4521 /*
4522 * This is "/../" or "/..\0". We need to back up
4523 * our destination pointer until we find a "/".
4524 */
4525 i--;
4526 while (j != 0 && dest[--j] != '/')
4527 continue;
4528
4529 if (c == '\0')
4530 dest[++j] = '/';
4531 } while (c != '\0');
4532
4533 dest[j] = '\0';
4534 regs[rd] = (uintptr_t)dest;
4535 mstate->dtms_scratch_ptr += size;
4536 break;
4537 }
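
	/*
	 * Examples of the normalization performed above:
	 *
	 *	"/a//b"		=> "/a/b"	"//" collapsed
	 *	"/a/./b"	=> "/a/b"	"/./" dropped
	 *	"/a/b/../c"	=> "/a/c"	backed up over "/../"
	 *	"/a/.."		=> "/"		trailing "/.." backed up
	 *
	 * Note that this is purely textual: the file system is never
	 * consulted, so symbolic links are not resolved.
	 */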
4538
4539 case DIF_SUBR_INET_NTOA:
4540 case DIF_SUBR_INET_NTOA6:
4541 case DIF_SUBR_INET_NTOP: {
4542#ifndef VBOX
4543 size_t size;
4544 int af, argi, i;
4545 char *base, *end;
4546
4547 if (subr == DIF_SUBR_INET_NTOP) {
4548 af = (int)tupregs[0].dttk_value;
4549 argi = 1;
4550 } else {
4551 af = subr == DIF_SUBR_INET_NTOA ? AF_INET: AF_INET6;
4552 argi = 0;
4553 }
4554
4555 if (af == AF_INET) {
4556 ipaddr_t ip4;
4557 uint8_t *ptr8, val;
4558
4559 /*
4560 * Safely load the IPv4 address.
4561 */
4562 ip4 = dtrace_load32(tupregs[argi].dttk_value);
4563
4564 /*
4565 * Check an IPv4 string will fit in scratch.
4566 */
4567 size = INET_ADDRSTRLEN;
4568 if (!DTRACE_INSCRATCH(mstate, size)) {
4569 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4570 regs[rd] = NULL;
4571 break;
4572 }
4573 base = (char *)mstate->dtms_scratch_ptr;
4574 end = (char *)mstate->dtms_scratch_ptr + size - 1;
4575
4576 /*
4577 * Stringify as a dotted decimal quad.
4578 */
4579 *end-- = '\0';
4580 ptr8 = (uint8_t *)&ip4;
4581 for (i = 3; i >= 0; i--) {
4582 val = ptr8[i];
4583
4584 if (val == 0) {
4585 *end-- = '0';
4586 } else {
4587 for (; val; val /= 10) {
4588 *end-- = '0' + (val % 10);
4589 }
4590 }
4591
4592 if (i > 0)
4593 *end-- = '.';
4594 }
4595 ASSERT(end + 1 >= base);
4596
4597 } else if (af == AF_INET6) {
4598 struct in6_addr ip6;
4599 int firstzero, tryzero, numzero, v6end;
4600 uint16_t val;
4601 const char digits[] = "0123456789abcdef";
4602
4603 /*
4604 * Stringify using RFC 1884 convention 2 - 16 bit
4605 * hexadecimal values with a zero-run compression.
4606 * Lower case hexadecimal digits are used.
4607 * eg, fe80::214:4fff:fe0b:76c8.
4608 * The IPv4 embedded form is returned for inet_ntop,
4609 * just the IPv4 string is returned for inet_ntoa6.
4610 */
4611
4612 /*
4613 * Safely load the IPv6 address.
4614 */
4615 dtrace_bcopy(
4616 (void *)(uintptr_t)tupregs[argi].dttk_value,
4617 (void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
4618
4619 /*
4620 * Check an IPv6 string will fit in scratch.
4621 */
4622 size = INET6_ADDRSTRLEN;
4623 if (!DTRACE_INSCRATCH(mstate, size)) {
4624 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4625 regs[rd] = NULL;
4626 break;
4627 }
4628 base = (char *)mstate->dtms_scratch_ptr;
4629 end = (char *)mstate->dtms_scratch_ptr + size - 1;
4630 *end-- = '\0';
4631
4632 /*
4633 * Find the longest run of 16 bit zero values
4634 * for the single allowed zero compression - "::".
4635 */
4636 firstzero = -1;
4637 tryzero = -1;
4638 numzero = 1;
4639 for (i = 0; i < sizeof (struct in6_addr); i++) {
4640 if (ip6._S6_un._S6_u8[i] == 0 &&
4641 tryzero == -1 && i % 2 == 0) {
4642 tryzero = i;
4643 continue;
4644 }
4645
4646 if (tryzero != -1 &&
4647 (ip6._S6_un._S6_u8[i] != 0 ||
4648 i == sizeof (struct in6_addr) - 1)) {
4649
4650 if (i - tryzero <= numzero) {
4651 tryzero = -1;
4652 continue;
4653 }
4654
4655 firstzero = tryzero;
4656 numzero = i - i % 2 - tryzero;
4657 tryzero = -1;
4658
4659 if (ip6._S6_un._S6_u8[i] == 0 &&
4660 i == sizeof (struct in6_addr) - 1)
4661 numzero += 2;
4662 }
4663 }
4664 ASSERT(firstzero + numzero <= sizeof (struct in6_addr));
4665
4666 /*
4667 * Check for an IPv4 embedded address.
4668 */
4669 v6end = sizeof (struct in6_addr) - 2;
4670 if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
4671 IN6_IS_ADDR_V4COMPAT(&ip6)) {
4672 for (i = sizeof (struct in6_addr) - 1;
4673 i >= DTRACE_V4MAPPED_OFFSET; i--) {
4674 ASSERT(end >= base);
4675
4676 val = ip6._S6_un._S6_u8[i];
4677
4678 if (val == 0) {
4679 *end-- = '0';
4680 } else {
4681 for (; val; val /= 10) {
4682 *end-- = '0' + val % 10;
4683 }
4684 }
4685
4686 if (i > DTRACE_V4MAPPED_OFFSET)
4687 *end-- = '.';
4688 }
4689
4690 if (subr == DIF_SUBR_INET_NTOA6)
4691 goto inetout;
4692
4693 /*
4694 * Set v6end to skip the IPv4 address that
4695 * we have already stringified.
4696 */
4697 v6end = 10;
4698 }
4699
4700 /*
4701 * Build the IPv6 string by working through the
4702 * address in reverse.
4703 */
4704 for (i = v6end; i >= 0; i -= 2) {
4705 ASSERT(end >= base);
4706
4707 if (i == firstzero + numzero - 2) {
4708 *end-- = ':';
4709 *end-- = ':';
4710 i -= numzero - 2;
4711 continue;
4712 }
4713
4714 if (i < 14 && i != firstzero - 2)
4715 *end-- = ':';
4716
4717 val = (ip6._S6_un._S6_u8[i] << 8) +
4718 ip6._S6_un._S6_u8[i + 1];
4719
4720 if (val == 0) {
4721 *end-- = '0';
4722 } else {
4723 for (; val; val /= 16) {
4724 *end-- = digits[val % 16];
4725 }
4726 }
4727 }
4728 ASSERT(end + 1 >= base);
4729
4730 } else {
4731 /*
4732				 * The user didn't use AF_INET or AF_INET6.
4733 */
4734 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4735 regs[rd] = NULL;
4736 break;
4737 }
4738
4739inetout: regs[rd] = (uintptr_t)end + 1;
4740 mstate->dtms_scratch_ptr += size;
4741#else /* VBOX */
4742 regs[rd] = 0;
4743 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4744#endif /* VBOX */
4745 break;
4746 }
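
	/*
	 * A note on the zero-run scan in the AF_INET6 path above: RFC 1884
	 * permits "::" to replace at most one run of zero-valued 16-bit
	 * groups, so the loop records the longest even-aligned run.  For
	 * example:
	 *
	 *	fe80:0000:0000:0000:0214:4fff:fe0b:76c8
	 *		=> firstzero = 2, numzero = 6 (bytes 2 through 7)
	 *		=> "fe80::214:4fff:fe0b:76c8"
	 *
	 * Because the string is built back to front, the "::" is emitted
	 * when the downward-counting index reaches the last group of the
	 * recorded run (i == firstzero + numzero - 2).
	 */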
4747
4748 }
4749}
4750
4751/*
4752 * Emulate the execution of DTrace IR instructions specified by the given
4753 * DIF object. This function is deliberately void of assertions as all of
4754 * the necessary checks are handled by a call to dtrace_difo_validate().
4755 */
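/*
 * (For reference: each dif_instr_t is a fixed-width 32-bit word; the opcode
 * occupies the top byte and is extracted with DIF_INSTR_OP(), while the
 * DIF_INSTR_R1/R2/RD accessors used below pull the register fields out of
 * the remainder of the word.  Wider operands -- variables, labels, and
 * integer- and string-table indices -- are packed into the same word and
 * extracted with the corresponding DIF_INSTR_* accessors.)
 */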
4756static uint64_t
4757dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
4758 dtrace_vstate_t *vstate, dtrace_state_t *state)
4759{
4760 const dif_instr_t *text = difo->dtdo_buf;
4761 const uint_t textlen = difo->dtdo_len;
4762 const char *strtab = difo->dtdo_strtab;
4763 const uint64_t *inttab = difo->dtdo_inttab;
4764
4765 uint64_t rval = 0;
4766 dtrace_statvar_t *svar;
4767 dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
4768 dtrace_difv_t *v;
4769 volatile uint16_t *flags = &cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags;
4770 volatile uintptr_t *illval = &cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_illval;
4771
4772 dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
4773 uint64_t regs[DIF_DIR_NREGS];
4774 uint64_t *tmp;
4775
4776 uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
4777 int64_t cc_r;
4778 uint_t pc = 0, id, opc VBDTUNASS(0);
4779 uint8_t ttop = 0;
4780 dif_instr_t instr;
4781 uint_t r1, r2, rd;
4782
4783 /*
4784 * We stash the current DIF object into the machine state: we need it
4785 * for subsequent access checking.
4786 */
4787 mstate->dtms_difo = difo;
4788
4789 regs[DIF_REG_R0] = 0; /* %r0 is fixed at zero */
4790
4791 while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
4792 opc = pc;
4793
4794 instr = text[pc++];
4795 r1 = DIF_INSTR_R1(instr);
4796 r2 = DIF_INSTR_R2(instr);
4797 rd = DIF_INSTR_RD(instr);
4798
4799 switch (DIF_INSTR_OP(instr)) {
4800 case DIF_OP_OR:
4801 regs[rd] = regs[r1] | regs[r2];
4802 break;
4803 case DIF_OP_XOR:
4804 regs[rd] = regs[r1] ^ regs[r2];
4805 break;
4806 case DIF_OP_AND:
4807 regs[rd] = regs[r1] & regs[r2];
4808 break;
4809 case DIF_OP_SLL:
4810 regs[rd] = regs[r1] << regs[r2];
4811 break;
4812 case DIF_OP_SRL:
4813 regs[rd] = regs[r1] >> regs[r2];
4814 break;
4815 case DIF_OP_SUB:
4816 regs[rd] = regs[r1] - regs[r2];
4817 break;
4818 case DIF_OP_ADD:
4819 regs[rd] = regs[r1] + regs[r2];
4820 break;
4821 case DIF_OP_MUL:
4822 regs[rd] = regs[r1] * regs[r2];
4823 break;
4824 case DIF_OP_SDIV:
4825 if (regs[r2] == 0) {
4826 regs[rd] = 0;
4827 *flags |= CPU_DTRACE_DIVZERO;
4828 } else {
4829 regs[rd] = (int64_t)regs[r1] /
4830 (int64_t)regs[r2];
4831 }
4832 break;
4833
4834 case DIF_OP_UDIV:
4835 if (regs[r2] == 0) {
4836 regs[rd] = 0;
4837 *flags |= CPU_DTRACE_DIVZERO;
4838 } else {
4839 regs[rd] = regs[r1] / regs[r2];
4840 }
4841 break;
4842
4843 case DIF_OP_SREM:
4844 if (regs[r2] == 0) {
4845 regs[rd] = 0;
4846 *flags |= CPU_DTRACE_DIVZERO;
4847 } else {
4848 regs[rd] = (int64_t)regs[r1] %
4849 (int64_t)regs[r2];
4850 }
4851 break;
4852
4853 case DIF_OP_UREM:
4854 if (regs[r2] == 0) {
4855 regs[rd] = 0;
4856 *flags |= CPU_DTRACE_DIVZERO;
4857 } else {
4858 regs[rd] = regs[r1] % regs[r2];
4859 }
4860 break;
4861
4862 case DIF_OP_NOT:
4863 regs[rd] = ~regs[r1];
4864 break;
4865 case DIF_OP_MOV:
4866 regs[rd] = regs[r1];
4867 break;
4868 case DIF_OP_CMP:
4869 cc_r = regs[r1] - regs[r2];
4870 cc_n = cc_r < 0;
4871 cc_z = cc_r == 0;
4872 cc_v = 0;
4873 cc_c = regs[r1] < regs[r2];
4874 break;
4875 case DIF_OP_TST:
4876 cc_n = cc_v = cc_c = 0;
4877 cc_z = regs[r1] == 0;
4878 break;
4879 case DIF_OP_BA:
4880 pc = DIF_INSTR_LABEL(instr);
4881 break;
4882 case DIF_OP_BE:
4883 if (cc_z)
4884 pc = DIF_INSTR_LABEL(instr);
4885 break;
4886 case DIF_OP_BNE:
4887 if (cc_z == 0)
4888 pc = DIF_INSTR_LABEL(instr);
4889 break;
4890 case DIF_OP_BG:
4891 if ((cc_z | (cc_n ^ cc_v)) == 0)
4892 pc = DIF_INSTR_LABEL(instr);
4893 break;
4894 case DIF_OP_BGU:
4895 if ((cc_c | cc_z) == 0)
4896 pc = DIF_INSTR_LABEL(instr);
4897 break;
4898 case DIF_OP_BGE:
4899 if ((cc_n ^ cc_v) == 0)
4900 pc = DIF_INSTR_LABEL(instr);
4901 break;
4902 case DIF_OP_BGEU:
4903 if (cc_c == 0)
4904 pc = DIF_INSTR_LABEL(instr);
4905 break;
4906 case DIF_OP_BL:
4907 if (cc_n ^ cc_v)
4908 pc = DIF_INSTR_LABEL(instr);
4909 break;
4910 case DIF_OP_BLU:
4911 if (cc_c)
4912 pc = DIF_INSTR_LABEL(instr);
4913 break;
4914 case DIF_OP_BLE:
4915 if (cc_z | (cc_n ^ cc_v))
4916 pc = DIF_INSTR_LABEL(instr);
4917 break;
4918 case DIF_OP_BLEU:
4919 if (cc_c | cc_z)
4920 pc = DIF_INSTR_LABEL(instr);
4921 break;
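		/*
		 * The branch predicates above follow the usual processor
		 * condition-code idioms for the comparison of r1 with r2:
		 *
		 *	BE:	Z			equal
		 *	BNE:	!Z			not equal
		 *	BG:	!(Z | (N ^ V))		signed >
		 *	BGE:	!(N ^ V)		signed >=
		 *	BL:	N ^ V			signed <
		 *	BLE:	Z | (N ^ V)		signed <=
		 *	BGU:	!(C | Z)		unsigned >
		 *	BGEU:	!C			unsigned >=
		 *	BLU:	C			unsigned <
		 *	BLEU:	C | Z			unsigned <=
		 *
		 * with N, Z, V and C as computed by DIF_OP_CMP and DIF_OP_TST.
		 */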
4922 case DIF_OP_RLDSB:
4923 if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
4924 *flags |= CPU_DTRACE_KPRIV;
4925 *illval = regs[r1];
4926 break;
4927 }
4928 /*FALLTHROUGH*/
4929 case DIF_OP_LDSB:
4930 regs[rd] = (int8_t)dtrace_load8(regs[r1]);
4931 break;
4932 case DIF_OP_RLDSH:
4933 if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
4934 *flags |= CPU_DTRACE_KPRIV;
4935 *illval = regs[r1];
4936 break;
4937 }
4938 /*FALLTHROUGH*/
4939 case DIF_OP_LDSH:
4940 regs[rd] = (int16_t)dtrace_load16(regs[r1]);
4941 break;
4942 case DIF_OP_RLDSW:
4943 if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
4944 *flags |= CPU_DTRACE_KPRIV;
4945 *illval = regs[r1];
4946 break;
4947 }
4948 /*FALLTHROUGH*/
4949 case DIF_OP_LDSW:
4950 regs[rd] = (int32_t)dtrace_load32(regs[r1]);
4951 break;
4952 case DIF_OP_RLDUB:
4953 if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
4954 *flags |= CPU_DTRACE_KPRIV;
4955 *illval = regs[r1];
4956 break;
4957 }
4958 /*FALLTHROUGH*/
4959 case DIF_OP_LDUB:
4960 regs[rd] = dtrace_load8(regs[r1]);
4961 break;
4962 case DIF_OP_RLDUH:
4963 if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
4964 *flags |= CPU_DTRACE_KPRIV;
4965 *illval = regs[r1];
4966 break;
4967 }
4968 /*FALLTHROUGH*/
4969 case DIF_OP_LDUH:
4970 regs[rd] = dtrace_load16(regs[r1]);
4971 break;
4972 case DIF_OP_RLDUW:
4973 if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
4974 *flags |= CPU_DTRACE_KPRIV;
4975 *illval = regs[r1];
4976 break;
4977 }
4978 /*FALLTHROUGH*/
4979 case DIF_OP_LDUW:
4980 regs[rd] = dtrace_load32(regs[r1]);
4981 break;
4982 case DIF_OP_RLDX:
4983 if (!dtrace_canstore(regs[r1], 8, mstate, vstate)) {
4984 *flags |= CPU_DTRACE_KPRIV;
4985 *illval = regs[r1];
4986 break;
4987 }
4988 /*FALLTHROUGH*/
4989 case DIF_OP_LDX:
4990 regs[rd] = dtrace_load64(regs[r1]);
4991 break;
4992 case DIF_OP_ULDSB:
4993 regs[rd] = (int8_t)
4994 dtrace_fuword8((void *)(uintptr_t)regs[r1]);
4995 break;
4996 case DIF_OP_ULDSH:
4997 regs[rd] = (int16_t)
4998 dtrace_fuword16((void *)(uintptr_t)regs[r1]);
4999 break;
5000 case DIF_OP_ULDSW:
5001 regs[rd] = (int32_t)
5002 dtrace_fuword32((void *)(uintptr_t)regs[r1]);
5003 break;
5004 case DIF_OP_ULDUB:
5005 regs[rd] =
5006 dtrace_fuword8((void *)(uintptr_t)regs[r1]);
5007 break;
5008 case DIF_OP_ULDUH:
5009 regs[rd] =
5010 dtrace_fuword16((void *)(uintptr_t)regs[r1]);
5011 break;
5012 case DIF_OP_ULDUW:
5013 regs[rd] =
5014 dtrace_fuword32((void *)(uintptr_t)regs[r1]);
5015 break;
5016 case DIF_OP_ULDX:
5017 regs[rd] =
5018 dtrace_fuword64((void *)(uintptr_t)regs[r1]);
5019 break;
5020 case DIF_OP_RET:
5021 rval = regs[rd];
5022 pc = textlen;
5023 break;
5024 case DIF_OP_NOP:
5025 break;
5026 case DIF_OP_SETX:
5027 regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
5028 break;
5029 case DIF_OP_SETS:
5030 regs[rd] = (uint64_t)(uintptr_t)
5031 (strtab + DIF_INSTR_STRING(instr));
5032 break;
5033 case DIF_OP_SCMP: {
5034 size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
5035 uintptr_t s1 = regs[r1];
5036 uintptr_t s2 = regs[r2];
5037
5038 if (s1 != NULL &&
5039 !dtrace_strcanload(s1, sz, mstate, vstate))
5040 break;
5041 if (s2 != NULL &&
5042 !dtrace_strcanload(s2, sz, mstate, vstate))
5043 break;
5044
5045 cc_r = dtrace_strncmp((char *)s1, (char *)s2, sz);
5046
5047 cc_n = cc_r < 0;
5048 cc_z = cc_r == 0;
5049 cc_v = cc_c = 0;
5050 break;
5051 }
5052 case DIF_OP_LDGA:
5053 regs[rd] = dtrace_dif_variable(mstate, state,
5054 r1, regs[r2]);
5055 break;
5056 case DIF_OP_LDGS:
5057 id = DIF_INSTR_VAR(instr);
5058
5059 if (id >= DIF_VAR_OTHER_UBASE) {
5060 uintptr_t a;
5061
5062 id -= DIF_VAR_OTHER_UBASE;
5063 svar = vstate->dtvs_globals[id];
5064 ASSERT(svar != NULL);
5065 v = &svar->dtsv_var;
5066
5067 if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
5068 regs[rd] = svar->dtsv_data;
5069 break;
5070 }
5071
5072 a = (uintptr_t)svar->dtsv_data;
5073
5074 if (*(uint8_t *)a == UINT8_MAX) {
5075 /*
5076 * If the 0th byte is set to UINT8_MAX
5077 * then this is to be treated as a
5078 * reference to a NULL variable.
5079 */
5080 regs[rd] = NULL;
5081 } else {
5082 regs[rd] = a + sizeof (uint64_t);
5083 }
5084
5085 break;
5086 }
5087
5088 regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
5089 break;
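
			/*
			 * The by-ref layout assumed above (and by DIF_OP_STGS
			 * and the local-variable cases below) is a one-byte
			 * NULL flag padded out to a uint64_t, followed by the
			 * variable's data:
			 *
			 *	+------+-------------+--------- ...
			 *	| flag |  (padding)  | data
			 *	+------+-------------+--------- ...
			 *	0      1             8
			 *
			 * flag == UINT8_MAX means "this variable is NULL";
			 * otherwise the data starts at a + sizeof (uint64_t).
			 */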
5090
5091 case DIF_OP_STGS:
5092 id = DIF_INSTR_VAR(instr);
5093
5094 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5095 id -= DIF_VAR_OTHER_UBASE;
5096
5097 svar = vstate->dtvs_globals[id];
5098 ASSERT(svar != NULL);
5099 v = &svar->dtsv_var;
5100
5101 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5102 uintptr_t a = (uintptr_t)svar->dtsv_data;
5103
5104 ASSERT(a != NULL);
5105 ASSERT(svar->dtsv_size != 0);
5106
5107 if (regs[rd] == NULL) {
5108 *(uint8_t *)a = UINT8_MAX;
5109 break;
5110 } else {
5111 *(uint8_t *)a = 0;
5112 a += sizeof (uint64_t);
5113 }
5114 if (!dtrace_vcanload(
5115 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5116 mstate, vstate))
5117 break;
5118
5119 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5120 (void *)a, &v->dtdv_type);
5121 break;
5122 }
5123
5124 svar->dtsv_data = regs[rd];
5125 break;
5126
5127 case DIF_OP_LDTA:
5128 /*
5129 * There are no DTrace built-in thread-local arrays at
5130 * present. This opcode is saved for future work.
5131 */
5132 *flags |= CPU_DTRACE_ILLOP;
5133 regs[rd] = 0;
5134 break;
5135
5136 case DIF_OP_LDLS:
5137 id = DIF_INSTR_VAR(instr);
5138
5139 if (id < DIF_VAR_OTHER_UBASE) {
5140 /*
5141 * For now, this has no meaning.
5142 */
5143 regs[rd] = 0;
5144 break;
5145 }
5146
5147 id -= DIF_VAR_OTHER_UBASE;
5148
5149 ASSERT(VBDTCAST(int64_t)id < vstate->dtvs_nlocals);
5150 ASSERT(vstate->dtvs_locals != NULL);
5151
5152 svar = vstate->dtvs_locals[id];
5153 ASSERT(svar != NULL);
5154 v = &svar->dtsv_var;
5155
5156 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5157 uintptr_t a = (uintptr_t)svar->dtsv_data;
5158 size_t sz = v->dtdv_type.dtdt_size;
5159
5160 sz += sizeof (uint64_t);
5161 ASSERT(svar->dtsv_size == NCPU * sz);
5162 a += VBDT_GET_CPUID() * sz;
5163
5164 if (*(uint8_t *)a == UINT8_MAX) {
5165 /*
5166 * If the 0th byte is set to UINT8_MAX
5167 * then this is to be treated as a
5168 * reference to a NULL variable.
5169 */
5170 regs[rd] = NULL;
5171 } else {
5172 regs[rd] = a + sizeof (uint64_t);
5173 }
5174
5175 break;
5176 }
5177
5178 ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
5179 tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
5180 regs[rd] = tmp[VBDT_GET_CPUID()];
5181 break;
5182
5183 case DIF_OP_STLS:
5184 id = DIF_INSTR_VAR(instr);
5185
5186 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5187 id -= DIF_VAR_OTHER_UBASE;
5188 ASSERT(VBDTCAST(int64_t)id < vstate->dtvs_nlocals);
5189
5190 ASSERT(vstate->dtvs_locals != NULL);
5191 svar = vstate->dtvs_locals[id];
5192 ASSERT(svar != NULL);
5193 v = &svar->dtsv_var;
5194
5195 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5196 uintptr_t a = (uintptr_t)svar->dtsv_data;
5197 size_t sz = v->dtdv_type.dtdt_size;
5198
5199 sz += sizeof (uint64_t);
5200 ASSERT(svar->dtsv_size == NCPU * sz);
5201 a += VBDT_GET_CPUID() * sz;
5202
5203 if (regs[rd] == NULL) {
5204 *(uint8_t *)a = UINT8_MAX;
5205 break;
5206 } else {
5207 *(uint8_t *)a = 0;
5208 a += sizeof (uint64_t);
5209 }
5210
5211 if (!dtrace_vcanload(
5212 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5213 mstate, vstate))
5214 break;
5215
5216 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5217 (void *)a, &v->dtdv_type);
5218 break;
5219 }
5220
5221 ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
5222 tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
5223 tmp[VBDT_GET_CPUID()] = regs[rd];
5224 break;
5225
5226 case DIF_OP_LDTS: {
5227 dtrace_dynvar_t *dvar;
5228 dtrace_key_t *key;
5229
5230 id = DIF_INSTR_VAR(instr);
5231 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5232 id -= DIF_VAR_OTHER_UBASE;
5233 v = &vstate->dtvs_tlocals[id];
5234
5235 key = &tupregs[DIF_DTR_NREGS];
5236 key[0].dttk_value = (uint64_t)id;
5237 key[0].dttk_size = 0;
5238 DTRACE_TLS_THRKEY(key[1].dttk_value);
5239 key[1].dttk_size = 0;
5240
5241 dvar = dtrace_dynvar(dstate, 2, key,
5242 sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
5243 mstate, vstate);
5244
5245 if (dvar == NULL) {
5246 regs[rd] = 0;
5247 break;
5248 }
5249
5250 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5251 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
5252 } else {
5253 regs[rd] = *((uint64_t *)dvar->dtdv_data);
5254 }
5255
5256 break;
5257 }
5258
5259 case DIF_OP_STTS: {
5260 dtrace_dynvar_t *dvar;
5261 dtrace_key_t *key;
5262
5263 id = DIF_INSTR_VAR(instr);
5264 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5265 id -= DIF_VAR_OTHER_UBASE;
5266
5267 key = &tupregs[DIF_DTR_NREGS];
5268 key[0].dttk_value = (uint64_t)id;
5269 key[0].dttk_size = 0;
5270 DTRACE_TLS_THRKEY(key[1].dttk_value);
5271 key[1].dttk_size = 0;
5272 v = &vstate->dtvs_tlocals[id];
5273
5274 dvar = dtrace_dynvar(dstate, 2, key,
5275 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5276 v->dtdv_type.dtdt_size : sizeof (uint64_t),
5277 regs[rd] ? DTRACE_DYNVAR_ALLOC :
5278 DTRACE_DYNVAR_DEALLOC, mstate, vstate);
5279
5280 /*
5281 * Given that we're storing to thread-local data,
5282 * we need to flush our predicate cache.
5283 */
5284 curthread->t_predcache = NULL;
5285
5286 if (dvar == NULL)
5287 break;
5288
5289 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5290 if (!dtrace_vcanload(
5291 (void *)(uintptr_t)regs[rd],
5292 &v->dtdv_type, mstate, vstate))
5293 break;
5294
5295 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5296 dvar->dtdv_data, &v->dtdv_type);
5297 } else {
5298 *((uint64_t *)dvar->dtdv_data) = regs[rd];
5299 }
5300
5301 break;
5302 }
5303
5304 case DIF_OP_SRA:
5305 regs[rd] = (int64_t)regs[r1] >> regs[r2];
5306 break;
5307
5308 case DIF_OP_CALL:
5309 dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
5310 regs, tupregs, ttop, mstate, state);
5311 break;
5312
5313 case DIF_OP_PUSHTR:
5314 if (ttop == DIF_DTR_NREGS) {
5315 *flags |= CPU_DTRACE_TUPOFLOW;
5316 break;
5317 }
5318
5319 if (r1 == DIF_TYPE_STRING) {
5320 /*
5321 * If this is a string type and the size is 0,
5322 * we'll use the system-wide default string
5323 * size. Note that we are _not_ looking at
5324 * the value of the DTRACEOPT_STRSIZE option;
5325 * had this been set, we would expect to have
5326 * a non-zero size value in the "pushtr".
5327 */
5328 tupregs[ttop].dttk_size =
5329 dtrace_strlen((char *)(uintptr_t)regs[rd],
5330 regs[r2] ? regs[r2] :
5331 dtrace_strsize_default) + 1;
5332 } else {
5333 tupregs[ttop].dttk_size = regs[r2];
5334 }
5335
5336 tupregs[ttop++].dttk_value = regs[rd];
5337 break;
5338
5339 case DIF_OP_PUSHTV:
5340 if (ttop == DIF_DTR_NREGS) {
5341 *flags |= CPU_DTRACE_TUPOFLOW;
5342 break;
5343 }
5344
5345 tupregs[ttop].dttk_value = regs[rd];
5346 tupregs[ttop++].dttk_size = 0;
5347 break;
5348
5349 case DIF_OP_POPTS:
5350 if (ttop != 0)
5351 ttop--;
5352 break;
5353
5354 case DIF_OP_FLUSHTS:
5355 ttop = 0;
5356 break;
5357
5358 case DIF_OP_LDGAA:
5359 case DIF_OP_LDTAA: {
5360 dtrace_dynvar_t *dvar;
5361 dtrace_key_t *key = tupregs;
5362 uint_t nkeys = ttop;
5363
5364 id = DIF_INSTR_VAR(instr);
5365 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5366 id -= DIF_VAR_OTHER_UBASE;
5367
5368 key[nkeys].dttk_value = (uint64_t)id;
5369 key[nkeys++].dttk_size = 0;
5370
5371 if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
5372 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
5373 key[nkeys++].dttk_size = 0;
5374 v = &vstate->dtvs_tlocals[id];
5375 } else {
5376 v = &vstate->dtvs_globals[id]->dtsv_var;
5377 }
5378
5379 dvar = dtrace_dynvar(dstate, nkeys, key,
5380 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5381 v->dtdv_type.dtdt_size : sizeof (uint64_t),
5382 DTRACE_DYNVAR_NOALLOC, mstate, vstate);
5383
5384 if (dvar == NULL) {
5385 regs[rd] = 0;
5386 break;
5387 }
5388
5389 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5390 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
5391 } else {
5392 regs[rd] = *((uint64_t *)dvar->dtdv_data);
5393 }
5394
5395 break;
5396 }
5397
5398 case DIF_OP_STGAA:
5399 case DIF_OP_STTAA: {
5400 dtrace_dynvar_t *dvar;
5401 dtrace_key_t *key = tupregs;
5402 uint_t nkeys = ttop;
5403
5404 id = DIF_INSTR_VAR(instr);
5405 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5406 id -= DIF_VAR_OTHER_UBASE;
5407
5408 key[nkeys].dttk_value = (uint64_t)id;
5409 key[nkeys++].dttk_size = 0;
5410
5411 if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
5412 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
5413 key[nkeys++].dttk_size = 0;
5414 v = &vstate->dtvs_tlocals[id];
5415 } else {
5416 v = &vstate->dtvs_globals[id]->dtsv_var;
5417 }
5418
5419 dvar = dtrace_dynvar(dstate, nkeys, key,
5420 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5421 v->dtdv_type.dtdt_size : sizeof (uint64_t),
5422 regs[rd] ? DTRACE_DYNVAR_ALLOC :
5423 DTRACE_DYNVAR_DEALLOC, mstate, vstate);
5424
5425 if (dvar == NULL)
5426 break;
5427
5428 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5429 if (!dtrace_vcanload(
5430 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5431 mstate, vstate))
5432 break;
5433
5434 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5435 dvar->dtdv_data, &v->dtdv_type);
5436 } else {
5437 *((uint64_t *)dvar->dtdv_data) = regs[rd];
5438 }
5439
5440 break;
5441 }
5442
5443 case DIF_OP_ALLOCS: {
5444 uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
5445 size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
5446
5447 /*
5448 * Rounding up the user allocation size could have
5449 * overflowed large, bogus allocations (like -1ULL) to
5450 * 0.
5451 */
5452 if (size < regs[r1] ||
5453 !DTRACE_INSCRATCH(mstate, size)) {
5454 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5455 regs[rd] = NULL;
5456 break;
5457 }
5458
5459 dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
5460 mstate->dtms_scratch_ptr += size;
5461 regs[rd] = ptr;
5462 break;
5463 }
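
		/*
		 * P2ROUNDUP() above is the usual power-of-two round-up,
		 * essentially ((x + 7) & ~7) for an alignment of 8.  The
		 * alignment slack is folded into size, so a bogus allocation
		 * request near -1ULL wraps size around to a small value --
		 * hence the explicit "size < regs[r1]" overflow test in
		 * addition to the scratch-space check.
		 */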
5464
5465 case DIF_OP_COPYS:
5466 if (!dtrace_canstore(regs[rd], regs[r2],
5467 mstate, vstate)) {
5468 *flags |= CPU_DTRACE_BADADDR;
5469 *illval = regs[rd];
5470 break;
5471 }
5472
5473 if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
5474 break;
5475
5476 dtrace_bcopy((void *)(uintptr_t)regs[r1],
5477 (void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
5478 break;
5479
5480 case DIF_OP_STB:
5481 if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
5482 *flags |= CPU_DTRACE_BADADDR;
5483 *illval = regs[rd];
5484 break;
5485 }
5486 *((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
5487 break;
5488
5489 case DIF_OP_STH:
5490 if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
5491 *flags |= CPU_DTRACE_BADADDR;
5492 *illval = regs[rd];
5493 break;
5494 }
5495 if (regs[rd] & 1) {
5496 *flags |= CPU_DTRACE_BADALIGN;
5497 *illval = regs[rd];
5498 break;
5499 }
5500 *((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
5501 break;
5502
5503 case DIF_OP_STW:
5504 if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
5505 *flags |= CPU_DTRACE_BADADDR;
5506 *illval = regs[rd];
5507 break;
5508 }
5509 if (regs[rd] & 3) {
5510 *flags |= CPU_DTRACE_BADALIGN;
5511 *illval = regs[rd];
5512 break;
5513 }
5514 *((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
5515 break;
5516
5517 case DIF_OP_STX:
5518 if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
5519 *flags |= CPU_DTRACE_BADADDR;
5520 *illval = regs[rd];
5521 break;
5522 }
5523 if (regs[rd] & 7) {
5524 *flags |= CPU_DTRACE_BADALIGN;
5525 *illval = regs[rd];
5526 break;
5527 }
5528 *((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
5529 break;
5530 }
5531 }
5532
5533 if (!(*flags & CPU_DTRACE_FAULT))
5534 return (rval);
5535
5536 mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
5537 mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
5538
5539 return (0);
5540}
5541
5542#ifndef VBOX /* no destructive stuff */
5543
5544static void
5545dtrace_action_breakpoint(dtrace_ecb_t *ecb)
5546{
5547 dtrace_probe_t *probe = ecb->dte_probe;
5548 dtrace_provider_t *prov = probe->dtpr_provider;
5549 char c[DTRACE_FULLNAMELEN + 80], *str;
5550 char *msg = "dtrace: breakpoint action at probe ";
5551 char *ecbmsg = " (ecb ";
5552 uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
5553 uintptr_t val = (uintptr_t)ecb;
5554 int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
5555
5556 if (dtrace_destructive_disallow)
5557 return;
5558
5559 /*
5560 * It's impossible to be taking action on the NULL probe.
5561 */
5562 ASSERT(probe != NULL);
5563
5564 /*
5565 * This is a poor man's (destitute man's?) sprintf(): we want to
5566 * print the provider name, module name, function name and name of
5567 * the probe, along with the hex address of the ECB with the breakpoint
5568 * action -- all of which we must place in the character buffer by
5569 * hand.
5570 */
5571 while (*msg != '\0')
5572 c[i++] = *msg++;
5573
5574 for (str = prov->dtpv_name; *str != '\0'; str++)
5575 c[i++] = *str;
5576 c[i++] = ':';
5577
5578 for (str = probe->dtpr_mod; *str != '\0'; str++)
5579 c[i++] = *str;
5580 c[i++] = ':';
5581
5582 for (str = probe->dtpr_func; *str != '\0'; str++)
5583 c[i++] = *str;
5584 c[i++] = ':';
5585
5586 for (str = probe->dtpr_name; *str != '\0'; str++)
5587 c[i++] = *str;
5588
5589 while (*ecbmsg != '\0')
5590 c[i++] = *ecbmsg++;
5591
5592 while (shift >= 0) {
5593 mask = (uintptr_t)0xf << shift;
5594
5595 if (val >= ((uintptr_t)1 << shift))
5596 c[i++] = "0123456789abcdef"[(val & mask) >> shift];
5597 shift -= 4;
5598 }
5599
5600 c[i++] = ')';
5601 c[i] = '\0';
5602
5603 debug_enter(c);
5604}
5605
5606static void
5607dtrace_action_panic(dtrace_ecb_t *ecb)
5608{
5609 dtrace_probe_t *probe = ecb->dte_probe;
5610
5611 /*
5612 * It's impossible to be taking action on the NULL probe.
5613 */
5614 ASSERT(probe != NULL);
5615
5616 if (dtrace_destructive_disallow)
5617 return;
5618
5619 if (dtrace_panicked != NULL)
5620 return;
5621
5622 if (dtrace_casptr(&dtrace_panicked, NULL, curthread) != NULL)
5623 return;
5624
5625 /*
5626 * We won the right to panic. (We want to be sure that only one
5627 * thread calls panic() from dtrace_probe(), and that panic() is
5628 * called exactly once.)
5629 */
5630 dtrace_panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
5631 probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
5632 probe->dtpr_func, probe->dtpr_name, (void *)ecb);
5633}
5634
5635static void
5636dtrace_action_raise(uint64_t sig)
5637{
5638 if (dtrace_destructive_disallow)
5639 return;
5640
5641 if (sig >= NSIG) {
5642 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5643 return;
5644 }
5645
5646 /*
5647 * raise() has a queue depth of 1 -- we ignore all subsequent
5648 * invocations of the raise() action.
5649 */
5650 if (curthread->t_dtrace_sig == 0)
5651 curthread->t_dtrace_sig = (uint8_t)sig;
5652
5653 curthread->t_sig_check = 1;
5654 aston(curthread);
5655}
5656
5657static void
5658dtrace_action_stop(void)
5659{
5660 if (dtrace_destructive_disallow)
5661 return;
5662
5663 if (!curthread->t_dtrace_stop) {
5664 curthread->t_dtrace_stop = 1;
5665 curthread->t_sig_check = 1;
5666 aston(curthread);
5667 }
5668}
5669
5670static void
5671dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
5672{
5673 hrtime_t now;
5674 volatile uint16_t *flags;
5675 cpu_t *cpu = CPU;
5676
5677 if (dtrace_destructive_disallow)
5678 return;
5679
5680 flags = (volatile uint16_t *)&cpu_core[cpu->cpu_id].cpuc_dtrace_flags;
5681
5682 now = dtrace_gethrtime();
5683
5684 if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
5685 /*
5686 * We need to advance the mark to the current time.
5687 */
5688 cpu->cpu_dtrace_chillmark = now;
5689 cpu->cpu_dtrace_chilled = 0;
5690 }
5691
5692 /*
5693 * Now check to see if the requested chill time would take us over
5694 * the maximum amount of time allowed in the chill interval. (Or
5695 * worse, if the calculation itself induces overflow.)
5696 */
5697 if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
5698 cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
5699 *flags |= CPU_DTRACE_ILLOP;
5700 return;
5701 }
5702
5703 while (dtrace_gethrtime() - now < val)
5704 continue;
5705
5706 /*
5707 * Normally, we assure that the value of the variable "timestamp" does
5708 * not change within an ECB. The presence of chill() represents an
5709 * exception to this rule, however.
5710 */
5711 mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
5712 cpu->cpu_dtrace_chilled += val;
5713}
5714
5715#endif /* !VBOX */
5716
5717static void
5718dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
5719 uint64_t *buf, uint64_t arg)
5720{
5721 int nframes = DTRACE_USTACK_NFRAMES(arg);
5722 int strsize = DTRACE_USTACK_STRSIZE(arg);
5723 uint64_t *pcs = &buf[1], *fps;
5724 char *str = (char *)&pcs[nframes];
5725 int size, offs = 0, i, j;
5726 uintptr_t old = mstate->dtms_scratch_ptr, saved;
5727#ifndef VBOX
5728 uint16_t *flags = &cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags;
5729#else
5730 uint16_t volatile *flags = &cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags;
5731#endif
5732 char *sym;
5733
5734 /*
5735 * Should be taking a faster path if string space has not been
5736 * allocated.
5737 */
5738 ASSERT(strsize != 0);
5739
5740 /*
5741 * We will first allocate some temporary space for the frame pointers.
5742 */
5743 fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
5744 size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
5745 (nframes * sizeof (uint64_t));
5746
5747 if (!DTRACE_INSCRATCH(mstate, VBDTCAST(unsigned)size)) {
5748 /*
5749 * Not enough room for our frame pointers -- need to indicate
5750 * that we ran out of scratch space.
5751 */
5752 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5753 return;
5754 }
5755
5756 mstate->dtms_scratch_ptr += size;
5757 saved = mstate->dtms_scratch_ptr;
5758
5759 /*
5760 * Now get a stack with both program counters and frame pointers.
5761 */
5762 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5763 dtrace_getufpstack(buf, fps, nframes + 1);
5764 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5765
5766 /*
5767 * If that faulted, we're cooked.
5768 */
5769 if (*flags & CPU_DTRACE_FAULT)
5770 goto out;
5771
5772 /*
5773 * Now we want to walk up the stack, calling the USTACK helper. For
5774 * each iteration, we restore the scratch pointer.
5775 */
5776 for (i = 0; i < nframes; i++) {
5777 mstate->dtms_scratch_ptr = saved;
5778
5779 if (offs >= strsize)
5780 break;
5781
5782#ifndef VBOX
5783 sym = (char *)(uintptr_t)dtrace_helper(
5784 DTRACE_HELPER_ACTION_USTACK,
5785 mstate, state, pcs[i], fps[i]);
5786#else
5787 sym = NULL;
5788#endif
5789
5790 /*
5791 * If we faulted while running the helper, we're going to
5792 * clear the fault and null out the corresponding string.
5793 */
5794 if (*flags & CPU_DTRACE_FAULT) {
5795 *flags &= ~CPU_DTRACE_FAULT;
5796 str[offs++] = '\0';
5797 continue;
5798 }
5799
5800 if (sym == NULL) {
5801 str[offs++] = '\0';
5802 continue;
5803 }
5804
5805 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5806
5807 /*
5808 * Now copy in the string that the helper returned to us.
5809 */
5810 for (j = 0; offs + j < strsize; j++) {
5811 if ((str[offs + j] = sym[j]) == '\0')
5812 break;
5813 }
5814
5815 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5816
5817 offs += j + 1;
5818 }
5819
5820 if (offs >= strsize) {
5821 /*
5822 * If we didn't have room for all of the strings, we don't
5823 * abort processing -- this needn't be a fatal error -- but we
5824 * still want to increment a counter (dts_stkstroverflows) to
5825 * allow this condition to be warned about. (If this is from
5826 * a jstack() action, it is easily tuned via jstackstrsize.)
5827 */
5828 dtrace_error(&state->dts_stkstroverflows);
5829 }
5830
5831 while (offs < strsize)
5832 str[offs++] = '\0';
5833
5834out:
5835 mstate->dtms_scratch_ptr = old;
5836}
5837
5838#ifdef VBOX
5839extern void dtrace_probe6(dtrace_id_t, uintptr_t arg0, uintptr_t arg1,
5840 uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5);
5841# define dtrace_probe_error(a1, a2, a3, a4, a5, a6) \
5842 dtrace_probe6(dtrace_probeid_error, (uintptr_t)a1, a2, a3, a4, a5, a6)
5843#endif
5844
5845/*
5846 * If you're looking for the epicenter of DTrace, you just found it. This
5847 * is the function called by the provider to fire a probe -- from which all
5848 * subsequent probe-context DTrace activity emanates.
5849 */
5850void
5851dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1,
5852 uintptr_t arg2, uintptr_t arg3, uintptr_t arg4)
5853{
5854 processorid_t cpuid;
5855 dtrace_icookie_t cookie;
5856 dtrace_probe_t *probe;
5857 dtrace_mstate_t mstate;
5858 dtrace_ecb_t *ecb;
5859 dtrace_action_t *act;
5860 intptr_t offs;
5861 size_t size;
5862 int vtime, onintr;
5863 volatile uint16_t *flags;
5864 hrtime_t now;
5865
5866#ifndef VBOX
5867 /*
5868 * Kick out immediately if this CPU is still being born (in which case
5869 * curthread will be set to -1) or the current thread can't allow
5870 * probes in its current context.
5871 */
5872 if (((uintptr_t)curthread & 1) || (curthread->t_flag & T_DONTDTRACE))
5873 return;
5874#endif
5875
5876 cookie = dtrace_interrupt_disable();
5877 probe = dtrace_probes[id - 1];
5878 cpuid = VBDT_GET_CPUID();
5879 onintr = CPU_ON_INTR(CPU);
5880
5881 if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
5882 probe->dtpr_predcache == curthread->t_predcache) {
5883 /*
5884 * We have hit in the predicate cache; we know that
5885 * this predicate would evaluate to be false.
5886 */
5887 dtrace_interrupt_enable(cookie);
5888 return;
5889 }
5890
5891#ifndef VBOX
5892 if (panic_quiesce) {
5893 /*
5894 * We don't trace anything if we're panicking.
5895 */
5896 dtrace_interrupt_enable(cookie);
5897 return;
5898 }
5899#endif
5900
5901 now = dtrace_gethrtime();
5902 vtime = dtrace_vtime_references != 0;
5903
5904 if (vtime && curthread->t_dtrace_start)
5905 curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;
5906
5907 mstate.dtms_difo = NULL;
5908 mstate.dtms_probe = probe;
5909 mstate.dtms_strtok = NULL;
5910 mstate.dtms_arg[0] = arg0;
5911 mstate.dtms_arg[1] = arg1;
5912 mstate.dtms_arg[2] = arg2;
5913 mstate.dtms_arg[3] = arg3;
5914 mstate.dtms_arg[4] = arg4;
5915
5916 flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;
5917
5918 for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
5919 dtrace_predicate_t *pred = ecb->dte_predicate;
5920 dtrace_state_t *state = ecb->dte_state;
5921 dtrace_buffer_t *buf = &state->dts_buffer[cpuid];
5922 dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
5923 dtrace_vstate_t *vstate = &state->dts_vstate;
5924 dtrace_provider_t *prov = probe->dtpr_provider;
5925 int committed = 0;
5926 caddr_t tomax;
5927
5928 /*
5929 * A little subtlety with the following (seemingly innocuous)
5930 * declaration of the automatic 'val': by looking at the
5931 * code, you might think that it could be declared in the
5932 * action processing loop, below. (That is, it's only used in
5933 * the action processing loop.) However, it must be declared
5934 * out of that scope because in the case of DIF expression
5935 * arguments to aggregating actions, one iteration of the
5936 * action loop will use the last iteration's value.
5937 */
5938#ifdef lint
5939 uint64_t val = 0;
5940#else
5941 uint64_t val VBDTUNASS(0);
5942#endif
5943
5944 mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
5945 *flags &= ~CPU_DTRACE_ERROR;
5946
5947 if (prov == dtrace_provider) {
5948 /*
5949 * If dtrace itself is the provider of this probe,
5950 * we're only going to continue processing the ECB if
5951 * arg0 (the dtrace_state_t) is equal to the ECB's
5952 * creating state. (This prevents disjoint consumers
5953 * from seeing one another's metaprobes.)
5954 */
5955 if (arg0 != (uint64_t)(uintptr_t)state)
5956 continue;
5957 }
5958
5959 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {
5960 /*
5961 * We're not currently active. If our provider isn't
5962 * the dtrace pseudo provider, we're not interested.
5963 */
5964 if (prov != dtrace_provider)
5965 continue;
5966
5967 /*
5968 * Now we must further check if we are in the BEGIN
5969 * probe. If we are, we will only continue processing
5970 * if we're still in WARMUP -- if one BEGIN enabling
5971 * has invoked the exit() action, we don't want to
5972 * evaluate subsequent BEGIN enablings.
5973 */
5974 if (probe->dtpr_id == dtrace_probeid_begin &&
5975 state->dts_activity != DTRACE_ACTIVITY_WARMUP) {
5976 ASSERT(state->dts_activity ==
5977 DTRACE_ACTIVITY_DRAINING);
5978 continue;
5979 }
5980 }
5981
5982 if (ecb->dte_cond) {
5983 /*
5984 * If the dte_cond bits indicate that this
5985 * consumer is only allowed to see user-mode firings
5986 * of this probe, call the provider's dtps_usermode()
5987 * entry point to check that the probe was fired
5988 * while in a user context. Skip this ECB if that's
5989 * not the case.
5990 */
5991 if ((ecb->dte_cond & DTRACE_COND_USERMODE) &&
5992 prov->dtpv_pops.dtps_usermode(prov->dtpv_arg,
5993 probe->dtpr_id, probe->dtpr_arg) == 0)
5994 continue;
5995
5996 /*
5997 * This is more subtle than it looks. We have to be
5998 * absolutely certain that CRED() isn't going to
5999 * change out from under us so it's only legit to
6000 * examine that structure if we're in constrained
6001			 * situations. Currently, the only time we'll make this
6002			 * check is if a non-super-user has enabled the
6003 * profile or syscall providers -- providers that
6004 * allow visibility of all processes. For the
6005 * profile case, the check above will ensure that
6006 * we're examining a user context.
6007 */
6008 if (ecb->dte_cond & DTRACE_COND_OWNER) {
6009 cred_t *cr;
6010 cred_t *s_cr =
6011 ecb->dte_state->dts_cred.dcr_cred;
6012#ifndef VBOX
6013 proc_t *proc;
6014#endif
6015
6016 ASSERT(s_cr != NULL);
6017
6018 if ((cr = CRED()) == NULL ||
6019 s_cr->cr_uid != cr->cr_uid ||
6020 s_cr->cr_uid != cr->cr_ruid ||
6021 s_cr->cr_uid != cr->cr_suid ||
6022 s_cr->cr_gid != cr->cr_gid ||
6023 s_cr->cr_gid != cr->cr_rgid ||
6024 s_cr->cr_gid != cr->cr_sgid ||
6025#ifndef VBOX
6026 (proc = VBDT_GET_PROC()) == NULL ||
6027 (proc->p_flag & SNOCD))
6028#else
6029 0)
6030
6031#endif
6032 continue;
6033 }
6034
6035#ifndef VBOX
6036 if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
6037 cred_t *cr;
6038 cred_t *s_cr =
6039 ecb->dte_state->dts_cred.dcr_cred;
6040
6041 ASSERT(s_cr != NULL);
6042
6043 if ((cr = CRED()) == NULL ||
6044 s_cr->cr_zone->zone_id !=
6045 cr->cr_zone->zone_id)
6046 continue;
6047 }
6048#endif
6049 }
6050
6051 if (now - state->dts_alive > dtrace_deadman_timeout) {
6052 /*
6053		 * We seem to be dead. Unless we (a) have kernel
6054		 * destructive permissions, (b) have explicitly enabled
6055		 * destructive actions, and (c) destructive actions have
6056 * not been disabled, we're going to transition into
6057 * the KILLED state, from which no further processing
6058 * on this state will be performed.
6059 */
6060 if (!dtrace_priv_kernel_destructive(state) ||
6061 !state->dts_cred.dcr_destructive ||
6062 dtrace_destructive_disallow) {
6063 void *activity = &state->dts_activity;
6064 dtrace_activity_t current;
6065
6066 do {
6067 current = state->dts_activity;
6068			} while ((dtrace_activity_t)dtrace_cas32(activity,
6069			    current, DTRACE_ACTIVITY_KILLED) != current);
6070
6071 continue;
6072 }
6073 }
6074
6075 if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
6076 ecb->dte_alignment, state, &mstate)) < 0)
6077 continue;
6078
6079 tomax = buf->dtb_tomax;
6080 ASSERT(tomax != NULL);
6081
6082 if (ecb->dte_size != 0)
6083 DTRACE_STORE(uint32_t, tomax, offs, ecb->dte_epid);
6084
6085 mstate.dtms_epid = ecb->dte_epid;
6086 mstate.dtms_present |= DTRACE_MSTATE_EPID;
6087
6088 if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
6089 mstate.dtms_access = DTRACE_ACCESS_KERNEL;
6090 else
6091 mstate.dtms_access = 0;
6092
6093 if (pred != NULL) {
6094 dtrace_difo_t *dp = pred->dtp_difo;
6095 int rval;
6096
6097 rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
6098
6099 if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
6100 dtrace_cacheid_t cid = probe->dtpr_predcache;
6101
6102 if (cid != DTRACE_CACHEIDNONE && !onintr) {
6103 /*
6104 * Update the predicate cache...
6105 */
6106 ASSERT(cid == pred->dtp_cacheid);
6107 curthread->t_predcache = cid;
6108 }
6109
6110 continue;
6111 }
6112 }
6113
6114 for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&
6115 act != NULL; act = act->dta_next) {
6116 size_t valoffs;
6117 dtrace_difo_t *dp;
6118 dtrace_recdesc_t *rec = &act->dta_rec;
6119
6120 size = rec->dtrd_size;
6121 valoffs = offs + rec->dtrd_offset;
6122
6123 if (DTRACEACT_ISAGG(act->dta_kind)) {
6124 uint64_t v = 0xbad;
6125 dtrace_aggregation_t *agg;
6126
6127 agg = (dtrace_aggregation_t *)act;
6128
6129 if ((dp = act->dta_difo) != NULL)
6130 v = dtrace_dif_emulate(dp,
6131 &mstate, vstate, state);
6132
6133 if (*flags & CPU_DTRACE_ERROR)
6134 continue;
6135
6136 /*
6137 * Note that we always pass the expression
6138 * value from the previous iteration of the
6139 * action loop. This value will only be used
6140 * if there is an expression argument to the
6141 * aggregating action, denoted by the
6142 * dtag_hasarg field.
6143 */
6144 dtrace_aggregate(agg, buf,
6145 offs, aggbuf, v, val);
6146 continue;
6147 }
6148
6149 switch (act->dta_kind) {
6150 case DTRACEACT_STOP:
6151#ifndef VBOX
6152 if (dtrace_priv_proc_destructive(state))
6153 dtrace_action_stop();
6154#else
6155 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6156#endif
6157 continue;
6158
6159 case DTRACEACT_BREAKPOINT:
6160#ifndef VBOX
6161 if (dtrace_priv_kernel_destructive(state))
6162 dtrace_action_breakpoint(ecb);
6163#else
6164 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6165#endif
6166 continue;
6167
6168 case DTRACEACT_PANIC:
6169#ifndef VBOX
6170 if (dtrace_priv_kernel_destructive(state))
6171 dtrace_action_panic(ecb);
6172#endif
6173 continue;
6174
6175 case DTRACEACT_STACK:
6176 if (!dtrace_priv_kernel(state))
6177 continue;
6178
6179 dtrace_getpcstack((pc_t *)(tomax + valoffs),
6180 VBDTCAST(int)(size / sizeof (pc_t)), probe->dtpr_aframes,
6181 DTRACE_ANCHORED(probe) ? NULL :
6182 (uint32_t *)arg0);
6183
6184 continue;
6185
6186 case DTRACEACT_JSTACK:
6187 case DTRACEACT_USTACK:
6188 if (!dtrace_priv_proc(state))
6189 continue;
6190
6191 /*
6192 * See comment in DIF_VAR_PID.
6193 */
6194 if (DTRACE_ANCHORED(mstate.dtms_probe) &&
6195 CPU_ON_INTR(CPU)) {
6196 int depth = DTRACE_USTACK_NFRAMES(
6197 rec->dtrd_arg) + 1;
6198
6199 dtrace_bzero((void *)(tomax + valoffs),
6200 DTRACE_USTACK_STRSIZE(rec->dtrd_arg)
6201 + depth * sizeof (uint64_t));
6202
6203 continue;
6204 }
6205
6206#ifndef VBOX /* no helpers */
6207 if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 &&
6208 curproc->p_dtrace_helpers != NULL) {
6209 /*
6210 * This is the slow path -- we have
6211 * allocated string space, and we're
6212 * getting the stack of a process that
6213 * has helpers. Call into a separate
6214 * routine to perform this processing.
6215 */
6216 dtrace_action_ustack(&mstate, state,
6217 (uint64_t *)(tomax + valoffs),
6218 rec->dtrd_arg);
6219 continue;
6220 }
6221#endif
6222
6223 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6224 dtrace_getupcstack((uint64_t *)
6225 (tomax + valoffs),
6226 DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1);
6227 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6228 continue;
6229
6230 default:
6231 break;
6232 }
6233
6234 dp = act->dta_difo;
6235 ASSERT(dp != NULL);
6236
6237 val = dtrace_dif_emulate(dp, &mstate, vstate, state);
6238
6239 if (*flags & CPU_DTRACE_ERROR)
6240 continue;
6241
6242 switch (act->dta_kind) {
6243 case DTRACEACT_SPECULATE:
6244 ASSERT(buf == &state->dts_buffer[cpuid]);
6245 buf = dtrace_speculation_buffer(state,
6246 cpuid, val);
6247
6248 if (buf == NULL) {
6249 *flags |= CPU_DTRACE_DROP;
6250 continue;
6251 }
6252
6253 offs = dtrace_buffer_reserve(buf,
6254 ecb->dte_needed, ecb->dte_alignment,
6255 state, NULL);
6256
6257 if (offs < 0) {
6258 *flags |= CPU_DTRACE_DROP;
6259 continue;
6260 }
6261
6262 tomax = buf->dtb_tomax;
6263 ASSERT(tomax != NULL);
6264
6265 if (ecb->dte_size != 0)
6266 DTRACE_STORE(uint32_t, tomax, offs,
6267 ecb->dte_epid);
6268 continue;
6269
6270 case DTRACEACT_CHILL:
6271#ifndef VBOX
6272 if (dtrace_priv_kernel_destructive(state))
6273 dtrace_action_chill(&mstate, val);
6274#else
6275 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6276#endif
6277 continue;
6278
6279 case DTRACEACT_RAISE:
6280#ifndef VBOX
6281 if (dtrace_priv_proc_destructive(state))
6282 dtrace_action_raise(val);
6283#else
6284 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6285#endif
6286 continue;
6287
6288 case DTRACEACT_COMMIT:
6289 ASSERT(!committed);
6290
6291 /*
6292 * We need to commit our buffer state.
6293 */
6294 if (ecb->dte_size)
6295 buf->dtb_offset = offs + ecb->dte_size;
6296 buf = &state->dts_buffer[cpuid];
6297 dtrace_speculation_commit(state, cpuid, val);
6298 committed = 1;
6299 continue;
6300
6301 case DTRACEACT_DISCARD:
6302 dtrace_speculation_discard(state, cpuid, val);
6303 continue;
6304
6305 case DTRACEACT_DIFEXPR:
6306 case DTRACEACT_LIBACT:
6307 case DTRACEACT_PRINTF:
6308 case DTRACEACT_PRINTA:
6309 case DTRACEACT_SYSTEM:
6310 case DTRACEACT_FREOPEN:
6311 break;
6312
6313 case DTRACEACT_SYM:
6314 case DTRACEACT_MOD:
6315 if (!dtrace_priv_kernel(state))
6316 continue;
6317 break;
6318
6319 case DTRACEACT_USYM:
6320 case DTRACEACT_UMOD:
6321 case DTRACEACT_UADDR: {
6322#ifndef VBOX
6323 struct pid *pid = curthread->t_procp->p_pidp;
6324
6325 if (!dtrace_priv_proc(state))
6326 continue;
6327
6328 DTRACE_STORE(uint64_t, tomax,
6329 valoffs, (uint64_t)pid->pid_id);
6330 DTRACE_STORE(uint64_t, tomax,
6331 valoffs + sizeof (uint64_t), val);
6332#else
6333 DTRACE_CPUFLAG_SET(CPU_DTRACE_UPRIV);
6334#endif
6335 continue;
6336 }
6337
6338 case DTRACEACT_EXIT: {
6339 /*
6340 * For the exit action, we are going to attempt
6341 * to atomically set our activity to be
6342 * draining. If this fails (either because
6343 * another CPU has beat us to the exit action,
6344 * or because our current activity is something
6345 * other than ACTIVE or WARMUP), we will
6346 * continue. This assures that the exit action
6347 * can be successfully recorded at most once
6348 * when we're in the ACTIVE state. If we're
6349 * encountering the exit() action while in
6350 * COOLDOWN, however, we want to honor the new
6351 * status code. (We know that we're the only
6352 * thread in COOLDOWN, so there is no race.)
6353 */
6354 void *activity = &state->dts_activity;
6355 dtrace_activity_t current = state->dts_activity;
6356
6357 if (current == DTRACE_ACTIVITY_COOLDOWN)
6358 break;
6359
6360 if (current != DTRACE_ACTIVITY_WARMUP)
6361 current = DTRACE_ACTIVITY_ACTIVE;
6362
6363				if ((dtrace_activity_t)dtrace_cas32(activity,
6364				    current, DTRACE_ACTIVITY_DRAINING) != current) {
6365 *flags |= CPU_DTRACE_DROP;
6366 continue;
6367 }
6368
6369 break;
6370 }
6371
6372 default:
6373#ifndef VBOX
6374 ASSERT(0);
6375#else
6376 AssertFatalMsgFailed(("%d\n", act->dta_kind));
6377#endif
6378 }
6379
6380 if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF) {
6381 uintptr_t end = valoffs + size;
6382
6383 if (!dtrace_vcanload((void *)(uintptr_t)val,
6384 &dp->dtdo_rtype, &mstate, vstate))
6385 continue;
6386
6387 /*
6388 * If this is a string, we're going to only
6389 * load until we find the zero byte -- after
6390 * which we'll store zero bytes.
6391 */
6392 if (dp->dtdo_rtype.dtdt_kind ==
6393 DIF_TYPE_STRING) {
6394				char c = '\0' + 1; /* any non-NUL value, to prime the loop */
6395 int intuple = act->dta_intuple;
6396 size_t s;
6397
6398 for (s = 0; s < size; s++) {
6399 if (c != '\0')
6400 c = dtrace_load8(val++);
6401
6402 DTRACE_STORE(uint8_t, tomax,
6403 valoffs++, c);
6404
6405 if (c == '\0' && intuple)
6406 break;
6407 }
6408
6409 continue;
6410 }
6411
6412 while (valoffs < end) {
6413 DTRACE_STORE(uint8_t, tomax, valoffs++,
6414 dtrace_load8(val++));
6415 }
6416
6417 continue;
6418 }
6419
6420 switch (size) {
6421 case 0:
6422 break;
6423
6424 case sizeof (uint8_t):
6425 DTRACE_STORE(uint8_t, tomax, valoffs, val);
6426 break;
6427 case sizeof (uint16_t):
6428 DTRACE_STORE(uint16_t, tomax, valoffs, val);
6429 break;
6430 case sizeof (uint32_t):
6431 DTRACE_STORE(uint32_t, tomax, valoffs, val);
6432 break;
6433 case sizeof (uint64_t):
6434 DTRACE_STORE(uint64_t, tomax, valoffs, val);
6435 break;
6436 default:
6437 /*
6438 * Any other size should have been returned by
6439 * reference, not by value.
6440 */
6441#ifndef VBOX
6442 ASSERT(0);
6443#else
6444 AssertFatalMsgFailed(("%zu\n", size));
6445#endif
6446 break;
6447 }
6448 }
6449
6450 if (*flags & CPU_DTRACE_DROP)
6451 continue;
6452
6453 if (*flags & CPU_DTRACE_FAULT) {
6454 int ndx;
6455 dtrace_action_t *err;
6456
6457 buf->dtb_errors++;
6458
6459 if (probe->dtpr_id == dtrace_probeid_error) {
6460 /*
6461 * There's nothing we can do -- we had an
6462 * error on the error probe. We bump an
6463 * error counter to at least indicate that
6464 * this condition happened.
6465 */
6466 dtrace_error(&state->dts_dblerrors);
6467 continue;
6468 }
6469
6470 if (vtime) {
6471 /*
6472 * Before recursing on dtrace_probe(), we
6473 * need to explicitly clear out our start
6474 * time to prevent it from being accumulated
6475 * into t_dtrace_vtime.
6476 */
6477 curthread->t_dtrace_start = 0;
6478 }
6479
6480 /*
6481 * Iterate over the actions to figure out which action
6482 * we were processing when we experienced the error.
6483 * Note that act points _past_ the faulting action; if
6484 * act is ecb->dte_action, the fault was in the
6485 * predicate, if it's ecb->dte_action->dta_next it's
6486 * in action #1, and so on.
6487 */
6488 for (err = ecb->dte_action, ndx = 0;
6489 err != act; err = err->dta_next, ndx++)
6490 continue;
6491
6492 dtrace_probe_error(state, ecb->dte_epid, ndx,
6493 (mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
6494 mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),
6495 cpu_core[cpuid].cpuc_dtrace_illval);
6496
6497 continue;
6498 }
6499
6500 if (!committed)
6501 buf->dtb_offset = offs + ecb->dte_size;
6502 }
6503
6504 if (vtime)
6505 curthread->t_dtrace_start = dtrace_gethrtime();
6506
6507 dtrace_interrupt_enable(cookie);
6508}
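
#if 0
/*
 * Illustrative sketch (not built): how a provider typically fires a
 * probe.  The "example_*" names are hypothetical; a real provider keeps
 * the dtrace_id_t returned by dtrace_probe_create() and passes whatever
 * arguments its probes define.
 */
static dtrace_id_t example_probe_id;	/* from dtrace_probe_create() */

static void
example_instrumentation_point(uintptr_t a0, uintptr_t a1)
{
	/* arg0..arg4; unused trailing arguments are simply zero */
	dtrace_probe(example_probe_id, a0, a1, 0, 0, 0);
}
#endif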
6509
6510/*
6511 * DTrace Probe Hashing Functions
6512 *
6513 * The functions in this section (and indeed, the functions in remaining
6514 * sections) are not _called_ from probe context. (Any exceptions to this are
6515 * marked with a "Note:".) Rather, they are called from elsewhere in the
6516 * DTrace framework to look up probes in, add probes to, and remove probes from
6517 * the DTrace probe hashes. (Each probe is hashed by each element of the
6518 * probe tuple -- allowing for fast lookups, regardless of what was
6519 * specified.)
6520 */
6521static uint_t
6522dtrace_hash_str(char *p)
6523{
6524 unsigned int g;
6525 uint_t hval = 0;
6526
6527 while (*p) {
6528 hval = (hval << 4) + *p++;
6529 if ((g = (hval & 0xf0000000)) != 0)
6530 hval ^= g >> 24;
6531 hval &= ~g;
6532 }
6533 return (hval);
6534}
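
/*
 * Worked example (illustrative): for the string "ab", the loop computes
 * hval = (0 << 4) + 'a' = 97, then hval = (97 << 4) + 'b' = 1650.  The
 * folding through 'g' only engages once hval grows into the top four
 * bits, mixing the high-order bits back into the hash for long strings.
 */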
6535
6536static dtrace_hash_t *
6537dtrace_hash_create(uintptr_t stroffs, uintptr_t nextoffs, uintptr_t prevoffs)
6538{
6539 dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
6540
6541 hash->dth_stroffs = stroffs;
6542 hash->dth_nextoffs = nextoffs;
6543 hash->dth_prevoffs = prevoffs;
6544
6545 hash->dth_size = 1;
6546 hash->dth_mask = hash->dth_size - 1;
6547
6548 hash->dth_tab = kmem_zalloc(hash->dth_size *
6549 sizeof (dtrace_hashbucket_t *), KM_SLEEP);
6550
6551 return (hash);
6552}
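
#if 0
/*
 * For reference, a sketch of how the attach path wires up a per-element
 * probe hash: each hash is keyed by the string at 'stroffs' and chained
 * through the next/prev pointers at the other two offsets.  (The
 * function name here is hypothetical; the real calls live in the
 * driver attach code.)
 */
static void
example_create_probe_hashes(void)
{
	dtrace_bymod = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod),
	    offsetof(dtrace_probe_t, dtpr_nextmod),
	    offsetof(dtrace_probe_t, dtpr_prevmod));
}
#endif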
6553
6554static void
6555dtrace_hash_destroy(dtrace_hash_t *hash)
6556{
6557#ifdef DEBUG
6558 int i;
6559
6560 for (i = 0; i < hash->dth_size; i++)
6561 ASSERT(hash->dth_tab[i] == NULL);
6562#endif
6563
6564 kmem_free(hash->dth_tab,
6565 hash->dth_size * sizeof (dtrace_hashbucket_t *));
6566 kmem_free(hash, sizeof (dtrace_hash_t));
6567}
6568
6569static void
6570dtrace_hash_resize(dtrace_hash_t *hash)
6571{
6572 int size = hash->dth_size, i, ndx;
6573 int new_size = hash->dth_size << 1;
6574 int new_mask = new_size - 1;
6575 dtrace_hashbucket_t **new_tab, *bucket, *next;
6576
6577 ASSERT((new_size & new_mask) == 0);
6578
6579 new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
6580
6581 for (i = 0; i < size; i++) {
6582 for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
6583 dtrace_probe_t *probe = bucket->dthb_chain;
6584
6585 ASSERT(probe != NULL);
6586 ndx = DTRACE_HASHSTR(hash, probe) & new_mask;
6587
6588 next = bucket->dthb_next;
6589 bucket->dthb_next = new_tab[ndx];
6590 new_tab[ndx] = bucket;
6591 }
6592 }
6593
6594 kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
6595 hash->dth_tab = new_tab;
6596 hash->dth_size = new_size;
6597 hash->dth_mask = new_mask;
6598}
6599
6600static void
6601dtrace_hash_add(dtrace_hash_t *hash, dtrace_probe_t *new)
6602{
6603 int hashval = DTRACE_HASHSTR(hash, new);
6604 int ndx = hashval & hash->dth_mask;
6605 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6606 dtrace_probe_t **nextp, **prevp;
6607
6608 for (; bucket != NULL; bucket = bucket->dthb_next) {
6609 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
6610 goto add;
6611 }
6612
6613 if ((hash->dth_nbuckets >> 1) > hash->dth_size) {
6614 dtrace_hash_resize(hash);
6615 dtrace_hash_add(hash, new);
6616 return;
6617 }
6618
6619 bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
6620 bucket->dthb_next = hash->dth_tab[ndx];
6621 hash->dth_tab[ndx] = bucket;
6622 hash->dth_nbuckets++;
6623
6624add:
6625 nextp = DTRACE_HASHNEXT(hash, new);
6626 ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);
6627 *nextp = bucket->dthb_chain;
6628
6629 if (bucket->dthb_chain != NULL) {
6630 prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
6631 ASSERT(*prevp == NULL);
6632 *prevp = new;
6633 }
6634
6635 bucket->dthb_chain = new;
6636 bucket->dthb_len++;
6637}
6638
6639static dtrace_probe_t *
6640dtrace_hash_lookup(dtrace_hash_t *hash, dtrace_probe_t *template)
6641{
6642 int hashval = DTRACE_HASHSTR(hash, template);
6643 int ndx = hashval & hash->dth_mask;
6644 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6645
6646 for (; bucket != NULL; bucket = bucket->dthb_next) {
6647 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
6648 return (bucket->dthb_chain);
6649 }
6650
6651 return (NULL);
6652}
6653
6654static int
6655dtrace_hash_collisions(dtrace_hash_t *hash, dtrace_probe_t *template)
6656{
6657 int hashval = DTRACE_HASHSTR(hash, template);
6658 int ndx = hashval & hash->dth_mask;
6659 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6660
6661 for (; bucket != NULL; bucket = bucket->dthb_next) {
6662 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
6663 return (bucket->dthb_len);
6664 }
6665
6666	return (0);
6667}
6668
6669static void
6670dtrace_hash_remove(dtrace_hash_t *hash, dtrace_probe_t *probe)
6671{
6672 int ndx = DTRACE_HASHSTR(hash, probe) & hash->dth_mask;
6673 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6674
6675 dtrace_probe_t **prevp = DTRACE_HASHPREV(hash, probe);
6676 dtrace_probe_t **nextp = DTRACE_HASHNEXT(hash, probe);
6677
6678 /*
6679 * Find the bucket that we're removing this probe from.
6680 */
6681 for (; bucket != NULL; bucket = bucket->dthb_next) {
6682 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, probe))
6683 break;
6684 }
6685
6686 ASSERT(bucket != NULL);
6687
6688 if (*prevp == NULL) {
6689 if (*nextp == NULL) {
6690 /*
6691 * The removed probe was the only probe on this
6692 * bucket; we need to remove the bucket.
6693 */
6694 dtrace_hashbucket_t *b = hash->dth_tab[ndx];
6695
6696 ASSERT(bucket->dthb_chain == probe);
6697 ASSERT(b != NULL);
6698
6699 if (b == bucket) {
6700 hash->dth_tab[ndx] = bucket->dthb_next;
6701 } else {
6702 while (b->dthb_next != bucket)
6703 b = b->dthb_next;
6704 b->dthb_next = bucket->dthb_next;
6705 }
6706
6707 ASSERT(hash->dth_nbuckets > 0);
6708 hash->dth_nbuckets--;
6709 kmem_free(bucket, sizeof (dtrace_hashbucket_t));
6710 return;
6711 }
6712
6713 bucket->dthb_chain = *nextp;
6714 } else {
6715 *(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;
6716 }
6717
6718 if (*nextp != NULL)
6719 *(DTRACE_HASHPREV(hash, *nextp)) = *prevp;
6720}
6721
6722/*
6723 * DTrace Utility Functions
6724 *
6725 * These are random utility functions that are _not_ called from probe context.
6726 */
6727static int
6728dtrace_badattr(const dtrace_attribute_t *a)
6729{
6730 return (a->dtat_name > DTRACE_STABILITY_MAX ||
6731 a->dtat_data > DTRACE_STABILITY_MAX ||
6732 a->dtat_class > DTRACE_CLASS_MAX);
6733}
6734
6735/*
6736 * Return a duplicate of the specified string. If the string is NULL,
6737 * this function returns a zero-length string.
6738 */
6739static char *
6740dtrace_strdup(const char *str)
6741{
6742 char *new = kmem_zalloc((str != NULL ? strlen(str) : 0) + 1, KM_SLEEP);
6743
6744 if (str != NULL)
6745 (void) strcpy(new, str);
6746
6747 return (new);
6748}
6749
6750#define DTRACE_ISALPHA(c) \
6751 (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
6752
6753static int
6754dtrace_badname(const char *s)
6755{
6756 char c;
6757
6758 if (s == NULL || (c = *s++) == '\0')
6759 return (0);
6760
6761 if (!DTRACE_ISALPHA(c) && c != '-' && c != '_' && c != '.')
6762 return (1);
6763
6764 while ((c = *s++) != '\0') {
6765 if (!DTRACE_ISALPHA(c) && (c < '0' || c > '9') &&
6766 c != '-' && c != '_' && c != '.' && c != '`')
6767 return (1);
6768 }
6769
6770 return (0);
6771}
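
/*
 * Examples (illustrative): "fbt", "tcp-state-change" and "_init" pass;
 * "1st" (leading digit) and "foo$bar" (disallowed character) are
 * rejected as bad names.
 */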
6772
6773static void
6774dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
6775{
6776 uint32_t priv;
6777
6778 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
6779 /*
6780 * For DTRACE_PRIV_ALL, the uid and zoneid don't matter.
6781 */
6782 priv = DTRACE_PRIV_ALL;
6783#ifdef VBOX
6784 *uidp = UINT32_MAX;
6785 *zoneidp = 0;
6786#endif
6787 } else {
6788 *uidp = crgetuid(cr);
6789 *zoneidp = crgetzoneid(cr);
6790
6791 priv = 0;
6792 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
6793 priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;
6794 else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
6795 priv |= DTRACE_PRIV_USER;
6796 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
6797 priv |= DTRACE_PRIV_PROC;
6798 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
6799 priv |= DTRACE_PRIV_OWNER;
6800 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
6801 priv |= DTRACE_PRIV_ZONEOWNER;
6802 }
6803
6804 *privp = priv;
6805}
6806
6807#ifdef DTRACE_ERRDEBUG
6808static void
6809dtrace_errdebug(const char *str)
6810{
6811 int hval = dtrace_hash_str((char *)str) % DTRACE_ERRHASHSZ;
6812 int occupied = 0;
6813
6814 mutex_enter(&dtrace_errlock);
6815 dtrace_errlast = str;
6816 dtrace_errthread = curthread;
6817
6818 while (occupied++ < DTRACE_ERRHASHSZ) {
6819 if (dtrace_errhash[hval].dter_msg == str) {
6820 dtrace_errhash[hval].dter_count++;
6821 goto out;
6822 }
6823
6824 if (dtrace_errhash[hval].dter_msg != NULL) {
6825 hval = (hval + 1) % DTRACE_ERRHASHSZ;
6826 continue;
6827 }
6828
6829 dtrace_errhash[hval].dter_msg = str;
6830 dtrace_errhash[hval].dter_count = 1;
6831 goto out;
6832 }
6833
6834 panic("dtrace: undersized error hash");
6835out:
6836 mutex_exit(&dtrace_errlock);
6837}
6838#endif
6839
6840/*
6841 * DTrace Matching Functions
6842 *
6843 * These functions are used to match groups of probes, given some elements of
6844 * a probe tuple, or some globbed expressions for elements of a probe tuple.
6845 */
6846static int
6847dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
6848 zoneid_t zoneid)
6849{
6850 if (priv != DTRACE_PRIV_ALL) {
6851 uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
6852 uint32_t match = priv & ppriv;
6853
6854 /*
6855 * No PRIV_DTRACE_* privileges...
6856 */
6857 if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |
6858 DTRACE_PRIV_KERNEL)) == 0)
6859 return (0);
6860
6861 /*
6862 * No matching bits, but there were bits to match...
6863 */
6864 if (match == 0 && ppriv != 0)
6865 return (0);
6866
6867 /*
6868 * Need to have permissions to the process, but don't...
6869 */
6870 if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&
6871 uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
6872 return (0);
6873 }
6874
6875 /*
6876 * Need to be in the same zone unless we possess the
6877 * privilege to examine all zones.
6878 */
6879 if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&
6880 zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
6881 return (0);
6882 }
6883 }
6884
6885 return (1);
6886}
6887
6888/*
6889 * dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
6890 * consists of input pattern strings and an ops-vector to evaluate them.
6891 * This function returns >0 for match, 0 for no match, and <0 for error.
6892 */
6893static int
6894dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,
6895 uint32_t priv, uid_t uid, zoneid_t zoneid)
6896{
6897 dtrace_provider_t *pvp = prp->dtpr_provider;
6898 int rv;
6899
6900 if (pvp->dtpv_defunct)
6901 return (0);
6902
6903 if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0)
6904 return (rv);
6905
6906 if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)
6907 return (rv);
6908
6909 if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)
6910 return (rv);
6911
6912 if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)
6913 return (rv);
6914
6915 if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)
6916 return (0);
6917
6918 return (rv);
6919}
6920
6921/*
6922 * dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
6923 * interface for matching a glob pattern 'p' to an input string 's'. Unlike
6924 * libc's version, the kernel version only applies to 8-bit ASCII strings.
6925 * In addition, all of the recursion cases except for '*' matching have been
6926 * unwound. For '*', we still implement recursive evaluation, but a depth
6927 * counter is maintained and matching is aborted if we recurse too deep.
6928 * The function returns 0 if no match, >0 if match, and <0 if recursion error.
6929 */
6930static int
6931dtrace_match_glob(const char *s, const char *p, int depth)
6932{
6933 const char *olds;
6934 char s1, c;
6935 int gs;
6936
6937 if (depth > DTRACE_PROBEKEY_MAXDEPTH)
6938 return (-1);
6939
6940 if (s == NULL)
6941 s = ""; /* treat NULL as empty string */
6942
6943top:
6944 olds = s;
6945 s1 = *s++;
6946
6947 if (p == NULL)
6948 return (0);
6949
6950 if ((c = *p++) == '\0')
6951 return (s1 == '\0');
6952
6953 switch (c) {
6954 case '[': {
6955 int ok = 0, notflag = 0;
6956 char lc = '\0';
6957
6958 if (s1 == '\0')
6959 return (0);
6960
6961 if (*p == '!') {
6962 notflag = 1;
6963 p++;
6964 }
6965
6966 if ((c = *p++) == '\0')
6967 return (0);
6968
6969 do {
6970 if (c == '-' && lc != '\0' && *p != ']') {
6971 if ((c = *p++) == '\0')
6972 return (0);
6973 if (c == '\\' && (c = *p++) == '\0')
6974 return (0);
6975
6976 if (notflag) {
6977 if (s1 < lc || s1 > c)
6978 ok++;
6979 else
6980 return (0);
6981 } else if (lc <= s1 && s1 <= c)
6982 ok++;
6983
6984 } else if (c == '\\' && (c = *p++) == '\0')
6985 return (0);
6986
6987 lc = c; /* save left-hand 'c' for next iteration */
6988
6989 if (notflag) {
6990 if (s1 != c)
6991 ok++;
6992 else
6993 return (0);
6994 } else if (s1 == c)
6995 ok++;
6996
6997 if ((c = *p++) == '\0')
6998 return (0);
6999
7000 } while (c != ']');
7001
7002 if (ok)
7003 goto top;
7004
7005 return (0);
7006 }
7007
7008 case '\\':
7009 if ((c = *p++) == '\0')
7010 return (0);
7011 /*FALLTHRU*/
7012
7013 default:
7014 if (c != s1)
7015 return (0);
7016 /*FALLTHRU*/
7017
7018 case '?':
7019 if (s1 != '\0')
7020 goto top;
7021 return (0);
7022
7023 case '*':
7024 while (*p == '*')
7025 p++; /* consecutive *'s are identical to a single one */
7026
7027 if (*p == '\0')
7028 return (1);
7029
7030 for (s = olds; *s != '\0'; s++) {
7031 if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)
7032 return (gs);
7033 }
7034
7035 return (0);
7036 }
7037}
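
/*
 * Examples (illustrative): dtrace_match_glob("read", "re*", 0) and
 * dtrace_match_glob("readv", "read?", 0) both return 1;
 * dtrace_match_glob("write", "re*", 0) returns 0; and the character
 * class case above lets "[a-c]*" match "begin".
 */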
7038
7039/*ARGSUSED*/
7040static int
7041dtrace_match_string(const char *s, const char *p, int depth)
7042{
7043 return (s != NULL && strcmp(s, p) == 0);
7044}
7045
7046/*ARGSUSED*/
7047static int
7048dtrace_match_nul(const char *s, const char *p, int depth)
7049{
7050 return (1); /* always match the empty pattern */
7051}
7052
7053/*ARGSUSED*/
7054static int
7055dtrace_match_nonzero(const char *s, const char *p, int depth)
7056{
7057 return (s != NULL && s[0] != '\0');
7058}
7059
7060static int
7061dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
7062 zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *), void *arg)
7063{
7064 dtrace_probe_t template, *probe;
7065 dtrace_hash_t *hash = NULL;
7066 int len, rc, best = INT_MAX, nmatched = 0;
7067 dtrace_id_t i;
7068
7069 ASSERT(MUTEX_HELD(&dtrace_lock));
7070
7071 /*
7072 * If the probe ID is specified in the key, just lookup by ID and
7073 * invoke the match callback once if a matching probe is found.
7074 */
7075 if (pkp->dtpk_id != DTRACE_IDNONE) {
7076 if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
7077 dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
7078 if ((*matched)(probe, arg) == DTRACE_MATCH_FAIL)
7079 return (DTRACE_MATCH_FAIL);
7080 nmatched++;
7081 }
7082 return (nmatched);
7083 }
7084
7085 template.dtpr_mod = (char *)pkp->dtpk_mod;
7086 template.dtpr_func = (char *)pkp->dtpk_func;
7087 template.dtpr_name = (char *)pkp->dtpk_name;
7088
7089 /*
7090	 * We want to find the most distinctive of the module name, function
7091	 * name, and probe name. So for each one that is not a glob pattern or
7092 * empty string, we perform a lookup in the corresponding hash and
7093 * use the hash table with the fewest collisions to do our search.
7094 */
7095 if (pkp->dtpk_mmatch == &dtrace_match_string &&
7096 (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
7097 best = len;
7098 hash = dtrace_bymod;
7099 }
7100
7101 if (pkp->dtpk_fmatch == &dtrace_match_string &&
7102 (len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {
7103 best = len;
7104 hash = dtrace_byfunc;
7105 }
7106
7107 if (pkp->dtpk_nmatch == &dtrace_match_string &&
7108 (len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {
7109 best = len;
7110 hash = dtrace_byname;
7111 }
7112
7113 /*
7114 * If we did not select a hash table, iterate over every probe and
7115 * invoke our callback for each one that matches our input probe key.
7116 */
7117 if (hash == NULL) {
7118 for (i = 0; i < VBDTCAST(dtrace_id_t)dtrace_nprobes; i++) {
7119 if ((probe = dtrace_probes[i]) == NULL ||
7120 dtrace_match_probe(probe, pkp, priv, uid,
7121 zoneid) <= 0)
7122 continue;
7123
7124 nmatched++;
7125
7126 if ((rc = (*matched)(probe, arg)) !=
7127 DTRACE_MATCH_NEXT) {
7128 if (rc == DTRACE_MATCH_FAIL)
7129 return (DTRACE_MATCH_FAIL);
7130 break;
7131 }
7132 }
7133
7134 return (nmatched);
7135 }
7136
7137 /*
7138 * If we selected a hash table, iterate over each probe of the same key
7139 * name and invoke the callback for every probe that matches the other
7140 * attributes of our input probe key.
7141 */
7142 for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;
7143 probe = *(DTRACE_HASHNEXT(hash, probe))) {
7144
7145 if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)
7146 continue;
7147
7148 nmatched++;
7149
7150 if ((rc = (*matched)(probe, arg)) != DTRACE_MATCH_NEXT) {
7151 if (rc == DTRACE_MATCH_FAIL)
7152 return (DTRACE_MATCH_FAIL);
7153 break;
7154 }
7155 }
7156
7157 return (nmatched);
7158}
7159
7160/*
7161 * Return the match function dtrace_match_probe() should use to compare the
7162 * specified pattern with a string. For NULL or empty patterns, we select
7163 * dtrace_match_nul(). For glob pattern strings, we use dtrace_match_glob().
7164 * For non-empty non-glob strings, we use dtrace_match_string().
7165 */
7166static dtrace_probekey_f *
7167dtrace_probekey_func(const char *p)
7168{
7169 char c;
7170
7171 if (p == NULL || *p == '\0')
7172 return (&dtrace_match_nul);
7173
7174 while ((c = *p++) != '\0') {
7175 if (c == '[' || c == '?' || c == '*' || c == '\\')
7176 return (&dtrace_match_glob);
7177 }
7178
7179 return (&dtrace_match_string);
7180}
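
/*
 * Examples (illustrative): NULL and "" select dtrace_match_nul();
 * "syscall" selects dtrace_match_string(); "read*" and "[gs]etuid"
 * select dtrace_match_glob().
 */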
7181
7182/*
7183 * Build a probe comparison key for use with dtrace_match_probe() from the
7184 * given probe description. By convention, a null key only matches anchored
7185 * probes: if each field is the empty string, reset dtpk_fmatch to
7186 * dtrace_match_nonzero().
7187 */
7188static void
7189dtrace_probekey(const dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
7190{
7191 pkp->dtpk_prov = pdp->dtpd_provider;
7192 pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);
7193
7194 pkp->dtpk_mod = pdp->dtpd_mod;
7195 pkp->dtpk_mmatch = dtrace_probekey_func(pdp->dtpd_mod);
7196
7197 pkp->dtpk_func = pdp->dtpd_func;
7198 pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);
7199
7200 pkp->dtpk_name = pdp->dtpd_name;
7201 pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);
7202
7203 pkp->dtpk_id = pdp->dtpd_id;
7204
7205 if (pkp->dtpk_id == DTRACE_IDNONE &&
7206 pkp->dtpk_pmatch == &dtrace_match_nul &&
7207 pkp->dtpk_mmatch == &dtrace_match_nul &&
7208 pkp->dtpk_fmatch == &dtrace_match_nul &&
7209 pkp->dtpk_nmatch == &dtrace_match_nul)
7210 pkp->dtpk_fmatch = &dtrace_match_nonzero;
7211}
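
/*
 * For example (illustrative), the fully-empty description ":::" maps to
 * all-nul matchers, so the reset above restricts it to probes with a
 * non-empty function name -- that is, to anchored probes.
 */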
7212
7213/*
7214 * DTrace Provider-to-Framework API Functions
7215 *
7216 * These functions implement much of the Provider-to-Framework API, as
7217 * described in <sys/dtrace.h>. The parts of the API not in this section are
7218 * the functions in the API for probe management (found below), and
7219 * dtrace_probe() itself (found above).
7220 */
7221
7222/*
7223 * Register the calling provider with the DTrace framework. This should
7224 * generally be called by DTrace providers in their attach(9E) entry point.
7225 */
7226int
7227dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
7228 cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)
7229{
7230 dtrace_provider_t *provider;
7231
7232 if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {
7233 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7234 "arguments", name ? name : "<NULL>");
7235 return (EINVAL);
7236 }
7237
7238 if (name[0] == '\0' || dtrace_badname(name)) {
7239 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7240 "provider name", name);
7241 return (EINVAL);
7242 }
7243
7244 if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||
7245 pops->dtps_enable == NULL || pops->dtps_disable == NULL ||
7246 pops->dtps_destroy == NULL ||
7247 ((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
7248 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7249 "provider ops", name);
7250 return (EINVAL);
7251 }
7252
7253 if (dtrace_badattr(&pap->dtpa_provider) ||
7254 dtrace_badattr(&pap->dtpa_mod) ||
7255 dtrace_badattr(&pap->dtpa_func) ||
7256 dtrace_badattr(&pap->dtpa_name) ||
7257 dtrace_badattr(&pap->dtpa_args)) {
7258 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7259 "provider attributes", name);
7260 return (EINVAL);
7261 }
7262
7263 if (priv & ~DTRACE_PRIV_ALL) {
7264 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7265 "privilege attributes", name);
7266 return (EINVAL);
7267 }
7268
7269 if ((priv & DTRACE_PRIV_KERNEL) &&
7270 (priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&
7271 pops->dtps_usermode == NULL) {
7272 cmn_err(CE_WARN, "failed to register provider '%s': need "
7273 "dtps_usermode() op for given privilege attributes", name);
7274 return (EINVAL);
7275 }
7276
7277 provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
7278 provider->dtpv_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
7279 (void) strcpy(provider->dtpv_name, name);
7280
7281 provider->dtpv_attr = *pap;
7282 provider->dtpv_priv.dtpp_flags = priv;
7283 if (cr != NULL) {
7284 provider->dtpv_priv.dtpp_uid = crgetuid(cr);
7285 provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
7286 }
7287 provider->dtpv_pops = *pops;
7288
7289 if (pops->dtps_provide == NULL) {
7290 ASSERT(pops->dtps_provide_module != NULL);
7291 provider->dtpv_pops.dtps_provide =
7292 (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop;
7293 }
7294
7295 if (pops->dtps_provide_module == NULL) {
7296 ASSERT(pops->dtps_provide != NULL);
7297 provider->dtpv_pops.dtps_provide_module =
7298 (void (*)(void *, struct modctl *))dtrace_nullop;
7299 }
7300
7301 if (pops->dtps_suspend == NULL) {
7302 ASSERT(pops->dtps_resume == NULL);
7303 provider->dtpv_pops.dtps_suspend =
7304 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
7305 provider->dtpv_pops.dtps_resume =
7306 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
7307 }
7308
7309 provider->dtpv_arg = arg;
7310 *idp = (dtrace_provider_id_t)provider;
7311
7312 if (pops == &dtrace_provider_ops) {
7313 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
7314 ASSERT(MUTEX_HELD(&dtrace_lock));
7315 ASSERT(dtrace_anon.dta_enabling == NULL);
7316
7317 /*
7318 * We make sure that the DTrace provider is at the head of
7319 * the provider chain.
7320 */
7321 provider->dtpv_next = dtrace_provider;
7322 dtrace_provider = provider;
7323 return (0);
7324 }
7325
7326 mutex_enter(&dtrace_provider_lock);
7327 mutex_enter(&dtrace_lock);
7328
7329 /*
7330 * If there is at least one provider registered, we'll add this
7331 * provider after the first provider.
7332 */
7333 if (dtrace_provider != NULL) {
7334 provider->dtpv_next = dtrace_provider->dtpv_next;
7335 dtrace_provider->dtpv_next = provider;
7336 } else {
7337 dtrace_provider = provider;
7338 }
7339
7340 if (dtrace_retained != NULL) {
7341 dtrace_enabling_provide(provider);
7342
7343 /*
7344 * Now we need to call dtrace_enabling_matchall() -- which
7345 * will acquire cpu_lock and dtrace_lock. We therefore need
7346 * to drop all of our locks before calling into it...
7347 */
7348 mutex_exit(&dtrace_lock);
7349 mutex_exit(&dtrace_provider_lock);
7350 dtrace_enabling_matchall();
7351
7352 return (0);
7353 }
7354
7355 mutex_exit(&dtrace_lock);
7356 mutex_exit(&dtrace_provider_lock);
7357
7358 return (0);
7359}
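
#if 0
/*
 * Illustrative sketch (not built): a minimal provider registering from
 * its attach(9E) routine.  All "example_*" names are hypothetical; the
 * pattr and pops structures must be populated to satisfy the checks
 * above (at least dtps_provide or dtps_provide_module, plus
 * dtps_enable, dtps_disable and dtps_destroy).
 */
static dtrace_pattr_t example_attr;		/* attribute stabilities */
static dtrace_pops_t example_pops;		/* provider entry points */
static dtrace_provider_id_t example_id;

static int
example_attach(void)
{
	return (dtrace_register("example", &example_attr,
	    DTRACE_PRIV_KERNEL, NULL, &example_pops, NULL, &example_id));
}
#endif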
7360
7361/*
7362 * Unregister the specified provider from the DTrace framework. This should
7363 * generally be called by DTrace providers in their detach(9E) entry point.
7364 */
7365int
7366dtrace_unregister(dtrace_provider_id_t id)
7367{
7368 dtrace_provider_t *old = (dtrace_provider_t *)id;
7369 dtrace_provider_t *prev = NULL;
7370 VBDTTYPE(uint32_t,int) i, self = 0;
7371 dtrace_probe_t *probe, *first = NULL;
7372
7373 if (old->dtpv_pops.dtps_enable ==
7374 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop) {
7375 /*
7376 * If DTrace itself is the provider, we're called with locks
7377 * already held.
7378 */
7379 ASSERT(old == dtrace_provider);
7380#ifndef VBOX
7381 ASSERT(dtrace_devi != NULL);
7382#endif
7383 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
7384 ASSERT(MUTEX_HELD(&dtrace_lock));
7385 self = 1;
7386
7387 if (dtrace_provider->dtpv_next != NULL) {
7388 /*
7389 * There's another provider here; return failure.
7390 */
7391 return (EBUSY);
7392 }
7393 } else {
7394 mutex_enter(&dtrace_provider_lock);
7395 mutex_enter(&mod_lock);
7396 mutex_enter(&dtrace_lock);
7397 }
7398
7399 /*
7400 * If anyone has /dev/dtrace open, or if there are anonymous enabled
7401 * probes, we refuse to let providers slither away, unless this
7402 * provider has already been explicitly invalidated.
7403 */
7404 if (!old->dtpv_defunct &&
7405 (dtrace_opens || (dtrace_anon.dta_state != NULL &&
7406 dtrace_anon.dta_state->dts_necbs > 0))) {
7407 if (!self) {
7408 mutex_exit(&dtrace_lock);
7409 mutex_exit(&mod_lock);
7410 mutex_exit(&dtrace_provider_lock);
7411 }
7412 return (EBUSY);
7413 }
7414
7415 /*
7416 * Attempt to destroy the probes associated with this provider.
7417 */
7418 for (i = 0; i < dtrace_nprobes; i++) {
7419 if ((probe = dtrace_probes[i]) == NULL)
7420 continue;
7421
7422 if (probe->dtpr_provider != old)
7423 continue;
7424
7425 if (probe->dtpr_ecb == NULL)
7426 continue;
7427
7428 /*
7429 * We have at least one ECB; we can't remove this provider.
7430 */
7431 if (!self) {
7432 mutex_exit(&dtrace_lock);
7433 mutex_exit(&mod_lock);
7434 mutex_exit(&dtrace_provider_lock);
7435 }
7436 return (EBUSY);
7437 }
7438
7439 /*
7440 * All of the probes for this provider are disabled; we can safely
7441 * remove all of them from their hash chains and from the probe array.
7442 */
7443 for (i = 0; i < dtrace_nprobes; i++) {
7444 if ((probe = dtrace_probes[i]) == NULL)
7445 continue;
7446
7447 if (probe->dtpr_provider != old)
7448 continue;
7449
7450 dtrace_probes[i] = NULL;
7451
7452 dtrace_hash_remove(dtrace_bymod, probe);
7453 dtrace_hash_remove(dtrace_byfunc, probe);
7454 dtrace_hash_remove(dtrace_byname, probe);
7455
7456 if (first == NULL) {
7457 first = probe;
7458 probe->dtpr_nextmod = NULL;
7459 } else {
7460 probe->dtpr_nextmod = first;
7461 first = probe;
7462 }
7463 }
7464
7465 /*
7466 * The provider's probes have been removed from the hash chains and
7467 * from the probe array. Now issue a dtrace_sync() to be sure that
7468 * everyone has cleared out from any probe array processing.
7469 */
7470 dtrace_sync();
7471
7472 for (probe = first; probe != NULL; probe = first) {
7473 first = probe->dtpr_nextmod;
7474
7475 old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
7476 probe->dtpr_arg);
7477 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
7478 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
7479 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
7480 vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
7481 kmem_free(probe, sizeof (dtrace_probe_t));
7482 }
7483
7484 if ((prev = dtrace_provider) == old) {
7485#ifndef VBOX
7486 ASSERT(self || dtrace_devi == NULL);
7487 ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
7488#endif
7489 dtrace_provider = old->dtpv_next;
7490 } else {
7491 while (prev != NULL && prev->dtpv_next != old)
7492 prev = prev->dtpv_next;
7493
7494 if (prev == NULL) {
7495 panic("attempt to unregister non-existent "
7496 "dtrace provider %p\n", (void *)id);
7497 }
7498
7499 prev->dtpv_next = old->dtpv_next;
7500 }
7501
7502 if (!self) {
7503 mutex_exit(&dtrace_lock);
7504 mutex_exit(&mod_lock);
7505 mutex_exit(&dtrace_provider_lock);
7506 }
7507
7508 kmem_free(old->dtpv_name, strlen(old->dtpv_name) + 1);
7509 kmem_free(old, sizeof (dtrace_provider_t));
7510
7511 return (0);
7512}
7513
7514/*
7515 * Invalidate the specified provider. All subsequent probe lookups for the
7516 * specified provider will fail, but its probes will not be removed.
7517 */
7518void
7519dtrace_invalidate(dtrace_provider_id_t id)
7520{
7521 dtrace_provider_t *pvp = (dtrace_provider_t *)id;
7522
7523 ASSERT(pvp->dtpv_pops.dtps_enable !=
7524 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
7525
7526 mutex_enter(&dtrace_provider_lock);
7527 mutex_enter(&dtrace_lock);
7528
7529 pvp->dtpv_defunct = 1;
7530
7531 mutex_exit(&dtrace_lock);
7532 mutex_exit(&dtrace_provider_lock);
7533}
7534
7535/*
7536 * Indicate whether or not DTrace has attached.
7537 */
7538int
7539dtrace_attached(void)
7540{
7541 /*
7542 * dtrace_provider will be non-NULL iff the DTrace driver has
7543 * attached. (It's non-NULL because DTrace is always itself a
7544 * provider.)
7545 */
7546 return (dtrace_provider != NULL);
7547}
7548
7549/*
7550 * Remove all the unenabled probes for the given provider. This function is
7551 * not unlike dtrace_unregister(), except that it doesn't remove the provider
7552 * -- just as many of its associated probes as it can.
7553 */
7554int
7555dtrace_condense(dtrace_provider_id_t id)
7556{
7557 dtrace_provider_t *prov = (dtrace_provider_t *)id;
7558 VBDTTYPE(uint32_t,int) i;
7559 dtrace_probe_t *probe;
7560
7561 /*
7562 * Make sure this isn't the dtrace provider itself.
7563 */
7564 ASSERT(prov->dtpv_pops.dtps_enable !=
7565 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
7566
7567 mutex_enter(&dtrace_provider_lock);
7568 mutex_enter(&dtrace_lock);
7569
7570 /*
7571 * Attempt to destroy the probes associated with this provider.
7572 */
7573 for (i = 0; i < dtrace_nprobes; i++) {
7574 if ((probe = dtrace_probes[i]) == NULL)
7575 continue;
7576
7577 if (probe->dtpr_provider != prov)
7578 continue;
7579
7580 if (probe->dtpr_ecb != NULL)
7581 continue;
7582
7583 dtrace_probes[i] = NULL;
7584
7585 dtrace_hash_remove(dtrace_bymod, probe);
7586 dtrace_hash_remove(dtrace_byfunc, probe);
7587 dtrace_hash_remove(dtrace_byname, probe);
7588
7589 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, i + 1,
7590 probe->dtpr_arg);
7591 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
7592 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
7593 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
7594 kmem_free(probe, sizeof (dtrace_probe_t));
7595 vmem_free(dtrace_arena, (void *)((uintptr_t)i + 1), 1);
7596 }
7597
7598 mutex_exit(&dtrace_lock);
7599 mutex_exit(&dtrace_provider_lock);
7600
7601 return (0);
7602}
7603
7604/*
7605 * DTrace Probe Management Functions
7606 *
7607 * The functions in this section perform the DTrace probe management,
7608 * including functions to create probes, look up probes, and call into the
7609 * providers to request that probes be provided. Some of these functions are
7610 * in the Provider-to-Framework API; these functions can be identified by the
7611 * fact that they are not declared "static".
7612 */
7613
7614/*
7615 * Create a probe with the specified module name, function name, and name.
7616 */
7617dtrace_id_t
7618dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
7619 const char *func, const char *name, int aframes, void *arg)
7620{
7621 dtrace_probe_t *probe, **probes;
7622 dtrace_provider_t *provider = (dtrace_provider_t *)prov;
7623 dtrace_id_t id;
7624
7625 if (provider == dtrace_provider) {
7626 ASSERT(MUTEX_HELD(&dtrace_lock));
7627 } else {
7628 mutex_enter(&dtrace_lock);
7629 }
7630
7631 id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
7632 VM_BESTFIT | VM_SLEEP);
7633 probe = kmem_zalloc(sizeof (dtrace_probe_t), KM_SLEEP);
7634
7635 probe->dtpr_id = id;
7636 probe->dtpr_gen = dtrace_probegen++;
7637 probe->dtpr_mod = dtrace_strdup(mod);
7638 probe->dtpr_func = dtrace_strdup(func);
7639 probe->dtpr_name = dtrace_strdup(name);
7640 probe->dtpr_arg = arg;
7641 probe->dtpr_aframes = aframes;
7642 probe->dtpr_provider = provider;
7643
7644 dtrace_hash_add(dtrace_bymod, probe);
7645 dtrace_hash_add(dtrace_byfunc, probe);
7646 dtrace_hash_add(dtrace_byname, probe);
7647
7648 if (id - 1 >= dtrace_nprobes) {
7649 size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
7650 size_t nsize = osize << 1;
7651
7652 if (nsize == 0) {
7653 ASSERT(osize == 0);
7654 ASSERT(dtrace_probes == NULL);
7655 nsize = sizeof (dtrace_probe_t *);
7656 }
7657
7658 probes = kmem_zalloc(nsize, KM_SLEEP);
7659
7660 if (dtrace_probes == NULL) {
7661 ASSERT(osize == 0);
7662 dtrace_probes = probes;
7663 dtrace_nprobes = 1;
7664 } else {
7665 dtrace_probe_t **oprobes = dtrace_probes;
7666
7667 bcopy(oprobes, probes, osize);
7668 dtrace_membar_producer();
7669 dtrace_probes = probes;
7670
7671 dtrace_sync();
7672
7673 /*
7674 * All CPUs are now seeing the new probes array; we can
7675 * safely free the old array.
7676 */
7677 kmem_free(oprobes, osize);
7678 dtrace_nprobes <<= 1;
7679 }
7680
7681 ASSERT(id - 1 < dtrace_nprobes);
7682 }
7683
7684 ASSERT(dtrace_probes[id - 1] == NULL);
7685 dtrace_probes[id - 1] = probe;
7686
7687 if (provider != dtrace_provider)
7688 mutex_exit(&dtrace_lock);
7689
7690 return (id);
7691}
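
#if 0
/*
 * Illustrative sketch (not built): a provider's dtps_provide() entry
 * point autocreating a probe on first request.  The "example_*" names
 * are hypothetical; providers conventionally check with
 * dtrace_probe_lookup() first so the same probe is never created twice.
 */
static void
example_provide(void *arg, const dtrace_probedesc_t *desc)
{
	if (dtrace_probe_lookup(example_id, "example", "tick", "entry") == 0)
		(void) dtrace_probe_create(example_id, "example", "tick",
		    "entry", 0 /* aframes */, NULL);
}
#endif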
7692
7693static dtrace_probe_t *
7694dtrace_probe_lookup_id(dtrace_id_t id)
7695{
7696 ASSERT(MUTEX_HELD(&dtrace_lock));
7697
7698 if (id == 0 || id > dtrace_nprobes)
7699 return (NULL);
7700
7701 return (dtrace_probes[id - 1]);
7702}
7703
7704static int
7705dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg)
7706{
7707 *((dtrace_id_t *)arg) = probe->dtpr_id;
7708
7709 return (DTRACE_MATCH_DONE);
7710}
7711
7712/*
7713 * Look up a probe based on provider and one or more of module name, function
7714 * name and probe name.
7715 */
7716dtrace_id_t
7717dtrace_probe_lookup(dtrace_provider_id_t prid, const char *mod,
7718 const char *func, const char *name)
7719{
7720 dtrace_probekey_t pkey;
7721 dtrace_id_t id;
7722 int match;
7723
7724 pkey.dtpk_prov = ((dtrace_provider_t *)prid)->dtpv_name;
7725 pkey.dtpk_pmatch = &dtrace_match_string;
7726 pkey.dtpk_mod = mod;
7727 pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
7728 pkey.dtpk_func = func;
7729 pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
7730 pkey.dtpk_name = name;
7731 pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
7732 pkey.dtpk_id = DTRACE_IDNONE;
7733
7734 mutex_enter(&dtrace_lock);
7735 match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
7736 dtrace_probe_lookup_match, &id);
7737 mutex_exit(&dtrace_lock);
7738
7739 ASSERT(match == 1 || match == 0);
7740 return (match ? id : 0);
7741}
7742
7743/*
7744 * Returns the probe argument associated with the specified probe.
7745 */
7746void *
7747dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
7748{
7749 dtrace_probe_t *probe;
7750 void *rval = NULL;
7751
7752 mutex_enter(&dtrace_lock);
7753
7754 if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
7755 probe->dtpr_provider == (dtrace_provider_t *)id)
7756 rval = probe->dtpr_arg;
7757
7758 mutex_exit(&dtrace_lock);
7759
7760 return (rval);
7761}
7762
7763/*
7764 * Copy a probe into a probe description.
7765 */
7766static void
7767dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
7768{
7769 bzero(pdp, sizeof (dtrace_probedesc_t));
7770 pdp->dtpd_id = prp->dtpr_id;
7771
7772 (void) strncpy(pdp->dtpd_provider,
7773 prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN - 1);
7774
7775 (void) strncpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN - 1);
7776 (void) strncpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN - 1);
7777 (void) strncpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN - 1);
7778}
7779
7780/*
7781 * Called to indicate that a probe -- or probes -- should be provided by a
7782 * specified provider. If the specified description is NULL, the provider will
7783 * be told to provide all of its probes. (This is done whenever a new
7784 * consumer comes along, or whenever a retained enabling is to be matched.) If
7785 * the specified description is non-NULL, the provider is given the
7786 * opportunity to dynamically provide the specified probe, allowing providers
7787 * to support the creation of probes on-the-fly. (So-called _autocreated_
7788 * probes.) If the provider is NULL, the operations will be applied to all
7789 * providers; if the provider is non-NULL the operations will only be applied
7790 * to the specified provider. The dtrace_provider_lock must be held, and the
7791 * dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
7792 * will need to grab the dtrace_lock when it reenters the framework through
7793 * dtrace_probe_lookup(), dtrace_probe_create(), etc.
7794 */
7795static void
7796dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
7797{
7798#ifndef VBOX
7799 struct modctl *ctl;
7800#endif
7801 int all = 0;
7802
7803 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
7804
7805 if (prv == NULL) {
7806 all = 1;
7807 prv = dtrace_provider;
7808 }
7809
7810 do {
7811 /*
7812 * First, call the blanket provide operation.
7813 */
7814 prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
7815
7816#ifndef VBOX
7817 /*
7818 * Now call the per-module provide operation. We will grab
7819 * mod_lock to prevent the list from being modified. Note
7820 * that this also prevents the mod_busy bits from changing.
7821 * (mod_busy can only be changed with mod_lock held.)
7822 */
7823 mutex_enter(&mod_lock);
7824
7825 ctl = &modules;
7826 do {
7827 if (ctl->mod_busy || ctl->mod_mp == NULL)
7828 continue;
7829
7830 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
7831
7832 } while ((ctl = ctl->mod_next) != &modules);
7833
7834 mutex_exit(&mod_lock);
7835#endif
7836 } while (all && (prv = prv->dtpv_next) != NULL);
7837}
7838
7839/*
7840 * Iterate over each probe, and call the Framework-to-Provider API function
7841 * denoted by offs.
7842 */
7843static void
7844dtrace_probe_foreach(uintptr_t offs)
7845{
7846 dtrace_provider_t *prov;
7847 void (*func)(void *, dtrace_id_t, void *);
7848 dtrace_probe_t *probe;
7849 dtrace_icookie_t cookie;
7850 VBDTTYPE(uint32_t,int) i;
7851
7852 /*
7853 * We disable interrupts to walk through the probe array. This is
7854 * safe -- the dtrace_sync() in dtrace_unregister() assures that we
7855 * won't see stale data.
7856 */
7857 cookie = dtrace_interrupt_disable();
7858
7859 for (i = 0; i < dtrace_nprobes; i++) {
7860 if ((probe = dtrace_probes[i]) == NULL)
7861 continue;
7862
7863 if (probe->dtpr_ecb == NULL) {
7864 /*
7865 * This probe isn't enabled -- don't call the function.
7866 */
7867 continue;
7868 }
7869
7870 prov = probe->dtpr_provider;
7871 func = *((void(**)(void *, dtrace_id_t, void *))
7872 ((uintptr_t)&prov->dtpv_pops + offs));
7873
7874 func(prov->dtpv_arg, i + 1, probe->dtpr_arg);
7875 }
7876
7877 dtrace_interrupt_enable(cookie);
7878}
7879
7880static int
7881dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab)
7882{
7883 dtrace_probekey_t pkey;
7884 uint32_t priv;
7885 uid_t uid;
7886 zoneid_t zoneid;
7887
7888 ASSERT(MUTEX_HELD(&dtrace_lock));
7889 dtrace_ecb_create_cache = NULL;
7890
7891 if (desc == NULL) {
7892 /*
7893 * If we're passed a NULL description, we're being asked to
7894 * create an ECB with a NULL probe.
7895 */
7896 (void) dtrace_ecb_create_enable(NULL, enab);
7897 return (0);
7898 }
7899
7900 dtrace_probekey(desc, &pkey);
7901 dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
7902 &priv, &uid, &zoneid);
7903
7904 return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable,
7905 enab));
7906}
7907
7908/*
7909 * DTrace Helper Provider Functions
7910 */
7911static void
7912dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)
7913{
7914 attr->dtat_name = DOF_ATTR_NAME(dofattr);
7915 attr->dtat_data = DOF_ATTR_DATA(dofattr);
7916 attr->dtat_class = DOF_ATTR_CLASS(dofattr);
7917}
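
/*
 * A dof_attr_t packs all three attribute components into a single
 * 32-bit word (name in the most significant byte, then data, then
 * class), so this is purely an unpacking step.
 */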
7918
7919static void
7920dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
7921 const dof_provider_t *dofprov, char *strtab)
7922{
7923 hprov->dthpv_provname = strtab + dofprov->dofpv_name;
7924 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,
7925 dofprov->dofpv_provattr);
7926 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,
7927 dofprov->dofpv_modattr);
7928 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,
7929 dofprov->dofpv_funcattr);
7930 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,
7931 dofprov->dofpv_nameattr);
7932 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,
7933 dofprov->dofpv_argsattr);
7934}
7935
7936static void
7937dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
7938{
7939 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
7940 dof_hdr_t *dof = (dof_hdr_t *)daddr;
7941 dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
7942 dof_provider_t *provider;
7943 dof_probe_t *probe;
7944 uint32_t *off, *enoff;
7945 uint8_t *arg;
7946 char *strtab;
7947 uint_t i, nprobes;
7948 dtrace_helper_provdesc_t dhpv;
7949 dtrace_helper_probedesc_t dhpb;
7950 dtrace_meta_t *meta = dtrace_meta_pid;
7951 dtrace_mops_t *mops = &meta->dtm_mops;
7952 void *parg;
7953
7954 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
7955 str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7956 provider->dofpv_strtab * dof->dofh_secsize);
7957 prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7958 provider->dofpv_probes * dof->dofh_secsize);
7959 arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7960 provider->dofpv_prargs * dof->dofh_secsize);
7961 off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7962 provider->dofpv_proffs * dof->dofh_secsize);
7963
7964 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
7965 off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
7966 arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
7967 enoff = NULL;
7968
7969 /*
7970 * See dtrace_helper_provider_validate().
7971 */
7972 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
7973 provider->dofpv_prenoffs != DOF_SECT_NONE) {
7974 enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7975 provider->dofpv_prenoffs * dof->dofh_secsize);
7976 enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
7977 }
7978
7979 nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
7980
7981 /*
7982 * Create the provider.
7983 */
7984 dtrace_dofprov2hprov(&dhpv, provider, strtab);
7985
7986 if ((parg = mops->dtms_provide_pid(meta->dtm_arg, &dhpv, pid)) == NULL)
7987 return;
7988
7989 meta->dtm_count++;
7990
7991 /*
7992 * Create the probes.
7993 */
7994 for (i = 0; i < nprobes; i++) {
7995 probe = (dof_probe_t *)(uintptr_t)(daddr +
7996 prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
7997
7998 dhpb.dthpb_mod = dhp->dofhp_mod;
7999 dhpb.dthpb_func = strtab + probe->dofpr_func;
8000 dhpb.dthpb_name = strtab + probe->dofpr_name;
8001 dhpb.dthpb_base = probe->dofpr_addr;
8002 dhpb.dthpb_offs = off + probe->dofpr_offidx;
8003 dhpb.dthpb_noffs = probe->dofpr_noffs;
8004 if (enoff != NULL) {
8005 dhpb.dthpb_enoffs = enoff + probe->dofpr_enoffidx;
8006 dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
8007 } else {
8008 dhpb.dthpb_enoffs = NULL;
8009 dhpb.dthpb_nenoffs = 0;
8010 }
8011 dhpb.dthpb_args = arg + probe->dofpr_argidx;
8012 dhpb.dthpb_nargc = probe->dofpr_nargc;
8013 dhpb.dthpb_xargc = probe->dofpr_xargc;
8014 dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
8015 dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;
8016
8017 mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
8018 }
8019}
8020
8021static void
8022dtrace_helper_provide(dof_helper_t *dhp, pid_t pid)
8023{
8024 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8025 dof_hdr_t *dof = (dof_hdr_t *)daddr;
8026 VBDTTYPE(uint32_t,int) i;
8027
8028 ASSERT(MUTEX_HELD(&dtrace_meta_lock));
8029
8030 for (i = 0; i < dof->dofh_secnum; i++) {
8031 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
8032 dof->dofh_secoff + i * dof->dofh_secsize);
8033
8034 if (sec->dofs_type != DOF_SECT_PROVIDER)
8035 continue;
8036
8037 dtrace_helper_provide_one(dhp, sec, pid);
8038 }
8039
8040 /*
8041 * We may have just created probes, so we must now rematch against
8042 * any retained enablings. Note that this call will acquire both
8043 * cpu_lock and dtrace_lock; the fact that we are holding
8044 * dtrace_meta_lock now is what defines the ordering with respect to
8045 * these three locks.
8046 */
8047 dtrace_enabling_matchall();
8048}
8049
8050static void
8051dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
8052{
8053 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8054 dof_hdr_t *dof = (dof_hdr_t *)daddr;
8055 dof_sec_t *str_sec;
8056 dof_provider_t *provider;
8057 char *strtab;
8058 dtrace_helper_provdesc_t dhpv;
8059 dtrace_meta_t *meta = dtrace_meta_pid;
8060 dtrace_mops_t *mops = &meta->dtm_mops;
8061
8062 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
8063 str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8064 provider->dofpv_strtab * dof->dofh_secsize);
8065
8066 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
8067
8068 /*
8069 * Create the provider.
8070 */
8071 dtrace_dofprov2hprov(&dhpv, provider, strtab);
8072
8073 mops->dtms_remove_pid(meta->dtm_arg, &dhpv, pid);
8074
8075 meta->dtm_count--;
8076}
8077
8078static void
8079dtrace_helper_provider_remove(dof_helper_t *dhp, pid_t pid)
8080{
8081 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8082 dof_hdr_t *dof = (dof_hdr_t *)daddr;
8083 VBDTTYPE(uint32_t,int) i;
8084
8085 ASSERT(MUTEX_HELD(&dtrace_meta_lock));
8086
8087 for (i = 0; i < dof->dofh_secnum; i++) {
8088 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
8089 dof->dofh_secoff + i * dof->dofh_secsize);
8090
8091 if (sec->dofs_type != DOF_SECT_PROVIDER)
8092 continue;
8093
8094 dtrace_helper_provider_remove_one(dhp, sec, pid);
8095 }
8096}
8097
8098/*
8099 * DTrace Meta Provider-to-Framework API Functions
8100 *
8101 * These functions implement the Meta Provider-to-Framework API, as described
8102 * in <sys/dtrace.h>.
8103 */
8104int
8105dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
8106 dtrace_meta_provider_id_t *idp)
8107{
8108 dtrace_meta_t *meta;
8109 dtrace_helpers_t *help, *next;
8110 VBDTTYPE(uint32_t,int) i;
8111
8112 *idp = DTRACE_METAPROVNONE;
8113
8114 /*
8115 * Strictly speaking, we don't need the name, but we hold onto it
8116 * for debuggability. All hail error queues!
8117 */
8118 if (name == NULL) {
8119 cmn_err(CE_WARN, "failed to register meta-provider: "
8120 "invalid name");
8121 return (EINVAL);
8122 }
8123
8124 if (mops == NULL ||
8125 mops->dtms_create_probe == NULL ||
8126 mops->dtms_provide_pid == NULL ||
8127 mops->dtms_remove_pid == NULL) {
8128 cmn_err(CE_WARN, "failed to register meta-provider %s: "
8129 "invalid ops", name);
8130 return (EINVAL);
8131 }
8132
8133 meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
8134 meta->dtm_mops = *mops;
8135 meta->dtm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
8136 (void) strcpy(meta->dtm_name, name);
8137 meta->dtm_arg = arg;
8138
8139 mutex_enter(&dtrace_meta_lock);
8140 mutex_enter(&dtrace_lock);
8141
8142 if (dtrace_meta_pid != NULL) {
8143 mutex_exit(&dtrace_lock);
8144 mutex_exit(&dtrace_meta_lock);
8145 cmn_err(CE_WARN, "failed to register meta-provider %s: "
8146 "user-land meta-provider exists", name);
8147 kmem_free(meta->dtm_name, strlen(meta->dtm_name) + 1);
8148 kmem_free(meta, sizeof (dtrace_meta_t));
8149 return (EINVAL);
8150 }
8151
8152 dtrace_meta_pid = meta;
8153 *idp = (dtrace_meta_provider_id_t)meta;
8154
8155 /*
8156 * If there are providers and probes ready to go, pass them
8157 * off to the new meta provider now.
8158 */
8159
8160 help = dtrace_deferred_pid;
8161 dtrace_deferred_pid = NULL;
8162
8163 mutex_exit(&dtrace_lock);
8164
8165 while (help != NULL) {
8166 for (i = 0; i < help->dthps_nprovs; i++) {
8167 dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
8168 help->dthps_pid);
8169 }
8170
8171 next = help->dthps_next;
8172 help->dthps_next = NULL;
8173 help->dthps_prev = NULL;
8174 help->dthps_deferred = 0;
8175 help = next;
8176 }
8177
8178 mutex_exit(&dtrace_meta_lock);
8179
8180 return (0);
8181}
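
/*
 * Hedged usage sketch (not part of this file; the member order assumes
 * the stock dtrace_mops_t layout): the pid meta-provider registers
 * itself roughly as follows.
 *
 *	static dtrace_meta_provider_id_t fasttrap_meta_id;
 *	static dtrace_mops_t fasttrap_mops = {
 *		fasttrap_meta_create_probe,
 *		fasttrap_meta_provide_pid,
 *		fasttrap_meta_remove_pid
 *	};
 *
 *	(void) dtrace_meta_register("fasttrap", &fasttrap_mops, NULL,
 *	    &fasttrap_meta_id);
 */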
8182
8183int
8184dtrace_meta_unregister(dtrace_meta_provider_id_t id)
8185{
8186 dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;
8187
8188 mutex_enter(&dtrace_meta_lock);
8189 mutex_enter(&dtrace_lock);
8190
8191 if (old == dtrace_meta_pid) {
8192 pp = &dtrace_meta_pid;
8193 } else {
8194 panic("attempt to unregister non-existent "
8195 "dtrace meta-provider %p\n", (void *)old);
8196#ifdef VBOX
8197 return (EINVAL);
8198#endif
8199 }
8200
8201 if (old->dtm_count != 0) {
8202 mutex_exit(&dtrace_lock);
8203 mutex_exit(&dtrace_meta_lock);
8204 return (EBUSY);
8205 }
8206
8207 *pp = NULL;
8208
8209 mutex_exit(&dtrace_lock);
8210 mutex_exit(&dtrace_meta_lock);
8211
8212 kmem_free(old->dtm_name, strlen(old->dtm_name) + 1);
8213 kmem_free(old, sizeof (dtrace_meta_t));
8214
8215 return (0);
8216}
8217
8218
8219/*
8220 * DTrace DIF Object Functions
8221 */
8222static int
8223dtrace_difo_err(uint_t pc, const char *format, ...)
8224{
8225 if (dtrace_err_verbose) {
8226 va_list alist;
8227
8228 (void) uprintf("dtrace DIF object error: [%u]: ", pc);
8229 va_start(alist, format);
8230 (void) vuprintf(format, alist);
8231 va_end(alist);
8232 }
8233
8234#ifdef DTRACE_ERRDEBUG
8235 dtrace_errdebug(format);
8236#endif
8237 return (1);
8238}
8239
8240/*
8241 * Validate a DTrace DIF object by checking the IR instructions. The following
8242 * rules are currently enforced by dtrace_difo_validate():
8243 *
8244 * 1. Each instruction must have a valid opcode
8245 * 2. Each register, string, variable, or subroutine reference must be valid
8246 * 3. No instruction can modify register %r0 (must be zero)
8247 * 4. All instruction reserved bits must be set to zero
8248 * 5. The last instruction must be a "ret" instruction
8249 * 6. All branch targets must reference a valid instruction _after_ the branch
8250 */
8251static int
8252dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
8253 cred_t *cr)
8254{
8255#ifndef VBOX
8256 int err = 0, i;
8257#else
8258 int err = 0;
8259 uint_t i;
8260#endif
8261 int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
8262 int kcheckload;
8263 uint_t pc;
8264
8265 kcheckload = cr == NULL ||
8266 (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;
8267
8268 dp->dtdo_destructive = 0;
8269
8270 for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
8271 dif_instr_t instr = dp->dtdo_buf[pc];
8272
8273 uint_t r1 = DIF_INSTR_R1(instr);
8274 uint_t r2 = DIF_INSTR_R2(instr);
8275 uint_t rd = DIF_INSTR_RD(instr);
8276 uint_t rs = DIF_INSTR_RS(instr);
8277 uint_t label = DIF_INSTR_LABEL(instr);
8278 uint_t v = DIF_INSTR_VAR(instr);
8279 uint_t subr = DIF_INSTR_SUBR(instr);
8280 uint_t type = DIF_INSTR_TYPE(instr);
8281 uint_t op = DIF_INSTR_OP(instr);
8282
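/*
 * Each DIF instruction is one fixed-width 32-bit word with the opcode
 * in the most significant byte; the DIF_INSTR_* accessors above unpack
 * the remaining bytes as register, variable, subroutine, type or
 * branch-label operands, depending on the instruction format.
 */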
8283 switch (op) {
8284 case DIF_OP_OR:
8285 case DIF_OP_XOR:
8286 case DIF_OP_AND:
8287 case DIF_OP_SLL:
8288 case DIF_OP_SRL:
8289 case DIF_OP_SRA:
8290 case DIF_OP_SUB:
8291 case DIF_OP_ADD:
8292 case DIF_OP_MUL:
8293 case DIF_OP_SDIV:
8294 case DIF_OP_UDIV:
8295 case DIF_OP_SREM:
8296 case DIF_OP_UREM:
8297 case DIF_OP_COPYS:
8298 if (r1 >= nregs)
8299 err += efunc(pc, "invalid register %u\n", r1);
8300 if (r2 >= nregs)
8301 err += efunc(pc, "invalid register %u\n", r2);
8302 if (rd >= nregs)
8303 err += efunc(pc, "invalid register %u\n", rd);
8304 if (rd == 0)
8305 err += efunc(pc, "cannot write to %r0\n");
8306 break;
8307 case DIF_OP_NOT:
8308 case DIF_OP_MOV:
8309 case DIF_OP_ALLOCS:
8310 if (r1 >= nregs)
8311 err += efunc(pc, "invalid register %u\n", r1);
8312 if (r2 != 0)
8313 err += efunc(pc, "non-zero reserved bits\n");
8314 if (rd >= nregs)
8315 err += efunc(pc, "invalid register %u\n", rd);
8316 if (rd == 0)
8317 err += efunc(pc, "cannot write to %r0\n");
8318 break;
8319 case DIF_OP_LDSB:
8320 case DIF_OP_LDSH:
8321 case DIF_OP_LDSW:
8322 case DIF_OP_LDUB:
8323 case DIF_OP_LDUH:
8324 case DIF_OP_LDUW:
8325 case DIF_OP_LDX:
8326 if (r1 >= nregs)
8327 err += efunc(pc, "invalid register %u\n", r1);
8328 if (r2 != 0)
8329 err += efunc(pc, "non-zero reserved bits\n");
8330 if (rd >= nregs)
8331 err += efunc(pc, "invalid register %u\n", rd);
8332 if (rd == 0)
8333 err += efunc(pc, "cannot write to %r0\n");
8334 if (kcheckload)
8335 dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
8336 DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
8337 break;
8338 case DIF_OP_RLDSB:
8339 case DIF_OP_RLDSH:
8340 case DIF_OP_RLDSW:
8341 case DIF_OP_RLDUB:
8342 case DIF_OP_RLDUH:
8343 case DIF_OP_RLDUW:
8344 case DIF_OP_RLDX:
8345 if (r1 >= nregs)
8346 err += efunc(pc, "invalid register %u\n", r1);
8347 if (r2 != 0)
8348 err += efunc(pc, "non-zero reserved bits\n");
8349 if (rd >= nregs)
8350 err += efunc(pc, "invalid register %u\n", rd);
8351 if (rd == 0)
8352 err += efunc(pc, "cannot write to %r0\n");
8353 break;
8354 case DIF_OP_ULDSB:
8355 case DIF_OP_ULDSH:
8356 case DIF_OP_ULDSW:
8357 case DIF_OP_ULDUB:
8358 case DIF_OP_ULDUH:
8359 case DIF_OP_ULDUW:
8360 case DIF_OP_ULDX:
8361 if (r1 >= nregs)
8362 err += efunc(pc, "invalid register %u\n", r1);
8363 if (r2 != 0)
8364 err += efunc(pc, "non-zero reserved bits\n");
8365 if (rd >= nregs)
8366 err += efunc(pc, "invalid register %u\n", rd);
8367 if (rd == 0)
8368 err += efunc(pc, "cannot write to %r0\n");
8369 break;
8370 case DIF_OP_STB:
8371 case DIF_OP_STH:
8372 case DIF_OP_STW:
8373 case DIF_OP_STX:
8374 if (r1 >= nregs)
8375 err += efunc(pc, "invalid register %u\n", r1);
8376 if (r2 != 0)
8377 err += efunc(pc, "non-zero reserved bits\n");
8378 if (rd >= nregs)
8379 err += efunc(pc, "invalid register %u\n", rd);
8380 if (rd == 0)
8381 err += efunc(pc, "cannot write to 0 address\n");
8382 break;
8383 case DIF_OP_CMP:
8384 case DIF_OP_SCMP:
8385 if (r1 >= nregs)
8386 err += efunc(pc, "invalid register %u\n", r1);
8387 if (r2 >= nregs)
8388 err += efunc(pc, "invalid register %u\n", r2);
8389 if (rd != 0)
8390 err += efunc(pc, "non-zero reserved bits\n");
8391 break;
8392 case DIF_OP_TST:
8393 if (r1 >= nregs)
8394 err += efunc(pc, "invalid register %u\n", r1);
8395 if (r2 != 0 || rd != 0)
8396 err += efunc(pc, "non-zero reserved bits\n");
8397 break;
8398 case DIF_OP_BA:
8399 case DIF_OP_BE:
8400 case DIF_OP_BNE:
8401 case DIF_OP_BG:
8402 case DIF_OP_BGU:
8403 case DIF_OP_BGE:
8404 case DIF_OP_BGEU:
8405 case DIF_OP_BL:
8406 case DIF_OP_BLU:
8407 case DIF_OP_BLE:
8408 case DIF_OP_BLEU:
8409 if (label >= dp->dtdo_len) {
8410 err += efunc(pc, "invalid branch target %u\n",
8411 label);
8412 }
8413 if (label <= pc) {
8414 err += efunc(pc, "backward branch to %u\n",
8415 label);
8416 }
8417 break;
8418 case DIF_OP_RET:
8419 if (r1 != 0 || r2 != 0)
8420 err += efunc(pc, "non-zero reserved bits\n");
8421 if (rd >= nregs)
8422 err += efunc(pc, "invalid register %u\n", rd);
8423 break;
8424 case DIF_OP_NOP:
8425 case DIF_OP_POPTS:
8426 case DIF_OP_FLUSHTS:
8427 if (r1 != 0 || r2 != 0 || rd != 0)
8428 err += efunc(pc, "non-zero reserved bits\n");
8429 break;
8430 case DIF_OP_SETX:
8431 if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
8432 err += efunc(pc, "invalid integer ref %u\n",
8433 DIF_INSTR_INTEGER(instr));
8434 }
8435 if (rd >= nregs)
8436 err += efunc(pc, "invalid register %u\n", rd);
8437 if (rd == 0)
8438 err += efunc(pc, "cannot write to %r0\n");
8439 break;
8440 case DIF_OP_SETS:
8441 if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
8442 err += efunc(pc, "invalid string ref %u\n",
8443 DIF_INSTR_STRING(instr));
8444 }
8445 if (rd >= nregs)
8446 err += efunc(pc, "invalid register %u\n", rd);
8447 if (rd == 0)
8448 err += efunc(pc, "cannot write to %r0\n");
8449 break;
8450 case DIF_OP_LDGA:
8451 case DIF_OP_LDTA:
8452 if (r1 > DIF_VAR_ARRAY_MAX)
8453 err += efunc(pc, "invalid array %u\n", r1);
8454 if (r2 >= nregs)
8455 err += efunc(pc, "invalid register %u\n", r2);
8456 if (rd >= nregs)
8457 err += efunc(pc, "invalid register %u\n", rd);
8458 if (rd == 0)
8459 err += efunc(pc, "cannot write to %r0\n");
8460 break;
8461 case DIF_OP_LDGS:
8462 case DIF_OP_LDTS:
8463 case DIF_OP_LDLS:
8464 case DIF_OP_LDGAA:
8465 case DIF_OP_LDTAA:
8466 if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
8467 err += efunc(pc, "invalid variable %u\n", v);
8468 if (rd >= nregs)
8469 err += efunc(pc, "invalid register %u\n", rd);
8470 if (rd == 0)
8471 err += efunc(pc, "cannot write to %r0\n");
8472 break;
8473 case DIF_OP_STGS:
8474 case DIF_OP_STTS:
8475 case DIF_OP_STLS:
8476 case DIF_OP_STGAA:
8477 case DIF_OP_STTAA:
8478 if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
8479 err += efunc(pc, "invalid variable %u\n", v);
8480 if (rs >= nregs)
8481 err += efunc(pc, "invalid register %u\n", rs);
8482 break;
8483 case DIF_OP_CALL:
8484 if (subr > DIF_SUBR_MAX)
8485 err += efunc(pc, "invalid subr %u\n", subr);
8486 if (rd >= nregs)
8487 err += efunc(pc, "invalid register %u\n", rd);
8488 if (rd == 0)
8489 err += efunc(pc, "cannot write to %r0\n");
8490
8491 if (subr == DIF_SUBR_COPYOUT ||
8492 subr == DIF_SUBR_COPYOUTSTR) {
8493 dp->dtdo_destructive = 1;
8494 }
8495 break;
8496 case DIF_OP_PUSHTR:
8497 if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
8498 err += efunc(pc, "invalid ref type %u\n", type);
8499 if (r2 >= nregs)
8500 err += efunc(pc, "invalid register %u\n", r2);
8501 if (rs >= nregs)
8502 err += efunc(pc, "invalid register %u\n", rs);
8503 break;
8504 case DIF_OP_PUSHTV:
8505 if (type != DIF_TYPE_CTF)
8506 err += efunc(pc, "invalid val type %u\n", type);
8507 if (r2 >= nregs)
8508 err += efunc(pc, "invalid register %u\n", r2);
8509 if (rs >= nregs)
8510 err += efunc(pc, "invalid register %u\n", rs);
8511 break;
8512 default:
8513 err += efunc(pc, "invalid opcode %u\n",
8514 DIF_INSTR_OP(instr));
8515 }
8516 }
8517
8518 if (dp->dtdo_len != 0 &&
8519 DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) {
8520 err += efunc(dp->dtdo_len - 1,
8521 "expected 'ret' as last DIF instruction\n");
8522 }
8523
8524 if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF)) {
8525 /*
8526 * If we're not returning by reference, the size must be either
8527 * 0 or the size of one of the base types.
8528 */
8529 switch (dp->dtdo_rtype.dtdt_size) {
8530 case 0:
8531 case sizeof (uint8_t):
8532 case sizeof (uint16_t):
8533 case sizeof (uint32_t):
8534 case sizeof (uint64_t):
8535 break;
8536
8537 default:
8538 err += efunc(dp->dtdo_len - 1, "bad return size\n");
8539 }
8540 }
8541
8542 for (i = 0; i < dp->dtdo_varlen && err == 0; i++) {
8543 dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL;
8544 dtrace_diftype_t *vt, *et;
8545 uint_t id, ndx;
8546
8547 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&
8548 v->dtdv_scope != DIFV_SCOPE_THREAD &&
8549 v->dtdv_scope != DIFV_SCOPE_LOCAL) {
8550 err += efunc(i, "unrecognized variable scope %d\n",
8551 v->dtdv_scope);
8552 break;
8553 }
8554
8555 if (v->dtdv_kind != DIFV_KIND_ARRAY &&
8556 v->dtdv_kind != DIFV_KIND_SCALAR) {
8557 err += efunc(i, "unrecognized variable type %d\n",
8558 v->dtdv_kind);
8559 break;
8560 }
8561
8562 if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {
8563 err += efunc(i, "%d exceeds variable id limit\n", id);
8564 break;
8565 }
8566
8567 if (id < DIF_VAR_OTHER_UBASE)
8568 continue;
8569
8570 /*
8571 * For user-defined variables, we need to check that this
8572 * definition is identical to any previous definition that we
8573 * encountered.
8574 */
8575 ndx = id - DIF_VAR_OTHER_UBASE;
8576
8577 switch (v->dtdv_scope) {
8578 case DIFV_SCOPE_GLOBAL:
8579 if (VBDTCAST(int64_t)ndx < vstate->dtvs_nglobals) {
8580 dtrace_statvar_t *svar;
8581
8582 if ((svar = vstate->dtvs_globals[ndx]) != NULL)
8583 existing = &svar->dtsv_var;
8584 }
8585
8586 break;
8587
8588 case DIFV_SCOPE_THREAD:
8589 if (VBDTCAST(int64_t)ndx < vstate->dtvs_ntlocals)
8590 existing = &vstate->dtvs_tlocals[ndx];
8591 break;
8592
8593 case DIFV_SCOPE_LOCAL:
8594 if (VBDTCAST(int64_t)ndx < vstate->dtvs_nlocals) {
8595 dtrace_statvar_t *svar;
8596
8597 if ((svar = vstate->dtvs_locals[ndx]) != NULL)
8598 existing = &svar->dtsv_var;
8599 }
8600
8601 break;
8602 }
8603
8604 vt = &v->dtdv_type;
8605
8606 if (vt->dtdt_flags & DIF_TF_BYREF) {
8607 if (vt->dtdt_size == 0) {
8608 err += efunc(i, "zero-sized variable\n");
8609 break;
8610 }
8611
8612 if (v->dtdv_scope == DIFV_SCOPE_GLOBAL &&
8613 vt->dtdt_size > dtrace_global_maxsize) {
8614 err += efunc(i, "oversized by-ref global\n");
8615 break;
8616 }
8617 }
8618
8619 if (existing == NULL || existing->dtdv_id == 0)
8620 continue;
8621
8622 ASSERT(existing->dtdv_id == v->dtdv_id);
8623 ASSERT(existing->dtdv_scope == v->dtdv_scope);
8624
8625 if (existing->dtdv_kind != v->dtdv_kind)
8626 err += efunc(i, "%d changed variable kind\n", id);
8627
8628 et = &existing->dtdv_type;
8629
8630 if (vt->dtdt_flags != et->dtdt_flags) {
8631 err += efunc(i, "%d changed variable type flags\n", id);
8632 break;
8633 }
8634
8635 if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {
8636 err += efunc(i, "%d changed variable type size\n", id);
8637 break;
8638 }
8639 }
8640
8641 return (err);
8642}
8643
8644/*
8645 * Validate a DTrace DIF object that is to be used as a helper. Helpers
8646 * are much more constrained than normal DIFOs. Specifically, they may
8647 * not:
8648 *
8649 * 1. Make calls to subroutines other than copyin(), copyinstr() or
8650 *    miscellaneous string routines.
8651 * 2. Access DTrace variables other than the args[] array and the
8652 *    curthread, pid, ppid, tid, execname, zonename, uid and gid variables.
8653 * 3. Have thread-local variables.
8654 * 4. Have dynamic variables.
8655 */
8656static int
8657dtrace_difo_validate_helper(dtrace_difo_t *dp)
8658{
8659 int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
8660 int err = 0;
8661 uint_t pc;
8662
8663 for (pc = 0; pc < dp->dtdo_len; pc++) {
8664 dif_instr_t instr = dp->dtdo_buf[pc];
8665
8666 uint_t v = DIF_INSTR_VAR(instr);
8667 uint_t subr = DIF_INSTR_SUBR(instr);
8668 uint_t op = DIF_INSTR_OP(instr);
8669
8670 switch (op) {
8671 case DIF_OP_OR:
8672 case DIF_OP_XOR:
8673 case DIF_OP_AND:
8674 case DIF_OP_SLL:
8675 case DIF_OP_SRL:
8676 case DIF_OP_SRA:
8677 case DIF_OP_SUB:
8678 case DIF_OP_ADD:
8679 case DIF_OP_MUL:
8680 case DIF_OP_SDIV:
8681 case DIF_OP_UDIV:
8682 case DIF_OP_SREM:
8683 case DIF_OP_UREM:
8684 case DIF_OP_COPYS:
8685 case DIF_OP_NOT:
8686 case DIF_OP_MOV:
8687 case DIF_OP_RLDSB:
8688 case DIF_OP_RLDSH:
8689 case DIF_OP_RLDSW:
8690 case DIF_OP_RLDUB:
8691 case DIF_OP_RLDUH:
8692 case DIF_OP_RLDUW:
8693 case DIF_OP_RLDX:
8694 case DIF_OP_ULDSB:
8695 case DIF_OP_ULDSH:
8696 case DIF_OP_ULDSW:
8697 case DIF_OP_ULDUB:
8698 case DIF_OP_ULDUH:
8699 case DIF_OP_ULDUW:
8700 case DIF_OP_ULDX:
8701 case DIF_OP_STB:
8702 case DIF_OP_STH:
8703 case DIF_OP_STW:
8704 case DIF_OP_STX:
8705 case DIF_OP_ALLOCS:
8706 case DIF_OP_CMP:
8707 case DIF_OP_SCMP:
8708 case DIF_OP_TST:
8709 case DIF_OP_BA:
8710 case DIF_OP_BE:
8711 case DIF_OP_BNE:
8712 case DIF_OP_BG:
8713 case DIF_OP_BGU:
8714 case DIF_OP_BGE:
8715 case DIF_OP_BGEU:
8716 case DIF_OP_BL:
8717 case DIF_OP_BLU:
8718 case DIF_OP_BLE:
8719 case DIF_OP_BLEU:
8720 case DIF_OP_RET:
8721 case DIF_OP_NOP:
8722 case DIF_OP_POPTS:
8723 case DIF_OP_FLUSHTS:
8724 case DIF_OP_SETX:
8725 case DIF_OP_SETS:
8726 case DIF_OP_LDGA:
8727 case DIF_OP_LDLS:
8728 case DIF_OP_STGS:
8729 case DIF_OP_STLS:
8730 case DIF_OP_PUSHTR:
8731 case DIF_OP_PUSHTV:
8732 break;
8733
8734 case DIF_OP_LDGS:
8735 if (v >= DIF_VAR_OTHER_UBASE)
8736 break;
8737
8738 if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)
8739 break;
8740
8741 if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID ||
8742 v == DIF_VAR_PPID || v == DIF_VAR_TID ||
8743 v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME ||
8744 v == DIF_VAR_UID || v == DIF_VAR_GID)
8745 break;
8746
8747 err += efunc(pc, "illegal variable %u\n", v);
8748 break;
8749
8750 case DIF_OP_LDTA:
8751 case DIF_OP_LDTS:
8752 case DIF_OP_LDGAA:
8753 case DIF_OP_LDTAA:
8754 err += efunc(pc, "illegal dynamic variable load\n");
8755 break;
8756
8757 case DIF_OP_STTS:
8758 case DIF_OP_STGAA:
8759 case DIF_OP_STTAA:
8760 err += efunc(pc, "illegal dynamic variable store\n");
8761 break;
8762
8763 case DIF_OP_CALL:
8764 if (subr == DIF_SUBR_ALLOCA ||
8765 subr == DIF_SUBR_BCOPY ||
8766 subr == DIF_SUBR_COPYIN ||
8767 subr == DIF_SUBR_COPYINTO ||
8768 subr == DIF_SUBR_COPYINSTR ||
8769 subr == DIF_SUBR_INDEX ||
8770 subr == DIF_SUBR_INET_NTOA ||
8771 subr == DIF_SUBR_INET_NTOA6 ||
8772 subr == DIF_SUBR_INET_NTOP ||
8773 subr == DIF_SUBR_LLTOSTR ||
8774 subr == DIF_SUBR_RINDEX ||
8775 subr == DIF_SUBR_STRCHR ||
8776 subr == DIF_SUBR_STRJOIN ||
8777 subr == DIF_SUBR_STRRCHR ||
8778 subr == DIF_SUBR_STRSTR ||
8779 subr == DIF_SUBR_HTONS ||
8780 subr == DIF_SUBR_HTONL ||
8781 subr == DIF_SUBR_HTONLL ||
8782 subr == DIF_SUBR_NTOHS ||
8783 subr == DIF_SUBR_NTOHL ||
8784 subr == DIF_SUBR_NTOHLL)
8785 break;
8786
8787 err += efunc(pc, "invalid subr %u\n", subr);
8788 break;
8789
8790 default:
8791 err += efunc(pc, "invalid opcode %u\n",
8792 DIF_INSTR_OP(instr));
8793 }
8794 }
8795
8796 return (err);
8797}
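
/*
 * Illustrative note (an assumption drawn from the checks above, not
 * original text): a helper action that merely calls copyinstr() on one
 * of its arguments and consults execname passes this validation, whereas
 * one that touches a thread-local or an associative array is rejected by
 * the dynamic-variable load/store cases.
 */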
8798
8799/*
8800 * Returns 1 if the expression in the DIF object can be cached on a per-thread
8801 * basis; 0 if not.
8802 */
8803static int
8804dtrace_difo_cacheable(dtrace_difo_t *dp)
8805{
8806 VBDTTYPE(uint_t,int) i;
8807
8808 if (dp == NULL)
8809 return (0);
8810
8811 for (i = 0; i < dp->dtdo_varlen; i++) {
8812 dtrace_difv_t *v = &dp->dtdo_vartab[i];
8813
8814 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
8815 continue;
8816
8817 switch (v->dtdv_id) {
8818 case DIF_VAR_CURTHREAD:
8819 case DIF_VAR_PID:
8820 case DIF_VAR_TID:
8821 case DIF_VAR_EXECNAME:
8822 case DIF_VAR_ZONENAME:
8823 break;
8824
8825 default:
8826 return (0);
8827 }
8828 }
8829
8830 /*
8831 * This DIF object may be cacheable. Now we need to look for any
8832 * array loading instructions, any memory loading instructions, or
8833 * any stores to thread-local variables.
8834 */
8835 for (i = 0; i < dp->dtdo_len; i++) {
8836 uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);
8837
8838 if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||
8839 (op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||
8840 (op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||
8841 op == DIF_OP_LDGA || op == DIF_OP_STTS)
8842 return (0);
8843 }
8844
8845 return (1);
8846}
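
/*
 * Illustrative note (an assumption drawn from the checks above, not
 * original text): a predicate like pid == $target compiles to a variable
 * load plus a compare and is therefore cacheable per-thread, while
 * anything that dereferences memory -- loads of kernel or user data, or
 * stores to thread-locals -- may change between firings on the same
 * thread and so defeats the cache.
 */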
8847
8848static void
8849dtrace_difo_hold(dtrace_difo_t *dp)
8850{
8851#ifndef VBOX
8852 VBDTTYPE(uint_t,int) i;
8853#endif
8854
8855 ASSERT(MUTEX_HELD(&dtrace_lock));
8856
8857 dp->dtdo_refcnt++;
8858 ASSERT(dp->dtdo_refcnt != 0);
8859
8860#ifndef VBOX
8861 /*
8862 * We need to check this DIF object for references to the variable
8863 * DIF_VAR_VTIMESTAMP.
8864 */
8865 for (i = 0; i < dp->dtdo_varlen; i++) {
8866 dtrace_difv_t *v = &dp->dtdo_vartab[i];
8867
8868 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
8869 continue;
8870
8871 if (dtrace_vtime_references++ == 0)
8872 dtrace_vtime_enable();
8873 }
8874#endif
8875}
8876
8877/*
8878 * This routine calculates the dynamic variable chunksize for a given DIF
8879 * object. The calculation is not fool-proof, and can probably be tricked by
8880 * malicious DIF -- but it works for all compiler-generated DIF. Because this
8881 * calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
8882 * if a dynamic variable size exceeds the chunksize.
8883 */
8884static void
8885dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
8886{
8887 uint64_t sval VBDTGCC(0);
8888 dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
8889 const dif_instr_t *text = dp->dtdo_buf;
8890 uint_t pc, srd = 0;
8891 uint_t ttop = 0;
8892 size_t size, ksize;
8893 uint_t id, i;
8894
8895 for (pc = 0; pc < dp->dtdo_len; pc++) {
8896 dif_instr_t instr = text[pc];
8897 uint_t op = DIF_INSTR_OP(instr);
8898 uint_t rd = DIF_INSTR_RD(instr);
8899 uint_t r1 = DIF_INSTR_R1(instr);
8900 uint_t nkeys = 0;
8901 uchar_t scope VBDTGCC(0);
8902
8903 dtrace_key_t *key = tupregs;
8904
8905 switch (op) {
8906 case DIF_OP_SETX:
8907 sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
8908 srd = rd;
8909 continue;
8910
8911 case DIF_OP_STTS:
8912 key = &tupregs[DIF_DTR_NREGS];
8913 key[0].dttk_size = 0;
8914 key[1].dttk_size = 0;
8915 nkeys = 2;
8916 scope = DIFV_SCOPE_THREAD;
8917 break;
8918
8919 case DIF_OP_STGAA:
8920 case DIF_OP_STTAA:
8921 nkeys = ttop;
8922
8923 if (DIF_INSTR_OP(instr) == DIF_OP_STTAA)
8924 key[nkeys++].dttk_size = 0;
8925
8926 key[nkeys++].dttk_size = 0;
8927
8928 if (op == DIF_OP_STTAA) {
8929 scope = DIFV_SCOPE_THREAD;
8930 } else {
8931 scope = DIFV_SCOPE_GLOBAL;
8932 }
8933
8934 break;
8935
8936 case DIF_OP_PUSHTR:
8937 if (ttop == DIF_DTR_NREGS)
8938 return;
8939
8940 if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {
8941 /*
8942 * If the register for the size of the "pushtr"
8943 * is %r0 (or the value is 0) and the type is
8944 * a string, we'll use the system-wide default
8945 * string size.
8946 */
8947 tupregs[ttop++].dttk_size =
8948 dtrace_strsize_default;
8949 } else {
8950 if (srd == 0)
8951 return;
8952
8953 tupregs[ttop++].dttk_size = sval;
8954 }
8955
8956 break;
8957
8958 case DIF_OP_PUSHTV:
8959 if (ttop == DIF_DTR_NREGS)
8960 return;
8961
8962 tupregs[ttop++].dttk_size = 0;
8963 break;
8964
8965 case DIF_OP_FLUSHTS:
8966 ttop = 0;
8967 break;
8968
8969 case DIF_OP_POPTS:
8970 if (ttop != 0)
8971 ttop--;
8972 break;
8973 }
8974
8975 sval = 0;
8976 srd = 0;
8977
8978 if (nkeys == 0)
8979 continue;
8980
8981 /*
8982 * We have a dynamic variable allocation; calculate its size.
8983 */
8984 for (ksize = 0, i = 0; i < nkeys; i++)
8985 ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
8986
8987 size = sizeof (dtrace_dynvar_t);
8988 size += sizeof (dtrace_key_t) * (nkeys - 1);
8989 size += ksize;
8990
8991 /*
8992 * Now we need to determine the size of the stored data.
8993 */
8994 id = DIF_INSTR_VAR(instr);
8995
8996 for (i = 0; i < dp->dtdo_varlen; i++) {
8997 dtrace_difv_t *v = &dp->dtdo_vartab[i];
8998
8999 if (v->dtdv_id == id && v->dtdv_scope == scope) {
9000 size += v->dtdv_type.dtdt_size;
9001 break;
9002 }
9003 }
9004
9005 if (i == dp->dtdo_varlen)
9006 return;
9007
9008 /*
9009 * We have the size. If this is larger than the chunk size
9010 * for our dynamic variable state, reset the chunk size.
9011 */
9012 size = P2ROUNDUP(size, sizeof (uint64_t));
9013
9014 if (size > vstate->dtvs_dynvars.dtds_chunksize)
9015 vstate->dtvs_dynvars.dtds_chunksize = size;
9016 }
9017}
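
/*
 * Worked example (illustrative, assuming the stock 256-byte
 * dtrace_strsize_default): storing an 8-byte value into a global
 * associative array keyed by one string yields nkeys = 2 (the pushed
 * string key plus the zero-sized key added for DIF_OP_STGAA), so
 *
 *	size = sizeof (dtrace_dynvar_t) + 1 * sizeof (dtrace_key_t)
 *	     + P2ROUNDUP(256, 8) + 8,
 *
 * rounded up to a multiple of 8; the chunk size grows to cover it if it
 * was previously smaller.
 */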
9018
9019static void
9020dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9021{
9022#ifndef VBOX
9023 int i, oldsvars, osz, nsz, otlocals, ntlocals;
9024#else
9025 int oldsvars, osz, nsz, otlocals, ntlocals;
9026 uint_t i;
9027#endif
9028 uint_t id;
9029
9030 ASSERT(MUTEX_HELD(&dtrace_lock));
9031 ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);
9032
9033 for (i = 0; i < dp->dtdo_varlen; i++) {
9034 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9035 dtrace_statvar_t *svar, ***svarp;
9036 size_t dsize = 0;
9037 uint8_t scope = v->dtdv_scope;
9038 int *np;
9039
9040 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
9041 continue;
9042
9043 id -= DIF_VAR_OTHER_UBASE;
9044
9045 switch (scope) {
9046 case DIFV_SCOPE_THREAD:
9047 while (VBDTCAST(int64_t)id >= (otlocals = vstate->dtvs_ntlocals)) {
9048 dtrace_difv_t *tlocals;
9049
9050 if ((ntlocals = (otlocals << 1)) == 0)
9051 ntlocals = 1;
9052
9053 osz = otlocals * sizeof (dtrace_difv_t);
9054 nsz = ntlocals * sizeof (dtrace_difv_t);
9055
9056 tlocals = kmem_zalloc(nsz, KM_SLEEP);
9057
9058 if (osz != 0) {
9059 bcopy(vstate->dtvs_tlocals,
9060 tlocals, osz);
9061 kmem_free(vstate->dtvs_tlocals, osz);
9062 }
9063
9064 vstate->dtvs_tlocals = tlocals;
9065 vstate->dtvs_ntlocals = ntlocals;
9066 }
9067
9068 vstate->dtvs_tlocals[id] = *v;
9069 continue;
9070
9071 case DIFV_SCOPE_LOCAL:
9072 np = &vstate->dtvs_nlocals;
9073 svarp = &vstate->dtvs_locals;
9074
9075 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
9076 dsize = NCPU * (v->dtdv_type.dtdt_size +
9077 sizeof (uint64_t));
9078 else
9079 dsize = NCPU * sizeof (uint64_t);
9080
9081 break;
9082
9083 case DIFV_SCOPE_GLOBAL:
9084 np = &vstate->dtvs_nglobals;
9085 svarp = &vstate->dtvs_globals;
9086
9087 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
9088 dsize = v->dtdv_type.dtdt_size +
9089 sizeof (uint64_t);
9090
9091 break;
9092
9093 default:
9094#ifndef VBOX
9095 ASSERT(0);
9096#else
9097 AssertFatalMsgFailed(("%d\n", scope));
9098#endif
9099 }
9100
9101 while (VBDTCAST(int64_t)id >= (oldsvars = *np)) {
9102 dtrace_statvar_t **statics;
9103 int newsvars, oldsize, newsize;
9104
9105 if ((newsvars = (oldsvars << 1)) == 0)
9106 newsvars = 1;
9107
9108 oldsize = oldsvars * sizeof (dtrace_statvar_t *);
9109 newsize = newsvars * sizeof (dtrace_statvar_t *);
9110
9111 statics = kmem_zalloc(newsize, KM_SLEEP);
9112
9113 if (oldsize != 0) {
9114 bcopy(*svarp, statics, oldsize);
9115 kmem_free(*svarp, oldsize);
9116 }
9117
9118 *svarp = statics;
9119 *np = newsvars;
9120 }
9121
9122 if ((svar = (*svarp)[id]) == NULL) {
9123 svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
9124 svar->dtsv_var = *v;
9125
9126 if ((svar->dtsv_size = dsize) != 0) {
9127 svar->dtsv_data = (uint64_t)(uintptr_t)
9128 kmem_zalloc(dsize, KM_SLEEP);
9129 }
9130
9131 (*svarp)[id] = svar;
9132 }
9133
9134 svar->dtsv_refcnt++;
9135 }
9136
9137 dtrace_difo_chunksize(dp, vstate);
9138 dtrace_difo_hold(dp);
9139}
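
/*
 * Illustrative note (arithmetic only, not original text): the doubling
 * loops above size the tables so that ids can index them directly.
 * Referencing, say, thread-local id DIF_VAR_OTHER_UBASE + 5 for the
 * first time grows dtvs_ntlocals 0 -> 1 -> 2 -> 4 -> 8, stopping once a
 * slot for index 5 exists.
 */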
9140
9141static dtrace_difo_t *
9142dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9143{
9144 dtrace_difo_t *new;
9145 size_t sz;
9146
9147 ASSERT(dp->dtdo_buf != NULL);
9148 ASSERT(dp->dtdo_refcnt != 0);
9149
9150 new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
9151
9152 ASSERT(dp->dtdo_buf != NULL);
9153 sz = dp->dtdo_len * sizeof (dif_instr_t);
9154 new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
9155 bcopy(dp->dtdo_buf, new->dtdo_buf, sz);
9156 new->dtdo_len = dp->dtdo_len;
9157
9158 if (dp->dtdo_strtab != NULL) {
9159 ASSERT(dp->dtdo_strlen != 0);
9160 new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
9161 bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);
9162 new->dtdo_strlen = dp->dtdo_strlen;
9163 }
9164
9165 if (dp->dtdo_inttab != NULL) {
9166 ASSERT(dp->dtdo_intlen != 0);
9167 sz = dp->dtdo_intlen * sizeof (uint64_t);
9168 new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
9169 bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);
9170 new->dtdo_intlen = dp->dtdo_intlen;
9171 }
9172
9173 if (dp->dtdo_vartab != NULL) {
9174 ASSERT(dp->dtdo_varlen != 0);
9175 sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
9176 new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
9177 bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);
9178 new->dtdo_varlen = dp->dtdo_varlen;
9179 }
9180
9181 dtrace_difo_init(new, vstate);
9182 return (new);
9183}
9184
9185static void
9186dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9187{
9188 VBDTTYPE(uint_t,int) i;
9189
9190 ASSERT(dp->dtdo_refcnt == 0);
9191
9192 for (i = 0; i < dp->dtdo_varlen; i++) {
9193 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9194 dtrace_statvar_t *svar, **svarp;
9195 uint_t id;
9196 uint8_t scope = v->dtdv_scope;
9197 int *np;
9198
9199 switch (scope) {
9200 case DIFV_SCOPE_THREAD:
9201 continue;
9202
9203 case DIFV_SCOPE_LOCAL:
9204 np = &vstate->dtvs_nlocals;
9205 svarp = vstate->dtvs_locals;
9206 break;
9207
9208 case DIFV_SCOPE_GLOBAL:
9209 np = &vstate->dtvs_nglobals;
9210 svarp = vstate->dtvs_globals;
9211 break;
9212
9213 default:
9214#ifndef VBOX
9215 ASSERT(0);
9216#else
9217 AssertFatalMsgFailed(("%d\n", scope));
9218#endif
9219 }
9220
9221 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
9222 continue;
9223
9224 id -= DIF_VAR_OTHER_UBASE;
9225 ASSERT(VBDTCAST(int64_t)id < *np);
9226
9227 svar = svarp[id];
9228 ASSERT(svar != NULL);
9229 ASSERT(svar->dtsv_refcnt > 0);
9230
9231 if (--svar->dtsv_refcnt > 0)
9232 continue;
9233
9234 if (svar->dtsv_size != 0) {
9235 ASSERT(svar->dtsv_data != NULL);
9236 kmem_free((void *)(uintptr_t)svar->dtsv_data,
9237 svar->dtsv_size);
9238 }
9239
9240 kmem_free(svar, sizeof (dtrace_statvar_t));
9241 svarp[id] = NULL;
9242 }
9243
9244 kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
9245 kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
9246 kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
9247 kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
9248
9249 kmem_free(dp, sizeof (dtrace_difo_t));
9250}
9251
9252static void
9253dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9254{
9255#ifndef VBOX
9256 VBDTTYPE(uint_t,int) i;
9257#endif
9258
9259 ASSERT(MUTEX_HELD(&dtrace_lock));
9260 ASSERT(dp->dtdo_refcnt != 0);
9261
9262#ifndef VBOX
9263 for (i = 0; i < dp->dtdo_varlen; i++) {
9264 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9265
9266 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
9267 continue;
9268
9269 ASSERT(dtrace_vtime_references > 0);
9270 if (--dtrace_vtime_references == 0)
9271 dtrace_vtime_disable();
9272 }
9273#endif
9274
9275 if (--dp->dtdo_refcnt == 0)
9276 dtrace_difo_destroy(dp, vstate);
9277}
9278
9279/*
9280 * DTrace Format Functions
9281 */
9282static uint16_t
9283dtrace_format_add(dtrace_state_t *state, char *str)
9284{
9285 char *fmt, **new;
9286 uint16_t ndx, len = VBDTCAST(uint16_t)strlen(str) + 1;
9287
9288 fmt = kmem_zalloc(len, KM_SLEEP);
9289 bcopy(str, fmt, len);
9290
9291 for (ndx = 0; ndx < state->dts_nformats; ndx++) {
9292 if (state->dts_formats[ndx] == NULL) {
9293 state->dts_formats[ndx] = fmt;
9294 return (ndx + 1);
9295 }
9296 }
9297
9298 if (state->dts_nformats == USHRT_MAX) {
9299 /*
9300 * This is only likely if a denial-of-service attack is being
9301 * attempted. As such, it's okay to fail silently here.
9302 */
9303 kmem_free(fmt, len);
9304 return (0);
9305 }
9306
9307 /*
9308 * For simplicity, we always resize the formats array to be exactly the
9309 * number of formats.
9310 */
9311 ndx = state->dts_nformats++;
9312 new = kmem_alloc((ndx + 1) * sizeof (char *), KM_SLEEP);
9313
9314 if (state->dts_formats != NULL) {
9315 ASSERT(ndx != 0);
9316 bcopy(state->dts_formats, new, ndx * sizeof (char *));
9317 kmem_free(state->dts_formats, ndx * sizeof (char *));
9318 }
9319
9320 state->dts_formats = new;
9321 state->dts_formats[ndx] = fmt;
9322
9323 return (ndx + 1);
9324}
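
/*
 * Illustrative note (follows from the code above, not original text):
 * format handles are 1-based -- 0 is reserved to mean "no format" or
 * failure -- so consumers recover the string with
 * state->dts_formats[format - 1], exactly as dtrace_format_remove()
 * does below.
 */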
9325
9326static void
9327dtrace_format_remove(dtrace_state_t *state, uint16_t format)
9328{
9329 char *fmt;
9330
9331 ASSERT(state->dts_formats != NULL);
9332 ASSERT(format <= state->dts_nformats);
9333 ASSERT(state->dts_formats[format - 1] != NULL);
9334
9335 fmt = state->dts_formats[format - 1];
9336 kmem_free(fmt, strlen(fmt) + 1);
9337 state->dts_formats[format - 1] = NULL;
9338}
9339
9340static void
9341dtrace_format_destroy(dtrace_state_t *state)
9342{
9343 int i;
9344
9345 if (state->dts_nformats == 0) {
9346 ASSERT(state->dts_formats == NULL);
9347 return;
9348 }
9349
9350 ASSERT(state->dts_formats != NULL);
9351
9352 for (i = 0; i < state->dts_nformats; i++) {
9353 char *fmt = state->dts_formats[i];
9354
9355 if (fmt == NULL)
9356 continue;
9357
9358 kmem_free(fmt, strlen(fmt) + 1);
9359 }
9360
9361 kmem_free(state->dts_formats, state->dts_nformats * sizeof (char *));
9362 state->dts_nformats = 0;
9363 state->dts_formats = NULL;
9364}
9365
9366/*
9367 * DTrace Predicate Functions
9368 */
9369static dtrace_predicate_t *
9370dtrace_predicate_create(dtrace_difo_t *dp)
9371{
9372 dtrace_predicate_t *pred;
9373
9374 ASSERT(MUTEX_HELD(&dtrace_lock));
9375 ASSERT(dp->dtdo_refcnt != 0);
9376
9377 pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
9378 pred->dtp_difo = dp;
9379 pred->dtp_refcnt = 1;
9380
9381 if (!dtrace_difo_cacheable(dp))
9382 return (pred);
9383
9384 if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
9385 /*
9386 * This is only theoretically possible -- we have had 2^32
9387 * cacheable predicates on this machine. We cannot allow any
9388 * more predicates to become cacheable: as unlikely as it is,
9389 * there may be a thread caching a (now stale) predicate cache
9390 * ID. (N.B.: the temptation is being successfully resisted to
9391 * have this cmn_err() "Holy shit -- we executed this code!")
9392 */
9393 return (pred);
9394 }
9395
9396 pred->dtp_cacheid = dtrace_predcache_id++;
9397
9398 return (pred);
9399}
9400
9401static void
9402dtrace_predicate_hold(dtrace_predicate_t *pred)
9403{
9404 ASSERT(MUTEX_HELD(&dtrace_lock));
9405 ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
9406 ASSERT(pred->dtp_refcnt > 0);
9407
9408 pred->dtp_refcnt++;
9409}
9410
9411static void
9412dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
9413{
9414 dtrace_difo_t *dp = pred->dtp_difo;
9415
9416 ASSERT(MUTEX_HELD(&dtrace_lock));
9417 ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
9418 ASSERT(pred->dtp_refcnt > 0);
9419
9420 if (--pred->dtp_refcnt == 0) {
9421 dtrace_difo_release(pred->dtp_difo, vstate);
9422 kmem_free(pred, sizeof (dtrace_predicate_t));
9423 }
9424}
9425
9426/*
9427 * DTrace Action Description Functions
9428 */
9429static dtrace_actdesc_t *
9430dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
9431 uint64_t uarg, uint64_t arg)
9432{
9433 dtrace_actdesc_t *act;
9434
9435 ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != NULL &&
9436 VBDT_IS_VALID_KRNL_ADDR(arg)) || (arg == NULL && kind == DTRACEACT_PRINTA));
9437
9438 act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
9439 act->dtad_kind = kind;
9440 act->dtad_ntuple = ntuple;
9441 act->dtad_uarg = uarg;
9442 act->dtad_arg = arg;
9443 act->dtad_refcnt = 1;
9444
9445 return (act);
9446}
9447
9448static void
9449dtrace_actdesc_hold(dtrace_actdesc_t *act)
9450{
9451 ASSERT(act->dtad_refcnt >= 1);
9452 act->dtad_refcnt++;
9453}
9454
9455static void
9456dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)
9457{
9458 dtrace_actkind_t kind = act->dtad_kind;
9459 dtrace_difo_t *dp;
9460
9461 ASSERT(act->dtad_refcnt >= 1);
9462
9463 if (--act->dtad_refcnt != 0)
9464 return;
9465
9466 if ((dp = act->dtad_difo) != NULL)
9467 dtrace_difo_release(dp, vstate);
9468
9469 if (DTRACEACT_ISPRINTFLIKE(kind)) {
9470 char *str = (char *)(uintptr_t)act->dtad_arg;
9471
9472 ASSERT((str != NULL && VBDT_IS_VALID_KRNL_ADDR((uintptr_t)str)) ||
9473 (str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
9474
9475 if (str != NULL)
9476 kmem_free(str, strlen(str) + 1);
9477 }
9478
9479 kmem_free(act, sizeof (dtrace_actdesc_t));
9480}
9481
9482/*
9483 * DTrace ECB Functions
9484 */
9485static dtrace_ecb_t *
9486dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
9487{
9488 dtrace_ecb_t *ecb;
9489 dtrace_epid_t epid;
9490
9491 ASSERT(MUTEX_HELD(&dtrace_lock));
9492
9493 ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
9494 ecb->dte_predicate = NULL;
9495 ecb->dte_probe = probe;
9496
9497 /*
9498 * The default size is the size of the default action: recording
9499 * the epid.
9500 */
9501 ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
9502 ecb->dte_alignment = sizeof (dtrace_epid_t);
9503
9504 epid = state->dts_epid++;
9505
9506 if (VBDTCAST(int64_t)epid - 1 >= state->dts_necbs) {
9507 dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;
9508 int necbs = state->dts_necbs << 1;
9509
9510 ASSERT(epid == VBDTCAST(dtrace_epid_t)state->dts_necbs + 1);
9511
9512 if (necbs == 0) {
9513 ASSERT(oecbs == NULL);
9514 necbs = 1;
9515 }
9516
9517 ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);
9518
9519 if (oecbs != NULL)
9520 bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));
9521
9522 dtrace_membar_producer();
9523 state->dts_ecbs = ecbs;
9524
9525 if (oecbs != NULL) {
9526 /*
9527 * If this state is active, we must dtrace_sync()
9528 * before we can free the old dts_ecbs array: we're
9529 * coming in hot, and there may be active ring
9530 * buffer processing (which indexes into the dts_ecbs
9531 * array) on another CPU.
9532 */
9533 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
9534 dtrace_sync();
9535
9536 kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
9537 }
9538
9539 dtrace_membar_producer();
9540 state->dts_necbs = necbs;
9541 }
9542
9543 ecb->dte_state = state;
9544
9545 ASSERT(state->dts_ecbs[epid - 1] == NULL);
9546 dtrace_membar_producer();
9547 state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;
9548
9549 return (ecb);
9550}
9551
9552static int
9553dtrace_ecb_enable(dtrace_ecb_t *ecb)
9554{
9555 dtrace_probe_t *probe = ecb->dte_probe;
9556
9557 ASSERT(MUTEX_HELD(&cpu_lock));
9558 ASSERT(MUTEX_HELD(&dtrace_lock));
9559 ASSERT(ecb->dte_next == NULL);
9560
9561 if (probe == NULL) {
9562 /*
9563 * This is the NULL probe -- there's nothing to do.
9564 */
9565 return (0);
9566 }
9567
9568 if (probe->dtpr_ecb == NULL) {
9569 dtrace_provider_t *prov = probe->dtpr_provider;
9570
9571 /*
9572 * We're the first ECB on this probe.
9573 */
9574 probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;
9575
9576 if (ecb->dte_predicate != NULL)
9577 probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;
9578
9579 return (prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
9580 probe->dtpr_id, probe->dtpr_arg));
9581 } else {
9582 /*
9583 * This probe is already active. Swing the last pointer to
9584 * point to the new ECB, and issue a dtrace_sync() to assure
9585 * that all CPUs have seen the change.
9586 */
9587 ASSERT(probe->dtpr_ecb_last != NULL);
9588 probe->dtpr_ecb_last->dte_next = ecb;
9589 probe->dtpr_ecb_last = ecb;
9590 probe->dtpr_predcache = 0;
9591
9592 dtrace_sync();
9593 return (0);
9594 }
9595}
9596
9597static void
9598dtrace_ecb_resize(dtrace_ecb_t *ecb)
9599{
9600 uint32_t maxalign = sizeof (dtrace_epid_t);
9601 uint32_t align = sizeof (uint8_t), offs, diff;
9602 dtrace_action_t *act;
9603 int wastuple = 0;
9604 uint32_t aggbase = UINT32_MAX;
9605 dtrace_state_t *state = ecb->dte_state;
9606
9607 /*
9608 * If we record anything, we always record the epid. (And we always
9609 * record it first.)
9610 */
9611 offs = sizeof (dtrace_epid_t);
9612 ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
9613
9614 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
9615 dtrace_recdesc_t *rec = &act->dta_rec;
9616
9617 if ((align = rec->dtrd_alignment) > maxalign)
9618 maxalign = align;
9619
9620 if (!wastuple && act->dta_intuple) {
9621 /*
9622 * This is the first record in a tuple. Align the
9623 * offset to be at offset 4 in an 8-byte aligned
9624 * block.
9625 */
9626 diff = offs + sizeof (dtrace_aggid_t);
9627
9628 if ((diff = (diff & (sizeof (uint64_t) - 1))))
9629 offs += sizeof (uint64_t) - diff;
9630
9631 aggbase = offs - sizeof (dtrace_aggid_t);
9632 ASSERT(!(aggbase & (sizeof (uint64_t) - 1)));
9633 }
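
/*
 * Worked example (illustrative, assuming a 4-byte dtrace_aggid_t as
 * the comment above does): with offs = 10, diff = 14 and 14 & 7 = 6,
 * so offs is bumped to 12 and aggbase becomes 8 -- the aggregation ID
 * lands on an 8-byte boundary immediately ahead of the tuple keys.
 */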
9634
9635 /*LINTED*/
9636 if (rec->dtrd_size != 0 && (diff = (offs & (align - 1)))) {
9637 /*
9638 * The current offset is not properly aligned; align it.
9639 */
9640 offs += align - diff;
9641 }
9642
9643 rec->dtrd_offset = offs;
9644
9645 if (offs + rec->dtrd_size > ecb->dte_needed) {
9646 ecb->dte_needed = offs + rec->dtrd_size;
9647
9648 if (ecb->dte_needed > state->dts_needed)
9649 state->dts_needed = ecb->dte_needed;
9650 }
9651
9652 if (DTRACEACT_ISAGG(act->dta_kind)) {
9653 dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
9654 dtrace_action_t *first = agg->dtag_first, *prev;
9655
9656 ASSERT(rec->dtrd_size != 0 && first != NULL);
9657 ASSERT(wastuple);
9658 ASSERT(aggbase != UINT32_MAX);
9659
9660 agg->dtag_base = aggbase;
9661
9662 while ((prev = first->dta_prev) != NULL &&
9663 DTRACEACT_ISAGG(prev->dta_kind)) {
9664 agg = (dtrace_aggregation_t *)prev;
9665 first = agg->dtag_first;
9666 }
9667
9668 if (prev != NULL) {
9669 offs = prev->dta_rec.dtrd_offset +
9670 prev->dta_rec.dtrd_size;
9671 } else {
9672 offs = sizeof (dtrace_epid_t);
9673 }
9674 wastuple = 0;
9675 } else {
9676 if (!act->dta_intuple)
9677 ecb->dte_size = offs + rec->dtrd_size;
9678
9679 offs += rec->dtrd_size;
9680 }
9681
9682 wastuple = act->dta_intuple;
9683 }
9684
9685 if ((act = ecb->dte_action) != NULL &&
9686 !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
9687 ecb->dte_size == sizeof (dtrace_epid_t)) {
9688 /*
9689 * If the size is still sizeof (dtrace_epid_t), then all
9690 * actions store no data; set the size to 0.
9691 */
9692 ecb->dte_alignment = maxalign;
9693 ecb->dte_size = 0;
9694
9695 /*
9696 * If the needed space is still sizeof (dtrace_epid_t), then
9697 * all actions need no additional space; set the needed
9698 * size to 0.
9699 */
9700 if (ecb->dte_needed == sizeof (dtrace_epid_t))
9701 ecb->dte_needed = 0;
9702
9703 return;
9704 }
9705
9706 /*
9707 * Set our alignment, and make sure that the dte_size and dte_needed
9708 * are aligned to the size of an EPID.
9709 */
9710 ecb->dte_alignment = maxalign;
9711 ecb->dte_size = (ecb->dte_size + (sizeof (dtrace_epid_t) - 1)) &
9712 ~(sizeof (dtrace_epid_t) - 1);
9713 ecb->dte_needed = (ecb->dte_needed + (sizeof (dtrace_epid_t) - 1)) &
9714 ~(sizeof (dtrace_epid_t) - 1);
9715 ASSERT(ecb->dte_size <= ecb->dte_needed);
9716}
9717
9718static dtrace_action_t *
9719dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
9720{
9721 dtrace_aggregation_t *agg;
9722 size_t size = sizeof (uint64_t);
9723 int ntuple = desc->dtad_ntuple;
9724 dtrace_action_t *act;
9725 dtrace_recdesc_t *frec;
9726 dtrace_aggid_t aggid;
9727 dtrace_state_t *state = ecb->dte_state;
9728
9729 agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
9730 agg->dtag_ecb = ecb;
9731
9732 ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));
9733
9734 switch (desc->dtad_kind) {
9735 case DTRACEAGG_MIN:
9736 agg->dtag_initial = INT64_MAX;
9737 agg->dtag_aggregate = dtrace_aggregate_min;
9738 break;
9739
9740 case DTRACEAGG_MAX:
9741 agg->dtag_initial = (uint64_t)INT64_MIN;
9742 agg->dtag_aggregate = dtrace_aggregate_max;
9743 break;
9744
9745 case DTRACEAGG_COUNT:
9746 agg->dtag_aggregate = dtrace_aggregate_count;
9747 break;
9748
9749 case DTRACEAGG_QUANTIZE:
9750 agg->dtag_aggregate = dtrace_aggregate_quantize;
9751 size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *
9752 sizeof (uint64_t);
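/*
 * Illustrative arithmetic: with 64-bit cells and NBBY == 8 this is
 * ((64 - 1) * 2 + 1) = 127 buckets -- one per power of two, positive
 * and negative, plus one for zero -- i.e. 1016 bytes per aggregation
 * key.
 */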
9753 break;
9754
9755 case DTRACEAGG_LQUANTIZE: {
9756 uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
9757 uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);
9758
9759 agg->dtag_initial = desc->dtad_arg;
9760 agg->dtag_aggregate = dtrace_aggregate_lquantize;
9761
9762 if (step == 0 || levels == 0)
9763 goto err;
9764
9765 size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);
9766 break;
9767 }
9768
9769 case DTRACEAGG_AVG:
9770 agg->dtag_aggregate = dtrace_aggregate_avg;
9771 size = sizeof (uint64_t) * 2;
9772 break;
9773
9774 case DTRACEAGG_STDDEV:
9775 agg->dtag_aggregate = dtrace_aggregate_stddev;
9776 size = sizeof (uint64_t) * 4;
9777 break;
9778
9779 case DTRACEAGG_SUM:
9780 agg->dtag_aggregate = dtrace_aggregate_sum;
9781 break;
9782
9783 default:
9784 goto err;
9785 }
9786
9787 agg->dtag_action.dta_rec.dtrd_size = VBDTCAST(uint32_t)size;
9788
9789 if (ntuple == 0)
9790 goto err;
9791
9792 /*
9793 * We must make sure that we have enough actions for the n-tuple.
9794 */
9795 for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
9796 if (DTRACEACT_ISAGG(act->dta_kind))
9797 break;
9798
9799 if (--ntuple == 0) {
9800 /*
9801 * This is the action with which our n-tuple begins.
9802 */
9803 agg->dtag_first = act;
9804 goto success;
9805 }
9806 }
9807
9808 /*
9809 * This n-tuple is short by ntuple elements. Return failure.
9810 */
9811 ASSERT(ntuple != 0);
9812err:
9813 kmem_free(agg, sizeof (dtrace_aggregation_t));
9814 return (NULL);
9815
9816success:
9817 /*
9818 * If the last action in the tuple has a size of zero, it's actually
9819 * an expression argument for the aggregating action.
9820 */
9821 ASSERT(ecb->dte_action_last != NULL);
9822 act = ecb->dte_action_last;
9823
9824 if (act->dta_kind == DTRACEACT_DIFEXPR) {
9825 ASSERT(act->dta_difo != NULL);
9826
9827 if (act->dta_difo->dtdo_rtype.dtdt_size == 0)
9828 agg->dtag_hasarg = 1;
9829 }
9830
9831 /*
9832 * We need to allocate an id for this aggregation.
9833 */
9834 aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
9835 VM_BESTFIT | VM_SLEEP);
9836
9837 if (VBDTCAST(int64_t)aggid - 1 >= state->dts_naggregations) {
9838 dtrace_aggregation_t **oaggs = state->dts_aggregations;
9839 dtrace_aggregation_t **aggs;
9840 int naggs = state->dts_naggregations << 1;
9841 int onaggs = state->dts_naggregations;
9842
9843 ASSERT(aggid == VBDTCAST(dtrace_aggid_t)state->dts_naggregations + 1);
9844
9845 if (naggs == 0) {
9846 ASSERT(oaggs == NULL);
9847 naggs = 1;
9848 }
9849
9850 aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);
9851
9852 if (oaggs != NULL) {
9853 bcopy(oaggs, aggs, onaggs * sizeof (*aggs));
9854 kmem_free(oaggs, onaggs * sizeof (*aggs));
9855 }
9856
9857 state->dts_aggregations = aggs;
9858 state->dts_naggregations = naggs;
9859 }
9860
9861 ASSERT(state->dts_aggregations[aggid - 1] == NULL);
9862 state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;
9863
9864 frec = &agg->dtag_first->dta_rec;
9865 if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
9866 frec->dtrd_alignment = sizeof (dtrace_aggid_t);
9867
9868 for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
9869 ASSERT(!act->dta_intuple);
9870 act->dta_intuple = 1;
9871 }
9872
9873 return (&agg->dtag_action);
9874}
9875
9876static void
9877dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act)
9878{
9879 dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
9880 dtrace_state_t *state = ecb->dte_state;
9881 dtrace_aggid_t aggid = agg->dtag_id;
9882
9883 ASSERT(DTRACEACT_ISAGG(act->dta_kind));
9884 vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
9885
9886 ASSERT(state->dts_aggregations[aggid - 1] == agg);
9887 state->dts_aggregations[aggid - 1] = NULL;
9888
9889 kmem_free(agg, sizeof (dtrace_aggregation_t));
9890}
9891
9892static int
9893dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
9894{
9895 dtrace_action_t *action, *last;
9896 dtrace_difo_t *dp = desc->dtad_difo;
9897 uint32_t size = 0, align = sizeof (uint8_t), mask;
9898 uint16_t format = 0;
9899 dtrace_recdesc_t *rec;
9900 dtrace_state_t *state = ecb->dte_state;
9901 dtrace_optval_t *opt = state->dts_options, nframes VBDTUNASS(0), strsize;
9902 uint64_t arg = desc->dtad_arg;
9903
9904 ASSERT(MUTEX_HELD(&dtrace_lock));
9905 ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);
9906
9907 if (DTRACEACT_ISAGG(desc->dtad_kind)) {
9908 /*
9909 * If this is an aggregating action, there must be neither
9910 * a speculate nor a commit on the action chain.
9911 */
9912 dtrace_action_t *act;
9913
9914 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
9915 if (act->dta_kind == DTRACEACT_COMMIT)
9916 return (EINVAL);
9917
9918 if (act->dta_kind == DTRACEACT_SPECULATE)
9919 return (EINVAL);
9920 }
9921
9922 action = dtrace_ecb_aggregation_create(ecb, desc);
9923
9924 if (action == NULL)
9925 return (EINVAL);
9926 } else {
9927 if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||
9928 (desc->dtad_kind == DTRACEACT_DIFEXPR &&
9929 dp != NULL && dp->dtdo_destructive)) {
9930 state->dts_destructive = 1;
9931 }
9932
9933 switch (desc->dtad_kind) {
9934 case DTRACEACT_PRINTF:
9935 case DTRACEACT_PRINTA:
9936 case DTRACEACT_SYSTEM:
9937 case DTRACEACT_FREOPEN:
9938 /*
9939 * We know that our arg is a string -- turn it into a
9940 * format.
9941 */
9942 if (arg == NULL) {
9943 ASSERT(desc->dtad_kind == DTRACEACT_PRINTA);
9944 format = 0;
9945 } else {
9946 ASSERT(arg != NULL);
9947 ASSERT(VBDT_IS_VALID_KRNL_ADDR(arg));
9948 format = dtrace_format_add(state,
9949 (char *)(uintptr_t)arg);
9950 }
9951
9952 /*FALLTHROUGH*/
9953 case DTRACEACT_LIBACT:
9954 case DTRACEACT_DIFEXPR:
9955 if (dp == NULL)
9956 return (EINVAL);
9957
9958 if ((size = dp->dtdo_rtype.dtdt_size) != 0)
9959 break;
9960
9961 if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
9962 if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
9963 return (EINVAL);
9964
9965 size = opt[DTRACEOPT_STRSIZE];
9966 }
9967
9968 break;
9969
9970 case DTRACEACT_STACK:
9971 if ((nframes = arg) == 0) {
9972 nframes = opt[DTRACEOPT_STACKFRAMES];
9973 ASSERT(nframes > 0);
9974 arg = nframes;
9975 }
9976
9977 size = VBDTCAST(uint32_t)(nframes * sizeof (pc_t));
9978 break;
9979
9980 case DTRACEACT_JSTACK:
9981 if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)
9982 strsize = opt[DTRACEOPT_JSTACKSTRSIZE];
9983
9984 if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)
9985 nframes = opt[DTRACEOPT_JSTACKFRAMES];
9986
9987 arg = DTRACE_USTACK_ARG(nframes, strsize);
9988
9989 /*FALLTHROUGH*/
9990 case DTRACEACT_USTACK:
9991 if (desc->dtad_kind != DTRACEACT_JSTACK &&
9992 (nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {
9993 strsize = DTRACE_USTACK_STRSIZE(arg);
9994 nframes = opt[DTRACEOPT_USTACKFRAMES];
9995 ASSERT(nframes > 0);
9996 arg = DTRACE_USTACK_ARG(nframes, strsize);
9997 }
9998
9999 /*
10000 * Save a slot for the pid.
10001 */
10002 size = VBDTCAST(uint32_t)((nframes + 1) * sizeof (uint64_t));
10003 size += DTRACE_USTACK_STRSIZE(arg);
10004 size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));
10005
10006 break;
10007
10008 case DTRACEACT_SYM:
10009 case DTRACEACT_MOD:
10010 if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=
10011 sizeof (uint64_t)) ||
10012 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10013 return (EINVAL);
10014 break;
10015
10016 case DTRACEACT_USYM:
10017 case DTRACEACT_UMOD:
10018 case DTRACEACT_UADDR:
10019 if (dp == NULL ||
10020 (dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||
10021 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10022 return (EINVAL);
10023
10024 /*
10025 * We have a slot for the pid, plus a slot for the
10026 * argument. To keep things simple (aligned with
10027 * bitness-neutral sizing), we store each as a 64-bit
10028 * quantity.
10029 */
10030 size = 2 * sizeof (uint64_t);
10031 break;
10032
10033 case DTRACEACT_STOP:
10034 case DTRACEACT_BREAKPOINT:
10035 case DTRACEACT_PANIC:
10036 break;
10037
10038 case DTRACEACT_CHILL:
10039 case DTRACEACT_DISCARD:
10040 case DTRACEACT_RAISE:
10041 if (dp == NULL)
10042 return (EINVAL);
10043 break;
10044
10045 case DTRACEACT_EXIT:
10046 if (dp == NULL ||
10047 (size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||
10048 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10049 return (EINVAL);
10050 break;
10051
10052 case DTRACEACT_SPECULATE:
10053 if (ecb->dte_size > sizeof (dtrace_epid_t))
10054 return (EINVAL);
10055
10056 if (dp == NULL)
10057 return (EINVAL);
10058
10059 state->dts_speculates = 1;
10060 break;
10061
10062 case DTRACEACT_COMMIT: {
10063 dtrace_action_t *act = ecb->dte_action;
10064
10065 for (; act != NULL; act = act->dta_next) {
10066 if (act->dta_kind == DTRACEACT_COMMIT)
10067 return (EINVAL);
10068 }
10069
10070 if (dp == NULL)
10071 return (EINVAL);
10072 break;
10073 }
10074
10075 default:
10076 return (EINVAL);
10077 }
10078
10079 if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {
10080 /*
10081 * If this is a data-storing action or a speculate,
10082 * we must be sure that there isn't a commit on the
10083 * action chain.
10084 */
10085 dtrace_action_t *act = ecb->dte_action;
10086
10087 for (; act != NULL; act = act->dta_next) {
10088 if (act->dta_kind == DTRACEACT_COMMIT)
10089 return (EINVAL);
10090 }
10091 }
10092
10093 action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
10094 action->dta_rec.dtrd_size = size;
10095 }
10096
10097 action->dta_refcnt = 1;
10098 rec = &action->dta_rec;
10099 size = rec->dtrd_size;
10100
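	/*
	 * Derive the record alignment from its size: the loop below picks
	 * the largest power of two, at most sizeof (uint64_t), that evenly
	 * divides the size.  A 12-byte record thus gets 4-byte alignment
	 * and a 24-byte record 8-byte alignment; odd-sized (and zero-sized)
	 * records leave 'align' at its earlier default.
	 */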
10101 for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
10102 if (!(size & mask)) {
10103 align = mask + 1;
10104 break;
10105 }
10106 }
10107
10108 action->dta_kind = desc->dtad_kind;
10109
10110 if ((action->dta_difo = dp) != NULL)
10111 dtrace_difo_hold(dp);
10112
10113 rec->dtrd_action = action->dta_kind;
10114 rec->dtrd_arg = arg;
10115 rec->dtrd_uarg = desc->dtad_uarg;
10116 rec->dtrd_alignment = (uint16_t)align;
10117 rec->dtrd_format = format;
10118
10119 if ((last = ecb->dte_action_last) != NULL) {
10120 ASSERT(ecb->dte_action != NULL);
10121 action->dta_prev = last;
10122 last->dta_next = action;
10123 } else {
10124 ASSERT(ecb->dte_action == NULL);
10125 ecb->dte_action = action;
10126 }
10127
10128 ecb->dte_action_last = action;
10129
10130 return (0);
10131}
10132
10133static void
10134dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
10135{
10136 dtrace_action_t *act = ecb->dte_action, *next;
10137 dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
10138 dtrace_difo_t *dp;
10139 uint16_t format;
10140
10141 if (act != NULL && act->dta_refcnt > 1) {
10142 ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);
10143 act->dta_refcnt--;
10144 } else {
10145 for (; act != NULL; act = next) {
10146 next = act->dta_next;
10147 ASSERT(next != NULL || act == ecb->dte_action_last);
10148 ASSERT(act->dta_refcnt == 1);
10149
10150 if ((format = act->dta_rec.dtrd_format) != 0)
10151 dtrace_format_remove(ecb->dte_state, format);
10152
10153 if ((dp = act->dta_difo) != NULL)
10154 dtrace_difo_release(dp, vstate);
10155
10156 if (DTRACEACT_ISAGG(act->dta_kind)) {
10157 dtrace_ecb_aggregation_destroy(ecb, act);
10158 } else {
10159 kmem_free(act, sizeof (dtrace_action_t));
10160 }
10161 }
10162 }
10163
10164 ecb->dte_action = NULL;
10165 ecb->dte_action_last = NULL;
10166 ecb->dte_size = sizeof (dtrace_epid_t);
10167}
10168
10169static void
10170dtrace_ecb_disable(dtrace_ecb_t *ecb)
10171{
10172 /*
10173 * We disable the ECB by removing it from its probe.
10174 */
10175 dtrace_ecb_t *pecb, *prev = NULL;
10176 dtrace_probe_t *probe = ecb->dte_probe;
10177
10178 ASSERT(MUTEX_HELD(&dtrace_lock));
10179
10180 if (probe == NULL) {
10181 /*
10182 * This is the NULL probe; there is nothing to disable.
10183 */
10184 return;
10185 }
10186
10187 for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
10188 if (pecb == ecb)
10189 break;
10190 prev = pecb;
10191 }
10192
10193 ASSERT(pecb != NULL);
10194
10195 if (prev == NULL) {
10196 probe->dtpr_ecb = ecb->dte_next;
10197 } else {
10198 prev->dte_next = ecb->dte_next;
10199 }
10200
10201 if (ecb == probe->dtpr_ecb_last) {
10202 ASSERT(ecb->dte_next == NULL);
10203 probe->dtpr_ecb_last = prev;
10204 }
10205
10206 /*
10207 * The ECB has been disconnected from the probe; now sync to assure
10208 * that all CPUs have seen the change before returning.
10209 */
10210 dtrace_sync();
10211
10212 if (probe->dtpr_ecb == NULL) {
10213 /*
10214 * That was the last ECB on the probe; clear the predicate
10215 * cache ID for the probe, disable it and sync one more time
10216 * to assure that we'll never hit it again.
10217 */
10218 dtrace_provider_t *prov = probe->dtpr_provider;
10219
10220 ASSERT(ecb->dte_next == NULL);
10221 ASSERT(probe->dtpr_ecb_last == NULL);
10222 probe->dtpr_predcache = DTRACE_CACHEIDNONE;
10223 prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
10224 probe->dtpr_id, probe->dtpr_arg);
10225 dtrace_sync();
10226 } else {
10227 /*
10228 * There is at least one ECB remaining on the probe. If there
10229 * is _exactly_ one, set the probe's predicate cache ID to be
10230 * the predicate cache ID of the remaining ECB.
10231 */
10232 ASSERT(probe->dtpr_ecb_last != NULL);
10233 ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);
10234
10235 if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
10236 dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;
10237
10238 ASSERT(probe->dtpr_ecb->dte_next == NULL);
10239
10240 if (p != NULL)
10241 probe->dtpr_predcache = p->dtp_cacheid;
10242 }
10243
10244 ecb->dte_next = NULL;
10245 }
10246}
10247
10248static void
10249dtrace_ecb_destroy(dtrace_ecb_t *ecb)
10250{
10251 dtrace_state_t *state = ecb->dte_state;
10252 dtrace_vstate_t *vstate = &state->dts_vstate;
10253 dtrace_predicate_t *pred;
10254 dtrace_epid_t epid = ecb->dte_epid;
10255
10256 ASSERT(MUTEX_HELD(&dtrace_lock));
10257 ASSERT(ecb->dte_next == NULL);
10258 ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);
10259
10260 if ((pred = ecb->dte_predicate) != NULL)
10261 dtrace_predicate_release(pred, vstate);
10262
10263 dtrace_ecb_action_remove(ecb);
10264
10265 ASSERT(state->dts_ecbs[epid - 1] == ecb);
10266 state->dts_ecbs[epid - 1] = NULL;
10267
10268 kmem_free(ecb, sizeof (dtrace_ecb_t));
10269}
10270
10271static dtrace_ecb_t *
10272dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
10273 dtrace_enabling_t *enab)
10274{
10275 dtrace_ecb_t *ecb;
10276 dtrace_predicate_t *pred;
10277 dtrace_actdesc_t *act;
10278 dtrace_provider_t *prov;
10279 dtrace_ecbdesc_t *desc = enab->dten_current;
10280
10281 ASSERT(MUTEX_HELD(&dtrace_lock));
10282 ASSERT(state != NULL);
10283
10284 ecb = dtrace_ecb_add(state, probe);
10285 ecb->dte_uarg = desc->dted_uarg;
10286
10287 if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
10288 dtrace_predicate_hold(pred);
10289 ecb->dte_predicate = pred;
10290 }
10291
10292 if (probe != NULL) {
10293 /*
10294 * If the provider shows more leg than the consumer is old
10295 * enough to see, we need to enable the appropriate implicit
10296 * predicate bits to prevent the ecb from activating at
10297 * revealing times.
10298 *
10299 * Providers specifying DTRACE_PRIV_USER at register time
10300 * are stating that they need the /proc-style privilege
10301 * model to be enforced, and this is what DTRACE_COND_OWNER
10302 * and DTRACE_COND_ZONEOWNER will then do at probe time.
10303 */
10304 prov = probe->dtpr_provider;
10305 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
10306 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
10307 ecb->dte_cond |= DTRACE_COND_OWNER;
10308
10309 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
10310 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
10311 ecb->dte_cond |= DTRACE_COND_ZONEOWNER;
10312
10313 /*
10314 * If the provider shows us kernel innards and the user
10315 * is lacking sufficient privilege, enable the
10316 * DTRACE_COND_USERMODE implicit predicate.
10317 */
10318 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
10319 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
10320 ecb->dte_cond |= DTRACE_COND_USERMODE;
10321 }
10322
10323 if (dtrace_ecb_create_cache != NULL) {
10324 /*
10325 * If we have a cached ecb, we'll use its action list instead
10326 * of creating our own (saving both time and space).
10327 */
10328 dtrace_ecb_t *cached = dtrace_ecb_create_cache;
10329 dtrace_action_t *act2 = cached->dte_action;
10330
10331 if (act2 != NULL) {
10332 ASSERT(act2->dta_refcnt > 0);
10333 act2->dta_refcnt++;
10334 ecb->dte_action = act2;
10335 ecb->dte_action_last = cached->dte_action_last;
10336 ecb->dte_needed = cached->dte_needed;
10337 ecb->dte_size = cached->dte_size;
10338 ecb->dte_alignment = cached->dte_alignment;
10339 }
10340
10341 return (ecb);
10342 }
10343
10344 for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
10345 if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {
10346 dtrace_ecb_destroy(ecb);
10347 return (NULL);
10348 }
10349 }
10350
10351 dtrace_ecb_resize(ecb);
10352
10353 return (dtrace_ecb_create_cache = ecb);
10354}
10355
10356static int
10357dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg)
10358{
10359 dtrace_ecb_t *ecb;
10360 dtrace_enabling_t *enab = arg;
10361 dtrace_state_t *state = enab->dten_vstate->dtvs_state;
10362
10363 ASSERT(state != NULL);
10364
10365 if (probe != NULL && probe->dtpr_gen < enab->dten_probegen) {
10366 /*
10367 * This probe was created in a generation for which this
10368 * enabling has previously created ECBs; we don't want to
10369 * enable it again, so just kick out.
10370 */
10371 return (DTRACE_MATCH_NEXT);
10372 }
10373
10374 if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
10375 return (DTRACE_MATCH_DONE);
10376
10377 if (dtrace_ecb_enable(ecb) < 0)
10378 return (DTRACE_MATCH_FAIL);
10379
10380 return (DTRACE_MATCH_NEXT);
10381}
10382
10383static dtrace_ecb_t *
10384dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
10385{
10386 dtrace_ecb_t *ecb; NOREF(ecb);
10387
10388 ASSERT(MUTEX_HELD(&dtrace_lock));
10389
10390 if (id == 0 || VBDTCAST(int64_t)id > state->dts_necbs)
10391 return (NULL);
10392
10393 ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);
10394 ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);
10395
10396 return (state->dts_ecbs[id - 1]);
10397}
10398
10399static dtrace_aggregation_t *
10400dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
10401{
10402 dtrace_aggregation_t *agg; NOREF(agg);
10403
10404 ASSERT(MUTEX_HELD(&dtrace_lock));
10405
10406 if (id == 0 || VBDTCAST(int64_t)id > state->dts_naggregations)
10407 return (NULL);
10408
10409 ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);
10410 ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||
10411 agg->dtag_id == id);
10412
10413 return (state->dts_aggregations[id - 1]);
10414}
10415
10416/*
10417 * DTrace Buffer Functions
10418 *
10419 * The following functions manipulate DTrace buffers. Most of these functions
10420 * are called in the context of establishing or processing consumer state;
10421 * exceptions are explicitly noted.
10422 */
10423
10424/*
10425 * Note: called from cross call context. This function switches the two
10426 * buffers on a given CPU. The atomicity of this operation is assured by
10427 * disabling interrupts while the actual switch takes place; the disabling of
10428 * interrupts serializes the execution with any execution of dtrace_probe() on
10429 * the same CPU.
10430 */
10431static void
10432dtrace_buffer_switch(dtrace_buffer_t *buf)
10433{
10434 caddr_t tomax = buf->dtb_tomax;
10435 caddr_t xamot = buf->dtb_xamot;
10436 dtrace_icookie_t cookie;
10437
10438 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
10439 ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
10440
10441 cookie = dtrace_interrupt_disable();
10442 buf->dtb_tomax = xamot;
10443 buf->dtb_xamot = tomax;
10444 buf->dtb_xamot_drops = buf->dtb_drops;
10445 buf->dtb_xamot_offset = buf->dtb_offset;
10446 buf->dtb_xamot_errors = buf->dtb_errors;
10447 buf->dtb_xamot_flags = buf->dtb_flags;
10448 buf->dtb_offset = 0;
10449 buf->dtb_drops = 0;
10450 buf->dtb_errors = 0;
10451 buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
10452 dtrace_interrupt_enable(cookie);
10453}
10454
10455#ifdef VBOX
10456static DECLCALLBACK(void) dtrace_buffer_switch_wrapper(RTCPUID idCpu, void *pvUser1, void *pvUser2)
10457{
10458 dtrace_buffer_switch((dtrace_buffer_t *)pvUser1);
10459 NOREF(pvUser2); NOREF(idCpu);
10460}
10461#endif
10462
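#if 0
/*
 * A minimal usage sketch (not part of the original source; the function
 * name is hypothetical): a consumer switches the buffers on a specific
 * CPU by running dtrace_buffer_switch() there via the dtrace_xcall()
 * cross-call interface, much as the buffer-snapshot ioctl path does on
 * Solaris builds.
 */
static void
dtrace_buffer_switch_on(dtrace_state_t *state, processorid_t cpu)
{
	dtrace_buffer_t *buf = &state->dts_buffer[cpu];

	/* dtrace_buffer_switch() then runs on 'cpu' with interrupts disabled. */
	dtrace_xcall(cpu, (dtrace_xcall_t)dtrace_buffer_switch, buf);
}
#endif
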
10463/*
10464 * Note: called from cross call context. This function activates a buffer
10465 * on a CPU. As with dtrace_buffer_switch(), the atomicity of the operation
10466 * is guaranteed by the disabling of interrupts.
10467 */
10468static void
10469dtrace_buffer_activate(dtrace_state_t *state)
10470{
10471 dtrace_buffer_t *buf;
10472 dtrace_icookie_t cookie = dtrace_interrupt_disable();
10473
10474 buf = &state->dts_buffer[VBDT_GET_CPUID()];
10475
10476 if (buf->dtb_tomax != NULL) {
10477 /*
10478 * We might like to assert that the buffer is marked inactive,
10479 * but this isn't necessarily true: the buffer for the CPU
10480 * that processes the BEGIN probe has its buffer activated
10481 * manually. In this case, we take the (harmless) action
10482		 * manually. In this case, we take the (harmless) action of
10483		 * re-clearing the INACTIVE bit.
10484 buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
10485 }
10486
10487 dtrace_interrupt_enable(cookie);
10488}
10489
10490#ifdef VBOX
10491static DECLCALLBACK(void) dtrace_buffer_activate_wrapper(RTCPUID idCpu, void *pvUser1, void *pvUser2)
10492{
10493 dtrace_buffer_activate((dtrace_state_t *)pvUser1);
10494 NOREF(pvUser2); NOREF(idCpu);
10495}
10496#endif
10497
10498static int
10499dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags,
10500 processorid_t cpu)
10501{
10502#ifndef VBOX
10503 cpu_t *cp;
10504#else
10505 RTCPUSET CpuSet;
10506 unsigned iCpu;
10507#endif
10508 dtrace_buffer_t *buf;
10509
10510 ASSERT(MUTEX_HELD(&cpu_lock));
10511 ASSERT(MUTEX_HELD(&dtrace_lock));
10512
10513 if (VBDTCAST(int64_t)size > dtrace_nonroot_maxsize
10514#ifndef VBOX
10515 && !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE)
10516#endif
10517 )
10518 return (EFBIG);
10519
10520#ifndef VBOX
10521 cp = cpu_list;
10522#else
10523 RTMpGetSet(&CpuSet);
10524#endif
10525
10526#ifndef VBOX
10527 do {
10528 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
10529 continue;
10530
10531 buf = &bufs[cp->cpu_id];
10532#else
10533 for (iCpu = 0; iCpu < RTCPUSET_MAX_CPUS; iCpu++) {
10534 if ( !RTCpuSetIsMember(&CpuSet, iCpu)
10535 || (cpu != (processorid_t)DTRACE_CPUALL && cpu != iCpu))
10536 continue;
10537
10538 buf = &bufs[iCpu];
10539#endif
10540
10541 /*
10542 * If there is already a buffer allocated for this CPU, it
10543 * is only possible that this is a DR event. In this case,
10544 * the buffer size must match our specified size.
10545 */
10546 if (buf->dtb_tomax != NULL) {
10547 ASSERT(buf->dtb_size == size);
10548 continue;
10549 }
10550
10551 ASSERT(buf->dtb_xamot == NULL);
10552
10553 if ((buf->dtb_tomax = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
10554 goto err;
10555
10556 buf->dtb_size = size;
10557 buf->dtb_flags = flags;
10558 buf->dtb_offset = 0;
10559 buf->dtb_drops = 0;
10560
10561 if (flags & DTRACEBUF_NOSWITCH)
10562 continue;
10563
10564 if ((buf->dtb_xamot = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
10565 goto err;
10566#ifndef VBOX
10567 } while ((cp = cp->cpu_next) != cpu_list);
10568#else
10569 }
10570#endif
10571
10572 return (0);
10573
10574err:
10575#ifndef VBOX
10576 cp = cpu_list;
10577
10578 do {
10579 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
10580 continue;
10581
10582 buf = &bufs[cp->cpu_id];
10583#else
10584 for (iCpu = 0; iCpu < RTCPUSET_MAX_CPUS; iCpu++) {
10585 if ( !RTCpuSetIsMember(&CpuSet, iCpu)
10586 || (cpu != (processorid_t)DTRACE_CPUALL && cpu != iCpu))
10587 continue;
10588
10589 buf = &bufs[iCpu];
10590#endif
10591
10592 if (buf->dtb_xamot != NULL) {
10593 ASSERT(buf->dtb_tomax != NULL);
10594 ASSERT(buf->dtb_size == size);
10595 kmem_free(buf->dtb_xamot, size);
10596 }
10597
10598 if (buf->dtb_tomax != NULL) {
10599 ASSERT(buf->dtb_size == size);
10600 kmem_free(buf->dtb_tomax, size);
10601 }
10602
10603 buf->dtb_tomax = NULL;
10604 buf->dtb_xamot = NULL;
10605 buf->dtb_size = 0;
10606#ifndef VBOX
10607 } while ((cp = cp->cpu_next) != cpu_list);
10608#else
10609 }
10610#endif
10611
10612 return (ENOMEM);
10613}
10614
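#if 0
/*
 * A minimal usage sketch (not part of the original source; the function
 * name is hypothetical): allocating a switchable buffer pair for every
 * online CPU in one call.  Per the ASSERTs above, both cpu_lock and
 * dtrace_lock must be held.
 */
static int
dtrace_state_buffers_sketch(dtrace_state_t *state, size_t size, int flags)
{
	/* DTRACE_CPUALL requests an allocation on each online CPU. */
	return (dtrace_buffer_alloc(state->dts_buffer, size, flags,
	    DTRACE_CPUALL));
}
#endif
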
10615/*
10616 * Note: called from probe context. This function just increments the drop
10617 * count on a buffer. It has been made a function to allow for the
10618 * possibility of understanding the source of mysterious drop counts. (A
10619 * problem for which one may be particularly disappointed that DTrace cannot
10620 * be used to understand DTrace.)
10621 */
10622static void
10623dtrace_buffer_drop(dtrace_buffer_t *buf)
10624{
10625 buf->dtb_drops++;
10626}
10627
10628/*
10629 * Note: called from probe context. This function is called to reserve space
10630 * in a buffer. If mstate is non-NULL, sets the scratch base and size in the
10631 * mstate. Returns the new offset in the buffer, or a negative value if an
10632 * error has occurred.
10633 */
10634static intptr_t
10635dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
10636 dtrace_state_t *state, dtrace_mstate_t *mstate)
10637{
10638 intptr_t offs = buf->dtb_offset, soffs;
10639 intptr_t woffs;
10640 caddr_t tomax;
10641 size_t total;
10642
10643 if (buf->dtb_flags & DTRACEBUF_INACTIVE)
10644 return (-1);
10645
10646 if ((tomax = buf->dtb_tomax) == NULL) {
10647 dtrace_buffer_drop(buf);
10648 return (-1);
10649 }
10650
10651 if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
10652 while (offs & (align - 1)) {
10653 /*
10654 * Assert that our alignment is off by a number which
10655 * is itself sizeof (uint32_t) aligned.
10656 */
10657 ASSERT(!((align - (offs & (align - 1))) &
10658 (sizeof (uint32_t) - 1)));
10659 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
10660 offs += sizeof (uint32_t);
10661 }
10662
10663 if (VBDTCAST(uintptr_t)(soffs = offs + needed) > buf->dtb_size) {
10664 dtrace_buffer_drop(buf);
10665 return (-1);
10666 }
10667
10668 if (mstate == NULL)
10669 return (offs);
10670
10671 mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
10672 mstate->dtms_scratch_size = buf->dtb_size - soffs;
10673 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
10674
10675 return (offs);
10676 }
10677
10678 if (buf->dtb_flags & DTRACEBUF_FILL) {
10679 if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
10680 (buf->dtb_flags & DTRACEBUF_FULL))
10681 return (-1);
10682 goto out;
10683 }
10684
10685 total = needed + (offs & (align - 1));
10686
10687 /*
10688 * For a ring buffer, life is quite a bit more complicated. Before
10689 * we can store any padding, we need to adjust our wrapping offset.
10690 * (If we've never before wrapped or we're not about to, no adjustment
10691 * is required.)
10692 */
10693 if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
10694 offs + total > buf->dtb_size) {
10695 woffs = buf->dtb_xamot_offset;
10696
10697 if (offs + total > buf->dtb_size) {
10698 /*
10699 * We can't fit in the end of the buffer. First, a
10700 * sanity check that we can fit in the buffer at all.
10701 */
10702 if (total > buf->dtb_size) {
10703 dtrace_buffer_drop(buf);
10704 return (-1);
10705 }
10706
10707 /*
10708 * We're going to be storing at the top of the buffer,
10709 * so now we need to deal with the wrapped offset. We
10710 * only reset our wrapped offset to 0 if it is
10711 * currently greater than the current offset. If it
10712 * is less than the current offset, it is because a
10713 * previous allocation induced a wrap -- but the
10714 * allocation didn't subsequently take the space due
10715 * to an error or false predicate evaluation. In this
10716 * case, we'll just leave the wrapped offset alone: if
10717 * the wrapped offset hasn't been advanced far enough
10718 * for this allocation, it will be adjusted in the
10719 * lower loop.
10720 */
10721 if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
10722 if (woffs >= offs)
10723 woffs = 0;
10724 } else {
10725 woffs = 0;
10726 }
10727
10728 /*
10729 * Now we know that we're going to be storing to the
10730 * top of the buffer and that there is room for us
10731 * there. We need to clear the buffer from the current
10732 * offset to the end (there may be old gunk there).
10733 */
10734 while (VBDTCAST(uintptr_t)offs < buf->dtb_size)
10735 tomax[offs++] = 0;
10736
10737 /*
10738 * We need to set our offset to zero. And because we
10739 * are wrapping, we need to set the bit indicating as
10740 * much. We can also adjust our needed space back
10741 * down to the space required by the ECB -- we know
10742 * that the top of the buffer is aligned.
10743 */
10744 offs = 0;
10745 total = needed;
10746 buf->dtb_flags |= DTRACEBUF_WRAPPED;
10747 } else {
10748 /*
10749 * There is room for us in the buffer, so we simply
10750 * need to check the wrapped offset.
10751 */
10752 if (woffs < offs) {
10753 /*
10754 * The wrapped offset is less than the offset.
10755 * This can happen if we allocated buffer space
10756 * that induced a wrap, but then we didn't
10757 * subsequently take the space due to an error
10758 * or false predicate evaluation. This is
10759 * okay; we know that _this_ allocation isn't
10760 * going to induce a wrap. We still can't
10761 * reset the wrapped offset to be zero,
10762 * however: the space may have been trashed in
10763 * the previous failed probe attempt. But at
10764 * least the wrapped offset doesn't need to
10765 * be adjusted at all...
10766 */
10767 goto out;
10768 }
10769 }
10770
10771 while (VBDTCAST(uintptr_t)offs + total > VBDTCAST(uintptr_t)woffs) {
10772 dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
10773 size_t size;
10774
10775 if (epid == DTRACE_EPIDNONE) {
10776 size = sizeof (uint32_t);
10777 } else {
10778 ASSERT(VBDTCAST(int64_t)epid <= state->dts_necbs);
10779 ASSERT(state->dts_ecbs[epid - 1] != NULL);
10780
10781 size = state->dts_ecbs[epid - 1]->dte_size;
10782 }
10783
10784 ASSERT(woffs + size <= buf->dtb_size);
10785 ASSERT(size != 0);
10786
10787 if (woffs + size == buf->dtb_size) {
10788 /*
10789 * We've reached the end of the buffer; we want
10790 * to set the wrapped offset to 0 and break
10791 * out. However, if the offs is 0, then we're
10792 * in a strange edge-condition: the amount of
10793 * space that we want to reserve plus the size
10794 * of the record that we're overwriting is
10795 * greater than the size of the buffer. This
10796 * is problematic because if we reserve the
10797 * space but subsequently don't consume it (due
10798 * to a failed predicate or error) the wrapped
10799 * offset will be 0 -- yet the EPID at offset 0
10800 * will not be committed. This situation is
10801 * relatively easy to deal with: if we're in
10802 * this case, the buffer is indistinguishable
10803 * from one that hasn't wrapped; we need only
10804 * finish the job by clearing the wrapped bit,
10805 * explicitly setting the offset to be 0, and
10806 * zero'ing out the old data in the buffer.
10807 */
10808 if (offs == 0) {
10809 buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
10810 buf->dtb_offset = 0;
10811 woffs = total;
10812
10813 while (VBDTCAST(uintptr_t)woffs < buf->dtb_size)
10814 tomax[woffs++] = 0;
10815 }
10816
10817 woffs = 0;
10818 break;
10819 }
10820
10821 woffs += size;
10822 }
10823
10824 /*
10825 * We have a wrapped offset. It may be that the wrapped offset
10826 * has become zero -- that's okay.
10827 */
10828 buf->dtb_xamot_offset = woffs;
10829 }
10830
10831out:
10832 /*
10833 * Now we can plow the buffer with any necessary padding.
10834 */
10835 while (offs & (align - 1)) {
10836 /*
10837 * Assert that our alignment is off by a number which
10838 * is itself sizeof (uint32_t) aligned.
10839 */
10840 ASSERT(!((align - (offs & (align - 1))) &
10841 (sizeof (uint32_t) - 1)));
10842 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
10843 offs += sizeof (uint32_t);
10844 }
10845
10846 if (buf->dtb_flags & DTRACEBUF_FILL) {
10847 if (offs + needed > buf->dtb_size - state->dts_reserve) {
10848 buf->dtb_flags |= DTRACEBUF_FULL;
10849 return (-1);
10850 }
10851 }
10852
10853 if (mstate == NULL)
10854 return (offs);
10855
10856 /*
10857 * For ring buffers and fill buffers, the scratch space is always
10858 * the inactive buffer.
10859 */
10860 mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
10861 mstate->dtms_scratch_size = buf->dtb_size;
10862 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
10863
10864 return (offs);
10865}
10866
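#if 0
/*
 * A minimal caller-side sketch (not part of the original source; the
 * function name is hypothetical, loosely following what dtrace_probe()
 * does in probe context): reserve room for an ECB's record, store the
 * EPID at the reserved offset, and commit by advancing dtb_offset.
 */
static void
dtrace_record_sketch(dtrace_state_t *state, dtrace_ecb_t *ecb,
    dtrace_buffer_t *buf, dtrace_mstate_t *mstate)
{
	intptr_t offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
	    ecb->dte_alignment, state, mstate);
	caddr_t tomax;

	if (offs < 0)
		return;		/* no room (or the buffer is inactive) */

	tomax = buf->dtb_tomax;
	DTRACE_STORE(uint32_t, tomax, offs, ecb->dte_epid);
	/* ...each action's data is then stored at offs + dtrd_offset... */
	buf->dtb_offset = offs + ecb->dte_size;
}
#endif
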
10867static void
10868dtrace_buffer_polish(dtrace_buffer_t *buf)
10869{
10870 ASSERT(buf->dtb_flags & DTRACEBUF_RING);
10871 ASSERT(MUTEX_HELD(&dtrace_lock));
10872
10873 if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
10874 return;
10875
10876 /*
10877 * We need to polish the ring buffer. There are three cases:
10878 *
10879 * - The first (and presumably most common) is that there is no gap
10880 * between the buffer offset and the wrapped offset. In this case,
10881 * there is nothing in the buffer that isn't valid data; we can
10882 * mark the buffer as polished and return.
10883 *
10884 * - The second (less common than the first but still more common
10885 * than the third) is that there is a gap between the buffer offset
10886 * and the wrapped offset, and the wrapped offset is larger than the
10887 * buffer offset. This can happen because of an alignment issue, or
10888 * can happen because of a call to dtrace_buffer_reserve() that
10889 * didn't subsequently consume the buffer space. In this case,
10890 * we need to zero the data from the buffer offset to the wrapped
10891 * offset.
10892 *
10893 * - The third (and least common) is that there is a gap between the
10894 * buffer offset and the wrapped offset, but the wrapped offset is
10895 * _less_ than the buffer offset. This can only happen because a
10896 * call to dtrace_buffer_reserve() induced a wrap, but the space
10897 * was not subsequently consumed. In this case, we need to zero the
10898 * space from the offset to the end of the buffer _and_ from the
10899 * top of the buffer to the wrapped offset.
10900 */
10901 if (buf->dtb_offset < buf->dtb_xamot_offset) {
10902 bzero(buf->dtb_tomax + buf->dtb_offset,
10903 buf->dtb_xamot_offset - buf->dtb_offset);
10904 }
10905
10906 if (buf->dtb_offset > buf->dtb_xamot_offset) {
10907 bzero(buf->dtb_tomax + buf->dtb_offset,
10908 buf->dtb_size - buf->dtb_offset);
10909 bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
10910 }
10911}
10912
10913static void
10914dtrace_buffer_free(dtrace_buffer_t *bufs)
10915{
10916 int i;
10917
10918 for (i = 0; i < NCPU; i++) {
10919 dtrace_buffer_t *buf = &bufs[i];
10920
10921 if (buf->dtb_tomax == NULL) {
10922 ASSERT(buf->dtb_xamot == NULL);
10923 ASSERT(buf->dtb_size == 0);
10924 continue;
10925 }
10926
10927 if (buf->dtb_xamot != NULL) {
10928 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
10929 kmem_free(buf->dtb_xamot, buf->dtb_size);
10930 }
10931
10932 kmem_free(buf->dtb_tomax, buf->dtb_size);
10933 buf->dtb_size = 0;
10934 buf->dtb_tomax = NULL;
10935 buf->dtb_xamot = NULL;
10936 }
10937}
10938
10939/*
10940 * DTrace Enabling Functions
10941 */
10942static dtrace_enabling_t *
10943dtrace_enabling_create(dtrace_vstate_t *vstate)
10944{
10945 dtrace_enabling_t *enab;
10946
10947 enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
10948 enab->dten_vstate = vstate;
10949
10950 return (enab);
10951}
10952
10953static void
10954dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)
10955{
10956 dtrace_ecbdesc_t **ndesc;
10957 size_t osize, nsize;
10958
10959 /*
10960 * We can't add to enablings after we've enabled them, or after we've
10961 * retained them.
10962 */
10963 ASSERT(enab->dten_probegen == 0);
10964 ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
10965
10966 if (enab->dten_ndesc < enab->dten_maxdesc) {
10967 enab->dten_desc[enab->dten_ndesc++] = ecb;
10968 return;
10969 }
10970
10971	osize = enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *);
10972
10973 if (enab->dten_maxdesc == 0) {
10974 enab->dten_maxdesc = 1;
10975 } else {
10976 enab->dten_maxdesc <<= 1;
10977 }
10978
10979 ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
10980
10981	nsize = enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *);
10982 ndesc = kmem_zalloc(nsize, KM_SLEEP);
10983 bcopy(enab->dten_desc, ndesc, osize);
10984 kmem_free(enab->dten_desc, osize);
10985
10986 enab->dten_desc = ndesc;
10987 enab->dten_desc[enab->dten_ndesc++] = ecb;
10988}
10989
10990static void
10991dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,
10992 dtrace_probedesc_t *pd)
10993{
10994 dtrace_ecbdesc_t *new;
10995 dtrace_predicate_t *pred;
10996 dtrace_actdesc_t *act;
10997
10998 /*
10999 * We're going to create a new ECB description that matches the
11000 * specified ECB in every way, but has the specified probe description.
11001 */
11002 new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
11003
11004 if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
11005 dtrace_predicate_hold(pred);
11006
11007 for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
11008 dtrace_actdesc_hold(act);
11009
11010 new->dted_action = ecb->dted_action;
11011 new->dted_pred = ecb->dted_pred;
11012 new->dted_probe = *pd;
11013 new->dted_uarg = ecb->dted_uarg;
11014
11015 dtrace_enabling_add(enab, new);
11016}
11017
11018static void
11019dtrace_enabling_dump(dtrace_enabling_t *enab)
11020{
11021 int i;
11022
11023 for (i = 0; i < enab->dten_ndesc; i++) {
11024 dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
11025
11026 cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
11027 desc->dtpd_provider, desc->dtpd_mod,
11028 desc->dtpd_func, desc->dtpd_name);
11029 }
11030}
11031
11032static void
11033dtrace_enabling_destroy(dtrace_enabling_t *enab)
11034{
11035 int i;
11036 dtrace_ecbdesc_t *ep;
11037 dtrace_vstate_t *vstate = enab->dten_vstate;
11038
11039 ASSERT(MUTEX_HELD(&dtrace_lock));
11040
11041 for (i = 0; i < enab->dten_ndesc; i++) {
11042 dtrace_actdesc_t *act, *next;
11043 dtrace_predicate_t *pred;
11044
11045 ep = enab->dten_desc[i];
11046
11047 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
11048 dtrace_predicate_release(pred, vstate);
11049
11050 for (act = ep->dted_action; act != NULL; act = next) {
11051 next = act->dtad_next;
11052 dtrace_actdesc_release(act, vstate);
11053 }
11054
11055 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
11056 }
11057
11058 kmem_free(enab->dten_desc,
11059	    enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *));
11060
11061 /*
11062 * If this was a retained enabling, decrement the dts_nretained count
11063 * and take it off of the dtrace_retained list.
11064 */
11065 if (enab->dten_prev != NULL || enab->dten_next != NULL ||
11066 dtrace_retained == enab) {
11067 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11068 ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
11069 enab->dten_vstate->dtvs_state->dts_nretained--;
11070 dtrace_retained_gen++;
11071 }
11072
11073 if (enab->dten_prev == NULL) {
11074 if (dtrace_retained == enab) {
11075 dtrace_retained = enab->dten_next;
11076
11077 if (dtrace_retained != NULL)
11078 dtrace_retained->dten_prev = NULL;
11079 }
11080 } else {
11081 ASSERT(enab != dtrace_retained);
11082 ASSERT(dtrace_retained != NULL);
11083 enab->dten_prev->dten_next = enab->dten_next;
11084 }
11085
11086 if (enab->dten_next != NULL) {
11087 ASSERT(dtrace_retained != NULL);
11088 enab->dten_next->dten_prev = enab->dten_prev;
11089 }
11090
11091 kmem_free(enab, sizeof (dtrace_enabling_t));
11092}
11093
11094static int
11095dtrace_enabling_retain(dtrace_enabling_t *enab)
11096{
11097 dtrace_state_t *state;
11098
11099 ASSERT(MUTEX_HELD(&dtrace_lock));
11100 ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
11101 ASSERT(enab->dten_vstate != NULL);
11102
11103 state = enab->dten_vstate->dtvs_state;
11104 ASSERT(state != NULL);
11105
11106 /*
11107 * We only allow each state to retain dtrace_retain_max enablings.
11108 */
11109 if (state->dts_nretained >= dtrace_retain_max)
11110 return (ENOSPC);
11111
11112 state->dts_nretained++;
11113 dtrace_retained_gen++;
11114
11115 if (dtrace_retained == NULL) {
11116 dtrace_retained = enab;
11117 return (0);
11118 }
11119
11120 enab->dten_next = dtrace_retained;
11121 dtrace_retained->dten_prev = enab;
11122 dtrace_retained = enab;
11123
11124 return (0);
11125}
11126
11127static int
11128dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
11129 dtrace_probedesc_t *create)
11130{
11131 dtrace_enabling_t *new, *enab;
11132 int found = 0, err = ENOENT;
11133
11134 ASSERT(MUTEX_HELD(&dtrace_lock));
11135 ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
11136 ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
11137 ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
11138 ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
11139
11140 new = dtrace_enabling_create(&state->dts_vstate);
11141
11142 /*
11143 * Iterate over all retained enablings, looking for enablings that
11144 * match the specified state.
11145 */
11146 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
11147 int i;
11148
11149 /*
11150 * dtvs_state can only be NULL for helper enablings -- and
11151 * helper enablings can't be retained.
11152 */
11153 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11154
11155 if (enab->dten_vstate->dtvs_state != state)
11156 continue;
11157
11158 /*
11159 * Now iterate over each probe description; we're looking for
11160 * an exact match to the specified probe description.
11161 */
11162 for (i = 0; i < enab->dten_ndesc; i++) {
11163 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
11164 dtrace_probedesc_t *pd = &ep->dted_probe;
11165
11166 if (strcmp(pd->dtpd_provider, match->dtpd_provider))
11167 continue;
11168
11169 if (strcmp(pd->dtpd_mod, match->dtpd_mod))
11170 continue;
11171
11172 if (strcmp(pd->dtpd_func, match->dtpd_func))
11173 continue;
11174
11175 if (strcmp(pd->dtpd_name, match->dtpd_name))
11176 continue;
11177
11178 /*
11179 * We have a winning probe! Add it to our growing
11180 * enabling.
11181 */
11182 found = 1;
11183 dtrace_enabling_addlike(new, ep, create);
11184 }
11185 }
11186
11187 if (!found || (err = dtrace_enabling_retain(new)) != 0) {
11188 dtrace_enabling_destroy(new);
11189 return (err);
11190 }
11191
11192 return (0);
11193}
11194
11195static void
11196dtrace_enabling_retract(dtrace_state_t *state)
11197{
11198 dtrace_enabling_t *enab, *next;
11199
11200 ASSERT(MUTEX_HELD(&dtrace_lock));
11201
11202	 * Iterate over all retained enablings, destroying those retained
11203 * Iterate over all retained enablings, destroy the enablings retained
11204 * for the specified state.
11205 */
11206 for (enab = dtrace_retained; enab != NULL; enab = next) {
11207 next = enab->dten_next;
11208
11209 /*
11210 * dtvs_state can only be NULL for helper enablings -- and
11211 * helper enablings can't be retained.
11212 */
11213 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11214
11215 if (enab->dten_vstate->dtvs_state == state) {
11216 ASSERT(state->dts_nretained > 0);
11217 dtrace_enabling_destroy(enab);
11218 }
11219 }
11220
11221 ASSERT(state->dts_nretained == 0);
11222}
11223
11224static int
11225dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched)
11226{
11227 int i = 0;
11228 int total_matched = 0, matched = 0;
11229
11230 ASSERT(MUTEX_HELD(&cpu_lock));
11231 ASSERT(MUTEX_HELD(&dtrace_lock));
11232
11233 for (i = 0; i < enab->dten_ndesc; i++) {
11234 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
11235
11236 enab->dten_current = ep;
11237 enab->dten_error = 0;
11238
11239 /*
11240 * If a provider failed to enable a probe then get out and
11241 * let the consumer know we failed.
11242 */
11243 if ((matched = dtrace_probe_enable(&ep->dted_probe, enab)) < 0)
11244 return (EBUSY);
11245
11246 total_matched += matched;
11247
11248 if (enab->dten_error != 0) {
11249 /*
11250 * If we get an error half-way through enabling the
11251 * probes, we kick out -- perhaps with some number of
11252 * them enabled. Leaving enabled probes enabled may
11253 * be slightly confusing for user-level, but we expect
11254 * that no one will attempt to actually drive on in
11255 * the face of such errors. If this is an anonymous
11256 * enabling (indicated with a NULL nmatched pointer),
11257 * we cmn_err() a message. We aren't expecting to
11258 * get such an error -- such as it can exist at all,
11259			 * get such an error -- insofar as it can exist at all,
11260 * properties.
11261 */
11262 if (nmatched == NULL) {
11263 cmn_err(CE_WARN, "dtrace_enabling_match() "
11264 "error on %p: %d", (void *)ep,
11265 enab->dten_error);
11266 }
11267
11268 return (enab->dten_error);
11269 }
11270 }
11271
11272 enab->dten_probegen = dtrace_probegen;
11273 if (nmatched != NULL)
11274 *nmatched = total_matched;
11275
11276 return (0);
11277}
11278
11279static void
11280dtrace_enabling_matchall(void)
11281{
11282 dtrace_enabling_t *enab;
11283
11284 mutex_enter(&cpu_lock);
11285 mutex_enter(&dtrace_lock);
11286
11287 /*
11288 * Iterate over all retained enablings to see if any probes match
11289 * against them. We only perform this operation on enablings for which
11290 * we have sufficient permissions by virtue of being in the global zone
11291 * or in the same zone as the DTrace client. Because we can be called
11292 * after dtrace_detach() has been called, we cannot assert that there
11293 * are retained enablings. We can safely load from dtrace_retained,
11294 * however: the taskq_destroy() at the end of dtrace_detach() will
11295 * block pending our completion.
11296 */
11297 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
11298#ifndef VBOX
11299 cred_t *cr = enab->dten_vstate->dtvs_state->dts_cred.dcr_cred;
11300
11301 if (INGLOBALZONE(curproc) ||
11302 cr != NULL && getzoneid() == crgetzoneid(cr))
11303#endif
11304 (void) dtrace_enabling_match(enab, NULL);
11305 }
11306
11307 mutex_exit(&dtrace_lock);
11308 mutex_exit(&cpu_lock);
11309}
11310
11311/*
11312 * If an enabling is to be enabled without having matched probes (that is, if
11313 * dtrace_state_go() is to be called on the underlying dtrace_state_t), the
11314 * enabling must be _primed_ by creating an ECB for every ECB description.
11315 * This must be done to assure that we know the number of speculations, the
11316 * number of aggregations, the minimum buffer size needed, etc. before we
11317 * transition out of DTRACE_ACTIVITY_INACTIVE. To do this without actually
11318 * enabling any probes, we create ECBs for every ECB description, but with a
11319 * NULL probe -- which is exactly what this function does.
11320 */
11321static void
11322dtrace_enabling_prime(dtrace_state_t *state)
11323{
11324 dtrace_enabling_t *enab;
11325 int i;
11326
11327 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
11328 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11329
11330 if (enab->dten_vstate->dtvs_state != state)
11331 continue;
11332
11333 /*
11334 * We don't want to prime an enabling more than once, lest
11335 * we allow a malicious user to induce resource exhaustion.
11336 * (The ECBs that result from priming an enabling aren't
11337 * leaked -- but they also aren't deallocated until the
11338 * consumer state is destroyed.)
11339 */
11340 if (enab->dten_primed)
11341 continue;
11342
11343 for (i = 0; i < enab->dten_ndesc; i++) {
11344 enab->dten_current = enab->dten_desc[i];
11345 (void) dtrace_probe_enable(NULL, enab);
11346 }
11347
11348 enab->dten_primed = 1;
11349 }
11350}
11351
11352/*
11353 * Called to indicate that probes should be provided due to retained
11354 * enablings. This is implemented in terms of dtrace_probe_provide(), but it
11355 * must take an initial lap through the enabling calling the dtps_provide()
11356 * entry point explicitly to allow for autocreated probes.
11357 */
11358static void
11359dtrace_enabling_provide(dtrace_provider_t *prv)
11360{
11361 int i, all = 0;
11362 dtrace_probedesc_t desc;
11363 dtrace_genid_t gen;
11364
11365 ASSERT(MUTEX_HELD(&dtrace_lock));
11366 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
11367
11368 if (prv == NULL) {
11369 all = 1;
11370 prv = dtrace_provider;
11371 }
11372
11373 do {
11374 dtrace_enabling_t *enab;
11375 void *parg = prv->dtpv_arg;
11376
11377retry:
11378 gen = dtrace_retained_gen;
11379 for (enab = dtrace_retained; enab != NULL;
11380 enab = enab->dten_next) {
11381 for (i = 0; i < enab->dten_ndesc; i++) {
11382 desc = enab->dten_desc[i]->dted_probe;
11383 mutex_exit(&dtrace_lock);
11384 prv->dtpv_pops.dtps_provide(parg, &desc);
11385 mutex_enter(&dtrace_lock);
11386 /*
11387 * Process the retained enablings again if
11388 * they have changed while we weren't holding
11389 * dtrace_lock.
11390 */
11391 if (gen != dtrace_retained_gen)
11392 goto retry;
11393 }
11394 }
11395 } while (all && (prv = prv->dtpv_next) != NULL);
11396
11397 mutex_exit(&dtrace_lock);
11398 dtrace_probe_provide(NULL, all ? NULL : prv);
11399 mutex_enter(&dtrace_lock);
11400}
11401
11402/*
11403 * DTrace DOF Functions
11404 */
11405/*ARGSUSED*/
11406static void
11407dtrace_dof_error(dof_hdr_t *dof, const char *str)
11408{
11409 if (dtrace_err_verbose)
11410 cmn_err(CE_WARN, "failed to process DOF: %s", str);
11411
11412#ifdef DTRACE_ERRDEBUG
11413 dtrace_errdebug(str);
11414#endif
11415}
11416
11417/*
11418 * Create DOF out of a currently enabled state. Right now, we only create
11419 * DOF containing the run-time options -- but this could be expanded to create
11420 * complete DOF representing the enabled state.
11421 */
11422static dof_hdr_t *
11423dtrace_dof_create(dtrace_state_t *state)
11424{
11425 dof_hdr_t *dof;
11426 dof_sec_t *sec;
11427 dof_optdesc_t *opt;
11428 int i, len = sizeof (dof_hdr_t) +
11429 roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
11430 sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
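	/*
	 * The generated image is thus laid out as a dof_hdr_t, followed by
	 * a single dof_sec_t (padded to 64-bit alignment), followed by an
	 * array of DTRACEOPT_MAX option descriptions.
	 */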
11431
11432 ASSERT(MUTEX_HELD(&dtrace_lock));
11433
11434 dof = kmem_zalloc(len, KM_SLEEP);
11435 dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
11436 dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
11437 dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
11438 dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
11439
11440 dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
11441 dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
11442 dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
11443 dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
11444 dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
11445 dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
11446
11447 dof->dofh_flags = 0;
11448 dof->dofh_hdrsize = sizeof (dof_hdr_t);
11449 dof->dofh_secsize = sizeof (dof_sec_t);
11450 dof->dofh_secnum = 1; /* only DOF_SECT_OPTDESC */
11451 dof->dofh_secoff = sizeof (dof_hdr_t);
11452 dof->dofh_loadsz = len;
11453 dof->dofh_filesz = len;
11454 dof->dofh_pad = 0;
11455
11456 /*
11457 * Fill in the option section header...
11458 */
11459 sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
11460 sec->dofs_type = DOF_SECT_OPTDESC;
11461 sec->dofs_align = sizeof (uint64_t);
11462 sec->dofs_flags = DOF_SECF_LOAD;
11463 sec->dofs_entsize = sizeof (dof_optdesc_t);
11464
11465 opt = (dof_optdesc_t *)((uintptr_t)sec +
11466 roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
11467
11468 sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
11469 sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
11470
11471 for (i = 0; i < DTRACEOPT_MAX; i++) {
11472 opt[i].dofo_option = i;
11473 opt[i].dofo_strtab = DOF_SECIDX_NONE;
11474 opt[i].dofo_value = state->dts_options[i];
11475 }
11476
11477 return (dof);
11478}
11479
11480static dof_hdr_t *
11481dtrace_dof_copyin(uintptr_t uarg, int *errp)
11482{
11483 dof_hdr_t hdr, *dof;
11484
11485 ASSERT(!MUTEX_HELD(&dtrace_lock));
11486
11487 /*
11488 * First, we're going to copyin() the sizeof (dof_hdr_t).
11489 */
11490 if (copyin((void *)uarg, &hdr, sizeof (hdr)) != 0) {
11491 dtrace_dof_error(NULL, "failed to copyin DOF header");
11492 *errp = EFAULT;
11493 return (NULL);
11494 }
11495
11496 /*
11497 * Now we'll allocate the entire DOF and copy it in -- provided
11498 * that the length isn't outrageous.
11499 */
11500 if (hdr.dofh_loadsz >= VBDTCAST(uint64_t)dtrace_dof_maxsize) {
11501 dtrace_dof_error(&hdr, "load size exceeds maximum");
11502 *errp = E2BIG;
11503 return (NULL);
11504 }
11505
11506 if (hdr.dofh_loadsz < sizeof (hdr)) {
11507 dtrace_dof_error(&hdr, "invalid load size");
11508 *errp = EINVAL;
11509 return (NULL);
11510 }
11511
11512 dof = kmem_alloc(hdr.dofh_loadsz, KM_SLEEP);
11513
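	/*
	 * Note that dofh_loadsz is re-checked after the full copyin():
	 * user memory may change between the two copies, so this guards
	 * against the header being altered underneath us (a classic
	 * time-of-check-to-time-of-use race).
	 */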
11514 if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0 ||
11515 dof->dofh_loadsz != hdr.dofh_loadsz) {
11516 kmem_free(dof, hdr.dofh_loadsz);
11517 *errp = EFAULT;
11518 return (NULL);
11519 }
11520
11521 return (dof);
11522}
11523
11524static dof_hdr_t *
11525dtrace_dof_property(const char *name)
11526{
11527#ifndef VBOX
11528 uchar_t *buf;
11529 uint64_t loadsz;
11530 unsigned int len, i;
11531 dof_hdr_t *dof;
11532
11533 /*
11534	 * Unfortunately, arrays of values in .conf files are always (and
11535 * only) interpreted to be integer arrays. We must read our DOF
11536 * as an integer array, and then squeeze it into a byte array.
11537 */
11538 if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dtrace_devi, 0,
11539 (char *)name, (int **)&buf, &len) != DDI_PROP_SUCCESS)
11540 return (NULL);
11541
11542 for (i = 0; i < len; i++)
11543 buf[i] = (uchar_t)(((int *)buf)[i]);
11544
11545 if (len < sizeof (dof_hdr_t)) {
11546 ddi_prop_free(buf);
11547 dtrace_dof_error(NULL, "truncated header");
11548 return (NULL);
11549 }
11550
11551 if (len < (loadsz = ((dof_hdr_t *)buf)->dofh_loadsz)) {
11552 ddi_prop_free(buf);
11553 dtrace_dof_error(NULL, "truncated DOF");
11554 return (NULL);
11555 }
11556
11557 if (loadsz >= dtrace_dof_maxsize) {
11558 ddi_prop_free(buf);
11559 dtrace_dof_error(NULL, "oversized DOF");
11560 return (NULL);
11561 }
11562
11563 dof = kmem_alloc(loadsz, KM_SLEEP);
11564 bcopy(buf, dof, loadsz);
11565 ddi_prop_free(buf);
11566
11567 return (dof);
11568#else /* VBOX */
11569 return (NULL);
11570#endif /* VBOX */
11571}
11572
11573static void
11574dtrace_dof_destroy(dof_hdr_t *dof)
11575{
11576 kmem_free(dof, dof->dofh_loadsz);
11577}
11578
11579/*
11580 * Return the dof_sec_t pointer corresponding to a given section index. If the
11581 * index is not valid, dtrace_dof_error() is called and NULL is returned. If
11582 * a type other than DOF_SECT_NONE is specified, the header is checked against
11583 * this type and NULL is returned if the types do not match.
11584 */
11585static dof_sec_t *
11586dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
11587{
11588	dof_sec_t *sec;
11589
11590	if (i >= dof->dofh_secnum) {
11591		dtrace_dof_error(dof, "referenced section index is invalid");
11592		return (NULL);
11593	}
11594	sec = (dof_sec_t *)(uintptr_t)
11595	    ((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);
11596 if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
11597 dtrace_dof_error(dof, "referenced section is not loadable");
11598 return (NULL);
11599 }
11600
11601 if (type != DOF_SECT_NONE && type != sec->dofs_type) {
11602 dtrace_dof_error(dof, "referenced section is the wrong type");
11603 return (NULL);
11604 }
11605
11606 return (sec);
11607}
11608
11609static dtrace_probedesc_t *
11610dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
11611{
11612 dof_probedesc_t *probe;
11613 dof_sec_t *strtab;
11614 uintptr_t daddr = (uintptr_t)dof;
11615 uintptr_t str;
11616 size_t size;
11617
11618 if (sec->dofs_type != DOF_SECT_PROBEDESC) {
11619 dtrace_dof_error(dof, "invalid probe section");
11620 return (NULL);
11621 }
11622
11623 if (sec->dofs_align != sizeof (dof_secidx_t)) {
11624 dtrace_dof_error(dof, "bad alignment in probe description");
11625 return (NULL);
11626 }
11627
11628 if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
11629 dtrace_dof_error(dof, "truncated probe description");
11630 return (NULL);
11631 }
11632
11633 probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
11634 strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);
11635
11636 if (strtab == NULL)
11637 return (NULL);
11638
11639 str = daddr + strtab->dofs_offset;
11640 size = strtab->dofs_size;
11641
11642 if (probe->dofp_provider >= strtab->dofs_size) {
11643 dtrace_dof_error(dof, "corrupt probe provider");
11644 return (NULL);
11645 }
11646
11647 (void) strncpy(desc->dtpd_provider,
11648 (char *)(str + probe->dofp_provider),
11649 MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));
11650
11651 if (probe->dofp_mod >= strtab->dofs_size) {
11652 dtrace_dof_error(dof, "corrupt probe module");
11653 return (NULL);
11654 }
11655
11656 (void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
11657 MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));
11658
11659 if (probe->dofp_func >= strtab->dofs_size) {
11660 dtrace_dof_error(dof, "corrupt probe function");
11661 return (NULL);
11662 }
11663
11664 (void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
11665 MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));
11666
11667 if (probe->dofp_name >= strtab->dofs_size) {
11668 dtrace_dof_error(dof, "corrupt probe name");
11669 return (NULL);
11670 }
11671
11672 (void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
11673 MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));
11674
11675 return (desc);
11676}
11677
11678static dtrace_difo_t *
11679dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
11680 cred_t *cr)
11681{
11682 dtrace_difo_t *dp;
11683 size_t ttl = 0;
11684 dof_difohdr_t *dofd;
11685 uintptr_t daddr = (uintptr_t)dof;
11686 size_t max = dtrace_difo_maxsize;
11687 int i, l, n;
11688
11689 static const struct {
11690 int section;
11691 int bufoffs;
11692 int lenoffs;
11693 int entsize;
11694 int align;
11695 const char *msg;
11696 } difo[] = {
11697 { DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
11698 offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
11699 sizeof (dif_instr_t), "multiple DIF sections" },
11700
11701 { DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
11702 offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
11703 sizeof (uint64_t), "multiple integer tables" },
11704
11705 { DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
11706 offsetof(dtrace_difo_t, dtdo_strlen), 0,
11707 sizeof (char), "multiple string tables" },
11708
11709 { DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
11710 offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
11711 sizeof (uint_t), "multiple variable tables" },
11712
11713		{ DOF_SECT_NONE, 0, 0, 0, 0, NULL }
11714 };
11715
11716 if (sec->dofs_type != DOF_SECT_DIFOHDR) {
11717 dtrace_dof_error(dof, "invalid DIFO header section");
11718 return (NULL);
11719 }
11720
11721 if (sec->dofs_align != sizeof (dof_secidx_t)) {
11722 dtrace_dof_error(dof, "bad alignment in DIFO header");
11723 return (NULL);
11724 }
11725
11726 if (sec->dofs_size < sizeof (dof_difohdr_t) ||
11727 sec->dofs_size % sizeof (dof_secidx_t)) {
11728 dtrace_dof_error(dof, "bad size in DIFO header");
11729 return (NULL);
11730 }
11731
11732 dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
11733 n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;
11734
11735 dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
11736 dp->dtdo_rtype = dofd->dofd_rtype;
11737
11738 for (l = 0; l < n; l++) {
11739 dof_sec_t *subsec;
11740 void **bufp;
11741 uint32_t *lenp;
11742
11743 if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
11744 dofd->dofd_links[l])) == NULL)
11745 goto err; /* invalid section link */
11746
11747 if (ttl + subsec->dofs_size > max) {
11748 dtrace_dof_error(dof, "exceeds maximum size");
11749 goto err;
11750 }
11751
11752 ttl += subsec->dofs_size;
11753
11754 for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {
11755 if (subsec->dofs_type != VBDTCAST(uint32_t)difo[i].section)
11756 continue;
11757
11758 if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
11759 dtrace_dof_error(dof, "section not loaded");
11760 goto err;
11761 }
11762
11763 if (subsec->dofs_align != VBDTCAST(uint32_t)difo[i].align) {
11764 dtrace_dof_error(dof, "bad alignment");
11765 goto err;
11766 }
11767
11768 bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
11769 lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);
11770
11771 if (*bufp != NULL) {
11772 dtrace_dof_error(dof, difo[i].msg);
11773 goto err;
11774 }
11775
11776 if (VBDTCAST(uint32_t)difo[i].entsize != subsec->dofs_entsize) {
11777 dtrace_dof_error(dof, "entry size mismatch");
11778 goto err;
11779 }
11780
11781 if (subsec->dofs_entsize != 0 &&
11782 (subsec->dofs_size % subsec->dofs_entsize) != 0) {
11783 dtrace_dof_error(dof, "corrupt entry size");
11784 goto err;
11785 }
11786
11787 *lenp = subsec->dofs_size;
11788 *bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
11789 bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
11790 *bufp, subsec->dofs_size);
11791
11792 if (subsec->dofs_entsize != 0)
11793 *lenp /= subsec->dofs_entsize;
11794
11795 break;
11796 }
11797
11798 /*
11799 * If we encounter a loadable DIFO sub-section that is not
11800 * known to us, assume this is a broken program and fail.
11801 */
11802 if (difo[i].section == DOF_SECT_NONE &&
11803 (subsec->dofs_flags & DOF_SECF_LOAD)) {
11804 dtrace_dof_error(dof, "unrecognized DIFO subsection");
11805 goto err;
11806 }
11807 }
11808
11809 if (dp->dtdo_buf == NULL) {
11810 /*
11811 * We can't have a DIF object without DIF text.
11812 */
11813 dtrace_dof_error(dof, "missing DIF text");
11814 goto err;
11815 }
11816
11817 /*
11818 * Before we validate the DIF object, run through the variable table
11819	 * looking for the strings -- if any of their sizes are unset (zero),
11820	 * we'll set them to the system-wide default string size. Note that
11821 * this should _not_ happen if the "strsize" option has been set --
11822 * in this case, the compiler should have set the size to reflect the
11823 * setting of the option.
11824 */
11825 for (i = 0; VBDTCAST(unsigned)i < dp->dtdo_varlen; i++) {
11826 dtrace_difv_t *v = &dp->dtdo_vartab[i];
11827 dtrace_diftype_t *t = &v->dtdv_type;
11828
11829 if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
11830 continue;
11831
11832 if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
11833 t->dtdt_size = VBDTCAST(uint32_t)dtrace_strsize_default;
11834 }
11835
11836 if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
11837 goto err;
11838
11839 dtrace_difo_init(dp, vstate);
11840 return (dp);
11841
11842err:
11843 kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
11844 kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
11845 kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
11846 kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
11847
11848 kmem_free(dp, sizeof (dtrace_difo_t));
11849 return (NULL);
11850}
11851
11852static dtrace_predicate_t *
11853dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
11854 cred_t *cr)
11855{
11856 dtrace_difo_t *dp;
11857
11858 if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
11859 return (NULL);
11860
11861 return (dtrace_predicate_create(dp));
11862}
11863
11864static dtrace_actdesc_t *
11865dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
11866 cred_t *cr)
11867{
11868 dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
11869 dof_actdesc_t *desc;
11870 dof_sec_t *difosec;
11871 size_t offs;
11872 uintptr_t daddr = (uintptr_t)dof;
11873 uint64_t arg;
11874 dtrace_actkind_t kind;
11875
11876 if (sec->dofs_type != DOF_SECT_ACTDESC) {
11877 dtrace_dof_error(dof, "invalid action section");
11878 return (NULL);
11879 }
11880
11881 if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
11882 dtrace_dof_error(dof, "truncated action description");
11883 return (NULL);
11884 }
11885
11886 if (sec->dofs_align != sizeof (uint64_t)) {
11887 dtrace_dof_error(dof, "bad alignment in action description");
11888 return (NULL);
11889 }
11890
11891 if (sec->dofs_size < sec->dofs_entsize) {
11892 dtrace_dof_error(dof, "section entry size exceeds total size");
11893 return (NULL);
11894 }
11895
11896 if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
11897 dtrace_dof_error(dof, "bad entry size in action description");
11898 return (NULL);
11899 }
11900
11901 if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
11902 dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
11903 return (NULL);
11904 }
11905
11906 for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
11907 desc = (dof_actdesc_t *)(daddr +
11908 (uintptr_t)sec->dofs_offset + offs);
11909 kind = (dtrace_actkind_t)desc->dofa_kind;
11910
11911 if (DTRACEACT_ISPRINTFLIKE(kind) &&
11912 (kind != DTRACEACT_PRINTA ||
11913 desc->dofa_strtab != DOF_SECIDX_NONE)) {
11914 dof_sec_t *strtab;
11915 char *str, *fmt;
11916 uint64_t i;
11917
11918 /*
11919 * printf()-like actions must have a format string.
11920 */
11921 if ((strtab = dtrace_dof_sect(dof,
11922 DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
11923 goto err;
11924
11925 str = (char *)((uintptr_t)dof +
11926 (uintptr_t)strtab->dofs_offset);
11927
11928 for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
11929 if (str[i] == '\0')
11930 break;
11931 }
11932
11933 if (i >= strtab->dofs_size) {
11934 dtrace_dof_error(dof, "bogus format string");
11935 goto err;
11936 }
11937
11938 if (i == desc->dofa_arg) {
11939 dtrace_dof_error(dof, "empty format string");
11940 goto err;
11941 }
11942
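			/*
			 * Illustration of the length math below: if the
			 * string table holds "...\0cpu %d\0..." and
			 * dofa_arg indexes the 'c', the scan above stopped
			 * at the NUL, so subtracting dofa_arg leaves
			 * i == strlen("cpu %d"), and the copy of i + 1
			 * bytes takes the terminating NUL as well.
			 */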
11943 i -= desc->dofa_arg;
11944 fmt = kmem_alloc(i + 1, KM_SLEEP);
11945 bcopy(&str[desc->dofa_arg], fmt, i + 1);
11946 arg = (uint64_t)(uintptr_t)fmt;
11947 } else {
11948 if (kind == DTRACEACT_PRINTA) {
11949 ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
11950 arg = 0;
11951 } else {
11952 arg = desc->dofa_arg;
11953 }
11954 }
11955
11956 act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
11957 desc->dofa_uarg, arg);
11958
11959 if (last != NULL) {
11960 last->dtad_next = act;
11961 } else {
11962 first = act;
11963 }
11964
11965 last = act;
11966
11967 if (desc->dofa_difo == DOF_SECIDX_NONE)
11968 continue;
11969
11970 if ((difosec = dtrace_dof_sect(dof,
11971 DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
11972 goto err;
11973
11974 act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);
11975
11976 if (act->dtad_difo == NULL)
11977 goto err;
11978 }
11979
11980 ASSERT(first != NULL);
11981 return (first);
11982
11983err:
11984 for (act = first; act != NULL; act = next) {
11985 next = act->dtad_next;
11986 dtrace_actdesc_release(act, vstate);
11987 }
11988
11989 return (NULL);
11990}
11991
11992static dtrace_ecbdesc_t *
11993dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
11994 cred_t *cr)
11995{
11996 dtrace_ecbdesc_t *ep;
11997 dof_ecbdesc_t *ecb;
11998 dtrace_probedesc_t *desc;
11999 dtrace_predicate_t *pred = NULL;
12000
12001 if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
12002 dtrace_dof_error(dof, "truncated ECB description");
12003 return (NULL);
12004 }
12005
12006 if (sec->dofs_align != sizeof (uint64_t)) {
12007 dtrace_dof_error(dof, "bad alignment in ECB description");
12008 return (NULL);
12009 }
12010
12011 ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
12012 sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);
12013
12014 if (sec == NULL)
12015 return (NULL);
12016
12017 ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
12018 ep->dted_uarg = ecb->dofe_uarg;
12019 desc = &ep->dted_probe;
12020
12021 if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
12022 goto err;
12023
12024 if (ecb->dofe_pred != DOF_SECIDX_NONE) {
12025 if ((sec = dtrace_dof_sect(dof,
12026 DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
12027 goto err;
12028
12029 if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
12030 goto err;
12031
12032 ep->dted_pred.dtpdd_predicate = pred;
12033 }
12034
12035 if (ecb->dofe_actions != DOF_SECIDX_NONE) {
12036 if ((sec = dtrace_dof_sect(dof,
12037 DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
12038 goto err;
12039
12040 ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);
12041
12042 if (ep->dted_action == NULL)
12043 goto err;
12044 }
12045
12046 return (ep);
12047
12048err:
12049 if (pred != NULL)
12050 dtrace_predicate_release(pred, vstate);
12051 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
12052 return (NULL);
12053}
12054
12055/*
12056 * Apply the relocations from the specified 'sec' (a DOF_SECT_URELHDR) to the
12057 * specified DOF. At present, this amounts to simply adding 'ubase' to the
12058 * site of any user SETX relocations to account for load object base address.
12059 * In the future, if we need other relocations, this function can be extended.
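 *
 * Illustration: if the load object containing the probes is mapped at
 * 0x7f0000000000, each 64-bit constant named by a SETX relocation has that
 * base address ('ubase') added to it below.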
12060 */
12061static int
12062dtrace_dof_relocate(dof_hdr_t *dof, dof_sec_t *sec, uint64_t ubase)
12063{
12064 uintptr_t daddr = (uintptr_t)dof;
12065 dof_relohdr_t *dofr =
12066 (dof_relohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
12067 dof_sec_t *ss, *rs, *ts;
12068 dof_relodesc_t *r;
12069 uint_t i, n;
12070
12071 if (sec->dofs_size < sizeof (dof_relohdr_t) ||
12072 sec->dofs_align != sizeof (dof_secidx_t)) {
12073 dtrace_dof_error(dof, "invalid relocation header");
12074 return (-1);
12075 }
12076
12077 ss = dtrace_dof_sect(dof, DOF_SECT_STRTAB, dofr->dofr_strtab);
12078 rs = dtrace_dof_sect(dof, DOF_SECT_RELTAB, dofr->dofr_relsec);
12079 ts = dtrace_dof_sect(dof, DOF_SECT_NONE, dofr->dofr_tgtsec);
12080
12081 if (ss == NULL || rs == NULL || ts == NULL)
12082 return (-1); /* dtrace_dof_error() has been called already */
12083
12084 if (rs->dofs_entsize < sizeof (dof_relodesc_t) ||
12085 rs->dofs_align != sizeof (uint64_t)) {
12086 dtrace_dof_error(dof, "invalid relocation section");
12087 return (-1);
12088 }
12089
12090 r = (dof_relodesc_t *)(uintptr_t)(daddr + rs->dofs_offset);
12091 n = rs->dofs_size / rs->dofs_entsize;
12092
12093 for (i = 0; i < n; i++) {
12094 uintptr_t taddr = daddr + ts->dofs_offset + r->dofr_offset;
12095
12096 switch (r->dofr_type) {
12097 case DOF_RELO_NONE:
12098 break;
12099 case DOF_RELO_SETX:
12100 if (r->dofr_offset >= ts->dofs_size || r->dofr_offset +
12101 sizeof (uint64_t) > ts->dofs_size) {
12102 dtrace_dof_error(dof, "bad relocation offset");
12103 return (-1);
12104 }
12105
12106 if (!IS_P2ALIGNED(taddr, sizeof (uint64_t))) {
12107 dtrace_dof_error(dof, "misaligned setx relo");
12108 return (-1);
12109 }
12110
12111 *(uint64_t *)taddr += ubase;
12112 break;
12113 default:
12114 dtrace_dof_error(dof, "invalid relocation type");
12115 return (-1);
12116 }
12117
12118 r = (dof_relodesc_t *)((uintptr_t)r + rs->dofs_entsize);
12119 }
12120
12121 return (0);
12122}
12123
12124/*
12125 * The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
12126 * header: it should be at the front of a memory region that is at least
12127 * sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
12128 * size. It need not be validated in any other way.
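 *
 * The DOF is consumed in three passes below: the identification bytes and
 * section headers are validated first, any URELHDR relocations are then
 * applied, and finally each ECBDESC section is converted into an ECB
 * description and added to the enabling.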
12129 */
12130static int
12131dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
12132 dtrace_enabling_t **enabp, uint64_t ubase, int noprobes)
12133{
12134 uint64_t len = dof->dofh_loadsz, seclen;
12135 uintptr_t daddr = (uintptr_t)dof;
12136 dtrace_ecbdesc_t *ep;
12137 dtrace_enabling_t *enab;
12138 uint_t i;
12139
12140 ASSERT(MUTEX_HELD(&dtrace_lock));
12141 ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));
12142
12143 /*
12144 * Check the DOF header identification bytes. In addition to checking
12145 * valid settings, we also verify that unused bits/bytes are zeroed so
12146 * we can use them later without fear of regressing existing binaries.
12147 */
12148 if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],
12149 DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {
12150 dtrace_dof_error(dof, "DOF magic string mismatch");
12151 return (-1);
12152 }
12153
12154 if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
12155 dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
12156 dtrace_dof_error(dof, "DOF has invalid data model");
12157 return (-1);
12158 }
12159
12160 if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
12161 dtrace_dof_error(dof, "DOF encoding mismatch");
12162 return (-1);
12163 }
12164
12165 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
12166 dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_2) {
12167 dtrace_dof_error(dof, "DOF version mismatch");
12168 return (-1);
12169 }
12170
12171 if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
12172 dtrace_dof_error(dof, "DOF uses unsupported instruction set");
12173 return (-1);
12174 }
12175
12176 if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
12177 dtrace_dof_error(dof, "DOF uses too many integer registers");
12178 return (-1);
12179 }
12180
12181 if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
12182 dtrace_dof_error(dof, "DOF uses too many tuple registers");
12183 return (-1);
12184 }
12185
12186 for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
12187 if (dof->dofh_ident[i] != 0) {
12188 dtrace_dof_error(dof, "DOF has invalid ident byte set");
12189 return (-1);
12190 }
12191 }
12192
12193 if (dof->dofh_flags & ~DOF_FL_VALID) {
12194 dtrace_dof_error(dof, "DOF has invalid flag bits set");
12195 return (-1);
12196 }
12197
12198 if (dof->dofh_secsize == 0) {
12199 dtrace_dof_error(dof, "zero section header size");
12200 return (-1);
12201 }
12202
12203 /*
12204 * Check that the section headers don't exceed the amount of DOF
12205 * data. Note that we cast the section size and number of sections
12206 * to uint64_t's to prevent possible overflow in the multiplication.
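	 * (With 32-bit arithmetic, e.g. dofh_secnum = 0x02000000 and
	 * dofh_secsize = 0x80 would multiply to 0x100000000 and wrap to
	 * zero, defeating the bounds check below.)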
12207 */
12208 seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;
12209
12210 if (dof->dofh_secoff > len || seclen > len ||
12211 dof->dofh_secoff + seclen > len) {
12212 dtrace_dof_error(dof, "truncated section headers");
12213 return (-1);
12214 }
12215
12216 if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
12217 dtrace_dof_error(dof, "misaligned section headers");
12218 return (-1);
12219 }
12220
12221 if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
12222 dtrace_dof_error(dof, "misaligned section size");
12223 return (-1);
12224 }
12225
12226 /*
12227 * Take an initial pass through the section headers to be sure that
12228 * the headers don't have stray offsets. If the 'noprobes' flag is
12229 * set, do not permit sections relating to providers, probes, or args.
12230 */
12231 for (i = 0; i < dof->dofh_secnum; i++) {
12232 dof_sec_t *sec = (dof_sec_t *)(daddr +
12233 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12234
12235 if (noprobes) {
12236 switch (sec->dofs_type) {
12237 case DOF_SECT_PROVIDER:
12238 case DOF_SECT_PROBES:
12239 case DOF_SECT_PRARGS:
12240 case DOF_SECT_PROFFS:
12241 dtrace_dof_error(dof, "illegal sections "
12242 "for enabling");
12243 return (-1);
12244 }
12245 }
12246
12247 if (DOF_SEC_ISLOADABLE(sec->dofs_type) &&
12248 !(sec->dofs_flags & DOF_SECF_LOAD)) {
12249 dtrace_dof_error(dof, "loadable section with load "
12250 "flag unset");
12251 return (-1);
12252 }
12253
12254 if (!(sec->dofs_flags & DOF_SECF_LOAD))
12255 continue; /* just ignore non-loadable sections */
12256
12257 if (sec->dofs_align & (sec->dofs_align - 1)) {
12258 dtrace_dof_error(dof, "bad section alignment");
12259 return (-1);
12260 }
12261
12262 if (sec->dofs_offset & (sec->dofs_align - 1)) {
12263 dtrace_dof_error(dof, "misaligned section");
12264 return (-1);
12265 }
12266
12267 if (sec->dofs_offset > len || sec->dofs_size > len ||
12268 sec->dofs_offset + sec->dofs_size > len) {
12269 dtrace_dof_error(dof, "corrupt section header");
12270 return (-1);
12271 }
12272
12273 if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +
12274 sec->dofs_offset + sec->dofs_size - 1) != '\0') {
12275 dtrace_dof_error(dof, "non-terminating string table");
12276 return (-1);
12277 }
12278 }
12279
12280 /*
12281 * Take a second pass through the sections and locate and perform any
12282 * relocations that are present. We do this after the first pass to
12283 * be sure that all sections have had their headers validated.
12284 */
12285 for (i = 0; i < dof->dofh_secnum; i++) {
12286 dof_sec_t *sec = (dof_sec_t *)(daddr +
12287 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12288
12289 if (!(sec->dofs_flags & DOF_SECF_LOAD))
12290 continue; /* skip sections that are not loadable */
12291
12292 switch (sec->dofs_type) {
12293 case DOF_SECT_URELHDR:
12294 if (dtrace_dof_relocate(dof, sec, ubase) != 0)
12295 return (-1);
12296 break;
12297 }
12298 }
12299
12300 if ((enab = *enabp) == NULL)
12301 enab = *enabp = dtrace_enabling_create(vstate);
12302
12303 for (i = 0; i < dof->dofh_secnum; i++) {
12304 dof_sec_t *sec = (dof_sec_t *)(daddr +
12305 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12306
12307 if (sec->dofs_type != DOF_SECT_ECBDESC)
12308 continue;
12309
12310 if ((ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr)) == NULL) {
12311 dtrace_enabling_destroy(enab);
12312 *enabp = NULL;
12313 return (-1);
12314 }
12315
12316 dtrace_enabling_add(enab, ep);
12317 }
12318
12319 return (0);
12320}
12321
12322/*
12323 * Process DOF for any options. This routine assumes that the DOF has been
12324 * at least processed by dtrace_dof_slurp().
12325 */
12326static int
12327dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
12328{
12329 int i, rval;
12330 uint32_t entsize;
12331 size_t offs;
12332 dof_optdesc_t *desc;
12333
12334 for (i = 0; VBDTCAST(unsigned)i < dof->dofh_secnum; i++) {
12335 dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
12336 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12337
12338 if (sec->dofs_type != DOF_SECT_OPTDESC)
12339 continue;
12340
12341 if (sec->dofs_align != sizeof (uint64_t)) {
12342 dtrace_dof_error(dof, "bad alignment in "
12343 "option description");
12344 return (EINVAL);
12345 }
12346
12347 if ((entsize = sec->dofs_entsize) == 0) {
12348 dtrace_dof_error(dof, "zeroed option entry size");
12349 return (EINVAL);
12350 }
12351
12352 if (entsize < sizeof (dof_optdesc_t)) {
12353 dtrace_dof_error(dof, "bad option entry size");
12354 return (EINVAL);
12355 }
12356
12357 for (offs = 0; offs < sec->dofs_size; offs += entsize) {
12358 desc = (dof_optdesc_t *)((uintptr_t)dof +
12359 (uintptr_t)sec->dofs_offset + offs);
12360
12361 if (desc->dofo_strtab != DOF_SECIDX_NONE) {
12362 dtrace_dof_error(dof, "non-zero option string");
12363 return (EINVAL);
12364 }
12365
12366 if (desc->dofo_value == VBDTCAST(uint64_t)DTRACEOPT_UNSET) {
12367 dtrace_dof_error(dof, "unset option");
12368 return (EINVAL);
12369 }
12370
12371 if ((rval = dtrace_state_option(state,
12372 desc->dofo_option, desc->dofo_value)) != 0) {
12373 dtrace_dof_error(dof, "rejected option");
12374 return (rval);
12375 }
12376 }
12377 }
12378
12379 return (0);
12380}
12381
12382/*
12383 * DTrace Consumer State Functions
12384 */
12385VBDTSTATIC int
12386dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
12387{
12388 size_t hashsize, maxper, min, chunksize = dstate->dtds_chunksize;
12389 void *base;
12390 uintptr_t limit;
12391 dtrace_dynvar_t *dvar, *next, *start;
12392 VBDTTYPE(size_t,int) i;
12393
12394 ASSERT(MUTEX_HELD(&dtrace_lock));
12395 ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);
12396
12397 bzero(dstate, sizeof (dtrace_dstate_t));
12398
12399 if ((dstate->dtds_chunksize = chunksize) == 0)
12400 dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
12401
12402 if (size < (min = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
12403 size = min;
12404
12405 if ((base = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
12406 return (ENOMEM);
12407
12408 dstate->dtds_size = size;
12409 dstate->dtds_base = base;
12410 dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
12411 bzero(dstate->dtds_percpu, NCPU * sizeof (dtrace_dstate_percpu_t));
12412
12413 hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
12414
12415 if (hashsize != 1 && (hashsize & 1))
12416 hashsize--;
12417
12418 dstate->dtds_hashsize = hashsize;
12419 dstate->dtds_hash = dstate->dtds_base;
12420
12421 /*
12422 * Set all of our hash buckets to point to the single sink, and (if
12423 * it hasn't already been set), set the sink's hash value to be the
12424 * sink sentinel value. The sink is needed for dynamic variable
12425 * lookups to know that they have iterated over an entire, valid hash
12426 * chain.
12427 */
12428 for (i = 0; i < hashsize; i++)
12429 dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;
12430
12431 if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
12432 dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;
12433
12434 /*
12435	 * Divide the dynamic variable free list evenly among the NCPU
12436	 * per-CPU free lists.
12437 */
12438 start = (dtrace_dynvar_t *)
12439 ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
12440 limit = (uintptr_t)base + size;
12441
12442 maxper = (limit - (uintptr_t)start) / NCPU;
12443 maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
12444
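	/*
	 * Illustrative sizing (hypothetical numbers): if 1MB of dynamic
	 * variable space remains past the hash table, chunksize is 256
	 * bytes and NCPU is 4, then maxper is 256KB -- a free list of
	 * 1024 chunks per CPU -- with any remainder going to the last CPU.
	 */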
12445 for (i = 0; i < NCPU; i++) {
12446 dstate->dtds_percpu[i].dtdsc_free = dvar = start;
12447
12448 /*
12449 * If we don't even have enough chunks to make it once through
12450 * NCPUs, we're just going to allocate everything to the first
12451 * CPU. And if we're on the last CPU, we're going to allocate
12452 * whatever is left over. In either case, we set the limit to
12453 * be the limit of the dynamic variable space.
12454 */
12455 if (maxper == 0 || i == NCPU - 1) {
12456 limit = (uintptr_t)base + size;
12457 start = NULL;
12458 } else {
12459 limit = (uintptr_t)start + maxper;
12460 start = (dtrace_dynvar_t *)limit;
12461 }
12462
12463 ASSERT(limit <= (uintptr_t)base + size);
12464
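		/*
		 * Chain together every chunk that fits wholly below 'limit';
		 * the final chunk's dtdv_next is left NULL, courtesy of the
		 * kmem_zalloc() of 'base' above.
		 */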
12465 for (;;) {
12466 next = (dtrace_dynvar_t *)((uintptr_t)dvar +
12467 dstate->dtds_chunksize);
12468
12469 if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
12470 break;
12471
12472 dvar->dtdv_next = next;
12473 dvar = next;
12474 }
12475
12476 if (maxper == 0)
12477 break;
12478 }
12479
12480 return (0);
12481}
12482
12483VBDTSTATIC void
12484dtrace_dstate_fini(dtrace_dstate_t *dstate)
12485{
12486 ASSERT(MUTEX_HELD(&cpu_lock));
12487
12488 if (dstate->dtds_base == NULL)
12489 return;
12490
12491 kmem_free(dstate->dtds_base, dstate->dtds_size);
12492 kmem_cache_free(dtrace_state_cache, dstate->dtds_percpu);
12493}
12494
12495static void
12496dtrace_vstate_fini(dtrace_vstate_t *vstate)
12497{
12498 /*
12499 * Logical XOR, where are you?
12500 */
12501 ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));
12502
12503 if (vstate->dtvs_nglobals > 0) {
12504 kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
12505 sizeof (dtrace_statvar_t *));
12506 }
12507
12508 if (vstate->dtvs_ntlocals > 0) {
12509 kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
12510 sizeof (dtrace_difv_t));
12511 }
12512
12513 ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));
12514
12515 if (vstate->dtvs_nlocals > 0) {
12516 kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
12517 sizeof (dtrace_statvar_t *));
12518 }
12519}
12520
12521static void
12522dtrace_state_clean(dtrace_state_t *state)
12523{
12524 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
12525 return;
12526
12527 dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
12528 dtrace_speculation_clean(state);
12529}
12530#ifdef VBOX
12531static DECLCALLBACK(void) dtrace_state_clean_timer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
12532{
12533 dtrace_state_clean((dtrace_state_t *)pvUser);
12534 NOREF(pTimer); NOREF(iTick);
12535}
12536#endif
12537
12538static void
12539dtrace_state_deadman(dtrace_state_t *state)
12540{
12541 hrtime_t now;
12542
12543 dtrace_sync();
12544
12545 now = dtrace_gethrtime();
12546
12547 if (state != dtrace_anon.dta_state &&
12548 now - state->dts_laststatus >= dtrace_deadman_user)
12549 return;
12550
12551 /*
12552 * We must be sure that dts_alive never appears to be less than the
12553 * value upon entry to dtrace_state_deadman(), and because we lack a
12554 * dtrace_cas64(), we cannot store to it atomically. We thus instead
12555 * store INT64_MAX to it, followed by a memory barrier, followed by
12556 * the new value. This assures that dts_alive never appears to be
12557 * less than its true value, regardless of the order in which the
12558 * stores to the underlying storage are issued.
12559 */
12560 state->dts_alive = INT64_MAX;
12561 dtrace_membar_producer();
12562 state->dts_alive = now;
12563}
12564
12565#ifdef VBOX
12566static DECLCALLBACK(void) dtrace_state_deadman_timer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
12567{
12568 dtrace_state_deadman((dtrace_state_t *)pvUser);
12569 NOREF(pTimer); NOREF(iTick);
12570}
12571#endif
12572
12573VBDTSTATIC dtrace_state_t *
12574#ifdef VBOX
12575dtrace_state_create(cred_t *cr)
12576#else
12577dtrace_state_create(dev_t *devp, cred_t *cr)
12578#endif
12579{
12580#ifndef VBOX
12581 minor_t minor;
12582 major_t major;
12583#endif
12584 char c[30];
12585 dtrace_state_t *state;
12586 dtrace_optval_t *opt;
12587 int bufsize = NCPU * sizeof (dtrace_buffer_t), i;
12588
12589 ASSERT(MUTEX_HELD(&dtrace_lock));
12590 ASSERT(MUTEX_HELD(&cpu_lock));
12591
12592#ifndef VBOX
12593 minor = (minor_t)(uintptr_t)vmem_alloc(dtrace_minor, 1,
12594 VM_BESTFIT | VM_SLEEP);
12595
12596 if (ddi_soft_state_zalloc(dtrace_softstate, minor) != DDI_SUCCESS) {
12597 vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
12598 return (NULL);
12599 }
12600
12601 state = ddi_get_soft_state(dtrace_softstate, minor);
12602#else
12603 state = kmem_zalloc(sizeof (*state), KM_SLEEP);
12604 if (!state) {
12605 return (NULL);
12606 }
12607#endif
12608 state->dts_epid = DTRACE_EPIDNONE + 1;
12609
12610#ifndef VBOX
12611 (void) snprintf(c, sizeof (c), "dtrace_aggid_%d", minor);
12612#else
12613 (void) snprintf(c, sizeof (c), "dtrace_aggid_%p", state);
12614#endif
12615#ifndef VBOX /* Avoid idProbe = UINT32_MAX as it is used as invalid value by VTG. */
12616 state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
12617 NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
12618#else
12619 state->dts_aggid_arena = vmem_create(c, (void *)(uintptr_t)1, _1G, 1,
12620 NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
12621#endif
12622
12623#ifndef VBOX
12624 if (devp != NULL) {
12625 major = getemajor(*devp);
12626 } else {
12627 major = ddi_driver_major(dtrace_devi);
12628 }
12629
12630 state->dts_dev = makedevice(major, minor);
12631
12632 if (devp != NULL)
12633 *devp = state->dts_dev;
12634#endif
12635
12636 /*
12637 * We allocate NCPU buffers. On the one hand, this can be quite
12638 * a bit of memory per instance (nearly 36K on a Starcat). On the
12639 * other hand, it saves an additional memory reference in the probe
12640 * path.
12641 */
12642 state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
12643 state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
12644 state->dts_cleaner = CYCLIC_NONE;
12645 state->dts_deadman = CYCLIC_NONE;
12646 state->dts_vstate.dtvs_state = state;
12647
12648 for (i = 0; i < DTRACEOPT_MAX; i++)
12649 state->dts_options[i] = DTRACEOPT_UNSET;
12650
12651 /*
12652 * Set the default options.
12653 */
12654 opt = state->dts_options;
12655 opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
12656 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
12657 opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
12658 opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
12659 opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
12660 opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
12661 opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
12662 opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
12663 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
12664 opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
12665 opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
12666 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
12667 opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
12668 opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;
12669
12670 state->dts_activity = DTRACE_ACTIVITY_INACTIVE;
12671
12672 /*
12673 * Depending on the user credentials, we set flag bits which alter probe
12674 * visibility or the amount of destructiveness allowed. In the case of
12675 * actual anonymous tracing, or the possession of all privileges, all of
12676 * the normal checks are bypassed.
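	 *
	 * In outline (the precise checks follow below):
	 *
	 *   dtrace_proc or dtrace_user -> DTRACE_CRA_PROC
	 *   dtrace_user + proc_owner   -> DTRACE_CRV_ALLPROC, destructive (all users)
	 *   dtrace_user + proc_zone    -> DTRACE_CRV_ALLZONE, destructive (all zones)
	 *   dtrace_kernel              -> all-proc/all-zone visibility, DTRACE_CRA_KERNEL
	 *   dtrace_proc + proc_owner/proc_zone -> wider destructive scope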
12677 */
12678 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
12679 state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
12680 state->dts_cred.dcr_action = DTRACE_CRA_ALL;
12681 } else {
12682 /*
12683 * Set up the credentials for this instantiation. We take a
12684 * hold on the credential to prevent it from disappearing on
12685 * us; this in turn prevents the zone_t referenced by this
12686 * credential from disappearing. This means that we can
12687 * examine the credential and the zone from probe context.
12688 */
12689 crhold(cr);
12690 state->dts_cred.dcr_cred = cr;
12691
12692 /*
12693 * CRA_PROC means "we have *some* privilege for dtrace" and
12694 * unlocks the use of variables like pid, zonename, etc.
12695 */
12696 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
12697 PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
12698 state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
12699 }
12700
12701 /*
12702 * dtrace_user allows use of syscall and profile providers.
12703 * If the user also has proc_owner and/or proc_zone, we
12704 * extend the scope to include additional visibility and
12705 * destructive power.
12706 */
12707 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
12708 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
12709 state->dts_cred.dcr_visible |=
12710 DTRACE_CRV_ALLPROC;
12711
12712 state->dts_cred.dcr_action |=
12713 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
12714 }
12715
12716 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
12717 state->dts_cred.dcr_visible |=
12718 DTRACE_CRV_ALLZONE;
12719
12720 state->dts_cred.dcr_action |=
12721 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
12722 }
12723
12724 /*
12725 * If we have all privs in whatever zone this is,
12726 * we can do destructive things to processes which
12727 * have altered credentials.
12728 */
12729 if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
12730 cr->cr_zone->zone_privset)) {
12731 state->dts_cred.dcr_action |=
12732 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
12733 }
12734 }
12735
12736 /*
12737 * Holding the dtrace_kernel privilege also implies that
12738 * the user has the dtrace_user privilege from a visibility
12739 * perspective. But without further privileges, some
12740 * destructive actions are not available.
12741 */
12742 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
12743 /*
12744 * Make all probes in all zones visible. However,
12745 * this doesn't mean that all actions become available
12746 * to all zones.
12747 */
12748 state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
12749 DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;
12750
12751 state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
12752 DTRACE_CRA_PROC;
12753 /*
12754 * Holding proc_owner means that destructive actions
12755 * for *this* zone are allowed.
12756 */
12757 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
12758 state->dts_cred.dcr_action |=
12759 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
12760
12761 /*
12762 * Holding proc_zone means that destructive actions
12763		 * for this user/group ID in all zones are allowed.
12764 */
12765 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
12766 state->dts_cred.dcr_action |=
12767 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
12768
12769 /*
12770 * If we have all privs in whatever zone this is,
12771 * we can do destructive things to processes which
12772 * have altered credentials.
12773 */
12774 if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
12775 cr->cr_zone->zone_privset)) {
12776 state->dts_cred.dcr_action |=
12777 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
12778 }
12779 }
12780
12781 /*
12782 * Holding the dtrace_proc privilege gives control over fasttrap
12783 * and pid providers. We need to grant wider destructive
12784 * privileges in the event that the user has proc_owner and/or
12785 * proc_zone.
12786 */
12787 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
12788 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
12789 state->dts_cred.dcr_action |=
12790 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
12791
12792 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
12793 state->dts_cred.dcr_action |=
12794 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
12795 }
12796 }
12797
12798 return (state);
12799}
12800
12801static int
12802dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
12803{
12804 dtrace_optval_t *opt = state->dts_options, size;
12805 processorid_t cpu VBDTUNASS((processorid_t)DTRACE_CPUALL);
12806 int flags = 0, rval;
12807
12808 ASSERT(MUTEX_HELD(&dtrace_lock));
12809 ASSERT(MUTEX_HELD(&cpu_lock));
12810 ASSERT(which < DTRACEOPT_MAX);
12811 ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
12812 (state == dtrace_anon.dta_state &&
12813 state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
12814
12815 if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
12816 return (0);
12817
12818 if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
12819 cpu = opt[DTRACEOPT_CPU];
12820
12821 if (which == DTRACEOPT_SPECSIZE)
12822 flags |= DTRACEBUF_NOSWITCH;
12823
12824 if (which == DTRACEOPT_BUFSIZE) {
12825 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
12826 flags |= DTRACEBUF_RING;
12827
12828 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
12829 flags |= DTRACEBUF_FILL;
12830
12831 if (state != dtrace_anon.dta_state ||
12832 state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
12833 flags |= DTRACEBUF_INACTIVE;
12834 }
12835
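	/*
	 * Sizing strategy, as implemented below: round the requested size
	 * down to a multiple of 8 bytes; on ENOMEM, halve and retry unless
	 * the buffer resize policy is "manual"; and fail with E2BIG once
	 * the size no longer covers the prereserved (END probe) space.
	 */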
12836 for (size = opt[which]; size >= VBDTCAST(dtrace_optval_t)sizeof (uint64_t); size >>= 1) {
12837 /*
12838 * The size must be 8-byte aligned. If the size is not 8-byte
12839 * aligned, drop it down by the difference.
12840 */
12841 if (size & (sizeof (uint64_t) - 1))
12842 size -= size & (sizeof (uint64_t) - 1);
12843
12844 if (size < state->dts_reserve) {
12845 /*
12846 * Buffers always must be large enough to accommodate
12847 * their prereserved space. We return E2BIG instead
12848 * of ENOMEM in this case to allow for user-level
12849 * software to differentiate the cases.
12850 */
12851 return (E2BIG);
12852 }
12853
12854 rval = dtrace_buffer_alloc(buf, size, flags, cpu);
12855
12856 if (rval != ENOMEM) {
12857 opt[which] = size;
12858 return (rval);
12859 }
12860
12861 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
12862 return (rval);
12863 }
12864
12865 return (ENOMEM);
12866}
12867
12868static int
12869dtrace_state_buffers(dtrace_state_t *state)
12870{
12871 dtrace_speculation_t *spec = state->dts_speculations;
12872 int rval, i;
12873
12874 if ((rval = dtrace_state_buffer(state, state->dts_buffer,
12875 DTRACEOPT_BUFSIZE)) != 0)
12876 return (rval);
12877
12878 if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
12879 DTRACEOPT_AGGSIZE)) != 0)
12880 return (rval);
12881
12882 for (i = 0; i < state->dts_nspeculations; i++) {
12883 if ((rval = dtrace_state_buffer(state,
12884 spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
12885 return (rval);
12886 }
12887
12888 return (0);
12889}
12890
12891static void
12892dtrace_state_prereserve(dtrace_state_t *state)
12893{
12894 dtrace_ecb_t *ecb;
12895 dtrace_probe_t *probe;
12896
12897 state->dts_reserve = 0;
12898
12899 if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
12900 return;
12901
12902 /*
12903 * If our buffer policy is a "fill" buffer policy, we need to set the
12904 * prereserved space to be the space required by the END probes.
12905 */
12906 probe = dtrace_probes[dtrace_probeid_end - 1];
12907 ASSERT(probe != NULL);
12908
12909 for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
12910 if (ecb->dte_state != state)
12911 continue;
12912
12913 state->dts_reserve += VBDTCAST(uint32_t)ecb->dte_needed + ecb->dte_alignment;
12914 }
12915}
12916
12917static int
12918dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
12919{
12920 dtrace_optval_t *opt = state->dts_options, sz, nspec;
12921 dtrace_speculation_t *spec;
12922 dtrace_buffer_t *buf;
12923#ifndef VBOX
12924 cyc_handler_t hdlr;
12925 cyc_time_t when;
12926#endif
12927 int rval = 0, i, bufsize = NCPU * sizeof (dtrace_buffer_t);
12928 dtrace_icookie_t cookie;
12929
12930 mutex_enter(&cpu_lock);
12931 mutex_enter(&dtrace_lock);
12932
12933 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
12934 rval = EBUSY;
12935 goto out;
12936 }
12937
12938 /*
12939 * Before we can perform any checks, we must prime all of the
12940 * retained enablings that correspond to this state.
12941 */
12942 dtrace_enabling_prime(state);
12943
12944 if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
12945 rval = EACCES;
12946 goto out;
12947 }
12948
12949 dtrace_state_prereserve(state);
12950
12951 /*
12952	 * Now we try to allocate our speculations.
12953 * We do not automatically resize the number of speculations; if
12954 * this fails, we will fail the operation.
12955 */
12956 nspec = opt[DTRACEOPT_NSPEC];
12957 ASSERT(nspec != DTRACEOPT_UNSET);
12958
12959 if (nspec > INT_MAX) {
12960 rval = ENOMEM;
12961 goto out;
12962 }
12963
12964 spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t), KM_NOSLEEP);
12965
12966 if (spec == NULL) {
12967 rval = ENOMEM;
12968 goto out;
12969 }
12970
12971 state->dts_speculations = spec;
12972 state->dts_nspeculations = (int)nspec;
12973
12974 for (i = 0; i < nspec; i++) {
12975 if ((buf = kmem_zalloc(bufsize, KM_NOSLEEP)) == NULL) {
12976 rval = ENOMEM;
12977 goto err;
12978 }
12979
12980 spec[i].dtsp_buffer = buf;
12981 }
12982
12983 if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
12984 if (dtrace_anon.dta_state == NULL) {
12985 rval = ENOENT;
12986 goto out;
12987 }
12988
12989 if (state->dts_necbs != 0) {
12990 rval = EALREADY;
12991 goto out;
12992 }
12993
12994 state->dts_anon = dtrace_anon_grab();
12995 ASSERT(state->dts_anon != NULL);
12996 state = state->dts_anon;
12997
12998 /*
12999 * We want "grabanon" to be set in the grabbed state, so we'll
13000 * copy that option value from the grabbing state into the
13001 * grabbed state.
13002 */
13003 state->dts_options[DTRACEOPT_GRABANON] =
13004 opt[DTRACEOPT_GRABANON];
13005
13006 *cpu = dtrace_anon.dta_beganon;
13007
13008 /*
13009 * If the anonymous state is active (as it almost certainly
13010 * is if the anonymous enabling ultimately matched anything),
13011 * we don't allow any further option processing -- but we
13012 * don't return failure.
13013 */
13014 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
13015 goto out;
13016 }
13017
13018 if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
13019 opt[DTRACEOPT_AGGSIZE] != 0) {
13020 if (state->dts_aggregations == NULL) {
13021 /*
13022 * We're not going to create an aggregation buffer
13023 * because we don't have any ECBs that contain
13024 * aggregations -- set this option to 0.
13025 */
13026 opt[DTRACEOPT_AGGSIZE] = 0;
13027 } else {
13028 /*
13029 * If we have an aggregation buffer, we must also have
13030 * a buffer to use as scratch.
13031 */
13032 if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
13033 opt[DTRACEOPT_BUFSIZE] < VBDTCAST(dtrace_optval_t)state->dts_needed) {
13034 opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
13035 }
13036 }
13037 }
13038
13039 if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
13040 opt[DTRACEOPT_SPECSIZE] != 0) {
13041 if (!state->dts_speculates) {
13042 /*
13043 * We're not going to create speculation buffers
13044 * because we don't have any ECBs that actually
13045 * speculate -- set the speculation size to 0.
13046 */
13047 opt[DTRACEOPT_SPECSIZE] = 0;
13048 }
13049 }
13050
13051 /*
13052 * The bare minimum size for any buffer that we're actually going to
13053 * do anything to is sizeof (uint64_t).
13054 */
13055 sz = sizeof (uint64_t);
13056
13057 if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
13058 (state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
13059 (state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
13060 /*
13061 * A buffer size has been explicitly set to 0 (or to a size
13062 * that will be adjusted to 0) and we need the space -- we
13063 * need to return failure. We return ENOSPC to differentiate
13064 * it from failing to allocate a buffer due to failure to meet
13065 * the reserve (for which we return E2BIG).
13066 */
13067 rval = ENOSPC;
13068 goto out;
13069 }
13070
13071 if ((rval = dtrace_state_buffers(state)) != 0)
13072 goto err;
13073
13074 if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
13075 sz = dtrace_dstate_defsize;
13076
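	/*
	 * As with the principal buffers above, halve the dynamic variable
	 * size and retry on allocation failure, unless the buffer resize
	 * policy is "manual".
	 */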
13077 do {
13078 rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);
13079
13080 if (rval == 0)
13081 break;
13082
13083 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
13084 goto err;
13085 } while (sz >>= 1);
13086
13087 opt[DTRACEOPT_DYNVARSIZE] = sz;
13088
13089 if (rval != 0)
13090 goto err;
13091
13092 if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
13093 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;
13094
13095 if (opt[DTRACEOPT_CLEANRATE] == 0)
13096 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
13097
13098 if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
13099 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;
13100
13101 if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
13102 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
13103
13104#ifndef VBOX
13105 hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
13106 hdlr.cyh_arg = state;
13107 hdlr.cyh_level = CY_LOW_LEVEL;
13108
13109 when.cyt_when = 0;
13110 when.cyt_interval = opt[DTRACEOPT_CLEANRATE];
13111
13112 state->dts_cleaner = cyclic_add(&hdlr, &when);
13113
13114 hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
13115 hdlr.cyh_arg = state;
13116 hdlr.cyh_level = CY_LOW_LEVEL;
13117
13118 when.cyt_when = 0;
13119 when.cyt_interval = dtrace_deadman_interval;
13120
13121 state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
13122 state->dts_deadman = cyclic_add(&hdlr, &when);
13123#else /* VBOX */
13124
13125 rval = RTTimerCreateEx(&state->dts_cleaner, opt[DTRACEOPT_CLEANRATE],
13126 RTTIMER_FLAGS_CPU_ANY, dtrace_state_clean_timer, state);
13127 if (RT_FAILURE(rval)) {
13128 rval = RTErrConvertToErrno(rval);
13129 goto err;
13130 }
13131
13132 state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
13133 rval = RTTimerCreateEx(&state->dts_deadman, dtrace_deadman_interval,
13134 RTTIMER_FLAGS_CPU_ANY, dtrace_state_deadman_timer, state);
13135 if (RT_FAILURE(rval)) {
13136 RTTimerDestroy(state->dts_cleaner);
13137 state->dts_cleaner = CYCLIC_NONE;
13138 state->dts_deadman = CYCLIC_NONE;
13139 rval = RTErrConvertToErrno(rval);
13140 goto err;
13141 }
13142
13143 rval = RTTimerStart(state->dts_cleaner, 0);
13144 if (RT_SUCCESS(rval))
13145 rval = RTTimerStart(state->dts_deadman, 0);
13146 if (RT_FAILURE(rval)) {
13147 rval = RTErrConvertToErrno(rval);
13148 goto err;
13149 }
13150#endif /* VBOX */
13151
13152 state->dts_activity = DTRACE_ACTIVITY_WARMUP;
13153
13154 /*
13155 * Now it's time to actually fire the BEGIN probe. We need to disable
13156 * interrupts here both to record the CPU on which we fired the BEGIN
13157 * probe (the data from this CPU will be processed first at user
13158 * level) and to manually activate the buffer for this CPU.
13159 */
13160 cookie = dtrace_interrupt_disable();
13161 *cpu = VBDT_GET_CPUID();
13162 ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
13163 state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
13164
13165 dtrace_probe(dtrace_probeid_begin,
13166 (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
13167 dtrace_interrupt_enable(cookie);
13168 /*
13169 * We may have had an exit action from a BEGIN probe; only change our
13170 * state to ACTIVE if we're still in WARMUP.
13171 */
13172 ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP ||
13173 state->dts_activity == DTRACE_ACTIVITY_DRAINING);
13174
13175 if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
13176 state->dts_activity = DTRACE_ACTIVITY_ACTIVE;
13177
13178	 * Regardless of whether we're now in ACTIVE or DRAINING, we
13179 * Regardless of whether or not now we're in ACTIVE or DRAINING, we
13180 * want each CPU to transition its principal buffer out of the
13181 * INACTIVE state. Doing this assures that no CPU will suddenly begin
13182 * processing an ECB halfway down a probe's ECB chain; all CPUs will
13183 * atomically transition from processing none of a state's ECBs to
13184 * processing all of them.
13185 */
13186#ifndef VBOX
13187 dtrace_xcall(DTRACE_CPUALL,
13188 (dtrace_xcall_t)dtrace_buffer_activate, state);
13189#else
13190 RTMpOnAll(dtrace_buffer_activate_wrapper, state, NULL);
13191#endif
13192 goto out;
13193
13194err:
13195 dtrace_buffer_free(state->dts_buffer);
13196 dtrace_buffer_free(state->dts_aggbuffer);
13197
13198 if ((nspec = state->dts_nspeculations) == 0) {
13199 ASSERT(state->dts_speculations == NULL);
13200 goto out;
13201 }
13202
13203 spec = state->dts_speculations;
13204 ASSERT(spec != NULL);
13205
13206 for (i = 0; i < state->dts_nspeculations; i++) {
13207 if ((buf = spec[i].dtsp_buffer) == NULL)
13208 break;
13209
13210 dtrace_buffer_free(buf);
13211 kmem_free(buf, bufsize);
13212 }
13213
13214 kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
13215 state->dts_nspeculations = 0;
13216 state->dts_speculations = NULL;
13217
13218out:
13219 mutex_exit(&dtrace_lock);
13220 mutex_exit(&cpu_lock);
13221
13222 return (rval);
13223}
13224
13225static int
13226dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu)
13227{
13228 dtrace_icookie_t cookie;
13229
13230 ASSERT(MUTEX_HELD(&dtrace_lock));
13231
13232 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
13233 state->dts_activity != DTRACE_ACTIVITY_DRAINING)
13234 return (EINVAL);
13235
13236 /*
13237 * We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
13238 * to be sure that every CPU has seen it. See below for the details
13239 * on why this is done.
13240 */
13241 state->dts_activity = DTRACE_ACTIVITY_DRAINING;
13242 dtrace_sync();
13243
13244 /*
13245 * By this point, it is impossible for any CPU to be still processing
13246 * with DTRACE_ACTIVITY_ACTIVE. We can thus set our activity to
13247 * DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
13248 * other CPU in dtrace_buffer_reserve(). This allows dtrace_probe()
13249 * and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
13250 * iff we're in the END probe.
13251 */
13252 state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
13253 dtrace_sync();
13254 ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);
13255
13256 /*
13257 * Finally, we can release the reserve and call the END probe. We
13258 * disable interrupts across calling the END probe to allow us to
13259 * return the CPU on which we actually called the END probe. This
13260 * allows user-land to be sure that this CPU's principal buffer is
13261 * processed last.
13262 */
13263 state->dts_reserve = 0;
13264
13265 cookie = dtrace_interrupt_disable();
13266 *cpu = VBDT_GET_CPUID();
13267 dtrace_probe(dtrace_probeid_end,
13268 (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
13269 dtrace_interrupt_enable(cookie);
13270
13271 state->dts_activity = DTRACE_ACTIVITY_STOPPED;
13272 dtrace_sync();
13273
13274 return (0);
13275}
13276
13277static int
13278dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
13279 dtrace_optval_t val)
13280{
13281 ASSERT(MUTEX_HELD(&dtrace_lock));
13282
13283 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
13284 return (EBUSY);
13285
13286 if (option >= DTRACEOPT_MAX)
13287 return (EINVAL);
13288
13289 if (option != DTRACEOPT_CPU && val < 0)
13290 return (EINVAL);
13291
13292 switch (option) {
13293 case DTRACEOPT_DESTRUCTIVE:
13294 if (dtrace_destructive_disallow)
13295 return (EACCES);
13296
13297 state->dts_cred.dcr_destructive = 1;
13298 break;
13299
13300 case DTRACEOPT_BUFSIZE:
13301 case DTRACEOPT_DYNVARSIZE:
13302 case DTRACEOPT_AGGSIZE:
13303 case DTRACEOPT_SPECSIZE:
13304 case DTRACEOPT_STRSIZE:
13305 if (val < 0)
13306 return (EINVAL);
13307
13308 if (val >= LONG_MAX) {
13309 /*
13310 * If this is an otherwise negative value, set it to
13311 * the highest multiple of 128m less than LONG_MAX.
13312 * Technically, we're adjusting the size without
13313 * regard to the buffer resizing policy, but in fact,
13314 * this has no effect -- if we set the buffer size to
13315 * ~LONG_MAX and the buffer policy is ultimately set to
13316 * be "manual", the buffer allocation is guaranteed to
13317 * fail, if only because the allocation requires two
13318			 * buffers. (We set the size to the highest
13319 * multiple of 128m because it ensures that the size
13320 * will remain a multiple of a megabyte when
13321 * repeatedly halved -- all the way down to 15m.)
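			 * For instance, with a 32-bit long, the value below
			 * is 0x78000000 (1920m), which halves cleanly
			 * through 960m, 480m, ... down to 15m.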
13322 */
13323 val = LONG_MAX - (1 << 27) + 1;
13324 }
13325 }
13326
13327 state->dts_options[option] = val;
13328
13329 return (0);
13330}
13331
13332static void
13333dtrace_state_destroy(dtrace_state_t *state)
13334{
13335 dtrace_ecb_t *ecb;
13336 dtrace_vstate_t *vstate = &state->dts_vstate;
13337#ifndef VBOX
13338 minor_t minor = getminor(state->dts_dev);
13339#endif
13340 int i, bufsize = NCPU * sizeof (dtrace_buffer_t);
13341 dtrace_speculation_t *spec = state->dts_speculations;
13342 int nspec = state->dts_nspeculations;
13343 uint32_t match;
13344
13345 ASSERT(MUTEX_HELD(&dtrace_lock));
13346 ASSERT(MUTEX_HELD(&cpu_lock));
13347
13348 /*
13349 * First, retract any retained enablings for this state.
13350 */
13351 dtrace_enabling_retract(state);
13352 ASSERT(state->dts_nretained == 0);
13353
13354 if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
13355 state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
13356 /*
13357 * We have managed to come into dtrace_state_destroy() on a
13358 * hot enabling -- almost certainly because of a disorderly
13359 * shutdown of a consumer. (That is, a consumer that is
13360 * exiting without having called dtrace_stop().) In this case,
13361 * we're going to set our activity to be KILLED, and then
13362 * issue a sync to be sure that everyone is out of probe
13363 * context before we start blowing away ECBs.
13364 */
13365 state->dts_activity = DTRACE_ACTIVITY_KILLED;
13366 dtrace_sync();
13367 }
13368
13369 /*
13370 * Release the credential hold we took in dtrace_state_create().
13371 */
13372 if (state->dts_cred.dcr_cred != NULL)
13373 crfree(state->dts_cred.dcr_cred);
13374
13375 /*
13376 * Now we can safely disable and destroy any enabled probes. Because
13377 * any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
13378 * (especially if they're all enabled), we take two passes through the
13379 * ECBs: in the first, we disable just DTRACE_PRIV_KERNEL probes, and
13380 * in the second we disable whatever is left over.
13381 */
13382 for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
13383 for (i = 0; i < state->dts_necbs; i++) {
13384 if ((ecb = state->dts_ecbs[i]) == NULL)
13385 continue;
13386
13387 if (match && ecb->dte_probe != NULL) {
13388 dtrace_probe_t *probe = ecb->dte_probe;
13389 dtrace_provider_t *prov = probe->dtpr_provider;
13390
13391 if (!(prov->dtpv_priv.dtpp_flags & match))
13392 continue;
13393 }
13394
13395 dtrace_ecb_disable(ecb);
13396 dtrace_ecb_destroy(ecb);
13397 }
13398
13399 if (!match)
13400 break;
13401 }
13402
13403 /*
13404 * Before we free the buffers, perform one more sync to assure that
13405 * every CPU is out of probe context.
13406 */
13407 dtrace_sync();
13408
13409 dtrace_buffer_free(state->dts_buffer);
13410 dtrace_buffer_free(state->dts_aggbuffer);
13411
13412 for (i = 0; i < nspec; i++)
13413 dtrace_buffer_free(spec[i].dtsp_buffer);
13414
13415 if (state->dts_cleaner != CYCLIC_NONE)
13416 cyclic_remove(state->dts_cleaner);
13417
13418 if (state->dts_deadman != CYCLIC_NONE)
13419 cyclic_remove(state->dts_deadman);
13420
13421 dtrace_dstate_fini(&vstate->dtvs_dynvars);
13422 dtrace_vstate_fini(vstate);
13423 kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));
13424
13425 if (state->dts_aggregations != NULL) {
13426#ifdef DEBUG
13427 for (i = 0; i < state->dts_naggregations; i++)
13428 ASSERT(state->dts_aggregations[i] == NULL);
13429#endif
13430 ASSERT(state->dts_naggregations > 0);
13431 kmem_free(state->dts_aggregations,
13432 state->dts_naggregations * sizeof (dtrace_aggregation_t *));
13433 }
13434
13435 kmem_free(state->dts_buffer, bufsize);
13436 kmem_free(state->dts_aggbuffer, bufsize);
13437
13438 for (i = 0; i < nspec; i++)
13439 kmem_free(spec[i].dtsp_buffer, bufsize);
13440
13441 kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
13442
13443 dtrace_format_destroy(state);
13444
13445 vmem_destroy(state->dts_aggid_arena);
13446#ifndef VBOX
13447 ddi_soft_state_free(dtrace_softstate, minor);
13448 vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
13449#else
13450 kmem_free(state, sizeof (*state));
13451#endif
13452}
13453
13454/*
13455 * DTrace Anonymous Enabling Functions
13456 */
13457static dtrace_state_t *
13458dtrace_anon_grab(void)
13459{
13460 dtrace_state_t *state;
13461
13462 ASSERT(MUTEX_HELD(&dtrace_lock));
13463
13464 if ((state = dtrace_anon.dta_state) == NULL) {
13465 ASSERT(dtrace_anon.dta_enabling == NULL);
13466 return (NULL);
13467 }
13468
13469 ASSERT(dtrace_anon.dta_enabling != NULL);
13470 ASSERT(dtrace_retained != NULL);
13471
13472 dtrace_enabling_destroy(dtrace_anon.dta_enabling);
13473 dtrace_anon.dta_enabling = NULL;
13474 dtrace_anon.dta_state = NULL;
13475
13476 return (state);
13477}
13478
13479#ifndef VBOX
13480static void
13481dtrace_anon_property(void)
13482{
13483 int i, rv;
13484 dtrace_state_t *state;
13485 dof_hdr_t *dof;
13486 char c[32]; /* enough for "dof-data-" + digits */
13487
13488 ASSERT(MUTEX_HELD(&dtrace_lock));
13489 ASSERT(MUTEX_HELD(&cpu_lock));
13490
13491 for (i = 0; ; i++) {
13492 (void) snprintf(c, sizeof (c), "dof-data-%d", i);
13493
13494 dtrace_err_verbose = 1;
13495
13496 if ((dof = dtrace_dof_property(c)) == NULL) {
13497 dtrace_err_verbose = 0;
13498 break;
13499 }
13500
13501#ifndef VBOX
13502 /*
13503 * We want to create anonymous state, so we need to transition
13504 * the kernel debugger to indicate that DTrace is active. If
13505 * this fails (e.g. because the debugger has modified text in
13506 * some way), we won't continue with the processing.
13507 */
13508 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
13509 cmn_err(CE_NOTE, "kernel debugger active; anonymous "
13510 "enabling ignored.");
13511 dtrace_dof_destroy(dof);
13512 break;
13513 }
13514#endif
13515
13516 /*
13517 * If we haven't allocated an anonymous state, we'll do so now.
13518 */
13519 if ((state = dtrace_anon.dta_state) == NULL) {
13520 state = dtrace_state_create(NULL, NULL);
13521 dtrace_anon.dta_state = state;
13522
13523 if (state == NULL) {
13524 /*
13525 * This basically shouldn't happen: the only
13526 * failure mode from dtrace_state_create() is a
13527 * failure of ddi_soft_state_zalloc() that
13528 * itself should never happen. Still, the
13529 * interface allows for a failure mode, and
13530 * we want to fail as gracefully as possible:
13531 * we'll emit an error message and cease
13532 * processing anonymous state in this case.
13533 */
13534 cmn_err(CE_WARN, "failed to create "
13535 "anonymous state");
13536 dtrace_dof_destroy(dof);
13537 break;
13538 }
13539 }
13540
13541 rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
13542 &dtrace_anon.dta_enabling, 0, B_TRUE);
13543
13544 if (rv == 0)
13545 rv = dtrace_dof_options(dof, state);
13546
13547 dtrace_err_verbose = 0;
13548 dtrace_dof_destroy(dof);
13549
13550 if (rv != 0) {
13551 /*
13552 * This is malformed DOF; chuck any anonymous state
13553 * that we created.
13554 */
13555 ASSERT(dtrace_anon.dta_enabling == NULL);
13556 dtrace_state_destroy(state);
13557 dtrace_anon.dta_state = NULL;
13558 break;
13559 }
13560
13561 ASSERT(dtrace_anon.dta_enabling != NULL);
13562 }
13563
13564 if (dtrace_anon.dta_enabling != NULL) {
13565 int rval;
13566
13567 /*
13568 * dtrace_enabling_retain() can only fail because we are
13569 * trying to retain more enablings than are allowed -- but
13570 * we only have one anonymous enabling, and we are guaranteed
13571 * to be allowed at least one retained enabling; we assert
13572 * that dtrace_enabling_retain() returns success.
13573 */
13574 rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
13575 ASSERT(rval == 0);
13576
13577 dtrace_enabling_dump(dtrace_anon.dta_enabling);
13578 }
13579}
13580#endif /* !VBOX */
13581
13582/*
13583 * DTrace Helper Functions
13584 */
13585#ifndef VBOX /* No helper stuff */
13586static void
13587dtrace_helper_trace(dtrace_helper_action_t *helper,
13588 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
13589{
13590 uint32_t size, next, nnext, i;
13591 dtrace_helptrace_t *ent;
13592 uint16_t flags = cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags;
13593
13594 if (!dtrace_helptrace_enabled)
13595 return;
13596
13597 ASSERT(vstate->dtvs_nlocals <= VBDTCAST(int32_t)dtrace_helptrace_nlocals);
13598
13599 /*
13600 * What would a tracing framework be without its own tracing
13601 * framework? (Well, a hell of a lot simpler, for starters...)
13602 */
13603 size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
13604 sizeof (uint64_t) - sizeof (uint64_t);
13605
13606 /*
13607 * Iterate until we can allocate a slot in the trace buffer.
13608 */
13609 do {
13610 next = dtrace_helptrace_next;
13611
13612 if (next + size < VBDTCAST(unsigned)dtrace_helptrace_bufsize) {
13613 nnext = next + size;
13614 } else {
13615 nnext = size;
13616 }
13617 } while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);
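	/*
	 * Note the wrap-around case: if the record would run past the end
	 * of the trace buffer, nnext == size claims the region starting at
	 * offset 0 -- which is what the reset of 'next' below detects.
	 */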
13618
13619 /*
13620 * We have our slot; fill it in.
13621 */
13622 if (nnext == size)
13623 next = 0;
13624
13625 ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next];
13626 ent->dtht_helper = helper;
13627 ent->dtht_where = where;
13628 ent->dtht_nlocals = vstate->dtvs_nlocals;
13629
13630 ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
13631 mstate->dtms_fltoffs : -1;
13632 ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
13633 ent->dtht_illval = cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_illval;
13634
13635 for (i = 0; VBDTCAST(int32_t)i < vstate->dtvs_nlocals; i++) {
13636 dtrace_statvar_t *svar;
13637
13638 if ((svar = vstate->dtvs_locals[i]) == NULL)
13639 continue;
13640
13641 ASSERT(svar->dtsv_size >= NCPU * sizeof (uint64_t));
13642 ent->dtht_locals[i] =
13643 ((uint64_t *)(uintptr_t)svar->dtsv_data)[VBDT_GET_CPUID()];
13644 }
13645}
13646
13647static uint64_t
13648dtrace_helper(int which, dtrace_mstate_t *mstate,
13649 dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
13650{
13651 VBDTTYPE(uint16_t volatile *, uint16_t *)flags = &cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags;
13652 uint64_t sarg0 = mstate->dtms_arg[0];
13653 uint64_t sarg1 = mstate->dtms_arg[1];
13654 uint64_t rval VBDTUNASS(666);
13655 dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
13656 dtrace_helper_action_t *helper;
13657 dtrace_vstate_t *vstate;
13658 dtrace_difo_t *pred;
13659 int i, trace = dtrace_helptrace_enabled;
13660
13661 ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
13662
13663 if (helpers == NULL)
13664 return (0);
13665
13666 if ((helper = helpers->dthps_actions[which]) == NULL)
13667 return (0);
13668
13669 vstate = &helpers->dthps_vstate;
13670 mstate->dtms_arg[0] = arg0;
13671 mstate->dtms_arg[1] = arg1;
13672
13673 /*
13674 * Now iterate over each helper. If its predicate evaluates to 'true',
13675 * we'll call the corresponding actions. Note that the below calls
13676 * to dtrace_dif_emulate() may set faults in machine state. This is
13677 * okay: our caller (the outer dtrace_dif_emulate()) will simply plow
13678 * the stored DIF offset with its own (which is the desired behavior).
13679 * Also, note the calls to dtrace_dif_emulate() may allocate scratch
13680 * from machine state; this is okay, too.
13681 */
13682 for (; helper != NULL; helper = helper->dtha_next) {
13683 if ((pred = helper->dtha_predicate) != NULL) {
13684 if (trace)
13685 dtrace_helper_trace(helper, mstate, vstate, 0);
13686
13687 if (!dtrace_dif_emulate(pred, mstate, vstate, state))
13688 goto next;
13689
13690 if (*flags & CPU_DTRACE_FAULT)
13691 goto err;
13692 }
13693
13694 for (i = 0; i < helper->dtha_nactions; i++) {
13695 if (trace)
13696 dtrace_helper_trace(helper,
13697 mstate, vstate, i + 1);
13698
13699 rval = dtrace_dif_emulate(helper->dtha_actions[i],
13700 mstate, vstate, state);
13701
13702 if (*flags & CPU_DTRACE_FAULT)
13703 goto err;
13704 }
13705
13706next:
13707 if (trace)
13708 dtrace_helper_trace(helper, mstate, vstate,
13709 DTRACE_HELPTRACE_NEXT);
13710 }
13711
13712 if (trace)
13713 dtrace_helper_trace(helper, mstate, vstate,
13714 DTRACE_HELPTRACE_DONE);
13715
13716 /*
13717 * Restore the arg0 that we saved upon entry.
13718 */
13719 mstate->dtms_arg[0] = sarg0;
13720 mstate->dtms_arg[1] = sarg1;
13721
13722 return (rval);
13723
13724err:
13725 if (trace)
13726 dtrace_helper_trace(helper, mstate, vstate,
13727 DTRACE_HELPTRACE_ERR);
13728
13729 /*
13730 * Restore the arg0 that we saved upon entry.
13731 */
13732 mstate->dtms_arg[0] = sarg0;
13733 mstate->dtms_arg[1] = sarg1;
13734
13735	return (0);
13736}
13737
13738static void
13739dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
13740 dtrace_vstate_t *vstate)
13741{
13742 int i;
13743
13744 if (helper->dtha_predicate != NULL)
13745 dtrace_difo_release(helper->dtha_predicate, vstate);
13746
13747 for (i = 0; i < helper->dtha_nactions; i++) {
13748 ASSERT(helper->dtha_actions[i] != NULL);
13749 dtrace_difo_release(helper->dtha_actions[i], vstate);
13750 }
13751
13752 kmem_free(helper->dtha_actions,
13753 helper->dtha_nactions * sizeof (dtrace_difo_t *));
13754 kmem_free(helper, sizeof (dtrace_helper_action_t));
13755}
13756
13757static int
13758dtrace_helper_destroygen(int gen)
13759{
13760 proc_t *p = curproc;
13761 dtrace_helpers_t *help = p->p_dtrace_helpers;
13762 dtrace_vstate_t *vstate;
13763 VBDTTYPE(uint_t,int) i;
13764
13765 ASSERT(MUTEX_HELD(&dtrace_lock));
13766
13767 if (help == NULL || gen > help->dthps_generation)
13768 return (EINVAL);
13769
13770 vstate = &help->dthps_vstate;
13771
13772 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
13773 dtrace_helper_action_t *last = NULL, *h, *next;
13774
13775 for (h = help->dthps_actions[i]; h != NULL; h = next) {
13776 next = h->dtha_next;
13777
13778 if (h->dtha_generation == gen) {
13779 if (last != NULL) {
13780 last->dtha_next = next;
13781 } else {
13782 help->dthps_actions[i] = next;
13783 }
13784
13785 dtrace_helper_action_destroy(h, vstate);
13786 } else {
13787 last = h;
13788 }
13789 }
13790 }
13791
13792 /*
13793	 * Iterate until we've cleared out all helper providers with the
13794 * given generation number.
13795 */
13796 for (;;) {
13797 dtrace_helper_provider_t *prov VBDTGCC(NULL);
13798
13799 /*
13800 * Look for a helper provider with the right generation. We
13801 * have to start back at the beginning of the list each time
13802 * because we drop dtrace_lock. It's unlikely that we'll make
13803 * more than two passes.
13804 */
13805 for (i = 0; i < help->dthps_nprovs; i++) {
13806 prov = help->dthps_provs[i];
13807
13808 if (prov->dthp_generation == gen)
13809 break;
13810 }
13811
13812 /*
13813 * If there were no matches, we're done.
13814 */
13815 if (i == help->dthps_nprovs)
13816 break;
13817
13818 /*
13819 * Move the last helper provider into this slot.
13820 */
13821 help->dthps_nprovs--;
13822 help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
13823 help->dthps_provs[help->dthps_nprovs] = NULL;
13824
13825 mutex_exit(&dtrace_lock);
13826
13827 /*
13828 * If we have a meta provider, remove this helper provider.
13829 */
13830 mutex_enter(&dtrace_meta_lock);
13831 if (dtrace_meta_pid != NULL) {
13832 ASSERT(dtrace_deferred_pid == NULL);
13833 dtrace_helper_provider_remove(&prov->dthp_prov,
13834 p->p_pid);
13835 }
13836 mutex_exit(&dtrace_meta_lock);
13837
13838 dtrace_helper_provider_destroy(prov);
13839
13840 mutex_enter(&dtrace_lock);
13841 }
13842
13843 return (0);
13844}
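
/*
 * The provider removal above is unordered-array deletion: the dying slot
 * is overwritten with the last element and the count shrinks. That is
 * O(1) but does not preserve order, which is fine because the order of
 * help->dthps_provs carries no meaning. The same operation in isolation:
 */
#if 0 /* Illustrative sketch only -- not compiled. */
static void
unordered_remove(void **vec, uint_t *nelemsp, uint_t i)
{
	(*nelemsp)--;
	vec[i] = vec[*nelemsp];		/* move the last element into slot i */
	vec[*nelemsp] = NULL;		/* clear the vacated tail slot */
}
#endif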
13845
13846static int
13847dtrace_helper_validate(dtrace_helper_action_t *helper)
13848{
13849 int err = 0, i;
13850 dtrace_difo_t *dp;
13851
13852 if ((dp = helper->dtha_predicate) != NULL)
13853 err += dtrace_difo_validate_helper(dp);
13854
13855 for (i = 0; i < helper->dtha_nactions; i++)
13856 err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
13857
13858 return (err == 0);
13859}
13860
13861static int
13862dtrace_helper_action_add(int which, dtrace_ecbdesc_t *ep)
13863{
13864 dtrace_helpers_t *help;
13865 dtrace_helper_action_t *helper, *last;
13866 dtrace_actdesc_t *act;
13867 dtrace_vstate_t *vstate;
13868 dtrace_predicate_t *pred;
13869 int count = 0, nactions = 0, i;
13870
13871 if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
13872 return (EINVAL);
13873
13874 help = curproc->p_dtrace_helpers;
13875 last = help->dthps_actions[which];
13876 vstate = &help->dthps_vstate;
13877
13878 for (count = 0; last != NULL; last = last->dtha_next) {
13879 count++;
13880 if (last->dtha_next == NULL)
13881 break;
13882 }
13883
13884 /*
13885 * If we already have dtrace_helper_actions_max helper actions for this
13886 * helper action type, we'll refuse to add a new one.
13887 */
13888 if (count >= dtrace_helper_actions_max)
13889 return (ENOSPC);
13890
13891 helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
13892 helper->dtha_generation = help->dthps_generation;
13893
13894 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
13895 ASSERT(pred->dtp_difo != NULL);
13896 dtrace_difo_hold(pred->dtp_difo);
13897 helper->dtha_predicate = pred->dtp_difo;
13898 }
13899
13900 for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
13901 if (act->dtad_kind != DTRACEACT_DIFEXPR)
13902 goto err;
13903
13904 if (act->dtad_difo == NULL)
13905 goto err;
13906
13907 nactions++;
13908 }
13909
13910 helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
13911 (helper->dtha_nactions = nactions), KM_SLEEP);
13912
13913 for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
13914 dtrace_difo_hold(act->dtad_difo);
13915 helper->dtha_actions[i++] = act->dtad_difo;
13916 }
13917
13918 if (!dtrace_helper_validate(helper))
13919 goto err;
13920
13921 if (last == NULL) {
13922 help->dthps_actions[which] = helper;
13923 } else {
13924 last->dtha_next = helper;
13925 }
13926
13927 if (vstate->dtvs_nlocals > VBDTCAST(int32_t)dtrace_helptrace_nlocals) {
13928 dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
13929 dtrace_helptrace_next = 0;
13930 }
13931
13932 return (0);
13933err:
13934 dtrace_helper_action_destroy(helper, vstate);
13935 return (EINVAL);
13936}
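
/*
 * The list handling above is a capped tail-append: count the chain while
 * walking to its tail, refuse once dtrace_helper_actions_max is reached,
 * and otherwise link the new helper at the end so existing helpers keep
 * their original execution order. A sketch with a hypothetical node type:
 */
#if 0 /* Illustrative sketch only -- not compiled. */
typedef struct hnode {
	struct hnode *hn_next;
} hnode_t;

static int
append_capped(hnode_t **headp, hnode_t *n, int max)
{
	hnode_t *last = *headp;
	int count = 0;

	while (last != NULL) {
		count++;
		if (last->hn_next == NULL)
			break;			/* 'last' is now the tail */
		last = last->hn_next;
	}

	if (count >= max)
		return (-1);			/* models the ENOSPC above */

	n->hn_next = NULL;
	if (last == NULL)
		*headp = n;
	else
		last->hn_next = n;

	return (0);
}
#endif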
13937
13938static void
13939dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
13940 dof_helper_t *dofhp)
13941{
13942 ASSERT(MUTEX_NOT_HELD(&dtrace_lock));
13943
13944 mutex_enter(&dtrace_meta_lock);
13945 mutex_enter(&dtrace_lock);
13946
13947 if (!dtrace_attached() || dtrace_meta_pid == NULL) {
13948 /*
13949 * If the dtrace module is loaded but not attached, or if
13950 * there aren't isn't a meta provider registered to deal with
13951		 * there isn't a meta provider registered to deal with
13952 * the actual providers until later.
13953 */
13954
13955 if (help->dthps_next == NULL && help->dthps_prev == NULL &&
13956 dtrace_deferred_pid != help) {
13957 help->dthps_deferred = 1;
13958 help->dthps_pid = p->p_pid;
13959 help->dthps_next = dtrace_deferred_pid;
13960 help->dthps_prev = NULL;
13961 if (dtrace_deferred_pid != NULL)
13962 dtrace_deferred_pid->dthps_prev = help;
13963 dtrace_deferred_pid = help;
13964 }
13965
13966 mutex_exit(&dtrace_lock);
13967
13968 } else if (dofhp != NULL) {
13969 /*
13970 * If the dtrace module is loaded and we have a particular
13971 * helper provider description, pass that off to the
13972 * meta provider.
13973 */
13974
13975 mutex_exit(&dtrace_lock);
13976
13977 dtrace_helper_provide(dofhp, p->p_pid);
13978
13979 } else {
13980 /*
13981 * Otherwise, just pass all the helper provider descriptions
13982 * off to the meta provider.
13983 */
13984
13985 VBDTTYPE(uint_t,int) i;
13986 mutex_exit(&dtrace_lock);
13987
13988 for (i = 0; i < help->dthps_nprovs; i++) {
13989 dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
13990 p->p_pid);
13991 }
13992 }
13993
13994 mutex_exit(&dtrace_meta_lock);
13995}
13996
13997static int
13998dtrace_helper_provider_add(dof_helper_t *dofhp, int gen)
13999{
14000 dtrace_helpers_t *help;
14001 dtrace_helper_provider_t *hprov, **tmp_provs;
14002 uint_t tmp_maxprovs, i;
14003
14004 ASSERT(MUTEX_HELD(&dtrace_lock));
14005
14006 help = curproc->p_dtrace_helpers;
14007 ASSERT(help != NULL);
14008
14009 /*
14010 * If we already have dtrace_helper_providers_max helper providers,
14011	 * we'll refuse to add a new one.
14012 */
14013 if (help->dthps_nprovs >= dtrace_helper_providers_max)
14014 return (ENOSPC);
14015
14016 /*
14017 * Check to make sure this isn't a duplicate.
14018 */
14019 for (i = 0; i < help->dthps_nprovs; i++) {
14020 if (dofhp->dofhp_addr ==
14021 help->dthps_provs[i]->dthp_prov.dofhp_addr)
14022 return (EALREADY);
14023 }
14024
14025 hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
14026 hprov->dthp_prov = *dofhp;
14027 hprov->dthp_ref = 1;
14028 hprov->dthp_generation = gen;
14029
14030 /*
14031 * Allocate a bigger table for helper providers if it's already full.
14032 */
14033 if (help->dthps_maxprovs == help->dthps_nprovs) {
14034 tmp_maxprovs = help->dthps_maxprovs;
14035 tmp_provs = help->dthps_provs;
14036
14037 if (help->dthps_maxprovs == 0)
14038 help->dthps_maxprovs = 2;
14039 else
14040 help->dthps_maxprovs *= 2;
14041 if (help->dthps_maxprovs > dtrace_helper_providers_max)
14042 help->dthps_maxprovs = dtrace_helper_providers_max;
14043
14044 ASSERT(tmp_maxprovs < help->dthps_maxprovs);
14045
14046 help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
14047 sizeof (dtrace_helper_provider_t *), KM_SLEEP);
14048
14049 if (tmp_provs != NULL) {
14050 bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *
14051 sizeof (dtrace_helper_provider_t *));
14052 kmem_free(tmp_provs, tmp_maxprovs *
14053 sizeof (dtrace_helper_provider_t *));
14054 }
14055 }
14056
14057 help->dthps_provs[help->dthps_nprovs] = hprov;
14058 help->dthps_nprovs++;
14059
14060 return (0);
14061}
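
/*
 * The table growth above, distilled: capacity doubles (starting at 2)
 * and is clamped to dtrace_helper_providers_max. Since the allocation is
 * KM_SLEEP it cannot fail, so once growth is decided the copy-and-free
 * of the old table is unconditional.
 */
#if 0 /* Illustrative sketch only -- not compiled. */
static uint_t
next_capacity(uint_t cur, uint_t max)
{
	uint_t n = (cur == 0) ? 2 : cur * 2;

	return (n > max ? max : n);
}
#endif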
14062
14063static void
14064dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
14065{
14066 mutex_enter(&dtrace_lock);
14067
14068 if (--hprov->dthp_ref == 0) {
14069 dof_hdr_t *dof;
14070 mutex_exit(&dtrace_lock);
14071 dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
14072 dtrace_dof_destroy(dof);
14073 kmem_free(hprov, sizeof (dtrace_helper_provider_t));
14074 } else {
14075 mutex_exit(&dtrace_lock);
14076 }
14077}
14078
14079static int
14080dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)
14081{
14082 uintptr_t daddr = (uintptr_t)dof;
14083 dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
14084 dof_provider_t *provider;
14085 dof_probe_t *probe;
14086 uint8_t *arg;
14087 char *strtab, *typestr;
14088 dof_stridx_t typeidx;
14089 size_t typesz;
14090 uint_t nprobes, j, k;
14091
14092 ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);
14093
14094 if (sec->dofs_offset & (sizeof (uint_t) - 1)) {
14095 dtrace_dof_error(dof, "misaligned section offset");
14096 return (-1);
14097 }
14098
14099 /*
14100 * The section needs to be large enough to contain the DOF provider
14101 * structure appropriate for the given version.
14102 */
14103 if (sec->dofs_size <
14104 ((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
14105 offsetof(dof_provider_t, dofpv_prenoffs) :
14106 sizeof (dof_provider_t))) {
14107 dtrace_dof_error(dof, "provider section too small");
14108 return (-1);
14109 }
14110
14111 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
14112 str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);
14113 prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);
14114 arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);
14115 off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);
14116
14117 if (str_sec == NULL || prb_sec == NULL ||
14118 arg_sec == NULL || off_sec == NULL)
14119 return (-1);
14120
14121 enoff_sec = NULL;
14122
14123 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
14124 provider->dofpv_prenoffs != DOF_SECT_NONE &&
14125 (enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
14126 provider->dofpv_prenoffs)) == NULL)
14127 return (-1);
14128
14129 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
14130
14131 if (provider->dofpv_name >= str_sec->dofs_size ||
14132 strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
14133 dtrace_dof_error(dof, "invalid provider name");
14134 return (-1);
14135 }
14136
14137 if (prb_sec->dofs_entsize == 0 ||
14138 prb_sec->dofs_entsize > prb_sec->dofs_size) {
14139 dtrace_dof_error(dof, "invalid entry size");
14140 return (-1);
14141 }
14142
14143 if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {
14144 dtrace_dof_error(dof, "misaligned entry size");
14145 return (-1);
14146 }
14147
14148 if (off_sec->dofs_entsize != sizeof (uint32_t)) {
14149 dtrace_dof_error(dof, "invalid entry size");
14150 return (-1);
14151 }
14152
14153 if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {
14154 dtrace_dof_error(dof, "misaligned section offset");
14155 return (-1);
14156 }
14157
14158 if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
14159 dtrace_dof_error(dof, "invalid entry size");
14160 return (-1);
14161 }
14162
14163 arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
14164
14165 nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
14166
14167 /*
14168 * Take a pass through the probes to check for errors.
14169 */
14170 for (j = 0; j < nprobes; j++) {
14171 probe = (dof_probe_t *)(uintptr_t)(daddr +
14172 prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
14173
14174 if (probe->dofpr_func >= str_sec->dofs_size) {
14175 dtrace_dof_error(dof, "invalid function name");
14176 return (-1);
14177 }
14178
14179 if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
14180 dtrace_dof_error(dof, "function name too long");
14181 return (-1);
14182 }
14183
14184 if (probe->dofpr_name >= str_sec->dofs_size ||
14185 strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
14186 dtrace_dof_error(dof, "invalid probe name");
14187 return (-1);
14188 }
14189
14190 /*
14191 * The offset count must not wrap the index, and the offsets
14192 * must also not overflow the section's data.
14193 */
14194 if (probe->dofpr_offidx + probe->dofpr_noffs <
14195 probe->dofpr_offidx ||
14196 (probe->dofpr_offidx + probe->dofpr_noffs) *
14197 off_sec->dofs_entsize > off_sec->dofs_size) {
14198 dtrace_dof_error(dof, "invalid probe offset");
14199 return (-1);
14200 }
14201
14202 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
14203 /*
14204 * If there's no is-enabled offset section, make sure
14205 * there aren't any is-enabled offsets. Otherwise
14206 * perform the same checks as for probe offsets
14207 * (immediately above).
14208 */
14209 if (enoff_sec == NULL) {
14210 if (probe->dofpr_enoffidx != 0 ||
14211 probe->dofpr_nenoffs != 0) {
14212 dtrace_dof_error(dof, "is-enabled "
14213 "offsets with null section");
14214 return (-1);
14215 }
14216 } else if (probe->dofpr_enoffidx +
14217 probe->dofpr_nenoffs < probe->dofpr_enoffidx ||
14218 (probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
14219 enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
14220 dtrace_dof_error(dof, "invalid is-enabled "
14221 "offset");
14222 return (-1);
14223 }
14224
14225 if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {
14226 dtrace_dof_error(dof, "zero probe and "
14227 "is-enabled offsets");
14228 return (-1);
14229 }
14230 } else if (probe->dofpr_noffs == 0) {
14231 dtrace_dof_error(dof, "zero probe offsets");
14232 return (-1);
14233 }
14234
14235 if (probe->dofpr_argidx + probe->dofpr_xargc <
14236 probe->dofpr_argidx ||
14237 (probe->dofpr_argidx + probe->dofpr_xargc) *
14238 arg_sec->dofs_entsize > arg_sec->dofs_size) {
14239 dtrace_dof_error(dof, "invalid args");
14240 return (-1);
14241 }
14242
14243 typeidx = probe->dofpr_nargv;
14244 typestr = strtab + probe->dofpr_nargv;
14245 for (k = 0; k < probe->dofpr_nargc; k++) {
14246 if (typeidx >= str_sec->dofs_size) {
14247 dtrace_dof_error(dof, "bad "
14248 "native argument type");
14249 return (-1);
14250 }
14251
14252 typesz = strlen(typestr) + 1;
14253 if (typesz > DTRACE_ARGTYPELEN) {
14254 dtrace_dof_error(dof, "native "
14255 "argument type too long");
14256 return (-1);
14257 }
14258 typeidx += VBDTCAST(dof_stridx_t)typesz;
14259 typestr += typesz;
14260 }
14261
14262 typeidx = probe->dofpr_xargv;
14263 typestr = strtab + probe->dofpr_xargv;
14264 for (k = 0; k < probe->dofpr_xargc; k++) {
14265 if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
14266 dtrace_dof_error(dof, "bad "
14267 "native argument index");
14268 return (-1);
14269 }
14270
14271 if (typeidx >= str_sec->dofs_size) {
14272 dtrace_dof_error(dof, "bad "
14273 "translated argument type");
14274 return (-1);
14275 }
14276
14277 typesz = strlen(typestr) + 1;
14278 if (typesz > DTRACE_ARGTYPELEN) {
14279 dtrace_dof_error(dof, "translated argument "
14280 "type too long");
14281 return (-1);
14282 }
14283
14284 typeidx += VBDTCAST(dof_stridx_t)typesz;
14285 typestr += typesz;
14286 }
14287 }
14288
14289 return (0);
14290}
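
/*
 * The bounds check repeated throughout the validation above: an
 * (index, count) pair taken from untrusted DOF is accepted only if the
 * unsigned sum does not wrap and the scaled extent stays within the
 * section. As in the code above, the multiplication is trusted not to
 * wrap once the addition has been checked.
 */
#if 0 /* Illustrative sketch only -- not compiled. */
static int
dof_range_ok(uint32_t idx, uint32_t n, uint32_t entsize, uint32_t secsize)
{
	if (idx + n < idx)
		return (0);		/* the sum wrapped */

	if ((idx + n) * entsize > secsize)
		return (0);		/* extends past the section */

	return (1);
}
#endif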
14291
14292static int
14293dtrace_helper_slurp(dof_hdr_t *dof, dof_helper_t *dhp)
14294{
14295 dtrace_helpers_t *help;
14296 dtrace_vstate_t *vstate;
14297 dtrace_enabling_t *enab = NULL;
14298 int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
14299 uintptr_t daddr = (uintptr_t)dof;
14300
14301 ASSERT(MUTEX_HELD(&dtrace_lock));
14302
14303 if ((help = curproc->p_dtrace_helpers) == NULL)
14304 help = dtrace_helpers_create(curproc);
14305
14306 vstate = &help->dthps_vstate;
14307
14308 if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab,
14309 dhp != NULL ? dhp->dofhp_addr : 0, B_FALSE)) != 0) {
14310 dtrace_dof_destroy(dof);
14311 return (rv);
14312 }
14313
14314 /*
14315 * Look for helper providers and validate their descriptions.
14316 */
14317 if (dhp != NULL) {
14318 for (i = 0; i < VBDTCAST(int)dof->dofh_secnum; i++) {
14319 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
14320 dof->dofh_secoff + i * dof->dofh_secsize);
14321
14322 if (sec->dofs_type != DOF_SECT_PROVIDER)
14323 continue;
14324
14325 if (dtrace_helper_provider_validate(dof, sec) != 0) {
14326 dtrace_enabling_destroy(enab);
14327 dtrace_dof_destroy(dof);
14328 return (-1);
14329 }
14330
14331 nprovs++;
14332 }
14333 }
14334
14335 /*
14336 * Now we need to walk through the ECB descriptions in the enabling.
14337 */
14338 for (i = 0; i < enab->dten_ndesc; i++) {
14339 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
14340 dtrace_probedesc_t *desc = &ep->dted_probe;
14341
14342 if (strcmp(desc->dtpd_provider, "dtrace") != 0)
14343 continue;
14344
14345 if (strcmp(desc->dtpd_mod, "helper") != 0)
14346 continue;
14347
14348 if (strcmp(desc->dtpd_func, "ustack") != 0)
14349 continue;
14350
14351 if ((rv = dtrace_helper_action_add(DTRACE_HELPER_ACTION_USTACK,
14352 ep)) != 0) {
14353 /*
14354 * Adding this helper action failed -- we are now going
14355 * to rip out the entire generation and return failure.
14356 */
14357 (void) dtrace_helper_destroygen(help->dthps_generation);
14358 dtrace_enabling_destroy(enab);
14359 dtrace_dof_destroy(dof);
14360 return (-1);
14361 }
14362
14363 nhelpers++;
14364 }
14365
14366 if (nhelpers < enab->dten_ndesc)
14367 dtrace_dof_error(dof, "unmatched helpers");
14368
14369 gen = help->dthps_generation++;
14370 dtrace_enabling_destroy(enab);
14371
14372 if (dhp != NULL && nprovs > 0) {
14373 dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
14374 if (dtrace_helper_provider_add(dhp, gen) == 0) {
14375 mutex_exit(&dtrace_lock);
14376 dtrace_helper_provider_register(curproc, help, dhp);
14377 mutex_enter(&dtrace_lock);
14378
14379 destroy = 0;
14380 }
14381 }
14382
14383 if (destroy)
14384 dtrace_dof_destroy(dof);
14385
14386 return (gen);
14387}
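
/*
 * Only ECB descriptions naming the helper pseudo-probe survive the walk
 * above; anything else is skipped and later reported via the "unmatched
 * helpers" DOF error. The same triple match, assuming plain
 * NUL-terminated strings:
 */
#if 0 /* Illustrative sketch only -- not compiled. */
static int
is_helper_probe_desc(const char *prov, const char *mod, const char *func)
{
	return (strcmp(prov, "dtrace") == 0 &&
	    strcmp(mod, "helper") == 0 &&
	    strcmp(func, "ustack") == 0);
}
#endif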
14388
14389static dtrace_helpers_t *
14390dtrace_helpers_create(proc_t *p)
14391{
14392 dtrace_helpers_t *help;
14393
14394 ASSERT(MUTEX_HELD(&dtrace_lock));
14395 ASSERT(p->p_dtrace_helpers == NULL);
14396
14397 help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
14398 help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
14399 DTRACE_NHELPER_ACTIONS, KM_SLEEP);
14400
14401 p->p_dtrace_helpers = help;
14402 dtrace_helpers++;
14403
14404 return (help);
14405}
14406
14407static void
14408dtrace_helpers_destroy(void)
14409{
14410 dtrace_helpers_t *help;
14411 dtrace_vstate_t *vstate;
14412 proc_t *p = curproc;
14413 VBDTTYPE(uint_t, int) i;
14414
14415 mutex_enter(&dtrace_lock);
14416
14417 ASSERT(p->p_dtrace_helpers != NULL);
14418 ASSERT(dtrace_helpers > 0);
14419
14420 help = p->p_dtrace_helpers;
14421 vstate = &help->dthps_vstate;
14422
14423 /*
14424 * We're now going to lose the help from this process.
14425 */
14426 p->p_dtrace_helpers = NULL;
14427 dtrace_sync();
14428
14429 /*
14430	 * Destroy the helper actions.
14431 */
14432 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
14433 dtrace_helper_action_t *h, *next;
14434
14435 for (h = help->dthps_actions[i]; h != NULL; h = next) {
14436 next = h->dtha_next;
14437 dtrace_helper_action_destroy(h, vstate);
14439 }
14440 }
14441
14442 mutex_exit(&dtrace_lock);
14443
14444 /*
14445 * Destroy the helper providers.
14446 */
14447 if (help->dthps_maxprovs > 0) {
14448 mutex_enter(&dtrace_meta_lock);
14449 if (dtrace_meta_pid != NULL) {
14450 ASSERT(dtrace_deferred_pid == NULL);
14451
14452 for (i = 0; i < help->dthps_nprovs; i++) {
14453 dtrace_helper_provider_remove(
14454 &help->dthps_provs[i]->dthp_prov, p->p_pid);
14455 }
14456 } else {
14457 mutex_enter(&dtrace_lock);
14458 ASSERT(help->dthps_deferred == 0 ||
14459 help->dthps_next != NULL ||
14460 help->dthps_prev != NULL ||
14461 help == dtrace_deferred_pid);
14462
14463 /*
14464 * Remove the helper from the deferred list.
14465 */
14466 if (help->dthps_next != NULL)
14467 help->dthps_next->dthps_prev = help->dthps_prev;
14468 if (help->dthps_prev != NULL)
14469 help->dthps_prev->dthps_next = help->dthps_next;
14470 if (dtrace_deferred_pid == help) {
14471 dtrace_deferred_pid = help->dthps_next;
14472 ASSERT(help->dthps_prev == NULL);
14473 }
14474
14475 mutex_exit(&dtrace_lock);
14476 }
14477
14478 mutex_exit(&dtrace_meta_lock);
14479
14480 for (i = 0; i < help->dthps_nprovs; i++) {
14481 dtrace_helper_provider_destroy(help->dthps_provs[i]);
14482 }
14483
14484 kmem_free(help->dthps_provs, help->dthps_maxprovs *
14485 sizeof (dtrace_helper_provider_t *));
14486 }
14487
14488 mutex_enter(&dtrace_lock);
14489
14490 dtrace_vstate_fini(&help->dthps_vstate);
14491 kmem_free(help->dthps_actions,
14492 sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
14493 kmem_free(help, sizeof (dtrace_helpers_t));
14494
14495 --dtrace_helpers;
14496 mutex_exit(&dtrace_lock);
14497}
14498
14499static void
14500dtrace_helpers_duplicate(proc_t *from, proc_t *to)
14501{
14502 dtrace_helpers_t *help, *newhelp;
14503 dtrace_helper_action_t *helper, *new, *last;
14504 dtrace_difo_t *dp;
14505 dtrace_vstate_t *vstate;
14506 int i, j, sz, hasprovs = 0;
14507
14508 mutex_enter(&dtrace_lock);
14509 ASSERT(from->p_dtrace_helpers != NULL);
14510 ASSERT(dtrace_helpers > 0);
14511
14512 help = from->p_dtrace_helpers;
14513 newhelp = dtrace_helpers_create(to);
14514 ASSERT(to->p_dtrace_helpers != NULL);
14515
14516 newhelp->dthps_generation = help->dthps_generation;
14517 vstate = &newhelp->dthps_vstate;
14518
14519 /*
14520 * Duplicate the helper actions.
14521 */
14522 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
14523 if ((helper = help->dthps_actions[i]) == NULL)
14524 continue;
14525
14526 for (last = NULL; helper != NULL; helper = helper->dtha_next) {
14527 new = kmem_zalloc(sizeof (dtrace_helper_action_t),
14528 KM_SLEEP);
14529 new->dtha_generation = helper->dtha_generation;
14530
14531 if ((dp = helper->dtha_predicate) != NULL) {
14532 dp = dtrace_difo_duplicate(dp, vstate);
14533 new->dtha_predicate = dp;
14534 }
14535
14536 new->dtha_nactions = helper->dtha_nactions;
14537 sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
14538 new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
14539
14540 for (j = 0; j < new->dtha_nactions; j++) {
14541 dtrace_difo_t *dp2 = helper->dtha_actions[j];
14542
14543 ASSERT(dp2 != NULL);
14544 dp2 = dtrace_difo_duplicate(dp2, vstate);
14545 new->dtha_actions[j] = dp2;
14546 }
14547
14548 if (last != NULL) {
14549 last->dtha_next = new;
14550 } else {
14551 newhelp->dthps_actions[i] = new;
14552 }
14553
14554 last = new;
14555 }
14556 }
14557
14558 /*
14559 * Duplicate the helper providers and register them with the
14560 * DTrace framework.
14561 */
14562 if (help->dthps_nprovs > 0) {
14563 newhelp->dthps_nprovs = help->dthps_nprovs;
14564 newhelp->dthps_maxprovs = help->dthps_nprovs;
14565 newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
14566 sizeof (dtrace_helper_provider_t *), KM_SLEEP);
14567 for (i = 0; i < VBDTCAST(int)newhelp->dthps_nprovs; i++) {
14568 newhelp->dthps_provs[i] = help->dthps_provs[i];
14569 newhelp->dthps_provs[i]->dthp_ref++;
14570 }
14571
14572 hasprovs = 1;
14573 }
14574
14575 mutex_exit(&dtrace_lock);
14576
14577 if (hasprovs)
14578 dtrace_helper_provider_register(to, newhelp, NULL);
14579}
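
/*
 * Fork-time duplication mixes two ownership strategies: helper actions
 * and their DIFOs are deep-copied so each process can destroy its own,
 * while helper providers are shared and merely reference-counted (the
 * dthp_ref++ above, balanced in dtrace_helper_provider_destroy()). A
 * minimal model of the shared case:
 */
#if 0 /* Illustrative sketch only -- not compiled. */
typedef struct shared_prov {
	int sp_ref;
} shared_prov_t;

static void
shared_prov_hold(shared_prov_t *sp)
{
	sp->sp_ref++;			/* parent and child both reference sp */
}

static int
shared_prov_rele(shared_prov_t *sp)
{
	return (--sp->sp_ref == 0);	/* nonzero: caller must free sp */
}
#endif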
14580
14581/*
14582 * DTrace Hook Functions
14583 */
14584static void
14585dtrace_module_loaded(struct modctl *ctl)
14586{
14587 dtrace_provider_t *prv;
14588
14589 mutex_enter(&dtrace_provider_lock);
14590 mutex_enter(&mod_lock);
14591
14592 ASSERT(ctl->mod_busy);
14593
14594 /*
14595	 * We're going to call each provider's per-module provide operation
14596 * specifying only this module.
14597 */
14598 for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
14599 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
14600
14601 mutex_exit(&mod_lock);
14602 mutex_exit(&dtrace_provider_lock);
14603
14604 /*
14605 * If we have any retained enablings, we need to match against them.
14606 * Enabling probes requires that cpu_lock be held, and we cannot hold
14607 * cpu_lock here -- it is legal for cpu_lock to be held when loading a
14608 * module. (In particular, this happens when loading scheduling
14609 * classes.) So if we have any retained enablings, we need to dispatch
14610 * our task queue to do the match for us.
14611 */
14612 mutex_enter(&dtrace_lock);
14613
14614 if (dtrace_retained == NULL) {
14615 mutex_exit(&dtrace_lock);
14616 return;
14617 }
14618
14619 (void) taskq_dispatch(dtrace_taskq,
14620 (task_func_t *)dtrace_enabling_matchall, NULL, TQ_SLEEP);
14621
14622 mutex_exit(&dtrace_lock);
14623
14624 /*
14625 * And now, for a little heuristic sleaze: in general, we want to
14626 * match modules as soon as they load. However, we cannot guarantee
14627 * this, because it would lead us to the lock ordering violation
14628 * outlined above. The common case, of course, is that cpu_lock is
14629 * _not_ held -- so we delay here for a clock tick, hoping that that's
14630 * long enough for the task queue to do its work. If it's not, it's
14631 * not a serious problem -- it just means that the module that we
14632 * just loaded may not be immediately instrumentable.
14633 */
14634 delay(1);
14635}
14636
14637static void
14638dtrace_module_unloaded(struct modctl *ctl)
14639{
14640 dtrace_probe_t template, *probe, *first, *next;
14641 dtrace_provider_t *prov;
14642
14643 template.dtpr_mod = ctl->mod_modname;
14644
14645 mutex_enter(&dtrace_provider_lock);
14646 mutex_enter(&mod_lock);
14647 mutex_enter(&dtrace_lock);
14648
14649 if (dtrace_bymod == NULL) {
14650 /*
14651 * The DTrace module is loaded (obviously) but not attached;
14652 * we don't have any work to do.
14653 */
14654 mutex_exit(&dtrace_provider_lock);
14655 mutex_exit(&mod_lock);
14656 mutex_exit(&dtrace_lock);
14657 return;
14658 }
14659
14660 for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
14661 probe != NULL; probe = probe->dtpr_nextmod) {
14662 if (probe->dtpr_ecb != NULL) {
14663 mutex_exit(&dtrace_provider_lock);
14664 mutex_exit(&mod_lock);
14665 mutex_exit(&dtrace_lock);
14666
14667 /*
14668 * This shouldn't _actually_ be possible -- we're
14669 * unloading a module that has an enabled probe in it.
14670 * (It's normally up to the provider to make sure that
14671 * this can't happen.) However, because dtps_enable()
14672 * doesn't have a failure mode, there can be an
14673 * enable/unload race. Upshot: we don't want to
14674 * assert, but we're not going to disable the
14675 * probe, either.
14676 */
14677 if (dtrace_err_verbose) {
14678 cmn_err(CE_WARN, "unloaded module '%s' had "
14679 "enabled probes", ctl->mod_modname);
14680 }
14681
14682 return;
14683 }
14684 }
14685
14686 probe = first;
14687
14688 for (first = NULL; probe != NULL; probe = next) {
14689 ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
14690
14691 dtrace_probes[probe->dtpr_id - 1] = NULL;
14692
14693 next = probe->dtpr_nextmod;
14694 dtrace_hash_remove(dtrace_bymod, probe);
14695 dtrace_hash_remove(dtrace_byfunc, probe);
14696 dtrace_hash_remove(dtrace_byname, probe);
14697
14698 if (first == NULL) {
14699 first = probe;
14700 probe->dtpr_nextmod = NULL;
14701 } else {
14702 probe->dtpr_nextmod = first;
14703 first = probe;
14704 }
14705 }
14706
14707 /*
14708 * We've removed all of the module's probes from the hash chains and
14709 * from the probe array. Now issue a dtrace_sync() to be sure that
14710 * everyone has cleared out from any probe array processing.
14711 */
14712 dtrace_sync();
14713
14714 for (probe = first; probe != NULL; probe = first) {
14715 first = probe->dtpr_nextmod;
14716 prov = probe->dtpr_provider;
14717 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
14718 probe->dtpr_arg);
14719 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
14720 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
14721 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
14722 vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
14723 kmem_free(probe, sizeof (dtrace_probe_t));
14724 }
14725
14726 mutex_exit(&dtrace_lock);
14727 mutex_exit(&mod_lock);
14728 mutex_exit(&dtrace_provider_lock);
14729}
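
/*
 * The second loop in dtrace_module_unloaded() that rebuilds 'first' by
 * prepending is a plain in-place list reversal, so the destruction pass
 * that follows visits the probes in reverse of their chain order. The
 * same operation in isolation, with a hypothetical node type:
 */
#if 0 /* Illustrative sketch only -- not compiled. */
typedef struct pnode {
	struct pnode *pn_next;
} pnode_t;

static pnode_t *
list_reverse(pnode_t *head)
{
	pnode_t *rev = NULL, *next;

	for (; head != NULL; head = next) {
		next = head->pn_next;
		head->pn_next = rev;	/* prepend onto the reversed list */
		rev = head;
	}

	return (rev);
}
#endif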
14730
14731#endif /* !VBOX */
14732
14733VBDTSTATIC void
14734dtrace_suspend(void)
14735{
14736 dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
14737}
14738
14739VBDTSTATIC void
14740dtrace_resume(void)
14741{
14742 dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
14743}
14744
14745#ifdef VBOX
14746typedef enum {
14747 CPU_INVALID,
14748 CPU_CONFIG,
14749 CPU_UNCONFIG
14750} cpu_setup_t;
14751#endif
14752
14753
14754static int
14755dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
14756{
14757 ASSERT(MUTEX_HELD(&cpu_lock));
14758 mutex_enter(&dtrace_lock);
14759
14760 switch (what) {
14761 case CPU_CONFIG: {
14762 dtrace_state_t *state;
14763 dtrace_optval_t *opt, rs, c;
14764
14765 /*
14766 * For now, we only allocate a new buffer for anonymous state.
14767 */
14768 if ((state = dtrace_anon.dta_state) == NULL)
14769 break;
14770
14771 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
14772 break;
14773
14774 opt = state->dts_options;
14775 c = opt[DTRACEOPT_CPU];
14776
14777 if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
14778 break;
14779
14780 /*
14781 * Regardless of what the actual policy is, we're going to
14782 * temporarily set our resize policy to be manual. We're
14783 * also going to temporarily set our CPU option to denote
14784 * the newly configured CPU.
14785 */
14786 rs = opt[DTRACEOPT_BUFRESIZE];
14787 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
14788 opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;
14789
14790 (void) dtrace_state_buffers(state);
14791
14792 opt[DTRACEOPT_BUFRESIZE] = rs;
14793 opt[DTRACEOPT_CPU] = c;
14794
14795 break;
14796 }
14797
14798 case CPU_UNCONFIG:
14799 /*
14800 * We don't free the buffer in the CPU_UNCONFIG case. (The
14801 * buffer will be freed when the consumer exits.)
14802 */
14803 break;
14804
14805 default:
14806 break;
14807 }
14808
14809 mutex_exit(&dtrace_lock);
14810 return (0);
14811}
14812
14813#ifndef VBOX
14814static void
14815dtrace_cpu_setup_initial(processorid_t cpu)
14816{
14817 (void) dtrace_cpu_setup(CPU_CONFIG, cpu);
14818}
14819#endif /* !VBOX */
14820
14821static void
14822dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
14823{
14824 if (dtrace_toxranges >= dtrace_toxranges_max) {
14825 int osize, nsize;
14826 dtrace_toxrange_t *range;
14827
14828 osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
14829
14830 if (osize == 0) {
14831 ASSERT(dtrace_toxrange == NULL);
14832 ASSERT(dtrace_toxranges_max == 0);
14833 dtrace_toxranges_max = 1;
14834 } else {
14835 dtrace_toxranges_max <<= 1;
14836 }
14837
14838 nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
14839 range = kmem_zalloc(nsize, KM_SLEEP);
14840
14841 if (dtrace_toxrange != NULL) {
14842 ASSERT(osize != 0);
14843 bcopy(dtrace_toxrange, range, osize);
14844 kmem_free(dtrace_toxrange, osize);
14845 }
14846
14847 dtrace_toxrange = range;
14848 }
14849
14850 ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == NULL);
14851 ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == NULL);
14852
14853 dtrace_toxrange[dtrace_toxranges].dtt_base = base;
14854 dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
14855 dtrace_toxranges++;
14856}
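
/*
 * How a consumer of the table built above might test a candidate access,
 * assuming each entry describes the half-open range [dtt_base, dtt_limit):
 * an access of 'sz' bytes at 'addr' is toxic if it overlaps any range.
 * This is a sketch only; the real checks live in the probe-context
 * load-safety code elsewhere in this file.
 */
#if 0 /* Illustrative sketch only -- not compiled. */
static int
toxrange_overlaps(uintptr_t addr, size_t sz)
{
	int i;

	for (i = 0; i < dtrace_toxranges; i++) {
		if (addr + sz > dtrace_toxrange[i].dtt_base &&
		    addr < dtrace_toxrange[i].dtt_limit)
			return (1);
	}

	return (0);
}
#endif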
14857
14858/*
14859 * DTrace Driver Cookbook Functions
14860 */
14861#ifdef VBOX
14862int dtrace_attach(void)
14863#else
14864/*ARGSUSED*/
14865static int
14866dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
14867#endif
14868{
14869 dtrace_provider_id_t id;
14870 dtrace_state_t *state = NULL;
14871 dtrace_enabling_t *enab;
14872
14873#ifdef VBOX
14874 if ( VBoxDtMutexInit(&dtrace_lock)
14875 || VBoxDtMutexInit(&dtrace_provider_lock)
14876 || VBoxDtMutexInit(&dtrace_meta_lock)
14877# ifdef DEBUG
14878 || VBoxDtMutexInit(&dtrace_errlock)
14879# endif
14880 )
14881 return (DDI_FAILURE);
14882#endif
14883
14884 mutex_enter(&cpu_lock);
14885 mutex_enter(&dtrace_provider_lock);
14886 mutex_enter(&dtrace_lock);
14887
14888#ifndef VBOX
14889 if (ddi_soft_state_init(&dtrace_softstate,
14890 sizeof (dtrace_state_t), 0) != 0) {
14891 cmn_err(CE_NOTE, "/dev/dtrace failed to initialize soft state");
14892 mutex_exit(&cpu_lock);
14893 mutex_exit(&dtrace_provider_lock);
14894 mutex_exit(&dtrace_lock);
14895 return (DDI_FAILURE);
14896 }
14897
14898 if (ddi_create_minor_node(devi, DTRACEMNR_DTRACE, S_IFCHR,
14899 DTRACEMNRN_DTRACE, DDI_PSEUDO, NULL) == DDI_FAILURE ||
14900 ddi_create_minor_node(devi, DTRACEMNR_HELPER, S_IFCHR,
14901 DTRACEMNRN_HELPER, DDI_PSEUDO, NULL) == DDI_FAILURE) {
14902 cmn_err(CE_NOTE, "/dev/dtrace couldn't create minor nodes");
14903 ddi_remove_minor_node(devi, NULL);
14904 ddi_soft_state_fini(&dtrace_softstate);
14905 mutex_exit(&cpu_lock);
14906 mutex_exit(&dtrace_provider_lock);
14907 mutex_exit(&dtrace_lock);
14908 return (DDI_FAILURE);
14909 }
14910
14911 ddi_report_dev(devi);
14912 dtrace_devi = devi;
14913
14914 dtrace_modload = dtrace_module_loaded;
14915 dtrace_modunload = dtrace_module_unloaded;
14916 dtrace_cpu_init = dtrace_cpu_setup_initial;
14917 dtrace_helpers_cleanup = dtrace_helpers_destroy;
14918 dtrace_helpers_fork = dtrace_helpers_duplicate;
14919 dtrace_cpustart_init = dtrace_suspend;
14920 dtrace_cpustart_fini = dtrace_resume;
14921 dtrace_debugger_init = dtrace_suspend;
14922 dtrace_debugger_fini = dtrace_resume;
14923
14924 register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
14925#else
14926	/** @todo some of these hooks need checking out! */
14927#endif
14928
14929 ASSERT(MUTEX_HELD(&cpu_lock));
14930
14931#ifndef VBOX /* Reduce the area a bit just to be sure our vmem fake doesn't blow up. */
14932 dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1,
14933 NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
14934#else
14935 dtrace_arena = vmem_create("dtrace", (void *)(uintptr_t)1, UINT32_MAX - 16, 1,
14936 NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
14937#endif
14938#ifndef VBOX
14939 dtrace_minor = vmem_create("dtrace_minor", (void *)DTRACEMNRN_CLONE,
14940 UINT32_MAX - DTRACEMNRN_CLONE, 1, NULL, NULL, NULL, 0,
14941 VM_SLEEP | VMC_IDENTIFIER);
14942 dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri,
14943 1, INT_MAX, 0);
14944#endif
14945
14946 dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
14947 sizeof (dtrace_dstate_percpu_t) * NCPU, DTRACE_STATE_ALIGN,
14948 NULL, NULL, NULL, NULL, NULL, 0);
14949
14950 ASSERT(MUTEX_HELD(&cpu_lock));
14951 dtrace_bymod = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod),
14952 offsetof(dtrace_probe_t, dtpr_nextmod),
14953 offsetof(dtrace_probe_t, dtpr_prevmod));
14954
14955 dtrace_byfunc = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_func),
14956 offsetof(dtrace_probe_t, dtpr_nextfunc),
14957 offsetof(dtrace_probe_t, dtpr_prevfunc));
14958
14959 dtrace_byname = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_name),
14960 offsetof(dtrace_probe_t, dtpr_nextname),
14961 offsetof(dtrace_probe_t, dtpr_prevname));
14962
14963 if (dtrace_retain_max < 1) {
14964 cmn_err(CE_WARN, "illegal value (%lu) for dtrace_retain_max; "
14965 "setting to 1", dtrace_retain_max);
14966 dtrace_retain_max = 1;
14967 }
14968
14969 /*
14970 * Now discover our toxic ranges.
14971 */
14972 dtrace_toxic_ranges(dtrace_toxrange_add);
14973
14974 /*
14975 * Before we register ourselves as a provider to our own framework,
14976 * we would like to assert that dtrace_provider is NULL -- but that's
14977 * not true if we were loaded as a dependency of a DTrace provider.
14978 * Once we've registered, we can assert that dtrace_provider is our
14979 * pseudo provider.
14980 */
14981 (void) dtrace_register("dtrace", &dtrace_provider_attr,
14982 DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id);
14983
14984 ASSERT(dtrace_provider != NULL);
14985 ASSERT((dtrace_provider_id_t)dtrace_provider == id);
14986
14987 dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
14988 dtrace_provider, NULL, NULL, "BEGIN", 0, NULL);
14989 dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
14990 dtrace_provider, NULL, NULL, "END", 0, NULL);
14991 dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
14992 dtrace_provider, NULL, NULL, "ERROR", 1, NULL);
14993
14994#ifndef VBOX
14995 dtrace_anon_property();
14996#endif
14997 mutex_exit(&cpu_lock);
14998
14999 /*
15000 * If DTrace helper tracing is enabled, we need to allocate the
15001 * trace buffer and initialize the values.
15002 */
15003 if (dtrace_helptrace_enabled) {
15004 ASSERT(dtrace_helptrace_buffer == NULL);
15005 dtrace_helptrace_buffer =
15006 kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
15007 dtrace_helptrace_next = 0;
15008 }
15009
15010 /*
15011 * If there are already providers, we must ask them to provide their
15012 * probes, and then match any anonymous enabling against them. Note
15013 * that there should be no other retained enablings at this time:
15014 * the only retained enablings at this time should be the anonymous
15015 * enabling.
15016 */
15017 if (dtrace_anon.dta_enabling != NULL) {
15018 ASSERT(dtrace_retained == dtrace_anon.dta_enabling);
15019
15020 dtrace_enabling_provide(NULL);
15021 state = dtrace_anon.dta_state;
15022
15023 /*
15024 * We couldn't hold cpu_lock across the above call to
15025 * dtrace_enabling_provide(), but we must hold it to actually
15026 * enable the probes. We have to drop all of our locks, pick
15027 * up cpu_lock, and regain our locks before matching the
15028 * retained anonymous enabling.
15029 */
15030 mutex_exit(&dtrace_lock);
15031 mutex_exit(&dtrace_provider_lock);
15032
15033 mutex_enter(&cpu_lock);
15034 mutex_enter(&dtrace_provider_lock);
15035 mutex_enter(&dtrace_lock);
15036
15037 if ((enab = dtrace_anon.dta_enabling) != NULL)
15038 (void) dtrace_enabling_match(enab, NULL);
15039
15040 mutex_exit(&cpu_lock);
15041 }
15042
15043 mutex_exit(&dtrace_lock);
15044 mutex_exit(&dtrace_provider_lock);
15045
15046 if (state != NULL) {
15047 /*
15048 * If we created any anonymous state, set it going now.
15049 */
15050 (void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
15051 }
15052
15053 return (DDI_SUCCESS);
15054}
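
/*
 * dtrace_hash_create() above takes three offsetof() values so that a
 * single hash implementation can chain any structure that embeds its own
 * next/prev links. The pointer arithmetic behind that, in miniature,
 * with a hypothetical structure:
 */
#if 0 /* Illustrative sketch only -- not compiled. */
typedef struct widget {
	const char	*w_name;
	struct widget	*w_nextname;	/* intrusive hash linkage */
} widget_t;

static widget_t *
link_at(widget_t *w, size_t nextoffs)
{
	/* With nextoffs == offsetof(widget_t, w_nextname), reads w->w_nextname. */
	return (*(widget_t **)((char *)w + nextoffs));
}
#endif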
15055
15056#ifdef VBOX
15057int dtrace_open(dtrace_state_t **ppState, cred_t *cred_p)
15058#else
15059/*ARGSUSED*/
15060static int
15061dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
15062#endif
15063{
15064 dtrace_state_t *state;
15065 uint32_t priv;
15066 uid_t uid;
15067 zoneid_t zoneid;
15068
15069#ifndef VBOX
15070 if (getminor(*devp) == DTRACEMNRN_HELPER)
15071 return (0);
15072
15073 /*
15074 * If this wasn't an open with the "helper" minor, then it must be
15075 * the "dtrace" minor.
15076 */
15077 if (getminor(*devp) != DTRACEMNRN_DTRACE)
15078 return (ENXIO);
15079#endif /* !VBOX */
15080
15081 /*
15082 * If no DTRACE_PRIV_* bits are set in the credential, then the
15083 * caller lacks sufficient permission to do anything with DTrace.
15084 */
15085 dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
15086 if (priv == DTRACE_PRIV_NONE)
15087 return (EACCES);
15088
15089 /*
15090 * Ask all providers to provide all their probes.
15091 */
15092 mutex_enter(&dtrace_provider_lock);
15093 dtrace_probe_provide(NULL, NULL);
15094 mutex_exit(&dtrace_provider_lock);
15095
15096 mutex_enter(&cpu_lock);
15097 mutex_enter(&dtrace_lock);
15098 dtrace_opens++;
15099 dtrace_membar_producer();
15100
15101#ifndef VBOX
15102 /*
15103 * If the kernel debugger is active (that is, if the kernel debugger
15104 * modified text in some way), we won't allow the open.
15105 */
15106 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
15107 dtrace_opens--;
15108 mutex_exit(&cpu_lock);
15109 mutex_exit(&dtrace_lock);
15110 return (EBUSY);
15111 }
15112#endif
15113
15114#ifndef VBOX
15115 state = dtrace_state_create(devp, cred_p);
15116#else
15117 state = dtrace_state_create(cred_p);
15118#endif
15119 mutex_exit(&cpu_lock);
15120
15121 if (state == NULL) {
15122#ifndef VBOX
15123 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
15124 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
15125#else
15126 dtrace_opens--;
15127#endif
15128 mutex_exit(&dtrace_lock);
15129 return (EAGAIN);
15130 }
15131
15132 mutex_exit(&dtrace_lock);
15133
15134#ifdef VBOX
15135 *ppState = state;
15136#endif
15137 return (0);
15138}
15139
15140#ifdef VBOX
15141int dtrace_close(dtrace_state_t *state)
15142#else
15143/*ARGSUSED*/
15144static int
15145dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
15146#endif
15147{
15148#ifndef VBOX
15149 minor_t minor = getminor(dev);
15150 dtrace_state_t *state;
15151
15152 if (minor == DTRACEMNRN_HELPER)
15153 return (0);
15154
15155 state = ddi_get_soft_state(dtrace_softstate, minor);
15156#endif
15157
15158 mutex_enter(&cpu_lock);
15159 mutex_enter(&dtrace_lock);
15160
15161 if (state->dts_anon) {
15162 /*
15163 * There is anonymous state. Destroy that first.
15164 */
15165 ASSERT(dtrace_anon.dta_state == NULL);
15166 dtrace_state_destroy(state->dts_anon);
15167 }
15168
15169 dtrace_state_destroy(state);
15170 ASSERT(dtrace_opens > 0);
15171
15172#ifndef VBOX
15173 /*
15174 * Only relinquish control of the kernel debugger interface when there
15175 * are no consumers and no anonymous enablings.
15176 */
15177 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
15178 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
15179#else
15180 dtrace_opens--;
15181#endif
15182
15183 mutex_exit(&dtrace_lock);
15184 mutex_exit(&cpu_lock);
15185
15186 return (0);
15187}
15188
15189#ifndef VBOX
15190/*ARGSUSED*/
15191static int
15192dtrace_ioctl_helper(int cmd, intptr_t arg, int *rv)
15193{
15194 int rval;
15195 dof_helper_t help, *dhp = NULL;
15196
15197 switch (cmd) {
15198 case DTRACEHIOC_ADDDOF:
15199 if (copyin((void *)arg, &help, sizeof (help)) != 0) {
15200 dtrace_dof_error(NULL, "failed to copyin DOF helper");
15201 return (EFAULT);
15202 }
15203
15204 dhp = &help;
15205 arg = (intptr_t)help.dofhp_dof;
15206 /*FALLTHROUGH*/
15207
15208 case DTRACEHIOC_ADD: {
15209 dof_hdr_t *dof = dtrace_dof_copyin(arg, &rval);
15210
15211 if (dof == NULL)
15212 return (rval);
15213
15214 mutex_enter(&dtrace_lock);
15215
15216 /*
15217 * dtrace_helper_slurp() takes responsibility for the dof --
15218 * it may free it now or it may save it and free it later.
15219 */
15220 if ((rval = dtrace_helper_slurp(dof, dhp)) != -1) {
15221 *rv = rval;
15222 rval = 0;
15223 } else {
15224 rval = EINVAL;
15225 }
15226
15227 mutex_exit(&dtrace_lock);
15228 return (rval);
15229 }
15230
15231 case DTRACEHIOC_REMOVE: {
15232 mutex_enter(&dtrace_lock);
15233 rval = dtrace_helper_destroygen(arg);
15234 mutex_exit(&dtrace_lock);
15235
15236 return (rval);
15237 }
15238
15239 default:
15240 break;
15241 }
15242
15243 return (ENOTTY);
15244}
15245#endif /* !VBOX */
15246
15247#ifdef VBOX
15248int dtrace_ioctl(dtrace_state_t *state, int cmd, intptr_t arg, int32_t *rv)
15249#else
15250/*ARGSUSED*/
15251static int
15252dtrace_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
15253#endif
15254{
15255#ifndef VBOX
15256 minor_t minor = getminor(dev);
15257 dtrace_state_t *state;
15258#endif
15259 int rval;
15260
15261#ifndef VBOX
15262 if (minor == DTRACEMNRN_HELPER)
15263 return (dtrace_ioctl_helper(cmd, arg, rv));
15264
15265 state = ddi_get_soft_state(dtrace_softstate, minor);
15266#endif
15267
15268 if (state->dts_anon) {
15269 ASSERT(dtrace_anon.dta_state == NULL);
15270 state = state->dts_anon;
15271 }
15272
15273 switch (cmd) {
15274 case DTRACEIOC_PROVIDER: {
15275 dtrace_providerdesc_t pvd;
15276 dtrace_provider_t *pvp;
15277
15278 if (copyin((void *)arg, &pvd, sizeof (pvd)) != 0)
15279 return (EFAULT);
15280
15281 pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
15282 mutex_enter(&dtrace_provider_lock);
15283
15284 for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
15285 if (strcmp(pvp->dtpv_name, pvd.dtvd_name) == 0)
15286 break;
15287 }
15288
15289 mutex_exit(&dtrace_provider_lock);
15290
15291 if (pvp == NULL)
15292 return (ESRCH);
15293
15294 bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
15295 bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));
15296 if (copyout(&pvd, (void *)arg, sizeof (pvd)) != 0)
15297 return (EFAULT);
15298
15299 return (0);
15300 }
15301
15302 case DTRACEIOC_EPROBE: {
15303 dtrace_eprobedesc_t epdesc;
15304 dtrace_ecb_t *ecb;
15305 dtrace_action_t *act;
15306 void *buf;
15307 size_t size;
15308 uintptr_t dest;
15309 int nrecs;
15310
15311 if (copyin((void *)arg, &epdesc, sizeof (epdesc)) != 0)
15312 return (EFAULT);
15313
15314 mutex_enter(&dtrace_lock);
15315
15316 if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
15317 mutex_exit(&dtrace_lock);
15318 return (EINVAL);
15319 }
15320
15321 if (ecb->dte_probe == NULL) {
15322 mutex_exit(&dtrace_lock);
15323 return (EINVAL);
15324 }
15325
15326 epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
15327 epdesc.dtepd_uarg = ecb->dte_uarg;
15328 epdesc.dtepd_size = VBDTCAST(uint32_t)ecb->dte_size;
15329
15330 nrecs = epdesc.dtepd_nrecs;
15331 epdesc.dtepd_nrecs = 0;
15332 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
15333 if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
15334 continue;
15335
15336 epdesc.dtepd_nrecs++;
15337 }
15338
15339 /*
15340 * Now that we have the size, we need to allocate a temporary
15341 * buffer in which to store the complete description. We need
15342 * the temporary buffer to be able to drop dtrace_lock()
15343 * across the copyout(), below.
15344 */
15345 size = sizeof (dtrace_eprobedesc_t) +
15346 (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));
15347
15348 buf = kmem_alloc(size, KM_SLEEP);
15349 dest = (uintptr_t)buf;
15350
15351 bcopy(&epdesc, (void *)dest, sizeof (epdesc));
15352 dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);
15353
15354 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
15355 if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
15356 continue;
15357
15358 if (nrecs-- == 0)
15359 break;
15360
15361 bcopy(&act->dta_rec, (void *)dest,
15362 sizeof (dtrace_recdesc_t));
15363 dest += sizeof (dtrace_recdesc_t);
15364 }
15365
15366 mutex_exit(&dtrace_lock);
15367
15368 if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
15369 kmem_free(buf, size);
15370 return (EFAULT);
15371 }
15372
15373 kmem_free(buf, size);
15374 return (0);
15375 }
15376
15377 case DTRACEIOC_AGGDESC: {
15378 dtrace_aggdesc_t aggdesc;
15379 dtrace_action_t *act;
15380 dtrace_aggregation_t *agg;
15381 int nrecs;
15382 uint32_t offs;
15383 dtrace_recdesc_t *lrec;
15384 void *buf;
15385 size_t size;
15386 uintptr_t dest;
15387
15388 if (copyin((void *)arg, &aggdesc, sizeof (aggdesc)) != 0)
15389 return (EFAULT);
15390
15391 mutex_enter(&dtrace_lock);
15392
15393 if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
15394 mutex_exit(&dtrace_lock);
15395 return (EINVAL);
15396 }
15397
15398 aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;
15399
15400 nrecs = aggdesc.dtagd_nrecs;
15401 aggdesc.dtagd_nrecs = 0;
15402
15403 offs = agg->dtag_base;
15404 lrec = &agg->dtag_action.dta_rec;
15405 aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;
15406
15407 for (act = agg->dtag_first; ; act = act->dta_next) {
15408 ASSERT(act->dta_intuple ||
15409 DTRACEACT_ISAGG(act->dta_kind));
15410
15411 /*
15412 * If this action has a record size of zero, it
15413 * denotes an argument to the aggregating action.
15414 * Because the presence of this record doesn't (or
15415 * shouldn't) affect the way the data is interpreted,
15416 * we don't copy it out to save user-level the
15417 * confusion of dealing with a zero-length record.
15418 */
15419 if (act->dta_rec.dtrd_size == 0) {
15420 ASSERT(agg->dtag_hasarg);
15421 continue;
15422 }
15423
15424 aggdesc.dtagd_nrecs++;
15425
15426 if (act == &agg->dtag_action)
15427 break;
15428 }
15429
15430 /*
15431 * Now that we have the size, we need to allocate a temporary
15432 * buffer in which to store the complete description. We need
15433 * the temporary buffer to be able to drop dtrace_lock()
15434 * across the copyout(), below.
15435 */
15436 size = sizeof (dtrace_aggdesc_t) +
15437 (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));
15438
15439 buf = kmem_alloc(size, KM_SLEEP);
15440 dest = (uintptr_t)buf;
15441
15442 bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
15443 dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);
15444
15445 for (act = agg->dtag_first; ; act = act->dta_next) {
15446 dtrace_recdesc_t rec = act->dta_rec;
15447
15448 /*
15449 * See the comment in the above loop for why we pass
15450 * over zero-length records.
15451 */
15452 if (rec.dtrd_size == 0) {
15453 ASSERT(agg->dtag_hasarg);
15454 continue;
15455 }
15456
15457 if (nrecs-- == 0)
15458 break;
15459
15460 rec.dtrd_offset -= offs;
15461 bcopy(&rec, (void *)dest, sizeof (rec));
15462 dest += sizeof (dtrace_recdesc_t);
15463
15464 if (act == &agg->dtag_action)
15465 break;
15466 }
15467
15468 mutex_exit(&dtrace_lock);
15469
15470 if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
15471 kmem_free(buf, size);
15472 return (EFAULT);
15473 }
15474
15475 kmem_free(buf, size);
15476 return (0);
15477 }
15478
15479 case DTRACEIOC_ENABLE: {
15480 dof_hdr_t *dof;
15481 dtrace_enabling_t *enab = NULL;
15482 dtrace_vstate_t *vstate;
15483 int err = 0;
15484#ifdef VBOX
15485 cred_t *cr = CRED();
15486#endif
15487
15488 *rv = 0;
15489
15490 /*
15491 * If a NULL argument has been passed, we take this as our
15492 * cue to reevaluate our enablings.
15493 */
15494 if (arg == NULL) {
15495 dtrace_enabling_matchall();
15496
15497 return (0);
15498 }
15499
15500 if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
15501 return (rval);
15502
15503 mutex_enter(&cpu_lock);
15504 mutex_enter(&dtrace_lock);
15505 vstate = &state->dts_vstate;
15506
15507 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
15508 mutex_exit(&dtrace_lock);
15509 mutex_exit(&cpu_lock);
15510 dtrace_dof_destroy(dof);
15511 return (EBUSY);
15512 }
15513
15514 if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
15515 mutex_exit(&dtrace_lock);
15516 mutex_exit(&cpu_lock);
15517 dtrace_dof_destroy(dof);
15518 return (EINVAL);
15519 }
15520
15521 if ((rval = dtrace_dof_options(dof, state)) != 0) {
15522 dtrace_enabling_destroy(enab);
15523 mutex_exit(&dtrace_lock);
15524 mutex_exit(&cpu_lock);
15525 dtrace_dof_destroy(dof);
15526 return (rval);
15527 }
15528
15529 if ((err = dtrace_enabling_match(enab, rv)) == 0) {
15530 err = dtrace_enabling_retain(enab);
15531 } else {
15532 dtrace_enabling_destroy(enab);
15533 }
15534
15535 mutex_exit(&cpu_lock);
15536 mutex_exit(&dtrace_lock);
15537 dtrace_dof_destroy(dof);
15538
15539 return (err);
15540 }
15541
15542 case DTRACEIOC_REPLICATE: {
15543 dtrace_repldesc_t desc;
15544 dtrace_probedesc_t *match = &desc.dtrpd_match;
15545 dtrace_probedesc_t *create = &desc.dtrpd_create;
15546 int err;
15547
15548 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
15549 return (EFAULT);
15550
15551 match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
15552 match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
15553 match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
15554 match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
15555
15556 create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
15557 create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
15558 create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
15559 create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
15560
15561 mutex_enter(&dtrace_lock);
15562 err = dtrace_enabling_replicate(state, match, create);
15563 mutex_exit(&dtrace_lock);
15564
15565 return (err);
15566 }
15567
15568 case DTRACEIOC_PROBEMATCH:
15569 case DTRACEIOC_PROBES: {
15570 dtrace_probe_t *probe = NULL;
15571 dtrace_probedesc_t desc;
15572 dtrace_probekey_t pkey;
15573 dtrace_id_t i;
15574 int m = 0;
15575 uint32_t priv;
15576 uid_t uid;
15577 zoneid_t zoneid;
15578#ifdef VBOX
15579 cred_t *cr = CRED();
15580#endif
15581
15582 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
15583 return (EFAULT);
15584
15585 desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
15586 desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
15587 desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
15588 desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';
15589
15590 /*
15591 * Before we attempt to match this probe, we want to give
15592 * all providers the opportunity to provide it.
15593 */
15594 if (desc.dtpd_id == DTRACE_IDNONE) {
15595 mutex_enter(&dtrace_provider_lock);
15596 dtrace_probe_provide(&desc, NULL);
15597 mutex_exit(&dtrace_provider_lock);
15598 desc.dtpd_id++;
15599 }
15600
15601 if (cmd == DTRACEIOC_PROBEMATCH) {
15602 dtrace_probekey(&desc, &pkey);
15603 pkey.dtpk_id = DTRACE_IDNONE;
15604 }
15605
15606 dtrace_cred2priv(cr, &priv, &uid, &zoneid);
15607
15608 mutex_enter(&dtrace_lock);
15609
15610 if (cmd == DTRACEIOC_PROBEMATCH) {
15611 for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
15612 if ((probe = dtrace_probes[i - 1]) != NULL &&
15613 (m = dtrace_match_probe(probe, &pkey,
15614 priv, uid, zoneid)) != 0)
15615 break;
15616 }
15617
15618 if (m < 0) {
15619 mutex_exit(&dtrace_lock);
15620 return (EINVAL);
15621 }
15622
15623 } else {
15624 for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
15625 if ((probe = dtrace_probes[i - 1]) != NULL &&
15626 dtrace_match_priv(probe, priv, uid, zoneid))
15627 break;
15628 }
15629 }
15630
15631 if (probe == NULL) {
15632 mutex_exit(&dtrace_lock);
15633 return (ESRCH);
15634 }
15635
15636 dtrace_probe_description(probe, &desc);
15637 mutex_exit(&dtrace_lock);
15638
15639 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
15640 return (EFAULT);
15641
15642 return (0);
15643 }
15644
15645 case DTRACEIOC_PROBEARG: {
15646 dtrace_argdesc_t desc;
15647 dtrace_probe_t *probe;
15648 dtrace_provider_t *prov;
15649
15650 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
15651 return (EFAULT);
15652
15653 if (desc.dtargd_id == DTRACE_IDNONE)
15654 return (EINVAL);
15655
15656 if (desc.dtargd_ndx == DTRACE_ARGNONE)
15657 return (EINVAL);
15658
15659 mutex_enter(&dtrace_provider_lock);
15660 mutex_enter(&mod_lock);
15661 mutex_enter(&dtrace_lock);
15662
15663 if (desc.dtargd_id > dtrace_nprobes) {
15664 mutex_exit(&dtrace_lock);
15665 mutex_exit(&mod_lock);
15666 mutex_exit(&dtrace_provider_lock);
15667 return (EINVAL);
15668 }
15669
15670 if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
15671 mutex_exit(&dtrace_lock);
15672 mutex_exit(&mod_lock);
15673 mutex_exit(&dtrace_provider_lock);
15674 return (EINVAL);
15675 }
15676
15677 mutex_exit(&dtrace_lock);
15678
15679 prov = probe->dtpr_provider;
15680
15681 if (prov->dtpv_pops.dtps_getargdesc == NULL) {
15682 /*
15683 * There isn't any typed information for this probe.
15684 * Set the argument number to DTRACE_ARGNONE.
15685 */
15686 desc.dtargd_ndx = DTRACE_ARGNONE;
15687 } else {
15688 desc.dtargd_native[0] = '\0';
15689 desc.dtargd_xlate[0] = '\0';
15690 desc.dtargd_mapping = desc.dtargd_ndx;
15691
15692 prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
15693 probe->dtpr_id, probe->dtpr_arg, &desc);
15694 }
15695
15696 mutex_exit(&mod_lock);
15697 mutex_exit(&dtrace_provider_lock);
15698
15699 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
15700 return (EFAULT);
15701
15702 return (0);
15703 }
15704
15705 case DTRACEIOC_GO: {
15706 processorid_t cpuid;
15707 rval = dtrace_state_go(state, &cpuid);
15708
15709 if (rval != 0)
15710 return (rval);
15711
15712 if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
15713 return (EFAULT);
15714
15715 return (0);
15716 }
15717
15718 case DTRACEIOC_STOP: {
15719 processorid_t cpuid;
15720
15721 mutex_enter(&dtrace_lock);
15722 rval = dtrace_state_stop(state, &cpuid);
15723 mutex_exit(&dtrace_lock);
15724
15725 if (rval != 0)
15726 return (rval);
15727
15728 if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
15729 return (EFAULT);
15730
15731 return (0);
15732 }
15733
15734 case DTRACEIOC_DOFGET: {
15735 dof_hdr_t hdr, *dof;
15736 uint64_t len;
15737
15738 if (copyin((void *)arg, &hdr, sizeof (hdr)) != 0)
15739 return (EFAULT);
15740
15741 mutex_enter(&dtrace_lock);
15742 dof = dtrace_dof_create(state);
15743 mutex_exit(&dtrace_lock);
15744
		len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
		rval = copyout(dof, (void *)arg, len);
		dtrace_dof_destroy(dof);

		return (rval == 0 ? 0 : EFAULT);
	}

	case DTRACEIOC_AGGSNAP:
	case DTRACEIOC_BUFSNAP: {
		dtrace_bufdesc_t desc;
		caddr_t cached;
		dtrace_buffer_t *buf;

		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
			return (EFAULT);

		if (/*VBox value is unsigned: desc.dtbd_cpu < 0 ||*/ desc.dtbd_cpu >= NCPU)
			return (EINVAL);

		mutex_enter(&dtrace_lock);

		if (cmd == DTRACEIOC_BUFSNAP) {
			buf = &state->dts_buffer[desc.dtbd_cpu];
		} else {
			buf = &state->dts_aggbuffer[desc.dtbd_cpu];
		}

		if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
			size_t sz = buf->dtb_offset;

			if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
				mutex_exit(&dtrace_lock);
				return (EBUSY);
			}

			/*
			 * If this buffer has already been consumed, we're
			 * going to indicate that there's nothing left here
			 * to consume.
			 */
			if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
				mutex_exit(&dtrace_lock);

				desc.dtbd_size = 0;
				desc.dtbd_drops = 0;
				desc.dtbd_errors = 0;
				desc.dtbd_oldest = 0;
				sz = sizeof (desc);

				if (copyout(&desc, (void *)arg, sz) != 0)
					return (EFAULT);

				return (0);
			}

			/*
			 * If this is a ring buffer that has wrapped, we want
			 * to copy the whole thing out.
			 */
			if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
				dtrace_buffer_polish(buf);
				sz = buf->dtb_size;
			}

			if (copyout(buf->dtb_tomax, desc.dtbd_data, sz) != 0) {
				mutex_exit(&dtrace_lock);
				return (EFAULT);
			}

			desc.dtbd_size = sz;
			desc.dtbd_drops = buf->dtb_drops;
			desc.dtbd_errors = buf->dtb_errors;
			desc.dtbd_oldest = buf->dtb_xamot_offset;

			mutex_exit(&dtrace_lock);

			if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
				return (EFAULT);

			buf->dtb_flags |= DTRACEBUF_CONSUMED;

			return (0);
		}

		if (buf->dtb_tomax == NULL) {
			ASSERT(buf->dtb_xamot == NULL);
			mutex_exit(&dtrace_lock);
			return (ENOENT);
		}

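		/*
		 * Active buffers are switched rather than copied under the
		 * lock: the cross call below exchanges the per-CPU dtb_tomax
		 * (active) and dtb_xamot (inactive) buffers on the target
		 * CPU, after which the now-inactive dtb_xamot snapshot can
		 * be copied out safely.
		 */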
		cached = buf->dtb_tomax;
		ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));

#ifndef VBOX
		dtrace_xcall(desc.dtbd_cpu,
		    (dtrace_xcall_t)dtrace_buffer_switch, buf);
#else
		if ((int32_t)desc.dtbd_cpu == DTRACE_CPUALL)
			RTMpOnAll(dtrace_buffer_switch_wrapper, buf, NULL);
		else
			RTMpOnSpecific(desc.dtbd_cpu, dtrace_buffer_switch_wrapper, buf, NULL);
#endif

		state->dts_errors += buf->dtb_xamot_errors;

		/*
		 * If the buffers did not actually switch, then the cross call
		 * did not take place -- presumably because the given CPU is
		 * not in the ready set.  If this is the case, we'll return
		 * ENOENT.
		 */
		if (buf->dtb_tomax == cached) {
			ASSERT(buf->dtb_xamot != cached);
			mutex_exit(&dtrace_lock);
			return (ENOENT);
		}

		ASSERT(cached == buf->dtb_xamot);

		/*
		 * We have our snapshot; now copy it out.
		 */
		if (copyout(buf->dtb_xamot, desc.dtbd_data,
		    buf->dtb_xamot_offset) != 0) {
			mutex_exit(&dtrace_lock);
			return (EFAULT);
		}

		desc.dtbd_size = buf->dtb_xamot_offset;
		desc.dtbd_drops = buf->dtb_xamot_drops;
		desc.dtbd_errors = buf->dtb_xamot_errors;
		desc.dtbd_oldest = 0;

		mutex_exit(&dtrace_lock);

		/*
		 * Finally, copy out the buffer description.
		 */
		if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
			return (EFAULT);

		return (0);
	}

	case DTRACEIOC_CONF: {
		dtrace_conf_t conf;

		bzero(&conf, sizeof (conf));
		conf.dtc_difversion = DIF_VERSION;
		conf.dtc_difintregs = DIF_DIR_NREGS;
		conf.dtc_diftupregs = DIF_DTR_NREGS;
		conf.dtc_ctfmodel = CTF_MODEL_NATIVE;

		if (copyout(&conf, (void *)arg, sizeof (conf)) != 0)
			return (EFAULT);

		return (0);
	}

	case DTRACEIOC_STATUS: {
		dtrace_status_t stat;
		dtrace_dstate_t *dstate;
		int i, j;
		uint64_t nerrs;

		/*
		 * See the comment in dtrace_state_deadman() for the reason
		 * for setting dts_laststatus to INT64_MAX before setting
		 * it to the correct value.
		 */
		state->dts_laststatus = INT64_MAX;
		dtrace_membar_producer();
		state->dts_laststatus = dtrace_gethrtime();
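		/*
		 * (Briefly: without an atomic 64-bit store, a concurrent
		 * reader could observe a torn value; storing INT64_MAX
		 * first, with a producer barrier in between, guarantees
		 * that dts_laststatus never appears older than its true
		 * value.)
		 */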

		bzero(&stat, sizeof (stat));

		mutex_enter(&dtrace_lock);

		if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
			mutex_exit(&dtrace_lock);
			return (ENOENT);
		}

		if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
			stat.dtst_exiting = 1;

		nerrs = state->dts_errors;
		dstate = &state->dts_vstate.dtvs_dynvars;

		for (i = 0; i < NCPU; i++) {
			dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];

			stat.dtst_dyndrops += dcpu->dtdsc_drops;
			stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
			stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;

			if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
				stat.dtst_filled++;

			nerrs += state->dts_buffer[i].dtb_errors;

			for (j = 0; j < state->dts_nspeculations; j++) {
				dtrace_speculation_t *spec;
				dtrace_buffer_t *buf;

				spec = &state->dts_speculations[j];
				buf = &spec->dtsp_buffer[i];
				stat.dtst_specdrops += buf->dtb_xamot_drops;
			}
		}

		stat.dtst_specdrops_busy = state->dts_speculations_busy;
		stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
		stat.dtst_stkstroverflows = state->dts_stkstroverflows;
		stat.dtst_dblerrors = state->dts_dblerrors;
		stat.dtst_killed =
		    (state->dts_activity == DTRACE_ACTIVITY_KILLED);
		stat.dtst_errors = nerrs;

		mutex_exit(&dtrace_lock);

		if (copyout(&stat, (void *)arg, sizeof (stat)) != 0)
			return (EFAULT);

		return (0);
	}

	case DTRACEIOC_FORMAT: {
		dtrace_fmtdesc_t fmt;
		char *str;
		int len;

		if (copyin((void *)arg, &fmt, sizeof (fmt)) != 0)
			return (EFAULT);

		mutex_enter(&dtrace_lock);

		if (fmt.dtfd_format == 0 ||
		    fmt.dtfd_format > state->dts_nformats) {
			mutex_exit(&dtrace_lock);
			return (EINVAL);
		}

		/*
		 * Format strings are allocated contiguously and they are
		 * never freed; if a format index is less than the number
		 * of formats, we can assert that the format map is non-NULL
		 * and that the format for the specified index is non-NULL.
		 */
		ASSERT(state->dts_formats != NULL);
		str = state->dts_formats[fmt.dtfd_format - 1];
		ASSERT(str != NULL);

		len = VBDTCAST(int)strlen(str) + 1;

		if (len > fmt.dtfd_length) {
			fmt.dtfd_length = len;

			if (copyout(&fmt, (void *)arg, sizeof (fmt)) != 0) {
				mutex_exit(&dtrace_lock);
				return (EINVAL);
			}
		} else {
			if (copyout(str, fmt.dtfd_string, len) != 0) {
				mutex_exit(&dtrace_lock);
				return (EINVAL);
			}
		}

		mutex_exit(&dtrace_lock);
		return (0);
	}
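	/*
	 * Illustrative only: a consumer sizes its format buffer with a
	 * grow-and-retry pattern -- if dtfd_length comes back larger than
	 * what was passed in, the string did not fit.  A sketch, where
	 * `fd', `format', `buf' and `buflen' are assumed to exist:
	 *
	 *	dtrace_fmtdesc_t fmt;
	 *	bzero(&fmt, sizeof (fmt));
	 *	fmt.dtfd_format = format;
	 *	fmt.dtfd_string = buf;
	 *	fmt.dtfd_length = buflen;
	 *	if (ioctl(fd, DTRACEIOC_FORMAT, &fmt) == 0 &&
	 *	    fmt.dtfd_length > buflen) {
	 *		buf = realloc(buf, fmt.dtfd_length);
	 *		... retry with the enlarged buffer ...
	 *	}
	 */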

	default:
		break;
	}

	return (ENOTTY);
}

#ifdef VBOX
int dtrace_detach(void)
#else
/*ARGSUSED*/
static int
dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
#endif
{
	dtrace_state_t *state;

#ifndef VBOX
	switch (cmd) {
	case DDI_DETACH:
		break;

	case DDI_SUSPEND:
		return (DDI_SUCCESS);

	default:
		return (DDI_FAILURE);
	}
#endif

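	/*
	 * The lock acquisition order here mirrors dtrace_attach():
	 * cpu_lock first, then dtrace_provider_lock, then dtrace_lock.
	 */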
	mutex_enter(&cpu_lock);
	mutex_enter(&dtrace_provider_lock);
	mutex_enter(&dtrace_lock);

	ASSERT(dtrace_opens == 0);

	if (dtrace_helpers > 0) {
		mutex_exit(&dtrace_provider_lock);
		mutex_exit(&dtrace_lock);
		mutex_exit(&cpu_lock);
		return (DDI_FAILURE);
	}

	if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) {
		mutex_exit(&dtrace_provider_lock);
		mutex_exit(&dtrace_lock);
		mutex_exit(&cpu_lock);
		return (DDI_FAILURE);
	}

	dtrace_provider = NULL;

	if ((state = dtrace_anon_grab()) != NULL) {
		/*
		 * If there were ECBs on this state, the provider should not
		 * have been allowed to detach; assert that there are none.
		 */
		ASSERT(state->dts_necbs == 0);
		dtrace_state_destroy(state);

#ifndef VBOX
		/*
		 * If we're being detached with anonymous state, we need to
		 * indicate to the kernel debugger that DTrace is now inactive.
		 */
		(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
#endif
	}

	bzero(&dtrace_anon, sizeof (dtrace_anon_t));
#ifndef VBOX /** @todo CPU hooks */
	unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
	dtrace_cpu_init = NULL;
	dtrace_helpers_cleanup = NULL;
	dtrace_helpers_fork = NULL;
	dtrace_cpustart_init = NULL;
	dtrace_cpustart_fini = NULL;
	dtrace_debugger_init = NULL;
	dtrace_debugger_fini = NULL;
	dtrace_modload = NULL;
	dtrace_modunload = NULL;
#endif

	mutex_exit(&cpu_lock);

	if (dtrace_helptrace_enabled) {
		kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize);
		dtrace_helptrace_buffer = NULL;
	}

	kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
	dtrace_probes = NULL;
	dtrace_nprobes = 0;

	dtrace_hash_destroy(dtrace_bymod);
	dtrace_hash_destroy(dtrace_byfunc);
	dtrace_hash_destroy(dtrace_byname);
	dtrace_bymod = NULL;
	dtrace_byfunc = NULL;
	dtrace_byname = NULL;

	kmem_cache_destroy(dtrace_state_cache);
#ifndef VBOX
	vmem_destroy(dtrace_minor);
#endif
	vmem_destroy(dtrace_arena);

	if (dtrace_toxrange != NULL) {
		kmem_free(dtrace_toxrange,
		    dtrace_toxranges_max * sizeof (dtrace_toxrange_t));
		dtrace_toxrange = NULL;
		dtrace_toxranges = 0;
		dtrace_toxranges_max = 0;
	}

#ifndef VBOX
	ddi_remove_minor_node(dtrace_devi, NULL);
	dtrace_devi = NULL;

	ddi_soft_state_fini(&dtrace_softstate);
#endif

	ASSERT(dtrace_vtime_references == 0);
	ASSERT(dtrace_opens == 0);
	ASSERT(dtrace_retained == NULL);

	mutex_exit(&dtrace_lock);
	mutex_exit(&dtrace_provider_lock);
#ifdef VBOX
	VBoxDtMutexDelete(&dtrace_lock);
	VBoxDtMutexDelete(&dtrace_provider_lock);
	VBoxDtMutexDelete(&dtrace_meta_lock);
# ifdef DEBUG
	VBoxDtMutexDelete(&dtrace_errlock);
# endif
#endif

	/*
	 * We don't destroy the task queue until after we have dropped our
	 * locks (taskq_destroy() may block on running tasks).  To prevent
	 * attempting to do work after we have effectively detached but before
	 * the task queue has been destroyed, all tasks dispatched via the
	 * task queue must check that DTrace is still attached before
	 * performing any operation.
	 */
#ifndef VBOX
	taskq_destroy(dtrace_taskq);
	dtrace_taskq = NULL;
#endif

	return (DDI_SUCCESS);
}

#ifndef VBOX
/*ARGSUSED*/
static int
dtrace_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	int error;

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)dtrace_devi;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
	}
	return (error);
}

static struct cb_ops dtrace_cb_ops = {
	dtrace_open,		/* open */
	dtrace_close,		/* close */
	nulldev,		/* strategy */
	nulldev,		/* print */
	nodev,			/* dump */
	nodev,			/* read */
	nodev,			/* write */
	dtrace_ioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	nochpoll,		/* poll */
	ddi_prop_op,		/* cb_prop_op */
	0,			/* streamtab */
	D_NEW | D_MP		/* Driver compatibility flag */
};

static struct dev_ops dtrace_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* refcnt */
	dtrace_info,		/* get_dev_info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	dtrace_attach,		/* attach */
	dtrace_detach,		/* detach */
	nodev,			/* reset */
	&dtrace_cb_ops,		/* driver operations */
	NULL,			/* bus operations */
	nodev,			/* dev power */
	ddi_quiesce_not_needed,	/* quiesce */
};

static struct modldrv modldrv = {
	&mod_driverops,		/* module type (this is a pseudo driver) */
	"Dynamic Tracing",	/* name of module */
	&dtrace_ops,		/* driver ops */
};

static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modldrv,
	NULL
};

int
_init(void)
{
	return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	return (mod_remove(&modlinkage));
}

#endif /* !VBOX */