VirtualBox

source: vbox/trunk/src/VBox/ExtPacks/VBoxDTrace/onnv/uts/common/dtrace/dtrace.c@53647

Last change on this file was 53647, checked in by vboxsync, 10 years ago

VBoxDTrace: darwin build fixes. (r17)

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 401.4 KB
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26/*
27 * DTrace - Dynamic Tracing for Solaris
28 *
29 * This is the implementation of the Solaris Dynamic Tracing framework
30 * (DTrace). The user-visible interface to DTrace is described at length in
31 * the "Solaris Dynamic Tracing Guide". The interfaces between the libdtrace
32 * library, the in-kernel DTrace framework, and the DTrace providers are
33 * described in the block comments in the <sys/dtrace.h> header file. The
34 * internal architecture of DTrace is described in the block comments in the
35 * <sys/dtrace_impl.h> header file. The comments contained within the DTrace
36 * implementation very much assume mastery of all of these sources; if one has
37 * an unanswered question about the implementation, one should consult them
38 * first.
39 *
40 * The functions here are ordered roughly as follows:
41 *
42 * - Probe context functions
43 * - Probe hashing functions
44 * - Non-probe context utility functions
45 * - Matching functions
46 * - Provider-to-Framework API functions
47 * - Probe management functions
48 * - DIF object functions
49 * - Format functions
50 * - Predicate functions
51 * - ECB functions
52 * - Buffer functions
53 * - Enabling functions
54 * - DOF functions
55 * - Anonymous enabling functions
56 * - Consumer state functions
57 * - Helper functions
58 * - Hook functions
59 * - Driver cookbook functions
60 *
61 * Each group of functions begins with a block comment labelled the "DTrace
62 * [Group] Functions", allowing one to find each block by searching forward
63 * on capital-f functions.
64 */
65#ifndef VBOX
66#include <sys/errno.h>
67#include <sys/stat.h>
68#include <sys/modctl.h>
69#include <sys/conf.h>
70#include <sys/systm.h>
71#include <sys/ddi.h>
72#include <sys/sunddi.h>
73#include <sys/cpuvar.h>
74#include <sys/kmem.h>
75#include <sys/strsubr.h>
76#include <sys/sysmacros.h>
77#include <sys/dtrace_impl.h>
78#include <sys/atomic.h>
79#include <sys/cmn_err.h>
80#include <sys/mutex_impl.h>
81#include <sys/rwlock_impl.h>
82#include <sys/ctf_api.h>
83#include <sys/panic.h>
84#include <sys/priv_impl.h>
85#include <sys/policy.h>
86#include <sys/cred_impl.h>
87#include <sys/procfs_isa.h>
88#include <sys/taskq.h>
89#include <sys/mkdev.h>
90#include <sys/kdi.h>
91#include <sys/zone.h>
92#include <sys/socket.h>
93#include <netinet/in.h>
94
95#else /* VBOX */
96# include <sys/dtrace_impl.h>
97# include <iprt/assert.h>
98# include <iprt/cpuset.h>
99# include <iprt/mp.h>
100# include <iprt/string.h>
101# include <iprt/process.h>
102# include <iprt/thread.h>
103# include <iprt/timer.h>
104# include <limits.h>
105
106/*
107 * Use asm.h to implement some of the simple stuff in dtrace_asm.s.
108 */
109# include <iprt/asm.h>
110# include <iprt/asm-amd64-x86.h>
111# define dtrace_casptr(a_ppvDst, a_pvOld, a_pvNew) \
112 VBoxDtCompareAndSwapPtr((void * volatile *)a_ppvDst, a_pvOld, a_pvNew)
113DECLINLINE(void *) VBoxDtCompareAndSwapPtr(void * volatile *ppvDst, void *pvOld, void *pvNew)
114{
115 void *pvRet;
116 ASMAtomicCmpXchgExPtrVoid(ppvDst, pvNew, pvOld, &pvRet);
117 return pvRet;
118}
119
120# define dtrace_cas32(a_pu32Dst, a_pu32Old, a_pu32New) \
121 VBoxDtCompareAndSwapU32(a_pu32Dst, a_pu32Old, a_pu32New)
122DECLINLINE(uint32_t) VBoxDtCompareAndSwapU32(uint32_t volatile *pu32Dst, uint32_t u32Old, uint32_t u32New)
123{
124 uint32_t u32Ret;
125 ASMAtomicCmpXchgExU32(pu32Dst, u32New, u32Old, &u32Ret);
126 return u32Ret;
127}
128
129#define dtrace_membar_consumer() ASMReadFence()
130#define dtrace_membar_producer() ASMWriteFence()
131#define dtrace_interrupt_disable() ASMIntDisableFlags()
132#define dtrace_interrupt_enable(a_EFL) ASMSetFlags(a_EFL)
133
134/*
135 * NULL must be set to 0 or we'll end up with a billion warnings (= errors).
136 */
137# undef NULL
138# define NULL (0)
139#endif /* VBOX */
140
141/*
142 * DTrace Tunable Variables
143 *
144 * The following variables may be tuned by adding a line to /etc/system that
145 * includes both the name of the DTrace module ("dtrace") and the name of the
146 * variable. For example:
147 *
148 * set dtrace:dtrace_destructive_disallow = 1
149 *
150 * In general, the only variables that one should be tuning this way are those
151 * that affect system-wide DTrace behavior, and for which the default behavior
152 * is undesirable. Most of these variables are tunable on a per-consumer
153 * basis using DTrace options, and need not be tuned on a system-wide basis.
154 * When tuning these variables, avoid pathological values; while some attempt
155 * is made to verify the integrity of these variables, they are not considered
156 * part of the supported interface to DTrace, and they are therefore not
157 * checked comprehensively. Further, these variables should not be tuned
158 * dynamically via "mdb -kw" or other means; they should only be tuned via
159 * /etc/system.
160 */
161int dtrace_destructive_disallow = 0;
162dtrace_optval_t dtrace_nonroot_maxsize = (16 * 1024 * 1024);
163size_t dtrace_difo_maxsize = (256 * 1024);
164dtrace_optval_t dtrace_dof_maxsize = (256 * 1024);
165size_t dtrace_global_maxsize = (16 * 1024);
166size_t dtrace_actions_max = (16 * 1024);
167size_t dtrace_retain_max = 1024;
168dtrace_optval_t dtrace_helper_actions_max = 32;
169dtrace_optval_t dtrace_helper_providers_max = 32;
170dtrace_optval_t dtrace_dstate_defsize = (1 * 1024 * 1024);
171size_t dtrace_strsize_default = 256;
172dtrace_optval_t dtrace_cleanrate_default = 9900990; /* 101 hz */
173dtrace_optval_t dtrace_cleanrate_min = 200000; /* 5000 hz */
174dtrace_optval_t dtrace_cleanrate_max = (uint64_t)60 * NANOSEC; /* 1/minute */
175dtrace_optval_t dtrace_aggrate_default = NANOSEC; /* 1 hz */
176dtrace_optval_t dtrace_statusrate_default = NANOSEC; /* 1 hz */
177dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC; /* 6/minute */
178dtrace_optval_t dtrace_switchrate_default = NANOSEC; /* 1 hz */
179dtrace_optval_t dtrace_nspec_default = 1;
180dtrace_optval_t dtrace_specsize_default = 32 * 1024;
181dtrace_optval_t dtrace_stackframes_default = 20;
182dtrace_optval_t dtrace_ustackframes_default = 20;
183dtrace_optval_t dtrace_jstackframes_default = 50;
184dtrace_optval_t dtrace_jstackstrsize_default = 512;
185int dtrace_msgdsize_max = 128;
186hrtime_t dtrace_chill_max = 500 * (NANOSEC / MILLISEC); /* 500 ms */
187hrtime_t dtrace_chill_interval = NANOSEC; /* 1000 ms */
188int dtrace_devdepth_max = 32;
189int dtrace_err_verbose;
190hrtime_t dtrace_deadman_interval = NANOSEC;
191hrtime_t dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
192hrtime_t dtrace_deadman_user = (hrtime_t)30 * NANOSEC;
193
194/*
195 * DTrace External Variables
196 *
197 * As dtrace(7D) is a kernel module, any DTrace variables are obviously
198 * available to DTrace consumers via the backtick (`) syntax. One of these,
199 * dtrace_zero, is made deliberately so: it is provided as a source of
200 * well-known, zero-filled memory. While this variable is not documented,
201 * it is used by some translators as an implementation detail.
202 */
203const char dtrace_zero[256] = { 0 }; /* zero-filled memory */
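/*
 * Illustrative usage sketch (an assumption, not part of the original file):
 * a DTrace consumer can read this variable with the backtick syntax, e.g.
 *
 *      # dtrace -n 'BEGIN { trace(`dtrace_zero[0]); exit(0) }'
 *
 * which traces the first (necessarily zero) byte of the array.
 */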
204
205/*
206 * DTrace Internal Variables
207 */
208static dev_info_t *dtrace_devi; /* device info */
209static vmem_t *dtrace_arena; /* probe ID arena */
210static vmem_t *dtrace_minor; /* minor number arena */
211#ifndef VBOX
212static taskq_t *dtrace_taskq; /* task queue */
213#endif
214static dtrace_probe_t **dtrace_probes; /* array of all probes */
215static VBDTTYPE(uint32_t,int) dtrace_nprobes; /* number of probes */
216static dtrace_provider_t *dtrace_provider; /* provider list */
217static dtrace_meta_t *dtrace_meta_pid; /* user-land meta provider */
218static int dtrace_opens; /* number of opens */
219static int dtrace_helpers; /* number of helpers */
220static void *dtrace_softstate; /* softstate pointer */
221static dtrace_hash_t *dtrace_bymod; /* probes hashed by module */
222static dtrace_hash_t *dtrace_byfunc; /* probes hashed by function */
223static dtrace_hash_t *dtrace_byname; /* probes hashed by name */
224static dtrace_toxrange_t *dtrace_toxrange; /* toxic range array */
225static int dtrace_toxranges; /* number of toxic ranges */
226static int dtrace_toxranges_max; /* size of toxic range array */
227static dtrace_anon_t dtrace_anon; /* anonymous enabling */
228static kmem_cache_t *dtrace_state_cache; /* cache for dynamic state */
229static uint64_t dtrace_vtime_references; /* number of vtimestamp refs */
230static kthread_t *dtrace_panicked; /* panicking thread */
231static dtrace_ecb_t *dtrace_ecb_create_cache; /* cached created ECB */
232static dtrace_genid_t dtrace_probegen; /* current probe generation */
233static dtrace_helpers_t *dtrace_deferred_pid; /* deferred helper list */
234static dtrace_enabling_t *dtrace_retained; /* list of retained enablings */
235static dtrace_genid_t dtrace_retained_gen; /* current retained enab gen */
236static dtrace_dynvar_t dtrace_dynhash_sink; /* end of dynamic hash chains */
237static int dtrace_dynvar_failclean; /* dynvars failed to clean */
238
239/*
240 * DTrace Locking
241 * DTrace is protected by three (relatively coarse-grained) locks:
242 *
243 * (1) dtrace_lock is required to manipulate essentially any DTrace state,
244 * including enabling state, probes, ECBs, consumer state, helper state,
245 * etc. Importantly, dtrace_lock is _not_ required when in probe context;
246 * probe context is lock-free -- synchronization is handled via the
247 * dtrace_sync() cross call mechanism.
248 *
249 * (2) dtrace_provider_lock is required when manipulating provider state, or
250 * when provider state must be held constant.
251 *
252 * (3) dtrace_meta_lock is required when manipulating meta provider state, or
253 * when meta provider state must be held constant.
254 *
255 * The lock ordering between these three locks is dtrace_meta_lock before
256 * dtrace_provider_lock before dtrace_lock. (In particular, there are
257 * several places where dtrace_provider_lock is held by the framework as it
258 * calls into the providers -- which then call back into the framework,
259 * grabbing dtrace_lock.)
260 *
261 * There are two other locks in the mix: mod_lock and cpu_lock. With respect
262 * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
263 * role as a coarse-grained lock; it is acquired before both of these locks.
264 * With respect to dtrace_meta_lock, its behavior is stranger: cpu_lock must
265 * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
266 * mod_lock is similar with respect to dtrace_provider_lock in that it must be
267 * acquired _between_ dtrace_provider_lock and dtrace_lock.
268 */
269static kmutex_t dtrace_lock; /* probe state lock */
270static kmutex_t dtrace_provider_lock; /* provider state lock */
271static kmutex_t dtrace_meta_lock; /* meta-provider state lock */
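
/*
 * Illustrative sketch (not part of the original file): the full acquisition
 * order described above, assuming the standard mutex_enter()/mutex_exit()
 * interfaces. cpu_lock sits between dtrace_meta_lock and the other DTrace
 * locks; mod_lock sits between dtrace_provider_lock and dtrace_lock.
 */
#if 0
	mutex_enter(&dtrace_meta_lock);		/* outermost */
	mutex_enter(&cpu_lock);
	mutex_enter(&dtrace_provider_lock);
	mutex_enter(&mod_lock);
	mutex_enter(&dtrace_lock);		/* innermost */
	/* ... manipulate DTrace state ... */
	mutex_exit(&dtrace_lock);		/* release in reverse order */
	mutex_exit(&mod_lock);
	mutex_exit(&dtrace_provider_lock);
	mutex_exit(&cpu_lock);
	mutex_exit(&dtrace_meta_lock);
#endif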
272
273/*
274 * DTrace Provider Variables
275 *
276 * These are the variables relating to DTrace as a provider (that is, the
277 * provider of the BEGIN, END, and ERROR probes).
278 */
279static dtrace_pattr_t dtrace_provider_attr = {
280{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
281{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
282{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
283{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
284{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
285};
286
287static void
288dtrace_nullop(void)
289{}
290
291static int
292dtrace_enable_nullop(void)
293{
294 return (0);
295}
296
297static dtrace_pops_t dtrace_provider_ops = {
298 (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop,
299 (void (*)(void *, struct modctl *))dtrace_nullop,
300 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop,
301 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
302 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
303 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
304 NULL,
305 NULL,
306 NULL,
307 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop
308};
309
310static dtrace_id_t dtrace_probeid_begin; /* special BEGIN probe */
311static dtrace_id_t dtrace_probeid_end; /* special END probe */
312dtrace_id_t dtrace_probeid_error; /* special ERROR probe */
313
314/*
315 * DTrace Helper Tracing Variables
316 */
317uint32_t dtrace_helptrace_next = 0;
318uint32_t dtrace_helptrace_nlocals;
319char *dtrace_helptrace_buffer;
320int dtrace_helptrace_bufsize = 512 * 1024;
321
322#ifdef DEBUG
323int dtrace_helptrace_enabled = 1;
324#else
325int dtrace_helptrace_enabled = 0;
326#endif
327
328/*
329 * DTrace Error Hashing
330 *
331 * On DEBUG kernels, DTrace will track the errors that it has seen in a hash
332 * table. This is very useful for checking coverage of tests that are
333 * expected to induce DIF or DOF processing errors, and may be useful for
334 * debugging problems in the DIF code generator or in DOF generation. The
335 * error hash may be examined with the ::dtrace_errhash MDB dcmd.
336 */
337#ifdef DEBUG
338static dtrace_errhash_t dtrace_errhash[DTRACE_ERRHASHSZ];
339static const char *dtrace_errlast;
340static kthread_t *dtrace_errthread;
341static kmutex_t dtrace_errlock;
342#endif
343
344/*
345 * DTrace Macros and Constants
346 *
347 * These are various macros that are useful in various spots in the
348 * implementation, along with a few random constants that have no meaning
349 * outside of the implementation. There is no real structure to this cpp
350 * mishmash -- but is there ever?
351 */
352#define DTRACE_HASHSTR(hash, probe) \
353 dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs)))
354
355#define DTRACE_HASHNEXT(hash, probe) \
356 (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs)
357
358#define DTRACE_HASHPREV(hash, probe) \
359 (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs)
360
361#define DTRACE_HASHEQ(hash, lhs, rhs) \
362 (strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \
363 *((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0)
364
365#define DTRACE_AGGHASHSIZE_SLEW 17
366
367#define DTRACE_V4MAPPED_OFFSET (sizeof (uint32_t) * 3)
368
369/*
370 * The key for a thread-local variable consists of the lower 61 bits of the
371 * t_did, plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
372 * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
373 * equal to a variable identifier. This is necessary (but not sufficient) to
374 * assure that global associative arrays never collide with thread-local
375 * variables. To guarantee that they cannot collide, we must also define the
376 * order for keying dynamic variables. That order is:
377 *
378 * [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
379 *
380 * Because the variable-key and the tls-key are in orthogonal spaces, there is
381 * no way for a global variable key signature to match a thread-local key
382 * signature.
383 */
384#ifndef VBOX
385#define DTRACE_TLS_THRKEY(where) { \
386 uint_t intr = 0; \
387 uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \
388 for (; actv; actv >>= 1) \
389 intr++; \
390 ASSERT(intr < (1 << 3)); \
391 (where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \
392 (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
393}
394#else
395#define DTRACE_TLS_THRKEY(where) do { \
396 (where) = (((uintptr_t)RTThreadNativeSelf() + DIF_VARIABLE_MAX) & (RT_BIT_64(61) - 1)) \
397 | (RTThreadIsInInterrupt(NIL_RTTHREAD) ? RT_BIT_64(61) : 0); \
398} while (0)
399#endif
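
/*
 * Key layout sketch (illustrative, not part of the original file): on
 * Solaris the three interrupt-level bits occupy bits 61-63 and the offset
 * thread ID occupies bits 0-60:
 *
 *      63        61 60                                        0
 *      +-----------+-------------------------------------------+
 *      | intr level|    (t_did + DIF_VARIABLE_MAX) mod 2^61    |
 *      +-----------+-------------------------------------------+
 *
 * The VBox variant only distinguishes interrupt context, setting bit 61
 * when RTThreadIsInInterrupt() reports an interrupt.
 */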
400
401#define DT_BSWAP_8(x) ((x) & 0xff)
402#define DT_BSWAP_16(x) ((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
403#define DT_BSWAP_32(x) ((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
404#define DT_BSWAP_64(x) ((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))
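
/*
 * Worked example (illustrative, not part of the original file):
 * DT_BSWAP_16(0x1234) expands to (0x34 << 8) | 0x12 == 0x3412, and
 * DT_BSWAP_32(0x12345678) == 0x78563412 -- each level swaps the two
 * halves produced by the level below it.
 */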
405
406#define DT_MASK_LO 0x00000000FFFFFFFFULL
407
408#define DTRACE_STORE(type, tomax, offset, what) \
409 *((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);
410
411#ifndef __i386
412#define DTRACE_ALIGNCHECK(addr, size, flags) \
413 if (addr & (size - 1)) { \
414 *flags |= CPU_DTRACE_BADALIGN; \
415 cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_illval = addr; \
416 return (0); \
417 }
418#else
419#define DTRACE_ALIGNCHECK(addr, size, flags)
420#endif
421
422/*
423 * Test whether a range of memory starting at testaddr of size testsz falls
424 * within the range of memory described by addr, sz. We take care to avoid
425 * problems with overflow and underflow of the unsigned quantities, and
426 * disallow all negative sizes. Ranges of size 0 are allowed.
427 */
428#define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
429 ((testaddr) - (baseaddr) < (basesz) && \
430 (testaddr) + (testsz) - (baseaddr) <= (basesz) && \
431 (testaddr) + (testsz) >= (testaddr))
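
/*
 * Worked example (illustrative, not part of the original file): with
 * baseaddr = 0x1000 and basesz = 0x100, the range (0x10c0, 0x40) passes
 * all three clauses, while (0x0ff8, 0x10) fails the first clause because
 * 0x0ff8 - 0x1000 underflows to a huge unsigned value. The last clause
 * rejects ranges whose testaddr + testsz computation wraps around.
 */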
432
433/*
434 * Test whether alloc_sz bytes will fit in the scratch region. We isolate
435 * alloc_sz on the righthand side of the comparison in order to avoid overflow
436 * or underflow in the comparison with it. This is simpler than the INRANGE
437 * check above, because we know that the dtms_scratch_ptr is valid in the
438 * range. Allocations of size zero are allowed.
439 */
440#define DTRACE_INSCRATCH(mstate, alloc_sz) \
441 ((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
442 (mstate)->dtms_scratch_ptr >= (alloc_sz))
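
/*
 * Worked example (illustrative, not part of the original file): with
 * dtms_scratch_base = 0x1000, dtms_scratch_size = 0x100 and
 * dtms_scratch_ptr = 0x10c0, 0x40 bytes remain, so the check passes
 * exactly for alloc_sz <= 0x40.
 */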
443
444#define DTRACE_LOADFUNC(bits) \
445/*CSTYLED*/ \
446VBDTSTATIC uint##bits##_t \
447dtrace_load##bits(uintptr_t addr) \
448{ \
449 size_t size = bits / NBBY; \
450 /*CSTYLED*/ \
451 uint##bits##_t rval; \
452 int i; \
453 processorid_t me = VBDT_GET_CPUID(); \
454 volatile uint16_t *flags = (volatile uint16_t *) \
455 &cpu_core[me].cpuc_dtrace_flags; \
456 \
457 DTRACE_ALIGNCHECK(addr, size, flags); \
458 \
459 for (i = 0; i < dtrace_toxranges; i++) { \
460 if (addr >= dtrace_toxrange[i].dtt_limit) \
461 continue; \
462 \
463 if (addr + size <= dtrace_toxrange[i].dtt_base) \
464 continue; \
465 \
466 /* \
467 * This address falls within a toxic region; return 0. \
468 */ \
469 *flags |= CPU_DTRACE_BADADDR; \
470 cpu_core[me].cpuc_dtrace_illval = addr; \
471 return (0); \
472 } \
473 \
474 *flags |= CPU_DTRACE_NOFAULT; \
475 /*CSTYLED*/ \
476 rval = *((volatile uint##bits##_t *)addr); \
477 *flags &= ~CPU_DTRACE_NOFAULT; \
478 \
479 return (!(*flags & CPU_DTRACE_FAULT) ? rval : 0); \
480}
481
482#ifdef _LP64
483#define dtrace_loadptr dtrace_load64
484#else
485#define dtrace_loadptr dtrace_load32
486#endif
487
488#define DTRACE_DYNHASH_FREE 0
489#define DTRACE_DYNHASH_SINK 1
490#define DTRACE_DYNHASH_VALID 2
491
492#define DTRACE_MATCH_FAIL -1
493#define DTRACE_MATCH_NEXT 0
494#define DTRACE_MATCH_DONE 1
495#define DTRACE_ANCHORED(probe) ((probe)->dtpr_func[0] != '\0')
496#define DTRACE_STATE_ALIGN 64
497
498#define DTRACE_FLAGS2FLT(flags) \
499 (((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR : \
500 ((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP : \
501 ((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO : \
502 ((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV : \
503 ((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV : \
504 ((flags) & CPU_DTRACE_TUPOFLOW) ? DTRACEFLT_TUPOFLOW : \
505 ((flags) & CPU_DTRACE_BADALIGN) ? DTRACEFLT_BADALIGN : \
506 ((flags) & CPU_DTRACE_NOSCRATCH) ? DTRACEFLT_NOSCRATCH : \
507 ((flags) & CPU_DTRACE_BADSTACK) ? DTRACEFLT_BADSTACK : \
508 DTRACEFLT_UNKNOWN)
509
510#define DTRACEACT_ISSTRING(act) \
511 ((act)->dta_kind == DTRACEACT_DIFEXPR && \
512 (act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)
513
514static size_t dtrace_strlen(const char *, size_t);
515static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
516static void dtrace_enabling_provide(dtrace_provider_t *);
517static int dtrace_enabling_match(dtrace_enabling_t *, int *);
518static void dtrace_enabling_matchall(void);
519static dtrace_state_t *dtrace_anon_grab(void);
520static uint64_t dtrace_helper(int, dtrace_mstate_t *,
521 dtrace_state_t *, uint64_t, uint64_t);
522static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
523static void dtrace_buffer_drop(dtrace_buffer_t *);
524static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
525 dtrace_state_t *, dtrace_mstate_t *);
526static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
527 dtrace_optval_t);
528static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);
529static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
530
531/*
532 * DTrace Probe Context Functions
533 *
534 * These functions are called from probe context. Because probe context is
535 * any context in which C may be called, arbitrary locks may be held,
536 * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
537 * As a result, functions called from probe context may only call other DTrace
538 * support functions -- they may not interact at all with the system at large.
539 * (Note that the ASSERT macro is made probe-context safe by redefining it in
540 * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
541 * loads are to be performed from probe context, they _must_ be in terms of
542 * the safe dtrace_load*() variants.
543 *
544 * Some functions in this block are not actually called from probe context;
545 * for these functions, there will be a comment above the function reading
546 * "Note: not called from probe context."
547 */
548void
549dtrace_panic(const char *format, ...)
550{
551 va_list alist;
552
553 va_start(alist, format);
554 dtrace_vpanic(format, alist);
555 va_end(alist);
556}
557
558int
559dtrace_assfail(const char *a, const char *f, int l)
560{
561 dtrace_panic("assertion failed: %s, file: %s, line: %d", a, f, l);
562
563 /*
564 * We just need something here that even the most clever compiler
565 * cannot optimize away.
566 */
567 return (a[(uintptr_t)f]);
568}
569
570/*
571 * Atomically increment a specified error counter from probe context.
572 */
573static void
574dtrace_error(uint32_t *counter)
575{
576 /*
577 * Most counters stored to in probe context are per-CPU counters.
578 * However, there are some error conditions that are sufficiently
579 * arcane that they don't merit per-CPU storage. If these counters
580 * are incremented concurrently on different CPUs, scalability will be
581 * adversely affected -- but we don't expect them to be white-hot in a
582 * correctly constructed enabling...
583 */
584 uint32_t oval, nval;
585
586 do {
587 oval = *counter;
588
589 if ((nval = oval + 1) == 0) {
590 /*
591 * If the counter would wrap, set it to 1 -- assuring
592 * that the counter is never zero when we have seen
593 * errors. (The counter must be 32-bits because we
594 * aren't guaranteed a 64-bit compare&swap operation.)
595 * To save this code both the infamy of being fingered
596 * by a priggish news story and the indignity of being
597 * the target of a neo-puritan witch trial, we're
598 * carefully avoiding any colorful description of the
599 * likelihood of this condition -- but suffice it to
600 * say that it is only slightly more likely than the
601 * overflow of predicate cache IDs, as discussed in
602 * dtrace_predicate_create().
603 */
604 nval = 1;
605 }
606 } while (dtrace_cas32(counter, oval, nval) != oval);
607}
608
609/*
610 * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
611 * uint8_t, a uint16_t, a uint32_t and a uint64_t.
612 */
613DTRACE_LOADFUNC(8)
614DTRACE_LOADFUNC(16)
615DTRACE_LOADFUNC(32)
616DTRACE_LOADFUNC(64)
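
/*
 * Illustrative usage sketch (an assumption, not part of the original file):
 * probe-context code performs arbitrary loads through the generated
 * dtrace_load*() functions and then consults the per-CPU fault flag
 * instead of risking a trap.
 */
#if 0
static uint32_t
example_peek32(uintptr_t addr)
{
	uint32_t val = dtrace_load32(addr);

	if (cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags & CPU_DTRACE_FAULT)
		return (0);	/* the load faulted or hit a toxic range */
	return (val);
}
#endif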
617
618static int
619dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
620{
621 if (dest < mstate->dtms_scratch_base)
622 return (0);
623
624 if (dest + size < dest)
625 return (0);
626
627 if (dest + size > mstate->dtms_scratch_ptr)
628 return (0);
629
630 return (1);
631}
632
633static int
634dtrace_canstore_statvar(uint64_t addr, size_t sz,
635 dtrace_statvar_t **svars, int nsvars)
636{
637 int i;
638
639 for (i = 0; i < nsvars; i++) {
640 dtrace_statvar_t *svar = svars[i];
641
642 if (svar == NULL || svar->dtsv_size == 0)
643 continue;
644
645 if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size))
646 return (1);
647 }
648
649 return (0);
650}
651
652/*
653 * Check to see if the address is within a memory region to which a store may
654 * be issued. This includes the DTrace scratch areas, and any DTrace variable
655 * region. The caller of dtrace_canstore() is responsible for performing any
656 * alignment checks that are needed before stores are actually executed.
657 */
658static int
659dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
660 dtrace_vstate_t *vstate)
661{
662 /*
663 * First, check to see if the address is in scratch space...
664 */
665 if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
666 mstate->dtms_scratch_size))
667 return (1);
668
669 /*
670 * Now check to see if it's a dynamic variable. This check will pick
671 * up both thread-local variables and any global dynamically-allocated
672 * variables.
673 */
674 if (DTRACE_INRANGE(addr, sz, (uintptr_t)vstate->dtvs_dynvars.dtds_base,
675 vstate->dtvs_dynvars.dtds_size)) {
676 dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
677 uintptr_t base = (uintptr_t)dstate->dtds_base +
678 (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
679 uintptr_t chunkoffs;
680
681 /*
682 * Before we assume that we can store here, we need to make
683 * sure that it isn't in our metadata -- storing to our
684 * dynamic variable metadata would corrupt our state. For
685 * the range to not include any dynamic variable metadata,
686 * it must:
687 *
688 * (1) Start above the hash table that is at the base of
689 * the dynamic variable space
690 *
691 * (2) Have a starting chunk offset that is beyond the
692 * dtrace_dynvar_t that is at the base of every chunk
693 *
694 * (3) Not span a chunk boundary
695 *
696 */
697 if (addr < base)
698 return (0);
699
700 chunkoffs = (addr - base) % dstate->dtds_chunksize;
701
702 if (chunkoffs < sizeof (dtrace_dynvar_t))
703 return (0);
704
705 if (chunkoffs + sz > dstate->dtds_chunksize)
706 return (0);
707
708 return (1);
709 }
710
711 /*
712 * Finally, check the static local and global variables. These checks
713 * take the longest, so we perform them last.
714 */
715 if (dtrace_canstore_statvar(addr, sz,
716 vstate->dtvs_locals, vstate->dtvs_nlocals))
717 return (1);
718
719 if (dtrace_canstore_statvar(addr, sz,
720 vstate->dtvs_globals, vstate->dtvs_nglobals))
721 return (1);
722
723 return (0);
724}
725
726
727/*
728 * Convenience routine to check to see if the address is within a memory
729 * region in which a load may be issued given the user's privilege level;
730 * if not, it sets the appropriate error flags and loads 'addr' into the
731 * illegal value slot.
732 *
733 * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
734 * appropriate memory access protection.
735 */
736static int
737dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
738 dtrace_vstate_t *vstate)
739{
740 volatile uintptr_t *illval = &cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_illval;
741
742 /*
743 * If we hold the privilege to read from kernel memory, then
744 * everything is readable.
745 */
746 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
747 return (1);
748
749 /*
750 * You can obviously read that which you can store.
751 */
752 if (dtrace_canstore(addr, sz, mstate, vstate))
753 return (1);
754
755 /*
756 * We're allowed to read from our own string table.
757 */
758 if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab,
759 mstate->dtms_difo->dtdo_strlen))
760 return (1);
761
762 DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
763 *illval = addr;
764 return (0);
765}
766
767/*
768 * Convenience routine to check to see if a given string is within a memory
769 * region in which a load may be issued given the user's privilege level;
770 * this exists so that we don't need to issue unnecessary dtrace_strlen()
771 * calls in the event that the user has all privileges.
772 */
773static int
774dtrace_strcanload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
775 dtrace_vstate_t *vstate)
776{
777 size_t strsz;
778
779 /*
780 * If we hold the privilege to read from kernel memory, then
781 * everything is readable.
782 */
783 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
784 return (1);
785
786 strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr, sz);
787 if (dtrace_canload(addr, strsz, mstate, vstate))
788 return (1);
789
790 return (0);
791}
792
793/*
794 * Convenience routine to check to see if a given variable is within a memory
795 * region in which a load may be issued given the user's privilege level.
796 */
797static int
798dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate,
799 dtrace_vstate_t *vstate)
800{
801 size_t sz;
802 ASSERT(type->dtdt_flags & DIF_TF_BYREF);
803
804 /*
805 * If we hold the privilege to read from kernel memory, then
806 * everything is readable.
807 */
808 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
809 return (1);
810
811 if (type->dtdt_kind == DIF_TYPE_STRING)
812 sz = dtrace_strlen(src,
813 vstate->dtvs_state->dts_options[DTRACEOPT_STRSIZE]) + 1;
814 else
815 sz = type->dtdt_size;
816
817 return (dtrace_canload((uintptr_t)src, sz, mstate, vstate));
818}
819
820/*
821 * Compare two strings using safe loads.
822 */
823static int
824dtrace_strncmp(char *s1, char *s2, size_t limit)
825{
826 uint8_t c1, c2;
827 volatile uint16_t *flags;
828
829 if (s1 == s2 || limit == 0)
830 return (0);
831
832 flags = (volatile uint16_t *)&cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags;
833
834 do {
835 if (s1 == NULL) {
836 c1 = '\0';
837 } else {
838 c1 = dtrace_load8((uintptr_t)s1++);
839 }
840
841 if (s2 == NULL) {
842 c2 = '\0';
843 } else {
844 c2 = dtrace_load8((uintptr_t)s2++);
845 }
846
847 if (c1 != c2)
848 return (c1 - c2);
849 } while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));
850
851 return (0);
852}
853
854/*
855 * Compute strlen(s) for a string using safe memory accesses. The additional
856 * lim parameter is used to specify a maximum length to ensure completion.
857 */
858static size_t
859dtrace_strlen(const char *s, size_t lim)
860{
861 uint_t len;
862
863 for (len = 0; len != lim; len++) {
864 if (dtrace_load8((uintptr_t)s++) == '\0')
865 break;
866 }
867
868 return (len);
869}
870
871/*
872 * Check if an address falls within a toxic region.
873 */
874static int
875dtrace_istoxic(uintptr_t kaddr, size_t size)
876{
877 uintptr_t taddr, tsize;
878 int i;
879
880 for (i = 0; i < dtrace_toxranges; i++) {
881 taddr = dtrace_toxrange[i].dtt_base;
882 tsize = dtrace_toxrange[i].dtt_limit - taddr;
883
884 if (kaddr - taddr < tsize) {
885 DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
886 cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_illval = kaddr;
887 return (1);
888 }
889
890 if (taddr - kaddr < size) {
891 DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
892 cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_illval = taddr;
893 return (1);
894 }
895 }
896
897 return (0);
898}
899
900/*
901 * Copy src to dst using safe memory accesses. The src is assumed to be unsafe
902 * memory specified by the DIF program. The dst is assumed to be safe memory
903 * that we can store to directly because it is managed by DTrace. As with
904 * standard bcopy, overlapping copies are handled properly.
905 */
906static void
907dtrace_bcopy(const void *src, void *dst, size_t len)
908{
909 if (len != 0) {
910 uint8_t *s1 = dst;
911 const uint8_t *s2 = src;
912
913 if (s1 <= s2) {
914 do {
915 *s1++ = dtrace_load8((uintptr_t)s2++);
916 } while (--len != 0);
917 } else {
918 s2 += len;
919 s1 += len;
920
921 do {
922 *--s1 = dtrace_load8((uintptr_t)--s2);
923 } while (--len != 0);
924 }
925 }
926}
927
928/*
929 * Copy src to dst using safe memory accesses, up to either the specified
930 * length, or the point that a nul byte is encountered. The src is assumed to
931 * be unsafe memory specified by the DIF program. The dst is assumed to be
932 * safe memory that we can store to directly because it is managed by DTrace.
933 * Unlike dtrace_bcopy(), overlapping regions are not handled.
934 */
935static void
936dtrace_strcpy(const void *src, void *dst, size_t len)
937{
938 if (len != 0) {
939 uint8_t *s1 = dst, c;
940 const uint8_t *s2 = src;
941
942 do {
943 *s1++ = c = dtrace_load8((uintptr_t)s2++);
944 } while (--len != 0 && c != '\0');
945 }
946}
947
948/*
949 * Copy src to dst, deriving the size and type from the specified (BYREF)
950 * variable type. The src is assumed to be unsafe memory specified by the DIF
951 * program. The dst is assumed to be DTrace variable memory that is of the
952 * specified type; we assume that we can store to directly.
953 */
954static void
955dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type)
956{
957 ASSERT(type->dtdt_flags & DIF_TF_BYREF);
958
959 if (type->dtdt_kind == DIF_TYPE_STRING) {
960 dtrace_strcpy(src, dst, type->dtdt_size);
961 } else {
962 dtrace_bcopy(src, dst, type->dtdt_size);
963 }
964}
965
966/*
967 * Compare s1 to s2 using safe memory accesses. The s1 data is assumed to be
968 * unsafe memory specified by the DIF program. The s2 data is assumed to be
969 * safe memory that we can access directly because it is managed by DTrace.
970 */
971static int
972dtrace_bcmp(const void *s1, const void *s2, size_t len)
973{
974 volatile uint16_t *flags;
975
976 flags = (volatile uint16_t *)&cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags;
977
978 if (s1 == s2)
979 return (0);
980
981 if (s1 == NULL || s2 == NULL)
982 return (1);
983
984 if (s1 != s2 && len != 0) {
985 const uint8_t *ps1 = s1;
986 const uint8_t *ps2 = s2;
987
988 do {
989 if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
990 return (1);
991 } while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
992 }
993 return (0);
994}
995
996/*
997 * Zero the specified region using a simple byte-by-byte loop. Note that this
998 * is for safe DTrace-managed memory only.
999 */
1000static void
1001dtrace_bzero(void *dst, size_t len)
1002{
1003 uchar_t *cp;
1004
1005 for (cp = dst; len != 0; len--)
1006 *cp++ = 0;
1007}
1008
1009static void
1010dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
1011{
1012 uint64_t result[2];
1013
1014 result[0] = addend1[0] + addend2[0];
1015 result[1] = addend1[1] + addend2[1] +
1016 (result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0);
1017
1018 sum[0] = result[0];
1019 sum[1] = result[1];
1020}
1021
1022/*
1023 * Shift the 128-bit value in a by b. If b is positive, shift left.
1024 * If b is negative, shift right.
1025 */
1026static void
1027dtrace_shift_128(uint64_t *a, int b)
1028{
1029 uint64_t mask;
1030
1031 if (b == 0)
1032 return;
1033
1034 if (b < 0) {
1035 b = -b;
1036 if (b >= 64) {
1037 a[0] = a[1] >> (b - 64);
1038 a[1] = 0;
1039 } else {
1040 a[0] >>= b;
1041 mask = 1LL << (64 - b);
1042 mask -= 1;
1043 a[0] |= ((a[1] & mask) << (64 - b));
1044 a[1] >>= b;
1045 }
1046 } else {
1047 if (b >= 64) {
1048 a[1] = a[0] << (b - 64);
1049 a[0] = 0;
1050 } else {
1051 a[1] <<= b;
1052 mask = a[0] >> (64 - b);
1053 a[1] |= mask;
1054 a[0] <<= b;
1055 }
1056 }
1057}
1058
1059/*
1060 * The basic idea is to break the 2 64-bit values into 4 32-bit values,
1061 * use native multiplication on those, and then re-combine into the
1062 * resulting 128-bit value.
1063 *
1064 * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
1065 * hi1 * hi2 << 64 +
1066 * hi1 * lo2 << 32 +
1067 * hi2 * lo1 << 32 +
1068 * lo1 * lo2
1069 */
1070static void
1071dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
1072{
1073 uint64_t hi1, hi2, lo1, lo2;
1074 uint64_t tmp[2];
1075
1076 hi1 = factor1 >> 32;
1077 hi2 = factor2 >> 32;
1078
1079 lo1 = factor1 & DT_MASK_LO;
1080 lo2 = factor2 & DT_MASK_LO;
1081
1082 product[0] = lo1 * lo2;
1083 product[1] = hi1 * hi2;
1084
1085 tmp[0] = hi1 * lo2;
1086 tmp[1] = 0;
1087 dtrace_shift_128(tmp, 32);
1088 dtrace_add_128(product, tmp, product);
1089
1090 tmp[0] = hi2 * lo1;
1091 tmp[1] = 0;
1092 dtrace_shift_128(tmp, 32);
1093 dtrace_add_128(product, tmp, product);
1094}
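
/*
 * Sanity-check sketch (illustrative, not part of the original file): with
 * factor1 == factor2 == 2^32, the only non-zero partial product is
 * hi1 * hi2 == 1, so the result must be 2^64, i.e. { 0, 1 }.
 */
#if 0
static void
example_multiply_128_check(void)
{
	uint64_t product[2];

	dtrace_multiply_128((uint64_t)1 << 32, (uint64_t)1 << 32, product);
	ASSERT(product[0] == 0 && product[1] == 1);
}
#endif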
1095
1096/*
1097 * This privilege check should be used by actions and subroutines to
1098 * verify that the user credentials of the process that enabled the
1099 * invoking ECB match the target credentials.
1100 */
1101static int
1102dtrace_priv_proc_common_user(dtrace_state_t *state)
1103{
1104 cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1105
1106 /*
1107 * We should always have a non-NULL state cred here, since if cred
1108 * is null (anonymous tracing), we fast-path bypass this routine.
1109 */
1110 ASSERT(s_cr != NULL);
1111
1112 if ((cr = CRED()) != NULL &&
1113 s_cr->cr_uid == cr->cr_uid &&
1114 s_cr->cr_uid == cr->cr_ruid &&
1115 s_cr->cr_uid == cr->cr_suid &&
1116 s_cr->cr_gid == cr->cr_gid &&
1117 s_cr->cr_gid == cr->cr_rgid &&
1118 s_cr->cr_gid == cr->cr_sgid)
1119 return (1);
1120
1121 return (0);
1122}
1123
1124/*
1125 * This privilege check should be used by actions and subroutines to
1126 * verify that the zone of the process that enabled the invoking ECB
1127 * matches the target credentials.
1128 */
1129static int
1130dtrace_priv_proc_common_zone(dtrace_state_t *state)
1131{
1132 cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1133
1134 /*
1135 * We should always have a non-NULL state cred here, since if cred
1136 * is null (anonymous tracing), we fast-path bypass this routine.
1137 */
1138 ASSERT(s_cr != NULL);
1139
1140 if ((cr = CRED()) != NULL &&
1141 s_cr->cr_zone == cr->cr_zone)
1142 return (1);
1143
1144 return (0);
1145}
1146
1147/*
1148 * This privilege check should be used by actions and subroutines to
1149 * verify that the process has not setuid or changed credentials.
1150 */
1151static int
1152dtrace_priv_proc_common_nocd(VBDTVOID)
1153{
1154 proc_t *proc;
1155
1156 if ((proc = VBDT_GET_PROC()) != NULL &&
1157 !(proc->p_flag & SNOCD))
1158 return (1);
1159
1160 return (0);
1161}
1162
1163static int
1164dtrace_priv_proc_destructive(dtrace_state_t *state)
1165{
1166 int action = state->dts_cred.dcr_action;
1167
1168 if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
1169 dtrace_priv_proc_common_zone(state) == 0)
1170 goto bad;
1171
1172 if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
1173 dtrace_priv_proc_common_user(state) == 0)
1174 goto bad;
1175
1176 if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
1177 dtrace_priv_proc_common_nocd() == 0)
1178 goto bad;
1179
1180 return (1);
1181
1182bad:
1183 cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1184
1185 return (0);
1186}
1187
1188static int
1189dtrace_priv_proc_control(dtrace_state_t *state)
1190{
1191 if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
1192 return (1);
1193
1194 if (dtrace_priv_proc_common_zone(state) &&
1195 dtrace_priv_proc_common_user(state) &&
1196 dtrace_priv_proc_common_nocd())
1197 return (1);
1198
1199 cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1200
1201 return (0);
1202}
1203
1204static int
1205dtrace_priv_proc(dtrace_state_t *state)
1206{
1207 if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
1208 return (1);
1209
1210 cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1211
1212 return (0);
1213}
1214
1215static int
1216dtrace_priv_kernel(dtrace_state_t *state)
1217{
1218 if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
1219 return (1);
1220
1221 cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1222
1223 return (0);
1224}
1225
1226static int
1227dtrace_priv_kernel_destructive(dtrace_state_t *state)
1228{
1229 if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
1230 return (1);
1231
1232 cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1233
1234 return (0);
1235}
1236
1237/*
1238 * Note: not called from probe context. This function is called
1239 * asynchronously (and at a regular interval) from outside of probe context to
1240 * clean the dirty dynamic variable lists on all CPUs. Dynamic variable
1241 * cleaning is explained in detail in <sys/dtrace_impl.h>.
1242 */
1243VBDTSTATIC void
1244dtrace_dynvar_clean(dtrace_dstate_t *dstate)
1245{
1246 dtrace_dynvar_t *dirty;
1247 dtrace_dstate_percpu_t *dcpu;
1248 dtrace_dynvar_t **rinsep;
1249 int i, j, work = 0;
1250
1251 for (i = 0; i < NCPU; i++) {
1252 dcpu = &dstate->dtds_percpu[i];
1253 rinsep = &dcpu->dtdsc_rinsing;
1254
1255 /*
1256 * If the dirty list is NULL, there is no dirty work to do.
1257 */
1258 if (dcpu->dtdsc_dirty == NULL)
1259 continue;
1260
1261 if (dcpu->dtdsc_rinsing != NULL) {
1262 /*
1263 * If the rinsing list is non-NULL, then it is because
1264 * this CPU was selected to accept another CPU's
1265 * dirty list -- and since that time, dirty buffers
1266 * have accumulated. This is a highly unlikely
1267 * condition, but we choose to ignore the dirty
1268 * buffers -- they'll be picked up in a future cleanse.
1269 */
1270 continue;
1271 }
1272
1273 if (dcpu->dtdsc_clean != NULL) {
1274 /*
1275 * If the clean list is non-NULL, then we're in a
1276 * situation where a CPU has done deallocations (we
1277 * have a non-NULL dirty list) but no allocations (we
1278 * also have a non-NULL clean list). We can't simply
1279 * move the dirty list into the clean list on this
1280 * CPU, yet we also don't want to allow this condition
1281 * to persist, lest a short clean list prevent a
1282 * massive dirty list from being cleaned (which in
1283 * turn could lead to otherwise avoidable dynamic
1284 * drops). To deal with this, we look for some CPU
1285 * with a NULL clean list, NULL dirty list, and NULL
1286 * rinsing list -- and then we borrow this CPU to
1287 * rinse our dirty list.
1288 */
1289 for (j = 0; j < NCPU; j++) {
1290 dtrace_dstate_percpu_t *rinser;
1291
1292 rinser = &dstate->dtds_percpu[j];
1293
1294 if (rinser->dtdsc_rinsing != NULL)
1295 continue;
1296
1297 if (rinser->dtdsc_dirty != NULL)
1298 continue;
1299
1300 if (rinser->dtdsc_clean != NULL)
1301 continue;
1302
1303 rinsep = &rinser->dtdsc_rinsing;
1304 break;
1305 }
1306
1307 if (j == NCPU) {
1308 /*
1309 * We were unable to find another CPU that
1310 * could accept this dirty list -- we are
1311 * therefore unable to clean it now.
1312 */
1313 dtrace_dynvar_failclean++;
1314 continue;
1315 }
1316 }
1317
1318 work = 1;
1319
1320 /*
1321 * Atomically move the dirty list aside.
1322 */
1323 do {
1324 dirty = dcpu->dtdsc_dirty;
1325
1326 /*
1327 * Before we zap the dirty list, set the rinsing list.
1328 * (This allows for a potential assertion in
1329 * dtrace_dynvar(): if a free dynamic variable appears
1330 * on a hash chain, either the dirty list or the
1331 * rinsing list for some CPU must be non-NULL.)
1332 */
1333 *rinsep = dirty;
1334 dtrace_membar_producer();
1335 } while (dtrace_casptr(&dcpu->dtdsc_dirty,
1336 dirty, NULL) != dirty);
1337 }
1338
1339 if (!work) {
1340 /*
1341 * We have no work to do; we can simply return.
1342 */
1343 return;
1344 }
1345
1346 dtrace_sync();
1347
1348 for (i = 0; i < NCPU; i++) {
1349 dcpu = &dstate->dtds_percpu[i];
1350
1351 if (dcpu->dtdsc_rinsing == NULL)
1352 continue;
1353
1354 /*
1355 * We are now guaranteed that no hash chain contains a pointer
1356 * into this dirty list; we can make it clean.
1357 */
1358 ASSERT(dcpu->dtdsc_clean == NULL);
1359 dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
1360 dcpu->dtdsc_rinsing = NULL;
1361 }
1362
1363 /*
1364 * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
1365 * sure that all CPUs have seen all of the dtdsc_clean pointers.
1366 * This prevents a race whereby a CPU incorrectly decides that
1367 * the state should be something other than DTRACE_DSTATE_CLEAN
1368 * after dtrace_dynvar_clean() has completed.
1369 */
1370 dtrace_sync();
1371
1372 dstate->dtds_state = DTRACE_DSTATE_CLEAN;
1373}
1374
1375/*
1376 * Depending on the value of the op parameter, this function looks up,
1377 * allocates or deallocates an arbitrarily-keyed dynamic variable. If an
1378 * allocation is requested, this function will return a pointer to a
1379 * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
1380 * variable can be allocated. If NULL is returned, the appropriate counter
1381 * will be incremented.
1382 */
1383VBDTSTATIC dtrace_dynvar_t *
1384dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
1385 dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
1386 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1387{
1388 uint64_t hashval = DTRACE_DYNHASH_VALID;
1389 dtrace_dynhash_t *hash = dstate->dtds_hash;
1390 dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
1391 processorid_t me = VBDT_GET_CPUID(), cpu = me;
1392 dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
1393 size_t bucket, ksize;
1394 size_t chunksize = dstate->dtds_chunksize;
1395 uintptr_t kdata, lock, nstate;
1396 uint_t i;
1397
1398 ASSERT(nkeys != 0);
1399
1400 /*
1401 * Hash the key. As with aggregations, we use Jenkins' "One-at-a-time"
1402 * algorithm. For the by-value portions, we perform the algorithm in
1403 * 16-bit chunks (as opposed to 8-bit chunks). This speeds things up a
1404 * bit, and seems to have only a minute effect on distribution. For
1405 * the by-reference data, we perform "One-at-a-time" iterating (safely)
1406 * over each referenced byte. It's painful to do this, but it's much
1407 * better than pathological hash distribution. The efficacy of the
1408 * hashing algorithm (and a comparison with other algorithms) may be
1409 * found by running the ::dtrace_dynstat MDB dcmd.
1410 */
1411 for (i = 0; i < nkeys; i++) {
1412 if (key[i].dttk_size == 0) {
1413 uint64_t val = key[i].dttk_value;
1414
1415 hashval += (val >> 48) & 0xffff;
1416 hashval += (hashval << 10);
1417 hashval ^= (hashval >> 6);
1418
1419 hashval += (val >> 32) & 0xffff;
1420 hashval += (hashval << 10);
1421 hashval ^= (hashval >> 6);
1422
1423 hashval += (val >> 16) & 0xffff;
1424 hashval += (hashval << 10);
1425 hashval ^= (hashval >> 6);
1426
1427 hashval += val & 0xffff;
1428 hashval += (hashval << 10);
1429 hashval ^= (hashval >> 6);
1430 } else {
1431 /*
1432 * This is incredibly painful, but it beats the hell
1433 * out of the alternative.
1434 */
1435 uint64_t j, size = key[i].dttk_size;
1436 uintptr_t base = (uintptr_t)key[i].dttk_value;
1437
1438 if (!dtrace_canload(base, size, mstate, vstate))
1439 break;
1440
1441 for (j = 0; j < size; j++) {
1442 hashval += dtrace_load8(base + j);
1443 hashval += (hashval << 10);
1444 hashval ^= (hashval >> 6);
1445 }
1446 }
1447 }
1448
1449 if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
1450 return (NULL);
1451
1452 hashval += (hashval << 3);
1453 hashval ^= (hashval >> 11);
1454 hashval += (hashval << 15);
1455
1456 /*
1457 * There is a remote chance (ideally, 1 in 2^31) that our hashval
1458 * comes out to be one of our two sentinel hash values. If this
1459 * actually happens, we set the hashval to be a value known to be a
1460 * non-sentinel value.
1461 */
1462 if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
1463 hashval = DTRACE_DYNHASH_VALID;
1464
1465 /*
1466 * Yes, it's painful to do a divide here. If the cycle count becomes
1467 * important here, tricks can be pulled to reduce it. (However, it's
1468 * critical that hash collisions be kept to an absolute minimum;
1469 * they're much more painful than a divide.) It's better to have a
1470 * solution that generates few collisions and still keeps things
1471 * relatively simple.
1472 */
1473 bucket = hashval % dstate->dtds_hashsize;
1474
1475 if (op == DTRACE_DYNVAR_DEALLOC) {
1476 volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;
1477
1478 for (;;) {
1479 while ((lock = *lockp) & 1)
1480 continue;
1481
1482 if (dtrace_casptr((void *)lockp,
1483 (void *)lock, (void *)(lock + 1)) == (void *)lock)
1484 break;
1485 }
1486
1487 dtrace_membar_producer();
1488 }
1489
1490top:
1491 prev = NULL;
1492 lock = hash[bucket].dtdh_lock;
1493
1494 dtrace_membar_consumer();
1495
1496 start = hash[bucket].dtdh_chain;
1497 ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
1498 start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
1499 op != DTRACE_DYNVAR_DEALLOC));
1500
1501 for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
1502 dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
1503 dtrace_key_t *dkey = &dtuple->dtt_key[0];
1504
1505 if (dvar->dtdv_hashval != hashval) {
1506 if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
1507 /*
1508 * We've reached the sink, and therefore the
1509 * end of the hash chain; we can kick out of
1510 * the loop knowing that we have seen a valid
1511 * snapshot of state.
1512 */
1513 ASSERT(dvar->dtdv_next == NULL);
1514 ASSERT(dvar == &dtrace_dynhash_sink);
1515 break;
1516 }
1517
1518 if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
1519 /*
1520 * We've gone off the rails: somewhere along
1521 * the line, one of the members of this hash
1522 * chain was deleted. Note that we could also
1523 * detect this by simply letting this loop run
1524 * to completion, as we would eventually hit
1525 * the end of the dirty list. However, we
1526 * want to avoid running the length of the
1527 * dirty list unnecessarily (it might be quite
1528 * long), so we catch this as early as
1529 * possible by detecting the hash marker. In
1530 * this case, we simply set dvar to NULL and
1531 * break; the conditional after the loop will
1532 * send us back to top.
1533 */
1534 dvar = NULL;
1535 break;
1536 }
1537
1538 goto next;
1539 }
1540
1541 if (dtuple->dtt_nkeys != nkeys)
1542 goto next;
1543
1544 for (i = 0; i < nkeys; i++, dkey++) {
1545 if (dkey->dttk_size != key[i].dttk_size)
1546 goto next; /* size or type mismatch */
1547
1548 if (dkey->dttk_size != 0) {
1549 if (dtrace_bcmp(
1550 (void *)(uintptr_t)key[i].dttk_value,
1551 (void *)(uintptr_t)dkey->dttk_value,
1552 dkey->dttk_size))
1553 goto next;
1554 } else {
1555 if (dkey->dttk_value != key[i].dttk_value)
1556 goto next;
1557 }
1558 }
1559
1560 if (op != DTRACE_DYNVAR_DEALLOC)
1561 return (dvar);
1562
1563 ASSERT(dvar->dtdv_next == NULL ||
1564 dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);
1565
1566 if (prev != NULL) {
1567 ASSERT(hash[bucket].dtdh_chain != dvar);
1568 ASSERT(start != dvar);
1569 ASSERT(prev->dtdv_next == dvar);
1570 prev->dtdv_next = dvar->dtdv_next;
1571 } else {
1572 if (dtrace_casptr(&hash[bucket].dtdh_chain,
1573 start, dvar->dtdv_next) != start) {
1574 /*
1575 * We have failed to atomically swing the
1576 * hash table head pointer, presumably because
1577 * of a conflicting allocation on another CPU.
1578 * We need to reread the hash chain and try
1579 * again.
1580 */
1581 goto top;
1582 }
1583 }
1584
1585 dtrace_membar_producer();
1586
1587 /*
1588 * Now set the hash value to indicate that it's free.
1589 */
1590 ASSERT(hash[bucket].dtdh_chain != dvar);
1591 dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
1592
1593 dtrace_membar_producer();
1594
1595 /*
1596 * Set the next pointer to point at the dirty list, and
1597 * atomically swing the dirty pointer to the newly freed dvar.
1598 */
1599 do {
1600 next = dcpu->dtdsc_dirty;
1601 dvar->dtdv_next = next;
1602 } while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);
1603
1604 /*
1605 * Finally, unlock this hash bucket.
1606 */
1607 ASSERT(hash[bucket].dtdh_lock == lock);
1608 ASSERT(lock & 1);
1609 hash[bucket].dtdh_lock++;
1610
1611 return (NULL);
1612next:
1613 prev = dvar;
1614 continue;
1615 }
1616
1617 if (dvar == NULL) {
1618 /*
1619 * If dvar is NULL, it is because we went off the rails:
1620 * one of the elements that we traversed in the hash chain
1621 * was deleted while we were traversing it. In this case,
1622 * we assert that we aren't doing a dealloc (deallocs lock
1623 * the hash bucket to prevent themselves from racing with
1624 * one another), and retry the hash chain traversal.
1625 */
1626 ASSERT(op != DTRACE_DYNVAR_DEALLOC);
1627 goto top;
1628 }
1629
1630 if (op != DTRACE_DYNVAR_ALLOC) {
1631 /*
1632 * If we are not to allocate a new variable, we want to
1633 * return NULL now. Before we return, check that the value
1634 * of the lock word hasn't changed. If it has, we may have
1635 * seen an inconsistent snapshot.
1636 */
1637 if (op == DTRACE_DYNVAR_NOALLOC) {
1638 if (hash[bucket].dtdh_lock != lock)
1639 goto top;
1640 } else {
1641 ASSERT(op == DTRACE_DYNVAR_DEALLOC);
1642 ASSERT(hash[bucket].dtdh_lock == lock);
1643 ASSERT(lock & 1);
1644 hash[bucket].dtdh_lock++;
1645 }
1646
1647 return (NULL);
1648 }
1649
1650 /*
1651 * We need to allocate a new dynamic variable. The size we need is the
1652 * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
1653 * size of any auxiliary key data (rounded up to 8-byte alignment) plus
1654 * the size of any referred-to data (dsize). We then round the final
1655 * size up to the chunksize for allocation.
1656 */
1657 for (ksize = 0, i = 0; i < nkeys; i++)
1658 ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
1659
1660 /*
1661 * This should be pretty much impossible, but could happen if, say,
1662 * strange DIF specified the tuple. Ideally, this should be an
1663 * assertion and not an error condition -- but that requires that the
1664 * chunksize calculation in dtrace_difo_chunksize() be absolutely
1665 * bullet-proof. (That is, it must not be able to be fooled by
1666 * malicious DIF.) Given the lack of backwards branches in DIF,
1667 * solving this would presumably not amount to solving the Halting
1668 * Problem -- but it still seems awfully hard.
1669 */
1670 if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
1671 ksize + dsize > chunksize) {
1672 dcpu->dtdsc_drops++;
1673 return (NULL);
1674 }
1675
1676 nstate = DTRACE_DSTATE_EMPTY;
1677
1678 do {
1679retry:
1680 free = dcpu->dtdsc_free;
1681
1682 if (free == NULL) {
1683 dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
1684 void *rval;
1685
1686 if (clean == NULL) {
1687 /*
1688 * We're out of dynamic variable space on
1689 * this CPU. Unless we have tried all CPUs,
1690 * we'll try to allocate from a different
1691 * CPU.
1692 */
1693 switch (dstate->dtds_state) {
1694 case DTRACE_DSTATE_CLEAN: {
1695 void *sp = &dstate->dtds_state;
1696
1697 if (++cpu >= NCPU)
1698 cpu = 0;
1699
1700 if (dcpu->dtdsc_dirty != NULL &&
1701 nstate == DTRACE_DSTATE_EMPTY)
1702 nstate = DTRACE_DSTATE_DIRTY;
1703
1704 if (dcpu->dtdsc_rinsing != NULL)
1705 nstate = DTRACE_DSTATE_RINSING;
1706
1707 dcpu = &dstate->dtds_percpu[cpu];
1708
1709 if (cpu != me)
1710 goto retry;
1711
1712 (void) dtrace_cas32(sp,
1713 DTRACE_DSTATE_CLEAN, nstate);
1714
1715 /*
1716 * To increment the correct bean
1717 * counter, take another lap.
1718 */
1719 goto retry;
1720 }
1721
1722 case DTRACE_DSTATE_DIRTY:
1723 dcpu->dtdsc_dirty_drops++;
1724 break;
1725
1726 case DTRACE_DSTATE_RINSING:
1727 dcpu->dtdsc_rinsing_drops++;
1728 break;
1729
1730 case DTRACE_DSTATE_EMPTY:
1731 dcpu->dtdsc_drops++;
1732 break;
1733 }
1734
1735 DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
1736 return (NULL);
1737 }
1738
1739 /*
1740 * The clean list appears to be non-empty. We want to
1741 * move the clean list to the free list; we start by
1742 * moving the clean pointer aside.
1743 */
1744 if (dtrace_casptr(&dcpu->dtdsc_clean,
1745 clean, NULL) != clean) {
1746 /*
1747 * We are in one of two situations:
1748 *
1749 * (a) The clean list was switched to the
1750 * free list by another CPU.
1751 *
1752 * (b) The clean list was added to by the
1753 * cleansing cyclic.
1754 *
1755 * In either of these situations, we can
1756 * just reattempt the free list allocation.
1757 */
1758 goto retry;
1759 }
1760
1761 ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
1762
1763 /*
1764 * Now we'll move the clean list to our free list.
1765 * It's impossible for this to fail: the only way
1766 * the free list can be updated is through this
1767 * code path, and only one CPU can own the clean list.
1768 * Thus, it would only be possible for this to fail if
1769 * this code were racing with dtrace_dynvar_clean().
1770 * (That is, if dtrace_dynvar_clean() updated the clean
1771 * list, and we ended up racing to update the free
1772 * list.) This race is prevented by the dtrace_sync()
1773 * in dtrace_dynvar_clean() -- which flushes the
1774 * owners of the clean lists out before resetting
1775 * the clean lists.
1776 */
1777 dcpu = &dstate->dtds_percpu[me];
1778 rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
1779 ASSERT(rval == NULL);
1780 goto retry;
1781 }
1782
1783 dvar = free;
1784 new_free = dvar->dtdv_next;
1785 } while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
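	/*
	 * To summarize the chunk lifecycle that the loop above (and the
	 * reasoning in its comments) relies upon: chunks are popped from
	 * the per-CPU free list here, freed onto the dirty list, moved
	 * by the periodic dtrace_dynvar_clean() from dirty to rinsing
	 * to clean, and finally switched en masse from clean back to
	 * free by the allocator above.
	 */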
1786
1787 /*
1788 * We have now allocated a new chunk. We copy the tuple keys into the
1789 * tuple array and copy any referenced key data into the data space
1790 * following the tuple array. As we do this, we relocate dttk_value
1791 * in the final tuple to point to the key data address in the chunk.
1792 */
1793 kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
1794 dvar->dtdv_data = (void *)(kdata + ksize);
1795 dvar->dtdv_tuple.dtt_nkeys = nkeys;
1796
1797 for (i = 0; i < nkeys; i++) {
1798 dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
1799 size_t kesize = key[i].dttk_size;
1800
1801 if (kesize != 0) {
1802 dtrace_bcopy(
1803 (const void *)(uintptr_t)key[i].dttk_value,
1804 (void *)kdata, kesize);
1805 dkey->dttk_value = kdata;
1806 kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
1807 } else {
1808 dkey->dttk_value = key[i].dttk_value;
1809 }
1810
1811 dkey->dttk_size = kesize;
1812 }
1813
1814 ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
1815 dvar->dtdv_hashval = hashval;
1816 dvar->dtdv_next = start;
1817
1818 if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
1819 return (dvar);
1820
1821 /*
1822 * The cas has failed. Either another CPU is adding an element to
1823 * this hash chain, or another CPU is deleting an element from this
1824 * hash chain. The simplest way to deal with both of these cases
1825 * (though not necessarily the most efficient) is to free our
1826 * allocated block and tail-call ourselves. Note that the free is
1827 * to the dirty list and _not_ to the free list. This is to prevent
1828 * races with allocators, above.
1829 */
1830 dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
1831
1832 dtrace_membar_producer();
1833
1834 do {
1835 free = dcpu->dtdsc_dirty;
1836 dvar->dtdv_next = free;
1837 } while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
1838
1839 return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate));
1840}
1841
1842/*ARGSUSED*/
1843static void
1844dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
1845{
1846 if ((int64_t)nval < (int64_t)*oval)
1847 *oval = nval;
1848}
1849
1850/*ARGSUSED*/
1851static void
1852dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
1853{
1854 if ((int64_t)nval > (int64_t)*oval)
1855 *oval = nval;
1856}
1857
1858static void
1859dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
1860{
1861 int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
1862 int64_t val = (int64_t)nval;
1863
1864 if (val < 0) {
1865 for (i = 0; i < zero; i++) {
1866 if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
1867 quanta[i] += incr;
1868 return;
1869 }
1870 }
1871 } else {
1872 for (i = zero + 1; i < VBDTCAST(int)DTRACE_QUANTIZE_NBUCKETS; i++) {
1873 if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
1874 quanta[i - 1] += incr;
1875 return;
1876 }
1877 }
1878
1879 quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
1880 return;
1881 }
1882
1883#ifndef VBOX
1884 ASSERT(0);
1885#else
1886 AssertFatalFailed();
1887#endif
1888}
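/*
 * A worked example, assuming the standard bucket macros from
 * <sys/dtrace.h>: for nval = 13, the loop above stops at the first
 * bucket value exceeding 13 (namely 16) and increments the preceding
 * bucket -- the one labelled 8, covering [8, 16).  Negative values
 * walk the mirrored buckets below DTRACE_QUANTIZE_ZEROBUCKET; a value
 * matching no bucket at all is impossible, hence the assertion.
 */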
1889
1890static void
1891dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
1892{
1893 uint64_t arg = *lquanta++;
1894 int32_t base = DTRACE_LQUANTIZE_BASE(arg);
1895 uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
1896 uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
1897 int32_t val = (int32_t)nval, level;
1898
1899 ASSERT(step != 0);
1900 ASSERT(levels != 0);
1901
1902 if (val < base) {
1903 /*
1904 * This is an underflow.
1905 */
1906 lquanta[0] += incr;
1907 return;
1908 }
1909
1910 level = (val - base) / step;
1911
1912 if (level < levels) {
1913 lquanta[level + 1] += incr;
1914 return;
1915 }
1916
1917 /*
1918 * This is an overflow.
1919 */
1920 lquanta[levels + 1] += incr;
1921}
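/*
 * A worked example: with base 0, step 10 and levels 5 (e.g. a D
 * lquantize() from 0 to 50 in steps of 10), a value of 37 yields
 * level (37 - 0) / 10 = 3 and increments lquanta[4]; a value of -2
 * lands in the underflow bucket lquanta[0]; and a value of 55 yields
 * level 5, which is not below levels, so it lands in the overflow
 * bucket lquanta[6].
 */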
1922
1923/*ARGSUSED*/
1924static void
1925dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
1926{
1927 data[0]++;
1928 data[1] += nval;
1929}
1930
1931/*ARGSUSED*/
1932static void
1933dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
1934{
1935 int64_t snval = (int64_t)nval;
1936 uint64_t tmp[2];
1937
1938 data[0]++;
1939 data[1] += nval;
1940
1941 /*
1942 * What we want to say here is:
1943 *
1944 * data[2] += nval * nval;
1945 *
1946 * But given that nval is 64-bit, we could easily overflow, so
1947 * we do this as 128-bit arithmetic.
1948 */
1949 if (snval < 0)
1950 snval = -snval;
1951
1952 dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp);
1953 dtrace_add_128(data + 2, tmp, data + 2);
1954}
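/*
 * From this running {count, sum, 128-bit sum-of-squares} triple the
 * consumer can derive the standard deviation with the usual
 * approximation
 *
 *	sqrt(avg(x * x) - avg(x) * avg(x))
 *
 * i.e. sqrt(data[2] / data[0] - (data[1] / data[0])^2); keeping the
 * sum of squares in 128 bits is what keeps the first term exact.
 */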
1955
1956/*ARGSUSED*/
1957static void
1958dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
1959{
1960 *oval = *oval + 1;
1961}
1962
1963/*ARGSUSED*/
1964static void
1965dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
1966{
1967 *oval += nval;
1968}
1969
1970/*
1971 * Aggregate given the tuple in the principal data buffer, and the aggregating
1972 * action denoted by the specified dtrace_aggregation_t. The aggregation
1973 * buffer is specified as the buf parameter. This routine does not return
1974 * failure; if there is no space in the aggregation buffer, the data will be
1975 * dropped, and a corresponding counter incremented.
1976 */
1977static void
1978dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
1979 intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
1980{
1981 dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
1982 uint32_t i, ndx, size, fsize;
1983 uint32_t align = sizeof (uint64_t) - 1;
1984 dtrace_aggbuffer_t *agb;
1985 dtrace_aggkey_t *key;
1986 uint32_t hashval = 0, limit, isstr;
1987 caddr_t tomax, data, kdata;
1988 dtrace_actkind_t action;
1989 dtrace_action_t *act;
1990 uintptr_t offs;
1991
1992 if (buf == NULL)
1993 return;
1994
1995 if (!agg->dtag_hasarg) {
1996 /*
1997 * Currently, only quantize() and lquantize() take additional
1998 * arguments, and they have the same semantics: an increment
1999 * value that defaults to 1 when not present. If additional
2000 * aggregating actions take arguments, the setting of the
2001 * default argument value will presumably have to become more
2002 * sophisticated...
2003 */
2004 arg = 1;
2005 }
2006
2007 action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
2008 size = rec->dtrd_offset - agg->dtag_base;
2009 fsize = size + rec->dtrd_size;
2010
2011 ASSERT(dbuf->dtb_tomax != NULL);
2012 data = dbuf->dtb_tomax + offset + agg->dtag_base;
2013
2014 if ((tomax = buf->dtb_tomax) == NULL) {
2015 dtrace_buffer_drop(buf);
2016 return;
2017 }
2018
2019 /*
2020 * The metastructure is always at the bottom of the buffer.
2021 */
2022 agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
2023 sizeof (dtrace_aggbuffer_t));
2024
2025 if (buf->dtb_offset == 0) {
2026 /*
2027 * We just kludge up approximately 1/8th of the size to be
2028 * buckets. If this guess ends up being routinely
2029 * off-the-mark, we may need to dynamically readjust this
2030 * based on past performance.
2031 */
2032 uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
2033
2034 if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
2035 (uintptr_t)tomax || hashsize == 0) {
2036 /*
2037 * We've been given a ludicrously small buffer;
2038 * increment our drop count and leave.
2039 */
2040 dtrace_buffer_drop(buf);
2041 return;
2042 }
2043
2044 /*
2045	 * And now, a pathetic attempt to get an odd (or perchance,
2046	 * a prime) hash size for better hash distribution.
2047 */
2048 if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
2049 hashsize -= DTRACE_AGGHASHSIZE_SLEW;
2050
2051 agb->dtagb_hashsize = hashsize;
2052 agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
2053 agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
2054 agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
2055
2056 for (i = 0; i < agb->dtagb_hashsize; i++)
2057 agb->dtagb_hash[i] = NULL;
2058 }
2059
2060 ASSERT(agg->dtag_first != NULL);
2061 ASSERT(agg->dtag_first->dta_intuple);
2062
2063 /*
2064 * Calculate the hash value based on the key. Note that we _don't_
2065 * include the aggid in the hashing (but we will store it as part of
2066 * the key). The hashing algorithm is Bob Jenkins' "One-at-a-time"
2067 * algorithm: a simple, quick algorithm that has no known funnels, and
2068 * gets good distribution in practice. The efficacy of the hashing
2069 * algorithm (and a comparison with other algorithms) may be found by
2070 * running the ::dtrace_aggstat MDB dcmd.
2071 */
2072 for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2073 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2074 limit = i + act->dta_rec.dtrd_size;
2075 ASSERT(limit <= size);
2076 isstr = DTRACEACT_ISSTRING(act);
2077
2078 for (; i < limit; i++) {
2079 hashval += data[i];
2080 hashval += (hashval << 10);
2081 hashval ^= (hashval >> 6);
2082
2083 if (isstr && data[i] == '\0')
2084 break;
2085 }
2086 }
2087
2088 hashval += (hashval << 3);
2089 hashval ^= (hashval >> 11);
2090 hashval += (hashval << 15);
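	/*
	 * For reference, the canonical one-at-a-time hash of a single
	 * byte array -- of which the interleaved loop above is a
	 * multi-record variant -- is simply:
	 *
	 *	for (hashval = 0, i = 0; i < len; i++) {
	 *		hashval += data[i];
	 *		hashval += (hashval << 10);
	 *		hashval ^= (hashval >> 6);
	 *	}
	 *	hashval += (hashval << 3);
	 *	hashval ^= (hashval >> 11);
	 *	hashval += (hashval << 15);
	 */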
2091
2092 /*
2093 * Yes, the divide here is expensive -- but it's generally the least
2094 * of the performance issues given the amount of data that we iterate
2095 * over to compute hash values, compare data, etc.
2096 */
2097 ndx = hashval % agb->dtagb_hashsize;
2098
2099 for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
2100 ASSERT((caddr_t)key >= tomax);
2101 ASSERT((caddr_t)key < tomax + buf->dtb_size);
2102
2103 if (hashval != key->dtak_hashval || key->dtak_size != size)
2104 continue;
2105
2106 kdata = key->dtak_data;
2107 ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
2108
2109 for (act = agg->dtag_first; act->dta_intuple;
2110 act = act->dta_next) {
2111 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2112 limit = i + act->dta_rec.dtrd_size;
2113 ASSERT(limit <= size);
2114 isstr = DTRACEACT_ISSTRING(act);
2115
2116 for (; i < limit; i++) {
2117 if (kdata[i] != data[i])
2118 goto next;
2119
2120 if (isstr && data[i] == '\0')
2121 break;
2122 }
2123 }
2124
2125 if (action != key->dtak_action) {
2126 /*
2127 * We are aggregating on the same value in the same
2128 * aggregation with two different aggregating actions.
2129 * (This should have been picked up in the compiler,
2130 * so we may be dealing with errant or devious DIF.)
2131 * This is an error condition; we indicate as much,
2132 * and return.
2133 */
2134 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
2135 return;
2136 }
2137
2138 /*
2139 * This is a hit: we need to apply the aggregator to
2140 * the value at this key.
2141 */
2142 agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
2143 return;
2144next:
2145 continue;
2146 }
2147
2148 /*
2149 * We didn't find it. We need to allocate some zero-filled space,
2150 * link it into the hash table appropriately, and apply the aggregator
2151 * to the (zero-filled) value.
2152 */
2153 offs = buf->dtb_offset;
2154 while (offs & (align - 1))
2155 offs += sizeof (uint32_t);
2156
2157 /*
2158 * If we don't have enough room to both allocate a new key _and_
2159 * its associated data, increment the drop count and return.
2160 */
2161 if ((uintptr_t)tomax + offs + fsize >
2162 agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
2163 dtrace_buffer_drop(buf);
2164 return;
2165 }
2166
2167 /*CONSTCOND*/
2168 ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
2169 key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
2170 agb->dtagb_free -= sizeof (dtrace_aggkey_t);
2171
2172 key->dtak_data = kdata = tomax + offs;
2173 buf->dtb_offset = offs + fsize;
2174
2175 /*
2176 * Now copy the data across.
2177 */
2178 *((dtrace_aggid_t *)kdata) = agg->dtag_id;
2179
2180 for (i = sizeof (dtrace_aggid_t); i < size; i++)
2181 kdata[i] = data[i];
2182
2183 /*
2184 * Because strings are not zeroed out by default, we need to iterate
2185 * looking for actions that store strings, and we need to explicitly
2186 * pad these strings out with zeroes.
2187 */
2188 for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2189 int nul;
2190
2191 if (!DTRACEACT_ISSTRING(act))
2192 continue;
2193
2194 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2195 limit = i + act->dta_rec.dtrd_size;
2196 ASSERT(limit <= size);
2197
2198 for (nul = 0; i < limit; i++) {
2199 if (nul) {
2200 kdata[i] = '\0';
2201 continue;
2202 }
2203
2204 if (data[i] != '\0')
2205 continue;
2206
2207 nul = 1;
2208 }
2209 }
2210
2211 for (i = size; i < fsize; i++)
2212 kdata[i] = 0;
2213
2214 key->dtak_hashval = hashval;
2215 key->dtak_size = size;
2216 key->dtak_action = action;
2217 key->dtak_next = agb->dtagb_hash[ndx];
2218 agb->dtagb_hash[ndx] = key;
2219
2220 /*
2221 * Finally, apply the aggregator.
2222 */
2223 *((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
2224 agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
2225}
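/*
 * For context, all of the above services D aggregations; e.g. a clause
 * like
 *
 *	syscall::read:entry { @bytes[execname] = quantize(arg2); }
 *
 * arrives here with execname as the tuple data to be hashed and
 * compared, and with dtrace_aggregate_quantize() as dtag_aggregate.
 */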
2226
2227/*
2228 * Given consumer state, this routine finds a speculation in the INACTIVE
2229 * state and transitions it into the ACTIVE state. If there is no speculation
2230 * in the INACTIVE state, 0 is returned. In this case, no error counter is
2231 * incremented -- it is up to the caller to take appropriate action.
2232 */
2233static int
2234dtrace_speculation(dtrace_state_t *state)
2235{
2236 int i = 0;
2237 dtrace_speculation_state_t current;
2238 uint32_t *stat = &state->dts_speculations_unavail, count;
2239
2240 while (i < state->dts_nspeculations) {
2241 dtrace_speculation_t *spec = &state->dts_speculations[i];
2242
2243 current = spec->dtsp_state;
2244
2245 if (current != DTRACESPEC_INACTIVE) {
2246 if (current == DTRACESPEC_COMMITTINGMANY ||
2247 current == DTRACESPEC_COMMITTING ||
2248 current == DTRACESPEC_DISCARDING)
2249 stat = &state->dts_speculations_busy;
2250 i++;
2251 continue;
2252 }
2253
2254 if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2255 current, DTRACESPEC_ACTIVE) == current)
2256 return (i + 1);
2257 }
2258
2259 /*
2260 * We couldn't find a speculation. If we found as much as a single
2261 * busy speculation buffer, we'll attribute this failure as "busy"
2262 * instead of "unavail".
2263 */
2264 do {
2265 count = *stat;
2266 } while (dtrace_cas32(stat, count, count + 1) != count);
2267
2268 return (0);
2269}
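/*
 * For context, a minimal sketch of the D-level protocol that this
 * speculation machinery serves (the probes and predicates here are
 * purely illustrative):
 *
 *	syscall::open:entry  { self->spec = speculation(); }
 *	syscall::open:entry  /self->spec/ { speculate(self->spec);
 *		printf("%s", copyinstr(arg0)); }
 *	syscall::open:return /errno != 0/ { commit(self->spec); }
 *	syscall::open:return /errno == 0/ { discard(self->spec); }
 *
 * dtrace_speculation() above is what the speculation() subroutine
 * ultimately calls (see DIF_SUBR_SPECULATION below).
 */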
2270
2271/*
2272 * This routine commits an active speculation. If the specified speculation
2273 * is not in a valid state to perform a commit(), this routine will silently do
2274 * nothing. The state of the specified speculation is transitioned according
2275	 * to the state transition diagram outlined in <sys/dtrace_impl.h>.
2276 */
2277static void
2278dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
2279 dtrace_specid_t which)
2280{
2281 dtrace_speculation_t *spec;
2282 dtrace_buffer_t *src, *dest;
2283 uintptr_t daddr, saddr, dlimit;
2284 dtrace_speculation_state_t current, new VBDTUNASS(-1);
2285 intptr_t offs;
2286
2287 if (which == 0)
2288 return;
2289
2290 if (which > VBDTCAST(unsigned)state->dts_nspeculations) {
2291 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2292 return;
2293 }
2294
2295 spec = &state->dts_speculations[which - 1];
2296 src = &spec->dtsp_buffer[cpu];
2297 dest = &state->dts_buffer[cpu];
2298
2299 do {
2300 current = spec->dtsp_state;
2301
2302 if (current == DTRACESPEC_COMMITTINGMANY)
2303 break;
2304
2305 switch (current) {
2306 case DTRACESPEC_INACTIVE:
2307 case DTRACESPEC_DISCARDING:
2308 return;
2309
2310 case DTRACESPEC_COMMITTING:
2311 /*
2312 * This is only possible if we are (a) commit()'ing
2313 * without having done a prior speculate() on this CPU
2314 * and (b) racing with another commit() on a different
2315 * CPU. There's nothing to do -- we just assert that
2316 * our offset is 0.
2317 */
2318 ASSERT(src->dtb_offset == 0);
2319 return;
2320
2321 case DTRACESPEC_ACTIVE:
2322 new = DTRACESPEC_COMMITTING;
2323 break;
2324
2325 case DTRACESPEC_ACTIVEONE:
2326 /*
2327 * This speculation is active on one CPU. If our
2328 * buffer offset is non-zero, we know that the one CPU
2329 * must be us. Otherwise, we are committing on a
2330 * different CPU from the speculate(), and we must
2331 * rely on being asynchronously cleaned.
2332 */
2333 if (src->dtb_offset != 0) {
2334 new = DTRACESPEC_COMMITTING;
2335 break;
2336 }
2337 /*FALLTHROUGH*/
2338
2339 case DTRACESPEC_ACTIVEMANY:
2340 new = DTRACESPEC_COMMITTINGMANY;
2341 break;
2342
2343 default:
2344#ifndef VBOX
2345 ASSERT(0);
2346#else
2347 AssertFatalMsgFailed(("%d\n", current));
2348#endif
2349 }
2350 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2351 current, new) != current);
2352
2353 /*
2354 * We have set the state to indicate that we are committing this
2355 * speculation. Now reserve the necessary space in the destination
2356 * buffer.
2357 */
2358 if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
2359 sizeof (uint64_t), state, NULL)) < 0) {
2360 dtrace_buffer_drop(dest);
2361 goto out;
2362 }
2363
2364 /*
2365 * We have the space; copy the buffer across. (Note that this is a
2366	 * highly suboptimal bcopy(); in the unlikely event that this becomes
2367 * a serious performance issue, a high-performance DTrace-specific
2368 * bcopy() should obviously be invented.)
2369 */
2370 daddr = (uintptr_t)dest->dtb_tomax + offs;
2371 dlimit = daddr + src->dtb_offset;
2372 saddr = (uintptr_t)src->dtb_tomax;
2373
2374 /*
2375 * First, the aligned portion.
2376 */
2377 while (dlimit - daddr >= sizeof (uint64_t)) {
2378 *((uint64_t *)daddr) = *((uint64_t *)saddr);
2379
2380 daddr += sizeof (uint64_t);
2381 saddr += sizeof (uint64_t);
2382 }
2383
2384 /*
2385 * Now any left-over bit...
2386 */
2387 while (dlimit - daddr)
2388 *((uint8_t *)daddr++) = *((uint8_t *)saddr++);
2389
2390 /*
2391 * Finally, commit the reserved space in the destination buffer.
2392 */
2393 dest->dtb_offset = offs + src->dtb_offset;
2394
2395out:
2396 /*
2397 * If we're lucky enough to be the only active CPU on this speculation
2398 * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
2399 */
2400 if (current == DTRACESPEC_ACTIVE ||
2401 (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
2402 uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
2403 DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
2404
2405 ASSERT(rval == DTRACESPEC_COMMITTING);
2406 }
2407
2408 src->dtb_offset = 0;
2409 src->dtb_xamot_drops += src->dtb_drops;
2410 src->dtb_drops = 0;
2411}
2412
2413/*
2414 * This routine discards an active speculation. If the specified speculation
2415 * is not in a valid state to perform a discard(), this routine will silently
2416 * do nothing. The state of the specified speculation is transitioned
2417	 * according to the state transition diagram outlined in <sys/dtrace_impl.h>.
2418 */
2419static void
2420dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
2421 dtrace_specid_t which)
2422{
2423 dtrace_speculation_t *spec;
2424 dtrace_speculation_state_t current, new;
2425 dtrace_buffer_t *buf;
2426
2427 if (which == 0)
2428 return;
2429
2430 if (which > VBDTCAST(unsigned)state->dts_nspeculations) {
2431 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2432 return;
2433 }
2434
2435 spec = &state->dts_speculations[which - 1];
2436 buf = &spec->dtsp_buffer[cpu];
2437
2438 do {
2439 current = spec->dtsp_state;
2440
2441 switch (current) {
2442 case DTRACESPEC_INACTIVE:
2443 case DTRACESPEC_COMMITTINGMANY:
2444 case DTRACESPEC_COMMITTING:
2445 case DTRACESPEC_DISCARDING:
2446 return;
2447
2448 case DTRACESPEC_ACTIVE:
2449 case DTRACESPEC_ACTIVEMANY:
2450 new = DTRACESPEC_DISCARDING;
2451 break;
2452
2453 case DTRACESPEC_ACTIVEONE:
2454 if (buf->dtb_offset != 0) {
2455 new = DTRACESPEC_INACTIVE;
2456 } else {
2457 new = DTRACESPEC_DISCARDING;
2458 }
2459 break;
2460
2461 default:
2462#ifndef VBOX
2463 ASSERT(0);
2464#else
2465 AssertFatalMsgFailed(("%d\n", current));
2466#endif
2467 }
2468 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2469 current, new) != current);
2470
2471 buf->dtb_offset = 0;
2472 buf->dtb_drops = 0;
2473}
2474
2475/*
2476 * Note: not called from probe context. This function is called
2477 * asynchronously from cross call context to clean any speculations that are
2478 * in the COMMITTINGMANY or DISCARDING states. These speculations may not be
2479 * transitioned back to the INACTIVE state until all CPUs have cleaned the
2480 * speculation.
2481 */
2482static void
2483dtrace_speculation_clean_here(dtrace_state_t *state)
2484{
2485 dtrace_icookie_t cookie;
2486 processorid_t cpu = VBDT_GET_CPUID();
2487 dtrace_buffer_t *dest = &state->dts_buffer[cpu];
2488 dtrace_specid_t i;
2489
2490 cookie = dtrace_interrupt_disable();
2491
2492 if (dest->dtb_tomax == NULL) {
2493 dtrace_interrupt_enable(cookie);
2494 return;
2495 }
2496
2497 for (i = 0; i < VBDTCAST(unsigned)state->dts_nspeculations; i++) {
2498 dtrace_speculation_t *spec = &state->dts_speculations[i];
2499 dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
2500
2501 if (src->dtb_tomax == NULL)
2502 continue;
2503
2504 if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
2505 src->dtb_offset = 0;
2506 continue;
2507 }
2508
2509 if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2510 continue;
2511
2512 if (src->dtb_offset == 0)
2513 continue;
2514
2515 dtrace_speculation_commit(state, cpu, i + 1);
2516 }
2517
2518 dtrace_interrupt_enable(cookie);
2519}
2520
2521/*
2522 * Note: not called from probe context. This function is called
2523 * asynchronously (and at a regular interval) to clean any speculations that
2524 * are in the COMMITTINGMANY or DISCARDING states. If it discovers that there
2525 * is work to be done, it cross calls all CPUs to perform that work;
2526	 * COMMITTINGMANY and DISCARDING speculations may not be transitioned back to the
2527 * INACTIVE state until they have been cleaned by all CPUs.
2528 */
2529static void
2530dtrace_speculation_clean(dtrace_state_t *state)
2531{
2532 int work = 0, rv;
2533 dtrace_specid_t i;
2534
2535 for (i = 0; i < VBDTCAST(unsigned)state->dts_nspeculations; i++) {
2536 dtrace_speculation_t *spec = &state->dts_speculations[i];
2537
2538 ASSERT(!spec->dtsp_cleaning);
2539
2540 if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
2541 spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2542 continue;
2543
2544 work++;
2545 spec->dtsp_cleaning = 1;
2546 }
2547
2548 if (!work)
2549 return;
2550
2551 dtrace_xcall(DTRACE_CPUALL,
2552 (dtrace_xcall_t)dtrace_speculation_clean_here, state);
2553
2554 /*
2555 * We now know that all CPUs have committed or discarded their
2556 * speculation buffers, as appropriate. We can now set the state
2557 * to inactive.
2558 */
2559 for (i = 0; i < VBDTCAST(unsigned)state->dts_nspeculations; i++) {
2560 dtrace_speculation_t *spec = &state->dts_speculations[i];
2561 dtrace_speculation_state_t current, new;
2562
2563 if (!spec->dtsp_cleaning)
2564 continue;
2565
2566 current = spec->dtsp_state;
2567 ASSERT(current == DTRACESPEC_DISCARDING ||
2568 current == DTRACESPEC_COMMITTINGMANY);
2569
2570 new = DTRACESPEC_INACTIVE;
2571
2572 rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
2573 ASSERT(VBDTCAST(dtrace_speculation_state_t)rv == current);
2574 spec->dtsp_cleaning = 0;
2575 }
2576}
2577
2578/*
2579 * Called as part of a speculate() to get the speculative buffer associated
2580 * with a given speculation. Returns NULL if the specified speculation is not
2581 * in an ACTIVE state. If the speculation is in the ACTIVEONE state -- and
2582 * the active CPU is not the specified CPU -- the speculation will be
2583 * atomically transitioned into the ACTIVEMANY state.
2584 */
2585static dtrace_buffer_t *
2586dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
2587 dtrace_specid_t which)
2588{
2589 dtrace_speculation_t *spec;
2590 dtrace_speculation_state_t current, new VBDTUNASS(-1);
2591 dtrace_buffer_t *buf;
2592
2593 if (which == 0)
2594 return (NULL);
2595
2596 if (which > VBDTCAST(unsigned)state->dts_nspeculations) {
2597 cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2598 return (NULL);
2599 }
2600
2601 spec = &state->dts_speculations[which - 1];
2602 buf = &spec->dtsp_buffer[cpuid];
2603
2604 do {
2605 current = spec->dtsp_state;
2606
2607 switch (current) {
2608 case DTRACESPEC_INACTIVE:
2609 case DTRACESPEC_COMMITTINGMANY:
2610 case DTRACESPEC_DISCARDING:
2611 return (NULL);
2612
2613 case DTRACESPEC_COMMITTING:
2614 ASSERT(buf->dtb_offset == 0);
2615 return (NULL);
2616
2617 case DTRACESPEC_ACTIVEONE:
2618 /*
2619 * This speculation is currently active on one CPU.
2620 * Check the offset in the buffer; if it's non-zero,
2621 * that CPU must be us (and we leave the state alone).
2622 * If it's zero, assume that we're starting on a new
2623 * CPU -- and change the state to indicate that the
2624 * speculation is active on more than one CPU.
2625 */
2626 if (buf->dtb_offset != 0)
2627 return (buf);
2628
2629 new = DTRACESPEC_ACTIVEMANY;
2630 break;
2631
2632 case DTRACESPEC_ACTIVEMANY:
2633 return (buf);
2634
2635 case DTRACESPEC_ACTIVE:
2636 new = DTRACESPEC_ACTIVEONE;
2637 break;
2638
2639 default:
2640#ifndef VBOX
2641 ASSERT(0);
2642#else
2643 AssertFatalMsgFailed(("%d\n", current));
2644#endif
2645 }
2646 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2647 current, new) != current);
2648
2649 ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
2650 return (buf);
2651}
2652
2653/*
2654 * Return a string. In the event that the user lacks the privilege to access
2655 * arbitrary kernel memory, we copy the string out to scratch memory so that we
2656 * don't fail access checking.
2657 *
2658 * dtrace_dif_variable() uses this routine as a helper for various
2659 * builtin values such as 'execname' and 'probefunc.'
2660 */
2661VBDTSTATIC uintptr_t
2662dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
2663 dtrace_mstate_t *mstate)
2664{
2665 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
2666 uintptr_t ret;
2667 size_t strsz;
2668
2669 /*
2670 * The easy case: this probe is allowed to read all of memory, so
2671 * we can just return this as a vanilla pointer.
2672 */
2673 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
2674 return (addr);
2675
2676 /*
2677 * This is the tougher case: we copy the string in question from
2678 * kernel memory into scratch memory and return it that way: this
2679 * ensures that we won't trip up when access checking tests the
2680 * BYREF return value.
2681 */
2682 strsz = dtrace_strlen((char *)addr, size) + 1;
2683
2684 if (mstate->dtms_scratch_ptr + strsz >
2685 mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
2686 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
2687 return (NULL);
2688 }
2689
2690 dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
2691 strsz);
2692 ret = mstate->dtms_scratch_ptr;
2693 mstate->dtms_scratch_ptr += strsz;
2694 return (ret);
2695}
2696
2697/*
2698 * This function implements the DIF emulator's variable lookups. The emulator
2699 * passes a reserved variable identifier and optional built-in array index.
2700 */
2701static uint64_t
2702dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
2703 uint64_t ndx)
2704{
2705 /*
2706 * If we're accessing one of the uncached arguments, we'll turn this
2707 * into a reference in the args array.
2708 */
2709 if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
2710 ndx = v - DIF_VAR_ARG0;
2711 v = DIF_VAR_ARGS;
2712 }
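	/*
	 * That is, a D reference to the untyped arg5 is serviced by the
	 * same DIF_VAR_ARGS lookup below as a reference to args[5].
	 */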
2713
2714 switch (v) {
2715 case DIF_VAR_ARGS:
2716 ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
2717 if (ndx >= sizeof (mstate->dtms_arg) /
2718 sizeof (mstate->dtms_arg[0])) {
2719 int aframes = mstate->dtms_probe->dtpr_aframes + 2;
2720 dtrace_provider_t *pv;
2721 uint64_t val;
2722
2723 pv = mstate->dtms_probe->dtpr_provider;
2724 if (pv->dtpv_pops.dtps_getargval != NULL)
2725 val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
2726 mstate->dtms_probe->dtpr_id,
2727 mstate->dtms_probe->dtpr_arg, ndx, aframes);
2728 else
2729 val = dtrace_getarg(ndx, aframes);
2730
2731 /*
2732 * This is regrettably required to keep the compiler
2733 * from tail-optimizing the call to dtrace_getarg().
2734 * The condition always evaluates to true, but the
2735 * compiler has no way of figuring that out a priori.
2736 * (None of this would be necessary if the compiler
2737 * could be relied upon to _always_ tail-optimize
2738 * the call to dtrace_getarg() -- but it can't.)
2739 */
2740 if (mstate->dtms_probe != NULL)
2741 return (val);
2742
2743#ifndef VBOX
2744 ASSERT(0);
2745#else
2746 AssertFatalFailed();
2747#endif
2748 }
2749
2750 return (mstate->dtms_arg[ndx]);
2751
2752 case DIF_VAR_UREGS: {
2753#ifndef VBOX
2754 klwp_t *lwp;
2755
2756 if (!dtrace_priv_proc(state))
2757 return (0);
2758
2759 if ((lwp = curthread->t_lwp) == NULL) {
2760 DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
2761 cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_illval = NULL;
2762 return (0);
2763 }
2764
2765 return (dtrace_getreg(lwp->lwp_regs, ndx));
2766#else
2767 cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2768 return (0);
2769#endif
2770 }
2771
2772 case DIF_VAR_CURTHREAD:
2773 if (!dtrace_priv_kernel(state))
2774 return (0);
2775#ifndef VBOX
2776 return ((uint64_t)(uintptr_t)curthread);
2777#else
2778 return ((uintptr_t)RTThreadNativeSelf());
2779#endif
2780
2781 case DIF_VAR_TIMESTAMP:
2782 if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
2783 mstate->dtms_timestamp = dtrace_gethrtime();
2784 mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
2785 }
2786 return (mstate->dtms_timestamp);
2787
2788 case DIF_VAR_VTIMESTAMP:
2789#ifndef VBOX
2790 ASSERT(dtrace_vtime_references != 0);
2791 return (curthread->t_dtrace_vtime);
2792#else
2793 cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2794 return (0);
2795#endif
2796
2797 case DIF_VAR_WALLTIMESTAMP:
2798 if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
2799 mstate->dtms_walltimestamp = dtrace_gethrestime();
2800 mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
2801 }
2802 return (mstate->dtms_walltimestamp);
2803
2804 case DIF_VAR_IPL:
2805 if (!dtrace_priv_kernel(state))
2806 return (0);
2807 if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
2808 mstate->dtms_ipl = dtrace_getipl();
2809 mstate->dtms_present |= DTRACE_MSTATE_IPL;
2810 }
2811 return (mstate->dtms_ipl);
2812
2813 case DIF_VAR_EPID:
2814 ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
2815 return (mstate->dtms_epid);
2816
2817 case DIF_VAR_ID:
2818 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
2819 return (mstate->dtms_probe->dtpr_id);
2820
2821 case DIF_VAR_STACKDEPTH:
2822 if (!dtrace_priv_kernel(state))
2823 return (0);
2824 if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
2825 int aframes = mstate->dtms_probe->dtpr_aframes + 2;
2826
2827 mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
2828 mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
2829 }
2830 return (mstate->dtms_stackdepth);
2831
2832 case DIF_VAR_USTACKDEPTH:
2833 if (!dtrace_priv_proc(state))
2834 return (0);
2835 if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
2836 /*
2837 * See comment in DIF_VAR_PID.
2838 */
2839 if (DTRACE_ANCHORED(mstate->dtms_probe) &&
2840 CPU_ON_INTR(CPU)) {
2841 mstate->dtms_ustackdepth = 0;
2842 } else {
2843 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
2844 mstate->dtms_ustackdepth =
2845 dtrace_getustackdepth();
2846 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
2847 }
2848 mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
2849 }
2850 return (mstate->dtms_ustackdepth);
2851
2852 case DIF_VAR_CALLER:
2853 if (!dtrace_priv_kernel(state))
2854 return (0);
2855 if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
2856 int aframes = mstate->dtms_probe->dtpr_aframes + 2;
2857
2858 if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
2859 /*
2860 * If this is an unanchored probe, we are
2861 * required to go through the slow path:
2862 * dtrace_caller() only guarantees correct
2863 * results for anchored probes.
2864 */
2865 pc_t caller[2];
2866
2867 dtrace_getpcstack(caller, 2, aframes,
2868 (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
2869 mstate->dtms_caller = caller[1];
2870 } else if ((mstate->dtms_caller =
2871 dtrace_caller(aframes)) == VBDTCAST(uintptr_t)-1) {
2872 /*
2873 * We have failed to do this the quick way;
2874 * we must resort to the slower approach of
2875 * calling dtrace_getpcstack().
2876 */
2877 pc_t caller;
2878
2879 dtrace_getpcstack(&caller, 1, aframes, NULL);
2880 mstate->dtms_caller = caller;
2881 }
2882
2883 mstate->dtms_present |= DTRACE_MSTATE_CALLER;
2884 }
2885 return (mstate->dtms_caller);
2886
2887 case DIF_VAR_UCALLER:
2888 if (!dtrace_priv_proc(state))
2889 return (0);
2890
2891 if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
2892 uint64_t ustack[3];
2893
2894 /*
2895 * dtrace_getupcstack() fills in the first uint64_t
2896 * with the current PID. The second uint64_t will
2897 * be the program counter at user-level. The third
2898 * uint64_t will contain the caller, which is what
2899 * we're after.
2900 */
2901 ustack[2] = NULL;
2902 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
2903 dtrace_getupcstack(ustack, 3);
2904 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
2905 mstate->dtms_ucaller = ustack[2];
2906 mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
2907 }
2908
2909 return (mstate->dtms_ucaller);
2910
2911 case DIF_VAR_PROBEPROV:
2912 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
2913 return (dtrace_dif_varstr(
2914 (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
2915 state, mstate));
2916
2917 case DIF_VAR_PROBEMOD:
2918 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
2919 return (dtrace_dif_varstr(
2920 (uintptr_t)mstate->dtms_probe->dtpr_mod,
2921 state, mstate));
2922
2923 case DIF_VAR_PROBEFUNC:
2924 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
2925 return (dtrace_dif_varstr(
2926 (uintptr_t)mstate->dtms_probe->dtpr_func,
2927 state, mstate));
2928
2929 case DIF_VAR_PROBENAME:
2930 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
2931 return (dtrace_dif_varstr(
2932 (uintptr_t)mstate->dtms_probe->dtpr_name,
2933 state, mstate));
2934
2935 case DIF_VAR_PID:
2936 if (!dtrace_priv_proc(state))
2937 return (0);
2938
2939#ifndef VBOX
2940 /*
2941 * Note that we are assuming that an unanchored probe is
2942 * always due to a high-level interrupt. (And we're assuming
2943 * that there is only a single high level interrupt.)
2944 */
2945 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
2946 return (pid0.pid_id);
2947
2948 /*
2949 * It is always safe to dereference one's own t_procp pointer:
2950 * it always points to a valid, allocated proc structure.
2951 * Further, it is always safe to dereference the p_pidp member
2952	 * of one's own proc structure. (These are truisms because
2953 * threads and processes don't clean up their own state --
2954 * they leave that task to whomever reaps them.)
2955 */
2956 return ((uint64_t)curthread->t_procp->p_pidp->pid_id);
2957#else
2958 return (RTProcSelf());
2959#endif
2960
2961 case DIF_VAR_PPID:
2962 if (!dtrace_priv_proc(state))
2963 return (0);
2964
2965#ifndef VBOX
2966 /*
2967 * See comment in DIF_VAR_PID.
2968 */
2969 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
2970 return (pid0.pid_id);
2971
2972 /*
2973 * It is always safe to dereference one's own t_procp pointer:
2974 * it always points to a valid, allocated proc structure.
2975 * (This is true because threads don't clean up their own
2976 * state -- they leave that task to whomever reaps them.)
2977 */
2978 return ((uint64_t)curthread->t_procp->p_ppid);
2979#else
2980 cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2981 return (0); /** @todo parent pid? */
2982#endif
2983
2984 case DIF_VAR_TID:
2985#ifndef VBOX
2986 /*
2987 * See comment in DIF_VAR_PID.
2988 */
2989 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
2990 return (0);
2991
2992 return ((uint64_t)curthread->t_tid);
2993#else
2994 return (RTThreadNativeSelf()); /** @todo proper tid? */
2995#endif
2996
2997 case DIF_VAR_EXECNAME:
2998 if (!dtrace_priv_proc(state))
2999 return (0);
3000
3001#ifndef VBOX
3002 /*
3003 * See comment in DIF_VAR_PID.
3004 */
3005 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3006 return ((uint64_t)(uintptr_t)p0.p_user.u_comm);
3007
3008 /*
3009 * It is always safe to dereference one's own t_procp pointer:
3010 * it always points to a valid, allocated proc structure.
3011 * (This is true because threads don't clean up their own
3012 * state -- they leave that task to whomever reaps them.)
3013 */
3014 return (dtrace_dif_varstr(
3015 (uintptr_t)curthread->t_procp->p_user.u_comm,
3016 state, mstate));
3017#else
3018 cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3019 return (0); /** @todo execname */
3020#endif
3021
3022 case DIF_VAR_ZONENAME:
3023 if (!dtrace_priv_proc(state))
3024 return (0);
3025
3026#ifndef VBOX
3027 /*
3028 * See comment in DIF_VAR_PID.
3029 */
3030 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3031 return ((uint64_t)(uintptr_t)p0.p_zone->zone_name);
3032
3033 /*
3034 * It is always safe to dereference one's own t_procp pointer:
3035 * it always points to a valid, allocated proc structure.
3036 * (This is true because threads don't clean up their own
3037 * state -- they leave that task to whomever reaps them.)
3038 */
3039 return (dtrace_dif_varstr(
3040 (uintptr_t)curthread->t_procp->p_zone->zone_name,
3041 state, mstate));
3042#else
3043 cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3044 return (0);
3045#endif
3046
3047 case DIF_VAR_UID:
3048 if (!dtrace_priv_proc(state))
3049 return (0);
3050
3051#ifndef VBOX
3052 /*
3053 * See comment in DIF_VAR_PID.
3054 */
3055 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3056 return ((uint64_t)p0.p_cred->cr_uid);
3057
3058 /*
3059 * It is always safe to dereference one's own t_procp pointer:
3060 * it always points to a valid, allocated proc structure.
3061 * (This is true because threads don't clean up their own
3062 * state -- they leave that task to whomever reaps them.)
3063 *
3064 * Additionally, it is safe to dereference one's own process
3065 * credential, since this is never NULL after process birth.
3066 */
3067 return ((uint64_t)curthread->t_procp->p_cred->cr_uid);
3068#else
3069 cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3070 return (0);
3071#endif
3072
3073 case DIF_VAR_GID:
3074 if (!dtrace_priv_proc(state))
3075 return (0);
3076
3077#ifndef VBOX
3078 /*
3079 * See comment in DIF_VAR_PID.
3080 */
3081 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3082 return ((uint64_t)p0.p_cred->cr_gid);
3083
3084 /*
3085 * It is always safe to dereference one's own t_procp pointer:
3086 * it always points to a valid, allocated proc structure.
3087 * (This is true because threads don't clean up their own
3088 * state -- they leave that task to whomever reaps them.)
3089 *
3090 * Additionally, it is safe to dereference one's own process
3091 * credential, since this is never NULL after process birth.
3092 */
3093 return ((uint64_t)curthread->t_procp->p_cred->cr_gid);
3094#else
3095 cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3096 return (0);
3097#endif
3098
3099 case DIF_VAR_ERRNO: {
3100#ifndef VBOX
3101 klwp_t *lwp;
3102#endif
3103 if (!dtrace_priv_proc(state))
3104 return (0);
3105
3106#ifndef VBOX
3107 /*
3108 * See comment in DIF_VAR_PID.
3109 */
3110 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3111 return (0);
3112
3113 /*
3114 * It is always safe to dereference one's own t_lwp pointer in
3115 * the event that this pointer is non-NULL. (This is true
3116 * because threads and lwps don't clean up their own state --
3117 * they leave that task to whomever reaps them.)
3118 */
3119 if ((lwp = curthread->t_lwp) == NULL)
3120 return (0);
3121
3122 return ((uint64_t)lwp->lwp_errno);
3123#else
3124 cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3125 return (0);
3126#endif
3127 }
3128 default:
3129 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3130 return (0);
3131 }
3132}
3133
3134/*
3135 * Emulate the execution of DTrace ID subroutines invoked by the call opcode.
3136 * Notice that we don't bother validating the proper number of arguments or
3137	 * their types in the tuple stack. This isn't needed: thanks to our load
3138	 * safety, all argument interpretation is safe -- the worst that can
3139	 * happen is that a bogus program can obtain bogus results.
3140 */
3141static void
3142dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
3143 dtrace_key_t *tupregs, int nargs,
3144 dtrace_mstate_t *mstate, dtrace_state_t *state)
3145{
3146 volatile uint16_t *flags = &cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags;
3147 volatile uintptr_t *illval = &cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_illval;
3148 dtrace_vstate_t *vstate = &state->dts_vstate;
3149
3150#ifndef VBOX
3151 union {
3152 mutex_impl_t mi;
3153 uint64_t mx;
3154 } m;
3155
3156 union {
3157 krwlock_t ri;
3158 uintptr_t rw;
3159 } r;
3160#endif
3161
3162 switch (subr) {
3163 case DIF_SUBR_RAND:
3164 regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875;
3165 break;
3166
3167 case DIF_SUBR_MUTEX_OWNED:
3168#ifndef VBOX
3169 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3170 mstate, vstate)) {
3171 regs[rd] = NULL;
3172 break;
3173 }
3174
3175 m.mx = dtrace_load64(tupregs[0].dttk_value);
3176 if (MUTEX_TYPE_ADAPTIVE(&m.mi))
3177 regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
3178 else
3179 regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
3180#else
3181 regs[rd] = 0;
3182 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3183#endif
3184 break;
3185
3186 case DIF_SUBR_MUTEX_OWNER:
3187#ifndef VBOX
3188 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3189 mstate, vstate)) {
3190 regs[rd] = NULL;
3191 break;
3192 }
3193
3194 m.mx = dtrace_load64(tupregs[0].dttk_value);
3195 if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
3196 MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
3197 regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
3198 else
3199 regs[rd] = 0;
3200#else
3201 regs[rd] = 0;
3202 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3203#endif
3204 break;
3205
3206 case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
3207#ifndef VBOX
3208 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3209 mstate, vstate)) {
3210 regs[rd] = NULL;
3211 break;
3212 }
3213
3214 m.mx = dtrace_load64(tupregs[0].dttk_value);
3215 regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
3216#else
3217 regs[rd] = 0;
3218 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3219#endif
3220 break;
3221
3222 case DIF_SUBR_MUTEX_TYPE_SPIN:
3223#ifndef VBOX
3224 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3225 mstate, vstate)) {
3226 regs[rd] = NULL;
3227 break;
3228 }
3229
3230 m.mx = dtrace_load64(tupregs[0].dttk_value);
3231 regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
3232#else
3233 regs[rd] = 0;
3234 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3235#endif
3236 break;
3237
3238 case DIF_SUBR_RW_READ_HELD: {
3239#ifndef VBOX
3240 uintptr_t tmp;
3241
3242 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
3243 mstate, vstate)) {
3244 regs[rd] = NULL;
3245 break;
3246 }
3247
3248 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3249 regs[rd] = _RW_READ_HELD(&r.ri, tmp);
3250#else
3251 regs[rd] = 0;
3252 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3253#endif
3254 break;
3255 }
3256
3257 case DIF_SUBR_RW_WRITE_HELD:
3258#ifndef VBOX
3259 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
3260 mstate, vstate)) {
3261 regs[rd] = NULL;
3262 break;
3263 }
3264
3265 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3266 regs[rd] = _RW_WRITE_HELD(&r.ri);
3267#else
3268 regs[rd] = 0;
3269 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3270#endif
3271 break;
3272
3273 case DIF_SUBR_RW_ISWRITER:
3274#ifndef VBOX
3275 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
3276 mstate, vstate)) {
3277 regs[rd] = NULL;
3278 break;
3279 }
3280
3281 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3282 regs[rd] = _RW_ISWRITER(&r.ri);
3283#else
3284 regs[rd] = 0;
3285 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3286#endif
3287 break;
3288
3289 case DIF_SUBR_BCOPY: {
3290 /*
3291 * We need to be sure that the destination is in the scratch
3292 * region -- no other region is allowed.
3293 */
3294 uintptr_t src = tupregs[0].dttk_value;
3295 uintptr_t dest = tupregs[1].dttk_value;
3296 size_t size = tupregs[2].dttk_value;
3297
3298 if (!dtrace_inscratch(dest, size, mstate)) {
3299 *flags |= CPU_DTRACE_BADADDR;
3300 *illval = regs[rd];
3301 break;
3302 }
3303
3304 if (!dtrace_canload(src, size, mstate, vstate)) {
3305 regs[rd] = NULL;
3306 break;
3307 }
3308
3309 dtrace_bcopy((void *)src, (void *)dest, size);
3310 break;
3311 }
3312
3313 case DIF_SUBR_ALLOCA:
3314 case DIF_SUBR_COPYIN: {
3315 uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
3316 uint64_t size =
3317 tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
3318 size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
3319
3320 /*
3321 * This action doesn't require any credential checks since
3322 * probes will not activate in user contexts to which the
3323 * enabling user does not have permissions.
3324 */
3325
3326 /*
3327 * Rounding up the user allocation size could have overflowed
3328 * a large, bogus allocation (like -1ULL) to 0.
3329 */
3330 if (scratch_size < size ||
3331 !DTRACE_INSCRATCH(mstate, scratch_size)) {
3332 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3333 regs[rd] = NULL;
3334 break;
3335 }
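		/*
		 * (Concretely: with size = -1ULL and a scratch pointer
		 * four bytes shy of 8-byte alignment, scratch_size
		 * wraps around to 3 -- which the scratch_size < size
		 * test above catches.)
		 */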
3336
3337 if (subr == DIF_SUBR_COPYIN) {
3338 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3339 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3340 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3341 }
3342
3343 mstate->dtms_scratch_ptr += scratch_size;
3344 regs[rd] = dest;
3345 break;
3346 }
3347
3348 case DIF_SUBR_COPYINTO: {
3349 uint64_t size = tupregs[1].dttk_value;
3350 uintptr_t dest = tupregs[2].dttk_value;
3351
3352 /*
3353 * This action doesn't require any credential checks since
3354 * probes will not activate in user contexts to which the
3355 * enabling user does not have permissions.
3356 */
3357 if (!dtrace_inscratch(dest, size, mstate)) {
3358 *flags |= CPU_DTRACE_BADADDR;
3359 *illval = regs[rd];
3360 break;
3361 }
3362
3363 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3364 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3365 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3366 break;
3367 }
3368
3369 case DIF_SUBR_COPYINSTR: {
3370 uintptr_t dest = mstate->dtms_scratch_ptr;
3371 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3372
3373 if (nargs > 1 && tupregs[1].dttk_value < size)
3374 size = tupregs[1].dttk_value + 1;
3375
3376 /*
3377 * This action doesn't require any credential checks since
3378 * probes will not activate in user contexts to which the
3379 * enabling user does not have permissions.
3380 */
3381 if (!DTRACE_INSCRATCH(mstate, size)) {
3382 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3383 regs[rd] = NULL;
3384 break;
3385 }
3386
3387 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3388 dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
3389 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3390
3391 ((char *)dest)[size - 1] = '\0';
3392 mstate->dtms_scratch_ptr += size;
3393 regs[rd] = dest;
3394 break;
3395 }
3396
3397 case DIF_SUBR_MSGSIZE:
3398 case DIF_SUBR_MSGDSIZE: {
3399#ifndef VBOX
3400 uintptr_t baddr = tupregs[0].dttk_value, daddr;
3401 uintptr_t wptr, rptr;
3402 size_t count = 0;
3403 int cont = 0;
3404
3405 while (baddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
3406
3407 if (!dtrace_canload(baddr, sizeof (mblk_t), mstate,
3408 vstate)) {
3409 regs[rd] = NULL;
3410 break;
3411 }
3412
3413 wptr = dtrace_loadptr(baddr +
3414 offsetof(mblk_t, b_wptr));
3415
3416 rptr = dtrace_loadptr(baddr +
3417 offsetof(mblk_t, b_rptr));
3418
3419 if (wptr < rptr) {
3420 *flags |= CPU_DTRACE_BADADDR;
3421 *illval = tupregs[0].dttk_value;
3422 break;
3423 }
3424
3425 daddr = dtrace_loadptr(baddr +
3426 offsetof(mblk_t, b_datap));
3427
3428 baddr = dtrace_loadptr(baddr +
3429 offsetof(mblk_t, b_cont));
3430
3431 /*
3432	 * We want to protect against denial-of-service here,
3433 * so we're only going to search the list for
3434 * dtrace_msgdsize_max mblks.
3435 */
3436 if (cont++ > dtrace_msgdsize_max) {
3437 *flags |= CPU_DTRACE_ILLOP;
3438 break;
3439 }
3440
3441 if (subr == DIF_SUBR_MSGDSIZE) {
3442 if (dtrace_load8(daddr +
3443 offsetof(dblk_t, db_type)) != M_DATA)
3444 continue;
3445 }
3446
3447 count += wptr - rptr;
3448 }
3449
3450 if (!(*flags & CPU_DTRACE_FAULT))
3451 regs[rd] = count;
3452
3453#else
3454 regs[rd] = 0;
3455 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3456#endif
3457 break;
3458 }
3459
3460 case DIF_SUBR_PROGENYOF: {
3461#ifndef VBOX
3462 pid_t pid = tupregs[0].dttk_value;
3463 proc_t *p;
3464 int rval = 0;
3465
3466 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3467
3468 for (p = curthread->t_procp; p != NULL; p = p->p_parent) {
3469 if (p->p_pidp->pid_id == pid) {
3470 rval = 1;
3471 break;
3472 }
3473 }
3474
3475 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3476
3477 regs[rd] = rval;
3478#else
3479 regs[rd] = 0;
3480 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3481#endif
3482 break;
3483 }
3484
3485 case DIF_SUBR_SPECULATION:
3486 regs[rd] = dtrace_speculation(state);
3487 break;
3488
3489 case DIF_SUBR_COPYOUT: {
3490 uintptr_t kaddr = tupregs[0].dttk_value;
3491 uintptr_t uaddr = tupregs[1].dttk_value;
3492 uint64_t size = tupregs[2].dttk_value;
3493
3494 if (!dtrace_destructive_disallow &&
3495 dtrace_priv_proc_control(state) &&
3496 !dtrace_istoxic(kaddr, size)) {
3497 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3498 dtrace_copyout(kaddr, uaddr, size, flags);
3499 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3500 }
3501 break;
3502 }
3503
3504 case DIF_SUBR_COPYOUTSTR: {
3505 uintptr_t kaddr = tupregs[0].dttk_value;
3506 uintptr_t uaddr = tupregs[1].dttk_value;
3507 uint64_t size = tupregs[2].dttk_value;
3508
3509 if (!dtrace_destructive_disallow &&
3510 dtrace_priv_proc_control(state) &&
3511 !dtrace_istoxic(kaddr, size)) {
3512 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3513 dtrace_copyoutstr(kaddr, uaddr, size, flags);
3514 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3515 }
3516 break;
3517 }
3518
3519 case DIF_SUBR_STRLEN: {
3520 size_t sz;
3521 uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
3522 sz = dtrace_strlen((char *)addr,
3523 state->dts_options[DTRACEOPT_STRSIZE]);
3524
3525 if (!dtrace_canload(addr, sz + 1, mstate, vstate)) {
3526 regs[rd] = NULL;
3527 break;
3528 }
3529
3530 regs[rd] = sz;
3531
3532 break;
3533 }
3534
3535 case DIF_SUBR_STRCHR:
3536 case DIF_SUBR_STRRCHR: {
3537 /*
3538 * We're going to iterate over the string looking for the
3539 * specified character. We will iterate until we have reached
3540 * the string length or we have found the character. If this
3541 * is DIF_SUBR_STRRCHR, we will look for the last occurrence
3542 * of the specified character instead of the first.
3543 */
3544 uintptr_t saddr = tupregs[0].dttk_value;
3545 uintptr_t addr = tupregs[0].dttk_value;
3546 uintptr_t limit = addr + state->dts_options[DTRACEOPT_STRSIZE];
3547 char c, target = (char)tupregs[1].dttk_value;
3548
3549 for (regs[rd] = NULL; addr < limit; addr++) {
3550 if ((c = dtrace_load8(addr)) == target) {
3551 regs[rd] = addr;
3552
3553 if (subr == DIF_SUBR_STRCHR)
3554 break;
3555 }
3556
3557 if (c == '\0')
3558 break;
3559 }
3560
3561 if (!dtrace_canload(saddr, addr - saddr, mstate, vstate)) {
3562 regs[rd] = NULL;
3563 break;
3564 }
3565
3566 break;
3567 }
3568
3569 case DIF_SUBR_STRSTR:
3570 case DIF_SUBR_INDEX:
3571 case DIF_SUBR_RINDEX: {
3572 /*
3573 * We're going to iterate over the string looking for the
3574 * specified string. We will iterate until we have reached
3575 * the string length or we have found the string. (Yes, this
3576 * is done in the most naive way possible -- but considering
3577 * that the string we're searching for is likely to be
3578 * relatively short, the complexity of Rabin-Karp or similar
3579 * hardly seems merited.)
3580 */
3581 char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
3582 char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
3583 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3584 size_t len = dtrace_strlen(addr, size);
3585 size_t sublen = dtrace_strlen(substr, size);
3586 char *limit = addr + len, *orig = addr;
3587 int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
3588 int inc = 1;
3589
3590 regs[rd] = notfound;
3591
3592 if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
3593 regs[rd] = NULL;
3594 break;
3595 }
3596
3597 if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
3598 vstate)) {
3599 regs[rd] = NULL;
3600 break;
3601 }
3602
3603 /*
3604 * strstr() and index()/rindex() have similar semantics if
3605 * both strings are the empty string: strstr() returns a
3606 * pointer to the (empty) string, and index() and rindex()
3607 * both return index 0 (regardless of any position argument).
3608 */
3609 if (sublen == 0 && len == 0) {
3610 if (subr == DIF_SUBR_STRSTR)
3611 regs[rd] = (uintptr_t)addr;
3612 else
3613 regs[rd] = 0;
3614 break;
3615 }
3616
3617 if (subr != DIF_SUBR_STRSTR) {
3618 if (subr == DIF_SUBR_RINDEX) {
3619 limit = orig - 1;
3620 addr += len;
3621 inc = -1;
3622 }
3623
3624 /*
3625 * Both index() and rindex() take an optional position
3626 * argument that denotes the starting position.
3627 */
3628 if (nargs == 3) {
3629 int64_t pos = (int64_t)tupregs[2].dttk_value;
3630
3631 /*
3632 * If the position argument to index() is
3633 * negative, Perl implicitly clamps it at
3634 * zero. This semantic is a little surprising
3635 * given the special meaning of negative
3636 * positions to similar Perl functions like
3637 * substr(), but it appears to reflect a
3638 * notion that index() can start from a
3639 * negative index and increment its way up to
3640 * the string. Given this notion, Perl's
3641 * rindex() is at least self-consistent in
3642 * that it implicitly clamps positions greater
3643 * than the string length to be the string
3644 * length. Where Perl completely loses
3645 * coherence, however, is when the specified
3646 * substring is the empty string (""). In
3647 * this case, even if the position is
3648 * negative, rindex() returns 0 -- and even if
3649 * the position is greater than the length,
3650 * index() returns the string length. These
3651 * semantics violate the notion that index()
3652 * should never return a value less than the
3653 * specified position and that rindex() should
3654 * never return a value greater than the
3655 * specified position. (One assumes that
3656 * these semantics are artifacts of Perl's
3657 * implementation and not the results of
3658 * deliberate design -- it beggars belief that
3659 * even Larry Wall could desire such oddness.)
3660 * While in the abstract one would wish for
3661 * consistent position semantics across
3662 * substr(), index() and rindex() -- or at the
3663 * very least self-consistent position
3664 * semantics for index() and rindex() -- we
3665 * instead opt to keep with the extant Perl
3666 * semantics, in all their broken glory. (Do
3667 * we have more desire to maintain Perl's
3668 * semantics than Perl does? Probably.)
3669 */
3670 if (subr == DIF_SUBR_RINDEX) {
3671 if (pos < 0) {
3672 if (sublen == 0)
3673 regs[rd] = 0;
3674 break;
3675 }
3676
3677 if (VBDTCAST(uint64_t)pos > len)
3678 pos = len;
3679 } else {
3680 if (pos < 0)
3681 pos = 0;
3682
3683 if (VBDTCAST(uint64_t)pos >= len) {
3684 if (sublen == 0)
3685 regs[rd] = len;
3686 break;
3687 }
3688 }
3689
3690 addr = orig + pos;
3691 }
3692 }
3693
3694 for (regs[rd] = notfound; addr != limit; addr += inc) {
3695 if (dtrace_strncmp(addr, substr, sublen) == 0) {
3696 if (subr != DIF_SUBR_STRSTR) {
3697 /*
3698 * As D index() and rindex() are
3699 * modeled on Perl (and not on awk),
3700 * we return a zero-based (and not a
3701 * one-based) index. (For you Perl
3702 * weenies: no, we're not going to add
3703 * $[ -- and shouldn't you be at a con
3704 * or something?)
3705 */
3706 regs[rd] = (uintptr_t)(addr - orig);
3707 break;
3708 }
3709
3710 ASSERT(subr == DIF_SUBR_STRSTR);
3711 regs[rd] = (uintptr_t)addr;
3712 break;
3713 }
3714 }
3715
3716 break;
3717 }
3718
3719 case DIF_SUBR_STRTOK: {
3720 uintptr_t addr = tupregs[0].dttk_value;
3721 uintptr_t tokaddr = tupregs[1].dttk_value;
3722 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3723 uintptr_t limit, toklimit = tokaddr + size;
3724 uint8_t c VBDTUNASS(0), tokmap[32]; /* 256 / 8 */
3725 char *dest = (char *)mstate->dtms_scratch_ptr;
3726 VBDTTYPE(unsigned,int) i;
3727
3728 /*
3729 * Check both the token buffer and (later) the input buffer,
3730 * since both could be non-scratch addresses.
3731 */
3732 if (!dtrace_strcanload(tokaddr, size, mstate, vstate)) {
3733 regs[rd] = NULL;
3734 break;
3735 }
3736
3737 if (!DTRACE_INSCRATCH(mstate, size)) {
3738 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3739 regs[rd] = NULL;
3740 break;
3741 }
3742
3743 if (addr == NULL) {
3744 /*
3745 * If the address specified is NULL, we use our saved
3746 * strtok pointer from the mstate. Note that this
3747 * means that the saved strtok pointer is _only_
3748 * valid within multiple enablings of the same probe --
3749 * it behaves like an implicit clause-local variable.
3750 */
3751 addr = mstate->dtms_strtok;
3752 } else {
3753 /*
3754 * If the user-specified address is non-NULL we must
3755 * access check it. This is the only time we have
3756 * a chance to do so, since this address may reside
3757 * in the string table of this clause -- future calls
3758 * (when we fetch addr from mstate->dtms_strtok)
3759 * would fail this access check.
3760 */
3761 if (!dtrace_strcanload(addr, size, mstate, vstate)) {
3762 regs[rd] = NULL;
3763 break;
3764 }
3765 }
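/*
 * Illustrative usage (not in the original source), as D expressions:
 *
 *	strtok("/tmp//x", "/")	=> "tmp"	(position saved in mstate)
 *	strtok(NULL, "/")	=> "x"		(resumes from saved position)
 */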
3766
3767 /*
3768 * First, zero the token map, and then process the token
3769 * string -- setting a bit in the map for every character
3770 * found in the token string.
3771 */
3772 for (i = 0; i < sizeof (tokmap); i++)
3773 tokmap[i] = 0;
3774
3775 for (; tokaddr < toklimit; tokaddr++) {
3776 if ((c = dtrace_load8(tokaddr)) == '\0')
3777 break;
3778
3779 ASSERT((c >> 3) < sizeof (tokmap));
3780 tokmap[c >> 3] |= (1 << (c & 0x7));
3781 }
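/*
 * Illustrative note (not in the original source): tokmap is a 256-bit
 * bitmap, one bit per possible byte value. For c == '/' (0x2f), for
 * example, the bit set above and tested below is
 *
 *	tokmap[0x2f >> 3] & (1 << (0x2f & 0x7)), i.e. bit 7 of tokmap[5].
 */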
3782
3783 for (limit = addr + size; addr < limit; addr++) {
3784 /*
3785 * We're looking for a character that is _not_ contained
3786 * in the token string.
3787 */
3788 if ((c = dtrace_load8(addr)) == '\0')
3789 break;
3790
3791 if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
3792 break;
3793 }
3794
3795 if (c == '\0') {
3796 /*
3797 * We reached the end of the string without finding
3798 * any character that was not in the token string.
3799 * We return NULL in this case, and we set the saved
3800 * address to NULL as well.
3801 */
3802 regs[rd] = NULL;
3803 mstate->dtms_strtok = NULL;
3804 break;
3805 }
3806
3807 /*
3808 * From here on, we're copying into the destination string.
3809 */
3810 for (i = 0; addr < limit && i < size - 1; addr++) {
3811 if ((c = dtrace_load8(addr)) == '\0')
3812 break;
3813
3814 if (tokmap[c >> 3] & (1 << (c & 0x7)))
3815 break;
3816
3817 ASSERT(i < size);
3818 dest[i++] = c;
3819 }
3820
3821 ASSERT(i < size);
3822 dest[i] = '\0';
3823 regs[rd] = (uintptr_t)dest;
3824 mstate->dtms_scratch_ptr += size;
3825 mstate->dtms_strtok = addr;
3826 break;
3827 }
3828
3829 case DIF_SUBR_SUBSTR: {
3830 uintptr_t s = tupregs[0].dttk_value;
3831 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3832 char *d = (char *)mstate->dtms_scratch_ptr;
3833 int64_t index = (int64_t)tupregs[1].dttk_value;
3834 int64_t remaining = (int64_t)tupregs[2].dttk_value;
3835 size_t len = dtrace_strlen((char *)s, size);
3836 int64_t i;
3837
3838 if (!dtrace_canload(s, len + 1, mstate, vstate)) {
3839 regs[rd] = NULL;
3840 break;
3841 }
3842
3843 if (!DTRACE_INSCRATCH(mstate, size)) {
3844 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3845 regs[rd] = NULL;
3846 break;
3847 }
3848
3849 if (nargs <= 2)
3850 remaining = (int64_t)size;
3851
3852 if (index < 0) {
3853 index += len;
3854
3855 if (index < 0 && index + remaining > 0) {
3856 remaining += index;
3857 index = 0;
3858 }
3859 }
3860
3861 if (VBDTCAST(uint64_t)index >= len || index < 0) {
3862 remaining = 0;
3863 } else if (remaining < 0) {
3864 remaining += len - index;
3865 } else if (VBDTCAST(uint64_t)index + remaining > size) {
3866 remaining = size - index;
3867 }
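/*
 * Worked examples (for illustration only, not from the original
 * source) of the clamping above:
 *
 *	substr("hello", 1)	=> "ello"	(remaining defaults to size)
 *	substr("hello", -3, 2)	=> "ll"		(index counted from the end)
 *	substr("hello", 9)	=> ""		(index past the end)
 */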
3868
3869 for (i = 0; i < remaining; i++) {
3870 if ((d[i] = dtrace_load8(s + index + i)) == '\0')
3871 break;
3872 }
3873
3874 d[i] = '\0';
3875
3876 mstate->dtms_scratch_ptr += size;
3877 regs[rd] = (uintptr_t)d;
3878 break;
3879 }
3880
3881 case DIF_SUBR_GETMAJOR:
3882#ifndef VBOX
3883#ifdef _LP64
3884 regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR64) & MAXMAJ64;
3885#else
3886 regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR) & MAXMAJ;
3887#endif
3888#else
3889 regs[rd] = 0;
3890 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3891#endif
3892 break;
3893
3894 case DIF_SUBR_GETMINOR:
3895#ifndef VBOX
3896#ifdef _LP64
3897 regs[rd] = tupregs[0].dttk_value & MAXMIN64;
3898#else
3899 regs[rd] = tupregs[0].dttk_value & MAXMIN;
3900#endif
3901#else
3902 regs[rd] = 0;
3903 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3904#endif
3905 break;
3906
3907 case DIF_SUBR_DDI_PATHNAME: {
3908#ifndef VBOX
3909 /*
3910 * This one is a galactic mess. We are going to roughly
3911 * emulate ddi_pathname(), but it's made more complicated
3912 * by the fact that we (a) want to include the minor name and
3913 * (b) must proceed iteratively instead of recursively.
3914 */
3915 uintptr_t dest = mstate->dtms_scratch_ptr;
3916 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3917 char *start = (char *)dest, *end = start + size - 1;
3918 uintptr_t daddr = tupregs[0].dttk_value;
3919 int64_t minor = (int64_t)tupregs[1].dttk_value;
3920 char *s;
3921 int i, len, depth = 0;
3922
3923 /*
3924 * Due to all the pointer jumping we do and context we must
3925 * rely upon, we just mandate that the user must have kernel
3926 * read privileges to use this routine.
3927 */
3928 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) == 0) {
3929 *flags |= CPU_DTRACE_KPRIV;
3930 *illval = daddr;
3931 regs[rd] = NULL;
3932 }
3933
3934 if (!DTRACE_INSCRATCH(mstate, size)) {
3935 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3936 regs[rd] = NULL;
3937 break;
3938 }
3939
3940 *end = '\0';
3941
3942 /*
3943 * We want to have a name for the minor. In order to do this,
3944 * we need to walk the minor list from the devinfo. We want
3945 * to be sure that we don't infinitely walk a circular list,
3946 * so we check for circularity by sending a scout pointer
3947 * ahead two elements for every element that we iterate over;
3948 * if the list is circular, these will ultimately point to the
3949 * same element. You may recognize this little trick as the
3950 * answer to a stupid interview question -- one that always
3951 * seems to be asked by those who had to have it laboriously
3952 * explained to them, and who can't even concisely describe
3953 * the conditions under which one would be forced to resort to
3954 * this technique. Needless to say, those conditions are
3955 * found here -- and probably only here. Is this the only use
3956 * of this infamous trick in shipping, production code? If it
3957 * isn't, it probably should be...
3958 */
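/*
 * A minimal standalone sketch of that two-pointer cycle check
 * (illustrative only -- "node_t" and its "next" field are
 * hypothetical, not from this source): the scout advances two links
 * for every one the walker advances, so the two can meet again only
 * if the list is circular.
 *
 *	static int
 *	list_is_circular(node_t *head)
 *	{
 *		node_t *walk = head, *scout = head;
 *
 *		while (scout != NULL && scout->next != NULL) {
 *			walk = walk->next;
 *			scout = scout->next->next;
 *			if (walk == scout)
 *				return (1);
 *		}
 *		return (0);
 *	}
 */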
3959 if (minor != -1) {
3960 uintptr_t maddr = dtrace_loadptr(daddr +
3961 offsetof(struct dev_info, devi_minor));
3962
3963 uintptr_t next = offsetof(struct ddi_minor_data, next);
3964 uintptr_t name = offsetof(struct ddi_minor_data,
3965 d_minor) + offsetof(struct ddi_minor, name);
3966 uintptr_t dev = offsetof(struct ddi_minor_data,
3967 d_minor) + offsetof(struct ddi_minor, dev);
3968 uintptr_t scout;
3969
3970 if (maddr != NULL)
3971 scout = dtrace_loadptr(maddr + next);
3972
3973 while (maddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
3974 uint64_t m;
3975#ifdef _LP64
3976 m = dtrace_load64(maddr + dev) & MAXMIN64;
3977#else
3978 m = dtrace_load32(maddr + dev) & MAXMIN;
3979#endif
3980 if (m != minor) {
3981 maddr = dtrace_loadptr(maddr + next);
3982
3983 if (scout == NULL)
3984 continue;
3985
3986 scout = dtrace_loadptr(scout + next);
3987
3988 if (scout == NULL)
3989 continue;
3990
3991 scout = dtrace_loadptr(scout + next);
3992
3993 if (scout == NULL)
3994 continue;
3995
3996 if (scout == maddr) {
3997 *flags |= CPU_DTRACE_ILLOP;
3998 break;
3999 }
4000
4001 continue;
4002 }
4003
4004 /*
4005 * We have the minor data. Now we need to
4006 * copy the minor's name into the end of the
4007 * pathname.
4008 */
4009 s = (char *)dtrace_loadptr(maddr + name);
4010 len = dtrace_strlen(s, size);
4011
4012 if (*flags & CPU_DTRACE_FAULT)
4013 break;
4014
4015 if (len != 0) {
4016 if ((end -= (len + 1)) < start)
4017 break;
4018
4019 *end = ':';
4020 }
4021
4022 for (i = 1; i <= len; i++)
4023 end[i] = dtrace_load8((uintptr_t)s++);
4024 break;
4025 }
4026 }
4027
4028 while (daddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
4029 ddi_node_state_t devi_state;
4030
4031 devi_state = dtrace_load32(daddr +
4032 offsetof(struct dev_info, devi_node_state));
4033
4034 if (*flags & CPU_DTRACE_FAULT)
4035 break;
4036
4037 if (devi_state >= DS_INITIALIZED) {
4038 s = (char *)dtrace_loadptr(daddr +
4039 offsetof(struct dev_info, devi_addr));
4040 len = dtrace_strlen(s, size);
4041
4042 if (*flags & CPU_DTRACE_FAULT)
4043 break;
4044
4045 if (len != 0) {
4046 if ((end -= (len + 1)) < start)
4047 break;
4048
4049 *end = '@';
4050 }
4051
4052 for (i = 1; i <= len; i++)
4053 end[i] = dtrace_load8((uintptr_t)s++);
4054 }
4055
4056 /*
4057 * Now for the node name...
4058 */
4059 s = (char *)dtrace_loadptr(daddr +
4060 offsetof(struct dev_info, devi_node_name));
4061
4062 daddr = dtrace_loadptr(daddr +
4063 offsetof(struct dev_info, devi_parent));
4064
4065 /*
4066 * If our parent is NULL (that is, if we're the root
4067 * node), we're going to use the special path
4068 * "devices".
4069 */
4070 if (daddr == NULL)
4071 s = "devices";
4072
4073 len = dtrace_strlen(s, size);
4074 if (*flags & CPU_DTRACE_FAULT)
4075 break;
4076
4077 if ((end -= (len + 1)) < start)
4078 break;
4079
4080 for (i = 1; i <= len; i++)
4081 end[i] = dtrace_load8((uintptr_t)s++);
4082 *end = '/';
4083
4084 if (depth++ > dtrace_devdepth_max) {
4085 *flags |= CPU_DTRACE_ILLOP;
4086 break;
4087 }
4088 }
4089
4090 if (end < start)
4091 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4092
4093 if (daddr == NULL) {
4094 regs[rd] = (uintptr_t)end;
4095 mstate->dtms_scratch_ptr += size;
4096 }
4097
4098#else
4099 regs[rd] = 0;
4100 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4101#endif
4102 break;
4103 }
4104
4105 case DIF_SUBR_STRJOIN: {
4106 char *d = (char *)mstate->dtms_scratch_ptr;
4107 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4108 uintptr_t s1 = tupregs[0].dttk_value;
4109 uintptr_t s2 = tupregs[1].dttk_value;
4110 VBDTTYPE(unsigned,int) i = 0;
4111
4112 if (!dtrace_strcanload(s1, size, mstate, vstate) ||
4113 !dtrace_strcanload(s2, size, mstate, vstate)) {
4114 regs[rd] = NULL;
4115 break;
4116 }
4117
4118 if (!DTRACE_INSCRATCH(mstate, size)) {
4119 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4120 regs[rd] = NULL;
4121 break;
4122 }
4123
4124 for (;;) {
4125 if (i >= size) {
4126 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4127 regs[rd] = NULL;
4128 break;
4129 }
4130
4131 if ((d[i++] = dtrace_load8(s1++)) == '\0') {
4132 i--;
4133 break;
4134 }
4135 }
4136
4137 for (;;) {
4138 if (i >= size) {
4139 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4140 regs[rd] = NULL;
4141 break;
4142 }
4143
4144 if ((d[i++] = dtrace_load8(s2++)) == '\0')
4145 break;
4146 }
4147
4148 if (i < size) {
4149 mstate->dtms_scratch_ptr += i;
4150 regs[rd] = (uintptr_t)d;
4151 }
4152
4153 break;
4154 }
4155
4156 case DIF_SUBR_LLTOSTR: {
4157 int64_t i = (int64_t)tupregs[0].dttk_value;
4158 uint64_t val = i < 0 ? -(uint64_t)i : (uint64_t)i; /* unsigned: -INT64_MIN overflows int64_t */
4159 uint64_t size = 22; /* enough room for 2^64 in decimal */
4160 char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
4161
4162 if (!DTRACE_INSCRATCH(mstate, size)) {
4163 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4164 regs[rd] = NULL;
4165 break;
4166 }
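/*
 * Illustrative note (not in the original source): digits are laid
 * down from the end of a fixed 22-byte window -- 20 digits cover
 * UINT64_MAX (about 1.8e19), plus one byte for a possible '-' and
 * one for the terminating NUL. For example, lltostr(-42) leaves
 * '-', '4', '2', '\0' at the tail of the window, and regs[rd] ends
 * up pointing at the '-'.
 */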
4167
4168 for (*end-- = '\0'; val; val /= 10)
4169 *end-- = '0' + (val % 10);
4170
4171 if (i == 0)
4172 *end-- = '0';
4173
4174 if (i < 0)
4175 *end-- = '-';
4176
4177 regs[rd] = (uintptr_t)end + 1;
4178 mstate->dtms_scratch_ptr += size;
4179 break;
4180 }
4181
4182 case DIF_SUBR_HTONS:
4183 case DIF_SUBR_NTOHS:
4184#ifdef _BIG_ENDIAN
4185 regs[rd] = (uint16_t)tupregs[0].dttk_value;
4186#else
4187 regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);
4188#endif
4189 break;
4190
4191
4192 case DIF_SUBR_HTONL:
4193 case DIF_SUBR_NTOHL:
4194#ifdef _BIG_ENDIAN
4195 regs[rd] = (uint32_t)tupregs[0].dttk_value;
4196#else
4197 regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);
4198#endif
4199 break;
4200
4201
4202 case DIF_SUBR_HTONLL:
4203 case DIF_SUBR_NTOHLL:
4204#ifdef _BIG_ENDIAN
4205 regs[rd] = (uint64_t)tupregs[0].dttk_value;
4206#else
4207 regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);
4208#endif
4209 break;
4210
4211
4212 case DIF_SUBR_DIRNAME:
4213 case DIF_SUBR_BASENAME: {
4214 char *dest = (char *)mstate->dtms_scratch_ptr;
4215 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4216 uintptr_t src = tupregs[0].dttk_value;
4217 int i, j, len = VBDTCAST(int)dtrace_strlen((char *)src, size);
4218 int lastbase = -1, firstbase = -1, lastdir = -1;
4219 int start, end;
4220
4221 if (!dtrace_canload(src, len + 1, mstate, vstate)) {
4222 regs[rd] = NULL;
4223 break;
4224 }
4225
4226 if (!DTRACE_INSCRATCH(mstate, size)) {
4227 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4228 regs[rd] = NULL;
4229 break;
4230 }
4231
4232 /*
4233 * The basename and dirname of a zero-length string are
4234 * defined to be ".".
4235 */
4236 if (len == 0) {
4237 len = 1;
4238 src = (uintptr_t)".";
4239 }
4240
4241 /*
4242 * Start from the back of the string, moving back toward the
4243 * front until we see a character that isn't a slash. That
4244 * character is the last character in the basename.
4245 */
4246 for (i = len - 1; i >= 0; i--) {
4247 if (dtrace_load8(src + i) != '/')
4248 break;
4249 }
4250
4251 if (i >= 0)
4252 lastbase = i;
4253
4254 /*
4255 * Starting from the last character in the basename, move
4256 * towards the front until we find a slash. The character
4257 * that we processed immediately before that is the first
4258 * character in the basename.
4259 */
4260 for (; i >= 0; i--) {
4261 if (dtrace_load8(src + i) == '/')
4262 break;
4263 }
4264
4265 if (i >= 0)
4266 firstbase = i + 1;
4267
4268 /*
4269 * Now keep going until we find a non-slash character. That
4270 * character is the last character in the dirname.
4271 */
4272 for (; i >= 0; i--) {
4273 if (dtrace_load8(src + i) != '/')
4274 break;
4275 }
4276
4277 if (i >= 0)
4278 lastdir = i;
4279
4280 ASSERT(!(lastbase == -1 && firstbase != -1));
4281 ASSERT(!(firstbase == -1 && lastdir != -1));
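/*
 * Worked example (for illustration only, not from the original
 * source) of the three scans above for src = "/usr/lib/" (len 9):
 *
 *	lastbase  = 7	(the 'b' -- last non-slash character)
 *	firstbase = 5	(the 'l' -- first character after the last '/')
 *	lastdir   = 3	(the 'r' -- last character of "/usr")
 *
 * basename() thus copies [5..7] ("lib") and dirname() copies
 * [0..3] ("/usr").
 */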
4282
4283 if (lastbase == -1) {
4284 /*
4285 * We didn't find a non-slash character. We know that
4286 * the length is non-zero, so the whole string must be
4287 * slashes. In either the dirname or the basename
4288 * case, we return '/'.
4289 */
4290 ASSERT(firstbase == -1);
4291 firstbase = lastbase = lastdir = 0;
4292 }
4293
4294 if (firstbase == -1) {
4295 /*
4296 * The entire string consists only of a basename
4297 * component. If we're looking for dirname, we need
4298 * to change our string to be just "."; if we're
4299 * looking for a basename, we'll just set the first
4300 * character of the basename to be 0.
4301 */
4302 if (subr == DIF_SUBR_DIRNAME) {
4303 ASSERT(lastdir == -1);
4304 src = (uintptr_t)".";
4305 lastdir = 0;
4306 } else {
4307 firstbase = 0;
4308 }
4309 }
4310
4311 if (subr == DIF_SUBR_DIRNAME) {
4312 if (lastdir == -1) {
4313 /*
4314 * We know that we have a slash in the name --
4315 * or lastdir would be set to 0, above. And
4316 * because lastdir is -1, we know that this
4317 * slash must be the first character. (That
4318 * is, the full string must be of the form
4319 * "/basename".) In this case, the last
4320 * character of the directory name is 0.
4321 */
4322 lastdir = 0;
4323 }
4324
4325 start = 0;
4326 end = lastdir;
4327 } else {
4328 ASSERT(subr == DIF_SUBR_BASENAME);
4329 ASSERT(firstbase != -1 && lastbase != -1);
4330 start = firstbase;
4331 end = lastbase;
4332 }
4333
4334 for (i = start, j = 0; i <= end && VBDTCAST(unsigned)j < size - 1; i++, j++)
4335 dest[j] = dtrace_load8(src + i);
4336
4337 dest[j] = '\0';
4338 regs[rd] = (uintptr_t)dest;
4339 mstate->dtms_scratch_ptr += size;
4340 break;
4341 }
4342
4343 case DIF_SUBR_CLEANPATH: {
4344 char *dest = (char *)mstate->dtms_scratch_ptr, c;
4345 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4346 uintptr_t src = tupregs[0].dttk_value;
4347 int i = 0, j = 0;
4348
4349 if (!dtrace_strcanload(src, size, mstate, vstate)) {
4350 regs[rd] = NULL;
4351 break;
4352 }
4353
4354 if (!DTRACE_INSCRATCH(mstate, size)) {
4355 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4356 regs[rd] = NULL;
4357 break;
4358 }
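/*
 * Illustrative example (not in the original source) of the
 * normalization performed below:
 *
 *	cleanpath("/usr//lib/./../bin/") => "/usr/bin/"
 */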
4359
4360 /*
4361 * Move forward, loading each character.
4362 */
4363 do {
4364 c = dtrace_load8(src + i++);
4365next:
4366 if (j + 5 >= VBDTCAST(int64_t)size) /* 5 = strlen("/..c\0") */
4367 break;
4368
4369 if (c != '/') {
4370 dest[j++] = c;
4371 continue;
4372 }
4373
4374 c = dtrace_load8(src + i++);
4375
4376 if (c == '/') {
4377 /*
4378 * We have two slashes -- we can just advance
4379 * to the next character.
4380 */
4381 goto next;
4382 }
4383
4384 if (c != '.') {
4385 /*
4386 * This is not "." and it's not ".." -- we can
4387 * just store the "/" and this character and
4388 * drive on.
4389 */
4390 dest[j++] = '/';
4391 dest[j++] = c;
4392 continue;
4393 }
4394
4395 c = dtrace_load8(src + i++);
4396
4397 if (c == '/') {
4398 /*
4399 * This is a "/./" component. We're not going
4400 * to store anything in the destination buffer;
4401 * we're just going to go to the next component.
4402 */
4403 goto next;
4404 }
4405
4406 if (c != '.') {
4407 /*
4408 * This is not ".." -- we can just store the
4409 * "/." and this character and continue
4410 * processing.
4411 */
4412 dest[j++] = '/';
4413 dest[j++] = '.';
4414 dest[j++] = c;
4415 continue;
4416 }
4417
4418 c = dtrace_load8(src + i++);
4419
4420 if (c != '/' && c != '\0') {
4421 /*
4422 * This is not ".." -- it's "..[mumble]".
4423 * We'll store the "/.." and this character
4424 * and continue processing.
4425 */
4426 dest[j++] = '/';
4427 dest[j++] = '.';
4428 dest[j++] = '.';
4429 dest[j++] = c;
4430 continue;
4431 }
4432
4433 /*
4434 * This is "/../" or "/..\0". We need to back up
4435 * our destination pointer until we find a "/".
4436 */
4437 i--;
4438 while (j != 0 && dest[--j] != '/')
4439 continue;
4440
4441 if (c == '\0')
4442 dest[++j] = '/';
4443 } while (c != '\0');
4444
4445 dest[j] = '\0';
4446 regs[rd] = (uintptr_t)dest;
4447 mstate->dtms_scratch_ptr += size;
4448 break;
4449 }
4450
4451 case DIF_SUBR_INET_NTOA:
4452 case DIF_SUBR_INET_NTOA6:
4453 case DIF_SUBR_INET_NTOP: {
4454#ifndef VBOX
4455 size_t size;
4456 int af, argi, i;
4457 char *base, *end;
4458
4459 if (subr == DIF_SUBR_INET_NTOP) {
4460 af = (int)tupregs[0].dttk_value;
4461 argi = 1;
4462 } else {
4463 af = subr == DIF_SUBR_INET_NTOA ? AF_INET: AF_INET6;
4464 argi = 0;
4465 }
4466
4467 if (af == AF_INET) {
4468 ipaddr_t ip4;
4469 uint8_t *ptr8, val;
4470
4471 /*
4472 * Safely load the IPv4 address.
4473 */
4474 ip4 = dtrace_load32(tupregs[argi].dttk_value);
4475
4476 /*
4477 * Check that an IPv4 string will fit in scratch.
4478 */
4479 size = INET_ADDRSTRLEN;
4480 if (!DTRACE_INSCRATCH(mstate, size)) {
4481 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4482 regs[rd] = NULL;
4483 break;
4484 }
4485 base = (char *)mstate->dtms_scratch_ptr;
4486 end = (char *)mstate->dtms_scratch_ptr + size - 1;
4487
4488 /*
4489 * Stringify as a dotted decimal quad.
4490 */
4491 *end-- = '\0';
4492 ptr8 = (uint8_t *)&ip4;
4493 for (i = 3; i >= 0; i--) {
4494 val = ptr8[i];
4495
4496 if (val == 0) {
4497 *end-- = '0';
4498 } else {
4499 for (; val; val /= 10) {
4500 *end-- = '0' + (val % 10);
4501 }
4502 }
4503
4504 if (i > 0)
4505 *end-- = '.';
4506 }
4507 ASSERT(end + 1 >= base);
4508
4509 } else if (af == AF_INET6) {
4510 struct in6_addr ip6;
4511 int firstzero, tryzero, numzero, v6end;
4512 uint16_t val;
4513 const char digits[] = "0123456789abcdef";
4514
4515 /*
4516 * Stringify using RFC 1884 convention 2 -- 16-bit
4517 * hexadecimal values with a zero-run compression.
4518 * Lower case hexadecimal digits are used.
4519 * e.g., fe80::214:4fff:fe0b:76c8.
4520 * The IPv4 embedded form is returned for inet_ntop,
4521 * just the IPv4 string is returned for inet_ntoa6.
4522 */
4523
4524 /*
4525 * Safely load the IPv6 address.
4526 */
4527 dtrace_bcopy(
4528 (void *)(uintptr_t)tupregs[argi].dttk_value,
4529 (void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
4530
4531 /*
4532 * Check that an IPv6 string will fit in scratch.
4533 */
4534 size = INET6_ADDRSTRLEN;
4535 if (!DTRACE_INSCRATCH(mstate, size)) {
4536 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4537 regs[rd] = NULL;
4538 break;
4539 }
4540 base = (char *)mstate->dtms_scratch_ptr;
4541 end = (char *)mstate->dtms_scratch_ptr + size - 1;
4542 *end-- = '\0';
4543
4544 /*
4545 * Find the longest run of 16 bit zero values
4546 * for the single allowed zero compression - "::".
4547 */
4548 firstzero = -1;
4549 tryzero = -1;
4550 numzero = 1;
4551 for (i = 0; i < sizeof (struct in6_addr); i++) {
4552 if (ip6._S6_un._S6_u8[i] == 0 &&
4553 tryzero == -1 && i % 2 == 0) {
4554 tryzero = i;
4555 continue;
4556 }
4557
4558 if (tryzero != -1 &&
4559 (ip6._S6_un._S6_u8[i] != 0 ||
4560 i == sizeof (struct in6_addr) - 1)) {
4561
4562 if (i - tryzero <= numzero) {
4563 tryzero = -1;
4564 continue;
4565 }
4566
4567 firstzero = tryzero;
4568 numzero = i - i % 2 - tryzero;
4569 tryzero = -1;
4570
4571 if (ip6._S6_un._S6_u8[i] == 0 &&
4572 i == sizeof (struct in6_addr) - 1)
4573 numzero += 2;
4574 }
4575 }
4576 ASSERT(firstzero + numzero <= sizeof (struct in6_addr));
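/*
 * Illustrative example (not in the original source): for the
 * loopback address ::1 the scan above yields firstzero = 0 and
 * numzero = 14, so the first seven 16-bit groups collapse to "::"
 * and only the final group ("1") is printed.
 */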
4577
4578 /*
4579 * Check for an IPv4 embedded address.
4580 */
4581 v6end = sizeof (struct in6_addr) - 2;
4582 if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
4583 IN6_IS_ADDR_V4COMPAT(&ip6)) {
4584 for (i = sizeof (struct in6_addr) - 1;
4585 i >= DTRACE_V4MAPPED_OFFSET; i--) {
4586 ASSERT(end >= base);
4587
4588 val = ip6._S6_un._S6_u8[i];
4589
4590 if (val == 0) {
4591 *end-- = '0';
4592 } else {
4593 for (; val; val /= 10) {
4594 *end-- = '0' + val % 10;
4595 }
4596 }
4597
4598 if (i > DTRACE_V4MAPPED_OFFSET)
4599 *end-- = '.';
4600 }
4601
4602 if (subr == DIF_SUBR_INET_NTOA6)
4603 goto inetout;
4604
4605 /*
4606 * Set v6end to skip the IPv4 address that
4607 * we have already stringified.
4608 */
4609 v6end = 10;
4610 }
4611
4612 /*
4613 * Build the IPv6 string by working through the
4614 * address in reverse.
4615 */
4616 for (i = v6end; i >= 0; i -= 2) {
4617 ASSERT(end >= base);
4618
4619 if (i == firstzero + numzero - 2) {
4620 *end-- = ':';
4621 *end-- = ':';
4622 i -= numzero - 2;
4623 continue;
4624 }
4625
4626 if (i < 14 && i != firstzero - 2)
4627 *end-- = ':';
4628
4629 val = (ip6._S6_un._S6_u8[i] << 8) +
4630 ip6._S6_un._S6_u8[i + 1];
4631
4632 if (val == 0) {
4633 *end-- = '0';
4634 } else {
4635 for (; val; val /= 16) {
4636 *end-- = digits[val % 16];
4637 }
4638 }
4639 }
4640 ASSERT(end + 1 >= base);
4641
4642 } else {
4643 /*
4644 * The user didn't use AF_INET or AF_INET6.
4645 */
4646 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4647 regs[rd] = NULL;
4648 break;
4649 }
4650
4651inetout: regs[rd] = (uintptr_t)end + 1;
4652 mstate->dtms_scratch_ptr += size;
4653#else /* VBOX */
4654 regs[rd] = 0;
4655 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4656#endif /* VBOX */
4657 break;
4658 }
4659
4660 }
4661}
4662
4663/*
4664 * Emulate the execution of DTrace IR instructions specified by the given
4665 * DIF object. This function is deliberately void of assertions as all of
4666 * the necessary checks are handled by a call to dtrace_difo_validate().
4667 */
4668static uint64_t
4669dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
4670 dtrace_vstate_t *vstate, dtrace_state_t *state)
4671{
4672 const dif_instr_t *text = difo->dtdo_buf;
4673 const uint_t textlen = difo->dtdo_len;
4674 const char *strtab = difo->dtdo_strtab;
4675 const uint64_t *inttab = difo->dtdo_inttab;
4676
4677 uint64_t rval = 0;
4678 dtrace_statvar_t *svar;
4679 dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
4680 dtrace_difv_t *v;
4681 volatile uint16_t *flags = &cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags;
4682 volatile uintptr_t *illval = &cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_illval;
4683
4684 dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
4685 uint64_t regs[DIF_DIR_NREGS];
4686 uint64_t *tmp;
4687
4688 uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
4689 int64_t cc_r;
4690 uint_t pc = 0, id, opc VBDTUNASS(0);
4691 uint8_t ttop = 0;
4692 dif_instr_t instr;
4693 uint_t r1, r2, rd;
4694
4695 /*
4696 * We stash the current DIF object into the machine state: we need it
4697 * for subsequent access checking.
4698 */
4699 mstate->dtms_difo = difo;
4700
4701 regs[DIF_REG_R0] = 0; /* %r0 is fixed at zero */
4702
4703 while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
4704 opc = pc;
4705
4706 instr = text[pc++];
4707 r1 = DIF_INSTR_R1(instr);
4708 r2 = DIF_INSTR_R2(instr);
4709 rd = DIF_INSTR_RD(instr);
4710
4711 switch (DIF_INSTR_OP(instr)) {
4712 case DIF_OP_OR:
4713 regs[rd] = regs[r1] | regs[r2];
4714 break;
4715 case DIF_OP_XOR:
4716 regs[rd] = regs[r1] ^ regs[r2];
4717 break;
4718 case DIF_OP_AND:
4719 regs[rd] = regs[r1] & regs[r2];
4720 break;
4721 case DIF_OP_SLL:
4722 regs[rd] = regs[r1] << regs[r2];
4723 break;
4724 case DIF_OP_SRL:
4725 regs[rd] = regs[r1] >> regs[r2];
4726 break;
4727 case DIF_OP_SUB:
4728 regs[rd] = regs[r1] - regs[r2];
4729 break;
4730 case DIF_OP_ADD:
4731 regs[rd] = regs[r1] + regs[r2];
4732 break;
4733 case DIF_OP_MUL:
4734 regs[rd] = regs[r1] * regs[r2];
4735 break;
4736 case DIF_OP_SDIV:
4737 if (regs[r2] == 0) {
4738 regs[rd] = 0;
4739 *flags |= CPU_DTRACE_DIVZERO;
4740 } else {
4741 regs[rd] = (int64_t)regs[r1] /
4742 (int64_t)regs[r2];
4743 }
4744 break;
4745
4746 case DIF_OP_UDIV:
4747 if (regs[r2] == 0) {
4748 regs[rd] = 0;
4749 *flags |= CPU_DTRACE_DIVZERO;
4750 } else {
4751 regs[rd] = regs[r1] / regs[r2];
4752 }
4753 break;
4754
4755 case DIF_OP_SREM:
4756 if (regs[r2] == 0) {
4757 regs[rd] = 0;
4758 *flags |= CPU_DTRACE_DIVZERO;
4759 } else {
4760 regs[rd] = (int64_t)regs[r1] %
4761 (int64_t)regs[r2];
4762 }
4763 break;
4764
4765 case DIF_OP_UREM:
4766 if (regs[r2] == 0) {
4767 regs[rd] = 0;
4768 *flags |= CPU_DTRACE_DIVZERO;
4769 } else {
4770 regs[rd] = regs[r1] % regs[r2];
4771 }
4772 break;
4773
4774 case DIF_OP_NOT:
4775 regs[rd] = ~regs[r1];
4776 break;
4777 case DIF_OP_MOV:
4778 regs[rd] = regs[r1];
4779 break;
4780 case DIF_OP_CMP:
4781 cc_r = regs[r1] - regs[r2];
4782 cc_n = cc_r < 0;
4783 cc_z = cc_r == 0;
4784 cc_v = 0;
4785 cc_c = regs[r1] < regs[r2];
4786 break;
4787 case DIF_OP_TST:
4788 cc_n = cc_v = cc_c = 0;
4789 cc_z = regs[r1] == 0;
4790 break;
4791 case DIF_OP_BA:
4792 pc = DIF_INSTR_LABEL(instr);
4793 break;
4794 case DIF_OP_BE:
4795 if (cc_z)
4796 pc = DIF_INSTR_LABEL(instr);
4797 break;
4798 case DIF_OP_BNE:
4799 if (cc_z == 0)
4800 pc = DIF_INSTR_LABEL(instr);
4801 break;
4802 case DIF_OP_BG:
4803 if ((cc_z | (cc_n ^ cc_v)) == 0)
4804 pc = DIF_INSTR_LABEL(instr);
4805 break;
4806 case DIF_OP_BGU:
4807 if ((cc_c | cc_z) == 0)
4808 pc = DIF_INSTR_LABEL(instr);
4809 break;
4810 case DIF_OP_BGE:
4811 if ((cc_n ^ cc_v) == 0)
4812 pc = DIF_INSTR_LABEL(instr);
4813 break;
4814 case DIF_OP_BGEU:
4815 if (cc_c == 0)
4816 pc = DIF_INSTR_LABEL(instr);
4817 break;
4818 case DIF_OP_BL:
4819 if (cc_n ^ cc_v)
4820 pc = DIF_INSTR_LABEL(instr);
4821 break;
4822 case DIF_OP_BLU:
4823 if (cc_c)
4824 pc = DIF_INSTR_LABEL(instr);
4825 break;
4826 case DIF_OP_BLE:
4827 if (cc_z | (cc_n ^ cc_v))
4828 pc = DIF_INSTR_LABEL(instr);
4829 break;
4830 case DIF_OP_BLEU:
4831 if (cc_c | cc_z)
4832 pc = DIF_INSTR_LABEL(instr);
4833 break;
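/*
 * Illustrative summary (not in the original source) of the branch
 * tests above, which follow the usual SPARC-style condition codes
 * set by DIF_OP_CMP:
 *
 *	BE    z			equal
 *	BNE   !z		not equal
 *	BG    !(z | (n ^ v))	signed greater than
 *	BGE   !(n ^ v)		signed greater or equal
 *	BL    n ^ v		signed less than
 *	BLE   z | (n ^ v)	signed less or equal
 *	BGU   !(c | z)		unsigned greater than
 *	BGEU  !c		unsigned greater or equal
 *	BLU   c			unsigned less than
 *	BLEU  c | z		unsigned less or equal
 */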
4834 case DIF_OP_RLDSB:
4835 if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
4836 *flags |= CPU_DTRACE_KPRIV;
4837 *illval = regs[r1];
4838 break;
4839 }
4840 /*FALLTHROUGH*/
4841 case DIF_OP_LDSB:
4842 regs[rd] = (int8_t)dtrace_load8(regs[r1]);
4843 break;
4844 case DIF_OP_RLDSH:
4845 if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
4846 *flags |= CPU_DTRACE_KPRIV;
4847 *illval = regs[r1];
4848 break;
4849 }
4850 /*FALLTHROUGH*/
4851 case DIF_OP_LDSH:
4852 regs[rd] = (int16_t)dtrace_load16(regs[r1]);
4853 break;
4854 case DIF_OP_RLDSW:
4855 if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
4856 *flags |= CPU_DTRACE_KPRIV;
4857 *illval = regs[r1];
4858 break;
4859 }
4860 /*FALLTHROUGH*/
4861 case DIF_OP_LDSW:
4862 regs[rd] = (int32_t)dtrace_load32(regs[r1]);
4863 break;
4864 case DIF_OP_RLDUB:
4865 if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
4866 *flags |= CPU_DTRACE_KPRIV;
4867 *illval = regs[r1];
4868 break;
4869 }
4870 /*FALLTHROUGH*/
4871 case DIF_OP_LDUB:
4872 regs[rd] = dtrace_load8(regs[r1]);
4873 break;
4874 case DIF_OP_RLDUH:
4875 if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
4876 *flags |= CPU_DTRACE_KPRIV;
4877 *illval = regs[r1];
4878 break;
4879 }
4880 /*FALLTHROUGH*/
4881 case DIF_OP_LDUH:
4882 regs[rd] = dtrace_load16(regs[r1]);
4883 break;
4884 case DIF_OP_RLDUW:
4885 if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
4886 *flags |= CPU_DTRACE_KPRIV;
4887 *illval = regs[r1];
4888 break;
4889 }
4890 /*FALLTHROUGH*/
4891 case DIF_OP_LDUW:
4892 regs[rd] = dtrace_load32(regs[r1]);
4893 break;
4894 case DIF_OP_RLDX:
4895 if (!dtrace_canstore(regs[r1], 8, mstate, vstate)) {
4896 *flags |= CPU_DTRACE_KPRIV;
4897 *illval = regs[r1];
4898 break;
4899 }
4900 /*FALLTHROUGH*/
4901 case DIF_OP_LDX:
4902 regs[rd] = dtrace_load64(regs[r1]);
4903 break;
4904 case DIF_OP_ULDSB:
4905 regs[rd] = (int8_t)
4906 dtrace_fuword8((void *)(uintptr_t)regs[r1]);
4907 break;
4908 case DIF_OP_ULDSH:
4909 regs[rd] = (int16_t)
4910 dtrace_fuword16((void *)(uintptr_t)regs[r1]);
4911 break;
4912 case DIF_OP_ULDSW:
4913 regs[rd] = (int32_t)
4914 dtrace_fuword32((void *)(uintptr_t)regs[r1]);
4915 break;
4916 case DIF_OP_ULDUB:
4917 regs[rd] =
4918 dtrace_fuword8((void *)(uintptr_t)regs[r1]);
4919 break;
4920 case DIF_OP_ULDUH:
4921 regs[rd] =
4922 dtrace_fuword16((void *)(uintptr_t)regs[r1]);
4923 break;
4924 case DIF_OP_ULDUW:
4925 regs[rd] =
4926 dtrace_fuword32((void *)(uintptr_t)regs[r1]);
4927 break;
4928 case DIF_OP_ULDX:
4929 regs[rd] =
4930 dtrace_fuword64((void *)(uintptr_t)regs[r1]);
4931 break;
4932 case DIF_OP_RET:
4933 rval = regs[rd];
4934 pc = textlen;
4935 break;
4936 case DIF_OP_NOP:
4937 break;
4938 case DIF_OP_SETX:
4939 regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
4940 break;
4941 case DIF_OP_SETS:
4942 regs[rd] = (uint64_t)(uintptr_t)
4943 (strtab + DIF_INSTR_STRING(instr));
4944 break;
4945 case DIF_OP_SCMP: {
4946 size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
4947 uintptr_t s1 = regs[r1];
4948 uintptr_t s2 = regs[r2];
4949
4950 if (s1 != NULL &&
4951 !dtrace_strcanload(s1, sz, mstate, vstate))
4952 break;
4953 if (s2 != NULL &&
4954 !dtrace_strcanload(s2, sz, mstate, vstate))
4955 break;
4956
4957 cc_r = dtrace_strncmp((char *)s1, (char *)s2, sz);
4958
4959 cc_n = cc_r < 0;
4960 cc_z = cc_r == 0;
4961 cc_v = cc_c = 0;
4962 break;
4963 }
4964 case DIF_OP_LDGA:
4965 regs[rd] = dtrace_dif_variable(mstate, state,
4966 r1, regs[r2]);
4967 break;
4968 case DIF_OP_LDGS:
4969 id = DIF_INSTR_VAR(instr);
4970
4971 if (id >= DIF_VAR_OTHER_UBASE) {
4972 uintptr_t a;
4973
4974 id -= DIF_VAR_OTHER_UBASE;
4975 svar = vstate->dtvs_globals[id];
4976 ASSERT(svar != NULL);
4977 v = &svar->dtsv_var;
4978
4979 if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
4980 regs[rd] = svar->dtsv_data;
4981 break;
4982 }
4983
4984 a = (uintptr_t)svar->dtsv_data;
4985
4986 if (*(uint8_t *)a == UINT8_MAX) {
4987 /*
4988 * If the 0th byte is set to UINT8_MAX
4989 * then this is to be treated as a
4990 * reference to a NULL variable.
4991 */
4992 regs[rd] = NULL;
4993 } else {
4994 regs[rd] = a + sizeof (uint64_t);
4995 }
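/*
 * Illustrative layout note (not in the original source): a by-ref
 * global's backing store is
 *
 *	[ 8-byte flag word | variable data ... ]
 *
 * where byte 0 of the flag word set to UINT8_MAX marks the variable
 * as NULL -- tested here by DIF_OP_LDGS and written by DIF_OP_STGS
 * below.
 */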
4996
4997 break;
4998 }
4999
5000 regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
5001 break;
5002
5003 case DIF_OP_STGS:
5004 id = DIF_INSTR_VAR(instr);
5005
5006 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5007 id -= DIF_VAR_OTHER_UBASE;
5008
5009 svar = vstate->dtvs_globals[id];
5010 ASSERT(svar != NULL);
5011 v = &svar->dtsv_var;
5012
5013 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5014 uintptr_t a = (uintptr_t)svar->dtsv_data;
5015
5016 ASSERT(a != NULL);
5017 ASSERT(svar->dtsv_size != 0);
5018
5019 if (regs[rd] == NULL) {
5020 *(uint8_t *)a = UINT8_MAX;
5021 break;
5022 } else {
5023 *(uint8_t *)a = 0;
5024 a += sizeof (uint64_t);
5025 }
5026 if (!dtrace_vcanload(
5027 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5028 mstate, vstate))
5029 break;
5030
5031 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5032 (void *)a, &v->dtdv_type);
5033 break;
5034 }
5035
5036 svar->dtsv_data = regs[rd];
5037 break;
5038
5039 case DIF_OP_LDTA:
5040 /*
5041 * There are no DTrace built-in thread-local arrays at
5042 * present. This opcode is saved for future work.
5043 */
5044 *flags |= CPU_DTRACE_ILLOP;
5045 regs[rd] = 0;
5046 break;
5047
5048 case DIF_OP_LDLS:
5049 id = DIF_INSTR_VAR(instr);
5050
5051 if (id < DIF_VAR_OTHER_UBASE) {
5052 /*
5053 * For now, this has no meaning.
5054 */
5055 regs[rd] = 0;
5056 break;
5057 }
5058
5059 id -= DIF_VAR_OTHER_UBASE;
5060
5061 ASSERT(VBDTCAST(int64_t)id < vstate->dtvs_nlocals);
5062 ASSERT(vstate->dtvs_locals != NULL);
5063
5064 svar = vstate->dtvs_locals[id];
5065 ASSERT(svar != NULL);
5066 v = &svar->dtsv_var;
5067
5068 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5069 uintptr_t a = (uintptr_t)svar->dtsv_data;
5070 size_t sz = v->dtdv_type.dtdt_size;
5071
5072 sz += sizeof (uint64_t);
5073 ASSERT(svar->dtsv_size == NCPU * sz);
5074 a += VBDT_GET_CPUID() * sz;
5075
5076 if (*(uint8_t *)a == UINT8_MAX) {
5077 /*
5078 * If the 0th byte is set to UINT8_MAX
5079 * then this is to be treated as a
5080 * reference to a NULL variable.
5081 */
5082 regs[rd] = NULL;
5083 } else {
5084 regs[rd] = a + sizeof (uint64_t);
5085 }
5086
5087 break;
5088 }
5089
5090 ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
5091 tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
5092 regs[rd] = tmp[VBDT_GET_CPUID()];
5093 break;
5094
5095 case DIF_OP_STLS:
5096 id = DIF_INSTR_VAR(instr);
5097
5098 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5099 id -= DIF_VAR_OTHER_UBASE;
5100 ASSERT(VBDTCAST(int64_t)id < vstate->dtvs_nlocals);
5101
5102 ASSERT(vstate->dtvs_locals != NULL);
5103 svar = vstate->dtvs_locals[id];
5104 ASSERT(svar != NULL);
5105 v = &svar->dtsv_var;
5106
5107 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5108 uintptr_t a = (uintptr_t)svar->dtsv_data;
5109 size_t sz = v->dtdv_type.dtdt_size;
5110
5111 sz += sizeof (uint64_t);
5112 ASSERT(svar->dtsv_size == NCPU * sz);
5113 a += VBDT_GET_CPUID() * sz;
5114
5115 if (regs[rd] == NULL) {
5116 *(uint8_t *)a = UINT8_MAX;
5117 break;
5118 } else {
5119 *(uint8_t *)a = 0;
5120 a += sizeof (uint64_t);
5121 }
5122
5123 if (!dtrace_vcanload(
5124 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5125 mstate, vstate))
5126 break;
5127
5128 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5129 (void *)a, &v->dtdv_type);
5130 break;
5131 }
5132
5133 ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
5134 tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
5135 tmp[VBDT_GET_CPUID()] = regs[rd];
5136 break;
5137
5138 case DIF_OP_LDTS: {
5139 dtrace_dynvar_t *dvar;
5140 dtrace_key_t *key;
5141
5142 id = DIF_INSTR_VAR(instr);
5143 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5144 id -= DIF_VAR_OTHER_UBASE;
5145 v = &vstate->dtvs_tlocals[id];
5146
5147 key = &tupregs[DIF_DTR_NREGS];
5148 key[0].dttk_value = (uint64_t)id;
5149 key[0].dttk_size = 0;
5150 DTRACE_TLS_THRKEY(key[1].dttk_value);
5151 key[1].dttk_size = 0;
5152
5153 dvar = dtrace_dynvar(dstate, 2, key,
5154 sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
5155 mstate, vstate);
5156
5157 if (dvar == NULL) {
5158 regs[rd] = 0;
5159 break;
5160 }
5161
5162 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5163 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
5164 } else {
5165 regs[rd] = *((uint64_t *)dvar->dtdv_data);
5166 }
5167
5168 break;
5169 }
5170
5171 case DIF_OP_STTS: {
5172 dtrace_dynvar_t *dvar;
5173 dtrace_key_t *key;
5174
5175 id = DIF_INSTR_VAR(instr);
5176 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5177 id -= DIF_VAR_OTHER_UBASE;
5178
5179 key = &tupregs[DIF_DTR_NREGS];
5180 key[0].dttk_value = (uint64_t)id;
5181 key[0].dttk_size = 0;
5182 DTRACE_TLS_THRKEY(key[1].dttk_value);
5183 key[1].dttk_size = 0;
5184 v = &vstate->dtvs_tlocals[id];
5185
5186 dvar = dtrace_dynvar(dstate, 2, key,
5187 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5188 v->dtdv_type.dtdt_size : sizeof (uint64_t),
5189 regs[rd] ? DTRACE_DYNVAR_ALLOC :
5190 DTRACE_DYNVAR_DEALLOC, mstate, vstate);
5191
5192 /*
5193 * Given that we're storing to thread-local data,
5194 * we need to flush our predicate cache.
5195 */
5196 curthread->t_predcache = NULL;
5197
5198 if (dvar == NULL)
5199 break;
5200
5201 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5202 if (!dtrace_vcanload(
5203 (void *)(uintptr_t)regs[rd],
5204 &v->dtdv_type, mstate, vstate))
5205 break;
5206
5207 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5208 dvar->dtdv_data, &v->dtdv_type);
5209 } else {
5210 *((uint64_t *)dvar->dtdv_data) = regs[rd];
5211 }
5212
5213 break;
5214 }
5215
5216 case DIF_OP_SRA:
5217 regs[rd] = (int64_t)regs[r1] >> regs[r2];
5218 break;
5219
5220 case DIF_OP_CALL:
5221 dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
5222 regs, tupregs, ttop, mstate, state);
5223 break;
5224
5225 case DIF_OP_PUSHTR:
5226 if (ttop == DIF_DTR_NREGS) {
5227 *flags |= CPU_DTRACE_TUPOFLOW;
5228 break;
5229 }
5230
5231 if (r1 == DIF_TYPE_STRING) {
5232 /*
5233 * If this is a string type and the size is 0,
5234 * we'll use the system-wide default string
5235 * size. Note that we are _not_ looking at
5236 * the value of the DTRACEOPT_STRSIZE option;
5237 * had this been set, we would expect to have
5238 * a non-zero size value in the "pushtr".
5239 */
5240 tupregs[ttop].dttk_size =
5241 dtrace_strlen((char *)(uintptr_t)regs[rd],
5242 regs[r2] ? regs[r2] :
5243 dtrace_strsize_default) + 1;
5244 } else {
5245 tupregs[ttop].dttk_size = regs[r2];
5246 }
5247
5248 tupregs[ttop++].dttk_value = regs[rd];
5249 break;
5250
5251 case DIF_OP_PUSHTV:
5252 if (ttop == DIF_DTR_NREGS) {
5253 *flags |= CPU_DTRACE_TUPOFLOW;
5254 break;
5255 }
5256
5257 tupregs[ttop].dttk_value = regs[rd];
5258 tupregs[ttop++].dttk_size = 0;
5259 break;
5260
5261 case DIF_OP_POPTS:
5262 if (ttop != 0)
5263 ttop--;
5264 break;
5265
5266 case DIF_OP_FLUSHTS:
5267 ttop = 0;
5268 break;
5269
5270 case DIF_OP_LDGAA:
5271 case DIF_OP_LDTAA: {
5272 dtrace_dynvar_t *dvar;
5273 dtrace_key_t *key = tupregs;
5274 uint_t nkeys = ttop;
5275
5276 id = DIF_INSTR_VAR(instr);
5277 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5278 id -= DIF_VAR_OTHER_UBASE;
5279
5280 key[nkeys].dttk_value = (uint64_t)id;
5281 key[nkeys++].dttk_size = 0;
5282
5283 if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
5284 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
5285 key[nkeys++].dttk_size = 0;
5286 v = &vstate->dtvs_tlocals[id];
5287 } else {
5288 v = &vstate->dtvs_globals[id]->dtsv_var;
5289 }
5290
5291 dvar = dtrace_dynvar(dstate, nkeys, key,
5292 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5293 v->dtdv_type.dtdt_size : sizeof (uint64_t),
5294 DTRACE_DYNVAR_NOALLOC, mstate, vstate);
5295
5296 if (dvar == NULL) {
5297 regs[rd] = 0;
5298 break;
5299 }
5300
5301 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5302 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
5303 } else {
5304 regs[rd] = *((uint64_t *)dvar->dtdv_data);
5305 }
5306
5307 break;
5308 }
5309
5310 case DIF_OP_STGAA:
5311 case DIF_OP_STTAA: {
5312 dtrace_dynvar_t *dvar;
5313 dtrace_key_t *key = tupregs;
5314 uint_t nkeys = ttop;
5315
5316 id = DIF_INSTR_VAR(instr);
5317 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5318 id -= DIF_VAR_OTHER_UBASE;
5319
5320 key[nkeys].dttk_value = (uint64_t)id;
5321 key[nkeys++].dttk_size = 0;
5322
5323 if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
5324 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
5325 key[nkeys++].dttk_size = 0;
5326 v = &vstate->dtvs_tlocals[id];
5327 } else {
5328 v = &vstate->dtvs_globals[id]->dtsv_var;
5329 }
5330
5331 dvar = dtrace_dynvar(dstate, nkeys, key,
5332 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5333 v->dtdv_type.dtdt_size : sizeof (uint64_t),
5334 regs[rd] ? DTRACE_DYNVAR_ALLOC :
5335 DTRACE_DYNVAR_DEALLOC, mstate, vstate);
5336
5337 if (dvar == NULL)
5338 break;
5339
5340 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5341 if (!dtrace_vcanload(
5342 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5343 mstate, vstate))
5344 break;
5345
5346 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5347 dvar->dtdv_data, &v->dtdv_type);
5348 } else {
5349 *((uint64_t *)dvar->dtdv_data) = regs[rd];
5350 }
5351
5352 break;
5353 }
5354
5355 case DIF_OP_ALLOCS: {
5356 uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
5357 size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
5358
5359 /*
5360 * Rounding up the user allocation size could have
5361 * overflowed large, bogus allocations (like -1ULL) to
5362 * 0.
5363 */
5364 if (size < regs[r1] ||
5365 !DTRACE_INSCRATCH(mstate, size)) {
5366 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5367 regs[rd] = NULL;
5368 break;
5369 }
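/*
 * Illustrative note (not in the original source): with
 * regs[r1] == -1ULL and a single byte of alignment padding, the
 * addition above wraps size around to 0 -- a value the scratch
 * check alone would accept. The "size < regs[r1]" test exists to
 * catch exactly that wrap.
 */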
5370
5371 dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
5372 mstate->dtms_scratch_ptr += size;
5373 regs[rd] = ptr;
5374 break;
5375 }
5376
5377 case DIF_OP_COPYS:
5378 if (!dtrace_canstore(regs[rd], regs[r2],
5379 mstate, vstate)) {
5380 *flags |= CPU_DTRACE_BADADDR;
5381 *illval = regs[rd];
5382 break;
5383 }
5384
5385 if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
5386 break;
5387
5388 dtrace_bcopy((void *)(uintptr_t)regs[r1],
5389 (void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
5390 break;
5391
5392 case DIF_OP_STB:
5393 if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
5394 *flags |= CPU_DTRACE_BADADDR;
5395 *illval = regs[rd];
5396 break;
5397 }
5398 *((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
5399 break;
5400
5401 case DIF_OP_STH:
5402 if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
5403 *flags |= CPU_DTRACE_BADADDR;
5404 *illval = regs[rd];
5405 break;
5406 }
5407 if (regs[rd] & 1) {
5408 *flags |= CPU_DTRACE_BADALIGN;
5409 *illval = regs[rd];
5410 break;
5411 }
5412 *((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
5413 break;
5414
5415 case DIF_OP_STW:
5416 if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
5417 *flags |= CPU_DTRACE_BADADDR;
5418 *illval = regs[rd];
5419 break;
5420 }
5421 if (regs[rd] & 3) {
5422 *flags |= CPU_DTRACE_BADALIGN;
5423 *illval = regs[rd];
5424 break;
5425 }
5426 *((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
5427 break;
5428
5429 case DIF_OP_STX:
5430 if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
5431 *flags |= CPU_DTRACE_BADADDR;
5432 *illval = regs[rd];
5433 break;
5434 }
5435 if (regs[rd] & 7) {
5436 *flags |= CPU_DTRACE_BADALIGN;
5437 *illval = regs[rd];
5438 break;
5439 }
5440 *((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
5441 break;
5442 }
5443 }
5444
5445 if (!(*flags & CPU_DTRACE_FAULT))
5446 return (rval);
5447
5448 mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
5449 mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
5450
5451 return (0);
5452}
5453
5454#ifndef VBOX /* no destructive stuff */
5455
5456static void
5457dtrace_action_breakpoint(dtrace_ecb_t *ecb)
5458{
5459 dtrace_probe_t *probe = ecb->dte_probe;
5460 dtrace_provider_t *prov = probe->dtpr_provider;
5461 char c[DTRACE_FULLNAMELEN + 80], *str;
5462 char *msg = "dtrace: breakpoint action at probe ";
5463 char *ecbmsg = " (ecb ";
5464 uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
5465 uintptr_t val = (uintptr_t)ecb;
5466 int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
5467
5468 if (dtrace_destructive_disallow)
5469 return;
5470
5471 /*
5472 * It's impossible to be taking action on the NULL probe.
5473 */
5474 ASSERT(probe != NULL);
5475
5476 /*
5477 * This is a poor man's (destitute man's?) sprintf(): we want to
5478 * print the provider name, module name, function name and name of
5479 * the probe, along with the hex address of the ECB with the breakpoint
5480 * action -- all of which we must place in the character buffer by
5481 * hand.
5482 */
5483 while (*msg != '\0')
5484 c[i++] = *msg++;
5485
5486 for (str = prov->dtpv_name; *str != '\0'; str++)
5487 c[i++] = *str;
5488 c[i++] = ':';
5489
5490 for (str = probe->dtpr_mod; *str != '\0'; str++)
5491 c[i++] = *str;
5492 c[i++] = ':';
5493
5494 for (str = probe->dtpr_func; *str != '\0'; str++)
5495 c[i++] = *str;
5496 c[i++] = ':';
5497
5498 for (str = probe->dtpr_name; *str != '\0'; str++)
5499 c[i++] = *str;
5500
5501 while (*ecbmsg != '\0')
5502 c[i++] = *ecbmsg++;
5503
5504 while (shift >= 0) {
5505 mask = (uintptr_t)0xf << shift;
5506
5507 if (val >= ((uintptr_t)1 << shift))
5508 c[i++] = "0123456789abcdef"[(val & mask) >> shift];
5509 shift -= 4;
5510 }
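/*
 * Illustrative note (not in the original source): the loop above
 * prints the ECB address in hex with leading zeroes suppressed -- a
 * nibble is emitted only once val >= (1 << shift). For val == 0x1f2
 * on a 64-bit kernel, the first 13 nibbles are skipped and "1f2" is
 * appended.
 */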
5511
5512 c[i++] = ')';
5513 c[i] = '\0';
5514
5515 debug_enter(c);
5516}
5517
5518static void
5519dtrace_action_panic(dtrace_ecb_t *ecb)
5520{
5521 dtrace_probe_t *probe = ecb->dte_probe;
5522
5523 /*
5524 * It's impossible to be taking action on the NULL probe.
5525 */
5526 ASSERT(probe != NULL);
5527
5528 if (dtrace_destructive_disallow)
5529 return;
5530
5531 if (dtrace_panicked != NULL)
5532 return;
5533
5534 if (dtrace_casptr(&dtrace_panicked, NULL, curthread) != NULL)
5535 return;
5536
5537 /*
5538 * We won the right to panic. (We want to be sure that only one
5539 * thread calls panic() from dtrace_probe(), and that panic() is
5540 * called exactly once.)
5541 */
5542 dtrace_panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
5543 probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
5544 probe->dtpr_func, probe->dtpr_name, (void *)ecb);
5545}
5546
5547static void
5548dtrace_action_raise(uint64_t sig)
5549{
5550 if (dtrace_destructive_disallow)
5551 return;
5552
5553 if (sig >= NSIG) {
5554 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5555 return;
5556 }
5557
5558 /*
5559 * raise() has a queue depth of 1 -- we ignore all subsequent
5560 * invocations of the raise() action.
5561 */
5562 if (curthread->t_dtrace_sig == 0)
5563 curthread->t_dtrace_sig = (uint8_t)sig;
5564
5565 curthread->t_sig_check = 1;
5566 aston(curthread);
5567}
5568
5569static void
5570dtrace_action_stop(void)
5571{
5572 if (dtrace_destructive_disallow)
5573 return;
5574
5575 if (!curthread->t_dtrace_stop) {
5576 curthread->t_dtrace_stop = 1;
5577 curthread->t_sig_check = 1;
5578 aston(curthread);
5579 }
5580}
5581
5582static void
5583dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
5584{
5585 hrtime_t now;
5586 volatile uint16_t *flags;
5587 cpu_t *cpu = CPU;
5588
5589 if (dtrace_destructive_disallow)
5590 return;
5591
5592 flags = (volatile uint16_t *)&cpu_core[cpu->cpu_id].cpuc_dtrace_flags;
5593
5594 now = dtrace_gethrtime();
5595
5596 if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
5597 /*
5598 * We need to advance the mark to the current time.
5599 */
5600 cpu->cpu_dtrace_chillmark = now;
5601 cpu->cpu_dtrace_chilled = 0;
5602 }
5603
5604 /*
5605 * Now check to see if the requested chill time would take us over
5606 * the maximum amount of time allowed in the chill interval. (Or
5607 * worse, if the calculation itself induces overflow.)
5608 */
5609 if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
5610 cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
5611 *flags |= CPU_DTRACE_ILLOP;
5612 return;
5613 }
5614
5615 while (dtrace_gethrtime() - now < val)
5616 continue;
5617
5618 /*
5619 * Normally, we assure that the value of the variable "timestamp" does
5620 * not change within an ECB. The presence of chill() represents an
5621 * exception to this rule, however.
5622 */
5623 mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
5624 cpu->cpu_dtrace_chilled += val;
5625}
5626
5627#endif /* !VBOX */
5628
5629static void
5630dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
5631 uint64_t *buf, uint64_t arg)
5632{
5633 int nframes = DTRACE_USTACK_NFRAMES(arg);
5634 int strsize = DTRACE_USTACK_STRSIZE(arg);
5635 uint64_t *pcs = &buf[1], *fps;
5636 char *str = (char *)&pcs[nframes];
5637 int size, offs = 0, i, j;
5638 uintptr_t old = mstate->dtms_scratch_ptr, saved;
5639#ifndef VBOX
5640 uint16_t *flags = &cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags;
5641#else
5642 uint16_t volatile *flags = &cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags;
5643#endif
5644 char *sym;
5645
5646 /*
5647 * Should be taking a faster path if string space has not been
5648 * allocated.
5649 */
5650 ASSERT(strsize != 0);
5651
5652 /*
5653 * We will first allocate some temporary space for the frame pointers.
5654 */
5655 fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
5656 size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
5657 (nframes * sizeof (uint64_t));
5658
5659 if (!DTRACE_INSCRATCH(mstate, VBDTCAST(unsigned)size)) {
5660 /*
5661 * Not enough room for our frame pointers -- need to indicate
5662 * that we ran out of scratch space.
5663 */
5664 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5665 return;
5666 }
5667
5668 mstate->dtms_scratch_ptr += size;
5669 saved = mstate->dtms_scratch_ptr;
5670
5671 /*
5672 * Now get a stack with both program counters and frame pointers.
5673 */
5674 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5675 dtrace_getufpstack(buf, fps, nframes + 1);
5676 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5677
5678 /*
5679 * If that faulted, we're cooked.
5680 */
5681 if (*flags & CPU_DTRACE_FAULT)
5682 goto out;
5683
5684 /*
5685 * Now we want to walk up the stack, calling the USTACK helper. For
5686 * each iteration, we restore the scratch pointer.
5687 */
5688 for (i = 0; i < nframes; i++) {
5689 mstate->dtms_scratch_ptr = saved;
5690
5691 if (offs >= strsize)
5692 break;
5693
5694 sym = (char *)(uintptr_t)dtrace_helper(
5695 DTRACE_HELPER_ACTION_USTACK,
5696 mstate, state, pcs[i], fps[i]);
5697
5698 /*
5699 * If we faulted while running the helper, we're going to
5700 * clear the fault and null out the corresponding string.
5701 */
5702 if (*flags & CPU_DTRACE_FAULT) {
5703 *flags &= ~CPU_DTRACE_FAULT;
5704 str[offs++] = '\0';
5705 continue;
5706 }
5707
5708 if (sym == NULL) {
5709 str[offs++] = '\0';
5710 continue;
5711 }
5712
5713 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5714
5715 /*
5716 * Now copy in the string that the helper returned to us.
5717 */
5718 for (j = 0; offs + j < strsize; j++) {
5719 if ((str[offs + j] = sym[j]) == '\0')
5720 break;
5721 }
5722
5723 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5724
5725 offs += j + 1;
5726 }
5727
5728 if (offs >= strsize) {
5729 /*
5730 * If we didn't have room for all of the strings, we don't
5731 * abort processing -- this needn't be a fatal error -- but we
5732 * still want to increment a counter (dts_stkstroverflows) to
5733 * allow this condition to be warned about. (If this is from
5734 * a jstack() action, it is easily tuned via jstackstrsize.)
5735 */
5736 dtrace_error(&state->dts_stkstroverflows);
5737 }
5738
5739 while (offs < strsize)
5740 str[offs++] = '\0';
5741
5742out:
5743 mstate->dtms_scratch_ptr = old;
5744}
5745
5746#ifdef VBOX
5747extern void dtrace_probe6(dtrace_id_t, uintptr_t arg0, uintptr_t arg1,
5748 uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5);
5749# define dtrace_probe_error(a1, a2, a3, a4, a5, a6) \
5750 dtrace_probe6(dtrace_probeid_error, (uintptr_t)a1, a2, a3, a4, a5, a6)
5751#endif
5752
5753/*
5754 * If you're looking for the epicenter of DTrace, you just found it. This
5755 * is the function called by the provider to fire a probe -- from which all
5756 * subsequent probe-context DTrace activity emanates.
5757 */
5758void
5759dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1,
5760 uintptr_t arg2, uintptr_t arg3, uintptr_t arg4)
5761{
5762 processorid_t cpuid;
5763 dtrace_icookie_t cookie;
5764 dtrace_probe_t *probe;
5765 dtrace_mstate_t mstate;
5766 dtrace_ecb_t *ecb;
5767 dtrace_action_t *act;
5768 intptr_t offs;
5769 size_t size;
5770 int vtime, onintr;
5771 volatile uint16_t *flags;
5772 hrtime_t now;
5773
5774#ifndef VBOX
5775 /*
5776 * Kick out immediately if this CPU is still being born (in which case
5777 * curthread will be set to -1) or the current thread can't allow
5778 * probes in its current context.
5779 */
5780 if (((uintptr_t)curthread & 1) || (curthread->t_flag & T_DONTDTRACE))
5781 return;
5782#endif
5783
5784 cookie = dtrace_interrupt_disable();
5785 probe = dtrace_probes[id - 1];
5786 cpuid = VBDT_GET_CPUID();
5787 onintr = CPU_ON_INTR(CPU);
5788
5789 if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
5790 probe->dtpr_predcache == curthread->t_predcache) {
5791 /*
5792 * We have hit in the predicate cache; we know that
5793 * this predicate would evaluate to be false.
5794 */
5795 dtrace_interrupt_enable(cookie);
5796 return;
5797 }
5798
5799#ifndef VBOX
5800 if (panic_quiesce) {
5801 /*
5802 * We don't trace anything if we're panicking.
5803 */
5804 dtrace_interrupt_enable(cookie);
5805 return;
5806 }
5807#endif
5808
5809 now = dtrace_gethrtime();
5810 vtime = dtrace_vtime_references != 0;
5811
5812 if (vtime && curthread->t_dtrace_start)
5813 curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;
5814
5815 mstate.dtms_difo = NULL;
5816 mstate.dtms_probe = probe;
5817 mstate.dtms_strtok = NULL;
5818 mstate.dtms_arg[0] = arg0;
5819 mstate.dtms_arg[1] = arg1;
5820 mstate.dtms_arg[2] = arg2;
5821 mstate.dtms_arg[3] = arg3;
5822 mstate.dtms_arg[4] = arg4;
5823
5824 flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;
5825
5826 for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
5827 dtrace_predicate_t *pred = ecb->dte_predicate;
5828 dtrace_state_t *state = ecb->dte_state;
5829 dtrace_buffer_t *buf = &state->dts_buffer[cpuid];
5830 dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
5831 dtrace_vstate_t *vstate = &state->dts_vstate;
5832 dtrace_provider_t *prov = probe->dtpr_provider;
5833 int committed = 0;
5834 caddr_t tomax;
5835
5836 /*
5837 * A little subtlety with the following (seemingly innocuous)
5838 * declaration of the automatic 'val': by looking at the
5839 * code, you might think that it could be declared in the
5840 * action processing loop, below. (That is, it's only used in
5841 * the action processing loop.) However, it must be declared
5842 * out of that scope because in the case of DIF expression
5843 * arguments to aggregating actions, one iteration of the
5844 * action loop will use the last iteration's value.
5845 */
5846#ifdef lint
5847 uint64_t val = 0;
5848#else
5849 uint64_t val VBDTUNASS(0);
5850#endif
5851
5852 mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
5853 *flags &= ~CPU_DTRACE_ERROR;
5854
5855 if (prov == dtrace_provider) {
5856 /*
5857 * If dtrace itself is the provider of this probe,
5858 * we're only going to continue processing the ECB if
5859 * arg0 (the dtrace_state_t) is equal to the ECB's
5860 * creating state. (This prevents disjoint consumers
5861 * from seeing one another's metaprobes.)
5862 */
5863 if (arg0 != (uint64_t)(uintptr_t)state)
5864 continue;
5865 }
5866
5867 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {
5868 /*
5869 * We're not currently active. If our provider isn't
5870 * the dtrace pseudo provider, we're not interested.
5871 */
5872 if (prov != dtrace_provider)
5873 continue;
5874
5875 /*
5876 * Now we must further check if we are in the BEGIN
5877 * probe. If we are, we will only continue processing
5878 * if we're still in WARMUP -- if one BEGIN enabling
5879 * has invoked the exit() action, we don't want to
5880 * evaluate subsequent BEGIN enablings.
5881 */
5882 if (probe->dtpr_id == dtrace_probeid_begin &&
5883 state->dts_activity != DTRACE_ACTIVITY_WARMUP) {
5884 ASSERT(state->dts_activity ==
5885 DTRACE_ACTIVITY_DRAINING);
5886 continue;
5887 }
5888 }
5889
5890 if (ecb->dte_cond) {
5891 /*
5892 * If the dte_cond bits indicate that this
5893 * consumer is only allowed to see user-mode firings
5894 * of this probe, call the provider's dtps_usermode()
5895 * entry point to check that the probe was fired
5896 * while in a user context. Skip this ECB if that's
5897 * not the case.
5898 */
5899 if ((ecb->dte_cond & DTRACE_COND_USERMODE) &&
5900 prov->dtpv_pops.dtps_usermode(prov->dtpv_arg,
5901 probe->dtpr_id, probe->dtpr_arg) == 0)
5902 continue;
5903
5904 /*
5905 * This is more subtle than it looks. We have to be
5906 * absolutely certain that CRED() isn't going to
5907 * change out from under us so it's only legit to
5908 * examine that structure if we're in constrained
5909 * situations. Currently, the only time we'll do this
5910 * check is if a non-super-user has enabled the
5911 * profile or syscall providers -- providers that
5912 * allow visibility of all processes. For the
5913 * profile case, the check above will ensure that
5914 * we're examining a user context.
5915 */
5916 if (ecb->dte_cond & DTRACE_COND_OWNER) {
5917 cred_t *cr;
5918 cred_t *s_cr =
5919 ecb->dte_state->dts_cred.dcr_cred;
5920 proc_t *proc;
5921
5922 ASSERT(s_cr != NULL);
5923
5924 if ((cr = CRED()) == NULL ||
5925 s_cr->cr_uid != cr->cr_uid ||
5926 s_cr->cr_uid != cr->cr_ruid ||
5927 s_cr->cr_uid != cr->cr_suid ||
5928 s_cr->cr_gid != cr->cr_gid ||
5929 s_cr->cr_gid != cr->cr_rgid ||
5930 s_cr->cr_gid != cr->cr_sgid ||
5931 (proc = VBDT_GET_PROC()) == NULL ||
5932 (proc->p_flag & SNOCD))
5933 continue;
5934 }
5935
5936#ifndef VBOX
5937 if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
5938 cred_t *cr;
5939 cred_t *s_cr =
5940 ecb->dte_state->dts_cred.dcr_cred;
5941
5942 ASSERT(s_cr != NULL);
5943
5944 if ((cr = CRED()) == NULL ||
5945 s_cr->cr_zone->zone_id !=
5946 cr->cr_zone->zone_id)
5947 continue;
5948 }
5949#endif
5950 }
5951
5952 if (now - state->dts_alive > dtrace_deadman_timeout) {
5953 /*
5954 * We seem to be dead. Unless we (a) have kernel
5955			 * destructive permissions, (b) have explicitly enabled
5956 * destructive actions and (c) destructive actions have
5957 * not been disabled, we're going to transition into
5958 * the KILLED state, from which no further processing
5959 * on this state will be performed.
5960 */
5961 if (!dtrace_priv_kernel_destructive(state) ||
5962 !state->dts_cred.dcr_destructive ||
5963 dtrace_destructive_disallow) {
5964 void *activity = &state->dts_activity;
5965 dtrace_activity_t current;
5966
5967 do {
5968 current = state->dts_activity;
5969 } while (dtrace_cas32(activity, current,
5970 DTRACE_ACTIVITY_KILLED) != current);
5971
5972 continue;
5973 }
5974 }
5975
5976 if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
5977 ecb->dte_alignment, state, &mstate)) < 0)
5978 continue;
5979
5980 tomax = buf->dtb_tomax;
5981 ASSERT(tomax != NULL);
5982
5983 if (ecb->dte_size != 0)
5984 DTRACE_STORE(uint32_t, tomax, offs, ecb->dte_epid);
5985
5986 mstate.dtms_epid = ecb->dte_epid;
5987 mstate.dtms_present |= DTRACE_MSTATE_EPID;
5988
5989 if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
5990 mstate.dtms_access = DTRACE_ACCESS_KERNEL;
5991 else
5992 mstate.dtms_access = 0;
5993
5994 if (pred != NULL) {
5995 dtrace_difo_t *dp = pred->dtp_difo;
5996 int rval;
5997
5998 rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
5999
6000 if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
6001 dtrace_cacheid_t cid = probe->dtpr_predcache;
6002
6003 if (cid != DTRACE_CACHEIDNONE && !onintr) {
6004 /*
6005 * Update the predicate cache...
6006 */
6007 ASSERT(cid == pred->dtp_cacheid);
6008 curthread->t_predcache = cid;
6009 }
6010
6011 continue;
6012 }
6013 }
6014
6015 for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&
6016 act != NULL; act = act->dta_next) {
6017 size_t valoffs;
6018 dtrace_difo_t *dp;
6019 dtrace_recdesc_t *rec = &act->dta_rec;
6020
6021 size = rec->dtrd_size;
6022 valoffs = offs + rec->dtrd_offset;
6023
6024 if (DTRACEACT_ISAGG(act->dta_kind)) {
6025 uint64_t v = 0xbad;
6026 dtrace_aggregation_t *agg;
6027
6028 agg = (dtrace_aggregation_t *)act;
6029
6030 if ((dp = act->dta_difo) != NULL)
6031 v = dtrace_dif_emulate(dp,
6032 &mstate, vstate, state);
6033
6034 if (*flags & CPU_DTRACE_ERROR)
6035 continue;
6036
6037 /*
6038 * Note that we always pass the expression
6039 * value from the previous iteration of the
6040 * action loop. This value will only be used
6041 * if there is an expression argument to the
6042 * aggregating action, denoted by the
6043 * dtag_hasarg field.
6044 */
6045 dtrace_aggregate(agg, buf,
6046 offs, aggbuf, v, val);
6047 continue;
6048 }
6049
6050 switch (act->dta_kind) {
6051 case DTRACEACT_STOP:
6052#ifndef VBOX
6053 if (dtrace_priv_proc_destructive(state))
6054 dtrace_action_stop();
6055#else
6056 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6057#endif
6058 continue;
6059
6060 case DTRACEACT_BREAKPOINT:
6061#ifndef VBOX
6062 if (dtrace_priv_kernel_destructive(state))
6063 dtrace_action_breakpoint(ecb);
6064#else
6065 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6066#endif
6067 continue;
6068
6069 case DTRACEACT_PANIC:
6070#ifndef VBOX
6071 if (dtrace_priv_kernel_destructive(state))
6072 dtrace_action_panic(ecb);
6073#endif
6074 continue;
6075
6076 case DTRACEACT_STACK:
6077 if (!dtrace_priv_kernel(state))
6078 continue;
6079
6080 dtrace_getpcstack((pc_t *)(tomax + valoffs),
6081 VBDTCAST(int)(size / sizeof (pc_t)), probe->dtpr_aframes,
6082 DTRACE_ANCHORED(probe) ? NULL :
6083 (uint32_t *)arg0);
6084
6085 continue;
6086
6087 case DTRACEACT_JSTACK:
6088 case DTRACEACT_USTACK:
6089 if (!dtrace_priv_proc(state))
6090 continue;
6091
6092 /*
6093 * See comment in DIF_VAR_PID.
6094 */
6095 if (DTRACE_ANCHORED(mstate.dtms_probe) &&
6096 CPU_ON_INTR(CPU)) {
6097 int depth = DTRACE_USTACK_NFRAMES(
6098 rec->dtrd_arg) + 1;
6099
6100 dtrace_bzero((void *)(tomax + valoffs),
6101 DTRACE_USTACK_STRSIZE(rec->dtrd_arg)
6102 + depth * sizeof (uint64_t));
6103
6104 continue;
6105 }
6106
6107 if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 &&
6108 curproc->p_dtrace_helpers != NULL) {
6109 /*
6110 * This is the slow path -- we have
6111 * allocated string space, and we're
6112 * getting the stack of a process that
6113 * has helpers. Call into a separate
6114 * routine to perform this processing.
6115 */
6116 dtrace_action_ustack(&mstate, state,
6117 (uint64_t *)(tomax + valoffs),
6118 rec->dtrd_arg);
6119 continue;
6120 }
6121
6122 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6123 dtrace_getupcstack((uint64_t *)
6124 (tomax + valoffs),
6125 DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1);
6126 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6127 continue;
6128
6129 default:
6130 break;
6131 }
6132
6133 dp = act->dta_difo;
6134 ASSERT(dp != NULL);
6135
6136 val = dtrace_dif_emulate(dp, &mstate, vstate, state);
6137
6138 if (*flags & CPU_DTRACE_ERROR)
6139 continue;
6140
6141 switch (act->dta_kind) {
6142 case DTRACEACT_SPECULATE:
6143 ASSERT(buf == &state->dts_buffer[cpuid]);
6144 buf = dtrace_speculation_buffer(state,
6145 cpuid, val);
6146
6147 if (buf == NULL) {
6148 *flags |= CPU_DTRACE_DROP;
6149 continue;
6150 }
6151
6152 offs = dtrace_buffer_reserve(buf,
6153 ecb->dte_needed, ecb->dte_alignment,
6154 state, NULL);
6155
6156 if (offs < 0) {
6157 *flags |= CPU_DTRACE_DROP;
6158 continue;
6159 }
6160
6161 tomax = buf->dtb_tomax;
6162 ASSERT(tomax != NULL);
6163
6164 if (ecb->dte_size != 0)
6165 DTRACE_STORE(uint32_t, tomax, offs,
6166 ecb->dte_epid);
6167 continue;
6168
6169 case DTRACEACT_CHILL:
6170#ifndef VBOX
6171 if (dtrace_priv_kernel_destructive(state))
6172 dtrace_action_chill(&mstate, val);
6173#else
6174 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6175#endif
6176 continue;
6177
6178 case DTRACEACT_RAISE:
6179#ifndef VBOX
6180 if (dtrace_priv_proc_destructive(state))
6181 dtrace_action_raise(val);
6182#else
6183 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6184#endif
6185 continue;
6186
6187 case DTRACEACT_COMMIT:
6188 ASSERT(!committed);
6189
6190 /*
6191 * We need to commit our buffer state.
6192 */
6193 if (ecb->dte_size)
6194 buf->dtb_offset = offs + ecb->dte_size;
6195 buf = &state->dts_buffer[cpuid];
6196 dtrace_speculation_commit(state, cpuid, val);
6197 committed = 1;
6198 continue;
6199
6200 case DTRACEACT_DISCARD:
6201 dtrace_speculation_discard(state, cpuid, val);
6202 continue;
6203
6204 case DTRACEACT_DIFEXPR:
6205 case DTRACEACT_LIBACT:
6206 case DTRACEACT_PRINTF:
6207 case DTRACEACT_PRINTA:
6208 case DTRACEACT_SYSTEM:
6209 case DTRACEACT_FREOPEN:
6210 break;
6211
6212 case DTRACEACT_SYM:
6213 case DTRACEACT_MOD:
6214 if (!dtrace_priv_kernel(state))
6215 continue;
6216 break;
6217
6218 case DTRACEACT_USYM:
6219 case DTRACEACT_UMOD:
6220 case DTRACEACT_UADDR: {
6221#ifndef VBOX
6222 struct pid *pid = curthread->t_procp->p_pidp;
6223
6224 if (!dtrace_priv_proc(state))
6225 continue;
6226
6227 DTRACE_STORE(uint64_t, tomax,
6228 valoffs, (uint64_t)pid->pid_id);
6229 DTRACE_STORE(uint64_t, tomax,
6230 valoffs + sizeof (uint64_t), val);
6231#else
6232 DTRACE_CPUFLAG_SET(CPU_DTRACE_UPRIV);
6233#endif
6234 continue;
6235 }
6236
6237 case DTRACEACT_EXIT: {
6238 /*
6239 * For the exit action, we are going to attempt
6240 * to atomically set our activity to be
6241 * draining. If this fails (either because
6242 * another CPU has beat us to the exit action,
6243 * or because our current activity is something
6244 * other than ACTIVE or WARMUP), we will
6245 * continue. This assures that the exit action
6246 * can be successfully recorded at most once
6247 * when we're in the ACTIVE state. If we're
6248 * encountering the exit() action while in
6249 * COOLDOWN, however, we want to honor the new
6250 * status code. (We know that we're the only
6251 * thread in COOLDOWN, so there is no race.)
6252 */
6253 void *activity = &state->dts_activity;
6254 dtrace_activity_t current = state->dts_activity;
6255
6256 if (current == DTRACE_ACTIVITY_COOLDOWN)
6257 break;
6258
6259 if (current != DTRACE_ACTIVITY_WARMUP)
6260 current = DTRACE_ACTIVITY_ACTIVE;
6261
6262 if (dtrace_cas32(activity, current,
6263 DTRACE_ACTIVITY_DRAINING) != current) {
6264 *flags |= CPU_DTRACE_DROP;
6265 continue;
6266 }
6267
6268 break;
6269 }
6270
6271 default:
6272#ifndef VBOX
6273 ASSERT(0);
6274#else
6275 AssertFatalMsgFailed(("%d\n", act->dta_kind));
6276#endif
6277 }
6278
6279 if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF) {
6280 uintptr_t end = valoffs + size;
6281
6282 if (!dtrace_vcanload((void *)(uintptr_t)val,
6283 &dp->dtdo_rtype, &mstate, vstate))
6284 continue;
6285
6286 /*
6287 * If this is a string, we're going to only
6288 * load until we find the zero byte -- after
6289 * which we'll store zero bytes.
6290 */
6291 if (dp->dtdo_rtype.dtdt_kind ==
6292 DIF_TYPE_STRING) {
6293 char c = '\0' + 1;
6294 int intuple = act->dta_intuple;
6295 size_t s;
6296
6297 for (s = 0; s < size; s++) {
6298 if (c != '\0')
6299 c = dtrace_load8(val++);
6300
6301 DTRACE_STORE(uint8_t, tomax,
6302 valoffs++, c);
6303
6304 if (c == '\0' && intuple)
6305 break;
6306 }
6307
6308 continue;
6309 }
6310
6311 while (valoffs < end) {
6312 DTRACE_STORE(uint8_t, tomax, valoffs++,
6313 dtrace_load8(val++));
6314 }
6315
6316 continue;
6317 }
6318
6319 switch (size) {
6320 case 0:
6321 break;
6322
6323 case sizeof (uint8_t):
6324 DTRACE_STORE(uint8_t, tomax, valoffs, val);
6325 break;
6326 case sizeof (uint16_t):
6327 DTRACE_STORE(uint16_t, tomax, valoffs, val);
6328 break;
6329 case sizeof (uint32_t):
6330 DTRACE_STORE(uint32_t, tomax, valoffs, val);
6331 break;
6332 case sizeof (uint64_t):
6333 DTRACE_STORE(uint64_t, tomax, valoffs, val);
6334 break;
6335 default:
6336 /*
6337 * Any other size should have been returned by
6338 * reference, not by value.
6339 */
6340#ifndef VBOX
6341 ASSERT(0);
6342#else
6343 AssertFatalMsgFailed(("%zu\n", size));
6344#endif
6345 break;
6346 }
6347 }
6348
6349 if (*flags & CPU_DTRACE_DROP)
6350 continue;
6351
6352 if (*flags & CPU_DTRACE_FAULT) {
6353 int ndx;
6354 dtrace_action_t *err;
6355
6356 buf->dtb_errors++;
6357
6358 if (probe->dtpr_id == dtrace_probeid_error) {
6359 /*
6360 * There's nothing we can do -- we had an
6361 * error on the error probe. We bump an
6362 * error counter to at least indicate that
6363 * this condition happened.
6364 */
6365 dtrace_error(&state->dts_dblerrors);
6366 continue;
6367 }
6368
6369 if (vtime) {
6370 /*
6371 * Before recursing on dtrace_probe(), we
6372 * need to explicitly clear out our start
6373 * time to prevent it from being accumulated
6374 * into t_dtrace_vtime.
6375 */
6376 curthread->t_dtrace_start = 0;
6377 }
6378
6379 /*
6380 * Iterate over the actions to figure out which action
6381 * we were processing when we experienced the error.
6382 * Note that act points _past_ the faulting action; if
6383 * act is ecb->dte_action, the fault was in the
6384			 * predicate; if it's ecb->dte_action->dta_next, it's
6385 * in action #1, and so on.
6386 */
6387 for (err = ecb->dte_action, ndx = 0;
6388 err != act; err = err->dta_next, ndx++)
6389 continue;
6390
6391 dtrace_probe_error(state, ecb->dte_epid, ndx,
6392 (mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
6393 mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),
6394 cpu_core[cpuid].cpuc_dtrace_illval);
6395
6396 continue;
6397 }
6398
6399 if (!committed)
6400 buf->dtb_offset = offs + ecb->dte_size;
6401 }
6402
6403 if (vtime)
6404 curthread->t_dtrace_start = dtrace_gethrtime();
6405
6406 dtrace_interrupt_enable(cookie);
6407}
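
/*
 * A sketch of the per-firing record layout that the loop above lays
 * down for one ECB (the offsets N1 and N2 are illustrative; the real
 * values come from each action's dtrd_offset):
 *
 *	tomax + offs + 0:   uint32_t dte_epid -- identifies the ECB
 *	tomax + offs + N1:  data for action #1 (rec->dtrd_offset == N1)
 *	tomax + offs + N2:  data for action #2 (rec->dtrd_offset == N2)
 *	...
 *
 * Once all actions have been processed (and absent a commit or a
 * drop), buf->dtb_offset is advanced to offs + ecb->dte_size.
 */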
6408
6409/*
6410 * DTrace Probe Hashing Functions
6411 *
6412 * The functions in this section (and indeed, the functions in remaining
6413 * sections) are not _called_ from probe context. (Any exceptions to this are
6414 * marked with a "Note:".) Rather, they are called from elsewhere in the
6415 * DTrace framework to look up probes in, add probes to, and remove probes from
6416 * the DTrace probe hashes. (Each probe is hashed by each element of the
6417 * probe tuple -- allowing for fast lookups, regardless of what was
6418 * specified.)
6419 */
6420static uint_t
6421dtrace_hash_str(char *p)
6422{
6423 unsigned int g;
6424 uint_t hval = 0;
6425
6426 while (*p) {
6427 hval = (hval << 4) + *p++;
6428 if ((g = (hval & 0xf0000000)) != 0)
6429 hval ^= g >> 24;
6430 hval &= ~g;
6431 }
6432 return (hval);
6433}
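
/*
 * This is the classic shift-and-XOR string hash (the same scheme used
 * for ELF symbol tables).  A minimal user-land sketch of its behavior;
 * hash_str() here is a hypothetical standalone copy, not part of this
 * file:
 *
 *	#include <stdio.h>
 *
 *	static unsigned int
 *	hash_str(const char *p)
 *	{
 *		unsigned int g, hval = 0;
 *
 *		while (*p != '\0') {
 *			hval = (hval << 4) + *p++;
 *			if ((g = (hval & 0xf0000000)) != 0)
 *				hval ^= g >> 24;
 *			hval &= ~g;
 *		}
 *		return (hval);
 *	}
 *
 *	int
 *	main(void)
 *	{
 *		// The bucket index is the hash masked by dth_mask, a
 *		// power of two minus one; see dtrace_hash_add().
 *		printf("%u\n", hash_str("syscall") & 7);
 *		return (0);
 *	}
 */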
6434
6435static dtrace_hash_t *
6436dtrace_hash_create(uintptr_t stroffs, uintptr_t nextoffs, uintptr_t prevoffs)
6437{
6438 dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
6439
6440 hash->dth_stroffs = stroffs;
6441 hash->dth_nextoffs = nextoffs;
6442 hash->dth_prevoffs = prevoffs;
6443
6444 hash->dth_size = 1;
6445 hash->dth_mask = hash->dth_size - 1;
6446
6447 hash->dth_tab = kmem_zalloc(hash->dth_size *
6448 sizeof (dtrace_hashbucket_t *), KM_SLEEP);
6449
6450 return (hash);
6451}
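
/*
 * The offsets describe where, within a dtrace_probe_t, the hashed
 * string and the chain links live.  The three probe hashes are
 * created from the driver cookbook code along these lines (a sketch
 * of the dtrace_bymod case):
 *
 *	dtrace_bymod = dtrace_hash_create(
 *	    offsetof(dtrace_probe_t, dtpr_mod),
 *	    offsetof(dtrace_probe_t, dtpr_nextmod),
 *	    offsetof(dtrace_probe_t, dtpr_prevmod));
 *
 * The string at dth_stroffs is the hashed key; dth_nextoffs and
 * dth_prevoffs locate the per-probe chain links maintained by
 * dtrace_hash_add() and dtrace_hash_remove().
 */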
6452
6453static void
6454dtrace_hash_destroy(dtrace_hash_t *hash)
6455{
6456#ifdef DEBUG
6457 int i;
6458
6459 for (i = 0; i < hash->dth_size; i++)
6460 ASSERT(hash->dth_tab[i] == NULL);
6461#endif
6462
6463 kmem_free(hash->dth_tab,
6464 hash->dth_size * sizeof (dtrace_hashbucket_t *));
6465 kmem_free(hash, sizeof (dtrace_hash_t));
6466}
6467
6468static void
6469dtrace_hash_resize(dtrace_hash_t *hash)
6470{
6471 int size = hash->dth_size, i, ndx;
6472 int new_size = hash->dth_size << 1;
6473 int new_mask = new_size - 1;
6474 dtrace_hashbucket_t **new_tab, *bucket, *next;
6475
6476 ASSERT((new_size & new_mask) == 0);
6477
6478 new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
6479
6480 for (i = 0; i < size; i++) {
6481 for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
6482 dtrace_probe_t *probe = bucket->dthb_chain;
6483
6484 ASSERT(probe != NULL);
6485 ndx = DTRACE_HASHSTR(hash, probe) & new_mask;
6486
6487 next = bucket->dthb_next;
6488 bucket->dthb_next = new_tab[ndx];
6489 new_tab[ndx] = bucket;
6490 }
6491 }
6492
6493 kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
6494 hash->dth_tab = new_tab;
6495 hash->dth_size = new_size;
6496 hash->dth_mask = new_mask;
6497}
6498
6499static void
6500dtrace_hash_add(dtrace_hash_t *hash, dtrace_probe_t *new)
6501{
6502 int hashval = DTRACE_HASHSTR(hash, new);
6503 int ndx = hashval & hash->dth_mask;
6504 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6505 dtrace_probe_t **nextp, **prevp;
6506
6507 for (; bucket != NULL; bucket = bucket->dthb_next) {
6508 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
6509 goto add;
6510 }
6511
6512 if ((hash->dth_nbuckets >> 1) > hash->dth_size) {
6513 dtrace_hash_resize(hash);
6514 dtrace_hash_add(hash, new);
6515 return;
6516 }
6517
6518 bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
6519 bucket->dthb_next = hash->dth_tab[ndx];
6520 hash->dth_tab[ndx] = bucket;
6521 hash->dth_nbuckets++;
6522
6523add:
6524 nextp = DTRACE_HASHNEXT(hash, new);
6525 ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);
6526 *nextp = bucket->dthb_chain;
6527
6528 if (bucket->dthb_chain != NULL) {
6529 prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
6530 ASSERT(*prevp == NULL);
6531 *prevp = new;
6532 }
6533
6534 bucket->dthb_chain = new;
6535 bucket->dthb_len++;
6536}
6537
6538static dtrace_probe_t *
6539dtrace_hash_lookup(dtrace_hash_t *hash, dtrace_probe_t *template)
6540{
6541 int hashval = DTRACE_HASHSTR(hash, template);
6542 int ndx = hashval & hash->dth_mask;
6543 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6544
6545 for (; bucket != NULL; bucket = bucket->dthb_next) {
6546 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
6547 return (bucket->dthb_chain);
6548 }
6549
6550 return (NULL);
6551}
6552
6553static int
6554dtrace_hash_collisions(dtrace_hash_t *hash, dtrace_probe_t *template)
6555{
6556 int hashval = DTRACE_HASHSTR(hash, template);
6557 int ndx = hashval & hash->dth_mask;
6558 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6559
6560 for (; bucket != NULL; bucket = bucket->dthb_next) {
6561 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
6562 return (bucket->dthb_len);
6563 }
6564
6565	return (0);
6566}
6567
6568static void
6569dtrace_hash_remove(dtrace_hash_t *hash, dtrace_probe_t *probe)
6570{
6571 int ndx = DTRACE_HASHSTR(hash, probe) & hash->dth_mask;
6572 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6573
6574 dtrace_probe_t **prevp = DTRACE_HASHPREV(hash, probe);
6575 dtrace_probe_t **nextp = DTRACE_HASHNEXT(hash, probe);
6576
6577 /*
6578 * Find the bucket that we're removing this probe from.
6579 */
6580 for (; bucket != NULL; bucket = bucket->dthb_next) {
6581 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, probe))
6582 break;
6583 }
6584
6585 ASSERT(bucket != NULL);
6586
6587 if (*prevp == NULL) {
6588 if (*nextp == NULL) {
6589 /*
6590 * The removed probe was the only probe on this
6591 * bucket; we need to remove the bucket.
6592 */
6593 dtrace_hashbucket_t *b = hash->dth_tab[ndx];
6594
6595 ASSERT(bucket->dthb_chain == probe);
6596 ASSERT(b != NULL);
6597
6598 if (b == bucket) {
6599 hash->dth_tab[ndx] = bucket->dthb_next;
6600 } else {
6601 while (b->dthb_next != bucket)
6602 b = b->dthb_next;
6603 b->dthb_next = bucket->dthb_next;
6604 }
6605
6606 ASSERT(hash->dth_nbuckets > 0);
6607 hash->dth_nbuckets--;
6608 kmem_free(bucket, sizeof (dtrace_hashbucket_t));
6609 return;
6610 }
6611
6612 bucket->dthb_chain = *nextp;
6613 } else {
6614 *(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;
6615 }
6616
6617 if (*nextp != NULL)
6618 *(DTRACE_HASHPREV(hash, *nextp)) = *prevp;
6619}
6620
6621/*
6622 * DTrace Utility Functions
6623 *
6624 * These are random utility functions that are _not_ called from probe context.
6625 */
6626static int
6627dtrace_badattr(const dtrace_attribute_t *a)
6628{
6629 return (a->dtat_name > DTRACE_STABILITY_MAX ||
6630 a->dtat_data > DTRACE_STABILITY_MAX ||
6631 a->dtat_class > DTRACE_CLASS_MAX);
6632}
6633
6634/*
6635 * Return a duplicate of a string. If the specified string is NULL,
6636 * this function returns a zero-length string.
6637 */
6638static char *
6639dtrace_strdup(const char *str)
6640{
6641 char *new = kmem_zalloc((str != NULL ? strlen(str) : 0) + 1, KM_SLEEP);
6642
6643 if (str != NULL)
6644 (void) strcpy(new, str);
6645
6646 return (new);
6647}
6648
6649#define DTRACE_ISALPHA(c) \
6650 (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
6651
6652static int
6653dtrace_badname(const char *s)
6654{
6655 char c;
6656
6657 if (s == NULL || (c = *s++) == '\0')
6658 return (0);
6659
6660 if (!DTRACE_ISALPHA(c) && c != '-' && c != '_' && c != '.')
6661 return (1);
6662
6663 while ((c = *s++) != '\0') {
6664 if (!DTRACE_ISALPHA(c) && (c < '0' || c > '9') &&
6665 c != '-' && c != '_' && c != '.' && c != '`')
6666 return (1);
6667 }
6668
6669 return (0);
6670}
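
/*
 * For example, "fbt" and "vminfo" pass this check, as does
 * "genunix`dounmount" (the '`' separator is legal after the first
 * character); "1probe" fails because the first character must be
 * alphabetic, '-', '_' or '.', and "my probe" fails on the space.
 */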
6671
6672static void
6673dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
6674{
6675 uint32_t priv;
6676
6677 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
6678 /*
6679 * For DTRACE_PRIV_ALL, the uid and zoneid don't matter.
6680 */
6681 priv = DTRACE_PRIV_ALL;
6682#ifdef VBOX
6683 *uidp = ~0;
6684 *zoneidp = 0;
6685#endif
6686 } else {
6687 *uidp = crgetuid(cr);
6688 *zoneidp = crgetzoneid(cr);
6689
6690 priv = 0;
6691 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
6692 priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;
6693 else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
6694 priv |= DTRACE_PRIV_USER;
6695 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
6696 priv |= DTRACE_PRIV_PROC;
6697 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
6698 priv |= DTRACE_PRIV_OWNER;
6699 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
6700 priv |= DTRACE_PRIV_ZONEOWNER;
6701 }
6702
6703 *privp = priv;
6704}
6705
6706#ifdef DTRACE_ERRDEBUG
6707static void
6708dtrace_errdebug(const char *str)
6709{
6710 int hval = dtrace_hash_str((char *)str) % DTRACE_ERRHASHSZ;
6711 int occupied = 0;
6712
6713 mutex_enter(&dtrace_errlock);
6714 dtrace_errlast = str;
6715 dtrace_errthread = curthread;
6716
6717 while (occupied++ < DTRACE_ERRHASHSZ) {
6718 if (dtrace_errhash[hval].dter_msg == str) {
6719 dtrace_errhash[hval].dter_count++;
6720 goto out;
6721 }
6722
6723 if (dtrace_errhash[hval].dter_msg != NULL) {
6724 hval = (hval + 1) % DTRACE_ERRHASHSZ;
6725 continue;
6726 }
6727
6728 dtrace_errhash[hval].dter_msg = str;
6729 dtrace_errhash[hval].dter_count = 1;
6730 goto out;
6731 }
6732
6733 panic("dtrace: undersized error hash");
6734out:
6735 mutex_exit(&dtrace_errlock);
6736}
6737#endif
6738
6739/*
6740 * DTrace Matching Functions
6741 *
6742 * These functions are used to match groups of probes, given some elements of
6743 * a probe tuple, or some globbed expressions for elements of a probe tuple.
6744 */
6745static int
6746dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
6747 zoneid_t zoneid)
6748{
6749 if (priv != DTRACE_PRIV_ALL) {
6750 uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
6751 uint32_t match = priv & ppriv;
6752
6753 /*
6754 * No PRIV_DTRACE_* privileges...
6755 */
6756 if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |
6757 DTRACE_PRIV_KERNEL)) == 0)
6758 return (0);
6759
6760 /*
6761 * No matching bits, but there were bits to match...
6762 */
6763 if (match == 0 && ppriv != 0)
6764 return (0);
6765
6766 /*
6767 * Need to have permissions to the process, but don't...
6768 */
6769 if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&
6770 uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
6771 return (0);
6772 }
6773
6774 /*
6775 * Need to be in the same zone unless we possess the
6776 * privilege to examine all zones.
6777 */
6778 if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&
6779 zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
6780 return (0);
6781 }
6782 }
6783
6784 return (1);
6785}
6786
6787/*
6788 * dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
6789 * consists of input pattern strings and an ops-vector to evaluate them.
6790 * This function returns >0 for match, 0 for no match, and <0 for error.
6791 */
6792static int
6793dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,
6794 uint32_t priv, uid_t uid, zoneid_t zoneid)
6795{
6796 dtrace_provider_t *pvp = prp->dtpr_provider;
6797 int rv;
6798
6799 if (pvp->dtpv_defunct)
6800 return (0);
6801
6802 if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0)
6803 return (rv);
6804
6805 if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)
6806 return (rv);
6807
6808 if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)
6809 return (rv);
6810
6811 if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)
6812 return (rv);
6813
6814 if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)
6815 return (0);
6816
6817 return (rv);
6818}
6819
6820/*
6821 * dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
6822 * interface for matching a glob pattern 'p' to an input string 's'. Unlike
6823 * libc's version, the kernel version only applies to 8-bit ASCII strings.
6824 * In addition, all of the recursion cases except for '*' matching have been
6825 * unwound. For '*', we still implement recursive evaluation, but a depth
6826 * counter is maintained and matching is aborted if we recurse too deep.
6827 * The function returns 0 if no match, >0 if match, and <0 if recursion error.
6828 */
6829static int
6830dtrace_match_glob(const char *s, const char *p, int depth)
6831{
6832 const char *olds;
6833 char s1, c;
6834 int gs;
6835
6836 if (depth > DTRACE_PROBEKEY_MAXDEPTH)
6837 return (-1);
6838
6839 if (s == NULL)
6840 s = ""; /* treat NULL as empty string */
6841
6842top:
6843 olds = s;
6844 s1 = *s++;
6845
6846 if (p == NULL)
6847 return (0);
6848
6849 if ((c = *p++) == '\0')
6850 return (s1 == '\0');
6851
6852 switch (c) {
6853 case '[': {
6854 int ok = 0, notflag = 0;
6855 char lc = '\0';
6856
6857 if (s1 == '\0')
6858 return (0);
6859
6860 if (*p == '!') {
6861 notflag = 1;
6862 p++;
6863 }
6864
6865 if ((c = *p++) == '\0')
6866 return (0);
6867
6868 do {
6869 if (c == '-' && lc != '\0' && *p != ']') {
6870 if ((c = *p++) == '\0')
6871 return (0);
6872 if (c == '\\' && (c = *p++) == '\0')
6873 return (0);
6874
6875 if (notflag) {
6876 if (s1 < lc || s1 > c)
6877 ok++;
6878 else
6879 return (0);
6880 } else if (lc <= s1 && s1 <= c)
6881 ok++;
6882
6883 } else if (c == '\\' && (c = *p++) == '\0')
6884 return (0);
6885
6886 lc = c; /* save left-hand 'c' for next iteration */
6887
6888 if (notflag) {
6889 if (s1 != c)
6890 ok++;
6891 else
6892 return (0);
6893 } else if (s1 == c)
6894 ok++;
6895
6896 if ((c = *p++) == '\0')
6897 return (0);
6898
6899 } while (c != ']');
6900
6901 if (ok)
6902 goto top;
6903
6904 return (0);
6905 }
6906
6907 case '\\':
6908 if ((c = *p++) == '\0')
6909 return (0);
6910 /*FALLTHRU*/
6911
6912 default:
6913 if (c != s1)
6914 return (0);
6915 /*FALLTHRU*/
6916
6917 case '?':
6918 if (s1 != '\0')
6919 goto top;
6920 return (0);
6921
6922 case '*':
6923 while (*p == '*')
6924 p++; /* consecutive *'s are identical to a single one */
6925
6926 if (*p == '\0')
6927 return (1);
6928
6929 for (s = olds; *s != '\0'; s++) {
6930 if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)
6931 return (gs);
6932 }
6933
6934 return (0);
6935 }
6936}
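
/*
 * Some examples of the semantics above (a return value of 1 denotes
 * a match, 0 no match):
 *
 *	dtrace_match_glob("read", "read", 0)    == 1
 *	dtrace_match_glob("read", "re*", 0)     == 1
 *	dtrace_match_glob("read", "r?a?", 0)    == 1
 *	dtrace_match_glob("read", "[rw]*", 0)   == 1
 *	dtrace_match_glob("read", "[!rw]*", 0)  == 0
 *	dtrace_match_glob(NULL, "*", 0)         == 1  (NULL acts as "")
 */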
6937
6938/*ARGSUSED*/
6939static int
6940dtrace_match_string(const char *s, const char *p, int depth)
6941{
6942 return (s != NULL && strcmp(s, p) == 0);
6943}
6944
6945/*ARGSUSED*/
6946static int
6947dtrace_match_nul(const char *s, const char *p, int depth)
6948{
6949 return (1); /* always match the empty pattern */
6950}
6951
6952/*ARGSUSED*/
6953static int
6954dtrace_match_nonzero(const char *s, const char *p, int depth)
6955{
6956 return (s != NULL && s[0] != '\0');
6957}
6958
6959static int
6960dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
6961 zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *), void *arg)
6962{
6963 dtrace_probe_t template, *probe;
6964 dtrace_hash_t *hash = NULL;
6965 int len, rc, best = INT_MAX, nmatched = 0;
6966 dtrace_id_t i;
6967
6968 ASSERT(MUTEX_HELD(&dtrace_lock));
6969
6970 /*
6971 * If the probe ID is specified in the key, just lookup by ID and
6972 * invoke the match callback once if a matching probe is found.
6973 */
6974 if (pkp->dtpk_id != DTRACE_IDNONE) {
6975 if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
6976 dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
6977 if ((*matched)(probe, arg) == DTRACE_MATCH_FAIL)
6978 return (DTRACE_MATCH_FAIL);
6979 nmatched++;
6980 }
6981 return (nmatched);
6982 }
6983
6984 template.dtpr_mod = (char *)pkp->dtpk_mod;
6985 template.dtpr_func = (char *)pkp->dtpk_func;
6986 template.dtpr_name = (char *)pkp->dtpk_name;
6987
6988 /*
6989 * We want to find the most distinct of the module name, function
6990 * name, and name. So for each one that is not a glob pattern or
6991 * empty string, we perform a lookup in the corresponding hash and
6992 * use the hash table with the fewest collisions to do our search.
6993 */
6994 if (pkp->dtpk_mmatch == &dtrace_match_string &&
6995 (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
6996 best = len;
6997 hash = dtrace_bymod;
6998 }
6999
7000 if (pkp->dtpk_fmatch == &dtrace_match_string &&
7001 (len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {
7002 best = len;
7003 hash = dtrace_byfunc;
7004 }
7005
7006 if (pkp->dtpk_nmatch == &dtrace_match_string &&
7007 (len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {
7008 best = len;
7009 hash = dtrace_byname;
7010 }
7011
7012 /*
7013 * If we did not select a hash table, iterate over every probe and
7014 * invoke our callback for each one that matches our input probe key.
7015 */
7016 if (hash == NULL) {
7017 for (i = 0; i < VBDTCAST(dtrace_id_t)dtrace_nprobes; i++) {
7018 if ((probe = dtrace_probes[i]) == NULL ||
7019 dtrace_match_probe(probe, pkp, priv, uid,
7020 zoneid) <= 0)
7021 continue;
7022
7023 nmatched++;
7024
7025 if ((rc = (*matched)(probe, arg)) !=
7026 DTRACE_MATCH_NEXT) {
7027 if (rc == DTRACE_MATCH_FAIL)
7028 return (DTRACE_MATCH_FAIL);
7029 break;
7030 }
7031 }
7032
7033 return (nmatched);
7034 }
7035
7036 /*
7037 * If we selected a hash table, iterate over each probe of the same key
7038 * name and invoke the callback for every probe that matches the other
7039 * attributes of our input probe key.
7040 */
7041 for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;
7042 probe = *(DTRACE_HASHNEXT(hash, probe))) {
7043
7044 if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)
7045 continue;
7046
7047 nmatched++;
7048
7049 if ((rc = (*matched)(probe, arg)) != DTRACE_MATCH_NEXT) {
7050 if (rc == DTRACE_MATCH_FAIL)
7051 return (DTRACE_MATCH_FAIL);
7052 break;
7053 }
7054 }
7055
7056 return (nmatched);
7057}
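
/*
 * For example, for the description "syscall:::entry" the module and
 * function patterns are empty (dtrace_match_nul()) and the name is a
 * plain string, so the search above degenerates to walking the
 * dtrace_byname chain for "entry" rather than scanning all of
 * dtrace_probes[].
 */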
7058
7059/*
7060 * Return the function pointer dtrace_probecmp() should use to compare the
7061 * specified pattern with a string. For NULL or empty patterns, we select
7062 * dtrace_match_nul(). For glob pattern strings, we use dtrace_match_glob().
7063 * For non-empty non-glob strings, we use dtrace_match_string().
7064 */
7065static dtrace_probekey_f *
7066dtrace_probekey_func(const char *p)
7067{
7068 char c;
7069
7070 if (p == NULL || *p == '\0')
7071 return (&dtrace_match_nul);
7072
7073 while ((c = *p++) != '\0') {
7074 if (c == '[' || c == '?' || c == '*' || c == '\\')
7075 return (&dtrace_match_glob);
7076 }
7077
7078 return (&dtrace_match_string);
7079}
7080
7081/*
7082 * Build a probe comparison key for use with dtrace_match_probe() from the
7083 * given probe description. By convention, a null key only matches anchored
7084 * probes: if each field is the empty string, reset dtpk_fmatch to
7085 * dtrace_match_nonzero().
7086 */
7087static void
7088dtrace_probekey(const dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
7089{
7090 pkp->dtpk_prov = pdp->dtpd_provider;
7091 pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);
7092
7093 pkp->dtpk_mod = pdp->dtpd_mod;
7094 pkp->dtpk_mmatch = dtrace_probekey_func(pdp->dtpd_mod);
7095
7096 pkp->dtpk_func = pdp->dtpd_func;
7097 pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);
7098
7099 pkp->dtpk_name = pdp->dtpd_name;
7100 pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);
7101
7102 pkp->dtpk_id = pdp->dtpd_id;
7103
7104 if (pkp->dtpk_id == DTRACE_IDNONE &&
7105 pkp->dtpk_pmatch == &dtrace_match_nul &&
7106 pkp->dtpk_mmatch == &dtrace_match_nul &&
7107 pkp->dtpk_fmatch == &dtrace_match_nul &&
7108 pkp->dtpk_nmatch == &dtrace_match_nul)
7109 pkp->dtpk_fmatch = &dtrace_match_nonzero;
7110}
7111
7112/*
7113 * DTrace Provider-to-Framework API Functions
7114 *
7115 * These functions implement much of the Provider-to-Framework API, as
7116 * described in <sys/dtrace.h>. The parts of the API not in this section are
7117 * the functions in the API for probe management (found below), and
7118 * dtrace_probe() itself (found above).
7119 */
7120
7121/*
7122 * Register the calling provider with the DTrace framework. This should
7123 * generally be called by DTrace providers in their attach(9E) entry point.
7124 */
7125int
7126dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
7127 cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)
7128{
7129 dtrace_provider_t *provider;
7130
7131 if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {
7132 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7133 "arguments", name ? name : "<NULL>");
7134 return (EINVAL);
7135 }
7136
7137 if (name[0] == '\0' || dtrace_badname(name)) {
7138 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7139 "provider name", name);
7140 return (EINVAL);
7141 }
7142
7143 if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||
7144 pops->dtps_enable == NULL || pops->dtps_disable == NULL ||
7145 pops->dtps_destroy == NULL ||
7146 ((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
7147 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7148 "provider ops", name);
7149 return (EINVAL);
7150 }
7151
7152 if (dtrace_badattr(&pap->dtpa_provider) ||
7153 dtrace_badattr(&pap->dtpa_mod) ||
7154 dtrace_badattr(&pap->dtpa_func) ||
7155 dtrace_badattr(&pap->dtpa_name) ||
7156 dtrace_badattr(&pap->dtpa_args)) {
7157 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7158 "provider attributes", name);
7159 return (EINVAL);
7160 }
7161
7162 if (priv & ~DTRACE_PRIV_ALL) {
7163 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7164 "privilege attributes", name);
7165 return (EINVAL);
7166 }
7167
7168 if ((priv & DTRACE_PRIV_KERNEL) &&
7169 (priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&
7170 pops->dtps_usermode == NULL) {
7171 cmn_err(CE_WARN, "failed to register provider '%s': need "
7172 "dtps_usermode() op for given privilege attributes", name);
7173 return (EINVAL);
7174 }
7175
7176 provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
7177 provider->dtpv_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
7178 (void) strcpy(provider->dtpv_name, name);
7179
7180 provider->dtpv_attr = *pap;
7181 provider->dtpv_priv.dtpp_flags = priv;
7182 if (cr != NULL) {
7183 provider->dtpv_priv.dtpp_uid = crgetuid(cr);
7184 provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
7185 }
7186 provider->dtpv_pops = *pops;
7187
7188 if (pops->dtps_provide == NULL) {
7189 ASSERT(pops->dtps_provide_module != NULL);
7190 provider->dtpv_pops.dtps_provide =
7191 (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop;
7192 }
7193
7194 if (pops->dtps_provide_module == NULL) {
7195 ASSERT(pops->dtps_provide != NULL);
7196 provider->dtpv_pops.dtps_provide_module =
7197 (void (*)(void *, struct modctl *))dtrace_nullop;
7198 }
7199
7200 if (pops->dtps_suspend == NULL) {
7201 ASSERT(pops->dtps_resume == NULL);
7202 provider->dtpv_pops.dtps_suspend =
7203 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
7204 provider->dtpv_pops.dtps_resume =
7205 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
7206 }
7207
7208 provider->dtpv_arg = arg;
7209 *idp = (dtrace_provider_id_t)provider;
7210
7211 if (pops == &dtrace_provider_ops) {
7212 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
7213 ASSERT(MUTEX_HELD(&dtrace_lock));
7214 ASSERT(dtrace_anon.dta_enabling == NULL);
7215
7216 /*
7217 * We make sure that the DTrace provider is at the head of
7218 * the provider chain.
7219 */
7220 provider->dtpv_next = dtrace_provider;
7221 dtrace_provider = provider;
7222 return (0);
7223 }
7224
7225 mutex_enter(&dtrace_provider_lock);
7226 mutex_enter(&dtrace_lock);
7227
7228 /*
7229 * If there is at least one provider registered, we'll add this
7230 * provider after the first provider.
7231 */
7232 if (dtrace_provider != NULL) {
7233 provider->dtpv_next = dtrace_provider->dtpv_next;
7234 dtrace_provider->dtpv_next = provider;
7235 } else {
7236 dtrace_provider = provider;
7237 }
7238
7239 if (dtrace_retained != NULL) {
7240 dtrace_enabling_provide(provider);
7241
7242 /*
7243 * Now we need to call dtrace_enabling_matchall() -- which
7244 * will acquire cpu_lock and dtrace_lock. We therefore need
7245 * to drop all of our locks before calling into it...
7246 */
7247 mutex_exit(&dtrace_lock);
7248 mutex_exit(&dtrace_provider_lock);
7249 dtrace_enabling_matchall();
7250
7251 return (0);
7252 }
7253
7254 mutex_exit(&dtrace_lock);
7255 mutex_exit(&dtrace_provider_lock);
7256
7257 return (0);
7258}
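
/*
 * A minimal registration sketch for a hypothetical kernel provider;
 * the "myprov_*" names are illustrative only.  dtps_provide (or
 * dtps_provide_module), dtps_enable, dtps_disable and dtps_destroy
 * are the mandatory ops validated above, and suspend/resume must be
 * supplied (or omitted) as a pair:
 *
 *	static dtrace_provider_id_t myprov_id;
 *
 *	static const dtrace_pops_t myprov_pops = {
 *		myprov_provide,		// dtps_provide
 *		NULL,			// dtps_provide_module
 *		myprov_enable,		// dtps_enable
 *		myprov_disable,		// dtps_disable
 *		NULL,			// dtps_suspend
 *		NULL,			// dtps_resume
 *		NULL,			// dtps_getargdesc
 *		NULL,			// dtps_getargval
 *		NULL,			// dtps_usermode
 *		myprov_destroy		// dtps_destroy
 *	};
 *
 *	// myprov_attr is a dtrace_pattr_t giving the five stability
 *	// attribute tuples (elided here).
 *	int err = dtrace_register("myprov", &myprov_attr,
 *	    DTRACE_PRIV_KERNEL, NULL, &myprov_pops, NULL, &myprov_id);
 */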
7259
7260/*
7261 * Unregister the specified provider from the DTrace framework. This should
7262 * generally be called by DTrace providers in their detach(9E) entry point.
7263 */
7264int
7265dtrace_unregister(dtrace_provider_id_t id)
7266{
7267 dtrace_provider_t *old = (dtrace_provider_t *)id;
7268 dtrace_provider_t *prev = NULL;
7269 VBDTTYPE(uint32_t,int) i, self = 0;
7270 dtrace_probe_t *probe, *first = NULL;
7271
7272 if (old->dtpv_pops.dtps_enable ==
7273 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop) {
7274 /*
7275 * If DTrace itself is the provider, we're called with locks
7276 * already held.
7277 */
7278 ASSERT(old == dtrace_provider);
7279 ASSERT(dtrace_devi != NULL);
7280 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
7281 ASSERT(MUTEX_HELD(&dtrace_lock));
7282 self = 1;
7283
7284 if (dtrace_provider->dtpv_next != NULL) {
7285 /*
7286 * There's another provider here; return failure.
7287 */
7288 return (EBUSY);
7289 }
7290 } else {
7291 mutex_enter(&dtrace_provider_lock);
7292 mutex_enter(&mod_lock);
7293 mutex_enter(&dtrace_lock);
7294 }
7295
7296 /*
7297 * If anyone has /dev/dtrace open, or if there are anonymous enabled
7298 * probes, we refuse to let providers slither away, unless this
7299 * provider has already been explicitly invalidated.
7300 */
7301 if (!old->dtpv_defunct &&
7302 (dtrace_opens || (dtrace_anon.dta_state != NULL &&
7303 dtrace_anon.dta_state->dts_necbs > 0))) {
7304 if (!self) {
7305 mutex_exit(&dtrace_lock);
7306 mutex_exit(&mod_lock);
7307 mutex_exit(&dtrace_provider_lock);
7308 }
7309 return (EBUSY);
7310 }
7311
7312 /*
7313 * Attempt to destroy the probes associated with this provider.
7314 */
7315 for (i = 0; i < dtrace_nprobes; i++) {
7316 if ((probe = dtrace_probes[i]) == NULL)
7317 continue;
7318
7319 if (probe->dtpr_provider != old)
7320 continue;
7321
7322 if (probe->dtpr_ecb == NULL)
7323 continue;
7324
7325 /*
7326 * We have at least one ECB; we can't remove this provider.
7327 */
7328 if (!self) {
7329 mutex_exit(&dtrace_lock);
7330 mutex_exit(&mod_lock);
7331 mutex_exit(&dtrace_provider_lock);
7332 }
7333 return (EBUSY);
7334 }
7335
7336 /*
7337 * All of the probes for this provider are disabled; we can safely
7338 * remove all of them from their hash chains and from the probe array.
7339 */
7340 for (i = 0; i < dtrace_nprobes; i++) {
7341 if ((probe = dtrace_probes[i]) == NULL)
7342 continue;
7343
7344 if (probe->dtpr_provider != old)
7345 continue;
7346
7347 dtrace_probes[i] = NULL;
7348
7349 dtrace_hash_remove(dtrace_bymod, probe);
7350 dtrace_hash_remove(dtrace_byfunc, probe);
7351 dtrace_hash_remove(dtrace_byname, probe);
7352
7353 if (first == NULL) {
7354 first = probe;
7355 probe->dtpr_nextmod = NULL;
7356 } else {
7357 probe->dtpr_nextmod = first;
7358 first = probe;
7359 }
7360 }
7361
7362 /*
7363 * The provider's probes have been removed from the hash chains and
7364 * from the probe array. Now issue a dtrace_sync() to be sure that
7365 * everyone has cleared out from any probe array processing.
7366 */
7367 dtrace_sync();
7368
7369 for (probe = first; probe != NULL; probe = first) {
7370 first = probe->dtpr_nextmod;
7371
7372 old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
7373 probe->dtpr_arg);
7374 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
7375 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
7376 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
7377 vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
7378 kmem_free(probe, sizeof (dtrace_probe_t));
7379 }
7380
7381 if ((prev = dtrace_provider) == old) {
7382 ASSERT(self || dtrace_devi == NULL);
7383 ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
7384 dtrace_provider = old->dtpv_next;
7385 } else {
7386 while (prev != NULL && prev->dtpv_next != old)
7387 prev = prev->dtpv_next;
7388
7389 if (prev == NULL) {
7390 panic("attempt to unregister non-existent "
7391 "dtrace provider %p\n", (void *)id);
7392 }
7393
7394 prev->dtpv_next = old->dtpv_next;
7395 }
7396
7397 if (!self) {
7398 mutex_exit(&dtrace_lock);
7399 mutex_exit(&mod_lock);
7400 mutex_exit(&dtrace_provider_lock);
7401 }
7402
7403 kmem_free(old->dtpv_name, strlen(old->dtpv_name) + 1);
7404 kmem_free(old, sizeof (dtrace_provider_t));
7405
7406 return (0);
7407}
7408
7409/*
7410 * Invalidate the specified provider. All subsequent probe lookups for the
7411 * specified provider will fail, but its probes will not be removed.
7412 */
7413void
7414dtrace_invalidate(dtrace_provider_id_t id)
7415{
7416 dtrace_provider_t *pvp = (dtrace_provider_t *)id;
7417
7418 ASSERT(pvp->dtpv_pops.dtps_enable !=
7419 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
7420
7421 mutex_enter(&dtrace_provider_lock);
7422 mutex_enter(&dtrace_lock);
7423
7424 pvp->dtpv_defunct = 1;
7425
7426 mutex_exit(&dtrace_lock);
7427 mutex_exit(&dtrace_provider_lock);
7428}
7429
7430/*
7431 * Indicate whether or not DTrace has attached.
7432 */
7433int
7434dtrace_attached(void)
7435{
7436 /*
7437 * dtrace_provider will be non-NULL iff the DTrace driver has
7438 * attached. (It's non-NULL because DTrace is always itself a
7439 * provider.)
7440 */
7441 return (dtrace_provider != NULL);
7442}
7443
7444/*
7445 * Remove all the unenabled probes for the given provider. This function is
7446 * not unlike dtrace_unregister(), except that it doesn't remove the provider
7447 * -- just as many of its associated probes as it can.
7448 */
7449int
7450dtrace_condense(dtrace_provider_id_t id)
7451{
7452 dtrace_provider_t *prov = (dtrace_provider_t *)id;
7453 VBDTTYPE(uint32_t,int) i;
7454 dtrace_probe_t *probe;
7455
7456 /*
7457 * Make sure this isn't the dtrace provider itself.
7458 */
7459 ASSERT(prov->dtpv_pops.dtps_enable !=
7460 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
7461
7462 mutex_enter(&dtrace_provider_lock);
7463 mutex_enter(&dtrace_lock);
7464
7465 /*
7466 * Attempt to destroy the probes associated with this provider.
7467 */
7468 for (i = 0; i < dtrace_nprobes; i++) {
7469 if ((probe = dtrace_probes[i]) == NULL)
7470 continue;
7471
7472 if (probe->dtpr_provider != prov)
7473 continue;
7474
7475 if (probe->dtpr_ecb != NULL)
7476 continue;
7477
7478 dtrace_probes[i] = NULL;
7479
7480 dtrace_hash_remove(dtrace_bymod, probe);
7481 dtrace_hash_remove(dtrace_byfunc, probe);
7482 dtrace_hash_remove(dtrace_byname, probe);
7483
7484 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, i + 1,
7485 probe->dtpr_arg);
7486 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
7487 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
7488 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
7489 kmem_free(probe, sizeof (dtrace_probe_t));
7490 vmem_free(dtrace_arena, (void *)((uintptr_t)i + 1), 1);
7491 }
7492
7493 mutex_exit(&dtrace_lock);
7494 mutex_exit(&dtrace_provider_lock);
7495
7496 return (0);
7497}
7498
7499/*
7500 * DTrace Probe Management Functions
7501 *
7502 * The functions in this section perform the DTrace probe management,
7503 * including functions to create probes, look-up probes, and call into the
7504 * providers to request that probes be provided. Some of these functions are
7505 * in the Provider-to-Framework API; these functions can be identified by the
7506 * fact that they are not declared "static".
7507 */
7508
7509/*
7510 * Create a probe with the specified module name, function name, and name.
7511 */
7512dtrace_id_t
7513dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
7514 const char *func, const char *name, int aframes, void *arg)
7515{
7516 dtrace_probe_t *probe, **probes;
7517 dtrace_provider_t *provider = (dtrace_provider_t *)prov;
7518 dtrace_id_t id;
7519
7520 if (provider == dtrace_provider) {
7521 ASSERT(MUTEX_HELD(&dtrace_lock));
7522 } else {
7523 mutex_enter(&dtrace_lock);
7524 }
7525
7526 id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
7527 VM_BESTFIT | VM_SLEEP);
7528 probe = kmem_zalloc(sizeof (dtrace_probe_t), KM_SLEEP);
7529
7530 probe->dtpr_id = id;
7531 probe->dtpr_gen = dtrace_probegen++;
7532 probe->dtpr_mod = dtrace_strdup(mod);
7533 probe->dtpr_func = dtrace_strdup(func);
7534 probe->dtpr_name = dtrace_strdup(name);
7535 probe->dtpr_arg = arg;
7536 probe->dtpr_aframes = aframes;
7537 probe->dtpr_provider = provider;
7538
7539 dtrace_hash_add(dtrace_bymod, probe);
7540 dtrace_hash_add(dtrace_byfunc, probe);
7541 dtrace_hash_add(dtrace_byname, probe);
7542
7543 if (id - 1 >= dtrace_nprobes) {
7544 size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
7545 size_t nsize = osize << 1;
7546
7547 if (nsize == 0) {
7548 ASSERT(osize == 0);
7549 ASSERT(dtrace_probes == NULL);
7550 nsize = sizeof (dtrace_probe_t *);
7551 }
7552
7553 probes = kmem_zalloc(nsize, KM_SLEEP);
7554
7555 if (dtrace_probes == NULL) {
7556 ASSERT(osize == 0);
7557 dtrace_probes = probes;
7558 dtrace_nprobes = 1;
7559 } else {
7560 dtrace_probe_t **oprobes = dtrace_probes;
7561
7562 bcopy(oprobes, probes, osize);
7563 dtrace_membar_producer();
7564 dtrace_probes = probes;
7565
7566 dtrace_sync();
7567
7568 /*
7569 * All CPUs are now seeing the new probes array; we can
7570 * safely free the old array.
7571 */
7572 kmem_free(oprobes, osize);
7573 dtrace_nprobes <<= 1;
7574 }
7575
7576 ASSERT(id - 1 < dtrace_nprobes);
7577 }
7578
7579 ASSERT(dtrace_probes[id - 1] == NULL);
7580 dtrace_probes[id - 1] = probe;
7581
7582 if (provider != dtrace_provider)
7583 mutex_exit(&dtrace_lock);
7584
7585 return (id);
7586}
7587
7588static dtrace_probe_t *
7589dtrace_probe_lookup_id(dtrace_id_t id)
7590{
7591 ASSERT(MUTEX_HELD(&dtrace_lock));
7592
7593 if (id == 0 || id > dtrace_nprobes)
7594 return (NULL);
7595
7596 return (dtrace_probes[id - 1]);
7597}
7598
7599static int
7600dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg)
7601{
7602 *((dtrace_id_t *)arg) = probe->dtpr_id;
7603
7604 return (DTRACE_MATCH_DONE);
7605}
7606
7607/*
7608 * Look up a probe based on provider and one or more of module name, function
7609 * name and probe name.
7610 */
7611dtrace_id_t
7612dtrace_probe_lookup(dtrace_provider_id_t prid, const char *mod,
7613 const char *func, const char *name)
7614{
7615 dtrace_probekey_t pkey;
7616 dtrace_id_t id;
7617 int match;
7618
7619 pkey.dtpk_prov = ((dtrace_provider_t *)prid)->dtpv_name;
7620 pkey.dtpk_pmatch = &dtrace_match_string;
7621 pkey.dtpk_mod = mod;
7622 pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
7623 pkey.dtpk_func = func;
7624 pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
7625 pkey.dtpk_name = name;
7626 pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
7627 pkey.dtpk_id = DTRACE_IDNONE;
7628
7629 mutex_enter(&dtrace_lock);
7630 match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
7631 dtrace_probe_lookup_match, &id);
7632 mutex_exit(&dtrace_lock);
7633
7634 ASSERT(match == 1 || match == 0);
7635 return (match ? id : 0);
7636}
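
/*
 * Providers typically use this from their dtps_provide entry point to
 * avoid re-creating probes they have already registered; a sketch
 * (the "myprov_*" names are illustrative):
 *
 *	if (dtrace_probe_lookup(myprov_id, "mymod", "myfunc",
 *	    "entry") != 0)
 *		return;		// probe already exists
 *
 *	(void) dtrace_probe_create(myprov_id, "mymod", "myfunc",
 *	    "entry", 0, myprov_parg);
 */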
7637
7638/*
7639 * Returns the probe argument associated with the specified probe.
7640 */
7641void *
7642dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
7643{
7644 dtrace_probe_t *probe;
7645 void *rval = NULL;
7646
7647 mutex_enter(&dtrace_lock);
7648
7649 if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
7650 probe->dtpr_provider == (dtrace_provider_t *)id)
7651 rval = probe->dtpr_arg;
7652
7653 mutex_exit(&dtrace_lock);
7654
7655 return (rval);
7656}
7657
7658/*
7659 * Copy a probe into a probe description.
7660 */
7661static void
7662dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
7663{
7664 bzero(pdp, sizeof (dtrace_probedesc_t));
7665 pdp->dtpd_id = prp->dtpr_id;
7666
7667 (void) strncpy(pdp->dtpd_provider,
7668 prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN - 1);
7669
7670 (void) strncpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN - 1);
7671 (void) strncpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN - 1);
7672 (void) strncpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN - 1);
7673}
7674
7675/*
7676 * Called to indicate that a probe -- or probes -- should be provided by a
7677 * specified provider. If the specified description is NULL, the provider will
7678 * be told to provide all of its probes. (This is done whenever a new
7679 * consumer comes along, or whenever a retained enabling is to be matched.) If
7680 * the specified description is non-NULL, the provider is given the
7681 * opportunity to dynamically provide the specified probe, allowing providers
7682 * to support the creation of probes on-the-fly. (So-called _autocreated_
7683 * probes.) If the provider is NULL, the operations will be applied to all
7684 * providers; if the provider is non-NULL the operations will only be applied
7685 * to the specified provider. The dtrace_provider_lock must be held, and the
7686 * dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
7687 * will need to grab the dtrace_lock when it reenters the framework through
7688 * dtrace_probe_lookup(), dtrace_probe_create(), etc.
7689 */
7690static void
7691dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
7692{
7693#ifndef VBOX
7694 struct modctl *ctl;
7695#endif
7696 int all = 0;
7697
7698 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
7699
7700 if (prv == NULL) {
7701 all = 1;
7702 prv = dtrace_provider;
7703 }
7704
7705 do {
7706 /*
7707 * First, call the blanket provide operation.
7708 */
7709 prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
7710
7711#ifndef VBOX
7712 /*
7713 * Now call the per-module provide operation. We will grab
7714 * mod_lock to prevent the list from being modified. Note
7715 * that this also prevents the mod_busy bits from changing.
7716 * (mod_busy can only be changed with mod_lock held.)
7717 */
7718 mutex_enter(&mod_lock);
7719
7720 ctl = &modules;
7721 do {
7722 if (ctl->mod_busy || ctl->mod_mp == NULL)
7723 continue;
7724
7725 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
7726
7727 } while ((ctl = ctl->mod_next) != &modules);
7728
7729 mutex_exit(&mod_lock);
7730#endif
7731 } while (all && (prv = prv->dtpv_next) != NULL);
7732}
7733
7734/*
7735 * Iterate over each probe, and call the Framework-to-Provider API function
7736 * denoted by offs.
7737 */
7738static void
7739dtrace_probe_foreach(uintptr_t offs)
7740{
7741 dtrace_provider_t *prov;
7742 void (*func)(void *, dtrace_id_t, void *);
7743 dtrace_probe_t *probe;
7744 dtrace_icookie_t cookie;
7745 VBDTTYPE(uint32_t,int) i;
7746
7747 /*
7748 * We disable interrupts to walk through the probe array. This is
7749 * safe -- the dtrace_sync() in dtrace_unregister() assures that we
7750 * won't see stale data.
7751 */
7752 cookie = dtrace_interrupt_disable();
7753
7754 for (i = 0; i < dtrace_nprobes; i++) {
7755 if ((probe = dtrace_probes[i]) == NULL)
7756 continue;
7757
7758 if (probe->dtpr_ecb == NULL) {
7759 /*
7760 * This probe isn't enabled -- don't call the function.
7761 */
7762 continue;
7763 }
7764
7765 prov = probe->dtpr_provider;
7766 func = *((void(**)(void *, dtrace_id_t, void *))
7767 ((uintptr_t)&prov->dtpv_pops + offs));
7768
7769 func(prov->dtpv_arg, i + 1, probe->dtpr_arg);
7770 }
7771
7772 dtrace_interrupt_enable(cookie);
7773}
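
/*
 * The offs argument is the byte offset of the desired entry point
 * within dtrace_pops_t; suspending every enabled probe, for instance,
 * amounts to:
 *
 *	dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
 */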
7774
7775static int
7776dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab)
7777{
7778 dtrace_probekey_t pkey;
7779 uint32_t priv;
7780 uid_t uid;
7781 zoneid_t zoneid;
7782
7783 ASSERT(MUTEX_HELD(&dtrace_lock));
7784 dtrace_ecb_create_cache = NULL;
7785
7786 if (desc == NULL) {
7787 /*
7788 * If we're passed a NULL description, we're being asked to
7789 * create an ECB with a NULL probe.
7790 */
7791 (void) dtrace_ecb_create_enable(NULL, enab);
7792 return (0);
7793 }
7794
7795 dtrace_probekey(desc, &pkey);
7796 dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
7797 &priv, &uid, &zoneid);
7798
7799 return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable,
7800 enab));
7801}
7802
7803/*
7804 * DTrace Helper Provider Functions
7805 */
7806static void
7807dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)
7808{
7809 attr->dtat_name = DOF_ATTR_NAME(dofattr);
7810 attr->dtat_data = DOF_ATTR_DATA(dofattr);
7811 attr->dtat_class = DOF_ATTR_CLASS(dofattr);
7812}
7813
7814static void
7815dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
7816 const dof_provider_t *dofprov, char *strtab)
7817{
7818 hprov->dthpv_provname = strtab + dofprov->dofpv_name;
7819 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,
7820 dofprov->dofpv_provattr);
7821 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,
7822 dofprov->dofpv_modattr);
7823 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,
7824 dofprov->dofpv_funcattr);
7825 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,
7826 dofprov->dofpv_nameattr);
7827 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,
7828 dofprov->dofpv_argsattr);
7829}
7830
7831static void
7832dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
7833{
7834 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
7835 dof_hdr_t *dof = (dof_hdr_t *)daddr;
7836 dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
7837 dof_provider_t *provider;
7838 dof_probe_t *probe;
7839 uint32_t *off, *enoff;
7840 uint8_t *arg;
7841 char *strtab;
7842 uint_t i, nprobes;
7843 dtrace_helper_provdesc_t dhpv;
7844 dtrace_helper_probedesc_t dhpb;
7845 dtrace_meta_t *meta = dtrace_meta_pid;
7846 dtrace_mops_t *mops = &meta->dtm_mops;
7847 void *parg;
7848
7849 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
7850 str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7851 provider->dofpv_strtab * dof->dofh_secsize);
7852 prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7853 provider->dofpv_probes * dof->dofh_secsize);
7854 arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7855 provider->dofpv_prargs * dof->dofh_secsize);
7856 off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7857 provider->dofpv_proffs * dof->dofh_secsize);
7858
7859 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
7860 off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
7861 arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
7862 enoff = NULL;
7863
7864 /*
7865 * See dtrace_helper_provider_validate().
7866 */
7867 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
7868 provider->dofpv_prenoffs != DOF_SECT_NONE) {
7869 enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7870 provider->dofpv_prenoffs * dof->dofh_secsize);
7871 enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
7872 }
7873
7874 nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
7875
7876 /*
7877 * Create the provider.
7878 */
7879 dtrace_dofprov2hprov(&dhpv, provider, strtab);
7880
7881 if ((parg = mops->dtms_provide_pid(meta->dtm_arg, &dhpv, pid)) == NULL)
7882 return;
7883
7884 meta->dtm_count++;
7885
7886 /*
7887 * Create the probes.
7888 */
7889 for (i = 0; i < nprobes; i++) {
7890 probe = (dof_probe_t *)(uintptr_t)(daddr +
7891 prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
7892
7893 dhpb.dthpb_mod = dhp->dofhp_mod;
7894 dhpb.dthpb_func = strtab + probe->dofpr_func;
7895 dhpb.dthpb_name = strtab + probe->dofpr_name;
7896 dhpb.dthpb_base = probe->dofpr_addr;
7897 dhpb.dthpb_offs = off + probe->dofpr_offidx;
7898 dhpb.dthpb_noffs = probe->dofpr_noffs;
7899 if (enoff != NULL) {
7900 dhpb.dthpb_enoffs = enoff + probe->dofpr_enoffidx;
7901 dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
7902 } else {
7903 dhpb.dthpb_enoffs = NULL;
7904 dhpb.dthpb_nenoffs = 0;
7905 }
7906 dhpb.dthpb_args = arg + probe->dofpr_argidx;
7907 dhpb.dthpb_nargc = probe->dofpr_nargc;
7908 dhpb.dthpb_xargc = probe->dofpr_xargc;
7909 dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
7910 dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;
7911
7912 mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
7913 }
7914}
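
/*
 * All of the DOF parsing above is plain offset arithmetic against the
 * mapped image: section header i lives at
 * daddr + dofh_secoff + i * dofh_secsize, and a section's payload
 * begins at daddr + dofs_offset.  Probe j of the probe section, for
 * example, is reached as:
 *
 *	probe = (dof_probe_t *)(uintptr_t)(daddr +
 *	    prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
 */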
7915
7916static void
7917dtrace_helper_provide(dof_helper_t *dhp, pid_t pid)
7918{
7919 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
7920 dof_hdr_t *dof = (dof_hdr_t *)daddr;
7921 VBDTTYPE(uint32_t,int) i;
7922
7923 ASSERT(MUTEX_HELD(&dtrace_meta_lock));
7924
7925 for (i = 0; i < dof->dofh_secnum; i++) {
7926 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
7927 dof->dofh_secoff + i * dof->dofh_secsize);
7928
7929 if (sec->dofs_type != DOF_SECT_PROVIDER)
7930 continue;
7931
7932 dtrace_helper_provide_one(dhp, sec, pid);
7933 }
7934
7935 /*
7936 * We may have just created probes, so we must now rematch against
7937 * any retained enablings. Note that this call will acquire both
7938 * cpu_lock and dtrace_lock; the fact that we are holding
7939 * dtrace_meta_lock now is what defines the ordering with respect to
7940 * these three locks.
7941 */
7942 dtrace_enabling_matchall();
7943}
7944
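/*
 * The inverse of dtrace_helper_provide_one(): ask the meta provider to
 * remove its provider for this DOF section and pid, and drop the meta
 * provider's count of outstanding providers.
 */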
7945static void
7946dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
7947{
7948 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
7949 dof_hdr_t *dof = (dof_hdr_t *)daddr;
7950 dof_sec_t *str_sec;
7951 dof_provider_t *provider;
7952 char *strtab;
7953 dtrace_helper_provdesc_t dhpv;
7954 dtrace_meta_t *meta = dtrace_meta_pid;
7955 dtrace_mops_t *mops = &meta->dtm_mops;
7956
7957 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
7958 str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7959 provider->dofpv_strtab * dof->dofh_secsize);
7960
7961 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
7962
7963 /*
7964  * Recreate the provider description to identify what to remove.
7965 */
7966 dtrace_dofprov2hprov(&dhpv, provider, strtab);
7967
7968 mops->dtms_remove_pid(meta->dtm_arg, &dhpv, pid);
7969
7970 meta->dtm_count--;
7971}
7972
7973static void
7974dtrace_helper_provider_remove(dof_helper_t *dhp, pid_t pid)
7975{
7976 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
7977 dof_hdr_t *dof = (dof_hdr_t *)daddr;
7978 VBDTTYPE(uint32_t,int) i;
7979
7980 ASSERT(MUTEX_HELD(&dtrace_meta_lock));
7981
7982 for (i = 0; i < dof->dofh_secnum; i++) {
7983 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
7984 dof->dofh_secoff + i * dof->dofh_secsize);
7985
7986 if (sec->dofs_type != DOF_SECT_PROVIDER)
7987 continue;
7988
7989 dtrace_helper_provider_remove_one(dhp, sec, pid);
7990 }
7991}
7992
7993/*
7994 * DTrace Meta Provider-to-Framework API Functions
7995 *
7996 * These functions implement the Meta Provider-to-Framework API, as described
7997 * in <sys/dtrace.h>.
7998 */
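/*
 * A minimal registration sketch (hypothetical callback names, for
 * illustration only; the authoritative contract is in <sys/dtrace.h>):
 *
 *	static dtrace_mops_t my_mops;
 *	dtrace_meta_provider_id_t my_id;
 *
 *	my_mops.dtms_create_probe = my_create_probe;
 *	my_mops.dtms_provide_pid = my_provide_pid;
 *	my_mops.dtms_remove_pid = my_remove_pid;
 *
 *	if (dtrace_meta_register("my-meta", &my_mops, NULL, &my_id) != 0)
 *		return;		(bad name/ops, or a meta provider exists)
 *
 * Only one user-land meta provider may be registered at a time, and it
 * may only be unregistered once its provider count has dropped to zero.
 */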
7999int
8000dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
8001 dtrace_meta_provider_id_t *idp)
8002{
8003 dtrace_meta_t *meta;
8004 dtrace_helpers_t *help, *next;
8005 VBDTTYPE(uint32_t,int) i;
8006
8007 *idp = DTRACE_METAPROVNONE;
8008
8009 /*
8010 * We strictly don't need the name, but we hold onto it for
8011 * debuggability. All hail error queues!
8012 */
8013 if (name == NULL) {
8014 cmn_err(CE_WARN, "failed to register meta-provider: "
8015 "invalid name");
8016 return (EINVAL);
8017 }
8018
8019 if (mops == NULL ||
8020 mops->dtms_create_probe == NULL ||
8021 mops->dtms_provide_pid == NULL ||
8022 mops->dtms_remove_pid == NULL) {
8023 cmn_err(CE_WARN, "failed to register meta-provider %s: "
8024 "invalid ops", name);
8025 return (EINVAL);
8026 }
8027
8028 meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
8029 meta->dtm_mops = *mops;
8030 meta->dtm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
8031 (void) strcpy(meta->dtm_name, name);
8032 meta->dtm_arg = arg;
8033
8034 mutex_enter(&dtrace_meta_lock);
8035 mutex_enter(&dtrace_lock);
8036
8037 if (dtrace_meta_pid != NULL) {
8038 mutex_exit(&dtrace_lock);
8039 mutex_exit(&dtrace_meta_lock);
8040 cmn_err(CE_WARN, "failed to register meta-provider %s: "
8041 "user-land meta-provider exists", name);
8042 kmem_free(meta->dtm_name, strlen(meta->dtm_name) + 1);
8043 kmem_free(meta, sizeof (dtrace_meta_t));
8044 return (EINVAL);
8045 }
8046
8047 dtrace_meta_pid = meta;
8048 *idp = (dtrace_meta_provider_id_t)meta;
8049
8050 /*
8051 * If there are providers and probes ready to go, pass them
8052 * off to the new meta provider now.
8053 */
8054
8055 help = dtrace_deferred_pid;
8056 dtrace_deferred_pid = NULL;
8057
8058 mutex_exit(&dtrace_lock);
8059
8060 while (help != NULL) {
8061 for (i = 0; i < help->dthps_nprovs; i++) {
8062 dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
8063 help->dthps_pid);
8064 }
8065
8066 next = help->dthps_next;
8067 help->dthps_next = NULL;
8068 help->dthps_prev = NULL;
8069 help->dthps_deferred = 0;
8070 help = next;
8071 }
8072
8073 mutex_exit(&dtrace_meta_lock);
8074
8075 return (0);
8076}
8077
8078int
8079dtrace_meta_unregister(dtrace_meta_provider_id_t id)
8080{
8081 dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;
8082
8083 mutex_enter(&dtrace_meta_lock);
8084 mutex_enter(&dtrace_lock);
8085
8086 if (old == dtrace_meta_pid) {
8087 pp = &dtrace_meta_pid;
8088 } else {
8089 panic("attempt to unregister non-existent "
8090 "dtrace meta-provider %p\n", (void *)old);
8091#ifdef VBOX
8092 return (EINVAL);
8093#endif
8094 }
8095
8096 if (old->dtm_count != 0) {
8097 mutex_exit(&dtrace_lock);
8098 mutex_exit(&dtrace_meta_lock);
8099 return (EBUSY);
8100 }
8101
8102 *pp = NULL;
8103
8104 mutex_exit(&dtrace_lock);
8105 mutex_exit(&dtrace_meta_lock);
8106
8107 kmem_free(old->dtm_name, strlen(old->dtm_name) + 1);
8108 kmem_free(old, sizeof (dtrace_meta_t));
8109
8110 return (0);
8111}
8112
8113
8114/*
8115 * DTrace DIF Object Functions
8116 */
8117static int
8118dtrace_difo_err(uint_t pc, const char *format, ...)
8119{
8120 if (dtrace_err_verbose) {
8121 va_list alist;
8122
8123 (void) uprintf("dtrace DIF object error: [%u]: ", pc);
8124 va_start(alist, format);
8125 (void) vuprintf(format, alist);
8126 va_end(alist);
8127 }
8128
8129#ifdef DTRACE_ERRDEBUG
8130 dtrace_errdebug(format);
8131#endif
8132 return (1);
8133}
8134
8135/*
8136 * Validate a DTrace DIF object by checking the IR instructions. The following
8137 * rules are currently enforced by dtrace_difo_validate():
8138 *
8139 * 1. Each instruction must have a valid opcode
8140 * 2. Each register, string, variable, or subroutine reference must be valid
8141 * 3. No instruction can modify register %r0 (must be zero)
8142 * 4. All instruction reserved bits must be set to zero
8143 * 5. The last instruction must be a "ret" instruction
8144 * 6. All branch targets must reference a valid instruction _after_ the branch
8145 */
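/*
 * As an illustrative (not authoritative) example in DIF assembly
 * notation, this two-instruction DIFO satisfies all of the above:
 *
 *	setx	DIF_INTEGER[0], %r1	! load an integer constant
 *	ret	%r1			! rule 5: ends in "ret"
 *
 * whereas "mov %r1, %r0" would violate rule 3, and a branch whose target
 * is at or before the branch itself would violate rule 6.
 */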
8146static int
8147dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
8148 cred_t *cr)
8149{
8150#ifndef VBOX
8151 int err = 0, i;
8152#else
8153 int err = 0;
8154 uint_t i;
8155#endif
8156 int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
8157 int kcheckload;
8158 uint_t pc;
8159
8160 kcheckload = cr == NULL ||
8161 (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;
8162
8163 dp->dtdo_destructive = 0;
8164
8165 for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
8166 dif_instr_t instr = dp->dtdo_buf[pc];
8167
8168 uint_t r1 = DIF_INSTR_R1(instr);
8169 uint_t r2 = DIF_INSTR_R2(instr);
8170 uint_t rd = DIF_INSTR_RD(instr);
8171 uint_t rs = DIF_INSTR_RS(instr);
8172 uint_t label = DIF_INSTR_LABEL(instr);
8173 uint_t v = DIF_INSTR_VAR(instr);
8174 uint_t subr = DIF_INSTR_SUBR(instr);
8175 uint_t type = DIF_INSTR_TYPE(instr);
8176 uint_t op = DIF_INSTR_OP(instr);
8177
8178 switch (op) {
8179 case DIF_OP_OR:
8180 case DIF_OP_XOR:
8181 case DIF_OP_AND:
8182 case DIF_OP_SLL:
8183 case DIF_OP_SRL:
8184 case DIF_OP_SRA:
8185 case DIF_OP_SUB:
8186 case DIF_OP_ADD:
8187 case DIF_OP_MUL:
8188 case DIF_OP_SDIV:
8189 case DIF_OP_UDIV:
8190 case DIF_OP_SREM:
8191 case DIF_OP_UREM:
8192 case DIF_OP_COPYS:
8193 if (r1 >= nregs)
8194 err += efunc(pc, "invalid register %u\n", r1);
8195 if (r2 >= nregs)
8196 err += efunc(pc, "invalid register %u\n", r2);
8197 if (rd >= nregs)
8198 err += efunc(pc, "invalid register %u\n", rd);
8199 if (rd == 0)
8200 err += efunc(pc, "cannot write to %r0\n");
8201 break;
8202 case DIF_OP_NOT:
8203 case DIF_OP_MOV:
8204 case DIF_OP_ALLOCS:
8205 if (r1 >= nregs)
8206 err += efunc(pc, "invalid register %u\n", r1);
8207 if (r2 != 0)
8208 err += efunc(pc, "non-zero reserved bits\n");
8209 if (rd >= nregs)
8210 err += efunc(pc, "invalid register %u\n", rd);
8211 if (rd == 0)
8212 err += efunc(pc, "cannot write to %r0\n");
8213 break;
8214 case DIF_OP_LDSB:
8215 case DIF_OP_LDSH:
8216 case DIF_OP_LDSW:
8217 case DIF_OP_LDUB:
8218 case DIF_OP_LDUH:
8219 case DIF_OP_LDUW:
8220 case DIF_OP_LDX:
8221 if (r1 >= nregs)
8222 err += efunc(pc, "invalid register %u\n", r1);
8223 if (r2 != 0)
8224 err += efunc(pc, "non-zero reserved bits\n");
8225 if (rd >= nregs)
8226 err += efunc(pc, "invalid register %u\n", rd);
8227 if (rd == 0)
8228 err += efunc(pc, "cannot write to %r0\n");
8229 if (kcheckload)
8230 dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
8231 DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
8232 break;
8233 case DIF_OP_RLDSB:
8234 case DIF_OP_RLDSH:
8235 case DIF_OP_RLDSW:
8236 case DIF_OP_RLDUB:
8237 case DIF_OP_RLDUH:
8238 case DIF_OP_RLDUW:
8239 case DIF_OP_RLDX:
8240 if (r1 >= nregs)
8241 err += efunc(pc, "invalid register %u\n", r1);
8242 if (r2 != 0)
8243 err += efunc(pc, "non-zero reserved bits\n");
8244 if (rd >= nregs)
8245 err += efunc(pc, "invalid register %u\n", rd);
8246 if (rd == 0)
8247 err += efunc(pc, "cannot write to %r0\n");
8248 break;
8249 case DIF_OP_ULDSB:
8250 case DIF_OP_ULDSH:
8251 case DIF_OP_ULDSW:
8252 case DIF_OP_ULDUB:
8253 case DIF_OP_ULDUH:
8254 case DIF_OP_ULDUW:
8255 case DIF_OP_ULDX:
8256 if (r1 >= nregs)
8257 err += efunc(pc, "invalid register %u\n", r1);
8258 if (r2 != 0)
8259 err += efunc(pc, "non-zero reserved bits\n");
8260 if (rd >= nregs)
8261 err += efunc(pc, "invalid register %u\n", rd);
8262 if (rd == 0)
8263 err += efunc(pc, "cannot write to %r0\n");
8264 break;
8265 case DIF_OP_STB:
8266 case DIF_OP_STH:
8267 case DIF_OP_STW:
8268 case DIF_OP_STX:
8269 if (r1 >= nregs)
8270 err += efunc(pc, "invalid register %u\n", r1);
8271 if (r2 != 0)
8272 err += efunc(pc, "non-zero reserved bits\n");
8273 if (rd >= nregs)
8274 err += efunc(pc, "invalid register %u\n", rd);
8275 if (rd == 0)
8276 err += efunc(pc, "cannot write to 0 address\n");
8277 break;
8278 case DIF_OP_CMP:
8279 case DIF_OP_SCMP:
8280 if (r1 >= nregs)
8281 err += efunc(pc, "invalid register %u\n", r1);
8282 if (r2 >= nregs)
8283 err += efunc(pc, "invalid register %u\n", r2);
8284 if (rd != 0)
8285 err += efunc(pc, "non-zero reserved bits\n");
8286 break;
8287 case DIF_OP_TST:
8288 if (r1 >= nregs)
8289 err += efunc(pc, "invalid register %u\n", r1);
8290 if (r2 != 0 || rd != 0)
8291 err += efunc(pc, "non-zero reserved bits\n");
8292 break;
8293 case DIF_OP_BA:
8294 case DIF_OP_BE:
8295 case DIF_OP_BNE:
8296 case DIF_OP_BG:
8297 case DIF_OP_BGU:
8298 case DIF_OP_BGE:
8299 case DIF_OP_BGEU:
8300 case DIF_OP_BL:
8301 case DIF_OP_BLU:
8302 case DIF_OP_BLE:
8303 case DIF_OP_BLEU:
8304 if (label >= dp->dtdo_len) {
8305 err += efunc(pc, "invalid branch target %u\n",
8306 label);
8307 }
8308 if (label <= pc) {
8309 err += efunc(pc, "backward branch to %u\n",
8310 label);
8311 }
8312 break;
8313 case DIF_OP_RET:
8314 if (r1 != 0 || r2 != 0)
8315 err += efunc(pc, "non-zero reserved bits\n");
8316 if (rd >= nregs)
8317 err += efunc(pc, "invalid register %u\n", rd);
8318 break;
8319 case DIF_OP_NOP:
8320 case DIF_OP_POPTS:
8321 case DIF_OP_FLUSHTS:
8322 if (r1 != 0 || r2 != 0 || rd != 0)
8323 err += efunc(pc, "non-zero reserved bits\n");
8324 break;
8325 case DIF_OP_SETX:
8326 if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
8327 err += efunc(pc, "invalid integer ref %u\n",
8328 DIF_INSTR_INTEGER(instr));
8329 }
8330 if (rd >= nregs)
8331 err += efunc(pc, "invalid register %u\n", rd);
8332 if (rd == 0)
8333 err += efunc(pc, "cannot write to %r0\n");
8334 break;
8335 case DIF_OP_SETS:
8336 if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
8337 err += efunc(pc, "invalid string ref %u\n",
8338 DIF_INSTR_STRING(instr));
8339 }
8340 if (rd >= nregs)
8341 err += efunc(pc, "invalid register %u\n", rd);
8342 if (rd == 0)
8343 err += efunc(pc, "cannot write to %r0\n");
8344 break;
8345 case DIF_OP_LDGA:
8346 case DIF_OP_LDTA:
8347 if (r1 > DIF_VAR_ARRAY_MAX)
8348 err += efunc(pc, "invalid array %u\n", r1);
8349 if (r2 >= nregs)
8350 err += efunc(pc, "invalid register %u\n", r2);
8351 if (rd >= nregs)
8352 err += efunc(pc, "invalid register %u\n", rd);
8353 if (rd == 0)
8354 err += efunc(pc, "cannot write to %r0\n");
8355 break;
8356 case DIF_OP_LDGS:
8357 case DIF_OP_LDTS:
8358 case DIF_OP_LDLS:
8359 case DIF_OP_LDGAA:
8360 case DIF_OP_LDTAA:
8361 if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
8362 err += efunc(pc, "invalid variable %u\n", v);
8363 if (rd >= nregs)
8364 err += efunc(pc, "invalid register %u\n", rd);
8365 if (rd == 0)
8366 err += efunc(pc, "cannot write to %r0\n");
8367 break;
8368 case DIF_OP_STGS:
8369 case DIF_OP_STTS:
8370 case DIF_OP_STLS:
8371 case DIF_OP_STGAA:
8372 case DIF_OP_STTAA:
8373 if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
8374 err += efunc(pc, "invalid variable %u\n", v);
8375 if (rs >= nregs)
8376 err += efunc(pc, "invalid register %u\n", rs);
8377 break;
8378 case DIF_OP_CALL:
8379 if (subr > DIF_SUBR_MAX)
8380 err += efunc(pc, "invalid subr %u\n", subr);
8381 if (rd >= nregs)
8382 err += efunc(pc, "invalid register %u\n", rd);
8383 if (rd == 0)
8384 err += efunc(pc, "cannot write to %r0\n");
8385
8386 if (subr == DIF_SUBR_COPYOUT ||
8387 subr == DIF_SUBR_COPYOUTSTR) {
8388 dp->dtdo_destructive = 1;
8389 }
8390 break;
8391 case DIF_OP_PUSHTR:
8392 if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
8393 err += efunc(pc, "invalid ref type %u\n", type);
8394 if (r2 >= nregs)
8395 err += efunc(pc, "invalid register %u\n", r2);
8396 if (rs >= nregs)
8397 err += efunc(pc, "invalid register %u\n", rs);
8398 break;
8399 case DIF_OP_PUSHTV:
8400 if (type != DIF_TYPE_CTF)
8401 err += efunc(pc, "invalid val type %u\n", type);
8402 if (r2 >= nregs)
8403 err += efunc(pc, "invalid register %u\n", r2);
8404 if (rs >= nregs)
8405 err += efunc(pc, "invalid register %u\n", rs);
8406 break;
8407 default:
8408 err += efunc(pc, "invalid opcode %u\n",
8409 DIF_INSTR_OP(instr));
8410 }
8411 }
8412
8413 if (dp->dtdo_len != 0 &&
8414 DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) {
8415 err += efunc(dp->dtdo_len - 1,
8416 "expected 'ret' as last DIF instruction\n");
8417 }
8418
8419 if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF)) {
8420 /*
8421 * If we're not returning by reference, the size must be either
8422 * 0 or the size of one of the base types.
8423 */
8424 switch (dp->dtdo_rtype.dtdt_size) {
8425 case 0:
8426 case sizeof (uint8_t):
8427 case sizeof (uint16_t):
8428 case sizeof (uint32_t):
8429 case sizeof (uint64_t):
8430 break;
8431
8432 default:
8433 err += efunc(dp->dtdo_len - 1, "bad return size\n");
8434 }
8435 }
8436
8437 for (i = 0; i < dp->dtdo_varlen && err == 0; i++) {
8438 dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL;
8439 dtrace_diftype_t *vt, *et;
8440 uint_t id, ndx;
8441
8442 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&
8443 v->dtdv_scope != DIFV_SCOPE_THREAD &&
8444 v->dtdv_scope != DIFV_SCOPE_LOCAL) {
8445 err += efunc(i, "unrecognized variable scope %d\n",
8446 v->dtdv_scope);
8447 break;
8448 }
8449
8450 if (v->dtdv_kind != DIFV_KIND_ARRAY &&
8451 v->dtdv_kind != DIFV_KIND_SCALAR) {
8452 err += efunc(i, "unrecognized variable type %d\n",
8453 v->dtdv_kind);
8454 break;
8455 }
8456
8457 if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {
8458 err += efunc(i, "%d exceeds variable id limit\n", id);
8459 break;
8460 }
8461
8462 if (id < DIF_VAR_OTHER_UBASE)
8463 continue;
8464
8465 /*
8466 * For user-defined variables, we need to check that this
8467 * definition is identical to any previous definition that we
8468 * encountered.
8469 */
8470 ndx = id - DIF_VAR_OTHER_UBASE;
8471
8472 switch (v->dtdv_scope) {
8473 case DIFV_SCOPE_GLOBAL:
8474 if (VBDTCAST(int64_t)ndx < vstate->dtvs_nglobals) {
8475 dtrace_statvar_t *svar;
8476
8477 if ((svar = vstate->dtvs_globals[ndx]) != NULL)
8478 existing = &svar->dtsv_var;
8479 }
8480
8481 break;
8482
8483 case DIFV_SCOPE_THREAD:
8484 if (VBDTCAST(int64_t)ndx < vstate->dtvs_ntlocals)
8485 existing = &vstate->dtvs_tlocals[ndx];
8486 break;
8487
8488 case DIFV_SCOPE_LOCAL:
8489 if (VBDTCAST(int64_t)ndx < vstate->dtvs_nlocals) {
8490 dtrace_statvar_t *svar;
8491
8492 if ((svar = vstate->dtvs_locals[ndx]) != NULL)
8493 existing = &svar->dtsv_var;
8494 }
8495
8496 break;
8497 }
8498
8499 vt = &v->dtdv_type;
8500
8501 if (vt->dtdt_flags & DIF_TF_BYREF) {
8502 if (vt->dtdt_size == 0) {
8503 err += efunc(i, "zero-sized variable\n");
8504 break;
8505 }
8506
8507 if (v->dtdv_scope == DIFV_SCOPE_GLOBAL &&
8508 vt->dtdt_size > dtrace_global_maxsize) {
8509 err += efunc(i, "oversized by-ref global\n");
8510 break;
8511 }
8512 }
8513
8514 if (existing == NULL || existing->dtdv_id == 0)
8515 continue;
8516
8517 ASSERT(existing->dtdv_id == v->dtdv_id);
8518 ASSERT(existing->dtdv_scope == v->dtdv_scope);
8519
8520 if (existing->dtdv_kind != v->dtdv_kind)
8521 err += efunc(i, "%d changed variable kind\n", id);
8522
8523 et = &existing->dtdv_type;
8524
8525 if (vt->dtdt_flags != et->dtdt_flags) {
8526 err += efunc(i, "%d changed variable type flags\n", id);
8527 break;
8528 }
8529
8530 if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {
8531 err += efunc(i, "%d changed variable type size\n", id);
8532 break;
8533 }
8534 }
8535
8536 return (err);
8537}
8538
8539/*
8540  * Validate a DTrace DIF object that is to be used as a helper. Helpers
8541 * are much more constrained than normal DIFOs. Specifically, they may
8542 * not:
8543 *
8544 * 1. Make calls to subroutines other than copyin(), copyinstr() or
8545  *    miscellaneous string routines.
8546 * 2. Access DTrace variables other than the args[] array, and the
8547 * curthread, pid, ppid, tid, execname, zonename, uid and gid variables.
8548 * 3. Have thread-local variables.
8549 * 4. Have dynamic variables.
8550 */
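/*
 * Concretely: a helper may call copyinstr() or strjoin(), but a call to
 * e.g. copyout(), any load of a thread-local variable (DIF_OP_LDTS), or
 * any associative array access is rejected by the checks below.
 */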
8551static int
8552dtrace_difo_validate_helper(dtrace_difo_t *dp)
8553{
8554 int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
8555 int err = 0;
8556 uint_t pc;
8557
8558 for (pc = 0; pc < dp->dtdo_len; pc++) {
8559 dif_instr_t instr = dp->dtdo_buf[pc];
8560
8561 uint_t v = DIF_INSTR_VAR(instr);
8562 uint_t subr = DIF_INSTR_SUBR(instr);
8563 uint_t op = DIF_INSTR_OP(instr);
8564
8565 switch (op) {
8566 case DIF_OP_OR:
8567 case DIF_OP_XOR:
8568 case DIF_OP_AND:
8569 case DIF_OP_SLL:
8570 case DIF_OP_SRL:
8571 case DIF_OP_SRA:
8572 case DIF_OP_SUB:
8573 case DIF_OP_ADD:
8574 case DIF_OP_MUL:
8575 case DIF_OP_SDIV:
8576 case DIF_OP_UDIV:
8577 case DIF_OP_SREM:
8578 case DIF_OP_UREM:
8579 case DIF_OP_COPYS:
8580 case DIF_OP_NOT:
8581 case DIF_OP_MOV:
8582 case DIF_OP_RLDSB:
8583 case DIF_OP_RLDSH:
8584 case DIF_OP_RLDSW:
8585 case DIF_OP_RLDUB:
8586 case DIF_OP_RLDUH:
8587 case DIF_OP_RLDUW:
8588 case DIF_OP_RLDX:
8589 case DIF_OP_ULDSB:
8590 case DIF_OP_ULDSH:
8591 case DIF_OP_ULDSW:
8592 case DIF_OP_ULDUB:
8593 case DIF_OP_ULDUH:
8594 case DIF_OP_ULDUW:
8595 case DIF_OP_ULDX:
8596 case DIF_OP_STB:
8597 case DIF_OP_STH:
8598 case DIF_OP_STW:
8599 case DIF_OP_STX:
8600 case DIF_OP_ALLOCS:
8601 case DIF_OP_CMP:
8602 case DIF_OP_SCMP:
8603 case DIF_OP_TST:
8604 case DIF_OP_BA:
8605 case DIF_OP_BE:
8606 case DIF_OP_BNE:
8607 case DIF_OP_BG:
8608 case DIF_OP_BGU:
8609 case DIF_OP_BGE:
8610 case DIF_OP_BGEU:
8611 case DIF_OP_BL:
8612 case DIF_OP_BLU:
8613 case DIF_OP_BLE:
8614 case DIF_OP_BLEU:
8615 case DIF_OP_RET:
8616 case DIF_OP_NOP:
8617 case DIF_OP_POPTS:
8618 case DIF_OP_FLUSHTS:
8619 case DIF_OP_SETX:
8620 case DIF_OP_SETS:
8621 case DIF_OP_LDGA:
8622 case DIF_OP_LDLS:
8623 case DIF_OP_STGS:
8624 case DIF_OP_STLS:
8625 case DIF_OP_PUSHTR:
8626 case DIF_OP_PUSHTV:
8627 break;
8628
8629 case DIF_OP_LDGS:
8630 if (v >= DIF_VAR_OTHER_UBASE)
8631 break;
8632
8633 if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)
8634 break;
8635
8636 if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID ||
8637 v == DIF_VAR_PPID || v == DIF_VAR_TID ||
8638 v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME ||
8639 v == DIF_VAR_UID || v == DIF_VAR_GID)
8640 break;
8641
8642 err += efunc(pc, "illegal variable %u\n", v);
8643 break;
8644
8645 case DIF_OP_LDTA:
8646 case DIF_OP_LDTS:
8647 case DIF_OP_LDGAA:
8648 case DIF_OP_LDTAA:
8649 err += efunc(pc, "illegal dynamic variable load\n");
8650 break;
8651
8652 case DIF_OP_STTS:
8653 case DIF_OP_STGAA:
8654 case DIF_OP_STTAA:
8655 err += efunc(pc, "illegal dynamic variable store\n");
8656 break;
8657
8658 case DIF_OP_CALL:
8659 if (subr == DIF_SUBR_ALLOCA ||
8660 subr == DIF_SUBR_BCOPY ||
8661 subr == DIF_SUBR_COPYIN ||
8662 subr == DIF_SUBR_COPYINTO ||
8663 subr == DIF_SUBR_COPYINSTR ||
8664 subr == DIF_SUBR_INDEX ||
8665 subr == DIF_SUBR_INET_NTOA ||
8666 subr == DIF_SUBR_INET_NTOA6 ||
8667 subr == DIF_SUBR_INET_NTOP ||
8668 subr == DIF_SUBR_LLTOSTR ||
8669 subr == DIF_SUBR_RINDEX ||
8670 subr == DIF_SUBR_STRCHR ||
8671 subr == DIF_SUBR_STRJOIN ||
8672 subr == DIF_SUBR_STRRCHR ||
8673 subr == DIF_SUBR_STRSTR ||
8674 subr == DIF_SUBR_HTONS ||
8675 subr == DIF_SUBR_HTONL ||
8676 subr == DIF_SUBR_HTONLL ||
8677 subr == DIF_SUBR_NTOHS ||
8678 subr == DIF_SUBR_NTOHL ||
8679 subr == DIF_SUBR_NTOHLL)
8680 break;
8681
8682 err += efunc(pc, "invalid subr %u\n", subr);
8683 break;
8684
8685 default:
8686 err += efunc(pc, "invalid opcode %u\n",
8687 DIF_INSTR_OP(instr));
8688 }
8689 }
8690
8691 return (err);
8692}
8693
8694/*
8695 * Returns 1 if the expression in the DIF object can be cached on a per-thread
8696 * basis; 0 if not.
8697 */
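/*
 * E.g. a predicate like /pid == 1234/ is cacheable -- pid cannot change
 * over the life of a thread -- whereas any DIFO that loads from memory,
 * indexes an array or stores to a thread-local must be re-evaluated on
 * every firing.
 */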
8698static int
8699dtrace_difo_cacheable(dtrace_difo_t *dp)
8700{
8701 VBDTTYPE(uint_t,int) i;
8702
8703 if (dp == NULL)
8704 return (0);
8705
8706 for (i = 0; i < dp->dtdo_varlen; i++) {
8707 dtrace_difv_t *v = &dp->dtdo_vartab[i];
8708
8709 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
8710 continue;
8711
8712 switch (v->dtdv_id) {
8713 case DIF_VAR_CURTHREAD:
8714 case DIF_VAR_PID:
8715 case DIF_VAR_TID:
8716 case DIF_VAR_EXECNAME:
8717 case DIF_VAR_ZONENAME:
8718 break;
8719
8720 default:
8721 return (0);
8722 }
8723 }
8724
8725 /*
8726 * This DIF object may be cacheable. Now we need to look for any
8727 * array loading instructions, any memory loading instructions, or
8728 * any stores to thread-local variables.
8729 */
8730 for (i = 0; i < dp->dtdo_len; i++) {
8731 uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);
8732
8733 if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||
8734 (op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||
8735 (op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||
8736 op == DIF_OP_LDGA || op == DIF_OP_STTS)
8737 return (0);
8738 }
8739
8740 return (1);
8741}
8742
8743static void
8744dtrace_difo_hold(dtrace_difo_t *dp)
8745{
8746#ifndef VBOX
8747 VBDTTYPE(uint_t,int) i;
8748#endif
8749
8750 ASSERT(MUTEX_HELD(&dtrace_lock));
8751
8752 dp->dtdo_refcnt++;
8753 ASSERT(dp->dtdo_refcnt != 0);
8754
8755#ifndef VBOX
8756 /*
8757 * We need to check this DIF object for references to the variable
8758 * DIF_VAR_VTIMESTAMP.
8759 */
8760 for (i = 0; i < dp->dtdo_varlen; i++) {
8761 dtrace_difv_t *v = &dp->dtdo_vartab[i];
8762
8763 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
8764 continue;
8765
8766 if (dtrace_vtime_references++ == 0)
8767 dtrace_vtime_enable();
8768 }
8769#endif
8770}
8771
8772/*
8773 * This routine calculates the dynamic variable chunksize for a given DIF
8774 * object. The calculation is not fool-proof, and can probably be tricked by
8775 * malicious DIF -- but it works for all compiler-generated DIF. Because this
8776 * calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
8777 * if a dynamic variable size exceeds the chunksize.
8778 */
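/*
 * For example, a thread-local store (DIF_OP_STTS) contributes two
 * zero-sized keys, so its chunk works out to sizeof (dtrace_dynvar_t)
 * plus one additional dtrace_key_t plus the stored value's size, with
 * the total rounded up to an 8-byte boundary.
 */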
8779static void
8780dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
8781{
8782 uint64_t sval;
8783 dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
8784 const dif_instr_t *text = dp->dtdo_buf;
8785 uint_t pc, srd = 0;
8786 uint_t ttop = 0;
8787 size_t size, ksize;
8788 uint_t id, i;
8789
8790 for (pc = 0; pc < dp->dtdo_len; pc++) {
8791 dif_instr_t instr = text[pc];
8792 uint_t op = DIF_INSTR_OP(instr);
8793 uint_t rd = DIF_INSTR_RD(instr);
8794 uint_t r1 = DIF_INSTR_R1(instr);
8795 uint_t nkeys = 0;
8796 uchar_t scope;
8797
8798 dtrace_key_t *key = tupregs;
8799
8800 switch (op) {
8801 case DIF_OP_SETX:
8802 sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
8803 srd = rd;
8804 continue;
8805
8806 case DIF_OP_STTS:
8807 key = &tupregs[DIF_DTR_NREGS];
8808 key[0].dttk_size = 0;
8809 key[1].dttk_size = 0;
8810 nkeys = 2;
8811 scope = DIFV_SCOPE_THREAD;
8812 break;
8813
8814 case DIF_OP_STGAA:
8815 case DIF_OP_STTAA:
8816 nkeys = ttop;
8817
8818 if (DIF_INSTR_OP(instr) == DIF_OP_STTAA)
8819 key[nkeys++].dttk_size = 0;
8820
8821 key[nkeys++].dttk_size = 0;
8822
8823 if (op == DIF_OP_STTAA) {
8824 scope = DIFV_SCOPE_THREAD;
8825 } else {
8826 scope = DIFV_SCOPE_GLOBAL;
8827 }
8828
8829 break;
8830
8831 case DIF_OP_PUSHTR:
8832 if (ttop == DIF_DTR_NREGS)
8833 return;
8834
8835 if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {
8836 /*
8837 * If the register for the size of the "pushtr"
8838 * is %r0 (or the value is 0) and the type is
8839 * a string, we'll use the system-wide default
8840 * string size.
8841 */
8842 tupregs[ttop++].dttk_size =
8843 dtrace_strsize_default;
8844 } else {
8845 if (srd == 0)
8846 return;
8847
8848 tupregs[ttop++].dttk_size = sval;
8849 }
8850
8851 break;
8852
8853 case DIF_OP_PUSHTV:
8854 if (ttop == DIF_DTR_NREGS)
8855 return;
8856
8857 tupregs[ttop++].dttk_size = 0;
8858 break;
8859
8860 case DIF_OP_FLUSHTS:
8861 ttop = 0;
8862 break;
8863
8864 case DIF_OP_POPTS:
8865 if (ttop != 0)
8866 ttop--;
8867 break;
8868 }
8869
8870 sval = 0;
8871 srd = 0;
8872
8873 if (nkeys == 0)
8874 continue;
8875
8876 /*
8877 * We have a dynamic variable allocation; calculate its size.
8878 */
8879 for (ksize = 0, i = 0; i < nkeys; i++)
8880 ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
8881
8882 size = sizeof (dtrace_dynvar_t);
8883 size += sizeof (dtrace_key_t) * (nkeys - 1);
8884 size += ksize;
8885
8886 /*
8887 * Now we need to determine the size of the stored data.
8888 */
8889 id = DIF_INSTR_VAR(instr);
8890
8891 for (i = 0; i < dp->dtdo_varlen; i++) {
8892 dtrace_difv_t *v = &dp->dtdo_vartab[i];
8893
8894 if (v->dtdv_id == id && v->dtdv_scope == scope) {
8895 size += v->dtdv_type.dtdt_size;
8896 break;
8897 }
8898 }
8899
8900 if (i == dp->dtdo_varlen)
8901 return;
8902
8903 /*
8904 * We have the size. If this is larger than the chunk size
8905 * for our dynamic variable state, reset the chunk size.
8906 */
8907 size = P2ROUNDUP(size, sizeof (uint64_t));
8908
8909 if (size > vstate->dtvs_dynvars.dtds_chunksize)
8910 vstate->dtvs_dynvars.dtds_chunksize = size;
8911 }
8912}
8913
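/*
 * Bind a validated DIFO to a variable state: allocate (growing the
 * per-state arrays as needed) the global, local and thread-local
 * variable slots it references, update the dynamic variable chunksize,
 * and take a hold on the DIFO.
 */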
8914static void
8915dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
8916{
8917#ifndef VBOX
8918 int i, oldsvars, osz, nsz, otlocals, ntlocals;
8919#else
8920 int oldsvars, osz, nsz, otlocals, ntlocals;
8921 uint_t i;
8922#endif
8923 uint_t id;
8924
8925 ASSERT(MUTEX_HELD(&dtrace_lock));
8926 ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);
8927
8928 for (i = 0; i < dp->dtdo_varlen; i++) {
8929 dtrace_difv_t *v = &dp->dtdo_vartab[i];
8930 dtrace_statvar_t *svar, ***svarp;
8931 size_t dsize = 0;
8932 uint8_t scope = v->dtdv_scope;
8933 int *np;
8934
8935 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
8936 continue;
8937
8938 id -= DIF_VAR_OTHER_UBASE;
8939
8940 switch (scope) {
8941 case DIFV_SCOPE_THREAD:
8942 while (VBDTCAST(int64_t)id >= (otlocals = vstate->dtvs_ntlocals)) {
8943 dtrace_difv_t *tlocals;
8944
8945 if ((ntlocals = (otlocals << 1)) == 0)
8946 ntlocals = 1;
8947
8948 osz = otlocals * sizeof (dtrace_difv_t);
8949 nsz = ntlocals * sizeof (dtrace_difv_t);
8950
8951 tlocals = kmem_zalloc(nsz, KM_SLEEP);
8952
8953 if (osz != 0) {
8954 bcopy(vstate->dtvs_tlocals,
8955 tlocals, osz);
8956 kmem_free(vstate->dtvs_tlocals, osz);
8957 }
8958
8959 vstate->dtvs_tlocals = tlocals;
8960 vstate->dtvs_ntlocals = ntlocals;
8961 }
8962
8963 vstate->dtvs_tlocals[id] = *v;
8964 continue;
8965
8966 case DIFV_SCOPE_LOCAL:
8967 np = &vstate->dtvs_nlocals;
8968 svarp = &vstate->dtvs_locals;
8969
8970 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
8971 dsize = NCPU * (v->dtdv_type.dtdt_size +
8972 sizeof (uint64_t));
8973 else
8974 dsize = NCPU * sizeof (uint64_t);
8975
8976 break;
8977
8978 case DIFV_SCOPE_GLOBAL:
8979 np = &vstate->dtvs_nglobals;
8980 svarp = &vstate->dtvs_globals;
8981
8982 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
8983 dsize = v->dtdv_type.dtdt_size +
8984 sizeof (uint64_t);
8985
8986 break;
8987
8988 default:
8989#ifndef VBOX
8990 ASSERT(0);
8991#else
8992 AssertFatalMsgFailed(("%d\n", scope));
8993#endif
8994 }
8995
8996 while (VBDTCAST(int64_t)id >= (oldsvars = *np)) {
8997 dtrace_statvar_t **statics;
8998 int newsvars, oldsize, newsize;
8999
9000 if ((newsvars = (oldsvars << 1)) == 0)
9001 newsvars = 1;
9002
9003 oldsize = oldsvars * sizeof (dtrace_statvar_t *);
9004 newsize = newsvars * sizeof (dtrace_statvar_t *);
9005
9006 statics = kmem_zalloc(newsize, KM_SLEEP);
9007
9008 if (oldsize != 0) {
9009 bcopy(*svarp, statics, oldsize);
9010 kmem_free(*svarp, oldsize);
9011 }
9012
9013 *svarp = statics;
9014 *np = newsvars;
9015 }
9016
9017 if ((svar = (*svarp)[id]) == NULL) {
9018 svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
9019 svar->dtsv_var = *v;
9020
9021 if ((svar->dtsv_size = dsize) != 0) {
9022 svar->dtsv_data = (uint64_t)(uintptr_t)
9023 kmem_zalloc(dsize, KM_SLEEP);
9024 }
9025
9026 (*svarp)[id] = svar;
9027 }
9028
9029 svar->dtsv_refcnt++;
9030 }
9031
9032 dtrace_difo_chunksize(dp, vstate);
9033 dtrace_difo_hold(dp);
9034}
9035
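/*
 * Deep-copy a DIFO -- instruction buffer plus string, integer and
 * variable tables -- and bind the copy to the given variable state.
 */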
9036static dtrace_difo_t *
9037dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9038{
9039 dtrace_difo_t *new;
9040 size_t sz;
9041
9042 ASSERT(dp->dtdo_buf != NULL);
9043 ASSERT(dp->dtdo_refcnt != 0);
9044
9045 new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
9046
9047 ASSERT(dp->dtdo_buf != NULL);
9048 sz = dp->dtdo_len * sizeof (dif_instr_t);
9049 new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
9050 bcopy(dp->dtdo_buf, new->dtdo_buf, sz);
9051 new->dtdo_len = dp->dtdo_len;
9052
9053 if (dp->dtdo_strtab != NULL) {
9054 ASSERT(dp->dtdo_strlen != 0);
9055 new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
9056 bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);
9057 new->dtdo_strlen = dp->dtdo_strlen;
9058 }
9059
9060 if (dp->dtdo_inttab != NULL) {
9061 ASSERT(dp->dtdo_intlen != 0);
9062 sz = dp->dtdo_intlen * sizeof (uint64_t);
9063 new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
9064 bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);
9065 new->dtdo_intlen = dp->dtdo_intlen;
9066 }
9067
9068 if (dp->dtdo_vartab != NULL) {
9069 ASSERT(dp->dtdo_varlen != 0);
9070 sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
9071 new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
9072 bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);
9073 new->dtdo_varlen = dp->dtdo_varlen;
9074 }
9075
9076 dtrace_difo_init(new, vstate);
9077 return (new);
9078}
9079
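/*
 * Tear down a DIFO whose reference count has reached zero: release each
 * static variable slot it referenced (freeing the slot's storage when
 * that slot's own refcount drops to zero), then free the DIFO's tables.
 */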
9080static void
9081dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9082{
9083 VBDTTYPE(uint_t,int) i;
9084
9085 ASSERT(dp->dtdo_refcnt == 0);
9086
9087 for (i = 0; i < dp->dtdo_varlen; i++) {
9088 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9089 dtrace_statvar_t *svar, **svarp;
9090 uint_t id;
9091 uint8_t scope = v->dtdv_scope;
9092 int *np;
9093
9094 switch (scope) {
9095 case DIFV_SCOPE_THREAD:
9096 continue;
9097
9098 case DIFV_SCOPE_LOCAL:
9099 np = &vstate->dtvs_nlocals;
9100 svarp = vstate->dtvs_locals;
9101 break;
9102
9103 case DIFV_SCOPE_GLOBAL:
9104 np = &vstate->dtvs_nglobals;
9105 svarp = vstate->dtvs_globals;
9106 break;
9107
9108 default:
9109#ifndef VBOX
9110 ASSERT(0);
9111#else
9112 AssertFatalMsgFailed(("%d\n", scope));
9113#endif
9114 }
9115
9116 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
9117 continue;
9118
9119 id -= DIF_VAR_OTHER_UBASE;
9120 ASSERT(VBDTCAST(int64_t)id < *np);
9121
9122 svar = svarp[id];
9123 ASSERT(svar != NULL);
9124 ASSERT(svar->dtsv_refcnt > 0);
9125
9126 if (--svar->dtsv_refcnt > 0)
9127 continue;
9128
9129 if (svar->dtsv_size != 0) {
9130 ASSERT(svar->dtsv_data != NULL);
9131 kmem_free((void *)(uintptr_t)svar->dtsv_data,
9132 svar->dtsv_size);
9133 }
9134
9135 kmem_free(svar, sizeof (dtrace_statvar_t));
9136 svarp[id] = NULL;
9137 }
9138
9139 kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
9140 kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
9141 kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
9142 kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
9143
9144 kmem_free(dp, sizeof (dtrace_difo_t));
9145}
9146
9147static void
9148dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9149{
9150#ifndef VBOX
9151 VBDTTYPE(uint_t,int) i;
9152#endif
9153
9154 ASSERT(MUTEX_HELD(&dtrace_lock));
9155 ASSERT(dp->dtdo_refcnt != 0);
9156
9157#ifndef VBOX
9158 for (i = 0; i < dp->dtdo_varlen; i++) {
9159 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9160
9161 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
9162 continue;
9163
9164 ASSERT(dtrace_vtime_references > 0);
9165 if (--dtrace_vtime_references == 0)
9166 dtrace_vtime_disable();
9167 }
9168#endif
9169
9170 if (--dp->dtdo_refcnt == 0)
9171 dtrace_difo_destroy(dp, vstate);
9172}
9173
9174/*
9175 * DTrace Format Functions
9176 */
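/*
 * Intern the printf()-like format string 'str' in the state's format
 * table, returning its 1-based index (0 -- silently -- if the table is
 * full).  Slots vacated by dtrace_format_remove() are reused first.
 */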
9177static uint16_t
9178dtrace_format_add(dtrace_state_t *state, char *str)
9179{
9180 char *fmt, **new;
9181 uint16_t ndx, len = VBDTCAST(uint16_t)strlen(str) + 1;
9182
9183 fmt = kmem_zalloc(len, KM_SLEEP);
9184 bcopy(str, fmt, len);
9185
9186 for (ndx = 0; ndx < state->dts_nformats; ndx++) {
9187 if (state->dts_formats[ndx] == NULL) {
9188 state->dts_formats[ndx] = fmt;
9189 return (ndx + 1);
9190 }
9191 }
9192
9193 if (state->dts_nformats == USHRT_MAX) {
9194 /*
9195 * This is only likely if a denial-of-service attack is being
9196 * attempted. As such, it's okay to fail silently here.
9197 */
9198 kmem_free(fmt, len);
9199 return (0);
9200 }
9201
9202 /*
9203 * For simplicity, we always resize the formats array to be exactly the
9204 * number of formats.
9205 */
9206 ndx = state->dts_nformats++;
9207 new = kmem_alloc((ndx + 1) * sizeof (char *), KM_SLEEP);
9208
9209 if (state->dts_formats != NULL) {
9210 ASSERT(ndx != 0);
9211 bcopy(state->dts_formats, new, ndx * sizeof (char *));
9212 kmem_free(state->dts_formats, ndx * sizeof (char *));
9213 }
9214
9215 state->dts_formats = new;
9216 state->dts_formats[ndx] = fmt;
9217
9218 return (ndx + 1);
9219}
9220
9221static void
9222dtrace_format_remove(dtrace_state_t *state, uint16_t format)
9223{
9224 char *fmt;
9225
9226 ASSERT(state->dts_formats != NULL);
9227 ASSERT(format <= state->dts_nformats);
9228 ASSERT(state->dts_formats[format - 1] != NULL);
9229
9230 fmt = state->dts_formats[format - 1];
9231 kmem_free(fmt, strlen(fmt) + 1);
9232 state->dts_formats[format - 1] = NULL;
9233}
9234
9235static void
9236dtrace_format_destroy(dtrace_state_t *state)
9237{
9238 int i;
9239
9240 if (state->dts_nformats == 0) {
9241 ASSERT(state->dts_formats == NULL);
9242 return;
9243 }
9244
9245 ASSERT(state->dts_formats != NULL);
9246
9247 for (i = 0; i < state->dts_nformats; i++) {
9248 char *fmt = state->dts_formats[i];
9249
9250 if (fmt == NULL)
9251 continue;
9252
9253 kmem_free(fmt, strlen(fmt) + 1);
9254 }
9255
9256 kmem_free(state->dts_formats, state->dts_nformats * sizeof (char *));
9257 state->dts_nformats = 0;
9258 state->dts_formats = NULL;
9259}
9260
9261/*
9262 * DTrace Predicate Functions
9263 */
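/*
 * Predicates are reference-counted wrappers around a DIFO.  If the DIFO
 * is cacheable (see dtrace_difo_cacheable() above), the predicate is
 * assigned a unique cache ID so that a thread may cache its most recent
 * evaluation and avoid re-evaluating it.
 */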
9264static dtrace_predicate_t *
9265dtrace_predicate_create(dtrace_difo_t *dp)
9266{
9267 dtrace_predicate_t *pred;
9268
9269 ASSERT(MUTEX_HELD(&dtrace_lock));
9270 ASSERT(dp->dtdo_refcnt != 0);
9271
9272 pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
9273 pred->dtp_difo = dp;
9274 pred->dtp_refcnt = 1;
9275
9276 if (!dtrace_difo_cacheable(dp))
9277 return (pred);
9278
9279 if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
9280 /*
9281 * This is only theoretically possible -- we have had 2^32
9282 * cacheable predicates on this machine. We cannot allow any
9283 * more predicates to become cacheable: as unlikely as it is,
9284 * there may be a thread caching a (now stale) predicate cache
9285 * ID. (N.B.: the temptation is being successfully resisted to
9286 * have this cmn_err() "Holy shit -- we executed this code!")
9287 */
9288 return (pred);
9289 }
9290
9291 pred->dtp_cacheid = dtrace_predcache_id++;
9292
9293 return (pred);
9294}
9295
9296static void
9297dtrace_predicate_hold(dtrace_predicate_t *pred)
9298{
9299 ASSERT(MUTEX_HELD(&dtrace_lock));
9300 ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
9301 ASSERT(pred->dtp_refcnt > 0);
9302
9303 pred->dtp_refcnt++;
9304}
9305
9306static void
9307dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
9308{
9309 dtrace_difo_t *dp = pred->dtp_difo;
9310
9311 ASSERT(MUTEX_HELD(&dtrace_lock));
9312 ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
9313 ASSERT(pred->dtp_refcnt > 0);
9314
9315 if (--pred->dtp_refcnt == 0) {
9316 dtrace_difo_release(pred->dtp_difo, vstate);
9317 kmem_free(pred, sizeof (dtrace_predicate_t));
9318 }
9319}
9320
9321/*
9322 * DTrace Action Description Functions
9323 */
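/*
 * Action descriptions are reference-counted templates for actions;
 * printf()-like kinds carry their format string in dtad_arg, which is
 * freed on final release.
 */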
9324static dtrace_actdesc_t *
9325dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
9326 uint64_t uarg, uint64_t arg)
9327{
9328 dtrace_actdesc_t *act;
9329
9330 ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != 0 &&
9331 arg >= KERNELBASE) || (arg == 0 && kind == DTRACEACT_PRINTA));
9332
9333 act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
9334 act->dtad_kind = kind;
9335 act->dtad_ntuple = ntuple;
9336 act->dtad_uarg = uarg;
9337 act->dtad_arg = arg;
9338 act->dtad_refcnt = 1;
9339
9340 return (act);
9341}
9342
9343static void
9344dtrace_actdesc_hold(dtrace_actdesc_t *act)
9345{
9346 ASSERT(act->dtad_refcnt >= 1);
9347 act->dtad_refcnt++;
9348}
9349
9350static void
9351dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)
9352{
9353 dtrace_actkind_t kind = act->dtad_kind;
9354 dtrace_difo_t *dp;
9355
9356 ASSERT(act->dtad_refcnt >= 1);
9357
9358 if (--act->dtad_refcnt != 0)
9359 return;
9360
9361 if ((dp = act->dtad_difo) != NULL)
9362 dtrace_difo_release(dp, vstate);
9363
9364 if (DTRACEACT_ISPRINTFLIKE(kind)) {
9365 char *str = (char *)(uintptr_t)act->dtad_arg;
9366
9367 ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
9368 (str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
9369
9370 if (str != NULL)
9371 kmem_free(str, strlen(str) + 1);
9372 }
9373
9374 kmem_free(act, sizeof (dtrace_actdesc_t));
9375}
9376
9377/*
9378 * DTrace ECB Functions
9379 */
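/*
 * An ECB (Enabling Control Block) binds one enabled probe to a consumer
 * state.  dtrace_ecb_add() assigns each ECB an EPID and grows the
 * state's dts_ecbs array as needed; because probe context indexes that
 * array, an active state must dtrace_sync() before the old array is
 * freed.
 */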
9380static dtrace_ecb_t *
9381dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
9382{
9383 dtrace_ecb_t *ecb;
9384 dtrace_epid_t epid;
9385
9386 ASSERT(MUTEX_HELD(&dtrace_lock));
9387
9388 ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
9389 ecb->dte_predicate = NULL;
9390 ecb->dte_probe = probe;
9391
9392 /*
9393 * The default size is the size of the default action: recording
9394 * the epid.
9395 */
9396 ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
9397 ecb->dte_alignment = sizeof (dtrace_epid_t);
9398
9399 epid = state->dts_epid++;
9400
9401 if (VBDTCAST(int64_t)epid - 1 >= state->dts_necbs) {
9402 dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;
9403 int necbs = state->dts_necbs << 1;
9404
9405 ASSERT(epid == VBDTCAST(dtrace_epid_t)state->dts_necbs + 1);
9406
9407 if (necbs == 0) {
9408 ASSERT(oecbs == NULL);
9409 necbs = 1;
9410 }
9411
9412 ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);
9413
9414 if (oecbs != NULL)
9415 bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));
9416
9417 dtrace_membar_producer();
9418 state->dts_ecbs = ecbs;
9419
9420 if (oecbs != NULL) {
9421 /*
9422 * If this state is active, we must dtrace_sync()
9423 * before we can free the old dts_ecbs array: we're
9424 * coming in hot, and there may be active ring
9425 * buffer processing (which indexes into the dts_ecbs
9426 * array) on another CPU.
9427 */
9428 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
9429 dtrace_sync();
9430
9431 kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
9432 }
9433
9434 dtrace_membar_producer();
9435 state->dts_necbs = necbs;
9436 }
9437
9438 ecb->dte_state = state;
9439
9440 ASSERT(state->dts_ecbs[epid - 1] == NULL);
9441 dtrace_membar_producer();
9442 state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;
9443
9444 return (ecb);
9445}
9446
9447static int
9448dtrace_ecb_enable(dtrace_ecb_t *ecb)
9449{
9450 dtrace_probe_t *probe = ecb->dte_probe;
9451
9452 ASSERT(MUTEX_HELD(&cpu_lock));
9453 ASSERT(MUTEX_HELD(&dtrace_lock));
9454 ASSERT(ecb->dte_next == NULL);
9455
9456 if (probe == NULL) {
9457 /*
9458 * This is the NULL probe -- there's nothing to do.
9459 */
9460 return (0);
9461 }
9462
9463 if (probe->dtpr_ecb == NULL) {
9464 dtrace_provider_t *prov = probe->dtpr_provider;
9465
9466 /*
9467 * We're the first ECB on this probe.
9468 */
9469 probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;
9470
9471 if (ecb->dte_predicate != NULL)
9472 probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;
9473
9474 return (prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
9475 probe->dtpr_id, probe->dtpr_arg));
9476 } else {
9477 /*
9478 * This probe is already active. Swing the last pointer to
9479 * point to the new ECB, and issue a dtrace_sync() to assure
9480 * that all CPUs have seen the change.
9481 */
9482 ASSERT(probe->dtpr_ecb_last != NULL);
9483 probe->dtpr_ecb_last->dte_next = ecb;
9484 probe->dtpr_ecb_last = ecb;
9485 probe->dtpr_predcache = 0;
9486
9487 dtrace_sync();
9488 return (0);
9489 }
9490}
9491
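/*
 * Recompute the ECB's record layout: walk the action list assigning each
 * record its buffer offset (honoring per-record alignment, and placing
 * tuple members so that the aggregation ID lands at offset 4 within an
 * 8-byte aligned block), then round dte_size and dte_needed up to
 * EPID-sized multiples.
 */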
9492static void
9493dtrace_ecb_resize(dtrace_ecb_t *ecb)
9494{
9495 uint32_t maxalign = sizeof (dtrace_epid_t);
9496 uint32_t align = sizeof (uint8_t), offs, diff;
9497 dtrace_action_t *act;
9498 int wastuple = 0;
9499 uint32_t aggbase = UINT32_MAX;
9500 dtrace_state_t *state = ecb->dte_state;
9501
9502 /*
9503 * If we record anything, we always record the epid. (And we always
9504 * record it first.)
9505 */
9506 offs = sizeof (dtrace_epid_t);
9507 ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
9508
9509 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
9510 dtrace_recdesc_t *rec = &act->dta_rec;
9511
9512 if ((align = rec->dtrd_alignment) > maxalign)
9513 maxalign = align;
9514
9515 if (!wastuple && act->dta_intuple) {
9516 /*
9517 * This is the first record in a tuple. Align the
9518 * offset to be at offset 4 in an 8-byte aligned
9519 * block.
9520 */
9521 diff = offs + sizeof (dtrace_aggid_t);
9522
9523 if ((diff = (diff & (sizeof (uint64_t) - 1))))
9524 offs += sizeof (uint64_t) - diff;
9525
9526 aggbase = offs - sizeof (dtrace_aggid_t);
9527 ASSERT(!(aggbase & (sizeof (uint64_t) - 1)));
9528 }
9529
9530 /*LINTED*/
9531 if (rec->dtrd_size != 0 && (diff = (offs & (align - 1)))) {
9532 /*
9533 * The current offset is not properly aligned; align it.
9534 */
9535 offs += align - diff;
9536 }
9537
9538 rec->dtrd_offset = offs;
9539
9540 if (offs + rec->dtrd_size > ecb->dte_needed) {
9541 ecb->dte_needed = offs + rec->dtrd_size;
9542
9543 if (ecb->dte_needed > state->dts_needed)
9544 state->dts_needed = ecb->dte_needed;
9545 }
9546
9547 if (DTRACEACT_ISAGG(act->dta_kind)) {
9548 dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
9549 dtrace_action_t *first = agg->dtag_first, *prev;
9550
9551 ASSERT(rec->dtrd_size != 0 && first != NULL);
9552 ASSERT(wastuple);
9553 ASSERT(aggbase != UINT32_MAX);
9554
9555 agg->dtag_base = aggbase;
9556
9557 while ((prev = first->dta_prev) != NULL &&
9558 DTRACEACT_ISAGG(prev->dta_kind)) {
9559 agg = (dtrace_aggregation_t *)prev;
9560 first = agg->dtag_first;
9561 }
9562
9563 if (prev != NULL) {
9564 offs = prev->dta_rec.dtrd_offset +
9565 prev->dta_rec.dtrd_size;
9566 } else {
9567 offs = sizeof (dtrace_epid_t);
9568 }
9569 wastuple = 0;
9570 } else {
9571 if (!act->dta_intuple)
9572 ecb->dte_size = offs + rec->dtrd_size;
9573
9574 offs += rec->dtrd_size;
9575 }
9576
9577 wastuple = act->dta_intuple;
9578 }
9579
9580 if ((act = ecb->dte_action) != NULL &&
9581 !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
9582 ecb->dte_size == sizeof (dtrace_epid_t)) {
9583 /*
9584 * If the size is still sizeof (dtrace_epid_t), then all
9585 * actions store no data; set the size to 0.
9586 */
9587 ecb->dte_alignment = maxalign;
9588 ecb->dte_size = 0;
9589
9590 /*
9591 * If the needed space is still sizeof (dtrace_epid_t), then
9592 * all actions need no additional space; set the needed
9593 * size to 0.
9594 */
9595 if (ecb->dte_needed == sizeof (dtrace_epid_t))
9596 ecb->dte_needed = 0;
9597
9598 return;
9599 }
9600
9601 /*
9602 * Set our alignment, and make sure that the dte_size and dte_needed
9603 * are aligned to the size of an EPID.
9604 */
9605 ecb->dte_alignment = maxalign;
9606 ecb->dte_size = (ecb->dte_size + (sizeof (dtrace_epid_t) - 1)) &
9607 ~(sizeof (dtrace_epid_t) - 1);
9608 ecb->dte_needed = (ecb->dte_needed + (sizeof (dtrace_epid_t) - 1)) &
9609 ~(sizeof (dtrace_epid_t) - 1);
9610 ASSERT(ecb->dte_size <= ecb->dte_needed);
9611}
9612
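/*
 * Create the aggregating action described by 'desc'.  The interim buffer
 * size is aggregation-specific: min(), max(), count() and sum() need a
 * single uint64_t, avg() two (count and total), stddev() four, and
 * quantize() one bucket per power of two of each sign plus zero -- 127
 * uint64_t's in all.
 */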
9613static dtrace_action_t *
9614dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
9615{
9616 dtrace_aggregation_t *agg;
9617 size_t size = sizeof (uint64_t);
9618 int ntuple = desc->dtad_ntuple;
9619 dtrace_action_t *act;
9620 dtrace_recdesc_t *frec;
9621 dtrace_aggid_t aggid;
9622 dtrace_state_t *state = ecb->dte_state;
9623
9624 agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
9625 agg->dtag_ecb = ecb;
9626
9627 ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));
9628
9629 switch (desc->dtad_kind) {
9630 case DTRACEAGG_MIN:
9631 agg->dtag_initial = INT64_MAX;
9632 agg->dtag_aggregate = dtrace_aggregate_min;
9633 break;
9634
9635 case DTRACEAGG_MAX:
9636 agg->dtag_initial = INT64_MIN;
9637 agg->dtag_aggregate = dtrace_aggregate_max;
9638 break;
9639
9640 case DTRACEAGG_COUNT:
9641 agg->dtag_aggregate = dtrace_aggregate_count;
9642 break;
9643
9644 case DTRACEAGG_QUANTIZE:
9645 agg->dtag_aggregate = dtrace_aggregate_quantize;
9646 size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *
9647 sizeof (uint64_t);
9648 break;
9649
9650 case DTRACEAGG_LQUANTIZE: {
9651 uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
9652 uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);
9653
9654 agg->dtag_initial = desc->dtad_arg;
9655 agg->dtag_aggregate = dtrace_aggregate_lquantize;
9656
9657 if (step == 0 || levels == 0)
9658 goto err;
9659
9660 size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);
9661 break;
9662 }
9663
9664 case DTRACEAGG_AVG:
9665 agg->dtag_aggregate = dtrace_aggregate_avg;
9666 size = sizeof (uint64_t) * 2;
9667 break;
9668
9669 case DTRACEAGG_STDDEV:
9670 agg->dtag_aggregate = dtrace_aggregate_stddev;
9671 size = sizeof (uint64_t) * 4;
9672 break;
9673
9674 case DTRACEAGG_SUM:
9675 agg->dtag_aggregate = dtrace_aggregate_sum;
9676 break;
9677
9678 default:
9679 goto err;
9680 }
9681
9682 agg->dtag_action.dta_rec.dtrd_size = VBDTCAST(uint32_t)size;
9683
9684 if (ntuple == 0)
9685 goto err;
9686
9687 /*
9688 * We must make sure that we have enough actions for the n-tuple.
9689 */
9690 for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
9691 if (DTRACEACT_ISAGG(act->dta_kind))
9692 break;
9693
9694 if (--ntuple == 0) {
9695 /*
9696 * This is the action with which our n-tuple begins.
9697 */
9698 agg->dtag_first = act;
9699 goto success;
9700 }
9701 }
9702
9703 /*
9704 * This n-tuple is short by ntuple elements. Return failure.
9705 */
9706 ASSERT(ntuple != 0);
9707err:
9708 kmem_free(agg, sizeof (dtrace_aggregation_t));
9709 return (NULL);
9710
9711success:
9712 /*
9713 * If the last action in the tuple has a size of zero, it's actually
9714 * an expression argument for the aggregating action.
9715 */
9716 ASSERT(ecb->dte_action_last != NULL);
9717 act = ecb->dte_action_last;
9718
9719 if (act->dta_kind == DTRACEACT_DIFEXPR) {
9720 ASSERT(act->dta_difo != NULL);
9721
9722 if (act->dta_difo->dtdo_rtype.dtdt_size == 0)
9723 agg->dtag_hasarg = 1;
9724 }
9725
9726 /*
9727 * We need to allocate an id for this aggregation.
9728 */
9729 aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
9730 VM_BESTFIT | VM_SLEEP);
9731
9732 if (VBDTCAST(int64_t)aggid - 1 >= state->dts_naggregations) {
9733 dtrace_aggregation_t **oaggs = state->dts_aggregations;
9734 dtrace_aggregation_t **aggs;
9735 int naggs = state->dts_naggregations << 1;
9736 int onaggs = state->dts_naggregations;
9737
9738 ASSERT(aggid == VBDTCAST(dtrace_aggid_t)state->dts_naggregations + 1);
9739
9740 if (naggs == 0) {
9741 ASSERT(oaggs == NULL);
9742 naggs = 1;
9743 }
9744
9745 aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);
9746
9747 if (oaggs != NULL) {
9748 bcopy(oaggs, aggs, onaggs * sizeof (*aggs));
9749 kmem_free(oaggs, onaggs * sizeof (*aggs));
9750 }
9751
9752 state->dts_aggregations = aggs;
9753 state->dts_naggregations = naggs;
9754 }
9755
9756 ASSERT(state->dts_aggregations[aggid - 1] == NULL);
9757 state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;
9758
9759 frec = &agg->dtag_first->dta_rec;
9760 if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
9761 frec->dtrd_alignment = sizeof (dtrace_aggid_t);
9762
9763 for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
9764 ASSERT(!act->dta_intuple);
9765 act->dta_intuple = 1;
9766 }
9767
9768 return (&agg->dtag_action);
9769}
9770
9771static void
9772dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act)
9773{
9774 dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
9775 dtrace_state_t *state = ecb->dte_state;
9776 dtrace_aggid_t aggid = agg->dtag_id;
9777
9778 ASSERT(DTRACEACT_ISAGG(act->dta_kind));
9779 vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
9780
9781 ASSERT(state->dts_aggregations[aggid - 1] == agg);
9782 state->dts_aggregations[aggid - 1] = NULL;
9783
9784 kmem_free(agg, sizeof (dtrace_aggregation_t));
9785}
9786
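/*
 * Append the action described by 'desc' to the ECB, computing its record
 * size, alignment and (for printf()-like actions) format index.  EINVAL
 * is returned for ill-formed chains -- e.g. a data-recording action or a
 * speculate on the same chain as a commit.
 */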
9787static int
9788dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
9789{
9790 dtrace_action_t *action, *last;
9791 dtrace_difo_t *dp = desc->dtad_difo;
9792 uint32_t size = 0, align = sizeof (uint8_t), mask;
9793 uint16_t format = 0;
9794 dtrace_recdesc_t *rec;
9795 dtrace_state_t *state = ecb->dte_state;
9796 dtrace_optval_t *opt = state->dts_options, nframes VBDTUNASS(0), strsize;
9797 uint64_t arg = desc->dtad_arg;
9798
9799 ASSERT(MUTEX_HELD(&dtrace_lock));
9800 ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);
9801
9802 if (DTRACEACT_ISAGG(desc->dtad_kind)) {
9803 /*
9804 * If this is an aggregating action, there must be neither
9805 * a speculate nor a commit on the action chain.
9806 */
9807 dtrace_action_t *act;
9808
9809 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
9810 if (act->dta_kind == DTRACEACT_COMMIT)
9811 return (EINVAL);
9812
9813 if (act->dta_kind == DTRACEACT_SPECULATE)
9814 return (EINVAL);
9815 }
9816
9817 action = dtrace_ecb_aggregation_create(ecb, desc);
9818
9819 if (action == NULL)
9820 return (EINVAL);
9821 } else {
9822 if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||
9823 (desc->dtad_kind == DTRACEACT_DIFEXPR &&
9824 dp != NULL && dp->dtdo_destructive)) {
9825 state->dts_destructive = 1;
9826 }
9827
9828 switch (desc->dtad_kind) {
9829 case DTRACEACT_PRINTF:
9830 case DTRACEACT_PRINTA:
9831 case DTRACEACT_SYSTEM:
9832 case DTRACEACT_FREOPEN:
9833 /*
9834 * We know that our arg is a string -- turn it into a
9835 * format.
9836 */
9837 if (arg == 0) {
9838 ASSERT(desc->dtad_kind == DTRACEACT_PRINTA);
9839 format = 0;
9840 } else {
9841 ASSERT(arg != 0);
9842 ASSERT(arg > KERNELBASE);
9843 format = dtrace_format_add(state,
9844 (char *)(uintptr_t)arg);
9845 }
9846
9847 /*FALLTHROUGH*/
9848 case DTRACEACT_LIBACT:
9849 case DTRACEACT_DIFEXPR:
9850 if (dp == NULL)
9851 return (EINVAL);
9852
9853 if ((size = dp->dtdo_rtype.dtdt_size) != 0)
9854 break;
9855
9856 if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
9857 if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
9858 return (EINVAL);
9859
9860 size = opt[DTRACEOPT_STRSIZE];
9861 }
9862
9863 break;
9864
9865 case DTRACEACT_STACK:
9866 if ((nframes = arg) == 0) {
9867 nframes = opt[DTRACEOPT_STACKFRAMES];
9868 ASSERT(nframes > 0);
9869 arg = nframes;
9870 }
9871
9872 size = VBDTCAST(uint32_t)(nframes * sizeof (pc_t));
9873 break;
9874
9875 case DTRACEACT_JSTACK:
9876 if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)
9877 strsize = opt[DTRACEOPT_JSTACKSTRSIZE];
9878
9879 if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)
9880 nframes = opt[DTRACEOPT_JSTACKFRAMES];
9881
9882 arg = DTRACE_USTACK_ARG(nframes, strsize);
9883
9884 /*FALLTHROUGH*/
9885 case DTRACEACT_USTACK:
9886 if (desc->dtad_kind != DTRACEACT_JSTACK &&
9887 (nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {
9888 strsize = DTRACE_USTACK_STRSIZE(arg);
9889 nframes = opt[DTRACEOPT_USTACKFRAMES];
9890 ASSERT(nframes > 0);
9891 arg = DTRACE_USTACK_ARG(nframes, strsize);
9892 }
9893
9894 /*
9895 * Save a slot for the pid.
9896 */
9897 size = VBDTCAST(uint32_t)((nframes + 1) * sizeof (uint64_t));
9898 size += DTRACE_USTACK_STRSIZE(arg);
9899 size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));
9900
9901 break;
9902
9903 case DTRACEACT_SYM:
9904 case DTRACEACT_MOD:
9905 if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=
9906 sizeof (uint64_t)) ||
9907 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
9908 return (EINVAL);
9909 break;
9910
9911 case DTRACEACT_USYM:
9912 case DTRACEACT_UMOD:
9913 case DTRACEACT_UADDR:
9914 if (dp == NULL ||
9915 (dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||
9916 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
9917 return (EINVAL);
9918
9919 /*
9920 * We have a slot for the pid, plus a slot for the
9921 * argument. To keep things simple (aligned with
9922 * bitness-neutral sizing), we store each as a 64-bit
9923 * quantity.
9924 */
9925 size = 2 * sizeof (uint64_t);
9926 break;
9927
9928 case DTRACEACT_STOP:
9929 case DTRACEACT_BREAKPOINT:
9930 case DTRACEACT_PANIC:
9931 break;
9932
9933 case DTRACEACT_CHILL:
9934 case DTRACEACT_DISCARD:
9935 case DTRACEACT_RAISE:
9936 if (dp == NULL)
9937 return (EINVAL);
9938 break;
9939
9940 case DTRACEACT_EXIT:
9941 if (dp == NULL ||
9942 (size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||
9943 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
9944 return (EINVAL);
9945 break;
9946
9947 case DTRACEACT_SPECULATE:
9948 if (ecb->dte_size > sizeof (dtrace_epid_t))
9949 return (EINVAL);
9950
9951 if (dp == NULL)
9952 return (EINVAL);
9953
9954 state->dts_speculates = 1;
9955 break;
9956
9957 case DTRACEACT_COMMIT: {
9958 dtrace_action_t *act = ecb->dte_action;
9959
9960 for (; act != NULL; act = act->dta_next) {
9961 if (act->dta_kind == DTRACEACT_COMMIT)
9962 return (EINVAL);
9963 }
9964
9965 if (dp == NULL)
9966 return (EINVAL);
9967 break;
9968 }
9969
9970 default:
9971 return (EINVAL);
9972 }
9973
9974 if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {
9975 /*
9976 * If this is a data-storing action or a speculate,
9977 * we must be sure that there isn't a commit on the
9978 * action chain.
9979 */
9980 dtrace_action_t *act = ecb->dte_action;
9981
9982 for (; act != NULL; act = act->dta_next) {
9983 if (act->dta_kind == DTRACEACT_COMMIT)
9984 return (EINVAL);
9985 }
9986 }
9987
9988 action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
9989 action->dta_rec.dtrd_size = size;
9990 }
9991
9992 action->dta_refcnt = 1;
9993 rec = &action->dta_rec;
9994 size = rec->dtrd_size;
9995
9996 for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
9997 if (!(size & mask)) {
9998 align = mask + 1;
9999 break;
10000 }
10001 }
10002
10003 action->dta_kind = desc->dtad_kind;
10004
10005 if ((action->dta_difo = dp) != NULL)
10006 dtrace_difo_hold(dp);
10007
10008 rec->dtrd_action = action->dta_kind;
10009 rec->dtrd_arg = arg;
10010 rec->dtrd_uarg = desc->dtad_uarg;
10011 rec->dtrd_alignment = (uint16_t)align;
10012 rec->dtrd_format = format;
10013
10014 if ((last = ecb->dte_action_last) != NULL) {
10015 ASSERT(ecb->dte_action != NULL);
10016 action->dta_prev = last;
10017 last->dta_next = action;
10018 } else {
10019 ASSERT(ecb->dte_action == NULL);
10020 ecb->dte_action = action;
10021 }
10022
10023 ecb->dte_action_last = action;
10024
10025 return (0);
10026}
10027
10028static void
10029dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
10030{
10031 dtrace_action_t *act = ecb->dte_action, *next;
10032 dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
10033 dtrace_difo_t *dp;
10034 uint16_t format;
10035
10036 if (act != NULL && act->dta_refcnt > 1) {
10037 ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);
10038 act->dta_refcnt--;
10039 } else {
10040 for (; act != NULL; act = next) {
10041 next = act->dta_next;
10042 ASSERT(next != NULL || act == ecb->dte_action_last);
10043 ASSERT(act->dta_refcnt == 1);
10044
10045 if ((format = act->dta_rec.dtrd_format) != 0)
10046 dtrace_format_remove(ecb->dte_state, format);
10047
10048 if ((dp = act->dta_difo) != NULL)
10049 dtrace_difo_release(dp, vstate);
10050
10051 if (DTRACEACT_ISAGG(act->dta_kind)) {
10052 dtrace_ecb_aggregation_destroy(ecb, act);
10053 } else {
10054 kmem_free(act, sizeof (dtrace_action_t));
10055 }
10056 }
10057 }
10058
10059 ecb->dte_action = NULL;
10060 ecb->dte_action_last = NULL;
10061 ecb->dte_size = sizeof (dtrace_epid_t);
10062}
10063
10064static void
10065dtrace_ecb_disable(dtrace_ecb_t *ecb)
10066{
10067 /*
10068 * We disable the ECB by removing it from its probe.
10069 */
10070 dtrace_ecb_t *pecb, *prev = NULL;
10071 dtrace_probe_t *probe = ecb->dte_probe;
10072
10073 ASSERT(MUTEX_HELD(&dtrace_lock));
10074
10075 if (probe == NULL) {
10076 /*
10077 * This is the NULL probe; there is nothing to disable.
10078 */
10079 return;
10080 }
10081
10082 for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
10083 if (pecb == ecb)
10084 break;
10085 prev = pecb;
10086 }
10087
10088 ASSERT(pecb != NULL);
10089
10090 if (prev == NULL) {
10091 probe->dtpr_ecb = ecb->dte_next;
10092 } else {
10093 prev->dte_next = ecb->dte_next;
10094 }
10095
10096 if (ecb == probe->dtpr_ecb_last) {
10097 ASSERT(ecb->dte_next == NULL);
10098 probe->dtpr_ecb_last = prev;
10099 }
10100
10101 /*
10102 * The ECB has been disconnected from the probe; now sync to assure
10103 * that all CPUs have seen the change before returning.
10104 */
10105 dtrace_sync();
10106
10107 if (probe->dtpr_ecb == NULL) {
10108 /*
10109 * That was the last ECB on the probe; clear the predicate
10110 * cache ID for the probe, disable it and sync one more time
10111 * to assure that we'll never hit it again.
10112 */
10113 dtrace_provider_t *prov = probe->dtpr_provider;
10114
10115 ASSERT(ecb->dte_next == NULL);
10116 ASSERT(probe->dtpr_ecb_last == NULL);
10117 probe->dtpr_predcache = DTRACE_CACHEIDNONE;
10118 prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
10119 probe->dtpr_id, probe->dtpr_arg);
10120 dtrace_sync();
10121 } else {
10122 /*
10123 * There is at least one ECB remaining on the probe. If there
10124 * is _exactly_ one, set the probe's predicate cache ID to be
10125 * the predicate cache ID of the remaining ECB.
10126 */
10127 ASSERT(probe->dtpr_ecb_last != NULL);
10128 ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);
10129
10130 if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
10131 dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;
10132
10133 ASSERT(probe->dtpr_ecb->dte_next == NULL);
10134
10135 if (p != NULL)
10136 probe->dtpr_predcache = p->dtp_cacheid;
10137 }
10138
10139 ecb->dte_next = NULL;
10140 }
10141}
10142
10143static void
10144dtrace_ecb_destroy(dtrace_ecb_t *ecb)
10145{
10146 dtrace_state_t *state = ecb->dte_state;
10147 dtrace_vstate_t *vstate = &state->dts_vstate;
10148 dtrace_predicate_t *pred;
10149 dtrace_epid_t epid = ecb->dte_epid;
10150
10151 ASSERT(MUTEX_HELD(&dtrace_lock));
10152 ASSERT(ecb->dte_next == NULL);
10153 ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);
10154
10155 if ((pred = ecb->dte_predicate) != NULL)
10156 dtrace_predicate_release(pred, vstate);
10157
10158 dtrace_ecb_action_remove(ecb);
10159
10160 ASSERT(state->dts_ecbs[epid - 1] == ecb);
10161 state->dts_ecbs[epid - 1] = NULL;
10162
10163 kmem_free(ecb, sizeof (dtrace_ecb_t));
10164}
10165
10166static dtrace_ecb_t *
10167dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
10168 dtrace_enabling_t *enab)
10169{
10170 dtrace_ecb_t *ecb;
10171 dtrace_predicate_t *pred;
10172 dtrace_actdesc_t *act;
10173 dtrace_provider_t *prov;
10174 dtrace_ecbdesc_t *desc = enab->dten_current;
10175
10176 ASSERT(MUTEX_HELD(&dtrace_lock));
10177 ASSERT(state != NULL);
10178
10179 ecb = dtrace_ecb_add(state, probe);
10180 ecb->dte_uarg = desc->dted_uarg;
10181
10182 if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
10183 dtrace_predicate_hold(pred);
10184 ecb->dte_predicate = pred;
10185 }
10186
10187 if (probe != NULL) {
10188 /*
10189 * If the provider shows more leg than the consumer is old
10190 * enough to see, we need to enable the appropriate implicit
10191 * predicate bits to prevent the ecb from activating at
10192 * revealing times.
10193 *
10194 * Providers specifying DTRACE_PRIV_USER at register time
10195 * are stating that they need the /proc-style privilege
10196 * model to be enforced, and this is what DTRACE_COND_OWNER
10197 * and DTRACE_COND_ZONEOWNER will then do at probe time.
10198 */
10199 prov = probe->dtpr_provider;
10200 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
10201 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
10202 ecb->dte_cond |= DTRACE_COND_OWNER;
10203
10204 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
10205 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
10206 ecb->dte_cond |= DTRACE_COND_ZONEOWNER;
10207
10208 /*
10209 * If the provider shows us kernel innards and the user
10210 * is lacking sufficient privilege, enable the
10211 * DTRACE_COND_USERMODE implicit predicate.
10212 */
10213 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
10214 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
10215 ecb->dte_cond |= DTRACE_COND_USERMODE;
10216 }
10217
10218 if (dtrace_ecb_create_cache != NULL) {
10219 /*
10220 * If we have a cached ecb, we'll use its action list instead
10221 * of creating our own (saving both time and space).
10222 */
10223 dtrace_ecb_t *cached = dtrace_ecb_create_cache;
10224 dtrace_action_t *act2 = cached->dte_action;
10225
10226 if (act2 != NULL) {
10227 ASSERT(act2->dta_refcnt > 0);
10228 act2->dta_refcnt++;
10229 ecb->dte_action = act2;
10230 ecb->dte_action_last = cached->dte_action_last;
10231 ecb->dte_needed = cached->dte_needed;
10232 ecb->dte_size = cached->dte_size;
10233 ecb->dte_alignment = cached->dte_alignment;
10234 }
10235
10236 return (ecb);
10237 }
10238
10239 for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
10240 if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {
10241 dtrace_ecb_destroy(ecb);
10242 return (NULL);
10243 }
10244 }
10245
10246 dtrace_ecb_resize(ecb);
10247
10248 return (dtrace_ecb_create_cache = ecb);
10249}
10250
10251static int
10252dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg)
10253{
10254 dtrace_ecb_t *ecb;
10255 dtrace_enabling_t *enab = arg;
10256 dtrace_state_t *state = enab->dten_vstate->dtvs_state;
10257
10258 ASSERT(state != NULL);
10259
10260 if (probe != NULL && probe->dtpr_gen < enab->dten_probegen) {
10261 /*
10262 * This probe was created in a generation for which this
10263 * enabling has previously created ECBs; we don't want to
10264 * enable it again, so just kick out.
10265 */
10266 return (DTRACE_MATCH_NEXT);
10267 }
10268
10269 if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
10270 return (DTRACE_MATCH_DONE);
10271
10272 if (dtrace_ecb_enable(ecb) < 0)
10273 return (DTRACE_MATCH_FAIL);
10274
10275 return (DTRACE_MATCH_NEXT);
10276}
10277
10278static dtrace_ecb_t *
10279dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
10280{
10281 dtrace_ecb_t *ecb;
10282
10283 ASSERT(MUTEX_HELD(&dtrace_lock));
10284
10285 if (id == 0 || VBDTCAST(int64_t)id > state->dts_necbs)
10286 return (NULL);
10287
10288 ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);
10289 ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);
10290
10291 return (state->dts_ecbs[id - 1]);
10292}
10293
10294static dtrace_aggregation_t *
10295dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
10296{
10297 dtrace_aggregation_t *agg;
10298
10299 ASSERT(MUTEX_HELD(&dtrace_lock));
10300
10301 if (id == 0 || VBDTCAST(int64_t)id > state->dts_naggregations)
10302 return (NULL);
10303
10304 ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);
10305 ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||
10306 agg->dtag_id == id);
10307
10308 return (state->dts_aggregations[id - 1]);
10309}
10310
10311/*
10312 * DTrace Buffer Functions
10313 *
10314 * The following functions manipulate DTrace buffers. Most of these functions
10315 * are called in the context of establishing or processing consumer state;
10316 * exceptions are explicitly noted.
10317 */
10318
10319/*
10320 * Note: called from cross call context. This function switches the two
10321 * buffers on a given CPU. The atomicity of this operation is assured by
10322 * disabling interrupts while the actual switch takes place; the disabling of
10323 * interrupts serializes the execution with any execution of dtrace_probe() on
10324 * the same CPU.
10325 */
10326static void
10327dtrace_buffer_switch(dtrace_buffer_t *buf)
10328{
10329 caddr_t tomax = buf->dtb_tomax;
10330 caddr_t xamot = buf->dtb_xamot;
10331 dtrace_icookie_t cookie;
10332
10333 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
10334 ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
10335
10336 cookie = dtrace_interrupt_disable();
10337 buf->dtb_tomax = xamot;
10338 buf->dtb_xamot = tomax;
10339 buf->dtb_xamot_drops = buf->dtb_drops;
10340 buf->dtb_xamot_offset = buf->dtb_offset;
10341 buf->dtb_xamot_errors = buf->dtb_errors;
10342 buf->dtb_xamot_flags = buf->dtb_flags;
10343 buf->dtb_offset = 0;
10344 buf->dtb_drops = 0;
10345 buf->dtb_errors = 0;
10346 buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
10347 dtrace_interrupt_enable(cookie);
10348}
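
/*
 * A minimal sketch of how a consumer-driven snapshot drives the switch above
 * (mirroring the buffer-snapshot ioctl path): the switch is dispatched as a
 * cross call so that it executes on the target CPU, serialized against any
 * dtrace_probe() processing there:
 *
 *	dtrace_buffer_t *buf = &state->dts_buffer[cpu];
 *	dtrace_xcall(cpu, (dtrace_xcall_t)dtrace_buffer_switch, buf);
 *
 * On return, dtb_xamot holds the quiesced data -- described by
 * dtb_xamot_offset, dtb_xamot_drops and dtb_xamot_errors -- and can be
 * copied out at leisure.
 */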
10349
10350/*
10351 * Note: called from cross call context. This function activates a buffer
10352 * on a CPU. As with dtrace_buffer_switch(), the atomicity of the operation
10353 * is guaranteed by the disabling of interrupts.
10354 */
10355static void
10356dtrace_buffer_activate(dtrace_state_t *state)
10357{
10358 dtrace_buffer_t *buf;
10359 dtrace_icookie_t cookie = dtrace_interrupt_disable();
10360
10361 buf = &state->dts_buffer[VBDT_GET_CPUID()];
10362
10363 if (buf->dtb_tomax != NULL) {
10364 /*
10365 * We might like to assert that the buffer is marked inactive,
10366 * but this isn't necessarily true: the buffer for the CPU
10367 * that processes the BEGIN probe has its buffer activated
10368		 * manually.  In this case, we take the (harmless) action
10369		 * of re-clearing the INACTIVE bit.
10370 */
10371 buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
10372 }
10373
10374 dtrace_interrupt_enable(cookie);
10375}
10376
10377static int
10378dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags,
10379 processorid_t cpu)
10380{
10381#ifndef VBOX
10382 cpu_t *cp;
10383#else
10384 RTCPUSET CpuSet;
10385 unsigned iCpu;
10386#endif
10387 dtrace_buffer_t *buf;
10388
10389 ASSERT(MUTEX_HELD(&cpu_lock));
10390 ASSERT(MUTEX_HELD(&dtrace_lock));
10391
10392 if (VBDTCAST(int64_t)size > dtrace_nonroot_maxsize
10393#ifndef VBOX
10394 && !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE)
10395#endif
10396 )
10397 return (EFBIG);
10398
10399#ifndef VBOX
10400 cp = cpu_list;
10401#else
10402 RTMpGetSet(&CpuSet);
10403#endif
10404
10405#ifndef VBOX
10406 do {
10407 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
10408 continue;
10409
10410 buf = &bufs[cp->cpu_id];
10411#else
10412 for (iCpu = 0; iCpu < RTCPUSET_MAX_CPUS; iCpu++) {
10413 if ( !RTCpuSetIsMember(&CpuSet, iCpu)
10414 || (cpu != (processorid_t)DTRACE_CPUALL && cpu != iCpu))
10415 continue;
10416
10417 buf = &bufs[iCpu];
10418#endif
10419
10420 /*
10421 * If there is already a buffer allocated for this CPU, it
10422 * is only possible that this is a DR event. In this case,
10423 * the buffer size must match our specified size.
10424 */
10425 if (buf->dtb_tomax != NULL) {
10426 ASSERT(buf->dtb_size == size);
10427 continue;
10428 }
10429
10430 ASSERT(buf->dtb_xamot == NULL);
10431
10432 if ((buf->dtb_tomax = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
10433 goto err;
10434
10435 buf->dtb_size = size;
10436 buf->dtb_flags = flags;
10437 buf->dtb_offset = 0;
10438 buf->dtb_drops = 0;
10439
10440 if (flags & DTRACEBUF_NOSWITCH)
10441 continue;
10442
10443 if ((buf->dtb_xamot = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
10444 goto err;
10445#ifndef VBOX
10446 } while ((cp = cp->cpu_next) != cpu_list);
10447#else
10448 }
10449#endif
10450
10451 return (0);
10452
10453err:
10454#ifndef VBOX
10455 cp = cpu_list;
10456
10457 do {
10458 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
10459 continue;
10460
10461 buf = &bufs[cp->cpu_id];
10462#else
10463 for (iCpu = 0; iCpu < RTCPUSET_MAX_CPUS; iCpu++) {
10464 if ( !RTCpuSetIsMember(&CpuSet, iCpu)
10465 || (cpu != (processorid_t)DTRACE_CPUALL && cpu != iCpu))
10466 continue;
10467
10468 buf = &bufs[iCpu];
10469#endif
10470
10471 if (buf->dtb_xamot != NULL) {
10472 ASSERT(buf->dtb_tomax != NULL);
10473 ASSERT(buf->dtb_size == size);
10474 kmem_free(buf->dtb_xamot, size);
10475 }
10476
10477 if (buf->dtb_tomax != NULL) {
10478 ASSERT(buf->dtb_size == size);
10479 kmem_free(buf->dtb_tomax, size);
10480 }
10481
10482 buf->dtb_tomax = NULL;
10483 buf->dtb_xamot = NULL;
10484 buf->dtb_size = 0;
10485#ifndef VBOX
10486 } while ((cp = cp->cpu_next) != cpu_list);
10487#else
10488 }
10489#endif
10490
10491 return (ENOMEM);
10492}
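
/*
 * Allocation failures above unwind through the err: label, which makes a
 * second pass over the same CPU set and frees whatever dtb_tomax/dtb_xamot
 * pairs had already been allocated.  A partial allocation therefore never
 * escapes dtrace_buffer_alloc(): the caller sees either fully populated
 * buffers or ENOMEM with every per-CPU buffer reset to empty.
 */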
10493
10494/*
10495 * Note: called from probe context. This function just increments the drop
10496 * count on a buffer. It has been made a function to allow for the
10497 * possibility of understanding the source of mysterious drop counts. (A
10498 * problem for which one may be particularly disappointed that DTrace cannot
10499 * be used to understand DTrace.)
10500 */
10501static void
10502dtrace_buffer_drop(dtrace_buffer_t *buf)
10503{
10504 buf->dtb_drops++;
10505}
10506
10507/*
10508 * Note: called from probe context. This function is called to reserve space
10509 * in a buffer. If mstate is non-NULL, sets the scratch base and size in the
10510 * mstate. Returns the new offset in the buffer, or a negative value if an
10511 * error has occurred.
10512 */
10513static intptr_t
10514dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
10515 dtrace_state_t *state, dtrace_mstate_t *mstate)
10516{
10517 intptr_t offs = buf->dtb_offset, soffs;
10518 intptr_t woffs;
10519 caddr_t tomax;
10520 size_t total;
10521
10522 if (buf->dtb_flags & DTRACEBUF_INACTIVE)
10523 return (-1);
10524
10525 if ((tomax = buf->dtb_tomax) == NULL) {
10526 dtrace_buffer_drop(buf);
10527 return (-1);
10528 }
10529
10530 if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
10531 while (offs & (align - 1)) {
10532 /*
10533 * Assert that our alignment is off by a number which
10534 * is itself sizeof (uint32_t) aligned.
10535 */
10536 ASSERT(!((align - (offs & (align - 1))) &
10537 (sizeof (uint32_t) - 1)));
10538 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
10539 offs += sizeof (uint32_t);
10540 }
10541
10542 if (VBDTCAST(uintptr_t)(soffs = offs + needed) > buf->dtb_size) {
10543 dtrace_buffer_drop(buf);
10544 return (-1);
10545 }
10546
10547 if (mstate == NULL)
10548 return (offs);
10549
10550 mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
10551 mstate->dtms_scratch_size = buf->dtb_size - soffs;
10552 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
10553
10554 return (offs);
10555 }
10556
10557 if (buf->dtb_flags & DTRACEBUF_FILL) {
10558 if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
10559 (buf->dtb_flags & DTRACEBUF_FULL))
10560 return (-1);
10561 goto out;
10562 }
10563
10564 total = needed + (offs & (align - 1));
10565
10566 /*
10567 * For a ring buffer, life is quite a bit more complicated. Before
10568 * we can store any padding, we need to adjust our wrapping offset.
10569 * (If we've never before wrapped or we're not about to, no adjustment
10570 * is required.)
10571 */
10572 if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
10573 offs + total > buf->dtb_size) {
10574 woffs = buf->dtb_xamot_offset;
10575
10576 if (offs + total > buf->dtb_size) {
10577 /*
10578 * We can't fit in the end of the buffer. First, a
10579 * sanity check that we can fit in the buffer at all.
10580 */
10581 if (total > buf->dtb_size) {
10582 dtrace_buffer_drop(buf);
10583 return (-1);
10584 }
10585
10586 /*
10587 * We're going to be storing at the top of the buffer,
10588 * so now we need to deal with the wrapped offset. We
10589 * only reset our wrapped offset to 0 if it is
10590 * currently greater than the current offset. If it
10591 * is less than the current offset, it is because a
10592 * previous allocation induced a wrap -- but the
10593 * allocation didn't subsequently take the space due
10594 * to an error or false predicate evaluation. In this
10595 * case, we'll just leave the wrapped offset alone: if
10596 * the wrapped offset hasn't been advanced far enough
10597 * for this allocation, it will be adjusted in the
10598 * lower loop.
10599 */
10600 if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
10601 if (woffs >= offs)
10602 woffs = 0;
10603 } else {
10604 woffs = 0;
10605 }
10606
10607 /*
10608 * Now we know that we're going to be storing to the
10609 * top of the buffer and that there is room for us
10610 * there. We need to clear the buffer from the current
10611 * offset to the end (there may be old gunk there).
10612 */
10613 while (VBDTCAST(uintptr_t)offs < buf->dtb_size)
10614 tomax[offs++] = 0;
10615
10616 /*
10617 * We need to set our offset to zero. And because we
10618 * are wrapping, we need to set the bit indicating as
10619 * much. We can also adjust our needed space back
10620 * down to the space required by the ECB -- we know
10621 * that the top of the buffer is aligned.
10622 */
10623 offs = 0;
10624 total = needed;
10625 buf->dtb_flags |= DTRACEBUF_WRAPPED;
10626 } else {
10627 /*
10628 * There is room for us in the buffer, so we simply
10629 * need to check the wrapped offset.
10630 */
10631 if (woffs < offs) {
10632 /*
10633 * The wrapped offset is less than the offset.
10634 * This can happen if we allocated buffer space
10635 * that induced a wrap, but then we didn't
10636 * subsequently take the space due to an error
10637 * or false predicate evaluation. This is
10638 * okay; we know that _this_ allocation isn't
10639 * going to induce a wrap. We still can't
10640 * reset the wrapped offset to be zero,
10641 * however: the space may have been trashed in
10642 * the previous failed probe attempt. But at
10643 * least the wrapped offset doesn't need to
10644 * be adjusted at all...
10645 */
10646 goto out;
10647 }
10648 }
10649
10650 while (VBDTCAST(uintptr_t)offs + total > VBDTCAST(uintptr_t)woffs) {
10651 dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
10652 size_t size;
10653
10654 if (epid == DTRACE_EPIDNONE) {
10655 size = sizeof (uint32_t);
10656 } else {
10657 ASSERT(VBDTCAST(int64_t)epid <= state->dts_necbs);
10658 ASSERT(state->dts_ecbs[epid - 1] != NULL);
10659
10660 size = state->dts_ecbs[epid - 1]->dte_size;
10661 }
10662
10663 ASSERT(woffs + size <= buf->dtb_size);
10664 ASSERT(size != 0);
10665
10666 if (woffs + size == buf->dtb_size) {
10667 /*
10668 * We've reached the end of the buffer; we want
10669 * to set the wrapped offset to 0 and break
10670 * out. However, if the offs is 0, then we're
10671 * in a strange edge-condition: the amount of
10672 * space that we want to reserve plus the size
10673 * of the record that we're overwriting is
10674 * greater than the size of the buffer. This
10675 * is problematic because if we reserve the
10676 * space but subsequently don't consume it (due
10677 * to a failed predicate or error) the wrapped
10678 * offset will be 0 -- yet the EPID at offset 0
10679 * will not be committed. This situation is
10680 * relatively easy to deal with: if we're in
10681 * this case, the buffer is indistinguishable
10682 * from one that hasn't wrapped; we need only
10683 * finish the job by clearing the wrapped bit,
10684 * explicitly setting the offset to be 0, and
10685 * zero'ing out the old data in the buffer.
10686 */
10687 if (offs == 0) {
10688 buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
10689 buf->dtb_offset = 0;
10690 woffs = total;
10691
10692 while (VBDTCAST(uintptr_t)woffs < buf->dtb_size)
10693 tomax[woffs++] = 0;
10694 }
10695
10696 woffs = 0;
10697 break;
10698 }
10699
10700 woffs += size;
10701 }
10702
10703 /*
10704 * We have a wrapped offset. It may be that the wrapped offset
10705 * has become zero -- that's okay.
10706 */
10707 buf->dtb_xamot_offset = woffs;
10708 }
10709
10710out:
10711 /*
10712 * Now we can plow the buffer with any necessary padding.
10713 */
10714 while (offs & (align - 1)) {
10715 /*
10716 * Assert that our alignment is off by a number which
10717 * is itself sizeof (uint32_t) aligned.
10718 */
10719 ASSERT(!((align - (offs & (align - 1))) &
10720 (sizeof (uint32_t) - 1)));
10721 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
10722 offs += sizeof (uint32_t);
10723 }
10724
10725 if (buf->dtb_flags & DTRACEBUF_FILL) {
10726 if (offs + needed > buf->dtb_size - state->dts_reserve) {
10727 buf->dtb_flags |= DTRACEBUF_FULL;
10728 return (-1);
10729 }
10730 }
10731
10732 if (mstate == NULL)
10733 return (offs);
10734
10735 /*
10736 * For ring buffers and fill buffers, the scratch space is always
10737 * the inactive buffer.
10738 */
10739 mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
10740 mstate->dtms_scratch_size = buf->dtb_size;
10741 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
10742
10743 return (offs);
10744}
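
/*
 * A worked example of the ring-buffer arithmetic above, with made-up
 * numbers: take dtb_size = 64, dtb_offset = 56 and an aligned 16-byte
 * reservation.  offs + total = 72 exceeds the buffer, so the tail from
 * offset 56 to 63 is zeroed, offs is reset to 0 and DTRACEBUF_WRAPPED is
 * set.  The inner loop then advances woffs one record at a time -- using
 * each record's EPID to look up its size in dts_ecbs -- until at least 16
 * bytes of the oldest data have been reclaimed, and the new wrapped offset
 * is published in dtb_xamot_offset.
 */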
10745
10746static void
10747dtrace_buffer_polish(dtrace_buffer_t *buf)
10748{
10749 ASSERT(buf->dtb_flags & DTRACEBUF_RING);
10750 ASSERT(MUTEX_HELD(&dtrace_lock));
10751
10752 if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
10753 return;
10754
10755 /*
10756 * We need to polish the ring buffer. There are three cases:
10757 *
10758 * - The first (and presumably most common) is that there is no gap
10759 * between the buffer offset and the wrapped offset. In this case,
10760 * there is nothing in the buffer that isn't valid data; we can
10761 * mark the buffer as polished and return.
10762 *
10763 * - The second (less common than the first but still more common
10764 * than the third) is that there is a gap between the buffer offset
10765 * and the wrapped offset, and the wrapped offset is larger than the
10766 * buffer offset. This can happen because of an alignment issue, or
10767 * can happen because of a call to dtrace_buffer_reserve() that
10768 * didn't subsequently consume the buffer space. In this case,
10769 * we need to zero the data from the buffer offset to the wrapped
10770 * offset.
10771 *
10772 * - The third (and least common) is that there is a gap between the
10773 * buffer offset and the wrapped offset, but the wrapped offset is
10774 * _less_ than the buffer offset. This can only happen because a
10775 * call to dtrace_buffer_reserve() induced a wrap, but the space
10776 * was not subsequently consumed. In this case, we need to zero the
10777 * space from the offset to the end of the buffer _and_ from the
10778 * top of the buffer to the wrapped offset.
10779 */
10780 if (buf->dtb_offset < buf->dtb_xamot_offset) {
10781 bzero(buf->dtb_tomax + buf->dtb_offset,
10782 buf->dtb_xamot_offset - buf->dtb_offset);
10783 }
10784
10785 if (buf->dtb_offset > buf->dtb_xamot_offset) {
10786 bzero(buf->dtb_tomax + buf->dtb_offset,
10787 buf->dtb_size - buf->dtb_offset);
10788 bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
10789 }
10790}
10791
10792static void
10793dtrace_buffer_free(dtrace_buffer_t *bufs)
10794{
10795 int i;
10796
10797 for (i = 0; i < NCPU; i++) {
10798 dtrace_buffer_t *buf = &bufs[i];
10799
10800 if (buf->dtb_tomax == NULL) {
10801 ASSERT(buf->dtb_xamot == NULL);
10802 ASSERT(buf->dtb_size == 0);
10803 continue;
10804 }
10805
10806 if (buf->dtb_xamot != NULL) {
10807 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
10808 kmem_free(buf->dtb_xamot, buf->dtb_size);
10809 }
10810
10811 kmem_free(buf->dtb_tomax, buf->dtb_size);
10812 buf->dtb_size = 0;
10813 buf->dtb_tomax = NULL;
10814 buf->dtb_xamot = NULL;
10815 }
10816}
10817
10818/*
10819 * DTrace Enabling Functions
10820 */
10821static dtrace_enabling_t *
10822dtrace_enabling_create(dtrace_vstate_t *vstate)
10823{
10824 dtrace_enabling_t *enab;
10825
10826 enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
10827 enab->dten_vstate = vstate;
10828
10829 return (enab);
10830}
10831
10832static void
10833dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)
10834{
10835 dtrace_ecbdesc_t **ndesc;
10836 size_t osize, nsize;
10837
10838 /*
10839 * We can't add to enablings after we've enabled them, or after we've
10840 * retained them.
10841 */
10842 ASSERT(enab->dten_probegen == 0);
10843 ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
10844
10845 if (enab->dten_ndesc < enab->dten_maxdesc) {
10846 enab->dten_desc[enab->dten_ndesc++] = ecb;
10847 return;
10848 }
10849
10850 osize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
10851
10852 if (enab->dten_maxdesc == 0) {
10853 enab->dten_maxdesc = 1;
10854 } else {
10855 enab->dten_maxdesc <<= 1;
10856 }
10857
10858 ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
10859
10860 nsize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
10861 ndesc = kmem_zalloc(nsize, KM_SLEEP);
10862 bcopy(enab->dten_desc, ndesc, osize);
10863 kmem_free(enab->dten_desc, osize);
10864
10865 enab->dten_desc = ndesc;
10866 enab->dten_desc[enab->dten_ndesc++] = ecb;
10867}
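
/*
 * dten_desc thus grows geometrically (1, 2, 4, ...): each resize allocates
 * a doubled array, bcopy()s the old descriptions across and frees the old
 * array, so adding n descriptions costs O(n) amortized work rather than
 * O(n^2).
 */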
10868
10869static void
10870dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,
10871 dtrace_probedesc_t *pd)
10872{
10873 dtrace_ecbdesc_t *new;
10874 dtrace_predicate_t *pred;
10875 dtrace_actdesc_t *act;
10876
10877 /*
10878 * We're going to create a new ECB description that matches the
10879 * specified ECB in every way, but has the specified probe description.
10880 */
10881 new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
10882
10883 if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
10884 dtrace_predicate_hold(pred);
10885
10886 for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
10887 dtrace_actdesc_hold(act);
10888
10889 new->dted_action = ecb->dted_action;
10890 new->dted_pred = ecb->dted_pred;
10891 new->dted_probe = *pd;
10892 new->dted_uarg = ecb->dted_uarg;
10893
10894 dtrace_enabling_add(enab, new);
10895}
10896
10897static void
10898dtrace_enabling_dump(dtrace_enabling_t *enab)
10899{
10900 int i;
10901
10902 for (i = 0; i < enab->dten_ndesc; i++) {
10903 dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
10904
10905 cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
10906 desc->dtpd_provider, desc->dtpd_mod,
10907 desc->dtpd_func, desc->dtpd_name);
10908 }
10909}
10910
10911static void
10912dtrace_enabling_destroy(dtrace_enabling_t *enab)
10913{
10914 int i;
10915 dtrace_ecbdesc_t *ep;
10916 dtrace_vstate_t *vstate = enab->dten_vstate;
10917
10918 ASSERT(MUTEX_HELD(&dtrace_lock));
10919
10920 for (i = 0; i < enab->dten_ndesc; i++) {
10921 dtrace_actdesc_t *act, *next;
10922 dtrace_predicate_t *pred;
10923
10924 ep = enab->dten_desc[i];
10925
10926 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
10927 dtrace_predicate_release(pred, vstate);
10928
10929 for (act = ep->dted_action; act != NULL; act = next) {
10930 next = act->dtad_next;
10931 dtrace_actdesc_release(act, vstate);
10932 }
10933
10934 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
10935 }
10936
10937 kmem_free(enab->dten_desc,
10938 enab->dten_maxdesc * sizeof (dtrace_enabling_t *));
10939
10940 /*
10941 * If this was a retained enabling, decrement the dts_nretained count
10942 * and take it off of the dtrace_retained list.
10943 */
10944 if (enab->dten_prev != NULL || enab->dten_next != NULL ||
10945 dtrace_retained == enab) {
10946 ASSERT(enab->dten_vstate->dtvs_state != NULL);
10947 ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
10948 enab->dten_vstate->dtvs_state->dts_nretained--;
10949 dtrace_retained_gen++;
10950 }
10951
10952 if (enab->dten_prev == NULL) {
10953 if (dtrace_retained == enab) {
10954 dtrace_retained = enab->dten_next;
10955
10956 if (dtrace_retained != NULL)
10957 dtrace_retained->dten_prev = NULL;
10958 }
10959 } else {
10960 ASSERT(enab != dtrace_retained);
10961 ASSERT(dtrace_retained != NULL);
10962 enab->dten_prev->dten_next = enab->dten_next;
10963 }
10964
10965 if (enab->dten_next != NULL) {
10966 ASSERT(dtrace_retained != NULL);
10967 enab->dten_next->dten_prev = enab->dten_prev;
10968 }
10969
10970 kmem_free(enab, sizeof (dtrace_enabling_t));
10971}
10972
10973static int
10974dtrace_enabling_retain(dtrace_enabling_t *enab)
10975{
10976 dtrace_state_t *state;
10977
10978 ASSERT(MUTEX_HELD(&dtrace_lock));
10979 ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
10980 ASSERT(enab->dten_vstate != NULL);
10981
10982 state = enab->dten_vstate->dtvs_state;
10983 ASSERT(state != NULL);
10984
10985 /*
10986 * We only allow each state to retain dtrace_retain_max enablings.
10987 */
10988 if (state->dts_nretained >= dtrace_retain_max)
10989 return (ENOSPC);
10990
10991 state->dts_nretained++;
10992 dtrace_retained_gen++;
10993
10994 if (dtrace_retained == NULL) {
10995 dtrace_retained = enab;
10996 return (0);
10997 }
10998
10999 enab->dten_next = dtrace_retained;
11000 dtrace_retained->dten_prev = enab;
11001 dtrace_retained = enab;
11002
11003 return (0);
11004}
11005
11006static int
11007dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
11008 dtrace_probedesc_t *create)
11009{
11010 dtrace_enabling_t *new, *enab;
11011 int found = 0, err = ENOENT;
11012
11013 ASSERT(MUTEX_HELD(&dtrace_lock));
11014 ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
11015 ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
11016 ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
11017 ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
11018
11019 new = dtrace_enabling_create(&state->dts_vstate);
11020
11021 /*
11022 * Iterate over all retained enablings, looking for enablings that
11023 * match the specified state.
11024 */
11025 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
11026 int i;
11027
11028 /*
11029 * dtvs_state can only be NULL for helper enablings -- and
11030 * helper enablings can't be retained.
11031 */
11032 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11033
11034 if (enab->dten_vstate->dtvs_state != state)
11035 continue;
11036
11037 /*
11038 * Now iterate over each probe description; we're looking for
11039 * an exact match to the specified probe description.
11040 */
11041 for (i = 0; i < enab->dten_ndesc; i++) {
11042 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
11043 dtrace_probedesc_t *pd = &ep->dted_probe;
11044
11045 if (strcmp(pd->dtpd_provider, match->dtpd_provider))
11046 continue;
11047
11048 if (strcmp(pd->dtpd_mod, match->dtpd_mod))
11049 continue;
11050
11051 if (strcmp(pd->dtpd_func, match->dtpd_func))
11052 continue;
11053
11054 if (strcmp(pd->dtpd_name, match->dtpd_name))
11055 continue;
11056
11057 /*
11058 * We have a winning probe! Add it to our growing
11059 * enabling.
11060 */
11061 found = 1;
11062 dtrace_enabling_addlike(new, ep, create);
11063 }
11064 }
11065
11066 if (!found || (err = dtrace_enabling_retain(new)) != 0) {
11067 dtrace_enabling_destroy(new);
11068 return (err);
11069 }
11070
11071 return (0);
11072}
11073
11074static void
11075dtrace_enabling_retract(dtrace_state_t *state)
11076{
11077 dtrace_enabling_t *enab, *next;
11078
11079 ASSERT(MUTEX_HELD(&dtrace_lock));
11080
11081 /*
11082	 * Iterate over all retained enablings, destroying the enablings retained
11083	 * for the specified state.
11084 */
11085 for (enab = dtrace_retained; enab != NULL; enab = next) {
11086 next = enab->dten_next;
11087
11088 /*
11089 * dtvs_state can only be NULL for helper enablings -- and
11090 * helper enablings can't be retained.
11091 */
11092 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11093
11094 if (enab->dten_vstate->dtvs_state == state) {
11095 ASSERT(state->dts_nretained > 0);
11096 dtrace_enabling_destroy(enab);
11097 }
11098 }
11099
11100 ASSERT(state->dts_nretained == 0);
11101}
11102
11103static int
11104dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched)
11105{
11106 int i = 0;
11107 int total_matched = 0, matched = 0;
11108
11109 ASSERT(MUTEX_HELD(&cpu_lock));
11110 ASSERT(MUTEX_HELD(&dtrace_lock));
11111
11112 for (i = 0; i < enab->dten_ndesc; i++) {
11113 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
11114
11115 enab->dten_current = ep;
11116 enab->dten_error = 0;
11117
11118 /*
11119 * If a provider failed to enable a probe then get out and
11120 * let the consumer know we failed.
11121 */
11122 if ((matched = dtrace_probe_enable(&ep->dted_probe, enab)) < 0)
11123 return (EBUSY);
11124
11125 total_matched += matched;
11126
11127 if (enab->dten_error != 0) {
11128 /*
11129 * If we get an error half-way through enabling the
11130 * probes, we kick out -- perhaps with some number of
11131 * them enabled. Leaving enabled probes enabled may
11132 * be slightly confusing for user-level, but we expect
11133 * that no one will attempt to actually drive on in
11134 * the face of such errors. If this is an anonymous
11135 * enabling (indicated with a NULL nmatched pointer),
11136 * we cmn_err() a message. We aren't expecting to
11137			 * get such an error -- insofar as it can exist at all,
11138			 * it would be the result of corrupted DOF in the driver
11139 * properties.
11140 */
11141 if (nmatched == NULL) {
11142 cmn_err(CE_WARN, "dtrace_enabling_match() "
11143 "error on %p: %d", (void *)ep,
11144 enab->dten_error);
11145 }
11146
11147 return (enab->dten_error);
11148 }
11149 }
11150
11151 enab->dten_probegen = dtrace_probegen;
11152 if (nmatched != NULL)
11153 *nmatched = total_matched;
11154
11155 return (0);
11156}
11157
11158static void
11159dtrace_enabling_matchall(void)
11160{
11161 dtrace_enabling_t *enab;
11162
11163 mutex_enter(&cpu_lock);
11164 mutex_enter(&dtrace_lock);
11165
11166 /*
11167 * Iterate over all retained enablings to see if any probes match
11168 * against them. We only perform this operation on enablings for which
11169 * we have sufficient permissions by virtue of being in the global zone
11170 * or in the same zone as the DTrace client. Because we can be called
11171 * after dtrace_detach() has been called, we cannot assert that there
11172 * are retained enablings. We can safely load from dtrace_retained,
11173 * however: the taskq_destroy() at the end of dtrace_detach() will
11174 * block pending our completion.
11175 */
11176 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
11177 cred_t *cr = enab->dten_vstate->dtvs_state->dts_cred.dcr_cred;
11178
11179#ifndef VBOX
11180		if (INGLOBALZONE(curproc) ||
11181		    (cr != NULL && getzoneid() == crgetzoneid(cr)))
11182#endif
11183 (void) dtrace_enabling_match(enab, NULL);
11184 }
11185
11186 mutex_exit(&dtrace_lock);
11187 mutex_exit(&cpu_lock);
11188}
11189
11190/*
11191 * If an enabling is to be enabled without having matched probes (that is, if
11192 * dtrace_state_go() is to be called on the underlying dtrace_state_t), the
11193 * enabling must be _primed_ by creating an ECB for every ECB description.
11194 * This must be done to assure that we know the number of speculations, the
11195 * number of aggregations, the minimum buffer size needed, etc. before we
11196 * transition out of DTRACE_ACTIVITY_INACTIVE. To do this without actually
11197 * enabling any probes, we create ECBs for every ECB description, but with a
11198 * NULL probe -- which is exactly what this function does.
11199 */
11200static void
11201dtrace_enabling_prime(dtrace_state_t *state)
11202{
11203 dtrace_enabling_t *enab;
11204 int i;
11205
11206 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
11207 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11208
11209 if (enab->dten_vstate->dtvs_state != state)
11210 continue;
11211
11212 /*
11213 * We don't want to prime an enabling more than once, lest
11214 * we allow a malicious user to induce resource exhaustion.
11215 * (The ECBs that result from priming an enabling aren't
11216 * leaked -- but they also aren't deallocated until the
11217 * consumer state is destroyed.)
11218 */
11219 if (enab->dten_primed)
11220 continue;
11221
11222 for (i = 0; i < enab->dten_ndesc; i++) {
11223 enab->dten_current = enab->dten_desc[i];
11224 (void) dtrace_probe_enable(NULL, enab);
11225 }
11226
11227 enab->dten_primed = 1;
11228 }
11229}
11230
11231/*
11232 * Called to indicate that probes should be provided due to retained
11233 * enablings. This is implemented in terms of dtrace_probe_provide(), but it
11234 * must take an initial lap through the enabling calling the dtps_provide()
11235 * entry point explicitly to allow for autocreated probes.
11236 */
11237static void
11238dtrace_enabling_provide(dtrace_provider_t *prv)
11239{
11240 int i, all = 0;
11241 dtrace_probedesc_t desc;
11242 dtrace_genid_t gen;
11243
11244 ASSERT(MUTEX_HELD(&dtrace_lock));
11245 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
11246
11247 if (prv == NULL) {
11248 all = 1;
11249 prv = dtrace_provider;
11250 }
11251
11252 do {
11253 dtrace_enabling_t *enab;
11254 void *parg = prv->dtpv_arg;
11255
11256retry:
11257 gen = dtrace_retained_gen;
11258 for (enab = dtrace_retained; enab != NULL;
11259 enab = enab->dten_next) {
11260 for (i = 0; i < enab->dten_ndesc; i++) {
11261 desc = enab->dten_desc[i]->dted_probe;
11262 mutex_exit(&dtrace_lock);
11263 prv->dtpv_pops.dtps_provide(parg, &desc);
11264 mutex_enter(&dtrace_lock);
11265 /*
11266 * Process the retained enablings again if
11267 * they have changed while we weren't holding
11268 * dtrace_lock.
11269 */
11270 if (gen != dtrace_retained_gen)
11271 goto retry;
11272 }
11273 }
11274 } while (all && (prv = prv->dtpv_next) != NULL);
11275
11276 mutex_exit(&dtrace_lock);
11277 dtrace_probe_provide(NULL, all ? NULL : prv);
11278 mutex_enter(&dtrace_lock);
11279}
11280
11281/*
11282 * DTrace DOF Functions
11283 */
11284/*ARGSUSED*/
11285static void
11286dtrace_dof_error(dof_hdr_t *dof, const char *str)
11287{
11288 if (dtrace_err_verbose)
11289 cmn_err(CE_WARN, "failed to process DOF: %s", str);
11290
11291#ifdef DTRACE_ERRDEBUG
11292 dtrace_errdebug(str);
11293#endif
11294}
11295
11296/*
11297 * Create DOF out of a currently enabled state. Right now, we only create
11298 * DOF containing the run-time options -- but this could be expanded to create
11299 * complete DOF representing the enabled state.
11300 */
11301static dof_hdr_t *
11302dtrace_dof_create(dtrace_state_t *state)
11303{
11304 dof_hdr_t *dof;
11305 dof_sec_t *sec;
11306 dof_optdesc_t *opt;
11307 int i, len = sizeof (dof_hdr_t) +
11308 roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
11309 sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
11310
11311 ASSERT(MUTEX_HELD(&dtrace_lock));
11312
11313 dof = kmem_zalloc(len, KM_SLEEP);
11314 dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
11315 dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
11316 dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
11317 dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
11318
11319 dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
11320 dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
11321 dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
11322 dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
11323 dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
11324 dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
11325
11326 dof->dofh_flags = 0;
11327 dof->dofh_hdrsize = sizeof (dof_hdr_t);
11328 dof->dofh_secsize = sizeof (dof_sec_t);
11329 dof->dofh_secnum = 1; /* only DOF_SECT_OPTDESC */
11330 dof->dofh_secoff = sizeof (dof_hdr_t);
11331 dof->dofh_loadsz = len;
11332 dof->dofh_filesz = len;
11333 dof->dofh_pad = 0;
11334
11335 /*
11336 * Fill in the option section header...
11337 */
11338 sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
11339 sec->dofs_type = DOF_SECT_OPTDESC;
11340 sec->dofs_align = sizeof (uint64_t);
11341 sec->dofs_flags = DOF_SECF_LOAD;
11342 sec->dofs_entsize = sizeof (dof_optdesc_t);
11343
11344 opt = (dof_optdesc_t *)((uintptr_t)sec +
11345 roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
11346
11347 sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
11348 sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
11349
11350 for (i = 0; i < DTRACEOPT_MAX; i++) {
11351 opt[i].dofo_option = i;
11352 opt[i].dofo_strtab = DOF_SECIDX_NONE;
11353 opt[i].dofo_value = state->dts_options[i];
11354 }
11355
11356 return (dof);
11357}
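
/*
 * The DOF produced above has this minimal layout (offsets per the
 * assignments in dtrace_dof_create()):
 *
 *	+---------------+ 0
 *	| dof_hdr_t     |
 *	+---------------+ dofh_secoff == sizeof (dof_hdr_t)
 *	| dof_sec_t     |  one DOF_SECT_OPTDESC section header
 *	+---------------+ dofs_offset (rounded up to uint64_t alignment)
 *	| dof_optdesc_t |  DTRACEOPT_MAX option descriptions
 *	+---------------+ dofh_loadsz == dofh_filesz == len
 */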
11358
11359static dof_hdr_t *
11360dtrace_dof_copyin(uintptr_t uarg, int *errp)
11361{
11362 dof_hdr_t hdr, *dof;
11363
11364 ASSERT(!MUTEX_HELD(&dtrace_lock));
11365
11366 /*
11367 * First, we're going to copyin() the sizeof (dof_hdr_t).
11368 */
11369 if (copyin((void *)uarg, &hdr, sizeof (hdr)) != 0) {
11370 dtrace_dof_error(NULL, "failed to copyin DOF header");
11371 *errp = EFAULT;
11372 return (NULL);
11373 }
11374
11375 /*
11376 * Now we'll allocate the entire DOF and copy it in -- provided
11377 * that the length isn't outrageous.
11378 */
11379 if (hdr.dofh_loadsz >= VBDTCAST(uint64_t)dtrace_dof_maxsize) {
11380 dtrace_dof_error(&hdr, "load size exceeds maximum");
11381 *errp = E2BIG;
11382 return (NULL);
11383 }
11384
11385 if (hdr.dofh_loadsz < sizeof (hdr)) {
11386 dtrace_dof_error(&hdr, "invalid load size");
11387 *errp = EINVAL;
11388 return (NULL);
11389 }
11390
11391 dof = kmem_alloc(hdr.dofh_loadsz, KM_SLEEP);
11392
11393 if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0 ||
11394 dof->dofh_loadsz != hdr.dofh_loadsz) {
11395 kmem_free(dof, hdr.dofh_loadsz);
11396 *errp = EFAULT;
11397 return (NULL);
11398 }
11399
11400 return (dof);
11401}
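
/*
 * Note the recheck above: after the full-sized copyin(), dofh_loadsz in the
 * copied DOF must still equal the value read by the initial header copyin().
 * Without it, a user thread racing between the two copies could rewrite the
 * header and have the kernel parse a DOF whose claimed load size no longer
 * matches the allocation.
 */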
11402
11403static dof_hdr_t *
11404dtrace_dof_property(const char *name)
11405{
11406#ifndef VBOX
11407 uchar_t *buf;
11408 uint64_t loadsz;
11409 unsigned int len, i;
11410 dof_hdr_t *dof;
11411
11412 /*
11413	 * Unfortunately, arrays of values in .conf files are always (and
11414 * only) interpreted to be integer arrays. We must read our DOF
11415 * as an integer array, and then squeeze it into a byte array.
11416 */
11417 if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dtrace_devi, 0,
11418 (char *)name, (int **)&buf, &len) != DDI_PROP_SUCCESS)
11419 return (NULL);
11420
11421 for (i = 0; i < len; i++)
11422 buf[i] = (uchar_t)(((int *)buf)[i]);
11423
11424 if (len < sizeof (dof_hdr_t)) {
11425 ddi_prop_free(buf);
11426 dtrace_dof_error(NULL, "truncated header");
11427 return (NULL);
11428 }
11429
11430 if (len < (loadsz = ((dof_hdr_t *)buf)->dofh_loadsz)) {
11431 ddi_prop_free(buf);
11432 dtrace_dof_error(NULL, "truncated DOF");
11433 return (NULL);
11434 }
11435
11436 if (loadsz >= dtrace_dof_maxsize) {
11437 ddi_prop_free(buf);
11438 dtrace_dof_error(NULL, "oversized DOF");
11439 return (NULL);
11440 }
11441
11442 dof = kmem_alloc(loadsz, KM_SLEEP);
11443 bcopy(buf, dof, loadsz);
11444 ddi_prop_free(buf);
11445
11446 return (dof);
11447#else /* VBOX */
11448 return (NULL);
11449#endif /* VBOX */
11450}
11451
11452static void
11453dtrace_dof_destroy(dof_hdr_t *dof)
11454{
11455 kmem_free(dof, dof->dofh_loadsz);
11456}
11457
11458/*
11459 * Return the dof_sec_t pointer corresponding to a given section index. If the
11460 * index is not valid, dtrace_dof_error() is called and NULL is returned. If
11461 * a type other than DOF_SECT_NONE is specified, the header is checked against
11462 * this type and NULL is returned if the types do not match.
11463 */
11464static dof_sec_t *
11465dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
11466{
11467 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)
11468 ((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);
11469
11470 if (i >= dof->dofh_secnum) {
11471 dtrace_dof_error(dof, "referenced section index is invalid");
11472 return (NULL);
11473 }
11474
11475 if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
11476 dtrace_dof_error(dof, "referenced section is not loadable");
11477 return (NULL);
11478 }
11479
11480 if (type != DOF_SECT_NONE && type != sec->dofs_type) {
11481 dtrace_dof_error(dof, "referenced section is the wrong type");
11482 return (NULL);
11483 }
11484
11485 return (sec);
11486}
11487
11488static dtrace_probedesc_t *
11489dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
11490{
11491 dof_probedesc_t *probe;
11492 dof_sec_t *strtab;
11493 uintptr_t daddr = (uintptr_t)dof;
11494 uintptr_t str;
11495 size_t size;
11496
11497 if (sec->dofs_type != DOF_SECT_PROBEDESC) {
11498 dtrace_dof_error(dof, "invalid probe section");
11499 return (NULL);
11500 }
11501
11502 if (sec->dofs_align != sizeof (dof_secidx_t)) {
11503 dtrace_dof_error(dof, "bad alignment in probe description");
11504 return (NULL);
11505 }
11506
11507 if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
11508 dtrace_dof_error(dof, "truncated probe description");
11509 return (NULL);
11510 }
11511
11512 probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
11513 strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);
11514
11515 if (strtab == NULL)
11516 return (NULL);
11517
11518 str = daddr + strtab->dofs_offset;
11519 size = strtab->dofs_size;
11520
11521 if (probe->dofp_provider >= strtab->dofs_size) {
11522 dtrace_dof_error(dof, "corrupt probe provider");
11523 return (NULL);
11524 }
11525
11526 (void) strncpy(desc->dtpd_provider,
11527 (char *)(str + probe->dofp_provider),
11528 MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));
11529
11530 if (probe->dofp_mod >= strtab->dofs_size) {
11531 dtrace_dof_error(dof, "corrupt probe module");
11532 return (NULL);
11533 }
11534
11535 (void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
11536 MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));
11537
11538 if (probe->dofp_func >= strtab->dofs_size) {
11539 dtrace_dof_error(dof, "corrupt probe function");
11540 return (NULL);
11541 }
11542
11543 (void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
11544 MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));
11545
11546 if (probe->dofp_name >= strtab->dofs_size) {
11547 dtrace_dof_error(dof, "corrupt probe name");
11548 return (NULL);
11549 }
11550
11551 (void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
11552 MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));
11553
11554 return (desc);
11555}
11556
11557static dtrace_difo_t *
11558dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
11559 cred_t *cr)
11560{
11561 dtrace_difo_t *dp;
11562 size_t ttl = 0;
11563 dof_difohdr_t *dofd;
11564 uintptr_t daddr = (uintptr_t)dof;
11565 size_t max = dtrace_difo_maxsize;
11566 int i, l, n;
11567
11568 static const struct {
11569 int section;
11570 int bufoffs;
11571 int lenoffs;
11572 int entsize;
11573 int align;
11574 const char *msg;
11575 } difo[] = {
11576 { DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
11577 offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
11578 sizeof (dif_instr_t), "multiple DIF sections" },
11579
11580 { DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
11581 offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
11582 sizeof (uint64_t), "multiple integer tables" },
11583
11584 { DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
11585 offsetof(dtrace_difo_t, dtdo_strlen), 0,
11586 sizeof (char), "multiple string tables" },
11587
11588 { DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
11589 offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
11590 sizeof (uint_t), "multiple variable tables" },
11591
11592		{ DOF_SECT_NONE, 0, 0, 0, 0, NULL }
11593 };
11594
11595 if (sec->dofs_type != DOF_SECT_DIFOHDR) {
11596 dtrace_dof_error(dof, "invalid DIFO header section");
11597 return (NULL);
11598 }
11599
11600 if (sec->dofs_align != sizeof (dof_secidx_t)) {
11601 dtrace_dof_error(dof, "bad alignment in DIFO header");
11602 return (NULL);
11603 }
11604
11605 if (sec->dofs_size < sizeof (dof_difohdr_t) ||
11606 sec->dofs_size % sizeof (dof_secidx_t)) {
11607 dtrace_dof_error(dof, "bad size in DIFO header");
11608 return (NULL);
11609 }
11610
11611 dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
11612 n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;
11613
11614 dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
11615 dp->dtdo_rtype = dofd->dofd_rtype;
11616
11617 for (l = 0; l < n; l++) {
11618 dof_sec_t *subsec;
11619 void **bufp;
11620 uint32_t *lenp;
11621
11622 if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
11623 dofd->dofd_links[l])) == NULL)
11624 goto err; /* invalid section link */
11625
11626 if (ttl + subsec->dofs_size > max) {
11627 dtrace_dof_error(dof, "exceeds maximum size");
11628 goto err;
11629 }
11630
11631 ttl += subsec->dofs_size;
11632
11633 for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {
11634 if (subsec->dofs_type != VBDTCAST(uint32_t)difo[i].section)
11635 continue;
11636
11637 if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
11638 dtrace_dof_error(dof, "section not loaded");
11639 goto err;
11640 }
11641
11642 if (subsec->dofs_align != VBDTCAST(uint32_t)difo[i].align) {
11643 dtrace_dof_error(dof, "bad alignment");
11644 goto err;
11645 }
11646
11647 bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
11648 lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);
11649
11650 if (*bufp != NULL) {
11651 dtrace_dof_error(dof, difo[i].msg);
11652 goto err;
11653 }
11654
11655 if (VBDTCAST(uint32_t)difo[i].entsize != subsec->dofs_entsize) {
11656 dtrace_dof_error(dof, "entry size mismatch");
11657 goto err;
11658 }
11659
11660 if (subsec->dofs_entsize != 0 &&
11661 (subsec->dofs_size % subsec->dofs_entsize) != 0) {
11662 dtrace_dof_error(dof, "corrupt entry size");
11663 goto err;
11664 }
11665
11666 *lenp = subsec->dofs_size;
11667 *bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
11668 bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
11669 *bufp, subsec->dofs_size);
11670
11671 if (subsec->dofs_entsize != 0)
11672 *lenp /= subsec->dofs_entsize;
11673
11674 break;
11675 }
11676
11677 /*
11678 * If we encounter a loadable DIFO sub-section that is not
11679 * known to us, assume this is a broken program and fail.
11680 */
11681 if (difo[i].section == DOF_SECT_NONE &&
11682 (subsec->dofs_flags & DOF_SECF_LOAD)) {
11683 dtrace_dof_error(dof, "unrecognized DIFO subsection");
11684 goto err;
11685 }
11686 }
11687
11688 if (dp->dtdo_buf == NULL) {
11689 /*
11690 * We can't have a DIF object without DIF text.
11691 */
11692 dtrace_dof_error(dof, "missing DIF text");
11693 goto err;
11694 }
11695
11696 /*
11697 * Before we validate the DIF object, run through the variable table
11698	 * looking for the strings -- if any of their sizes are zero, we'll set
11699 * their size to be the system-wide default string size. Note that
11700 * this should _not_ happen if the "strsize" option has been set --
11701 * in this case, the compiler should have set the size to reflect the
11702 * setting of the option.
11703 */
11704 for (i = 0; VBDTCAST(unsigned)i < dp->dtdo_varlen; i++) {
11705 dtrace_difv_t *v = &dp->dtdo_vartab[i];
11706 dtrace_diftype_t *t = &v->dtdv_type;
11707
11708 if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
11709 continue;
11710
11711 if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
11712 t->dtdt_size = VBDTCAST(uint32_t)dtrace_strsize_default;
11713 }
11714
11715 if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
11716 goto err;
11717
11718 dtrace_difo_init(dp, vstate);
11719 return (dp);
11720
11721err:
11722 kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
11723 kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
11724 kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
11725 kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
11726
11727 kmem_free(dp, sizeof (dtrace_difo_t));
11728 return (NULL);
11729}
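
/*
 * The static difo[] table above drives the parse: each loadable DIFO
 * sub-section type is paired with the offsets of the dtrace_difo_t buffer
 * and length fields it populates, plus its entry size and required
 * alignment.  Supporting a new sub-section is then a matter of adding a
 * table row; the copy, validation and error-unwind logic is shared.
 */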
11730
11731static dtrace_predicate_t *
11732dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
11733 cred_t *cr)
11734{
11735 dtrace_difo_t *dp;
11736
11737 if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
11738 return (NULL);
11739
11740 return (dtrace_predicate_create(dp));
11741}
11742
11743static dtrace_actdesc_t *
11744dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
11745 cred_t *cr)
11746{
11747 dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
11748 dof_actdesc_t *desc;
11749 dof_sec_t *difosec;
11750 size_t offs;
11751 uintptr_t daddr = (uintptr_t)dof;
11752 uint64_t arg;
11753 dtrace_actkind_t kind;
11754
11755 if (sec->dofs_type != DOF_SECT_ACTDESC) {
11756 dtrace_dof_error(dof, "invalid action section");
11757 return (NULL);
11758 }
11759
11760 if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
11761 dtrace_dof_error(dof, "truncated action description");
11762 return (NULL);
11763 }
11764
11765 if (sec->dofs_align != sizeof (uint64_t)) {
11766 dtrace_dof_error(dof, "bad alignment in action description");
11767 return (NULL);
11768 }
11769
11770 if (sec->dofs_size < sec->dofs_entsize) {
11771 dtrace_dof_error(dof, "section entry size exceeds total size");
11772 return (NULL);
11773 }
11774
11775 if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
11776 dtrace_dof_error(dof, "bad entry size in action description");
11777 return (NULL);
11778 }
11779
11780 if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
11781 dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
11782 return (NULL);
11783 }
11784
11785 for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
11786 desc = (dof_actdesc_t *)(daddr +
11787 (uintptr_t)sec->dofs_offset + offs);
11788 kind = (dtrace_actkind_t)desc->dofa_kind;
11789
11790 if (DTRACEACT_ISPRINTFLIKE(kind) &&
11791 (kind != DTRACEACT_PRINTA ||
11792 desc->dofa_strtab != DOF_SECIDX_NONE)) {
11793 dof_sec_t *strtab;
11794 char *str, *fmt;
11795 uint64_t i;
11796
11797 /*
11798 * printf()-like actions must have a format string.
11799 */
11800 if ((strtab = dtrace_dof_sect(dof,
11801 DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
11802 goto err;
11803
11804 str = (char *)((uintptr_t)dof +
11805 (uintptr_t)strtab->dofs_offset);
11806
11807 for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
11808 if (str[i] == '\0')
11809 break;
11810 }
11811
11812 if (i >= strtab->dofs_size) {
11813 dtrace_dof_error(dof, "bogus format string");
11814 goto err;
11815 }
11816
11817 if (i == desc->dofa_arg) {
11818 dtrace_dof_error(dof, "empty format string");
11819 goto err;
11820 }
11821
11822 i -= desc->dofa_arg;
11823 fmt = kmem_alloc(i + 1, KM_SLEEP);
11824 bcopy(&str[desc->dofa_arg], fmt, i + 1);
11825 arg = (uint64_t)(uintptr_t)fmt;
11826 } else {
11827 if (kind == DTRACEACT_PRINTA) {
11828 ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
11829 arg = 0;
11830 } else {
11831 arg = desc->dofa_arg;
11832 }
11833 }
11834
11835 act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
11836 desc->dofa_uarg, arg);
11837
11838 if (last != NULL) {
11839 last->dtad_next = act;
11840 } else {
11841 first = act;
11842 }
11843
11844 last = act;
11845
11846 if (desc->dofa_difo == DOF_SECIDX_NONE)
11847 continue;
11848
11849 if ((difosec = dtrace_dof_sect(dof,
11850 DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
11851 goto err;
11852
11853 act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);
11854
11855 if (act->dtad_difo == NULL)
11856 goto err;
11857 }
11858
11859 ASSERT(first != NULL);
11860 return (first);
11861
11862err:
11863 for (act = first; act != NULL; act = next) {
11864 next = act->dtad_next;
11865 dtrace_actdesc_release(act, vstate);
11866 }
11867
11868 return (NULL);
11869}
11870
11871static dtrace_ecbdesc_t *
11872dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
11873 cred_t *cr)
11874{
11875 dtrace_ecbdesc_t *ep;
11876 dof_ecbdesc_t *ecb;
11877 dtrace_probedesc_t *desc;
11878 dtrace_predicate_t *pred = NULL;
11879
11880 if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
11881 dtrace_dof_error(dof, "truncated ECB description");
11882 return (NULL);
11883 }
11884
11885 if (sec->dofs_align != sizeof (uint64_t)) {
11886 dtrace_dof_error(dof, "bad alignment in ECB description");
11887 return (NULL);
11888 }
11889
11890 ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
11891 sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);
11892
11893 if (sec == NULL)
11894 return (NULL);
11895
11896 ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
11897 ep->dted_uarg = ecb->dofe_uarg;
11898 desc = &ep->dted_probe;
11899
11900 if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
11901 goto err;
11902
11903 if (ecb->dofe_pred != DOF_SECIDX_NONE) {
11904 if ((sec = dtrace_dof_sect(dof,
11905 DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
11906 goto err;
11907
11908 if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
11909 goto err;
11910
11911 ep->dted_pred.dtpdd_predicate = pred;
11912 }
11913
11914 if (ecb->dofe_actions != DOF_SECIDX_NONE) {
11915 if ((sec = dtrace_dof_sect(dof,
11916 DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
11917 goto err;
11918
11919 ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);
11920
11921 if (ep->dted_action == NULL)
11922 goto err;
11923 }
11924
11925 return (ep);
11926
11927err:
11928 if (pred != NULL)
11929 dtrace_predicate_release(pred, vstate);
11930 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
11931 return (NULL);
11932}
11933
11934/*
11935 * Apply the relocations from the specified 'sec' (a DOF_SECT_URELHDR) to the
11936 * specified DOF. At present, this amounts to simply adding 'ubase' to the
11937 * site of any user SETX relocations to account for load object base address.
11938 * In the future, if we need other relocations, this function can be extended.
11939 */
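/*
 * Worked example (illustrative values): if a SETX site in the target section
 * holds the link-time value 0x1000 and the load object is mapped at
 * ubase 0x7f0000000000, the site is rewritten in place to 0x7f0000001000
 * by the '*(uint64_t *)taddr += ubase' below.
 */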
11940static int
11941dtrace_dof_relocate(dof_hdr_t *dof, dof_sec_t *sec, uint64_t ubase)
11942{
11943 uintptr_t daddr = (uintptr_t)dof;
11944 dof_relohdr_t *dofr =
11945 (dof_relohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
11946 dof_sec_t *ss, *rs, *ts;
11947 dof_relodesc_t *r;
11948 uint_t i, n;
11949
11950 if (sec->dofs_size < sizeof (dof_relohdr_t) ||
11951 sec->dofs_align != sizeof (dof_secidx_t)) {
11952 dtrace_dof_error(dof, "invalid relocation header");
11953 return (-1);
11954 }
11955
11956 ss = dtrace_dof_sect(dof, DOF_SECT_STRTAB, dofr->dofr_strtab);
11957 rs = dtrace_dof_sect(dof, DOF_SECT_RELTAB, dofr->dofr_relsec);
11958 ts = dtrace_dof_sect(dof, DOF_SECT_NONE, dofr->dofr_tgtsec);
11959
11960 if (ss == NULL || rs == NULL || ts == NULL)
11961 return (-1); /* dtrace_dof_error() has been called already */
11962
11963 if (rs->dofs_entsize < sizeof (dof_relodesc_t) ||
11964 rs->dofs_align != sizeof (uint64_t)) {
11965 dtrace_dof_error(dof, "invalid relocation section");
11966 return (-1);
11967 }
11968
11969 r = (dof_relodesc_t *)(uintptr_t)(daddr + rs->dofs_offset);
11970 n = rs->dofs_size / rs->dofs_entsize;
11971
11972 for (i = 0; i < n; i++) {
11973 uintptr_t taddr = daddr + ts->dofs_offset + r->dofr_offset;
11974
11975 switch (r->dofr_type) {
11976 case DOF_RELO_NONE:
11977 break;
11978 case DOF_RELO_SETX:
11979 if (r->dofr_offset >= ts->dofs_size || r->dofr_offset +
11980 sizeof (uint64_t) > ts->dofs_size) {
11981 dtrace_dof_error(dof, "bad relocation offset");
11982 return (-1);
11983 }
11984
11985 if (!IS_P2ALIGNED(taddr, sizeof (uint64_t))) {
11986 dtrace_dof_error(dof, "misaligned setx relo");
11987 return (-1);
11988 }
11989
11990 *(uint64_t *)taddr += ubase;
11991 break;
11992 default:
11993 dtrace_dof_error(dof, "invalid relocation type");
11994 return (-1);
11995 }
11996
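		/*
		 * Step by the table's declared entry size, which the check
		 * above guarantees is at least sizeof (dof_relodesc_t).
		 */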
11997 r = (dof_relodesc_t *)((uintptr_t)r + rs->dofs_entsize);
11998 }
11999
12000 return (0);
12001}
12002
12003/*
12004 * The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
12005 * header: it should be at the front of a memory region that is at least
12006 * sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
12007 * size. It need not be validated in any other way.
12008 */
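/*
 * A caller might therefore proceed as follows (illustrative sketch, not code
 * from this file): read in sizeof (dof_hdr_t) bytes, check that
 * dofh_loadsz >= sizeof (dof_hdr_t), read in the full dofh_loadsz bytes, and
 * only then hand the region to dtrace_dof_slurp(), which performs all
 * further validation itself.
 */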
12009static int
12010dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
12011 dtrace_enabling_t **enabp, uint64_t ubase, int noprobes)
12012{
12013 uint64_t len = dof->dofh_loadsz, seclen;
12014 uintptr_t daddr = (uintptr_t)dof;
12015 dtrace_ecbdesc_t *ep;
12016 dtrace_enabling_t *enab;
12017 uint_t i;
12018
12019 ASSERT(MUTEX_HELD(&dtrace_lock));
12020 ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));
12021
12022 /*
12023 * Check the DOF header identification bytes. In addition to checking
12024 * valid settings, we also verify that unused bits/bytes are zeroed so
12025 * we can use them later without fear of regressing existing binaries.
12026 */
12027 if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],
12028 DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {
12029 dtrace_dof_error(dof, "DOF magic string mismatch");
12030 return (-1);
12031 }
12032
12033 if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
12034 dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
12035 dtrace_dof_error(dof, "DOF has invalid data model");
12036 return (-1);
12037 }
12038
12039 if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
12040 dtrace_dof_error(dof, "DOF encoding mismatch");
12041 return (-1);
12042 }
12043
12044 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
12045 dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_2) {
12046 dtrace_dof_error(dof, "DOF version mismatch");
12047 return (-1);
12048 }
12049
12050 if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
12051 dtrace_dof_error(dof, "DOF uses unsupported instruction set");
12052 return (-1);
12053 }
12054
12055 if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
12056 dtrace_dof_error(dof, "DOF uses too many integer registers");
12057 return (-1);
12058 }
12059
12060 if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
12061 dtrace_dof_error(dof, "DOF uses too many tuple registers");
12062 return (-1);
12063 }
12064
12065 for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
12066 if (dof->dofh_ident[i] != 0) {
12067 dtrace_dof_error(dof, "DOF has invalid ident byte set");
12068 return (-1);
12069 }
12070 }
12071
12072 if (dof->dofh_flags & ~DOF_FL_VALID) {
12073 dtrace_dof_error(dof, "DOF has invalid flag bits set");
12074 return (-1);
12075 }
12076
12077 if (dof->dofh_secsize == 0) {
12078 dtrace_dof_error(dof, "zero section header size");
12079 return (-1);
12080 }
12081
12082 /*
12083 * Check that the section headers don't exceed the amount of DOF
12084 * data. Note that we cast the section size and number of sections
12085 * to uint64_t's to prevent possible overflow in the multiplication.
12086 */
12087 seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;
12088
12089 if (dof->dofh_secoff > len || seclen > len ||
12090 dof->dofh_secoff + seclen > len) {
12091 dtrace_dof_error(dof, "truncated section headers");
12092 return (-1);
12093 }
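	/*
	 * Example (illustrative values): in 32-bit arithmetic,
	 * dofh_secnum = 0x01000000 and dofh_secsize = 0x100 would multiply
	 * to 0x100000000 and wrap to 0, slipping past the length check
	 * above; the uint64_t casts keep the product at 4G and trip it.
	 */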
12094
12095 if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
12096 dtrace_dof_error(dof, "misaligned section headers");
12097 return (-1);
12098 }
12099
12100 if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
12101 dtrace_dof_error(dof, "misaligned section size");
12102 return (-1);
12103 }
12104
12105 /*
12106 * Take an initial pass through the section headers to be sure that
12107 * the headers don't have stray offsets. If the 'noprobes' flag is
12108 * set, do not permit sections relating to providers, probes, or args.
12109 */
12110 for (i = 0; i < dof->dofh_secnum; i++) {
12111 dof_sec_t *sec = (dof_sec_t *)(daddr +
12112 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12113
12114 if (noprobes) {
12115 switch (sec->dofs_type) {
12116 case DOF_SECT_PROVIDER:
12117 case DOF_SECT_PROBES:
12118 case DOF_SECT_PRARGS:
12119 case DOF_SECT_PROFFS:
12120 dtrace_dof_error(dof, "illegal sections "
12121 "for enabling");
12122 return (-1);
12123 }
12124 }
12125
12126 if (DOF_SEC_ISLOADABLE(sec->dofs_type) &&
12127 !(sec->dofs_flags & DOF_SECF_LOAD)) {
12128 dtrace_dof_error(dof, "loadable section with load "
12129 "flag unset");
12130 return (-1);
12131 }
12132
12133 if (!(sec->dofs_flags & DOF_SECF_LOAD))
12134 continue; /* just ignore non-loadable sections */
12135
12136 if (sec->dofs_align & (sec->dofs_align - 1)) {
12137 dtrace_dof_error(dof, "bad section alignment");
12138 return (-1);
12139 }
12140
12141 if (sec->dofs_offset & (sec->dofs_align - 1)) {
12142 dtrace_dof_error(dof, "misaligned section");
12143 return (-1);
12144 }
12145
12146 if (sec->dofs_offset > len || sec->dofs_size > len ||
12147 sec->dofs_offset + sec->dofs_size > len) {
12148 dtrace_dof_error(dof, "corrupt section header");
12149 return (-1);
12150 }
12151
12152 if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +
12153 sec->dofs_offset + sec->dofs_size - 1) != '\0') {
12154 dtrace_dof_error(dof, "non-terminating string table");
12155 return (-1);
12156 }
12157 }
12158
12159 /*
12160 * Take a second pass through the sections and locate and perform any
12161 * relocations that are present. We do this after the first pass to
12162 * be sure that all sections have had their headers validated.
12163 */
12164 for (i = 0; i < dof->dofh_secnum; i++) {
12165 dof_sec_t *sec = (dof_sec_t *)(daddr +
12166 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12167
12168 if (!(sec->dofs_flags & DOF_SECF_LOAD))
12169 continue; /* skip sections that are not loadable */
12170
12171 switch (sec->dofs_type) {
12172 case DOF_SECT_URELHDR:
12173 if (dtrace_dof_relocate(dof, sec, ubase) != 0)
12174 return (-1);
12175 break;
12176 }
12177 }
12178
12179 if ((enab = *enabp) == NULL)
12180 enab = *enabp = dtrace_enabling_create(vstate);
12181
12182 for (i = 0; i < dof->dofh_secnum; i++) {
12183 dof_sec_t *sec = (dof_sec_t *)(daddr +
12184 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12185
12186 if (sec->dofs_type != DOF_SECT_ECBDESC)
12187 continue;
12188
12189 if ((ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr)) == NULL) {
12190 dtrace_enabling_destroy(enab);
12191 *enabp = NULL;
12192 return (-1);
12193 }
12194
12195 dtrace_enabling_add(enab, ep);
12196 }
12197
12198 return (0);
12199}
12200
12201/*
12202 * Process DOF for any options. This routine assumes that the DOF has been
12203 * at least processed by dtrace_dof_slurp().
12204 */
12205static int
12206dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
12207{
12208 int i, rval;
12209 uint32_t entsize;
12210 size_t offs;
12211 dof_optdesc_t *desc;
12212
12213 for (i = 0; VBDTCAST(unsigned)i < dof->dofh_secnum; i++) {
12214 dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
12215 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12216
12217 if (sec->dofs_type != DOF_SECT_OPTDESC)
12218 continue;
12219
12220 if (sec->dofs_align != sizeof (uint64_t)) {
12221 dtrace_dof_error(dof, "bad alignment in "
12222 "option description");
12223 return (EINVAL);
12224 }
12225
12226 if ((entsize = sec->dofs_entsize) == 0) {
12227 dtrace_dof_error(dof, "zeroed option entry size");
12228 return (EINVAL);
12229 }
12230
12231 if (entsize < sizeof (dof_optdesc_t)) {
12232 dtrace_dof_error(dof, "bad option entry size");
12233 return (EINVAL);
12234 }
12235
12236 for (offs = 0; offs < sec->dofs_size; offs += entsize) {
12237 desc = (dof_optdesc_t *)((uintptr_t)dof +
12238 (uintptr_t)sec->dofs_offset + offs);
12239
12240 if (desc->dofo_strtab != DOF_SECIDX_NONE) {
12241 dtrace_dof_error(dof, "non-zero option string");
12242 return (EINVAL);
12243 }
12244
12245 if (desc->dofo_value == VBDTCAST(uint64_t)DTRACEOPT_UNSET) {
12246 dtrace_dof_error(dof, "unset option");
12247 return (EINVAL);
12248 }
12249
12250 if ((rval = dtrace_state_option(state,
12251 desc->dofo_option, desc->dofo_value)) != 0) {
12252 dtrace_dof_error(dof, "rejected option");
12253 return (rval);
12254 }
12255 }
12256 }
12257
12258 return (0);
12259}
12260
12261/*
12262 * DTrace Consumer State Functions
12263 */
12264VBDTSTATIC int
12265dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
12266{
12267 size_t hashsize, maxper, min, chunksize = dstate->dtds_chunksize;
12268 void *base;
12269 uintptr_t limit;
12270 dtrace_dynvar_t *dvar, *next, *start;
12271 VBDTTYPE(size_t,int) i;
12272
12273 ASSERT(MUTEX_HELD(&dtrace_lock));
12274 ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);
12275
12276 bzero(dstate, sizeof (dtrace_dstate_t));
12277
12278 if ((dstate->dtds_chunksize = chunksize) == 0)
12279 dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
12280
12281 if (size < (min = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
12282 size = min;
12283
12284 if ((base = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
12285 return (ENOMEM);
12286
12287 dstate->dtds_size = size;
12288 dstate->dtds_base = base;
12289 dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
12290 bzero(dstate->dtds_percpu, NCPU * sizeof (dtrace_dstate_percpu_t));
12291
12292 hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
12293
12294 if (hashsize != 1 && (hashsize & 1))
12295 hashsize--;
12296
12297 dstate->dtds_hashsize = hashsize;
12298 dstate->dtds_hash = dstate->dtds_base;
12299
12300 /*
12301 * Set all of our hash buckets to point to the single sink, and (if
12302 * it hasn't already been set), set the sink's hash value to be the
12303 * sink sentinel value. The sink is needed for dynamic variable
12304 * lookups to know that they have iterated over an entire, valid hash
12305 * chain.
12306 */
12307 for (i = 0; i < hashsize; i++)
12308 dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;
12309
12310 if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
12311 dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;
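	/*
	 * A lookup walking a bucket's dtdh_chain can thus detect the end of
	 * a chain by testing dtdv_hashval against DTRACE_DYNHASH_SINK rather
	 * than by testing the pointer itself.
	 */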
12312
12313 /*
12314 * Determine number of active CPUs. Divide free list evenly among
12315 * active CPUs.
12316 */
12317 start = (dtrace_dynvar_t *)
12318 ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
12319 limit = (uintptr_t)base + size;
12320
12321 maxper = (limit - (uintptr_t)start) / NCPU;
12322 maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
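	/*
	 * Example (illustrative values): with NCPU = 4, a 256-byte chunksize
	 * and 10000 bytes remaining after the hash table, maxper is 2500
	 * rounded down to 2304; CPUs 0 through 2 each receive 2304 bytes of
	 * free list, and the last CPU absorbs the remainder, per the loop
	 * below.
	 */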
12323
12324 for (i = 0; i < NCPU; i++) {
12325 dstate->dtds_percpu[i].dtdsc_free = dvar = start;
12326
12327 /*
12328 * If we don't even have enough chunks to make it once through
12329 * NCPUs, we're just going to allocate everything to the first
12330 * CPU. And if we're on the last CPU, we're going to allocate
12331 * whatever is left over. In either case, we set the limit to
12332 * be the limit of the dynamic variable space.
12333 */
12334 if (maxper == 0 || i == NCPU - 1) {
12335 limit = (uintptr_t)base + size;
12336 start = NULL;
12337 } else {
12338 limit = (uintptr_t)start + maxper;
12339 start = (dtrace_dynvar_t *)limit;
12340 }
12341
12342 ASSERT(limit <= (uintptr_t)base + size);
12343
12344 for (;;) {
12345 next = (dtrace_dynvar_t *)((uintptr_t)dvar +
12346 dstate->dtds_chunksize);
12347
12348 if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
12349 break;
12350
12351 dvar->dtdv_next = next;
12352 dvar = next;
12353 }
12354
12355 if (maxper == 0)
12356 break;
12357 }
12358
12359 return (0);
12360}
12361
12362VBDTSTATIC void
12363dtrace_dstate_fini(dtrace_dstate_t *dstate)
12364{
12365 ASSERT(MUTEX_HELD(&cpu_lock));
12366
12367 if (dstate->dtds_base == NULL)
12368 return;
12369
12370 kmem_free(dstate->dtds_base, dstate->dtds_size);
12371 kmem_cache_free(dtrace_state_cache, dstate->dtds_percpu);
12372}
12373
12374static void
12375dtrace_vstate_fini(dtrace_vstate_t *vstate)
12376{
12377 /*
12378 * Logical XOR, where are you?
12379 */
12380 ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));
12381
12382 if (vstate->dtvs_nglobals > 0) {
12383 kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
12384 sizeof (dtrace_statvar_t *));
12385 }
12386
12387 if (vstate->dtvs_ntlocals > 0) {
12388 kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
12389 sizeof (dtrace_difv_t));
12390 }
12391
12392 ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));
12393
12394 if (vstate->dtvs_nlocals > 0) {
12395 kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
12396 sizeof (dtrace_statvar_t *));
12397 }
12398}
12399
12400static void
12401dtrace_state_clean(dtrace_state_t *state)
12402{
12403 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
12404 return;
12405
12406 dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
12407 dtrace_speculation_clean(state);
12408}
12409#ifdef VBOX
12410static DECLCALLBACK(void) dtrace_state_clean_timer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
12411{
12412 dtrace_state_clean((dtrace_state_t *)pvUser);
12413 NOREF(pTimer); NOREF(iTick);
12414}
12415#endif
12416
12417static void
12418dtrace_state_deadman(dtrace_state_t *state)
12419{
12420 hrtime_t now;
12421
12422 dtrace_sync();
12423
12424 now = dtrace_gethrtime();
12425
12426 if (state != dtrace_anon.dta_state &&
12427 now - state->dts_laststatus >= dtrace_deadman_user)
12428 return;
12429
12430 /*
12431 * We must be sure that dts_alive never appears to be less than the
12432 * value upon entry to dtrace_state_deadman(), and because we lack a
12433 * dtrace_cas64(), we cannot store to it atomically. We thus instead
12434 * store INT64_MAX to it, followed by a memory barrier, followed by
12435 * the new value. This assures that dts_alive never appears to be
12436 * less than its true value, regardless of the order in which the
12437 * stores to the underlying storage are issued.
12438 */
12439 state->dts_alive = INT64_MAX;
12440 dtrace_membar_producer();
12441 state->dts_alive = now;
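	/*
	 * Concretely (the reader side here is hypothetical): a consumer
	 * sampling dts_alive concurrently can observe the old timestamp,
	 * INT64_MAX, or 'now' -- but never a value below the true one,
	 * because INT64_MAX is published before 'now' and the producer
	 * barrier orders the two stores.
	 */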
12442}
12443
12444#ifdef VBOX
12445static DECLCALLBACK(void) dtrace_state_deadman_timer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
12446{
12447 dtrace_state_deadman((dtrace_state_t *)pvUser);
12448 NOREF(pTimer); NOREF(iTick);
12449}
12450#endif
12451
12452VBDTSTATIC dtrace_state_t *
12453dtrace_state_create(dev_t *devp, cred_t *cr)
12454{
12455 minor_t minor;
12456 major_t major;
12457 char c[30];
12458 dtrace_state_t *state;
12459 dtrace_optval_t *opt;
12460 int bufsize = NCPU * sizeof (dtrace_buffer_t), i;
12461
12462 ASSERT(MUTEX_HELD(&dtrace_lock));
12463 ASSERT(MUTEX_HELD(&cpu_lock));
12464
12465 minor = (minor_t)(uintptr_t)vmem_alloc(dtrace_minor, 1,
12466 VM_BESTFIT | VM_SLEEP);
12467
12468 if (ddi_soft_state_zalloc(dtrace_softstate, minor) != DDI_SUCCESS) {
12469 vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
12470 return (NULL);
12471 }
12472
12473 state = ddi_get_soft_state(dtrace_softstate, minor);
12474 state->dts_epid = DTRACE_EPIDNONE + 1;
12475
12476 (void) snprintf(c, sizeof (c), "dtrace_aggid_%d", minor);
12477 state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
12478 NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
12479
12480 if (devp != NULL) {
12481 major = getemajor(*devp);
12482 } else {
12483 major = ddi_driver_major(dtrace_devi);
12484 }
12485
12486 state->dts_dev = makedevice(major, minor);
12487
12488 if (devp != NULL)
12489 *devp = state->dts_dev;
12490
12491 /*
12492 * We allocate NCPU buffers. On the one hand, this can be quite
12493 * a bit of memory per instance (nearly 36K on a Starcat). On the
12494 * other hand, it saves an additional memory reference in the probe
12495 * path.
12496 */
12497 state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
12498 state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
12499 state->dts_cleaner = CYCLIC_NONE;
12500 state->dts_deadman = CYCLIC_NONE;
12501 state->dts_vstate.dtvs_state = state;
12502
12503 for (i = 0; i < DTRACEOPT_MAX; i++)
12504 state->dts_options[i] = DTRACEOPT_UNSET;
12505
12506 /*
12507 * Set the default options.
12508 */
12509 opt = state->dts_options;
12510 opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
12511 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
12512 opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
12513 opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
12514 opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
12515 opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
12516 opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
12517 opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
12518 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
12519 opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
12520 opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
12521 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
12522 opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
12523 opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;
12524
12525 state->dts_activity = DTRACE_ACTIVITY_INACTIVE;
12526
12527 /*
12528 * Depending on the user credentials, we set flag bits which alter probe
12529 * visibility or the amount of destructiveness allowed. In the case of
12530 * actual anonymous tracing, or the possession of all privileges, all of
12531 * the normal checks are bypassed.
12532 */
12533 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
12534 state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
12535 state->dts_cred.dcr_action = DTRACE_CRA_ALL;
12536 } else {
12537 /*
12538 * Set up the credentials for this instantiation. We take a
12539 * hold on the credential to prevent it from disappearing on
12540 * us; this in turn prevents the zone_t referenced by this
12541 * credential from disappearing. This means that we can
12542 * examine the credential and the zone from probe context.
12543 */
12544 crhold(cr);
12545 state->dts_cred.dcr_cred = cr;
12546
12547 /*
12548 * CRA_PROC means "we have *some* privilege for dtrace" and
12549 * unlocks the use of variables like pid, zonename, etc.
12550 */
12551 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
12552 PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
12553 state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
12554 }
12555
12556 /*
12557 * dtrace_user allows use of syscall and profile providers.
12558 * If the user also has proc_owner and/or proc_zone, we
12559 * extend the scope to include additional visibility and
12560 * destructive power.
12561 */
12562 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
12563 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
12564 state->dts_cred.dcr_visible |=
12565 DTRACE_CRV_ALLPROC;
12566
12567 state->dts_cred.dcr_action |=
12568 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
12569 }
12570
12571 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
12572 state->dts_cred.dcr_visible |=
12573 DTRACE_CRV_ALLZONE;
12574
12575 state->dts_cred.dcr_action |=
12576 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
12577 }
12578
12579 /*
12580 * If we have all privs in whatever zone this is,
12581 * we can do destructive things to processes which
12582 * have altered credentials.
12583 */
12584 if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
12585 cr->cr_zone->zone_privset)) {
12586 state->dts_cred.dcr_action |=
12587 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
12588 }
12589 }
12590
12591 /*
12592 * Holding the dtrace_kernel privilege also implies that
12593 * the user has the dtrace_user privilege from a visibility
12594 * perspective. But without further privileges, some
12595 * destructive actions are not available.
12596 */
12597 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
12598 /*
12599 * Make all probes in all zones visible. However,
12600 * this doesn't mean that all actions become available
12601 * to all zones.
12602 */
12603 state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
12604 DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;
12605
12606 state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
12607 DTRACE_CRA_PROC;
12608 /*
12609 * Holding proc_owner means that destructive actions
12610 * for *this* zone are allowed.
12611 */
12612 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
12613 state->dts_cred.dcr_action |=
12614 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
12615
12616 /*
12617 * Holding proc_zone means that destructive actions
12618 * for this user/group ID in all zones is allowed.
12619 */
12620 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
12621 state->dts_cred.dcr_action |=
12622 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
12623
12624 /*
12625 * If we have all privs in whatever zone this is,
12626 * we can do destructive things to processes which
12627 * have altered credentials.
12628 */
12629 if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
12630 cr->cr_zone->zone_privset)) {
12631 state->dts_cred.dcr_action |=
12632 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
12633 }
12634 }
12635
12636 /*
12637 * Holding the dtrace_proc privilege gives control over fasttrap
12638 * and pid providers. We need to grant wider destructive
12639 * privileges in the event that the user has proc_owner and/or
12640 * proc_zone.
12641 */
12642 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
12643 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
12644 state->dts_cred.dcr_action |=
12645 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
12646
12647 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
12648 state->dts_cred.dcr_action |=
12649 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
12650 }
12651 }
12652
12653 return (state);
12654}
12655
12656static int
12657dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
12658{
12659 dtrace_optval_t *opt = state->dts_options, size;
12660 processorid_t cpu;
12661 int flags = 0, rval;
12662
12663 ASSERT(MUTEX_HELD(&dtrace_lock));
12664 ASSERT(MUTEX_HELD(&cpu_lock));
12665 ASSERT(which < DTRACEOPT_MAX);
12666 ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
12667 (state == dtrace_anon.dta_state &&
12668 state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
12669
12670 if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
12671 return (0);
12672
12673 if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
12674 cpu = opt[DTRACEOPT_CPU];
12675
12676 if (which == DTRACEOPT_SPECSIZE)
12677 flags |= DTRACEBUF_NOSWITCH;
12678
12679 if (which == DTRACEOPT_BUFSIZE) {
12680 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
12681 flags |= DTRACEBUF_RING;
12682
12683 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
12684 flags |= DTRACEBUF_FILL;
12685
12686 if (state != dtrace_anon.dta_state ||
12687 state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
12688 flags |= DTRACEBUF_INACTIVE;
12689 }
12690
12691 for (size = opt[which]; size >= VBDTCAST(dtrace_optval_t)sizeof (uint64_t); size >>= 1) {
12692 /*
12693 * The size must be 8-byte aligned. If the size is not 8-byte
12694 * aligned, drop it down by the difference.
12695 */
12696 if (size & (sizeof (uint64_t) - 1))
12697 size -= size & (sizeof (uint64_t) - 1);
12698
12699 if (size < state->dts_reserve) {
12700 /*
12701			 * Buffers must always be large enough to accommodate
12702 * their prereserved space. We return E2BIG instead
12703 * of ENOMEM in this case to allow for user-level
12704 * software to differentiate the cases.
12705 */
12706 return (E2BIG);
12707 }
12708
12709 rval = dtrace_buffer_alloc(buf, size, flags, cpu);
12710
12711 if (rval != ENOMEM) {
12712 opt[which] = size;
12713 return (rval);
12714 }
12715
12716 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
12717 return (rval);
12718 }
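	/*
	 * Example (illustrative sizes): under the default BUFRESIZE_AUTO
	 * policy, a 4m request that fails with ENOMEM is retried at 2m, 1m,
	 * 512k, ... until an allocation succeeds or the size falls below
	 * sizeof (uint64_t), whereupon ENOMEM is returned below.  Under
	 * BUFRESIZE_MANUAL, the first ENOMEM is returned as-is.
	 */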
12719
12720 return (ENOMEM);
12721}
12722
12723static int
12724dtrace_state_buffers(dtrace_state_t *state)
12725{
12726 dtrace_speculation_t *spec = state->dts_speculations;
12727 int rval, i;
12728
12729 if ((rval = dtrace_state_buffer(state, state->dts_buffer,
12730 DTRACEOPT_BUFSIZE)) != 0)
12731 return (rval);
12732
12733 if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
12734 DTRACEOPT_AGGSIZE)) != 0)
12735 return (rval);
12736
12737 for (i = 0; i < state->dts_nspeculations; i++) {
12738 if ((rval = dtrace_state_buffer(state,
12739 spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
12740 return (rval);
12741 }
12742
12743 return (0);
12744}
12745
12746static void
12747dtrace_state_prereserve(dtrace_state_t *state)
12748{
12749 dtrace_ecb_t *ecb;
12750 dtrace_probe_t *probe;
12751
12752 state->dts_reserve = 0;
12753
12754 if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
12755 return;
12756
12757 /*
12758 * If our buffer policy is a "fill" buffer policy, we need to set the
12759 * prereserved space to be the space required by the END probes.
12760 */
12761 probe = dtrace_probes[dtrace_probeid_end - 1];
12762 ASSERT(probe != NULL);
12763
12764 for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
12765 if (ecb->dte_state != state)
12766 continue;
12767
12768 state->dts_reserve += VBDTCAST(uint32_t)ecb->dte_needed + ecb->dte_alignment;
12769 }
12770}
12771
12772static int
12773dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
12774{
12775 dtrace_optval_t *opt = state->dts_options, sz, nspec;
12776 dtrace_speculation_t *spec;
12777 dtrace_buffer_t *buf;
12778#ifndef VBOX
12779 cyc_handler_t hdlr;
12780 cyc_time_t when;
12781#endif
12782 int rval = 0, i, bufsize = NCPU * sizeof (dtrace_buffer_t);
12783 dtrace_icookie_t cookie;
12784
12785 mutex_enter(&cpu_lock);
12786 mutex_enter(&dtrace_lock);
12787
12788 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
12789 rval = EBUSY;
12790 goto out;
12791 }
12792
12793 /*
12794 * Before we can perform any checks, we must prime all of the
12795 * retained enablings that correspond to this state.
12796 */
12797 dtrace_enabling_prime(state);
12798
12799 if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
12800 rval = EACCES;
12801 goto out;
12802 }
12803
12804 dtrace_state_prereserve(state);
12805
12806 /*
12807	 * What we want to do now is try to allocate our speculations.
12808 * We do not automatically resize the number of speculations; if
12809 * this fails, we will fail the operation.
12810 */
12811 nspec = opt[DTRACEOPT_NSPEC];
12812 ASSERT(nspec != DTRACEOPT_UNSET);
12813
12814 if (nspec > INT_MAX) {
12815 rval = ENOMEM;
12816 goto out;
12817 }
12818
12819 spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t), KM_NOSLEEP);
12820
12821 if (spec == NULL) {
12822 rval = ENOMEM;
12823 goto out;
12824 }
12825
12826 state->dts_speculations = spec;
12827 state->dts_nspeculations = (int)nspec;
12828
12829 for (i = 0; i < nspec; i++) {
12830 if ((buf = kmem_zalloc(bufsize, KM_NOSLEEP)) == NULL) {
12831 rval = ENOMEM;
12832 goto err;
12833 }
12834
12835 spec[i].dtsp_buffer = buf;
12836 }
12837
12838 if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
12839 if (dtrace_anon.dta_state == NULL) {
12840 rval = ENOENT;
12841 goto out;
12842 }
12843
12844 if (state->dts_necbs != 0) {
12845 rval = EALREADY;
12846 goto out;
12847 }
12848
12849 state->dts_anon = dtrace_anon_grab();
12850 ASSERT(state->dts_anon != NULL);
12851 state = state->dts_anon;
12852
12853 /*
12854 * We want "grabanon" to be set in the grabbed state, so we'll
12855 * copy that option value from the grabbing state into the
12856 * grabbed state.
12857 */
12858 state->dts_options[DTRACEOPT_GRABANON] =
12859 opt[DTRACEOPT_GRABANON];
12860
12861 *cpu = dtrace_anon.dta_beganon;
12862
12863 /*
12864 * If the anonymous state is active (as it almost certainly
12865 * is if the anonymous enabling ultimately matched anything),
12866 * we don't allow any further option processing -- but we
12867 * don't return failure.
12868 */
12869 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
12870 goto out;
12871 }
12872
12873 if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
12874 opt[DTRACEOPT_AGGSIZE] != 0) {
12875 if (state->dts_aggregations == NULL) {
12876 /*
12877 * We're not going to create an aggregation buffer
12878 * because we don't have any ECBs that contain
12879 * aggregations -- set this option to 0.
12880 */
12881 opt[DTRACEOPT_AGGSIZE] = 0;
12882 } else {
12883 /*
12884 * If we have an aggregation buffer, we must also have
12885 * a buffer to use as scratch.
12886 */
12887 if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
12888 opt[DTRACEOPT_BUFSIZE] < VBDTCAST(dtrace_optval_t)state->dts_needed) {
12889 opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
12890 }
12891 }
12892 }
12893
12894 if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
12895 opt[DTRACEOPT_SPECSIZE] != 0) {
12896 if (!state->dts_speculates) {
12897 /*
12898 * We're not going to create speculation buffers
12899 * because we don't have any ECBs that actually
12900 * speculate -- set the speculation size to 0.
12901 */
12902 opt[DTRACEOPT_SPECSIZE] = 0;
12903 }
12904 }
12905
12906 /*
12907 * The bare minimum size for any buffer that we're actually going to
12908 * do anything to is sizeof (uint64_t).
12909 */
12910 sz = sizeof (uint64_t);
12911
12912 if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
12913 (state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
12914 (state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
12915 /*
12916 * A buffer size has been explicitly set to 0 (or to a size
12917 * that will be adjusted to 0) and we need the space -- we
12918 * need to return failure. We return ENOSPC to differentiate
12919 * it from failing to allocate a buffer due to failure to meet
12920 * the reserve (for which we return E2BIG).
12921 */
12922 rval = ENOSPC;
12923 goto out;
12924 }
12925
12926 if ((rval = dtrace_state_buffers(state)) != 0)
12927 goto err;
12928
12929 if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
12930 sz = dtrace_dstate_defsize;
12931
12932 do {
12933 rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);
12934
12935 if (rval == 0)
12936 break;
12937
12938 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
12939 goto err;
12940 } while (sz >>= 1);
12941
12942 opt[DTRACEOPT_DYNVARSIZE] = sz;
12943
12944 if (rval != 0)
12945 goto err;
12946
12947 if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
12948 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;
12949
12950 if (opt[DTRACEOPT_CLEANRATE] == 0)
12951 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
12952
12953 if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
12954 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;
12955
12956 if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
12957 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
12958
12959#ifndef VBOX
12960 hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
12961 hdlr.cyh_arg = state;
12962 hdlr.cyh_level = CY_LOW_LEVEL;
12963
12964 when.cyt_when = 0;
12965 when.cyt_interval = opt[DTRACEOPT_CLEANRATE];
12966
12967 state->dts_cleaner = cyclic_add(&hdlr, &when);
12968
12969 hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
12970 hdlr.cyh_arg = state;
12971 hdlr.cyh_level = CY_LOW_LEVEL;
12972
12973 when.cyt_when = 0;
12974 when.cyt_interval = dtrace_deadman_interval;
12975
12976 state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
12977 state->dts_deadman = cyclic_add(&hdlr, &when);
12978#else /* VBOX */
12979
12980 rval = RTTimerCreateEx(&state->dts_cleaner, opt[DTRACEOPT_CLEANRATE],
12981 RTTIMER_FLAGS_CPU_ANY, dtrace_state_clean_timer, state);
12982 if (RT_FAILURE(rval)) {
12983 rval = RTErrConvertToErrno(rval);
12984 goto err;
12985 }
12986
12987 state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
12988 rval = RTTimerCreateEx(&state->dts_deadman, dtrace_deadman_interval,
12989 RTTIMER_FLAGS_CPU_ANY, dtrace_state_deadman_timer, state);
12990 if (RT_FAILURE(rval)) {
12991 RTTimerDestroy(state->dts_cleaner);
12992 state->dts_cleaner = CYCLIC_NONE;
12993 state->dts_deadman = CYCLIC_NONE;
12994 rval = RTErrConvertToErrno(rval);
12995 goto err;
12996 }
12997#endif /* VBOX */
12998
12999 state->dts_activity = DTRACE_ACTIVITY_WARMUP;
13000
13001 /*
13002 * Now it's time to actually fire the BEGIN probe. We need to disable
13003 * interrupts here both to record the CPU on which we fired the BEGIN
13004 * probe (the data from this CPU will be processed first at user
13005 * level) and to manually activate the buffer for this CPU.
13006 */
13007 cookie = dtrace_interrupt_disable();
13008 *cpu = VBDT_GET_CPUID();
13009 ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
13010 state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
13011
13012 dtrace_probe(dtrace_probeid_begin,
13013 (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
13014 dtrace_interrupt_enable(cookie);
13015 /*
13016 * We may have had an exit action from a BEGIN probe; only change our
13017 * state to ACTIVE if we're still in WARMUP.
13018 */
13019 ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP ||
13020 state->dts_activity == DTRACE_ACTIVITY_DRAINING);
13021
13022 if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
13023 state->dts_activity = DTRACE_ACTIVITY_ACTIVE;
13024
13025 /*
13026	 * Regardless of whether we're now in ACTIVE or DRAINING, we
13027 * want each CPU to transition its principal buffer out of the
13028 * INACTIVE state. Doing this assures that no CPU will suddenly begin
13029 * processing an ECB halfway down a probe's ECB chain; all CPUs will
13030 * atomically transition from processing none of a state's ECBs to
13031 * processing all of them.
13032 */
13033 dtrace_xcall(DTRACE_CPUALL,
13034 (dtrace_xcall_t)dtrace_buffer_activate, state);
13035 goto out;
13036
13037err:
13038 dtrace_buffer_free(state->dts_buffer);
13039 dtrace_buffer_free(state->dts_aggbuffer);
13040
13041 if ((nspec = state->dts_nspeculations) == 0) {
13042 ASSERT(state->dts_speculations == NULL);
13043 goto out;
13044 }
13045
13046 spec = state->dts_speculations;
13047 ASSERT(spec != NULL);
13048
13049 for (i = 0; i < state->dts_nspeculations; i++) {
13050 if ((buf = spec[i].dtsp_buffer) == NULL)
13051 break;
13052
13053 dtrace_buffer_free(buf);
13054 kmem_free(buf, bufsize);
13055 }
13056
13057 kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
13058 state->dts_nspeculations = 0;
13059 state->dts_speculations = NULL;
13060
13061out:
13062 mutex_exit(&dtrace_lock);
13063 mutex_exit(&cpu_lock);
13064
13065 return (rval);
13066}
13067
13068static int
13069dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu)
13070{
13071 dtrace_icookie_t cookie;
13072
13073 ASSERT(MUTEX_HELD(&dtrace_lock));
13074
13075 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
13076 state->dts_activity != DTRACE_ACTIVITY_DRAINING)
13077 return (EINVAL);
13078
13079 /*
13080 * We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
13081 * to be sure that every CPU has seen it. See below for the details
13082 * on why this is done.
13083 */
13084 state->dts_activity = DTRACE_ACTIVITY_DRAINING;
13085 dtrace_sync();
13086
13087 /*
13088 * By this point, it is impossible for any CPU to be still processing
13089 * with DTRACE_ACTIVITY_ACTIVE. We can thus set our activity to
13090 * DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
13091 * other CPU in dtrace_buffer_reserve(). This allows dtrace_probe()
13092 * and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
13093 * iff we're in the END probe.
13094 */
13095 state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
13096 dtrace_sync();
13097 ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);
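	/*
	 * To summarize the progression effected here: ACTIVE -> DRAINING ->
	 * dtrace_sync() -> COOLDOWN -> dtrace_sync() -> END probe ->
	 * STOPPED, with each sync guaranteeing that no CPU still observes
	 * the prior activity state from probe context.
	 */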
13098
13099 /*
13100 * Finally, we can release the reserve and call the END probe. We
13101 * disable interrupts across calling the END probe to allow us to
13102 * return the CPU on which we actually called the END probe. This
13103 * allows user-land to be sure that this CPU's principal buffer is
13104 * processed last.
13105 */
13106 state->dts_reserve = 0;
13107
13108 cookie = dtrace_interrupt_disable();
13109 *cpu = VBDT_GET_CPUID();
13110 dtrace_probe(dtrace_probeid_end,
13111 (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
13112 dtrace_interrupt_enable(cookie);
13113
13114 state->dts_activity = DTRACE_ACTIVITY_STOPPED;
13115 dtrace_sync();
13116
13117 return (0);
13118}
13119
13120static int
13121dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
13122 dtrace_optval_t val)
13123{
13124 ASSERT(MUTEX_HELD(&dtrace_lock));
13125
13126 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
13127 return (EBUSY);
13128
13129 if (option >= DTRACEOPT_MAX)
13130 return (EINVAL);
13131
13132 if (option != DTRACEOPT_CPU && val < 0)
13133 return (EINVAL);
13134
13135 switch (option) {
13136 case DTRACEOPT_DESTRUCTIVE:
13137 if (dtrace_destructive_disallow)
13138 return (EACCES);
13139
13140 state->dts_cred.dcr_destructive = 1;
13141 break;
13142
13143 case DTRACEOPT_BUFSIZE:
13144 case DTRACEOPT_DYNVARSIZE:
13145 case DTRACEOPT_AGGSIZE:
13146 case DTRACEOPT_SPECSIZE:
13147 case DTRACEOPT_STRSIZE:
13148 if (val < 0)
13149 return (EINVAL);
13150
13151 if (val >= LONG_MAX) {
13152 /*
13153 * If this is an otherwise negative value, set it to
13154 * the highest multiple of 128m less than LONG_MAX.
13155 * Technically, we're adjusting the size without
13156 * regard to the buffer resizing policy, but in fact,
13157 * this has no effect -- if we set the buffer size to
13158 * ~LONG_MAX and the buffer policy is ultimately set to
13159 * be "manual", the buffer allocation is guaranteed to
13160 * fail, if only because the allocation requires two
13161			 * buffers. (We set the size to the highest
13162 * multiple of 128m because it ensures that the size
13163 * will remain a multiple of a megabyte when
13164 * repeatedly halved -- all the way down to 15m.)
13165 */
13166 val = LONG_MAX - (1 << 27) + 1;
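			/*
			 * (On an LP64 system this is 2^63 - 2^27, the largest
			 * multiple of 128m below LONG_MAX.)
			 */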
13167 }
13168 }
13169
13170 state->dts_options[option] = val;
13171
13172 return (0);
13173}
13174
13175static void
13176dtrace_state_destroy(dtrace_state_t *state)
13177{
13178 dtrace_ecb_t *ecb;
13179 dtrace_vstate_t *vstate = &state->dts_vstate;
13180 minor_t minor = getminor(state->dts_dev);
13181 int i, bufsize = NCPU * sizeof (dtrace_buffer_t);
13182 dtrace_speculation_t *spec = state->dts_speculations;
13183 int nspec = state->dts_nspeculations;
13184 uint32_t match;
13185
13186 ASSERT(MUTEX_HELD(&dtrace_lock));
13187 ASSERT(MUTEX_HELD(&cpu_lock));
13188
13189 /*
13190 * First, retract any retained enablings for this state.
13191 */
13192 dtrace_enabling_retract(state);
13193 ASSERT(state->dts_nretained == 0);
13194
13195 if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
13196 state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
13197 /*
13198 * We have managed to come into dtrace_state_destroy() on a
13199 * hot enabling -- almost certainly because of a disorderly
13200 * shutdown of a consumer. (That is, a consumer that is
13201 * exiting without having called dtrace_stop().) In this case,
13202 * we're going to set our activity to be KILLED, and then
13203 * issue a sync to be sure that everyone is out of probe
13204 * context before we start blowing away ECBs.
13205 */
13206 state->dts_activity = DTRACE_ACTIVITY_KILLED;
13207 dtrace_sync();
13208 }
13209
13210 /*
13211 * Release the credential hold we took in dtrace_state_create().
13212 */
13213 if (state->dts_cred.dcr_cred != NULL)
13214 crfree(state->dts_cred.dcr_cred);
13215
13216 /*
13217 * Now we can safely disable and destroy any enabled probes. Because
13218 * any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
13219 * (especially if they're all enabled), we take two passes through the
13220 * ECBs: in the first, we disable just DTRACE_PRIV_KERNEL probes, and
13221 * in the second we disable whatever is left over.
13222 */
13223 for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
13224 for (i = 0; i < state->dts_necbs; i++) {
13225 if ((ecb = state->dts_ecbs[i]) == NULL)
13226 continue;
13227
13228 if (match && ecb->dte_probe != NULL) {
13229 dtrace_probe_t *probe = ecb->dte_probe;
13230 dtrace_provider_t *prov = probe->dtpr_provider;
13231
13232 if (!(prov->dtpv_priv.dtpp_flags & match))
13233 continue;
13234 }
13235
13236 dtrace_ecb_disable(ecb);
13237 dtrace_ecb_destroy(ecb);
13238 }
13239
13240 if (!match)
13241 break;
13242 }
13243
13244 /*
13245 * Before we free the buffers, perform one more sync to assure that
13246 * every CPU is out of probe context.
13247 */
13248 dtrace_sync();
13249
13250 dtrace_buffer_free(state->dts_buffer);
13251 dtrace_buffer_free(state->dts_aggbuffer);
13252
13253 for (i = 0; i < nspec; i++)
13254 dtrace_buffer_free(spec[i].dtsp_buffer);
13255
13256 if (state->dts_cleaner != CYCLIC_NONE)
13257 cyclic_remove(state->dts_cleaner);
13258
13259 if (state->dts_deadman != CYCLIC_NONE)
13260 cyclic_remove(state->dts_deadman);
13261
13262 dtrace_dstate_fini(&vstate->dtvs_dynvars);
13263 dtrace_vstate_fini(vstate);
13264 kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));
13265
13266 if (state->dts_aggregations != NULL) {
13267#ifdef DEBUG
13268 for (i = 0; i < state->dts_naggregations; i++)
13269 ASSERT(state->dts_aggregations[i] == NULL);
13270#endif
13271 ASSERT(state->dts_naggregations > 0);
13272 kmem_free(state->dts_aggregations,
13273 state->dts_naggregations * sizeof (dtrace_aggregation_t *));
13274 }
13275
13276 kmem_free(state->dts_buffer, bufsize);
13277 kmem_free(state->dts_aggbuffer, bufsize);
13278
13279 for (i = 0; i < nspec; i++)
13280 kmem_free(spec[i].dtsp_buffer, bufsize);
13281
13282 kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
13283
13284 dtrace_format_destroy(state);
13285
13286 vmem_destroy(state->dts_aggid_arena);
13287 ddi_soft_state_free(dtrace_softstate, minor);
13288 vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
13289}
13290
13291/*
13292 * DTrace Anonymous Enabling Functions
13293 */
13294static dtrace_state_t *
13295dtrace_anon_grab(void)
13296{
13297 dtrace_state_t *state;
13298
13299 ASSERT(MUTEX_HELD(&dtrace_lock));
13300
13301 if ((state = dtrace_anon.dta_state) == NULL) {
13302 ASSERT(dtrace_anon.dta_enabling == NULL);
13303 return (NULL);
13304 }
13305
13306 ASSERT(dtrace_anon.dta_enabling != NULL);
13307 ASSERT(dtrace_retained != NULL);
13308
13309 dtrace_enabling_destroy(dtrace_anon.dta_enabling);
13310 dtrace_anon.dta_enabling = NULL;
13311 dtrace_anon.dta_state = NULL;
13312
13313 return (state);
13314}
13315
13316static void
13317dtrace_anon_property(void)
13318{
13319 int i, rv;
13320 dtrace_state_t *state;
13321 dof_hdr_t *dof;
13322 char c[32]; /* enough for "dof-data-" + digits */
13323
13324 ASSERT(MUTEX_HELD(&dtrace_lock));
13325 ASSERT(MUTEX_HELD(&cpu_lock));
13326
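	/*
	 * Anonymous DOF arrives as a series of properties named
	 * "dof-data-0", "dof-data-1", ...; consume them in order until a
	 * lookup fails.
	 */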
13327 for (i = 0; ; i++) {
13328 (void) snprintf(c, sizeof (c), "dof-data-%d", i);
13329
13330 dtrace_err_verbose = 1;
13331
13332 if ((dof = dtrace_dof_property(c)) == NULL) {
13333 dtrace_err_verbose = 0;
13334 break;
13335 }
13336
13337#ifndef VBOX
13338 /*
13339 * We want to create anonymous state, so we need to transition
13340 * the kernel debugger to indicate that DTrace is active. If
13341 * this fails (e.g. because the debugger has modified text in
13342 * some way), we won't continue with the processing.
13343 */
13344 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
13345 cmn_err(CE_NOTE, "kernel debugger active; anonymous "
13346 "enabling ignored.");
13347 dtrace_dof_destroy(dof);
13348 break;
13349 }
13350#endif
13351
13352 /*
13353 * If we haven't allocated an anonymous state, we'll do so now.
13354 */
13355 if ((state = dtrace_anon.dta_state) == NULL) {
13356 state = dtrace_state_create(NULL, NULL);
13357 dtrace_anon.dta_state = state;
13358
13359 if (state == NULL) {
13360 /*
13361 * This basically shouldn't happen: the only
13362 * failure mode from dtrace_state_create() is a
13363 * failure of ddi_soft_state_zalloc() that
13364 * itself should never happen. Still, the
13365 * interface allows for a failure mode, and
13366 * we want to fail as gracefully as possible:
13367 * we'll emit an error message and cease
13368 * processing anonymous state in this case.
13369 */
13370 cmn_err(CE_WARN, "failed to create "
13371 "anonymous state");
13372 dtrace_dof_destroy(dof);
13373 break;
13374 }
13375 }
13376
13377 rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
13378 &dtrace_anon.dta_enabling, 0, B_TRUE);
13379
13380 if (rv == 0)
13381 rv = dtrace_dof_options(dof, state);
13382
13383 dtrace_err_verbose = 0;
13384 dtrace_dof_destroy(dof);
13385
13386 if (rv != 0) {
13387 /*
13388 * This is malformed DOF; chuck any anonymous state
13389 * that we created.
13390 */
13391 ASSERT(dtrace_anon.dta_enabling == NULL);
13392 dtrace_state_destroy(state);
13393 dtrace_anon.dta_state = NULL;
13394 break;
13395 }
13396
13397 ASSERT(dtrace_anon.dta_enabling != NULL);
13398 }
13399
13400 if (dtrace_anon.dta_enabling != NULL) {
13401 int rval;
13402
13403 /*
13404 * dtrace_enabling_retain() can only fail because we are
13405 * trying to retain more enablings than are allowed -- but
13406 * we only have one anonymous enabling, and we are guaranteed
13407 * to be allowed at least one retained enabling; we assert
13408 * that dtrace_enabling_retain() returns success.
13409 */
13410 rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
13411 ASSERT(rval == 0);
13412
13413 dtrace_enabling_dump(dtrace_anon.dta_enabling);
13414 }
13415}
13416
13417/*
13418 * DTrace Helper Functions
13419 */
13420static void
13421dtrace_helper_trace(dtrace_helper_action_t *helper,
13422 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
13423{
13424 uint32_t size, next, nnext, i;
13425 dtrace_helptrace_t *ent;
13426 uint16_t flags = cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags;
13427
13428 if (!dtrace_helptrace_enabled)
13429 return;
13430
13431 ASSERT(vstate->dtvs_nlocals <= VBDTCAST(int32_t)dtrace_helptrace_nlocals);
13432
13433 /*
13434 * What would a tracing framework be without its own tracing
13435 * framework? (Well, a hell of a lot simpler, for starters...)
13436 */
13437 size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
13438 sizeof (uint64_t) - sizeof (uint64_t);
13439
13440 /*
13441 * Iterate until we can allocate a slot in the trace buffer.
13442 */
13443 do {
13444 next = dtrace_helptrace_next;
13445
13446 if (next + size < VBDTCAST(unsigned)dtrace_helptrace_bufsize) {
13447 nnext = next + size;
13448 } else {
13449 nnext = size;
13450 }
13451 } while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);
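	/*
	 * If the record would have run off the end of the buffer, nnext was
	 * set to 'size' above; the test below detects that case and wraps
	 * the record around to offset 0.
	 */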
13452
13453 /*
13454 * We have our slot; fill it in.
13455 */
13456 if (nnext == size)
13457 next = 0;
13458
13459 ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next];
13460 ent->dtht_helper = helper;
13461 ent->dtht_where = where;
13462 ent->dtht_nlocals = vstate->dtvs_nlocals;
13463
13464 ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
13465 mstate->dtms_fltoffs : -1;
13466 ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
13467 ent->dtht_illval = cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_illval;
13468
13469 for (i = 0; VBDTCAST(int32_t)i < vstate->dtvs_nlocals; i++) {
13470 dtrace_statvar_t *svar;
13471
13472 if ((svar = vstate->dtvs_locals[i]) == NULL)
13473 continue;
13474
13475 ASSERT(svar->dtsv_size >= NCPU * sizeof (uint64_t));
13476 ent->dtht_locals[i] =
13477 ((uint64_t *)(uintptr_t)svar->dtsv_data)[VBDT_GET_CPUID()];
13478 }
13479}
13480
13481static uint64_t
13482dtrace_helper(int which, dtrace_mstate_t *mstate,
13483 dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
13484{
13485 VBDTTYPE(uint16_t volatile *, uint16_t *)flags = &cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags;
13486 uint64_t sarg0 = mstate->dtms_arg[0];
13487 uint64_t sarg1 = mstate->dtms_arg[1];
13488 uint64_t rval VBDTUNASS(666);
13489 dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
13490 dtrace_helper_action_t *helper;
13491 dtrace_vstate_t *vstate;
13492 dtrace_difo_t *pred;
13493 int i, trace = dtrace_helptrace_enabled;
13494
13495 ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
13496
13497 if (helpers == NULL)
13498 return (0);
13499
13500 if ((helper = helpers->dthps_actions[which]) == NULL)
13501 return (0);
13502
13503 vstate = &helpers->dthps_vstate;
13504 mstate->dtms_arg[0] = arg0;
13505 mstate->dtms_arg[1] = arg1;
13506
13507 /*
13508 * Now iterate over each helper. If its predicate evaluates to 'true',
13509 * we'll call the corresponding actions. Note that the below calls
13510 * to dtrace_dif_emulate() may set faults in machine state. This is
13511 * okay: our caller (the outer dtrace_dif_emulate()) will simply plow
13512 * the stored DIF offset with its own (which is the desired behavior).
13513 * Also, note the calls to dtrace_dif_emulate() may allocate scratch
13514 * from machine state; this is okay, too.
13515 */
13516 for (; helper != NULL; helper = helper->dtha_next) {
13517 if ((pred = helper->dtha_predicate) != NULL) {
13518 if (trace)
13519 dtrace_helper_trace(helper, mstate, vstate, 0);
13520
13521 if (!dtrace_dif_emulate(pred, mstate, vstate, state))
13522 goto next;
13523
13524 if (*flags & CPU_DTRACE_FAULT)
13525 goto err;
13526 }
13527
13528 for (i = 0; i < helper->dtha_nactions; i++) {
13529 if (trace)
13530 dtrace_helper_trace(helper,
13531 mstate, vstate, i + 1);
13532
13533 rval = dtrace_dif_emulate(helper->dtha_actions[i],
13534 mstate, vstate, state);
13535
13536 if (*flags & CPU_DTRACE_FAULT)
13537 goto err;
13538 }
13539
13540next:
13541 if (trace)
13542 dtrace_helper_trace(helper, mstate, vstate,
13543 DTRACE_HELPTRACE_NEXT);
13544 }
13545
13546 if (trace)
13547 dtrace_helper_trace(helper, mstate, vstate,
13548 DTRACE_HELPTRACE_DONE);
13549
13550 /*
13551 * Restore the arg0 that we saved upon entry.
13552 */
13553 mstate->dtms_arg[0] = sarg0;
13554 mstate->dtms_arg[1] = sarg1;
13555
13556 return (rval);
13557
13558err:
13559 if (trace)
13560 dtrace_helper_trace(helper, mstate, vstate,
13561 DTRACE_HELPTRACE_ERR);
13562
13563 /*
13564 * Restore the arg0 that we saved upon entry.
13565 */
13566 mstate->dtms_arg[0] = sarg0;
13567 mstate->dtms_arg[1] = sarg1;
13568
13569	return (0);
13570}
13571
13572static void
13573dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
13574 dtrace_vstate_t *vstate)
13575{
13576 int i;
13577
13578 if (helper->dtha_predicate != NULL)
13579 dtrace_difo_release(helper->dtha_predicate, vstate);
13580
13581 for (i = 0; i < helper->dtha_nactions; i++) {
13582 ASSERT(helper->dtha_actions[i] != NULL);
13583 dtrace_difo_release(helper->dtha_actions[i], vstate);
13584 }
13585
13586 kmem_free(helper->dtha_actions,
13587 helper->dtha_nactions * sizeof (dtrace_difo_t *));
13588 kmem_free(helper, sizeof (dtrace_helper_action_t));
13589}
13590
13591static int
13592dtrace_helper_destroygen(int gen)
13593{
13594 proc_t *p = curproc;
13595 dtrace_helpers_t *help = p->p_dtrace_helpers;
13596 dtrace_vstate_t *vstate;
13597 VBDTTYPE(uint_t,int) i;
13598
13599 ASSERT(MUTEX_HELD(&dtrace_lock));
13600
13601 if (help == NULL || gen > help->dthps_generation)
13602 return (EINVAL);
13603
13604 vstate = &help->dthps_vstate;
13605
13606 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
13607 dtrace_helper_action_t *last = NULL, *h, *next;
13608
13609 for (h = help->dthps_actions[i]; h != NULL; h = next) {
13610 next = h->dtha_next;
13611
13612 if (h->dtha_generation == gen) {
13613 if (last != NULL) {
13614 last->dtha_next = next;
13615 } else {
13616 help->dthps_actions[i] = next;
13617 }
13618			 * for this user/group ID in all zones are allowed.
13619 dtrace_helper_action_destroy(h, vstate);
13620 } else {
13621 last = h;
13622 }
13623 }
13624 }
13625
13626 /*
13627	 * Iterate until we've cleared out all helper providers with the
13628 * given generation number.
13629 */
13630 for (;;) {
13631 dtrace_helper_provider_t *prov;
13632
13633 /*
13634 * Look for a helper provider with the right generation. We
13635 * have to start back at the beginning of the list each time
13636 * because we drop dtrace_lock. It's unlikely that we'll make
13637 * more than two passes.
13638 */
13639 for (i = 0; i < help->dthps_nprovs; i++) {
13640 prov = help->dthps_provs[i];
13641
13642 if (prov->dthp_generation == gen)
13643 break;
13644 }
13645
13646 /*
13647 * If there were no matches, we're done.
13648 */
13649 if (i == help->dthps_nprovs)
13650 break;
13651
13652 /*
13653 * Move the last helper provider into this slot.
13654 */
13655 help->dthps_nprovs--;
13656 help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
13657 help->dthps_provs[help->dthps_nprovs] = NULL;
13658
13659 mutex_exit(&dtrace_lock);
13660
13661 /*
13662 * If we have a meta provider, remove this helper provider.
13663 */
13664 mutex_enter(&dtrace_meta_lock);
13665 if (dtrace_meta_pid != NULL) {
13666 ASSERT(dtrace_deferred_pid == NULL);
13667 dtrace_helper_provider_remove(&prov->dthp_prov,
13668 p->p_pid);
13669 }
13670 mutex_exit(&dtrace_meta_lock);
13671
13672 dtrace_helper_provider_destroy(prov);
13673
13674 mutex_enter(&dtrace_lock);
13675 }
13676
13677 return (0);
13678}
13679
13680static int
13681dtrace_helper_validate(dtrace_helper_action_t *helper)
13682{
13683 int err = 0, i;
13684 dtrace_difo_t *dp;
13685
13686 if ((dp = helper->dtha_predicate) != NULL)
13687 err += dtrace_difo_validate_helper(dp);
13688
13689 for (i = 0; i < helper->dtha_nactions; i++)
13690 err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
13691
13692 return (err == 0);
13693}
13694
13695static int
13696dtrace_helper_action_add(int which, dtrace_ecbdesc_t *ep)
13697{
13698 dtrace_helpers_t *help;
13699 dtrace_helper_action_t *helper, *last;
13700 dtrace_actdesc_t *act;
13701 dtrace_vstate_t *vstate;
13702 dtrace_predicate_t *pred;
13703 int count = 0, nactions = 0, i;
13704
13705 if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
13706 return (EINVAL);
13707
13708 help = curproc->p_dtrace_helpers;
13709 last = help->dthps_actions[which];
13710 vstate = &help->dthps_vstate;
13711
13712 for (count = 0; last != NULL; last = last->dtha_next) {
13713 count++;
13714 if (last->dtha_next == NULL)
13715 break;
13716 }
13717
13718 /*
13719 * If we already have dtrace_helper_actions_max helper actions for this
13720 * helper action type, we'll refuse to add a new one.
13721 */
13722 if (count >= dtrace_helper_actions_max)
13723 return (ENOSPC);
13724
13725 helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
13726 helper->dtha_generation = help->dthps_generation;
13727
13728 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
13729 ASSERT(pred->dtp_difo != NULL);
13730 dtrace_difo_hold(pred->dtp_difo);
13731 helper->dtha_predicate = pred->dtp_difo;
13732 }
13733
13734 for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
13735 if (act->dtad_kind != DTRACEACT_DIFEXPR)
13736 goto err;
13737
13738 if (act->dtad_difo == NULL)
13739 goto err;
13740
13741 nactions++;
13742 }
13743
13744 helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
13745 (helper->dtha_nactions = nactions), KM_SLEEP);
13746
13747 for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
13748 dtrace_difo_hold(act->dtad_difo);
13749 helper->dtha_actions[i++] = act->dtad_difo;
13750 }
13751
13752 if (!dtrace_helper_validate(helper))
13753 goto err;
13754
13755 if (last == NULL) {
13756 help->dthps_actions[which] = helper;
13757 } else {
13758 last->dtha_next = helper;
13759 }
13760
13761 if (vstate->dtvs_nlocals > VBDTCAST(int32_t)dtrace_helptrace_nlocals) {
13762 dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
13763 dtrace_helptrace_next = 0;
13764 }
13765
13766 return (0);
13767err:
13768 dtrace_helper_action_destroy(helper, vstate);
13769 return (EINVAL);
13770}
13771
13772static void
13773dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
13774 dof_helper_t *dofhp)
13775{
13776 ASSERT(MUTEX_NOT_HELD(&dtrace_lock));
13777
13778 mutex_enter(&dtrace_meta_lock);
13779 mutex_enter(&dtrace_lock);
13780
13781 if (!dtrace_attached() || dtrace_meta_pid == NULL) {
13782 /*
13783 * If the dtrace module is loaded but not attached, or if
13784		 * there isn't a meta provider registered to deal with
13785 * these provider descriptions, we need to postpone creating
13786 * the actual providers until later.
13787 */
13788
13789 if (help->dthps_next == NULL && help->dthps_prev == NULL &&
13790 dtrace_deferred_pid != help) {
13791 help->dthps_deferred = 1;
13792 help->dthps_pid = p->p_pid;
13793 help->dthps_next = dtrace_deferred_pid;
13794 help->dthps_prev = NULL;
13795 if (dtrace_deferred_pid != NULL)
13796 dtrace_deferred_pid->dthps_prev = help;
13797 dtrace_deferred_pid = help;
13798 }
13799
13800 mutex_exit(&dtrace_lock);
13801
13802 } else if (dofhp != NULL) {
13803 /*
13804 * If the dtrace module is loaded and we have a particular
13805 * helper provider description, pass that off to the
13806 * meta provider.
13807 */
13808
13809 mutex_exit(&dtrace_lock);
13810
13811 dtrace_helper_provide(dofhp, p->p_pid);
13812
13813 } else {
13814 /*
13815 * Otherwise, just pass all the helper provider descriptions
13816 * off to the meta provider.
13817 */
13818
13819 VBDTTYPE(uint_t,int) i;
13820 mutex_exit(&dtrace_lock);
13821
13822 for (i = 0; i < help->dthps_nprovs; i++) {
13823 dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
13824 p->p_pid);
13825 }
13826 }
13827
13828 mutex_exit(&dtrace_meta_lock);
13829}
13830
13831static int
13832dtrace_helper_provider_add(dof_helper_t *dofhp, int gen)
13833{
13834 dtrace_helpers_t *help;
13835 dtrace_helper_provider_t *hprov, **tmp_provs;
13836 uint_t tmp_maxprovs, i;
13837
13838 ASSERT(MUTEX_HELD(&dtrace_lock));
13839
13840 help = curproc->p_dtrace_helpers;
13841 ASSERT(help != NULL);
13842
13843 /*
13844 * If we already have dtrace_helper_providers_max helper providers,
13845	 * we refuse to add a new one.
13846 */
13847 if (help->dthps_nprovs >= dtrace_helper_providers_max)
13848 return (ENOSPC);
13849
13850 /*
13851 * Check to make sure this isn't a duplicate.
13852 */
13853 for (i = 0; i < help->dthps_nprovs; i++) {
13854 if (dofhp->dofhp_addr ==
13855 help->dthps_provs[i]->dthp_prov.dofhp_addr)
13856 return (EALREADY);
13857 }
13858
13859 hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
13860 hprov->dthp_prov = *dofhp;
13861 hprov->dthp_ref = 1;
13862 hprov->dthp_generation = gen;
13863
13864 /*
13865 * Allocate a bigger table for helper providers if it's already full.
13866 */
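	/*
	 * (Illustrative sketch, not from the original source: the table
	 * doubles on each growth, so assuming a dtrace_helper_providers_max
	 * of, say, 32, it grows 0 -> 2 -> 4 -> 8 -> 16 -> 32 and is then
	 * clamped -- at most a handful of allocations per process.)
	 */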
13867 if (help->dthps_maxprovs == help->dthps_nprovs) {
13868 tmp_maxprovs = help->dthps_maxprovs;
13869 tmp_provs = help->dthps_provs;
13870
13871 if (help->dthps_maxprovs == 0)
13872 help->dthps_maxprovs = 2;
13873 else
13874 help->dthps_maxprovs *= 2;
13875 if (help->dthps_maxprovs > dtrace_helper_providers_max)
13876 help->dthps_maxprovs = dtrace_helper_providers_max;
13877
13878 ASSERT(tmp_maxprovs < help->dthps_maxprovs);
13879
13880 help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
13881 sizeof (dtrace_helper_provider_t *), KM_SLEEP);
13882
13883 if (tmp_provs != NULL) {
13884 bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *
13885 sizeof (dtrace_helper_provider_t *));
13886 kmem_free(tmp_provs, tmp_maxprovs *
13887 sizeof (dtrace_helper_provider_t *));
13888 }
13889 }
13890
13891 help->dthps_provs[help->dthps_nprovs] = hprov;
13892 help->dthps_nprovs++;
13893
13894 return (0);
13895}
13896
13897static void
13898dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
13899{
13900 mutex_enter(&dtrace_lock);
13901
13902 if (--hprov->dthp_ref == 0) {
13903 dof_hdr_t *dof;
13904 mutex_exit(&dtrace_lock);
13905 dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
13906 dtrace_dof_destroy(dof);
13907 kmem_free(hprov, sizeof (dtrace_helper_provider_t));
13908 } else {
13909 mutex_exit(&dtrace_lock);
13910 }
13911}
13912
13913static int
13914dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)
13915{
13916 uintptr_t daddr = (uintptr_t)dof;
13917 dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
13918 dof_provider_t *provider;
13919 dof_probe_t *probe;
13920 uint8_t *arg;
13921 char *strtab, *typestr;
13922 dof_stridx_t typeidx;
13923 size_t typesz;
13924 uint_t nprobes, j, k;
13925
13926 ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);
13927
13928 if (sec->dofs_offset & (sizeof (uint_t) - 1)) {
13929 dtrace_dof_error(dof, "misaligned section offset");
13930 return (-1);
13931 }
13932
13933 /*
13934 * The section needs to be large enough to contain the DOF provider
13935 * structure appropriate for the given version.
13936 */
13937 if (sec->dofs_size <
13938 ((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
13939 offsetof(dof_provider_t, dofpv_prenoffs) :
13940 sizeof (dof_provider_t))) {
13941 dtrace_dof_error(dof, "provider section too small");
13942 return (-1);
13943 }
13944
13945 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
13946 str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);
13947 prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);
13948 arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);
13949 off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);
13950
13951 if (str_sec == NULL || prb_sec == NULL ||
13952 arg_sec == NULL || off_sec == NULL)
13953 return (-1);
13954
13955 enoff_sec = NULL;
13956
13957 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
13958 provider->dofpv_prenoffs != DOF_SECT_NONE &&
13959 (enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
13960 provider->dofpv_prenoffs)) == NULL)
13961 return (-1);
13962
13963 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
13964
13965 if (provider->dofpv_name >= str_sec->dofs_size ||
13966 strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
13967 dtrace_dof_error(dof, "invalid provider name");
13968 return (-1);
13969 }
13970
13971 if (prb_sec->dofs_entsize == 0 ||
13972 prb_sec->dofs_entsize > prb_sec->dofs_size) {
13973 dtrace_dof_error(dof, "invalid entry size");
13974 return (-1);
13975 }
13976
13977 if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {
13978 dtrace_dof_error(dof, "misaligned entry size");
13979 return (-1);
13980 }
13981
13982 if (off_sec->dofs_entsize != sizeof (uint32_t)) {
13983 dtrace_dof_error(dof, "invalid entry size");
13984 return (-1);
13985 }
13986
13987 if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {
13988 dtrace_dof_error(dof, "misaligned section offset");
13989 return (-1);
13990 }
13991
13992 if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
13993 dtrace_dof_error(dof, "invalid entry size");
13994 return (-1);
13995 }
13996
13997 arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
13998
13999 nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
14000
14001 /*
14002 * Take a pass through the probes to check for errors.
14003 */
14004 for (j = 0; j < nprobes; j++) {
14005 probe = (dof_probe_t *)(uintptr_t)(daddr +
14006 prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
14007
14008 if (probe->dofpr_func >= str_sec->dofs_size) {
14009 dtrace_dof_error(dof, "invalid function name");
14010 return (-1);
14011 }
14012
14013 if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
14014 dtrace_dof_error(dof, "function name too long");
14015 return (-1);
14016 }
14017
14018 if (probe->dofpr_name >= str_sec->dofs_size ||
14019 strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
14020 dtrace_dof_error(dof, "invalid probe name");
14021 return (-1);
14022 }
14023
14024 /*
14025 * The offset count must not wrap the index, and the offsets
14026 * must also not overflow the section's data.
14027 */
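		/*
		 * (Hypothetical example, for illustration: with these 32-bit
		 * fields, dofpr_offidx = 0xfffffff0 and dofpr_noffs = 0x20
		 * sum to 0x10, which is less than dofpr_offidx -- the first
		 * clause below catches exactly this wrap.)
		 */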
14028 if (probe->dofpr_offidx + probe->dofpr_noffs <
14029 probe->dofpr_offidx ||
14030 (probe->dofpr_offidx + probe->dofpr_noffs) *
14031 off_sec->dofs_entsize > off_sec->dofs_size) {
14032 dtrace_dof_error(dof, "invalid probe offset");
14033 return (-1);
14034 }
14035
14036 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
14037 /*
14038 * If there's no is-enabled offset section, make sure
14039 * there aren't any is-enabled offsets. Otherwise
14040 * perform the same checks as for probe offsets
14041 * (immediately above).
14042 */
14043 if (enoff_sec == NULL) {
14044 if (probe->dofpr_enoffidx != 0 ||
14045 probe->dofpr_nenoffs != 0) {
14046 dtrace_dof_error(dof, "is-enabled "
14047 "offsets with null section");
14048 return (-1);
14049 }
14050 } else if (probe->dofpr_enoffidx +
14051 probe->dofpr_nenoffs < probe->dofpr_enoffidx ||
14052 (probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
14053 enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
14054 dtrace_dof_error(dof, "invalid is-enabled "
14055 "offset");
14056 return (-1);
14057 }
14058
14059 if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {
14060 dtrace_dof_error(dof, "zero probe and "
14061 "is-enabled offsets");
14062 return (-1);
14063 }
14064 } else if (probe->dofpr_noffs == 0) {
14065 dtrace_dof_error(dof, "zero probe offsets");
14066 return (-1);
14067 }
14068
14069 if (probe->dofpr_argidx + probe->dofpr_xargc <
14070 probe->dofpr_argidx ||
14071 (probe->dofpr_argidx + probe->dofpr_xargc) *
14072 arg_sec->dofs_entsize > arg_sec->dofs_size) {
14073 dtrace_dof_error(dof, "invalid args");
14074 return (-1);
14075 }
14076
14077 typeidx = probe->dofpr_nargv;
14078 typestr = strtab + probe->dofpr_nargv;
14079 for (k = 0; k < probe->dofpr_nargc; k++) {
14080 if (typeidx >= str_sec->dofs_size) {
14081 dtrace_dof_error(dof, "bad "
14082 "native argument type");
14083 return (-1);
14084 }
14085
14086 typesz = strlen(typestr) + 1;
14087 if (typesz > DTRACE_ARGTYPELEN) {
14088 dtrace_dof_error(dof, "native "
14089 "argument type too long");
14090 return (-1);
14091 }
14092 typeidx += VBDTCAST(dof_stridx_t)typesz;
14093 typestr += typesz;
14094 }
14095
14096 typeidx = probe->dofpr_xargv;
14097 typestr = strtab + probe->dofpr_xargv;
14098 for (k = 0; k < probe->dofpr_xargc; k++) {
14099 if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
14100 dtrace_dof_error(dof, "bad "
14101 "native argument index");
14102 return (-1);
14103 }
14104
14105 if (typeidx >= str_sec->dofs_size) {
14106 dtrace_dof_error(dof, "bad "
14107 "translated argument type");
14108 return (-1);
14109 }
14110
14111 typesz = strlen(typestr) + 1;
14112 if (typesz > DTRACE_ARGTYPELEN) {
14113 dtrace_dof_error(dof, "translated argument "
14114 "type too long");
14115 return (-1);
14116 }
14117
14118 typeidx += VBDTCAST(dof_stridx_t)typesz;
14119 typestr += typesz;
14120 }
14121 }
14122
14123 return (0);
14124}
14125
14126static int
14127dtrace_helper_slurp(dof_hdr_t *dof, dof_helper_t *dhp)
14128{
14129 dtrace_helpers_t *help;
14130 dtrace_vstate_t *vstate;
14131 dtrace_enabling_t *enab = NULL;
14132 int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
14133 uintptr_t daddr = (uintptr_t)dof;
14134
14135 ASSERT(MUTEX_HELD(&dtrace_lock));
14136
14137 if ((help = curproc->p_dtrace_helpers) == NULL)
14138 help = dtrace_helpers_create(curproc);
14139
14140 vstate = &help->dthps_vstate;
14141
14142 if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab,
14143 dhp != NULL ? dhp->dofhp_addr : 0, B_FALSE)) != 0) {
14144 dtrace_dof_destroy(dof);
14145 return (rv);
14146 }
14147
14148 /*
14149 * Look for helper providers and validate their descriptions.
14150 */
14151 if (dhp != NULL) {
14152 for (i = 0; i < VBDTCAST(int)dof->dofh_secnum; i++) {
14153 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
14154 dof->dofh_secoff + i * dof->dofh_secsize);
14155
14156 if (sec->dofs_type != DOF_SECT_PROVIDER)
14157 continue;
14158
14159 if (dtrace_helper_provider_validate(dof, sec) != 0) {
14160 dtrace_enabling_destroy(enab);
14161 dtrace_dof_destroy(dof);
14162 return (-1);
14163 }
14164
14165 nprovs++;
14166 }
14167 }
14168
14169 /*
14170 * Now we need to walk through the ECB descriptions in the enabling.
14171 */
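	/*
	 * (For illustration: a ustack helper declared in D as
	 *
	 *	dtrace:helper:ustack:
	 *	{
	 *		...
	 *	}
	 *
	 * arrives here as an ECB description whose probe description is
	 * dtrace:helper:ustack -- which is what the strcmp()s below match.)
	 */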
14172 for (i = 0; i < enab->dten_ndesc; i++) {
14173 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
14174 dtrace_probedesc_t *desc = &ep->dted_probe;
14175
14176 if (strcmp(desc->dtpd_provider, "dtrace") != 0)
14177 continue;
14178
14179 if (strcmp(desc->dtpd_mod, "helper") != 0)
14180 continue;
14181
14182 if (strcmp(desc->dtpd_func, "ustack") != 0)
14183 continue;
14184
14185 if ((rv = dtrace_helper_action_add(DTRACE_HELPER_ACTION_USTACK,
14186 ep)) != 0) {
14187 /*
14188 * Adding this helper action failed -- we are now going
14189 * to rip out the entire generation and return failure.
14190 */
14191 (void) dtrace_helper_destroygen(help->dthps_generation);
14192 dtrace_enabling_destroy(enab);
14193 dtrace_dof_destroy(dof);
14194 return (-1);
14195 }
14196
14197 nhelpers++;
14198 }
14199
14200 if (nhelpers < enab->dten_ndesc)
14201 dtrace_dof_error(dof, "unmatched helpers");
14202
14203 gen = help->dthps_generation++;
14204 dtrace_enabling_destroy(enab);
14205
14206 if (dhp != NULL && nprovs > 0) {
14207 dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
14208 if (dtrace_helper_provider_add(dhp, gen) == 0) {
14209 mutex_exit(&dtrace_lock);
14210 dtrace_helper_provider_register(curproc, help, dhp);
14211 mutex_enter(&dtrace_lock);
14212
14213 destroy = 0;
14214 }
14215 }
14216
14217 if (destroy)
14218 dtrace_dof_destroy(dof);
14219
14220 return (gen);
14221}
14222
14223static dtrace_helpers_t *
14224dtrace_helpers_create(proc_t *p)
14225{
14226 dtrace_helpers_t *help;
14227
14228 ASSERT(MUTEX_HELD(&dtrace_lock));
14229 ASSERT(p->p_dtrace_helpers == NULL);
14230
14231 help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
14232 help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
14233 DTRACE_NHELPER_ACTIONS, KM_SLEEP);
14234
14235 p->p_dtrace_helpers = help;
14236 dtrace_helpers++;
14237
14238 return (help);
14239}
14240
14241static void
14242dtrace_helpers_destroy(void)
14243{
14244 dtrace_helpers_t *help;
14245 dtrace_vstate_t *vstate;
14246 proc_t *p = curproc;
14247 VBDTTYPE(uint_t, int) i;
14248
14249 mutex_enter(&dtrace_lock);
14250
14251 ASSERT(p->p_dtrace_helpers != NULL);
14252 ASSERT(dtrace_helpers > 0);
14253
14254 help = p->p_dtrace_helpers;
14255 vstate = &help->dthps_vstate;
14256
14257 /*
14258 * We're now going to lose the help from this process.
14259 */
14260 p->p_dtrace_helpers = NULL;
14261 dtrace_sync();
14262
14263 /*
14264	 * Destroy the helper actions.
14265 */
14266 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
14267 dtrace_helper_action_t *h, *next;
14268
14269 for (h = help->dthps_actions[i]; h != NULL; h = next) {
14270 next = h->dtha_next;
14271 dtrace_helper_action_destroy(h, vstate);
14272 h = next;
14273 }
14274 }
14275
14276 mutex_exit(&dtrace_lock);
14277
14278 /*
14279 * Destroy the helper providers.
14280 */
14281 if (help->dthps_maxprovs > 0) {
14282 mutex_enter(&dtrace_meta_lock);
14283 if (dtrace_meta_pid != NULL) {
14284 ASSERT(dtrace_deferred_pid == NULL);
14285
14286 for (i = 0; i < help->dthps_nprovs; i++) {
14287 dtrace_helper_provider_remove(
14288 &help->dthps_provs[i]->dthp_prov, p->p_pid);
14289 }
14290 } else {
14291 mutex_enter(&dtrace_lock);
14292 ASSERT(help->dthps_deferred == 0 ||
14293 help->dthps_next != NULL ||
14294 help->dthps_prev != NULL ||
14295 help == dtrace_deferred_pid);
14296
14297 /*
14298 * Remove the helper from the deferred list.
14299 */
14300 if (help->dthps_next != NULL)
14301 help->dthps_next->dthps_prev = help->dthps_prev;
14302 if (help->dthps_prev != NULL)
14303 help->dthps_prev->dthps_next = help->dthps_next;
14304 if (dtrace_deferred_pid == help) {
14305 dtrace_deferred_pid = help->dthps_next;
14306 ASSERT(help->dthps_prev == NULL);
14307 }
14308
14309 mutex_exit(&dtrace_lock);
14310 }
14311
14312 mutex_exit(&dtrace_meta_lock);
14313
14314 for (i = 0; i < help->dthps_nprovs; i++) {
14315 dtrace_helper_provider_destroy(help->dthps_provs[i]);
14316 }
14317
14318 kmem_free(help->dthps_provs, help->dthps_maxprovs *
14319 sizeof (dtrace_helper_provider_t *));
14320 }
14321
14322 mutex_enter(&dtrace_lock);
14323
14324 dtrace_vstate_fini(&help->dthps_vstate);
14325 kmem_free(help->dthps_actions,
14326 sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
14327 kmem_free(help, sizeof (dtrace_helpers_t));
14328
14329 --dtrace_helpers;
14330 mutex_exit(&dtrace_lock);
14331}
14332
14333static void
14334dtrace_helpers_duplicate(proc_t *from, proc_t *to)
14335{
14336 dtrace_helpers_t *help, *newhelp;
14337 dtrace_helper_action_t *helper, *new, *last;
14338 dtrace_difo_t *dp;
14339 dtrace_vstate_t *vstate;
14340 int i, j, sz, hasprovs = 0;
14341
14342 mutex_enter(&dtrace_lock);
14343 ASSERT(from->p_dtrace_helpers != NULL);
14344 ASSERT(dtrace_helpers > 0);
14345
14346 help = from->p_dtrace_helpers;
14347 newhelp = dtrace_helpers_create(to);
14348 ASSERT(to->p_dtrace_helpers != NULL);
14349
14350 newhelp->dthps_generation = help->dthps_generation;
14351 vstate = &newhelp->dthps_vstate;
14352
14353 /*
14354 * Duplicate the helper actions.
14355 */
14356 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
14357 if ((helper = help->dthps_actions[i]) == NULL)
14358 continue;
14359
14360 for (last = NULL; helper != NULL; helper = helper->dtha_next) {
14361 new = kmem_zalloc(sizeof (dtrace_helper_action_t),
14362 KM_SLEEP);
14363 new->dtha_generation = helper->dtha_generation;
14364
14365 if ((dp = helper->dtha_predicate) != NULL) {
14366 dp = dtrace_difo_duplicate(dp, vstate);
14367 new->dtha_predicate = dp;
14368 }
14369
14370 new->dtha_nactions = helper->dtha_nactions;
14371 sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
14372 new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
14373
14374 for (j = 0; j < new->dtha_nactions; j++) {
14375 dtrace_difo_t *dp2 = helper->dtha_actions[j];
14376
14377 ASSERT(dp2 != NULL);
14378 dp2 = dtrace_difo_duplicate(dp2, vstate);
14379 new->dtha_actions[j] = dp2;
14380 }
14381
14382 if (last != NULL) {
14383 last->dtha_next = new;
14384 } else {
14385 newhelp->dthps_actions[i] = new;
14386 }
14387
14388 last = new;
14389 }
14390 }
14391
14392 /*
14393 * Duplicate the helper providers and register them with the
14394 * DTrace framework.
14395 */
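	/*
	 * (Note that the providers are shared by reference -- dthp_ref is
	 * simply bumped below -- whereas the helper actions above were
	 * deep-copied via dtrace_difo_duplicate().)
	 */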
14396 if (help->dthps_nprovs > 0) {
14397 newhelp->dthps_nprovs = help->dthps_nprovs;
14398 newhelp->dthps_maxprovs = help->dthps_nprovs;
14399 newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
14400 sizeof (dtrace_helper_provider_t *), KM_SLEEP);
14401 for (i = 0; i < VBDTCAST(int)newhelp->dthps_nprovs; i++) {
14402 newhelp->dthps_provs[i] = help->dthps_provs[i];
14403 newhelp->dthps_provs[i]->dthp_ref++;
14404 }
14405
14406 hasprovs = 1;
14407 }
14408
14409 mutex_exit(&dtrace_lock);
14410
14411 if (hasprovs)
14412 dtrace_helper_provider_register(to, newhelp, NULL);
14413}
14414
14415#ifndef VBOX
14416
14417/*
14418 * DTrace Hook Functions
14419 */
14420static void
14421dtrace_module_loaded(struct modctl *ctl)
14422{
14423 dtrace_provider_t *prv;
14424
14425 mutex_enter(&dtrace_provider_lock);
14426 mutex_enter(&mod_lock);
14427
14428 ASSERT(ctl->mod_busy);
14429
14430 /*
14431	 * We're going to call each provider's per-module provide operation
14432 * specifying only this module.
14433 */
14434 for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
14435 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
14436
14437 mutex_exit(&mod_lock);
14438 mutex_exit(&dtrace_provider_lock);
14439
14440 /*
14441 * If we have any retained enablings, we need to match against them.
14442 * Enabling probes requires that cpu_lock be held, and we cannot hold
14443 * cpu_lock here -- it is legal for cpu_lock to be held when loading a
14444 * module. (In particular, this happens when loading scheduling
14445 * classes.) So if we have any retained enablings, we need to dispatch
14446 * our task queue to do the match for us.
14447 */
14448 mutex_enter(&dtrace_lock);
14449
14450 if (dtrace_retained == NULL) {
14451 mutex_exit(&dtrace_lock);
14452 return;
14453 }
14454
14455 (void) taskq_dispatch(dtrace_taskq,
14456 (task_func_t *)dtrace_enabling_matchall, NULL, TQ_SLEEP);
14457
14458 mutex_exit(&dtrace_lock);
14459
14460 /*
14461 * And now, for a little heuristic sleaze: in general, we want to
14462 * match modules as soon as they load. However, we cannot guarantee
14463 * this, because it would lead us to the lock ordering violation
14464 * outlined above. The common case, of course, is that cpu_lock is
14465 * _not_ held -- so we delay here for a clock tick, hoping that that's
14466 * long enough for the task queue to do its work. If it's not, it's
14467 * not a serious problem -- it just means that the module that we
14468 * just loaded may not be immediately instrumentable.
14469 */
14470 delay(1);
14471}
14472
14473static void
14474dtrace_module_unloaded(struct modctl *ctl)
14475{
14476 dtrace_probe_t template, *probe, *first, *next;
14477 dtrace_provider_t *prov;
14478
14479 template.dtpr_mod = ctl->mod_modname;
14480
14481 mutex_enter(&dtrace_provider_lock);
14482 mutex_enter(&mod_lock);
14483 mutex_enter(&dtrace_lock);
14484
14485 if (dtrace_bymod == NULL) {
14486 /*
14487 * The DTrace module is loaded (obviously) but not attached;
14488 * we don't have any work to do.
14489 */
14490 mutex_exit(&dtrace_provider_lock);
14491 mutex_exit(&mod_lock);
14492 mutex_exit(&dtrace_lock);
14493 return;
14494 }
14495
14496 for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
14497 probe != NULL; probe = probe->dtpr_nextmod) {
14498 if (probe->dtpr_ecb != NULL) {
14499 mutex_exit(&dtrace_provider_lock);
14500 mutex_exit(&mod_lock);
14501 mutex_exit(&dtrace_lock);
14502
14503 /*
14504 * This shouldn't _actually_ be possible -- we're
14505 * unloading a module that has an enabled probe in it.
14506 * (It's normally up to the provider to make sure that
14507 * this can't happen.) However, because dtps_enable()
14508 * doesn't have a failure mode, there can be an
14509 * enable/unload race. Upshot: we don't want to
14510 * assert, but we're not going to disable the
14511 * probe, either.
14512 */
14513 if (dtrace_err_verbose) {
14514 cmn_err(CE_WARN, "unloaded module '%s' had "
14515 "enabled probes", ctl->mod_modname);
14516 }
14517
14518 return;
14519 }
14520 }
14521
14522 probe = first;
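	/*
	 * (Descriptive note: the loop below unhooks every one of the
	 * module's probes from the hash chains and the probe array,
	 * re-threading them onto a private list via dtpr_nextmod so that
	 * they can be destroyed after the dtrace_sync() that follows.)
	 */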
14523
14524 for (first = NULL; probe != NULL; probe = next) {
14525 ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
14526
14527 dtrace_probes[probe->dtpr_id - 1] = NULL;
14528
14529 next = probe->dtpr_nextmod;
14530 dtrace_hash_remove(dtrace_bymod, probe);
14531 dtrace_hash_remove(dtrace_byfunc, probe);
14532 dtrace_hash_remove(dtrace_byname, probe);
14533
14534 if (first == NULL) {
14535 first = probe;
14536 probe->dtpr_nextmod = NULL;
14537 } else {
14538 probe->dtpr_nextmod = first;
14539 first = probe;
14540 }
14541 }
14542
14543 /*
14544 * We've removed all of the module's probes from the hash chains and
14545 * from the probe array. Now issue a dtrace_sync() to be sure that
14546 * everyone has cleared out from any probe array processing.
14547 */
14548 dtrace_sync();
14549
14550 for (probe = first; probe != NULL; probe = first) {
14551 first = probe->dtpr_nextmod;
14552 prov = probe->dtpr_provider;
14553 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
14554 probe->dtpr_arg);
14555 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
14556 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
14557 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
14558 vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
14559 kmem_free(probe, sizeof (dtrace_probe_t));
14560 }
14561
14562 mutex_exit(&dtrace_lock);
14563 mutex_exit(&mod_lock);
14564 mutex_exit(&dtrace_provider_lock);
14565}
14566
14567#endif /* !VBOX */
14568
14569VBDTSTATIC void
14570dtrace_suspend(void)
14571{
14572 dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
14573}
14574
14575VBDTSTATIC void
14576dtrace_resume(void)
14577{
14578 dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
14579}
14580
14581#ifdef VBOX
14582typedef enum {
14583 CPU_INVALID,
14584 CPU_CONFIG,
14585 CPU_UNCONFIG
14586} cpu_setup_t;
14587#endif
14588
14589
14590static int
14591dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
14592{
14593 ASSERT(MUTEX_HELD(&cpu_lock));
14594 mutex_enter(&dtrace_lock);
14595
14596 switch (what) {
14597 case CPU_CONFIG: {
14598 dtrace_state_t *state;
14599 dtrace_optval_t *opt, rs, c;
14600
14601 /*
14602 * For now, we only allocate a new buffer for anonymous state.
14603 */
14604 if ((state = dtrace_anon.dta_state) == NULL)
14605 break;
14606
14607 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
14608 break;
14609
14610 opt = state->dts_options;
14611 c = opt[DTRACEOPT_CPU];
14612
14613 if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
14614 break;
14615
14616 /*
14617 * Regardless of what the actual policy is, we're going to
14618 * temporarily set our resize policy to be manual. We're
14619 * also going to temporarily set our CPU option to denote
14620 * the newly configured CPU.
14621 */
14622 rs = opt[DTRACEOPT_BUFRESIZE];
14623 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
14624 opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;
14625
14626 (void) dtrace_state_buffers(state);
14627
14628 opt[DTRACEOPT_BUFRESIZE] = rs;
14629 opt[DTRACEOPT_CPU] = c;
14630
14631 break;
14632 }
14633
14634 case CPU_UNCONFIG:
14635 /*
14636 * We don't free the buffer in the CPU_UNCONFIG case. (The
14637 * buffer will be freed when the consumer exits.)
14638 */
14639 break;
14640
14641 default:
14642 break;
14643 }
14644
14645 mutex_exit(&dtrace_lock);
14646 return (0);
14647}
14648
14649#ifndef VBOX
14650static void
14651dtrace_cpu_setup_initial(processorid_t cpu)
14652{
14653 (void) dtrace_cpu_setup(CPU_CONFIG, cpu);
14654}
14655#endif /* !VBOX */
14656
14657static void
14658dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
14659{
14660 if (dtrace_toxranges >= dtrace_toxranges_max) {
14661 int osize, nsize;
14662 dtrace_toxrange_t *range;
14663
14664 osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
14665
14666 if (osize == 0) {
14667 ASSERT(dtrace_toxrange == NULL);
14668 ASSERT(dtrace_toxranges_max == 0);
14669 dtrace_toxranges_max = 1;
14670 } else {
14671 dtrace_toxranges_max <<= 1;
14672 }
14673
14674 nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
14675 range = kmem_zalloc(nsize, KM_SLEEP);
14676
14677 if (dtrace_toxrange != NULL) {
14678 ASSERT(osize != 0);
14679 bcopy(dtrace_toxrange, range, osize);
14680 kmem_free(dtrace_toxrange, osize);
14681 }
14682
14683 dtrace_toxrange = range;
14684 }
14685
14686 ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == NULL);
14687 ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == NULL);
14688
14689 dtrace_toxrange[dtrace_toxranges].dtt_base = base;
14690 dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
14691 dtrace_toxranges++;
14692}
14693
14694/*
14695 * DTrace Driver Cookbook Functions
14696 */
14697/*ARGSUSED*/
14698static int
14699dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
14700{
14701 dtrace_provider_id_t id;
14702 dtrace_state_t *state = NULL;
14703 dtrace_enabling_t *enab;
14704
14705 mutex_enter(&cpu_lock);
14706 mutex_enter(&dtrace_provider_lock);
14707 mutex_enter(&dtrace_lock);
14708
14709 if (ddi_soft_state_init(&dtrace_softstate,
14710 sizeof (dtrace_state_t), 0) != 0) {
14711 cmn_err(CE_NOTE, "/dev/dtrace failed to initialize soft state");
14712 mutex_exit(&cpu_lock);
14713 mutex_exit(&dtrace_provider_lock);
14714 mutex_exit(&dtrace_lock);
14715 return (DDI_FAILURE);
14716 }
14717
14718#ifndef VBOX
14719 if (ddi_create_minor_node(devi, DTRACEMNR_DTRACE, S_IFCHR,
14720 DTRACEMNRN_DTRACE, DDI_PSEUDO, NULL) == DDI_FAILURE ||
14721 ddi_create_minor_node(devi, DTRACEMNR_HELPER, S_IFCHR,
14722 DTRACEMNRN_HELPER, DDI_PSEUDO, NULL) == DDI_FAILURE) {
14723 cmn_err(CE_NOTE, "/dev/dtrace couldn't create minor nodes");
14724 ddi_remove_minor_node(devi, NULL);
14725 ddi_soft_state_fini(&dtrace_softstate);
14726 mutex_exit(&cpu_lock);
14727 mutex_exit(&dtrace_provider_lock);
14728 mutex_exit(&dtrace_lock);
14729 return (DDI_FAILURE);
14730 }
14731#endif
14732
14733 ddi_report_dev(devi);
14734 dtrace_devi = devi;
14735
14736#ifndef VBOX
14737 dtrace_modload = dtrace_module_loaded;
14738 dtrace_modunload = dtrace_module_unloaded;
14739 dtrace_cpu_init = dtrace_cpu_setup_initial;
14740 dtrace_helpers_cleanup = dtrace_helpers_destroy;
14741 dtrace_helpers_fork = dtrace_helpers_duplicate;
14742 dtrace_cpustart_init = dtrace_suspend;
14743 dtrace_cpustart_fini = dtrace_resume;
14744 dtrace_debugger_init = dtrace_suspend;
14745 dtrace_debugger_fini = dtrace_resume;
14746
14747 register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
14748#else
14749	/** @todo some of these hooks need checking out! */
14750#endif
14751
14752 ASSERT(MUTEX_HELD(&cpu_lock));
14753
14754 dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1,
14755 NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
14756 dtrace_minor = vmem_create("dtrace_minor", (void *)DTRACEMNRN_CLONE,
14757 UINT32_MAX - DTRACEMNRN_CLONE, 1, NULL, NULL, NULL, 0,
14758 VM_SLEEP | VMC_IDENTIFIER);
14759#ifndef VBOX
14760 dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri,
14761 1, INT_MAX, 0);
14762#endif
14763
14764 dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
14765 sizeof (dtrace_dstate_percpu_t) * NCPU, DTRACE_STATE_ALIGN,
14766 NULL, NULL, NULL, NULL, NULL, 0);
14767
14768 ASSERT(MUTEX_HELD(&cpu_lock));
14769 dtrace_bymod = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod),
14770 offsetof(dtrace_probe_t, dtpr_nextmod),
14771 offsetof(dtrace_probe_t, dtpr_prevmod));
14772
14773 dtrace_byfunc = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_func),
14774 offsetof(dtrace_probe_t, dtpr_nextfunc),
14775 offsetof(dtrace_probe_t, dtpr_prevfunc));
14776
14777 dtrace_byname = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_name),
14778 offsetof(dtrace_probe_t, dtpr_nextname),
14779 offsetof(dtrace_probe_t, dtpr_prevname));
14780
14781 if (dtrace_retain_max < 1) {
14782 cmn_err(CE_WARN, "illegal value (%lu) for dtrace_retain_max; "
14783 "setting to 1", dtrace_retain_max);
14784 dtrace_retain_max = 1;
14785 }
14786
14787 /*
14788 * Now discover our toxic ranges.
14789 */
14790 dtrace_toxic_ranges(dtrace_toxrange_add);
14791
14792 /*
14793 * Before we register ourselves as a provider to our own framework,
14794 * we would like to assert that dtrace_provider is NULL -- but that's
14795 * not true if we were loaded as a dependency of a DTrace provider.
14796 * Once we've registered, we can assert that dtrace_provider is our
14797 * pseudo provider.
14798 */
14799 (void) dtrace_register("dtrace", &dtrace_provider_attr,
14800 DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id);
14801
14802 ASSERT(dtrace_provider != NULL);
14803 ASSERT((dtrace_provider_id_t)dtrace_provider == id);
14804
14805 dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
14806 dtrace_provider, NULL, NULL, "BEGIN", 0, NULL);
14807 dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
14808 dtrace_provider, NULL, NULL, "END", 0, NULL);
14809 dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
14810 dtrace_provider, NULL, NULL, "ERROR", 1, NULL);
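	/*
	 * (These are the probes a consumer sees as dtrace:::BEGIN,
	 * dtrace:::END and dtrace:::ERROR.)
	 */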
14811
14812 dtrace_anon_property();
14813 mutex_exit(&cpu_lock);
14814
14815 /*
14816 * If DTrace helper tracing is enabled, we need to allocate the
14817 * trace buffer and initialize the values.
14818 */
14819 if (dtrace_helptrace_enabled) {
14820 ASSERT(dtrace_helptrace_buffer == NULL);
14821 dtrace_helptrace_buffer =
14822 kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
14823 dtrace_helptrace_next = 0;
14824 }
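	/*
	 * (Note: when helper tracing is enabled, helper executions are
	 * logged into this buffer so that misbehaving helpers can be
	 * examined after the fact.)
	 */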
14825
14826 /*
14827 * If there are already providers, we must ask them to provide their
14828 * probes, and then match any anonymous enabling against them. Note
14829 * that there should be no other retained enablings at this time:
14830 * the only retained enablings at this time should be the anonymous
14831 * enabling.
14832 */
14833 if (dtrace_anon.dta_enabling != NULL) {
14834 ASSERT(dtrace_retained == dtrace_anon.dta_enabling);
14835
14836 dtrace_enabling_provide(NULL);
14837 state = dtrace_anon.dta_state;
14838
14839 /*
14840 * We couldn't hold cpu_lock across the above call to
14841 * dtrace_enabling_provide(), but we must hold it to actually
14842 * enable the probes. We have to drop all of our locks, pick
14843 * up cpu_lock, and regain our locks before matching the
14844 * retained anonymous enabling.
14845 */
14846 mutex_exit(&dtrace_lock);
14847 mutex_exit(&dtrace_provider_lock);
14848
14849 mutex_enter(&cpu_lock);
14850 mutex_enter(&dtrace_provider_lock);
14851 mutex_enter(&dtrace_lock);
14852
14853 if ((enab = dtrace_anon.dta_enabling) != NULL)
14854 (void) dtrace_enabling_match(enab, NULL);
14855
14856 mutex_exit(&cpu_lock);
14857 }
14858
14859 mutex_exit(&dtrace_lock);
14860 mutex_exit(&dtrace_provider_lock);
14861
14862 if (state != NULL) {
14863 /*
14864 * If we created any anonymous state, set it going now.
14865 */
14866 (void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
14867 }
14868
14869 return (DDI_SUCCESS);
14870}
14871
14872/*ARGSUSED*/
14873static int
14874dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
14875{
14876 dtrace_state_t *state;
14877 uint32_t priv;
14878 uid_t uid;
14879 zoneid_t zoneid;
14880
14881 if (getminor(*devp) == DTRACEMNRN_HELPER)
14882 return (0);
14883
14884 /*
14885 * If this wasn't an open with the "helper" minor, then it must be
14886 * the "dtrace" minor.
14887 */
14888 if (getminor(*devp) != DTRACEMNRN_DTRACE)
14889 return (ENXIO);
14890
14891 /*
14892 * If no DTRACE_PRIV_* bits are set in the credential, then the
14893 * caller lacks sufficient permission to do anything with DTrace.
14894 */
14895 dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
14896 if (priv == DTRACE_PRIV_NONE)
14897 return (EACCES);
14898
14899 /*
14900 * Ask all providers to provide all their probes.
14901 */
14902 mutex_enter(&dtrace_provider_lock);
14903 dtrace_probe_provide(NULL, NULL);
14904 mutex_exit(&dtrace_provider_lock);
14905
14906 mutex_enter(&cpu_lock);
14907 mutex_enter(&dtrace_lock);
14908 dtrace_opens++;
14909 dtrace_membar_producer();
14910
14911#ifndef VBOX
14912 /*
14913 * If the kernel debugger is active (that is, if the kernel debugger
14914 * modified text in some way), we won't allow the open.
14915 */
14916 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
14917 dtrace_opens--;
14918 mutex_exit(&cpu_lock);
14919 mutex_exit(&dtrace_lock);
14920 return (EBUSY);
14921 }
14922#endif
14923
14924 state = dtrace_state_create(devp, cred_p);
14925 mutex_exit(&cpu_lock);
14926
14927 if (state == NULL) {
14928#ifndef VBOX
14929 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
14930 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
14931#endif
14932 mutex_exit(&dtrace_lock);
14933 return (EAGAIN);
14934 }
14935
14936 mutex_exit(&dtrace_lock);
14937
14938 return (0);
14939}
14940
14941/*ARGSUSED*/
14942static int
14943dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
14944{
14945 minor_t minor = getminor(dev);
14946 dtrace_state_t *state;
14947
14948 if (minor == DTRACEMNRN_HELPER)
14949 return (0);
14950
14951 state = ddi_get_soft_state(dtrace_softstate, minor);
14952
14953 mutex_enter(&cpu_lock);
14954 mutex_enter(&dtrace_lock);
14955
14956 if (state->dts_anon) {
14957 /*
14958 * There is anonymous state. Destroy that first.
14959 */
14960 ASSERT(dtrace_anon.dta_state == NULL);
14961 dtrace_state_destroy(state->dts_anon);
14962 }
14963
14964 dtrace_state_destroy(state);
14965 ASSERT(dtrace_opens > 0);
14966
14967#ifndef VBOX
14968 /*
14969 * Only relinquish control of the kernel debugger interface when there
14970 * are no consumers and no anonymous enablings.
14971 */
14972 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
14973 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
14974#endif
14975
14976 mutex_exit(&dtrace_lock);
14977 mutex_exit(&cpu_lock);
14978
14979 return (0);
14980}
14981
14982/*ARGSUSED*/
14983static int
14984dtrace_ioctl_helper(int cmd, intptr_t arg, int *rv)
14985{
14986 int rval;
14987 dof_helper_t help, *dhp = NULL;
14988
14989 switch (cmd) {
14990 case DTRACEHIOC_ADDDOF:
14991 if (copyin((void *)arg, &help, sizeof (help)) != 0) {
14992 dtrace_dof_error(NULL, "failed to copyin DOF helper");
14993 return (EFAULT);
14994 }
14995
14996 dhp = &help;
14997 arg = (intptr_t)help.dofhp_dof;
14998 /*FALLTHROUGH*/
14999
15000 case DTRACEHIOC_ADD: {
15001 dof_hdr_t *dof = dtrace_dof_copyin(arg, &rval);
15002
15003 if (dof == NULL)
15004 return (rval);
15005
15006 mutex_enter(&dtrace_lock);
15007
15008 /*
15009 * dtrace_helper_slurp() takes responsibility for the dof --
15010 * it may free it now or it may save it and free it later.
15011 */
15012 if ((rval = dtrace_helper_slurp(dof, dhp)) != -1) {
15013 *rv = rval;
15014 rval = 0;
15015 } else {
15016 rval = EINVAL;
15017 }
15018
15019 mutex_exit(&dtrace_lock);
15020 return (rval);
15021 }
15022
15023 case DTRACEHIOC_REMOVE: {
15024 mutex_enter(&dtrace_lock);
15025 rval = dtrace_helper_destroygen(arg);
15026 mutex_exit(&dtrace_lock);
15027
15028 return (rval);
15029 }
15030
15031 default:
15032 break;
15033 }
15034
15035 return (ENOTTY);
15036}
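/*
 * Illustrative user-level usage (a sketch under assumed conventions, not part
 * of the original source): a process with USDT probes typically loads its
 * helper DOF through the "helper" minor node roughly as follows:
 *
 *	int fd = open("/dev/dtrace/helper", O_RDWR);
 *	dof_helper_t dh;
 *	dh.dofhp_dof = (uintptr_t)dof;	// DOF containing the helper sections
 *	dh.dofhp_addr = base;		// load address of the providing object
 *	int gen = ioctl(fd, DTRACEHIOC_ADDDOF, &dh);
 *	...
 *	(void) ioctl(fd, DTRACEHIOC_REMOVE, gen);  // tear down that generation
 */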
15037
15038/*ARGSUSED*/
15039static int
15040dtrace_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
15041{
15042 minor_t minor = getminor(dev);
15043 dtrace_state_t *state;
15044 int rval;
15045
15046 if (minor == DTRACEMNRN_HELPER)
15047 return (dtrace_ioctl_helper(cmd, arg, rv));
15048
15049 state = ddi_get_soft_state(dtrace_softstate, minor);
15050
15051 if (state->dts_anon) {
15052 ASSERT(dtrace_anon.dta_state == NULL);
15053 state = state->dts_anon;
15054 }
15055
15056 switch (cmd) {
15057 case DTRACEIOC_PROVIDER: {
15058 dtrace_providerdesc_t pvd;
15059 dtrace_provider_t *pvp;
15060
15061 if (copyin((void *)arg, &pvd, sizeof (pvd)) != 0)
15062 return (EFAULT);
15063
15064 pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
15065 mutex_enter(&dtrace_provider_lock);
15066
15067 for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
15068 if (strcmp(pvp->dtpv_name, pvd.dtvd_name) == 0)
15069 break;
15070 }
15071
15072 mutex_exit(&dtrace_provider_lock);
15073
15074 if (pvp == NULL)
15075 return (ESRCH);
15076
15077 bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
15078 bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));
15079 if (copyout(&pvd, (void *)arg, sizeof (pvd)) != 0)
15080 return (EFAULT);
15081
15082 return (0);
15083 }
15084
15085 case DTRACEIOC_EPROBE: {
15086 dtrace_eprobedesc_t epdesc;
15087 dtrace_ecb_t *ecb;
15088 dtrace_action_t *act;
15089 void *buf;
15090 size_t size;
15091 uintptr_t dest;
15092 int nrecs;
15093
15094 if (copyin((void *)arg, &epdesc, sizeof (epdesc)) != 0)
15095 return (EFAULT);
15096
15097 mutex_enter(&dtrace_lock);
15098
15099 if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
15100 mutex_exit(&dtrace_lock);
15101 return (EINVAL);
15102 }
15103
15104 if (ecb->dte_probe == NULL) {
15105 mutex_exit(&dtrace_lock);
15106 return (EINVAL);
15107 }
15108
15109 epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
15110 epdesc.dtepd_uarg = ecb->dte_uarg;
15111 epdesc.dtepd_size = VBDTCAST(uint32_t)ecb->dte_size;
15112
15113 nrecs = epdesc.dtepd_nrecs;
15114 epdesc.dtepd_nrecs = 0;
15115 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
15116 if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
15117 continue;
15118
15119 epdesc.dtepd_nrecs++;
15120 }
15121
15122 /*
15123 * Now that we have the size, we need to allocate a temporary
15124 * buffer in which to store the complete description. We need
15125 * the temporary buffer to be able to drop dtrace_lock()
15126 * across the copyout(), below.
15127 */
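		/*
		 * (Layout note: the buffer is the fixed-size
		 * dtrace_eprobedesc_t header followed immediately by
		 * dtepd_nrecs dtrace_recdesc_t entries, matching the
		 * flexible dtepd_rec[] array that the consumer expects.)
		 */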
15128 size = sizeof (dtrace_eprobedesc_t) +
15129 (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));
15130
15131 buf = kmem_alloc(size, KM_SLEEP);
15132 dest = (uintptr_t)buf;
15133
15134 bcopy(&epdesc, (void *)dest, sizeof (epdesc));
15135 dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);
15136
15137 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
15138 if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
15139 continue;
15140
15141 if (nrecs-- == 0)
15142 break;
15143
15144 bcopy(&act->dta_rec, (void *)dest,
15145 sizeof (dtrace_recdesc_t));
15146 dest += sizeof (dtrace_recdesc_t);
15147 }
15148
15149 mutex_exit(&dtrace_lock);
15150
15151 if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
15152 kmem_free(buf, size);
15153 return (EFAULT);
15154 }
15155
15156 kmem_free(buf, size);
15157 return (0);
15158 }
15159
15160 case DTRACEIOC_AGGDESC: {
15161 dtrace_aggdesc_t aggdesc;
15162 dtrace_action_t *act;
15163 dtrace_aggregation_t *agg;
15164 int nrecs;
15165 uint32_t offs;
15166 dtrace_recdesc_t *lrec;
15167 void *buf;
15168 size_t size;
15169 uintptr_t dest;
15170
15171 if (copyin((void *)arg, &aggdesc, sizeof (aggdesc)) != 0)
15172 return (EFAULT);
15173
15174 mutex_enter(&dtrace_lock);
15175
15176 if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
15177 mutex_exit(&dtrace_lock);
15178 return (EINVAL);
15179 }
15180
15181 aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;
15182
15183 nrecs = aggdesc.dtagd_nrecs;
15184 aggdesc.dtagd_nrecs = 0;
15185
15186 offs = agg->dtag_base;
15187 lrec = &agg->dtag_action.dta_rec;
15188 aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;
15189
15190 for (act = agg->dtag_first; ; act = act->dta_next) {
15191 ASSERT(act->dta_intuple ||
15192 DTRACEACT_ISAGG(act->dta_kind));
15193
15194 /*
15195 * If this action has a record size of zero, it
15196 * denotes an argument to the aggregating action.
15197 * Because the presence of this record doesn't (or
15198 * shouldn't) affect the way the data is interpreted,
15199 * we don't copy it out to save user-level the
15200 * confusion of dealing with a zero-length record.
15201 */
15202 if (act->dta_rec.dtrd_size == 0) {
15203 ASSERT(agg->dtag_hasarg);
15204 continue;
15205 }
15206
15207 aggdesc.dtagd_nrecs++;
15208
15209 if (act == &agg->dtag_action)
15210 break;
15211 }
15212
15213 /*
15214 * Now that we have the size, we need to allocate a temporary
15215 * buffer in which to store the complete description. We need
15216 * the temporary buffer to be able to drop dtrace_lock()
15217 * across the copyout(), below.
15218 */
15219 size = sizeof (dtrace_aggdesc_t) +
15220 (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));
15221
15222 buf = kmem_alloc(size, KM_SLEEP);
15223 dest = (uintptr_t)buf;
15224
15225 bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
15226 dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);
15227
15228 for (act = agg->dtag_first; ; act = act->dta_next) {
15229 dtrace_recdesc_t rec = act->dta_rec;
15230
15231 /*
15232 * See the comment in the above loop for why we pass
15233 * over zero-length records.
15234 */
15235 if (rec.dtrd_size == 0) {
15236 ASSERT(agg->dtag_hasarg);
15237 continue;
15238 }
15239
15240 if (nrecs-- == 0)
15241 break;
15242
15243 rec.dtrd_offset -= offs;
15244 bcopy(&rec, (void *)dest, sizeof (rec));
15245 dest += sizeof (dtrace_recdesc_t);
15246
15247 if (act == &agg->dtag_action)
15248 break;
15249 }
15250
15251 mutex_exit(&dtrace_lock);
15252
15253 if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
15254 kmem_free(buf, size);
15255 return (EFAULT);
15256 }
15257
15258 kmem_free(buf, size);
15259 return (0);
15260 }
15261
15262 case DTRACEIOC_ENABLE: {
15263 dof_hdr_t *dof;
15264 dtrace_enabling_t *enab = NULL;
15265 dtrace_vstate_t *vstate;
15266 int err = 0;
15267
15268 *rv = 0;
15269
15270 /*
15271 * If a NULL argument has been passed, we take this as our
15272 * cue to reevaluate our enablings.
15273 */
15274 if (arg == NULL) {
15275 dtrace_enabling_matchall();
15276
15277 return (0);
15278 }
15279
15280 if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
15281 return (rval);
15282
15283 mutex_enter(&cpu_lock);
15284 mutex_enter(&dtrace_lock);
15285 vstate = &state->dts_vstate;
15286
15287 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
15288 mutex_exit(&dtrace_lock);
15289 mutex_exit(&cpu_lock);
15290 dtrace_dof_destroy(dof);
15291 return (EBUSY);
15292 }
15293
15294 if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
15295 mutex_exit(&dtrace_lock);
15296 mutex_exit(&cpu_lock);
15297 dtrace_dof_destroy(dof);
15298 return (EINVAL);
15299 }
15300
15301 if ((rval = dtrace_dof_options(dof, state)) != 0) {
15302 dtrace_enabling_destroy(enab);
15303 mutex_exit(&dtrace_lock);
15304 mutex_exit(&cpu_lock);
15305 dtrace_dof_destroy(dof);
15306 return (rval);
15307 }
15308
15309 if ((err = dtrace_enabling_match(enab, rv)) == 0) {
15310 err = dtrace_enabling_retain(enab);
15311 } else {
15312 dtrace_enabling_destroy(enab);
15313 }
15314
15315 mutex_exit(&cpu_lock);
15316 mutex_exit(&dtrace_lock);
15317 dtrace_dof_destroy(dof);
15318
15319 return (err);
15320 }
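	/*
	 * (For illustration: a consumer such as dtrace(1M) typically issues
	 * DTRACEIOC_ENABLE with its compiled DOF, then DTRACEIOC_GO to start
	 * tracing, DTRACEIOC_BUFSNAP/DTRACEIOC_AGGSNAP to drain data, and
	 * finally DTRACEIOC_STOP before closing the device.)
	 */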
15321
15322 case DTRACEIOC_REPLICATE: {
15323 dtrace_repldesc_t desc;
15324 dtrace_probedesc_t *match = &desc.dtrpd_match;
15325 dtrace_probedesc_t *create = &desc.dtrpd_create;
15326 int err;
15327
15328 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
15329 return (EFAULT);
15330
15331 match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
15332 match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
15333 match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
15334 match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
15335
15336 create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
15337 create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
15338 create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
15339 create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
15340
15341 mutex_enter(&dtrace_lock);
15342 err = dtrace_enabling_replicate(state, match, create);
15343 mutex_exit(&dtrace_lock);
15344
15345 return (err);
15346 }
15347
15348 case DTRACEIOC_PROBEMATCH:
15349 case DTRACEIOC_PROBES: {
15350 dtrace_probe_t *probe = NULL;
15351 dtrace_probedesc_t desc;
15352 dtrace_probekey_t pkey;
15353 dtrace_id_t i;
15354 int m = 0;
15355 uint32_t priv;
15356 uid_t uid;
15357 zoneid_t zoneid;
15358
15359 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
15360 return (EFAULT);
15361
15362 desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
15363 desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
15364 desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
15365 desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';
15366
15367 /*
15368 * Before we attempt to match this probe, we want to give
15369 * all providers the opportunity to provide it.
15370 */
15371 if (desc.dtpd_id == DTRACE_IDNONE) {
15372 mutex_enter(&dtrace_provider_lock);
15373 dtrace_probe_provide(&desc, NULL);
15374 mutex_exit(&dtrace_provider_lock);
15375 desc.dtpd_id++;
15376 }
15377
15378 if (cmd == DTRACEIOC_PROBEMATCH) {
15379 dtrace_probekey(&desc, &pkey);
15380 pkey.dtpk_id = DTRACE_IDNONE;
15381 }
15382
15383 dtrace_cred2priv(cr, &priv, &uid, &zoneid);
15384
15385 mutex_enter(&dtrace_lock);
15386
15387 if (cmd == DTRACEIOC_PROBEMATCH) {
15388 for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
15389 if ((probe = dtrace_probes[i - 1]) != NULL &&
15390 (m = dtrace_match_probe(probe, &pkey,
15391 priv, uid, zoneid)) != 0)
15392 break;
15393 }
15394
15395 if (m < 0) {
15396 mutex_exit(&dtrace_lock);
15397 return (EINVAL);
15398 }
15399
15400 } else {
15401 for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
15402 if ((probe = dtrace_probes[i - 1]) != NULL &&
15403 dtrace_match_priv(probe, priv, uid, zoneid))
15404 break;
15405 }
15406 }
15407
15408 if (probe == NULL) {
15409 mutex_exit(&dtrace_lock);
15410 return (ESRCH);
15411 }
15412
15413 dtrace_probe_description(probe, &desc);
15414 mutex_exit(&dtrace_lock);
15415
15416 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
15417 return (EFAULT);
15418
15419 return (0);
15420 }
15421
15422 case DTRACEIOC_PROBEARG: {
15423 dtrace_argdesc_t desc;
15424 dtrace_probe_t *probe;
15425 dtrace_provider_t *prov;
15426
15427 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
15428 return (EFAULT);
15429
15430 if (desc.dtargd_id == DTRACE_IDNONE)
15431 return (EINVAL);
15432
15433 if (desc.dtargd_ndx == DTRACE_ARGNONE)
15434 return (EINVAL);
15435
15436 mutex_enter(&dtrace_provider_lock);
15437 mutex_enter(&mod_lock);
15438 mutex_enter(&dtrace_lock);
15439
15440 if (desc.dtargd_id > dtrace_nprobes) {
15441 mutex_exit(&dtrace_lock);
15442 mutex_exit(&mod_lock);
15443 mutex_exit(&dtrace_provider_lock);
15444 return (EINVAL);
15445 }
15446
15447 if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
15448 mutex_exit(&dtrace_lock);
15449 mutex_exit(&mod_lock);
15450 mutex_exit(&dtrace_provider_lock);
15451 return (EINVAL);
15452 }
15453
15454 mutex_exit(&dtrace_lock);
15455
15456 prov = probe->dtpr_provider;
15457
15458 if (prov->dtpv_pops.dtps_getargdesc == NULL) {
15459 /*
15460 * There isn't any typed information for this probe.
15461 * Set the argument number to DTRACE_ARGNONE.
15462 */
15463 desc.dtargd_ndx = DTRACE_ARGNONE;
15464 } else {
15465 desc.dtargd_native[0] = '\0';
15466 desc.dtargd_xlate[0] = '\0';
15467 desc.dtargd_mapping = desc.dtargd_ndx;
15468
15469 prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
15470 probe->dtpr_id, probe->dtpr_arg, &desc);
15471 }
15472
15473 mutex_exit(&mod_lock);
15474 mutex_exit(&dtrace_provider_lock);
15475
15476 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
15477 return (EFAULT);
15478
15479 return (0);
15480 }
15481
15482 case DTRACEIOC_GO: {
15483 processorid_t cpuid;
15484 rval = dtrace_state_go(state, &cpuid);
15485
15486 if (rval != 0)
15487 return (rval);
15488
15489 if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
15490 return (EFAULT);
15491
15492 return (0);
15493 }
15494
15495 case DTRACEIOC_STOP: {
15496 processorid_t cpuid;
15497
15498 mutex_enter(&dtrace_lock);
15499 rval = dtrace_state_stop(state, &cpuid);
15500 mutex_exit(&dtrace_lock);
15501
15502 if (rval != 0)
15503 return (rval);
15504
15505 if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
15506 return (EFAULT);
15507
15508 return (0);
15509 }
15510
15511 case DTRACEIOC_DOFGET: {
15512 dof_hdr_t hdr, *dof;
15513 uint64_t len;
15514
15515 if (copyin((void *)arg, &hdr, sizeof (hdr)) != 0)
15516 return (EFAULT);
15517
15518 mutex_enter(&dtrace_lock);
15519 dof = dtrace_dof_create(state);
15520 mutex_exit(&dtrace_lock);
15521
15522 len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
15523 rval = copyout(dof, (void *)arg, len);
15524 dtrace_dof_destroy(dof);
15525
15526 return (rval == 0 ? 0 : EFAULT);
15527 }
15528
15529 case DTRACEIOC_AGGSNAP:
15530 case DTRACEIOC_BUFSNAP: {
15531 dtrace_bufdesc_t desc;
15532 caddr_t cached;
15533 dtrace_buffer_t *buf;
15534
15535 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
15536 return (EFAULT);
15537
15538		if (/*VBox value is unsigned: desc.dtbd_cpu < 0 ||*/ desc.dtbd_cpu >= NCPU)
15539 return (EINVAL);
15540
15541 mutex_enter(&dtrace_lock);
15542
15543 if (cmd == DTRACEIOC_BUFSNAP) {
15544 buf = &state->dts_buffer[desc.dtbd_cpu];
15545 } else {
15546 buf = &state->dts_aggbuffer[desc.dtbd_cpu];
15547 }
15548
15549 if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
15550 size_t sz = buf->dtb_offset;
15551
15552 if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
15553 mutex_exit(&dtrace_lock);
15554 return (EBUSY);
15555 }
15556
15557 /*
15558 * If this buffer has already been consumed, we're
15559 * going to indicate that there's nothing left here
15560 * to consume.
15561 */
15562 if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
15563 mutex_exit(&dtrace_lock);
15564
15565 desc.dtbd_size = 0;
15566 desc.dtbd_drops = 0;
15567 desc.dtbd_errors = 0;
15568 desc.dtbd_oldest = 0;
15569 sz = sizeof (desc);
15570
15571 if (copyout(&desc, (void *)arg, sz) != 0)
15572 return (EFAULT);
15573
15574 return (0);
15575 }
15576
15577 /*
15578 * If this is a ring buffer that has wrapped, we want
15579 * to copy the whole thing out.
15580 */
15581 if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
15582 dtrace_buffer_polish(buf);
15583 sz = buf->dtb_size;
15584 }
15585
15586 if (copyout(buf->dtb_tomax, desc.dtbd_data, sz) != 0) {
15587 mutex_exit(&dtrace_lock);
15588 return (EFAULT);
15589 }
15590
15591 desc.dtbd_size = sz;
15592 desc.dtbd_drops = buf->dtb_drops;
15593 desc.dtbd_errors = buf->dtb_errors;
15594 desc.dtbd_oldest = buf->dtb_xamot_offset;
15595
15596 mutex_exit(&dtrace_lock);
15597
15598 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
15599 return (EFAULT);
15600
15601 buf->dtb_flags |= DTRACEBUF_CONSUMED;
15602
15603 return (0);
15604 }
15605
15606 if (buf->dtb_tomax == NULL) {
15607 ASSERT(buf->dtb_xamot == NULL);
15608 mutex_exit(&dtrace_lock);
15609 return (ENOENT);
15610 }
15611
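		/*
		 * (Sketch of the snapshot protocol, for illustration: each
		 * CPU owns the buffer pair {dtb_tomax, dtb_xamot}.  We cache
		 * the active pointer, cross-call the target CPU to run
		 * dtrace_buffer_switch() -- which exchanges the two with
		 * interrupts disabled -- and then copy the now-quiescent
		 * dtb_xamot out to user space below.)
		 */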
15612 cached = buf->dtb_tomax;
15613 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
15614
15615 dtrace_xcall(desc.dtbd_cpu,
15616 (dtrace_xcall_t)dtrace_buffer_switch, buf);
15617
15618 state->dts_errors += buf->dtb_xamot_errors;
15619
15620 /*
15621 * If the buffers did not actually switch, then the cross call
15622 * did not take place -- presumably because the given CPU is
15623 * not in the ready set. If this is the case, we'll return
15624 * ENOENT.
15625 */
15626 if (buf->dtb_tomax == cached) {
15627 ASSERT(buf->dtb_xamot != cached);
15628 mutex_exit(&dtrace_lock);
15629 return (ENOENT);
15630 }
15631
15632 ASSERT(cached == buf->dtb_xamot);
15633
15634 /*
15635 * We have our snapshot; now copy it out.
15636 */
15637 if (copyout(buf->dtb_xamot, desc.dtbd_data,
15638 buf->dtb_xamot_offset) != 0) {
15639 mutex_exit(&dtrace_lock);
15640 return (EFAULT);
15641 }
15642
15643 desc.dtbd_size = buf->dtb_xamot_offset;
15644 desc.dtbd_drops = buf->dtb_xamot_drops;
15645 desc.dtbd_errors = buf->dtb_xamot_errors;
15646 desc.dtbd_oldest = 0;
15647
15648 mutex_exit(&dtrace_lock);
15649
15650 /*
15651 * Finally, copy out the buffer description.
15652 */
15653 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
15654 return (EFAULT);
15655
15656 return (0);
15657 }
15658
15659 case DTRACEIOC_CONF: {
15660 dtrace_conf_t conf;
15661
15662 bzero(&conf, sizeof (conf));
15663 conf.dtc_difversion = DIF_VERSION;
15664 conf.dtc_difintregs = DIF_DIR_NREGS;
15665 conf.dtc_diftupregs = DIF_DTR_NREGS;
15666 conf.dtc_ctfmodel = CTF_MODEL_NATIVE;
15667
15668 if (copyout(&conf, (void *)arg, sizeof (conf)) != 0)
15669 return (EFAULT);
15670
15671 return (0);
15672 }
15673
15674 case DTRACEIOC_STATUS: {
15675 dtrace_status_t stat;
15676 dtrace_dstate_t *dstate;
15677 int i, j;
15678 uint64_t nerrs;
15679
15680 /*
15681 * See the comment in dtrace_state_deadman() for the reason
15682 * for setting dts_laststatus to INT64_MAX before setting
15683 * it to the correct value.
15684 */
15685 state->dts_laststatus = INT64_MAX;
15686 dtrace_membar_producer();
15687 state->dts_laststatus = dtrace_gethrtime();
15688
15689 bzero(&stat, sizeof (stat));
15690
15691 mutex_enter(&dtrace_lock);
15692
15693 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
15694 mutex_exit(&dtrace_lock);
15695 return (ENOENT);
15696 }
15697
15698 if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
15699 stat.dtst_exiting = 1;
15700
15701 nerrs = state->dts_errors;
15702 dstate = &state->dts_vstate.dtvs_dynvars;
15703
15704 for (i = 0; i < NCPU; i++) {
15705 dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];
15706
15707 stat.dtst_dyndrops += dcpu->dtdsc_drops;
15708 stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
15709 stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;
15710
15711 if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
15712 stat.dtst_filled++;
15713
15714 nerrs += state->dts_buffer[i].dtb_errors;
15715
15716 for (j = 0; j < state->dts_nspeculations; j++) {
15717 dtrace_speculation_t *spec;
15718 dtrace_buffer_t *buf;
15719
15720 spec = &state->dts_speculations[j];
15721 buf = &spec->dtsp_buffer[i];
15722 stat.dtst_specdrops += buf->dtb_xamot_drops;
15723 }
15724 }
15725
15726 stat.dtst_specdrops_busy = state->dts_speculations_busy;
15727 stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
15728 stat.dtst_stkstroverflows = state->dts_stkstroverflows;
15729 stat.dtst_dblerrors = state->dts_dblerrors;
15730 stat.dtst_killed =
15731 (state->dts_activity == DTRACE_ACTIVITY_KILLED);
15732 stat.dtst_errors = nerrs;
15733
15734 mutex_exit(&dtrace_lock);
15735
15736 if (copyout(&stat, (void *)arg, sizeof (stat)) != 0)
15737 return (EFAULT);
15738
15739 return (0);
15740 }
15741
15742 case DTRACEIOC_FORMAT: {
15743 dtrace_fmtdesc_t fmt;
15744 char *str;
15745 int len;
15746
15747 if (copyin((void *)arg, &fmt, sizeof (fmt)) != 0)
15748 return (EFAULT);
15749
15750 mutex_enter(&dtrace_lock);
15751
15752 if (fmt.dtfd_format == 0 ||
15753 fmt.dtfd_format > state->dts_nformats) {
15754 mutex_exit(&dtrace_lock);
15755 return (EINVAL);
15756 }
15757
15758 /*
15759 * Format strings are allocated contiguously and they are
15760 * never freed; if a format index is less than the number
15761 * of formats, we can assert that the format map is non-NULL
15762 * and that the format for the specified index is non-NULL.
15763 */
15764 ASSERT(state->dts_formats != NULL);
15765 str = state->dts_formats[fmt.dtfd_format - 1];
15766 ASSERT(str != NULL);
15767
15768 len = VBDTCAST(int)strlen(str) + 1;
15769
15770 if (len > fmt.dtfd_length) {
15771 fmt.dtfd_length = len;
15772
15773 if (copyout(&fmt, (void *)arg, sizeof (fmt)) != 0) {
15774 mutex_exit(&dtrace_lock);
15775 return (EINVAL);
15776 }
15777 } else {
15778 if (copyout(str, fmt.dtfd_string, len) != 0) {
15779 mutex_exit(&dtrace_lock);
15780 return (EINVAL);
15781 }
15782 }
15783
15784 mutex_exit(&dtrace_lock);
15785 return (0);
15786 }
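
	/*
	 * The format protocol above is two-pass: if the supplied buffer is
	 * too short, the ioctl nonetheless succeeds, rewriting dtfd_length
	 * with the required size so the caller can retry. (Note that
	 * copyout failures on this path are reported as EINVAL rather than
	 * the customary EFAULT.) A hypothetical consumer-side sketch:
	 *
	 *	dtrace_fmtdesc_t fmt;
	 *
	 *	bzero(&fmt, sizeof (fmt));
	 *	fmt.dtfd_format = format;			(1-based index)
	 *
	 *	(void) ioctl(fd, DTRACEIOC_FORMAT, &fmt);	(learn length)
	 *	fmt.dtfd_string = malloc(fmt.dtfd_length);
	 *	(void) ioctl(fd, DTRACEIOC_FORMAT, &fmt);	(fetch string)
	 */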
15787
15788 default:
15789 break;
15790 }
15791
15792 return (ENOTTY);
15793}
15794
15795/*ARGSUSED*/
15796static int
15797dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
15798{
15799 dtrace_state_t *state;
15800
15801 switch (cmd) {
15802 case DDI_DETACH:
15803 break;
15804
15805 case DDI_SUSPEND:
15806 return (DDI_SUCCESS);
15807
15808 default:
15809 return (DDI_FAILURE);
15810 }
15811
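	/*
	 * Lock ordering here matches the rest of the framework: cpu_lock
	 * is acquired before dtrace_provider_lock, which is acquired
	 * before dtrace_lock.
	 */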
15812 mutex_enter(&cpu_lock);
15813 mutex_enter(&dtrace_provider_lock);
15814 mutex_enter(&dtrace_lock);
15815
15816 ASSERT(dtrace_opens == 0);
15817
15818 if (dtrace_helpers > 0) {
15819 mutex_exit(&dtrace_provider_lock);
15820 mutex_exit(&dtrace_lock);
15821 mutex_exit(&cpu_lock);
15822 return (DDI_FAILURE);
15823 }
15824
15825 if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) {
15826 mutex_exit(&dtrace_provider_lock);
15827 mutex_exit(&dtrace_lock);
15828 mutex_exit(&cpu_lock);
15829 return (DDI_FAILURE);
15830 }
15831
15832 dtrace_provider = NULL;
15833
15834 if ((state = dtrace_anon_grab()) != NULL) {
15835 /*
15836		 * If there were ECBs on this state, the provider should
15837		 * not have been allowed to detach; assert that there
15838		 * are none.
15839 */
15840 ASSERT(state->dts_necbs == 0);
15841 dtrace_state_destroy(state);
15842
15843#ifndef VBOX
15844 /*
15845 * If we're being detached with anonymous state, we need to
15846 * indicate to the kernel debugger that DTrace is now inactive.
15847 */
15848 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
15849#endif
15850 }
15851
15852 bzero(&dtrace_anon, sizeof (dtrace_anon_t));
15853#ifndef VBOX /** @todo CPU hooks */
15854 unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
15855#endif
15856 dtrace_cpu_init = NULL;
15857 dtrace_helpers_cleanup = NULL;
15858 dtrace_helpers_fork = NULL;
15859 dtrace_cpustart_init = NULL;
15860 dtrace_cpustart_fini = NULL;
15861 dtrace_debugger_init = NULL;
15862 dtrace_debugger_fini = NULL;
15863 dtrace_modload = NULL;
15864 dtrace_modunload = NULL;
15865
15866 mutex_exit(&cpu_lock);
15867
15868 if (dtrace_helptrace_enabled) {
15869 kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize);
15870 dtrace_helptrace_buffer = NULL;
15871 }
15872
15873 kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
15874 dtrace_probes = NULL;
15875 dtrace_nprobes = 0;
15876
15877 dtrace_hash_destroy(dtrace_bymod);
15878 dtrace_hash_destroy(dtrace_byfunc);
15879 dtrace_hash_destroy(dtrace_byname);
15880 dtrace_bymod = NULL;
15881 dtrace_byfunc = NULL;
15882 dtrace_byname = NULL;
15883
15884 kmem_cache_destroy(dtrace_state_cache);
15885 vmem_destroy(dtrace_minor);
15886 vmem_destroy(dtrace_arena);
15887
15888 if (dtrace_toxrange != NULL) {
15889 kmem_free(dtrace_toxrange,
15890 dtrace_toxranges_max * sizeof (dtrace_toxrange_t));
15891 dtrace_toxrange = NULL;
15892 dtrace_toxranges = 0;
15893 dtrace_toxranges_max = 0;
15894 }
15895
15896#ifndef VBOX
15897 ddi_remove_minor_node(dtrace_devi, NULL);
15898#endif
15899 dtrace_devi = NULL;
15900
15901 ddi_soft_state_fini(&dtrace_softstate);
15902
15903 ASSERT(dtrace_vtime_references == 0);
15904 ASSERT(dtrace_opens == 0);
15905 ASSERT(dtrace_retained == NULL);
15906
15907 mutex_exit(&dtrace_lock);
15908 mutex_exit(&dtrace_provider_lock);
15909
15910 /*
15911 * We don't destroy the task queue until after we have dropped our
15912 * locks (taskq_destroy() may block on running tasks). To prevent
15913 * attempting to do work after we have effectively detached but before
15914 * the task queue has been destroyed, all tasks dispatched via the
15915 * task queue must check that DTrace is still attached before
15916 * performing any operation.
15917 */
15918#ifndef VBOX
15919 taskq_destroy(dtrace_taskq);
15920 dtrace_taskq = NULL;
15921#endif
15922
15923 return (DDI_SUCCESS);
15924}
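
/*
 * A minimal sketch of the "check before work" discipline that the comment
 * above prescribes for tasks dispatched via dtrace_taskq. The helper name
 * and the exact check are hypothetical; dtrace_devi being NULL is one way
 * to observe that dtrace_detach() has completed:
 *
 *	static void
 *	dtrace_example_task(void *arg)
 *	{
 *		mutex_enter(&dtrace_lock);
 *		if (dtrace_devi == NULL) {
 *			mutex_exit(&dtrace_lock);
 *			return;
 *		}
 *		... perform the deferred work under dtrace_lock ...
 *		mutex_exit(&dtrace_lock);
 *	}
 */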
15925
15926#ifndef VBOX
15927/*ARGSUSED*/
15928static int
15929dtrace_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
15930{
15931 int error;
15932
15933 switch (infocmd) {
15934 case DDI_INFO_DEVT2DEVINFO:
15935 *result = (void *)dtrace_devi;
15936 error = DDI_SUCCESS;
15937 break;
15938 case DDI_INFO_DEVT2INSTANCE:
15939 *result = (void *)0;
15940 error = DDI_SUCCESS;
15941 break;
15942 default:
15943 error = DDI_FAILURE;
15944 }
15945 return (error);
15946}
15947
15948static struct cb_ops dtrace_cb_ops = {
15949 dtrace_open, /* open */
15950 dtrace_close, /* close */
15951 nulldev, /* strategy */
15952 nulldev, /* print */
15953 nodev, /* dump */
15954 nodev, /* read */
15955 nodev, /* write */
15956 dtrace_ioctl, /* ioctl */
15957 nodev, /* devmap */
15958 nodev, /* mmap */
15959 nodev, /* segmap */
15960 nochpoll, /* poll */
15961 ddi_prop_op, /* cb_prop_op */
15962 0, /* streamtab */
15963 D_NEW | D_MP /* Driver compatibility flag */
15964};
15965
15966static struct dev_ops dtrace_ops = {
15967 DEVO_REV, /* devo_rev */
15968 0, /* refcnt */
15969 dtrace_info, /* get_dev_info */
15970 nulldev, /* identify */
15971 nulldev, /* probe */
15972 dtrace_attach, /* attach */
15973 dtrace_detach, /* detach */
15974 nodev, /* reset */
15975 &dtrace_cb_ops, /* driver operations */
15976 NULL, /* bus operations */
15977 nodev, /* dev power */
15978 ddi_quiesce_not_needed, /* quiesce */
15979};
15980
15981static struct modldrv modldrv = {
15982 &mod_driverops, /* module type (this is a pseudo driver) */
15983 "Dynamic Tracing", /* name of module */
15984 &dtrace_ops, /* driver ops */
15985};
15986
15987static struct modlinkage modlinkage = {
15988 MODREV_1,
15989 (void *)&modldrv,
15990 NULL
15991};
15992
15993int
15994_init(void)
15995{
15996 return (mod_install(&modlinkage));
15997}
15998
15999int
16000_info(struct modinfo *modinfop)
16001{
16002 return (mod_info(&modlinkage, modinfop));
16003}
16004
16005int
16006_fini(void)
16007{
16008 return (mod_remove(&modlinkage));
16009}
16010
16011#endif /* !VBOX */