VirtualBox

source: vbox/trunk/src/VBox/ExtPacks/VBoxDTrace/onnv/uts/common/dtrace/dtrace.c@53642

Last change on this file since 53642 was 53642, checked in by vboxsync, 10 years ago

VBoxDTrace: Made dtrace.c compile on linux. (r12)

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 399.1 KB
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * DTrace - Dynamic Tracing for Solaris
 *
 * This is the implementation of the Solaris Dynamic Tracing framework
 * (DTrace).  The user-visible interface to DTrace is described at length in
 * the "Solaris Dynamic Tracing Guide".  The interfaces between the libdtrace
 * library, the in-kernel DTrace framework, and the DTrace providers are
 * described in the block comments in the <sys/dtrace.h> header file.  The
 * internal architecture of DTrace is described in the block comments in the
 * <sys/dtrace_impl.h> header file.  The comments contained within the DTrace
 * implementation very much assume mastery of all of these sources; if one has
 * an unanswered question about the implementation, one should consult them
 * first.
 *
 * The functions here are ordered roughly as follows:
 *
 *   - Probe context functions
 *   - Probe hashing functions
 *   - Non-probe context utility functions
 *   - Matching functions
 *   - Provider-to-Framework API functions
 *   - Probe management functions
 *   - DIF object functions
 *   - Format functions
 *   - Predicate functions
 *   - ECB functions
 *   - Buffer functions
 *   - Enabling functions
 *   - DOF functions
 *   - Anonymous enabling functions
 *   - Consumer state functions
 *   - Helper functions
 *   - Hook functions
 *   - Driver cookbook functions
 *
 * Each group of functions begins with a block comment labelled the "DTrace
 * [Group] Functions", allowing one to find each block by searching forward
 * on capital-f functions.
 */
#ifndef VBOX
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/systm.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cpuvar.h>
#include <sys/kmem.h>
#include <sys/strsubr.h>
#include <sys/sysmacros.h>
#include <sys/dtrace_impl.h>
#include <sys/atomic.h>
#include <sys/cmn_err.h>
#include <sys/mutex_impl.h>
#include <sys/rwlock_impl.h>
#include <sys/ctf_api.h>
#include <sys/panic.h>
#include <sys/priv_impl.h>
#include <sys/policy.h>
#include <sys/cred_impl.h>
#include <sys/procfs_isa.h>
#include <sys/taskq.h>
#include <sys/mkdev.h>
#include <sys/kdi.h>
#include <sys/zone.h>
#include <sys/socket.h>
#include <netinet/in.h>

#else /* VBOX */
# include <sys/dtrace_impl.h>
# include <iprt/assert.h>
# include <iprt/cpuset.h>
# include <iprt/mp.h>
# include <iprt/string.h>
# include <iprt/process.h>
# include <iprt/thread.h>
# include <iprt/timer.h>
# include <limits.h>

# undef NULL
# define NULL (0)
#endif /* VBOX */

/*
 * DTrace Tunable Variables
 *
 * The following variables may be tuned by adding a line to /etc/system that
 * includes both the name of the DTrace module ("dtrace") and the name of the
 * variable.  For example:
 *
 *   set dtrace:dtrace_destructive_disallow = 1
 *
 * In general, the only variables that one should be tuning this way are those
 * that affect system-wide DTrace behavior, and for which the default behavior
 * is undesirable.  Most of these variables are tunable on a per-consumer
 * basis using DTrace options, and need not be tuned on a system-wide basis.
 * When tuning these variables, avoid pathological values; while some attempt
 * is made to verify the integrity of these variables, they are not considered
 * part of the supported interface to DTrace, and they are therefore not
 * checked comprehensively.  Further, these variables should not be tuned
 * dynamically via "mdb -kw" or other means; they should only be tuned via
 * /etc/system.
 */
int dtrace_destructive_disallow = 0;
dtrace_optval_t dtrace_nonroot_maxsize = (16 * 1024 * 1024);
size_t dtrace_difo_maxsize = (256 * 1024);
dtrace_optval_t dtrace_dof_maxsize = (256 * 1024);
size_t dtrace_global_maxsize = (16 * 1024);
size_t dtrace_actions_max = (16 * 1024);
size_t dtrace_retain_max = 1024;
dtrace_optval_t dtrace_helper_actions_max = 32;
dtrace_optval_t dtrace_helper_providers_max = 32;
dtrace_optval_t dtrace_dstate_defsize = (1 * 1024 * 1024);
size_t dtrace_strsize_default = 256;
dtrace_optval_t dtrace_cleanrate_default = 9900990; /* 101 hz */
dtrace_optval_t dtrace_cleanrate_min = 200000; /* 5000 hz */
dtrace_optval_t dtrace_cleanrate_max = (uint64_t)60 * NANOSEC; /* 1/minute */
dtrace_optval_t dtrace_aggrate_default = NANOSEC; /* 1 hz */
dtrace_optval_t dtrace_statusrate_default = NANOSEC; /* 1 hz */
dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC; /* 6/minute */
dtrace_optval_t dtrace_switchrate_default = NANOSEC; /* 1 hz */
dtrace_optval_t dtrace_nspec_default = 1;
dtrace_optval_t dtrace_specsize_default = 32 * 1024;
dtrace_optval_t dtrace_stackframes_default = 20;
dtrace_optval_t dtrace_ustackframes_default = 20;
dtrace_optval_t dtrace_jstackframes_default = 50;
dtrace_optval_t dtrace_jstackstrsize_default = 512;
int dtrace_msgdsize_max = 128;
hrtime_t dtrace_chill_max = 500 * (NANOSEC / MILLISEC); /* 500 ms */
hrtime_t dtrace_chill_interval = NANOSEC; /* 1000 ms */
int dtrace_devdepth_max = 32;
int dtrace_err_verbose;
hrtime_t dtrace_deadman_interval = NANOSEC;
hrtime_t dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
hrtime_t dtrace_deadman_user = (hrtime_t)30 * NANOSEC;

/*
 * DTrace External Variables
 *
 * As dtrace(7D) is a kernel module, any DTrace variables are obviously
 * available to DTrace consumers via the backtick (`) syntax.  One of these,
 * dtrace_zero, is made deliberately so: it is provided as a source of
 * well-known, zero-filled memory.  While this variable is not documented,
 * it is used by some translators as an implementation detail.
 */
const char dtrace_zero[256] = { 0 }; /* zero-filled memory */

/*
 * DTrace Internal Variables
 */
static dev_info_t *dtrace_devi; /* device info */
static vmem_t *dtrace_arena; /* probe ID arena */
static vmem_t *dtrace_minor; /* minor number arena */
#ifndef VBOX
static taskq_t *dtrace_taskq; /* task queue */
#endif
static dtrace_probe_t **dtrace_probes; /* array of all probes */
static VBDTTYPE(uint32_t,int) dtrace_nprobes; /* number of probes */
static dtrace_provider_t *dtrace_provider; /* provider list */
static dtrace_meta_t *dtrace_meta_pid; /* user-land meta provider */
static int dtrace_opens; /* number of opens */
static int dtrace_helpers; /* number of helpers */
static void *dtrace_softstate; /* softstate pointer */
static dtrace_hash_t *dtrace_bymod; /* probes hashed by module */
static dtrace_hash_t *dtrace_byfunc; /* probes hashed by function */
static dtrace_hash_t *dtrace_byname; /* probes hashed by name */
static dtrace_toxrange_t *dtrace_toxrange; /* toxic range array */
static int dtrace_toxranges; /* number of toxic ranges */
static int dtrace_toxranges_max; /* size of toxic range array */
static dtrace_anon_t dtrace_anon; /* anonymous enabling */
static kmem_cache_t *dtrace_state_cache; /* cache for dynamic state */
static uint64_t dtrace_vtime_references; /* number of vtimestamp refs */
static kthread_t *dtrace_panicked; /* panicking thread */
static dtrace_ecb_t *dtrace_ecb_create_cache; /* cached created ECB */
static dtrace_genid_t dtrace_probegen; /* current probe generation */
static dtrace_helpers_t *dtrace_deferred_pid; /* deferred helper list */
static dtrace_enabling_t *dtrace_retained; /* list of retained enablings */
static dtrace_genid_t dtrace_retained_gen; /* current retained enab gen */
static dtrace_dynvar_t dtrace_dynhash_sink; /* end of dynamic hash chains */
static int dtrace_dynvar_failclean; /* dynvars failed to clean */

/*
 * DTrace Locking
 * DTrace is protected by three (relatively coarse-grained) locks:
 *
 * (1) dtrace_lock is required to manipulate essentially any DTrace state,
 *     including enabling state, probes, ECBs, consumer state, helper state,
 *     etc.  Importantly, dtrace_lock is _not_ required when in probe context;
 *     probe context is lock-free -- synchronization is handled via the
 *     dtrace_sync() cross call mechanism.
 *
 * (2) dtrace_provider_lock is required when manipulating provider state, or
 *     when provider state must be held constant.
 *
 * (3) dtrace_meta_lock is required when manipulating meta provider state, or
 *     when meta provider state must be held constant.
 *
 * The lock ordering between these three locks is dtrace_meta_lock before
 * dtrace_provider_lock before dtrace_lock.  (In particular, there are
 * several places where dtrace_provider_lock is held by the framework as it
 * calls into the providers -- which then call back into the framework,
 * grabbing dtrace_lock.)
 *
 * There are two other locks in the mix: mod_lock and cpu_lock.  With respect
 * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
 * role as a coarse-grained lock; it is acquired before both of these locks.
 * With respect to dtrace_meta_lock, its behavior is stranger: cpu_lock must
 * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
 * mod_lock is similar with respect to dtrace_provider_lock in that it must be
 * acquired _between_ dtrace_provider_lock and dtrace_lock.
 */
static kmutex_t dtrace_lock; /* probe state lock */
static kmutex_t dtrace_provider_lock; /* provider state lock */
static kmutex_t dtrace_meta_lock; /* meta-provider state lock */

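/*
 * As an illustration of the ordering rules above, a hypothetical code path
 * that needed all five locks at once would have to take them in this order
 * (a minimal sketch; no real code path is claimed to take all five):
 */
#if 0 /* illustrative only */
static void
dtrace_lock_order_sketch(void)
{
    mutex_enter(&dtrace_meta_lock);     /* outermost DTrace lock */
    mutex_enter(&cpu_lock);             /* between the meta lock and the rest */
    mutex_enter(&dtrace_provider_lock);
    mutex_enter(&mod_lock);             /* between provider lock and dtrace_lock */
    mutex_enter(&dtrace_lock);          /* innermost */

    /* ... manipulate DTrace state ... */

    mutex_exit(&dtrace_lock);
    mutex_exit(&mod_lock);
    mutex_exit(&dtrace_provider_lock);
    mutex_exit(&cpu_lock);
    mutex_exit(&dtrace_meta_lock);
}
#endif
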
/*
 * DTrace Provider Variables
 *
 * These are the variables relating to DTrace as a provider (that is, the
 * provider of the BEGIN, END, and ERROR probes).
 */
static dtrace_pattr_t dtrace_provider_attr = {
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
};

static void
dtrace_nullop(void)
{}

static int
dtrace_enable_nullop(void)
{
    return (0);
}

static dtrace_pops_t dtrace_provider_ops = {
    (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop,
    (void (*)(void *, struct modctl *))dtrace_nullop,
    (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop,
    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
    NULL,
    NULL,
    NULL,
    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop
};

static dtrace_id_t dtrace_probeid_begin; /* special BEGIN probe */
static dtrace_id_t dtrace_probeid_end; /* special END probe */
dtrace_id_t dtrace_probeid_error; /* special ERROR probe */

/*
 * DTrace Helper Tracing Variables
 */
uint32_t dtrace_helptrace_next = 0;
uint32_t dtrace_helptrace_nlocals;
char *dtrace_helptrace_buffer;
int dtrace_helptrace_bufsize = 512 * 1024;

#ifdef DEBUG
int dtrace_helptrace_enabled = 1;
#else
int dtrace_helptrace_enabled = 0;
#endif

/*
 * DTrace Error Hashing
 *
 * On DEBUG kernels, DTrace will track the errors that it has seen in a hash
 * table.  This is very useful for checking coverage of tests that are
 * expected to induce DIF or DOF processing errors, and may be useful for
 * debugging problems in the DIF code generator or in DOF generation.  The
 * error hash may be examined with the ::dtrace_errhash MDB dcmd.
 */
#ifdef DEBUG
static dtrace_errhash_t dtrace_errhash[DTRACE_ERRHASHSZ];
static const char *dtrace_errlast;
static kthread_t *dtrace_errthread;
static kmutex_t dtrace_errlock;
#endif

/*
 * DTrace Macros and Constants
 *
 * These are various macros that are useful in various spots in the
 * implementation, along with a few random constants that have no meaning
 * outside of the implementation.  There is no real structure to this cpp
 * mishmash -- but is there ever?
 */
#define DTRACE_HASHSTR(hash, probe) \
    dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs)))

#define DTRACE_HASHNEXT(hash, probe) \
    (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs)

#define DTRACE_HASHPREV(hash, probe) \
    (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs)

#define DTRACE_HASHEQ(hash, lhs, rhs) \
    (strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \
        *((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0)

#define DTRACE_AGGHASHSIZE_SLEW 17

#define DTRACE_V4MAPPED_OFFSET (sizeof (uint32_t) * 3)

/*
 * The key for a thread-local variable consists of the lower 61 bits of the
 * t_did, plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
 * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
 * equal to a variable identifier.  This is necessary (but not sufficient) to
 * assure that global associative arrays never collide with thread-local
 * variables.  To guarantee that they cannot collide, we must also define the
 * order for keying dynamic variables.  That order is:
 *
 *   [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
 *
 * Because the variable-key and the tls-key are in orthogonal spaces, there is
 * no way for a global variable key signature to match a thread-local key
 * signature.
 */
#ifndef VBOX
#define DTRACE_TLS_THRKEY(where) { \
    uint_t intr = 0; \
    uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \
    for (; actv; actv >>= 1) \
        intr++; \
    ASSERT(intr < (1 << 3)); \
    (where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \
        (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
}
#else
#define DTRACE_TLS_THRKEY(where) do { \
    (where) = (((uintptr_t)RTThreadNativeSelf() + DIF_VARIABLE_MAX) & (RT_BIT_64(61) - 1)) \
            | (RTThreadIsInInterrupt(NIL_RTTHREAD) ? RT_BIT_64(61) : 0); \
} while (0)
#endif

#define DT_BSWAP_8(x)  ((x) & 0xff)
#define DT_BSWAP_16(x) ((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
#define DT_BSWAP_32(x) ((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
#define DT_BSWAP_64(x) ((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))

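/*
 * These macros swap the bytes of each half and then exchange the halves.
 * For example (a sketch, assuming a hosted environment for the assertion):
 */
#if 0 /* illustrative only */
#include <assert.h>
#include <stdint.h>
static void
dt_bswap_example(void)
{
    uint64_t v = 0x11223344;    /* 64-bit type avoids shifting into a sign bit */

    assert(DT_BSWAP_16(0x1122) == 0x2211);
    assert(DT_BSWAP_32(v) == 0x44332211);
}
#endif
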
#define DT_MASK_LO 0x00000000FFFFFFFFULL

#define DTRACE_STORE(type, tomax, offset, what) \
    *((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);

#ifndef __i386
#define DTRACE_ALIGNCHECK(addr, size, flags) \
    if (addr & (size - 1)) { \
        *flags |= CPU_DTRACE_BADALIGN; \
        cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_illval = addr; \
        return (0); \
    }
#else
#define DTRACE_ALIGNCHECK(addr, size, flags)
#endif

/*
 * Test whether a range of memory starting at testaddr of size testsz falls
 * within the range of memory described by addr, sz.  We take care to avoid
 * problems with overflow and underflow of the unsigned quantities, and
 * disallow all negative sizes.  Ranges of size 0 are allowed.
 */
#define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
    ((testaddr) - (baseaddr) < (basesz) && \
    (testaddr) + (testsz) - (baseaddr) <= (basesz) && \
    (testaddr) + (testsz) >= (testaddr))

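/*
 * The unsigned formulation matters: a naive check along the lines of
 * "testaddr >= baseaddr && testaddr + testsz <= baseaddr + basesz" can be
 * defeated by wrap-around.  A sketch (hosted environment assumed):
 */
#if 0 /* illustrative only */
#include <assert.h>
#include <stdint.h>
static void
dtrace_inrange_example(void)
{
    uint64_t base = 0x1000, size = 0x100;

    /* A range that wraps: bad + badsz overflows to 1. */
    uint64_t bad = UINT64_MAX, badsz = 2;

    assert(DTRACE_INRANGE(base + 0x10, 0x20, base, size)); /* inside: accepted */
    assert(!DTRACE_INRANGE(bad, badsz, base, size));       /* wrapped: rejected */

    /* The naive check would wrongly accept the wrapped range. */
    assert(bad >= base && bad + badsz <= base + size);
}
#endif
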
/*
 * Test whether alloc_sz bytes will fit in the scratch region.  We isolate
 * alloc_sz on the righthand side of the comparison in order to avoid overflow
 * or underflow in the comparison with it.  This is simpler than the INRANGE
 * check above, because we know that the dtms_scratch_ptr is valid in the
 * range.  Allocations of size zero are allowed.
 */
#define DTRACE_INSCRATCH(mstate, alloc_sz) \
    ((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
    (mstate)->dtms_scratch_ptr >= (alloc_sz))

#define DTRACE_LOADFUNC(bits) \
/*CSTYLED*/ \
VBDTSTATIC uint##bits##_t \
dtrace_load##bits(uintptr_t addr) \
{ \
    size_t size = bits / NBBY; \
    /*CSTYLED*/ \
    uint##bits##_t rval; \
    int i; \
    processorid_t me = VBDT_GET_CPUID(); \
    volatile uint16_t *flags = (volatile uint16_t *) \
        &cpu_core[me].cpuc_dtrace_flags; \
\
    DTRACE_ALIGNCHECK(addr, size, flags); \
\
    for (i = 0; i < dtrace_toxranges; i++) { \
        if (addr >= dtrace_toxrange[i].dtt_limit) \
            continue; \
\
        if (addr + size <= dtrace_toxrange[i].dtt_base) \
            continue; \
\
        /* \
         * This address falls within a toxic region; return 0. \
         */ \
        *flags |= CPU_DTRACE_BADADDR; \
        cpu_core[me].cpuc_dtrace_illval = addr; \
        return (0); \
    } \
\
    *flags |= CPU_DTRACE_NOFAULT; \
    /*CSTYLED*/ \
    rval = *((volatile uint##bits##_t *)addr); \
    *flags &= ~CPU_DTRACE_NOFAULT; \
\
    return (!(*flags & CPU_DTRACE_FAULT) ? rval : 0); \
}

#ifdef _LP64
#define dtrace_loadptr dtrace_load64
#else
#define dtrace_loadptr dtrace_load32
#endif

#define DTRACE_DYNHASH_FREE 0
#define DTRACE_DYNHASH_SINK 1
#define DTRACE_DYNHASH_VALID 2

#define DTRACE_MATCH_FAIL -1
#define DTRACE_MATCH_NEXT 0
#define DTRACE_MATCH_DONE 1
#define DTRACE_ANCHORED(probe) ((probe)->dtpr_func[0] != '\0')
#define DTRACE_STATE_ALIGN 64

#define DTRACE_FLAGS2FLT(flags) \
    (((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR : \
    ((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP : \
    ((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO : \
    ((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV : \
    ((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV : \
    ((flags) & CPU_DTRACE_TUPOFLOW) ? DTRACEFLT_TUPOFLOW : \
    ((flags) & CPU_DTRACE_BADALIGN) ? DTRACEFLT_BADALIGN : \
    ((flags) & CPU_DTRACE_NOSCRATCH) ? DTRACEFLT_NOSCRATCH : \
    ((flags) & CPU_DTRACE_BADSTACK) ? DTRACEFLT_BADSTACK : \
    DTRACEFLT_UNKNOWN)

#define DTRACEACT_ISSTRING(act) \
    ((act)->dta_kind == DTRACEACT_DIFEXPR && \
    (act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)

static size_t dtrace_strlen(const char *, size_t);
static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
static void dtrace_enabling_provide(dtrace_provider_t *);
static int dtrace_enabling_match(dtrace_enabling_t *, int *);
static void dtrace_enabling_matchall(void);
static dtrace_state_t *dtrace_anon_grab(void);
static uint64_t dtrace_helper(int, dtrace_mstate_t *,
    dtrace_state_t *, uint64_t, uint64_t);
static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
static void dtrace_buffer_drop(dtrace_buffer_t *);
static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
    dtrace_state_t *, dtrace_mstate_t *);
static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
    dtrace_optval_t);
static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);
static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);

/*
 * DTrace Probe Context Functions
 *
 * These functions are called from probe context.  Because probe context is
 * any context in which C may be called, arbitrary locks may be held,
 * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
 * As a result, functions called from probe context may only call other DTrace
 * support functions -- they may not interact at all with the system at large.
 * (Note that the ASSERT macro is made probe-context safe by redefining it in
 * terms of dtrace_assfail(), a probe-context safe function.)  If arbitrary
 * loads are to be performed from probe context, they _must_ be in terms of
 * the safe dtrace_load*() variants.
 *
 * Some functions in this block are not actually called from probe context;
 * for these functions, there will be a comment above the function reading
 * "Note: not called from probe context."
 */
void
dtrace_panic(const char *format, ...)
{
    va_list alist;

    va_start(alist, format);
    dtrace_vpanic(format, alist);
    va_end(alist);
}

int
dtrace_assfail(const char *a, const char *f, int l)
{
    dtrace_panic("assertion failed: %s, file: %s, line: %d", a, f, l);

    /*
     * We just need something here that even the most clever compiler
     * cannot optimize away.
     */
    return (a[(uintptr_t)f]);
}

/*
 * Atomically increment a specified error counter from probe context.
 */
static void
dtrace_error(uint32_t *counter)
{
    /*
     * Most counters stored to in probe context are per-CPU counters.
     * However, there are some error conditions that are sufficiently
     * arcane that they don't merit per-CPU storage.  If these counters
     * are incremented concurrently on different CPUs, scalability will be
     * adversely affected -- but we don't expect them to be white-hot in a
     * correctly constructed enabling...
     */
    uint32_t oval, nval;

    do {
        oval = *counter;

        if ((nval = oval + 1) == 0) {
            /*
             * If the counter would wrap, set it to 1 -- assuring
             * that the counter is never zero when we have seen
             * errors.  (The counter must be 32-bits because we
             * aren't guaranteed a 64-bit compare&swap operation.)
             * To save this code both the infamy of being fingered
             * by a priggish news story and the indignity of being
             * the target of a neo-puritan witch trial, we're
             * carefully avoiding any colorful description of the
             * likelihood of this condition -- but suffice it to
             * say that it is only slightly more likely than the
             * overflow of predicate cache IDs, as discussed in
             * dtrace_predicate_create().
             */
            nval = 1;
        }
    } while (dtrace_cas32(counter, oval, nval) != oval);
}
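
/*
 * The same lock-free idiom, rendered as a user-land sketch with GCC-style
 * atomic builtins (for illustration only; the kernel uses dtrace_cas32
 * above):
 */
#if 0 /* illustrative only */
#include <stdint.h>
static void
error_count_sketch(uint32_t *counter)
{
    uint32_t oval, nval;

    do {
        oval = *counter;
        nval = oval + 1;
        if (nval == 0)
            nval = 1; /* never let a seen-error counter read zero */
    } while (__sync_val_compare_and_swap(counter, oval, nval) != oval);
}
#endif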

/*
 * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
 * uint8_t, a uint16_t, a uint32_t and a uint64_t.
 */
DTRACE_LOADFUNC(8)
DTRACE_LOADFUNC(16)
DTRACE_LOADFUNC(32)
DTRACE_LOADFUNC(64)

static int
dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
{
    if (dest < mstate->dtms_scratch_base)
        return (0);

    if (dest + size < dest)
        return (0);

    if (dest + size > mstate->dtms_scratch_ptr)
        return (0);

    return (1);
}

static int
dtrace_canstore_statvar(uint64_t addr, size_t sz,
    dtrace_statvar_t **svars, int nsvars)
{
    int i;

    for (i = 0; i < nsvars; i++) {
        dtrace_statvar_t *svar = svars[i];

        if (svar == NULL || svar->dtsv_size == 0)
            continue;

        if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size))
            return (1);
    }

    return (0);
}

/*
 * Check to see if the address is within a memory region to which a store may
 * be issued.  This includes the DTrace scratch areas, and any DTrace variable
 * region.  The caller of dtrace_canstore() is responsible for performing any
 * alignment checks that are needed before stores are actually executed.
 */
static int
dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
    /*
     * First, check to see if the address is in scratch space...
     */
    if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
        mstate->dtms_scratch_size))
        return (1);

    /*
     * Now check to see if it's a dynamic variable.  This check will pick
     * up both thread-local variables and any global dynamically-allocated
     * variables.
     */
    if (DTRACE_INRANGE(addr, sz, (uintptr_t)vstate->dtvs_dynvars.dtds_base,
        vstate->dtvs_dynvars.dtds_size)) {
        dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
        uintptr_t base = (uintptr_t)dstate->dtds_base +
            (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
        uintptr_t chunkoffs;

        /*
         * Before we assume that we can store here, we need to make
         * sure that it isn't in our metadata -- storing to our
         * dynamic variable metadata would corrupt our state.  For
         * the range to not include any dynamic variable metadata,
         * it must:
         *
         *  (1) Start above the hash table that is at the base of
         *      the dynamic variable space
         *
         *  (2) Have a starting chunk offset that is beyond the
         *      dtrace_dynvar_t that is at the base of every chunk
         *
         *  (3) Not span a chunk boundary
         *
         */
        if (addr < base)
            return (0);

        chunkoffs = (addr - base) % dstate->dtds_chunksize;

        if (chunkoffs < sizeof (dtrace_dynvar_t))
            return (0);

        if (chunkoffs + sz > dstate->dtds_chunksize)
            return (0);

        return (1);
    }

    /*
     * Finally, check the static local and global variables.  These checks
     * take the longest, so we perform them last.
     */
    if (dtrace_canstore_statvar(addr, sz,
        vstate->dtvs_locals, vstate->dtvs_nlocals))
        return (1);

    if (dtrace_canstore_statvar(addr, sz,
        vstate->dtvs_globals, vstate->dtvs_nglobals))
        return (1);

    return (0);
}


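/*
 * To make conditions (1)-(3) above concrete: with a (hypothetical)
 * dtds_chunksize of 256 bytes, a store to offset 300 past the post-hash-table
 * base has chunkoffs = 300 % 256 = 44.  It is permitted only if 44 is at
 * least sizeof (dtrace_dynvar_t) (so the chunk's own header is not
 * overwritten) and 44 + sz does not exceed 256 (so the store stays within
 * its chunk).
 */
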
/*
 * Convenience routine to check to see if the address is within a memory
 * region in which a load may be issued given the user's privilege level;
 * if not, it sets the appropriate error flags and loads 'addr' into the
 * illegal value slot.
 *
 * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
 * appropriate memory access protection.
 */
static int
dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
    volatile uintptr_t *illval = &cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_illval;

    /*
     * If we hold the privilege to read from kernel memory, then
     * everything is readable.
     */
    if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
        return (1);

    /*
     * You can obviously read that which you can store.
     */
    if (dtrace_canstore(addr, sz, mstate, vstate))
        return (1);

    /*
     * We're allowed to read from our own string table.
     */
    if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab,
        mstate->dtms_difo->dtdo_strlen))
        return (1);

    DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
    *illval = addr;
    return (0);
}

/*
 * Convenience routine to check to see if a given string is within a memory
 * region in which a load may be issued given the user's privilege level;
 * this exists so that we don't need to issue unnecessary dtrace_strlen()
 * calls in the event that the user has all privileges.
 */
static int
dtrace_strcanload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
    size_t strsz;

    /*
     * If we hold the privilege to read from kernel memory, then
     * everything is readable.
     */
    if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
        return (1);

    strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr, sz);
    if (dtrace_canload(addr, strsz, mstate, vstate))
        return (1);

    return (0);
}

/*
 * Convenience routine to check to see if a given variable is within a memory
 * region in which a load may be issued given the user's privilege level.
 */
static int
dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
    size_t sz;
    ASSERT(type->dtdt_flags & DIF_TF_BYREF);

    /*
     * If we hold the privilege to read from kernel memory, then
     * everything is readable.
     */
    if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
        return (1);

    if (type->dtdt_kind == DIF_TYPE_STRING)
        sz = dtrace_strlen(src,
            vstate->dtvs_state->dts_options[DTRACEOPT_STRSIZE]) + 1;
    else
        sz = type->dtdt_size;

    return (dtrace_canload((uintptr_t)src, sz, mstate, vstate));
}

/*
 * Compare two strings using safe loads.
 */
static int
dtrace_strncmp(char *s1, char *s2, size_t limit)
{
    uint8_t c1, c2;
    volatile uint16_t *flags;

    if (s1 == s2 || limit == 0)
        return (0);

    flags = (volatile uint16_t *)&cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags;

    do {
        if (s1 == NULL) {
            c1 = '\0';
        } else {
            c1 = dtrace_load8((uintptr_t)s1++);
        }

        if (s2 == NULL) {
            c2 = '\0';
        } else {
            c2 = dtrace_load8((uintptr_t)s2++);
        }

        if (c1 != c2)
            return (c1 - c2);
    } while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));

    return (0);
}

/*
 * Compute strlen(s) for a string using safe memory accesses.  The additional
 * lim parameter is used to specify a maximum length to ensure completion.
 */
static size_t
dtrace_strlen(const char *s, size_t lim)
{
    uint_t len;

    for (len = 0; len != lim; len++) {
        if (dtrace_load8((uintptr_t)s++) == '\0')
            break;
    }

    return (len);
}

/*
 * Check if an address falls within a toxic region.
 */
static int
dtrace_istoxic(uintptr_t kaddr, size_t size)
{
    uintptr_t taddr, tsize;
    int i;

    for (i = 0; i < dtrace_toxranges; i++) {
        taddr = dtrace_toxrange[i].dtt_base;
        tsize = dtrace_toxrange[i].dtt_limit - taddr;

        if (kaddr - taddr < tsize) {
            DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
            cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_illval = kaddr;
            return (1);
        }

        if (taddr - kaddr < size) {
            DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
            cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_illval = taddr;
            return (1);
        }
    }

    return (0);
}

/*
 * Copy src to dst using safe memory accesses.  The src is assumed to be unsafe
 * memory specified by the DIF program.  The dst is assumed to be safe memory
 * that we can store to directly because it is managed by DTrace.  As with
 * standard bcopy, overlapping copies are handled properly.
 */
static void
dtrace_bcopy(const void *src, void *dst, size_t len)
{
    if (len != 0) {
        uint8_t *s1 = dst;
        const uint8_t *s2 = src;

        if (s1 <= s2) {
            do {
                *s1++ = dtrace_load8((uintptr_t)s2++);
            } while (--len != 0);
        } else {
            s2 += len;
            s1 += len;

            do {
                *--s1 = dtrace_load8((uintptr_t)--s2);
            } while (--len != 0);
        }
    }
}

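/*
 * The direction test above is what makes overlap safe: copying four bytes
 * from address A to A + 1, for example, must run backward (highest byte
 * first), or the first byte stored would clobber source bytes that have not
 * yet been read.
 */
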
/*
 * Copy src to dst using safe memory accesses, up to either the specified
 * length, or the point that a nul byte is encountered.  The src is assumed to
 * be unsafe memory specified by the DIF program.  The dst is assumed to be
 * safe memory that we can store to directly because it is managed by DTrace.
 * Unlike dtrace_bcopy(), overlapping regions are not handled.
 */
static void
dtrace_strcpy(const void *src, void *dst, size_t len)
{
    if (len != 0) {
        uint8_t *s1 = dst, c;
        const uint8_t *s2 = src;

        do {
            *s1++ = c = dtrace_load8((uintptr_t)s2++);
        } while (--len != 0 && c != '\0');
    }
}

/*
 * Copy src to dst, deriving the size and type from the specified (BYREF)
 * variable type.  The src is assumed to be unsafe memory specified by the DIF
 * program.  The dst is assumed to be DTrace variable memory that is of the
 * specified type; we assume that we can store to directly.
 */
static void
dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type)
{
    ASSERT(type->dtdt_flags & DIF_TF_BYREF);

    if (type->dtdt_kind == DIF_TYPE_STRING) {
        dtrace_strcpy(src, dst, type->dtdt_size);
    } else {
        dtrace_bcopy(src, dst, type->dtdt_size);
    }
}

/*
 * Compare s1 to s2 using safe memory accesses.  The s1 data is assumed to be
 * unsafe memory specified by the DIF program.  The s2 data is assumed to be
 * safe memory that we can access directly because it is managed by DTrace.
 */
static int
dtrace_bcmp(const void *s1, const void *s2, size_t len)
{
    volatile uint16_t *flags;

    flags = (volatile uint16_t *)&cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags;

    if (s1 == s2)
        return (0);

    if (s1 == NULL || s2 == NULL)
        return (1);

    if (s1 != s2 && len != 0) {
        const uint8_t *ps1 = s1;
        const uint8_t *ps2 = s2;

        do {
            if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
                return (1);
        } while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
    }
    return (0);
}

/*
 * Zero the specified region using a simple byte-by-byte loop.  Note that this
 * is for safe DTrace-managed memory only.
 */
static void
dtrace_bzero(void *dst, size_t len)
{
    uchar_t *cp;

    for (cp = dst; len != 0; len--)
        *cp++ = 0;
}

static void
dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
{
    uint64_t result[2];

    result[0] = addend1[0] + addend2[0];
    result[1] = addend1[1] + addend2[1] +
        (result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0);

    sum[0] = result[0];
    sum[1] = result[1];
}

/*
 * Shift the 128-bit value in a by b.  If b is positive, shift left.
 * If b is negative, shift right.
 */
static void
dtrace_shift_128(uint64_t *a, int b)
{
    uint64_t mask;

    if (b == 0)
        return;

    if (b < 0) {
        b = -b;
        if (b >= 64) {
            a[0] = a[1] >> (b - 64);
            a[1] = 0;
        } else {
            a[0] >>= b;
            mask = 1LL << (64 - b);
            mask -= 1;
            a[0] |= ((a[1] & mask) << (64 - b));
            a[1] >>= b;
        }
    } else {
        if (b >= 64) {
            a[1] = a[0] << (b - 64);
            a[0] = 0;
        } else {
            a[1] <<= b;
            mask = a[0] >> (64 - b);
            a[1] |= mask;
            a[0] <<= b;
        }
    }
}

/*
 * The basic idea is to break the 2 64-bit values into 4 32-bit values,
 * use native multiplication on those, and then re-combine into the
 * resulting 128-bit value.
 *
 * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
 *     hi1 * hi2 << 64 +
 *     hi1 * lo2 << 32 +
 *     hi2 * lo1 << 32 +
 *     lo1 * lo2
 */
static void
dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
{
    uint64_t hi1, hi2, lo1, lo2;
    uint64_t tmp[2];

    hi1 = factor1 >> 32;
    hi2 = factor2 >> 32;

    lo1 = factor1 & DT_MASK_LO;
    lo2 = factor2 & DT_MASK_LO;

    product[0] = lo1 * lo2;
    product[1] = hi1 * hi2;

    tmp[0] = hi1 * lo2;
    tmp[1] = 0;
    dtrace_shift_128(tmp, 32);
    dtrace_add_128(product, tmp, product);

    tmp[0] = hi2 * lo1;
    tmp[1] = 0;
    dtrace_shift_128(tmp, 32);
    dtrace_add_128(product, tmp, product);
}

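/*
 * On compilers that provide a 128-bit integer type, the decomposition above
 * can be checked directly (a sketch, assuming GCC/Clang __int128 support):
 */
#if 0 /* illustrative only */
#include <assert.h>
static void
dtrace_multiply_128_check(uint64_t f1, uint64_t f2)
{
    uint64_t product[2];
    unsigned __int128 expect = (unsigned __int128)f1 * f2;

    dtrace_multiply_128(f1, f2, product);
    assert(product[0] == (uint64_t)expect);         /* low 64 bits */
    assert(product[1] == (uint64_t)(expect >> 64)); /* high 64 bits */
}
#endif
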
/*
 * This privilege check should be used by actions and subroutines to
 * verify that the user credentials of the process that enabled the
 * invoking ECB match the target credentials.
 */
static int
dtrace_priv_proc_common_user(dtrace_state_t *state)
{
    cred_t *cr, *s_cr = state->dts_cred.dcr_cred;

    /*
     * We should always have a non-NULL state cred here, since if cred
     * is null (anonymous tracing), we fast-path bypass this routine.
     */
    ASSERT(s_cr != NULL);

    if ((cr = CRED()) != NULL &&
        s_cr->cr_uid == cr->cr_uid &&
        s_cr->cr_uid == cr->cr_ruid &&
        s_cr->cr_uid == cr->cr_suid &&
        s_cr->cr_gid == cr->cr_gid &&
        s_cr->cr_gid == cr->cr_rgid &&
        s_cr->cr_gid == cr->cr_sgid)
        return (1);

    return (0);
}

/*
 * This privilege check should be used by actions and subroutines to
 * verify that the zone of the process that enabled the invoking ECB
 * matches the target credentials.
 */
static int
dtrace_priv_proc_common_zone(dtrace_state_t *state)
{
    cred_t *cr, *s_cr = state->dts_cred.dcr_cred;

    /*
     * We should always have a non-NULL state cred here, since if cred
     * is null (anonymous tracing), we fast-path bypass this routine.
     */
    ASSERT(s_cr != NULL);

    if ((cr = CRED()) != NULL &&
        s_cr->cr_zone == cr->cr_zone)
        return (1);

    return (0);
}

/*
 * This privilege check should be used by actions and subroutines to
 * verify that the process has not setuid or changed credentials.
 */
static int
dtrace_priv_proc_common_nocd(VBDTVOID)
{
    proc_t *proc;

    if ((proc = VBDT_GET_PROC()) != NULL &&
        !(proc->p_flag & SNOCD))
        return (1);

    return (0);
}

static int
dtrace_priv_proc_destructive(dtrace_state_t *state)
{
    int action = state->dts_cred.dcr_action;

    if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
        dtrace_priv_proc_common_zone(state) == 0)
        goto bad;

    if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
        dtrace_priv_proc_common_user(state) == 0)
        goto bad;

    if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
        dtrace_priv_proc_common_nocd() == 0)
        goto bad;

    return (1);

bad:
    cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;

    return (0);
}

static int
dtrace_priv_proc_control(dtrace_state_t *state)
{
    if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
        return (1);

    if (dtrace_priv_proc_common_zone(state) &&
        dtrace_priv_proc_common_user(state) &&
        dtrace_priv_proc_common_nocd())
        return (1);

    cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;

    return (0);
}

static int
dtrace_priv_proc(dtrace_state_t *state)
{
    if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
        return (1);

    cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;

    return (0);
}

static int
dtrace_priv_kernel(dtrace_state_t *state)
{
    if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
        return (1);

    cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;

    return (0);
}

static int
dtrace_priv_kernel_destructive(dtrace_state_t *state)
{
    if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
        return (1);

    cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;

    return (0);
}

/*
 * Note: not called from probe context.  This function is called
 * asynchronously (and at a regular interval) from outside of probe context to
 * clean the dirty dynamic variable lists on all CPUs.  Dynamic variable
 * cleaning is explained in detail in <sys/dtrace_impl.h>.
 */
VBDTSTATIC void
dtrace_dynvar_clean(dtrace_dstate_t *dstate)
{
    dtrace_dynvar_t *dirty;
    dtrace_dstate_percpu_t *dcpu;
    dtrace_dynvar_t **rinsep;
    int i, j, work = 0;

    for (i = 0; i < NCPU; i++) {
        dcpu = &dstate->dtds_percpu[i];
        rinsep = &dcpu->dtdsc_rinsing;

        /*
         * If the dirty list is NULL, there is no dirty work to do.
         */
        if (dcpu->dtdsc_dirty == NULL)
            continue;

        if (dcpu->dtdsc_rinsing != NULL) {
            /*
             * If the rinsing list is non-NULL, then it is because
             * this CPU was selected to accept another CPU's
             * dirty list -- and since that time, dirty buffers
             * have accumulated.  This is a highly unlikely
             * condition, but we choose to ignore the dirty
             * buffers -- they'll be picked up in a future cleanse.
             */
            continue;
        }

        if (dcpu->dtdsc_clean != NULL) {
            /*
             * If the clean list is non-NULL, then we're in a
             * situation where a CPU has done deallocations (we
             * have a non-NULL dirty list) but no allocations (we
             * also have a non-NULL clean list).  We can't simply
             * move the dirty list into the clean list on this
             * CPU, yet we also don't want to allow this condition
             * to persist, lest a short clean list prevent a
             * massive dirty list from being cleaned (which in
             * turn could lead to otherwise avoidable dynamic
             * drops).  To deal with this, we look for some CPU
             * with a NULL clean list, NULL dirty list, and NULL
             * rinsing list -- and then we borrow this CPU to
             * rinse our dirty list.
             */
            for (j = 0; j < NCPU; j++) {
                dtrace_dstate_percpu_t *rinser;

                rinser = &dstate->dtds_percpu[j];

                if (rinser->dtdsc_rinsing != NULL)
                    continue;

                if (rinser->dtdsc_dirty != NULL)
                    continue;

                if (rinser->dtdsc_clean != NULL)
                    continue;

                rinsep = &rinser->dtdsc_rinsing;
                break;
            }

            if (j == NCPU) {
                /*
                 * We were unable to find another CPU that
                 * could accept this dirty list -- we are
                 * therefore unable to clean it now.
                 */
                dtrace_dynvar_failclean++;
                continue;
            }
        }

        work = 1;

        /*
         * Atomically move the dirty list aside.
         */
        do {
            dirty = dcpu->dtdsc_dirty;

            /*
             * Before we zap the dirty list, set the rinsing list.
             * (This allows for a potential assertion in
             * dtrace_dynvar(): if a free dynamic variable appears
             * on a hash chain, either the dirty list or the
             * rinsing list for some CPU must be non-NULL.)
             */
            *rinsep = dirty;
            dtrace_membar_producer();
        } while (dtrace_casptr(&dcpu->dtdsc_dirty,
            dirty, NULL) != dirty);
    }

    if (!work) {
        /*
         * We have no work to do; we can simply return.
         */
        return;
    }

    dtrace_sync();

    for (i = 0; i < NCPU; i++) {
        dcpu = &dstate->dtds_percpu[i];

        if (dcpu->dtdsc_rinsing == NULL)
            continue;

        /*
         * We are now guaranteed that no hash chain contains a pointer
         * into this dirty list; we can make it clean.
         */
        ASSERT(dcpu->dtdsc_clean == NULL);
        dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
        dcpu->dtdsc_rinsing = NULL;
    }

    /*
     * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
     * sure that all CPUs have seen all of the dtdsc_clean pointers.
     * This prevents a race whereby a CPU incorrectly decides that
     * the state should be something other than DTRACE_DSTATE_CLEAN
     * after dtrace_dynvar_clean() has completed.
     */
    dtrace_sync();

    dstate->dtds_state = DTRACE_DSTATE_CLEAN;
}

/*
 * Depending on the value of the op parameter, this function looks up,
 * allocates or deallocates an arbitrarily-keyed dynamic variable.  If an
 * allocation is requested, this function will return a pointer to a
 * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
 * variable can be allocated.  If NULL is returned, the appropriate counter
 * will be incremented.
 */
VBDTSTATIC dtrace_dynvar_t *
dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
    dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
    uint64_t hashval = DTRACE_DYNHASH_VALID;
    dtrace_dynhash_t *hash = dstate->dtds_hash;
    dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
    processorid_t me = VBDT_GET_CPUID(), cpu = me;
    dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
    size_t bucket, ksize;
    size_t chunksize = dstate->dtds_chunksize;
    uintptr_t kdata, lock, nstate;
    uint_t i;

    ASSERT(nkeys != 0);

    /*
     * Hash the key.  As with aggregations, we use Jenkins' "One-at-a-time"
     * algorithm.  For the by-value portions, we perform the algorithm in
     * 16-bit chunks (as opposed to 8-bit chunks).  This speeds things up a
     * bit, and seems to have only a minute effect on distribution.  For
     * the by-reference data, we perform "One-at-a-time" iterating (safely)
     * over each referenced byte.  It's painful to do this, but it's much
     * better than pathological hash distribution.  The efficacy of the
     * hashing algorithm (and a comparison with other algorithms) may be
     * found by running the ::dtrace_dynstat MDB dcmd.
     */
    for (i = 0; i < nkeys; i++) {
        if (key[i].dttk_size == 0) {
            uint64_t val = key[i].dttk_value;

            hashval += (val >> 48) & 0xffff;
            hashval += (hashval << 10);
            hashval ^= (hashval >> 6);

            hashval += (val >> 32) & 0xffff;
            hashval += (hashval << 10);
            hashval ^= (hashval >> 6);

            hashval += (val >> 16) & 0xffff;
            hashval += (hashval << 10);
            hashval ^= (hashval >> 6);

            hashval += val & 0xffff;
            hashval += (hashval << 10);
            hashval ^= (hashval >> 6);
        } else {
            /*
             * This is incredibly painful, but it beats the hell
             * out of the alternative.
             */
            uint64_t j, size = key[i].dttk_size;
            uintptr_t base = (uintptr_t)key[i].dttk_value;

            if (!dtrace_canload(base, size, mstate, vstate))
                break;

            for (j = 0; j < size; j++) {
                hashval += dtrace_load8(base + j);
                hashval += (hashval << 10);
                hashval ^= (hashval >> 6);
            }
        }
    }

    if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
        return (NULL);

    hashval += (hashval << 3);
    hashval ^= (hashval >> 11);
    hashval += (hashval << 15);

    /*
     * There is a remote chance (ideally, 1 in 2^31) that our hashval
     * comes out to be one of our two sentinel hash values.  If this
     * actually happens, we set the hashval to be a value known to be a
     * non-sentinel value.
     */
    if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
        hashval = DTRACE_DYNHASH_VALID;

    /*
     * Yes, it's painful to do a divide here.  If the cycle count becomes
     * important here, tricks can be pulled to reduce it.  (However, it's
     * critical that hash collisions be kept to an absolute minimum;
     * they're much more painful than a divide.)  It's better to have a
     * solution that generates few collisions and still keeps things
     * relatively simple.
     */
    bucket = hashval % dstate->dtds_hashsize;

    if (op == DTRACE_DYNVAR_DEALLOC) {
        volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;

        for (;;) {
            while ((lock = *lockp) & 1)
                continue;

            if (dtrace_casptr((void *)lockp,
                (void *)lock, (void *)(lock + 1)) == (void *)lock)
                break;
        }

        dtrace_membar_producer();
    }

top:
    prev = NULL;
    lock = hash[bucket].dtdh_lock;

    dtrace_membar_consumer();

    start = hash[bucket].dtdh_chain;
    ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
        start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
        op != DTRACE_DYNVAR_DEALLOC));

    for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
        dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
        dtrace_key_t *dkey = &dtuple->dtt_key[0];

        if (dvar->dtdv_hashval != hashval) {
            if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
                /*
                 * We've reached the sink, and therefore the
                 * end of the hash chain; we can kick out of
                 * the loop knowing that we have seen a valid
                 * snapshot of state.
                 */
                ASSERT(dvar->dtdv_next == NULL);
                ASSERT(dvar == &dtrace_dynhash_sink);
                break;
            }

            if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
                /*
                 * We've gone off the rails: somewhere along
                 * the line, one of the members of this hash
                 * chain was deleted.  Note that we could also
                 * detect this by simply letting this loop run
                 * to completion, as we would eventually hit
                 * the end of the dirty list.  However, we
                 * want to avoid running the length of the
                 * dirty list unnecessarily (it might be quite
                 * long), so we catch this as early as
                 * possible by detecting the hash marker.  In
                 * this case, we simply set dvar to NULL and
                 * break; the conditional after the loop will
                 * send us back to top.
                 */
                dvar = NULL;
                break;
            }

            goto next;
        }

        if (dtuple->dtt_nkeys != nkeys)
            goto next;

        for (i = 0; i < nkeys; i++, dkey++) {
            if (dkey->dttk_size != key[i].dttk_size)
                goto next; /* size or type mismatch */

            if (dkey->dttk_size != 0) {
                if (dtrace_bcmp(
                    (void *)(uintptr_t)key[i].dttk_value,
                    (void *)(uintptr_t)dkey->dttk_value,
                    dkey->dttk_size))
                    goto next;
            } else {
                if (dkey->dttk_value != key[i].dttk_value)
                    goto next;
            }
        }

        if (op != DTRACE_DYNVAR_DEALLOC)
            return (dvar);

        ASSERT(dvar->dtdv_next == NULL ||
            dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);

        if (prev != NULL) {
            ASSERT(hash[bucket].dtdh_chain != dvar);
            ASSERT(start != dvar);
            ASSERT(prev->dtdv_next == dvar);
            prev->dtdv_next = dvar->dtdv_next;
        } else {
            if (dtrace_casptr(&hash[bucket].dtdh_chain,
                start, dvar->dtdv_next) != start) {
                /*
                 * We have failed to atomically swing the
                 * hash table head pointer, presumably because
                 * of a conflicting allocation on another CPU.
                 * We need to reread the hash chain and try
                 * again.
                 */
                goto top;
            }
        }

        dtrace_membar_producer();

        /*
         * Now set the hash value to indicate that it's free.
         */
        ASSERT(hash[bucket].dtdh_chain != dvar);
        dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;

        dtrace_membar_producer();

        /*
         * Set the next pointer to point at the dirty list, and
         * atomically swing the dirty pointer to the newly freed dvar.
         */
        do {
            next = dcpu->dtdsc_dirty;
            dvar->dtdv_next = next;
        } while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);

        /*
         * Finally, unlock this hash bucket.
         */
        ASSERT(hash[bucket].dtdh_lock == lock);
        ASSERT(lock & 1);
        hash[bucket].dtdh_lock++;

        return (NULL);
next:
        prev = dvar;
        continue;
    }

    if (dvar == NULL) {
        /*
         * If dvar is NULL, it is because we went off the rails:
         * one of the elements that we traversed in the hash chain
         * was deleted while we were traversing it.  In this case,
         * we assert that we aren't doing a dealloc (deallocs lock
         * the hash bucket to prevent themselves from racing with
         * one another), and retry the hash chain traversal.
         */
        ASSERT(op != DTRACE_DYNVAR_DEALLOC);
        goto top;
    }

    if (op != DTRACE_DYNVAR_ALLOC) {
        /*
         * If we are not to allocate a new variable, we want to
         * return NULL now.  Before we return, check that the value
         * of the lock word hasn't changed.  If it has, we may have
         * seen an inconsistent snapshot.
         */
        if (op == DTRACE_DYNVAR_NOALLOC) {
            if (hash[bucket].dtdh_lock != lock)
                goto top;
        } else {
            ASSERT(op == DTRACE_DYNVAR_DEALLOC);
            ASSERT(hash[bucket].dtdh_lock == lock);
            ASSERT(lock & 1);
            hash[bucket].dtdh_lock++;
        }

        return (NULL);
    }

    /*
     * We need to allocate a new dynamic variable.  The size we need is the
     * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
     * size of any auxiliary key data (rounded up to 8-byte alignment) plus
     * the size of any referred-to data (dsize).  We then round the final
     * size up to the chunksize for allocation.
     */
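    /*
     * For example (hypothetical numbers): with nkeys = 2, one by-value key
     * (dttk_size 0, contributing nothing to ksize) and one 13-byte
     * by-reference key (rounded up to 16), the allocation must cover
     * sizeof (dtrace_dynvar_t) + 1 * sizeof (dtrace_key_t) + 16 + dsize
     * bytes, and the chunk is only usable if that total fits in chunksize.
     */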
1626 for (ksize = 0, i = 0; i < nkeys; i++)
1627 ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
1628
1629 /*
1630 * This should be pretty much impossible, but could happen if, say,
1631 * strange DIF specified the tuple. Ideally, this should be an
1632 * assertion and not an error condition -- but that requires that the
1633 * chunksize calculation in dtrace_difo_chunksize() be absolutely
1634 * bullet-proof. (That is, it must not be able to be fooled by
1635 * malicious DIF.) Given the lack of backwards branches in DIF,
1636 * solving this would presumably not amount to solving the Halting
1637 * Problem -- but it still seems awfully hard.
1638 */
1639 if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
1640 ksize + dsize > chunksize) {
1641 dcpu->dtdsc_drops++;
1642 return (NULL);
1643 }
1644
1645 nstate = DTRACE_DSTATE_EMPTY;
1646
1647 do {
1648retry:
1649 free = dcpu->dtdsc_free;
1650
1651 if (free == NULL) {
1652 dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
1653 void *rval;
1654
1655 if (clean == NULL) {
1656 /*
1657 * We're out of dynamic variable space on
1658 * this CPU. Unless we have tried all CPUs,
1659 * we'll try to allocate from a different
1660 * CPU.
1661 */
1662 switch (dstate->dtds_state) {
1663 case DTRACE_DSTATE_CLEAN: {
1664 void *sp = &dstate->dtds_state;
1665
1666 if (++cpu >= NCPU)
1667 cpu = 0;
1668
1669 if (dcpu->dtdsc_dirty != NULL &&
1670 nstate == DTRACE_DSTATE_EMPTY)
1671 nstate = DTRACE_DSTATE_DIRTY;
1672
1673 if (dcpu->dtdsc_rinsing != NULL)
1674 nstate = DTRACE_DSTATE_RINSING;
1675
1676 dcpu = &dstate->dtds_percpu[cpu];
1677
1678 if (cpu != me)
1679 goto retry;
1680
1681 (void) dtrace_cas32(sp,
1682 DTRACE_DSTATE_CLEAN, nstate);
1683
1684 /*
1685 * To increment the correct bean
1686 * counter, take another lap.
1687 */
1688 goto retry;
1689 }
1690
1691 case DTRACE_DSTATE_DIRTY:
1692 dcpu->dtdsc_dirty_drops++;
1693 break;
1694
1695 case DTRACE_DSTATE_RINSING:
1696 dcpu->dtdsc_rinsing_drops++;
1697 break;
1698
1699 case DTRACE_DSTATE_EMPTY:
1700 dcpu->dtdsc_drops++;
1701 break;
1702 }
1703
1704 DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
1705 return (NULL);
1706 }
1707
1708 /*
1709 * The clean list appears to be non-empty. We want to
1710 * move the clean list to the free list; we start by
1711 * moving the clean pointer aside.
1712 */
1713 if (dtrace_casptr(&dcpu->dtdsc_clean,
1714 clean, NULL) != clean) {
1715 /*
1716 * We are in one of two situations:
1717 *
1718 * (a) The clean list was switched to the
1719 * free list by another CPU.
1720 *
1721 * (b) The clean list was added to by the
1722 * cleansing cyclic.
1723 *
1724 * In either of these situations, we can
1725 * just reattempt the free list allocation.
1726 */
1727 goto retry;
1728 }
1729
1730 ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
1731
1732 /*
1733 * Now we'll move the clean list to our free list.
1734 * It's impossible for this to fail: the only way
1735 * the free list can be updated is through this
1736 * code path, and only one CPU can own the clean list.
1737 * Thus, it would only be possible for this to fail if
1738 * this code were racing with dtrace_dynvar_clean().
1739 * (That is, if dtrace_dynvar_clean() updated the clean
1740 * list, and we ended up racing to update the free
1741 * list.) This race is prevented by the dtrace_sync()
1742 * in dtrace_dynvar_clean() -- which flushes the
1743 * owners of the clean lists out before resetting
1744 * the clean lists.
1745 */
1746 dcpu = &dstate->dtds_percpu[me];
1747 rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
1748 ASSERT(rval == NULL);
1749 goto retry;
1750 }
1751
1752 dvar = free;
1753 new_free = dvar->dtdv_next;
1754 } while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
1755
1756 /*
1757 * We have now allocated a new chunk. We copy the tuple keys into the
1758 * tuple array and copy any referenced key data into the data space
1759 * following the tuple array. As we do this, we relocate dttk_value
1760 * in the final tuple to point to the key data address in the chunk.
1761 */
1762 kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
1763 dvar->dtdv_data = (void *)(kdata + ksize);
1764 dvar->dtdv_tuple.dtt_nkeys = nkeys;
1765
1766 for (i = 0; i < nkeys; i++) {
1767 dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
1768 size_t kesize = key[i].dttk_size;
1769
1770 if (kesize != 0) {
1771 dtrace_bcopy(
1772 (const void *)(uintptr_t)key[i].dttk_value,
1773 (void *)kdata, kesize);
1774 dkey->dttk_value = kdata;
1775 kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
1776 } else {
1777 dkey->dttk_value = key[i].dttk_value;
1778 }
1779
1780 dkey->dttk_size = kesize;
1781 }
1782
1783 ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
1784 dvar->dtdv_hashval = hashval;
1785 dvar->dtdv_next = start;
1786
1787 if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
1788 return (dvar);
1789
1790 /*
1791 * The cas has failed. Either another CPU is adding an element to
1792 * this hash chain, or another CPU is deleting an element from this
1793 * hash chain. The simplest way to deal with both of these cases
1794 * (though not necessarily the most efficient) is to free our
1795 * allocated block and tail-call ourselves. Note that the free is
1796 * to the dirty list and _not_ to the free list. This is to prevent
1797 * races with allocators, above.
1798 */
1799 dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
1800
1801 dtrace_membar_producer();
1802
1803 do {
1804 free = dcpu->dtdsc_dirty;
1805 dvar->dtdv_next = free;
1806 } while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
1807
1808 return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate));
1809}
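
/*
 * Illustrative sketch (not part of the original source): the heart of the
 * allocation above is a classic lock-free, CAS-based free-list pop.  The
 * helper name is hypothetical; the real dtrace_dynvar() must additionally
 * splice the clean list over to the free list and fall back to other CPUs,
 * as shown above.
 */
#if 0
static dtrace_dynvar_t *
dtrace_dynvar_pop_sketch(dtrace_dstate_percpu_t *dcpu)
{
	dtrace_dynvar_t *free, *new_free;

	do {
		if ((free = dcpu->dtdsc_free) == NULL)
			return (NULL);	/* empty: caller must fall back */
		new_free = free->dtdv_next;
	} while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);

	return (free);
}
#endif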
1810
1811/*ARGSUSED*/
1812static void
1813dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
1814{
1815 if ((int64_t)nval < (int64_t)*oval)
1816 *oval = nval;
1817}
1818
1819/*ARGSUSED*/
1820static void
1821dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
1822{
1823 if ((int64_t)nval > (int64_t)*oval)
1824 *oval = nval;
1825}
1826
1827static void
1828dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
1829{
1830 int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
1831 int64_t val = (int64_t)nval;
1832
1833 if (val < 0) {
1834 for (i = 0; i < zero; i++) {
1835 if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
1836 quanta[i] += incr;
1837 return;
1838 }
1839 }
1840 } else {
1841 for (i = zero + 1; i < VBDTCAST(int)DTRACE_QUANTIZE_NBUCKETS; i++) {
1842 if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
1843 quanta[i - 1] += incr;
1844 return;
1845 }
1846 }
1847
1848 quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
1849 return;
1850 }
1851
1852 ASSERT(0);
1853}
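
/*
 * For reference, a worked example of the bucketing above: quantize()
 * buckets are bounded by signed powers of two, with a dedicated zero
 * bucket at index DTRACE_QUANTIZE_ZEROBUCKET.  For a positive value of 5,
 * the first boundary that exceeds it is 8, so the increment is credited
 * to the preceding bucket, whose lower bound is 4 -- i.e. the "4" row of
 * the familiar quantize() output, covering values in [4, 8).
 */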
1854
1855static void
1856dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
1857{
1858 uint64_t arg = *lquanta++;
1859 int32_t base = DTRACE_LQUANTIZE_BASE(arg);
1860 uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
1861 uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
1862 int32_t val = (int32_t)nval, level;
1863
1864 ASSERT(step != 0);
1865 ASSERT(levels != 0);
1866
1867 if (val < base) {
1868 /*
1869 * This is an underflow.
1870 */
1871 lquanta[0] += incr;
1872 return;
1873 }
1874
1875 level = (val - base) / step;
1876
1877 if (level < levels) {
1878 lquanta[level + 1] += incr;
1879 return;
1880 }
1881
1882 /*
1883 * This is an overflow.
1884 */
1885 lquanta[levels + 1] += incr;
1886}
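
/*
 * Worked example for the above: the D expression lquantize(x, 0, 100, 10)
 * encodes base == 0, step == 10 and levels == 10 into the leading word.
 * A value of 37 then yields level == 3 and increments lquanta[4]; values
 * below 0 land in the underflow bucket lquanta[0], and values of 100 or
 * more land in the overflow bucket lquanta[levels + 1].
 */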
1887
1888/*ARGSUSED*/
1889static void
1890dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
1891{
1892 data[0]++;
1893 data[1] += nval;
1894}
1895
1896/*ARGSUSED*/
1897static void
1898dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
1899{
1900 int64_t snval = (int64_t)nval;
1901 uint64_t tmp[2];
1902
1903 data[0]++;
1904 data[1] += nval;
1905
1906 /*
1907 * What we want to say here is:
1908 *
1909 * data[2] += nval * nval;
1910 *
1911 * But given that nval is 64-bit, we could easily overflow, so
1912 * we do this as 128-bit arithmetic.
1913 */
1914 if (snval < 0)
1915 snval = -snval;
1916
1917 dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp);
1918 dtrace_add_128(data + 2, tmp, data + 2);
1919}
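
/*
 * For reference: with n == data[0], sum == data[1] and the 128-bit sum of
 * squares in data[2..3], a consumer can later recover the standard
 * deviation as sqrt(sumsq / n - (sum / n)^2) -- which is why only these
 * three running quantities need to be maintained here.
 */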
1920
1921/*ARGSUSED*/
1922static void
1923dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
1924{
1925 *oval = *oval + 1;
1926}
1927
1928/*ARGSUSED*/
1929static void
1930dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
1931{
1932 *oval += nval;
1933}
1934
1935/*
1936 * Aggregate given the tuple in the principal data buffer, and the aggregating
1937 * action denoted by the specified dtrace_aggregation_t. The aggregation
1938 * buffer is specified as the buf parameter. This routine does not return
1939 * failure; if there is no space in the aggregation buffer, the data will be
1940 * dropped, and a corresponding counter incremented.
1941 */
1942static void
1943dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
1944 intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
1945{
1946 dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
1947 uint32_t i, ndx, size, fsize;
1948 uint32_t align = sizeof (uint64_t) - 1;
1949 dtrace_aggbuffer_t *agb;
1950 dtrace_aggkey_t *key;
1951 uint32_t hashval = 0, limit, isstr;
1952 caddr_t tomax, data, kdata;
1953 dtrace_actkind_t action;
1954 dtrace_action_t *act;
1955 uintptr_t offs;
1956
1957 if (buf == NULL)
1958 return;
1959
1960 if (!agg->dtag_hasarg) {
1961 /*
1962 * Currently, only quantize() and lquantize() take additional
1963 * arguments, and they have the same semantics: an increment
1964 * value that defaults to 1 when not present. If additional
1965 * aggregating actions take arguments, the setting of the
1966 * default argument value will presumably have to become more
1967 * sophisticated...
1968 */
1969 arg = 1;
1970 }
1971
1972 action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
1973 size = rec->dtrd_offset - agg->dtag_base;
1974 fsize = size + rec->dtrd_size;
1975
1976 ASSERT(dbuf->dtb_tomax != NULL);
1977 data = dbuf->dtb_tomax + offset + agg->dtag_base;
1978
1979 if ((tomax = buf->dtb_tomax) == NULL) {
1980 dtrace_buffer_drop(buf);
1981 return;
1982 }
1983
1984 /*
1985 * The metastructure is always at the bottom of the buffer.
1986 */
1987 agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
1988 sizeof (dtrace_aggbuffer_t));
1989
1990 if (buf->dtb_offset == 0) {
1991 /*
1992 * We just kludge up approximately 1/8th of the size to be
1993 * buckets. If this guess ends up being routinely
1994 * off-the-mark, we may need to dynamically readjust this
1995 * based on past performance.
1996 */
1997 uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
1998
1999 if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
2000 (uintptr_t)tomax || hashsize == 0) {
2001 /*
2002 * We've been given a ludicrously small buffer;
2003 * increment our drop count and leave.
2004 */
2005 dtrace_buffer_drop(buf);
2006 return;
2007 }
2008
2009 /*
2010	 * And now, a pathetic attempt to try to get an odd (or
2011 * perchance, a prime) hash size for better hash distribution.
2012 */
2013 if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
2014 hashsize -= DTRACE_AGGHASHSIZE_SLEW;
2015
2016 agb->dtagb_hashsize = hashsize;
2017 agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
2018 agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
2019 agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
2020
2021 for (i = 0; i < agb->dtagb_hashsize; i++)
2022 agb->dtagb_hash[i] = NULL;
2023 }
2024
2025 ASSERT(agg->dtag_first != NULL);
2026 ASSERT(agg->dtag_first->dta_intuple);
2027
2028 /*
2029 * Calculate the hash value based on the key. Note that we _don't_
2030 * include the aggid in the hashing (but we will store it as part of
2031 * the key). The hashing algorithm is Bob Jenkins' "One-at-a-time"
2032 * algorithm: a simple, quick algorithm that has no known funnels, and
2033 * gets good distribution in practice. The efficacy of the hashing
2034 * algorithm (and a comparison with other algorithms) may be found by
2035 * running the ::dtrace_aggstat MDB dcmd.
2036 */
2037 for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2038 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2039 limit = i + act->dta_rec.dtrd_size;
2040 ASSERT(limit <= size);
2041 isstr = DTRACEACT_ISSTRING(act);
2042
2043 for (; i < limit; i++) {
2044 hashval += data[i];
2045 hashval += (hashval << 10);
2046 hashval ^= (hashval >> 6);
2047
2048 if (isstr && data[i] == '\0')
2049 break;
2050 }
2051 }
2052
2053 hashval += (hashval << 3);
2054 hashval ^= (hashval >> 11);
2055 hashval += (hashval << 15);
2056
2057 /*
2058 * Yes, the divide here is expensive -- but it's generally the least
2059 * of the performance issues given the amount of data that we iterate
2060 * over to compute hash values, compare data, etc.
2061 */
2062 ndx = hashval % agb->dtagb_hashsize;
2063
2064 for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
2065 ASSERT((caddr_t)key >= tomax);
2066 ASSERT((caddr_t)key < tomax + buf->dtb_size);
2067
2068 if (hashval != key->dtak_hashval || key->dtak_size != size)
2069 continue;
2070
2071 kdata = key->dtak_data;
2072 ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
2073
2074 for (act = agg->dtag_first; act->dta_intuple;
2075 act = act->dta_next) {
2076 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2077 limit = i + act->dta_rec.dtrd_size;
2078 ASSERT(limit <= size);
2079 isstr = DTRACEACT_ISSTRING(act);
2080
2081 for (; i < limit; i++) {
2082 if (kdata[i] != data[i])
2083 goto next;
2084
2085 if (isstr && data[i] == '\0')
2086 break;
2087 }
2088 }
2089
2090 if (action != key->dtak_action) {
2091 /*
2092 * We are aggregating on the same value in the same
2093 * aggregation with two different aggregating actions.
2094 * (This should have been picked up in the compiler,
2095 * so we may be dealing with errant or devious DIF.)
2096 * This is an error condition; we indicate as much,
2097 * and return.
2098 */
2099 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
2100 return;
2101 }
2102
2103 /*
2104 * This is a hit: we need to apply the aggregator to
2105 * the value at this key.
2106 */
2107 agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
2108 return;
2109next:
2110 continue;
2111 }
2112
2113 /*
2114 * We didn't find it. We need to allocate some zero-filled space,
2115 * link it into the hash table appropriately, and apply the aggregator
2116 * to the (zero-filled) value.
2117 */
2118 offs = buf->dtb_offset;
2119 while (offs & (align - 1))
2120 offs += sizeof (uint32_t);
2121
2122 /*
2123 * If we don't have enough room to both allocate a new key _and_
2124 * its associated data, increment the drop count and return.
2125 */
2126 if ((uintptr_t)tomax + offs + fsize >
2127 agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
2128 dtrace_buffer_drop(buf);
2129 return;
2130 }
2131
2132 /*CONSTCOND*/
2133 ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
2134 key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
2135 agb->dtagb_free -= sizeof (dtrace_aggkey_t);
2136
2137 key->dtak_data = kdata = tomax + offs;
2138 buf->dtb_offset = offs + fsize;
2139
2140 /*
2141 * Now copy the data across.
2142 */
2143 *((dtrace_aggid_t *)kdata) = agg->dtag_id;
2144
2145 for (i = sizeof (dtrace_aggid_t); i < size; i++)
2146 kdata[i] = data[i];
2147
2148 /*
2149 * Because strings are not zeroed out by default, we need to iterate
2150 * looking for actions that store strings, and we need to explicitly
2151 * pad these strings out with zeroes.
2152 */
2153 for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2154 int nul;
2155
2156 if (!DTRACEACT_ISSTRING(act))
2157 continue;
2158
2159 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2160 limit = i + act->dta_rec.dtrd_size;
2161 ASSERT(limit <= size);
2162
2163 for (nul = 0; i < limit; i++) {
2164 if (nul) {
2165 kdata[i] = '\0';
2166 continue;
2167 }
2168
2169 if (data[i] != '\0')
2170 continue;
2171
2172 nul = 1;
2173 }
2174 }
2175
2176 for (i = size; i < fsize; i++)
2177 kdata[i] = 0;
2178
2179 key->dtak_hashval = hashval;
2180 key->dtak_size = size;
2181 key->dtak_action = action;
2182 key->dtak_next = agb->dtagb_hash[ndx];
2183 agb->dtagb_hash[ndx] = key;
2184
2185 /*
2186 * Finally, apply the aggregator.
2187 */
2188 *((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
2189 agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
2190}
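
/*
 * Illustrative sketch (not part of the original source): Bob Jenkins'
 * "One-at-a-time" hash used by dtrace_aggregate() above, in isolation.
 * The real code additionally stops at the terminating NUL when a tuple
 * member is a string.
 */
#if 0
static uint32_t
dtrace_oneatatime_sketch(const uint8_t *data, size_t len)
{
	uint32_t hashval = 0;
	size_t i;

	for (i = 0; i < len; i++) {
		hashval += data[i];
		hashval += (hashval << 10);
		hashval ^= (hashval >> 6);
	}

	hashval += (hashval << 3);
	hashval ^= (hashval >> 11);
	hashval += (hashval << 15);

	return (hashval);
}
#endif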
2191
2192/*
2193 * Given consumer state, this routine finds a speculation in the INACTIVE
2194 * state and transitions it into the ACTIVE state. If there is no speculation
2195 * in the INACTIVE state, 0 is returned. In this case, no error counter is
2196 * incremented -- it is up to the caller to take appropriate action.
2197 */
2198static int
2199dtrace_speculation(dtrace_state_t *state)
2200{
2201 int i = 0;
2202 dtrace_speculation_state_t current;
2203 uint32_t *stat = &state->dts_speculations_unavail, count;
2204
2205 while (i < state->dts_nspeculations) {
2206 dtrace_speculation_t *spec = &state->dts_speculations[i];
2207
2208 current = spec->dtsp_state;
2209
2210 if (current != DTRACESPEC_INACTIVE) {
2211 if (current == DTRACESPEC_COMMITTINGMANY ||
2212 current == DTRACESPEC_COMMITTING ||
2213 current == DTRACESPEC_DISCARDING)
2214 stat = &state->dts_speculations_busy;
2215 i++;
2216 continue;
2217 }
2218
2219 if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2220 current, DTRACESPEC_ACTIVE) == current)
2221 return (i + 1);
2222 }
2223
2224 /*
2225 * We couldn't find a speculation. If we found as much as a single
2226 * busy speculation buffer, we'll attribute this failure as "busy"
2227 * instead of "unavail".
2228 */
2229 do {
2230 count = *stat;
2231 } while (dtrace_cas32(stat, count, count + 1) != count);
2232
2233 return (0);
2234}
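
/*
 * For orientation (the authoritative state diagram lives in
 * <sys/dtrace_impl.h>), the routines here move a speculation roughly as
 * follows:
 *
 *	INACTIVE --> ACTIVE			dtrace_speculation()
 *	ACTIVE --> ACTIVEONE --> ACTIVEMANY	dtrace_speculation_buffer()
 *	ACTIVE* --> COMMITTING[MANY]		dtrace_speculation_commit()
 *	ACTIVE* --> DISCARDING			dtrace_speculation_discard()
 *	COMMITTING[MANY]/DISCARDING --> INACTIVE   (commit or async cleaner)
 */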
2235
2236/*
2237 * This routine commits an active speculation. If the specified speculation
2238 * is not in a valid state to perform a commit(), this routine will silently do
2239 * nothing. The state of the specified speculation is transitioned according
2240	 * to the state transition diagram outlined in <sys/dtrace_impl.h>.
2241 */
2242static void
2243dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
2244 dtrace_specid_t which)
2245{
2246 dtrace_speculation_t *spec;
2247 dtrace_buffer_t *src, *dest;
2248 uintptr_t daddr, saddr, dlimit;
2249 dtrace_speculation_state_t current, new;
2250 intptr_t offs;
2251
2252 if (which == 0)
2253 return;
2254
2255 if (which > VBDTCAST(unsigned)state->dts_nspeculations) {
2256 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2257 return;
2258 }
2259
2260 spec = &state->dts_speculations[which - 1];
2261 src = &spec->dtsp_buffer[cpu];
2262 dest = &state->dts_buffer[cpu];
2263
2264 do {
2265 current = spec->dtsp_state;
2266
2267 if (current == DTRACESPEC_COMMITTINGMANY)
2268 break;
2269
2270 switch (current) {
2271 case DTRACESPEC_INACTIVE:
2272 case DTRACESPEC_DISCARDING:
2273 return;
2274
2275 case DTRACESPEC_COMMITTING:
2276 /*
2277 * This is only possible if we are (a) commit()'ing
2278 * without having done a prior speculate() on this CPU
2279 * and (b) racing with another commit() on a different
2280 * CPU. There's nothing to do -- we just assert that
2281 * our offset is 0.
2282 */
2283 ASSERT(src->dtb_offset == 0);
2284 return;
2285
2286 case DTRACESPEC_ACTIVE:
2287 new = DTRACESPEC_COMMITTING;
2288 break;
2289
2290 case DTRACESPEC_ACTIVEONE:
2291 /*
2292 * This speculation is active on one CPU. If our
2293 * buffer offset is non-zero, we know that the one CPU
2294 * must be us. Otherwise, we are committing on a
2295 * different CPU from the speculate(), and we must
2296 * rely on being asynchronously cleaned.
2297 */
2298 if (src->dtb_offset != 0) {
2299 new = DTRACESPEC_COMMITTING;
2300 break;
2301 }
2302 /*FALLTHROUGH*/
2303
2304 case DTRACESPEC_ACTIVEMANY:
2305 new = DTRACESPEC_COMMITTINGMANY;
2306 break;
2307
2308 default:
2309 ASSERT(0);
2310 }
2311 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2312 current, new) != current);
2313
2314 /*
2315 * We have set the state to indicate that we are committing this
2316 * speculation. Now reserve the necessary space in the destination
2317 * buffer.
2318 */
2319 if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
2320 sizeof (uint64_t), state, NULL)) < 0) {
2321 dtrace_buffer_drop(dest);
2322 goto out;
2323 }
2324
2325 /*
2326 * We have the space; copy the buffer across. (Note that this is a
2327	 * highly suboptimal bcopy(); in the unlikely event that this becomes
2328 * a serious performance issue, a high-performance DTrace-specific
2329 * bcopy() should obviously be invented.)
2330 */
2331 daddr = (uintptr_t)dest->dtb_tomax + offs;
2332 dlimit = daddr + src->dtb_offset;
2333 saddr = (uintptr_t)src->dtb_tomax;
2334
2335 /*
2336 * First, the aligned portion.
2337 */
2338 while (dlimit - daddr >= sizeof (uint64_t)) {
2339 *((uint64_t *)daddr) = *((uint64_t *)saddr);
2340
2341 daddr += sizeof (uint64_t);
2342 saddr += sizeof (uint64_t);
2343 }
2344
2345 /*
2346 * Now any left-over bit...
2347 */
2348 while (dlimit - daddr)
2349 *((uint8_t *)daddr++) = *((uint8_t *)saddr++);
2350
2351 /*
2352 * Finally, commit the reserved space in the destination buffer.
2353 */
2354 dest->dtb_offset = offs + src->dtb_offset;
2355
2356out:
2357 /*
2358 * If we're lucky enough to be the only active CPU on this speculation
2359 * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
2360 */
2361 if (current == DTRACESPEC_ACTIVE ||
2362 (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
2363 uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
2364 DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
2365
2366 ASSERT(rval == DTRACESPEC_COMMITTING);
2367 }
2368
2369 src->dtb_offset = 0;
2370 src->dtb_xamot_drops += src->dtb_drops;
2371 src->dtb_drops = 0;
2372}
2373
2374/*
2375 * This routine discards an active speculation. If the specified speculation
2376 * is not in a valid state to perform a discard(), this routine will silently
2377 * do nothing. The state of the specified speculation is transitioned
2378	 * according to the state transition diagram outlined in <sys/dtrace_impl.h>.
2379 */
2380static void
2381dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
2382 dtrace_specid_t which)
2383{
2384 dtrace_speculation_t *spec;
2385 dtrace_speculation_state_t current, new;
2386 dtrace_buffer_t *buf;
2387
2388 if (which == 0)
2389 return;
2390
2391 if (which > VBDTCAST(unsigned)state->dts_nspeculations) {
2392 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2393 return;
2394 }
2395
2396 spec = &state->dts_speculations[which - 1];
2397 buf = &spec->dtsp_buffer[cpu];
2398
2399 do {
2400 current = spec->dtsp_state;
2401
2402 switch (current) {
2403 case DTRACESPEC_INACTIVE:
2404 case DTRACESPEC_COMMITTINGMANY:
2405 case DTRACESPEC_COMMITTING:
2406 case DTRACESPEC_DISCARDING:
2407 return;
2408
2409 case DTRACESPEC_ACTIVE:
2410 case DTRACESPEC_ACTIVEMANY:
2411 new = DTRACESPEC_DISCARDING;
2412 break;
2413
2414 case DTRACESPEC_ACTIVEONE:
2415 if (buf->dtb_offset != 0) {
2416 new = DTRACESPEC_INACTIVE;
2417 } else {
2418 new = DTRACESPEC_DISCARDING;
2419 }
2420 break;
2421
2422 default:
2423 ASSERT(0);
2424 }
2425 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2426 current, new) != current);
2427
2428 buf->dtb_offset = 0;
2429 buf->dtb_drops = 0;
2430}
2431
2432/*
2433 * Note: not called from probe context. This function is called
2434 * asynchronously from cross call context to clean any speculations that are
2435 * in the COMMITTINGMANY or DISCARDING states. These speculations may not be
2436 * transitioned back to the INACTIVE state until all CPUs have cleaned the
2437 * speculation.
2438 */
2439static void
2440dtrace_speculation_clean_here(dtrace_state_t *state)
2441{
2442 dtrace_icookie_t cookie;
2443 processorid_t cpu = VBDT_GET_CPUID();
2444 dtrace_buffer_t *dest = &state->dts_buffer[cpu];
2445 dtrace_specid_t i;
2446
2447 cookie = dtrace_interrupt_disable();
2448
2449 if (dest->dtb_tomax == NULL) {
2450 dtrace_interrupt_enable(cookie);
2451 return;
2452 }
2453
2454 for (i = 0; i < VBDTCAST(unsigned)state->dts_nspeculations; i++) {
2455 dtrace_speculation_t *spec = &state->dts_speculations[i];
2456 dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
2457
2458 if (src->dtb_tomax == NULL)
2459 continue;
2460
2461 if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
2462 src->dtb_offset = 0;
2463 continue;
2464 }
2465
2466 if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2467 continue;
2468
2469 if (src->dtb_offset == 0)
2470 continue;
2471
2472 dtrace_speculation_commit(state, cpu, i + 1);
2473 }
2474
2475 dtrace_interrupt_enable(cookie);
2476}
2477
2478/*
2479 * Note: not called from probe context. This function is called
2480 * asynchronously (and at a regular interval) to clean any speculations that
2481 * are in the COMMITTINGMANY or DISCARDING states. If it discovers that there
2482 * is work to be done, it cross calls all CPUs to perform that work;
2483	 * COMMITTINGMANY and DISCARDING speculations may not be transitioned back to the
2484 * INACTIVE state until they have been cleaned by all CPUs.
2485 */
2486static void
2487dtrace_speculation_clean(dtrace_state_t *state)
2488{
2489 int work = 0, rv;
2490 dtrace_specid_t i;
2491
2492 for (i = 0; i < VBDTCAST(unsigned)state->dts_nspeculations; i++) {
2493 dtrace_speculation_t *spec = &state->dts_speculations[i];
2494
2495 ASSERT(!spec->dtsp_cleaning);
2496
2497 if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
2498 spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2499 continue;
2500
2501 work++;
2502 spec->dtsp_cleaning = 1;
2503 }
2504
2505 if (!work)
2506 return;
2507
2508 dtrace_xcall(DTRACE_CPUALL,
2509 (dtrace_xcall_t)dtrace_speculation_clean_here, state);
2510
2511 /*
2512 * We now know that all CPUs have committed or discarded their
2513 * speculation buffers, as appropriate. We can now set the state
2514 * to inactive.
2515 */
2516 for (i = 0; i < VBDTCAST(unsigned)state->dts_nspeculations; i++) {
2517 dtrace_speculation_t *spec = &state->dts_speculations[i];
2518 dtrace_speculation_state_t current, new;
2519
2520 if (!spec->dtsp_cleaning)
2521 continue;
2522
2523 current = spec->dtsp_state;
2524 ASSERT(current == DTRACESPEC_DISCARDING ||
2525 current == DTRACESPEC_COMMITTINGMANY);
2526
2527 new = DTRACESPEC_INACTIVE;
2528
2529 rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
2530 ASSERT(VBDTCAST(dtrace_speculation_state_t)rv == current);
2531 spec->dtsp_cleaning = 0;
2532 }
2533}
2534
2535/*
2536 * Called as part of a speculate() to get the speculative buffer associated
2537 * with a given speculation. Returns NULL if the specified speculation is not
2538 * in an ACTIVE state. If the speculation is in the ACTIVEONE state -- and
2539 * the active CPU is not the specified CPU -- the speculation will be
2540 * atomically transitioned into the ACTIVEMANY state.
2541 */
2542static dtrace_buffer_t *
2543dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
2544 dtrace_specid_t which)
2545{
2546 dtrace_speculation_t *spec;
2547 dtrace_speculation_state_t current, new;
2548 dtrace_buffer_t *buf;
2549
2550 if (which == 0)
2551 return (NULL);
2552
2553 if (which > VBDTCAST(unsigned)state->dts_nspeculations) {
2554 cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2555 return (NULL);
2556 }
2557
2558 spec = &state->dts_speculations[which - 1];
2559 buf = &spec->dtsp_buffer[cpuid];
2560
2561 do {
2562 current = spec->dtsp_state;
2563
2564 switch (current) {
2565 case DTRACESPEC_INACTIVE:
2566 case DTRACESPEC_COMMITTINGMANY:
2567 case DTRACESPEC_DISCARDING:
2568 return (NULL);
2569
2570 case DTRACESPEC_COMMITTING:
2571 ASSERT(buf->dtb_offset == 0);
2572 return (NULL);
2573
2574 case DTRACESPEC_ACTIVEONE:
2575 /*
2576 * This speculation is currently active on one CPU.
2577 * Check the offset in the buffer; if it's non-zero,
2578 * that CPU must be us (and we leave the state alone).
2579 * If it's zero, assume that we're starting on a new
2580 * CPU -- and change the state to indicate that the
2581 * speculation is active on more than one CPU.
2582 */
2583 if (buf->dtb_offset != 0)
2584 return (buf);
2585
2586 new = DTRACESPEC_ACTIVEMANY;
2587 break;
2588
2589 case DTRACESPEC_ACTIVEMANY:
2590 return (buf);
2591
2592 case DTRACESPEC_ACTIVE:
2593 new = DTRACESPEC_ACTIVEONE;
2594 break;
2595
2596 default:
2597 ASSERT(0);
2598 }
2599 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2600 current, new) != current);
2601
2602 ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
2603 return (buf);
2604}
2605
2606/*
2607 * Return a string. In the event that the user lacks the privilege to access
2608 * arbitrary kernel memory, we copy the string out to scratch memory so that we
2609 * don't fail access checking.
2610 *
2611 * dtrace_dif_variable() uses this routine as a helper for various
2612 * builtin values such as 'execname' and 'probefunc.'
2613 */
2614VBDTSTATIC uintptr_t
2615dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
2616 dtrace_mstate_t *mstate)
2617{
2618 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
2619 uintptr_t ret;
2620 size_t strsz;
2621
2622 /*
2623 * The easy case: this probe is allowed to read all of memory, so
2624 * we can just return this as a vanilla pointer.
2625 */
2626 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
2627 return (addr);
2628
2629 /*
2630 * This is the tougher case: we copy the string in question from
2631 * kernel memory into scratch memory and return it that way: this
2632 * ensures that we won't trip up when access checking tests the
2633 * BYREF return value.
2634 */
2635 strsz = dtrace_strlen((char *)addr, size) + 1;
2636
2637 if (mstate->dtms_scratch_ptr + strsz >
2638 mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
2639 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
2640 return (NULL);
2641 }
2642
2643 dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
2644 strsz);
2645 ret = mstate->dtms_scratch_ptr;
2646 mstate->dtms_scratch_ptr += strsz;
2647 return (ret);
2648}
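
/*
 * Illustrative sketch (not part of the original source): the scratch
 * allocation pattern used by dtrace_dif_varstr() above, and repeatedly by
 * the subroutines below -- check for space, flag NOSCRATCH on exhaustion,
 * and otherwise consume by advancing dtms_scratch_ptr.  The helper name
 * is hypothetical.
 */
#if 0
static uintptr_t
dtrace_scratch_alloc_sketch(dtrace_mstate_t *mstate, size_t size)
{
	uintptr_t ptr = mstate->dtms_scratch_ptr;

	if (ptr + size >
	    mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
		return (0);
	}

	mstate->dtms_scratch_ptr = ptr + size;
	return (ptr);
}
#endif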
2649
2650/*
2651 * This function implements the DIF emulator's variable lookups. The emulator
2652 * passes a reserved variable identifier and optional built-in array index.
2653 */
2654static uint64_t
2655dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
2656 uint64_t ndx)
2657{
2658 /*
2659 * If we're accessing one of the uncached arguments, we'll turn this
2660 * into a reference in the args array.
2661 */
2662 if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
2663 ndx = v - DIF_VAR_ARG0;
2664 v = DIF_VAR_ARGS;
2665 }
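	/*
	 * (E.g. a reference to arg3 arrives as v == DIF_VAR_ARG3 and is
	 * rewritten here to v == DIF_VAR_ARGS with ndx == 3.)
	 */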
2666
2667 switch (v) {
2668 case DIF_VAR_ARGS:
2669 ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
2670 if (ndx >= sizeof (mstate->dtms_arg) /
2671 sizeof (mstate->dtms_arg[0])) {
2672 int aframes = mstate->dtms_probe->dtpr_aframes + 2;
2673 dtrace_provider_t *pv;
2674 uint64_t val;
2675
2676 pv = mstate->dtms_probe->dtpr_provider;
2677 if (pv->dtpv_pops.dtps_getargval != NULL)
2678 val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
2679 mstate->dtms_probe->dtpr_id,
2680 mstate->dtms_probe->dtpr_arg, ndx, aframes);
2681 else
2682 val = dtrace_getarg(ndx, aframes);
2683
2684 /*
2685 * This is regrettably required to keep the compiler
2686 * from tail-optimizing the call to dtrace_getarg().
2687 * The condition always evaluates to true, but the
2688 * compiler has no way of figuring that out a priori.
2689 * (None of this would be necessary if the compiler
2690 * could be relied upon to _always_ tail-optimize
2691 * the call to dtrace_getarg() -- but it can't.)
2692 */
2693 if (mstate->dtms_probe != NULL)
2694 return (val);
2695
2696 ASSERT(0);
2697 }
2698
2699 return (mstate->dtms_arg[ndx]);
2700
2701 case DIF_VAR_UREGS: {
2702#ifndef VBOX
2703 klwp_t *lwp;
2704
2705 if (!dtrace_priv_proc(state))
2706 return (0);
2707
2708 if ((lwp = curthread->t_lwp) == NULL) {
2709 DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
2710 cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_illval = NULL;
2711 return (0);
2712 }
2713
2714 return (dtrace_getreg(lwp->lwp_regs, ndx));
2715#else
2716 cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2717 return (0);
2718#endif
2719 }
2720
2721 case DIF_VAR_CURTHREAD:
2722 if (!dtrace_priv_kernel(state))
2723 return (0);
2724#ifndef VBOX
2725 return ((uint64_t)(uintptr_t)curthread);
2726#else
2727 return ((uintptr_t)RTThreadNativeSelf());
2728#endif
2729
2730 case DIF_VAR_TIMESTAMP:
2731 if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
2732 mstate->dtms_timestamp = dtrace_gethrtime();
2733 mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
2734 }
2735 return (mstate->dtms_timestamp);
2736
2737 case DIF_VAR_VTIMESTAMP:
2738 ASSERT(dtrace_vtime_references != 0);
2739 return (curthread->t_dtrace_vtime);
2740
2741 case DIF_VAR_WALLTIMESTAMP:
2742 if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
2743 mstate->dtms_walltimestamp = dtrace_gethrestime();
2744 mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
2745 }
2746 return (mstate->dtms_walltimestamp);
2747
2748 case DIF_VAR_IPL:
2749 if (!dtrace_priv_kernel(state))
2750 return (0);
2751 if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
2752 mstate->dtms_ipl = dtrace_getipl();
2753 mstate->dtms_present |= DTRACE_MSTATE_IPL;
2754 }
2755 return (mstate->dtms_ipl);
2756
2757 case DIF_VAR_EPID:
2758 ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
2759 return (mstate->dtms_epid);
2760
2761 case DIF_VAR_ID:
2762 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
2763 return (mstate->dtms_probe->dtpr_id);
2764
2765 case DIF_VAR_STACKDEPTH:
2766 if (!dtrace_priv_kernel(state))
2767 return (0);
2768 if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
2769 int aframes = mstate->dtms_probe->dtpr_aframes + 2;
2770
2771 mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
2772 mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
2773 }
2774 return (mstate->dtms_stackdepth);
2775
2776 case DIF_VAR_USTACKDEPTH:
2777 if (!dtrace_priv_proc(state))
2778 return (0);
2779 if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
2780 /*
2781 * See comment in DIF_VAR_PID.
2782 */
2783 if (DTRACE_ANCHORED(mstate->dtms_probe) &&
2784 CPU_ON_INTR(CPU)) {
2785 mstate->dtms_ustackdepth = 0;
2786 } else {
2787 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
2788 mstate->dtms_ustackdepth =
2789 dtrace_getustackdepth();
2790 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
2791 }
2792 mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
2793 }
2794 return (mstate->dtms_ustackdepth);
2795
2796 case DIF_VAR_CALLER:
2797 if (!dtrace_priv_kernel(state))
2798 return (0);
2799 if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
2800 int aframes = mstate->dtms_probe->dtpr_aframes + 2;
2801
2802 if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
2803 /*
2804 * If this is an unanchored probe, we are
2805 * required to go through the slow path:
2806 * dtrace_caller() only guarantees correct
2807 * results for anchored probes.
2808 */
2809 pc_t caller[2];
2810
2811 dtrace_getpcstack(caller, 2, aframes,
2812 (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
2813 mstate->dtms_caller = caller[1];
2814 } else if ((mstate->dtms_caller =
2815 dtrace_caller(aframes)) == VBDTCAST(uintptr_t)-1) {
2816 /*
2817 * We have failed to do this the quick way;
2818 * we must resort to the slower approach of
2819 * calling dtrace_getpcstack().
2820 */
2821 pc_t caller;
2822
2823 dtrace_getpcstack(&caller, 1, aframes, NULL);
2824 mstate->dtms_caller = caller;
2825 }
2826
2827 mstate->dtms_present |= DTRACE_MSTATE_CALLER;
2828 }
2829 return (mstate->dtms_caller);
2830
2831 case DIF_VAR_UCALLER:
2832 if (!dtrace_priv_proc(state))
2833 return (0);
2834
2835 if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
2836 uint64_t ustack[3];
2837
2838 /*
2839 * dtrace_getupcstack() fills in the first uint64_t
2840 * with the current PID. The second uint64_t will
2841 * be the program counter at user-level. The third
2842 * uint64_t will contain the caller, which is what
2843 * we're after.
2844 */
2845 ustack[2] = NULL;
2846 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
2847 dtrace_getupcstack(ustack, 3);
2848 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
2849 mstate->dtms_ucaller = ustack[2];
2850 mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
2851 }
2852
2853 return (mstate->dtms_ucaller);
2854
2855 case DIF_VAR_PROBEPROV:
2856 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
2857 return (dtrace_dif_varstr(
2858 (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
2859 state, mstate));
2860
2861 case DIF_VAR_PROBEMOD:
2862 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
2863 return (dtrace_dif_varstr(
2864 (uintptr_t)mstate->dtms_probe->dtpr_mod,
2865 state, mstate));
2866
2867 case DIF_VAR_PROBEFUNC:
2868 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
2869 return (dtrace_dif_varstr(
2870 (uintptr_t)mstate->dtms_probe->dtpr_func,
2871 state, mstate));
2872
2873 case DIF_VAR_PROBENAME:
2874 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
2875 return (dtrace_dif_varstr(
2876 (uintptr_t)mstate->dtms_probe->dtpr_name,
2877 state, mstate));
2878
2879 case DIF_VAR_PID:
2880 if (!dtrace_priv_proc(state))
2881 return (0);
2882
2883#ifndef VBOX
2884 /*
2885 * Note that we are assuming that an unanchored probe is
2886 * always due to a high-level interrupt. (And we're assuming
2887 * that there is only a single high level interrupt.)
2888 */
2889 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
2890 return (pid0.pid_id);
2891
2892 /*
2893 * It is always safe to dereference one's own t_procp pointer:
2894 * it always points to a valid, allocated proc structure.
2895 * Further, it is always safe to dereference the p_pidp member
2896	 * of one's own proc structure. (These are truisms because
2897	 * threads and processes don't clean up their own state --
2898	 * they leave that task to whoever reaps them.)
2899 */
2900 return ((uint64_t)curthread->t_procp->p_pidp->pid_id);
2901#else
2902 return (RTProcSelf());
2903#endif
2904
2905 case DIF_VAR_PPID:
2906 if (!dtrace_priv_proc(state))
2907 return (0);
2908
2909#ifndef VBOX
2910 /*
2911 * See comment in DIF_VAR_PID.
2912 */
2913 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
2914 return (pid0.pid_id);
2915
2916 /*
2917 * It is always safe to dereference one's own t_procp pointer:
2918 * it always points to a valid, allocated proc structure.
2919 * (This is true because threads don't clean up their own
2920	 * state -- they leave that task to whoever reaps them.)
2921 */
2922 return ((uint64_t)curthread->t_procp->p_ppid);
2923#else
2924 cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2925 return (0); /** @todo parent pid? */
2926#endif
2927
2928 case DIF_VAR_TID:
2929#ifndef VBOX
2930 /*
2931 * See comment in DIF_VAR_PID.
2932 */
2933 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
2934 return (0);
2935
2936 return ((uint64_t)curthread->t_tid);
2937#else
2938 return (RTThreadNativeSelf()); /** @todo proper tid? */
2939#endif
2940
2941 case DIF_VAR_EXECNAME:
2942 if (!dtrace_priv_proc(state))
2943 return (0);
2944
2945#ifndef VBOX
2946 /*
2947 * See comment in DIF_VAR_PID.
2948 */
2949 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
2950 return ((uint64_t)(uintptr_t)p0.p_user.u_comm);
2951
2952 /*
2953 * It is always safe to dereference one's own t_procp pointer:
2954 * it always points to a valid, allocated proc structure.
2955 * (This is true because threads don't clean up their own
2956	 * state -- they leave that task to whoever reaps them.)
2957 */
2958 return (dtrace_dif_varstr(
2959 (uintptr_t)curthread->t_procp->p_user.u_comm,
2960 state, mstate));
2961#else
2962 cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2963 return (0); /** @todo execname */
2964#endif
2965
2966 case DIF_VAR_ZONENAME:
2967 if (!dtrace_priv_proc(state))
2968 return (0);
2969
2970#ifndef VBOX
2971 /*
2972 * See comment in DIF_VAR_PID.
2973 */
2974 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
2975 return ((uint64_t)(uintptr_t)p0.p_zone->zone_name);
2976
2977 /*
2978 * It is always safe to dereference one's own t_procp pointer:
2979 * it always points to a valid, allocated proc structure.
2980 * (This is true because threads don't clean up their own
2981 * state -- they leave that task to whomever reaps them.)
2982	 * state -- they leave that task to whoever reaps them.)
2983 return (dtrace_dif_varstr(
2984 (uintptr_t)curthread->t_procp->p_zone->zone_name,
2985 state, mstate));
2986#else
2987 cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2988 return (0);
2989#endif
2990
2991 case DIF_VAR_UID:
2992 if (!dtrace_priv_proc(state))
2993 return (0);
2994
2995#ifndef VBOX
2996 /*
2997 * See comment in DIF_VAR_PID.
2998 */
2999 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3000 return ((uint64_t)p0.p_cred->cr_uid);
3001
3002 /*
3003 * It is always safe to dereference one's own t_procp pointer:
3004 * it always points to a valid, allocated proc structure.
3005 * (This is true because threads don't clean up their own
3006	 * state -- they leave that task to whoever reaps them.)
3007 *
3008 * Additionally, it is safe to dereference one's own process
3009 * credential, since this is never NULL after process birth.
3010 */
3011 return ((uint64_t)curthread->t_procp->p_cred->cr_uid);
3012#else
3013 cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3014 return (0);
3015#endif
3016
3017 case DIF_VAR_GID:
3018 if (!dtrace_priv_proc(state))
3019 return (0);
3020
3021#ifndef VBOX
3022 /*
3023 * See comment in DIF_VAR_PID.
3024 */
3025 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3026 return ((uint64_t)p0.p_cred->cr_gid);
3027
3028 /*
3029 * It is always safe to dereference one's own t_procp pointer:
3030 * it always points to a valid, allocated proc structure.
3031 * (This is true because threads don't clean up their own
3032	 * state -- they leave that task to whoever reaps them.)
3033 *
3034 * Additionally, it is safe to dereference one's own process
3035 * credential, since this is never NULL after process birth.
3036 */
3037 return ((uint64_t)curthread->t_procp->p_cred->cr_gid);
3038#else
3039 cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3040 return (0);
3041#endif
3042
3043 case DIF_VAR_ERRNO: {
3044#ifndef VBOX
3045 klwp_t *lwp;
3046#endif
3047 if (!dtrace_priv_proc(state))
3048 return (0);
3049
3050#ifndef VBOX
3051 /*
3052 * See comment in DIF_VAR_PID.
3053 */
3054 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3055 return (0);
3056
3057 /*
3058 * It is always safe to dereference one's own t_lwp pointer in
3059 * the event that this pointer is non-NULL. (This is true
3060 * because threads and lwps don't clean up their own state --
3061	 * they leave that task to whoever reaps them.)
3062 */
3063 if ((lwp = curthread->t_lwp) == NULL)
3064 return (0);
3065
3066 return ((uint64_t)lwp->lwp_errno);
3067#else
3068 cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3069 return (0);
3070#endif
3071 }
3072 default:
3073 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3074 return (0);
3075 }
3076}
3077
3078/*
3079 * Emulate the execution of DTrace ID subroutines invoked by the call opcode.
3080 * Notice that we don't bother validating the proper number of arguments or
3081 * their types in the tuple stack. This isn't needed because all argument
3082 * interpretation is safe because of our load safety -- the worst that can
3083 * happen is that a bogus program can obtain bogus results.
3084 */
3085static void
3086dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
3087 dtrace_key_t *tupregs, int nargs,
3088 dtrace_mstate_t *mstate, dtrace_state_t *state)
3089{
3090 volatile uint16_t *flags = &cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags;
3091 volatile uintptr_t *illval = &cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_illval;
3092 dtrace_vstate_t *vstate = &state->dts_vstate;
3093
3094#ifndef VBOX
3095 union {
3096 mutex_impl_t mi;
3097 uint64_t mx;
3098 } m;
3099
3100 union {
3101 krwlock_t ri;
3102 uintptr_t rw;
3103 } r;
3104#endif
3105
3106 switch (subr) {
3107 case DIF_SUBR_RAND:
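		/*
		 * A cheap linear-congruential step seeded from the
		 * high-resolution timer: adequate for sampling decisions,
		 * but in no way a cryptographic source of randomness.
		 */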
3108 regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875;
3109 break;
3110
3111 case DIF_SUBR_MUTEX_OWNED:
3112#ifndef VBOX
3113 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3114 mstate, vstate)) {
3115 regs[rd] = NULL;
3116 break;
3117 }
3118
3119 m.mx = dtrace_load64(tupregs[0].dttk_value);
3120 if (MUTEX_TYPE_ADAPTIVE(&m.mi))
3121 regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
3122 else
3123 regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
3124#else
3125 regs[rd] = 0;
3126 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3127#endif
3128 break;
3129
3130 case DIF_SUBR_MUTEX_OWNER:
3131#ifndef VBOX
3132 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3133 mstate, vstate)) {
3134 regs[rd] = NULL;
3135 break;
3136 }
3137
3138 m.mx = dtrace_load64(tupregs[0].dttk_value);
3139 if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
3140 MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
3141 regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
3142 else
3143 regs[rd] = 0;
3144#else
3145 regs[rd] = 0;
3146 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3147#endif
3148 break;
3149
3150 case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
3151#ifndef VBOX
3152 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3153 mstate, vstate)) {
3154 regs[rd] = NULL;
3155 break;
3156 }
3157
3158 m.mx = dtrace_load64(tupregs[0].dttk_value);
3159 regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
3160#else
3161 regs[rd] = 0;
3162 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3163#endif
3164 break;
3165
3166 case DIF_SUBR_MUTEX_TYPE_SPIN:
3167#ifndef VBOX
3168 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3169 mstate, vstate)) {
3170 regs[rd] = NULL;
3171 break;
3172 }
3173
3174 m.mx = dtrace_load64(tupregs[0].dttk_value);
3175 regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
3176#else
3177 regs[rd] = 0;
3178 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3179#endif
3180 break;
3181
3182 case DIF_SUBR_RW_READ_HELD: {
3183#ifndef VBOX
3184 uintptr_t tmp;
3185
3186 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
3187 mstate, vstate)) {
3188 regs[rd] = NULL;
3189 break;
3190 }
3191
3192 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3193 regs[rd] = _RW_READ_HELD(&r.ri, tmp);
3194#else
3195 regs[rd] = 0;
3196 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3197#endif
3198 break;
3199 }
3200
3201 case DIF_SUBR_RW_WRITE_HELD:
3202#ifndef VBOX
3203 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
3204 mstate, vstate)) {
3205 regs[rd] = NULL;
3206 break;
3207 }
3208
3209 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3210 regs[rd] = _RW_WRITE_HELD(&r.ri);
3211#else
3212 regs[rd] = 0;
3213 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3214#endif
3215 break;
3216
3217 case DIF_SUBR_RW_ISWRITER:
3218#ifndef VBOX
3219 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
3220 mstate, vstate)) {
3221 regs[rd] = NULL;
3222 break;
3223 }
3224
3225 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3226 regs[rd] = _RW_ISWRITER(&r.ri);
3227#else
3228 regs[rd] = 0;
3229 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3230#endif
3231 break;
3232
3233 case DIF_SUBR_BCOPY: {
3234 /*
3235 * We need to be sure that the destination is in the scratch
3236 * region -- no other region is allowed.
3237 */
3238 uintptr_t src = tupregs[0].dttk_value;
3239 uintptr_t dest = tupregs[1].dttk_value;
3240 size_t size = tupregs[2].dttk_value;
3241
3242 if (!dtrace_inscratch(dest, size, mstate)) {
3243 *flags |= CPU_DTRACE_BADADDR;
3244 *illval = regs[rd];
3245 break;
3246 }
3247
3248 if (!dtrace_canload(src, size, mstate, vstate)) {
3249 regs[rd] = NULL;
3250 break;
3251 }
3252
3253 dtrace_bcopy((void *)src, (void *)dest, size);
3254 break;
3255 }
3256
3257 case DIF_SUBR_ALLOCA:
3258 case DIF_SUBR_COPYIN: {
3259 uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
3260 uint64_t size =
3261 tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
3262 size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
3263
3264 /*
3265 * This action doesn't require any credential checks since
3266 * probes will not activate in user contexts to which the
3267 * enabling user does not have permissions.
3268 */
3269
3270 /*
3271 * Rounding up the user allocation size could have overflowed
3272 * a large, bogus allocation (like -1ULL) to 0.
3273 */
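		/*
		 * (E.g. size == -1ULL wraps scratch_size around to a value
		 * smaller than size itself, which the test below catches.)
		 */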
3274 if (scratch_size < size ||
3275 !DTRACE_INSCRATCH(mstate, scratch_size)) {
3276 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3277 regs[rd] = NULL;
3278 break;
3279 }
3280
3281 if (subr == DIF_SUBR_COPYIN) {
3282 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3283 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3284 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3285 }
3286
3287 mstate->dtms_scratch_ptr += scratch_size;
3288 regs[rd] = dest;
3289 break;
3290 }
3291
3292 case DIF_SUBR_COPYINTO: {
3293 uint64_t size = tupregs[1].dttk_value;
3294 uintptr_t dest = tupregs[2].dttk_value;
3295
3296 /*
3297 * This action doesn't require any credential checks since
3298 * probes will not activate in user contexts to which the
3299 * enabling user does not have permissions.
3300 */
3301 if (!dtrace_inscratch(dest, size, mstate)) {
3302 *flags |= CPU_DTRACE_BADADDR;
3303 *illval = regs[rd];
3304 break;
3305 }
3306
3307 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3308 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3309 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3310 break;
3311 }
3312
3313 case DIF_SUBR_COPYINSTR: {
3314 uintptr_t dest = mstate->dtms_scratch_ptr;
3315 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3316
3317 if (nargs > 1 && tupregs[1].dttk_value < size)
3318 size = tupregs[1].dttk_value + 1;
3319
3320 /*
3321 * This action doesn't require any credential checks since
3322 * probes will not activate in user contexts to which the
3323 * enabling user does not have permissions.
3324 */
3325 if (!DTRACE_INSCRATCH(mstate, size)) {
3326 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3327 regs[rd] = NULL;
3328 break;
3329 }
3330
3331 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3332 dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
3333 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3334
3335 ((char *)dest)[size - 1] = '\0';
3336 mstate->dtms_scratch_ptr += size;
3337 regs[rd] = dest;
3338 break;
3339 }
3340
3341 case DIF_SUBR_MSGSIZE:
3342 case DIF_SUBR_MSGDSIZE: {
3343#ifndef VBOX
3344 uintptr_t baddr = tupregs[0].dttk_value, daddr;
3345 uintptr_t wptr, rptr;
3346 size_t count = 0;
3347 int cont = 0;
3348
3349 while (baddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
3350
3351 if (!dtrace_canload(baddr, sizeof (mblk_t), mstate,
3352 vstate)) {
3353 regs[rd] = NULL;
3354 break;
3355 }
3356
3357 wptr = dtrace_loadptr(baddr +
3358 offsetof(mblk_t, b_wptr));
3359
3360 rptr = dtrace_loadptr(baddr +
3361 offsetof(mblk_t, b_rptr));
3362
3363 if (wptr < rptr) {
3364 *flags |= CPU_DTRACE_BADADDR;
3365 *illval = tupregs[0].dttk_value;
3366 break;
3367 }
3368
3369 daddr = dtrace_loadptr(baddr +
3370 offsetof(mblk_t, b_datap));
3371
3372 baddr = dtrace_loadptr(baddr +
3373 offsetof(mblk_t, b_cont));
3374
3375 /*
3376	 * We want to protect against denial-of-service here,
3377 * so we're only going to search the list for
3378 * dtrace_msgdsize_max mblks.
3379 */
3380 if (cont++ > dtrace_msgdsize_max) {
3381 *flags |= CPU_DTRACE_ILLOP;
3382 break;
3383 }
3384
3385 if (subr == DIF_SUBR_MSGDSIZE) {
3386 if (dtrace_load8(daddr +
3387 offsetof(dblk_t, db_type)) != M_DATA)
3388 continue;
3389 }
3390
3391 count += wptr - rptr;
3392 }
3393
3394 if (!(*flags & CPU_DTRACE_FAULT))
3395 regs[rd] = count;
3396
3397#else
3398 regs[rd] = 0;
3399 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3400#endif
3401 break;
3402 }
3403
3404 case DIF_SUBR_PROGENYOF: {
3405#ifndef VBOX
3406 pid_t pid = tupregs[0].dttk_value;
3407 proc_t *p;
3408 int rval = 0;
3409
3410 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3411
3412 for (p = curthread->t_procp; p != NULL; p = p->p_parent) {
3413 if (p->p_pidp->pid_id == pid) {
3414 rval = 1;
3415 break;
3416 }
3417 }
3418
3419 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3420
3421 regs[rd] = rval;
3422#else
3423 regs[rd] = 0;
3424 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3425#endif
3426 break;
3427 }
3428
3429 case DIF_SUBR_SPECULATION:
3430 regs[rd] = dtrace_speculation(state);
3431 break;
3432
3433 case DIF_SUBR_COPYOUT: {
3434 uintptr_t kaddr = tupregs[0].dttk_value;
3435 uintptr_t uaddr = tupregs[1].dttk_value;
3436 uint64_t size = tupregs[2].dttk_value;
3437
3438 if (!dtrace_destructive_disallow &&
3439 dtrace_priv_proc_control(state) &&
3440 !dtrace_istoxic(kaddr, size)) {
3441 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3442 dtrace_copyout(kaddr, uaddr, size, flags);
3443 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3444 }
3445 break;
3446 }
3447
3448 case DIF_SUBR_COPYOUTSTR: {
3449 uintptr_t kaddr = tupregs[0].dttk_value;
3450 uintptr_t uaddr = tupregs[1].dttk_value;
3451 uint64_t size = tupregs[2].dttk_value;
3452
3453 if (!dtrace_destructive_disallow &&
3454 dtrace_priv_proc_control(state) &&
3455 !dtrace_istoxic(kaddr, size)) {
3456 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3457 dtrace_copyoutstr(kaddr, uaddr, size, flags);
3458 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3459 }
3460 break;
3461 }
3462
3463 case DIF_SUBR_STRLEN: {
3464 size_t sz;
3465 uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
3466 sz = dtrace_strlen((char *)addr,
3467 state->dts_options[DTRACEOPT_STRSIZE]);
3468
3469 if (!dtrace_canload(addr, sz + 1, mstate, vstate)) {
3470 regs[rd] = NULL;
3471 break;
3472 }
3473
3474 regs[rd] = sz;
3475
3476 break;
3477 }
3478
3479 case DIF_SUBR_STRCHR:
3480 case DIF_SUBR_STRRCHR: {
3481 /*
3482 * We're going to iterate over the string looking for the
3483 * specified character. We will iterate until we have reached
3484 * the string length or we have found the character. If this
3485 * is DIF_SUBR_STRRCHR, we will look for the last occurrence
3486 * of the specified character instead of the first.
3487 */
3488 uintptr_t saddr = tupregs[0].dttk_value;
3489 uintptr_t addr = tupregs[0].dttk_value;
3490 uintptr_t limit = addr + state->dts_options[DTRACEOPT_STRSIZE];
3491 char c, target = (char)tupregs[1].dttk_value;
3492
3493 for (regs[rd] = NULL; addr < limit; addr++) {
3494 if ((c = dtrace_load8(addr)) == target) {
3495 regs[rd] = addr;
3496
3497 if (subr == DIF_SUBR_STRCHR)
3498 break;
3499 }
3500
3501 if (c == '\0')
3502 break;
3503 }
3504
3505 if (!dtrace_canload(saddr, addr - saddr, mstate, vstate)) {
3506 regs[rd] = NULL;
3507 break;
3508 }
3509
3510 break;
3511 }
3512
3513 case DIF_SUBR_STRSTR:
3514 case DIF_SUBR_INDEX:
3515 case DIF_SUBR_RINDEX: {
3516 /*
3517 * We're going to iterate over the string looking for the
3518 * specified string. We will iterate until we have reached
3519 * the string length or we have found the string. (Yes, this
3520 * is done in the most naive way possible -- but considering
3521 * that the string we're searching for is likely to be
3522 * relatively short, the complexity of Rabin-Karp or similar
3523 * hardly seems merited.)
3524 */
3525 char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
3526 char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
3527 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3528 size_t len = dtrace_strlen(addr, size);
3529 size_t sublen = dtrace_strlen(substr, size);
3530 char *limit = addr + len, *orig = addr;
3531 int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
3532 int inc = 1;
3533
3534 regs[rd] = notfound;
3535
3536 if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
3537 regs[rd] = NULL;
3538 break;
3539 }
3540
3541 if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
3542 vstate)) {
3543 regs[rd] = NULL;
3544 break;
3545 }
3546
3547 /*
3548 * strstr() and index()/rindex() have similar semantics if
3549 * both strings are the empty string: strstr() returns a
3550 * pointer to the (empty) string, and index() and rindex()
3551 * both return index 0 (regardless of any position argument).
3552 */
3553 if (sublen == 0 && len == 0) {
3554 if (subr == DIF_SUBR_STRSTR)
3555 regs[rd] = (uintptr_t)addr;
3556 else
3557 regs[rd] = 0;
3558 break;
3559 }
3560
3561 if (subr != DIF_SUBR_STRSTR) {
3562 if (subr == DIF_SUBR_RINDEX) {
3563 limit = orig - 1;
3564 addr += len;
3565 inc = -1;
3566 }
3567
3568 /*
3569 * Both index() and rindex() take an optional position
3570 * argument that denotes the starting position.
3571 */
3572 if (nargs == 3) {
3573 int64_t pos = (int64_t)tupregs[2].dttk_value;
3574
3575 /*
3576 * If the position argument to index() is
3577 * negative, Perl implicitly clamps it at
3578 * zero. This semantic is a little surprising
3579 * given the special meaning of negative
3580 * positions to similar Perl functions like
3581 * substr(), but it appears to reflect a
3582 * notion that index() can start from a
3583 * negative index and increment its way up to
3584 * the string. Given this notion, Perl's
3585 * rindex() is at least self-consistent in
3586 * that it implicitly clamps positions greater
3587 * than the string length to be the string
3588 * length. Where Perl completely loses
3589 * coherence, however, is when the specified
3590 * substring is the empty string (""). In
3591 * this case, even if the position is
3592 * negative, rindex() returns 0 -- and even if
3593 * the position is greater than the length,
3594 * index() returns the string length. These
3595 * semantics violate the notion that index()
3596 * should never return a value less than the
3597 * specified position and that rindex() should
3598 * never return a value greater than the
3599 * specified position. (One assumes that
3600 * these semantics are artifacts of Perl's
3601 * implementation and not the results of
3602 * deliberate design -- it beggars belief that
3603 * even Larry Wall could desire such oddness.)
3604 * While in the abstract one would wish for
3605 * consistent position semantics across
3606 * substr(), index() and rindex() -- or at the
3607 * very least self-consistent position
3608 * semantics for index() and rindex() -- we
3609 * instead opt to keep with the extant Perl
3610 * semantics, in all their broken glory. (Do
3611 * we have more desire to maintain Perl's
3612 * semantics than Perl does? Probably.)
3613 */
3614 if (subr == DIF_SUBR_RINDEX) {
3615 if (pos < 0) {
3616 if (sublen == 0)
3617 regs[rd] = 0;
3618 break;
3619 }
3620
3621 if (VBDTCAST(uint64_t)pos > len)
3622 pos = len;
3623 } else {
3624 if (pos < 0)
3625 pos = 0;
3626
3627 if (VBDTCAST(uint64_t)pos >= len) {
3628 if (sublen == 0)
3629 regs[rd] = len;
3630 break;
3631 }
3632 }
3633
3634 addr = orig + pos;
3635 }
3636 }
3637
3638 for (regs[rd] = notfound; addr != limit; addr += inc) {
3639 if (dtrace_strncmp(addr, substr, sublen) == 0) {
3640 if (subr != DIF_SUBR_STRSTR) {
3641 /*
3642 * As D index() and rindex() are
3643 * modeled on Perl (and not on awk),
3644 * we return a zero-based (and not a
3645 * one-based) index. (For you Perl
3646 * weenies: no, we're not going to add
3647 * $[ -- and shouldn't you be at a con
3648 * or something?)
3649 */
3650 regs[rd] = (uintptr_t)(addr - orig);
3651 break;
3652 }
3653
3654 ASSERT(subr == DIF_SUBR_STRSTR);
3655 regs[rd] = (uintptr_t)addr;
3656 break;
3657 }
3658 }
3659
3660 break;
3661 }
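	/*
	 * Worked examples of the above: strstr("foobarbaz", "bar") returns
	 * a pointer to the "barbaz" suffix; index("foobarbaz", "bar")
	 * returns 3; rindex("aabaa", "a") returns 4.  As noted, the
	 * indices are zero-based, following Perl rather than awk.
	 */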
3662
3663 case DIF_SUBR_STRTOK: {
3664 uintptr_t addr = tupregs[0].dttk_value;
3665 uintptr_t tokaddr = tupregs[1].dttk_value;
3666 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3667 uintptr_t limit, toklimit = tokaddr + size;
3668 uint8_t c, tokmap[32]; /* 256 / 8 */
3669 char *dest = (char *)mstate->dtms_scratch_ptr;
3670 VBDTTYPE(unsigned,int) i;
3671
3672 /*
3673 * Check both the token buffer and (later) the input buffer,
3674 * since both could be non-scratch addresses.
3675 */
3676 if (!dtrace_strcanload(tokaddr, size, mstate, vstate)) {
3677 regs[rd] = NULL;
3678 break;
3679 }
3680
3681 if (!DTRACE_INSCRATCH(mstate, size)) {
3682 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3683 regs[rd] = NULL;
3684 break;
3685 }
3686
3687 if (addr == NULL) {
3688 /*
3689 * If the address specified is NULL, we use our saved
3690 * strtok pointer from the mstate. Note that this
3691 * means that the saved strtok pointer is _only_
3692 * valid within multiple enablings of the same probe --
3693 * it behaves like an implicit clause-local variable.
3694 */
3695 addr = mstate->dtms_strtok;
3696 } else {
3697 /*
3698 * If the user-specified address is non-NULL we must
3699 * access check it. This is the only time we have
3700 * a chance to do so, since this address may reside
3701	 * in the string table of this clause -- future calls
3702 * (when we fetch addr from mstate->dtms_strtok)
3703 * would fail this access check.
3704 */
3705 if (!dtrace_strcanload(addr, size, mstate, vstate)) {
3706 regs[rd] = NULL;
3707 break;
3708 }
3709 }
3710
3711 /*
3712 * First, zero the token map, and then process the token
3713 * string -- setting a bit in the map for every character
3714 * found in the token string.
3715 */
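/*
 * The map is a 256-bit bitmap packed into 32 bytes: character
 * c lives at bit (c & 0x7) of byte (c >> 3).  For example,
 * '/' (0x2f) sets bit 7 of tokmap[5]:
 *
 *	tokmap[0x2f >> 3] |= (1 << (0x2f & 0x7));   becomes
 *	tokmap[5]         |= 0x80;
 */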
3716 for (i = 0; i < sizeof (tokmap); i++)
3717 tokmap[i] = 0;
3718
3719 for (; tokaddr < toklimit; tokaddr++) {
3720 if ((c = dtrace_load8(tokaddr)) == '\0')
3721 break;
3722
3723 ASSERT((c >> 3) < sizeof (tokmap));
3724 tokmap[c >> 3] |= (1 << (c & 0x7));
3725 }
3726
3727 for (limit = addr + size; addr < limit; addr++) {
3728 /*
3729 * We're looking for a character that is _not_ contained
3730 * in the token string.
3731 */
3732 if ((c = dtrace_load8(addr)) == '\0')
3733 break;
3734
3735 if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
3736 break;
3737 }
3738
3739 if (c == '\0') {
3740 /*
3741 * We reached the end of the string without finding
3742 * any character that was not in the token string.
3743 * We return NULL in this case, and we set the saved
3744 * address to NULL as well.
3745 */
3746 regs[rd] = NULL;
3747 mstate->dtms_strtok = NULL;
3748 break;
3749 }
3750
3751 /*
3752 * From here on, we're copying into the destination string.
3753 */
3754 for (i = 0; addr < limit && i < size - 1; addr++) {
3755 if ((c = dtrace_load8(addr)) == '\0')
3756 break;
3757
3758 if (tokmap[c >> 3] & (1 << (c & 0x7)))
3759 break;
3760
3761 ASSERT(i < size);
3762 dest[i++] = c;
3763 }
3764
3765 ASSERT(i < size);
3766 dest[i] = '\0';
3767 regs[rd] = (uintptr_t)dest;
3768 mstate->dtms_scratch_ptr += size;
3769 mstate->dtms_strtok = addr;
3770 break;
3771 }
3772
3773 case DIF_SUBR_SUBSTR: {
3774 uintptr_t s = tupregs[0].dttk_value;
3775 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3776 char *d = (char *)mstate->dtms_scratch_ptr;
3777 int64_t index = (int64_t)tupregs[1].dttk_value;
3778 int64_t remaining = (int64_t)tupregs[2].dttk_value;
3779 size_t len = dtrace_strlen((char *)s, size);
3780 int64_t i;
3781
3782 if (!dtrace_canload(s, len + 1, mstate, vstate)) {
3783 regs[rd] = NULL;
3784 break;
3785 }
3786
3787 if (!DTRACE_INSCRATCH(mstate, size)) {
3788 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3789 regs[rd] = NULL;
3790 break;
3791 }
3792
3793 if (nargs <= 2)
3794 remaining = (int64_t)size;
3795
3796 if (index < 0) {
3797 index += len;
3798
3799 if (index < 0 && index + remaining > 0) {
3800 remaining += index;
3801 index = 0;
3802 }
3803 }
3804
3805 if (VBDTCAST(uint64_t)index >= len || index < 0) {
3806 remaining = 0;
3807 } else if (remaining < 0) {
3808 remaining += len - index;
3809 } else if (VBDTCAST(uint64_t)index + remaining > size) {
3810 remaining = size - index;
3811 }
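/*
 * Worked examples of the normalization above, assuming
 * s = "dtrace" (so len == 6) and a default-sized scratch:
 *
 *	substr("dtrace", 2)       index 2, remaining clamped  => "race"
 *	substr("dtrace", -4, 2)   index 2, remaining 2        => "ra"
 *	substr("dtrace", -9, 5)   index 0, remaining 2        => "dt"
 *	substr("dtrace", 9)       remaining 0                 => ""
 */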
3812
3813 for (i = 0; i < remaining; i++) {
3814 if ((d[i] = dtrace_load8(s + index + i)) == '\0')
3815 break;
3816 }
3817
3818 d[i] = '\0';
3819
3820 mstate->dtms_scratch_ptr += size;
3821 regs[rd] = (uintptr_t)d;
3822 break;
3823 }
3824
3825 case DIF_SUBR_GETMAJOR:
3826#ifndef VBOX
3827#ifdef _LP64
3828 regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR64) & MAXMAJ64;
3829#else
3830 regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR) & MAXMAJ;
3831#endif
3832#else
3833 regs[rd] = 0;
3834 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3835#endif
3836 break;
3837
3838 case DIF_SUBR_GETMINOR:
3839#ifndef VBOX
3840#ifdef _LP64
3841 regs[rd] = tupregs[0].dttk_value & MAXMIN64;
3842#else
3843 regs[rd] = tupregs[0].dttk_value & MAXMIN;
3844#endif
3845#else
3846 regs[rd] = 0;
3847 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3848#endif
3849 break;
3850
3851 case DIF_SUBR_DDI_PATHNAME: {
3852#ifndef VBOX
3853 /*
3854 * This one is a galactic mess. We are going to roughly
3855 * emulate ddi_pathname(), but it's made more complicated
3856 * by the fact that we (a) want to include the minor name and
3857 * (b) must proceed iteratively instead of recursively.
3858 */
3859 uintptr_t dest = mstate->dtms_scratch_ptr;
3860 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3861 char *start = (char *)dest, *end = start + size - 1;
3862 uintptr_t daddr = tupregs[0].dttk_value;
3863 int64_t minor = (int64_t)tupregs[1].dttk_value;
3864 char *s;
3865 int i, len, depth = 0;
3866
3867 /*
3868 * Due to all the pointer jumping we do and context we must
3869 * rely upon, we just mandate that the user must have kernel
3870 * read privileges to use this routine.
3871 */
3872 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) == 0) {
3873 *flags |= CPU_DTRACE_KPRIV;
3874 *illval = daddr;
3875 regs[rd] = NULL;
3876 }
3877
3878 if (!DTRACE_INSCRATCH(mstate, size)) {
3879 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3880 regs[rd] = NULL;
3881 break;
3882 }
3883
3884 *end = '\0';
3885
3886 /*
3887 * We want to have a name for the minor. In order to do this,
3888 * we need to walk the minor list from the devinfo. We want
3889 * to be sure that we don't infinitely walk a circular list,
3890 * so we check for circularity by sending a scout pointer
3891 * ahead two elements for every element that we iterate over;
3892 * if the list is circular, these will ultimately point to the
3893 * same element. You may recognize this little trick as the
3894 * answer to a stupid interview question -- one that always
3895 * seems to be asked by those who had to have it laboriously
3896 * explained to them, and who can't even concisely describe
3897 * the conditions under which one would be forced to resort to
3898 * this technique. Needless to say, those conditions are
3899 * found here -- and probably only here. Is this the only use
3900 * of this infamous trick in shipping, production code? If it
3901 * isn't, it probably should be...
3902 */
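/*
 * In miniature, the check below is the classic two-pointer
 * cycle test, sketched here on a plain list (illustrative
 * only, not part of this file):
 *
 *	slow = head;
 *	fast = head->next;
 *	while (fast != NULL && fast->next != NULL) {
 *		if (fast == slow)
 *			return (B_TRUE);	-- circular
 *		slow = slow->next;
 *		fast = fast->next->next;
 *	}
 *	return (B_FALSE);			-- NULL-terminated
 *
 * Because 'scout' gains one element on 'maddr' per iteration,
 * a circular list guarantees a collision instead of an
 * infinite walk.
 */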
3903 if (minor != -1) {
3904 uintptr_t maddr = dtrace_loadptr(daddr +
3905 offsetof(struct dev_info, devi_minor));
3906
3907 uintptr_t next = offsetof(struct ddi_minor_data, next);
3908 uintptr_t name = offsetof(struct ddi_minor_data,
3909 d_minor) + offsetof(struct ddi_minor, name);
3910 uintptr_t dev = offsetof(struct ddi_minor_data,
3911 d_minor) + offsetof(struct ddi_minor, dev);
3912 uintptr_t scout;
3913
3914 if (maddr != NULL)
3915 scout = dtrace_loadptr(maddr + next);
3916
3917 while (maddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
3918 uint64_t m;
3919#ifdef _LP64
3920 m = dtrace_load64(maddr + dev) & MAXMIN64;
3921#else
3922 m = dtrace_load32(maddr + dev) & MAXMIN;
3923#endif
3924 if (m != minor) {
3925 maddr = dtrace_loadptr(maddr + next);
3926
3927 if (scout == NULL)
3928 continue;
3929
3930 scout = dtrace_loadptr(scout + next);
3931
3932 if (scout == NULL)
3933 continue;
3934
3935 scout = dtrace_loadptr(scout + next);
3936
3937 if (scout == NULL)
3938 continue;
3939
3940 if (scout == maddr) {
3941 *flags |= CPU_DTRACE_ILLOP;
3942 break;
3943 }
3944
3945 continue;
3946 }
3947
3948 /*
3949 * We have the minor data. Now we need to
3950 * copy the minor's name into the end of the
3951 * pathname.
3952 */
3953 s = (char *)dtrace_loadptr(maddr + name);
3954 len = dtrace_strlen(s, size);
3955
3956 if (*flags & CPU_DTRACE_FAULT)
3957 break;
3958
3959 if (len != 0) {
3960 if ((end -= (len + 1)) < start)
3961 break;
3962
3963 *end = ':';
3964 }
3965
3966 for (i = 1; i <= len; i++)
3967 end[i] = dtrace_load8((uintptr_t)s++);
3968 break;
3969 }
3970 }
3971
3972 while (daddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
3973 ddi_node_state_t devi_state;
3974
3975 devi_state = dtrace_load32(daddr +
3976 offsetof(struct dev_info, devi_node_state));
3977
3978 if (*flags & CPU_DTRACE_FAULT)
3979 break;
3980
3981 if (devi_state >= DS_INITIALIZED) {
3982 s = (char *)dtrace_loadptr(daddr +
3983 offsetof(struct dev_info, devi_addr));
3984 len = dtrace_strlen(s, size);
3985
3986 if (*flags & CPU_DTRACE_FAULT)
3987 break;
3988
3989 if (len != 0) {
3990 if ((end -= (len + 1)) < start)
3991 break;
3992
3993 *end = '@';
3994 }
3995
3996 for (i = 1; i <= len; i++)
3997 end[i] = dtrace_load8((uintptr_t)s++);
3998 }
3999
4000 /*
4001 * Now for the node name...
4002 */
4003 s = (char *)dtrace_loadptr(daddr +
4004 offsetof(struct dev_info, devi_node_name));
4005
4006 daddr = dtrace_loadptr(daddr +
4007 offsetof(struct dev_info, devi_parent));
4008
4009 /*
4010 * If our parent is NULL (that is, if we're the root
4011 * node), we're going to use the special path
4012 * "devices".
4013 */
4014 if (daddr == NULL)
4015 s = "devices";
4016
4017 len = dtrace_strlen(s, size);
4018 if (*flags & CPU_DTRACE_FAULT)
4019 break;
4020
4021 if ((end -= (len + 1)) < start)
4022 break;
4023
4024 for (i = 1; i <= len; i++)
4025 end[i] = dtrace_load8((uintptr_t)s++);
4026 *end = '/';
4027
4028 if (depth++ > dtrace_devdepth_max) {
4029 *flags |= CPU_DTRACE_ILLOP;
4030 break;
4031 }
4032 }
4033
4034 if (end < start)
4035 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4036
4037 if (daddr == NULL) {
4038 regs[rd] = (uintptr_t)end;
4039 mstate->dtms_scratch_ptr += size;
4040 }
4041
4042#else
4043 regs[rd] = 0;
4044 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4045#endif
4046 break;
4047 }
4048
4049 case DIF_SUBR_STRJOIN: {
4050 char *d = (char *)mstate->dtms_scratch_ptr;
4051 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4052 uintptr_t s1 = tupregs[0].dttk_value;
4053 uintptr_t s2 = tupregs[1].dttk_value;
4054 VBDTTYPE(unsigned,int) i = 0;
4055
4056 if (!dtrace_strcanload(s1, size, mstate, vstate) ||
4057 !dtrace_strcanload(s2, size, mstate, vstate)) {
4058 regs[rd] = NULL;
4059 break;
4060 }
4061
4062 if (!DTRACE_INSCRATCH(mstate, size)) {
4063 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4064 regs[rd] = NULL;
4065 break;
4066 }
4067
4068 for (;;) {
4069 if (i >= size) {
4070 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4071 regs[rd] = NULL;
4072 break;
4073 }
4074
4075 if ((d[i++] = dtrace_load8(s1++)) == '\0') {
4076 i--;
4077 break;
4078 }
4079 }
4080
4081 for (;;) {
4082 if (i >= size) {
4083 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4084 regs[rd] = NULL;
4085 break;
4086 }
4087
4088 if ((d[i++] = dtrace_load8(s2++)) == '\0')
4089 break;
4090 }
4091
4092 if (i < size) {
4093 mstate->dtms_scratch_ptr += i;
4094 regs[rd] = (uintptr_t)d;
4095 }
4096
4097 break;
4098 }
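/*
 * Net effect of the case above (illustrative D):
 * strjoin("/var", "/log") yields "/var/log".  Note that on
 * success the scratch pointer advances by i -- the joined
 * length including its NUL -- rather than by the full 'size'
 * as the other string subroutines here do.
 */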
4099
4100 case DIF_SUBR_LLTOSTR: {
4101 int64_t i = (int64_t)tupregs[0].dttk_value;
4102 int64_t val = i < 0 ? i * -1 : i;
4103 uint64_t size = 22; /* enough room for 2^64 in decimal */
4104 char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
4105
4106 if (!DTRACE_INSCRATCH(mstate, size)) {
4107 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4108 regs[rd] = NULL;
4109 break;
4110 }
4111
4112 for (*end-- = '\0'; val; val /= 10)
4113 *end-- = '0' + (val % 10);
4114
4115 if (i == 0)
4116 *end-- = '0';
4117
4118 if (i < 0)
4119 *end-- = '-';
4120
4121 regs[rd] = (uintptr_t)end + 1;
4122 mstate->dtms_scratch_ptr += size;
4123 break;
4124 }
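/*
 * The conversion above works right-to-left: digits are peeled
 * off with val % 10 into the end of the buffer, then the sign
 * is prepended.  The 22-byte size is 20 digits (2^64 is about
 * 1.8e19) plus one byte for a '-' and one for the NUL.  For
 * example, lltostr(-42) stores '\0', '2', '4', '-' at
 * descending addresses and returns a pointer to "-42".
 */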
4125
4126 case DIF_SUBR_HTONS:
4127 case DIF_SUBR_NTOHS:
4128#ifdef _BIG_ENDIAN
4129 regs[rd] = (uint16_t)tupregs[0].dttk_value;
4130#else
4131 regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);
4132#endif
4133 break;
4134
4135
4136 case DIF_SUBR_HTONL:
4137 case DIF_SUBR_NTOHL:
4138#ifdef _BIG_ENDIAN
4139 regs[rd] = (uint32_t)tupregs[0].dttk_value;
4140#else
4141 regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);
4142#endif
4143 break;
4144
4145
4146 case DIF_SUBR_HTONLL:
4147 case DIF_SUBR_NTOHLL:
4148#ifdef _BIG_ENDIAN
4149 regs[rd] = (uint64_t)tupregs[0].dttk_value;
4150#else
4151 regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);
4152#endif
4153 break;
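/*
 * On little-endian hosts these cases reduce to byte swaps:
 * e.g. DT_BSWAP_16(0x1234) is 0x3412 and DT_BSWAP_32(0x11223344)
 * is 0x44332211.  On big-endian hosts network order already is
 * host order, so the value is merely truncated to its width.
 */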
4154
4155
4156 case DIF_SUBR_DIRNAME:
4157 case DIF_SUBR_BASENAME: {
4158 char *dest = (char *)mstate->dtms_scratch_ptr;
4159 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4160 uintptr_t src = tupregs[0].dttk_value;
4161 int i, j, len = VBDTCAST(int)dtrace_strlen((char *)src, size);
4162 int lastbase = -1, firstbase = -1, lastdir = -1;
4163 int start, end;
4164
4165 if (!dtrace_canload(src, len + 1, mstate, vstate)) {
4166 regs[rd] = NULL;
4167 break;
4168 }
4169
4170 if (!DTRACE_INSCRATCH(mstate, size)) {
4171 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4172 regs[rd] = NULL;
4173 break;
4174 }
4175
4176 /*
4177 * The basename and dirname of a zero-length string are
4178 * defined to be ".".
4179 */
4180 if (len == 0) {
4181 len = 1;
4182 src = (uintptr_t)".";
4183 }
4184
4185 /*
4186 * Start from the back of the string, moving back toward the
4187 * front until we see a character that isn't a slash. That
4188 * character is the last character in the basename.
4189 */
4190 for (i = len - 1; i >= 0; i--) {
4191 if (dtrace_load8(src + i) != '/')
4192 break;
4193 }
4194
4195 if (i >= 0)
4196 lastbase = i;
4197
4198 /*
4199 * Starting from the last character in the basename, move
4200 * towards the front until we find a slash. The character
4201 * that we processed immediately before that is the first
4202 * character in the basename.
4203 */
4204 for (; i >= 0; i--) {
4205 if (dtrace_load8(src + i) == '/')
4206 break;
4207 }
4208
4209 if (i >= 0)
4210 firstbase = i + 1;
4211
4212 /*
4213 * Now keep going until we find a non-slash character. That
4214 * character is the last character in the dirname.
4215 */
4216 for (; i >= 0; i--) {
4217 if (dtrace_load8(src + i) != '/')
4218 break;
4219 }
4220
4221 if (i >= 0)
4222 lastdir = i;
4223
4224 ASSERT(!(lastbase == -1 && firstbase != -1));
4225 ASSERT(!(firstbase == -1 && lastdir != -1));
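/*
 * A worked example of the three scans above, for
 * src = "/usr//lib/" (len == 10):
 *
 *	index:     0123456789
 *	src:       /usr//lib/
 *	lastbase:  8  ('b', the last non-slash)
 *	firstbase: 6  ('l', just past the slash at 5)
 *	lastdir:   3  ('r', last character of the dirname)
 *
 * yielding basename "lib" and dirname "/usr".
 */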
4226
4227 if (lastbase == -1) {
4228 /*
4229 * We didn't find a non-slash character. We know that
4230 * the length is non-zero, so the whole string must be
4231 * slashes. In either the dirname or the basename
4232 * case, we return '/'.
4233 */
4234 ASSERT(firstbase == -1);
4235 firstbase = lastbase = lastdir = 0;
4236 }
4237
4238 if (firstbase == -1) {
4239 /*
4240 * The entire string consists only of a basename
4241 * component. If we're looking for dirname, we need
4242 * to change our string to be just "."; if we're
4243 * looking for a basename, we'll just set the first
4244 * character of the basename to be 0.
4245 */
4246 if (subr == DIF_SUBR_DIRNAME) {
4247 ASSERT(lastdir == -1);
4248 src = (uintptr_t)".";
4249 lastdir = 0;
4250 } else {
4251 firstbase = 0;
4252 }
4253 }
4254
4255 if (subr == DIF_SUBR_DIRNAME) {
4256 if (lastdir == -1) {
4257 /*
4258 * We know that we have a slash in the name --
4259 * or lastdir would be set to 0, above. And
4260 * because lastdir is -1, we know that this
4261 * slash must be the first character. (That
4262 * is, the full string must be of the form
4263 * "/basename".) In this case, the last
4264 * character of the directory name is 0.
4265 */
4266 lastdir = 0;
4267 }
4268
4269 start = 0;
4270 end = lastdir;
4271 } else {
4272 ASSERT(subr == DIF_SUBR_BASENAME);
4273 ASSERT(firstbase != -1 && lastbase != -1);
4274 start = firstbase;
4275 end = lastbase;
4276 }
4277
4278 for (i = start, j = 0; i <= end && VBDTCAST(unsigned)j < size - 1; i++, j++)
4279 dest[j] = dtrace_load8(src + i);
4280
4281 dest[j] = '\0';
4282 regs[rd] = (uintptr_t)dest;
4283 mstate->dtms_scratch_ptr += size;
4284 break;
4285 }
4286
4287 case DIF_SUBR_CLEANPATH: {
4288 char *dest = (char *)mstate->dtms_scratch_ptr, c;
4289 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4290 uintptr_t src = tupregs[0].dttk_value;
4291 int i = 0, j = 0;
4292
4293 if (!dtrace_strcanload(src, size, mstate, vstate)) {
4294 regs[rd] = NULL;
4295 break;
4296 }
4297
4298 if (!DTRACE_INSCRATCH(mstate, size)) {
4299 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4300 regs[rd] = NULL;
4301 break;
4302 }
4303
4304 /*
4305 * Move forward, loading each character.
4306 */
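/*
 * The loop below is a small state machine keyed on runs of
 * '/' and '.' characters.  Illustrative results (D
 * expressions, following the cases below):
 *
 *	cleanpath("/a//b")       => "/a/b"
 *	cleanpath("/a/./b")      => "/a/b"
 *	cleanpath("/a/b/../c")   => "/a/c"
 *	cleanpath("/a/..")       => "/"
 */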
4307 do {
4308 c = dtrace_load8(src + i++);
4309next:
4310 if (j + 5 >= VBDTCAST(int64_t)size) /* 5 = sizeof ("/..c") */
4311 break;
4312
4313 if (c != '/') {
4314 dest[j++] = c;
4315 continue;
4316 }
4317
4318 c = dtrace_load8(src + i++);
4319
4320 if (c == '/') {
4321 /*
4322 * We have two slashes -- we can just advance
4323 * to the next character.
4324 */
4325 goto next;
4326 }
4327
4328 if (c != '.') {
4329 /*
4330 * This is not "." and it's not ".." -- we can
4331 * just store the "/" and this character and
4332 * drive on.
4333 */
4334 dest[j++] = '/';
4335 dest[j++] = c;
4336 continue;
4337 }
4338
4339 c = dtrace_load8(src + i++);
4340
4341 if (c == '/') {
4342 /*
4343 * This is a "/./" component. We're not going
4344 * to store anything in the destination buffer;
4345 * we're just going to go to the next component.
4346 */
4347 goto next;
4348 }
4349
4350 if (c != '.') {
4351 /*
4352 * This is not ".." -- we can just store the
4353 * "/." and this character and continue
4354 * processing.
4355 */
4356 dest[j++] = '/';
4357 dest[j++] = '.';
4358 dest[j++] = c;
4359 continue;
4360 }
4361
4362 c = dtrace_load8(src + i++);
4363
4364 if (c != '/' && c != '\0') {
4365 /*
4366 * This is not ".." -- it's "..[mumble]".
4367 * We'll store the "/.." and this character
4368 * and continue processing.
4369 */
4370 dest[j++] = '/';
4371 dest[j++] = '.';
4372 dest[j++] = '.';
4373 dest[j++] = c;
4374 continue;
4375 }
4376
4377 /*
4378 * This is "/../" or "/..\0". We need to back up
4379 * our destination pointer until we find a "/".
4380 */
4381 i--;
4382 while (j != 0 && dest[--j] != '/')
4383 continue;
4384
4385 if (c == '\0')
4386 dest[++j] = '/';
4387 } while (c != '\0');
4388
4389 dest[j] = '\0';
4390 regs[rd] = (uintptr_t)dest;
4391 mstate->dtms_scratch_ptr += size;
4392 break;
4393 }
4394
4395 case DIF_SUBR_INET_NTOA:
4396 case DIF_SUBR_INET_NTOA6:
4397 case DIF_SUBR_INET_NTOP: {
4398#ifndef VBOX
4399 size_t size;
4400 int af, argi, i;
4401 char *base, *end;
4402
4403 if (subr == DIF_SUBR_INET_NTOP) {
4404 af = (int)tupregs[0].dttk_value;
4405 argi = 1;
4406 } else {
4407 af = subr == DIF_SUBR_INET_NTOA ? AF_INET: AF_INET6;
4408 argi = 0;
4409 }
4410
4411 if (af == AF_INET) {
4412 ipaddr_t ip4;
4413 uint8_t *ptr8, val;
4414
4415 /*
4416 * Safely load the IPv4 address.
4417 */
4418 ip4 = dtrace_load32(tupregs[argi].dttk_value);
4419
4420 /*
4421 * Check an IPv4 string will fit in scratch.
4422 */
4423 size = INET_ADDRSTRLEN;
4424 if (!DTRACE_INSCRATCH(mstate, size)) {
4425 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4426 regs[rd] = NULL;
4427 break;
4428 }
4429 base = (char *)mstate->dtms_scratch_ptr;
4430 end = (char *)mstate->dtms_scratch_ptr + size - 1;
4431
4432 /*
4433 * Stringify as a dotted decimal quad.
4434 */
4435 *end-- = '\0';
4436 ptr8 = (uint8_t *)&ip4;
4437 for (i = 3; i >= 0; i--) {
4438 val = ptr8[i];
4439
4440 if (val == 0) {
4441 *end-- = '0';
4442 } else {
4443 for (; val; val /= 10) {
4444 *end-- = '0' + (val % 10);
4445 }
4446 }
4447
4448 if (i > 0)
4449 *end-- = '.';
4450 }
4451 ASSERT(end + 1 >= base);
4452
4453 } else if (af == AF_INET6) {
4454 struct in6_addr ip6;
4455 int firstzero, tryzero, numzero, v6end;
4456 uint16_t val;
4457 const char digits[] = "0123456789abcdef";
4458
4459 /*
4460 * Stringify using RFC 1884 convention 2: 16-bit
4461 * hexadecimal values with zero-run compression and
4462 * lower-case hexadecimal digits, e.g.
4463 * fe80::214:4fff:fe0b:76c8.
4464 * The IPv4-embedded form is returned for inet_ntop;
4465 * just the IPv4 string is returned for inet_ntoa6.
4466 */
4467
4468 /*
4469 * Safely load the IPv6 address.
4470 */
4471 dtrace_bcopy(
4472 (void *)(uintptr_t)tupregs[argi].dttk_value,
4473 (void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
4474
4475 /*
4476 * Check an IPv6 string will fit in scratch.
4477 */
4478 size = INET6_ADDRSTRLEN;
4479 if (!DTRACE_INSCRATCH(mstate, size)) {
4480 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4481 regs[rd] = NULL;
4482 break;
4483 }
4484 base = (char *)mstate->dtms_scratch_ptr;
4485 end = (char *)mstate->dtms_scratch_ptr + size - 1;
4486 *end-- = '\0';
4487
4488 /*
4489 * Find the longest run of 16-bit zero values
4490 * for the single allowed zero compression, "::".
4491 */
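/*
 * For example, fe80::1 arrives as the bytes fe 80, twelve
 * zero bytes, then 00 01; the scan below settles on
 * firstzero == 2 and numzero == 12, which the reverse walk
 * further down renders as the single "::" in "fe80::1".
 */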
4492 firstzero = -1;
4493 tryzero = -1;
4494 numzero = 1;
4495 for (i = 0; i < sizeof (struct in6_addr); i++) {
4496 if (ip6._S6_un._S6_u8[i] == 0 &&
4497 tryzero == -1 && i % 2 == 0) {
4498 tryzero = i;
4499 continue;
4500 }
4501
4502 if (tryzero != -1 &&
4503 (ip6._S6_un._S6_u8[i] != 0 ||
4504 i == sizeof (struct in6_addr) - 1)) {
4505
4506 if (i - tryzero <= numzero) {
4507 tryzero = -1;
4508 continue;
4509 }
4510
4511 firstzero = tryzero;
4512 numzero = i - i % 2 - tryzero;
4513 tryzero = -1;
4514
4515 if (ip6._S6_un._S6_u8[i] == 0 &&
4516 i == sizeof (struct in6_addr) - 1)
4517 numzero += 2;
4518 }
4519 }
4520 ASSERT(firstzero + numzero <= sizeof (struct in6_addr));
4521
4522 /*
4523 * Check for an IPv4 embedded address.
4524 */
4525 v6end = sizeof (struct in6_addr) - 2;
4526 if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
4527 IN6_IS_ADDR_V4COMPAT(&ip6)) {
4528 for (i = sizeof (struct in6_addr) - 1;
4529 i >= DTRACE_V4MAPPED_OFFSET; i--) {
4530 ASSERT(end >= base);
4531
4532 val = ip6._S6_un._S6_u8[i];
4533
4534 if (val == 0) {
4535 *end-- = '0';
4536 } else {
4537 for (; val; val /= 10) {
4538 *end-- = '0' + val % 10;
4539 }
4540 }
4541
4542 if (i > DTRACE_V4MAPPED_OFFSET)
4543 *end-- = '.';
4544 }
4545
4546 if (subr == DIF_SUBR_INET_NTOA6)
4547 goto inetout;
4548
4549 /*
4550 * Set v6end to skip the IPv4 address that
4551 * we have already stringified.
4552 */
4553 v6end = 10;
4554 }
4555
4556 /*
4557 * Build the IPv6 string by working through the
4558 * address in reverse.
4559 */
4560 for (i = v6end; i >= 0; i -= 2) {
4561 ASSERT(end >= base);
4562
4563 if (i == firstzero + numzero - 2) {
4564 *end-- = ':';
4565 *end-- = ':';
4566 i -= numzero - 2;
4567 continue;
4568 }
4569
4570 if (i < 14 && i != firstzero - 2)
4571 *end-- = ':';
4572
4573 val = (ip6._S6_un._S6_u8[i] << 8) +
4574 ip6._S6_un._S6_u8[i + 1];
4575
4576 if (val == 0) {
4577 *end-- = '0';
4578 } else {
4579 for (; val; val /= 16) {
4580 *end-- = digits[val % 16];
4581 }
4582 }
4583 }
4584 ASSERT(end + 1 >= base);
4585
4586 } else {
4587 /*
4588 * The user didn't use AF_INET or AF_INET6.
4589 */
4590 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4591 regs[rd] = NULL;
4592 break;
4593 }
4594
4595inetout: regs[rd] = (uintptr_t)end + 1;
4596 mstate->dtms_scratch_ptr += size;
4597#else /* VBOX */
4598 regs[rd] = 0;
4599 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4600#endif /* VBOX */
4601 break;
4602 }
4603
4604 }
4605}
4606
4607/*
4608 * Emulate the execution of DTrace IR instructions specified by the given
4609 * DIF object. This function is deliberately void of assertions as all of
4610 * the necessary checks are handled by a call to dtrace_difo_validate().
4611 */
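/*
 * As a rough illustration (not actual compiler output), a D
 * assignment like 'x = arg0 + 1' might arrive here as a short
 * DIF program over the emulated registers below:
 *
 *	ldgs arg0, %r1    -- built-in variable, dtrace_dif_variable()
 *	setx [1],  %r2    -- constant 1 from the DIF integer table
 *	add  %r1, %r2, %r3
 *	stgs %r3, x       -- user global, via vstate->dtvs_globals
 *	ret  %r3
 *
 * Each mnemonic corresponds to a DIF_OP_* case in the switch
 * below; %r0 always reads as zero.
 */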
4612static uint64_t
4613dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
4614 dtrace_vstate_t *vstate, dtrace_state_t *state)
4615{
4616 const dif_instr_t *text = difo->dtdo_buf;
4617 const uint_t textlen = difo->dtdo_len;
4618 const char *strtab = difo->dtdo_strtab;
4619 const uint64_t *inttab = difo->dtdo_inttab;
4620
4621 uint64_t rval = 0;
4622 dtrace_statvar_t *svar;
4623 dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
4624 dtrace_difv_t *v;
4625 volatile uint16_t *flags = &cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags;
4626 volatile uintptr_t *illval = &cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_illval;
4627
4628 dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
4629 uint64_t regs[DIF_DIR_NREGS];
4630 uint64_t *tmp;
4631
4632 uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
4633 int64_t cc_r;
4634 uint_t pc = 0, id, opc VBDTUNASS(0);
4635 uint8_t ttop = 0;
4636 dif_instr_t instr;
4637 uint_t r1, r2, rd;
4638
4639 /*
4640 * We stash the current DIF object into the machine state: we need it
4641 * for subsequent access checking.
4642 */
4643 mstate->dtms_difo = difo;
4644
4645 regs[DIF_REG_R0] = 0; /* %r0 is fixed at zero */
4646
4647 while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
4648 opc = pc;
4649
4650 instr = text[pc++];
4651 r1 = DIF_INSTR_R1(instr);
4652 r2 = DIF_INSTR_R2(instr);
4653 rd = DIF_INSTR_RD(instr);
4654
4655 switch (DIF_INSTR_OP(instr)) {
4656 case DIF_OP_OR:
4657 regs[rd] = regs[r1] | regs[r2];
4658 break;
4659 case DIF_OP_XOR:
4660 regs[rd] = regs[r1] ^ regs[r2];
4661 break;
4662 case DIF_OP_AND:
4663 regs[rd] = regs[r1] & regs[r2];
4664 break;
4665 case DIF_OP_SLL:
4666 regs[rd] = regs[r1] << regs[r2];
4667 break;
4668 case DIF_OP_SRL:
4669 regs[rd] = regs[r1] >> regs[r2];
4670 break;
4671 case DIF_OP_SUB:
4672 regs[rd] = regs[r1] - regs[r2];
4673 break;
4674 case DIF_OP_ADD:
4675 regs[rd] = regs[r1] + regs[r2];
4676 break;
4677 case DIF_OP_MUL:
4678 regs[rd] = regs[r1] * regs[r2];
4679 break;
4680 case DIF_OP_SDIV:
4681 if (regs[r2] == 0) {
4682 regs[rd] = 0;
4683 *flags |= CPU_DTRACE_DIVZERO;
4684 } else {
4685 regs[rd] = (int64_t)regs[r1] /
4686 (int64_t)regs[r2];
4687 }
4688 break;
4689
4690 case DIF_OP_UDIV:
4691 if (regs[r2] == 0) {
4692 regs[rd] = 0;
4693 *flags |= CPU_DTRACE_DIVZERO;
4694 } else {
4695 regs[rd] = regs[r1] / regs[r2];
4696 }
4697 break;
4698
4699 case DIF_OP_SREM:
4700 if (regs[r2] == 0) {
4701 regs[rd] = 0;
4702 *flags |= CPU_DTRACE_DIVZERO;
4703 } else {
4704 regs[rd] = (int64_t)regs[r1] %
4705 (int64_t)regs[r2];
4706 }
4707 break;
4708
4709 case DIF_OP_UREM:
4710 if (regs[r2] == 0) {
4711 regs[rd] = 0;
4712 *flags |= CPU_DTRACE_DIVZERO;
4713 } else {
4714 regs[rd] = regs[r1] % regs[r2];
4715 }
4716 break;
4717
4718 case DIF_OP_NOT:
4719 regs[rd] = ~regs[r1];
4720 break;
4721 case DIF_OP_MOV:
4722 regs[rd] = regs[r1];
4723 break;
4724 case DIF_OP_CMP:
4725 cc_r = regs[r1] - regs[r2];
4726 cc_n = cc_r < 0;
4727 cc_z = cc_r == 0;
4728 cc_v = 0;
4729 cc_c = regs[r1] < regs[r2];
4730 break;
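/*
 * The branches below read these RISC-style condition codes.
 * For instance, after comparing r1 == 5 with r2 == 7:
 * cc_r == -2, so cc_n == 1, cc_z == 0 and cc_c == 1 -- BL
 * (signed) and BLU (unsigned) are both taken, while BGE and
 * BGEU both fall through.
 */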
4731 case DIF_OP_TST:
4732 cc_n = cc_v = cc_c = 0;
4733 cc_z = regs[r1] == 0;
4734 break;
4735 case DIF_OP_BA:
4736 pc = DIF_INSTR_LABEL(instr);
4737 break;
4738 case DIF_OP_BE:
4739 if (cc_z)
4740 pc = DIF_INSTR_LABEL(instr);
4741 break;
4742 case DIF_OP_BNE:
4743 if (cc_z == 0)
4744 pc = DIF_INSTR_LABEL(instr);
4745 break;
4746 case DIF_OP_BG:
4747 if ((cc_z | (cc_n ^ cc_v)) == 0)
4748 pc = DIF_INSTR_LABEL(instr);
4749 break;
4750 case DIF_OP_BGU:
4751 if ((cc_c | cc_z) == 0)
4752 pc = DIF_INSTR_LABEL(instr);
4753 break;
4754 case DIF_OP_BGE:
4755 if ((cc_n ^ cc_v) == 0)
4756 pc = DIF_INSTR_LABEL(instr);
4757 break;
4758 case DIF_OP_BGEU:
4759 if (cc_c == 0)
4760 pc = DIF_INSTR_LABEL(instr);
4761 break;
4762 case DIF_OP_BL:
4763 if (cc_n ^ cc_v)
4764 pc = DIF_INSTR_LABEL(instr);
4765 break;
4766 case DIF_OP_BLU:
4767 if (cc_c)
4768 pc = DIF_INSTR_LABEL(instr);
4769 break;
4770 case DIF_OP_BLE:
4771 if (cc_z | (cc_n ^ cc_v))
4772 pc = DIF_INSTR_LABEL(instr);
4773 break;
4774 case DIF_OP_BLEU:
4775 if (cc_c | cc_z)
4776 pc = DIF_INSTR_LABEL(instr);
4777 break;
4778 case DIF_OP_RLDSB:
4779 if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
4780 *flags |= CPU_DTRACE_KPRIV;
4781 *illval = regs[r1];
4782 break;
4783 }
4784 /*FALLTHROUGH*/
4785 case DIF_OP_LDSB:
4786 regs[rd] = (int8_t)dtrace_load8(regs[r1]);
4787 break;
4788 case DIF_OP_RLDSH:
4789 if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
4790 *flags |= CPU_DTRACE_KPRIV;
4791 *illval = regs[r1];
4792 break;
4793 }
4794 /*FALLTHROUGH*/
4795 case DIF_OP_LDSH:
4796 regs[rd] = (int16_t)dtrace_load16(regs[r1]);
4797 break;
4798 case DIF_OP_RLDSW:
4799 if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
4800 *flags |= CPU_DTRACE_KPRIV;
4801 *illval = regs[r1];
4802 break;
4803 }
4804 /*FALLTHROUGH*/
4805 case DIF_OP_LDSW:
4806 regs[rd] = (int32_t)dtrace_load32(regs[r1]);
4807 break;
4808 case DIF_OP_RLDUB:
4809 if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
4810 *flags |= CPU_DTRACE_KPRIV;
4811 *illval = regs[r1];
4812 break;
4813 }
4814 /*FALLTHROUGH*/
4815 case DIF_OP_LDUB:
4816 regs[rd] = dtrace_load8(regs[r1]);
4817 break;
4818 case DIF_OP_RLDUH:
4819 if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
4820 *flags |= CPU_DTRACE_KPRIV;
4821 *illval = regs[r1];
4822 break;
4823 }
4824 /*FALLTHROUGH*/
4825 case DIF_OP_LDUH:
4826 regs[rd] = dtrace_load16(regs[r1]);
4827 break;
4828 case DIF_OP_RLDUW:
4829 if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
4830 *flags |= CPU_DTRACE_KPRIV;
4831 *illval = regs[r1];
4832 break;
4833 }
4834 /*FALLTHROUGH*/
4835 case DIF_OP_LDUW:
4836 regs[rd] = dtrace_load32(regs[r1]);
4837 break;
4838 case DIF_OP_RLDX:
4839 if (!dtrace_canstore(regs[r1], 8, mstate, vstate)) {
4840 *flags |= CPU_DTRACE_KPRIV;
4841 *illval = regs[r1];
4842 break;
4843 }
4844 /*FALLTHROUGH*/
4845 case DIF_OP_LDX:
4846 regs[rd] = dtrace_load64(regs[r1]);
4847 break;
4848 case DIF_OP_ULDSB:
4849 regs[rd] = (int8_t)
4850 dtrace_fuword8((void *)(uintptr_t)regs[r1]);
4851 break;
4852 case DIF_OP_ULDSH:
4853 regs[rd] = (int16_t)
4854 dtrace_fuword16((void *)(uintptr_t)regs[r1]);
4855 break;
4856 case DIF_OP_ULDSW:
4857 regs[rd] = (int32_t)
4858 dtrace_fuword32((void *)(uintptr_t)regs[r1]);
4859 break;
4860 case DIF_OP_ULDUB:
4861 regs[rd] =
4862 dtrace_fuword8((void *)(uintptr_t)regs[r1]);
4863 break;
4864 case DIF_OP_ULDUH:
4865 regs[rd] =
4866 dtrace_fuword16((void *)(uintptr_t)regs[r1]);
4867 break;
4868 case DIF_OP_ULDUW:
4869 regs[rd] =
4870 dtrace_fuword32((void *)(uintptr_t)regs[r1]);
4871 break;
4872 case DIF_OP_ULDX:
4873 regs[rd] =
4874 dtrace_fuword64((void *)(uintptr_t)regs[r1]);
4875 break;
4876 case DIF_OP_RET:
4877 rval = regs[rd];
4878 pc = textlen;
4879 break;
4880 case DIF_OP_NOP:
4881 break;
4882 case DIF_OP_SETX:
4883 regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
4884 break;
4885 case DIF_OP_SETS:
4886 regs[rd] = (uint64_t)(uintptr_t)
4887 (strtab + DIF_INSTR_STRING(instr));
4888 break;
4889 case DIF_OP_SCMP: {
4890 size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
4891 uintptr_t s1 = regs[r1];
4892 uintptr_t s2 = regs[r2];
4893
4894 if (s1 != NULL &&
4895 !dtrace_strcanload(s1, sz, mstate, vstate))
4896 break;
4897 if (s2 != NULL &&
4898 !dtrace_strcanload(s2, sz, mstate, vstate))
4899 break;
4900
4901 cc_r = dtrace_strncmp((char *)s1, (char *)s2, sz);
4902
4903 cc_n = cc_r < 0;
4904 cc_z = cc_r == 0;
4905 cc_v = cc_c = 0;
4906 break;
4907 }
4908 case DIF_OP_LDGA:
4909 regs[rd] = dtrace_dif_variable(mstate, state,
4910 r1, regs[r2]);
4911 break;
4912 case DIF_OP_LDGS:
4913 id = DIF_INSTR_VAR(instr);
4914
4915 if (id >= DIF_VAR_OTHER_UBASE) {
4916 uintptr_t a;
4917
4918 id -= DIF_VAR_OTHER_UBASE;
4919 svar = vstate->dtvs_globals[id];
4920 ASSERT(svar != NULL);
4921 v = &svar->dtsv_var;
4922
4923 if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
4924 regs[rd] = svar->dtsv_data;
4925 break;
4926 }
4927
4928 a = (uintptr_t)svar->dtsv_data;
4929
4930 if (*(uint8_t *)a == UINT8_MAX) {
4931 /*
4932 * If the 0th byte is set to UINT8_MAX
4933 * then this is to be treated as a
4934 * reference to a NULL variable.
4935 */
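/*
 * That is, a by-ref global's backing store is one leading
 * flag byte (padded out to eight bytes) followed by the data
 * itself:
 *
 *	+------+-----------------+
 *	| flag |      data       |
 *	+------+-----------------+
 *	0      8
 *
 * DIF_OP_STGS below writes the flag; here we only read it.
 */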
4936 regs[rd] = NULL;
4937 } else {
4938 regs[rd] = a + sizeof (uint64_t);
4939 }
4940
4941 break;
4942 }
4943
4944 regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
4945 break;
4946
4947 case DIF_OP_STGS:
4948 id = DIF_INSTR_VAR(instr);
4949
4950 ASSERT(id >= DIF_VAR_OTHER_UBASE);
4951 id -= DIF_VAR_OTHER_UBASE;
4952
4953 svar = vstate->dtvs_globals[id];
4954 ASSERT(svar != NULL);
4955 v = &svar->dtsv_var;
4956
4957 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
4958 uintptr_t a = (uintptr_t)svar->dtsv_data;
4959
4960 ASSERT(a != NULL);
4961 ASSERT(svar->dtsv_size != 0);
4962
4963 if (regs[rd] == NULL) {
4964 *(uint8_t *)a = UINT8_MAX;
4965 break;
4966 } else {
4967 *(uint8_t *)a = 0;
4968 a += sizeof (uint64_t);
4969 }
4970 if (!dtrace_vcanload(
4971 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
4972 mstate, vstate))
4973 break;
4974
4975 dtrace_vcopy((void *)(uintptr_t)regs[rd],
4976 (void *)a, &v->dtdv_type);
4977 break;
4978 }
4979
4980 svar->dtsv_data = regs[rd];
4981 break;
4982
4983 case DIF_OP_LDTA:
4984 /*
4985 * There are no DTrace built-in thread-local arrays at
4986 * present. This opcode is saved for future work.
4987 */
4988 *flags |= CPU_DTRACE_ILLOP;
4989 regs[rd] = 0;
4990 break;
4991
4992 case DIF_OP_LDLS:
4993 id = DIF_INSTR_VAR(instr);
4994
4995 if (id < DIF_VAR_OTHER_UBASE) {
4996 /*
4997 * For now, this has no meaning.
4998 */
4999 regs[rd] = 0;
5000 break;
5001 }
5002
5003 id -= DIF_VAR_OTHER_UBASE;
5004
5005 ASSERT(VBDTCAST(int64_t)id < vstate->dtvs_nlocals);
5006 ASSERT(vstate->dtvs_locals != NULL);
5007
5008 svar = vstate->dtvs_locals[id];
5009 ASSERT(svar != NULL);
5010 v = &svar->dtsv_var;
5011
5012 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5013 uintptr_t a = (uintptr_t)svar->dtsv_data;
5014 size_t sz = v->dtdv_type.dtdt_size;
5015
5016 sz += sizeof (uint64_t);
5017 ASSERT(svar->dtsv_size == NCPU * sz);
5018 a += VBDT_GET_CPUID() * sz;
5019
5020 if (*(uint8_t *)a == UINT8_MAX) {
5021 /*
5022 * If the 0th byte is set to UINT8_MAX
5023 * then this is to be treated as a
5024 * reference to a NULL variable.
5025 */
5026 regs[rd] = NULL;
5027 } else {
5028 regs[rd] = a + sizeof (uint64_t);
5029 }
5030
5031 break;
5032 }
5033
5034 ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
5035 tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
5036 regs[rd] = tmp[VBDT_GET_CPUID()];
5037 break;
5038
5039 case DIF_OP_STLS:
5040 id = DIF_INSTR_VAR(instr);
5041
5042 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5043 id -= DIF_VAR_OTHER_UBASE;
5044 ASSERT(VBDTCAST(int64_t)id < vstate->dtvs_nlocals);
5045
5046 ASSERT(vstate->dtvs_locals != NULL);
5047 svar = vstate->dtvs_locals[id];
5048 ASSERT(svar != NULL);
5049 v = &svar->dtsv_var;
5050
5051 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5052 uintptr_t a = (uintptr_t)svar->dtsv_data;
5053 size_t sz = v->dtdv_type.dtdt_size;
5054
5055 sz += sizeof (uint64_t);
5056 ASSERT(svar->dtsv_size == NCPU * sz);
5057 a += VBDT_GET_CPUID() * sz;
5058
5059 if (regs[rd] == NULL) {
5060 *(uint8_t *)a = UINT8_MAX;
5061 break;
5062 } else {
5063 *(uint8_t *)a = 0;
5064 a += sizeof (uint64_t);
5065 }
5066
5067 if (!dtrace_vcanload(
5068 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5069 mstate, vstate))
5070 break;
5071
5072 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5073 (void *)a, &v->dtdv_type);
5074 break;
5075 }
5076
5077 ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
5078 tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
5079 tmp[VBDT_GET_CPUID()] = regs[rd];
5080 break;
5081
5082 case DIF_OP_LDTS: {
5083 dtrace_dynvar_t *dvar;
5084 dtrace_key_t *key;
5085
5086 id = DIF_INSTR_VAR(instr);
5087 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5088 id -= DIF_VAR_OTHER_UBASE;
5089 v = &vstate->dtvs_tlocals[id];
5090
5091 key = &tupregs[DIF_DTR_NREGS];
5092 key[0].dttk_value = (uint64_t)id;
5093 key[0].dttk_size = 0;
5094 DTRACE_TLS_THRKEY(key[1].dttk_value);
5095 key[1].dttk_size = 0;
5096
5097 dvar = dtrace_dynvar(dstate, 2, key,
5098 sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
5099 mstate, vstate);
5100
5101 if (dvar == NULL) {
5102 regs[rd] = 0;
5103 break;
5104 }
5105
5106 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5107 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
5108 } else {
5109 regs[rd] = *((uint64_t *)dvar->dtdv_data);
5110 }
5111
5112 break;
5113 }
5114
5115 case DIF_OP_STTS: {
5116 dtrace_dynvar_t *dvar;
5117 dtrace_key_t *key;
5118
5119 id = DIF_INSTR_VAR(instr);
5120 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5121 id -= DIF_VAR_OTHER_UBASE;
5122
5123 key = &tupregs[DIF_DTR_NREGS];
5124 key[0].dttk_value = (uint64_t)id;
5125 key[0].dttk_size = 0;
5126 DTRACE_TLS_THRKEY(key[1].dttk_value);
5127 key[1].dttk_size = 0;
5128 v = &vstate->dtvs_tlocals[id];
5129
5130 dvar = dtrace_dynvar(dstate, 2, key,
5131 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5132 v->dtdv_type.dtdt_size : sizeof (uint64_t),
5133 regs[rd] ? DTRACE_DYNVAR_ALLOC :
5134 DTRACE_DYNVAR_DEALLOC, mstate, vstate);
5135
5136 /*
5137 * Given that we're storing to thread-local data,
5138 * we need to flush our predicate cache.
5139 */
5140 curthread->t_predcache = NULL;
5141
5142 if (dvar == NULL)
5143 break;
5144
5145 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5146 if (!dtrace_vcanload(
5147 (void *)(uintptr_t)regs[rd],
5148 &v->dtdv_type, mstate, vstate))
5149 break;
5150
5151 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5152 dvar->dtdv_data, &v->dtdv_type);
5153 } else {
5154 *((uint64_t *)dvar->dtdv_data) = regs[rd];
5155 }
5156
5157 break;
5158 }
5159
5160 case DIF_OP_SRA:
5161 regs[rd] = (int64_t)regs[r1] >> regs[r2];
5162 break;
5163
5164 case DIF_OP_CALL:
5165 dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
5166 regs, tupregs, ttop, mstate, state);
5167 break;
5168
5169 case DIF_OP_PUSHTR:
5170 if (ttop == DIF_DTR_NREGS) {
5171 *flags |= CPU_DTRACE_TUPOFLOW;
5172 break;
5173 }
5174
5175 if (r1 == DIF_TYPE_STRING) {
5176 /*
5177 * If this is a string type and the size is 0,
5178 * we'll use the system-wide default string
5179 * size. Note that we are _not_ looking at
5180 * the value of the DTRACEOPT_STRSIZE option;
5181 * had this been set, we would expect to have
5182 * a non-zero size value in the "pushtr".
5183 */
5184 tupregs[ttop].dttk_size =
5185 dtrace_strlen((char *)(uintptr_t)regs[rd],
5186 regs[r2] ? regs[r2] :
5187 dtrace_strsize_default) + 1;
5188 } else {
5189 tupregs[ttop].dttk_size = regs[r2];
5190 }
5191
5192 tupregs[ttop++].dttk_value = regs[rd];
5193 break;
5194
5195 case DIF_OP_PUSHTV:
5196 if (ttop == DIF_DTR_NREGS) {
5197 *flags |= CPU_DTRACE_TUPOFLOW;
5198 break;
5199 }
5200
5201 tupregs[ttop].dttk_value = regs[rd];
5202 tupregs[ttop++].dttk_size = 0;
5203 break;
5204
5205 case DIF_OP_POPTS:
5206 if (ttop != 0)
5207 ttop--;
5208 break;
5209
5210 case DIF_OP_FLUSHTS:
5211 ttop = 0;
5212 break;
5213
5214 case DIF_OP_LDGAA:
5215 case DIF_OP_LDTAA: {
5216 dtrace_dynvar_t *dvar;
5217 dtrace_key_t *key = tupregs;
5218 uint_t nkeys = ttop;
5219
5220 id = DIF_INSTR_VAR(instr);
5221 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5222 id -= DIF_VAR_OTHER_UBASE;
5223
5224 key[nkeys].dttk_value = (uint64_t)id;
5225 key[nkeys++].dttk_size = 0;
5226
5227 if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
5228 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
5229 key[nkeys++].dttk_size = 0;
5230 v = &vstate->dtvs_tlocals[id];
5231 } else {
5232 v = &vstate->dtvs_globals[id]->dtsv_var;
5233 }
5234
5235 dvar = dtrace_dynvar(dstate, nkeys, key,
5236 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5237 v->dtdv_type.dtdt_size : sizeof (uint64_t),
5238 DTRACE_DYNVAR_NOALLOC, mstate, vstate);
5239
5240 if (dvar == NULL) {
5241 regs[rd] = 0;
5242 break;
5243 }
5244
5245 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5246 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
5247 } else {
5248 regs[rd] = *((uint64_t *)dvar->dtdv_data);
5249 }
5250
5251 break;
5252 }
5253
5254 case DIF_OP_STGAA:
5255 case DIF_OP_STTAA: {
5256 dtrace_dynvar_t *dvar;
5257 dtrace_key_t *key = tupregs;
5258 uint_t nkeys = ttop;
5259
5260 id = DIF_INSTR_VAR(instr);
5261 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5262 id -= DIF_VAR_OTHER_UBASE;
5263
5264 key[nkeys].dttk_value = (uint64_t)id;
5265 key[nkeys++].dttk_size = 0;
5266
5267 if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
5268 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
5269 key[nkeys++].dttk_size = 0;
5270 v = &vstate->dtvs_tlocals[id];
5271 } else {
5272 v = &vstate->dtvs_globals[id]->dtsv_var;
5273 }
5274
5275 dvar = dtrace_dynvar(dstate, nkeys, key,
5276 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5277 v->dtdv_type.dtdt_size : sizeof (uint64_t),
5278 regs[rd] ? DTRACE_DYNVAR_ALLOC :
5279 DTRACE_DYNVAR_DEALLOC, mstate, vstate);
5280
5281 if (dvar == NULL)
5282 break;
5283
5284 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5285 if (!dtrace_vcanload(
5286 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5287 mstate, vstate))
5288 break;
5289
5290 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5291 dvar->dtdv_data, &v->dtdv_type);
5292 } else {
5293 *((uint64_t *)dvar->dtdv_data) = regs[rd];
5294 }
5295
5296 break;
5297 }
5298
5299 case DIF_OP_ALLOCS: {
5300 uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
5301 size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
5302
5303 /*
5304 * Rounding up the user allocation size could have
5305 * overflowed large, bogus allocations (like -1ULL) to
5306 * 0.
5307 */
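/*
 * Concretely: with regs[r1] == -1ULL and, say, four bytes of
 * alignment slop, 'size' wraps around to 3, so it is the
 * 'size < regs[r1]' test that catches the bogus request
 * before DTRACE_INSCRATCH is even consulted.
 */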
5308 if (size < regs[r1] ||
5309 !DTRACE_INSCRATCH(mstate, size)) {
5310 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5311 regs[rd] = NULL;
5312 break;
5313 }
5314
5315 dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
5316 mstate->dtms_scratch_ptr += size;
5317 regs[rd] = ptr;
5318 break;
5319 }
5320
5321 case DIF_OP_COPYS:
5322 if (!dtrace_canstore(regs[rd], regs[r2],
5323 mstate, vstate)) {
5324 *flags |= CPU_DTRACE_BADADDR;
5325 *illval = regs[rd];
5326 break;
5327 }
5328
5329 if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
5330 break;
5331
5332 dtrace_bcopy((void *)(uintptr_t)regs[r1],
5333 (void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
5334 break;
5335
5336 case DIF_OP_STB:
5337 if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
5338 *flags |= CPU_DTRACE_BADADDR;
5339 *illval = regs[rd];
5340 break;
5341 }
5342 *((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
5343 break;
5344
5345 case DIF_OP_STH:
5346 if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
5347 *flags |= CPU_DTRACE_BADADDR;
5348 *illval = regs[rd];
5349 break;
5350 }
5351 if (regs[rd] & 1) {
5352 *flags |= CPU_DTRACE_BADALIGN;
5353 *illval = regs[rd];
5354 break;
5355 }
5356 *((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
5357 break;
5358
5359 case DIF_OP_STW:
5360 if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
5361 *flags |= CPU_DTRACE_BADADDR;
5362 *illval = regs[rd];
5363 break;
5364 }
5365 if (regs[rd] & 3) {
5366 *flags |= CPU_DTRACE_BADALIGN;
5367 *illval = regs[rd];
5368 break;
5369 }
5370 *((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
5371 break;
5372
5373 case DIF_OP_STX:
5374 if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
5375 *flags |= CPU_DTRACE_BADADDR;
5376 *illval = regs[rd];
5377 break;
5378 }
5379 if (regs[rd] & 7) {
5380 *flags |= CPU_DTRACE_BADALIGN;
5381 *illval = regs[rd];
5382 break;
5383 }
5384 *((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
5385 break;
5386 }
5387 }
5388
5389 if (!(*flags & CPU_DTRACE_FAULT))
5390 return (rval);
5391
5392 mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
5393 mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
5394
5395 return (0);
5396}
5397
5398#ifndef VBOX /* no destructive stuff */
5399
5400static void
5401dtrace_action_breakpoint(dtrace_ecb_t *ecb)
5402{
5403 dtrace_probe_t *probe = ecb->dte_probe;
5404 dtrace_provider_t *prov = probe->dtpr_provider;
5405 char c[DTRACE_FULLNAMELEN + 80], *str;
5406 char *msg = "dtrace: breakpoint action at probe ";
5407 char *ecbmsg = " (ecb ";
5408 uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
5409 uintptr_t val = (uintptr_t)ecb;
5410 int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
5411
5412 if (dtrace_destructive_disallow)
5413 return;
5414
5415 /*
5416 * It's impossible to be taking action on the NULL probe.
5417 */
5418 ASSERT(probe != NULL);
5419
5420 /*
5421 * This is a poor man's (destitute man's?) sprintf(): we want to
5422 * print the provider name, module name, function name and name of
5423 * the probe, along with the hex address of the ECB with the breakpoint
5424 * action -- all of which we must place in the character buffer by
5425 * hand.
5426 */
5427 while (*msg != '\0')
5428 c[i++] = *msg++;
5429
5430 for (str = prov->dtpv_name; *str != '\0'; str++)
5431 c[i++] = *str;
5432 c[i++] = ':';
5433
5434 for (str = probe->dtpr_mod; *str != '\0'; str++)
5435 c[i++] = *str;
5436 c[i++] = ':';
5437
5438 for (str = probe->dtpr_func; *str != '\0'; str++)
5439 c[i++] = *str;
5440 c[i++] = ':';
5441
5442 for (str = probe->dtpr_name; *str != '\0'; str++)
5443 c[i++] = *str;
5444
5445 while (*ecbmsg != '\0')
5446 c[i++] = *ecbmsg++;
5447
5448 while (shift >= 0) {
5449 mask = (uintptr_t)0xf << shift;
5450
5451 if (val >= ((uintptr_t)1 << shift))
5452 c[i++] = "0123456789abcdef"[(val & mask) >> shift];
5453 shift -= 4;
5454 }
5455
5456 c[i++] = ')';
5457 c[i] = '\0';
5458
5459 debug_enter(c);
5460}
5461
5462static void
5463dtrace_action_panic(dtrace_ecb_t *ecb)
5464{
5465 dtrace_probe_t *probe = ecb->dte_probe;
5466
5467 /*
5468 * It's impossible to be taking action on the NULL probe.
5469 */
5470 ASSERT(probe != NULL);
5471
5472 if (dtrace_destructive_disallow)
5473 return;
5474
5475 if (dtrace_panicked != NULL)
5476 return;
5477
5478 if (dtrace_casptr(&dtrace_panicked, NULL, curthread) != NULL)
5479 return;
5480
5481 /*
5482 * We won the right to panic. (We want to be sure that only one
5483 * thread calls panic() from dtrace_probe(), and that panic() is
5484 * called exactly once.)
5485 */
5486 dtrace_panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
5487 probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
5488 probe->dtpr_func, probe->dtpr_name, (void *)ecb);
5489}
5490
5491static void
5492dtrace_action_raise(uint64_t sig)
5493{
5494 if (dtrace_destructive_disallow)
5495 return;
5496
5497 if (sig >= NSIG) {
5498 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5499 return;
5500 }
5501
5502 /*
5503 * raise() has a queue depth of 1 -- we ignore all subsequent
5504 * invocations of the raise() action.
5505 */
5506 if (curthread->t_dtrace_sig == 0)
5507 curthread->t_dtrace_sig = (uint8_t)sig;
5508
5509 curthread->t_sig_check = 1;
5510 aston(curthread);
5511}
5512
5513static void
5514dtrace_action_stop(void)
5515{
5516 if (dtrace_destructive_disallow)
5517 return;
5518
5519 if (!curthread->t_dtrace_stop) {
5520 curthread->t_dtrace_stop = 1;
5521 curthread->t_sig_check = 1;
5522 aston(curthread);
5523 }
5524}
5525
5526static void
5527dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
5528{
5529 hrtime_t now;
5530 volatile uint16_t *flags;
5531 cpu_t *cpu = CPU;
5532
5533 if (dtrace_destructive_disallow)
5534 return;
5535
5536 flags = (volatile uint16_t *)&cpu_core[cpu->cpu_id].cpuc_dtrace_flags;
5537
5538 now = dtrace_gethrtime();
5539
5540 if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
5541 /*
5542 * We need to advance the mark to the current time.
5543 */
5544 cpu->cpu_dtrace_chillmark = now;
5545 cpu->cpu_dtrace_chilled = 0;
5546 }
5547
5548 /*
5549 * Now check to see if the requested chill time would take us over
5550 * the maximum amount of time allowed in the chill interval. (Or
5551 * worse, if the calculation itself induces overflow.)
5552 */
5553 if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
5554 cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
5555 *flags |= CPU_DTRACE_ILLOP;
5556 return;
5557 }
5558
5559 while (dtrace_gethrtime() - now < val)
5560 continue;
5561
5562 /*
5563 * Normally, we assure that the value of the variable "timestamp" does
5564 * not change within an ECB. The presence of chill() represents an
5565 * exception to this rule, however.
5566 */
5567 mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
5568 cpu->cpu_dtrace_chilled += val;
5569}
5570
5571#endif /* !VBOX */
5572
5573static void
5574dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
5575 uint64_t *buf, uint64_t arg)
5576{
5577 int nframes = DTRACE_USTACK_NFRAMES(arg);
5578 int strsize = DTRACE_USTACK_STRSIZE(arg);
5579 uint64_t *pcs = &buf[1], *fps;
5580 char *str = (char *)&pcs[nframes];
5581 int size, offs = 0, i, j;
5582 uintptr_t old = mstate->dtms_scratch_ptr, saved;
5583#ifndef VBOX
5584 uint16_t *flags = &cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags;
5585#else
5586 uint16_t volatile *flags = &cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags;
5587#endif
5588 char *sym;
5589
5590 /*
5591 * Should be taking a faster path if string space has not been
5592 * allocated.
5593 */
5594 ASSERT(strsize != 0);
5595
5596 /*
5597 * We will first allocate some temporary space for the frame pointers.
5598 */
5599 fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
5600 size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
5601 (nframes * sizeof (uint64_t));
5602
5603 if (!DTRACE_INSCRATCH(mstate, VBDTCAST(unsigned)size)) {
5604 /*
5605 * Not enough room for our frame pointers -- need to indicate
5606 * that we ran out of scratch space.
5607 */
5608 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5609 return;
5610 }
5611
5612 mstate->dtms_scratch_ptr += size;
5613 saved = mstate->dtms_scratch_ptr;
5614
5615 /*
5616 * Now get a stack with both program counters and frame pointers.
5617 */
5618 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5619 dtrace_getufpstack(buf, fps, nframes + 1);
5620 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5621
5622 /*
5623 * If that faulted, we're cooked.
5624 */
5625 if (*flags & CPU_DTRACE_FAULT)
5626 goto out;
5627
5628 /*
5629 * Now we want to walk up the stack, calling the USTACK helper. For
5630 * each iteration, we restore the scratch pointer.
5631 */
5632 for (i = 0; i < nframes; i++) {
5633 mstate->dtms_scratch_ptr = saved;
5634
5635 if (offs >= strsize)
5636 break;
5637
5638 sym = (char *)(uintptr_t)dtrace_helper(
5639 DTRACE_HELPER_ACTION_USTACK,
5640 mstate, state, pcs[i], fps[i]);
5641
5642 /*
5643 * If we faulted while running the helper, we're going to
5644 * clear the fault and null out the corresponding string.
5645 */
5646 if (*flags & CPU_DTRACE_FAULT) {
5647 *flags &= ~CPU_DTRACE_FAULT;
5648 str[offs++] = '\0';
5649 continue;
5650 }
5651
5652 if (sym == NULL) {
5653 str[offs++] = '\0';
5654 continue;
5655 }
5656
5657 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5658
5659 /*
5660 * Now copy in the string that the helper returned to us.
5661 */
5662 for (j = 0; offs + j < strsize; j++) {
5663 if ((str[offs + j] = sym[j]) == '\0')
5664 break;
5665 }
5666
5667 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5668
5669 offs += j + 1;
5670 }
5671
5672 if (offs >= strsize) {
5673 /*
5674 * If we didn't have room for all of the strings, we don't
5675 * abort processing -- this needn't be a fatal error -- but we
5676 * still want to increment a counter (dts_stkstroverflows) so
5677 * that the condition can be warned about. (If this is from
5678 * a jstack() action, it is easily tuned via jstackstrsize.)
5679 */
5680 dtrace_error(&state->dts_stkstroverflows);
5681 }
5682
5683 while (offs < strsize)
5684 str[offs++] = '\0';
5685
5686out:
5687 mstate->dtms_scratch_ptr = old;
5688}
5689
5690/*
5691 * If you're looking for the epicenter of DTrace, you just found it. This
5692 * is the function called by the provider to fire a probe -- from which all
5693 * subsequent probe-context DTrace activity emanates.
5694 */
5695void
5696dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1,
5697 uintptr_t arg2, uintptr_t arg3, uintptr_t arg4)
5698{
5699 processorid_t cpuid;
5700 dtrace_icookie_t cookie;
5701 dtrace_probe_t *probe;
5702 dtrace_mstate_t mstate;
5703 dtrace_ecb_t *ecb;
5704 dtrace_action_t *act;
5705 intptr_t offs;
5706 size_t size;
5707 int vtime, onintr;
5708 volatile uint16_t *flags;
5709 hrtime_t now;
5710
5711#ifndef VBOX
5712 /*
5713 * Kick out immediately if this CPU is still being born (in which case
5714 * curthread will be set to -1) or the current thread can't allow
5715 * probes in its current context.
5716 */
5717 if (((uintptr_t)curthread & 1) || (curthread->t_flag & T_DONTDTRACE))
5718 return;
5719#endif
5720
5721 cookie = dtrace_interrupt_disable();
5722 probe = dtrace_probes[id - 1];
5723 cpuid = VBDT_GET_CPUID();
5724 onintr = CPU_ON_INTR(CPU);
5725
5726 if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
5727 probe->dtpr_predcache == curthread->t_predcache) {
5728 /*
5729 * We have hit in the predicate cache; we know that
5730 * this predicate would evaluate to be false.
5731 */
5732 dtrace_interrupt_enable(cookie);
5733 return;
5734 }
5735
5736#ifndef VBOX
5737 if (panic_quiesce) {
5738 /*
5739 * We don't trace anything if we're panicking.
5740 */
5741 dtrace_interrupt_enable(cookie);
5742 return;
5743 }
5744#endif
5745
5746 now = dtrace_gethrtime();
5747 vtime = dtrace_vtime_references != 0;
5748
5749 if (vtime && curthread->t_dtrace_start)
5750 curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;
5751
5752 mstate.dtms_difo = NULL;
5753 mstate.dtms_probe = probe;
5754 mstate.dtms_strtok = NULL;
5755 mstate.dtms_arg[0] = arg0;
5756 mstate.dtms_arg[1] = arg1;
5757 mstate.dtms_arg[2] = arg2;
5758 mstate.dtms_arg[3] = arg3;
5759 mstate.dtms_arg[4] = arg4;
5760
5761 flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;
5762
5763 for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
5764 dtrace_predicate_t *pred = ecb->dte_predicate;
5765 dtrace_state_t *state = ecb->dte_state;
5766 dtrace_buffer_t *buf = &state->dts_buffer[cpuid];
5767 dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
5768 dtrace_vstate_t *vstate = &state->dts_vstate;
5769 dtrace_provider_t *prov = probe->dtpr_provider;
5770 int committed = 0;
5771 caddr_t tomax;
5772
5773 /*
5774 * A little subtlety with the following (seemingly innocuous)
5775 * declaration of the automatic 'val': by looking at the
5776 * code, you might think that it could be declared in the
5777 * action processing loop, below. (That is, it's only used in
5778 * the action processing loop.) However, it must be declared
5779 * out of that scope because in the case of DIF expression
5780 * arguments to aggregating actions, one iteration of the
5781 * action loop will use the last iteration's value.
5782 */
5783#ifdef lint
5784 uint64_t val = 0;
5785#else
5786 uint64_t val VBDTUNASS(0);
5787#endif
5788
5789 mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
5790 *flags &= ~CPU_DTRACE_ERROR;
5791
5792 if (prov == dtrace_provider) {
5793 /*
5794 * If dtrace itself is the provider of this probe,
5795 * we're only going to continue processing the ECB if
5796 * arg0 (the dtrace_state_t) is equal to the ECB's
5797 * creating state. (This prevents disjoint consumers
5798 * from seeing one another's metaprobes.)
5799 */
5800 if (arg0 != (uint64_t)(uintptr_t)state)
5801 continue;
5802 }
5803
5804 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {
5805 /*
5806 * We're not currently active. If our provider isn't
5807 * the dtrace pseudo provider, we're not interested.
5808 */
5809 if (prov != dtrace_provider)
5810 continue;
5811
5812 /*
5813 * Now we must further check if we are in the BEGIN
5814 * probe. If we are, we will only continue processing
5815 * if we're still in WARMUP -- if one BEGIN enabling
5816 * has invoked the exit() action, we don't want to
5817 * evaluate subsequent BEGIN enablings.
5818 */
5819 if (probe->dtpr_id == dtrace_probeid_begin &&
5820 state->dts_activity != DTRACE_ACTIVITY_WARMUP) {
5821 ASSERT(state->dts_activity ==
5822 DTRACE_ACTIVITY_DRAINING);
5823 continue;
5824 }
5825 }
5826
5827 if (ecb->dte_cond) {
5828 /*
5829 * If the dte_cond bits indicate that this
5830 * consumer is only allowed to see user-mode firings
5831 * of this probe, call the provider's dtps_usermode()
5832 * entry point to check that the probe was fired
5833 * while in a user context. Skip this ECB if that's
5834 * not the case.
5835 */
5836 if ((ecb->dte_cond & DTRACE_COND_USERMODE) &&
5837 prov->dtpv_pops.dtps_usermode(prov->dtpv_arg,
5838 probe->dtpr_id, probe->dtpr_arg) == 0)
5839 continue;
5840
5841 /*
5842 * This is more subtle than it looks. We have to be
5843 * absolutely certain that CRED() isn't going to
5844 * change out from under us so it's only legit to
5845 * examine that structure if we're in constrained
5846 * situations. Currently, the only times we'll make this
5847 * check are when a non-super-user has enabled the
5848 * profile or syscall providers -- providers that
5849 * allow visibility of all processes. For the
5850 * profile case, the check above will ensure that
5851 * we're examining a user context.
5852 */
5853 if (ecb->dte_cond & DTRACE_COND_OWNER) {
5854 cred_t *cr;
5855 cred_t *s_cr =
5856 ecb->dte_state->dts_cred.dcr_cred;
5857 proc_t *proc;
5858
5859 ASSERT(s_cr != NULL);
5860
5861 if ((cr = CRED()) == NULL ||
5862 s_cr->cr_uid != cr->cr_uid ||
5863 s_cr->cr_uid != cr->cr_ruid ||
5864 s_cr->cr_uid != cr->cr_suid ||
5865 s_cr->cr_gid != cr->cr_gid ||
5866 s_cr->cr_gid != cr->cr_rgid ||
5867 s_cr->cr_gid != cr->cr_sgid ||
5868 (proc = VBDT_GET_PROC()) == NULL ||
5869 (proc->p_flag & SNOCD))
5870 continue;
5871 }
5872
5873#ifndef VBOX
5874 if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
5875 cred_t *cr;
5876 cred_t *s_cr =
5877 ecb->dte_state->dts_cred.dcr_cred;
5878
5879 ASSERT(s_cr != NULL);
5880
5881 if ((cr = CRED()) == NULL ||
5882 s_cr->cr_zone->zone_id !=
5883 cr->cr_zone->zone_id)
5884 continue;
5885 }
5886#endif
5887 }
5888
5889 if (now - state->dts_alive > dtrace_deadman_timeout) {
5890 /*
5891 * We seem to be dead. Unless we (a) have kernel
5892 * destructive permissions, (b) have explicitly enabled
5893 * destructive actions, and (c) destructive actions have
5894 * not been disabled, we're going to transition into
5895 * the KILLED state, from which no further processing
5896 * on this state will be performed.
5897 */
5898 if (!dtrace_priv_kernel_destructive(state) ||
5899 !state->dts_cred.dcr_destructive ||
5900 dtrace_destructive_disallow) {
5901 void *activity = &state->dts_activity;
5902 dtrace_activity_t current;
5903
5904 do {
5905 current = state->dts_activity;
5906 } while (dtrace_cas32(activity, current,
5907 DTRACE_ACTIVITY_KILLED) != current);
5908
5909 continue;
5910 }
5911 }
5912
5913 if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
5914 ecb->dte_alignment, state, &mstate)) < 0)
5915 continue;
5916
5917 tomax = buf->dtb_tomax;
5918 ASSERT(tomax != NULL);
5919
5920 if (ecb->dte_size != 0)
5921 DTRACE_STORE(uint32_t, tomax, offs, ecb->dte_epid);
5922
5923 mstate.dtms_epid = ecb->dte_epid;
5924 mstate.dtms_present |= DTRACE_MSTATE_EPID;
5925
5926 if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
5927 mstate.dtms_access = DTRACE_ACCESS_KERNEL;
5928 else
5929 mstate.dtms_access = 0;
5930
5931 if (pred != NULL) {
5932 dtrace_difo_t *dp = pred->dtp_difo;
5933 int rval;
5934
5935 rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
5936
5937 if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
5938 dtrace_cacheid_t cid = probe->dtpr_predcache;
5939
5940 if (cid != DTRACE_CACHEIDNONE && !onintr) {
5941 /*
5942 * Update the predicate cache...
5943 */
5944 ASSERT(cid == pred->dtp_cacheid);
5945 curthread->t_predcache = cid;
5946 }
5947
5948 continue;
5949 }
5950 }
5951
5952 for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&
5953 act != NULL; act = act->dta_next) {
5954 size_t valoffs;
5955 dtrace_difo_t *dp;
5956 dtrace_recdesc_t *rec = &act->dta_rec;
5957
5958 size = rec->dtrd_size;
5959 valoffs = offs + rec->dtrd_offset;
5960
5961 if (DTRACEACT_ISAGG(act->dta_kind)) {
5962 uint64_t v = 0xbad;
5963 dtrace_aggregation_t *agg;
5964
5965 agg = (dtrace_aggregation_t *)act;
5966
5967 if ((dp = act->dta_difo) != NULL)
5968 v = dtrace_dif_emulate(dp,
5969 &mstate, vstate, state);
5970
5971 if (*flags & CPU_DTRACE_ERROR)
5972 continue;
5973
5974 /*
5975 * Note that we always pass the expression
5976 * value from the previous iteration of the
5977 * action loop. This value will only be used
5978 * if there is an expression argument to the
5979 * aggregating action, denoted by the
5980 * dtag_hasarg field.
5981 */
5982 dtrace_aggregate(agg, buf,
5983 offs, aggbuf, v, val);
5984 continue;
5985 }
5986
5987 switch (act->dta_kind) {
5988 case DTRACEACT_STOP:
5989#ifndef VBOX
5990 if (dtrace_priv_proc_destructive(state))
5991 dtrace_action_stop();
5992#else
5993 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5994#endif
5995 continue;
5996
5997 case DTRACEACT_BREAKPOINT:
5998#ifndef VBOX
5999 if (dtrace_priv_kernel_destructive(state))
6000 dtrace_action_breakpoint(ecb);
6001#else
6002 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6003#endif
6004 continue;
6005
6006 case DTRACEACT_PANIC:
6007#ifndef VBOX
6008 if (dtrace_priv_kernel_destructive(state))
6009 dtrace_action_panic(ecb);
6010#endif
6011 continue;
6012
6013 case DTRACEACT_STACK:
6014 if (!dtrace_priv_kernel(state))
6015 continue;
6016
6017 dtrace_getpcstack((pc_t *)(tomax + valoffs),
6018 VBDTCAST(int)(size / sizeof (pc_t)), probe->dtpr_aframes,
6019 DTRACE_ANCHORED(probe) ? NULL :
6020 (uint32_t *)arg0);
6021
6022 continue;
6023
6024 case DTRACEACT_JSTACK:
6025 case DTRACEACT_USTACK:
6026 if (!dtrace_priv_proc(state))
6027 continue;
6028
6029 /*
6030 * See comment in DIF_VAR_PID.
6031 */
6032 if (DTRACE_ANCHORED(mstate.dtms_probe) &&
6033 CPU_ON_INTR(CPU)) {
6034 int depth = DTRACE_USTACK_NFRAMES(
6035 rec->dtrd_arg) + 1;
6036
6037 dtrace_bzero((void *)(tomax + valoffs),
6038 DTRACE_USTACK_STRSIZE(rec->dtrd_arg)
6039 + depth * sizeof (uint64_t));
6040
6041 continue;
6042 }
6043
6044 if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 &&
6045 curproc->p_dtrace_helpers != NULL) {
6046 /*
6047 * This is the slow path -- we have
6048 * allocated string space, and we're
6049 * getting the stack of a process that
6050 * has helpers. Call into a separate
6051 * routine to perform this processing.
6052 */
6053 dtrace_action_ustack(&mstate, state,
6054 (uint64_t *)(tomax + valoffs),
6055 rec->dtrd_arg);
6056 continue;
6057 }
6058
6059 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6060 dtrace_getupcstack((uint64_t *)
6061 (tomax + valoffs),
6062 DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1);
6063 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6064 continue;
6065
6066 default:
6067 break;
6068 }
6069
6070 dp = act->dta_difo;
6071 ASSERT(dp != NULL);
6072
6073 val = dtrace_dif_emulate(dp, &mstate, vstate, state);
6074
6075 if (*flags & CPU_DTRACE_ERROR)
6076 continue;
6077
6078 switch (act->dta_kind) {
6079 case DTRACEACT_SPECULATE:
6080 ASSERT(buf == &state->dts_buffer[cpuid]);
6081 buf = dtrace_speculation_buffer(state,
6082 cpuid, val);
6083
6084 if (buf == NULL) {
6085 *flags |= CPU_DTRACE_DROP;
6086 continue;
6087 }
6088
6089 offs = dtrace_buffer_reserve(buf,
6090 ecb->dte_needed, ecb->dte_alignment,
6091 state, NULL);
6092
6093 if (offs < 0) {
6094 *flags |= CPU_DTRACE_DROP;
6095 continue;
6096 }
6097
6098 tomax = buf->dtb_tomax;
6099 ASSERT(tomax != NULL);
6100
6101 if (ecb->dte_size != 0)
6102 DTRACE_STORE(uint32_t, tomax, offs,
6103 ecb->dte_epid);
6104 continue;
6105
6106 case DTRACEACT_CHILL:
6107#ifndef VBOX
6108 if (dtrace_priv_kernel_destructive(state))
6109 dtrace_action_chill(&mstate, val);
6110#else
6111 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6112#endif
6113 continue;
6114
6115 case DTRACEACT_RAISE:
6116#ifndef VBOX
6117 if (dtrace_priv_proc_destructive(state))
6118 dtrace_action_raise(val);
6119#else
6120 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6121#endif
6122 continue;
6123
6124 case DTRACEACT_COMMIT:
6125 ASSERT(!committed);
6126
6127 /*
6128 * We need to commit our buffer state.
6129 */
6130 if (ecb->dte_size)
6131 buf->dtb_offset = offs + ecb->dte_size;
6132 buf = &state->dts_buffer[cpuid];
6133 dtrace_speculation_commit(state, cpuid, val);
6134 committed = 1;
6135 continue;
6136
6137 case DTRACEACT_DISCARD:
6138 dtrace_speculation_discard(state, cpuid, val);
6139 continue;
6140
6141 case DTRACEACT_DIFEXPR:
6142 case DTRACEACT_LIBACT:
6143 case DTRACEACT_PRINTF:
6144 case DTRACEACT_PRINTA:
6145 case DTRACEACT_SYSTEM:
6146 case DTRACEACT_FREOPEN:
6147 break;
6148
6149 case DTRACEACT_SYM:
6150 case DTRACEACT_MOD:
6151 if (!dtrace_priv_kernel(state))
6152 continue;
6153 break;
6154
6155 case DTRACEACT_USYM:
6156 case DTRACEACT_UMOD:
6157 case DTRACEACT_UADDR: {
6158#ifndef VBOX
6159 struct pid *pid = curthread->t_procp->p_pidp;
6160
6161 if (!dtrace_priv_proc(state))
6162 continue;
6163
6164 DTRACE_STORE(uint64_t, tomax,
6165 valoffs, (uint64_t)pid->pid_id);
6166 DTRACE_STORE(uint64_t, tomax,
6167 valoffs + sizeof (uint64_t), val);
6168#else
6169 DTRACE_CPUFLAG_SET(CPU_DTRACE_UPRIV);
6170#endif
6171 continue;
6172 }
6173
6174 case DTRACEACT_EXIT: {
6175 /*
6176 * For the exit action, we are going to attempt
6177 * to atomically set our activity to be
6178 * draining. If this fails (either because
6179 * another CPU has beat us to the exit action,
6180 * or because our current activity is something
6181 * other than ACTIVE or WARMUP), we will
6182 * continue. This assures that the exit action
6183 * can be successfully recorded at most once
6184 * when we're in the ACTIVE state. If we're
6185 * encountering the exit() action while in
6186 * COOLDOWN, however, we want to honor the new
6187 * status code. (We know that we're the only
6188 * thread in COOLDOWN, so there is no race.)
6189 */
6190 void *activity = &state->dts_activity;
6191 dtrace_activity_t current = state->dts_activity;
6192
6193 if (current == DTRACE_ACTIVITY_COOLDOWN)
6194 break;
6195
6196 if (current != DTRACE_ACTIVITY_WARMUP)
6197 current = DTRACE_ACTIVITY_ACTIVE;
6198
6199 if (dtrace_cas32(activity, current,
6200 DTRACE_ACTIVITY_DRAINING) != current) {
6201 *flags |= CPU_DTRACE_DROP;
6202 continue;
6203 }
6204
6205 break;
6206 }
6207
6208 default:
6209 ASSERT(0);
6210 }
6211
6212 if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF) {
6213 uintptr_t end = valoffs + size;
6214
6215 if (!dtrace_vcanload((void *)(uintptr_t)val,
6216 &dp->dtdo_rtype, &mstate, vstate))
6217 continue;
6218
6219 /*
6220 * If this is a string, we're going to only
6221 * load until we find the zero byte -- after
6222 * which we'll store zero bytes.
6223 */
6224 if (dp->dtdo_rtype.dtdt_kind ==
6225 DIF_TYPE_STRING) {
6226				char c = '\0' + 1;	/* non-NUL: force a load on the first pass */
6227 int intuple = act->dta_intuple;
6228 size_t s;
6229
6230 for (s = 0; s < size; s++) {
6231 if (c != '\0')
6232 c = dtrace_load8(val++);
6233
6234 DTRACE_STORE(uint8_t, tomax,
6235 valoffs++, c);
6236
6237 if (c == '\0' && intuple)
6238 break;
6239 }
6240
6241 continue;
6242 }
6243
6244 while (valoffs < end) {
6245 DTRACE_STORE(uint8_t, tomax, valoffs++,
6246 dtrace_load8(val++));
6247 }
6248
6249 continue;
6250 }
6251
6252 switch (size) {
6253 case 0:
6254 break;
6255
6256 case sizeof (uint8_t):
6257 DTRACE_STORE(uint8_t, tomax, valoffs, val);
6258 break;
6259 case sizeof (uint16_t):
6260 DTRACE_STORE(uint16_t, tomax, valoffs, val);
6261 break;
6262 case sizeof (uint32_t):
6263 DTRACE_STORE(uint32_t, tomax, valoffs, val);
6264 break;
6265 case sizeof (uint64_t):
6266 DTRACE_STORE(uint64_t, tomax, valoffs, val);
6267 break;
6268 default:
6269 /*
6270 * Any other size should have been returned by
6271 * reference, not by value.
6272 */
6273 ASSERT(0);
6274 break;
6275 }
6276 }
6277
6278 if (*flags & CPU_DTRACE_DROP)
6279 continue;
6280
6281 if (*flags & CPU_DTRACE_FAULT) {
6282 int ndx;
6283 dtrace_action_t *err;
6284
6285 buf->dtb_errors++;
6286
6287 if (probe->dtpr_id == dtrace_probeid_error) {
6288 /*
6289 * There's nothing we can do -- we had an
6290 * error on the error probe. We bump an
6291 * error counter to at least indicate that
6292 * this condition happened.
6293 */
6294 dtrace_error(&state->dts_dblerrors);
6295 continue;
6296 }
6297
6298 if (vtime) {
6299 /*
6300 * Before recursing on dtrace_probe(), we
6301 * need to explicitly clear out our start
6302 * time to prevent it from being accumulated
6303 * into t_dtrace_vtime.
6304 */
6305 curthread->t_dtrace_start = 0;
6306 }
6307
6308 /*
6309 * Iterate over the actions to figure out which action
6310 * we were processing when we experienced the error.
6311 * Note that act points _past_ the faulting action; if
6312 * act is ecb->dte_action, the fault was in the
6313			 * predicate; if it's ecb->dte_action->dta_next, it's
6314 * in action #1, and so on.
6315 */
6316 for (err = ecb->dte_action, ndx = 0;
6317 err != act; err = err->dta_next, ndx++)
6318 continue;
6319
6320 dtrace_probe_error(state, ecb->dte_epid, ndx,
6321 (mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
6322 mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),
6323 cpu_core[cpuid].cpuc_dtrace_illval);
6324
6325 continue;
6326 }
6327
6328 if (!committed)
6329 buf->dtb_offset = offs + ecb->dte_size;
6330 }
6331
6332 if (vtime)
6333 curthread->t_dtrace_start = dtrace_gethrtime();
6334
6335 dtrace_interrupt_enable(cookie);
6336}
6337
6338/*
6339 * DTrace Probe Hashing Functions
6340 *
6341 * The functions in this section (and indeed, the functions in remaining
6342 * sections) are not _called_ from probe context. (Any exceptions to this are
6343 * marked with a "Note:".) Rather, they are called from elsewhere in the
6344 * DTrace framework to look up probes in, add probes to, and remove probes from
6345 * the DTrace probe hashes. (Each probe is hashed by each element of the
6346 * probe tuple -- allowing for fast lookups, regardless of what was
6347 * specified.)
6348 */
6349static uint_t
6350dtrace_hash_str(char *p)
6351{
6352 unsigned int g;
6353 uint_t hval = 0;
6354
6355 while (*p) {
6356 hval = (hval << 4) + *p++;
6357 if ((g = (hval & 0xf0000000)) != 0)
6358 hval ^= g >> 24;
6359 hval &= ~g;
6360 }
6361 return (hval);
6362}
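/*
 * Illustrative note (not from the original source): dtrace_hash_str() is
 * the classic ELF/PJW string hash -- shift in four bits per character and
 * fold any bits that reach the top nibble back down.  Worked example:
 *
 *	dtrace_hash_str("fbt"):
 *	    hval = (0     << 4) + 'f' (0x66) = 0x66
 *	    hval = (0x66  << 4) + 'b' (0x62) = 0x6c2
 *	    hval = (0x6c2 << 4) + 't' (0x74) = 0x6c94
 *
 * The high-nibble fold never fires for short strings like this; it only
 * matters once enough characters have been shifted in to reach bit 28.
 */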
6363
6364static dtrace_hash_t *
6365dtrace_hash_create(uintptr_t stroffs, uintptr_t nextoffs, uintptr_t prevoffs)
6366{
6367 dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
6368
6369 hash->dth_stroffs = stroffs;
6370 hash->dth_nextoffs = nextoffs;
6371 hash->dth_prevoffs = prevoffs;
6372
6373 hash->dth_size = 1;
6374 hash->dth_mask = hash->dth_size - 1;
6375
6376 hash->dth_tab = kmem_zalloc(hash->dth_size *
6377 sizeof (dtrace_hashbucket_t *), KM_SLEEP);
6378
6379 return (hash);
6380}
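/*
 * Illustrative sketch (not part of the original source): the hash is
 * "intrusive" -- rather than allocating nodes, it chains probes through
 * link fields embedded in dtrace_probe_t, located via the three byte
 * offsets passed to dtrace_hash_create().  Assuming the dtpr_func,
 * dtpr_nextfunc and dtpr_prevfunc members declared in
 * <sys/dtrace_impl.h>, the by-function hash would be created along
 * these lines:
 */
#if 0	/* illustration only */
	dtrace_hash_t *byfunc = dtrace_hash_create(
	    offsetof(dtrace_probe_t, dtpr_func),	/* key string */
	    offsetof(dtrace_probe_t, dtpr_nextfunc),	/* forward link */
	    offsetof(dtrace_probe_t, dtpr_prevfunc));	/* backward link */
#endif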
6381
6382static void
6383dtrace_hash_destroy(dtrace_hash_t *hash)
6384{
6385#ifdef DEBUG
6386 int i;
6387
6388 for (i = 0; i < hash->dth_size; i++)
6389 ASSERT(hash->dth_tab[i] == NULL);
6390#endif
6391
6392 kmem_free(hash->dth_tab,
6393 hash->dth_size * sizeof (dtrace_hashbucket_t *));
6394 kmem_free(hash, sizeof (dtrace_hash_t));
6395}
6396
6397static void
6398dtrace_hash_resize(dtrace_hash_t *hash)
6399{
6400 int size = hash->dth_size, i, ndx;
6401 int new_size = hash->dth_size << 1;
6402 int new_mask = new_size - 1;
6403 dtrace_hashbucket_t **new_tab, *bucket, *next;
6404
6405 ASSERT((new_size & new_mask) == 0);
6406
6407 new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
6408
6409 for (i = 0; i < size; i++) {
6410 for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
6411 dtrace_probe_t *probe = bucket->dthb_chain;
6412
6413 ASSERT(probe != NULL);
6414 ndx = DTRACE_HASHSTR(hash, probe) & new_mask;
6415
6416 next = bucket->dthb_next;
6417 bucket->dthb_next = new_tab[ndx];
6418 new_tab[ndx] = bucket;
6419 }
6420 }
6421
6422 kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
6423 hash->dth_tab = new_tab;
6424 hash->dth_size = new_size;
6425 hash->dth_mask = new_mask;
6426}
6427
6428static void
6429dtrace_hash_add(dtrace_hash_t *hash, dtrace_probe_t *new)
6430{
6431 int hashval = DTRACE_HASHSTR(hash, new);
6432 int ndx = hashval & hash->dth_mask;
6433 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6434 dtrace_probe_t **nextp, **prevp;
6435
6436 for (; bucket != NULL; bucket = bucket->dthb_next) {
6437 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
6438 goto add;
6439 }
6440
6441	if ((hash->dth_nbuckets >> 1) > hash->dth_size) {	/* avg. > 2 buckets per slot */
6442 dtrace_hash_resize(hash);
6443 dtrace_hash_add(hash, new);
6444 return;
6445 }
6446
6447 bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
6448 bucket->dthb_next = hash->dth_tab[ndx];
6449 hash->dth_tab[ndx] = bucket;
6450 hash->dth_nbuckets++;
6451
6452add:
6453 nextp = DTRACE_HASHNEXT(hash, new);
6454 ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);
6455 *nextp = bucket->dthb_chain;
6456
6457 if (bucket->dthb_chain != NULL) {
6458 prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
6459 ASSERT(*prevp == NULL);
6460 *prevp = new;
6461 }
6462
6463 bucket->dthb_chain = new;
6464 bucket->dthb_len++;
6465}
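/*
 * Illustrative note (not from the original source): each bucket collects
 * the probes whose hashed string compares equal, chained through the
 * embedded next/prev links; dthb_len counts them.  For the by-function
 * hash, three probes on "read" and one on "write" that land in the same
 * slot would look like:
 *
 *	dth_tab[ndx] -> bucket("write", len 1) -> bucket("read", len 3)
 *	                      |                        |
 *	                    probe                   probe <-> probe <-> probe
 *
 * dtrace_hash_lookup() therefore returns the head of a chain of probes
 * sharing the key, and dtrace_hash_collisions() returns that chain's
 * length.
 */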
6466
6467static dtrace_probe_t *
6468dtrace_hash_lookup(dtrace_hash_t *hash, dtrace_probe_t *template)
6469{
6470 int hashval = DTRACE_HASHSTR(hash, template);
6471 int ndx = hashval & hash->dth_mask;
6472 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6473
6474 for (; bucket != NULL; bucket = bucket->dthb_next) {
6475 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
6476 return (bucket->dthb_chain);
6477 }
6478
6479 return (NULL);
6480}
6481
6482static int
6483dtrace_hash_collisions(dtrace_hash_t *hash, dtrace_probe_t *template)
6484{
6485 int hashval = DTRACE_HASHSTR(hash, template);
6486 int ndx = hashval & hash->dth_mask;
6487 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6488
6489 for (; bucket != NULL; bucket = bucket->dthb_next) {
6490 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
6491 return (bucket->dthb_len);
6492 }
6493
6494	return (0);
6495}
6496
6497static void
6498dtrace_hash_remove(dtrace_hash_t *hash, dtrace_probe_t *probe)
6499{
6500 int ndx = DTRACE_HASHSTR(hash, probe) & hash->dth_mask;
6501 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6502
6503 dtrace_probe_t **prevp = DTRACE_HASHPREV(hash, probe);
6504 dtrace_probe_t **nextp = DTRACE_HASHNEXT(hash, probe);
6505
6506 /*
6507 * Find the bucket that we're removing this probe from.
6508 */
6509 for (; bucket != NULL; bucket = bucket->dthb_next) {
6510 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, probe))
6511 break;
6512 }
6513
6514 ASSERT(bucket != NULL);
6515
6516 if (*prevp == NULL) {
6517 if (*nextp == NULL) {
6518 /*
6519 * The removed probe was the only probe on this
6520 * bucket; we need to remove the bucket.
6521 */
6522 dtrace_hashbucket_t *b = hash->dth_tab[ndx];
6523
6524 ASSERT(bucket->dthb_chain == probe);
6525 ASSERT(b != NULL);
6526
6527 if (b == bucket) {
6528 hash->dth_tab[ndx] = bucket->dthb_next;
6529 } else {
6530 while (b->dthb_next != bucket)
6531 b = b->dthb_next;
6532 b->dthb_next = bucket->dthb_next;
6533 }
6534
6535 ASSERT(hash->dth_nbuckets > 0);
6536 hash->dth_nbuckets--;
6537 kmem_free(bucket, sizeof (dtrace_hashbucket_t));
6538 return;
6539 }
6540
6541 bucket->dthb_chain = *nextp;
6542 } else {
6543 *(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;
6544 }
6545
6546 if (*nextp != NULL)
6547 *(DTRACE_HASHPREV(hash, *nextp)) = *prevp;
6548}
6549
6550/*
6551 * DTrace Utility Functions
6552 *
6553 * These are random utility functions that are _not_ called from probe context.
6554 */
6555static int
6556dtrace_badattr(const dtrace_attribute_t *a)
6557{
6558 return (a->dtat_name > DTRACE_STABILITY_MAX ||
6559 a->dtat_data > DTRACE_STABILITY_MAX ||
6560 a->dtat_class > DTRACE_CLASS_MAX);
6561}
6562
6563/*
6564 * Return a duplicate copy of a string. If the specified string is NULL,
6565 * this function returns a zero-length string.
6566 */
6567static char *
6568dtrace_strdup(const char *str)
6569{
6570 char *new = kmem_zalloc((str != NULL ? strlen(str) : 0) + 1, KM_SLEEP);
6571
6572 if (str != NULL)
6573 (void) strcpy(new, str);
6574
6575 return (new);
6576}
6577
6578#define DTRACE_ISALPHA(c) \
6579 (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
6580
6581static int
6582dtrace_badname(const char *s)
6583{
6584 char c;
6585
6586 if (s == NULL || (c = *s++) == '\0')
6587 return (0);
6588
6589 if (!DTRACE_ISALPHA(c) && c != '-' && c != '_' && c != '.')
6590 return (1);
6591
6592 while ((c = *s++) != '\0') {
6593 if (!DTRACE_ISALPHA(c) && (c < '0' || c > '9') &&
6594 c != '-' && c != '_' && c != '.' && c != '`')
6595 return (1);
6596 }
6597
6598 return (0);
6599}
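/*
 * Illustrative examples (not from the original source): dtrace_badname()
 * accepts "fbt", "vbox-guest", "my_provider" and "genunix`read" (the
 * backquote is allowed after the first character), but rejects "8ball"
 * (leading digit) and "bad name" (embedded space).  NULL and "" are
 * treated as acceptable here; emptiness is checked by the callers.
 */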
6600
6601static void
6602dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
6603{
6604 uint32_t priv;
6605
6606 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
6607 /*
6608 * For DTRACE_PRIV_ALL, the uid and zoneid don't matter.
6609 */
6610 priv = DTRACE_PRIV_ALL;
6611 } else {
6612 *uidp = crgetuid(cr);
6613 *zoneidp = crgetzoneid(cr);
6614
6615 priv = 0;
6616 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
6617 priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;
6618 else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
6619 priv |= DTRACE_PRIV_USER;
6620 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
6621 priv |= DTRACE_PRIV_PROC;
6622 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
6623 priv |= DTRACE_PRIV_OWNER;
6624 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
6625 priv |= DTRACE_PRIV_ZONEOWNER;
6626 }
6627
6628 *privp = priv;
6629}
6630
6631#ifdef DTRACE_ERRDEBUG
6632static void
6633dtrace_errdebug(const char *str)
6634{
6635 int hval = dtrace_hash_str((char *)str) % DTRACE_ERRHASHSZ;
6636 int occupied = 0;
6637
6638 mutex_enter(&dtrace_errlock);
6639 dtrace_errlast = str;
6640 dtrace_errthread = curthread;
6641
6642 while (occupied++ < DTRACE_ERRHASHSZ) {
6643 if (dtrace_errhash[hval].dter_msg == str) {
6644 dtrace_errhash[hval].dter_count++;
6645 goto out;
6646 }
6647
6648 if (dtrace_errhash[hval].dter_msg != NULL) {
6649 hval = (hval + 1) % DTRACE_ERRHASHSZ;
6650 continue;
6651 }
6652
6653 dtrace_errhash[hval].dter_msg = str;
6654 dtrace_errhash[hval].dter_count = 1;
6655 goto out;
6656 }
6657
6658 panic("dtrace: undersized error hash");
6659out:
6660 mutex_exit(&dtrace_errlock);
6661}
6662#endif
6663
6664/*
6665 * DTrace Matching Functions
6666 *
6667 * These functions are used to match groups of probes, given some elements of
6668 * a probe tuple, or some globbed expressions for elements of a probe tuple.
6669 */
6670static int
6671dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
6672 zoneid_t zoneid)
6673{
6674 if (priv != DTRACE_PRIV_ALL) {
6675 uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
6676 uint32_t match = priv & ppriv;
6677
6678 /*
6679 * No PRIV_DTRACE_* privileges...
6680 */
6681 if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |
6682 DTRACE_PRIV_KERNEL)) == 0)
6683 return (0);
6684
6685 /*
6686 * No matching bits, but there were bits to match...
6687 */
6688 if (match == 0 && ppriv != 0)
6689 return (0);
6690
6691 /*
6692 * Need to have permissions to the process, but don't...
6693 */
6694 if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&
6695 uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
6696 return (0);
6697 }
6698
6699 /*
6700 * Need to be in the same zone unless we possess the
6701 * privilege to examine all zones.
6702 */
6703 if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&
6704 zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
6705 return (0);
6706 }
6707 }
6708
6709 return (1);
6710}
6711
6712/*
6713 * dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
6714 * consists of input pattern strings and an ops-vector to evaluate them.
6715 * This function returns >0 for match, 0 for no match, and <0 for error.
6716 */
6717static int
6718dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,
6719 uint32_t priv, uid_t uid, zoneid_t zoneid)
6720{
6721 dtrace_provider_t *pvp = prp->dtpr_provider;
6722 int rv;
6723
6724 if (pvp->dtpv_defunct)
6725 return (0);
6726
6727 if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0)
6728 return (rv);
6729
6730 if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)
6731 return (rv);
6732
6733 if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)
6734 return (rv);
6735
6736 if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)
6737 return (rv);
6738
6739 if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)
6740 return (0);
6741
6742 return (rv);
6743}
6744
6745/*
6746 * dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
6747 * interface for matching a glob pattern 'p' to an input string 's'. Unlike
6748 * libc's version, the kernel version only applies to 8-bit ASCII strings.
6749 * In addition, all of the recursion cases except for '*' matching have been
6750 * unwound. For '*', we still implement recursive evaluation, but a depth
6751 * counter is maintained and matching is aborted if we recurse too deep.
6752 * The function returns 0 if no match, >0 if match, and <0 if recursion error.
6753 */
6754static int
6755dtrace_match_glob(const char *s, const char *p, int depth)
6756{
6757 const char *olds;
6758 char s1, c;
6759 int gs;
6760
6761 if (depth > DTRACE_PROBEKEY_MAXDEPTH)
6762 return (-1);
6763
6764 if (s == NULL)
6765 s = ""; /* treat NULL as empty string */
6766
6767top:
6768 olds = s;
6769 s1 = *s++;
6770
6771 if (p == NULL)
6772 return (0);
6773
6774 if ((c = *p++) == '\0')
6775 return (s1 == '\0');
6776
6777 switch (c) {
6778 case '[': {
6779 int ok = 0, notflag = 0;
6780 char lc = '\0';
6781
6782 if (s1 == '\0')
6783 return (0);
6784
6785 if (*p == '!') {
6786 notflag = 1;
6787 p++;
6788 }
6789
6790 if ((c = *p++) == '\0')
6791 return (0);
6792
6793 do {
6794 if (c == '-' && lc != '\0' && *p != ']') {
6795 if ((c = *p++) == '\0')
6796 return (0);
6797 if (c == '\\' && (c = *p++) == '\0')
6798 return (0);
6799
6800 if (notflag) {
6801 if (s1 < lc || s1 > c)
6802 ok++;
6803 else
6804 return (0);
6805 } else if (lc <= s1 && s1 <= c)
6806 ok++;
6807
6808 } else if (c == '\\' && (c = *p++) == '\0')
6809 return (0);
6810
6811 lc = c; /* save left-hand 'c' for next iteration */
6812
6813 if (notflag) {
6814 if (s1 != c)
6815 ok++;
6816 else
6817 return (0);
6818 } else if (s1 == c)
6819 ok++;
6820
6821 if ((c = *p++) == '\0')
6822 return (0);
6823
6824 } while (c != ']');
6825
6826 if (ok)
6827 goto top;
6828
6829 return (0);
6830 }
6831
6832 case '\\':
6833 if ((c = *p++) == '\0')
6834 return (0);
6835 /*FALLTHRU*/
6836
6837 default:
6838 if (c != s1)
6839 return (0);
6840 /*FALLTHRU*/
6841
6842 case '?':
6843 if (s1 != '\0')
6844 goto top;
6845 return (0);
6846
6847 case '*':
6848 while (*p == '*')
6849 p++; /* consecutive *'s are identical to a single one */
6850
6851 if (*p == '\0')
6852 return (1);
6853
6854 for (s = olds; *s != '\0'; s++) {
6855 if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)
6856 return (gs);
6857 }
6858
6859 return (0);
6860 }
6861}
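/*
 * Illustrative examples (not part of the original source) of the glob
 * semantics implemented above:
 */
#if 0	/* illustration only */
	ASSERT(dtrace_match_glob("read", "re*", 0) > 0);	/* '*' suffix */
	ASSERT(dtrace_match_glob("get", "[gs]et", 0) > 0);	/* char class */
	ASSERT(dtrace_match_glob("set", "[!gs]et", 0) == 0);	/* negated class */
	ASSERT(dtrace_match_glob("a*b", "a\\*b", 0) > 0);	/* escaped '*' */
	ASSERT(dtrace_match_glob(NULL, "*", 0) > 0);		/* NULL is "" */
#endif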
6862
6863/*ARGSUSED*/
6864static int
6865dtrace_match_string(const char *s, const char *p, int depth)
6866{
6867 return (s != NULL && strcmp(s, p) == 0);
6868}
6869
6870/*ARGSUSED*/
6871static int
6872dtrace_match_nul(const char *s, const char *p, int depth)
6873{
6874 return (1); /* always match the empty pattern */
6875}
6876
6877/*ARGSUSED*/
6878static int
6879dtrace_match_nonzero(const char *s, const char *p, int depth)
6880{
6881 return (s != NULL && s[0] != '\0');
6882}
6883
6884static int
6885dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
6886 zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *), void *arg)
6887{
6888 dtrace_probe_t template, *probe;
6889 dtrace_hash_t *hash = NULL;
6890 int len, rc, best = INT_MAX, nmatched = 0;
6891 dtrace_id_t i;
6892
6893 ASSERT(MUTEX_HELD(&dtrace_lock));
6894
6895 /*
6896 * If the probe ID is specified in the key, just lookup by ID and
6897 * invoke the match callback once if a matching probe is found.
6898 */
6899 if (pkp->dtpk_id != DTRACE_IDNONE) {
6900 if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
6901 dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
6902 if ((*matched)(probe, arg) == DTRACE_MATCH_FAIL)
6903 return (DTRACE_MATCH_FAIL);
6904 nmatched++;
6905 }
6906 return (nmatched);
6907 }
6908
6909 template.dtpr_mod = (char *)pkp->dtpk_mod;
6910 template.dtpr_func = (char *)pkp->dtpk_func;
6911 template.dtpr_name = (char *)pkp->dtpk_name;
6912
6913 /*
6914 * We want to find the most distinct of the module name, function
6915 * name, and name. So for each one that is not a glob pattern or
6916 * empty string, we perform a lookup in the corresponding hash and
6917 * use the hash table with the fewest collisions to do our search.
6918 */
6919 if (pkp->dtpk_mmatch == &dtrace_match_string &&
6920 (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
6921 best = len;
6922 hash = dtrace_bymod;
6923 }
6924
6925 if (pkp->dtpk_fmatch == &dtrace_match_string &&
6926 (len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {
6927 best = len;
6928 hash = dtrace_byfunc;
6929 }
6930
6931 if (pkp->dtpk_nmatch == &dtrace_match_string &&
6932 (len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {
6933 best = len;
6934 hash = dtrace_byname;
6935 }
6936
6937 /*
6938 * If we did not select a hash table, iterate over every probe and
6939 * invoke our callback for each one that matches our input probe key.
6940 */
6941 if (hash == NULL) {
6942 for (i = 0; i < VBDTCAST(dtrace_id_t)dtrace_nprobes; i++) {
6943 if ((probe = dtrace_probes[i]) == NULL ||
6944 dtrace_match_probe(probe, pkp, priv, uid,
6945 zoneid) <= 0)
6946 continue;
6947
6948 nmatched++;
6949
6950 if ((rc = (*matched)(probe, arg)) !=
6951 DTRACE_MATCH_NEXT) {
6952 if (rc == DTRACE_MATCH_FAIL)
6953 return (DTRACE_MATCH_FAIL);
6954 break;
6955 }
6956 }
6957
6958 return (nmatched);
6959 }
6960
6961 /*
6962 * If we selected a hash table, iterate over each probe of the same key
6963 * name and invoke the callback for every probe that matches the other
6964 * attributes of our input probe key.
6965 */
6966 for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;
6967 probe = *(DTRACE_HASHNEXT(hash, probe))) {
6968
6969 if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)
6970 continue;
6971
6972 nmatched++;
6973
6974 if ((rc = (*matched)(probe, arg)) != DTRACE_MATCH_NEXT) {
6975 if (rc == DTRACE_MATCH_FAIL)
6976 return (DTRACE_MATCH_FAIL);
6977 break;
6978 }
6979 }
6980
6981 return (nmatched);
6982}
6983
6984/*
6985 * Return the function pointer dtrace_match_probe() should use to compare the
6986 * specified pattern with a string. For NULL or empty patterns, we select
6987 * dtrace_match_nul(). For glob pattern strings, we use dtrace_match_glob().
6988 * For non-empty non-glob strings, we use dtrace_match_string().
6989 */
6990static dtrace_probekey_f *
6991dtrace_probekey_func(const char *p)
6992{
6993 char c;
6994
6995 if (p == NULL || *p == '\0')
6996 return (&dtrace_match_nul);
6997
6998 while ((c = *p++) != '\0') {
6999 if (c == '[' || c == '?' || c == '*' || c == '\\')
7000 return (&dtrace_match_glob);
7001 }
7002
7003 return (&dtrace_match_string);
7004}
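/*
 * Illustrative examples (not from the original source):
 *
 *	dtrace_probekey_func(NULL)	== &dtrace_match_nul
 *	dtrace_probekey_func("")	== &dtrace_match_nul
 *	dtrace_probekey_func("unix")	== &dtrace_match_string
 *	dtrace_probekey_func("fbt*")	== &dtrace_match_glob
 *	dtrace_probekey_func("tcp[46]")	== &dtrace_match_glob
 */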
7005
7006/*
7007 * Build a probe comparison key for use with dtrace_match_probe() from the
7008 * given probe description. By convention, a null key only matches anchored
7009 * probes: if each field is the empty string, reset dtpk_fmatch to
7010 * dtrace_match_nonzero().
7011 */
7012static void
7013dtrace_probekey(const dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
7014{
7015 pkp->dtpk_prov = pdp->dtpd_provider;
7016 pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);
7017
7018 pkp->dtpk_mod = pdp->dtpd_mod;
7019 pkp->dtpk_mmatch = dtrace_probekey_func(pdp->dtpd_mod);
7020
7021 pkp->dtpk_func = pdp->dtpd_func;
7022 pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);
7023
7024 pkp->dtpk_name = pdp->dtpd_name;
7025 pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);
7026
7027 pkp->dtpk_id = pdp->dtpd_id;
7028
7029 if (pkp->dtpk_id == DTRACE_IDNONE &&
7030 pkp->dtpk_pmatch == &dtrace_match_nul &&
7031 pkp->dtpk_mmatch == &dtrace_match_nul &&
7032 pkp->dtpk_fmatch == &dtrace_match_nul &&
7033 pkp->dtpk_nmatch == &dtrace_match_nul)
7034 pkp->dtpk_fmatch = &dtrace_match_nonzero;
7035}
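/*
 * Illustrative example (not from the original source): for the fully
 * empty description ":::", every field selects dtrace_match_nul, so the
 * final clause above swaps dtpk_fmatch to dtrace_match_nonzero.  The key
 * then matches only probes with a non-empty function name -- i.e. probes
 * anchored to a specific point in the kernel -- rather than matching
 * every unanchored probe as well.
 */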
7036
7037/*
7038 * DTrace Provider-to-Framework API Functions
7039 *
7040 * These functions implement much of the Provider-to-Framework API, as
7041 * described in <sys/dtrace.h>. The parts of the API not in this section are
7042 * the functions in the API for probe management (found below), and
7043 * dtrace_probe() itself (found above).
7044 */
7045
7046/*
7047 * Register the calling provider with the DTrace framework. This should
7048 * generally be called by DTrace providers in their attach(9E) entry point.
7049 */
7050int
7051dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
7052 cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)
7053{
7054 dtrace_provider_t *provider;
7055
7056 if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {
7057 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7058 "arguments", name ? name : "<NULL>");
7059 return (EINVAL);
7060 }
7061
7062 if (name[0] == '\0' || dtrace_badname(name)) {
7063 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7064 "provider name", name);
7065 return (EINVAL);
7066 }
7067
7068 if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||
7069 pops->dtps_enable == NULL || pops->dtps_disable == NULL ||
7070 pops->dtps_destroy == NULL ||
7071 ((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
7072 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7073 "provider ops", name);
7074 return (EINVAL);
7075 }
7076
7077 if (dtrace_badattr(&pap->dtpa_provider) ||
7078 dtrace_badattr(&pap->dtpa_mod) ||
7079 dtrace_badattr(&pap->dtpa_func) ||
7080 dtrace_badattr(&pap->dtpa_name) ||
7081 dtrace_badattr(&pap->dtpa_args)) {
7082 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7083 "provider attributes", name);
7084 return (EINVAL);
7085 }
7086
7087 if (priv & ~DTRACE_PRIV_ALL) {
7088 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7089 "privilege attributes", name);
7090 return (EINVAL);
7091 }
7092
7093 if ((priv & DTRACE_PRIV_KERNEL) &&
7094 (priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&
7095 pops->dtps_usermode == NULL) {
7096 cmn_err(CE_WARN, "failed to register provider '%s': need "
7097 "dtps_usermode() op for given privilege attributes", name);
7098 return (EINVAL);
7099 }
7100
7101 provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
7102 provider->dtpv_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
7103 (void) strcpy(provider->dtpv_name, name);
7104
7105 provider->dtpv_attr = *pap;
7106 provider->dtpv_priv.dtpp_flags = priv;
7107 if (cr != NULL) {
7108 provider->dtpv_priv.dtpp_uid = crgetuid(cr);
7109 provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
7110 }
7111 provider->dtpv_pops = *pops;
7112
7113 if (pops->dtps_provide == NULL) {
7114 ASSERT(pops->dtps_provide_module != NULL);
7115 provider->dtpv_pops.dtps_provide =
7116 (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop;
7117 }
7118
7119 if (pops->dtps_provide_module == NULL) {
7120 ASSERT(pops->dtps_provide != NULL);
7121 provider->dtpv_pops.dtps_provide_module =
7122 (void (*)(void *, struct modctl *))dtrace_nullop;
7123 }
7124
7125 if (pops->dtps_suspend == NULL) {
7126 ASSERT(pops->dtps_resume == NULL);
7127 provider->dtpv_pops.dtps_suspend =
7128 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
7129 provider->dtpv_pops.dtps_resume =
7130 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
7131 }
7132
7133 provider->dtpv_arg = arg;
7134 *idp = (dtrace_provider_id_t)provider;
7135
7136 if (pops == &dtrace_provider_ops) {
7137 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
7138 ASSERT(MUTEX_HELD(&dtrace_lock));
7139 ASSERT(dtrace_anon.dta_enabling == NULL);
7140
7141 /*
7142 * We make sure that the DTrace provider is at the head of
7143 * the provider chain.
7144 */
7145 provider->dtpv_next = dtrace_provider;
7146 dtrace_provider = provider;
7147 return (0);
7148 }
7149
7150 mutex_enter(&dtrace_provider_lock);
7151 mutex_enter(&dtrace_lock);
7152
7153 /*
7154 * If there is at least one provider registered, we'll add this
7155 * provider after the first provider.
7156 */
7157 if (dtrace_provider != NULL) {
7158 provider->dtpv_next = dtrace_provider->dtpv_next;
7159 dtrace_provider->dtpv_next = provider;
7160 } else {
7161 dtrace_provider = provider;
7162 }
7163
7164 if (dtrace_retained != NULL) {
7165 dtrace_enabling_provide(provider);
7166
7167 /*
7168 * Now we need to call dtrace_enabling_matchall() -- which
7169 * will acquire cpu_lock and dtrace_lock. We therefore need
7170 * to drop all of our locks before calling into it...
7171 */
7172 mutex_exit(&dtrace_lock);
7173 mutex_exit(&dtrace_provider_lock);
7174 dtrace_enabling_matchall();
7175
7176 return (0);
7177 }
7178
7179 mutex_exit(&dtrace_lock);
7180 mutex_exit(&dtrace_provider_lock);
7181
7182 return (0);
7183}
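/*
 * Illustrative sketch (not part of the original source): a minimal
 * provider registering itself and creating one probe.  The "example"
 * names, the zero-filled attributes and the use of C99 designated
 * initializers are assumptions for illustration only; a real provider
 * would live in its own module and fill in all of the ops it supports.
 */
#if 0	/* illustration only */
static dtrace_provider_id_t example_id;

static void
example_provide(void *arg, const dtrace_probedesc_t *desc)
{
	/* Create our lone probe the first time we're asked to provide. */
	if (dtrace_probe_lookup(example_id, "example", "tick", "fire") == 0)
		(void) dtrace_probe_create(example_id, "example", "tick",
		    "fire", 0, NULL);
}

static int
example_enable(void *arg, dtrace_id_t id, void *parg)
{
	return (0);	/* arm the probe site here */
}

static void
example_disable(void *arg, dtrace_id_t id, void *parg)
{
	/* disarm the probe site here */
}

static void
example_destroy(void *arg, dtrace_id_t id, void *parg)
{
	/* release per-probe state here */
}

static dtrace_pops_t example_pops = {
	.dtps_provide = example_provide,
	.dtps_enable = example_enable,
	.dtps_disable = example_disable,
	.dtps_destroy = example_destroy,
	/* remaining ops NULL -- see the validation above */
};

/* Zero-filled attributes for brevity; real providers fill these in. */
static dtrace_pattr_t example_attr;

/* In attach(9E): */
	if (dtrace_register("example", &example_attr, DTRACE_PRIV_KERNEL,
	    NULL, &example_pops, NULL, &example_id) != 0)
		return (DDI_FAILURE);

/* In detach(9E), once nothing is enabled: */
	if (dtrace_unregister(example_id) != 0)
		return (DDI_FAILURE);
#endif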
7184
7185/*
7186 * Unregister the specified provider from the DTrace framework. This should
7187 * generally be called by DTrace providers in their detach(9E) entry point.
7188 */
7189int
7190dtrace_unregister(dtrace_provider_id_t id)
7191{
7192 dtrace_provider_t *old = (dtrace_provider_t *)id;
7193 dtrace_provider_t *prev = NULL;
7194 VBDTTYPE(uint32_t,int) i, self = 0;
7195 dtrace_probe_t *probe, *first = NULL;
7196
7197 if (old->dtpv_pops.dtps_enable ==
7198 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop) {
7199 /*
7200 * If DTrace itself is the provider, we're called with locks
7201 * already held.
7202 */
7203 ASSERT(old == dtrace_provider);
7204 ASSERT(dtrace_devi != NULL);
7205 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
7206 ASSERT(MUTEX_HELD(&dtrace_lock));
7207 self = 1;
7208
7209 if (dtrace_provider->dtpv_next != NULL) {
7210 /*
7211 * There's another provider here; return failure.
7212 */
7213 return (EBUSY);
7214 }
7215 } else {
7216 mutex_enter(&dtrace_provider_lock);
7217 mutex_enter(&mod_lock);
7218 mutex_enter(&dtrace_lock);
7219 }
7220
7221 /*
7222 * If anyone has /dev/dtrace open, or if there are anonymous enabled
7223 * probes, we refuse to let providers slither away, unless this
7224 * provider has already been explicitly invalidated.
7225 */
7226 if (!old->dtpv_defunct &&
7227 (dtrace_opens || (dtrace_anon.dta_state != NULL &&
7228 dtrace_anon.dta_state->dts_necbs > 0))) {
7229 if (!self) {
7230 mutex_exit(&dtrace_lock);
7231 mutex_exit(&mod_lock);
7232 mutex_exit(&dtrace_provider_lock);
7233 }
7234 return (EBUSY);
7235 }
7236
7237 /*
7238 * Attempt to destroy the probes associated with this provider.
7239 */
7240 for (i = 0; i < dtrace_nprobes; i++) {
7241 if ((probe = dtrace_probes[i]) == NULL)
7242 continue;
7243
7244 if (probe->dtpr_provider != old)
7245 continue;
7246
7247 if (probe->dtpr_ecb == NULL)
7248 continue;
7249
7250 /*
7251 * We have at least one ECB; we can't remove this provider.
7252 */
7253 if (!self) {
7254 mutex_exit(&dtrace_lock);
7255 mutex_exit(&mod_lock);
7256 mutex_exit(&dtrace_provider_lock);
7257 }
7258 return (EBUSY);
7259 }
7260
7261 /*
7262 * All of the probes for this provider are disabled; we can safely
7263 * remove all of them from their hash chains and from the probe array.
7264 */
7265 for (i = 0; i < dtrace_nprobes; i++) {
7266 if ((probe = dtrace_probes[i]) == NULL)
7267 continue;
7268
7269 if (probe->dtpr_provider != old)
7270 continue;
7271
7272 dtrace_probes[i] = NULL;
7273
7274 dtrace_hash_remove(dtrace_bymod, probe);
7275 dtrace_hash_remove(dtrace_byfunc, probe);
7276 dtrace_hash_remove(dtrace_byname, probe);
7277
7278 if (first == NULL) {
7279 first = probe;
7280 probe->dtpr_nextmod = NULL;
7281 } else {
7282 probe->dtpr_nextmod = first;
7283 first = probe;
7284 }
7285 }
7286
7287 /*
7288 * The provider's probes have been removed from the hash chains and
7289 * from the probe array. Now issue a dtrace_sync() to be sure that
7290 * everyone has cleared out from any probe array processing.
7291 */
7292 dtrace_sync();
7293
7294 for (probe = first; probe != NULL; probe = first) {
7295 first = probe->dtpr_nextmod;
7296
7297 old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
7298 probe->dtpr_arg);
7299 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
7300 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
7301 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
7302 vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
7303 kmem_free(probe, sizeof (dtrace_probe_t));
7304 }
7305
7306 if ((prev = dtrace_provider) == old) {
7307 ASSERT(self || dtrace_devi == NULL);
7308 ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
7309 dtrace_provider = old->dtpv_next;
7310 } else {
7311 while (prev != NULL && prev->dtpv_next != old)
7312 prev = prev->dtpv_next;
7313
7314 if (prev == NULL) {
7315 panic("attempt to unregister non-existent "
7316 "dtrace provider %p\n", (void *)id);
7317 }
7318
7319 prev->dtpv_next = old->dtpv_next;
7320 }
7321
7322 if (!self) {
7323 mutex_exit(&dtrace_lock);
7324 mutex_exit(&mod_lock);
7325 mutex_exit(&dtrace_provider_lock);
7326 }
7327
7328 kmem_free(old->dtpv_name, strlen(old->dtpv_name) + 1);
7329 kmem_free(old, sizeof (dtrace_provider_t));
7330
7331 return (0);
7332}
7333
7334/*
7335 * Invalidate the specified provider. All subsequent probe lookups for the
7336 * specified provider will fail, but its probes will not be removed.
7337 */
7338void
7339dtrace_invalidate(dtrace_provider_id_t id)
7340{
7341 dtrace_provider_t *pvp = (dtrace_provider_t *)id;
7342
7343 ASSERT(pvp->dtpv_pops.dtps_enable !=
7344 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
7345
7346 mutex_enter(&dtrace_provider_lock);
7347 mutex_enter(&dtrace_lock);
7348
7349 pvp->dtpv_defunct = 1;
7350
7351 mutex_exit(&dtrace_lock);
7352 mutex_exit(&dtrace_provider_lock);
7353}
7354
7355/*
7356 * Indicate whether or not DTrace has attached.
7357 */
7358int
7359dtrace_attached(void)
7360{
7361 /*
7362 * dtrace_provider will be non-NULL iff the DTrace driver has
7363 * attached. (It's non-NULL because DTrace is always itself a
7364 * provider.)
7365 */
7366 return (dtrace_provider != NULL);
7367}
7368
7369/*
7370 * Remove all the unenabled probes for the given provider. This function is
7371 * not unlike dtrace_unregister(), except that it doesn't remove the provider
7372 * -- just as many of its associated probes as it can.
7373 */
7374int
7375dtrace_condense(dtrace_provider_id_t id)
7376{
7377 dtrace_provider_t *prov = (dtrace_provider_t *)id;
7378 VBDTTYPE(uint32_t,int) i;
7379 dtrace_probe_t *probe;
7380
7381 /*
7382 * Make sure this isn't the dtrace provider itself.
7383 */
7384 ASSERT(prov->dtpv_pops.dtps_enable !=
7385 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
7386
7387 mutex_enter(&dtrace_provider_lock);
7388 mutex_enter(&dtrace_lock);
7389
7390 /*
7391 * Attempt to destroy the probes associated with this provider.
7392 */
7393 for (i = 0; i < dtrace_nprobes; i++) {
7394 if ((probe = dtrace_probes[i]) == NULL)
7395 continue;
7396
7397 if (probe->dtpr_provider != prov)
7398 continue;
7399
7400 if (probe->dtpr_ecb != NULL)
7401 continue;
7402
7403 dtrace_probes[i] = NULL;
7404
7405 dtrace_hash_remove(dtrace_bymod, probe);
7406 dtrace_hash_remove(dtrace_byfunc, probe);
7407 dtrace_hash_remove(dtrace_byname, probe);
7408
7409 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, i + 1,
7410 probe->dtpr_arg);
7411 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
7412 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
7413 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
7414 kmem_free(probe, sizeof (dtrace_probe_t));
7415 vmem_free(dtrace_arena, (void *)((uintptr_t)i + 1), 1);
7416 }
7417
7418 mutex_exit(&dtrace_lock);
7419 mutex_exit(&dtrace_provider_lock);
7420
7421 return (0);
7422}
7423
7424/*
7425 * DTrace Probe Management Functions
7426 *
7427 * The functions in this section perform the DTrace probe management,
7428 * including functions to create probes, look-up probes, and call into the
7429 * providers to request that probes be provided. Some of these functions are
7430 * in the Provider-to-Framework API; these functions can be identified by the
7431 * fact that they are not declared "static".
7432 */
7433
7434/*
7435 * Create a probe with the specified module name, function name, and name.
7436 */
7437dtrace_id_t
7438dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
7439 const char *func, const char *name, int aframes, void *arg)
7440{
7441 dtrace_probe_t *probe, **probes;
7442 dtrace_provider_t *provider = (dtrace_provider_t *)prov;
7443 dtrace_id_t id;
7444
7445 if (provider == dtrace_provider) {
7446 ASSERT(MUTEX_HELD(&dtrace_lock));
7447 } else {
7448 mutex_enter(&dtrace_lock);
7449 }
7450
7451 id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
7452 VM_BESTFIT | VM_SLEEP);
7453 probe = kmem_zalloc(sizeof (dtrace_probe_t), KM_SLEEP);
7454
7455 probe->dtpr_id = id;
7456 probe->dtpr_gen = dtrace_probegen++;
7457 probe->dtpr_mod = dtrace_strdup(mod);
7458 probe->dtpr_func = dtrace_strdup(func);
7459 probe->dtpr_name = dtrace_strdup(name);
7460 probe->dtpr_arg = arg;
7461 probe->dtpr_aframes = aframes;
7462 probe->dtpr_provider = provider;
7463
7464 dtrace_hash_add(dtrace_bymod, probe);
7465 dtrace_hash_add(dtrace_byfunc, probe);
7466 dtrace_hash_add(dtrace_byname, probe);
7467
7468 if (id - 1 >= dtrace_nprobes) {
7469 size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
7470 size_t nsize = osize << 1;
7471
7472 if (nsize == 0) {
7473 ASSERT(osize == 0);
7474 ASSERT(dtrace_probes == NULL);
7475 nsize = sizeof (dtrace_probe_t *);
7476 }
7477
7478 probes = kmem_zalloc(nsize, KM_SLEEP);
7479
7480 if (dtrace_probes == NULL) {
7481 ASSERT(osize == 0);
7482 dtrace_probes = probes;
7483 dtrace_nprobes = 1;
7484 } else {
7485 dtrace_probe_t **oprobes = dtrace_probes;
7486
7487 bcopy(oprobes, probes, osize);
7488 dtrace_membar_producer();
7489 dtrace_probes = probes;
7490
7491 dtrace_sync();
7492
7493 /*
7494 * All CPUs are now seeing the new probes array; we can
7495 * safely free the old array.
7496 */
7497 kmem_free(oprobes, osize);
7498 dtrace_nprobes <<= 1;
7499 }
7500
7501 ASSERT(id - 1 < dtrace_nprobes);
7502 }
7503
7504 ASSERT(dtrace_probes[id - 1] == NULL);
7505 dtrace_probes[id - 1] = probe;
7506
7507 if (provider != dtrace_provider)
7508 mutex_exit(&dtrace_lock);
7509
7510 return (id);
7511}
7512
7513static dtrace_probe_t *
7514dtrace_probe_lookup_id(dtrace_id_t id)
7515{
7516 ASSERT(MUTEX_HELD(&dtrace_lock));
7517
7518 if (id == 0 || id > dtrace_nprobes)
7519 return (NULL);
7520
7521 return (dtrace_probes[id - 1]);
7522}
7523
7524static int
7525dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg)
7526{
7527 *((dtrace_id_t *)arg) = probe->dtpr_id;
7528
7529 return (DTRACE_MATCH_DONE);
7530}
7531
7532/*
7533 * Look up a probe based on provider and one or more of module name, function
7534 * name and probe name.
7535 */
7536dtrace_id_t
7537dtrace_probe_lookup(dtrace_provider_id_t prid, const char *mod,
7538 const char *func, const char *name)
7539{
7540 dtrace_probekey_t pkey;
7541 dtrace_id_t id;
7542 int match;
7543
7544 pkey.dtpk_prov = ((dtrace_provider_t *)prid)->dtpv_name;
7545 pkey.dtpk_pmatch = &dtrace_match_string;
7546 pkey.dtpk_mod = mod;
7547 pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
7548 pkey.dtpk_func = func;
7549 pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
7550 pkey.dtpk_name = name;
7551 pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
7552 pkey.dtpk_id = DTRACE_IDNONE;
7553
7554 mutex_enter(&dtrace_lock);
7555 match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
7556 dtrace_probe_lookup_match, &id);
7557 mutex_exit(&dtrace_lock);
7558
7559 ASSERT(match == 1 || match == 0);
7560 return (match ? id : 0);
7561}
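/*
 * Illustrative note (not from the original source): a provider's
 * dtps_provide() op will typically call dtrace_probe_lookup() to avoid
 * re-creating probes on repeated provide requests; a zero return (no
 * such probe) gates the call to dtrace_probe_create(), as in the sketch
 * following dtrace_register() above.
 */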
7562
7563/*
7564 * Returns the probe argument associated with the specified probe.
7565 */
7566void *
7567dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
7568{
7569 dtrace_probe_t *probe;
7570 void *rval = NULL;
7571
7572 mutex_enter(&dtrace_lock);
7573
7574 if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
7575 probe->dtpr_provider == (dtrace_provider_t *)id)
7576 rval = probe->dtpr_arg;
7577
7578 mutex_exit(&dtrace_lock);
7579
7580 return (rval);
7581}
7582
7583/*
7584 * Copy a probe into a probe description.
7585 */
7586static void
7587dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
7588{
7589 bzero(pdp, sizeof (dtrace_probedesc_t));
7590 pdp->dtpd_id = prp->dtpr_id;
7591
7592 (void) strncpy(pdp->dtpd_provider,
7593 prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN - 1);
7594
7595 (void) strncpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN - 1);
7596 (void) strncpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN - 1);
7597 (void) strncpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN - 1);
7598}
7599
7600/*
7601 * Called to indicate that a probe -- or probes -- should be provided by a
7602 * specified provider. If the specified description is NULL, the provider will
7603 * be told to provide all of its probes. (This is done whenever a new
7604 * consumer comes along, or whenever a retained enabling is to be matched.) If
7605 * the specified description is non-NULL, the provider is given the
7606 * opportunity to dynamically provide the specified probe, allowing providers
7607 * to support the creation of probes on-the-fly. (So-called _autocreated_
7608 * probes.) If the provider is NULL, the operations will be applied to all
7609 * providers; if the provider is non-NULL the operations will only be applied
7610 * to the specified provider. The dtrace_provider_lock must be held, and the
7611 * dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
7612 * will need to grab the dtrace_lock when it reenters the framework through
7613 * dtrace_probe_lookup(), dtrace_probe_create(), etc.
7614 */
7615static void
7616dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
7617{
7618#ifndef VBOX
7619 struct modctl *ctl;
7620#endif
7621 int all = 0;
7622
7623 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
7624
7625 if (prv == NULL) {
7626 all = 1;
7627 prv = dtrace_provider;
7628 }
7629
7630 do {
7631 /*
7632 * First, call the blanket provide operation.
7633 */
7634 prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
7635
7636#ifndef VBOX
7637 /*
7638 * Now call the per-module provide operation. We will grab
7639 * mod_lock to prevent the list from being modified. Note
7640 * that this also prevents the mod_busy bits from changing.
7641 * (mod_busy can only be changed with mod_lock held.)
7642 */
7643 mutex_enter(&mod_lock);
7644
7645 ctl = &modules;
7646 do {
7647 if (ctl->mod_busy || ctl->mod_mp == NULL)
7648 continue;
7649
7650 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
7651
7652 } while ((ctl = ctl->mod_next) != &modules);
7653
7654 mutex_exit(&mod_lock);
7655#endif
7656 } while (all && (prv = prv->dtpv_next) != NULL);
7657}
7658
7659/*
7660 * Iterate over each probe, and call the Framework-to-Provider API function
7661 * denoted by offs.
7662 */
7663static void
7664dtrace_probe_foreach(uintptr_t offs)
7665{
7666 dtrace_provider_t *prov;
7667 void (*func)(void *, dtrace_id_t, void *);
7668 dtrace_probe_t *probe;
7669 dtrace_icookie_t cookie;
7670 VBDTTYPE(uint32_t,int) i;
7671
7672 /*
7673 * We disable interrupts to walk through the probe array. This is
7674 * safe -- the dtrace_sync() in dtrace_unregister() assures that we
7675 * won't see stale data.
7676 */
7677 cookie = dtrace_interrupt_disable();
7678
7679 for (i = 0; i < dtrace_nprobes; i++) {
7680 if ((probe = dtrace_probes[i]) == NULL)
7681 continue;
7682
7683 if (probe->dtpr_ecb == NULL) {
7684 /*
7685 * This probe isn't enabled -- don't call the function.
7686 */
7687 continue;
7688 }
7689
7690 prov = probe->dtpr_provider;
7691 func = *((void(**)(void *, dtrace_id_t, void *))
7692 ((uintptr_t)&prov->dtpv_pops + offs));
7693
7694 func(prov->dtpv_arg, i + 1, probe->dtpr_arg);
7695 }
7696
7697 dtrace_interrupt_enable(cookie);
7698}
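/*
 * Illustrative note (not from the original source): callers hand this
 * function the byte offset of one of the Framework-to-Provider ops,
 * e.g.:
 *
 *	dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
 *
 * which would invoke each provider's dtps_suspend() for every enabled
 * probe -- the suspend/resume paths are the expected users of this
 * broadcast mechanism.
 */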
7699
7700static int
7701dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab)
7702{
7703 dtrace_probekey_t pkey;
7704 uint32_t priv;
7705 uid_t uid;
7706 zoneid_t zoneid;
7707
7708 ASSERT(MUTEX_HELD(&dtrace_lock));
7709 dtrace_ecb_create_cache = NULL;
7710
7711 if (desc == NULL) {
7712 /*
7713 * If we're passed a NULL description, we're being asked to
7714 * create an ECB with a NULL probe.
7715 */
7716 (void) dtrace_ecb_create_enable(NULL, enab);
7717 return (0);
7718 }
7719
7720 dtrace_probekey(desc, &pkey);
7721 dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
7722 &priv, &uid, &zoneid);
7723
7724 return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable,
7725 enab));
7726}
7727
7728/*
7729 * DTrace Helper Provider Functions
7730 */
7731static void
7732dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)
7733{
7734 attr->dtat_name = DOF_ATTR_NAME(dofattr);
7735 attr->dtat_data = DOF_ATTR_DATA(dofattr);
7736 attr->dtat_class = DOF_ATTR_CLASS(dofattr);
7737}
7738
7739static void
7740dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
7741 const dof_provider_t *dofprov, char *strtab)
7742{
7743 hprov->dthpv_provname = strtab + dofprov->dofpv_name;
7744 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,
7745 dofprov->dofpv_provattr);
7746 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,
7747 dofprov->dofpv_modattr);
7748 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,
7749 dofprov->dofpv_funcattr);
7750 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,
7751 dofprov->dofpv_nameattr);
7752 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,
7753 dofprov->dofpv_argsattr);
7754}
7755
7756static void
7757dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
7758{
7759 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
7760 dof_hdr_t *dof = (dof_hdr_t *)daddr;
7761 dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
7762 dof_provider_t *provider;
7763 dof_probe_t *probe;
7764 uint32_t *off, *enoff;
7765 uint8_t *arg;
7766 char *strtab;
7767 uint_t i, nprobes;
7768 dtrace_helper_provdesc_t dhpv;
7769 dtrace_helper_probedesc_t dhpb;
7770 dtrace_meta_t *meta = dtrace_meta_pid;
7771 dtrace_mops_t *mops = &meta->dtm_mops;
7772 void *parg;
7773
7774 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
7775 str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7776 provider->dofpv_strtab * dof->dofh_secsize);
7777 prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7778 provider->dofpv_probes * dof->dofh_secsize);
7779 arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7780 provider->dofpv_prargs * dof->dofh_secsize);
7781 off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7782 provider->dofpv_proffs * dof->dofh_secsize);
7783
7784 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
7785 off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
7786 arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
7787 enoff = NULL;
7788
7789 /*
7790 * See dtrace_helper_provider_validate().
7791 */
7792 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
7793 provider->dofpv_prenoffs != DOF_SECT_NONE) {
7794 enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7795 provider->dofpv_prenoffs * dof->dofh_secsize);
7796 enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
7797 }
7798
7799 nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
7800
7801 /*
7802 * Create the provider.
7803 */
7804 dtrace_dofprov2hprov(&dhpv, provider, strtab);
7805
7806 if ((parg = mops->dtms_provide_pid(meta->dtm_arg, &dhpv, pid)) == NULL)
7807 return;
7808
7809 meta->dtm_count++;
7810
7811 /*
7812 * Create the probes.
7813 */
7814 for (i = 0; i < nprobes; i++) {
7815 probe = (dof_probe_t *)(uintptr_t)(daddr +
7816 prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
7817
7818 dhpb.dthpb_mod = dhp->dofhp_mod;
7819 dhpb.dthpb_func = strtab + probe->dofpr_func;
7820 dhpb.dthpb_name = strtab + probe->dofpr_name;
7821 dhpb.dthpb_base = probe->dofpr_addr;
7822 dhpb.dthpb_offs = off + probe->dofpr_offidx;
7823 dhpb.dthpb_noffs = probe->dofpr_noffs;
7824 if (enoff != NULL) {
7825 dhpb.dthpb_enoffs = enoff + probe->dofpr_enoffidx;
7826 dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
7827 } else {
7828 dhpb.dthpb_enoffs = NULL;
7829 dhpb.dthpb_nenoffs = 0;
7830 }
7831 dhpb.dthpb_args = arg + probe->dofpr_argidx;
7832 dhpb.dthpb_nargc = probe->dofpr_nargc;
7833 dhpb.dthpb_xargc = probe->dofpr_xargc;
7834 dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
7835 dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;
7836
7837 mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
7838 }
7839}
7840
7841static void
7842dtrace_helper_provide(dof_helper_t *dhp, pid_t pid)
7843{
7844 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
7845 dof_hdr_t *dof = (dof_hdr_t *)daddr;
7846 VBDTTYPE(uint32_t,int) i;
7847
7848 ASSERT(MUTEX_HELD(&dtrace_meta_lock));
7849
7850 for (i = 0; i < dof->dofh_secnum; i++) {
7851 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
7852 dof->dofh_secoff + i * dof->dofh_secsize);
7853
7854 if (sec->dofs_type != DOF_SECT_PROVIDER)
7855 continue;
7856
7857 dtrace_helper_provide_one(dhp, sec, pid);
7858 }
7859
7860 /*
7861 * We may have just created probes, so we must now rematch against
7862 * any retained enablings. Note that this call will acquire both
7863 * cpu_lock and dtrace_lock; the fact that we are holding
7864 * dtrace_meta_lock now is what defines the ordering with respect to
7865 * these three locks.
7866 */
7867 dtrace_enabling_matchall();
7868}
7869
7870static void
7871dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
7872{
7873 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
7874 dof_hdr_t *dof = (dof_hdr_t *)daddr;
7875 dof_sec_t *str_sec;
7876 dof_provider_t *provider;
7877 char *strtab;
7878 dtrace_helper_provdesc_t dhpv;
7879 dtrace_meta_t *meta = dtrace_meta_pid;
7880 dtrace_mops_t *mops = &meta->dtm_mops;
7881
7882 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
7883 str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7884 provider->dofpv_strtab * dof->dofh_secsize);
7885
7886 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
7887
7888 /*
7889 * Create the provider.
7890 */
7891 dtrace_dofprov2hprov(&dhpv, provider, strtab);
7892
7893 mops->dtms_remove_pid(meta->dtm_arg, &dhpv, pid);
7894
7895 meta->dtm_count--;
7896}
7897
7898static void
7899dtrace_helper_provider_remove(dof_helper_t *dhp, pid_t pid)
7900{
7901 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
7902 dof_hdr_t *dof = (dof_hdr_t *)daddr;
7903 VBDTTYPE(uint32_t,int) i;
7904
7905 ASSERT(MUTEX_HELD(&dtrace_meta_lock));
7906
7907 for (i = 0; i < dof->dofh_secnum; i++) {
7908 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
7909 dof->dofh_secoff + i * dof->dofh_secsize);
7910
7911 if (sec->dofs_type != DOF_SECT_PROVIDER)
7912 continue;
7913
7914 dtrace_helper_provider_remove_one(dhp, sec, pid);
7915 }
7916}
7917
7918/*
7919 * DTrace Meta Provider-to-Framework API Functions
7920 *
7921 * These functions implement the Meta Provider-to-Framework API, as described
7922 * in <sys/dtrace.h>.
7923 */
7924int
7925dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
7926 dtrace_meta_provider_id_t *idp)
7927{
7928 dtrace_meta_t *meta;
7929 dtrace_helpers_t *help, *next;
7930 VBDTTYPE(uint32_t,int) i;
7931
7932 *idp = DTRACE_METAPROVNONE;
7933
7934 /*
7935 * We strictly don't need the name, but we hold onto it for
7936 * debuggability. All hail error queues!
7937 */
7938 if (name == NULL) {
7939 cmn_err(CE_WARN, "failed to register meta-provider: "
7940 "invalid name");
7941 return (EINVAL);
7942 }
7943
7944 if (mops == NULL ||
7945 mops->dtms_create_probe == NULL ||
7946 mops->dtms_provide_pid == NULL ||
7947 mops->dtms_remove_pid == NULL) {
7948		cmn_err(CE_WARN, "failed to register meta-provider %s: "
7949		    "invalid ops", name);
7950 return (EINVAL);
7951 }
7952
7953 meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
7954 meta->dtm_mops = *mops;
7955 meta->dtm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
7956 (void) strcpy(meta->dtm_name, name);
7957 meta->dtm_arg = arg;
7958
7959 mutex_enter(&dtrace_meta_lock);
7960 mutex_enter(&dtrace_lock);
7961
7962 if (dtrace_meta_pid != NULL) {
7963 mutex_exit(&dtrace_lock);
7964 mutex_exit(&dtrace_meta_lock);
7965		cmn_err(CE_WARN, "failed to register meta-provider %s: "
7966		    "user-land meta-provider exists", name);
7967 kmem_free(meta->dtm_name, strlen(meta->dtm_name) + 1);
7968 kmem_free(meta, sizeof (dtrace_meta_t));
7969 return (EINVAL);
7970 }
7971
7972 dtrace_meta_pid = meta;
7973 *idp = (dtrace_meta_provider_id_t)meta;
7974
7975 /*
7976 * If there are providers and probes ready to go, pass them
7977 * off to the new meta provider now.
7978 */
7979
7980 help = dtrace_deferred_pid;
7981 dtrace_deferred_pid = NULL;
7982
7983 mutex_exit(&dtrace_lock);
7984
7985 while (help != NULL) {
7986 for (i = 0; i < help->dthps_nprovs; i++) {
7987 dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
7988 help->dthps_pid);
7989 }
7990
7991 next = help->dthps_next;
7992 help->dthps_next = NULL;
7993 help->dthps_prev = NULL;
7994 help->dthps_deferred = 0;
7995 help = next;
7996 }
7997
7998 mutex_exit(&dtrace_meta_lock);
7999
8000 return (0);
8001}
8002
8003int
8004dtrace_meta_unregister(dtrace_meta_provider_id_t id)
8005{
8006 dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;
8007
8008 mutex_enter(&dtrace_meta_lock);
8009 mutex_enter(&dtrace_lock);
8010
8011 if (old == dtrace_meta_pid) {
8012 pp = &dtrace_meta_pid;
8013 } else {
8014 panic("attempt to unregister non-existent "
8015 "dtrace meta-provider %p\n", (void *)old);
8016 }
8017
8018 if (old->dtm_count != 0) {
8019 mutex_exit(&dtrace_lock);
8020 mutex_exit(&dtrace_meta_lock);
8021 return (EBUSY);
8022 }
8023
8024 *pp = NULL;
8025
8026 mutex_exit(&dtrace_lock);
8027 mutex_exit(&dtrace_meta_lock);
8028
8029 kmem_free(old->dtm_name, strlen(old->dtm_name) + 1);
8030 kmem_free(old, sizeof (dtrace_meta_t));
8031
8032 return (0);
8033}
8034
8035
8036/*
8037 * DTrace DIF Object Functions
8038 */
8039static int
8040dtrace_difo_err(uint_t pc, const char *format, ...)
8041{
8042 if (dtrace_err_verbose) {
8043 va_list alist;
8044
8045 (void) uprintf("dtrace DIF object error: [%u]: ", pc);
8046 va_start(alist, format);
8047 (void) vuprintf(format, alist);
8048 va_end(alist);
8049 }
8050
8051#ifdef DTRACE_ERRDEBUG
8052 dtrace_errdebug(format);
8053#endif
8054 return (1);
8055}
8056
8057/*
8058 * Validate a DTrace DIF object by checking the IR instructions. The following
8059 * rules are currently enforced by dtrace_difo_validate():
8060 *
8061 * 1. Each instruction must have a valid opcode
8062 * 2. Each register, string, variable, or subroutine reference must be valid
8063 * 3. No instruction can modify register %r0 (must be zero)
8064 * 4. All instruction reserved bits must be set to zero
8065 * 5. The last instruction must be a "ret" instruction
8066 * 6. All branch targets must reference a valid instruction _after_ the branch
8067 */
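/*
 * For reference when reading the decoder below: judging from the
 * DIF_INSTR_* accessors in <sys/dtrace.h>, each DIF instruction is a
 * single 32-bit word with the opcode in the top byte and its operands
 * packed beneath it, e.g. for the common three-register format:
 *
 *	 31      24 23      16 15       8 7        0
 *	+----------+----------+----------+----------+
 *	|    op    |    r1    |    r2    | rd / rs  |
 *	+----------+----------+----------+----------+
 *
 * Branches reuse the low 24 bits as a label, and variable, integer,
 * string and subroutine references reuse a 16-bit field, which is why
 * the loop below extracts every field unconditionally and lets each
 * opcode's case decide which of them are meaningful.
 */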
8068static int
8069dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
8070 cred_t *cr)
8071{
8072#ifndef VBOX
8073 int err = 0, i;
8074#else
8075 int err = 0;
8076 uint_t i;
8077#endif
8078 int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
8079 int kcheckload;
8080 uint_t pc;
8081
8082 kcheckload = cr == NULL ||
8083 (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;
8084
8085 dp->dtdo_destructive = 0;
8086
8087 for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
8088 dif_instr_t instr = dp->dtdo_buf[pc];
8089
8090 uint_t r1 = DIF_INSTR_R1(instr);
8091 uint_t r2 = DIF_INSTR_R2(instr);
8092 uint_t rd = DIF_INSTR_RD(instr);
8093 uint_t rs = DIF_INSTR_RS(instr);
8094 uint_t label = DIF_INSTR_LABEL(instr);
8095 uint_t v = DIF_INSTR_VAR(instr);
8096 uint_t subr = DIF_INSTR_SUBR(instr);
8097 uint_t type = DIF_INSTR_TYPE(instr);
8098 uint_t op = DIF_INSTR_OP(instr);
8099
8100 switch (op) {
8101 case DIF_OP_OR:
8102 case DIF_OP_XOR:
8103 case DIF_OP_AND:
8104 case DIF_OP_SLL:
8105 case DIF_OP_SRL:
8106 case DIF_OP_SRA:
8107 case DIF_OP_SUB:
8108 case DIF_OP_ADD:
8109 case DIF_OP_MUL:
8110 case DIF_OP_SDIV:
8111 case DIF_OP_UDIV:
8112 case DIF_OP_SREM:
8113 case DIF_OP_UREM:
8114 case DIF_OP_COPYS:
8115 if (r1 >= nregs)
8116 err += efunc(pc, "invalid register %u\n", r1);
8117 if (r2 >= nregs)
8118 err += efunc(pc, "invalid register %u\n", r2);
8119 if (rd >= nregs)
8120 err += efunc(pc, "invalid register %u\n", rd);
8121 if (rd == 0)
8122 err += efunc(pc, "cannot write to %r0\n");
8123 break;
8124 case DIF_OP_NOT:
8125 case DIF_OP_MOV:
8126 case DIF_OP_ALLOCS:
8127 if (r1 >= nregs)
8128 err += efunc(pc, "invalid register %u\n", r1);
8129 if (r2 != 0)
8130 err += efunc(pc, "non-zero reserved bits\n");
8131 if (rd >= nregs)
8132 err += efunc(pc, "invalid register %u\n", rd);
8133 if (rd == 0)
8134 err += efunc(pc, "cannot write to %r0\n");
8135 break;
8136 case DIF_OP_LDSB:
8137 case DIF_OP_LDSH:
8138 case DIF_OP_LDSW:
8139 case DIF_OP_LDUB:
8140 case DIF_OP_LDUH:
8141 case DIF_OP_LDUW:
8142 case DIF_OP_LDX:
8143 if (r1 >= nregs)
8144 err += efunc(pc, "invalid register %u\n", r1);
8145 if (r2 != 0)
8146 err += efunc(pc, "non-zero reserved bits\n");
8147 if (rd >= nregs)
8148 err += efunc(pc, "invalid register %u\n", rd);
8149 if (rd == 0)
8150 err += efunc(pc, "cannot write to %r0\n");
8151 if (kcheckload)
8152 dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
8153 DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
8154 break;
8155 case DIF_OP_RLDSB:
8156 case DIF_OP_RLDSH:
8157 case DIF_OP_RLDSW:
8158 case DIF_OP_RLDUB:
8159 case DIF_OP_RLDUH:
8160 case DIF_OP_RLDUW:
8161 case DIF_OP_RLDX:
8162 if (r1 >= nregs)
8163 err += efunc(pc, "invalid register %u\n", r1);
8164 if (r2 != 0)
8165 err += efunc(pc, "non-zero reserved bits\n");
8166 if (rd >= nregs)
8167 err += efunc(pc, "invalid register %u\n", rd);
8168 if (rd == 0)
8169 err += efunc(pc, "cannot write to %r0\n");
8170 break;
8171 case DIF_OP_ULDSB:
8172 case DIF_OP_ULDSH:
8173 case DIF_OP_ULDSW:
8174 case DIF_OP_ULDUB:
8175 case DIF_OP_ULDUH:
8176 case DIF_OP_ULDUW:
8177 case DIF_OP_ULDX:
8178 if (r1 >= nregs)
8179 err += efunc(pc, "invalid register %u\n", r1);
8180 if (r2 != 0)
8181 err += efunc(pc, "non-zero reserved bits\n");
8182 if (rd >= nregs)
8183 err += efunc(pc, "invalid register %u\n", rd);
8184 if (rd == 0)
8185 err += efunc(pc, "cannot write to %r0\n");
8186 break;
8187 case DIF_OP_STB:
8188 case DIF_OP_STH:
8189 case DIF_OP_STW:
8190 case DIF_OP_STX:
8191 if (r1 >= nregs)
8192 err += efunc(pc, "invalid register %u\n", r1);
8193 if (r2 != 0)
8194 err += efunc(pc, "non-zero reserved bits\n");
8195 if (rd >= nregs)
8196 err += efunc(pc, "invalid register %u\n", rd);
8197 if (rd == 0)
8198 err += efunc(pc, "cannot write to 0 address\n");
8199 break;
8200 case DIF_OP_CMP:
8201 case DIF_OP_SCMP:
8202 if (r1 >= nregs)
8203 err += efunc(pc, "invalid register %u\n", r1);
8204 if (r2 >= nregs)
8205 err += efunc(pc, "invalid register %u\n", r2);
8206 if (rd != 0)
8207 err += efunc(pc, "non-zero reserved bits\n");
8208 break;
8209 case DIF_OP_TST:
8210 if (r1 >= nregs)
8211 err += efunc(pc, "invalid register %u\n", r1);
8212 if (r2 != 0 || rd != 0)
8213 err += efunc(pc, "non-zero reserved bits\n");
8214 break;
8215 case DIF_OP_BA:
8216 case DIF_OP_BE:
8217 case DIF_OP_BNE:
8218 case DIF_OP_BG:
8219 case DIF_OP_BGU:
8220 case DIF_OP_BGE:
8221 case DIF_OP_BGEU:
8222 case DIF_OP_BL:
8223 case DIF_OP_BLU:
8224 case DIF_OP_BLE:
8225 case DIF_OP_BLEU:
8226 if (label >= dp->dtdo_len) {
8227 err += efunc(pc, "invalid branch target %u\n",
8228 label);
8229 }
8230 if (label <= pc) {
8231 err += efunc(pc, "backward branch to %u\n",
8232 label);
8233 }
8234 break;
8235 case DIF_OP_RET:
8236 if (r1 != 0 || r2 != 0)
8237 err += efunc(pc, "non-zero reserved bits\n");
8238 if (rd >= nregs)
8239 err += efunc(pc, "invalid register %u\n", rd);
8240 break;
8241 case DIF_OP_NOP:
8242 case DIF_OP_POPTS:
8243 case DIF_OP_FLUSHTS:
8244 if (r1 != 0 || r2 != 0 || rd != 0)
8245 err += efunc(pc, "non-zero reserved bits\n");
8246 break;
8247 case DIF_OP_SETX:
8248 if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
8249 err += efunc(pc, "invalid integer ref %u\n",
8250 DIF_INSTR_INTEGER(instr));
8251 }
8252 if (rd >= nregs)
8253 err += efunc(pc, "invalid register %u\n", rd);
8254 if (rd == 0)
8255 err += efunc(pc, "cannot write to %r0\n");
8256 break;
8257 case DIF_OP_SETS:
8258 if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
8259 err += efunc(pc, "invalid string ref %u\n",
8260 DIF_INSTR_STRING(instr));
8261 }
8262 if (rd >= nregs)
8263 err += efunc(pc, "invalid register %u\n", rd);
8264 if (rd == 0)
8265 err += efunc(pc, "cannot write to %r0\n");
8266 break;
8267 case DIF_OP_LDGA:
8268 case DIF_OP_LDTA:
8269 if (r1 > DIF_VAR_ARRAY_MAX)
8270 err += efunc(pc, "invalid array %u\n", r1);
8271 if (r2 >= nregs)
8272 err += efunc(pc, "invalid register %u\n", r2);
8273 if (rd >= nregs)
8274 err += efunc(pc, "invalid register %u\n", rd);
8275 if (rd == 0)
8276 err += efunc(pc, "cannot write to %r0\n");
8277 break;
8278 case DIF_OP_LDGS:
8279 case DIF_OP_LDTS:
8280 case DIF_OP_LDLS:
8281 case DIF_OP_LDGAA:
8282 case DIF_OP_LDTAA:
8283 if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
8284 err += efunc(pc, "invalid variable %u\n", v);
8285 if (rd >= nregs)
8286 err += efunc(pc, "invalid register %u\n", rd);
8287 if (rd == 0)
8288 err += efunc(pc, "cannot write to %r0\n");
8289 break;
8290 case DIF_OP_STGS:
8291 case DIF_OP_STTS:
8292 case DIF_OP_STLS:
8293 case DIF_OP_STGAA:
8294 case DIF_OP_STTAA:
8295 if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
8296 err += efunc(pc, "invalid variable %u\n", v);
8297 if (rs >= nregs)
8298			err += efunc(pc, "invalid register %u\n", rs);
8299 break;
8300 case DIF_OP_CALL:
8301 if (subr > DIF_SUBR_MAX)
8302 err += efunc(pc, "invalid subr %u\n", subr);
8303 if (rd >= nregs)
8304 err += efunc(pc, "invalid register %u\n", rd);
8305 if (rd == 0)
8306 err += efunc(pc, "cannot write to %r0\n");
8307
8308 if (subr == DIF_SUBR_COPYOUT ||
8309 subr == DIF_SUBR_COPYOUTSTR) {
8310 dp->dtdo_destructive = 1;
8311 }
8312 break;
8313 case DIF_OP_PUSHTR:
8314 if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
8315 err += efunc(pc, "invalid ref type %u\n", type);
8316 if (r2 >= nregs)
8317 err += efunc(pc, "invalid register %u\n", r2);
8318 if (rs >= nregs)
8319 err += efunc(pc, "invalid register %u\n", rs);
8320 break;
8321 case DIF_OP_PUSHTV:
8322 if (type != DIF_TYPE_CTF)
8323 err += efunc(pc, "invalid val type %u\n", type);
8324 if (r2 >= nregs)
8325 err += efunc(pc, "invalid register %u\n", r2);
8326 if (rs >= nregs)
8327 err += efunc(pc, "invalid register %u\n", rs);
8328 break;
8329 default:
8330 err += efunc(pc, "invalid opcode %u\n",
8331 DIF_INSTR_OP(instr));
8332 }
8333 }
8334
8335 if (dp->dtdo_len != 0 &&
8336 DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) {
8337 err += efunc(dp->dtdo_len - 1,
8338 "expected 'ret' as last DIF instruction\n");
8339 }
8340
8341 if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF)) {
8342 /*
8343 * If we're not returning by reference, the size must be either
8344 * 0 or the size of one of the base types.
8345 */
8346 switch (dp->dtdo_rtype.dtdt_size) {
8347 case 0:
8348 case sizeof (uint8_t):
8349 case sizeof (uint16_t):
8350 case sizeof (uint32_t):
8351 case sizeof (uint64_t):
8352 break;
8353
8354 default:
8355 err += efunc(dp->dtdo_len - 1, "bad return size\n");
8356 }
8357 }
8358
8359 for (i = 0; i < dp->dtdo_varlen && err == 0; i++) {
8360 dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL;
8361 dtrace_diftype_t *vt, *et;
8362 uint_t id, ndx;
8363
8364 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&
8365 v->dtdv_scope != DIFV_SCOPE_THREAD &&
8366 v->dtdv_scope != DIFV_SCOPE_LOCAL) {
8367 err += efunc(i, "unrecognized variable scope %d\n",
8368 v->dtdv_scope);
8369 break;
8370 }
8371
8372 if (v->dtdv_kind != DIFV_KIND_ARRAY &&
8373 v->dtdv_kind != DIFV_KIND_SCALAR) {
8374 err += efunc(i, "unrecognized variable type %d\n",
8375 v->dtdv_kind);
8376 break;
8377 }
8378
8379 if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {
8380 err += efunc(i, "%d exceeds variable id limit\n", id);
8381 break;
8382 }
8383
8384 if (id < DIF_VAR_OTHER_UBASE)
8385 continue;
8386
8387 /*
8388 * For user-defined variables, we need to check that this
8389 * definition is identical to any previous definition that we
8390 * encountered.
8391 */
8392 ndx = id - DIF_VAR_OTHER_UBASE;
8393
8394 switch (v->dtdv_scope) {
8395 case DIFV_SCOPE_GLOBAL:
8396 if (VBDTCAST(int64_t)ndx < vstate->dtvs_nglobals) {
8397 dtrace_statvar_t *svar;
8398
8399 if ((svar = vstate->dtvs_globals[ndx]) != NULL)
8400 existing = &svar->dtsv_var;
8401 }
8402
8403 break;
8404
8405 case DIFV_SCOPE_THREAD:
8406 if (VBDTCAST(int64_t)ndx < vstate->dtvs_ntlocals)
8407 existing = &vstate->dtvs_tlocals[ndx];
8408 break;
8409
8410 case DIFV_SCOPE_LOCAL:
8411 if (VBDTCAST(int64_t)ndx < vstate->dtvs_nlocals) {
8412 dtrace_statvar_t *svar;
8413
8414 if ((svar = vstate->dtvs_locals[ndx]) != NULL)
8415 existing = &svar->dtsv_var;
8416 }
8417
8418 break;
8419 }
8420
8421 vt = &v->dtdv_type;
8422
8423 if (vt->dtdt_flags & DIF_TF_BYREF) {
8424 if (vt->dtdt_size == 0) {
8425 err += efunc(i, "zero-sized variable\n");
8426 break;
8427 }
8428
8429 if (v->dtdv_scope == DIFV_SCOPE_GLOBAL &&
8430 vt->dtdt_size > dtrace_global_maxsize) {
8431 err += efunc(i, "oversized by-ref global\n");
8432 break;
8433 }
8434 }
8435
8436 if (existing == NULL || existing->dtdv_id == 0)
8437 continue;
8438
8439 ASSERT(existing->dtdv_id == v->dtdv_id);
8440 ASSERT(existing->dtdv_scope == v->dtdv_scope);
8441
8442 if (existing->dtdv_kind != v->dtdv_kind)
8443 err += efunc(i, "%d changed variable kind\n", id);
8444
8445 et = &existing->dtdv_type;
8446
8447 if (vt->dtdt_flags != et->dtdt_flags) {
8448 err += efunc(i, "%d changed variable type flags\n", id);
8449 break;
8450 }
8451
8452 if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {
8453 err += efunc(i, "%d changed variable type size\n", id);
8454 break;
8455 }
8456 }
8457
8458 return (err);
8459}
8460
8461/*
8462 * Validate a DTrace DIF object that is to be used as a helper.  Helpers
8463 * are much more constrained than normal DIFOs. Specifically, they may
8464 * not:
8465 *
8466 * 1. Make calls to subroutines other than copyin(), copyinstr() or
8467 *    miscellaneous string routines.
8468 * 2. Access DTrace variables other than the args[] array, and the
8469 * curthread, pid, ppid, tid, execname, zonename, uid and gid variables.
8470 * 3. Have thread-local variables.
8471 * 4. Have dynamic variables.
8472 */
8473static int
8474dtrace_difo_validate_helper(dtrace_difo_t *dp)
8475{
8476 int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
8477 int err = 0;
8478 uint_t pc;
8479
8480 for (pc = 0; pc < dp->dtdo_len; pc++) {
8481 dif_instr_t instr = dp->dtdo_buf[pc];
8482
8483 uint_t v = DIF_INSTR_VAR(instr);
8484 uint_t subr = DIF_INSTR_SUBR(instr);
8485 uint_t op = DIF_INSTR_OP(instr);
8486
8487 switch (op) {
8488 case DIF_OP_OR:
8489 case DIF_OP_XOR:
8490 case DIF_OP_AND:
8491 case DIF_OP_SLL:
8492 case DIF_OP_SRL:
8493 case DIF_OP_SRA:
8494 case DIF_OP_SUB:
8495 case DIF_OP_ADD:
8496 case DIF_OP_MUL:
8497 case DIF_OP_SDIV:
8498 case DIF_OP_UDIV:
8499 case DIF_OP_SREM:
8500 case DIF_OP_UREM:
8501 case DIF_OP_COPYS:
8502 case DIF_OP_NOT:
8503 case DIF_OP_MOV:
8504 case DIF_OP_RLDSB:
8505 case DIF_OP_RLDSH:
8506 case DIF_OP_RLDSW:
8507 case DIF_OP_RLDUB:
8508 case DIF_OP_RLDUH:
8509 case DIF_OP_RLDUW:
8510 case DIF_OP_RLDX:
8511 case DIF_OP_ULDSB:
8512 case DIF_OP_ULDSH:
8513 case DIF_OP_ULDSW:
8514 case DIF_OP_ULDUB:
8515 case DIF_OP_ULDUH:
8516 case DIF_OP_ULDUW:
8517 case DIF_OP_ULDX:
8518 case DIF_OP_STB:
8519 case DIF_OP_STH:
8520 case DIF_OP_STW:
8521 case DIF_OP_STX:
8522 case DIF_OP_ALLOCS:
8523 case DIF_OP_CMP:
8524 case DIF_OP_SCMP:
8525 case DIF_OP_TST:
8526 case DIF_OP_BA:
8527 case DIF_OP_BE:
8528 case DIF_OP_BNE:
8529 case DIF_OP_BG:
8530 case DIF_OP_BGU:
8531 case DIF_OP_BGE:
8532 case DIF_OP_BGEU:
8533 case DIF_OP_BL:
8534 case DIF_OP_BLU:
8535 case DIF_OP_BLE:
8536 case DIF_OP_BLEU:
8537 case DIF_OP_RET:
8538 case DIF_OP_NOP:
8539 case DIF_OP_POPTS:
8540 case DIF_OP_FLUSHTS:
8541 case DIF_OP_SETX:
8542 case DIF_OP_SETS:
8543 case DIF_OP_LDGA:
8544 case DIF_OP_LDLS:
8545 case DIF_OP_STGS:
8546 case DIF_OP_STLS:
8547 case DIF_OP_PUSHTR:
8548 case DIF_OP_PUSHTV:
8549 break;
8550
8551 case DIF_OP_LDGS:
8552 if (v >= DIF_VAR_OTHER_UBASE)
8553 break;
8554
8555 if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)
8556 break;
8557
8558 if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID ||
8559 v == DIF_VAR_PPID || v == DIF_VAR_TID ||
8560 v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME ||
8561 v == DIF_VAR_UID || v == DIF_VAR_GID)
8562 break;
8563
8564 err += efunc(pc, "illegal variable %u\n", v);
8565 break;
8566
8567 case DIF_OP_LDTA:
8568 case DIF_OP_LDTS:
8569 case DIF_OP_LDGAA:
8570 case DIF_OP_LDTAA:
8571 err += efunc(pc, "illegal dynamic variable load\n");
8572 break;
8573
8574 case DIF_OP_STTS:
8575 case DIF_OP_STGAA:
8576 case DIF_OP_STTAA:
8577 err += efunc(pc, "illegal dynamic variable store\n");
8578 break;
8579
8580 case DIF_OP_CALL:
8581 if (subr == DIF_SUBR_ALLOCA ||
8582 subr == DIF_SUBR_BCOPY ||
8583 subr == DIF_SUBR_COPYIN ||
8584 subr == DIF_SUBR_COPYINTO ||
8585 subr == DIF_SUBR_COPYINSTR ||
8586 subr == DIF_SUBR_INDEX ||
8587 subr == DIF_SUBR_INET_NTOA ||
8588 subr == DIF_SUBR_INET_NTOA6 ||
8589 subr == DIF_SUBR_INET_NTOP ||
8590 subr == DIF_SUBR_LLTOSTR ||
8591 subr == DIF_SUBR_RINDEX ||
8592 subr == DIF_SUBR_STRCHR ||
8593 subr == DIF_SUBR_STRJOIN ||
8594 subr == DIF_SUBR_STRRCHR ||
8595 subr == DIF_SUBR_STRSTR ||
8596 subr == DIF_SUBR_HTONS ||
8597 subr == DIF_SUBR_HTONL ||
8598 subr == DIF_SUBR_HTONLL ||
8599 subr == DIF_SUBR_NTOHS ||
8600 subr == DIF_SUBR_NTOHL ||
8601 subr == DIF_SUBR_NTOHLL)
8602 break;
8603
8604 err += efunc(pc, "invalid subr %u\n", subr);
8605 break;
8606
8607 default:
8608 err += efunc(pc, "invalid opcode %u\n",
8609 DIF_INSTR_OP(instr));
8610 }
8611 }
8612
8613 return (err);
8614}
8615
8616/*
8617 * Returns 1 if the expression in the DIF object can be cached on a per-thread
8618 * basis; 0 if not.
8619 */
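/*
 * As an illustrative (hypothetical) example, a predicate such as
 * /pid == 1234/ references only DIF_VAR_PID and a constant, performs
 * no loads and no thread-local stores, and so passes the checks below;
 * its verdict can then be cached under a predicate cache ID (see
 * dtrace_predicate_create() and dtpr_predcache) instead of being
 * re-evaluated on every firing.
 */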
8620static int
8621dtrace_difo_cacheable(dtrace_difo_t *dp)
8622{
8623 VBDTTYPE(uint_t,int) i;
8624
8625 if (dp == NULL)
8626 return (0);
8627
8628 for (i = 0; i < dp->dtdo_varlen; i++) {
8629 dtrace_difv_t *v = &dp->dtdo_vartab[i];
8630
8631 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
8632 continue;
8633
8634 switch (v->dtdv_id) {
8635 case DIF_VAR_CURTHREAD:
8636 case DIF_VAR_PID:
8637 case DIF_VAR_TID:
8638 case DIF_VAR_EXECNAME:
8639 case DIF_VAR_ZONENAME:
8640 break;
8641
8642 default:
8643 return (0);
8644 }
8645 }
8646
8647 /*
8648 * This DIF object may be cacheable. Now we need to look for any
8649 * array loading instructions, any memory loading instructions, or
8650 * any stores to thread-local variables.
8651 */
8652 for (i = 0; i < dp->dtdo_len; i++) {
8653 uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);
8654
8655 if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||
8656 (op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||
8657 (op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||
8658 op == DIF_OP_LDGA || op == DIF_OP_STTS)
8659 return (0);
8660 }
8661
8662 return (1);
8663}
8664
8665static void
8666dtrace_difo_hold(dtrace_difo_t *dp)
8667{
8668 VBDTTYPE(uint_t,int) i;
8669
8670 ASSERT(MUTEX_HELD(&dtrace_lock));
8671
8672 dp->dtdo_refcnt++;
8673 ASSERT(dp->dtdo_refcnt != 0);
8674
8675 /*
8676 * We need to check this DIF object for references to the variable
8677 * DIF_VAR_VTIMESTAMP.
8678 */
8679 for (i = 0; i < dp->dtdo_varlen; i++) {
8680 dtrace_difv_t *v = &dp->dtdo_vartab[i];
8681
8682 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
8683 continue;
8684
8685 if (dtrace_vtime_references++ == 0)
8686 dtrace_vtime_enable();
8687 }
8688}
8689
8690/*
8691 * This routine calculates the dynamic variable chunksize for a given DIF
8692 * object. The calculation is not fool-proof, and can probably be tricked by
8693 * malicious DIF -- but it works for all compiler-generated DIF. Because this
8694 * calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
8695 * if a dynamic variable size exceeds the chunksize.
8696 */
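/*
 * As a worked example of the estimate computed below: a thread-local
 * scalar store (DIF_OP_STTS) contributes two zero-sized keys, so for a
 * variable whose value occupies dtdt_size bytes the allocation is
 *
 *	size = sizeof (dtrace_dynvar_t)			(variable header)
 *	     + sizeof (dtrace_key_t) * (2 - 1)		(keys past the first)
 *	     + 0					(zero-sized key data)
 *	     + dtdt_size				(the stored value)
 *
 * rounded up to a multiple of sizeof (uint64_t) by P2ROUNDUP.
 */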
8697static void
8698dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
8699{
8700 uint64_t sval;
8701 dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
8702 const dif_instr_t *text = dp->dtdo_buf;
8703 uint_t pc, srd = 0;
8704 uint_t ttop = 0;
8705 size_t size, ksize;
8706 uint_t id, i;
8707
8708 for (pc = 0; pc < dp->dtdo_len; pc++) {
8709 dif_instr_t instr = text[pc];
8710 uint_t op = DIF_INSTR_OP(instr);
8711 uint_t rd = DIF_INSTR_RD(instr);
8712 uint_t r1 = DIF_INSTR_R1(instr);
8713 uint_t nkeys = 0;
8714 uchar_t scope;
8715
8716 dtrace_key_t *key = tupregs;
8717
8718 switch (op) {
8719 case DIF_OP_SETX:
8720 sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
8721 srd = rd;
8722 continue;
8723
8724 case DIF_OP_STTS:
8725 key = &tupregs[DIF_DTR_NREGS];
8726 key[0].dttk_size = 0;
8727 key[1].dttk_size = 0;
8728 nkeys = 2;
8729 scope = DIFV_SCOPE_THREAD;
8730 break;
8731
8732 case DIF_OP_STGAA:
8733 case DIF_OP_STTAA:
8734 nkeys = ttop;
8735
8736 if (DIF_INSTR_OP(instr) == DIF_OP_STTAA)
8737 key[nkeys++].dttk_size = 0;
8738
8739 key[nkeys++].dttk_size = 0;
8740
8741 if (op == DIF_OP_STTAA) {
8742 scope = DIFV_SCOPE_THREAD;
8743 } else {
8744 scope = DIFV_SCOPE_GLOBAL;
8745 }
8746
8747 break;
8748
8749 case DIF_OP_PUSHTR:
8750 if (ttop == DIF_DTR_NREGS)
8751 return;
8752
8753 if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {
8754 /*
8755 * If the register for the size of the "pushtr"
8756 * is %r0 (or the value is 0) and the type is
8757 * a string, we'll use the system-wide default
8758 * string size.
8759 */
8760 tupregs[ttop++].dttk_size =
8761 dtrace_strsize_default;
8762 } else {
8763 if (srd == 0)
8764 return;
8765
8766 tupregs[ttop++].dttk_size = sval;
8767 }
8768
8769 break;
8770
8771 case DIF_OP_PUSHTV:
8772 if (ttop == DIF_DTR_NREGS)
8773 return;
8774
8775 tupregs[ttop++].dttk_size = 0;
8776 break;
8777
8778 case DIF_OP_FLUSHTS:
8779 ttop = 0;
8780 break;
8781
8782 case DIF_OP_POPTS:
8783 if (ttop != 0)
8784 ttop--;
8785 break;
8786 }
8787
8788 sval = 0;
8789 srd = 0;
8790
8791 if (nkeys == 0)
8792 continue;
8793
8794 /*
8795 * We have a dynamic variable allocation; calculate its size.
8796 */
8797 for (ksize = 0, i = 0; i < nkeys; i++)
8798 ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
8799
8800 size = sizeof (dtrace_dynvar_t);
8801 size += sizeof (dtrace_key_t) * (nkeys - 1);
8802 size += ksize;
8803
8804 /*
8805 * Now we need to determine the size of the stored data.
8806 */
8807 id = DIF_INSTR_VAR(instr);
8808
8809 for (i = 0; i < dp->dtdo_varlen; i++) {
8810 dtrace_difv_t *v = &dp->dtdo_vartab[i];
8811
8812 if (v->dtdv_id == id && v->dtdv_scope == scope) {
8813 size += v->dtdv_type.dtdt_size;
8814 break;
8815 }
8816 }
8817
8818 if (i == dp->dtdo_varlen)
8819 return;
8820
8821 /*
8822 * We have the size. If this is larger than the chunk size
8823 * for our dynamic variable state, reset the chunk size.
8824 */
8825 size = P2ROUNDUP(size, sizeof (uint64_t));
8826
8827 if (size > vstate->dtvs_dynvars.dtds_chunksize)
8828 vstate->dtvs_dynvars.dtds_chunksize = size;
8829 }
8830}
8831
8832static void
8833dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
8834{
8835#ifndef VBOX
8836 int i, oldsvars, osz, nsz, otlocals, ntlocals;
8837#else
8838 int oldsvars, osz, nsz, otlocals, ntlocals;
8839 uint_t i;
8840#endif
8841 uint_t id;
8842
8843 ASSERT(MUTEX_HELD(&dtrace_lock));
8844 ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);
8845
8846 for (i = 0; i < dp->dtdo_varlen; i++) {
8847 dtrace_difv_t *v = &dp->dtdo_vartab[i];
8848 dtrace_statvar_t *svar, ***svarp;
8849 size_t dsize = 0;
8850 uint8_t scope = v->dtdv_scope;
8851 int *np;
8852
8853 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
8854 continue;
8855
8856 id -= DIF_VAR_OTHER_UBASE;
8857
8858 switch (scope) {
8859 case DIFV_SCOPE_THREAD:
8860 while (VBDTCAST(int64_t)id >= (otlocals = vstate->dtvs_ntlocals)) {
8861 dtrace_difv_t *tlocals;
8862
8863 if ((ntlocals = (otlocals << 1)) == 0)
8864 ntlocals = 1;
8865
8866 osz = otlocals * sizeof (dtrace_difv_t);
8867 nsz = ntlocals * sizeof (dtrace_difv_t);
8868
8869 tlocals = kmem_zalloc(nsz, KM_SLEEP);
8870
8871 if (osz != 0) {
8872 bcopy(vstate->dtvs_tlocals,
8873 tlocals, osz);
8874 kmem_free(vstate->dtvs_tlocals, osz);
8875 }
8876
8877 vstate->dtvs_tlocals = tlocals;
8878 vstate->dtvs_ntlocals = ntlocals;
8879 }
8880
8881 vstate->dtvs_tlocals[id] = *v;
8882 continue;
8883
8884 case DIFV_SCOPE_LOCAL:
8885 np = &vstate->dtvs_nlocals;
8886 svarp = &vstate->dtvs_locals;
8887
8888 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
8889 dsize = NCPU * (v->dtdv_type.dtdt_size +
8890 sizeof (uint64_t));
8891 else
8892 dsize = NCPU * sizeof (uint64_t);
8893
8894 break;
8895
8896 case DIFV_SCOPE_GLOBAL:
8897 np = &vstate->dtvs_nglobals;
8898 svarp = &vstate->dtvs_globals;
8899
8900 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
8901 dsize = v->dtdv_type.dtdt_size +
8902 sizeof (uint64_t);
8903
8904 break;
8905
8906 default:
8907 ASSERT(0);
8908 }
8909
8910 while (VBDTCAST(int64_t)id >= (oldsvars = *np)) {
8911 dtrace_statvar_t **statics;
8912 int newsvars, oldsize, newsize;
8913
8914 if ((newsvars = (oldsvars << 1)) == 0)
8915 newsvars = 1;
8916
8917 oldsize = oldsvars * sizeof (dtrace_statvar_t *);
8918 newsize = newsvars * sizeof (dtrace_statvar_t *);
8919
8920 statics = kmem_zalloc(newsize, KM_SLEEP);
8921
8922 if (oldsize != 0) {
8923 bcopy(*svarp, statics, oldsize);
8924 kmem_free(*svarp, oldsize);
8925 }
8926
8927 *svarp = statics;
8928 *np = newsvars;
8929 }
8930
8931 if ((svar = (*svarp)[id]) == NULL) {
8932 svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
8933 svar->dtsv_var = *v;
8934
8935 if ((svar->dtsv_size = dsize) != 0) {
8936 svar->dtsv_data = (uint64_t)(uintptr_t)
8937 kmem_zalloc(dsize, KM_SLEEP);
8938 }
8939
8940 (*svarp)[id] = svar;
8941 }
8942
8943 svar->dtsv_refcnt++;
8944 }
8945
8946 dtrace_difo_chunksize(dp, vstate);
8947 dtrace_difo_hold(dp);
8948}
8949
8950static dtrace_difo_t *
8951dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
8952{
8953 dtrace_difo_t *new;
8954 size_t sz;
8955
8956 ASSERT(dp->dtdo_buf != NULL);
8957 ASSERT(dp->dtdo_refcnt != 0);
8958
8959 new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
8960
8961 ASSERT(dp->dtdo_buf != NULL);
8962 sz = dp->dtdo_len * sizeof (dif_instr_t);
8963 new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
8964 bcopy(dp->dtdo_buf, new->dtdo_buf, sz);
8965 new->dtdo_len = dp->dtdo_len;
8966
8967 if (dp->dtdo_strtab != NULL) {
8968 ASSERT(dp->dtdo_strlen != 0);
8969 new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
8970 bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);
8971 new->dtdo_strlen = dp->dtdo_strlen;
8972 }
8973
8974 if (dp->dtdo_inttab != NULL) {
8975 ASSERT(dp->dtdo_intlen != 0);
8976 sz = dp->dtdo_intlen * sizeof (uint64_t);
8977 new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
8978 bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);
8979 new->dtdo_intlen = dp->dtdo_intlen;
8980 }
8981
8982 if (dp->dtdo_vartab != NULL) {
8983 ASSERT(dp->dtdo_varlen != 0);
8984 sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
8985 new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
8986 bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);
8987 new->dtdo_varlen = dp->dtdo_varlen;
8988 }
8989
8990 dtrace_difo_init(new, vstate);
8991 return (new);
8992}
8993
8994static void
8995dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
8996{
8997 VBDTTYPE(uint_t,int) i;
8998
8999 ASSERT(dp->dtdo_refcnt == 0);
9000
9001 for (i = 0; i < dp->dtdo_varlen; i++) {
9002 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9003 dtrace_statvar_t *svar, **svarp;
9004 uint_t id;
9005 uint8_t scope = v->dtdv_scope;
9006 int *np;
9007
9008 switch (scope) {
9009 case DIFV_SCOPE_THREAD:
9010 continue;
9011
9012 case DIFV_SCOPE_LOCAL:
9013 np = &vstate->dtvs_nlocals;
9014 svarp = vstate->dtvs_locals;
9015 break;
9016
9017 case DIFV_SCOPE_GLOBAL:
9018 np = &vstate->dtvs_nglobals;
9019 svarp = vstate->dtvs_globals;
9020 break;
9021
9022 default:
9023 ASSERT(0);
9024 }
9025
9026 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
9027 continue;
9028
9029 id -= DIF_VAR_OTHER_UBASE;
9030 ASSERT(VBDTCAST(int64_t)id < *np);
9031
9032 svar = svarp[id];
9033 ASSERT(svar != NULL);
9034 ASSERT(svar->dtsv_refcnt > 0);
9035
9036 if (--svar->dtsv_refcnt > 0)
9037 continue;
9038
9039 if (svar->dtsv_size != 0) {
9040 ASSERT(svar->dtsv_data != NULL);
9041 kmem_free((void *)(uintptr_t)svar->dtsv_data,
9042 svar->dtsv_size);
9043 }
9044
9045 kmem_free(svar, sizeof (dtrace_statvar_t));
9046 svarp[id] = NULL;
9047 }
9048
9049 kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
9050 kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
9051 kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
9052 kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
9053
9054 kmem_free(dp, sizeof (dtrace_difo_t));
9055}
9056
9057static void
9058dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9059{
9060 VBDTTYPE(uint_t,int) i;
9061
9062 ASSERT(MUTEX_HELD(&dtrace_lock));
9063 ASSERT(dp->dtdo_refcnt != 0);
9064
9065 for (i = 0; i < dp->dtdo_varlen; i++) {
9066 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9067
9068 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
9069 continue;
9070
9071 ASSERT(dtrace_vtime_references > 0);
9072 if (--dtrace_vtime_references == 0)
9073 dtrace_vtime_disable();
9074 }
9075
9076 if (--dp->dtdo_refcnt == 0)
9077 dtrace_difo_destroy(dp, vstate);
9078}
9079
9080/*
9081 * DTrace Format Functions
9082 */
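/*
 * Note that format handles are 1-based: dtrace_format_add() returns
 * ndx + 1 so that 0 can mean "no format" (cf. the format handling in
 * dtrace_ecb_action_add()), and dtrace_format_remove() correspondingly
 * subtracts one before indexing dts_formats.
 */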
9083static uint16_t
9084dtrace_format_add(dtrace_state_t *state, char *str)
9085{
9086 char *fmt, **new;
9087 uint16_t ndx, len = VBDTCAST(uint16_t)strlen(str) + 1;
9088
9089 fmt = kmem_zalloc(len, KM_SLEEP);
9090 bcopy(str, fmt, len);
9091
9092 for (ndx = 0; ndx < state->dts_nformats; ndx++) {
9093 if (state->dts_formats[ndx] == NULL) {
9094 state->dts_formats[ndx] = fmt;
9095 return (ndx + 1);
9096 }
9097 }
9098
9099 if (state->dts_nformats == USHRT_MAX) {
9100 /*
9101 * This is only likely if a denial-of-service attack is being
9102 * attempted. As such, it's okay to fail silently here.
9103 */
9104 kmem_free(fmt, len);
9105 return (0);
9106 }
9107
9108 /*
9109 * For simplicity, we always resize the formats array to be exactly the
9110 * number of formats.
9111 */
9112 ndx = state->dts_nformats++;
9113 new = kmem_alloc((ndx + 1) * sizeof (char *), KM_SLEEP);
9114
9115 if (state->dts_formats != NULL) {
9116 ASSERT(ndx != 0);
9117 bcopy(state->dts_formats, new, ndx * sizeof (char *));
9118 kmem_free(state->dts_formats, ndx * sizeof (char *));
9119 }
9120
9121 state->dts_formats = new;
9122 state->dts_formats[ndx] = fmt;
9123
9124 return (ndx + 1);
9125}
9126
9127static void
9128dtrace_format_remove(dtrace_state_t *state, uint16_t format)
9129{
9130 char *fmt;
9131
9132 ASSERT(state->dts_formats != NULL);
9133 ASSERT(format <= state->dts_nformats);
9134 ASSERT(state->dts_formats[format - 1] != NULL);
9135
9136 fmt = state->dts_formats[format - 1];
9137 kmem_free(fmt, strlen(fmt) + 1);
9138 state->dts_formats[format - 1] = NULL;
9139}
9140
9141static void
9142dtrace_format_destroy(dtrace_state_t *state)
9143{
9144 int i;
9145
9146 if (state->dts_nformats == 0) {
9147 ASSERT(state->dts_formats == NULL);
9148 return;
9149 }
9150
9151 ASSERT(state->dts_formats != NULL);
9152
9153 for (i = 0; i < state->dts_nformats; i++) {
9154 char *fmt = state->dts_formats[i];
9155
9156 if (fmt == NULL)
9157 continue;
9158
9159 kmem_free(fmt, strlen(fmt) + 1);
9160 }
9161
9162 kmem_free(state->dts_formats, state->dts_nformats * sizeof (char *));
9163 state->dts_nformats = 0;
9164 state->dts_formats = NULL;
9165}
9166
9167/*
9168 * DTrace Predicate Functions
9169 */
9170static dtrace_predicate_t *
9171dtrace_predicate_create(dtrace_difo_t *dp)
9172{
9173 dtrace_predicate_t *pred;
9174
9175 ASSERT(MUTEX_HELD(&dtrace_lock));
9176 ASSERT(dp->dtdo_refcnt != 0);
9177
9178 pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
9179 pred->dtp_difo = dp;
9180 pred->dtp_refcnt = 1;
9181
9182 if (!dtrace_difo_cacheable(dp))
9183 return (pred);
9184
9185 if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
9186 /*
9187 * This is only theoretically possible -- we have had 2^32
9188 * cacheable predicates on this machine. We cannot allow any
9189 * more predicates to become cacheable: as unlikely as it is,
9190 * there may be a thread caching a (now stale) predicate cache
9191 * ID. (N.B.: the temptation is being successfully resisted to
9192 * have this cmn_err() "Holy shit -- we executed this code!")
9193 */
9194 return (pred);
9195 }
9196
9197 pred->dtp_cacheid = dtrace_predcache_id++;
9198
9199 return (pred);
9200}
9201
9202static void
9203dtrace_predicate_hold(dtrace_predicate_t *pred)
9204{
9205 ASSERT(MUTEX_HELD(&dtrace_lock));
9206 ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
9207 ASSERT(pred->dtp_refcnt > 0);
9208
9209 pred->dtp_refcnt++;
9210}
9211
9212static void
9213dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
9214{
9215 dtrace_difo_t *dp = pred->dtp_difo;
9216
9217 ASSERT(MUTEX_HELD(&dtrace_lock));
9218 ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
9219 ASSERT(pred->dtp_refcnt > 0);
9220
9221 if (--pred->dtp_refcnt == 0) {
9222 dtrace_difo_release(pred->dtp_difo, vstate);
9223 kmem_free(pred, sizeof (dtrace_predicate_t));
9224 }
9225}
9226
9227/*
9228 * DTrace Action Description Functions
9229 */
9230static dtrace_actdesc_t *
9231dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
9232 uint64_t uarg, uint64_t arg)
9233{
9234 dtrace_actdesc_t *act;
9235
9236 ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != NULL &&
9237 arg >= KERNELBASE) || (arg == NULL && kind == DTRACEACT_PRINTA));
9238
9239 act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
9240 act->dtad_kind = kind;
9241 act->dtad_ntuple = ntuple;
9242 act->dtad_uarg = uarg;
9243 act->dtad_arg = arg;
9244 act->dtad_refcnt = 1;
9245
9246 return (act);
9247}
9248
9249static void
9250dtrace_actdesc_hold(dtrace_actdesc_t *act)
9251{
9252 ASSERT(act->dtad_refcnt >= 1);
9253 act->dtad_refcnt++;
9254}
9255
9256static void
9257dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)
9258{
9259 dtrace_actkind_t kind = act->dtad_kind;
9260 dtrace_difo_t *dp;
9261
9262 ASSERT(act->dtad_refcnt >= 1);
9263
9264 if (--act->dtad_refcnt != 0)
9265 return;
9266
9267 if ((dp = act->dtad_difo) != NULL)
9268 dtrace_difo_release(dp, vstate);
9269
9270 if (DTRACEACT_ISPRINTFLIKE(kind)) {
9271 char *str = (char *)(uintptr_t)act->dtad_arg;
9272
9273 ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
9274 (str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
9275
9276 if (str != NULL)
9277 kmem_free(str, strlen(str) + 1);
9278 }
9279
9280 kmem_free(act, sizeof (dtrace_actdesc_t));
9281}
9282
9283/*
9284 * DTrace ECB Functions
9285 */
9286static dtrace_ecb_t *
9287dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
9288{
9289 dtrace_ecb_t *ecb;
9290 dtrace_epid_t epid;
9291
9292 ASSERT(MUTEX_HELD(&dtrace_lock));
9293
9294 ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
9295 ecb->dte_predicate = NULL;
9296 ecb->dte_probe = probe;
9297
9298 /*
9299 * The default size is the size of the default action: recording
9300 * the epid.
9301 */
9302 ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
9303 ecb->dte_alignment = sizeof (dtrace_epid_t);
9304
9305 epid = state->dts_epid++;
9306
9307 if (VBDTCAST(int64_t)epid - 1 >= state->dts_necbs) {
9308 dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;
9309 int necbs = state->dts_necbs << 1;
9310
9311 ASSERT(epid == VBDTCAST(dtrace_epid_t)state->dts_necbs + 1);
9312
9313 if (necbs == 0) {
9314 ASSERT(oecbs == NULL);
9315 necbs = 1;
9316 }
9317
9318 ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);
9319
9320 if (oecbs != NULL)
9321 bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));
9322
9323 dtrace_membar_producer();
9324 state->dts_ecbs = ecbs;
9325
9326 if (oecbs != NULL) {
9327 /*
9328 * If this state is active, we must dtrace_sync()
9329 * before we can free the old dts_ecbs array: we're
9330 * coming in hot, and there may be active ring
9331 * buffer processing (which indexes into the dts_ecbs
9332 * array) on another CPU.
9333 */
9334 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
9335 dtrace_sync();
9336
9337 kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
9338 }
9339
9340 dtrace_membar_producer();
9341 state->dts_necbs = necbs;
9342 }
9343
9344 ecb->dte_state = state;
9345
9346 ASSERT(state->dts_ecbs[epid - 1] == NULL);
9347 dtrace_membar_producer();
9348 state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;
9349
9350 return (ecb);
9351}
9352
9353static int
9354dtrace_ecb_enable(dtrace_ecb_t *ecb)
9355{
9356 dtrace_probe_t *probe = ecb->dte_probe;
9357
9358 ASSERT(MUTEX_HELD(&cpu_lock));
9359 ASSERT(MUTEX_HELD(&dtrace_lock));
9360 ASSERT(ecb->dte_next == NULL);
9361
9362 if (probe == NULL) {
9363 /*
9364 * This is the NULL probe -- there's nothing to do.
9365 */
9366 return (0);
9367 }
9368
9369 if (probe->dtpr_ecb == NULL) {
9370 dtrace_provider_t *prov = probe->dtpr_provider;
9371
9372 /*
9373 * We're the first ECB on this probe.
9374 */
9375 probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;
9376
9377 if (ecb->dte_predicate != NULL)
9378 probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;
9379
9380 return (prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
9381 probe->dtpr_id, probe->dtpr_arg));
9382 } else {
9383 /*
9384 * This probe is already active. Swing the last pointer to
9385 * point to the new ECB, and issue a dtrace_sync() to assure
9386 * that all CPUs have seen the change.
9387 */
9388 ASSERT(probe->dtpr_ecb_last != NULL);
9389 probe->dtpr_ecb_last->dte_next = ecb;
9390 probe->dtpr_ecb_last = ecb;
9391 probe->dtpr_predcache = 0;
9392
9393 dtrace_sync();
9394 return (0);
9395 }
9396}
9397
9398static void
9399dtrace_ecb_resize(dtrace_ecb_t *ecb)
9400{
9401 uint32_t maxalign = sizeof (dtrace_epid_t);
9402 uint32_t align = sizeof (uint8_t), offs, diff;
9403 dtrace_action_t *act;
9404 int wastuple = 0;
9405 uint32_t aggbase = UINT32_MAX;
9406 dtrace_state_t *state = ecb->dte_state;
9407
9408 /*
9409 * If we record anything, we always record the epid. (And we always
9410 * record it first.)
9411 */
9412 offs = sizeof (dtrace_epid_t);
9413 ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
9414
9415 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
9416 dtrace_recdesc_t *rec = &act->dta_rec;
9417
9418 if ((align = rec->dtrd_alignment) > maxalign)
9419 maxalign = align;
9420
9421 if (!wastuple && act->dta_intuple) {
9422 /*
9423 * This is the first record in a tuple. Align the
9424 * offset to be at offset 4 in an 8-byte aligned
9425 * block.
9426 */
9427 diff = offs + sizeof (dtrace_aggid_t);
9428
9429 if ((diff = (diff & (sizeof (uint64_t) - 1))))
9430 offs += sizeof (uint64_t) - diff;
9431
9432 aggbase = offs - sizeof (dtrace_aggid_t);
9433 ASSERT(!(aggbase & (sizeof (uint64_t) - 1)));
9434 }
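		/*
		 * A worked example of the adjustment above (taking
		 * sizeof (dtrace_aggid_t) as 4): with offs = 8, diff
		 * becomes (8 + 4) & 7 == 4, so offs is bumped to 12 and
		 * aggbase set to 8.  The aggregation ID then occupies
		 * the 8-byte-aligned word at 8, and the tuple data
		 * starts at offset 4 within that block, as required.
		 */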
9435
9436 /*LINTED*/
9437 if (rec->dtrd_size != 0 && (diff = (offs & (align - 1)))) {
9438 /*
9439 * The current offset is not properly aligned; align it.
9440 */
9441 offs += align - diff;
9442 }
9443
9444 rec->dtrd_offset = offs;
9445
9446 if (offs + rec->dtrd_size > ecb->dte_needed) {
9447 ecb->dte_needed = offs + rec->dtrd_size;
9448
9449 if (ecb->dte_needed > state->dts_needed)
9450 state->dts_needed = ecb->dte_needed;
9451 }
9452
9453 if (DTRACEACT_ISAGG(act->dta_kind)) {
9454 dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
9455 dtrace_action_t *first = agg->dtag_first, *prev;
9456
9457 ASSERT(rec->dtrd_size != 0 && first != NULL);
9458 ASSERT(wastuple);
9459 ASSERT(aggbase != UINT32_MAX);
9460
9461 agg->dtag_base = aggbase;
9462
9463 while ((prev = first->dta_prev) != NULL &&
9464 DTRACEACT_ISAGG(prev->dta_kind)) {
9465 agg = (dtrace_aggregation_t *)prev;
9466 first = agg->dtag_first;
9467 }
9468
9469 if (prev != NULL) {
9470 offs = prev->dta_rec.dtrd_offset +
9471 prev->dta_rec.dtrd_size;
9472 } else {
9473 offs = sizeof (dtrace_epid_t);
9474 }
9475 wastuple = 0;
9476 } else {
9477 if (!act->dta_intuple)
9478 ecb->dte_size = offs + rec->dtrd_size;
9479
9480 offs += rec->dtrd_size;
9481 }
9482
9483 wastuple = act->dta_intuple;
9484 }
9485
9486 if ((act = ecb->dte_action) != NULL &&
9487 !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
9488 ecb->dte_size == sizeof (dtrace_epid_t)) {
9489 /*
9490 * If the size is still sizeof (dtrace_epid_t), then all
9491 * actions store no data; set the size to 0.
9492 */
9493 ecb->dte_alignment = maxalign;
9494 ecb->dte_size = 0;
9495
9496 /*
9497 * If the needed space is still sizeof (dtrace_epid_t), then
9498 * all actions need no additional space; set the needed
9499 * size to 0.
9500 */
9501 if (ecb->dte_needed == sizeof (dtrace_epid_t))
9502 ecb->dte_needed = 0;
9503
9504 return;
9505 }
9506
9507 /*
9508 * Set our alignment, and make sure that the dte_size and dte_needed
9509 * are aligned to the size of an EPID.
9510 */
9511 ecb->dte_alignment = maxalign;
9512 ecb->dte_size = (ecb->dte_size + (sizeof (dtrace_epid_t) - 1)) &
9513 ~(sizeof (dtrace_epid_t) - 1);
9514 ecb->dte_needed = (ecb->dte_needed + (sizeof (dtrace_epid_t) - 1)) &
9515 ~(sizeof (dtrace_epid_t) - 1);
9516 ASSERT(ecb->dte_size <= ecb->dte_needed);
9517}
9518
9519static dtrace_action_t *
9520dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
9521{
9522 dtrace_aggregation_t *agg;
9523 size_t size = sizeof (uint64_t);
9524 int ntuple = desc->dtad_ntuple;
9525 dtrace_action_t *act;
9526 dtrace_recdesc_t *frec;
9527 dtrace_aggid_t aggid;
9528 dtrace_state_t *state = ecb->dte_state;
9529
9530 agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
9531 agg->dtag_ecb = ecb;
9532
9533 ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));
9534
9535 switch (desc->dtad_kind) {
9536 case DTRACEAGG_MIN:
9537 agg->dtag_initial = INT64_MAX;
9538 agg->dtag_aggregate = dtrace_aggregate_min;
9539 break;
9540
9541 case DTRACEAGG_MAX:
9542 agg->dtag_initial = INT64_MIN;
9543 agg->dtag_aggregate = dtrace_aggregate_max;
9544 break;
9545
9546 case DTRACEAGG_COUNT:
9547 agg->dtag_aggregate = dtrace_aggregate_count;
9548 break;
9549
9550 case DTRACEAGG_QUANTIZE:
9551 agg->dtag_aggregate = dtrace_aggregate_quantize;
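		/*
		 * The size below works out to ((64 - 1) * 2 + 1) == 127
		 * counters of one uint64_t each -- seemingly one per
		 * negative power-of-two bucket, one per positive
		 * power-of-two bucket, and one for zero.
		 */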
9552 size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *
9553 sizeof (uint64_t);
9554 break;
9555
9556 case DTRACEAGG_LQUANTIZE: {
9557 uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
9558 uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);
9559
9560 agg->dtag_initial = desc->dtad_arg;
9561 agg->dtag_aggregate = dtrace_aggregate_lquantize;
9562
9563 if (step == 0 || levels == 0)
9564 goto err;
9565
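		/*
		 * The extra 3 * sizeof (uint64_t) presumably covers an
		 * underflow bucket, an overflow bucket, and the word
		 * carrying the encoded lquantize argument itself (cf.
		 * dtag_initial above).
		 */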
9566 size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);
9567 break;
9568 }
9569
9570 case DTRACEAGG_AVG:
9571 agg->dtag_aggregate = dtrace_aggregate_avg;
9572 size = sizeof (uint64_t) * 2;
9573 break;
9574
9575 case DTRACEAGG_STDDEV:
9576 agg->dtag_aggregate = dtrace_aggregate_stddev;
9577 size = sizeof (uint64_t) * 4;
9578 break;
9579
9580 case DTRACEAGG_SUM:
9581 agg->dtag_aggregate = dtrace_aggregate_sum;
9582 break;
9583
9584 default:
9585 goto err;
9586 }
9587
9588 agg->dtag_action.dta_rec.dtrd_size = VBDTCAST(uint32_t)size;
9589
9590 if (ntuple == 0)
9591 goto err;
9592
9593 /*
9594 * We must make sure that we have enough actions for the n-tuple.
9595 */
9596 for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
9597 if (DTRACEACT_ISAGG(act->dta_kind))
9598 break;
9599
9600 if (--ntuple == 0) {
9601 /*
9602 * This is the action with which our n-tuple begins.
9603 */
9604 agg->dtag_first = act;
9605 goto success;
9606 }
9607 }
9608
9609 /*
9610 * This n-tuple is short by ntuple elements. Return failure.
9611 */
9612 ASSERT(ntuple != 0);
9613err:
9614 kmem_free(agg, sizeof (dtrace_aggregation_t));
9615 return (NULL);
9616
9617success:
9618 /*
9619 * If the last action in the tuple has a size of zero, it's actually
9620 * an expression argument for the aggregating action.
9621 */
9622 ASSERT(ecb->dte_action_last != NULL);
9623 act = ecb->dte_action_last;
9624
9625 if (act->dta_kind == DTRACEACT_DIFEXPR) {
9626 ASSERT(act->dta_difo != NULL);
9627
9628 if (act->dta_difo->dtdo_rtype.dtdt_size == 0)
9629 agg->dtag_hasarg = 1;
9630 }
9631
9632 /*
9633 * We need to allocate an id for this aggregation.
9634 */
9635 aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
9636 VM_BESTFIT | VM_SLEEP);
9637
9638 if (VBDTCAST(int64_t)aggid - 1 >= state->dts_naggregations) {
9639 dtrace_aggregation_t **oaggs = state->dts_aggregations;
9640 dtrace_aggregation_t **aggs;
9641 int naggs = state->dts_naggregations << 1;
9642 int onaggs = state->dts_naggregations;
9643
9644 ASSERT(aggid == VBDTCAST(dtrace_aggid_t)state->dts_naggregations + 1);
9645
9646 if (naggs == 0) {
9647 ASSERT(oaggs == NULL);
9648 naggs = 1;
9649 }
9650
9651 aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);
9652
9653 if (oaggs != NULL) {
9654 bcopy(oaggs, aggs, onaggs * sizeof (*aggs));
9655 kmem_free(oaggs, onaggs * sizeof (*aggs));
9656 }
9657
9658 state->dts_aggregations = aggs;
9659 state->dts_naggregations = naggs;
9660 }
9661
9662 ASSERT(state->dts_aggregations[aggid - 1] == NULL);
9663 state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;
9664
9665 frec = &agg->dtag_first->dta_rec;
9666 if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
9667 frec->dtrd_alignment = sizeof (dtrace_aggid_t);
9668
9669 for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
9670 ASSERT(!act->dta_intuple);
9671 act->dta_intuple = 1;
9672 }
9673
9674 return (&agg->dtag_action);
9675}
9676
9677static void
9678dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act)
9679{
9680 dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
9681 dtrace_state_t *state = ecb->dte_state;
9682 dtrace_aggid_t aggid = agg->dtag_id;
9683
9684 ASSERT(DTRACEACT_ISAGG(act->dta_kind));
9685 vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
9686
9687 ASSERT(state->dts_aggregations[aggid - 1] == agg);
9688 state->dts_aggregations[aggid - 1] = NULL;
9689
9690 kmem_free(agg, sizeof (dtrace_aggregation_t));
9691}
9692
9693static int
9694dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
9695{
9696 dtrace_action_t *action, *last;
9697 dtrace_difo_t *dp = desc->dtad_difo;
9698 uint32_t size = 0, align = sizeof (uint8_t), mask;
9699 uint16_t format = 0;
9700 dtrace_recdesc_t *rec;
9701 dtrace_state_t *state = ecb->dte_state;
9702 dtrace_optval_t *opt = state->dts_options, nframes, strsize;
9703 uint64_t arg = desc->dtad_arg;
9704
9705 ASSERT(MUTEX_HELD(&dtrace_lock));
9706 ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);
9707
9708 if (DTRACEACT_ISAGG(desc->dtad_kind)) {
9709 /*
9710 * If this is an aggregating action, there must be neither
9711 * a speculate nor a commit on the action chain.
9712 */
9713 dtrace_action_t *act;
9714
9715 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
9716 if (act->dta_kind == DTRACEACT_COMMIT)
9717 return (EINVAL);
9718
9719 if (act->dta_kind == DTRACEACT_SPECULATE)
9720 return (EINVAL);
9721 }
9722
9723 action = dtrace_ecb_aggregation_create(ecb, desc);
9724
9725 if (action == NULL)
9726 return (EINVAL);
9727 } else {
9728 if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||
9729 (desc->dtad_kind == DTRACEACT_DIFEXPR &&
9730 dp != NULL && dp->dtdo_destructive)) {
9731 state->dts_destructive = 1;
9732 }
9733
9734 switch (desc->dtad_kind) {
9735 case DTRACEACT_PRINTF:
9736 case DTRACEACT_PRINTA:
9737 case DTRACEACT_SYSTEM:
9738 case DTRACEACT_FREOPEN:
9739 /*
9740 * We know that our arg is a string -- turn it into a
9741 * format.
9742 */
9743 if (arg == NULL) {
9744 ASSERT(desc->dtad_kind == DTRACEACT_PRINTA);
9745 format = 0;
9746 } else {
9747 ASSERT(arg != NULL);
9748 ASSERT(arg > KERNELBASE);
9749 format = dtrace_format_add(state,
9750 (char *)(uintptr_t)arg);
9751 }
9752
9753 /*FALLTHROUGH*/
9754 case DTRACEACT_LIBACT:
9755 case DTRACEACT_DIFEXPR:
9756 if (dp == NULL)
9757 return (EINVAL);
9758
9759 if ((size = dp->dtdo_rtype.dtdt_size) != 0)
9760 break;
9761
9762 if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
9763 if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
9764 return (EINVAL);
9765
9766 size = opt[DTRACEOPT_STRSIZE];
9767 }
9768
9769 break;
9770
9771 case DTRACEACT_STACK:
9772 if ((nframes = arg) == 0) {
9773 nframes = opt[DTRACEOPT_STACKFRAMES];
9774 ASSERT(nframes > 0);
9775 arg = nframes;
9776 }
9777
9778 size = VBDTCAST(uint32_t)(nframes * sizeof (pc_t));
9779 break;
9780
9781 case DTRACEACT_JSTACK:
9782 if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)
9783 strsize = opt[DTRACEOPT_JSTACKSTRSIZE];
9784
9785 if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)
9786 nframes = opt[DTRACEOPT_JSTACKFRAMES];
9787
9788 arg = DTRACE_USTACK_ARG(nframes, strsize);
9789
9790 /*FALLTHROUGH*/
9791 case DTRACEACT_USTACK:
9792 if (desc->dtad_kind != DTRACEACT_JSTACK &&
9793 (nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {
9794 strsize = DTRACE_USTACK_STRSIZE(arg);
9795 nframes = opt[DTRACEOPT_USTACKFRAMES];
9796 ASSERT(nframes > 0);
9797 arg = DTRACE_USTACK_ARG(nframes, strsize);
9798 }
9799
9800 /*
9801 * Save a slot for the pid.
9802 */
9803 size = VBDTCAST(uint32_t)((nframes + 1) * sizeof (uint64_t));
9804 size += DTRACE_USTACK_STRSIZE(arg);
9805 size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));
9806
9807 break;
9808
9809 case DTRACEACT_SYM:
9810 case DTRACEACT_MOD:
9811 if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=
9812 sizeof (uint64_t)) ||
9813 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
9814 return (EINVAL);
9815 break;
9816
9817 case DTRACEACT_USYM:
9818 case DTRACEACT_UMOD:
9819 case DTRACEACT_UADDR:
9820 if (dp == NULL ||
9821 (dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||
9822 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
9823 return (EINVAL);
9824
9825 /*
9826 * We have a slot for the pid, plus a slot for the
9827 * argument. To keep things simple (aligned with
9828 * bitness-neutral sizing), we store each as a 64-bit
9829 * quantity.
9830 */
9831 size = 2 * sizeof (uint64_t);
9832 break;
9833
9834 case DTRACEACT_STOP:
9835 case DTRACEACT_BREAKPOINT:
9836 case DTRACEACT_PANIC:
9837 break;
9838
9839 case DTRACEACT_CHILL:
9840 case DTRACEACT_DISCARD:
9841 case DTRACEACT_RAISE:
9842 if (dp == NULL)
9843 return (EINVAL);
9844 break;
9845
9846 case DTRACEACT_EXIT:
9847 if (dp == NULL ||
9848 (size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||
9849 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
9850 return (EINVAL);
9851 break;
9852
9853 case DTRACEACT_SPECULATE:
9854 if (ecb->dte_size > sizeof (dtrace_epid_t))
9855 return (EINVAL);
9856
9857 if (dp == NULL)
9858 return (EINVAL);
9859
9860 state->dts_speculates = 1;
9861 break;
9862
9863 case DTRACEACT_COMMIT: {
9864 dtrace_action_t *act = ecb->dte_action;
9865
9866 for (; act != NULL; act = act->dta_next) {
9867 if (act->dta_kind == DTRACEACT_COMMIT)
9868 return (EINVAL);
9869 }
9870
9871 if (dp == NULL)
9872 return (EINVAL);
9873 break;
9874 }
9875
9876 default:
9877 return (EINVAL);
9878 }
9879
9880 if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {
9881 /*
9882 * If this is a data-storing action or a speculate,
9883 * we must be sure that there isn't a commit on the
9884 * action chain.
9885 */
9886 dtrace_action_t *act = ecb->dte_action;
9887
9888 for (; act != NULL; act = act->dta_next) {
9889 if (act->dta_kind == DTRACEACT_COMMIT)
9890 return (EINVAL);
9891 }
9892 }
9893
9894 action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
9895 action->dta_rec.dtrd_size = size;
9896 }
9897
9898 action->dta_refcnt = 1;
9899 rec = &action->dta_rec;
9900 size = rec->dtrd_size;
9901
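	/*
	 * Derive the record's alignment from its size: pick the largest
	 * power of two (capped at sizeof (uint64_t)) that evenly divides
	 * size -- e.g. a 24-byte record aligns to 8 and a 6-byte record
	 * to 2 -- leaving the default byte alignment when size is 0.
	 */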
9902 for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
9903 if (!(size & mask)) {
9904 align = mask + 1;
9905 break;
9906 }
9907 }
9908
9909 action->dta_kind = desc->dtad_kind;
9910
9911 if ((action->dta_difo = dp) != NULL)
9912 dtrace_difo_hold(dp);
9913
9914 rec->dtrd_action = action->dta_kind;
9915 rec->dtrd_arg = arg;
9916 rec->dtrd_uarg = desc->dtad_uarg;
9917 rec->dtrd_alignment = (uint16_t)align;
9918 rec->dtrd_format = format;
9919
9920 if ((last = ecb->dte_action_last) != NULL) {
9921 ASSERT(ecb->dte_action != NULL);
9922 action->dta_prev = last;
9923 last->dta_next = action;
9924 } else {
9925 ASSERT(ecb->dte_action == NULL);
9926 ecb->dte_action = action;
9927 }
9928
9929 ecb->dte_action_last = action;
9930
9931 return (0);
9932}
9933
9934static void
9935dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
9936{
9937 dtrace_action_t *act = ecb->dte_action, *next;
9938 dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
9939 dtrace_difo_t *dp;
9940 uint16_t format;
9941
9942 if (act != NULL && act->dta_refcnt > 1) {
9943 ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);
9944 act->dta_refcnt--;
9945 } else {
9946 for (; act != NULL; act = next) {
9947 next = act->dta_next;
9948 ASSERT(next != NULL || act == ecb->dte_action_last);
9949 ASSERT(act->dta_refcnt == 1);
9950
9951 if ((format = act->dta_rec.dtrd_format) != 0)
9952 dtrace_format_remove(ecb->dte_state, format);
9953
9954 if ((dp = act->dta_difo) != NULL)
9955 dtrace_difo_release(dp, vstate);
9956
9957 if (DTRACEACT_ISAGG(act->dta_kind)) {
9958 dtrace_ecb_aggregation_destroy(ecb, act);
9959 } else {
9960 kmem_free(act, sizeof (dtrace_action_t));
9961 }
9962 }
9963 }
9964
9965 ecb->dte_action = NULL;
9966 ecb->dte_action_last = NULL;
9967 ecb->dte_size = sizeof (dtrace_epid_t);
9968}
9969
9970static void
9971dtrace_ecb_disable(dtrace_ecb_t *ecb)
9972{
9973 /*
9974 * We disable the ECB by removing it from its probe.
9975 */
9976 dtrace_ecb_t *pecb, *prev = NULL;
9977 dtrace_probe_t *probe = ecb->dte_probe;
9978
9979 ASSERT(MUTEX_HELD(&dtrace_lock));
9980
9981 if (probe == NULL) {
9982 /*
9983 * This is the NULL probe; there is nothing to disable.
9984 */
9985 return;
9986 }
9987
9988 for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
9989 if (pecb == ecb)
9990 break;
9991 prev = pecb;
9992 }
9993
9994 ASSERT(pecb != NULL);
9995
9996 if (prev == NULL) {
9997 probe->dtpr_ecb = ecb->dte_next;
9998 } else {
9999 prev->dte_next = ecb->dte_next;
10000 }
10001
10002 if (ecb == probe->dtpr_ecb_last) {
10003 ASSERT(ecb->dte_next == NULL);
10004 probe->dtpr_ecb_last = prev;
10005 }
10006
10007 /*
10008 * The ECB has been disconnected from the probe; now sync to assure
10009 * that all CPUs have seen the change before returning.
10010 */
10011 dtrace_sync();
10012
10013 if (probe->dtpr_ecb == NULL) {
10014 /*
10015 * That was the last ECB on the probe; clear the predicate
10016 * cache ID for the probe, disable it and sync one more time
10017 * to assure that we'll never hit it again.
10018 */
10019 dtrace_provider_t *prov = probe->dtpr_provider;
10020
10021 ASSERT(ecb->dte_next == NULL);
10022 ASSERT(probe->dtpr_ecb_last == NULL);
10023 probe->dtpr_predcache = DTRACE_CACHEIDNONE;
10024 prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
10025 probe->dtpr_id, probe->dtpr_arg);
10026 dtrace_sync();
10027 } else {
10028 /*
10029 * There is at least one ECB remaining on the probe. If there
10030 * is _exactly_ one, set the probe's predicate cache ID to be
10031 * the predicate cache ID of the remaining ECB.
10032 */
10033 ASSERT(probe->dtpr_ecb_last != NULL);
10034 ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);
10035
10036 if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
10037 dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;
10038
10039 ASSERT(probe->dtpr_ecb->dte_next == NULL);
10040
10041 if (p != NULL)
10042 probe->dtpr_predcache = p->dtp_cacheid;
10043 }
10044
10045 ecb->dte_next = NULL;
10046 }
10047}
10048
10049static void
10050dtrace_ecb_destroy(dtrace_ecb_t *ecb)
10051{
10052 dtrace_state_t *state = ecb->dte_state;
10053 dtrace_vstate_t *vstate = &state->dts_vstate;
10054 dtrace_predicate_t *pred;
10055 dtrace_epid_t epid = ecb->dte_epid;
10056
10057 ASSERT(MUTEX_HELD(&dtrace_lock));
10058 ASSERT(ecb->dte_next == NULL);
10059 ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);
10060
10061 if ((pred = ecb->dte_predicate) != NULL)
10062 dtrace_predicate_release(pred, vstate);
10063
10064 dtrace_ecb_action_remove(ecb);
10065
10066 ASSERT(state->dts_ecbs[epid - 1] == ecb);
10067 state->dts_ecbs[epid - 1] = NULL;
10068
10069 kmem_free(ecb, sizeof (dtrace_ecb_t));
10070}
10071
10072static dtrace_ecb_t *
10073dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
10074 dtrace_enabling_t *enab)
10075{
10076 dtrace_ecb_t *ecb;
10077 dtrace_predicate_t *pred;
10078 dtrace_actdesc_t *act;
10079 dtrace_provider_t *prov;
10080 dtrace_ecbdesc_t *desc = enab->dten_current;
10081
10082 ASSERT(MUTEX_HELD(&dtrace_lock));
10083 ASSERT(state != NULL);
10084
10085 ecb = dtrace_ecb_add(state, probe);
10086 ecb->dte_uarg = desc->dted_uarg;
10087
10088 if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
10089 dtrace_predicate_hold(pred);
10090 ecb->dte_predicate = pred;
10091 }
10092
10093 if (probe != NULL) {
10094 /*
10095 * If the provider shows more leg than the consumer is old
10096 * enough to see, we need to enable the appropriate implicit
10097 * predicate bits to prevent the ecb from activating at
10098 * revealing times.
10099 *
10100 * Providers specifying DTRACE_PRIV_USER at register time
10101 * are stating that they need the /proc-style privilege
10102 * model to be enforced, and this is what DTRACE_COND_OWNER
10103 * and DTRACE_COND_ZONEOWNER will then do at probe time.
10104 */
10105 prov = probe->dtpr_provider;
10106 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
10107 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
10108 ecb->dte_cond |= DTRACE_COND_OWNER;
10109
10110 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
10111 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
10112 ecb->dte_cond |= DTRACE_COND_ZONEOWNER;
10113
10114 /*
10115 * If the provider shows us kernel innards and the user
10116 * is lacking sufficient privilege, enable the
10117 * DTRACE_COND_USERMODE implicit predicate.
10118 */
10119 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
10120 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
10121 ecb->dte_cond |= DTRACE_COND_USERMODE;
10122 }
10123
10124 if (dtrace_ecb_create_cache != NULL) {
10125 /*
10126 * If we have a cached ecb, we'll use its action list instead
10127 * of creating our own (saving both time and space).
10128 */
10129 dtrace_ecb_t *cached = dtrace_ecb_create_cache;
10130 dtrace_action_t *act2 = cached->dte_action;
10131
10132 if (act2 != NULL) {
10133 ASSERT(act2->dta_refcnt > 0);
10134 act2->dta_refcnt++;
10135 ecb->dte_action = act2;
10136 ecb->dte_action_last = cached->dte_action_last;
10137 ecb->dte_needed = cached->dte_needed;
10138 ecb->dte_size = cached->dte_size;
10139 ecb->dte_alignment = cached->dte_alignment;
10140 }
10141
10142 return (ecb);
10143 }
10144
10145 for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
10146 if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {
10147 dtrace_ecb_destroy(ecb);
10148 return (NULL);
10149 }
10150 }
10151
10152 dtrace_ecb_resize(ecb);
10153
10154 return (dtrace_ecb_create_cache = ecb);
10155}
10156
10157static int
10158dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg)
10159{
10160 dtrace_ecb_t *ecb;
10161 dtrace_enabling_t *enab = arg;
10162 dtrace_state_t *state = enab->dten_vstate->dtvs_state;
10163
10164 ASSERT(state != NULL);
10165
10166 if (probe != NULL && probe->dtpr_gen < enab->dten_probegen) {
10167 /*
10168 * This probe was created in a generation for which this
10169 * enabling has previously created ECBs; we don't want to
10170 * enable it again, so just kick out.
10171 */
10172 return (DTRACE_MATCH_NEXT);
10173 }
10174
10175 if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
10176 return (DTRACE_MATCH_DONE);
10177
10178 if (dtrace_ecb_enable(ecb) < 0)
10179 return (DTRACE_MATCH_FAIL);
10180
10181 return (DTRACE_MATCH_NEXT);
10182}
10183
10184static dtrace_ecb_t *
10185dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
10186{
10187 dtrace_ecb_t *ecb;
10188
10189 ASSERT(MUTEX_HELD(&dtrace_lock));
10190
10191 if (id == 0 || VBDTCAST(int64_t)id > state->dts_necbs)
10192 return (NULL);
10193
10194 ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);
10195 ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);
10196
10197 return (state->dts_ecbs[id - 1]);
10198}
10199
10200static dtrace_aggregation_t *
10201dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
10202{
10203 dtrace_aggregation_t *agg;
10204
10205 ASSERT(MUTEX_HELD(&dtrace_lock));
10206
10207 if (id == 0 || VBDTCAST(int64_t)id > state->dts_naggregations)
10208 return (NULL);
10209
10210 ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);
10211 ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||
10212 agg->dtag_id == id);
10213
10214 return (state->dts_aggregations[id - 1]);
10215}
10216
10217/*
10218 * DTrace Buffer Functions
10219 *
10220 * The following functions manipulate DTrace buffers. Most of these functions
10221 * are called in the context of establishing or processing consumer state;
10222 * exceptions are explicitly noted.
10223 */
10224
10225/*
10226 * Note: called from cross call context. This function switches the two
10227 * buffers on a given CPU. The atomicity of this operation is assured by
10228 * disabling interrupts while the actual switch takes place; the disabling of
10229 * interrupts serializes the execution with any execution of dtrace_probe() on
10230 * the same CPU.
10231 */
10232static void
10233dtrace_buffer_switch(dtrace_buffer_t *buf)
10234{
10235 caddr_t tomax = buf->dtb_tomax;
10236 caddr_t xamot = buf->dtb_xamot;
10237 dtrace_icookie_t cookie;
10238
10239 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
10240 ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
10241
10242 cookie = dtrace_interrupt_disable();
10243 buf->dtb_tomax = xamot;
10244 buf->dtb_xamot = tomax;
10245 buf->dtb_xamot_drops = buf->dtb_drops;
10246 buf->dtb_xamot_offset = buf->dtb_offset;
10247 buf->dtb_xamot_errors = buf->dtb_errors;
10248 buf->dtb_xamot_flags = buf->dtb_flags;
10249 buf->dtb_offset = 0;
10250 buf->dtb_drops = 0;
10251 buf->dtb_errors = 0;
10252 buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
10253 dtrace_interrupt_enable(cookie);
10254}
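/*
 * Illustrative sketch (annotation, not part of the original source): the
 * consumer-driven snapshot path elsewhere in this file triggers the switch
 * on the buffer's CPU via a cross call, along the lines of:
 *
 *	dtrace_xcall(cpu, (dtrace_xcall_t)dtrace_buffer_switch, buf);
 *
 * so the swap of dtb_tomax/dtb_xamot above always executes on the CPU that
 * owns the buffer, with interrupts disabled for the duration of the swap.
 */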
10255
10256/*
10257 * Note: called from cross call context. This function activates a buffer
10258 * on a CPU. As with dtrace_buffer_switch(), the atomicity of the operation
10259 * is guaranteed by the disabling of interrupts.
10260 */
10261static void
10262dtrace_buffer_activate(dtrace_state_t *state)
10263{
10264 dtrace_buffer_t *buf;
10265 dtrace_icookie_t cookie = dtrace_interrupt_disable();
10266
10267 buf = &state->dts_buffer[VBDT_GET_CPUID()];
10268
10269 if (buf->dtb_tomax != NULL) {
10270 /*
10271 * We might like to assert that the buffer is marked inactive,
10272 * but this isn't necessarily true: the buffer for the CPU
10273 * that processes the BEGIN probe has its buffer activated
10274 * manually. In this case, we take the (harmless) action
10275 * of re-clearing the INACTIVE bit.
10276 */
10277 buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
10278 }
10279
10280 dtrace_interrupt_enable(cookie);
10281}
10282
10283static int
10284dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags,
10285 processorid_t cpu)
10286{
10287#ifndef VBOX
10288 cpu_t *cp;
10289#else
10290 RTCPUSET CpuSet;
10291 unsigned iCpu;
10292#endif
10293 dtrace_buffer_t *buf;
10294
10295 ASSERT(MUTEX_HELD(&cpu_lock));
10296 ASSERT(MUTEX_HELD(&dtrace_lock));
10297
10298 if (VBDTCAST(int64_t)size > dtrace_nonroot_maxsize
10299#ifndef VBOX
10300 && !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE)
10301#endif
10302 )
10303 return (EFBIG);
10304
10305#ifndef VBOX
10306 cp = cpu_list;
10307#else
10308 RTMpGetSet(&CpuSet);
10309#endif
10310
10311#ifndef VBOX
10312 do {
10313 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
10314 continue;
10315
10316 buf = &bufs[cp->cpu_id];
10317#else
10318 for (iCpu = 0; iCpu < RTCPUSET_MAX_CPUS; iCpu++) {
10319 if ( !RTCpuSetIsMember(&CpuSet, iCpu)
10320 || (cpu != (processorid_t)DTRACE_CPUALL && cpu != iCpu))
10321 continue;
10322
10323 buf = &bufs[iCpu];
10324#endif
10325
10326 /*
10327 * If there is already a buffer allocated for this CPU, it
10328 * is only possible that this is a DR event. In this case,
10329 * the buffer size must match our specified size.
10330 */
10331 if (buf->dtb_tomax != NULL) {
10332 ASSERT(buf->dtb_size == size);
10333 continue;
10334 }
10335
10336 ASSERT(buf->dtb_xamot == NULL);
10337
10338 if ((buf->dtb_tomax = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
10339 goto err;
10340
10341 buf->dtb_size = size;
10342 buf->dtb_flags = flags;
10343 buf->dtb_offset = 0;
10344 buf->dtb_drops = 0;
10345
10346 if (flags & DTRACEBUF_NOSWITCH)
10347 continue;
10348
10349 if ((buf->dtb_xamot = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
10350 goto err;
10351#ifndef VBOX
10352 } while ((cp = cp->cpu_next) != cpu_list);
10353#else
10354 }
10355#endif
10356
10357 return (0);
10358
10359err:
10360#ifndef VBOX
10361 cp = cpu_list;
10362
10363 do {
10364 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
10365 continue;
10366
10367 buf = &bufs[cp->cpu_id];
10368#else
10369 for (iCpu = 0; iCpu < RTCPUSET_MAX_CPUS; iCpu++) {
10370 if ( !RTCpuSetIsMember(&CpuSet, iCpu)
10371 || (cpu != (processorid_t)DTRACE_CPUALL && cpu != iCpu))
10372 continue;
10373
10374 buf = &bufs[iCpu];
10375#endif
10376
10377 if (buf->dtb_xamot != NULL) {
10378 ASSERT(buf->dtb_tomax != NULL);
10379 ASSERT(buf->dtb_size == size);
10380 kmem_free(buf->dtb_xamot, size);
10381 }
10382
10383 if (buf->dtb_tomax != NULL) {
10384 ASSERT(buf->dtb_size == size);
10385 kmem_free(buf->dtb_tomax, size);
10386 }
10387
10388 buf->dtb_tomax = NULL;
10389 buf->dtb_xamot = NULL;
10390 buf->dtb_size = 0;
10391#ifndef VBOX
10392 } while ((cp = cp->cpu_next) != cpu_list);
10393#else
10394 }
10395#endif
10396
10397 return (ENOMEM);
10398}
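/*
 * Illustrative sketch (annotation, not part of the original source): the
 * per-CPU principal buffers are typically allocated with something like
 *
 *	rval = dtrace_buffer_alloc(state->dts_buffer,
 *	    state->dts_options[DTRACEOPT_BUFSIZE], flags, DTRACE_CPUALL);
 *
 * where 'flags' stands in for the caller's buffer flags. A failure leaves
 * no partial allocations behind: the err path above walks the same CPU set
 * again and frees anything that was successfully allocated.
 */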
10399
10400/*
10401 * Note: called from probe context. This function just increments the drop
10402 * count on a buffer. It has been made a function to allow for the
10403 * possibility of understanding the source of mysterious drop counts. (A
10404 * problem for which one may be particularly disappointed that DTrace cannot
10405 * be used to understand DTrace.)
10406 */
10407static void
10408dtrace_buffer_drop(dtrace_buffer_t *buf)
10409{
10410 buf->dtb_drops++;
10411}
10412
10413/*
10414 * Note: called from probe context. This function is called to reserve space
10415 * in a buffer. If mstate is non-NULL, sets the scratch base and size in the
10416 * mstate. Returns the new offset in the buffer, or a negative value if an
10417 * error has occurred.
10418 */
10419static intptr_t
10420dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
10421 dtrace_state_t *state, dtrace_mstate_t *mstate)
10422{
10423 intptr_t offs = buf->dtb_offset, soffs;
10424 intptr_t woffs;
10425 caddr_t tomax;
10426 size_t total;
10427
10428 if (buf->dtb_flags & DTRACEBUF_INACTIVE)
10429 return (-1);
10430
10431 if ((tomax = buf->dtb_tomax) == NULL) {
10432 dtrace_buffer_drop(buf);
10433 return (-1);
10434 }
10435
10436 if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
10437 while (offs & (align - 1)) {
10438 /*
10439 * Assert that our alignment is off by a number which
10440 * is itself sizeof (uint32_t) aligned.
10441 */
10442 ASSERT(!((align - (offs & (align - 1))) &
10443 (sizeof (uint32_t) - 1)));
10444 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
10445 offs += sizeof (uint32_t);
10446 }
10447
10448 if (VBDTCAST(uintptr_t)(soffs = offs + needed) > buf->dtb_size) {
10449 dtrace_buffer_drop(buf);
10450 return (-1);
10451 }
10452
10453 if (mstate == NULL)
10454 return (offs);
10455
10456 mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
10457 mstate->dtms_scratch_size = buf->dtb_size - soffs;
10458 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
10459
10460 return (offs);
10461 }
10462
10463 if (buf->dtb_flags & DTRACEBUF_FILL) {
10464 if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
10465 (buf->dtb_flags & DTRACEBUF_FULL))
10466 return (-1);
10467 goto out;
10468 }
10469
10470 total = needed + (offs & (align - 1));
10471
10472 /*
10473 * For a ring buffer, life is quite a bit more complicated. Before
10474 * we can store any padding, we need to adjust our wrapping offset.
10475 * (If we've never before wrapped or we're not about to, no adjustment
10476 * is required.)
10477 */
10478 if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
10479 offs + total > buf->dtb_size) {
10480 woffs = buf->dtb_xamot_offset;
10481
10482 if (offs + total > buf->dtb_size) {
10483 /*
10484 * We can't fit in the end of the buffer. First, a
10485 * sanity check that we can fit in the buffer at all.
10486 */
10487 if (total > buf->dtb_size) {
10488 dtrace_buffer_drop(buf);
10489 return (-1);
10490 }
10491
10492 /*
10493 * We're going to be storing at the top of the buffer,
10494 * so now we need to deal with the wrapped offset. We
10495 * only reset our wrapped offset to 0 if it is
10496 * currently greater than the current offset. If it
10497 * is less than the current offset, it is because a
10498 * previous allocation induced a wrap -- but the
10499 * allocation didn't subsequently take the space due
10500 * to an error or false predicate evaluation. In this
10501 * case, we'll just leave the wrapped offset alone: if
10502 * the wrapped offset hasn't been advanced far enough
10503 * for this allocation, it will be adjusted in the
10504 * lower loop.
10505 */
10506 if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
10507 if (woffs >= offs)
10508 woffs = 0;
10509 } else {
10510 woffs = 0;
10511 }
10512
10513 /*
10514 * Now we know that we're going to be storing to the
10515 * top of the buffer and that there is room for us
10516 * there. We need to clear the buffer from the current
10517 * offset to the end (there may be old gunk there).
10518 */
10519 while (VBDTCAST(uintptr_t)offs < buf->dtb_size)
10520 tomax[offs++] = 0;
10521
10522 /*
10523 * We need to set our offset to zero. And because we
10524 * are wrapping, we need to set the bit indicating as
10525 * much. We can also adjust our needed space back
10526 * down to the space required by the ECB -- we know
10527 * that the top of the buffer is aligned.
10528 */
10529 offs = 0;
10530 total = needed;
10531 buf->dtb_flags |= DTRACEBUF_WRAPPED;
10532 } else {
10533 /*
10534 * There is room for us in the buffer, so we simply
10535 * need to check the wrapped offset.
10536 */
10537 if (woffs < offs) {
10538 /*
10539 * The wrapped offset is less than the offset.
10540 * This can happen if we allocated buffer space
10541 * that induced a wrap, but then we didn't
10542 * subsequently take the space due to an error
10543 * or false predicate evaluation. This is
10544 * okay; we know that _this_ allocation isn't
10545 * going to induce a wrap. We still can't
10546 * reset the wrapped offset to be zero,
10547 * however: the space may have been trashed in
10548 * the previous failed probe attempt. But at
10549 * least the wrapped offset doesn't need to
10550 * be adjusted at all...
10551 */
10552 goto out;
10553 }
10554 }
10555
10556 while (VBDTCAST(uintptr_t)offs + total > VBDTCAST(uintptr_t)woffs) {
10557 dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
10558 size_t size;
10559
10560 if (epid == DTRACE_EPIDNONE) {
10561 size = sizeof (uint32_t);
10562 } else {
10563 ASSERT(VBDTCAST(int64_t)epid <= state->dts_necbs);
10564 ASSERT(state->dts_ecbs[epid - 1] != NULL);
10565
10566 size = state->dts_ecbs[epid - 1]->dte_size;
10567 }
10568
10569 ASSERT(woffs + size <= buf->dtb_size);
10570 ASSERT(size != 0);
10571
10572 if (woffs + size == buf->dtb_size) {
10573 /*
10574 * We've reached the end of the buffer; we want
10575 * to set the wrapped offset to 0 and break
10576 * out. However, if the offs is 0, then we're
10577 * in a strange edge-condition: the amount of
10578 * space that we want to reserve plus the size
10579 * of the record that we're overwriting is
10580 * greater than the size of the buffer. This
10581 * is problematic because if we reserve the
10582 * space but subsequently don't consume it (due
10583 * to a failed predicate or error) the wrapped
10584 * offset will be 0 -- yet the EPID at offset 0
10585 * will not be committed. This situation is
10586 * relatively easy to deal with: if we're in
10587 * this case, the buffer is indistinguishable
10588 * from one that hasn't wrapped; we need only
10589 * finish the job by clearing the wrapped bit,
10590 * explicitly setting the offset to be 0, and
10591 * zero'ing out the old data in the buffer.
10592 */
10593 if (offs == 0) {
10594 buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
10595 buf->dtb_offset = 0;
10596 woffs = total;
10597
10598 while (VBDTCAST(uintptr_t)woffs < buf->dtb_size)
10599 tomax[woffs++] = 0;
10600 }
10601
10602 woffs = 0;
10603 break;
10604 }
10605
10606 woffs += size;
10607 }
10608
10609 /*
10610 * We have a wrapped offset. It may be that the wrapped offset
10611 * has become zero -- that's okay.
10612 */
10613 buf->dtb_xamot_offset = woffs;
10614 }
10615
10616out:
10617 /*
10618 * Now we can plow the buffer with any necessary padding.
10619 */
10620 while (offs & (align - 1)) {
10621 /*
10622 * Assert that our alignment is off by a number which
10623 * is itself sizeof (uint32_t) aligned.
10624 */
10625 ASSERT(!((align - (offs & (align - 1))) &
10626 (sizeof (uint32_t) - 1)));
10627 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
10628 offs += sizeof (uint32_t);
10629 }
10630
10631 if (buf->dtb_flags & DTRACEBUF_FILL) {
10632 if (offs + needed > buf->dtb_size - state->dts_reserve) {
10633 buf->dtb_flags |= DTRACEBUF_FULL;
10634 return (-1);
10635 }
10636 }
10637
10638 if (mstate == NULL)
10639 return (offs);
10640
10641 /*
10642 * For ring buffers and fill buffers, the scratch space is always
10643 * the inactive buffer.
10644 */
10645 mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
10646 mstate->dtms_scratch_size = buf->dtb_size;
10647 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
10648
10649 return (offs);
10650}
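/*
 * Worked example (annotation, not part of the original source): with
 * offs == 12 and align == 8, the padding loops above store one 32-bit
 * DTRACE_EPIDNONE filler at offset 12, advancing offs to 16. The ring
 * buffer scan above treats DTRACE_EPIDNONE records as sizeof (uint32_t)
 * fillers, so they are skipped safely. The asserted invariant is that
 * (align - (offs & (align - 1))) is always a multiple of sizeof (uint32_t),
 * which is what allows padding to be written in uint32_t units.
 */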
10651
10652static void
10653dtrace_buffer_polish(dtrace_buffer_t *buf)
10654{
10655 ASSERT(buf->dtb_flags & DTRACEBUF_RING);
10656 ASSERT(MUTEX_HELD(&dtrace_lock));
10657
10658 if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
10659 return;
10660
10661 /*
10662 * We need to polish the ring buffer. There are three cases:
10663 *
10664 * - The first (and presumably most common) is that there is no gap
10665 * between the buffer offset and the wrapped offset. In this case,
10666 * there is nothing in the buffer that isn't valid data; we can
10667 * mark the buffer as polished and return.
10668 *
10669 * - The second (less common than the first but still more common
10670 * than the third) is that there is a gap between the buffer offset
10671 * and the wrapped offset, and the wrapped offset is larger than the
10672 * buffer offset. This can happen because of an alignment issue, or
10673 * can happen because of a call to dtrace_buffer_reserve() that
10674 * didn't subsequently consume the buffer space. In this case,
10675 * we need to zero the data from the buffer offset to the wrapped
10676 * offset.
10677 *
10678 * - The third (and least common) is that there is a gap between the
10679 * buffer offset and the wrapped offset, but the wrapped offset is
10680 * _less_ than the buffer offset. This can only happen because a
10681 * call to dtrace_buffer_reserve() induced a wrap, but the space
10682 * was not subsequently consumed. In this case, we need to zero the
10683 * space from the offset to the end of the buffer _and_ from the
10684 * top of the buffer to the wrapped offset.
10685 */
10686 if (buf->dtb_offset < buf->dtb_xamot_offset) {
10687 bzero(buf->dtb_tomax + buf->dtb_offset,
10688 buf->dtb_xamot_offset - buf->dtb_offset);
10689 }
10690
10691 if (buf->dtb_offset > buf->dtb_xamot_offset) {
10692 bzero(buf->dtb_tomax + buf->dtb_offset,
10693 buf->dtb_size - buf->dtb_offset);
10694 bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
10695 }
10696}
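/*
 * Pictorially (annotation, not part of the original source), with 'o' the
 * buffer offset (dtb_offset) and 'w' the wrapped offset (dtb_xamot_offset):
 *
 *	case 1: |====o====|  (o == w)	nothing to zero
 *	case 2: |==o...w==|		zero [o, w)
 *	case 3: |..w===o..|		zero [o, end) and [0, w)
 *
 * which is exactly what the two bzero() calls above implement.
 */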
10697
10698static void
10699dtrace_buffer_free(dtrace_buffer_t *bufs)
10700{
10701 int i;
10702
10703 for (i = 0; i < NCPU; i++) {
10704 dtrace_buffer_t *buf = &bufs[i];
10705
10706 if (buf->dtb_tomax == NULL) {
10707 ASSERT(buf->dtb_xamot == NULL);
10708 ASSERT(buf->dtb_size == 0);
10709 continue;
10710 }
10711
10712 if (buf->dtb_xamot != NULL) {
10713 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
10714 kmem_free(buf->dtb_xamot, buf->dtb_size);
10715 }
10716
10717 kmem_free(buf->dtb_tomax, buf->dtb_size);
10718 buf->dtb_size = 0;
10719 buf->dtb_tomax = NULL;
10720 buf->dtb_xamot = NULL;
10721 }
10722}
10723
10724/*
10725 * DTrace Enabling Functions
10726 */
10727static dtrace_enabling_t *
10728dtrace_enabling_create(dtrace_vstate_t *vstate)
10729{
10730 dtrace_enabling_t *enab;
10731
10732 enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
10733 enab->dten_vstate = vstate;
10734
10735 return (enab);
10736}
10737
10738static void
10739dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)
10740{
10741 dtrace_ecbdesc_t **ndesc;
10742 size_t osize, nsize;
10743
10744 /*
10745 * We can't add to enablings after we've enabled them, or after we've
10746 * retained them.
10747 */
10748 ASSERT(enab->dten_probegen == 0);
10749 ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
10750
10751 if (enab->dten_ndesc < enab->dten_maxdesc) {
10752 enab->dten_desc[enab->dten_ndesc++] = ecb;
10753 return;
10754 }
10755
10756 osize = enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *);
10757
10758 if (enab->dten_maxdesc == 0) {
10759 enab->dten_maxdesc = 1;
10760 } else {
10761 enab->dten_maxdesc <<= 1;
10762 }
10763
10764 ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
10765
10766 nsize = enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *);
10767 ndesc = kmem_zalloc(nsize, KM_SLEEP);
10768 bcopy(enab->dten_desc, ndesc, osize);
10769 kmem_free(enab->dten_desc, osize);
10770
10771 enab->dten_desc = ndesc;
10772 enab->dten_desc[enab->dten_ndesc++] = ecb;
10773}
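/*
 * Illustrative note (annotation, not part of the original source):
 * dten_maxdesc grows geometrically (0, 1, 2, 4, 8, ...), so adding n
 * descriptions performs O(log n) reallocations and O(n) total copies --
 * the usual amortized doubling-array pattern. Note that osize is computed
 * from the old dten_maxdesc before it is doubled, which is what makes the
 * bcopy()/kmem_free() pair above correct.
 */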
10774
10775static void
10776dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,
10777 dtrace_probedesc_t *pd)
10778{
10779 dtrace_ecbdesc_t *new;
10780 dtrace_predicate_t *pred;
10781 dtrace_actdesc_t *act;
10782
10783 /*
10784 * We're going to create a new ECB description that matches the
10785 * specified ECB in every way, but has the specified probe description.
10786 */
10787 new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
10788
10789 if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
10790 dtrace_predicate_hold(pred);
10791
10792 for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
10793 dtrace_actdesc_hold(act);
10794
10795 new->dted_action = ecb->dted_action;
10796 new->dted_pred = ecb->dted_pred;
10797 new->dted_probe = *pd;
10798 new->dted_uarg = ecb->dted_uarg;
10799
10800 dtrace_enabling_add(enab, new);
10801}
10802
10803static void
10804dtrace_enabling_dump(dtrace_enabling_t *enab)
10805{
10806 int i;
10807
10808 for (i = 0; i < enab->dten_ndesc; i++) {
10809 dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
10810
10811 cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
10812 desc->dtpd_provider, desc->dtpd_mod,
10813 desc->dtpd_func, desc->dtpd_name);
10814 }
10815}
10816
10817static void
10818dtrace_enabling_destroy(dtrace_enabling_t *enab)
10819{
10820 int i;
10821 dtrace_ecbdesc_t *ep;
10822 dtrace_vstate_t *vstate = enab->dten_vstate;
10823
10824 ASSERT(MUTEX_HELD(&dtrace_lock));
10825
10826 for (i = 0; i < enab->dten_ndesc; i++) {
10827 dtrace_actdesc_t *act, *next;
10828 dtrace_predicate_t *pred;
10829
10830 ep = enab->dten_desc[i];
10831
10832 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
10833 dtrace_predicate_release(pred, vstate);
10834
10835 for (act = ep->dted_action; act != NULL; act = next) {
10836 next = act->dtad_next;
10837 dtrace_actdesc_release(act, vstate);
10838 }
10839
10840 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
10841 }
10842
10843 kmem_free(enab->dten_desc,
10844 enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *));
10845
10846 /*
10847 * If this was a retained enabling, decrement the dts_nretained count
10848 * and take it off of the dtrace_retained list.
10849 */
10850 if (enab->dten_prev != NULL || enab->dten_next != NULL ||
10851 dtrace_retained == enab) {
10852 ASSERT(enab->dten_vstate->dtvs_state != NULL);
10853 ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
10854 enab->dten_vstate->dtvs_state->dts_nretained--;
10855 dtrace_retained_gen++;
10856 }
10857
10858 if (enab->dten_prev == NULL) {
10859 if (dtrace_retained == enab) {
10860 dtrace_retained = enab->dten_next;
10861
10862 if (dtrace_retained != NULL)
10863 dtrace_retained->dten_prev = NULL;
10864 }
10865 } else {
10866 ASSERT(enab != dtrace_retained);
10867 ASSERT(dtrace_retained != NULL);
10868 enab->dten_prev->dten_next = enab->dten_next;
10869 }
10870
10871 if (enab->dten_next != NULL) {
10872 ASSERT(dtrace_retained != NULL);
10873 enab->dten_next->dten_prev = enab->dten_prev;
10874 }
10875
10876 kmem_free(enab, sizeof (dtrace_enabling_t));
10877}
10878
10879static int
10880dtrace_enabling_retain(dtrace_enabling_t *enab)
10881{
10882 dtrace_state_t *state;
10883
10884 ASSERT(MUTEX_HELD(&dtrace_lock));
10885 ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
10886 ASSERT(enab->dten_vstate != NULL);
10887
10888 state = enab->dten_vstate->dtvs_state;
10889 ASSERT(state != NULL);
10890
10891 /*
10892 * We only allow each state to retain dtrace_retain_max enablings.
10893 */
10894 if (state->dts_nretained >= dtrace_retain_max)
10895 return (ENOSPC);
10896
10897 state->dts_nretained++;
10898 dtrace_retained_gen++;
10899
10900 if (dtrace_retained == NULL) {
10901 dtrace_retained = enab;
10902 return (0);
10903 }
10904
10905 enab->dten_next = dtrace_retained;
10906 dtrace_retained->dten_prev = enab;
10907 dtrace_retained = enab;
10908
10909 return (0);
10910}
10911
10912static int
10913dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
10914 dtrace_probedesc_t *create)
10915{
10916 dtrace_enabling_t *new, *enab;
10917 int found = 0, err = ENOENT;
10918
10919 ASSERT(MUTEX_HELD(&dtrace_lock));
10920 ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
10921 ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
10922 ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
10923 ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
10924
10925 new = dtrace_enabling_create(&state->dts_vstate);
10926
10927 /*
10928 * Iterate over all retained enablings, looking for enablings that
10929 * match the specified state.
10930 */
10931 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
10932 int i;
10933
10934 /*
10935 * dtvs_state can only be NULL for helper enablings -- and
10936 * helper enablings can't be retained.
10937 */
10938 ASSERT(enab->dten_vstate->dtvs_state != NULL);
10939
10940 if (enab->dten_vstate->dtvs_state != state)
10941 continue;
10942
10943 /*
10944 * Now iterate over each probe description; we're looking for
10945 * an exact match to the specified probe description.
10946 */
10947 for (i = 0; i < enab->dten_ndesc; i++) {
10948 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
10949 dtrace_probedesc_t *pd = &ep->dted_probe;
10950
10951 if (strcmp(pd->dtpd_provider, match->dtpd_provider))
10952 continue;
10953
10954 if (strcmp(pd->dtpd_mod, match->dtpd_mod))
10955 continue;
10956
10957 if (strcmp(pd->dtpd_func, match->dtpd_func))
10958 continue;
10959
10960 if (strcmp(pd->dtpd_name, match->dtpd_name))
10961 continue;
10962
10963 /*
10964 * We have a winning probe! Add it to our growing
10965 * enabling.
10966 */
10967 found = 1;
10968 dtrace_enabling_addlike(new, ep, create);
10969 }
10970 }
10971
10972 if (!found || (err = dtrace_enabling_retain(new)) != 0) {
10973 dtrace_enabling_destroy(new);
10974 return (err);
10975 }
10976
10977 return (0);
10978}
10979
10980static void
10981dtrace_enabling_retract(dtrace_state_t *state)
10982{
10983 dtrace_enabling_t *enab, *next;
10984
10985 ASSERT(MUTEX_HELD(&dtrace_lock));
10986
10987 /*
10988 * Iterate over all retained enablings, destroying the enablings retained
10989 * for the specified state.
10990 */
10991 for (enab = dtrace_retained; enab != NULL; enab = next) {
10992 next = enab->dten_next;
10993
10994 /*
10995 * dtvs_state can only be NULL for helper enablings -- and
10996 * helper enablings can't be retained.
10997 */
10998 ASSERT(enab->dten_vstate->dtvs_state != NULL);
10999
11000 if (enab->dten_vstate->dtvs_state == state) {
11001 ASSERT(state->dts_nretained > 0);
11002 dtrace_enabling_destroy(enab);
11003 }
11004 }
11005
11006 ASSERT(state->dts_nretained == 0);
11007}
11008
11009static int
11010dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched)
11011{
11012 int i = 0;
11013 int total_matched = 0, matched = 0;
11014
11015 ASSERT(MUTEX_HELD(&cpu_lock));
11016 ASSERT(MUTEX_HELD(&dtrace_lock));
11017
11018 for (i = 0; i < enab->dten_ndesc; i++) {
11019 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
11020
11021 enab->dten_current = ep;
11022 enab->dten_error = 0;
11023
11024 /*
11025 * If a provider fails to enable a probe, get out and
11026 * let the consumer know we failed.
11027 */
11028 if ((matched = dtrace_probe_enable(&ep->dted_probe, enab)) < 0)
11029 return (EBUSY);
11030
11031 total_matched += matched;
11032
11033 if (enab->dten_error != 0) {
11034 /*
11035 * If we get an error half-way through enabling the
11036 * probes, we kick out -- perhaps with some number of
11037 * them enabled. Leaving enabled probes enabled may
11038 * be slightly confusing for user-level, but we expect
11039 * that no one will attempt to actually drive on in
11040 * the face of such errors. If this is an anonymous
11041 * enabling (indicated with a NULL nmatched pointer),
11042 * we cmn_err() a message. We aren't expecting to
11043 * get such an error -- to the extent that it can
11044 * exist at all, it would be a result of corrupted
11045 * DOF in the driver properties.
11046 */
11047 if (nmatched == NULL) {
11048 cmn_err(CE_WARN, "dtrace_enabling_match() "
11049 "error on %p: %d", (void *)ep,
11050 enab->dten_error);
11051 }
11052
11053 return (enab->dten_error);
11054 }
11055 }
11056
11057 enab->dten_probegen = dtrace_probegen;
11058 if (nmatched != NULL)
11059 *nmatched = total_matched;
11060
11061 return (0);
11062}
11063
11064static void
11065dtrace_enabling_matchall(void)
11066{
11067 dtrace_enabling_t *enab;
11068
11069 mutex_enter(&cpu_lock);
11070 mutex_enter(&dtrace_lock);
11071
11072 /*
11073 * Iterate over all retained enablings to see if any probes match
11074 * against them. We only perform this operation on enablings for which
11075 * we have sufficient permissions by virtue of being in the global zone
11076 * or in the same zone as the DTrace client. Because we can be called
11077 * after dtrace_detach() has been called, we cannot assert that there
11078 * are retained enablings. We can safely load from dtrace_retained,
11079 * however: the taskq_destroy() at the end of dtrace_detach() will
11080 * block pending our completion.
11081 */
11082 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
11083 cred_t *cr = enab->dten_vstate->dtvs_state->dts_cred.dcr_cred;
11084
11085#ifndef VBOX
11086 if (INGLOBALZONE(curproc) ||
11087 (cr != NULL && getzoneid() == crgetzoneid(cr)))
11088#endif
11089 (void) dtrace_enabling_match(enab, NULL);
11090 }
11091
11092 mutex_exit(&dtrace_lock);
11093 mutex_exit(&cpu_lock);
11094}
11095
11096/*
11097 * If an enabling is to be enabled without having matched probes (that is, if
11098 * dtrace_state_go() is to be called on the underlying dtrace_state_t), the
11099 * enabling must be _primed_ by creating an ECB for every ECB description.
11100 * This must be done to assure that we know the number of speculations, the
11101 * number of aggregations, the minimum buffer size needed, etc. before we
11102 * transition out of DTRACE_ACTIVITY_INACTIVE. To do this without actually
11103 * enabling any probes, we create ECBs for every ECB description, but with a
11104 * NULL probe -- which is exactly what this function does.
11105 */
11106static void
11107dtrace_enabling_prime(dtrace_state_t *state)
11108{
11109 dtrace_enabling_t *enab;
11110 int i;
11111
11112 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
11113 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11114
11115 if (enab->dten_vstate->dtvs_state != state)
11116 continue;
11117
11118 /*
11119 * We don't want to prime an enabling more than once, lest
11120 * we allow a malicious user to induce resource exhaustion.
11121 * (The ECBs that result from priming an enabling aren't
11122 * leaked -- but they also aren't deallocated until the
11123 * consumer state is destroyed.)
11124 */
11125 if (enab->dten_primed)
11126 continue;
11127
11128 for (i = 0; i < enab->dten_ndesc; i++) {
11129 enab->dten_current = enab->dten_desc[i];
11130 (void) dtrace_probe_enable(NULL, enab);
11131 }
11132
11133 enab->dten_primed = 1;
11134 }
11135}
11136
11137/*
11138 * Called to indicate that probes should be provided due to retained
11139 * enablings. This is implemented in terms of dtrace_probe_provide(), but it
11140 * must take an initial lap through the enabling calling the dtps_provide()
11141 * entry point explicitly to allow for autocreated probes.
11142 */
11143static void
11144dtrace_enabling_provide(dtrace_provider_t *prv)
11145{
11146 int i, all = 0;
11147 dtrace_probedesc_t desc;
11148 dtrace_genid_t gen;
11149
11150 ASSERT(MUTEX_HELD(&dtrace_lock));
11151 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
11152
11153 if (prv == NULL) {
11154 all = 1;
11155 prv = dtrace_provider;
11156 }
11157
11158 do {
11159 dtrace_enabling_t *enab;
11160 void *parg = prv->dtpv_arg;
11161
11162retry:
11163 gen = dtrace_retained_gen;
11164 for (enab = dtrace_retained; enab != NULL;
11165 enab = enab->dten_next) {
11166 for (i = 0; i < enab->dten_ndesc; i++) {
11167 desc = enab->dten_desc[i]->dted_probe;
11168 mutex_exit(&dtrace_lock);
11169 prv->dtpv_pops.dtps_provide(parg, &desc);
11170 mutex_enter(&dtrace_lock);
11171 /*
11172 * Process the retained enablings again if
11173 * they have changed while we weren't holding
11174 * dtrace_lock.
11175 */
11176 if (gen != dtrace_retained_gen)
11177 goto retry;
11178 }
11179 }
11180 } while (all && (prv = prv->dtpv_next) != NULL);
11181
11182 mutex_exit(&dtrace_lock);
11183 dtrace_probe_provide(NULL, all ? NULL : prv);
11184 mutex_enter(&dtrace_lock);
11185}
11186
11187/*
11188 * DTrace DOF Functions
11189 */
11190/*ARGSUSED*/
11191static void
11192dtrace_dof_error(dof_hdr_t *dof, const char *str)
11193{
11194 if (dtrace_err_verbose)
11195 cmn_err(CE_WARN, "failed to process DOF: %s", str);
11196
11197#ifdef DTRACE_ERRDEBUG
11198 dtrace_errdebug(str);
11199#endif
11200}
11201
11202/*
11203 * Create DOF out of a currently enabled state. Right now, we only create
11204 * DOF containing the run-time options -- but this could be expanded to create
11205 * complete DOF representing the enabled state.
11206 */
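/*
 * The generated DOF has the following layout (annotation, not part of the
 * original source); all offsets and sizes follow directly from the code
 * below:
 *
 *	+-------------------------------------+ offset 0
 *	| dof_hdr_t                           |
 *	+-------------------------------------+ sizeof (dof_hdr_t)
 *	| dof_sec_t (DOF_SECT_OPTDESC)        |
 *	+-------------------------------------+ (rounded up to uint64_t)
 *	| dof_optdesc_t[DTRACEOPT_MAX]        |
 *	+-------------------------------------+ dofh_loadsz == dofh_filesz
 */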
11207static dof_hdr_t *
11208dtrace_dof_create(dtrace_state_t *state)
11209{
11210 dof_hdr_t *dof;
11211 dof_sec_t *sec;
11212 dof_optdesc_t *opt;
11213 int i, len = sizeof (dof_hdr_t) +
11214 roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
11215 sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
11216
11217 ASSERT(MUTEX_HELD(&dtrace_lock));
11218
11219 dof = kmem_zalloc(len, KM_SLEEP);
11220 dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
11221 dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
11222 dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
11223 dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
11224
11225 dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
11226 dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
11227 dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
11228 dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
11229 dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
11230 dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
11231
11232 dof->dofh_flags = 0;
11233 dof->dofh_hdrsize = sizeof (dof_hdr_t);
11234 dof->dofh_secsize = sizeof (dof_sec_t);
11235 dof->dofh_secnum = 1; /* only DOF_SECT_OPTDESC */
11236 dof->dofh_secoff = sizeof (dof_hdr_t);
11237 dof->dofh_loadsz = len;
11238 dof->dofh_filesz = len;
11239 dof->dofh_pad = 0;
11240
11241 /*
11242 * Fill in the option section header...
11243 */
11244 sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
11245 sec->dofs_type = DOF_SECT_OPTDESC;
11246 sec->dofs_align = sizeof (uint64_t);
11247 sec->dofs_flags = DOF_SECF_LOAD;
11248 sec->dofs_entsize = sizeof (dof_optdesc_t);
11249
11250 opt = (dof_optdesc_t *)((uintptr_t)sec +
11251 roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
11252
11253 sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
11254 sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
11255
11256 for (i = 0; i < DTRACEOPT_MAX; i++) {
11257 opt[i].dofo_option = i;
11258 opt[i].dofo_strtab = DOF_SECIDX_NONE;
11259 opt[i].dofo_value = state->dts_options[i];
11260 }
11261
11262 return (dof);
11263}
11264
11265static dof_hdr_t *
11266dtrace_dof_copyin(uintptr_t uarg, int *errp)
11267{
11268 dof_hdr_t hdr, *dof;
11269
11270 ASSERT(!MUTEX_HELD(&dtrace_lock));
11271
11272 /*
11273 * First, we're going to copyin() the sizeof (dof_hdr_t).
11274 */
11275 if (copyin((void *)uarg, &hdr, sizeof (hdr)) != 0) {
11276 dtrace_dof_error(NULL, "failed to copyin DOF header");
11277 *errp = EFAULT;
11278 return (NULL);
11279 }
11280
11281 /*
11282 * Now we'll allocate the entire DOF and copy it in -- provided
11283 * that the length isn't outrageous.
11284 */
11285 if (hdr.dofh_loadsz >= VBDTCAST(uint64_t)dtrace_dof_maxsize) {
11286 dtrace_dof_error(&hdr, "load size exceeds maximum");
11287 *errp = E2BIG;
11288 return (NULL);
11289 }
11290
11291 if (hdr.dofh_loadsz < sizeof (hdr)) {
11292 dtrace_dof_error(&hdr, "invalid load size");
11293 *errp = EINVAL;
11294 return (NULL);
11295 }
11296
11297 dof = kmem_alloc(hdr.dofh_loadsz, KM_SLEEP);
11298
11299 if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0 ||
11300 dof->dofh_loadsz != hdr.dofh_loadsz) {
11301 kmem_free(dof, hdr.dofh_loadsz);
11302 *errp = EFAULT;
11303 return (NULL);
11304 }
11305
11306 return (dof);
11307}
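/*
 * Illustrative sketch (annotation, not part of the original source):
 * callers pass a user-space pointer and an error out-parameter, e.g.:
 *
 *	int rval;
 *	dof_hdr_t *dof = dtrace_dof_copyin(arg, &rval);
 *
 *	if (dof == NULL)
 *		return (rval);
 *
 * The second, full-sized copyin() re-reads the header, so dofh_loadsz is
 * re-checked against the first read above to defuse a user racing to grow
 * it between the two copies.
 */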
11308
11309static dof_hdr_t *
11310dtrace_dof_property(const char *name)
11311{
11312#ifndef VBOX
11313 uchar_t *buf;
11314 uint64_t loadsz;
11315 unsigned int len, i;
11316 dof_hdr_t *dof;
11317
11318 /*
11319 * Unfortunately, arrays of values in .conf files are always (and
11320 * only) interpreted to be integer arrays. We must read our DOF
11321 * as an integer array, and then squeeze it into a byte array.
11322 */
11323 if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dtrace_devi, 0,
11324 (char *)name, (int **)&buf, &len) != DDI_PROP_SUCCESS)
11325 return (NULL);
11326
11327 for (i = 0; i < len; i++)
11328 buf[i] = (uchar_t)(((int *)buf)[i]);
11329
11330 if (len < sizeof (dof_hdr_t)) {
11331 ddi_prop_free(buf);
11332 dtrace_dof_error(NULL, "truncated header");
11333 return (NULL);
11334 }
11335
11336 if (len < (loadsz = ((dof_hdr_t *)buf)->dofh_loadsz)) {
11337 ddi_prop_free(buf);
11338 dtrace_dof_error(NULL, "truncated DOF");
11339 return (NULL);
11340 }
11341
11342 if (loadsz >= dtrace_dof_maxsize) {
11343 ddi_prop_free(buf);
11344 dtrace_dof_error(NULL, "oversized DOF");
11345 return (NULL);
11346 }
11347
11348 dof = kmem_alloc(loadsz, KM_SLEEP);
11349 bcopy(buf, dof, loadsz);
11350 ddi_prop_free(buf);
11351
11352 return (dof);
11353#else /* VBOX */
11354 return (NULL);
11355#endif /* VBOX */
11356}
11357
11358static void
11359dtrace_dof_destroy(dof_hdr_t *dof)
11360{
11361 kmem_free(dof, dof->dofh_loadsz);
11362}
11363
11364/*
11365 * Return the dof_sec_t pointer corresponding to a given section index. If the
11366 * index is not valid, dtrace_dof_error() is called and NULL is returned. If
11367 * a type other than DOF_SECT_NONE is specified, the header is checked against
11368 * this type and NULL is returned if the types do not match.
11369 */
11370static dof_sec_t *
11371dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
11372{
11373 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)
11374 ((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);
11375
11376 if (i >= dof->dofh_secnum) {
11377 dtrace_dof_error(dof, "referenced section index is invalid");
11378 return (NULL);
11379 }
11380
11381 if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
11382 dtrace_dof_error(dof, "referenced section is not loadable");
11383 return (NULL);
11384 }
11385
11386 if (type != DOF_SECT_NONE && type != sec->dofs_type) {
11387 dtrace_dof_error(dof, "referenced section is the wrong type");
11388 return (NULL);
11389 }
11390
11391 return (sec);
11392}
11393
11394static dtrace_probedesc_t *
11395dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
11396{
11397 dof_probedesc_t *probe;
11398 dof_sec_t *strtab;
11399 uintptr_t daddr = (uintptr_t)dof;
11400 uintptr_t str;
11401 size_t size;
11402
11403 if (sec->dofs_type != DOF_SECT_PROBEDESC) {
11404 dtrace_dof_error(dof, "invalid probe section");
11405 return (NULL);
11406 }
11407
11408 if (sec->dofs_align != sizeof (dof_secidx_t)) {
11409 dtrace_dof_error(dof, "bad alignment in probe description");
11410 return (NULL);
11411 }
11412
11413 if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
11414 dtrace_dof_error(dof, "truncated probe description");
11415 return (NULL);
11416 }
11417
11418 probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
11419 strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);
11420
11421 if (strtab == NULL)
11422 return (NULL);
11423
11424 str = daddr + strtab->dofs_offset;
11425 size = strtab->dofs_size;
11426
11427 if (probe->dofp_provider >= strtab->dofs_size) {
11428 dtrace_dof_error(dof, "corrupt probe provider");
11429 return (NULL);
11430 }
11431
11432 (void) strncpy(desc->dtpd_provider,
11433 (char *)(str + probe->dofp_provider),
11434 MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));
11435
11436 if (probe->dofp_mod >= strtab->dofs_size) {
11437 dtrace_dof_error(dof, "corrupt probe module");
11438 return (NULL);
11439 }
11440
11441 (void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
11442 MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));
11443
11444 if (probe->dofp_func >= strtab->dofs_size) {
11445 dtrace_dof_error(dof, "corrupt probe function");
11446 return (NULL);
11447 }
11448
11449 (void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
11450 MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));
11451
11452 if (probe->dofp_name >= strtab->dofs_size) {
11453 dtrace_dof_error(dof, "corrupt probe name");
11454 return (NULL);
11455 }
11456
11457 (void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
11458 MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));
11459
11460 return (desc);
11461}
11462
11463static dtrace_difo_t *
11464dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
11465 cred_t *cr)
11466{
11467 dtrace_difo_t *dp;
11468 size_t ttl = 0;
11469 dof_difohdr_t *dofd;
11470 uintptr_t daddr = (uintptr_t)dof;
11471 size_t max = dtrace_difo_maxsize;
11472 int i, l, n;
11473
11474 static const struct {
11475 int section;
11476 int bufoffs;
11477 int lenoffs;
11478 int entsize;
11479 int align;
11480 const char *msg;
11481 } difo[] = {
11482 { DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
11483 offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
11484 sizeof (dif_instr_t), "multiple DIF sections" },
11485
11486 { DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
11487 offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
11488 sizeof (uint64_t), "multiple integer tables" },
11489
11490 { DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
11491 offsetof(dtrace_difo_t, dtdo_strlen), 0,
11492 sizeof (char), "multiple string tables" },
11493
11494 { DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
11495 offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
11496 sizeof (uint_t), "multiple variable tables" },
11497
11498 { DOF_SECT_NONE, 0, 0, 0, 0, NULL }
11499 };
11500
11501 if (sec->dofs_type != DOF_SECT_DIFOHDR) {
11502 dtrace_dof_error(dof, "invalid DIFO header section");
11503 return (NULL);
11504 }
11505
11506 if (sec->dofs_align != sizeof (dof_secidx_t)) {
11507 dtrace_dof_error(dof, "bad alignment in DIFO header");
11508 return (NULL);
11509 }
11510
11511 if (sec->dofs_size < sizeof (dof_difohdr_t) ||
11512 sec->dofs_size % sizeof (dof_secidx_t)) {
11513 dtrace_dof_error(dof, "bad size in DIFO header");
11514 return (NULL);
11515 }
11516
11517 dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
11518 n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;
11519
11520 dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
11521 dp->dtdo_rtype = dofd->dofd_rtype;
11522
11523 for (l = 0; l < n; l++) {
11524 dof_sec_t *subsec;
11525 void **bufp;
11526 uint32_t *lenp;
11527
11528 if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
11529 dofd->dofd_links[l])) == NULL)
11530 goto err; /* invalid section link */
11531
11532 if (ttl + subsec->dofs_size > max) {
11533 dtrace_dof_error(dof, "exceeds maximum size");
11534 goto err;
11535 }
11536
11537 ttl += subsec->dofs_size;
11538
11539 for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {
11540 if (subsec->dofs_type != VBDTCAST(uint32_t)difo[i].section)
11541 continue;
11542
11543 if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
11544 dtrace_dof_error(dof, "section not loaded");
11545 goto err;
11546 }
11547
11548 if (subsec->dofs_align != VBDTCAST(uint32_t)difo[i].align) {
11549 dtrace_dof_error(dof, "bad alignment");
11550 goto err;
11551 }
11552
11553 bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
11554 lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);
11555
11556 if (*bufp != NULL) {
11557 dtrace_dof_error(dof, difo[i].msg);
11558 goto err;
11559 }
11560
11561 if (VBDTCAST(uint32_t)difo[i].entsize != subsec->dofs_entsize) {
11562 dtrace_dof_error(dof, "entry size mismatch");
11563 goto err;
11564 }
11565
11566 if (subsec->dofs_entsize != 0 &&
11567 (subsec->dofs_size % subsec->dofs_entsize) != 0) {
11568 dtrace_dof_error(dof, "corrupt entry size");
11569 goto err;
11570 }
11571
11572 *lenp = subsec->dofs_size;
11573 *bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
11574 bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
11575 *bufp, subsec->dofs_size);
11576
11577 if (subsec->dofs_entsize != 0)
11578 *lenp /= subsec->dofs_entsize;
11579
11580 break;
11581 }
11582
11583 /*
11584 * If we encounter a loadable DIFO sub-section that is not
11585 * known to us, assume this is a broken program and fail.
11586 */
11587 if (difo[i].section == DOF_SECT_NONE &&
11588 (subsec->dofs_flags & DOF_SECF_LOAD)) {
11589 dtrace_dof_error(dof, "unrecognized DIFO subsection");
11590 goto err;
11591 }
11592 }
11593
11594 if (dp->dtdo_buf == NULL) {
11595 /*
11596 * We can't have a DIF object without DIF text.
11597 */
11598 dtrace_dof_error(dof, "missing DIF text");
11599 goto err;
11600 }
11601
11602 /*
11603 * Before we validate the DIF object, run through the variable table
11604 * looking for string variables -- if any of their sizes are zero, we'll
11605 * set their size to be the system-wide default string size. Note that
11606 * this should _not_ happen if the "strsize" option has been set --
11607 * in this case, the compiler should have set the size to reflect the
11608 * setting of the option.
11609 */
11610 for (i = 0; VBDTCAST(unsigned)i < dp->dtdo_varlen; i++) {
11611 dtrace_difv_t *v = &dp->dtdo_vartab[i];
11612 dtrace_diftype_t *t = &v->dtdv_type;
11613
11614 if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
11615 continue;
11616
11617 if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
11618 t->dtdt_size = dtrace_strsize_default;
11619 }
11620
11621 if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
11622 goto err;
11623
11624 dtrace_difo_init(dp, vstate);
11625 return (dp);
11626
11627err:
11628 kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
11629 kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
11630 kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
11631 kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
11632
11633 kmem_free(dp, sizeof (dtrace_difo_t));
11634 return (NULL);
11635}
11636
11637static dtrace_predicate_t *
11638dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
11639 cred_t *cr)
11640{
11641 dtrace_difo_t *dp;
11642
11643 if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
11644 return (NULL);
11645
11646 return (dtrace_predicate_create(dp));
11647}
11648
11649static dtrace_actdesc_t *
11650dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
11651 cred_t *cr)
11652{
11653 dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
11654 dof_actdesc_t *desc;
11655 dof_sec_t *difosec;
11656 size_t offs;
11657 uintptr_t daddr = (uintptr_t)dof;
11658 uint64_t arg;
11659 dtrace_actkind_t kind;
11660
11661 if (sec->dofs_type != DOF_SECT_ACTDESC) {
11662 dtrace_dof_error(dof, "invalid action section");
11663 return (NULL);
11664 }
11665
11666 if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
11667 dtrace_dof_error(dof, "truncated action description");
11668 return (NULL);
11669 }
11670
11671 if (sec->dofs_align != sizeof (uint64_t)) {
11672 dtrace_dof_error(dof, "bad alignment in action description");
11673 return (NULL);
11674 }
11675
11676 if (sec->dofs_size < sec->dofs_entsize) {
11677 dtrace_dof_error(dof, "section entry size exceeds total size");
11678 return (NULL);
11679 }
11680
11681 if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
11682 dtrace_dof_error(dof, "bad entry size in action description");
11683 return (NULL);
11684 }
11685
11686 if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
11687 dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
11688 return (NULL);
11689 }
11690
11691 for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
11692 desc = (dof_actdesc_t *)(daddr +
11693 (uintptr_t)sec->dofs_offset + offs);
11694 kind = (dtrace_actkind_t)desc->dofa_kind;
11695
11696 if (DTRACEACT_ISPRINTFLIKE(kind) &&
11697 (kind != DTRACEACT_PRINTA ||
11698 desc->dofa_strtab != DOF_SECIDX_NONE)) {
11699 dof_sec_t *strtab;
11700 char *str, *fmt;
11701 uint64_t i;
11702
11703 /*
11704 * printf()-like actions must have a format string.
11705 */
11706 if ((strtab = dtrace_dof_sect(dof,
11707 DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
11708 goto err;
11709
11710 str = (char *)((uintptr_t)dof +
11711 (uintptr_t)strtab->dofs_offset);
11712
11713 for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
11714 if (str[i] == '\0')
11715 break;
11716 }
11717
11718 if (i >= strtab->dofs_size) {
11719 dtrace_dof_error(dof, "bogus format string");
11720 goto err;
11721 }
11722
11723 if (i == desc->dofa_arg) {
11724 dtrace_dof_error(dof, "empty format string");
11725 goto err;
11726 }
11727
11728 i -= desc->dofa_arg;
11729 fmt = kmem_alloc(i + 1, KM_SLEEP);
11730 bcopy(&str[desc->dofa_arg], fmt, i + 1);
11731 arg = (uint64_t)(uintptr_t)fmt;
11732 } else {
11733 if (kind == DTRACEACT_PRINTA) {
11734 ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
11735 arg = 0;
11736 } else {
11737 arg = desc->dofa_arg;
11738 }
11739 }
11740
11741 act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
11742 desc->dofa_uarg, arg);
11743
11744 if (last != NULL) {
11745 last->dtad_next = act;
11746 } else {
11747 first = act;
11748 }
11749
11750 last = act;
11751
11752 if (desc->dofa_difo == DOF_SECIDX_NONE)
11753 continue;
11754
11755 if ((difosec = dtrace_dof_sect(dof,
11756 DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
11757 goto err;
11758
11759 act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);
11760
11761 if (act->dtad_difo == NULL)
11762 goto err;
11763 }
11764
11765 ASSERT(first != NULL);
11766 return (first);
11767
11768err:
11769 for (act = first; act != NULL; act = next) {
11770 next = act->dtad_next;
11771 dtrace_actdesc_release(act, vstate);
11772 }
11773
11774 return (NULL);
11775}
11776
11777static dtrace_ecbdesc_t *
11778dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
11779 cred_t *cr)
11780{
11781 dtrace_ecbdesc_t *ep;
11782 dof_ecbdesc_t *ecb;
11783 dtrace_probedesc_t *desc;
11784 dtrace_predicate_t *pred = NULL;
11785
11786 if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
11787 dtrace_dof_error(dof, "truncated ECB description");
11788 return (NULL);
11789 }
11790
11791 if (sec->dofs_align != sizeof (uint64_t)) {
11792 dtrace_dof_error(dof, "bad alignment in ECB description");
11793 return (NULL);
11794 }
11795
11796 ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
11797 sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);
11798
11799 if (sec == NULL)
11800 return (NULL);
11801
11802 ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
11803 ep->dted_uarg = ecb->dofe_uarg;
11804 desc = &ep->dted_probe;
11805
11806 if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
11807 goto err;
11808
11809 if (ecb->dofe_pred != DOF_SECIDX_NONE) {
11810 if ((sec = dtrace_dof_sect(dof,
11811 DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
11812 goto err;
11813
11814 if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
11815 goto err;
11816
11817 ep->dted_pred.dtpdd_predicate = pred;
11818 }
11819
11820 if (ecb->dofe_actions != DOF_SECIDX_NONE) {
11821 if ((sec = dtrace_dof_sect(dof,
11822 DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
11823 goto err;
11824
11825 ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);
11826
11827 if (ep->dted_action == NULL)
11828 goto err;
11829 }
11830
11831 return (ep);
11832
11833err:
11834 if (pred != NULL)
11835 dtrace_predicate_release(pred, vstate);
11836 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
11837 return (NULL);
11838}
11839
11840/*
11841 * Apply the relocations from the specified 'sec' (a DOF_SECT_URELHDR) to the
11842 * specified DOF. At present, this amounts to simply adding 'ubase' to the
11843 * site of any user SETX relocations to account for load object base address.
11844 * In the future, if we need other relocations, this function can be extended.
11845 */
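/*
 * Concretely (annotation, not part of the original source), for a
 * DOF_RELO_SETX entry r against target section ts, the loop below performs:
 *
 *	*(uint64_t *)(daddr + ts->dofs_offset + r->dofr_offset) += ubase;
 *
 * after checking that the 64-bit relocation site lies entirely within ts
 * and is 8-byte aligned.
 */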
11846static int
11847dtrace_dof_relocate(dof_hdr_t *dof, dof_sec_t *sec, uint64_t ubase)
11848{
11849 uintptr_t daddr = (uintptr_t)dof;
11850 dof_relohdr_t *dofr =
11851 (dof_relohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
11852 dof_sec_t *ss, *rs, *ts;
11853 dof_relodesc_t *r;
11854 uint_t i, n;
11855
11856 if (sec->dofs_size < sizeof (dof_relohdr_t) ||
11857 sec->dofs_align != sizeof (dof_secidx_t)) {
11858 dtrace_dof_error(dof, "invalid relocation header");
11859 return (-1);
11860 }
11861
11862 ss = dtrace_dof_sect(dof, DOF_SECT_STRTAB, dofr->dofr_strtab);
11863 rs = dtrace_dof_sect(dof, DOF_SECT_RELTAB, dofr->dofr_relsec);
11864 ts = dtrace_dof_sect(dof, DOF_SECT_NONE, dofr->dofr_tgtsec);
11865
11866 if (ss == NULL || rs == NULL || ts == NULL)
11867 return (-1); /* dtrace_dof_error() has been called already */
11868
11869 if (rs->dofs_entsize < sizeof (dof_relodesc_t) ||
11870 rs->dofs_align != sizeof (uint64_t)) {
11871 dtrace_dof_error(dof, "invalid relocation section");
11872 return (-1);
11873 }
11874
11875 r = (dof_relodesc_t *)(uintptr_t)(daddr + rs->dofs_offset);
11876 n = rs->dofs_size / rs->dofs_entsize;
11877
11878 for (i = 0; i < n; i++) {
11879 uintptr_t taddr = daddr + ts->dofs_offset + r->dofr_offset;
11880
11881 switch (r->dofr_type) {
11882 case DOF_RELO_NONE:
11883 break;
11884 case DOF_RELO_SETX:
11885 if (r->dofr_offset >= ts->dofs_size || r->dofr_offset +
11886 sizeof (uint64_t) > ts->dofs_size) {
11887 dtrace_dof_error(dof, "bad relocation offset");
11888 return (-1);
11889 }
11890
11891 if (!IS_P2ALIGNED(taddr, sizeof (uint64_t))) {
11892 dtrace_dof_error(dof, "misaligned setx relo");
11893 return (-1);
11894 }
11895
11896 *(uint64_t *)taddr += ubase;
11897 break;
11898 default:
11899 dtrace_dof_error(dof, "invalid relocation type");
11900 return (-1);
11901 }
11902
11903 r = (dof_relodesc_t *)((uintptr_t)r + rs->dofs_entsize);
11904 }
11905
11906 return (0);
11907}
11908
11909/*
11910 * The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
11911 * header: it should be at the front of a memory region that is at least
11912 * sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
11913 * size. It need not be validated in any other way.
11914 */
11915static int
11916dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
11917 dtrace_enabling_t **enabp, uint64_t ubase, int noprobes)
11918{
11919 uint64_t len = dof->dofh_loadsz, seclen;
11920 uintptr_t daddr = (uintptr_t)dof;
11921 dtrace_ecbdesc_t *ep;
11922 dtrace_enabling_t *enab;
11923 uint_t i;
11924
11925 ASSERT(MUTEX_HELD(&dtrace_lock));
11926 ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));
11927
11928 /*
11929 * Check the DOF header identification bytes. In addition to checking
11930 * valid settings, we also verify that unused bits/bytes are zeroed so
11931 * we can use them later without fear of regressing existing binaries.
11932 */
11933 if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],
11934 DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {
11935 dtrace_dof_error(dof, "DOF magic string mismatch");
11936 return (-1);
11937 }
11938
11939 if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
11940 dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
11941 dtrace_dof_error(dof, "DOF has invalid data model");
11942 return (-1);
11943 }
11944
11945 if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
11946 dtrace_dof_error(dof, "DOF encoding mismatch");
11947 return (-1);
11948 }
11949
11950 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
11951 dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_2) {
11952 dtrace_dof_error(dof, "DOF version mismatch");
11953 return (-1);
11954 }
11955
11956 if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
11957 dtrace_dof_error(dof, "DOF uses unsupported instruction set");
11958 return (-1);
11959 }
11960
11961 if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
11962 dtrace_dof_error(dof, "DOF uses too many integer registers");
11963 return (-1);
11964 }
11965
11966 if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
11967 dtrace_dof_error(dof, "DOF uses too many tuple registers");
11968 return (-1);
11969 }
11970
11971 for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
11972 if (dof->dofh_ident[i] != 0) {
11973 dtrace_dof_error(dof, "DOF has invalid ident byte set");
11974 return (-1);
11975 }
11976 }
11977
11978 if (dof->dofh_flags & ~DOF_FL_VALID) {
11979 dtrace_dof_error(dof, "DOF has invalid flag bits set");
11980 return (-1);
11981 }
11982
11983 if (dof->dofh_secsize == 0) {
11984 dtrace_dof_error(dof, "zero section header size");
11985 return (-1);
11986 }
11987
11988 /*
11989 * Check that the section headers don't exceed the amount of DOF
11990 * data. Note that we cast the section size and number of sections
11991 * to uint64_t's to prevent possible overflow in the multiplication.
11992 */
11993 seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;
11994
11995 if (dof->dofh_secoff > len || seclen > len ||
11996 dof->dofh_secoff + seclen > len) {
11997 dtrace_dof_error(dof, "truncated section headers");
11998 return (-1);
11999 }
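	/*
	 * Added note: the 64-bit casts matter. With 32-bit arithmetic a
	 * crafted DOF with, say, dofh_secnum = 0x01000000 and
	 * dofh_secsize = 0x100 would compute 0x100000000, which truncates
	 * to 0 and would sail past this bounds check; widening first
	 * keeps the full product for the comparison against 'len'.
	 */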
12000
12001 if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
12002 dtrace_dof_error(dof, "misaligned section headers");
12003 return (-1);
12004 }
12005
12006 if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
12007 dtrace_dof_error(dof, "misaligned section size");
12008 return (-1);
12009 }
12010
12011 /*
12012 * Take an initial pass through the section headers to be sure that
12013 * the headers don't have stray offsets. If the 'noprobes' flag is
12014 * set, do not permit sections relating to providers, probes, or args.
12015 */
12016 for (i = 0; i < dof->dofh_secnum; i++) {
12017 dof_sec_t *sec = (dof_sec_t *)(daddr +
12018 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12019
12020 if (noprobes) {
12021 switch (sec->dofs_type) {
12022 case DOF_SECT_PROVIDER:
12023 case DOF_SECT_PROBES:
12024 case DOF_SECT_PRARGS:
12025 case DOF_SECT_PROFFS:
12026 dtrace_dof_error(dof, "illegal sections "
12027 "for enabling");
12028 return (-1);
12029 }
12030 }
12031
12032 if (DOF_SEC_ISLOADABLE(sec->dofs_type) &&
12033 !(sec->dofs_flags & DOF_SECF_LOAD)) {
12034 dtrace_dof_error(dof, "loadable section with load "
12035 "flag unset");
12036 return (-1);
12037 }
12038
12039 if (!(sec->dofs_flags & DOF_SECF_LOAD))
12040 continue; /* just ignore non-loadable sections */
12041
12042 if (sec->dofs_align & (sec->dofs_align - 1)) {
12043 dtrace_dof_error(dof, "bad section alignment");
12044 return (-1);
12045 }
12046
12047 if (sec->dofs_offset & (sec->dofs_align - 1)) {
12048 dtrace_dof_error(dof, "misaligned section");
12049 return (-1);
12050 }
12051
12052 if (sec->dofs_offset > len || sec->dofs_size > len ||
12053 sec->dofs_offset + sec->dofs_size > len) {
12054 dtrace_dof_error(dof, "corrupt section header");
12055 return (-1);
12056 }
12057
12058 if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +
12059 sec->dofs_offset + sec->dofs_size - 1) != '\0') {
12060 dtrace_dof_error(dof, "non-terminating string table");
12061 return (-1);
12062 }
12063 }
12064
12065 /*
12066 * Take a second pass through the sections and locate and perform any
12067 * relocations that are present. We do this after the first pass to
12068 * be sure that all sections have had their headers validated.
12069 */
12070 for (i = 0; i < dof->dofh_secnum; i++) {
12071 dof_sec_t *sec = (dof_sec_t *)(daddr +
12072 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12073
12074 if (!(sec->dofs_flags & DOF_SECF_LOAD))
12075 continue; /* skip sections that are not loadable */
12076
12077 switch (sec->dofs_type) {
12078 case DOF_SECT_URELHDR:
12079 if (dtrace_dof_relocate(dof, sec, ubase) != 0)
12080 return (-1);
12081 break;
12082 }
12083 }
12084
12085 if ((enab = *enabp) == NULL)
12086 enab = *enabp = dtrace_enabling_create(vstate);
12087
12088 for (i = 0; i < dof->dofh_secnum; i++) {
12089 dof_sec_t *sec = (dof_sec_t *)(daddr +
12090 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12091
12092 if (sec->dofs_type != DOF_SECT_ECBDESC)
12093 continue;
12094
12095 if ((ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr)) == NULL) {
12096 dtrace_enabling_destroy(enab);
12097 *enabp = NULL;
12098 return (-1);
12099 }
12100
12101 dtrace_enabling_add(enab, ep);
12102 }
12103
12104 return (0);
12105}
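/*
 * Added outline: a DOF image flows through three passes above --
 * (1) header identification plus per-section bounds and alignment
 * checks, (2) application of any DOF_SECT_URELHDR relocations against
 * 'ubase', and (3) conversion of each DOF_SECT_ECBDESC into a
 * dtrace_ecbdesc_t added to the enabling. A failure in pass three
 * destroys the enabling and NULLs *enabp, so callers never observe a
 * half-populated enabling.
 */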
12106
12107/*
12108 * Process DOF for any options. This routine assumes that the DOF has been
12109 * at least processed by dtrace_dof_slurp().
12110 */
12111static int
12112dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
12113{
12114 int i, rval;
12115 uint32_t entsize;
12116 size_t offs;
12117 dof_optdesc_t *desc;
12118
12119 for (i = 0; VBDTCAST(unsigned)i < dof->dofh_secnum; i++) {
12120 dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
12121 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12122
12123 if (sec->dofs_type != DOF_SECT_OPTDESC)
12124 continue;
12125
12126 if (sec->dofs_align != sizeof (uint64_t)) {
12127 dtrace_dof_error(dof, "bad alignment in "
12128 "option description");
12129 return (EINVAL);
12130 }
12131
12132 if ((entsize = sec->dofs_entsize) == 0) {
12133 dtrace_dof_error(dof, "zeroed option entry size");
12134 return (EINVAL);
12135 }
12136
12137 if (entsize < sizeof (dof_optdesc_t)) {
12138 dtrace_dof_error(dof, "bad option entry size");
12139 return (EINVAL);
12140 }
12141
12142 for (offs = 0; offs < sec->dofs_size; offs += entsize) {
12143 desc = (dof_optdesc_t *)((uintptr_t)dof +
12144 (uintptr_t)sec->dofs_offset + offs);
12145
12146 if (desc->dofo_strtab != DOF_SECIDX_NONE) {
12147 dtrace_dof_error(dof, "non-zero option string");
12148 return (EINVAL);
12149 }
12150
12151 if (desc->dofo_value == VBDTCAST(uint64_t)DTRACEOPT_UNSET) {
12152 dtrace_dof_error(dof, "unset option");
12153 return (EINVAL);
12154 }
12155
12156 if ((rval = dtrace_state_option(state,
12157 desc->dofo_option, desc->dofo_value)) != 0) {
12158 dtrace_dof_error(dof, "rejected option");
12159 return (rval);
12160 }
12161 }
12162 }
12163
12164 return (0);
12165}
12166
12167/*
12168 * DTrace Consumer State Functions
12169 */
12170VBDTSTATIC int
12171dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
12172{
12173 size_t hashsize, maxper, min, chunksize = dstate->dtds_chunksize;
12174 void *base;
12175 uintptr_t limit;
12176 dtrace_dynvar_t *dvar, *next, *start;
12177 VBDTTYPE(size_t,int) i;
12178
12179 ASSERT(MUTEX_HELD(&dtrace_lock));
12180 ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);
12181
12182 bzero(dstate, sizeof (dtrace_dstate_t));
12183
12184 if ((dstate->dtds_chunksize = chunksize) == 0)
12185 dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
12186
12187 if (size < (min = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
12188 size = min;
12189
12190 if ((base = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
12191 return (ENOMEM);
12192
12193 dstate->dtds_size = size;
12194 dstate->dtds_base = base;
12195 dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
12196 bzero(dstate->dtds_percpu, NCPU * sizeof (dtrace_dstate_percpu_t));
12197
12198 hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
12199
12200 if (hashsize != 1 && (hashsize & 1))
12201 hashsize--;
12202
12203 dstate->dtds_hashsize = hashsize;
12204 dstate->dtds_hash = dstate->dtds_base;
12205
12206 /*
12207 * Set all of our hash buckets to point to the single sink, and (if
12208 * it hasn't already been set), set the sink's hash value to be the
12209 * sink sentinel value. The sink is needed for dynamic variable
12210 * lookups to know that they have iterated over an entire, valid hash
12211 * chain.
12212 */
12213 for (i = 0; i < hashsize; i++)
12214 dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;
12215
12216 if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
12217 dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;
12218
12219 /*
12220 * Carve the remaining free-list space into per-CPU lists,
12221 * dividing it evenly among all NCPU CPUs.
12222 */
12223 start = (dtrace_dynvar_t *)
12224 ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
12225 limit = (uintptr_t)base + size;
12226
12227 maxper = (limit - (uintptr_t)start) / NCPU;
12228 maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
12229
12230 for (i = 0; i < NCPU; i++) {
12231 dstate->dtds_percpu[i].dtdsc_free = dvar = start;
12232
12233 /*
12234 * If we don't even have enough chunks to make it once through
12235 * NCPUs, we're just going to allocate everything to the first
12236 * CPU. And if we're on the last CPU, we're going to allocate
12237 * whatever is left over. In either case, we set the limit to
12238 * be the limit of the dynamic variable space.
12239 */
12240 if (maxper == 0 || i == NCPU - 1) {
12241 limit = (uintptr_t)base + size;
12242 start = NULL;
12243 } else {
12244 limit = (uintptr_t)start + maxper;
12245 start = (dtrace_dynvar_t *)limit;
12246 }
12247
12248 ASSERT(limit <= (uintptr_t)base + size);
12249
12250 for (;;) {
12251 next = (dtrace_dynvar_t *)((uintptr_t)dvar +
12252 dstate->dtds_chunksize);
12253
12254 if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
12255 break;
12256
12257 dvar->dtdv_next = next;
12258 dvar = next;
12259 }
12260
12261 if (maxper == 0)
12262 break;
12263 }
12264
12265 return (0);
12266}
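/*
 * Worked example (added; numbers hypothetical): suppose the space left
 * after the hash directory holds 1001 chunks and NCPU is 8. Then maxper
 * rounds down to 125 chunks' worth per CPU for CPUs 0 through 6, and
 * CPU 7's list runs to the end of the region, picking up the remainder.
 * If there are fewer chunks than CPUs, maxper is 0 and the loop above
 * hands the entire free list to CPU 0 before breaking out.
 */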
12267
12268VBDTSTATIC void
12269dtrace_dstate_fini(dtrace_dstate_t *dstate)
12270{
12271 ASSERT(MUTEX_HELD(&cpu_lock));
12272
12273 if (dstate->dtds_base == NULL)
12274 return;
12275
12276 kmem_free(dstate->dtds_base, dstate->dtds_size);
12277 kmem_cache_free(dtrace_state_cache, dstate->dtds_percpu);
12278}
12279
12280static void
12281dtrace_vstate_fini(dtrace_vstate_t *vstate)
12282{
12283 /*
12284 * Logical XOR, where are you?
12285 */
12286 ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));
12287
12288 if (vstate->dtvs_nglobals > 0) {
12289 kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
12290 sizeof (dtrace_statvar_t *));
12291 }
12292
12293 if (vstate->dtvs_ntlocals > 0) {
12294 kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
12295 sizeof (dtrace_difv_t));
12296 }
12297
12298 ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));
12299
12300 if (vstate->dtvs_nlocals > 0) {
12301 kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
12302 sizeof (dtrace_statvar_t *));
12303 }
12304}
12305
12306static void
12307dtrace_state_clean(dtrace_state_t *state)
12308{
12309 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
12310 return;
12311
12312 dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
12313 dtrace_speculation_clean(state);
12314}
12315#ifdef VBOX
12316static DECLCALLBACK(void) dtrace_state_clean_timer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
12317{
12318 dtrace_state_clean((dtrace_state_t *)pvUser);
12319 NOREF(pTimer); NOREF(iTick);
12320}
12321#endif
12322
12323static void
12324dtrace_state_deadman(dtrace_state_t *state)
12325{
12326 hrtime_t now;
12327
12328 dtrace_sync();
12329
12330 now = dtrace_gethrtime();
12331
12332 if (state != dtrace_anon.dta_state &&
12333 now - state->dts_laststatus >= dtrace_deadman_user)
12334 return;
12335
12336 /*
12337 * We must be sure that dts_alive never appears to be less than the
12338 * value upon entry to dtrace_state_deadman(), and because we lack a
12339 * dtrace_cas64(), we cannot store to it atomically. We thus instead
12340 * store INT64_MAX to it, followed by a memory barrier, followed by
12341 * the new value. This assures that dts_alive never appears to be
12342 * less than its true value, regardless of the order in which the
12343 * stores to the underlying storage are issued.
12344 */
12345 state->dts_alive = INT64_MAX;
12346 dtrace_membar_producer();
12347 state->dts_alive = now;
12348}
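/*
 * Illustrative sketch (added): the store sequence above is a poor man's
 * atomic 64-bit store for readers that only compare dts_alive against
 * "too old". Interleaving the writer with any reader:
 *
 *	writer				reader
 *	dts_alive = INT64_MAX;		tmp = dts_alive;
 *	dtrace_membar_producer();	if (now - tmp > deadman) ...
 *	dts_alive = now;
 *
 * the reader can observe the old value, INT64_MAX, the new value, or
 * (on a 32-bit host that tears the store) a mix whose value is never
 * smaller than the truth -- so the deadman can never fire spuriously.
 */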
12349
12350#ifdef VBOX
12351static DECLCALLBACK(void) dtrace_state_deadman_timer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
12352{
12353 dtrace_state_deadman((dtrace_state_t *)pvUser);
12354 NOREF(pTimer); NOREF(iTick);
12355}
12356#endif
12357
12358VBDTSTATIC dtrace_state_t *
12359dtrace_state_create(dev_t *devp, cred_t *cr)
12360{
12361 minor_t minor;
12362 major_t major;
12363 char c[30];
12364 dtrace_state_t *state;
12365 dtrace_optval_t *opt;
12366 int bufsize = NCPU * sizeof (dtrace_buffer_t), i;
12367
12368 ASSERT(MUTEX_HELD(&dtrace_lock));
12369 ASSERT(MUTEX_HELD(&cpu_lock));
12370
12371 minor = (minor_t)(uintptr_t)vmem_alloc(dtrace_minor, 1,
12372 VM_BESTFIT | VM_SLEEP);
12373
12374 if (ddi_soft_state_zalloc(dtrace_softstate, minor) != DDI_SUCCESS) {
12375 vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
12376 return (NULL);
12377 }
12378
12379 state = ddi_get_soft_state(dtrace_softstate, minor);
12380 state->dts_epid = DTRACE_EPIDNONE + 1;
12381
12382 (void) snprintf(c, sizeof (c), "dtrace_aggid_%d", minor);
12383 state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
12384 NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
12385
12386 if (devp != NULL) {
12387 major = getemajor(*devp);
12388 } else {
12389 major = ddi_driver_major(dtrace_devi);
12390 }
12391
12392 state->dts_dev = makedevice(major, minor);
12393
12394 if (devp != NULL)
12395 *devp = state->dts_dev;
12396
12397 /*
12398 * We allocate NCPU buffers. On the one hand, this can be quite
12399 * a bit of memory per instance (nearly 36K on a Starcat). On the
12400 * other hand, it saves an additional memory reference in the probe
12401 * path.
12402 */
12403 state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
12404 state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
12405 state->dts_cleaner = CYCLIC_NONE;
12406 state->dts_deadman = CYCLIC_NONE;
12407 state->dts_vstate.dtvs_state = state;
12408
12409 for (i = 0; i < DTRACEOPT_MAX; i++)
12410 state->dts_options[i] = DTRACEOPT_UNSET;
12411
12412 /*
12413 * Set the default options.
12414 */
12415 opt = state->dts_options;
12416 opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
12417 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
12418 opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
12419 opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
12420 opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
12421 opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
12422 opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
12423 opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
12424 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
12425 opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
12426 opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
12427 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
12428 opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
12429 opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;
12430
12431 state->dts_activity = DTRACE_ACTIVITY_INACTIVE;
12432
12433 /*
12434 * Depending on the user credentials, we set flag bits which alter probe
12435 * visibility or the amount of destructiveness allowed. In the case of
12436 * actual anonymous tracing, or the possession of all privileges, all of
12437 * the normal checks are bypassed.
12438 */
12439 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
12440 state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
12441 state->dts_cred.dcr_action = DTRACE_CRA_ALL;
12442 } else {
12443 /*
12444 * Set up the credentials for this instantiation. We take a
12445 * hold on the credential to prevent it from disappearing on
12446 * us; this in turn prevents the zone_t referenced by this
12447 * credential from disappearing. This means that we can
12448 * examine the credential and the zone from probe context.
12449 */
12450 crhold(cr);
12451 state->dts_cred.dcr_cred = cr;
12452
12453 /*
12454 * CRA_PROC means "we have *some* privilege for dtrace" and
12455 * unlocks the use of variables like pid, zonename, etc.
12456 */
12457 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
12458 PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
12459 state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
12460 }
12461
12462 /*
12463 * dtrace_user allows use of syscall and profile providers.
12464 * If the user also has proc_owner and/or proc_zone, we
12465 * extend the scope to include additional visibility and
12466 * destructive power.
12467 */
12468 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
12469 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
12470 state->dts_cred.dcr_visible |=
12471 DTRACE_CRV_ALLPROC;
12472
12473 state->dts_cred.dcr_action |=
12474 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
12475 }
12476
12477 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
12478 state->dts_cred.dcr_visible |=
12479 DTRACE_CRV_ALLZONE;
12480
12481 state->dts_cred.dcr_action |=
12482 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
12483 }
12484
12485 /*
12486 * If we have all privs in whatever zone this is,
12487 * we can do destructive things to processes which
12488 * have altered credentials.
12489 */
12490 if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
12491 cr->cr_zone->zone_privset)) {
12492 state->dts_cred.dcr_action |=
12493 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
12494 }
12495 }
12496
12497 /*
12498 * Holding the dtrace_kernel privilege also implies that
12499 * the user has the dtrace_user privilege from a visibility
12500 * perspective. But without further privileges, some
12501 * destructive actions are not available.
12502 */
12503 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
12504 /*
12505 * Make all probes in all zones visible. However,
12506 * this doesn't mean that all actions become available
12507 * to all zones.
12508 */
12509 state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
12510 DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;
12511
12512 state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
12513 DTRACE_CRA_PROC;
12514 /*
12515 * Holding proc_owner means that destructive actions
12516 * for *this* zone are allowed.
12517 */
12518 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
12519 state->dts_cred.dcr_action |=
12520 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
12521
12522 /*
12523 * Holding proc_zone means that destructive actions
12524 * for this user/group ID in all zones is allowed.
12525 */
12526 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
12527 state->dts_cred.dcr_action |=
12528 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
12529
12530 /*
12531 * If we have all privs in whatever zone this is,
12532 * we can do destructive things to processes which
12533 * have altered credentials.
12534 */
12535 if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
12536 cr->cr_zone->zone_privset)) {
12537 state->dts_cred.dcr_action |=
12538 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
12539 }
12540 }
12541
12542 /*
12543 * Holding the dtrace_proc privilege gives control over fasttrap
12544 * and pid providers. We need to grant wider destructive
12545 * privileges in the event that the user has proc_owner and/or
12546 * proc_zone.
12547 */
12548 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
12549 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
12550 state->dts_cred.dcr_action |=
12551 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
12552
12553 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
12554 state->dts_cred.dcr_action |=
12555 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
12556 }
12557 }
12558
12559 return (state);
12560}
12561
12562static int
12563dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
12564{
12565 dtrace_optval_t *opt = state->dts_options, size;
12566 processorid_t cpu;
12567 int flags = 0, rval;
12568
12569 ASSERT(MUTEX_HELD(&dtrace_lock));
12570 ASSERT(MUTEX_HELD(&cpu_lock));
12571 ASSERT(which < DTRACEOPT_MAX);
12572 ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
12573 (state == dtrace_anon.dta_state &&
12574 state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
12575
12576 if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
12577 return (0);
12578
12579 if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
12580 cpu = opt[DTRACEOPT_CPU];
12581
12582 if (which == DTRACEOPT_SPECSIZE)
12583 flags |= DTRACEBUF_NOSWITCH;
12584
12585 if (which == DTRACEOPT_BUFSIZE) {
12586 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
12587 flags |= DTRACEBUF_RING;
12588
12589 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
12590 flags |= DTRACEBUF_FILL;
12591
12592 if (state != dtrace_anon.dta_state ||
12593 state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
12594 flags |= DTRACEBUF_INACTIVE;
12595 }
12596
12597 for (size = opt[which]; size >= VBDTCAST(dtrace_optval_t)sizeof (uint64_t); size >>= 1) {
12598 /*
12599 * The size must be 8-byte aligned. If the size is not 8-byte
12600 * aligned, drop it down by the difference.
12601 */
12602 if (size & (sizeof (uint64_t) - 1))
12603 size -= size & (sizeof (uint64_t) - 1);
12604
12605 if (size < state->dts_reserve) {
12606 /*
12607 * Buffers must always be large enough to accommodate
12608 * their prereserved space. We return E2BIG instead
12609 * of ENOMEM in this case to allow for user-level
12610 * software to differentiate the cases.
12611 */
12612 return (E2BIG);
12613 }
12614
12615 rval = dtrace_buffer_alloc(buf, size, flags, cpu);
12616
12617 if (rval != ENOMEM) {
12618 opt[which] = size;
12619 return (rval);
12620 }
12621
12622 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
12623 return (rval);
12624 }
12625
12626 return (ENOMEM);
12627}
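/*
 * Worked example (added; sizes hypothetical): a 4m request under the
 * default BUFRESIZE_AUTO policy that fails with ENOMEM is retried at
 * 2m, 1m, 512k, ... until an allocation succeeds (and the option is
 * rewritten to the size actually obtained), the candidate falls below
 * dts_reserve (E2BIG), or it drops under sizeof (uint64_t) (ENOMEM).
 * Under BUFRESIZE_MANUAL the first ENOMEM is returned immediately.
 */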
12628
12629static int
12630dtrace_state_buffers(dtrace_state_t *state)
12631{
12632 dtrace_speculation_t *spec = state->dts_speculations;
12633 int rval, i;
12634
12635 if ((rval = dtrace_state_buffer(state, state->dts_buffer,
12636 DTRACEOPT_BUFSIZE)) != 0)
12637 return (rval);
12638
12639 if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
12640 DTRACEOPT_AGGSIZE)) != 0)
12641 return (rval);
12642
12643 for (i = 0; i < state->dts_nspeculations; i++) {
12644 if ((rval = dtrace_state_buffer(state,
12645 spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
12646 return (rval);
12647 }
12648
12649 return (0);
12650}
12651
12652static void
12653dtrace_state_prereserve(dtrace_state_t *state)
12654{
12655 dtrace_ecb_t *ecb;
12656 dtrace_probe_t *probe;
12657
12658 state->dts_reserve = 0;
12659
12660 if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
12661 return;
12662
12663 /*
12664 * If our buffer policy is a "fill" buffer policy, we need to set the
12665 * prereserved space to be the space required by the END probes.
12666 */
12667 probe = dtrace_probes[dtrace_probeid_end - 1];
12668 ASSERT(probe != NULL);
12669
12670 for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
12671 if (ecb->dte_state != state)
12672 continue;
12673
12674 state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
12675 }
12676}
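/*
 * Added note: under the "fill" policy tracing simply stops when the
 * buffer fills, but the END probe must still be able to record. The
 * reserve computed above is thus the worst-case footprint (payload plus
 * alignment) of every END-probe ECB belonging to this state, and
 * dtrace_state_buffer() rejects with E2BIG any buffer smaller than it.
 */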
12677
12678static int
12679dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
12680{
12681 dtrace_optval_t *opt = state->dts_options, sz, nspec;
12682 dtrace_speculation_t *spec;
12683 dtrace_buffer_t *buf;
12684#ifndef VBOX
12685 cyc_handler_t hdlr;
12686 cyc_time_t when;
12687#endif
12688 int rval = 0, i, bufsize = NCPU * sizeof (dtrace_buffer_t);
12689 dtrace_icookie_t cookie;
12690
12691 mutex_enter(&cpu_lock);
12692 mutex_enter(&dtrace_lock);
12693
12694 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
12695 rval = EBUSY;
12696 goto out;
12697 }
12698
12699 /*
12700 * Before we can perform any checks, we must prime all of the
12701 * retained enablings that correspond to this state.
12702 */
12703 dtrace_enabling_prime(state);
12704
12705 if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
12706 rval = EACCES;
12707 goto out;
12708 }
12709
12710 dtrace_state_prereserve(state);
12711
12712 /*
12713 * What we want to do now is try to allocate our speculations.
12714 * We do not automatically resize the number of speculations; if
12715 * this fails, we will fail the operation.
12716 */
12717 nspec = opt[DTRACEOPT_NSPEC];
12718 ASSERT(nspec != DTRACEOPT_UNSET);
12719
12720 if (nspec > INT_MAX) {
12721 rval = ENOMEM;
12722 goto out;
12723 }
12724
12725 spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t), KM_NOSLEEP);
12726
12727 if (spec == NULL) {
12728 rval = ENOMEM;
12729 goto out;
12730 }
12731
12732 state->dts_speculations = spec;
12733 state->dts_nspeculations = (int)nspec;
12734
12735 for (i = 0; i < nspec; i++) {
12736 if ((buf = kmem_zalloc(bufsize, KM_NOSLEEP)) == NULL) {
12737 rval = ENOMEM;
12738 goto err;
12739 }
12740
12741 spec[i].dtsp_buffer = buf;
12742 }
12743
12744 if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
12745 if (dtrace_anon.dta_state == NULL) {
12746 rval = ENOENT;
12747 goto out;
12748 }
12749
12750 if (state->dts_necbs != 0) {
12751 rval = EALREADY;
12752 goto out;
12753 }
12754
12755 state->dts_anon = dtrace_anon_grab();
12756 ASSERT(state->dts_anon != NULL);
12757 state = state->dts_anon;
12758
12759 /*
12760 * We want "grabanon" to be set in the grabbed state, so we'll
12761 * copy that option value from the grabbing state into the
12762 * grabbed state.
12763 */
12764 state->dts_options[DTRACEOPT_GRABANON] =
12765 opt[DTRACEOPT_GRABANON];
12766
12767 *cpu = dtrace_anon.dta_beganon;
12768
12769 /*
12770 * If the anonymous state is active (as it almost certainly
12771 * is if the anonymous enabling ultimately matched anything),
12772 * we don't allow any further option processing -- but we
12773 * don't return failure.
12774 */
12775 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
12776 goto out;
12777 }
12778
12779 if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
12780 opt[DTRACEOPT_AGGSIZE] != 0) {
12781 if (state->dts_aggregations == NULL) {
12782 /*
12783 * We're not going to create an aggregation buffer
12784 * because we don't have any ECBs that contain
12785 * aggregations -- set this option to 0.
12786 */
12787 opt[DTRACEOPT_AGGSIZE] = 0;
12788 } else {
12789 /*
12790 * If we have an aggregation buffer, we must also have
12791 * a buffer to use as scratch.
12792 */
12793 if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
12794 opt[DTRACEOPT_BUFSIZE] < VBDTCAST(dtrace_optval_t)state->dts_needed) {
12795 opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
12796 }
12797 }
12798 }
12799
12800 if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
12801 opt[DTRACEOPT_SPECSIZE] != 0) {
12802 if (!state->dts_speculates) {
12803 /*
12804 * We're not going to create speculation buffers
12805 * because we don't have any ECBs that actually
12806 * speculate -- set the speculation size to 0.
12807 */
12808 opt[DTRACEOPT_SPECSIZE] = 0;
12809 }
12810 }
12811
12812 /*
12813 * The bare minimum size for any buffer that we're actually going to
12814 * do anything to is sizeof (uint64_t).
12815 */
12816 sz = sizeof (uint64_t);
12817
12818 if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
12819 (state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
12820 (state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
12821 /*
12822 * A buffer size has been explicitly set to 0 (or to a size
12823 * that will be adjusted to 0) and we need the space -- we
12824 * need to return failure. We return ENOSPC to differentiate
12825 * it from failing to allocate a buffer due to failure to meet
12826 * the reserve (for which we return E2BIG).
12827 */
12828 rval = ENOSPC;
12829 goto out;
12830 }
12831
12832 if ((rval = dtrace_state_buffers(state)) != 0)
12833 goto err;
12834
12835 if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
12836 sz = dtrace_dstate_defsize;
12837
12838 do {
12839 rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);
12840
12841 if (rval == 0)
12842 break;
12843
12844 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
12845 goto err;
12846 } while (sz >>= 1);
12847
12848 opt[DTRACEOPT_DYNVARSIZE] = sz;
12849
12850 if (rval != 0)
12851 goto err;
12852
12853 if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
12854 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;
12855
12856 if (opt[DTRACEOPT_CLEANRATE] == 0)
12857 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
12858
12859 if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
12860 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;
12861
12862 if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
12863 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
12864
12865#ifndef VBOX
12866 hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
12867 hdlr.cyh_arg = state;
12868 hdlr.cyh_level = CY_LOW_LEVEL;
12869
12870 when.cyt_when = 0;
12871 when.cyt_interval = opt[DTRACEOPT_CLEANRATE];
12872
12873 state->dts_cleaner = cyclic_add(&hdlr, &when);
12874
12875 hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
12876 hdlr.cyh_arg = state;
12877 hdlr.cyh_level = CY_LOW_LEVEL;
12878
12879 when.cyt_when = 0;
12880 when.cyt_interval = dtrace_deadman_interval;
12881
12882 state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
12883 state->dts_deadman = cyclic_add(&hdlr, &when);
12884#else /* VBOX */
12885
12886 rval = RTTimerCreateEx(&state->dts_cleaner, opt[DTRACEOPT_CLEANRATE],
12887 RTTIMER_FLAGS_CPU_ANY, dtrace_state_clean_timer, state);
12888 if (RT_FAILURE(rval)) {
12889 rval = RTErrConvertToErrno(rval);
12890 goto err;
12891 }
12892
12893 state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
12894 rval = RTTimerCreateEx(&state->dts_deadman, dtrace_deadman_interval,
12895 RTTIMER_FLAGS_CPU_ANY, dtrace_state_deadman_timer, state);
12896 if (RT_FAILURE(rval)) {
12897 RTTimerDestroy(state->dts_cleaner);
12898 state->dts_cleaner = CYCLIC_NONE;
12899 state->dts_deadman = CYCLIC_NONE;
12900 rval = RTErrConvertToErrno(rval);
12901 goto err;
12902 }
12903#endif /* VBOX */
12904
12905 state->dts_activity = DTRACE_ACTIVITY_WARMUP;
12906
12907 /*
12908 * Now it's time to actually fire the BEGIN probe. We need to disable
12909 * interrupts here both to record the CPU on which we fired the BEGIN
12910 * probe (the data from this CPU will be processed first at user
12911 * level) and to manually activate the buffer for this CPU.
12912 */
12913 cookie = dtrace_interrupt_disable();
12914 *cpu = VBDT_GET_CPUID();
12915 ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
12916 state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
12917
12918 dtrace_probe(dtrace_probeid_begin,
12919 (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
12920 dtrace_interrupt_enable(cookie);
12921 /*
12922 * We may have had an exit action from a BEGIN probe; only change our
12923 * state to ACTIVE if we're still in WARMUP.
12924 */
12925 ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP ||
12926 state->dts_activity == DTRACE_ACTIVITY_DRAINING);
12927
12928 if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
12929 state->dts_activity = DTRACE_ACTIVITY_ACTIVE;
12930
12931 /*
12932 * Regardless of whether we're now in ACTIVE or DRAINING, we
12933 * want each CPU to transition its principal buffer out of the
12934 * INACTIVE state. Doing this assures that no CPU will suddenly begin
12935 * processing an ECB halfway down a probe's ECB chain; all CPUs will
12936 * atomically transition from processing none of a state's ECBs to
12937 * processing all of them.
12938 */
12939 dtrace_xcall(DTRACE_CPUALL,
12940 (dtrace_xcall_t)dtrace_buffer_activate, state);
12941 goto out;
12942
12943err:
12944 dtrace_buffer_free(state->dts_buffer);
12945 dtrace_buffer_free(state->dts_aggbuffer);
12946
12947 if ((nspec = state->dts_nspeculations) == 0) {
12948 ASSERT(state->dts_speculations == NULL);
12949 goto out;
12950 }
12951
12952 spec = state->dts_speculations;
12953 ASSERT(spec != NULL);
12954
12955 for (i = 0; i < state->dts_nspeculations; i++) {
12956 if ((buf = spec[i].dtsp_buffer) == NULL)
12957 break;
12958
12959 dtrace_buffer_free(buf);
12960 kmem_free(buf, bufsize);
12961 }
12962
12963 kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
12964 state->dts_nspeculations = 0;
12965 state->dts_speculations = NULL;
12966
12967out:
12968 mutex_exit(&dtrace_lock);
12969 mutex_exit(&cpu_lock);
12970
12971 return (rval);
12972}
12973
12974static int
12975dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu)
12976{
12977 dtrace_icookie_t cookie;
12978
12979 ASSERT(MUTEX_HELD(&dtrace_lock));
12980
12981 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
12982 state->dts_activity != DTRACE_ACTIVITY_DRAINING)
12983 return (EINVAL);
12984
12985 /*
12986 * We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
12987 * to be sure that every CPU has seen it. See below for the details
12988 * on why this is done.
12989 */
12990 state->dts_activity = DTRACE_ACTIVITY_DRAINING;
12991 dtrace_sync();
12992
12993 /*
12994 * By this point, it is impossible for any CPU to be still processing
12995 * with DTRACE_ACTIVITY_ACTIVE. We can thus set our activity to
12996 * DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
12997 * other CPU in dtrace_buffer_reserve(). This allows dtrace_probe()
12998 * and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
12999 * iff we're in the END probe.
13000 */
13001 state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
13002 dtrace_sync();
13003 ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);
13004
13005 /*
13006 * Finally, we can release the reserve and call the END probe. We
13007 * disable interrupts across calling the END probe to allow us to
13008 * return the CPU on which we actually called the END probe. This
13009 * allows user-land to be sure that this CPU's principal buffer is
13010 * processed last.
13011 */
13012 state->dts_reserve = 0;
13013
13014 cookie = dtrace_interrupt_disable();
13015 *cpu = VBDT_GET_CPUID();
13016 dtrace_probe(dtrace_probeid_end,
13017 (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
13018 dtrace_interrupt_enable(cookie);
13019
13020 state->dts_activity = DTRACE_ACTIVITY_STOPPED;
13021 dtrace_sync();
13022
13023 return (0);
13024}
13025
13026static int
13027dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
13028 dtrace_optval_t val)
13029{
13030 ASSERT(MUTEX_HELD(&dtrace_lock));
13031
13032 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
13033 return (EBUSY);
13034
13035 if (option >= DTRACEOPT_MAX)
13036 return (EINVAL);
13037
13038 if (option != DTRACEOPT_CPU && val < 0)
13039 return (EINVAL);
13040
13041 switch (option) {
13042 case DTRACEOPT_DESTRUCTIVE:
13043 if (dtrace_destructive_disallow)
13044 return (EACCES);
13045
13046 state->dts_cred.dcr_destructive = 1;
13047 break;
13048
13049 case DTRACEOPT_BUFSIZE:
13050 case DTRACEOPT_DYNVARSIZE:
13051 case DTRACEOPT_AGGSIZE:
13052 case DTRACEOPT_SPECSIZE:
13053 case DTRACEOPT_STRSIZE:
13054 if (val < 0)
13055 return (EINVAL);
13056
13057 if (val >= LONG_MAX) {
13058 /*
13059 * If this is an otherwise negative value, set it to
13060 * the highest multiple of 128m less than LONG_MAX.
13061 * Technically, we're adjusting the size without
13062 * regard to the buffer resizing policy, but in fact,
13063 * this has no effect -- if we set the buffer size to
13064 * ~LONG_MAX and the buffer policy is ultimately set to
13065 * be "manual", the buffer allocation is guaranteed to
13066 * fail, if only because the allocation requires two
13067 * buffers. (We set the size to the highest
13068 * multiple of 128m because it ensures that the size
13069 * will remain a multiple of a megabyte when
13070 * repeatedly halved -- all the way down to 15m.)
13071 */
13072 val = LONG_MAX - (1 << 27) + 1;
13073 }
13074 }
13075
13076 state->dts_options[option] = val;
13077
13078 return (0);
13079}
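/*
 * Worked example (added): on an ILP32 kernel LONG_MAX is 2^31 - 1, so
 * the clamp above yields 2^31 - 2^27 bytes, i.e. 15 * 128m = 1920m.
 * Repeated halving gives 960m, 480m, 240m, 120m, 60m, 30m, 15m --
 * every step a whole number of megabytes, which is exactly what the
 * "highest multiple of 128m" choice is buying.
 */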
13080
13081static void
13082dtrace_state_destroy(dtrace_state_t *state)
13083{
13084 dtrace_ecb_t *ecb;
13085 dtrace_vstate_t *vstate = &state->dts_vstate;
13086 minor_t minor = getminor(state->dts_dev);
13087 int i, bufsize = NCPU * sizeof (dtrace_buffer_t);
13088 dtrace_speculation_t *spec = state->dts_speculations;
13089 int nspec = state->dts_nspeculations;
13090 uint32_t match;
13091
13092 ASSERT(MUTEX_HELD(&dtrace_lock));
13093 ASSERT(MUTEX_HELD(&cpu_lock));
13094
13095 /*
13096 * First, retract any retained enablings for this state.
13097 */
13098 dtrace_enabling_retract(state);
13099 ASSERT(state->dts_nretained == 0);
13100
13101 if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
13102 state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
13103 /*
13104 * We have managed to come into dtrace_state_destroy() on a
13105 * hot enabling -- almost certainly because of a disorderly
13106 * shutdown of a consumer. (That is, a consumer that is
13107 * exiting without having called dtrace_stop().) In this case,
13108 * we're going to set our activity to be KILLED, and then
13109 * issue a sync to be sure that everyone is out of probe
13110 * context before we start blowing away ECBs.
13111 */
13112 state->dts_activity = DTRACE_ACTIVITY_KILLED;
13113 dtrace_sync();
13114 }
13115
13116 /*
13117 * Release the credential hold we took in dtrace_state_create().
13118 */
13119 if (state->dts_cred.dcr_cred != NULL)
13120 crfree(state->dts_cred.dcr_cred);
13121
13122 /*
13123 * Now we can safely disable and destroy any enabled probes. Because
13124 * any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
13125 * (especially if they're all enabled), we take two passes through the
13126 * ECBs: in the first, we disable just DTRACE_PRIV_KERNEL probes, and
13127 * in the second we disable whatever is left over.
13128 */
13129 for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
13130 for (i = 0; i < state->dts_necbs; i++) {
13131 if ((ecb = state->dts_ecbs[i]) == NULL)
13132 continue;
13133
13134 if (match && ecb->dte_probe != NULL) {
13135 dtrace_probe_t *probe = ecb->dte_probe;
13136 dtrace_provider_t *prov = probe->dtpr_provider;
13137
13138 if (!(prov->dtpv_priv.dtpp_flags & match))
13139 continue;
13140 }
13141
13142 dtrace_ecb_disable(ecb);
13143 dtrace_ecb_destroy(ecb);
13144 }
13145
13146 if (!match)
13147 break;
13148 }
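	/*
	 * Added note: 'match' makes this a two-trip loop. On the first
	 * trip (match == DTRACE_PRIV_KERNEL) only ECBs on kernel-priv
	 * providers are torn down -- the probes most likely to keep
	 * firing while we work. On the second trip match is 0, the
	 * provider-flag test is skipped so everything left is destroyed,
	 * and the !match test then exits the loop.
	 */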
13149
13150 /*
13151 * Before we free the buffers, perform one more sync to assure that
13152 * every CPU is out of probe context.
13153 */
13154 dtrace_sync();
13155
13156 dtrace_buffer_free(state->dts_buffer);
13157 dtrace_buffer_free(state->dts_aggbuffer);
13158
13159 for (i = 0; i < nspec; i++)
13160 dtrace_buffer_free(spec[i].dtsp_buffer);
13161
13162 if (state->dts_cleaner != CYCLIC_NONE)
13163 cyclic_remove(state->dts_cleaner);
13164
13165 if (state->dts_deadman != CYCLIC_NONE)
13166 cyclic_remove(state->dts_deadman);
13167
13168 dtrace_dstate_fini(&vstate->dtvs_dynvars);
13169 dtrace_vstate_fini(vstate);
13170 kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));
13171
13172 if (state->dts_aggregations != NULL) {
13173#ifdef DEBUG
13174 for (i = 0; i < state->dts_naggregations; i++)
13175 ASSERT(state->dts_aggregations[i] == NULL);
13176#endif
13177 ASSERT(state->dts_naggregations > 0);
13178 kmem_free(state->dts_aggregations,
13179 state->dts_naggregations * sizeof (dtrace_aggregation_t *));
13180 }
13181
13182 kmem_free(state->dts_buffer, bufsize);
13183 kmem_free(state->dts_aggbuffer, bufsize);
13184
13185 for (i = 0; i < nspec; i++)
13186 kmem_free(spec[i].dtsp_buffer, bufsize);
13187
13188 kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
13189
13190 dtrace_format_destroy(state);
13191
13192 vmem_destroy(state->dts_aggid_arena);
13193 ddi_soft_state_free(dtrace_softstate, minor);
13194 vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
13195}
13196
13197/*
13198 * DTrace Anonymous Enabling Functions
13199 */
13200static dtrace_state_t *
13201dtrace_anon_grab(void)
13202{
13203 dtrace_state_t *state;
13204
13205 ASSERT(MUTEX_HELD(&dtrace_lock));
13206
13207 if ((state = dtrace_anon.dta_state) == NULL) {
13208 ASSERT(dtrace_anon.dta_enabling == NULL);
13209 return (NULL);
13210 }
13211
13212 ASSERT(dtrace_anon.dta_enabling != NULL);
13213 ASSERT(dtrace_retained != NULL);
13214
13215 dtrace_enabling_destroy(dtrace_anon.dta_enabling);
13216 dtrace_anon.dta_enabling = NULL;
13217 dtrace_anon.dta_state = NULL;
13218
13219 return (state);
13220}
13221
13222static void
13223dtrace_anon_property(void)
13224{
13225 int i, rv;
13226 dtrace_state_t *state;
13227 dof_hdr_t *dof;
13228 char c[32]; /* enough for "dof-data-" + digits */
13229
13230 ASSERT(MUTEX_HELD(&dtrace_lock));
13231 ASSERT(MUTEX_HELD(&cpu_lock));
13232
13233 for (i = 0; ; i++) {
13234 (void) snprintf(c, sizeof (c), "dof-data-%d", i);
13235
13236 dtrace_err_verbose = 1;
13237
13238 if ((dof = dtrace_dof_property(c)) == NULL) {
13239 dtrace_err_verbose = 0;
13240 break;
13241 }
13242
13243#ifndef VBOX
13244 /*
13245 * We want to create anonymous state, so we need to transition
13246 * the kernel debugger to indicate that DTrace is active. If
13247 * this fails (e.g. because the debugger has modified text in
13248 * some way), we won't continue with the processing.
13249 */
13250 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
13251 cmn_err(CE_NOTE, "kernel debugger active; anonymous "
13252 "enabling ignored.");
13253 dtrace_dof_destroy(dof);
13254 break;
13255 }
13256#endif
13257
13258 /*
13259 * If we haven't allocated an anonymous state, we'll do so now.
13260 */
13261 if ((state = dtrace_anon.dta_state) == NULL) {
13262 state = dtrace_state_create(NULL, NULL);
13263 dtrace_anon.dta_state = state;
13264
13265 if (state == NULL) {
13266 /*
13267 * This basically shouldn't happen: the only
13268 * failure mode from dtrace_state_create() is a
13269 * failure of ddi_soft_state_zalloc() that
13270 * itself should never happen. Still, the
13271 * interface allows for a failure mode, and
13272 * we want to fail as gracefully as possible:
13273 * we'll emit an error message and cease
13274 * processing anonymous state in this case.
13275 */
13276 cmn_err(CE_WARN, "failed to create "
13277 "anonymous state");
13278 dtrace_dof_destroy(dof);
13279 break;
13280 }
13281 }
13282
13283 rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
13284 &dtrace_anon.dta_enabling, 0, B_TRUE);
13285
13286 if (rv == 0)
13287 rv = dtrace_dof_options(dof, state);
13288
13289 dtrace_err_verbose = 0;
13290 dtrace_dof_destroy(dof);
13291
13292 if (rv != 0) {
13293 /*
13294 * This is malformed DOF; chuck any anonymous state
13295 * that we created.
13296 */
13297 ASSERT(dtrace_anon.dta_enabling == NULL);
13298 dtrace_state_destroy(state);
13299 dtrace_anon.dta_state = NULL;
13300 break;
13301 }
13302
13303 ASSERT(dtrace_anon.dta_enabling != NULL);
13304 }
13305
13306 if (dtrace_anon.dta_enabling != NULL) {
13307 int rval;
13308
13309 /*
13310 * dtrace_enabling_retain() can only fail because we are
13311 * trying to retain more enablings than are allowed -- but
13312 * we only have one anonymous enabling, and we are guaranteed
13313 * to be allowed at least one retained enabling; we assert
13314 * that dtrace_enabling_retain() returns success.
13315 */
13316 rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
13317 ASSERT(rval == 0);
13318
13319 dtrace_enabling_dump(dtrace_anon.dta_enabling);
13320 }
13321}
13322
13323/*
13324 * DTrace Helper Functions
13325 */
13326static void
13327dtrace_helper_trace(dtrace_helper_action_t *helper,
13328 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
13329{
13330 uint32_t size, next, nnext, i;
13331 dtrace_helptrace_t *ent;
13332 uint16_t flags = cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags;
13333
13334 if (!dtrace_helptrace_enabled)
13335 return;
13336
13337 ASSERT(vstate->dtvs_nlocals <= VBDTCAST(int32_t)dtrace_helptrace_nlocals);
13338
13339 /*
13340 * What would a tracing framework be without its own tracing
13341 * framework? (Well, a hell of a lot simpler, for starters...)
13342 */
13343 size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
13344 sizeof (uint64_t) - sizeof (uint64_t);
13345
13346 /*
13347 * Iterate until we can allocate a slot in the trace buffer.
13348 */
13349 do {
13350 next = dtrace_helptrace_next;
13351
13352 if (next + size < VBDTCAST(unsigned)dtrace_helptrace_bufsize) {
13353 nnext = next + size;
13354 } else {
13355 nnext = size;
13356 }
13357 } while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);
13358
13359 /*
13360 * We have our slot; fill it in.
13361 */
13362 if (nnext == size)
13363 next = 0;
13364
13365 ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next];
13366 ent->dtht_helper = helper;
13367 ent->dtht_where = where;
13368 ent->dtht_nlocals = vstate->dtvs_nlocals;
13369
13370 ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
13371 mstate->dtms_fltoffs : -1;
13372 ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
13373 ent->dtht_illval = cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_illval;
13374
13375 for (i = 0; VBDTCAST(int32_t)i < vstate->dtvs_nlocals; i++) {
13376 dtrace_statvar_t *svar;
13377
13378 if ((svar = vstate->dtvs_locals[i]) == NULL)
13379 continue;
13380
13381 ASSERT(svar->dtsv_size >= NCPU * sizeof (uint64_t));
13382 ent->dtht_locals[i] =
13383 ((uint64_t *)(uintptr_t)svar->dtsv_data)[VBDT_GET_CPUID()];
13384 }
13385}
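/*
 * Illustrative sketch (added): the do/while above is a lock-free
 * compare-and-swap reservation on a shared cursor. Stripped to its
 * essentials:
 *
 *	do {
 *		next = cursor;				// snapshot
 *		nnext = wraps ? size : next + size;	// wrap to start
 *	} while (dtrace_cas32(&cursor, next, nnext) != next);
 *
 * Only the CAS winner owns [next, next + size) and may fill it without
 * further locking; losers re-snapshot and retry. The nnext == size test
 * afterwards detects the wrap case and resets the slot to offset 0.
 */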
13386
13387static uint64_t
13388dtrace_helper(int which, dtrace_mstate_t *mstate,
13389 dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
13390{
13391 VBDTTYPE(uint16_t volatile *, uint16_t *)flags = &cpu_core[VBDT_GET_CPUID()].cpuc_dtrace_flags;
13392 uint64_t sarg0 = mstate->dtms_arg[0];
13393 uint64_t sarg1 = mstate->dtms_arg[1];
13394 uint64_t rval;
13395 dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
13396 dtrace_helper_action_t *helper;
13397 dtrace_vstate_t *vstate;
13398 dtrace_difo_t *pred;
13399 int i, trace = dtrace_helptrace_enabled;
13400
13401 ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
13402
13403 if (helpers == NULL)
13404 return (0);
13405
13406 if ((helper = helpers->dthps_actions[which]) == NULL)
13407 return (0);
13408
13409 vstate = &helpers->dthps_vstate;
13410 mstate->dtms_arg[0] = arg0;
13411 mstate->dtms_arg[1] = arg1;
13412
13413 /*
13414 * Now iterate over each helper. If its predicate evaluates to 'true',
13415 * we'll call the corresponding actions. Note that the below calls
13416 * to dtrace_dif_emulate() may set faults in machine state. This is
13417 * okay: our caller (the outer dtrace_dif_emulate()) will simply plow
13418 * the stored DIF offset with its own (which is the desired behavior).
13419 * Also, note the calls to dtrace_dif_emulate() may allocate scratch
13420 * from machine state; this is okay, too.
13421 */
13422 for (; helper != NULL; helper = helper->dtha_next) {
13423 if ((pred = helper->dtha_predicate) != NULL) {
13424 if (trace)
13425 dtrace_helper_trace(helper, mstate, vstate, 0);
13426
13427 if (!dtrace_dif_emulate(pred, mstate, vstate, state))
13428 goto next;
13429
13430 if (*flags & CPU_DTRACE_FAULT)
13431 goto err;
13432 }
13433
13434 for (i = 0; i < helper->dtha_nactions; i++) {
13435 if (trace)
13436 dtrace_helper_trace(helper,
13437 mstate, vstate, i + 1);
13438
13439 rval = dtrace_dif_emulate(helper->dtha_actions[i],
13440 mstate, vstate, state);
13441
13442 if (*flags & CPU_DTRACE_FAULT)
13443 goto err;
13444 }
13445
13446next:
13447 if (trace)
13448 dtrace_helper_trace(helper, mstate, vstate,
13449 DTRACE_HELPTRACE_NEXT);
13450 }
13451
13452 if (trace)
13453 dtrace_helper_trace(helper, mstate, vstate,
13454 DTRACE_HELPTRACE_DONE);
13455
13456 /*
13457 * Restore the arg0 that we saved upon entry.
13458 */
13459 mstate->dtms_arg[0] = sarg0;
13460 mstate->dtms_arg[1] = sarg1;
13461
13462 return (rval);
13463
13464err:
13465 if (trace)
13466 dtrace_helper_trace(helper, mstate, vstate,
13467 DTRACE_HELPTRACE_ERR);
13468
13469 /*
13470 * Restore the arg0 that we saved upon entry.
13471 */
13472 mstate->dtms_arg[0] = sarg0;
13473 mstate->dtms_arg[1] = sarg1;
13474
13475 return (0);
13476}
13477
13478static void
13479dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
13480 dtrace_vstate_t *vstate)
13481{
13482 int i;
13483
13484 if (helper->dtha_predicate != NULL)
13485 dtrace_difo_release(helper->dtha_predicate, vstate);
13486
13487 for (i = 0; i < helper->dtha_nactions; i++) {
13488 ASSERT(helper->dtha_actions[i] != NULL);
13489 dtrace_difo_release(helper->dtha_actions[i], vstate);
13490 }
13491
13492 kmem_free(helper->dtha_actions,
13493 helper->dtha_nactions * sizeof (dtrace_difo_t *));
13494 kmem_free(helper, sizeof (dtrace_helper_action_t));
13495}
13496
13497static int
13498dtrace_helper_destroygen(int gen)
13499{
13500 proc_t *p = curproc;
13501 dtrace_helpers_t *help = p->p_dtrace_helpers;
13502 dtrace_vstate_t *vstate;
13503 VBDTTYPE(uint_t,int) i;
13504
13505 ASSERT(MUTEX_HELD(&dtrace_lock));
13506
13507 if (help == NULL || gen > help->dthps_generation)
13508 return (EINVAL);
13509
13510 vstate = &help->dthps_vstate;
13511
13512 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
13513 dtrace_helper_action_t *last = NULL, *h, *next;
13514
13515 for (h = help->dthps_actions[i]; h != NULL; h = next) {
13516 next = h->dtha_next;
13517
13518 if (h->dtha_generation == gen) {
13519 if (last != NULL) {
13520 last->dtha_next = next;
13521 } else {
13522 help->dthps_actions[i] = next;
13523 }
13524
13525 dtrace_helper_action_destroy(h, vstate);
13526 } else {
13527 last = h;
13528 }
13529 }
13530 }
13531
13532 /*
13533 * Iterate until we've cleared out all helper providers with the
13534 * given generation number.
13535 */
13536 for (;;) {
13537 dtrace_helper_provider_t *prov;
13538
13539 /*
13540 * Look for a helper provider with the right generation. We
13541 * have to start back at the beginning of the list each time
13542 * because we drop dtrace_lock. It's unlikely that we'll make
13543 * more than two passes.
13544 */
13545 for (i = 0; i < help->dthps_nprovs; i++) {
13546 prov = help->dthps_provs[i];
13547
13548 if (prov->dthp_generation == gen)
13549 break;
13550 }
13551
13552 /*
13553 * If there were no matches, we're done.
13554 */
13555 if (i == help->dthps_nprovs)
13556 break;
13557
13558 /*
13559 * Move the last helper provider into this slot.
13560 */
13561 help->dthps_nprovs--;
13562 help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
13563 help->dthps_provs[help->dthps_nprovs] = NULL;
13564
13565 mutex_exit(&dtrace_lock);
13566
13567 /*
13568 * If we have a meta provider, remove this helper provider.
13569 */
13570 mutex_enter(&dtrace_meta_lock);
13571 if (dtrace_meta_pid != NULL) {
13572 ASSERT(dtrace_deferred_pid == NULL);
13573 dtrace_helper_provider_remove(&prov->dthp_prov,
13574 p->p_pid);
13575 }
13576 mutex_exit(&dtrace_meta_lock);
13577
13578 dtrace_helper_provider_destroy(prov);
13579
13580 mutex_enter(&dtrace_lock);
13581 }
13582
13583 return (0);
13584}
13585
13586static int
13587dtrace_helper_validate(dtrace_helper_action_t *helper)
13588{
13589 int err = 0, i;
13590 dtrace_difo_t *dp;
13591
13592 if ((dp = helper->dtha_predicate) != NULL)
13593 err += dtrace_difo_validate_helper(dp);
13594
13595 for (i = 0; i < helper->dtha_nactions; i++)
13596 err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
13597
13598 return (err == 0);
13599}
13600
13601static int
13602dtrace_helper_action_add(int which, dtrace_ecbdesc_t *ep)
13603{
13604 dtrace_helpers_t *help;
13605 dtrace_helper_action_t *helper, *last;
13606 dtrace_actdesc_t *act;
13607 dtrace_vstate_t *vstate;
13608 dtrace_predicate_t *pred;
13609 int count = 0, nactions = 0, i;
13610
13611 if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
13612 return (EINVAL);
13613
13614 help = curproc->p_dtrace_helpers;
13615 last = help->dthps_actions[which];
13616 vstate = &help->dthps_vstate;
13617
13618 for (count = 0; last != NULL; last = last->dtha_next) {
13619 count++;
13620 if (last->dtha_next == NULL)
13621 break;
13622 }
13623
13624 /*
13625 * If we already have dtrace_helper_actions_max helper actions for this
13626 * helper action type, we'll refuse to add a new one.
13627 */
13628 if (count >= dtrace_helper_actions_max)
13629 return (ENOSPC);
13630
13631 helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
13632 helper->dtha_generation = help->dthps_generation;
13633
13634 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
13635 ASSERT(pred->dtp_difo != NULL);
13636 dtrace_difo_hold(pred->dtp_difo);
13637 helper->dtha_predicate = pred->dtp_difo;
13638 }
13639
13640 for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
13641 if (act->dtad_kind != DTRACEACT_DIFEXPR)
13642 goto err;
13643
13644 if (act->dtad_difo == NULL)
13645 goto err;
13646
13647 nactions++;
13648 }
13649
13650 helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
13651 (helper->dtha_nactions = nactions), KM_SLEEP);
13652
13653 for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
13654 dtrace_difo_hold(act->dtad_difo);
13655 helper->dtha_actions[i++] = act->dtad_difo;
13656 }
13657
13658 if (!dtrace_helper_validate(helper))
13659 goto err;
13660
13661 if (last == NULL) {
13662 help->dthps_actions[which] = helper;
13663 } else {
13664 last->dtha_next = helper;
13665 }
13666
13667 if (vstate->dtvs_nlocals > VBDTCAST(int32_t)dtrace_helptrace_nlocals) {
13668 dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
13669 dtrace_helptrace_next = 0;
13670 }
13671
13672 return (0);
13673err:
13674 dtrace_helper_action_destroy(helper, vstate);
13675 return (EINVAL);
13676}
13677
13678static void
13679dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
13680 dof_helper_t *dofhp)
13681{
13682 ASSERT(MUTEX_NOT_HELD(&dtrace_lock));
13683
13684 mutex_enter(&dtrace_meta_lock);
13685 mutex_enter(&dtrace_lock);
13686
13687 if (!dtrace_attached() || dtrace_meta_pid == NULL) {
13688 /*
13689 * If the dtrace module is loaded but not attached, or if
13690 * there isn't a meta provider registered to deal with
13691 * these provider descriptions, we need to postpone creating
13692 * the actual providers until later.
13693 */
13694
13695 if (help->dthps_next == NULL && help->dthps_prev == NULL &&
13696 dtrace_deferred_pid != help) {
13697 help->dthps_deferred = 1;
13698 help->dthps_pid = p->p_pid;
13699 help->dthps_next = dtrace_deferred_pid;
13700 help->dthps_prev = NULL;
13701 if (dtrace_deferred_pid != NULL)
13702 dtrace_deferred_pid->dthps_prev = help;
13703 dtrace_deferred_pid = help;
13704 }
13705
13706 mutex_exit(&dtrace_lock);
13707
13708 } else if (dofhp != NULL) {
13709 /*
13710 * If the dtrace module is loaded and we have a particular
13711 * helper provider description, pass that off to the
13712 * meta provider.
13713 */
13714
13715 mutex_exit(&dtrace_lock);
13716
13717 dtrace_helper_provide(dofhp, p->p_pid);
13718
13719 } else {
13720 /*
13721 * Otherwise, just pass all the helper provider descriptions
13722 * off to the meta provider.
13723 */
13724
13725 VBDTTYPE(uint_t,int) i;
13726 mutex_exit(&dtrace_lock);
13727
13728 for (i = 0; i < help->dthps_nprovs; i++) {
13729 dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
13730 p->p_pid);
13731 }
13732 }
13733
13734 mutex_exit(&dtrace_meta_lock);
13735}
13736
13737static int
13738dtrace_helper_provider_add(dof_helper_t *dofhp, int gen)
13739{
13740 dtrace_helpers_t *help;
13741 dtrace_helper_provider_t *hprov, **tmp_provs;
13742 uint_t tmp_maxprovs, i;
13743
13744 ASSERT(MUTEX_HELD(&dtrace_lock));
13745
13746 help = curproc->p_dtrace_helpers;
13747 ASSERT(help != NULL);
13748
13749 /*
13750 * If we already have dtrace_helper_providers_max helper providers,
13751 * we refuse to add a new one.
13752 */
13753 if (help->dthps_nprovs >= dtrace_helper_providers_max)
13754 return (ENOSPC);
13755
13756 /*
13757 * Check to make sure this isn't a duplicate.
13758 */
13759 for (i = 0; i < help->dthps_nprovs; i++) {
13760 if (dofhp->dofhp_addr ==
13761 help->dthps_provs[i]->dthp_prov.dofhp_addr)
13762 return (EALREADY);
13763 }
13764
13765 hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
13766 hprov->dthp_prov = *dofhp;
13767 hprov->dthp_ref = 1;
13768 hprov->dthp_generation = gen;
13769
13770 /*
13771 * Allocate a bigger table for helper providers if it's already full.
13772 */
13773 if (help->dthps_maxprovs == help->dthps_nprovs) {
13774 tmp_maxprovs = help->dthps_maxprovs;
13775 tmp_provs = help->dthps_provs;
13776
13777 if (help->dthps_maxprovs == 0)
13778 help->dthps_maxprovs = 2;
13779 else
13780 help->dthps_maxprovs *= 2;
13781 if (help->dthps_maxprovs > dtrace_helper_providers_max)
13782 help->dthps_maxprovs = dtrace_helper_providers_max;
13783
13784 ASSERT(tmp_maxprovs < help->dthps_maxprovs);
13785
13786 help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
13787 sizeof (dtrace_helper_provider_t *), KM_SLEEP);
13788
13789 if (tmp_provs != NULL) {
13790 bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *
13791 sizeof (dtrace_helper_provider_t *));
13792 kmem_free(tmp_provs, tmp_maxprovs *
13793 sizeof (dtrace_helper_provider_t *));
13794 }
13795 }
13796
13797 help->dthps_provs[help->dthps_nprovs] = hprov;
13798 help->dthps_nprovs++;
13799
13800 return (0);
13801}
13802
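/*
 * Release a reference on a helper provider. When the last reference is
 * dropped, the underlying DOF is destroyed and the provider structure is
 * freed; the reference count itself is manipulated under dtrace_lock,
 * but the DOF is destroyed only after the lock has been dropped.
 */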
13803static void
13804dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
13805{
13806 mutex_enter(&dtrace_lock);
13807
13808 if (--hprov->dthp_ref == 0) {
13809 dof_hdr_t *dof;
13810 mutex_exit(&dtrace_lock);
13811 dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
13812 dtrace_dof_destroy(dof);
13813 kmem_free(hprov, sizeof (dtrace_helper_provider_t));
13814 } else {
13815 mutex_exit(&dtrace_lock);
13816 }
13817}
13818
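/*
 * Validate a DOF_SECT_PROVIDER section before it is handed off to the
 * meta provider: check section alignment and size, resolve the string
 * table, probe, argument and offset sections, and then walk every probe
 * to verify that its names, offsets and (for post-v1 DOF) is-enabled
 * offsets lie within their sections and that its argument type strings
 * are well-formed.
 */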
13819static int
13820dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)
13821{
13822 uintptr_t daddr = (uintptr_t)dof;
13823 dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
13824 dof_provider_t *provider;
13825 dof_probe_t *probe;
13826 uint8_t *arg;
13827 char *strtab, *typestr;
13828 dof_stridx_t typeidx;
13829 size_t typesz;
13830 uint_t nprobes, j, k;
13831
13832 ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);
13833
13834 if (sec->dofs_offset & (sizeof (uint_t) - 1)) {
13835 dtrace_dof_error(dof, "misaligned section offset");
13836 return (-1);
13837 }
13838
13839 /*
13840 * The section needs to be large enough to contain the DOF provider
13841 * structure appropriate for the given version.
13842 */
13843 if (sec->dofs_size <
13844 ((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
13845 offsetof(dof_provider_t, dofpv_prenoffs) :
13846 sizeof (dof_provider_t))) {
13847 dtrace_dof_error(dof, "provider section too small");
13848 return (-1);
13849 }
13850
13851 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
13852 str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);
13853 prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);
13854 arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);
13855 off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);
13856
13857 if (str_sec == NULL || prb_sec == NULL ||
13858 arg_sec == NULL || off_sec == NULL)
13859 return (-1);
13860
13861 enoff_sec = NULL;
13862
13863 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
13864 provider->dofpv_prenoffs != DOF_SECT_NONE &&
13865 (enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
13866 provider->dofpv_prenoffs)) == NULL)
13867 return (-1);
13868
13869 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
13870
13871 if (provider->dofpv_name >= str_sec->dofs_size ||
13872 strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
13873 dtrace_dof_error(dof, "invalid provider name");
13874 return (-1);
13875 }
13876
13877 if (prb_sec->dofs_entsize == 0 ||
13878 prb_sec->dofs_entsize > prb_sec->dofs_size) {
13879 dtrace_dof_error(dof, "invalid entry size");
13880 return (-1);
13881 }
13882
13883 if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {
13884 dtrace_dof_error(dof, "misaligned entry size");
13885 return (-1);
13886 }
13887
13888 if (off_sec->dofs_entsize != sizeof (uint32_t)) {
13889 dtrace_dof_error(dof, "invalid entry size");
13890 return (-1);
13891 }
13892
13893 if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {
13894 dtrace_dof_error(dof, "misaligned section offset");
13895 return (-1);
13896 }
13897
13898 if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
13899 dtrace_dof_error(dof, "invalid entry size");
13900 return (-1);
13901 }
13902
13903 arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
13904
13905 nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
13906
13907 /*
13908 * Take a pass through the probes to check for errors.
13909 */
13910 for (j = 0; j < nprobes; j++) {
13911 probe = (dof_probe_t *)(uintptr_t)(daddr +
13912 prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
13913
13914 if (probe->dofpr_func >= str_sec->dofs_size) {
13915 dtrace_dof_error(dof, "invalid function name");
13916 return (-1);
13917 }
13918
13919 if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
13920 dtrace_dof_error(dof, "function name too long");
13921 return (-1);
13922 }
13923
13924 if (probe->dofpr_name >= str_sec->dofs_size ||
13925 strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
13926 dtrace_dof_error(dof, "invalid probe name");
13927 return (-1);
13928 }
13929
13930 /*
13931 * The offset count must not wrap the index, and the offsets
13932 * must also not overflow the section's data.
13933 */
13934 if (probe->dofpr_offidx + probe->dofpr_noffs <
13935 probe->dofpr_offidx ||
13936 (probe->dofpr_offidx + probe->dofpr_noffs) *
13937 off_sec->dofs_entsize > off_sec->dofs_size) {
13938 dtrace_dof_error(dof, "invalid probe offset");
13939 return (-1);
13940 }
13941
13942 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
13943 /*
13944 * If there's no is-enabled offset section, make sure
13945 * there aren't any is-enabled offsets. Otherwise
13946 * perform the same checks as for probe offsets
13947 * (immediately above).
13948 */
13949 if (enoff_sec == NULL) {
13950 if (probe->dofpr_enoffidx != 0 ||
13951 probe->dofpr_nenoffs != 0) {
13952 dtrace_dof_error(dof, "is-enabled "
13953 "offsets with null section");
13954 return (-1);
13955 }
13956 } else if (probe->dofpr_enoffidx +
13957 probe->dofpr_nenoffs < probe->dofpr_enoffidx ||
13958 (probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
13959 enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
13960 dtrace_dof_error(dof, "invalid is-enabled "
13961 "offset");
13962 return (-1);
13963 }
13964
13965 if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {
13966 dtrace_dof_error(dof, "zero probe and "
13967 "is-enabled offsets");
13968 return (-1);
13969 }
13970 } else if (probe->dofpr_noffs == 0) {
13971 dtrace_dof_error(dof, "zero probe offsets");
13972 return (-1);
13973 }
13974
13975 if (probe->dofpr_argidx + probe->dofpr_xargc <
13976 probe->dofpr_argidx ||
13977 (probe->dofpr_argidx + probe->dofpr_xargc) *
13978 arg_sec->dofs_entsize > arg_sec->dofs_size) {
13979 dtrace_dof_error(dof, "invalid args");
13980 return (-1);
13981 }
13982
13983 typeidx = probe->dofpr_nargv;
13984 typestr = strtab + probe->dofpr_nargv;
13985 for (k = 0; k < probe->dofpr_nargc; k++) {
13986 if (typeidx >= str_sec->dofs_size) {
13987 dtrace_dof_error(dof, "bad "
13988 "native argument type");
13989 return (-1);
13990 }
13991
13992 typesz = strlen(typestr) + 1;
13993 if (typesz > DTRACE_ARGTYPELEN) {
13994 dtrace_dof_error(dof, "native "
13995 "argument type too long");
13996 return (-1);
13997 }
13998 typeidx += typesz;
13999 typestr += typesz;
14000 }
14001
14002 typeidx = probe->dofpr_xargv;
14003 typestr = strtab + probe->dofpr_xargv;
14004 for (k = 0; k < probe->dofpr_xargc; k++) {
14005 if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
14006 dtrace_dof_error(dof, "bad "
14007 "native argument index");
14008 return (-1);
14009 }
14010
14011 if (typeidx >= str_sec->dofs_size) {
14012 dtrace_dof_error(dof, "bad "
14013 "translated argument type");
14014 return (-1);
14015 }
14016
14017 typesz = strlen(typestr) + 1;
14018 if (typesz > DTRACE_ARGTYPELEN) {
14019 dtrace_dof_error(dof, "translated argument "
14020 "type too long");
14021 return (-1);
14022 }
14023
14024 typeidx += typesz;
14025 typestr += typesz;
14026 }
14027 }
14028
14029 return (0);
14030}
14031
14032static int
14033dtrace_helper_slurp(dof_hdr_t *dof, dof_helper_t *dhp)
14034{
14035 dtrace_helpers_t *help;
14036 dtrace_vstate_t *vstate;
14037 dtrace_enabling_t *enab = NULL;
14038 int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
14039 uintptr_t daddr = (uintptr_t)dof;
14040
14041 ASSERT(MUTEX_HELD(&dtrace_lock));
14042
14043 if ((help = curproc->p_dtrace_helpers) == NULL)
14044 help = dtrace_helpers_create(curproc);
14045
14046 vstate = &help->dthps_vstate;
14047
14048 if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab,
14049 dhp != NULL ? dhp->dofhp_addr : 0, B_FALSE)) != 0) {
14050 dtrace_dof_destroy(dof);
14051 return (rv);
14052 }
14053
14054 /*
14055 * Look for helper providers and validate their descriptions.
14056 */
14057 if (dhp != NULL) {
14058 for (i = 0; i < VBDTCAST(int)dof->dofh_secnum; i++) {
14059 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
14060 dof->dofh_secoff + i * dof->dofh_secsize);
14061
14062 if (sec->dofs_type != DOF_SECT_PROVIDER)
14063 continue;
14064
14065 if (dtrace_helper_provider_validate(dof, sec) != 0) {
14066 dtrace_enabling_destroy(enab);
14067 dtrace_dof_destroy(dof);
14068 return (-1);
14069 }
14070
14071 nprovs++;
14072 }
14073 }
14074
14075 /*
14076 * Now we need to walk through the ECB descriptions in the enabling.
14077 */
14078 for (i = 0; i < enab->dten_ndesc; i++) {
14079 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
14080 dtrace_probedesc_t *desc = &ep->dted_probe;
14081
14082 if (strcmp(desc->dtpd_provider, "dtrace") != 0)
14083 continue;
14084
14085 if (strcmp(desc->dtpd_mod, "helper") != 0)
14086 continue;
14087
14088 if (strcmp(desc->dtpd_func, "ustack") != 0)
14089 continue;
14090
14091 if ((rv = dtrace_helper_action_add(DTRACE_HELPER_ACTION_USTACK,
14092 ep)) != 0) {
14093 /*
14094 * Adding this helper action failed -- we are now going
14095 * to rip out the entire generation and return failure.
14096 */
14097 (void) dtrace_helper_destroygen(help->dthps_generation);
14098 dtrace_enabling_destroy(enab);
14099 dtrace_dof_destroy(dof);
14100 return (-1);
14101 }
14102
14103 nhelpers++;
14104 }
14105
14106 if (nhelpers < enab->dten_ndesc)
14107 dtrace_dof_error(dof, "unmatched helpers");
14108
14109 gen = help->dthps_generation++;
14110 dtrace_enabling_destroy(enab);
14111
14112 if (dhp != NULL && nprovs > 0) {
14113 dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
14114 if (dtrace_helper_provider_add(dhp, gen) == 0) {
14115 mutex_exit(&dtrace_lock);
14116 dtrace_helper_provider_register(curproc, help, dhp);
14117 mutex_enter(&dtrace_lock);
14118
14119 destroy = 0;
14120 }
14121 }
14122
14123 if (destroy)
14124 dtrace_dof_destroy(dof);
14125
14126 return (gen);
14127}
14128
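/*
 * Allocate per-process helper state, including one action list head per
 * helper action kind (DTRACE_NHELPER_ACTIONS), and bump the global count
 * of processes with helpers. The caller must hold dtrace_lock.
 */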
14129static dtrace_helpers_t *
14130dtrace_helpers_create(proc_t *p)
14131{
14132 dtrace_helpers_t *help;
14133
14134 ASSERT(MUTEX_HELD(&dtrace_lock));
14135 ASSERT(p->p_dtrace_helpers == NULL);
14136
14137 help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
14138 help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
14139 DTRACE_NHELPER_ACTIONS, KM_SLEEP);
14140
14141 p->p_dtrace_helpers = help;
14142 dtrace_helpers++;
14143
14144 return (help);
14145}
14146
14147static void
14148dtrace_helpers_destroy(void)
14149{
14150 dtrace_helpers_t *help;
14151 dtrace_vstate_t *vstate;
14152 proc_t *p = curproc;
14153 VBDTTYPE(uint_t, int) i;
14154
14155 mutex_enter(&dtrace_lock);
14156
14157 ASSERT(p->p_dtrace_helpers != NULL);
14158 ASSERT(dtrace_helpers > 0);
14159
14160 help = p->p_dtrace_helpers;
14161 vstate = &help->dthps_vstate;
14162
14163 /*
14164 * We're now going to lose the help from this process.
14165 */
14166 p->p_dtrace_helpers = NULL;
14167 dtrace_sync();
14168
14169 /*
14170	 * Destroy the helper actions.
14171 */
14172 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
14173 dtrace_helper_action_t *h, *next;
14174
14175 for (h = help->dthps_actions[i]; h != NULL; h = next) {
14176 next = h->dtha_next;
14177 dtrace_helper_action_destroy(h, vstate);
14178 h = next;
14179 }
14180 }
14181
14182 mutex_exit(&dtrace_lock);
14183
14184 /*
14185 * Destroy the helper providers.
14186 */
14187 if (help->dthps_maxprovs > 0) {
14188 mutex_enter(&dtrace_meta_lock);
14189 if (dtrace_meta_pid != NULL) {
14190 ASSERT(dtrace_deferred_pid == NULL);
14191
14192 for (i = 0; i < help->dthps_nprovs; i++) {
14193 dtrace_helper_provider_remove(
14194 &help->dthps_provs[i]->dthp_prov, p->p_pid);
14195 }
14196 } else {
14197 mutex_enter(&dtrace_lock);
14198 ASSERT(help->dthps_deferred == 0 ||
14199 help->dthps_next != NULL ||
14200 help->dthps_prev != NULL ||
14201 help == dtrace_deferred_pid);
14202
14203 /*
14204 * Remove the helper from the deferred list.
14205 */
14206 if (help->dthps_next != NULL)
14207 help->dthps_next->dthps_prev = help->dthps_prev;
14208 if (help->dthps_prev != NULL)
14209 help->dthps_prev->dthps_next = help->dthps_next;
14210 if (dtrace_deferred_pid == help) {
14211 dtrace_deferred_pid = help->dthps_next;
14212 ASSERT(help->dthps_prev == NULL);
14213 }
14214
14215 mutex_exit(&dtrace_lock);
14216 }
14217
14218 mutex_exit(&dtrace_meta_lock);
14219
14220 for (i = 0; i < help->dthps_nprovs; i++) {
14221 dtrace_helper_provider_destroy(help->dthps_provs[i]);
14222 }
14223
14224 kmem_free(help->dthps_provs, help->dthps_maxprovs *
14225 sizeof (dtrace_helper_provider_t *));
14226 }
14227
14228 mutex_enter(&dtrace_lock);
14229
14230 dtrace_vstate_fini(&help->dthps_vstate);
14231 kmem_free(help->dthps_actions,
14232 sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
14233 kmem_free(help, sizeof (dtrace_helpers_t));
14234
14235 --dtrace_helpers;
14236 mutex_exit(&dtrace_lock);
14237}
14238
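/*
 * Duplicate helper state across fork(): helper actions (along with their
 * predicate and action DIFOs) are deep-copied into the child's vstate,
 * while helper providers are shared with the parent by taking an
 * additional reference. Any inherited providers are then re-registered
 * with the meta provider on behalf of the child.
 */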
14239static void
14240dtrace_helpers_duplicate(proc_t *from, proc_t *to)
14241{
14242 dtrace_helpers_t *help, *newhelp;
14243 dtrace_helper_action_t *helper, *new, *last;
14244 dtrace_difo_t *dp;
14245 dtrace_vstate_t *vstate;
14246 int i, j, sz, hasprovs = 0;
14247
14248 mutex_enter(&dtrace_lock);
14249 ASSERT(from->p_dtrace_helpers != NULL);
14250 ASSERT(dtrace_helpers > 0);
14251
14252 help = from->p_dtrace_helpers;
14253 newhelp = dtrace_helpers_create(to);
14254 ASSERT(to->p_dtrace_helpers != NULL);
14255
14256 newhelp->dthps_generation = help->dthps_generation;
14257 vstate = &newhelp->dthps_vstate;
14258
14259 /*
14260 * Duplicate the helper actions.
14261 */
14262 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
14263 if ((helper = help->dthps_actions[i]) == NULL)
14264 continue;
14265
14266 for (last = NULL; helper != NULL; helper = helper->dtha_next) {
14267 new = kmem_zalloc(sizeof (dtrace_helper_action_t),
14268 KM_SLEEP);
14269 new->dtha_generation = helper->dtha_generation;
14270
14271 if ((dp = helper->dtha_predicate) != NULL) {
14272 dp = dtrace_difo_duplicate(dp, vstate);
14273 new->dtha_predicate = dp;
14274 }
14275
14276 new->dtha_nactions = helper->dtha_nactions;
14277 sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
14278 new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
14279
14280 for (j = 0; j < new->dtha_nactions; j++) {
14281 dtrace_difo_t *dp2 = helper->dtha_actions[j];
14282
14283 ASSERT(dp2 != NULL);
14284 dp2 = dtrace_difo_duplicate(dp2, vstate);
14285 new->dtha_actions[j] = dp2;
14286 }
14287
14288 if (last != NULL) {
14289 last->dtha_next = new;
14290 } else {
14291 newhelp->dthps_actions[i] = new;
14292 }
14293
14294 last = new;
14295 }
14296 }
14297
14298 /*
14299 * Duplicate the helper providers and register them with the
14300 * DTrace framework.
14301 */
14302 if (help->dthps_nprovs > 0) {
14303 newhelp->dthps_nprovs = help->dthps_nprovs;
14304 newhelp->dthps_maxprovs = help->dthps_nprovs;
14305 newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
14306 sizeof (dtrace_helper_provider_t *), KM_SLEEP);
14307 for (i = 0; i < VBDTCAST(int)newhelp->dthps_nprovs; i++) {
14308 newhelp->dthps_provs[i] = help->dthps_provs[i];
14309 newhelp->dthps_provs[i]->dthp_ref++;
14310 }
14311
14312 hasprovs = 1;
14313 }
14314
14315 mutex_exit(&dtrace_lock);
14316
14317 if (hasprovs)
14318 dtrace_helper_provider_register(to, newhelp, NULL);
14319}
14320
14321#ifndef VBOX
14322
14323/*
14324 * DTrace Hook Functions
14325 */
14326static void
14327dtrace_module_loaded(struct modctl *ctl)
14328{
14329 dtrace_provider_t *prv;
14330
14331 mutex_enter(&dtrace_provider_lock);
14332 mutex_enter(&mod_lock);
14333
14334 ASSERT(ctl->mod_busy);
14335
14336 /*
14337	 * We're going to call each provider's per-module provide operation
14338 * specifying only this module.
14339 */
14340 for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
14341 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
14342
14343 mutex_exit(&mod_lock);
14344 mutex_exit(&dtrace_provider_lock);
14345
14346 /*
14347 * If we have any retained enablings, we need to match against them.
14348 * Enabling probes requires that cpu_lock be held, and we cannot hold
14349 * cpu_lock here -- it is legal for cpu_lock to be held when loading a
14350 * module. (In particular, this happens when loading scheduling
14351 * classes.) So if we have any retained enablings, we need to dispatch
14352 * our task queue to do the match for us.
14353 */
14354 mutex_enter(&dtrace_lock);
14355
14356 if (dtrace_retained == NULL) {
14357 mutex_exit(&dtrace_lock);
14358 return;
14359 }
14360
14361 (void) taskq_dispatch(dtrace_taskq,
14362 (task_func_t *)dtrace_enabling_matchall, NULL, TQ_SLEEP);
14363
14364 mutex_exit(&dtrace_lock);
14365
14366 /*
14367 * And now, for a little heuristic sleaze: in general, we want to
14368 * match modules as soon as they load. However, we cannot guarantee
14369 * this, because it would lead us to the lock ordering violation
14370 * outlined above. The common case, of course, is that cpu_lock is
14371 * _not_ held -- so we delay here for a clock tick, hoping that that's
14372 * long enough for the task queue to do its work. If it's not, it's
14373 * not a serious problem -- it just means that the module that we
14374 * just loaded may not be immediately instrumentable.
14375 */
14376 delay(1);
14377}
14378
14379static void
14380dtrace_module_unloaded(struct modctl *ctl)
14381{
14382 dtrace_probe_t template, *probe, *first, *next;
14383 dtrace_provider_t *prov;
14384
14385 template.dtpr_mod = ctl->mod_modname;
14386
14387 mutex_enter(&dtrace_provider_lock);
14388 mutex_enter(&mod_lock);
14389 mutex_enter(&dtrace_lock);
14390
14391 if (dtrace_bymod == NULL) {
14392 /*
14393 * The DTrace module is loaded (obviously) but not attached;
14394 * we don't have any work to do.
14395 */
14396 mutex_exit(&dtrace_provider_lock);
14397 mutex_exit(&mod_lock);
14398 mutex_exit(&dtrace_lock);
14399 return;
14400 }
14401
14402 for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
14403 probe != NULL; probe = probe->dtpr_nextmod) {
14404 if (probe->dtpr_ecb != NULL) {
14405 mutex_exit(&dtrace_provider_lock);
14406 mutex_exit(&mod_lock);
14407 mutex_exit(&dtrace_lock);
14408
14409 /*
14410 * This shouldn't _actually_ be possible -- we're
14411 * unloading a module that has an enabled probe in it.
14412 * (It's normally up to the provider to make sure that
14413 * this can't happen.) However, because dtps_enable()
14414 * doesn't have a failure mode, there can be an
14415 * enable/unload race. Upshot: we don't want to
14416 * assert, but we're not going to disable the
14417 * probe, either.
14418 */
14419 if (dtrace_err_verbose) {
14420 cmn_err(CE_WARN, "unloaded module '%s' had "
14421 "enabled probes", ctl->mod_modname);
14422 }
14423
14424 return;
14425 }
14426 }
14427
14428 probe = first;
14429
14430 for (first = NULL; probe != NULL; probe = next) {
14431 ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
14432
14433 dtrace_probes[probe->dtpr_id - 1] = NULL;
14434
14435 next = probe->dtpr_nextmod;
14436 dtrace_hash_remove(dtrace_bymod, probe);
14437 dtrace_hash_remove(dtrace_byfunc, probe);
14438 dtrace_hash_remove(dtrace_byname, probe);
14439
14440 if (first == NULL) {
14441 first = probe;
14442 probe->dtpr_nextmod = NULL;
14443 } else {
14444 probe->dtpr_nextmod = first;
14445 first = probe;
14446 }
14447 }
14448
14449 /*
14450 * We've removed all of the module's probes from the hash chains and
14451 * from the probe array. Now issue a dtrace_sync() to be sure that
14452 * everyone has cleared out from any probe array processing.
14453 */
14454 dtrace_sync();
14455
14456 for (probe = first; probe != NULL; probe = first) {
14457 first = probe->dtpr_nextmod;
14458 prov = probe->dtpr_provider;
14459 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
14460 probe->dtpr_arg);
14461 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
14462 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
14463 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
14464 vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
14465 kmem_free(probe, sizeof (dtrace_probe_t));
14466 }
14467
14468 mutex_exit(&dtrace_lock);
14469 mutex_exit(&mod_lock);
14470 mutex_exit(&dtrace_provider_lock);
14471}
14472
14473#endif /* !VBOX */
14474
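/*
 * dtrace_suspend() and dtrace_resume() invoke each provider's
 * dtps_suspend/dtps_resume entry point for every probe via
 * dtrace_probe_foreach(); they are hooked up around debugger entry/exit
 * and CPU start in dtrace_attach(), below.
 */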
14475VBDTSTATIC void
14476dtrace_suspend(void)
14477{
14478 dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
14479}
14480
14481VBDTSTATIC void
14482dtrace_resume(void)
14483{
14484 dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
14485}
14486
14487#ifdef VBOX
14488typedef enum {
14489 CPU_INVALID,
14490 CPU_CONFIG,
14491 CPU_UNCONFIG
14492} cpu_setup_t;
14493#endif
14494
14495
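/*
 * CPU configuration hook: when a CPU is configured (CPU_CONFIG) and
 * there is active anonymous tracing state, allocate buffers for the new
 * CPU by temporarily forcing a manual buffer-resize policy and pointing
 * the CPU option at the newly configured CPU. Nothing is freed on
 * CPU_UNCONFIG; those buffers are reclaimed when the consumer exits.
 */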
14496static int
14497dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
14498{
14499 ASSERT(MUTEX_HELD(&cpu_lock));
14500 mutex_enter(&dtrace_lock);
14501
14502 switch (what) {
14503 case CPU_CONFIG: {
14504 dtrace_state_t *state;
14505 dtrace_optval_t *opt, rs, c;
14506
14507 /*
14508 * For now, we only allocate a new buffer for anonymous state.
14509 */
14510 if ((state = dtrace_anon.dta_state) == NULL)
14511 break;
14512
14513 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
14514 break;
14515
14516 opt = state->dts_options;
14517 c = opt[DTRACEOPT_CPU];
14518
14519 if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
14520 break;
14521
14522 /*
14523 * Regardless of what the actual policy is, we're going to
14524 * temporarily set our resize policy to be manual. We're
14525 * also going to temporarily set our CPU option to denote
14526 * the newly configured CPU.
14527 */
14528 rs = opt[DTRACEOPT_BUFRESIZE];
14529 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
14530 opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;
14531
14532 (void) dtrace_state_buffers(state);
14533
14534 opt[DTRACEOPT_BUFRESIZE] = rs;
14535 opt[DTRACEOPT_CPU] = c;
14536
14537 break;
14538 }
14539
14540 case CPU_UNCONFIG:
14541 /*
14542 * We don't free the buffer in the CPU_UNCONFIG case. (The
14543 * buffer will be freed when the consumer exits.)
14544 */
14545 break;
14546
14547 default:
14548 break;
14549 }
14550
14551 mutex_exit(&dtrace_lock);
14552 return (0);
14553}
14554
14555#ifndef VBOX
14556static void
14557dtrace_cpu_setup_initial(processorid_t cpu)
14558{
14559 (void) dtrace_cpu_setup(CPU_CONFIG, cpu);
14560}
14561#endif /* !VBOX */
14562
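/*
 * Record an address range that is toxic to touch from probe context.
 * The toxic range table grows by doubling; existing entries are copied
 * into the new allocation and the old table is freed.
 */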
14563static void
14564dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
14565{
14566 if (dtrace_toxranges >= dtrace_toxranges_max) {
14567 int osize, nsize;
14568 dtrace_toxrange_t *range;
14569
14570 osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
14571
14572 if (osize == 0) {
14573 ASSERT(dtrace_toxrange == NULL);
14574 ASSERT(dtrace_toxranges_max == 0);
14575 dtrace_toxranges_max = 1;
14576 } else {
14577 dtrace_toxranges_max <<= 1;
14578 }
14579
14580 nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
14581 range = kmem_zalloc(nsize, KM_SLEEP);
14582
14583 if (dtrace_toxrange != NULL) {
14584 ASSERT(osize != 0);
14585 bcopy(dtrace_toxrange, range, osize);
14586 kmem_free(dtrace_toxrange, osize);
14587 }
14588
14589 dtrace_toxrange = range;
14590 }
14591
14592 ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == NULL);
14593 ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == NULL);
14594
14595 dtrace_toxrange[dtrace_toxranges].dtt_base = base;
14596 dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
14597 dtrace_toxranges++;
14598}
14599
14600/*
14601 * DTrace Driver Cookbook Functions
14602 */
14603/*ARGSUSED*/
14604static int
14605dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
14606{
14607 dtrace_provider_id_t id;
14608 dtrace_state_t *state = NULL;
14609 dtrace_enabling_t *enab;
14610
14611 mutex_enter(&cpu_lock);
14612 mutex_enter(&dtrace_provider_lock);
14613 mutex_enter(&dtrace_lock);
14614
14615 if (ddi_soft_state_init(&dtrace_softstate,
14616 sizeof (dtrace_state_t), 0) != 0) {
14617 cmn_err(CE_NOTE, "/dev/dtrace failed to initialize soft state");
14618 mutex_exit(&cpu_lock);
14619 mutex_exit(&dtrace_provider_lock);
14620 mutex_exit(&dtrace_lock);
14621 return (DDI_FAILURE);
14622 }
14623
14624#ifndef VBOX
14625 if (ddi_create_minor_node(devi, DTRACEMNR_DTRACE, S_IFCHR,
14626 DTRACEMNRN_DTRACE, DDI_PSEUDO, NULL) == DDI_FAILURE ||
14627 ddi_create_minor_node(devi, DTRACEMNR_HELPER, S_IFCHR,
14628 DTRACEMNRN_HELPER, DDI_PSEUDO, NULL) == DDI_FAILURE) {
14629 cmn_err(CE_NOTE, "/dev/dtrace couldn't create minor nodes");
14630 ddi_remove_minor_node(devi, NULL);
14631 ddi_soft_state_fini(&dtrace_softstate);
14632 mutex_exit(&cpu_lock);
14633 mutex_exit(&dtrace_provider_lock);
14634 mutex_exit(&dtrace_lock);
14635 return (DDI_FAILURE);
14636 }
14637#endif
14638
14639 ddi_report_dev(devi);
14640 dtrace_devi = devi;
14641
14642#ifndef VBOX
14643 dtrace_modload = dtrace_module_loaded;
14644 dtrace_modunload = dtrace_module_unloaded;
14645 dtrace_cpu_init = dtrace_cpu_setup_initial;
14646 dtrace_helpers_cleanup = dtrace_helpers_destroy;
14647 dtrace_helpers_fork = dtrace_helpers_duplicate;
14648 dtrace_cpustart_init = dtrace_suspend;
14649 dtrace_cpustart_fini = dtrace_resume;
14650 dtrace_debugger_init = dtrace_suspend;
14651 dtrace_debugger_fini = dtrace_resume;
14652
14653 register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
14654#else
14655 /** @todo some of these hooks needs checking out! */
14656#endif
14657
14658 ASSERT(MUTEX_HELD(&cpu_lock));
14659
14660 dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1,
14661 NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
14662 dtrace_minor = vmem_create("dtrace_minor", (void *)DTRACEMNRN_CLONE,
14663 UINT32_MAX - DTRACEMNRN_CLONE, 1, NULL, NULL, NULL, 0,
14664 VM_SLEEP | VMC_IDENTIFIER);
14665#ifndef VBOX
14666 dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri,
14667 1, INT_MAX, 0);
14668#endif
14669
14670 dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
14671 sizeof (dtrace_dstate_percpu_t) * NCPU, DTRACE_STATE_ALIGN,
14672 NULL, NULL, NULL, NULL, NULL, 0);
14673
14674 ASSERT(MUTEX_HELD(&cpu_lock));
14675 dtrace_bymod = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod),
14676 offsetof(dtrace_probe_t, dtpr_nextmod),
14677 offsetof(dtrace_probe_t, dtpr_prevmod));
14678
14679 dtrace_byfunc = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_func),
14680 offsetof(dtrace_probe_t, dtpr_nextfunc),
14681 offsetof(dtrace_probe_t, dtpr_prevfunc));
14682
14683 dtrace_byname = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_name),
14684 offsetof(dtrace_probe_t, dtpr_nextname),
14685 offsetof(dtrace_probe_t, dtpr_prevname));
14686
14687 if (dtrace_retain_max < 1) {
14688 cmn_err(CE_WARN, "illegal value (%lu) for dtrace_retain_max; "
14689 "setting to 1", dtrace_retain_max);
14690 dtrace_retain_max = 1;
14691 }
14692
14693 /*
14694 * Now discover our toxic ranges.
14695 */
14696 dtrace_toxic_ranges(dtrace_toxrange_add);
14697
14698 /*
14699 * Before we register ourselves as a provider to our own framework,
14700 * we would like to assert that dtrace_provider is NULL -- but that's
14701 * not true if we were loaded as a dependency of a DTrace provider.
14702 * Once we've registered, we can assert that dtrace_provider is our
14703 * pseudo provider.
14704 */
14705 (void) dtrace_register("dtrace", &dtrace_provider_attr,
14706 DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id);
14707
14708 ASSERT(dtrace_provider != NULL);
14709 ASSERT((dtrace_provider_id_t)dtrace_provider == id);
14710
14711 dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
14712 dtrace_provider, NULL, NULL, "BEGIN", 0, NULL);
14713 dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
14714 dtrace_provider, NULL, NULL, "END", 0, NULL);
14715 dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
14716 dtrace_provider, NULL, NULL, "ERROR", 1, NULL);
14717
14718 dtrace_anon_property();
14719 mutex_exit(&cpu_lock);
14720
14721 /*
14722 * If DTrace helper tracing is enabled, we need to allocate the
14723 * trace buffer and initialize the values.
14724 */
14725 if (dtrace_helptrace_enabled) {
14726 ASSERT(dtrace_helptrace_buffer == NULL);
14727 dtrace_helptrace_buffer =
14728 kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
14729 dtrace_helptrace_next = 0;
14730 }
14731
14732 /*
14733 * If there are already providers, we must ask them to provide their
14734 * probes, and then match any anonymous enabling against them. Note
14735 * that there should be no other retained enablings at this time:
14736 * the only retained enablings at this time should be the anonymous
14737 * enabling.
14738 */
14739 if (dtrace_anon.dta_enabling != NULL) {
14740 ASSERT(dtrace_retained == dtrace_anon.dta_enabling);
14741
14742 dtrace_enabling_provide(NULL);
14743 state = dtrace_anon.dta_state;
14744
14745 /*
14746 * We couldn't hold cpu_lock across the above call to
14747 * dtrace_enabling_provide(), but we must hold it to actually
14748 * enable the probes. We have to drop all of our locks, pick
14749 * up cpu_lock, and regain our locks before matching the
14750 * retained anonymous enabling.
14751 */
14752 mutex_exit(&dtrace_lock);
14753 mutex_exit(&dtrace_provider_lock);
14754
14755 mutex_enter(&cpu_lock);
14756 mutex_enter(&dtrace_provider_lock);
14757 mutex_enter(&dtrace_lock);
14758
14759 if ((enab = dtrace_anon.dta_enabling) != NULL)
14760 (void) dtrace_enabling_match(enab, NULL);
14761
14762 mutex_exit(&cpu_lock);
14763 }
14764
14765 mutex_exit(&dtrace_lock);
14766 mutex_exit(&dtrace_provider_lock);
14767
14768 if (state != NULL) {
14769 /*
14770 * If we created any anonymous state, set it going now.
14771 */
14772 (void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
14773 }
14774
14775 return (DDI_SUCCESS);
14776}
14777
14778/*ARGSUSED*/
14779static int
14780dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
14781{
14782 dtrace_state_t *state;
14783 uint32_t priv;
14784 uid_t uid;
14785 zoneid_t zoneid;
14786
14787 if (getminor(*devp) == DTRACEMNRN_HELPER)
14788 return (0);
14789
14790 /*
14791 * If this wasn't an open with the "helper" minor, then it must be
14792 * the "dtrace" minor.
14793 */
14794 if (getminor(*devp) != DTRACEMNRN_DTRACE)
14795 return (ENXIO);
14796
14797 /*
14798 * If no DTRACE_PRIV_* bits are set in the credential, then the
14799 * caller lacks sufficient permission to do anything with DTrace.
14800 */
14801 dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
14802 if (priv == DTRACE_PRIV_NONE)
14803 return (EACCES);
14804
14805 /*
14806 * Ask all providers to provide all their probes.
14807 */
14808 mutex_enter(&dtrace_provider_lock);
14809 dtrace_probe_provide(NULL, NULL);
14810 mutex_exit(&dtrace_provider_lock);
14811
14812 mutex_enter(&cpu_lock);
14813 mutex_enter(&dtrace_lock);
14814 dtrace_opens++;
14815 dtrace_membar_producer();
14816
14817#ifndef VBOX
14818 /*
14819 * If the kernel debugger is active (that is, if the kernel debugger
14820 * modified text in some way), we won't allow the open.
14821 */
14822 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
14823 dtrace_opens--;
14824 mutex_exit(&cpu_lock);
14825 mutex_exit(&dtrace_lock);
14826 return (EBUSY);
14827 }
14828#endif
14829
14830 state = dtrace_state_create(devp, cred_p);
14831 mutex_exit(&cpu_lock);
14832
14833 if (state == NULL) {
14834#ifndef VBOX
14835 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
14836 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
14837#endif
14838 mutex_exit(&dtrace_lock);
14839 return (EAGAIN);
14840 }
14841
14842 mutex_exit(&dtrace_lock);
14843
14844 return (0);
14845}
14846
14847/*ARGSUSED*/
14848static int
14849dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
14850{
14851 minor_t minor = getminor(dev);
14852 dtrace_state_t *state;
14853
14854 if (minor == DTRACEMNRN_HELPER)
14855 return (0);
14856
14857 state = ddi_get_soft_state(dtrace_softstate, minor);
14858
14859 mutex_enter(&cpu_lock);
14860 mutex_enter(&dtrace_lock);
14861
14862 if (state->dts_anon) {
14863 /*
14864 * There is anonymous state. Destroy that first.
14865 */
14866 ASSERT(dtrace_anon.dta_state == NULL);
14867 dtrace_state_destroy(state->dts_anon);
14868 }
14869
14870 dtrace_state_destroy(state);
14871 ASSERT(dtrace_opens > 0);
14872
14873#ifndef VBOX
14874 /*
14875 * Only relinquish control of the kernel debugger interface when there
14876 * are no consumers and no anonymous enablings.
14877 */
14878 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
14879 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
14880#endif
14881
14882 mutex_exit(&dtrace_lock);
14883 mutex_exit(&cpu_lock);
14884
14885 return (0);
14886}
14887
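/*
 * Handle ioctls on the "helper" minor: DTRACEHIOC_ADDDOF copies in a
 * dof_helper_t and falls through to DTRACEHIOC_ADD, which slurps the DOF
 * from user space; DTRACEHIOC_REMOVE destroys a helper generation. A
 * minimal user-land sketch (hypothetical file descriptor fd on the
 * helper device; error handling elided):
 *
 *	dof_helper_t dh;
 *	dh.dofhp_addr = dh.dofhp_dof = (uintptr_t)dof;
 *	(void) strlcpy(dh.dofhp_mod, "mymod", sizeof (dh.dofhp_mod));
 *	gen = ioctl(fd, DTRACEHIOC_ADDDOF, &dh);
 *	...
 *	(void) ioctl(fd, DTRACEHIOC_REMOVE, gen);
 */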
14888/*ARGSUSED*/
14889static int
14890dtrace_ioctl_helper(int cmd, intptr_t arg, int *rv)
14891{
14892 int rval;
14893 dof_helper_t help, *dhp = NULL;
14894
14895 switch (cmd) {
14896 case DTRACEHIOC_ADDDOF:
14897 if (copyin((void *)arg, &help, sizeof (help)) != 0) {
14898 dtrace_dof_error(NULL, "failed to copyin DOF helper");
14899 return (EFAULT);
14900 }
14901
14902 dhp = &help;
14903 arg = (intptr_t)help.dofhp_dof;
14904 /*FALLTHROUGH*/
14905
14906 case DTRACEHIOC_ADD: {
14907 dof_hdr_t *dof = dtrace_dof_copyin(arg, &rval);
14908
14909 if (dof == NULL)
14910 return (rval);
14911
14912 mutex_enter(&dtrace_lock);
14913
14914 /*
14915 * dtrace_helper_slurp() takes responsibility for the dof --
14916 * it may free it now or it may save it and free it later.
14917 */
14918 if ((rval = dtrace_helper_slurp(dof, dhp)) != -1) {
14919 *rv = rval;
14920 rval = 0;
14921 } else {
14922 rval = EINVAL;
14923 }
14924
14925 mutex_exit(&dtrace_lock);
14926 return (rval);
14927 }
14928
14929 case DTRACEHIOC_REMOVE: {
14930 mutex_enter(&dtrace_lock);
14931 rval = dtrace_helper_destroygen(arg);
14932 mutex_exit(&dtrace_lock);
14933
14934 return (rval);
14935 }
14936
14937 default:
14938 break;
14939 }
14940
14941 return (ENOTTY);
14942}
14943
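/*
 * Handle ioctls on the "dtrace" minor, dispatching on the DTRACEIOC_*
 * command. A minimal consumer sketch for one of the simpler commands
 * (hypothetical file descriptor fd; error handling elided):
 *
 *	dtrace_conf_t conf;
 *	if (ioctl(fd, DTRACEIOC_CONF, &conf) == 0)
 *		(void) printf("DIF version %u\n", conf.dtc_difversion);
 */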
14944/*ARGSUSED*/
14945static int
14946dtrace_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
14947{
14948 minor_t minor = getminor(dev);
14949 dtrace_state_t *state;
14950 int rval;
14951
14952 if (minor == DTRACEMNRN_HELPER)
14953 return (dtrace_ioctl_helper(cmd, arg, rv));
14954
14955 state = ddi_get_soft_state(dtrace_softstate, minor);
14956
14957 if (state->dts_anon) {
14958 ASSERT(dtrace_anon.dta_state == NULL);
14959 state = state->dts_anon;
14960 }
14961
14962 switch (cmd) {
14963 case DTRACEIOC_PROVIDER: {
14964 dtrace_providerdesc_t pvd;
14965 dtrace_provider_t *pvp;
14966
14967 if (copyin((void *)arg, &pvd, sizeof (pvd)) != 0)
14968 return (EFAULT);
14969
14970 pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
14971 mutex_enter(&dtrace_provider_lock);
14972
14973 for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
14974 if (strcmp(pvp->dtpv_name, pvd.dtvd_name) == 0)
14975 break;
14976 }
14977
14978 mutex_exit(&dtrace_provider_lock);
14979
14980 if (pvp == NULL)
14981 return (ESRCH);
14982
14983 bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
14984 bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));
14985 if (copyout(&pvd, (void *)arg, sizeof (pvd)) != 0)
14986 return (EFAULT);
14987
14988 return (0);
14989 }
14990
14991 case DTRACEIOC_EPROBE: {
14992 dtrace_eprobedesc_t epdesc;
14993 dtrace_ecb_t *ecb;
14994 dtrace_action_t *act;
14995 void *buf;
14996 size_t size;
14997 uintptr_t dest;
14998 int nrecs;
14999
15000 if (copyin((void *)arg, &epdesc, sizeof (epdesc)) != 0)
15001 return (EFAULT);
15002
15003 mutex_enter(&dtrace_lock);
15004
15005 if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
15006 mutex_exit(&dtrace_lock);
15007 return (EINVAL);
15008 }
15009
15010 if (ecb->dte_probe == NULL) {
15011 mutex_exit(&dtrace_lock);
15012 return (EINVAL);
15013 }
15014
15015 epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
15016 epdesc.dtepd_uarg = ecb->dte_uarg;
15017 epdesc.dtepd_size = ecb->dte_size;
15018
15019 nrecs = epdesc.dtepd_nrecs;
15020 epdesc.dtepd_nrecs = 0;
15021 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
15022 if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
15023 continue;
15024
15025 epdesc.dtepd_nrecs++;
15026 }
15027
15028 /*
15029 * Now that we have the size, we need to allocate a temporary
15030 * buffer in which to store the complete description. We need
15031 * the temporary buffer to be able to drop dtrace_lock()
15032 * across the copyout(), below.
15033 */
15034 size = sizeof (dtrace_eprobedesc_t) +
15035 (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));
15036
15037 buf = kmem_alloc(size, KM_SLEEP);
15038 dest = (uintptr_t)buf;
15039
15040 bcopy(&epdesc, (void *)dest, sizeof (epdesc));
15041 dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);
15042
15043 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
15044 if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
15045 continue;
15046
15047 if (nrecs-- == 0)
15048 break;
15049
15050 bcopy(&act->dta_rec, (void *)dest,
15051 sizeof (dtrace_recdesc_t));
15052 dest += sizeof (dtrace_recdesc_t);
15053 }
15054
15055 mutex_exit(&dtrace_lock);
15056
15057 if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
15058 kmem_free(buf, size);
15059 return (EFAULT);
15060 }
15061
15062 kmem_free(buf, size);
15063 return (0);
15064 }
15065
15066 case DTRACEIOC_AGGDESC: {
15067 dtrace_aggdesc_t aggdesc;
15068 dtrace_action_t *act;
15069 dtrace_aggregation_t *agg;
15070 int nrecs;
15071 uint32_t offs;
15072 dtrace_recdesc_t *lrec;
15073 void *buf;
15074 size_t size;
15075 uintptr_t dest;
15076
15077 if (copyin((void *)arg, &aggdesc, sizeof (aggdesc)) != 0)
15078 return (EFAULT);
15079
15080 mutex_enter(&dtrace_lock);
15081
15082 if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
15083 mutex_exit(&dtrace_lock);
15084 return (EINVAL);
15085 }
15086
15087 aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;
15088
15089 nrecs = aggdesc.dtagd_nrecs;
15090 aggdesc.dtagd_nrecs = 0;
15091
15092 offs = agg->dtag_base;
15093 lrec = &agg->dtag_action.dta_rec;
15094 aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;
15095
15096 for (act = agg->dtag_first; ; act = act->dta_next) {
15097 ASSERT(act->dta_intuple ||
15098 DTRACEACT_ISAGG(act->dta_kind));
15099
15100 /*
15101 * If this action has a record size of zero, it
15102 * denotes an argument to the aggregating action.
15103 * Because the presence of this record doesn't (or
15104 * shouldn't) affect the way the data is interpreted,
15105 * we don't copy it out to save user-level the
15106 * confusion of dealing with a zero-length record.
15107 */
15108 if (act->dta_rec.dtrd_size == 0) {
15109 ASSERT(agg->dtag_hasarg);
15110 continue;
15111 }
15112
15113 aggdesc.dtagd_nrecs++;
15114
15115 if (act == &agg->dtag_action)
15116 break;
15117 }
15118
15119 /*
15120 * Now that we have the size, we need to allocate a temporary
15121 * buffer in which to store the complete description. We need
15122 * the temporary buffer to be able to drop dtrace_lock()
15123 * across the copyout(), below.
15124 */
15125 size = sizeof (dtrace_aggdesc_t) +
15126 (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));
15127
15128 buf = kmem_alloc(size, KM_SLEEP);
15129 dest = (uintptr_t)buf;
15130
15131 bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
15132 dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);
15133
15134 for (act = agg->dtag_first; ; act = act->dta_next) {
15135 dtrace_recdesc_t rec = act->dta_rec;
15136
15137 /*
15138 * See the comment in the above loop for why we pass
15139 * over zero-length records.
15140 */
15141 if (rec.dtrd_size == 0) {
15142 ASSERT(agg->dtag_hasarg);
15143 continue;
15144 }
15145
15146 if (nrecs-- == 0)
15147 break;
15148
15149 rec.dtrd_offset -= offs;
15150 bcopy(&rec, (void *)dest, sizeof (rec));
15151 dest += sizeof (dtrace_recdesc_t);
15152
15153 if (act == &agg->dtag_action)
15154 break;
15155 }
15156
15157 mutex_exit(&dtrace_lock);
15158
15159 if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
15160 kmem_free(buf, size);
15161 return (EFAULT);
15162 }
15163
15164 kmem_free(buf, size);
15165 return (0);
15166 }
15167
15168 case DTRACEIOC_ENABLE: {
15169 dof_hdr_t *dof;
15170 dtrace_enabling_t *enab = NULL;
15171 dtrace_vstate_t *vstate;
15172 int err = 0;
15173
15174 *rv = 0;
15175
15176 /*
15177 * If a NULL argument has been passed, we take this as our
15178 * cue to reevaluate our enablings.
15179 */
15180 if (arg == NULL) {
15181 dtrace_enabling_matchall();
15182
15183 return (0);
15184 }
15185
15186 if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
15187 return (rval);
15188
15189 mutex_enter(&cpu_lock);
15190 mutex_enter(&dtrace_lock);
15191 vstate = &state->dts_vstate;
15192
15193 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
15194 mutex_exit(&dtrace_lock);
15195 mutex_exit(&cpu_lock);
15196 dtrace_dof_destroy(dof);
15197 return (EBUSY);
15198 }
15199
15200 if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
15201 mutex_exit(&dtrace_lock);
15202 mutex_exit(&cpu_lock);
15203 dtrace_dof_destroy(dof);
15204 return (EINVAL);
15205 }
15206
15207 if ((rval = dtrace_dof_options(dof, state)) != 0) {
15208 dtrace_enabling_destroy(enab);
15209 mutex_exit(&dtrace_lock);
15210 mutex_exit(&cpu_lock);
15211 dtrace_dof_destroy(dof);
15212 return (rval);
15213 }
15214
15215 if ((err = dtrace_enabling_match(enab, rv)) == 0) {
15216 err = dtrace_enabling_retain(enab);
15217 } else {
15218 dtrace_enabling_destroy(enab);
15219 }
15220
15221 mutex_exit(&cpu_lock);
15222 mutex_exit(&dtrace_lock);
15223 dtrace_dof_destroy(dof);
15224
15225 return (err);
15226 }
15227
15228 case DTRACEIOC_REPLICATE: {
15229 dtrace_repldesc_t desc;
15230 dtrace_probedesc_t *match = &desc.dtrpd_match;
15231 dtrace_probedesc_t *create = &desc.dtrpd_create;
15232 int err;
15233
15234 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
15235 return (EFAULT);
15236
15237 match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
15238 match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
15239 match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
15240 match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
15241
15242 create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
15243 create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
15244 create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
15245 create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
15246
15247 mutex_enter(&dtrace_lock);
15248 err = dtrace_enabling_replicate(state, match, create);
15249 mutex_exit(&dtrace_lock);
15250
15251 return (err);
15252 }
15253
15254 case DTRACEIOC_PROBEMATCH:
15255 case DTRACEIOC_PROBES: {
15256 dtrace_probe_t *probe = NULL;
15257 dtrace_probedesc_t desc;
15258 dtrace_probekey_t pkey;
15259 dtrace_id_t i;
15260 int m = 0;
15261 uint32_t priv;
15262 uid_t uid;
15263 zoneid_t zoneid;
15264
15265 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
15266 return (EFAULT);
15267
15268 desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
15269 desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
15270 desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
15271 desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';
15272
15273 /*
15274 * Before we attempt to match this probe, we want to give
15275 * all providers the opportunity to provide it.
15276 */
15277 if (desc.dtpd_id == DTRACE_IDNONE) {
15278 mutex_enter(&dtrace_provider_lock);
15279 dtrace_probe_provide(&desc, NULL);
15280 mutex_exit(&dtrace_provider_lock);
15281 desc.dtpd_id++;
15282 }
15283
15284 if (cmd == DTRACEIOC_PROBEMATCH) {
15285 dtrace_probekey(&desc, &pkey);
15286 pkey.dtpk_id = DTRACE_IDNONE;
15287 }
15288
15289 dtrace_cred2priv(cr, &priv, &uid, &zoneid);
15290
15291 mutex_enter(&dtrace_lock);
15292
15293 if (cmd == DTRACEIOC_PROBEMATCH) {
15294 for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
15295 if ((probe = dtrace_probes[i - 1]) != NULL &&
15296 (m = dtrace_match_probe(probe, &pkey,
15297 priv, uid, zoneid)) != 0)
15298 break;
15299 }
15300
15301 if (m < 0) {
15302 mutex_exit(&dtrace_lock);
15303 return (EINVAL);
15304 }
15305
15306 } else {
15307 for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
15308 if ((probe = dtrace_probes[i - 1]) != NULL &&
15309 dtrace_match_priv(probe, priv, uid, zoneid))
15310 break;
15311 }
15312 }
15313
15314 if (probe == NULL) {
15315 mutex_exit(&dtrace_lock);
15316 return (ESRCH);
15317 }
15318
15319 dtrace_probe_description(probe, &desc);
15320 mutex_exit(&dtrace_lock);
15321
15322 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
15323 return (EFAULT);
15324
15325 return (0);
15326 }
15327
15328 case DTRACEIOC_PROBEARG: {
15329 dtrace_argdesc_t desc;
15330 dtrace_probe_t *probe;
15331 dtrace_provider_t *prov;
15332
15333 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
15334 return (EFAULT);
15335
15336 if (desc.dtargd_id == DTRACE_IDNONE)
15337 return (EINVAL);
15338
15339 if (desc.dtargd_ndx == DTRACE_ARGNONE)
15340 return (EINVAL);
15341
15342 mutex_enter(&dtrace_provider_lock);
15343 mutex_enter(&mod_lock);
15344 mutex_enter(&dtrace_lock);
15345
15346 if (desc.dtargd_id > dtrace_nprobes) {
15347 mutex_exit(&dtrace_lock);
15348 mutex_exit(&mod_lock);
15349 mutex_exit(&dtrace_provider_lock);
15350 return (EINVAL);
15351 }
15352
15353 if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
15354 mutex_exit(&dtrace_lock);
15355 mutex_exit(&mod_lock);
15356 mutex_exit(&dtrace_provider_lock);
15357 return (EINVAL);
15358 }
15359
15360 mutex_exit(&dtrace_lock);
15361
15362 prov = probe->dtpr_provider;
15363
15364 if (prov->dtpv_pops.dtps_getargdesc == NULL) {
15365 /*
15366 * There isn't any typed information for this probe.
15367 * Set the argument number to DTRACE_ARGNONE.
15368 */
15369 desc.dtargd_ndx = DTRACE_ARGNONE;
15370 } else {
15371 desc.dtargd_native[0] = '\0';
15372 desc.dtargd_xlate[0] = '\0';
15373 desc.dtargd_mapping = desc.dtargd_ndx;
15374
15375 prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
15376 probe->dtpr_id, probe->dtpr_arg, &desc);
15377 }
15378
15379 mutex_exit(&mod_lock);
15380 mutex_exit(&dtrace_provider_lock);
15381
15382 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
15383 return (EFAULT);
15384
15385 return (0);
15386 }
15387
15388 case DTRACEIOC_GO: {
15389 processorid_t cpuid;
15390 rval = dtrace_state_go(state, &cpuid);
15391
15392 if (rval != 0)
15393 return (rval);
15394
15395 if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
15396 return (EFAULT);
15397
15398 return (0);
15399 }
15400
15401 case DTRACEIOC_STOP: {
15402 processorid_t cpuid;
15403
15404 mutex_enter(&dtrace_lock);
15405 rval = dtrace_state_stop(state, &cpuid);
15406 mutex_exit(&dtrace_lock);
15407
15408 if (rval != 0)
15409 return (rval);
15410
15411 if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
15412 return (EFAULT);
15413
15414 return (0);
15415 }
15416
15417 case DTRACEIOC_DOFGET: {
15418 dof_hdr_t hdr, *dof;
15419 uint64_t len;
15420
15421 if (copyin((void *)arg, &hdr, sizeof (hdr)) != 0)
15422 return (EFAULT);
15423
15424 mutex_enter(&dtrace_lock);
15425 dof = dtrace_dof_create(state);
15426 mutex_exit(&dtrace_lock);
15427
15428 len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
15429 rval = copyout(dof, (void *)arg, len);
15430 dtrace_dof_destroy(dof);
15431
15432 return (rval == 0 ? 0 : EFAULT);
15433 }
15434
15435 case DTRACEIOC_AGGSNAP:
15436 case DTRACEIOC_BUFSNAP: {
15437 dtrace_bufdesc_t desc;
15438 caddr_t cached;
15439 dtrace_buffer_t *buf;
15440
15441 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
15442 return (EFAULT);
15443
15444	 if (/*VBox value is unsigned: desc.dtbd_cpu < 0 ||*/ desc.dtbd_cpu >= NCPU)
15445 return (EINVAL);
15446
15447 mutex_enter(&dtrace_lock);
15448
15449 if (cmd == DTRACEIOC_BUFSNAP) {
15450 buf = &state->dts_buffer[desc.dtbd_cpu];
15451 } else {
15452 buf = &state->dts_aggbuffer[desc.dtbd_cpu];
15453 }
15454
15455 if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
15456 size_t sz = buf->dtb_offset;
15457
15458 if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
15459 mutex_exit(&dtrace_lock);
15460 return (EBUSY);
15461 }
15462
15463 /*
15464 * If this buffer has already been consumed, we're
15465 * going to indicate that there's nothing left here
15466 * to consume.
15467 */
15468 if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
15469 mutex_exit(&dtrace_lock);
15470
15471 desc.dtbd_size = 0;
15472 desc.dtbd_drops = 0;
15473 desc.dtbd_errors = 0;
15474 desc.dtbd_oldest = 0;
15475 sz = sizeof (desc);
15476
15477 if (copyout(&desc, (void *)arg, sz) != 0)
15478 return (EFAULT);
15479
15480 return (0);
15481 }
15482
15483 /*
15484 * If this is a ring buffer that has wrapped, we want
15485 * to copy the whole thing out.
15486 */
15487 if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
15488 dtrace_buffer_polish(buf);
15489 sz = buf->dtb_size;
15490 }
15491
15492 if (copyout(buf->dtb_tomax, desc.dtbd_data, sz) != 0) {
15493 mutex_exit(&dtrace_lock);
15494 return (EFAULT);
15495 }
15496
15497 desc.dtbd_size = sz;
15498 desc.dtbd_drops = buf->dtb_drops;
15499 desc.dtbd_errors = buf->dtb_errors;
15500 desc.dtbd_oldest = buf->dtb_xamot_offset;
15501
15502 mutex_exit(&dtrace_lock);
15503
15504 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
15505 return (EFAULT);
15506
15507 buf->dtb_flags |= DTRACEBUF_CONSUMED;
15508
15509 return (0);
15510 }
15511
15512 if (buf->dtb_tomax == NULL) {
15513 ASSERT(buf->dtb_xamot == NULL);
15514 mutex_exit(&dtrace_lock);
15515 return (ENOENT);
15516 }
15517
15518 cached = buf->dtb_tomax;
15519 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
15520
15521 dtrace_xcall(desc.dtbd_cpu,
15522 (dtrace_xcall_t)dtrace_buffer_switch, buf);
15523
15524 state->dts_errors += buf->dtb_xamot_errors;
15525
15526 /*
15527 * If the buffers did not actually switch, then the cross call
15528 * did not take place -- presumably because the given CPU is
15529 * not in the ready set. If this is the case, we'll return
15530 * ENOENT.
15531 */
15532 if (buf->dtb_tomax == cached) {
15533 ASSERT(buf->dtb_xamot != cached);
15534 mutex_exit(&dtrace_lock);
15535 return (ENOENT);
15536 }
15537
15538 ASSERT(cached == buf->dtb_xamot);
15539
15540 /*
15541 * We have our snapshot; now copy it out.
15542 */
15543 if (copyout(buf->dtb_xamot, desc.dtbd_data,
15544 buf->dtb_xamot_offset) != 0) {
15545 mutex_exit(&dtrace_lock);
15546 return (EFAULT);
15547 }
15548
15549 desc.dtbd_size = buf->dtb_xamot_offset;
15550 desc.dtbd_drops = buf->dtb_xamot_drops;
15551 desc.dtbd_errors = buf->dtb_xamot_errors;
15552 desc.dtbd_oldest = 0;
15553
15554 mutex_exit(&dtrace_lock);
15555
15556 /*
15557 * Finally, copy out the buffer description.
15558 */
15559 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
15560 return (EFAULT);
15561
15562 return (0);
15563 }
15564
15565 case DTRACEIOC_CONF: {
15566 dtrace_conf_t conf;
15567
15568 bzero(&conf, sizeof (conf));
15569 conf.dtc_difversion = DIF_VERSION;
15570 conf.dtc_difintregs = DIF_DIR_NREGS;
15571 conf.dtc_diftupregs = DIF_DTR_NREGS;
15572 conf.dtc_ctfmodel = CTF_MODEL_NATIVE;
15573
15574 if (copyout(&conf, (void *)arg, sizeof (conf)) != 0)
15575 return (EFAULT);
15576
15577 return (0);
15578 }
15579
15580 case DTRACEIOC_STATUS: {
15581 dtrace_status_t stat;
15582 dtrace_dstate_t *dstate;
15583 int i, j;
15584 uint64_t nerrs;
15585
15586 /*
15587 * See the comment in dtrace_state_deadman() for the reason
15588 * for setting dts_laststatus to INT64_MAX before setting
15589 * it to the correct value.
15590 */
15591 state->dts_laststatus = INT64_MAX;
15592 dtrace_membar_producer();
15593 state->dts_laststatus = dtrace_gethrtime();
15594
15595 bzero(&stat, sizeof (stat));
15596
15597 mutex_enter(&dtrace_lock);
15598
15599 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
15600 mutex_exit(&dtrace_lock);
15601 return (ENOENT);
15602 }
15603
15604 if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
15605 stat.dtst_exiting = 1;
15606
15607 nerrs = state->dts_errors;
15608 dstate = &state->dts_vstate.dtvs_dynvars;
15609
15610 for (i = 0; i < NCPU; i++) {
15611 dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];
15612
15613 stat.dtst_dyndrops += dcpu->dtdsc_drops;
15614 stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
15615 stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;
15616
15617 if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
15618 stat.dtst_filled++;
15619
15620 nerrs += state->dts_buffer[i].dtb_errors;
15621
15622 for (j = 0; j < state->dts_nspeculations; j++) {
15623 dtrace_speculation_t *spec;
15624 dtrace_buffer_t *buf;
15625
15626 spec = &state->dts_speculations[j];
15627 buf = &spec->dtsp_buffer[i];
15628 stat.dtst_specdrops += buf->dtb_xamot_drops;
15629 }
15630 }
15631
15632 stat.dtst_specdrops_busy = state->dts_speculations_busy;
15633 stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
15634 stat.dtst_stkstroverflows = state->dts_stkstroverflows;
15635 stat.dtst_dblerrors = state->dts_dblerrors;
15636 stat.dtst_killed =
15637 (state->dts_activity == DTRACE_ACTIVITY_KILLED);
15638 stat.dtst_errors = nerrs;
15639
15640 mutex_exit(&dtrace_lock);
15641
15642 if (copyout(&stat, (void *)arg, sizeof (stat)) != 0)
15643 return (EFAULT);
15644
15645 return (0);
15646 }
15647
15648 case DTRACEIOC_FORMAT: {
15649 dtrace_fmtdesc_t fmt;
15650 char *str;
15651 int len;
15652
15653 if (copyin((void *)arg, &fmt, sizeof (fmt)) != 0)
15654 return (EFAULT);
15655
15656 mutex_enter(&dtrace_lock);
15657
15658 if (fmt.dtfd_format == 0 ||
15659 fmt.dtfd_format > state->dts_nformats) {
15660 mutex_exit(&dtrace_lock);
15661 return (EINVAL);
15662 }
15663
15664 /*
15665 * Format strings are allocated contiguously and they are
15666 * never freed; if a format index is less than the number
15667 * of formats, we can assert that the format map is non-NULL
15668 * and that the format for the specified index is non-NULL.
15669 */
15670 ASSERT(state->dts_formats != NULL);
15671 str = state->dts_formats[fmt.dtfd_format - 1];
15672 ASSERT(str != NULL);
15673
15674 len = strlen(str) + 1;
15675
15676 if (len > fmt.dtfd_length) {
15677 fmt.dtfd_length = len;
15678
15679 if (copyout(&fmt, (void *)arg, sizeof (fmt)) != 0) {
15680 mutex_exit(&dtrace_lock);
15681 return (EINVAL);
15682 }
15683 } else {
15684 if (copyout(str, fmt.dtfd_string, len) != 0) {
15685 mutex_exit(&dtrace_lock);
15686 return (EINVAL);
15687 }
15688 }
15689
15690 mutex_exit(&dtrace_lock);
15691 return (0);
15692 }
15693
15694 default:
15695 break;
15696 }
15697
15698 return (ENOTTY);
15699}

/*ARGSUSED*/
static int
dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	dtrace_state_t *state;

	switch (cmd) {
	case DDI_DETACH:
		break;

	case DDI_SUSPEND:
		return (DDI_SUCCESS);

	default:
		return (DDI_FAILURE);
	}

	mutex_enter(&cpu_lock);
	mutex_enter(&dtrace_provider_lock);
	mutex_enter(&dtrace_lock);

	ASSERT(dtrace_opens == 0);

	if (dtrace_helpers > 0) {
		mutex_exit(&dtrace_provider_lock);
		mutex_exit(&dtrace_lock);
		mutex_exit(&cpu_lock);
		return (DDI_FAILURE);
	}

	if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) {
		mutex_exit(&dtrace_provider_lock);
		mutex_exit(&dtrace_lock);
		mutex_exit(&cpu_lock);
		return (DDI_FAILURE);
	}

	dtrace_provider = NULL;

	if ((state = dtrace_anon_grab()) != NULL) {
		/*
		 * If there were ECBs on this state, the provider should
		 * not have been allowed to detach; assert that there are
		 * none.
		 */
		ASSERT(state->dts_necbs == 0);
		dtrace_state_destroy(state);

#ifndef VBOX
		/*
		 * If we're being detached with anonymous state, we need to
		 * indicate to the kernel debugger that DTrace is now inactive.
		 */
		(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
#endif
	}

	bzero(&dtrace_anon, sizeof (dtrace_anon_t));
#ifndef VBOX /** @todo CPU hooks */
	unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
#endif
	dtrace_cpu_init = NULL;
	dtrace_helpers_cleanup = NULL;
	dtrace_helpers_fork = NULL;
	dtrace_cpustart_init = NULL;
	dtrace_cpustart_fini = NULL;
	dtrace_debugger_init = NULL;
	dtrace_debugger_fini = NULL;
	dtrace_modload = NULL;
	dtrace_modunload = NULL;

	mutex_exit(&cpu_lock);

	if (dtrace_helptrace_enabled) {
		kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize);
		dtrace_helptrace_buffer = NULL;
	}

	kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
	dtrace_probes = NULL;
	dtrace_nprobes = 0;

	dtrace_hash_destroy(dtrace_bymod);
	dtrace_hash_destroy(dtrace_byfunc);
	dtrace_hash_destroy(dtrace_byname);
	dtrace_bymod = NULL;
	dtrace_byfunc = NULL;
	dtrace_byname = NULL;

	kmem_cache_destroy(dtrace_state_cache);
	vmem_destroy(dtrace_minor);
	vmem_destroy(dtrace_arena);

	if (dtrace_toxrange != NULL) {
		kmem_free(dtrace_toxrange,
		    dtrace_toxranges_max * sizeof (dtrace_toxrange_t));
		dtrace_toxrange = NULL;
		dtrace_toxranges = 0;
		dtrace_toxranges_max = 0;
	}

#ifndef VBOX
	ddi_remove_minor_node(dtrace_devi, NULL);
#endif
	dtrace_devi = NULL;

	ddi_soft_state_fini(&dtrace_softstate);

	ASSERT(dtrace_vtime_references == 0);
	ASSERT(dtrace_opens == 0);
	ASSERT(dtrace_retained == NULL);

	mutex_exit(&dtrace_lock);
	mutex_exit(&dtrace_provider_lock);

	/*
	 * We don't destroy the task queue until after we have dropped our
	 * locks (taskq_destroy() may block on running tasks).  To prevent
	 * attempting to do work after we have effectively detached but before
	 * the task queue has been destroyed, all tasks dispatched via the
	 * task queue must check that DTrace is still attached before
	 * performing any operation.
	 */
#ifndef VBOX
	taskq_destroy(dtrace_taskq);
	dtrace_taskq = NULL;
#endif

	return (DDI_SUCCESS);
}
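
#if 0
/*
 * Example only -- not part of the original dtrace.c.  A hypothetical
 * task body illustrating the rule in the comment above: because the
 * task queue outlives the locks (taskq_destroy() may block on running
 * tasks), any task dispatched to dtrace_taskq must re-check that DTrace
 * is still attached before touching framework state.  Testing
 * dtrace_devi -- which dtrace_detach() clears -- under dtrace_lock is
 * one way to make that check; the function name is invented.
 */
static void
example_deferred_task(void *arg)
{
	mutex_enter(&dtrace_lock);

	if (dtrace_devi == NULL) {
		/* We detached between dispatch and execution; bail. */
		mutex_exit(&dtrace_lock);
		return;
	}

	/* ... it is now safe to operate on framework state ... */

	mutex_exit(&dtrace_lock);
}
#endif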

#ifndef VBOX
/*ARGSUSED*/
static int
dtrace_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	int error;

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)dtrace_devi;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
	}
	return (error);
}

static struct cb_ops dtrace_cb_ops = {
	dtrace_open,		/* open */
	dtrace_close,		/* close */
	nulldev,		/* strategy */
	nulldev,		/* print */
	nodev,			/* dump */
	nodev,			/* read */
	nodev,			/* write */
	dtrace_ioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	nochpoll,		/* poll */
	ddi_prop_op,		/* cb_prop_op */
	0,			/* streamtab */
	D_NEW | D_MP		/* Driver compatibility flag */
};

static struct dev_ops dtrace_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* refcnt */
	dtrace_info,		/* get_dev_info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	dtrace_attach,		/* attach */
	dtrace_detach,		/* detach */
	nodev,			/* reset */
	&dtrace_cb_ops,		/* driver operations */
	NULL,			/* bus operations */
	nodev,			/* dev power */
	ddi_quiesce_not_needed,	/* quiesce */
};

static struct modldrv modldrv = {
	&mod_driverops,		/* module type (this is a pseudo driver) */
	"Dynamic Tracing",	/* name of module */
	&dtrace_ops,		/* driver ops */
};

static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modldrv,
	NULL
};

int
_init(void)
{
	return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	return (mod_remove(&modlinkage));
}

#endif /* !VBOX */