regcomp.c

Last change on this file was 3548, checked in by bird, 3 years ago
grep: Use get_crt_codepage(). Don't default to the UTF-8 manifest for older VCC versions as the CRT won't do the right thing.
Property svn:eol-style set to `native`
File size: 111.9 KB

Line
1	/* Extended regular expression matching and search library.
2	Copyright (C) 2002-2021 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4	Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6	The GNU C Library is free software; you can redistribute it and/or
7	modify it under the terms of the GNU Lesser General Public
8	License as published by the Free Software Foundation; either
9	version 2.1 of the License, or (at your option) any later version.
10
11	The GNU C Library is distributed in the hope that it will be useful,
12	but WITHOUT ANY WARRANTY; without even the implied warranty of
13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14	Lesser General Public License for more details.
15
16	You should have received a copy of the GNU Lesser General Public
17	License along with the GNU C Library; if not, see
18	<https://www.gnu.org/licenses/>. */
19
20	#ifdef _LIBC
21	# include <locale/weight.h>
22	#endif
23
24	static reg_errcode_t re_compile_internal (regex_t preg, const char pattern,
25	size_t length, reg_syntax_t syntax);
26	static void re_compile_fastmap_iter (regex_t *bufp,
27	const re_dfastate_t *init_state,
28	char *fastmap);
29	static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
30	#ifdef RE_ENABLE_I18N
31	static void free_charset (re_charset_t *cset);
32	#endif /* RE_ENABLE_I18N */
33	static void free_workarea_compile (regex_t *preg);
34	static reg_errcode_t create_initial_state (re_dfa_t *dfa);
35	#ifdef RE_ENABLE_I18N
36	static void optimize_utf8 (re_dfa_t *dfa);
37	#endif
38	static reg_errcode_t analyze (regex_t *preg);
39	static reg_errcode_t preorder (bin_tree_t *root,
40	reg_errcode_t (fn (void , bin_tree_t )),
41	void *extra);
42	static reg_errcode_t postorder (bin_tree_t *root,
43	reg_errcode_t (fn (void , bin_tree_t )),
44	void *extra);
45	static reg_errcode_t optimize_subexps (void extra, bin_tree_t node);
46	static reg_errcode_t lower_subexps (void extra, bin_tree_t node);
47	static bin_tree_t lower_subexp (reg_errcode_t err, regex_t *preg,
48	bin_tree_t *node);
49	static reg_errcode_t calc_first (void extra, bin_tree_t node);
50	static reg_errcode_t calc_next (void extra, bin_tree_t node);
51	static reg_errcode_t link_nfa_nodes (void extra, bin_tree_t node);
52	static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
53	static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
54	unsigned int constraint);
55	static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
56	static reg_errcode_t calc_eclosure_iter (re_node_set new_set, re_dfa_t dfa,
57	Idx node, bool root);
58	static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
59	static Idx fetch_number (re_string_t input, re_token_t token,
60	reg_syntax_t syntax);
61	static int peek_token (re_token_t token, re_string_t input,
62	reg_syntax_t syntax);
63	static bin_tree_t parse (re_string_t regexp, regex_t *preg,
64	reg_syntax_t syntax, reg_errcode_t *err);
65	static bin_tree_t parse_reg_exp (re_string_t regexp, regex_t *preg,
66	re_token_t *token, reg_syntax_t syntax,
67	Idx nest, reg_errcode_t *err);
68	static bin_tree_t parse_branch (re_string_t regexp, regex_t *preg,
69	re_token_t *token, reg_syntax_t syntax,
70	Idx nest, reg_errcode_t *err);
71	static bin_tree_t parse_expression (re_string_t regexp, regex_t *preg,
72	re_token_t *token, reg_syntax_t syntax,
73	Idx nest, reg_errcode_t *err);
74	static bin_tree_t parse_sub_exp (re_string_t regexp, regex_t *preg,
75	re_token_t *token, reg_syntax_t syntax,
76	Idx nest, reg_errcode_t *err);
77	static bin_tree_t parse_dup_op (bin_tree_t dup_elem, re_string_t *regexp,
78	re_dfa_t dfa, re_token_t token,
79	reg_syntax_t syntax, reg_errcode_t *err);
80	static bin_tree_t parse_bracket_exp (re_string_t regexp, re_dfa_t *dfa,
81	re_token_t *token, reg_syntax_t syntax,
82	reg_errcode_t *err);
83	static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
84	re_string_t *regexp,
85	re_token_t *token, int token_len,
86	re_dfa_t *dfa,
87	reg_syntax_t syntax,
88	bool accept_hyphen);
89	static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
90	re_string_t *regexp,
91	re_token_t *token);
92	#ifdef RE_ENABLE_I18N
93	static reg_errcode_t build_equiv_class (bitset_t sbcset,
94	re_charset_t *mbcset,
95	Idx *equiv_class_alloc,
96	const unsigned char *name);
97	static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
98	bitset_t sbcset,
99	re_charset_t *mbcset,
100	Idx *char_class_alloc,
101	const char *class_name,
102	reg_syntax_t syntax);
103	#else /* not RE_ENABLE_I18N */
104	static reg_errcode_t build_equiv_class (bitset_t sbcset,
105	const unsigned char *name);
106	static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
107	bitset_t sbcset,
108	const char *class_name,
109	reg_syntax_t syntax);
110	#endif /* not RE_ENABLE_I18N */
111	static bin_tree_t build_charclass_op (re_dfa_t dfa,
112	RE_TRANSLATE_TYPE trans,
113	const char *class_name,
114	const char *extra,
115	bool non_match, reg_errcode_t *err);
116	static bin_tree_t create_tree (re_dfa_t dfa,
117	bin_tree_t left, bin_tree_t right,
118	re_token_type_t type);
119	static bin_tree_t create_token_tree (re_dfa_t dfa,
120	bin_tree_t left, bin_tree_t right,
121	const re_token_t *token);
122	static bin_tree_t duplicate_tree (const bin_tree_t src, re_dfa_t *dfa);
123	static void free_token (re_token_t *node);
124	static reg_errcode_t free_tree (void extra, bin_tree_t node);
125	static reg_errcode_t mark_opt_subexp (void extra, bin_tree_t node);
126
127
128	/* This table gives an error message for each of the error codes listed
129	in regex.h. Obviously the order here has to be same as there.
130	POSIX doesn't require that we do anything for REG_NOERROR,
131	but why not be nice? */
132
133	static const char __re_error_msgid[] =
134	{
135	#define REG_NOERROR_IDX 0
136	gettext_noop ("Success") /* REG_NOERROR */
137	"\0"
138	#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
139	gettext_noop ("No match") /* REG_NOMATCH */
140	"\0"
141	#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
142	gettext_noop ("Invalid regular expression") /* REG_BADPAT */
143	"\0"
144	#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
145	gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
146	"\0"
147	#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
148	gettext_noop ("Invalid character class name") /* REG_ECTYPE */
149	"\0"
150	#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
151	gettext_noop ("Trailing backslash") /* REG_EESCAPE */
152	"\0"
153	#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
154	gettext_noop ("Invalid back reference") /* REG_ESUBREG */
155	"\0"
156	#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
157	gettext_noop ("Unmatched [, [^, [:, [., or [=") /* REG_EBRACK */
158	"\0"
159	#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
160	gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
161	"\0"
162	#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
163	gettext_noop ("Unmatched \\{") /* REG_EBRACE */
164	"\0"
165	#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
166	gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
167	"\0"
168	#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
169	gettext_noop ("Invalid range end") /* REG_ERANGE */
170	"\0"
171	#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
172	gettext_noop ("Memory exhausted") /* REG_ESPACE */
173	"\0"
174	#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
175	gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
176	"\0"
177	#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
178	gettext_noop ("Premature end of regular expression") /* REG_EEND */
179	"\0"
180	#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
181	gettext_noop ("Regular expression too big") /* REG_ESIZE */
182	"\0"
183	#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
184	gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
185	};
186
187	static const size_t __re_error_msgid_idx[] =
188	{
189	REG_NOERROR_IDX,
190	REG_NOMATCH_IDX,
191	REG_BADPAT_IDX,
192	REG_ECOLLATE_IDX,
193	REG_ECTYPE_IDX,
194	REG_EESCAPE_IDX,
195	REG_ESUBREG_IDX,
196	REG_EBRACK_IDX,
197	REG_EPAREN_IDX,
198	REG_EBRACE_IDX,
199	REG_BADBR_IDX,
200	REG_ERANGE_IDX,
201	REG_ESPACE_IDX,
202	REG_BADRPT_IDX,
203	REG_EEND_IDX,
204	REG_ESIZE_IDX,
205	REG_ERPAREN_IDX
206	};
207
208
209	/* Entry points for GNU code. */
210
211	/* re_compile_pattern is the GNU regular expression compiler: it
212	compiles PATTERN (of length LENGTH) and puts the result in BUFP.
213	Returns 0 if the pattern was valid, otherwise an error string.
214
215	Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
216	are set in BUFP on entry. */
217
218	const char *
219	re_compile_pattern (const char *pattern, size_t length,
220	struct re_pattern_buffer *bufp)
221	{
222	reg_errcode_t ret;
223
224	/* And GNU code determines whether or not to get register information
225	by passing null for the REGS argument to re_match, etc., not by
226	setting no_sub, unless RE_NO_SUB is set. */
227	bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
228
229	/* Match anchors at newline. */
230	bufp->newline_anchor = 1;
231
232	ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
233
234	if (!ret)
235	return NULL;
236	return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
237	}
238	weak_alias (__re_compile_pattern, re_compile_pattern)
239
240	/* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
241	also be assigned to arbitrarily: each pattern buffer stores its own
242	syntax, so it can be changed between regex compilations. */
243	/* This has no initializer because initialized variables in Emacs
244	become read-only after dumping. */
245	reg_syntax_t re_syntax_options;
246
247
248	/* Specify the precise syntax of regexps for compilation. This provides
249	for compatibility for various utilities which historically have
250	different, incompatible syntaxes.
251
252	The argument SYNTAX is a bit mask comprised of the various bits
253	defined in regex.h. We return the old syntax. */
254
255	reg_syntax_t
256	re_set_syntax (reg_syntax_t syntax)
257	{
258	reg_syntax_t ret = re_syntax_options;
259
260	re_syntax_options = syntax;
261	return ret;
262	}
263	weak_alias (__re_set_syntax, re_set_syntax)
264
265	int
266	re_compile_fastmap (struct re_pattern_buffer *bufp)
267	{
268	re_dfa_t *dfa = bufp->buffer;
269	char *fastmap = bufp->fastmap;
270
271	memset (fastmap, '\0', sizeof (char) * SBC_MAX);
272	re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
273	if (dfa->init_state != dfa->init_state_word)
274	re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
275	if (dfa->init_state != dfa->init_state_nl)
276	re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
277	if (dfa->init_state != dfa->init_state_begbuf)
278	re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
279	bufp->fastmap_accurate = 1;
280	return 0;
281	}
282	weak_alias (__re_compile_fastmap, re_compile_fastmap)
283
284	static inline void
285	__attribute__ ((always_inline))
286	re_set_fastmap (char *fastmap, bool icase, int ch)
287	{
288	fastmap[ch] = 1;
289	if (icase)
290	fastmap[tolower (ch)] = 1;
291	}
292
293	/* Helper function for re_compile_fastmap.
294	Compile fastmap for the initial_state INIT_STATE. */
295
296	static void
297	re_compile_fastmap_iter (regex_t bufp, const re_dfastate_t init_state,
298	char *fastmap)
299	{
300	re_dfa_t *dfa = bufp->buffer;
301	Idx node_cnt;
302	bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
303	for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
304	{
305	Idx node = init_state->nodes.elems[node_cnt];
306	re_token_type_t type = dfa->nodes[node].type;
307
308	if (type == CHARACTER)
309	{
310	re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
311	#ifdef RE_ENABLE_I18N
312	if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
313	{
314	unsigned char buf[MB_LEN_MAX];
315	unsigned char *p;
316	wchar_t wc;
317	mbstate_t state;
318
319	p = buf;
320	*p++ = dfa->nodes[node].opr.c;
321	while (++node < dfa->nodes_len
322	&& dfa->nodes[node].type == CHARACTER
323	&& dfa->nodes[node].mb_partial)
324	*p++ = dfa->nodes[node].opr.c;
325	memset (&state, '\0', sizeof (state));
326	if (__mbrtowc (&wc, (const char *) buf, p - buf,
327	&state) == p - buf
328	&& (__wcrtomb ((char *) buf, __towlower (wc), &state)
329	!= (size_t) -1))
330	re_set_fastmap (fastmap, false, buf[0]);
331	}
332	#endif
333	}
334	else if (type == SIMPLE_BRACKET)
335	{
336	int i, ch;
337	for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
338	{
339	int j;
340	bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
341	for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
342	if (w & ((bitset_word_t) 1 << j))
343	re_set_fastmap (fastmap, icase, ch);
344	}
345	}
346	#ifdef RE_ENABLE_I18N
347	else if (type == COMPLEX_BRACKET)
348	{
349	re_charset_t *cset = dfa->nodes[node].opr.mbcset;
350	Idx i;
351
352	# ifdef _LIBC
353	/* See if we have to try all bytes which start multiple collation
354	elements.
355	e.g. In da_DK, we want to catch 'a' since "aa" is a valid
356	collation element, and don't catch 'b' since 'b' is
357	the only collation element which starts from 'b' (and
358	it is caught by SIMPLE_BRACKET). */
359	if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
360	&& (cset->ncoll_syms \|\| cset->nranges))
361	{
362	const int32_t table = (const int32_t )
363	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
364	for (i = 0; i < SBC_MAX; ++i)
365	if (table[i] < 0)
366	re_set_fastmap (fastmap, icase, i);
367	}
368	# endif /* _LIBC */
369
370	/* See if we have to start the match at all multibyte characters,
371	i.e. where we would not find an invalid sequence. This only
372	applies to multibyte character sets; for single byte character
373	sets, the SIMPLE_BRACKET again suffices. */
374	if (dfa->mb_cur_max > 1
375	&& (cset->nchar_classes \|\| cset->non_match \|\| cset->nranges
376	# ifdef _LIBC
377	\|\| cset->nequiv_classes
378	# endif /* _LIBC */
379	))
380	{
381	unsigned char c = 0;
382	do
383	{
384	mbstate_t mbs;
385	memset (&mbs, 0, sizeof (mbs));
386	if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
387	re_set_fastmap (fastmap, false, (int) c);
388	}
389	while (++c != 0);
390	}
391
392	else
393	{
394	/* ... Else catch all bytes which can start the mbchars. */
395	for (i = 0; i < cset->nmbchars; ++i)
396	{
397	char buf[256];
398	mbstate_t state;
399	memset (&state, '\0', sizeof (state));
400	if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
401	re_set_fastmap (fastmap, icase, (unsigned char ) buf);
402	if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
403	{
404	if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
405	!= (size_t) -1)
406	re_set_fastmap (fastmap, false, (unsigned char ) buf);
407	}
408	}
409	}
410	}
411	#endif /* RE_ENABLE_I18N */
412	else if (type == OP_PERIOD
413	#ifdef RE_ENABLE_I18N
414	\|\| type == OP_UTF8_PERIOD
415	#endif /* RE_ENABLE_I18N */
416	\|\| type == END_OF_RE)
417	{
418	memset (fastmap, '\1', sizeof (char) * SBC_MAX);
419	if (type == END_OF_RE)
420	bufp->can_be_null = 1;
421	return;
422	}
423	}
424	}
425
426
427	/* Entry point for POSIX code. */
428	/* regcomp takes a regular expression as a string and compiles it.
429
430	PREG is a regex_t *. We do not expect any fields to be initialized,
431	since POSIX says we shouldn't. Thus, we set
432
433	'buffer' to the compiled pattern;
434	'used' to the length of the compiled pattern;
435	'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
436	REG_EXTENDED bit in CFLAGS is set; otherwise, to
437	RE_SYNTAX_POSIX_BASIC;
438	'newline_anchor' to REG_NEWLINE being set in CFLAGS;
439	'fastmap' to an allocated space for the fastmap;
440	'fastmap_accurate' to zero;
441	're_nsub' to the number of subexpressions in PATTERN.
442
443	PATTERN is the address of the pattern string.
444
445	CFLAGS is a series of bits which affect compilation.
446
447	If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
448	use POSIX basic syntax.
449
450	If REG_NEWLINE is set, then . and [^...] don't match newline.
451	Also, regexec will try a match beginning after every newline.
452
453	If REG_ICASE is set, then we considers upper- and lowercase
454	versions of letters to be equivalent when matching.
455
456	If REG_NOSUB is set, then when PREG is passed to regexec, that
457	routine will report only success or failure, and nothing about the
458	registers.
459
460	It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
461	the return codes and their meanings.) */
462
463	int
464	regcomp (regex_t __restrict preg, const char __restrict pattern, int cflags)
465	{
466	reg_errcode_t ret;
467	reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
468	: RE_SYNTAX_POSIX_BASIC);
469
470	preg->buffer = NULL;
471	preg->allocated = 0;
472	preg->used = 0;
473
474	/* Try to allocate space for the fastmap. */
475	preg->fastmap = re_malloc (char, SBC_MAX);
476	if (__glibc_unlikely (preg->fastmap == NULL))
477	return REG_ESPACE;
478
479	syntax \|= (cflags & REG_ICASE) ? RE_ICASE : 0;
480
481	/* If REG_NEWLINE is set, newlines are treated differently. */
482	if (cflags & REG_NEWLINE)
483	{ /* REG_NEWLINE implies neither . nor [^...] match newline. */
484	syntax &= ~RE_DOT_NEWLINE;
485	syntax \|= RE_HAT_LISTS_NOT_NEWLINE;
486	/* It also changes the matching behavior. */
487	preg->newline_anchor = 1;
488	}
489	else
490	preg->newline_anchor = 0;
491	preg->no_sub = !!(cflags & REG_NOSUB);
492	preg->translate = NULL;
493
494	ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
495
496	/* POSIX doesn't distinguish between an unmatched open-group and an
497	unmatched close-group: both are REG_EPAREN. */
498	if (ret == REG_ERPAREN)
499	ret = REG_EPAREN;
500
501	/* We have already checked preg->fastmap != NULL. */
502	if (__glibc_likely (ret == REG_NOERROR))
503	/* Compute the fastmap now, since regexec cannot modify the pattern
504	buffer. This function never fails in this implementation. */
505	(void) re_compile_fastmap (preg);
506	else
507	{
508	/* Some error occurred while compiling the expression. */
509	re_free (preg->fastmap);
510	preg->fastmap = NULL;
511	}
512
513	return (int) ret;
514	}
515	libc_hidden_def (__regcomp)
516	weak_alias (__regcomp, regcomp)
517
518	/* Returns a message corresponding to an error code, ERRCODE, returned
519	from either regcomp or regexec. We don't use PREG here. */
520
521	size_t
522	regerror (int errcode, const regex_t __restrict preg, char __restrict errbuf,
523	size_t errbuf_size)
524	{
525	const char *msg;
526	size_t msg_size;
527	int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0];
528
529	if (__glibc_unlikely (errcode < 0 \|\| errcode >= nerrcodes))
530	/* Only error codes returned by the rest of the code should be passed
531	to this routine. If we are given anything else, or if other regex
532	code generates an invalid error code, then the program has a bug.
533	Dump core so we can fix it. */
534	abort ();
535
536	msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
537
538	msg_size = strlen (msg) + 1; /* Includes the null. */
539
540	if (__glibc_likely (errbuf_size != 0))
541	{
542	size_t cpy_size = msg_size;
543	if (__glibc_unlikely (msg_size > errbuf_size))
544	{
545	cpy_size = errbuf_size - 1;
546	errbuf[cpy_size] = '\0';
547	}
548	memcpy (errbuf, msg, cpy_size);
549	}
550
551	return msg_size;
552	}
553	weak_alias (__regerror, regerror)
554
555
556	#ifdef RE_ENABLE_I18N
557	/* This static array is used for the map to single-byte characters when
558	UTF-8 is used. Otherwise we would allocate memory just to initialize
559	it the same all the time. UTF-8 is the preferred encoding so this is
560	a worthwhile optimization. */
561	static const bitset_t utf8_sb_map =
562	{
563	/* Set the first 128 bits. */
564	# if (defined __GNUC__ \|\| __clang_major__ >= 4) && !defined __STRICT_ANSI__
565	[0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
566	# else
567	# if 4 * BITSET_WORD_BITS < ASCII_CHARS
568	# error "bitset_word_t is narrower than 32 bits"
569	# elif 3 * BITSET_WORD_BITS < ASCII_CHARS
570	BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
571	# elif 2 * BITSET_WORD_BITS < ASCII_CHARS
572	BITSET_WORD_MAX, BITSET_WORD_MAX,
573	# elif 1 * BITSET_WORD_BITS < ASCII_CHARS
574	BITSET_WORD_MAX,
575	# endif
576	(BITSET_WORD_MAX
577	>> (SBC_MAX % BITSET_WORD_BITS == 0
578	? 0
579	: BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
580	# endif
581	};
582	#endif
583
584
585	static void
586	free_dfa_content (re_dfa_t *dfa)
587	{
588	Idx i, j;
589
590	if (dfa->nodes)
591	for (i = 0; i < dfa->nodes_len; ++i)
592	free_token (dfa->nodes + i);
593	re_free (dfa->nexts);
594	for (i = 0; i < dfa->nodes_len; ++i)
595	{
596	if (dfa->eclosures != NULL)
597	re_node_set_free (dfa->eclosures + i);
598	if (dfa->inveclosures != NULL)
599	re_node_set_free (dfa->inveclosures + i);
600	if (dfa->edests != NULL)
601	re_node_set_free (dfa->edests + i);
602	}
603	re_free (dfa->edests);
604	re_free (dfa->eclosures);
605	re_free (dfa->inveclosures);
606	re_free (dfa->nodes);
607
608	if (dfa->state_table)
609	for (i = 0; i <= dfa->state_hash_mask; ++i)
610	{
611	struct re_state_table_entry *entry = dfa->state_table + i;
612	for (j = 0; j < entry->num; ++j)
613	{
614	re_dfastate_t *state = entry->array[j];
615	free_state (state);
616	}
617	re_free (entry->array);
618	}
619	re_free (dfa->state_table);
620	#ifdef RE_ENABLE_I18N
621	if (dfa->sb_char != utf8_sb_map)
622	re_free (dfa->sb_char);
623	#endif
624	re_free (dfa->subexp_map);
625	#ifdef DEBUG
626	re_free (dfa->re_str);
627	#endif
628
629	re_free (dfa);
630	}
631
632
633	/* Free dynamically allocated space used by PREG. */
634
635	void
636	regfree (regex_t *preg)
637	{
638	re_dfa_t *dfa = preg->buffer;
639	if (__glibc_likely (dfa != NULL))
640	{
641	lock_fini (dfa->lock);
642	free_dfa_content (dfa);
643	}
644	preg->buffer = NULL;
645	preg->allocated = 0;
646
647	re_free (preg->fastmap);
648	preg->fastmap = NULL;
649
650	re_free (preg->translate);
651	preg->translate = NULL;
652	}
653	libc_hidden_def (__regfree)
654	weak_alias (__regfree, regfree)
655
656
657	/* Entry points compatible with 4.2 BSD regex library. We don't define
658	them unless specifically requested. */
659
660	#if defined _REGEX_RE_COMP \|\| defined _LIBC
661
662	/* BSD has one and only one pattern buffer. */
663	static struct re_pattern_buffer re_comp_buf;
664
665	char *
666	# ifdef _LIBC
667	/* Make these definitions weak in libc, so POSIX programs can redefine
668	these names if they don't use our functions, and still use
669	regcomp/regexec above without link errors. */
670	weak_function
671	# endif
672	re_comp (const char *s)
673	{
674	reg_errcode_t ret;
675	char *fastmap;
676
677	if (!s)
678	{
679	if (!re_comp_buf.buffer)
680	return gettext ("No previous regular expression");
681	return 0;
682	}
683
684	if (re_comp_buf.buffer)
685	{
686	fastmap = re_comp_buf.fastmap;
687	re_comp_buf.fastmap = NULL;
688	__regfree (&re_comp_buf);
689	memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
690	re_comp_buf.fastmap = fastmap;
691	}
692
693	if (re_comp_buf.fastmap == NULL)
694	{
695	re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
696	if (re_comp_buf.fastmap == NULL)
697	return (char *) gettext (__re_error_msgid
698	+ __re_error_msgid_idx[(int) REG_ESPACE]);
699	}
700
701	/* Since 're_exec' always passes NULL for the 'regs' argument, we
702	don't need to initialize the pattern buffer fields which affect it. */
703
704	/* Match anchors at newlines. */
705	re_comp_buf.newline_anchor = 1;
706
707	ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
708
709	if (!ret)
710	return NULL;
711
712	/* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
713	return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
714	}
715
716	#ifdef _LIBC
717	libc_freeres_fn (free_mem)
718	{
719	__regfree (&re_comp_buf);
720	}
721	#endif
722
723	#endif /* _REGEX_RE_COMP */
724
725
726	/* Internal entry point.
727	Compile the regular expression PATTERN, whose length is LENGTH.
728	SYNTAX indicate regular expression's syntax. */
729
730	static reg_errcode_t
731	re_compile_internal (regex_t preg, const char pattern, size_t length,
732	reg_syntax_t syntax)
733	{
734	reg_errcode_t err = REG_NOERROR;
735	re_dfa_t *dfa;
736	re_string_t regexp;
737
738	/* Initialize the pattern buffer. */
739	preg->fastmap_accurate = 0;
740	preg->syntax = syntax;
741	preg->not_bol = preg->not_eol = 0;
742	preg->used = 0;
743	preg->re_nsub = 0;
744	preg->can_be_null = 0;
745	preg->regs_allocated = REGS_UNALLOCATED;
746
747	/* Initialize the dfa. */
748	dfa = preg->buffer;
749	if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
750	{
751	/* If zero allocated, but buffer is non-null, try to realloc
752	enough space. This loses if buffer's address is bogus, but
753	that is the user's responsibility. If ->buffer is NULL this
754	is a simple allocation. */
755	dfa = re_realloc (preg->buffer, re_dfa_t, 1);
756	if (dfa == NULL)
757	return REG_ESPACE;
758	preg->allocated = sizeof (re_dfa_t);
759	preg->buffer = dfa;
760	}
761	preg->used = sizeof (re_dfa_t);
762
763	err = init_dfa (dfa, length);
764	if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0))
765	err = REG_ESPACE;
766	if (__glibc_unlikely (err != REG_NOERROR))
767	{
768	free_dfa_content (dfa);
769	preg->buffer = NULL;
770	preg->allocated = 0;
771	return err;
772	}
773	#ifdef DEBUG
774	/* Note: length+1 will not overflow since it is checked in init_dfa. */
775	dfa->re_str = re_malloc (char, length + 1);
776	strncpy (dfa->re_str, pattern, length + 1);
777	#endif
778
779	err = re_string_construct (&regexp, pattern, length, preg->translate,
780	(syntax & RE_ICASE) != 0, dfa);
781	if (__glibc_unlikely (err != REG_NOERROR))
782	{
783	re_compile_internal_free_return:
784	free_workarea_compile (preg);
785	re_string_destruct (&regexp);
786	lock_fini (dfa->lock);
787	free_dfa_content (dfa);
788	preg->buffer = NULL;
789	preg->allocated = 0;
790	return err;
791	}
792
793	/* Parse the regular expression, and build a structure tree. */
794	preg->re_nsub = 0;
795	dfa->str_tree = parse (&regexp, preg, syntax, &err);
796	if (__glibc_unlikely (dfa->str_tree == NULL))
797	goto re_compile_internal_free_return;
798
799	/* Analyze the tree and create the nfa. */
800	err = analyze (preg);
801	if (__glibc_unlikely (err != REG_NOERROR))
802	goto re_compile_internal_free_return;
803
804	#ifdef RE_ENABLE_I18N
805	/* If possible, do searching in single byte encoding to speed things up. */
806	if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
807	optimize_utf8 (dfa);
808	#endif
809
810	/* Then create the initial state of the dfa. */
811	err = create_initial_state (dfa);
812
813	/* Release work areas. */
814	free_workarea_compile (preg);
815	re_string_destruct (&regexp);
816
817	if (__glibc_unlikely (err != REG_NOERROR))
818	{
819	lock_fini (dfa->lock);
820	free_dfa_content (dfa);
821	preg->buffer = NULL;
822	preg->allocated = 0;
823	}
824
825	return err;
826	}
827
828	/* Initialize DFA. We use the length of the regular expression PAT_LEN
829	as the initial length of some arrays. */
830
831	static reg_errcode_t
832	init_dfa (re_dfa_t *dfa, size_t pat_len)
833	{
834	__re_size_t table_size;
835	#ifndef _LIBC
836	const char *codeset_name;
837	#endif
838	#ifdef RE_ENABLE_I18N
839	size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
840	#else
841	size_t max_i18n_object_size = 0;
842	#endif
843	size_t max_object_size =
844	MAX (sizeof (struct re_state_table_entry),
845	MAX (sizeof (re_token_t),
846	MAX (sizeof (re_node_set),
847	MAX (sizeof (regmatch_t),
848	max_i18n_object_size))));
849
850	memset (dfa, '\0', sizeof (re_dfa_t));
851
852	/* Force allocation of str_tree_storage the first time. */
853	dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
854
855	/* Avoid overflows. The extra "/ 2" is for the table_size doubling
856	calculation below, and for similar doubling calculations
857	elsewhere. And it's <= rather than <, because some of the
858	doubling calculations add 1 afterwards. */
859	if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2
860	<= pat_len))
861	return REG_ESPACE;
862
863	dfa->nodes_alloc = pat_len + 1;
864	dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
865
866	/* table_size = 2 ^ ceil(log pat_len) */
867	for (table_size = 1; ; table_size <<= 1)
868	if (table_size > pat_len)
869	break;
870
871	dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
872	dfa->state_hash_mask = table_size - 1;
873
874	dfa->mb_cur_max = MB_CUR_MAX;
875	#ifdef _LIBC
876	if (dfa->mb_cur_max == 6
877	&& strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
878	dfa->is_utf8 = 1;
879	dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
880	!= 0);
881	#else
882	# ifdef _MSC_VER
883	(void)codeset_name;
884	if (get_crt_codepage() == CP_UTF8)
885	# else
886	codeset_name = nl_langinfo (CODESET);
887	if ((codeset_name[0] == 'U' \|\| codeset_name[0] == 'u')
888	&& (codeset_name[1] == 'T' \|\| codeset_name[1] == 't')
889	&& (codeset_name[2] == 'F' \|\| codeset_name[2] == 'f')
890	&& strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
891	# endif
892	dfa->is_utf8 = 1;
893
894	/* We check exhaustively in the loop below if this charset is a
895	superset of ASCII. */
896	dfa->map_notascii = 0;
897	#endif
898
899	#ifdef RE_ENABLE_I18N
900	if (dfa->mb_cur_max > 1)
901	{
902	if (dfa->is_utf8)
903	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
904	else
905	{
906	int i, j, ch;
907
908	dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
909	if (__glibc_unlikely (dfa->sb_char == NULL))
910	return REG_ESPACE;
911
912	/* Set the bits corresponding to single byte chars. */
913	for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
914	for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
915	{
916	wint_t wch = __btowc (ch);
917	if (wch != WEOF)
918	dfa->sb_char[i] \|= (bitset_word_t) 1 << j;
919	# ifndef _LIBC
920	if (isascii (ch) && wch != ch)
921	dfa->map_notascii = 1;
922	# endif
923	}
924	}
925	}
926	#endif
927
928	if (__glibc_unlikely (dfa->nodes == NULL \|\| dfa->state_table == NULL))
929	return REG_ESPACE;
930	return REG_NOERROR;
931	}
932
933	/* Initialize WORD_CHAR table, which indicate which character is
934	"word". In this case "word" means that it is the word construction
935	character used by some operators like "\<", "\>", etc. */
936
937	static void
938	init_word_char (re_dfa_t *dfa)
939	{
940	int i = 0;
941	int j;
942	int ch = 0;
943	dfa->word_ops_used = 1;
944	if (__glibc_likely (dfa->map_notascii == 0))
945	{
946	/* Avoid uint32_t and uint64_t as some non-GCC platforms lack
947	them, an issue when this code is used in Gnulib. */
948	bitset_word_t bits0 = 0x00000000;
949	bitset_word_t bits1 = 0x03ff0000;
950	bitset_word_t bits2 = 0x87fffffe;
951	bitset_word_t bits3 = 0x07fffffe;
952	if (BITSET_WORD_BITS == 64)
953	{
954	/* Pacify gcc -Woverflow on 32-bit platformns. */
955	dfa->word_char[0] = bits1 << 31 << 1 \| bits0;
956	dfa->word_char[1] = bits3 << 31 << 1 \| bits2;
957	i = 2;
958	}
959	else if (BITSET_WORD_BITS == 32)
960	{
961	dfa->word_char[0] = bits0;
962	dfa->word_char[1] = bits1;
963	dfa->word_char[2] = bits2;
964	dfa->word_char[3] = bits3;
965	i = 4;
966	}
967	else
968	goto general_case;
969	ch = 128;
970
971	if (__glibc_likely (dfa->is_utf8))
972	{
973	memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
974	return;
975	}
976	}
977
978	general_case:
979	for (; i < BITSET_WORDS; ++i)
980	for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
981	if (isalnum (ch) \|\| ch == '_')
982	dfa->word_char[i] \|= (bitset_word_t) 1 << j;
983	}
984
985	/* Free the work area which are only used while compiling. */
986
987	static void
988	free_workarea_compile (regex_t *preg)
989	{
990	re_dfa_t *dfa = preg->buffer;
991	bin_tree_storage_t storage, next;
992	for (storage = dfa->str_tree_storage; storage; storage = next)
993	{
994	next = storage->next;
995	re_free (storage);
996	}
997	dfa->str_tree_storage = NULL;
998	dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
999	dfa->str_tree = NULL;
1000	re_free (dfa->org_indices);
1001	dfa->org_indices = NULL;
1002	}
1003
1004	/* Create initial states for all contexts. */
1005
1006	static reg_errcode_t
1007	create_initial_state (re_dfa_t *dfa)
1008	{
1009	Idx first, i;
1010	reg_errcode_t err;
1011	re_node_set init_nodes;
1012
1013	/* Initial states have the epsilon closure of the node which is
1014	the first node of the regular expression. */
1015	first = dfa->str_tree->first->node_idx;
1016	dfa->init_node = first;
1017	err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1018	if (__glibc_unlikely (err != REG_NOERROR))
1019	return err;
1020
1021	/* The back-references which are in initial states can epsilon transit,
1022	since in this case all of the subexpressions can be null.
1023	Then we add epsilon closures of the nodes which are the next nodes of
1024	the back-references. */
1025	if (dfa->nbackref > 0)
1026	for (i = 0; i < init_nodes.nelem; ++i)
1027	{
1028	Idx node_idx = init_nodes.elems[i];
1029	re_token_type_t type = dfa->nodes[node_idx].type;
1030
1031	Idx clexp_idx;
1032	if (type != OP_BACK_REF)
1033	continue;
1034	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1035	{
1036	re_token_t *clexp_node;
1037	clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1038	if (clexp_node->type == OP_CLOSE_SUBEXP
1039	&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1040	break;
1041	}
1042	if (clexp_idx == init_nodes.nelem)
1043	continue;
1044
1045	if (type == OP_BACK_REF)
1046	{
1047	Idx dest_idx = dfa->edests[node_idx].elems[0];
1048	if (!re_node_set_contains (&init_nodes, dest_idx))
1049	{
1050	reg_errcode_t merge_err
1051	= re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1052	if (merge_err != REG_NOERROR)
1053	return merge_err;
1054	i = 0;
1055	}
1056	}
1057	}
1058
1059	/* It must be the first time to invoke acquire_state. */
1060	dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1061	/* We don't check ERR here, since the initial state must not be NULL. */
1062	if (__glibc_unlikely (dfa->init_state == NULL))
1063	return err;
1064	if (dfa->init_state->has_constraint)
1065	{
1066	dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1067	CONTEXT_WORD);
1068	dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1069	CONTEXT_NEWLINE);
1070	dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1071	&init_nodes,
1072	CONTEXT_NEWLINE
1073	\| CONTEXT_BEGBUF);
1074	if (__glibc_unlikely (dfa->init_state_word == NULL
1075	\|\| dfa->init_state_nl == NULL
1076	\|\| dfa->init_state_begbuf == NULL))
1077	return err;
1078	}
1079	else
1080	dfa->init_state_word = dfa->init_state_nl
1081	= dfa->init_state_begbuf = dfa->init_state;
1082
1083	re_node_set_free (&init_nodes);
1084	return REG_NOERROR;
1085	}
1086
1087
1088	#ifdef RE_ENABLE_I18N
1089	/* If it is possible to do searching in single byte encoding instead of UTF-8
1090	to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1091	DFA nodes where needed. */
1092
1093	static void
1094	optimize_utf8 (re_dfa_t *dfa)
1095	{
1096	Idx node;
1097	int i;
1098	bool mb_chars = false;
1099	bool has_period = false;
1100
1101	for (node = 0; node < dfa->nodes_len; ++node)
1102	switch (dfa->nodes[node].type)
1103	{
1104	case CHARACTER:
1105	if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1106	mb_chars = true;
1107	break;
1108	case ANCHOR:
1109	switch (dfa->nodes[node].opr.ctx_type)
1110	{
1111	case LINE_FIRST:
1112	case LINE_LAST:
1113	case BUF_FIRST:
1114	case BUF_LAST:
1115	break;
1116	default:
1117	/* Word anchors etc. cannot be handled. It's okay to test
1118	opr.ctx_type since constraints (for all DFA nodes) are
1119	created by ORing one or more opr.ctx_type values. */
1120	return;
1121	}
1122	break;
1123	case OP_PERIOD:
1124	has_period = true;
1125	break;
1126	case OP_BACK_REF:
1127	case OP_ALT:
1128	case END_OF_RE:
1129	case OP_DUP_ASTERISK:
1130	case OP_OPEN_SUBEXP:
1131	case OP_CLOSE_SUBEXP:
1132	break;
1133	case COMPLEX_BRACKET:
1134	return;
1135	case SIMPLE_BRACKET:
1136	/* Just double check. */
1137	{
1138	int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1139	? 0
1140	: BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1141	for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1142	{
1143	if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1144	return;
1145	rshift = 0;
1146	}
1147	}
1148	break;
1149	default:
1150	abort ();
1151	}
1152
1153	if (mb_chars \|\| has_period)
1154	for (node = 0; node < dfa->nodes_len; ++node)
1155	{
1156	if (dfa->nodes[node].type == CHARACTER
1157	&& dfa->nodes[node].opr.c >= ASCII_CHARS)
1158	dfa->nodes[node].mb_partial = 0;
1159	else if (dfa->nodes[node].type == OP_PERIOD)
1160	dfa->nodes[node].type = OP_UTF8_PERIOD;
1161	}
1162
1163	/* The search can be in single byte locale. */
1164	dfa->mb_cur_max = 1;
1165	dfa->is_utf8 = 0;
1166	dfa->has_mb_node = dfa->nbackref > 0 \|\| has_period;
1167	}
1168	#endif
1169
1170
1171	/* Analyze the structure tree, and calculate "first", "next", "edest",
1172	"eclosure", and "inveclosure". */
1173
1174	static reg_errcode_t
1175	analyze (regex_t *preg)
1176	{
1177	re_dfa_t *dfa = preg->buffer;
1178	reg_errcode_t ret;
1179
1180	/* Allocate arrays. */
1181	dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1182	dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1183	dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1184	dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1185	if (__glibc_unlikely (dfa->nexts == NULL \|\| dfa->org_indices == NULL
1186	\|\| dfa->edests == NULL \|\| dfa->eclosures == NULL))
1187	return REG_ESPACE;
1188
1189	dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1190	if (dfa->subexp_map != NULL)
1191	{
1192	Idx i;
1193	for (i = 0; i < preg->re_nsub; i++)
1194	dfa->subexp_map[i] = i;
1195	preorder (dfa->str_tree, optimize_subexps, dfa);
1196	for (i = 0; i < preg->re_nsub; i++)
1197	if (dfa->subexp_map[i] != i)
1198	break;
1199	if (i == preg->re_nsub)
1200	{
1201	re_free (dfa->subexp_map);
1202	dfa->subexp_map = NULL;
1203	}
1204	}
1205
1206	ret = postorder (dfa->str_tree, lower_subexps, preg);
1207	if (__glibc_unlikely (ret != REG_NOERROR))
1208	return ret;
1209	ret = postorder (dfa->str_tree, calc_first, dfa);
1210	if (__glibc_unlikely (ret != REG_NOERROR))
1211	return ret;
1212	preorder (dfa->str_tree, calc_next, dfa);
1213	ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1214	if (__glibc_unlikely (ret != REG_NOERROR))
1215	return ret;
1216	ret = calc_eclosure (dfa);
1217	if (__glibc_unlikely (ret != REG_NOERROR))
1218	return ret;
1219
1220	/* We only need this during the prune_impossible_nodes pass in regexec.c;
1221	skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1222	if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1223	\|\| dfa->nbackref)
1224	{
1225	dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1226	if (__glibc_unlikely (dfa->inveclosures == NULL))
1227	return REG_ESPACE;
1228	ret = calc_inveclosure (dfa);
1229	}
1230
1231	return ret;
1232	}
1233
1234	/* Our parse trees are very unbalanced, so we cannot use a stack to
1235	implement parse tree visits. Instead, we use parent pointers and
1236	some hairy code in these two functions. */
1237	static reg_errcode_t
1238	postorder (bin_tree_t root, reg_errcode_t (fn (void , bin_tree_t *)),
1239	void *extra)
1240	{
1241	bin_tree_t node, prev;
1242
1243	for (node = root; ; )
1244	{
1245	/* Descend down the tree, preferably to the left (or to the right
1246	if that's the only child). */
1247	while (node->left \|\| node->right)
1248	if (node->left)
1249	node = node->left;
1250	else
1251	node = node->right;
1252
1253	do
1254	{
1255	reg_errcode_t err = fn (extra, node);
1256	if (__glibc_unlikely (err != REG_NOERROR))
1257	return err;
1258	if (node->parent == NULL)
1259	return REG_NOERROR;
1260	prev = node;
1261	node = node->parent;
1262	}
1263	/* Go up while we have a node that is reached from the right. */
1264	while (node->right == prev \|\| node->right == NULL);
1265	node = node->right;
1266	}
1267	}
1268
1269	static reg_errcode_t
1270	preorder (bin_tree_t root, reg_errcode_t (fn (void , bin_tree_t *)),
1271	void *extra)
1272	{
1273	bin_tree_t *node;
1274
1275	for (node = root; ; )
1276	{
1277	reg_errcode_t err = fn (extra, node);
1278	if (__glibc_unlikely (err != REG_NOERROR))
1279	return err;
1280
1281	/* Go to the left node, or up and to the right. */
1282	if (node->left)
1283	node = node->left;
1284	else
1285	{
1286	bin_tree_t *prev = NULL;
1287	while (node->right == prev \|\| node->right == NULL)
1288	{
1289	prev = node;
1290	node = node->parent;
1291	if (!node)
1292	return REG_NOERROR;
1293	}
1294	node = node->right;
1295	}
1296	}
1297	}
1298
1299	/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1300	re_search_internal to map the inner one's opr.idx to this one's. Adjust
1301	backreferences as well. Requires a preorder visit. */
1302	static reg_errcode_t
1303	optimize_subexps (void extra, bin_tree_t node)
1304	{
1305	re_dfa_t dfa = (re_dfa_t ) extra;
1306
1307	if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1308	{
1309	int idx = node->token.opr.idx;
1310	node->token.opr.idx = dfa->subexp_map[idx];
1311	dfa->used_bkref_map \|= 1 << node->token.opr.idx;
1312	}
1313
1314	else if (node->token.type == SUBEXP
1315	&& node->left && node->left->token.type == SUBEXP)
1316	{
1317	Idx other_idx = node->left->token.opr.idx;
1318
1319	node->left = node->left->left;
1320	if (node->left)
1321	node->left->parent = node;
1322
1323	dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1324	if (other_idx < BITSET_WORD_BITS)
1325	dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1326	}
1327
1328	return REG_NOERROR;
1329	}
1330
1331	/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1332	of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1333	static reg_errcode_t
1334	lower_subexps (void extra, bin_tree_t node)
1335	{
1336	regex_t preg = (regex_t ) extra;
1337	reg_errcode_t err = REG_NOERROR;
1338
1339	if (node->left && node->left->token.type == SUBEXP)
1340	{
1341	node->left = lower_subexp (&err, preg, node->left);
1342	if (node->left)
1343	node->left->parent = node;
1344	}
1345	if (node->right && node->right->token.type == SUBEXP)
1346	{
1347	node->right = lower_subexp (&err, preg, node->right);
1348	if (node->right)
1349	node->right->parent = node;
1350	}
1351
1352	return err;
1353	}
1354
1355	static bin_tree_t *
1356	lower_subexp (reg_errcode_t err, regex_t preg, bin_tree_t *node)
1357	{
1358	re_dfa_t *dfa = preg->buffer;
1359	bin_tree_t *body = node->left;
1360	bin_tree_t op, cls, tree1, tree;
1361
1362	if (preg->no_sub
1363	/* We do not optimize empty subexpressions, because otherwise we may
1364	have bad CONCAT nodes with NULL children. This is obviously not
1365	very common, so we do not lose much. An example that triggers
1366	this case is the sed "script" //x. */
1367	&& node->left != NULL
1368	&& (node->token.opr.idx >= BITSET_WORD_BITS
1369	\|\| !(dfa->used_bkref_map
1370	& ((bitset_word_t) 1 << node->token.opr.idx))))
1371	return node->left;
1372
1373	/* Convert the SUBEXP node to the concatenation of an
1374	OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1375	op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1376	cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1377	tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1378	tree = create_tree (dfa, op, tree1, CONCAT);
1379	if (__glibc_unlikely (tree == NULL \|\| tree1 == NULL
1380	\|\| op == NULL \|\| cls == NULL))
1381	{
1382	*err = REG_ESPACE;
1383	return NULL;
1384	}
1385
1386	op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1387	op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1388	return tree;
1389	}
1390
1391	/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1392	nodes. Requires a postorder visit. */
1393	static reg_errcode_t
1394	calc_first (void extra, bin_tree_t node)
1395	{
1396	re_dfa_t dfa = (re_dfa_t ) extra;
1397	if (node->token.type == CONCAT)
1398	{
1399	node->first = node->left->first;
1400	node->node_idx = node->left->node_idx;
1401	}
1402	else
1403	{
1404	node->first = node;
1405	node->node_idx = re_dfa_add_node (dfa, node->token);
1406	if (__glibc_unlikely (node->node_idx == -1))
1407	return REG_ESPACE;
1408	if (node->token.type == ANCHOR)
1409	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1410	}
1411	return REG_NOERROR;
1412	}
1413
1414	/* Pass 2: compute NEXT on the tree. Preorder visit. */
1415	static reg_errcode_t
1416	calc_next (void extra, bin_tree_t node)
1417	{
1418	switch (node->token.type)
1419	{
1420	case OP_DUP_ASTERISK:
1421	node->left->next = node;
1422	break;
1423	case CONCAT:
1424	node->left->next = node->right->first;
1425	node->right->next = node->next;
1426	break;
1427	default:
1428	if (node->left)
1429	node->left->next = node->next;
1430	if (node->right)
1431	node->right->next = node->next;
1432	break;
1433	}
1434	return REG_NOERROR;
1435	}
1436
1437	/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1438	static reg_errcode_t
1439	link_nfa_nodes (void extra, bin_tree_t node)
1440	{
1441	re_dfa_t dfa = (re_dfa_t ) extra;
1442	Idx idx = node->node_idx;
1443	reg_errcode_t err = REG_NOERROR;
1444
1445	switch (node->token.type)
1446	{
1447	case CONCAT:
1448	break;
1449
1450	case END_OF_RE:
1451	DEBUG_ASSERT (node->next == NULL);
1452	break;
1453
1454	case OP_DUP_ASTERISK:
1455	case OP_ALT:
1456	{
1457	Idx left, right;
1458	dfa->has_plural_match = 1;
1459	if (node->left != NULL)
1460	left = node->left->first->node_idx;
1461	else
1462	left = node->next->node_idx;
1463	if (node->right != NULL)
1464	right = node->right->first->node_idx;
1465	else
1466	right = node->next->node_idx;
1467	DEBUG_ASSERT (left > -1);
1468	DEBUG_ASSERT (right > -1);
1469	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1470	}
1471	break;
1472
1473	case ANCHOR:
1474	case OP_OPEN_SUBEXP:
1475	case OP_CLOSE_SUBEXP:
1476	err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1477	break;
1478
1479	case OP_BACK_REF:
1480	dfa->nexts[idx] = node->next->node_idx;
1481	if (node->token.type == OP_BACK_REF)
1482	err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1483	break;
1484
1485	default:
1486	DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
1487	dfa->nexts[idx] = node->next->node_idx;
1488	break;
1489	}
1490
1491	return err;
1492	}
1493
1494	/* Duplicate the epsilon closure of the node ROOT_NODE.
1495	Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1496	to their own constraint. */
1497
1498	static reg_errcode_t
1499	duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1500	Idx root_node, unsigned int init_constraint)
1501	{
1502	Idx org_node, clone_node;
1503	bool ok;
1504	unsigned int constraint = init_constraint;
1505	for (org_node = top_org_node, clone_node = top_clone_node;;)
1506	{
1507	Idx org_dest, clone_dest;
1508	if (dfa->nodes[org_node].type == OP_BACK_REF)
1509	{
1510	/* If the back reference epsilon-transit, its destination must
1511	also have the constraint. Then duplicate the epsilon closure
1512	of the destination of the back reference, and store it in
1513	edests of the back reference. */
1514	org_dest = dfa->nexts[org_node];
1515	re_node_set_empty (dfa->edests + clone_node);
1516	clone_dest = duplicate_node (dfa, org_dest, constraint);
1517	if (__glibc_unlikely (clone_dest == -1))
1518	return REG_ESPACE;
1519	dfa->nexts[clone_node] = dfa->nexts[org_node];
1520	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1521	if (__glibc_unlikely (! ok))
1522	return REG_ESPACE;
1523	}
1524	else if (dfa->edests[org_node].nelem == 0)
1525	{
1526	/* In case of the node can't epsilon-transit, don't duplicate the
1527	destination and store the original destination as the
1528	destination of the node. */
1529	dfa->nexts[clone_node] = dfa->nexts[org_node];
1530	break;
1531	}
1532	else if (dfa->edests[org_node].nelem == 1)
1533	{
1534	/* In case of the node can epsilon-transit, and it has only one
1535	destination. */
1536	org_dest = dfa->edests[org_node].elems[0];
1537	re_node_set_empty (dfa->edests + clone_node);
1538	/* If the node is root_node itself, it means the epsilon closure
1539	has a loop. Then tie it to the destination of the root_node. */
1540	if (org_node == root_node && clone_node != org_node)
1541	{
1542	ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1543	if (__glibc_unlikely (! ok))
1544	return REG_ESPACE;
1545	break;
1546	}
1547	/* In case the node has another constraint, append it. */
1548	constraint \|= dfa->nodes[org_node].constraint;
1549	clone_dest = duplicate_node (dfa, org_dest, constraint);
1550	if (__glibc_unlikely (clone_dest == -1))
1551	return REG_ESPACE;
1552	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1553	if (__glibc_unlikely (! ok))
1554	return REG_ESPACE;
1555	}
1556	else /* dfa->edests[org_node].nelem == 2 */
1557	{
1558	/* In case of the node can epsilon-transit, and it has two
1559	destinations. In the bin_tree_t and DFA, that's '\|' and ''. /
1560	org_dest = dfa->edests[org_node].elems[0];
1561	re_node_set_empty (dfa->edests + clone_node);
1562	/* Search for a duplicated node which satisfies the constraint. */
1563	clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1564	if (clone_dest == -1)
1565	{
1566	/* There is no such duplicated node, create a new one. */
1567	reg_errcode_t err;
1568	clone_dest = duplicate_node (dfa, org_dest, constraint);
1569	if (__glibc_unlikely (clone_dest == -1))
1570	return REG_ESPACE;
1571	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1572	if (__glibc_unlikely (! ok))
1573	return REG_ESPACE;
1574	err = duplicate_node_closure (dfa, org_dest, clone_dest,
1575	root_node, constraint);
1576	if (__glibc_unlikely (err != REG_NOERROR))
1577	return err;
1578	}
1579	else
1580	{
1581	/* There is a duplicated node which satisfies the constraint,
1582	use it to avoid infinite loop. */
1583	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1584	if (__glibc_unlikely (! ok))
1585	return REG_ESPACE;
1586	}
1587
1588	org_dest = dfa->edests[org_node].elems[1];
1589	clone_dest = duplicate_node (dfa, org_dest, constraint);
1590	if (__glibc_unlikely (clone_dest == -1))
1591	return REG_ESPACE;
1592	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1593	if (__glibc_unlikely (! ok))
1594	return REG_ESPACE;
1595	}
1596	org_node = org_dest;
1597	clone_node = clone_dest;
1598	}
1599	return REG_NOERROR;
1600	}
1601
1602	/* Search for a node which is duplicated from the node ORG_NODE, and
1603	satisfies the constraint CONSTRAINT. */
1604
1605	static Idx
1606	search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1607	unsigned int constraint)
1608	{
1609	Idx idx;
1610	for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1611	{
1612	if (org_node == dfa->org_indices[idx]
1613	&& constraint == dfa->nodes[idx].constraint)
1614	return idx; /* Found. */
1615	}
1616	return -1; /* Not found. */
1617	}
1618
1619	/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1620	Return the index of the new node, or -1 if insufficient storage is
1621	available. */
1622
1623	static Idx
1624	duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1625	{
1626	Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1627	if (__glibc_likely (dup_idx != -1))
1628	{
1629	dfa->nodes[dup_idx].constraint = constraint;
1630	dfa->nodes[dup_idx].constraint \|= dfa->nodes[org_idx].constraint;
1631	dfa->nodes[dup_idx].duplicated = 1;
1632
1633	/* Store the index of the original node. */
1634	dfa->org_indices[dup_idx] = org_idx;
1635	}
1636	return dup_idx;
1637	}
1638
1639	static reg_errcode_t
1640	calc_inveclosure (re_dfa_t *dfa)
1641	{
1642	Idx src, idx;
1643	bool ok;
1644	for (idx = 0; idx < dfa->nodes_len; ++idx)
1645	re_node_set_init_empty (dfa->inveclosures + idx);
1646
1647	for (src = 0; src < dfa->nodes_len; ++src)
1648	{
1649	Idx *elems = dfa->eclosures[src].elems;
1650	for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1651	{
1652	ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1653	if (__glibc_unlikely (! ok))
1654	return REG_ESPACE;
1655	}
1656	}
1657
1658	return REG_NOERROR;
1659	}
1660
1661	/* Calculate "eclosure" for all the node in DFA. */
1662
1663	static reg_errcode_t
1664	calc_eclosure (re_dfa_t *dfa)
1665	{
1666	Idx node_idx;
1667	bool incomplete;
1668	DEBUG_ASSERT (dfa->nodes_len > 0);
1669	incomplete = false;
1670	/* For each nodes, calculate epsilon closure. */
1671	for (node_idx = 0; ; ++node_idx)
1672	{
1673	reg_errcode_t err;
1674	re_node_set eclosure_elem;
1675	if (node_idx == dfa->nodes_len)
1676	{
1677	if (!incomplete)
1678	break;
1679	incomplete = false;
1680	node_idx = 0;
1681	}
1682
1683	DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -1);
1684
1685	/* If we have already calculated, skip it. */
1686	if (dfa->eclosures[node_idx].nelem != 0)
1687	continue;
1688	/* Calculate epsilon closure of 'node_idx'. */
1689	err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1690	if (__glibc_unlikely (err != REG_NOERROR))
1691	return err;
1692
1693	if (dfa->eclosures[node_idx].nelem == 0)
1694	{
1695	incomplete = true;
1696	re_node_set_free (&eclosure_elem);
1697	}
1698	}
1699	return REG_NOERROR;
1700	}
1701
1702	/* Calculate epsilon closure of NODE. */
1703
1704	static reg_errcode_t
1705	calc_eclosure_iter (re_node_set new_set, re_dfa_t dfa, Idx node, bool root)
1706	{
1707	reg_errcode_t err;
1708	Idx i;
1709	re_node_set eclosure;
1710	bool incomplete = false;
1711	err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1712	if (__glibc_unlikely (err != REG_NOERROR))
1713	return err;
1714
1715	/* An epsilon closure includes itself. */
1716	eclosure.elems[eclosure.nelem++] = node;
1717
1718	/* This indicates that we are calculating this node now.
1719	We reference this value to avoid infinite loop. */
1720	dfa->eclosures[node].nelem = -1;
1721
1722	/* If the current node has constraints, duplicate all nodes
1723	since they must inherit the constraints. */
1724	if (dfa->nodes[node].constraint
1725	&& dfa->edests[node].nelem
1726	&& !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1727	{
1728	err = duplicate_node_closure (dfa, node, node, node,
1729	dfa->nodes[node].constraint);
1730	if (__glibc_unlikely (err != REG_NOERROR))
1731	return err;
1732	}
1733
1734	/* Expand each epsilon destination nodes. */
1735	if (IS_EPSILON_NODE(dfa->nodes[node].type))
1736	for (i = 0; i < dfa->edests[node].nelem; ++i)
1737	{
1738	re_node_set eclosure_elem;
1739	Idx edest = dfa->edests[node].elems[i];
1740	/* If calculating the epsilon closure of 'edest' is in progress,
1741	return intermediate result. */
1742	if (dfa->eclosures[edest].nelem == -1)
1743	{
1744	incomplete = true;
1745	continue;
1746	}
1747	/* If we haven't calculated the epsilon closure of 'edest' yet,
1748	calculate now. Otherwise use calculated epsilon closure. */
1749	if (dfa->eclosures[edest].nelem == 0)
1750	{
1751	err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1752	if (__glibc_unlikely (err != REG_NOERROR))
1753	return err;
1754	}
1755	else
1756	eclosure_elem = dfa->eclosures[edest];
1757	/* Merge the epsilon closure of 'edest'. */
1758	err = re_node_set_merge (&eclosure, &eclosure_elem);
1759	if (__glibc_unlikely (err != REG_NOERROR))
1760	return err;
1761	/* If the epsilon closure of 'edest' is incomplete,
1762	the epsilon closure of this node is also incomplete. */
1763	if (dfa->eclosures[edest].nelem == 0)
1764	{
1765	incomplete = true;
1766	re_node_set_free (&eclosure_elem);
1767	}
1768	}
1769
1770	if (incomplete && !root)
1771	dfa->eclosures[node].nelem = 0;
1772	else
1773	dfa->eclosures[node] = eclosure;
1774	*new_set = eclosure;
1775	return REG_NOERROR;
1776	}
1777
1778
1779	/* Functions for token which are used in the parser. */
1780
1781	/* Fetch a token from INPUT.
1782	We must not use this function inside bracket expressions. */
1783
1784	static void
1785	fetch_token (re_token_t result, re_string_t input, reg_syntax_t syntax)
1786	{
1787	re_string_skip_bytes (input, peek_token (result, input, syntax));
1788	}
1789
1790	/* Peek a token from INPUT, and return the length of the token.
1791	We must not use this function inside bracket expressions. */
1792
1793	static int
1794	peek_token (re_token_t token, re_string_t input, reg_syntax_t syntax)
1795	{
1796	unsigned char c;
1797
1798	if (re_string_eoi (input))
1799	{
1800	token->type = END_OF_RE;
1801	return 0;
1802	}
1803
1804	c = re_string_peek_byte (input, 0);
1805	token->opr.c = c;
1806
1807	token->word_char = 0;
1808	#ifdef RE_ENABLE_I18N
1809	token->mb_partial = 0;
1810	if (input->mb_cur_max > 1
1811	&& !re_string_first_byte (input, re_string_cur_idx (input)))
1812	{
1813	token->type = CHARACTER;
1814	token->mb_partial = 1;
1815	return 1;
1816	}
1817	#endif
1818	if (c == '\\')
1819	{
1820	unsigned char c2;
1821	if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1822	{
1823	token->type = BACK_SLASH;
1824	return 1;
1825	}
1826
1827	c2 = re_string_peek_byte_case (input, 1);
1828	token->opr.c = c2;
1829	token->type = CHARACTER;
1830	#ifdef RE_ENABLE_I18N
1831	if (input->mb_cur_max > 1)
1832	{
1833	wint_t wc = re_string_wchar_at (input,
1834	re_string_cur_idx (input) + 1);
1835	token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1836	}
1837	else
1838	#endif
1839	token->word_char = IS_WORD_CHAR (c2) != 0;
1840
1841	switch (c2)
1842	{
1843	case '\|':
1844	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1845	token->type = OP_ALT;
1846	break;
1847	case '1': case '2': case '3': case '4': case '5':
1848	case '6': case '7': case '8': case '9':
1849	if (!(syntax & RE_NO_BK_REFS))
1850	{
1851	token->type = OP_BACK_REF;
1852	token->opr.idx = c2 - '1';
1853	}
1854	break;
1855	case '<':
1856	if (!(syntax & RE_NO_GNU_OPS))
1857	{
1858	token->type = ANCHOR;
1859	token->opr.ctx_type = WORD_FIRST;
1860	}
1861	break;
1862	case '>':
1863	if (!(syntax & RE_NO_GNU_OPS))
1864	{
1865	token->type = ANCHOR;
1866	token->opr.ctx_type = WORD_LAST;
1867	}
1868	break;
1869	case 'b':
1870	if (!(syntax & RE_NO_GNU_OPS))
1871	{
1872	token->type = ANCHOR;
1873	token->opr.ctx_type = WORD_DELIM;
1874	}
1875	break;
1876	case 'B':
1877	if (!(syntax & RE_NO_GNU_OPS))
1878	{
1879	token->type = ANCHOR;
1880	token->opr.ctx_type = NOT_WORD_DELIM;
1881	}
1882	break;
1883	case 'w':
1884	if (!(syntax & RE_NO_GNU_OPS))
1885	token->type = OP_WORD;
1886	break;
1887	case 'W':
1888	if (!(syntax & RE_NO_GNU_OPS))
1889	token->type = OP_NOTWORD;
1890	break;
1891	case 's':
1892	if (!(syntax & RE_NO_GNU_OPS))
1893	token->type = OP_SPACE;
1894	break;
1895	case 'S':
1896	if (!(syntax & RE_NO_GNU_OPS))
1897	token->type = OP_NOTSPACE;
1898	break;
1899	case '`':
1900	if (!(syntax & RE_NO_GNU_OPS))
1901	{
1902	token->type = ANCHOR;
1903	token->opr.ctx_type = BUF_FIRST;
1904	}
1905	break;
1906	case '\'':
1907	if (!(syntax & RE_NO_GNU_OPS))
1908	{
1909	token->type = ANCHOR;
1910	token->opr.ctx_type = BUF_LAST;
1911	}
1912	break;
1913	case '(':
1914	if (!(syntax & RE_NO_BK_PARENS))
1915	token->type = OP_OPEN_SUBEXP;
1916	break;
1917	case ')':
1918	if (!(syntax & RE_NO_BK_PARENS))
1919	token->type = OP_CLOSE_SUBEXP;
1920	break;
1921	case '+':
1922	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1923	token->type = OP_DUP_PLUS;
1924	break;
1925	case '?':
1926	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1927	token->type = OP_DUP_QUESTION;
1928	break;
1929	case '{':
1930	if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1931	token->type = OP_OPEN_DUP_NUM;
1932	break;
1933	case '}':
1934	if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1935	token->type = OP_CLOSE_DUP_NUM;
1936	break;
1937	default:
1938	break;
1939	}
1940	return 2;
1941	}
1942
1943	token->type = CHARACTER;
1944	#ifdef RE_ENABLE_I18N
1945	if (input->mb_cur_max > 1)
1946	{
1947	wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1948	token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1949	}
1950	else
1951	#endif
1952	token->word_char = IS_WORD_CHAR (token->opr.c);
1953
1954	switch (c)
1955	{
1956	case '\n':
1957	if (syntax & RE_NEWLINE_ALT)
1958	token->type = OP_ALT;
1959	break;
1960	case '\|':
1961	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1962	token->type = OP_ALT;
1963	break;
1964	case '*':
1965	token->type = OP_DUP_ASTERISK;
1966	break;
1967	case '+':
1968	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1969	token->type = OP_DUP_PLUS;
1970	break;
1971	case '?':
1972	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1973	token->type = OP_DUP_QUESTION;
1974	break;
1975	case '{':
1976	if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1977	token->type = OP_OPEN_DUP_NUM;
1978	break;
1979	case '}':
1980	if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1981	token->type = OP_CLOSE_DUP_NUM;
1982	break;
1983	case '(':
1984	if (syntax & RE_NO_BK_PARENS)
1985	token->type = OP_OPEN_SUBEXP;
1986	break;
1987	case ')':
1988	if (syntax & RE_NO_BK_PARENS)
1989	token->type = OP_CLOSE_SUBEXP;
1990	break;
1991	case '[':
1992	token->type = OP_OPEN_BRACKET;
1993	break;
1994	case '.':
1995	token->type = OP_PERIOD;
1996	break;
1997	case '^':
1998	if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS \| RE_CARET_ANCHORS_HERE))
1999	&& re_string_cur_idx (input) != 0)
2000	{
2001	char prev = re_string_peek_byte (input, -1);
2002	if (!(syntax & RE_NEWLINE_ALT) \|\| prev != '\n')
2003	break;
2004	}
2005	token->type = ANCHOR;
2006	token->opr.ctx_type = LINE_FIRST;
2007	break;
2008	case '$':
2009	if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
2010	&& re_string_cur_idx (input) + 1 != re_string_length (input))
2011	{
2012	re_token_t next;
2013	re_string_skip_bytes (input, 1);
2014	peek_token (&next, input, syntax);
2015	re_string_skip_bytes (input, -1);
2016	if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2017	break;
2018	}
2019	token->type = ANCHOR;
2020	token->opr.ctx_type = LINE_LAST;
2021	break;
2022	default:
2023	break;
2024	}
2025	return 1;
2026	}
2027
2028	/* Peek a token from INPUT, and return the length of the token.
2029	We must not use this function out of bracket expressions. */
2030
2031	static int
2032	peek_token_bracket (re_token_t token, re_string_t input, reg_syntax_t syntax)
2033	{
2034	unsigned char c;
2035	if (re_string_eoi (input))
2036	{
2037	token->type = END_OF_RE;
2038	return 0;
2039	}
2040	c = re_string_peek_byte (input, 0);
2041	token->opr.c = c;
2042
2043	#ifdef RE_ENABLE_I18N
2044	if (input->mb_cur_max > 1
2045	&& !re_string_first_byte (input, re_string_cur_idx (input)))
2046	{
2047	token->type = CHARACTER;
2048	return 1;
2049	}
2050	#endif /* RE_ENABLE_I18N */
2051
2052	if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2053	&& re_string_cur_idx (input) + 1 < re_string_length (input))
2054	{
2055	/* In this case, '\' escape a character. */
2056	unsigned char c2;
2057	re_string_skip_bytes (input, 1);
2058	c2 = re_string_peek_byte (input, 0);
2059	token->opr.c = c2;
2060	token->type = CHARACTER;
2061	return 1;
2062	}
2063	if (c == '[') /* '[' is a special char in a bracket exps. */
2064	{
2065	unsigned char c2;
2066	int token_len;
2067	if (re_string_cur_idx (input) + 1 < re_string_length (input))
2068	c2 = re_string_peek_byte (input, 1);
2069	else
2070	c2 = 0;
2071	token->opr.c = c2;
2072	token_len = 2;
2073	switch (c2)
2074	{
2075	case '.':
2076	token->type = OP_OPEN_COLL_ELEM;
2077	break;
2078
2079	case '=':
2080	token->type = OP_OPEN_EQUIV_CLASS;
2081	break;
2082
2083	case ':':
2084	if (syntax & RE_CHAR_CLASSES)
2085	{
2086	token->type = OP_OPEN_CHAR_CLASS;
2087	break;
2088	}
2089	FALLTHROUGH;
2090	default:
2091	token->type = CHARACTER;
2092	token->opr.c = c;
2093	token_len = 1;
2094	break;
2095	}
2096	return token_len;
2097	}
2098	switch (c)
2099	{
2100	case '-':
2101	token->type = OP_CHARSET_RANGE;
2102	break;
2103	case ']':
2104	token->type = OP_CLOSE_BRACKET;
2105	break;
2106	case '^':
2107	token->type = OP_NON_MATCH_LIST;
2108	break;
2109	default:
2110	token->type = CHARACTER;
2111	}
2112	return 1;
2113	}
2114
2115
2116	/* Functions for parser. */
2117
2118	/* Entry point of the parser.
2119	Parse the regular expression REGEXP and return the structure tree.
2120	If an error occurs, ERR is set by error code, and return NULL.
2121	This function build the following tree, from regular expression <reg_exp>:
2122	CAT
2123	/ \
2124	/ \
2125	<reg_exp> EOR
2126
2127	CAT means concatenation.
2128	EOR means end of regular expression. */
2129
2130	static bin_tree_t *
2131	parse (re_string_t regexp, regex_t preg, reg_syntax_t syntax,
2132	reg_errcode_t *err)
2133	{
2134	re_dfa_t *dfa = preg->buffer;
2135	bin_tree_t tree, eor, *root;
2136	re_token_t current_token;
2137	dfa->syntax = syntax;
2138	fetch_token (&current_token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
2139	tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2140	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2141	return NULL;
2142	eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2143	if (tree != NULL)
2144	root = create_tree (dfa, tree, eor, CONCAT);
2145	else
2146	root = eor;
2147	if (__glibc_unlikely (eor == NULL \|\| root == NULL))
2148	{
2149	*err = REG_ESPACE;
2150	return NULL;
2151	}
2152	return root;
2153	}
2154
2155	/* This function build the following tree, from regular expression
2156	<branch1>\|<branch2>:
2157	ALT
2158	/ \
2159	/ \
2160	<branch1> <branch2>
2161
2162	ALT means alternative, which represents the operator '\|'. */
2163
2164	static bin_tree_t *
2165	parse_reg_exp (re_string_t regexp, regex_t preg, re_token_t *token,
2166	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2167	{
2168	re_dfa_t *dfa = preg->buffer;
2169	bin_tree_t tree, branch = NULL;
2170	bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2171	tree = parse_branch (regexp, preg, token, syntax, nest, err);
2172	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2173	return NULL;
2174
2175	while (token->type == OP_ALT)
2176	{
2177	fetch_token (token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
2178	if (token->type != OP_ALT && token->type != END_OF_RE
2179	&& (nest == 0 \|\| token->type != OP_CLOSE_SUBEXP))
2180	{
2181	bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2182	dfa->completed_bkref_map = initial_bkref_map;
2183	branch = parse_branch (regexp, preg, token, syntax, nest, err);
2184	if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
2185	{
2186	if (tree != NULL)
2187	postorder (tree, free_tree, NULL);
2188	return NULL;
2189	}
2190	dfa->completed_bkref_map \|= accumulated_bkref_map;
2191	}
2192	else
2193	branch = NULL;
2194	tree = create_tree (dfa, tree, branch, OP_ALT);
2195	if (__glibc_unlikely (tree == NULL))
2196	{
2197	*err = REG_ESPACE;
2198	return NULL;
2199	}
2200	}
2201	return tree;
2202	}
2203
2204	/* This function build the following tree, from regular expression
2205	<exp1><exp2>:
2206	CAT
2207	/ \
2208	/ \
2209	<exp1> <exp2>
2210
2211	CAT means concatenation. */
2212
2213	static bin_tree_t *
2214	parse_branch (re_string_t regexp, regex_t preg, re_token_t *token,
2215	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2216	{
2217	bin_tree_t tree, expr;
2218	re_dfa_t *dfa = preg->buffer;
2219	tree = parse_expression (regexp, preg, token, syntax, nest, err);
2220	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2221	return NULL;
2222
2223	while (token->type != OP_ALT && token->type != END_OF_RE
2224	&& (nest == 0 \|\| token->type != OP_CLOSE_SUBEXP))
2225	{
2226	expr = parse_expression (regexp, preg, token, syntax, nest, err);
2227	if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
2228	{
2229	if (tree != NULL)
2230	postorder (tree, free_tree, NULL);
2231	return NULL;
2232	}
2233	if (tree != NULL && expr != NULL)
2234	{
2235	bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2236	if (newtree == NULL)
2237	{
2238	postorder (expr, free_tree, NULL);
2239	postorder (tree, free_tree, NULL);
2240	*err = REG_ESPACE;
2241	return NULL;
2242	}
2243	tree = newtree;
2244	}
2245	else if (tree == NULL)
2246	tree = expr;
2247	/* Otherwise expr == NULL, we don't need to create new tree. */
2248	}
2249	return tree;
2250	}
2251
2252	/* This function build the following tree, from regular expression a*:
2253	*
2254	\|
2255	a
2256	*/
2257
2258	static bin_tree_t *
2259	parse_expression (re_string_t regexp, regex_t preg, re_token_t *token,
2260	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2261	{
2262	re_dfa_t *dfa = preg->buffer;
2263	bin_tree_t *tree;
2264	switch (token->type)
2265	{
2266	case CHARACTER:
2267	tree = create_token_tree (dfa, NULL, NULL, token);
2268	if (__glibc_unlikely (tree == NULL))
2269	{
2270	*err = REG_ESPACE;
2271	return NULL;
2272	}
2273	#ifdef RE_ENABLE_I18N
2274	if (dfa->mb_cur_max > 1)
2275	{
2276	while (!re_string_eoi (regexp)
2277	&& !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2278	{
2279	bin_tree_t *mbc_remain;
2280	fetch_token (token, regexp, syntax);
2281	mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2282	tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2283	if (__glibc_unlikely (mbc_remain == NULL \|\| tree == NULL))
2284	{
2285	*err = REG_ESPACE;
2286	return NULL;
2287	}
2288	}
2289	}
2290	#endif
2291	break;
2292
2293	case OP_OPEN_SUBEXP:
2294	tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2295	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2296	return NULL;
2297	break;
2298
2299	case OP_OPEN_BRACKET:
2300	tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2301	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2302	return NULL;
2303	break;
2304
2305	case OP_BACK_REF:
2306	if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx)))
2307	{
2308	*err = REG_ESUBREG;
2309	return NULL;
2310	}
2311	dfa->used_bkref_map \|= 1 << token->opr.idx;
2312	tree = create_token_tree (dfa, NULL, NULL, token);
2313	if (__glibc_unlikely (tree == NULL))
2314	{
2315	*err = REG_ESPACE;
2316	return NULL;
2317	}
2318	++dfa->nbackref;
2319	dfa->has_mb_node = 1;
2320	break;
2321
2322	case OP_OPEN_DUP_NUM:
2323	if (syntax & RE_CONTEXT_INVALID_DUP)
2324	{
2325	*err = REG_BADRPT;
2326	return NULL;
2327	}
2328	FALLTHROUGH;
2329	case OP_DUP_ASTERISK:
2330	case OP_DUP_PLUS:
2331	case OP_DUP_QUESTION:
2332	if (syntax & RE_CONTEXT_INVALID_OPS)
2333	{
2334	*err = REG_BADRPT;
2335	return NULL;
2336	}
2337	else if (syntax & RE_CONTEXT_INDEP_OPS)
2338	{
2339	fetch_token (token, regexp, syntax);
2340	return parse_expression (regexp, preg, token, syntax, nest, err);
2341	}
2342	FALLTHROUGH;
2343	case OP_CLOSE_SUBEXP:
2344	if ((token->type == OP_CLOSE_SUBEXP)
2345	&& !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2346	{
2347	*err = REG_ERPAREN;
2348	return NULL;
2349	}
2350	FALLTHROUGH;
2351	case OP_CLOSE_DUP_NUM:
2352	/* We treat it as a normal character. */
2353
2354	/* Then we can these characters as normal characters. */
2355	token->type = CHARACTER;
2356	/* mb_partial and word_char bits should be initialized already
2357	by peek_token. */
2358	tree = create_token_tree (dfa, NULL, NULL, token);
2359	if (__glibc_unlikely (tree == NULL))
2360	{
2361	*err = REG_ESPACE;
2362	return NULL;
2363	}
2364	break;
2365
2366	case ANCHOR:
2367	if ((token->opr.ctx_type
2368	& (WORD_DELIM \| NOT_WORD_DELIM \| WORD_FIRST \| WORD_LAST))
2369	&& dfa->word_ops_used == 0)
2370	init_word_char (dfa);
2371	if (token->opr.ctx_type == WORD_DELIM
2372	\|\| token->opr.ctx_type == NOT_WORD_DELIM)
2373	{
2374	bin_tree_t tree_first, tree_last;
2375	if (token->opr.ctx_type == WORD_DELIM)
2376	{
2377	token->opr.ctx_type = WORD_FIRST;
2378	tree_first = create_token_tree (dfa, NULL, NULL, token);
2379	token->opr.ctx_type = WORD_LAST;
2380	}
2381	else
2382	{
2383	token->opr.ctx_type = INSIDE_WORD;
2384	tree_first = create_token_tree (dfa, NULL, NULL, token);
2385	token->opr.ctx_type = INSIDE_NOTWORD;
2386	}
2387	tree_last = create_token_tree (dfa, NULL, NULL, token);
2388	tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2389	if (__glibc_unlikely (tree_first == NULL \|\| tree_last == NULL
2390	\|\| tree == NULL))
2391	{
2392	*err = REG_ESPACE;
2393	return NULL;
2394	}
2395	}
2396	else
2397	{
2398	tree = create_token_tree (dfa, NULL, NULL, token);
2399	if (__glibc_unlikely (tree == NULL))
2400	{
2401	*err = REG_ESPACE;
2402	return NULL;
2403	}
2404	}
2405	/* We must return here, since ANCHORs can't be followed
2406	by repetition operators.
2407	eg. RE"^" is invalid or "<ANCHOR(^)><CHAR()>",
2408	it must not be "<ANCHOR(^)><REPEAT()>". /
2409	fetch_token (token, regexp, syntax);
2410	return tree;
2411
2412	case OP_PERIOD:
2413	tree = create_token_tree (dfa, NULL, NULL, token);
2414	if (__glibc_unlikely (tree == NULL))
2415	{
2416	*err = REG_ESPACE;
2417	return NULL;
2418	}
2419	if (dfa->mb_cur_max > 1)
2420	dfa->has_mb_node = 1;
2421	break;
2422
2423	case OP_WORD:
2424	case OP_NOTWORD:
2425	tree = build_charclass_op (dfa, regexp->trans,
2426	"alnum",
2427	"_",
2428	token->type == OP_NOTWORD, err);
2429	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2430	return NULL;
2431	break;
2432
2433	case OP_SPACE:
2434	case OP_NOTSPACE:
2435	tree = build_charclass_op (dfa, regexp->trans,
2436	"space",
2437	"",
2438	token->type == OP_NOTSPACE, err);
2439	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2440	return NULL;
2441	break;
2442
2443	case OP_ALT:
2444	case END_OF_RE:
2445	return NULL;
2446
2447	case BACK_SLASH:
2448	*err = REG_EESCAPE;
2449	return NULL;
2450
2451	default:
2452	/* Must not happen? */
2453	DEBUG_ASSERT (false);
2454	return NULL;
2455	}
2456	fetch_token (token, regexp, syntax);
2457
2458	while (token->type == OP_DUP_ASTERISK \|\| token->type == OP_DUP_PLUS
2459	\|\| token->type == OP_DUP_QUESTION \|\| token->type == OP_OPEN_DUP_NUM)
2460	{
2461	bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2462	syntax, err);
2463	if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
2464	{
2465	if (tree != NULL)
2466	postorder (tree, free_tree, NULL);
2467	return NULL;
2468	}
2469	tree = dup_tree;
2470	/* In BRE consecutive duplications are not allowed. */
2471	if ((syntax & RE_CONTEXT_INVALID_DUP)
2472	&& (token->type == OP_DUP_ASTERISK
2473	\|\| token->type == OP_OPEN_DUP_NUM))
2474	{
2475	if (tree != NULL)
2476	postorder (tree, free_tree, NULL);
2477	*err = REG_BADRPT;
2478	return NULL;
2479	}
2480	}
2481
2482	return tree;
2483	}
2484
2485	/* This function build the following tree, from regular expression
2486	(<reg_exp>):
2487	SUBEXP
2488	\|
2489	<reg_exp>
2490	*/
2491
2492	static bin_tree_t *
2493	parse_sub_exp (re_string_t regexp, regex_t preg, re_token_t *token,
2494	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2495	{
2496	re_dfa_t *dfa = preg->buffer;
2497	bin_tree_t *tree;
2498	size_t cur_nsub;
2499	cur_nsub = preg->re_nsub++;
2500
2501	fetch_token (token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
2502
2503	/* The subexpression may be a null string. */
2504	if (token->type == OP_CLOSE_SUBEXP)
2505	tree = NULL;
2506	else
2507	{
2508	tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2509	if (__glibc_unlikely (*err == REG_NOERROR
2510	&& token->type != OP_CLOSE_SUBEXP))
2511	{
2512	if (tree != NULL)
2513	postorder (tree, free_tree, NULL);
2514	*err = REG_EPAREN;
2515	}
2516	if (__glibc_unlikely (*err != REG_NOERROR))
2517	return NULL;
2518	}
2519
2520	if (cur_nsub <= '9' - '1')
2521	dfa->completed_bkref_map \|= 1 << cur_nsub;
2522
2523	tree = create_tree (dfa, tree, NULL, SUBEXP);
2524	if (__glibc_unlikely (tree == NULL))
2525	{
2526	*err = REG_ESPACE;
2527	return NULL;
2528	}
2529	tree->token.opr.idx = cur_nsub;
2530	return tree;
2531	}
2532
2533	/* This function parse repetition operators like "", "+", "{1,3}" etc. /
2534
2535	static bin_tree_t *
2536	parse_dup_op (bin_tree_t elem, re_string_t regexp, re_dfa_t *dfa,
2537	re_token_t token, reg_syntax_t syntax, reg_errcode_t err)
2538	{
2539	bin_tree_t tree = NULL, old_tree = NULL;
2540	Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2541	re_token_t start_token = *token;
2542
2543	if (token->type == OP_OPEN_DUP_NUM)
2544	{
2545	end = 0;
2546	start = fetch_number (regexp, token, syntax);
2547	if (start == -1)
2548	{
2549	if (token->type == CHARACTER && token->opr.c == ',')
2550	start = 0; /* We treat "{,m}" as "{0,m}". */
2551	else
2552	{
2553	err = REG_BADBR; / <re>{} is invalid. */
2554	return NULL;
2555	}
2556	}
2557	if (__glibc_likely (start != -2))
2558	{
2559	/* We treat "{n}" as "{n,n}". */
2560	end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2561	: ((token->type == CHARACTER && token->opr.c == ',')
2562	? fetch_number (regexp, token, syntax) : -2));
2563	}
2564	if (__glibc_unlikely (start == -2 \|\| end == -2))
2565	{
2566	/* Invalid sequence. */
2567	if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
2568	{
2569	if (token->type == END_OF_RE)
2570	*err = REG_EBRACE;
2571	else
2572	*err = REG_BADBR;
2573
2574	return NULL;
2575	}
2576
2577	/* If the syntax bit is set, rollback. */
2578	re_string_set_index (regexp, start_idx);
2579	*token = start_token;
2580	token->type = CHARACTER;
2581	/* mb_partial and word_char bits should be already initialized by
2582	peek_token. */
2583	return elem;
2584	}
2585
2586	if (__glibc_unlikely ((end != -1 && start > end)
2587	\|\| token->type != OP_CLOSE_DUP_NUM))
2588	{
2589	/* First number greater than second. */
2590	*err = REG_BADBR;
2591	return NULL;
2592	}
2593
2594	if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end)))
2595	{
2596	*err = REG_ESIZE;
2597	return NULL;
2598	}
2599	}
2600	else
2601	{
2602	start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2603	end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2604	}
2605
2606	fetch_token (token, regexp, syntax);
2607
2608	if (__glibc_unlikely (elem == NULL))
2609	return NULL;
2610	if (__glibc_unlikely (start == 0 && end == 0))
2611	{
2612	postorder (elem, free_tree, NULL);
2613	return NULL;
2614	}
2615
2616	/* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2617	if (__glibc_unlikely (start > 0))
2618	{
2619	tree = elem;
2620	for (i = 2; i <= start; ++i)
2621	{
2622	elem = duplicate_tree (elem, dfa);
2623	tree = create_tree (dfa, tree, elem, CONCAT);
2624	if (__glibc_unlikely (elem == NULL \|\| tree == NULL))
2625	goto parse_dup_op_espace;
2626	}
2627
2628	if (start == end)
2629	return tree;
2630
2631	/* Duplicate ELEM before it is marked optional. */
2632	elem = duplicate_tree (elem, dfa);
2633	if (__glibc_unlikely (elem == NULL))
2634	goto parse_dup_op_espace;
2635	old_tree = tree;
2636	}
2637	else
2638	old_tree = NULL;
2639
2640	if (elem->token.type == SUBEXP)
2641	{
2642	uintptr_t subidx = elem->token.opr.idx;
2643	postorder (elem, mark_opt_subexp, (void *) subidx);
2644	}
2645
2646	tree = create_tree (dfa, elem, NULL,
2647	(end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2648	if (__glibc_unlikely (tree == NULL))
2649	goto parse_dup_op_espace;
2650
2651	/* This loop is actually executed only when end != -1,
2652	to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2653	already created the start+1-th copy. */
2654	if (TYPE_SIGNED (Idx) \|\| end != -1)
2655	for (i = start + 2; i <= end; ++i)
2656	{
2657	elem = duplicate_tree (elem, dfa);
2658	tree = create_tree (dfa, tree, elem, CONCAT);
2659	if (__glibc_unlikely (elem == NULL \|\| tree == NULL))
2660	goto parse_dup_op_espace;
2661
2662	tree = create_tree (dfa, tree, NULL, OP_ALT);
2663	if (__glibc_unlikely (tree == NULL))
2664	goto parse_dup_op_espace;
2665	}
2666
2667	if (old_tree)
2668	tree = create_tree (dfa, old_tree, tree, CONCAT);
2669
2670	return tree;
2671
2672	parse_dup_op_espace:
2673	*err = REG_ESPACE;
2674	return NULL;
2675	}
2676
2677	/* Size of the names for collating symbol/equivalence_class/character_class.
2678	I'm not sure, but maybe enough. */
2679	#define BRACKET_NAME_BUF_SIZE 32
2680
2681	#ifndef _LIBC
2682
2683	# ifdef RE_ENABLE_I18N
2684	/* Convert the byte B to the corresponding wide character. In a
2685	unibyte locale, treat B as itself. In a multibyte locale, return
2686	WEOF if B is an encoding error. */
2687	static wint_t
2688	parse_byte (unsigned char b, re_charset_t *mbcset)
2689	{
2690	return mbcset == NULL ? b : __btowc (b);
2691	}
2692	# endif
2693
2694	/* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2695	Build the range expression which starts from START_ELEM, and ends
2696	at END_ELEM. The result are written to MBCSET and SBCSET.
2697	RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2698	mbcset->range_ends, is a pointer argument since we may
2699	update it. */
2700
2701	static reg_errcode_t
2702	# ifdef RE_ENABLE_I18N
2703	build_range_exp (const reg_syntax_t syntax,
2704	bitset_t sbcset,
2705	re_charset_t *mbcset,
2706	Idx *range_alloc,
2707	const bracket_elem_t *start_elem,
2708	const bracket_elem_t *end_elem)
2709	# else /* not RE_ENABLE_I18N */
2710	build_range_exp (const reg_syntax_t syntax,
2711	bitset_t sbcset,
2712	const bracket_elem_t *start_elem,
2713	const bracket_elem_t *end_elem)
2714	# endif /* not RE_ENABLE_I18N */
2715	{
2716	unsigned int start_ch, end_ch;
2717	/* Equivalence Classes and Character Classes can't be a range start/end. */
2718	if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2719	\|\| start_elem->type == CHAR_CLASS
2720	\|\| end_elem->type == EQUIV_CLASS
2721	\|\| end_elem->type == CHAR_CLASS))
2722	return REG_ERANGE;
2723
2724	/* We can handle no multi character collating elements without libc
2725	support. */
2726	if (__glibc_unlikely ((start_elem->type == COLL_SYM
2727	&& strlen ((char *) start_elem->opr.name) > 1)
2728	\|\| (end_elem->type == COLL_SYM
2729	&& strlen ((char *) end_elem->opr.name) > 1)))
2730	return REG_ECOLLATE;
2731
2732	# ifdef RE_ENABLE_I18N
2733	{
2734	wchar_t wc;
2735	wint_t start_wc;
2736	wint_t end_wc;
2737
2738	start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2739	: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2740	: 0));
2741	end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2742	: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2743	: 0));
2744	start_wc = ((start_elem->type == SB_CHAR \|\| start_elem->type == COLL_SYM)
2745	? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
2746	end_wc = ((end_elem->type == SB_CHAR \|\| end_elem->type == COLL_SYM)
2747	? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
2748	if (start_wc == WEOF \|\| end_wc == WEOF)
2749	return REG_ECOLLATE;
2750	else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2751	&& start_wc > end_wc))
2752	return REG_ERANGE;
2753
2754	/* Got valid collation sequence values, add them as a new entry.
2755	However, for !_LIBC we have no collation elements: if the
2756	character set is single byte, the single byte character set
2757	that we build below suffices. parse_bracket_exp passes
2758	no MBCSET if dfa->mb_cur_max == 1. */
2759	if (mbcset)
2760	{
2761	/* Check the space of the arrays. */
2762	if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2763	{
2764	/* There is not enough space, need realloc. */
2765	wchar_t new_array_start, new_array_end;
2766	Idx new_nranges;
2767
2768	/* +1 in case of mbcset->nranges is 0. */
2769	new_nranges = 2 * mbcset->nranges + 1;
2770	/* Use realloc since mbcset->range_starts and mbcset->range_ends
2771	are NULL if range_alloc == 0. /
2772	new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2773	new_nranges);
2774	new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2775	new_nranges);
2776
2777	if (__glibc_unlikely (new_array_start == NULL
2778	\|\| new_array_end == NULL))
2779	{
2780	re_free (new_array_start);
2781	re_free (new_array_end);
2782	return REG_ESPACE;
2783	}
2784
2785	mbcset->range_starts = new_array_start;
2786	mbcset->range_ends = new_array_end;
2787	*range_alloc = new_nranges;
2788	}
2789
2790	mbcset->range_starts[mbcset->nranges] = start_wc;
2791	mbcset->range_ends[mbcset->nranges++] = end_wc;
2792	}
2793
2794	/* Build the table for single byte characters. */
2795	for (wc = 0; wc < SBC_MAX; ++wc)
2796	{
2797	if (start_wc <= wc && wc <= end_wc)
2798	bitset_set (sbcset, wc);
2799	}
2800	}
2801	# else /* not RE_ENABLE_I18N */
2802	{
2803	unsigned int ch;
2804	start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2805	: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2806	: 0));
2807	end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2808	: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2809	: 0));
2810	if (start_ch > end_ch)
2811	return REG_ERANGE;
2812	/* Build the table for single byte characters. */
2813	for (ch = 0; ch < SBC_MAX; ++ch)
2814	if (start_ch <= ch && ch <= end_ch)
2815	bitset_set (sbcset, ch);
2816	}
2817	# endif /* not RE_ENABLE_I18N */
2818	return REG_NOERROR;
2819	}
2820	#endif /* not _LIBC */
2821
2822	#ifndef _LIBC
2823	/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2824	Build the collating element which is represented by NAME.
2825	The result are written to MBCSET and SBCSET.
2826	COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2827	pointer argument since we may update it. */
2828
2829	static reg_errcode_t
2830	# ifdef RE_ENABLE_I18N
2831	build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2832	Idx coll_sym_alloc, const unsigned char name)
2833	# else /* not RE_ENABLE_I18N */
2834	build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2835	# endif /* not RE_ENABLE_I18N */
2836	{
2837	size_t name_len = strlen ((const char *) name);
2838	if (__glibc_unlikely (name_len != 1))
2839	return REG_ECOLLATE;
2840	else
2841	{
2842	bitset_set (sbcset, name[0]);
2843	return REG_NOERROR;
2844	}
2845	}
2846	#endif /* not _LIBC */
2847
2848	/* This function parse bracket expression like "[abc]", "[a-c]",
2849	"[[.a-a.]]" etc. */
2850
2851	static bin_tree_t *
2852	parse_bracket_exp (re_string_t regexp, re_dfa_t dfa, re_token_t *token,
2853	reg_syntax_t syntax, reg_errcode_t *err)
2854	{
2855	#ifdef _LIBC
2856	const unsigned char *collseqmb;
2857	const char *collseqwc;
2858	uint32_t nrules;
2859	int32_t table_size;
2860	const int32_t *symb_table;
2861	const unsigned char *extra;
2862
2863	/* Local function for parse_bracket_exp used in _LIBC environment.
2864	Seek the collating symbol entry corresponding to NAME.
2865	Return the index of the symbol in the SYMB_TABLE,
2866	or -1 if not found. */
2867
2868	auto inline int32_t
2869	__attribute__ ((always_inline))
2870	seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2871	{
2872	int32_t elem;
2873
2874	for (elem = 0; elem < table_size; elem++)
2875	if (symb_table[2 * elem] != 0)
2876	{
2877	int32_t idx = symb_table[2 * elem + 1];
2878	/* Skip the name of collating element name. */
2879	idx += 1 + extra[idx];
2880	if (/* Compare the length of the name. */
2881	name_len == extra[idx]
2882	/* Compare the name. */
2883	&& memcmp (name, &extra[idx + 1], name_len) == 0)
2884	/* Yep, this is the entry. */
2885	return elem;
2886	}
2887	return -1;
2888	}
2889
2890	/* Local function for parse_bracket_exp used in _LIBC environment.
2891	Look up the collation sequence value of BR_ELEM.
2892	Return the value if succeeded, UINT_MAX otherwise. */
2893
2894	auto inline unsigned int
2895	__attribute__ ((always_inline))
2896	lookup_collation_sequence_value (bracket_elem_t *br_elem)
2897	{
2898	if (br_elem->type == SB_CHAR)
2899	{
2900	/*
2901	if (MB_CUR_MAX == 1)
2902	*/
2903	if (nrules == 0)
2904	return collseqmb[br_elem->opr.ch];
2905	else
2906	{
2907	wint_t wc = __btowc (br_elem->opr.ch);
2908	return __collseq_table_lookup (collseqwc, wc);
2909	}
2910	}
2911	else if (br_elem->type == MB_CHAR)
2912	{
2913	if (nrules != 0)
2914	return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2915	}
2916	else if (br_elem->type == COLL_SYM)
2917	{
2918	size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2919	if (nrules != 0)
2920	{
2921	int32_t elem, idx;
2922	elem = seek_collating_symbol_entry (br_elem->opr.name,
2923	sym_name_len);
2924	if (elem != -1)
2925	{
2926	/* We found the entry. */
2927	idx = symb_table[2 * elem + 1];
2928	/* Skip the name of collating element name. */
2929	idx += 1 + extra[idx];
2930	/* Skip the byte sequence of the collating element. */
2931	idx += 1 + extra[idx];
2932	/* Adjust for the alignment. */
2933	idx = (idx + 3) & ~3;
2934	/* Skip the multibyte collation sequence value. */
2935	idx += sizeof (unsigned int);
2936	/* Skip the wide char sequence of the collating element. */
2937	idx += sizeof (unsigned int) *
2938	(1 + (unsigned int ) (extra + idx));
2939	/* Return the collation sequence value. */
2940	return (unsigned int ) (extra + idx);
2941	}
2942	else if (sym_name_len == 1)
2943	{
2944	/* No valid character. Match it as a single byte
2945	character. */
2946	return collseqmb[br_elem->opr.name[0]];
2947	}
2948	}
2949	else if (sym_name_len == 1)
2950	return collseqmb[br_elem->opr.name[0]];
2951	}
2952	return UINT_MAX;
2953	}
2954
2955	/* Local function for parse_bracket_exp used in _LIBC environment.
2956	Build the range expression which starts from START_ELEM, and ends
2957	at END_ELEM. The result are written to MBCSET and SBCSET.
2958	RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2959	mbcset->range_ends, is a pointer argument since we may
2960	update it. */
2961
2962	auto inline reg_errcode_t
2963	__attribute__ ((always_inline))
2964	build_range_exp (bitset_t sbcset, re_charset_t mbcset, int range_alloc,
2965	bracket_elem_t start_elem, bracket_elem_t end_elem)
2966	{
2967	unsigned int ch;
2968	uint32_t start_collseq;
2969	uint32_t end_collseq;
2970
2971	/* Equivalence Classes and Character Classes can't be a range
2972	start/end. */
2973	if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2974	\|\| start_elem->type == CHAR_CLASS
2975	\|\| end_elem->type == EQUIV_CLASS
2976	\|\| end_elem->type == CHAR_CLASS))
2977	return REG_ERANGE;
2978
2979	/* FIXME: Implement rational ranges here, too. */
2980	start_collseq = lookup_collation_sequence_value (start_elem);
2981	end_collseq = lookup_collation_sequence_value (end_elem);
2982	/* Check start/end collation sequence values. */
2983	if (__glibc_unlikely (start_collseq == UINT_MAX
2984	\|\| end_collseq == UINT_MAX))
2985	return REG_ECOLLATE;
2986	if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2987	&& start_collseq > end_collseq))
2988	return REG_ERANGE;
2989
2990	/* Got valid collation sequence values, add them as a new entry.
2991	However, if we have no collation elements, and the character set
2992	is single byte, the single byte character set that we
2993	build below suffices. */
2994	if (nrules > 0 \|\| dfa->mb_cur_max > 1)
2995	{
2996	/* Check the space of the arrays. */
2997	if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2998	{
2999	/* There is not enough space, need realloc. */
3000	uint32_t *new_array_start;
3001	uint32_t *new_array_end;
3002	Idx new_nranges;
3003
3004	/* +1 in case of mbcset->nranges is 0. */
3005	new_nranges = 2 * mbcset->nranges + 1;
3006	new_array_start = re_realloc (mbcset->range_starts, uint32_t,
3007	new_nranges);
3008	new_array_end = re_realloc (mbcset->range_ends, uint32_t,
3009	new_nranges);
3010
3011	if (__glibc_unlikely (new_array_start == NULL
3012	\|\| new_array_end == NULL))
3013	return REG_ESPACE;
3014
3015	mbcset->range_starts = new_array_start;
3016	mbcset->range_ends = new_array_end;
3017	*range_alloc = new_nranges;
3018	}
3019
3020	mbcset->range_starts[mbcset->nranges] = start_collseq;
3021	mbcset->range_ends[mbcset->nranges++] = end_collseq;
3022	}
3023
3024	/* Build the table for single byte characters. */
3025	for (ch = 0; ch < SBC_MAX; ch++)
3026	{
3027	uint32_t ch_collseq;
3028	/*
3029	if (MB_CUR_MAX == 1)
3030	*/
3031	if (nrules == 0)
3032	ch_collseq = collseqmb[ch];
3033	else
3034	ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3035	if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3036	bitset_set (sbcset, ch);
3037	}
3038	return REG_NOERROR;
3039	}
3040
3041	/* Local function for parse_bracket_exp used in _LIBC environment.
3042	Build the collating element which is represented by NAME.
3043	The result are written to MBCSET and SBCSET.
3044	COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3045	pointer argument since we may update it. */
3046
3047	auto inline reg_errcode_t
3048	__attribute__ ((always_inline))
3049	build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3050	Idx coll_sym_alloc, const unsigned char name)
3051	{
3052	int32_t elem, idx;
3053	size_t name_len = strlen ((const char *) name);
3054	if (nrules != 0)
3055	{
3056	elem = seek_collating_symbol_entry (name, name_len);
3057	if (elem != -1)
3058	{
3059	/* We found the entry. */
3060	idx = symb_table[2 * elem + 1];
3061	/* Skip the name of collating element name. */
3062	idx += 1 + extra[idx];
3063	}
3064	else if (name_len == 1)
3065	{
3066	/* No valid character, treat it as a normal
3067	character. */
3068	bitset_set (sbcset, name[0]);
3069	return REG_NOERROR;
3070	}
3071	else
3072	return REG_ECOLLATE;
3073
3074	/* Got valid collation sequence, add it as a new entry. */
3075	/* Check the space of the arrays. */
3076	if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
3077	{
3078	/* Not enough, realloc it. */
3079	/* +1 in case of mbcset->ncoll_syms is 0. */
3080	Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3081	/* Use realloc since mbcset->coll_syms is NULL
3082	if alloc == 0. /
3083	int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3084	new_coll_sym_alloc);
3085	if (__glibc_unlikely (new_coll_syms == NULL))
3086	return REG_ESPACE;
3087	mbcset->coll_syms = new_coll_syms;
3088	*coll_sym_alloc = new_coll_sym_alloc;
3089	}
3090	mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3091	return REG_NOERROR;
3092	}
3093	else
3094	{
3095	if (__glibc_unlikely (name_len != 1))
3096	return REG_ECOLLATE;
3097	else
3098	{
3099	bitset_set (sbcset, name[0]);
3100	return REG_NOERROR;
3101	}
3102	}
3103	}
3104	#endif
3105
3106	re_token_t br_token;
3107	re_bitset_ptr_t sbcset;
3108	#ifdef RE_ENABLE_I18N
3109	re_charset_t *mbcset;
3110	Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3111	Idx equiv_class_alloc = 0, char_class_alloc = 0;
3112	#endif /* not RE_ENABLE_I18N */
3113	bool non_match = false;
3114	bin_tree_t *work_tree;
3115	int token_len;
3116	bool first_round = true;
3117	#ifdef _LIBC
3118	collseqmb = (const unsigned char *)
3119	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3120	nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3121	if (nrules)
3122	{
3123	/*
3124	if (MB_CUR_MAX > 1)
3125	*/
3126	collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3127	table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3128	symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3129	_NL_COLLATE_SYMB_TABLEMB);
3130	extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3131	_NL_COLLATE_SYMB_EXTRAMB);
3132	}
3133	#endif
3134	sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3135	#ifdef RE_ENABLE_I18N
3136	mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3137	#endif /* RE_ENABLE_I18N */
3138	#ifdef RE_ENABLE_I18N
3139	if (__glibc_unlikely (sbcset == NULL \|\| mbcset == NULL))
3140	#else
3141	if (__glibc_unlikely (sbcset == NULL))
3142	#endif /* RE_ENABLE_I18N */
3143	{
3144	re_free (sbcset);
3145	#ifdef RE_ENABLE_I18N
3146	re_free (mbcset);
3147	#endif
3148	*err = REG_ESPACE;
3149	return NULL;
3150	}
3151
3152	token_len = peek_token_bracket (token, regexp, syntax);
3153	if (__glibc_unlikely (token->type == END_OF_RE))
3154	{
3155	*err = REG_BADPAT;
3156	goto parse_bracket_exp_free_return;
3157	}
3158	if (token->type == OP_NON_MATCH_LIST)
3159	{
3160	#ifdef RE_ENABLE_I18N
3161	mbcset->non_match = 1;
3162	#endif /* not RE_ENABLE_I18N */
3163	non_match = true;
3164	if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3165	bitset_set (sbcset, '\n');
3166	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3167	token_len = peek_token_bracket (token, regexp, syntax);
3168	if (__glibc_unlikely (token->type == END_OF_RE))
3169	{
3170	*err = REG_BADPAT;
3171	goto parse_bracket_exp_free_return;
3172	}
3173	}
3174
3175	/* We treat the first ']' as a normal character. */
3176	if (token->type == OP_CLOSE_BRACKET)
3177	token->type = CHARACTER;
3178
3179	while (1)
3180	{
3181	bracket_elem_t start_elem, end_elem;
3182	unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3183	unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3184	reg_errcode_t ret;
3185	int token_len2 = 0;
3186	bool is_range_exp = false;
3187	re_token_t token2;
3188
3189	start_elem.opr.name = start_name_buf;
3190	start_elem.type = COLL_SYM;
3191	ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3192	syntax, first_round);
3193	if (__glibc_unlikely (ret != REG_NOERROR))
3194	{
3195	*err = ret;
3196	goto parse_bracket_exp_free_return;
3197	}
3198	first_round = false;
3199
3200	/* Get information about the next token. We need it in any case. */
3201	token_len = peek_token_bracket (token, regexp, syntax);
3202
3203	/* Do not check for ranges if we know they are not allowed. */
3204	if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3205	{
3206	if (__glibc_unlikely (token->type == END_OF_RE))
3207	{
3208	*err = REG_EBRACK;
3209	goto parse_bracket_exp_free_return;
3210	}
3211	if (token->type == OP_CHARSET_RANGE)
3212	{
3213	re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3214	token_len2 = peek_token_bracket (&token2, regexp, syntax);
3215	if (__glibc_unlikely (token2.type == END_OF_RE))
3216	{
3217	*err = REG_EBRACK;
3218	goto parse_bracket_exp_free_return;
3219	}
3220	if (token2.type == OP_CLOSE_BRACKET)
3221	{
3222	/* We treat the last '-' as a normal character. */
3223	re_string_skip_bytes (regexp, -token_len);
3224	token->type = CHARACTER;
3225	}
3226	else
3227	is_range_exp = true;
3228	}
3229	}
3230
3231	if (is_range_exp == true)
3232	{
3233	end_elem.opr.name = end_name_buf;
3234	end_elem.type = COLL_SYM;
3235	ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3236	dfa, syntax, true);
3237	if (__glibc_unlikely (ret != REG_NOERROR))
3238	{
3239	*err = ret;
3240	goto parse_bracket_exp_free_return;
3241	}
3242
3243	token_len = peek_token_bracket (token, regexp, syntax);
3244
3245	#ifdef _LIBC
3246	*err = build_range_exp (sbcset, mbcset, &range_alloc,
3247	&start_elem, &end_elem);
3248	#else
3249	# ifdef RE_ENABLE_I18N
3250	*err = build_range_exp (syntax, sbcset,
3251	dfa->mb_cur_max > 1 ? mbcset : NULL,
3252	&range_alloc, &start_elem, &end_elem);
3253	# else
3254	*err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3255	# endif
3256	#endif /* RE_ENABLE_I18N */
3257	if (__glibc_unlikely (*err != REG_NOERROR))
3258	goto parse_bracket_exp_free_return;
3259	}
3260	else
3261	{
3262	switch (start_elem.type)
3263	{
3264	case SB_CHAR:
3265	bitset_set (sbcset, start_elem.opr.ch);
3266	break;
3267	#ifdef RE_ENABLE_I18N
3268	case MB_CHAR:
3269	/* Check whether the array has enough space. */
3270	if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
3271	{
3272	wchar_t *new_mbchars;
3273	/* Not enough, realloc it. */
3274	/* +1 in case of mbcset->nmbchars is 0. */
3275	mbchar_alloc = 2 * mbcset->nmbchars + 1;
3276	/* Use realloc since array is NULL if alloc == 0. /
3277	new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3278	mbchar_alloc);
3279	if (__glibc_unlikely (new_mbchars == NULL))
3280	goto parse_bracket_exp_espace;
3281	mbcset->mbchars = new_mbchars;
3282	}
3283	mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3284	break;
3285	#endif /* RE_ENABLE_I18N */
3286	case EQUIV_CLASS:
3287	*err = build_equiv_class (sbcset,
3288	#ifdef RE_ENABLE_I18N
3289	mbcset, &equiv_class_alloc,
3290	#endif /* RE_ENABLE_I18N */
3291	start_elem.opr.name);
3292	if (__glibc_unlikely (*err != REG_NOERROR))
3293	goto parse_bracket_exp_free_return;
3294	break;
3295	case COLL_SYM:
3296	*err = build_collating_symbol (sbcset,
3297	#ifdef RE_ENABLE_I18N
3298	mbcset, &coll_sym_alloc,
3299	#endif /* RE_ENABLE_I18N */
3300	start_elem.opr.name);
3301	if (__glibc_unlikely (*err != REG_NOERROR))
3302	goto parse_bracket_exp_free_return;
3303	break;
3304	case CHAR_CLASS:
3305	*err = build_charclass (regexp->trans, sbcset,
3306	#ifdef RE_ENABLE_I18N
3307	mbcset, &char_class_alloc,
3308	#endif /* RE_ENABLE_I18N */
3309	(const char *) start_elem.opr.name,
3310	syntax);
3311	if (__glibc_unlikely (*err != REG_NOERROR))
3312	goto parse_bracket_exp_free_return;
3313	break;
3314	default:
3315	DEBUG_ASSERT (false);
3316	break;
3317	}
3318	}
3319	if (__glibc_unlikely (token->type == END_OF_RE))
3320	{
3321	*err = REG_EBRACK;
3322	goto parse_bracket_exp_free_return;
3323	}
3324	if (token->type == OP_CLOSE_BRACKET)
3325	break;
3326	}
3327
3328	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3329
3330	/* If it is non-matching list. */
3331	if (non_match)
3332	bitset_not (sbcset);
3333
3334	#ifdef RE_ENABLE_I18N
3335	/* Ensure only single byte characters are set. */
3336	if (dfa->mb_cur_max > 1)
3337	bitset_mask (sbcset, dfa->sb_char);
3338
3339	if (mbcset->nmbchars \|\| mbcset->ncoll_syms \|\| mbcset->nequiv_classes
3340	\|\| mbcset->nranges \|\| (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3341	\|\| mbcset->non_match)))
3342	{
3343	bin_tree_t *mbc_tree;
3344	int sbc_idx;
3345	/* Build a tree for complex bracket. */
3346	dfa->has_mb_node = 1;
3347	br_token.type = COMPLEX_BRACKET;
3348	br_token.opr.mbcset = mbcset;
3349	mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3350	if (__glibc_unlikely (mbc_tree == NULL))
3351	goto parse_bracket_exp_espace;
3352	for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3353	if (sbcset[sbc_idx])
3354	break;
3355	/* If there are no bits set in sbcset, there is no point
3356	of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3357	if (sbc_idx < BITSET_WORDS)
3358	{
3359	/* Build a tree for simple bracket. */
3360	br_token.type = SIMPLE_BRACKET;
3361	br_token.opr.sbcset = sbcset;
3362	work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3363	if (__glibc_unlikely (work_tree == NULL))
3364	goto parse_bracket_exp_espace;
3365
3366	/* Then join them by ALT node. */
3367	work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3368	if (__glibc_unlikely (work_tree == NULL))
3369	goto parse_bracket_exp_espace;
3370	}
3371	else
3372	{
3373	re_free (sbcset);
3374	work_tree = mbc_tree;
3375	}
3376	}
3377	else
3378	#endif /* not RE_ENABLE_I18N */
3379	{
3380	#ifdef RE_ENABLE_I18N
3381	free_charset (mbcset);
3382	#endif
3383	/* Build a tree for simple bracket. */
3384	br_token.type = SIMPLE_BRACKET;
3385	br_token.opr.sbcset = sbcset;
3386	work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3387	if (__glibc_unlikely (work_tree == NULL))
3388	goto parse_bracket_exp_espace;
3389	}
3390	return work_tree;
3391
3392	parse_bracket_exp_espace:
3393	*err = REG_ESPACE;
3394	parse_bracket_exp_free_return:
3395	re_free (sbcset);
3396	#ifdef RE_ENABLE_I18N
3397	free_charset (mbcset);
3398	#endif /* RE_ENABLE_I18N */
3399	return NULL;
3400	}
3401
3402	/* Parse an element in the bracket expression. */
3403
3404	static reg_errcode_t
3405	parse_bracket_element (bracket_elem_t elem, re_string_t regexp,
3406	re_token_t token, int token_len, re_dfa_t dfa,
3407	reg_syntax_t syntax, bool accept_hyphen)
3408	{
3409	#ifdef RE_ENABLE_I18N
3410	int cur_char_size;
3411	cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3412	if (cur_char_size > 1)
3413	{
3414	elem->type = MB_CHAR;
3415	elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3416	re_string_skip_bytes (regexp, cur_char_size);
3417	return REG_NOERROR;
3418	}
3419	#endif /* RE_ENABLE_I18N */
3420	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3421	if (token->type == OP_OPEN_COLL_ELEM \|\| token->type == OP_OPEN_CHAR_CLASS
3422	\|\| token->type == OP_OPEN_EQUIV_CLASS)
3423	return parse_bracket_symbol (elem, regexp, token);
3424	if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
3425	{
3426	/* A '-' must only appear as anything but a range indicator before
3427	the closing bracket. Everything else is an error. */
3428	re_token_t token2;
3429	(void) peek_token_bracket (&token2, regexp, syntax);
3430	if (token2.type != OP_CLOSE_BRACKET)
3431	/* The actual error value is not standardized since this whole
3432	case is undefined. But ERANGE makes good sense. */
3433	return REG_ERANGE;
3434	}
3435	elem->type = SB_CHAR;
3436	elem->opr.ch = token->opr.c;
3437	return REG_NOERROR;
3438	}
3439
3440	/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3441	such as [:<character_class>:], [.<collating_element>.], and
3442	[=<equivalent_class>=]. */
3443
3444	static reg_errcode_t
3445	parse_bracket_symbol (bracket_elem_t elem, re_string_t regexp,
3446	re_token_t *token)
3447	{
3448	unsigned char ch, delim = token->opr.c;
3449	int i = 0;
3450	if (re_string_eoi(regexp))
3451	return REG_EBRACK;
3452	for (;; ++i)
3453	{
3454	if (i >= BRACKET_NAME_BUF_SIZE)
3455	return REG_EBRACK;
3456	if (token->type == OP_OPEN_CHAR_CLASS)
3457	ch = re_string_fetch_byte_case (regexp);
3458	else
3459	ch = re_string_fetch_byte (regexp);
3460	if (re_string_eoi(regexp))
3461	return REG_EBRACK;
3462	if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3463	break;
3464	elem->opr.name[i] = ch;
3465	}
3466	re_string_skip_bytes (regexp, 1);
3467	elem->opr.name[i] = '\0';
3468	switch (token->type)
3469	{
3470	case OP_OPEN_COLL_ELEM:
3471	elem->type = COLL_SYM;
3472	break;
3473	case OP_OPEN_EQUIV_CLASS:
3474	elem->type = EQUIV_CLASS;
3475	break;
3476	case OP_OPEN_CHAR_CLASS:
3477	elem->type = CHAR_CLASS;
3478	break;
3479	default:
3480	break;
3481	}
3482	return REG_NOERROR;
3483	}
3484
3485	/* Helper function for parse_bracket_exp.
3486	Build the equivalence class which is represented by NAME.
3487	The result are written to MBCSET and SBCSET.
3488	EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3489	is a pointer argument since we may update it. */
3490
3491	static reg_errcode_t
3492	#ifdef RE_ENABLE_I18N
3493	build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3494	Idx equiv_class_alloc, const unsigned char name)
3495	#else /* not RE_ENABLE_I18N */
3496	build_equiv_class (bitset_t sbcset, const unsigned char *name)
3497	#endif /* not RE_ENABLE_I18N */
3498	{
3499	#ifdef _LIBC
3500	uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3501	if (nrules != 0)
3502	{
3503	const int32_t table, indirect;
3504	const unsigned char weights, extra, *cp;
3505	unsigned char char_buf[2];
3506	int32_t idx1, idx2;
3507	unsigned int ch;
3508	size_t len;
3509	/* Calculate the index for equivalence class. */
3510	cp = name;
3511	table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3512	weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3513	_NL_COLLATE_WEIGHTMB);
3514	extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3515	_NL_COLLATE_EXTRAMB);
3516	indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3517	_NL_COLLATE_INDIRECTMB);
3518	idx1 = findidx (table, indirect, extra, &cp, -1);
3519	if (__glibc_unlikely (idx1 == 0 \|\| *cp != '\0'))
3520	/* This isn't a valid character. */
3521	return REG_ECOLLATE;
3522
3523	/* Build single byte matching table for this equivalence class. */
3524	len = weights[idx1 & 0xffffff];
3525	for (ch = 0; ch < SBC_MAX; ++ch)
3526	{
3527	char_buf[0] = ch;
3528	cp = char_buf;
3529	idx2 = findidx (table, indirect, extra, &cp, 1);
3530	/*
3531	idx2 = table[ch];
3532	*/
3533	if (idx2 == 0)
3534	/* This isn't a valid character. */
3535	continue;
3536	/* Compare only if the length matches and the collation rule
3537	index is the same. */
3538	if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3539	&& memcmp (weights + (idx1 & 0xffffff) + 1,
3540	weights + (idx2 & 0xffffff) + 1, len) == 0)
3541	bitset_set (sbcset, ch);
3542	}
3543	/* Check whether the array has enough space. */
3544	if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
3545	{
3546	/* Not enough, realloc it. */
3547	/* +1 in case of mbcset->nequiv_classes is 0. */
3548	Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3549	/* Use realloc since the array is NULL if alloc == 0. /
3550	int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3551	int32_t,
3552	new_equiv_class_alloc);
3553	if (__glibc_unlikely (new_equiv_classes == NULL))
3554	return REG_ESPACE;
3555	mbcset->equiv_classes = new_equiv_classes;
3556	*equiv_class_alloc = new_equiv_class_alloc;
3557	}
3558	mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3559	}
3560	else
3561	#endif /* _LIBC */
3562	{
3563	if (__glibc_unlikely (strlen ((const char *) name) != 1))
3564	return REG_ECOLLATE;
3565	bitset_set (sbcset, *name);
3566	}
3567	return REG_NOERROR;
3568	}
3569
3570	/* Helper function for parse_bracket_exp.
3571	Build the character class which is represented by NAME.
3572	The result are written to MBCSET and SBCSET.
3573	CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3574	is a pointer argument since we may update it. */
3575
3576	static reg_errcode_t
3577	#ifdef RE_ENABLE_I18N
3578	build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3579	re_charset_t mbcset, Idx char_class_alloc,
3580	const char *class_name, reg_syntax_t syntax)
3581	#else /* not RE_ENABLE_I18N */
3582	build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3583	const char *class_name, reg_syntax_t syntax)
3584	#endif /* not RE_ENABLE_I18N */
3585	{
3586	int i;
3587	const char *name = class_name;
3588
3589	/* In case of REG_ICASE "upper" and "lower" match the both of
3590	upper and lower cases. */
3591	if ((syntax & RE_ICASE)
3592	&& (strcmp (name, "upper") == 0 \|\| strcmp (name, "lower") == 0))
3593	name = "alpha";
3594
3595	#ifdef RE_ENABLE_I18N
3596	/* Check the space of the arrays. */
3597	if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
3598	{
3599	/* Not enough, realloc it. */
3600	/* +1 in case of mbcset->nchar_classes is 0. */
3601	Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3602	/* Use realloc since array is NULL if alloc == 0. /
3603	wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3604	new_char_class_alloc);
3605	if (__glibc_unlikely (new_char_classes == NULL))
3606	return REG_ESPACE;
3607	mbcset->char_classes = new_char_classes;
3608	*char_class_alloc = new_char_class_alloc;
3609	}
3610	mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3611	#endif /* RE_ENABLE_I18N */
3612
3613	#define BUILD_CHARCLASS_LOOP(ctype_func) \
3614	do { \
3615	if (__glibc_unlikely (trans != NULL)) \
3616	{ \
3617	for (i = 0; i < SBC_MAX; ++i) \
3618	if (ctype_func (i)) \
3619	bitset_set (sbcset, trans[i]); \
3620	} \
3621	else \
3622	{ \
3623	for (i = 0; i < SBC_MAX; ++i) \
3624	if (ctype_func (i)) \
3625	bitset_set (sbcset, i); \
3626	} \
3627	} while (0)
3628
3629	if (strcmp (name, "alnum") == 0)
3630	BUILD_CHARCLASS_LOOP (isalnum);
3631	else if (strcmp (name, "cntrl") == 0)
3632	BUILD_CHARCLASS_LOOP (iscntrl);
3633	else if (strcmp (name, "lower") == 0)
3634	BUILD_CHARCLASS_LOOP (islower);
3635	else if (strcmp (name, "space") == 0)
3636	BUILD_CHARCLASS_LOOP (isspace);
3637	else if (strcmp (name, "alpha") == 0)
3638	BUILD_CHARCLASS_LOOP (isalpha);
3639	else if (strcmp (name, "digit") == 0)
3640	BUILD_CHARCLASS_LOOP (isdigit);
3641	else if (strcmp (name, "print") == 0)
3642	BUILD_CHARCLASS_LOOP (isprint);
3643	else if (strcmp (name, "upper") == 0)
3644	BUILD_CHARCLASS_LOOP (isupper);
3645	else if (strcmp (name, "blank") == 0)
3646	BUILD_CHARCLASS_LOOP (isblank);
3647	else if (strcmp (name, "graph") == 0)
3648	BUILD_CHARCLASS_LOOP (isgraph);
3649	else if (strcmp (name, "punct") == 0)
3650	BUILD_CHARCLASS_LOOP (ispunct);
3651	else if (strcmp (name, "xdigit") == 0)
3652	BUILD_CHARCLASS_LOOP (isxdigit);
3653	else
3654	return REG_ECTYPE;
3655
3656	return REG_NOERROR;
3657	}
3658
3659	static bin_tree_t *
3660	build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3661	const char *class_name,
3662	const char *extra, bool non_match,
3663	reg_errcode_t *err)
3664	{
3665	re_bitset_ptr_t sbcset;
3666	#ifdef RE_ENABLE_I18N
3667	re_charset_t *mbcset;
3668	Idx alloc = 0;
3669	#endif /* not RE_ENABLE_I18N */
3670	reg_errcode_t ret;
3671	bin_tree_t *tree;
3672
3673	sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3674	if (__glibc_unlikely (sbcset == NULL))
3675	{
3676	*err = REG_ESPACE;
3677	return NULL;
3678	}
3679	#ifdef RE_ENABLE_I18N
3680	mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3681	if (__glibc_unlikely (mbcset == NULL))
3682	{
3683	re_free (sbcset);
3684	*err = REG_ESPACE;
3685	return NULL;
3686	}
3687	mbcset->non_match = non_match;
3688	#endif /* RE_ENABLE_I18N */
3689
3690	/* We don't care the syntax in this case. */
3691	ret = build_charclass (trans, sbcset,
3692	#ifdef RE_ENABLE_I18N
3693	mbcset, &alloc,
3694	#endif /* RE_ENABLE_I18N */
3695	class_name, 0);
3696
3697	if (__glibc_unlikely (ret != REG_NOERROR))
3698	{
3699	re_free (sbcset);
3700	#ifdef RE_ENABLE_I18N
3701	free_charset (mbcset);
3702	#endif /* RE_ENABLE_I18N */
3703	*err = ret;
3704	return NULL;
3705	}
3706	/* \w match '_' also. */
3707	for (; *extra; extra++)
3708	bitset_set (sbcset, *extra);
3709
3710	/* If it is non-matching list. */
3711	if (non_match)
3712	bitset_not (sbcset);
3713
3714	#ifdef RE_ENABLE_I18N
3715	/* Ensure only single byte characters are set. */
3716	if (dfa->mb_cur_max > 1)
3717	bitset_mask (sbcset, dfa->sb_char);
3718	#endif
3719
3720	/* Build a tree for simple bracket. */
3721	re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset };
3722	tree = create_token_tree (dfa, NULL, NULL, &br_token);
3723	if (__glibc_unlikely (tree == NULL))
3724	goto build_word_op_espace;
3725
3726	#ifdef RE_ENABLE_I18N
3727	if (dfa->mb_cur_max > 1)
3728	{
3729	bin_tree_t *mbc_tree;
3730	/* Build a tree for complex bracket. */
3731	br_token.type = COMPLEX_BRACKET;
3732	br_token.opr.mbcset = mbcset;
3733	dfa->has_mb_node = 1;
3734	mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3735	if (__glibc_unlikely (mbc_tree == NULL))
3736	goto build_word_op_espace;
3737	/* Then join them by ALT node. */
3738	tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3739	if (__glibc_likely (mbc_tree != NULL))
3740	return tree;
3741	}
3742	else
3743	{
3744	free_charset (mbcset);
3745	return tree;
3746	}
3747	#else /* not RE_ENABLE_I18N */
3748	return tree;
3749	#endif /* not RE_ENABLE_I18N */
3750
3751	build_word_op_espace:
3752	re_free (sbcset);
3753	#ifdef RE_ENABLE_I18N
3754	free_charset (mbcset);
3755	#endif /* RE_ENABLE_I18N */
3756	*err = REG_ESPACE;
3757	return NULL;
3758	}
3759
3760	/* This is intended for the expressions like "a{1,3}".
3761	Fetch a number from 'input', and return the number.
3762	Return -1 if the number field is empty like "{,1}".
3763	Return RE_DUP_MAX + 1 if the number field is too large.
3764	Return -2 if an error occurred. */
3765
3766	static Idx
3767	fetch_number (re_string_t input, re_token_t token, reg_syntax_t syntax)
3768	{
3769	Idx num = -1;
3770	unsigned char c;
3771	while (1)
3772	{
3773	fetch_token (token, input, syntax);
3774	c = token->opr.c;
3775	if (__glibc_unlikely (token->type == END_OF_RE))
3776	return -2;
3777	if (token->type == OP_CLOSE_DUP_NUM \|\| c == ',')
3778	break;
3779	num = ((token->type != CHARACTER \|\| c < '0' \|\| '9' < c \|\| num == -2)
3780	? -2
3781	: num == -1
3782	? c - '0'
3783	: MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3784	}
3785	return num;
3786	}
3787
3788
3789	#ifdef RE_ENABLE_I18N
3790	static void
3791	free_charset (re_charset_t *cset)
3792	{
3793	re_free (cset->mbchars);
3794	# ifdef _LIBC
3795	re_free (cset->coll_syms);
3796	re_free (cset->equiv_classes);
3797	# endif
3798	re_free (cset->range_starts);
3799	re_free (cset->range_ends);
3800	re_free (cset->char_classes);
3801	re_free (cset);
3802	}
3803	#endif /* RE_ENABLE_I18N */
3804
3805
3806	/* Functions for binary tree operation. */
3807
3808	/* Create a tree node. */
3809
3810	static bin_tree_t *
3811	create_tree (re_dfa_t dfa, bin_tree_t left, bin_tree_t *right,
3812	re_token_type_t type)
3813	{
3814	re_token_t t = { .type = type };
3815	return create_token_tree (dfa, left, right, &t);
3816	}
3817
3818	static bin_tree_t *
3819	create_token_tree (re_dfa_t dfa, bin_tree_t left, bin_tree_t *right,
3820	const re_token_t *token)
3821	{
3822	bin_tree_t *tree;
3823	if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
3824	{
3825	bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3826
3827	if (storage == NULL)
3828	return NULL;
3829	storage->next = dfa->str_tree_storage;
3830	dfa->str_tree_storage = storage;
3831	dfa->str_tree_storage_idx = 0;
3832	}
3833	tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3834
3835	tree->parent = NULL;
3836	tree->left = left;
3837	tree->right = right;
3838	tree->token = *token;
3839	tree->token.duplicated = 0;
3840	tree->token.opt_subexp = 0;
3841	tree->first = NULL;
3842	tree->next = NULL;
3843	tree->node_idx = -1;
3844
3845	if (left != NULL)
3846	left->parent = tree;
3847	if (right != NULL)
3848	right->parent = tree;
3849	return tree;
3850	}
3851
3852	/* Mark the tree SRC as an optional subexpression.
3853	To be called from preorder or postorder. */
3854
3855	static reg_errcode_t
3856	mark_opt_subexp (void extra, bin_tree_t node)
3857	{
3858	Idx idx = (uintptr_t) extra;
3859	if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3860	node->token.opt_subexp = 1;
3861
3862	return REG_NOERROR;
3863	}
3864
3865	/* Free the allocated memory inside NODE. */
3866
3867	static void
3868	free_token (re_token_t *node)
3869	{
3870	#ifdef RE_ENABLE_I18N
3871	if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3872	free_charset (node->opr.mbcset);
3873	else
3874	#endif /* RE_ENABLE_I18N */
3875	if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3876	re_free (node->opr.sbcset);
3877	}
3878
3879	/* Worker function for tree walking. Free the allocated memory inside NODE
3880	and its children. */
3881
3882	static reg_errcode_t
3883	free_tree (void extra, bin_tree_t node)
3884	{
3885	free_token (&node->token);
3886	return REG_NOERROR;
3887	}
3888
3889
3890	/* Duplicate the node SRC, and return new node. This is a preorder
3891	visit similar to the one implemented by the generic visitor, but
3892	we need more infrastructure to maintain two parallel trees --- so,
3893	it's easier to duplicate. */
3894
3895	static bin_tree_t *
3896	duplicate_tree (const bin_tree_t root, re_dfa_t dfa)
3897	{
3898	const bin_tree_t *node;
3899	bin_tree_t *dup_root;
3900	bin_tree_t *p_new = &dup_root, dup_node = root->parent;
3901
3902	for (node = root; ; )
3903	{
3904	/* Create a new tree and link it back to the current parent. */
3905	*p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3906	if (*p_new == NULL)
3907	return NULL;
3908	(*p_new)->parent = dup_node;
3909	(*p_new)->token.duplicated = 1;
3910	dup_node = *p_new;
3911
3912	/* Go to the left node, or up and to the right. */
3913	if (node->left)
3914	{
3915	node = node->left;
3916	p_new = &dup_node->left;
3917	}
3918	else
3919	{
3920	const bin_tree_t *prev = NULL;
3921	while (node->right == prev \|\| node->right == NULL)
3922	{
3923	prev = node;
3924	node = node->parent;
3925	dup_node = dup_node->parent;
3926	if (!node)
3927	return dup_root;
3928	}
3929	node = node->right;
3930	p_new = &dup_node->right;
3931	}
3932	}
3933	}

Note: See TracBrowser for help on using the repository browser.

source: kBuild/trunk/src/grep/lib/regcomp.c

Download in other formats: