regcomp.c

Last change on this file was 2595, checked in by bird, 13 years ago
gnu grep version 2.12 (grep-2.12.tar.xz, md5sum=8d2f0346d08b13c18afb81f0e8aa1e2f)
Property svn:eol-style set to `native`
File size: 110.8 KB

Line
1	/* Extended regular expression matching and search library.
2	Copyright (C) 2002-2012 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4	Contributed by Isamu Hasegawa <[email protected]>.
5
6	This program is free software; you can redistribute it and/or modify
7	it under the terms of the GNU General Public License as published by
8	the Free Software Foundation; either version 3, or (at your option)
9	any later version.
10
11	This program is distributed in the hope that it will be useful,
12	but WITHOUT ANY WARRANTY; without even the implied warranty of
13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	GNU General Public License for more details.
15
16	You should have received a copy of the GNU General Public License along
17	with this program; if not, see <http://www.gnu.org/licenses/>. */
18
19	static reg_errcode_t re_compile_internal (regex_t preg, const char pattern,
20	size_t length, reg_syntax_t syntax);
21	static void re_compile_fastmap_iter (regex_t *bufp,
22	const re_dfastate_t *init_state,
23	char *fastmap);
24	static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
25	#ifdef RE_ENABLE_I18N
26	static void free_charset (re_charset_t *cset);
27	#endif /* RE_ENABLE_I18N */
28	static void free_workarea_compile (regex_t *preg);
29	static reg_errcode_t create_initial_state (re_dfa_t *dfa);
30	#ifdef RE_ENABLE_I18N
31	static void optimize_utf8 (re_dfa_t *dfa);
32	#endif
33	static reg_errcode_t analyze (regex_t *preg);
34	static reg_errcode_t preorder (bin_tree_t *root,
35	reg_errcode_t (fn (void , bin_tree_t )),
36	void *extra);
37	static reg_errcode_t postorder (bin_tree_t *root,
38	reg_errcode_t (fn (void , bin_tree_t )),
39	void *extra);
40	static reg_errcode_t optimize_subexps (void extra, bin_tree_t node);
41	static reg_errcode_t lower_subexps (void extra, bin_tree_t node);
42	static bin_tree_t lower_subexp (reg_errcode_t err, regex_t *preg,
43	bin_tree_t *node);
44	static reg_errcode_t calc_first (void extra, bin_tree_t node);
45	static reg_errcode_t calc_next (void extra, bin_tree_t node);
46	static reg_errcode_t link_nfa_nodes (void extra, bin_tree_t node);
47	static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
48	static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
49	unsigned int constraint);
50	static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
51	static reg_errcode_t calc_eclosure_iter (re_node_set new_set, re_dfa_t dfa,
52	Idx node, bool root);
53	static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
54	static Idx fetch_number (re_string_t input, re_token_t token,
55	reg_syntax_t syntax);
56	static int peek_token (re_token_t token, re_string_t input,
57	reg_syntax_t syntax) internal_function;
58	static bin_tree_t parse (re_string_t regexp, regex_t *preg,
59	reg_syntax_t syntax, reg_errcode_t *err);
60	static bin_tree_t parse_reg_exp (re_string_t regexp, regex_t *preg,
61	re_token_t *token, reg_syntax_t syntax,
62	Idx nest, reg_errcode_t *err);
63	static bin_tree_t parse_branch (re_string_t regexp, regex_t *preg,
64	re_token_t *token, reg_syntax_t syntax,
65	Idx nest, reg_errcode_t *err);
66	static bin_tree_t parse_expression (re_string_t regexp, regex_t *preg,
67	re_token_t *token, reg_syntax_t syntax,
68	Idx nest, reg_errcode_t *err);
69	static bin_tree_t parse_sub_exp (re_string_t regexp, regex_t *preg,
70	re_token_t *token, reg_syntax_t syntax,
71	Idx nest, reg_errcode_t *err);
72	static bin_tree_t parse_dup_op (bin_tree_t dup_elem, re_string_t *regexp,
73	re_dfa_t dfa, re_token_t token,
74	reg_syntax_t syntax, reg_errcode_t *err);
75	static bin_tree_t parse_bracket_exp (re_string_t regexp, re_dfa_t *dfa,
76	re_token_t *token, reg_syntax_t syntax,
77	reg_errcode_t *err);
78	static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
79	re_string_t *regexp,
80	re_token_t *token, int token_len,
81	re_dfa_t *dfa,
82	reg_syntax_t syntax,
83	bool accept_hyphen);
84	static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
85	re_string_t *regexp,
86	re_token_t *token);
87	#ifdef RE_ENABLE_I18N
88	static reg_errcode_t build_equiv_class (bitset_t sbcset,
89	re_charset_t *mbcset,
90	Idx *equiv_class_alloc,
91	const unsigned char *name);
92	static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
93	bitset_t sbcset,
94	re_charset_t *mbcset,
95	Idx *char_class_alloc,
96	const unsigned char *class_name,
97	reg_syntax_t syntax);
98	#else /* not RE_ENABLE_I18N */
99	static reg_errcode_t build_equiv_class (bitset_t sbcset,
100	const unsigned char *name);
101	static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
102	bitset_t sbcset,
103	const unsigned char *class_name,
104	reg_syntax_t syntax);
105	#endif /* not RE_ENABLE_I18N */
106	static bin_tree_t build_charclass_op (re_dfa_t dfa,
107	RE_TRANSLATE_TYPE trans,
108	const unsigned char *class_name,
109	const unsigned char *extra,
110	bool non_match, reg_errcode_t *err);
111	static bin_tree_t create_tree (re_dfa_t dfa,
112	bin_tree_t left, bin_tree_t right,
113	re_token_type_t type);
114	static bin_tree_t create_token_tree (re_dfa_t dfa,
115	bin_tree_t left, bin_tree_t right,
116	const re_token_t *token);
117	static bin_tree_t duplicate_tree (const bin_tree_t src, re_dfa_t *dfa);
118	static void free_token (re_token_t *node);
119	static reg_errcode_t free_tree (void extra, bin_tree_t node);
120	static reg_errcode_t mark_opt_subexp (void extra, bin_tree_t node);
121
122
123	/* This table gives an error message for each of the error codes listed
124	in regex.h. Obviously the order here has to be same as there.
125	POSIX doesn't require that we do anything for REG_NOERROR,
126	but why not be nice? */
127
128	static const char __re_error_msgid[] =
129	{
130	#define REG_NOERROR_IDX 0
131	gettext_noop ("Success") /* REG_NOERROR */
132	"\0"
133	#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134	gettext_noop ("No match") /* REG_NOMATCH */
135	"\0"
136	#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
137	gettext_noop ("Invalid regular expression") /* REG_BADPAT */
138	"\0"
139	#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140	gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
141	"\0"
142	#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143	gettext_noop ("Invalid character class name") /* REG_ECTYPE */
144	"\0"
145	#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
146	gettext_noop ("Trailing backslash") /* REG_EESCAPE */
147	"\0"
148	#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
149	gettext_noop ("Invalid back reference") /* REG_ESUBREG */
150	"\0"
151	#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
152	gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
153	"\0"
154	#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155	gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
156	"\0"
157	#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158	gettext_noop ("Unmatched \\{") /* REG_EBRACE */
159	"\0"
160	#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
161	gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
162	"\0"
163	#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164	gettext_noop ("Invalid range end") /* REG_ERANGE */
165	"\0"
166	#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
167	gettext_noop ("Memory exhausted") /* REG_ESPACE */
168	"\0"
169	#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
170	gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
171	"\0"
172	#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173	gettext_noop ("Premature end of regular expression") /* REG_EEND */
174	"\0"
175	#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
176	gettext_noop ("Regular expression too big") /* REG_ESIZE */
177	"\0"
178	#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
179	gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
180	};
181
182	static const size_t __re_error_msgid_idx[] =
183	{
184	REG_NOERROR_IDX,
185	REG_NOMATCH_IDX,
186	REG_BADPAT_IDX,
187	REG_ECOLLATE_IDX,
188	REG_ECTYPE_IDX,
189	REG_EESCAPE_IDX,
190	REG_ESUBREG_IDX,
191	REG_EBRACK_IDX,
192	REG_EPAREN_IDX,
193	REG_EBRACE_IDX,
194	REG_BADBR_IDX,
195	REG_ERANGE_IDX,
196	REG_ESPACE_IDX,
197	REG_BADRPT_IDX,
198	REG_EEND_IDX,
199	REG_ESIZE_IDX,
200	REG_ERPAREN_IDX
201	};
202
203
204	/* Entry points for GNU code. */
205
206	/* re_compile_pattern is the GNU regular expression compiler: it
207	compiles PATTERN (of length LENGTH) and puts the result in BUFP.
208	Returns 0 if the pattern was valid, otherwise an error string.
209
210	Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
211	are set in BUFP on entry. */
212
213	#ifdef _LIBC
214	const char *
215	re_compile_pattern (pattern, length, bufp)
216	const char *pattern;
217	size_t length;
218	struct re_pattern_buffer *bufp;
219	#else /* size_t might promote */
220	const char *
221	re_compile_pattern (const char *pattern, size_t length,
222	struct re_pattern_buffer *bufp)
223	#endif
224	{
225	reg_errcode_t ret;
226
227	/* And GNU code determines whether or not to get register information
228	by passing null for the REGS argument to re_match, etc., not by
229	setting no_sub, unless RE_NO_SUB is set. */
230	bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
231
232	/* Match anchors at newline. */
233	bufp->newline_anchor = 1;
234
235	ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
236
237	if (!ret)
238	return NULL;
239	return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
240	}
241	#ifdef _LIBC
242	weak_alias (__re_compile_pattern, re_compile_pattern)
243	#endif
244
245	/* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
246	also be assigned to arbitrarily: each pattern buffer stores its own
247	syntax, so it can be changed between regex compilations. */
248	/* This has no initializer because initialized variables in Emacs
249	become read-only after dumping. */
250	reg_syntax_t re_syntax_options;
251
252
253	/* Specify the precise syntax of regexps for compilation. This provides
254	for compatibility for various utilities which historically have
255	different, incompatible syntaxes.
256
257	The argument SYNTAX is a bit mask comprised of the various bits
258	defined in regex.h. We return the old syntax. */
259
260	reg_syntax_t
261	re_set_syntax (syntax)
262	reg_syntax_t syntax;
263	{
264	reg_syntax_t ret = re_syntax_options;
265
266	re_syntax_options = syntax;
267	return ret;
268	}
269	#ifdef _LIBC
270	weak_alias (__re_set_syntax, re_set_syntax)
271	#endif
272
273	int
274	re_compile_fastmap (bufp)
275	struct re_pattern_buffer *bufp;
276	{
277	re_dfa_t *dfa = bufp->buffer;
278	char *fastmap = bufp->fastmap;
279
280	memset (fastmap, '\0', sizeof (char) * SBC_MAX);
281	re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
282	if (dfa->init_state != dfa->init_state_word)
283	re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
284	if (dfa->init_state != dfa->init_state_nl)
285	re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
286	if (dfa->init_state != dfa->init_state_begbuf)
287	re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
288	bufp->fastmap_accurate = 1;
289	return 0;
290	}
291	#ifdef _LIBC
292	weak_alias (__re_compile_fastmap, re_compile_fastmap)
293	#endif
294
295	static inline void
296	__attribute ((always_inline))
297	re_set_fastmap (char *fastmap, bool icase, int ch)
298	{
299	fastmap[ch] = 1;
300	if (icase)
301	fastmap[tolower (ch)] = 1;
302	}
303
304	/* Helper function for re_compile_fastmap.
305	Compile fastmap for the initial_state INIT_STATE. */
306
307	static void
308	re_compile_fastmap_iter (regex_t bufp, const re_dfastate_t init_state,
309	char *fastmap)
310	{
311	re_dfa_t *dfa = bufp->buffer;
312	Idx node_cnt;
313	bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
314	for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
315	{
316	Idx node = init_state->nodes.elems[node_cnt];
317	re_token_type_t type = dfa->nodes[node].type;
318
319	if (type == CHARACTER)
320	{
321	re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
322	#ifdef RE_ENABLE_I18N
323	if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
324	{
325	unsigned char buf[MB_LEN_MAX];
326	unsigned char *p;
327	wchar_t wc;
328	mbstate_t state;
329
330	p = buf;
331	*p++ = dfa->nodes[node].opr.c;
332	while (++node < dfa->nodes_len
333	&& dfa->nodes[node].type == CHARACTER
334	&& dfa->nodes[node].mb_partial)
335	*p++ = dfa->nodes[node].opr.c;
336	memset (&state, '\0', sizeof (state));
337	if (__mbrtowc (&wc, (const char *) buf, p - buf,
338	&state) == p - buf
339	&& (__wcrtomb ((char *) buf, towlower (wc), &state)
340	!= (size_t) -1))
341	re_set_fastmap (fastmap, false, buf[0]);
342	}
343	#endif
344	}
345	else if (type == SIMPLE_BRACKET)
346	{
347	int i, ch;
348	for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
349	{
350	int j;
351	bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
352	for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
353	if (w & ((bitset_word_t) 1 << j))
354	re_set_fastmap (fastmap, icase, ch);
355	}
356	}
357	#ifdef RE_ENABLE_I18N
358	else if (type == COMPLEX_BRACKET)
359	{
360	re_charset_t *cset = dfa->nodes[node].opr.mbcset;
361	Idx i;
362
363	# ifdef _LIBC
364	/* See if we have to try all bytes which start multiple collation
365	elements.
366	e.g. In da_DK, we want to catch 'a' since "aa" is a valid
367	collation element, and don't catch 'b' since 'b' is
368	the only collation element which starts from 'b' (and
369	it is caught by SIMPLE_BRACKET). */
370	if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
371	&& (cset->ncoll_syms \|\| cset->nranges))
372	{
373	const int32_t table = (const int32_t )
374	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
375	for (i = 0; i < SBC_MAX; ++i)
376	if (table[i] < 0)
377	re_set_fastmap (fastmap, icase, i);
378	}
379	# endif /* _LIBC */
380
381	/* See if we have to start the match at all multibyte characters,
382	i.e. where we would not find an invalid sequence. This only
383	applies to multibyte character sets; for single byte character
384	sets, the SIMPLE_BRACKET again suffices. */
385	if (dfa->mb_cur_max > 1
386	&& (cset->nchar_classes \|\| cset->non_match \|\| cset->nranges
387	# ifdef _LIBC
388	\|\| cset->nequiv_classes
389	# endif /* _LIBC */
390	))
391	{
392	unsigned char c = 0;
393	do
394	{
395	mbstate_t mbs;
396	memset (&mbs, 0, sizeof (mbs));
397	if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
398	re_set_fastmap (fastmap, false, (int) c);
399	}
400	while (++c != 0);
401	}
402
403	else
404	{
405	/* ... Else catch all bytes which can start the mbchars. */
406	for (i = 0; i < cset->nmbchars; ++i)
407	{
408	char buf[256];
409	mbstate_t state;
410	memset (&state, '\0', sizeof (state));
411	if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
412	re_set_fastmap (fastmap, icase, (unsigned char ) buf);
413	if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
414	{
415	if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
416	!= (size_t) -1)
417	re_set_fastmap (fastmap, false, (unsigned char ) buf);
418	}
419	}
420	}
421	}
422	#endif /* RE_ENABLE_I18N */
423	else if (type == OP_PERIOD
424	#ifdef RE_ENABLE_I18N
425	\|\| type == OP_UTF8_PERIOD
426	#endif /* RE_ENABLE_I18N */
427	\|\| type == END_OF_RE)
428	{
429	memset (fastmap, '\1', sizeof (char) * SBC_MAX);
430	if (type == END_OF_RE)
431	bufp->can_be_null = 1;
432	return;
433	}
434	}
435	}
436
437
438	/* Entry point for POSIX code. */
439	/* regcomp takes a regular expression as a string and compiles it.
440
441	PREG is a regex_t *. We do not expect any fields to be initialized,
442	since POSIX says we shouldn't. Thus, we set
443
444	'buffer' to the compiled pattern;
445	'used' to the length of the compiled pattern;
446	'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
447	REG_EXTENDED bit in CFLAGS is set; otherwise, to
448	RE_SYNTAX_POSIX_BASIC;
449	'newline_anchor' to REG_NEWLINE being set in CFLAGS;
450	'fastmap' to an allocated space for the fastmap;
451	'fastmap_accurate' to zero;
452	're_nsub' to the number of subexpressions in PATTERN.
453
454	PATTERN is the address of the pattern string.
455
456	CFLAGS is a series of bits which affect compilation.
457
458	If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
459	use POSIX basic syntax.
460
461	If REG_NEWLINE is set, then . and [^...] don't match newline.
462	Also, regexec will try a match beginning after every newline.
463
464	If REG_ICASE is set, then we considers upper- and lowercase
465	versions of letters to be equivalent when matching.
466
467	If REG_NOSUB is set, then when PREG is passed to regexec, that
468	routine will report only success or failure, and nothing about the
469	registers.
470
471	It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
472	the return codes and their meanings.) */
473
474	int
475	regcomp (preg, pattern, cflags)
476	regex_t *_Restrict_ preg;
477	const char *_Restrict_ pattern;
478	int cflags;
479	{
480	reg_errcode_t ret;
481	reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
482	: RE_SYNTAX_POSIX_BASIC);
483
484	preg->buffer = NULL;
485	preg->allocated = 0;
486	preg->used = 0;
487
488	/* Try to allocate space for the fastmap. */
489	preg->fastmap = re_malloc (char, SBC_MAX);
490	if (BE (preg->fastmap == NULL, 0))
491	return REG_ESPACE;
492
493	syntax \|= (cflags & REG_ICASE) ? RE_ICASE : 0;
494
495	/* If REG_NEWLINE is set, newlines are treated differently. */
496	if (cflags & REG_NEWLINE)
497	{ /* REG_NEWLINE implies neither . nor [^...] match newline. */
498	syntax &= ~RE_DOT_NEWLINE;
499	syntax \|= RE_HAT_LISTS_NOT_NEWLINE;
500	/* It also changes the matching behavior. */
501	preg->newline_anchor = 1;
502	}
503	else
504	preg->newline_anchor = 0;
505	preg->no_sub = !!(cflags & REG_NOSUB);
506	preg->translate = NULL;
507
508	ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
509
510	/* POSIX doesn't distinguish between an unmatched open-group and an
511	unmatched close-group: both are REG_EPAREN. */
512	if (ret == REG_ERPAREN)
513	ret = REG_EPAREN;
514
515	/* We have already checked preg->fastmap != NULL. */
516	if (BE (ret == REG_NOERROR, 1))
517	/* Compute the fastmap now, since regexec cannot modify the pattern
518	buffer. This function never fails in this implementation. */
519	(void) re_compile_fastmap (preg);
520	else
521	{
522	/* Some error occurred while compiling the expression. */
523	re_free (preg->fastmap);
524	preg->fastmap = NULL;
525	}
526
527	return (int) ret;
528	}
529	#ifdef _LIBC
530	weak_alias (__regcomp, regcomp)
531	#endif
532
533	/* Returns a message corresponding to an error code, ERRCODE, returned
534	from either regcomp or regexec. We don't use PREG here. */
535
536	#ifdef _LIBC
537	size_t
538	regerror (errcode, preg, errbuf, errbuf_size)
539	int errcode;
540	const regex_t *_Restrict_ preg;
541	char *_Restrict_ errbuf;
542	size_t errbuf_size;
543	#else /* size_t might promote */
544	size_t
545	regerror (int errcode, const regex_t *_Restrict_ preg _UNUSED_PARAMETER_,
546	char *_Restrict_ errbuf, size_t errbuf_size)
547	#endif
548	{
549	const char *msg;
550	size_t msg_size;
551
552	if (BE (errcode < 0
553	\|\| errcode >= (int) (sizeof (__re_error_msgid_idx)
554	/ sizeof (__re_error_msgid_idx[0])), 0))
555	/* Only error codes returned by the rest of the code should be passed
556	to this routine. If we are given anything else, or if other regex
557	code generates an invalid error code, then the program has a bug.
558	Dump core so we can fix it. */
559	abort ();
560
561	msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
562
563	msg_size = strlen (msg) + 1; /* Includes the null. */
564
565	if (BE (errbuf_size != 0, 1))
566	{
567	size_t cpy_size = msg_size;
568	if (BE (msg_size > errbuf_size, 0))
569	{
570	cpy_size = errbuf_size - 1;
571	errbuf[cpy_size] = '\0';
572	}
573	memcpy (errbuf, msg, cpy_size);
574	}
575
576	return msg_size;
577	}
578	#ifdef _LIBC
579	weak_alias (__regerror, regerror)
580	#endif
581
582
583	#ifdef RE_ENABLE_I18N
584	/* This static array is used for the map to single-byte characters when
585	UTF-8 is used. Otherwise we would allocate memory just to initialize
586	it the same all the time. UTF-8 is the preferred encoding so this is
587	a worthwhile optimization. */
588	static const bitset_t utf8_sb_map =
589	{
590	/* Set the first 128 bits. */
591	# ifdef __GNUC__
592	[0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
593	# else
594	# if 4 * BITSET_WORD_BITS < ASCII_CHARS
595	# error "bitset_word_t is narrower than 32 bits"
596	# elif 3 * BITSET_WORD_BITS < ASCII_CHARS
597	BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
598	# elif 2 * BITSET_WORD_BITS < ASCII_CHARS
599	BITSET_WORD_MAX, BITSET_WORD_MAX,
600	# elif 1 * BITSET_WORD_BITS < ASCII_CHARS
601	BITSET_WORD_MAX,
602	# endif
603	(BITSET_WORD_MAX
604	>> (SBC_MAX % BITSET_WORD_BITS == 0
605	? 0
606	: BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
607	# endif
608	};
609	#endif
610
611
612	static void
613	free_dfa_content (re_dfa_t *dfa)
614	{
615	Idx i, j;
616
617	if (dfa->nodes)
618	for (i = 0; i < dfa->nodes_len; ++i)
619	free_token (dfa->nodes + i);
620	re_free (dfa->nexts);
621	for (i = 0; i < dfa->nodes_len; ++i)
622	{
623	if (dfa->eclosures != NULL)
624	re_node_set_free (dfa->eclosures + i);
625	if (dfa->inveclosures != NULL)
626	re_node_set_free (dfa->inveclosures + i);
627	if (dfa->edests != NULL)
628	re_node_set_free (dfa->edests + i);
629	}
630	re_free (dfa->edests);
631	re_free (dfa->eclosures);
632	re_free (dfa->inveclosures);
633	re_free (dfa->nodes);
634
635	if (dfa->state_table)
636	for (i = 0; i <= dfa->state_hash_mask; ++i)
637	{
638	struct re_state_table_entry *entry = dfa->state_table + i;
639	for (j = 0; j < entry->num; ++j)
640	{
641	re_dfastate_t *state = entry->array[j];
642	free_state (state);
643	}
644	re_free (entry->array);
645	}
646	re_free (dfa->state_table);
647	#ifdef RE_ENABLE_I18N
648	if (dfa->sb_char != utf8_sb_map)
649	re_free (dfa->sb_char);
650	#endif
651	re_free (dfa->subexp_map);
652	#ifdef DEBUG
653	re_free (dfa->re_str);
654	#endif
655
656	re_free (dfa);
657	}
658
659
660	/* Free dynamically allocated space used by PREG. */
661
662	void
663	regfree (preg)
664	regex_t *preg;
665	{
666	re_dfa_t *dfa = preg->buffer;
667	if (BE (dfa != NULL, 1))
668	free_dfa_content (dfa);
669	preg->buffer = NULL;
670	preg->allocated = 0;
671
672	re_free (preg->fastmap);
673	preg->fastmap = NULL;
674
675	re_free (preg->translate);
676	preg->translate = NULL;
677	}
678	#ifdef _LIBC
679	weak_alias (__regfree, regfree)
680	#endif
681
682
683	/* Entry points compatible with 4.2 BSD regex library. We don't define
684	them unless specifically requested. */
685
686	#if defined _REGEX_RE_COMP \|\| defined _LIBC
687
688	/* BSD has one and only one pattern buffer. */
689	static struct re_pattern_buffer re_comp_buf;
690
691	char *
692	# ifdef _LIBC
693	/* Make these definitions weak in libc, so POSIX programs can redefine
694	these names if they don't use our functions, and still use
695	regcomp/regexec above without link errors. */
696	weak_function
697	# endif
698	re_comp (s)
699	const char *s;
700	{
701	reg_errcode_t ret;
702	char *fastmap;
703
704	if (!s)
705	{
706	if (!re_comp_buf.buffer)
707	return gettext ("No previous regular expression");
708	return 0;
709	}
710
711	if (re_comp_buf.buffer)
712	{
713	fastmap = re_comp_buf.fastmap;
714	re_comp_buf.fastmap = NULL;
715	__regfree (&re_comp_buf);
716	memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
717	re_comp_buf.fastmap = fastmap;
718	}
719
720	if (re_comp_buf.fastmap == NULL)
721	{
722	re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
723	if (re_comp_buf.fastmap == NULL)
724	return (char *) gettext (__re_error_msgid
725	+ __re_error_msgid_idx[(int) REG_ESPACE]);
726	}
727
728	/* Since 're_exec' always passes NULL for the 'regs' argument, we
729	don't need to initialize the pattern buffer fields which affect it. */
730
731	/* Match anchors at newlines. */
732	re_comp_buf.newline_anchor = 1;
733
734	ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
735
736	if (!ret)
737	return NULL;
738
739	/* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
740	return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
741	}
742
743	#ifdef _LIBC
744	libc_freeres_fn (free_mem)
745	{
746	__regfree (&re_comp_buf);
747	}
748	#endif
749
750	#endif /* _REGEX_RE_COMP */
751
752
753	/* Internal entry point.
754	Compile the regular expression PATTERN, whose length is LENGTH.
755	SYNTAX indicate regular expression's syntax. */
756
757	static reg_errcode_t
758	re_compile_internal (regex_t preg, const char pattern, size_t length,
759	reg_syntax_t syntax)
760	{
761	reg_errcode_t err = REG_NOERROR;
762	re_dfa_t *dfa;
763	re_string_t regexp;
764
765	/* Initialize the pattern buffer. */
766	preg->fastmap_accurate = 0;
767	preg->syntax = syntax;
768	preg->not_bol = preg->not_eol = 0;
769	preg->used = 0;
770	preg->re_nsub = 0;
771	preg->can_be_null = 0;
772	preg->regs_allocated = REGS_UNALLOCATED;
773
774	/* Initialize the dfa. */
775	dfa = preg->buffer;
776	if (BE (preg->allocated < sizeof (re_dfa_t), 0))
777	{
778	/* If zero allocated, but buffer is non-null, try to realloc
779	enough space. This loses if buffer's address is bogus, but
780	that is the user's responsibility. If ->buffer is NULL this
781	is a simple allocation. */
782	dfa = re_realloc (preg->buffer, re_dfa_t, 1);
783	if (dfa == NULL)
784	return REG_ESPACE;
785	preg->allocated = sizeof (re_dfa_t);
786	preg->buffer = dfa;
787	}
788	preg->used = sizeof (re_dfa_t);
789
790	err = init_dfa (dfa, length);
791	if (BE (err != REG_NOERROR, 0))
792	{
793	free_dfa_content (dfa);
794	preg->buffer = NULL;
795	preg->allocated = 0;
796	return err;
797	}
798	#ifdef DEBUG
799	/* Note: length+1 will not overflow since it is checked in init_dfa. */
800	dfa->re_str = re_malloc (char, length + 1);
801	strncpy (dfa->re_str, pattern, length + 1);
802	#endif
803
804	__libc_lock_init (dfa->lock);
805
806	err = re_string_construct (&regexp, pattern, length, preg->translate,
807	(syntax & RE_ICASE) != 0, dfa);
808	if (BE (err != REG_NOERROR, 0))
809	{
810	re_compile_internal_free_return:
811	free_workarea_compile (preg);
812	re_string_destruct (&regexp);
813	free_dfa_content (dfa);
814	preg->buffer = NULL;
815	preg->allocated = 0;
816	return err;
817	}
818
819	/* Parse the regular expression, and build a structure tree. */
820	preg->re_nsub = 0;
821	dfa->str_tree = parse (&regexp, preg, syntax, &err);
822	if (BE (dfa->str_tree == NULL, 0))
823	goto re_compile_internal_free_return;
824
825	/* Analyze the tree and create the nfa. */
826	err = analyze (preg);
827	if (BE (err != REG_NOERROR, 0))
828	goto re_compile_internal_free_return;
829
830	#ifdef RE_ENABLE_I18N
831	/* If possible, do searching in single byte encoding to speed things up. */
832	if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
833	optimize_utf8 (dfa);
834	#endif
835
836	/* Then create the initial state of the dfa. */
837	err = create_initial_state (dfa);
838
839	/* Release work areas. */
840	free_workarea_compile (preg);
841	re_string_destruct (&regexp);
842
843	if (BE (err != REG_NOERROR, 0))
844	{
845	free_dfa_content (dfa);
846	preg->buffer = NULL;
847	preg->allocated = 0;
848	}
849
850	return err;
851	}
852
853	/* Initialize DFA. We use the length of the regular expression PAT_LEN
854	as the initial length of some arrays. */
855
856	static reg_errcode_t
857	init_dfa (re_dfa_t *dfa, size_t pat_len)
858	{
859	__re_size_t table_size;
860	#ifndef _LIBC
861	const char *codeset_name;
862	#endif
863	#ifdef RE_ENABLE_I18N
864	size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
865	#else
866	size_t max_i18n_object_size = 0;
867	#endif
868	size_t max_object_size =
869	MAX (sizeof (struct re_state_table_entry),
870	MAX (sizeof (re_token_t),
871	MAX (sizeof (re_node_set),
872	MAX (sizeof (regmatch_t),
873	max_i18n_object_size))));
874
875	memset (dfa, '\0', sizeof (re_dfa_t));
876
877	/* Force allocation of str_tree_storage the first time. */
878	dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
879
880	/* Avoid overflows. The extra "/ 2" is for the table_size doubling
881	calculation below, and for similar doubling calculations
882	elsewhere. And it's <= rather than <, because some of the
883	doubling calculations add 1 afterwards. */
884	if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
885	return REG_ESPACE;
886
887	dfa->nodes_alloc = pat_len + 1;
888	dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
889
890	/* table_size = 2 ^ ceil(log pat_len) */
891	for (table_size = 1; ; table_size <<= 1)
892	if (table_size > pat_len)
893	break;
894
895	dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
896	dfa->state_hash_mask = table_size - 1;
897
898	dfa->mb_cur_max = MB_CUR_MAX;
899	#ifdef _LIBC
900	if (dfa->mb_cur_max == 6
901	&& strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
902	dfa->is_utf8 = 1;
903	dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
904	!= 0);
905	#else
906	codeset_name = nl_langinfo (CODESET);
907	if (strcasecmp (codeset_name, "UTF-8") == 0
908	\|\| strcasecmp (codeset_name, "UTF8") == 0)
909	dfa->is_utf8 = 1;
910
911	/* We check exhaustively in the loop below if this charset is a
912	superset of ASCII. */
913	dfa->map_notascii = 0;
914	#endif
915
916	#ifdef RE_ENABLE_I18N
917	if (dfa->mb_cur_max > 1)
918	{
919	if (dfa->is_utf8)
920	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
921	else
922	{
923	int i, j, ch;
924
925	dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
926	if (BE (dfa->sb_char == NULL, 0))
927	return REG_ESPACE;
928
929	/* Set the bits corresponding to single byte chars. */
930	for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
931	for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
932	{
933	wint_t wch = __btowc (ch);
934	if (wch != WEOF)
935	dfa->sb_char[i] \|= (bitset_word_t) 1 << j;
936	# ifndef _LIBC
937	if (isascii (ch) && wch != ch)
938	dfa->map_notascii = 1;
939	# endif
940	}
941	}
942	}
943	#endif
944
945	if (BE (dfa->nodes == NULL \|\| dfa->state_table == NULL, 0))
946	return REG_ESPACE;
947	return REG_NOERROR;
948	}
949
950	/* Initialize WORD_CHAR table, which indicate which character is
951	"word". In this case "word" means that it is the word construction
952	character used by some operators like "\<", "\>", etc. */
953
954	static void
955	internal_function
956	init_word_char (re_dfa_t *dfa)
957	{
958	dfa->word_ops_used = 1;
959	int i = 0;
960	int j;
961	int ch = 0;
962	if (BE (dfa->map_notascii == 0, 1))
963	{
964	if (BITSET_WORD_BITS == 64)
965	{
966	dfa->word_char[0] = UINT64_C (0x03ff000000000000);
967	dfa->word_char[1] = UINT64_C (0x07fffffe87fffffe);
968	i = 2;
969	}
970	else if (BITSET_WORD_BITS == 32)
971	{
972	dfa->word_char[0] = UINT32_C (0x00000000);
973	dfa->word_char[1] = UINT32_C (0x03ff0000);
974	dfa->word_char[2] = UINT32_C (0x87fffffe);
975	dfa->word_char[3] = UINT32_C (0x07fffffe);
976	i = 4;
977	}
978	else
979	goto general_case;
980	ch = 128;
981
982	if (BE (dfa->is_utf8, 1))
983	{
984	memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
985	return;
986	}
987	}
988
989	general_case:
990	for (; i < BITSET_WORDS; ++i)
991	for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
992	if (isalnum (ch) \|\| ch == '_')
993	dfa->word_char[i] \|= (bitset_word_t) 1 << j;
994	}
995
996	/* Free the work area which are only used while compiling. */
997
998	static void
999	free_workarea_compile (regex_t *preg)
1000	{
1001	re_dfa_t *dfa = preg->buffer;
1002	bin_tree_storage_t storage, next;
1003	for (storage = dfa->str_tree_storage; storage; storage = next)
1004	{
1005	next = storage->next;
1006	re_free (storage);
1007	}
1008	dfa->str_tree_storage = NULL;
1009	dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1010	dfa->str_tree = NULL;
1011	re_free (dfa->org_indices);
1012	dfa->org_indices = NULL;
1013	}
1014
1015	/* Create initial states for all contexts. */
1016
1017	static reg_errcode_t
1018	create_initial_state (re_dfa_t *dfa)
1019	{
1020	Idx first, i;
1021	reg_errcode_t err;
1022	re_node_set init_nodes;
1023
1024	/* Initial states have the epsilon closure of the node which is
1025	the first node of the regular expression. */
1026	first = dfa->str_tree->first->node_idx;
1027	dfa->init_node = first;
1028	err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1029	if (BE (err != REG_NOERROR, 0))
1030	return err;
1031
1032	/* The back-references which are in initial states can epsilon transit,
1033	since in this case all of the subexpressions can be null.
1034	Then we add epsilon closures of the nodes which are the next nodes of
1035	the back-references. */
1036	if (dfa->nbackref > 0)
1037	for (i = 0; i < init_nodes.nelem; ++i)
1038	{
1039	Idx node_idx = init_nodes.elems[i];
1040	re_token_type_t type = dfa->nodes[node_idx].type;
1041
1042	Idx clexp_idx;
1043	if (type != OP_BACK_REF)
1044	continue;
1045	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1046	{
1047	re_token_t *clexp_node;
1048	clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1049	if (clexp_node->type == OP_CLOSE_SUBEXP
1050	&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1051	break;
1052	}
1053	if (clexp_idx == init_nodes.nelem)
1054	continue;
1055
1056	if (type == OP_BACK_REF)
1057	{
1058	Idx dest_idx = dfa->edests[node_idx].elems[0];
1059	if (!re_node_set_contains (&init_nodes, dest_idx))
1060	{
1061	reg_errcode_t merge_err
1062	= re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1063	if (merge_err != REG_NOERROR)
1064	return merge_err;
1065	i = 0;
1066	}
1067	}
1068	}
1069
1070	/* It must be the first time to invoke acquire_state. */
1071	dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1072	/* We don't check ERR here, since the initial state must not be NULL. */
1073	if (BE (dfa->init_state == NULL, 0))
1074	return err;
1075	if (dfa->init_state->has_constraint)
1076	{
1077	dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1078	CONTEXT_WORD);
1079	dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1080	CONTEXT_NEWLINE);
1081	dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1082	&init_nodes,
1083	CONTEXT_NEWLINE
1084	\| CONTEXT_BEGBUF);
1085	if (BE (dfa->init_state_word == NULL \|\| dfa->init_state_nl == NULL
1086	\|\| dfa->init_state_begbuf == NULL, 0))
1087	return err;
1088	}
1089	else
1090	dfa->init_state_word = dfa->init_state_nl
1091	= dfa->init_state_begbuf = dfa->init_state;
1092
1093	re_node_set_free (&init_nodes);
1094	return REG_NOERROR;
1095	}
1096
1097
1098	#ifdef RE_ENABLE_I18N
1099	/* If it is possible to do searching in single byte encoding instead of UTF-8
1100	to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1101	DFA nodes where needed. */
1102
1103	static void
1104	optimize_utf8 (re_dfa_t *dfa)
1105	{
1106	Idx node;
1107	int i;
1108	bool mb_chars = false;
1109	bool has_period = false;
1110
1111	for (node = 0; node < dfa->nodes_len; ++node)
1112	switch (dfa->nodes[node].type)
1113	{
1114	case CHARACTER:
1115	if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1116	mb_chars = true;
1117	break;
1118	case ANCHOR:
1119	switch (dfa->nodes[node].opr.ctx_type)
1120	{
1121	case LINE_FIRST:
1122	case LINE_LAST:
1123	case BUF_FIRST:
1124	case BUF_LAST:
1125	break;
1126	default:
1127	/* Word anchors etc. cannot be handled. It's okay to test
1128	opr.ctx_type since constraints (for all DFA nodes) are
1129	created by ORing one or more opr.ctx_type values. */
1130	return;
1131	}
1132	break;
1133	case OP_PERIOD:
1134	has_period = true;
1135	break;
1136	case OP_BACK_REF:
1137	case OP_ALT:
1138	case END_OF_RE:
1139	case OP_DUP_ASTERISK:
1140	case OP_OPEN_SUBEXP:
1141	case OP_CLOSE_SUBEXP:
1142	break;
1143	case COMPLEX_BRACKET:
1144	return;
1145	case SIMPLE_BRACKET:
1146	/* Just double check. */
1147	{
1148	int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1149	? 0
1150	: BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1151	for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1152	{
1153	if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1154	return;
1155	rshift = 0;
1156	}
1157	}
1158	break;
1159	default:
1160	abort ();
1161	}
1162
1163	if (mb_chars \|\| has_period)
1164	for (node = 0; node < dfa->nodes_len; ++node)
1165	{
1166	if (dfa->nodes[node].type == CHARACTER
1167	&& dfa->nodes[node].opr.c >= ASCII_CHARS)
1168	dfa->nodes[node].mb_partial = 0;
1169	else if (dfa->nodes[node].type == OP_PERIOD)
1170	dfa->nodes[node].type = OP_UTF8_PERIOD;
1171	}
1172
1173	/* The search can be in single byte locale. */
1174	dfa->mb_cur_max = 1;
1175	dfa->is_utf8 = 0;
1176	dfa->has_mb_node = dfa->nbackref > 0 \|\| has_period;
1177	}
1178	#endif
1179
1180
1181	/* Analyze the structure tree, and calculate "first", "next", "edest",
1182	"eclosure", and "inveclosure". */
1183
1184	static reg_errcode_t
1185	analyze (regex_t *preg)
1186	{
1187	re_dfa_t *dfa = preg->buffer;
1188	reg_errcode_t ret;
1189
1190	/* Allocate arrays. */
1191	dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1192	dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1193	dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1194	dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1195	if (BE (dfa->nexts == NULL \|\| dfa->org_indices == NULL \|\| dfa->edests == NULL
1196	\|\| dfa->eclosures == NULL, 0))
1197	return REG_ESPACE;
1198
1199	dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1200	if (dfa->subexp_map != NULL)
1201	{
1202	Idx i;
1203	for (i = 0; i < preg->re_nsub; i++)
1204	dfa->subexp_map[i] = i;
1205	preorder (dfa->str_tree, optimize_subexps, dfa);
1206	for (i = 0; i < preg->re_nsub; i++)
1207	if (dfa->subexp_map[i] != i)
1208	break;
1209	if (i == preg->re_nsub)
1210	{
1211	free (dfa->subexp_map);
1212	dfa->subexp_map = NULL;
1213	}
1214	}
1215
1216	ret = postorder (dfa->str_tree, lower_subexps, preg);
1217	if (BE (ret != REG_NOERROR, 0))
1218	return ret;
1219	ret = postorder (dfa->str_tree, calc_first, dfa);
1220	if (BE (ret != REG_NOERROR, 0))
1221	return ret;
1222	preorder (dfa->str_tree, calc_next, dfa);
1223	ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1224	if (BE (ret != REG_NOERROR, 0))
1225	return ret;
1226	ret = calc_eclosure (dfa);
1227	if (BE (ret != REG_NOERROR, 0))
1228	return ret;
1229
1230	/* We only need this during the prune_impossible_nodes pass in regexec.c;
1231	skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1232	if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1233	\|\| dfa->nbackref)
1234	{
1235	dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1236	if (BE (dfa->inveclosures == NULL, 0))
1237	return REG_ESPACE;
1238	ret = calc_inveclosure (dfa);
1239	}
1240
1241	return ret;
1242	}
1243
1244	/* Our parse trees are very unbalanced, so we cannot use a stack to
1245	implement parse tree visits. Instead, we use parent pointers and
1246	some hairy code in these two functions. */
1247	static reg_errcode_t
1248	postorder (bin_tree_t root, reg_errcode_t (fn (void , bin_tree_t *)),
1249	void *extra)
1250	{
1251	bin_tree_t node, prev;
1252
1253	for (node = root; ; )
1254	{
1255	/* Descend down the tree, preferably to the left (or to the right
1256	if that's the only child). */
1257	while (node->left \|\| node->right)
1258	if (node->left)
1259	node = node->left;
1260	else
1261	node = node->right;
1262
1263	do
1264	{
1265	reg_errcode_t err = fn (extra, node);
1266	if (BE (err != REG_NOERROR, 0))
1267	return err;
1268	if (node->parent == NULL)
1269	return REG_NOERROR;
1270	prev = node;
1271	node = node->parent;
1272	}
1273	/* Go up while we have a node that is reached from the right. */
1274	while (node->right == prev \|\| node->right == NULL);
1275	node = node->right;
1276	}
1277	}
1278
1279	static reg_errcode_t
1280	preorder (bin_tree_t root, reg_errcode_t (fn (void , bin_tree_t *)),
1281	void *extra)
1282	{
1283	bin_tree_t *node;
1284
1285	for (node = root; ; )
1286	{
1287	reg_errcode_t err = fn (extra, node);
1288	if (BE (err != REG_NOERROR, 0))
1289	return err;
1290
1291	/* Go to the left node, or up and to the right. */
1292	if (node->left)
1293	node = node->left;
1294	else
1295	{
1296	bin_tree_t *prev = NULL;
1297	while (node->right == prev \|\| node->right == NULL)
1298	{
1299	prev = node;
1300	node = node->parent;
1301	if (!node)
1302	return REG_NOERROR;
1303	}
1304	node = node->right;
1305	}
1306	}
1307	}
1308
1309	/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1310	re_search_internal to map the inner one's opr.idx to this one's. Adjust
1311	backreferences as well. Requires a preorder visit. */
1312	static reg_errcode_t
1313	optimize_subexps (void extra, bin_tree_t node)
1314	{
1315	re_dfa_t dfa = (re_dfa_t ) extra;
1316
1317	if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1318	{
1319	int idx = node->token.opr.idx;
1320	node->token.opr.idx = dfa->subexp_map[idx];
1321	dfa->used_bkref_map \|= 1 << node->token.opr.idx;
1322	}
1323
1324	else if (node->token.type == SUBEXP
1325	&& node->left && node->left->token.type == SUBEXP)
1326	{
1327	Idx other_idx = node->left->token.opr.idx;
1328
1329	node->left = node->left->left;
1330	if (node->left)
1331	node->left->parent = node;
1332
1333	dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1334	if (other_idx < BITSET_WORD_BITS)
1335	dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1336	}
1337
1338	return REG_NOERROR;
1339	}
1340
1341	/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1342	of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1343	static reg_errcode_t
1344	lower_subexps (void extra, bin_tree_t node)
1345	{
1346	regex_t preg = (regex_t ) extra;
1347	reg_errcode_t err = REG_NOERROR;
1348
1349	if (node->left && node->left->token.type == SUBEXP)
1350	{
1351	node->left = lower_subexp (&err, preg, node->left);
1352	if (node->left)
1353	node->left->parent = node;
1354	}
1355	if (node->right && node->right->token.type == SUBEXP)
1356	{
1357	node->right = lower_subexp (&err, preg, node->right);
1358	if (node->right)
1359	node->right->parent = node;
1360	}
1361
1362	return err;
1363	}
1364
1365	static bin_tree_t *
1366	lower_subexp (reg_errcode_t err, regex_t preg, bin_tree_t *node)
1367	{
1368	re_dfa_t *dfa = preg->buffer;
1369	bin_tree_t *body = node->left;
1370	bin_tree_t op, cls, tree1, tree;
1371
1372	if (preg->no_sub
1373	/* We do not optimize empty subexpressions, because otherwise we may
1374	have bad CONCAT nodes with NULL children. This is obviously not
1375	very common, so we do not lose much. An example that triggers
1376	this case is the sed "script" //x. */
1377	&& node->left != NULL
1378	&& (node->token.opr.idx >= BITSET_WORD_BITS
1379	\|\| !(dfa->used_bkref_map
1380	& ((bitset_word_t) 1 << node->token.opr.idx))))
1381	return node->left;
1382
1383	/* Convert the SUBEXP node to the concatenation of an
1384	OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1385	op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1386	cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1387	tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1388	tree = create_tree (dfa, op, tree1, CONCAT);
1389	if (BE (tree == NULL \|\| tree1 == NULL \|\| op == NULL \|\| cls == NULL, 0))
1390	{
1391	*err = REG_ESPACE;
1392	return NULL;
1393	}
1394
1395	op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1396	op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1397	return tree;
1398	}
1399
1400	/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1401	nodes. Requires a postorder visit. */
1402	static reg_errcode_t
1403	calc_first (void extra, bin_tree_t node)
1404	{
1405	re_dfa_t dfa = (re_dfa_t ) extra;
1406	if (node->token.type == CONCAT)
1407	{
1408	node->first = node->left->first;
1409	node->node_idx = node->left->node_idx;
1410	}
1411	else
1412	{
1413	node->first = node;
1414	node->node_idx = re_dfa_add_node (dfa, node->token);
1415	if (BE (node->node_idx == REG_MISSING, 0))
1416	return REG_ESPACE;
1417	if (node->token.type == ANCHOR)
1418	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1419	}
1420	return REG_NOERROR;
1421	}
1422
1423	/* Pass 2: compute NEXT on the tree. Preorder visit. */
1424	static reg_errcode_t
1425	calc_next (void extra _UNUSED_PARAMETER_, bin_tree_t node)
1426	{
1427	switch (node->token.type)
1428	{
1429	case OP_DUP_ASTERISK:
1430	node->left->next = node;
1431	break;
1432	case CONCAT:
1433	node->left->next = node->right->first;
1434	node->right->next = node->next;
1435	break;
1436	default:
1437	if (node->left)
1438	node->left->next = node->next;
1439	if (node->right)
1440	node->right->next = node->next;
1441	break;
1442	}
1443	return REG_NOERROR;
1444	}
1445
1446	/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1447	static reg_errcode_t
1448	link_nfa_nodes (void extra, bin_tree_t node)
1449	{
1450	re_dfa_t dfa = (re_dfa_t ) extra;
1451	Idx idx = node->node_idx;
1452	reg_errcode_t err = REG_NOERROR;
1453
1454	switch (node->token.type)
1455	{
1456	case CONCAT:
1457	break;
1458
1459	case END_OF_RE:
1460	assert (node->next == NULL);
1461	break;
1462
1463	case OP_DUP_ASTERISK:
1464	case OP_ALT:
1465	{
1466	Idx left, right;
1467	dfa->has_plural_match = 1;
1468	if (node->left != NULL)
1469	left = node->left->first->node_idx;
1470	else
1471	left = node->next->node_idx;
1472	if (node->right != NULL)
1473	right = node->right->first->node_idx;
1474	else
1475	right = node->next->node_idx;
1476	assert (REG_VALID_INDEX (left));
1477	assert (REG_VALID_INDEX (right));
1478	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1479	}
1480	break;
1481
1482	case ANCHOR:
1483	case OP_OPEN_SUBEXP:
1484	case OP_CLOSE_SUBEXP:
1485	err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1486	break;
1487
1488	case OP_BACK_REF:
1489	dfa->nexts[idx] = node->next->node_idx;
1490	if (node->token.type == OP_BACK_REF)
1491	err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1492	break;
1493
1494	default:
1495	assert (!IS_EPSILON_NODE (node->token.type));
1496	dfa->nexts[idx] = node->next->node_idx;
1497	break;
1498	}
1499
1500	return err;
1501	}
1502
1503	/* Duplicate the epsilon closure of the node ROOT_NODE.
1504	Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1505	to their own constraint. */
1506
1507	static reg_errcode_t
1508	internal_function
1509	duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1510	Idx root_node, unsigned int init_constraint)
1511	{
1512	Idx org_node, clone_node;
1513	bool ok;
1514	unsigned int constraint = init_constraint;
1515	for (org_node = top_org_node, clone_node = top_clone_node;;)
1516	{
1517	Idx org_dest, clone_dest;
1518	if (dfa->nodes[org_node].type == OP_BACK_REF)
1519	{
1520	/* If the back reference epsilon-transit, its destination must
1521	also have the constraint. Then duplicate the epsilon closure
1522	of the destination of the back reference, and store it in
1523	edests of the back reference. */
1524	org_dest = dfa->nexts[org_node];
1525	re_node_set_empty (dfa->edests + clone_node);
1526	clone_dest = duplicate_node (dfa, org_dest, constraint);
1527	if (BE (clone_dest == REG_MISSING, 0))
1528	return REG_ESPACE;
1529	dfa->nexts[clone_node] = dfa->nexts[org_node];
1530	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1531	if (BE (! ok, 0))
1532	return REG_ESPACE;
1533	}
1534	else if (dfa->edests[org_node].nelem == 0)
1535	{
1536	/* In case of the node can't epsilon-transit, don't duplicate the
1537	destination and store the original destination as the
1538	destination of the node. */
1539	dfa->nexts[clone_node] = dfa->nexts[org_node];
1540	break;
1541	}
1542	else if (dfa->edests[org_node].nelem == 1)
1543	{
1544	/* In case of the node can epsilon-transit, and it has only one
1545	destination. */
1546	org_dest = dfa->edests[org_node].elems[0];
1547	re_node_set_empty (dfa->edests + clone_node);
1548	/* If the node is root_node itself, it means the epsilon closure
1549	has a loop. Then tie it to the destination of the root_node. */
1550	if (org_node == root_node && clone_node != org_node)
1551	{
1552	ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1553	if (BE (! ok, 0))
1554	return REG_ESPACE;
1555	break;
1556	}
1557	/* In case the node has another constraint, append it. */
1558	constraint \|= dfa->nodes[org_node].constraint;
1559	clone_dest = duplicate_node (dfa, org_dest, constraint);
1560	if (BE (clone_dest == REG_MISSING, 0))
1561	return REG_ESPACE;
1562	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1563	if (BE (! ok, 0))
1564	return REG_ESPACE;
1565	}
1566	else /* dfa->edests[org_node].nelem == 2 */
1567	{
1568	/* In case of the node can epsilon-transit, and it has two
1569	destinations. In the bin_tree_t and DFA, that's '\|' and ''. /
1570	org_dest = dfa->edests[org_node].elems[0];
1571	re_node_set_empty (dfa->edests + clone_node);
1572	/* Search for a duplicated node which satisfies the constraint. */
1573	clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1574	if (clone_dest == REG_MISSING)
1575	{
1576	/* There is no such duplicated node, create a new one. */
1577	reg_errcode_t err;
1578	clone_dest = duplicate_node (dfa, org_dest, constraint);
1579	if (BE (clone_dest == REG_MISSING, 0))
1580	return REG_ESPACE;
1581	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1582	if (BE (! ok, 0))
1583	return REG_ESPACE;
1584	err = duplicate_node_closure (dfa, org_dest, clone_dest,
1585	root_node, constraint);
1586	if (BE (err != REG_NOERROR, 0))
1587	return err;
1588	}
1589	else
1590	{
1591	/* There is a duplicated node which satisfies the constraint,
1592	use it to avoid infinite loop. */
1593	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1594	if (BE (! ok, 0))
1595	return REG_ESPACE;
1596	}
1597
1598	org_dest = dfa->edests[org_node].elems[1];
1599	clone_dest = duplicate_node (dfa, org_dest, constraint);
1600	if (BE (clone_dest == REG_MISSING, 0))
1601	return REG_ESPACE;
1602	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1603	if (BE (! ok, 0))
1604	return REG_ESPACE;
1605	}
1606	org_node = org_dest;
1607	clone_node = clone_dest;
1608	}
1609	return REG_NOERROR;
1610	}
1611
1612	/* Search for a node which is duplicated from the node ORG_NODE, and
1613	satisfies the constraint CONSTRAINT. */
1614
1615	static Idx
1616	search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1617	unsigned int constraint)
1618	{
1619	Idx idx;
1620	for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1621	{
1622	if (org_node == dfa->org_indices[idx]
1623	&& constraint == dfa->nodes[idx].constraint)
1624	return idx; /* Found. */
1625	}
1626	return REG_MISSING; /* Not found. */
1627	}
1628
1629	/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1630	Return the index of the new node, or REG_MISSING if insufficient storage is
1631	available. */
1632
1633	static Idx
1634	duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1635	{
1636	Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1637	if (BE (dup_idx != REG_MISSING, 1))
1638	{
1639	dfa->nodes[dup_idx].constraint = constraint;
1640	dfa->nodes[dup_idx].constraint \|= dfa->nodes[org_idx].constraint;
1641	dfa->nodes[dup_idx].duplicated = 1;
1642
1643	/* Store the index of the original node. */
1644	dfa->org_indices[dup_idx] = org_idx;
1645	}
1646	return dup_idx;
1647	}
1648
1649	static reg_errcode_t
1650	calc_inveclosure (re_dfa_t *dfa)
1651	{
1652	Idx src, idx;
1653	bool ok;
1654	for (idx = 0; idx < dfa->nodes_len; ++idx)
1655	re_node_set_init_empty (dfa->inveclosures + idx);
1656
1657	for (src = 0; src < dfa->nodes_len; ++src)
1658	{
1659	Idx *elems = dfa->eclosures[src].elems;
1660	for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1661	{
1662	ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1663	if (BE (! ok, 0))
1664	return REG_ESPACE;
1665	}
1666	}
1667
1668	return REG_NOERROR;
1669	}
1670
1671	/* Calculate "eclosure" for all the node in DFA. */
1672
1673	static reg_errcode_t
1674	calc_eclosure (re_dfa_t *dfa)
1675	{
1676	Idx node_idx;
1677	bool incomplete;
1678	#ifdef DEBUG
1679	assert (dfa->nodes_len > 0);
1680	#endif
1681	incomplete = false;
1682	/* For each nodes, calculate epsilon closure. */
1683	for (node_idx = 0; ; ++node_idx)
1684	{
1685	reg_errcode_t err;
1686	re_node_set eclosure_elem;
1687	if (node_idx == dfa->nodes_len)
1688	{
1689	if (!incomplete)
1690	break;
1691	incomplete = false;
1692	node_idx = 0;
1693	}
1694
1695	#ifdef DEBUG
1696	assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1697	#endif
1698
1699	/* If we have already calculated, skip it. */
1700	if (dfa->eclosures[node_idx].nelem != 0)
1701	continue;
1702	/* Calculate epsilon closure of 'node_idx'. */
1703	err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1704	if (BE (err != REG_NOERROR, 0))
1705	return err;
1706
1707	if (dfa->eclosures[node_idx].nelem == 0)
1708	{
1709	incomplete = true;
1710	re_node_set_free (&eclosure_elem);
1711	}
1712	}
1713	return REG_NOERROR;
1714	}
1715
1716	/* Calculate epsilon closure of NODE. */
1717
1718	static reg_errcode_t
1719	calc_eclosure_iter (re_node_set new_set, re_dfa_t dfa, Idx node, bool root)
1720	{
1721	reg_errcode_t err;
1722	Idx i;
1723	re_node_set eclosure;
1724	bool ok;
1725	bool incomplete = false;
1726	err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1727	if (BE (err != REG_NOERROR, 0))
1728	return err;
1729
1730	/* This indicates that we are calculating this node now.
1731	We reference this value to avoid infinite loop. */
1732	dfa->eclosures[node].nelem = REG_MISSING;
1733
1734	/* If the current node has constraints, duplicate all nodes
1735	since they must inherit the constraints. */
1736	if (dfa->nodes[node].constraint
1737	&& dfa->edests[node].nelem
1738	&& !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1739	{
1740	err = duplicate_node_closure (dfa, node, node, node,
1741	dfa->nodes[node].constraint);
1742	if (BE (err != REG_NOERROR, 0))
1743	return err;
1744	}
1745
1746	/* Expand each epsilon destination nodes. */
1747	if (IS_EPSILON_NODE(dfa->nodes[node].type))
1748	for (i = 0; i < dfa->edests[node].nelem; ++i)
1749	{
1750	re_node_set eclosure_elem;
1751	Idx edest = dfa->edests[node].elems[i];
1752	/* If calculating the epsilon closure of 'edest' is in progress,
1753	return intermediate result. */
1754	if (dfa->eclosures[edest].nelem == REG_MISSING)
1755	{
1756	incomplete = true;
1757	continue;
1758	}
1759	/* If we haven't calculated the epsilon closure of 'edest' yet,
1760	calculate now. Otherwise use calculated epsilon closure. */
1761	if (dfa->eclosures[edest].nelem == 0)
1762	{
1763	err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1764	if (BE (err != REG_NOERROR, 0))
1765	return err;
1766	}
1767	else
1768	eclosure_elem = dfa->eclosures[edest];
1769	/* Merge the epsilon closure of 'edest'. */
1770	err = re_node_set_merge (&eclosure, &eclosure_elem);
1771	if (BE (err != REG_NOERROR, 0))
1772	return err;
1773	/* If the epsilon closure of 'edest' is incomplete,
1774	the epsilon closure of this node is also incomplete. */
1775	if (dfa->eclosures[edest].nelem == 0)
1776	{
1777	incomplete = true;
1778	re_node_set_free (&eclosure_elem);
1779	}
1780	}
1781
1782	/* An epsilon closure includes itself. */
1783	ok = re_node_set_insert (&eclosure, node);
1784	if (BE (! ok, 0))
1785	return REG_ESPACE;
1786	if (incomplete && !root)
1787	dfa->eclosures[node].nelem = 0;
1788	else
1789	dfa->eclosures[node] = eclosure;
1790	*new_set = eclosure;
1791	return REG_NOERROR;
1792	}
1793
1794
1795	/* Functions for token which are used in the parser. */
1796
1797	/* Fetch a token from INPUT.
1798	We must not use this function inside bracket expressions. */
1799
1800	static void
1801	internal_function
1802	fetch_token (re_token_t result, re_string_t input, reg_syntax_t syntax)
1803	{
1804	re_string_skip_bytes (input, peek_token (result, input, syntax));
1805	}
1806
1807	/* Peek a token from INPUT, and return the length of the token.
1808	We must not use this function inside bracket expressions. */
1809
1810	static int
1811	internal_function
1812	peek_token (re_token_t token, re_string_t input, reg_syntax_t syntax)
1813	{
1814	unsigned char c;
1815
1816	if (re_string_eoi (input))
1817	{
1818	token->type = END_OF_RE;
1819	return 0;
1820	}
1821
1822	c = re_string_peek_byte (input, 0);
1823	token->opr.c = c;
1824
1825	token->word_char = 0;
1826	#ifdef RE_ENABLE_I18N
1827	token->mb_partial = 0;
1828	if (input->mb_cur_max > 1 &&
1829	!re_string_first_byte (input, re_string_cur_idx (input)))
1830	{
1831	token->type = CHARACTER;
1832	token->mb_partial = 1;
1833	return 1;
1834	}
1835	#endif
1836	if (c == '\\')
1837	{
1838	unsigned char c2;
1839	if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1840	{
1841	token->type = BACK_SLASH;
1842	return 1;
1843	}
1844
1845	c2 = re_string_peek_byte_case (input, 1);
1846	token->opr.c = c2;
1847	token->type = CHARACTER;
1848	#ifdef RE_ENABLE_I18N
1849	if (input->mb_cur_max > 1)
1850	{
1851	wint_t wc = re_string_wchar_at (input,
1852	re_string_cur_idx (input) + 1);
1853	token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1854	}
1855	else
1856	#endif
1857	token->word_char = IS_WORD_CHAR (c2) != 0;
1858
1859	switch (c2)
1860	{
1861	case '\|':
1862	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1863	token->type = OP_ALT;
1864	break;
1865	case '1': case '2': case '3': case '4': case '5':
1866	case '6': case '7': case '8': case '9':
1867	if (!(syntax & RE_NO_BK_REFS))
1868	{
1869	token->type = OP_BACK_REF;
1870	token->opr.idx = c2 - '1';
1871	}
1872	break;
1873	case '<':
1874	if (!(syntax & RE_NO_GNU_OPS))
1875	{
1876	token->type = ANCHOR;
1877	token->opr.ctx_type = WORD_FIRST;
1878	}
1879	break;
1880	case '>':
1881	if (!(syntax & RE_NO_GNU_OPS))
1882	{
1883	token->type = ANCHOR;
1884	token->opr.ctx_type = WORD_LAST;
1885	}
1886	break;
1887	case 'b':
1888	if (!(syntax & RE_NO_GNU_OPS))
1889	{
1890	token->type = ANCHOR;
1891	token->opr.ctx_type = WORD_DELIM;
1892	}
1893	break;
1894	case 'B':
1895	if (!(syntax & RE_NO_GNU_OPS))
1896	{
1897	token->type = ANCHOR;
1898	token->opr.ctx_type = NOT_WORD_DELIM;
1899	}
1900	break;
1901	case 'w':
1902	if (!(syntax & RE_NO_GNU_OPS))
1903	token->type = OP_WORD;
1904	break;
1905	case 'W':
1906	if (!(syntax & RE_NO_GNU_OPS))
1907	token->type = OP_NOTWORD;
1908	break;
1909	case 's':
1910	if (!(syntax & RE_NO_GNU_OPS))
1911	token->type = OP_SPACE;
1912	break;
1913	case 'S':
1914	if (!(syntax & RE_NO_GNU_OPS))
1915	token->type = OP_NOTSPACE;
1916	break;
1917	case '`':
1918	if (!(syntax & RE_NO_GNU_OPS))
1919	{
1920	token->type = ANCHOR;
1921	token->opr.ctx_type = BUF_FIRST;
1922	}
1923	break;
1924	case '\'':
1925	if (!(syntax & RE_NO_GNU_OPS))
1926	{
1927	token->type = ANCHOR;
1928	token->opr.ctx_type = BUF_LAST;
1929	}
1930	break;
1931	case '(':
1932	if (!(syntax & RE_NO_BK_PARENS))
1933	token->type = OP_OPEN_SUBEXP;
1934	break;
1935	case ')':
1936	if (!(syntax & RE_NO_BK_PARENS))
1937	token->type = OP_CLOSE_SUBEXP;
1938	break;
1939	case '+':
1940	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1941	token->type = OP_DUP_PLUS;
1942	break;
1943	case '?':
1944	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1945	token->type = OP_DUP_QUESTION;
1946	break;
1947	case '{':
1948	if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1949	token->type = OP_OPEN_DUP_NUM;
1950	break;
1951	case '}':
1952	if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1953	token->type = OP_CLOSE_DUP_NUM;
1954	break;
1955	default:
1956	break;
1957	}
1958	return 2;
1959	}
1960
1961	token->type = CHARACTER;
1962	#ifdef RE_ENABLE_I18N
1963	if (input->mb_cur_max > 1)
1964	{
1965	wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1966	token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1967	}
1968	else
1969	#endif
1970	token->word_char = IS_WORD_CHAR (token->opr.c);
1971
1972	switch (c)
1973	{
1974	case '\n':
1975	if (syntax & RE_NEWLINE_ALT)
1976	token->type = OP_ALT;
1977	break;
1978	case '\|':
1979	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1980	token->type = OP_ALT;
1981	break;
1982	case '*':
1983	token->type = OP_DUP_ASTERISK;
1984	break;
1985	case '+':
1986	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1987	token->type = OP_DUP_PLUS;
1988	break;
1989	case '?':
1990	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1991	token->type = OP_DUP_QUESTION;
1992	break;
1993	case '{':
1994	if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1995	token->type = OP_OPEN_DUP_NUM;
1996	break;
1997	case '}':
1998	if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1999	token->type = OP_CLOSE_DUP_NUM;
2000	break;
2001	case '(':
2002	if (syntax & RE_NO_BK_PARENS)
2003	token->type = OP_OPEN_SUBEXP;
2004	break;
2005	case ')':
2006	if (syntax & RE_NO_BK_PARENS)
2007	token->type = OP_CLOSE_SUBEXP;
2008	break;
2009	case '[':
2010	token->type = OP_OPEN_BRACKET;
2011	break;
2012	case '.':
2013	token->type = OP_PERIOD;
2014	break;
2015	case '^':
2016	if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS \| RE_CARET_ANCHORS_HERE)) &&
2017	re_string_cur_idx (input) != 0)
2018	{
2019	char prev = re_string_peek_byte (input, -1);
2020	if (!(syntax & RE_NEWLINE_ALT) \|\| prev != '\n')
2021	break;
2022	}
2023	token->type = ANCHOR;
2024	token->opr.ctx_type = LINE_FIRST;
2025	break;
2026	case '$':
2027	if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2028	re_string_cur_idx (input) + 1 != re_string_length (input))
2029	{
2030	re_token_t next;
2031	re_string_skip_bytes (input, 1);
2032	peek_token (&next, input, syntax);
2033	re_string_skip_bytes (input, -1);
2034	if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2035	break;
2036	}
2037	token->type = ANCHOR;
2038	token->opr.ctx_type = LINE_LAST;
2039	break;
2040	default:
2041	break;
2042	}
2043	return 1;
2044	}
2045
2046	/* Peek a token from INPUT, and return the length of the token.
2047	We must not use this function out of bracket expressions. */
2048
2049	static int
2050	internal_function
2051	peek_token_bracket (re_token_t token, re_string_t input, reg_syntax_t syntax)
2052	{
2053	unsigned char c;
2054	if (re_string_eoi (input))
2055	{
2056	token->type = END_OF_RE;
2057	return 0;
2058	}
2059	c = re_string_peek_byte (input, 0);
2060	token->opr.c = c;
2061
2062	#ifdef RE_ENABLE_I18N
2063	if (input->mb_cur_max > 1 &&
2064	!re_string_first_byte (input, re_string_cur_idx (input)))
2065	{
2066	token->type = CHARACTER;
2067	return 1;
2068	}
2069	#endif /* RE_ENABLE_I18N */
2070
2071	if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2072	&& re_string_cur_idx (input) + 1 < re_string_length (input))
2073	{
2074	/* In this case, '\' escape a character. */
2075	unsigned char c2;
2076	re_string_skip_bytes (input, 1);
2077	c2 = re_string_peek_byte (input, 0);
2078	token->opr.c = c2;
2079	token->type = CHARACTER;
2080	return 1;
2081	}
2082	if (c == '[') /* '[' is a special char in a bracket exps. */
2083	{
2084	unsigned char c2;
2085	int token_len;
2086	if (re_string_cur_idx (input) + 1 < re_string_length (input))
2087	c2 = re_string_peek_byte (input, 1);
2088	else
2089	c2 = 0;
2090	token->opr.c = c2;
2091	token_len = 2;
2092	switch (c2)
2093	{
2094	case '.':
2095	token->type = OP_OPEN_COLL_ELEM;
2096	break;
2097	case '=':
2098	token->type = OP_OPEN_EQUIV_CLASS;
2099	break;
2100	case ':':
2101	if (syntax & RE_CHAR_CLASSES)
2102	{
2103	token->type = OP_OPEN_CHAR_CLASS;
2104	break;
2105	}
2106	/* else fall through. */
2107	default:
2108	token->type = CHARACTER;
2109	token->opr.c = c;
2110	token_len = 1;
2111	break;
2112	}
2113	return token_len;
2114	}
2115	switch (c)
2116	{
2117	case '-':
2118	token->type = OP_CHARSET_RANGE;
2119	break;
2120	case ']':
2121	token->type = OP_CLOSE_BRACKET;
2122	break;
2123	case '^':
2124	token->type = OP_NON_MATCH_LIST;
2125	break;
2126	default:
2127	token->type = CHARACTER;
2128	}
2129	return 1;
2130	}
2131
2132
2133	/* Functions for parser. */
2134
2135	/* Entry point of the parser.
2136	Parse the regular expression REGEXP and return the structure tree.
2137	If an error occurs, ERR is set by error code, and return NULL.
2138	This function build the following tree, from regular expression <reg_exp>:
2139	CAT
2140	/ \
2141	/ \
2142	<reg_exp> EOR
2143
2144	CAT means concatenation.
2145	EOR means end of regular expression. */
2146
2147	static bin_tree_t *
2148	parse (re_string_t regexp, regex_t preg, reg_syntax_t syntax,
2149	reg_errcode_t *err)
2150	{
2151	re_dfa_t *dfa = preg->buffer;
2152	bin_tree_t tree, eor, *root;
2153	re_token_t current_token;
2154	dfa->syntax = syntax;
2155	fetch_token (&current_token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
2156	tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2157	if (BE (*err != REG_NOERROR && tree == NULL, 0))
2158	return NULL;
2159	eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2160	if (tree != NULL)
2161	root = create_tree (dfa, tree, eor, CONCAT);
2162	else
2163	root = eor;
2164	if (BE (eor == NULL \|\| root == NULL, 0))
2165	{
2166	*err = REG_ESPACE;
2167	return NULL;
2168	}
2169	return root;
2170	}
2171
2172	/* This function build the following tree, from regular expression
2173	<branch1>\|<branch2>:
2174	ALT
2175	/ \
2176	/ \
2177	<branch1> <branch2>
2178
2179	ALT means alternative, which represents the operator '\|'. */
2180
2181	static bin_tree_t *
2182	parse_reg_exp (re_string_t regexp, regex_t preg, re_token_t *token,
2183	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2184	{
2185	re_dfa_t *dfa = preg->buffer;
2186	bin_tree_t tree, branch = NULL;
2187	tree = parse_branch (regexp, preg, token, syntax, nest, err);
2188	if (BE (*err != REG_NOERROR && tree == NULL, 0))
2189	return NULL;
2190
2191	while (token->type == OP_ALT)
2192	{
2193	fetch_token (token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
2194	if (token->type != OP_ALT && token->type != END_OF_RE
2195	&& (nest == 0 \|\| token->type != OP_CLOSE_SUBEXP))
2196	{
2197	branch = parse_branch (regexp, preg, token, syntax, nest, err);
2198	if (BE (*err != REG_NOERROR && branch == NULL, 0))
2199	return NULL;
2200	}
2201	else
2202	branch = NULL;
2203	tree = create_tree (dfa, tree, branch, OP_ALT);
2204	if (BE (tree == NULL, 0))
2205	{
2206	*err = REG_ESPACE;
2207	return NULL;
2208	}
2209	}
2210	return tree;
2211	}
2212
2213	/* This function build the following tree, from regular expression
2214	<exp1><exp2>:
2215	CAT
2216	/ \
2217	/ \
2218	<exp1> <exp2>
2219
2220	CAT means concatenation. */
2221
2222	static bin_tree_t *
2223	parse_branch (re_string_t regexp, regex_t preg, re_token_t *token,
2224	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2225	{
2226	bin_tree_t tree, expr;
2227	re_dfa_t *dfa = preg->buffer;
2228	tree = parse_expression (regexp, preg, token, syntax, nest, err);
2229	if (BE (*err != REG_NOERROR && tree == NULL, 0))
2230	return NULL;
2231
2232	while (token->type != OP_ALT && token->type != END_OF_RE
2233	&& (nest == 0 \|\| token->type != OP_CLOSE_SUBEXP))
2234	{
2235	expr = parse_expression (regexp, preg, token, syntax, nest, err);
2236	if (BE (*err != REG_NOERROR && expr == NULL, 0))
2237	{
2238	if (tree != NULL)
2239	postorder (tree, free_tree, NULL);
2240	return NULL;
2241	}
2242	if (tree != NULL && expr != NULL)
2243	{
2244	bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2245	if (newtree == NULL)
2246	{
2247	postorder (expr, free_tree, NULL);
2248	postorder (tree, free_tree, NULL);
2249	*err = REG_ESPACE;
2250	return NULL;
2251	}
2252	tree = newtree;
2253	}
2254	else if (tree == NULL)
2255	tree = expr;
2256	/* Otherwise expr == NULL, we don't need to create new tree. */
2257	}
2258	return tree;
2259	}
2260
2261	/* This function build the following tree, from regular expression a*:
2262	*
2263	\|
2264	a
2265	*/
2266
2267	static bin_tree_t *
2268	parse_expression (re_string_t regexp, regex_t preg, re_token_t *token,
2269	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2270	{
2271	re_dfa_t *dfa = preg->buffer;
2272	bin_tree_t *tree;
2273	switch (token->type)
2274	{
2275	case CHARACTER:
2276	tree = create_token_tree (dfa, NULL, NULL, token);
2277	if (BE (tree == NULL, 0))
2278	{
2279	*err = REG_ESPACE;
2280	return NULL;
2281	}
2282	#ifdef RE_ENABLE_I18N
2283	if (dfa->mb_cur_max > 1)
2284	{
2285	while (!re_string_eoi (regexp)
2286	&& !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2287	{
2288	bin_tree_t *mbc_remain;
2289	fetch_token (token, regexp, syntax);
2290	mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2291	tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2292	if (BE (mbc_remain == NULL \|\| tree == NULL, 0))
2293	{
2294	*err = REG_ESPACE;
2295	return NULL;
2296	}
2297	}
2298	}
2299	#endif
2300	break;
2301	case OP_OPEN_SUBEXP:
2302	tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2303	if (BE (*err != REG_NOERROR && tree == NULL, 0))
2304	return NULL;
2305	break;
2306	case OP_OPEN_BRACKET:
2307	tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2308	if (BE (*err != REG_NOERROR && tree == NULL, 0))
2309	return NULL;
2310	break;
2311	case OP_BACK_REF:
2312	if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2313	{
2314	*err = REG_ESUBREG;
2315	return NULL;
2316	}
2317	dfa->used_bkref_map \|= 1 << token->opr.idx;
2318	tree = create_token_tree (dfa, NULL, NULL, token);
2319	if (BE (tree == NULL, 0))
2320	{
2321	*err = REG_ESPACE;
2322	return NULL;
2323	}
2324	++dfa->nbackref;
2325	dfa->has_mb_node = 1;
2326	break;
2327	case OP_OPEN_DUP_NUM:
2328	if (syntax & RE_CONTEXT_INVALID_DUP)
2329	{
2330	*err = REG_BADRPT;
2331	return NULL;
2332	}
2333	/* FALLTHROUGH */
2334	case OP_DUP_ASTERISK:
2335	case OP_DUP_PLUS:
2336	case OP_DUP_QUESTION:
2337	if (syntax & RE_CONTEXT_INVALID_OPS)
2338	{
2339	*err = REG_BADRPT;
2340	return NULL;
2341	}
2342	else if (syntax & RE_CONTEXT_INDEP_OPS)
2343	{
2344	fetch_token (token, regexp, syntax);
2345	return parse_expression (regexp, preg, token, syntax, nest, err);
2346	}
2347	/* else fall through */
2348	case OP_CLOSE_SUBEXP:
2349	if ((token->type == OP_CLOSE_SUBEXP) &&
2350	!(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2351	{
2352	*err = REG_ERPAREN;
2353	return NULL;
2354	}
2355	/* else fall through */
2356	case OP_CLOSE_DUP_NUM:
2357	/* We treat it as a normal character. */
2358
2359	/* Then we can these characters as normal characters. */
2360	token->type = CHARACTER;
2361	/* mb_partial and word_char bits should be initialized already
2362	by peek_token. */
2363	tree = create_token_tree (dfa, NULL, NULL, token);
2364	if (BE (tree == NULL, 0))
2365	{
2366	*err = REG_ESPACE;
2367	return NULL;
2368	}
2369	break;
2370	case ANCHOR:
2371	if ((token->opr.ctx_type
2372	& (WORD_DELIM \| NOT_WORD_DELIM \| WORD_FIRST \| WORD_LAST))
2373	&& dfa->word_ops_used == 0)
2374	init_word_char (dfa);
2375	if (token->opr.ctx_type == WORD_DELIM
2376	\|\| token->opr.ctx_type == NOT_WORD_DELIM)
2377	{
2378	bin_tree_t tree_first, tree_last;
2379	if (token->opr.ctx_type == WORD_DELIM)
2380	{
2381	token->opr.ctx_type = WORD_FIRST;
2382	tree_first = create_token_tree (dfa, NULL, NULL, token);
2383	token->opr.ctx_type = WORD_LAST;
2384	}
2385	else
2386	{
2387	token->opr.ctx_type = INSIDE_WORD;
2388	tree_first = create_token_tree (dfa, NULL, NULL, token);
2389	token->opr.ctx_type = INSIDE_NOTWORD;
2390	}
2391	tree_last = create_token_tree (dfa, NULL, NULL, token);
2392	tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2393	if (BE (tree_first == NULL \|\| tree_last == NULL \|\| tree == NULL, 0))
2394	{
2395	*err = REG_ESPACE;
2396	return NULL;
2397	}
2398	}
2399	else
2400	{
2401	tree = create_token_tree (dfa, NULL, NULL, token);
2402	if (BE (tree == NULL, 0))
2403	{
2404	*err = REG_ESPACE;
2405	return NULL;
2406	}
2407	}
2408	/* We must return here, since ANCHORs can't be followed
2409	by repetition operators.
2410	eg. RE"^" is invalid or "<ANCHOR(^)><CHAR()>",
2411	it must not be "<ANCHOR(^)><REPEAT()>". /
2412	fetch_token (token, regexp, syntax);
2413	return tree;
2414	case OP_PERIOD:
2415	tree = create_token_tree (dfa, NULL, NULL, token);
2416	if (BE (tree == NULL, 0))
2417	{
2418	*err = REG_ESPACE;
2419	return NULL;
2420	}
2421	if (dfa->mb_cur_max > 1)
2422	dfa->has_mb_node = 1;
2423	break;
2424	case OP_WORD:
2425	case OP_NOTWORD:
2426	tree = build_charclass_op (dfa, regexp->trans,
2427	(const unsigned char *) "alnum",
2428	(const unsigned char *) "_",
2429	token->type == OP_NOTWORD, err);
2430	if (BE (*err != REG_NOERROR && tree == NULL, 0))
2431	return NULL;
2432	break;
2433	case OP_SPACE:
2434	case OP_NOTSPACE:
2435	tree = build_charclass_op (dfa, regexp->trans,
2436	(const unsigned char *) "space",
2437	(const unsigned char *) "",
2438	token->type == OP_NOTSPACE, err);
2439	if (BE (*err != REG_NOERROR && tree == NULL, 0))
2440	return NULL;
2441	break;
2442	case OP_ALT:
2443	case END_OF_RE:
2444	return NULL;
2445	case BACK_SLASH:
2446	*err = REG_EESCAPE;
2447	return NULL;
2448	default:
2449	/* Must not happen? */
2450	#ifdef DEBUG
2451	assert (0);
2452	#endif
2453	return NULL;
2454	}
2455	fetch_token (token, regexp, syntax);
2456
2457	while (token->type == OP_DUP_ASTERISK \|\| token->type == OP_DUP_PLUS
2458	\|\| token->type == OP_DUP_QUESTION \|\| token->type == OP_OPEN_DUP_NUM)
2459	{
2460	tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2461	if (BE (*err != REG_NOERROR && tree == NULL, 0))
2462	return NULL;
2463	/* In BRE consecutive duplications are not allowed. */
2464	if ((syntax & RE_CONTEXT_INVALID_DUP)
2465	&& (token->type == OP_DUP_ASTERISK
2466	\|\| token->type == OP_OPEN_DUP_NUM))
2467	{
2468	*err = REG_BADRPT;
2469	return NULL;
2470	}
2471	}
2472
2473	return tree;
2474	}
2475
2476	/* This function build the following tree, from regular expression
2477	(<reg_exp>):
2478	SUBEXP
2479	\|
2480	<reg_exp>
2481	*/
2482
2483	static bin_tree_t *
2484	parse_sub_exp (re_string_t regexp, regex_t preg, re_token_t *token,
2485	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2486	{
2487	re_dfa_t *dfa = preg->buffer;
2488	bin_tree_t *tree;
2489	size_t cur_nsub;
2490	cur_nsub = preg->re_nsub++;
2491
2492	fetch_token (token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
2493
2494	/* The subexpression may be a null string. */
2495	if (token->type == OP_CLOSE_SUBEXP)
2496	tree = NULL;
2497	else
2498	{
2499	tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2500	if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2501	{
2502	if (tree != NULL)
2503	postorder (tree, free_tree, NULL);
2504	*err = REG_EPAREN;
2505	}
2506	if (BE (*err != REG_NOERROR, 0))
2507	return NULL;
2508	}
2509
2510	if (cur_nsub <= '9' - '1')
2511	dfa->completed_bkref_map \|= 1 << cur_nsub;
2512
2513	tree = create_tree (dfa, tree, NULL, SUBEXP);
2514	if (BE (tree == NULL, 0))
2515	{
2516	*err = REG_ESPACE;
2517	return NULL;
2518	}
2519	tree->token.opr.idx = cur_nsub;
2520	return tree;
2521	}
2522
2523	/* This function parse repetition operators like "", "+", "{1,3}" etc. /
2524
2525	static bin_tree_t *
2526	parse_dup_op (bin_tree_t elem, re_string_t regexp, re_dfa_t *dfa,
2527	re_token_t token, reg_syntax_t syntax, reg_errcode_t err)
2528	{
2529	bin_tree_t tree = NULL, old_tree = NULL;
2530	Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2531	re_token_t start_token = *token;
2532
2533	if (token->type == OP_OPEN_DUP_NUM)
2534	{
2535	end = 0;
2536	start = fetch_number (regexp, token, syntax);
2537	if (start == REG_MISSING)
2538	{
2539	if (token->type == CHARACTER && token->opr.c == ',')
2540	start = 0; /* We treat "{,m}" as "{0,m}". */
2541	else
2542	{
2543	err = REG_BADBR; / <re>{} is invalid. */
2544	return NULL;
2545	}
2546	}
2547	if (BE (start != REG_ERROR, 1))
2548	{
2549	/* We treat "{n}" as "{n,n}". */
2550	end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2551	: ((token->type == CHARACTER && token->opr.c == ',')
2552	? fetch_number (regexp, token, syntax) : REG_ERROR));
2553	}
2554	if (BE (start == REG_ERROR \|\| end == REG_ERROR, 0))
2555	{
2556	/* Invalid sequence. */
2557	if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2558	{
2559	if (token->type == END_OF_RE)
2560	*err = REG_EBRACE;
2561	else
2562	*err = REG_BADBR;
2563
2564	return NULL;
2565	}
2566
2567	/* If the syntax bit is set, rollback. */
2568	re_string_set_index (regexp, start_idx);
2569	*token = start_token;
2570	token->type = CHARACTER;
2571	/* mb_partial and word_char bits should be already initialized by
2572	peek_token. */
2573	return elem;
2574	}
2575
2576	if (BE ((end != REG_MISSING && start > end)
2577	\|\| token->type != OP_CLOSE_DUP_NUM, 0))
2578	{
2579	/* First number greater than second. */
2580	*err = REG_BADBR;
2581	return NULL;
2582	}
2583
2584	if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0))
2585	{
2586	*err = REG_ESIZE;
2587	return NULL;
2588	}
2589	}
2590	else
2591	{
2592	start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2593	end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2594	}
2595
2596	fetch_token (token, regexp, syntax);
2597
2598	if (BE (elem == NULL, 0))
2599	return NULL;
2600	if (BE (start == 0 && end == 0, 0))
2601	{
2602	postorder (elem, free_tree, NULL);
2603	return NULL;
2604	}
2605
2606	/* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2607	if (BE (start > 0, 0))
2608	{
2609	tree = elem;
2610	for (i = 2; i <= start; ++i)
2611	{
2612	elem = duplicate_tree (elem, dfa);
2613	tree = create_tree (dfa, tree, elem, CONCAT);
2614	if (BE (elem == NULL \|\| tree == NULL, 0))
2615	goto parse_dup_op_espace;
2616	}
2617
2618	if (start == end)
2619	return tree;
2620
2621	/* Duplicate ELEM before it is marked optional. */
2622	elem = duplicate_tree (elem, dfa);
2623	old_tree = tree;
2624	}
2625	else
2626	old_tree = NULL;
2627
2628	if (elem->token.type == SUBEXP)
2629	postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2630
2631	tree = create_tree (dfa, elem, NULL,
2632	(end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2633	if (BE (tree == NULL, 0))
2634	goto parse_dup_op_espace;
2635
2636	/* From gnulib's "intprops.h":
2637	True if the arithmetic type T is signed. */
2638	#define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2639
2640	/* This loop is actually executed only when end != REG_MISSING,
2641	to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2642	already created the start+1-th copy. */
2643	if (TYPE_SIGNED (Idx) \|\| end != REG_MISSING)
2644	for (i = start + 2; i <= end; ++i)
2645	{
2646	elem = duplicate_tree (elem, dfa);
2647	tree = create_tree (dfa, tree, elem, CONCAT);
2648	if (BE (elem == NULL \|\| tree == NULL, 0))
2649	goto parse_dup_op_espace;
2650
2651	tree = create_tree (dfa, tree, NULL, OP_ALT);
2652	if (BE (tree == NULL, 0))
2653	goto parse_dup_op_espace;
2654	}
2655
2656	if (old_tree)
2657	tree = create_tree (dfa, old_tree, tree, CONCAT);
2658
2659	return tree;
2660
2661	parse_dup_op_espace:
2662	*err = REG_ESPACE;
2663	return NULL;
2664	}
2665
2666	/* Size of the names for collating symbol/equivalence_class/character_class.
2667	I'm not sure, but maybe enough. */
2668	#define BRACKET_NAME_BUF_SIZE 32
2669
2670	#ifndef _LIBC
2671	/* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2672	Build the range expression which starts from START_ELEM, and ends
2673	at END_ELEM. The result are written to MBCSET and SBCSET.
2674	RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2675	mbcset->range_ends, is a pointer argument since we may
2676	update it. */
2677
2678	static reg_errcode_t
2679	internal_function
2680	# ifdef RE_ENABLE_I18N
2681	build_range_exp (const reg_syntax_t syntax,
2682	bitset_t sbcset,
2683	re_charset_t *mbcset,
2684	Idx *range_alloc,
2685	const bracket_elem_t *start_elem,
2686	const bracket_elem_t *end_elem)
2687	# else /* not RE_ENABLE_I18N */
2688	build_range_exp (const reg_syntax_t syntax,
2689	bitset_t sbcset,
2690	const bracket_elem_t *start_elem,
2691	const bracket_elem_t *end_elem)
2692	# endif /* not RE_ENABLE_I18N */
2693	{
2694	unsigned int start_ch, end_ch;
2695	/* Equivalence Classes and Character Classes can't be a range start/end. */
2696	if (BE (start_elem->type == EQUIV_CLASS \|\| start_elem->type == CHAR_CLASS
2697	\|\| end_elem->type == EQUIV_CLASS \|\| end_elem->type == CHAR_CLASS,
2698	0))
2699	return REG_ERANGE;
2700
2701	/* We can handle no multi character collating elements without libc
2702	support. */
2703	if (BE ((start_elem->type == COLL_SYM
2704	&& strlen ((char *) start_elem->opr.name) > 1)
2705	\|\| (end_elem->type == COLL_SYM
2706	&& strlen ((char *) end_elem->opr.name) > 1), 0))
2707	return REG_ECOLLATE;
2708
2709	# ifdef RE_ENABLE_I18N
2710	{
2711	wchar_t wc;
2712	wint_t start_wc;
2713	wint_t end_wc;
2714	wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2715
2716	start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2717	: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2718	: 0));
2719	end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2720	: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2721	: 0));
2722	start_wc = ((start_elem->type == SB_CHAR \|\| start_elem->type == COLL_SYM)
2723	? __btowc (start_ch) : start_elem->opr.wch);
2724	end_wc = ((end_elem->type == SB_CHAR \|\| end_elem->type == COLL_SYM)
2725	? __btowc (end_ch) : end_elem->opr.wch);
2726	if (start_wc == WEOF \|\| end_wc == WEOF)
2727	return REG_ECOLLATE;
2728	cmp_buf[0] = start_wc;
2729	cmp_buf[4] = end_wc;
2730
2731	if (BE ((syntax & RE_NO_EMPTY_RANGES)
2732	&& wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
2733	return REG_ERANGE;
2734
2735	/* Got valid collation sequence values, add them as a new entry.
2736	However, for !_LIBC we have no collation elements: if the
2737	character set is single byte, the single byte character set
2738	that we build below suffices. parse_bracket_exp passes
2739	no MBCSET if dfa->mb_cur_max == 1. */
2740	if (mbcset)
2741	{
2742	/* Check the space of the arrays. */
2743	if (BE (*range_alloc == mbcset->nranges, 0))
2744	{
2745	/* There is not enough space, need realloc. */
2746	wchar_t new_array_start, new_array_end;
2747	Idx new_nranges;
2748
2749	/* +1 in case of mbcset->nranges is 0. */
2750	new_nranges = 2 * mbcset->nranges + 1;
2751	/* Use realloc since mbcset->range_starts and mbcset->range_ends
2752	are NULL if range_alloc == 0. /
2753	new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2754	new_nranges);
2755	new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2756	new_nranges);
2757
2758	if (BE (new_array_start == NULL \|\| new_array_end == NULL, 0))
2759	return REG_ESPACE;
2760
2761	mbcset->range_starts = new_array_start;
2762	mbcset->range_ends = new_array_end;
2763	*range_alloc = new_nranges;
2764	}
2765
2766	mbcset->range_starts[mbcset->nranges] = start_wc;
2767	mbcset->range_ends[mbcset->nranges++] = end_wc;
2768	}
2769
2770	/* Build the table for single byte characters. */
2771	for (wc = 0; wc < SBC_MAX; ++wc)
2772	{
2773	cmp_buf[2] = wc;
2774	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2775	&& wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2776	bitset_set (sbcset, wc);
2777	}
2778	}
2779	# else /* not RE_ENABLE_I18N */
2780	{
2781	unsigned int ch;
2782	start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2783	: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2784	: 0));
2785	end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2786	: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2787	: 0));
2788	if (start_ch > end_ch)
2789	return REG_ERANGE;
2790	/* Build the table for single byte characters. */
2791	for (ch = 0; ch < SBC_MAX; ++ch)
2792	if (start_ch <= ch && ch <= end_ch)
2793	bitset_set (sbcset, ch);
2794	}
2795	# endif /* not RE_ENABLE_I18N */
2796	return REG_NOERROR;
2797	}
2798	#endif /* not _LIBC */
2799
2800	#ifndef _LIBC
2801	/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2802	Build the collating element which is represented by NAME.
2803	The result are written to MBCSET and SBCSET.
2804	COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2805	pointer argument since we may update it. */
2806
2807	static reg_errcode_t
2808	internal_function
2809	# ifdef RE_ENABLE_I18N
2810	build_collating_symbol (bitset_t sbcset,
2811	re_charset_t *mbcset _UNUSED_PARAMETER_,
2812	Idx *coll_sym_alloc _UNUSED_PARAMETER_,
2813	const unsigned char *name)
2814	# else /* not RE_ENABLE_I18N */
2815	build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2816	# endif /* not RE_ENABLE_I18N */
2817	{
2818	size_t name_len = strlen ((const char *) name);
2819	if (BE (name_len != 1, 0))
2820	return REG_ECOLLATE;
2821	else
2822	{
2823	bitset_set (sbcset, name[0]);
2824	return REG_NOERROR;
2825	}
2826	}
2827	#endif /* not _LIBC */
2828
2829	/* This function parse bracket expression like "[abc]", "[a-c]",
2830	"[[.a-a.]]" etc. */
2831
2832	static bin_tree_t *
2833	parse_bracket_exp (re_string_t regexp, re_dfa_t dfa, re_token_t *token,
2834	reg_syntax_t syntax, reg_errcode_t *err)
2835	{
2836	#ifdef _LIBC
2837	const unsigned char *collseqmb;
2838	const char *collseqwc;
2839	uint32_t nrules;
2840	int32_t table_size;
2841	const int32_t *symb_table;
2842	const unsigned char *extra;
2843
2844	/* Local function for parse_bracket_exp used in _LIBC environment.
2845	Seek the collating symbol entry corresponding to NAME.
2846	Return the index of the symbol in the SYMB_TABLE. */
2847
2848	auto inline int32_t
2849	__attribute ((always_inline))
2850	seek_collating_symbol_entry (name, name_len)
2851	const unsigned char *name;
2852	size_t name_len;
2853	{
2854	int32_t hash = elem_hash ((const char *) name, name_len);
2855	int32_t elem = hash % table_size;
2856	if (symb_table[2 * elem] != 0)
2857	{
2858	int32_t second = hash % (table_size - 2) + 1;
2859
2860	do
2861	{
2862	/* First compare the hashing value. */
2863	if (symb_table[2 * elem] == hash
2864	/* Compare the length of the name. */
2865	&& name_len == extra[symb_table[2 * elem + 1]]
2866	/* Compare the name. */
2867	&& memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2868	name_len) == 0)
2869	{
2870	/* Yep, this is the entry. */
2871	break;
2872	}
2873
2874	/* Next entry. */
2875	elem += second;
2876	}
2877	while (symb_table[2 * elem] != 0);
2878	}
2879	return elem;
2880	}
2881
2882	/* Local function for parse_bracket_exp used in _LIBC environment.
2883	Look up the collation sequence value of BR_ELEM.
2884	Return the value if succeeded, UINT_MAX otherwise. */
2885
2886	auto inline unsigned int
2887	__attribute ((always_inline))
2888	lookup_collation_sequence_value (br_elem)
2889	bracket_elem_t *br_elem;
2890	{
2891	if (br_elem->type == SB_CHAR)
2892	{
2893	/*
2894	if (MB_CUR_MAX == 1)
2895	*/
2896	if (nrules == 0)
2897	return collseqmb[br_elem->opr.ch];
2898	else
2899	{
2900	wint_t wc = __btowc (br_elem->opr.ch);
2901	return __collseq_table_lookup (collseqwc, wc);
2902	}
2903	}
2904	else if (br_elem->type == MB_CHAR)
2905	{
2906	if (nrules != 0)
2907	return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2908	}
2909	else if (br_elem->type == COLL_SYM)
2910	{
2911	size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2912	if (nrules != 0)
2913	{
2914	int32_t elem, idx;
2915	elem = seek_collating_symbol_entry (br_elem->opr.name,
2916	sym_name_len);
2917	if (symb_table[2 * elem] != 0)
2918	{
2919	/* We found the entry. */
2920	idx = symb_table[2 * elem + 1];
2921	/* Skip the name of collating element name. */
2922	idx += 1 + extra[idx];
2923	/* Skip the byte sequence of the collating element. */
2924	idx += 1 + extra[idx];
2925	/* Adjust for the alignment. */
2926	idx = (idx + 3) & ~3;
2927	/* Skip the multibyte collation sequence value. */
2928	idx += sizeof (unsigned int);
2929	/* Skip the wide char sequence of the collating element. */
2930	idx += sizeof (unsigned int) *
2931	(1 + (unsigned int ) (extra + idx));
2932	/* Return the collation sequence value. */
2933	return (unsigned int ) (extra + idx);
2934	}
2935	else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2936	{
2937	/* No valid character. Match it as a single byte
2938	character. */
2939	return collseqmb[br_elem->opr.name[0]];
2940	}
2941	}
2942	else if (sym_name_len == 1)
2943	return collseqmb[br_elem->opr.name[0]];
2944	}
2945	return UINT_MAX;
2946	}
2947
2948	/* Local function for parse_bracket_exp used in _LIBC environment.
2949	Build the range expression which starts from START_ELEM, and ends
2950	at END_ELEM. The result are written to MBCSET and SBCSET.
2951	RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2952	mbcset->range_ends, is a pointer argument since we may
2953	update it. */
2954
2955	auto inline reg_errcode_t
2956	__attribute ((always_inline))
2957	build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2958	re_charset_t *mbcset;
2959	Idx *range_alloc;
2960	bitset_t sbcset;
2961	bracket_elem_t start_elem, end_elem;
2962	{
2963	unsigned int ch;
2964	uint32_t start_collseq;
2965	uint32_t end_collseq;
2966
2967	/* Equivalence Classes and Character Classes can't be a range
2968	start/end. */
2969	if (BE (start_elem->type == EQUIV_CLASS \|\| start_elem->type == CHAR_CLASS
2970	\|\| end_elem->type == EQUIV_CLASS \|\| end_elem->type == CHAR_CLASS,
2971	0))
2972	return REG_ERANGE;
2973
2974	start_collseq = lookup_collation_sequence_value (start_elem);
2975	end_collseq = lookup_collation_sequence_value (end_elem);
2976	/* Check start/end collation sequence values. */
2977	if (BE (start_collseq == UINT_MAX \|\| end_collseq == UINT_MAX, 0))
2978	return REG_ECOLLATE;
2979	if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2980	return REG_ERANGE;
2981
2982	/* Got valid collation sequence values, add them as a new entry.
2983	However, if we have no collation elements, and the character set
2984	is single byte, the single byte character set that we
2985	build below suffices. */
2986	if (nrules > 0 \|\| dfa->mb_cur_max > 1)
2987	{
2988	/* Check the space of the arrays. */
2989	if (BE (*range_alloc == mbcset->nranges, 0))
2990	{
2991	/* There is not enough space, need realloc. */
2992	uint32_t *new_array_start;
2993	uint32_t *new_array_end;
2994	Idx new_nranges;
2995
2996	/* +1 in case of mbcset->nranges is 0. */
2997	new_nranges = 2 * mbcset->nranges + 1;
2998	new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2999	new_nranges);
3000	new_array_end = re_realloc (mbcset->range_ends, uint32_t,
3001	new_nranges);
3002
3003	if (BE (new_array_start == NULL \|\| new_array_end == NULL, 0))
3004	return REG_ESPACE;
3005
3006	mbcset->range_starts = new_array_start;
3007	mbcset->range_ends = new_array_end;
3008	*range_alloc = new_nranges;
3009	}
3010
3011	mbcset->range_starts[mbcset->nranges] = start_collseq;
3012	mbcset->range_ends[mbcset->nranges++] = end_collseq;
3013	}
3014
3015	/* Build the table for single byte characters. */
3016	for (ch = 0; ch < SBC_MAX; ch++)
3017	{
3018	uint32_t ch_collseq;
3019	/*
3020	if (MB_CUR_MAX == 1)
3021	*/
3022	if (nrules == 0)
3023	ch_collseq = collseqmb[ch];
3024	else
3025	ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3026	if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3027	bitset_set (sbcset, ch);
3028	}
3029	return REG_NOERROR;
3030	}
3031
3032	/* Local function for parse_bracket_exp used in _LIBC environment.
3033	Build the collating element which is represented by NAME.
3034	The result are written to MBCSET and SBCSET.
3035	COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3036	pointer argument since we may update it. */
3037
3038	auto inline reg_errcode_t
3039	__attribute ((always_inline))
3040	build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3041	re_charset_t *mbcset;
3042	Idx *coll_sym_alloc;
3043	bitset_t sbcset;
3044	const unsigned char *name;
3045	{
3046	int32_t elem, idx;
3047	size_t name_len = strlen ((const char *) name);
3048	if (nrules != 0)
3049	{
3050	elem = seek_collating_symbol_entry (name, name_len);
3051	if (symb_table[2 * elem] != 0)
3052	{
3053	/* We found the entry. */
3054	idx = symb_table[2 * elem + 1];
3055	/* Skip the name of collating element name. */
3056	idx += 1 + extra[idx];
3057	}
3058	else if (symb_table[2 * elem] == 0 && name_len == 1)
3059	{
3060	/* No valid character, treat it as a normal
3061	character. */
3062	bitset_set (sbcset, name[0]);
3063	return REG_NOERROR;
3064	}
3065	else
3066	return REG_ECOLLATE;
3067
3068	/* Got valid collation sequence, add it as a new entry. */
3069	/* Check the space of the arrays. */
3070	if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3071	{
3072	/* Not enough, realloc it. */
3073	/* +1 in case of mbcset->ncoll_syms is 0. */
3074	Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3075	/* Use realloc since mbcset->coll_syms is NULL
3076	if alloc == 0. /
3077	int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3078	new_coll_sym_alloc);
3079	if (BE (new_coll_syms == NULL, 0))
3080	return REG_ESPACE;
3081	mbcset->coll_syms = new_coll_syms;
3082	*coll_sym_alloc = new_coll_sym_alloc;
3083	}
3084	mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3085	return REG_NOERROR;
3086	}
3087	else
3088	{
3089	if (BE (name_len != 1, 0))
3090	return REG_ECOLLATE;
3091	else
3092	{
3093	bitset_set (sbcset, name[0]);
3094	return REG_NOERROR;
3095	}
3096	}
3097	}
3098	#endif
3099
3100	re_token_t br_token;
3101	re_bitset_ptr_t sbcset;
3102	#ifdef RE_ENABLE_I18N
3103	re_charset_t *mbcset;
3104	Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3105	Idx equiv_class_alloc = 0, char_class_alloc = 0;
3106	#endif /* not RE_ENABLE_I18N */
3107	bool non_match = false;
3108	bin_tree_t *work_tree;
3109	int token_len;
3110	bool first_round = true;
3111	#ifdef _LIBC
3112	collseqmb = (const unsigned char *)
3113	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3114	nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3115	if (nrules)
3116	{
3117	/*
3118	if (MB_CUR_MAX > 1)
3119	*/
3120	collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3121	table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3122	symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3123	_NL_COLLATE_SYMB_TABLEMB);
3124	extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3125	_NL_COLLATE_SYMB_EXTRAMB);
3126	}
3127	#endif
3128	sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3129	#ifdef RE_ENABLE_I18N
3130	mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3131	#endif /* RE_ENABLE_I18N */
3132	#ifdef RE_ENABLE_I18N
3133	if (BE (sbcset == NULL \|\| mbcset == NULL, 0))
3134	#else
3135	if (BE (sbcset == NULL, 0))
3136	#endif /* RE_ENABLE_I18N */
3137	{
3138	re_free (sbcset);
3139	#ifdef RE_ENABLE_I18N
3140	re_free (mbcset);
3141	#endif
3142	*err = REG_ESPACE;
3143	return NULL;
3144	}
3145
3146	token_len = peek_token_bracket (token, regexp, syntax);
3147	if (BE (token->type == END_OF_RE, 0))
3148	{
3149	*err = REG_BADPAT;
3150	goto parse_bracket_exp_free_return;
3151	}
3152	if (token->type == OP_NON_MATCH_LIST)
3153	{
3154	#ifdef RE_ENABLE_I18N
3155	mbcset->non_match = 1;
3156	#endif /* not RE_ENABLE_I18N */
3157	non_match = true;
3158	if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3159	bitset_set (sbcset, '\n');
3160	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3161	token_len = peek_token_bracket (token, regexp, syntax);
3162	if (BE (token->type == END_OF_RE, 0))
3163	{
3164	*err = REG_BADPAT;
3165	goto parse_bracket_exp_free_return;
3166	}
3167	}
3168
3169	/* We treat the first ']' as a normal character. */
3170	if (token->type == OP_CLOSE_BRACKET)
3171	token->type = CHARACTER;
3172
3173	while (1)
3174	{
3175	bracket_elem_t start_elem, end_elem;
3176	unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3177	unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3178	reg_errcode_t ret;
3179	int token_len2 = 0;
3180	bool is_range_exp = false;
3181	re_token_t token2;
3182
3183	start_elem.opr.name = start_name_buf;
3184	ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3185	syntax, first_round);
3186	if (BE (ret != REG_NOERROR, 0))
3187	{
3188	*err = ret;
3189	goto parse_bracket_exp_free_return;
3190	}
3191	first_round = false;
3192
3193	/* Get information about the next token. We need it in any case. */
3194	token_len = peek_token_bracket (token, regexp, syntax);
3195
3196	/* Do not check for ranges if we know they are not allowed. */
3197	if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3198	{
3199	if (BE (token->type == END_OF_RE, 0))
3200	{
3201	*err = REG_EBRACK;
3202	goto parse_bracket_exp_free_return;
3203	}
3204	if (token->type == OP_CHARSET_RANGE)
3205	{
3206	re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3207	token_len2 = peek_token_bracket (&token2, regexp, syntax);
3208	if (BE (token2.type == END_OF_RE, 0))
3209	{
3210	*err = REG_EBRACK;
3211	goto parse_bracket_exp_free_return;
3212	}
3213	if (token2.type == OP_CLOSE_BRACKET)
3214	{
3215	/* We treat the last '-' as a normal character. */
3216	re_string_skip_bytes (regexp, -token_len);
3217	token->type = CHARACTER;
3218	}
3219	else
3220	is_range_exp = true;
3221	}
3222	}
3223
3224	if (is_range_exp == true)
3225	{
3226	end_elem.opr.name = end_name_buf;
3227	ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3228	dfa, syntax, true);
3229	if (BE (ret != REG_NOERROR, 0))
3230	{
3231	*err = ret;
3232	goto parse_bracket_exp_free_return;
3233	}
3234
3235	token_len = peek_token_bracket (token, regexp, syntax);
3236
3237	#ifdef _LIBC
3238	*err = build_range_exp (sbcset, mbcset, &range_alloc,
3239	&start_elem, &end_elem);
3240	#else
3241	# ifdef RE_ENABLE_I18N
3242	*err = build_range_exp (syntax, sbcset,
3243	dfa->mb_cur_max > 1 ? mbcset : NULL,
3244	&range_alloc, &start_elem, &end_elem);
3245	# else
3246	*err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3247	# endif
3248	#endif /* RE_ENABLE_I18N */
3249	if (BE (*err != REG_NOERROR, 0))
3250	goto parse_bracket_exp_free_return;
3251	}
3252	else
3253	{
3254	switch (start_elem.type)
3255	{
3256	case SB_CHAR:
3257	bitset_set (sbcset, start_elem.opr.ch);
3258	break;
3259	#ifdef RE_ENABLE_I18N
3260	case MB_CHAR:
3261	/* Check whether the array has enough space. */
3262	if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3263	{
3264	wchar_t *new_mbchars;
3265	/* Not enough, realloc it. */
3266	/* +1 in case of mbcset->nmbchars is 0. */
3267	mbchar_alloc = 2 * mbcset->nmbchars + 1;
3268	/* Use realloc since array is NULL if alloc == 0. /
3269	new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3270	mbchar_alloc);
3271	if (BE (new_mbchars == NULL, 0))
3272	goto parse_bracket_exp_espace;
3273	mbcset->mbchars = new_mbchars;
3274	}
3275	mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3276	break;
3277	#endif /* RE_ENABLE_I18N */
3278	case EQUIV_CLASS:
3279	*err = build_equiv_class (sbcset,
3280	#ifdef RE_ENABLE_I18N
3281	mbcset, &equiv_class_alloc,
3282	#endif /* RE_ENABLE_I18N */
3283	start_elem.opr.name);
3284	if (BE (*err != REG_NOERROR, 0))
3285	goto parse_bracket_exp_free_return;
3286	break;
3287	case COLL_SYM:
3288	*err = build_collating_symbol (sbcset,
3289	#ifdef RE_ENABLE_I18N
3290	mbcset, &coll_sym_alloc,
3291	#endif /* RE_ENABLE_I18N */
3292	start_elem.opr.name);
3293	if (BE (*err != REG_NOERROR, 0))
3294	goto parse_bracket_exp_free_return;
3295	break;
3296	case CHAR_CLASS:
3297	*err = build_charclass (regexp->trans, sbcset,
3298	#ifdef RE_ENABLE_I18N
3299	mbcset, &char_class_alloc,
3300	#endif /* RE_ENABLE_I18N */
3301	start_elem.opr.name, syntax);
3302	if (BE (*err != REG_NOERROR, 0))
3303	goto parse_bracket_exp_free_return;
3304	break;
3305	default:
3306	assert (0);
3307	break;
3308	}
3309	}
3310	if (BE (token->type == END_OF_RE, 0))
3311	{
3312	*err = REG_EBRACK;
3313	goto parse_bracket_exp_free_return;
3314	}
3315	if (token->type == OP_CLOSE_BRACKET)
3316	break;
3317	}
3318
3319	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3320
3321	/* If it is non-matching list. */
3322	if (non_match)
3323	bitset_not (sbcset);
3324
3325	#ifdef RE_ENABLE_I18N
3326	/* Ensure only single byte characters are set. */
3327	if (dfa->mb_cur_max > 1)
3328	bitset_mask (sbcset, dfa->sb_char);
3329
3330	if (mbcset->nmbchars \|\| mbcset->ncoll_syms \|\| mbcset->nequiv_classes
3331	\|\| mbcset->nranges \|\| (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3332	\|\| mbcset->non_match)))
3333	{
3334	bin_tree_t *mbc_tree;
3335	int sbc_idx;
3336	/* Build a tree for complex bracket. */
3337	dfa->has_mb_node = 1;
3338	br_token.type = COMPLEX_BRACKET;
3339	br_token.opr.mbcset = mbcset;
3340	mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3341	if (BE (mbc_tree == NULL, 0))
3342	goto parse_bracket_exp_espace;
3343	for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3344	if (sbcset[sbc_idx])
3345	break;
3346	/* If there are no bits set in sbcset, there is no point
3347	of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3348	if (sbc_idx < BITSET_WORDS)
3349	{
3350	/* Build a tree for simple bracket. */
3351	br_token.type = SIMPLE_BRACKET;
3352	br_token.opr.sbcset = sbcset;
3353	work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3354	if (BE (work_tree == NULL, 0))
3355	goto parse_bracket_exp_espace;
3356
3357	/* Then join them by ALT node. */
3358	work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3359	if (BE (work_tree == NULL, 0))
3360	goto parse_bracket_exp_espace;
3361	}
3362	else
3363	{
3364	re_free (sbcset);
3365	work_tree = mbc_tree;
3366	}
3367	}
3368	else
3369	#endif /* not RE_ENABLE_I18N */
3370	{
3371	#ifdef RE_ENABLE_I18N
3372	free_charset (mbcset);
3373	#endif
3374	/* Build a tree for simple bracket. */
3375	br_token.type = SIMPLE_BRACKET;
3376	br_token.opr.sbcset = sbcset;
3377	work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3378	if (BE (work_tree == NULL, 0))
3379	goto parse_bracket_exp_espace;
3380	}
3381	return work_tree;
3382
3383	parse_bracket_exp_espace:
3384	*err = REG_ESPACE;
3385	parse_bracket_exp_free_return:
3386	re_free (sbcset);
3387	#ifdef RE_ENABLE_I18N
3388	free_charset (mbcset);
3389	#endif /* RE_ENABLE_I18N */
3390	return NULL;
3391	}
3392
3393	/* Parse an element in the bracket expression. */
3394
3395	static reg_errcode_t
3396	parse_bracket_element (bracket_elem_t elem, re_string_t regexp,
3397	re_token_t *token, int token_len,
3398	re_dfa_t *dfa _UNUSED_PARAMETER_,
3399	reg_syntax_t syntax, bool accept_hyphen)
3400	{
3401	#ifdef RE_ENABLE_I18N
3402	int cur_char_size;
3403	cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3404	if (cur_char_size > 1)
3405	{
3406	elem->type = MB_CHAR;
3407	elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3408	re_string_skip_bytes (regexp, cur_char_size);
3409	return REG_NOERROR;
3410	}
3411	#endif /* RE_ENABLE_I18N */
3412	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3413	if (token->type == OP_OPEN_COLL_ELEM \|\| token->type == OP_OPEN_CHAR_CLASS
3414	\|\| token->type == OP_OPEN_EQUIV_CLASS)
3415	return parse_bracket_symbol (elem, regexp, token);
3416	if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3417	{
3418	/* A '-' must only appear as anything but a range indicator before
3419	the closing bracket. Everything else is an error. */
3420	re_token_t token2;
3421	(void) peek_token_bracket (&token2, regexp, syntax);
3422	if (token2.type != OP_CLOSE_BRACKET)
3423	/* The actual error value is not standardized since this whole
3424	case is undefined. But ERANGE makes good sense. */
3425	return REG_ERANGE;
3426	}
3427	elem->type = SB_CHAR;
3428	elem->opr.ch = token->opr.c;
3429	return REG_NOERROR;
3430	}
3431
3432	/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3433	such as [:<character_class>:], [.<collating_element>.], and
3434	[=<equivalent_class>=]. */
3435
3436	static reg_errcode_t
3437	parse_bracket_symbol (bracket_elem_t elem, re_string_t regexp,
3438	re_token_t *token)
3439	{
3440	unsigned char ch, delim = token->opr.c;
3441	int i = 0;
3442	if (re_string_eoi(regexp))
3443	return REG_EBRACK;
3444	for (;; ++i)
3445	{
3446	if (i >= BRACKET_NAME_BUF_SIZE)
3447	return REG_EBRACK;
3448	if (token->type == OP_OPEN_CHAR_CLASS)
3449	ch = re_string_fetch_byte_case (regexp);
3450	else
3451	ch = re_string_fetch_byte (regexp);
3452	if (re_string_eoi(regexp))
3453	return REG_EBRACK;
3454	if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3455	break;
3456	elem->opr.name[i] = ch;
3457	}
3458	re_string_skip_bytes (regexp, 1);
3459	elem->opr.name[i] = '\0';
3460	switch (token->type)
3461	{
3462	case OP_OPEN_COLL_ELEM:
3463	elem->type = COLL_SYM;
3464	break;
3465	case OP_OPEN_EQUIV_CLASS:
3466	elem->type = EQUIV_CLASS;
3467	break;
3468	case OP_OPEN_CHAR_CLASS:
3469	elem->type = CHAR_CLASS;
3470	break;
3471	default:
3472	break;
3473	}
3474	return REG_NOERROR;
3475	}
3476
3477	/* Helper function for parse_bracket_exp.
3478	Build the equivalence class which is represented by NAME.
3479	The result are written to MBCSET and SBCSET.
3480	EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3481	is a pointer argument since we may update it. */
3482
3483	static reg_errcode_t
3484	#ifdef RE_ENABLE_I18N
3485	build_equiv_class (bitset_t sbcset,
3486	re_charset_t *mbcset _UNUSED_PARAMETER_,
3487	Idx *equiv_class_alloc _UNUSED_PARAMETER_,
3488	const unsigned char *name)
3489	#else /* not RE_ENABLE_I18N */
3490	build_equiv_class (bitset_t sbcset, const unsigned char *name)
3491	#endif /* not RE_ENABLE_I18N */
3492	{
3493	#ifdef _LIBC
3494	uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3495	if (nrules != 0)
3496	{
3497	const int32_t table, indirect;
3498	const unsigned char weights, extra, *cp;
3499	unsigned char char_buf[2];
3500	int32_t idx1, idx2;
3501	unsigned int ch;
3502	size_t len;
3503	/* This #include defines a local function! */
3504	# include <locale/weight.h>
3505	/* Calculate the index for equivalence class. */
3506	cp = name;
3507	table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3508	weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3509	_NL_COLLATE_WEIGHTMB);
3510	extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3511	_NL_COLLATE_EXTRAMB);
3512	indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3513	_NL_COLLATE_INDIRECTMB);
3514	idx1 = findidx (&cp, -1);
3515	if (BE (idx1 == 0 \|\| *cp != '\0', 0))
3516	/* This isn't a valid character. */
3517	return REG_ECOLLATE;
3518
3519	/* Build single byte matching table for this equivalence class. */
3520	len = weights[idx1 & 0xffffff];
3521	for (ch = 0; ch < SBC_MAX; ++ch)
3522	{
3523	char_buf[0] = ch;
3524	cp = char_buf;
3525	idx2 = findidx (&cp, 1);
3526	/*
3527	idx2 = table[ch];
3528	*/
3529	if (idx2 == 0)
3530	/* This isn't a valid character. */
3531	continue;
3532	/* Compare only if the length matches and the collation rule
3533	index is the same. */
3534	if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3535	{
3536	int cnt = 0;
3537
3538	while (cnt <= len &&
3539	weights[(idx1 & 0xffffff) + 1 + cnt]
3540	== weights[(idx2 & 0xffffff) + 1 + cnt])
3541	++cnt;
3542
3543	if (cnt > len)
3544	bitset_set (sbcset, ch);
3545	}
3546	}
3547	/* Check whether the array has enough space. */
3548	if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3549	{
3550	/* Not enough, realloc it. */
3551	/* +1 in case of mbcset->nequiv_classes is 0. */
3552	Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3553	/* Use realloc since the array is NULL if alloc == 0. /
3554	int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3555	int32_t,
3556	new_equiv_class_alloc);
3557	if (BE (new_equiv_classes == NULL, 0))
3558	return REG_ESPACE;
3559	mbcset->equiv_classes = new_equiv_classes;
3560	*equiv_class_alloc = new_equiv_class_alloc;
3561	}
3562	mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3563	}
3564	else
3565	#endif /* _LIBC */
3566	{
3567	if (BE (strlen ((const char *) name) != 1, 0))
3568	return REG_ECOLLATE;
3569	bitset_set (sbcset, *name);
3570	}
3571	return REG_NOERROR;
3572	}
3573
3574	/* Helper function for parse_bracket_exp.
3575	Build the character class which is represented by NAME.
3576	The result are written to MBCSET and SBCSET.
3577	CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3578	is a pointer argument since we may update it. */
3579
3580	static reg_errcode_t
3581	#ifdef RE_ENABLE_I18N
3582	build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3583	re_charset_t mbcset, Idx char_class_alloc,
3584	const unsigned char *class_name, reg_syntax_t syntax)
3585	#else /* not RE_ENABLE_I18N */
3586	build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3587	const unsigned char *class_name, reg_syntax_t syntax)
3588	#endif /* not RE_ENABLE_I18N */
3589	{
3590	int i;
3591	const char name = (const char ) class_name;
3592
3593	/* In case of REG_ICASE "upper" and "lower" match the both of
3594	upper and lower cases. */
3595	if ((syntax & RE_ICASE)
3596	&& (strcmp (name, "upper") == 0 \|\| strcmp (name, "lower") == 0))
3597	name = "alpha";
3598
3599	#ifdef RE_ENABLE_I18N
3600	/* Check the space of the arrays. */
3601	if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3602	{
3603	/* Not enough, realloc it. */
3604	/* +1 in case of mbcset->nchar_classes is 0. */
3605	Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3606	/* Use realloc since array is NULL if alloc == 0. /
3607	wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3608	new_char_class_alloc);
3609	if (BE (new_char_classes == NULL, 0))
3610	return REG_ESPACE;
3611	mbcset->char_classes = new_char_classes;
3612	*char_class_alloc = new_char_class_alloc;
3613	}
3614	mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3615	#endif /* RE_ENABLE_I18N */
3616
3617	#define BUILD_CHARCLASS_LOOP(ctype_func) \
3618	do { \
3619	if (BE (trans != NULL, 0)) \
3620	{ \
3621	for (i = 0; i < SBC_MAX; ++i) \
3622	if (ctype_func (i)) \
3623	bitset_set (sbcset, trans[i]); \
3624	} \
3625	else \
3626	{ \
3627	for (i = 0; i < SBC_MAX; ++i) \
3628	if (ctype_func (i)) \
3629	bitset_set (sbcset, i); \
3630	} \
3631	} while (0)
3632
3633	if (strcmp (name, "alnum") == 0)
3634	BUILD_CHARCLASS_LOOP (isalnum);
3635	else if (strcmp (name, "cntrl") == 0)
3636	BUILD_CHARCLASS_LOOP (iscntrl);
3637	else if (strcmp (name, "lower") == 0)
3638	BUILD_CHARCLASS_LOOP (islower);
3639	else if (strcmp (name, "space") == 0)
3640	BUILD_CHARCLASS_LOOP (isspace);
3641	else if (strcmp (name, "alpha") == 0)
3642	BUILD_CHARCLASS_LOOP (isalpha);
3643	else if (strcmp (name, "digit") == 0)
3644	BUILD_CHARCLASS_LOOP (isdigit);
3645	else if (strcmp (name, "print") == 0)
3646	BUILD_CHARCLASS_LOOP (isprint);
3647	else if (strcmp (name, "upper") == 0)
3648	BUILD_CHARCLASS_LOOP (isupper);
3649	else if (strcmp (name, "blank") == 0)
3650	BUILD_CHARCLASS_LOOP (isblank);
3651	else if (strcmp (name, "graph") == 0)
3652	BUILD_CHARCLASS_LOOP (isgraph);
3653	else if (strcmp (name, "punct") == 0)
3654	BUILD_CHARCLASS_LOOP (ispunct);
3655	else if (strcmp (name, "xdigit") == 0)
3656	BUILD_CHARCLASS_LOOP (isxdigit);
3657	else
3658	return REG_ECTYPE;
3659
3660	return REG_NOERROR;
3661	}
3662
3663	static bin_tree_t *
3664	build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3665	const unsigned char *class_name,
3666	const unsigned char *extra, bool non_match,
3667	reg_errcode_t *err)
3668	{
3669	re_bitset_ptr_t sbcset;
3670	#ifdef RE_ENABLE_I18N
3671	re_charset_t *mbcset;
3672	Idx alloc = 0;
3673	#endif /* not RE_ENABLE_I18N */
3674	reg_errcode_t ret;
3675	re_token_t br_token;
3676	bin_tree_t *tree;
3677
3678	sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3679	#ifdef RE_ENABLE_I18N
3680	mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3681	#endif /* RE_ENABLE_I18N */
3682
3683	#ifdef RE_ENABLE_I18N
3684	if (BE (sbcset == NULL \|\| mbcset == NULL, 0))
3685	#else /* not RE_ENABLE_I18N */
3686	if (BE (sbcset == NULL, 0))
3687	#endif /* not RE_ENABLE_I18N */
3688	{
3689	*err = REG_ESPACE;
3690	return NULL;
3691	}
3692
3693	if (non_match)
3694	{
3695	#ifdef RE_ENABLE_I18N
3696	mbcset->non_match = 1;
3697	#endif /* not RE_ENABLE_I18N */
3698	}
3699
3700	/* We don't care the syntax in this case. */
3701	ret = build_charclass (trans, sbcset,
3702	#ifdef RE_ENABLE_I18N
3703	mbcset, &alloc,
3704	#endif /* RE_ENABLE_I18N */
3705	class_name, 0);
3706
3707	if (BE (ret != REG_NOERROR, 0))
3708	{
3709	re_free (sbcset);
3710	#ifdef RE_ENABLE_I18N
3711	free_charset (mbcset);
3712	#endif /* RE_ENABLE_I18N */
3713	*err = ret;
3714	return NULL;
3715	}
3716	/* \w match '_' also. */
3717	for (; *extra; extra++)
3718	bitset_set (sbcset, *extra);
3719
3720	/* If it is non-matching list. */
3721	if (non_match)
3722	bitset_not (sbcset);
3723
3724	#ifdef RE_ENABLE_I18N
3725	/* Ensure only single byte characters are set. */
3726	if (dfa->mb_cur_max > 1)
3727	bitset_mask (sbcset, dfa->sb_char);
3728	#endif
3729
3730	/* Build a tree for simple bracket. */
3731	br_token.type = SIMPLE_BRACKET;
3732	br_token.opr.sbcset = sbcset;
3733	tree = create_token_tree (dfa, NULL, NULL, &br_token);
3734	if (BE (tree == NULL, 0))
3735	goto build_word_op_espace;
3736
3737	#ifdef RE_ENABLE_I18N
3738	if (dfa->mb_cur_max > 1)
3739	{
3740	bin_tree_t *mbc_tree;
3741	/* Build a tree for complex bracket. */
3742	br_token.type = COMPLEX_BRACKET;
3743	br_token.opr.mbcset = mbcset;
3744	dfa->has_mb_node = 1;
3745	mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3746	if (BE (mbc_tree == NULL, 0))
3747	goto build_word_op_espace;
3748	/* Then join them by ALT node. */
3749	tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3750	if (BE (mbc_tree != NULL, 1))
3751	return tree;
3752	}
3753	else
3754	{
3755	free_charset (mbcset);
3756	return tree;
3757	}
3758	#else /* not RE_ENABLE_I18N */
3759	return tree;
3760	#endif /* not RE_ENABLE_I18N */
3761
3762	build_word_op_espace:
3763	re_free (sbcset);
3764	#ifdef RE_ENABLE_I18N
3765	free_charset (mbcset);
3766	#endif /* RE_ENABLE_I18N */
3767	*err = REG_ESPACE;
3768	return NULL;
3769	}
3770
3771	/* This is intended for the expressions like "a{1,3}".
3772	Fetch a number from 'input', and return the number.
3773	Return REG_MISSING if the number field is empty like "{,1}".
3774	Return RE_DUP_MAX + 1 if the number field is too large.
3775	Return REG_ERROR if an error occurred. */
3776
3777	static Idx
3778	fetch_number (re_string_t input, re_token_t token, reg_syntax_t syntax)
3779	{
3780	Idx num = REG_MISSING;
3781	unsigned char c;
3782	while (1)
3783	{
3784	fetch_token (token, input, syntax);
3785	c = token->opr.c;
3786	if (BE (token->type == END_OF_RE, 0))
3787	return REG_ERROR;
3788	if (token->type == OP_CLOSE_DUP_NUM \|\| c == ',')
3789	break;
3790	num = ((token->type != CHARACTER \|\| c < '0' \|\| '9' < c
3791	\|\| num == REG_ERROR)
3792	? REG_ERROR
3793	: num == REG_MISSING
3794	? c - '0'
3795	: MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3796	}
3797	return num;
3798	}
3799
3800
3801	#ifdef RE_ENABLE_I18N
3802	static void
3803	free_charset (re_charset_t *cset)
3804	{
3805	re_free (cset->mbchars);
3806	# ifdef _LIBC
3807	re_free (cset->coll_syms);
3808	re_free (cset->equiv_classes);
3809	re_free (cset->range_starts);
3810	re_free (cset->range_ends);
3811	# endif
3812	re_free (cset->char_classes);
3813	re_free (cset);
3814	}
3815	#endif /* RE_ENABLE_I18N */
3816
3817
3818	/* Functions for binary tree operation. */
3819
3820	/* Create a tree node. */
3821
3822	static bin_tree_t *
3823	create_tree (re_dfa_t dfa, bin_tree_t left, bin_tree_t *right,
3824	re_token_type_t type)
3825	{
3826	re_token_t t;
3827	t.type = type;
3828	return create_token_tree (dfa, left, right, &t);
3829	}
3830
3831	static bin_tree_t *
3832	create_token_tree (re_dfa_t dfa, bin_tree_t left, bin_tree_t *right,
3833	const re_token_t *token)
3834	{
3835	bin_tree_t *tree;
3836	if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3837	{
3838	bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3839
3840	if (storage == NULL)
3841	return NULL;
3842	storage->next = dfa->str_tree_storage;
3843	dfa->str_tree_storage = storage;
3844	dfa->str_tree_storage_idx = 0;
3845	}
3846	tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3847
3848	tree->parent = NULL;
3849	tree->left = left;
3850	tree->right = right;
3851	tree->token = *token;
3852	tree->token.duplicated = 0;
3853	tree->token.opt_subexp = 0;
3854	tree->first = NULL;
3855	tree->next = NULL;
3856	tree->node_idx = REG_MISSING;
3857
3858	if (left != NULL)
3859	left->parent = tree;
3860	if (right != NULL)
3861	right->parent = tree;
3862	return tree;
3863	}
3864
3865	/* Mark the tree SRC as an optional subexpression.
3866	To be called from preorder or postorder. */
3867
3868	static reg_errcode_t
3869	mark_opt_subexp (void extra, bin_tree_t node)
3870	{
3871	Idx idx = (Idx) (long) extra;
3872	if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3873	node->token.opt_subexp = 1;
3874
3875	return REG_NOERROR;
3876	}
3877
3878	/* Free the allocated memory inside NODE. */
3879
3880	static void
3881	free_token (re_token_t *node)
3882	{
3883	#ifdef RE_ENABLE_I18N
3884	if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3885	free_charset (node->opr.mbcset);
3886	else
3887	#endif /* RE_ENABLE_I18N */
3888	if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3889	re_free (node->opr.sbcset);
3890	}
3891
3892	/* Worker function for tree walking. Free the allocated memory inside NODE
3893	and its children. */
3894
3895	static reg_errcode_t
3896	free_tree (void extra _UNUSED_PARAMETER_, bin_tree_t node)
3897	{
3898	free_token (&node->token);
3899	return REG_NOERROR;
3900	}
3901
3902
3903	/* Duplicate the node SRC, and return new node. This is a preorder
3904	visit similar to the one implemented by the generic visitor, but
3905	we need more infrastructure to maintain two parallel trees --- so,
3906	it's easier to duplicate. */
3907
3908	static bin_tree_t *
3909	duplicate_tree (const bin_tree_t root, re_dfa_t dfa)
3910	{
3911	const bin_tree_t *node;
3912	bin_tree_t *dup_root;
3913	bin_tree_t *p_new = &dup_root, dup_node = root->parent;
3914
3915	for (node = root; ; )
3916	{
3917	/* Create a new tree and link it back to the current parent. */
3918	*p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3919	if (*p_new == NULL)
3920	return NULL;
3921	(*p_new)->parent = dup_node;
3922	(*p_new)->token.duplicated = 1;
3923	dup_node = *p_new;
3924
3925	/* Go to the left node, or up and to the right. */
3926	if (node->left)
3927	{
3928	node = node->left;
3929	p_new = &dup_node->left;
3930	}
3931	else
3932	{
3933	const bin_tree_t *prev = NULL;
3934	while (node->right == prev \|\| node->right == NULL)
3935	{
3936	prev = node;
3937	node = node->parent;
3938	dup_node = dup_node->parent;
3939	if (!node)
3940	return dup_root;
3941	}
3942	node = node->right;
3943	p_new = &dup_node->right;
3944	}
3945	}
3946	}

Note: See TracBrowser for help on using the repository browser.

source: kBuild/vendor/grep/2.12/lib/regcomp.c

Download in other formats: