VirtualBox

source: kBuild/trunk/src/grep/lib/regcomp.c

Last change on this file was 3548, checked in by bird, 3 years ago

grep: Use get_crt_codepage(). Don't default to the UTF-8 manifest for older VCC versions as the CRT won't do the right thing.

  • Property svn:eol-style set to native
File size: 111.9 KB
Line 
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2021 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
19
20#ifdef _LIBC
21# include <locale/weight.h>
22#endif
23
24static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
25 size_t length, reg_syntax_t syntax);
26static void re_compile_fastmap_iter (regex_t *bufp,
27 const re_dfastate_t *init_state,
28 char *fastmap);
29static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
30#ifdef RE_ENABLE_I18N
31static void free_charset (re_charset_t *cset);
32#endif /* RE_ENABLE_I18N */
33static void free_workarea_compile (regex_t *preg);
34static reg_errcode_t create_initial_state (re_dfa_t *dfa);
35#ifdef RE_ENABLE_I18N
36static void optimize_utf8 (re_dfa_t *dfa);
37#endif
38static reg_errcode_t analyze (regex_t *preg);
39static reg_errcode_t preorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42static reg_errcode_t postorder (bin_tree_t *root,
43 reg_errcode_t (fn (void *, bin_tree_t *)),
44 void *extra);
45static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
46static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
47static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
48 bin_tree_t *node);
49static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
50static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
51static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
52static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
53static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
54 unsigned int constraint);
55static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
56static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
57 Idx node, bool root);
58static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
59static Idx fetch_number (re_string_t *input, re_token_t *token,
60 reg_syntax_t syntax);
61static int peek_token (re_token_t *token, re_string_t *input,
62 reg_syntax_t syntax);
63static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
64 reg_syntax_t syntax, reg_errcode_t *err);
65static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 Idx nest, reg_errcode_t *err);
68static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 Idx nest, reg_errcode_t *err);
71static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 Idx nest, reg_errcode_t *err);
74static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
75 re_token_t *token, reg_syntax_t syntax,
76 Idx nest, reg_errcode_t *err);
77static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
78 re_dfa_t *dfa, re_token_t *token,
79 reg_syntax_t syntax, reg_errcode_t *err);
80static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
81 re_token_t *token, reg_syntax_t syntax,
82 reg_errcode_t *err);
83static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
84 re_string_t *regexp,
85 re_token_t *token, int token_len,
86 re_dfa_t *dfa,
87 reg_syntax_t syntax,
88 bool accept_hyphen);
89static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
90 re_string_t *regexp,
91 re_token_t *token);
92#ifdef RE_ENABLE_I18N
93static reg_errcode_t build_equiv_class (bitset_t sbcset,
94 re_charset_t *mbcset,
95 Idx *equiv_class_alloc,
96 const unsigned char *name);
97static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
98 bitset_t sbcset,
99 re_charset_t *mbcset,
100 Idx *char_class_alloc,
101 const char *class_name,
102 reg_syntax_t syntax);
103#else /* not RE_ENABLE_I18N */
104static reg_errcode_t build_equiv_class (bitset_t sbcset,
105 const unsigned char *name);
106static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
107 bitset_t sbcset,
108 const char *class_name,
109 reg_syntax_t syntax);
110#endif /* not RE_ENABLE_I18N */
111static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
112 RE_TRANSLATE_TYPE trans,
113 const char *class_name,
114 const char *extra,
115 bool non_match, reg_errcode_t *err);
116static bin_tree_t *create_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 re_token_type_t type);
119static bin_tree_t *create_token_tree (re_dfa_t *dfa,
120 bin_tree_t *left, bin_tree_t *right,
121 const re_token_t *token);
122static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
123static void free_token (re_token_t *node);
124static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
125static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
126
127
128/* This table gives an error message for each of the error codes listed
129 in regex.h. Obviously the order here has to be same as there.
130 POSIX doesn't require that we do anything for REG_NOERROR,
131 but why not be nice? */
132
133static const char __re_error_msgid[] =
134 {
135#define REG_NOERROR_IDX 0
136 gettext_noop ("Success") /* REG_NOERROR */
137 "\0"
138#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
139 gettext_noop ("No match") /* REG_NOMATCH */
140 "\0"
141#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
142 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
143 "\0"
144#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
145 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
146 "\0"
147#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
148 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
149 "\0"
150#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
151 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
152 "\0"
153#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
154 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
155 "\0"
156#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
157 gettext_noop ("Unmatched [, [^, [:, [., or [=") /* REG_EBRACK */
158 "\0"
159#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
160 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
161 "\0"
162#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
163 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
164 "\0"
165#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
166 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
167 "\0"
168#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
169 gettext_noop ("Invalid range end") /* REG_ERANGE */
170 "\0"
171#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
172 gettext_noop ("Memory exhausted") /* REG_ESPACE */
173 "\0"
174#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
175 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
176 "\0"
177#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
178 gettext_noop ("Premature end of regular expression") /* REG_EEND */
179 "\0"
180#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
181 gettext_noop ("Regular expression too big") /* REG_ESIZE */
182 "\0"
183#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
184 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
185 };
186
187static const size_t __re_error_msgid_idx[] =
188 {
189 REG_NOERROR_IDX,
190 REG_NOMATCH_IDX,
191 REG_BADPAT_IDX,
192 REG_ECOLLATE_IDX,
193 REG_ECTYPE_IDX,
194 REG_EESCAPE_IDX,
195 REG_ESUBREG_IDX,
196 REG_EBRACK_IDX,
197 REG_EPAREN_IDX,
198 REG_EBRACE_IDX,
199 REG_BADBR_IDX,
200 REG_ERANGE_IDX,
201 REG_ESPACE_IDX,
202 REG_BADRPT_IDX,
203 REG_EEND_IDX,
204 REG_ESIZE_IDX,
205 REG_ERPAREN_IDX
206 };
207
208
209/* Entry points for GNU code. */
210
211/* re_compile_pattern is the GNU regular expression compiler: it
212 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
213 Returns 0 if the pattern was valid, otherwise an error string.
214
215 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
216 are set in BUFP on entry. */
217
218const char *
219re_compile_pattern (const char *pattern, size_t length,
220 struct re_pattern_buffer *bufp)
221{
222 reg_errcode_t ret;
223
224 /* And GNU code determines whether or not to get register information
225 by passing null for the REGS argument to re_match, etc., not by
226 setting no_sub, unless RE_NO_SUB is set. */
227 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
228
229 /* Match anchors at newline. */
230 bufp->newline_anchor = 1;
231
232 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
233
234 if (!ret)
235 return NULL;
236 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
237}
238weak_alias (__re_compile_pattern, re_compile_pattern)
239
240/* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
241 also be assigned to arbitrarily: each pattern buffer stores its own
242 syntax, so it can be changed between regex compilations. */
243/* This has no initializer because initialized variables in Emacs
244 become read-only after dumping. */
245reg_syntax_t re_syntax_options;
246
247
248/* Specify the precise syntax of regexps for compilation. This provides
249 for compatibility for various utilities which historically have
250 different, incompatible syntaxes.
251
252 The argument SYNTAX is a bit mask comprised of the various bits
253 defined in regex.h. We return the old syntax. */
254
255reg_syntax_t
256re_set_syntax (reg_syntax_t syntax)
257{
258 reg_syntax_t ret = re_syntax_options;
259
260 re_syntax_options = syntax;
261 return ret;
262}
263weak_alias (__re_set_syntax, re_set_syntax)
264
265int
266re_compile_fastmap (struct re_pattern_buffer *bufp)
267{
268 re_dfa_t *dfa = bufp->buffer;
269 char *fastmap = bufp->fastmap;
270
271 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
272 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
273 if (dfa->init_state != dfa->init_state_word)
274 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
275 if (dfa->init_state != dfa->init_state_nl)
276 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
277 if (dfa->init_state != dfa->init_state_begbuf)
278 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
279 bufp->fastmap_accurate = 1;
280 return 0;
281}
282weak_alias (__re_compile_fastmap, re_compile_fastmap)
283
284static inline void
285__attribute__ ((always_inline))
286re_set_fastmap (char *fastmap, bool icase, int ch)
287{
288 fastmap[ch] = 1;
289 if (icase)
290 fastmap[tolower (ch)] = 1;
291}
292
293/* Helper function for re_compile_fastmap.
294 Compile fastmap for the initial_state INIT_STATE. */
295
296static void
297re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
298 char *fastmap)
299{
300 re_dfa_t *dfa = bufp->buffer;
301 Idx node_cnt;
302 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
303 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
304 {
305 Idx node = init_state->nodes.elems[node_cnt];
306 re_token_type_t type = dfa->nodes[node].type;
307
308 if (type == CHARACTER)
309 {
310 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
311#ifdef RE_ENABLE_I18N
312 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
313 {
314 unsigned char buf[MB_LEN_MAX];
315 unsigned char *p;
316 wchar_t wc;
317 mbstate_t state;
318
319 p = buf;
320 *p++ = dfa->nodes[node].opr.c;
321 while (++node < dfa->nodes_len
322 && dfa->nodes[node].type == CHARACTER
323 && dfa->nodes[node].mb_partial)
324 *p++ = dfa->nodes[node].opr.c;
325 memset (&state, '\0', sizeof (state));
326 if (__mbrtowc (&wc, (const char *) buf, p - buf,
327 &state) == p - buf
328 && (__wcrtomb ((char *) buf, __towlower (wc), &state)
329 != (size_t) -1))
330 re_set_fastmap (fastmap, false, buf[0]);
331 }
332#endif
333 }
334 else if (type == SIMPLE_BRACKET)
335 {
336 int i, ch;
337 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
338 {
339 int j;
340 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
341 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
342 if (w & ((bitset_word_t) 1 << j))
343 re_set_fastmap (fastmap, icase, ch);
344 }
345 }
346#ifdef RE_ENABLE_I18N
347 else if (type == COMPLEX_BRACKET)
348 {
349 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
350 Idx i;
351
352# ifdef _LIBC
353 /* See if we have to try all bytes which start multiple collation
354 elements.
355 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
356 collation element, and don't catch 'b' since 'b' is
357 the only collation element which starts from 'b' (and
358 it is caught by SIMPLE_BRACKET). */
359 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
360 && (cset->ncoll_syms || cset->nranges))
361 {
362 const int32_t *table = (const int32_t *)
363 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
364 for (i = 0; i < SBC_MAX; ++i)
365 if (table[i] < 0)
366 re_set_fastmap (fastmap, icase, i);
367 }
368# endif /* _LIBC */
369
370 /* See if we have to start the match at all multibyte characters,
371 i.e. where we would not find an invalid sequence. This only
372 applies to multibyte character sets; for single byte character
373 sets, the SIMPLE_BRACKET again suffices. */
374 if (dfa->mb_cur_max > 1
375 && (cset->nchar_classes || cset->non_match || cset->nranges
376# ifdef _LIBC
377 || cset->nequiv_classes
378# endif /* _LIBC */
379 ))
380 {
381 unsigned char c = 0;
382 do
383 {
384 mbstate_t mbs;
385 memset (&mbs, 0, sizeof (mbs));
386 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
387 re_set_fastmap (fastmap, false, (int) c);
388 }
389 while (++c != 0);
390 }
391
392 else
393 {
394 /* ... Else catch all bytes which can start the mbchars. */
395 for (i = 0; i < cset->nmbchars; ++i)
396 {
397 char buf[256];
398 mbstate_t state;
399 memset (&state, '\0', sizeof (state));
400 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
401 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
402 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
403 {
404 if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
405 != (size_t) -1)
406 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
407 }
408 }
409 }
410 }
411#endif /* RE_ENABLE_I18N */
412 else if (type == OP_PERIOD
413#ifdef RE_ENABLE_I18N
414 || type == OP_UTF8_PERIOD
415#endif /* RE_ENABLE_I18N */
416 || type == END_OF_RE)
417 {
418 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
419 if (type == END_OF_RE)
420 bufp->can_be_null = 1;
421 return;
422 }
423 }
424}
425
426
427/* Entry point for POSIX code. */
428/* regcomp takes a regular expression as a string and compiles it.
429
430 PREG is a regex_t *. We do not expect any fields to be initialized,
431 since POSIX says we shouldn't. Thus, we set
432
433 'buffer' to the compiled pattern;
434 'used' to the length of the compiled pattern;
435 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
436 REG_EXTENDED bit in CFLAGS is set; otherwise, to
437 RE_SYNTAX_POSIX_BASIC;
438 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
439 'fastmap' to an allocated space for the fastmap;
440 'fastmap_accurate' to zero;
441 're_nsub' to the number of subexpressions in PATTERN.
442
443 PATTERN is the address of the pattern string.
444
445 CFLAGS is a series of bits which affect compilation.
446
447 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
448 use POSIX basic syntax.
449
450 If REG_NEWLINE is set, then . and [^...] don't match newline.
451 Also, regexec will try a match beginning after every newline.
452
453 If REG_ICASE is set, then we considers upper- and lowercase
454 versions of letters to be equivalent when matching.
455
456 If REG_NOSUB is set, then when PREG is passed to regexec, that
457 routine will report only success or failure, and nothing about the
458 registers.
459
460 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
461 the return codes and their meanings.) */
462
463int
464regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
465{
466 reg_errcode_t ret;
467 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
468 : RE_SYNTAX_POSIX_BASIC);
469
470 preg->buffer = NULL;
471 preg->allocated = 0;
472 preg->used = 0;
473
474 /* Try to allocate space for the fastmap. */
475 preg->fastmap = re_malloc (char, SBC_MAX);
476 if (__glibc_unlikely (preg->fastmap == NULL))
477 return REG_ESPACE;
478
479 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
480
481 /* If REG_NEWLINE is set, newlines are treated differently. */
482 if (cflags & REG_NEWLINE)
483 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
484 syntax &= ~RE_DOT_NEWLINE;
485 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
486 /* It also changes the matching behavior. */
487 preg->newline_anchor = 1;
488 }
489 else
490 preg->newline_anchor = 0;
491 preg->no_sub = !!(cflags & REG_NOSUB);
492 preg->translate = NULL;
493
494 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
495
496 /* POSIX doesn't distinguish between an unmatched open-group and an
497 unmatched close-group: both are REG_EPAREN. */
498 if (ret == REG_ERPAREN)
499 ret = REG_EPAREN;
500
501 /* We have already checked preg->fastmap != NULL. */
502 if (__glibc_likely (ret == REG_NOERROR))
503 /* Compute the fastmap now, since regexec cannot modify the pattern
504 buffer. This function never fails in this implementation. */
505 (void) re_compile_fastmap (preg);
506 else
507 {
508 /* Some error occurred while compiling the expression. */
509 re_free (preg->fastmap);
510 preg->fastmap = NULL;
511 }
512
513 return (int) ret;
514}
515libc_hidden_def (__regcomp)
516weak_alias (__regcomp, regcomp)
517
518/* Returns a message corresponding to an error code, ERRCODE, returned
519 from either regcomp or regexec. We don't use PREG here. */
520
521size_t
522regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
523 size_t errbuf_size)
524{
525 const char *msg;
526 size_t msg_size;
527 int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0];
528
529 if (__glibc_unlikely (errcode < 0 || errcode >= nerrcodes))
530 /* Only error codes returned by the rest of the code should be passed
531 to this routine. If we are given anything else, or if other regex
532 code generates an invalid error code, then the program has a bug.
533 Dump core so we can fix it. */
534 abort ();
535
536 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
537
538 msg_size = strlen (msg) + 1; /* Includes the null. */
539
540 if (__glibc_likely (errbuf_size != 0))
541 {
542 size_t cpy_size = msg_size;
543 if (__glibc_unlikely (msg_size > errbuf_size))
544 {
545 cpy_size = errbuf_size - 1;
546 errbuf[cpy_size] = '\0';
547 }
548 memcpy (errbuf, msg, cpy_size);
549 }
550
551 return msg_size;
552}
553weak_alias (__regerror, regerror)
554
555
556#ifdef RE_ENABLE_I18N
557/* This static array is used for the map to single-byte characters when
558 UTF-8 is used. Otherwise we would allocate memory just to initialize
559 it the same all the time. UTF-8 is the preferred encoding so this is
560 a worthwhile optimization. */
561static const bitset_t utf8_sb_map =
562{
563 /* Set the first 128 bits. */
564# if (defined __GNUC__ || __clang_major__ >= 4) && !defined __STRICT_ANSI__
565 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
566# else
567# if 4 * BITSET_WORD_BITS < ASCII_CHARS
568# error "bitset_word_t is narrower than 32 bits"
569# elif 3 * BITSET_WORD_BITS < ASCII_CHARS
570 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
571# elif 2 * BITSET_WORD_BITS < ASCII_CHARS
572 BITSET_WORD_MAX, BITSET_WORD_MAX,
573# elif 1 * BITSET_WORD_BITS < ASCII_CHARS
574 BITSET_WORD_MAX,
575# endif
576 (BITSET_WORD_MAX
577 >> (SBC_MAX % BITSET_WORD_BITS == 0
578 ? 0
579 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
580# endif
581};
582#endif
583
584
585static void
586free_dfa_content (re_dfa_t *dfa)
587{
588 Idx i, j;
589
590 if (dfa->nodes)
591 for (i = 0; i < dfa->nodes_len; ++i)
592 free_token (dfa->nodes + i);
593 re_free (dfa->nexts);
594 for (i = 0; i < dfa->nodes_len; ++i)
595 {
596 if (dfa->eclosures != NULL)
597 re_node_set_free (dfa->eclosures + i);
598 if (dfa->inveclosures != NULL)
599 re_node_set_free (dfa->inveclosures + i);
600 if (dfa->edests != NULL)
601 re_node_set_free (dfa->edests + i);
602 }
603 re_free (dfa->edests);
604 re_free (dfa->eclosures);
605 re_free (dfa->inveclosures);
606 re_free (dfa->nodes);
607
608 if (dfa->state_table)
609 for (i = 0; i <= dfa->state_hash_mask; ++i)
610 {
611 struct re_state_table_entry *entry = dfa->state_table + i;
612 for (j = 0; j < entry->num; ++j)
613 {
614 re_dfastate_t *state = entry->array[j];
615 free_state (state);
616 }
617 re_free (entry->array);
618 }
619 re_free (dfa->state_table);
620#ifdef RE_ENABLE_I18N
621 if (dfa->sb_char != utf8_sb_map)
622 re_free (dfa->sb_char);
623#endif
624 re_free (dfa->subexp_map);
625#ifdef DEBUG
626 re_free (dfa->re_str);
627#endif
628
629 re_free (dfa);
630}
631
632
633/* Free dynamically allocated space used by PREG. */
634
635void
636regfree (regex_t *preg)
637{
638 re_dfa_t *dfa = preg->buffer;
639 if (__glibc_likely (dfa != NULL))
640 {
641 lock_fini (dfa->lock);
642 free_dfa_content (dfa);
643 }
644 preg->buffer = NULL;
645 preg->allocated = 0;
646
647 re_free (preg->fastmap);
648 preg->fastmap = NULL;
649
650 re_free (preg->translate);
651 preg->translate = NULL;
652}
653libc_hidden_def (__regfree)
654weak_alias (__regfree, regfree)
655
656
657/* Entry points compatible with 4.2 BSD regex library. We don't define
658 them unless specifically requested. */
659
660#if defined _REGEX_RE_COMP || defined _LIBC
661
662/* BSD has one and only one pattern buffer. */
663static struct re_pattern_buffer re_comp_buf;
664
665char *
666# ifdef _LIBC
667/* Make these definitions weak in libc, so POSIX programs can redefine
668 these names if they don't use our functions, and still use
669 regcomp/regexec above without link errors. */
670weak_function
671# endif
672re_comp (const char *s)
673{
674 reg_errcode_t ret;
675 char *fastmap;
676
677 if (!s)
678 {
679 if (!re_comp_buf.buffer)
680 return gettext ("No previous regular expression");
681 return 0;
682 }
683
684 if (re_comp_buf.buffer)
685 {
686 fastmap = re_comp_buf.fastmap;
687 re_comp_buf.fastmap = NULL;
688 __regfree (&re_comp_buf);
689 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
690 re_comp_buf.fastmap = fastmap;
691 }
692
693 if (re_comp_buf.fastmap == NULL)
694 {
695 re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
696 if (re_comp_buf.fastmap == NULL)
697 return (char *) gettext (__re_error_msgid
698 + __re_error_msgid_idx[(int) REG_ESPACE]);
699 }
700
701 /* Since 're_exec' always passes NULL for the 'regs' argument, we
702 don't need to initialize the pattern buffer fields which affect it. */
703
704 /* Match anchors at newlines. */
705 re_comp_buf.newline_anchor = 1;
706
707 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
708
709 if (!ret)
710 return NULL;
711
712 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
713 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
714}
715
716#ifdef _LIBC
717libc_freeres_fn (free_mem)
718{
719 __regfree (&re_comp_buf);
720}
721#endif
722
723#endif /* _REGEX_RE_COMP */
724
725
726/* Internal entry point.
727 Compile the regular expression PATTERN, whose length is LENGTH.
728 SYNTAX indicate regular expression's syntax. */
729
730static reg_errcode_t
731re_compile_internal (regex_t *preg, const char * pattern, size_t length,
732 reg_syntax_t syntax)
733{
734 reg_errcode_t err = REG_NOERROR;
735 re_dfa_t *dfa;
736 re_string_t regexp;
737
738 /* Initialize the pattern buffer. */
739 preg->fastmap_accurate = 0;
740 preg->syntax = syntax;
741 preg->not_bol = preg->not_eol = 0;
742 preg->used = 0;
743 preg->re_nsub = 0;
744 preg->can_be_null = 0;
745 preg->regs_allocated = REGS_UNALLOCATED;
746
747 /* Initialize the dfa. */
748 dfa = preg->buffer;
749 if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
750 {
751 /* If zero allocated, but buffer is non-null, try to realloc
752 enough space. This loses if buffer's address is bogus, but
753 that is the user's responsibility. If ->buffer is NULL this
754 is a simple allocation. */
755 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
756 if (dfa == NULL)
757 return REG_ESPACE;
758 preg->allocated = sizeof (re_dfa_t);
759 preg->buffer = dfa;
760 }
761 preg->used = sizeof (re_dfa_t);
762
763 err = init_dfa (dfa, length);
764 if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0))
765 err = REG_ESPACE;
766 if (__glibc_unlikely (err != REG_NOERROR))
767 {
768 free_dfa_content (dfa);
769 preg->buffer = NULL;
770 preg->allocated = 0;
771 return err;
772 }
773#ifdef DEBUG
774 /* Note: length+1 will not overflow since it is checked in init_dfa. */
775 dfa->re_str = re_malloc (char, length + 1);
776 strncpy (dfa->re_str, pattern, length + 1);
777#endif
778
779 err = re_string_construct (&regexp, pattern, length, preg->translate,
780 (syntax & RE_ICASE) != 0, dfa);
781 if (__glibc_unlikely (err != REG_NOERROR))
782 {
783 re_compile_internal_free_return:
784 free_workarea_compile (preg);
785 re_string_destruct (&regexp);
786 lock_fini (dfa->lock);
787 free_dfa_content (dfa);
788 preg->buffer = NULL;
789 preg->allocated = 0;
790 return err;
791 }
792
793 /* Parse the regular expression, and build a structure tree. */
794 preg->re_nsub = 0;
795 dfa->str_tree = parse (&regexp, preg, syntax, &err);
796 if (__glibc_unlikely (dfa->str_tree == NULL))
797 goto re_compile_internal_free_return;
798
799 /* Analyze the tree and create the nfa. */
800 err = analyze (preg);
801 if (__glibc_unlikely (err != REG_NOERROR))
802 goto re_compile_internal_free_return;
803
804#ifdef RE_ENABLE_I18N
805 /* If possible, do searching in single byte encoding to speed things up. */
806 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
807 optimize_utf8 (dfa);
808#endif
809
810 /* Then create the initial state of the dfa. */
811 err = create_initial_state (dfa);
812
813 /* Release work areas. */
814 free_workarea_compile (preg);
815 re_string_destruct (&regexp);
816
817 if (__glibc_unlikely (err != REG_NOERROR))
818 {
819 lock_fini (dfa->lock);
820 free_dfa_content (dfa);
821 preg->buffer = NULL;
822 preg->allocated = 0;
823 }
824
825 return err;
826}
827
828/* Initialize DFA. We use the length of the regular expression PAT_LEN
829 as the initial length of some arrays. */
830
831static reg_errcode_t
832init_dfa (re_dfa_t *dfa, size_t pat_len)
833{
834 __re_size_t table_size;
835#ifndef _LIBC
836 const char *codeset_name;
837#endif
838#ifdef RE_ENABLE_I18N
839 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
840#else
841 size_t max_i18n_object_size = 0;
842#endif
843 size_t max_object_size =
844 MAX (sizeof (struct re_state_table_entry),
845 MAX (sizeof (re_token_t),
846 MAX (sizeof (re_node_set),
847 MAX (sizeof (regmatch_t),
848 max_i18n_object_size))));
849
850 memset (dfa, '\0', sizeof (re_dfa_t));
851
852 /* Force allocation of str_tree_storage the first time. */
853 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
854
855 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
856 calculation below, and for similar doubling calculations
857 elsewhere. And it's <= rather than <, because some of the
858 doubling calculations add 1 afterwards. */
859 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2
860 <= pat_len))
861 return REG_ESPACE;
862
863 dfa->nodes_alloc = pat_len + 1;
864 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
865
866 /* table_size = 2 ^ ceil(log pat_len) */
867 for (table_size = 1; ; table_size <<= 1)
868 if (table_size > pat_len)
869 break;
870
871 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
872 dfa->state_hash_mask = table_size - 1;
873
874 dfa->mb_cur_max = MB_CUR_MAX;
875#ifdef _LIBC
876 if (dfa->mb_cur_max == 6
877 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
878 dfa->is_utf8 = 1;
879 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
880 != 0);
881#else
882# ifdef _MSC_VER
883 (void)codeset_name;
884 if (get_crt_codepage() == CP_UTF8)
885# else
886 codeset_name = nl_langinfo (CODESET);
887 if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
888 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
889 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
890 && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
891# endif
892 dfa->is_utf8 = 1;
893
894 /* We check exhaustively in the loop below if this charset is a
895 superset of ASCII. */
896 dfa->map_notascii = 0;
897#endif
898
899#ifdef RE_ENABLE_I18N
900 if (dfa->mb_cur_max > 1)
901 {
902 if (dfa->is_utf8)
903 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
904 else
905 {
906 int i, j, ch;
907
908 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
909 if (__glibc_unlikely (dfa->sb_char == NULL))
910 return REG_ESPACE;
911
912 /* Set the bits corresponding to single byte chars. */
913 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
914 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
915 {
916 wint_t wch = __btowc (ch);
917 if (wch != WEOF)
918 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
919# ifndef _LIBC
920 if (isascii (ch) && wch != ch)
921 dfa->map_notascii = 1;
922# endif
923 }
924 }
925 }
926#endif
927
928 if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL))
929 return REG_ESPACE;
930 return REG_NOERROR;
931}
932
933/* Initialize WORD_CHAR table, which indicate which character is
934 "word". In this case "word" means that it is the word construction
935 character used by some operators like "\<", "\>", etc. */
936
937static void
938init_word_char (re_dfa_t *dfa)
939{
940 int i = 0;
941 int j;
942 int ch = 0;
943 dfa->word_ops_used = 1;
944 if (__glibc_likely (dfa->map_notascii == 0))
945 {
946 /* Avoid uint32_t and uint64_t as some non-GCC platforms lack
947 them, an issue when this code is used in Gnulib. */
948 bitset_word_t bits0 = 0x00000000;
949 bitset_word_t bits1 = 0x03ff0000;
950 bitset_word_t bits2 = 0x87fffffe;
951 bitset_word_t bits3 = 0x07fffffe;
952 if (BITSET_WORD_BITS == 64)
953 {
954 /* Pacify gcc -Woverflow on 32-bit platformns. */
955 dfa->word_char[0] = bits1 << 31 << 1 | bits0;
956 dfa->word_char[1] = bits3 << 31 << 1 | bits2;
957 i = 2;
958 }
959 else if (BITSET_WORD_BITS == 32)
960 {
961 dfa->word_char[0] = bits0;
962 dfa->word_char[1] = bits1;
963 dfa->word_char[2] = bits2;
964 dfa->word_char[3] = bits3;
965 i = 4;
966 }
967 else
968 goto general_case;
969 ch = 128;
970
971 if (__glibc_likely (dfa->is_utf8))
972 {
973 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
974 return;
975 }
976 }
977
978 general_case:
979 for (; i < BITSET_WORDS; ++i)
980 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
981 if (isalnum (ch) || ch == '_')
982 dfa->word_char[i] |= (bitset_word_t) 1 << j;
983}
984
985/* Free the work area which are only used while compiling. */
986
987static void
988free_workarea_compile (regex_t *preg)
989{
990 re_dfa_t *dfa = preg->buffer;
991 bin_tree_storage_t *storage, *next;
992 for (storage = dfa->str_tree_storage; storage; storage = next)
993 {
994 next = storage->next;
995 re_free (storage);
996 }
997 dfa->str_tree_storage = NULL;
998 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
999 dfa->str_tree = NULL;
1000 re_free (dfa->org_indices);
1001 dfa->org_indices = NULL;
1002}
1003
1004/* Create initial states for all contexts. */
1005
1006static reg_errcode_t
1007create_initial_state (re_dfa_t *dfa)
1008{
1009 Idx first, i;
1010 reg_errcode_t err;
1011 re_node_set init_nodes;
1012
1013 /* Initial states have the epsilon closure of the node which is
1014 the first node of the regular expression. */
1015 first = dfa->str_tree->first->node_idx;
1016 dfa->init_node = first;
1017 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1018 if (__glibc_unlikely (err != REG_NOERROR))
1019 return err;
1020
1021 /* The back-references which are in initial states can epsilon transit,
1022 since in this case all of the subexpressions can be null.
1023 Then we add epsilon closures of the nodes which are the next nodes of
1024 the back-references. */
1025 if (dfa->nbackref > 0)
1026 for (i = 0; i < init_nodes.nelem; ++i)
1027 {
1028 Idx node_idx = init_nodes.elems[i];
1029 re_token_type_t type = dfa->nodes[node_idx].type;
1030
1031 Idx clexp_idx;
1032 if (type != OP_BACK_REF)
1033 continue;
1034 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1035 {
1036 re_token_t *clexp_node;
1037 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1038 if (clexp_node->type == OP_CLOSE_SUBEXP
1039 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1040 break;
1041 }
1042 if (clexp_idx == init_nodes.nelem)
1043 continue;
1044
1045 if (type == OP_BACK_REF)
1046 {
1047 Idx dest_idx = dfa->edests[node_idx].elems[0];
1048 if (!re_node_set_contains (&init_nodes, dest_idx))
1049 {
1050 reg_errcode_t merge_err
1051 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1052 if (merge_err != REG_NOERROR)
1053 return merge_err;
1054 i = 0;
1055 }
1056 }
1057 }
1058
1059 /* It must be the first time to invoke acquire_state. */
1060 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1061 /* We don't check ERR here, since the initial state must not be NULL. */
1062 if (__glibc_unlikely (dfa->init_state == NULL))
1063 return err;
1064 if (dfa->init_state->has_constraint)
1065 {
1066 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1067 CONTEXT_WORD);
1068 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1069 CONTEXT_NEWLINE);
1070 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1071 &init_nodes,
1072 CONTEXT_NEWLINE
1073 | CONTEXT_BEGBUF);
1074 if (__glibc_unlikely (dfa->init_state_word == NULL
1075 || dfa->init_state_nl == NULL
1076 || dfa->init_state_begbuf == NULL))
1077 return err;
1078 }
1079 else
1080 dfa->init_state_word = dfa->init_state_nl
1081 = dfa->init_state_begbuf = dfa->init_state;
1082
1083 re_node_set_free (&init_nodes);
1084 return REG_NOERROR;
1085}
1086
1087
1088#ifdef RE_ENABLE_I18N
1089/* If it is possible to do searching in single byte encoding instead of UTF-8
1090 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1091 DFA nodes where needed. */
1092
1093static void
1094optimize_utf8 (re_dfa_t *dfa)
1095{
1096 Idx node;
1097 int i;
1098 bool mb_chars = false;
1099 bool has_period = false;
1100
1101 for (node = 0; node < dfa->nodes_len; ++node)
1102 switch (dfa->nodes[node].type)
1103 {
1104 case CHARACTER:
1105 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1106 mb_chars = true;
1107 break;
1108 case ANCHOR:
1109 switch (dfa->nodes[node].opr.ctx_type)
1110 {
1111 case LINE_FIRST:
1112 case LINE_LAST:
1113 case BUF_FIRST:
1114 case BUF_LAST:
1115 break;
1116 default:
1117 /* Word anchors etc. cannot be handled. It's okay to test
1118 opr.ctx_type since constraints (for all DFA nodes) are
1119 created by ORing one or more opr.ctx_type values. */
1120 return;
1121 }
1122 break;
1123 case OP_PERIOD:
1124 has_period = true;
1125 break;
1126 case OP_BACK_REF:
1127 case OP_ALT:
1128 case END_OF_RE:
1129 case OP_DUP_ASTERISK:
1130 case OP_OPEN_SUBEXP:
1131 case OP_CLOSE_SUBEXP:
1132 break;
1133 case COMPLEX_BRACKET:
1134 return;
1135 case SIMPLE_BRACKET:
1136 /* Just double check. */
1137 {
1138 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1139 ? 0
1140 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1141 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1142 {
1143 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1144 return;
1145 rshift = 0;
1146 }
1147 }
1148 break;
1149 default:
1150 abort ();
1151 }
1152
1153 if (mb_chars || has_period)
1154 for (node = 0; node < dfa->nodes_len; ++node)
1155 {
1156 if (dfa->nodes[node].type == CHARACTER
1157 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1158 dfa->nodes[node].mb_partial = 0;
1159 else if (dfa->nodes[node].type == OP_PERIOD)
1160 dfa->nodes[node].type = OP_UTF8_PERIOD;
1161 }
1162
1163 /* The search can be in single byte locale. */
1164 dfa->mb_cur_max = 1;
1165 dfa->is_utf8 = 0;
1166 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1167}
1168#endif
1169
1170
1171/* Analyze the structure tree, and calculate "first", "next", "edest",
1172 "eclosure", and "inveclosure". */
1173
1174static reg_errcode_t
1175analyze (regex_t *preg)
1176{
1177 re_dfa_t *dfa = preg->buffer;
1178 reg_errcode_t ret;
1179
1180 /* Allocate arrays. */
1181 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1182 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1183 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1184 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1185 if (__glibc_unlikely (dfa->nexts == NULL || dfa->org_indices == NULL
1186 || dfa->edests == NULL || dfa->eclosures == NULL))
1187 return REG_ESPACE;
1188
1189 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1190 if (dfa->subexp_map != NULL)
1191 {
1192 Idx i;
1193 for (i = 0; i < preg->re_nsub; i++)
1194 dfa->subexp_map[i] = i;
1195 preorder (dfa->str_tree, optimize_subexps, dfa);
1196 for (i = 0; i < preg->re_nsub; i++)
1197 if (dfa->subexp_map[i] != i)
1198 break;
1199 if (i == preg->re_nsub)
1200 {
1201 re_free (dfa->subexp_map);
1202 dfa->subexp_map = NULL;
1203 }
1204 }
1205
1206 ret = postorder (dfa->str_tree, lower_subexps, preg);
1207 if (__glibc_unlikely (ret != REG_NOERROR))
1208 return ret;
1209 ret = postorder (dfa->str_tree, calc_first, dfa);
1210 if (__glibc_unlikely (ret != REG_NOERROR))
1211 return ret;
1212 preorder (dfa->str_tree, calc_next, dfa);
1213 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1214 if (__glibc_unlikely (ret != REG_NOERROR))
1215 return ret;
1216 ret = calc_eclosure (dfa);
1217 if (__glibc_unlikely (ret != REG_NOERROR))
1218 return ret;
1219
1220 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1221 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1222 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1223 || dfa->nbackref)
1224 {
1225 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1226 if (__glibc_unlikely (dfa->inveclosures == NULL))
1227 return REG_ESPACE;
1228 ret = calc_inveclosure (dfa);
1229 }
1230
1231 return ret;
1232}
1233
1234/* Our parse trees are very unbalanced, so we cannot use a stack to
1235 implement parse tree visits. Instead, we use parent pointers and
1236 some hairy code in these two functions. */
1237static reg_errcode_t
1238postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1239 void *extra)
1240{
1241 bin_tree_t *node, *prev;
1242
1243 for (node = root; ; )
1244 {
1245 /* Descend down the tree, preferably to the left (or to the right
1246 if that's the only child). */
1247 while (node->left || node->right)
1248 if (node->left)
1249 node = node->left;
1250 else
1251 node = node->right;
1252
1253 do
1254 {
1255 reg_errcode_t err = fn (extra, node);
1256 if (__glibc_unlikely (err != REG_NOERROR))
1257 return err;
1258 if (node->parent == NULL)
1259 return REG_NOERROR;
1260 prev = node;
1261 node = node->parent;
1262 }
1263 /* Go up while we have a node that is reached from the right. */
1264 while (node->right == prev || node->right == NULL);
1265 node = node->right;
1266 }
1267}
1268
1269static reg_errcode_t
1270preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1271 void *extra)
1272{
1273 bin_tree_t *node;
1274
1275 for (node = root; ; )
1276 {
1277 reg_errcode_t err = fn (extra, node);
1278 if (__glibc_unlikely (err != REG_NOERROR))
1279 return err;
1280
1281 /* Go to the left node, or up and to the right. */
1282 if (node->left)
1283 node = node->left;
1284 else
1285 {
1286 bin_tree_t *prev = NULL;
1287 while (node->right == prev || node->right == NULL)
1288 {
1289 prev = node;
1290 node = node->parent;
1291 if (!node)
1292 return REG_NOERROR;
1293 }
1294 node = node->right;
1295 }
1296 }
1297}
1298
1299/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1300 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1301 backreferences as well. Requires a preorder visit. */
1302static reg_errcode_t
1303optimize_subexps (void *extra, bin_tree_t *node)
1304{
1305 re_dfa_t *dfa = (re_dfa_t *) extra;
1306
1307 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1308 {
1309 int idx = node->token.opr.idx;
1310 node->token.opr.idx = dfa->subexp_map[idx];
1311 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1312 }
1313
1314 else if (node->token.type == SUBEXP
1315 && node->left && node->left->token.type == SUBEXP)
1316 {
1317 Idx other_idx = node->left->token.opr.idx;
1318
1319 node->left = node->left->left;
1320 if (node->left)
1321 node->left->parent = node;
1322
1323 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1324 if (other_idx < BITSET_WORD_BITS)
1325 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1326 }
1327
1328 return REG_NOERROR;
1329}
1330
1331/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1332 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1333static reg_errcode_t
1334lower_subexps (void *extra, bin_tree_t *node)
1335{
1336 regex_t *preg = (regex_t *) extra;
1337 reg_errcode_t err = REG_NOERROR;
1338
1339 if (node->left && node->left->token.type == SUBEXP)
1340 {
1341 node->left = lower_subexp (&err, preg, node->left);
1342 if (node->left)
1343 node->left->parent = node;
1344 }
1345 if (node->right && node->right->token.type == SUBEXP)
1346 {
1347 node->right = lower_subexp (&err, preg, node->right);
1348 if (node->right)
1349 node->right->parent = node;
1350 }
1351
1352 return err;
1353}
1354
1355static bin_tree_t *
1356lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1357{
1358 re_dfa_t *dfa = preg->buffer;
1359 bin_tree_t *body = node->left;
1360 bin_tree_t *op, *cls, *tree1, *tree;
1361
1362 if (preg->no_sub
1363 /* We do not optimize empty subexpressions, because otherwise we may
1364 have bad CONCAT nodes with NULL children. This is obviously not
1365 very common, so we do not lose much. An example that triggers
1366 this case is the sed "script" /\(\)/x. */
1367 && node->left != NULL
1368 && (node->token.opr.idx >= BITSET_WORD_BITS
1369 || !(dfa->used_bkref_map
1370 & ((bitset_word_t) 1 << node->token.opr.idx))))
1371 return node->left;
1372
1373 /* Convert the SUBEXP node to the concatenation of an
1374 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1375 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1376 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1377 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1378 tree = create_tree (dfa, op, tree1, CONCAT);
1379 if (__glibc_unlikely (tree == NULL || tree1 == NULL
1380 || op == NULL || cls == NULL))
1381 {
1382 *err = REG_ESPACE;
1383 return NULL;
1384 }
1385
1386 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1387 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1388 return tree;
1389}
1390
1391/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1392 nodes. Requires a postorder visit. */
1393static reg_errcode_t
1394calc_first (void *extra, bin_tree_t *node)
1395{
1396 re_dfa_t *dfa = (re_dfa_t *) extra;
1397 if (node->token.type == CONCAT)
1398 {
1399 node->first = node->left->first;
1400 node->node_idx = node->left->node_idx;
1401 }
1402 else
1403 {
1404 node->first = node;
1405 node->node_idx = re_dfa_add_node (dfa, node->token);
1406 if (__glibc_unlikely (node->node_idx == -1))
1407 return REG_ESPACE;
1408 if (node->token.type == ANCHOR)
1409 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1410 }
1411 return REG_NOERROR;
1412}
1413
1414/* Pass 2: compute NEXT on the tree. Preorder visit. */
1415static reg_errcode_t
1416calc_next (void *extra, bin_tree_t *node)
1417{
1418 switch (node->token.type)
1419 {
1420 case OP_DUP_ASTERISK:
1421 node->left->next = node;
1422 break;
1423 case CONCAT:
1424 node->left->next = node->right->first;
1425 node->right->next = node->next;
1426 break;
1427 default:
1428 if (node->left)
1429 node->left->next = node->next;
1430 if (node->right)
1431 node->right->next = node->next;
1432 break;
1433 }
1434 return REG_NOERROR;
1435}
1436
1437/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1438static reg_errcode_t
1439link_nfa_nodes (void *extra, bin_tree_t *node)
1440{
1441 re_dfa_t *dfa = (re_dfa_t *) extra;
1442 Idx idx = node->node_idx;
1443 reg_errcode_t err = REG_NOERROR;
1444
1445 switch (node->token.type)
1446 {
1447 case CONCAT:
1448 break;
1449
1450 case END_OF_RE:
1451 DEBUG_ASSERT (node->next == NULL);
1452 break;
1453
1454 case OP_DUP_ASTERISK:
1455 case OP_ALT:
1456 {
1457 Idx left, right;
1458 dfa->has_plural_match = 1;
1459 if (node->left != NULL)
1460 left = node->left->first->node_idx;
1461 else
1462 left = node->next->node_idx;
1463 if (node->right != NULL)
1464 right = node->right->first->node_idx;
1465 else
1466 right = node->next->node_idx;
1467 DEBUG_ASSERT (left > -1);
1468 DEBUG_ASSERT (right > -1);
1469 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1470 }
1471 break;
1472
1473 case ANCHOR:
1474 case OP_OPEN_SUBEXP:
1475 case OP_CLOSE_SUBEXP:
1476 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1477 break;
1478
1479 case OP_BACK_REF:
1480 dfa->nexts[idx] = node->next->node_idx;
1481 if (node->token.type == OP_BACK_REF)
1482 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1483 break;
1484
1485 default:
1486 DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
1487 dfa->nexts[idx] = node->next->node_idx;
1488 break;
1489 }
1490
1491 return err;
1492}
1493
1494/* Duplicate the epsilon closure of the node ROOT_NODE.
1495 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1496 to their own constraint. */
1497
1498static reg_errcode_t
1499duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1500 Idx root_node, unsigned int init_constraint)
1501{
1502 Idx org_node, clone_node;
1503 bool ok;
1504 unsigned int constraint = init_constraint;
1505 for (org_node = top_org_node, clone_node = top_clone_node;;)
1506 {
1507 Idx org_dest, clone_dest;
1508 if (dfa->nodes[org_node].type == OP_BACK_REF)
1509 {
1510 /* If the back reference epsilon-transit, its destination must
1511 also have the constraint. Then duplicate the epsilon closure
1512 of the destination of the back reference, and store it in
1513 edests of the back reference. */
1514 org_dest = dfa->nexts[org_node];
1515 re_node_set_empty (dfa->edests + clone_node);
1516 clone_dest = duplicate_node (dfa, org_dest, constraint);
1517 if (__glibc_unlikely (clone_dest == -1))
1518 return REG_ESPACE;
1519 dfa->nexts[clone_node] = dfa->nexts[org_node];
1520 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1521 if (__glibc_unlikely (! ok))
1522 return REG_ESPACE;
1523 }
1524 else if (dfa->edests[org_node].nelem == 0)
1525 {
1526 /* In case of the node can't epsilon-transit, don't duplicate the
1527 destination and store the original destination as the
1528 destination of the node. */
1529 dfa->nexts[clone_node] = dfa->nexts[org_node];
1530 break;
1531 }
1532 else if (dfa->edests[org_node].nelem == 1)
1533 {
1534 /* In case of the node can epsilon-transit, and it has only one
1535 destination. */
1536 org_dest = dfa->edests[org_node].elems[0];
1537 re_node_set_empty (dfa->edests + clone_node);
1538 /* If the node is root_node itself, it means the epsilon closure
1539 has a loop. Then tie it to the destination of the root_node. */
1540 if (org_node == root_node && clone_node != org_node)
1541 {
1542 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1543 if (__glibc_unlikely (! ok))
1544 return REG_ESPACE;
1545 break;
1546 }
1547 /* In case the node has another constraint, append it. */
1548 constraint |= dfa->nodes[org_node].constraint;
1549 clone_dest = duplicate_node (dfa, org_dest, constraint);
1550 if (__glibc_unlikely (clone_dest == -1))
1551 return REG_ESPACE;
1552 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1553 if (__glibc_unlikely (! ok))
1554 return REG_ESPACE;
1555 }
1556 else /* dfa->edests[org_node].nelem == 2 */
1557 {
1558 /* In case of the node can epsilon-transit, and it has two
1559 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1560 org_dest = dfa->edests[org_node].elems[0];
1561 re_node_set_empty (dfa->edests + clone_node);
1562 /* Search for a duplicated node which satisfies the constraint. */
1563 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1564 if (clone_dest == -1)
1565 {
1566 /* There is no such duplicated node, create a new one. */
1567 reg_errcode_t err;
1568 clone_dest = duplicate_node (dfa, org_dest, constraint);
1569 if (__glibc_unlikely (clone_dest == -1))
1570 return REG_ESPACE;
1571 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1572 if (__glibc_unlikely (! ok))
1573 return REG_ESPACE;
1574 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1575 root_node, constraint);
1576 if (__glibc_unlikely (err != REG_NOERROR))
1577 return err;
1578 }
1579 else
1580 {
1581 /* There is a duplicated node which satisfies the constraint,
1582 use it to avoid infinite loop. */
1583 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1584 if (__glibc_unlikely (! ok))
1585 return REG_ESPACE;
1586 }
1587
1588 org_dest = dfa->edests[org_node].elems[1];
1589 clone_dest = duplicate_node (dfa, org_dest, constraint);
1590 if (__glibc_unlikely (clone_dest == -1))
1591 return REG_ESPACE;
1592 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1593 if (__glibc_unlikely (! ok))
1594 return REG_ESPACE;
1595 }
1596 org_node = org_dest;
1597 clone_node = clone_dest;
1598 }
1599 return REG_NOERROR;
1600}
1601
1602/* Search for a node which is duplicated from the node ORG_NODE, and
1603 satisfies the constraint CONSTRAINT. */
1604
1605static Idx
1606search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1607 unsigned int constraint)
1608{
1609 Idx idx;
1610 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1611 {
1612 if (org_node == dfa->org_indices[idx]
1613 && constraint == dfa->nodes[idx].constraint)
1614 return idx; /* Found. */
1615 }
1616 return -1; /* Not found. */
1617}
1618
1619/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1620 Return the index of the new node, or -1 if insufficient storage is
1621 available. */
1622
1623static Idx
1624duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1625{
1626 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1627 if (__glibc_likely (dup_idx != -1))
1628 {
1629 dfa->nodes[dup_idx].constraint = constraint;
1630 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1631 dfa->nodes[dup_idx].duplicated = 1;
1632
1633 /* Store the index of the original node. */
1634 dfa->org_indices[dup_idx] = org_idx;
1635 }
1636 return dup_idx;
1637}
1638
1639static reg_errcode_t
1640calc_inveclosure (re_dfa_t *dfa)
1641{
1642 Idx src, idx;
1643 bool ok;
1644 for (idx = 0; idx < dfa->nodes_len; ++idx)
1645 re_node_set_init_empty (dfa->inveclosures + idx);
1646
1647 for (src = 0; src < dfa->nodes_len; ++src)
1648 {
1649 Idx *elems = dfa->eclosures[src].elems;
1650 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1651 {
1652 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1653 if (__glibc_unlikely (! ok))
1654 return REG_ESPACE;
1655 }
1656 }
1657
1658 return REG_NOERROR;
1659}
1660
1661/* Calculate "eclosure" for all the node in DFA. */
1662
1663static reg_errcode_t
1664calc_eclosure (re_dfa_t *dfa)
1665{
1666 Idx node_idx;
1667 bool incomplete;
1668 DEBUG_ASSERT (dfa->nodes_len > 0);
1669 incomplete = false;
1670 /* For each nodes, calculate epsilon closure. */
1671 for (node_idx = 0; ; ++node_idx)
1672 {
1673 reg_errcode_t err;
1674 re_node_set eclosure_elem;
1675 if (node_idx == dfa->nodes_len)
1676 {
1677 if (!incomplete)
1678 break;
1679 incomplete = false;
1680 node_idx = 0;
1681 }
1682
1683 DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -1);
1684
1685 /* If we have already calculated, skip it. */
1686 if (dfa->eclosures[node_idx].nelem != 0)
1687 continue;
1688 /* Calculate epsilon closure of 'node_idx'. */
1689 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1690 if (__glibc_unlikely (err != REG_NOERROR))
1691 return err;
1692
1693 if (dfa->eclosures[node_idx].nelem == 0)
1694 {
1695 incomplete = true;
1696 re_node_set_free (&eclosure_elem);
1697 }
1698 }
1699 return REG_NOERROR;
1700}
1701
1702/* Calculate epsilon closure of NODE. */
1703
1704static reg_errcode_t
1705calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1706{
1707 reg_errcode_t err;
1708 Idx i;
1709 re_node_set eclosure;
1710 bool incomplete = false;
1711 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1712 if (__glibc_unlikely (err != REG_NOERROR))
1713 return err;
1714
1715 /* An epsilon closure includes itself. */
1716 eclosure.elems[eclosure.nelem++] = node;
1717
1718 /* This indicates that we are calculating this node now.
1719 We reference this value to avoid infinite loop. */
1720 dfa->eclosures[node].nelem = -1;
1721
1722 /* If the current node has constraints, duplicate all nodes
1723 since they must inherit the constraints. */
1724 if (dfa->nodes[node].constraint
1725 && dfa->edests[node].nelem
1726 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1727 {
1728 err = duplicate_node_closure (dfa, node, node, node,
1729 dfa->nodes[node].constraint);
1730 if (__glibc_unlikely (err != REG_NOERROR))
1731 return err;
1732 }
1733
1734 /* Expand each epsilon destination nodes. */
1735 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1736 for (i = 0; i < dfa->edests[node].nelem; ++i)
1737 {
1738 re_node_set eclosure_elem;
1739 Idx edest = dfa->edests[node].elems[i];
1740 /* If calculating the epsilon closure of 'edest' is in progress,
1741 return intermediate result. */
1742 if (dfa->eclosures[edest].nelem == -1)
1743 {
1744 incomplete = true;
1745 continue;
1746 }
1747 /* If we haven't calculated the epsilon closure of 'edest' yet,
1748 calculate now. Otherwise use calculated epsilon closure. */
1749 if (dfa->eclosures[edest].nelem == 0)
1750 {
1751 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1752 if (__glibc_unlikely (err != REG_NOERROR))
1753 return err;
1754 }
1755 else
1756 eclosure_elem = dfa->eclosures[edest];
1757 /* Merge the epsilon closure of 'edest'. */
1758 err = re_node_set_merge (&eclosure, &eclosure_elem);
1759 if (__glibc_unlikely (err != REG_NOERROR))
1760 return err;
1761 /* If the epsilon closure of 'edest' is incomplete,
1762 the epsilon closure of this node is also incomplete. */
1763 if (dfa->eclosures[edest].nelem == 0)
1764 {
1765 incomplete = true;
1766 re_node_set_free (&eclosure_elem);
1767 }
1768 }
1769
1770 if (incomplete && !root)
1771 dfa->eclosures[node].nelem = 0;
1772 else
1773 dfa->eclosures[node] = eclosure;
1774 *new_set = eclosure;
1775 return REG_NOERROR;
1776}
1777
1778
1779/* Functions for token which are used in the parser. */
1780
1781/* Fetch a token from INPUT.
1782 We must not use this function inside bracket expressions. */
1783
1784static void
1785fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1786{
1787 re_string_skip_bytes (input, peek_token (result, input, syntax));
1788}
1789
1790/* Peek a token from INPUT, and return the length of the token.
1791 We must not use this function inside bracket expressions. */
1792
1793static int
1794peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1795{
1796 unsigned char c;
1797
1798 if (re_string_eoi (input))
1799 {
1800 token->type = END_OF_RE;
1801 return 0;
1802 }
1803
1804 c = re_string_peek_byte (input, 0);
1805 token->opr.c = c;
1806
1807 token->word_char = 0;
1808#ifdef RE_ENABLE_I18N
1809 token->mb_partial = 0;
1810 if (input->mb_cur_max > 1
1811 && !re_string_first_byte (input, re_string_cur_idx (input)))
1812 {
1813 token->type = CHARACTER;
1814 token->mb_partial = 1;
1815 return 1;
1816 }
1817#endif
1818 if (c == '\\')
1819 {
1820 unsigned char c2;
1821 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1822 {
1823 token->type = BACK_SLASH;
1824 return 1;
1825 }
1826
1827 c2 = re_string_peek_byte_case (input, 1);
1828 token->opr.c = c2;
1829 token->type = CHARACTER;
1830#ifdef RE_ENABLE_I18N
1831 if (input->mb_cur_max > 1)
1832 {
1833 wint_t wc = re_string_wchar_at (input,
1834 re_string_cur_idx (input) + 1);
1835 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1836 }
1837 else
1838#endif
1839 token->word_char = IS_WORD_CHAR (c2) != 0;
1840
1841 switch (c2)
1842 {
1843 case '|':
1844 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1845 token->type = OP_ALT;
1846 break;
1847 case '1': case '2': case '3': case '4': case '5':
1848 case '6': case '7': case '8': case '9':
1849 if (!(syntax & RE_NO_BK_REFS))
1850 {
1851 token->type = OP_BACK_REF;
1852 token->opr.idx = c2 - '1';
1853 }
1854 break;
1855 case '<':
1856 if (!(syntax & RE_NO_GNU_OPS))
1857 {
1858 token->type = ANCHOR;
1859 token->opr.ctx_type = WORD_FIRST;
1860 }
1861 break;
1862 case '>':
1863 if (!(syntax & RE_NO_GNU_OPS))
1864 {
1865 token->type = ANCHOR;
1866 token->opr.ctx_type = WORD_LAST;
1867 }
1868 break;
1869 case 'b':
1870 if (!(syntax & RE_NO_GNU_OPS))
1871 {
1872 token->type = ANCHOR;
1873 token->opr.ctx_type = WORD_DELIM;
1874 }
1875 break;
1876 case 'B':
1877 if (!(syntax & RE_NO_GNU_OPS))
1878 {
1879 token->type = ANCHOR;
1880 token->opr.ctx_type = NOT_WORD_DELIM;
1881 }
1882 break;
1883 case 'w':
1884 if (!(syntax & RE_NO_GNU_OPS))
1885 token->type = OP_WORD;
1886 break;
1887 case 'W':
1888 if (!(syntax & RE_NO_GNU_OPS))
1889 token->type = OP_NOTWORD;
1890 break;
1891 case 's':
1892 if (!(syntax & RE_NO_GNU_OPS))
1893 token->type = OP_SPACE;
1894 break;
1895 case 'S':
1896 if (!(syntax & RE_NO_GNU_OPS))
1897 token->type = OP_NOTSPACE;
1898 break;
1899 case '`':
1900 if (!(syntax & RE_NO_GNU_OPS))
1901 {
1902 token->type = ANCHOR;
1903 token->opr.ctx_type = BUF_FIRST;
1904 }
1905 break;
1906 case '\'':
1907 if (!(syntax & RE_NO_GNU_OPS))
1908 {
1909 token->type = ANCHOR;
1910 token->opr.ctx_type = BUF_LAST;
1911 }
1912 break;
1913 case '(':
1914 if (!(syntax & RE_NO_BK_PARENS))
1915 token->type = OP_OPEN_SUBEXP;
1916 break;
1917 case ')':
1918 if (!(syntax & RE_NO_BK_PARENS))
1919 token->type = OP_CLOSE_SUBEXP;
1920 break;
1921 case '+':
1922 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1923 token->type = OP_DUP_PLUS;
1924 break;
1925 case '?':
1926 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1927 token->type = OP_DUP_QUESTION;
1928 break;
1929 case '{':
1930 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1931 token->type = OP_OPEN_DUP_NUM;
1932 break;
1933 case '}':
1934 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1935 token->type = OP_CLOSE_DUP_NUM;
1936 break;
1937 default:
1938 break;
1939 }
1940 return 2;
1941 }
1942
1943 token->type = CHARACTER;
1944#ifdef RE_ENABLE_I18N
1945 if (input->mb_cur_max > 1)
1946 {
1947 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1948 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1949 }
1950 else
1951#endif
1952 token->word_char = IS_WORD_CHAR (token->opr.c);
1953
1954 switch (c)
1955 {
1956 case '\n':
1957 if (syntax & RE_NEWLINE_ALT)
1958 token->type = OP_ALT;
1959 break;
1960 case '|':
1961 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1962 token->type = OP_ALT;
1963 break;
1964 case '*':
1965 token->type = OP_DUP_ASTERISK;
1966 break;
1967 case '+':
1968 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1969 token->type = OP_DUP_PLUS;
1970 break;
1971 case '?':
1972 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1973 token->type = OP_DUP_QUESTION;
1974 break;
1975 case '{':
1976 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1977 token->type = OP_OPEN_DUP_NUM;
1978 break;
1979 case '}':
1980 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1981 token->type = OP_CLOSE_DUP_NUM;
1982 break;
1983 case '(':
1984 if (syntax & RE_NO_BK_PARENS)
1985 token->type = OP_OPEN_SUBEXP;
1986 break;
1987 case ')':
1988 if (syntax & RE_NO_BK_PARENS)
1989 token->type = OP_CLOSE_SUBEXP;
1990 break;
1991 case '[':
1992 token->type = OP_OPEN_BRACKET;
1993 break;
1994 case '.':
1995 token->type = OP_PERIOD;
1996 break;
1997 case '^':
1998 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE))
1999 && re_string_cur_idx (input) != 0)
2000 {
2001 char prev = re_string_peek_byte (input, -1);
2002 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2003 break;
2004 }
2005 token->type = ANCHOR;
2006 token->opr.ctx_type = LINE_FIRST;
2007 break;
2008 case '$':
2009 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
2010 && re_string_cur_idx (input) + 1 != re_string_length (input))
2011 {
2012 re_token_t next;
2013 re_string_skip_bytes (input, 1);
2014 peek_token (&next, input, syntax);
2015 re_string_skip_bytes (input, -1);
2016 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2017 break;
2018 }
2019 token->type = ANCHOR;
2020 token->opr.ctx_type = LINE_LAST;
2021 break;
2022 default:
2023 break;
2024 }
2025 return 1;
2026}
2027
2028/* Peek a token from INPUT, and return the length of the token.
2029 We must not use this function out of bracket expressions. */
2030
2031static int
2032peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2033{
2034 unsigned char c;
2035 if (re_string_eoi (input))
2036 {
2037 token->type = END_OF_RE;
2038 return 0;
2039 }
2040 c = re_string_peek_byte (input, 0);
2041 token->opr.c = c;
2042
2043#ifdef RE_ENABLE_I18N
2044 if (input->mb_cur_max > 1
2045 && !re_string_first_byte (input, re_string_cur_idx (input)))
2046 {
2047 token->type = CHARACTER;
2048 return 1;
2049 }
2050#endif /* RE_ENABLE_I18N */
2051
2052 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2053 && re_string_cur_idx (input) + 1 < re_string_length (input))
2054 {
2055 /* In this case, '\' escape a character. */
2056 unsigned char c2;
2057 re_string_skip_bytes (input, 1);
2058 c2 = re_string_peek_byte (input, 0);
2059 token->opr.c = c2;
2060 token->type = CHARACTER;
2061 return 1;
2062 }
2063 if (c == '[') /* '[' is a special char in a bracket exps. */
2064 {
2065 unsigned char c2;
2066 int token_len;
2067 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2068 c2 = re_string_peek_byte (input, 1);
2069 else
2070 c2 = 0;
2071 token->opr.c = c2;
2072 token_len = 2;
2073 switch (c2)
2074 {
2075 case '.':
2076 token->type = OP_OPEN_COLL_ELEM;
2077 break;
2078
2079 case '=':
2080 token->type = OP_OPEN_EQUIV_CLASS;
2081 break;
2082
2083 case ':':
2084 if (syntax & RE_CHAR_CLASSES)
2085 {
2086 token->type = OP_OPEN_CHAR_CLASS;
2087 break;
2088 }
2089 FALLTHROUGH;
2090 default:
2091 token->type = CHARACTER;
2092 token->opr.c = c;
2093 token_len = 1;
2094 break;
2095 }
2096 return token_len;
2097 }
2098 switch (c)
2099 {
2100 case '-':
2101 token->type = OP_CHARSET_RANGE;
2102 break;
2103 case ']':
2104 token->type = OP_CLOSE_BRACKET;
2105 break;
2106 case '^':
2107 token->type = OP_NON_MATCH_LIST;
2108 break;
2109 default:
2110 token->type = CHARACTER;
2111 }
2112 return 1;
2113}
2114
2115
2116/* Functions for parser. */
2117
2118/* Entry point of the parser.
2119 Parse the regular expression REGEXP and return the structure tree.
2120 If an error occurs, ERR is set by error code, and return NULL.
2121 This function build the following tree, from regular expression <reg_exp>:
2122 CAT
2123 / \
2124 / \
2125 <reg_exp> EOR
2126
2127 CAT means concatenation.
2128 EOR means end of regular expression. */
2129
2130static bin_tree_t *
2131parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2132 reg_errcode_t *err)
2133{
2134 re_dfa_t *dfa = preg->buffer;
2135 bin_tree_t *tree, *eor, *root;
2136 re_token_t current_token;
2137 dfa->syntax = syntax;
2138 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2139 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2140 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2141 return NULL;
2142 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2143 if (tree != NULL)
2144 root = create_tree (dfa, tree, eor, CONCAT);
2145 else
2146 root = eor;
2147 if (__glibc_unlikely (eor == NULL || root == NULL))
2148 {
2149 *err = REG_ESPACE;
2150 return NULL;
2151 }
2152 return root;
2153}
2154
2155/* This function build the following tree, from regular expression
2156 <branch1>|<branch2>:
2157 ALT
2158 / \
2159 / \
2160 <branch1> <branch2>
2161
2162 ALT means alternative, which represents the operator '|'. */
2163
2164static bin_tree_t *
2165parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2166 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2167{
2168 re_dfa_t *dfa = preg->buffer;
2169 bin_tree_t *tree, *branch = NULL;
2170 bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2171 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2172 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2173 return NULL;
2174
2175 while (token->type == OP_ALT)
2176 {
2177 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2178 if (token->type != OP_ALT && token->type != END_OF_RE
2179 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2180 {
2181 bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2182 dfa->completed_bkref_map = initial_bkref_map;
2183 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2184 if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
2185 {
2186 if (tree != NULL)
2187 postorder (tree, free_tree, NULL);
2188 return NULL;
2189 }
2190 dfa->completed_bkref_map |= accumulated_bkref_map;
2191 }
2192 else
2193 branch = NULL;
2194 tree = create_tree (dfa, tree, branch, OP_ALT);
2195 if (__glibc_unlikely (tree == NULL))
2196 {
2197 *err = REG_ESPACE;
2198 return NULL;
2199 }
2200 }
2201 return tree;
2202}
2203
2204/* This function build the following tree, from regular expression
2205 <exp1><exp2>:
2206 CAT
2207 / \
2208 / \
2209 <exp1> <exp2>
2210
2211 CAT means concatenation. */
2212
2213static bin_tree_t *
2214parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2215 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2216{
2217 bin_tree_t *tree, *expr;
2218 re_dfa_t *dfa = preg->buffer;
2219 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2220 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2221 return NULL;
2222
2223 while (token->type != OP_ALT && token->type != END_OF_RE
2224 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2225 {
2226 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2227 if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
2228 {
2229 if (tree != NULL)
2230 postorder (tree, free_tree, NULL);
2231 return NULL;
2232 }
2233 if (tree != NULL && expr != NULL)
2234 {
2235 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2236 if (newtree == NULL)
2237 {
2238 postorder (expr, free_tree, NULL);
2239 postorder (tree, free_tree, NULL);
2240 *err = REG_ESPACE;
2241 return NULL;
2242 }
2243 tree = newtree;
2244 }
2245 else if (tree == NULL)
2246 tree = expr;
2247 /* Otherwise expr == NULL, we don't need to create new tree. */
2248 }
2249 return tree;
2250}
2251
2252/* This function build the following tree, from regular expression a*:
2253 *
2254 |
2255 a
2256*/
2257
2258static bin_tree_t *
2259parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2260 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2261{
2262 re_dfa_t *dfa = preg->buffer;
2263 bin_tree_t *tree;
2264 switch (token->type)
2265 {
2266 case CHARACTER:
2267 tree = create_token_tree (dfa, NULL, NULL, token);
2268 if (__glibc_unlikely (tree == NULL))
2269 {
2270 *err = REG_ESPACE;
2271 return NULL;
2272 }
2273#ifdef RE_ENABLE_I18N
2274 if (dfa->mb_cur_max > 1)
2275 {
2276 while (!re_string_eoi (regexp)
2277 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2278 {
2279 bin_tree_t *mbc_remain;
2280 fetch_token (token, regexp, syntax);
2281 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2282 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2283 if (__glibc_unlikely (mbc_remain == NULL || tree == NULL))
2284 {
2285 *err = REG_ESPACE;
2286 return NULL;
2287 }
2288 }
2289 }
2290#endif
2291 break;
2292
2293 case OP_OPEN_SUBEXP:
2294 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2295 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2296 return NULL;
2297 break;
2298
2299 case OP_OPEN_BRACKET:
2300 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2301 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2302 return NULL;
2303 break;
2304
2305 case OP_BACK_REF:
2306 if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx)))
2307 {
2308 *err = REG_ESUBREG;
2309 return NULL;
2310 }
2311 dfa->used_bkref_map |= 1 << token->opr.idx;
2312 tree = create_token_tree (dfa, NULL, NULL, token);
2313 if (__glibc_unlikely (tree == NULL))
2314 {
2315 *err = REG_ESPACE;
2316 return NULL;
2317 }
2318 ++dfa->nbackref;
2319 dfa->has_mb_node = 1;
2320 break;
2321
2322 case OP_OPEN_DUP_NUM:
2323 if (syntax & RE_CONTEXT_INVALID_DUP)
2324 {
2325 *err = REG_BADRPT;
2326 return NULL;
2327 }
2328 FALLTHROUGH;
2329 case OP_DUP_ASTERISK:
2330 case OP_DUP_PLUS:
2331 case OP_DUP_QUESTION:
2332 if (syntax & RE_CONTEXT_INVALID_OPS)
2333 {
2334 *err = REG_BADRPT;
2335 return NULL;
2336 }
2337 else if (syntax & RE_CONTEXT_INDEP_OPS)
2338 {
2339 fetch_token (token, regexp, syntax);
2340 return parse_expression (regexp, preg, token, syntax, nest, err);
2341 }
2342 FALLTHROUGH;
2343 case OP_CLOSE_SUBEXP:
2344 if ((token->type == OP_CLOSE_SUBEXP)
2345 && !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2346 {
2347 *err = REG_ERPAREN;
2348 return NULL;
2349 }
2350 FALLTHROUGH;
2351 case OP_CLOSE_DUP_NUM:
2352 /* We treat it as a normal character. */
2353
2354 /* Then we can these characters as normal characters. */
2355 token->type = CHARACTER;
2356 /* mb_partial and word_char bits should be initialized already
2357 by peek_token. */
2358 tree = create_token_tree (dfa, NULL, NULL, token);
2359 if (__glibc_unlikely (tree == NULL))
2360 {
2361 *err = REG_ESPACE;
2362 return NULL;
2363 }
2364 break;
2365
2366 case ANCHOR:
2367 if ((token->opr.ctx_type
2368 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2369 && dfa->word_ops_used == 0)
2370 init_word_char (dfa);
2371 if (token->opr.ctx_type == WORD_DELIM
2372 || token->opr.ctx_type == NOT_WORD_DELIM)
2373 {
2374 bin_tree_t *tree_first, *tree_last;
2375 if (token->opr.ctx_type == WORD_DELIM)
2376 {
2377 token->opr.ctx_type = WORD_FIRST;
2378 tree_first = create_token_tree (dfa, NULL, NULL, token);
2379 token->opr.ctx_type = WORD_LAST;
2380 }
2381 else
2382 {
2383 token->opr.ctx_type = INSIDE_WORD;
2384 tree_first = create_token_tree (dfa, NULL, NULL, token);
2385 token->opr.ctx_type = INSIDE_NOTWORD;
2386 }
2387 tree_last = create_token_tree (dfa, NULL, NULL, token);
2388 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2389 if (__glibc_unlikely (tree_first == NULL || tree_last == NULL
2390 || tree == NULL))
2391 {
2392 *err = REG_ESPACE;
2393 return NULL;
2394 }
2395 }
2396 else
2397 {
2398 tree = create_token_tree (dfa, NULL, NULL, token);
2399 if (__glibc_unlikely (tree == NULL))
2400 {
2401 *err = REG_ESPACE;
2402 return NULL;
2403 }
2404 }
2405 /* We must return here, since ANCHORs can't be followed
2406 by repetition operators.
2407 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2408 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2409 fetch_token (token, regexp, syntax);
2410 return tree;
2411
2412 case OP_PERIOD:
2413 tree = create_token_tree (dfa, NULL, NULL, token);
2414 if (__glibc_unlikely (tree == NULL))
2415 {
2416 *err = REG_ESPACE;
2417 return NULL;
2418 }
2419 if (dfa->mb_cur_max > 1)
2420 dfa->has_mb_node = 1;
2421 break;
2422
2423 case OP_WORD:
2424 case OP_NOTWORD:
2425 tree = build_charclass_op (dfa, regexp->trans,
2426 "alnum",
2427 "_",
2428 token->type == OP_NOTWORD, err);
2429 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2430 return NULL;
2431 break;
2432
2433 case OP_SPACE:
2434 case OP_NOTSPACE:
2435 tree = build_charclass_op (dfa, regexp->trans,
2436 "space",
2437 "",
2438 token->type == OP_NOTSPACE, err);
2439 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2440 return NULL;
2441 break;
2442
2443 case OP_ALT:
2444 case END_OF_RE:
2445 return NULL;
2446
2447 case BACK_SLASH:
2448 *err = REG_EESCAPE;
2449 return NULL;
2450
2451 default:
2452 /* Must not happen? */
2453 DEBUG_ASSERT (false);
2454 return NULL;
2455 }
2456 fetch_token (token, regexp, syntax);
2457
2458 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2459 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2460 {
2461 bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2462 syntax, err);
2463 if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
2464 {
2465 if (tree != NULL)
2466 postorder (tree, free_tree, NULL);
2467 return NULL;
2468 }
2469 tree = dup_tree;
2470 /* In BRE consecutive duplications are not allowed. */
2471 if ((syntax & RE_CONTEXT_INVALID_DUP)
2472 && (token->type == OP_DUP_ASTERISK
2473 || token->type == OP_OPEN_DUP_NUM))
2474 {
2475 if (tree != NULL)
2476 postorder (tree, free_tree, NULL);
2477 *err = REG_BADRPT;
2478 return NULL;
2479 }
2480 }
2481
2482 return tree;
2483}
2484
2485/* This function build the following tree, from regular expression
2486 (<reg_exp>):
2487 SUBEXP
2488 |
2489 <reg_exp>
2490*/
2491
2492static bin_tree_t *
2493parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2494 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2495{
2496 re_dfa_t *dfa = preg->buffer;
2497 bin_tree_t *tree;
2498 size_t cur_nsub;
2499 cur_nsub = preg->re_nsub++;
2500
2501 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2502
2503 /* The subexpression may be a null string. */
2504 if (token->type == OP_CLOSE_SUBEXP)
2505 tree = NULL;
2506 else
2507 {
2508 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2509 if (__glibc_unlikely (*err == REG_NOERROR
2510 && token->type != OP_CLOSE_SUBEXP))
2511 {
2512 if (tree != NULL)
2513 postorder (tree, free_tree, NULL);
2514 *err = REG_EPAREN;
2515 }
2516 if (__glibc_unlikely (*err != REG_NOERROR))
2517 return NULL;
2518 }
2519
2520 if (cur_nsub <= '9' - '1')
2521 dfa->completed_bkref_map |= 1 << cur_nsub;
2522
2523 tree = create_tree (dfa, tree, NULL, SUBEXP);
2524 if (__glibc_unlikely (tree == NULL))
2525 {
2526 *err = REG_ESPACE;
2527 return NULL;
2528 }
2529 tree->token.opr.idx = cur_nsub;
2530 return tree;
2531}
2532
2533/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2534
2535static bin_tree_t *
2536parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2537 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2538{
2539 bin_tree_t *tree = NULL, *old_tree = NULL;
2540 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2541 re_token_t start_token = *token;
2542
2543 if (token->type == OP_OPEN_DUP_NUM)
2544 {
2545 end = 0;
2546 start = fetch_number (regexp, token, syntax);
2547 if (start == -1)
2548 {
2549 if (token->type == CHARACTER && token->opr.c == ',')
2550 start = 0; /* We treat "{,m}" as "{0,m}". */
2551 else
2552 {
2553 *err = REG_BADBR; /* <re>{} is invalid. */
2554 return NULL;
2555 }
2556 }
2557 if (__glibc_likely (start != -2))
2558 {
2559 /* We treat "{n}" as "{n,n}". */
2560 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2561 : ((token->type == CHARACTER && token->opr.c == ',')
2562 ? fetch_number (regexp, token, syntax) : -2));
2563 }
2564 if (__glibc_unlikely (start == -2 || end == -2))
2565 {
2566 /* Invalid sequence. */
2567 if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
2568 {
2569 if (token->type == END_OF_RE)
2570 *err = REG_EBRACE;
2571 else
2572 *err = REG_BADBR;
2573
2574 return NULL;
2575 }
2576
2577 /* If the syntax bit is set, rollback. */
2578 re_string_set_index (regexp, start_idx);
2579 *token = start_token;
2580 token->type = CHARACTER;
2581 /* mb_partial and word_char bits should be already initialized by
2582 peek_token. */
2583 return elem;
2584 }
2585
2586 if (__glibc_unlikely ((end != -1 && start > end)
2587 || token->type != OP_CLOSE_DUP_NUM))
2588 {
2589 /* First number greater than second. */
2590 *err = REG_BADBR;
2591 return NULL;
2592 }
2593
2594 if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end)))
2595 {
2596 *err = REG_ESIZE;
2597 return NULL;
2598 }
2599 }
2600 else
2601 {
2602 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2603 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2604 }
2605
2606 fetch_token (token, regexp, syntax);
2607
2608 if (__glibc_unlikely (elem == NULL))
2609 return NULL;
2610 if (__glibc_unlikely (start == 0 && end == 0))
2611 {
2612 postorder (elem, free_tree, NULL);
2613 return NULL;
2614 }
2615
2616 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2617 if (__glibc_unlikely (start > 0))
2618 {
2619 tree = elem;
2620 for (i = 2; i <= start; ++i)
2621 {
2622 elem = duplicate_tree (elem, dfa);
2623 tree = create_tree (dfa, tree, elem, CONCAT);
2624 if (__glibc_unlikely (elem == NULL || tree == NULL))
2625 goto parse_dup_op_espace;
2626 }
2627
2628 if (start == end)
2629 return tree;
2630
2631 /* Duplicate ELEM before it is marked optional. */
2632 elem = duplicate_tree (elem, dfa);
2633 if (__glibc_unlikely (elem == NULL))
2634 goto parse_dup_op_espace;
2635 old_tree = tree;
2636 }
2637 else
2638 old_tree = NULL;
2639
2640 if (elem->token.type == SUBEXP)
2641 {
2642 uintptr_t subidx = elem->token.opr.idx;
2643 postorder (elem, mark_opt_subexp, (void *) subidx);
2644 }
2645
2646 tree = create_tree (dfa, elem, NULL,
2647 (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2648 if (__glibc_unlikely (tree == NULL))
2649 goto parse_dup_op_espace;
2650
2651 /* This loop is actually executed only when end != -1,
2652 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2653 already created the start+1-th copy. */
2654 if (TYPE_SIGNED (Idx) || end != -1)
2655 for (i = start + 2; i <= end; ++i)
2656 {
2657 elem = duplicate_tree (elem, dfa);
2658 tree = create_tree (dfa, tree, elem, CONCAT);
2659 if (__glibc_unlikely (elem == NULL || tree == NULL))
2660 goto parse_dup_op_espace;
2661
2662 tree = create_tree (dfa, tree, NULL, OP_ALT);
2663 if (__glibc_unlikely (tree == NULL))
2664 goto parse_dup_op_espace;
2665 }
2666
2667 if (old_tree)
2668 tree = create_tree (dfa, old_tree, tree, CONCAT);
2669
2670 return tree;
2671
2672 parse_dup_op_espace:
2673 *err = REG_ESPACE;
2674 return NULL;
2675}
2676
2677/* Size of the names for collating symbol/equivalence_class/character_class.
2678 I'm not sure, but maybe enough. */
2679#define BRACKET_NAME_BUF_SIZE 32
2680
2681#ifndef _LIBC
2682
2683# ifdef RE_ENABLE_I18N
2684/* Convert the byte B to the corresponding wide character. In a
2685 unibyte locale, treat B as itself. In a multibyte locale, return
2686 WEOF if B is an encoding error. */
2687static wint_t
2688parse_byte (unsigned char b, re_charset_t *mbcset)
2689{
2690 return mbcset == NULL ? b : __btowc (b);
2691}
2692# endif
2693
2694 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2695 Build the range expression which starts from START_ELEM, and ends
2696 at END_ELEM. The result are written to MBCSET and SBCSET.
2697 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2698 mbcset->range_ends, is a pointer argument since we may
2699 update it. */
2700
2701static reg_errcode_t
2702# ifdef RE_ENABLE_I18N
2703build_range_exp (const reg_syntax_t syntax,
2704 bitset_t sbcset,
2705 re_charset_t *mbcset,
2706 Idx *range_alloc,
2707 const bracket_elem_t *start_elem,
2708 const bracket_elem_t *end_elem)
2709# else /* not RE_ENABLE_I18N */
2710build_range_exp (const reg_syntax_t syntax,
2711 bitset_t sbcset,
2712 const bracket_elem_t *start_elem,
2713 const bracket_elem_t *end_elem)
2714# endif /* not RE_ENABLE_I18N */
2715{
2716 unsigned int start_ch, end_ch;
2717 /* Equivalence Classes and Character Classes can't be a range start/end. */
2718 if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2719 || start_elem->type == CHAR_CLASS
2720 || end_elem->type == EQUIV_CLASS
2721 || end_elem->type == CHAR_CLASS))
2722 return REG_ERANGE;
2723
2724 /* We can handle no multi character collating elements without libc
2725 support. */
2726 if (__glibc_unlikely ((start_elem->type == COLL_SYM
2727 && strlen ((char *) start_elem->opr.name) > 1)
2728 || (end_elem->type == COLL_SYM
2729 && strlen ((char *) end_elem->opr.name) > 1)))
2730 return REG_ECOLLATE;
2731
2732# ifdef RE_ENABLE_I18N
2733 {
2734 wchar_t wc;
2735 wint_t start_wc;
2736 wint_t end_wc;
2737
2738 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2739 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2740 : 0));
2741 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2742 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2743 : 0));
2744 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2745 ? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
2746 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2747 ? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
2748 if (start_wc == WEOF || end_wc == WEOF)
2749 return REG_ECOLLATE;
2750 else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2751 && start_wc > end_wc))
2752 return REG_ERANGE;
2753
2754 /* Got valid collation sequence values, add them as a new entry.
2755 However, for !_LIBC we have no collation elements: if the
2756 character set is single byte, the single byte character set
2757 that we build below suffices. parse_bracket_exp passes
2758 no MBCSET if dfa->mb_cur_max == 1. */
2759 if (mbcset)
2760 {
2761 /* Check the space of the arrays. */
2762 if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2763 {
2764 /* There is not enough space, need realloc. */
2765 wchar_t *new_array_start, *new_array_end;
2766 Idx new_nranges;
2767
2768 /* +1 in case of mbcset->nranges is 0. */
2769 new_nranges = 2 * mbcset->nranges + 1;
2770 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2771 are NULL if *range_alloc == 0. */
2772 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2773 new_nranges);
2774 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2775 new_nranges);
2776
2777 if (__glibc_unlikely (new_array_start == NULL
2778 || new_array_end == NULL))
2779 {
2780 re_free (new_array_start);
2781 re_free (new_array_end);
2782 return REG_ESPACE;
2783 }
2784
2785 mbcset->range_starts = new_array_start;
2786 mbcset->range_ends = new_array_end;
2787 *range_alloc = new_nranges;
2788 }
2789
2790 mbcset->range_starts[mbcset->nranges] = start_wc;
2791 mbcset->range_ends[mbcset->nranges++] = end_wc;
2792 }
2793
2794 /* Build the table for single byte characters. */
2795 for (wc = 0; wc < SBC_MAX; ++wc)
2796 {
2797 if (start_wc <= wc && wc <= end_wc)
2798 bitset_set (sbcset, wc);
2799 }
2800 }
2801# else /* not RE_ENABLE_I18N */
2802 {
2803 unsigned int ch;
2804 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2805 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2806 : 0));
2807 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2808 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2809 : 0));
2810 if (start_ch > end_ch)
2811 return REG_ERANGE;
2812 /* Build the table for single byte characters. */
2813 for (ch = 0; ch < SBC_MAX; ++ch)
2814 if (start_ch <= ch && ch <= end_ch)
2815 bitset_set (sbcset, ch);
2816 }
2817# endif /* not RE_ENABLE_I18N */
2818 return REG_NOERROR;
2819}
2820#endif /* not _LIBC */
2821
2822#ifndef _LIBC
2823/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2824 Build the collating element which is represented by NAME.
2825 The result are written to MBCSET and SBCSET.
2826 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2827 pointer argument since we may update it. */
2828
2829static reg_errcode_t
2830# ifdef RE_ENABLE_I18N
2831build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2832 Idx *coll_sym_alloc, const unsigned char *name)
2833# else /* not RE_ENABLE_I18N */
2834build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2835# endif /* not RE_ENABLE_I18N */
2836{
2837 size_t name_len = strlen ((const char *) name);
2838 if (__glibc_unlikely (name_len != 1))
2839 return REG_ECOLLATE;
2840 else
2841 {
2842 bitset_set (sbcset, name[0]);
2843 return REG_NOERROR;
2844 }
2845}
2846#endif /* not _LIBC */
2847
2848/* This function parse bracket expression like "[abc]", "[a-c]",
2849 "[[.a-a.]]" etc. */
2850
2851static bin_tree_t *
2852parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2853 reg_syntax_t syntax, reg_errcode_t *err)
2854{
2855#ifdef _LIBC
2856 const unsigned char *collseqmb;
2857 const char *collseqwc;
2858 uint32_t nrules;
2859 int32_t table_size;
2860 const int32_t *symb_table;
2861 const unsigned char *extra;
2862
2863 /* Local function for parse_bracket_exp used in _LIBC environment.
2864 Seek the collating symbol entry corresponding to NAME.
2865 Return the index of the symbol in the SYMB_TABLE,
2866 or -1 if not found. */
2867
2868 auto inline int32_t
2869 __attribute__ ((always_inline))
2870 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2871 {
2872 int32_t elem;
2873
2874 for (elem = 0; elem < table_size; elem++)
2875 if (symb_table[2 * elem] != 0)
2876 {
2877 int32_t idx = symb_table[2 * elem + 1];
2878 /* Skip the name of collating element name. */
2879 idx += 1 + extra[idx];
2880 if (/* Compare the length of the name. */
2881 name_len == extra[idx]
2882 /* Compare the name. */
2883 && memcmp (name, &extra[idx + 1], name_len) == 0)
2884 /* Yep, this is the entry. */
2885 return elem;
2886 }
2887 return -1;
2888 }
2889
2890 /* Local function for parse_bracket_exp used in _LIBC environment.
2891 Look up the collation sequence value of BR_ELEM.
2892 Return the value if succeeded, UINT_MAX otherwise. */
2893
2894 auto inline unsigned int
2895 __attribute__ ((always_inline))
2896 lookup_collation_sequence_value (bracket_elem_t *br_elem)
2897 {
2898 if (br_elem->type == SB_CHAR)
2899 {
2900 /*
2901 if (MB_CUR_MAX == 1)
2902 */
2903 if (nrules == 0)
2904 return collseqmb[br_elem->opr.ch];
2905 else
2906 {
2907 wint_t wc = __btowc (br_elem->opr.ch);
2908 return __collseq_table_lookup (collseqwc, wc);
2909 }
2910 }
2911 else if (br_elem->type == MB_CHAR)
2912 {
2913 if (nrules != 0)
2914 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2915 }
2916 else if (br_elem->type == COLL_SYM)
2917 {
2918 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2919 if (nrules != 0)
2920 {
2921 int32_t elem, idx;
2922 elem = seek_collating_symbol_entry (br_elem->opr.name,
2923 sym_name_len);
2924 if (elem != -1)
2925 {
2926 /* We found the entry. */
2927 idx = symb_table[2 * elem + 1];
2928 /* Skip the name of collating element name. */
2929 idx += 1 + extra[idx];
2930 /* Skip the byte sequence of the collating element. */
2931 idx += 1 + extra[idx];
2932 /* Adjust for the alignment. */
2933 idx = (idx + 3) & ~3;
2934 /* Skip the multibyte collation sequence value. */
2935 idx += sizeof (unsigned int);
2936 /* Skip the wide char sequence of the collating element. */
2937 idx += sizeof (unsigned int) *
2938 (1 + *(unsigned int *) (extra + idx));
2939 /* Return the collation sequence value. */
2940 return *(unsigned int *) (extra + idx);
2941 }
2942 else if (sym_name_len == 1)
2943 {
2944 /* No valid character. Match it as a single byte
2945 character. */
2946 return collseqmb[br_elem->opr.name[0]];
2947 }
2948 }
2949 else if (sym_name_len == 1)
2950 return collseqmb[br_elem->opr.name[0]];
2951 }
2952 return UINT_MAX;
2953 }
2954
2955 /* Local function for parse_bracket_exp used in _LIBC environment.
2956 Build the range expression which starts from START_ELEM, and ends
2957 at END_ELEM. The result are written to MBCSET and SBCSET.
2958 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2959 mbcset->range_ends, is a pointer argument since we may
2960 update it. */
2961
2962 auto inline reg_errcode_t
2963 __attribute__ ((always_inline))
2964 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2965 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2966 {
2967 unsigned int ch;
2968 uint32_t start_collseq;
2969 uint32_t end_collseq;
2970
2971 /* Equivalence Classes and Character Classes can't be a range
2972 start/end. */
2973 if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2974 || start_elem->type == CHAR_CLASS
2975 || end_elem->type == EQUIV_CLASS
2976 || end_elem->type == CHAR_CLASS))
2977 return REG_ERANGE;
2978
2979 /* FIXME: Implement rational ranges here, too. */
2980 start_collseq = lookup_collation_sequence_value (start_elem);
2981 end_collseq = lookup_collation_sequence_value (end_elem);
2982 /* Check start/end collation sequence values. */
2983 if (__glibc_unlikely (start_collseq == UINT_MAX
2984 || end_collseq == UINT_MAX))
2985 return REG_ECOLLATE;
2986 if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2987 && start_collseq > end_collseq))
2988 return REG_ERANGE;
2989
2990 /* Got valid collation sequence values, add them as a new entry.
2991 However, if we have no collation elements, and the character set
2992 is single byte, the single byte character set that we
2993 build below suffices. */
2994 if (nrules > 0 || dfa->mb_cur_max > 1)
2995 {
2996 /* Check the space of the arrays. */
2997 if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2998 {
2999 /* There is not enough space, need realloc. */
3000 uint32_t *new_array_start;
3001 uint32_t *new_array_end;
3002 Idx new_nranges;
3003
3004 /* +1 in case of mbcset->nranges is 0. */
3005 new_nranges = 2 * mbcset->nranges + 1;
3006 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
3007 new_nranges);
3008 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
3009 new_nranges);
3010
3011 if (__glibc_unlikely (new_array_start == NULL
3012 || new_array_end == NULL))
3013 return REG_ESPACE;
3014
3015 mbcset->range_starts = new_array_start;
3016 mbcset->range_ends = new_array_end;
3017 *range_alloc = new_nranges;
3018 }
3019
3020 mbcset->range_starts[mbcset->nranges] = start_collseq;
3021 mbcset->range_ends[mbcset->nranges++] = end_collseq;
3022 }
3023
3024 /* Build the table for single byte characters. */
3025 for (ch = 0; ch < SBC_MAX; ch++)
3026 {
3027 uint32_t ch_collseq;
3028 /*
3029 if (MB_CUR_MAX == 1)
3030 */
3031 if (nrules == 0)
3032 ch_collseq = collseqmb[ch];
3033 else
3034 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3035 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3036 bitset_set (sbcset, ch);
3037 }
3038 return REG_NOERROR;
3039 }
3040
3041 /* Local function for parse_bracket_exp used in _LIBC environment.
3042 Build the collating element which is represented by NAME.
3043 The result are written to MBCSET and SBCSET.
3044 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3045 pointer argument since we may update it. */
3046
3047 auto inline reg_errcode_t
3048 __attribute__ ((always_inline))
3049 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3050 Idx *coll_sym_alloc, const unsigned char *name)
3051 {
3052 int32_t elem, idx;
3053 size_t name_len = strlen ((const char *) name);
3054 if (nrules != 0)
3055 {
3056 elem = seek_collating_symbol_entry (name, name_len);
3057 if (elem != -1)
3058 {
3059 /* We found the entry. */
3060 idx = symb_table[2 * elem + 1];
3061 /* Skip the name of collating element name. */
3062 idx += 1 + extra[idx];
3063 }
3064 else if (name_len == 1)
3065 {
3066 /* No valid character, treat it as a normal
3067 character. */
3068 bitset_set (sbcset, name[0]);
3069 return REG_NOERROR;
3070 }
3071 else
3072 return REG_ECOLLATE;
3073
3074 /* Got valid collation sequence, add it as a new entry. */
3075 /* Check the space of the arrays. */
3076 if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
3077 {
3078 /* Not enough, realloc it. */
3079 /* +1 in case of mbcset->ncoll_syms is 0. */
3080 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3081 /* Use realloc since mbcset->coll_syms is NULL
3082 if *alloc == 0. */
3083 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3084 new_coll_sym_alloc);
3085 if (__glibc_unlikely (new_coll_syms == NULL))
3086 return REG_ESPACE;
3087 mbcset->coll_syms = new_coll_syms;
3088 *coll_sym_alloc = new_coll_sym_alloc;
3089 }
3090 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3091 return REG_NOERROR;
3092 }
3093 else
3094 {
3095 if (__glibc_unlikely (name_len != 1))
3096 return REG_ECOLLATE;
3097 else
3098 {
3099 bitset_set (sbcset, name[0]);
3100 return REG_NOERROR;
3101 }
3102 }
3103 }
3104#endif
3105
3106 re_token_t br_token;
3107 re_bitset_ptr_t sbcset;
3108#ifdef RE_ENABLE_I18N
3109 re_charset_t *mbcset;
3110 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3111 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3112#endif /* not RE_ENABLE_I18N */
3113 bool non_match = false;
3114 bin_tree_t *work_tree;
3115 int token_len;
3116 bool first_round = true;
3117#ifdef _LIBC
3118 collseqmb = (const unsigned char *)
3119 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3120 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3121 if (nrules)
3122 {
3123 /*
3124 if (MB_CUR_MAX > 1)
3125 */
3126 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3127 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3128 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3129 _NL_COLLATE_SYMB_TABLEMB);
3130 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3131 _NL_COLLATE_SYMB_EXTRAMB);
3132 }
3133#endif
3134 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3135#ifdef RE_ENABLE_I18N
3136 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3137#endif /* RE_ENABLE_I18N */
3138#ifdef RE_ENABLE_I18N
3139 if (__glibc_unlikely (sbcset == NULL || mbcset == NULL))
3140#else
3141 if (__glibc_unlikely (sbcset == NULL))
3142#endif /* RE_ENABLE_I18N */
3143 {
3144 re_free (sbcset);
3145#ifdef RE_ENABLE_I18N
3146 re_free (mbcset);
3147#endif
3148 *err = REG_ESPACE;
3149 return NULL;
3150 }
3151
3152 token_len = peek_token_bracket (token, regexp, syntax);
3153 if (__glibc_unlikely (token->type == END_OF_RE))
3154 {
3155 *err = REG_BADPAT;
3156 goto parse_bracket_exp_free_return;
3157 }
3158 if (token->type == OP_NON_MATCH_LIST)
3159 {
3160#ifdef RE_ENABLE_I18N
3161 mbcset->non_match = 1;
3162#endif /* not RE_ENABLE_I18N */
3163 non_match = true;
3164 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3165 bitset_set (sbcset, '\n');
3166 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3167 token_len = peek_token_bracket (token, regexp, syntax);
3168 if (__glibc_unlikely (token->type == END_OF_RE))
3169 {
3170 *err = REG_BADPAT;
3171 goto parse_bracket_exp_free_return;
3172 }
3173 }
3174
3175 /* We treat the first ']' as a normal character. */
3176 if (token->type == OP_CLOSE_BRACKET)
3177 token->type = CHARACTER;
3178
3179 while (1)
3180 {
3181 bracket_elem_t start_elem, end_elem;
3182 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3183 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3184 reg_errcode_t ret;
3185 int token_len2 = 0;
3186 bool is_range_exp = false;
3187 re_token_t token2;
3188
3189 start_elem.opr.name = start_name_buf;
3190 start_elem.type = COLL_SYM;
3191 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3192 syntax, first_round);
3193 if (__glibc_unlikely (ret != REG_NOERROR))
3194 {
3195 *err = ret;
3196 goto parse_bracket_exp_free_return;
3197 }
3198 first_round = false;
3199
3200 /* Get information about the next token. We need it in any case. */
3201 token_len = peek_token_bracket (token, regexp, syntax);
3202
3203 /* Do not check for ranges if we know they are not allowed. */
3204 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3205 {
3206 if (__glibc_unlikely (token->type == END_OF_RE))
3207 {
3208 *err = REG_EBRACK;
3209 goto parse_bracket_exp_free_return;
3210 }
3211 if (token->type == OP_CHARSET_RANGE)
3212 {
3213 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3214 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3215 if (__glibc_unlikely (token2.type == END_OF_RE))
3216 {
3217 *err = REG_EBRACK;
3218 goto parse_bracket_exp_free_return;
3219 }
3220 if (token2.type == OP_CLOSE_BRACKET)
3221 {
3222 /* We treat the last '-' as a normal character. */
3223 re_string_skip_bytes (regexp, -token_len);
3224 token->type = CHARACTER;
3225 }
3226 else
3227 is_range_exp = true;
3228 }
3229 }
3230
3231 if (is_range_exp == true)
3232 {
3233 end_elem.opr.name = end_name_buf;
3234 end_elem.type = COLL_SYM;
3235 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3236 dfa, syntax, true);
3237 if (__glibc_unlikely (ret != REG_NOERROR))
3238 {
3239 *err = ret;
3240 goto parse_bracket_exp_free_return;
3241 }
3242
3243 token_len = peek_token_bracket (token, regexp, syntax);
3244
3245#ifdef _LIBC
3246 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3247 &start_elem, &end_elem);
3248#else
3249# ifdef RE_ENABLE_I18N
3250 *err = build_range_exp (syntax, sbcset,
3251 dfa->mb_cur_max > 1 ? mbcset : NULL,
3252 &range_alloc, &start_elem, &end_elem);
3253# else
3254 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3255# endif
3256#endif /* RE_ENABLE_I18N */
3257 if (__glibc_unlikely (*err != REG_NOERROR))
3258 goto parse_bracket_exp_free_return;
3259 }
3260 else
3261 {
3262 switch (start_elem.type)
3263 {
3264 case SB_CHAR:
3265 bitset_set (sbcset, start_elem.opr.ch);
3266 break;
3267#ifdef RE_ENABLE_I18N
3268 case MB_CHAR:
3269 /* Check whether the array has enough space. */
3270 if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
3271 {
3272 wchar_t *new_mbchars;
3273 /* Not enough, realloc it. */
3274 /* +1 in case of mbcset->nmbchars is 0. */
3275 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3276 /* Use realloc since array is NULL if *alloc == 0. */
3277 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3278 mbchar_alloc);
3279 if (__glibc_unlikely (new_mbchars == NULL))
3280 goto parse_bracket_exp_espace;
3281 mbcset->mbchars = new_mbchars;
3282 }
3283 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3284 break;
3285#endif /* RE_ENABLE_I18N */
3286 case EQUIV_CLASS:
3287 *err = build_equiv_class (sbcset,
3288#ifdef RE_ENABLE_I18N
3289 mbcset, &equiv_class_alloc,
3290#endif /* RE_ENABLE_I18N */
3291 start_elem.opr.name);
3292 if (__glibc_unlikely (*err != REG_NOERROR))
3293 goto parse_bracket_exp_free_return;
3294 break;
3295 case COLL_SYM:
3296 *err = build_collating_symbol (sbcset,
3297#ifdef RE_ENABLE_I18N
3298 mbcset, &coll_sym_alloc,
3299#endif /* RE_ENABLE_I18N */
3300 start_elem.opr.name);
3301 if (__glibc_unlikely (*err != REG_NOERROR))
3302 goto parse_bracket_exp_free_return;
3303 break;
3304 case CHAR_CLASS:
3305 *err = build_charclass (regexp->trans, sbcset,
3306#ifdef RE_ENABLE_I18N
3307 mbcset, &char_class_alloc,
3308#endif /* RE_ENABLE_I18N */
3309 (const char *) start_elem.opr.name,
3310 syntax);
3311 if (__glibc_unlikely (*err != REG_NOERROR))
3312 goto parse_bracket_exp_free_return;
3313 break;
3314 default:
3315 DEBUG_ASSERT (false);
3316 break;
3317 }
3318 }
3319 if (__glibc_unlikely (token->type == END_OF_RE))
3320 {
3321 *err = REG_EBRACK;
3322 goto parse_bracket_exp_free_return;
3323 }
3324 if (token->type == OP_CLOSE_BRACKET)
3325 break;
3326 }
3327
3328 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3329
3330 /* If it is non-matching list. */
3331 if (non_match)
3332 bitset_not (sbcset);
3333
3334#ifdef RE_ENABLE_I18N
3335 /* Ensure only single byte characters are set. */
3336 if (dfa->mb_cur_max > 1)
3337 bitset_mask (sbcset, dfa->sb_char);
3338
3339 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3340 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3341 || mbcset->non_match)))
3342 {
3343 bin_tree_t *mbc_tree;
3344 int sbc_idx;
3345 /* Build a tree for complex bracket. */
3346 dfa->has_mb_node = 1;
3347 br_token.type = COMPLEX_BRACKET;
3348 br_token.opr.mbcset = mbcset;
3349 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3350 if (__glibc_unlikely (mbc_tree == NULL))
3351 goto parse_bracket_exp_espace;
3352 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3353 if (sbcset[sbc_idx])
3354 break;
3355 /* If there are no bits set in sbcset, there is no point
3356 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3357 if (sbc_idx < BITSET_WORDS)
3358 {
3359 /* Build a tree for simple bracket. */
3360 br_token.type = SIMPLE_BRACKET;
3361 br_token.opr.sbcset = sbcset;
3362 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3363 if (__glibc_unlikely (work_tree == NULL))
3364 goto parse_bracket_exp_espace;
3365
3366 /* Then join them by ALT node. */
3367 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3368 if (__glibc_unlikely (work_tree == NULL))
3369 goto parse_bracket_exp_espace;
3370 }
3371 else
3372 {
3373 re_free (sbcset);
3374 work_tree = mbc_tree;
3375 }
3376 }
3377 else
3378#endif /* not RE_ENABLE_I18N */
3379 {
3380#ifdef RE_ENABLE_I18N
3381 free_charset (mbcset);
3382#endif
3383 /* Build a tree for simple bracket. */
3384 br_token.type = SIMPLE_BRACKET;
3385 br_token.opr.sbcset = sbcset;
3386 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3387 if (__glibc_unlikely (work_tree == NULL))
3388 goto parse_bracket_exp_espace;
3389 }
3390 return work_tree;
3391
3392 parse_bracket_exp_espace:
3393 *err = REG_ESPACE;
3394 parse_bracket_exp_free_return:
3395 re_free (sbcset);
3396#ifdef RE_ENABLE_I18N
3397 free_charset (mbcset);
3398#endif /* RE_ENABLE_I18N */
3399 return NULL;
3400}
3401
3402/* Parse an element in the bracket expression. */
3403
3404static reg_errcode_t
3405parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3406 re_token_t *token, int token_len, re_dfa_t *dfa,
3407 reg_syntax_t syntax, bool accept_hyphen)
3408{
3409#ifdef RE_ENABLE_I18N
3410 int cur_char_size;
3411 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3412 if (cur_char_size > 1)
3413 {
3414 elem->type = MB_CHAR;
3415 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3416 re_string_skip_bytes (regexp, cur_char_size);
3417 return REG_NOERROR;
3418 }
3419#endif /* RE_ENABLE_I18N */
3420 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3421 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3422 || token->type == OP_OPEN_EQUIV_CLASS)
3423 return parse_bracket_symbol (elem, regexp, token);
3424 if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
3425 {
3426 /* A '-' must only appear as anything but a range indicator before
3427 the closing bracket. Everything else is an error. */
3428 re_token_t token2;
3429 (void) peek_token_bracket (&token2, regexp, syntax);
3430 if (token2.type != OP_CLOSE_BRACKET)
3431 /* The actual error value is not standardized since this whole
3432 case is undefined. But ERANGE makes good sense. */
3433 return REG_ERANGE;
3434 }
3435 elem->type = SB_CHAR;
3436 elem->opr.ch = token->opr.c;
3437 return REG_NOERROR;
3438}
3439
3440/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3441 such as [:<character_class>:], [.<collating_element>.], and
3442 [=<equivalent_class>=]. */
3443
3444static reg_errcode_t
3445parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3446 re_token_t *token)
3447{
3448 unsigned char ch, delim = token->opr.c;
3449 int i = 0;
3450 if (re_string_eoi(regexp))
3451 return REG_EBRACK;
3452 for (;; ++i)
3453 {
3454 if (i >= BRACKET_NAME_BUF_SIZE)
3455 return REG_EBRACK;
3456 if (token->type == OP_OPEN_CHAR_CLASS)
3457 ch = re_string_fetch_byte_case (regexp);
3458 else
3459 ch = re_string_fetch_byte (regexp);
3460 if (re_string_eoi(regexp))
3461 return REG_EBRACK;
3462 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3463 break;
3464 elem->opr.name[i] = ch;
3465 }
3466 re_string_skip_bytes (regexp, 1);
3467 elem->opr.name[i] = '\0';
3468 switch (token->type)
3469 {
3470 case OP_OPEN_COLL_ELEM:
3471 elem->type = COLL_SYM;
3472 break;
3473 case OP_OPEN_EQUIV_CLASS:
3474 elem->type = EQUIV_CLASS;
3475 break;
3476 case OP_OPEN_CHAR_CLASS:
3477 elem->type = CHAR_CLASS;
3478 break;
3479 default:
3480 break;
3481 }
3482 return REG_NOERROR;
3483}
3484
3485 /* Helper function for parse_bracket_exp.
3486 Build the equivalence class which is represented by NAME.
3487 The result are written to MBCSET and SBCSET.
3488 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3489 is a pointer argument since we may update it. */
3490
3491static reg_errcode_t
3492#ifdef RE_ENABLE_I18N
3493build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3494 Idx *equiv_class_alloc, const unsigned char *name)
3495#else /* not RE_ENABLE_I18N */
3496build_equiv_class (bitset_t sbcset, const unsigned char *name)
3497#endif /* not RE_ENABLE_I18N */
3498{
3499#ifdef _LIBC
3500 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3501 if (nrules != 0)
3502 {
3503 const int32_t *table, *indirect;
3504 const unsigned char *weights, *extra, *cp;
3505 unsigned char char_buf[2];
3506 int32_t idx1, idx2;
3507 unsigned int ch;
3508 size_t len;
3509 /* Calculate the index for equivalence class. */
3510 cp = name;
3511 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3512 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3513 _NL_COLLATE_WEIGHTMB);
3514 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3515 _NL_COLLATE_EXTRAMB);
3516 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3517 _NL_COLLATE_INDIRECTMB);
3518 idx1 = findidx (table, indirect, extra, &cp, -1);
3519 if (__glibc_unlikely (idx1 == 0 || *cp != '\0'))
3520 /* This isn't a valid character. */
3521 return REG_ECOLLATE;
3522
3523 /* Build single byte matching table for this equivalence class. */
3524 len = weights[idx1 & 0xffffff];
3525 for (ch = 0; ch < SBC_MAX; ++ch)
3526 {
3527 char_buf[0] = ch;
3528 cp = char_buf;
3529 idx2 = findidx (table, indirect, extra, &cp, 1);
3530/*
3531 idx2 = table[ch];
3532*/
3533 if (idx2 == 0)
3534 /* This isn't a valid character. */
3535 continue;
3536 /* Compare only if the length matches and the collation rule
3537 index is the same. */
3538 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3539 && memcmp (weights + (idx1 & 0xffffff) + 1,
3540 weights + (idx2 & 0xffffff) + 1, len) == 0)
3541 bitset_set (sbcset, ch);
3542 }
3543 /* Check whether the array has enough space. */
3544 if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
3545 {
3546 /* Not enough, realloc it. */
3547 /* +1 in case of mbcset->nequiv_classes is 0. */
3548 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3549 /* Use realloc since the array is NULL if *alloc == 0. */
3550 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3551 int32_t,
3552 new_equiv_class_alloc);
3553 if (__glibc_unlikely (new_equiv_classes == NULL))
3554 return REG_ESPACE;
3555 mbcset->equiv_classes = new_equiv_classes;
3556 *equiv_class_alloc = new_equiv_class_alloc;
3557 }
3558 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3559 }
3560 else
3561#endif /* _LIBC */
3562 {
3563 if (__glibc_unlikely (strlen ((const char *) name) != 1))
3564 return REG_ECOLLATE;
3565 bitset_set (sbcset, *name);
3566 }
3567 return REG_NOERROR;
3568}
3569
3570 /* Helper function for parse_bracket_exp.
3571 Build the character class which is represented by NAME.
3572 The result are written to MBCSET and SBCSET.
3573 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3574 is a pointer argument since we may update it. */
3575
3576static reg_errcode_t
3577#ifdef RE_ENABLE_I18N
3578build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3579 re_charset_t *mbcset, Idx *char_class_alloc,
3580 const char *class_name, reg_syntax_t syntax)
3581#else /* not RE_ENABLE_I18N */
3582build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3583 const char *class_name, reg_syntax_t syntax)
3584#endif /* not RE_ENABLE_I18N */
3585{
3586 int i;
3587 const char *name = class_name;
3588
3589 /* In case of REG_ICASE "upper" and "lower" match the both of
3590 upper and lower cases. */
3591 if ((syntax & RE_ICASE)
3592 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3593 name = "alpha";
3594
3595#ifdef RE_ENABLE_I18N
3596 /* Check the space of the arrays. */
3597 if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
3598 {
3599 /* Not enough, realloc it. */
3600 /* +1 in case of mbcset->nchar_classes is 0. */
3601 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3602 /* Use realloc since array is NULL if *alloc == 0. */
3603 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3604 new_char_class_alloc);
3605 if (__glibc_unlikely (new_char_classes == NULL))
3606 return REG_ESPACE;
3607 mbcset->char_classes = new_char_classes;
3608 *char_class_alloc = new_char_class_alloc;
3609 }
3610 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3611#endif /* RE_ENABLE_I18N */
3612
3613#define BUILD_CHARCLASS_LOOP(ctype_func) \
3614 do { \
3615 if (__glibc_unlikely (trans != NULL)) \
3616 { \
3617 for (i = 0; i < SBC_MAX; ++i) \
3618 if (ctype_func (i)) \
3619 bitset_set (sbcset, trans[i]); \
3620 } \
3621 else \
3622 { \
3623 for (i = 0; i < SBC_MAX; ++i) \
3624 if (ctype_func (i)) \
3625 bitset_set (sbcset, i); \
3626 } \
3627 } while (0)
3628
3629 if (strcmp (name, "alnum") == 0)
3630 BUILD_CHARCLASS_LOOP (isalnum);
3631 else if (strcmp (name, "cntrl") == 0)
3632 BUILD_CHARCLASS_LOOP (iscntrl);
3633 else if (strcmp (name, "lower") == 0)
3634 BUILD_CHARCLASS_LOOP (islower);
3635 else if (strcmp (name, "space") == 0)
3636 BUILD_CHARCLASS_LOOP (isspace);
3637 else if (strcmp (name, "alpha") == 0)
3638 BUILD_CHARCLASS_LOOP (isalpha);
3639 else if (strcmp (name, "digit") == 0)
3640 BUILD_CHARCLASS_LOOP (isdigit);
3641 else if (strcmp (name, "print") == 0)
3642 BUILD_CHARCLASS_LOOP (isprint);
3643 else if (strcmp (name, "upper") == 0)
3644 BUILD_CHARCLASS_LOOP (isupper);
3645 else if (strcmp (name, "blank") == 0)
3646 BUILD_CHARCLASS_LOOP (isblank);
3647 else if (strcmp (name, "graph") == 0)
3648 BUILD_CHARCLASS_LOOP (isgraph);
3649 else if (strcmp (name, "punct") == 0)
3650 BUILD_CHARCLASS_LOOP (ispunct);
3651 else if (strcmp (name, "xdigit") == 0)
3652 BUILD_CHARCLASS_LOOP (isxdigit);
3653 else
3654 return REG_ECTYPE;
3655
3656 return REG_NOERROR;
3657}
3658
3659static bin_tree_t *
3660build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3661 const char *class_name,
3662 const char *extra, bool non_match,
3663 reg_errcode_t *err)
3664{
3665 re_bitset_ptr_t sbcset;
3666#ifdef RE_ENABLE_I18N
3667 re_charset_t *mbcset;
3668 Idx alloc = 0;
3669#endif /* not RE_ENABLE_I18N */
3670 reg_errcode_t ret;
3671 bin_tree_t *tree;
3672
3673 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3674 if (__glibc_unlikely (sbcset == NULL))
3675 {
3676 *err = REG_ESPACE;
3677 return NULL;
3678 }
3679#ifdef RE_ENABLE_I18N
3680 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3681 if (__glibc_unlikely (mbcset == NULL))
3682 {
3683 re_free (sbcset);
3684 *err = REG_ESPACE;
3685 return NULL;
3686 }
3687 mbcset->non_match = non_match;
3688#endif /* RE_ENABLE_I18N */
3689
3690 /* We don't care the syntax in this case. */
3691 ret = build_charclass (trans, sbcset,
3692#ifdef RE_ENABLE_I18N
3693 mbcset, &alloc,
3694#endif /* RE_ENABLE_I18N */
3695 class_name, 0);
3696
3697 if (__glibc_unlikely (ret != REG_NOERROR))
3698 {
3699 re_free (sbcset);
3700#ifdef RE_ENABLE_I18N
3701 free_charset (mbcset);
3702#endif /* RE_ENABLE_I18N */
3703 *err = ret;
3704 return NULL;
3705 }
3706 /* \w match '_' also. */
3707 for (; *extra; extra++)
3708 bitset_set (sbcset, *extra);
3709
3710 /* If it is non-matching list. */
3711 if (non_match)
3712 bitset_not (sbcset);
3713
3714#ifdef RE_ENABLE_I18N
3715 /* Ensure only single byte characters are set. */
3716 if (dfa->mb_cur_max > 1)
3717 bitset_mask (sbcset, dfa->sb_char);
3718#endif
3719
3720 /* Build a tree for simple bracket. */
3721 re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset };
3722 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3723 if (__glibc_unlikely (tree == NULL))
3724 goto build_word_op_espace;
3725
3726#ifdef RE_ENABLE_I18N
3727 if (dfa->mb_cur_max > 1)
3728 {
3729 bin_tree_t *mbc_tree;
3730 /* Build a tree for complex bracket. */
3731 br_token.type = COMPLEX_BRACKET;
3732 br_token.opr.mbcset = mbcset;
3733 dfa->has_mb_node = 1;
3734 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3735 if (__glibc_unlikely (mbc_tree == NULL))
3736 goto build_word_op_espace;
3737 /* Then join them by ALT node. */
3738 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3739 if (__glibc_likely (mbc_tree != NULL))
3740 return tree;
3741 }
3742 else
3743 {
3744 free_charset (mbcset);
3745 return tree;
3746 }
3747#else /* not RE_ENABLE_I18N */
3748 return tree;
3749#endif /* not RE_ENABLE_I18N */
3750
3751 build_word_op_espace:
3752 re_free (sbcset);
3753#ifdef RE_ENABLE_I18N
3754 free_charset (mbcset);
3755#endif /* RE_ENABLE_I18N */
3756 *err = REG_ESPACE;
3757 return NULL;
3758}
3759
3760/* This is intended for the expressions like "a{1,3}".
3761 Fetch a number from 'input', and return the number.
3762 Return -1 if the number field is empty like "{,1}".
3763 Return RE_DUP_MAX + 1 if the number field is too large.
3764 Return -2 if an error occurred. */
3765
3766static Idx
3767fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3768{
3769 Idx num = -1;
3770 unsigned char c;
3771 while (1)
3772 {
3773 fetch_token (token, input, syntax);
3774 c = token->opr.c;
3775 if (__glibc_unlikely (token->type == END_OF_RE))
3776 return -2;
3777 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3778 break;
3779 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3780 ? -2
3781 : num == -1
3782 ? c - '0'
3783 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3784 }
3785 return num;
3786}
3787
3788
3789#ifdef RE_ENABLE_I18N
3790static void
3791free_charset (re_charset_t *cset)
3792{
3793 re_free (cset->mbchars);
3794# ifdef _LIBC
3795 re_free (cset->coll_syms);
3796 re_free (cset->equiv_classes);
3797# endif
3798 re_free (cset->range_starts);
3799 re_free (cset->range_ends);
3800 re_free (cset->char_classes);
3801 re_free (cset);
3802}
3803#endif /* RE_ENABLE_I18N */
3804
3805
3806/* Functions for binary tree operation. */
3807
3808/* Create a tree node. */
3809
3810static bin_tree_t *
3811create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3812 re_token_type_t type)
3813{
3814 re_token_t t = { .type = type };
3815 return create_token_tree (dfa, left, right, &t);
3816}
3817
3818static bin_tree_t *
3819create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3820 const re_token_t *token)
3821{
3822 bin_tree_t *tree;
3823 if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
3824 {
3825 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3826
3827 if (storage == NULL)
3828 return NULL;
3829 storage->next = dfa->str_tree_storage;
3830 dfa->str_tree_storage = storage;
3831 dfa->str_tree_storage_idx = 0;
3832 }
3833 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3834
3835 tree->parent = NULL;
3836 tree->left = left;
3837 tree->right = right;
3838 tree->token = *token;
3839 tree->token.duplicated = 0;
3840 tree->token.opt_subexp = 0;
3841 tree->first = NULL;
3842 tree->next = NULL;
3843 tree->node_idx = -1;
3844
3845 if (left != NULL)
3846 left->parent = tree;
3847 if (right != NULL)
3848 right->parent = tree;
3849 return tree;
3850}
3851
3852/* Mark the tree SRC as an optional subexpression.
3853 To be called from preorder or postorder. */
3854
3855static reg_errcode_t
3856mark_opt_subexp (void *extra, bin_tree_t *node)
3857{
3858 Idx idx = (uintptr_t) extra;
3859 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3860 node->token.opt_subexp = 1;
3861
3862 return REG_NOERROR;
3863}
3864
3865/* Free the allocated memory inside NODE. */
3866
3867static void
3868free_token (re_token_t *node)
3869{
3870#ifdef RE_ENABLE_I18N
3871 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3872 free_charset (node->opr.mbcset);
3873 else
3874#endif /* RE_ENABLE_I18N */
3875 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3876 re_free (node->opr.sbcset);
3877}
3878
3879/* Worker function for tree walking. Free the allocated memory inside NODE
3880 and its children. */
3881
3882static reg_errcode_t
3883free_tree (void *extra, bin_tree_t *node)
3884{
3885 free_token (&node->token);
3886 return REG_NOERROR;
3887}
3888
3889
3890/* Duplicate the node SRC, and return new node. This is a preorder
3891 visit similar to the one implemented by the generic visitor, but
3892 we need more infrastructure to maintain two parallel trees --- so,
3893 it's easier to duplicate. */
3894
3895static bin_tree_t *
3896duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3897{
3898 const bin_tree_t *node;
3899 bin_tree_t *dup_root;
3900 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3901
3902 for (node = root; ; )
3903 {
3904 /* Create a new tree and link it back to the current parent. */
3905 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3906 if (*p_new == NULL)
3907 return NULL;
3908 (*p_new)->parent = dup_node;
3909 (*p_new)->token.duplicated = 1;
3910 dup_node = *p_new;
3911
3912 /* Go to the left node, or up and to the right. */
3913 if (node->left)
3914 {
3915 node = node->left;
3916 p_new = &dup_node->left;
3917 }
3918 else
3919 {
3920 const bin_tree_t *prev = NULL;
3921 while (node->right == prev || node->right == NULL)
3922 {
3923 prev = node;
3924 node = node->parent;
3925 dup_node = dup_node->parent;
3926 if (!node)
3927 return dup_root;
3928 }
3929 node = node->right;
3930 p_new = &dup_node->right;
3931 }
3932 }
3933}
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette