VirtualBox

source: kBuild/vendor/grep/2.12/lib/regcomp.c

Last change on this file was 2595, checked in by bird, 13 years ago

gnu grep version 2.12 (grep-2.12.tar.xz, md5sum=8d2f0346d08b13c18afb81f0e8aa1e2f)

  • Property svn:eol-style set to native
File size: 110.8 KB
Line 
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2012 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <[email protected]>.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, see <http://www.gnu.org/licenses/>. */
18
19static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
20 size_t length, reg_syntax_t syntax);
21static void re_compile_fastmap_iter (regex_t *bufp,
22 const re_dfastate_t *init_state,
23 char *fastmap);
24static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
25#ifdef RE_ENABLE_I18N
26static void free_charset (re_charset_t *cset);
27#endif /* RE_ENABLE_I18N */
28static void free_workarea_compile (regex_t *preg);
29static reg_errcode_t create_initial_state (re_dfa_t *dfa);
30#ifdef RE_ENABLE_I18N
31static void optimize_utf8 (re_dfa_t *dfa);
32#endif
33static reg_errcode_t analyze (regex_t *preg);
34static reg_errcode_t preorder (bin_tree_t *root,
35 reg_errcode_t (fn (void *, bin_tree_t *)),
36 void *extra);
37static reg_errcode_t postorder (bin_tree_t *root,
38 reg_errcode_t (fn (void *, bin_tree_t *)),
39 void *extra);
40static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
41static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
42static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
43 bin_tree_t *node);
44static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
45static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
46static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
47static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
48static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
49 unsigned int constraint);
50static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
51static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
52 Idx node, bool root);
53static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
54static Idx fetch_number (re_string_t *input, re_token_t *token,
55 reg_syntax_t syntax);
56static int peek_token (re_token_t *token, re_string_t *input,
57 reg_syntax_t syntax) internal_function;
58static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
59 reg_syntax_t syntax, reg_errcode_t *err);
60static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
61 re_token_t *token, reg_syntax_t syntax,
62 Idx nest, reg_errcode_t *err);
63static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
64 re_token_t *token, reg_syntax_t syntax,
65 Idx nest, reg_errcode_t *err);
66static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
67 re_token_t *token, reg_syntax_t syntax,
68 Idx nest, reg_errcode_t *err);
69static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
70 re_token_t *token, reg_syntax_t syntax,
71 Idx nest, reg_errcode_t *err);
72static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
73 re_dfa_t *dfa, re_token_t *token,
74 reg_syntax_t syntax, reg_errcode_t *err);
75static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
76 re_token_t *token, reg_syntax_t syntax,
77 reg_errcode_t *err);
78static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
79 re_string_t *regexp,
80 re_token_t *token, int token_len,
81 re_dfa_t *dfa,
82 reg_syntax_t syntax,
83 bool accept_hyphen);
84static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
85 re_string_t *regexp,
86 re_token_t *token);
87#ifdef RE_ENABLE_I18N
88static reg_errcode_t build_equiv_class (bitset_t sbcset,
89 re_charset_t *mbcset,
90 Idx *equiv_class_alloc,
91 const unsigned char *name);
92static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
93 bitset_t sbcset,
94 re_charset_t *mbcset,
95 Idx *char_class_alloc,
96 const unsigned char *class_name,
97 reg_syntax_t syntax);
98#else /* not RE_ENABLE_I18N */
99static reg_errcode_t build_equiv_class (bitset_t sbcset,
100 const unsigned char *name);
101static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
102 bitset_t sbcset,
103 const unsigned char *class_name,
104 reg_syntax_t syntax);
105#endif /* not RE_ENABLE_I18N */
106static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
107 RE_TRANSLATE_TYPE trans,
108 const unsigned char *class_name,
109 const unsigned char *extra,
110 bool non_match, reg_errcode_t *err);
111static bin_tree_t *create_tree (re_dfa_t *dfa,
112 bin_tree_t *left, bin_tree_t *right,
113 re_token_type_t type);
114static bin_tree_t *create_token_tree (re_dfa_t *dfa,
115 bin_tree_t *left, bin_tree_t *right,
116 const re_token_t *token);
117static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
118static void free_token (re_token_t *node);
119static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
120static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
121
122
123/* This table gives an error message for each of the error codes listed
124 in regex.h. Obviously the order here has to be same as there.
125 POSIX doesn't require that we do anything for REG_NOERROR,
126 but why not be nice? */
127
128static const char __re_error_msgid[] =
129 {
130#define REG_NOERROR_IDX 0
131 gettext_noop ("Success") /* REG_NOERROR */
132 "\0"
133#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134 gettext_noop ("No match") /* REG_NOMATCH */
135 "\0"
136#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
138 "\0"
139#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
141 "\0"
142#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
144 "\0"
145#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
147 "\0"
148#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
150 "\0"
151#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
153 "\0"
154#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
156 "\0"
157#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
159 "\0"
160#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
162 "\0"
163#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164 gettext_noop ("Invalid range end") /* REG_ERANGE */
165 "\0"
166#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
167 gettext_noop ("Memory exhausted") /* REG_ESPACE */
168 "\0"
169#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
171 "\0"
172#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173 gettext_noop ("Premature end of regular expression") /* REG_EEND */
174 "\0"
175#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
176 gettext_noop ("Regular expression too big") /* REG_ESIZE */
177 "\0"
178#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
180 };
181
182static const size_t __re_error_msgid_idx[] =
183 {
184 REG_NOERROR_IDX,
185 REG_NOMATCH_IDX,
186 REG_BADPAT_IDX,
187 REG_ECOLLATE_IDX,
188 REG_ECTYPE_IDX,
189 REG_EESCAPE_IDX,
190 REG_ESUBREG_IDX,
191 REG_EBRACK_IDX,
192 REG_EPAREN_IDX,
193 REG_EBRACE_IDX,
194 REG_BADBR_IDX,
195 REG_ERANGE_IDX,
196 REG_ESPACE_IDX,
197 REG_BADRPT_IDX,
198 REG_EEND_IDX,
199 REG_ESIZE_IDX,
200 REG_ERPAREN_IDX
201 };
202
203
204/* Entry points for GNU code. */
205
206/* re_compile_pattern is the GNU regular expression compiler: it
207 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
208 Returns 0 if the pattern was valid, otherwise an error string.
209
210 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
211 are set in BUFP on entry. */
212
213#ifdef _LIBC
214const char *
215re_compile_pattern (pattern, length, bufp)
216 const char *pattern;
217 size_t length;
218 struct re_pattern_buffer *bufp;
219#else /* size_t might promote */
220const char *
221re_compile_pattern (const char *pattern, size_t length,
222 struct re_pattern_buffer *bufp)
223#endif
224{
225 reg_errcode_t ret;
226
227 /* And GNU code determines whether or not to get register information
228 by passing null for the REGS argument to re_match, etc., not by
229 setting no_sub, unless RE_NO_SUB is set. */
230 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
231
232 /* Match anchors at newline. */
233 bufp->newline_anchor = 1;
234
235 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
236
237 if (!ret)
238 return NULL;
239 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
240}
241#ifdef _LIBC
242weak_alias (__re_compile_pattern, re_compile_pattern)
243#endif
244
245/* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
246 also be assigned to arbitrarily: each pattern buffer stores its own
247 syntax, so it can be changed between regex compilations. */
248/* This has no initializer because initialized variables in Emacs
249 become read-only after dumping. */
250reg_syntax_t re_syntax_options;
251
252
253/* Specify the precise syntax of regexps for compilation. This provides
254 for compatibility for various utilities which historically have
255 different, incompatible syntaxes.
256
257 The argument SYNTAX is a bit mask comprised of the various bits
258 defined in regex.h. We return the old syntax. */
259
260reg_syntax_t
261re_set_syntax (syntax)
262 reg_syntax_t syntax;
263{
264 reg_syntax_t ret = re_syntax_options;
265
266 re_syntax_options = syntax;
267 return ret;
268}
269#ifdef _LIBC
270weak_alias (__re_set_syntax, re_set_syntax)
271#endif
272
273int
274re_compile_fastmap (bufp)
275 struct re_pattern_buffer *bufp;
276{
277 re_dfa_t *dfa = bufp->buffer;
278 char *fastmap = bufp->fastmap;
279
280 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
281 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
282 if (dfa->init_state != dfa->init_state_word)
283 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
284 if (dfa->init_state != dfa->init_state_nl)
285 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
286 if (dfa->init_state != dfa->init_state_begbuf)
287 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
288 bufp->fastmap_accurate = 1;
289 return 0;
290}
291#ifdef _LIBC
292weak_alias (__re_compile_fastmap, re_compile_fastmap)
293#endif
294
295static inline void
296__attribute ((always_inline))
297re_set_fastmap (char *fastmap, bool icase, int ch)
298{
299 fastmap[ch] = 1;
300 if (icase)
301 fastmap[tolower (ch)] = 1;
302}
303
304/* Helper function for re_compile_fastmap.
305 Compile fastmap for the initial_state INIT_STATE. */
306
307static void
308re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
309 char *fastmap)
310{
311 re_dfa_t *dfa = bufp->buffer;
312 Idx node_cnt;
313 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
314 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
315 {
316 Idx node = init_state->nodes.elems[node_cnt];
317 re_token_type_t type = dfa->nodes[node].type;
318
319 if (type == CHARACTER)
320 {
321 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
322#ifdef RE_ENABLE_I18N
323 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
324 {
325 unsigned char buf[MB_LEN_MAX];
326 unsigned char *p;
327 wchar_t wc;
328 mbstate_t state;
329
330 p = buf;
331 *p++ = dfa->nodes[node].opr.c;
332 while (++node < dfa->nodes_len
333 && dfa->nodes[node].type == CHARACTER
334 && dfa->nodes[node].mb_partial)
335 *p++ = dfa->nodes[node].opr.c;
336 memset (&state, '\0', sizeof (state));
337 if (__mbrtowc (&wc, (const char *) buf, p - buf,
338 &state) == p - buf
339 && (__wcrtomb ((char *) buf, towlower (wc), &state)
340 != (size_t) -1))
341 re_set_fastmap (fastmap, false, buf[0]);
342 }
343#endif
344 }
345 else if (type == SIMPLE_BRACKET)
346 {
347 int i, ch;
348 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
349 {
350 int j;
351 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
352 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
353 if (w & ((bitset_word_t) 1 << j))
354 re_set_fastmap (fastmap, icase, ch);
355 }
356 }
357#ifdef RE_ENABLE_I18N
358 else if (type == COMPLEX_BRACKET)
359 {
360 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
361 Idx i;
362
363# ifdef _LIBC
364 /* See if we have to try all bytes which start multiple collation
365 elements.
366 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
367 collation element, and don't catch 'b' since 'b' is
368 the only collation element which starts from 'b' (and
369 it is caught by SIMPLE_BRACKET). */
370 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
371 && (cset->ncoll_syms || cset->nranges))
372 {
373 const int32_t *table = (const int32_t *)
374 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
375 for (i = 0; i < SBC_MAX; ++i)
376 if (table[i] < 0)
377 re_set_fastmap (fastmap, icase, i);
378 }
379# endif /* _LIBC */
380
381 /* See if we have to start the match at all multibyte characters,
382 i.e. where we would not find an invalid sequence. This only
383 applies to multibyte character sets; for single byte character
384 sets, the SIMPLE_BRACKET again suffices. */
385 if (dfa->mb_cur_max > 1
386 && (cset->nchar_classes || cset->non_match || cset->nranges
387# ifdef _LIBC
388 || cset->nequiv_classes
389# endif /* _LIBC */
390 ))
391 {
392 unsigned char c = 0;
393 do
394 {
395 mbstate_t mbs;
396 memset (&mbs, 0, sizeof (mbs));
397 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
398 re_set_fastmap (fastmap, false, (int) c);
399 }
400 while (++c != 0);
401 }
402
403 else
404 {
405 /* ... Else catch all bytes which can start the mbchars. */
406 for (i = 0; i < cset->nmbchars; ++i)
407 {
408 char buf[256];
409 mbstate_t state;
410 memset (&state, '\0', sizeof (state));
411 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
412 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
413 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
414 {
415 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
416 != (size_t) -1)
417 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
418 }
419 }
420 }
421 }
422#endif /* RE_ENABLE_I18N */
423 else if (type == OP_PERIOD
424#ifdef RE_ENABLE_I18N
425 || type == OP_UTF8_PERIOD
426#endif /* RE_ENABLE_I18N */
427 || type == END_OF_RE)
428 {
429 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
430 if (type == END_OF_RE)
431 bufp->can_be_null = 1;
432 return;
433 }
434 }
435}
436
437
438/* Entry point for POSIX code. */
439/* regcomp takes a regular expression as a string and compiles it.
440
441 PREG is a regex_t *. We do not expect any fields to be initialized,
442 since POSIX says we shouldn't. Thus, we set
443
444 'buffer' to the compiled pattern;
445 'used' to the length of the compiled pattern;
446 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
447 REG_EXTENDED bit in CFLAGS is set; otherwise, to
448 RE_SYNTAX_POSIX_BASIC;
449 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
450 'fastmap' to an allocated space for the fastmap;
451 'fastmap_accurate' to zero;
452 're_nsub' to the number of subexpressions in PATTERN.
453
454 PATTERN is the address of the pattern string.
455
456 CFLAGS is a series of bits which affect compilation.
457
458 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
459 use POSIX basic syntax.
460
461 If REG_NEWLINE is set, then . and [^...] don't match newline.
462 Also, regexec will try a match beginning after every newline.
463
464 If REG_ICASE is set, then we considers upper- and lowercase
465 versions of letters to be equivalent when matching.
466
467 If REG_NOSUB is set, then when PREG is passed to regexec, that
468 routine will report only success or failure, and nothing about the
469 registers.
470
471 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
472 the return codes and their meanings.) */
473
474int
475regcomp (preg, pattern, cflags)
476 regex_t *_Restrict_ preg;
477 const char *_Restrict_ pattern;
478 int cflags;
479{
480 reg_errcode_t ret;
481 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
482 : RE_SYNTAX_POSIX_BASIC);
483
484 preg->buffer = NULL;
485 preg->allocated = 0;
486 preg->used = 0;
487
488 /* Try to allocate space for the fastmap. */
489 preg->fastmap = re_malloc (char, SBC_MAX);
490 if (BE (preg->fastmap == NULL, 0))
491 return REG_ESPACE;
492
493 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
494
495 /* If REG_NEWLINE is set, newlines are treated differently. */
496 if (cflags & REG_NEWLINE)
497 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
498 syntax &= ~RE_DOT_NEWLINE;
499 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
500 /* It also changes the matching behavior. */
501 preg->newline_anchor = 1;
502 }
503 else
504 preg->newline_anchor = 0;
505 preg->no_sub = !!(cflags & REG_NOSUB);
506 preg->translate = NULL;
507
508 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
509
510 /* POSIX doesn't distinguish between an unmatched open-group and an
511 unmatched close-group: both are REG_EPAREN. */
512 if (ret == REG_ERPAREN)
513 ret = REG_EPAREN;
514
515 /* We have already checked preg->fastmap != NULL. */
516 if (BE (ret == REG_NOERROR, 1))
517 /* Compute the fastmap now, since regexec cannot modify the pattern
518 buffer. This function never fails in this implementation. */
519 (void) re_compile_fastmap (preg);
520 else
521 {
522 /* Some error occurred while compiling the expression. */
523 re_free (preg->fastmap);
524 preg->fastmap = NULL;
525 }
526
527 return (int) ret;
528}
529#ifdef _LIBC
530weak_alias (__regcomp, regcomp)
531#endif
532
533/* Returns a message corresponding to an error code, ERRCODE, returned
534 from either regcomp or regexec. We don't use PREG here. */
535
536#ifdef _LIBC
537size_t
538regerror (errcode, preg, errbuf, errbuf_size)
539 int errcode;
540 const regex_t *_Restrict_ preg;
541 char *_Restrict_ errbuf;
542 size_t errbuf_size;
543#else /* size_t might promote */
544size_t
545regerror (int errcode, const regex_t *_Restrict_ preg _UNUSED_PARAMETER_,
546 char *_Restrict_ errbuf, size_t errbuf_size)
547#endif
548{
549 const char *msg;
550 size_t msg_size;
551
552 if (BE (errcode < 0
553 || errcode >= (int) (sizeof (__re_error_msgid_idx)
554 / sizeof (__re_error_msgid_idx[0])), 0))
555 /* Only error codes returned by the rest of the code should be passed
556 to this routine. If we are given anything else, or if other regex
557 code generates an invalid error code, then the program has a bug.
558 Dump core so we can fix it. */
559 abort ();
560
561 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
562
563 msg_size = strlen (msg) + 1; /* Includes the null. */
564
565 if (BE (errbuf_size != 0, 1))
566 {
567 size_t cpy_size = msg_size;
568 if (BE (msg_size > errbuf_size, 0))
569 {
570 cpy_size = errbuf_size - 1;
571 errbuf[cpy_size] = '\0';
572 }
573 memcpy (errbuf, msg, cpy_size);
574 }
575
576 return msg_size;
577}
578#ifdef _LIBC
579weak_alias (__regerror, regerror)
580#endif
581
582
583#ifdef RE_ENABLE_I18N
584/* This static array is used for the map to single-byte characters when
585 UTF-8 is used. Otherwise we would allocate memory just to initialize
586 it the same all the time. UTF-8 is the preferred encoding so this is
587 a worthwhile optimization. */
588static const bitset_t utf8_sb_map =
589{
590 /* Set the first 128 bits. */
591# ifdef __GNUC__
592 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
593# else
594# if 4 * BITSET_WORD_BITS < ASCII_CHARS
595# error "bitset_word_t is narrower than 32 bits"
596# elif 3 * BITSET_WORD_BITS < ASCII_CHARS
597 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
598# elif 2 * BITSET_WORD_BITS < ASCII_CHARS
599 BITSET_WORD_MAX, BITSET_WORD_MAX,
600# elif 1 * BITSET_WORD_BITS < ASCII_CHARS
601 BITSET_WORD_MAX,
602# endif
603 (BITSET_WORD_MAX
604 >> (SBC_MAX % BITSET_WORD_BITS == 0
605 ? 0
606 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
607# endif
608};
609#endif
610
611
612static void
613free_dfa_content (re_dfa_t *dfa)
614{
615 Idx i, j;
616
617 if (dfa->nodes)
618 for (i = 0; i < dfa->nodes_len; ++i)
619 free_token (dfa->nodes + i);
620 re_free (dfa->nexts);
621 for (i = 0; i < dfa->nodes_len; ++i)
622 {
623 if (dfa->eclosures != NULL)
624 re_node_set_free (dfa->eclosures + i);
625 if (dfa->inveclosures != NULL)
626 re_node_set_free (dfa->inveclosures + i);
627 if (dfa->edests != NULL)
628 re_node_set_free (dfa->edests + i);
629 }
630 re_free (dfa->edests);
631 re_free (dfa->eclosures);
632 re_free (dfa->inveclosures);
633 re_free (dfa->nodes);
634
635 if (dfa->state_table)
636 for (i = 0; i <= dfa->state_hash_mask; ++i)
637 {
638 struct re_state_table_entry *entry = dfa->state_table + i;
639 for (j = 0; j < entry->num; ++j)
640 {
641 re_dfastate_t *state = entry->array[j];
642 free_state (state);
643 }
644 re_free (entry->array);
645 }
646 re_free (dfa->state_table);
647#ifdef RE_ENABLE_I18N
648 if (dfa->sb_char != utf8_sb_map)
649 re_free (dfa->sb_char);
650#endif
651 re_free (dfa->subexp_map);
652#ifdef DEBUG
653 re_free (dfa->re_str);
654#endif
655
656 re_free (dfa);
657}
658
659
660/* Free dynamically allocated space used by PREG. */
661
662void
663regfree (preg)
664 regex_t *preg;
665{
666 re_dfa_t *dfa = preg->buffer;
667 if (BE (dfa != NULL, 1))
668 free_dfa_content (dfa);
669 preg->buffer = NULL;
670 preg->allocated = 0;
671
672 re_free (preg->fastmap);
673 preg->fastmap = NULL;
674
675 re_free (preg->translate);
676 preg->translate = NULL;
677}
678#ifdef _LIBC
679weak_alias (__regfree, regfree)
680#endif
681
682
683/* Entry points compatible with 4.2 BSD regex library. We don't define
684 them unless specifically requested. */
685
686#if defined _REGEX_RE_COMP || defined _LIBC
687
688/* BSD has one and only one pattern buffer. */
689static struct re_pattern_buffer re_comp_buf;
690
691char *
692# ifdef _LIBC
693/* Make these definitions weak in libc, so POSIX programs can redefine
694 these names if they don't use our functions, and still use
695 regcomp/regexec above without link errors. */
696weak_function
697# endif
698re_comp (s)
699 const char *s;
700{
701 reg_errcode_t ret;
702 char *fastmap;
703
704 if (!s)
705 {
706 if (!re_comp_buf.buffer)
707 return gettext ("No previous regular expression");
708 return 0;
709 }
710
711 if (re_comp_buf.buffer)
712 {
713 fastmap = re_comp_buf.fastmap;
714 re_comp_buf.fastmap = NULL;
715 __regfree (&re_comp_buf);
716 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
717 re_comp_buf.fastmap = fastmap;
718 }
719
720 if (re_comp_buf.fastmap == NULL)
721 {
722 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
723 if (re_comp_buf.fastmap == NULL)
724 return (char *) gettext (__re_error_msgid
725 + __re_error_msgid_idx[(int) REG_ESPACE]);
726 }
727
728 /* Since 're_exec' always passes NULL for the 'regs' argument, we
729 don't need to initialize the pattern buffer fields which affect it. */
730
731 /* Match anchors at newlines. */
732 re_comp_buf.newline_anchor = 1;
733
734 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
735
736 if (!ret)
737 return NULL;
738
739 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
740 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
741}
742
743#ifdef _LIBC
744libc_freeres_fn (free_mem)
745{
746 __regfree (&re_comp_buf);
747}
748#endif
749
750#endif /* _REGEX_RE_COMP */
751
752
753/* Internal entry point.
754 Compile the regular expression PATTERN, whose length is LENGTH.
755 SYNTAX indicate regular expression's syntax. */
756
757static reg_errcode_t
758re_compile_internal (regex_t *preg, const char * pattern, size_t length,
759 reg_syntax_t syntax)
760{
761 reg_errcode_t err = REG_NOERROR;
762 re_dfa_t *dfa;
763 re_string_t regexp;
764
765 /* Initialize the pattern buffer. */
766 preg->fastmap_accurate = 0;
767 preg->syntax = syntax;
768 preg->not_bol = preg->not_eol = 0;
769 preg->used = 0;
770 preg->re_nsub = 0;
771 preg->can_be_null = 0;
772 preg->regs_allocated = REGS_UNALLOCATED;
773
774 /* Initialize the dfa. */
775 dfa = preg->buffer;
776 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
777 {
778 /* If zero allocated, but buffer is non-null, try to realloc
779 enough space. This loses if buffer's address is bogus, but
780 that is the user's responsibility. If ->buffer is NULL this
781 is a simple allocation. */
782 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
783 if (dfa == NULL)
784 return REG_ESPACE;
785 preg->allocated = sizeof (re_dfa_t);
786 preg->buffer = dfa;
787 }
788 preg->used = sizeof (re_dfa_t);
789
790 err = init_dfa (dfa, length);
791 if (BE (err != REG_NOERROR, 0))
792 {
793 free_dfa_content (dfa);
794 preg->buffer = NULL;
795 preg->allocated = 0;
796 return err;
797 }
798#ifdef DEBUG
799 /* Note: length+1 will not overflow since it is checked in init_dfa. */
800 dfa->re_str = re_malloc (char, length + 1);
801 strncpy (dfa->re_str, pattern, length + 1);
802#endif
803
804 __libc_lock_init (dfa->lock);
805
806 err = re_string_construct (&regexp, pattern, length, preg->translate,
807 (syntax & RE_ICASE) != 0, dfa);
808 if (BE (err != REG_NOERROR, 0))
809 {
810 re_compile_internal_free_return:
811 free_workarea_compile (preg);
812 re_string_destruct (&regexp);
813 free_dfa_content (dfa);
814 preg->buffer = NULL;
815 preg->allocated = 0;
816 return err;
817 }
818
819 /* Parse the regular expression, and build a structure tree. */
820 preg->re_nsub = 0;
821 dfa->str_tree = parse (&regexp, preg, syntax, &err);
822 if (BE (dfa->str_tree == NULL, 0))
823 goto re_compile_internal_free_return;
824
825 /* Analyze the tree and create the nfa. */
826 err = analyze (preg);
827 if (BE (err != REG_NOERROR, 0))
828 goto re_compile_internal_free_return;
829
830#ifdef RE_ENABLE_I18N
831 /* If possible, do searching in single byte encoding to speed things up. */
832 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
833 optimize_utf8 (dfa);
834#endif
835
836 /* Then create the initial state of the dfa. */
837 err = create_initial_state (dfa);
838
839 /* Release work areas. */
840 free_workarea_compile (preg);
841 re_string_destruct (&regexp);
842
843 if (BE (err != REG_NOERROR, 0))
844 {
845 free_dfa_content (dfa);
846 preg->buffer = NULL;
847 preg->allocated = 0;
848 }
849
850 return err;
851}
852
853/* Initialize DFA. We use the length of the regular expression PAT_LEN
854 as the initial length of some arrays. */
855
856static reg_errcode_t
857init_dfa (re_dfa_t *dfa, size_t pat_len)
858{
859 __re_size_t table_size;
860#ifndef _LIBC
861 const char *codeset_name;
862#endif
863#ifdef RE_ENABLE_I18N
864 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
865#else
866 size_t max_i18n_object_size = 0;
867#endif
868 size_t max_object_size =
869 MAX (sizeof (struct re_state_table_entry),
870 MAX (sizeof (re_token_t),
871 MAX (sizeof (re_node_set),
872 MAX (sizeof (regmatch_t),
873 max_i18n_object_size))));
874
875 memset (dfa, '\0', sizeof (re_dfa_t));
876
877 /* Force allocation of str_tree_storage the first time. */
878 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
879
880 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
881 calculation below, and for similar doubling calculations
882 elsewhere. And it's <= rather than <, because some of the
883 doubling calculations add 1 afterwards. */
884 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
885 return REG_ESPACE;
886
887 dfa->nodes_alloc = pat_len + 1;
888 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
889
890 /* table_size = 2 ^ ceil(log pat_len) */
891 for (table_size = 1; ; table_size <<= 1)
892 if (table_size > pat_len)
893 break;
894
895 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
896 dfa->state_hash_mask = table_size - 1;
897
898 dfa->mb_cur_max = MB_CUR_MAX;
899#ifdef _LIBC
900 if (dfa->mb_cur_max == 6
901 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
902 dfa->is_utf8 = 1;
903 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
904 != 0);
905#else
906 codeset_name = nl_langinfo (CODESET);
907 if (strcasecmp (codeset_name, "UTF-8") == 0
908 || strcasecmp (codeset_name, "UTF8") == 0)
909 dfa->is_utf8 = 1;
910
911 /* We check exhaustively in the loop below if this charset is a
912 superset of ASCII. */
913 dfa->map_notascii = 0;
914#endif
915
916#ifdef RE_ENABLE_I18N
917 if (dfa->mb_cur_max > 1)
918 {
919 if (dfa->is_utf8)
920 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
921 else
922 {
923 int i, j, ch;
924
925 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
926 if (BE (dfa->sb_char == NULL, 0))
927 return REG_ESPACE;
928
929 /* Set the bits corresponding to single byte chars. */
930 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
931 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
932 {
933 wint_t wch = __btowc (ch);
934 if (wch != WEOF)
935 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
936# ifndef _LIBC
937 if (isascii (ch) && wch != ch)
938 dfa->map_notascii = 1;
939# endif
940 }
941 }
942 }
943#endif
944
945 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
946 return REG_ESPACE;
947 return REG_NOERROR;
948}
949
950/* Initialize WORD_CHAR table, which indicate which character is
951 "word". In this case "word" means that it is the word construction
952 character used by some operators like "\<", "\>", etc. */
953
954static void
955internal_function
956init_word_char (re_dfa_t *dfa)
957{
958 dfa->word_ops_used = 1;
959 int i = 0;
960 int j;
961 int ch = 0;
962 if (BE (dfa->map_notascii == 0, 1))
963 {
964 if (BITSET_WORD_BITS == 64)
965 {
966 dfa->word_char[0] = UINT64_C (0x03ff000000000000);
967 dfa->word_char[1] = UINT64_C (0x07fffffe87fffffe);
968 i = 2;
969 }
970 else if (BITSET_WORD_BITS == 32)
971 {
972 dfa->word_char[0] = UINT32_C (0x00000000);
973 dfa->word_char[1] = UINT32_C (0x03ff0000);
974 dfa->word_char[2] = UINT32_C (0x87fffffe);
975 dfa->word_char[3] = UINT32_C (0x07fffffe);
976 i = 4;
977 }
978 else
979 goto general_case;
980 ch = 128;
981
982 if (BE (dfa->is_utf8, 1))
983 {
984 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
985 return;
986 }
987 }
988
989 general_case:
990 for (; i < BITSET_WORDS; ++i)
991 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
992 if (isalnum (ch) || ch == '_')
993 dfa->word_char[i] |= (bitset_word_t) 1 << j;
994}
995
996/* Free the work area which are only used while compiling. */
997
998static void
999free_workarea_compile (regex_t *preg)
1000{
1001 re_dfa_t *dfa = preg->buffer;
1002 bin_tree_storage_t *storage, *next;
1003 for (storage = dfa->str_tree_storage; storage; storage = next)
1004 {
1005 next = storage->next;
1006 re_free (storage);
1007 }
1008 dfa->str_tree_storage = NULL;
1009 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1010 dfa->str_tree = NULL;
1011 re_free (dfa->org_indices);
1012 dfa->org_indices = NULL;
1013}
1014
1015/* Create initial states for all contexts. */
1016
1017static reg_errcode_t
1018create_initial_state (re_dfa_t *dfa)
1019{
1020 Idx first, i;
1021 reg_errcode_t err;
1022 re_node_set init_nodes;
1023
1024 /* Initial states have the epsilon closure of the node which is
1025 the first node of the regular expression. */
1026 first = dfa->str_tree->first->node_idx;
1027 dfa->init_node = first;
1028 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1029 if (BE (err != REG_NOERROR, 0))
1030 return err;
1031
1032 /* The back-references which are in initial states can epsilon transit,
1033 since in this case all of the subexpressions can be null.
1034 Then we add epsilon closures of the nodes which are the next nodes of
1035 the back-references. */
1036 if (dfa->nbackref > 0)
1037 for (i = 0; i < init_nodes.nelem; ++i)
1038 {
1039 Idx node_idx = init_nodes.elems[i];
1040 re_token_type_t type = dfa->nodes[node_idx].type;
1041
1042 Idx clexp_idx;
1043 if (type != OP_BACK_REF)
1044 continue;
1045 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1046 {
1047 re_token_t *clexp_node;
1048 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1049 if (clexp_node->type == OP_CLOSE_SUBEXP
1050 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1051 break;
1052 }
1053 if (clexp_idx == init_nodes.nelem)
1054 continue;
1055
1056 if (type == OP_BACK_REF)
1057 {
1058 Idx dest_idx = dfa->edests[node_idx].elems[0];
1059 if (!re_node_set_contains (&init_nodes, dest_idx))
1060 {
1061 reg_errcode_t merge_err
1062 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1063 if (merge_err != REG_NOERROR)
1064 return merge_err;
1065 i = 0;
1066 }
1067 }
1068 }
1069
1070 /* It must be the first time to invoke acquire_state. */
1071 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1072 /* We don't check ERR here, since the initial state must not be NULL. */
1073 if (BE (dfa->init_state == NULL, 0))
1074 return err;
1075 if (dfa->init_state->has_constraint)
1076 {
1077 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1078 CONTEXT_WORD);
1079 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1080 CONTEXT_NEWLINE);
1081 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1082 &init_nodes,
1083 CONTEXT_NEWLINE
1084 | CONTEXT_BEGBUF);
1085 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1086 || dfa->init_state_begbuf == NULL, 0))
1087 return err;
1088 }
1089 else
1090 dfa->init_state_word = dfa->init_state_nl
1091 = dfa->init_state_begbuf = dfa->init_state;
1092
1093 re_node_set_free (&init_nodes);
1094 return REG_NOERROR;
1095}
1096
1097
1098#ifdef RE_ENABLE_I18N
1099/* If it is possible to do searching in single byte encoding instead of UTF-8
1100 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1101 DFA nodes where needed. */
1102
1103static void
1104optimize_utf8 (re_dfa_t *dfa)
1105{
1106 Idx node;
1107 int i;
1108 bool mb_chars = false;
1109 bool has_period = false;
1110
1111 for (node = 0; node < dfa->nodes_len; ++node)
1112 switch (dfa->nodes[node].type)
1113 {
1114 case CHARACTER:
1115 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1116 mb_chars = true;
1117 break;
1118 case ANCHOR:
1119 switch (dfa->nodes[node].opr.ctx_type)
1120 {
1121 case LINE_FIRST:
1122 case LINE_LAST:
1123 case BUF_FIRST:
1124 case BUF_LAST:
1125 break;
1126 default:
1127 /* Word anchors etc. cannot be handled. It's okay to test
1128 opr.ctx_type since constraints (for all DFA nodes) are
1129 created by ORing one or more opr.ctx_type values. */
1130 return;
1131 }
1132 break;
1133 case OP_PERIOD:
1134 has_period = true;
1135 break;
1136 case OP_BACK_REF:
1137 case OP_ALT:
1138 case END_OF_RE:
1139 case OP_DUP_ASTERISK:
1140 case OP_OPEN_SUBEXP:
1141 case OP_CLOSE_SUBEXP:
1142 break;
1143 case COMPLEX_BRACKET:
1144 return;
1145 case SIMPLE_BRACKET:
1146 /* Just double check. */
1147 {
1148 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1149 ? 0
1150 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1151 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1152 {
1153 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1154 return;
1155 rshift = 0;
1156 }
1157 }
1158 break;
1159 default:
1160 abort ();
1161 }
1162
1163 if (mb_chars || has_period)
1164 for (node = 0; node < dfa->nodes_len; ++node)
1165 {
1166 if (dfa->nodes[node].type == CHARACTER
1167 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1168 dfa->nodes[node].mb_partial = 0;
1169 else if (dfa->nodes[node].type == OP_PERIOD)
1170 dfa->nodes[node].type = OP_UTF8_PERIOD;
1171 }
1172
1173 /* The search can be in single byte locale. */
1174 dfa->mb_cur_max = 1;
1175 dfa->is_utf8 = 0;
1176 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1177}
1178#endif
1179
1180
1181/* Analyze the structure tree, and calculate "first", "next", "edest",
1182 "eclosure", and "inveclosure". */
1183
1184static reg_errcode_t
1185analyze (regex_t *preg)
1186{
1187 re_dfa_t *dfa = preg->buffer;
1188 reg_errcode_t ret;
1189
1190 /* Allocate arrays. */
1191 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1192 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1193 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1194 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1195 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1196 || dfa->eclosures == NULL, 0))
1197 return REG_ESPACE;
1198
1199 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1200 if (dfa->subexp_map != NULL)
1201 {
1202 Idx i;
1203 for (i = 0; i < preg->re_nsub; i++)
1204 dfa->subexp_map[i] = i;
1205 preorder (dfa->str_tree, optimize_subexps, dfa);
1206 for (i = 0; i < preg->re_nsub; i++)
1207 if (dfa->subexp_map[i] != i)
1208 break;
1209 if (i == preg->re_nsub)
1210 {
1211 free (dfa->subexp_map);
1212 dfa->subexp_map = NULL;
1213 }
1214 }
1215
1216 ret = postorder (dfa->str_tree, lower_subexps, preg);
1217 if (BE (ret != REG_NOERROR, 0))
1218 return ret;
1219 ret = postorder (dfa->str_tree, calc_first, dfa);
1220 if (BE (ret != REG_NOERROR, 0))
1221 return ret;
1222 preorder (dfa->str_tree, calc_next, dfa);
1223 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1224 if (BE (ret != REG_NOERROR, 0))
1225 return ret;
1226 ret = calc_eclosure (dfa);
1227 if (BE (ret != REG_NOERROR, 0))
1228 return ret;
1229
1230 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1231 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1232 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1233 || dfa->nbackref)
1234 {
1235 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1236 if (BE (dfa->inveclosures == NULL, 0))
1237 return REG_ESPACE;
1238 ret = calc_inveclosure (dfa);
1239 }
1240
1241 return ret;
1242}
1243
1244/* Our parse trees are very unbalanced, so we cannot use a stack to
1245 implement parse tree visits. Instead, we use parent pointers and
1246 some hairy code in these two functions. */
1247static reg_errcode_t
1248postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1249 void *extra)
1250{
1251 bin_tree_t *node, *prev;
1252
1253 for (node = root; ; )
1254 {
1255 /* Descend down the tree, preferably to the left (or to the right
1256 if that's the only child). */
1257 while (node->left || node->right)
1258 if (node->left)
1259 node = node->left;
1260 else
1261 node = node->right;
1262
1263 do
1264 {
1265 reg_errcode_t err = fn (extra, node);
1266 if (BE (err != REG_NOERROR, 0))
1267 return err;
1268 if (node->parent == NULL)
1269 return REG_NOERROR;
1270 prev = node;
1271 node = node->parent;
1272 }
1273 /* Go up while we have a node that is reached from the right. */
1274 while (node->right == prev || node->right == NULL);
1275 node = node->right;
1276 }
1277}
1278
1279static reg_errcode_t
1280preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1281 void *extra)
1282{
1283 bin_tree_t *node;
1284
1285 for (node = root; ; )
1286 {
1287 reg_errcode_t err = fn (extra, node);
1288 if (BE (err != REG_NOERROR, 0))
1289 return err;
1290
1291 /* Go to the left node, or up and to the right. */
1292 if (node->left)
1293 node = node->left;
1294 else
1295 {
1296 bin_tree_t *prev = NULL;
1297 while (node->right == prev || node->right == NULL)
1298 {
1299 prev = node;
1300 node = node->parent;
1301 if (!node)
1302 return REG_NOERROR;
1303 }
1304 node = node->right;
1305 }
1306 }
1307}
1308
1309/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1310 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1311 backreferences as well. Requires a preorder visit. */
1312static reg_errcode_t
1313optimize_subexps (void *extra, bin_tree_t *node)
1314{
1315 re_dfa_t *dfa = (re_dfa_t *) extra;
1316
1317 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1318 {
1319 int idx = node->token.opr.idx;
1320 node->token.opr.idx = dfa->subexp_map[idx];
1321 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1322 }
1323
1324 else if (node->token.type == SUBEXP
1325 && node->left && node->left->token.type == SUBEXP)
1326 {
1327 Idx other_idx = node->left->token.opr.idx;
1328
1329 node->left = node->left->left;
1330 if (node->left)
1331 node->left->parent = node;
1332
1333 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1334 if (other_idx < BITSET_WORD_BITS)
1335 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1336 }
1337
1338 return REG_NOERROR;
1339}
1340
1341/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1342 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1343static reg_errcode_t
1344lower_subexps (void *extra, bin_tree_t *node)
1345{
1346 regex_t *preg = (regex_t *) extra;
1347 reg_errcode_t err = REG_NOERROR;
1348
1349 if (node->left && node->left->token.type == SUBEXP)
1350 {
1351 node->left = lower_subexp (&err, preg, node->left);
1352 if (node->left)
1353 node->left->parent = node;
1354 }
1355 if (node->right && node->right->token.type == SUBEXP)
1356 {
1357 node->right = lower_subexp (&err, preg, node->right);
1358 if (node->right)
1359 node->right->parent = node;
1360 }
1361
1362 return err;
1363}
1364
1365static bin_tree_t *
1366lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1367{
1368 re_dfa_t *dfa = preg->buffer;
1369 bin_tree_t *body = node->left;
1370 bin_tree_t *op, *cls, *tree1, *tree;
1371
1372 if (preg->no_sub
1373 /* We do not optimize empty subexpressions, because otherwise we may
1374 have bad CONCAT nodes with NULL children. This is obviously not
1375 very common, so we do not lose much. An example that triggers
1376 this case is the sed "script" /\(\)/x. */
1377 && node->left != NULL
1378 && (node->token.opr.idx >= BITSET_WORD_BITS
1379 || !(dfa->used_bkref_map
1380 & ((bitset_word_t) 1 << node->token.opr.idx))))
1381 return node->left;
1382
1383 /* Convert the SUBEXP node to the concatenation of an
1384 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1385 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1386 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1387 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1388 tree = create_tree (dfa, op, tree1, CONCAT);
1389 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1390 {
1391 *err = REG_ESPACE;
1392 return NULL;
1393 }
1394
1395 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1396 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1397 return tree;
1398}
1399
1400/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1401 nodes. Requires a postorder visit. */
1402static reg_errcode_t
1403calc_first (void *extra, bin_tree_t *node)
1404{
1405 re_dfa_t *dfa = (re_dfa_t *) extra;
1406 if (node->token.type == CONCAT)
1407 {
1408 node->first = node->left->first;
1409 node->node_idx = node->left->node_idx;
1410 }
1411 else
1412 {
1413 node->first = node;
1414 node->node_idx = re_dfa_add_node (dfa, node->token);
1415 if (BE (node->node_idx == REG_MISSING, 0))
1416 return REG_ESPACE;
1417 if (node->token.type == ANCHOR)
1418 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1419 }
1420 return REG_NOERROR;
1421}
1422
1423/* Pass 2: compute NEXT on the tree. Preorder visit. */
1424static reg_errcode_t
1425calc_next (void *extra _UNUSED_PARAMETER_, bin_tree_t *node)
1426{
1427 switch (node->token.type)
1428 {
1429 case OP_DUP_ASTERISK:
1430 node->left->next = node;
1431 break;
1432 case CONCAT:
1433 node->left->next = node->right->first;
1434 node->right->next = node->next;
1435 break;
1436 default:
1437 if (node->left)
1438 node->left->next = node->next;
1439 if (node->right)
1440 node->right->next = node->next;
1441 break;
1442 }
1443 return REG_NOERROR;
1444}
1445
1446/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1447static reg_errcode_t
1448link_nfa_nodes (void *extra, bin_tree_t *node)
1449{
1450 re_dfa_t *dfa = (re_dfa_t *) extra;
1451 Idx idx = node->node_idx;
1452 reg_errcode_t err = REG_NOERROR;
1453
1454 switch (node->token.type)
1455 {
1456 case CONCAT:
1457 break;
1458
1459 case END_OF_RE:
1460 assert (node->next == NULL);
1461 break;
1462
1463 case OP_DUP_ASTERISK:
1464 case OP_ALT:
1465 {
1466 Idx left, right;
1467 dfa->has_plural_match = 1;
1468 if (node->left != NULL)
1469 left = node->left->first->node_idx;
1470 else
1471 left = node->next->node_idx;
1472 if (node->right != NULL)
1473 right = node->right->first->node_idx;
1474 else
1475 right = node->next->node_idx;
1476 assert (REG_VALID_INDEX (left));
1477 assert (REG_VALID_INDEX (right));
1478 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1479 }
1480 break;
1481
1482 case ANCHOR:
1483 case OP_OPEN_SUBEXP:
1484 case OP_CLOSE_SUBEXP:
1485 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1486 break;
1487
1488 case OP_BACK_REF:
1489 dfa->nexts[idx] = node->next->node_idx;
1490 if (node->token.type == OP_BACK_REF)
1491 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1492 break;
1493
1494 default:
1495 assert (!IS_EPSILON_NODE (node->token.type));
1496 dfa->nexts[idx] = node->next->node_idx;
1497 break;
1498 }
1499
1500 return err;
1501}
1502
1503/* Duplicate the epsilon closure of the node ROOT_NODE.
1504 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1505 to their own constraint. */
1506
1507static reg_errcode_t
1508internal_function
1509duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1510 Idx root_node, unsigned int init_constraint)
1511{
1512 Idx org_node, clone_node;
1513 bool ok;
1514 unsigned int constraint = init_constraint;
1515 for (org_node = top_org_node, clone_node = top_clone_node;;)
1516 {
1517 Idx org_dest, clone_dest;
1518 if (dfa->nodes[org_node].type == OP_BACK_REF)
1519 {
1520 /* If the back reference epsilon-transit, its destination must
1521 also have the constraint. Then duplicate the epsilon closure
1522 of the destination of the back reference, and store it in
1523 edests of the back reference. */
1524 org_dest = dfa->nexts[org_node];
1525 re_node_set_empty (dfa->edests + clone_node);
1526 clone_dest = duplicate_node (dfa, org_dest, constraint);
1527 if (BE (clone_dest == REG_MISSING, 0))
1528 return REG_ESPACE;
1529 dfa->nexts[clone_node] = dfa->nexts[org_node];
1530 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1531 if (BE (! ok, 0))
1532 return REG_ESPACE;
1533 }
1534 else if (dfa->edests[org_node].nelem == 0)
1535 {
1536 /* In case of the node can't epsilon-transit, don't duplicate the
1537 destination and store the original destination as the
1538 destination of the node. */
1539 dfa->nexts[clone_node] = dfa->nexts[org_node];
1540 break;
1541 }
1542 else if (dfa->edests[org_node].nelem == 1)
1543 {
1544 /* In case of the node can epsilon-transit, and it has only one
1545 destination. */
1546 org_dest = dfa->edests[org_node].elems[0];
1547 re_node_set_empty (dfa->edests + clone_node);
1548 /* If the node is root_node itself, it means the epsilon closure
1549 has a loop. Then tie it to the destination of the root_node. */
1550 if (org_node == root_node && clone_node != org_node)
1551 {
1552 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1553 if (BE (! ok, 0))
1554 return REG_ESPACE;
1555 break;
1556 }
1557 /* In case the node has another constraint, append it. */
1558 constraint |= dfa->nodes[org_node].constraint;
1559 clone_dest = duplicate_node (dfa, org_dest, constraint);
1560 if (BE (clone_dest == REG_MISSING, 0))
1561 return REG_ESPACE;
1562 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1563 if (BE (! ok, 0))
1564 return REG_ESPACE;
1565 }
1566 else /* dfa->edests[org_node].nelem == 2 */
1567 {
1568 /* In case of the node can epsilon-transit, and it has two
1569 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1570 org_dest = dfa->edests[org_node].elems[0];
1571 re_node_set_empty (dfa->edests + clone_node);
1572 /* Search for a duplicated node which satisfies the constraint. */
1573 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1574 if (clone_dest == REG_MISSING)
1575 {
1576 /* There is no such duplicated node, create a new one. */
1577 reg_errcode_t err;
1578 clone_dest = duplicate_node (dfa, org_dest, constraint);
1579 if (BE (clone_dest == REG_MISSING, 0))
1580 return REG_ESPACE;
1581 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1582 if (BE (! ok, 0))
1583 return REG_ESPACE;
1584 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1585 root_node, constraint);
1586 if (BE (err != REG_NOERROR, 0))
1587 return err;
1588 }
1589 else
1590 {
1591 /* There is a duplicated node which satisfies the constraint,
1592 use it to avoid infinite loop. */
1593 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1594 if (BE (! ok, 0))
1595 return REG_ESPACE;
1596 }
1597
1598 org_dest = dfa->edests[org_node].elems[1];
1599 clone_dest = duplicate_node (dfa, org_dest, constraint);
1600 if (BE (clone_dest == REG_MISSING, 0))
1601 return REG_ESPACE;
1602 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1603 if (BE (! ok, 0))
1604 return REG_ESPACE;
1605 }
1606 org_node = org_dest;
1607 clone_node = clone_dest;
1608 }
1609 return REG_NOERROR;
1610}
1611
1612/* Search for a node which is duplicated from the node ORG_NODE, and
1613 satisfies the constraint CONSTRAINT. */
1614
1615static Idx
1616search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1617 unsigned int constraint)
1618{
1619 Idx idx;
1620 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1621 {
1622 if (org_node == dfa->org_indices[idx]
1623 && constraint == dfa->nodes[idx].constraint)
1624 return idx; /* Found. */
1625 }
1626 return REG_MISSING; /* Not found. */
1627}
1628
1629/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1630 Return the index of the new node, or REG_MISSING if insufficient storage is
1631 available. */
1632
1633static Idx
1634duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1635{
1636 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1637 if (BE (dup_idx != REG_MISSING, 1))
1638 {
1639 dfa->nodes[dup_idx].constraint = constraint;
1640 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1641 dfa->nodes[dup_idx].duplicated = 1;
1642
1643 /* Store the index of the original node. */
1644 dfa->org_indices[dup_idx] = org_idx;
1645 }
1646 return dup_idx;
1647}
1648
1649static reg_errcode_t
1650calc_inveclosure (re_dfa_t *dfa)
1651{
1652 Idx src, idx;
1653 bool ok;
1654 for (idx = 0; idx < dfa->nodes_len; ++idx)
1655 re_node_set_init_empty (dfa->inveclosures + idx);
1656
1657 for (src = 0; src < dfa->nodes_len; ++src)
1658 {
1659 Idx *elems = dfa->eclosures[src].elems;
1660 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1661 {
1662 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1663 if (BE (! ok, 0))
1664 return REG_ESPACE;
1665 }
1666 }
1667
1668 return REG_NOERROR;
1669}
1670
1671/* Calculate "eclosure" for all the node in DFA. */
1672
1673static reg_errcode_t
1674calc_eclosure (re_dfa_t *dfa)
1675{
1676 Idx node_idx;
1677 bool incomplete;
1678#ifdef DEBUG
1679 assert (dfa->nodes_len > 0);
1680#endif
1681 incomplete = false;
1682 /* For each nodes, calculate epsilon closure. */
1683 for (node_idx = 0; ; ++node_idx)
1684 {
1685 reg_errcode_t err;
1686 re_node_set eclosure_elem;
1687 if (node_idx == dfa->nodes_len)
1688 {
1689 if (!incomplete)
1690 break;
1691 incomplete = false;
1692 node_idx = 0;
1693 }
1694
1695#ifdef DEBUG
1696 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1697#endif
1698
1699 /* If we have already calculated, skip it. */
1700 if (dfa->eclosures[node_idx].nelem != 0)
1701 continue;
1702 /* Calculate epsilon closure of 'node_idx'. */
1703 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1704 if (BE (err != REG_NOERROR, 0))
1705 return err;
1706
1707 if (dfa->eclosures[node_idx].nelem == 0)
1708 {
1709 incomplete = true;
1710 re_node_set_free (&eclosure_elem);
1711 }
1712 }
1713 return REG_NOERROR;
1714}
1715
1716/* Calculate epsilon closure of NODE. */
1717
1718static reg_errcode_t
1719calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1720{
1721 reg_errcode_t err;
1722 Idx i;
1723 re_node_set eclosure;
1724 bool ok;
1725 bool incomplete = false;
1726 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1727 if (BE (err != REG_NOERROR, 0))
1728 return err;
1729
1730 /* This indicates that we are calculating this node now.
1731 We reference this value to avoid infinite loop. */
1732 dfa->eclosures[node].nelem = REG_MISSING;
1733
1734 /* If the current node has constraints, duplicate all nodes
1735 since they must inherit the constraints. */
1736 if (dfa->nodes[node].constraint
1737 && dfa->edests[node].nelem
1738 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1739 {
1740 err = duplicate_node_closure (dfa, node, node, node,
1741 dfa->nodes[node].constraint);
1742 if (BE (err != REG_NOERROR, 0))
1743 return err;
1744 }
1745
1746 /* Expand each epsilon destination nodes. */
1747 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1748 for (i = 0; i < dfa->edests[node].nelem; ++i)
1749 {
1750 re_node_set eclosure_elem;
1751 Idx edest = dfa->edests[node].elems[i];
1752 /* If calculating the epsilon closure of 'edest' is in progress,
1753 return intermediate result. */
1754 if (dfa->eclosures[edest].nelem == REG_MISSING)
1755 {
1756 incomplete = true;
1757 continue;
1758 }
1759 /* If we haven't calculated the epsilon closure of 'edest' yet,
1760 calculate now. Otherwise use calculated epsilon closure. */
1761 if (dfa->eclosures[edest].nelem == 0)
1762 {
1763 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1764 if (BE (err != REG_NOERROR, 0))
1765 return err;
1766 }
1767 else
1768 eclosure_elem = dfa->eclosures[edest];
1769 /* Merge the epsilon closure of 'edest'. */
1770 err = re_node_set_merge (&eclosure, &eclosure_elem);
1771 if (BE (err != REG_NOERROR, 0))
1772 return err;
1773 /* If the epsilon closure of 'edest' is incomplete,
1774 the epsilon closure of this node is also incomplete. */
1775 if (dfa->eclosures[edest].nelem == 0)
1776 {
1777 incomplete = true;
1778 re_node_set_free (&eclosure_elem);
1779 }
1780 }
1781
1782 /* An epsilon closure includes itself. */
1783 ok = re_node_set_insert (&eclosure, node);
1784 if (BE (! ok, 0))
1785 return REG_ESPACE;
1786 if (incomplete && !root)
1787 dfa->eclosures[node].nelem = 0;
1788 else
1789 dfa->eclosures[node] = eclosure;
1790 *new_set = eclosure;
1791 return REG_NOERROR;
1792}
1793
1794
1795/* Functions for token which are used in the parser. */
1796
1797/* Fetch a token from INPUT.
1798 We must not use this function inside bracket expressions. */
1799
1800static void
1801internal_function
1802fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1803{
1804 re_string_skip_bytes (input, peek_token (result, input, syntax));
1805}
1806
1807/* Peek a token from INPUT, and return the length of the token.
1808 We must not use this function inside bracket expressions. */
1809
1810static int
1811internal_function
1812peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1813{
1814 unsigned char c;
1815
1816 if (re_string_eoi (input))
1817 {
1818 token->type = END_OF_RE;
1819 return 0;
1820 }
1821
1822 c = re_string_peek_byte (input, 0);
1823 token->opr.c = c;
1824
1825 token->word_char = 0;
1826#ifdef RE_ENABLE_I18N
1827 token->mb_partial = 0;
1828 if (input->mb_cur_max > 1 &&
1829 !re_string_first_byte (input, re_string_cur_idx (input)))
1830 {
1831 token->type = CHARACTER;
1832 token->mb_partial = 1;
1833 return 1;
1834 }
1835#endif
1836 if (c == '\\')
1837 {
1838 unsigned char c2;
1839 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1840 {
1841 token->type = BACK_SLASH;
1842 return 1;
1843 }
1844
1845 c2 = re_string_peek_byte_case (input, 1);
1846 token->opr.c = c2;
1847 token->type = CHARACTER;
1848#ifdef RE_ENABLE_I18N
1849 if (input->mb_cur_max > 1)
1850 {
1851 wint_t wc = re_string_wchar_at (input,
1852 re_string_cur_idx (input) + 1);
1853 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1854 }
1855 else
1856#endif
1857 token->word_char = IS_WORD_CHAR (c2) != 0;
1858
1859 switch (c2)
1860 {
1861 case '|':
1862 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1863 token->type = OP_ALT;
1864 break;
1865 case '1': case '2': case '3': case '4': case '5':
1866 case '6': case '7': case '8': case '9':
1867 if (!(syntax & RE_NO_BK_REFS))
1868 {
1869 token->type = OP_BACK_REF;
1870 token->opr.idx = c2 - '1';
1871 }
1872 break;
1873 case '<':
1874 if (!(syntax & RE_NO_GNU_OPS))
1875 {
1876 token->type = ANCHOR;
1877 token->opr.ctx_type = WORD_FIRST;
1878 }
1879 break;
1880 case '>':
1881 if (!(syntax & RE_NO_GNU_OPS))
1882 {
1883 token->type = ANCHOR;
1884 token->opr.ctx_type = WORD_LAST;
1885 }
1886 break;
1887 case 'b':
1888 if (!(syntax & RE_NO_GNU_OPS))
1889 {
1890 token->type = ANCHOR;
1891 token->opr.ctx_type = WORD_DELIM;
1892 }
1893 break;
1894 case 'B':
1895 if (!(syntax & RE_NO_GNU_OPS))
1896 {
1897 token->type = ANCHOR;
1898 token->opr.ctx_type = NOT_WORD_DELIM;
1899 }
1900 break;
1901 case 'w':
1902 if (!(syntax & RE_NO_GNU_OPS))
1903 token->type = OP_WORD;
1904 break;
1905 case 'W':
1906 if (!(syntax & RE_NO_GNU_OPS))
1907 token->type = OP_NOTWORD;
1908 break;
1909 case 's':
1910 if (!(syntax & RE_NO_GNU_OPS))
1911 token->type = OP_SPACE;
1912 break;
1913 case 'S':
1914 if (!(syntax & RE_NO_GNU_OPS))
1915 token->type = OP_NOTSPACE;
1916 break;
1917 case '`':
1918 if (!(syntax & RE_NO_GNU_OPS))
1919 {
1920 token->type = ANCHOR;
1921 token->opr.ctx_type = BUF_FIRST;
1922 }
1923 break;
1924 case '\'':
1925 if (!(syntax & RE_NO_GNU_OPS))
1926 {
1927 token->type = ANCHOR;
1928 token->opr.ctx_type = BUF_LAST;
1929 }
1930 break;
1931 case '(':
1932 if (!(syntax & RE_NO_BK_PARENS))
1933 token->type = OP_OPEN_SUBEXP;
1934 break;
1935 case ')':
1936 if (!(syntax & RE_NO_BK_PARENS))
1937 token->type = OP_CLOSE_SUBEXP;
1938 break;
1939 case '+':
1940 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1941 token->type = OP_DUP_PLUS;
1942 break;
1943 case '?':
1944 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1945 token->type = OP_DUP_QUESTION;
1946 break;
1947 case '{':
1948 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1949 token->type = OP_OPEN_DUP_NUM;
1950 break;
1951 case '}':
1952 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1953 token->type = OP_CLOSE_DUP_NUM;
1954 break;
1955 default:
1956 break;
1957 }
1958 return 2;
1959 }
1960
1961 token->type = CHARACTER;
1962#ifdef RE_ENABLE_I18N
1963 if (input->mb_cur_max > 1)
1964 {
1965 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1966 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1967 }
1968 else
1969#endif
1970 token->word_char = IS_WORD_CHAR (token->opr.c);
1971
1972 switch (c)
1973 {
1974 case '\n':
1975 if (syntax & RE_NEWLINE_ALT)
1976 token->type = OP_ALT;
1977 break;
1978 case '|':
1979 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1980 token->type = OP_ALT;
1981 break;
1982 case '*':
1983 token->type = OP_DUP_ASTERISK;
1984 break;
1985 case '+':
1986 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1987 token->type = OP_DUP_PLUS;
1988 break;
1989 case '?':
1990 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1991 token->type = OP_DUP_QUESTION;
1992 break;
1993 case '{':
1994 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1995 token->type = OP_OPEN_DUP_NUM;
1996 break;
1997 case '}':
1998 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1999 token->type = OP_CLOSE_DUP_NUM;
2000 break;
2001 case '(':
2002 if (syntax & RE_NO_BK_PARENS)
2003 token->type = OP_OPEN_SUBEXP;
2004 break;
2005 case ')':
2006 if (syntax & RE_NO_BK_PARENS)
2007 token->type = OP_CLOSE_SUBEXP;
2008 break;
2009 case '[':
2010 token->type = OP_OPEN_BRACKET;
2011 break;
2012 case '.':
2013 token->type = OP_PERIOD;
2014 break;
2015 case '^':
2016 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2017 re_string_cur_idx (input) != 0)
2018 {
2019 char prev = re_string_peek_byte (input, -1);
2020 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2021 break;
2022 }
2023 token->type = ANCHOR;
2024 token->opr.ctx_type = LINE_FIRST;
2025 break;
2026 case '$':
2027 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2028 re_string_cur_idx (input) + 1 != re_string_length (input))
2029 {
2030 re_token_t next;
2031 re_string_skip_bytes (input, 1);
2032 peek_token (&next, input, syntax);
2033 re_string_skip_bytes (input, -1);
2034 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2035 break;
2036 }
2037 token->type = ANCHOR;
2038 token->opr.ctx_type = LINE_LAST;
2039 break;
2040 default:
2041 break;
2042 }
2043 return 1;
2044}
2045
2046/* Peek a token from INPUT, and return the length of the token.
2047 We must not use this function out of bracket expressions. */
2048
2049static int
2050internal_function
2051peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2052{
2053 unsigned char c;
2054 if (re_string_eoi (input))
2055 {
2056 token->type = END_OF_RE;
2057 return 0;
2058 }
2059 c = re_string_peek_byte (input, 0);
2060 token->opr.c = c;
2061
2062#ifdef RE_ENABLE_I18N
2063 if (input->mb_cur_max > 1 &&
2064 !re_string_first_byte (input, re_string_cur_idx (input)))
2065 {
2066 token->type = CHARACTER;
2067 return 1;
2068 }
2069#endif /* RE_ENABLE_I18N */
2070
2071 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2072 && re_string_cur_idx (input) + 1 < re_string_length (input))
2073 {
2074 /* In this case, '\' escape a character. */
2075 unsigned char c2;
2076 re_string_skip_bytes (input, 1);
2077 c2 = re_string_peek_byte (input, 0);
2078 token->opr.c = c2;
2079 token->type = CHARACTER;
2080 return 1;
2081 }
2082 if (c == '[') /* '[' is a special char in a bracket exps. */
2083 {
2084 unsigned char c2;
2085 int token_len;
2086 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2087 c2 = re_string_peek_byte (input, 1);
2088 else
2089 c2 = 0;
2090 token->opr.c = c2;
2091 token_len = 2;
2092 switch (c2)
2093 {
2094 case '.':
2095 token->type = OP_OPEN_COLL_ELEM;
2096 break;
2097 case '=':
2098 token->type = OP_OPEN_EQUIV_CLASS;
2099 break;
2100 case ':':
2101 if (syntax & RE_CHAR_CLASSES)
2102 {
2103 token->type = OP_OPEN_CHAR_CLASS;
2104 break;
2105 }
2106 /* else fall through. */
2107 default:
2108 token->type = CHARACTER;
2109 token->opr.c = c;
2110 token_len = 1;
2111 break;
2112 }
2113 return token_len;
2114 }
2115 switch (c)
2116 {
2117 case '-':
2118 token->type = OP_CHARSET_RANGE;
2119 break;
2120 case ']':
2121 token->type = OP_CLOSE_BRACKET;
2122 break;
2123 case '^':
2124 token->type = OP_NON_MATCH_LIST;
2125 break;
2126 default:
2127 token->type = CHARACTER;
2128 }
2129 return 1;
2130}
2131
2132
2133/* Functions for parser. */
2134
2135/* Entry point of the parser.
2136 Parse the regular expression REGEXP and return the structure tree.
2137 If an error occurs, ERR is set by error code, and return NULL.
2138 This function build the following tree, from regular expression <reg_exp>:
2139 CAT
2140 / \
2141 / \
2142 <reg_exp> EOR
2143
2144 CAT means concatenation.
2145 EOR means end of regular expression. */
2146
2147static bin_tree_t *
2148parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2149 reg_errcode_t *err)
2150{
2151 re_dfa_t *dfa = preg->buffer;
2152 bin_tree_t *tree, *eor, *root;
2153 re_token_t current_token;
2154 dfa->syntax = syntax;
2155 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2156 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2157 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2158 return NULL;
2159 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2160 if (tree != NULL)
2161 root = create_tree (dfa, tree, eor, CONCAT);
2162 else
2163 root = eor;
2164 if (BE (eor == NULL || root == NULL, 0))
2165 {
2166 *err = REG_ESPACE;
2167 return NULL;
2168 }
2169 return root;
2170}
2171
2172/* This function build the following tree, from regular expression
2173 <branch1>|<branch2>:
2174 ALT
2175 / \
2176 / \
2177 <branch1> <branch2>
2178
2179 ALT means alternative, which represents the operator '|'. */
2180
2181static bin_tree_t *
2182parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2183 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2184{
2185 re_dfa_t *dfa = preg->buffer;
2186 bin_tree_t *tree, *branch = NULL;
2187 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2188 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2189 return NULL;
2190
2191 while (token->type == OP_ALT)
2192 {
2193 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2194 if (token->type != OP_ALT && token->type != END_OF_RE
2195 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2196 {
2197 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2198 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2199 return NULL;
2200 }
2201 else
2202 branch = NULL;
2203 tree = create_tree (dfa, tree, branch, OP_ALT);
2204 if (BE (tree == NULL, 0))
2205 {
2206 *err = REG_ESPACE;
2207 return NULL;
2208 }
2209 }
2210 return tree;
2211}
2212
2213/* This function build the following tree, from regular expression
2214 <exp1><exp2>:
2215 CAT
2216 / \
2217 / \
2218 <exp1> <exp2>
2219
2220 CAT means concatenation. */
2221
2222static bin_tree_t *
2223parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2224 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2225{
2226 bin_tree_t *tree, *expr;
2227 re_dfa_t *dfa = preg->buffer;
2228 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2229 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2230 return NULL;
2231
2232 while (token->type != OP_ALT && token->type != END_OF_RE
2233 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2234 {
2235 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2236 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2237 {
2238 if (tree != NULL)
2239 postorder (tree, free_tree, NULL);
2240 return NULL;
2241 }
2242 if (tree != NULL && expr != NULL)
2243 {
2244 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2245 if (newtree == NULL)
2246 {
2247 postorder (expr, free_tree, NULL);
2248 postorder (tree, free_tree, NULL);
2249 *err = REG_ESPACE;
2250 return NULL;
2251 }
2252 tree = newtree;
2253 }
2254 else if (tree == NULL)
2255 tree = expr;
2256 /* Otherwise expr == NULL, we don't need to create new tree. */
2257 }
2258 return tree;
2259}
2260
2261/* This function build the following tree, from regular expression a*:
2262 *
2263 |
2264 a
2265*/
2266
2267static bin_tree_t *
2268parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2269 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2270{
2271 re_dfa_t *dfa = preg->buffer;
2272 bin_tree_t *tree;
2273 switch (token->type)
2274 {
2275 case CHARACTER:
2276 tree = create_token_tree (dfa, NULL, NULL, token);
2277 if (BE (tree == NULL, 0))
2278 {
2279 *err = REG_ESPACE;
2280 return NULL;
2281 }
2282#ifdef RE_ENABLE_I18N
2283 if (dfa->mb_cur_max > 1)
2284 {
2285 while (!re_string_eoi (regexp)
2286 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2287 {
2288 bin_tree_t *mbc_remain;
2289 fetch_token (token, regexp, syntax);
2290 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2291 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2292 if (BE (mbc_remain == NULL || tree == NULL, 0))
2293 {
2294 *err = REG_ESPACE;
2295 return NULL;
2296 }
2297 }
2298 }
2299#endif
2300 break;
2301 case OP_OPEN_SUBEXP:
2302 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2303 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2304 return NULL;
2305 break;
2306 case OP_OPEN_BRACKET:
2307 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2308 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2309 return NULL;
2310 break;
2311 case OP_BACK_REF:
2312 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2313 {
2314 *err = REG_ESUBREG;
2315 return NULL;
2316 }
2317 dfa->used_bkref_map |= 1 << token->opr.idx;
2318 tree = create_token_tree (dfa, NULL, NULL, token);
2319 if (BE (tree == NULL, 0))
2320 {
2321 *err = REG_ESPACE;
2322 return NULL;
2323 }
2324 ++dfa->nbackref;
2325 dfa->has_mb_node = 1;
2326 break;
2327 case OP_OPEN_DUP_NUM:
2328 if (syntax & RE_CONTEXT_INVALID_DUP)
2329 {
2330 *err = REG_BADRPT;
2331 return NULL;
2332 }
2333 /* FALLTHROUGH */
2334 case OP_DUP_ASTERISK:
2335 case OP_DUP_PLUS:
2336 case OP_DUP_QUESTION:
2337 if (syntax & RE_CONTEXT_INVALID_OPS)
2338 {
2339 *err = REG_BADRPT;
2340 return NULL;
2341 }
2342 else if (syntax & RE_CONTEXT_INDEP_OPS)
2343 {
2344 fetch_token (token, regexp, syntax);
2345 return parse_expression (regexp, preg, token, syntax, nest, err);
2346 }
2347 /* else fall through */
2348 case OP_CLOSE_SUBEXP:
2349 if ((token->type == OP_CLOSE_SUBEXP) &&
2350 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2351 {
2352 *err = REG_ERPAREN;
2353 return NULL;
2354 }
2355 /* else fall through */
2356 case OP_CLOSE_DUP_NUM:
2357 /* We treat it as a normal character. */
2358
2359 /* Then we can these characters as normal characters. */
2360 token->type = CHARACTER;
2361 /* mb_partial and word_char bits should be initialized already
2362 by peek_token. */
2363 tree = create_token_tree (dfa, NULL, NULL, token);
2364 if (BE (tree == NULL, 0))
2365 {
2366 *err = REG_ESPACE;
2367 return NULL;
2368 }
2369 break;
2370 case ANCHOR:
2371 if ((token->opr.ctx_type
2372 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2373 && dfa->word_ops_used == 0)
2374 init_word_char (dfa);
2375 if (token->opr.ctx_type == WORD_DELIM
2376 || token->opr.ctx_type == NOT_WORD_DELIM)
2377 {
2378 bin_tree_t *tree_first, *tree_last;
2379 if (token->opr.ctx_type == WORD_DELIM)
2380 {
2381 token->opr.ctx_type = WORD_FIRST;
2382 tree_first = create_token_tree (dfa, NULL, NULL, token);
2383 token->opr.ctx_type = WORD_LAST;
2384 }
2385 else
2386 {
2387 token->opr.ctx_type = INSIDE_WORD;
2388 tree_first = create_token_tree (dfa, NULL, NULL, token);
2389 token->opr.ctx_type = INSIDE_NOTWORD;
2390 }
2391 tree_last = create_token_tree (dfa, NULL, NULL, token);
2392 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2393 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2394 {
2395 *err = REG_ESPACE;
2396 return NULL;
2397 }
2398 }
2399 else
2400 {
2401 tree = create_token_tree (dfa, NULL, NULL, token);
2402 if (BE (tree == NULL, 0))
2403 {
2404 *err = REG_ESPACE;
2405 return NULL;
2406 }
2407 }
2408 /* We must return here, since ANCHORs can't be followed
2409 by repetition operators.
2410 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2411 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2412 fetch_token (token, regexp, syntax);
2413 return tree;
2414 case OP_PERIOD:
2415 tree = create_token_tree (dfa, NULL, NULL, token);
2416 if (BE (tree == NULL, 0))
2417 {
2418 *err = REG_ESPACE;
2419 return NULL;
2420 }
2421 if (dfa->mb_cur_max > 1)
2422 dfa->has_mb_node = 1;
2423 break;
2424 case OP_WORD:
2425 case OP_NOTWORD:
2426 tree = build_charclass_op (dfa, regexp->trans,
2427 (const unsigned char *) "alnum",
2428 (const unsigned char *) "_",
2429 token->type == OP_NOTWORD, err);
2430 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2431 return NULL;
2432 break;
2433 case OP_SPACE:
2434 case OP_NOTSPACE:
2435 tree = build_charclass_op (dfa, regexp->trans,
2436 (const unsigned char *) "space",
2437 (const unsigned char *) "",
2438 token->type == OP_NOTSPACE, err);
2439 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2440 return NULL;
2441 break;
2442 case OP_ALT:
2443 case END_OF_RE:
2444 return NULL;
2445 case BACK_SLASH:
2446 *err = REG_EESCAPE;
2447 return NULL;
2448 default:
2449 /* Must not happen? */
2450#ifdef DEBUG
2451 assert (0);
2452#endif
2453 return NULL;
2454 }
2455 fetch_token (token, regexp, syntax);
2456
2457 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2458 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2459 {
2460 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2461 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2462 return NULL;
2463 /* In BRE consecutive duplications are not allowed. */
2464 if ((syntax & RE_CONTEXT_INVALID_DUP)
2465 && (token->type == OP_DUP_ASTERISK
2466 || token->type == OP_OPEN_DUP_NUM))
2467 {
2468 *err = REG_BADRPT;
2469 return NULL;
2470 }
2471 }
2472
2473 return tree;
2474}
2475
2476/* This function build the following tree, from regular expression
2477 (<reg_exp>):
2478 SUBEXP
2479 |
2480 <reg_exp>
2481*/
2482
2483static bin_tree_t *
2484parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2485 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2486{
2487 re_dfa_t *dfa = preg->buffer;
2488 bin_tree_t *tree;
2489 size_t cur_nsub;
2490 cur_nsub = preg->re_nsub++;
2491
2492 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2493
2494 /* The subexpression may be a null string. */
2495 if (token->type == OP_CLOSE_SUBEXP)
2496 tree = NULL;
2497 else
2498 {
2499 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2500 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2501 {
2502 if (tree != NULL)
2503 postorder (tree, free_tree, NULL);
2504 *err = REG_EPAREN;
2505 }
2506 if (BE (*err != REG_NOERROR, 0))
2507 return NULL;
2508 }
2509
2510 if (cur_nsub <= '9' - '1')
2511 dfa->completed_bkref_map |= 1 << cur_nsub;
2512
2513 tree = create_tree (dfa, tree, NULL, SUBEXP);
2514 if (BE (tree == NULL, 0))
2515 {
2516 *err = REG_ESPACE;
2517 return NULL;
2518 }
2519 tree->token.opr.idx = cur_nsub;
2520 return tree;
2521}
2522
2523/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2524
2525static bin_tree_t *
2526parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2527 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2528{
2529 bin_tree_t *tree = NULL, *old_tree = NULL;
2530 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2531 re_token_t start_token = *token;
2532
2533 if (token->type == OP_OPEN_DUP_NUM)
2534 {
2535 end = 0;
2536 start = fetch_number (regexp, token, syntax);
2537 if (start == REG_MISSING)
2538 {
2539 if (token->type == CHARACTER && token->opr.c == ',')
2540 start = 0; /* We treat "{,m}" as "{0,m}". */
2541 else
2542 {
2543 *err = REG_BADBR; /* <re>{} is invalid. */
2544 return NULL;
2545 }
2546 }
2547 if (BE (start != REG_ERROR, 1))
2548 {
2549 /* We treat "{n}" as "{n,n}". */
2550 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2551 : ((token->type == CHARACTER && token->opr.c == ',')
2552 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2553 }
2554 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2555 {
2556 /* Invalid sequence. */
2557 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2558 {
2559 if (token->type == END_OF_RE)
2560 *err = REG_EBRACE;
2561 else
2562 *err = REG_BADBR;
2563
2564 return NULL;
2565 }
2566
2567 /* If the syntax bit is set, rollback. */
2568 re_string_set_index (regexp, start_idx);
2569 *token = start_token;
2570 token->type = CHARACTER;
2571 /* mb_partial and word_char bits should be already initialized by
2572 peek_token. */
2573 return elem;
2574 }
2575
2576 if (BE ((end != REG_MISSING && start > end)
2577 || token->type != OP_CLOSE_DUP_NUM, 0))
2578 {
2579 /* First number greater than second. */
2580 *err = REG_BADBR;
2581 return NULL;
2582 }
2583
2584 if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0))
2585 {
2586 *err = REG_ESIZE;
2587 return NULL;
2588 }
2589 }
2590 else
2591 {
2592 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2593 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2594 }
2595
2596 fetch_token (token, regexp, syntax);
2597
2598 if (BE (elem == NULL, 0))
2599 return NULL;
2600 if (BE (start == 0 && end == 0, 0))
2601 {
2602 postorder (elem, free_tree, NULL);
2603 return NULL;
2604 }
2605
2606 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2607 if (BE (start > 0, 0))
2608 {
2609 tree = elem;
2610 for (i = 2; i <= start; ++i)
2611 {
2612 elem = duplicate_tree (elem, dfa);
2613 tree = create_tree (dfa, tree, elem, CONCAT);
2614 if (BE (elem == NULL || tree == NULL, 0))
2615 goto parse_dup_op_espace;
2616 }
2617
2618 if (start == end)
2619 return tree;
2620
2621 /* Duplicate ELEM before it is marked optional. */
2622 elem = duplicate_tree (elem, dfa);
2623 old_tree = tree;
2624 }
2625 else
2626 old_tree = NULL;
2627
2628 if (elem->token.type == SUBEXP)
2629 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2630
2631 tree = create_tree (dfa, elem, NULL,
2632 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2633 if (BE (tree == NULL, 0))
2634 goto parse_dup_op_espace;
2635
2636/* From gnulib's "intprops.h":
2637 True if the arithmetic type T is signed. */
2638#define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2639
2640 /* This loop is actually executed only when end != REG_MISSING,
2641 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2642 already created the start+1-th copy. */
2643 if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2644 for (i = start + 2; i <= end; ++i)
2645 {
2646 elem = duplicate_tree (elem, dfa);
2647 tree = create_tree (dfa, tree, elem, CONCAT);
2648 if (BE (elem == NULL || tree == NULL, 0))
2649 goto parse_dup_op_espace;
2650
2651 tree = create_tree (dfa, tree, NULL, OP_ALT);
2652 if (BE (tree == NULL, 0))
2653 goto parse_dup_op_espace;
2654 }
2655
2656 if (old_tree)
2657 tree = create_tree (dfa, old_tree, tree, CONCAT);
2658
2659 return tree;
2660
2661 parse_dup_op_espace:
2662 *err = REG_ESPACE;
2663 return NULL;
2664}
2665
2666/* Size of the names for collating symbol/equivalence_class/character_class.
2667 I'm not sure, but maybe enough. */
2668#define BRACKET_NAME_BUF_SIZE 32
2669
2670#ifndef _LIBC
2671 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2672 Build the range expression which starts from START_ELEM, and ends
2673 at END_ELEM. The result are written to MBCSET and SBCSET.
2674 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2675 mbcset->range_ends, is a pointer argument since we may
2676 update it. */
2677
2678static reg_errcode_t
2679internal_function
2680# ifdef RE_ENABLE_I18N
2681build_range_exp (const reg_syntax_t syntax,
2682 bitset_t sbcset,
2683 re_charset_t *mbcset,
2684 Idx *range_alloc,
2685 const bracket_elem_t *start_elem,
2686 const bracket_elem_t *end_elem)
2687# else /* not RE_ENABLE_I18N */
2688build_range_exp (const reg_syntax_t syntax,
2689 bitset_t sbcset,
2690 const bracket_elem_t *start_elem,
2691 const bracket_elem_t *end_elem)
2692# endif /* not RE_ENABLE_I18N */
2693{
2694 unsigned int start_ch, end_ch;
2695 /* Equivalence Classes and Character Classes can't be a range start/end. */
2696 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2697 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2698 0))
2699 return REG_ERANGE;
2700
2701 /* We can handle no multi character collating elements without libc
2702 support. */
2703 if (BE ((start_elem->type == COLL_SYM
2704 && strlen ((char *) start_elem->opr.name) > 1)
2705 || (end_elem->type == COLL_SYM
2706 && strlen ((char *) end_elem->opr.name) > 1), 0))
2707 return REG_ECOLLATE;
2708
2709# ifdef RE_ENABLE_I18N
2710 {
2711 wchar_t wc;
2712 wint_t start_wc;
2713 wint_t end_wc;
2714 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2715
2716 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2717 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2718 : 0));
2719 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2720 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2721 : 0));
2722 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2723 ? __btowc (start_ch) : start_elem->opr.wch);
2724 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2725 ? __btowc (end_ch) : end_elem->opr.wch);
2726 if (start_wc == WEOF || end_wc == WEOF)
2727 return REG_ECOLLATE;
2728 cmp_buf[0] = start_wc;
2729 cmp_buf[4] = end_wc;
2730
2731 if (BE ((syntax & RE_NO_EMPTY_RANGES)
2732 && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
2733 return REG_ERANGE;
2734
2735 /* Got valid collation sequence values, add them as a new entry.
2736 However, for !_LIBC we have no collation elements: if the
2737 character set is single byte, the single byte character set
2738 that we build below suffices. parse_bracket_exp passes
2739 no MBCSET if dfa->mb_cur_max == 1. */
2740 if (mbcset)
2741 {
2742 /* Check the space of the arrays. */
2743 if (BE (*range_alloc == mbcset->nranges, 0))
2744 {
2745 /* There is not enough space, need realloc. */
2746 wchar_t *new_array_start, *new_array_end;
2747 Idx new_nranges;
2748
2749 /* +1 in case of mbcset->nranges is 0. */
2750 new_nranges = 2 * mbcset->nranges + 1;
2751 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2752 are NULL if *range_alloc == 0. */
2753 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2754 new_nranges);
2755 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2756 new_nranges);
2757
2758 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2759 return REG_ESPACE;
2760
2761 mbcset->range_starts = new_array_start;
2762 mbcset->range_ends = new_array_end;
2763 *range_alloc = new_nranges;
2764 }
2765
2766 mbcset->range_starts[mbcset->nranges] = start_wc;
2767 mbcset->range_ends[mbcset->nranges++] = end_wc;
2768 }
2769
2770 /* Build the table for single byte characters. */
2771 for (wc = 0; wc < SBC_MAX; ++wc)
2772 {
2773 cmp_buf[2] = wc;
2774 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2775 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2776 bitset_set (sbcset, wc);
2777 }
2778 }
2779# else /* not RE_ENABLE_I18N */
2780 {
2781 unsigned int ch;
2782 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2783 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2784 : 0));
2785 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2786 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2787 : 0));
2788 if (start_ch > end_ch)
2789 return REG_ERANGE;
2790 /* Build the table for single byte characters. */
2791 for (ch = 0; ch < SBC_MAX; ++ch)
2792 if (start_ch <= ch && ch <= end_ch)
2793 bitset_set (sbcset, ch);
2794 }
2795# endif /* not RE_ENABLE_I18N */
2796 return REG_NOERROR;
2797}
2798#endif /* not _LIBC */
2799
2800#ifndef _LIBC
2801/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2802 Build the collating element which is represented by NAME.
2803 The result are written to MBCSET and SBCSET.
2804 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2805 pointer argument since we may update it. */
2806
2807static reg_errcode_t
2808internal_function
2809# ifdef RE_ENABLE_I18N
2810build_collating_symbol (bitset_t sbcset,
2811 re_charset_t *mbcset _UNUSED_PARAMETER_,
2812 Idx *coll_sym_alloc _UNUSED_PARAMETER_,
2813 const unsigned char *name)
2814# else /* not RE_ENABLE_I18N */
2815build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2816# endif /* not RE_ENABLE_I18N */
2817{
2818 size_t name_len = strlen ((const char *) name);
2819 if (BE (name_len != 1, 0))
2820 return REG_ECOLLATE;
2821 else
2822 {
2823 bitset_set (sbcset, name[0]);
2824 return REG_NOERROR;
2825 }
2826}
2827#endif /* not _LIBC */
2828
2829/* This function parse bracket expression like "[abc]", "[a-c]",
2830 "[[.a-a.]]" etc. */
2831
2832static bin_tree_t *
2833parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2834 reg_syntax_t syntax, reg_errcode_t *err)
2835{
2836#ifdef _LIBC
2837 const unsigned char *collseqmb;
2838 const char *collseqwc;
2839 uint32_t nrules;
2840 int32_t table_size;
2841 const int32_t *symb_table;
2842 const unsigned char *extra;
2843
2844 /* Local function for parse_bracket_exp used in _LIBC environment.
2845 Seek the collating symbol entry corresponding to NAME.
2846 Return the index of the symbol in the SYMB_TABLE. */
2847
2848 auto inline int32_t
2849 __attribute ((always_inline))
2850 seek_collating_symbol_entry (name, name_len)
2851 const unsigned char *name;
2852 size_t name_len;
2853 {
2854 int32_t hash = elem_hash ((const char *) name, name_len);
2855 int32_t elem = hash % table_size;
2856 if (symb_table[2 * elem] != 0)
2857 {
2858 int32_t second = hash % (table_size - 2) + 1;
2859
2860 do
2861 {
2862 /* First compare the hashing value. */
2863 if (symb_table[2 * elem] == hash
2864 /* Compare the length of the name. */
2865 && name_len == extra[symb_table[2 * elem + 1]]
2866 /* Compare the name. */
2867 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2868 name_len) == 0)
2869 {
2870 /* Yep, this is the entry. */
2871 break;
2872 }
2873
2874 /* Next entry. */
2875 elem += second;
2876 }
2877 while (symb_table[2 * elem] != 0);
2878 }
2879 return elem;
2880 }
2881
2882 /* Local function for parse_bracket_exp used in _LIBC environment.
2883 Look up the collation sequence value of BR_ELEM.
2884 Return the value if succeeded, UINT_MAX otherwise. */
2885
2886 auto inline unsigned int
2887 __attribute ((always_inline))
2888 lookup_collation_sequence_value (br_elem)
2889 bracket_elem_t *br_elem;
2890 {
2891 if (br_elem->type == SB_CHAR)
2892 {
2893 /*
2894 if (MB_CUR_MAX == 1)
2895 */
2896 if (nrules == 0)
2897 return collseqmb[br_elem->opr.ch];
2898 else
2899 {
2900 wint_t wc = __btowc (br_elem->opr.ch);
2901 return __collseq_table_lookup (collseqwc, wc);
2902 }
2903 }
2904 else if (br_elem->type == MB_CHAR)
2905 {
2906 if (nrules != 0)
2907 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2908 }
2909 else if (br_elem->type == COLL_SYM)
2910 {
2911 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2912 if (nrules != 0)
2913 {
2914 int32_t elem, idx;
2915 elem = seek_collating_symbol_entry (br_elem->opr.name,
2916 sym_name_len);
2917 if (symb_table[2 * elem] != 0)
2918 {
2919 /* We found the entry. */
2920 idx = symb_table[2 * elem + 1];
2921 /* Skip the name of collating element name. */
2922 idx += 1 + extra[idx];
2923 /* Skip the byte sequence of the collating element. */
2924 idx += 1 + extra[idx];
2925 /* Adjust for the alignment. */
2926 idx = (idx + 3) & ~3;
2927 /* Skip the multibyte collation sequence value. */
2928 idx += sizeof (unsigned int);
2929 /* Skip the wide char sequence of the collating element. */
2930 idx += sizeof (unsigned int) *
2931 (1 + *(unsigned int *) (extra + idx));
2932 /* Return the collation sequence value. */
2933 return *(unsigned int *) (extra + idx);
2934 }
2935 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2936 {
2937 /* No valid character. Match it as a single byte
2938 character. */
2939 return collseqmb[br_elem->opr.name[0]];
2940 }
2941 }
2942 else if (sym_name_len == 1)
2943 return collseqmb[br_elem->opr.name[0]];
2944 }
2945 return UINT_MAX;
2946 }
2947
2948 /* Local function for parse_bracket_exp used in _LIBC environment.
2949 Build the range expression which starts from START_ELEM, and ends
2950 at END_ELEM. The result are written to MBCSET and SBCSET.
2951 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2952 mbcset->range_ends, is a pointer argument since we may
2953 update it. */
2954
2955 auto inline reg_errcode_t
2956 __attribute ((always_inline))
2957 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2958 re_charset_t *mbcset;
2959 Idx *range_alloc;
2960 bitset_t sbcset;
2961 bracket_elem_t *start_elem, *end_elem;
2962 {
2963 unsigned int ch;
2964 uint32_t start_collseq;
2965 uint32_t end_collseq;
2966
2967 /* Equivalence Classes and Character Classes can't be a range
2968 start/end. */
2969 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2970 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2971 0))
2972 return REG_ERANGE;
2973
2974 start_collseq = lookup_collation_sequence_value (start_elem);
2975 end_collseq = lookup_collation_sequence_value (end_elem);
2976 /* Check start/end collation sequence values. */
2977 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2978 return REG_ECOLLATE;
2979 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2980 return REG_ERANGE;
2981
2982 /* Got valid collation sequence values, add them as a new entry.
2983 However, if we have no collation elements, and the character set
2984 is single byte, the single byte character set that we
2985 build below suffices. */
2986 if (nrules > 0 || dfa->mb_cur_max > 1)
2987 {
2988 /* Check the space of the arrays. */
2989 if (BE (*range_alloc == mbcset->nranges, 0))
2990 {
2991 /* There is not enough space, need realloc. */
2992 uint32_t *new_array_start;
2993 uint32_t *new_array_end;
2994 Idx new_nranges;
2995
2996 /* +1 in case of mbcset->nranges is 0. */
2997 new_nranges = 2 * mbcset->nranges + 1;
2998 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2999 new_nranges);
3000 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
3001 new_nranges);
3002
3003 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
3004 return REG_ESPACE;
3005
3006 mbcset->range_starts = new_array_start;
3007 mbcset->range_ends = new_array_end;
3008 *range_alloc = new_nranges;
3009 }
3010
3011 mbcset->range_starts[mbcset->nranges] = start_collseq;
3012 mbcset->range_ends[mbcset->nranges++] = end_collseq;
3013 }
3014
3015 /* Build the table for single byte characters. */
3016 for (ch = 0; ch < SBC_MAX; ch++)
3017 {
3018 uint32_t ch_collseq;
3019 /*
3020 if (MB_CUR_MAX == 1)
3021 */
3022 if (nrules == 0)
3023 ch_collseq = collseqmb[ch];
3024 else
3025 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3026 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3027 bitset_set (sbcset, ch);
3028 }
3029 return REG_NOERROR;
3030 }
3031
3032 /* Local function for parse_bracket_exp used in _LIBC environment.
3033 Build the collating element which is represented by NAME.
3034 The result are written to MBCSET and SBCSET.
3035 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3036 pointer argument since we may update it. */
3037
3038 auto inline reg_errcode_t
3039 __attribute ((always_inline))
3040 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3041 re_charset_t *mbcset;
3042 Idx *coll_sym_alloc;
3043 bitset_t sbcset;
3044 const unsigned char *name;
3045 {
3046 int32_t elem, idx;
3047 size_t name_len = strlen ((const char *) name);
3048 if (nrules != 0)
3049 {
3050 elem = seek_collating_symbol_entry (name, name_len);
3051 if (symb_table[2 * elem] != 0)
3052 {
3053 /* We found the entry. */
3054 idx = symb_table[2 * elem + 1];
3055 /* Skip the name of collating element name. */
3056 idx += 1 + extra[idx];
3057 }
3058 else if (symb_table[2 * elem] == 0 && name_len == 1)
3059 {
3060 /* No valid character, treat it as a normal
3061 character. */
3062 bitset_set (sbcset, name[0]);
3063 return REG_NOERROR;
3064 }
3065 else
3066 return REG_ECOLLATE;
3067
3068 /* Got valid collation sequence, add it as a new entry. */
3069 /* Check the space of the arrays. */
3070 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3071 {
3072 /* Not enough, realloc it. */
3073 /* +1 in case of mbcset->ncoll_syms is 0. */
3074 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3075 /* Use realloc since mbcset->coll_syms is NULL
3076 if *alloc == 0. */
3077 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3078 new_coll_sym_alloc);
3079 if (BE (new_coll_syms == NULL, 0))
3080 return REG_ESPACE;
3081 mbcset->coll_syms = new_coll_syms;
3082 *coll_sym_alloc = new_coll_sym_alloc;
3083 }
3084 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3085 return REG_NOERROR;
3086 }
3087 else
3088 {
3089 if (BE (name_len != 1, 0))
3090 return REG_ECOLLATE;
3091 else
3092 {
3093 bitset_set (sbcset, name[0]);
3094 return REG_NOERROR;
3095 }
3096 }
3097 }
3098#endif
3099
3100 re_token_t br_token;
3101 re_bitset_ptr_t sbcset;
3102#ifdef RE_ENABLE_I18N
3103 re_charset_t *mbcset;
3104 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3105 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3106#endif /* not RE_ENABLE_I18N */
3107 bool non_match = false;
3108 bin_tree_t *work_tree;
3109 int token_len;
3110 bool first_round = true;
3111#ifdef _LIBC
3112 collseqmb = (const unsigned char *)
3113 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3114 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3115 if (nrules)
3116 {
3117 /*
3118 if (MB_CUR_MAX > 1)
3119 */
3120 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3121 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3122 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3123 _NL_COLLATE_SYMB_TABLEMB);
3124 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3125 _NL_COLLATE_SYMB_EXTRAMB);
3126 }
3127#endif
3128 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3129#ifdef RE_ENABLE_I18N
3130 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3131#endif /* RE_ENABLE_I18N */
3132#ifdef RE_ENABLE_I18N
3133 if (BE (sbcset == NULL || mbcset == NULL, 0))
3134#else
3135 if (BE (sbcset == NULL, 0))
3136#endif /* RE_ENABLE_I18N */
3137 {
3138 re_free (sbcset);
3139#ifdef RE_ENABLE_I18N
3140 re_free (mbcset);
3141#endif
3142 *err = REG_ESPACE;
3143 return NULL;
3144 }
3145
3146 token_len = peek_token_bracket (token, regexp, syntax);
3147 if (BE (token->type == END_OF_RE, 0))
3148 {
3149 *err = REG_BADPAT;
3150 goto parse_bracket_exp_free_return;
3151 }
3152 if (token->type == OP_NON_MATCH_LIST)
3153 {
3154#ifdef RE_ENABLE_I18N
3155 mbcset->non_match = 1;
3156#endif /* not RE_ENABLE_I18N */
3157 non_match = true;
3158 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3159 bitset_set (sbcset, '\n');
3160 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3161 token_len = peek_token_bracket (token, regexp, syntax);
3162 if (BE (token->type == END_OF_RE, 0))
3163 {
3164 *err = REG_BADPAT;
3165 goto parse_bracket_exp_free_return;
3166 }
3167 }
3168
3169 /* We treat the first ']' as a normal character. */
3170 if (token->type == OP_CLOSE_BRACKET)
3171 token->type = CHARACTER;
3172
3173 while (1)
3174 {
3175 bracket_elem_t start_elem, end_elem;
3176 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3177 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3178 reg_errcode_t ret;
3179 int token_len2 = 0;
3180 bool is_range_exp = false;
3181 re_token_t token2;
3182
3183 start_elem.opr.name = start_name_buf;
3184 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3185 syntax, first_round);
3186 if (BE (ret != REG_NOERROR, 0))
3187 {
3188 *err = ret;
3189 goto parse_bracket_exp_free_return;
3190 }
3191 first_round = false;
3192
3193 /* Get information about the next token. We need it in any case. */
3194 token_len = peek_token_bracket (token, regexp, syntax);
3195
3196 /* Do not check for ranges if we know they are not allowed. */
3197 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3198 {
3199 if (BE (token->type == END_OF_RE, 0))
3200 {
3201 *err = REG_EBRACK;
3202 goto parse_bracket_exp_free_return;
3203 }
3204 if (token->type == OP_CHARSET_RANGE)
3205 {
3206 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3207 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3208 if (BE (token2.type == END_OF_RE, 0))
3209 {
3210 *err = REG_EBRACK;
3211 goto parse_bracket_exp_free_return;
3212 }
3213 if (token2.type == OP_CLOSE_BRACKET)
3214 {
3215 /* We treat the last '-' as a normal character. */
3216 re_string_skip_bytes (regexp, -token_len);
3217 token->type = CHARACTER;
3218 }
3219 else
3220 is_range_exp = true;
3221 }
3222 }
3223
3224 if (is_range_exp == true)
3225 {
3226 end_elem.opr.name = end_name_buf;
3227 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3228 dfa, syntax, true);
3229 if (BE (ret != REG_NOERROR, 0))
3230 {
3231 *err = ret;
3232 goto parse_bracket_exp_free_return;
3233 }
3234
3235 token_len = peek_token_bracket (token, regexp, syntax);
3236
3237#ifdef _LIBC
3238 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3239 &start_elem, &end_elem);
3240#else
3241# ifdef RE_ENABLE_I18N
3242 *err = build_range_exp (syntax, sbcset,
3243 dfa->mb_cur_max > 1 ? mbcset : NULL,
3244 &range_alloc, &start_elem, &end_elem);
3245# else
3246 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3247# endif
3248#endif /* RE_ENABLE_I18N */
3249 if (BE (*err != REG_NOERROR, 0))
3250 goto parse_bracket_exp_free_return;
3251 }
3252 else
3253 {
3254 switch (start_elem.type)
3255 {
3256 case SB_CHAR:
3257 bitset_set (sbcset, start_elem.opr.ch);
3258 break;
3259#ifdef RE_ENABLE_I18N
3260 case MB_CHAR:
3261 /* Check whether the array has enough space. */
3262 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3263 {
3264 wchar_t *new_mbchars;
3265 /* Not enough, realloc it. */
3266 /* +1 in case of mbcset->nmbchars is 0. */
3267 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3268 /* Use realloc since array is NULL if *alloc == 0. */
3269 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3270 mbchar_alloc);
3271 if (BE (new_mbchars == NULL, 0))
3272 goto parse_bracket_exp_espace;
3273 mbcset->mbchars = new_mbchars;
3274 }
3275 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3276 break;
3277#endif /* RE_ENABLE_I18N */
3278 case EQUIV_CLASS:
3279 *err = build_equiv_class (sbcset,
3280#ifdef RE_ENABLE_I18N
3281 mbcset, &equiv_class_alloc,
3282#endif /* RE_ENABLE_I18N */
3283 start_elem.opr.name);
3284 if (BE (*err != REG_NOERROR, 0))
3285 goto parse_bracket_exp_free_return;
3286 break;
3287 case COLL_SYM:
3288 *err = build_collating_symbol (sbcset,
3289#ifdef RE_ENABLE_I18N
3290 mbcset, &coll_sym_alloc,
3291#endif /* RE_ENABLE_I18N */
3292 start_elem.opr.name);
3293 if (BE (*err != REG_NOERROR, 0))
3294 goto parse_bracket_exp_free_return;
3295 break;
3296 case CHAR_CLASS:
3297 *err = build_charclass (regexp->trans, sbcset,
3298#ifdef RE_ENABLE_I18N
3299 mbcset, &char_class_alloc,
3300#endif /* RE_ENABLE_I18N */
3301 start_elem.opr.name, syntax);
3302 if (BE (*err != REG_NOERROR, 0))
3303 goto parse_bracket_exp_free_return;
3304 break;
3305 default:
3306 assert (0);
3307 break;
3308 }
3309 }
3310 if (BE (token->type == END_OF_RE, 0))
3311 {
3312 *err = REG_EBRACK;
3313 goto parse_bracket_exp_free_return;
3314 }
3315 if (token->type == OP_CLOSE_BRACKET)
3316 break;
3317 }
3318
3319 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3320
3321 /* If it is non-matching list. */
3322 if (non_match)
3323 bitset_not (sbcset);
3324
3325#ifdef RE_ENABLE_I18N
3326 /* Ensure only single byte characters are set. */
3327 if (dfa->mb_cur_max > 1)
3328 bitset_mask (sbcset, dfa->sb_char);
3329
3330 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3331 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3332 || mbcset->non_match)))
3333 {
3334 bin_tree_t *mbc_tree;
3335 int sbc_idx;
3336 /* Build a tree for complex bracket. */
3337 dfa->has_mb_node = 1;
3338 br_token.type = COMPLEX_BRACKET;
3339 br_token.opr.mbcset = mbcset;
3340 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3341 if (BE (mbc_tree == NULL, 0))
3342 goto parse_bracket_exp_espace;
3343 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3344 if (sbcset[sbc_idx])
3345 break;
3346 /* If there are no bits set in sbcset, there is no point
3347 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3348 if (sbc_idx < BITSET_WORDS)
3349 {
3350 /* Build a tree for simple bracket. */
3351 br_token.type = SIMPLE_BRACKET;
3352 br_token.opr.sbcset = sbcset;
3353 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3354 if (BE (work_tree == NULL, 0))
3355 goto parse_bracket_exp_espace;
3356
3357 /* Then join them by ALT node. */
3358 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3359 if (BE (work_tree == NULL, 0))
3360 goto parse_bracket_exp_espace;
3361 }
3362 else
3363 {
3364 re_free (sbcset);
3365 work_tree = mbc_tree;
3366 }
3367 }
3368 else
3369#endif /* not RE_ENABLE_I18N */
3370 {
3371#ifdef RE_ENABLE_I18N
3372 free_charset (mbcset);
3373#endif
3374 /* Build a tree for simple bracket. */
3375 br_token.type = SIMPLE_BRACKET;
3376 br_token.opr.sbcset = sbcset;
3377 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3378 if (BE (work_tree == NULL, 0))
3379 goto parse_bracket_exp_espace;
3380 }
3381 return work_tree;
3382
3383 parse_bracket_exp_espace:
3384 *err = REG_ESPACE;
3385 parse_bracket_exp_free_return:
3386 re_free (sbcset);
3387#ifdef RE_ENABLE_I18N
3388 free_charset (mbcset);
3389#endif /* RE_ENABLE_I18N */
3390 return NULL;
3391}
3392
3393/* Parse an element in the bracket expression. */
3394
3395static reg_errcode_t
3396parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3397 re_token_t *token, int token_len,
3398 re_dfa_t *dfa _UNUSED_PARAMETER_,
3399 reg_syntax_t syntax, bool accept_hyphen)
3400{
3401#ifdef RE_ENABLE_I18N
3402 int cur_char_size;
3403 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3404 if (cur_char_size > 1)
3405 {
3406 elem->type = MB_CHAR;
3407 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3408 re_string_skip_bytes (regexp, cur_char_size);
3409 return REG_NOERROR;
3410 }
3411#endif /* RE_ENABLE_I18N */
3412 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3413 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3414 || token->type == OP_OPEN_EQUIV_CLASS)
3415 return parse_bracket_symbol (elem, regexp, token);
3416 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3417 {
3418 /* A '-' must only appear as anything but a range indicator before
3419 the closing bracket. Everything else is an error. */
3420 re_token_t token2;
3421 (void) peek_token_bracket (&token2, regexp, syntax);
3422 if (token2.type != OP_CLOSE_BRACKET)
3423 /* The actual error value is not standardized since this whole
3424 case is undefined. But ERANGE makes good sense. */
3425 return REG_ERANGE;
3426 }
3427 elem->type = SB_CHAR;
3428 elem->opr.ch = token->opr.c;
3429 return REG_NOERROR;
3430}
3431
3432/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3433 such as [:<character_class>:], [.<collating_element>.], and
3434 [=<equivalent_class>=]. */
3435
3436static reg_errcode_t
3437parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3438 re_token_t *token)
3439{
3440 unsigned char ch, delim = token->opr.c;
3441 int i = 0;
3442 if (re_string_eoi(regexp))
3443 return REG_EBRACK;
3444 for (;; ++i)
3445 {
3446 if (i >= BRACKET_NAME_BUF_SIZE)
3447 return REG_EBRACK;
3448 if (token->type == OP_OPEN_CHAR_CLASS)
3449 ch = re_string_fetch_byte_case (regexp);
3450 else
3451 ch = re_string_fetch_byte (regexp);
3452 if (re_string_eoi(regexp))
3453 return REG_EBRACK;
3454 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3455 break;
3456 elem->opr.name[i] = ch;
3457 }
3458 re_string_skip_bytes (regexp, 1);
3459 elem->opr.name[i] = '\0';
3460 switch (token->type)
3461 {
3462 case OP_OPEN_COLL_ELEM:
3463 elem->type = COLL_SYM;
3464 break;
3465 case OP_OPEN_EQUIV_CLASS:
3466 elem->type = EQUIV_CLASS;
3467 break;
3468 case OP_OPEN_CHAR_CLASS:
3469 elem->type = CHAR_CLASS;
3470 break;
3471 default:
3472 break;
3473 }
3474 return REG_NOERROR;
3475}
3476
3477 /* Helper function for parse_bracket_exp.
3478 Build the equivalence class which is represented by NAME.
3479 The result are written to MBCSET and SBCSET.
3480 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3481 is a pointer argument since we may update it. */
3482
3483static reg_errcode_t
3484#ifdef RE_ENABLE_I18N
3485build_equiv_class (bitset_t sbcset,
3486 re_charset_t *mbcset _UNUSED_PARAMETER_,
3487 Idx *equiv_class_alloc _UNUSED_PARAMETER_,
3488 const unsigned char *name)
3489#else /* not RE_ENABLE_I18N */
3490build_equiv_class (bitset_t sbcset, const unsigned char *name)
3491#endif /* not RE_ENABLE_I18N */
3492{
3493#ifdef _LIBC
3494 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3495 if (nrules != 0)
3496 {
3497 const int32_t *table, *indirect;
3498 const unsigned char *weights, *extra, *cp;
3499 unsigned char char_buf[2];
3500 int32_t idx1, idx2;
3501 unsigned int ch;
3502 size_t len;
3503 /* This #include defines a local function! */
3504# include <locale/weight.h>
3505 /* Calculate the index for equivalence class. */
3506 cp = name;
3507 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3508 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3509 _NL_COLLATE_WEIGHTMB);
3510 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3511 _NL_COLLATE_EXTRAMB);
3512 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3513 _NL_COLLATE_INDIRECTMB);
3514 idx1 = findidx (&cp, -1);
3515 if (BE (idx1 == 0 || *cp != '\0', 0))
3516 /* This isn't a valid character. */
3517 return REG_ECOLLATE;
3518
3519 /* Build single byte matching table for this equivalence class. */
3520 len = weights[idx1 & 0xffffff];
3521 for (ch = 0; ch < SBC_MAX; ++ch)
3522 {
3523 char_buf[0] = ch;
3524 cp = char_buf;
3525 idx2 = findidx (&cp, 1);
3526/*
3527 idx2 = table[ch];
3528*/
3529 if (idx2 == 0)
3530 /* This isn't a valid character. */
3531 continue;
3532 /* Compare only if the length matches and the collation rule
3533 index is the same. */
3534 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3535 {
3536 int cnt = 0;
3537
3538 while (cnt <= len &&
3539 weights[(idx1 & 0xffffff) + 1 + cnt]
3540 == weights[(idx2 & 0xffffff) + 1 + cnt])
3541 ++cnt;
3542
3543 if (cnt > len)
3544 bitset_set (sbcset, ch);
3545 }
3546 }
3547 /* Check whether the array has enough space. */
3548 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3549 {
3550 /* Not enough, realloc it. */
3551 /* +1 in case of mbcset->nequiv_classes is 0. */
3552 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3553 /* Use realloc since the array is NULL if *alloc == 0. */
3554 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3555 int32_t,
3556 new_equiv_class_alloc);
3557 if (BE (new_equiv_classes == NULL, 0))
3558 return REG_ESPACE;
3559 mbcset->equiv_classes = new_equiv_classes;
3560 *equiv_class_alloc = new_equiv_class_alloc;
3561 }
3562 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3563 }
3564 else
3565#endif /* _LIBC */
3566 {
3567 if (BE (strlen ((const char *) name) != 1, 0))
3568 return REG_ECOLLATE;
3569 bitset_set (sbcset, *name);
3570 }
3571 return REG_NOERROR;
3572}
3573
3574 /* Helper function for parse_bracket_exp.
3575 Build the character class which is represented by NAME.
3576 The result are written to MBCSET and SBCSET.
3577 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3578 is a pointer argument since we may update it. */
3579
3580static reg_errcode_t
3581#ifdef RE_ENABLE_I18N
3582build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3583 re_charset_t *mbcset, Idx *char_class_alloc,
3584 const unsigned char *class_name, reg_syntax_t syntax)
3585#else /* not RE_ENABLE_I18N */
3586build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3587 const unsigned char *class_name, reg_syntax_t syntax)
3588#endif /* not RE_ENABLE_I18N */
3589{
3590 int i;
3591 const char *name = (const char *) class_name;
3592
3593 /* In case of REG_ICASE "upper" and "lower" match the both of
3594 upper and lower cases. */
3595 if ((syntax & RE_ICASE)
3596 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3597 name = "alpha";
3598
3599#ifdef RE_ENABLE_I18N
3600 /* Check the space of the arrays. */
3601 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3602 {
3603 /* Not enough, realloc it. */
3604 /* +1 in case of mbcset->nchar_classes is 0. */
3605 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3606 /* Use realloc since array is NULL if *alloc == 0. */
3607 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3608 new_char_class_alloc);
3609 if (BE (new_char_classes == NULL, 0))
3610 return REG_ESPACE;
3611 mbcset->char_classes = new_char_classes;
3612 *char_class_alloc = new_char_class_alloc;
3613 }
3614 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3615#endif /* RE_ENABLE_I18N */
3616
3617#define BUILD_CHARCLASS_LOOP(ctype_func) \
3618 do { \
3619 if (BE (trans != NULL, 0)) \
3620 { \
3621 for (i = 0; i < SBC_MAX; ++i) \
3622 if (ctype_func (i)) \
3623 bitset_set (sbcset, trans[i]); \
3624 } \
3625 else \
3626 { \
3627 for (i = 0; i < SBC_MAX; ++i) \
3628 if (ctype_func (i)) \
3629 bitset_set (sbcset, i); \
3630 } \
3631 } while (0)
3632
3633 if (strcmp (name, "alnum") == 0)
3634 BUILD_CHARCLASS_LOOP (isalnum);
3635 else if (strcmp (name, "cntrl") == 0)
3636 BUILD_CHARCLASS_LOOP (iscntrl);
3637 else if (strcmp (name, "lower") == 0)
3638 BUILD_CHARCLASS_LOOP (islower);
3639 else if (strcmp (name, "space") == 0)
3640 BUILD_CHARCLASS_LOOP (isspace);
3641 else if (strcmp (name, "alpha") == 0)
3642 BUILD_CHARCLASS_LOOP (isalpha);
3643 else if (strcmp (name, "digit") == 0)
3644 BUILD_CHARCLASS_LOOP (isdigit);
3645 else if (strcmp (name, "print") == 0)
3646 BUILD_CHARCLASS_LOOP (isprint);
3647 else if (strcmp (name, "upper") == 0)
3648 BUILD_CHARCLASS_LOOP (isupper);
3649 else if (strcmp (name, "blank") == 0)
3650 BUILD_CHARCLASS_LOOP (isblank);
3651 else if (strcmp (name, "graph") == 0)
3652 BUILD_CHARCLASS_LOOP (isgraph);
3653 else if (strcmp (name, "punct") == 0)
3654 BUILD_CHARCLASS_LOOP (ispunct);
3655 else if (strcmp (name, "xdigit") == 0)
3656 BUILD_CHARCLASS_LOOP (isxdigit);
3657 else
3658 return REG_ECTYPE;
3659
3660 return REG_NOERROR;
3661}
3662
3663static bin_tree_t *
3664build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3665 const unsigned char *class_name,
3666 const unsigned char *extra, bool non_match,
3667 reg_errcode_t *err)
3668{
3669 re_bitset_ptr_t sbcset;
3670#ifdef RE_ENABLE_I18N
3671 re_charset_t *mbcset;
3672 Idx alloc = 0;
3673#endif /* not RE_ENABLE_I18N */
3674 reg_errcode_t ret;
3675 re_token_t br_token;
3676 bin_tree_t *tree;
3677
3678 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3679#ifdef RE_ENABLE_I18N
3680 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3681#endif /* RE_ENABLE_I18N */
3682
3683#ifdef RE_ENABLE_I18N
3684 if (BE (sbcset == NULL || mbcset == NULL, 0))
3685#else /* not RE_ENABLE_I18N */
3686 if (BE (sbcset == NULL, 0))
3687#endif /* not RE_ENABLE_I18N */
3688 {
3689 *err = REG_ESPACE;
3690 return NULL;
3691 }
3692
3693 if (non_match)
3694 {
3695#ifdef RE_ENABLE_I18N
3696 mbcset->non_match = 1;
3697#endif /* not RE_ENABLE_I18N */
3698 }
3699
3700 /* We don't care the syntax in this case. */
3701 ret = build_charclass (trans, sbcset,
3702#ifdef RE_ENABLE_I18N
3703 mbcset, &alloc,
3704#endif /* RE_ENABLE_I18N */
3705 class_name, 0);
3706
3707 if (BE (ret != REG_NOERROR, 0))
3708 {
3709 re_free (sbcset);
3710#ifdef RE_ENABLE_I18N
3711 free_charset (mbcset);
3712#endif /* RE_ENABLE_I18N */
3713 *err = ret;
3714 return NULL;
3715 }
3716 /* \w match '_' also. */
3717 for (; *extra; extra++)
3718 bitset_set (sbcset, *extra);
3719
3720 /* If it is non-matching list. */
3721 if (non_match)
3722 bitset_not (sbcset);
3723
3724#ifdef RE_ENABLE_I18N
3725 /* Ensure only single byte characters are set. */
3726 if (dfa->mb_cur_max > 1)
3727 bitset_mask (sbcset, dfa->sb_char);
3728#endif
3729
3730 /* Build a tree for simple bracket. */
3731 br_token.type = SIMPLE_BRACKET;
3732 br_token.opr.sbcset = sbcset;
3733 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3734 if (BE (tree == NULL, 0))
3735 goto build_word_op_espace;
3736
3737#ifdef RE_ENABLE_I18N
3738 if (dfa->mb_cur_max > 1)
3739 {
3740 bin_tree_t *mbc_tree;
3741 /* Build a tree for complex bracket. */
3742 br_token.type = COMPLEX_BRACKET;
3743 br_token.opr.mbcset = mbcset;
3744 dfa->has_mb_node = 1;
3745 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3746 if (BE (mbc_tree == NULL, 0))
3747 goto build_word_op_espace;
3748 /* Then join them by ALT node. */
3749 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3750 if (BE (mbc_tree != NULL, 1))
3751 return tree;
3752 }
3753 else
3754 {
3755 free_charset (mbcset);
3756 return tree;
3757 }
3758#else /* not RE_ENABLE_I18N */
3759 return tree;
3760#endif /* not RE_ENABLE_I18N */
3761
3762 build_word_op_espace:
3763 re_free (sbcset);
3764#ifdef RE_ENABLE_I18N
3765 free_charset (mbcset);
3766#endif /* RE_ENABLE_I18N */
3767 *err = REG_ESPACE;
3768 return NULL;
3769}
3770
3771/* This is intended for the expressions like "a{1,3}".
3772 Fetch a number from 'input', and return the number.
3773 Return REG_MISSING if the number field is empty like "{,1}".
3774 Return RE_DUP_MAX + 1 if the number field is too large.
3775 Return REG_ERROR if an error occurred. */
3776
3777static Idx
3778fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3779{
3780 Idx num = REG_MISSING;
3781 unsigned char c;
3782 while (1)
3783 {
3784 fetch_token (token, input, syntax);
3785 c = token->opr.c;
3786 if (BE (token->type == END_OF_RE, 0))
3787 return REG_ERROR;
3788 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3789 break;
3790 num = ((token->type != CHARACTER || c < '0' || '9' < c
3791 || num == REG_ERROR)
3792 ? REG_ERROR
3793 : num == REG_MISSING
3794 ? c - '0'
3795 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3796 }
3797 return num;
3798}
3799
3800
3801#ifdef RE_ENABLE_I18N
3802static void
3803free_charset (re_charset_t *cset)
3804{
3805 re_free (cset->mbchars);
3806# ifdef _LIBC
3807 re_free (cset->coll_syms);
3808 re_free (cset->equiv_classes);
3809 re_free (cset->range_starts);
3810 re_free (cset->range_ends);
3811# endif
3812 re_free (cset->char_classes);
3813 re_free (cset);
3814}
3815#endif /* RE_ENABLE_I18N */
3816
3817
3818/* Functions for binary tree operation. */
3819
3820/* Create a tree node. */
3821
3822static bin_tree_t *
3823create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3824 re_token_type_t type)
3825{
3826 re_token_t t;
3827 t.type = type;
3828 return create_token_tree (dfa, left, right, &t);
3829}
3830
3831static bin_tree_t *
3832create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3833 const re_token_t *token)
3834{
3835 bin_tree_t *tree;
3836 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3837 {
3838 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3839
3840 if (storage == NULL)
3841 return NULL;
3842 storage->next = dfa->str_tree_storage;
3843 dfa->str_tree_storage = storage;
3844 dfa->str_tree_storage_idx = 0;
3845 }
3846 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3847
3848 tree->parent = NULL;
3849 tree->left = left;
3850 tree->right = right;
3851 tree->token = *token;
3852 tree->token.duplicated = 0;
3853 tree->token.opt_subexp = 0;
3854 tree->first = NULL;
3855 tree->next = NULL;
3856 tree->node_idx = REG_MISSING;
3857
3858 if (left != NULL)
3859 left->parent = tree;
3860 if (right != NULL)
3861 right->parent = tree;
3862 return tree;
3863}
3864
3865/* Mark the tree SRC as an optional subexpression.
3866 To be called from preorder or postorder. */
3867
3868static reg_errcode_t
3869mark_opt_subexp (void *extra, bin_tree_t *node)
3870{
3871 Idx idx = (Idx) (long) extra;
3872 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3873 node->token.opt_subexp = 1;
3874
3875 return REG_NOERROR;
3876}
3877
3878/* Free the allocated memory inside NODE. */
3879
3880static void
3881free_token (re_token_t *node)
3882{
3883#ifdef RE_ENABLE_I18N
3884 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3885 free_charset (node->opr.mbcset);
3886 else
3887#endif /* RE_ENABLE_I18N */
3888 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3889 re_free (node->opr.sbcset);
3890}
3891
3892/* Worker function for tree walking. Free the allocated memory inside NODE
3893 and its children. */
3894
3895static reg_errcode_t
3896free_tree (void *extra _UNUSED_PARAMETER_, bin_tree_t *node)
3897{
3898 free_token (&node->token);
3899 return REG_NOERROR;
3900}
3901
3902
3903/* Duplicate the node SRC, and return new node. This is a preorder
3904 visit similar to the one implemented by the generic visitor, but
3905 we need more infrastructure to maintain two parallel trees --- so,
3906 it's easier to duplicate. */
3907
3908static bin_tree_t *
3909duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3910{
3911 const bin_tree_t *node;
3912 bin_tree_t *dup_root;
3913 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3914
3915 for (node = root; ; )
3916 {
3917 /* Create a new tree and link it back to the current parent. */
3918 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3919 if (*p_new == NULL)
3920 return NULL;
3921 (*p_new)->parent = dup_node;
3922 (*p_new)->token.duplicated = 1;
3923 dup_node = *p_new;
3924
3925 /* Go to the left node, or up and to the right. */
3926 if (node->left)
3927 {
3928 node = node->left;
3929 p_new = &dup_node->left;
3930 }
3931 else
3932 {
3933 const bin_tree_t *prev = NULL;
3934 while (node->right == prev || node->right == NULL)
3935 {
3936 prev = node;
3937 node = node->parent;
3938 dup_node = dup_node->parent;
3939 if (!node)
3940 return dup_root;
3941 }
3942 node = node->right;
3943 p_new = &dup_node->right;
3944 }
3945 }
3946}
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette