VirtualBox

source: kBuild/trunk/src/sed/lib/regcomp.c@ 3550

Last change on this file since 3550 was 3550, checked in by bird, 3 years ago

sed: Use get_crt_codepage() instead of fake nl_langinfo.

File size: 107.7 KB
Line 
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <[email protected]>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 size_t length, reg_syntax_t syntax);
23static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
25 char *fastmap);
26static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
27#ifdef RE_ENABLE_I18N
28static void free_charset (re_charset_t *cset);
29#endif /* RE_ENABLE_I18N */
30static void free_workarea_compile (regex_t *preg);
31static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32#ifdef RE_ENABLE_I18N
33static void optimize_utf8 (re_dfa_t *dfa);
34#endif
35static reg_errcode_t analyze (regex_t *preg);
36static reg_errcode_t preorder (bin_tree_t *root,
37 reg_errcode_t (fn (void *, bin_tree_t *)),
38 void *extra);
39static reg_errcode_t postorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
43static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
44static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 bin_tree_t *node);
46static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
47static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
48static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
49static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
50static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
51 unsigned int constraint);
52static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
53static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 int node, int root);
55static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
56static int fetch_number (re_string_t *input, re_token_t *token,
57 reg_syntax_t syntax);
58static int peek_token (re_token_t *token, re_string_t *input,
59 reg_syntax_t syntax) internal_function;
60static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
61 reg_syntax_t syntax, reg_errcode_t *err);
62static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
63 re_token_t *token, reg_syntax_t syntax,
64 int nest, reg_errcode_t *err);
65static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 int nest, reg_errcode_t *err);
68static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 int nest, reg_errcode_t *err);
71static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 int nest, reg_errcode_t *err);
74static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
75 re_dfa_t *dfa, re_token_t *token,
76 reg_syntax_t syntax, reg_errcode_t *err);
77static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
78 re_token_t *token, reg_syntax_t syntax,
79 reg_errcode_t *err);
80static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_string_t *regexp,
82 re_token_t *token, int token_len,
83 re_dfa_t *dfa,
84 reg_syntax_t syntax,
85 int accept_hyphen);
86static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
87 re_string_t *regexp,
88 re_token_t *token);
89#ifdef RE_ENABLE_I18N
90static reg_errcode_t build_equiv_class (bitset_t sbcset,
91 re_charset_t *mbcset,
92 int *equiv_class_alloc,
93 const unsigned char *name);
94static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
95 bitset_t sbcset,
96 re_charset_t *mbcset,
97 int *char_class_alloc,
98 const unsigned char *class_name,
99 reg_syntax_t syntax);
100#else /* not RE_ENABLE_I18N */
101static reg_errcode_t build_equiv_class (bitset_t sbcset,
102 const unsigned char *name);
103static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
104 bitset_t sbcset,
105 const unsigned char *class_name,
106 reg_syntax_t syntax);
107#endif /* not RE_ENABLE_I18N */
108static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
109 RE_TRANSLATE_TYPE trans,
110 const unsigned char *class_name,
111 const unsigned char *extra,
112 int non_match, reg_errcode_t *err);
113static bin_tree_t *create_tree (re_dfa_t *dfa,
114 bin_tree_t *left, bin_tree_t *right,
115 re_token_type_t type);
116static bin_tree_t *create_token_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 const re_token_t *token);
119static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
120static void free_token (re_token_t *node);
121static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
122static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123
124
125/* This table gives an error message for each of the error codes listed
126 in regex.h. Obviously the order here has to be same as there.
127 POSIX doesn't require that we do anything for REG_NOERROR,
128 but why not be nice? */
129
130const char __re_error_msgid[] attribute_hidden =
131 {
132#define REG_NOERROR_IDX 0
133 gettext_noop ("Success") /* REG_NOERROR */
134 "\0"
135#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
136 gettext_noop ("No match") /* REG_NOMATCH */
137 "\0"
138#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
139 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
140 "\0"
141#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
142 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
143 "\0"
144#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
145 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
146 "\0"
147#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
148 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
149 "\0"
150#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
151 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
152 "\0"
153#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
154 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
155 "\0"
156#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
157 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
158 "\0"
159#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
160 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
161 "\0"
162#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
163 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
164 "\0"
165#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
166 gettext_noop ("Invalid range end") /* REG_ERANGE */
167 "\0"
168#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
169 gettext_noop ("Memory exhausted") /* REG_ESPACE */
170 "\0"
171#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
172 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
173 "\0"
174#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
175 gettext_noop ("Premature end of regular expression") /* REG_EEND */
176 "\0"
177#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
178 gettext_noop ("Regular expression too big") /* REG_ESIZE */
179 "\0"
180#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
181 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
182 };
183
184const size_t __re_error_msgid_idx[] attribute_hidden =
185 {
186 REG_NOERROR_IDX,
187 REG_NOMATCH_IDX,
188 REG_BADPAT_IDX,
189 REG_ECOLLATE_IDX,
190 REG_ECTYPE_IDX,
191 REG_EESCAPE_IDX,
192 REG_ESUBREG_IDX,
193 REG_EBRACK_IDX,
194 REG_EPAREN_IDX,
195 REG_EBRACE_IDX,
196 REG_BADBR_IDX,
197 REG_ERANGE_IDX,
198 REG_ESPACE_IDX,
199 REG_BADRPT_IDX,
200 REG_EEND_IDX,
201 REG_ESIZE_IDX,
202 REG_ERPAREN_IDX
203 };
204
205
206/* Entry points for GNU code. */
207
208/* re_compile_pattern is the GNU regular expression compiler: it
209 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
210 Returns 0 if the pattern was valid, otherwise an error string.
211
212 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
213 are set in BUFP on entry. */
214
215const char *
216re_compile_pattern (pattern, length, bufp)
217 const char *pattern;
218 size_t length;
219 struct re_pattern_buffer *bufp;
220{
221 reg_errcode_t ret;
222
223 /* And GNU code determines whether or not to get register information
224 by passing null for the REGS argument to re_match, etc., not by
225 setting no_sub, unless RE_NO_SUB is set. */
226 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
227
228 /* Match anchors at newline. */
229 bufp->newline_anchor = 1;
230
231 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
232
233 if (!ret)
234 return NULL;
235 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
236}
237#ifdef _LIBC
238weak_alias (__re_compile_pattern, re_compile_pattern)
239#endif
240
241/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
242 also be assigned to arbitrarily: each pattern buffer stores its own
243 syntax, so it can be changed between regex compilations. */
244/* This has no initializer because initialized variables in Emacs
245 become read-only after dumping. */
246reg_syntax_t re_syntax_options;
247
248
249/* Specify the precise syntax of regexps for compilation. This provides
250 for compatibility for various utilities which historically have
251 different, incompatible syntaxes.
252
253 The argument SYNTAX is a bit mask comprised of the various bits
254 defined in regex.h. We return the old syntax. */
255
256reg_syntax_t
257re_set_syntax (syntax)
258 reg_syntax_t syntax;
259{
260 reg_syntax_t ret = re_syntax_options;
261
262 re_syntax_options = syntax;
263 return ret;
264}
265#ifdef _LIBC
266weak_alias (__re_set_syntax, re_set_syntax)
267#endif
268
269int
270re_compile_fastmap (bufp)
271 struct re_pattern_buffer *bufp;
272{
273 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
274 char *fastmap = bufp->fastmap;
275
276 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
277 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
278 if (dfa->init_state != dfa->init_state_word)
279 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
280 if (dfa->init_state != dfa->init_state_nl)
281 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
282 if (dfa->init_state != dfa->init_state_begbuf)
283 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
284 bufp->fastmap_accurate = 1;
285 return 0;
286}
287#ifdef _LIBC
288weak_alias (__re_compile_fastmap, re_compile_fastmap)
289#endif
290
291static inline void
292__attribute ((always_inline))
293re_set_fastmap (char *fastmap, int icase, int ch)
294{
295 fastmap[ch] = 1;
296 if (icase)
297 fastmap[tolower (ch)] = 1;
298}
299
300/* Helper function for re_compile_fastmap.
301 Compile fastmap for the initial_state INIT_STATE. */
302
303static void
304re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
305 char *fastmap)
306{
307 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
308 int node_cnt;
309 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
310 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
311 {
312 int node = init_state->nodes.elems[node_cnt];
313 re_token_type_t type = dfa->nodes[node].type;
314
315 if (type == CHARACTER)
316 {
317 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
318#ifdef RE_ENABLE_I18N
319 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
320 {
321 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
322 wchar_t wc;
323 mbstate_t state;
324
325 p = buf;
326 *p++ = dfa->nodes[node].opr.c;
327 while (++node < dfa->nodes_len
328 && dfa->nodes[node].type == CHARACTER
329 && dfa->nodes[node].mb_partial)
330 *p++ = dfa->nodes[node].opr.c;
331 memset (&state, '\0', sizeof (state));
332 if (mbrtowc (&wc, (const char *) buf, p - buf,
333 &state) == p - buf
334 && (__wcrtomb ((char *) buf, towlower (wc), &state)
335 != (size_t) -1))
336 re_set_fastmap (fastmap, 0, buf[0]);
337 }
338#endif
339 }
340 else if (type == SIMPLE_BRACKET)
341 {
342 int i, ch;
343 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
344 {
345 int j;
346 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
347 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
348 if (w & ((bitset_word_t) 1 << j))
349 re_set_fastmap (fastmap, icase, ch);
350 }
351 }
352#ifdef RE_ENABLE_I18N
353 else if (type == COMPLEX_BRACKET)
354 {
355 int i;
356 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
357 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
358 || cset->nranges || cset->nchar_classes)
359 {
360# ifdef _LIBC
361 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
362 {
363 /* In this case we want to catch the bytes which are
364 the first byte of any collation elements.
365 e.g. In da_DK, we want to catch 'a' since "aa"
366 is a valid collation element, and don't catch
367 'b' since 'b' is the only collation element
368 which starts from 'b'. */
369 const int32_t *table = (const int32_t *)
370 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
371 for (i = 0; i < SBC_MAX; ++i)
372 if (table[i] < 0)
373 re_set_fastmap (fastmap, icase, i);
374 }
375# else
376 if (dfa->mb_cur_max > 1)
377 for (i = 0; i < SBC_MAX; ++i)
378 if (__btowc (i) == WEOF)
379 re_set_fastmap (fastmap, icase, i);
380# endif /* not _LIBC */
381 }
382 for (i = 0; i < cset->nmbchars; ++i)
383 {
384 char buf[256];
385 mbstate_t state;
386 memset (&state, '\0', sizeof (state));
387 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
388 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
389 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
390 {
391 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
392 != (size_t) -1)
393 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
394 }
395 }
396 }
397#endif /* RE_ENABLE_I18N */
398 else if (type == OP_PERIOD
399#ifdef RE_ENABLE_I18N
400 || type == OP_UTF8_PERIOD
401#endif /* RE_ENABLE_I18N */
402 || type == END_OF_RE)
403 {
404 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
405 if (type == END_OF_RE)
406 bufp->can_be_null = 1;
407 return;
408 }
409 }
410}
411
412
413/* Entry point for POSIX code. */
414/* regcomp takes a regular expression as a string and compiles it.
415
416 PREG is a regex_t *. We do not expect any fields to be initialized,
417 since POSIX says we shouldn't. Thus, we set
418
419 `buffer' to the compiled pattern;
420 `used' to the length of the compiled pattern;
421 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
422 REG_EXTENDED bit in CFLAGS is set; otherwise, to
423 RE_SYNTAX_POSIX_BASIC;
424 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
425 `fastmap' to an allocated space for the fastmap;
426 `fastmap_accurate' to zero;
427 `re_nsub' to the number of subexpressions in PATTERN.
428
429 PATTERN is the address of the pattern string.
430
431 CFLAGS is a series of bits which affect compilation.
432
433 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
434 use POSIX basic syntax.
435
436 If REG_NEWLINE is set, then . and [^...] don't match newline.
437 Also, regexec will try a match beginning after every newline.
438
439 If REG_ICASE is set, then we considers upper- and lowercase
440 versions of letters to be equivalent when matching.
441
442 If REG_NOSUB is set, then when PREG is passed to regexec, that
443 routine will report only success or failure, and nothing about the
444 registers.
445
446 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
447 the return codes and their meanings.) */
448
449int
450regcomp (preg, pattern, cflags)
451 regex_t *__restrict preg;
452 const char *__restrict pattern;
453 int cflags;
454{
455 reg_errcode_t ret;
456 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
457 : RE_SYNTAX_POSIX_BASIC);
458
459 preg->buffer = NULL;
460 preg->allocated = 0;
461 preg->used = 0;
462
463 /* Try to allocate space for the fastmap. */
464 preg->fastmap = re_malloc (char, SBC_MAX);
465 if (BE (preg->fastmap == NULL, 0))
466 return REG_ESPACE;
467
468 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
469
470 /* If REG_NEWLINE is set, newlines are treated differently. */
471 if (cflags & REG_NEWLINE)
472 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
473 syntax &= ~RE_DOT_NEWLINE;
474 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
475 /* It also changes the matching behavior. */
476 preg->newline_anchor = 1;
477 }
478 else
479 preg->newline_anchor = 0;
480 preg->no_sub = !!(cflags & REG_NOSUB);
481 preg->translate = NULL;
482
483 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
484
485 /* POSIX doesn't distinguish between an unmatched open-group and an
486 unmatched close-group: both are REG_EPAREN. */
487 if (ret == REG_ERPAREN)
488 ret = REG_EPAREN;
489
490 /* We have already checked preg->fastmap != NULL. */
491 if (BE (ret == REG_NOERROR, 1))
492 /* Compute the fastmap now, since regexec cannot modify the pattern
493 buffer. This function never fails in this implementation. */
494 (void) re_compile_fastmap (preg);
495 else
496 {
497 /* Some error occurred while compiling the expression. */
498 re_free (preg->fastmap);
499 preg->fastmap = NULL;
500 }
501
502 return (int) ret;
503}
504#ifdef _LIBC
505weak_alias (__regcomp, regcomp)
506#endif
507
508/* Returns a message corresponding to an error code, ERRCODE, returned
509 from either regcomp or regexec. We don't use PREG here. */
510
511size_t
512regerror (errcode, preg, errbuf, errbuf_size)
513 int errcode;
514 const regex_t *__restrict preg;
515 char *__restrict errbuf;
516 size_t errbuf_size;
517{
518 const char *msg;
519 size_t msg_size;
520
521 if (BE (errcode < 0
522 || errcode >= (int) (sizeof (__re_error_msgid_idx)
523 / sizeof (__re_error_msgid_idx[0])), 0))
524 /* Only error codes returned by the rest of the code should be passed
525 to this routine. If we are given anything else, or if other regex
526 code generates an invalid error code, then the program has a bug.
527 Dump core so we can fix it. */
528 abort ();
529
530 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
531
532 msg_size = strlen (msg) + 1; /* Includes the null. */
533
534 if (BE (errbuf_size != 0, 1))
535 {
536 if (BE (msg_size > errbuf_size, 0))
537 {
538#if defined HAVE_MEMPCPY || defined _LIBC
539 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
540#else
541 memcpy (errbuf, msg, errbuf_size - 1);
542 errbuf[errbuf_size - 1] = 0;
543#endif
544 }
545 else
546 memcpy (errbuf, msg, msg_size);
547 }
548
549 return msg_size;
550}
551#ifdef _LIBC
552weak_alias (__regerror, regerror)
553#endif
554
555
556#ifdef RE_ENABLE_I18N
557/* This static array is used for the map to single-byte characters when
558 UTF-8 is used. Otherwise we would allocate memory just to initialize
559 it the same all the time. UTF-8 is the preferred encoding so this is
560 a worthwhile optimization. */
561static const bitset_t utf8_sb_map =
562{
563 /* Set the first 128 bits. */
564#ifdef _MSC_VER
565 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX, 0, 0, 0, 0
566#else
567 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
568#endif
569};
570#endif
571
572
573static void
574free_dfa_content (re_dfa_t *dfa)
575{
576 int i, j;
577
578 if (dfa->nodes)
579 for (i = 0; i < dfa->nodes_len; ++i)
580 free_token (dfa->nodes + i);
581 re_free (dfa->nexts);
582 for (i = 0; i < dfa->nodes_len; ++i)
583 {
584 if (dfa->eclosures != NULL)
585 re_node_set_free (dfa->eclosures + i);
586 if (dfa->inveclosures != NULL)
587 re_node_set_free (dfa->inveclosures + i);
588 if (dfa->edests != NULL)
589 re_node_set_free (dfa->edests + i);
590 }
591 re_free (dfa->edests);
592 re_free (dfa->eclosures);
593 re_free (dfa->inveclosures);
594 re_free (dfa->nodes);
595
596 if (dfa->state_table)
597 for (i = 0; i <= dfa->state_hash_mask; ++i)
598 {
599 struct re_state_table_entry *entry = dfa->state_table + i;
600 for (j = 0; j < entry->num; ++j)
601 {
602 re_dfastate_t *state = entry->array[j];
603 free_state (state);
604 }
605 re_free (entry->array);
606 }
607 re_free (dfa->state_table);
608#ifdef RE_ENABLE_I18N
609 if (dfa->sb_char != utf8_sb_map)
610 re_free (dfa->sb_char);
611#endif
612 re_free (dfa->subexp_map);
613#ifdef DEBUG
614 re_free (dfa->re_str);
615#endif
616
617 re_free (dfa);
618}
619
620
621/* Free dynamically allocated space used by PREG. */
622
623void
624regfree (preg)
625 regex_t *preg;
626{
627 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
628 if (BE (dfa != NULL, 1))
629 free_dfa_content (dfa);
630 preg->buffer = NULL;
631 preg->allocated = 0;
632
633 re_free (preg->fastmap);
634 preg->fastmap = NULL;
635
636 re_free (preg->translate);
637 preg->translate = NULL;
638}
639#ifdef _LIBC
640weak_alias (__regfree, regfree)
641#endif
642
643
644/* Entry points compatible with 4.2 BSD regex library. We don't define
645 them unless specifically requested. */
646
647#if defined _REGEX_RE_COMP || defined _LIBC
648
649/* BSD has one and only one pattern buffer. */
650static struct re_pattern_buffer re_comp_buf;
651
652char *
653# ifdef _LIBC
654/* Make these definitions weak in libc, so POSIX programs can redefine
655 these names if they don't use our functions, and still use
656 regcomp/regexec above without link errors. */
657weak_function
658# endif
659re_comp (s)
660 const char *s;
661{
662 reg_errcode_t ret;
663 char *fastmap;
664
665 if (!s)
666 {
667 if (!re_comp_buf.buffer)
668 return gettext ("No previous regular expression");
669 return 0;
670 }
671
672 if (re_comp_buf.buffer)
673 {
674 fastmap = re_comp_buf.fastmap;
675 re_comp_buf.fastmap = NULL;
676 __regfree (&re_comp_buf);
677 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
678 re_comp_buf.fastmap = fastmap;
679 }
680
681 if (re_comp_buf.fastmap == NULL)
682 {
683 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
684 if (re_comp_buf.fastmap == NULL)
685 return (char *) gettext (__re_error_msgid
686 + __re_error_msgid_idx[(int) REG_ESPACE]);
687 }
688
689 /* Since `re_exec' always passes NULL for the `regs' argument, we
690 don't need to initialize the pattern buffer fields which affect it. */
691
692 /* Match anchors at newlines. */
693 re_comp_buf.newline_anchor = 1;
694
695 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
696
697 if (!ret)
698 return NULL;
699
700 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
701 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
702}
703
704#ifdef _LIBC
705libc_freeres_fn (free_mem)
706{
707 __regfree (&re_comp_buf);
708}
709#endif
710
711#endif /* _REGEX_RE_COMP */
712
713
714/* Internal entry point.
715 Compile the regular expression PATTERN, whose length is LENGTH.
716 SYNTAX indicate regular expression's syntax. */
717
718static reg_errcode_t
719re_compile_internal (regex_t *preg, const char * pattern, size_t length,
720 reg_syntax_t syntax)
721{
722 reg_errcode_t err = REG_NOERROR;
723 re_dfa_t *dfa;
724 re_string_t regexp;
725
726 /* Initialize the pattern buffer. */
727 preg->fastmap_accurate = 0;
728 preg->syntax = syntax;
729 preg->not_bol = preg->not_eol = 0;
730 preg->used = 0;
731 preg->re_nsub = 0;
732 preg->can_be_null = 0;
733 preg->regs_allocated = REGS_UNALLOCATED;
734
735 /* Initialize the dfa. */
736 dfa = (re_dfa_t *) preg->buffer;
737 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
738 {
739 /* If zero allocated, but buffer is non-null, try to realloc
740 enough space. This loses if buffer's address is bogus, but
741 that is the user's responsibility. If ->buffer is NULL this
742 is a simple allocation. */
743 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
744 if (dfa == NULL)
745 return REG_ESPACE;
746 preg->allocated = sizeof (re_dfa_t);
747 preg->buffer = (unsigned char *) dfa;
748 }
749 preg->used = sizeof (re_dfa_t);
750
751 err = init_dfa (dfa, length);
752 if (BE (err != REG_NOERROR, 0))
753 {
754 free_dfa_content (dfa);
755 preg->buffer = NULL;
756 preg->allocated = 0;
757 return err;
758 }
759#ifdef DEBUG
760 /* Note: length+1 will not overflow since it is checked in init_dfa. */
761 dfa->re_str = re_malloc (char, length + 1);
762 strncpy (dfa->re_str, pattern, length + 1);
763#endif
764
765 __libc_lock_init (dfa->lock);
766
767 err = re_string_construct (&regexp, pattern, length, preg->translate,
768 syntax & RE_ICASE, dfa);
769 if (BE (err != REG_NOERROR, 0))
770 {
771 re_compile_internal_free_return:
772 free_workarea_compile (preg);
773 re_string_destruct (&regexp);
774 free_dfa_content (dfa);
775 preg->buffer = NULL;
776 preg->allocated = 0;
777 return err;
778 }
779
780 /* Parse the regular expression, and build a structure tree. */
781 preg->re_nsub = 0;
782 dfa->str_tree = parse (&regexp, preg, syntax, &err);
783 if (BE (dfa->str_tree == NULL, 0))
784 goto re_compile_internal_free_return;
785
786 /* Analyze the tree and create the nfa. */
787 err = analyze (preg);
788 if (BE (err != REG_NOERROR, 0))
789 goto re_compile_internal_free_return;
790
791#ifdef RE_ENABLE_I18N
792 /* If possible, do searching in single byte encoding to speed things up. */
793 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
794 optimize_utf8 (dfa);
795#endif
796
797 /* Then create the initial state of the dfa. */
798 err = create_initial_state (dfa);
799
800 /* Release work areas. */
801 free_workarea_compile (preg);
802 re_string_destruct (&regexp);
803
804 if (BE (err != REG_NOERROR, 0))
805 {
806 free_dfa_content (dfa);
807 preg->buffer = NULL;
808 preg->allocated = 0;
809 }
810
811 return err;
812}
813
814/* Initialize DFA. We use the length of the regular expression PAT_LEN
815 as the initial length of some arrays. */
816
817static reg_errcode_t
818init_dfa (re_dfa_t *dfa, size_t pat_len)
819{
820 unsigned int table_size;
821#ifndef _LIBC
822 char *codeset_name;
823#endif
824
825 memset (dfa, '\0', sizeof (re_dfa_t));
826
827 /* Force allocation of str_tree_storage the first time. */
828 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
829
830 /* Avoid overflows. */
831 if (pat_len == SIZE_MAX)
832 return REG_ESPACE;
833
834 dfa->nodes_alloc = pat_len + 1;
835 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
836
837 /* table_size = 2 ^ ceil(log pat_len) */
838 for (table_size = 1; ; table_size <<= 1)
839 if (table_size > pat_len)
840 break;
841
842 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
843 dfa->state_hash_mask = table_size - 1;
844
845 dfa->mb_cur_max = MB_CUR_MAX;
846#ifdef _LIBC
847 if (dfa->mb_cur_max == 6
848 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
849 dfa->is_utf8 = 1;
850 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
851 != 0);
852#else
853# ifdef KBUILD_OS_WINDOWS
854 (void)codeset_name;
855 if (get_crt_codepage() == MY_CP_UTF8)
856# else
857# ifdef HAVE_LANGINFO_CODESET
858 codeset_name = nl_langinfo (CODESET);
859# else
860 codeset_name = getenv ("LC_ALL");
861 if (codeset_name == NULL || codeset_name[0] == '\0')
862 codeset_name = getenv ("LC_CTYPE");
863 if (codeset_name == NULL || codeset_name[0] == '\0')
864 codeset_name = getenv ("LANG");
865 if (codeset_name == NULL)
866 codeset_name = "";
867 else if (strchr (codeset_name, '.') != NULL)
868 codeset_name = strchr (codeset_name, '.') + 1;
869# endif
870
871 if (strcasecmp (codeset_name, "UTF-8") == 0
872 || strcasecmp (codeset_name, "UTF8") == 0)
873# endif
874 dfa->is_utf8 = 1;
875
876 /* We check exhaustively in the loop below if this charset is a
877 superset of ASCII. */
878 dfa->map_notascii = 0;
879#endif
880
881#ifdef RE_ENABLE_I18N
882 if (dfa->mb_cur_max > 1)
883 {
884 if (dfa->is_utf8)
885 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
886 else
887 {
888 int i, j, ch;
889
890 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
891 if (BE (dfa->sb_char == NULL, 0))
892 return REG_ESPACE;
893
894 /* Set the bits corresponding to single byte chars. */
895 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
896 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
897 {
898 wint_t wch = __btowc (ch);
899 if (wch != WEOF)
900 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
901# ifndef _LIBC
902 if (isascii (ch) && wch != ch)
903 dfa->map_notascii = 1;
904# endif
905 }
906 }
907 }
908#endif
909
910 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
911 return REG_ESPACE;
912 return REG_NOERROR;
913}
914
915/* Initialize WORD_CHAR table, which indicate which character is
916 "word". In this case "word" means that it is the word construction
917 character used by some operators like "\<", "\>", etc. */
918
919static void
920internal_function
921init_word_char (re_dfa_t *dfa)
922{
923 int i, j, ch;
924 dfa->word_ops_used = 1;
925 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
926 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
927 if (isalnum (ch) || ch == '_')
928 dfa->word_char[i] |= (bitset_word_t) 1 << j;
929}
930
931/* Free the work area which are only used while compiling. */
932
933static void
934free_workarea_compile (regex_t *preg)
935{
936 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
937 bin_tree_storage_t *storage, *next;
938 for (storage = dfa->str_tree_storage; storage; storage = next)
939 {
940 next = storage->next;
941 re_free (storage);
942 }
943 dfa->str_tree_storage = NULL;
944 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
945 dfa->str_tree = NULL;
946 re_free (dfa->org_indices);
947 dfa->org_indices = NULL;
948}
949
950/* Create initial states for all contexts. */
951
952static reg_errcode_t
953create_initial_state (re_dfa_t *dfa)
954{
955 int first, i;
956 reg_errcode_t err;
957 re_node_set init_nodes;
958
959 /* Initial states have the epsilon closure of the node which is
960 the first node of the regular expression. */
961 first = dfa->str_tree->first->node_idx;
962 dfa->init_node = first;
963 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
964 if (BE (err != REG_NOERROR, 0))
965 return err;
966
967 /* The back-references which are in initial states can epsilon transit,
968 since in this case all of the subexpressions can be null.
969 Then we add epsilon closures of the nodes which are the next nodes of
970 the back-references. */
971 if (dfa->nbackref > 0)
972 for (i = 0; i < init_nodes.nelem; ++i)
973 {
974 int node_idx = init_nodes.elems[i];
975 re_token_type_t type = dfa->nodes[node_idx].type;
976
977 int clexp_idx;
978 if (type != OP_BACK_REF)
979 continue;
980 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
981 {
982 re_token_t *clexp_node;
983 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
984 if (clexp_node->type == OP_CLOSE_SUBEXP
985 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
986 break;
987 }
988 if (clexp_idx == init_nodes.nelem)
989 continue;
990
991 if (type == OP_BACK_REF)
992 {
993 int dest_idx = dfa->edests[node_idx].elems[0];
994 if (!re_node_set_contains (&init_nodes, dest_idx))
995 {
996 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
997 i = 0;
998 }
999 }
1000 }
1001
1002 /* It must be the first time to invoke acquire_state. */
1003 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1004 /* We don't check ERR here, since the initial state must not be NULL. */
1005 if (BE (dfa->init_state == NULL, 0))
1006 return err;
1007 if (dfa->init_state->has_constraint)
1008 {
1009 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1010 CONTEXT_WORD);
1011 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1012 CONTEXT_NEWLINE);
1013 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1014 &init_nodes,
1015 CONTEXT_NEWLINE
1016 | CONTEXT_BEGBUF);
1017 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1018 || dfa->init_state_begbuf == NULL, 0))
1019 return err;
1020 }
1021 else
1022 dfa->init_state_word = dfa->init_state_nl
1023 = dfa->init_state_begbuf = dfa->init_state;
1024
1025 re_node_set_free (&init_nodes);
1026 return REG_NOERROR;
1027}
1028
1029
1030#ifdef RE_ENABLE_I18N
1031/* If it is possible to do searching in single byte encoding instead of UTF-8
1032 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1033 DFA nodes where needed. */
1034
1035static void
1036optimize_utf8 (re_dfa_t *dfa)
1037{
1038 int node, i, mb_chars = 0, has_period = 0;
1039
1040 for (node = 0; node < dfa->nodes_len; ++node)
1041 switch (dfa->nodes[node].type)
1042 {
1043 case CHARACTER:
1044 if (dfa->nodes[node].opr.c >= 0x80)
1045 mb_chars = 1;
1046 break;
1047 case ANCHOR:
1048 switch (dfa->nodes[node].opr.idx)
1049 {
1050 case LINE_FIRST:
1051 case LINE_LAST:
1052 case BUF_FIRST:
1053 case BUF_LAST:
1054 break;
1055 default:
1056 /* Word anchors etc. cannot be handled. */
1057 return;
1058 }
1059 break;
1060 case OP_PERIOD:
1061 has_period = 1;
1062 break;
1063 case OP_BACK_REF:
1064 case OP_ALT:
1065 case END_OF_RE:
1066 case OP_DUP_ASTERISK:
1067 case OP_OPEN_SUBEXP:
1068 case OP_CLOSE_SUBEXP:
1069 break;
1070 case COMPLEX_BRACKET:
1071 return;
1072 case SIMPLE_BRACKET:
1073 /* Just double check. The non-ASCII range starts at 0x80. */
1074 assert (0x80 % BITSET_WORD_BITS == 0);
1075 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1076 if (dfa->nodes[node].opr.sbcset[i])
1077 return;
1078 break;
1079 default:
1080 abort ();
1081 }
1082
1083 if (mb_chars || has_period)
1084 for (node = 0; node < dfa->nodes_len; ++node)
1085 {
1086 if (dfa->nodes[node].type == CHARACTER
1087 && dfa->nodes[node].opr.c >= 0x80)
1088 dfa->nodes[node].mb_partial = 0;
1089 else if (dfa->nodes[node].type == OP_PERIOD)
1090 dfa->nodes[node].type = OP_UTF8_PERIOD;
1091 }
1092
1093 /* The search can be in single byte locale. */
1094 dfa->mb_cur_max = 1;
1095 dfa->is_utf8 = 0;
1096 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1097}
1098#endif
1099
1100
1101/* Analyze the structure tree, and calculate "first", "next", "edest",
1102 "eclosure", and "inveclosure". */
1103
1104static reg_errcode_t
1105analyze (regex_t *preg)
1106{
1107 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1108 reg_errcode_t ret;
1109
1110 /* Allocate arrays. */
1111 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1112 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1113 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1114 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1115 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1116 || dfa->eclosures == NULL, 0))
1117 return REG_ESPACE;
1118
1119 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1120 if (dfa->subexp_map != NULL)
1121 {
1122 int i;
1123 for (i = 0; i < preg->re_nsub; i++)
1124 dfa->subexp_map[i] = i;
1125 preorder (dfa->str_tree, optimize_subexps, dfa);
1126 for (i = 0; i < preg->re_nsub; i++)
1127 if (dfa->subexp_map[i] != i)
1128 break;
1129 if (i == preg->re_nsub)
1130 {
1131 free (dfa->subexp_map);
1132 dfa->subexp_map = NULL;
1133 }
1134 }
1135
1136 ret = postorder (dfa->str_tree, lower_subexps, preg);
1137 if (BE (ret != REG_NOERROR, 0))
1138 return ret;
1139 ret = postorder (dfa->str_tree, calc_first, dfa);
1140 if (BE (ret != REG_NOERROR, 0))
1141 return ret;
1142 preorder (dfa->str_tree, calc_next, dfa);
1143 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1144 if (BE (ret != REG_NOERROR, 0))
1145 return ret;
1146 ret = calc_eclosure (dfa);
1147 if (BE (ret != REG_NOERROR, 0))
1148 return ret;
1149
1150 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1151 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1152 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1153 || dfa->nbackref)
1154 {
1155 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1156 if (BE (dfa->inveclosures == NULL, 0))
1157 return REG_ESPACE;
1158 ret = calc_inveclosure (dfa);
1159 }
1160
1161 return ret;
1162}
1163
1164/* Our parse trees are very unbalanced, so we cannot use a stack to
1165 implement parse tree visits. Instead, we use parent pointers and
1166 some hairy code in these two functions. */
1167static reg_errcode_t
1168postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1169 void *extra)
1170{
1171 bin_tree_t *node, *prev;
1172
1173 for (node = root; ; )
1174 {
1175 /* Descend down the tree, preferably to the left (or to the right
1176 if that's the only child). */
1177 while (node->left || node->right)
1178 if (node->left)
1179 node = node->left;
1180 else
1181 node = node->right;
1182
1183 do
1184 {
1185 reg_errcode_t err = fn (extra, node);
1186 if (BE (err != REG_NOERROR, 0))
1187 return err;
1188 if (node->parent == NULL)
1189 return REG_NOERROR;
1190 prev = node;
1191 node = node->parent;
1192 }
1193 /* Go up while we have a node that is reached from the right. */
1194 while (node->right == prev || node->right == NULL);
1195 node = node->right;
1196 }
1197}
1198
1199static reg_errcode_t
1200preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1201 void *extra)
1202{
1203 bin_tree_t *node;
1204
1205 for (node = root; ; )
1206 {
1207 reg_errcode_t err = fn (extra, node);
1208 if (BE (err != REG_NOERROR, 0))
1209 return err;
1210
1211 /* Go to the left node, or up and to the right. */
1212 if (node->left)
1213 node = node->left;
1214 else
1215 {
1216 bin_tree_t *prev = NULL;
1217 while (node->right == prev || node->right == NULL)
1218 {
1219 prev = node;
1220 node = node->parent;
1221 if (!node)
1222 return REG_NOERROR;
1223 }
1224 node = node->right;
1225 }
1226 }
1227}
1228
1229/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1230 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1231 backreferences as well. Requires a preorder visit. */
1232static reg_errcode_t
1233optimize_subexps (void *extra, bin_tree_t *node)
1234{
1235 re_dfa_t *dfa = (re_dfa_t *) extra;
1236
1237 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1238 {
1239 int idx = node->token.opr.idx;
1240 node->token.opr.idx = dfa->subexp_map[idx];
1241 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1242 }
1243
1244 else if (node->token.type == SUBEXP
1245 && node->left && node->left->token.type == SUBEXP)
1246 {
1247 int other_idx = node->left->token.opr.idx;
1248
1249 node->left = node->left->left;
1250 if (node->left)
1251 node->left->parent = node;
1252
1253 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1254 if (other_idx < BITSET_WORD_BITS)
1255 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1256 }
1257
1258 return REG_NOERROR;
1259}
1260
1261/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1262 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1263static reg_errcode_t
1264lower_subexps (void *extra, bin_tree_t *node)
1265{
1266 regex_t *preg = (regex_t *) extra;
1267 reg_errcode_t err = REG_NOERROR;
1268
1269 if (node->left && node->left->token.type == SUBEXP)
1270 {
1271 node->left = lower_subexp (&err, preg, node->left);
1272 if (node->left)
1273 node->left->parent = node;
1274 }
1275 if (node->right && node->right->token.type == SUBEXP)
1276 {
1277 node->right = lower_subexp (&err, preg, node->right);
1278 if (node->right)
1279 node->right->parent = node;
1280 }
1281
1282 return err;
1283}
1284
1285static bin_tree_t *
1286lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1287{
1288 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1289 bin_tree_t *body = node->left;
1290 bin_tree_t *op, *cls, *tree1, *tree;
1291
1292 if (preg->no_sub
1293 /* We do not optimize empty subexpressions, because otherwise we may
1294 have bad CONCAT nodes with NULL children. This is obviously not
1295 very common, so we do not lose much. An example that triggers
1296 this case is the sed "script" /\(\)/x. */
1297 && node->left != NULL
1298 && (node->token.opr.idx >= BITSET_WORD_BITS
1299 || !(dfa->used_bkref_map
1300 & ((bitset_word_t) 1 << node->token.opr.idx))))
1301 return node->left;
1302
1303 /* Convert the SUBEXP node to the concatenation of an
1304 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1305 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1306 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1307 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1308 tree = create_tree (dfa, op, tree1, CONCAT);
1309 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1310 {
1311 *err = REG_ESPACE;
1312 return NULL;
1313 }
1314
1315 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1316 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1317 return tree;
1318}
1319
1320/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1321 nodes. Requires a postorder visit. */
1322static reg_errcode_t
1323calc_first (void *extra, bin_tree_t *node)
1324{
1325 re_dfa_t *dfa = (re_dfa_t *) extra;
1326 if (node->token.type == CONCAT)
1327 {
1328 node->first = node->left->first;
1329 node->node_idx = node->left->node_idx;
1330 }
1331 else
1332 {
1333 node->first = node;
1334 node->node_idx = re_dfa_add_node (dfa, node->token);
1335 if (BE (node->node_idx == -1, 0))
1336 return REG_ESPACE;
1337 }
1338 return REG_NOERROR;
1339}
1340
1341/* Pass 2: compute NEXT on the tree. Preorder visit. */
1342static reg_errcode_t
1343calc_next (void *extra, bin_tree_t *node)
1344{
1345 switch (node->token.type)
1346 {
1347 case OP_DUP_ASTERISK:
1348 node->left->next = node;
1349 break;
1350 case CONCAT:
1351 node->left->next = node->right->first;
1352 node->right->next = node->next;
1353 break;
1354 default:
1355 if (node->left)
1356 node->left->next = node->next;
1357 if (node->right)
1358 node->right->next = node->next;
1359 break;
1360 }
1361 return REG_NOERROR;
1362}
1363
1364/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1365static reg_errcode_t
1366link_nfa_nodes (void *extra, bin_tree_t *node)
1367{
1368 re_dfa_t *dfa = (re_dfa_t *) extra;
1369 int idx = node->node_idx;
1370 reg_errcode_t err = REG_NOERROR;
1371
1372 switch (node->token.type)
1373 {
1374 case CONCAT:
1375 break;
1376
1377 case END_OF_RE:
1378 assert (node->next == NULL);
1379 break;
1380
1381 case OP_DUP_ASTERISK:
1382 case OP_ALT:
1383 {
1384 int left, right;
1385 dfa->has_plural_match = 1;
1386 if (node->left != NULL)
1387 left = node->left->first->node_idx;
1388 else
1389 left = node->next->node_idx;
1390 if (node->right != NULL)
1391 right = node->right->first->node_idx;
1392 else
1393 right = node->next->node_idx;
1394 assert (left > -1);
1395 assert (right > -1);
1396 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1397 }
1398 break;
1399
1400 case ANCHOR:
1401 case OP_OPEN_SUBEXP:
1402 case OP_CLOSE_SUBEXP:
1403 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1404 break;
1405
1406 case OP_BACK_REF:
1407 dfa->nexts[idx] = node->next->node_idx;
1408 if (node->token.type == OP_BACK_REF)
1409 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1410 break;
1411
1412 default:
1413 assert (!IS_EPSILON_NODE (node->token.type));
1414 dfa->nexts[idx] = node->next->node_idx;
1415 break;
1416 }
1417
1418 return err;
1419}
1420
1421/* Duplicate the epsilon closure of the node ROOT_NODE.
1422 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1423 to their own constraint. */
1424
1425static reg_errcode_t
1426internal_function
1427duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1428 int root_node, unsigned int init_constraint)
1429{
1430 int org_node, clone_node, ret;
1431 unsigned int constraint = init_constraint;
1432 for (org_node = top_org_node, clone_node = top_clone_node;;)
1433 {
1434 int org_dest, clone_dest;
1435 if (dfa->nodes[org_node].type == OP_BACK_REF)
1436 {
1437 /* If the back reference epsilon-transit, its destination must
1438 also have the constraint. Then duplicate the epsilon closure
1439 of the destination of the back reference, and store it in
1440 edests of the back reference. */
1441 org_dest = dfa->nexts[org_node];
1442 re_node_set_empty (dfa->edests + clone_node);
1443 clone_dest = duplicate_node (dfa, org_dest, constraint);
1444 if (BE (clone_dest == -1, 0))
1445 return REG_ESPACE;
1446 dfa->nexts[clone_node] = dfa->nexts[org_node];
1447 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1448 if (BE (ret < 0, 0))
1449 return REG_ESPACE;
1450 }
1451 else if (dfa->edests[org_node].nelem == 0)
1452 {
1453 /* In case of the node can't epsilon-transit, don't duplicate the
1454 destination and store the original destination as the
1455 destination of the node. */
1456 dfa->nexts[clone_node] = dfa->nexts[org_node];
1457 break;
1458 }
1459 else if (dfa->edests[org_node].nelem == 1)
1460 {
1461 /* In case of the node can epsilon-transit, and it has only one
1462 destination. */
1463 org_dest = dfa->edests[org_node].elems[0];
1464 re_node_set_empty (dfa->edests + clone_node);
1465 if (dfa->nodes[org_node].type == ANCHOR)
1466 {
1467 /* In case of the node has another constraint, append it. */
1468 if (org_node == root_node && clone_node != org_node)
1469 {
1470 /* ...but if the node is root_node itself, it means the
1471 epsilon closure have a loop, then tie it to the
1472 destination of the root_node. */
1473 ret = re_node_set_insert (dfa->edests + clone_node,
1474 org_dest);
1475 if (BE (ret < 0, 0))
1476 return REG_ESPACE;
1477 break;
1478 }
1479 constraint |= dfa->nodes[org_node].opr.ctx_type;
1480 }
1481 clone_dest = duplicate_node (dfa, org_dest, constraint);
1482 if (BE (clone_dest == -1, 0))
1483 return REG_ESPACE;
1484 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1485 if (BE (ret < 0, 0))
1486 return REG_ESPACE;
1487 }
1488 else /* dfa->edests[org_node].nelem == 2 */
1489 {
1490 /* In case of the node can epsilon-transit, and it has two
1491 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1492 org_dest = dfa->edests[org_node].elems[0];
1493 re_node_set_empty (dfa->edests + clone_node);
1494 /* Search for a duplicated node which satisfies the constraint. */
1495 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1496 if (clone_dest == -1)
1497 {
1498 /* There are no such a duplicated node, create a new one. */
1499 reg_errcode_t err;
1500 clone_dest = duplicate_node (dfa, org_dest, constraint);
1501 if (BE (clone_dest == -1, 0))
1502 return REG_ESPACE;
1503 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1504 if (BE (ret < 0, 0))
1505 return REG_ESPACE;
1506 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1507 root_node, constraint);
1508 if (BE (err != REG_NOERROR, 0))
1509 return err;
1510 }
1511 else
1512 {
1513 /* There are a duplicated node which satisfy the constraint,
1514 use it to avoid infinite loop. */
1515 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1516 if (BE (ret < 0, 0))
1517 return REG_ESPACE;
1518 }
1519
1520 org_dest = dfa->edests[org_node].elems[1];
1521 clone_dest = duplicate_node (dfa, org_dest, constraint);
1522 if (BE (clone_dest == -1, 0))
1523 return REG_ESPACE;
1524 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1525 if (BE (ret < 0, 0))
1526 return REG_ESPACE;
1527 }
1528 org_node = org_dest;
1529 clone_node = clone_dest;
1530 }
1531 return REG_NOERROR;
1532}
1533
1534/* Search for a node which is duplicated from the node ORG_NODE, and
1535 satisfies the constraint CONSTRAINT. */
1536
1537static int
1538search_duplicated_node (const re_dfa_t *dfa, int org_node,
1539 unsigned int constraint)
1540{
1541 int idx;
1542 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1543 {
1544 if (org_node == dfa->org_indices[idx]
1545 && constraint == dfa->nodes[idx].constraint)
1546 return idx; /* Found. */
1547 }
1548 return -1; /* Not found. */
1549}
1550
1551/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1552 Return the index of the new node, or -1 if insufficient storage is
1553 available. */
1554
1555static int
1556duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1557{
1558 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1559 if (BE (dup_idx != -1, 1))
1560 {
1561 dfa->nodes[dup_idx].constraint = constraint;
1562 if (dfa->nodes[org_idx].type == ANCHOR)
1563 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1564 dfa->nodes[dup_idx].duplicated = 1;
1565
1566 /* Store the index of the original node. */
1567 dfa->org_indices[dup_idx] = org_idx;
1568 }
1569 return dup_idx;
1570}
1571
1572static reg_errcode_t
1573calc_inveclosure (re_dfa_t *dfa)
1574{
1575 int src, idx, ret;
1576 for (idx = 0; idx < dfa->nodes_len; ++idx)
1577 re_node_set_init_empty (dfa->inveclosures + idx);
1578
1579 for (src = 0; src < dfa->nodes_len; ++src)
1580 {
1581 int *elems = dfa->eclosures[src].elems;
1582 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1583 {
1584 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1585 if (BE (ret == -1, 0))
1586 return REG_ESPACE;
1587 }
1588 }
1589
1590 return REG_NOERROR;
1591}
1592
1593/* Calculate "eclosure" for all the node in DFA. */
1594
1595static reg_errcode_t
1596calc_eclosure (re_dfa_t *dfa)
1597{
1598 int node_idx, incomplete;
1599#ifdef DEBUG
1600 assert (dfa->nodes_len > 0);
1601#endif
1602 incomplete = 0;
1603 /* For each nodes, calculate epsilon closure. */
1604 for (node_idx = 0; ; ++node_idx)
1605 {
1606 reg_errcode_t err;
1607 re_node_set eclosure_elem;
1608 if (node_idx == dfa->nodes_len)
1609 {
1610 if (!incomplete)
1611 break;
1612 incomplete = 0;
1613 node_idx = 0;
1614 }
1615
1616#ifdef DEBUG
1617 assert (dfa->eclosures[node_idx].nelem != -1);
1618#endif
1619
1620 /* If we have already calculated, skip it. */
1621 if (dfa->eclosures[node_idx].nelem != 0)
1622 continue;
1623 /* Calculate epsilon closure of `node_idx'. */
1624 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1625 if (BE (err != REG_NOERROR, 0))
1626 return err;
1627
1628 if (dfa->eclosures[node_idx].nelem == 0)
1629 {
1630 incomplete = 1;
1631 re_node_set_free (&eclosure_elem);
1632 }
1633 }
1634 return REG_NOERROR;
1635}
1636
1637/* Calculate epsilon closure of NODE. */
1638
1639static reg_errcode_t
1640calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1641{
1642 reg_errcode_t err;
1643 unsigned int constraint;
1644 int i, incomplete;
1645 re_node_set eclosure;
1646 incomplete = 0;
1647 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1648 if (BE (err != REG_NOERROR, 0))
1649 return err;
1650
1651 /* This indicates that we are calculating this node now.
1652 We reference this value to avoid infinite loop. */
1653 dfa->eclosures[node].nelem = -1;
1654
1655 constraint = ((dfa->nodes[node].type == ANCHOR)
1656 ? dfa->nodes[node].opr.ctx_type : 0);
1657 /* If the current node has constraints, duplicate all nodes.
1658 Since they must inherit the constraints. */
1659 if (constraint
1660 && dfa->edests[node].nelem
1661 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1662 {
1663 int org_node, cur_node;
1664 org_node = cur_node = node;
1665 err = duplicate_node_closure (dfa, node, node, node, constraint);
1666 if (BE (err != REG_NOERROR, 0))
1667 return err;
1668 }
1669
1670 /* Expand each epsilon destination nodes. */
1671 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1672 for (i = 0; i < dfa->edests[node].nelem; ++i)
1673 {
1674 re_node_set eclosure_elem;
1675 int edest = dfa->edests[node].elems[i];
1676 /* If calculating the epsilon closure of `edest' is in progress,
1677 return intermediate result. */
1678 if (dfa->eclosures[edest].nelem == -1)
1679 {
1680 incomplete = 1;
1681 continue;
1682 }
1683 /* If we haven't calculated the epsilon closure of `edest' yet,
1684 calculate now. Otherwise use calculated epsilon closure. */
1685 if (dfa->eclosures[edest].nelem == 0)
1686 {
1687 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1688 if (BE (err != REG_NOERROR, 0))
1689 return err;
1690 }
1691 else
1692 eclosure_elem = dfa->eclosures[edest];
1693 /* Merge the epsilon closure of `edest'. */
1694 re_node_set_merge (&eclosure, &eclosure_elem);
1695 /* If the epsilon closure of `edest' is incomplete,
1696 the epsilon closure of this node is also incomplete. */
1697 if (dfa->eclosures[edest].nelem == 0)
1698 {
1699 incomplete = 1;
1700 re_node_set_free (&eclosure_elem);
1701 }
1702 }
1703
1704 /* Epsilon closures include itself. */
1705 re_node_set_insert (&eclosure, node);
1706 if (incomplete && !root)
1707 dfa->eclosures[node].nelem = 0;
1708 else
1709 dfa->eclosures[node] = eclosure;
1710 *new_set = eclosure;
1711 return REG_NOERROR;
1712}
1713
1714
1715/* Functions for token which are used in the parser. */
1716
1717/* Fetch a token from INPUT.
1718 We must not use this function inside bracket expressions. */
1719
1720static void
1721internal_function
1722fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1723{
1724 re_string_skip_bytes (input, peek_token (result, input, syntax));
1725}
1726
1727/* Peek a token from INPUT, and return the length of the token.
1728 We must not use this function inside bracket expressions. */
1729
1730static int
1731internal_function
1732peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1733{
1734 unsigned char c;
1735
1736 if (re_string_eoi (input))
1737 {
1738 token->type = END_OF_RE;
1739 return 0;
1740 }
1741
1742 c = re_string_peek_byte (input, 0);
1743 token->opr.c = c;
1744
1745 token->word_char = 0;
1746#ifdef RE_ENABLE_I18N
1747 token->mb_partial = 0;
1748 if (input->mb_cur_max > 1 &&
1749 !re_string_first_byte (input, re_string_cur_idx (input)))
1750 {
1751 token->type = CHARACTER;
1752 token->mb_partial = 1;
1753 return 1;
1754 }
1755#endif
1756 if (c == '\\')
1757 {
1758 unsigned char c2;
1759 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1760 {
1761 token->type = BACK_SLASH;
1762 return 1;
1763 }
1764
1765 c2 = re_string_peek_byte_case (input, 1);
1766 token->opr.c = c2;
1767 token->type = CHARACTER;
1768#ifdef RE_ENABLE_I18N
1769 if (input->mb_cur_max > 1)
1770 {
1771 wint_t wc = re_string_wchar_at (input,
1772 re_string_cur_idx (input) + 1);
1773 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1774 }
1775 else
1776#endif
1777 token->word_char = IS_WORD_CHAR (c2) != 0;
1778
1779 switch (c2)
1780 {
1781 case '|':
1782 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1783 token->type = OP_ALT;
1784 break;
1785 case '1': case '2': case '3': case '4': case '5':
1786 case '6': case '7': case '8': case '9':
1787 if (!(syntax & RE_NO_BK_REFS))
1788 {
1789 token->type = OP_BACK_REF;
1790 token->opr.idx = c2 - '1';
1791 }
1792 break;
1793 case '<':
1794 if (!(syntax & RE_NO_GNU_OPS))
1795 {
1796 token->type = ANCHOR;
1797 token->opr.ctx_type = WORD_FIRST;
1798 }
1799 break;
1800 case '>':
1801 if (!(syntax & RE_NO_GNU_OPS))
1802 {
1803 token->type = ANCHOR;
1804 token->opr.ctx_type = WORD_LAST;
1805 }
1806 break;
1807 case 'b':
1808 if (!(syntax & RE_NO_GNU_OPS))
1809 {
1810 token->type = ANCHOR;
1811 token->opr.ctx_type = WORD_DELIM;
1812 }
1813 break;
1814 case 'B':
1815 if (!(syntax & RE_NO_GNU_OPS))
1816 {
1817 token->type = ANCHOR;
1818 token->opr.ctx_type = NOT_WORD_DELIM;
1819 }
1820 break;
1821 case 'w':
1822 if (!(syntax & RE_NO_GNU_OPS))
1823 token->type = OP_WORD;
1824 break;
1825 case 'W':
1826 if (!(syntax & RE_NO_GNU_OPS))
1827 token->type = OP_NOTWORD;
1828 break;
1829 case 's':
1830 if (!(syntax & RE_NO_GNU_OPS))
1831 token->type = OP_SPACE;
1832 break;
1833 case 'S':
1834 if (!(syntax & RE_NO_GNU_OPS))
1835 token->type = OP_NOTSPACE;
1836 break;
1837 case '`':
1838 if (!(syntax & RE_NO_GNU_OPS))
1839 {
1840 token->type = ANCHOR;
1841 token->opr.ctx_type = BUF_FIRST;
1842 }
1843 break;
1844 case '\'':
1845 if (!(syntax & RE_NO_GNU_OPS))
1846 {
1847 token->type = ANCHOR;
1848 token->opr.ctx_type = BUF_LAST;
1849 }
1850 break;
1851 case '(':
1852 if (!(syntax & RE_NO_BK_PARENS))
1853 token->type = OP_OPEN_SUBEXP;
1854 break;
1855 case ')':
1856 if (!(syntax & RE_NO_BK_PARENS))
1857 token->type = OP_CLOSE_SUBEXP;
1858 break;
1859 case '+':
1860 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1861 token->type = OP_DUP_PLUS;
1862 break;
1863 case '?':
1864 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1865 token->type = OP_DUP_QUESTION;
1866 break;
1867 case '{':
1868 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1869 token->type = OP_OPEN_DUP_NUM;
1870 break;
1871 case '}':
1872 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1873 token->type = OP_CLOSE_DUP_NUM;
1874 break;
1875 default:
1876 break;
1877 }
1878 return 2;
1879 }
1880
1881 token->type = CHARACTER;
1882#ifdef RE_ENABLE_I18N
1883 if (input->mb_cur_max > 1)
1884 {
1885 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1886 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1887 }
1888 else
1889#endif
1890 token->word_char = IS_WORD_CHAR (token->opr.c);
1891
1892 switch (c)
1893 {
1894 case '\n':
1895 if (syntax & RE_NEWLINE_ALT)
1896 token->type = OP_ALT;
1897 break;
1898 case '|':
1899 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1900 token->type = OP_ALT;
1901 break;
1902 case '*':
1903 token->type = OP_DUP_ASTERISK;
1904 break;
1905 case '+':
1906 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1907 token->type = OP_DUP_PLUS;
1908 break;
1909 case '?':
1910 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1911 token->type = OP_DUP_QUESTION;
1912 break;
1913 case '{':
1914 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1915 token->type = OP_OPEN_DUP_NUM;
1916 break;
1917 case '}':
1918 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1919 token->type = OP_CLOSE_DUP_NUM;
1920 break;
1921 case '(':
1922 if (syntax & RE_NO_BK_PARENS)
1923 token->type = OP_OPEN_SUBEXP;
1924 break;
1925 case ')':
1926 if (syntax & RE_NO_BK_PARENS)
1927 token->type = OP_CLOSE_SUBEXP;
1928 break;
1929 case '[':
1930 token->type = OP_OPEN_BRACKET;
1931 break;
1932 case '.':
1933 token->type = OP_PERIOD;
1934 break;
1935 case '^':
1936 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1937 re_string_cur_idx (input) != 0)
1938 {
1939 char prev = re_string_peek_byte (input, -1);
1940 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1941 break;
1942 }
1943 token->type = ANCHOR;
1944 token->opr.ctx_type = LINE_FIRST;
1945 break;
1946 case '$':
1947 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1948 re_string_cur_idx (input) + 1 != re_string_length (input))
1949 {
1950 re_token_t next;
1951 re_string_skip_bytes (input, 1);
1952 peek_token (&next, input, syntax);
1953 re_string_skip_bytes (input, -1);
1954 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1955 break;
1956 }
1957 token->type = ANCHOR;
1958 token->opr.ctx_type = LINE_LAST;
1959 break;
1960 default:
1961 break;
1962 }
1963 return 1;
1964}
1965
1966/* Peek a token from INPUT, and return the length of the token.
1967 We must not use this function out of bracket expressions. */
1968
1969static int
1970internal_function
1971peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1972{
1973 unsigned char c;
1974 if (re_string_eoi (input))
1975 {
1976 token->type = END_OF_RE;
1977 return 0;
1978 }
1979 c = re_string_peek_byte (input, 0);
1980 token->opr.c = c;
1981
1982#ifdef RE_ENABLE_I18N
1983 if (input->mb_cur_max > 1 &&
1984 !re_string_first_byte (input, re_string_cur_idx (input)))
1985 {
1986 token->type = CHARACTER;
1987 return 1;
1988 }
1989#endif /* RE_ENABLE_I18N */
1990
1991 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1992 && re_string_cur_idx (input) + 1 < re_string_length (input))
1993 {
1994 /* In this case, '\' escape a character. */
1995 unsigned char c2;
1996 re_string_skip_bytes (input, 1);
1997 c2 = re_string_peek_byte (input, 0);
1998 token->opr.c = c2;
1999 token->type = CHARACTER;
2000 return 1;
2001 }
2002 if (c == '[') /* '[' is a special char in a bracket exps. */
2003 {
2004 unsigned char c2;
2005 int token_len;
2006 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2007 c2 = re_string_peek_byte (input, 1);
2008 else
2009 c2 = 0;
2010 token->opr.c = c2;
2011 token_len = 2;
2012 switch (c2)
2013 {
2014 case '.':
2015 token->type = OP_OPEN_COLL_ELEM;
2016 break;
2017 case '=':
2018 token->type = OP_OPEN_EQUIV_CLASS;
2019 break;
2020 case ':':
2021 if (syntax & RE_CHAR_CLASSES)
2022 {
2023 token->type = OP_OPEN_CHAR_CLASS;
2024 break;
2025 }
2026 /* else fall through. */
2027 default:
2028 token->type = CHARACTER;
2029 token->opr.c = c;
2030 token_len = 1;
2031 break;
2032 }
2033 return token_len;
2034 }
2035 switch (c)
2036 {
2037 case '-':
2038 token->type = OP_CHARSET_RANGE;
2039 break;
2040 case ']':
2041 token->type = OP_CLOSE_BRACKET;
2042 break;
2043 case '^':
2044 token->type = OP_NON_MATCH_LIST;
2045 break;
2046 default:
2047 token->type = CHARACTER;
2048 }
2049 return 1;
2050}
2051
2052
2053/* Functions for parser. */
2054
2055/* Entry point of the parser.
2056 Parse the regular expression REGEXP and return the structure tree.
2057 If an error is occured, ERR is set by error code, and return NULL.
2058 This function build the following tree, from regular expression <reg_exp>:
2059 CAT
2060 / \
2061 / \
2062 <reg_exp> EOR
2063
2064 CAT means concatenation.
2065 EOR means end of regular expression. */
2066
2067static bin_tree_t *
2068parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2069 reg_errcode_t *err)
2070{
2071 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2072 bin_tree_t *tree, *eor, *root;
2073 re_token_t current_token;
2074 dfa->syntax = syntax;
2075 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2076 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2077 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2078 return NULL;
2079 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2080 if (tree != NULL)
2081 root = create_tree (dfa, tree, eor, CONCAT);
2082 else
2083 root = eor;
2084 if (BE (eor == NULL || root == NULL, 0))
2085 {
2086 *err = REG_ESPACE;
2087 return NULL;
2088 }
2089 return root;
2090}
2091
2092/* This function build the following tree, from regular expression
2093 <branch1>|<branch2>:
2094 ALT
2095 / \
2096 / \
2097 <branch1> <branch2>
2098
2099 ALT means alternative, which represents the operator `|'. */
2100
2101static bin_tree_t *
2102parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2103 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2104{
2105 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2106 bin_tree_t *tree, *branch = NULL;
2107 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2108 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2109 return NULL;
2110
2111 while (token->type == OP_ALT)
2112 {
2113 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2114 if (token->type != OP_ALT && token->type != END_OF_RE
2115 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2116 {
2117 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2118 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2119 return NULL;
2120 }
2121 else
2122 branch = NULL;
2123 tree = create_tree (dfa, tree, branch, OP_ALT);
2124 if (BE (tree == NULL, 0))
2125 {
2126 *err = REG_ESPACE;
2127 return NULL;
2128 }
2129 }
2130 return tree;
2131}
2132
2133/* This function build the following tree, from regular expression
2134 <exp1><exp2>:
2135 CAT
2136 / \
2137 / \
2138 <exp1> <exp2>
2139
2140 CAT means concatenation. */
2141
2142static bin_tree_t *
2143parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2144 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2145{
2146 bin_tree_t *tree, *exp;
2147 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2148 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2149 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2150 return NULL;
2151
2152 while (token->type != OP_ALT && token->type != END_OF_RE
2153 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2154 {
2155 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2156 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2157 {
2158 return NULL;
2159 }
2160 if (tree != NULL && exp != NULL)
2161 {
2162 tree = create_tree (dfa, tree, exp, CONCAT);
2163 if (tree == NULL)
2164 {
2165 *err = REG_ESPACE;
2166 return NULL;
2167 }
2168 }
2169 else if (tree == NULL)
2170 tree = exp;
2171 /* Otherwise exp == NULL, we don't need to create new tree. */
2172 }
2173 return tree;
2174}
2175
2176/* This function build the following tree, from regular expression a*:
2177 *
2178 |
2179 a
2180*/
2181
2182static bin_tree_t *
2183parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2184 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2185{
2186 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2187 bin_tree_t *tree;
2188 switch (token->type)
2189 {
2190 case CHARACTER:
2191 tree = create_token_tree (dfa, NULL, NULL, token);
2192 if (BE (tree == NULL, 0))
2193 {
2194 *err = REG_ESPACE;
2195 return NULL;
2196 }
2197#ifdef RE_ENABLE_I18N
2198 if (dfa->mb_cur_max > 1)
2199 {
2200 while (!re_string_eoi (regexp)
2201 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2202 {
2203 bin_tree_t *mbc_remain;
2204 fetch_token (token, regexp, syntax);
2205 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2206 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2207 if (BE (mbc_remain == NULL || tree == NULL, 0))
2208 {
2209 *err = REG_ESPACE;
2210 return NULL;
2211 }
2212 }
2213 }
2214#endif
2215 break;
2216 case OP_OPEN_SUBEXP:
2217 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2218 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2219 return NULL;
2220 break;
2221 case OP_OPEN_BRACKET:
2222 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2223 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2224 return NULL;
2225 break;
2226 case OP_BACK_REF:
2227 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2228 {
2229 *err = REG_ESUBREG;
2230 return NULL;
2231 }
2232 dfa->used_bkref_map |= 1 << token->opr.idx;
2233 tree = create_token_tree (dfa, NULL, NULL, token);
2234 if (BE (tree == NULL, 0))
2235 {
2236 *err = REG_ESPACE;
2237 return NULL;
2238 }
2239 ++dfa->nbackref;
2240 dfa->has_mb_node = 1;
2241 break;
2242 case OP_OPEN_DUP_NUM:
2243 if (syntax & RE_CONTEXT_INVALID_DUP)
2244 {
2245 *err = REG_BADRPT;
2246 return NULL;
2247 }
2248 /* FALLTHROUGH */
2249 case OP_DUP_ASTERISK:
2250 case OP_DUP_PLUS:
2251 case OP_DUP_QUESTION:
2252 if (syntax & RE_CONTEXT_INVALID_OPS)
2253 {
2254 *err = REG_BADRPT;
2255 return NULL;
2256 }
2257 else if (syntax & RE_CONTEXT_INDEP_OPS)
2258 {
2259 fetch_token (token, regexp, syntax);
2260 return parse_expression (regexp, preg, token, syntax, nest, err);
2261 }
2262 /* else fall through */
2263 case OP_CLOSE_SUBEXP:
2264 if ((token->type == OP_CLOSE_SUBEXP) &&
2265 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2266 {
2267 *err = REG_ERPAREN;
2268 return NULL;
2269 }
2270 /* else fall through */
2271 case OP_CLOSE_DUP_NUM:
2272 /* We treat it as a normal character. */
2273
2274 /* Then we can these characters as normal characters. */
2275 token->type = CHARACTER;
2276 /* mb_partial and word_char bits should be initialized already
2277 by peek_token. */
2278 tree = create_token_tree (dfa, NULL, NULL, token);
2279 if (BE (tree == NULL, 0))
2280 {
2281 *err = REG_ESPACE;
2282 return NULL;
2283 }
2284 break;
2285 case ANCHOR:
2286 if ((token->opr.ctx_type
2287 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2288 && dfa->word_ops_used == 0)
2289 init_word_char (dfa);
2290 if (token->opr.ctx_type == WORD_DELIM
2291 || token->opr.ctx_type == NOT_WORD_DELIM)
2292 {
2293 bin_tree_t *tree_first, *tree_last;
2294 if (token->opr.ctx_type == WORD_DELIM)
2295 {
2296 token->opr.ctx_type = WORD_FIRST;
2297 tree_first = create_token_tree (dfa, NULL, NULL, token);
2298 token->opr.ctx_type = WORD_LAST;
2299 }
2300 else
2301 {
2302 token->opr.ctx_type = INSIDE_WORD;
2303 tree_first = create_token_tree (dfa, NULL, NULL, token);
2304 token->opr.ctx_type = INSIDE_NOTWORD;
2305 }
2306 tree_last = create_token_tree (dfa, NULL, NULL, token);
2307 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2308 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2309 {
2310 *err = REG_ESPACE;
2311 return NULL;
2312 }
2313 }
2314 else
2315 {
2316 tree = create_token_tree (dfa, NULL, NULL, token);
2317 if (BE (tree == NULL, 0))
2318 {
2319 *err = REG_ESPACE;
2320 return NULL;
2321 }
2322 }
2323 /* We must return here, since ANCHORs can't be followed
2324 by repetition operators.
2325 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2326 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2327 fetch_token (token, regexp, syntax);
2328 return tree;
2329 case OP_PERIOD:
2330 tree = create_token_tree (dfa, NULL, NULL, token);
2331 if (BE (tree == NULL, 0))
2332 {
2333 *err = REG_ESPACE;
2334 return NULL;
2335 }
2336 if (dfa->mb_cur_max > 1)
2337 dfa->has_mb_node = 1;
2338 break;
2339 case OP_WORD:
2340 case OP_NOTWORD:
2341 tree = build_charclass_op (dfa, regexp->trans,
2342 (const unsigned char *) "alnum",
2343 (const unsigned char *) "_",
2344 token->type == OP_NOTWORD, err);
2345 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2346 return NULL;
2347 break;
2348 case OP_SPACE:
2349 case OP_NOTSPACE:
2350 tree = build_charclass_op (dfa, regexp->trans,
2351 (const unsigned char *) "space",
2352 (const unsigned char *) "",
2353 token->type == OP_NOTSPACE, err);
2354 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2355 return NULL;
2356 break;
2357 case OP_ALT:
2358 case END_OF_RE:
2359 return NULL;
2360 case BACK_SLASH:
2361 *err = REG_EESCAPE;
2362 return NULL;
2363 default:
2364 /* Must not happen? */
2365#ifdef DEBUG
2366 assert (0);
2367#endif
2368 return NULL;
2369 }
2370 fetch_token (token, regexp, syntax);
2371
2372 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2373 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2374 {
2375 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2376 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2377 return NULL;
2378 /* In BRE consecutive duplications are not allowed. */
2379 if ((syntax & RE_CONTEXT_INVALID_DUP)
2380 && (token->type == OP_DUP_ASTERISK
2381 || token->type == OP_OPEN_DUP_NUM))
2382 {
2383 *err = REG_BADRPT;
2384 return NULL;
2385 }
2386 }
2387
2388 return tree;
2389}
2390
2391/* This function build the following tree, from regular expression
2392 (<reg_exp>):
2393 SUBEXP
2394 |
2395 <reg_exp>
2396*/
2397
2398static bin_tree_t *
2399parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2400 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2401{
2402 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2403 bin_tree_t *tree;
2404 size_t cur_nsub;
2405 cur_nsub = preg->re_nsub++;
2406
2407 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2408
2409 /* The subexpression may be a null string. */
2410 if (token->type == OP_CLOSE_SUBEXP)
2411 tree = NULL;
2412 else
2413 {
2414 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2415 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2416 *err = REG_EPAREN;
2417 if (BE (*err != REG_NOERROR, 0))
2418 return NULL;
2419 }
2420
2421 if (cur_nsub <= '9' - '1')
2422 dfa->completed_bkref_map |= 1 << cur_nsub;
2423
2424 tree = create_tree (dfa, tree, NULL, SUBEXP);
2425 if (BE (tree == NULL, 0))
2426 {
2427 *err = REG_ESPACE;
2428 return NULL;
2429 }
2430 tree->token.opr.idx = cur_nsub;
2431 return tree;
2432}
2433
2434/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2435
2436static bin_tree_t *
2437parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2438 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2439{
2440 bin_tree_t *tree = NULL, *old_tree = NULL;
2441 int i, start, end, start_idx = re_string_cur_idx (regexp);
2442 re_token_t start_token = *token;
2443
2444 if (token->type == OP_OPEN_DUP_NUM)
2445 {
2446 end = 0;
2447 start = fetch_number (regexp, token, syntax);
2448 if (start == -1)
2449 {
2450 if (token->type == CHARACTER && token->opr.c == ',')
2451 start = 0; /* We treat "{,m}" as "{0,m}". */
2452 else
2453 {
2454 *err = REG_BADBR; /* <re>{} is invalid. */
2455 return NULL;
2456 }
2457 }
2458 if (BE (start != -2, 1))
2459 {
2460 /* We treat "{n}" as "{n,n}". */
2461 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2462 : ((token->type == CHARACTER && token->opr.c == ',')
2463 ? fetch_number (regexp, token, syntax) : -2));
2464 }
2465 if (BE (start == -2 || end == -2, 0))
2466 {
2467 /* Invalid sequence. */
2468 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2469 {
2470 if (token->type == END_OF_RE)
2471 *err = REG_EBRACE;
2472 else
2473 *err = REG_BADBR;
2474
2475 return NULL;
2476 }
2477
2478 /* If the syntax bit is set, rollback. */
2479 re_string_set_index (regexp, start_idx);
2480 *token = start_token;
2481 token->type = CHARACTER;
2482 /* mb_partial and word_char bits should be already initialized by
2483 peek_token. */
2484 return elem;
2485 }
2486
2487 if (BE (end != -1 && start > end, 0))
2488 {
2489 /* First number greater than second. */
2490 *err = REG_BADBR;
2491 return NULL;
2492 }
2493 }
2494 else
2495 {
2496 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2497 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2498 }
2499
2500 fetch_token (token, regexp, syntax);
2501
2502 if (BE (elem == NULL, 0))
2503 return NULL;
2504 if (BE (start == 0 && end == 0, 0))
2505 {
2506 postorder (elem, free_tree, NULL);
2507 return NULL;
2508 }
2509
2510 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2511 if (BE (start > 0, 0))
2512 {
2513 tree = elem;
2514 for (i = 2; i <= start; ++i)
2515 {
2516 elem = duplicate_tree (elem, dfa);
2517 tree = create_tree (dfa, tree, elem, CONCAT);
2518 if (BE (elem == NULL || tree == NULL, 0))
2519 goto parse_dup_op_espace;
2520 }
2521
2522 if (start == end)
2523 return tree;
2524
2525 /* Duplicate ELEM before it is marked optional. */
2526 elem = duplicate_tree (elem, dfa);
2527 old_tree = tree;
2528 }
2529 else
2530 old_tree = NULL;
2531
2532 if (elem->token.type == SUBEXP)
2533 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2534
2535 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2536 if (BE (tree == NULL, 0))
2537 goto parse_dup_op_espace;
2538
2539 /* This loop is actually executed only when end != -1,
2540 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2541 already created the start+1-th copy. */
2542 for (i = start + 2; i <= end; ++i)
2543 {
2544 elem = duplicate_tree (elem, dfa);
2545 tree = create_tree (dfa, tree, elem, CONCAT);
2546 if (BE (elem == NULL || tree == NULL, 0))
2547 goto parse_dup_op_espace;
2548
2549 tree = create_tree (dfa, tree, NULL, OP_ALT);
2550 if (BE (tree == NULL, 0))
2551 goto parse_dup_op_espace;
2552 }
2553
2554 if (old_tree)
2555 tree = create_tree (dfa, old_tree, tree, CONCAT);
2556
2557 return tree;
2558
2559 parse_dup_op_espace:
2560 *err = REG_ESPACE;
2561 return NULL;
2562}
2563
2564/* Size of the names for collating symbol/equivalence_class/character_class.
2565 I'm not sure, but maybe enough. */
2566#define BRACKET_NAME_BUF_SIZE 32
2567
2568#ifndef _LIBC
2569 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2570 Build the range expression which starts from START_ELEM, and ends
2571 at END_ELEM. The result are written to MBCSET and SBCSET.
2572 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2573 mbcset->range_ends, is a pointer argument sinse we may
2574 update it. */
2575
2576static reg_errcode_t
2577internal_function
2578# ifdef RE_ENABLE_I18N
2579build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2580 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2581# else /* not RE_ENABLE_I18N */
2582build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2583 bracket_elem_t *end_elem)
2584# endif /* not RE_ENABLE_I18N */
2585{
2586 unsigned int start_ch, end_ch;
2587 /* Equivalence Classes and Character Classes can't be a range start/end. */
2588 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2589 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2590 0))
2591 return REG_ERANGE;
2592
2593 /* We can handle no multi character collating elements without libc
2594 support. */
2595 if (BE ((start_elem->type == COLL_SYM
2596 && strlen ((char *) start_elem->opr.name) > 1)
2597 || (end_elem->type == COLL_SYM
2598 && strlen ((char *) end_elem->opr.name) > 1), 0))
2599 return REG_ECOLLATE;
2600
2601# ifdef RE_ENABLE_I18N
2602 {
2603 wchar_t wc;
2604 wint_t start_wc;
2605 wint_t end_wc;
2606 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2607
2608 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2609 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2610 : 0));
2611 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2612 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2613 : 0));
2614 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2615 ? __btowc (start_ch) : start_elem->opr.wch);
2616 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2617 ? __btowc (end_ch) : end_elem->opr.wch);
2618 if (start_wc == WEOF || end_wc == WEOF)
2619 return REG_ECOLLATE;
2620 cmp_buf[0] = start_wc;
2621 cmp_buf[4] = end_wc;
2622 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2623 return REG_ERANGE;
2624
2625 /* Got valid collation sequence values, add them as a new entry.
2626 However, for !_LIBC we have no collation elements: if the
2627 character set is single byte, the single byte character set
2628 that we build below suffices. parse_bracket_exp passes
2629 no MBCSET if dfa->mb_cur_max == 1. */
2630 if (mbcset)
2631 {
2632 /* Check the space of the arrays. */
2633 if (BE (*range_alloc == mbcset->nranges, 0))
2634 {
2635 /* There is not enough space, need realloc. */
2636 wchar_t *new_array_start, *new_array_end;
2637 int new_nranges;
2638
2639 /* +1 in case of mbcset->nranges is 0. */
2640 new_nranges = 2 * mbcset->nranges + 1;
2641 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2642 are NULL if *range_alloc == 0. */
2643 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2644 new_nranges);
2645 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2646 new_nranges);
2647
2648 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2649 return REG_ESPACE;
2650
2651 mbcset->range_starts = new_array_start;
2652 mbcset->range_ends = new_array_end;
2653 *range_alloc = new_nranges;
2654 }
2655
2656 mbcset->range_starts[mbcset->nranges] = start_wc;
2657 mbcset->range_ends[mbcset->nranges++] = end_wc;
2658 }
2659
2660 /* Build the table for single byte characters. */
2661 for (wc = 0; wc < SBC_MAX; ++wc)
2662 {
2663 cmp_buf[2] = wc;
2664 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2665 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2666 bitset_set (sbcset, wc);
2667 }
2668 }
2669# else /* not RE_ENABLE_I18N */
2670 {
2671 unsigned int ch;
2672 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2673 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2674 : 0));
2675 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2676 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2677 : 0));
2678 if (start_ch > end_ch)
2679 return REG_ERANGE;
2680 /* Build the table for single byte characters. */
2681 for (ch = 0; ch < SBC_MAX; ++ch)
2682 if (start_ch <= ch && ch <= end_ch)
2683 bitset_set (sbcset, ch);
2684 }
2685# endif /* not RE_ENABLE_I18N */
2686 return REG_NOERROR;
2687}
2688#endif /* not _LIBC */
2689
2690#ifndef _LIBC
2691/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2692 Build the collating element which is represented by NAME.
2693 The result are written to MBCSET and SBCSET.
2694 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2695 pointer argument since we may update it. */
2696
2697static reg_errcode_t
2698internal_function
2699# ifdef RE_ENABLE_I18N
2700build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2701 int *coll_sym_alloc, const unsigned char *name)
2702# else /* not RE_ENABLE_I18N */
2703build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2704# endif /* not RE_ENABLE_I18N */
2705{
2706 size_t name_len = strlen ((const char *) name);
2707 if (BE (name_len != 1, 0))
2708 return REG_ECOLLATE;
2709 else
2710 {
2711 bitset_set (sbcset, name[0]);
2712 return REG_NOERROR;
2713 }
2714}
2715#endif /* not _LIBC */
2716
2717/* This function parse bracket expression like "[abc]", "[a-c]",
2718 "[[.a-a.]]" etc. */
2719
2720static bin_tree_t *
2721parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2722 reg_syntax_t syntax, reg_errcode_t *err)
2723{
2724#ifdef _LIBC
2725 const unsigned char *collseqmb;
2726 const char *collseqwc;
2727 uint32_t nrules;
2728 int32_t table_size;
2729 const int32_t *symb_table;
2730 const unsigned char *extra;
2731
2732 /* Local function for parse_bracket_exp used in _LIBC environement.
2733 Seek the collating symbol entry correspondings to NAME.
2734 Return the index of the symbol in the SYMB_TABLE. */
2735
2736 auto inline int32_t
2737 __attribute ((always_inline))
2738 seek_collating_symbol_entry (name, name_len)
2739 const unsigned char *name;
2740 size_t name_len;
2741 {
2742 int32_t hash = elem_hash ((const char *) name, name_len);
2743 int32_t elem = hash % table_size;
2744 if (symb_table[2 * elem] != 0)
2745 {
2746 int32_t second = hash % (table_size - 2) + 1;
2747
2748 do
2749 {
2750 /* First compare the hashing value. */
2751 if (symb_table[2 * elem] == hash
2752 /* Compare the length of the name. */
2753 && name_len == extra[symb_table[2 * elem + 1]]
2754 /* Compare the name. */
2755 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2756 name_len) == 0)
2757 {
2758 /* Yep, this is the entry. */
2759 break;
2760 }
2761
2762 /* Next entry. */
2763 elem += second;
2764 }
2765 while (symb_table[2 * elem] != 0);
2766 }
2767 return elem;
2768 }
2769
2770 /* Local function for parse_bracket_exp used in _LIBC environement.
2771 Look up the collation sequence value of BR_ELEM.
2772 Return the value if succeeded, UINT_MAX otherwise. */
2773
2774 auto inline unsigned int
2775 __attribute ((always_inline))
2776 lookup_collation_sequence_value (br_elem)
2777 bracket_elem_t *br_elem;
2778 {
2779 if (br_elem->type == SB_CHAR)
2780 {
2781 /*
2782 if (MB_CUR_MAX == 1)
2783 */
2784 if (nrules == 0)
2785 return collseqmb[br_elem->opr.ch];
2786 else
2787 {
2788 wint_t wc = __btowc (br_elem->opr.ch);
2789 return __collseq_table_lookup (collseqwc, wc);
2790 }
2791 }
2792 else if (br_elem->type == MB_CHAR)
2793 {
2794 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2795 }
2796 else if (br_elem->type == COLL_SYM)
2797 {
2798 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2799 if (nrules != 0)
2800 {
2801 int32_t elem, idx;
2802 elem = seek_collating_symbol_entry (br_elem->opr.name,
2803 sym_name_len);
2804 if (symb_table[2 * elem] != 0)
2805 {
2806 /* We found the entry. */
2807 idx = symb_table[2 * elem + 1];
2808 /* Skip the name of collating element name. */
2809 idx += 1 + extra[idx];
2810 /* Skip the byte sequence of the collating element. */
2811 idx += 1 + extra[idx];
2812 /* Adjust for the alignment. */
2813 idx = (idx + 3) & ~3;
2814 /* Skip the multibyte collation sequence value. */
2815 idx += sizeof (unsigned int);
2816 /* Skip the wide char sequence of the collating element. */
2817 idx += sizeof (unsigned int) *
2818 (1 + *(unsigned int *) (extra + idx));
2819 /* Return the collation sequence value. */
2820 return *(unsigned int *) (extra + idx);
2821 }
2822 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2823 {
2824 /* No valid character. Match it as a single byte
2825 character. */
2826 return collseqmb[br_elem->opr.name[0]];
2827 }
2828 }
2829 else if (sym_name_len == 1)
2830 return collseqmb[br_elem->opr.name[0]];
2831 }
2832 return UINT_MAX;
2833 }
2834
2835 /* Local function for parse_bracket_exp used in _LIBC environement.
2836 Build the range expression which starts from START_ELEM, and ends
2837 at END_ELEM. The result are written to MBCSET and SBCSET.
2838 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2839 mbcset->range_ends, is a pointer argument sinse we may
2840 update it. */
2841
2842 auto inline reg_errcode_t
2843 __attribute ((always_inline))
2844 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2845 re_charset_t *mbcset;
2846 int *range_alloc;
2847 bitset_t sbcset;
2848 bracket_elem_t *start_elem, *end_elem;
2849 {
2850 unsigned int ch;
2851 uint32_t start_collseq;
2852 uint32_t end_collseq;
2853
2854 /* Equivalence Classes and Character Classes can't be a range
2855 start/end. */
2856 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2857 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2858 0))
2859 return REG_ERANGE;
2860
2861 start_collseq = lookup_collation_sequence_value (start_elem);
2862 end_collseq = lookup_collation_sequence_value (end_elem);
2863 /* Check start/end collation sequence values. */
2864 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2865 return REG_ECOLLATE;
2866 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2867 return REG_ERANGE;
2868
2869 /* Got valid collation sequence values, add them as a new entry.
2870 However, if we have no collation elements, and the character set
2871 is single byte, the single byte character set that we
2872 build below suffices. */
2873 if (nrules > 0 || dfa->mb_cur_max > 1)
2874 {
2875 /* Check the space of the arrays. */
2876 if (BE (*range_alloc == mbcset->nranges, 0))
2877 {
2878 /* There is not enough space, need realloc. */
2879 uint32_t *new_array_start;
2880 uint32_t *new_array_end;
2881 int new_nranges;
2882
2883 /* +1 in case of mbcset->nranges is 0. */
2884 new_nranges = 2 * mbcset->nranges + 1;
2885 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2886 new_nranges);
2887 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2888 new_nranges);
2889
2890 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2891 return REG_ESPACE;
2892
2893 mbcset->range_starts = new_array_start;
2894 mbcset->range_ends = new_array_end;
2895 *range_alloc = new_nranges;
2896 }
2897
2898 mbcset->range_starts[mbcset->nranges] = start_collseq;
2899 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2900 }
2901
2902 /* Build the table for single byte characters. */
2903 for (ch = 0; ch < SBC_MAX; ch++)
2904 {
2905 uint32_t ch_collseq;
2906 /*
2907 if (MB_CUR_MAX == 1)
2908 */
2909 if (nrules == 0)
2910 ch_collseq = collseqmb[ch];
2911 else
2912 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2913 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2914 bitset_set (sbcset, ch);
2915 }
2916 return REG_NOERROR;
2917 }
2918
2919 /* Local function for parse_bracket_exp used in _LIBC environement.
2920 Build the collating element which is represented by NAME.
2921 The result are written to MBCSET and SBCSET.
2922 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2923 pointer argument sinse we may update it. */
2924
2925 auto inline reg_errcode_t
2926 __attribute ((always_inline))
2927 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2928 re_charset_t *mbcset;
2929 int *coll_sym_alloc;
2930 bitset_t sbcset;
2931 const unsigned char *name;
2932 {
2933 int32_t elem, idx;
2934 size_t name_len = strlen ((const char *) name);
2935 if (nrules != 0)
2936 {
2937 elem = seek_collating_symbol_entry (name, name_len);
2938 if (symb_table[2 * elem] != 0)
2939 {
2940 /* We found the entry. */
2941 idx = symb_table[2 * elem + 1];
2942 /* Skip the name of collating element name. */
2943 idx += 1 + extra[idx];
2944 }
2945 else if (symb_table[2 * elem] == 0 && name_len == 1)
2946 {
2947 /* No valid character, treat it as a normal
2948 character. */
2949 bitset_set (sbcset, name[0]);
2950 return REG_NOERROR;
2951 }
2952 else
2953 return REG_ECOLLATE;
2954
2955 /* Got valid collation sequence, add it as a new entry. */
2956 /* Check the space of the arrays. */
2957 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2958 {
2959 /* Not enough, realloc it. */
2960 /* +1 in case of mbcset->ncoll_syms is 0. */
2961 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2962 /* Use realloc since mbcset->coll_syms is NULL
2963 if *alloc == 0. */
2964 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2965 new_coll_sym_alloc);
2966 if (BE (new_coll_syms == NULL, 0))
2967 return REG_ESPACE;
2968 mbcset->coll_syms = new_coll_syms;
2969 *coll_sym_alloc = new_coll_sym_alloc;
2970 }
2971 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2972 return REG_NOERROR;
2973 }
2974 else
2975 {
2976 if (BE (name_len != 1, 0))
2977 return REG_ECOLLATE;
2978 else
2979 {
2980 bitset_set (sbcset, name[0]);
2981 return REG_NOERROR;
2982 }
2983 }
2984 }
2985#endif
2986
2987 re_token_t br_token;
2988 re_bitset_ptr_t sbcset;
2989#ifdef RE_ENABLE_I18N
2990 re_charset_t *mbcset;
2991 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2992 int equiv_class_alloc = 0, char_class_alloc = 0;
2993#endif /* not RE_ENABLE_I18N */
2994 int non_match = 0;
2995 bin_tree_t *work_tree;
2996 int token_len;
2997 int first_round = 1;
2998#ifdef _LIBC
2999 collseqmb = (const unsigned char *)
3000 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3001 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3002 if (nrules)
3003 {
3004 /*
3005 if (MB_CUR_MAX > 1)
3006 */
3007 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3008 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3009 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3010 _NL_COLLATE_SYMB_TABLEMB);
3011 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3012 _NL_COLLATE_SYMB_EXTRAMB);
3013 }
3014#endif
3015 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3016#ifdef RE_ENABLE_I18N
3017 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3018#endif /* RE_ENABLE_I18N */
3019#ifdef RE_ENABLE_I18N
3020 if (BE (sbcset == NULL || mbcset == NULL, 0))
3021#else
3022 if (BE (sbcset == NULL, 0))
3023#endif /* RE_ENABLE_I18N */
3024 {
3025 *err = REG_ESPACE;
3026 return NULL;
3027 }
3028
3029 token_len = peek_token_bracket (token, regexp, syntax);
3030 if (BE (token->type == END_OF_RE, 0))
3031 {
3032 *err = REG_BADPAT;
3033 goto parse_bracket_exp_free_return;
3034 }
3035 if (token->type == OP_NON_MATCH_LIST)
3036 {
3037#ifdef RE_ENABLE_I18N
3038 mbcset->non_match = 1;
3039#endif /* not RE_ENABLE_I18N */
3040 non_match = 1;
3041 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3042 bitset_set (sbcset, '\0');
3043 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3044 token_len = peek_token_bracket (token, regexp, syntax);
3045 if (BE (token->type == END_OF_RE, 0))
3046 {
3047 *err = REG_BADPAT;
3048 goto parse_bracket_exp_free_return;
3049 }
3050 }
3051
3052 /* We treat the first ']' as a normal character. */
3053 if (token->type == OP_CLOSE_BRACKET)
3054 token->type = CHARACTER;
3055
3056 while (1)
3057 {
3058 bracket_elem_t start_elem, end_elem;
3059 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3060 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3061 reg_errcode_t ret;
3062 int token_len2 = 0, is_range_exp = 0;
3063 re_token_t token2;
3064
3065 start_elem.opr.name = start_name_buf;
3066 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3067 syntax, first_round);
3068 if (BE (ret != REG_NOERROR, 0))
3069 {
3070 *err = ret;
3071 goto parse_bracket_exp_free_return;
3072 }
3073 first_round = 0;
3074
3075 /* Get information about the next token. We need it in any case. */
3076 token_len = peek_token_bracket (token, regexp, syntax);
3077
3078 /* Do not check for ranges if we know they are not allowed. */
3079 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3080 {
3081 if (BE (token->type == END_OF_RE, 0))
3082 {
3083 *err = REG_EBRACK;
3084 goto parse_bracket_exp_free_return;
3085 }
3086 if (token->type == OP_CHARSET_RANGE)
3087 {
3088 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3089 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3090 if (BE (token2.type == END_OF_RE, 0))
3091 {
3092 *err = REG_EBRACK;
3093 goto parse_bracket_exp_free_return;
3094 }
3095 if (token2.type == OP_CLOSE_BRACKET)
3096 {
3097 /* We treat the last '-' as a normal character. */
3098 re_string_skip_bytes (regexp, -token_len);
3099 token->type = CHARACTER;
3100 }
3101 else
3102 is_range_exp = 1;
3103 }
3104 }
3105
3106 if (is_range_exp == 1)
3107 {
3108 end_elem.opr.name = end_name_buf;
3109 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3110 dfa, syntax, 1);
3111 if (BE (ret != REG_NOERROR, 0))
3112 {
3113 *err = ret;
3114 goto parse_bracket_exp_free_return;
3115 }
3116
3117 token_len = peek_token_bracket (token, regexp, syntax);
3118
3119#ifdef _LIBC
3120 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3121 &start_elem, &end_elem);
3122#else
3123# ifdef RE_ENABLE_I18N
3124 *err = build_range_exp (sbcset,
3125 dfa->mb_cur_max > 1 ? mbcset : NULL,
3126 &range_alloc, &start_elem, &end_elem);
3127# else
3128 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3129# endif
3130#endif /* RE_ENABLE_I18N */
3131 if (BE (*err != REG_NOERROR, 0))
3132 goto parse_bracket_exp_free_return;
3133 }
3134 else
3135 {
3136 switch (start_elem.type)
3137 {
3138 case SB_CHAR:
3139 bitset_set (sbcset, start_elem.opr.ch);
3140 break;
3141#ifdef RE_ENABLE_I18N
3142 case MB_CHAR:
3143 /* Check whether the array has enough space. */
3144 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3145 {
3146 wchar_t *new_mbchars;
3147 /* Not enough, realloc it. */
3148 /* +1 in case of mbcset->nmbchars is 0. */
3149 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3150 /* Use realloc since array is NULL if *alloc == 0. */
3151 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3152 mbchar_alloc);
3153 if (BE (new_mbchars == NULL, 0))
3154 goto parse_bracket_exp_espace;
3155 mbcset->mbchars = new_mbchars;
3156 }
3157 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3158 break;
3159#endif /* RE_ENABLE_I18N */
3160 case EQUIV_CLASS:
3161 *err = build_equiv_class (sbcset,
3162#ifdef RE_ENABLE_I18N
3163 mbcset, &equiv_class_alloc,
3164#endif /* RE_ENABLE_I18N */
3165 start_elem.opr.name);
3166 if (BE (*err != REG_NOERROR, 0))
3167 goto parse_bracket_exp_free_return;
3168 break;
3169 case COLL_SYM:
3170 *err = build_collating_symbol (sbcset,
3171#ifdef RE_ENABLE_I18N
3172 mbcset, &coll_sym_alloc,
3173#endif /* RE_ENABLE_I18N */
3174 start_elem.opr.name);
3175 if (BE (*err != REG_NOERROR, 0))
3176 goto parse_bracket_exp_free_return;
3177 break;
3178 case CHAR_CLASS:
3179 *err = build_charclass (regexp->trans, sbcset,
3180#ifdef RE_ENABLE_I18N
3181 mbcset, &char_class_alloc,
3182#endif /* RE_ENABLE_I18N */
3183 start_elem.opr.name, syntax);
3184 if (BE (*err != REG_NOERROR, 0))
3185 goto parse_bracket_exp_free_return;
3186 break;
3187 default:
3188 assert (0);
3189 break;
3190 }
3191 }
3192 if (BE (token->type == END_OF_RE, 0))
3193 {
3194 *err = REG_EBRACK;
3195 goto parse_bracket_exp_free_return;
3196 }
3197 if (token->type == OP_CLOSE_BRACKET)
3198 break;
3199 }
3200
3201 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3202
3203 /* If it is non-matching list. */
3204 if (non_match)
3205 bitset_not (sbcset);
3206
3207#ifdef RE_ENABLE_I18N
3208 /* Ensure only single byte characters are set. */
3209 if (dfa->mb_cur_max > 1)
3210 bitset_mask (sbcset, dfa->sb_char);
3211
3212 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3213 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3214 || mbcset->non_match)))
3215 {
3216 bin_tree_t *mbc_tree;
3217 int sbc_idx;
3218 /* Build a tree for complex bracket. */
3219 dfa->has_mb_node = 1;
3220 br_token.type = COMPLEX_BRACKET;
3221 br_token.opr.mbcset = mbcset;
3222 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3223 if (BE (mbc_tree == NULL, 0))
3224 goto parse_bracket_exp_espace;
3225 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3226 if (sbcset[sbc_idx])
3227 break;
3228 /* If there are no bits set in sbcset, there is no point
3229 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3230 if (sbc_idx < BITSET_WORDS)
3231 {
3232 /* Build a tree for simple bracket. */
3233 br_token.type = SIMPLE_BRACKET;
3234 br_token.opr.sbcset = sbcset;
3235 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3236 if (BE (work_tree == NULL, 0))
3237 goto parse_bracket_exp_espace;
3238
3239 /* Then join them by ALT node. */
3240 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3241 if (BE (work_tree == NULL, 0))
3242 goto parse_bracket_exp_espace;
3243 }
3244 else
3245 {
3246 re_free (sbcset);
3247 work_tree = mbc_tree;
3248 }
3249 }
3250 else
3251#endif /* not RE_ENABLE_I18N */
3252 {
3253#ifdef RE_ENABLE_I18N
3254 free_charset (mbcset);
3255#endif
3256 /* Build a tree for simple bracket. */
3257 br_token.type = SIMPLE_BRACKET;
3258 br_token.opr.sbcset = sbcset;
3259 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3260 if (BE (work_tree == NULL, 0))
3261 goto parse_bracket_exp_espace;
3262 }
3263 return work_tree;
3264
3265 parse_bracket_exp_espace:
3266 *err = REG_ESPACE;
3267 parse_bracket_exp_free_return:
3268 re_free (sbcset);
3269#ifdef RE_ENABLE_I18N
3270 free_charset (mbcset);
3271#endif /* RE_ENABLE_I18N */
3272 return NULL;
3273}
3274
3275/* Parse an element in the bracket expression. */
3276
3277static reg_errcode_t
3278parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3279 re_token_t *token, int token_len, re_dfa_t *dfa,
3280 reg_syntax_t syntax, int accept_hyphen)
3281{
3282#ifdef RE_ENABLE_I18N
3283 int cur_char_size;
3284 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3285 if (cur_char_size > 1)
3286 {
3287 elem->type = MB_CHAR;
3288 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3289 re_string_skip_bytes (regexp, cur_char_size);
3290 return REG_NOERROR;
3291 }
3292#endif /* RE_ENABLE_I18N */
3293 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3294 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3295 || token->type == OP_OPEN_EQUIV_CLASS)
3296 return parse_bracket_symbol (elem, regexp, token);
3297 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3298 {
3299 /* A '-' must only appear as anything but a range indicator before
3300 the closing bracket. Everything else is an error. */
3301 re_token_t token2;
3302 (void) peek_token_bracket (&token2, regexp, syntax);
3303 if (token2.type != OP_CLOSE_BRACKET)
3304 /* The actual error value is not standardized since this whole
3305 case is undefined. But ERANGE makes good sense. */
3306 return REG_ERANGE;
3307 }
3308 elem->type = SB_CHAR;
3309 elem->opr.ch = token->opr.c;
3310 return REG_NOERROR;
3311}
3312
3313/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3314 such as [:<character_class>:], [.<collating_element>.], and
3315 [=<equivalent_class>=]. */
3316
3317static reg_errcode_t
3318parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3319 re_token_t *token)
3320{
3321 unsigned char ch, delim = token->opr.c;
3322 int i = 0;
3323 if (re_string_eoi(regexp))
3324 return REG_EBRACK;
3325 for (;; ++i)
3326 {
3327 if (i >= BRACKET_NAME_BUF_SIZE)
3328 return REG_EBRACK;
3329 if (token->type == OP_OPEN_CHAR_CLASS)
3330 ch = re_string_fetch_byte_case (regexp);
3331 else
3332 ch = re_string_fetch_byte (regexp);
3333 if (re_string_eoi(regexp))
3334 return REG_EBRACK;
3335 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3336 break;
3337 elem->opr.name[i] = ch;
3338 }
3339 re_string_skip_bytes (regexp, 1);
3340 elem->opr.name[i] = '\0';
3341 switch (token->type)
3342 {
3343 case OP_OPEN_COLL_ELEM:
3344 elem->type = COLL_SYM;
3345 break;
3346 case OP_OPEN_EQUIV_CLASS:
3347 elem->type = EQUIV_CLASS;
3348 break;
3349 case OP_OPEN_CHAR_CLASS:
3350 elem->type = CHAR_CLASS;
3351 break;
3352 default:
3353 break;
3354 }
3355 return REG_NOERROR;
3356}
3357
3358 /* Helper function for parse_bracket_exp.
3359 Build the equivalence class which is represented by NAME.
3360 The result are written to MBCSET and SBCSET.
3361 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3362 is a pointer argument sinse we may update it. */
3363
3364static reg_errcode_t
3365#ifdef RE_ENABLE_I18N
3366build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3367 int *equiv_class_alloc, const unsigned char *name)
3368#else /* not RE_ENABLE_I18N */
3369build_equiv_class (bitset_t sbcset, const unsigned char *name)
3370#endif /* not RE_ENABLE_I18N */
3371{
3372#ifdef _LIBC
3373 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3374 if (nrules != 0)
3375 {
3376 const int32_t *table, *indirect;
3377 const unsigned char *weights, *extra, *cp;
3378 unsigned char char_buf[2];
3379 int32_t idx1, idx2;
3380 unsigned int ch;
3381 size_t len;
3382 /* This #include defines a local function! */
3383# include <locale/weight.h>
3384 /* Calculate the index for equivalence class. */
3385 cp = name;
3386 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3387 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3388 _NL_COLLATE_WEIGHTMB);
3389 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3390 _NL_COLLATE_EXTRAMB);
3391 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3392 _NL_COLLATE_INDIRECTMB);
3393 idx1 = findidx (&cp);
3394 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3395 /* This isn't a valid character. */
3396 return REG_ECOLLATE;
3397
3398 /* Build single byte matcing table for this equivalence class. */
3399 char_buf[1] = (unsigned char) '\0';
3400 len = weights[idx1];
3401 for (ch = 0; ch < SBC_MAX; ++ch)
3402 {
3403 char_buf[0] = ch;
3404 cp = char_buf;
3405 idx2 = findidx (&cp);
3406/*
3407 idx2 = table[ch];
3408*/
3409 if (idx2 == 0)
3410 /* This isn't a valid character. */
3411 continue;
3412 if (len == weights[idx2])
3413 {
3414 int cnt = 0;
3415 while (cnt <= len &&
3416 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3417 ++cnt;
3418
3419 if (cnt > len)
3420 bitset_set (sbcset, ch);
3421 }
3422 }
3423 /* Check whether the array has enough space. */
3424 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3425 {
3426 /* Not enough, realloc it. */
3427 /* +1 in case of mbcset->nequiv_classes is 0. */
3428 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3429 /* Use realloc since the array is NULL if *alloc == 0. */
3430 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3431 int32_t,
3432 new_equiv_class_alloc);
3433 if (BE (new_equiv_classes == NULL, 0))
3434 return REG_ESPACE;
3435 mbcset->equiv_classes = new_equiv_classes;
3436 *equiv_class_alloc = new_equiv_class_alloc;
3437 }
3438 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3439 }
3440 else
3441#endif /* _LIBC */
3442 {
3443 if (BE (strlen ((const char *) name) != 1, 0))
3444 return REG_ECOLLATE;
3445 bitset_set (sbcset, *name);
3446 }
3447 return REG_NOERROR;
3448}
3449
3450 /* Helper function for parse_bracket_exp.
3451 Build the character class which is represented by NAME.
3452 The result are written to MBCSET and SBCSET.
3453 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3454 is a pointer argument sinse we may update it. */
3455
3456static reg_errcode_t
3457#ifdef RE_ENABLE_I18N
3458build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3459 re_charset_t *mbcset, int *char_class_alloc,
3460 const unsigned char *class_name, reg_syntax_t syntax)
3461#else /* not RE_ENABLE_I18N */
3462build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3463 const unsigned char *class_name, reg_syntax_t syntax)
3464#endif /* not RE_ENABLE_I18N */
3465{
3466 int i;
3467 const char *name = (const char *) class_name;
3468
3469 /* In case of REG_ICASE "upper" and "lower" match the both of
3470 upper and lower cases. */
3471 if ((syntax & RE_ICASE)
3472 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3473 name = "alpha";
3474
3475#ifdef RE_ENABLE_I18N
3476 /* Check the space of the arrays. */
3477 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3478 {
3479 /* Not enough, realloc it. */
3480 /* +1 in case of mbcset->nchar_classes is 0. */
3481 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3482 /* Use realloc since array is NULL if *alloc == 0. */
3483 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3484 new_char_class_alloc);
3485 if (BE (new_char_classes == NULL, 0))
3486 return REG_ESPACE;
3487 mbcset->char_classes = new_char_classes;
3488 *char_class_alloc = new_char_class_alloc;
3489 }
3490 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3491#endif /* RE_ENABLE_I18N */
3492
3493#define BUILD_CHARCLASS_LOOP(ctype_func) \
3494 do { \
3495 if (BE (trans != NULL, 0)) \
3496 { \
3497 for (i = 0; i < SBC_MAX; ++i) \
3498 if (ctype_func (i)) \
3499 bitset_set (sbcset, trans[i]); \
3500 } \
3501 else \
3502 { \
3503 for (i = 0; i < SBC_MAX; ++i) \
3504 if (ctype_func (i)) \
3505 bitset_set (sbcset, i); \
3506 } \
3507 } while (0)
3508
3509 if (strcmp (name, "alnum") == 0)
3510 BUILD_CHARCLASS_LOOP (isalnum);
3511 else if (strcmp (name, "cntrl") == 0)
3512 BUILD_CHARCLASS_LOOP (iscntrl);
3513 else if (strcmp (name, "lower") == 0)
3514 BUILD_CHARCLASS_LOOP (islower);
3515 else if (strcmp (name, "space") == 0)
3516 BUILD_CHARCLASS_LOOP (isspace);
3517 else if (strcmp (name, "alpha") == 0)
3518 BUILD_CHARCLASS_LOOP (isalpha);
3519 else if (strcmp (name, "digit") == 0)
3520 BUILD_CHARCLASS_LOOP (isdigit);
3521 else if (strcmp (name, "print") == 0)
3522 BUILD_CHARCLASS_LOOP (isprint);
3523 else if (strcmp (name, "upper") == 0)
3524 BUILD_CHARCLASS_LOOP (isupper);
3525 else if (strcmp (name, "blank") == 0)
3526 BUILD_CHARCLASS_LOOP (isblank);
3527 else if (strcmp (name, "graph") == 0)
3528 BUILD_CHARCLASS_LOOP (isgraph);
3529 else if (strcmp (name, "punct") == 0)
3530 BUILD_CHARCLASS_LOOP (ispunct);
3531 else if (strcmp (name, "xdigit") == 0)
3532 BUILD_CHARCLASS_LOOP (isxdigit);
3533 else
3534 return REG_ECTYPE;
3535
3536 return REG_NOERROR;
3537}
3538
3539static bin_tree_t *
3540build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3541 const unsigned char *class_name,
3542 const unsigned char *extra, int non_match,
3543 reg_errcode_t *err)
3544{
3545 re_bitset_ptr_t sbcset;
3546#ifdef RE_ENABLE_I18N
3547 re_charset_t *mbcset;
3548 int alloc = 0;
3549#endif /* not RE_ENABLE_I18N */
3550 reg_errcode_t ret;
3551 re_token_t br_token;
3552 bin_tree_t *tree;
3553
3554 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3555#ifdef RE_ENABLE_I18N
3556 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3557#endif /* RE_ENABLE_I18N */
3558
3559#ifdef RE_ENABLE_I18N
3560 if (BE (sbcset == NULL || mbcset == NULL, 0))
3561#else /* not RE_ENABLE_I18N */
3562 if (BE (sbcset == NULL, 0))
3563#endif /* not RE_ENABLE_I18N */
3564 {
3565 *err = REG_ESPACE;
3566 return NULL;
3567 }
3568
3569 if (non_match)
3570 {
3571#ifdef RE_ENABLE_I18N
3572 /*
3573 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3574 bitset_set(cset->sbcset, '\0');
3575 */
3576 mbcset->non_match = 1;
3577#endif /* not RE_ENABLE_I18N */
3578 }
3579
3580 /* We don't care the syntax in this case. */
3581 ret = build_charclass (trans, sbcset,
3582#ifdef RE_ENABLE_I18N
3583 mbcset, &alloc,
3584#endif /* RE_ENABLE_I18N */
3585 class_name, 0);
3586
3587 if (BE (ret != REG_NOERROR, 0))
3588 {
3589 re_free (sbcset);
3590#ifdef RE_ENABLE_I18N
3591 free_charset (mbcset);
3592#endif /* RE_ENABLE_I18N */
3593 *err = ret;
3594 return NULL;
3595 }
3596 /* \w match '_' also. */
3597 for (; *extra; extra++)
3598 bitset_set (sbcset, *extra);
3599
3600 /* If it is non-matching list. */
3601 if (non_match)
3602 bitset_not (sbcset);
3603
3604#ifdef RE_ENABLE_I18N
3605 /* Ensure only single byte characters are set. */
3606 if (dfa->mb_cur_max > 1)
3607 bitset_mask (sbcset, dfa->sb_char);
3608#endif
3609
3610 /* Build a tree for simple bracket. */
3611 br_token.type = SIMPLE_BRACKET;
3612 br_token.opr.sbcset = sbcset;
3613 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3614 if (BE (tree == NULL, 0))
3615 goto build_word_op_espace;
3616
3617#ifdef RE_ENABLE_I18N
3618 if (dfa->mb_cur_max > 1)
3619 {
3620 bin_tree_t *mbc_tree;
3621 /* Build a tree for complex bracket. */
3622 br_token.type = COMPLEX_BRACKET;
3623 br_token.opr.mbcset = mbcset;
3624 dfa->has_mb_node = 1;
3625 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3626 if (BE (mbc_tree == NULL, 0))
3627 goto build_word_op_espace;
3628 /* Then join them by ALT node. */
3629 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3630 if (BE (mbc_tree != NULL, 1))
3631 return tree;
3632 }
3633 else
3634 {
3635 free_charset (mbcset);
3636 return tree;
3637 }
3638#else /* not RE_ENABLE_I18N */
3639 return tree;
3640#endif /* not RE_ENABLE_I18N */
3641
3642 build_word_op_espace:
3643 re_free (sbcset);
3644#ifdef RE_ENABLE_I18N
3645 free_charset (mbcset);
3646#endif /* RE_ENABLE_I18N */
3647 *err = REG_ESPACE;
3648 return NULL;
3649}
3650
3651/* This is intended for the expressions like "a{1,3}".
3652 Fetch a number from `input', and return the number.
3653 Return -1, if the number field is empty like "{,1}".
3654 Return -2, If an error is occured. */
3655
3656static int
3657fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3658{
3659 int num = -1;
3660 unsigned char c;
3661 while (1)
3662 {
3663 fetch_token (token, input, syntax);
3664 c = token->opr.c;
3665 if (BE (token->type == END_OF_RE, 0))
3666 return -2;
3667 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3668 break;
3669 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3670 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3671 num = (num > RE_DUP_MAX) ? -2 : num;
3672 }
3673 return num;
3674}
3675
3676
3677#ifdef RE_ENABLE_I18N
3678static void
3679free_charset (re_charset_t *cset)
3680{
3681 re_free (cset->mbchars);
3682# ifdef _LIBC
3683 re_free (cset->coll_syms);
3684 re_free (cset->equiv_classes);
3685 re_free (cset->range_starts);
3686 re_free (cset->range_ends);
3687# endif
3688 re_free (cset->char_classes);
3689 re_free (cset);
3690}
3691#endif /* RE_ENABLE_I18N */
3692
3693
3694/* Functions for binary tree operation. */
3695
3696/* Create a tree node. */
3697
3698static bin_tree_t *
3699create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3700 re_token_type_t type)
3701{
3702 re_token_t t;
3703 t.type = type;
3704 return create_token_tree (dfa, left, right, &t);
3705}
3706
3707static bin_tree_t *
3708create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3709 const re_token_t *token)
3710{
3711 bin_tree_t *tree;
3712 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3713 {
3714 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3715
3716 if (storage == NULL)
3717 return NULL;
3718 storage->next = dfa->str_tree_storage;
3719 dfa->str_tree_storage = storage;
3720 dfa->str_tree_storage_idx = 0;
3721 }
3722 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3723
3724 tree->parent = NULL;
3725 tree->left = left;
3726 tree->right = right;
3727 tree->token = *token;
3728 tree->token.duplicated = 0;
3729 tree->token.opt_subexp = 0;
3730 tree->first = NULL;
3731 tree->next = NULL;
3732 tree->node_idx = -1;
3733
3734 if (left != NULL)
3735 left->parent = tree;
3736 if (right != NULL)
3737 right->parent = tree;
3738 return tree;
3739}
3740
3741/* Mark the tree SRC as an optional subexpression.
3742 To be called from preorder or postorder. */
3743
3744static reg_errcode_t
3745mark_opt_subexp (void *extra, bin_tree_t *node)
3746{
3747 int idx = (int) (long) extra;
3748 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3749 node->token.opt_subexp = 1;
3750
3751 return REG_NOERROR;
3752}
3753
3754/* Free the allocated memory inside NODE. */
3755
3756static void
3757free_token (re_token_t *node)
3758{
3759#ifdef RE_ENABLE_I18N
3760 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3761 free_charset (node->opr.mbcset);
3762 else
3763#endif /* RE_ENABLE_I18N */
3764 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3765 re_free (node->opr.sbcset);
3766}
3767
3768/* Worker function for tree walking. Free the allocated memory inside NODE
3769 and its children. */
3770
3771static reg_errcode_t
3772free_tree (void *extra, bin_tree_t *node)
3773{
3774 free_token (&node->token);
3775 return REG_NOERROR;
3776}
3777
3778
3779/* Duplicate the node SRC, and return new node. This is a preorder
3780 visit similar to the one implemented by the generic visitor, but
3781 we need more infrastructure to maintain two parallel trees --- so,
3782 it's easier to duplicate. */
3783
3784static bin_tree_t *
3785duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3786{
3787 const bin_tree_t *node;
3788 bin_tree_t *dup_root;
3789 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3790
3791 for (node = root; ; )
3792 {
3793 /* Create a new tree and link it back to the current parent. */
3794 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3795 if (*p_new == NULL)
3796 return NULL;
3797 (*p_new)->parent = dup_node;
3798 (*p_new)->token.duplicated = 1;
3799 dup_node = *p_new;
3800
3801 /* Go to the left node, or up and to the right. */
3802 if (node->left)
3803 {
3804 node = node->left;
3805 p_new = &dup_node->left;
3806 }
3807 else
3808 {
3809 const bin_tree_t *prev = NULL;
3810 while (node->right == prev || node->right == NULL)
3811 {
3812 prev = node;
3813 node = node->parent;
3814 dup_node = dup_node->parent;
3815 if (!node)
3816 return dup_root;
3817 }
3818 node = node->right;
3819 p_new = &dup_node->right;
3820 }
3821 }
3822}
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette