VirtualBox

source: kBuild/trunk/src/grep/lib/regexec.c

Last change on this file was 3529, checked in by bird, 3 years ago

Imported grep 3.7 from grep-3.7.tar.gz (sha256: c22b0cf2d4f6bbe599c902387e8058990e1eee99aef333a203829e5fd3dbb342), applying minimal auto-props.

  • Property svn:eol-style set to native
File size: 126.0 KB
Line 
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2021 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <[email protected]>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
19
20static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
21 Idx n);
22static void match_ctx_clean (re_match_context_t *mctx);
23static void match_ctx_free (re_match_context_t *cache);
24static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
25 Idx str_idx, Idx from, Idx to);
26static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx);
27static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
28 Idx str_idx);
29static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
30 Idx node, Idx str_idx);
31static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
32 re_dfastate_t **limited_sts, Idx last_node,
33 Idx last_str_idx);
34static reg_errcode_t re_search_internal (const regex_t *preg,
35 const char *string, Idx length,
36 Idx start, Idx last_start, Idx stop,
37 size_t nmatch, regmatch_t pmatch[],
38 int eflags);
39static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
40 const char *string1, Idx length1,
41 const char *string2, Idx length2,
42 Idx start, regoff_t range,
43 struct re_registers *regs,
44 Idx stop, bool ret_len);
45static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
46 const char *string, Idx length, Idx start,
47 regoff_t range, Idx stop,
48 struct re_registers *regs,
49 bool ret_len);
50static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
51 Idx nregs, int regs_allocated);
52static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx);
53static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
54 Idx *p_match_first);
55static Idx check_halt_state_context (const re_match_context_t *mctx,
56 const re_dfastate_t *state, Idx idx);
57static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
58 regmatch_t *prev_idx_match, Idx cur_node,
59 Idx cur_idx, Idx nmatch);
60static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
61 Idx str_idx, Idx dest_node, Idx nregs,
62 regmatch_t *regs, regmatch_t *prevregs,
63 re_node_set *eps_via_nodes);
64static reg_errcode_t set_regs (const regex_t *preg,
65 const re_match_context_t *mctx,
66 size_t nmatch, regmatch_t *pmatch,
67 bool fl_backtrack);
68static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs);
69
70#ifdef RE_ENABLE_I18N
71static int sift_states_iter_mb (const re_match_context_t *mctx,
72 re_sift_context_t *sctx,
73 Idx node_idx, Idx str_idx, Idx max_str_idx);
74#endif /* RE_ENABLE_I18N */
75static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
76 re_sift_context_t *sctx);
77static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
78 re_sift_context_t *sctx, Idx str_idx,
79 re_node_set *cur_dest);
80static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
81 re_sift_context_t *sctx,
82 Idx str_idx,
83 re_node_set *dest_nodes);
84static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
85 re_node_set *dest_nodes,
86 const re_node_set *candidates);
87static bool check_dst_limits (const re_match_context_t *mctx,
88 const re_node_set *limits,
89 Idx dst_node, Idx dst_idx, Idx src_node,
90 Idx src_idx);
91static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
92 int boundaries, Idx subexp_idx,
93 Idx from_node, Idx bkref_idx);
94static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
95 Idx limit, Idx subexp_idx,
96 Idx node, Idx str_idx,
97 Idx bkref_idx);
98static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
99 re_node_set *dest_nodes,
100 const re_node_set *candidates,
101 re_node_set *limits,
102 struct re_backref_cache_entry *bkref_ents,
103 Idx str_idx);
104static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
105 re_sift_context_t *sctx,
106 Idx str_idx, const re_node_set *candidates);
107static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
108 re_dfastate_t **dst,
109 re_dfastate_t **src, Idx num);
110static re_dfastate_t *find_recover_state (reg_errcode_t *err,
111 re_match_context_t *mctx);
112static re_dfastate_t *transit_state (reg_errcode_t *err,
113 re_match_context_t *mctx,
114 re_dfastate_t *state);
115static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
116 re_match_context_t *mctx,
117 re_dfastate_t *next_state);
118static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
119 re_node_set *cur_nodes,
120 Idx str_idx);
121#if 0
122static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
123 re_match_context_t *mctx,
124 re_dfastate_t *pstate);
125#endif
126#ifdef RE_ENABLE_I18N
127static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
128 re_dfastate_t *pstate);
129#endif /* RE_ENABLE_I18N */
130static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
131 const re_node_set *nodes);
132static reg_errcode_t get_subexp (re_match_context_t *mctx,
133 Idx bkref_node, Idx bkref_str_idx);
134static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
135 const re_sub_match_top_t *sub_top,
136 re_sub_match_last_t *sub_last,
137 Idx bkref_node, Idx bkref_str);
138static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
139 Idx subexp_idx, int type);
140static reg_errcode_t check_arrival (re_match_context_t *mctx,
141 state_array_t *path, Idx top_node,
142 Idx top_str, Idx last_node, Idx last_str,
143 int type);
144static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
145 Idx str_idx,
146 re_node_set *cur_nodes,
147 re_node_set *next_nodes);
148static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
149 re_node_set *cur_nodes,
150 Idx ex_subexp, int type);
151static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
152 re_node_set *dst_nodes,
153 Idx target, Idx ex_subexp,
154 int type);
155static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
156 re_node_set *cur_nodes, Idx cur_str,
157 Idx subexp_num, int type);
158static bool build_trtable (const re_dfa_t *dfa, re_dfastate_t *state);
159#ifdef RE_ENABLE_I18N
160static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
161 const re_string_t *input, Idx idx);
162# ifdef _LIBC
163static unsigned int find_collation_sequence_value (const unsigned char *mbs,
164 size_t name_len);
165# endif /* _LIBC */
166#endif /* RE_ENABLE_I18N */
167static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
168 const re_dfastate_t *state,
169 re_node_set *states_node,
170 bitset_t *states_ch);
171static bool check_node_accept (const re_match_context_t *mctx,
172 const re_token_t *node, Idx idx);
173static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
174
175
176/* Entry point for POSIX code. */
177
178/* regexec searches for a given pattern, specified by PREG, in the
179 string STRING.
180
181 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
182 'regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
183 least NMATCH elements, and we set them to the offsets of the
184 corresponding matched substrings.
185
186 EFLAGS specifies "execution flags" which affect matching: if
187 REG_NOTBOL is set, then ^ does not match at the beginning of the
188 string; if REG_NOTEOL is set, then $ does not match at the end.
189
190 Return 0 if a match is found, REG_NOMATCH if not, REG_BADPAT if
191 EFLAGS is invalid. */
192
193int
194regexec (const regex_t *__restrict preg, const char *__restrict string,
195 size_t nmatch, regmatch_t pmatch[], int eflags)
196{
197 reg_errcode_t err;
198 Idx start, length;
199 re_dfa_t *dfa = preg->buffer;
200
201 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
202 return REG_BADPAT;
203
204 if (eflags & REG_STARTEND)
205 {
206 start = pmatch[0].rm_so;
207 length = pmatch[0].rm_eo;
208 }
209 else
210 {
211 start = 0;
212 length = strlen (string);
213 }
214
215 lock_lock (dfa->lock);
216 if (preg->no_sub)
217 err = re_search_internal (preg, string, length, start, length,
218 length, 0, NULL, eflags);
219 else
220 err = re_search_internal (preg, string, length, start, length,
221 length, nmatch, pmatch, eflags);
222 lock_unlock (dfa->lock);
223 return err != REG_NOERROR;
224}
225
226#ifdef _LIBC
227libc_hidden_def (__regexec)
228
229# include <shlib-compat.h>
230versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
231
232# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
233__typeof__ (__regexec) __compat_regexec;
234
235int
236attribute_compat_text_section
237__compat_regexec (const regex_t *__restrict preg,
238 const char *__restrict string, size_t nmatch,
239 regmatch_t pmatch[], int eflags)
240{
241 return regexec (preg, string, nmatch, pmatch,
242 eflags & (REG_NOTBOL | REG_NOTEOL));
243}
244compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
245# endif
246#endif
247
248/* Entry points for GNU code. */
249
250/* re_match, re_search, re_match_2, re_search_2
251
252 The former two functions operate on STRING with length LENGTH,
253 while the later two operate on concatenation of STRING1 and STRING2
254 with lengths LENGTH1 and LENGTH2, respectively.
255
256 re_match() matches the compiled pattern in BUFP against the string,
257 starting at index START.
258
259 re_search() first tries matching at index START, then it tries to match
260 starting from index START + 1, and so on. The last start position tried
261 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
262 way as re_match().)
263
264 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
265 the first STOP characters of the concatenation of the strings should be
266 concerned.
267
268 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
269 and all groups is stored in REGS. (For the "_2" variants, the offsets are
270 computed relative to the concatenation, not relative to the individual
271 strings.)
272
273 On success, re_match* functions return the length of the match, re_search*
274 return the position of the start of the match. They return -1 on
275 match failure, -2 on error. */
276
277regoff_t
278re_match (struct re_pattern_buffer *bufp, const char *string, Idx length,
279 Idx start, struct re_registers *regs)
280{
281 return re_search_stub (bufp, string, length, start, 0, length, regs, true);
282}
283#ifdef _LIBC
284weak_alias (__re_match, re_match)
285#endif
286
287regoff_t
288re_search (struct re_pattern_buffer *bufp, const char *string, Idx length,
289 Idx start, regoff_t range, struct re_registers *regs)
290{
291 return re_search_stub (bufp, string, length, start, range, length, regs,
292 false);
293}
294#ifdef _LIBC
295weak_alias (__re_search, re_search)
296#endif
297
298regoff_t
299re_match_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
300 const char *string2, Idx length2, Idx start,
301 struct re_registers *regs, Idx stop)
302{
303 return re_search_2_stub (bufp, string1, length1, string2, length2,
304 start, 0, regs, stop, true);
305}
306#ifdef _LIBC
307weak_alias (__re_match_2, re_match_2)
308#endif
309
310regoff_t
311re_search_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
312 const char *string2, Idx length2, Idx start, regoff_t range,
313 struct re_registers *regs, Idx stop)
314{
315 return re_search_2_stub (bufp, string1, length1, string2, length2,
316 start, range, regs, stop, false);
317}
318#ifdef _LIBC
319weak_alias (__re_search_2, re_search_2)
320#endif
321
322static regoff_t
323re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1,
324 Idx length1, const char *string2, Idx length2, Idx start,
325 regoff_t range, struct re_registers *regs,
326 Idx stop, bool ret_len)
327{
328 const char *str;
329 regoff_t rval;
330 Idx len;
331 char *s = NULL;
332
333 if (__glibc_unlikely ((length1 < 0 || length2 < 0 || stop < 0
334 || INT_ADD_WRAPV (length1, length2, &len))))
335 return -2;
336
337 /* Concatenate the strings. */
338 if (length2 > 0)
339 if (length1 > 0)
340 {
341 s = re_malloc (char, len);
342
343 if (__glibc_unlikely (s == NULL))
344 return -2;
345#ifdef _LIBC
346 memcpy (__mempcpy (s, string1, length1), string2, length2);
347#else
348 memcpy (s, string1, length1);
349 memcpy (s + length1, string2, length2);
350#endif
351 str = s;
352 }
353 else
354 str = string2;
355 else
356 str = string1;
357
358 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
359 ret_len);
360 re_free (s);
361 return rval;
362}
363
364/* The parameters have the same meaning as those of re_search.
365 Additional parameters:
366 If RET_LEN is true the length of the match is returned (re_match style);
367 otherwise the position of the match is returned. */
368
369static regoff_t
370re_search_stub (struct re_pattern_buffer *bufp, const char *string, Idx length,
371 Idx start, regoff_t range, Idx stop, struct re_registers *regs,
372 bool ret_len)
373{
374 reg_errcode_t result;
375 regmatch_t *pmatch;
376 Idx nregs;
377 regoff_t rval;
378 int eflags = 0;
379 re_dfa_t *dfa = bufp->buffer;
380 Idx last_start = start + range;
381
382 /* Check for out-of-range. */
383 if (__glibc_unlikely (start < 0 || start > length))
384 return -1;
385 if (__glibc_unlikely (length < last_start
386 || (0 <= range && last_start < start)))
387 last_start = length;
388 else if (__glibc_unlikely (last_start < 0
389 || (range < 0 && start <= last_start)))
390 last_start = 0;
391
392 lock_lock (dfa->lock);
393
394 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
395 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
396
397 /* Compile fastmap if we haven't yet. */
398 if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
399 re_compile_fastmap (bufp);
400
401 if (__glibc_unlikely (bufp->no_sub))
402 regs = NULL;
403
404 /* We need at least 1 register. */
405 if (regs == NULL)
406 nregs = 1;
407 else if (__glibc_unlikely (bufp->regs_allocated == REGS_FIXED
408 && regs->num_regs <= bufp->re_nsub))
409 {
410 nregs = regs->num_regs;
411 if (__glibc_unlikely (nregs < 1))
412 {
413 /* Nothing can be copied to regs. */
414 regs = NULL;
415 nregs = 1;
416 }
417 }
418 else
419 nregs = bufp->re_nsub + 1;
420 pmatch = re_malloc (regmatch_t, nregs);
421 if (__glibc_unlikely (pmatch == NULL))
422 {
423 rval = -2;
424 goto out;
425 }
426
427 result = re_search_internal (bufp, string, length, start, last_start, stop,
428 nregs, pmatch, eflags);
429
430 rval = 0;
431
432 /* I hope we needn't fill their regs with -1's when no match was found. */
433 if (result != REG_NOERROR)
434 rval = result == REG_NOMATCH ? -1 : -2;
435 else if (regs != NULL)
436 {
437 /* If caller wants register contents data back, copy them. */
438 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
439 bufp->regs_allocated);
440 if (__glibc_unlikely (bufp->regs_allocated == REGS_UNALLOCATED))
441 rval = -2;
442 }
443
444 if (__glibc_likely (rval == 0))
445 {
446 if (ret_len)
447 {
448 DEBUG_ASSERT (pmatch[0].rm_so == start);
449 rval = pmatch[0].rm_eo - start;
450 }
451 else
452 rval = pmatch[0].rm_so;
453 }
454 re_free (pmatch);
455 out:
456 lock_unlock (dfa->lock);
457 return rval;
458}
459
460static unsigned
461re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
462 int regs_allocated)
463{
464 int rval = REGS_REALLOCATE;
465 Idx i;
466 Idx need_regs = nregs + 1;
467 /* We need one extra element beyond 'num_regs' for the '-1' marker GNU code
468 uses. */
469
470 /* Have the register data arrays been allocated? */
471 if (regs_allocated == REGS_UNALLOCATED)
472 { /* No. So allocate them with malloc. */
473 regs->start = re_malloc (regoff_t, need_regs);
474 if (__glibc_unlikely (regs->start == NULL))
475 return REGS_UNALLOCATED;
476 regs->end = re_malloc (regoff_t, need_regs);
477 if (__glibc_unlikely (regs->end == NULL))
478 {
479 re_free (regs->start);
480 return REGS_UNALLOCATED;
481 }
482 regs->num_regs = need_regs;
483 }
484 else if (regs_allocated == REGS_REALLOCATE)
485 { /* Yes. If we need more elements than were already
486 allocated, reallocate them. If we need fewer, just
487 leave it alone. */
488 if (__glibc_unlikely (need_regs > regs->num_regs))
489 {
490 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
491 regoff_t *new_end;
492 if (__glibc_unlikely (new_start == NULL))
493 return REGS_UNALLOCATED;
494 new_end = re_realloc (regs->end, regoff_t, need_regs);
495 if (__glibc_unlikely (new_end == NULL))
496 {
497 re_free (new_start);
498 return REGS_UNALLOCATED;
499 }
500 regs->start = new_start;
501 regs->end = new_end;
502 regs->num_regs = need_regs;
503 }
504 }
505 else
506 {
507 DEBUG_ASSERT (regs_allocated == REGS_FIXED);
508 /* This function may not be called with REGS_FIXED and nregs too big. */
509 DEBUG_ASSERT (nregs <= regs->num_regs);
510 rval = REGS_FIXED;
511 }
512
513 /* Copy the regs. */
514 for (i = 0; i < nregs; ++i)
515 {
516 regs->start[i] = pmatch[i].rm_so;
517 regs->end[i] = pmatch[i].rm_eo;
518 }
519 for ( ; i < regs->num_regs; ++i)
520 regs->start[i] = regs->end[i] = -1;
521
522 return rval;
523}
524
525/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
526 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
527 this memory for recording register information. STARTS and ENDS
528 must be allocated using the malloc library routine, and must each
529 be at least NUM_REGS * sizeof (regoff_t) bytes long.
530
531 If NUM_REGS == 0, then subsequent matches should allocate their own
532 register data.
533
534 Unless this function is called, the first search or match using
535 PATTERN_BUFFER will allocate its own register data, without
536 freeing the old data. */
537
538void
539re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
540 __re_size_t num_regs, regoff_t *starts, regoff_t *ends)
541{
542 if (num_regs)
543 {
544 bufp->regs_allocated = REGS_REALLOCATE;
545 regs->num_regs = num_regs;
546 regs->start = starts;
547 regs->end = ends;
548 }
549 else
550 {
551 bufp->regs_allocated = REGS_UNALLOCATED;
552 regs->num_regs = 0;
553 regs->start = regs->end = NULL;
554 }
555}
556#ifdef _LIBC
557weak_alias (__re_set_registers, re_set_registers)
558#endif
559
560
561/* Entry points compatible with 4.2 BSD regex library. We don't define
562 them unless specifically requested. */
563
564#if defined _REGEX_RE_COMP || defined _LIBC
565int
566# ifdef _LIBC
567weak_function
568# endif
569re_exec (const char *s)
570{
571 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
572}
573#endif /* _REGEX_RE_COMP */
574
575
576/* Internal entry point. */
577
578/* Searches for a compiled pattern PREG in the string STRING, whose
579 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
580 meaning as with regexec. LAST_START is START + RANGE, where
581 START and RANGE have the same meaning as with re_search.
582 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
583 otherwise return the error code.
584 Note: We assume front end functions already check ranges.
585 (0 <= LAST_START && LAST_START <= LENGTH) */
586
587static reg_errcode_t
588__attribute_warn_unused_result__
589re_search_internal (const regex_t *preg, const char *string, Idx length,
590 Idx start, Idx last_start, Idx stop, size_t nmatch,
591 regmatch_t pmatch[], int eflags)
592{
593 reg_errcode_t err;
594 const re_dfa_t *dfa = preg->buffer;
595 Idx left_lim, right_lim;
596 int incr;
597 bool fl_longest_match;
598 int match_kind;
599 Idx match_first;
600 Idx match_last = -1;
601 Idx extra_nmatch;
602 bool sb;
603 int ch;
604 re_match_context_t mctx = { .dfa = dfa };
605 char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
606 && start != last_start && !preg->can_be_null)
607 ? preg->fastmap : NULL);
608 RE_TRANSLATE_TYPE t = preg->translate;
609
610 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
611 nmatch -= extra_nmatch;
612
613 /* Check if the DFA haven't been compiled. */
614 if (__glibc_unlikely (preg->used == 0 || dfa->init_state == NULL
615 || dfa->init_state_word == NULL
616 || dfa->init_state_nl == NULL
617 || dfa->init_state_begbuf == NULL))
618 return REG_NOMATCH;
619
620 /* We assume front-end functions already check them. */
621 DEBUG_ASSERT (0 <= last_start && last_start <= length);
622
623 /* If initial states with non-begbuf contexts have no elements,
624 the regex must be anchored. If preg->newline_anchor is set,
625 we'll never use init_state_nl, so do not check it. */
626 if (dfa->init_state->nodes.nelem == 0
627 && dfa->init_state_word->nodes.nelem == 0
628 && (dfa->init_state_nl->nodes.nelem == 0
629 || !preg->newline_anchor))
630 {
631 if (start != 0 && last_start != 0)
632 return REG_NOMATCH;
633 start = last_start = 0;
634 }
635
636 /* We must check the longest matching, if nmatch > 0. */
637 fl_longest_match = (nmatch != 0 || dfa->nbackref);
638
639 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
640 preg->translate, (preg->syntax & RE_ICASE) != 0,
641 dfa);
642 if (__glibc_unlikely (err != REG_NOERROR))
643 goto free_return;
644 mctx.input.stop = stop;
645 mctx.input.raw_stop = stop;
646 mctx.input.newline_anchor = preg->newline_anchor;
647
648 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
649 if (__glibc_unlikely (err != REG_NOERROR))
650 goto free_return;
651
652 /* We will log all the DFA states through which the dfa pass,
653 if nmatch > 1, or this dfa has "multibyte node", which is a
654 back-reference or a node which can accept multibyte character or
655 multi character collating element. */
656 if (nmatch > 1 || dfa->has_mb_node)
657 {
658 /* Avoid overflow. */
659 if (__glibc_unlikely ((MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
660 <= mctx.input.bufs_len)))
661 {
662 err = REG_ESPACE;
663 goto free_return;
664 }
665
666 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
667 if (__glibc_unlikely (mctx.state_log == NULL))
668 {
669 err = REG_ESPACE;
670 goto free_return;
671 }
672 }
673
674 match_first = start;
675 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
676 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
677
678 /* Check incrementally whether the input string matches. */
679 incr = (last_start < start) ? -1 : 1;
680 left_lim = (last_start < start) ? last_start : start;
681 right_lim = (last_start < start) ? start : last_start;
682 sb = dfa->mb_cur_max == 1;
683 match_kind =
684 (fastmap
685 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
686 | (start <= last_start ? 2 : 0)
687 | (t != NULL ? 1 : 0))
688 : 8);
689
690 for (;; match_first += incr)
691 {
692 err = REG_NOMATCH;
693 if (match_first < left_lim || right_lim < match_first)
694 goto free_return;
695
696 /* Advance as rapidly as possible through the string, until we
697 find a plausible place to start matching. This may be done
698 with varying efficiency, so there are various possibilities:
699 only the most common of them are specialized, in order to
700 save on code size. We use a switch statement for speed. */
701 switch (match_kind)
702 {
703 case 8:
704 /* No fastmap. */
705 break;
706
707 case 7:
708 /* Fastmap with single-byte translation, match forward. */
709 while (__glibc_likely (match_first < right_lim)
710 && !fastmap[t[(unsigned char) string[match_first]]])
711 ++match_first;
712 goto forward_match_found_start_or_reached_end;
713
714 case 6:
715 /* Fastmap without translation, match forward. */
716 while (__glibc_likely (match_first < right_lim)
717 && !fastmap[(unsigned char) string[match_first]])
718 ++match_first;
719
720 forward_match_found_start_or_reached_end:
721 if (__glibc_unlikely (match_first == right_lim))
722 {
723 ch = match_first >= length
724 ? 0 : (unsigned char) string[match_first];
725 if (!fastmap[t ? t[ch] : ch])
726 goto free_return;
727 }
728 break;
729
730 case 4:
731 case 5:
732 /* Fastmap without multi-byte translation, match backwards. */
733 while (match_first >= left_lim)
734 {
735 ch = match_first >= length
736 ? 0 : (unsigned char) string[match_first];
737 if (fastmap[t ? t[ch] : ch])
738 break;
739 --match_first;
740 }
741 if (match_first < left_lim)
742 goto free_return;
743 break;
744
745 default:
746 /* In this case, we can't determine easily the current byte,
747 since it might be a component byte of a multibyte
748 character. Then we use the constructed buffer instead. */
749 for (;;)
750 {
751 /* If MATCH_FIRST is out of the valid range, reconstruct the
752 buffers. */
753 __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
754 if (__glibc_unlikely (offset
755 >= (__re_size_t) mctx.input.valid_raw_len))
756 {
757 err = re_string_reconstruct (&mctx.input, match_first,
758 eflags);
759 if (__glibc_unlikely (err != REG_NOERROR))
760 goto free_return;
761
762 offset = match_first - mctx.input.raw_mbs_idx;
763 }
764 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
765 Note that MATCH_FIRST must not be smaller than 0. */
766 ch = (match_first >= length
767 ? 0 : re_string_byte_at (&mctx.input, offset));
768 if (fastmap[ch])
769 break;
770 match_first += incr;
771 if (match_first < left_lim || match_first > right_lim)
772 {
773 err = REG_NOMATCH;
774 goto free_return;
775 }
776 }
777 break;
778 }
779
780 /* Reconstruct the buffers so that the matcher can assume that
781 the matching starts from the beginning of the buffer. */
782 err = re_string_reconstruct (&mctx.input, match_first, eflags);
783 if (__glibc_unlikely (err != REG_NOERROR))
784 goto free_return;
785
786#ifdef RE_ENABLE_I18N
787 /* Don't consider this char as a possible match start if it part,
788 yet isn't the head, of a multibyte character. */
789 if (!sb && !re_string_first_byte (&mctx.input, 0))
790 continue;
791#endif
792
793 /* It seems to be appropriate one, then use the matcher. */
794 /* We assume that the matching starts from 0. */
795 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
796 match_last = check_matching (&mctx, fl_longest_match,
797 start <= last_start ? &match_first : NULL);
798 if (match_last != -1)
799 {
800 if (__glibc_unlikely (match_last == -2))
801 {
802 err = REG_ESPACE;
803 goto free_return;
804 }
805 else
806 {
807 mctx.match_last = match_last;
808 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
809 {
810 re_dfastate_t *pstate = mctx.state_log[match_last];
811 mctx.last_node = check_halt_state_context (&mctx, pstate,
812 match_last);
813 }
814 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
815 || dfa->nbackref)
816 {
817 err = prune_impossible_nodes (&mctx);
818 if (err == REG_NOERROR)
819 break;
820 if (__glibc_unlikely (err != REG_NOMATCH))
821 goto free_return;
822 match_last = -1;
823 }
824 else
825 break; /* We found a match. */
826 }
827 }
828
829 match_ctx_clean (&mctx);
830 }
831
832 DEBUG_ASSERT (match_last != -1);
833 DEBUG_ASSERT (err == REG_NOERROR);
834
835 /* Set pmatch[] if we need. */
836 if (nmatch > 0)
837 {
838 Idx reg_idx;
839
840 /* Initialize registers. */
841 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
842 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
843
844 /* Set the points where matching start/end. */
845 pmatch[0].rm_so = 0;
846 pmatch[0].rm_eo = mctx.match_last;
847 /* FIXME: This function should fail if mctx.match_last exceeds
848 the maximum possible regoff_t value. We need a new error
849 code REG_OVERFLOW. */
850
851 if (!preg->no_sub && nmatch > 1)
852 {
853 err = set_regs (preg, &mctx, nmatch, pmatch,
854 dfa->has_plural_match && dfa->nbackref > 0);
855 if (__glibc_unlikely (err != REG_NOERROR))
856 goto free_return;
857 }
858
859 /* At last, add the offset to each register, since we slid
860 the buffers so that we could assume that the matching starts
861 from 0. */
862 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
863 if (pmatch[reg_idx].rm_so != -1)
864 {
865#ifdef RE_ENABLE_I18N
866 if (__glibc_unlikely (mctx.input.offsets_needed != 0))
867 {
868 pmatch[reg_idx].rm_so =
869 (pmatch[reg_idx].rm_so == mctx.input.valid_len
870 ? mctx.input.valid_raw_len
871 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
872 pmatch[reg_idx].rm_eo =
873 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
874 ? mctx.input.valid_raw_len
875 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
876 }
877#else
878 DEBUG_ASSERT (mctx.input.offsets_needed == 0);
879#endif
880 pmatch[reg_idx].rm_so += match_first;
881 pmatch[reg_idx].rm_eo += match_first;
882 }
883 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
884 {
885 pmatch[nmatch + reg_idx].rm_so = -1;
886 pmatch[nmatch + reg_idx].rm_eo = -1;
887 }
888
889 if (dfa->subexp_map)
890 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
891 if (dfa->subexp_map[reg_idx] != reg_idx)
892 {
893 pmatch[reg_idx + 1].rm_so
894 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
895 pmatch[reg_idx + 1].rm_eo
896 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
897 }
898 }
899
900 free_return:
901 re_free (mctx.state_log);
902 if (dfa->nbackref)
903 match_ctx_free (&mctx);
904 re_string_destruct (&mctx.input);
905 return err;
906}
907
908static reg_errcode_t
909__attribute_warn_unused_result__
910prune_impossible_nodes (re_match_context_t *mctx)
911{
912 const re_dfa_t *const dfa = mctx->dfa;
913 Idx halt_node, match_last;
914 reg_errcode_t ret;
915 re_dfastate_t **sifted_states;
916 re_dfastate_t **lim_states = NULL;
917 re_sift_context_t sctx;
918 DEBUG_ASSERT (mctx->state_log != NULL);
919 match_last = mctx->match_last;
920 halt_node = mctx->last_node;
921
922 /* Avoid overflow. */
923 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
924 <= match_last))
925 return REG_ESPACE;
926
927 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
928 if (__glibc_unlikely (sifted_states == NULL))
929 {
930 ret = REG_ESPACE;
931 goto free_return;
932 }
933 if (dfa->nbackref)
934 {
935 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
936 if (__glibc_unlikely (lim_states == NULL))
937 {
938 ret = REG_ESPACE;
939 goto free_return;
940 }
941 while (1)
942 {
943 memset (lim_states, '\0',
944 sizeof (re_dfastate_t *) * (match_last + 1));
945 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
946 match_last);
947 ret = sift_states_backward (mctx, &sctx);
948 re_node_set_free (&sctx.limits);
949 if (__glibc_unlikely (ret != REG_NOERROR))
950 goto free_return;
951 if (sifted_states[0] != NULL || lim_states[0] != NULL)
952 break;
953 do
954 {
955 --match_last;
956 if (match_last < 0)
957 {
958 ret = REG_NOMATCH;
959 goto free_return;
960 }
961 } while (mctx->state_log[match_last] == NULL
962 || !mctx->state_log[match_last]->halt);
963 halt_node = check_halt_state_context (mctx,
964 mctx->state_log[match_last],
965 match_last);
966 }
967 ret = merge_state_array (dfa, sifted_states, lim_states,
968 match_last + 1);
969 re_free (lim_states);
970 lim_states = NULL;
971 if (__glibc_unlikely (ret != REG_NOERROR))
972 goto free_return;
973 }
974 else
975 {
976 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
977 ret = sift_states_backward (mctx, &sctx);
978 re_node_set_free (&sctx.limits);
979 if (__glibc_unlikely (ret != REG_NOERROR))
980 goto free_return;
981 if (sifted_states[0] == NULL)
982 {
983 ret = REG_NOMATCH;
984 goto free_return;
985 }
986 }
987 re_free (mctx->state_log);
988 mctx->state_log = sifted_states;
989 sifted_states = NULL;
990 mctx->last_node = halt_node;
991 mctx->match_last = match_last;
992 ret = REG_NOERROR;
993 free_return:
994 re_free (sifted_states);
995 re_free (lim_states);
996 return ret;
997}
998
999/* Acquire an initial state and return it.
1000 We must select appropriate initial state depending on the context,
1001 since initial states may have constraints like "\<", "^", etc.. */
1002
1003static inline re_dfastate_t *
1004__attribute__ ((always_inline))
1005acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1006 Idx idx)
1007{
1008 const re_dfa_t *const dfa = mctx->dfa;
1009 if (dfa->init_state->has_constraint)
1010 {
1011 unsigned int context;
1012 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1013 if (IS_WORD_CONTEXT (context))
1014 return dfa->init_state_word;
1015 else if (IS_ORDINARY_CONTEXT (context))
1016 return dfa->init_state;
1017 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1018 return dfa->init_state_begbuf;
1019 else if (IS_NEWLINE_CONTEXT (context))
1020 return dfa->init_state_nl;
1021 else if (IS_BEGBUF_CONTEXT (context))
1022 {
1023 /* It is relatively rare case, then calculate on demand. */
1024 return re_acquire_state_context (err, dfa,
1025 dfa->init_state->entrance_nodes,
1026 context);
1027 }
1028 else
1029 /* Must not happen? */
1030 return dfa->init_state;
1031 }
1032 else
1033 return dfa->init_state;
1034}
1035
1036/* Check whether the regular expression match input string INPUT or not,
1037 and return the index where the matching end. Return -1 if
1038 there is no match, and return -2 in case of an error.
1039 FL_LONGEST_MATCH means we want the POSIX longest matching.
1040 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1041 next place where we may want to try matching.
1042 Note that the matcher assumes that the matching starts from the current
1043 index of the buffer. */
1044
1045static Idx
1046__attribute_warn_unused_result__
1047check_matching (re_match_context_t *mctx, bool fl_longest_match,
1048 Idx *p_match_first)
1049{
1050 const re_dfa_t *const dfa = mctx->dfa;
1051 reg_errcode_t err;
1052 Idx match = 0;
1053 Idx match_last = -1;
1054 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1055 re_dfastate_t *cur_state;
1056 bool at_init_state = p_match_first != NULL;
1057 Idx next_start_idx = cur_str_idx;
1058
1059 err = REG_NOERROR;
1060 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1061 /* An initial state must not be NULL (invalid). */
1062 if (__glibc_unlikely (cur_state == NULL))
1063 {
1064 DEBUG_ASSERT (err == REG_ESPACE);
1065 return -2;
1066 }
1067
1068 if (mctx->state_log != NULL)
1069 {
1070 mctx->state_log[cur_str_idx] = cur_state;
1071
1072 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1073 later. E.g. Processing back references. */
1074 if (__glibc_unlikely (dfa->nbackref))
1075 {
1076 at_init_state = false;
1077 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1078 if (__glibc_unlikely (err != REG_NOERROR))
1079 return err;
1080
1081 if (cur_state->has_backref)
1082 {
1083 err = transit_state_bkref (mctx, &cur_state->nodes);
1084 if (__glibc_unlikely (err != REG_NOERROR))
1085 return err;
1086 }
1087 }
1088 }
1089
1090 /* If the RE accepts NULL string. */
1091 if (__glibc_unlikely (cur_state->halt))
1092 {
1093 if (!cur_state->has_constraint
1094 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1095 {
1096 if (!fl_longest_match)
1097 return cur_str_idx;
1098 else
1099 {
1100 match_last = cur_str_idx;
1101 match = 1;
1102 }
1103 }
1104 }
1105
1106 while (!re_string_eoi (&mctx->input))
1107 {
1108 re_dfastate_t *old_state = cur_state;
1109 Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1110
1111 if ((__glibc_unlikely (next_char_idx >= mctx->input.bufs_len)
1112 && mctx->input.bufs_len < mctx->input.len)
1113 || (__glibc_unlikely (next_char_idx >= mctx->input.valid_len)
1114 && mctx->input.valid_len < mctx->input.len))
1115 {
1116 err = extend_buffers (mctx, next_char_idx + 1);
1117 if (__glibc_unlikely (err != REG_NOERROR))
1118 {
1119 DEBUG_ASSERT (err == REG_ESPACE);
1120 return -2;
1121 }
1122 }
1123
1124 cur_state = transit_state (&err, mctx, cur_state);
1125 if (mctx->state_log != NULL)
1126 cur_state = merge_state_with_log (&err, mctx, cur_state);
1127
1128 if (cur_state == NULL)
1129 {
1130 /* Reached the invalid state or an error. Try to recover a valid
1131 state using the state log, if available and if we have not
1132 already found a valid (even if not the longest) match. */
1133 if (__glibc_unlikely (err != REG_NOERROR))
1134 return -2;
1135
1136 if (mctx->state_log == NULL
1137 || (match && !fl_longest_match)
1138 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1139 break;
1140 }
1141
1142 if (__glibc_unlikely (at_init_state))
1143 {
1144 if (old_state == cur_state)
1145 next_start_idx = next_char_idx;
1146 else
1147 at_init_state = false;
1148 }
1149
1150 if (cur_state->halt)
1151 {
1152 /* Reached a halt state.
1153 Check the halt state can satisfy the current context. */
1154 if (!cur_state->has_constraint
1155 || check_halt_state_context (mctx, cur_state,
1156 re_string_cur_idx (&mctx->input)))
1157 {
1158 /* We found an appropriate halt state. */
1159 match_last = re_string_cur_idx (&mctx->input);
1160 match = 1;
1161
1162 /* We found a match, do not modify match_first below. */
1163 p_match_first = NULL;
1164 if (!fl_longest_match)
1165 break;
1166 }
1167 }
1168 }
1169
1170 if (p_match_first)
1171 *p_match_first += next_start_idx;
1172
1173 return match_last;
1174}
1175
1176/* Check NODE match the current context. */
1177
1178static bool
1179check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1180{
1181 re_token_type_t type = dfa->nodes[node].type;
1182 unsigned int constraint = dfa->nodes[node].constraint;
1183 if (type != END_OF_RE)
1184 return false;
1185 if (!constraint)
1186 return true;
1187 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1188 return false;
1189 return true;
1190}
1191
1192/* Check the halt state STATE match the current context.
1193 Return 0 if not match, if the node, STATE has, is a halt node and
1194 match the context, return the node. */
1195
1196static Idx
1197check_halt_state_context (const re_match_context_t *mctx,
1198 const re_dfastate_t *state, Idx idx)
1199{
1200 Idx i;
1201 unsigned int context;
1202 DEBUG_ASSERT (state->halt);
1203 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1204 for (i = 0; i < state->nodes.nelem; ++i)
1205 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1206 return state->nodes.elems[i];
1207 return 0;
1208}
1209
1210/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1211 corresponding to the DFA).
1212 Return the destination node, and update EPS_VIA_NODES;
1213 return -1 on match failure, -2 on error. */
1214
1215static Idx
1216proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
1217 regmatch_t *prevregs,
1218 Idx *pidx, Idx node, re_node_set *eps_via_nodes,
1219 struct re_fail_stack_t *fs)
1220{
1221 const re_dfa_t *const dfa = mctx->dfa;
1222 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1223 {
1224 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1225 re_node_set *edests = &dfa->edests[node];
1226
1227 if (! re_node_set_contains (eps_via_nodes, node))
1228 {
1229 bool ok = re_node_set_insert (eps_via_nodes, node);
1230 if (__glibc_unlikely (! ok))
1231 return -2;
1232 }
1233
1234 /* Pick a valid destination, or return -1 if none is found. */
1235 Idx dest_node = -1;
1236 for (Idx i = 0; i < edests->nelem; i++)
1237 {
1238 Idx candidate = edests->elems[i];
1239 if (!re_node_set_contains (cur_nodes, candidate))
1240 continue;
1241 if (dest_node == -1)
1242 dest_node = candidate;
1243
1244 else
1245 {
1246 /* In order to avoid infinite loop like "(a*)*", return the second
1247 epsilon-transition if the first was already considered. */
1248 if (re_node_set_contains (eps_via_nodes, dest_node))
1249 return candidate;
1250
1251 /* Otherwise, push the second epsilon-transition on the fail stack. */
1252 else if (fs != NULL
1253 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1254 prevregs, eps_via_nodes))
1255 return -2;
1256
1257 /* We know we are going to exit. */
1258 break;
1259 }
1260 }
1261 return dest_node;
1262 }
1263 else
1264 {
1265 Idx naccepted = 0;
1266 re_token_type_t type = dfa->nodes[node].type;
1267
1268#ifdef RE_ENABLE_I18N
1269 if (dfa->nodes[node].accept_mb)
1270 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1271 else
1272#endif /* RE_ENABLE_I18N */
1273 if (type == OP_BACK_REF)
1274 {
1275 Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1276 if (subexp_idx < nregs)
1277 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1278 if (fs != NULL)
1279 {
1280 if (subexp_idx >= nregs
1281 || regs[subexp_idx].rm_so == -1
1282 || regs[subexp_idx].rm_eo == -1)
1283 return -1;
1284 else if (naccepted)
1285 {
1286 char *buf = (char *) re_string_get_buffer (&mctx->input);
1287 if (mctx->input.valid_len - *pidx < naccepted
1288 || (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1289 naccepted)
1290 != 0))
1291 return -1;
1292 }
1293 }
1294
1295 if (naccepted == 0)
1296 {
1297 Idx dest_node;
1298 bool ok = re_node_set_insert (eps_via_nodes, node);
1299 if (__glibc_unlikely (! ok))
1300 return -2;
1301 dest_node = dfa->edests[node].elems[0];
1302 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1303 dest_node))
1304 return dest_node;
1305 }
1306 }
1307
1308 if (naccepted != 0
1309 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1310 {
1311 Idx dest_node = dfa->nexts[node];
1312 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1313 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1314 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1315 dest_node)))
1316 return -1;
1317 re_node_set_empty (eps_via_nodes);
1318 return dest_node;
1319 }
1320 }
1321 return -1;
1322}
1323
1324static reg_errcode_t
1325__attribute_warn_unused_result__
1326push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1327 Idx nregs, regmatch_t *regs, regmatch_t *prevregs,
1328 re_node_set *eps_via_nodes)
1329{
1330 reg_errcode_t err;
1331 Idx num = fs->num++;
1332 if (fs->num == fs->alloc)
1333 {
1334 struct re_fail_stack_ent_t *new_array;
1335 new_array = re_realloc (fs->stack, struct re_fail_stack_ent_t,
1336 fs->alloc * 2);
1337 if (new_array == NULL)
1338 return REG_ESPACE;
1339 fs->alloc *= 2;
1340 fs->stack = new_array;
1341 }
1342 fs->stack[num].idx = str_idx;
1343 fs->stack[num].node = dest_node;
1344 fs->stack[num].regs = re_malloc (regmatch_t, 2 * nregs);
1345 if (fs->stack[num].regs == NULL)
1346 return REG_ESPACE;
1347 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1348 memcpy (fs->stack[num].regs + nregs, prevregs, sizeof (regmatch_t) * nregs);
1349 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1350 return err;
1351}
1352
1353static Idx
1354pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
1355 regmatch_t *regs, regmatch_t *prevregs,
1356 re_node_set *eps_via_nodes)
1357{
1358 if (fs == NULL || fs->num == 0)
1359 return -1;
1360 Idx num = --fs->num;
1361 *pidx = fs->stack[num].idx;
1362 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1363 memcpy (prevregs, fs->stack[num].regs + nregs, sizeof (regmatch_t) * nregs);
1364 re_node_set_free (eps_via_nodes);
1365 re_free (fs->stack[num].regs);
1366 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1367 DEBUG_ASSERT (0 <= fs->stack[num].node);
1368 return fs->stack[num].node;
1369}
1370
1371
1372#define DYNARRAY_STRUCT regmatch_list
1373#define DYNARRAY_ELEMENT regmatch_t
1374#define DYNARRAY_PREFIX regmatch_list_
1375#include <malloc/dynarray-skeleton.c>
1376
1377/* Set the positions where the subexpressions are starts/ends to registers
1378 PMATCH.
1379 Note: We assume that pmatch[0] is already set, and
1380 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1381
1382static reg_errcode_t
1383__attribute_warn_unused_result__
1384set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1385 regmatch_t *pmatch, bool fl_backtrack)
1386{
1387 const re_dfa_t *dfa = preg->buffer;
1388 Idx idx, cur_node;
1389 re_node_set eps_via_nodes;
1390 struct re_fail_stack_t *fs;
1391 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1392 struct regmatch_list prev_match;
1393 regmatch_list_init (&prev_match);
1394
1395 DEBUG_ASSERT (nmatch > 1);
1396 DEBUG_ASSERT (mctx->state_log != NULL);
1397 if (fl_backtrack)
1398 {
1399 fs = &fs_body;
1400 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1401 if (fs->stack == NULL)
1402 return REG_ESPACE;
1403 }
1404 else
1405 fs = NULL;
1406
1407 cur_node = dfa->init_node;
1408 re_node_set_init_empty (&eps_via_nodes);
1409
1410 if (!regmatch_list_resize (&prev_match, nmatch))
1411 {
1412 regmatch_list_free (&prev_match);
1413 free_fail_stack_return (fs);
1414 return REG_ESPACE;
1415 }
1416 regmatch_t *prev_idx_match = regmatch_list_begin (&prev_match);
1417 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1418
1419 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1420 {
1421 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1422
1423 if ((idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1424 || (fs && re_node_set_contains (&eps_via_nodes, cur_node)))
1425 {
1426 Idx reg_idx;
1427 cur_node = -1;
1428 if (fs)
1429 {
1430 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1431 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1432 {
1433 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1434 prev_idx_match, &eps_via_nodes);
1435 break;
1436 }
1437 }
1438 if (cur_node < 0)
1439 {
1440 re_node_set_free (&eps_via_nodes);
1441 regmatch_list_free (&prev_match);
1442 return free_fail_stack_return (fs);
1443 }
1444 }
1445
1446 /* Proceed to next node. */
1447 cur_node = proceed_next_node (mctx, nmatch, pmatch, prev_idx_match,
1448 &idx, cur_node,
1449 &eps_via_nodes, fs);
1450
1451 if (__glibc_unlikely (cur_node < 0))
1452 {
1453 if (__glibc_unlikely (cur_node == -2))
1454 {
1455 re_node_set_free (&eps_via_nodes);
1456 regmatch_list_free (&prev_match);
1457 free_fail_stack_return (fs);
1458 return REG_ESPACE;
1459 }
1460 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1461 prev_idx_match, &eps_via_nodes);
1462 if (cur_node < 0)
1463 {
1464 re_node_set_free (&eps_via_nodes);
1465 regmatch_list_free (&prev_match);
1466 free_fail_stack_return (fs);
1467 return REG_NOMATCH;
1468 }
1469 }
1470 }
1471 re_node_set_free (&eps_via_nodes);
1472 regmatch_list_free (&prev_match);
1473 return free_fail_stack_return (fs);
1474}
1475
1476static reg_errcode_t
1477free_fail_stack_return (struct re_fail_stack_t *fs)
1478{
1479 if (fs)
1480 {
1481 Idx fs_idx;
1482 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1483 {
1484 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1485 re_free (fs->stack[fs_idx].regs);
1486 }
1487 re_free (fs->stack);
1488 }
1489 return REG_NOERROR;
1490}
1491
1492static void
1493update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1494 regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
1495{
1496 int type = dfa->nodes[cur_node].type;
1497 if (type == OP_OPEN_SUBEXP)
1498 {
1499 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1500
1501 /* We are at the first node of this sub expression. */
1502 if (reg_num < nmatch)
1503 {
1504 pmatch[reg_num].rm_so = cur_idx;
1505 pmatch[reg_num].rm_eo = -1;
1506 }
1507 }
1508 else if (type == OP_CLOSE_SUBEXP)
1509 {
1510 /* We are at the last node of this sub expression. */
1511 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1512 if (reg_num < nmatch)
1513 {
1514 if (pmatch[reg_num].rm_so < cur_idx)
1515 {
1516 pmatch[reg_num].rm_eo = cur_idx;
1517 /* This is a non-empty match or we are not inside an optional
1518 subexpression. Accept this right away. */
1519 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1520 }
1521 else
1522 {
1523 if (dfa->nodes[cur_node].opt_subexp
1524 && prev_idx_match[reg_num].rm_so != -1)
1525 /* We transited through an empty match for an optional
1526 subexpression, like (a?)*, and this is not the subexp's
1527 first match. Copy back the old content of the registers
1528 so that matches of an inner subexpression are undone as
1529 well, like in ((a?))*. */
1530 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1531 else
1532 /* We completed a subexpression, but it may be part of
1533 an optional one, so do not update PREV_IDX_MATCH. */
1534 pmatch[reg_num].rm_eo = cur_idx;
1535 }
1536 }
1537 }
1538}
1539
1540/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1541 and sift the nodes in each states according to the following rules.
1542 Updated state_log will be wrote to STATE_LOG.
1543
1544 Rules: We throw away the Node 'a' in the STATE_LOG[STR_IDX] if...
1545 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1546 If 'a' isn't the LAST_NODE and 'a' can't epsilon transit to
1547 the LAST_NODE, we throw away the node 'a'.
1548 2. When 0 <= STR_IDX < MATCH_LAST and 'a' accepts
1549 string 's' and transit to 'b':
1550 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1551 away the node 'a'.
1552 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1553 thrown away, we throw away the node 'a'.
1554 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1555 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1556 node 'a'.
1557 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1558 we throw away the node 'a'. */
1559
1560#define STATE_NODE_CONTAINS(state,node) \
1561 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1562
1563static reg_errcode_t
1564sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1565{
1566 reg_errcode_t err;
1567 int null_cnt = 0;
1568 Idx str_idx = sctx->last_str_idx;
1569 re_node_set cur_dest;
1570
1571 DEBUG_ASSERT (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1572
1573 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1574 transit to the last_node and the last_node itself. */
1575 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1576 if (__glibc_unlikely (err != REG_NOERROR))
1577 return err;
1578 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1579 if (__glibc_unlikely (err != REG_NOERROR))
1580 goto free_return;
1581
1582 /* Then check each states in the state_log. */
1583 while (str_idx > 0)
1584 {
1585 /* Update counters. */
1586 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1587 if (null_cnt > mctx->max_mb_elem_len)
1588 {
1589 memset (sctx->sifted_states, '\0',
1590 sizeof (re_dfastate_t *) * str_idx);
1591 re_node_set_free (&cur_dest);
1592 return REG_NOERROR;
1593 }
1594 re_node_set_empty (&cur_dest);
1595 --str_idx;
1596
1597 if (mctx->state_log[str_idx])
1598 {
1599 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1600 if (__glibc_unlikely (err != REG_NOERROR))
1601 goto free_return;
1602 }
1603
1604 /* Add all the nodes which satisfy the following conditions:
1605 - It can epsilon transit to a node in CUR_DEST.
1606 - It is in CUR_SRC.
1607 And update state_log. */
1608 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1609 if (__glibc_unlikely (err != REG_NOERROR))
1610 goto free_return;
1611 }
1612 err = REG_NOERROR;
1613 free_return:
1614 re_node_set_free (&cur_dest);
1615 return err;
1616}
1617
1618static reg_errcode_t
1619__attribute_warn_unused_result__
1620build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1621 Idx str_idx, re_node_set *cur_dest)
1622{
1623 const re_dfa_t *const dfa = mctx->dfa;
1624 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1625 Idx i;
1626
1627 /* Then build the next sifted state.
1628 We build the next sifted state on 'cur_dest', and update
1629 'sifted_states[str_idx]' with 'cur_dest'.
1630 Note:
1631 'cur_dest' is the sifted state from 'state_log[str_idx + 1]'.
1632 'cur_src' points the node_set of the old 'state_log[str_idx]'
1633 (with the epsilon nodes pre-filtered out). */
1634 for (i = 0; i < cur_src->nelem; i++)
1635 {
1636 Idx prev_node = cur_src->elems[i];
1637 int naccepted = 0;
1638 bool ok;
1639 DEBUG_ASSERT (!IS_EPSILON_NODE (dfa->nodes[prev_node].type));
1640
1641#ifdef RE_ENABLE_I18N
1642 /* If the node may accept "multi byte". */
1643 if (dfa->nodes[prev_node].accept_mb)
1644 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1645 str_idx, sctx->last_str_idx);
1646#endif /* RE_ENABLE_I18N */
1647
1648 /* We don't check backreferences here.
1649 See update_cur_sifted_state(). */
1650 if (!naccepted
1651 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1652 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1653 dfa->nexts[prev_node]))
1654 naccepted = 1;
1655
1656 if (naccepted == 0)
1657 continue;
1658
1659 if (sctx->limits.nelem)
1660 {
1661 Idx to_idx = str_idx + naccepted;
1662 if (check_dst_limits (mctx, &sctx->limits,
1663 dfa->nexts[prev_node], to_idx,
1664 prev_node, str_idx))
1665 continue;
1666 }
1667 ok = re_node_set_insert (cur_dest, prev_node);
1668 if (__glibc_unlikely (! ok))
1669 return REG_ESPACE;
1670 }
1671
1672 return REG_NOERROR;
1673}
1674
1675/* Helper functions. */
1676
1677static reg_errcode_t
1678clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1679{
1680 Idx top = mctx->state_log_top;
1681
1682 if ((next_state_log_idx >= mctx->input.bufs_len
1683 && mctx->input.bufs_len < mctx->input.len)
1684 || (next_state_log_idx >= mctx->input.valid_len
1685 && mctx->input.valid_len < mctx->input.len))
1686 {
1687 reg_errcode_t err;
1688 err = extend_buffers (mctx, next_state_log_idx + 1);
1689 if (__glibc_unlikely (err != REG_NOERROR))
1690 return err;
1691 }
1692
1693 if (top < next_state_log_idx)
1694 {
1695 memset (mctx->state_log + top + 1, '\0',
1696 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1697 mctx->state_log_top = next_state_log_idx;
1698 }
1699 return REG_NOERROR;
1700}
1701
1702static reg_errcode_t
1703merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1704 re_dfastate_t **src, Idx num)
1705{
1706 Idx st_idx;
1707 reg_errcode_t err;
1708 for (st_idx = 0; st_idx < num; ++st_idx)
1709 {
1710 if (dst[st_idx] == NULL)
1711 dst[st_idx] = src[st_idx];
1712 else if (src[st_idx] != NULL)
1713 {
1714 re_node_set merged_set;
1715 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1716 &src[st_idx]->nodes);
1717 if (__glibc_unlikely (err != REG_NOERROR))
1718 return err;
1719 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1720 re_node_set_free (&merged_set);
1721 if (__glibc_unlikely (err != REG_NOERROR))
1722 return err;
1723 }
1724 }
1725 return REG_NOERROR;
1726}
1727
1728static reg_errcode_t
1729update_cur_sifted_state (const re_match_context_t *mctx,
1730 re_sift_context_t *sctx, Idx str_idx,
1731 re_node_set *dest_nodes)
1732{
1733 const re_dfa_t *const dfa = mctx->dfa;
1734 reg_errcode_t err = REG_NOERROR;
1735 const re_node_set *candidates;
1736 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1737 : &mctx->state_log[str_idx]->nodes);
1738
1739 if (dest_nodes->nelem == 0)
1740 sctx->sifted_states[str_idx] = NULL;
1741 else
1742 {
1743 if (candidates)
1744 {
1745 /* At first, add the nodes which can epsilon transit to a node in
1746 DEST_NODE. */
1747 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1748 if (__glibc_unlikely (err != REG_NOERROR))
1749 return err;
1750
1751 /* Then, check the limitations in the current sift_context. */
1752 if (sctx->limits.nelem)
1753 {
1754 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1755 mctx->bkref_ents, str_idx);
1756 if (__glibc_unlikely (err != REG_NOERROR))
1757 return err;
1758 }
1759 }
1760
1761 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1762 if (__glibc_unlikely (err != REG_NOERROR))
1763 return err;
1764 }
1765
1766 if (candidates && mctx->state_log[str_idx]->has_backref)
1767 {
1768 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1769 if (__glibc_unlikely (err != REG_NOERROR))
1770 return err;
1771 }
1772 return REG_NOERROR;
1773}
1774
1775static reg_errcode_t
1776__attribute_warn_unused_result__
1777add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1778 const re_node_set *candidates)
1779{
1780 reg_errcode_t err = REG_NOERROR;
1781 Idx i;
1782
1783 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1784 if (__glibc_unlikely (err != REG_NOERROR))
1785 return err;
1786
1787 if (!state->inveclosure.alloc)
1788 {
1789 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1790 if (__glibc_unlikely (err != REG_NOERROR))
1791 return REG_ESPACE;
1792 for (i = 0; i < dest_nodes->nelem; i++)
1793 {
1794 err = re_node_set_merge (&state->inveclosure,
1795 dfa->inveclosures + dest_nodes->elems[i]);
1796 if (__glibc_unlikely (err != REG_NOERROR))
1797 return REG_ESPACE;
1798 }
1799 }
1800 return re_node_set_add_intersect (dest_nodes, candidates,
1801 &state->inveclosure);
1802}
1803
1804static reg_errcode_t
1805sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1806 const re_node_set *candidates)
1807{
1808 Idx ecl_idx;
1809 reg_errcode_t err;
1810 re_node_set *inv_eclosure = dfa->inveclosures + node;
1811 re_node_set except_nodes;
1812 re_node_set_init_empty (&except_nodes);
1813 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1814 {
1815 Idx cur_node = inv_eclosure->elems[ecl_idx];
1816 if (cur_node == node)
1817 continue;
1818 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1819 {
1820 Idx edst1 = dfa->edests[cur_node].elems[0];
1821 Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1822 ? dfa->edests[cur_node].elems[1] : -1);
1823 if ((!re_node_set_contains (inv_eclosure, edst1)
1824 && re_node_set_contains (dest_nodes, edst1))
1825 || (edst2 > 0
1826 && !re_node_set_contains (inv_eclosure, edst2)
1827 && re_node_set_contains (dest_nodes, edst2)))
1828 {
1829 err = re_node_set_add_intersect (&except_nodes, candidates,
1830 dfa->inveclosures + cur_node);
1831 if (__glibc_unlikely (err != REG_NOERROR))
1832 {
1833 re_node_set_free (&except_nodes);
1834 return err;
1835 }
1836 }
1837 }
1838 }
1839 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1840 {
1841 Idx cur_node = inv_eclosure->elems[ecl_idx];
1842 if (!re_node_set_contains (&except_nodes, cur_node))
1843 {
1844 Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1845 re_node_set_remove_at (dest_nodes, idx);
1846 }
1847 }
1848 re_node_set_free (&except_nodes);
1849 return REG_NOERROR;
1850}
1851
1852static bool
1853check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1854 Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1855{
1856 const re_dfa_t *const dfa = mctx->dfa;
1857 Idx lim_idx, src_pos, dst_pos;
1858
1859 Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1860 Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1861 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1862 {
1863 Idx subexp_idx;
1864 struct re_backref_cache_entry *ent;
1865 ent = mctx->bkref_ents + limits->elems[lim_idx];
1866 subexp_idx = dfa->nodes[ent->node].opr.idx;
1867
1868 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1869 subexp_idx, dst_node, dst_idx,
1870 dst_bkref_idx);
1871 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1872 subexp_idx, src_node, src_idx,
1873 src_bkref_idx);
1874
1875 /* In case of:
1876 <src> <dst> ( <subexp> )
1877 ( <subexp> ) <src> <dst>
1878 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1879 if (src_pos == dst_pos)
1880 continue; /* This is unrelated limitation. */
1881 else
1882 return true;
1883 }
1884 return false;
1885}
1886
1887static int
1888check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1889 Idx subexp_idx, Idx from_node, Idx bkref_idx)
1890{
1891 const re_dfa_t *const dfa = mctx->dfa;
1892 const re_node_set *eclosures = dfa->eclosures + from_node;
1893 Idx node_idx;
1894
1895 /* Else, we are on the boundary: examine the nodes on the epsilon
1896 closure. */
1897 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1898 {
1899 Idx node = eclosures->elems[node_idx];
1900 switch (dfa->nodes[node].type)
1901 {
1902 case OP_BACK_REF:
1903 if (bkref_idx != -1)
1904 {
1905 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1906 do
1907 {
1908 Idx dst;
1909 int cpos;
1910
1911 if (ent->node != node)
1912 continue;
1913
1914 if (subexp_idx < BITSET_WORD_BITS
1915 && !(ent->eps_reachable_subexps_map
1916 & ((bitset_word_t) 1 << subexp_idx)))
1917 continue;
1918
1919 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1920 OP_CLOSE_SUBEXP cases below. But, if the
1921 destination node is the same node as the source
1922 node, don't recurse because it would cause an
1923 infinite loop: a regex that exhibits this behavior
1924 is ()\1*\1* */
1925 dst = dfa->edests[node].elems[0];
1926 if (dst == from_node)
1927 {
1928 if (boundaries & 1)
1929 return -1;
1930 else /* if (boundaries & 2) */
1931 return 0;
1932 }
1933
1934 cpos =
1935 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1936 dst, bkref_idx);
1937 if (cpos == -1 /* && (boundaries & 1) */)
1938 return -1;
1939 if (cpos == 0 && (boundaries & 2))
1940 return 0;
1941
1942 if (subexp_idx < BITSET_WORD_BITS)
1943 ent->eps_reachable_subexps_map
1944 &= ~((bitset_word_t) 1 << subexp_idx);
1945 }
1946 while (ent++->more);
1947 }
1948 break;
1949
1950 case OP_OPEN_SUBEXP:
1951 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1952 return -1;
1953 break;
1954
1955 case OP_CLOSE_SUBEXP:
1956 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1957 return 0;
1958 break;
1959
1960 default:
1961 break;
1962 }
1963 }
1964
1965 return (boundaries & 2) ? 1 : 0;
1966}
1967
1968static int
1969check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit,
1970 Idx subexp_idx, Idx from_node, Idx str_idx,
1971 Idx bkref_idx)
1972{
1973 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1974 int boundaries;
1975
1976 /* If we are outside the range of the subexpression, return -1 or 1. */
1977 if (str_idx < lim->subexp_from)
1978 return -1;
1979
1980 if (lim->subexp_to < str_idx)
1981 return 1;
1982
1983 /* If we are within the subexpression, return 0. */
1984 boundaries = (str_idx == lim->subexp_from);
1985 boundaries |= (str_idx == lim->subexp_to) << 1;
1986 if (boundaries == 0)
1987 return 0;
1988
1989 /* Else, examine epsilon closure. */
1990 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1991 from_node, bkref_idx);
1992}
1993
1994/* Check the limitations of sub expressions LIMITS, and remove the nodes
1995 which are against limitations from DEST_NODES. */
1996
1997static reg_errcode_t
1998check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
1999 const re_node_set *candidates, re_node_set *limits,
2000 struct re_backref_cache_entry *bkref_ents, Idx str_idx)
2001{
2002 reg_errcode_t err;
2003 Idx node_idx, lim_idx;
2004
2005 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2006 {
2007 Idx subexp_idx;
2008 struct re_backref_cache_entry *ent;
2009 ent = bkref_ents + limits->elems[lim_idx];
2010
2011 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2012 continue; /* This is unrelated limitation. */
2013
2014 subexp_idx = dfa->nodes[ent->node].opr.idx;
2015 if (ent->subexp_to == str_idx)
2016 {
2017 Idx ops_node = -1;
2018 Idx cls_node = -1;
2019 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2020 {
2021 Idx node = dest_nodes->elems[node_idx];
2022 re_token_type_t type = dfa->nodes[node].type;
2023 if (type == OP_OPEN_SUBEXP
2024 && subexp_idx == dfa->nodes[node].opr.idx)
2025 ops_node = node;
2026 else if (type == OP_CLOSE_SUBEXP
2027 && subexp_idx == dfa->nodes[node].opr.idx)
2028 cls_node = node;
2029 }
2030
2031 /* Check the limitation of the open subexpression. */
2032 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2033 if (ops_node >= 0)
2034 {
2035 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2036 candidates);
2037 if (__glibc_unlikely (err != REG_NOERROR))
2038 return err;
2039 }
2040
2041 /* Check the limitation of the close subexpression. */
2042 if (cls_node >= 0)
2043 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2044 {
2045 Idx node = dest_nodes->elems[node_idx];
2046 if (!re_node_set_contains (dfa->inveclosures + node,
2047 cls_node)
2048 && !re_node_set_contains (dfa->eclosures + node,
2049 cls_node))
2050 {
2051 /* It is against this limitation.
2052 Remove it form the current sifted state. */
2053 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2054 candidates);
2055 if (__glibc_unlikely (err != REG_NOERROR))
2056 return err;
2057 --node_idx;
2058 }
2059 }
2060 }
2061 else /* (ent->subexp_to != str_idx) */
2062 {
2063 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2064 {
2065 Idx node = dest_nodes->elems[node_idx];
2066 re_token_type_t type = dfa->nodes[node].type;
2067 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2068 {
2069 if (subexp_idx != dfa->nodes[node].opr.idx)
2070 continue;
2071 /* It is against this limitation.
2072 Remove it form the current sifted state. */
2073 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2074 candidates);
2075 if (__glibc_unlikely (err != REG_NOERROR))
2076 return err;
2077 }
2078 }
2079 }
2080 }
2081 return REG_NOERROR;
2082}
2083
2084static reg_errcode_t
2085__attribute_warn_unused_result__
2086sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2087 Idx str_idx, const re_node_set *candidates)
2088{
2089 const re_dfa_t *const dfa = mctx->dfa;
2090 reg_errcode_t err;
2091 Idx node_idx, node;
2092 re_sift_context_t local_sctx;
2093 Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2094
2095 if (first_idx == -1)
2096 return REG_NOERROR;
2097
2098 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2099
2100 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2101 {
2102 Idx enabled_idx;
2103 re_token_type_t type;
2104 struct re_backref_cache_entry *entry;
2105 node = candidates->elems[node_idx];
2106 type = dfa->nodes[node].type;
2107 /* Avoid infinite loop for the REs like "()\1+". */
2108 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2109 continue;
2110 if (type != OP_BACK_REF)
2111 continue;
2112
2113 entry = mctx->bkref_ents + first_idx;
2114 enabled_idx = first_idx;
2115 do
2116 {
2117 Idx subexp_len;
2118 Idx to_idx;
2119 Idx dst_node;
2120 bool ok;
2121 re_dfastate_t *cur_state;
2122
2123 if (entry->node != node)
2124 continue;
2125 subexp_len = entry->subexp_to - entry->subexp_from;
2126 to_idx = str_idx + subexp_len;
2127 dst_node = (subexp_len ? dfa->nexts[node]
2128 : dfa->edests[node].elems[0]);
2129
2130 if (to_idx > sctx->last_str_idx
2131 || sctx->sifted_states[to_idx] == NULL
2132 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2133 || check_dst_limits (mctx, &sctx->limits, node,
2134 str_idx, dst_node, to_idx))
2135 continue;
2136
2137 if (local_sctx.sifted_states == NULL)
2138 {
2139 local_sctx = *sctx;
2140 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2141 if (__glibc_unlikely (err != REG_NOERROR))
2142 goto free_return;
2143 }
2144 local_sctx.last_node = node;
2145 local_sctx.last_str_idx = str_idx;
2146 ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2147 if (__glibc_unlikely (! ok))
2148 {
2149 err = REG_ESPACE;
2150 goto free_return;
2151 }
2152 cur_state = local_sctx.sifted_states[str_idx];
2153 err = sift_states_backward (mctx, &local_sctx);
2154 if (__glibc_unlikely (err != REG_NOERROR))
2155 goto free_return;
2156 if (sctx->limited_states != NULL)
2157 {
2158 err = merge_state_array (dfa, sctx->limited_states,
2159 local_sctx.sifted_states,
2160 str_idx + 1);
2161 if (__glibc_unlikely (err != REG_NOERROR))
2162 goto free_return;
2163 }
2164 local_sctx.sifted_states[str_idx] = cur_state;
2165 re_node_set_remove (&local_sctx.limits, enabled_idx);
2166
2167 /* mctx->bkref_ents may have changed, reload the pointer. */
2168 entry = mctx->bkref_ents + enabled_idx;
2169 }
2170 while (enabled_idx++, entry++->more);
2171 }
2172 err = REG_NOERROR;
2173 free_return:
2174 if (local_sctx.sifted_states != NULL)
2175 {
2176 re_node_set_free (&local_sctx.limits);
2177 }
2178
2179 return err;
2180}
2181
2182
2183#ifdef RE_ENABLE_I18N
2184static int
2185sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2186 Idx node_idx, Idx str_idx, Idx max_str_idx)
2187{
2188 const re_dfa_t *const dfa = mctx->dfa;
2189 int naccepted;
2190 /* Check the node can accept "multi byte". */
2191 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2192 if (naccepted > 0 && str_idx + naccepted <= max_str_idx
2193 && !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2194 dfa->nexts[node_idx]))
2195 /* The node can't accept the "multi byte", or the
2196 destination was already thrown away, then the node
2197 couldn't accept the current input "multi byte". */
2198 naccepted = 0;
2199 /* Otherwise, it is sure that the node could accept
2200 'naccepted' bytes input. */
2201 return naccepted;
2202}
2203#endif /* RE_ENABLE_I18N */
2204
2205
2206
2207/* Functions for state transition. */
2208
2209/* Return the next state to which the current state STATE will transit by
2210 accepting the current input byte, and update STATE_LOG if necessary.
2211 Return NULL on failure.
2212 If STATE can accept a multibyte char/collating element/back reference
2213 update the destination of STATE_LOG. */
2214
2215static re_dfastate_t *
2216__attribute_warn_unused_result__
2217transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2218 re_dfastate_t *state)
2219{
2220 re_dfastate_t **trtable;
2221 unsigned char ch;
2222
2223#ifdef RE_ENABLE_I18N
2224 /* If the current state can accept multibyte. */
2225 if (__glibc_unlikely (state->accept_mb))
2226 {
2227 *err = transit_state_mb (mctx, state);
2228 if (__glibc_unlikely (*err != REG_NOERROR))
2229 return NULL;
2230 }
2231#endif /* RE_ENABLE_I18N */
2232
2233 /* Then decide the next state with the single byte. */
2234#if 0
2235 if (0)
2236 /* don't use transition table */
2237 return transit_state_sb (err, mctx, state);
2238#endif
2239
2240 /* Use transition table */
2241 ch = re_string_fetch_byte (&mctx->input);
2242 for (;;)
2243 {
2244 trtable = state->trtable;
2245 if (__glibc_likely (trtable != NULL))
2246 return trtable[ch];
2247
2248 trtable = state->word_trtable;
2249 if (__glibc_likely (trtable != NULL))
2250 {
2251 unsigned int context;
2252 context
2253 = re_string_context_at (&mctx->input,
2254 re_string_cur_idx (&mctx->input) - 1,
2255 mctx->eflags);
2256 if (IS_WORD_CONTEXT (context))
2257 return trtable[ch + SBC_MAX];
2258 else
2259 return trtable[ch];
2260 }
2261
2262 if (!build_trtable (mctx->dfa, state))
2263 {
2264 *err = REG_ESPACE;
2265 return NULL;
2266 }
2267
2268 /* Retry, we now have a transition table. */
2269 }
2270}
2271
2272/* Update the state_log if we need */
2273static re_dfastate_t *
2274merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2275 re_dfastate_t *next_state)
2276{
2277 const re_dfa_t *const dfa = mctx->dfa;
2278 Idx cur_idx = re_string_cur_idx (&mctx->input);
2279
2280 if (cur_idx > mctx->state_log_top)
2281 {
2282 mctx->state_log[cur_idx] = next_state;
2283 mctx->state_log_top = cur_idx;
2284 }
2285 else if (mctx->state_log[cur_idx] == 0)
2286 {
2287 mctx->state_log[cur_idx] = next_state;
2288 }
2289 else
2290 {
2291 re_dfastate_t *pstate;
2292 unsigned int context;
2293 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2294 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2295 the destination of a multibyte char/collating element/
2296 back reference. Then the next state is the union set of
2297 these destinations and the results of the transition table. */
2298 pstate = mctx->state_log[cur_idx];
2299 log_nodes = pstate->entrance_nodes;
2300 if (next_state != NULL)
2301 {
2302 table_nodes = next_state->entrance_nodes;
2303 *err = re_node_set_init_union (&next_nodes, table_nodes,
2304 log_nodes);
2305 if (__glibc_unlikely (*err != REG_NOERROR))
2306 return NULL;
2307 }
2308 else
2309 next_nodes = *log_nodes;
2310 /* Note: We already add the nodes of the initial state,
2311 then we don't need to add them here. */
2312
2313 context = re_string_context_at (&mctx->input,
2314 re_string_cur_idx (&mctx->input) - 1,
2315 mctx->eflags);
2316 next_state = mctx->state_log[cur_idx]
2317 = re_acquire_state_context (err, dfa, &next_nodes, context);
2318 /* We don't need to check errors here, since the return value of
2319 this function is next_state and ERR is already set. */
2320
2321 if (table_nodes != NULL)
2322 re_node_set_free (&next_nodes);
2323 }
2324
2325 if (__glibc_unlikely (dfa->nbackref) && next_state != NULL)
2326 {
2327 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2328 later. We must check them here, since the back references in the
2329 next state might use them. */
2330 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2331 cur_idx);
2332 if (__glibc_unlikely (*err != REG_NOERROR))
2333 return NULL;
2334
2335 /* If the next state has back references. */
2336 if (next_state->has_backref)
2337 {
2338 *err = transit_state_bkref (mctx, &next_state->nodes);
2339 if (__glibc_unlikely (*err != REG_NOERROR))
2340 return NULL;
2341 next_state = mctx->state_log[cur_idx];
2342 }
2343 }
2344
2345 return next_state;
2346}
2347
2348/* Skip bytes in the input that correspond to part of a
2349 multi-byte match, then look in the log for a state
2350 from which to restart matching. */
2351static re_dfastate_t *
2352find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2353{
2354 re_dfastate_t *cur_state;
2355 do
2356 {
2357 Idx max = mctx->state_log_top;
2358 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2359
2360 do
2361 {
2362 if (++cur_str_idx > max)
2363 return NULL;
2364 re_string_skip_bytes (&mctx->input, 1);
2365 }
2366 while (mctx->state_log[cur_str_idx] == NULL);
2367
2368 cur_state = merge_state_with_log (err, mctx, NULL);
2369 }
2370 while (*err == REG_NOERROR && cur_state == NULL);
2371 return cur_state;
2372}
2373
2374/* Helper functions for transit_state. */
2375
2376/* From the node set CUR_NODES, pick up the nodes whose types are
2377 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2378 expression. And register them to use them later for evaluating the
2379 corresponding back references. */
2380
2381static reg_errcode_t
2382check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2383 Idx str_idx)
2384{
2385 const re_dfa_t *const dfa = mctx->dfa;
2386 Idx node_idx;
2387 reg_errcode_t err;
2388
2389 /* TODO: This isn't efficient.
2390 Because there might be more than one nodes whose types are
2391 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2392 nodes.
2393 E.g. RE: (a){2} */
2394 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2395 {
2396 Idx node = cur_nodes->elems[node_idx];
2397 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2398 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2399 && (dfa->used_bkref_map
2400 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2401 {
2402 err = match_ctx_add_subtop (mctx, node, str_idx);
2403 if (__glibc_unlikely (err != REG_NOERROR))
2404 return err;
2405 }
2406 }
2407 return REG_NOERROR;
2408}
2409
2410#if 0
2411/* Return the next state to which the current state STATE will transit by
2412 accepting the current input byte. Return NULL on failure. */
2413
2414static re_dfastate_t *
2415transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2416 re_dfastate_t *state)
2417{
2418 const re_dfa_t *const dfa = mctx->dfa;
2419 re_node_set next_nodes;
2420 re_dfastate_t *next_state;
2421 Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2422 unsigned int context;
2423
2424 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2425 if (__glibc_unlikely (*err != REG_NOERROR))
2426 return NULL;
2427 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2428 {
2429 Idx cur_node = state->nodes.elems[node_cnt];
2430 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2431 {
2432 *err = re_node_set_merge (&next_nodes,
2433 dfa->eclosures + dfa->nexts[cur_node]);
2434 if (__glibc_unlikely (*err != REG_NOERROR))
2435 {
2436 re_node_set_free (&next_nodes);
2437 return NULL;
2438 }
2439 }
2440 }
2441 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2442 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2443 /* We don't need to check errors here, since the return value of
2444 this function is next_state and ERR is already set. */
2445
2446 re_node_set_free (&next_nodes);
2447 re_string_skip_bytes (&mctx->input, 1);
2448 return next_state;
2449}
2450#endif
2451
2452#ifdef RE_ENABLE_I18N
2453static reg_errcode_t
2454transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2455{
2456 const re_dfa_t *const dfa = mctx->dfa;
2457 reg_errcode_t err;
2458 Idx i;
2459
2460 for (i = 0; i < pstate->nodes.nelem; ++i)
2461 {
2462 re_node_set dest_nodes, *new_nodes;
2463 Idx cur_node_idx = pstate->nodes.elems[i];
2464 int naccepted;
2465 Idx dest_idx;
2466 unsigned int context;
2467 re_dfastate_t *dest_state;
2468
2469 if (!dfa->nodes[cur_node_idx].accept_mb)
2470 continue;
2471
2472 if (dfa->nodes[cur_node_idx].constraint)
2473 {
2474 context = re_string_context_at (&mctx->input,
2475 re_string_cur_idx (&mctx->input),
2476 mctx->eflags);
2477 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2478 context))
2479 continue;
2480 }
2481
2482 /* How many bytes the node can accept? */
2483 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2484 re_string_cur_idx (&mctx->input));
2485 if (naccepted == 0)
2486 continue;
2487
2488 /* The node can accepts 'naccepted' bytes. */
2489 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2490 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2491 : mctx->max_mb_elem_len);
2492 err = clean_state_log_if_needed (mctx, dest_idx);
2493 if (__glibc_unlikely (err != REG_NOERROR))
2494 return err;
2495 DEBUG_ASSERT (dfa->nexts[cur_node_idx] != -1);
2496 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2497
2498 dest_state = mctx->state_log[dest_idx];
2499 if (dest_state == NULL)
2500 dest_nodes = *new_nodes;
2501 else
2502 {
2503 err = re_node_set_init_union (&dest_nodes,
2504 dest_state->entrance_nodes, new_nodes);
2505 if (__glibc_unlikely (err != REG_NOERROR))
2506 return err;
2507 }
2508 context = re_string_context_at (&mctx->input, dest_idx - 1,
2509 mctx->eflags);
2510 mctx->state_log[dest_idx]
2511 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2512 if (dest_state != NULL)
2513 re_node_set_free (&dest_nodes);
2514 if (__glibc_unlikely (mctx->state_log[dest_idx] == NULL
2515 && err != REG_NOERROR))
2516 return err;
2517 }
2518 return REG_NOERROR;
2519}
2520#endif /* RE_ENABLE_I18N */
2521
2522static reg_errcode_t
2523transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2524{
2525 const re_dfa_t *const dfa = mctx->dfa;
2526 reg_errcode_t err;
2527 Idx i;
2528 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2529
2530 for (i = 0; i < nodes->nelem; ++i)
2531 {
2532 Idx dest_str_idx, prev_nelem, bkc_idx;
2533 Idx node_idx = nodes->elems[i];
2534 unsigned int context;
2535 const re_token_t *node = dfa->nodes + node_idx;
2536 re_node_set *new_dest_nodes;
2537
2538 /* Check whether 'node' is a backreference or not. */
2539 if (node->type != OP_BACK_REF)
2540 continue;
2541
2542 if (node->constraint)
2543 {
2544 context = re_string_context_at (&mctx->input, cur_str_idx,
2545 mctx->eflags);
2546 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2547 continue;
2548 }
2549
2550 /* 'node' is a backreference.
2551 Check the substring which the substring matched. */
2552 bkc_idx = mctx->nbkref_ents;
2553 err = get_subexp (mctx, node_idx, cur_str_idx);
2554 if (__glibc_unlikely (err != REG_NOERROR))
2555 goto free_return;
2556
2557 /* And add the epsilon closures (which is 'new_dest_nodes') of
2558 the backreference to appropriate state_log. */
2559 DEBUG_ASSERT (dfa->nexts[node_idx] != -1);
2560 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2561 {
2562 Idx subexp_len;
2563 re_dfastate_t *dest_state;
2564 struct re_backref_cache_entry *bkref_ent;
2565 bkref_ent = mctx->bkref_ents + bkc_idx;
2566 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2567 continue;
2568 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2569 new_dest_nodes = (subexp_len == 0
2570 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2571 : dfa->eclosures + dfa->nexts[node_idx]);
2572 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2573 - bkref_ent->subexp_from);
2574 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2575 mctx->eflags);
2576 dest_state = mctx->state_log[dest_str_idx];
2577 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2578 : mctx->state_log[cur_str_idx]->nodes.nelem);
2579 /* Add 'new_dest_node' to state_log. */
2580 if (dest_state == NULL)
2581 {
2582 mctx->state_log[dest_str_idx]
2583 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2584 context);
2585 if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2586 && err != REG_NOERROR))
2587 goto free_return;
2588 }
2589 else
2590 {
2591 re_node_set dest_nodes;
2592 err = re_node_set_init_union (&dest_nodes,
2593 dest_state->entrance_nodes,
2594 new_dest_nodes);
2595 if (__glibc_unlikely (err != REG_NOERROR))
2596 {
2597 re_node_set_free (&dest_nodes);
2598 goto free_return;
2599 }
2600 mctx->state_log[dest_str_idx]
2601 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2602 re_node_set_free (&dest_nodes);
2603 if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2604 && err != REG_NOERROR))
2605 goto free_return;
2606 }
2607 /* We need to check recursively if the backreference can epsilon
2608 transit. */
2609 if (subexp_len == 0
2610 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2611 {
2612 err = check_subexp_matching_top (mctx, new_dest_nodes,
2613 cur_str_idx);
2614 if (__glibc_unlikely (err != REG_NOERROR))
2615 goto free_return;
2616 err = transit_state_bkref (mctx, new_dest_nodes);
2617 if (__glibc_unlikely (err != REG_NOERROR))
2618 goto free_return;
2619 }
2620 }
2621 }
2622 err = REG_NOERROR;
2623 free_return:
2624 return err;
2625}
2626
2627/* Enumerate all the candidates which the backreference BKREF_NODE can match
2628 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2629 Note that we might collect inappropriate candidates here.
2630 However, the cost of checking them strictly here is too high, then we
2631 delay these checking for prune_impossible_nodes(). */
2632
2633static reg_errcode_t
2634__attribute_warn_unused_result__
2635get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2636{
2637 const re_dfa_t *const dfa = mctx->dfa;
2638 Idx subexp_num, sub_top_idx;
2639 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2640 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2641 Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2642 if (cache_idx != -1)
2643 {
2644 const struct re_backref_cache_entry *entry
2645 = mctx->bkref_ents + cache_idx;
2646 do
2647 if (entry->node == bkref_node)
2648 return REG_NOERROR; /* We already checked it. */
2649 while (entry++->more);
2650 }
2651
2652 subexp_num = dfa->nodes[bkref_node].opr.idx;
2653
2654 /* For each sub expression */
2655 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2656 {
2657 reg_errcode_t err;
2658 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2659 re_sub_match_last_t *sub_last;
2660 Idx sub_last_idx, sl_str, bkref_str_off;
2661
2662 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2663 continue; /* It isn't related. */
2664
2665 sl_str = sub_top->str_idx;
2666 bkref_str_off = bkref_str_idx;
2667 /* At first, check the last node of sub expressions we already
2668 evaluated. */
2669 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2670 {
2671 regoff_t sl_str_diff;
2672 sub_last = sub_top->lasts[sub_last_idx];
2673 sl_str_diff = sub_last->str_idx - sl_str;
2674 /* The matched string by the sub expression match with the substring
2675 at the back reference? */
2676 if (sl_str_diff > 0)
2677 {
2678 if (__glibc_unlikely (bkref_str_off + sl_str_diff
2679 > mctx->input.valid_len))
2680 {
2681 /* Not enough chars for a successful match. */
2682 if (bkref_str_off + sl_str_diff > mctx->input.len)
2683 break;
2684
2685 err = clean_state_log_if_needed (mctx,
2686 bkref_str_off
2687 + sl_str_diff);
2688 if (__glibc_unlikely (err != REG_NOERROR))
2689 return err;
2690 buf = (const char *) re_string_get_buffer (&mctx->input);
2691 }
2692 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2693 /* We don't need to search this sub expression any more. */
2694 break;
2695 }
2696 bkref_str_off += sl_str_diff;
2697 sl_str += sl_str_diff;
2698 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2699 bkref_str_idx);
2700
2701 /* Reload buf, since the preceding call might have reallocated
2702 the buffer. */
2703 buf = (const char *) re_string_get_buffer (&mctx->input);
2704
2705 if (err == REG_NOMATCH)
2706 continue;
2707 if (__glibc_unlikely (err != REG_NOERROR))
2708 return err;
2709 }
2710
2711 if (sub_last_idx < sub_top->nlasts)
2712 continue;
2713 if (sub_last_idx > 0)
2714 ++sl_str;
2715 /* Then, search for the other last nodes of the sub expression. */
2716 for (; sl_str <= bkref_str_idx; ++sl_str)
2717 {
2718 Idx cls_node;
2719 regoff_t sl_str_off;
2720 const re_node_set *nodes;
2721 sl_str_off = sl_str - sub_top->str_idx;
2722 /* The matched string by the sub expression match with the substring
2723 at the back reference? */
2724 if (sl_str_off > 0)
2725 {
2726 if (__glibc_unlikely (bkref_str_off >= mctx->input.valid_len))
2727 {
2728 /* If we are at the end of the input, we cannot match. */
2729 if (bkref_str_off >= mctx->input.len)
2730 break;
2731
2732 err = extend_buffers (mctx, bkref_str_off + 1);
2733 if (__glibc_unlikely (err != REG_NOERROR))
2734 return err;
2735
2736 buf = (const char *) re_string_get_buffer (&mctx->input);
2737 }
2738 if (buf [bkref_str_off++] != buf[sl_str - 1])
2739 break; /* We don't need to search this sub expression
2740 any more. */
2741 }
2742 if (mctx->state_log[sl_str] == NULL)
2743 continue;
2744 /* Does this state have a ')' of the sub expression? */
2745 nodes = &mctx->state_log[sl_str]->nodes;
2746 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2747 OP_CLOSE_SUBEXP);
2748 if (cls_node == -1)
2749 continue; /* No. */
2750 if (sub_top->path == NULL)
2751 {
2752 sub_top->path = calloc (sizeof (state_array_t),
2753 sl_str - sub_top->str_idx + 1);
2754 if (sub_top->path == NULL)
2755 return REG_ESPACE;
2756 }
2757 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2758 in the current context? */
2759 err = check_arrival (mctx, sub_top->path, sub_top->node,
2760 sub_top->str_idx, cls_node, sl_str,
2761 OP_CLOSE_SUBEXP);
2762 if (err == REG_NOMATCH)
2763 continue;
2764 if (__glibc_unlikely (err != REG_NOERROR))
2765 return err;
2766 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2767 if (__glibc_unlikely (sub_last == NULL))
2768 return REG_ESPACE;
2769 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2770 bkref_str_idx);
2771 buf = (const char *) re_string_get_buffer (&mctx->input);
2772 if (err == REG_NOMATCH)
2773 continue;
2774 if (__glibc_unlikely (err != REG_NOERROR))
2775 return err;
2776 }
2777 }
2778 return REG_NOERROR;
2779}
2780
2781/* Helper functions for get_subexp(). */
2782
2783/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2784 If it can arrive, register the sub expression expressed with SUB_TOP
2785 and SUB_LAST. */
2786
2787static reg_errcode_t
2788get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2789 re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2790{
2791 reg_errcode_t err;
2792 Idx to_idx;
2793 /* Can the subexpression arrive the back reference? */
2794 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2795 sub_last->str_idx, bkref_node, bkref_str,
2796 OP_OPEN_SUBEXP);
2797 if (err != REG_NOERROR)
2798 return err;
2799 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2800 sub_last->str_idx);
2801 if (__glibc_unlikely (err != REG_NOERROR))
2802 return err;
2803 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2804 return clean_state_log_if_needed (mctx, to_idx);
2805}
2806
2807/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2808 Search '(' if FL_OPEN, or search ')' otherwise.
2809 TODO: This function isn't efficient...
2810 Because there might be more than one nodes whose types are
2811 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2812 nodes.
2813 E.g. RE: (a){2} */
2814
2815static Idx
2816find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2817 Idx subexp_idx, int type)
2818{
2819 Idx cls_idx;
2820 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2821 {
2822 Idx cls_node = nodes->elems[cls_idx];
2823 const re_token_t *node = dfa->nodes + cls_node;
2824 if (node->type == type
2825 && node->opr.idx == subexp_idx)
2826 return cls_node;
2827 }
2828 return -1;
2829}
2830
2831/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2832 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2833 heavily reused.
2834 Return REG_NOERROR if it can arrive, REG_NOMATCH if it cannot,
2835 REG_ESPACE if memory is exhausted. */
2836
2837static reg_errcode_t
2838__attribute_warn_unused_result__
2839check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node,
2840 Idx top_str, Idx last_node, Idx last_str, int type)
2841{
2842 const re_dfa_t *const dfa = mctx->dfa;
2843 reg_errcode_t err = REG_NOERROR;
2844 Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2845 re_dfastate_t *cur_state = NULL;
2846 re_node_set *cur_nodes, next_nodes;
2847 re_dfastate_t **backup_state_log;
2848 unsigned int context;
2849
2850 subexp_num = dfa->nodes[top_node].opr.idx;
2851 /* Extend the buffer if we need. */
2852 if (__glibc_unlikely (path->alloc < last_str + mctx->max_mb_elem_len + 1))
2853 {
2854 re_dfastate_t **new_array;
2855 Idx old_alloc = path->alloc;
2856 Idx incr_alloc = last_str + mctx->max_mb_elem_len + 1;
2857 Idx new_alloc;
2858 if (__glibc_unlikely (IDX_MAX - old_alloc < incr_alloc))
2859 return REG_ESPACE;
2860 new_alloc = old_alloc + incr_alloc;
2861 if (__glibc_unlikely (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc))
2862 return REG_ESPACE;
2863 new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
2864 if (__glibc_unlikely (new_array == NULL))
2865 return REG_ESPACE;
2866 path->array = new_array;
2867 path->alloc = new_alloc;
2868 memset (new_array + old_alloc, '\0',
2869 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2870 }
2871
2872 str_idx = path->next_idx ? path->next_idx : top_str;
2873
2874 /* Temporary modify MCTX. */
2875 backup_state_log = mctx->state_log;
2876 backup_cur_idx = mctx->input.cur_idx;
2877 mctx->state_log = path->array;
2878 mctx->input.cur_idx = str_idx;
2879
2880 /* Setup initial node set. */
2881 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2882 if (str_idx == top_str)
2883 {
2884 err = re_node_set_init_1 (&next_nodes, top_node);
2885 if (__glibc_unlikely (err != REG_NOERROR))
2886 return err;
2887 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2888 if (__glibc_unlikely (err != REG_NOERROR))
2889 {
2890 re_node_set_free (&next_nodes);
2891 return err;
2892 }
2893 }
2894 else
2895 {
2896 cur_state = mctx->state_log[str_idx];
2897 if (cur_state && cur_state->has_backref)
2898 {
2899 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2900 if (__glibc_unlikely (err != REG_NOERROR))
2901 return err;
2902 }
2903 else
2904 re_node_set_init_empty (&next_nodes);
2905 }
2906 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2907 {
2908 if (next_nodes.nelem)
2909 {
2910 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2911 subexp_num, type);
2912 if (__glibc_unlikely (err != REG_NOERROR))
2913 {
2914 re_node_set_free (&next_nodes);
2915 return err;
2916 }
2917 }
2918 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2919 if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
2920 {
2921 re_node_set_free (&next_nodes);
2922 return err;
2923 }
2924 mctx->state_log[str_idx] = cur_state;
2925 }
2926
2927 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2928 {
2929 re_node_set_empty (&next_nodes);
2930 if (mctx->state_log[str_idx + 1])
2931 {
2932 err = re_node_set_merge (&next_nodes,
2933 &mctx->state_log[str_idx + 1]->nodes);
2934 if (__glibc_unlikely (err != REG_NOERROR))
2935 {
2936 re_node_set_free (&next_nodes);
2937 return err;
2938 }
2939 }
2940 if (cur_state)
2941 {
2942 err = check_arrival_add_next_nodes (mctx, str_idx,
2943 &cur_state->non_eps_nodes,
2944 &next_nodes);
2945 if (__glibc_unlikely (err != REG_NOERROR))
2946 {
2947 re_node_set_free (&next_nodes);
2948 return err;
2949 }
2950 }
2951 ++str_idx;
2952 if (next_nodes.nelem)
2953 {
2954 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2955 if (__glibc_unlikely (err != REG_NOERROR))
2956 {
2957 re_node_set_free (&next_nodes);
2958 return err;
2959 }
2960 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2961 subexp_num, type);
2962 if (__glibc_unlikely (err != REG_NOERROR))
2963 {
2964 re_node_set_free (&next_nodes);
2965 return err;
2966 }
2967 }
2968 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2969 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2970 if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
2971 {
2972 re_node_set_free (&next_nodes);
2973 return err;
2974 }
2975 mctx->state_log[str_idx] = cur_state;
2976 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2977 }
2978 re_node_set_free (&next_nodes);
2979 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2980 : &mctx->state_log[last_str]->nodes);
2981 path->next_idx = str_idx;
2982
2983 /* Fix MCTX. */
2984 mctx->state_log = backup_state_log;
2985 mctx->input.cur_idx = backup_cur_idx;
2986
2987 /* Then check the current node set has the node LAST_NODE. */
2988 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2989 return REG_NOERROR;
2990
2991 return REG_NOMATCH;
2992}
2993
2994/* Helper functions for check_arrival. */
2995
2996/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2997 to NEXT_NODES.
2998 TODO: This function is similar to the functions transit_state*(),
2999 however this function has many additional works.
3000 Can't we unify them? */
3001
3002static reg_errcode_t
3003__attribute_warn_unused_result__
3004check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
3005 re_node_set *cur_nodes, re_node_set *next_nodes)
3006{
3007 const re_dfa_t *const dfa = mctx->dfa;
3008 bool ok;
3009 Idx cur_idx;
3010#ifdef RE_ENABLE_I18N
3011 reg_errcode_t err = REG_NOERROR;
3012#endif
3013 re_node_set union_set;
3014 re_node_set_init_empty (&union_set);
3015 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3016 {
3017 int naccepted = 0;
3018 Idx cur_node = cur_nodes->elems[cur_idx];
3019 DEBUG_ASSERT (!IS_EPSILON_NODE (dfa->nodes[cur_node].type));
3020
3021#ifdef RE_ENABLE_I18N
3022 /* If the node may accept "multi byte". */
3023 if (dfa->nodes[cur_node].accept_mb)
3024 {
3025 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3026 str_idx);
3027 if (naccepted > 1)
3028 {
3029 re_dfastate_t *dest_state;
3030 Idx next_node = dfa->nexts[cur_node];
3031 Idx next_idx = str_idx + naccepted;
3032 dest_state = mctx->state_log[next_idx];
3033 re_node_set_empty (&union_set);
3034 if (dest_state)
3035 {
3036 err = re_node_set_merge (&union_set, &dest_state->nodes);
3037 if (__glibc_unlikely (err != REG_NOERROR))
3038 {
3039 re_node_set_free (&union_set);
3040 return err;
3041 }
3042 }
3043 ok = re_node_set_insert (&union_set, next_node);
3044 if (__glibc_unlikely (! ok))
3045 {
3046 re_node_set_free (&union_set);
3047 return REG_ESPACE;
3048 }
3049 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3050 &union_set);
3051 if (__glibc_unlikely (mctx->state_log[next_idx] == NULL
3052 && err != REG_NOERROR))
3053 {
3054 re_node_set_free (&union_set);
3055 return err;
3056 }
3057 }
3058 }
3059#endif /* RE_ENABLE_I18N */
3060 if (naccepted
3061 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3062 {
3063 ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3064 if (__glibc_unlikely (! ok))
3065 {
3066 re_node_set_free (&union_set);
3067 return REG_ESPACE;
3068 }
3069 }
3070 }
3071 re_node_set_free (&union_set);
3072 return REG_NOERROR;
3073}
3074
3075/* For all the nodes in CUR_NODES, add the epsilon closures of them to
3076 CUR_NODES, however exclude the nodes which are:
3077 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3078 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3079*/
3080
3081static reg_errcode_t
3082check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3083 Idx ex_subexp, int type)
3084{
3085 reg_errcode_t err;
3086 Idx idx, outside_node;
3087 re_node_set new_nodes;
3088 DEBUG_ASSERT (cur_nodes->nelem);
3089 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3090 if (__glibc_unlikely (err != REG_NOERROR))
3091 return err;
3092 /* Create a new node set NEW_NODES with the nodes which are epsilon
3093 closures of the node in CUR_NODES. */
3094
3095 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3096 {
3097 Idx cur_node = cur_nodes->elems[idx];
3098 const re_node_set *eclosure = dfa->eclosures + cur_node;
3099 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3100 if (outside_node == -1)
3101 {
3102 /* There are no problematic nodes, just merge them. */
3103 err = re_node_set_merge (&new_nodes, eclosure);
3104 if (__glibc_unlikely (err != REG_NOERROR))
3105 {
3106 re_node_set_free (&new_nodes);
3107 return err;
3108 }
3109 }
3110 else
3111 {
3112 /* There are problematic nodes, re-calculate incrementally. */
3113 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3114 ex_subexp, type);
3115 if (__glibc_unlikely (err != REG_NOERROR))
3116 {
3117 re_node_set_free (&new_nodes);
3118 return err;
3119 }
3120 }
3121 }
3122 re_node_set_free (cur_nodes);
3123 *cur_nodes = new_nodes;
3124 return REG_NOERROR;
3125}
3126
3127/* Helper function for check_arrival_expand_ecl.
3128 Check incrementally the epsilon closure of TARGET, and if it isn't
3129 problematic append it to DST_NODES. */
3130
3131static reg_errcode_t
3132__attribute_warn_unused_result__
3133check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3134 Idx target, Idx ex_subexp, int type)
3135{
3136 Idx cur_node;
3137 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3138 {
3139 bool ok;
3140
3141 if (dfa->nodes[cur_node].type == type
3142 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3143 {
3144 if (type == OP_CLOSE_SUBEXP)
3145 {
3146 ok = re_node_set_insert (dst_nodes, cur_node);
3147 if (__glibc_unlikely (! ok))
3148 return REG_ESPACE;
3149 }
3150 break;
3151 }
3152 ok = re_node_set_insert (dst_nodes, cur_node);
3153 if (__glibc_unlikely (! ok))
3154 return REG_ESPACE;
3155 if (dfa->edests[cur_node].nelem == 0)
3156 break;
3157 if (dfa->edests[cur_node].nelem == 2)
3158 {
3159 reg_errcode_t err;
3160 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3161 dfa->edests[cur_node].elems[1],
3162 ex_subexp, type);
3163 if (__glibc_unlikely (err != REG_NOERROR))
3164 return err;
3165 }
3166 cur_node = dfa->edests[cur_node].elems[0];
3167 }
3168 return REG_NOERROR;
3169}
3170
3171
3172/* For all the back references in the current state, calculate the
3173 destination of the back references by the appropriate entry
3174 in MCTX->BKREF_ENTS. */
3175
3176static reg_errcode_t
3177__attribute_warn_unused_result__
3178expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3179 Idx cur_str, Idx subexp_num, int type)
3180{
3181 const re_dfa_t *const dfa = mctx->dfa;
3182 reg_errcode_t err;
3183 Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3184 struct re_backref_cache_entry *ent;
3185
3186 if (cache_idx_start == -1)
3187 return REG_NOERROR;
3188
3189 restart:
3190 ent = mctx->bkref_ents + cache_idx_start;
3191 do
3192 {
3193 Idx to_idx, next_node;
3194
3195 /* Is this entry ENT is appropriate? */
3196 if (!re_node_set_contains (cur_nodes, ent->node))
3197 continue; /* No. */
3198
3199 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3200 /* Calculate the destination of the back reference, and append it
3201 to MCTX->STATE_LOG. */
3202 if (to_idx == cur_str)
3203 {
3204 /* The backreference did epsilon transit, we must re-check all the
3205 node in the current state. */
3206 re_node_set new_dests;
3207 reg_errcode_t err2, err3;
3208 next_node = dfa->edests[ent->node].elems[0];
3209 if (re_node_set_contains (cur_nodes, next_node))
3210 continue;
3211 err = re_node_set_init_1 (&new_dests, next_node);
3212 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3213 err3 = re_node_set_merge (cur_nodes, &new_dests);
3214 re_node_set_free (&new_dests);
3215 if (__glibc_unlikely (err != REG_NOERROR || err2 != REG_NOERROR
3216 || err3 != REG_NOERROR))
3217 {
3218 err = (err != REG_NOERROR ? err
3219 : (err2 != REG_NOERROR ? err2 : err3));
3220 return err;
3221 }
3222 /* TODO: It is still inefficient... */
3223 goto restart;
3224 }
3225 else
3226 {
3227 re_node_set union_set;
3228 next_node = dfa->nexts[ent->node];
3229 if (mctx->state_log[to_idx])
3230 {
3231 bool ok;
3232 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3233 next_node))
3234 continue;
3235 err = re_node_set_init_copy (&union_set,
3236 &mctx->state_log[to_idx]->nodes);
3237 ok = re_node_set_insert (&union_set, next_node);
3238 if (__glibc_unlikely (err != REG_NOERROR || ! ok))
3239 {
3240 re_node_set_free (&union_set);
3241 err = err != REG_NOERROR ? err : REG_ESPACE;
3242 return err;
3243 }
3244 }
3245 else
3246 {
3247 err = re_node_set_init_1 (&union_set, next_node);
3248 if (__glibc_unlikely (err != REG_NOERROR))
3249 return err;
3250 }
3251 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3252 re_node_set_free (&union_set);
3253 if (__glibc_unlikely (mctx->state_log[to_idx] == NULL
3254 && err != REG_NOERROR))
3255 return err;
3256 }
3257 }
3258 while (ent++->more);
3259 return REG_NOERROR;
3260}
3261
3262/* Build transition table for the state.
3263 Return true if successful. */
3264
3265static bool __attribute_noinline__
3266build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3267{
3268 reg_errcode_t err;
3269 Idx i, j;
3270 int ch;
3271 bool need_word_trtable = false;
3272 bitset_word_t elem, mask;
3273 Idx ndests; /* Number of the destination states from 'state'. */
3274 re_dfastate_t **trtable;
3275 re_dfastate_t *dest_states[SBC_MAX];
3276 re_dfastate_t *dest_states_word[SBC_MAX];
3277 re_dfastate_t *dest_states_nl[SBC_MAX];
3278 re_node_set follows;
3279 bitset_t acceptable;
3280
3281 /* We build DFA states which corresponds to the destination nodes
3282 from 'state'. 'dests_node[i]' represents the nodes which i-th
3283 destination state contains, and 'dests_ch[i]' represents the
3284 characters which i-th destination state accepts. */
3285 re_node_set dests_node[SBC_MAX];
3286 bitset_t dests_ch[SBC_MAX];
3287
3288 /* Initialize transition table. */
3289 state->word_trtable = state->trtable = NULL;
3290
3291 /* At first, group all nodes belonging to 'state' into several
3292 destinations. */
3293 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3294 if (__glibc_unlikely (ndests <= 0))
3295 {
3296 /* Return false in case of an error, true otherwise. */
3297 if (ndests == 0)
3298 {
3299 state->trtable = (re_dfastate_t **)
3300 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3301 if (__glibc_unlikely (state->trtable == NULL))
3302 return false;
3303 return true;
3304 }
3305 return false;
3306 }
3307
3308 err = re_node_set_alloc (&follows, ndests + 1);
3309 if (__glibc_unlikely (err != REG_NOERROR))
3310 {
3311 out_free:
3312 re_node_set_free (&follows);
3313 for (i = 0; i < ndests; ++i)
3314 re_node_set_free (dests_node + i);
3315 return false;
3316 }
3317
3318 bitset_empty (acceptable);
3319
3320 /* Then build the states for all destinations. */
3321 for (i = 0; i < ndests; ++i)
3322 {
3323 Idx next_node;
3324 re_node_set_empty (&follows);
3325 /* Merge the follows of this destination states. */
3326 for (j = 0; j < dests_node[i].nelem; ++j)
3327 {
3328 next_node = dfa->nexts[dests_node[i].elems[j]];
3329 if (next_node != -1)
3330 {
3331 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3332 if (__glibc_unlikely (err != REG_NOERROR))
3333 goto out_free;
3334 }
3335 }
3336 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3337 if (__glibc_unlikely (dest_states[i] == NULL && err != REG_NOERROR))
3338 goto out_free;
3339 /* If the new state has context constraint,
3340 build appropriate states for these contexts. */
3341 if (dest_states[i]->has_constraint)
3342 {
3343 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3344 CONTEXT_WORD);
3345 if (__glibc_unlikely (dest_states_word[i] == NULL
3346 && err != REG_NOERROR))
3347 goto out_free;
3348
3349 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3350 need_word_trtable = true;
3351
3352 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3353 CONTEXT_NEWLINE);
3354 if (__glibc_unlikely (dest_states_nl[i] == NULL && err != REG_NOERROR))
3355 goto out_free;
3356 }
3357 else
3358 {
3359 dest_states_word[i] = dest_states[i];
3360 dest_states_nl[i] = dest_states[i];
3361 }
3362 bitset_merge (acceptable, dests_ch[i]);
3363 }
3364
3365 if (!__glibc_unlikely (need_word_trtable))
3366 {
3367 /* We don't care about whether the following character is a word
3368 character, or we are in a single-byte character set so we can
3369 discern by looking at the character code: allocate a
3370 256-entry transition table. */
3371 trtable = state->trtable =
3372 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3373 if (__glibc_unlikely (trtable == NULL))
3374 goto out_free;
3375
3376 /* For all characters ch...: */
3377 for (i = 0; i < BITSET_WORDS; ++i)
3378 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3379 elem;
3380 mask <<= 1, elem >>= 1, ++ch)
3381 if (__glibc_unlikely (elem & 1))
3382 {
3383 /* There must be exactly one destination which accepts
3384 character ch. See group_nodes_into_DFAstates. */
3385 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3386 ;
3387
3388 /* j-th destination accepts the word character ch. */
3389 if (dfa->word_char[i] & mask)
3390 trtable[ch] = dest_states_word[j];
3391 else
3392 trtable[ch] = dest_states[j];
3393 }
3394 }
3395 else
3396 {
3397 /* We care about whether the following character is a word
3398 character, and we are in a multi-byte character set: discern
3399 by looking at the character code: build two 256-entry
3400 transition tables, one starting at trtable[0] and one
3401 starting at trtable[SBC_MAX]. */
3402 trtable = state->word_trtable =
3403 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3404 if (__glibc_unlikely (trtable == NULL))
3405 goto out_free;
3406
3407 /* For all characters ch...: */
3408 for (i = 0; i < BITSET_WORDS; ++i)
3409 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3410 elem;
3411 mask <<= 1, elem >>= 1, ++ch)
3412 if (__glibc_unlikely (elem & 1))
3413 {
3414 /* There must be exactly one destination which accepts
3415 character ch. See group_nodes_into_DFAstates. */
3416 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3417 ;
3418
3419 /* j-th destination accepts the word character ch. */
3420 trtable[ch] = dest_states[j];
3421 trtable[ch + SBC_MAX] = dest_states_word[j];
3422 }
3423 }
3424
3425 /* new line */
3426 if (bitset_contain (acceptable, NEWLINE_CHAR))
3427 {
3428 /* The current state accepts newline character. */
3429 for (j = 0; j < ndests; ++j)
3430 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3431 {
3432 /* k-th destination accepts newline character. */
3433 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3434 if (need_word_trtable)
3435 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3436 /* There must be only one destination which accepts
3437 newline. See group_nodes_into_DFAstates. */
3438 break;
3439 }
3440 }
3441
3442 re_node_set_free (&follows);
3443 for (i = 0; i < ndests; ++i)
3444 re_node_set_free (dests_node + i);
3445 return true;
3446}
3447
3448/* Group all nodes belonging to STATE into several destinations.
3449 Then for all destinations, set the nodes belonging to the destination
3450 to DESTS_NODE[i] and set the characters accepted by the destination
3451 to DEST_CH[i]. Return the number of destinations if successful,
3452 -1 on internal error. */
3453
3454static Idx
3455group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3456 re_node_set *dests_node, bitset_t *dests_ch)
3457{
3458 reg_errcode_t err;
3459 bool ok;
3460 Idx i, j, k;
3461 Idx ndests; /* Number of the destinations from 'state'. */
3462 bitset_t accepts; /* Characters a node can accept. */
3463 const re_node_set *cur_nodes = &state->nodes;
3464 bitset_empty (accepts);
3465 ndests = 0;
3466
3467 /* For all the nodes belonging to 'state', */
3468 for (i = 0; i < cur_nodes->nelem; ++i)
3469 {
3470 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3471 re_token_type_t type = node->type;
3472 unsigned int constraint = node->constraint;
3473
3474 /* Enumerate all single byte character this node can accept. */
3475 if (type == CHARACTER)
3476 bitset_set (accepts, node->opr.c);
3477 else if (type == SIMPLE_BRACKET)
3478 {
3479 bitset_merge (accepts, node->opr.sbcset);
3480 }
3481 else if (type == OP_PERIOD)
3482 {
3483#ifdef RE_ENABLE_I18N
3484 if (dfa->mb_cur_max > 1)
3485 bitset_merge (accepts, dfa->sb_char);
3486 else
3487#endif
3488 bitset_set_all (accepts);
3489 if (!(dfa->syntax & RE_DOT_NEWLINE))
3490 bitset_clear (accepts, '\n');
3491 if (dfa->syntax & RE_DOT_NOT_NULL)
3492 bitset_clear (accepts, '\0');
3493 }
3494#ifdef RE_ENABLE_I18N
3495 else if (type == OP_UTF8_PERIOD)
3496 {
3497 if (ASCII_CHARS % BITSET_WORD_BITS == 0)
3498 memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
3499 else
3500 bitset_merge (accepts, utf8_sb_map);
3501 if (!(dfa->syntax & RE_DOT_NEWLINE))
3502 bitset_clear (accepts, '\n');
3503 if (dfa->syntax & RE_DOT_NOT_NULL)
3504 bitset_clear (accepts, '\0');
3505 }
3506#endif
3507 else
3508 continue;
3509
3510 /* Check the 'accepts' and sift the characters which are not
3511 match it the context. */
3512 if (constraint)
3513 {
3514 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3515 {
3516 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3517 bitset_empty (accepts);
3518 if (accepts_newline)
3519 bitset_set (accepts, NEWLINE_CHAR);
3520 else
3521 continue;
3522 }
3523 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3524 {
3525 bitset_empty (accepts);
3526 continue;
3527 }
3528
3529 if (constraint & NEXT_WORD_CONSTRAINT)
3530 {
3531 bitset_word_t any_set = 0;
3532 if (type == CHARACTER && !node->word_char)
3533 {
3534 bitset_empty (accepts);
3535 continue;
3536 }
3537#ifdef RE_ENABLE_I18N
3538 if (dfa->mb_cur_max > 1)
3539 for (j = 0; j < BITSET_WORDS; ++j)
3540 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3541 else
3542#endif
3543 for (j = 0; j < BITSET_WORDS; ++j)
3544 any_set |= (accepts[j] &= dfa->word_char[j]);
3545 if (!any_set)
3546 continue;
3547 }
3548 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3549 {
3550 bitset_word_t any_set = 0;
3551 if (type == CHARACTER && node->word_char)
3552 {
3553 bitset_empty (accepts);
3554 continue;
3555 }
3556#ifdef RE_ENABLE_I18N
3557 if (dfa->mb_cur_max > 1)
3558 for (j = 0; j < BITSET_WORDS; ++j)
3559 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3560 else
3561#endif
3562 for (j = 0; j < BITSET_WORDS; ++j)
3563 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3564 if (!any_set)
3565 continue;
3566 }
3567 }
3568
3569 /* Then divide 'accepts' into DFA states, or create a new
3570 state. Above, we make sure that accepts is not empty. */
3571 for (j = 0; j < ndests; ++j)
3572 {
3573 bitset_t intersec; /* Intersection sets, see below. */
3574 bitset_t remains;
3575 /* Flags, see below. */
3576 bitset_word_t has_intersec, not_subset, not_consumed;
3577
3578 /* Optimization, skip if this state doesn't accept the character. */
3579 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3580 continue;
3581
3582 /* Enumerate the intersection set of this state and 'accepts'. */
3583 has_intersec = 0;
3584 for (k = 0; k < BITSET_WORDS; ++k)
3585 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3586 /* And skip if the intersection set is empty. */
3587 if (!has_intersec)
3588 continue;
3589
3590 /* Then check if this state is a subset of 'accepts'. */
3591 not_subset = not_consumed = 0;
3592 for (k = 0; k < BITSET_WORDS; ++k)
3593 {
3594 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3595 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3596 }
3597
3598 /* If this state isn't a subset of 'accepts', create a
3599 new group state, which has the 'remains'. */
3600 if (not_subset)
3601 {
3602 bitset_copy (dests_ch[ndests], remains);
3603 bitset_copy (dests_ch[j], intersec);
3604 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3605 if (__glibc_unlikely (err != REG_NOERROR))
3606 goto error_return;
3607 ++ndests;
3608 }
3609
3610 /* Put the position in the current group. */
3611 ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3612 if (__glibc_unlikely (! ok))
3613 goto error_return;
3614
3615 /* If all characters are consumed, go to next node. */
3616 if (!not_consumed)
3617 break;
3618 }
3619 /* Some characters remain, create a new group. */
3620 if (j == ndests)
3621 {
3622 bitset_copy (dests_ch[ndests], accepts);
3623 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3624 if (__glibc_unlikely (err != REG_NOERROR))
3625 goto error_return;
3626 ++ndests;
3627 bitset_empty (accepts);
3628 }
3629 }
3630 assume (ndests <= SBC_MAX);
3631 return ndests;
3632 error_return:
3633 for (j = 0; j < ndests; ++j)
3634 re_node_set_free (dests_node + j);
3635 return -1;
3636}
3637
3638#ifdef RE_ENABLE_I18N
3639/* Check how many bytes the node 'dfa->nodes[node_idx]' accepts.
3640 Return the number of the bytes the node accepts.
3641 STR_IDX is the current index of the input string.
3642
3643 This function handles the nodes which can accept one character, or
3644 one collating element like '.', '[a-z]', opposite to the other nodes
3645 can only accept one byte. */
3646
3647# ifdef _LIBC
3648# include <locale/weight.h>
3649# endif
3650
3651static int
3652check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
3653 const re_string_t *input, Idx str_idx)
3654{
3655 const re_token_t *node = dfa->nodes + node_idx;
3656 int char_len, elem_len;
3657 Idx i;
3658
3659 if (__glibc_unlikely (node->type == OP_UTF8_PERIOD))
3660 {
3661 unsigned char c = re_string_byte_at (input, str_idx), d;
3662 if (__glibc_likely (c < 0xc2))
3663 return 0;
3664
3665 if (str_idx + 2 > input->len)
3666 return 0;
3667
3668 d = re_string_byte_at (input, str_idx + 1);
3669 if (c < 0xe0)
3670 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3671 else if (c < 0xf0)
3672 {
3673 char_len = 3;
3674 if (c == 0xe0 && d < 0xa0)
3675 return 0;
3676 }
3677 else if (c < 0xf8)
3678 {
3679 char_len = 4;
3680 if (c == 0xf0 && d < 0x90)
3681 return 0;
3682 }
3683 else if (c < 0xfc)
3684 {
3685 char_len = 5;
3686 if (c == 0xf8 && d < 0x88)
3687 return 0;
3688 }
3689 else if (c < 0xfe)
3690 {
3691 char_len = 6;
3692 if (c == 0xfc && d < 0x84)
3693 return 0;
3694 }
3695 else
3696 return 0;
3697
3698 if (str_idx + char_len > input->len)
3699 return 0;
3700
3701 for (i = 1; i < char_len; ++i)
3702 {
3703 d = re_string_byte_at (input, str_idx + i);
3704 if (d < 0x80 || d > 0xbf)
3705 return 0;
3706 }
3707 return char_len;
3708 }
3709
3710 char_len = re_string_char_size_at (input, str_idx);
3711 if (node->type == OP_PERIOD)
3712 {
3713 if (char_len <= 1)
3714 return 0;
3715 /* FIXME: I don't think this if is needed, as both '\n'
3716 and '\0' are char_len == 1. */
3717 /* '.' accepts any one character except the following two cases. */
3718 if ((!(dfa->syntax & RE_DOT_NEWLINE)
3719 && re_string_byte_at (input, str_idx) == '\n')
3720 || ((dfa->syntax & RE_DOT_NOT_NULL)
3721 && re_string_byte_at (input, str_idx) == '\0'))
3722 return 0;
3723 return char_len;
3724 }
3725
3726 elem_len = re_string_elem_size_at (input, str_idx);
3727 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3728 return 0;
3729
3730 if (node->type == COMPLEX_BRACKET)
3731 {
3732 const re_charset_t *cset = node->opr.mbcset;
3733# ifdef _LIBC
3734 const unsigned char *pin
3735 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3736 Idx j;
3737 uint32_t nrules;
3738# endif /* _LIBC */
3739 int match_len = 0;
3740 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3741 ? re_string_wchar_at (input, str_idx) : 0);
3742
3743 /* match with multibyte character? */
3744 for (i = 0; i < cset->nmbchars; ++i)
3745 if (wc == cset->mbchars[i])
3746 {
3747 match_len = char_len;
3748 goto check_node_accept_bytes_match;
3749 }
3750 /* match with character_class? */
3751 for (i = 0; i < cset->nchar_classes; ++i)
3752 {
3753 wctype_t wt = cset->char_classes[i];
3754 if (__iswctype (wc, wt))
3755 {
3756 match_len = char_len;
3757 goto check_node_accept_bytes_match;
3758 }
3759 }
3760
3761# ifdef _LIBC
3762 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3763 if (nrules != 0)
3764 {
3765 unsigned int in_collseq = 0;
3766 const int32_t *table, *indirect;
3767 const unsigned char *weights, *extra;
3768 const char *collseqwc;
3769
3770 /* match with collating_symbol? */
3771 if (cset->ncoll_syms)
3772 extra = (const unsigned char *)
3773 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3774 for (i = 0; i < cset->ncoll_syms; ++i)
3775 {
3776 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3777 /* Compare the length of input collating element and
3778 the length of current collating element. */
3779 if (*coll_sym != elem_len)
3780 continue;
3781 /* Compare each bytes. */
3782 for (j = 0; j < *coll_sym; j++)
3783 if (pin[j] != coll_sym[1 + j])
3784 break;
3785 if (j == *coll_sym)
3786 {
3787 /* Match if every bytes is equal. */
3788 match_len = j;
3789 goto check_node_accept_bytes_match;
3790 }
3791 }
3792
3793 if (cset->nranges)
3794 {
3795 if (elem_len <= char_len)
3796 {
3797 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3798 in_collseq = __collseq_table_lookup (collseqwc, wc);
3799 }
3800 else
3801 in_collseq = find_collation_sequence_value (pin, elem_len);
3802 }
3803 /* match with range expression? */
3804 /* FIXME: Implement rational ranges here, too. */
3805 for (i = 0; i < cset->nranges; ++i)
3806 if (cset->range_starts[i] <= in_collseq
3807 && in_collseq <= cset->range_ends[i])
3808 {
3809 match_len = elem_len;
3810 goto check_node_accept_bytes_match;
3811 }
3812
3813 /* match with equivalence_class? */
3814 if (cset->nequiv_classes)
3815 {
3816 const unsigned char *cp = pin;
3817 table = (const int32_t *)
3818 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3819 weights = (const unsigned char *)
3820 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3821 extra = (const unsigned char *)
3822 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3823 indirect = (const int32_t *)
3824 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3825 int32_t idx = findidx (table, indirect, extra, &cp, elem_len);
3826 int32_t rule = idx >> 24;
3827 idx &= 0xffffff;
3828 if (idx > 0)
3829 {
3830 size_t weight_len = weights[idx];
3831 for (i = 0; i < cset->nequiv_classes; ++i)
3832 {
3833 int32_t equiv_class_idx = cset->equiv_classes[i];
3834 int32_t equiv_class_rule = equiv_class_idx >> 24;
3835 equiv_class_idx &= 0xffffff;
3836 if (weights[equiv_class_idx] == weight_len
3837 && equiv_class_rule == rule
3838 && memcmp (weights + idx + 1,
3839 weights + equiv_class_idx + 1,
3840 weight_len) == 0)
3841 {
3842 match_len = elem_len;
3843 goto check_node_accept_bytes_match;
3844 }
3845 }
3846 }
3847 }
3848 }
3849 else
3850# endif /* _LIBC */
3851 {
3852 /* match with range expression? */
3853 for (i = 0; i < cset->nranges; ++i)
3854 {
3855 if (cset->range_starts[i] <= wc && wc <= cset->range_ends[i])
3856 {
3857 match_len = char_len;
3858 goto check_node_accept_bytes_match;
3859 }
3860 }
3861 }
3862 check_node_accept_bytes_match:
3863 if (!cset->non_match)
3864 return match_len;
3865 else
3866 {
3867 if (match_len > 0)
3868 return 0;
3869 else
3870 return (elem_len > char_len) ? elem_len : char_len;
3871 }
3872 }
3873 return 0;
3874}
3875
3876# ifdef _LIBC
3877static unsigned int
3878find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3879{
3880 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3881 if (nrules == 0)
3882 {
3883 if (mbs_len == 1)
3884 {
3885 /* No valid character. Match it as a single byte character. */
3886 const unsigned char *collseq = (const unsigned char *)
3887 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3888 return collseq[mbs[0]];
3889 }
3890 return UINT_MAX;
3891 }
3892 else
3893 {
3894 int32_t idx;
3895 const unsigned char *extra = (const unsigned char *)
3896 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3897 int32_t extrasize = (const unsigned char *)
3898 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3899
3900 for (idx = 0; idx < extrasize;)
3901 {
3902 int mbs_cnt;
3903 bool found = false;
3904 int32_t elem_mbs_len;
3905 /* Skip the name of collating element name. */
3906 idx = idx + extra[idx] + 1;
3907 elem_mbs_len = extra[idx++];
3908 if (mbs_len == elem_mbs_len)
3909 {
3910 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3911 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3912 break;
3913 if (mbs_cnt == elem_mbs_len)
3914 /* Found the entry. */
3915 found = true;
3916 }
3917 /* Skip the byte sequence of the collating element. */
3918 idx += elem_mbs_len;
3919 /* Adjust for the alignment. */
3920 idx = (idx + 3) & ~3;
3921 /* Skip the collation sequence value. */
3922 idx += sizeof (uint32_t);
3923 /* Skip the wide char sequence of the collating element. */
3924 idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
3925 /* If we found the entry, return the sequence value. */
3926 if (found)
3927 return *(uint32_t *) (extra + idx);
3928 /* Skip the collation sequence value. */
3929 idx += sizeof (uint32_t);
3930 }
3931 return UINT_MAX;
3932 }
3933}
3934# endif /* _LIBC */
3935#endif /* RE_ENABLE_I18N */
3936
3937/* Check whether the node accepts the byte which is IDX-th
3938 byte of the INPUT. */
3939
3940static bool
3941check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
3942 Idx idx)
3943{
3944 unsigned char ch;
3945 ch = re_string_byte_at (&mctx->input, idx);
3946 switch (node->type)
3947 {
3948 case CHARACTER:
3949 if (node->opr.c != ch)
3950 return false;
3951 break;
3952
3953 case SIMPLE_BRACKET:
3954 if (!bitset_contain (node->opr.sbcset, ch))
3955 return false;
3956 break;
3957
3958#ifdef RE_ENABLE_I18N
3959 case OP_UTF8_PERIOD:
3960 if (ch >= ASCII_CHARS)
3961 return false;
3962 FALLTHROUGH;
3963#endif
3964 case OP_PERIOD:
3965 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
3966 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
3967 return false;
3968 break;
3969
3970 default:
3971 return false;
3972 }
3973
3974 if (node->constraint)
3975 {
3976 /* The node has constraints. Check whether the current context
3977 satisfies the constraints. */
3978 unsigned int context = re_string_context_at (&mctx->input, idx,
3979 mctx->eflags);
3980 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
3981 return false;
3982 }
3983
3984 return true;
3985}
3986
3987/* Extend the buffers, if the buffers have run out. */
3988
3989static reg_errcode_t
3990__attribute_warn_unused_result__
3991extend_buffers (re_match_context_t *mctx, int min_len)
3992{
3993 reg_errcode_t ret;
3994 re_string_t *pstr = &mctx->input;
3995
3996 /* Avoid overflow. */
3997 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) / 2
3998 <= pstr->bufs_len))
3999 return REG_ESPACE;
4000
4001 /* Double the lengths of the buffers, but allocate at least MIN_LEN. */
4002 ret = re_string_realloc_buffers (pstr,
4003 MAX (min_len,
4004 MIN (pstr->len, pstr->bufs_len * 2)));
4005 if (__glibc_unlikely (ret != REG_NOERROR))
4006 return ret;
4007
4008 if (mctx->state_log != NULL)
4009 {
4010 /* And double the length of state_log. */
4011 /* XXX We have no indication of the size of this buffer. If this
4012 allocation fail we have no indication that the state_log array
4013 does not have the right size. */
4014 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4015 pstr->bufs_len + 1);
4016 if (__glibc_unlikely (new_array == NULL))
4017 return REG_ESPACE;
4018 mctx->state_log = new_array;
4019 }
4020
4021 /* Then reconstruct the buffers. */
4022 if (pstr->icase)
4023 {
4024#ifdef RE_ENABLE_I18N
4025 if (pstr->mb_cur_max > 1)
4026 {
4027 ret = build_wcs_upper_buffer (pstr);
4028 if (__glibc_unlikely (ret != REG_NOERROR))
4029 return ret;
4030 }
4031 else
4032#endif /* RE_ENABLE_I18N */
4033 build_upper_buffer (pstr);
4034 }
4035 else
4036 {
4037#ifdef RE_ENABLE_I18N
4038 if (pstr->mb_cur_max > 1)
4039 build_wcs_buffer (pstr);
4040 else
4041#endif /* RE_ENABLE_I18N */
4042 {
4043 if (pstr->trans != NULL)
4044 re_string_translate_buffer (pstr);
4045 }
4046 }
4047 return REG_NOERROR;
4048}
4049
4050
4051
4052/* Functions for matching context. */
4053
4054/* Initialize MCTX. */
4055
4056static reg_errcode_t
4057__attribute_warn_unused_result__
4058match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4059{
4060 mctx->eflags = eflags;
4061 mctx->match_last = -1;
4062 if (n > 0)
4063 {
4064 /* Avoid overflow. */
4065 size_t max_object_size =
4066 MAX (sizeof (struct re_backref_cache_entry),
4067 sizeof (re_sub_match_top_t *));
4068 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) < n))
4069 return REG_ESPACE;
4070
4071 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4072 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4073 if (__glibc_unlikely (mctx->bkref_ents == NULL || mctx->sub_tops == NULL))
4074 return REG_ESPACE;
4075 }
4076 /* Already zero-ed by the caller.
4077 else
4078 mctx->bkref_ents = NULL;
4079 mctx->nbkref_ents = 0;
4080 mctx->nsub_tops = 0; */
4081 mctx->abkref_ents = n;
4082 mctx->max_mb_elem_len = 1;
4083 mctx->asub_tops = n;
4084 return REG_NOERROR;
4085}
4086
4087/* Clean the entries which depend on the current input in MCTX.
4088 This function must be invoked when the matcher changes the start index
4089 of the input, or changes the input string. */
4090
4091static void
4092match_ctx_clean (re_match_context_t *mctx)
4093{
4094 Idx st_idx;
4095 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4096 {
4097 Idx sl_idx;
4098 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4099 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4100 {
4101 re_sub_match_last_t *last = top->lasts[sl_idx];
4102 re_free (last->path.array);
4103 re_free (last);
4104 }
4105 re_free (top->lasts);
4106 if (top->path)
4107 {
4108 re_free (top->path->array);
4109 re_free (top->path);
4110 }
4111 re_free (top);
4112 }
4113
4114 mctx->nsub_tops = 0;
4115 mctx->nbkref_ents = 0;
4116}
4117
4118/* Free all the memory associated with MCTX. */
4119
4120static void
4121match_ctx_free (re_match_context_t *mctx)
4122{
4123 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4124 match_ctx_clean (mctx);
4125 re_free (mctx->sub_tops);
4126 re_free (mctx->bkref_ents);
4127}
4128
4129/* Add a new backreference entry to MCTX.
4130 Note that we assume that caller never call this function with duplicate
4131 entry, and call with STR_IDX which isn't smaller than any existing entry.
4132*/
4133
4134static reg_errcode_t
4135__attribute_warn_unused_result__
4136match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
4137 Idx to)
4138{
4139 if (mctx->nbkref_ents >= mctx->abkref_ents)
4140 {
4141 struct re_backref_cache_entry* new_entry;
4142 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4143 mctx->abkref_ents * 2);
4144 if (__glibc_unlikely (new_entry == NULL))
4145 {
4146 re_free (mctx->bkref_ents);
4147 return REG_ESPACE;
4148 }
4149 mctx->bkref_ents = new_entry;
4150 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4151 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4152 mctx->abkref_ents *= 2;
4153 }
4154 if (mctx->nbkref_ents > 0
4155 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4156 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4157
4158 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4159 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4160 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4161 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4162
4163 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4164 If bit N is clear, means that this entry won't epsilon-transition to
4165 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4166 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4167 such node.
4168
4169 A backreference does not epsilon-transition unless it is empty, so set
4170 to all zeros if FROM != TO. */
4171 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4172 = (from == to ? -1 : 0);
4173
4174 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4175 if (mctx->max_mb_elem_len < to - from)
4176 mctx->max_mb_elem_len = to - from;
4177 return REG_NOERROR;
4178}
4179
4180/* Return the first entry with the same str_idx, or -1 if none is
4181 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4182
4183static Idx
4184search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4185{
4186 Idx left, right, mid, last;
4187 last = right = mctx->nbkref_ents;
4188 for (left = 0; left < right;)
4189 {
4190 mid = (left + right) / 2;
4191 if (mctx->bkref_ents[mid].str_idx < str_idx)
4192 left = mid + 1;
4193 else
4194 right = mid;
4195 }
4196 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4197 return left;
4198 else
4199 return -1;
4200}
4201
4202/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4203 at STR_IDX. */
4204
4205static reg_errcode_t
4206__attribute_warn_unused_result__
4207match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4208{
4209 DEBUG_ASSERT (mctx->sub_tops != NULL);
4210 DEBUG_ASSERT (mctx->asub_tops > 0);
4211 if (__glibc_unlikely (mctx->nsub_tops == mctx->asub_tops))
4212 {
4213 Idx new_asub_tops = mctx->asub_tops * 2;
4214 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4215 re_sub_match_top_t *,
4216 new_asub_tops);
4217 if (__glibc_unlikely (new_array == NULL))
4218 return REG_ESPACE;
4219 mctx->sub_tops = new_array;
4220 mctx->asub_tops = new_asub_tops;
4221 }
4222 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4223 if (__glibc_unlikely (mctx->sub_tops[mctx->nsub_tops] == NULL))
4224 return REG_ESPACE;
4225 mctx->sub_tops[mctx->nsub_tops]->node = node;
4226 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4227 return REG_NOERROR;
4228}
4229
4230/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4231 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.
4232 Return the new entry if successful, NULL if memory is exhausted. */
4233
4234static re_sub_match_last_t *
4235match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4236{
4237 re_sub_match_last_t *new_entry;
4238 if (__glibc_unlikely (subtop->nlasts == subtop->alasts))
4239 {
4240 Idx new_alasts = 2 * subtop->alasts + 1;
4241 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4242 re_sub_match_last_t *,
4243 new_alasts);
4244 if (__glibc_unlikely (new_array == NULL))
4245 return NULL;
4246 subtop->lasts = new_array;
4247 subtop->alasts = new_alasts;
4248 }
4249 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4250 if (__glibc_likely (new_entry != NULL))
4251 {
4252 subtop->lasts[subtop->nlasts] = new_entry;
4253 new_entry->node = node;
4254 new_entry->str_idx = str_idx;
4255 ++subtop->nlasts;
4256 }
4257 return new_entry;
4258}
4259
4260static void
4261sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4262 re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
4263{
4264 sctx->sifted_states = sifted_sts;
4265 sctx->limited_states = limited_sts;
4266 sctx->last_node = last_node;
4267 sctx->last_str_idx = last_str_idx;
4268 re_node_set_init_empty (&sctx->limits);
4269}
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette