VirtualBox

source: kBuild/vendor/sed/4.1.5/lib/regexec.c

Last change on this file was 599, checked in by bird, 18 years ago

GNU sed 4.1.5.

File size: 125.2 KB
Line 
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <[email protected]>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
22 int n) internal_function;
23static void match_ctx_clean (re_match_context_t *mctx) internal_function;
24static void match_ctx_free (re_match_context_t *cache) internal_function;
25static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
26 int str_idx, int from, int to)
27 internal_function;
28static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
29 internal_function;
30static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
31 int str_idx) internal_function;
32static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
33 int node, int str_idx)
34 internal_function;
35static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
36 re_dfastate_t **limited_sts, int last_node,
37 int last_str_idx)
38 internal_function;
39static reg_errcode_t re_search_internal (const regex_t *preg,
40 const char *string, int length,
41 int start, int range, int stop,
42 size_t nmatch, regmatch_t pmatch[],
43 int eflags) internal_function;
44static int re_search_2_stub (struct re_pattern_buffer *bufp,
45 const char *string1, int length1,
46 const char *string2, int length2,
47 int start, int range, struct re_registers *regs,
48 int stop, int ret_len) internal_function;
49static int re_search_stub (struct re_pattern_buffer *bufp,
50 const char *string, int length, int start,
51 int range, int stop, struct re_registers *regs,
52 int ret_len) internal_function;
53static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
54 int nregs, int regs_allocated) internal_function;
55static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
56 internal_function;
57static int check_matching (re_match_context_t *mctx, int fl_longest_match,
58 int *p_match_first) internal_function;
59static int check_halt_state_context (const re_match_context_t *mctx,
60 const re_dfastate_t *state, int idx)
61 internal_function;
62static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
63 regmatch_t *prev_idx_match, int cur_node,
64 int cur_idx, int nmatch) internal_function;
65static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
66 int str_idx, int dest_node, int nregs,
67 regmatch_t *regs,
68 re_node_set *eps_via_nodes)
69 internal_function;
70static reg_errcode_t set_regs (const regex_t *preg,
71 const re_match_context_t *mctx,
72 size_t nmatch, regmatch_t *pmatch,
73 int fl_backtrack) internal_function;
74static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
75 internal_function;
76
77#ifdef RE_ENABLE_I18N
78static int sift_states_iter_mb (const re_match_context_t *mctx,
79 re_sift_context_t *sctx,
80 int node_idx, int str_idx, int max_str_idx)
81 internal_function;
82#endif /* RE_ENABLE_I18N */
83static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
84 re_sift_context_t *sctx)
85 internal_function;
86static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
87 re_sift_context_t *sctx, int str_idx,
88 re_node_set *cur_dest)
89 internal_function;
90static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
91 re_sift_context_t *sctx,
92 int str_idx,
93 re_node_set *dest_nodes)
94 internal_function;
95static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
96 re_node_set *dest_nodes,
97 const re_node_set *candidates)
98 internal_function;
99static int check_dst_limits (const re_match_context_t *mctx,
100 re_node_set *limits,
101 int dst_node, int dst_idx, int src_node,
102 int src_idx) internal_function;
103static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
104 int boundaries, int subexp_idx,
105 int from_node, int bkref_idx)
106 internal_function;
107static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
108 int limit, int subexp_idx,
109 int node, int str_idx,
110 int bkref_idx) internal_function;
111static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
112 re_node_set *dest_nodes,
113 const re_node_set *candidates,
114 re_node_set *limits,
115 struct re_backref_cache_entry *bkref_ents,
116 int str_idx) internal_function;
117static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
118 re_sift_context_t *sctx,
119 int str_idx, const re_node_set *candidates)
120 internal_function;
121static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
122 re_dfastate_t **dst,
123 re_dfastate_t **src, int num)
124 internal_function;
125static re_dfastate_t *find_recover_state (reg_errcode_t *err,
126 re_match_context_t *mctx) internal_function;
127static re_dfastate_t *transit_state (reg_errcode_t *err,
128 re_match_context_t *mctx,
129 re_dfastate_t *state) internal_function;
130static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
131 re_match_context_t *mctx,
132 re_dfastate_t *next_state)
133 internal_function;
134static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
135 re_node_set *cur_nodes,
136 int str_idx) internal_function;
137#if 0
138static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
139 re_match_context_t *mctx,
140 re_dfastate_t *pstate)
141 internal_function;
142#endif
143#ifdef RE_ENABLE_I18N
144static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
145 re_dfastate_t *pstate)
146 internal_function;
147#endif /* RE_ENABLE_I18N */
148static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
149 const re_node_set *nodes)
150 internal_function;
151static reg_errcode_t get_subexp (re_match_context_t *mctx,
152 int bkref_node, int bkref_str_idx)
153 internal_function;
154static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
155 const re_sub_match_top_t *sub_top,
156 re_sub_match_last_t *sub_last,
157 int bkref_node, int bkref_str)
158 internal_function;
159static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
160 int subexp_idx, int type) internal_function;
161static reg_errcode_t check_arrival (re_match_context_t *mctx,
162 state_array_t *path, int top_node,
163 int top_str, int last_node, int last_str,
164 int type) internal_function;
165static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
166 int str_idx,
167 re_node_set *cur_nodes,
168 re_node_set *next_nodes)
169 internal_function;
170static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
171 re_node_set *cur_nodes,
172 int ex_subexp, int type)
173 internal_function;
174static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
175 re_node_set *dst_nodes,
176 int target, int ex_subexp,
177 int type) internal_function;
178static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
179 re_node_set *cur_nodes, int cur_str,
180 int subexp_num, int type)
181 internal_function;
182static int build_trtable (const re_dfa_t *dfa,
183 re_dfastate_t *state) internal_function;
184#ifdef RE_ENABLE_I18N
185static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
186 const re_string_t *input, int idx)
187 internal_function;
188# ifdef _LIBC
189static unsigned int find_collation_sequence_value (const unsigned char *mbs,
190 size_t name_len)
191 internal_function;
192# endif /* _LIBC */
193#endif /* RE_ENABLE_I18N */
194static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
195 const re_dfastate_t *state,
196 re_node_set *states_node,
197 bitset_t *states_ch) internal_function;
198static int check_node_accept (const re_match_context_t *mctx,
199 const re_token_t *node, int idx)
200 internal_function;
201static reg_errcode_t extend_buffers (re_match_context_t *mctx)
202 internal_function;
203
204
205/* Entry point for POSIX code. */
206
207/* regexec searches for a given pattern, specified by PREG, in the
208 string STRING.
209
210 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
211 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
212 least NMATCH elements, and we set them to the offsets of the
213 corresponding matched substrings.
214
215 EFLAGS specifies `execution flags' which affect matching: if
216 REG_NOTBOL is set, then ^ does not match at the beginning of the
217 string; if REG_NOTEOL is set, then $ does not match at the end.
218
219 We return 0 if we find a match and REG_NOMATCH if not. */
220
221int
222regexec (preg, string, nmatch, pmatch, eflags)
223 const regex_t *__restrict preg;
224 const char *__restrict string;
225 size_t nmatch;
226 regmatch_t pmatch[];
227 int eflags;
228{
229 reg_errcode_t err;
230 int start, length;
231 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
232
233 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
234 return REG_BADPAT;
235
236 if (eflags & REG_STARTEND)
237 {
238 start = pmatch[0].rm_so;
239 length = pmatch[0].rm_eo;
240 }
241 else
242 {
243 start = 0;
244 length = strlen (string);
245 }
246
247 __libc_lock_lock (dfa->lock);
248 if (preg->no_sub)
249 err = re_search_internal (preg, string, length, start, length - start,
250 length, 0, NULL, eflags);
251 else
252 err = re_search_internal (preg, string, length, start, length - start,
253 length, nmatch, pmatch, eflags);
254 __libc_lock_unlock (dfa->lock);
255 return err != REG_NOERROR;
256}
257
258#ifdef _LIBC
259# include <shlib-compat.h>
260versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
261
262# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
263__typeof__ (__regexec) __compat_regexec;
264
265int
266attribute_compat_text_section
267__compat_regexec (const regex_t *__restrict preg,
268 const char *__restrict string, size_t nmatch,
269 regmatch_t pmatch[], int eflags)
270{
271 return regexec (preg, string, nmatch, pmatch,
272 eflags & (REG_NOTBOL | REG_NOTEOL));
273}
274compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
275# endif
276#endif
277
278/* Entry points for GNU code. */
279
280/* re_match, re_search, re_match_2, re_search_2
281
282 The former two functions operate on STRING with length LENGTH,
283 while the later two operate on concatenation of STRING1 and STRING2
284 with lengths LENGTH1 and LENGTH2, respectively.
285
286 re_match() matches the compiled pattern in BUFP against the string,
287 starting at index START.
288
289 re_search() first tries matching at index START, then it tries to match
290 starting from index START + 1, and so on. The last start position tried
291 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
292 way as re_match().)
293
294 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
295 the first STOP characters of the concatenation of the strings should be
296 concerned.
297
298 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
299 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
300 computed relative to the concatenation, not relative to the individual
301 strings.)
302
303 On success, re_match* functions return the length of the match, re_search*
304 return the position of the start of the match. Return value -1 means no
305 match was found and -2 indicates an internal error. */
306
307int
308re_match (bufp, string, length, start, regs)
309 struct re_pattern_buffer *bufp;
310 const char *string;
311 int length, start;
312 struct re_registers *regs;
313{
314 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
315}
316#ifdef _LIBC
317weak_alias (__re_match, re_match)
318#endif
319
320int
321re_search (bufp, string, length, start, range, regs)
322 struct re_pattern_buffer *bufp;
323 const char *string;
324 int length, start, range;
325 struct re_registers *regs;
326{
327 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
328}
329#ifdef _LIBC
330weak_alias (__re_search, re_search)
331#endif
332
333int
334re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
335 struct re_pattern_buffer *bufp;
336 const char *string1, *string2;
337 int length1, length2, start, stop;
338 struct re_registers *regs;
339{
340 return re_search_2_stub (bufp, string1, length1, string2, length2,
341 start, 0, regs, stop, 1);
342}
343#ifdef _LIBC
344weak_alias (__re_match_2, re_match_2)
345#endif
346
347int
348re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
349 struct re_pattern_buffer *bufp;
350 const char *string1, *string2;
351 int length1, length2, start, range, stop;
352 struct re_registers *regs;
353{
354 return re_search_2_stub (bufp, string1, length1, string2, length2,
355 start, range, regs, stop, 0);
356}
357#ifdef _LIBC
358weak_alias (__re_search_2, re_search_2)
359#endif
360
361static int
362re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
363 stop, ret_len)
364 struct re_pattern_buffer *bufp;
365 const char *string1, *string2;
366 int length1, length2, start, range, stop, ret_len;
367 struct re_registers *regs;
368{
369 const char *str;
370 int rval;
371 int len = length1 + length2;
372 int free_str = 0;
373
374 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
375 return -2;
376
377 /* Concatenate the strings. */
378 if (length2 > 0)
379 if (length1 > 0)
380 {
381 char *s = re_malloc (char, len);
382
383 if (BE (s == NULL, 0))
384 return -2;
385#ifdef _LIBC
386 memcpy (__mempcpy (s, string1, length1), string2, length2);
387#else
388 memcpy (s, string1, length1);
389 memcpy (s + length1, string2, length2);
390#endif
391 str = s;
392 free_str = 1;
393 }
394 else
395 str = string2;
396 else
397 str = string1;
398
399 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
400 ret_len);
401 if (free_str)
402 re_free ((char *) str);
403 return rval;
404}
405
406/* The parameters have the same meaning as those of re_search.
407 Additional parameters:
408 If RET_LEN is nonzero the length of the match is returned (re_match style);
409 otherwise the position of the match is returned. */
410
411static int
412re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
413 struct re_pattern_buffer *bufp;
414 const char *string;
415 int length, start, range, stop, ret_len;
416 struct re_registers *regs;
417{
418 reg_errcode_t result;
419 regmatch_t *pmatch;
420 int nregs, rval;
421 int eflags = 0;
422 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
423
424 /* Check for out-of-range. */
425 if (BE (start < 0 || start > length, 0))
426 return -1;
427 if (BE (start + range > length, 0))
428 range = length - start;
429 else if (BE (start + range < 0, 0))
430 range = -start;
431
432 __libc_lock_lock (dfa->lock);
433
434 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
435 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
436
437 /* Compile fastmap if we haven't yet. */
438 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
439 re_compile_fastmap (bufp);
440
441 if (BE (bufp->no_sub, 0))
442 regs = NULL;
443
444 /* We need at least 1 register. */
445 if (regs == NULL)
446 nregs = 1;
447 else if (BE (bufp->regs_allocated == REGS_FIXED &&
448 regs->num_regs < bufp->re_nsub + 1, 0))
449 {
450 nregs = regs->num_regs;
451 if (BE (nregs < 1, 0))
452 {
453 /* Nothing can be copied to regs. */
454 regs = NULL;
455 nregs = 1;
456 }
457 }
458 else
459 nregs = bufp->re_nsub + 1;
460 pmatch = re_malloc (regmatch_t, nregs);
461 if (BE (pmatch == NULL, 0))
462 {
463 rval = -2;
464 goto out;
465 }
466
467 result = re_search_internal (bufp, string, length, start, range, stop,
468 nregs, pmatch, eflags);
469
470 rval = 0;
471
472 /* I hope we needn't fill ther regs with -1's when no match was found. */
473 if (result != REG_NOERROR)
474 rval = -1;
475 else if (regs != NULL)
476 {
477 /* If caller wants register contents data back, copy them. */
478 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
479 bufp->regs_allocated);
480 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
481 rval = -2;
482 }
483
484 if (BE (rval == 0, 1))
485 {
486 if (ret_len)
487 {
488 assert (pmatch[0].rm_so == start);
489 rval = pmatch[0].rm_eo - start;
490 }
491 else
492 rval = pmatch[0].rm_so;
493 }
494 re_free (pmatch);
495 out:
496 __libc_lock_unlock (dfa->lock);
497 return rval;
498}
499
500static unsigned
501re_copy_regs (regs, pmatch, nregs, regs_allocated)
502 struct re_registers *regs;
503 regmatch_t *pmatch;
504 int nregs, regs_allocated;
505{
506 int rval = REGS_REALLOCATE;
507 int i;
508 int need_regs = nregs + 1;
509 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
510 uses. */
511
512 /* Have the register data arrays been allocated? */
513 if (regs_allocated == REGS_UNALLOCATED)
514 { /* No. So allocate them with malloc. */
515 regs->start = re_malloc (regoff_t, need_regs);
516 regs->end = re_malloc (regoff_t, need_regs);
517 if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
518 return REGS_UNALLOCATED;
519 regs->num_regs = need_regs;
520 }
521 else if (regs_allocated == REGS_REALLOCATE)
522 { /* Yes. If we need more elements than were already
523 allocated, reallocate them. If we need fewer, just
524 leave it alone. */
525 if (BE (need_regs > regs->num_regs, 0))
526 {
527 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
528 regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
529 if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
530 return REGS_UNALLOCATED;
531 regs->start = new_start;
532 regs->end = new_end;
533 regs->num_regs = need_regs;
534 }
535 }
536 else
537 {
538 assert (regs_allocated == REGS_FIXED);
539 /* This function may not be called with REGS_FIXED and nregs too big. */
540 assert (regs->num_regs >= nregs);
541 rval = REGS_FIXED;
542 }
543
544 /* Copy the regs. */
545 for (i = 0; i < nregs; ++i)
546 {
547 regs->start[i] = pmatch[i].rm_so;
548 regs->end[i] = pmatch[i].rm_eo;
549 }
550 for ( ; i < regs->num_regs; ++i)
551 regs->start[i] = regs->end[i] = -1;
552
553 return rval;
554}
555
556/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
557 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
558 this memory for recording register information. STARTS and ENDS
559 must be allocated using the malloc library routine, and must each
560 be at least NUM_REGS * sizeof (regoff_t) bytes long.
561
562 If NUM_REGS == 0, then subsequent matches should allocate their own
563 register data.
564
565 Unless this function is called, the first search or match using
566 PATTERN_BUFFER will allocate its own register data, without
567 freeing the old data. */
568
569void
570re_set_registers (bufp, regs, num_regs, starts, ends)
571 struct re_pattern_buffer *bufp;
572 struct re_registers *regs;
573 unsigned num_regs;
574 regoff_t *starts, *ends;
575{
576 if (num_regs)
577 {
578 bufp->regs_allocated = REGS_REALLOCATE;
579 regs->num_regs = num_regs;
580 regs->start = starts;
581 regs->end = ends;
582 }
583 else
584 {
585 bufp->regs_allocated = REGS_UNALLOCATED;
586 regs->num_regs = 0;
587 regs->start = regs->end = (regoff_t *) 0;
588 }
589}
590#ifdef _LIBC
591weak_alias (__re_set_registers, re_set_registers)
592#endif
593
594
595/* Entry points compatible with 4.2 BSD regex library. We don't define
596 them unless specifically requested. */
597
598#if defined _REGEX_RE_COMP || defined _LIBC
599int
600# ifdef _LIBC
601weak_function
602# endif
603re_exec (s)
604 const char *s;
605{
606 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
607}
608#endif /* _REGEX_RE_COMP */
609
610
611/* Internal entry point. */
612
613/* Searches for a compiled pattern PREG in the string STRING, whose
614 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
615 mingings with regexec. START, and RANGE have the same meanings
616 with re_search.
617 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
618 otherwise return the error code.
619 Note: We assume front end functions already check ranges.
620 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
621
622static reg_errcode_t
623re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
624 eflags)
625 const regex_t *preg;
626 const char *string;
627 int length, start, range, stop, eflags;
628 size_t nmatch;
629 regmatch_t pmatch[];
630{
631 reg_errcode_t err;
632 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
633 int left_lim, right_lim, incr;
634 int fl_longest_match, match_first, match_kind, match_last = -1;
635 int extra_nmatch;
636 int sb, ch;
637#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
638 re_match_context_t mctx = { .dfa = dfa };
639#else
640 re_match_context_t mctx;
641#endif
642 char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
643 && range && !preg->can_be_null) ? preg->fastmap : NULL;
644 RE_TRANSLATE_TYPE t = preg->translate;
645
646#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
647 memset (&mctx, '\0', sizeof (re_match_context_t));
648 mctx.dfa = dfa;
649#endif
650
651 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
652 nmatch -= extra_nmatch;
653
654 /* Check if the DFA haven't been compiled. */
655 if (BE (preg->used == 0 || dfa->init_state == NULL
656 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
657 || dfa->init_state_begbuf == NULL, 0))
658 return REG_NOMATCH;
659
660#ifdef DEBUG
661 /* We assume front-end functions already check them. */
662 assert (start + range >= 0 && start + range <= length);
663#endif
664
665 /* If initial states with non-begbuf contexts have no elements,
666 the regex must be anchored. If preg->newline_anchor is set,
667 we'll never use init_state_nl, so do not check it. */
668 if (dfa->init_state->nodes.nelem == 0
669 && dfa->init_state_word->nodes.nelem == 0
670 && (dfa->init_state_nl->nodes.nelem == 0
671 || !preg->newline_anchor))
672 {
673 if (start != 0 && start + range != 0)
674 return REG_NOMATCH;
675 start = range = 0;
676 }
677
678 /* We must check the longest matching, if nmatch > 0. */
679 fl_longest_match = (nmatch != 0 || dfa->nbackref);
680
681 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
682 preg->translate, preg->syntax & RE_ICASE, dfa);
683 if (BE (err != REG_NOERROR, 0))
684 goto free_return;
685 mctx.input.stop = stop;
686 mctx.input.raw_stop = stop;
687 mctx.input.newline_anchor = preg->newline_anchor;
688
689 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
690 if (BE (err != REG_NOERROR, 0))
691 goto free_return;
692
693 /* We will log all the DFA states through which the dfa pass,
694 if nmatch > 1, or this dfa has "multibyte node", which is a
695 back-reference or a node which can accept multibyte character or
696 multi character collating element. */
697 if (nmatch > 1 || dfa->has_mb_node)
698 {
699 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
700 if (BE (mctx.state_log == NULL, 0))
701 {
702 err = REG_ESPACE;
703 goto free_return;
704 }
705 }
706 else
707 mctx.state_log = NULL;
708
709 match_first = start;
710 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
711 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
712
713 /* Check incrementally whether of not the input string match. */
714 incr = (range < 0) ? -1 : 1;
715 left_lim = (range < 0) ? start + range : start;
716 right_lim = (range < 0) ? start : start + range;
717 sb = dfa->mb_cur_max == 1;
718 match_kind =
719 (fastmap
720 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
721 | (range >= 0 ? 2 : 0)
722 | (t != NULL ? 1 : 0))
723 : 8);
724
725 for (;; match_first += incr)
726 {
727 err = REG_NOMATCH;
728 if (match_first < left_lim || right_lim < match_first)
729 goto free_return;
730
731 /* Advance as rapidly as possible through the string, until we
732 find a plausible place to start matching. This may be done
733 with varying efficiency, so there are various possibilities:
734 only the most common of them are specialized, in order to
735 save on code size. We use a switch statement for speed. */
736 switch (match_kind)
737 {
738 case 8:
739 /* No fastmap. */
740 break;
741
742 case 7:
743 /* Fastmap with single-byte translation, match forward. */
744 while (BE (match_first < right_lim, 1)
745 && !fastmap[t[(unsigned char) string[match_first]]])
746 ++match_first;
747 goto forward_match_found_start_or_reached_end;
748
749 case 6:
750 /* Fastmap without translation, match forward. */
751 while (BE (match_first < right_lim, 1)
752 && !fastmap[(unsigned char) string[match_first]])
753 ++match_first;
754
755 forward_match_found_start_or_reached_end:
756 if (BE (match_first == right_lim, 0))
757 {
758 ch = match_first >= length
759 ? 0 : (unsigned char) string[match_first];
760 if (!fastmap[t ? t[ch] : ch])
761 goto free_return;
762 }
763 break;
764
765 case 4:
766 case 5:
767 /* Fastmap without multi-byte translation, match backwards. */
768 while (match_first >= left_lim)
769 {
770 ch = match_first >= length
771 ? 0 : (unsigned char) string[match_first];
772 if (fastmap[t ? t[ch] : ch])
773 break;
774 --match_first;
775 }
776 if (match_first < left_lim)
777 goto free_return;
778 break;
779
780 default:
781 /* In this case, we can't determine easily the current byte,
782 since it might be a component byte of a multibyte
783 character. Then we use the constructed buffer instead. */
784 for (;;)
785 {
786 /* If MATCH_FIRST is out of the valid range, reconstruct the
787 buffers. */
788 unsigned int offset = match_first - mctx.input.raw_mbs_idx;
789 if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
790 {
791 err = re_string_reconstruct (&mctx.input, match_first,
792 eflags);
793 if (BE (err != REG_NOERROR, 0))
794 goto free_return;
795
796 offset = match_first - mctx.input.raw_mbs_idx;
797 }
798 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
799 Note that MATCH_FIRST must not be smaller than 0. */
800 ch = (match_first >= length
801 ? 0 : re_string_byte_at (&mctx.input, offset));
802 if (fastmap[ch])
803 break;
804 match_first += incr;
805 if (match_first < left_lim || match_first > right_lim)
806 {
807 err = REG_NOMATCH;
808 goto free_return;
809 }
810 }
811 break;
812 }
813
814 /* Reconstruct the buffers so that the matcher can assume that
815 the matching starts from the beginning of the buffer. */
816 err = re_string_reconstruct (&mctx.input, match_first, eflags);
817 if (BE (err != REG_NOERROR, 0))
818 goto free_return;
819
820#ifdef RE_ENABLE_I18N
821 /* Don't consider this char as a possible match start if it part,
822 yet isn't the head, of a multibyte character. */
823 if (!sb && !re_string_first_byte (&mctx.input, 0))
824 continue;
825#endif
826
827 /* It seems to be appropriate one, then use the matcher. */
828 /* We assume that the matching starts from 0. */
829 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
830 match_last = check_matching (&mctx, fl_longest_match,
831 range >= 0 ? &match_first : NULL);
832 if (match_last != -1)
833 {
834 if (BE (match_last == -2, 0))
835 {
836 err = REG_ESPACE;
837 goto free_return;
838 }
839 else
840 {
841 mctx.match_last = match_last;
842 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
843 {
844 re_dfastate_t *pstate = mctx.state_log[match_last];
845 mctx.last_node = check_halt_state_context (&mctx, pstate,
846 match_last);
847 }
848 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
849 || dfa->nbackref)
850 {
851 err = prune_impossible_nodes (&mctx);
852 if (err == REG_NOERROR)
853 break;
854 if (BE (err != REG_NOMATCH, 0))
855 goto free_return;
856 match_last = -1;
857 }
858 else
859 break; /* We found a match. */
860 }
861 }
862
863 match_ctx_clean (&mctx);
864 }
865
866#ifdef DEBUG
867 assert (match_last != -1);
868 assert (err == REG_NOERROR);
869#endif
870
871 /* Set pmatch[] if we need. */
872 if (nmatch > 0)
873 {
874 int reg_idx;
875
876 /* Initialize registers. */
877 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
878 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
879
880 /* Set the points where matching start/end. */
881 pmatch[0].rm_so = 0;
882 pmatch[0].rm_eo = mctx.match_last;
883
884 if (!preg->no_sub && nmatch > 1)
885 {
886 err = set_regs (preg, &mctx, nmatch, pmatch,
887 dfa->has_plural_match && dfa->nbackref > 0);
888 if (BE (err != REG_NOERROR, 0))
889 goto free_return;
890 }
891
892 /* At last, add the offset to the each registers, since we slided
893 the buffers so that we could assume that the matching starts
894 from 0. */
895 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
896 if (pmatch[reg_idx].rm_so != -1)
897 {
898#ifdef RE_ENABLE_I18N
899 if (BE (mctx.input.offsets_needed != 0, 0))
900 {
901 pmatch[reg_idx].rm_so =
902 (pmatch[reg_idx].rm_so == mctx.input.valid_len
903 ? mctx.input.valid_raw_len
904 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
905 pmatch[reg_idx].rm_eo =
906 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
907 ? mctx.input.valid_raw_len
908 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
909 }
910#else
911 assert (mctx.input.offsets_needed == 0);
912#endif
913 pmatch[reg_idx].rm_so += match_first;
914 pmatch[reg_idx].rm_eo += match_first;
915 }
916 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
917 {
918 pmatch[nmatch + reg_idx].rm_so = -1;
919 pmatch[nmatch + reg_idx].rm_eo = -1;
920 }
921
922 if (dfa->subexp_map)
923 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
924 if (dfa->subexp_map[reg_idx] != reg_idx)
925 {
926 pmatch[reg_idx + 1].rm_so
927 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
928 pmatch[reg_idx + 1].rm_eo
929 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
930 }
931 }
932
933 free_return:
934 re_free (mctx.state_log);
935 if (dfa->nbackref)
936 match_ctx_free (&mctx);
937 re_string_destruct (&mctx.input);
938 return err;
939}
940
941static reg_errcode_t
942prune_impossible_nodes (mctx)
943 re_match_context_t *mctx;
944{
945 const re_dfa_t *const dfa = mctx->dfa;
946 int halt_node, match_last;
947 reg_errcode_t ret;
948 re_dfastate_t **sifted_states;
949 re_dfastate_t **lim_states = NULL;
950 re_sift_context_t sctx;
951#ifdef DEBUG
952 assert (mctx->state_log != NULL);
953#endif
954 match_last = mctx->match_last;
955 halt_node = mctx->last_node;
956 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
957 if (BE (sifted_states == NULL, 0))
958 {
959 ret = REG_ESPACE;
960 goto free_return;
961 }
962 if (dfa->nbackref)
963 {
964 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
965 if (BE (lim_states == NULL, 0))
966 {
967 ret = REG_ESPACE;
968 goto free_return;
969 }
970 while (1)
971 {
972 memset (lim_states, '\0',
973 sizeof (re_dfastate_t *) * (match_last + 1));
974 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
975 match_last);
976 ret = sift_states_backward (mctx, &sctx);
977 re_node_set_free (&sctx.limits);
978 if (BE (ret != REG_NOERROR, 0))
979 goto free_return;
980 if (sifted_states[0] != NULL || lim_states[0] != NULL)
981 break;
982 do
983 {
984 --match_last;
985 if (match_last < 0)
986 {
987 ret = REG_NOMATCH;
988 goto free_return;
989 }
990 } while (mctx->state_log[match_last] == NULL
991 || !mctx->state_log[match_last]->halt);
992 halt_node = check_halt_state_context (mctx,
993 mctx->state_log[match_last],
994 match_last);
995 }
996 ret = merge_state_array (dfa, sifted_states, lim_states,
997 match_last + 1);
998 re_free (lim_states);
999 lim_states = NULL;
1000 if (BE (ret != REG_NOERROR, 0))
1001 goto free_return;
1002 }
1003 else
1004 {
1005 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1006 ret = sift_states_backward (mctx, &sctx);
1007 re_node_set_free (&sctx.limits);
1008 if (BE (ret != REG_NOERROR, 0))
1009 goto free_return;
1010 }
1011 re_free (mctx->state_log);
1012 mctx->state_log = sifted_states;
1013 sifted_states = NULL;
1014 mctx->last_node = halt_node;
1015 mctx->match_last = match_last;
1016 ret = REG_NOERROR;
1017 free_return:
1018 re_free (sifted_states);
1019 re_free (lim_states);
1020 return ret;
1021}
1022
1023/* Acquire an initial state and return it.
1024 We must select appropriate initial state depending on the context,
1025 since initial states may have constraints like "\<", "^", etc.. */
1026
1027static inline re_dfastate_t *
1028__attribute ((always_inline)) internal_function
1029acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1030 int idx)
1031{
1032 const re_dfa_t *const dfa = mctx->dfa;
1033 if (dfa->init_state->has_constraint)
1034 {
1035 unsigned int context;
1036 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1037 if (IS_WORD_CONTEXT (context))
1038 return dfa->init_state_word;
1039 else if (IS_ORDINARY_CONTEXT (context))
1040 return dfa->init_state;
1041 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1042 return dfa->init_state_begbuf;
1043 else if (IS_NEWLINE_CONTEXT (context))
1044 return dfa->init_state_nl;
1045 else if (IS_BEGBUF_CONTEXT (context))
1046 {
1047 /* It is relatively rare case, then calculate on demand. */
1048 return re_acquire_state_context (err, dfa,
1049 dfa->init_state->entrance_nodes,
1050 context);
1051 }
1052 else
1053 /* Must not happen? */
1054 return dfa->init_state;
1055 }
1056 else
1057 return dfa->init_state;
1058}
1059
1060/* Check whether the regular expression match input string INPUT or not,
1061 and return the index where the matching end, return -1 if not match,
1062 or return -2 in case of an error.
1063 FL_LONGEST_MATCH means we want the POSIX longest matching.
1064 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1065 next place where we may want to try matching.
1066 Note that the matcher assume that the maching starts from the current
1067 index of the buffer. */
1068
1069static int
1070internal_function
1071check_matching (re_match_context_t *mctx, int fl_longest_match,
1072 int *p_match_first)
1073{
1074 const re_dfa_t *const dfa = mctx->dfa;
1075 reg_errcode_t err;
1076 int match = 0;
1077 int match_last = -1;
1078 int cur_str_idx = re_string_cur_idx (&mctx->input);
1079 re_dfastate_t *cur_state;
1080 int at_init_state = p_match_first != NULL;
1081 int next_start_idx = cur_str_idx;
1082
1083 err = REG_NOERROR;
1084 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1085 /* An initial state must not be NULL (invalid). */
1086 if (BE (cur_state == NULL, 0))
1087 {
1088 assert (err == REG_ESPACE);
1089 return -2;
1090 }
1091
1092 if (mctx->state_log != NULL)
1093 {
1094 mctx->state_log[cur_str_idx] = cur_state;
1095
1096 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1097 later. E.g. Processing back references. */
1098 if (BE (dfa->nbackref, 0))
1099 {
1100 at_init_state = 0;
1101 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1102 if (BE (err != REG_NOERROR, 0))
1103 return err;
1104
1105 if (cur_state->has_backref)
1106 {
1107 err = transit_state_bkref (mctx, &cur_state->nodes);
1108 if (BE (err != REG_NOERROR, 0))
1109 return err;
1110 }
1111 }
1112 }
1113
1114 /* If the RE accepts NULL string. */
1115 if (BE (cur_state->halt, 0))
1116 {
1117 if (!cur_state->has_constraint
1118 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1119 {
1120 if (!fl_longest_match)
1121 return cur_str_idx;
1122 else
1123 {
1124 match_last = cur_str_idx;
1125 match = 1;
1126 }
1127 }
1128 }
1129
1130 while (!re_string_eoi (&mctx->input))
1131 {
1132 re_dfastate_t *old_state = cur_state;
1133 int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1134
1135 if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1136 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1137 && mctx->input.valid_len < mctx->input.len))
1138 {
1139 err = extend_buffers (mctx);
1140 if (BE (err != REG_NOERROR, 0))
1141 {
1142 assert (err == REG_ESPACE);
1143 return -2;
1144 }
1145 }
1146
1147 cur_state = transit_state (&err, mctx, cur_state);
1148 if (mctx->state_log != NULL)
1149 cur_state = merge_state_with_log (&err, mctx, cur_state);
1150
1151 if (cur_state == NULL)
1152 {
1153 /* Reached the invalid state or an error. Try to recover a valid
1154 state using the state log, if available and if we have not
1155 already found a valid (even if not the longest) match. */
1156 if (BE (err != REG_NOERROR, 0))
1157 return -2;
1158
1159 if (mctx->state_log == NULL
1160 || (match && !fl_longest_match)
1161 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1162 break;
1163 }
1164
1165 if (BE (at_init_state, 0))
1166 {
1167 if (old_state == cur_state)
1168 next_start_idx = next_char_idx;
1169 else
1170 at_init_state = 0;
1171 }
1172
1173 if (cur_state->halt)
1174 {
1175 /* Reached a halt state.
1176 Check the halt state can satisfy the current context. */
1177 if (!cur_state->has_constraint
1178 || check_halt_state_context (mctx, cur_state,
1179 re_string_cur_idx (&mctx->input)))
1180 {
1181 /* We found an appropriate halt state. */
1182 match_last = re_string_cur_idx (&mctx->input);
1183 match = 1;
1184
1185 /* We found a match, do not modify match_first below. */
1186 p_match_first = NULL;
1187 if (!fl_longest_match)
1188 break;
1189 }
1190 }
1191 }
1192
1193 if (p_match_first)
1194 *p_match_first += next_start_idx;
1195
1196 return match_last;
1197}
1198
1199/* Check NODE match the current context. */
1200
1201static int
1202internal_function
1203check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1204{
1205 re_token_type_t type = dfa->nodes[node].type;
1206 unsigned int constraint = dfa->nodes[node].constraint;
1207 if (type != END_OF_RE)
1208 return 0;
1209 if (!constraint)
1210 return 1;
1211 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1212 return 0;
1213 return 1;
1214}
1215
1216/* Check the halt state STATE match the current context.
1217 Return 0 if not match, if the node, STATE has, is a halt node and
1218 match the context, return the node. */
1219
1220static int
1221internal_function
1222check_halt_state_context (const re_match_context_t *mctx,
1223 const re_dfastate_t *state, int idx)
1224{
1225 int i;
1226 unsigned int context;
1227#ifdef DEBUG
1228 assert (state->halt);
1229#endif
1230 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1231 for (i = 0; i < state->nodes.nelem; ++i)
1232 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1233 return state->nodes.elems[i];
1234 return 0;
1235}
1236
1237/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1238 corresponding to the DFA).
1239 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1240 of errors. */
1241
1242static int
1243internal_function
1244proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1245 int *pidx, int node, re_node_set *eps_via_nodes,
1246 struct re_fail_stack_t *fs)
1247{
1248 const re_dfa_t *const dfa = mctx->dfa;
1249 int i, err;
1250 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1251 {
1252 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1253 re_node_set *edests = &dfa->edests[node];
1254 int dest_node;
1255 err = re_node_set_insert (eps_via_nodes, node);
1256 if (BE (err < 0, 0))
1257 return -2;
1258 /* Pick up a valid destination, or return -1 if none is found. */
1259 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1260 {
1261 int candidate = edests->elems[i];
1262 if (!re_node_set_contains (cur_nodes, candidate))
1263 continue;
1264 if (dest_node == -1)
1265 dest_node = candidate;
1266
1267 else
1268 {
1269 /* In order to avoid infinite loop like "(a*)*", return the second
1270 epsilon-transition if the first was already considered. */
1271 if (re_node_set_contains (eps_via_nodes, dest_node))
1272 return candidate;
1273
1274 /* Otherwise, push the second epsilon-transition on the fail stack. */
1275 else if (fs != NULL
1276 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1277 eps_via_nodes))
1278 return -2;
1279
1280 /* We know we are going to exit. */
1281 break;
1282 }
1283 }
1284 return dest_node;
1285 }
1286 else
1287 {
1288 int naccepted = 0;
1289 re_token_type_t type = dfa->nodes[node].type;
1290
1291#ifdef RE_ENABLE_I18N
1292 if (dfa->nodes[node].accept_mb)
1293 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1294 else
1295#endif /* RE_ENABLE_I18N */
1296 if (type == OP_BACK_REF)
1297 {
1298 int subexp_idx = dfa->nodes[node].opr.idx + 1;
1299 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1300 if (fs != NULL)
1301 {
1302 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1303 return -1;
1304 else if (naccepted)
1305 {
1306 char *buf = (char *) re_string_get_buffer (&mctx->input);
1307 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1308 naccepted) != 0)
1309 return -1;
1310 }
1311 }
1312
1313 if (naccepted == 0)
1314 {
1315 int dest_node;
1316 err = re_node_set_insert (eps_via_nodes, node);
1317 if (BE (err < 0, 0))
1318 return -2;
1319 dest_node = dfa->edests[node].elems[0];
1320 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1321 dest_node))
1322 return dest_node;
1323 }
1324 }
1325
1326 if (naccepted != 0
1327 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1328 {
1329 int dest_node = dfa->nexts[node];
1330 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1331 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1332 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1333 dest_node)))
1334 return -1;
1335 re_node_set_empty (eps_via_nodes);
1336 return dest_node;
1337 }
1338 }
1339 return -1;
1340}
1341
1342static reg_errcode_t
1343internal_function
1344push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1345 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1346{
1347 reg_errcode_t err;
1348 int num = fs->num++;
1349 if (fs->num == fs->alloc)
1350 {
1351 struct re_fail_stack_ent_t *new_array;
1352 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1353 * fs->alloc * 2));
1354 if (new_array == NULL)
1355 return REG_ESPACE;
1356 fs->alloc *= 2;
1357 fs->stack = new_array;
1358 }
1359 fs->stack[num].idx = str_idx;
1360 fs->stack[num].node = dest_node;
1361 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1362 if (fs->stack[num].regs == NULL)
1363 return REG_ESPACE;
1364 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1365 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1366 return err;
1367}
1368
1369static int
1370internal_function
1371pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1372 regmatch_t *regs, re_node_set *eps_via_nodes)
1373{
1374 int num = --fs->num;
1375 assert (num >= 0);
1376 *pidx = fs->stack[num].idx;
1377 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1378 re_node_set_free (eps_via_nodes);
1379 re_free (fs->stack[num].regs);
1380 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1381 return fs->stack[num].node;
1382}
1383
1384/* Set the positions where the subexpressions are starts/ends to registers
1385 PMATCH.
1386 Note: We assume that pmatch[0] is already set, and
1387 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1388
1389static reg_errcode_t
1390internal_function
1391set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1392 regmatch_t *pmatch, int fl_backtrack)
1393{
1394 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1395 int idx, cur_node;
1396 re_node_set eps_via_nodes;
1397 struct re_fail_stack_t *fs;
1398 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1399 regmatch_t *prev_idx_match;
1400 int prev_idx_match_malloced = 0;
1401
1402#ifdef DEBUG
1403 assert (nmatch > 1);
1404 assert (mctx->state_log != NULL);
1405#endif
1406 if (fl_backtrack)
1407 {
1408 fs = &fs_body;
1409 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1410 if (fs->stack == NULL)
1411 return REG_ESPACE;
1412 }
1413 else
1414 fs = NULL;
1415
1416 cur_node = dfa->init_node;
1417 re_node_set_init_empty (&eps_via_nodes);
1418
1419 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1420 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1421 else
1422 {
1423 prev_idx_match = re_malloc (regmatch_t, nmatch);
1424 if (prev_idx_match == NULL)
1425 {
1426 free_fail_stack_return (fs);
1427 return REG_ESPACE;
1428 }
1429 prev_idx_match_malloced = 1;
1430 }
1431 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1432
1433 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1434 {
1435 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1436
1437 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1438 {
1439 int reg_idx;
1440 if (fs)
1441 {
1442 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1443 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1444 break;
1445 if (reg_idx == nmatch)
1446 {
1447 re_node_set_free (&eps_via_nodes);
1448 if (prev_idx_match_malloced)
1449 re_free (prev_idx_match);
1450 return free_fail_stack_return (fs);
1451 }
1452 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1453 &eps_via_nodes);
1454 }
1455 else
1456 {
1457 re_node_set_free (&eps_via_nodes);
1458 if (prev_idx_match_malloced)
1459 re_free (prev_idx_match);
1460 return REG_NOERROR;
1461 }
1462 }
1463
1464 /* Proceed to next node. */
1465 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1466 &eps_via_nodes, fs);
1467
1468 if (BE (cur_node < 0, 0))
1469 {
1470 if (BE (cur_node == -2, 0))
1471 {
1472 re_node_set_free (&eps_via_nodes);
1473 if (prev_idx_match_malloced)
1474 re_free (prev_idx_match);
1475 free_fail_stack_return (fs);
1476 return REG_ESPACE;
1477 }
1478 if (fs)
1479 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1480 &eps_via_nodes);
1481 else
1482 {
1483 re_node_set_free (&eps_via_nodes);
1484 if (prev_idx_match_malloced)
1485 re_free (prev_idx_match);
1486 return REG_NOMATCH;
1487 }
1488 }
1489 }
1490 re_node_set_free (&eps_via_nodes);
1491 if (prev_idx_match_malloced)
1492 re_free (prev_idx_match);
1493 return free_fail_stack_return (fs);
1494}
1495
1496static reg_errcode_t
1497internal_function
1498free_fail_stack_return (struct re_fail_stack_t *fs)
1499{
1500 if (fs)
1501 {
1502 int fs_idx;
1503 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1504 {
1505 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1506 re_free (fs->stack[fs_idx].regs);
1507 }
1508 re_free (fs->stack);
1509 }
1510 return REG_NOERROR;
1511}
1512
1513static void
1514internal_function
1515update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1516 regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1517{
1518 int type = dfa->nodes[cur_node].type;
1519 if (type == OP_OPEN_SUBEXP)
1520 {
1521 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1522
1523 /* We are at the first node of this sub expression. */
1524 if (reg_num < nmatch)
1525 {
1526 pmatch[reg_num].rm_so = cur_idx;
1527 pmatch[reg_num].rm_eo = -1;
1528 }
1529 }
1530 else if (type == OP_CLOSE_SUBEXP)
1531 {
1532 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1533 if (reg_num < nmatch)
1534 {
1535 /* We are at the last node of this sub expression. */
1536 if (pmatch[reg_num].rm_so < cur_idx)
1537 {
1538 pmatch[reg_num].rm_eo = cur_idx;
1539 /* This is a non-empty match or we are not inside an optional
1540 subexpression. Accept this right away. */
1541 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1542 }
1543 else
1544 {
1545 if (dfa->nodes[cur_node].opt_subexp
1546 && prev_idx_match[reg_num].rm_so != -1)
1547 /* We transited through an empty match for an optional
1548 subexpression, like (a?)*, and this is not the subexp's
1549 first match. Copy back the old content of the registers
1550 so that matches of an inner subexpression are undone as
1551 well, like in ((a?))*. */
1552 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1553 else
1554 /* We completed a subexpression, but it may be part of
1555 an optional one, so do not update PREV_IDX_MATCH. */
1556 pmatch[reg_num].rm_eo = cur_idx;
1557 }
1558 }
1559 }
1560}
1561
1562/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1563 and sift the nodes in each states according to the following rules.
1564 Updated state_log will be wrote to STATE_LOG.
1565
1566 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1567 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1568 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1569 the LAST_NODE, we throw away the node `a'.
1570 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1571 string `s' and transit to `b':
1572 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1573 away the node `a'.
1574 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1575 thrown away, we throw away the node `a'.
1576 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1577 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1578 node `a'.
1579 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1580 we throw away the node `a'. */
1581
1582#define STATE_NODE_CONTAINS(state,node) \
1583 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1584
1585static reg_errcode_t
1586internal_function
1587sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1588{
1589 reg_errcode_t err;
1590 int null_cnt = 0;
1591 int str_idx = sctx->last_str_idx;
1592 re_node_set cur_dest;
1593
1594#ifdef DEBUG
1595 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1596#endif
1597
1598 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1599 transit to the last_node and the last_node itself. */
1600 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1601 if (BE (err != REG_NOERROR, 0))
1602 return err;
1603 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1604 if (BE (err != REG_NOERROR, 0))
1605 goto free_return;
1606
1607 /* Then check each states in the state_log. */
1608 while (str_idx > 0)
1609 {
1610 /* Update counters. */
1611 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1612 if (null_cnt > mctx->max_mb_elem_len)
1613 {
1614 memset (sctx->sifted_states, '\0',
1615 sizeof (re_dfastate_t *) * str_idx);
1616 re_node_set_free (&cur_dest);
1617 return REG_NOERROR;
1618 }
1619 re_node_set_empty (&cur_dest);
1620 --str_idx;
1621
1622 if (mctx->state_log[str_idx])
1623 {
1624 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1625 if (BE (err != REG_NOERROR, 0))
1626 goto free_return;
1627 }
1628
1629 /* Add all the nodes which satisfy the following conditions:
1630 - It can epsilon transit to a node in CUR_DEST.
1631 - It is in CUR_SRC.
1632 And update state_log. */
1633 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1634 if (BE (err != REG_NOERROR, 0))
1635 goto free_return;
1636 }
1637 err = REG_NOERROR;
1638 free_return:
1639 re_node_set_free (&cur_dest);
1640 return err;
1641}
1642
1643static reg_errcode_t
1644internal_function
1645build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1646 int str_idx, re_node_set *cur_dest)
1647{
1648 const re_dfa_t *const dfa = mctx->dfa;
1649 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1650 int i;
1651
1652 /* Then build the next sifted state.
1653 We build the next sifted state on `cur_dest', and update
1654 `sifted_states[str_idx]' with `cur_dest'.
1655 Note:
1656 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1657 `cur_src' points the node_set of the old `state_log[str_idx]'
1658 (with the epsilon nodes pre-filtered out). */
1659 for (i = 0; i < cur_src->nelem; i++)
1660 {
1661 int prev_node = cur_src->elems[i];
1662 int naccepted = 0;
1663 int ret;
1664
1665#ifdef DEBUG
1666 re_token_type_t type = dfa->nodes[prev_node].type;
1667 assert (!IS_EPSILON_NODE (type));
1668#endif
1669#ifdef RE_ENABLE_I18N
1670 /* If the node may accept `multi byte'. */
1671 if (dfa->nodes[prev_node].accept_mb)
1672 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1673 str_idx, sctx->last_str_idx);
1674#endif /* RE_ENABLE_I18N */
1675
1676 /* We don't check backreferences here.
1677 See update_cur_sifted_state(). */
1678 if (!naccepted
1679 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1680 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1681 dfa->nexts[prev_node]))
1682 naccepted = 1;
1683
1684 if (naccepted == 0)
1685 continue;
1686
1687 if (sctx->limits.nelem)
1688 {
1689 int to_idx = str_idx + naccepted;
1690 if (check_dst_limits (mctx, &sctx->limits,
1691 dfa->nexts[prev_node], to_idx,
1692 prev_node, str_idx))
1693 continue;
1694 }
1695 ret = re_node_set_insert (cur_dest, prev_node);
1696 if (BE (ret == -1, 0))
1697 return REG_ESPACE;
1698 }
1699
1700 return REG_NOERROR;
1701}
1702
1703/* Helper functions. */
1704
1705static reg_errcode_t
1706internal_function
1707clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1708{
1709 int top = mctx->state_log_top;
1710
1711 if (next_state_log_idx >= mctx->input.bufs_len
1712 || (next_state_log_idx >= mctx->input.valid_len
1713 && mctx->input.valid_len < mctx->input.len))
1714 {
1715 reg_errcode_t err;
1716 err = extend_buffers (mctx);
1717 if (BE (err != REG_NOERROR, 0))
1718 return err;
1719 }
1720
1721 if (top < next_state_log_idx)
1722 {
1723 memset (mctx->state_log + top + 1, '\0',
1724 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1725 mctx->state_log_top = next_state_log_idx;
1726 }
1727 return REG_NOERROR;
1728}
1729
1730static reg_errcode_t
1731internal_function
1732merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1733 re_dfastate_t **src, int num)
1734{
1735 int st_idx;
1736 reg_errcode_t err;
1737 for (st_idx = 0; st_idx < num; ++st_idx)
1738 {
1739 if (dst[st_idx] == NULL)
1740 dst[st_idx] = src[st_idx];
1741 else if (src[st_idx] != NULL)
1742 {
1743 re_node_set merged_set;
1744 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1745 &src[st_idx]->nodes);
1746 if (BE (err != REG_NOERROR, 0))
1747 return err;
1748 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1749 re_node_set_free (&merged_set);
1750 if (BE (err != REG_NOERROR, 0))
1751 return err;
1752 }
1753 }
1754 return REG_NOERROR;
1755}
1756
1757static reg_errcode_t
1758internal_function
1759update_cur_sifted_state (const re_match_context_t *mctx,
1760 re_sift_context_t *sctx, int str_idx,
1761 re_node_set *dest_nodes)
1762{
1763 const re_dfa_t *const dfa = mctx->dfa;
1764 reg_errcode_t err = REG_NOERROR;
1765 const re_node_set *candidates;
1766 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1767 : &mctx->state_log[str_idx]->nodes);
1768
1769 if (dest_nodes->nelem == 0)
1770 sctx->sifted_states[str_idx] = NULL;
1771 else
1772 {
1773 if (candidates)
1774 {
1775 /* At first, add the nodes which can epsilon transit to a node in
1776 DEST_NODE. */
1777 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1778 if (BE (err != REG_NOERROR, 0))
1779 return err;
1780
1781 /* Then, check the limitations in the current sift_context. */
1782 if (sctx->limits.nelem)
1783 {
1784 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1785 mctx->bkref_ents, str_idx);
1786 if (BE (err != REG_NOERROR, 0))
1787 return err;
1788 }
1789 }
1790
1791 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1792 if (BE (err != REG_NOERROR, 0))
1793 return err;
1794 }
1795
1796 if (candidates && mctx->state_log[str_idx]->has_backref)
1797 {
1798 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1799 if (BE (err != REG_NOERROR, 0))
1800 return err;
1801 }
1802 return REG_NOERROR;
1803}
1804
1805static reg_errcode_t
1806internal_function
1807add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1808 const re_node_set *candidates)
1809{
1810 reg_errcode_t err = REG_NOERROR;
1811 int i;
1812
1813 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1814 if (BE (err != REG_NOERROR, 0))
1815 return err;
1816
1817 if (!state->inveclosure.alloc)
1818 {
1819 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1820 if (BE (err != REG_NOERROR, 0))
1821 return REG_ESPACE;
1822 for (i = 0; i < dest_nodes->nelem; i++)
1823 re_node_set_merge (&state->inveclosure,
1824 dfa->inveclosures + dest_nodes->elems[i]);
1825 }
1826 return re_node_set_add_intersect (dest_nodes, candidates,
1827 &state->inveclosure);
1828}
1829
1830static reg_errcode_t
1831internal_function
1832sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1833 const re_node_set *candidates)
1834{
1835 int ecl_idx;
1836 reg_errcode_t err;
1837 re_node_set *inv_eclosure = dfa->inveclosures + node;
1838 re_node_set except_nodes;
1839 re_node_set_init_empty (&except_nodes);
1840 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1841 {
1842 int cur_node = inv_eclosure->elems[ecl_idx];
1843 if (cur_node == node)
1844 continue;
1845 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1846 {
1847 int edst1 = dfa->edests[cur_node].elems[0];
1848 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1849 ? dfa->edests[cur_node].elems[1] : -1);
1850 if ((!re_node_set_contains (inv_eclosure, edst1)
1851 && re_node_set_contains (dest_nodes, edst1))
1852 || (edst2 > 0
1853 && !re_node_set_contains (inv_eclosure, edst2)
1854 && re_node_set_contains (dest_nodes, edst2)))
1855 {
1856 err = re_node_set_add_intersect (&except_nodes, candidates,
1857 dfa->inveclosures + cur_node);
1858 if (BE (err != REG_NOERROR, 0))
1859 {
1860 re_node_set_free (&except_nodes);
1861 return err;
1862 }
1863 }
1864 }
1865 }
1866 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1867 {
1868 int cur_node = inv_eclosure->elems[ecl_idx];
1869 if (!re_node_set_contains (&except_nodes, cur_node))
1870 {
1871 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1872 re_node_set_remove_at (dest_nodes, idx);
1873 }
1874 }
1875 re_node_set_free (&except_nodes);
1876 return REG_NOERROR;
1877}
1878
1879static int
1880internal_function
1881check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1882 int dst_node, int dst_idx, int src_node, int src_idx)
1883{
1884 const re_dfa_t *const dfa = mctx->dfa;
1885 int lim_idx, src_pos, dst_pos;
1886
1887 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1888 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1889 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1890 {
1891 int subexp_idx;
1892 struct re_backref_cache_entry *ent;
1893 ent = mctx->bkref_ents + limits->elems[lim_idx];
1894 subexp_idx = dfa->nodes[ent->node].opr.idx;
1895
1896 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1897 subexp_idx, dst_node, dst_idx,
1898 dst_bkref_idx);
1899 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1900 subexp_idx, src_node, src_idx,
1901 src_bkref_idx);
1902
1903 /* In case of:
1904 <src> <dst> ( <subexp> )
1905 ( <subexp> ) <src> <dst>
1906 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1907 if (src_pos == dst_pos)
1908 continue; /* This is unrelated limitation. */
1909 else
1910 return 1;
1911 }
1912 return 0;
1913}
1914
1915static int
1916internal_function
1917check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1918 int subexp_idx, int from_node, int bkref_idx)
1919{
1920 const re_dfa_t *const dfa = mctx->dfa;
1921 const re_node_set *eclosures = dfa->eclosures + from_node;
1922 int node_idx;
1923
1924 /* Else, we are on the boundary: examine the nodes on the epsilon
1925 closure. */
1926 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1927 {
1928 int node = eclosures->elems[node_idx];
1929 switch (dfa->nodes[node].type)
1930 {
1931 case OP_BACK_REF:
1932 if (bkref_idx != -1)
1933 {
1934 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1935 do
1936 {
1937 int dst, cpos;
1938
1939 if (ent->node != node)
1940 continue;
1941
1942 if (subexp_idx < BITSET_WORD_BITS
1943 && !(ent->eps_reachable_subexps_map
1944 & ((bitset_word_t) 1 << subexp_idx)))
1945 continue;
1946
1947 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1948 OP_CLOSE_SUBEXP cases below. But, if the
1949 destination node is the same node as the source
1950 node, don't recurse because it would cause an
1951 infinite loop: a regex that exhibits this behavior
1952 is ()\1*\1* */
1953 dst = dfa->edests[node].elems[0];
1954 if (dst == from_node)
1955 {
1956 if (boundaries & 1)
1957 return -1;
1958 else /* if (boundaries & 2) */
1959 return 0;
1960 }
1961
1962 cpos =
1963 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1964 dst, bkref_idx);
1965 if (cpos == -1 /* && (boundaries & 1) */)
1966 return -1;
1967 if (cpos == 0 && (boundaries & 2))
1968 return 0;
1969
1970 if (subexp_idx < BITSET_WORD_BITS)
1971 ent->eps_reachable_subexps_map
1972 &= ~((bitset_word_t) 1 << subexp_idx);
1973 }
1974 while (ent++->more);
1975 }
1976 break;
1977
1978 case OP_OPEN_SUBEXP:
1979 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1980 return -1;
1981 break;
1982
1983 case OP_CLOSE_SUBEXP:
1984 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1985 return 0;
1986 break;
1987
1988 default:
1989 break;
1990 }
1991 }
1992
1993 return (boundaries & 2) ? 1 : 0;
1994}
1995
1996static int
1997internal_function
1998check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
1999 int subexp_idx, int from_node, int str_idx,
2000 int bkref_idx)
2001{
2002 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2003 int boundaries;
2004
2005 /* If we are outside the range of the subexpression, return -1 or 1. */
2006 if (str_idx < lim->subexp_from)
2007 return -1;
2008
2009 if (lim->subexp_to < str_idx)
2010 return 1;
2011
2012 /* If we are within the subexpression, return 0. */
2013 boundaries = (str_idx == lim->subexp_from);
2014 boundaries |= (str_idx == lim->subexp_to) << 1;
2015 if (boundaries == 0)
2016 return 0;
2017
2018 /* Else, examine epsilon closure. */
2019 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2020 from_node, bkref_idx);
2021}
2022
2023/* Check the limitations of sub expressions LIMITS, and remove the nodes
2024 which are against limitations from DEST_NODES. */
2025
2026static reg_errcode_t
2027internal_function
2028check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2029 const re_node_set *candidates, re_node_set *limits,
2030 struct re_backref_cache_entry *bkref_ents, int str_idx)
2031{
2032 reg_errcode_t err;
2033 int node_idx, lim_idx;
2034
2035 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2036 {
2037 int subexp_idx;
2038 struct re_backref_cache_entry *ent;
2039 ent = bkref_ents + limits->elems[lim_idx];
2040
2041 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2042 continue; /* This is unrelated limitation. */
2043
2044 subexp_idx = dfa->nodes[ent->node].opr.idx;
2045 if (ent->subexp_to == str_idx)
2046 {
2047 int ops_node = -1;
2048 int cls_node = -1;
2049 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2050 {
2051 int node = dest_nodes->elems[node_idx];
2052 re_token_type_t type = dfa->nodes[node].type;
2053 if (type == OP_OPEN_SUBEXP
2054 && subexp_idx == dfa->nodes[node].opr.idx)
2055 ops_node = node;
2056 else if (type == OP_CLOSE_SUBEXP
2057 && subexp_idx == dfa->nodes[node].opr.idx)
2058 cls_node = node;
2059 }
2060
2061 /* Check the limitation of the open subexpression. */
2062 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2063 if (ops_node >= 0)
2064 {
2065 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2066 candidates);
2067 if (BE (err != REG_NOERROR, 0))
2068 return err;
2069 }
2070
2071 /* Check the limitation of the close subexpression. */
2072 if (cls_node >= 0)
2073 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2074 {
2075 int node = dest_nodes->elems[node_idx];
2076 if (!re_node_set_contains (dfa->inveclosures + node,
2077 cls_node)
2078 && !re_node_set_contains (dfa->eclosures + node,
2079 cls_node))
2080 {
2081 /* It is against this limitation.
2082 Remove it form the current sifted state. */
2083 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2084 candidates);
2085 if (BE (err != REG_NOERROR, 0))
2086 return err;
2087 --node_idx;
2088 }
2089 }
2090 }
2091 else /* (ent->subexp_to != str_idx) */
2092 {
2093 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2094 {
2095 int node = dest_nodes->elems[node_idx];
2096 re_token_type_t type = dfa->nodes[node].type;
2097 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2098 {
2099 if (subexp_idx != dfa->nodes[node].opr.idx)
2100 continue;
2101 /* It is against this limitation.
2102 Remove it form the current sifted state. */
2103 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2104 candidates);
2105 if (BE (err != REG_NOERROR, 0))
2106 return err;
2107 }
2108 }
2109 }
2110 }
2111 return REG_NOERROR;
2112}
2113
2114static reg_errcode_t
2115internal_function
2116sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2117 int str_idx, const re_node_set *candidates)
2118{
2119 const re_dfa_t *const dfa = mctx->dfa;
2120 reg_errcode_t err;
2121 int node_idx, node;
2122 re_sift_context_t local_sctx;
2123 int first_idx = search_cur_bkref_entry (mctx, str_idx);
2124
2125 if (first_idx == -1)
2126 return REG_NOERROR;
2127
2128 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2129
2130 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2131 {
2132 int enabled_idx;
2133 re_token_type_t type;
2134 struct re_backref_cache_entry *entry;
2135 node = candidates->elems[node_idx];
2136 type = dfa->nodes[node].type;
2137 /* Avoid infinite loop for the REs like "()\1+". */
2138 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2139 continue;
2140 if (type != OP_BACK_REF)
2141 continue;
2142
2143 entry = mctx->bkref_ents + first_idx;
2144 enabled_idx = first_idx;
2145 do
2146 {
2147 int subexp_len;
2148 int to_idx;
2149 int dst_node;
2150 int ret;
2151 re_dfastate_t *cur_state;
2152
2153 if (entry->node != node)
2154 continue;
2155 subexp_len = entry->subexp_to - entry->subexp_from;
2156 to_idx = str_idx + subexp_len;
2157 dst_node = (subexp_len ? dfa->nexts[node]
2158 : dfa->edests[node].elems[0]);
2159
2160 if (to_idx > sctx->last_str_idx
2161 || sctx->sifted_states[to_idx] == NULL
2162 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2163 || check_dst_limits (mctx, &sctx->limits, node,
2164 str_idx, dst_node, to_idx))
2165 continue;
2166
2167 if (local_sctx.sifted_states == NULL)
2168 {
2169 local_sctx = *sctx;
2170 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2171 if (BE (err != REG_NOERROR, 0))
2172 goto free_return;
2173 }
2174 local_sctx.last_node = node;
2175 local_sctx.last_str_idx = str_idx;
2176 ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2177 if (BE (ret < 0, 0))
2178 {
2179 err = REG_ESPACE;
2180 goto free_return;
2181 }
2182 cur_state = local_sctx.sifted_states[str_idx];
2183 err = sift_states_backward (mctx, &local_sctx);
2184 if (BE (err != REG_NOERROR, 0))
2185 goto free_return;
2186 if (sctx->limited_states != NULL)
2187 {
2188 err = merge_state_array (dfa, sctx->limited_states,
2189 local_sctx.sifted_states,
2190 str_idx + 1);
2191 if (BE (err != REG_NOERROR, 0))
2192 goto free_return;
2193 }
2194 local_sctx.sifted_states[str_idx] = cur_state;
2195 re_node_set_remove (&local_sctx.limits, enabled_idx);
2196
2197 /* mctx->bkref_ents may have changed, reload the pointer. */
2198 entry = mctx->bkref_ents + enabled_idx;
2199 }
2200 while (enabled_idx++, entry++->more);
2201 }
2202 err = REG_NOERROR;
2203 free_return:
2204 if (local_sctx.sifted_states != NULL)
2205 {
2206 re_node_set_free (&local_sctx.limits);
2207 }
2208
2209 return err;
2210}
2211
2212
2213#ifdef RE_ENABLE_I18N
2214static int
2215internal_function
2216sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2217 int node_idx, int str_idx, int max_str_idx)
2218{
2219 const re_dfa_t *const dfa = mctx->dfa;
2220 int naccepted;
2221 /* Check the node can accept `multi byte'. */
2222 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2223 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2224 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2225 dfa->nexts[node_idx]))
2226 /* The node can't accept the `multi byte', or the
2227 destination was already thrown away, then the node
2228 could't accept the current input `multi byte'. */
2229 naccepted = 0;
2230 /* Otherwise, it is sure that the node could accept
2231 `naccepted' bytes input. */
2232 return naccepted;
2233}
2234#endif /* RE_ENABLE_I18N */
2235
2236
2237
2238/* Functions for state transition. */
2239
2240/* Return the next state to which the current state STATE will transit by
2241 accepting the current input byte, and update STATE_LOG if necessary.
2242 If STATE can accept a multibyte char/collating element/back reference
2243 update the destination of STATE_LOG. */
2244
2245static re_dfastate_t *
2246internal_function
2247transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2248 re_dfastate_t *state)
2249{
2250 re_dfastate_t **trtable;
2251 unsigned char ch;
2252
2253#ifdef RE_ENABLE_I18N
2254 /* If the current state can accept multibyte. */
2255 if (BE (state->accept_mb, 0))
2256 {
2257 *err = transit_state_mb (mctx, state);
2258 if (BE (*err != REG_NOERROR, 0))
2259 return NULL;
2260 }
2261#endif /* RE_ENABLE_I18N */
2262
2263 /* Then decide the next state with the single byte. */
2264#if 0
2265 if (0)
2266 /* don't use transition table */
2267 return transit_state_sb (err, mctx, state);
2268#endif
2269
2270 /* Use transition table */
2271 ch = re_string_fetch_byte (&mctx->input);
2272 for (;;)
2273 {
2274 trtable = state->trtable;
2275 if (BE (trtable != NULL, 1))
2276 return trtable[ch];
2277
2278 trtable = state->word_trtable;
2279 if (BE (trtable != NULL, 1))
2280 {
2281 unsigned int context;
2282 context
2283 = re_string_context_at (&mctx->input,
2284 re_string_cur_idx (&mctx->input) - 1,
2285 mctx->eflags);
2286 if (IS_WORD_CONTEXT (context))
2287 return trtable[ch + SBC_MAX];
2288 else
2289 return trtable[ch];
2290 }
2291
2292 if (!build_trtable (mctx->dfa, state))
2293 {
2294 *err = REG_ESPACE;
2295 return NULL;
2296 }
2297
2298 /* Retry, we now have a transition table. */
2299 }
2300}
2301
2302/* Update the state_log if we need */
2303re_dfastate_t *
2304internal_function
2305merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2306 re_dfastate_t *next_state)
2307{
2308 const re_dfa_t *const dfa = mctx->dfa;
2309 int cur_idx = re_string_cur_idx (&mctx->input);
2310
2311 if (cur_idx > mctx->state_log_top)
2312 {
2313 mctx->state_log[cur_idx] = next_state;
2314 mctx->state_log_top = cur_idx;
2315 }
2316 else if (mctx->state_log[cur_idx] == 0)
2317 {
2318 mctx->state_log[cur_idx] = next_state;
2319 }
2320 else
2321 {
2322 re_dfastate_t *pstate;
2323 unsigned int context;
2324 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2325 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2326 the destination of a multibyte char/collating element/
2327 back reference. Then the next state is the union set of
2328 these destinations and the results of the transition table. */
2329 pstate = mctx->state_log[cur_idx];
2330 log_nodes = pstate->entrance_nodes;
2331 if (next_state != NULL)
2332 {
2333 table_nodes = next_state->entrance_nodes;
2334 *err = re_node_set_init_union (&next_nodes, table_nodes,
2335 log_nodes);
2336 if (BE (*err != REG_NOERROR, 0))
2337 return NULL;
2338 }
2339 else
2340 next_nodes = *log_nodes;
2341 /* Note: We already add the nodes of the initial state,
2342 then we don't need to add them here. */
2343
2344 context = re_string_context_at (&mctx->input,
2345 re_string_cur_idx (&mctx->input) - 1,
2346 mctx->eflags);
2347 next_state = mctx->state_log[cur_idx]
2348 = re_acquire_state_context (err, dfa, &next_nodes, context);
2349 /* We don't need to check errors here, since the return value of
2350 this function is next_state and ERR is already set. */
2351
2352 if (table_nodes != NULL)
2353 re_node_set_free (&next_nodes);
2354 }
2355
2356 if (BE (dfa->nbackref, 0) && next_state != NULL)
2357 {
2358 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2359 later. We must check them here, since the back references in the
2360 next state might use them. */
2361 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2362 cur_idx);
2363 if (BE (*err != REG_NOERROR, 0))
2364 return NULL;
2365
2366 /* If the next state has back references. */
2367 if (next_state->has_backref)
2368 {
2369 *err = transit_state_bkref (mctx, &next_state->nodes);
2370 if (BE (*err != REG_NOERROR, 0))
2371 return NULL;
2372 next_state = mctx->state_log[cur_idx];
2373 }
2374 }
2375
2376 return next_state;
2377}
2378
2379/* Skip bytes in the input that correspond to part of a
2380 multi-byte match, then look in the log for a state
2381 from which to restart matching. */
2382re_dfastate_t *
2383internal_function
2384find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2385{
2386 re_dfastate_t *cur_state;
2387 do
2388 {
2389 int max = mctx->state_log_top;
2390 int cur_str_idx = re_string_cur_idx (&mctx->input);
2391
2392 do
2393 {
2394 if (++cur_str_idx > max)
2395 return NULL;
2396 re_string_skip_bytes (&mctx->input, 1);
2397 }
2398 while (mctx->state_log[cur_str_idx] == NULL);
2399
2400 cur_state = merge_state_with_log (err, mctx, NULL);
2401 }
2402 while (*err == REG_NOERROR && cur_state == NULL);
2403 return cur_state;
2404}
2405
2406/* Helper functions for transit_state. */
2407
2408/* From the node set CUR_NODES, pick up the nodes whose types are
2409 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2410 expression. And register them to use them later for evaluating the
2411 correspoding back references. */
2412
2413static reg_errcode_t
2414internal_function
2415check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2416 int str_idx)
2417{
2418 const re_dfa_t *const dfa = mctx->dfa;
2419 int node_idx;
2420 reg_errcode_t err;
2421
2422 /* TODO: This isn't efficient.
2423 Because there might be more than one nodes whose types are
2424 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2425 nodes.
2426 E.g. RE: (a){2} */
2427 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2428 {
2429 int node = cur_nodes->elems[node_idx];
2430 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2431 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2432 && (dfa->used_bkref_map
2433 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2434 {
2435 err = match_ctx_add_subtop (mctx, node, str_idx);
2436 if (BE (err != REG_NOERROR, 0))
2437 return err;
2438 }
2439 }
2440 return REG_NOERROR;
2441}
2442
2443#if 0
2444/* Return the next state to which the current state STATE will transit by
2445 accepting the current input byte. */
2446
2447static re_dfastate_t *
2448transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2449 re_dfastate_t *state)
2450{
2451 const re_dfa_t *const dfa = mctx->dfa;
2452 re_node_set next_nodes;
2453 re_dfastate_t *next_state;
2454 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2455 unsigned int context;
2456
2457 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2458 if (BE (*err != REG_NOERROR, 0))
2459 return NULL;
2460 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2461 {
2462 int cur_node = state->nodes.elems[node_cnt];
2463 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2464 {
2465 *err = re_node_set_merge (&next_nodes,
2466 dfa->eclosures + dfa->nexts[cur_node]);
2467 if (BE (*err != REG_NOERROR, 0))
2468 {
2469 re_node_set_free (&next_nodes);
2470 return NULL;
2471 }
2472 }
2473 }
2474 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2475 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2476 /* We don't need to check errors here, since the return value of
2477 this function is next_state and ERR is already set. */
2478
2479 re_node_set_free (&next_nodes);
2480 re_string_skip_bytes (&mctx->input, 1);
2481 return next_state;
2482}
2483#endif
2484
2485#ifdef RE_ENABLE_I18N
2486static reg_errcode_t
2487internal_function
2488transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2489{
2490 const re_dfa_t *const dfa = mctx->dfa;
2491 reg_errcode_t err;
2492 int i;
2493
2494 for (i = 0; i < pstate->nodes.nelem; ++i)
2495 {
2496 re_node_set dest_nodes, *new_nodes;
2497 int cur_node_idx = pstate->nodes.elems[i];
2498 int naccepted, dest_idx;
2499 unsigned int context;
2500 re_dfastate_t *dest_state;
2501
2502 if (!dfa->nodes[cur_node_idx].accept_mb)
2503 continue;
2504
2505 if (dfa->nodes[cur_node_idx].constraint)
2506 {
2507 context = re_string_context_at (&mctx->input,
2508 re_string_cur_idx (&mctx->input),
2509 mctx->eflags);
2510 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2511 context))
2512 continue;
2513 }
2514
2515 /* How many bytes the node can accept? */
2516 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2517 re_string_cur_idx (&mctx->input));
2518 if (naccepted == 0)
2519 continue;
2520
2521 /* The node can accepts `naccepted' bytes. */
2522 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2523 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2524 : mctx->max_mb_elem_len);
2525 err = clean_state_log_if_needed (mctx, dest_idx);
2526 if (BE (err != REG_NOERROR, 0))
2527 return err;
2528#ifdef DEBUG
2529 assert (dfa->nexts[cur_node_idx] != -1);
2530#endif
2531 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2532
2533 dest_state = mctx->state_log[dest_idx];
2534 if (dest_state == NULL)
2535 dest_nodes = *new_nodes;
2536 else
2537 {
2538 err = re_node_set_init_union (&dest_nodes,
2539 dest_state->entrance_nodes, new_nodes);
2540 if (BE (err != REG_NOERROR, 0))
2541 return err;
2542 }
2543 context = re_string_context_at (&mctx->input, dest_idx - 1,
2544 mctx->eflags);
2545 mctx->state_log[dest_idx]
2546 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2547 if (dest_state != NULL)
2548 re_node_set_free (&dest_nodes);
2549 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2550 return err;
2551 }
2552 return REG_NOERROR;
2553}
2554#endif /* RE_ENABLE_I18N */
2555
2556static reg_errcode_t
2557internal_function
2558transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2559{
2560 const re_dfa_t *const dfa = mctx->dfa;
2561 reg_errcode_t err;
2562 int i;
2563 int cur_str_idx = re_string_cur_idx (&mctx->input);
2564
2565 for (i = 0; i < nodes->nelem; ++i)
2566 {
2567 int dest_str_idx, prev_nelem, bkc_idx;
2568 int node_idx = nodes->elems[i];
2569 unsigned int context;
2570 const re_token_t *node = dfa->nodes + node_idx;
2571 re_node_set *new_dest_nodes;
2572
2573 /* Check whether `node' is a backreference or not. */
2574 if (node->type != OP_BACK_REF)
2575 continue;
2576
2577 if (node->constraint)
2578 {
2579 context = re_string_context_at (&mctx->input, cur_str_idx,
2580 mctx->eflags);
2581 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2582 continue;
2583 }
2584
2585 /* `node' is a backreference.
2586 Check the substring which the substring matched. */
2587 bkc_idx = mctx->nbkref_ents;
2588 err = get_subexp (mctx, node_idx, cur_str_idx);
2589 if (BE (err != REG_NOERROR, 0))
2590 goto free_return;
2591
2592 /* And add the epsilon closures (which is `new_dest_nodes') of
2593 the backreference to appropriate state_log. */
2594#ifdef DEBUG
2595 assert (dfa->nexts[node_idx] != -1);
2596#endif
2597 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2598 {
2599 int subexp_len;
2600 re_dfastate_t *dest_state;
2601 struct re_backref_cache_entry *bkref_ent;
2602 bkref_ent = mctx->bkref_ents + bkc_idx;
2603 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2604 continue;
2605 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2606 new_dest_nodes = (subexp_len == 0
2607 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2608 : dfa->eclosures + dfa->nexts[node_idx]);
2609 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2610 - bkref_ent->subexp_from);
2611 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2612 mctx->eflags);
2613 dest_state = mctx->state_log[dest_str_idx];
2614 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2615 : mctx->state_log[cur_str_idx]->nodes.nelem);
2616 /* Add `new_dest_node' to state_log. */
2617 if (dest_state == NULL)
2618 {
2619 mctx->state_log[dest_str_idx]
2620 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2621 context);
2622 if (BE (mctx->state_log[dest_str_idx] == NULL
2623 && err != REG_NOERROR, 0))
2624 goto free_return;
2625 }
2626 else
2627 {
2628 re_node_set dest_nodes;
2629 err = re_node_set_init_union (&dest_nodes,
2630 dest_state->entrance_nodes,
2631 new_dest_nodes);
2632 if (BE (err != REG_NOERROR, 0))
2633 {
2634 re_node_set_free (&dest_nodes);
2635 goto free_return;
2636 }
2637 mctx->state_log[dest_str_idx]
2638 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2639 re_node_set_free (&dest_nodes);
2640 if (BE (mctx->state_log[dest_str_idx] == NULL
2641 && err != REG_NOERROR, 0))
2642 goto free_return;
2643 }
2644 /* We need to check recursively if the backreference can epsilon
2645 transit. */
2646 if (subexp_len == 0
2647 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2648 {
2649 err = check_subexp_matching_top (mctx, new_dest_nodes,
2650 cur_str_idx);
2651 if (BE (err != REG_NOERROR, 0))
2652 goto free_return;
2653 err = transit_state_bkref (mctx, new_dest_nodes);
2654 if (BE (err != REG_NOERROR, 0))
2655 goto free_return;
2656 }
2657 }
2658 }
2659 err = REG_NOERROR;
2660 free_return:
2661 return err;
2662}
2663
2664/* Enumerate all the candidates which the backreference BKREF_NODE can match
2665 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2666 Note that we might collect inappropriate candidates here.
2667 However, the cost of checking them strictly here is too high, then we
2668 delay these checking for prune_impossible_nodes(). */
2669
2670static reg_errcode_t
2671internal_function
2672get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2673{
2674 const re_dfa_t *const dfa = mctx->dfa;
2675 int subexp_num, sub_top_idx;
2676 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2677 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2678 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2679 if (cache_idx != -1)
2680 {
2681 const struct re_backref_cache_entry *entry
2682 = mctx->bkref_ents + cache_idx;
2683 do
2684 if (entry->node == bkref_node)
2685 return REG_NOERROR; /* We already checked it. */
2686 while (entry++->more);
2687 }
2688
2689 subexp_num = dfa->nodes[bkref_node].opr.idx;
2690
2691 /* For each sub expression */
2692 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2693 {
2694 reg_errcode_t err;
2695 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2696 re_sub_match_last_t *sub_last;
2697 int sub_last_idx, sl_str, bkref_str_off;
2698
2699 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2700 continue; /* It isn't related. */
2701
2702 sl_str = sub_top->str_idx;
2703 bkref_str_off = bkref_str_idx;
2704 /* At first, check the last node of sub expressions we already
2705 evaluated. */
2706 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2707 {
2708 int sl_str_diff;
2709 sub_last = sub_top->lasts[sub_last_idx];
2710 sl_str_diff = sub_last->str_idx - sl_str;
2711 /* The matched string by the sub expression match with the substring
2712 at the back reference? */
2713 if (sl_str_diff > 0)
2714 {
2715 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2716 {
2717 /* Not enough chars for a successful match. */
2718 if (bkref_str_off + sl_str_diff > mctx->input.len)
2719 break;
2720
2721 err = clean_state_log_if_needed (mctx,
2722 bkref_str_off
2723 + sl_str_diff);
2724 if (BE (err != REG_NOERROR, 0))
2725 return err;
2726 buf = (const char *) re_string_get_buffer (&mctx->input);
2727 }
2728 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2729 /* We don't need to search this sub expression any more. */
2730 break;
2731 }
2732 bkref_str_off += sl_str_diff;
2733 sl_str += sl_str_diff;
2734 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2735 bkref_str_idx);
2736
2737 /* Reload buf, since the preceding call might have reallocated
2738 the buffer. */
2739 buf = (const char *) re_string_get_buffer (&mctx->input);
2740
2741 if (err == REG_NOMATCH)
2742 continue;
2743 if (BE (err != REG_NOERROR, 0))
2744 return err;
2745 }
2746
2747 if (sub_last_idx < sub_top->nlasts)
2748 continue;
2749 if (sub_last_idx > 0)
2750 ++sl_str;
2751 /* Then, search for the other last nodes of the sub expression. */
2752 for (; sl_str <= bkref_str_idx; ++sl_str)
2753 {
2754 int cls_node, sl_str_off;
2755 const re_node_set *nodes;
2756 sl_str_off = sl_str - sub_top->str_idx;
2757 /* The matched string by the sub expression match with the substring
2758 at the back reference? */
2759 if (sl_str_off > 0)
2760 {
2761 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2762 {
2763 /* If we are at the end of the input, we cannot match. */
2764 if (bkref_str_off >= mctx->input.len)
2765 break;
2766
2767 err = extend_buffers (mctx);
2768 if (BE (err != REG_NOERROR, 0))
2769 return err;
2770
2771 buf = (const char *) re_string_get_buffer (&mctx->input);
2772 }
2773 if (buf [bkref_str_off++] != buf[sl_str - 1])
2774 break; /* We don't need to search this sub expression
2775 any more. */
2776 }
2777 if (mctx->state_log[sl_str] == NULL)
2778 continue;
2779 /* Does this state have a ')' of the sub expression? */
2780 nodes = &mctx->state_log[sl_str]->nodes;
2781 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2782 OP_CLOSE_SUBEXP);
2783 if (cls_node == -1)
2784 continue; /* No. */
2785 if (sub_top->path == NULL)
2786 {
2787 sub_top->path = calloc (sizeof (state_array_t),
2788 sl_str - sub_top->str_idx + 1);
2789 if (sub_top->path == NULL)
2790 return REG_ESPACE;
2791 }
2792 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2793 in the current context? */
2794 err = check_arrival (mctx, sub_top->path, sub_top->node,
2795 sub_top->str_idx, cls_node, sl_str,
2796 OP_CLOSE_SUBEXP);
2797 if (err == REG_NOMATCH)
2798 continue;
2799 if (BE (err != REG_NOERROR, 0))
2800 return err;
2801 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2802 if (BE (sub_last == NULL, 0))
2803 return REG_ESPACE;
2804 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2805 bkref_str_idx);
2806 if (err == REG_NOMATCH)
2807 continue;
2808 }
2809 }
2810 return REG_NOERROR;
2811}
2812
2813/* Helper functions for get_subexp(). */
2814
2815/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2816 If it can arrive, register the sub expression expressed with SUB_TOP
2817 and SUB_LAST. */
2818
2819static reg_errcode_t
2820internal_function
2821get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2822 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2823{
2824 reg_errcode_t err;
2825 int to_idx;
2826 /* Can the subexpression arrive the back reference? */
2827 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2828 sub_last->str_idx, bkref_node, bkref_str,
2829 OP_OPEN_SUBEXP);
2830 if (err != REG_NOERROR)
2831 return err;
2832 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2833 sub_last->str_idx);
2834 if (BE (err != REG_NOERROR, 0))
2835 return err;
2836 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2837 return clean_state_log_if_needed (mctx, to_idx);
2838}
2839
2840/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2841 Search '(' if FL_OPEN, or search ')' otherwise.
2842 TODO: This function isn't efficient...
2843 Because there might be more than one nodes whose types are
2844 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2845 nodes.
2846 E.g. RE: (a){2} */
2847
2848static int
2849internal_function
2850find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2851 int subexp_idx, int type)
2852{
2853 int cls_idx;
2854 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2855 {
2856 int cls_node = nodes->elems[cls_idx];
2857 const re_token_t *node = dfa->nodes + cls_node;
2858 if (node->type == type
2859 && node->opr.idx == subexp_idx)
2860 return cls_node;
2861 }
2862 return -1;
2863}
2864
2865/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2866 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2867 heavily reused.
2868 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2869
2870static reg_errcode_t
2871internal_function
2872check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2873 int top_str, int last_node, int last_str, int type)
2874{
2875 const re_dfa_t *const dfa = mctx->dfa;
2876 reg_errcode_t err = REG_NOERROR;
2877 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2878 re_dfastate_t *cur_state = NULL;
2879 re_node_set *cur_nodes, next_nodes;
2880 re_dfastate_t **backup_state_log;
2881 unsigned int context;
2882
2883 subexp_num = dfa->nodes[top_node].opr.idx;
2884 /* Extend the buffer if we need. */
2885 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2886 {
2887 re_dfastate_t **new_array;
2888 int old_alloc = path->alloc;
2889 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2890 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2891 if (BE (new_array == NULL, 0))
2892 {
2893 path->alloc = old_alloc;
2894 return REG_ESPACE;
2895 }
2896 path->array = new_array;
2897 memset (new_array + old_alloc, '\0',
2898 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2899 }
2900
2901 str_idx = path->next_idx ?: top_str;
2902
2903 /* Temporary modify MCTX. */
2904 backup_state_log = mctx->state_log;
2905 backup_cur_idx = mctx->input.cur_idx;
2906 mctx->state_log = path->array;
2907 mctx->input.cur_idx = str_idx;
2908
2909 /* Setup initial node set. */
2910 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2911 if (str_idx == top_str)
2912 {
2913 err = re_node_set_init_1 (&next_nodes, top_node);
2914 if (BE (err != REG_NOERROR, 0))
2915 return err;
2916 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2917 if (BE (err != REG_NOERROR, 0))
2918 {
2919 re_node_set_free (&next_nodes);
2920 return err;
2921 }
2922 }
2923 else
2924 {
2925 cur_state = mctx->state_log[str_idx];
2926 if (cur_state && cur_state->has_backref)
2927 {
2928 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2929 if (BE (err != REG_NOERROR, 0))
2930 return err;
2931 }
2932 else
2933 re_node_set_init_empty (&next_nodes);
2934 }
2935 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2936 {
2937 if (next_nodes.nelem)
2938 {
2939 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2940 subexp_num, type);
2941 if (BE (err != REG_NOERROR, 0))
2942 {
2943 re_node_set_free (&next_nodes);
2944 return err;
2945 }
2946 }
2947 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2948 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2949 {
2950 re_node_set_free (&next_nodes);
2951 return err;
2952 }
2953 mctx->state_log[str_idx] = cur_state;
2954 }
2955
2956 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2957 {
2958 re_node_set_empty (&next_nodes);
2959 if (mctx->state_log[str_idx + 1])
2960 {
2961 err = re_node_set_merge (&next_nodes,
2962 &mctx->state_log[str_idx + 1]->nodes);
2963 if (BE (err != REG_NOERROR, 0))
2964 {
2965 re_node_set_free (&next_nodes);
2966 return err;
2967 }
2968 }
2969 if (cur_state)
2970 {
2971 err = check_arrival_add_next_nodes (mctx, str_idx,
2972 &cur_state->non_eps_nodes,
2973 &next_nodes);
2974 if (BE (err != REG_NOERROR, 0))
2975 {
2976 re_node_set_free (&next_nodes);
2977 return err;
2978 }
2979 }
2980 ++str_idx;
2981 if (next_nodes.nelem)
2982 {
2983 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2984 if (BE (err != REG_NOERROR, 0))
2985 {
2986 re_node_set_free (&next_nodes);
2987 return err;
2988 }
2989 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2990 subexp_num, type);
2991 if (BE (err != REG_NOERROR, 0))
2992 {
2993 re_node_set_free (&next_nodes);
2994 return err;
2995 }
2996 }
2997 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2998 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2999 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3000 {
3001 re_node_set_free (&next_nodes);
3002 return err;
3003 }
3004 mctx->state_log[str_idx] = cur_state;
3005 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3006 }
3007 re_node_set_free (&next_nodes);
3008 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3009 : &mctx->state_log[last_str]->nodes);
3010 path->next_idx = str_idx;
3011
3012 /* Fix MCTX. */
3013 mctx->state_log = backup_state_log;
3014 mctx->input.cur_idx = backup_cur_idx;
3015
3016 /* Then check the current node set has the node LAST_NODE. */
3017 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3018 return REG_NOERROR;
3019
3020 return REG_NOMATCH;
3021}
3022
3023/* Helper functions for check_arrival. */
3024
3025/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3026 to NEXT_NODES.
3027 TODO: This function is similar to the functions transit_state*(),
3028 however this function has many additional works.
3029 Can't we unify them? */
3030
3031static reg_errcode_t
3032internal_function
3033check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3034 re_node_set *cur_nodes, re_node_set *next_nodes)
3035{
3036 const re_dfa_t *const dfa = mctx->dfa;
3037 int result;
3038 int cur_idx;
3039 reg_errcode_t err = REG_NOERROR;
3040 re_node_set union_set;
3041 re_node_set_init_empty (&union_set);
3042 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3043 {
3044 int naccepted = 0;
3045 int cur_node = cur_nodes->elems[cur_idx];
3046#ifdef DEBUG
3047 re_token_type_t type = dfa->nodes[cur_node].type;
3048 assert (!IS_EPSILON_NODE (type));
3049#endif
3050#ifdef RE_ENABLE_I18N
3051 /* If the node may accept `multi byte'. */
3052 if (dfa->nodes[cur_node].accept_mb)
3053 {
3054 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3055 str_idx);
3056 if (naccepted > 1)
3057 {
3058 re_dfastate_t *dest_state;
3059 int next_node = dfa->nexts[cur_node];
3060 int next_idx = str_idx + naccepted;
3061 dest_state = mctx->state_log[next_idx];
3062 re_node_set_empty (&union_set);
3063 if (dest_state)
3064 {
3065 err = re_node_set_merge (&union_set, &dest_state->nodes);
3066 if (BE (err != REG_NOERROR, 0))
3067 {
3068 re_node_set_free (&union_set);
3069 return err;
3070 }
3071 }
3072 result = re_node_set_insert (&union_set, next_node);
3073 if (BE (result < 0, 0))
3074 {
3075 re_node_set_free (&union_set);
3076 return REG_ESPACE;
3077 }
3078 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3079 &union_set);
3080 if (BE (mctx->state_log[next_idx] == NULL
3081 && err != REG_NOERROR, 0))
3082 {
3083 re_node_set_free (&union_set);
3084 return err;
3085 }
3086 }
3087 }
3088#endif /* RE_ENABLE_I18N */
3089 if (naccepted
3090 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3091 {
3092 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3093 if (BE (result < 0, 0))
3094 {
3095 re_node_set_free (&union_set);
3096 return REG_ESPACE;
3097 }
3098 }
3099 }
3100 re_node_set_free (&union_set);
3101 return REG_NOERROR;
3102}
3103
3104/* For all the nodes in CUR_NODES, add the epsilon closures of them to
3105 CUR_NODES, however exclude the nodes which are:
3106 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3107 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3108*/
3109
3110static reg_errcode_t
3111internal_function
3112check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3113 int ex_subexp, int type)
3114{
3115 reg_errcode_t err;
3116 int idx, outside_node;
3117 re_node_set new_nodes;
3118#ifdef DEBUG
3119 assert (cur_nodes->nelem);
3120#endif
3121 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3122 if (BE (err != REG_NOERROR, 0))
3123 return err;
3124 /* Create a new node set NEW_NODES with the nodes which are epsilon
3125 closures of the node in CUR_NODES. */
3126
3127 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3128 {
3129 int cur_node = cur_nodes->elems[idx];
3130 const re_node_set *eclosure = dfa->eclosures + cur_node;
3131 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3132 if (outside_node == -1)
3133 {
3134 /* There are no problematic nodes, just merge them. */
3135 err = re_node_set_merge (&new_nodes, eclosure);
3136 if (BE (err != REG_NOERROR, 0))
3137 {
3138 re_node_set_free (&new_nodes);
3139 return err;
3140 }
3141 }
3142 else
3143 {
3144 /* There are problematic nodes, re-calculate incrementally. */
3145 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3146 ex_subexp, type);
3147 if (BE (err != REG_NOERROR, 0))
3148 {
3149 re_node_set_free (&new_nodes);
3150 return err;
3151 }
3152 }
3153 }
3154 re_node_set_free (cur_nodes);
3155 *cur_nodes = new_nodes;
3156 return REG_NOERROR;
3157}
3158
3159/* Helper function for check_arrival_expand_ecl.
3160 Check incrementally the epsilon closure of TARGET, and if it isn't
3161 problematic append it to DST_NODES. */
3162
3163static reg_errcode_t
3164internal_function
3165check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3166 int target, int ex_subexp, int type)
3167{
3168 int cur_node;
3169 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3170 {
3171 int err;
3172
3173 if (dfa->nodes[cur_node].type == type
3174 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3175 {
3176 if (type == OP_CLOSE_SUBEXP)
3177 {
3178 err = re_node_set_insert (dst_nodes, cur_node);
3179 if (BE (err == -1, 0))
3180 return REG_ESPACE;
3181 }
3182 break;
3183 }
3184 err = re_node_set_insert (dst_nodes, cur_node);
3185 if (BE (err == -1, 0))
3186 return REG_ESPACE;
3187 if (dfa->edests[cur_node].nelem == 0)
3188 break;
3189 if (dfa->edests[cur_node].nelem == 2)
3190 {
3191 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3192 dfa->edests[cur_node].elems[1],
3193 ex_subexp, type);
3194 if (BE (err != REG_NOERROR, 0))
3195 return err;
3196 }
3197 cur_node = dfa->edests[cur_node].elems[0];
3198 }
3199 return REG_NOERROR;
3200}
3201
3202
3203/* For all the back references in the current state, calculate the
3204 destination of the back references by the appropriate entry
3205 in MCTX->BKREF_ENTS. */
3206
3207static reg_errcode_t
3208internal_function
3209expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3210 int cur_str, int subexp_num, int type)
3211{
3212 const re_dfa_t *const dfa = mctx->dfa;
3213 reg_errcode_t err;
3214 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3215 struct re_backref_cache_entry *ent;
3216
3217 if (cache_idx_start == -1)
3218 return REG_NOERROR;
3219
3220 restart:
3221 ent = mctx->bkref_ents + cache_idx_start;
3222 do
3223 {
3224 int to_idx, next_node;
3225
3226 /* Is this entry ENT is appropriate? */
3227 if (!re_node_set_contains (cur_nodes, ent->node))
3228 continue; /* No. */
3229
3230 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3231 /* Calculate the destination of the back reference, and append it
3232 to MCTX->STATE_LOG. */
3233 if (to_idx == cur_str)
3234 {
3235 /* The backreference did epsilon transit, we must re-check all the
3236 node in the current state. */
3237 re_node_set new_dests;
3238 reg_errcode_t err2, err3;
3239 next_node = dfa->edests[ent->node].elems[0];
3240 if (re_node_set_contains (cur_nodes, next_node))
3241 continue;
3242 err = re_node_set_init_1 (&new_dests, next_node);
3243 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3244 err3 = re_node_set_merge (cur_nodes, &new_dests);
3245 re_node_set_free (&new_dests);
3246 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3247 || err3 != REG_NOERROR, 0))
3248 {
3249 err = (err != REG_NOERROR ? err
3250 : (err2 != REG_NOERROR ? err2 : err3));
3251 return err;
3252 }
3253 /* TODO: It is still inefficient... */
3254 goto restart;
3255 }
3256 else
3257 {
3258 re_node_set union_set;
3259 next_node = dfa->nexts[ent->node];
3260 if (mctx->state_log[to_idx])
3261 {
3262 int ret;
3263 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3264 next_node))
3265 continue;
3266 err = re_node_set_init_copy (&union_set,
3267 &mctx->state_log[to_idx]->nodes);
3268 ret = re_node_set_insert (&union_set, next_node);
3269 if (BE (err != REG_NOERROR || ret < 0, 0))
3270 {
3271 re_node_set_free (&union_set);
3272 err = err != REG_NOERROR ? err : REG_ESPACE;
3273 return err;
3274 }
3275 }
3276 else
3277 {
3278 err = re_node_set_init_1 (&union_set, next_node);
3279 if (BE (err != REG_NOERROR, 0))
3280 return err;
3281 }
3282 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3283 re_node_set_free (&union_set);
3284 if (BE (mctx->state_log[to_idx] == NULL
3285 && err != REG_NOERROR, 0))
3286 return err;
3287 }
3288 }
3289 while (ent++->more);
3290 return REG_NOERROR;
3291}
3292
3293/* Build transition table for the state.
3294 Return 1 if succeeded, otherwise return NULL. */
3295
3296static int
3297internal_function
3298build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3299{
3300 reg_errcode_t err;
3301 int i, j, ch, need_word_trtable = 0;
3302 bitset_word_t elem, mask;
3303 bool dests_node_malloced = false;
3304 bool dest_states_malloced = false;
3305 int ndests; /* Number of the destination states from `state'. */
3306 re_dfastate_t **trtable;
3307 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3308 re_node_set follows, *dests_node;
3309 bitset_t *dests_ch;
3310 bitset_t acceptable;
3311
3312 struct dests_alloc
3313 {
3314 re_node_set dests_node[SBC_MAX];
3315 bitset_t dests_ch[SBC_MAX];
3316 } *dests_alloc;
3317
3318 /* We build DFA states which corresponds to the destination nodes
3319 from `state'. `dests_node[i]' represents the nodes which i-th
3320 destination state contains, and `dests_ch[i]' represents the
3321 characters which i-th destination state accepts. */
3322 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3323 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3324 else
3325 {
3326 dests_alloc = re_malloc (struct dests_alloc, 1);
3327 if (BE (dests_alloc == NULL, 0))
3328 return 0;
3329 dests_node_malloced = true;
3330 }
3331 dests_node = dests_alloc->dests_node;
3332 dests_ch = dests_alloc->dests_ch;
3333
3334 /* Initialize transiton table. */
3335 state->word_trtable = state->trtable = NULL;
3336
3337 /* At first, group all nodes belonging to `state' into several
3338 destinations. */
3339 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3340 if (BE (ndests <= 0, 0))
3341 {
3342 if (dests_node_malloced)
3343 free (dests_alloc);
3344 /* Return 0 in case of an error, 1 otherwise. */
3345 if (ndests == 0)
3346 {
3347 state->trtable = (re_dfastate_t **)
3348 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3349 return 1;
3350 }
3351 return 0;
3352 }
3353
3354 err = re_node_set_alloc (&follows, ndests + 1);
3355 if (BE (err != REG_NOERROR, 0))
3356 goto out_free;
3357
3358 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3359 + ndests * 3 * sizeof (re_dfastate_t *)))
3360 dest_states = (re_dfastate_t **)
3361 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3362 else
3363 {
3364 dest_states = (re_dfastate_t **)
3365 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3366 if (BE (dest_states == NULL, 0))
3367 {
3368out_free:
3369 if (dest_states_malloced)
3370 free (dest_states);
3371 re_node_set_free (&follows);
3372 for (i = 0; i < ndests; ++i)
3373 re_node_set_free (dests_node + i);
3374 if (dests_node_malloced)
3375 free (dests_alloc);
3376 return 0;
3377 }
3378 dest_states_malloced = true;
3379 }
3380 dest_states_word = dest_states + ndests;
3381 dest_states_nl = dest_states_word + ndests;
3382 bitset_empty (acceptable);
3383
3384 /* Then build the states for all destinations. */
3385 for (i = 0; i < ndests; ++i)
3386 {
3387 int next_node;
3388 re_node_set_empty (&follows);
3389 /* Merge the follows of this destination states. */
3390 for (j = 0; j < dests_node[i].nelem; ++j)
3391 {
3392 next_node = dfa->nexts[dests_node[i].elems[j]];
3393 if (next_node != -1)
3394 {
3395 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3396 if (BE (err != REG_NOERROR, 0))
3397 goto out_free;
3398 }
3399 }
3400 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3401 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3402 goto out_free;
3403 /* If the new state has context constraint,
3404 build appropriate states for these contexts. */
3405 if (dest_states[i]->has_constraint)
3406 {
3407 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3408 CONTEXT_WORD);
3409 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3410 goto out_free;
3411
3412 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3413 need_word_trtable = 1;
3414
3415 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3416 CONTEXT_NEWLINE);
3417 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3418 goto out_free;
3419 }
3420 else
3421 {
3422 dest_states_word[i] = dest_states[i];
3423 dest_states_nl[i] = dest_states[i];
3424 }
3425 bitset_merge (acceptable, dests_ch[i]);
3426 }
3427
3428 if (!BE (need_word_trtable, 0))
3429 {
3430 /* We don't care about whether the following character is a word
3431 character, or we are in a single-byte character set so we can
3432 discern by looking at the character code: allocate a
3433 256-entry transition table. */
3434 trtable = state->trtable =
3435 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3436 if (BE (trtable == NULL, 0))
3437 goto out_free;
3438
3439 /* For all characters ch...: */
3440 for (i = 0; i < BITSET_WORDS; ++i)
3441 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3442 elem;
3443 mask <<= 1, elem >>= 1, ++ch)
3444 if (BE (elem & 1, 0))
3445 {
3446 /* There must be exactly one destination which accepts
3447 character ch. See group_nodes_into_DFAstates. */
3448 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3449 ;
3450
3451 /* j-th destination accepts the word character ch. */
3452 if (dfa->word_char[i] & mask)
3453 trtable[ch] = dest_states_word[j];
3454 else
3455 trtable[ch] = dest_states[j];
3456 }
3457 }
3458 else
3459 {
3460 /* We care about whether the following character is a word
3461 character, and we are in a multi-byte character set: discern
3462 by looking at the character code: build two 256-entry
3463 transition tables, one starting at trtable[0] and one
3464 starting at trtable[SBC_MAX]. */
3465 trtable = state->word_trtable =
3466 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3467 if (BE (trtable == NULL, 0))
3468 goto out_free;
3469
3470 /* For all characters ch...: */
3471 for (i = 0; i < BITSET_WORDS; ++i)
3472 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3473 elem;
3474 mask <<= 1, elem >>= 1, ++ch)
3475 if (BE (elem & 1, 0))
3476 {
3477 /* There must be exactly one destination which accepts
3478 character ch. See group_nodes_into_DFAstates. */
3479 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3480 ;
3481
3482 /* j-th destination accepts the word character ch. */
3483 trtable[ch] = dest_states[j];
3484 trtable[ch + SBC_MAX] = dest_states_word[j];
3485 }
3486 }
3487
3488 /* new line */
3489 if (bitset_contain (acceptable, NEWLINE_CHAR))
3490 {
3491 /* The current state accepts newline character. */
3492 for (j = 0; j < ndests; ++j)
3493 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3494 {
3495 /* k-th destination accepts newline character. */
3496 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3497 if (need_word_trtable)
3498 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3499 /* There must be only one destination which accepts
3500 newline. See group_nodes_into_DFAstates. */
3501 break;
3502 }
3503 }
3504
3505 if (dest_states_malloced)
3506 free (dest_states);
3507
3508 re_node_set_free (&follows);
3509 for (i = 0; i < ndests; ++i)
3510 re_node_set_free (dests_node + i);
3511
3512 if (dests_node_malloced)
3513 free (dests_alloc);
3514
3515 return 1;
3516}
3517
3518/* Group all nodes belonging to STATE into several destinations.
3519 Then for all destinations, set the nodes belonging to the destination
3520 to DESTS_NODE[i] and set the characters accepted by the destination
3521 to DEST_CH[i]. This function return the number of destinations. */
3522
3523static int
3524internal_function
3525group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3526 re_node_set *dests_node, bitset_t *dests_ch)
3527{
3528 reg_errcode_t err;
3529 int result;
3530 int i, j, k;
3531 int ndests; /* Number of the destinations from `state'. */
3532 bitset_t accepts; /* Characters a node can accept. */
3533 const re_node_set *cur_nodes = &state->nodes;
3534 bitset_empty (accepts);
3535 ndests = 0;
3536
3537 /* For all the nodes belonging to `state', */
3538 for (i = 0; i < cur_nodes->nelem; ++i)
3539 {
3540 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3541 re_token_type_t type = node->type;
3542 unsigned int constraint = node->constraint;
3543
3544 /* Enumerate all single byte character this node can accept. */
3545 if (type == CHARACTER)
3546 bitset_set (accepts, node->opr.c);
3547 else if (type == SIMPLE_BRACKET)
3548 {
3549 bitset_merge (accepts, node->opr.sbcset);
3550 }
3551 else if (type == OP_PERIOD)
3552 {
3553#ifdef RE_ENABLE_I18N
3554 if (dfa->mb_cur_max > 1)
3555 bitset_merge (accepts, dfa->sb_char);
3556 else
3557#endif
3558 bitset_set_all (accepts);
3559 if (!(dfa->syntax & RE_DOT_NEWLINE))
3560 bitset_clear (accepts, '\n');
3561 if (dfa->syntax & RE_DOT_NOT_NULL)
3562 bitset_clear (accepts, '\0');
3563 }
3564#ifdef RE_ENABLE_I18N
3565 else if (type == OP_UTF8_PERIOD)
3566 {
3567 memset (accepts, '\xff', sizeof (bitset_t) / 2);
3568 if (!(dfa->syntax & RE_DOT_NEWLINE))
3569 bitset_clear (accepts, '\n');
3570 if (dfa->syntax & RE_DOT_NOT_NULL)
3571 bitset_clear (accepts, '\0');
3572 }
3573#endif
3574 else
3575 continue;
3576
3577 /* Check the `accepts' and sift the characters which are not
3578 match it the context. */
3579 if (constraint)
3580 {
3581 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3582 {
3583 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3584 bitset_empty (accepts);
3585 if (accepts_newline)
3586 bitset_set (accepts, NEWLINE_CHAR);
3587 else
3588 continue;
3589 }
3590 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3591 {
3592 bitset_empty (accepts);
3593 continue;
3594 }
3595
3596 if (constraint & NEXT_WORD_CONSTRAINT)
3597 {
3598 bitset_word_t any_set = 0;
3599 if (type == CHARACTER && !node->word_char)
3600 {
3601 bitset_empty (accepts);
3602 continue;
3603 }
3604#ifdef RE_ENABLE_I18N
3605 if (dfa->mb_cur_max > 1)
3606 for (j = 0; j < BITSET_WORDS; ++j)
3607 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3608 else
3609#endif
3610 for (j = 0; j < BITSET_WORDS; ++j)
3611 any_set |= (accepts[j] &= dfa->word_char[j]);
3612 if (!any_set)
3613 continue;
3614 }
3615 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3616 {
3617 bitset_word_t any_set = 0;
3618 if (type == CHARACTER && node->word_char)
3619 {
3620 bitset_empty (accepts);
3621 continue;
3622 }
3623#ifdef RE_ENABLE_I18N
3624 if (dfa->mb_cur_max > 1)
3625 for (j = 0; j < BITSET_WORDS; ++j)
3626 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3627 else
3628#endif
3629 for (j = 0; j < BITSET_WORDS; ++j)
3630 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3631 if (!any_set)
3632 continue;
3633 }
3634 }
3635
3636 /* Then divide `accepts' into DFA states, or create a new
3637 state. Above, we make sure that accepts is not empty. */
3638 for (j = 0; j < ndests; ++j)
3639 {
3640 bitset_t intersec; /* Intersection sets, see below. */
3641 bitset_t remains;
3642 /* Flags, see below. */
3643 bitset_word_t has_intersec, not_subset, not_consumed;
3644
3645 /* Optimization, skip if this state doesn't accept the character. */
3646 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3647 continue;
3648
3649 /* Enumerate the intersection set of this state and `accepts'. */
3650 has_intersec = 0;
3651 for (k = 0; k < BITSET_WORDS; ++k)
3652 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3653 /* And skip if the intersection set is empty. */
3654 if (!has_intersec)
3655 continue;
3656
3657 /* Then check if this state is a subset of `accepts'. */
3658 not_subset = not_consumed = 0;
3659 for (k = 0; k < BITSET_WORDS; ++k)
3660 {
3661 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3662 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3663 }
3664
3665 /* If this state isn't a subset of `accepts', create a
3666 new group state, which has the `remains'. */
3667 if (not_subset)
3668 {
3669 bitset_copy (dests_ch[ndests], remains);
3670 bitset_copy (dests_ch[j], intersec);
3671 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3672 if (BE (err != REG_NOERROR, 0))
3673 goto error_return;
3674 ++ndests;
3675 }
3676
3677 /* Put the position in the current group. */
3678 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3679 if (BE (result < 0, 0))
3680 goto error_return;
3681
3682 /* If all characters are consumed, go to next node. */
3683 if (!not_consumed)
3684 break;
3685 }
3686 /* Some characters remain, create a new group. */
3687 if (j == ndests)
3688 {
3689 bitset_copy (dests_ch[ndests], accepts);
3690 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3691 if (BE (err != REG_NOERROR, 0))
3692 goto error_return;
3693 ++ndests;
3694 bitset_empty (accepts);
3695 }
3696 }
3697 return ndests;
3698 error_return:
3699 for (j = 0; j < ndests; ++j)
3700 re_node_set_free (dests_node + j);
3701 return -1;
3702}
3703
3704#ifdef RE_ENABLE_I18N
3705/* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3706 Return the number of the bytes the node accepts.
3707 STR_IDX is the current index of the input string.
3708
3709 This function handles the nodes which can accept one character, or
3710 one collating element like '.', '[a-z]', opposite to the other nodes
3711 can only accept one byte. */
3712
3713static int
3714internal_function
3715check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3716 const re_string_t *input, int str_idx)
3717{
3718 const re_token_t *node = dfa->nodes + node_idx;
3719 int char_len, elem_len;
3720 int i;
3721
3722 if (BE (node->type == OP_UTF8_PERIOD, 0))
3723 {
3724 unsigned char c = re_string_byte_at (input, str_idx), d;
3725 if (BE (c < 0xc2, 1))
3726 return 0;
3727
3728 if (str_idx + 2 > input->len)
3729 return 0;
3730
3731 d = re_string_byte_at (input, str_idx + 1);
3732 if (c < 0xe0)
3733 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3734 else if (c < 0xf0)
3735 {
3736 char_len = 3;
3737 if (c == 0xe0 && d < 0xa0)
3738 return 0;
3739 }
3740 else if (c < 0xf8)
3741 {
3742 char_len = 4;
3743 if (c == 0xf0 && d < 0x90)
3744 return 0;
3745 }
3746 else if (c < 0xfc)
3747 {
3748 char_len = 5;
3749 if (c == 0xf8 && d < 0x88)
3750 return 0;
3751 }
3752 else if (c < 0xfe)
3753 {
3754 char_len = 6;
3755 if (c == 0xfc && d < 0x84)
3756 return 0;
3757 }
3758 else
3759 return 0;
3760
3761 if (str_idx + char_len > input->len)
3762 return 0;
3763
3764 for (i = 1; i < char_len; ++i)
3765 {
3766 d = re_string_byte_at (input, str_idx + i);
3767 if (d < 0x80 || d > 0xbf)
3768 return 0;
3769 }
3770 return char_len;
3771 }
3772
3773 char_len = re_string_char_size_at (input, str_idx);
3774 if (node->type == OP_PERIOD)
3775 {
3776 if (char_len <= 1)
3777 return 0;
3778 /* FIXME: I don't think this if is needed, as both '\n'
3779 and '\0' are char_len == 1. */
3780 /* '.' accepts any one character except the following two cases. */
3781 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3782 re_string_byte_at (input, str_idx) == '\n') ||
3783 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3784 re_string_byte_at (input, str_idx) == '\0'))
3785 return 0;
3786 return char_len;
3787 }
3788
3789 elem_len = re_string_elem_size_at (input, str_idx);
3790 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3791 return 0;
3792
3793 if (node->type == COMPLEX_BRACKET)
3794 {
3795 const re_charset_t *cset = node->opr.mbcset;
3796# ifdef _LIBC
3797 const unsigned char *pin
3798 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3799 int j;
3800 uint32_t nrules;
3801# endif /* _LIBC */
3802 int match_len = 0;
3803 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3804 ? re_string_wchar_at (input, str_idx) : 0);
3805
3806 /* match with multibyte character? */
3807 for (i = 0; i < cset->nmbchars; ++i)
3808 if (wc == cset->mbchars[i])
3809 {
3810 match_len = char_len;
3811 goto check_node_accept_bytes_match;
3812 }
3813 /* match with character_class? */
3814 for (i = 0; i < cset->nchar_classes; ++i)
3815 {
3816 wctype_t wt = cset->char_classes[i];
3817 if (__iswctype (wc, wt))
3818 {
3819 match_len = char_len;
3820 goto check_node_accept_bytes_match;
3821 }
3822 }
3823
3824# ifdef _LIBC
3825 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3826 if (nrules != 0)
3827 {
3828 unsigned int in_collseq = 0;
3829 const int32_t *table, *indirect;
3830 const unsigned char *weights, *extra;
3831 const char *collseqwc;
3832 int32_t idx;
3833 /* This #include defines a local function! */
3834# include <locale/weight.h>
3835
3836 /* match with collating_symbol? */
3837 if (cset->ncoll_syms)
3838 extra = (const unsigned char *)
3839 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3840 for (i = 0; i < cset->ncoll_syms; ++i)
3841 {
3842 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3843 /* Compare the length of input collating element and
3844 the length of current collating element. */
3845 if (*coll_sym != elem_len)
3846 continue;
3847 /* Compare each bytes. */
3848 for (j = 0; j < *coll_sym; j++)
3849 if (pin[j] != coll_sym[1 + j])
3850 break;
3851 if (j == *coll_sym)
3852 {
3853 /* Match if every bytes is equal. */
3854 match_len = j;
3855 goto check_node_accept_bytes_match;
3856 }
3857 }
3858
3859 if (cset->nranges)
3860 {
3861 if (elem_len <= char_len)
3862 {
3863 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3864 in_collseq = __collseq_table_lookup (collseqwc, wc);
3865 }
3866 else
3867 in_collseq = find_collation_sequence_value (pin, elem_len);
3868 }
3869 /* match with range expression? */
3870 for (i = 0; i < cset->nranges; ++i)
3871 if (cset->range_starts[i] <= in_collseq
3872 && in_collseq <= cset->range_ends[i])
3873 {
3874 match_len = elem_len;
3875 goto check_node_accept_bytes_match;
3876 }
3877
3878 /* match with equivalence_class? */
3879 if (cset->nequiv_classes)
3880 {
3881 const unsigned char *cp = pin;
3882 table = (const int32_t *)
3883 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3884 weights = (const unsigned char *)
3885 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3886 extra = (const unsigned char *)
3887 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3888 indirect = (const int32_t *)
3889 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3890 idx = findidx (&cp);
3891 if (idx > 0)
3892 for (i = 0; i < cset->nequiv_classes; ++i)
3893 {
3894 int32_t equiv_class_idx = cset->equiv_classes[i];
3895 size_t weight_len = weights[idx];
3896 if (weight_len == weights[equiv_class_idx])
3897 {
3898 int cnt = 0;
3899 while (cnt <= weight_len
3900 && (weights[equiv_class_idx + 1 + cnt]
3901 == weights[idx + 1 + cnt]))
3902 ++cnt;
3903 if (cnt > weight_len)
3904 {
3905 match_len = elem_len;
3906 goto check_node_accept_bytes_match;
3907 }
3908 }
3909 }
3910 }
3911 }
3912 else
3913# endif /* _LIBC */
3914 {
3915 /* match with range expression? */
3916#if __GNUC__ >= 2
3917 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3918#else
3919 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3920 cmp_buf[2] = wc;
3921#endif
3922 for (i = 0; i < cset->nranges; ++i)
3923 {
3924 cmp_buf[0] = cset->range_starts[i];
3925 cmp_buf[4] = cset->range_ends[i];
3926 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3927 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3928 {
3929 match_len = char_len;
3930 goto check_node_accept_bytes_match;
3931 }
3932 }
3933 }
3934 check_node_accept_bytes_match:
3935 if (!cset->non_match)
3936 return match_len;
3937 else
3938 {
3939 if (match_len > 0)
3940 return 0;
3941 else
3942 return (elem_len > char_len) ? elem_len : char_len;
3943 }
3944 }
3945 return 0;
3946}
3947
3948# ifdef _LIBC
3949static unsigned int
3950internal_function
3951find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3952{
3953 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3954 if (nrules == 0)
3955 {
3956 if (mbs_len == 1)
3957 {
3958 /* No valid character. Match it as a single byte character. */
3959 const unsigned char *collseq = (const unsigned char *)
3960 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3961 return collseq[mbs[0]];
3962 }
3963 return UINT_MAX;
3964 }
3965 else
3966 {
3967 int32_t idx;
3968 const unsigned char *extra = (const unsigned char *)
3969 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3970 int32_t extrasize = (const unsigned char *)
3971 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3972
3973 for (idx = 0; idx < extrasize;)
3974 {
3975 int mbs_cnt, found = 0;
3976 int32_t elem_mbs_len;
3977 /* Skip the name of collating element name. */
3978 idx = idx + extra[idx] + 1;
3979 elem_mbs_len = extra[idx++];
3980 if (mbs_len == elem_mbs_len)
3981 {
3982 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3983 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3984 break;
3985 if (mbs_cnt == elem_mbs_len)
3986 /* Found the entry. */
3987 found = 1;
3988 }
3989 /* Skip the byte sequence of the collating element. */
3990 idx += elem_mbs_len;
3991 /* Adjust for the alignment. */
3992 idx = (idx + 3) & ~3;
3993 /* Skip the collation sequence value. */
3994 idx += sizeof (uint32_t);
3995 /* Skip the wide char sequence of the collating element. */
3996 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3997 /* If we found the entry, return the sequence value. */
3998 if (found)
3999 return *(uint32_t *) (extra + idx);
4000 /* Skip the collation sequence value. */
4001 idx += sizeof (uint32_t);
4002 }
4003 return UINT_MAX;
4004 }
4005}
4006# endif /* _LIBC */
4007#endif /* RE_ENABLE_I18N */
4008
4009/* Check whether the node accepts the byte which is IDX-th
4010 byte of the INPUT. */
4011
4012static int
4013internal_function
4014check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4015 int idx)
4016{
4017 unsigned char ch;
4018 ch = re_string_byte_at (&mctx->input, idx);
4019 switch (node->type)
4020 {
4021 case CHARACTER:
4022 if (node->opr.c != ch)
4023 return 0;
4024 break;
4025
4026 case SIMPLE_BRACKET:
4027 if (!bitset_contain (node->opr.sbcset, ch))
4028 return 0;
4029 break;
4030
4031#ifdef RE_ENABLE_I18N
4032 case OP_UTF8_PERIOD:
4033 if (ch >= 0x80)
4034 return 0;
4035 /* FALLTHROUGH */
4036#endif
4037 case OP_PERIOD:
4038 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4039 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4040 return 0;
4041 break;
4042
4043 default:
4044 return 0;
4045 }
4046
4047 if (node->constraint)
4048 {
4049 /* The node has constraints. Check whether the current context
4050 satisfies the constraints. */
4051 unsigned int context = re_string_context_at (&mctx->input, idx,
4052 mctx->eflags);
4053 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4054 return 0;
4055 }
4056
4057 return 1;
4058}
4059
4060/* Extend the buffers, if the buffers have run out. */
4061
4062static reg_errcode_t
4063internal_function
4064extend_buffers (re_match_context_t *mctx)
4065{
4066 reg_errcode_t ret;
4067 re_string_t *pstr = &mctx->input;
4068
4069 /* Double the lengthes of the buffers. */
4070 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4071 if (BE (ret != REG_NOERROR, 0))
4072 return ret;
4073
4074 if (mctx->state_log != NULL)
4075 {
4076 /* And double the length of state_log. */
4077 /* XXX We have no indication of the size of this buffer. If this
4078 allocation fail we have no indication that the state_log array
4079 does not have the right size. */
4080 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4081 pstr->bufs_len + 1);
4082 if (BE (new_array == NULL, 0))
4083 return REG_ESPACE;
4084 mctx->state_log = new_array;
4085 }
4086
4087 /* Then reconstruct the buffers. */
4088 if (pstr->icase)
4089 {
4090#ifdef RE_ENABLE_I18N
4091 if (pstr->mb_cur_max > 1)
4092 {
4093 ret = build_wcs_upper_buffer (pstr);
4094 if (BE (ret != REG_NOERROR, 0))
4095 return ret;
4096 }
4097 else
4098#endif /* RE_ENABLE_I18N */
4099 build_upper_buffer (pstr);
4100 }
4101 else
4102 {
4103#ifdef RE_ENABLE_I18N
4104 if (pstr->mb_cur_max > 1)
4105 build_wcs_buffer (pstr);
4106 else
4107#endif /* RE_ENABLE_I18N */
4108 {
4109 if (pstr->trans != NULL)
4110 re_string_translate_buffer (pstr);
4111 }
4112 }
4113 return REG_NOERROR;
4114}
4115
4116
4117
4118/* Functions for matching context. */
4119
4120/* Initialize MCTX. */
4121
4122static reg_errcode_t
4123internal_function
4124match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4125{
4126 mctx->eflags = eflags;
4127 mctx->match_last = -1;
4128 if (n > 0)
4129 {
4130 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4131 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4132 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4133 return REG_ESPACE;
4134 }
4135 /* Already zero-ed by the caller.
4136 else
4137 mctx->bkref_ents = NULL;
4138 mctx->nbkref_ents = 0;
4139 mctx->nsub_tops = 0; */
4140 mctx->abkref_ents = n;
4141 mctx->max_mb_elem_len = 1;
4142 mctx->asub_tops = n;
4143 return REG_NOERROR;
4144}
4145
4146/* Clean the entries which depend on the current input in MCTX.
4147 This function must be invoked when the matcher changes the start index
4148 of the input, or changes the input string. */
4149
4150static void
4151internal_function
4152match_ctx_clean (re_match_context_t *mctx)
4153{
4154 int st_idx;
4155 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4156 {
4157 int sl_idx;
4158 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4159 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4160 {
4161 re_sub_match_last_t *last = top->lasts[sl_idx];
4162 re_free (last->path.array);
4163 re_free (last);
4164 }
4165 re_free (top->lasts);
4166 if (top->path)
4167 {
4168 re_free (top->path->array);
4169 re_free (top->path);
4170 }
4171 free (top);
4172 }
4173
4174 mctx->nsub_tops = 0;
4175 mctx->nbkref_ents = 0;
4176}
4177
4178/* Free all the memory associated with MCTX. */
4179
4180static void
4181internal_function
4182match_ctx_free (re_match_context_t *mctx)
4183{
4184 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4185 match_ctx_clean (mctx);
4186 re_free (mctx->sub_tops);
4187 re_free (mctx->bkref_ents);
4188}
4189
4190/* Add a new backreference entry to MCTX.
4191 Note that we assume that caller never call this function with duplicate
4192 entry, and call with STR_IDX which isn't smaller than any existing entry.
4193*/
4194
4195static reg_errcode_t
4196internal_function
4197match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4198 int to)
4199{
4200 if (mctx->nbkref_ents >= mctx->abkref_ents)
4201 {
4202 struct re_backref_cache_entry* new_entry;
4203 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4204 mctx->abkref_ents * 2);
4205 if (BE (new_entry == NULL, 0))
4206 {
4207 re_free (mctx->bkref_ents);
4208 return REG_ESPACE;
4209 }
4210 mctx->bkref_ents = new_entry;
4211 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4212 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4213 mctx->abkref_ents *= 2;
4214 }
4215 if (mctx->nbkref_ents > 0
4216 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4217 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4218
4219 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4220 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4221 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4222 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4223
4224 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4225 If bit N is clear, means that this entry won't epsilon-transition to
4226 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4227 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4228 such node.
4229
4230 A backreference does not epsilon-transition unless it is empty, so set
4231 to all zeros if FROM != TO. */
4232 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4233 = (from == to ? ~0 : 0);
4234
4235 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4236 if (mctx->max_mb_elem_len < to - from)
4237 mctx->max_mb_elem_len = to - from;
4238 return REG_NOERROR;
4239}
4240
4241/* Search for the first entry which has the same str_idx, or -1 if none is
4242 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4243
4244static int
4245internal_function
4246search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4247{
4248 int left, right, mid, last;
4249 last = right = mctx->nbkref_ents;
4250 for (left = 0; left < right;)
4251 {
4252 mid = (left + right) / 2;
4253 if (mctx->bkref_ents[mid].str_idx < str_idx)
4254 left = mid + 1;
4255 else
4256 right = mid;
4257 }
4258 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4259 return left;
4260 else
4261 return -1;
4262}
4263
4264/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4265 at STR_IDX. */
4266
4267static reg_errcode_t
4268internal_function
4269match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4270{
4271#ifdef DEBUG
4272 assert (mctx->sub_tops != NULL);
4273 assert (mctx->asub_tops > 0);
4274#endif
4275 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4276 {
4277 int new_asub_tops = mctx->asub_tops * 2;
4278 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4279 re_sub_match_top_t *,
4280 new_asub_tops);
4281 if (BE (new_array == NULL, 0))
4282 return REG_ESPACE;
4283 mctx->sub_tops = new_array;
4284 mctx->asub_tops = new_asub_tops;
4285 }
4286 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4287 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4288 return REG_ESPACE;
4289 mctx->sub_tops[mctx->nsub_tops]->node = node;
4290 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4291 return REG_NOERROR;
4292}
4293
4294/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4295 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4296
4297static re_sub_match_last_t *
4298internal_function
4299match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4300{
4301 re_sub_match_last_t *new_entry;
4302 if (BE (subtop->nlasts == subtop->alasts, 0))
4303 {
4304 int new_alasts = 2 * subtop->alasts + 1;
4305 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4306 re_sub_match_last_t *,
4307 new_alasts);
4308 if (BE (new_array == NULL, 0))
4309 return NULL;
4310 subtop->lasts = new_array;
4311 subtop->alasts = new_alasts;
4312 }
4313 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4314 if (BE (new_entry != NULL, 1))
4315 {
4316 subtop->lasts[subtop->nlasts] = new_entry;
4317 new_entry->node = node;
4318 new_entry->str_idx = str_idx;
4319 ++subtop->nlasts;
4320 }
4321 return new_entry;
4322}
4323
4324static void
4325internal_function
4326sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4327 re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4328{
4329 sctx->sifted_states = sifted_sts;
4330 sctx->limited_states = limited_sts;
4331 sctx->last_node = last_node;
4332 sctx->last_str_idx = last_str_idx;
4333 re_node_set_init_empty (&sctx->limits);
4334}
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette