VirtualBox

source: kBuild/vendor/grep/current/lib/dfa.c@ 3530

Last change on this file since 3530 was 3529, checked in by bird, 3 years ago

Imported grep 3.7 from grep-3.7.tar.gz (sha256: c22b0cf2d4f6bbe599c902387e8058990e1eee99aef333a203829e5fd3dbb342), applying minimal auto-props.

  • Property svn:eol-style set to native
File size: 133.3 KB
Line 
1/* dfa.c - deterministic extended regexp routines for GNU
2 Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2021 Free Software
3 Foundation, Inc.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc.,
18 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */
19
20/* Written June, 1988 by Mike Haertel
21 Modified July, 1988 by Arthur David Olson to assist BMG speedups */
22
23#include <config.h>
24
25#include "dfa.h"
26
27#include "flexmember.h"
28#include "idx.h"
29#include "verify.h"
30
31#include <assert.h>
32#include <ctype.h>
33#include <stdint.h>
34#include <stdio.h>
35#include <stdlib.h>
36#include <limits.h>
37#include <string.h>
38
39/* Pacify gcc -Wanalyzer-null-dereference in areas where GCC
40 understandably cannot deduce that the input comes from a
41 well-formed regular expression. There's little point to the
42 runtime overhead of 'assert' instead of 'assume_nonnull' when the
43 MMU will check anyway. */
44#define assume_nonnull(x) assume ((x) != NULL)
45
46static bool
47streq (char const *a, char const *b)
48{
49 return strcmp (a, b) == 0;
50}
51
52static bool
53isasciidigit (char c)
54{
55 return '0' <= c && c <= '9';
56}
57
58#include "gettext.h"
59#define _(str) gettext (str)
60
61#include <wchar.h>
62
63#include "xalloc.h"
64#include "localeinfo.h"
65
66#ifndef FALLTHROUGH
67# if 201710L < __STDC_VERSION__
68# define FALLTHROUGH [[__fallthrough__]]
69# elif (__GNUC__ >= 7) || (__clang_major__ >= 10)
70# define FALLTHROUGH __attribute__ ((__fallthrough__))
71# else
72# define FALLTHROUGH ((void) 0)
73# endif
74#endif
75
76#ifndef MIN
77# define MIN(a,b) ((a) < (b) ? (a) : (b))
78#endif
79
80/* HPUX defines these as macros in sys/param.h. */
81#ifdef setbit
82# undef setbit
83#endif
84#ifdef clrbit
85# undef clrbit
86#endif
87
88/* For code that does not use Gnulib’s isblank module. */
89#if !defined isblank && !defined HAVE_ISBLANK && !defined GNULIB_ISBLANK
90# define isblank dfa_isblank
91static int
92isblank (int c)
93{
94 return c == ' ' || c == '\t';
95}
96#endif
97
98/* First integer value that is greater than any character code. */
99enum { NOTCHAR = 1 << CHAR_BIT };
100
101#ifdef UINT_LEAST64_MAX
102
103/* Number of bits used in a charclass word. */
104enum { CHARCLASS_WORD_BITS = 64 };
105
106/* This represents part of a character class. It must be unsigned and
107 at least CHARCLASS_WORD_BITS wide. Any excess bits are zero. */
108typedef uint_least64_t charclass_word;
109
110/* Part of a charclass initializer that represents 64 bits' worth of a
111 charclass, where LO and HI are the low and high-order 32 bits of
112 the 64-bit quantity. */
113# define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
114
115#else
116/* Fallbacks for pre-C99 hosts that lack 64-bit integers. */
117enum { CHARCLASS_WORD_BITS = 32 };
118typedef unsigned long charclass_word;
119# define CHARCLASS_PAIR(lo, hi) lo, hi
120#endif
121
122/* An initializer for a charclass whose 32-bit words are A through H. */
123#define CHARCLASS_INIT(a, b, c, d, e, f, g, h) \
124 {{ \
125 CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d), \
126 CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h) \
127 }}
128
129/* The maximum useful value of a charclass_word; all used bits are 1. */
130static charclass_word const CHARCLASS_WORD_MASK
131 = ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1;
132
133/* Number of words required to hold a bit for every character. */
134enum
135{
136 CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
137};
138
139/* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
140typedef struct { charclass_word w[CHARCLASS_WORDS]; } charclass;
141
142/* Convert a possibly-signed character to an unsigned character. This is
143 a bit safer than casting to unsigned char, since it catches some type
144 errors that the cast doesn't. */
145static unsigned char
146to_uchar (char ch)
147{
148 return ch;
149}
150
151/* Contexts tell us whether a character is a newline or a word constituent.
152 Word-constituent characters are those that satisfy iswalnum, plus '_'.
153 Each character has a single CTX_* value; bitmasks of CTX_* values denote
154 a particular character class.
155
156 A state also stores a context value, which is a bitmask of CTX_* values.
157 A state's context represents a set of characters that the state's
158 predecessors must match. For example, a state whose context does not
159 include CTX_LETTER will never have transitions where the previous
160 character is a word constituent. A state whose context is CTX_ANY
161 might have transitions from any character. */
162
163enum
164 {
165 CTX_NONE = 1,
166 CTX_LETTER = 2,
167 CTX_NEWLINE = 4,
168 CTX_ANY = 7
169 };
170
171/* Sometimes characters can only be matched depending on the surrounding
172 context. Such context decisions depend on what the previous character
173 was, and the value of the current (lookahead) character. Context
174 dependent constraints are encoded as 9-bit integers. Each bit that
175 is set indicates that the constraint succeeds in the corresponding
176 context.
177
178 bit 6-8 - valid contexts when next character is CTX_NEWLINE
179 bit 3-5 - valid contexts when next character is CTX_LETTER
180 bit 0-2 - valid contexts when next character is CTX_NONE
181
182 succeeds_in_context determines whether a given constraint
183 succeeds in a particular context. Prev is a bitmask of possible
184 context values for the previous character, curr is the (single-bit)
185 context value for the lookahead character. */
186static int
187newline_constraint (int constraint)
188{
189 return (constraint >> 6) & 7;
190}
191static int
192letter_constraint (int constraint)
193{
194 return (constraint >> 3) & 7;
195}
196static int
197other_constraint (int constraint)
198{
199 return constraint & 7;
200}
201
202static bool
203succeeds_in_context (int constraint, int prev, int curr)
204{
205 return !! (((curr & CTX_NONE ? other_constraint (constraint) : 0) \
206 | (curr & CTX_LETTER ? letter_constraint (constraint) : 0) \
207 | (curr & CTX_NEWLINE ? newline_constraint (constraint) : 0)) \
208 & prev);
209}
210
211/* The following describe what a constraint depends on. */
212static bool
213prev_newline_dependent (int constraint)
214{
215 return ((constraint ^ constraint >> 2) & 0111) != 0;
216}
217static bool
218prev_letter_dependent (int constraint)
219{
220 return ((constraint ^ constraint >> 1) & 0111) != 0;
221}
222
223/* Tokens that match the empty string subject to some constraint actually
224 work by applying that constraint to determine what may follow them,
225 taking into account what has gone before. The following values are
226 the constraints corresponding to the special tokens previously defined. */
227enum
228 {
229 NO_CONSTRAINT = 0777,
230 BEGLINE_CONSTRAINT = 0444,
231 ENDLINE_CONSTRAINT = 0700,
232 BEGWORD_CONSTRAINT = 0050,
233 ENDWORD_CONSTRAINT = 0202,
234 LIMWORD_CONSTRAINT = 0252,
235 NOTLIMWORD_CONSTRAINT = 0525
236 };
237
238/* The regexp is parsed into an array of tokens in postfix form. Some tokens
239 are operators and others are terminal symbols. Most (but not all) of these
240 codes are returned by the lexical analyzer. */
241
242typedef ptrdiff_t token;
243static token const TOKEN_MAX = PTRDIFF_MAX;
244
245/* States are indexed by state_num values. These are normally
246 nonnegative but -1 is used as a special value. */
247typedef ptrdiff_t state_num;
248
249/* Predefined token values. */
250enum
251{
252 END = -1, /* END is a terminal symbol that matches the
253 end of input; any value of END or less in
254 the parse tree is such a symbol. Accepting
255 states of the DFA are those that would have
256 a transition on END. This is -1, not some
257 more-negative value, to tweak the speed of
258 comparisons to END. */
259
260 /* Ordinary character values are terminal symbols that match themselves. */
261
262 /* CSET must come last in the following list of special tokens. Otherwise,
263 the list order matters only for performance. Related special tokens
264 should have nearby values so that code like (t == ANYCHAR || t == MBCSET
265 || CSET <= t) can be done with a single machine-level comparison. */
266
267 EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches
268 the empty string. */
269
270 QMARK, /* QMARK is an operator of one argument that
271 matches zero or one occurrences of its
272 argument. */
273
274 STAR, /* STAR is an operator of one argument that
275 matches the Kleene closure (zero or more
276 occurrences) of its argument. */
277
278 PLUS, /* PLUS is an operator of one argument that
279 matches the positive closure (one or more
280 occurrences) of its argument. */
281
282 REPMN, /* REPMN is a lexical token corresponding
283 to the {m,n} construct. REPMN never
284 appears in the compiled token vector. */
285
286 CAT, /* CAT is an operator of two arguments that
287 matches the concatenation of its
288 arguments. CAT is never returned by the
289 lexical analyzer. */
290
291 OR, /* OR is an operator of two arguments that
292 matches either of its arguments. */
293
294 LPAREN, /* LPAREN never appears in the parse tree,
295 it is only a lexeme. */
296
297 RPAREN, /* RPAREN never appears in the parse tree. */
298
299 WCHAR, /* Only returned by lex. wctok contains
300 the wide character representation. */
301
302 ANYCHAR, /* ANYCHAR is a terminal symbol that matches
303 a valid multibyte (or single byte) character.
304 It is used only if MB_CUR_MAX > 1. */
305
306 BEG, /* BEG is an initial symbol that matches the
307 beginning of input. */
308
309 BEGLINE, /* BEGLINE is a terminal symbol that matches
310 the empty string at the beginning of a
311 line. */
312
313 ENDLINE, /* ENDLINE is a terminal symbol that matches
314 the empty string at the end of a line. */
315
316 BEGWORD, /* BEGWORD is a terminal symbol that matches
317 the empty string at the beginning of a
318 word. */
319
320 ENDWORD, /* ENDWORD is a terminal symbol that matches
321 the empty string at the end of a word. */
322
323 LIMWORD, /* LIMWORD is a terminal symbol that matches
324 the empty string at the beginning or the
325 end of a word. */
326
327 NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that
328 matches the empty string not at
329 the beginning or end of a word. */
330
331 BACKREF, /* BACKREF is generated by \<digit>
332 or by any other construct that
333 is not completely handled. If the scanner
334 detects a transition on backref, it returns
335 a kind of "semi-success" indicating that
336 the match will have to be verified with
337 a backtracking matcher. */
338
339 MBCSET, /* MBCSET is similar to CSET, but for
340 multibyte characters. */
341
342 CSET /* CSET and (and any value greater) is a
343 terminal symbol that matches any of a
344 class of characters. */
345};
346
347
348/* States of the recognizer correspond to sets of positions in the parse
349 tree, together with the constraints under which they may be matched.
350 So a position is encoded as an index into the parse tree together with
351 a constraint. */
352typedef struct
353{
354 idx_t index; /* Index into the parse array. */
355 unsigned int constraint; /* Constraint for matching this position. */
356} position;
357
358/* Sets of positions are stored as arrays. */
359typedef struct
360{
361 position *elems; /* Elements of this position set. */
362 idx_t nelem; /* Number of elements in this set. */
363 idx_t alloc; /* Number of elements allocated in ELEMS. */
364} position_set;
365
366/* A state of the dfa consists of a set of positions, some flags,
367 and the token value of the lowest-numbered position of the state that
368 contains an END token. */
369typedef struct
370{
371 size_t hash; /* Hash of the positions of this state. */
372 position_set elems; /* Positions this state could match. */
373 unsigned char context; /* Context from previous state. */
374 unsigned short constraint; /* Constraint for this state to accept. */
375 position_set mbps; /* Positions which can match multibyte
376 characters or the follows, e.g., period.
377 Used only if MB_CUR_MAX > 1. */
378 state_num mb_trindex; /* Index of this state in MB_TRANS, or
379 negative if the state does not have
380 ANYCHAR. */
381} dfa_state;
382
383/* Maximum for any transition table count. This should be at least 3,
384 for the initial state setup. */
385enum { MAX_TRCOUNT = 1024 };
386
387/* A bracket operator.
388 e.g., [a-c], [[:alpha:]], etc. */
389struct mb_char_classes
390{
391 ptrdiff_t cset;
392 bool invert;
393 wchar_t *chars; /* Normal characters. */
394 idx_t nchars;
395 idx_t nchars_alloc;
396};
397
398struct regex_syntax
399{
400 /* Syntax bits controlling the behavior of the lexical analyzer. */
401 reg_syntax_t syntax_bits;
402 bool syntax_bits_set;
403
404 /* Flag for case-folding letters into sets. */
405 bool case_fold;
406
407 /* True if ^ and $ match only the start and end of data, and do not match
408 end-of-line within data. */
409 bool anchor;
410
411 /* End-of-line byte in data. */
412 unsigned char eolbyte;
413
414 /* Cache of char-context values. */
415 char sbit[NOTCHAR];
416
417 /* If never_trail[B], the byte B cannot be a non-initial byte in a
418 multibyte character. */
419 bool never_trail[NOTCHAR];
420
421 /* Set of characters considered letters. */
422 charclass letters;
423
424 /* Set of characters that are newline. */
425 charclass newline;
426};
427
428/* Lexical analyzer. All the dross that deals with the obnoxious
429 GNU Regex syntax bits is located here. The poor, suffering
430 reader is referred to the GNU Regex documentation for the
431 meaning of the @#%!@#%^!@ syntax bits. */
432struct lexer_state
433{
434 char const *ptr; /* Pointer to next input character. */
435 idx_t left; /* Number of characters remaining. */
436 token lasttok; /* Previous token returned; initially END. */
437 idx_t parens; /* Count of outstanding left parens. */
438 int minrep, maxrep; /* Repeat counts for {m,n}. */
439
440 /* Wide character representation of the current multibyte character,
441 or WEOF if there was an encoding error. Used only if
442 MB_CUR_MAX > 1. */
443 wint_t wctok;
444
445 /* The most recently analyzed multibyte bracket expression. */
446 struct mb_char_classes brack;
447
448 /* We're separated from beginning or (, | only by zero-width characters. */
449 bool laststart;
450};
451
452/* Recursive descent parser for regular expressions. */
453
454struct parser_state
455{
456 token tok; /* Lookahead token. */
457 idx_t depth; /* Current depth of a hypothetical stack
458 holding deferred productions. This is
459 used to determine the depth that will be
460 required of the real stack later on in
461 dfaanalyze. */
462};
463
464/* A compiled regular expression. */
465struct dfa
466{
467 /* Fields filled by the scanner. */
468 charclass *charclasses; /* Array of character sets for CSET tokens. */
469 idx_t cindex; /* Index for adding new charclasses. */
470 idx_t calloc; /* Number of charclasses allocated. */
471 ptrdiff_t canychar; /* Index of anychar class, or -1. */
472
473 /* Scanner state */
474 struct lexer_state lex;
475
476 /* Parser state */
477 struct parser_state parse;
478
479 /* Fields filled by the parser. */
480 token *tokens; /* Postfix parse array. */
481 idx_t tindex; /* Index for adding new tokens. */
482 idx_t talloc; /* Number of tokens currently allocated. */
483 idx_t depth; /* Depth required of an evaluation stack
484 used for depth-first traversal of the
485 parse tree. */
486 idx_t nleaves; /* Number of non-EMPTY leaves
487 in the parse tree. */
488 idx_t nregexps; /* Count of parallel regexps being built
489 with dfaparse. */
490 bool fast; /* The DFA is fast. */
491 bool epsilon; /* Does a token match only the empty string? */
492 token utf8_anychar_classes[9]; /* To lower ANYCHAR in UTF-8 locales. */
493 mbstate_t mbs; /* Multibyte conversion state. */
494
495 /* The following are valid only if MB_CUR_MAX > 1. */
496
497 /* The value of multibyte_prop[i] is defined by following rule.
498 if tokens[i] < NOTCHAR
499 bit 0 : tokens[i] is the first byte of a character, including
500 single-byte characters.
501 bit 1 : tokens[i] is the last byte of a character, including
502 single-byte characters.
503
504 e.g.
505 tokens
506 = 'single_byte_a', 'multi_byte_A', single_byte_b'
507 = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
508 multibyte_prop
509 = 3 , 1 , 0 , 2 , 3
510 */
511 char *multibyte_prop;
512
513 /* Fields filled by the superset. */
514 struct dfa *superset; /* Hint of the dfa. */
515
516 /* Fields filled by the state builder. */
517 dfa_state *states; /* States of the dfa. */
518 state_num sindex; /* Index for adding new states. */
519 idx_t salloc; /* Number of states currently allocated. */
520
521 /* Fields filled by the parse tree->NFA conversion. */
522 position_set *follows; /* Array of follow sets, indexed by position
523 index. The follow of a position is the set
524 of positions containing characters that
525 could conceivably follow a character
526 matching the given position in a string
527 matching the regexp. Allocated to the
528 maximum possible position index. */
529 bool searchflag; /* We are supposed to build a searching
530 as opposed to an exact matcher. A searching
531 matcher finds the first and shortest string
532 matching a regexp anywhere in the buffer,
533 whereas an exact matcher finds the longest
534 string matching, but anchored to the
535 beginning of the buffer. */
536
537 /* Fields filled by dfaanalyze. */
538 int *constraints; /* Array of union of accepting constraints
539 in the follow of a position. */
540 int *separates; /* Array of contexts on follow of a
541 position. */
542
543 /* Fields filled by dfaexec. */
544 state_num tralloc; /* Number of transition tables that have
545 slots so far, not counting trans[-1] and
546 trans[-2]. */
547 int trcount; /* Number of transition tables that have
548 been built, other than for initial
549 states. */
550 int min_trcount; /* Number of initial states. Equivalently,
551 the minimum state number for which trcount
552 counts transitions. */
553 state_num **trans; /* Transition tables for states that can
554 never accept. If the transitions for a
555 state have not yet been computed, or the
556 state could possibly accept, its entry in
557 this table is NULL. This points to two
558 past the start of the allocated array,
559 and trans[-1] and trans[-2] are always
560 NULL. */
561 state_num **fails; /* Transition tables after failing to accept
562 on a state that potentially could do so.
563 If trans[i] is non-null, fails[i] must
564 be null. */
565 char *success; /* Table of acceptance conditions used in
566 dfaexec and computed in build_state. */
567 state_num *newlines; /* Transitions on newlines. The entry for a
568 newline in any transition table is always
569 -1 so we can count lines without wasting
570 too many cycles. The transition for a
571 newline is stored separately and handled
572 as a special case. Newline is also used
573 as a sentinel at the end of the buffer. */
574 state_num initstate_notbol; /* Initial state for CTX_LETTER and CTX_NONE
575 context in multibyte locales, in which we
576 do not distinguish between their contexts,
577 as not supported word. */
578 position_set mb_follows; /* Follow set added by ANYCHAR on demand. */
579 state_num **mb_trans; /* Transition tables for states with
580 ANYCHAR. */
581 state_num mb_trcount; /* Number of transition tables for states with
582 ANYCHAR that have actually been built. */
583
584 /* Syntax configuration. This is near the end so that dfacopysyntax
585 can memset up to here. */
586 struct regex_syntax syntax;
587
588 /* Information derived from the locale. This is at the end so that
589 a quick memset need not clear it specially. */
590
591 /* dfaexec implementation. */
592 char *(*dfaexec) (struct dfa *, char const *, char *,
593 bool, ptrdiff_t *, bool *);
594
595 /* Other cached information derived from the locale. */
596 struct localeinfo localeinfo;
597};
598
599/* User access to dfa internals. */
600
601/* S could possibly be an accepting state of R. */
602static bool
603accepting (state_num s, struct dfa const *r)
604{
605 return r->states[s].constraint != 0;
606}
607
608/* STATE accepts in the specified context. */
609static bool
610accepts_in_context (int prev, int curr, state_num state, struct dfa const *dfa)
611{
612 return succeeds_in_context (dfa->states[state].constraint, prev, curr);
613}
614
615static void regexp (struct dfa *dfa);
616
617/* Store into *PWC the result of converting the leading bytes of the
618 multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
619 and updating the conversion state in *D. On conversion error,
620 convert just a single byte, to WEOF. Return the number of bytes
621 converted.
622
623 This differs from mbrtowc (PWC, S, N, &D->mbs) as follows:
624
625 * PWC points to wint_t, not to wchar_t.
626 * The last arg is a dfa *D instead of merely a multibyte conversion
627 state D->mbs.
628 * N is idx_t not size_t, and must be at least 1.
629 * S[N - 1] must be a sentinel byte.
630 * Shift encodings are not supported.
631 * The return value is always in the range 1..N.
632 * D->mbs is always valid afterwards.
633 * *PWC is always set to something. */
634static int
635mbs_to_wchar (wint_t *pwc, char const *s, idx_t n, struct dfa *d)
636{
637 unsigned char uc = s[0];
638 wint_t wc = d->localeinfo.sbctowc[uc];
639
640 if (wc == WEOF)
641 {
642 wchar_t wch;
643 size_t nbytes = mbrtowc (&wch, s, n, &d->mbs);
644 if (0 < nbytes && nbytes < (size_t) -2)
645 {
646 *pwc = wch;
647 return nbytes;
648 }
649 memset (&d->mbs, 0, sizeof d->mbs);
650 }
651
652 *pwc = wc;
653 return 1;
654}
655
656#ifdef DEBUG
657
658static void
659prtok (token t)
660{
661 if (t <= END)
662 fprintf (stderr, "END");
663 else if (0 <= t && t < NOTCHAR)
664 {
665 unsigned int ch = t;
666 fprintf (stderr, "0x%02x", ch);
667 }
668 else
669 {
670 char const *s;
671 switch (t)
672 {
673 case BEG:
674 s = "BEG";
675 break;
676 case EMPTY:
677 s = "EMPTY";
678 break;
679 case BACKREF:
680 s = "BACKREF";
681 break;
682 case BEGLINE:
683 s = "BEGLINE";
684 break;
685 case ENDLINE:
686 s = "ENDLINE";
687 break;
688 case BEGWORD:
689 s = "BEGWORD";
690 break;
691 case ENDWORD:
692 s = "ENDWORD";
693 break;
694 case LIMWORD:
695 s = "LIMWORD";
696 break;
697 case NOTLIMWORD:
698 s = "NOTLIMWORD";
699 break;
700 case QMARK:
701 s = "QMARK";
702 break;
703 case STAR:
704 s = "STAR";
705 break;
706 case PLUS:
707 s = "PLUS";
708 break;
709 case CAT:
710 s = "CAT";
711 break;
712 case OR:
713 s = "OR";
714 break;
715 case LPAREN:
716 s = "LPAREN";
717 break;
718 case RPAREN:
719 s = "RPAREN";
720 break;
721 case ANYCHAR:
722 s = "ANYCHAR";
723 break;
724 case MBCSET:
725 s = "MBCSET";
726 break;
727 default:
728 s = "CSET";
729 break;
730 }
731 fprintf (stderr, "%s", s);
732 }
733}
734#endif /* DEBUG */
735
736/* Stuff pertaining to charclasses. */
737
738static bool
739tstbit (unsigned int b, charclass const *c)
740{
741 return c->w[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
742}
743
744static void
745setbit (unsigned int b, charclass *c)
746{
747 charclass_word one = 1;
748 c->w[b / CHARCLASS_WORD_BITS] |= one << b % CHARCLASS_WORD_BITS;
749}
750
751static void
752clrbit (unsigned int b, charclass *c)
753{
754 charclass_word one = 1;
755 c->w[b / CHARCLASS_WORD_BITS] &= ~(one << b % CHARCLASS_WORD_BITS);
756}
757
758static void
759zeroset (charclass *s)
760{
761 memset (s, 0, sizeof *s);
762}
763
764static void
765fillset (charclass *s)
766{
767 for (int i = 0; i < CHARCLASS_WORDS; i++)
768 s->w[i] = CHARCLASS_WORD_MASK;
769}
770
771static void
772notset (charclass *s)
773{
774 for (int i = 0; i < CHARCLASS_WORDS; ++i)
775 s->w[i] = CHARCLASS_WORD_MASK & ~s->w[i];
776}
777
778static bool
779equal (charclass const *s1, charclass const *s2)
780{
781 charclass_word w = 0;
782 for (int i = 0; i < CHARCLASS_WORDS; i++)
783 w |= s1->w[i] ^ s2->w[i];
784 return w == 0;
785}
786
787static bool
788emptyset (charclass const *s)
789{
790 charclass_word w = 0;
791 for (int i = 0; i < CHARCLASS_WORDS; i++)
792 w |= s->w[i];
793 return w == 0;
794}
795
796/* Ensure that the array addressed by PA holds at least I + 1 items.
797 Either return PA, or reallocate the array and return its new address.
798 Although PA may be null, the returned value is never null.
799
800 The array holds *NITEMS items, where 0 <= I <= *NITEMS; *NITEMS
801 is updated on reallocation. If PA is null, *NITEMS must be zero.
802 Do not allocate more than NITEMS_MAX items total; -1 means no limit.
803 ITEM_SIZE is the size of one item; it must be positive.
804 Avoid O(N**2) behavior on arrays growing linearly. */
805static void *
806maybe_realloc (void *pa, idx_t i, idx_t *nitems,
807 ptrdiff_t nitems_max, idx_t item_size)
808{
809 if (i < *nitems)
810 return pa;
811 return xpalloc (pa, nitems, 1, nitems_max, item_size);
812}
813
814/* In DFA D, find the index of charclass S, or allocate a new one. */
815static idx_t
816charclass_index (struct dfa *d, charclass const *s)
817{
818 idx_t i;
819
820 for (i = 0; i < d->cindex; ++i)
821 if (equal (s, &d->charclasses[i]))
822 return i;
823 d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
824 TOKEN_MAX - CSET, sizeof *d->charclasses);
825 ++d->cindex;
826 d->charclasses[i] = *s;
827 return i;
828}
829
830static bool
831unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
832{
833 return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_');
834}
835
836static int
837char_context (struct dfa const *dfa, unsigned char c)
838{
839 if (c == dfa->syntax.eolbyte && !dfa->syntax.anchor)
840 return CTX_NEWLINE;
841 if (unibyte_word_constituent (dfa, c))
842 return CTX_LETTER;
843 return CTX_NONE;
844}
845
846/* Set a bit in the charclass for the given wchar_t. Do nothing if WC
847 is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1,
848 this may happen when folding case in weird Turkish locales where
849 dotless i/dotted I are not included in the chosen character set.
850 Return whether a bit was set in the charclass. */
851static bool
852setbit_wc (wint_t wc, charclass *c)
853{
854 int b = wctob (wc);
855 if (b < 0)
856 return false;
857
858 setbit (b, c);
859 return true;
860}
861
862/* Set a bit for B and its case variants in the charclass C.
863 MB_CUR_MAX must be 1. */
864static void
865setbit_case_fold_c (int b, charclass *c)
866{
867 int ub = toupper (b);
868 for (int i = 0; i < NOTCHAR; i++)
869 if (toupper (i) == ub)
870 setbit (i, c);
871}
872
873/* Fetch the next lexical input character from the pattern. There
874 must at least one byte of pattern input. Set DFA->lex.wctok to the
875 value of the character or to WEOF depending on whether the input is
876 a valid multibyte character (possibly of length 1). Then return
877 the next input byte value, except return EOF if the input is a
878 multibyte character of length greater than 1. */
879static int
880fetch_wc (struct dfa *dfa)
881{
882 int nbytes = mbs_to_wchar (&dfa->lex.wctok, dfa->lex.ptr, dfa->lex.left,
883 dfa);
884 int c = nbytes == 1 ? to_uchar (dfa->lex.ptr[0]) : EOF;
885 dfa->lex.ptr += nbytes;
886 dfa->lex.left -= nbytes;
887 return c;
888}
889
890/* If there is no more input, report an error about unbalanced brackets.
891 Otherwise, behave as with fetch_wc (DFA). */
892static int
893bracket_fetch_wc (struct dfa *dfa)
894{
895 if (! dfa->lex.left)
896 dfaerror (_("unbalanced ["));
897 return fetch_wc (dfa);
898}
899
900typedef int predicate (int);
901
902/* The following list maps the names of the Posix named character classes
903 to predicate functions that determine whether a given character is in
904 the class. The leading [ has already been eaten by the lexical
905 analyzer. */
906struct dfa_ctype
907{
908 const char *name;
909 predicate *func;
910 bool single_byte_only;
911};
912
913static const struct dfa_ctype prednames[] = {
914 {"alpha", isalpha, false},
915 {"upper", isupper, false},
916 {"lower", islower, false},
917 {"digit", isdigit, true},
918 {"xdigit", isxdigit, false},
919 {"space", isspace, false},
920 {"punct", ispunct, false},
921 {"alnum", isalnum, false},
922 {"print", isprint, false},
923 {"graph", isgraph, false},
924 {"cntrl", iscntrl, false},
925 {"blank", isblank, false},
926 {NULL, NULL, false}
927};
928
929static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
930find_pred (const char *str)
931{
932 for (int i = 0; prednames[i].name; i++)
933 if (streq (str, prednames[i].name))
934 return &prednames[i];
935 return NULL;
936}
937
938/* Parse a bracket expression, which possibly includes multibyte
939 characters. */
940static token
941parse_bracket_exp (struct dfa *dfa)
942{
943 /* This is a bracket expression that dfaexec is known to
944 process correctly. */
945 bool known_bracket_exp = true;
946
947 /* Used to warn about [:space:].
948 Bit 0 = first character is a colon.
949 Bit 1 = last character is a colon.
950 Bit 2 = includes any other character but a colon.
951 Bit 3 = includes ranges, char/equiv classes or collation elements. */
952 int colon_warning_state;
953
954 dfa->lex.brack.nchars = 0;
955 charclass ccl;
956 zeroset (&ccl);
957 int c = bracket_fetch_wc (dfa);
958 bool invert = c == '^';
959 if (invert)
960 {
961 c = bracket_fetch_wc (dfa);
962 known_bracket_exp = dfa->localeinfo.simple;
963 }
964 wint_t wc = dfa->lex.wctok;
965 int c1;
966 wint_t wc1;
967 colon_warning_state = (c == ':');
968 do
969 {
970 c1 = NOTCHAR; /* Mark c1 as not initialized. */
971 colon_warning_state &= ~2;
972
973 /* Note that if we're looking at some other [:...:] construct,
974 we just treat it as a bunch of ordinary characters. We can do
975 this because we assume regex has checked for syntax errors before
976 dfa is ever called. */
977 if (c == '[')
978 {
979 c1 = bracket_fetch_wc (dfa);
980 wc1 = dfa->lex.wctok;
981
982 if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES))
983 || c1 == '.' || c1 == '=')
984 {
985 enum { MAX_BRACKET_STRING_LEN = 32 };
986 char str[MAX_BRACKET_STRING_LEN + 1];
987 int len = 0;
988 for (;;)
989 {
990 c = bracket_fetch_wc (dfa);
991 if (dfa->lex.left == 0
992 || (c == c1 && dfa->lex.ptr[0] == ']'))
993 break;
994 if (len < MAX_BRACKET_STRING_LEN)
995 str[len++] = c;
996 else
997 /* This is in any case an invalid class name. */
998 str[0] = '\0';
999 }
1000 str[len] = '\0';
1001
1002 /* Fetch bracket. */
1003 c = bracket_fetch_wc (dfa);
1004 wc = dfa->lex.wctok;
1005 if (c1 == ':')
1006 /* Build character class. POSIX allows character
1007 classes to match multicharacter collating elements,
1008 but the regex code does not support that, so do not
1009 worry about that possibility. */
1010 {
1011 char const *class
1012 = (dfa->syntax.case_fold && (streq (str, "upper")
1013 || streq (str, "lower"))
1014 ? "alpha" : str);
1015 const struct dfa_ctype *pred = find_pred (class);
1016 if (!pred)
1017 dfaerror (_("invalid character class"));
1018
1019 if (dfa->localeinfo.multibyte && !pred->single_byte_only)
1020 known_bracket_exp = false;
1021 else
1022 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1023 if (pred->func (c2))
1024 setbit (c2, &ccl);
1025 }
1026 else
1027 known_bracket_exp = false;
1028
1029 colon_warning_state |= 8;
1030
1031 /* Fetch new lookahead character. */
1032 c1 = bracket_fetch_wc (dfa);
1033 wc1 = dfa->lex.wctok;
1034 continue;
1035 }
1036
1037 /* We treat '[' as a normal character here. c/c1/wc/wc1
1038 are already set up. */
1039 }
1040
1041 if (c == '\\'
1042 && (dfa->syntax.syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1043 {
1044 c = bracket_fetch_wc (dfa);
1045 wc = dfa->lex.wctok;
1046 }
1047
1048 if (c1 == NOTCHAR)
1049 {
1050 c1 = bracket_fetch_wc (dfa);
1051 wc1 = dfa->lex.wctok;
1052 }
1053
1054 if (c1 == '-')
1055 /* build range characters. */
1056 {
1057 int c2 = bracket_fetch_wc (dfa);
1058 wint_t wc2 = dfa->lex.wctok;
1059
1060 /* A bracket expression like [a-[.aa.]] matches an unknown set.
1061 Treat it like [-a[.aa.]] while parsing it, and
1062 remember that the set is unknown. */
1063 if (c2 == '[' && dfa->lex.ptr[0] == '.')
1064 {
1065 known_bracket_exp = false;
1066 c2 = ']';
1067 }
1068
1069 if (c2 == ']')
1070 {
1071 /* In the case [x-], the - is an ordinary hyphen,
1072 which is left in c1, the lookahead character. */
1073 dfa->lex.ptr--;
1074 dfa->lex.left++;
1075 }
1076 else
1077 {
1078 if (c2 == '\\' && (dfa->syntax.syntax_bits
1079 & RE_BACKSLASH_ESCAPE_IN_LISTS))
1080 {
1081 c2 = bracket_fetch_wc (dfa);
1082 wc2 = dfa->lex.wctok;
1083 }
1084
1085 colon_warning_state |= 8;
1086 c1 = bracket_fetch_wc (dfa);
1087 wc1 = dfa->lex.wctok;
1088
1089 /* Treat [x-y] as a range if x != y. */
1090 if (wc != wc2 || wc == WEOF)
1091 {
1092 if (dfa->localeinfo.simple
1093 || (isasciidigit (c) & isasciidigit (c2)))
1094 {
1095 for (int ci = c; ci <= c2; ci++)
1096 if (dfa->syntax.case_fold && isalpha (ci))
1097 setbit_case_fold_c (ci, &ccl);
1098 else
1099 setbit (ci, &ccl);
1100 }
1101 else
1102 known_bracket_exp = false;
1103
1104 continue;
1105 }
1106 }
1107 }
1108
1109 colon_warning_state |= (c == ':') ? 2 : 4;
1110
1111 if (!dfa->localeinfo.multibyte)
1112 {
1113 if (dfa->syntax.case_fold && isalpha (c))
1114 setbit_case_fold_c (c, &ccl);
1115 else
1116 setbit (c, &ccl);
1117 continue;
1118 }
1119
1120 if (wc == WEOF)
1121 known_bracket_exp = false;
1122 else
1123 {
1124 wchar_t folded[CASE_FOLDED_BUFSIZE + 1];
1125 int n = (dfa->syntax.case_fold
1126 ? case_folded_counterparts (wc, folded + 1) + 1
1127 : 1);
1128 folded[0] = wc;
1129 for (int i = 0; i < n; i++)
1130 if (!setbit_wc (folded[i], &ccl))
1131 {
1132 dfa->lex.brack.chars
1133 = maybe_realloc (dfa->lex.brack.chars, dfa->lex.brack.nchars,
1134 &dfa->lex.brack.nchars_alloc, -1,
1135 sizeof *dfa->lex.brack.chars);
1136 dfa->lex.brack.chars[dfa->lex.brack.nchars++] = folded[i];
1137 }
1138 }
1139 }
1140 while ((wc = wc1, (c = c1) != ']'));
1141
1142 if (colon_warning_state == 7)
1143 dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
1144
1145 if (! known_bracket_exp)
1146 return BACKREF;
1147
1148 if (dfa->localeinfo.multibyte && (invert || dfa->lex.brack.nchars != 0))
1149 {
1150 dfa->lex.brack.invert = invert;
1151 dfa->lex.brack.cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
1152 return MBCSET;
1153 }
1154
1155 if (invert)
1156 {
1157 notset (&ccl);
1158 if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1159 clrbit ('\n', &ccl);
1160 }
1161
1162 return CSET + charclass_index (dfa, &ccl);
1163}
1164
1165struct lexptr
1166{
1167 char const *ptr;
1168 idx_t left;
1169};
1170
1171static void
1172push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s)
1173{
1174 ls->ptr = dfa->lex.ptr;
1175 ls->left = dfa->lex.left;
1176 dfa->lex.ptr = s;
1177 dfa->lex.left = strlen (s);
1178}
1179
1180static void
1181pop_lex_state (struct dfa *dfa, struct lexptr const *ls)
1182{
1183 dfa->lex.ptr = ls->ptr;
1184 dfa->lex.left = ls->left;
1185}
1186
1187static token
1188lex (struct dfa *dfa)
1189{
1190 bool backslash = false;
1191
1192 /* Basic plan: We fetch a character. If it's a backslash,
1193 we set the backslash flag and go through the loop again.
1194 On the plus side, this avoids having a duplicate of the
1195 main switch inside the backslash case. On the minus side,
1196 it means that just about every case begins with
1197 "if (backslash) ...". */
1198 for (int i = 0; i < 2; ++i)
1199 {
1200 if (! dfa->lex.left)
1201 return dfa->lex.lasttok = END;
1202 int c = fetch_wc (dfa);
1203
1204 switch (c)
1205 {
1206 case '\\':
1207 if (backslash)
1208 goto normal_char;
1209 if (dfa->lex.left == 0)
1210 dfaerror (_("unfinished \\ escape"));
1211 backslash = true;
1212 break;
1213
1214 case '^':
1215 if (backslash)
1216 goto normal_char;
1217 if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1218 || dfa->lex.lasttok == END || dfa->lex.lasttok == LPAREN
1219 || dfa->lex.lasttok == OR)
1220 return dfa->lex.lasttok = BEGLINE;
1221 goto normal_char;
1222
1223 case '$':
1224 if (backslash)
1225 goto normal_char;
1226 if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1227 || dfa->lex.left == 0
1228 || ((dfa->lex.left
1229 > !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
1230 && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
1231 & (dfa->lex.ptr[0] == '\\')]
1232 == ')'))
1233 || ((dfa->lex.left
1234 > !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
1235 && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
1236 & (dfa->lex.ptr[0] == '\\')]
1237 == '|'))
1238 || ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
1239 && dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
1240 return dfa->lex.lasttok = ENDLINE;
1241 goto normal_char;
1242
1243 case '1':
1244 case '2':
1245 case '3':
1246 case '4':
1247 case '5':
1248 case '6':
1249 case '7':
1250 case '8':
1251 case '9':
1252 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_BK_REFS))
1253 {
1254 dfa->lex.laststart = false;
1255 return dfa->lex.lasttok = BACKREF;
1256 }
1257 goto normal_char;
1258
1259 case '`':
1260 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1261 {
1262 /* FIXME: should be beginning of string */
1263 return dfa->lex.lasttok = BEGLINE;
1264 }
1265 goto normal_char;
1266
1267 case '\'':
1268 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1269 {
1270 /* FIXME: should be end of string */
1271 return dfa->lex.lasttok = ENDLINE;
1272 }
1273 goto normal_char;
1274
1275 case '<':
1276 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1277 return dfa->lex.lasttok = BEGWORD;
1278 goto normal_char;
1279
1280 case '>':
1281 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1282 return dfa->lex.lasttok = ENDWORD;
1283 goto normal_char;
1284
1285 case 'b':
1286 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1287 return dfa->lex.lasttok = LIMWORD;
1288 goto normal_char;
1289
1290 case 'B':
1291 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1292 return dfa->lex.lasttok = NOTLIMWORD;
1293 goto normal_char;
1294
1295 case '?':
1296 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1297 goto normal_char;
1298 if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1299 goto normal_char;
1300 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1301 && dfa->lex.laststart)
1302 goto normal_char;
1303 return dfa->lex.lasttok = QMARK;
1304
1305 case '*':
1306 if (backslash)
1307 goto normal_char;
1308 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1309 && dfa->lex.laststart)
1310 goto normal_char;
1311 return dfa->lex.lasttok = STAR;
1312
1313 case '+':
1314 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1315 goto normal_char;
1316 if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1317 goto normal_char;
1318 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1319 && dfa->lex.laststart)
1320 goto normal_char;
1321 return dfa->lex.lasttok = PLUS;
1322
1323 case '{':
1324 if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
1325 goto normal_char;
1326 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
1327 goto normal_char;
1328 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1329 && dfa->lex.laststart)
1330 goto normal_char;
1331
1332 /* Cases:
1333 {M} - exact count
1334 {M,} - minimum count, maximum is infinity
1335 {,N} - 0 through N
1336 {,} - 0 to infinity (same as '*')
1337 {M,N} - M through N */
1338 {
1339 char const *p = dfa->lex.ptr;
1340 char const *lim = p + dfa->lex.left;
1341 dfa->lex.minrep = dfa->lex.maxrep = -1;
1342 for (; p != lim && isasciidigit (*p); p++)
1343 dfa->lex.minrep = (dfa->lex.minrep < 0
1344 ? *p - '0'
1345 : MIN (RE_DUP_MAX + 1,
1346 dfa->lex.minrep * 10 + *p - '0'));
1347 if (p != lim)
1348 {
1349 if (*p != ',')
1350 dfa->lex.maxrep = dfa->lex.minrep;
1351 else
1352 {
1353 if (dfa->lex.minrep < 0)
1354 dfa->lex.minrep = 0;
1355 while (++p != lim && isasciidigit (*p))
1356 dfa->lex.maxrep
1357 = (dfa->lex.maxrep < 0
1358 ? *p - '0'
1359 : MIN (RE_DUP_MAX + 1,
1360 dfa->lex.maxrep * 10 + *p - '0'));
1361 }
1362 }
1363 if (! ((! backslash || (p != lim && *p++ == '\\'))
1364 && p != lim && *p++ == '}'
1365 && 0 <= dfa->lex.minrep
1366 && (dfa->lex.maxrep < 0
1367 || dfa->lex.minrep <= dfa->lex.maxrep)))
1368 {
1369 if (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD)
1370 goto normal_char;
1371 dfaerror (_("invalid content of \\{\\}"));
1372 }
1373 if (RE_DUP_MAX < dfa->lex.maxrep)
1374 dfaerror (_("regular expression too big"));
1375 dfa->lex.ptr = p;
1376 dfa->lex.left = lim - p;
1377 }
1378 dfa->lex.laststart = false;
1379 return dfa->lex.lasttok = REPMN;
1380
1381 case '|':
1382 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1383 goto normal_char;
1384 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
1385 goto normal_char;
1386 dfa->lex.laststart = true;
1387 return dfa->lex.lasttok = OR;
1388
1389 case '\n':
1390 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS
1391 || backslash || !(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
1392 goto normal_char;
1393 dfa->lex.laststart = true;
1394 return dfa->lex.lasttok = OR;
1395
1396 case '(':
1397 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1398 goto normal_char;
1399 dfa->lex.parens++;
1400 dfa->lex.laststart = true;
1401 return dfa->lex.lasttok = LPAREN;
1402
1403 case ')':
1404 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1405 goto normal_char;
1406 if (dfa->lex.parens == 0
1407 && dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1408 goto normal_char;
1409 dfa->lex.parens--;
1410 dfa->lex.laststart = false;
1411 return dfa->lex.lasttok = RPAREN;
1412
1413 case '.':
1414 if (backslash)
1415 goto normal_char;
1416 if (dfa->canychar < 0)
1417 {
1418 charclass ccl;
1419 fillset (&ccl);
1420 if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1421 clrbit ('\n', &ccl);
1422 if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1423 clrbit ('\0', &ccl);
1424 if (dfa->localeinfo.multibyte)
1425 for (int c2 = 0; c2 < NOTCHAR; c2++)
1426 if (dfa->localeinfo.sbctowc[c2] == WEOF)
1427 clrbit (c2, &ccl);
1428 dfa->canychar = charclass_index (dfa, &ccl);
1429 }
1430 dfa->lex.laststart = false;
1431 return dfa->lex.lasttok = (dfa->localeinfo.multibyte
1432 ? ANYCHAR
1433 : CSET + dfa->canychar);
1434
1435 case 's':
1436 case 'S':
1437 if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1438 goto normal_char;
1439 if (!dfa->localeinfo.multibyte)
1440 {
1441 charclass ccl;
1442 zeroset (&ccl);
1443 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1444 if (isspace (c2))
1445 setbit (c2, &ccl);
1446 if (c == 'S')
1447 notset (&ccl);
1448 dfa->lex.laststart = false;
1449 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1450 }
1451
1452 /* FIXME: see if optimizing this, as is done with ANYCHAR and
1453 add_utf8_anychar, makes sense. */
1454
1455 /* \s and \S are documented to be equivalent to [[:space:]] and
1456 [^[:space:]] respectively, so tell the lexer to process those
1457 strings, each minus its "already processed" '['. */
1458 {
1459 struct lexptr ls;
1460 push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
1461 dfa->lex.lasttok = parse_bracket_exp (dfa);
1462 pop_lex_state (dfa, &ls);
1463 }
1464
1465 dfa->lex.laststart = false;
1466 return dfa->lex.lasttok;
1467
1468 case 'w':
1469 case 'W':
1470 if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1471 goto normal_char;
1472
1473 if (!dfa->localeinfo.multibyte)
1474 {
1475 charclass ccl;
1476 zeroset (&ccl);
1477 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1478 if (dfa->syntax.sbit[c2] == CTX_LETTER)
1479 setbit (c2, &ccl);
1480 if (c == 'W')
1481 notset (&ccl);
1482 dfa->lex.laststart = false;
1483 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1484 }
1485
1486 /* FIXME: see if optimizing this, as is done with ANYCHAR and
1487 add_utf8_anychar, makes sense. */
1488
1489 /* \w and \W are documented to be equivalent to [_[:alnum:]] and
1490 [^_[:alnum:]] respectively, so tell the lexer to process those
1491 strings, each minus its "already processed" '['. */
1492 {
1493 struct lexptr ls;
1494 push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
1495 dfa->lex.lasttok = parse_bracket_exp (dfa);
1496 pop_lex_state (dfa, &ls);
1497 }
1498
1499 dfa->lex.laststart = false;
1500 return dfa->lex.lasttok;
1501
1502 case '[':
1503 if (backslash)
1504 goto normal_char;
1505 dfa->lex.laststart = false;
1506 return dfa->lex.lasttok = parse_bracket_exp (dfa);
1507
1508 default:
1509 normal_char:
1510 dfa->lex.laststart = false;
1511 /* For multibyte character sets, folding is done in atom. Always
1512 return WCHAR. */
1513 if (dfa->localeinfo.multibyte)
1514 return dfa->lex.lasttok = WCHAR;
1515
1516 if (dfa->syntax.case_fold && isalpha (c))
1517 {
1518 charclass ccl;
1519 zeroset (&ccl);
1520 setbit_case_fold_c (c, &ccl);
1521 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1522 }
1523
1524 return dfa->lex.lasttok = c;
1525 }
1526 }
1527
1528 /* The above loop should consume at most a backslash
1529 and some other character. */
1530 abort ();
1531 return END; /* keeps pedantic compilers happy. */
1532}
1533
1534static void
1535addtok_mb (struct dfa *dfa, token t, char mbprop)
1536{
1537 if (dfa->talloc == dfa->tindex)
1538 {
1539 dfa->tokens = xpalloc (dfa->tokens, &dfa->talloc, 1, -1,
1540 sizeof *dfa->tokens);
1541 if (dfa->localeinfo.multibyte)
1542 dfa->multibyte_prop = xreallocarray (dfa->multibyte_prop, dfa->talloc,
1543 sizeof *dfa->multibyte_prop);
1544 }
1545 if (dfa->localeinfo.multibyte)
1546 dfa->multibyte_prop[dfa->tindex] = mbprop;
1547 dfa->tokens[dfa->tindex++] = t;
1548
1549 switch (t)
1550 {
1551 case QMARK:
1552 case STAR:
1553 case PLUS:
1554 break;
1555
1556 case CAT:
1557 case OR:
1558 dfa->parse.depth--;
1559 break;
1560
1561 case EMPTY:
1562 dfa->epsilon = true;
1563 goto increment_depth;
1564
1565 case BACKREF:
1566 dfa->fast = false;
1567 goto increment_nleaves;
1568
1569 case BEGLINE:
1570 case ENDLINE:
1571 case BEGWORD:
1572 case ENDWORD:
1573 case LIMWORD:
1574 case NOTLIMWORD:
1575 dfa->epsilon = true;
1576 FALLTHROUGH;
1577 default:
1578 increment_nleaves:
1579 dfa->nleaves++;
1580 increment_depth:
1581 dfa->parse.depth++;
1582 if (dfa->depth < dfa->parse.depth)
1583 dfa->depth = dfa->parse.depth;
1584 break;
1585 }
1586}
1587
1588static void addtok_wc (struct dfa *dfa, wint_t wc);
1589
1590/* Add the given token to the parse tree, maintaining the depth count and
1591 updating the maximum depth if necessary. */
1592static void
1593addtok (struct dfa *dfa, token t)
1594{
1595 if (dfa->localeinfo.multibyte && t == MBCSET)
1596 {
1597 bool need_or = false;
1598
1599 /* Extract wide characters into alternations for better performance.
1600 This does not require UTF-8. */
1601 for (idx_t i = 0; i < dfa->lex.brack.nchars; i++)
1602 {
1603 addtok_wc (dfa, dfa->lex.brack.chars[i]);
1604 if (need_or)
1605 addtok (dfa, OR);
1606 need_or = true;
1607 }
1608 dfa->lex.brack.nchars = 0;
1609
1610 /* Wide characters have been handled above, so it is possible
1611 that the set is empty now. Do nothing in that case. */
1612 if (dfa->lex.brack.cset != -1)
1613 {
1614 addtok (dfa, CSET + dfa->lex.brack.cset);
1615 if (need_or)
1616 addtok (dfa, OR);
1617 }
1618 }
1619 else
1620 {
1621 addtok_mb (dfa, t, 3);
1622 }
1623}
1624
1625/* We treat a multibyte character as a single atom, so that DFA
1626 can treat a multibyte character as a single expression.
1627
1628 e.g., we construct the following tree from "<mb1><mb2>".
1629 <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1630 <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1631static void
1632addtok_wc (struct dfa *dfa, wint_t wc)
1633{
1634 unsigned char buf[MB_LEN_MAX];
1635 mbstate_t s = { 0 };
1636 size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
1637 int buflen;
1638
1639 if (stored_bytes != (size_t) -1)
1640 buflen = stored_bytes;
1641 else
1642 {
1643 /* This is merely stop-gap. buf[0] is undefined, yet skipping
1644 the addtok_mb call altogether can corrupt the heap. */
1645 buflen = 1;
1646 buf[0] = 0;
1647 }
1648
1649 addtok_mb (dfa, buf[0], buflen == 1 ? 3 : 1);
1650 for (int i = 1; i < buflen; i++)
1651 {
1652 addtok_mb (dfa, buf[i], i == buflen - 1 ? 2 : 0);
1653 addtok (dfa, CAT);
1654 }
1655}
1656
1657static void
1658add_utf8_anychar (struct dfa *dfa)
1659{
1660 /* Since the Unicode Standard Version 4.0.0 (2003), a well-formed
1661 UTF-8 byte sequence has been defined as follows:
1662
1663 ([\x00-\x7f]
1664 |[\xc2-\xdf][\x80-\xbf]
1665 |[\xe0][\xa0-\xbf][\x80-\xbf]
1666 |[\xe1-\xec\xee-\xef][\x80-\xbf][\x80-\xbf]
1667 |[\xed][\x80-\x9f][\x80-\xbf]
1668 |[\xf0][\x90-\xbf][\x80-\xbf][\x80-\xbf])
1669 |[\xf1-\xf3][\x80-\xbf][\x80-\xbf][\x80-\xbf]
1670 |[\xf4][\x80-\x8f][\x80-\xbf][\x80-\xbf])
1671
1672 which I'll write more concisely "A|BC|DEC|FCC|GHC|IJCC|KCCC|LMCC",
1673 where A = [\x00-\x7f], B = [\xc2-\xdf], C = [\x80-\xbf],
1674 D = [\xe0], E = [\xa0-\xbf], F = [\xe1-\xec\xee-\xef], G = [\xed],
1675 H = [\x80-\x9f], I = [\xf0],
1676 J = [\x90-\xbf], K = [\xf1-\xf3], L = [\xf4], M = [\x80-\x8f].
1677
1678 This can be refactored to "A|(B|DE|GH|(F|IJ|LM|KC)C)C". */
1679
1680 /* Mnemonics for classes containing two or more bytes. */
1681 enum { A, B, C, E, F, H, J, K, M };
1682
1683 /* Mnemonics for single-byte tokens. */
1684 enum { D_token = 0xe0, G_token = 0xed, I_token = 0xf0, L_token = 0xf4 };
1685
1686 static charclass const utf8_classes[] = {
1687 /* A. 00-7f: 1-byte sequence. */
1688 CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
1689
1690 /* B. c2-df: 1st byte of a 2-byte sequence. */
1691 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
1692
1693 /* C. 80-bf: non-leading bytes. */
1694 CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
1695
1696 /* D. e0 (just a token). */
1697
1698 /* E. a0-bf: 2nd byte of a "DEC" sequence. */
1699 CHARCLASS_INIT (0, 0, 0, 0, 0, 0xffffffff, 0, 0),
1700
1701 /* F. e1-ec + ee-ef: 1st byte of an "FCC" sequence. */
1702 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xdffe),
1703
1704 /* G. ed (just a token). */
1705
1706 /* H. 80-9f: 2nd byte of a "GHC" sequence. */
1707 CHARCLASS_INIT (0, 0, 0, 0, 0xffff, 0, 0, 0),
1708
1709 /* I. f0 (just a token). */
1710
1711 /* J. 90-bf: 2nd byte of an "IJCC" sequence. */
1712 CHARCLASS_INIT (0, 0, 0, 0, 0xffff0000, 0xffffffff, 0, 0),
1713
1714 /* K. f1-f3: 1st byte of a "KCCC" sequence. */
1715 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xe0000),
1716
1717 /* L. f4 (just a token). */
1718
1719 /* M. 80-8f: 2nd byte of a "LMCC" sequence. */
1720 CHARCLASS_INIT (0, 0, 0, 0, 0xff, 0, 0, 0),
1721 };
1722
1723 /* Define the character classes that are needed below. */
1724 if (dfa->utf8_anychar_classes[0] == 0)
1725 {
1726 charclass c = utf8_classes[0];
1727 if (! (dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1728 clrbit ('\n', &c);
1729 if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1730 clrbit ('\0', &c);
1731 dfa->utf8_anychar_classes[0] = CSET + charclass_index (dfa, &c);
1732
1733 for (int i = 1; i < sizeof utf8_classes / sizeof *utf8_classes; i++)
1734 dfa->utf8_anychar_classes[i]
1735 = CSET + charclass_index (dfa, &utf8_classes[i]);
1736 }
1737
1738 /* Implement the "A|(B|DE|GH|(F|IJ|LM|KC)C)C" pattern mentioned above.
1739 The token buffer is in reverse Polish order, so we get
1740 "A B D E CAT OR G H CAT OR F I J CAT OR L M CAT OR K
1741 C CAT OR C CAT OR C CAT OR". */
1742 addtok (dfa, dfa->utf8_anychar_classes[A]);
1743 addtok (dfa, dfa->utf8_anychar_classes[B]);
1744 addtok (dfa, D_token);
1745 addtok (dfa, dfa->utf8_anychar_classes[E]);
1746 addtok (dfa, CAT);
1747 addtok (dfa, OR);
1748 addtok (dfa, G_token);
1749 addtok (dfa, dfa->utf8_anychar_classes[H]);
1750 addtok (dfa, CAT);
1751 addtok (dfa, OR);
1752 addtok (dfa, dfa->utf8_anychar_classes[F]);
1753 addtok (dfa, I_token);
1754 addtok (dfa, dfa->utf8_anychar_classes[J]);
1755 addtok (dfa, CAT);
1756 addtok (dfa, OR);
1757 addtok (dfa, L_token);
1758 addtok (dfa, dfa->utf8_anychar_classes[M]);
1759 addtok (dfa, CAT);
1760 addtok (dfa, OR);
1761 addtok (dfa, dfa->utf8_anychar_classes[K]);
1762 for (int i = 0; i < 3; i++)
1763 {
1764 addtok (dfa, dfa->utf8_anychar_classes[C]);
1765 addtok (dfa, CAT);
1766 addtok (dfa, OR);
1767 }
1768}
1769
1770/* The grammar understood by the parser is as follows.
1771
1772 regexp:
1773 regexp OR branch
1774 branch
1775
1776 branch:
1777 branch closure
1778 closure
1779
1780 closure:
1781 closure QMARK
1782 closure STAR
1783 closure PLUS
1784 closure REPMN
1785 atom
1786
1787 atom:
1788 <normal character>
1789 <multibyte character>
1790 ANYCHAR
1791 MBCSET
1792 CSET
1793 BACKREF
1794 BEGLINE
1795 ENDLINE
1796 BEGWORD
1797 ENDWORD
1798 LIMWORD
1799 NOTLIMWORD
1800 LPAREN regexp RPAREN
1801 <empty>
1802
1803 The parser builds a parse tree in postfix form in an array of tokens. */
1804
1805static void
1806atom (struct dfa *dfa)
1807{
1808 if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
1809 || dfa->parse.tok >= CSET
1810 || dfa->parse.tok == BEG || dfa->parse.tok == BACKREF
1811 || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
1812 || dfa->parse.tok == BEGWORD || dfa->parse.tok == ENDWORD
1813 || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD
1814 || dfa->parse.tok == ANYCHAR || dfa->parse.tok == MBCSET)
1815 {
1816 if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
1817 {
1818 /* For UTF-8 expand the period to a series of CSETs that define a
1819 valid UTF-8 character. This avoids using the slow multibyte
1820 path. I'm pretty sure it would be both profitable and correct to
1821 do it for any encoding; however, the optimization must be done
1822 manually as it is done above in add_utf8_anychar. So, let's
1823 start with UTF-8: it is the most used, and the structure of the
1824 encoding makes the correctness more obvious. */
1825 add_utf8_anychar (dfa);
1826 }
1827 else
1828 addtok (dfa, dfa->parse.tok);
1829 dfa->parse.tok = lex (dfa);
1830 }
1831 else if (dfa->parse.tok == WCHAR)
1832 {
1833 if (dfa->lex.wctok == WEOF)
1834 addtok (dfa, BACKREF);
1835 else
1836 {
1837 addtok_wc (dfa, dfa->lex.wctok);
1838
1839 if (dfa->syntax.case_fold)
1840 {
1841 wchar_t folded[CASE_FOLDED_BUFSIZE];
1842 int n = case_folded_counterparts (dfa->lex.wctok, folded);
1843 for (int i = 0; i < n; i++)
1844 {
1845 addtok_wc (dfa, folded[i]);
1846 addtok (dfa, OR);
1847 }
1848 }
1849 }
1850
1851 dfa->parse.tok = lex (dfa);
1852 }
1853 else if (dfa->parse.tok == LPAREN)
1854 {
1855 dfa->parse.tok = lex (dfa);
1856 regexp (dfa);
1857 if (dfa->parse.tok != RPAREN)
1858 dfaerror (_("unbalanced ("));
1859 dfa->parse.tok = lex (dfa);
1860 }
1861 else
1862 addtok (dfa, EMPTY);
1863}
1864
1865/* Return the number of tokens in the given subexpression. */
1866static idx_t _GL_ATTRIBUTE_PURE
1867nsubtoks (struct dfa const *dfa, idx_t tindex)
1868{
1869 switch (dfa->tokens[tindex - 1])
1870 {
1871 default:
1872 return 1;
1873 case QMARK:
1874 case STAR:
1875 case PLUS:
1876 return 1 + nsubtoks (dfa, tindex - 1);
1877 case CAT:
1878 case OR:
1879 {
1880 idx_t ntoks1 = nsubtoks (dfa, tindex - 1);
1881 return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
1882 }
1883 }
1884}
1885
1886/* Copy the given subexpression to the top of the tree. */
1887static void
1888copytoks (struct dfa *dfa, idx_t tindex, idx_t ntokens)
1889{
1890 if (dfa->localeinfo.multibyte)
1891 for (idx_t i = 0; i < ntokens; i++)
1892 addtok_mb (dfa, dfa->tokens[tindex + i],
1893 dfa->multibyte_prop[tindex + i]);
1894 else
1895 for (idx_t i = 0; i < ntokens; i++)
1896 addtok_mb (dfa, dfa->tokens[tindex + i], 3);
1897}
1898
1899static void
1900closure (struct dfa *dfa)
1901{
1902 atom (dfa);
1903 while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR
1904 || dfa->parse.tok == PLUS || dfa->parse.tok == REPMN)
1905 if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep))
1906 {
1907 idx_t ntokens = nsubtoks (dfa, dfa->tindex);
1908 idx_t tindex = dfa->tindex - ntokens;
1909 if (dfa->lex.maxrep < 0)
1910 addtok (dfa, PLUS);
1911 if (dfa->lex.minrep == 0)
1912 addtok (dfa, QMARK);
1913 int i;
1914 for (i = 1; i < dfa->lex.minrep; i++)
1915 {
1916 copytoks (dfa, tindex, ntokens);
1917 addtok (dfa, CAT);
1918 }
1919 for (; i < dfa->lex.maxrep; i++)
1920 {
1921 copytoks (dfa, tindex, ntokens);
1922 addtok (dfa, QMARK);
1923 addtok (dfa, CAT);
1924 }
1925 dfa->parse.tok = lex (dfa);
1926 }
1927 else if (dfa->parse.tok == REPMN)
1928 {
1929 dfa->tindex -= nsubtoks (dfa, dfa->tindex);
1930 dfa->parse.tok = lex (dfa);
1931 closure (dfa);
1932 }
1933 else
1934 {
1935 addtok (dfa, dfa->parse.tok);
1936 dfa->parse.tok = lex (dfa);
1937 }
1938}
1939
1940static void
1941branch (struct dfa* dfa)
1942{
1943 closure (dfa);
1944 while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
1945 && dfa->parse.tok >= 0)
1946 {
1947 closure (dfa);
1948 addtok (dfa, CAT);
1949 }
1950}
1951
1952static void
1953regexp (struct dfa *dfa)
1954{
1955 branch (dfa);
1956 while (dfa->parse.tok == OR)
1957 {
1958 dfa->parse.tok = lex (dfa);
1959 branch (dfa);
1960 addtok (dfa, OR);
1961 }
1962}
1963
1964/* Parse a string S of length LEN into D. S can include NUL characters.
1965 This is the main entry point for the parser. */
1966void
1967dfaparse (char const *s, idx_t len, struct dfa *d)
1968{
1969 d->lex.ptr = s;
1970 d->lex.left = len;
1971 d->lex.lasttok = END;
1972 d->lex.laststart = true;
1973
1974 if (!d->syntax.syntax_bits_set)
1975 dfaerror (_("no syntax specified"));
1976
1977 if (!d->nregexps)
1978 addtok (d, BEG);
1979
1980 d->parse.tok = lex (d);
1981 d->parse.depth = d->depth;
1982
1983 regexp (d);
1984
1985 if (d->parse.tok != END)
1986 dfaerror (_("unbalanced )"));
1987
1988 addtok (d, END - d->nregexps);
1989 addtok (d, CAT);
1990
1991 if (d->nregexps)
1992 addtok (d, OR);
1993
1994 ++d->nregexps;
1995}
1996
1997/* Some primitives for operating on sets of positions. */
1998
1999/* Copy one set to another. */
2000static void
2001copy (position_set const *src, position_set *dst)
2002{
2003 if (dst->alloc < src->nelem)
2004 {
2005 free (dst->elems);
2006 dst->elems = xpalloc (NULL, &dst->alloc, src->nelem - dst->alloc, -1,
2007 sizeof *dst->elems);
2008 }
2009 dst->nelem = src->nelem;
2010 if (src->nelem != 0)
2011 memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
2012}
2013
2014static void
2015alloc_position_set (position_set *s, idx_t size)
2016{
2017 s->elems = xnmalloc (size, sizeof *s->elems);
2018 s->alloc = size;
2019 s->nelem = 0;
2020}
2021
2022/* Insert position P in set S. S is maintained in sorted order on
2023 decreasing index. If there is already an entry in S with P.index
2024 then merge (logically-OR) P's constraints into the one in S.
2025 S->elems must point to an array large enough to hold the resulting set. */
2026static void
2027insert (position p, position_set *s)
2028{
2029 idx_t count = s->nelem;
2030 idx_t lo = 0, hi = count;
2031 while (lo < hi)
2032 {
2033 idx_t mid = (lo + hi) >> 1;
2034 if (s->elems[mid].index < p.index)
2035 lo = mid + 1;
2036 else if (s->elems[mid].index == p.index)
2037 {
2038 s->elems[mid].constraint |= p.constraint;
2039 return;
2040 }
2041 else
2042 hi = mid;
2043 }
2044
2045 s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2046 for (idx_t i = count; i > lo; i--)
2047 s->elems[i] = s->elems[i - 1];
2048 s->elems[lo] = p;
2049 ++s->nelem;
2050}
2051
2052static void
2053append (position p, position_set *s)
2054{
2055 idx_t count = s->nelem;
2056 s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2057 s->elems[s->nelem++] = p;
2058}
2059
2060/* Merge S1 and S2 (with the additional constraint C2) into M. The
2061 result is as if the positions of S1, and of S2 with the additional
2062 constraint C2, were inserted into an initially empty set. */
2063static void
2064merge_constrained (position_set const *s1, position_set const *s2,
2065 unsigned int c2, position_set *m)
2066{
2067 idx_t i = 0, j = 0;
2068
2069 if (m->alloc - s1->nelem < s2->nelem)
2070 {
2071 free (m->elems);
2072 m->alloc = s1->nelem;
2073 m->elems = xpalloc (NULL, &m->alloc, s2->nelem, -1, sizeof *m->elems);
2074 }
2075 m->nelem = 0;
2076 while (i < s1->nelem || j < s2->nelem)
2077 if (! (j < s2->nelem)
2078 || (i < s1->nelem && s1->elems[i].index <= s2->elems[j].index))
2079 {
2080 unsigned int c = ((i < s1->nelem && j < s2->nelem
2081 && s1->elems[i].index == s2->elems[j].index)
2082 ? s2->elems[j++].constraint & c2
2083 : 0);
2084 m->elems[m->nelem].index = s1->elems[i].index;
2085 m->elems[m->nelem++].constraint = s1->elems[i++].constraint | c;
2086 }
2087 else
2088 {
2089 if (s2->elems[j].constraint & c2)
2090 {
2091 m->elems[m->nelem].index = s2->elems[j].index;
2092 m->elems[m->nelem++].constraint = s2->elems[j].constraint & c2;
2093 }
2094 j++;
2095 }
2096}
2097
2098/* Merge two sets of positions into a third. The result is exactly as if
2099 the positions of both sets were inserted into an initially empty set. */
2100static void
2101merge (position_set const *s1, position_set const *s2, position_set *m)
2102{
2103 merge_constrained (s1, s2, -1, m);
2104}
2105
2106/* Merge into DST all the elements of SRC, possibly destroying
2107 the contents of the temporary M. */
2108static void
2109merge2 (position_set *dst, position_set const *src, position_set *m)
2110{
2111 if (src->nelem < 4)
2112 {
2113 for (idx_t i = 0; i < src->nelem; i++)
2114 insert (src->elems[i], dst);
2115 }
2116 else
2117 {
2118 merge (src, dst, m);
2119 copy (m, dst);
2120 }
2121}
2122
2123/* Delete a position from a set. Return the nonzero constraint of the
2124 deleted position, or zero if there was no such position. */
2125static unsigned int
2126delete (idx_t del, position_set *s)
2127{
2128 idx_t count = s->nelem;
2129 idx_t lo = 0, hi = count;
2130 while (lo < hi)
2131 {
2132 idx_t mid = (lo + hi) >> 1;
2133 if (s->elems[mid].index < del)
2134 lo = mid + 1;
2135 else if (s->elems[mid].index == del)
2136 {
2137 unsigned int c = s->elems[mid].constraint;
2138 idx_t i;
2139 for (i = mid; i + 1 < count; i++)
2140 s->elems[i] = s->elems[i + 1];
2141 s->nelem = i;
2142 return c;
2143 }
2144 else
2145 hi = mid;
2146 }
2147 return 0;
2148}
2149
2150/* Replace a position with the followed set. */
2151static void
2152replace (position_set *dst, idx_t del, position_set *add,
2153 unsigned int constraint, position_set *tmp)
2154{
2155 unsigned int c = delete (del, dst) & constraint;
2156
2157 if (c)
2158 {
2159 copy (dst, tmp);
2160 merge_constrained (tmp, add, c, dst);
2161 }
2162}
2163
2164/* Find the index of the state corresponding to the given position set with
2165 the given preceding context, or create a new state if there is no such
2166 state. Context tells whether we got here on a newline or letter. */
2167static state_num
2168state_index (struct dfa *d, position_set const *s, int context)
2169{
2170 size_t hash = 0;
2171 int constraint = 0;
2172 state_num i;
2173
2174 for (i = 0; i < s->nelem; ++i)
2175 {
2176 idx_t ind = s->elems[i].index;
2177 hash ^= ind + s->elems[i].constraint;
2178 }
2179
2180 /* Try to find a state that exactly matches the proposed one. */
2181 for (i = 0; i < d->sindex; ++i)
2182 {
2183 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
2184 || context != d->states[i].context)
2185 continue;
2186 state_num j;
2187 for (j = 0; j < s->nelem; ++j)
2188 if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint
2189 || s->elems[j].index != d->states[i].elems.elems[j].index)
2190 break;
2191 if (j == s->nelem)
2192 return i;
2193 }
2194
2195#ifdef DEBUG
2196 fprintf (stderr, "new state %td\n nextpos:", i);
2197 for (state_num j = 0; j < s->nelem; j++)
2198 {
2199 fprintf (stderr, " %td:", s->elems[j].index);
2200 prtok (d->tokens[s->elems[j].index]);
2201 }
2202 fprintf (stderr, "\n context:");
2203 if (context ^ CTX_ANY)
2204 {
2205 if (context & CTX_NONE)
2206 fprintf (stderr, " CTX_NONE");
2207 if (context & CTX_LETTER)
2208 fprintf (stderr, " CTX_LETTER");
2209 if (context & CTX_NEWLINE)
2210 fprintf (stderr, " CTX_NEWLINE");
2211 }
2212 else
2213 fprintf (stderr, " CTX_ANY");
2214 fprintf (stderr, "\n");
2215#endif
2216
2217 for (state_num j = 0; j < s->nelem; j++)
2218 {
2219 int c = d->constraints[s->elems[j].index];
2220
2221 if (c != 0)
2222 {
2223 if (succeeds_in_context (c, context, CTX_ANY))
2224 constraint |= c;
2225 }
2226 else if (d->tokens[s->elems[j].index] == BACKREF)
2227 constraint = NO_CONSTRAINT;
2228 }
2229
2230
2231 /* Create a new state. */
2232 d->states = maybe_realloc (d->states, d->sindex, &d->salloc, -1,
2233 sizeof *d->states);
2234 d->states[i].hash = hash;
2235 alloc_position_set (&d->states[i].elems, s->nelem);
2236 copy (s, &d->states[i].elems);
2237 d->states[i].context = context;
2238 d->states[i].constraint = constraint;
2239 d->states[i].mbps.nelem = 0;
2240 d->states[i].mbps.elems = NULL;
2241 d->states[i].mb_trindex = -1;
2242
2243 ++d->sindex;
2244
2245 return i;
2246}
2247
2248/* Find the epsilon closure of D's set of positions. If any position of the set
2249 contains a symbol that matches the empty string in some context, replace
2250 that position with the elements of its follow labeled with an appropriate
2251 constraint. Repeat exhaustively until no funny positions are left.
2252 S->elems must be large enough to hold the result. BACKWARD is D's
2253 backward set; use and update it too. */
2254static void
2255epsclosure (struct dfa const *d, position_set *backward)
2256{
2257 position_set tmp;
2258 alloc_position_set (&tmp, d->nleaves);
2259 for (idx_t i = 0; i < d->tindex; i++)
2260 if (0 < d->follows[i].nelem)
2261 {
2262 unsigned int constraint;
2263 switch (d->tokens[i])
2264 {
2265 default:
2266 continue;
2267
2268 case BEGLINE:
2269 constraint = BEGLINE_CONSTRAINT;
2270 break;
2271 case ENDLINE:
2272 constraint = ENDLINE_CONSTRAINT;
2273 break;
2274 case BEGWORD:
2275 constraint = BEGWORD_CONSTRAINT;
2276 break;
2277 case ENDWORD:
2278 constraint = ENDWORD_CONSTRAINT;
2279 break;
2280 case LIMWORD:
2281 constraint = LIMWORD_CONSTRAINT;
2282 break;
2283 case NOTLIMWORD:
2284 constraint = NOTLIMWORD_CONSTRAINT;
2285 break;
2286 case EMPTY:
2287 constraint = NO_CONSTRAINT;
2288 break;
2289 }
2290
2291 delete (i, &d->follows[i]);
2292
2293 for (idx_t j = 0; j < backward[i].nelem; j++)
2294 replace (&d->follows[backward[i].elems[j].index], i, &d->follows[i],
2295 constraint, &tmp);
2296 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2297 replace (&backward[d->follows[i].elems[j].index], i, &backward[i],
2298 NO_CONSTRAINT, &tmp);
2299 }
2300 free (tmp.elems);
2301}
2302
2303/* Returns the set of contexts for which there is at least one
2304 character included in C. */
2305
2306static int
2307charclass_context (struct dfa const *dfa, charclass const *c)
2308{
2309 int context = 0;
2310
2311 for (int j = 0; j < CHARCLASS_WORDS; j++)
2312 {
2313 if (c->w[j] & dfa->syntax.newline.w[j])
2314 context |= CTX_NEWLINE;
2315 if (c->w[j] & dfa->syntax.letters.w[j])
2316 context |= CTX_LETTER;
2317 if (c->w[j] & ~(dfa->syntax.letters.w[j] | dfa->syntax.newline.w[j]))
2318 context |= CTX_NONE;
2319 }
2320
2321 return context;
2322}
2323
2324/* Returns the contexts on which the position set S depends. Each context
2325 in the set of returned contexts (let's call it SC) may have a different
2326 follow set than other contexts in SC, and also different from the
2327 follow set of the complement set (sc ^ CTX_ANY). However, all contexts
2328 in the complement set will have the same follow set. */
2329
2330static int _GL_ATTRIBUTE_PURE
2331state_separate_contexts (struct dfa *d, position_set const *s)
2332{
2333 int separate_contexts = 0;
2334
2335 for (idx_t j = 0; j < s->nelem; j++)
2336 separate_contexts |= d->separates[s->elems[j].index];
2337
2338 return separate_contexts;
2339}
2340
2341enum
2342{
2343 /* Single token is repeated. It is distinguished from non-repeated. */
2344 OPT_REPEAT = (1 << 0),
2345
2346 /* Multiple tokens are repeated. This flag is on at head of tokens. The
2347 node is not merged. */
2348 OPT_LPAREN = (1 << 1),
2349
2350 /* Multiple branches are joined. The node is not merged. */
2351 OPT_RPAREN = (1 << 2),
2352
2353 /* The node is walked. If the node is found in walking again, OPT_RPAREN
2354 flag is turned on. */
2355 OPT_WALKED = (1 << 3),
2356
2357 /* The node is queued. The node is not queued again. */
2358 OPT_QUEUED = (1 << 4)
2359};
2360
2361static void
2362merge_nfa_state (struct dfa *d, idx_t tindex, char *flags,
2363 position_set *merged)
2364{
2365 position_set *follows = d->follows;
2366 idx_t nelem = 0;
2367
2368 for (idx_t i = 0; i < follows[tindex].nelem; i++)
2369 {
2370 idx_t sindex = follows[tindex].elems[i].index;
2371
2372 /* Skip the node as pruned in future. */
2373 unsigned int iconstraint = follows[tindex].elems[i].constraint;
2374 if (iconstraint == 0)
2375 continue;
2376
2377 if (d->tokens[follows[tindex].elems[i].index] <= END)
2378 {
2379 d->constraints[tindex] |= follows[tindex].elems[i].constraint;
2380 continue;
2381 }
2382
2383 if (sindex != tindex && !(flags[sindex] & (OPT_LPAREN | OPT_RPAREN)))
2384 {
2385 idx_t j;
2386
2387 for (j = 0; j < nelem; j++)
2388 {
2389 idx_t dindex = follows[tindex].elems[j].index;
2390
2391 if (dindex == tindex)
2392 continue;
2393
2394 if (follows[tindex].elems[j].constraint != iconstraint)
2395 continue;
2396
2397 if (flags[dindex] & (OPT_LPAREN | OPT_RPAREN))
2398 continue;
2399
2400 if (d->tokens[sindex] != d->tokens[dindex])
2401 continue;
2402
2403 if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT)
2404 continue;
2405
2406 if (flags[sindex] & OPT_REPEAT)
2407 delete (sindex, &follows[sindex]);
2408
2409 merge2 (&follows[dindex], &follows[sindex], merged);
2410
2411 break;
2412 }
2413
2414 if (j < nelem)
2415 continue;
2416 }
2417
2418 follows[tindex].elems[nelem++] = follows[tindex].elems[i];
2419 flags[sindex] |= OPT_QUEUED;
2420 }
2421
2422 follows[tindex].nelem = nelem;
2423}
2424
2425static int
2426compare (const void *a, const void *b)
2427{
2428 position const *p = a, *q = b;
2429 return (p->index > q->index) - (p->index < q->index);
2430}
2431
2432static void
2433reorder_tokens (struct dfa *d)
2434{
2435 idx_t nleaves = 0;
2436 ptrdiff_t *map = xnmalloc (d->tindex, sizeof *map);
2437 map[0] = nleaves++;
2438 for (idx_t i = 1; i < d->tindex; i++)
2439 map[i] = -1;
2440
2441 token *tokens = xnmalloc (d->nleaves, sizeof *tokens);
2442 position_set *follows = xnmalloc (d->nleaves, sizeof *follows);
2443 int *constraints = xnmalloc (d->nleaves, sizeof *constraints);
2444 char *multibyte_prop = (d->localeinfo.multibyte
2445 ? xnmalloc (d->nleaves, sizeof *multibyte_prop)
2446 : NULL);
2447
2448 for (idx_t i = 0; i < d->tindex; i++)
2449 {
2450 if (map[i] < 0)
2451 {
2452 free (d->follows[i].elems);
2453 d->follows[i].elems = NULL;
2454 d->follows[i].nelem = 0;
2455 continue;
2456 }
2457
2458 tokens[map[i]] = d->tokens[i];
2459 follows[map[i]] = d->follows[i];
2460 constraints[map[i]] = d->constraints[i];
2461
2462 if (multibyte_prop != NULL)
2463 multibyte_prop[map[i]] = d->multibyte_prop[i];
2464
2465 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2466 {
2467 if (map[d->follows[i].elems[j].index] == -1)
2468 map[d->follows[i].elems[j].index] = nleaves++;
2469
2470 d->follows[i].elems[j].index = map[d->follows[i].elems[j].index];
2471 }
2472
2473 qsort (d->follows[i].elems, d->follows[i].nelem,
2474 sizeof *d->follows[i].elems, compare);
2475 }
2476
2477 for (idx_t i = 0; i < nleaves; i++)
2478 {
2479 d->tokens[i] = tokens[i];
2480 d->follows[i] = follows[i];
2481 d->constraints[i] = constraints[i];
2482
2483 if (multibyte_prop != NULL)
2484 d->multibyte_prop[i] = multibyte_prop[i];
2485 }
2486
2487 d->tindex = d->nleaves = nleaves;
2488
2489 free (tokens);
2490 free (follows);
2491 free (constraints);
2492 free (multibyte_prop);
2493 free (map);
2494}
2495
2496static void
2497dfaoptimize (struct dfa *d)
2498{
2499 char *flags = xizalloc (d->tindex);
2500
2501 for (idx_t i = 0; i < d->tindex; i++)
2502 {
2503 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2504 {
2505 if (d->follows[i].elems[j].index == i)
2506 flags[d->follows[i].elems[j].index] |= OPT_REPEAT;
2507 else if (d->follows[i].elems[j].index < i)
2508 flags[d->follows[i].elems[j].index] |= OPT_LPAREN;
2509 else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED)
2510 flags[d->follows[i].elems[j].index] |= OPT_RPAREN;
2511 else
2512 flags[d->follows[i].elems[j].index] |= OPT_WALKED;
2513 }
2514 }
2515
2516 flags[0] |= OPT_QUEUED;
2517
2518 position_set merged0;
2519 position_set *merged = &merged0;
2520 alloc_position_set (merged, d->nleaves);
2521
2522 d->constraints = xicalloc (d->tindex, sizeof *d->constraints);
2523
2524 for (idx_t i = 0; i < d->tindex; i++)
2525 if (flags[i] & OPT_QUEUED)
2526 merge_nfa_state (d, i, flags, merged);
2527
2528 reorder_tokens (d);
2529
2530 free (merged->elems);
2531 free (flags);
2532}
2533
2534/* Perform bottom-up analysis on the parse tree, computing various functions.
2535 Note that at this point, we're pretending constructs like \< are real
2536 characters rather than constraints on what can follow them.
2537
2538 Nullable: A node is nullable if it is at the root of a regexp that can
2539 match the empty string.
2540 * EMPTY leaves are nullable.
2541 * No other leaf is nullable.
2542 * A QMARK or STAR node is nullable.
2543 * A PLUS node is nullable if its argument is nullable.
2544 * A CAT node is nullable if both its arguments are nullable.
2545 * An OR node is nullable if either argument is nullable.
2546
2547 Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
2548 that could correspond to the first character of a string matching the
2549 regexp rooted at the given node.
2550 * EMPTY leaves have empty firstpos.
2551 * The firstpos of a nonempty leaf is that leaf itself.
2552 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2553 argument.
2554 * The firstpos of a CAT node is the firstpos of the left argument, union
2555 the firstpos of the right if the left argument is nullable.
2556 * The firstpos of an OR node is the union of firstpos of each argument.
2557
2558 Lastpos: The lastpos of a node is the set of positions that could
2559 correspond to the last character of a string matching the regexp at
2560 the given node.
2561 * EMPTY leaves have empty lastpos.
2562 * The lastpos of a nonempty leaf is that leaf itself.
2563 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2564 argument.
2565 * The lastpos of a CAT node is the lastpos of its right argument, union
2566 the lastpos of the left if the right argument is nullable.
2567 * The lastpos of an OR node is the union of the lastpos of each argument.
2568
2569 Follow: The follow of a position is the set of positions that could
2570 correspond to the character following a character matching the node in
2571 a string matching the regexp. At this point we consider special symbols
2572 that match the empty string in some context to be just normal characters.
2573 Later, if we find that a special symbol is in a follow set, we will
2574 replace it with the elements of its follow, labeled with an appropriate
2575 constraint.
2576 * Every node in the firstpos of the argument of a STAR or PLUS node is in
2577 the follow of every node in the lastpos.
2578 * Every node in the firstpos of the second argument of a CAT node is in
2579 the follow of every node in the lastpos of the first argument.
2580
2581 Because of the postfix representation of the parse tree, the depth-first
2582 analysis is conveniently done by a linear scan with the aid of a stack.
2583 Sets are stored as arrays of the elements, obeying a stack-like allocation
2584 scheme; the number of elements in each set deeper in the stack can be
2585 used to determine the address of a particular set's array. */
2586static void
2587dfaanalyze (struct dfa *d, bool searchflag)
2588{
2589 /* Array allocated to hold position sets. */
2590 position *posalloc = xnmalloc (d->nleaves, 2 * sizeof *posalloc);
2591 /* Firstpos and lastpos elements. */
2592 position *firstpos = posalloc;
2593 position *lastpos = firstpos + d->nleaves;
2594 position pos;
2595 position_set tmp;
2596
2597 /* Stack for element counts and nullable flags. */
2598 struct
2599 {
2600 /* Whether the entry is nullable. */
2601 bool nullable;
2602
2603 /* Counts of firstpos and lastpos sets. */
2604 idx_t nfirstpos;
2605 idx_t nlastpos;
2606 } *stkalloc = xnmalloc (d->depth, sizeof *stkalloc), *stk = stkalloc;
2607
2608 position_set merged; /* Result of merging sets. */
2609
2610 addtok (d, CAT);
2611 idx_t tindex = d->tindex;
2612
2613#ifdef DEBUG
2614 fprintf (stderr, "dfaanalyze:\n");
2615 for (idx_t i = 0; i < tindex; i++)
2616 {
2617 fprintf (stderr, " %td:", i);
2618 prtok (d->tokens[i]);
2619 }
2620 putc ('\n', stderr);
2621#endif
2622
2623 d->searchflag = searchflag;
2624 alloc_position_set (&merged, d->nleaves);
2625 d->follows = xicalloc (tindex, sizeof *d->follows);
2626 position_set *backward
2627 = d->epsilon ? xicalloc (tindex, sizeof *backward) : NULL;
2628
2629 for (idx_t i = 0; i < tindex; i++)
2630 {
2631 switch (d->tokens[i])
2632 {
2633 case EMPTY:
2634 /* The empty set is nullable. */
2635 stk->nullable = true;
2636
2637 /* The firstpos and lastpos of the empty leaf are both empty. */
2638 stk->nfirstpos = stk->nlastpos = 0;
2639 stk++;
2640 break;
2641
2642 case STAR:
2643 case PLUS:
2644 /* Every element in the lastpos of the argument is in the backward
2645 set of every element in the firstpos. */
2646 if (d->epsilon)
2647 {
2648 tmp.elems = lastpos - stk[-1].nlastpos;
2649 tmp.nelem = stk[-1].nlastpos;
2650 for (position *p = firstpos - stk[-1].nfirstpos;
2651 p < firstpos; p++)
2652 merge2 (&backward[p->index], &tmp, &merged);
2653 }
2654
2655 /* Every element in the firstpos of the argument is in the follow
2656 of every element in the lastpos. */
2657 {
2658 tmp.elems = firstpos - stk[-1].nfirstpos;
2659 tmp.nelem = stk[-1].nfirstpos;
2660 for (position *p = lastpos - stk[-1].nlastpos; p < lastpos; p++)
2661 merge2 (&d->follows[p->index], &tmp, &merged);
2662 }
2663 FALLTHROUGH;
2664 case QMARK:
2665 /* A QMARK or STAR node is automatically nullable. */
2666 if (d->tokens[i] != PLUS)
2667 stk[-1].nullable = true;
2668 break;
2669
2670 case CAT:
2671 /* Every element in the lastpos of the first argument is in
2672 the backward set of every element in the firstpos of the
2673 second argument. */
2674 if (backward)
2675 {
2676 tmp.nelem = stk[-2].nlastpos;
2677 tmp.elems = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2678 for (position *p = firstpos - stk[-1].nfirstpos;
2679 p < firstpos; p++)
2680 merge2 (&backward[p->index], &tmp, &merged);
2681 }
2682
2683 /* Every element in the firstpos of the second argument is in the
2684 follow of every element in the lastpos of the first argument. */
2685 {
2686 tmp.nelem = stk[-1].nfirstpos;
2687 tmp.elems = firstpos - stk[-1].nfirstpos;
2688 for (position *plim = lastpos - stk[-1].nlastpos,
2689 *p = plim - stk[-2].nlastpos;
2690 p < plim; p++)
2691 merge2 (&d->follows[p->index], &tmp, &merged);
2692 }
2693
2694 /* The firstpos of a CAT node is the firstpos of the first argument,
2695 union that of the second argument if the first is nullable. */
2696 if (stk[-2].nullable)
2697 stk[-2].nfirstpos += stk[-1].nfirstpos;
2698 else
2699 firstpos -= stk[-1].nfirstpos;
2700
2701 /* The lastpos of a CAT node is the lastpos of the second argument,
2702 union that of the first argument if the second is nullable. */
2703 if (stk[-1].nullable)
2704 stk[-2].nlastpos += stk[-1].nlastpos;
2705 else
2706 {
2707 position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2708 for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2709 p[j] = p[j + stk[-2].nlastpos];
2710 lastpos -= stk[-2].nlastpos;
2711 stk[-2].nlastpos = stk[-1].nlastpos;
2712 }
2713
2714 /* A CAT node is nullable if both arguments are nullable. */
2715 stk[-2].nullable &= stk[-1].nullable;
2716 stk--;
2717 break;
2718
2719 case OR:
2720 /* The firstpos is the union of the firstpos of each argument. */
2721 stk[-2].nfirstpos += stk[-1].nfirstpos;
2722
2723 /* The lastpos is the union of the lastpos of each argument. */
2724 stk[-2].nlastpos += stk[-1].nlastpos;
2725
2726 /* An OR node is nullable if either argument is nullable. */
2727 stk[-2].nullable |= stk[-1].nullable;
2728 stk--;
2729 break;
2730
2731 default:
2732 /* Anything else is a nonempty position. (Note that special
2733 constructs like \< are treated as nonempty strings here;
2734 an "epsilon closure" effectively makes them nullable later.
2735 Backreferences have to get a real position so we can detect
2736 transitions on them later. But they are nullable. */
2737 stk->nullable = d->tokens[i] == BACKREF;
2738
2739 /* This position is in its own firstpos and lastpos. */
2740 stk->nfirstpos = stk->nlastpos = 1;
2741 stk++;
2742
2743 firstpos->index = lastpos->index = i;
2744 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2745 firstpos++, lastpos++;
2746
2747 break;
2748 }
2749#ifdef DEBUG
2750 /* ... balance the above nonsyntactic #ifdef goo... */
2751 fprintf (stderr, "node %td:", i);
2752 prtok (d->tokens[i]);
2753 putc ('\n', stderr);
2754 fprintf (stderr,
2755 stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
2756 fprintf (stderr, " firstpos:");
2757 for (idx_t j = 0; j < stk[-1].nfirstpos; j++)
2758 {
2759 fprintf (stderr, " %td:", firstpos[j - stk[-1].nfirstpos].index);
2760 prtok (d->tokens[firstpos[j - stk[-1].nfirstpos].index]);
2761 }
2762 fprintf (stderr, "\n lastpos:");
2763 for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2764 {
2765 fprintf (stderr, " %td:", lastpos[j - stk[-1].nlastpos].index);
2766 prtok (d->tokens[lastpos[j - stk[-1].nlastpos].index]);
2767 }
2768 putc ('\n', stderr);
2769#endif
2770 }
2771
2772 if (backward)
2773 {
2774 /* For each follow set that is the follow set of a real position,
2775 replace it with its epsilon closure. */
2776 epsclosure (d, backward);
2777
2778 for (idx_t i = 0; i < tindex; i++)
2779 free (backward[i].elems);
2780 free (backward);
2781 }
2782
2783 dfaoptimize (d);
2784
2785#ifdef DEBUG
2786 for (idx_t i = 0; i < tindex; i++)
2787 if (d->tokens[i] == BEG || d->tokens[i] < NOTCHAR
2788 || d->tokens[i] == BACKREF || d->tokens[i] == ANYCHAR
2789 || d->tokens[i] == MBCSET || d->tokens[i] >= CSET)
2790 {
2791 fprintf (stderr, "follows(%td:", i);
2792 prtok (d->tokens[i]);
2793 fprintf (stderr, "):");
2794 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2795 {
2796 fprintf (stderr, " %td:", d->follows[i].elems[j].index);
2797 prtok (d->tokens[d->follows[i].elems[j].index]);
2798 }
2799 putc ('\n', stderr);
2800 }
2801#endif
2802
2803 pos.index = 0;
2804 pos.constraint = NO_CONSTRAINT;
2805
2806 alloc_position_set (&tmp, 1);
2807
2808 append (pos, &tmp);
2809
2810 d->separates = xicalloc (tindex, sizeof *d->separates);
2811
2812 for (idx_t i = 0; i < tindex; i++)
2813 {
2814 if (prev_newline_dependent (d->constraints[i]))
2815 d->separates[i] |= CTX_NEWLINE;
2816 if (prev_letter_dependent (d->constraints[i]))
2817 d->separates[i] |= CTX_LETTER;
2818
2819 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2820 {
2821 if (prev_newline_dependent (d->follows[i].elems[j].constraint))
2822 d->separates[i] |= CTX_NEWLINE;
2823 if (prev_letter_dependent (d->follows[i].elems[j].constraint))
2824 d->separates[i] |= CTX_LETTER;
2825 }
2826 }
2827
2828 /* Context wanted by some position. */
2829 int separate_contexts = state_separate_contexts (d, &tmp);
2830
2831 /* Build the initial state. */
2832 if (separate_contexts & CTX_NEWLINE)
2833 state_index (d, &tmp, CTX_NEWLINE);
2834 d->initstate_notbol = d->min_trcount
2835 = state_index (d, &tmp, separate_contexts ^ CTX_ANY);
2836 if (separate_contexts & CTX_LETTER)
2837 d->min_trcount = state_index (d, &tmp, CTX_LETTER);
2838 d->min_trcount++;
2839 d->trcount = 0;
2840
2841 free (posalloc);
2842 free (stkalloc);
2843 free (merged.elems);
2844 free (tmp.elems);
2845}
2846
2847/* Make sure D's state arrays are large enough to hold NEW_STATE. */
2848static void
2849realloc_trans_if_necessary (struct dfa *d)
2850{
2851 state_num oldalloc = d->tralloc;
2852 if (oldalloc < d->sindex)
2853 {
2854 state_num **realtrans = d->trans ? d->trans - 2 : NULL;
2855 idx_t newalloc1 = realtrans ? d->tralloc + 2 : 0;
2856 realtrans = xpalloc (realtrans, &newalloc1, d->sindex - oldalloc,
2857 -1, sizeof *realtrans);
2858 realtrans[0] = realtrans[1] = NULL;
2859 d->trans = realtrans + 2;
2860 idx_t newalloc = d->tralloc = newalloc1 - 2;
2861 d->fails = xreallocarray (d->fails, newalloc, sizeof *d->fails);
2862 d->success = xreallocarray (d->success, newalloc, sizeof *d->success);
2863 d->newlines = xreallocarray (d->newlines, newalloc, sizeof *d->newlines);
2864 if (d->localeinfo.multibyte)
2865 {
2866 realtrans = d->mb_trans ? d->mb_trans - 2 : NULL;
2867 realtrans = xreallocarray (realtrans, newalloc1, sizeof *realtrans);
2868 if (oldalloc == 0)
2869 realtrans[0] = realtrans[1] = NULL;
2870 d->mb_trans = realtrans + 2;
2871 }
2872 for (; oldalloc < newalloc; oldalloc++)
2873 {
2874 d->trans[oldalloc] = NULL;
2875 d->fails[oldalloc] = NULL;
2876 if (d->localeinfo.multibyte)
2877 d->mb_trans[oldalloc] = NULL;
2878 }
2879 }
2880}
2881
2882/*
2883 Calculate the transition table for a new state derived from state s
2884 for a compiled dfa d after input character uc, and return the new
2885 state number.
2886
2887 Do not worry about all possible input characters; calculate just the group
2888 of positions that match uc. Label it with the set of characters that
2889 every position in the group matches (taking into account, if necessary,
2890 preceding context information of s). Then find the union
2891 of these positions' follows, i.e., the set of positions of the
2892 new state. For each character in the group's label, set the transition
2893 on this character to be to a state corresponding to the set's positions,
2894 and its associated backward context information, if necessary.
2895
2896 When building a searching matcher, include the positions of state
2897 0 in every state.
2898
2899 The group is constructed by building an equivalence-class
2900 partition of the positions of s.
2901
2902 For each position, find the set of characters C that it matches. Eliminate
2903 any characters from C that fail on grounds of backward context.
2904
2905 Check whether the group's label L has nonempty
2906 intersection with C. If L - C is nonempty, create a new group labeled
2907 L - C and having the same positions as the current group, and set L to
2908 the intersection of L and C. Insert the position in the group, set
2909 C = C - L, and resume scanning.
2910
2911 If after comparing with every group there are characters remaining in C,
2912 create a new group labeled with the characters of C and insert this
2913 position in that group. */
2914
2915static state_num
2916build_state (state_num s, struct dfa *d, unsigned char uc)
2917{
2918 position_set follows; /* Union of the follows for each
2919 position of the current state. */
2920 position_set group; /* Positions that match the input char. */
2921 position_set tmp; /* Temporary space for merging sets. */
2922 state_num state; /* New state. */
2923 state_num state_newline; /* New state on a newline transition. */
2924 state_num state_letter; /* New state on a letter transition. */
2925
2926#ifdef DEBUG
2927 fprintf (stderr, "build state %td\n", s);
2928#endif
2929
2930 /* A pointer to the new transition table, and the table itself. */
2931 state_num **ptrans = (accepting (s, d) ? d->fails : d->trans) + s;
2932 state_num *trans = *ptrans;
2933
2934 if (!trans)
2935 {
2936 /* MAX_TRCOUNT is an arbitrary upper limit on the number of
2937 transition tables that can exist at once, other than for
2938 initial states. Often-used transition tables are quickly
2939 rebuilt, whereas rarely-used ones are cleared away. */
2940 if (MAX_TRCOUNT <= d->trcount)
2941 {
2942 for (state_num i = d->min_trcount; i < d->tralloc; i++)
2943 {
2944 free (d->trans[i]);
2945 free (d->fails[i]);
2946 d->trans[i] = d->fails[i] = NULL;
2947 }
2948 d->trcount = 0;
2949 }
2950
2951 d->trcount++;
2952 *ptrans = trans = xmalloc (NOTCHAR * sizeof *trans);
2953
2954 /* Fill transition table with a default value which means that the
2955 transited state has not been calculated yet. */
2956 for (int i = 0; i < NOTCHAR; i++)
2957 trans[i] = -2;
2958 }
2959
2960 /* Set up the success bits for this state. */
2961 d->success[s] = 0;
2962 if (accepts_in_context (d->states[s].context, CTX_NEWLINE, s, d))
2963 d->success[s] |= CTX_NEWLINE;
2964 if (accepts_in_context (d->states[s].context, CTX_LETTER, s, d))
2965 d->success[s] |= CTX_LETTER;
2966 if (accepts_in_context (d->states[s].context, CTX_NONE, s, d))
2967 d->success[s] |= CTX_NONE;
2968
2969 alloc_position_set (&follows, d->nleaves);
2970
2971 /* Find the union of the follows of the positions of the group.
2972 This is a hideously inefficient loop. Fix it someday. */
2973 for (idx_t j = 0; j < d->states[s].elems.nelem; j++)
2974 for (idx_t k = 0;
2975 k < d->follows[d->states[s].elems.elems[j].index].nelem; ++k)
2976 insert (d->follows[d->states[s].elems.elems[j].index].elems[k],
2977 &follows);
2978
2979 /* Positions that match the input char. */
2980 alloc_position_set (&group, d->nleaves);
2981
2982 /* The group's label. */
2983 charclass label;
2984 fillset (&label);
2985
2986 for (idx_t i = 0; i < follows.nelem; i++)
2987 {
2988 charclass matches; /* Set of matching characters. */
2989 position pos = follows.elems[i];
2990 bool matched = false;
2991 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
2992 {
2993 zeroset (&matches);
2994 setbit (d->tokens[pos.index], &matches);
2995 if (d->tokens[pos.index] == uc)
2996 matched = true;
2997 }
2998 else if (d->tokens[pos.index] >= CSET)
2999 {
3000 matches = d->charclasses[d->tokens[pos.index] - CSET];
3001 if (tstbit (uc, &matches))
3002 matched = true;
3003 }
3004 else if (d->tokens[pos.index] == ANYCHAR)
3005 {
3006 matches = d->charclasses[d->canychar];
3007 if (tstbit (uc, &matches))
3008 matched = true;
3009
3010 /* ANYCHAR must match with a single character, so we must put
3011 it to D->states[s].mbps which contains the positions which
3012 can match with a single character not a byte. If all
3013 positions which has ANYCHAR does not depend on context of
3014 next character, we put the follows instead of it to
3015 D->states[s].mbps to optimize. */
3016 if (succeeds_in_context (pos.constraint, d->states[s].context,
3017 CTX_NONE))
3018 {
3019 if (d->states[s].mbps.nelem == 0)
3020 alloc_position_set (&d->states[s].mbps, 1);
3021 insert (pos, &d->states[s].mbps);
3022 }
3023 }
3024 else
3025 continue;
3026
3027 /* Some characters may need to be eliminated from matches because
3028 they fail in the current context. */
3029 if (pos.constraint != NO_CONSTRAINT)
3030 {
3031 if (!succeeds_in_context (pos.constraint,
3032 d->states[s].context, CTX_NEWLINE))
3033 for (int j = 0; j < CHARCLASS_WORDS; j++)
3034 matches.w[j] &= ~d->syntax.newline.w[j];
3035 if (!succeeds_in_context (pos.constraint,
3036 d->states[s].context, CTX_LETTER))
3037 for (int j = 0; j < CHARCLASS_WORDS; ++j)
3038 matches.w[j] &= ~d->syntax.letters.w[j];
3039 if (!succeeds_in_context (pos.constraint,
3040 d->states[s].context, CTX_NONE))
3041 for (int j = 0; j < CHARCLASS_WORDS; ++j)
3042 matches.w[j] &= d->syntax.letters.w[j] | d->syntax.newline.w[j];
3043
3044 /* If there are no characters left, there's no point in going on. */
3045 if (emptyset (&matches))
3046 continue;
3047
3048 /* If we have reset the bit that made us declare "matched", reset
3049 that indicator, too. This is required to avoid an infinite loop
3050 with this command: echo cx | LC_ALL=C grep -E 'c\b[x ]' */
3051 if (!tstbit (uc, &matches))
3052 matched = false;
3053 }
3054
3055#ifdef DEBUG
3056 fprintf (stderr, " nextpos %td:", pos.index);
3057 prtok (d->tokens[pos.index]);
3058 fprintf (stderr, " of");
3059 for (unsigned j = 0; j < NOTCHAR; j++)
3060 if (tstbit (j, &matches))
3061 fprintf (stderr, " 0x%02x", j);
3062 fprintf (stderr, "\n");
3063#endif
3064
3065 if (matched)
3066 {
3067 for (int k = 0; k < CHARCLASS_WORDS; ++k)
3068 label.w[k] &= matches.w[k];
3069 append (pos, &group);
3070 }
3071 else
3072 {
3073 for (int k = 0; k < CHARCLASS_WORDS; ++k)
3074 label.w[k] &= ~matches.w[k];
3075 }
3076 }
3077
3078 alloc_position_set (&tmp, d->nleaves);
3079
3080 if (group.nelem > 0)
3081 {
3082 /* If we are building a searching matcher, throw in the positions
3083 of state 0 as well, if possible. */
3084 if (d->searchflag)
3085 {
3086 /* If a token in follows.elems is not 1st byte of a multibyte
3087 character, or the states of follows must accept the bytes
3088 which are not 1st byte of the multibyte character.
3089 Then, if a state of follows encounters a byte, it must not be
3090 a 1st byte of a multibyte character nor a single byte character.
3091 In this case, do not add state[0].follows to next state, because
3092 state[0] must accept 1st-byte.
3093
3094 For example, suppose <sb a> is a certain single byte character,
3095 <mb A> is a certain multibyte character, and the codepoint of
3096 <sb a> equals the 2nd byte of the codepoint of <mb A>. When
3097 state[0] accepts <sb a>, state[i] transits to state[i+1] by
3098 accepting the 1st byte of <mb A>, and state[i+1] accepts the
3099 2nd byte of <mb A>, if state[i+1] encounters the codepoint of
3100 <sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do
3101 not add state[0]. */
3102
3103 bool mergeit = !d->localeinfo.multibyte;
3104 if (!mergeit)
3105 {
3106 mergeit = true;
3107 for (idx_t j = 0; mergeit && j < group.nelem; j++)
3108 mergeit &= d->multibyte_prop[group.elems[j].index];
3109 }
3110 if (mergeit)
3111 merge2 (&group, &d->states[0].elems, &tmp);
3112 }
3113
3114 /* Find out if the new state will want any context information,
3115 by calculating possible contexts that the group can match,
3116 and separate contexts that the new state wants to know. */
3117 int possible_contexts = charclass_context (d, &label);
3118 int separate_contexts = state_separate_contexts (d, &group);
3119
3120 /* Find the state(s) corresponding to the union of the follows. */
3121 if (possible_contexts & ~separate_contexts)
3122 state = state_index (d, &group, separate_contexts ^ CTX_ANY);
3123 else
3124 state = -1;
3125 if (separate_contexts & possible_contexts & CTX_NEWLINE)
3126 state_newline = state_index (d, &group, CTX_NEWLINE);
3127 else
3128 state_newline = state;
3129 if (separate_contexts & possible_contexts & CTX_LETTER)
3130 state_letter = state_index (d, &group, CTX_LETTER);
3131 else
3132 state_letter = state;
3133
3134 /* Reallocate now, to reallocate any newline transition properly. */
3135 realloc_trans_if_necessary (d);
3136 }
3137
3138 /* If we are a searching matcher, the default transition is to a state
3139 containing the positions of state 0, otherwise the default transition
3140 is to fail miserably. */
3141 else if (d->searchflag)
3142 {
3143 state_newline = 0;
3144 state_letter = d->min_trcount - 1;
3145 state = d->initstate_notbol;
3146 }
3147 else
3148 {
3149 state_newline = -1;
3150 state_letter = -1;
3151 state = -1;
3152 }
3153
3154 /* Set the transitions for each character in the label. */
3155 for (int i = 0; i < NOTCHAR; i++)
3156 if (tstbit (i, &label))
3157 switch (d->syntax.sbit[i])
3158 {
3159 case CTX_NEWLINE:
3160 trans[i] = state_newline;
3161 break;
3162 case CTX_LETTER:
3163 trans[i] = state_letter;
3164 break;
3165 default:
3166 trans[i] = state;
3167 break;
3168 }
3169
3170#ifdef DEBUG
3171 fprintf (stderr, "trans table %td", s);
3172 for (int i = 0; i < NOTCHAR; ++i)
3173 {
3174 if (!(i & 0xf))
3175 fprintf (stderr, "\n");
3176 fprintf (stderr, " %2td", trans[i]);
3177 }
3178 fprintf (stderr, "\n");
3179#endif
3180
3181 free (group.elems);
3182 free (follows.elems);
3183 free (tmp.elems);
3184
3185 /* Keep the newline transition in a special place so we can use it as
3186 a sentinel. */
3187 if (tstbit (d->syntax.eolbyte, &label))
3188 {
3189 d->newlines[s] = trans[d->syntax.eolbyte];
3190 trans[d->syntax.eolbyte] = -1;
3191 }
3192
3193 return trans[uc];
3194}
3195
3196/* Multibyte character handling sub-routines for dfaexec. */
3197
3198/* Consume a single byte and transit state from 's' to '*next_state'.
3199 This function is almost same as the state transition routin in dfaexec.
3200 But state transition is done just once, otherwise matching succeed or
3201 reach the end of the buffer. */
3202static state_num
3203transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
3204{
3205 state_num *t;
3206
3207 if (d->trans[s])
3208 t = d->trans[s];
3209 else if (d->fails[s])
3210 t = d->fails[s];
3211 else
3212 {
3213 build_state (s, d, **pp);
3214 if (d->trans[s])
3215 t = d->trans[s];
3216 else
3217 {
3218 t = d->fails[s];
3219 assert (t);
3220 }
3221 }
3222
3223 if (t[**pp] == -2)
3224 build_state (s, d, **pp);
3225
3226 return t[*(*pp)++];
3227}
3228
3229/* Transit state from s, then return new state and update the pointer of
3230 the buffer. This function is for a period operator which can match a
3231 multi-byte character. */
3232static state_num
3233transit_state (struct dfa *d, state_num s, unsigned char const **pp,
3234 unsigned char const *end)
3235{
3236 wint_t wc;
3237
3238 int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
3239
3240 /* This state has some operators which can match a multibyte character. */
3241 d->mb_follows.nelem = 0;
3242
3243 /* Calculate the state which can be reached from the state 's' by
3244 consuming 'mbclen' single bytes from the buffer. */
3245 state_num s1 = s;
3246 int mbci;
3247 for (mbci = 0; mbci < mbclen && (mbci == 0 || d->min_trcount <= s); mbci++)
3248 s = transit_state_singlebyte (d, s, pp);
3249 *pp += mbclen - mbci;
3250
3251 if (wc == WEOF)
3252 {
3253 /* It is an invalid character, so ANYCHAR is not accepted. */
3254 return s;
3255 }
3256
3257 /* If all positions which have ANYCHAR do not depend on the context
3258 of the next character, calculate the next state with
3259 pre-calculated follows and cache the result. */
3260 if (d->states[s1].mb_trindex < 0)
3261 {
3262 if (MAX_TRCOUNT <= d->mb_trcount)
3263 {
3264 state_num s3;
3265 for (s3 = -1; s3 < d->tralloc; s3++)
3266 {
3267 free (d->mb_trans[s3]);
3268 d->mb_trans[s3] = NULL;
3269 }
3270
3271 for (state_num i = 0; i < d->sindex; i++)
3272 d->states[i].mb_trindex = -1;
3273 d->mb_trcount = 0;
3274 }
3275 d->states[s1].mb_trindex = d->mb_trcount++;
3276 }
3277
3278 if (! d->mb_trans[s])
3279 {
3280 enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
3281 enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
3282 d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
3283 for (int i = 0; i < MAX_TRCOUNT; i++)
3284 d->mb_trans[s][i] = -1;
3285 }
3286 else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
3287 return d->mb_trans[s][d->states[s1].mb_trindex];
3288
3289 if (s == -1)
3290 copy (&d->states[s1].mbps, &d->mb_follows);
3291 else
3292 merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
3293
3294 int separate_contexts = state_separate_contexts (d, &d->mb_follows);
3295 state_num s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
3296 realloc_trans_if_necessary (d);
3297
3298 d->mb_trans[s][d->states[s1].mb_trindex] = s2;
3299
3300 return s2;
3301}
3302
3303/* The initial state may encounter a byte which is not a single byte character
3304 nor the first byte of a multibyte character. But it is incorrect for the
3305 initial state to accept such a byte. For example, in Shift JIS the regular
3306 expression "\\" accepts the codepoint 0x5c, but should not accept the second
3307 byte of the codepoint 0x815c. Then the initial state must skip the bytes
3308 that are not a single byte character nor the first byte of a multibyte
3309 character.
3310
3311 Given DFA state d, use mbs_to_wchar to advance MBP until it reaches
3312 or exceeds P, and return the advanced MBP. If WCP is non-NULL and
3313 the result is greater than P, set *WCP to the final wide character
3314 processed, or to WEOF if no wide character is processed. Otherwise,
3315 if WCP is non-NULL, *WCP may or may not be updated.
3316
3317 Both P and MBP must be no larger than END. */
3318static unsigned char const *
3319skip_remains_mb (struct dfa *d, unsigned char const *p,
3320 unsigned char const *mbp, char const *end)
3321{
3322 if (d->syntax.never_trail[*p])
3323 return p;
3324 while (mbp < p)
3325 {
3326 wint_t wc;
3327 mbp += mbs_to_wchar (&wc, (char const *) mbp,
3328 end - (char const *) mbp, d);
3329 }
3330 return mbp;
3331}
3332
3333/* Search through a buffer looking for a match to the struct dfa *D.
3334 Find the first occurrence of a string matching the regexp in the
3335 buffer, and the shortest possible version thereof. Return a pointer to
3336 the first character after the match, or NULL if none is found. BEGIN
3337 points to the beginning of the buffer, and END points to the first byte
3338 after its end. Note however that we store a sentinel byte (usually
3339 newline) in *END, so the actual buffer must be one byte longer.
3340 When ALLOW_NL, newlines may appear in the matching string.
3341 If COUNT is non-NULL, increment *COUNT once for each newline processed.
3342 If MULTIBYTE, the input consists of multibyte characters and/or
3343 encoding-error bytes. Otherwise, it consists of single-byte characters.
3344 Here is the list of features that make this DFA matcher punt:
3345 - [M-N] range in non-simple locale: regex is up to 25% faster on [a-z]
3346 - [^...] in non-simple locale
3347 - [[=foo=]] or [[.foo.]]
3348 - [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
3349 - back-reference: (.)\1
3350 - word-delimiter in multibyte locale: \<, \>, \b, \B
3351 See struct localeinfo.simple for the definition of "simple locale". */
3352
3353static inline char *
3354dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
3355 ptrdiff_t *count, bool multibyte)
3356{
3357 if (MAX_TRCOUNT <= d->sindex)
3358 {
3359 for (state_num s = d->min_trcount; s < d->sindex; s++)
3360 {
3361 free (d->states[s].elems.elems);
3362 free (d->states[s].mbps.elems);
3363 }
3364 d->sindex = d->min_trcount;
3365
3366 if (d->trans)
3367 {
3368 for (state_num s = 0; s < d->tralloc; s++)
3369 {
3370 free (d->trans[s]);
3371 free (d->fails[s]);
3372 d->trans[s] = d->fails[s] = NULL;
3373 }
3374 d->trcount = 0;
3375 }
3376
3377 if (d->localeinfo.multibyte && d->mb_trans)
3378 {
3379 for (state_num s = -1; s < d->tralloc; s++)
3380 {
3381 free (d->mb_trans[s]);
3382 d->mb_trans[s] = NULL;
3383 }
3384 for (state_num s = 0; s < d->min_trcount; s++)
3385 d->states[s].mb_trindex = -1;
3386 d->mb_trcount = 0;
3387 }
3388 }
3389
3390 if (!d->tralloc)
3391 realloc_trans_if_necessary (d);
3392
3393 /* Current state. */
3394 state_num s = 0, s1 = 0;
3395
3396 /* Current input character. */
3397 unsigned char const *p = (unsigned char const *) begin;
3398 unsigned char const *mbp = p;
3399
3400 /* Copy of d->trans so it can be optimized into a register. */
3401 state_num **trans = d->trans;
3402 unsigned char eol = d->syntax.eolbyte; /* Likewise for eolbyte. */
3403 unsigned char saved_end = *(unsigned char *) end;
3404 *end = eol;
3405
3406 if (multibyte)
3407 {
3408 memset (&d->mbs, 0, sizeof d->mbs);
3409 if (d->mb_follows.alloc == 0)
3410 alloc_position_set (&d->mb_follows, d->nleaves);
3411 }
3412
3413 idx_t nlcount = 0;
3414 for (;;)
3415 {
3416 state_num *t;
3417 while ((t = trans[s]) != NULL)
3418 {
3419 if (s < d->min_trcount)
3420 {
3421 if (!multibyte || d->states[s].mbps.nelem == 0)
3422 {
3423 while (t[*p] == s)
3424 p++;
3425 }
3426 if (multibyte)
3427 p = mbp = skip_remains_mb (d, p, mbp, end);
3428 }
3429
3430 if (multibyte)
3431 {
3432 s1 = s;
3433
3434 if (d->states[s].mbps.nelem == 0
3435 || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3436 {
3437 /* If an input character does not match ANYCHAR, do it
3438 like a single-byte character. */
3439 s = t[*p++];
3440 }
3441 else
3442 {
3443 s = transit_state (d, s, &p, (unsigned char *) end);
3444 mbp = p;
3445 trans = d->trans;
3446 }
3447 }
3448 else
3449 {
3450 s1 = t[*p++];
3451 t = trans[s1];
3452 if (! t)
3453 {
3454 state_num tmp = s;
3455 s = s1;
3456 s1 = tmp; /* swap */
3457 break;
3458 }
3459 if (s < d->min_trcount)
3460 {
3461 while (t[*p] == s1)
3462 p++;
3463 }
3464 s = t[*p++];
3465 }
3466 }
3467
3468 if (s < 0)
3469 {
3470 if (s == -2)
3471 {
3472 s = build_state (s1, d, p[-1]);
3473 trans = d->trans;
3474 }
3475 else if ((char *) p <= end && p[-1] == eol && 0 <= d->newlines[s1])
3476 {
3477 /* The previous character was a newline. Count it, and skip
3478 checking of multibyte character boundary until here. */
3479 nlcount++;
3480 mbp = p;
3481
3482 s = (allow_nl ? d->newlines[s1]
3483 : d->syntax.sbit[eol] == CTX_NEWLINE ? 0
3484 : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
3485 : d->initstate_notbol);
3486 }
3487 else
3488 {
3489 p = NULL;
3490 goto done;
3491 }
3492 }
3493 else if (d->fails[s])
3494 {
3495 if ((d->success[s] & d->syntax.sbit[*p])
3496 || ((char *) p == end
3497 && accepts_in_context (d->states[s].context, CTX_NEWLINE, s,
3498 d)))
3499 goto done;
3500
3501 if (multibyte && s < d->min_trcount)
3502 p = mbp = skip_remains_mb (d, p, mbp, end);
3503
3504 s1 = s;
3505 if (!multibyte || d->states[s].mbps.nelem == 0
3506 || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3507 {
3508 /* If a input character does not match ANYCHAR, do it
3509 like a single-byte character. */
3510 s = d->fails[s][*p++];
3511 }
3512 else
3513 {
3514 s = transit_state (d, s, &p, (unsigned char *) end);
3515 mbp = p;
3516 trans = d->trans;
3517 }
3518 }
3519 else
3520 {
3521 build_state (s, d, p[0]);
3522 trans = d->trans;
3523 }
3524 }
3525
3526 done:
3527 if (count)
3528 *count += nlcount;
3529 *end = saved_end;
3530 return (char *) p;
3531}
3532
3533/* Specialized versions of dfaexec for multibyte and single-byte cases.
3534 This is for performance, as dfaexec_main is an inline function. */
3535
3536static char *
3537dfaexec_mb (struct dfa *d, char const *begin, char *end,
3538 bool allow_nl, ptrdiff_t *count, bool *backref)
3539{
3540 return dfaexec_main (d, begin, end, allow_nl, count, true);
3541}
3542
3543static char *
3544dfaexec_sb (struct dfa *d, char const *begin, char *end,
3545 bool allow_nl, ptrdiff_t *count, bool *backref)
3546{
3547 return dfaexec_main (d, begin, end, allow_nl, count, false);
3548}
3549
3550/* Always set *BACKREF and return BEGIN. Use this wrapper for
3551 any regexp that uses a construct not supported by this code. */
3552static char *
3553dfaexec_noop (struct dfa *d, char const *begin, char *end,
3554 bool allow_nl, ptrdiff_t *count, bool *backref)
3555{
3556 *backref = true;
3557 return (char *) begin;
3558}
3559
3560/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
3561 but faster and set *BACKREF if the DFA code does not support this
3562 regexp usage. */
3563
3564char *
3565dfaexec (struct dfa *d, char const *begin, char *end,
3566 bool allow_nl, ptrdiff_t *count, bool *backref)
3567{
3568 return d->dfaexec (d, begin, end, allow_nl, count, backref);
3569}
3570
3571struct dfa *
3572dfasuperset (struct dfa const *d)
3573{
3574 return d->superset;
3575}
3576
3577bool
3578dfaisfast (struct dfa const *d)
3579{
3580 return d->fast;
3581}
3582
3583static void
3584free_mbdata (struct dfa *d)
3585{
3586 free (d->multibyte_prop);
3587 free (d->lex.brack.chars);
3588 free (d->mb_follows.elems);
3589
3590 if (d->mb_trans)
3591 {
3592 state_num s;
3593 for (s = -1; s < d->tralloc; s++)
3594 free (d->mb_trans[s]);
3595 free (d->mb_trans - 2);
3596 }
3597}
3598
3599/* Return true if every construct in D is supported by this DFA matcher. */
3600bool
3601dfasupported (struct dfa const *d)
3602{
3603 for (idx_t i = 0; i < d->tindex; i++)
3604 {
3605 switch (d->tokens[i])
3606 {
3607 case BEGWORD:
3608 case ENDWORD:
3609 case LIMWORD:
3610 case NOTLIMWORD:
3611 if (!d->localeinfo.multibyte)
3612 continue;
3613 FALLTHROUGH;
3614 case BACKREF:
3615 case MBCSET:
3616 return false;
3617 }
3618 }
3619 return true;
3620}
3621
3622/* Disable use of the superset DFA if it is not likely to help
3623 performance. */
3624static void
3625maybe_disable_superset_dfa (struct dfa *d)
3626{
3627 if (!d->localeinfo.using_utf8)
3628 return;
3629
3630 bool have_backref = false;
3631 for (idx_t i = 0; i < d->tindex; i++)
3632 {
3633 switch (d->tokens[i])
3634 {
3635 case ANYCHAR:
3636 /* Lowered. */
3637 abort ();
3638 case BACKREF:
3639 have_backref = true;
3640 break;
3641 case MBCSET:
3642 /* Requires multi-byte algorithm. */
3643 return;
3644 default:
3645 break;
3646 }
3647 }
3648
3649 if (!have_backref && d->superset)
3650 {
3651 /* The superset DFA is not likely to be much faster, so remove it. */
3652 dfafree (d->superset);
3653 free (d->superset);
3654 d->superset = NULL;
3655 }
3656
3657 free_mbdata (d);
3658 d->localeinfo.multibyte = false;
3659 d->dfaexec = dfaexec_sb;
3660 d->fast = true;
3661}
3662
3663static void
3664dfassbuild (struct dfa *d)
3665{
3666 struct dfa *sup = dfaalloc ();
3667
3668 *sup = *d;
3669 sup->localeinfo.multibyte = false;
3670 sup->dfaexec = dfaexec_sb;
3671 sup->multibyte_prop = NULL;
3672 sup->superset = NULL;
3673 sup->states = NULL;
3674 sup->sindex = 0;
3675 sup->constraints = NULL;
3676 sup->separates = NULL;
3677 sup->follows = NULL;
3678 sup->tralloc = 0;
3679 sup->trans = NULL;
3680 sup->fails = NULL;
3681 sup->success = NULL;
3682 sup->newlines = NULL;
3683
3684 sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
3685 if (d->cindex)
3686 {
3687 memcpy (sup->charclasses, d->charclasses,
3688 d->cindex * sizeof *sup->charclasses);
3689 }
3690
3691 sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
3692 sup->talloc = d->tindex * 2;
3693
3694 bool have_achar = false;
3695 bool have_nchar = false;
3696 idx_t j;
3697 for (idx_t i = j = 0; i < d->tindex; i++)
3698 {
3699 switch (d->tokens[i])
3700 {
3701 case ANYCHAR:
3702 case MBCSET:
3703 case BACKREF:
3704 {
3705 charclass ccl;
3706 fillset (&ccl);
3707 sup->tokens[j++] = CSET + charclass_index (sup, &ccl);
3708 sup->tokens[j++] = STAR;
3709 if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
3710 || d->tokens[i + 1] == PLUS)
3711 i++;
3712 have_achar = true;
3713 }
3714 break;
3715 case BEGWORD:
3716 case ENDWORD:
3717 case LIMWORD:
3718 case NOTLIMWORD:
3719 if (d->localeinfo.multibyte)
3720 {
3721 /* These constraints aren't supported in a multibyte locale.
3722 Ignore them in the superset DFA. */
3723 sup->tokens[j++] = EMPTY;
3724 break;
3725 }
3726 FALLTHROUGH;
3727 default:
3728 sup->tokens[j++] = d->tokens[i];
3729 if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
3730 || d->tokens[i] >= CSET)
3731 have_nchar = true;
3732 break;
3733 }
3734 }
3735 sup->tindex = j;
3736
3737 if (have_nchar && (have_achar || d->localeinfo.multibyte))
3738 d->superset = sup;
3739 else
3740 {
3741 dfafree (sup);
3742 free (sup);
3743 }
3744}
3745
3746/* Parse a string S of length LEN into D (but skip this step if S is null).
3747 Then analyze D and build a matcher for it.
3748 SEARCHFLAG says whether to build a searching or an exact matcher. */
3749void
3750dfacomp (char const *s, idx_t len, struct dfa *d, bool searchflag)
3751{
3752 if (s != NULL)
3753 dfaparse (s, len, d);
3754
3755 dfassbuild (d);
3756
3757 if (dfasupported (d))
3758 {
3759 maybe_disable_superset_dfa (d);
3760 dfaanalyze (d, searchflag);
3761 }
3762 else
3763 {
3764 d->dfaexec = dfaexec_noop;
3765 }
3766
3767 if (d->superset)
3768 {
3769 d->fast = true;
3770 dfaanalyze (d->superset, searchflag);
3771 }
3772}
3773
3774/* Free the storage held by the components of a dfa. */
3775void
3776dfafree (struct dfa *d)
3777{
3778 free (d->charclasses);
3779 free (d->tokens);
3780
3781 if (d->localeinfo.multibyte)
3782 free_mbdata (d);
3783
3784 free (d->constraints);
3785 free (d->separates);
3786
3787 for (idx_t i = 0; i < d->sindex; i++)
3788 {
3789 free (d->states[i].elems.elems);
3790 free (d->states[i].mbps.elems);
3791 }
3792 free (d->states);
3793
3794 if (d->follows)
3795 {
3796 for (idx_t i = 0; i < d->tindex; i++)
3797 free (d->follows[i].elems);
3798 free (d->follows);
3799 }
3800
3801 if (d->trans)
3802 {
3803 for (idx_t i = 0; i < d->tralloc; i++)
3804 {
3805 free (d->trans[i]);
3806 free (d->fails[i]);
3807 }
3808
3809 free (d->trans - 2);
3810 free (d->fails);
3811 free (d->newlines);
3812 free (d->success);
3813 }
3814
3815 if (d->superset)
3816 {
3817 dfafree (d->superset);
3818 free (d->superset);
3819 }
3820}
3821
3822/* Having found the postfix representation of the regular expression,
3823 try to find a long sequence of characters that must appear in any line
3824 containing the r.e.
3825 Finding a "longest" sequence is beyond the scope here;
3826 we take an easy way out and hope for the best.
3827 (Take "(ab|a)b"--please.)
3828
3829 We do a bottom-up calculation of sequences of characters that must appear
3830 in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3831 representation:
3832 sequences that must appear at the left of the match ("left")
3833 sequences that must appear at the right of the match ("right")
3834 lists of sequences that must appear somewhere in the match ("in")
3835 sequences that must constitute the match ("is")
3836
3837 When we get to the root of the tree, we use one of the longest of its
3838 calculated "in" sequences as our answer.
3839
3840 The sequences calculated for the various types of node (in pseudo ANSI c)
3841 are shown below. "p" is the operand of unary operators (and the left-hand
3842 operand of binary operators); "q" is the right-hand operand of binary
3843 operators.
3844
3845 "ZERO" means "a zero-length sequence" below.
3846
3847 Type left right is in
3848 ---- ---- ----- -- --
3849 char c # c # c # c # c
3850
3851 ANYCHAR ZERO ZERO ZERO ZERO
3852
3853 MBCSET ZERO ZERO ZERO ZERO
3854
3855 CSET ZERO ZERO ZERO ZERO
3856
3857 STAR ZERO ZERO ZERO ZERO
3858
3859 QMARK ZERO ZERO ZERO ZERO
3860
3861 PLUS p->left p->right ZERO p->in
3862
3863 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
3864 p->left : q->right : q->is!=ZERO) ? q->in plus
3865 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
3866 ZERO
3867
3868 OR longest common longest common (do p->is and substrings common
3869 leading trailing to q->is have same p->in and
3870 (sub)sequence (sub)sequence q->in length and content) ?
3871 of p->left of p->right
3872 and q->left and q->right p->is : NULL
3873
3874 If there's anything else we recognize in the tree, all four sequences get set
3875 to zero-length sequences. If there's something we don't recognize in the
3876 tree, we just return a zero-length sequence.
3877
3878 Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3879 'aaa')?
3880
3881 And ... is it here or someplace that we might ponder "optimizations" such as
3882 egrep 'psi|epsilon' -> egrep 'psi'
3883 egrep 'pepsi|epsilon' -> egrep 'epsi'
3884 (Yes, we now find "epsi" as a "string
3885 that must occur", but we might also
3886 simplify the *entire* r.e. being sought)
3887 grep '[c]' -> grep 'c'
3888 grep '(ab|a)b' -> grep 'ab'
3889 grep 'ab*' -> grep 'a'
3890 grep 'a*b' -> grep 'b'
3891
3892 There are several issues:
3893
3894 Is optimization easy (enough)?
3895
3896 Does optimization actually accomplish anything,
3897 or is the automaton you get from "psi|epsilon" (for example)
3898 the same as the one you get from "psi" (for example)?
3899
3900 Are optimizable r.e.'s likely to be used in real-life situations
3901 (something like 'ab*' is probably unlikely; something like is
3902 'psi|epsilon' is likelier)? */
3903
3904static char *
3905icatalloc (char *old, char const *new)
3906{
3907 idx_t newsize = strlen (new);
3908 if (newsize == 0)
3909 return old;
3910 idx_t oldsize = strlen (old);
3911 char *result = xirealloc (old, oldsize + newsize + 1);
3912 memcpy (result + oldsize, new, newsize + 1);
3913 return result;
3914}
3915
3916static void
3917freelist (char **cpp)
3918{
3919 while (*cpp)
3920 free (*cpp++);
3921}
3922
3923static char **
3924enlistnew (char **cpp, char *new)
3925{
3926 /* Is there already something in the list that's new (or longer)? */
3927 idx_t i;
3928 for (i = 0; cpp[i] != NULL; i++)
3929 if (strstr (cpp[i], new) != NULL)
3930 {
3931 free (new);
3932 return cpp;
3933 }
3934 /* Eliminate any obsoleted strings. */
3935 for (idx_t j = 0; cpp[j] != NULL; )
3936 if (strstr (new, cpp[j]) == NULL)
3937 ++j;
3938 else
3939 {
3940 free (cpp[j]);
3941 if (--i == j)
3942 break;
3943 cpp[j] = cpp[i];
3944 cpp[i] = NULL;
3945 }
3946 /* Add the new string. */
3947 cpp = xreallocarray (cpp, i + 2, sizeof *cpp);
3948 cpp[i] = new;
3949 cpp[i + 1] = NULL;
3950 return cpp;
3951}
3952
3953static char **
3954enlist (char **cpp, char const *str, idx_t len)
3955{
3956 return enlistnew (cpp, ximemdup0 (str, len));
3957}
3958
3959/* Given pointers to two strings, return a pointer to an allocated
3960 list of their distinct common substrings. */
3961static char **
3962comsubs (char *left, char const *right)
3963{
3964 char **cpp = xzalloc (sizeof *cpp);
3965
3966 for (char *lcp = left; *lcp != '\0'; lcp++)
3967 {
3968 idx_t len = 0;
3969 char *rcp = strchr (right, *lcp);
3970 while (rcp != NULL)
3971 {
3972 idx_t i;
3973 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3974 continue;
3975 if (i > len)
3976 len = i;
3977 rcp = strchr (rcp + 1, *lcp);
3978 }
3979 if (len != 0)
3980 cpp = enlist (cpp, lcp, len);
3981 }
3982 return cpp;
3983}
3984
3985static char **
3986addlists (char **old, char **new)
3987{
3988 for (; *new; new++)
3989 old = enlistnew (old, xstrdup (*new));
3990 return old;
3991}
3992
3993/* Given two lists of substrings, return a new list giving substrings
3994 common to both. */
3995static char **
3996inboth (char **left, char **right)
3997{
3998 char **both = xzalloc (sizeof *both);
3999
4000 for (idx_t lnum = 0; left[lnum] != NULL; lnum++)
4001 {
4002 for (idx_t rnum = 0; right[rnum] != NULL; rnum++)
4003 {
4004 char **temp = comsubs (left[lnum], right[rnum]);
4005 both = addlists (both, temp);
4006 freelist (temp);
4007 free (temp);
4008 }
4009 }
4010 return both;
4011}
4012
4013typedef struct must must;
4014
4015struct must
4016{
4017 char **in;
4018 char *left;
4019 char *right;
4020 char *is;
4021 bool begline;
4022 bool endline;
4023 must *prev;
4024};
4025
4026static must *
4027allocmust (must *mp, idx_t size)
4028{
4029 must *new_mp = xmalloc (sizeof *new_mp);
4030 new_mp->in = xzalloc (sizeof *new_mp->in);
4031 new_mp->left = xizalloc (size);
4032 new_mp->right = xizalloc (size);
4033 new_mp->is = xizalloc (size);
4034 new_mp->begline = false;
4035 new_mp->endline = false;
4036 new_mp->prev = mp;
4037 return new_mp;
4038}
4039
4040static void
4041resetmust (must *mp)
4042{
4043 freelist (mp->in);
4044 mp->in[0] = NULL;
4045 mp->left[0] = mp->right[0] = mp->is[0] = '\0';
4046 mp->begline = false;
4047 mp->endline = false;
4048}
4049
4050static void
4051freemust (must *mp)
4052{
4053 freelist (mp->in);
4054 free (mp->in);
4055 free (mp->left);
4056 free (mp->right);
4057 free (mp->is);
4058 free (mp);
4059}
4060
4061struct dfamust *
4062dfamust (struct dfa const *d)
4063{
4064 must *mp = NULL;
4065 char const *result = "";
4066 bool exact = false;
4067 bool begline = false;
4068 bool endline = false;
4069 bool need_begline = false;
4070 bool need_endline = false;
4071 bool case_fold_unibyte = d->syntax.case_fold & !d->localeinfo.multibyte;
4072
4073 for (idx_t ri = 1; ri + 1 < d->tindex; ri++)
4074 {
4075 token t = d->tokens[ri];
4076 switch (t)
4077 {
4078 case BEGLINE:
4079 mp = allocmust (mp, 2);
4080 mp->begline = true;
4081 need_begline = true;
4082 break;
4083 case ENDLINE:
4084 mp = allocmust (mp, 2);
4085 mp->endline = true;
4086 need_endline = true;
4087 break;
4088 case LPAREN:
4089 case RPAREN:
4090 assert (!"neither LPAREN nor RPAREN may appear here");
4091
4092 case EMPTY:
4093 case BEGWORD:
4094 case ENDWORD:
4095 case LIMWORD:
4096 case NOTLIMWORD:
4097 case BACKREF:
4098 case ANYCHAR:
4099 case MBCSET:
4100 mp = allocmust (mp, 2);
4101 break;
4102
4103 case STAR:
4104 case QMARK:
4105 assume_nonnull (mp);
4106 resetmust (mp);
4107 break;
4108
4109 case OR:
4110 {
4111 char **new;
4112 must *rmp = mp;
4113 assume_nonnull (rmp);
4114 must *lmp = mp = mp->prev;
4115 assume_nonnull (lmp);
4116 idx_t j, ln, rn, n;
4117
4118 /* Guaranteed to be. Unlikely, but ... */
4119 if (streq (lmp->is, rmp->is))
4120 {
4121 lmp->begline &= rmp->begline;
4122 lmp->endline &= rmp->endline;
4123 }
4124 else
4125 {
4126 lmp->is[0] = '\0';
4127 lmp->begline = false;
4128 lmp->endline = false;
4129 }
4130 /* Left side--easy */
4131 idx_t i = 0;
4132 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
4133 ++i;
4134 lmp->left[i] = '\0';
4135 /* Right side */
4136 ln = strlen (lmp->right);
4137 rn = strlen (rmp->right);
4138 n = ln;
4139 if (n > rn)
4140 n = rn;
4141 for (i = 0; i < n; ++i)
4142 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
4143 break;
4144 for (j = 0; j < i; ++j)
4145 lmp->right[j] = lmp->right[(ln - i) + j];
4146 lmp->right[j] = '\0';
4147 new = inboth (lmp->in, rmp->in);
4148 freelist (lmp->in);
4149 free (lmp->in);
4150 lmp->in = new;
4151 freemust (rmp);
4152 }
4153 break;
4154
4155 case PLUS:
4156 assume_nonnull (mp);
4157 mp->is[0] = '\0';
4158 break;
4159
4160 case END:
4161 assume_nonnull (mp);
4162 assert (!mp->prev);
4163 for (idx_t i = 0; mp->in[i] != NULL; i++)
4164 if (strlen (mp->in[i]) > strlen (result))
4165 result = mp->in[i];
4166 if (streq (result, mp->is))
4167 {
4168 if ((!need_begline || mp->begline) && (!need_endline
4169 || mp->endline))
4170 exact = true;
4171 begline = mp->begline;
4172 endline = mp->endline;
4173 }
4174 goto done;
4175
4176 case CAT:
4177 {
4178 must *rmp = mp;
4179 assume_nonnull (rmp);
4180 must *lmp = mp = mp->prev;
4181 assume_nonnull (lmp);
4182
4183 /* In. Everything in left, plus everything in
4184 right, plus concatenation of
4185 left's right and right's left. */
4186 lmp->in = addlists (lmp->in, rmp->in);
4187 if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
4188 {
4189 idx_t lrlen = strlen (lmp->right);
4190 idx_t rllen = strlen (rmp->left);
4191 char *tp = ximalloc (lrlen + rllen + 1);
4192 memcpy (tp + lrlen, rmp->left, rllen + 1);
4193 memcpy (tp, lmp->right, lrlen);
4194 lmp->in = enlistnew (lmp->in, tp);
4195 }
4196 /* Left-hand */
4197 if (lmp->is[0] != '\0')
4198 lmp->left = icatalloc (lmp->left, rmp->left);
4199 /* Right-hand */
4200 if (rmp->is[0] == '\0')
4201 lmp->right[0] = '\0';
4202 lmp->right = icatalloc (lmp->right, rmp->right);
4203 /* Guaranteed to be */
4204 if ((lmp->is[0] != '\0' || lmp->begline)
4205 && (rmp->is[0] != '\0' || rmp->endline))
4206 {
4207 lmp->is = icatalloc (lmp->is, rmp->is);
4208 lmp->endline = rmp->endline;
4209 }
4210 else
4211 {
4212 lmp->is[0] = '\0';
4213 lmp->begline = false;
4214 lmp->endline = false;
4215 }
4216 freemust (rmp);
4217 }
4218 break;
4219
4220 case '\0':
4221 /* Not on *my* shift. */
4222 goto done;
4223
4224 default:
4225 if (CSET <= t)
4226 {
4227 /* If T is a singleton, or if case-folding in a unibyte
4228 locale and T's members all case-fold to the same char,
4229 convert T to one of its members. Otherwise, do
4230 nothing further with T. */
4231 charclass *ccl = &d->charclasses[t - CSET];
4232 int j;
4233 for (j = 0; j < NOTCHAR; j++)
4234 if (tstbit (j, ccl))
4235 break;
4236 if (! (j < NOTCHAR))
4237 {
4238 mp = allocmust (mp, 2);
4239 break;
4240 }
4241 t = j;
4242 while (++j < NOTCHAR)
4243 if (tstbit (j, ccl)
4244 && ! (case_fold_unibyte
4245 && toupper (j) == toupper (t)))
4246 break;
4247 if (j < NOTCHAR)
4248 {
4249 mp = allocmust (mp, 2);
4250 break;
4251 }
4252 }
4253
4254 idx_t rj = ri + 2;
4255 if (d->tokens[ri + 1] == CAT)
4256 {
4257 for (; rj < d->tindex - 1; rj += 2)
4258 {
4259 if ((rj != ri && (d->tokens[rj] <= 0
4260 || NOTCHAR <= d->tokens[rj]))
4261 || d->tokens[rj + 1] != CAT)
4262 break;
4263 }
4264 }
4265 mp = allocmust (mp, ((rj - ri) >> 1) + 1);
4266 mp->is[0] = mp->left[0] = mp->right[0]
4267 = case_fold_unibyte ? toupper (t) : t;
4268
4269 idx_t i;
4270 for (i = 1; ri + 2 < rj; i++)
4271 {
4272 ri += 2;
4273 t = d->tokens[ri];
4274 mp->is[i] = mp->left[i] = mp->right[i]
4275 = case_fold_unibyte ? toupper (t) : t;
4276 }
4277 mp->is[i] = mp->left[i] = mp->right[i] = '\0';
4278 mp->in = enlist (mp->in, mp->is, i);
4279 break;
4280 }
4281 }
4282 done:;
4283
4284 struct dfamust *dm = NULL;
4285 if (*result)
4286 {
4287 dm = xmalloc (FLEXSIZEOF (struct dfamust, must, strlen (result) + 1));
4288 dm->exact = exact;
4289 dm->begline = begline;
4290 dm->endline = endline;
4291 strcpy (dm->must, result);
4292 }
4293
4294 while (mp)
4295 {
4296 must *prev = mp->prev;
4297 freemust (mp);
4298 mp = prev;
4299 }
4300
4301 return dm;
4302}
4303
4304void
4305dfamustfree (struct dfamust *dm)
4306{
4307 free (dm);
4308}
4309
4310struct dfa *
4311dfaalloc (void)
4312{
4313 return xmalloc (sizeof (struct dfa));
4314}
4315
4316/* Initialize DFA. */
4317void
4318dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
4319 reg_syntax_t bits, int dfaopts)
4320{
4321 memset (dfa, 0, offsetof (struct dfa, dfaexec));
4322 dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
4323 dfa->localeinfo = *linfo;
4324
4325 dfa->fast = !dfa->localeinfo.multibyte;
4326
4327 dfa->canychar = -1;
4328 dfa->syntax.syntax_bits_set = true;
4329 dfa->syntax.case_fold = (bits & RE_ICASE) != 0;
4330 dfa->syntax.anchor = (dfaopts & DFA_ANCHOR) != 0;
4331 dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
4332 dfa->syntax.syntax_bits = bits;
4333
4334 for (int i = CHAR_MIN; i <= CHAR_MAX; ++i)
4335 {
4336 unsigned char uc = i;
4337
4338 dfa->syntax.sbit[uc] = char_context (dfa, uc);
4339 switch (dfa->syntax.sbit[uc])
4340 {
4341 case CTX_LETTER:
4342 setbit (uc, &dfa->syntax.letters);
4343 break;
4344 case CTX_NEWLINE:
4345 setbit (uc, &dfa->syntax.newline);
4346 break;
4347 }
4348
4349 /* POSIX requires that the five bytes in "\n\r./" (including the
4350 terminating NUL) cannot occur inside a multibyte character. */
4351 dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
4352 ? (uc & 0xc0) != 0x80
4353 : strchr ("\n\r./", uc) != NULL);
4354 }
4355}
4356
4357/* Initialize TO by copying FROM's syntax settings. */
4358void
4359dfacopysyntax (struct dfa *to, struct dfa const *from)
4360{
4361 memset (to, 0, offsetof (struct dfa, syntax));
4362 to->canychar = -1;
4363 to->fast = from->fast;
4364 to->syntax = from->syntax;
4365 to->dfaexec = from->dfaexec;
4366 to->localeinfo = from->localeinfo;
4367}
4368
4369/* vim:set shiftwidth=2: */
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette