VirtualBox

source: kBuild/trunk/src/grep/lib/dfa.c@ 3646

Last change on this file since 3646 was 3532, checked in by bird, 3 years ago

grep: Initial windows config and adjustments.

  • Property svn:eol-style set to native
File size: 133.4 KB
Line 
1/* dfa.c - deterministic extended regexp routines for GNU
2 Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2021 Free Software
3 Foundation, Inc.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc.,
18 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */
19
20/* Written June, 1988 by Mike Haertel
21 Modified July, 1988 by Arthur David Olson to assist BMG speedups */
22
23#include <config.h>
24
25#include "dfa.h"
26
27#include "flexmember.h"
28#include "idx.h"
29#include "verify.h"
30
31#include <assert.h>
32#include <ctype.h>
33#include <stdint.h>
34#include <stdio.h>
35#include <stdlib.h>
36#include <limits.h>
37#include <string.h>
38
39/* Pacify gcc -Wanalyzer-null-dereference in areas where GCC
40 understandably cannot deduce that the input comes from a
41 well-formed regular expression. There's little point to the
42 runtime overhead of 'assert' instead of 'assume_nonnull' when the
43 MMU will check anyway. */
44#define assume_nonnull(x) assume ((x) != NULL)
45
46static bool
47streq (char const *a, char const *b)
48{
49 return strcmp (a, b) == 0;
50}
51
52static bool
53isasciidigit (char c)
54{
55 return '0' <= c && c <= '9';
56}
57
58#include "gettext.h"
59#define _(str) gettext (str)
60
61#include <wchar.h>
62
63#include "xalloc.h"
64#include "localeinfo.h"
65
66#ifndef FALLTHROUGH
67# if 201710L < __STDC_VERSION__
68# define FALLTHROUGH [[__fallthrough__]]
69# elif (__GNUC__ >= 7) || (__clang_major__ >= 10)
70# define FALLTHROUGH __attribute__ ((__fallthrough__))
71# else
72# define FALLTHROUGH ((void) 0)
73# endif
74#endif
75
76#ifndef MIN
77# define MIN(a,b) ((a) < (b) ? (a) : (b))
78#endif
79
80/* HPUX defines these as macros in sys/param.h. */
81#ifdef setbit
82# undef setbit
83#endif
84#ifdef clrbit
85# undef clrbit
86#endif
87
88/* For code that does not use Gnulib’s isblank module. */
89#if !defined isblank && !defined HAVE_ISBLANK && !defined GNULIB_ISBLANK
90# define isblank dfa_isblank
91static int
92isblank (int c)
93{
94 return c == ' ' || c == '\t';
95}
96#endif
97
98/* First integer value that is greater than any character code. */
99enum { NOTCHAR = 1 << CHAR_BIT };
100
101#ifdef UINT_LEAST64_MAX
102
103/* Number of bits used in a charclass word. */
104enum { CHARCLASS_WORD_BITS = 64 };
105
106/* This represents part of a character class. It must be unsigned and
107 at least CHARCLASS_WORD_BITS wide. Any excess bits are zero. */
108typedef uint_least64_t charclass_word;
109
110/* Part of a charclass initializer that represents 64 bits' worth of a
111 charclass, where LO and HI are the low and high-order 32 bits of
112 the 64-bit quantity. */
113# define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
114
115#else
116/* Fallbacks for pre-C99 hosts that lack 64-bit integers. */
117enum { CHARCLASS_WORD_BITS = 32 };
118typedef unsigned long charclass_word;
119# define CHARCLASS_PAIR(lo, hi) lo, hi
120#endif
121
122/* An initializer for a charclass whose 32-bit words are A through H. */
123#define CHARCLASS_INIT(a, b, c, d, e, f, g, h) \
124 {{ \
125 CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d), \
126 CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h) \
127 }}
128
129/* The maximum useful value of a charclass_word; all used bits are 1. */
130static charclass_word const CHARCLASS_WORD_MASK
131 = ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1;
132
133/* Number of words required to hold a bit for every character. */
134enum
135{
136 CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
137};
138
139/* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
140typedef struct { charclass_word w[CHARCLASS_WORDS]; } charclass;
141
142/* Convert a possibly-signed character to an unsigned character. This is
143 a bit safer than casting to unsigned char, since it catches some type
144 errors that the cast doesn't. */
145static unsigned char
146to_uchar (char ch)
147{
148 return ch;
149}
150
151/* Contexts tell us whether a character is a newline or a word constituent.
152 Word-constituent characters are those that satisfy iswalnum, plus '_'.
153 Each character has a single CTX_* value; bitmasks of CTX_* values denote
154 a particular character class.
155
156 A state also stores a context value, which is a bitmask of CTX_* values.
157 A state's context represents a set of characters that the state's
158 predecessors must match. For example, a state whose context does not
159 include CTX_LETTER will never have transitions where the previous
160 character is a word constituent. A state whose context is CTX_ANY
161 might have transitions from any character. */
162
163enum
164 {
165 CTX_NONE = 1,
166 CTX_LETTER = 2,
167 CTX_NEWLINE = 4,
168 CTX_ANY = 7
169 };
170
171/* Sometimes characters can only be matched depending on the surrounding
172 context. Such context decisions depend on what the previous character
173 was, and the value of the current (lookahead) character. Context
174 dependent constraints are encoded as 9-bit integers. Each bit that
175 is set indicates that the constraint succeeds in the corresponding
176 context.
177
178 bit 6-8 - valid contexts when next character is CTX_NEWLINE
179 bit 3-5 - valid contexts when next character is CTX_LETTER
180 bit 0-2 - valid contexts when next character is CTX_NONE
181
182 succeeds_in_context determines whether a given constraint
183 succeeds in a particular context. Prev is a bitmask of possible
184 context values for the previous character, curr is the (single-bit)
185 context value for the lookahead character. */
186static int
187newline_constraint (int constraint)
188{
189 return (constraint >> 6) & 7;
190}
191static int
192letter_constraint (int constraint)
193{
194 return (constraint >> 3) & 7;
195}
196static int
197other_constraint (int constraint)
198{
199 return constraint & 7;
200}
201
202static bool
203succeeds_in_context (int constraint, int prev, int curr)
204{
205 return !! (((curr & CTX_NONE ? other_constraint (constraint) : 0) \
206 | (curr & CTX_LETTER ? letter_constraint (constraint) : 0) \
207 | (curr & CTX_NEWLINE ? newline_constraint (constraint) : 0)) \
208 & prev);
209}
210
211/* The following describe what a constraint depends on. */
212static bool
213prev_newline_dependent (int constraint)
214{
215 return ((constraint ^ constraint >> 2) & 0111) != 0;
216}
217static bool
218prev_letter_dependent (int constraint)
219{
220 return ((constraint ^ constraint >> 1) & 0111) != 0;
221}
222
223/* Tokens that match the empty string subject to some constraint actually
224 work by applying that constraint to determine what may follow them,
225 taking into account what has gone before. The following values are
226 the constraints corresponding to the special tokens previously defined. */
227enum
228 {
229 NO_CONSTRAINT = 0777,
230 BEGLINE_CONSTRAINT = 0444,
231 ENDLINE_CONSTRAINT = 0700,
232 BEGWORD_CONSTRAINT = 0050,
233 ENDWORD_CONSTRAINT = 0202,
234 LIMWORD_CONSTRAINT = 0252,
235 NOTLIMWORD_CONSTRAINT = 0525
236 };
237
238/* The regexp is parsed into an array of tokens in postfix form. Some tokens
239 are operators and others are terminal symbols. Most (but not all) of these
240 codes are returned by the lexical analyzer. */
241
242typedef ptrdiff_t token;
243static token const TOKEN_MAX = PTRDIFF_MAX;
244
245/* States are indexed by state_num values. These are normally
246 nonnegative but -1 is used as a special value. */
247typedef ptrdiff_t state_num;
248
249/* Predefined token values. */
250enum
251{
252 END = -1, /* END is a terminal symbol that matches the
253 end of input; any value of END or less in
254 the parse tree is such a symbol. Accepting
255 states of the DFA are those that would have
256 a transition on END. This is -1, not some
257 more-negative value, to tweak the speed of
258 comparisons to END. */
259
260 /* Ordinary character values are terminal symbols that match themselves. */
261
262 /* CSET must come last in the following list of special tokens. Otherwise,
263 the list order matters only for performance. Related special tokens
264 should have nearby values so that code like (t == ANYCHAR || t == MBCSET
265 || CSET <= t) can be done with a single machine-level comparison. */
266
267 EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches
268 the empty string. */
269
270 QMARK, /* QMARK is an operator of one argument that
271 matches zero or one occurrences of its
272 argument. */
273
274 STAR, /* STAR is an operator of one argument that
275 matches the Kleene closure (zero or more
276 occurrences) of its argument. */
277
278 PLUS, /* PLUS is an operator of one argument that
279 matches the positive closure (one or more
280 occurrences) of its argument. */
281
282 REPMN, /* REPMN is a lexical token corresponding
283 to the {m,n} construct. REPMN never
284 appears in the compiled token vector. */
285
286 CAT, /* CAT is an operator of two arguments that
287 matches the concatenation of its
288 arguments. CAT is never returned by the
289 lexical analyzer. */
290
291 OR, /* OR is an operator of two arguments that
292 matches either of its arguments. */
293
294 LPAREN, /* LPAREN never appears in the parse tree,
295 it is only a lexeme. */
296
297 RPAREN, /* RPAREN never appears in the parse tree. */
298
299#if defined(KMK_GREP) && defined(KBUILD_OS_WINDOWS)
300# define WCHAR DFA_WCHAR
301#endif
302 WCHAR, /* Only returned by lex. wctok contains
303 the wide character representation. */
304
305 ANYCHAR, /* ANYCHAR is a terminal symbol that matches
306 a valid multibyte (or single byte) character.
307 It is used only if MB_CUR_MAX > 1. */
308
309 BEG, /* BEG is an initial symbol that matches the
310 beginning of input. */
311
312 BEGLINE, /* BEGLINE is a terminal symbol that matches
313 the empty string at the beginning of a
314 line. */
315
316 ENDLINE, /* ENDLINE is a terminal symbol that matches
317 the empty string at the end of a line. */
318
319 BEGWORD, /* BEGWORD is a terminal symbol that matches
320 the empty string at the beginning of a
321 word. */
322
323 ENDWORD, /* ENDWORD is a terminal symbol that matches
324 the empty string at the end of a word. */
325
326 LIMWORD, /* LIMWORD is a terminal symbol that matches
327 the empty string at the beginning or the
328 end of a word. */
329
330 NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that
331 matches the empty string not at
332 the beginning or end of a word. */
333
334 BACKREF, /* BACKREF is generated by \<digit>
335 or by any other construct that
336 is not completely handled. If the scanner
337 detects a transition on backref, it returns
338 a kind of "semi-success" indicating that
339 the match will have to be verified with
340 a backtracking matcher. */
341
342 MBCSET, /* MBCSET is similar to CSET, but for
343 multibyte characters. */
344
345 CSET /* CSET and (and any value greater) is a
346 terminal symbol that matches any of a
347 class of characters. */
348};
349
350
351/* States of the recognizer correspond to sets of positions in the parse
352 tree, together with the constraints under which they may be matched.
353 So a position is encoded as an index into the parse tree together with
354 a constraint. */
355typedef struct
356{
357 idx_t index; /* Index into the parse array. */
358 unsigned int constraint; /* Constraint for matching this position. */
359} position;
360
361/* Sets of positions are stored as arrays. */
362typedef struct
363{
364 position *elems; /* Elements of this position set. */
365 idx_t nelem; /* Number of elements in this set. */
366 idx_t alloc; /* Number of elements allocated in ELEMS. */
367} position_set;
368
369/* A state of the dfa consists of a set of positions, some flags,
370 and the token value of the lowest-numbered position of the state that
371 contains an END token. */
372typedef struct
373{
374 size_t hash; /* Hash of the positions of this state. */
375 position_set elems; /* Positions this state could match. */
376 unsigned char context; /* Context from previous state. */
377 unsigned short constraint; /* Constraint for this state to accept. */
378 position_set mbps; /* Positions which can match multibyte
379 characters or the follows, e.g., period.
380 Used only if MB_CUR_MAX > 1. */
381 state_num mb_trindex; /* Index of this state in MB_TRANS, or
382 negative if the state does not have
383 ANYCHAR. */
384} dfa_state;
385
386/* Maximum for any transition table count. This should be at least 3,
387 for the initial state setup. */
388enum { MAX_TRCOUNT = 1024 };
389
390/* A bracket operator.
391 e.g., [a-c], [[:alpha:]], etc. */
392struct mb_char_classes
393{
394 ptrdiff_t cset;
395 bool invert;
396 wchar_t *chars; /* Normal characters. */
397 idx_t nchars;
398 idx_t nchars_alloc;
399};
400
401struct regex_syntax
402{
403 /* Syntax bits controlling the behavior of the lexical analyzer. */
404 reg_syntax_t syntax_bits;
405 bool syntax_bits_set;
406
407 /* Flag for case-folding letters into sets. */
408 bool case_fold;
409
410 /* True if ^ and $ match only the start and end of data, and do not match
411 end-of-line within data. */
412 bool anchor;
413
414 /* End-of-line byte in data. */
415 unsigned char eolbyte;
416
417 /* Cache of char-context values. */
418 char sbit[NOTCHAR];
419
420 /* If never_trail[B], the byte B cannot be a non-initial byte in a
421 multibyte character. */
422 bool never_trail[NOTCHAR];
423
424 /* Set of characters considered letters. */
425 charclass letters;
426
427 /* Set of characters that are newline. */
428 charclass newline;
429};
430
431/* Lexical analyzer. All the dross that deals with the obnoxious
432 GNU Regex syntax bits is located here. The poor, suffering
433 reader is referred to the GNU Regex documentation for the
434 meaning of the @#%!@#%^!@ syntax bits. */
435struct lexer_state
436{
437 char const *ptr; /* Pointer to next input character. */
438 idx_t left; /* Number of characters remaining. */
439 token lasttok; /* Previous token returned; initially END. */
440 idx_t parens; /* Count of outstanding left parens. */
441 int minrep, maxrep; /* Repeat counts for {m,n}. */
442
443 /* Wide character representation of the current multibyte character,
444 or WEOF if there was an encoding error. Used only if
445 MB_CUR_MAX > 1. */
446 wint_t wctok;
447
448 /* The most recently analyzed multibyte bracket expression. */
449 struct mb_char_classes brack;
450
451 /* We're separated from beginning or (, | only by zero-width characters. */
452 bool laststart;
453};
454
455/* Recursive descent parser for regular expressions. */
456
457struct parser_state
458{
459 token tok; /* Lookahead token. */
460 idx_t depth; /* Current depth of a hypothetical stack
461 holding deferred productions. This is
462 used to determine the depth that will be
463 required of the real stack later on in
464 dfaanalyze. */
465};
466
467/* A compiled regular expression. */
468struct dfa
469{
470 /* Fields filled by the scanner. */
471 charclass *charclasses; /* Array of character sets for CSET tokens. */
472 idx_t cindex; /* Index for adding new charclasses. */
473 idx_t calloc; /* Number of charclasses allocated. */
474 ptrdiff_t canychar; /* Index of anychar class, or -1. */
475
476 /* Scanner state */
477 struct lexer_state lex;
478
479 /* Parser state */
480 struct parser_state parse;
481
482 /* Fields filled by the parser. */
483 token *tokens; /* Postfix parse array. */
484 idx_t tindex; /* Index for adding new tokens. */
485 idx_t talloc; /* Number of tokens currently allocated. */
486 idx_t depth; /* Depth required of an evaluation stack
487 used for depth-first traversal of the
488 parse tree. */
489 idx_t nleaves; /* Number of non-EMPTY leaves
490 in the parse tree. */
491 idx_t nregexps; /* Count of parallel regexps being built
492 with dfaparse. */
493 bool fast; /* The DFA is fast. */
494 bool epsilon; /* Does a token match only the empty string? */
495 token utf8_anychar_classes[9]; /* To lower ANYCHAR in UTF-8 locales. */
496 mbstate_t mbs; /* Multibyte conversion state. */
497
498 /* The following are valid only if MB_CUR_MAX > 1. */
499
500 /* The value of multibyte_prop[i] is defined by following rule.
501 if tokens[i] < NOTCHAR
502 bit 0 : tokens[i] is the first byte of a character, including
503 single-byte characters.
504 bit 1 : tokens[i] is the last byte of a character, including
505 single-byte characters.
506
507 e.g.
508 tokens
509 = 'single_byte_a', 'multi_byte_A', single_byte_b'
510 = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
511 multibyte_prop
512 = 3 , 1 , 0 , 2 , 3
513 */
514 char *multibyte_prop;
515
516 /* Fields filled by the superset. */
517 struct dfa *superset; /* Hint of the dfa. */
518
519 /* Fields filled by the state builder. */
520 dfa_state *states; /* States of the dfa. */
521 state_num sindex; /* Index for adding new states. */
522 idx_t salloc; /* Number of states currently allocated. */
523
524 /* Fields filled by the parse tree->NFA conversion. */
525 position_set *follows; /* Array of follow sets, indexed by position
526 index. The follow of a position is the set
527 of positions containing characters that
528 could conceivably follow a character
529 matching the given position in a string
530 matching the regexp. Allocated to the
531 maximum possible position index. */
532 bool searchflag; /* We are supposed to build a searching
533 as opposed to an exact matcher. A searching
534 matcher finds the first and shortest string
535 matching a regexp anywhere in the buffer,
536 whereas an exact matcher finds the longest
537 string matching, but anchored to the
538 beginning of the buffer. */
539
540 /* Fields filled by dfaanalyze. */
541 int *constraints; /* Array of union of accepting constraints
542 in the follow of a position. */
543 int *separates; /* Array of contexts on follow of a
544 position. */
545
546 /* Fields filled by dfaexec. */
547 state_num tralloc; /* Number of transition tables that have
548 slots so far, not counting trans[-1] and
549 trans[-2]. */
550 int trcount; /* Number of transition tables that have
551 been built, other than for initial
552 states. */
553 int min_trcount; /* Number of initial states. Equivalently,
554 the minimum state number for which trcount
555 counts transitions. */
556 state_num **trans; /* Transition tables for states that can
557 never accept. If the transitions for a
558 state have not yet been computed, or the
559 state could possibly accept, its entry in
560 this table is NULL. This points to two
561 past the start of the allocated array,
562 and trans[-1] and trans[-2] are always
563 NULL. */
564 state_num **fails; /* Transition tables after failing to accept
565 on a state that potentially could do so.
566 If trans[i] is non-null, fails[i] must
567 be null. */
568 char *success; /* Table of acceptance conditions used in
569 dfaexec and computed in build_state. */
570 state_num *newlines; /* Transitions on newlines. The entry for a
571 newline in any transition table is always
572 -1 so we can count lines without wasting
573 too many cycles. The transition for a
574 newline is stored separately and handled
575 as a special case. Newline is also used
576 as a sentinel at the end of the buffer. */
577 state_num initstate_notbol; /* Initial state for CTX_LETTER and CTX_NONE
578 context in multibyte locales, in which we
579 do not distinguish between their contexts,
580 as not supported word. */
581 position_set mb_follows; /* Follow set added by ANYCHAR on demand. */
582 state_num **mb_trans; /* Transition tables for states with
583 ANYCHAR. */
584 state_num mb_trcount; /* Number of transition tables for states with
585 ANYCHAR that have actually been built. */
586
587 /* Syntax configuration. This is near the end so that dfacopysyntax
588 can memset up to here. */
589 struct regex_syntax syntax;
590
591 /* Information derived from the locale. This is at the end so that
592 a quick memset need not clear it specially. */
593
594 /* dfaexec implementation. */
595 char *(*dfaexec) (struct dfa *, char const *, char *,
596 bool, ptrdiff_t *, bool *);
597
598 /* Other cached information derived from the locale. */
599 struct localeinfo localeinfo;
600};
601
602/* User access to dfa internals. */
603
604/* S could possibly be an accepting state of R. */
605static bool
606accepting (state_num s, struct dfa const *r)
607{
608 return r->states[s].constraint != 0;
609}
610
611/* STATE accepts in the specified context. */
612static bool
613accepts_in_context (int prev, int curr, state_num state, struct dfa const *dfa)
614{
615 return succeeds_in_context (dfa->states[state].constraint, prev, curr);
616}
617
618static void regexp (struct dfa *dfa);
619
620/* Store into *PWC the result of converting the leading bytes of the
621 multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
622 and updating the conversion state in *D. On conversion error,
623 convert just a single byte, to WEOF. Return the number of bytes
624 converted.
625
626 This differs from mbrtowc (PWC, S, N, &D->mbs) as follows:
627
628 * PWC points to wint_t, not to wchar_t.
629 * The last arg is a dfa *D instead of merely a multibyte conversion
630 state D->mbs.
631 * N is idx_t not size_t, and must be at least 1.
632 * S[N - 1] must be a sentinel byte.
633 * Shift encodings are not supported.
634 * The return value is always in the range 1..N.
635 * D->mbs is always valid afterwards.
636 * *PWC is always set to something. */
637static int
638mbs_to_wchar (wint_t *pwc, char const *s, idx_t n, struct dfa *d)
639{
640 unsigned char uc = s[0];
641 wint_t wc = d->localeinfo.sbctowc[uc];
642
643 if (wc == WEOF)
644 {
645 wchar_t wch;
646 size_t nbytes = mbrtowc (&wch, s, n, &d->mbs);
647 if (0 < nbytes && nbytes < (size_t) -2)
648 {
649 *pwc = wch;
650 return nbytes;
651 }
652 memset (&d->mbs, 0, sizeof d->mbs);
653 }
654
655 *pwc = wc;
656 return 1;
657}
658
659#ifdef DEBUG
660
661static void
662prtok (token t)
663{
664 if (t <= END)
665 fprintf (stderr, "END");
666 else if (0 <= t && t < NOTCHAR)
667 {
668 unsigned int ch = t;
669 fprintf (stderr, "0x%02x", ch);
670 }
671 else
672 {
673 char const *s;
674 switch (t)
675 {
676 case BEG:
677 s = "BEG";
678 break;
679 case EMPTY:
680 s = "EMPTY";
681 break;
682 case BACKREF:
683 s = "BACKREF";
684 break;
685 case BEGLINE:
686 s = "BEGLINE";
687 break;
688 case ENDLINE:
689 s = "ENDLINE";
690 break;
691 case BEGWORD:
692 s = "BEGWORD";
693 break;
694 case ENDWORD:
695 s = "ENDWORD";
696 break;
697 case LIMWORD:
698 s = "LIMWORD";
699 break;
700 case NOTLIMWORD:
701 s = "NOTLIMWORD";
702 break;
703 case QMARK:
704 s = "QMARK";
705 break;
706 case STAR:
707 s = "STAR";
708 break;
709 case PLUS:
710 s = "PLUS";
711 break;
712 case CAT:
713 s = "CAT";
714 break;
715 case OR:
716 s = "OR";
717 break;
718 case LPAREN:
719 s = "LPAREN";
720 break;
721 case RPAREN:
722 s = "RPAREN";
723 break;
724 case ANYCHAR:
725 s = "ANYCHAR";
726 break;
727 case MBCSET:
728 s = "MBCSET";
729 break;
730 default:
731 s = "CSET";
732 break;
733 }
734 fprintf (stderr, "%s", s);
735 }
736}
737#endif /* DEBUG */
738
739/* Stuff pertaining to charclasses. */
740
741static bool
742tstbit (unsigned int b, charclass const *c)
743{
744 return c->w[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
745}
746
747static void
748setbit (unsigned int b, charclass *c)
749{
750 charclass_word one = 1;
751 c->w[b / CHARCLASS_WORD_BITS] |= one << b % CHARCLASS_WORD_BITS;
752}
753
754static void
755clrbit (unsigned int b, charclass *c)
756{
757 charclass_word one = 1;
758 c->w[b / CHARCLASS_WORD_BITS] &= ~(one << b % CHARCLASS_WORD_BITS);
759}
760
761static void
762zeroset (charclass *s)
763{
764 memset (s, 0, sizeof *s);
765}
766
767static void
768fillset (charclass *s)
769{
770 for (int i = 0; i < CHARCLASS_WORDS; i++)
771 s->w[i] = CHARCLASS_WORD_MASK;
772}
773
774static void
775notset (charclass *s)
776{
777 for (int i = 0; i < CHARCLASS_WORDS; ++i)
778 s->w[i] = CHARCLASS_WORD_MASK & ~s->w[i];
779}
780
781static bool
782equal (charclass const *s1, charclass const *s2)
783{
784 charclass_word w = 0;
785 for (int i = 0; i < CHARCLASS_WORDS; i++)
786 w |= s1->w[i] ^ s2->w[i];
787 return w == 0;
788}
789
790static bool
791emptyset (charclass const *s)
792{
793 charclass_word w = 0;
794 for (int i = 0; i < CHARCLASS_WORDS; i++)
795 w |= s->w[i];
796 return w == 0;
797}
798
799/* Ensure that the array addressed by PA holds at least I + 1 items.
800 Either return PA, or reallocate the array and return its new address.
801 Although PA may be null, the returned value is never null.
802
803 The array holds *NITEMS items, where 0 <= I <= *NITEMS; *NITEMS
804 is updated on reallocation. If PA is null, *NITEMS must be zero.
805 Do not allocate more than NITEMS_MAX items total; -1 means no limit.
806 ITEM_SIZE is the size of one item; it must be positive.
807 Avoid O(N**2) behavior on arrays growing linearly. */
808static void *
809maybe_realloc (void *pa, idx_t i, idx_t *nitems,
810 ptrdiff_t nitems_max, idx_t item_size)
811{
812 if (i < *nitems)
813 return pa;
814 return xpalloc (pa, nitems, 1, nitems_max, item_size);
815}
816
817/* In DFA D, find the index of charclass S, or allocate a new one. */
818static idx_t
819charclass_index (struct dfa *d, charclass const *s)
820{
821 idx_t i;
822
823 for (i = 0; i < d->cindex; ++i)
824 if (equal (s, &d->charclasses[i]))
825 return i;
826 d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
827 TOKEN_MAX - CSET, sizeof *d->charclasses);
828 ++d->cindex;
829 d->charclasses[i] = *s;
830 return i;
831}
832
833static bool
834unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
835{
836 return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_');
837}
838
839static int
840char_context (struct dfa const *dfa, unsigned char c)
841{
842 if (c == dfa->syntax.eolbyte && !dfa->syntax.anchor)
843 return CTX_NEWLINE;
844 if (unibyte_word_constituent (dfa, c))
845 return CTX_LETTER;
846 return CTX_NONE;
847}
848
849/* Set a bit in the charclass for the given wchar_t. Do nothing if WC
850 is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1,
851 this may happen when folding case in weird Turkish locales where
852 dotless i/dotted I are not included in the chosen character set.
853 Return whether a bit was set in the charclass. */
854static bool
855setbit_wc (wint_t wc, charclass *c)
856{
857 int b = wctob (wc);
858 if (b < 0)
859 return false;
860
861 setbit (b, c);
862 return true;
863}
864
865/* Set a bit for B and its case variants in the charclass C.
866 MB_CUR_MAX must be 1. */
867static void
868setbit_case_fold_c (int b, charclass *c)
869{
870 int ub = toupper (b);
871 for (int i = 0; i < NOTCHAR; i++)
872 if (toupper (i) == ub)
873 setbit (i, c);
874}
875
876/* Fetch the next lexical input character from the pattern. There
877 must at least one byte of pattern input. Set DFA->lex.wctok to the
878 value of the character or to WEOF depending on whether the input is
879 a valid multibyte character (possibly of length 1). Then return
880 the next input byte value, except return EOF if the input is a
881 multibyte character of length greater than 1. */
882static int
883fetch_wc (struct dfa *dfa)
884{
885 int nbytes = mbs_to_wchar (&dfa->lex.wctok, dfa->lex.ptr, dfa->lex.left,
886 dfa);
887 int c = nbytes == 1 ? to_uchar (dfa->lex.ptr[0]) : EOF;
888 dfa->lex.ptr += nbytes;
889 dfa->lex.left -= nbytes;
890 return c;
891}
892
893/* If there is no more input, report an error about unbalanced brackets.
894 Otherwise, behave as with fetch_wc (DFA). */
895static int
896bracket_fetch_wc (struct dfa *dfa)
897{
898 if (! dfa->lex.left)
899 dfaerror (_("unbalanced ["));
900 return fetch_wc (dfa);
901}
902
903typedef int predicate (int);
904
905/* The following list maps the names of the Posix named character classes
906 to predicate functions that determine whether a given character is in
907 the class. The leading [ has already been eaten by the lexical
908 analyzer. */
909struct dfa_ctype
910{
911 const char *name;
912 predicate *func;
913 bool single_byte_only;
914};
915
916static const struct dfa_ctype prednames[] = {
917 {"alpha", isalpha, false},
918 {"upper", isupper, false},
919 {"lower", islower, false},
920 {"digit", isdigit, true},
921 {"xdigit", isxdigit, false},
922 {"space", isspace, false},
923 {"punct", ispunct, false},
924 {"alnum", isalnum, false},
925 {"print", isprint, false},
926 {"graph", isgraph, false},
927 {"cntrl", iscntrl, false},
928 {"blank", isblank, false},
929 {NULL, NULL, false}
930};
931
932static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
933find_pred (const char *str)
934{
935 for (int i = 0; prednames[i].name; i++)
936 if (streq (str, prednames[i].name))
937 return &prednames[i];
938 return NULL;
939}
940
941/* Parse a bracket expression, which possibly includes multibyte
942 characters. */
943static token
944parse_bracket_exp (struct dfa *dfa)
945{
946 /* This is a bracket expression that dfaexec is known to
947 process correctly. */
948 bool known_bracket_exp = true;
949
950 /* Used to warn about [:space:].
951 Bit 0 = first character is a colon.
952 Bit 1 = last character is a colon.
953 Bit 2 = includes any other character but a colon.
954 Bit 3 = includes ranges, char/equiv classes or collation elements. */
955 int colon_warning_state;
956
957 dfa->lex.brack.nchars = 0;
958 charclass ccl;
959 zeroset (&ccl);
960 int c = bracket_fetch_wc (dfa);
961 bool invert = c == '^';
962 if (invert)
963 {
964 c = bracket_fetch_wc (dfa);
965 known_bracket_exp = dfa->localeinfo.simple;
966 }
967 wint_t wc = dfa->lex.wctok;
968 int c1;
969 wint_t wc1;
970 colon_warning_state = (c == ':');
971 do
972 {
973 c1 = NOTCHAR; /* Mark c1 as not initialized. */
974 colon_warning_state &= ~2;
975
976 /* Note that if we're looking at some other [:...:] construct,
977 we just treat it as a bunch of ordinary characters. We can do
978 this because we assume regex has checked for syntax errors before
979 dfa is ever called. */
980 if (c == '[')
981 {
982 c1 = bracket_fetch_wc (dfa);
983 wc1 = dfa->lex.wctok;
984
985 if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES))
986 || c1 == '.' || c1 == '=')
987 {
988 enum { MAX_BRACKET_STRING_LEN = 32 };
989 char str[MAX_BRACKET_STRING_LEN + 1];
990 int len = 0;
991 for (;;)
992 {
993 c = bracket_fetch_wc (dfa);
994 if (dfa->lex.left == 0
995 || (c == c1 && dfa->lex.ptr[0] == ']'))
996 break;
997 if (len < MAX_BRACKET_STRING_LEN)
998 str[len++] = c;
999 else
1000 /* This is in any case an invalid class name. */
1001 str[0] = '\0';
1002 }
1003 str[len] = '\0';
1004
1005 /* Fetch bracket. */
1006 c = bracket_fetch_wc (dfa);
1007 wc = dfa->lex.wctok;
1008 if (c1 == ':')
1009 /* Build character class. POSIX allows character
1010 classes to match multicharacter collating elements,
1011 but the regex code does not support that, so do not
1012 worry about that possibility. */
1013 {
1014 char const *class
1015 = (dfa->syntax.case_fold && (streq (str, "upper")
1016 || streq (str, "lower"))
1017 ? "alpha" : str);
1018 const struct dfa_ctype *pred = find_pred (class);
1019 if (!pred)
1020 dfaerror (_("invalid character class"));
1021
1022 if (dfa->localeinfo.multibyte && !pred->single_byte_only)
1023 known_bracket_exp = false;
1024 else
1025 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1026 if (pred->func (c2))
1027 setbit (c2, &ccl);
1028 }
1029 else
1030 known_bracket_exp = false;
1031
1032 colon_warning_state |= 8;
1033
1034 /* Fetch new lookahead character. */
1035 c1 = bracket_fetch_wc (dfa);
1036 wc1 = dfa->lex.wctok;
1037 continue;
1038 }
1039
1040 /* We treat '[' as a normal character here. c/c1/wc/wc1
1041 are already set up. */
1042 }
1043
1044 if (c == '\\'
1045 && (dfa->syntax.syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1046 {
1047 c = bracket_fetch_wc (dfa);
1048 wc = dfa->lex.wctok;
1049 }
1050
1051 if (c1 == NOTCHAR)
1052 {
1053 c1 = bracket_fetch_wc (dfa);
1054 wc1 = dfa->lex.wctok;
1055 }
1056
1057 if (c1 == '-')
1058 /* build range characters. */
1059 {
1060 int c2 = bracket_fetch_wc (dfa);
1061 wint_t wc2 = dfa->lex.wctok;
1062
1063 /* A bracket expression like [a-[.aa.]] matches an unknown set.
1064 Treat it like [-a[.aa.]] while parsing it, and
1065 remember that the set is unknown. */
1066 if (c2 == '[' && dfa->lex.ptr[0] == '.')
1067 {
1068 known_bracket_exp = false;
1069 c2 = ']';
1070 }
1071
1072 if (c2 == ']')
1073 {
1074 /* In the case [x-], the - is an ordinary hyphen,
1075 which is left in c1, the lookahead character. */
1076 dfa->lex.ptr--;
1077 dfa->lex.left++;
1078 }
1079 else
1080 {
1081 if (c2 == '\\' && (dfa->syntax.syntax_bits
1082 & RE_BACKSLASH_ESCAPE_IN_LISTS))
1083 {
1084 c2 = bracket_fetch_wc (dfa);
1085 wc2 = dfa->lex.wctok;
1086 }
1087
1088 colon_warning_state |= 8;
1089 c1 = bracket_fetch_wc (dfa);
1090 wc1 = dfa->lex.wctok;
1091
1092 /* Treat [x-y] as a range if x != y. */
1093 if (wc != wc2 || wc == WEOF)
1094 {
1095 if (dfa->localeinfo.simple
1096 || (isasciidigit (c) & isasciidigit (c2)))
1097 {
1098 for (int ci = c; ci <= c2; ci++)
1099 if (dfa->syntax.case_fold && isalpha (ci))
1100 setbit_case_fold_c (ci, &ccl);
1101 else
1102 setbit (ci, &ccl);
1103 }
1104 else
1105 known_bracket_exp = false;
1106
1107 continue;
1108 }
1109 }
1110 }
1111
1112 colon_warning_state |= (c == ':') ? 2 : 4;
1113
1114 if (!dfa->localeinfo.multibyte)
1115 {
1116 if (dfa->syntax.case_fold && isalpha (c))
1117 setbit_case_fold_c (c, &ccl);
1118 else
1119 setbit (c, &ccl);
1120 continue;
1121 }
1122
1123 if (wc == WEOF)
1124 known_bracket_exp = false;
1125 else
1126 {
1127 wchar_t folded[CASE_FOLDED_BUFSIZE + 1];
1128 int n = (dfa->syntax.case_fold
1129 ? case_folded_counterparts (wc, folded + 1) + 1
1130 : 1);
1131 folded[0] = wc;
1132 for (int i = 0; i < n; i++)
1133 if (!setbit_wc (folded[i], &ccl))
1134 {
1135 dfa->lex.brack.chars
1136 = maybe_realloc (dfa->lex.brack.chars, dfa->lex.brack.nchars,
1137 &dfa->lex.brack.nchars_alloc, -1,
1138 sizeof *dfa->lex.brack.chars);
1139 dfa->lex.brack.chars[dfa->lex.brack.nchars++] = folded[i];
1140 }
1141 }
1142 }
1143 while ((wc = wc1, (c = c1) != ']'));
1144
1145 if (colon_warning_state == 7)
1146 dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
1147
1148 if (! known_bracket_exp)
1149 return BACKREF;
1150
1151 if (dfa->localeinfo.multibyte && (invert || dfa->lex.brack.nchars != 0))
1152 {
1153 dfa->lex.brack.invert = invert;
1154 dfa->lex.brack.cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
1155 return MBCSET;
1156 }
1157
1158 if (invert)
1159 {
1160 notset (&ccl);
1161 if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1162 clrbit ('\n', &ccl);
1163 }
1164
1165 return CSET + charclass_index (dfa, &ccl);
1166}
1167
1168struct lexptr
1169{
1170 char const *ptr;
1171 idx_t left;
1172};
1173
1174static void
1175push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s)
1176{
1177 ls->ptr = dfa->lex.ptr;
1178 ls->left = dfa->lex.left;
1179 dfa->lex.ptr = s;
1180 dfa->lex.left = strlen (s);
1181}
1182
1183static void
1184pop_lex_state (struct dfa *dfa, struct lexptr const *ls)
1185{
1186 dfa->lex.ptr = ls->ptr;
1187 dfa->lex.left = ls->left;
1188}
1189
1190static token
1191lex (struct dfa *dfa)
1192{
1193 bool backslash = false;
1194
1195 /* Basic plan: We fetch a character. If it's a backslash,
1196 we set the backslash flag and go through the loop again.
1197 On the plus side, this avoids having a duplicate of the
1198 main switch inside the backslash case. On the minus side,
1199 it means that just about every case begins with
1200 "if (backslash) ...". */
1201 for (int i = 0; i < 2; ++i)
1202 {
1203 if (! dfa->lex.left)
1204 return dfa->lex.lasttok = END;
1205 int c = fetch_wc (dfa);
1206
1207 switch (c)
1208 {
1209 case '\\':
1210 if (backslash)
1211 goto normal_char;
1212 if (dfa->lex.left == 0)
1213 dfaerror (_("unfinished \\ escape"));
1214 backslash = true;
1215 break;
1216
1217 case '^':
1218 if (backslash)
1219 goto normal_char;
1220 if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1221 || dfa->lex.lasttok == END || dfa->lex.lasttok == LPAREN
1222 || dfa->lex.lasttok == OR)
1223 return dfa->lex.lasttok = BEGLINE;
1224 goto normal_char;
1225
1226 case '$':
1227 if (backslash)
1228 goto normal_char;
1229 if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1230 || dfa->lex.left == 0
1231 || ((dfa->lex.left
1232 > !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
1233 && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
1234 & (dfa->lex.ptr[0] == '\\')]
1235 == ')'))
1236 || ((dfa->lex.left
1237 > !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
1238 && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
1239 & (dfa->lex.ptr[0] == '\\')]
1240 == '|'))
1241 || ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
1242 && dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
1243 return dfa->lex.lasttok = ENDLINE;
1244 goto normal_char;
1245
1246 case '1':
1247 case '2':
1248 case '3':
1249 case '4':
1250 case '5':
1251 case '6':
1252 case '7':
1253 case '8':
1254 case '9':
1255 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_BK_REFS))
1256 {
1257 dfa->lex.laststart = false;
1258 return dfa->lex.lasttok = BACKREF;
1259 }
1260 goto normal_char;
1261
1262 case '`':
1263 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1264 {
1265 /* FIXME: should be beginning of string */
1266 return dfa->lex.lasttok = BEGLINE;
1267 }
1268 goto normal_char;
1269
1270 case '\'':
1271 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1272 {
1273 /* FIXME: should be end of string */
1274 return dfa->lex.lasttok = ENDLINE;
1275 }
1276 goto normal_char;
1277
1278 case '<':
1279 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1280 return dfa->lex.lasttok = BEGWORD;
1281 goto normal_char;
1282
1283 case '>':
1284 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1285 return dfa->lex.lasttok = ENDWORD;
1286 goto normal_char;
1287
1288 case 'b':
1289 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1290 return dfa->lex.lasttok = LIMWORD;
1291 goto normal_char;
1292
1293 case 'B':
1294 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1295 return dfa->lex.lasttok = NOTLIMWORD;
1296 goto normal_char;
1297
1298 case '?':
1299 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1300 goto normal_char;
1301 if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1302 goto normal_char;
1303 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1304 && dfa->lex.laststart)
1305 goto normal_char;
1306 return dfa->lex.lasttok = QMARK;
1307
1308 case '*':
1309 if (backslash)
1310 goto normal_char;
1311 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1312 && dfa->lex.laststart)
1313 goto normal_char;
1314 return dfa->lex.lasttok = STAR;
1315
1316 case '+':
1317 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1318 goto normal_char;
1319 if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1320 goto normal_char;
1321 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1322 && dfa->lex.laststart)
1323 goto normal_char;
1324 return dfa->lex.lasttok = PLUS;
1325
1326 case '{':
1327 if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
1328 goto normal_char;
1329 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
1330 goto normal_char;
1331 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1332 && dfa->lex.laststart)
1333 goto normal_char;
1334
1335 /* Cases:
1336 {M} - exact count
1337 {M,} - minimum count, maximum is infinity
1338 {,N} - 0 through N
1339 {,} - 0 to infinity (same as '*')
1340 {M,N} - M through N */
1341 {
1342 char const *p = dfa->lex.ptr;
1343 char const *lim = p + dfa->lex.left;
1344 dfa->lex.minrep = dfa->lex.maxrep = -1;
1345 for (; p != lim && isasciidigit (*p); p++)
1346 dfa->lex.minrep = (dfa->lex.minrep < 0
1347 ? *p - '0'
1348 : MIN (RE_DUP_MAX + 1,
1349 dfa->lex.minrep * 10 + *p - '0'));
1350 if (p != lim)
1351 {
1352 if (*p != ',')
1353 dfa->lex.maxrep = dfa->lex.minrep;
1354 else
1355 {
1356 if (dfa->lex.minrep < 0)
1357 dfa->lex.minrep = 0;
1358 while (++p != lim && isasciidigit (*p))
1359 dfa->lex.maxrep
1360 = (dfa->lex.maxrep < 0
1361 ? *p - '0'
1362 : MIN (RE_DUP_MAX + 1,
1363 dfa->lex.maxrep * 10 + *p - '0'));
1364 }
1365 }
1366 if (! ((! backslash || (p != lim && *p++ == '\\'))
1367 && p != lim && *p++ == '}'
1368 && 0 <= dfa->lex.minrep
1369 && (dfa->lex.maxrep < 0
1370 || dfa->lex.minrep <= dfa->lex.maxrep)))
1371 {
1372 if (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD)
1373 goto normal_char;
1374 dfaerror (_("invalid content of \\{\\}"));
1375 }
1376 if (RE_DUP_MAX < dfa->lex.maxrep)
1377 dfaerror (_("regular expression too big"));
1378 dfa->lex.ptr = p;
1379 dfa->lex.left = lim - p;
1380 }
1381 dfa->lex.laststart = false;
1382 return dfa->lex.lasttok = REPMN;
1383
1384 case '|':
1385 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1386 goto normal_char;
1387 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
1388 goto normal_char;
1389 dfa->lex.laststart = true;
1390 return dfa->lex.lasttok = OR;
1391
1392 case '\n':
1393 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS
1394 || backslash || !(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
1395 goto normal_char;
1396 dfa->lex.laststart = true;
1397 return dfa->lex.lasttok = OR;
1398
1399 case '(':
1400 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1401 goto normal_char;
1402 dfa->lex.parens++;
1403 dfa->lex.laststart = true;
1404 return dfa->lex.lasttok = LPAREN;
1405
1406 case ')':
1407 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1408 goto normal_char;
1409 if (dfa->lex.parens == 0
1410 && dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1411 goto normal_char;
1412 dfa->lex.parens--;
1413 dfa->lex.laststart = false;
1414 return dfa->lex.lasttok = RPAREN;
1415
1416 case '.':
1417 if (backslash)
1418 goto normal_char;
1419 if (dfa->canychar < 0)
1420 {
1421 charclass ccl;
1422 fillset (&ccl);
1423 if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1424 clrbit ('\n', &ccl);
1425 if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1426 clrbit ('\0', &ccl);
1427 if (dfa->localeinfo.multibyte)
1428 for (int c2 = 0; c2 < NOTCHAR; c2++)
1429 if (dfa->localeinfo.sbctowc[c2] == WEOF)
1430 clrbit (c2, &ccl);
1431 dfa->canychar = charclass_index (dfa, &ccl);
1432 }
1433 dfa->lex.laststart = false;
1434 return dfa->lex.lasttok = (dfa->localeinfo.multibyte
1435 ? ANYCHAR
1436 : CSET + dfa->canychar);
1437
1438 case 's':
1439 case 'S':
1440 if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1441 goto normal_char;
1442 if (!dfa->localeinfo.multibyte)
1443 {
1444 charclass ccl;
1445 zeroset (&ccl);
1446 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1447 if (isspace (c2))
1448 setbit (c2, &ccl);
1449 if (c == 'S')
1450 notset (&ccl);
1451 dfa->lex.laststart = false;
1452 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1453 }
1454
1455 /* FIXME: see if optimizing this, as is done with ANYCHAR and
1456 add_utf8_anychar, makes sense. */
1457
1458 /* \s and \S are documented to be equivalent to [[:space:]] and
1459 [^[:space:]] respectively, so tell the lexer to process those
1460 strings, each minus its "already processed" '['. */
1461 {
1462 struct lexptr ls;
1463 push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
1464 dfa->lex.lasttok = parse_bracket_exp (dfa);
1465 pop_lex_state (dfa, &ls);
1466 }
1467
1468 dfa->lex.laststart = false;
1469 return dfa->lex.lasttok;
1470
1471 case 'w':
1472 case 'W':
1473 if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1474 goto normal_char;
1475
1476 if (!dfa->localeinfo.multibyte)
1477 {
1478 charclass ccl;
1479 zeroset (&ccl);
1480 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1481 if (dfa->syntax.sbit[c2] == CTX_LETTER)
1482 setbit (c2, &ccl);
1483 if (c == 'W')
1484 notset (&ccl);
1485 dfa->lex.laststart = false;
1486 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1487 }
1488
1489 /* FIXME: see if optimizing this, as is done with ANYCHAR and
1490 add_utf8_anychar, makes sense. */
1491
1492 /* \w and \W are documented to be equivalent to [_[:alnum:]] and
1493 [^_[:alnum:]] respectively, so tell the lexer to process those
1494 strings, each minus its "already processed" '['. */
1495 {
1496 struct lexptr ls;
1497 push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
1498 dfa->lex.lasttok = parse_bracket_exp (dfa);
1499 pop_lex_state (dfa, &ls);
1500 }
1501
1502 dfa->lex.laststart = false;
1503 return dfa->lex.lasttok;
1504
1505 case '[':
1506 if (backslash)
1507 goto normal_char;
1508 dfa->lex.laststart = false;
1509 return dfa->lex.lasttok = parse_bracket_exp (dfa);
1510
1511 default:
1512 normal_char:
1513 dfa->lex.laststart = false;
1514 /* For multibyte character sets, folding is done in atom. Always
1515 return WCHAR. */
1516 if (dfa->localeinfo.multibyte)
1517 return dfa->lex.lasttok = WCHAR;
1518
1519 if (dfa->syntax.case_fold && isalpha (c))
1520 {
1521 charclass ccl;
1522 zeroset (&ccl);
1523 setbit_case_fold_c (c, &ccl);
1524 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1525 }
1526
1527 return dfa->lex.lasttok = c;
1528 }
1529 }
1530
1531 /* The above loop should consume at most a backslash
1532 and some other character. */
1533 abort ();
1534 return END; /* keeps pedantic compilers happy. */
1535}
1536
1537static void
1538addtok_mb (struct dfa *dfa, token t, char mbprop)
1539{
1540 if (dfa->talloc == dfa->tindex)
1541 {
1542 dfa->tokens = xpalloc (dfa->tokens, &dfa->talloc, 1, -1,
1543 sizeof *dfa->tokens);
1544 if (dfa->localeinfo.multibyte)
1545 dfa->multibyte_prop = xreallocarray (dfa->multibyte_prop, dfa->talloc,
1546 sizeof *dfa->multibyte_prop);
1547 }
1548 if (dfa->localeinfo.multibyte)
1549 dfa->multibyte_prop[dfa->tindex] = mbprop;
1550 dfa->tokens[dfa->tindex++] = t;
1551
1552 switch (t)
1553 {
1554 case QMARK:
1555 case STAR:
1556 case PLUS:
1557 break;
1558
1559 case CAT:
1560 case OR:
1561 dfa->parse.depth--;
1562 break;
1563
1564 case EMPTY:
1565 dfa->epsilon = true;
1566 goto increment_depth;
1567
1568 case BACKREF:
1569 dfa->fast = false;
1570 goto increment_nleaves;
1571
1572 case BEGLINE:
1573 case ENDLINE:
1574 case BEGWORD:
1575 case ENDWORD:
1576 case LIMWORD:
1577 case NOTLIMWORD:
1578 dfa->epsilon = true;
1579 FALLTHROUGH;
1580 default:
1581 increment_nleaves:
1582 dfa->nleaves++;
1583 increment_depth:
1584 dfa->parse.depth++;
1585 if (dfa->depth < dfa->parse.depth)
1586 dfa->depth = dfa->parse.depth;
1587 break;
1588 }
1589}
1590
1591static void addtok_wc (struct dfa *dfa, wint_t wc);
1592
1593/* Add the given token to the parse tree, maintaining the depth count and
1594 updating the maximum depth if necessary. */
1595static void
1596addtok (struct dfa *dfa, token t)
1597{
1598 if (dfa->localeinfo.multibyte && t == MBCSET)
1599 {
1600 bool need_or = false;
1601
1602 /* Extract wide characters into alternations for better performance.
1603 This does not require UTF-8. */
1604 for (idx_t i = 0; i < dfa->lex.brack.nchars; i++)
1605 {
1606 addtok_wc (dfa, dfa->lex.brack.chars[i]);
1607 if (need_or)
1608 addtok (dfa, OR);
1609 need_or = true;
1610 }
1611 dfa->lex.brack.nchars = 0;
1612
1613 /* Wide characters have been handled above, so it is possible
1614 that the set is empty now. Do nothing in that case. */
1615 if (dfa->lex.brack.cset != -1)
1616 {
1617 addtok (dfa, CSET + dfa->lex.brack.cset);
1618 if (need_or)
1619 addtok (dfa, OR);
1620 }
1621 }
1622 else
1623 {
1624 addtok_mb (dfa, t, 3);
1625 }
1626}
1627
1628/* We treat a multibyte character as a single atom, so that DFA
1629 can treat a multibyte character as a single expression.
1630
1631 e.g., we construct the following tree from "<mb1><mb2>".
1632 <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1633 <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1634static void
1635addtok_wc (struct dfa *dfa, wint_t wc)
1636{
1637 unsigned char buf[MB_LEN_MAX];
1638 mbstate_t s = { 0 };
1639 size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
1640 int buflen;
1641
1642 if (stored_bytes != (size_t) -1)
1643 buflen = stored_bytes;
1644 else
1645 {
1646 /* This is merely stop-gap. buf[0] is undefined, yet skipping
1647 the addtok_mb call altogether can corrupt the heap. */
1648 buflen = 1;
1649 buf[0] = 0;
1650 }
1651
1652 addtok_mb (dfa, buf[0], buflen == 1 ? 3 : 1);
1653 for (int i = 1; i < buflen; i++)
1654 {
1655 addtok_mb (dfa, buf[i], i == buflen - 1 ? 2 : 0);
1656 addtok (dfa, CAT);
1657 }
1658}
1659
1660static void
1661add_utf8_anychar (struct dfa *dfa)
1662{
1663 /* Since the Unicode Standard Version 4.0.0 (2003), a well-formed
1664 UTF-8 byte sequence has been defined as follows:
1665
1666 ([\x00-\x7f]
1667 |[\xc2-\xdf][\x80-\xbf]
1668 |[\xe0][\xa0-\xbf][\x80-\xbf]
1669 |[\xe1-\xec\xee-\xef][\x80-\xbf][\x80-\xbf]
1670 |[\xed][\x80-\x9f][\x80-\xbf]
1671 |[\xf0][\x90-\xbf][\x80-\xbf][\x80-\xbf])
1672 |[\xf1-\xf3][\x80-\xbf][\x80-\xbf][\x80-\xbf]
1673 |[\xf4][\x80-\x8f][\x80-\xbf][\x80-\xbf])
1674
1675 which I'll write more concisely "A|BC|DEC|FCC|GHC|IJCC|KCCC|LMCC",
1676 where A = [\x00-\x7f], B = [\xc2-\xdf], C = [\x80-\xbf],
1677 D = [\xe0], E = [\xa0-\xbf], F = [\xe1-\xec\xee-\xef], G = [\xed],
1678 H = [\x80-\x9f], I = [\xf0],
1679 J = [\x90-\xbf], K = [\xf1-\xf3], L = [\xf4], M = [\x80-\x8f].
1680
1681 This can be refactored to "A|(B|DE|GH|(F|IJ|LM|KC)C)C". */
1682
1683 /* Mnemonics for classes containing two or more bytes. */
1684 enum { A, B, C, E, F, H, J, K, M };
1685
1686 /* Mnemonics for single-byte tokens. */
1687 enum { D_token = 0xe0, G_token = 0xed, I_token = 0xf0, L_token = 0xf4 };
1688
1689 static charclass const utf8_classes[] = {
1690 /* A. 00-7f: 1-byte sequence. */
1691 CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
1692
1693 /* B. c2-df: 1st byte of a 2-byte sequence. */
1694 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
1695
1696 /* C. 80-bf: non-leading bytes. */
1697 CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
1698
1699 /* D. e0 (just a token). */
1700
1701 /* E. a0-bf: 2nd byte of a "DEC" sequence. */
1702 CHARCLASS_INIT (0, 0, 0, 0, 0, 0xffffffff, 0, 0),
1703
1704 /* F. e1-ec + ee-ef: 1st byte of an "FCC" sequence. */
1705 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xdffe),
1706
1707 /* G. ed (just a token). */
1708
1709 /* H. 80-9f: 2nd byte of a "GHC" sequence. */
1710 CHARCLASS_INIT (0, 0, 0, 0, 0xffff, 0, 0, 0),
1711
1712 /* I. f0 (just a token). */
1713
1714 /* J. 90-bf: 2nd byte of an "IJCC" sequence. */
1715 CHARCLASS_INIT (0, 0, 0, 0, 0xffff0000, 0xffffffff, 0, 0),
1716
1717 /* K. f1-f3: 1st byte of a "KCCC" sequence. */
1718 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xe0000),
1719
1720 /* L. f4 (just a token). */
1721
1722 /* M. 80-8f: 2nd byte of a "LMCC" sequence. */
1723 CHARCLASS_INIT (0, 0, 0, 0, 0xff, 0, 0, 0),
1724 };
1725
1726 /* Define the character classes that are needed below. */
1727 if (dfa->utf8_anychar_classes[0] == 0)
1728 {
1729 charclass c = utf8_classes[0];
1730 if (! (dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1731 clrbit ('\n', &c);
1732 if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1733 clrbit ('\0', &c);
1734 dfa->utf8_anychar_classes[0] = CSET + charclass_index (dfa, &c);
1735
1736 for (int i = 1; i < sizeof utf8_classes / sizeof *utf8_classes; i++)
1737 dfa->utf8_anychar_classes[i]
1738 = CSET + charclass_index (dfa, &utf8_classes[i]);
1739 }
1740
1741 /* Implement the "A|(B|DE|GH|(F|IJ|LM|KC)C)C" pattern mentioned above.
1742 The token buffer is in reverse Polish order, so we get
1743 "A B D E CAT OR G H CAT OR F I J CAT OR L M CAT OR K
1744 C CAT OR C CAT OR C CAT OR". */
1745 addtok (dfa, dfa->utf8_anychar_classes[A]);
1746 addtok (dfa, dfa->utf8_anychar_classes[B]);
1747 addtok (dfa, D_token);
1748 addtok (dfa, dfa->utf8_anychar_classes[E]);
1749 addtok (dfa, CAT);
1750 addtok (dfa, OR);
1751 addtok (dfa, G_token);
1752 addtok (dfa, dfa->utf8_anychar_classes[H]);
1753 addtok (dfa, CAT);
1754 addtok (dfa, OR);
1755 addtok (dfa, dfa->utf8_anychar_classes[F]);
1756 addtok (dfa, I_token);
1757 addtok (dfa, dfa->utf8_anychar_classes[J]);
1758 addtok (dfa, CAT);
1759 addtok (dfa, OR);
1760 addtok (dfa, L_token);
1761 addtok (dfa, dfa->utf8_anychar_classes[M]);
1762 addtok (dfa, CAT);
1763 addtok (dfa, OR);
1764 addtok (dfa, dfa->utf8_anychar_classes[K]);
1765 for (int i = 0; i < 3; i++)
1766 {
1767 addtok (dfa, dfa->utf8_anychar_classes[C]);
1768 addtok (dfa, CAT);
1769 addtok (dfa, OR);
1770 }
1771}
1772
1773/* The grammar understood by the parser is as follows.
1774
1775 regexp:
1776 regexp OR branch
1777 branch
1778
1779 branch:
1780 branch closure
1781 closure
1782
1783 closure:
1784 closure QMARK
1785 closure STAR
1786 closure PLUS
1787 closure REPMN
1788 atom
1789
1790 atom:
1791 <normal character>
1792 <multibyte character>
1793 ANYCHAR
1794 MBCSET
1795 CSET
1796 BACKREF
1797 BEGLINE
1798 ENDLINE
1799 BEGWORD
1800 ENDWORD
1801 LIMWORD
1802 NOTLIMWORD
1803 LPAREN regexp RPAREN
1804 <empty>
1805
1806 The parser builds a parse tree in postfix form in an array of tokens. */
1807
1808static void
1809atom (struct dfa *dfa)
1810{
1811 if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
1812 || dfa->parse.tok >= CSET
1813 || dfa->parse.tok == BEG || dfa->parse.tok == BACKREF
1814 || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
1815 || dfa->parse.tok == BEGWORD || dfa->parse.tok == ENDWORD
1816 || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD
1817 || dfa->parse.tok == ANYCHAR || dfa->parse.tok == MBCSET)
1818 {
1819 if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
1820 {
1821 /* For UTF-8 expand the period to a series of CSETs that define a
1822 valid UTF-8 character. This avoids using the slow multibyte
1823 path. I'm pretty sure it would be both profitable and correct to
1824 do it for any encoding; however, the optimization must be done
1825 manually as it is done above in add_utf8_anychar. So, let's
1826 start with UTF-8: it is the most used, and the structure of the
1827 encoding makes the correctness more obvious. */
1828 add_utf8_anychar (dfa);
1829 }
1830 else
1831 addtok (dfa, dfa->parse.tok);
1832 dfa->parse.tok = lex (dfa);
1833 }
1834 else if (dfa->parse.tok == WCHAR)
1835 {
1836 if (dfa->lex.wctok == WEOF)
1837 addtok (dfa, BACKREF);
1838 else
1839 {
1840 addtok_wc (dfa, dfa->lex.wctok);
1841
1842 if (dfa->syntax.case_fold)
1843 {
1844 wchar_t folded[CASE_FOLDED_BUFSIZE];
1845 int n = case_folded_counterparts (dfa->lex.wctok, folded);
1846 for (int i = 0; i < n; i++)
1847 {
1848 addtok_wc (dfa, folded[i]);
1849 addtok (dfa, OR);
1850 }
1851 }
1852 }
1853
1854 dfa->parse.tok = lex (dfa);
1855 }
1856 else if (dfa->parse.tok == LPAREN)
1857 {
1858 dfa->parse.tok = lex (dfa);
1859 regexp (dfa);
1860 if (dfa->parse.tok != RPAREN)
1861 dfaerror (_("unbalanced ("));
1862 dfa->parse.tok = lex (dfa);
1863 }
1864 else
1865 addtok (dfa, EMPTY);
1866}
1867
1868/* Return the number of tokens in the given subexpression. */
1869static idx_t _GL_ATTRIBUTE_PURE
1870nsubtoks (struct dfa const *dfa, idx_t tindex)
1871{
1872 switch (dfa->tokens[tindex - 1])
1873 {
1874 default:
1875 return 1;
1876 case QMARK:
1877 case STAR:
1878 case PLUS:
1879 return 1 + nsubtoks (dfa, tindex - 1);
1880 case CAT:
1881 case OR:
1882 {
1883 idx_t ntoks1 = nsubtoks (dfa, tindex - 1);
1884 return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
1885 }
1886 }
1887}
1888
1889/* Copy the given subexpression to the top of the tree. */
1890static void
1891copytoks (struct dfa *dfa, idx_t tindex, idx_t ntokens)
1892{
1893 if (dfa->localeinfo.multibyte)
1894 for (idx_t i = 0; i < ntokens; i++)
1895 addtok_mb (dfa, dfa->tokens[tindex + i],
1896 dfa->multibyte_prop[tindex + i]);
1897 else
1898 for (idx_t i = 0; i < ntokens; i++)
1899 addtok_mb (dfa, dfa->tokens[tindex + i], 3);
1900}
1901
1902static void
1903closure (struct dfa *dfa)
1904{
1905 atom (dfa);
1906 while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR
1907 || dfa->parse.tok == PLUS || dfa->parse.tok == REPMN)
1908 if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep))
1909 {
1910 idx_t ntokens = nsubtoks (dfa, dfa->tindex);
1911 idx_t tindex = dfa->tindex - ntokens;
1912 if (dfa->lex.maxrep < 0)
1913 addtok (dfa, PLUS);
1914 if (dfa->lex.minrep == 0)
1915 addtok (dfa, QMARK);
1916 int i;
1917 for (i = 1; i < dfa->lex.minrep; i++)
1918 {
1919 copytoks (dfa, tindex, ntokens);
1920 addtok (dfa, CAT);
1921 }
1922 for (; i < dfa->lex.maxrep; i++)
1923 {
1924 copytoks (dfa, tindex, ntokens);
1925 addtok (dfa, QMARK);
1926 addtok (dfa, CAT);
1927 }
1928 dfa->parse.tok = lex (dfa);
1929 }
1930 else if (dfa->parse.tok == REPMN)
1931 {
1932 dfa->tindex -= nsubtoks (dfa, dfa->tindex);
1933 dfa->parse.tok = lex (dfa);
1934 closure (dfa);
1935 }
1936 else
1937 {
1938 addtok (dfa, dfa->parse.tok);
1939 dfa->parse.tok = lex (dfa);
1940 }
1941}
1942
1943static void
1944branch (struct dfa* dfa)
1945{
1946 closure (dfa);
1947 while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
1948 && dfa->parse.tok >= 0)
1949 {
1950 closure (dfa);
1951 addtok (dfa, CAT);
1952 }
1953}
1954
1955static void
1956regexp (struct dfa *dfa)
1957{
1958 branch (dfa);
1959 while (dfa->parse.tok == OR)
1960 {
1961 dfa->parse.tok = lex (dfa);
1962 branch (dfa);
1963 addtok (dfa, OR);
1964 }
1965}
1966
1967/* Parse a string S of length LEN into D. S can include NUL characters.
1968 This is the main entry point for the parser. */
1969void
1970dfaparse (char const *s, idx_t len, struct dfa *d)
1971{
1972 d->lex.ptr = s;
1973 d->lex.left = len;
1974 d->lex.lasttok = END;
1975 d->lex.laststart = true;
1976
1977 if (!d->syntax.syntax_bits_set)
1978 dfaerror (_("no syntax specified"));
1979
1980 if (!d->nregexps)
1981 addtok (d, BEG);
1982
1983 d->parse.tok = lex (d);
1984 d->parse.depth = d->depth;
1985
1986 regexp (d);
1987
1988 if (d->parse.tok != END)
1989 dfaerror (_("unbalanced )"));
1990
1991 addtok (d, END - d->nregexps);
1992 addtok (d, CAT);
1993
1994 if (d->nregexps)
1995 addtok (d, OR);
1996
1997 ++d->nregexps;
1998}
1999
2000/* Some primitives for operating on sets of positions. */
2001
2002/* Copy one set to another. */
2003static void
2004copy (position_set const *src, position_set *dst)
2005{
2006 if (dst->alloc < src->nelem)
2007 {
2008 free (dst->elems);
2009 dst->elems = xpalloc (NULL, &dst->alloc, src->nelem - dst->alloc, -1,
2010 sizeof *dst->elems);
2011 }
2012 dst->nelem = src->nelem;
2013 if (src->nelem != 0)
2014 memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
2015}
2016
2017static void
2018alloc_position_set (position_set *s, idx_t size)
2019{
2020 s->elems = xnmalloc (size, sizeof *s->elems);
2021 s->alloc = size;
2022 s->nelem = 0;
2023}
2024
2025/* Insert position P in set S. S is maintained in sorted order on
2026 decreasing index. If there is already an entry in S with P.index
2027 then merge (logically-OR) P's constraints into the one in S.
2028 S->elems must point to an array large enough to hold the resulting set. */
2029static void
2030insert (position p, position_set *s)
2031{
2032 idx_t count = s->nelem;
2033 idx_t lo = 0, hi = count;
2034 while (lo < hi)
2035 {
2036 idx_t mid = (lo + hi) >> 1;
2037 if (s->elems[mid].index < p.index)
2038 lo = mid + 1;
2039 else if (s->elems[mid].index == p.index)
2040 {
2041 s->elems[mid].constraint |= p.constraint;
2042 return;
2043 }
2044 else
2045 hi = mid;
2046 }
2047
2048 s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2049 for (idx_t i = count; i > lo; i--)
2050 s->elems[i] = s->elems[i - 1];
2051 s->elems[lo] = p;
2052 ++s->nelem;
2053}
2054
2055static void
2056append (position p, position_set *s)
2057{
2058 idx_t count = s->nelem;
2059 s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2060 s->elems[s->nelem++] = p;
2061}
2062
2063/* Merge S1 and S2 (with the additional constraint C2) into M. The
2064 result is as if the positions of S1, and of S2 with the additional
2065 constraint C2, were inserted into an initially empty set. */
2066static void
2067merge_constrained (position_set const *s1, position_set const *s2,
2068 unsigned int c2, position_set *m)
2069{
2070 idx_t i = 0, j = 0;
2071
2072 if (m->alloc - s1->nelem < s2->nelem)
2073 {
2074 free (m->elems);
2075 m->alloc = s1->nelem;
2076 m->elems = xpalloc (NULL, &m->alloc, s2->nelem, -1, sizeof *m->elems);
2077 }
2078 m->nelem = 0;
2079 while (i < s1->nelem || j < s2->nelem)
2080 if (! (j < s2->nelem)
2081 || (i < s1->nelem && s1->elems[i].index <= s2->elems[j].index))
2082 {
2083 unsigned int c = ((i < s1->nelem && j < s2->nelem
2084 && s1->elems[i].index == s2->elems[j].index)
2085 ? s2->elems[j++].constraint & c2
2086 : 0);
2087 m->elems[m->nelem].index = s1->elems[i].index;
2088 m->elems[m->nelem++].constraint = s1->elems[i++].constraint | c;
2089 }
2090 else
2091 {
2092 if (s2->elems[j].constraint & c2)
2093 {
2094 m->elems[m->nelem].index = s2->elems[j].index;
2095 m->elems[m->nelem++].constraint = s2->elems[j].constraint & c2;
2096 }
2097 j++;
2098 }
2099}
2100
2101/* Merge two sets of positions into a third. The result is exactly as if
2102 the positions of both sets were inserted into an initially empty set. */
2103static void
2104merge (position_set const *s1, position_set const *s2, position_set *m)
2105{
2106 merge_constrained (s1, s2, -1, m);
2107}
2108
2109/* Merge into DST all the elements of SRC, possibly destroying
2110 the contents of the temporary M. */
2111static void
2112merge2 (position_set *dst, position_set const *src, position_set *m)
2113{
2114 if (src->nelem < 4)
2115 {
2116 for (idx_t i = 0; i < src->nelem; i++)
2117 insert (src->elems[i], dst);
2118 }
2119 else
2120 {
2121 merge (src, dst, m);
2122 copy (m, dst);
2123 }
2124}
2125
2126/* Delete a position from a set. Return the nonzero constraint of the
2127 deleted position, or zero if there was no such position. */
2128static unsigned int
2129delete (idx_t del, position_set *s)
2130{
2131 idx_t count = s->nelem;
2132 idx_t lo = 0, hi = count;
2133 while (lo < hi)
2134 {
2135 idx_t mid = (lo + hi) >> 1;
2136 if (s->elems[mid].index < del)
2137 lo = mid + 1;
2138 else if (s->elems[mid].index == del)
2139 {
2140 unsigned int c = s->elems[mid].constraint;
2141 idx_t i;
2142 for (i = mid; i + 1 < count; i++)
2143 s->elems[i] = s->elems[i + 1];
2144 s->nelem = i;
2145 return c;
2146 }
2147 else
2148 hi = mid;
2149 }
2150 return 0;
2151}
2152
2153/* Replace a position with the followed set. */
2154static void
2155replace (position_set *dst, idx_t del, position_set *add,
2156 unsigned int constraint, position_set *tmp)
2157{
2158 unsigned int c = delete (del, dst) & constraint;
2159
2160 if (c)
2161 {
2162 copy (dst, tmp);
2163 merge_constrained (tmp, add, c, dst);
2164 }
2165}
2166
2167/* Find the index of the state corresponding to the given position set with
2168 the given preceding context, or create a new state if there is no such
2169 state. Context tells whether we got here on a newline or letter. */
2170static state_num
2171state_index (struct dfa *d, position_set const *s, int context)
2172{
2173 size_t hash = 0;
2174 int constraint = 0;
2175 state_num i;
2176
2177 for (i = 0; i < s->nelem; ++i)
2178 {
2179 idx_t ind = s->elems[i].index;
2180 hash ^= ind + s->elems[i].constraint;
2181 }
2182
2183 /* Try to find a state that exactly matches the proposed one. */
2184 for (i = 0; i < d->sindex; ++i)
2185 {
2186 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
2187 || context != d->states[i].context)
2188 continue;
2189 state_num j;
2190 for (j = 0; j < s->nelem; ++j)
2191 if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint
2192 || s->elems[j].index != d->states[i].elems.elems[j].index)
2193 break;
2194 if (j == s->nelem)
2195 return i;
2196 }
2197
2198#ifdef DEBUG
2199 fprintf (stderr, "new state %td\n nextpos:", i);
2200 for (state_num j = 0; j < s->nelem; j++)
2201 {
2202 fprintf (stderr, " %td:", s->elems[j].index);
2203 prtok (d->tokens[s->elems[j].index]);
2204 }
2205 fprintf (stderr, "\n context:");
2206 if (context ^ CTX_ANY)
2207 {
2208 if (context & CTX_NONE)
2209 fprintf (stderr, " CTX_NONE");
2210 if (context & CTX_LETTER)
2211 fprintf (stderr, " CTX_LETTER");
2212 if (context & CTX_NEWLINE)
2213 fprintf (stderr, " CTX_NEWLINE");
2214 }
2215 else
2216 fprintf (stderr, " CTX_ANY");
2217 fprintf (stderr, "\n");
2218#endif
2219
2220 for (state_num j = 0; j < s->nelem; j++)
2221 {
2222 int c = d->constraints[s->elems[j].index];
2223
2224 if (c != 0)
2225 {
2226 if (succeeds_in_context (c, context, CTX_ANY))
2227 constraint |= c;
2228 }
2229 else if (d->tokens[s->elems[j].index] == BACKREF)
2230 constraint = NO_CONSTRAINT;
2231 }
2232
2233
2234 /* Create a new state. */
2235 d->states = maybe_realloc (d->states, d->sindex, &d->salloc, -1,
2236 sizeof *d->states);
2237 d->states[i].hash = hash;
2238 alloc_position_set (&d->states[i].elems, s->nelem);
2239 copy (s, &d->states[i].elems);
2240 d->states[i].context = context;
2241 d->states[i].constraint = constraint;
2242 d->states[i].mbps.nelem = 0;
2243 d->states[i].mbps.elems = NULL;
2244 d->states[i].mb_trindex = -1;
2245
2246 ++d->sindex;
2247
2248 return i;
2249}
2250
2251/* Find the epsilon closure of D's set of positions. If any position of the set
2252 contains a symbol that matches the empty string in some context, replace
2253 that position with the elements of its follow labeled with an appropriate
2254 constraint. Repeat exhaustively until no funny positions are left.
2255 S->elems must be large enough to hold the result. BACKWARD is D's
2256 backward set; use and update it too. */
2257static void
2258epsclosure (struct dfa const *d, position_set *backward)
2259{
2260 position_set tmp;
2261 alloc_position_set (&tmp, d->nleaves);
2262 for (idx_t i = 0; i < d->tindex; i++)
2263 if (0 < d->follows[i].nelem)
2264 {
2265 unsigned int constraint;
2266 switch (d->tokens[i])
2267 {
2268 default:
2269 continue;
2270
2271 case BEGLINE:
2272 constraint = BEGLINE_CONSTRAINT;
2273 break;
2274 case ENDLINE:
2275 constraint = ENDLINE_CONSTRAINT;
2276 break;
2277 case BEGWORD:
2278 constraint = BEGWORD_CONSTRAINT;
2279 break;
2280 case ENDWORD:
2281 constraint = ENDWORD_CONSTRAINT;
2282 break;
2283 case LIMWORD:
2284 constraint = LIMWORD_CONSTRAINT;
2285 break;
2286 case NOTLIMWORD:
2287 constraint = NOTLIMWORD_CONSTRAINT;
2288 break;
2289 case EMPTY:
2290 constraint = NO_CONSTRAINT;
2291 break;
2292 }
2293
2294 delete (i, &d->follows[i]);
2295
2296 for (idx_t j = 0; j < backward[i].nelem; j++)
2297 replace (&d->follows[backward[i].elems[j].index], i, &d->follows[i],
2298 constraint, &tmp);
2299 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2300 replace (&backward[d->follows[i].elems[j].index], i, &backward[i],
2301 NO_CONSTRAINT, &tmp);
2302 }
2303 free (tmp.elems);
2304}
2305
2306/* Returns the set of contexts for which there is at least one
2307 character included in C. */
2308
2309static int
2310charclass_context (struct dfa const *dfa, charclass const *c)
2311{
2312 int context = 0;
2313
2314 for (int j = 0; j < CHARCLASS_WORDS; j++)
2315 {
2316 if (c->w[j] & dfa->syntax.newline.w[j])
2317 context |= CTX_NEWLINE;
2318 if (c->w[j] & dfa->syntax.letters.w[j])
2319 context |= CTX_LETTER;
2320 if (c->w[j] & ~(dfa->syntax.letters.w[j] | dfa->syntax.newline.w[j]))
2321 context |= CTX_NONE;
2322 }
2323
2324 return context;
2325}
2326
2327/* Returns the contexts on which the position set S depends. Each context
2328 in the set of returned contexts (let's call it SC) may have a different
2329 follow set than other contexts in SC, and also different from the
2330 follow set of the complement set (sc ^ CTX_ANY). However, all contexts
2331 in the complement set will have the same follow set. */
2332
2333static int _GL_ATTRIBUTE_PURE
2334state_separate_contexts (struct dfa *d, position_set const *s)
2335{
2336 int separate_contexts = 0;
2337
2338 for (idx_t j = 0; j < s->nelem; j++)
2339 separate_contexts |= d->separates[s->elems[j].index];
2340
2341 return separate_contexts;
2342}
2343
2344enum
2345{
2346 /* Single token is repeated. It is distinguished from non-repeated. */
2347 OPT_REPEAT = (1 << 0),
2348
2349 /* Multiple tokens are repeated. This flag is on at head of tokens. The
2350 node is not merged. */
2351 OPT_LPAREN = (1 << 1),
2352
2353 /* Multiple branches are joined. The node is not merged. */
2354 OPT_RPAREN = (1 << 2),
2355
2356 /* The node is walked. If the node is found in walking again, OPT_RPAREN
2357 flag is turned on. */
2358 OPT_WALKED = (1 << 3),
2359
2360 /* The node is queued. The node is not queued again. */
2361 OPT_QUEUED = (1 << 4)
2362};
2363
2364static void
2365merge_nfa_state (struct dfa *d, idx_t tindex, char *flags,
2366 position_set *merged)
2367{
2368 position_set *follows = d->follows;
2369 idx_t nelem = 0;
2370
2371 for (idx_t i = 0; i < follows[tindex].nelem; i++)
2372 {
2373 idx_t sindex = follows[tindex].elems[i].index;
2374
2375 /* Skip the node as pruned in future. */
2376 unsigned int iconstraint = follows[tindex].elems[i].constraint;
2377 if (iconstraint == 0)
2378 continue;
2379
2380 if (d->tokens[follows[tindex].elems[i].index] <= END)
2381 {
2382 d->constraints[tindex] |= follows[tindex].elems[i].constraint;
2383 continue;
2384 }
2385
2386 if (sindex != tindex && !(flags[sindex] & (OPT_LPAREN | OPT_RPAREN)))
2387 {
2388 idx_t j;
2389
2390 for (j = 0; j < nelem; j++)
2391 {
2392 idx_t dindex = follows[tindex].elems[j].index;
2393
2394 if (dindex == tindex)
2395 continue;
2396
2397 if (follows[tindex].elems[j].constraint != iconstraint)
2398 continue;
2399
2400 if (flags[dindex] & (OPT_LPAREN | OPT_RPAREN))
2401 continue;
2402
2403 if (d->tokens[sindex] != d->tokens[dindex])
2404 continue;
2405
2406 if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT)
2407 continue;
2408
2409 if (flags[sindex] & OPT_REPEAT)
2410 delete (sindex, &follows[sindex]);
2411
2412 merge2 (&follows[dindex], &follows[sindex], merged);
2413
2414 break;
2415 }
2416
2417 if (j < nelem)
2418 continue;
2419 }
2420
2421 follows[tindex].elems[nelem++] = follows[tindex].elems[i];
2422 flags[sindex] |= OPT_QUEUED;
2423 }
2424
2425 follows[tindex].nelem = nelem;
2426}
2427
2428static int
2429compare (const void *a, const void *b)
2430{
2431 position const *p = a, *q = b;
2432 return (p->index > q->index) - (p->index < q->index);
2433}
2434
2435static void
2436reorder_tokens (struct dfa *d)
2437{
2438 idx_t nleaves = 0;
2439 ptrdiff_t *map = xnmalloc (d->tindex, sizeof *map);
2440 map[0] = nleaves++;
2441 for (idx_t i = 1; i < d->tindex; i++)
2442 map[i] = -1;
2443
2444 token *tokens = xnmalloc (d->nleaves, sizeof *tokens);
2445 position_set *follows = xnmalloc (d->nleaves, sizeof *follows);
2446 int *constraints = xnmalloc (d->nleaves, sizeof *constraints);
2447 char *multibyte_prop = (d->localeinfo.multibyte
2448 ? xnmalloc (d->nleaves, sizeof *multibyte_prop)
2449 : NULL);
2450
2451 for (idx_t i = 0; i < d->tindex; i++)
2452 {
2453 if (map[i] < 0)
2454 {
2455 free (d->follows[i].elems);
2456 d->follows[i].elems = NULL;
2457 d->follows[i].nelem = 0;
2458 continue;
2459 }
2460
2461 tokens[map[i]] = d->tokens[i];
2462 follows[map[i]] = d->follows[i];
2463 constraints[map[i]] = d->constraints[i];
2464
2465 if (multibyte_prop != NULL)
2466 multibyte_prop[map[i]] = d->multibyte_prop[i];
2467
2468 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2469 {
2470 if (map[d->follows[i].elems[j].index] == -1)
2471 map[d->follows[i].elems[j].index] = nleaves++;
2472
2473 d->follows[i].elems[j].index = map[d->follows[i].elems[j].index];
2474 }
2475
2476 qsort (d->follows[i].elems, d->follows[i].nelem,
2477 sizeof *d->follows[i].elems, compare);
2478 }
2479
2480 for (idx_t i = 0; i < nleaves; i++)
2481 {
2482 d->tokens[i] = tokens[i];
2483 d->follows[i] = follows[i];
2484 d->constraints[i] = constraints[i];
2485
2486 if (multibyte_prop != NULL)
2487 d->multibyte_prop[i] = multibyte_prop[i];
2488 }
2489
2490 d->tindex = d->nleaves = nleaves;
2491
2492 free (tokens);
2493 free (follows);
2494 free (constraints);
2495 free (multibyte_prop);
2496 free (map);
2497}
2498
2499static void
2500dfaoptimize (struct dfa *d)
2501{
2502 char *flags = xizalloc (d->tindex);
2503
2504 for (idx_t i = 0; i < d->tindex; i++)
2505 {
2506 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2507 {
2508 if (d->follows[i].elems[j].index == i)
2509 flags[d->follows[i].elems[j].index] |= OPT_REPEAT;
2510 else if (d->follows[i].elems[j].index < i)
2511 flags[d->follows[i].elems[j].index] |= OPT_LPAREN;
2512 else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED)
2513 flags[d->follows[i].elems[j].index] |= OPT_RPAREN;
2514 else
2515 flags[d->follows[i].elems[j].index] |= OPT_WALKED;
2516 }
2517 }
2518
2519 flags[0] |= OPT_QUEUED;
2520
2521 position_set merged0;
2522 position_set *merged = &merged0;
2523 alloc_position_set (merged, d->nleaves);
2524
2525 d->constraints = xicalloc (d->tindex, sizeof *d->constraints);
2526
2527 for (idx_t i = 0; i < d->tindex; i++)
2528 if (flags[i] & OPT_QUEUED)
2529 merge_nfa_state (d, i, flags, merged);
2530
2531 reorder_tokens (d);
2532
2533 free (merged->elems);
2534 free (flags);
2535}
2536
2537/* Perform bottom-up analysis on the parse tree, computing various functions.
2538 Note that at this point, we're pretending constructs like \< are real
2539 characters rather than constraints on what can follow them.
2540
2541 Nullable: A node is nullable if it is at the root of a regexp that can
2542 match the empty string.
2543 * EMPTY leaves are nullable.
2544 * No other leaf is nullable.
2545 * A QMARK or STAR node is nullable.
2546 * A PLUS node is nullable if its argument is nullable.
2547 * A CAT node is nullable if both its arguments are nullable.
2548 * An OR node is nullable if either argument is nullable.
2549
2550 Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
2551 that could correspond to the first character of a string matching the
2552 regexp rooted at the given node.
2553 * EMPTY leaves have empty firstpos.
2554 * The firstpos of a nonempty leaf is that leaf itself.
2555 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2556 argument.
2557 * The firstpos of a CAT node is the firstpos of the left argument, union
2558 the firstpos of the right if the left argument is nullable.
2559 * The firstpos of an OR node is the union of firstpos of each argument.
2560
2561 Lastpos: The lastpos of a node is the set of positions that could
2562 correspond to the last character of a string matching the regexp at
2563 the given node.
2564 * EMPTY leaves have empty lastpos.
2565 * The lastpos of a nonempty leaf is that leaf itself.
2566 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2567 argument.
2568 * The lastpos of a CAT node is the lastpos of its right argument, union
2569 the lastpos of the left if the right argument is nullable.
2570 * The lastpos of an OR node is the union of the lastpos of each argument.
2571
2572 Follow: The follow of a position is the set of positions that could
2573 correspond to the character following a character matching the node in
2574 a string matching the regexp. At this point we consider special symbols
2575 that match the empty string in some context to be just normal characters.
2576 Later, if we find that a special symbol is in a follow set, we will
2577 replace it with the elements of its follow, labeled with an appropriate
2578 constraint.
2579 * Every node in the firstpos of the argument of a STAR or PLUS node is in
2580 the follow of every node in the lastpos.
2581 * Every node in the firstpos of the second argument of a CAT node is in
2582 the follow of every node in the lastpos of the first argument.
2583
2584 Because of the postfix representation of the parse tree, the depth-first
2585 analysis is conveniently done by a linear scan with the aid of a stack.
2586 Sets are stored as arrays of the elements, obeying a stack-like allocation
2587 scheme; the number of elements in each set deeper in the stack can be
2588 used to determine the address of a particular set's array. */
2589static void
2590dfaanalyze (struct dfa *d, bool searchflag)
2591{
2592 /* Array allocated to hold position sets. */
2593 position *posalloc = xnmalloc (d->nleaves, 2 * sizeof *posalloc);
2594 /* Firstpos and lastpos elements. */
2595 position *firstpos = posalloc;
2596 position *lastpos = firstpos + d->nleaves;
2597 position pos;
2598 position_set tmp;
2599
2600 /* Stack for element counts and nullable flags. */
2601 struct
2602 {
2603 /* Whether the entry is nullable. */
2604 bool nullable;
2605
2606 /* Counts of firstpos and lastpos sets. */
2607 idx_t nfirstpos;
2608 idx_t nlastpos;
2609 } *stkalloc = xnmalloc (d->depth, sizeof *stkalloc), *stk = stkalloc;
2610
2611 position_set merged; /* Result of merging sets. */
2612
2613 addtok (d, CAT);
2614 idx_t tindex = d->tindex;
2615
2616#ifdef DEBUG
2617 fprintf (stderr, "dfaanalyze:\n");
2618 for (idx_t i = 0; i < tindex; i++)
2619 {
2620 fprintf (stderr, " %td:", i);
2621 prtok (d->tokens[i]);
2622 }
2623 putc ('\n', stderr);
2624#endif
2625
2626 d->searchflag = searchflag;
2627 alloc_position_set (&merged, d->nleaves);
2628 d->follows = xicalloc (tindex, sizeof *d->follows);
2629 position_set *backward
2630 = d->epsilon ? xicalloc (tindex, sizeof *backward) : NULL;
2631
2632 for (idx_t i = 0; i < tindex; i++)
2633 {
2634 switch (d->tokens[i])
2635 {
2636 case EMPTY:
2637 /* The empty set is nullable. */
2638 stk->nullable = true;
2639
2640 /* The firstpos and lastpos of the empty leaf are both empty. */
2641 stk->nfirstpos = stk->nlastpos = 0;
2642 stk++;
2643 break;
2644
2645 case STAR:
2646 case PLUS:
2647 /* Every element in the lastpos of the argument is in the backward
2648 set of every element in the firstpos. */
2649 if (d->epsilon)
2650 {
2651 tmp.elems = lastpos - stk[-1].nlastpos;
2652 tmp.nelem = stk[-1].nlastpos;
2653 for (position *p = firstpos - stk[-1].nfirstpos;
2654 p < firstpos; p++)
2655 merge2 (&backward[p->index], &tmp, &merged);
2656 }
2657
2658 /* Every element in the firstpos of the argument is in the follow
2659 of every element in the lastpos. */
2660 {
2661 tmp.elems = firstpos - stk[-1].nfirstpos;
2662 tmp.nelem = stk[-1].nfirstpos;
2663 for (position *p = lastpos - stk[-1].nlastpos; p < lastpos; p++)
2664 merge2 (&d->follows[p->index], &tmp, &merged);
2665 }
2666 FALLTHROUGH;
2667 case QMARK:
2668 /* A QMARK or STAR node is automatically nullable. */
2669 if (d->tokens[i] != PLUS)
2670 stk[-1].nullable = true;
2671 break;
2672
2673 case CAT:
2674 /* Every element in the lastpos of the first argument is in
2675 the backward set of every element in the firstpos of the
2676 second argument. */
2677 if (backward)
2678 {
2679 tmp.nelem = stk[-2].nlastpos;
2680 tmp.elems = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2681 for (position *p = firstpos - stk[-1].nfirstpos;
2682 p < firstpos; p++)
2683 merge2 (&backward[p->index], &tmp, &merged);
2684 }
2685
2686 /* Every element in the firstpos of the second argument is in the
2687 follow of every element in the lastpos of the first argument. */
2688 {
2689 tmp.nelem = stk[-1].nfirstpos;
2690 tmp.elems = firstpos - stk[-1].nfirstpos;
2691 for (position *plim = lastpos - stk[-1].nlastpos,
2692 *p = plim - stk[-2].nlastpos;
2693 p < plim; p++)
2694 merge2 (&d->follows[p->index], &tmp, &merged);
2695 }
2696
2697 /* The firstpos of a CAT node is the firstpos of the first argument,
2698 union that of the second argument if the first is nullable. */
2699 if (stk[-2].nullable)
2700 stk[-2].nfirstpos += stk[-1].nfirstpos;
2701 else
2702 firstpos -= stk[-1].nfirstpos;
2703
2704 /* The lastpos of a CAT node is the lastpos of the second argument,
2705 union that of the first argument if the second is nullable. */
2706 if (stk[-1].nullable)
2707 stk[-2].nlastpos += stk[-1].nlastpos;
2708 else
2709 {
2710 position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2711 for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2712 p[j] = p[j + stk[-2].nlastpos];
2713 lastpos -= stk[-2].nlastpos;
2714 stk[-2].nlastpos = stk[-1].nlastpos;
2715 }
2716
2717 /* A CAT node is nullable if both arguments are nullable. */
2718 stk[-2].nullable &= stk[-1].nullable;
2719 stk--;
2720 break;
2721
2722 case OR:
2723 /* The firstpos is the union of the firstpos of each argument. */
2724 stk[-2].nfirstpos += stk[-1].nfirstpos;
2725
2726 /* The lastpos is the union of the lastpos of each argument. */
2727 stk[-2].nlastpos += stk[-1].nlastpos;
2728
2729 /* An OR node is nullable if either argument is nullable. */
2730 stk[-2].nullable |= stk[-1].nullable;
2731 stk--;
2732 break;
2733
2734 default:
2735 /* Anything else is a nonempty position. (Note that special
2736 constructs like \< are treated as nonempty strings here;
2737 an "epsilon closure" effectively makes them nullable later.
2738 Backreferences have to get a real position so we can detect
2739 transitions on them later. But they are nullable. */
2740 stk->nullable = d->tokens[i] == BACKREF;
2741
2742 /* This position is in its own firstpos and lastpos. */
2743 stk->nfirstpos = stk->nlastpos = 1;
2744 stk++;
2745
2746 firstpos->index = lastpos->index = i;
2747 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2748 firstpos++, lastpos++;
2749
2750 break;
2751 }
2752#ifdef DEBUG
2753 /* ... balance the above nonsyntactic #ifdef goo... */
2754 fprintf (stderr, "node %td:", i);
2755 prtok (d->tokens[i]);
2756 putc ('\n', stderr);
2757 fprintf (stderr,
2758 stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
2759 fprintf (stderr, " firstpos:");
2760 for (idx_t j = 0; j < stk[-1].nfirstpos; j++)
2761 {
2762 fprintf (stderr, " %td:", firstpos[j - stk[-1].nfirstpos].index);
2763 prtok (d->tokens[firstpos[j - stk[-1].nfirstpos].index]);
2764 }
2765 fprintf (stderr, "\n lastpos:");
2766 for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2767 {
2768 fprintf (stderr, " %td:", lastpos[j - stk[-1].nlastpos].index);
2769 prtok (d->tokens[lastpos[j - stk[-1].nlastpos].index]);
2770 }
2771 putc ('\n', stderr);
2772#endif
2773 }
2774
2775 if (backward)
2776 {
2777 /* For each follow set that is the follow set of a real position,
2778 replace it with its epsilon closure. */
2779 epsclosure (d, backward);
2780
2781 for (idx_t i = 0; i < tindex; i++)
2782 free (backward[i].elems);
2783 free (backward);
2784 }
2785
2786 dfaoptimize (d);
2787
2788#ifdef DEBUG
2789 for (idx_t i = 0; i < tindex; i++)
2790 if (d->tokens[i] == BEG || d->tokens[i] < NOTCHAR
2791 || d->tokens[i] == BACKREF || d->tokens[i] == ANYCHAR
2792 || d->tokens[i] == MBCSET || d->tokens[i] >= CSET)
2793 {
2794 fprintf (stderr, "follows(%td:", i);
2795 prtok (d->tokens[i]);
2796 fprintf (stderr, "):");
2797 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2798 {
2799 fprintf (stderr, " %td:", d->follows[i].elems[j].index);
2800 prtok (d->tokens[d->follows[i].elems[j].index]);
2801 }
2802 putc ('\n', stderr);
2803 }
2804#endif
2805
2806 pos.index = 0;
2807 pos.constraint = NO_CONSTRAINT;
2808
2809 alloc_position_set (&tmp, 1);
2810
2811 append (pos, &tmp);
2812
2813 d->separates = xicalloc (tindex, sizeof *d->separates);
2814
2815 for (idx_t i = 0; i < tindex; i++)
2816 {
2817 if (prev_newline_dependent (d->constraints[i]))
2818 d->separates[i] |= CTX_NEWLINE;
2819 if (prev_letter_dependent (d->constraints[i]))
2820 d->separates[i] |= CTX_LETTER;
2821
2822 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2823 {
2824 if (prev_newline_dependent (d->follows[i].elems[j].constraint))
2825 d->separates[i] |= CTX_NEWLINE;
2826 if (prev_letter_dependent (d->follows[i].elems[j].constraint))
2827 d->separates[i] |= CTX_LETTER;
2828 }
2829 }
2830
2831 /* Context wanted by some position. */
2832 int separate_contexts = state_separate_contexts (d, &tmp);
2833
2834 /* Build the initial state. */
2835 if (separate_contexts & CTX_NEWLINE)
2836 state_index (d, &tmp, CTX_NEWLINE);
2837 d->initstate_notbol = d->min_trcount
2838 = state_index (d, &tmp, separate_contexts ^ CTX_ANY);
2839 if (separate_contexts & CTX_LETTER)
2840 d->min_trcount = state_index (d, &tmp, CTX_LETTER);
2841 d->min_trcount++;
2842 d->trcount = 0;
2843
2844 free (posalloc);
2845 free (stkalloc);
2846 free (merged.elems);
2847 free (tmp.elems);
2848}
2849
2850/* Make sure D's state arrays are large enough to hold NEW_STATE. */
2851static void
2852realloc_trans_if_necessary (struct dfa *d)
2853{
2854 state_num oldalloc = d->tralloc;
2855 if (oldalloc < d->sindex)
2856 {
2857 state_num **realtrans = d->trans ? d->trans - 2 : NULL;
2858 idx_t newalloc1 = realtrans ? d->tralloc + 2 : 0;
2859 realtrans = xpalloc (realtrans, &newalloc1, d->sindex - oldalloc,
2860 -1, sizeof *realtrans);
2861 realtrans[0] = realtrans[1] = NULL;
2862 d->trans = realtrans + 2;
2863 idx_t newalloc = d->tralloc = newalloc1 - 2;
2864 d->fails = xreallocarray (d->fails, newalloc, sizeof *d->fails);
2865 d->success = xreallocarray (d->success, newalloc, sizeof *d->success);
2866 d->newlines = xreallocarray (d->newlines, newalloc, sizeof *d->newlines);
2867 if (d->localeinfo.multibyte)
2868 {
2869 realtrans = d->mb_trans ? d->mb_trans - 2 : NULL;
2870 realtrans = xreallocarray (realtrans, newalloc1, sizeof *realtrans);
2871 if (oldalloc == 0)
2872 realtrans[0] = realtrans[1] = NULL;
2873 d->mb_trans = realtrans + 2;
2874 }
2875 for (; oldalloc < newalloc; oldalloc++)
2876 {
2877 d->trans[oldalloc] = NULL;
2878 d->fails[oldalloc] = NULL;
2879 if (d->localeinfo.multibyte)
2880 d->mb_trans[oldalloc] = NULL;
2881 }
2882 }
2883}
2884
2885/*
2886 Calculate the transition table for a new state derived from state s
2887 for a compiled dfa d after input character uc, and return the new
2888 state number.
2889
2890 Do not worry about all possible input characters; calculate just the group
2891 of positions that match uc. Label it with the set of characters that
2892 every position in the group matches (taking into account, if necessary,
2893 preceding context information of s). Then find the union
2894 of these positions' follows, i.e., the set of positions of the
2895 new state. For each character in the group's label, set the transition
2896 on this character to be to a state corresponding to the set's positions,
2897 and its associated backward context information, if necessary.
2898
2899 When building a searching matcher, include the positions of state
2900 0 in every state.
2901
2902 The group is constructed by building an equivalence-class
2903 partition of the positions of s.
2904
2905 For each position, find the set of characters C that it matches. Eliminate
2906 any characters from C that fail on grounds of backward context.
2907
2908 Check whether the group's label L has nonempty
2909 intersection with C. If L - C is nonempty, create a new group labeled
2910 L - C and having the same positions as the current group, and set L to
2911 the intersection of L and C. Insert the position in the group, set
2912 C = C - L, and resume scanning.
2913
2914 If after comparing with every group there are characters remaining in C,
2915 create a new group labeled with the characters of C and insert this
2916 position in that group. */
2917
2918static state_num
2919build_state (state_num s, struct dfa *d, unsigned char uc)
2920{
2921 position_set follows; /* Union of the follows for each
2922 position of the current state. */
2923 position_set group; /* Positions that match the input char. */
2924 position_set tmp; /* Temporary space for merging sets. */
2925 state_num state; /* New state. */
2926 state_num state_newline; /* New state on a newline transition. */
2927 state_num state_letter; /* New state on a letter transition. */
2928
2929#ifdef DEBUG
2930 fprintf (stderr, "build state %td\n", s);
2931#endif
2932
2933 /* A pointer to the new transition table, and the table itself. */
2934 state_num **ptrans = (accepting (s, d) ? d->fails : d->trans) + s;
2935 state_num *trans = *ptrans;
2936
2937 if (!trans)
2938 {
2939 /* MAX_TRCOUNT is an arbitrary upper limit on the number of
2940 transition tables that can exist at once, other than for
2941 initial states. Often-used transition tables are quickly
2942 rebuilt, whereas rarely-used ones are cleared away. */
2943 if (MAX_TRCOUNT <= d->trcount)
2944 {
2945 for (state_num i = d->min_trcount; i < d->tralloc; i++)
2946 {
2947 free (d->trans[i]);
2948 free (d->fails[i]);
2949 d->trans[i] = d->fails[i] = NULL;
2950 }
2951 d->trcount = 0;
2952 }
2953
2954 d->trcount++;
2955 *ptrans = trans = xmalloc (NOTCHAR * sizeof *trans);
2956
2957 /* Fill transition table with a default value which means that the
2958 transited state has not been calculated yet. */
2959 for (int i = 0; i < NOTCHAR; i++)
2960 trans[i] = -2;
2961 }
2962
2963 /* Set up the success bits for this state. */
2964 d->success[s] = 0;
2965 if (accepts_in_context (d->states[s].context, CTX_NEWLINE, s, d))
2966 d->success[s] |= CTX_NEWLINE;
2967 if (accepts_in_context (d->states[s].context, CTX_LETTER, s, d))
2968 d->success[s] |= CTX_LETTER;
2969 if (accepts_in_context (d->states[s].context, CTX_NONE, s, d))
2970 d->success[s] |= CTX_NONE;
2971
2972 alloc_position_set (&follows, d->nleaves);
2973
2974 /* Find the union of the follows of the positions of the group.
2975 This is a hideously inefficient loop. Fix it someday. */
2976 for (idx_t j = 0; j < d->states[s].elems.nelem; j++)
2977 for (idx_t k = 0;
2978 k < d->follows[d->states[s].elems.elems[j].index].nelem; ++k)
2979 insert (d->follows[d->states[s].elems.elems[j].index].elems[k],
2980 &follows);
2981
2982 /* Positions that match the input char. */
2983 alloc_position_set (&group, d->nleaves);
2984
2985 /* The group's label. */
2986 charclass label;
2987 fillset (&label);
2988
2989 for (idx_t i = 0; i < follows.nelem; i++)
2990 {
2991 charclass matches; /* Set of matching characters. */
2992 position pos = follows.elems[i];
2993 bool matched = false;
2994 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
2995 {
2996 zeroset (&matches);
2997 setbit (d->tokens[pos.index], &matches);
2998 if (d->tokens[pos.index] == uc)
2999 matched = true;
3000 }
3001 else if (d->tokens[pos.index] >= CSET)
3002 {
3003 matches = d->charclasses[d->tokens[pos.index] - CSET];
3004 if (tstbit (uc, &matches))
3005 matched = true;
3006 }
3007 else if (d->tokens[pos.index] == ANYCHAR)
3008 {
3009 matches = d->charclasses[d->canychar];
3010 if (tstbit (uc, &matches))
3011 matched = true;
3012
3013 /* ANYCHAR must match with a single character, so we must put
3014 it to D->states[s].mbps which contains the positions which
3015 can match with a single character not a byte. If all
3016 positions which has ANYCHAR does not depend on context of
3017 next character, we put the follows instead of it to
3018 D->states[s].mbps to optimize. */
3019 if (succeeds_in_context (pos.constraint, d->states[s].context,
3020 CTX_NONE))
3021 {
3022 if (d->states[s].mbps.nelem == 0)
3023 alloc_position_set (&d->states[s].mbps, 1);
3024 insert (pos, &d->states[s].mbps);
3025 }
3026 }
3027 else
3028 continue;
3029
3030 /* Some characters may need to be eliminated from matches because
3031 they fail in the current context. */
3032 if (pos.constraint != NO_CONSTRAINT)
3033 {
3034 if (!succeeds_in_context (pos.constraint,
3035 d->states[s].context, CTX_NEWLINE))
3036 for (int j = 0; j < CHARCLASS_WORDS; j++)
3037 matches.w[j] &= ~d->syntax.newline.w[j];
3038 if (!succeeds_in_context (pos.constraint,
3039 d->states[s].context, CTX_LETTER))
3040 for (int j = 0; j < CHARCLASS_WORDS; ++j)
3041 matches.w[j] &= ~d->syntax.letters.w[j];
3042 if (!succeeds_in_context (pos.constraint,
3043 d->states[s].context, CTX_NONE))
3044 for (int j = 0; j < CHARCLASS_WORDS; ++j)
3045 matches.w[j] &= d->syntax.letters.w[j] | d->syntax.newline.w[j];
3046
3047 /* If there are no characters left, there's no point in going on. */
3048 if (emptyset (&matches))
3049 continue;
3050
3051 /* If we have reset the bit that made us declare "matched", reset
3052 that indicator, too. This is required to avoid an infinite loop
3053 with this command: echo cx | LC_ALL=C grep -E 'c\b[x ]' */
3054 if (!tstbit (uc, &matches))
3055 matched = false;
3056 }
3057
3058#ifdef DEBUG
3059 fprintf (stderr, " nextpos %td:", pos.index);
3060 prtok (d->tokens[pos.index]);
3061 fprintf (stderr, " of");
3062 for (unsigned j = 0; j < NOTCHAR; j++)
3063 if (tstbit (j, &matches))
3064 fprintf (stderr, " 0x%02x", j);
3065 fprintf (stderr, "\n");
3066#endif
3067
3068 if (matched)
3069 {
3070 for (int k = 0; k < CHARCLASS_WORDS; ++k)
3071 label.w[k] &= matches.w[k];
3072 append (pos, &group);
3073 }
3074 else
3075 {
3076 for (int k = 0; k < CHARCLASS_WORDS; ++k)
3077 label.w[k] &= ~matches.w[k];
3078 }
3079 }
3080
3081 alloc_position_set (&tmp, d->nleaves);
3082
3083 if (group.nelem > 0)
3084 {
3085 /* If we are building a searching matcher, throw in the positions
3086 of state 0 as well, if possible. */
3087 if (d->searchflag)
3088 {
3089 /* If a token in follows.elems is not 1st byte of a multibyte
3090 character, or the states of follows must accept the bytes
3091 which are not 1st byte of the multibyte character.
3092 Then, if a state of follows encounters a byte, it must not be
3093 a 1st byte of a multibyte character nor a single byte character.
3094 In this case, do not add state[0].follows to next state, because
3095 state[0] must accept 1st-byte.
3096
3097 For example, suppose <sb a> is a certain single byte character,
3098 <mb A> is a certain multibyte character, and the codepoint of
3099 <sb a> equals the 2nd byte of the codepoint of <mb A>. When
3100 state[0] accepts <sb a>, state[i] transits to state[i+1] by
3101 accepting the 1st byte of <mb A>, and state[i+1] accepts the
3102 2nd byte of <mb A>, if state[i+1] encounters the codepoint of
3103 <sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do
3104 not add state[0]. */
3105
3106 bool mergeit = !d->localeinfo.multibyte;
3107 if (!mergeit)
3108 {
3109 mergeit = true;
3110 for (idx_t j = 0; mergeit && j < group.nelem; j++)
3111 mergeit &= d->multibyte_prop[group.elems[j].index];
3112 }
3113 if (mergeit)
3114 merge2 (&group, &d->states[0].elems, &tmp);
3115 }
3116
3117 /* Find out if the new state will want any context information,
3118 by calculating possible contexts that the group can match,
3119 and separate contexts that the new state wants to know. */
3120 int possible_contexts = charclass_context (d, &label);
3121 int separate_contexts = state_separate_contexts (d, &group);
3122
3123 /* Find the state(s) corresponding to the union of the follows. */
3124 if (possible_contexts & ~separate_contexts)
3125 state = state_index (d, &group, separate_contexts ^ CTX_ANY);
3126 else
3127 state = -1;
3128 if (separate_contexts & possible_contexts & CTX_NEWLINE)
3129 state_newline = state_index (d, &group, CTX_NEWLINE);
3130 else
3131 state_newline = state;
3132 if (separate_contexts & possible_contexts & CTX_LETTER)
3133 state_letter = state_index (d, &group, CTX_LETTER);
3134 else
3135 state_letter = state;
3136
3137 /* Reallocate now, to reallocate any newline transition properly. */
3138 realloc_trans_if_necessary (d);
3139 }
3140
3141 /* If we are a searching matcher, the default transition is to a state
3142 containing the positions of state 0, otherwise the default transition
3143 is to fail miserably. */
3144 else if (d->searchflag)
3145 {
3146 state_newline = 0;
3147 state_letter = d->min_trcount - 1;
3148 state = d->initstate_notbol;
3149 }
3150 else
3151 {
3152 state_newline = -1;
3153 state_letter = -1;
3154 state = -1;
3155 }
3156
3157 /* Set the transitions for each character in the label. */
3158 for (int i = 0; i < NOTCHAR; i++)
3159 if (tstbit (i, &label))
3160 switch (d->syntax.sbit[i])
3161 {
3162 case CTX_NEWLINE:
3163 trans[i] = state_newline;
3164 break;
3165 case CTX_LETTER:
3166 trans[i] = state_letter;
3167 break;
3168 default:
3169 trans[i] = state;
3170 break;
3171 }
3172
3173#ifdef DEBUG
3174 fprintf (stderr, "trans table %td", s);
3175 for (int i = 0; i < NOTCHAR; ++i)
3176 {
3177 if (!(i & 0xf))
3178 fprintf (stderr, "\n");
3179 fprintf (stderr, " %2td", trans[i]);
3180 }
3181 fprintf (stderr, "\n");
3182#endif
3183
3184 free (group.elems);
3185 free (follows.elems);
3186 free (tmp.elems);
3187
3188 /* Keep the newline transition in a special place so we can use it as
3189 a sentinel. */
3190 if (tstbit (d->syntax.eolbyte, &label))
3191 {
3192 d->newlines[s] = trans[d->syntax.eolbyte];
3193 trans[d->syntax.eolbyte] = -1;
3194 }
3195
3196 return trans[uc];
3197}
3198
3199/* Multibyte character handling sub-routines for dfaexec. */
3200
3201/* Consume a single byte and transit state from 's' to '*next_state'.
3202 This function is almost same as the state transition routin in dfaexec.
3203 But state transition is done just once, otherwise matching succeed or
3204 reach the end of the buffer. */
3205static state_num
3206transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
3207{
3208 state_num *t;
3209
3210 if (d->trans[s])
3211 t = d->trans[s];
3212 else if (d->fails[s])
3213 t = d->fails[s];
3214 else
3215 {
3216 build_state (s, d, **pp);
3217 if (d->trans[s])
3218 t = d->trans[s];
3219 else
3220 {
3221 t = d->fails[s];
3222 assert (t);
3223 }
3224 }
3225
3226 if (t[**pp] == -2)
3227 build_state (s, d, **pp);
3228
3229 return t[*(*pp)++];
3230}
3231
3232/* Transit state from s, then return new state and update the pointer of
3233 the buffer. This function is for a period operator which can match a
3234 multi-byte character. */
3235static state_num
3236transit_state (struct dfa *d, state_num s, unsigned char const **pp,
3237 unsigned char const *end)
3238{
3239 wint_t wc;
3240
3241 int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
3242
3243 /* This state has some operators which can match a multibyte character. */
3244 d->mb_follows.nelem = 0;
3245
3246 /* Calculate the state which can be reached from the state 's' by
3247 consuming 'mbclen' single bytes from the buffer. */
3248 state_num s1 = s;
3249 int mbci;
3250 for (mbci = 0; mbci < mbclen && (mbci == 0 || d->min_trcount <= s); mbci++)
3251 s = transit_state_singlebyte (d, s, pp);
3252 *pp += mbclen - mbci;
3253
3254 if (wc == WEOF)
3255 {
3256 /* It is an invalid character, so ANYCHAR is not accepted. */
3257 return s;
3258 }
3259
3260 /* If all positions which have ANYCHAR do not depend on the context
3261 of the next character, calculate the next state with
3262 pre-calculated follows and cache the result. */
3263 if (d->states[s1].mb_trindex < 0)
3264 {
3265 if (MAX_TRCOUNT <= d->mb_trcount)
3266 {
3267 state_num s3;
3268 for (s3 = -1; s3 < d->tralloc; s3++)
3269 {
3270 free (d->mb_trans[s3]);
3271 d->mb_trans[s3] = NULL;
3272 }
3273
3274 for (state_num i = 0; i < d->sindex; i++)
3275 d->states[i].mb_trindex = -1;
3276 d->mb_trcount = 0;
3277 }
3278 d->states[s1].mb_trindex = d->mb_trcount++;
3279 }
3280
3281 if (! d->mb_trans[s])
3282 {
3283 enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
3284 enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
3285 d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
3286 for (int i = 0; i < MAX_TRCOUNT; i++)
3287 d->mb_trans[s][i] = -1;
3288 }
3289 else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
3290 return d->mb_trans[s][d->states[s1].mb_trindex];
3291
3292 if (s == -1)
3293 copy (&d->states[s1].mbps, &d->mb_follows);
3294 else
3295 merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
3296
3297 int separate_contexts = state_separate_contexts (d, &d->mb_follows);
3298 state_num s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
3299 realloc_trans_if_necessary (d);
3300
3301 d->mb_trans[s][d->states[s1].mb_trindex] = s2;
3302
3303 return s2;
3304}
3305
3306/* The initial state may encounter a byte which is not a single byte character
3307 nor the first byte of a multibyte character. But it is incorrect for the
3308 initial state to accept such a byte. For example, in Shift JIS the regular
3309 expression "\\" accepts the codepoint 0x5c, but should not accept the second
3310 byte of the codepoint 0x815c. Then the initial state must skip the bytes
3311 that are not a single byte character nor the first byte of a multibyte
3312 character.
3313
3314 Given DFA state d, use mbs_to_wchar to advance MBP until it reaches
3315 or exceeds P, and return the advanced MBP. If WCP is non-NULL and
3316 the result is greater than P, set *WCP to the final wide character
3317 processed, or to WEOF if no wide character is processed. Otherwise,
3318 if WCP is non-NULL, *WCP may or may not be updated.
3319
3320 Both P and MBP must be no larger than END. */
3321static unsigned char const *
3322skip_remains_mb (struct dfa *d, unsigned char const *p,
3323 unsigned char const *mbp, char const *end)
3324{
3325 if (d->syntax.never_trail[*p])
3326 return p;
3327 while (mbp < p)
3328 {
3329 wint_t wc;
3330 mbp += mbs_to_wchar (&wc, (char const *) mbp,
3331 end - (char const *) mbp, d);
3332 }
3333 return mbp;
3334}
3335
3336/* Search through a buffer looking for a match to the struct dfa *D.
3337 Find the first occurrence of a string matching the regexp in the
3338 buffer, and the shortest possible version thereof. Return a pointer to
3339 the first character after the match, or NULL if none is found. BEGIN
3340 points to the beginning of the buffer, and END points to the first byte
3341 after its end. Note however that we store a sentinel byte (usually
3342 newline) in *END, so the actual buffer must be one byte longer.
3343 When ALLOW_NL, newlines may appear in the matching string.
3344 If COUNT is non-NULL, increment *COUNT once for each newline processed.
3345 If MULTIBYTE, the input consists of multibyte characters and/or
3346 encoding-error bytes. Otherwise, it consists of single-byte characters.
3347 Here is the list of features that make this DFA matcher punt:
3348 - [M-N] range in non-simple locale: regex is up to 25% faster on [a-z]
3349 - [^...] in non-simple locale
3350 - [[=foo=]] or [[.foo.]]
3351 - [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
3352 - back-reference: (.)\1
3353 - word-delimiter in multibyte locale: \<, \>, \b, \B
3354 See struct localeinfo.simple for the definition of "simple locale". */
3355
3356static inline char *
3357dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
3358 ptrdiff_t *count, bool multibyte)
3359{
3360 if (MAX_TRCOUNT <= d->sindex)
3361 {
3362 for (state_num s = d->min_trcount; s < d->sindex; s++)
3363 {
3364 free (d->states[s].elems.elems);
3365 free (d->states[s].mbps.elems);
3366 }
3367 d->sindex = d->min_trcount;
3368
3369 if (d->trans)
3370 {
3371 for (state_num s = 0; s < d->tralloc; s++)
3372 {
3373 free (d->trans[s]);
3374 free (d->fails[s]);
3375 d->trans[s] = d->fails[s] = NULL;
3376 }
3377 d->trcount = 0;
3378 }
3379
3380 if (d->localeinfo.multibyte && d->mb_trans)
3381 {
3382 for (state_num s = -1; s < d->tralloc; s++)
3383 {
3384 free (d->mb_trans[s]);
3385 d->mb_trans[s] = NULL;
3386 }
3387 for (state_num s = 0; s < d->min_trcount; s++)
3388 d->states[s].mb_trindex = -1;
3389 d->mb_trcount = 0;
3390 }
3391 }
3392
3393 if (!d->tralloc)
3394 realloc_trans_if_necessary (d);
3395
3396 /* Current state. */
3397 state_num s = 0, s1 = 0;
3398
3399 /* Current input character. */
3400 unsigned char const *p = (unsigned char const *) begin;
3401 unsigned char const *mbp = p;
3402
3403 /* Copy of d->trans so it can be optimized into a register. */
3404 state_num **trans = d->trans;
3405 unsigned char eol = d->syntax.eolbyte; /* Likewise for eolbyte. */
3406 unsigned char saved_end = *(unsigned char *) end;
3407 *end = eol;
3408
3409 if (multibyte)
3410 {
3411 memset (&d->mbs, 0, sizeof d->mbs);
3412 if (d->mb_follows.alloc == 0)
3413 alloc_position_set (&d->mb_follows, d->nleaves);
3414 }
3415
3416 idx_t nlcount = 0;
3417 for (;;)
3418 {
3419 state_num *t;
3420 while ((t = trans[s]) != NULL)
3421 {
3422 if (s < d->min_trcount)
3423 {
3424 if (!multibyte || d->states[s].mbps.nelem == 0)
3425 {
3426 while (t[*p] == s)
3427 p++;
3428 }
3429 if (multibyte)
3430 p = mbp = skip_remains_mb (d, p, mbp, end);
3431 }
3432
3433 if (multibyte)
3434 {
3435 s1 = s;
3436
3437 if (d->states[s].mbps.nelem == 0
3438 || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3439 {
3440 /* If an input character does not match ANYCHAR, do it
3441 like a single-byte character. */
3442 s = t[*p++];
3443 }
3444 else
3445 {
3446 s = transit_state (d, s, &p, (unsigned char *) end);
3447 mbp = p;
3448 trans = d->trans;
3449 }
3450 }
3451 else
3452 {
3453 s1 = t[*p++];
3454 t = trans[s1];
3455 if (! t)
3456 {
3457 state_num tmp = s;
3458 s = s1;
3459 s1 = tmp; /* swap */
3460 break;
3461 }
3462 if (s < d->min_trcount)
3463 {
3464 while (t[*p] == s1)
3465 p++;
3466 }
3467 s = t[*p++];
3468 }
3469 }
3470
3471 if (s < 0)
3472 {
3473 if (s == -2)
3474 {
3475 s = build_state (s1, d, p[-1]);
3476 trans = d->trans;
3477 }
3478 else if ((char *) p <= end && p[-1] == eol && 0 <= d->newlines[s1])
3479 {
3480 /* The previous character was a newline. Count it, and skip
3481 checking of multibyte character boundary until here. */
3482 nlcount++;
3483 mbp = p;
3484
3485 s = (allow_nl ? d->newlines[s1]
3486 : d->syntax.sbit[eol] == CTX_NEWLINE ? 0
3487 : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
3488 : d->initstate_notbol);
3489 }
3490 else
3491 {
3492 p = NULL;
3493 goto done;
3494 }
3495 }
3496 else if (d->fails[s])
3497 {
3498 if ((d->success[s] & d->syntax.sbit[*p])
3499 || ((char *) p == end
3500 && accepts_in_context (d->states[s].context, CTX_NEWLINE, s,
3501 d)))
3502 goto done;
3503
3504 if (multibyte && s < d->min_trcount)
3505 p = mbp = skip_remains_mb (d, p, mbp, end);
3506
3507 s1 = s;
3508 if (!multibyte || d->states[s].mbps.nelem == 0
3509 || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3510 {
3511 /* If a input character does not match ANYCHAR, do it
3512 like a single-byte character. */
3513 s = d->fails[s][*p++];
3514 }
3515 else
3516 {
3517 s = transit_state (d, s, &p, (unsigned char *) end);
3518 mbp = p;
3519 trans = d->trans;
3520 }
3521 }
3522 else
3523 {
3524 build_state (s, d, p[0]);
3525 trans = d->trans;
3526 }
3527 }
3528
3529 done:
3530 if (count)
3531 *count += nlcount;
3532 *end = saved_end;
3533 return (char *) p;
3534}
3535
3536/* Specialized versions of dfaexec for multibyte and single-byte cases.
3537 This is for performance, as dfaexec_main is an inline function. */
3538
3539static char *
3540dfaexec_mb (struct dfa *d, char const *begin, char *end,
3541 bool allow_nl, ptrdiff_t *count, bool *backref)
3542{
3543 return dfaexec_main (d, begin, end, allow_nl, count, true);
3544}
3545
3546static char *
3547dfaexec_sb (struct dfa *d, char const *begin, char *end,
3548 bool allow_nl, ptrdiff_t *count, bool *backref)
3549{
3550 return dfaexec_main (d, begin, end, allow_nl, count, false);
3551}
3552
3553/* Always set *BACKREF and return BEGIN. Use this wrapper for
3554 any regexp that uses a construct not supported by this code. */
3555static char *
3556dfaexec_noop (struct dfa *d, char const *begin, char *end,
3557 bool allow_nl, ptrdiff_t *count, bool *backref)
3558{
3559 *backref = true;
3560 return (char *) begin;
3561}
3562
3563/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
3564 but faster and set *BACKREF if the DFA code does not support this
3565 regexp usage. */
3566
3567char *
3568dfaexec (struct dfa *d, char const *begin, char *end,
3569 bool allow_nl, ptrdiff_t *count, bool *backref)
3570{
3571 return d->dfaexec (d, begin, end, allow_nl, count, backref);
3572}
3573
3574struct dfa *
3575dfasuperset (struct dfa const *d)
3576{
3577 return d->superset;
3578}
3579
3580bool
3581dfaisfast (struct dfa const *d)
3582{
3583 return d->fast;
3584}
3585
3586static void
3587free_mbdata (struct dfa *d)
3588{
3589 free (d->multibyte_prop);
3590 free (d->lex.brack.chars);
3591 free (d->mb_follows.elems);
3592
3593 if (d->mb_trans)
3594 {
3595 state_num s;
3596 for (s = -1; s < d->tralloc; s++)
3597 free (d->mb_trans[s]);
3598 free (d->mb_trans - 2);
3599 }
3600}
3601
3602/* Return true if every construct in D is supported by this DFA matcher. */
3603bool
3604dfasupported (struct dfa const *d)
3605{
3606 for (idx_t i = 0; i < d->tindex; i++)
3607 {
3608 switch (d->tokens[i])
3609 {
3610 case BEGWORD:
3611 case ENDWORD:
3612 case LIMWORD:
3613 case NOTLIMWORD:
3614 if (!d->localeinfo.multibyte)
3615 continue;
3616 FALLTHROUGH;
3617 case BACKREF:
3618 case MBCSET:
3619 return false;
3620 }
3621 }
3622 return true;
3623}
3624
3625/* Disable use of the superset DFA if it is not likely to help
3626 performance. */
3627static void
3628maybe_disable_superset_dfa (struct dfa *d)
3629{
3630 if (!d->localeinfo.using_utf8)
3631 return;
3632
3633 bool have_backref = false;
3634 for (idx_t i = 0; i < d->tindex; i++)
3635 {
3636 switch (d->tokens[i])
3637 {
3638 case ANYCHAR:
3639 /* Lowered. */
3640 abort ();
3641 case BACKREF:
3642 have_backref = true;
3643 break;
3644 case MBCSET:
3645 /* Requires multi-byte algorithm. */
3646 return;
3647 default:
3648 break;
3649 }
3650 }
3651
3652 if (!have_backref && d->superset)
3653 {
3654 /* The superset DFA is not likely to be much faster, so remove it. */
3655 dfafree (d->superset);
3656 free (d->superset);
3657 d->superset = NULL;
3658 }
3659
3660 free_mbdata (d);
3661 d->localeinfo.multibyte = false;
3662 d->dfaexec = dfaexec_sb;
3663 d->fast = true;
3664}
3665
3666static void
3667dfassbuild (struct dfa *d)
3668{
3669 struct dfa *sup = dfaalloc ();
3670
3671 *sup = *d;
3672 sup->localeinfo.multibyte = false;
3673 sup->dfaexec = dfaexec_sb;
3674 sup->multibyte_prop = NULL;
3675 sup->superset = NULL;
3676 sup->states = NULL;
3677 sup->sindex = 0;
3678 sup->constraints = NULL;
3679 sup->separates = NULL;
3680 sup->follows = NULL;
3681 sup->tralloc = 0;
3682 sup->trans = NULL;
3683 sup->fails = NULL;
3684 sup->success = NULL;
3685 sup->newlines = NULL;
3686
3687 sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
3688 if (d->cindex)
3689 {
3690 memcpy (sup->charclasses, d->charclasses,
3691 d->cindex * sizeof *sup->charclasses);
3692 }
3693
3694 sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
3695 sup->talloc = d->tindex * 2;
3696
3697 bool have_achar = false;
3698 bool have_nchar = false;
3699 idx_t j;
3700 for (idx_t i = j = 0; i < d->tindex; i++)
3701 {
3702 switch (d->tokens[i])
3703 {
3704 case ANYCHAR:
3705 case MBCSET:
3706 case BACKREF:
3707 {
3708 charclass ccl;
3709 fillset (&ccl);
3710 sup->tokens[j++] = CSET + charclass_index (sup, &ccl);
3711 sup->tokens[j++] = STAR;
3712 if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
3713 || d->tokens[i + 1] == PLUS)
3714 i++;
3715 have_achar = true;
3716 }
3717 break;
3718 case BEGWORD:
3719 case ENDWORD:
3720 case LIMWORD:
3721 case NOTLIMWORD:
3722 if (d->localeinfo.multibyte)
3723 {
3724 /* These constraints aren't supported in a multibyte locale.
3725 Ignore them in the superset DFA. */
3726 sup->tokens[j++] = EMPTY;
3727 break;
3728 }
3729 FALLTHROUGH;
3730 default:
3731 sup->tokens[j++] = d->tokens[i];
3732 if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
3733 || d->tokens[i] >= CSET)
3734 have_nchar = true;
3735 break;
3736 }
3737 }
3738 sup->tindex = j;
3739
3740 if (have_nchar && (have_achar || d->localeinfo.multibyte))
3741 d->superset = sup;
3742 else
3743 {
3744 dfafree (sup);
3745 free (sup);
3746 }
3747}
3748
3749/* Parse a string S of length LEN into D (but skip this step if S is null).
3750 Then analyze D and build a matcher for it.
3751 SEARCHFLAG says whether to build a searching or an exact matcher. */
3752void
3753dfacomp (char const *s, idx_t len, struct dfa *d, bool searchflag)
3754{
3755 if (s != NULL)
3756 dfaparse (s, len, d);
3757
3758 dfassbuild (d);
3759
3760 if (dfasupported (d))
3761 {
3762 maybe_disable_superset_dfa (d);
3763 dfaanalyze (d, searchflag);
3764 }
3765 else
3766 {
3767 d->dfaexec = dfaexec_noop;
3768 }
3769
3770 if (d->superset)
3771 {
3772 d->fast = true;
3773 dfaanalyze (d->superset, searchflag);
3774 }
3775}
3776
3777/* Free the storage held by the components of a dfa. */
3778void
3779dfafree (struct dfa *d)
3780{
3781 free (d->charclasses);
3782 free (d->tokens);
3783
3784 if (d->localeinfo.multibyte)
3785 free_mbdata (d);
3786
3787 free (d->constraints);
3788 free (d->separates);
3789
3790 for (idx_t i = 0; i < d->sindex; i++)
3791 {
3792 free (d->states[i].elems.elems);
3793 free (d->states[i].mbps.elems);
3794 }
3795 free (d->states);
3796
3797 if (d->follows)
3798 {
3799 for (idx_t i = 0; i < d->tindex; i++)
3800 free (d->follows[i].elems);
3801 free (d->follows);
3802 }
3803
3804 if (d->trans)
3805 {
3806 for (idx_t i = 0; i < d->tralloc; i++)
3807 {
3808 free (d->trans[i]);
3809 free (d->fails[i]);
3810 }
3811
3812 free (d->trans - 2);
3813 free (d->fails);
3814 free (d->newlines);
3815 free (d->success);
3816 }
3817
3818 if (d->superset)
3819 {
3820 dfafree (d->superset);
3821 free (d->superset);
3822 }
3823}
3824
3825/* Having found the postfix representation of the regular expression,
3826 try to find a long sequence of characters that must appear in any line
3827 containing the r.e.
3828 Finding a "longest" sequence is beyond the scope here;
3829 we take an easy way out and hope for the best.
3830 (Take "(ab|a)b"--please.)
3831
3832 We do a bottom-up calculation of sequences of characters that must appear
3833 in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3834 representation:
3835 sequences that must appear at the left of the match ("left")
3836 sequences that must appear at the right of the match ("right")
3837 lists of sequences that must appear somewhere in the match ("in")
3838 sequences that must constitute the match ("is")
3839
3840 When we get to the root of the tree, we use one of the longest of its
3841 calculated "in" sequences as our answer.
3842
3843 The sequences calculated for the various types of node (in pseudo ANSI c)
3844 are shown below. "p" is the operand of unary operators (and the left-hand
3845 operand of binary operators); "q" is the right-hand operand of binary
3846 operators.
3847
3848 "ZERO" means "a zero-length sequence" below.
3849
3850 Type left right is in
3851 ---- ---- ----- -- --
3852 char c # c # c # c # c
3853
3854 ANYCHAR ZERO ZERO ZERO ZERO
3855
3856 MBCSET ZERO ZERO ZERO ZERO
3857
3858 CSET ZERO ZERO ZERO ZERO
3859
3860 STAR ZERO ZERO ZERO ZERO
3861
3862 QMARK ZERO ZERO ZERO ZERO
3863
3864 PLUS p->left p->right ZERO p->in
3865
3866 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
3867 p->left : q->right : q->is!=ZERO) ? q->in plus
3868 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
3869 ZERO
3870
3871 OR longest common longest common (do p->is and substrings common
3872 leading trailing to q->is have same p->in and
3873 (sub)sequence (sub)sequence q->in length and content) ?
3874 of p->left of p->right
3875 and q->left and q->right p->is : NULL
3876
3877 If there's anything else we recognize in the tree, all four sequences get set
3878 to zero-length sequences. If there's something we don't recognize in the
3879 tree, we just return a zero-length sequence.
3880
3881 Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3882 'aaa')?
3883
3884 And ... is it here or someplace that we might ponder "optimizations" such as
3885 egrep 'psi|epsilon' -> egrep 'psi'
3886 egrep 'pepsi|epsilon' -> egrep 'epsi'
3887 (Yes, we now find "epsi" as a "string
3888 that must occur", but we might also
3889 simplify the *entire* r.e. being sought)
3890 grep '[c]' -> grep 'c'
3891 grep '(ab|a)b' -> grep 'ab'
3892 grep 'ab*' -> grep 'a'
3893 grep 'a*b' -> grep 'b'
3894
3895 There are several issues:
3896
3897 Is optimization easy (enough)?
3898
3899 Does optimization actually accomplish anything,
3900 or is the automaton you get from "psi|epsilon" (for example)
3901 the same as the one you get from "psi" (for example)?
3902
3903 Are optimizable r.e.'s likely to be used in real-life situations
3904 (something like 'ab*' is probably unlikely; something like is
3905 'psi|epsilon' is likelier)? */
3906
3907static char *
3908icatalloc (char *old, char const *new)
3909{
3910 idx_t newsize = strlen (new);
3911 if (newsize == 0)
3912 return old;
3913 idx_t oldsize = strlen (old);
3914 char *result = xirealloc (old, oldsize + newsize + 1);
3915 memcpy (result + oldsize, new, newsize + 1);
3916 return result;
3917}
3918
3919static void
3920freelist (char **cpp)
3921{
3922 while (*cpp)
3923 free (*cpp++);
3924}
3925
3926static char **
3927enlistnew (char **cpp, char *new)
3928{
3929 /* Is there already something in the list that's new (or longer)? */
3930 idx_t i;
3931 for (i = 0; cpp[i] != NULL; i++)
3932 if (strstr (cpp[i], new) != NULL)
3933 {
3934 free (new);
3935 return cpp;
3936 }
3937 /* Eliminate any obsoleted strings. */
3938 for (idx_t j = 0; cpp[j] != NULL; )
3939 if (strstr (new, cpp[j]) == NULL)
3940 ++j;
3941 else
3942 {
3943 free (cpp[j]);
3944 if (--i == j)
3945 break;
3946 cpp[j] = cpp[i];
3947 cpp[i] = NULL;
3948 }
3949 /* Add the new string. */
3950 cpp = xreallocarray (cpp, i + 2, sizeof *cpp);
3951 cpp[i] = new;
3952 cpp[i + 1] = NULL;
3953 return cpp;
3954}
3955
3956static char **
3957enlist (char **cpp, char const *str, idx_t len)
3958{
3959 return enlistnew (cpp, ximemdup0 (str, len));
3960}
3961
3962/* Given pointers to two strings, return a pointer to an allocated
3963 list of their distinct common substrings. */
3964static char **
3965comsubs (char *left, char const *right)
3966{
3967 char **cpp = xzalloc (sizeof *cpp);
3968
3969 for (char *lcp = left; *lcp != '\0'; lcp++)
3970 {
3971 idx_t len = 0;
3972 char *rcp = strchr (right, *lcp);
3973 while (rcp != NULL)
3974 {
3975 idx_t i;
3976 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3977 continue;
3978 if (i > len)
3979 len = i;
3980 rcp = strchr (rcp + 1, *lcp);
3981 }
3982 if (len != 0)
3983 cpp = enlist (cpp, lcp, len);
3984 }
3985 return cpp;
3986}
3987
3988static char **
3989addlists (char **old, char **new)
3990{
3991 for (; *new; new++)
3992 old = enlistnew (old, xstrdup (*new));
3993 return old;
3994}
3995
3996/* Given two lists of substrings, return a new list giving substrings
3997 common to both. */
3998static char **
3999inboth (char **left, char **right)
4000{
4001 char **both = xzalloc (sizeof *both);
4002
4003 for (idx_t lnum = 0; left[lnum] != NULL; lnum++)
4004 {
4005 for (idx_t rnum = 0; right[rnum] != NULL; rnum++)
4006 {
4007 char **temp = comsubs (left[lnum], right[rnum]);
4008 both = addlists (both, temp);
4009 freelist (temp);
4010 free (temp);
4011 }
4012 }
4013 return both;
4014}
4015
4016typedef struct must must;
4017
4018struct must
4019{
4020 char **in;
4021 char *left;
4022 char *right;
4023 char *is;
4024 bool begline;
4025 bool endline;
4026 must *prev;
4027};
4028
4029static must *
4030allocmust (must *mp, idx_t size)
4031{
4032 must *new_mp = xmalloc (sizeof *new_mp);
4033 new_mp->in = xzalloc (sizeof *new_mp->in);
4034 new_mp->left = xizalloc (size);
4035 new_mp->right = xizalloc (size);
4036 new_mp->is = xizalloc (size);
4037 new_mp->begline = false;
4038 new_mp->endline = false;
4039 new_mp->prev = mp;
4040 return new_mp;
4041}
4042
4043static void
4044resetmust (must *mp)
4045{
4046 freelist (mp->in);
4047 mp->in[0] = NULL;
4048 mp->left[0] = mp->right[0] = mp->is[0] = '\0';
4049 mp->begline = false;
4050 mp->endline = false;
4051}
4052
4053static void
4054freemust (must *mp)
4055{
4056 freelist (mp->in);
4057 free (mp->in);
4058 free (mp->left);
4059 free (mp->right);
4060 free (mp->is);
4061 free (mp);
4062}
4063
4064struct dfamust *
4065dfamust (struct dfa const *d)
4066{
4067 must *mp = NULL;
4068 char const *result = "";
4069 bool exact = false;
4070 bool begline = false;
4071 bool endline = false;
4072 bool need_begline = false;
4073 bool need_endline = false;
4074 bool case_fold_unibyte = d->syntax.case_fold & !d->localeinfo.multibyte;
4075
4076 for (idx_t ri = 1; ri + 1 < d->tindex; ri++)
4077 {
4078 token t = d->tokens[ri];
4079 switch (t)
4080 {
4081 case BEGLINE:
4082 mp = allocmust (mp, 2);
4083 mp->begline = true;
4084 need_begline = true;
4085 break;
4086 case ENDLINE:
4087 mp = allocmust (mp, 2);
4088 mp->endline = true;
4089 need_endline = true;
4090 break;
4091 case LPAREN:
4092 case RPAREN:
4093 assert (!"neither LPAREN nor RPAREN may appear here");
4094
4095 case EMPTY:
4096 case BEGWORD:
4097 case ENDWORD:
4098 case LIMWORD:
4099 case NOTLIMWORD:
4100 case BACKREF:
4101 case ANYCHAR:
4102 case MBCSET:
4103 mp = allocmust (mp, 2);
4104 break;
4105
4106 case STAR:
4107 case QMARK:
4108 assume_nonnull (mp);
4109 resetmust (mp);
4110 break;
4111
4112 case OR:
4113 {
4114 char **new;
4115 must *rmp = mp;
4116 assume_nonnull (rmp);
4117 must *lmp = mp = mp->prev;
4118 assume_nonnull (lmp);
4119 idx_t j, ln, rn, n;
4120
4121 /* Guaranteed to be. Unlikely, but ... */
4122 if (streq (lmp->is, rmp->is))
4123 {
4124 lmp->begline &= rmp->begline;
4125 lmp->endline &= rmp->endline;
4126 }
4127 else
4128 {
4129 lmp->is[0] = '\0';
4130 lmp->begline = false;
4131 lmp->endline = false;
4132 }
4133 /* Left side--easy */
4134 idx_t i = 0;
4135 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
4136 ++i;
4137 lmp->left[i] = '\0';
4138 /* Right side */
4139 ln = strlen (lmp->right);
4140 rn = strlen (rmp->right);
4141 n = ln;
4142 if (n > rn)
4143 n = rn;
4144 for (i = 0; i < n; ++i)
4145 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
4146 break;
4147 for (j = 0; j < i; ++j)
4148 lmp->right[j] = lmp->right[(ln - i) + j];
4149 lmp->right[j] = '\0';
4150 new = inboth (lmp->in, rmp->in);
4151 freelist (lmp->in);
4152 free (lmp->in);
4153 lmp->in = new;
4154 freemust (rmp);
4155 }
4156 break;
4157
4158 case PLUS:
4159 assume_nonnull (mp);
4160 mp->is[0] = '\0';
4161 break;
4162
4163 case END:
4164 assume_nonnull (mp);
4165 assert (!mp->prev);
4166 for (idx_t i = 0; mp->in[i] != NULL; i++)
4167 if (strlen (mp->in[i]) > strlen (result))
4168 result = mp->in[i];
4169 if (streq (result, mp->is))
4170 {
4171 if ((!need_begline || mp->begline) && (!need_endline
4172 || mp->endline))
4173 exact = true;
4174 begline = mp->begline;
4175 endline = mp->endline;
4176 }
4177 goto done;
4178
4179 case CAT:
4180 {
4181 must *rmp = mp;
4182 assume_nonnull (rmp);
4183 must *lmp = mp = mp->prev;
4184 assume_nonnull (lmp);
4185
4186 /* In. Everything in left, plus everything in
4187 right, plus concatenation of
4188 left's right and right's left. */
4189 lmp->in = addlists (lmp->in, rmp->in);
4190 if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
4191 {
4192 idx_t lrlen = strlen (lmp->right);
4193 idx_t rllen = strlen (rmp->left);
4194 char *tp = ximalloc (lrlen + rllen + 1);
4195 memcpy (tp + lrlen, rmp->left, rllen + 1);
4196 memcpy (tp, lmp->right, lrlen);
4197 lmp->in = enlistnew (lmp->in, tp);
4198 }
4199 /* Left-hand */
4200 if (lmp->is[0] != '\0')
4201 lmp->left = icatalloc (lmp->left, rmp->left);
4202 /* Right-hand */
4203 if (rmp->is[0] == '\0')
4204 lmp->right[0] = '\0';
4205 lmp->right = icatalloc (lmp->right, rmp->right);
4206 /* Guaranteed to be */
4207 if ((lmp->is[0] != '\0' || lmp->begline)
4208 && (rmp->is[0] != '\0' || rmp->endline))
4209 {
4210 lmp->is = icatalloc (lmp->is, rmp->is);
4211 lmp->endline = rmp->endline;
4212 }
4213 else
4214 {
4215 lmp->is[0] = '\0';
4216 lmp->begline = false;
4217 lmp->endline = false;
4218 }
4219 freemust (rmp);
4220 }
4221 break;
4222
4223 case '\0':
4224 /* Not on *my* shift. */
4225 goto done;
4226
4227 default:
4228 if (CSET <= t)
4229 {
4230 /* If T is a singleton, or if case-folding in a unibyte
4231 locale and T's members all case-fold to the same char,
4232 convert T to one of its members. Otherwise, do
4233 nothing further with T. */
4234 charclass *ccl = &d->charclasses[t - CSET];
4235 int j;
4236 for (j = 0; j < NOTCHAR; j++)
4237 if (tstbit (j, ccl))
4238 break;
4239 if (! (j < NOTCHAR))
4240 {
4241 mp = allocmust (mp, 2);
4242 break;
4243 }
4244 t = j;
4245 while (++j < NOTCHAR)
4246 if (tstbit (j, ccl)
4247 && ! (case_fold_unibyte
4248 && toupper (j) == toupper (t)))
4249 break;
4250 if (j < NOTCHAR)
4251 {
4252 mp = allocmust (mp, 2);
4253 break;
4254 }
4255 }
4256
4257 idx_t rj = ri + 2;
4258 if (d->tokens[ri + 1] == CAT)
4259 {
4260 for (; rj < d->tindex - 1; rj += 2)
4261 {
4262 if ((rj != ri && (d->tokens[rj] <= 0
4263 || NOTCHAR <= d->tokens[rj]))
4264 || d->tokens[rj + 1] != CAT)
4265 break;
4266 }
4267 }
4268 mp = allocmust (mp, ((rj - ri) >> 1) + 1);
4269 mp->is[0] = mp->left[0] = mp->right[0]
4270 = case_fold_unibyte ? toupper (t) : t;
4271
4272 idx_t i;
4273 for (i = 1; ri + 2 < rj; i++)
4274 {
4275 ri += 2;
4276 t = d->tokens[ri];
4277 mp->is[i] = mp->left[i] = mp->right[i]
4278 = case_fold_unibyte ? toupper (t) : t;
4279 }
4280 mp->is[i] = mp->left[i] = mp->right[i] = '\0';
4281 mp->in = enlist (mp->in, mp->is, i);
4282 break;
4283 }
4284 }
4285 done:;
4286
4287 struct dfamust *dm = NULL;
4288 if (*result)
4289 {
4290 dm = xmalloc (FLEXSIZEOF (struct dfamust, must, strlen (result) + 1));
4291 dm->exact = exact;
4292 dm->begline = begline;
4293 dm->endline = endline;
4294 strcpy (dm->must, result);
4295 }
4296
4297 while (mp)
4298 {
4299 must *prev = mp->prev;
4300 freemust (mp);
4301 mp = prev;
4302 }
4303
4304 return dm;
4305}
4306
4307void
4308dfamustfree (struct dfamust *dm)
4309{
4310 free (dm);
4311}
4312
4313struct dfa *
4314dfaalloc (void)
4315{
4316 return xmalloc (sizeof (struct dfa));
4317}
4318
4319/* Initialize DFA. */
4320void
4321dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
4322 reg_syntax_t bits, int dfaopts)
4323{
4324 memset (dfa, 0, offsetof (struct dfa, dfaexec));
4325 dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
4326 dfa->localeinfo = *linfo;
4327
4328 dfa->fast = !dfa->localeinfo.multibyte;
4329
4330 dfa->canychar = -1;
4331 dfa->syntax.syntax_bits_set = true;
4332 dfa->syntax.case_fold = (bits & RE_ICASE) != 0;
4333 dfa->syntax.anchor = (dfaopts & DFA_ANCHOR) != 0;
4334 dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
4335 dfa->syntax.syntax_bits = bits;
4336
4337 for (int i = CHAR_MIN; i <= CHAR_MAX; ++i)
4338 {
4339 unsigned char uc = i;
4340
4341 dfa->syntax.sbit[uc] = char_context (dfa, uc);
4342 switch (dfa->syntax.sbit[uc])
4343 {
4344 case CTX_LETTER:
4345 setbit (uc, &dfa->syntax.letters);
4346 break;
4347 case CTX_NEWLINE:
4348 setbit (uc, &dfa->syntax.newline);
4349 break;
4350 }
4351
4352 /* POSIX requires that the five bytes in "\n\r./" (including the
4353 terminating NUL) cannot occur inside a multibyte character. */
4354 dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
4355 ? (uc & 0xc0) != 0x80
4356 : strchr ("\n\r./", uc) != NULL);
4357 }
4358}
4359
4360/* Initialize TO by copying FROM's syntax settings. */
4361void
4362dfacopysyntax (struct dfa *to, struct dfa const *from)
4363{
4364 memset (to, 0, offsetof (struct dfa, syntax));
4365 to->canychar = -1;
4366 to->fast = from->fast;
4367 to->syntax = from->syntax;
4368 to->dfaexec = from->dfaexec;
4369 to->localeinfo = from->localeinfo;
4370}
4371
4372/* vim:set shiftwidth=2: */
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette