VirtualBox

source: kBuild/trunk/src/sed/lib/dfa.c@ 3657

Last change on this file since 3657 was 3657, checked in by bird, 6 months ago

sed/dfa.c: Workaround for Visual C++ 2022 (amd64) optimizer bug.

File size: 136.3 KB
Line 
1/* dfa.c - deterministic extended regexp routines for GNU
2 Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2022 Free Software
3 Foundation, Inc.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3, or (at your option)
8 any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc.,
18 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */
19
20/* Written June, 1988 by Mike Haertel
21 Modified July, 1988 by Arthur David Olson to assist BMG speedups */
22
23#include <config.h>
24
25#include "dfa.h"
26
27#include "flexmember.h"
28#include "idx.h"
29#include "verify.h"
30
31#include <assert.h>
32#include <ctype.h>
33#include <stdint.h>
34#include <stdio.h>
35#include <stdlib.h>
36#include <limits.h>
37#include <string.h>
38
39/* Pacify gcc -Wanalyzer-null-dereference in areas where GCC
40 understandably cannot deduce that the input comes from a
41 well-formed regular expression. There's little point to the
42 runtime overhead of 'assert' instead of 'assume_nonnull' when the
43 MMU will check anyway. */
44#define assume_nonnull(x) assume ((x) != NULL)
45
46static bool
47str_eq (char const *a, char const *b)
48{
49 return strcmp (a, b) == 0;
50}
51
52static bool
53c_isdigit (char c)
54{
55 return '0' <= c && c <= '9';
56}
57
58#include "gettext.h"
59#define _(str) gettext (str)
60
61#include <wchar.h>
62#include <wctype.h>
63
64#include "xalloc.h"
65#include "localeinfo.h"
66
67#ifndef FALLTHROUGH
68# if 201710L < __STDC_VERSION__
69# define FALLTHROUGH [[__fallthrough__]]
70# elif (__GNUC__ >= 7) || (__clang_major__ >= 10)
71# define FALLTHROUGH __attribute__ ((__fallthrough__))
72# else
73# define FALLTHROUGH ((void) 0)
74# endif
75#endif
76
77#ifndef MIN
78# define MIN(a,b) ((a) < (b) ? (a) : (b))
79#endif
80
81/* HPUX defines these as macros in sys/param.h. */
82#ifdef setbit
83# undef setbit
84#endif
85#ifdef clrbit
86# undef clrbit
87#endif
88
89/* For code that does not use Gnulib’s isblank module. */
90#if !defined isblank && !defined HAVE_ISBLANK && !defined GNULIB_ISBLANK
91# define isblank dfa_isblank
92static int
93isblank (int c)
94{
95 return c == ' ' || c == '\t';
96}
97#endif
98
99/* First integer value that is greater than any character code. */
100enum { NOTCHAR = 1 << CHAR_BIT };
101
102#ifdef UINT_LEAST64_MAX
103
104/* Number of bits used in a charclass word. */
105enum { CHARCLASS_WORD_BITS = 64 };
106
107/* This represents part of a character class. It must be unsigned and
108 at least CHARCLASS_WORD_BITS wide. Any excess bits are zero. */
109typedef uint_least64_t charclass_word;
110
111/* Part of a charclass initializer that represents 64 bits' worth of a
112 charclass, where LO and HI are the low and high-order 32 bits of
113 the 64-bit quantity. */
114# define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
115
116#else
117/* Fallbacks for pre-C99 hosts that lack 64-bit integers. */
118enum { CHARCLASS_WORD_BITS = 32 };
119typedef unsigned long charclass_word;
120# define CHARCLASS_PAIR(lo, hi) lo, hi
121#endif
122
123/* An initializer for a charclass whose 32-bit words are A through H. */
124#define CHARCLASS_INIT(a, b, c, d, e, f, g, h) \
125 {{ \
126 CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d), \
127 CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h) \
128 }}
129
130/* The maximum useful value of a charclass_word; all used bits are 1. */
131static charclass_word const CHARCLASS_WORD_MASK
132 = ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1;
133
134/* Number of words required to hold a bit for every character. */
135enum
136{
137 CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
138};
139
140/* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
141typedef struct { charclass_word w[CHARCLASS_WORDS]; } charclass;
142
143/* Convert a possibly-signed character to an unsigned character. This is
144 a bit safer than casting to unsigned char, since it catches some type
145 errors that the cast doesn't. */
146static unsigned char
147to_uchar (char ch)
148{
149 return ch;
150}
151
152/* Contexts tell us whether a character is a newline or a word constituent.
153 Word-constituent characters are those that satisfy iswalnum, plus '_'.
154 Each character has a single CTX_* value; bitmasks of CTX_* values denote
155 a particular character class.
156
157 A state also stores a context value, which is a bitmask of CTX_* values.
158 A state's context represents a set of characters that the state's
159 predecessors must match. For example, a state whose context does not
160 include CTX_LETTER will never have transitions where the previous
161 character is a word constituent. A state whose context is CTX_ANY
162 might have transitions from any character. */
163
164enum
165 {
166 CTX_NONE = 1,
167 CTX_LETTER = 2,
168 CTX_NEWLINE = 4,
169 CTX_ANY = 7
170 };
171
172/* Sometimes characters can only be matched depending on the surrounding
173 context. Such context decisions depend on what the previous character
174 was, and the value of the current (lookahead) character. Context
175 dependent constraints are encoded as 9-bit integers. Each bit that
176 is set indicates that the constraint succeeds in the corresponding
177 context.
178
179 bit 6-8 - valid contexts when next character is CTX_NEWLINE
180 bit 3-5 - valid contexts when next character is CTX_LETTER
181 bit 0-2 - valid contexts when next character is CTX_NONE
182
183 succeeds_in_context determines whether a given constraint
184 succeeds in a particular context. Prev is a bitmask of possible
185 context values for the previous character, curr is the (single-bit)
186 context value for the lookahead character. */
187static int
188newline_constraint (int constraint)
189{
190 return (constraint >> 6) & 7;
191}
192static int
193letter_constraint (int constraint)
194{
195 return (constraint >> 3) & 7;
196}
197static int
198other_constraint (int constraint)
199{
200 return constraint & 7;
201}
202
203static bool
204succeeds_in_context (int constraint, int prev, int curr)
205{
206 return !! (((curr & CTX_NONE ? other_constraint (constraint) : 0) \
207 | (curr & CTX_LETTER ? letter_constraint (constraint) : 0) \
208 | (curr & CTX_NEWLINE ? newline_constraint (constraint) : 0)) \
209 & prev);
210}
211
212/* The following describe what a constraint depends on. */
213static bool
214prev_newline_dependent (int constraint)
215{
216 return ((constraint ^ constraint >> 2) & 0111) != 0;
217}
218static bool
219prev_letter_dependent (int constraint)
220{
221 return ((constraint ^ constraint >> 1) & 0111) != 0;
222}
223
224/* Tokens that match the empty string subject to some constraint actually
225 work by applying that constraint to determine what may follow them,
226 taking into account what has gone before. The following values are
227 the constraints corresponding to the special tokens previously defined. */
228enum
229 {
230 NO_CONSTRAINT = 0777,
231 BEGLINE_CONSTRAINT = 0444,
232 ENDLINE_CONSTRAINT = 0700,
233 BEGWORD_CONSTRAINT = 0050,
234 ENDWORD_CONSTRAINT = 0202,
235 LIMWORD_CONSTRAINT = 0252,
236 NOTLIMWORD_CONSTRAINT = 0525
237 };
238
239/* The regexp is parsed into an array of tokens in postfix form. Some tokens
240 are operators and others are terminal symbols. Most (but not all) of these
241 codes are returned by the lexical analyzer. */
242
243typedef ptrdiff_t token;
244static token const TOKEN_MAX = PTRDIFF_MAX;
245
246/* States are indexed by state_num values. These are normally
247 nonnegative but -1 is used as a special value. */
248typedef ptrdiff_t state_num;
249
250/* Predefined token values. */
251enum
252{
253 END = -1, /* END is a terminal symbol that matches the
254 end of input; any value of END or less in
255 the parse tree is such a symbol. Accepting
256 states of the DFA are those that would have
257 a transition on END. This is -1, not some
258 more-negative value, to tweak the speed of
259 comparisons to END. */
260
261 /* Ordinary character values are terminal symbols that match themselves. */
262
263 /* CSET must come last in the following list of special tokens. Otherwise,
264 the list order matters only for performance. Related special tokens
265 should have nearby values so that code like (t == ANYCHAR || t == MBCSET
266 || CSET <= t) can be done with a single machine-level comparison. */
267
268 EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches
269 the empty string. */
270
271 QMARK, /* QMARK is an operator of one argument that
272 matches zero or one occurrences of its
273 argument. */
274
275 STAR, /* STAR is an operator of one argument that
276 matches the Kleene closure (zero or more
277 occurrences) of its argument. */
278
279 PLUS, /* PLUS is an operator of one argument that
280 matches the positive closure (one or more
281 occurrences) of its argument. */
282
283 REPMN, /* REPMN is a lexical token corresponding
284 to the {m,n} construct. REPMN never
285 appears in the compiled token vector. */
286
287 CAT, /* CAT is an operator of two arguments that
288 matches the concatenation of its
289 arguments. CAT is never returned by the
290 lexical analyzer. */
291
292 OR, /* OR is an operator of two arguments that
293 matches either of its arguments. */
294
295 LPAREN, /* LPAREN never appears in the parse tree,
296 it is only a lexeme. */
297
298 RPAREN, /* RPAREN never appears in the parse tree. */
299
300 WCHAR, /* Only returned by lex. wctok contains
301 the wide character representation. */
302
303 ANYCHAR, /* ANYCHAR is a terminal symbol that matches
304 a valid multibyte (or single byte) character.
305 It is used only if MB_CUR_MAX > 1. */
306
307 BEG, /* BEG is an initial symbol that matches the
308 beginning of input. */
309
310 BEGLINE, /* BEGLINE is a terminal symbol that matches
311 the empty string at the beginning of a
312 line. */
313
314 ENDLINE, /* ENDLINE is a terminal symbol that matches
315 the empty string at the end of a line. */
316
317 BEGWORD, /* BEGWORD is a terminal symbol that matches
318 the empty string at the beginning of a
319 word. */
320
321 ENDWORD, /* ENDWORD is a terminal symbol that matches
322 the empty string at the end of a word. */
323
324 LIMWORD, /* LIMWORD is a terminal symbol that matches
325 the empty string at the beginning or the
326 end of a word. */
327
328 NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that
329 matches the empty string not at
330 the beginning or end of a word. */
331
332 BACKREF, /* BACKREF is generated by \<digit>
333 or by any other construct that
334 is not completely handled. If the scanner
335 detects a transition on backref, it returns
336 a kind of "semi-success" indicating that
337 the match will have to be verified with
338 a backtracking matcher. */
339
340 MBCSET, /* MBCSET is similar to CSET, but for
341 multibyte characters. */
342
343 CSET /* CSET and (and any value greater) is a
344 terminal symbol that matches any of a
345 class of characters. */
346};
347
348
349/* States of the recognizer correspond to sets of positions in the parse
350 tree, together with the constraints under which they may be matched.
351 So a position is encoded as an index into the parse tree together with
352 a constraint. */
353typedef struct
354{
355 idx_t index; /* Index into the parse array. */
356 unsigned int constraint; /* Constraint for matching this position. */
357} position;
358
359/* Sets of positions are stored as arrays. */
360typedef struct
361{
362 position *elems; /* Elements of this position set. */
363 idx_t nelem; /* Number of elements in this set. */
364 idx_t alloc; /* Number of elements allocated in ELEMS. */
365} position_set;
366
367/* A state of the dfa consists of a set of positions, some flags,
368 and the token value of the lowest-numbered position of the state that
369 contains an END token. */
370typedef struct
371{
372 size_t hash; /* Hash of the positions of this state. */
373 position_set elems; /* Positions this state could match. */
374 unsigned char context; /* Context from previous state. */
375 unsigned short constraint; /* Constraint for this state to accept. */
376 position_set mbps; /* Positions which can match multibyte
377 characters or the follows, e.g., period.
378 Used only if MB_CUR_MAX > 1. */
379 state_num mb_trindex; /* Index of this state in MB_TRANS, or
380 negative if the state does not have
381 ANYCHAR. */
382} dfa_state;
383
384/* Maximum for any transition table count. This should be at least 3,
385 for the initial state setup. */
386enum { MAX_TRCOUNT = 1024 };
387
388/* A bracket operator.
389 e.g., [a-c], [[:alpha:]], etc. */
390struct mb_char_classes
391{
392 ptrdiff_t cset;
393 bool invert;
394 wchar_t *chars; /* Normal characters. */
395 idx_t nchars;
396 idx_t nchars_alloc;
397};
398
399struct regex_syntax
400{
401 /* Syntax bits controlling the behavior of the lexical analyzer. */
402 reg_syntax_t syntax_bits;
403 int dfaopts;
404 bool syntax_bits_set;
405
406 /* Flag for case-folding letters into sets. */
407 bool case_fold;
408
409 /* End-of-line byte in data. */
410 unsigned char eolbyte;
411
412 /* Cache of char-context values. */
413 char sbit[NOTCHAR];
414
415 /* If never_trail[B], the byte B cannot be a non-initial byte in a
416 multibyte character. */
417 bool never_trail[NOTCHAR];
418
419 /* Set of characters considered letters. */
420 charclass letters;
421
422 /* Set of characters that are newline. */
423 charclass newline;
424};
425
426/* Lexical analyzer. All the dross that deals with the obnoxious
427 GNU Regex syntax bits is located here. The poor, suffering
428 reader is referred to the GNU Regex documentation for the
429 meaning of the @#%!@#%^!@ syntax bits. */
430struct lexer_state
431{
432 char const *ptr; /* Pointer to next input character. */
433 idx_t left; /* Number of characters remaining. */
434 token lasttok; /* Previous token returned; initially END. */
435 idx_t parens; /* Count of outstanding left parens. */
436 int minrep, maxrep; /* Repeat counts for {m,n}. */
437
438 /* Wide character representation of the current multibyte character,
439 or WEOF if there was an encoding error. Used only if
440 MB_CUR_MAX > 1. */
441 wint_t wctok;
442
443 /* The most recently analyzed multibyte bracket expression. */
444 struct mb_char_classes brack;
445
446 /* We're separated from beginning or (, | only by zero-width characters. */
447 bool laststart;
448};
449
450/* Recursive descent parser for regular expressions. */
451
452struct parser_state
453{
454 token tok; /* Lookahead token. */
455 idx_t depth; /* Current depth of a hypothetical stack
456 holding deferred productions. This is
457 used to determine the depth that will be
458 required of the real stack later on in
459 dfaanalyze. */
460};
461
462/* A compiled regular expression. */
463struct dfa
464{
465 /* Fields filled by the scanner. */
466 charclass *charclasses; /* Array of character sets for CSET tokens. */
467 idx_t cindex; /* Index for adding new charclasses. */
468 idx_t calloc; /* Number of charclasses allocated. */
469 ptrdiff_t canychar; /* Index of anychar class, or -1. */
470
471 /* Scanner state */
472 struct lexer_state lex;
473
474 /* Parser state */
475 struct parser_state parse;
476
477 /* Fields filled by the parser. */
478 token *tokens; /* Postfix parse array. */
479 idx_t tindex; /* Index for adding new tokens. */
480 idx_t talloc; /* Number of tokens currently allocated. */
481 idx_t depth; /* Depth required of an evaluation stack
482 used for depth-first traversal of the
483 parse tree. */
484 idx_t nleaves; /* Number of non-EMPTY leaves
485 in the parse tree. */
486 idx_t nregexps; /* Count of parallel regexps being built
487 with dfaparse. */
488 bool fast; /* The DFA is fast. */
489 bool epsilon; /* Does a token match only the empty string? */
490 token utf8_anychar_classes[9]; /* To lower ANYCHAR in UTF-8 locales. */
491 mbstate_t mbs; /* Multibyte conversion state. */
492
493 /* The following are valid only if MB_CUR_MAX > 1. */
494
495 /* The value of multibyte_prop[i] is defined by following rule.
496 if tokens[i] < NOTCHAR
497 bit 0 : tokens[i] is the first byte of a character, including
498 single-byte characters.
499 bit 1 : tokens[i] is the last byte of a character, including
500 single-byte characters.
501
502 e.g.
503 tokens
504 = 'single_byte_a', 'multi_byte_A', single_byte_b'
505 = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
506 multibyte_prop
507 = 3 , 1 , 0 , 2 , 3
508 */
509 char *multibyte_prop;
510
511 /* Fields filled by the superset. */
512 struct dfa *superset; /* Hint of the dfa. */
513
514 /* Fields filled by the state builder. */
515 dfa_state *states; /* States of the dfa. */
516 state_num sindex; /* Index for adding new states. */
517 idx_t salloc; /* Number of states currently allocated. */
518
519 /* Fields filled by the parse tree->NFA conversion. */
520 position_set *follows; /* Array of follow sets, indexed by position
521 index. The follow of a position is the set
522 of positions containing characters that
523 could conceivably follow a character
524 matching the given position in a string
525 matching the regexp. Allocated to the
526 maximum possible position index. */
527 bool searchflag; /* We are supposed to build a searching
528 as opposed to an exact matcher. A searching
529 matcher finds the first and shortest string
530 matching a regexp anywhere in the buffer,
531 whereas an exact matcher finds the longest
532 string matching, but anchored to the
533 beginning of the buffer. */
534
535 /* Fields filled by dfaanalyze. */
536 int *constraints; /* Array of union of accepting constraints
537 in the follow of a position. */
538 int *separates; /* Array of contexts on follow of a
539 position. */
540
541 /* Fields filled by dfaexec. */
542 state_num tralloc; /* Number of transition tables that have
543 slots so far, not counting trans[-1] and
544 trans[-2]. */
545 int trcount; /* Number of transition tables that have
546 been built, other than for initial
547 states. */
548 int min_trcount; /* Number of initial states. Equivalently,
549 the minimum state number for which trcount
550 counts transitions. */
551 state_num **trans; /* Transition tables for states that can
552 never accept. If the transitions for a
553 state have not yet been computed, or the
554 state could possibly accept, its entry in
555 this table is NULL. This points to two
556 past the start of the allocated array,
557 and trans[-1] and trans[-2] are always
558 NULL. */
559 state_num **fails; /* Transition tables after failing to accept
560 on a state that potentially could do so.
561 If trans[i] is non-null, fails[i] must
562 be null. */
563 char *success; /* Table of acceptance conditions used in
564 dfaexec and computed in build_state. */
565 state_num *newlines; /* Transitions on newlines. The entry for a
566 newline in any transition table is always
567 -1 so we can count lines without wasting
568 too many cycles. The transition for a
569 newline is stored separately and handled
570 as a special case. Newline is also used
571 as a sentinel at the end of the buffer. */
572 state_num initstate_notbol; /* Initial state for CTX_LETTER and CTX_NONE
573 context in multibyte locales, in which we
574 do not distinguish between their contexts,
575 as not supported word. */
576 position_set mb_follows; /* Follow set added by ANYCHAR on demand. */
577 state_num **mb_trans; /* Transition tables for states with
578 ANYCHAR. */
579 state_num mb_trcount; /* Number of transition tables for states with
580 ANYCHAR that have actually been built. */
581
582 /* Syntax configuration. This is near the end so that dfacopysyntax
583 can memset up to here. */
584 struct regex_syntax syntax;
585
586 /* Information derived from the locale. This is at the end so that
587 a quick memset need not clear it specially. */
588
589 /* dfaexec implementation. */
590 char *(*dfaexec) (struct dfa *, char const *, char *,
591 bool, idx_t *, bool *);
592
593 /* Other cached information derived from the locale. */
594 struct localeinfo localeinfo;
595};
596
597/* User access to dfa internals. */
598
599/* S could possibly be an accepting state of R. */
600static bool
601accepting (state_num s, struct dfa const *r)
602{
603 return r->states[s].constraint != 0;
604}
605
606/* STATE accepts in the specified context. */
607static bool
608accepts_in_context (int prev, int curr, state_num state, struct dfa const *dfa)
609{
610 return succeeds_in_context (dfa->states[state].constraint, prev, curr);
611}
612
613static void regexp (struct dfa *dfa);
614
615/* Store into *PWC the result of converting the leading bytes of the
616 multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
617 and updating the conversion state in *D. On conversion error,
618 convert just a single byte, to WEOF. Return the number of bytes
619 converted.
620
621 This differs from mbrtowc (PWC, S, N, &D->mbs) as follows:
622
623 * PWC points to wint_t, not to wchar_t.
624 * The last arg is a dfa *D instead of merely a multibyte conversion
625 state D->mbs.
626 * N is idx_t not size_t, and must be at least 1.
627 * S[N - 1] must be a sentinel byte.
628 * Shift encodings are not supported.
629 * The return value is always in the range 1..N.
630 * D->mbs is always valid afterwards.
631 * *PWC is always set to something. */
632static int
633mbs_to_wchar (wint_t *pwc, char const *s, idx_t n, struct dfa *d)
634{
635 unsigned char uc = s[0];
636 wint_t wc = d->localeinfo.sbctowc[uc];
637
638 if (wc == WEOF)
639 {
640 wchar_t wch;
641 size_t nbytes = mbrtowc (&wch, s, n, &d->mbs);
642 if (0 < nbytes && nbytes < (size_t) -2)
643 {
644 *pwc = wch;
645 return nbytes;
646 }
647 memset (&d->mbs, 0, sizeof d->mbs);
648 }
649
650 *pwc = wc;
651 return 1;
652}
653
654#ifdef DEBUG
655
656static void
657prtok (token t)
658{
659 if (t <= END)
660 fprintf (stderr, "END");
661 else if (0 <= t && t < NOTCHAR)
662 {
663 unsigned int ch = t;
664 fprintf (stderr, "0x%02x", ch);
665 }
666 else
667 {
668 char const *s;
669 switch (t)
670 {
671 case BEG:
672 s = "BEG";
673 break;
674 case EMPTY:
675 s = "EMPTY";
676 break;
677 case BACKREF:
678 s = "BACKREF";
679 break;
680 case BEGLINE:
681 s = "BEGLINE";
682 break;
683 case ENDLINE:
684 s = "ENDLINE";
685 break;
686 case BEGWORD:
687 s = "BEGWORD";
688 break;
689 case ENDWORD:
690 s = "ENDWORD";
691 break;
692 case LIMWORD:
693 s = "LIMWORD";
694 break;
695 case NOTLIMWORD:
696 s = "NOTLIMWORD";
697 break;
698 case QMARK:
699 s = "QMARK";
700 break;
701 case STAR:
702 s = "STAR";
703 break;
704 case PLUS:
705 s = "PLUS";
706 break;
707 case CAT:
708 s = "CAT";
709 break;
710 case OR:
711 s = "OR";
712 break;
713 case LPAREN:
714 s = "LPAREN";
715 break;
716 case RPAREN:
717 s = "RPAREN";
718 break;
719 case ANYCHAR:
720 s = "ANYCHAR";
721 break;
722 case MBCSET:
723 s = "MBCSET";
724 break;
725 default:
726 s = "CSET";
727 break;
728 }
729 fprintf (stderr, "%s", s);
730 }
731}
732#endif /* DEBUG */
733
734/* Stuff pertaining to charclasses. */
735
736static bool
737tstbit (unsigned int b, charclass const *c)
738{
739 return c->w[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
740}
741
742static void
743setbit (unsigned int b, charclass *c)
744{
745 charclass_word one = 1;
746 c->w[b / CHARCLASS_WORD_BITS] |= one << b % CHARCLASS_WORD_BITS;
747}
748
749static void
750clrbit (unsigned int b, charclass *c)
751{
752 charclass_word one = 1;
753 c->w[b / CHARCLASS_WORD_BITS] &= ~(one << b % CHARCLASS_WORD_BITS);
754}
755
756static void
757zeroset (charclass *s)
758{
759 memset (s, 0, sizeof *s);
760}
761
762static void
763fillset (charclass *s)
764{
765 for (int i = 0; i < CHARCLASS_WORDS; i++)
766 s->w[i] = CHARCLASS_WORD_MASK;
767}
768
769static void
770notset (charclass *s)
771{
772 for (int i = 0; i < CHARCLASS_WORDS; ++i)
773 s->w[i] = CHARCLASS_WORD_MASK & ~s->w[i];
774}
775
776static bool
777equal (charclass const *s1, charclass const *s2)
778{
779 charclass_word w = 0;
780 for (int i = 0; i < CHARCLASS_WORDS; i++)
781 w |= s1->w[i] ^ s2->w[i];
782 return w == 0;
783}
784
785static bool
786emptyset (charclass const *s)
787{
788 charclass_word w = 0;
789 for (int i = 0; i < CHARCLASS_WORDS; i++)
790 w |= s->w[i];
791 return w == 0;
792}
793
794/* Ensure that the array addressed by PA holds at least I + 1 items.
795 Either return PA, or reallocate the array and return its new address.
796 Although PA may be null, the returned value is never null.
797
798 The array holds *NITEMS items, where 0 <= I <= *NITEMS; *NITEMS
799 is updated on reallocation. If PA is null, *NITEMS must be zero.
800 Do not allocate more than NITEMS_MAX items total; -1 means no limit.
801 ITEM_SIZE is the size of one item; it must be positive.
802 Avoid O(N**2) behavior on arrays growing linearly. */
803static void *
804maybe_realloc (void *pa, idx_t i, idx_t *nitems,
805 ptrdiff_t nitems_max, idx_t item_size)
806{
807 if (i < *nitems)
808 return pa;
809 return xpalloc (pa, nitems, 1, nitems_max, item_size);
810}
811
812/* In DFA D, find the index of charclass S, or allocate a new one. */
813static idx_t
814charclass_index (struct dfa *d, charclass const *s)
815{
816 idx_t i;
817
818 for (i = 0; i < d->cindex; ++i)
819 if (equal (s, &d->charclasses[i]))
820 return i;
821 d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
822 TOKEN_MAX - CSET, sizeof *d->charclasses);
823 ++d->cindex;
824 d->charclasses[i] = *s;
825 return i;
826}
827
828static bool
829unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
830{
831 return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_');
832}
833
834static int
835char_context (struct dfa const *dfa, unsigned char c)
836{
837 if (c == dfa->syntax.eolbyte && !(dfa->syntax.dfaopts & DFA_ANCHOR))
838 return CTX_NEWLINE;
839 if (unibyte_word_constituent (dfa, c))
840 return CTX_LETTER;
841 return CTX_NONE;
842}
843
844/* Set a bit in the charclass for the given wchar_t. Do nothing if WC
845 is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1,
846 this may happen when folding case in weird Turkish locales where
847 dotless i/dotted I are not included in the chosen character set.
848 Return whether a bit was set in the charclass. */
849static bool
850setbit_wc (wint_t wc, charclass *c)
851{
852 int b = wctob (wc);
853 if (b < 0)
854 return false;
855
856 setbit (b, c);
857 return true;
858}
859
860/* Set a bit for B and its case variants in the charclass C.
861 MB_CUR_MAX must be 1. */
862static void
863setbit_case_fold_c (int b, charclass *c)
864{
865 int ub = toupper (b);
866 for (int i = 0; i < NOTCHAR; i++)
867 if (toupper (i) == ub)
868 setbit (i, c);
869}
870
871/* Fetch the next lexical input character from the pattern. There
872 must at least one byte of pattern input. Set DFA->lex.wctok to the
873 value of the character or to WEOF depending on whether the input is
874 a valid multibyte character (possibly of length 1). Then return
875 the next input byte value, except return EOF if the input is a
876 multibyte character of length greater than 1. */
877static int
878fetch_wc (struct dfa *dfa)
879{
880 int nbytes = mbs_to_wchar (&dfa->lex.wctok, dfa->lex.ptr, dfa->lex.left,
881 dfa);
882 int c = nbytes == 1 ? to_uchar (dfa->lex.ptr[0]) : EOF;
883 dfa->lex.ptr += nbytes;
884 dfa->lex.left -= nbytes;
885 return c;
886}
887
888/* If there is no more input, report an error about unbalanced brackets.
889 Otherwise, behave as with fetch_wc (DFA). */
890static int
891bracket_fetch_wc (struct dfa *dfa)
892{
893 if (! dfa->lex.left)
894 dfaerror (_("unbalanced ["));
895 return fetch_wc (dfa);
896}
897
898typedef int predicate (int);
899
900/* The following list maps the names of the Posix named character classes
901 to predicate functions that determine whether a given character is in
902 the class. The leading [ has already been eaten by the lexical
903 analyzer. */
904struct dfa_ctype
905{
906 const char *name;
907 predicate *func;
908 bool single_byte_only;
909};
910
911static const struct dfa_ctype prednames[] = {
912 {"alpha", isalpha, false},
913 {"upper", isupper, false},
914 {"lower", islower, false},
915 {"digit", isdigit, true},
916 {"xdigit", isxdigit, false},
917 {"space", isspace, false},
918 {"punct", ispunct, false},
919 {"alnum", isalnum, false},
920 {"print", isprint, false},
921 {"graph", isgraph, false},
922 {"cntrl", iscntrl, false},
923 {"blank", isblank, false},
924 {NULL, NULL, false}
925};
926
927static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
928find_pred (const char *str)
929{
930 for (int i = 0; prednames[i].name; i++)
931 if (str_eq (str, prednames[i].name))
932 return &prednames[i];
933 return NULL;
934}
935
936/* Parse a bracket expression, which possibly includes multibyte
937 characters. */
938static token
939parse_bracket_exp (struct dfa *dfa)
940{
941 /* This is a bracket expression that dfaexec is known to
942 process correctly. */
943 bool known_bracket_exp = true;
944
945 /* Used to warn about [:space:].
946 Bit 0 = first character is a colon.
947 Bit 1 = last character is a colon.
948 Bit 2 = includes any other character but a colon.
949 Bit 3 = includes ranges, char/equiv classes or collation elements. */
950 int colon_warning_state;
951
952 dfa->lex.brack.nchars = 0;
953 charclass ccl;
954 zeroset (&ccl);
955 int c = bracket_fetch_wc (dfa);
956 bool invert = c == '^';
957 if (invert)
958 {
959 c = bracket_fetch_wc (dfa);
960 known_bracket_exp = dfa->localeinfo.simple;
961 }
962 wint_t wc = dfa->lex.wctok;
963 int c1;
964 wint_t wc1;
965 colon_warning_state = (c == ':');
966 do
967 {
968 c1 = NOTCHAR; /* Mark c1 as not initialized. */
969 colon_warning_state &= ~2;
970
971 /* Note that if we're looking at some other [:...:] construct,
972 we just treat it as a bunch of ordinary characters. We can do
973 this because we assume regex has checked for syntax errors before
974 dfa is ever called. */
975 if (c == '[')
976 {
977 c1 = bracket_fetch_wc (dfa);
978 wc1 = dfa->lex.wctok;
979
980 if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES))
981 || c1 == '.' || c1 == '=')
982 {
983 enum { MAX_BRACKET_STRING_LEN = 32 };
984 char str[MAX_BRACKET_STRING_LEN + 1];
985 int len = 0;
986 for (;;)
987 {
988 c = bracket_fetch_wc (dfa);
989 if (dfa->lex.left == 0
990 || (c == c1 && dfa->lex.ptr[0] == ']'))
991 break;
992 if (len < MAX_BRACKET_STRING_LEN)
993 str[len++] = c;
994 else
995 /* This is in any case an invalid class name. */
996 str[0] = '\0';
997 }
998 str[len] = '\0';
999
1000 /* Fetch bracket. */
1001 c = bracket_fetch_wc (dfa);
1002 wc = dfa->lex.wctok;
1003 if (c1 == ':')
1004 /* Build character class. POSIX allows character
1005 classes to match multicharacter collating elements,
1006 but the regex code does not support that, so do not
1007 worry about that possibility. */
1008 {
1009 char const *class
1010 = (dfa->syntax.case_fold && (str_eq (str, "upper")
1011 || str_eq (str, "lower"))
1012 ? "alpha" : str);
1013 const struct dfa_ctype *pred = find_pred (class);
1014 if (!pred)
1015 dfaerror (_("invalid character class"));
1016
1017 if (dfa->localeinfo.multibyte && !pred->single_byte_only)
1018 known_bracket_exp = false;
1019 else
1020 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1021 if (pred->func (c2))
1022 setbit (c2, &ccl);
1023 }
1024 else
1025 known_bracket_exp = false;
1026
1027 colon_warning_state |= 8;
1028
1029 /* Fetch new lookahead character. */
1030 c1 = bracket_fetch_wc (dfa);
1031 wc1 = dfa->lex.wctok;
1032 continue;
1033 }
1034
1035 /* We treat '[' as a normal character here. c/c1/wc/wc1
1036 are already set up. */
1037 }
1038
1039 if (c == '\\'
1040 && (dfa->syntax.syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1041 {
1042 c = bracket_fetch_wc (dfa);
1043 wc = dfa->lex.wctok;
1044 }
1045
1046 if (c1 == NOTCHAR)
1047 {
1048 c1 = bracket_fetch_wc (dfa);
1049 wc1 = dfa->lex.wctok;
1050 }
1051
1052 if (c1 == '-')
1053 /* build range characters. */
1054 {
1055 int c2 = bracket_fetch_wc (dfa);
1056 wint_t wc2 = dfa->lex.wctok;
1057
1058 /* A bracket expression like [a-[.aa.]] matches an unknown set.
1059 Treat it like [-a[.aa.]] while parsing it, and
1060 remember that the set is unknown. */
1061 if (c2 == '[' && dfa->lex.ptr[0] == '.')
1062 {
1063 known_bracket_exp = false;
1064 c2 = ']';
1065 }
1066
1067 if (c2 == ']')
1068 {
1069 /* In the case [x-], the - is an ordinary hyphen,
1070 which is left in c1, the lookahead character. */
1071 dfa->lex.ptr--;
1072 dfa->lex.left++;
1073 }
1074 else
1075 {
1076 if (c2 == '\\' && (dfa->syntax.syntax_bits
1077 & RE_BACKSLASH_ESCAPE_IN_LISTS))
1078 {
1079 c2 = bracket_fetch_wc (dfa);
1080 wc2 = dfa->lex.wctok;
1081 }
1082
1083 colon_warning_state |= 8;
1084 c1 = bracket_fetch_wc (dfa);
1085 wc1 = dfa->lex.wctok;
1086
1087 /* Treat [x-y] as a range if x != y. */
1088 if (wc != wc2 || wc == WEOF)
1089 {
1090 if (dfa->localeinfo.simple
1091 || (c_isdigit (c) & c_isdigit (c2)))
1092 {
1093 for (int ci = c; ci <= c2; ci++)
1094 if (dfa->syntax.case_fold && isalpha (ci))
1095 setbit_case_fold_c (ci, &ccl);
1096 else
1097 setbit (ci, &ccl);
1098 }
1099 else
1100 known_bracket_exp = false;
1101
1102 continue;
1103 }
1104 }
1105 }
1106
1107 colon_warning_state |= (c == ':') ? 2 : 4;
1108
1109 if (!dfa->localeinfo.multibyte)
1110 {
1111 if (dfa->syntax.case_fold && isalpha (c))
1112 setbit_case_fold_c (c, &ccl);
1113 else
1114 setbit (c, &ccl);
1115 continue;
1116 }
1117
1118 if (wc == WEOF)
1119 known_bracket_exp = false;
1120 else
1121 {
1122 wchar_t folded[CASE_FOLDED_BUFSIZE + 1];
1123 int n = (dfa->syntax.case_fold
1124 ? case_folded_counterparts (wc, folded + 1) + 1
1125 : 1);
1126 folded[0] = wc;
1127 for (int i = 0; i < n; i++)
1128 if (!setbit_wc (folded[i], &ccl))
1129 {
1130 dfa->lex.brack.chars
1131 = maybe_realloc (dfa->lex.brack.chars, dfa->lex.brack.nchars,
1132 &dfa->lex.brack.nchars_alloc, -1,
1133 sizeof *dfa->lex.brack.chars);
1134 dfa->lex.brack.chars[dfa->lex.brack.nchars++] = folded[i];
1135 }
1136 }
1137 }
1138 while ((wc = wc1, (c = c1) != ']'));
1139
1140 if (colon_warning_state == 7)
1141 ((dfa->syntax.dfaopts & DFA_CONFUSING_BRACKETS_ERROR
1142 ? dfaerror : dfawarn)
1143 (_("character class syntax is [[:space:]], not [:space:]")));
1144
1145 if (! known_bracket_exp)
1146 return BACKREF;
1147
1148 if (dfa->localeinfo.multibyte && (invert || dfa->lex.brack.nchars != 0))
1149 {
1150 dfa->lex.brack.invert = invert;
1151 dfa->lex.brack.cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
1152 return MBCSET;
1153 }
1154
1155 if (invert)
1156 {
1157 notset (&ccl);
1158 if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1159 clrbit ('\n', &ccl);
1160 }
1161
1162 return CSET + charclass_index (dfa, &ccl);
1163}
1164
1165struct lexptr
1166{
1167 char const *ptr;
1168 idx_t left;
1169};
1170
1171static void
1172push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s)
1173{
1174 ls->ptr = dfa->lex.ptr;
1175 ls->left = dfa->lex.left;
1176 dfa->lex.ptr = s;
1177 dfa->lex.left = strlen (s);
1178}
1179
1180static void
1181pop_lex_state (struct dfa *dfa, struct lexptr const *ls)
1182{
1183 dfa->lex.ptr = ls->ptr;
1184 dfa->lex.left = ls->left;
1185}
1186
1187static token
1188lex (struct dfa *dfa)
1189{
1190 bool backslash = false;
1191
1192 /* Basic plan: We fetch a character. If it's a backslash,
1193 we set the backslash flag and go through the loop again.
1194 On the plus side, this avoids having a duplicate of the
1195 main switch inside the backslash case. On the minus side,
1196 it means that just about every case tests the backslash flag. */
1197 for (int i = 0; i < 2; ++i)
1198 {
1199 if (! dfa->lex.left)
1200 return dfa->lex.lasttok = END;
1201 int c = fetch_wc (dfa);
1202
1203 switch (c)
1204 {
1205 case '\\':
1206 if (backslash)
1207 goto normal_char;
1208 if (dfa->lex.left == 0)
1209 dfaerror (_("unfinished \\ escape"));
1210 backslash = true;
1211 break;
1212
1213 case '^':
1214 if (backslash)
1215 goto normal_char;
1216 if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1217 || dfa->lex.lasttok == END || dfa->lex.lasttok == LPAREN
1218 || dfa->lex.lasttok == OR)
1219 return dfa->lex.lasttok = BEGLINE;
1220 goto normal_char;
1221
1222 case '$':
1223 if (backslash)
1224 goto normal_char;
1225 /* kmk: cl v19.29.30139/amd64 messes this function up when optimizing
1226 for speed, workaround is to optimize it for size instead. The
1227 symptom is that the following SED expression fail to match:
1228 s/^[0-9a-fA-F]\{1,\} \(00[0-9a-fA-F]*\) ABS *notype *External *| \([^.]\{1,\}\)\.\(.*$\)/ 1=\1 2=\2 3=\3/
1229
1230 Seems the exact problem is that it gets the indexing here wrong:
1231 dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS) & (dfa->lex.ptr[0] == '\\')]
1232 It forgets to do the ` dfa->lex.ptr[0] == '\\' ` part and instead
1233 ANDs with a register initialized to zero. Rewriting the
1234 expressions using the tinary operator works around the problem,
1235 although the resulting code is a lot bulkier.
1236 */
1237 if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1238 || dfa->lex.left == 0
1239#ifdef _MSC_VER /* see above */
1240 || (!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
1241 ? dfa->lex.left > 1 && dfa->lex.ptr[dfa->lex.ptr[0] == '\\'] == ')'
1242 : dfa->lex.left > 0 && dfa->lex.ptr[0] == ')')
1243#else
1244 || ((dfa->lex.left
1245 > !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
1246 && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
1247 & (dfa->lex.ptr[0] == '\\')]
1248 == ')'))
1249#endif
1250#ifdef _MSC_VER /* see above */
1251 || (!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
1252 ? dfa->lex.left > 1 && dfa->lex.ptr[dfa->lex.ptr[0] == '\\'] == '|'
1253 : dfa->lex.left > 0 && dfa->lex.ptr[0] == '|')
1254#else
1255 || ((dfa->lex.left
1256 > !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
1257 && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
1258 & (dfa->lex.ptr[0] == '\\')]
1259 == '|'))
1260#endif
1261 || ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
1262 && dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
1263 return dfa->lex.lasttok = ENDLINE;
1264 goto normal_char;
1265
1266 case '1':
1267 case '2':
1268 case '3':
1269 case '4':
1270 case '5':
1271 case '6':
1272 case '7':
1273 case '8':
1274 case '9':
1275 if (!backslash)
1276 goto normal_char;
1277 if (dfa->syntax.syntax_bits & RE_NO_BK_REFS)
1278 goto stray_backslash;
1279
1280 dfa->lex.laststart = false;
1281 return dfa->lex.lasttok = BACKREF;
1282
1283 case '`':
1284 if (!backslash)
1285 goto normal_char;
1286 if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1287 goto stray_backslash;
1288
1289 /* FIXME: should be beginning of string */
1290 return dfa->lex.lasttok = BEGLINE;
1291
1292 case '\'':
1293 if (!backslash)
1294 goto normal_char;
1295 if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1296 goto stray_backslash;
1297
1298 /* FIXME: should be end of string */
1299 return dfa->lex.lasttok = ENDLINE;
1300
1301 case '<':
1302 if (!backslash)
1303 goto normal_char;
1304 if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1305 goto stray_backslash;
1306
1307 return dfa->lex.lasttok = BEGWORD;
1308
1309 case '>':
1310 if (!backslash)
1311 goto normal_char;
1312 if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1313 goto stray_backslash;
1314
1315 return dfa->lex.lasttok = ENDWORD;
1316
1317 case 'b':
1318 if (!backslash)
1319 goto normal_char;
1320 if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1321 goto stray_backslash;
1322
1323 return dfa->lex.lasttok = LIMWORD;
1324
1325 case 'B':
1326 if (!backslash)
1327 goto normal_char;
1328 if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1329 goto stray_backslash;
1330
1331 return dfa->lex.lasttok = NOTLIMWORD;
1332
1333 case '?':
1334 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1335 goto default_case;
1336 if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1337 goto normal_char;
1338 if (dfa->lex.laststart)
1339 {
1340 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1341 goto default_case;
1342 if (dfa->syntax.dfaopts & DFA_PLUS_WARN)
1343 dfawarn (_("? at start of expression"));
1344 }
1345 return dfa->lex.lasttok = QMARK;
1346
1347 case '*':
1348 if (backslash)
1349 goto normal_char;
1350 if (dfa->lex.laststart)
1351 {
1352 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1353 goto default_case;
1354 if (dfa->syntax.dfaopts & DFA_STAR_WARN)
1355 dfawarn (_("* at start of expression"));
1356 }
1357 return dfa->lex.lasttok = STAR;
1358
1359 case '+':
1360 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1361 goto default_case;
1362 if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1363 goto normal_char;
1364 if (dfa->lex.laststart)
1365 {
1366 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1367 goto default_case;
1368 if (dfa->syntax.dfaopts & DFA_PLUS_WARN)
1369 dfawarn (_("+ at start of expression"));
1370 }
1371 return dfa->lex.lasttok = PLUS;
1372
1373 case '{':
1374 if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
1375 goto default_case;
1376 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
1377 goto normal_char;
1378
1379 /* Cases:
1380 {M} - exact count
1381 {M,} - minimum count, maximum is infinity
1382 {,N} - 0 through N
1383 {,} - 0 to infinity (same as '*')
1384 {M,N} - M through N */
1385 {
1386 char const *p = dfa->lex.ptr;
1387 char const *lim = p + dfa->lex.left;
1388 dfa->lex.minrep = dfa->lex.maxrep = -1;
1389 for (; p != lim && c_isdigit (*p); p++)
1390 dfa->lex.minrep = (dfa->lex.minrep < 0
1391 ? *p - '0'
1392 : MIN (RE_DUP_MAX + 1,
1393 dfa->lex.minrep * 10 + *p - '0'));
1394 if (p != lim)
1395 {
1396 if (*p != ',')
1397 dfa->lex.maxrep = dfa->lex.minrep;
1398 else
1399 {
1400 if (dfa->lex.minrep < 0)
1401 dfa->lex.minrep = 0;
1402 while (++p != lim && c_isdigit (*p))
1403 dfa->lex.maxrep
1404 = (dfa->lex.maxrep < 0
1405 ? *p - '0'
1406 : MIN (RE_DUP_MAX + 1,
1407 dfa->lex.maxrep * 10 + *p - '0'));
1408 }
1409 }
1410 bool invalid_content
1411 = ! ((! backslash || (p != lim && *p++ == '\\'))
1412 && p != lim && *p++ == '}'
1413 && 0 <= dfa->lex.minrep
1414 && (dfa->lex.maxrep < 0
1415 || dfa->lex.minrep <= dfa->lex.maxrep));
1416 if (invalid_content
1417 && (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD))
1418 goto normal_char;
1419 if (dfa->lex.laststart)
1420 {
1421 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1422 goto default_case;
1423 if (dfa->syntax.dfaopts & DFA_PLUS_WARN)
1424 dfawarn (_("{...} at start of expression"));
1425 }
1426 if (invalid_content)
1427 dfaerror (_("invalid content of \\{\\}"));
1428 if (RE_DUP_MAX < dfa->lex.maxrep)
1429 dfaerror (_("regular expression too big"));
1430 dfa->lex.ptr = p;
1431 dfa->lex.left = lim - p;
1432 }
1433 dfa->lex.laststart = false;
1434 return dfa->lex.lasttok = REPMN;
1435
1436 case '|':
1437 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1438 goto default_case;
1439 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
1440 goto normal_char;
1441 dfa->lex.laststart = true;
1442 return dfa->lex.lasttok = OR;
1443
1444 case '\n':
1445 if (!(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
1446 goto default_case;
1447 if (backslash)
1448 goto normal_char;
1449 dfa->lex.laststart = true;
1450 return dfa->lex.lasttok = OR;
1451
1452 case '(':
1453 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1454 goto normal_char;
1455 dfa->lex.parens++;
1456 dfa->lex.laststart = true;
1457 return dfa->lex.lasttok = LPAREN;
1458
1459 case ')':
1460 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1461 goto normal_char;
1462 if (dfa->lex.parens == 0
1463 && dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1464 goto normal_char;
1465 dfa->lex.parens--;
1466 dfa->lex.laststart = false;
1467 return dfa->lex.lasttok = RPAREN;
1468
1469 case '.':
1470 if (backslash)
1471 goto normal_char;
1472 if (dfa->canychar < 0)
1473 {
1474 charclass ccl;
1475 fillset (&ccl);
1476 if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1477 clrbit ('\n', &ccl);
1478 if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1479 clrbit ('\0', &ccl);
1480 if (dfa->localeinfo.multibyte)
1481 for (int c2 = 0; c2 < NOTCHAR; c2++)
1482 if (dfa->localeinfo.sbctowc[c2] == WEOF)
1483 clrbit (c2, &ccl);
1484 dfa->canychar = charclass_index (dfa, &ccl);
1485 }
1486 dfa->lex.laststart = false;
1487 return dfa->lex.lasttok = (dfa->localeinfo.multibyte
1488 ? ANYCHAR
1489 : CSET + dfa->canychar);
1490
1491 case 's':
1492 case 'S':
1493 if (!backslash)
1494 goto normal_char;
1495 if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1496 goto stray_backslash;
1497
1498 if (!dfa->localeinfo.multibyte)
1499 {
1500 charclass ccl;
1501 zeroset (&ccl);
1502 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1503 if (isspace (c2))
1504 setbit (c2, &ccl);
1505 if (c == 'S')
1506 notset (&ccl);
1507 dfa->lex.laststart = false;
1508 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1509 }
1510
1511 /* FIXME: see if optimizing this, as is done with ANYCHAR and
1512 add_utf8_anychar, makes sense. */
1513
1514 /* \s and \S are documented to be equivalent to [[:space:]] and
1515 [^[:space:]] respectively, so tell the lexer to process those
1516 strings, each minus its "already processed" '['. */
1517 {
1518 struct lexptr ls;
1519 push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
1520 dfa->lex.lasttok = parse_bracket_exp (dfa);
1521 pop_lex_state (dfa, &ls);
1522 }
1523
1524 dfa->lex.laststart = false;
1525 return dfa->lex.lasttok;
1526
1527 case 'w':
1528 case 'W':
1529 if (!backslash)
1530 goto normal_char;
1531 if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1532 goto stray_backslash;
1533
1534 if (!dfa->localeinfo.multibyte)
1535 {
1536 charclass ccl;
1537 zeroset (&ccl);
1538 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1539 if (dfa->syntax.sbit[c2] == CTX_LETTER)
1540 setbit (c2, &ccl);
1541 if (c == 'W')
1542 notset (&ccl);
1543 dfa->lex.laststart = false;
1544 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1545 }
1546
1547 /* FIXME: see if optimizing this, as is done with ANYCHAR and
1548 add_utf8_anychar, makes sense. */
1549
1550 /* \w and \W are documented to be equivalent to [_[:alnum:]] and
1551 [^_[:alnum:]] respectively, so tell the lexer to process those
1552 strings, each minus its "already processed" '['. */
1553 {
1554 struct lexptr ls;
1555 push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
1556 dfa->lex.lasttok = parse_bracket_exp (dfa);
1557 pop_lex_state (dfa, &ls);
1558 }
1559
1560 dfa->lex.laststart = false;
1561 return dfa->lex.lasttok;
1562
1563 case '[':
1564 if (backslash)
1565 goto normal_char;
1566 dfa->lex.laststart = false;
1567 return dfa->lex.lasttok = parse_bracket_exp (dfa);
1568
1569 default:
1570 default_case:
1571 if (!backslash)
1572 goto normal_char;
1573 stray_backslash:
1574 if (dfa->syntax.dfaopts & DFA_STRAY_BACKSLASH_WARN)
1575 {
1576 char const *msg;
1577 char msgbuf[100];
1578 if (!iswprint (dfa->lex.wctok))
1579 msg = _("stray \\ before unprintable character");
1580 else if (iswspace (dfa->lex.wctok))
1581 msg = _("stray \\ before white space");
1582 else
1583 {
1584 int n = snprintf (msgbuf, sizeof msgbuf,
1585 _("stray \\ before %lc"), dfa->lex.wctok);
1586 msg = 0 <= n && n < sizeof msgbuf ? msgbuf : _("stray \\");
1587 }
1588 dfawarn (msg);
1589 }
1590 FALLTHROUGH;
1591 case ']': case '}':
1592 normal_char:
1593 dfa->lex.laststart = false;
1594 /* For multibyte character sets, folding is done in atom. Always
1595 return WCHAR. */
1596 if (dfa->localeinfo.multibyte)
1597 return dfa->lex.lasttok = WCHAR;
1598
1599 if (dfa->syntax.case_fold && isalpha (c))
1600 {
1601 charclass ccl;
1602 zeroset (&ccl);
1603 setbit_case_fold_c (c, &ccl);
1604 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1605 }
1606
1607 return dfa->lex.lasttok = c;
1608 }
1609 }
1610
1611 /* The above loop should consume at most a backslash
1612 and some other character. */
1613 abort ();
1614 return END; /* keeps pedantic compilers happy. */
1615}
1616
1617static void
1618addtok_mb (struct dfa *dfa, token t, char mbprop)
1619{
1620 if (dfa->talloc == dfa->tindex)
1621 {
1622 dfa->tokens = xpalloc (dfa->tokens, &dfa->talloc, 1, -1,
1623 sizeof *dfa->tokens);
1624 if (dfa->localeinfo.multibyte)
1625 dfa->multibyte_prop = xreallocarray (dfa->multibyte_prop, dfa->talloc,
1626 sizeof *dfa->multibyte_prop);
1627 }
1628 if (dfa->localeinfo.multibyte)
1629 dfa->multibyte_prop[dfa->tindex] = mbprop;
1630 dfa->tokens[dfa->tindex++] = t;
1631
1632 switch (t)
1633 {
1634 case QMARK:
1635 case STAR:
1636 case PLUS:
1637 break;
1638
1639 case CAT:
1640 case OR:
1641 dfa->parse.depth--;
1642 break;
1643
1644 case EMPTY:
1645 dfa->epsilon = true;
1646 goto increment_depth;
1647
1648 case BACKREF:
1649 dfa->fast = false;
1650 goto increment_nleaves;
1651
1652 case BEGLINE:
1653 case ENDLINE:
1654 case BEGWORD:
1655 case ENDWORD:
1656 case LIMWORD:
1657 case NOTLIMWORD:
1658 dfa->epsilon = true;
1659 FALLTHROUGH;
1660 default:
1661 increment_nleaves:
1662 dfa->nleaves++;
1663 increment_depth:
1664 dfa->parse.depth++;
1665 if (dfa->depth < dfa->parse.depth)
1666 dfa->depth = dfa->parse.depth;
1667 break;
1668 }
1669}
1670
1671static void addtok_wc (struct dfa *dfa, wint_t wc);
1672
1673/* Add the given token to the parse tree, maintaining the depth count and
1674 updating the maximum depth if necessary. */
1675static void
1676addtok (struct dfa *dfa, token t)
1677{
1678 if (dfa->localeinfo.multibyte && t == MBCSET)
1679 {
1680 bool need_or = false;
1681
1682 /* Extract wide characters into alternations for better performance.
1683 This does not require UTF-8. */
1684 for (idx_t i = 0; i < dfa->lex.brack.nchars; i++)
1685 {
1686 addtok_wc (dfa, dfa->lex.brack.chars[i]);
1687 if (need_or)
1688 addtok (dfa, OR);
1689 need_or = true;
1690 }
1691 dfa->lex.brack.nchars = 0;
1692
1693 /* Wide characters have been handled above, so it is possible
1694 that the set is empty now. Do nothing in that case. */
1695 if (dfa->lex.brack.cset != -1)
1696 {
1697 addtok (dfa, CSET + dfa->lex.brack.cset);
1698 if (need_or)
1699 addtok (dfa, OR);
1700 }
1701 }
1702 else
1703 {
1704 addtok_mb (dfa, t, 3);
1705 }
1706}
1707
1708/* We treat a multibyte character as a single atom, so that DFA
1709 can treat a multibyte character as a single expression.
1710
1711 e.g., we construct the following tree from "<mb1><mb2>".
1712 <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1713 <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1714static void
1715addtok_wc (struct dfa *dfa, wint_t wc)
1716{
1717 unsigned char buf[MB_LEN_MAX];
1718 mbstate_t s = { 0 };
1719 size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
1720 int buflen;
1721
1722 if (stored_bytes != (size_t) -1)
1723 buflen = stored_bytes;
1724 else
1725 {
1726 /* This is merely stop-gap. buf[0] is undefined, yet skipping
1727 the addtok_mb call altogether can corrupt the heap. */
1728 buflen = 1;
1729 buf[0] = 0;
1730 }
1731
1732 addtok_mb (dfa, buf[0], buflen == 1 ? 3 : 1);
1733 for (int i = 1; i < buflen; i++)
1734 {
1735 addtok_mb (dfa, buf[i], i == buflen - 1 ? 2 : 0);
1736 addtok (dfa, CAT);
1737 }
1738}
1739
1740static void
1741add_utf8_anychar (struct dfa *dfa)
1742{
1743 /* Since the Unicode Standard Version 4.0.0 (2003), a well-formed
1744 UTF-8 byte sequence has been defined as follows:
1745
1746 ([\x00-\x7f]
1747 |[\xc2-\xdf][\x80-\xbf]
1748 |[\xe0][\xa0-\xbf][\x80-\xbf]
1749 |[\xe1-\xec\xee-\xef][\x80-\xbf][\x80-\xbf]
1750 |[\xed][\x80-\x9f][\x80-\xbf]
1751 |[\xf0][\x90-\xbf][\x80-\xbf][\x80-\xbf])
1752 |[\xf1-\xf3][\x80-\xbf][\x80-\xbf][\x80-\xbf]
1753 |[\xf4][\x80-\x8f][\x80-\xbf][\x80-\xbf])
1754
1755 which I'll write more concisely "A|BC|DEC|FCC|GHC|IJCC|KCCC|LMCC",
1756 where A = [\x00-\x7f], B = [\xc2-\xdf], C = [\x80-\xbf],
1757 D = [\xe0], E = [\xa0-\xbf], F = [\xe1-\xec\xee-\xef], G = [\xed],
1758 H = [\x80-\x9f], I = [\xf0],
1759 J = [\x90-\xbf], K = [\xf1-\xf3], L = [\xf4], M = [\x80-\x8f].
1760
1761 This can be refactored to "A|(B|DE|GH|(F|IJ|LM|KC)C)C". */
1762
1763 /* Mnemonics for classes containing two or more bytes. */
1764 enum { A, B, C, E, F, H, J, K, M };
1765
1766 /* Mnemonics for single-byte tokens. */
1767 enum { D_token = 0xe0, G_token = 0xed, I_token = 0xf0, L_token = 0xf4 };
1768
1769 static charclass const utf8_classes[] = {
1770 /* A. 00-7f: 1-byte sequence. */
1771 CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
1772
1773 /* B. c2-df: 1st byte of a 2-byte sequence. */
1774 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
1775
1776 /* C. 80-bf: non-leading bytes. */
1777 CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
1778
1779 /* D. e0 (just a token). */
1780
1781 /* E. a0-bf: 2nd byte of a "DEC" sequence. */
1782 CHARCLASS_INIT (0, 0, 0, 0, 0, 0xffffffff, 0, 0),
1783
1784 /* F. e1-ec + ee-ef: 1st byte of an "FCC" sequence. */
1785 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xdffe),
1786
1787 /* G. ed (just a token). */
1788
1789 /* H. 80-9f: 2nd byte of a "GHC" sequence. */
1790 CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0, 0, 0),
1791
1792 /* I. f0 (just a token). */
1793
1794 /* J. 90-bf: 2nd byte of an "IJCC" sequence. */
1795 CHARCLASS_INIT (0, 0, 0, 0, 0xffff0000, 0xffffffff, 0, 0),
1796
1797 /* K. f1-f3: 1st byte of a "KCCC" sequence. */
1798 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xe0000),
1799
1800 /* L. f4 (just a token). */
1801
1802 /* M. 80-8f: 2nd byte of a "LMCC" sequence. */
1803 CHARCLASS_INIT (0, 0, 0, 0, 0xffff, 0, 0, 0),
1804 };
1805
1806 /* Define the character classes that are needed below. */
1807 if (dfa->utf8_anychar_classes[0] == 0)
1808 {
1809 charclass c = utf8_classes[0];
1810 if (! (dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1811 clrbit ('\n', &c);
1812 if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1813 clrbit ('\0', &c);
1814 dfa->utf8_anychar_classes[0] = CSET + charclass_index (dfa, &c);
1815
1816 for (int i = 1; i < sizeof utf8_classes / sizeof *utf8_classes; i++)
1817 dfa->utf8_anychar_classes[i]
1818 = CSET + charclass_index (dfa, &utf8_classes[i]);
1819 }
1820
1821 /* Implement the "A|(B|DE|GH|(F|IJ|LM|KC)C)C" pattern mentioned above.
1822 The token buffer is in reverse Polish order, so we get
1823 "A B D E CAT OR G H CAT OR F I J CAT OR L M CAT OR K
1824 C CAT OR C CAT OR C CAT OR". */
1825 addtok (dfa, dfa->utf8_anychar_classes[A]);
1826 addtok (dfa, dfa->utf8_anychar_classes[B]);
1827 addtok (dfa, D_token);
1828 addtok (dfa, dfa->utf8_anychar_classes[E]);
1829 addtok (dfa, CAT);
1830 addtok (dfa, OR);
1831 addtok (dfa, G_token);
1832 addtok (dfa, dfa->utf8_anychar_classes[H]);
1833 addtok (dfa, CAT);
1834 addtok (dfa, OR);
1835 addtok (dfa, dfa->utf8_anychar_classes[F]);
1836 addtok (dfa, I_token);
1837 addtok (dfa, dfa->utf8_anychar_classes[J]);
1838 addtok (dfa, CAT);
1839 addtok (dfa, OR);
1840 addtok (dfa, L_token);
1841 addtok (dfa, dfa->utf8_anychar_classes[M]);
1842 addtok (dfa, CAT);
1843 addtok (dfa, OR);
1844 addtok (dfa, dfa->utf8_anychar_classes[K]);
1845 for (int i = 0; i < 3; i++)
1846 {
1847 addtok (dfa, dfa->utf8_anychar_classes[C]);
1848 addtok (dfa, CAT);
1849 addtok (dfa, OR);
1850 }
1851}
1852
1853/* The grammar understood by the parser is as follows.
1854
1855 regexp:
1856 regexp OR branch
1857 branch
1858
1859 branch:
1860 branch closure
1861 closure
1862
1863 closure:
1864 closure QMARK
1865 closure STAR
1866 closure PLUS
1867 closure REPMN
1868 atom
1869
1870 atom:
1871 <normal character>
1872 <multibyte character>
1873 ANYCHAR
1874 MBCSET
1875 CSET
1876 BACKREF
1877 BEGLINE
1878 ENDLINE
1879 BEGWORD
1880 ENDWORD
1881 LIMWORD
1882 NOTLIMWORD
1883 LPAREN regexp RPAREN
1884 <empty>
1885
1886 The parser builds a parse tree in postfix form in an array of tokens. */
1887
1888static void
1889atom (struct dfa *dfa)
1890{
1891 if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
1892 || dfa->parse.tok >= CSET
1893 || dfa->parse.tok == BEG || dfa->parse.tok == BACKREF
1894 || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
1895 || dfa->parse.tok == BEGWORD || dfa->parse.tok == ENDWORD
1896 || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD
1897 || dfa->parse.tok == ANYCHAR || dfa->parse.tok == MBCSET)
1898 {
1899 if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
1900 {
1901 /* For UTF-8 expand the period to a series of CSETs that define a
1902 valid UTF-8 character. This avoids using the slow multibyte
1903 path. I'm pretty sure it would be both profitable and correct to
1904 do it for any encoding; however, the optimization must be done
1905 manually as it is done above in add_utf8_anychar. So, let's
1906 start with UTF-8: it is the most used, and the structure of the
1907 encoding makes the correctness more obvious. */
1908 add_utf8_anychar (dfa);
1909 }
1910 else
1911 addtok (dfa, dfa->parse.tok);
1912 dfa->parse.tok = lex (dfa);
1913 }
1914 else if (dfa->parse.tok == WCHAR)
1915 {
1916 if (dfa->lex.wctok == WEOF)
1917 addtok (dfa, BACKREF);
1918 else
1919 {
1920 addtok_wc (dfa, dfa->lex.wctok);
1921
1922 if (dfa->syntax.case_fold)
1923 {
1924 wchar_t folded[CASE_FOLDED_BUFSIZE];
1925 int n = case_folded_counterparts (dfa->lex.wctok, folded);
1926 for (int i = 0; i < n; i++)
1927 {
1928 addtok_wc (dfa, folded[i]);
1929 addtok (dfa, OR);
1930 }
1931 }
1932 }
1933
1934 dfa->parse.tok = lex (dfa);
1935 }
1936 else if (dfa->parse.tok == LPAREN)
1937 {
1938 dfa->parse.tok = lex (dfa);
1939 regexp (dfa);
1940 if (dfa->parse.tok != RPAREN)
1941 dfaerror (_("unbalanced ("));
1942 dfa->parse.tok = lex (dfa);
1943 }
1944 else
1945 addtok (dfa, EMPTY);
1946}
1947
1948/* Return the number of tokens in the given subexpression. */
1949static idx_t _GL_ATTRIBUTE_PURE
1950nsubtoks (struct dfa const *dfa, idx_t tindex)
1951{
1952 switch (dfa->tokens[tindex - 1])
1953 {
1954 default:
1955 return 1;
1956 case QMARK:
1957 case STAR:
1958 case PLUS:
1959 return 1 + nsubtoks (dfa, tindex - 1);
1960 case CAT:
1961 case OR:
1962 {
1963 idx_t ntoks1 = nsubtoks (dfa, tindex - 1);
1964 return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
1965 }
1966 }
1967}
1968
1969/* Copy the given subexpression to the top of the tree. */
1970static void
1971copytoks (struct dfa *dfa, idx_t tindex, idx_t ntokens)
1972{
1973 if (dfa->localeinfo.multibyte)
1974 for (idx_t i = 0; i < ntokens; i++)
1975 addtok_mb (dfa, dfa->tokens[tindex + i],
1976 dfa->multibyte_prop[tindex + i]);
1977 else
1978 for (idx_t i = 0; i < ntokens; i++)
1979 addtok_mb (dfa, dfa->tokens[tindex + i], 3);
1980}
1981
1982static void
1983closure (struct dfa *dfa)
1984{
1985 atom (dfa);
1986 while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR
1987 || dfa->parse.tok == PLUS || dfa->parse.tok == REPMN)
1988 if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep))
1989 {
1990 idx_t ntokens = nsubtoks (dfa, dfa->tindex);
1991 idx_t tindex = dfa->tindex - ntokens;
1992 if (dfa->lex.maxrep < 0)
1993 addtok (dfa, PLUS);
1994 if (dfa->lex.minrep == 0)
1995 addtok (dfa, QMARK);
1996 int i;
1997 for (i = 1; i < dfa->lex.minrep; i++)
1998 {
1999 copytoks (dfa, tindex, ntokens);
2000 addtok (dfa, CAT);
2001 }
2002 for (; i < dfa->lex.maxrep; i++)
2003 {
2004 copytoks (dfa, tindex, ntokens);
2005 addtok (dfa, QMARK);
2006 addtok (dfa, CAT);
2007 }
2008 dfa->parse.tok = lex (dfa);
2009 }
2010 else if (dfa->parse.tok == REPMN)
2011 {
2012 dfa->tindex -= nsubtoks (dfa, dfa->tindex);
2013 dfa->parse.tok = lex (dfa);
2014 closure (dfa);
2015 }
2016 else
2017 {
2018 addtok (dfa, dfa->parse.tok);
2019 dfa->parse.tok = lex (dfa);
2020 }
2021}
2022
2023static void
2024branch (struct dfa* dfa)
2025{
2026 closure (dfa);
2027 while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
2028 && dfa->parse.tok >= 0)
2029 {
2030 closure (dfa);
2031 addtok (dfa, CAT);
2032 }
2033}
2034
2035static void
2036regexp (struct dfa *dfa)
2037{
2038 branch (dfa);
2039 while (dfa->parse.tok == OR)
2040 {
2041 dfa->parse.tok = lex (dfa);
2042 branch (dfa);
2043 addtok (dfa, OR);
2044 }
2045}
2046
2047/* Parse a string S of length LEN into D. S can include NUL characters.
2048 This is the main entry point for the parser. */
2049void
2050dfaparse (char const *s, idx_t len, struct dfa *d)
2051{
2052 d->lex.ptr = s;
2053 d->lex.left = len;
2054 d->lex.lasttok = END;
2055 d->lex.laststart = true;
2056
2057 if (!d->syntax.syntax_bits_set)
2058 dfaerror (_("no syntax specified"));
2059
2060 if (!d->nregexps)
2061 addtok (d, BEG);
2062
2063 d->parse.tok = lex (d);
2064 d->parse.depth = d->depth;
2065
2066 regexp (d);
2067
2068 if (d->parse.tok != END)
2069 dfaerror (_("unbalanced )"));
2070
2071 addtok (d, END - d->nregexps);
2072 addtok (d, CAT);
2073
2074 if (d->nregexps)
2075 addtok (d, OR);
2076
2077 ++d->nregexps;
2078}
2079
2080/* Some primitives for operating on sets of positions. */
2081
2082/* Copy one set to another. */
2083static void
2084copy (position_set const *src, position_set *dst)
2085{
2086 if (dst->alloc < src->nelem)
2087 {
2088 free (dst->elems);
2089 dst->elems = xpalloc (NULL, &dst->alloc, src->nelem - dst->alloc, -1,
2090 sizeof *dst->elems);
2091 }
2092 dst->nelem = src->nelem;
2093 if (src->nelem != 0)
2094 memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
2095}
2096
2097static void
2098alloc_position_set (position_set *s, idx_t size)
2099{
2100 s->elems = xnmalloc (size, sizeof *s->elems);
2101 s->alloc = size;
2102 s->nelem = 0;
2103}
2104
2105/* Insert position P in set S. S is maintained in sorted order on
2106 decreasing index. If there is already an entry in S with P.index
2107 then merge (logically-OR) P's constraints into the one in S.
2108 S->elems must point to an array large enough to hold the resulting set. */
2109static void
2110insert (position p, position_set *s)
2111{
2112 idx_t count = s->nelem;
2113 idx_t lo = 0, hi = count;
2114 while (lo < hi)
2115 {
2116 idx_t mid = (lo + hi) >> 1;
2117 if (s->elems[mid].index < p.index)
2118 lo = mid + 1;
2119 else if (s->elems[mid].index == p.index)
2120 {
2121 s->elems[mid].constraint |= p.constraint;
2122 return;
2123 }
2124 else
2125 hi = mid;
2126 }
2127
2128 s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2129 for (idx_t i = count; i > lo; i--)
2130 s->elems[i] = s->elems[i - 1];
2131 s->elems[lo] = p;
2132 ++s->nelem;
2133}
2134
2135static void
2136append (position p, position_set *s)
2137{
2138 idx_t count = s->nelem;
2139 s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2140 s->elems[s->nelem++] = p;
2141}
2142
2143/* Merge S1 and S2 (with the additional constraint C2) into M. The
2144 result is as if the positions of S1, and of S2 with the additional
2145 constraint C2, were inserted into an initially empty set. */
2146static void
2147merge_constrained (position_set const *s1, position_set const *s2,
2148 unsigned int c2, position_set *m)
2149{
2150 idx_t i = 0, j = 0;
2151
2152 if (m->alloc - s1->nelem < s2->nelem)
2153 {
2154 free (m->elems);
2155 m->alloc = s1->nelem;
2156 m->elems = xpalloc (NULL, &m->alloc, s2->nelem, -1, sizeof *m->elems);
2157 }
2158 m->nelem = 0;
2159 while (i < s1->nelem || j < s2->nelem)
2160 if (! (j < s2->nelem)
2161 || (i < s1->nelem && s1->elems[i].index <= s2->elems[j].index))
2162 {
2163 unsigned int c = ((i < s1->nelem && j < s2->nelem
2164 && s1->elems[i].index == s2->elems[j].index)
2165 ? s2->elems[j++].constraint & c2
2166 : 0);
2167 m->elems[m->nelem].index = s1->elems[i].index;
2168 m->elems[m->nelem++].constraint = s1->elems[i++].constraint | c;
2169 }
2170 else
2171 {
2172 if (s2->elems[j].constraint & c2)
2173 {
2174 m->elems[m->nelem].index = s2->elems[j].index;
2175 m->elems[m->nelem++].constraint = s2->elems[j].constraint & c2;
2176 }
2177 j++;
2178 }
2179}
2180
2181/* Merge two sets of positions into a third. The result is exactly as if
2182 the positions of both sets were inserted into an initially empty set. */
2183static void
2184merge (position_set const *s1, position_set const *s2, position_set *m)
2185{
2186 merge_constrained (s1, s2, -1, m);
2187}
2188
2189/* Merge into DST all the elements of SRC, possibly destroying
2190 the contents of the temporary M. */
2191static void
2192merge2 (position_set *dst, position_set const *src, position_set *m)
2193{
2194 if (src->nelem < 4)
2195 {
2196 for (idx_t i = 0; i < src->nelem; i++)
2197 insert (src->elems[i], dst);
2198 }
2199 else
2200 {
2201 merge (src, dst, m);
2202 copy (m, dst);
2203 }
2204}
2205
2206/* Delete a position from a set. Return the nonzero constraint of the
2207 deleted position, or zero if there was no such position. */
2208static unsigned int
2209delete (idx_t del, position_set *s)
2210{
2211 idx_t count = s->nelem;
2212 idx_t lo = 0, hi = count;
2213 while (lo < hi)
2214 {
2215 idx_t mid = (lo + hi) >> 1;
2216 if (s->elems[mid].index < del)
2217 lo = mid + 1;
2218 else if (s->elems[mid].index == del)
2219 {
2220 unsigned int c = s->elems[mid].constraint;
2221 idx_t i;
2222 for (i = mid; i + 1 < count; i++)
2223 s->elems[i] = s->elems[i + 1];
2224 s->nelem = i;
2225 return c;
2226 }
2227 else
2228 hi = mid;
2229 }
2230 return 0;
2231}
2232
2233/* Replace a position with the followed set. */
2234static void
2235replace (position_set *dst, idx_t del, position_set *add,
2236 unsigned int constraint, position_set *tmp)
2237{
2238 unsigned int c = delete (del, dst) & constraint;
2239
2240 if (c)
2241 {
2242 copy (dst, tmp);
2243 merge_constrained (tmp, add, c, dst);
2244 }
2245}
2246
2247/* Find the index of the state corresponding to the given position set with
2248 the given preceding context, or create a new state if there is no such
2249 state. Context tells whether we got here on a newline or letter. */
2250static state_num
2251state_index (struct dfa *d, position_set const *s, int context)
2252{
2253 size_t hash = 0;
2254 int constraint = 0;
2255 state_num i;
2256
2257 for (i = 0; i < s->nelem; ++i)
2258 {
2259 idx_t ind = s->elems[i].index;
2260 hash ^= ind + s->elems[i].constraint;
2261 }
2262
2263 /* Try to find a state that exactly matches the proposed one. */
2264 for (i = 0; i < d->sindex; ++i)
2265 {
2266 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
2267 || context != d->states[i].context)
2268 continue;
2269 state_num j;
2270 for (j = 0; j < s->nelem; ++j)
2271 if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint
2272 || s->elems[j].index != d->states[i].elems.elems[j].index)
2273 break;
2274 if (j == s->nelem)
2275 return i;
2276 }
2277
2278#ifdef DEBUG
2279 fprintf (stderr, "new state %td\n nextpos:", i);
2280 for (state_num j = 0; j < s->nelem; j++)
2281 {
2282 fprintf (stderr, " %td:", s->elems[j].index);
2283 prtok (d->tokens[s->elems[j].index]);
2284 }
2285 fprintf (stderr, "\n context:");
2286 if (context ^ CTX_ANY)
2287 {
2288 if (context & CTX_NONE)
2289 fprintf (stderr, " CTX_NONE");
2290 if (context & CTX_LETTER)
2291 fprintf (stderr, " CTX_LETTER");
2292 if (context & CTX_NEWLINE)
2293 fprintf (stderr, " CTX_NEWLINE");
2294 }
2295 else
2296 fprintf (stderr, " CTX_ANY");
2297 fprintf (stderr, "\n");
2298#endif
2299
2300 for (state_num j = 0; j < s->nelem; j++)
2301 {
2302 int c = d->constraints[s->elems[j].index];
2303
2304 if (c != 0)
2305 {
2306 if (succeeds_in_context (c, context, CTX_ANY))
2307 constraint |= c;
2308 }
2309 else if (d->tokens[s->elems[j].index] == BACKREF)
2310 constraint = NO_CONSTRAINT;
2311 }
2312
2313
2314 /* Create a new state. */
2315 d->states = maybe_realloc (d->states, d->sindex, &d->salloc, -1,
2316 sizeof *d->states);
2317 d->states[i].hash = hash;
2318 alloc_position_set (&d->states[i].elems, s->nelem);
2319 copy (s, &d->states[i].elems);
2320 d->states[i].context = context;
2321 d->states[i].constraint = constraint;
2322 d->states[i].mbps.nelem = 0;
2323 d->states[i].mbps.elems = NULL;
2324 d->states[i].mb_trindex = -1;
2325
2326 ++d->sindex;
2327
2328 return i;
2329}
2330
2331/* Find the epsilon closure of D's set of positions. If any position of the set
2332 contains a symbol that matches the empty string in some context, replace
2333 that position with the elements of its follow labeled with an appropriate
2334 constraint. Repeat exhaustively until no funny positions are left.
2335 S->elems must be large enough to hold the result. BACKWARD is D's
2336 backward set; use and update it too. */
2337static void
2338epsclosure (struct dfa const *d, position_set *backward)
2339{
2340 position_set tmp;
2341 alloc_position_set (&tmp, d->nleaves);
2342 for (idx_t i = 0; i < d->tindex; i++)
2343 if (0 < d->follows[i].nelem)
2344 {
2345 unsigned int constraint;
2346 switch (d->tokens[i])
2347 {
2348 default:
2349 continue;
2350
2351 case BEGLINE:
2352 constraint = BEGLINE_CONSTRAINT;
2353 break;
2354 case ENDLINE:
2355 constraint = ENDLINE_CONSTRAINT;
2356 break;
2357 case BEGWORD:
2358 constraint = BEGWORD_CONSTRAINT;
2359 break;
2360 case ENDWORD:
2361 constraint = ENDWORD_CONSTRAINT;
2362 break;
2363 case LIMWORD:
2364 constraint = LIMWORD_CONSTRAINT;
2365 break;
2366 case NOTLIMWORD:
2367 constraint = NOTLIMWORD_CONSTRAINT;
2368 break;
2369 case EMPTY:
2370 constraint = NO_CONSTRAINT;
2371 break;
2372 }
2373
2374 delete (i, &d->follows[i]);
2375
2376 for (idx_t j = 0; j < backward[i].nelem; j++)
2377 replace (&d->follows[backward[i].elems[j].index], i, &d->follows[i],
2378 constraint, &tmp);
2379 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2380 replace (&backward[d->follows[i].elems[j].index], i, &backward[i],
2381 NO_CONSTRAINT, &tmp);
2382 }
2383 free (tmp.elems);
2384}
2385
2386/* Returns the set of contexts for which there is at least one
2387 character included in C. */
2388
2389static int
2390charclass_context (struct dfa const *dfa, charclass const *c)
2391{
2392 int context = 0;
2393
2394 for (int j = 0; j < CHARCLASS_WORDS; j++)
2395 {
2396 if (c->w[j] & dfa->syntax.newline.w[j])
2397 context |= CTX_NEWLINE;
2398 if (c->w[j] & dfa->syntax.letters.w[j])
2399 context |= CTX_LETTER;
2400 if (c->w[j] & ~(dfa->syntax.letters.w[j] | dfa->syntax.newline.w[j]))
2401 context |= CTX_NONE;
2402 }
2403
2404 return context;
2405}
2406
2407/* Returns the contexts on which the position set S depends. Each context
2408 in the set of returned contexts (let's call it SC) may have a different
2409 follow set than other contexts in SC, and also different from the
2410 follow set of the complement set (sc ^ CTX_ANY). However, all contexts
2411 in the complement set will have the same follow set. */
2412
2413static int _GL_ATTRIBUTE_PURE
2414state_separate_contexts (struct dfa *d, position_set const *s)
2415{
2416 int separate_contexts = 0;
2417
2418 for (idx_t j = 0; j < s->nelem; j++)
2419 separate_contexts |= d->separates[s->elems[j].index];
2420
2421 return separate_contexts;
2422}
2423
2424enum
2425{
2426 /* Single token is repeated. It is distinguished from non-repeated. */
2427 OPT_REPEAT = (1 << 0),
2428
2429 /* Multiple tokens are repeated. This flag is on at head of tokens. The
2430 node is not merged. */
2431 OPT_LPAREN = (1 << 1),
2432
2433 /* Multiple branches are joined. The node is not merged. */
2434 OPT_RPAREN = (1 << 2),
2435
2436 /* The node is walked. If the node is found in walking again, OPT_RPAREN
2437 flag is turned on. */
2438 OPT_WALKED = (1 << 3),
2439
2440 /* The node is queued. The node is not queued again. */
2441 OPT_QUEUED = (1 << 4)
2442};
2443
2444static void
2445merge_nfa_state (struct dfa *d, idx_t tindex, char *flags,
2446 position_set *merged)
2447{
2448 position_set *follows = d->follows;
2449 idx_t nelem = 0;
2450
2451 for (idx_t i = 0; i < follows[tindex].nelem; i++)
2452 {
2453 idx_t sindex = follows[tindex].elems[i].index;
2454
2455 /* Skip the node as pruned in future. */
2456 unsigned int iconstraint = follows[tindex].elems[i].constraint;
2457 if (iconstraint == 0)
2458 continue;
2459
2460 if (d->tokens[follows[tindex].elems[i].index] <= END)
2461 {
2462 d->constraints[tindex] |= follows[tindex].elems[i].constraint;
2463 continue;
2464 }
2465
2466 if (sindex != tindex && !(flags[sindex] & (OPT_LPAREN | OPT_RPAREN)))
2467 {
2468 idx_t j;
2469
2470 for (j = 0; j < nelem; j++)
2471 {
2472 idx_t dindex = follows[tindex].elems[j].index;
2473
2474 if (dindex == tindex)
2475 continue;
2476
2477 if (follows[tindex].elems[j].constraint != iconstraint)
2478 continue;
2479
2480 if (flags[dindex] & (OPT_LPAREN | OPT_RPAREN))
2481 continue;
2482
2483 if (d->tokens[sindex] != d->tokens[dindex])
2484 continue;
2485
2486 if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT)
2487 continue;
2488
2489 if (flags[sindex] & OPT_REPEAT)
2490 delete (sindex, &follows[sindex]);
2491
2492 merge2 (&follows[dindex], &follows[sindex], merged);
2493
2494 break;
2495 }
2496
2497 if (j < nelem)
2498 continue;
2499 }
2500
2501 follows[tindex].elems[nelem++] = follows[tindex].elems[i];
2502 flags[sindex] |= OPT_QUEUED;
2503 }
2504
2505 follows[tindex].nelem = nelem;
2506}
2507
2508static int
2509compare (const void *a, const void *b)
2510{
2511 position const *p = a, *q = b;
2512 return (p->index > q->index) - (p->index < q->index);
2513}
2514
2515static void
2516reorder_tokens (struct dfa *d)
2517{
2518 idx_t nleaves = 0;
2519 ptrdiff_t *map = xnmalloc (d->tindex, sizeof *map);
2520 map[0] = nleaves++;
2521 for (idx_t i = 1; i < d->tindex; i++)
2522 map[i] = -1;
2523
2524 token *tokens = xnmalloc (d->nleaves, sizeof *tokens);
2525 position_set *follows = xnmalloc (d->nleaves, sizeof *follows);
2526 int *constraints = xnmalloc (d->nleaves, sizeof *constraints);
2527 char *multibyte_prop = (d->localeinfo.multibyte
2528 ? xnmalloc (d->nleaves, sizeof *multibyte_prop)
2529 : NULL);
2530
2531 for (idx_t i = 0; i < d->tindex; i++)
2532 {
2533 if (map[i] < 0)
2534 {
2535 free (d->follows[i].elems);
2536 d->follows[i].elems = NULL;
2537 d->follows[i].nelem = 0;
2538 continue;
2539 }
2540
2541 tokens[map[i]] = d->tokens[i];
2542 follows[map[i]] = d->follows[i];
2543 constraints[map[i]] = d->constraints[i];
2544
2545 if (multibyte_prop != NULL)
2546 multibyte_prop[map[i]] = d->multibyte_prop[i];
2547
2548 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2549 {
2550 if (map[d->follows[i].elems[j].index] == -1)
2551 map[d->follows[i].elems[j].index] = nleaves++;
2552
2553 d->follows[i].elems[j].index = map[d->follows[i].elems[j].index];
2554 }
2555
2556 qsort (d->follows[i].elems, d->follows[i].nelem,
2557 sizeof *d->follows[i].elems, compare);
2558 }
2559
2560 for (idx_t i = 0; i < nleaves; i++)
2561 {
2562 d->tokens[i] = tokens[i];
2563 d->follows[i] = follows[i];
2564 d->constraints[i] = constraints[i];
2565
2566 if (multibyte_prop != NULL)
2567 d->multibyte_prop[i] = multibyte_prop[i];
2568 }
2569
2570 d->tindex = d->nleaves = nleaves;
2571
2572 free (tokens);
2573 free (follows);
2574 free (constraints);
2575 free (multibyte_prop);
2576 free (map);
2577}
2578
2579static void
2580dfaoptimize (struct dfa *d)
2581{
2582 char *flags = xizalloc (d->tindex);
2583
2584 for (idx_t i = 0; i < d->tindex; i++)
2585 {
2586 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2587 {
2588 if (d->follows[i].elems[j].index == i)
2589 flags[d->follows[i].elems[j].index] |= OPT_REPEAT;
2590 else if (d->follows[i].elems[j].index < i)
2591 flags[d->follows[i].elems[j].index] |= OPT_LPAREN;
2592 else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED)
2593 flags[d->follows[i].elems[j].index] |= OPT_RPAREN;
2594 else
2595 flags[d->follows[i].elems[j].index] |= OPT_WALKED;
2596 }
2597 }
2598
2599 flags[0] |= OPT_QUEUED;
2600
2601 position_set merged0;
2602 position_set *merged = &merged0;
2603 alloc_position_set (merged, d->nleaves);
2604
2605 d->constraints = xicalloc (d->tindex, sizeof *d->constraints);
2606
2607 for (idx_t i = 0; i < d->tindex; i++)
2608 if (flags[i] & OPT_QUEUED)
2609 merge_nfa_state (d, i, flags, merged);
2610
2611 reorder_tokens (d);
2612
2613 free (merged->elems);
2614 free (flags);
2615}
2616
2617/* Perform bottom-up analysis on the parse tree, computing various functions.
2618 Note that at this point, we're pretending constructs like \< are real
2619 characters rather than constraints on what can follow them.
2620
2621 Nullable: A node is nullable if it is at the root of a regexp that can
2622 match the empty string.
2623 * EMPTY leaves are nullable.
2624 * No other leaf is nullable.
2625 * A QMARK or STAR node is nullable.
2626 * A PLUS node is nullable if its argument is nullable.
2627 * A CAT node is nullable if both its arguments are nullable.
2628 * An OR node is nullable if either argument is nullable.
2629
2630 Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
2631 that could correspond to the first character of a string matching the
2632 regexp rooted at the given node.
2633 * EMPTY leaves have empty firstpos.
2634 * The firstpos of a nonempty leaf is that leaf itself.
2635 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2636 argument.
2637 * The firstpos of a CAT node is the firstpos of the left argument, union
2638 the firstpos of the right if the left argument is nullable.
2639 * The firstpos of an OR node is the union of firstpos of each argument.
2640
2641 Lastpos: The lastpos of a node is the set of positions that could
2642 correspond to the last character of a string matching the regexp at
2643 the given node.
2644 * EMPTY leaves have empty lastpos.
2645 * The lastpos of a nonempty leaf is that leaf itself.
2646 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2647 argument.
2648 * The lastpos of a CAT node is the lastpos of its right argument, union
2649 the lastpos of the left if the right argument is nullable.
2650 * The lastpos of an OR node is the union of the lastpos of each argument.
2651
2652 Follow: The follow of a position is the set of positions that could
2653 correspond to the character following a character matching the node in
2654 a string matching the regexp. At this point we consider special symbols
2655 that match the empty string in some context to be just normal characters.
2656 Later, if we find that a special symbol is in a follow set, we will
2657 replace it with the elements of its follow, labeled with an appropriate
2658 constraint.
2659 * Every node in the firstpos of the argument of a STAR or PLUS node is in
2660 the follow of every node in the lastpos.
2661 * Every node in the firstpos of the second argument of a CAT node is in
2662 the follow of every node in the lastpos of the first argument.
2663
2664 Because of the postfix representation of the parse tree, the depth-first
2665 analysis is conveniently done by a linear scan with the aid of a stack.
2666 Sets are stored as arrays of the elements, obeying a stack-like allocation
2667 scheme; the number of elements in each set deeper in the stack can be
2668 used to determine the address of a particular set's array. */
2669static void
2670dfaanalyze (struct dfa *d, bool searchflag)
2671{
2672 /* Array allocated to hold position sets. */
2673 position *posalloc = xnmalloc (d->nleaves, 2 * sizeof *posalloc);
2674 /* Firstpos and lastpos elements. */
2675 position *firstpos = posalloc;
2676 position *lastpos = firstpos + d->nleaves;
2677 position pos;
2678 position_set tmp;
2679
2680 /* Stack for element counts and nullable flags. */
2681 struct
2682 {
2683 /* Whether the entry is nullable. */
2684 bool nullable;
2685
2686 /* Counts of firstpos and lastpos sets. */
2687 idx_t nfirstpos;
2688 idx_t nlastpos;
2689 } *stkalloc = xnmalloc (d->depth, sizeof *stkalloc), *stk = stkalloc;
2690
2691 position_set merged; /* Result of merging sets. */
2692
2693 addtok (d, CAT);
2694 idx_t tindex = d->tindex;
2695
2696#ifdef DEBUG
2697 fprintf (stderr, "dfaanalyze:\n");
2698 for (idx_t i = 0; i < tindex; i++)
2699 {
2700 fprintf (stderr, " %td:", i);
2701 prtok (d->tokens[i]);
2702 }
2703 putc ('\n', stderr);
2704#endif
2705
2706 d->searchflag = searchflag;
2707 alloc_position_set (&merged, d->nleaves);
2708 d->follows = xicalloc (tindex, sizeof *d->follows);
2709 position_set *backward
2710 = d->epsilon ? xicalloc (tindex, sizeof *backward) : NULL;
2711
2712 for (idx_t i = 0; i < tindex; i++)
2713 {
2714 switch (d->tokens[i])
2715 {
2716 case EMPTY:
2717 /* The empty set is nullable. */
2718 stk->nullable = true;
2719
2720 /* The firstpos and lastpos of the empty leaf are both empty. */
2721 stk->nfirstpos = stk->nlastpos = 0;
2722 stk++;
2723 break;
2724
2725 case STAR:
2726 case PLUS:
2727 /* Every element in the lastpos of the argument is in the backward
2728 set of every element in the firstpos. */
2729 if (d->epsilon)
2730 {
2731 tmp.elems = lastpos - stk[-1].nlastpos;
2732 tmp.nelem = stk[-1].nlastpos;
2733 for (position *p = firstpos - stk[-1].nfirstpos;
2734 p < firstpos; p++)
2735 merge2 (&backward[p->index], &tmp, &merged);
2736 }
2737
2738 /* Every element in the firstpos of the argument is in the follow
2739 of every element in the lastpos. */
2740 {
2741 tmp.elems = firstpos - stk[-1].nfirstpos;
2742 tmp.nelem = stk[-1].nfirstpos;
2743 for (position *p = lastpos - stk[-1].nlastpos; p < lastpos; p++)
2744 merge2 (&d->follows[p->index], &tmp, &merged);
2745 }
2746 FALLTHROUGH;
2747 case QMARK:
2748 /* A QMARK or STAR node is automatically nullable. */
2749 if (d->tokens[i] != PLUS)
2750 stk[-1].nullable = true;
2751 break;
2752
2753 case CAT:
2754 /* Every element in the lastpos of the first argument is in
2755 the backward set of every element in the firstpos of the
2756 second argument. */
2757 if (backward)
2758 {
2759 tmp.nelem = stk[-2].nlastpos;
2760 tmp.elems = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2761 for (position *p = firstpos - stk[-1].nfirstpos;
2762 p < firstpos; p++)
2763 merge2 (&backward[p->index], &tmp, &merged);
2764 }
2765
2766 /* Every element in the firstpos of the second argument is in the
2767 follow of every element in the lastpos of the first argument. */
2768 {
2769 tmp.nelem = stk[-1].nfirstpos;
2770 tmp.elems = firstpos - stk[-1].nfirstpos;
2771 for (position *plim = lastpos - stk[-1].nlastpos,
2772 *p = plim - stk[-2].nlastpos;
2773 p < plim; p++)
2774 merge2 (&d->follows[p->index], &tmp, &merged);
2775 }
2776
2777 /* The firstpos of a CAT node is the firstpos of the first argument,
2778 union that of the second argument if the first is nullable. */
2779 if (stk[-2].nullable)
2780 stk[-2].nfirstpos += stk[-1].nfirstpos;
2781 else
2782 firstpos -= stk[-1].nfirstpos;
2783
2784 /* The lastpos of a CAT node is the lastpos of the second argument,
2785 union that of the first argument if the second is nullable. */
2786 if (stk[-1].nullable)
2787 stk[-2].nlastpos += stk[-1].nlastpos;
2788 else
2789 {
2790 position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2791 for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2792 p[j] = p[j + stk[-2].nlastpos];
2793 lastpos -= stk[-2].nlastpos;
2794 stk[-2].nlastpos = stk[-1].nlastpos;
2795 }
2796
2797 /* A CAT node is nullable if both arguments are nullable. */
2798 stk[-2].nullable &= stk[-1].nullable;
2799 stk--;
2800 break;
2801
2802 case OR:
2803 /* The firstpos is the union of the firstpos of each argument. */
2804 stk[-2].nfirstpos += stk[-1].nfirstpos;
2805
2806 /* The lastpos is the union of the lastpos of each argument. */
2807 stk[-2].nlastpos += stk[-1].nlastpos;
2808
2809 /* An OR node is nullable if either argument is nullable. */
2810 stk[-2].nullable |= stk[-1].nullable;
2811 stk--;
2812 break;
2813
2814 default:
2815 /* Anything else is a nonempty position. (Note that special
2816 constructs like \< are treated as nonempty strings here;
2817 an "epsilon closure" effectively makes them nullable later.
2818 Backreferences have to get a real position so we can detect
2819 transitions on them later. But they are nullable. */
2820 stk->nullable = d->tokens[i] == BACKREF;
2821
2822 /* This position is in its own firstpos and lastpos. */
2823 stk->nfirstpos = stk->nlastpos = 1;
2824 stk++;
2825
2826 firstpos->index = lastpos->index = i;
2827 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2828 firstpos++, lastpos++;
2829
2830 break;
2831 }
2832#ifdef DEBUG
2833 /* ... balance the above nonsyntactic #ifdef goo... */
2834 fprintf (stderr, "node %td:", i);
2835 prtok (d->tokens[i]);
2836 putc ('\n', stderr);
2837 fprintf (stderr,
2838 stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
2839 fprintf (stderr, " firstpos:");
2840 for (idx_t j = 0; j < stk[-1].nfirstpos; j++)
2841 {
2842 fprintf (stderr, " %td:", firstpos[j - stk[-1].nfirstpos].index);
2843 prtok (d->tokens[firstpos[j - stk[-1].nfirstpos].index]);
2844 }
2845 fprintf (stderr, "\n lastpos:");
2846 for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2847 {
2848 fprintf (stderr, " %td:", lastpos[j - stk[-1].nlastpos].index);
2849 prtok (d->tokens[lastpos[j - stk[-1].nlastpos].index]);
2850 }
2851 putc ('\n', stderr);
2852#endif
2853 }
2854
2855 if (backward)
2856 {
2857 /* For each follow set that is the follow set of a real position,
2858 replace it with its epsilon closure. */
2859 epsclosure (d, backward);
2860
2861 for (idx_t i = 0; i < tindex; i++)
2862 free (backward[i].elems);
2863 free (backward);
2864 }
2865
2866 dfaoptimize (d);
2867
2868#ifdef DEBUG
2869 for (idx_t i = 0; i < tindex; i++)
2870 if (d->tokens[i] == BEG || d->tokens[i] < NOTCHAR
2871 || d->tokens[i] == BACKREF || d->tokens[i] == ANYCHAR
2872 || d->tokens[i] == MBCSET || d->tokens[i] >= CSET)
2873 {
2874 fprintf (stderr, "follows(%td:", i);
2875 prtok (d->tokens[i]);
2876 fprintf (stderr, "):");
2877 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2878 {
2879 fprintf (stderr, " %td:", d->follows[i].elems[j].index);
2880 prtok (d->tokens[d->follows[i].elems[j].index]);
2881 }
2882 putc ('\n', stderr);
2883 }
2884#endif
2885
2886 pos.index = 0;
2887 pos.constraint = NO_CONSTRAINT;
2888
2889 alloc_position_set (&tmp, 1);
2890
2891 append (pos, &tmp);
2892
2893 d->separates = xicalloc (tindex, sizeof *d->separates);
2894
2895 for (idx_t i = 0; i < tindex; i++)
2896 {
2897 if (prev_newline_dependent (d->constraints[i]))
2898 d->separates[i] |= CTX_NEWLINE;
2899 if (prev_letter_dependent (d->constraints[i]))
2900 d->separates[i] |= CTX_LETTER;
2901
2902 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2903 {
2904 if (prev_newline_dependent (d->follows[i].elems[j].constraint))
2905 d->separates[i] |= CTX_NEWLINE;
2906 if (prev_letter_dependent (d->follows[i].elems[j].constraint))
2907 d->separates[i] |= CTX_LETTER;
2908 }
2909 }
2910
2911 /* Context wanted by some position. */
2912 int separate_contexts = state_separate_contexts (d, &tmp);
2913
2914 /* Build the initial state. */
2915 if (separate_contexts & CTX_NEWLINE)
2916 state_index (d, &tmp, CTX_NEWLINE);
2917 d->initstate_notbol = d->min_trcount
2918 = state_index (d, &tmp, separate_contexts ^ CTX_ANY);
2919 if (separate_contexts & CTX_LETTER)
2920 d->min_trcount = state_index (d, &tmp, CTX_LETTER);
2921 d->min_trcount++;
2922 d->trcount = 0;
2923
2924 free (posalloc);
2925 free (stkalloc);
2926 free (merged.elems);
2927 free (tmp.elems);
2928}
2929
2930/* Make sure D's state arrays are large enough to hold NEW_STATE. */
2931static void
2932realloc_trans_if_necessary (struct dfa *d)
2933{
2934 state_num oldalloc = d->tralloc;
2935 if (oldalloc < d->sindex)
2936 {
2937 state_num **realtrans = d->trans ? d->trans - 2 : NULL;
2938 idx_t newalloc1 = realtrans ? d->tralloc + 2 : 0;
2939 realtrans = xpalloc (realtrans, &newalloc1, d->sindex - oldalloc,
2940 -1, sizeof *realtrans);
2941 realtrans[0] = realtrans[1] = NULL;
2942 d->trans = realtrans + 2;
2943 idx_t newalloc = d->tralloc = newalloc1 - 2;
2944 d->fails = xreallocarray (d->fails, newalloc, sizeof *d->fails);
2945 d->success = xreallocarray (d->success, newalloc, sizeof *d->success);
2946 d->newlines = xreallocarray (d->newlines, newalloc, sizeof *d->newlines);
2947 if (d->localeinfo.multibyte)
2948 {
2949 realtrans = d->mb_trans ? d->mb_trans - 2 : NULL;
2950 realtrans = xreallocarray (realtrans, newalloc1, sizeof *realtrans);
2951 if (oldalloc == 0)
2952 realtrans[0] = realtrans[1] = NULL;
2953 d->mb_trans = realtrans + 2;
2954 }
2955 for (; oldalloc < newalloc; oldalloc++)
2956 {
2957 d->trans[oldalloc] = NULL;
2958 d->fails[oldalloc] = NULL;
2959 if (d->localeinfo.multibyte)
2960 d->mb_trans[oldalloc] = NULL;
2961 }
2962 }
2963}
2964
2965/*
2966 Calculate the transition table for a new state derived from state s
2967 for a compiled dfa d after input character uc, and return the new
2968 state number.
2969
2970 Do not worry about all possible input characters; calculate just the group
2971 of positions that match uc. Label it with the set of characters that
2972 every position in the group matches (taking into account, if necessary,
2973 preceding context information of s). Then find the union
2974 of these positions' follows, i.e., the set of positions of the
2975 new state. For each character in the group's label, set the transition
2976 on this character to be to a state corresponding to the set's positions,
2977 and its associated backward context information, if necessary.
2978
2979 When building a searching matcher, include the positions of state
2980 0 in every state.
2981
2982 The group is constructed by building an equivalence-class
2983 partition of the positions of s.
2984
2985 For each position, find the set of characters C that it matches. Eliminate
2986 any characters from C that fail on grounds of backward context.
2987
2988 Check whether the group's label L has nonempty
2989 intersection with C. If L - C is nonempty, create a new group labeled
2990 L - C and having the same positions as the current group, and set L to
2991 the intersection of L and C. Insert the position in the group, set
2992 C = C - L, and resume scanning.
2993
2994 If after comparing with every group there are characters remaining in C,
2995 create a new group labeled with the characters of C and insert this
2996 position in that group. */
2997
2998static state_num
2999build_state (state_num s, struct dfa *d, unsigned char uc)
3000{
3001 position_set follows; /* Union of the follows for each
3002 position of the current state. */
3003 position_set group; /* Positions that match the input char. */
3004 position_set tmp; /* Temporary space for merging sets. */
3005 state_num state; /* New state. */
3006 state_num state_newline; /* New state on a newline transition. */
3007 state_num state_letter; /* New state on a letter transition. */
3008
3009#ifdef DEBUG
3010 fprintf (stderr, "build state %td\n", s);
3011#endif
3012
3013 /* A pointer to the new transition table, and the table itself. */
3014 state_num **ptrans = (accepting (s, d) ? d->fails : d->trans) + s;
3015 state_num *trans = *ptrans;
3016
3017 if (!trans)
3018 {
3019 /* MAX_TRCOUNT is an arbitrary upper limit on the number of
3020 transition tables that can exist at once, other than for
3021 initial states. Often-used transition tables are quickly
3022 rebuilt, whereas rarely-used ones are cleared away. */
3023 if (MAX_TRCOUNT <= d->trcount)
3024 {
3025 for (state_num i = d->min_trcount; i < d->tralloc; i++)
3026 {
3027 free (d->trans[i]);
3028 free (d->fails[i]);
3029 d->trans[i] = d->fails[i] = NULL;
3030 }
3031 d->trcount = 0;
3032 }
3033
3034 d->trcount++;
3035 *ptrans = trans = xmalloc (NOTCHAR * sizeof *trans);
3036
3037 /* Fill transition table with a default value which means that the
3038 transited state has not been calculated yet. */
3039 for (int i = 0; i < NOTCHAR; i++)
3040 trans[i] = -2;
3041 }
3042
3043 /* Set up the success bits for this state. */
3044 d->success[s] = 0;
3045 if (accepts_in_context (d->states[s].context, CTX_NEWLINE, s, d))
3046 d->success[s] |= CTX_NEWLINE;
3047 if (accepts_in_context (d->states[s].context, CTX_LETTER, s, d))
3048 d->success[s] |= CTX_LETTER;
3049 if (accepts_in_context (d->states[s].context, CTX_NONE, s, d))
3050 d->success[s] |= CTX_NONE;
3051
3052 alloc_position_set (&follows, d->nleaves);
3053
3054 /* Find the union of the follows of the positions of the group.
3055 This is a hideously inefficient loop. Fix it someday. */
3056 for (idx_t j = 0; j < d->states[s].elems.nelem; j++)
3057 for (idx_t k = 0;
3058 k < d->follows[d->states[s].elems.elems[j].index].nelem; ++k)
3059 insert (d->follows[d->states[s].elems.elems[j].index].elems[k],
3060 &follows);
3061
3062 /* Positions that match the input char. */
3063 alloc_position_set (&group, d->nleaves);
3064
3065 /* The group's label. */
3066 charclass label;
3067 fillset (&label);
3068
3069 for (idx_t i = 0; i < follows.nelem; i++)
3070 {
3071 charclass matches; /* Set of matching characters. */
3072 position pos = follows.elems[i];
3073 bool matched = false;
3074 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
3075 {
3076 zeroset (&matches);
3077 setbit (d->tokens[pos.index], &matches);
3078 if (d->tokens[pos.index] == uc)
3079 matched = true;
3080 }
3081 else if (d->tokens[pos.index] >= CSET)
3082 {
3083 matches = d->charclasses[d->tokens[pos.index] - CSET];
3084 if (tstbit (uc, &matches))
3085 matched = true;
3086 }
3087 else if (d->tokens[pos.index] == ANYCHAR)
3088 {
3089 matches = d->charclasses[d->canychar];
3090 if (tstbit (uc, &matches))
3091 matched = true;
3092
3093 /* ANYCHAR must match with a single character, so we must put
3094 it to D->states[s].mbps which contains the positions which
3095 can match with a single character not a byte. If all
3096 positions which has ANYCHAR does not depend on context of
3097 next character, we put the follows instead of it to
3098 D->states[s].mbps to optimize. */
3099 if (succeeds_in_context (pos.constraint, d->states[s].context,
3100 CTX_NONE))
3101 {
3102 if (d->states[s].mbps.nelem == 0)
3103 alloc_position_set (&d->states[s].mbps, 1);
3104 insert (pos, &d->states[s].mbps);
3105 }
3106 }
3107 else
3108 continue;
3109
3110 /* Some characters may need to be eliminated from matches because
3111 they fail in the current context. */
3112 if (pos.constraint != NO_CONSTRAINT)
3113 {
3114 if (!succeeds_in_context (pos.constraint,
3115 d->states[s].context, CTX_NEWLINE))
3116 for (int j = 0; j < CHARCLASS_WORDS; j++)
3117 matches.w[j] &= ~d->syntax.newline.w[j];
3118 if (!succeeds_in_context (pos.constraint,
3119 d->states[s].context, CTX_LETTER))
3120 for (int j = 0; j < CHARCLASS_WORDS; ++j)
3121 matches.w[j] &= ~d->syntax.letters.w[j];
3122 if (!succeeds_in_context (pos.constraint,
3123 d->states[s].context, CTX_NONE))
3124 for (int j = 0; j < CHARCLASS_WORDS; ++j)
3125 matches.w[j] &= d->syntax.letters.w[j] | d->syntax.newline.w[j];
3126
3127 /* If there are no characters left, there's no point in going on. */
3128 if (emptyset (&matches))
3129 continue;
3130
3131 /* If we have reset the bit that made us declare "matched", reset
3132 that indicator, too. This is required to avoid an infinite loop
3133 with this command: echo cx | LC_ALL=C grep -E 'c\b[x ]' */
3134 if (!tstbit (uc, &matches))
3135 matched = false;
3136 }
3137
3138#ifdef DEBUG
3139 fprintf (stderr, " nextpos %td:", pos.index);
3140 prtok (d->tokens[pos.index]);
3141 fprintf (stderr, " of");
3142 for (unsigned j = 0; j < NOTCHAR; j++)
3143 if (tstbit (j, &matches))
3144 fprintf (stderr, " 0x%02x", j);
3145 fprintf (stderr, "\n");
3146#endif
3147
3148 if (matched)
3149 {
3150 for (int k = 0; k < CHARCLASS_WORDS; ++k)
3151 label.w[k] &= matches.w[k];
3152 append (pos, &group);
3153 }
3154 else
3155 {
3156 for (int k = 0; k < CHARCLASS_WORDS; ++k)
3157 label.w[k] &= ~matches.w[k];
3158 }
3159 }
3160
3161 alloc_position_set (&tmp, d->nleaves);
3162
3163 if (group.nelem > 0)
3164 {
3165 /* If we are building a searching matcher, throw in the positions
3166 of state 0 as well, if possible. */
3167 if (d->searchflag)
3168 {
3169 /* If a token in follows.elems is not 1st byte of a multibyte
3170 character, or the states of follows must accept the bytes
3171 which are not 1st byte of the multibyte character.
3172 Then, if a state of follows encounters a byte, it must not be
3173 a 1st byte of a multibyte character nor a single byte character.
3174 In this case, do not add state[0].follows to next state, because
3175 state[0] must accept 1st-byte.
3176
3177 For example, suppose <sb a> is a certain single byte character,
3178 <mb A> is a certain multibyte character, and the codepoint of
3179 <sb a> equals the 2nd byte of the codepoint of <mb A>. When
3180 state[0] accepts <sb a>, state[i] transits to state[i+1] by
3181 accepting the 1st byte of <mb A>, and state[i+1] accepts the
3182 2nd byte of <mb A>, if state[i+1] encounters the codepoint of
3183 <sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do
3184 not add state[0]. */
3185
3186 bool mergeit = !d->localeinfo.multibyte;
3187 if (!mergeit)
3188 {
3189 mergeit = true;
3190 for (idx_t j = 0; mergeit && j < group.nelem; j++)
3191 mergeit &= d->multibyte_prop[group.elems[j].index];
3192 }
3193 if (mergeit)
3194 merge2 (&group, &d->states[0].elems, &tmp);
3195 }
3196
3197 /* Find out if the new state will want any context information,
3198 by calculating possible contexts that the group can match,
3199 and separate contexts that the new state wants to know. */
3200 int possible_contexts = charclass_context (d, &label);
3201 int separate_contexts = state_separate_contexts (d, &group);
3202
3203 /* Find the state(s) corresponding to the union of the follows. */
3204 if (possible_contexts & ~separate_contexts)
3205 state = state_index (d, &group, separate_contexts ^ CTX_ANY);
3206 else
3207 state = -1;
3208 if (separate_contexts & possible_contexts & CTX_NEWLINE)
3209 state_newline = state_index (d, &group, CTX_NEWLINE);
3210 else
3211 state_newline = state;
3212 if (separate_contexts & possible_contexts & CTX_LETTER)
3213 state_letter = state_index (d, &group, CTX_LETTER);
3214 else
3215 state_letter = state;
3216
3217 /* Reallocate now, to reallocate any newline transition properly. */
3218 realloc_trans_if_necessary (d);
3219 }
3220
3221 /* If we are a searching matcher, the default transition is to a state
3222 containing the positions of state 0, otherwise the default transition
3223 is to fail miserably. */
3224 else if (d->searchflag)
3225 {
3226 state_newline = 0;
3227 state_letter = d->min_trcount - 1;
3228 state = d->initstate_notbol;
3229 }
3230 else
3231 {
3232 state_newline = -1;
3233 state_letter = -1;
3234 state = -1;
3235 }
3236
3237 /* Set the transitions for each character in the label. */
3238 for (int i = 0; i < NOTCHAR; i++)
3239 if (tstbit (i, &label))
3240 switch (d->syntax.sbit[i])
3241 {
3242 case CTX_NEWLINE:
3243 trans[i] = state_newline;
3244 break;
3245 case CTX_LETTER:
3246 trans[i] = state_letter;
3247 break;
3248 default:
3249 trans[i] = state;
3250 break;
3251 }
3252
3253#ifdef DEBUG
3254 fprintf (stderr, "trans table %td", s);
3255 for (int i = 0; i < NOTCHAR; ++i)
3256 {
3257 if (!(i & 0xf))
3258 fprintf (stderr, "\n");
3259 fprintf (stderr, " %2td", trans[i]);
3260 }
3261 fprintf (stderr, "\n");
3262#endif
3263
3264 free (group.elems);
3265 free (follows.elems);
3266 free (tmp.elems);
3267
3268 /* Keep the newline transition in a special place so we can use it as
3269 a sentinel. */
3270 if (tstbit (d->syntax.eolbyte, &label))
3271 {
3272 d->newlines[s] = trans[d->syntax.eolbyte];
3273 trans[d->syntax.eolbyte] = -1;
3274 }
3275
3276 return trans[uc];
3277}
3278
3279/* Multibyte character handling sub-routines for dfaexec. */
3280
3281/* Consume a single byte and transit state from 's' to '*next_state'.
3282 This function is almost same as the state transition routin in dfaexec.
3283 But state transition is done just once, otherwise matching succeed or
3284 reach the end of the buffer. */
3285static state_num
3286transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
3287{
3288 state_num *t;
3289
3290 if (d->trans[s])
3291 t = d->trans[s];
3292 else if (d->fails[s])
3293 t = d->fails[s];
3294 else
3295 {
3296 build_state (s, d, **pp);
3297 if (d->trans[s])
3298 t = d->trans[s];
3299 else
3300 {
3301 t = d->fails[s];
3302 assert (t);
3303 }
3304 }
3305
3306 if (t[**pp] == -2)
3307 build_state (s, d, **pp);
3308
3309 return t[*(*pp)++];
3310}
3311
3312/* Transit state from s, then return new state and update the pointer of
3313 the buffer. This function is for a period operator which can match a
3314 multi-byte character. */
3315static state_num
3316transit_state (struct dfa *d, state_num s, unsigned char const **pp,
3317 unsigned char const *end)
3318{
3319 wint_t wc;
3320
3321 int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
3322
3323 /* This state has some operators which can match a multibyte character. */
3324 d->mb_follows.nelem = 0;
3325
3326 /* Calculate the state which can be reached from the state 's' by
3327 consuming 'mbclen' single bytes from the buffer. */
3328 state_num s1 = s;
3329 int mbci;
3330 for (mbci = 0; mbci < mbclen && (mbci == 0 || d->min_trcount <= s); mbci++)
3331 s = transit_state_singlebyte (d, s, pp);
3332 *pp += mbclen - mbci;
3333
3334 if (wc == WEOF)
3335 {
3336 /* It is an invalid character, so ANYCHAR is not accepted. */
3337 return s;
3338 }
3339
3340 /* If all positions which have ANYCHAR do not depend on the context
3341 of the next character, calculate the next state with
3342 pre-calculated follows and cache the result. */
3343 if (d->states[s1].mb_trindex < 0)
3344 {
3345 if (MAX_TRCOUNT <= d->mb_trcount)
3346 {
3347 state_num s3;
3348 for (s3 = -1; s3 < d->tralloc; s3++)
3349 {
3350 free (d->mb_trans[s3]);
3351 d->mb_trans[s3] = NULL;
3352 }
3353
3354 for (state_num i = 0; i < d->sindex; i++)
3355 d->states[i].mb_trindex = -1;
3356 d->mb_trcount = 0;
3357 }
3358 d->states[s1].mb_trindex = d->mb_trcount++;
3359 }
3360
3361 if (! d->mb_trans[s])
3362 {
3363 enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
3364 enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
3365 d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
3366 for (int i = 0; i < MAX_TRCOUNT; i++)
3367 d->mb_trans[s][i] = -1;
3368 }
3369 else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
3370 return d->mb_trans[s][d->states[s1].mb_trindex];
3371
3372 if (s == -1)
3373 copy (&d->states[s1].mbps, &d->mb_follows);
3374 else
3375 merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
3376
3377 int separate_contexts = state_separate_contexts (d, &d->mb_follows);
3378 state_num s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
3379 realloc_trans_if_necessary (d);
3380
3381 d->mb_trans[s][d->states[s1].mb_trindex] = s2;
3382
3383 return s2;
3384}
3385
3386/* The initial state may encounter a byte which is not a single byte character
3387 nor the first byte of a multibyte character. But it is incorrect for the
3388 initial state to accept such a byte. For example, in Shift JIS the regular
3389 expression "\\" accepts the codepoint 0x5c, but should not accept the second
3390 byte of the codepoint 0x815c. Then the initial state must skip the bytes
3391 that are not a single byte character nor the first byte of a multibyte
3392 character.
3393
3394 Given DFA state d, use mbs_to_wchar to advance MBP until it reaches
3395 or exceeds P, and return the advanced MBP. If WCP is non-NULL and
3396 the result is greater than P, set *WCP to the final wide character
3397 processed, or to WEOF if no wide character is processed. Otherwise,
3398 if WCP is non-NULL, *WCP may or may not be updated.
3399
3400 Both P and MBP must be no larger than END. */
3401static unsigned char const *
3402skip_remains_mb (struct dfa *d, unsigned char const *p,
3403 unsigned char const *mbp, char const *end)
3404{
3405 if (d->syntax.never_trail[*p])
3406 return p;
3407 while (mbp < p)
3408 {
3409 wint_t wc;
3410 mbp += mbs_to_wchar (&wc, (char const *) mbp,
3411 end - (char const *) mbp, d);
3412 }
3413 return mbp;
3414}
3415
3416/* Search through a buffer looking for a match to the struct dfa *D.
3417 Find the first occurrence of a string matching the regexp in the
3418 buffer, and the shortest possible version thereof. Return a pointer to
3419 the first character after the match, or NULL if none is found. BEGIN
3420 points to the beginning of the buffer, and END points to the first byte
3421 after its end. Note however that we store a sentinel byte (usually
3422 newline) in *END, so the actual buffer must be one byte longer.
3423 When ALLOW_NL, newlines may appear in the matching string.
3424 If COUNT is non-NULL, increment *COUNT once for each newline processed.
3425 If MULTIBYTE, the input consists of multibyte characters and/or
3426 encoding-error bytes. Otherwise, it consists of single-byte characters.
3427 Here is the list of features that make this DFA matcher punt:
3428 - [M-N] range in non-simple locale: regex is up to 25% faster on [a-z]
3429 - [^...] in non-simple locale
3430 - [[=foo=]] or [[.foo.]]
3431 - [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
3432 - back-reference: (.)\1
3433 - word-delimiter in multibyte locale: \<, \>, \b, \B
3434 See struct localeinfo.simple for the definition of "simple locale". */
3435
3436static inline char *
3437dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
3438 idx_t *count, bool multibyte)
3439{
3440 if (MAX_TRCOUNT <= d->sindex)
3441 {
3442 for (state_num s = d->min_trcount; s < d->sindex; s++)
3443 {
3444 free (d->states[s].elems.elems);
3445 free (d->states[s].mbps.elems);
3446 }
3447 d->sindex = d->min_trcount;
3448
3449 if (d->trans)
3450 {
3451 for (state_num s = 0; s < d->tralloc; s++)
3452 {
3453 free (d->trans[s]);
3454 free (d->fails[s]);
3455 d->trans[s] = d->fails[s] = NULL;
3456 }
3457 d->trcount = 0;
3458 }
3459
3460 if (d->localeinfo.multibyte && d->mb_trans)
3461 {
3462 for (state_num s = -1; s < d->tralloc; s++)
3463 {
3464 free (d->mb_trans[s]);
3465 d->mb_trans[s] = NULL;
3466 }
3467 for (state_num s = 0; s < d->min_trcount; s++)
3468 d->states[s].mb_trindex = -1;
3469 d->mb_trcount = 0;
3470 }
3471 }
3472
3473 if (!d->tralloc)
3474 realloc_trans_if_necessary (d);
3475
3476 /* Current state. */
3477 state_num s = 0, s1 = 0;
3478
3479 /* Current input character. */
3480 unsigned char const *p = (unsigned char const *) begin;
3481 unsigned char const *mbp = p;
3482
3483 /* Copy of d->trans so it can be optimized into a register. */
3484 state_num **trans = d->trans;
3485 unsigned char eol = d->syntax.eolbyte; /* Likewise for eolbyte. */
3486 unsigned char saved_end = *(unsigned char *) end;
3487 *end = eol;
3488
3489 if (multibyte)
3490 {
3491 memset (&d->mbs, 0, sizeof d->mbs);
3492 if (d->mb_follows.alloc == 0)
3493 alloc_position_set (&d->mb_follows, d->nleaves);
3494 }
3495
3496 idx_t nlcount = 0;
3497 for (;;)
3498 {
3499 state_num *t;
3500 while ((t = trans[s]) != NULL)
3501 {
3502 if (s < d->min_trcount)
3503 {
3504 if (!multibyte || d->states[s].mbps.nelem == 0)
3505 {
3506 while (t[*p] == s)
3507 p++;
3508 }
3509 if (multibyte)
3510 p = mbp = skip_remains_mb (d, p, mbp, end);
3511 }
3512
3513 if (multibyte)
3514 {
3515 s1 = s;
3516
3517 if (d->states[s].mbps.nelem == 0
3518 || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3519 {
3520 /* If an input character does not match ANYCHAR, do it
3521 like a single-byte character. */
3522 s = t[*p++];
3523 }
3524 else
3525 {
3526 s = transit_state (d, s, &p, (unsigned char *) end);
3527 mbp = p;
3528 trans = d->trans;
3529 }
3530 }
3531 else
3532 {
3533 s1 = t[*p++];
3534 t = trans[s1];
3535 if (! t)
3536 {
3537 state_num tmp = s;
3538 s = s1;
3539 s1 = tmp; /* swap */
3540 break;
3541 }
3542 if (s < d->min_trcount)
3543 {
3544 while (t[*p] == s1)
3545 p++;
3546 }
3547 s = t[*p++];
3548 }
3549 }
3550
3551 if (s < 0)
3552 {
3553 if (s == -2)
3554 {
3555 s = build_state (s1, d, p[-1]);
3556 trans = d->trans;
3557 }
3558 else if ((char *) p <= end && p[-1] == eol && 0 <= d->newlines[s1])
3559 {
3560 /* The previous character was a newline. Count it, and skip
3561 checking of multibyte character boundary until here. */
3562 nlcount++;
3563 mbp = p;
3564
3565 s = (allow_nl ? d->newlines[s1]
3566 : d->syntax.sbit[eol] == CTX_NEWLINE ? 0
3567 : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
3568 : d->initstate_notbol);
3569 }
3570 else
3571 {
3572 p = NULL;
3573 goto done;
3574 }
3575 }
3576 else if (d->fails[s])
3577 {
3578 if ((d->success[s] & d->syntax.sbit[*p])
3579 || ((char *) p == end
3580 && accepts_in_context (d->states[s].context, CTX_NEWLINE, s,
3581 d)))
3582 goto done;
3583
3584 if (multibyte && s < d->min_trcount)
3585 p = mbp = skip_remains_mb (d, p, mbp, end);
3586
3587 s1 = s;
3588 if (!multibyte || d->states[s].mbps.nelem == 0
3589 || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3590 {
3591 /* If a input character does not match ANYCHAR, do it
3592 like a single-byte character. */
3593 s = d->fails[s][*p++];
3594 }
3595 else
3596 {
3597 s = transit_state (d, s, &p, (unsigned char *) end);
3598 mbp = p;
3599 trans = d->trans;
3600 }
3601 }
3602 else
3603 {
3604 build_state (s, d, p[0]);
3605 trans = d->trans;
3606 }
3607 }
3608
3609 done:
3610 if (count)
3611 *count += nlcount;
3612 *end = saved_end;
3613 return (char *) p;
3614}
3615
3616/* Specialized versions of dfaexec for multibyte and single-byte cases.
3617 This is for performance, as dfaexec_main is an inline function. */
3618
3619static char *
3620dfaexec_mb (struct dfa *d, char const *begin, char *end,
3621 bool allow_nl, idx_t *count, bool *backref)
3622{
3623 return dfaexec_main (d, begin, end, allow_nl, count, true);
3624}
3625
3626static char *
3627dfaexec_sb (struct dfa *d, char const *begin, char *end,
3628 bool allow_nl, idx_t *count, bool *backref)
3629{
3630 return dfaexec_main (d, begin, end, allow_nl, count, false);
3631}
3632
3633/* Always set *BACKREF and return BEGIN. Use this wrapper for
3634 any regexp that uses a construct not supported by this code. */
3635static char *
3636dfaexec_noop (struct dfa *d, char const *begin, char *end,
3637 bool allow_nl, idx_t *count, bool *backref)
3638{
3639 *backref = true;
3640 return (char *) begin;
3641}
3642
3643/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
3644 but faster and set *BACKREF if the DFA code does not support this
3645 regexp usage. */
3646
3647char *
3648dfaexec (struct dfa *d, char const *begin, char *end,
3649 bool allow_nl, idx_t *count, bool *backref)
3650{
3651 return d->dfaexec (d, begin, end, allow_nl, count, backref);
3652}
3653
3654struct dfa *
3655dfasuperset (struct dfa const *d)
3656{
3657 return d->superset;
3658}
3659
3660bool
3661dfaisfast (struct dfa const *d)
3662{
3663 return d->fast;
3664}
3665
3666static void
3667free_mbdata (struct dfa *d)
3668{
3669 free (d->multibyte_prop);
3670 free (d->lex.brack.chars);
3671 free (d->mb_follows.elems);
3672
3673 if (d->mb_trans)
3674 {
3675 state_num s;
3676 for (s = -1; s < d->tralloc; s++)
3677 free (d->mb_trans[s]);
3678 free (d->mb_trans - 2);
3679 }
3680}
3681
3682/* Return true if every construct in D is supported by this DFA matcher. */
3683bool
3684dfasupported (struct dfa const *d)
3685{
3686 for (idx_t i = 0; i < d->tindex; i++)
3687 {
3688 switch (d->tokens[i])
3689 {
3690 case BEGWORD:
3691 case ENDWORD:
3692 case LIMWORD:
3693 case NOTLIMWORD:
3694 if (!d->localeinfo.multibyte)
3695 continue;
3696 FALLTHROUGH;
3697 case BACKREF:
3698 case MBCSET:
3699 return false;
3700 }
3701 }
3702 return true;
3703}
3704
3705/* Disable use of the superset DFA if it is not likely to help
3706 performance. */
3707static void
3708maybe_disable_superset_dfa (struct dfa *d)
3709{
3710 if (!d->localeinfo.using_utf8)
3711 return;
3712
3713 bool have_backref = false;
3714 for (idx_t i = 0; i < d->tindex; i++)
3715 {
3716 switch (d->tokens[i])
3717 {
3718 case ANYCHAR:
3719 /* Lowered. */
3720 abort ();
3721 case BACKREF:
3722 have_backref = true;
3723 break;
3724 case MBCSET:
3725 /* Requires multi-byte algorithm. */
3726 return;
3727 default:
3728 break;
3729 }
3730 }
3731
3732 if (!have_backref && d->superset)
3733 {
3734 /* The superset DFA is not likely to be much faster, so remove it. */
3735 dfafree (d->superset);
3736 free (d->superset);
3737 d->superset = NULL;
3738 }
3739
3740 free_mbdata (d);
3741 d->localeinfo.multibyte = false;
3742 d->dfaexec = dfaexec_sb;
3743 d->fast = true;
3744}
3745
3746static void
3747dfassbuild (struct dfa *d)
3748{
3749 struct dfa *sup = dfaalloc ();
3750
3751 *sup = *d;
3752 sup->localeinfo.multibyte = false;
3753 sup->dfaexec = dfaexec_sb;
3754 sup->multibyte_prop = NULL;
3755 sup->superset = NULL;
3756 sup->states = NULL;
3757 sup->sindex = 0;
3758 sup->constraints = NULL;
3759 sup->separates = NULL;
3760 sup->follows = NULL;
3761 sup->tralloc = 0;
3762 sup->trans = NULL;
3763 sup->fails = NULL;
3764 sup->success = NULL;
3765 sup->newlines = NULL;
3766
3767 sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
3768 if (d->cindex)
3769 {
3770 memcpy (sup->charclasses, d->charclasses,
3771 d->cindex * sizeof *sup->charclasses);
3772 }
3773
3774 sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
3775 sup->talloc = d->tindex * 2;
3776
3777 bool have_achar = false;
3778 bool have_nchar = false;
3779 idx_t j;
3780 for (idx_t i = j = 0; i < d->tindex; i++)
3781 {
3782 switch (d->tokens[i])
3783 {
3784 case ANYCHAR:
3785 case MBCSET:
3786 case BACKREF:
3787 {
3788 charclass ccl;
3789 fillset (&ccl);
3790 sup->tokens[j++] = CSET + charclass_index (sup, &ccl);
3791 sup->tokens[j++] = STAR;
3792 if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
3793 || d->tokens[i + 1] == PLUS)
3794 i++;
3795 have_achar = true;
3796 }
3797 break;
3798 case BEGWORD:
3799 case ENDWORD:
3800 case LIMWORD:
3801 case NOTLIMWORD:
3802 if (d->localeinfo.multibyte)
3803 {
3804 /* These constraints aren't supported in a multibyte locale.
3805 Ignore them in the superset DFA. */
3806 sup->tokens[j++] = EMPTY;
3807 break;
3808 }
3809 FALLTHROUGH;
3810 default:
3811 sup->tokens[j++] = d->tokens[i];
3812 if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
3813 || d->tokens[i] >= CSET)
3814 have_nchar = true;
3815 break;
3816 }
3817 }
3818 sup->tindex = j;
3819
3820 if (have_nchar && (have_achar || d->localeinfo.multibyte))
3821 d->superset = sup;
3822 else
3823 {
3824 dfafree (sup);
3825 free (sup);
3826 }
3827}
3828
3829/* Parse a string S of length LEN into D (but skip this step if S is null).
3830 Then analyze D and build a matcher for it.
3831 SEARCHFLAG says whether to build a searching or an exact matcher. */
3832void
3833dfacomp (char const *s, idx_t len, struct dfa *d, bool searchflag)
3834{
3835 if (s != NULL)
3836 dfaparse (s, len, d);
3837
3838 dfassbuild (d);
3839
3840 if (dfasupported (d))
3841 {
3842 maybe_disable_superset_dfa (d);
3843 dfaanalyze (d, searchflag);
3844 }
3845 else
3846 {
3847 d->dfaexec = dfaexec_noop;
3848 }
3849
3850 if (d->superset)
3851 {
3852 d->fast = true;
3853 dfaanalyze (d->superset, searchflag);
3854 }
3855}
3856
3857/* Free the storage held by the components of a dfa. */
3858void
3859dfafree (struct dfa *d)
3860{
3861 free (d->charclasses);
3862 free (d->tokens);
3863
3864 if (d->localeinfo.multibyte)
3865 free_mbdata (d);
3866
3867 free (d->constraints);
3868 free (d->separates);
3869
3870 for (idx_t i = 0; i < d->sindex; i++)
3871 {
3872 free (d->states[i].elems.elems);
3873 free (d->states[i].mbps.elems);
3874 }
3875 free (d->states);
3876
3877 if (d->follows)
3878 {
3879 for (idx_t i = 0; i < d->tindex; i++)
3880 free (d->follows[i].elems);
3881 free (d->follows);
3882 }
3883
3884 if (d->trans)
3885 {
3886 for (idx_t i = 0; i < d->tralloc; i++)
3887 {
3888 free (d->trans[i]);
3889 free (d->fails[i]);
3890 }
3891
3892 free (d->trans - 2);
3893 free (d->fails);
3894 free (d->newlines);
3895 free (d->success);
3896 }
3897
3898 if (d->superset)
3899 {
3900 dfafree (d->superset);
3901 free (d->superset);
3902 }
3903}
3904
3905/* Having found the postfix representation of the regular expression,
3906 try to find a long sequence of characters that must appear in any line
3907 containing the r.e.
3908 Finding a "longest" sequence is beyond the scope here;
3909 we take an easy way out and hope for the best.
3910 (Take "(ab|a)b"--please.)
3911
3912 We do a bottom-up calculation of sequences of characters that must appear
3913 in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3914 representation:
3915 sequences that must appear at the left of the match ("left")
3916 sequences that must appear at the right of the match ("right")
3917 lists of sequences that must appear somewhere in the match ("in")
3918 sequences that must constitute the match ("is")
3919
3920 When we get to the root of the tree, we use one of the longest of its
3921 calculated "in" sequences as our answer.
3922
3923 The sequences calculated for the various types of node (in pseudo ANSI c)
3924 are shown below. "p" is the operand of unary operators (and the left-hand
3925 operand of binary operators); "q" is the right-hand operand of binary
3926 operators.
3927
3928 "ZERO" means "a zero-length sequence" below.
3929
3930 Type left right is in
3931 ---- ---- ----- -- --
3932 char c # c # c # c # c
3933
3934 ANYCHAR ZERO ZERO ZERO ZERO
3935
3936 MBCSET ZERO ZERO ZERO ZERO
3937
3938 CSET ZERO ZERO ZERO ZERO
3939
3940 STAR ZERO ZERO ZERO ZERO
3941
3942 QMARK ZERO ZERO ZERO ZERO
3943
3944 PLUS p->left p->right ZERO p->in
3945
3946 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
3947 p->left : q->right : q->is!=ZERO) ? q->in plus
3948 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
3949 ZERO
3950
3951 OR longest common longest common (do p->is and substrings common
3952 leading trailing to q->is have same p->in and
3953 (sub)sequence (sub)sequence q->in length and content) ?
3954 of p->left of p->right
3955 and q->left and q->right p->is : NULL
3956
3957 If there's anything else we recognize in the tree, all four sequences get set
3958 to zero-length sequences. If there's something we don't recognize in the
3959 tree, we just return a zero-length sequence.
3960
3961 Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3962 'aaa')?
3963
3964 And ... is it here or someplace that we might ponder "optimizations" such as
3965 egrep 'psi|epsilon' -> egrep 'psi'
3966 egrep 'pepsi|epsilon' -> egrep 'epsi'
3967 (Yes, we now find "epsi" as a "string
3968 that must occur", but we might also
3969 simplify the *entire* r.e. being sought)
3970 grep '[c]' -> grep 'c'
3971 grep '(ab|a)b' -> grep 'ab'
3972 grep 'ab*' -> grep 'a'
3973 grep 'a*b' -> grep 'b'
3974
3975 There are several issues:
3976
3977 Is optimization easy (enough)?
3978
3979 Does optimization actually accomplish anything,
3980 or is the automaton you get from "psi|epsilon" (for example)
3981 the same as the one you get from "psi" (for example)?
3982
3983 Are optimizable r.e.'s likely to be used in real-life situations
3984 (something like 'ab*' is probably unlikely; something like is
3985 'psi|epsilon' is likelier)? */
3986
3987static char *
3988icatalloc (char *old, char const *new)
3989{
3990 idx_t newsize = strlen (new);
3991 if (newsize == 0)
3992 return old;
3993 idx_t oldsize = strlen (old);
3994 char *result = xirealloc (old, oldsize + newsize + 1);
3995 memcpy (result + oldsize, new, newsize + 1);
3996 return result;
3997}
3998
3999static void
4000freelist (char **cpp)
4001{
4002 while (*cpp)
4003 free (*cpp++);
4004}
4005
4006static char **
4007enlistnew (char **cpp, char *new)
4008{
4009 /* Is there already something in the list that's new (or longer)? */
4010 idx_t i;
4011 for (i = 0; cpp[i] != NULL; i++)
4012 if (strstr (cpp[i], new) != NULL)
4013 {
4014 free (new);
4015 return cpp;
4016 }
4017 /* Eliminate any obsoleted strings. */
4018 for (idx_t j = 0; cpp[j] != NULL; )
4019 if (strstr (new, cpp[j]) == NULL)
4020 ++j;
4021 else
4022 {
4023 free (cpp[j]);
4024 if (--i == j)
4025 break;
4026 cpp[j] = cpp[i];
4027 cpp[i] = NULL;
4028 }
4029 /* Add the new string. */
4030 cpp = xreallocarray (cpp, i + 2, sizeof *cpp);
4031 cpp[i] = new;
4032 cpp[i + 1] = NULL;
4033 return cpp;
4034}
4035
4036static char **
4037enlist (char **cpp, char const *str, idx_t len)
4038{
4039 return enlistnew (cpp, ximemdup0 (str, len));
4040}
4041
4042/* Given pointers to two strings, return a pointer to an allocated
4043 list of their distinct common substrings. */
4044static char **
4045comsubs (char *left, char const *right)
4046{
4047 char **cpp = xzalloc (sizeof *cpp);
4048
4049 for (char *lcp = left; *lcp != '\0'; lcp++)
4050 {
4051 idx_t len = 0;
4052 char *rcp = strchr (right, *lcp);
4053 while (rcp != NULL)
4054 {
4055 idx_t i;
4056 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
4057 continue;
4058 if (i > len)
4059 len = i;
4060 rcp = strchr (rcp + 1, *lcp);
4061 }
4062 if (len != 0)
4063 cpp = enlist (cpp, lcp, len);
4064 }
4065 return cpp;
4066}
4067
4068static char **
4069addlists (char **old, char **new)
4070{
4071 for (; *new; new++)
4072 old = enlistnew (old, xstrdup (*new));
4073 return old;
4074}
4075
4076/* Given two lists of substrings, return a new list giving substrings
4077 common to both. */
4078static char **
4079inboth (char **left, char **right)
4080{
4081 char **both = xzalloc (sizeof *both);
4082
4083 for (idx_t lnum = 0; left[lnum] != NULL; lnum++)
4084 {
4085 for (idx_t rnum = 0; right[rnum] != NULL; rnum++)
4086 {
4087 char **temp = comsubs (left[lnum], right[rnum]);
4088 both = addlists (both, temp);
4089 freelist (temp);
4090 free (temp);
4091 }
4092 }
4093 return both;
4094}
4095
4096typedef struct must must;
4097
4098struct must
4099{
4100 char **in;
4101 char *left;
4102 char *right;
4103 char *is;
4104 bool begline;
4105 bool endline;
4106 must *prev;
4107};
4108
4109static must *
4110allocmust (must *mp, idx_t size)
4111{
4112 must *new_mp = xmalloc (sizeof *new_mp);
4113 new_mp->in = xzalloc (sizeof *new_mp->in);
4114 new_mp->left = xizalloc (size);
4115 new_mp->right = xizalloc (size);
4116 new_mp->is = xizalloc (size);
4117 new_mp->begline = false;
4118 new_mp->endline = false;
4119 new_mp->prev = mp;
4120 return new_mp;
4121}
4122
4123static void
4124resetmust (must *mp)
4125{
4126 freelist (mp->in);
4127 mp->in[0] = NULL;
4128 mp->left[0] = mp->right[0] = mp->is[0] = '\0';
4129 mp->begline = false;
4130 mp->endline = false;
4131}
4132
4133static void
4134freemust (must *mp)
4135{
4136 freelist (mp->in);
4137 free (mp->in);
4138 free (mp->left);
4139 free (mp->right);
4140 free (mp->is);
4141 free (mp);
4142}
4143
4144struct dfamust *
4145dfamust (struct dfa const *d)
4146{
4147 must *mp = NULL;
4148 char const *result = "";
4149 bool exact = false;
4150 bool begline = false;
4151 bool endline = false;
4152 bool need_begline = false;
4153 bool need_endline = false;
4154 bool case_fold_unibyte = d->syntax.case_fold & !d->localeinfo.multibyte;
4155
4156 for (idx_t ri = 1; ri + 1 < d->tindex; ri++)
4157 {
4158 token t = d->tokens[ri];
4159 switch (t)
4160 {
4161 case BEGLINE:
4162 mp = allocmust (mp, 2);
4163 mp->begline = true;
4164 need_begline = true;
4165 break;
4166 case ENDLINE:
4167 mp = allocmust (mp, 2);
4168 mp->endline = true;
4169 need_endline = true;
4170 break;
4171 case LPAREN:
4172 case RPAREN:
4173 assert (!"neither LPAREN nor RPAREN may appear here");
4174
4175 case EMPTY:
4176 case BEGWORD:
4177 case ENDWORD:
4178 case LIMWORD:
4179 case NOTLIMWORD:
4180 case BACKREF:
4181 case ANYCHAR:
4182 case MBCSET:
4183 mp = allocmust (mp, 2);
4184 break;
4185
4186 case STAR:
4187 case QMARK:
4188 assume_nonnull (mp);
4189 resetmust (mp);
4190 break;
4191
4192 case OR:
4193 {
4194 char **new;
4195 must *rmp = mp;
4196 assume_nonnull (rmp);
4197 must *lmp = mp = mp->prev;
4198 assume_nonnull (lmp);
4199 idx_t j, ln, rn, n;
4200
4201 /* Guaranteed to be. Unlikely, but ... */
4202 if (str_eq (lmp->is, rmp->is))
4203 {
4204 lmp->begline &= rmp->begline;
4205 lmp->endline &= rmp->endline;
4206 }
4207 else
4208 {
4209 lmp->is[0] = '\0';
4210 lmp->begline = false;
4211 lmp->endline = false;
4212 }
4213 /* Left side--easy */
4214 idx_t i = 0;
4215 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
4216 ++i;
4217 lmp->left[i] = '\0';
4218 /* Right side */
4219 ln = strlen (lmp->right);
4220 rn = strlen (rmp->right);
4221 n = ln;
4222 if (n > rn)
4223 n = rn;
4224 for (i = 0; i < n; ++i)
4225 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
4226 break;
4227 for (j = 0; j < i; ++j)
4228 lmp->right[j] = lmp->right[(ln - i) + j];
4229 lmp->right[j] = '\0';
4230 new = inboth (lmp->in, rmp->in);
4231 freelist (lmp->in);
4232 free (lmp->in);
4233 lmp->in = new;
4234 freemust (rmp);
4235 }
4236 break;
4237
4238 case PLUS:
4239 assume_nonnull (mp);
4240 mp->is[0] = '\0';
4241 break;
4242
4243 case END:
4244 assume_nonnull (mp);
4245 assert (!mp->prev);
4246 for (idx_t i = 0; mp->in[i] != NULL; i++)
4247 if (strlen (mp->in[i]) > strlen (result))
4248 result = mp->in[i];
4249 if (str_eq (result, mp->is))
4250 {
4251 if ((!need_begline || mp->begline) && (!need_endline
4252 || mp->endline))
4253 exact = true;
4254 begline = mp->begline;
4255 endline = mp->endline;
4256 }
4257 goto done;
4258
4259 case CAT:
4260 {
4261 must *rmp = mp;
4262 assume_nonnull (rmp);
4263 must *lmp = mp = mp->prev;
4264 assume_nonnull (lmp);
4265
4266 /* In. Everything in left, plus everything in
4267 right, plus concatenation of
4268 left's right and right's left. */
4269 lmp->in = addlists (lmp->in, rmp->in);
4270 if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
4271 {
4272 idx_t lrlen = strlen (lmp->right);
4273 idx_t rllen = strlen (rmp->left);
4274 char *tp = ximalloc (lrlen + rllen + 1);
4275 memcpy (tp + lrlen, rmp->left, rllen + 1);
4276 memcpy (tp, lmp->right, lrlen);
4277 lmp->in = enlistnew (lmp->in, tp);
4278 }
4279 /* Left-hand */
4280 if (lmp->is[0] != '\0')
4281 lmp->left = icatalloc (lmp->left, rmp->left);
4282 /* Right-hand */
4283 if (rmp->is[0] == '\0')
4284 lmp->right[0] = '\0';
4285 lmp->right = icatalloc (lmp->right, rmp->right);
4286 /* Guaranteed to be */
4287 if ((lmp->is[0] != '\0' || lmp->begline)
4288 && (rmp->is[0] != '\0' || rmp->endline))
4289 {
4290 lmp->is = icatalloc (lmp->is, rmp->is);
4291 lmp->endline = rmp->endline;
4292 }
4293 else
4294 {
4295 lmp->is[0] = '\0';
4296 lmp->begline = false;
4297 lmp->endline = false;
4298 }
4299 freemust (rmp);
4300 }
4301 break;
4302
4303 case '\0':
4304 /* Not on *my* shift. */
4305 goto done;
4306
4307 default:
4308 if (CSET <= t)
4309 {
4310 /* If T is a singleton, or if case-folding in a unibyte
4311 locale and T's members all case-fold to the same char,
4312 convert T to one of its members. Otherwise, do
4313 nothing further with T. */
4314 charclass *ccl = &d->charclasses[t - CSET];
4315 int j;
4316 for (j = 0; j < NOTCHAR; j++)
4317 if (tstbit (j, ccl))
4318 break;
4319 if (! (j < NOTCHAR))
4320 {
4321 mp = allocmust (mp, 2);
4322 break;
4323 }
4324 t = j;
4325 while (++j < NOTCHAR)
4326 if (tstbit (j, ccl)
4327 && ! (case_fold_unibyte
4328 && toupper (j) == toupper (t)))
4329 break;
4330 if (j < NOTCHAR)
4331 {
4332 mp = allocmust (mp, 2);
4333 break;
4334 }
4335 }
4336
4337 idx_t rj = ri + 2;
4338 if (d->tokens[ri + 1] == CAT)
4339 {
4340 for (; rj < d->tindex - 1; rj += 2)
4341 {
4342 if ((rj != ri && (d->tokens[rj] <= 0
4343 || NOTCHAR <= d->tokens[rj]))
4344 || d->tokens[rj + 1] != CAT)
4345 break;
4346 }
4347 }
4348 mp = allocmust (mp, ((rj - ri) >> 1) + 1);
4349 mp->is[0] = mp->left[0] = mp->right[0]
4350 = case_fold_unibyte ? toupper (t) : t;
4351
4352 idx_t i;
4353 for (i = 1; ri + 2 < rj; i++)
4354 {
4355 ri += 2;
4356 t = d->tokens[ri];
4357 mp->is[i] = mp->left[i] = mp->right[i]
4358 = case_fold_unibyte ? toupper (t) : t;
4359 }
4360 mp->is[i] = mp->left[i] = mp->right[i] = '\0';
4361 mp->in = enlist (mp->in, mp->is, i);
4362 break;
4363 }
4364 }
4365 done:;
4366
4367 struct dfamust *dm = NULL;
4368 if (*result)
4369 {
4370 dm = xmalloc (FLEXSIZEOF (struct dfamust, must, strlen (result) + 1));
4371 dm->exact = exact;
4372 dm->begline = begline;
4373 dm->endline = endline;
4374 strcpy (dm->must, result);
4375 }
4376
4377 while (mp)
4378 {
4379 must *prev = mp->prev;
4380 freemust (mp);
4381 mp = prev;
4382 }
4383
4384 return dm;
4385}
4386
4387void
4388dfamustfree (struct dfamust *dm)
4389{
4390 free (dm);
4391}
4392
4393struct dfa *
4394dfaalloc (void)
4395{
4396 return xmalloc (sizeof (struct dfa));
4397}
4398
4399/* Initialize DFA. */
4400void
4401dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
4402 reg_syntax_t bits, int dfaopts)
4403{
4404 memset (dfa, 0, offsetof (struct dfa, dfaexec));
4405 dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
4406 dfa->localeinfo = *linfo;
4407
4408 dfa->fast = !dfa->localeinfo.multibyte;
4409
4410 dfa->canychar = -1;
4411 dfa->syntax.syntax_bits_set = true;
4412 dfa->syntax.case_fold = (bits & RE_ICASE) != 0;
4413 dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
4414 dfa->syntax.syntax_bits = bits;
4415 dfa->syntax.dfaopts = dfaopts;
4416
4417 for (int i = CHAR_MIN; i <= CHAR_MAX; ++i)
4418 {
4419 unsigned char uc = i;
4420
4421 dfa->syntax.sbit[uc] = char_context (dfa, uc);
4422 switch (dfa->syntax.sbit[uc])
4423 {
4424 case CTX_LETTER:
4425 setbit (uc, &dfa->syntax.letters);
4426 break;
4427 case CTX_NEWLINE:
4428 setbit (uc, &dfa->syntax.newline);
4429 break;
4430 }
4431
4432 /* POSIX requires that the five bytes in "\n\r./" (including the
4433 terminating NUL) cannot occur inside a multibyte character. */
4434 dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
4435 ? (uc & 0xc0) != 0x80
4436 : strchr ("\n\r./", uc) != NULL);
4437 }
4438}
4439
4440/* Initialize TO by copying FROM's syntax settings. */
4441void
4442dfacopysyntax (struct dfa *to, struct dfa const *from)
4443{
4444 memset (to, 0, offsetof (struct dfa, syntax));
4445 to->canychar = -1;
4446 to->fast = from->fast;
4447 to->syntax = from->syntax;
4448 to->dfaexec = from->dfaexec;
4449 to->localeinfo = from->localeinfo;
4450}
4451
4452/* vim:set shiftwidth=2: */
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette