VirtualBox

source: kBuild/vendor/sed/current/lib/dfa.c@ 3611

Last change on this file since 3611 was 3611, checked in by bird, 7 months ago

vendor/sed/current: GNU sed 4.9 (sed-4.9.tar.xz sha256:6e226b732e1cd739464ad6862bd1a1aba42d7982922da7a53519631d24975181)

File size: 135.0 KB
Line 
1/* dfa.c - deterministic extended regexp routines for GNU
2 Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2022 Free Software
3 Foundation, Inc.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3, or (at your option)
8 any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc.,
18 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */
19
20/* Written June, 1988 by Mike Haertel
21 Modified July, 1988 by Arthur David Olson to assist BMG speedups */
22
23#include <config.h>
24
25#include "dfa.h"
26
27#include "flexmember.h"
28#include "idx.h"
29#include "verify.h"
30
31#include <assert.h>
32#include <ctype.h>
33#include <stdint.h>
34#include <stdio.h>
35#include <stdlib.h>
36#include <limits.h>
37#include <string.h>
38
39/* Pacify gcc -Wanalyzer-null-dereference in areas where GCC
40 understandably cannot deduce that the input comes from a
41 well-formed regular expression. There's little point to the
42 runtime overhead of 'assert' instead of 'assume_nonnull' when the
43 MMU will check anyway. */
44#define assume_nonnull(x) assume ((x) != NULL)
45
46static bool
47str_eq (char const *a, char const *b)
48{
49 return strcmp (a, b) == 0;
50}
51
52static bool
53c_isdigit (char c)
54{
55 return '0' <= c && c <= '9';
56}
57
58#include "gettext.h"
59#define _(str) gettext (str)
60
61#include <wchar.h>
62#include <wctype.h>
63
64#include "xalloc.h"
65#include "localeinfo.h"
66
67#ifndef FALLTHROUGH
68# if 201710L < __STDC_VERSION__
69# define FALLTHROUGH [[__fallthrough__]]
70# elif (__GNUC__ >= 7) || (__clang_major__ >= 10)
71# define FALLTHROUGH __attribute__ ((__fallthrough__))
72# else
73# define FALLTHROUGH ((void) 0)
74# endif
75#endif
76
77#ifndef MIN
78# define MIN(a,b) ((a) < (b) ? (a) : (b))
79#endif
80
81/* HPUX defines these as macros in sys/param.h. */
82#ifdef setbit
83# undef setbit
84#endif
85#ifdef clrbit
86# undef clrbit
87#endif
88
89/* For code that does not use Gnulib’s isblank module. */
90#if !defined isblank && !defined HAVE_ISBLANK && !defined GNULIB_ISBLANK
91# define isblank dfa_isblank
92static int
93isblank (int c)
94{
95 return c == ' ' || c == '\t';
96}
97#endif
98
99/* First integer value that is greater than any character code. */
100enum { NOTCHAR = 1 << CHAR_BIT };
101
102#ifdef UINT_LEAST64_MAX
103
104/* Number of bits used in a charclass word. */
105enum { CHARCLASS_WORD_BITS = 64 };
106
107/* This represents part of a character class. It must be unsigned and
108 at least CHARCLASS_WORD_BITS wide. Any excess bits are zero. */
109typedef uint_least64_t charclass_word;
110
111/* Part of a charclass initializer that represents 64 bits' worth of a
112 charclass, where LO and HI are the low and high-order 32 bits of
113 the 64-bit quantity. */
114# define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
115
116#else
117/* Fallbacks for pre-C99 hosts that lack 64-bit integers. */
118enum { CHARCLASS_WORD_BITS = 32 };
119typedef unsigned long charclass_word;
120# define CHARCLASS_PAIR(lo, hi) lo, hi
121#endif
122
123/* An initializer for a charclass whose 32-bit words are A through H. */
124#define CHARCLASS_INIT(a, b, c, d, e, f, g, h) \
125 {{ \
126 CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d), \
127 CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h) \
128 }}
129
130/* The maximum useful value of a charclass_word; all used bits are 1. */
131static charclass_word const CHARCLASS_WORD_MASK
132 = ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1;
133
134/* Number of words required to hold a bit for every character. */
135enum
136{
137 CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
138};
139
140/* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
141typedef struct { charclass_word w[CHARCLASS_WORDS]; } charclass;
142
143/* Convert a possibly-signed character to an unsigned character. This is
144 a bit safer than casting to unsigned char, since it catches some type
145 errors that the cast doesn't. */
146static unsigned char
147to_uchar (char ch)
148{
149 return ch;
150}
151
152/* Contexts tell us whether a character is a newline or a word constituent.
153 Word-constituent characters are those that satisfy iswalnum, plus '_'.
154 Each character has a single CTX_* value; bitmasks of CTX_* values denote
155 a particular character class.
156
157 A state also stores a context value, which is a bitmask of CTX_* values.
158 A state's context represents a set of characters that the state's
159 predecessors must match. For example, a state whose context does not
160 include CTX_LETTER will never have transitions where the previous
161 character is a word constituent. A state whose context is CTX_ANY
162 might have transitions from any character. */
163
164enum
165 {
166 CTX_NONE = 1,
167 CTX_LETTER = 2,
168 CTX_NEWLINE = 4,
169 CTX_ANY = 7
170 };
171
172/* Sometimes characters can only be matched depending on the surrounding
173 context. Such context decisions depend on what the previous character
174 was, and the value of the current (lookahead) character. Context
175 dependent constraints are encoded as 9-bit integers. Each bit that
176 is set indicates that the constraint succeeds in the corresponding
177 context.
178
179 bit 6-8 - valid contexts when next character is CTX_NEWLINE
180 bit 3-5 - valid contexts when next character is CTX_LETTER
181 bit 0-2 - valid contexts when next character is CTX_NONE
182
183 succeeds_in_context determines whether a given constraint
184 succeeds in a particular context. Prev is a bitmask of possible
185 context values for the previous character, curr is the (single-bit)
186 context value for the lookahead character. */
187static int
188newline_constraint (int constraint)
189{
190 return (constraint >> 6) & 7;
191}
192static int
193letter_constraint (int constraint)
194{
195 return (constraint >> 3) & 7;
196}
197static int
198other_constraint (int constraint)
199{
200 return constraint & 7;
201}
202
203static bool
204succeeds_in_context (int constraint, int prev, int curr)
205{
206 return !! (((curr & CTX_NONE ? other_constraint (constraint) : 0) \
207 | (curr & CTX_LETTER ? letter_constraint (constraint) : 0) \
208 | (curr & CTX_NEWLINE ? newline_constraint (constraint) : 0)) \
209 & prev);
210}
211
212/* The following describe what a constraint depends on. */
213static bool
214prev_newline_dependent (int constraint)
215{
216 return ((constraint ^ constraint >> 2) & 0111) != 0;
217}
218static bool
219prev_letter_dependent (int constraint)
220{
221 return ((constraint ^ constraint >> 1) & 0111) != 0;
222}
223
224/* Tokens that match the empty string subject to some constraint actually
225 work by applying that constraint to determine what may follow them,
226 taking into account what has gone before. The following values are
227 the constraints corresponding to the special tokens previously defined. */
228enum
229 {
230 NO_CONSTRAINT = 0777,
231 BEGLINE_CONSTRAINT = 0444,
232 ENDLINE_CONSTRAINT = 0700,
233 BEGWORD_CONSTRAINT = 0050,
234 ENDWORD_CONSTRAINT = 0202,
235 LIMWORD_CONSTRAINT = 0252,
236 NOTLIMWORD_CONSTRAINT = 0525
237 };
238
239/* The regexp is parsed into an array of tokens in postfix form. Some tokens
240 are operators and others are terminal symbols. Most (but not all) of these
241 codes are returned by the lexical analyzer. */
242
243typedef ptrdiff_t token;
244static token const TOKEN_MAX = PTRDIFF_MAX;
245
246/* States are indexed by state_num values. These are normally
247 nonnegative but -1 is used as a special value. */
248typedef ptrdiff_t state_num;
249
250/* Predefined token values. */
251enum
252{
253 END = -1, /* END is a terminal symbol that matches the
254 end of input; any value of END or less in
255 the parse tree is such a symbol. Accepting
256 states of the DFA are those that would have
257 a transition on END. This is -1, not some
258 more-negative value, to tweak the speed of
259 comparisons to END. */
260
261 /* Ordinary character values are terminal symbols that match themselves. */
262
263 /* CSET must come last in the following list of special tokens. Otherwise,
264 the list order matters only for performance. Related special tokens
265 should have nearby values so that code like (t == ANYCHAR || t == MBCSET
266 || CSET <= t) can be done with a single machine-level comparison. */
267
268 EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches
269 the empty string. */
270
271 QMARK, /* QMARK is an operator of one argument that
272 matches zero or one occurrences of its
273 argument. */
274
275 STAR, /* STAR is an operator of one argument that
276 matches the Kleene closure (zero or more
277 occurrences) of its argument. */
278
279 PLUS, /* PLUS is an operator of one argument that
280 matches the positive closure (one or more
281 occurrences) of its argument. */
282
283 REPMN, /* REPMN is a lexical token corresponding
284 to the {m,n} construct. REPMN never
285 appears in the compiled token vector. */
286
287 CAT, /* CAT is an operator of two arguments that
288 matches the concatenation of its
289 arguments. CAT is never returned by the
290 lexical analyzer. */
291
292 OR, /* OR is an operator of two arguments that
293 matches either of its arguments. */
294
295 LPAREN, /* LPAREN never appears in the parse tree,
296 it is only a lexeme. */
297
298 RPAREN, /* RPAREN never appears in the parse tree. */
299
300 WCHAR, /* Only returned by lex. wctok contains
301 the wide character representation. */
302
303 ANYCHAR, /* ANYCHAR is a terminal symbol that matches
304 a valid multibyte (or single byte) character.
305 It is used only if MB_CUR_MAX > 1. */
306
307 BEG, /* BEG is an initial symbol that matches the
308 beginning of input. */
309
310 BEGLINE, /* BEGLINE is a terminal symbol that matches
311 the empty string at the beginning of a
312 line. */
313
314 ENDLINE, /* ENDLINE is a terminal symbol that matches
315 the empty string at the end of a line. */
316
317 BEGWORD, /* BEGWORD is a terminal symbol that matches
318 the empty string at the beginning of a
319 word. */
320
321 ENDWORD, /* ENDWORD is a terminal symbol that matches
322 the empty string at the end of a word. */
323
324 LIMWORD, /* LIMWORD is a terminal symbol that matches
325 the empty string at the beginning or the
326 end of a word. */
327
328 NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that
329 matches the empty string not at
330 the beginning or end of a word. */
331
332 BACKREF, /* BACKREF is generated by \<digit>
333 or by any other construct that
334 is not completely handled. If the scanner
335 detects a transition on backref, it returns
336 a kind of "semi-success" indicating that
337 the match will have to be verified with
338 a backtracking matcher. */
339
340 MBCSET, /* MBCSET is similar to CSET, but for
341 multibyte characters. */
342
343 CSET /* CSET and (and any value greater) is a
344 terminal symbol that matches any of a
345 class of characters. */
346};
347
348
349/* States of the recognizer correspond to sets of positions in the parse
350 tree, together with the constraints under which they may be matched.
351 So a position is encoded as an index into the parse tree together with
352 a constraint. */
353typedef struct
354{
355 idx_t index; /* Index into the parse array. */
356 unsigned int constraint; /* Constraint for matching this position. */
357} position;
358
359/* Sets of positions are stored as arrays. */
360typedef struct
361{
362 position *elems; /* Elements of this position set. */
363 idx_t nelem; /* Number of elements in this set. */
364 idx_t alloc; /* Number of elements allocated in ELEMS. */
365} position_set;
366
367/* A state of the dfa consists of a set of positions, some flags,
368 and the token value of the lowest-numbered position of the state that
369 contains an END token. */
370typedef struct
371{
372 size_t hash; /* Hash of the positions of this state. */
373 position_set elems; /* Positions this state could match. */
374 unsigned char context; /* Context from previous state. */
375 unsigned short constraint; /* Constraint for this state to accept. */
376 position_set mbps; /* Positions which can match multibyte
377 characters or the follows, e.g., period.
378 Used only if MB_CUR_MAX > 1. */
379 state_num mb_trindex; /* Index of this state in MB_TRANS, or
380 negative if the state does not have
381 ANYCHAR. */
382} dfa_state;
383
384/* Maximum for any transition table count. This should be at least 3,
385 for the initial state setup. */
386enum { MAX_TRCOUNT = 1024 };
387
388/* A bracket operator.
389 e.g., [a-c], [[:alpha:]], etc. */
390struct mb_char_classes
391{
392 ptrdiff_t cset;
393 bool invert;
394 wchar_t *chars; /* Normal characters. */
395 idx_t nchars;
396 idx_t nchars_alloc;
397};
398
399struct regex_syntax
400{
401 /* Syntax bits controlling the behavior of the lexical analyzer. */
402 reg_syntax_t syntax_bits;
403 int dfaopts;
404 bool syntax_bits_set;
405
406 /* Flag for case-folding letters into sets. */
407 bool case_fold;
408
409 /* End-of-line byte in data. */
410 unsigned char eolbyte;
411
412 /* Cache of char-context values. */
413 char sbit[NOTCHAR];
414
415 /* If never_trail[B], the byte B cannot be a non-initial byte in a
416 multibyte character. */
417 bool never_trail[NOTCHAR];
418
419 /* Set of characters considered letters. */
420 charclass letters;
421
422 /* Set of characters that are newline. */
423 charclass newline;
424};
425
426/* Lexical analyzer. All the dross that deals with the obnoxious
427 GNU Regex syntax bits is located here. The poor, suffering
428 reader is referred to the GNU Regex documentation for the
429 meaning of the @#%!@#%^!@ syntax bits. */
430struct lexer_state
431{
432 char const *ptr; /* Pointer to next input character. */
433 idx_t left; /* Number of characters remaining. */
434 token lasttok; /* Previous token returned; initially END. */
435 idx_t parens; /* Count of outstanding left parens. */
436 int minrep, maxrep; /* Repeat counts for {m,n}. */
437
438 /* Wide character representation of the current multibyte character,
439 or WEOF if there was an encoding error. Used only if
440 MB_CUR_MAX > 1. */
441 wint_t wctok;
442
443 /* The most recently analyzed multibyte bracket expression. */
444 struct mb_char_classes brack;
445
446 /* We're separated from beginning or (, | only by zero-width characters. */
447 bool laststart;
448};
449
450/* Recursive descent parser for regular expressions. */
451
452struct parser_state
453{
454 token tok; /* Lookahead token. */
455 idx_t depth; /* Current depth of a hypothetical stack
456 holding deferred productions. This is
457 used to determine the depth that will be
458 required of the real stack later on in
459 dfaanalyze. */
460};
461
462/* A compiled regular expression. */
463struct dfa
464{
465 /* Fields filled by the scanner. */
466 charclass *charclasses; /* Array of character sets for CSET tokens. */
467 idx_t cindex; /* Index for adding new charclasses. */
468 idx_t calloc; /* Number of charclasses allocated. */
469 ptrdiff_t canychar; /* Index of anychar class, or -1. */
470
471 /* Scanner state */
472 struct lexer_state lex;
473
474 /* Parser state */
475 struct parser_state parse;
476
477 /* Fields filled by the parser. */
478 token *tokens; /* Postfix parse array. */
479 idx_t tindex; /* Index for adding new tokens. */
480 idx_t talloc; /* Number of tokens currently allocated. */
481 idx_t depth; /* Depth required of an evaluation stack
482 used for depth-first traversal of the
483 parse tree. */
484 idx_t nleaves; /* Number of non-EMPTY leaves
485 in the parse tree. */
486 idx_t nregexps; /* Count of parallel regexps being built
487 with dfaparse. */
488 bool fast; /* The DFA is fast. */
489 bool epsilon; /* Does a token match only the empty string? */
490 token utf8_anychar_classes[9]; /* To lower ANYCHAR in UTF-8 locales. */
491 mbstate_t mbs; /* Multibyte conversion state. */
492
493 /* The following are valid only if MB_CUR_MAX > 1. */
494
495 /* The value of multibyte_prop[i] is defined by following rule.
496 if tokens[i] < NOTCHAR
497 bit 0 : tokens[i] is the first byte of a character, including
498 single-byte characters.
499 bit 1 : tokens[i] is the last byte of a character, including
500 single-byte characters.
501
502 e.g.
503 tokens
504 = 'single_byte_a', 'multi_byte_A', single_byte_b'
505 = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
506 multibyte_prop
507 = 3 , 1 , 0 , 2 , 3
508 */
509 char *multibyte_prop;
510
511 /* Fields filled by the superset. */
512 struct dfa *superset; /* Hint of the dfa. */
513
514 /* Fields filled by the state builder. */
515 dfa_state *states; /* States of the dfa. */
516 state_num sindex; /* Index for adding new states. */
517 idx_t salloc; /* Number of states currently allocated. */
518
519 /* Fields filled by the parse tree->NFA conversion. */
520 position_set *follows; /* Array of follow sets, indexed by position
521 index. The follow of a position is the set
522 of positions containing characters that
523 could conceivably follow a character
524 matching the given position in a string
525 matching the regexp. Allocated to the
526 maximum possible position index. */
527 bool searchflag; /* We are supposed to build a searching
528 as opposed to an exact matcher. A searching
529 matcher finds the first and shortest string
530 matching a regexp anywhere in the buffer,
531 whereas an exact matcher finds the longest
532 string matching, but anchored to the
533 beginning of the buffer. */
534
535 /* Fields filled by dfaanalyze. */
536 int *constraints; /* Array of union of accepting constraints
537 in the follow of a position. */
538 int *separates; /* Array of contexts on follow of a
539 position. */
540
541 /* Fields filled by dfaexec. */
542 state_num tralloc; /* Number of transition tables that have
543 slots so far, not counting trans[-1] and
544 trans[-2]. */
545 int trcount; /* Number of transition tables that have
546 been built, other than for initial
547 states. */
548 int min_trcount; /* Number of initial states. Equivalently,
549 the minimum state number for which trcount
550 counts transitions. */
551 state_num **trans; /* Transition tables for states that can
552 never accept. If the transitions for a
553 state have not yet been computed, or the
554 state could possibly accept, its entry in
555 this table is NULL. This points to two
556 past the start of the allocated array,
557 and trans[-1] and trans[-2] are always
558 NULL. */
559 state_num **fails; /* Transition tables after failing to accept
560 on a state that potentially could do so.
561 If trans[i] is non-null, fails[i] must
562 be null. */
563 char *success; /* Table of acceptance conditions used in
564 dfaexec and computed in build_state. */
565 state_num *newlines; /* Transitions on newlines. The entry for a
566 newline in any transition table is always
567 -1 so we can count lines without wasting
568 too many cycles. The transition for a
569 newline is stored separately and handled
570 as a special case. Newline is also used
571 as a sentinel at the end of the buffer. */
572 state_num initstate_notbol; /* Initial state for CTX_LETTER and CTX_NONE
573 context in multibyte locales, in which we
574 do not distinguish between their contexts,
575 as not supported word. */
576 position_set mb_follows; /* Follow set added by ANYCHAR on demand. */
577 state_num **mb_trans; /* Transition tables for states with
578 ANYCHAR. */
579 state_num mb_trcount; /* Number of transition tables for states with
580 ANYCHAR that have actually been built. */
581
582 /* Syntax configuration. This is near the end so that dfacopysyntax
583 can memset up to here. */
584 struct regex_syntax syntax;
585
586 /* Information derived from the locale. This is at the end so that
587 a quick memset need not clear it specially. */
588
589 /* dfaexec implementation. */
590 char *(*dfaexec) (struct dfa *, char const *, char *,
591 bool, idx_t *, bool *);
592
593 /* Other cached information derived from the locale. */
594 struct localeinfo localeinfo;
595};
596
597/* User access to dfa internals. */
598
599/* S could possibly be an accepting state of R. */
600static bool
601accepting (state_num s, struct dfa const *r)
602{
603 return r->states[s].constraint != 0;
604}
605
606/* STATE accepts in the specified context. */
607static bool
608accepts_in_context (int prev, int curr, state_num state, struct dfa const *dfa)
609{
610 return succeeds_in_context (dfa->states[state].constraint, prev, curr);
611}
612
613static void regexp (struct dfa *dfa);
614
615/* Store into *PWC the result of converting the leading bytes of the
616 multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
617 and updating the conversion state in *D. On conversion error,
618 convert just a single byte, to WEOF. Return the number of bytes
619 converted.
620
621 This differs from mbrtowc (PWC, S, N, &D->mbs) as follows:
622
623 * PWC points to wint_t, not to wchar_t.
624 * The last arg is a dfa *D instead of merely a multibyte conversion
625 state D->mbs.
626 * N is idx_t not size_t, and must be at least 1.
627 * S[N - 1] must be a sentinel byte.
628 * Shift encodings are not supported.
629 * The return value is always in the range 1..N.
630 * D->mbs is always valid afterwards.
631 * *PWC is always set to something. */
632static int
633mbs_to_wchar (wint_t *pwc, char const *s, idx_t n, struct dfa *d)
634{
635 unsigned char uc = s[0];
636 wint_t wc = d->localeinfo.sbctowc[uc];
637
638 if (wc == WEOF)
639 {
640 wchar_t wch;
641 size_t nbytes = mbrtowc (&wch, s, n, &d->mbs);
642 if (0 < nbytes && nbytes < (size_t) -2)
643 {
644 *pwc = wch;
645 return nbytes;
646 }
647 memset (&d->mbs, 0, sizeof d->mbs);
648 }
649
650 *pwc = wc;
651 return 1;
652}
653
654#ifdef DEBUG
655
656static void
657prtok (token t)
658{
659 if (t <= END)
660 fprintf (stderr, "END");
661 else if (0 <= t && t < NOTCHAR)
662 {
663 unsigned int ch = t;
664 fprintf (stderr, "0x%02x", ch);
665 }
666 else
667 {
668 char const *s;
669 switch (t)
670 {
671 case BEG:
672 s = "BEG";
673 break;
674 case EMPTY:
675 s = "EMPTY";
676 break;
677 case BACKREF:
678 s = "BACKREF";
679 break;
680 case BEGLINE:
681 s = "BEGLINE";
682 break;
683 case ENDLINE:
684 s = "ENDLINE";
685 break;
686 case BEGWORD:
687 s = "BEGWORD";
688 break;
689 case ENDWORD:
690 s = "ENDWORD";
691 break;
692 case LIMWORD:
693 s = "LIMWORD";
694 break;
695 case NOTLIMWORD:
696 s = "NOTLIMWORD";
697 break;
698 case QMARK:
699 s = "QMARK";
700 break;
701 case STAR:
702 s = "STAR";
703 break;
704 case PLUS:
705 s = "PLUS";
706 break;
707 case CAT:
708 s = "CAT";
709 break;
710 case OR:
711 s = "OR";
712 break;
713 case LPAREN:
714 s = "LPAREN";
715 break;
716 case RPAREN:
717 s = "RPAREN";
718 break;
719 case ANYCHAR:
720 s = "ANYCHAR";
721 break;
722 case MBCSET:
723 s = "MBCSET";
724 break;
725 default:
726 s = "CSET";
727 break;
728 }
729 fprintf (stderr, "%s", s);
730 }
731}
732#endif /* DEBUG */
733
734/* Stuff pertaining to charclasses. */
735
736static bool
737tstbit (unsigned int b, charclass const *c)
738{
739 return c->w[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
740}
741
742static void
743setbit (unsigned int b, charclass *c)
744{
745 charclass_word one = 1;
746 c->w[b / CHARCLASS_WORD_BITS] |= one << b % CHARCLASS_WORD_BITS;
747}
748
749static void
750clrbit (unsigned int b, charclass *c)
751{
752 charclass_word one = 1;
753 c->w[b / CHARCLASS_WORD_BITS] &= ~(one << b % CHARCLASS_WORD_BITS);
754}
755
756static void
757zeroset (charclass *s)
758{
759 memset (s, 0, sizeof *s);
760}
761
762static void
763fillset (charclass *s)
764{
765 for (int i = 0; i < CHARCLASS_WORDS; i++)
766 s->w[i] = CHARCLASS_WORD_MASK;
767}
768
769static void
770notset (charclass *s)
771{
772 for (int i = 0; i < CHARCLASS_WORDS; ++i)
773 s->w[i] = CHARCLASS_WORD_MASK & ~s->w[i];
774}
775
776static bool
777equal (charclass const *s1, charclass const *s2)
778{
779 charclass_word w = 0;
780 for (int i = 0; i < CHARCLASS_WORDS; i++)
781 w |= s1->w[i] ^ s2->w[i];
782 return w == 0;
783}
784
785static bool
786emptyset (charclass const *s)
787{
788 charclass_word w = 0;
789 for (int i = 0; i < CHARCLASS_WORDS; i++)
790 w |= s->w[i];
791 return w == 0;
792}
793
794/* Ensure that the array addressed by PA holds at least I + 1 items.
795 Either return PA, or reallocate the array and return its new address.
796 Although PA may be null, the returned value is never null.
797
798 The array holds *NITEMS items, where 0 <= I <= *NITEMS; *NITEMS
799 is updated on reallocation. If PA is null, *NITEMS must be zero.
800 Do not allocate more than NITEMS_MAX items total; -1 means no limit.
801 ITEM_SIZE is the size of one item; it must be positive.
802 Avoid O(N**2) behavior on arrays growing linearly. */
803static void *
804maybe_realloc (void *pa, idx_t i, idx_t *nitems,
805 ptrdiff_t nitems_max, idx_t item_size)
806{
807 if (i < *nitems)
808 return pa;
809 return xpalloc (pa, nitems, 1, nitems_max, item_size);
810}
811
812/* In DFA D, find the index of charclass S, or allocate a new one. */
813static idx_t
814charclass_index (struct dfa *d, charclass const *s)
815{
816 idx_t i;
817
818 for (i = 0; i < d->cindex; ++i)
819 if (equal (s, &d->charclasses[i]))
820 return i;
821 d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
822 TOKEN_MAX - CSET, sizeof *d->charclasses);
823 ++d->cindex;
824 d->charclasses[i] = *s;
825 return i;
826}
827
828static bool
829unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
830{
831 return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_');
832}
833
834static int
835char_context (struct dfa const *dfa, unsigned char c)
836{
837 if (c == dfa->syntax.eolbyte && !(dfa->syntax.dfaopts & DFA_ANCHOR))
838 return CTX_NEWLINE;
839 if (unibyte_word_constituent (dfa, c))
840 return CTX_LETTER;
841 return CTX_NONE;
842}
843
844/* Set a bit in the charclass for the given wchar_t. Do nothing if WC
845 is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1,
846 this may happen when folding case in weird Turkish locales where
847 dotless i/dotted I are not included in the chosen character set.
848 Return whether a bit was set in the charclass. */
849static bool
850setbit_wc (wint_t wc, charclass *c)
851{
852 int b = wctob (wc);
853 if (b < 0)
854 return false;
855
856 setbit (b, c);
857 return true;
858}
859
860/* Set a bit for B and its case variants in the charclass C.
861 MB_CUR_MAX must be 1. */
862static void
863setbit_case_fold_c (int b, charclass *c)
864{
865 int ub = toupper (b);
866 for (int i = 0; i < NOTCHAR; i++)
867 if (toupper (i) == ub)
868 setbit (i, c);
869}
870
871/* Fetch the next lexical input character from the pattern. There
872 must at least one byte of pattern input. Set DFA->lex.wctok to the
873 value of the character or to WEOF depending on whether the input is
874 a valid multibyte character (possibly of length 1). Then return
875 the next input byte value, except return EOF if the input is a
876 multibyte character of length greater than 1. */
877static int
878fetch_wc (struct dfa *dfa)
879{
880 int nbytes = mbs_to_wchar (&dfa->lex.wctok, dfa->lex.ptr, dfa->lex.left,
881 dfa);
882 int c = nbytes == 1 ? to_uchar (dfa->lex.ptr[0]) : EOF;
883 dfa->lex.ptr += nbytes;
884 dfa->lex.left -= nbytes;
885 return c;
886}
887
888/* If there is no more input, report an error about unbalanced brackets.
889 Otherwise, behave as with fetch_wc (DFA). */
890static int
891bracket_fetch_wc (struct dfa *dfa)
892{
893 if (! dfa->lex.left)
894 dfaerror (_("unbalanced ["));
895 return fetch_wc (dfa);
896}
897
898typedef int predicate (int);
899
900/* The following list maps the names of the Posix named character classes
901 to predicate functions that determine whether a given character is in
902 the class. The leading [ has already been eaten by the lexical
903 analyzer. */
904struct dfa_ctype
905{
906 const char *name;
907 predicate *func;
908 bool single_byte_only;
909};
910
911static const struct dfa_ctype prednames[] = {
912 {"alpha", isalpha, false},
913 {"upper", isupper, false},
914 {"lower", islower, false},
915 {"digit", isdigit, true},
916 {"xdigit", isxdigit, false},
917 {"space", isspace, false},
918 {"punct", ispunct, false},
919 {"alnum", isalnum, false},
920 {"print", isprint, false},
921 {"graph", isgraph, false},
922 {"cntrl", iscntrl, false},
923 {"blank", isblank, false},
924 {NULL, NULL, false}
925};
926
927static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
928find_pred (const char *str)
929{
930 for (int i = 0; prednames[i].name; i++)
931 if (str_eq (str, prednames[i].name))
932 return &prednames[i];
933 return NULL;
934}
935
936/* Parse a bracket expression, which possibly includes multibyte
937 characters. */
938static token
939parse_bracket_exp (struct dfa *dfa)
940{
941 /* This is a bracket expression that dfaexec is known to
942 process correctly. */
943 bool known_bracket_exp = true;
944
945 /* Used to warn about [:space:].
946 Bit 0 = first character is a colon.
947 Bit 1 = last character is a colon.
948 Bit 2 = includes any other character but a colon.
949 Bit 3 = includes ranges, char/equiv classes or collation elements. */
950 int colon_warning_state;
951
952 dfa->lex.brack.nchars = 0;
953 charclass ccl;
954 zeroset (&ccl);
955 int c = bracket_fetch_wc (dfa);
956 bool invert = c == '^';
957 if (invert)
958 {
959 c = bracket_fetch_wc (dfa);
960 known_bracket_exp = dfa->localeinfo.simple;
961 }
962 wint_t wc = dfa->lex.wctok;
963 int c1;
964 wint_t wc1;
965 colon_warning_state = (c == ':');
966 do
967 {
968 c1 = NOTCHAR; /* Mark c1 as not initialized. */
969 colon_warning_state &= ~2;
970
971 /* Note that if we're looking at some other [:...:] construct,
972 we just treat it as a bunch of ordinary characters. We can do
973 this because we assume regex has checked for syntax errors before
974 dfa is ever called. */
975 if (c == '[')
976 {
977 c1 = bracket_fetch_wc (dfa);
978 wc1 = dfa->lex.wctok;
979
980 if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES))
981 || c1 == '.' || c1 == '=')
982 {
983 enum { MAX_BRACKET_STRING_LEN = 32 };
984 char str[MAX_BRACKET_STRING_LEN + 1];
985 int len = 0;
986 for (;;)
987 {
988 c = bracket_fetch_wc (dfa);
989 if (dfa->lex.left == 0
990 || (c == c1 && dfa->lex.ptr[0] == ']'))
991 break;
992 if (len < MAX_BRACKET_STRING_LEN)
993 str[len++] = c;
994 else
995 /* This is in any case an invalid class name. */
996 str[0] = '\0';
997 }
998 str[len] = '\0';
999
1000 /* Fetch bracket. */
1001 c = bracket_fetch_wc (dfa);
1002 wc = dfa->lex.wctok;
1003 if (c1 == ':')
1004 /* Build character class. POSIX allows character
1005 classes to match multicharacter collating elements,
1006 but the regex code does not support that, so do not
1007 worry about that possibility. */
1008 {
1009 char const *class
1010 = (dfa->syntax.case_fold && (str_eq (str, "upper")
1011 || str_eq (str, "lower"))
1012 ? "alpha" : str);
1013 const struct dfa_ctype *pred = find_pred (class);
1014 if (!pred)
1015 dfaerror (_("invalid character class"));
1016
1017 if (dfa->localeinfo.multibyte && !pred->single_byte_only)
1018 known_bracket_exp = false;
1019 else
1020 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1021 if (pred->func (c2))
1022 setbit (c2, &ccl);
1023 }
1024 else
1025 known_bracket_exp = false;
1026
1027 colon_warning_state |= 8;
1028
1029 /* Fetch new lookahead character. */
1030 c1 = bracket_fetch_wc (dfa);
1031 wc1 = dfa->lex.wctok;
1032 continue;
1033 }
1034
1035 /* We treat '[' as a normal character here. c/c1/wc/wc1
1036 are already set up. */
1037 }
1038
1039 if (c == '\\'
1040 && (dfa->syntax.syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1041 {
1042 c = bracket_fetch_wc (dfa);
1043 wc = dfa->lex.wctok;
1044 }
1045
1046 if (c1 == NOTCHAR)
1047 {
1048 c1 = bracket_fetch_wc (dfa);
1049 wc1 = dfa->lex.wctok;
1050 }
1051
1052 if (c1 == '-')
1053 /* build range characters. */
1054 {
1055 int c2 = bracket_fetch_wc (dfa);
1056 wint_t wc2 = dfa->lex.wctok;
1057
1058 /* A bracket expression like [a-[.aa.]] matches an unknown set.
1059 Treat it like [-a[.aa.]] while parsing it, and
1060 remember that the set is unknown. */
1061 if (c2 == '[' && dfa->lex.ptr[0] == '.')
1062 {
1063 known_bracket_exp = false;
1064 c2 = ']';
1065 }
1066
1067 if (c2 == ']')
1068 {
1069 /* In the case [x-], the - is an ordinary hyphen,
1070 which is left in c1, the lookahead character. */
1071 dfa->lex.ptr--;
1072 dfa->lex.left++;
1073 }
1074 else
1075 {
1076 if (c2 == '\\' && (dfa->syntax.syntax_bits
1077 & RE_BACKSLASH_ESCAPE_IN_LISTS))
1078 {
1079 c2 = bracket_fetch_wc (dfa);
1080 wc2 = dfa->lex.wctok;
1081 }
1082
1083 colon_warning_state |= 8;
1084 c1 = bracket_fetch_wc (dfa);
1085 wc1 = dfa->lex.wctok;
1086
1087 /* Treat [x-y] as a range if x != y. */
1088 if (wc != wc2 || wc == WEOF)
1089 {
1090 if (dfa->localeinfo.simple
1091 || (c_isdigit (c) & c_isdigit (c2)))
1092 {
1093 for (int ci = c; ci <= c2; ci++)
1094 if (dfa->syntax.case_fold && isalpha (ci))
1095 setbit_case_fold_c (ci, &ccl);
1096 else
1097 setbit (ci, &ccl);
1098 }
1099 else
1100 known_bracket_exp = false;
1101
1102 continue;
1103 }
1104 }
1105 }
1106
1107 colon_warning_state |= (c == ':') ? 2 : 4;
1108
1109 if (!dfa->localeinfo.multibyte)
1110 {
1111 if (dfa->syntax.case_fold && isalpha (c))
1112 setbit_case_fold_c (c, &ccl);
1113 else
1114 setbit (c, &ccl);
1115 continue;
1116 }
1117
1118 if (wc == WEOF)
1119 known_bracket_exp = false;
1120 else
1121 {
1122 wchar_t folded[CASE_FOLDED_BUFSIZE + 1];
1123 int n = (dfa->syntax.case_fold
1124 ? case_folded_counterparts (wc, folded + 1) + 1
1125 : 1);
1126 folded[0] = wc;
1127 for (int i = 0; i < n; i++)
1128 if (!setbit_wc (folded[i], &ccl))
1129 {
1130 dfa->lex.brack.chars
1131 = maybe_realloc (dfa->lex.brack.chars, dfa->lex.brack.nchars,
1132 &dfa->lex.brack.nchars_alloc, -1,
1133 sizeof *dfa->lex.brack.chars);
1134 dfa->lex.brack.chars[dfa->lex.brack.nchars++] = folded[i];
1135 }
1136 }
1137 }
1138 while ((wc = wc1, (c = c1) != ']'));
1139
1140 if (colon_warning_state == 7)
1141 ((dfa->syntax.dfaopts & DFA_CONFUSING_BRACKETS_ERROR
1142 ? dfaerror : dfawarn)
1143 (_("character class syntax is [[:space:]], not [:space:]")));
1144
1145 if (! known_bracket_exp)
1146 return BACKREF;
1147
1148 if (dfa->localeinfo.multibyte && (invert || dfa->lex.brack.nchars != 0))
1149 {
1150 dfa->lex.brack.invert = invert;
1151 dfa->lex.brack.cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
1152 return MBCSET;
1153 }
1154
1155 if (invert)
1156 {
1157 notset (&ccl);
1158 if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1159 clrbit ('\n', &ccl);
1160 }
1161
1162 return CSET + charclass_index (dfa, &ccl);
1163}
1164
1165struct lexptr
1166{
1167 char const *ptr;
1168 idx_t left;
1169};
1170
1171static void
1172push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s)
1173{
1174 ls->ptr = dfa->lex.ptr;
1175 ls->left = dfa->lex.left;
1176 dfa->lex.ptr = s;
1177 dfa->lex.left = strlen (s);
1178}
1179
1180static void
1181pop_lex_state (struct dfa *dfa, struct lexptr const *ls)
1182{
1183 dfa->lex.ptr = ls->ptr;
1184 dfa->lex.left = ls->left;
1185}
1186
1187static token
1188lex (struct dfa *dfa)
1189{
1190 bool backslash = false;
1191
1192 /* Basic plan: We fetch a character. If it's a backslash,
1193 we set the backslash flag and go through the loop again.
1194 On the plus side, this avoids having a duplicate of the
1195 main switch inside the backslash case. On the minus side,
1196 it means that just about every case tests the backslash flag. */
1197 for (int i = 0; i < 2; ++i)
1198 {
1199 if (! dfa->lex.left)
1200 return dfa->lex.lasttok = END;
1201 int c = fetch_wc (dfa);
1202
1203 switch (c)
1204 {
1205 case '\\':
1206 if (backslash)
1207 goto normal_char;
1208 if (dfa->lex.left == 0)
1209 dfaerror (_("unfinished \\ escape"));
1210 backslash = true;
1211 break;
1212
1213 case '^':
1214 if (backslash)
1215 goto normal_char;
1216 if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1217 || dfa->lex.lasttok == END || dfa->lex.lasttok == LPAREN
1218 || dfa->lex.lasttok == OR)
1219 return dfa->lex.lasttok = BEGLINE;
1220 goto normal_char;
1221
1222 case '$':
1223 if (backslash)
1224 goto normal_char;
1225 if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1226 || dfa->lex.left == 0
1227 || ((dfa->lex.left
1228 > !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
1229 && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
1230 & (dfa->lex.ptr[0] == '\\')]
1231 == ')'))
1232 || ((dfa->lex.left
1233 > !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
1234 && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
1235 & (dfa->lex.ptr[0] == '\\')]
1236 == '|'))
1237 || ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
1238 && dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
1239 return dfa->lex.lasttok = ENDLINE;
1240 goto normal_char;
1241
1242 case '1':
1243 case '2':
1244 case '3':
1245 case '4':
1246 case '5':
1247 case '6':
1248 case '7':
1249 case '8':
1250 case '9':
1251 if (!backslash)
1252 goto normal_char;
1253 if (dfa->syntax.syntax_bits & RE_NO_BK_REFS)
1254 goto stray_backslash;
1255
1256 dfa->lex.laststart = false;
1257 return dfa->lex.lasttok = BACKREF;
1258
1259 case '`':
1260 if (!backslash)
1261 goto normal_char;
1262 if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1263 goto stray_backslash;
1264
1265 /* FIXME: should be beginning of string */
1266 return dfa->lex.lasttok = BEGLINE;
1267
1268 case '\'':
1269 if (!backslash)
1270 goto normal_char;
1271 if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1272 goto stray_backslash;
1273
1274 /* FIXME: should be end of string */
1275 return dfa->lex.lasttok = ENDLINE;
1276
1277 case '<':
1278 if (!backslash)
1279 goto normal_char;
1280 if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1281 goto stray_backslash;
1282
1283 return dfa->lex.lasttok = BEGWORD;
1284
1285 case '>':
1286 if (!backslash)
1287 goto normal_char;
1288 if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1289 goto stray_backslash;
1290
1291 return dfa->lex.lasttok = ENDWORD;
1292
1293 case 'b':
1294 if (!backslash)
1295 goto normal_char;
1296 if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1297 goto stray_backslash;
1298
1299 return dfa->lex.lasttok = LIMWORD;
1300
1301 case 'B':
1302 if (!backslash)
1303 goto normal_char;
1304 if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1305 goto stray_backslash;
1306
1307 return dfa->lex.lasttok = NOTLIMWORD;
1308
1309 case '?':
1310 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1311 goto default_case;
1312 if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1313 goto normal_char;
1314 if (dfa->lex.laststart)
1315 {
1316 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1317 goto default_case;
1318 if (dfa->syntax.dfaopts & DFA_PLUS_WARN)
1319 dfawarn (_("? at start of expression"));
1320 }
1321 return dfa->lex.lasttok = QMARK;
1322
1323 case '*':
1324 if (backslash)
1325 goto normal_char;
1326 if (dfa->lex.laststart)
1327 {
1328 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1329 goto default_case;
1330 if (dfa->syntax.dfaopts & DFA_STAR_WARN)
1331 dfawarn (_("* at start of expression"));
1332 }
1333 return dfa->lex.lasttok = STAR;
1334
1335 case '+':
1336 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1337 goto default_case;
1338 if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1339 goto normal_char;
1340 if (dfa->lex.laststart)
1341 {
1342 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1343 goto default_case;
1344 if (dfa->syntax.dfaopts & DFA_PLUS_WARN)
1345 dfawarn (_("+ at start of expression"));
1346 }
1347 return dfa->lex.lasttok = PLUS;
1348
1349 case '{':
1350 if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
1351 goto default_case;
1352 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
1353 goto normal_char;
1354
1355 /* Cases:
1356 {M} - exact count
1357 {M,} - minimum count, maximum is infinity
1358 {,N} - 0 through N
1359 {,} - 0 to infinity (same as '*')
1360 {M,N} - M through N */
1361 {
1362 char const *p = dfa->lex.ptr;
1363 char const *lim = p + dfa->lex.left;
1364 dfa->lex.minrep = dfa->lex.maxrep = -1;
1365 for (; p != lim && c_isdigit (*p); p++)
1366 dfa->lex.minrep = (dfa->lex.minrep < 0
1367 ? *p - '0'
1368 : MIN (RE_DUP_MAX + 1,
1369 dfa->lex.minrep * 10 + *p - '0'));
1370 if (p != lim)
1371 {
1372 if (*p != ',')
1373 dfa->lex.maxrep = dfa->lex.minrep;
1374 else
1375 {
1376 if (dfa->lex.minrep < 0)
1377 dfa->lex.minrep = 0;
1378 while (++p != lim && c_isdigit (*p))
1379 dfa->lex.maxrep
1380 = (dfa->lex.maxrep < 0
1381 ? *p - '0'
1382 : MIN (RE_DUP_MAX + 1,
1383 dfa->lex.maxrep * 10 + *p - '0'));
1384 }
1385 }
1386 bool invalid_content
1387 = ! ((! backslash || (p != lim && *p++ == '\\'))
1388 && p != lim && *p++ == '}'
1389 && 0 <= dfa->lex.minrep
1390 && (dfa->lex.maxrep < 0
1391 || dfa->lex.minrep <= dfa->lex.maxrep));
1392 if (invalid_content
1393 && (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD))
1394 goto normal_char;
1395 if (dfa->lex.laststart)
1396 {
1397 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1398 goto default_case;
1399 if (dfa->syntax.dfaopts & DFA_PLUS_WARN)
1400 dfawarn (_("{...} at start of expression"));
1401 }
1402 if (invalid_content)
1403 dfaerror (_("invalid content of \\{\\}"));
1404 if (RE_DUP_MAX < dfa->lex.maxrep)
1405 dfaerror (_("regular expression too big"));
1406 dfa->lex.ptr = p;
1407 dfa->lex.left = lim - p;
1408 }
1409 dfa->lex.laststart = false;
1410 return dfa->lex.lasttok = REPMN;
1411
1412 case '|':
1413 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1414 goto default_case;
1415 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
1416 goto normal_char;
1417 dfa->lex.laststart = true;
1418 return dfa->lex.lasttok = OR;
1419
1420 case '\n':
1421 if (!(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
1422 goto default_case;
1423 if (backslash)
1424 goto normal_char;
1425 dfa->lex.laststart = true;
1426 return dfa->lex.lasttok = OR;
1427
1428 case '(':
1429 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1430 goto normal_char;
1431 dfa->lex.parens++;
1432 dfa->lex.laststart = true;
1433 return dfa->lex.lasttok = LPAREN;
1434
1435 case ')':
1436 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1437 goto normal_char;
1438 if (dfa->lex.parens == 0
1439 && dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1440 goto normal_char;
1441 dfa->lex.parens--;
1442 dfa->lex.laststart = false;
1443 return dfa->lex.lasttok = RPAREN;
1444
1445 case '.':
1446 if (backslash)
1447 goto normal_char;
1448 if (dfa->canychar < 0)
1449 {
1450 charclass ccl;
1451 fillset (&ccl);
1452 if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1453 clrbit ('\n', &ccl);
1454 if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1455 clrbit ('\0', &ccl);
1456 if (dfa->localeinfo.multibyte)
1457 for (int c2 = 0; c2 < NOTCHAR; c2++)
1458 if (dfa->localeinfo.sbctowc[c2] == WEOF)
1459 clrbit (c2, &ccl);
1460 dfa->canychar = charclass_index (dfa, &ccl);
1461 }
1462 dfa->lex.laststart = false;
1463 return dfa->lex.lasttok = (dfa->localeinfo.multibyte
1464 ? ANYCHAR
1465 : CSET + dfa->canychar);
1466
1467 case 's':
1468 case 'S':
1469 if (!backslash)
1470 goto normal_char;
1471 if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1472 goto stray_backslash;
1473
1474 if (!dfa->localeinfo.multibyte)
1475 {
1476 charclass ccl;
1477 zeroset (&ccl);
1478 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1479 if (isspace (c2))
1480 setbit (c2, &ccl);
1481 if (c == 'S')
1482 notset (&ccl);
1483 dfa->lex.laststart = false;
1484 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1485 }
1486
1487 /* FIXME: see if optimizing this, as is done with ANYCHAR and
1488 add_utf8_anychar, makes sense. */
1489
1490 /* \s and \S are documented to be equivalent to [[:space:]] and
1491 [^[:space:]] respectively, so tell the lexer to process those
1492 strings, each minus its "already processed" '['. */
1493 {
1494 struct lexptr ls;
1495 push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
1496 dfa->lex.lasttok = parse_bracket_exp (dfa);
1497 pop_lex_state (dfa, &ls);
1498 }
1499
1500 dfa->lex.laststart = false;
1501 return dfa->lex.lasttok;
1502
1503 case 'w':
1504 case 'W':
1505 if (!backslash)
1506 goto normal_char;
1507 if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1508 goto stray_backslash;
1509
1510 if (!dfa->localeinfo.multibyte)
1511 {
1512 charclass ccl;
1513 zeroset (&ccl);
1514 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1515 if (dfa->syntax.sbit[c2] == CTX_LETTER)
1516 setbit (c2, &ccl);
1517 if (c == 'W')
1518 notset (&ccl);
1519 dfa->lex.laststart = false;
1520 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1521 }
1522
1523 /* FIXME: see if optimizing this, as is done with ANYCHAR and
1524 add_utf8_anychar, makes sense. */
1525
1526 /* \w and \W are documented to be equivalent to [_[:alnum:]] and
1527 [^_[:alnum:]] respectively, so tell the lexer to process those
1528 strings, each minus its "already processed" '['. */
1529 {
1530 struct lexptr ls;
1531 push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
1532 dfa->lex.lasttok = parse_bracket_exp (dfa);
1533 pop_lex_state (dfa, &ls);
1534 }
1535
1536 dfa->lex.laststart = false;
1537 return dfa->lex.lasttok;
1538
1539 case '[':
1540 if (backslash)
1541 goto normal_char;
1542 dfa->lex.laststart = false;
1543 return dfa->lex.lasttok = parse_bracket_exp (dfa);
1544
1545 default:
1546 default_case:
1547 if (!backslash)
1548 goto normal_char;
1549 stray_backslash:
1550 if (dfa->syntax.dfaopts & DFA_STRAY_BACKSLASH_WARN)
1551 {
1552 char const *msg;
1553 char msgbuf[100];
1554 if (!iswprint (dfa->lex.wctok))
1555 msg = _("stray \\ before unprintable character");
1556 else if (iswspace (dfa->lex.wctok))
1557 msg = _("stray \\ before white space");
1558 else
1559 {
1560 int n = snprintf (msgbuf, sizeof msgbuf,
1561 _("stray \\ before %lc"), dfa->lex.wctok);
1562 msg = 0 <= n && n < sizeof msgbuf ? msgbuf : _("stray \\");
1563 }
1564 dfawarn (msg);
1565 }
1566 FALLTHROUGH;
1567 case ']': case '}':
1568 normal_char:
1569 dfa->lex.laststart = false;
1570 /* For multibyte character sets, folding is done in atom. Always
1571 return WCHAR. */
1572 if (dfa->localeinfo.multibyte)
1573 return dfa->lex.lasttok = WCHAR;
1574
1575 if (dfa->syntax.case_fold && isalpha (c))
1576 {
1577 charclass ccl;
1578 zeroset (&ccl);
1579 setbit_case_fold_c (c, &ccl);
1580 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1581 }
1582
1583 return dfa->lex.lasttok = c;
1584 }
1585 }
1586
1587 /* The above loop should consume at most a backslash
1588 and some other character. */
1589 abort ();
1590 return END; /* keeps pedantic compilers happy. */
1591}
1592
1593static void
1594addtok_mb (struct dfa *dfa, token t, char mbprop)
1595{
1596 if (dfa->talloc == dfa->tindex)
1597 {
1598 dfa->tokens = xpalloc (dfa->tokens, &dfa->talloc, 1, -1,
1599 sizeof *dfa->tokens);
1600 if (dfa->localeinfo.multibyte)
1601 dfa->multibyte_prop = xreallocarray (dfa->multibyte_prop, dfa->talloc,
1602 sizeof *dfa->multibyte_prop);
1603 }
1604 if (dfa->localeinfo.multibyte)
1605 dfa->multibyte_prop[dfa->tindex] = mbprop;
1606 dfa->tokens[dfa->tindex++] = t;
1607
1608 switch (t)
1609 {
1610 case QMARK:
1611 case STAR:
1612 case PLUS:
1613 break;
1614
1615 case CAT:
1616 case OR:
1617 dfa->parse.depth--;
1618 break;
1619
1620 case EMPTY:
1621 dfa->epsilon = true;
1622 goto increment_depth;
1623
1624 case BACKREF:
1625 dfa->fast = false;
1626 goto increment_nleaves;
1627
1628 case BEGLINE:
1629 case ENDLINE:
1630 case BEGWORD:
1631 case ENDWORD:
1632 case LIMWORD:
1633 case NOTLIMWORD:
1634 dfa->epsilon = true;
1635 FALLTHROUGH;
1636 default:
1637 increment_nleaves:
1638 dfa->nleaves++;
1639 increment_depth:
1640 dfa->parse.depth++;
1641 if (dfa->depth < dfa->parse.depth)
1642 dfa->depth = dfa->parse.depth;
1643 break;
1644 }
1645}
1646
1647static void addtok_wc (struct dfa *dfa, wint_t wc);
1648
1649/* Add the given token to the parse tree, maintaining the depth count and
1650 updating the maximum depth if necessary. */
1651static void
1652addtok (struct dfa *dfa, token t)
1653{
1654 if (dfa->localeinfo.multibyte && t == MBCSET)
1655 {
1656 bool need_or = false;
1657
1658 /* Extract wide characters into alternations for better performance.
1659 This does not require UTF-8. */
1660 for (idx_t i = 0; i < dfa->lex.brack.nchars; i++)
1661 {
1662 addtok_wc (dfa, dfa->lex.brack.chars[i]);
1663 if (need_or)
1664 addtok (dfa, OR);
1665 need_or = true;
1666 }
1667 dfa->lex.brack.nchars = 0;
1668
1669 /* Wide characters have been handled above, so it is possible
1670 that the set is empty now. Do nothing in that case. */
1671 if (dfa->lex.brack.cset != -1)
1672 {
1673 addtok (dfa, CSET + dfa->lex.brack.cset);
1674 if (need_or)
1675 addtok (dfa, OR);
1676 }
1677 }
1678 else
1679 {
1680 addtok_mb (dfa, t, 3);
1681 }
1682}
1683
1684/* We treat a multibyte character as a single atom, so that DFA
1685 can treat a multibyte character as a single expression.
1686
1687 e.g., we construct the following tree from "<mb1><mb2>".
1688 <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1689 <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1690static void
1691addtok_wc (struct dfa *dfa, wint_t wc)
1692{
1693 unsigned char buf[MB_LEN_MAX];
1694 mbstate_t s = { 0 };
1695 size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
1696 int buflen;
1697
1698 if (stored_bytes != (size_t) -1)
1699 buflen = stored_bytes;
1700 else
1701 {
1702 /* This is merely stop-gap. buf[0] is undefined, yet skipping
1703 the addtok_mb call altogether can corrupt the heap. */
1704 buflen = 1;
1705 buf[0] = 0;
1706 }
1707
1708 addtok_mb (dfa, buf[0], buflen == 1 ? 3 : 1);
1709 for (int i = 1; i < buflen; i++)
1710 {
1711 addtok_mb (dfa, buf[i], i == buflen - 1 ? 2 : 0);
1712 addtok (dfa, CAT);
1713 }
1714}
1715
1716static void
1717add_utf8_anychar (struct dfa *dfa)
1718{
1719 /* Since the Unicode Standard Version 4.0.0 (2003), a well-formed
1720 UTF-8 byte sequence has been defined as follows:
1721
1722 ([\x00-\x7f]
1723 |[\xc2-\xdf][\x80-\xbf]
1724 |[\xe0][\xa0-\xbf][\x80-\xbf]
1725 |[\xe1-\xec\xee-\xef][\x80-\xbf][\x80-\xbf]
1726 |[\xed][\x80-\x9f][\x80-\xbf]
1727 |[\xf0][\x90-\xbf][\x80-\xbf][\x80-\xbf])
1728 |[\xf1-\xf3][\x80-\xbf][\x80-\xbf][\x80-\xbf]
1729 |[\xf4][\x80-\x8f][\x80-\xbf][\x80-\xbf])
1730
1731 which I'll write more concisely "A|BC|DEC|FCC|GHC|IJCC|KCCC|LMCC",
1732 where A = [\x00-\x7f], B = [\xc2-\xdf], C = [\x80-\xbf],
1733 D = [\xe0], E = [\xa0-\xbf], F = [\xe1-\xec\xee-\xef], G = [\xed],
1734 H = [\x80-\x9f], I = [\xf0],
1735 J = [\x90-\xbf], K = [\xf1-\xf3], L = [\xf4], M = [\x80-\x8f].
1736
1737 This can be refactored to "A|(B|DE|GH|(F|IJ|LM|KC)C)C". */
1738
1739 /* Mnemonics for classes containing two or more bytes. */
1740 enum { A, B, C, E, F, H, J, K, M };
1741
1742 /* Mnemonics for single-byte tokens. */
1743 enum { D_token = 0xe0, G_token = 0xed, I_token = 0xf0, L_token = 0xf4 };
1744
1745 static charclass const utf8_classes[] = {
1746 /* A. 00-7f: 1-byte sequence. */
1747 CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
1748
1749 /* B. c2-df: 1st byte of a 2-byte sequence. */
1750 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
1751
1752 /* C. 80-bf: non-leading bytes. */
1753 CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
1754
1755 /* D. e0 (just a token). */
1756
1757 /* E. a0-bf: 2nd byte of a "DEC" sequence. */
1758 CHARCLASS_INIT (0, 0, 0, 0, 0, 0xffffffff, 0, 0),
1759
1760 /* F. e1-ec + ee-ef: 1st byte of an "FCC" sequence. */
1761 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xdffe),
1762
1763 /* G. ed (just a token). */
1764
1765 /* H. 80-9f: 2nd byte of a "GHC" sequence. */
1766 CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0, 0, 0),
1767
1768 /* I. f0 (just a token). */
1769
1770 /* J. 90-bf: 2nd byte of an "IJCC" sequence. */
1771 CHARCLASS_INIT (0, 0, 0, 0, 0xffff0000, 0xffffffff, 0, 0),
1772
1773 /* K. f1-f3: 1st byte of a "KCCC" sequence. */
1774 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xe0000),
1775
1776 /* L. f4 (just a token). */
1777
1778 /* M. 80-8f: 2nd byte of a "LMCC" sequence. */
1779 CHARCLASS_INIT (0, 0, 0, 0, 0xffff, 0, 0, 0),
1780 };
1781
1782 /* Define the character classes that are needed below. */
1783 if (dfa->utf8_anychar_classes[0] == 0)
1784 {
1785 charclass c = utf8_classes[0];
1786 if (! (dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1787 clrbit ('\n', &c);
1788 if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1789 clrbit ('\0', &c);
1790 dfa->utf8_anychar_classes[0] = CSET + charclass_index (dfa, &c);
1791
1792 for (int i = 1; i < sizeof utf8_classes / sizeof *utf8_classes; i++)
1793 dfa->utf8_anychar_classes[i]
1794 = CSET + charclass_index (dfa, &utf8_classes[i]);
1795 }
1796
1797 /* Implement the "A|(B|DE|GH|(F|IJ|LM|KC)C)C" pattern mentioned above.
1798 The token buffer is in reverse Polish order, so we get
1799 "A B D E CAT OR G H CAT OR F I J CAT OR L M CAT OR K
1800 C CAT OR C CAT OR C CAT OR". */
1801 addtok (dfa, dfa->utf8_anychar_classes[A]);
1802 addtok (dfa, dfa->utf8_anychar_classes[B]);
1803 addtok (dfa, D_token);
1804 addtok (dfa, dfa->utf8_anychar_classes[E]);
1805 addtok (dfa, CAT);
1806 addtok (dfa, OR);
1807 addtok (dfa, G_token);
1808 addtok (dfa, dfa->utf8_anychar_classes[H]);
1809 addtok (dfa, CAT);
1810 addtok (dfa, OR);
1811 addtok (dfa, dfa->utf8_anychar_classes[F]);
1812 addtok (dfa, I_token);
1813 addtok (dfa, dfa->utf8_anychar_classes[J]);
1814 addtok (dfa, CAT);
1815 addtok (dfa, OR);
1816 addtok (dfa, L_token);
1817 addtok (dfa, dfa->utf8_anychar_classes[M]);
1818 addtok (dfa, CAT);
1819 addtok (dfa, OR);
1820 addtok (dfa, dfa->utf8_anychar_classes[K]);
1821 for (int i = 0; i < 3; i++)
1822 {
1823 addtok (dfa, dfa->utf8_anychar_classes[C]);
1824 addtok (dfa, CAT);
1825 addtok (dfa, OR);
1826 }
1827}
1828
1829/* The grammar understood by the parser is as follows.
1830
1831 regexp:
1832 regexp OR branch
1833 branch
1834
1835 branch:
1836 branch closure
1837 closure
1838
1839 closure:
1840 closure QMARK
1841 closure STAR
1842 closure PLUS
1843 closure REPMN
1844 atom
1845
1846 atom:
1847 <normal character>
1848 <multibyte character>
1849 ANYCHAR
1850 MBCSET
1851 CSET
1852 BACKREF
1853 BEGLINE
1854 ENDLINE
1855 BEGWORD
1856 ENDWORD
1857 LIMWORD
1858 NOTLIMWORD
1859 LPAREN regexp RPAREN
1860 <empty>
1861
1862 The parser builds a parse tree in postfix form in an array of tokens. */
1863
1864static void
1865atom (struct dfa *dfa)
1866{
1867 if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
1868 || dfa->parse.tok >= CSET
1869 || dfa->parse.tok == BEG || dfa->parse.tok == BACKREF
1870 || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
1871 || dfa->parse.tok == BEGWORD || dfa->parse.tok == ENDWORD
1872 || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD
1873 || dfa->parse.tok == ANYCHAR || dfa->parse.tok == MBCSET)
1874 {
1875 if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
1876 {
1877 /* For UTF-8 expand the period to a series of CSETs that define a
1878 valid UTF-8 character. This avoids using the slow multibyte
1879 path. I'm pretty sure it would be both profitable and correct to
1880 do it for any encoding; however, the optimization must be done
1881 manually as it is done above in add_utf8_anychar. So, let's
1882 start with UTF-8: it is the most used, and the structure of the
1883 encoding makes the correctness more obvious. */
1884 add_utf8_anychar (dfa);
1885 }
1886 else
1887 addtok (dfa, dfa->parse.tok);
1888 dfa->parse.tok = lex (dfa);
1889 }
1890 else if (dfa->parse.tok == WCHAR)
1891 {
1892 if (dfa->lex.wctok == WEOF)
1893 addtok (dfa, BACKREF);
1894 else
1895 {
1896 addtok_wc (dfa, dfa->lex.wctok);
1897
1898 if (dfa->syntax.case_fold)
1899 {
1900 wchar_t folded[CASE_FOLDED_BUFSIZE];
1901 int n = case_folded_counterparts (dfa->lex.wctok, folded);
1902 for (int i = 0; i < n; i++)
1903 {
1904 addtok_wc (dfa, folded[i]);
1905 addtok (dfa, OR);
1906 }
1907 }
1908 }
1909
1910 dfa->parse.tok = lex (dfa);
1911 }
1912 else if (dfa->parse.tok == LPAREN)
1913 {
1914 dfa->parse.tok = lex (dfa);
1915 regexp (dfa);
1916 if (dfa->parse.tok != RPAREN)
1917 dfaerror (_("unbalanced ("));
1918 dfa->parse.tok = lex (dfa);
1919 }
1920 else
1921 addtok (dfa, EMPTY);
1922}
1923
1924/* Return the number of tokens in the given subexpression. */
1925static idx_t _GL_ATTRIBUTE_PURE
1926nsubtoks (struct dfa const *dfa, idx_t tindex)
1927{
1928 switch (dfa->tokens[tindex - 1])
1929 {
1930 default:
1931 return 1;
1932 case QMARK:
1933 case STAR:
1934 case PLUS:
1935 return 1 + nsubtoks (dfa, tindex - 1);
1936 case CAT:
1937 case OR:
1938 {
1939 idx_t ntoks1 = nsubtoks (dfa, tindex - 1);
1940 return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
1941 }
1942 }
1943}
1944
1945/* Copy the given subexpression to the top of the tree. */
1946static void
1947copytoks (struct dfa *dfa, idx_t tindex, idx_t ntokens)
1948{
1949 if (dfa->localeinfo.multibyte)
1950 for (idx_t i = 0; i < ntokens; i++)
1951 addtok_mb (dfa, dfa->tokens[tindex + i],
1952 dfa->multibyte_prop[tindex + i]);
1953 else
1954 for (idx_t i = 0; i < ntokens; i++)
1955 addtok_mb (dfa, dfa->tokens[tindex + i], 3);
1956}
1957
1958static void
1959closure (struct dfa *dfa)
1960{
1961 atom (dfa);
1962 while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR
1963 || dfa->parse.tok == PLUS || dfa->parse.tok == REPMN)
1964 if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep))
1965 {
1966 idx_t ntokens = nsubtoks (dfa, dfa->tindex);
1967 idx_t tindex = dfa->tindex - ntokens;
1968 if (dfa->lex.maxrep < 0)
1969 addtok (dfa, PLUS);
1970 if (dfa->lex.minrep == 0)
1971 addtok (dfa, QMARK);
1972 int i;
1973 for (i = 1; i < dfa->lex.minrep; i++)
1974 {
1975 copytoks (dfa, tindex, ntokens);
1976 addtok (dfa, CAT);
1977 }
1978 for (; i < dfa->lex.maxrep; i++)
1979 {
1980 copytoks (dfa, tindex, ntokens);
1981 addtok (dfa, QMARK);
1982 addtok (dfa, CAT);
1983 }
1984 dfa->parse.tok = lex (dfa);
1985 }
1986 else if (dfa->parse.tok == REPMN)
1987 {
1988 dfa->tindex -= nsubtoks (dfa, dfa->tindex);
1989 dfa->parse.tok = lex (dfa);
1990 closure (dfa);
1991 }
1992 else
1993 {
1994 addtok (dfa, dfa->parse.tok);
1995 dfa->parse.tok = lex (dfa);
1996 }
1997}
1998
1999static void
2000branch (struct dfa* dfa)
2001{
2002 closure (dfa);
2003 while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
2004 && dfa->parse.tok >= 0)
2005 {
2006 closure (dfa);
2007 addtok (dfa, CAT);
2008 }
2009}
2010
2011static void
2012regexp (struct dfa *dfa)
2013{
2014 branch (dfa);
2015 while (dfa->parse.tok == OR)
2016 {
2017 dfa->parse.tok = lex (dfa);
2018 branch (dfa);
2019 addtok (dfa, OR);
2020 }
2021}
2022
2023/* Parse a string S of length LEN into D. S can include NUL characters.
2024 This is the main entry point for the parser. */
2025void
2026dfaparse (char const *s, idx_t len, struct dfa *d)
2027{
2028 d->lex.ptr = s;
2029 d->lex.left = len;
2030 d->lex.lasttok = END;
2031 d->lex.laststart = true;
2032
2033 if (!d->syntax.syntax_bits_set)
2034 dfaerror (_("no syntax specified"));
2035
2036 if (!d->nregexps)
2037 addtok (d, BEG);
2038
2039 d->parse.tok = lex (d);
2040 d->parse.depth = d->depth;
2041
2042 regexp (d);
2043
2044 if (d->parse.tok != END)
2045 dfaerror (_("unbalanced )"));
2046
2047 addtok (d, END - d->nregexps);
2048 addtok (d, CAT);
2049
2050 if (d->nregexps)
2051 addtok (d, OR);
2052
2053 ++d->nregexps;
2054}
2055
2056/* Some primitives for operating on sets of positions. */
2057
2058/* Copy one set to another. */
2059static void
2060copy (position_set const *src, position_set *dst)
2061{
2062 if (dst->alloc < src->nelem)
2063 {
2064 free (dst->elems);
2065 dst->elems = xpalloc (NULL, &dst->alloc, src->nelem - dst->alloc, -1,
2066 sizeof *dst->elems);
2067 }
2068 dst->nelem = src->nelem;
2069 if (src->nelem != 0)
2070 memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
2071}
2072
2073static void
2074alloc_position_set (position_set *s, idx_t size)
2075{
2076 s->elems = xnmalloc (size, sizeof *s->elems);
2077 s->alloc = size;
2078 s->nelem = 0;
2079}
2080
2081/* Insert position P in set S. S is maintained in sorted order on
2082 decreasing index. If there is already an entry in S with P.index
2083 then merge (logically-OR) P's constraints into the one in S.
2084 S->elems must point to an array large enough to hold the resulting set. */
2085static void
2086insert (position p, position_set *s)
2087{
2088 idx_t count = s->nelem;
2089 idx_t lo = 0, hi = count;
2090 while (lo < hi)
2091 {
2092 idx_t mid = (lo + hi) >> 1;
2093 if (s->elems[mid].index < p.index)
2094 lo = mid + 1;
2095 else if (s->elems[mid].index == p.index)
2096 {
2097 s->elems[mid].constraint |= p.constraint;
2098 return;
2099 }
2100 else
2101 hi = mid;
2102 }
2103
2104 s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2105 for (idx_t i = count; i > lo; i--)
2106 s->elems[i] = s->elems[i - 1];
2107 s->elems[lo] = p;
2108 ++s->nelem;
2109}
2110
2111static void
2112append (position p, position_set *s)
2113{
2114 idx_t count = s->nelem;
2115 s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2116 s->elems[s->nelem++] = p;
2117}
2118
2119/* Merge S1 and S2 (with the additional constraint C2) into M. The
2120 result is as if the positions of S1, and of S2 with the additional
2121 constraint C2, were inserted into an initially empty set. */
2122static void
2123merge_constrained (position_set const *s1, position_set const *s2,
2124 unsigned int c2, position_set *m)
2125{
2126 idx_t i = 0, j = 0;
2127
2128 if (m->alloc - s1->nelem < s2->nelem)
2129 {
2130 free (m->elems);
2131 m->alloc = s1->nelem;
2132 m->elems = xpalloc (NULL, &m->alloc, s2->nelem, -1, sizeof *m->elems);
2133 }
2134 m->nelem = 0;
2135 while (i < s1->nelem || j < s2->nelem)
2136 if (! (j < s2->nelem)
2137 || (i < s1->nelem && s1->elems[i].index <= s2->elems[j].index))
2138 {
2139 unsigned int c = ((i < s1->nelem && j < s2->nelem
2140 && s1->elems[i].index == s2->elems[j].index)
2141 ? s2->elems[j++].constraint & c2
2142 : 0);
2143 m->elems[m->nelem].index = s1->elems[i].index;
2144 m->elems[m->nelem++].constraint = s1->elems[i++].constraint | c;
2145 }
2146 else
2147 {
2148 if (s2->elems[j].constraint & c2)
2149 {
2150 m->elems[m->nelem].index = s2->elems[j].index;
2151 m->elems[m->nelem++].constraint = s2->elems[j].constraint & c2;
2152 }
2153 j++;
2154 }
2155}
2156
2157/* Merge two sets of positions into a third. The result is exactly as if
2158 the positions of both sets were inserted into an initially empty set. */
2159static void
2160merge (position_set const *s1, position_set const *s2, position_set *m)
2161{
2162 merge_constrained (s1, s2, -1, m);
2163}
2164
2165/* Merge into DST all the elements of SRC, possibly destroying
2166 the contents of the temporary M. */
2167static void
2168merge2 (position_set *dst, position_set const *src, position_set *m)
2169{
2170 if (src->nelem < 4)
2171 {
2172 for (idx_t i = 0; i < src->nelem; i++)
2173 insert (src->elems[i], dst);
2174 }
2175 else
2176 {
2177 merge (src, dst, m);
2178 copy (m, dst);
2179 }
2180}
2181
2182/* Delete a position from a set. Return the nonzero constraint of the
2183 deleted position, or zero if there was no such position. */
2184static unsigned int
2185delete (idx_t del, position_set *s)
2186{
2187 idx_t count = s->nelem;
2188 idx_t lo = 0, hi = count;
2189 while (lo < hi)
2190 {
2191 idx_t mid = (lo + hi) >> 1;
2192 if (s->elems[mid].index < del)
2193 lo = mid + 1;
2194 else if (s->elems[mid].index == del)
2195 {
2196 unsigned int c = s->elems[mid].constraint;
2197 idx_t i;
2198 for (i = mid; i + 1 < count; i++)
2199 s->elems[i] = s->elems[i + 1];
2200 s->nelem = i;
2201 return c;
2202 }
2203 else
2204 hi = mid;
2205 }
2206 return 0;
2207}
2208
2209/* Replace a position with the followed set. */
2210static void
2211replace (position_set *dst, idx_t del, position_set *add,
2212 unsigned int constraint, position_set *tmp)
2213{
2214 unsigned int c = delete (del, dst) & constraint;
2215
2216 if (c)
2217 {
2218 copy (dst, tmp);
2219 merge_constrained (tmp, add, c, dst);
2220 }
2221}
2222
2223/* Find the index of the state corresponding to the given position set with
2224 the given preceding context, or create a new state if there is no such
2225 state. Context tells whether we got here on a newline or letter. */
2226static state_num
2227state_index (struct dfa *d, position_set const *s, int context)
2228{
2229 size_t hash = 0;
2230 int constraint = 0;
2231 state_num i;
2232
2233 for (i = 0; i < s->nelem; ++i)
2234 {
2235 idx_t ind = s->elems[i].index;
2236 hash ^= ind + s->elems[i].constraint;
2237 }
2238
2239 /* Try to find a state that exactly matches the proposed one. */
2240 for (i = 0; i < d->sindex; ++i)
2241 {
2242 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
2243 || context != d->states[i].context)
2244 continue;
2245 state_num j;
2246 for (j = 0; j < s->nelem; ++j)
2247 if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint
2248 || s->elems[j].index != d->states[i].elems.elems[j].index)
2249 break;
2250 if (j == s->nelem)
2251 return i;
2252 }
2253
2254#ifdef DEBUG
2255 fprintf (stderr, "new state %td\n nextpos:", i);
2256 for (state_num j = 0; j < s->nelem; j++)
2257 {
2258 fprintf (stderr, " %td:", s->elems[j].index);
2259 prtok (d->tokens[s->elems[j].index]);
2260 }
2261 fprintf (stderr, "\n context:");
2262 if (context ^ CTX_ANY)
2263 {
2264 if (context & CTX_NONE)
2265 fprintf (stderr, " CTX_NONE");
2266 if (context & CTX_LETTER)
2267 fprintf (stderr, " CTX_LETTER");
2268 if (context & CTX_NEWLINE)
2269 fprintf (stderr, " CTX_NEWLINE");
2270 }
2271 else
2272 fprintf (stderr, " CTX_ANY");
2273 fprintf (stderr, "\n");
2274#endif
2275
2276 for (state_num j = 0; j < s->nelem; j++)
2277 {
2278 int c = d->constraints[s->elems[j].index];
2279
2280 if (c != 0)
2281 {
2282 if (succeeds_in_context (c, context, CTX_ANY))
2283 constraint |= c;
2284 }
2285 else if (d->tokens[s->elems[j].index] == BACKREF)
2286 constraint = NO_CONSTRAINT;
2287 }
2288
2289
2290 /* Create a new state. */
2291 d->states = maybe_realloc (d->states, d->sindex, &d->salloc, -1,
2292 sizeof *d->states);
2293 d->states[i].hash = hash;
2294 alloc_position_set (&d->states[i].elems, s->nelem);
2295 copy (s, &d->states[i].elems);
2296 d->states[i].context = context;
2297 d->states[i].constraint = constraint;
2298 d->states[i].mbps.nelem = 0;
2299 d->states[i].mbps.elems = NULL;
2300 d->states[i].mb_trindex = -1;
2301
2302 ++d->sindex;
2303
2304 return i;
2305}
2306
2307/* Find the epsilon closure of D's set of positions. If any position of the set
2308 contains a symbol that matches the empty string in some context, replace
2309 that position with the elements of its follow labeled with an appropriate
2310 constraint. Repeat exhaustively until no funny positions are left.
2311 S->elems must be large enough to hold the result. BACKWARD is D's
2312 backward set; use and update it too. */
2313static void
2314epsclosure (struct dfa const *d, position_set *backward)
2315{
2316 position_set tmp;
2317 alloc_position_set (&tmp, d->nleaves);
2318 for (idx_t i = 0; i < d->tindex; i++)
2319 if (0 < d->follows[i].nelem)
2320 {
2321 unsigned int constraint;
2322 switch (d->tokens[i])
2323 {
2324 default:
2325 continue;
2326
2327 case BEGLINE:
2328 constraint = BEGLINE_CONSTRAINT;
2329 break;
2330 case ENDLINE:
2331 constraint = ENDLINE_CONSTRAINT;
2332 break;
2333 case BEGWORD:
2334 constraint = BEGWORD_CONSTRAINT;
2335 break;
2336 case ENDWORD:
2337 constraint = ENDWORD_CONSTRAINT;
2338 break;
2339 case LIMWORD:
2340 constraint = LIMWORD_CONSTRAINT;
2341 break;
2342 case NOTLIMWORD:
2343 constraint = NOTLIMWORD_CONSTRAINT;
2344 break;
2345 case EMPTY:
2346 constraint = NO_CONSTRAINT;
2347 break;
2348 }
2349
2350 delete (i, &d->follows[i]);
2351
2352 for (idx_t j = 0; j < backward[i].nelem; j++)
2353 replace (&d->follows[backward[i].elems[j].index], i, &d->follows[i],
2354 constraint, &tmp);
2355 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2356 replace (&backward[d->follows[i].elems[j].index], i, &backward[i],
2357 NO_CONSTRAINT, &tmp);
2358 }
2359 free (tmp.elems);
2360}
2361
2362/* Returns the set of contexts for which there is at least one
2363 character included in C. */
2364
2365static int
2366charclass_context (struct dfa const *dfa, charclass const *c)
2367{
2368 int context = 0;
2369
2370 for (int j = 0; j < CHARCLASS_WORDS; j++)
2371 {
2372 if (c->w[j] & dfa->syntax.newline.w[j])
2373 context |= CTX_NEWLINE;
2374 if (c->w[j] & dfa->syntax.letters.w[j])
2375 context |= CTX_LETTER;
2376 if (c->w[j] & ~(dfa->syntax.letters.w[j] | dfa->syntax.newline.w[j]))
2377 context |= CTX_NONE;
2378 }
2379
2380 return context;
2381}
2382
2383/* Returns the contexts on which the position set S depends. Each context
2384 in the set of returned contexts (let's call it SC) may have a different
2385 follow set than other contexts in SC, and also different from the
2386 follow set of the complement set (sc ^ CTX_ANY). However, all contexts
2387 in the complement set will have the same follow set. */
2388
2389static int _GL_ATTRIBUTE_PURE
2390state_separate_contexts (struct dfa *d, position_set const *s)
2391{
2392 int separate_contexts = 0;
2393
2394 for (idx_t j = 0; j < s->nelem; j++)
2395 separate_contexts |= d->separates[s->elems[j].index];
2396
2397 return separate_contexts;
2398}
2399
2400enum
2401{
2402 /* Single token is repeated. It is distinguished from non-repeated. */
2403 OPT_REPEAT = (1 << 0),
2404
2405 /* Multiple tokens are repeated. This flag is on at head of tokens. The
2406 node is not merged. */
2407 OPT_LPAREN = (1 << 1),
2408
2409 /* Multiple branches are joined. The node is not merged. */
2410 OPT_RPAREN = (1 << 2),
2411
2412 /* The node is walked. If the node is found in walking again, OPT_RPAREN
2413 flag is turned on. */
2414 OPT_WALKED = (1 << 3),
2415
2416 /* The node is queued. The node is not queued again. */
2417 OPT_QUEUED = (1 << 4)
2418};
2419
2420static void
2421merge_nfa_state (struct dfa *d, idx_t tindex, char *flags,
2422 position_set *merged)
2423{
2424 position_set *follows = d->follows;
2425 idx_t nelem = 0;
2426
2427 for (idx_t i = 0; i < follows[tindex].nelem; i++)
2428 {
2429 idx_t sindex = follows[tindex].elems[i].index;
2430
2431 /* Skip the node as pruned in future. */
2432 unsigned int iconstraint = follows[tindex].elems[i].constraint;
2433 if (iconstraint == 0)
2434 continue;
2435
2436 if (d->tokens[follows[tindex].elems[i].index] <= END)
2437 {
2438 d->constraints[tindex] |= follows[tindex].elems[i].constraint;
2439 continue;
2440 }
2441
2442 if (sindex != tindex && !(flags[sindex] & (OPT_LPAREN | OPT_RPAREN)))
2443 {
2444 idx_t j;
2445
2446 for (j = 0; j < nelem; j++)
2447 {
2448 idx_t dindex = follows[tindex].elems[j].index;
2449
2450 if (dindex == tindex)
2451 continue;
2452
2453 if (follows[tindex].elems[j].constraint != iconstraint)
2454 continue;
2455
2456 if (flags[dindex] & (OPT_LPAREN | OPT_RPAREN))
2457 continue;
2458
2459 if (d->tokens[sindex] != d->tokens[dindex])
2460 continue;
2461
2462 if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT)
2463 continue;
2464
2465 if (flags[sindex] & OPT_REPEAT)
2466 delete (sindex, &follows[sindex]);
2467
2468 merge2 (&follows[dindex], &follows[sindex], merged);
2469
2470 break;
2471 }
2472
2473 if (j < nelem)
2474 continue;
2475 }
2476
2477 follows[tindex].elems[nelem++] = follows[tindex].elems[i];
2478 flags[sindex] |= OPT_QUEUED;
2479 }
2480
2481 follows[tindex].nelem = nelem;
2482}
2483
2484static int
2485compare (const void *a, const void *b)
2486{
2487 position const *p = a, *q = b;
2488 return (p->index > q->index) - (p->index < q->index);
2489}
2490
2491static void
2492reorder_tokens (struct dfa *d)
2493{
2494 idx_t nleaves = 0;
2495 ptrdiff_t *map = xnmalloc (d->tindex, sizeof *map);
2496 map[0] = nleaves++;
2497 for (idx_t i = 1; i < d->tindex; i++)
2498 map[i] = -1;
2499
2500 token *tokens = xnmalloc (d->nleaves, sizeof *tokens);
2501 position_set *follows = xnmalloc (d->nleaves, sizeof *follows);
2502 int *constraints = xnmalloc (d->nleaves, sizeof *constraints);
2503 char *multibyte_prop = (d->localeinfo.multibyte
2504 ? xnmalloc (d->nleaves, sizeof *multibyte_prop)
2505 : NULL);
2506
2507 for (idx_t i = 0; i < d->tindex; i++)
2508 {
2509 if (map[i] < 0)
2510 {
2511 free (d->follows[i].elems);
2512 d->follows[i].elems = NULL;
2513 d->follows[i].nelem = 0;
2514 continue;
2515 }
2516
2517 tokens[map[i]] = d->tokens[i];
2518 follows[map[i]] = d->follows[i];
2519 constraints[map[i]] = d->constraints[i];
2520
2521 if (multibyte_prop != NULL)
2522 multibyte_prop[map[i]] = d->multibyte_prop[i];
2523
2524 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2525 {
2526 if (map[d->follows[i].elems[j].index] == -1)
2527 map[d->follows[i].elems[j].index] = nleaves++;
2528
2529 d->follows[i].elems[j].index = map[d->follows[i].elems[j].index];
2530 }
2531
2532 qsort (d->follows[i].elems, d->follows[i].nelem,
2533 sizeof *d->follows[i].elems, compare);
2534 }
2535
2536 for (idx_t i = 0; i < nleaves; i++)
2537 {
2538 d->tokens[i] = tokens[i];
2539 d->follows[i] = follows[i];
2540 d->constraints[i] = constraints[i];
2541
2542 if (multibyte_prop != NULL)
2543 d->multibyte_prop[i] = multibyte_prop[i];
2544 }
2545
2546 d->tindex = d->nleaves = nleaves;
2547
2548 free (tokens);
2549 free (follows);
2550 free (constraints);
2551 free (multibyte_prop);
2552 free (map);
2553}
2554
2555static void
2556dfaoptimize (struct dfa *d)
2557{
2558 char *flags = xizalloc (d->tindex);
2559
2560 for (idx_t i = 0; i < d->tindex; i++)
2561 {
2562 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2563 {
2564 if (d->follows[i].elems[j].index == i)
2565 flags[d->follows[i].elems[j].index] |= OPT_REPEAT;
2566 else if (d->follows[i].elems[j].index < i)
2567 flags[d->follows[i].elems[j].index] |= OPT_LPAREN;
2568 else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED)
2569 flags[d->follows[i].elems[j].index] |= OPT_RPAREN;
2570 else
2571 flags[d->follows[i].elems[j].index] |= OPT_WALKED;
2572 }
2573 }
2574
2575 flags[0] |= OPT_QUEUED;
2576
2577 position_set merged0;
2578 position_set *merged = &merged0;
2579 alloc_position_set (merged, d->nleaves);
2580
2581 d->constraints = xicalloc (d->tindex, sizeof *d->constraints);
2582
2583 for (idx_t i = 0; i < d->tindex; i++)
2584 if (flags[i] & OPT_QUEUED)
2585 merge_nfa_state (d, i, flags, merged);
2586
2587 reorder_tokens (d);
2588
2589 free (merged->elems);
2590 free (flags);
2591}
2592
2593/* Perform bottom-up analysis on the parse tree, computing various functions.
2594 Note that at this point, we're pretending constructs like \< are real
2595 characters rather than constraints on what can follow them.
2596
2597 Nullable: A node is nullable if it is at the root of a regexp that can
2598 match the empty string.
2599 * EMPTY leaves are nullable.
2600 * No other leaf is nullable.
2601 * A QMARK or STAR node is nullable.
2602 * A PLUS node is nullable if its argument is nullable.
2603 * A CAT node is nullable if both its arguments are nullable.
2604 * An OR node is nullable if either argument is nullable.
2605
2606 Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
2607 that could correspond to the first character of a string matching the
2608 regexp rooted at the given node.
2609 * EMPTY leaves have empty firstpos.
2610 * The firstpos of a nonempty leaf is that leaf itself.
2611 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2612 argument.
2613 * The firstpos of a CAT node is the firstpos of the left argument, union
2614 the firstpos of the right if the left argument is nullable.
2615 * The firstpos of an OR node is the union of firstpos of each argument.
2616
2617 Lastpos: The lastpos of a node is the set of positions that could
2618 correspond to the last character of a string matching the regexp at
2619 the given node.
2620 * EMPTY leaves have empty lastpos.
2621 * The lastpos of a nonempty leaf is that leaf itself.
2622 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2623 argument.
2624 * The lastpos of a CAT node is the lastpos of its right argument, union
2625 the lastpos of the left if the right argument is nullable.
2626 * The lastpos of an OR node is the union of the lastpos of each argument.
2627
2628 Follow: The follow of a position is the set of positions that could
2629 correspond to the character following a character matching the node in
2630 a string matching the regexp. At this point we consider special symbols
2631 that match the empty string in some context to be just normal characters.
2632 Later, if we find that a special symbol is in a follow set, we will
2633 replace it with the elements of its follow, labeled with an appropriate
2634 constraint.
2635 * Every node in the firstpos of the argument of a STAR or PLUS node is in
2636 the follow of every node in the lastpos.
2637 * Every node in the firstpos of the second argument of a CAT node is in
2638 the follow of every node in the lastpos of the first argument.
2639
2640 Because of the postfix representation of the parse tree, the depth-first
2641 analysis is conveniently done by a linear scan with the aid of a stack.
2642 Sets are stored as arrays of the elements, obeying a stack-like allocation
2643 scheme; the number of elements in each set deeper in the stack can be
2644 used to determine the address of a particular set's array. */
2645static void
2646dfaanalyze (struct dfa *d, bool searchflag)
2647{
2648 /* Array allocated to hold position sets. */
2649 position *posalloc = xnmalloc (d->nleaves, 2 * sizeof *posalloc);
2650 /* Firstpos and lastpos elements. */
2651 position *firstpos = posalloc;
2652 position *lastpos = firstpos + d->nleaves;
2653 position pos;
2654 position_set tmp;
2655
2656 /* Stack for element counts and nullable flags. */
2657 struct
2658 {
2659 /* Whether the entry is nullable. */
2660 bool nullable;
2661
2662 /* Counts of firstpos and lastpos sets. */
2663 idx_t nfirstpos;
2664 idx_t nlastpos;
2665 } *stkalloc = xnmalloc (d->depth, sizeof *stkalloc), *stk = stkalloc;
2666
2667 position_set merged; /* Result of merging sets. */
2668
2669 addtok (d, CAT);
2670 idx_t tindex = d->tindex;
2671
2672#ifdef DEBUG
2673 fprintf (stderr, "dfaanalyze:\n");
2674 for (idx_t i = 0; i < tindex; i++)
2675 {
2676 fprintf (stderr, " %td:", i);
2677 prtok (d->tokens[i]);
2678 }
2679 putc ('\n', stderr);
2680#endif
2681
2682 d->searchflag = searchflag;
2683 alloc_position_set (&merged, d->nleaves);
2684 d->follows = xicalloc (tindex, sizeof *d->follows);
2685 position_set *backward
2686 = d->epsilon ? xicalloc (tindex, sizeof *backward) : NULL;
2687
2688 for (idx_t i = 0; i < tindex; i++)
2689 {
2690 switch (d->tokens[i])
2691 {
2692 case EMPTY:
2693 /* The empty set is nullable. */
2694 stk->nullable = true;
2695
2696 /* The firstpos and lastpos of the empty leaf are both empty. */
2697 stk->nfirstpos = stk->nlastpos = 0;
2698 stk++;
2699 break;
2700
2701 case STAR:
2702 case PLUS:
2703 /* Every element in the lastpos of the argument is in the backward
2704 set of every element in the firstpos. */
2705 if (d->epsilon)
2706 {
2707 tmp.elems = lastpos - stk[-1].nlastpos;
2708 tmp.nelem = stk[-1].nlastpos;
2709 for (position *p = firstpos - stk[-1].nfirstpos;
2710 p < firstpos; p++)
2711 merge2 (&backward[p->index], &tmp, &merged);
2712 }
2713
2714 /* Every element in the firstpos of the argument is in the follow
2715 of every element in the lastpos. */
2716 {
2717 tmp.elems = firstpos - stk[-1].nfirstpos;
2718 tmp.nelem = stk[-1].nfirstpos;
2719 for (position *p = lastpos - stk[-1].nlastpos; p < lastpos; p++)
2720 merge2 (&d->follows[p->index], &tmp, &merged);
2721 }
2722 FALLTHROUGH;
2723 case QMARK:
2724 /* A QMARK or STAR node is automatically nullable. */
2725 if (d->tokens[i] != PLUS)
2726 stk[-1].nullable = true;
2727 break;
2728
2729 case CAT:
2730 /* Every element in the lastpos of the first argument is in
2731 the backward set of every element in the firstpos of the
2732 second argument. */
2733 if (backward)
2734 {
2735 tmp.nelem = stk[-2].nlastpos;
2736 tmp.elems = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2737 for (position *p = firstpos - stk[-1].nfirstpos;
2738 p < firstpos; p++)
2739 merge2 (&backward[p->index], &tmp, &merged);
2740 }
2741
2742 /* Every element in the firstpos of the second argument is in the
2743 follow of every element in the lastpos of the first argument. */
2744 {
2745 tmp.nelem = stk[-1].nfirstpos;
2746 tmp.elems = firstpos - stk[-1].nfirstpos;
2747 for (position *plim = lastpos - stk[-1].nlastpos,
2748 *p = plim - stk[-2].nlastpos;
2749 p < plim; p++)
2750 merge2 (&d->follows[p->index], &tmp, &merged);
2751 }
2752
2753 /* The firstpos of a CAT node is the firstpos of the first argument,
2754 union that of the second argument if the first is nullable. */
2755 if (stk[-2].nullable)
2756 stk[-2].nfirstpos += stk[-1].nfirstpos;
2757 else
2758 firstpos -= stk[-1].nfirstpos;
2759
2760 /* The lastpos of a CAT node is the lastpos of the second argument,
2761 union that of the first argument if the second is nullable. */
2762 if (stk[-1].nullable)
2763 stk[-2].nlastpos += stk[-1].nlastpos;
2764 else
2765 {
2766 position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2767 for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2768 p[j] = p[j + stk[-2].nlastpos];
2769 lastpos -= stk[-2].nlastpos;
2770 stk[-2].nlastpos = stk[-1].nlastpos;
2771 }
2772
2773 /* A CAT node is nullable if both arguments are nullable. */
2774 stk[-2].nullable &= stk[-1].nullable;
2775 stk--;
2776 break;
2777
2778 case OR:
2779 /* The firstpos is the union of the firstpos of each argument. */
2780 stk[-2].nfirstpos += stk[-1].nfirstpos;
2781
2782 /* The lastpos is the union of the lastpos of each argument. */
2783 stk[-2].nlastpos += stk[-1].nlastpos;
2784
2785 /* An OR node is nullable if either argument is nullable. */
2786 stk[-2].nullable |= stk[-1].nullable;
2787 stk--;
2788 break;
2789
2790 default:
2791 /* Anything else is a nonempty position. (Note that special
2792 constructs like \< are treated as nonempty strings here;
2793 an "epsilon closure" effectively makes them nullable later.
2794 Backreferences have to get a real position so we can detect
2795 transitions on them later. But they are nullable. */
2796 stk->nullable = d->tokens[i] == BACKREF;
2797
2798 /* This position is in its own firstpos and lastpos. */
2799 stk->nfirstpos = stk->nlastpos = 1;
2800 stk++;
2801
2802 firstpos->index = lastpos->index = i;
2803 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2804 firstpos++, lastpos++;
2805
2806 break;
2807 }
2808#ifdef DEBUG
2809 /* ... balance the above nonsyntactic #ifdef goo... */
2810 fprintf (stderr, "node %td:", i);
2811 prtok (d->tokens[i]);
2812 putc ('\n', stderr);
2813 fprintf (stderr,
2814 stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
2815 fprintf (stderr, " firstpos:");
2816 for (idx_t j = 0; j < stk[-1].nfirstpos; j++)
2817 {
2818 fprintf (stderr, " %td:", firstpos[j - stk[-1].nfirstpos].index);
2819 prtok (d->tokens[firstpos[j - stk[-1].nfirstpos].index]);
2820 }
2821 fprintf (stderr, "\n lastpos:");
2822 for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2823 {
2824 fprintf (stderr, " %td:", lastpos[j - stk[-1].nlastpos].index);
2825 prtok (d->tokens[lastpos[j - stk[-1].nlastpos].index]);
2826 }
2827 putc ('\n', stderr);
2828#endif
2829 }
2830
2831 if (backward)
2832 {
2833 /* For each follow set that is the follow set of a real position,
2834 replace it with its epsilon closure. */
2835 epsclosure (d, backward);
2836
2837 for (idx_t i = 0; i < tindex; i++)
2838 free (backward[i].elems);
2839 free (backward);
2840 }
2841
2842 dfaoptimize (d);
2843
2844#ifdef DEBUG
2845 for (idx_t i = 0; i < tindex; i++)
2846 if (d->tokens[i] == BEG || d->tokens[i] < NOTCHAR
2847 || d->tokens[i] == BACKREF || d->tokens[i] == ANYCHAR
2848 || d->tokens[i] == MBCSET || d->tokens[i] >= CSET)
2849 {
2850 fprintf (stderr, "follows(%td:", i);
2851 prtok (d->tokens[i]);
2852 fprintf (stderr, "):");
2853 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2854 {
2855 fprintf (stderr, " %td:", d->follows[i].elems[j].index);
2856 prtok (d->tokens[d->follows[i].elems[j].index]);
2857 }
2858 putc ('\n', stderr);
2859 }
2860#endif
2861
2862 pos.index = 0;
2863 pos.constraint = NO_CONSTRAINT;
2864
2865 alloc_position_set (&tmp, 1);
2866
2867 append (pos, &tmp);
2868
2869 d->separates = xicalloc (tindex, sizeof *d->separates);
2870
2871 for (idx_t i = 0; i < tindex; i++)
2872 {
2873 if (prev_newline_dependent (d->constraints[i]))
2874 d->separates[i] |= CTX_NEWLINE;
2875 if (prev_letter_dependent (d->constraints[i]))
2876 d->separates[i] |= CTX_LETTER;
2877
2878 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2879 {
2880 if (prev_newline_dependent (d->follows[i].elems[j].constraint))
2881 d->separates[i] |= CTX_NEWLINE;
2882 if (prev_letter_dependent (d->follows[i].elems[j].constraint))
2883 d->separates[i] |= CTX_LETTER;
2884 }
2885 }
2886
2887 /* Context wanted by some position. */
2888 int separate_contexts = state_separate_contexts (d, &tmp);
2889
2890 /* Build the initial state. */
2891 if (separate_contexts & CTX_NEWLINE)
2892 state_index (d, &tmp, CTX_NEWLINE);
2893 d->initstate_notbol = d->min_trcount
2894 = state_index (d, &tmp, separate_contexts ^ CTX_ANY);
2895 if (separate_contexts & CTX_LETTER)
2896 d->min_trcount = state_index (d, &tmp, CTX_LETTER);
2897 d->min_trcount++;
2898 d->trcount = 0;
2899
2900 free (posalloc);
2901 free (stkalloc);
2902 free (merged.elems);
2903 free (tmp.elems);
2904}
2905
2906/* Make sure D's state arrays are large enough to hold NEW_STATE. */
2907static void
2908realloc_trans_if_necessary (struct dfa *d)
2909{
2910 state_num oldalloc = d->tralloc;
2911 if (oldalloc < d->sindex)
2912 {
2913 state_num **realtrans = d->trans ? d->trans - 2 : NULL;
2914 idx_t newalloc1 = realtrans ? d->tralloc + 2 : 0;
2915 realtrans = xpalloc (realtrans, &newalloc1, d->sindex - oldalloc,
2916 -1, sizeof *realtrans);
2917 realtrans[0] = realtrans[1] = NULL;
2918 d->trans = realtrans + 2;
2919 idx_t newalloc = d->tralloc = newalloc1 - 2;
2920 d->fails = xreallocarray (d->fails, newalloc, sizeof *d->fails);
2921 d->success = xreallocarray (d->success, newalloc, sizeof *d->success);
2922 d->newlines = xreallocarray (d->newlines, newalloc, sizeof *d->newlines);
2923 if (d->localeinfo.multibyte)
2924 {
2925 realtrans = d->mb_trans ? d->mb_trans - 2 : NULL;
2926 realtrans = xreallocarray (realtrans, newalloc1, sizeof *realtrans);
2927 if (oldalloc == 0)
2928 realtrans[0] = realtrans[1] = NULL;
2929 d->mb_trans = realtrans + 2;
2930 }
2931 for (; oldalloc < newalloc; oldalloc++)
2932 {
2933 d->trans[oldalloc] = NULL;
2934 d->fails[oldalloc] = NULL;
2935 if (d->localeinfo.multibyte)
2936 d->mb_trans[oldalloc] = NULL;
2937 }
2938 }
2939}
2940
2941/*
2942 Calculate the transition table for a new state derived from state s
2943 for a compiled dfa d after input character uc, and return the new
2944 state number.
2945
2946 Do not worry about all possible input characters; calculate just the group
2947 of positions that match uc. Label it with the set of characters that
2948 every position in the group matches (taking into account, if necessary,
2949 preceding context information of s). Then find the union
2950 of these positions' follows, i.e., the set of positions of the
2951 new state. For each character in the group's label, set the transition
2952 on this character to be to a state corresponding to the set's positions,
2953 and its associated backward context information, if necessary.
2954
2955 When building a searching matcher, include the positions of state
2956 0 in every state.
2957
2958 The group is constructed by building an equivalence-class
2959 partition of the positions of s.
2960
2961 For each position, find the set of characters C that it matches. Eliminate
2962 any characters from C that fail on grounds of backward context.
2963
2964 Check whether the group's label L has nonempty
2965 intersection with C. If L - C is nonempty, create a new group labeled
2966 L - C and having the same positions as the current group, and set L to
2967 the intersection of L and C. Insert the position in the group, set
2968 C = C - L, and resume scanning.
2969
2970 If after comparing with every group there are characters remaining in C,
2971 create a new group labeled with the characters of C and insert this
2972 position in that group. */
2973
2974static state_num
2975build_state (state_num s, struct dfa *d, unsigned char uc)
2976{
2977 position_set follows; /* Union of the follows for each
2978 position of the current state. */
2979 position_set group; /* Positions that match the input char. */
2980 position_set tmp; /* Temporary space for merging sets. */
2981 state_num state; /* New state. */
2982 state_num state_newline; /* New state on a newline transition. */
2983 state_num state_letter; /* New state on a letter transition. */
2984
2985#ifdef DEBUG
2986 fprintf (stderr, "build state %td\n", s);
2987#endif
2988
2989 /* A pointer to the new transition table, and the table itself. */
2990 state_num **ptrans = (accepting (s, d) ? d->fails : d->trans) + s;
2991 state_num *trans = *ptrans;
2992
2993 if (!trans)
2994 {
2995 /* MAX_TRCOUNT is an arbitrary upper limit on the number of
2996 transition tables that can exist at once, other than for
2997 initial states. Often-used transition tables are quickly
2998 rebuilt, whereas rarely-used ones are cleared away. */
2999 if (MAX_TRCOUNT <= d->trcount)
3000 {
3001 for (state_num i = d->min_trcount; i < d->tralloc; i++)
3002 {
3003 free (d->trans[i]);
3004 free (d->fails[i]);
3005 d->trans[i] = d->fails[i] = NULL;
3006 }
3007 d->trcount = 0;
3008 }
3009
3010 d->trcount++;
3011 *ptrans = trans = xmalloc (NOTCHAR * sizeof *trans);
3012
3013 /* Fill transition table with a default value which means that the
3014 transited state has not been calculated yet. */
3015 for (int i = 0; i < NOTCHAR; i++)
3016 trans[i] = -2;
3017 }
3018
3019 /* Set up the success bits for this state. */
3020 d->success[s] = 0;
3021 if (accepts_in_context (d->states[s].context, CTX_NEWLINE, s, d))
3022 d->success[s] |= CTX_NEWLINE;
3023 if (accepts_in_context (d->states[s].context, CTX_LETTER, s, d))
3024 d->success[s] |= CTX_LETTER;
3025 if (accepts_in_context (d->states[s].context, CTX_NONE, s, d))
3026 d->success[s] |= CTX_NONE;
3027
3028 alloc_position_set (&follows, d->nleaves);
3029
3030 /* Find the union of the follows of the positions of the group.
3031 This is a hideously inefficient loop. Fix it someday. */
3032 for (idx_t j = 0; j < d->states[s].elems.nelem; j++)
3033 for (idx_t k = 0;
3034 k < d->follows[d->states[s].elems.elems[j].index].nelem; ++k)
3035 insert (d->follows[d->states[s].elems.elems[j].index].elems[k],
3036 &follows);
3037
3038 /* Positions that match the input char. */
3039 alloc_position_set (&group, d->nleaves);
3040
3041 /* The group's label. */
3042 charclass label;
3043 fillset (&label);
3044
3045 for (idx_t i = 0; i < follows.nelem; i++)
3046 {
3047 charclass matches; /* Set of matching characters. */
3048 position pos = follows.elems[i];
3049 bool matched = false;
3050 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
3051 {
3052 zeroset (&matches);
3053 setbit (d->tokens[pos.index], &matches);
3054 if (d->tokens[pos.index] == uc)
3055 matched = true;
3056 }
3057 else if (d->tokens[pos.index] >= CSET)
3058 {
3059 matches = d->charclasses[d->tokens[pos.index] - CSET];
3060 if (tstbit (uc, &matches))
3061 matched = true;
3062 }
3063 else if (d->tokens[pos.index] == ANYCHAR)
3064 {
3065 matches = d->charclasses[d->canychar];
3066 if (tstbit (uc, &matches))
3067 matched = true;
3068
3069 /* ANYCHAR must match with a single character, so we must put
3070 it to D->states[s].mbps which contains the positions which
3071 can match with a single character not a byte. If all
3072 positions which has ANYCHAR does not depend on context of
3073 next character, we put the follows instead of it to
3074 D->states[s].mbps to optimize. */
3075 if (succeeds_in_context (pos.constraint, d->states[s].context,
3076 CTX_NONE))
3077 {
3078 if (d->states[s].mbps.nelem == 0)
3079 alloc_position_set (&d->states[s].mbps, 1);
3080 insert (pos, &d->states[s].mbps);
3081 }
3082 }
3083 else
3084 continue;
3085
3086 /* Some characters may need to be eliminated from matches because
3087 they fail in the current context. */
3088 if (pos.constraint != NO_CONSTRAINT)
3089 {
3090 if (!succeeds_in_context (pos.constraint,
3091 d->states[s].context, CTX_NEWLINE))
3092 for (int j = 0; j < CHARCLASS_WORDS; j++)
3093 matches.w[j] &= ~d->syntax.newline.w[j];
3094 if (!succeeds_in_context (pos.constraint,
3095 d->states[s].context, CTX_LETTER))
3096 for (int j = 0; j < CHARCLASS_WORDS; ++j)
3097 matches.w[j] &= ~d->syntax.letters.w[j];
3098 if (!succeeds_in_context (pos.constraint,
3099 d->states[s].context, CTX_NONE))
3100 for (int j = 0; j < CHARCLASS_WORDS; ++j)
3101 matches.w[j] &= d->syntax.letters.w[j] | d->syntax.newline.w[j];
3102
3103 /* If there are no characters left, there's no point in going on. */
3104 if (emptyset (&matches))
3105 continue;
3106
3107 /* If we have reset the bit that made us declare "matched", reset
3108 that indicator, too. This is required to avoid an infinite loop
3109 with this command: echo cx | LC_ALL=C grep -E 'c\b[x ]' */
3110 if (!tstbit (uc, &matches))
3111 matched = false;
3112 }
3113
3114#ifdef DEBUG
3115 fprintf (stderr, " nextpos %td:", pos.index);
3116 prtok (d->tokens[pos.index]);
3117 fprintf (stderr, " of");
3118 for (unsigned j = 0; j < NOTCHAR; j++)
3119 if (tstbit (j, &matches))
3120 fprintf (stderr, " 0x%02x", j);
3121 fprintf (stderr, "\n");
3122#endif
3123
3124 if (matched)
3125 {
3126 for (int k = 0; k < CHARCLASS_WORDS; ++k)
3127 label.w[k] &= matches.w[k];
3128 append (pos, &group);
3129 }
3130 else
3131 {
3132 for (int k = 0; k < CHARCLASS_WORDS; ++k)
3133 label.w[k] &= ~matches.w[k];
3134 }
3135 }
3136
3137 alloc_position_set (&tmp, d->nleaves);
3138
3139 if (group.nelem > 0)
3140 {
3141 /* If we are building a searching matcher, throw in the positions
3142 of state 0 as well, if possible. */
3143 if (d->searchflag)
3144 {
3145 /* If a token in follows.elems is not 1st byte of a multibyte
3146 character, or the states of follows must accept the bytes
3147 which are not 1st byte of the multibyte character.
3148 Then, if a state of follows encounters a byte, it must not be
3149 a 1st byte of a multibyte character nor a single byte character.
3150 In this case, do not add state[0].follows to next state, because
3151 state[0] must accept 1st-byte.
3152
3153 For example, suppose <sb a> is a certain single byte character,
3154 <mb A> is a certain multibyte character, and the codepoint of
3155 <sb a> equals the 2nd byte of the codepoint of <mb A>. When
3156 state[0] accepts <sb a>, state[i] transits to state[i+1] by
3157 accepting the 1st byte of <mb A>, and state[i+1] accepts the
3158 2nd byte of <mb A>, if state[i+1] encounters the codepoint of
3159 <sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do
3160 not add state[0]. */
3161
3162 bool mergeit = !d->localeinfo.multibyte;
3163 if (!mergeit)
3164 {
3165 mergeit = true;
3166 for (idx_t j = 0; mergeit && j < group.nelem; j++)
3167 mergeit &= d->multibyte_prop[group.elems[j].index];
3168 }
3169 if (mergeit)
3170 merge2 (&group, &d->states[0].elems, &tmp);
3171 }
3172
3173 /* Find out if the new state will want any context information,
3174 by calculating possible contexts that the group can match,
3175 and separate contexts that the new state wants to know. */
3176 int possible_contexts = charclass_context (d, &label);
3177 int separate_contexts = state_separate_contexts (d, &group);
3178
3179 /* Find the state(s) corresponding to the union of the follows. */
3180 if (possible_contexts & ~separate_contexts)
3181 state = state_index (d, &group, separate_contexts ^ CTX_ANY);
3182 else
3183 state = -1;
3184 if (separate_contexts & possible_contexts & CTX_NEWLINE)
3185 state_newline = state_index (d, &group, CTX_NEWLINE);
3186 else
3187 state_newline = state;
3188 if (separate_contexts & possible_contexts & CTX_LETTER)
3189 state_letter = state_index (d, &group, CTX_LETTER);
3190 else
3191 state_letter = state;
3192
3193 /* Reallocate now, to reallocate any newline transition properly. */
3194 realloc_trans_if_necessary (d);
3195 }
3196
3197 /* If we are a searching matcher, the default transition is to a state
3198 containing the positions of state 0, otherwise the default transition
3199 is to fail miserably. */
3200 else if (d->searchflag)
3201 {
3202 state_newline = 0;
3203 state_letter = d->min_trcount - 1;
3204 state = d->initstate_notbol;
3205 }
3206 else
3207 {
3208 state_newline = -1;
3209 state_letter = -1;
3210 state = -1;
3211 }
3212
3213 /* Set the transitions for each character in the label. */
3214 for (int i = 0; i < NOTCHAR; i++)
3215 if (tstbit (i, &label))
3216 switch (d->syntax.sbit[i])
3217 {
3218 case CTX_NEWLINE:
3219 trans[i] = state_newline;
3220 break;
3221 case CTX_LETTER:
3222 trans[i] = state_letter;
3223 break;
3224 default:
3225 trans[i] = state;
3226 break;
3227 }
3228
3229#ifdef DEBUG
3230 fprintf (stderr, "trans table %td", s);
3231 for (int i = 0; i < NOTCHAR; ++i)
3232 {
3233 if (!(i & 0xf))
3234 fprintf (stderr, "\n");
3235 fprintf (stderr, " %2td", trans[i]);
3236 }
3237 fprintf (stderr, "\n");
3238#endif
3239
3240 free (group.elems);
3241 free (follows.elems);
3242 free (tmp.elems);
3243
3244 /* Keep the newline transition in a special place so we can use it as
3245 a sentinel. */
3246 if (tstbit (d->syntax.eolbyte, &label))
3247 {
3248 d->newlines[s] = trans[d->syntax.eolbyte];
3249 trans[d->syntax.eolbyte] = -1;
3250 }
3251
3252 return trans[uc];
3253}
3254
3255/* Multibyte character handling sub-routines for dfaexec. */
3256
3257/* Consume a single byte and transit state from 's' to '*next_state'.
3258 This function is almost same as the state transition routin in dfaexec.
3259 But state transition is done just once, otherwise matching succeed or
3260 reach the end of the buffer. */
3261static state_num
3262transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
3263{
3264 state_num *t;
3265
3266 if (d->trans[s])
3267 t = d->trans[s];
3268 else if (d->fails[s])
3269 t = d->fails[s];
3270 else
3271 {
3272 build_state (s, d, **pp);
3273 if (d->trans[s])
3274 t = d->trans[s];
3275 else
3276 {
3277 t = d->fails[s];
3278 assert (t);
3279 }
3280 }
3281
3282 if (t[**pp] == -2)
3283 build_state (s, d, **pp);
3284
3285 return t[*(*pp)++];
3286}
3287
3288/* Transit state from s, then return new state and update the pointer of
3289 the buffer. This function is for a period operator which can match a
3290 multi-byte character. */
3291static state_num
3292transit_state (struct dfa *d, state_num s, unsigned char const **pp,
3293 unsigned char const *end)
3294{
3295 wint_t wc;
3296
3297 int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
3298
3299 /* This state has some operators which can match a multibyte character. */
3300 d->mb_follows.nelem = 0;
3301
3302 /* Calculate the state which can be reached from the state 's' by
3303 consuming 'mbclen' single bytes from the buffer. */
3304 state_num s1 = s;
3305 int mbci;
3306 for (mbci = 0; mbci < mbclen && (mbci == 0 || d->min_trcount <= s); mbci++)
3307 s = transit_state_singlebyte (d, s, pp);
3308 *pp += mbclen - mbci;
3309
3310 if (wc == WEOF)
3311 {
3312 /* It is an invalid character, so ANYCHAR is not accepted. */
3313 return s;
3314 }
3315
3316 /* If all positions which have ANYCHAR do not depend on the context
3317 of the next character, calculate the next state with
3318 pre-calculated follows and cache the result. */
3319 if (d->states[s1].mb_trindex < 0)
3320 {
3321 if (MAX_TRCOUNT <= d->mb_trcount)
3322 {
3323 state_num s3;
3324 for (s3 = -1; s3 < d->tralloc; s3++)
3325 {
3326 free (d->mb_trans[s3]);
3327 d->mb_trans[s3] = NULL;
3328 }
3329
3330 for (state_num i = 0; i < d->sindex; i++)
3331 d->states[i].mb_trindex = -1;
3332 d->mb_trcount = 0;
3333 }
3334 d->states[s1].mb_trindex = d->mb_trcount++;
3335 }
3336
3337 if (! d->mb_trans[s])
3338 {
3339 enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
3340 enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
3341 d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
3342 for (int i = 0; i < MAX_TRCOUNT; i++)
3343 d->mb_trans[s][i] = -1;
3344 }
3345 else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
3346 return d->mb_trans[s][d->states[s1].mb_trindex];
3347
3348 if (s == -1)
3349 copy (&d->states[s1].mbps, &d->mb_follows);
3350 else
3351 merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
3352
3353 int separate_contexts = state_separate_contexts (d, &d->mb_follows);
3354 state_num s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
3355 realloc_trans_if_necessary (d);
3356
3357 d->mb_trans[s][d->states[s1].mb_trindex] = s2;
3358
3359 return s2;
3360}
3361
3362/* The initial state may encounter a byte which is not a single byte character
3363 nor the first byte of a multibyte character. But it is incorrect for the
3364 initial state to accept such a byte. For example, in Shift JIS the regular
3365 expression "\\" accepts the codepoint 0x5c, but should not accept the second
3366 byte of the codepoint 0x815c. Then the initial state must skip the bytes
3367 that are not a single byte character nor the first byte of a multibyte
3368 character.
3369
3370 Given DFA state d, use mbs_to_wchar to advance MBP until it reaches
3371 or exceeds P, and return the advanced MBP. If WCP is non-NULL and
3372 the result is greater than P, set *WCP to the final wide character
3373 processed, or to WEOF if no wide character is processed. Otherwise,
3374 if WCP is non-NULL, *WCP may or may not be updated.
3375
3376 Both P and MBP must be no larger than END. */
3377static unsigned char const *
3378skip_remains_mb (struct dfa *d, unsigned char const *p,
3379 unsigned char const *mbp, char const *end)
3380{
3381 if (d->syntax.never_trail[*p])
3382 return p;
3383 while (mbp < p)
3384 {
3385 wint_t wc;
3386 mbp += mbs_to_wchar (&wc, (char const *) mbp,
3387 end - (char const *) mbp, d);
3388 }
3389 return mbp;
3390}
3391
3392/* Search through a buffer looking for a match to the struct dfa *D.
3393 Find the first occurrence of a string matching the regexp in the
3394 buffer, and the shortest possible version thereof. Return a pointer to
3395 the first character after the match, or NULL if none is found. BEGIN
3396 points to the beginning of the buffer, and END points to the first byte
3397 after its end. Note however that we store a sentinel byte (usually
3398 newline) in *END, so the actual buffer must be one byte longer.
3399 When ALLOW_NL, newlines may appear in the matching string.
3400 If COUNT is non-NULL, increment *COUNT once for each newline processed.
3401 If MULTIBYTE, the input consists of multibyte characters and/or
3402 encoding-error bytes. Otherwise, it consists of single-byte characters.
3403 Here is the list of features that make this DFA matcher punt:
3404 - [M-N] range in non-simple locale: regex is up to 25% faster on [a-z]
3405 - [^...] in non-simple locale
3406 - [[=foo=]] or [[.foo.]]
3407 - [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
3408 - back-reference: (.)\1
3409 - word-delimiter in multibyte locale: \<, \>, \b, \B
3410 See struct localeinfo.simple for the definition of "simple locale". */
3411
3412static inline char *
3413dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
3414 idx_t *count, bool multibyte)
3415{
3416 if (MAX_TRCOUNT <= d->sindex)
3417 {
3418 for (state_num s = d->min_trcount; s < d->sindex; s++)
3419 {
3420 free (d->states[s].elems.elems);
3421 free (d->states[s].mbps.elems);
3422 }
3423 d->sindex = d->min_trcount;
3424
3425 if (d->trans)
3426 {
3427 for (state_num s = 0; s < d->tralloc; s++)
3428 {
3429 free (d->trans[s]);
3430 free (d->fails[s]);
3431 d->trans[s] = d->fails[s] = NULL;
3432 }
3433 d->trcount = 0;
3434 }
3435
3436 if (d->localeinfo.multibyte && d->mb_trans)
3437 {
3438 for (state_num s = -1; s < d->tralloc; s++)
3439 {
3440 free (d->mb_trans[s]);
3441 d->mb_trans[s] = NULL;
3442 }
3443 for (state_num s = 0; s < d->min_trcount; s++)
3444 d->states[s].mb_trindex = -1;
3445 d->mb_trcount = 0;
3446 }
3447 }
3448
3449 if (!d->tralloc)
3450 realloc_trans_if_necessary (d);
3451
3452 /* Current state. */
3453 state_num s = 0, s1 = 0;
3454
3455 /* Current input character. */
3456 unsigned char const *p = (unsigned char const *) begin;
3457 unsigned char const *mbp = p;
3458
3459 /* Copy of d->trans so it can be optimized into a register. */
3460 state_num **trans = d->trans;
3461 unsigned char eol = d->syntax.eolbyte; /* Likewise for eolbyte. */
3462 unsigned char saved_end = *(unsigned char *) end;
3463 *end = eol;
3464
3465 if (multibyte)
3466 {
3467 memset (&d->mbs, 0, sizeof d->mbs);
3468 if (d->mb_follows.alloc == 0)
3469 alloc_position_set (&d->mb_follows, d->nleaves);
3470 }
3471
3472 idx_t nlcount = 0;
3473 for (;;)
3474 {
3475 state_num *t;
3476 while ((t = trans[s]) != NULL)
3477 {
3478 if (s < d->min_trcount)
3479 {
3480 if (!multibyte || d->states[s].mbps.nelem == 0)
3481 {
3482 while (t[*p] == s)
3483 p++;
3484 }
3485 if (multibyte)
3486 p = mbp = skip_remains_mb (d, p, mbp, end);
3487 }
3488
3489 if (multibyte)
3490 {
3491 s1 = s;
3492
3493 if (d->states[s].mbps.nelem == 0
3494 || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3495 {
3496 /* If an input character does not match ANYCHAR, do it
3497 like a single-byte character. */
3498 s = t[*p++];
3499 }
3500 else
3501 {
3502 s = transit_state (d, s, &p, (unsigned char *) end);
3503 mbp = p;
3504 trans = d->trans;
3505 }
3506 }
3507 else
3508 {
3509 s1 = t[*p++];
3510 t = trans[s1];
3511 if (! t)
3512 {
3513 state_num tmp = s;
3514 s = s1;
3515 s1 = tmp; /* swap */
3516 break;
3517 }
3518 if (s < d->min_trcount)
3519 {
3520 while (t[*p] == s1)
3521 p++;
3522 }
3523 s = t[*p++];
3524 }
3525 }
3526
3527 if (s < 0)
3528 {
3529 if (s == -2)
3530 {
3531 s = build_state (s1, d, p[-1]);
3532 trans = d->trans;
3533 }
3534 else if ((char *) p <= end && p[-1] == eol && 0 <= d->newlines[s1])
3535 {
3536 /* The previous character was a newline. Count it, and skip
3537 checking of multibyte character boundary until here. */
3538 nlcount++;
3539 mbp = p;
3540
3541 s = (allow_nl ? d->newlines[s1]
3542 : d->syntax.sbit[eol] == CTX_NEWLINE ? 0
3543 : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
3544 : d->initstate_notbol);
3545 }
3546 else
3547 {
3548 p = NULL;
3549 goto done;
3550 }
3551 }
3552 else if (d->fails[s])
3553 {
3554 if ((d->success[s] & d->syntax.sbit[*p])
3555 || ((char *) p == end
3556 && accepts_in_context (d->states[s].context, CTX_NEWLINE, s,
3557 d)))
3558 goto done;
3559
3560 if (multibyte && s < d->min_trcount)
3561 p = mbp = skip_remains_mb (d, p, mbp, end);
3562
3563 s1 = s;
3564 if (!multibyte || d->states[s].mbps.nelem == 0
3565 || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3566 {
3567 /* If a input character does not match ANYCHAR, do it
3568 like a single-byte character. */
3569 s = d->fails[s][*p++];
3570 }
3571 else
3572 {
3573 s = transit_state (d, s, &p, (unsigned char *) end);
3574 mbp = p;
3575 trans = d->trans;
3576 }
3577 }
3578 else
3579 {
3580 build_state (s, d, p[0]);
3581 trans = d->trans;
3582 }
3583 }
3584
3585 done:
3586 if (count)
3587 *count += nlcount;
3588 *end = saved_end;
3589 return (char *) p;
3590}
3591
3592/* Specialized versions of dfaexec for multibyte and single-byte cases.
3593 This is for performance, as dfaexec_main is an inline function. */
3594
3595static char *
3596dfaexec_mb (struct dfa *d, char const *begin, char *end,
3597 bool allow_nl, idx_t *count, bool *backref)
3598{
3599 return dfaexec_main (d, begin, end, allow_nl, count, true);
3600}
3601
3602static char *
3603dfaexec_sb (struct dfa *d, char const *begin, char *end,
3604 bool allow_nl, idx_t *count, bool *backref)
3605{
3606 return dfaexec_main (d, begin, end, allow_nl, count, false);
3607}
3608
3609/* Always set *BACKREF and return BEGIN. Use this wrapper for
3610 any regexp that uses a construct not supported by this code. */
3611static char *
3612dfaexec_noop (struct dfa *d, char const *begin, char *end,
3613 bool allow_nl, idx_t *count, bool *backref)
3614{
3615 *backref = true;
3616 return (char *) begin;
3617}
3618
3619/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
3620 but faster and set *BACKREF if the DFA code does not support this
3621 regexp usage. */
3622
3623char *
3624dfaexec (struct dfa *d, char const *begin, char *end,
3625 bool allow_nl, idx_t *count, bool *backref)
3626{
3627 return d->dfaexec (d, begin, end, allow_nl, count, backref);
3628}
3629
3630struct dfa *
3631dfasuperset (struct dfa const *d)
3632{
3633 return d->superset;
3634}
3635
3636bool
3637dfaisfast (struct dfa const *d)
3638{
3639 return d->fast;
3640}
3641
3642static void
3643free_mbdata (struct dfa *d)
3644{
3645 free (d->multibyte_prop);
3646 free (d->lex.brack.chars);
3647 free (d->mb_follows.elems);
3648
3649 if (d->mb_trans)
3650 {
3651 state_num s;
3652 for (s = -1; s < d->tralloc; s++)
3653 free (d->mb_trans[s]);
3654 free (d->mb_trans - 2);
3655 }
3656}
3657
3658/* Return true if every construct in D is supported by this DFA matcher. */
3659bool
3660dfasupported (struct dfa const *d)
3661{
3662 for (idx_t i = 0; i < d->tindex; i++)
3663 {
3664 switch (d->tokens[i])
3665 {
3666 case BEGWORD:
3667 case ENDWORD:
3668 case LIMWORD:
3669 case NOTLIMWORD:
3670 if (!d->localeinfo.multibyte)
3671 continue;
3672 FALLTHROUGH;
3673 case BACKREF:
3674 case MBCSET:
3675 return false;
3676 }
3677 }
3678 return true;
3679}
3680
3681/* Disable use of the superset DFA if it is not likely to help
3682 performance. */
3683static void
3684maybe_disable_superset_dfa (struct dfa *d)
3685{
3686 if (!d->localeinfo.using_utf8)
3687 return;
3688
3689 bool have_backref = false;
3690 for (idx_t i = 0; i < d->tindex; i++)
3691 {
3692 switch (d->tokens[i])
3693 {
3694 case ANYCHAR:
3695 /* Lowered. */
3696 abort ();
3697 case BACKREF:
3698 have_backref = true;
3699 break;
3700 case MBCSET:
3701 /* Requires multi-byte algorithm. */
3702 return;
3703 default:
3704 break;
3705 }
3706 }
3707
3708 if (!have_backref && d->superset)
3709 {
3710 /* The superset DFA is not likely to be much faster, so remove it. */
3711 dfafree (d->superset);
3712 free (d->superset);
3713 d->superset = NULL;
3714 }
3715
3716 free_mbdata (d);
3717 d->localeinfo.multibyte = false;
3718 d->dfaexec = dfaexec_sb;
3719 d->fast = true;
3720}
3721
3722static void
3723dfassbuild (struct dfa *d)
3724{
3725 struct dfa *sup = dfaalloc ();
3726
3727 *sup = *d;
3728 sup->localeinfo.multibyte = false;
3729 sup->dfaexec = dfaexec_sb;
3730 sup->multibyte_prop = NULL;
3731 sup->superset = NULL;
3732 sup->states = NULL;
3733 sup->sindex = 0;
3734 sup->constraints = NULL;
3735 sup->separates = NULL;
3736 sup->follows = NULL;
3737 sup->tralloc = 0;
3738 sup->trans = NULL;
3739 sup->fails = NULL;
3740 sup->success = NULL;
3741 sup->newlines = NULL;
3742
3743 sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
3744 if (d->cindex)
3745 {
3746 memcpy (sup->charclasses, d->charclasses,
3747 d->cindex * sizeof *sup->charclasses);
3748 }
3749
3750 sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
3751 sup->talloc = d->tindex * 2;
3752
3753 bool have_achar = false;
3754 bool have_nchar = false;
3755 idx_t j;
3756 for (idx_t i = j = 0; i < d->tindex; i++)
3757 {
3758 switch (d->tokens[i])
3759 {
3760 case ANYCHAR:
3761 case MBCSET:
3762 case BACKREF:
3763 {
3764 charclass ccl;
3765 fillset (&ccl);
3766 sup->tokens[j++] = CSET + charclass_index (sup, &ccl);
3767 sup->tokens[j++] = STAR;
3768 if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
3769 || d->tokens[i + 1] == PLUS)
3770 i++;
3771 have_achar = true;
3772 }
3773 break;
3774 case BEGWORD:
3775 case ENDWORD:
3776 case LIMWORD:
3777 case NOTLIMWORD:
3778 if (d->localeinfo.multibyte)
3779 {
3780 /* These constraints aren't supported in a multibyte locale.
3781 Ignore them in the superset DFA. */
3782 sup->tokens[j++] = EMPTY;
3783 break;
3784 }
3785 FALLTHROUGH;
3786 default:
3787 sup->tokens[j++] = d->tokens[i];
3788 if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
3789 || d->tokens[i] >= CSET)
3790 have_nchar = true;
3791 break;
3792 }
3793 }
3794 sup->tindex = j;
3795
3796 if (have_nchar && (have_achar || d->localeinfo.multibyte))
3797 d->superset = sup;
3798 else
3799 {
3800 dfafree (sup);
3801 free (sup);
3802 }
3803}
3804
3805/* Parse a string S of length LEN into D (but skip this step if S is null).
3806 Then analyze D and build a matcher for it.
3807 SEARCHFLAG says whether to build a searching or an exact matcher. */
3808void
3809dfacomp (char const *s, idx_t len, struct dfa *d, bool searchflag)
3810{
3811 if (s != NULL)
3812 dfaparse (s, len, d);
3813
3814 dfassbuild (d);
3815
3816 if (dfasupported (d))
3817 {
3818 maybe_disable_superset_dfa (d);
3819 dfaanalyze (d, searchflag);
3820 }
3821 else
3822 {
3823 d->dfaexec = dfaexec_noop;
3824 }
3825
3826 if (d->superset)
3827 {
3828 d->fast = true;
3829 dfaanalyze (d->superset, searchflag);
3830 }
3831}
3832
3833/* Free the storage held by the components of a dfa. */
3834void
3835dfafree (struct dfa *d)
3836{
3837 free (d->charclasses);
3838 free (d->tokens);
3839
3840 if (d->localeinfo.multibyte)
3841 free_mbdata (d);
3842
3843 free (d->constraints);
3844 free (d->separates);
3845
3846 for (idx_t i = 0; i < d->sindex; i++)
3847 {
3848 free (d->states[i].elems.elems);
3849 free (d->states[i].mbps.elems);
3850 }
3851 free (d->states);
3852
3853 if (d->follows)
3854 {
3855 for (idx_t i = 0; i < d->tindex; i++)
3856 free (d->follows[i].elems);
3857 free (d->follows);
3858 }
3859
3860 if (d->trans)
3861 {
3862 for (idx_t i = 0; i < d->tralloc; i++)
3863 {
3864 free (d->trans[i]);
3865 free (d->fails[i]);
3866 }
3867
3868 free (d->trans - 2);
3869 free (d->fails);
3870 free (d->newlines);
3871 free (d->success);
3872 }
3873
3874 if (d->superset)
3875 {
3876 dfafree (d->superset);
3877 free (d->superset);
3878 }
3879}
3880
3881/* Having found the postfix representation of the regular expression,
3882 try to find a long sequence of characters that must appear in any line
3883 containing the r.e.
3884 Finding a "longest" sequence is beyond the scope here;
3885 we take an easy way out and hope for the best.
3886 (Take "(ab|a)b"--please.)
3887
3888 We do a bottom-up calculation of sequences of characters that must appear
3889 in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3890 representation:
3891 sequences that must appear at the left of the match ("left")
3892 sequences that must appear at the right of the match ("right")
3893 lists of sequences that must appear somewhere in the match ("in")
3894 sequences that must constitute the match ("is")
3895
3896 When we get to the root of the tree, we use one of the longest of its
3897 calculated "in" sequences as our answer.
3898
3899 The sequences calculated for the various types of node (in pseudo ANSI c)
3900 are shown below. "p" is the operand of unary operators (and the left-hand
3901 operand of binary operators); "q" is the right-hand operand of binary
3902 operators.
3903
3904 "ZERO" means "a zero-length sequence" below.
3905
3906 Type left right is in
3907 ---- ---- ----- -- --
3908 char c # c # c # c # c
3909
3910 ANYCHAR ZERO ZERO ZERO ZERO
3911
3912 MBCSET ZERO ZERO ZERO ZERO
3913
3914 CSET ZERO ZERO ZERO ZERO
3915
3916 STAR ZERO ZERO ZERO ZERO
3917
3918 QMARK ZERO ZERO ZERO ZERO
3919
3920 PLUS p->left p->right ZERO p->in
3921
3922 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
3923 p->left : q->right : q->is!=ZERO) ? q->in plus
3924 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
3925 ZERO
3926
3927 OR longest common longest common (do p->is and substrings common
3928 leading trailing to q->is have same p->in and
3929 (sub)sequence (sub)sequence q->in length and content) ?
3930 of p->left of p->right
3931 and q->left and q->right p->is : NULL
3932
3933 If there's anything else we recognize in the tree, all four sequences get set
3934 to zero-length sequences. If there's something we don't recognize in the
3935 tree, we just return a zero-length sequence.
3936
3937 Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3938 'aaa')?
3939
3940 And ... is it here or someplace that we might ponder "optimizations" such as
3941 egrep 'psi|epsilon' -> egrep 'psi'
3942 egrep 'pepsi|epsilon' -> egrep 'epsi'
3943 (Yes, we now find "epsi" as a "string
3944 that must occur", but we might also
3945 simplify the *entire* r.e. being sought)
3946 grep '[c]' -> grep 'c'
3947 grep '(ab|a)b' -> grep 'ab'
3948 grep 'ab*' -> grep 'a'
3949 grep 'a*b' -> grep 'b'
3950
3951 There are several issues:
3952
3953 Is optimization easy (enough)?
3954
3955 Does optimization actually accomplish anything,
3956 or is the automaton you get from "psi|epsilon" (for example)
3957 the same as the one you get from "psi" (for example)?
3958
3959 Are optimizable r.e.'s likely to be used in real-life situations
3960 (something like 'ab*' is probably unlikely; something like is
3961 'psi|epsilon' is likelier)? */
3962
3963static char *
3964icatalloc (char *old, char const *new)
3965{
3966 idx_t newsize = strlen (new);
3967 if (newsize == 0)
3968 return old;
3969 idx_t oldsize = strlen (old);
3970 char *result = xirealloc (old, oldsize + newsize + 1);
3971 memcpy (result + oldsize, new, newsize + 1);
3972 return result;
3973}
3974
3975static void
3976freelist (char **cpp)
3977{
3978 while (*cpp)
3979 free (*cpp++);
3980}
3981
3982static char **
3983enlistnew (char **cpp, char *new)
3984{
3985 /* Is there already something in the list that's new (or longer)? */
3986 idx_t i;
3987 for (i = 0; cpp[i] != NULL; i++)
3988 if (strstr (cpp[i], new) != NULL)
3989 {
3990 free (new);
3991 return cpp;
3992 }
3993 /* Eliminate any obsoleted strings. */
3994 for (idx_t j = 0; cpp[j] != NULL; )
3995 if (strstr (new, cpp[j]) == NULL)
3996 ++j;
3997 else
3998 {
3999 free (cpp[j]);
4000 if (--i == j)
4001 break;
4002 cpp[j] = cpp[i];
4003 cpp[i] = NULL;
4004 }
4005 /* Add the new string. */
4006 cpp = xreallocarray (cpp, i + 2, sizeof *cpp);
4007 cpp[i] = new;
4008 cpp[i + 1] = NULL;
4009 return cpp;
4010}
4011
4012static char **
4013enlist (char **cpp, char const *str, idx_t len)
4014{
4015 return enlistnew (cpp, ximemdup0 (str, len));
4016}
4017
4018/* Given pointers to two strings, return a pointer to an allocated
4019 list of their distinct common substrings. */
4020static char **
4021comsubs (char *left, char const *right)
4022{
4023 char **cpp = xzalloc (sizeof *cpp);
4024
4025 for (char *lcp = left; *lcp != '\0'; lcp++)
4026 {
4027 idx_t len = 0;
4028 char *rcp = strchr (right, *lcp);
4029 while (rcp != NULL)
4030 {
4031 idx_t i;
4032 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
4033 continue;
4034 if (i > len)
4035 len = i;
4036 rcp = strchr (rcp + 1, *lcp);
4037 }
4038 if (len != 0)
4039 cpp = enlist (cpp, lcp, len);
4040 }
4041 return cpp;
4042}
4043
4044static char **
4045addlists (char **old, char **new)
4046{
4047 for (; *new; new++)
4048 old = enlistnew (old, xstrdup (*new));
4049 return old;
4050}
4051
4052/* Given two lists of substrings, return a new list giving substrings
4053 common to both. */
4054static char **
4055inboth (char **left, char **right)
4056{
4057 char **both = xzalloc (sizeof *both);
4058
4059 for (idx_t lnum = 0; left[lnum] != NULL; lnum++)
4060 {
4061 for (idx_t rnum = 0; right[rnum] != NULL; rnum++)
4062 {
4063 char **temp = comsubs (left[lnum], right[rnum]);
4064 both = addlists (both, temp);
4065 freelist (temp);
4066 free (temp);
4067 }
4068 }
4069 return both;
4070}
4071
4072typedef struct must must;
4073
4074struct must
4075{
4076 char **in;
4077 char *left;
4078 char *right;
4079 char *is;
4080 bool begline;
4081 bool endline;
4082 must *prev;
4083};
4084
4085static must *
4086allocmust (must *mp, idx_t size)
4087{
4088 must *new_mp = xmalloc (sizeof *new_mp);
4089 new_mp->in = xzalloc (sizeof *new_mp->in);
4090 new_mp->left = xizalloc (size);
4091 new_mp->right = xizalloc (size);
4092 new_mp->is = xizalloc (size);
4093 new_mp->begline = false;
4094 new_mp->endline = false;
4095 new_mp->prev = mp;
4096 return new_mp;
4097}
4098
4099static void
4100resetmust (must *mp)
4101{
4102 freelist (mp->in);
4103 mp->in[0] = NULL;
4104 mp->left[0] = mp->right[0] = mp->is[0] = '\0';
4105 mp->begline = false;
4106 mp->endline = false;
4107}
4108
4109static void
4110freemust (must *mp)
4111{
4112 freelist (mp->in);
4113 free (mp->in);
4114 free (mp->left);
4115 free (mp->right);
4116 free (mp->is);
4117 free (mp);
4118}
4119
4120struct dfamust *
4121dfamust (struct dfa const *d)
4122{
4123 must *mp = NULL;
4124 char const *result = "";
4125 bool exact = false;
4126 bool begline = false;
4127 bool endline = false;
4128 bool need_begline = false;
4129 bool need_endline = false;
4130 bool case_fold_unibyte = d->syntax.case_fold & !d->localeinfo.multibyte;
4131
4132 for (idx_t ri = 1; ri + 1 < d->tindex; ri++)
4133 {
4134 token t = d->tokens[ri];
4135 switch (t)
4136 {
4137 case BEGLINE:
4138 mp = allocmust (mp, 2);
4139 mp->begline = true;
4140 need_begline = true;
4141 break;
4142 case ENDLINE:
4143 mp = allocmust (mp, 2);
4144 mp->endline = true;
4145 need_endline = true;
4146 break;
4147 case LPAREN:
4148 case RPAREN:
4149 assert (!"neither LPAREN nor RPAREN may appear here");
4150
4151 case EMPTY:
4152 case BEGWORD:
4153 case ENDWORD:
4154 case LIMWORD:
4155 case NOTLIMWORD:
4156 case BACKREF:
4157 case ANYCHAR:
4158 case MBCSET:
4159 mp = allocmust (mp, 2);
4160 break;
4161
4162 case STAR:
4163 case QMARK:
4164 assume_nonnull (mp);
4165 resetmust (mp);
4166 break;
4167
4168 case OR:
4169 {
4170 char **new;
4171 must *rmp = mp;
4172 assume_nonnull (rmp);
4173 must *lmp = mp = mp->prev;
4174 assume_nonnull (lmp);
4175 idx_t j, ln, rn, n;
4176
4177 /* Guaranteed to be. Unlikely, but ... */
4178 if (str_eq (lmp->is, rmp->is))
4179 {
4180 lmp->begline &= rmp->begline;
4181 lmp->endline &= rmp->endline;
4182 }
4183 else
4184 {
4185 lmp->is[0] = '\0';
4186 lmp->begline = false;
4187 lmp->endline = false;
4188 }
4189 /* Left side--easy */
4190 idx_t i = 0;
4191 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
4192 ++i;
4193 lmp->left[i] = '\0';
4194 /* Right side */
4195 ln = strlen (lmp->right);
4196 rn = strlen (rmp->right);
4197 n = ln;
4198 if (n > rn)
4199 n = rn;
4200 for (i = 0; i < n; ++i)
4201 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
4202 break;
4203 for (j = 0; j < i; ++j)
4204 lmp->right[j] = lmp->right[(ln - i) + j];
4205 lmp->right[j] = '\0';
4206 new = inboth (lmp->in, rmp->in);
4207 freelist (lmp->in);
4208 free (lmp->in);
4209 lmp->in = new;
4210 freemust (rmp);
4211 }
4212 break;
4213
4214 case PLUS:
4215 assume_nonnull (mp);
4216 mp->is[0] = '\0';
4217 break;
4218
4219 case END:
4220 assume_nonnull (mp);
4221 assert (!mp->prev);
4222 for (idx_t i = 0; mp->in[i] != NULL; i++)
4223 if (strlen (mp->in[i]) > strlen (result))
4224 result = mp->in[i];
4225 if (str_eq (result, mp->is))
4226 {
4227 if ((!need_begline || mp->begline) && (!need_endline
4228 || mp->endline))
4229 exact = true;
4230 begline = mp->begline;
4231 endline = mp->endline;
4232 }
4233 goto done;
4234
4235 case CAT:
4236 {
4237 must *rmp = mp;
4238 assume_nonnull (rmp);
4239 must *lmp = mp = mp->prev;
4240 assume_nonnull (lmp);
4241
4242 /* In. Everything in left, plus everything in
4243 right, plus concatenation of
4244 left's right and right's left. */
4245 lmp->in = addlists (lmp->in, rmp->in);
4246 if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
4247 {
4248 idx_t lrlen = strlen (lmp->right);
4249 idx_t rllen = strlen (rmp->left);
4250 char *tp = ximalloc (lrlen + rllen + 1);
4251 memcpy (tp + lrlen, rmp->left, rllen + 1);
4252 memcpy (tp, lmp->right, lrlen);
4253 lmp->in = enlistnew (lmp->in, tp);
4254 }
4255 /* Left-hand */
4256 if (lmp->is[0] != '\0')
4257 lmp->left = icatalloc (lmp->left, rmp->left);
4258 /* Right-hand */
4259 if (rmp->is[0] == '\0')
4260 lmp->right[0] = '\0';
4261 lmp->right = icatalloc (lmp->right, rmp->right);
4262 /* Guaranteed to be */
4263 if ((lmp->is[0] != '\0' || lmp->begline)
4264 && (rmp->is[0] != '\0' || rmp->endline))
4265 {
4266 lmp->is = icatalloc (lmp->is, rmp->is);
4267 lmp->endline = rmp->endline;
4268 }
4269 else
4270 {
4271 lmp->is[0] = '\0';
4272 lmp->begline = false;
4273 lmp->endline = false;
4274 }
4275 freemust (rmp);
4276 }
4277 break;
4278
4279 case '\0':
4280 /* Not on *my* shift. */
4281 goto done;
4282
4283 default:
4284 if (CSET <= t)
4285 {
4286 /* If T is a singleton, or if case-folding in a unibyte
4287 locale and T's members all case-fold to the same char,
4288 convert T to one of its members. Otherwise, do
4289 nothing further with T. */
4290 charclass *ccl = &d->charclasses[t - CSET];
4291 int j;
4292 for (j = 0; j < NOTCHAR; j++)
4293 if (tstbit (j, ccl))
4294 break;
4295 if (! (j < NOTCHAR))
4296 {
4297 mp = allocmust (mp, 2);
4298 break;
4299 }
4300 t = j;
4301 while (++j < NOTCHAR)
4302 if (tstbit (j, ccl)
4303 && ! (case_fold_unibyte
4304 && toupper (j) == toupper (t)))
4305 break;
4306 if (j < NOTCHAR)
4307 {
4308 mp = allocmust (mp, 2);
4309 break;
4310 }
4311 }
4312
4313 idx_t rj = ri + 2;
4314 if (d->tokens[ri + 1] == CAT)
4315 {
4316 for (; rj < d->tindex - 1; rj += 2)
4317 {
4318 if ((rj != ri && (d->tokens[rj] <= 0
4319 || NOTCHAR <= d->tokens[rj]))
4320 || d->tokens[rj + 1] != CAT)
4321 break;
4322 }
4323 }
4324 mp = allocmust (mp, ((rj - ri) >> 1) + 1);
4325 mp->is[0] = mp->left[0] = mp->right[0]
4326 = case_fold_unibyte ? toupper (t) : t;
4327
4328 idx_t i;
4329 for (i = 1; ri + 2 < rj; i++)
4330 {
4331 ri += 2;
4332 t = d->tokens[ri];
4333 mp->is[i] = mp->left[i] = mp->right[i]
4334 = case_fold_unibyte ? toupper (t) : t;
4335 }
4336 mp->is[i] = mp->left[i] = mp->right[i] = '\0';
4337 mp->in = enlist (mp->in, mp->is, i);
4338 break;
4339 }
4340 }
4341 done:;
4342
4343 struct dfamust *dm = NULL;
4344 if (*result)
4345 {
4346 dm = xmalloc (FLEXSIZEOF (struct dfamust, must, strlen (result) + 1));
4347 dm->exact = exact;
4348 dm->begline = begline;
4349 dm->endline = endline;
4350 strcpy (dm->must, result);
4351 }
4352
4353 while (mp)
4354 {
4355 must *prev = mp->prev;
4356 freemust (mp);
4357 mp = prev;
4358 }
4359
4360 return dm;
4361}
4362
4363void
4364dfamustfree (struct dfamust *dm)
4365{
4366 free (dm);
4367}
4368
4369struct dfa *
4370dfaalloc (void)
4371{
4372 return xmalloc (sizeof (struct dfa));
4373}
4374
4375/* Initialize DFA. */
4376void
4377dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
4378 reg_syntax_t bits, int dfaopts)
4379{
4380 memset (dfa, 0, offsetof (struct dfa, dfaexec));
4381 dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
4382 dfa->localeinfo = *linfo;
4383
4384 dfa->fast = !dfa->localeinfo.multibyte;
4385
4386 dfa->canychar = -1;
4387 dfa->syntax.syntax_bits_set = true;
4388 dfa->syntax.case_fold = (bits & RE_ICASE) != 0;
4389 dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
4390 dfa->syntax.syntax_bits = bits;
4391 dfa->syntax.dfaopts = dfaopts;
4392
4393 for (int i = CHAR_MIN; i <= CHAR_MAX; ++i)
4394 {
4395 unsigned char uc = i;
4396
4397 dfa->syntax.sbit[uc] = char_context (dfa, uc);
4398 switch (dfa->syntax.sbit[uc])
4399 {
4400 case CTX_LETTER:
4401 setbit (uc, &dfa->syntax.letters);
4402 break;
4403 case CTX_NEWLINE:
4404 setbit (uc, &dfa->syntax.newline);
4405 break;
4406 }
4407
4408 /* POSIX requires that the five bytes in "\n\r./" (including the
4409 terminating NUL) cannot occur inside a multibyte character. */
4410 dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
4411 ? (uc & 0xc0) != 0x80
4412 : strchr ("\n\r./", uc) != NULL);
4413 }
4414}
4415
4416/* Initialize TO by copying FROM's syntax settings. */
4417void
4418dfacopysyntax (struct dfa *to, struct dfa const *from)
4419{
4420 memset (to, 0, offsetof (struct dfa, syntax));
4421 to->canychar = -1;
4422 to->fast = from->fast;
4423 to->syntax = from->syntax;
4424 to->dfaexec = from->dfaexec;
4425 to->localeinfo = from->localeinfo;
4426}
4427
4428/* vim:set shiftwidth=2: */
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette