VirtualBox

source: kBuild/vendor/grep/2.12/src/dfa.c@ 3576

Last change on this file since 3576 was 2595, checked in by bird, 13 years ago

gnu grep version 2.12 (grep-2.12.tar.xz, md5sum=8d2f0346d08b13c18afb81f0e8aa1e2f)

  • Property svn:eol-style set to native
File size: 124.9 KB
Line 
1/* dfa.c - deterministic extended regexp routines for GNU
2 Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2012 Free Software
3 Foundation, Inc.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc.,
18 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */
19
20/* Written June, 1988 by Mike Haertel
21 Modified July, 1988 by Arthur David Olson to assist BMG speedups */
22
23#include <config.h>
24#include <assert.h>
25#include <ctype.h>
26#include <stdio.h>
27#include <sys/types.h>
28#include <stdlib.h>
29#include <limits.h>
30#include <string.h>
31#include <locale.h>
32
33#define STREQ(a, b) (strcmp (a, b) == 0)
34
35/* ISASCIIDIGIT differs from isdigit, as follows:
36 - Its arg may be any int or unsigned int; it need not be an unsigned char.
37 - It's guaranteed to evaluate its argument exactly once.
38 - It's typically faster.
39 Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
40 only '0' through '9' are digits. Prefer ISASCIIDIGIT to isdigit unless
41 it's important to use the locale's definition of `digit' even when the
42 host does not conform to Posix. */
43#define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
44
45/* gettext.h ensures that we don't use gettext if ENABLE_NLS is not defined */
46#include "gettext.h"
47#define _(str) gettext (str)
48
49#include "mbsupport.h" /* defines MBS_SUPPORT if appropriate */
50#include <wchar.h>
51#include <wctype.h>
52
53#if HAVE_LANGINFO_CODESET
54# include <langinfo.h>
55#endif
56
57#include "regex.h"
58#include "dfa.h"
59#include "hard-locale.h"
60#include "xalloc.h"
61
62/* HPUX, define those as macros in sys/param.h */
63#ifdef setbit
64# undef setbit
65#endif
66#ifdef clrbit
67# undef clrbit
68#endif
69
70/* Number of bits in an unsigned char. */
71#ifndef CHARBITS
72# define CHARBITS 8
73#endif
74
75/* First integer value that is greater than any character code. */
76#define NOTCHAR (1 << CHARBITS)
77
78/* INTBITS need not be exact, just a lower bound. */
79#ifndef INTBITS
80# define INTBITS (CHARBITS * sizeof (int))
81#endif
82
83/* Number of ints required to hold a bit for every character. */
84#define CHARCLASS_INTS ((NOTCHAR + INTBITS - 1) / INTBITS)
85
86/* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
87typedef int charclass[CHARCLASS_INTS];
88
89/* Convert a possibly-signed character to an unsigned character. This is
90 a bit safer than casting to unsigned char, since it catches some type
91 errors that the cast doesn't. */
92static inline unsigned char
93to_uchar (char ch)
94{
95 return ch;
96}
97
98/* Contexts tell us whether a character is a newline or a word constituent.
99 Word-constituent characters are those that satisfy iswalnum(), plus '_'.
100 Each character has a single CTX_* value; bitmasks of CTX_* values denote
101 a particular character class.
102
103 A state also stores a context value, which is a bitmask of CTX_* values.
104 A state's context represents a set of characters that the state's
105 predecessors must match. For example, a state whose context does not
106 include CTX_LETTER will never have transitions where the previous
107 character is a word constituent. A state whose context is CTX_ANY
108 might have transitions from any character. */
109
110#define CTX_NONE 1
111#define CTX_LETTER 2
112#define CTX_NEWLINE 4
113#define CTX_ANY 7
114
115/* Sometimes characters can only be matched depending on the surrounding
116 context. Such context decisions depend on what the previous character
117 was, and the value of the current (lookahead) character. Context
118 dependent constraints are encoded as 8 bit integers. Each bit that
119 is set indicates that the constraint succeeds in the corresponding
120 context.
121
122 bit 8-11 - valid contexts when next character is CTX_NEWLINE
123 bit 4-7 - valid contexts when next character is CTX_LETTER
124 bit 0-3 - valid contexts when next character is CTX_NONE
125
126 The macro SUCCEEDS_IN_CONTEXT determines whether a given constraint
127 succeeds in a particular context. Prev is a bitmask of possible
128 context values for the previous character, curr is the (single-bit)
129 context value for the lookahead character. */
130#define NEWLINE_CONSTRAINT(constraint) (((constraint) >> 8) & 0xf)
131#define LETTER_CONSTRAINT(constraint) (((constraint) >> 4) & 0xf)
132#define OTHER_CONSTRAINT(constraint) ((constraint) & 0xf)
133
134#define SUCCEEDS_IN_CONTEXT(constraint, prev, curr) \
135 ((((curr) & CTX_NONE ? OTHER_CONSTRAINT(constraint) : 0) \
136 | ((curr) & CTX_LETTER ? LETTER_CONSTRAINT(constraint) : 0) \
137 | ((curr) & CTX_NEWLINE ? NEWLINE_CONSTRAINT(constraint) : 0)) & (prev))
138
139/* The following macros give information about what a constraint depends on. */
140#define PREV_NEWLINE_CONSTRAINT(constraint) (((constraint) >> 2) & 0x111)
141#define PREV_LETTER_CONSTRAINT(constraint) (((constraint) >> 1) & 0x111)
142#define PREV_OTHER_CONSTRAINT(constraint) ((constraint) & 0x111)
143
144#define PREV_NEWLINE_DEPENDENT(constraint) \
145 (PREV_NEWLINE_CONSTRAINT (constraint) != PREV_OTHER_CONSTRAINT (constraint))
146#define PREV_LETTER_DEPENDENT(constraint) \
147 (PREV_LETTER_CONSTRAINT (constraint) != PREV_OTHER_CONSTRAINT (constraint))
148
149/* Tokens that match the empty string subject to some constraint actually
150 work by applying that constraint to determine what may follow them,
151 taking into account what has gone before. The following values are
152 the constraints corresponding to the special tokens previously defined. */
153#define NO_CONSTRAINT 0x777
154#define BEGLINE_CONSTRAINT 0x444
155#define ENDLINE_CONSTRAINT 0x700
156#define BEGWORD_CONSTRAINT 0x050
157#define ENDWORD_CONSTRAINT 0x202
158#define LIMWORD_CONSTRAINT 0x252
159#define NOTLIMWORD_CONSTRAINT 0x525
160
161/* The regexp is parsed into an array of tokens in postfix form. Some tokens
162 are operators and others are terminal symbols. Most (but not all) of these
163 codes are returned by the lexical analyzer. */
164
165typedef ptrdiff_t token;
166
167/* Predefined token values. */
168enum
169{
170 END = -1, /* END is a terminal symbol that matches the
171 end of input; any value of END or less in
172 the parse tree is such a symbol. Accepting
173 states of the DFA are those that would have
174 a transition on END. */
175
176 /* Ordinary character values are terminal symbols that match themselves. */
177
178 EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches
179 the empty string. */
180
181 BACKREF, /* BACKREF is generated by \<digit>; it
182 is not completely handled. If the scanner
183 detects a transition on backref, it returns
184 a kind of "semi-success" indicating that
185 the match will have to be verified with
186 a backtracking matcher. */
187
188 BEGLINE, /* BEGLINE is a terminal symbol that matches
189 the empty string if it is at the beginning
190 of a line. */
191
192 ENDLINE, /* ENDLINE is a terminal symbol that matches
193 the empty string if it is at the end of
194 a line. */
195
196 BEGWORD, /* BEGWORD is a terminal symbol that matches
197 the empty string if it is at the beginning
198 of a word. */
199
200 ENDWORD, /* ENDWORD is a terminal symbol that matches
201 the empty string if it is at the end of
202 a word. */
203
204 LIMWORD, /* LIMWORD is a terminal symbol that matches
205 the empty string if it is at the beginning
206 or the end of a word. */
207
208 NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that
209 matches the empty string if it is not at
210 the beginning or end of a word. */
211
212 QMARK, /* QMARK is an operator of one argument that
213 matches zero or one occurrences of its
214 argument. */
215
216 STAR, /* STAR is an operator of one argument that
217 matches the Kleene closure (zero or more
218 occurrences) of its argument. */
219
220 PLUS, /* PLUS is an operator of one argument that
221 matches the positive closure (one or more
222 occurrences) of its argument. */
223
224 REPMN, /* REPMN is a lexical token corresponding
225 to the {m,n} construct. REPMN never
226 appears in the compiled token vector. */
227
228 CAT, /* CAT is an operator of two arguments that
229 matches the concatenation of its
230 arguments. CAT is never returned by the
231 lexical analyzer. */
232
233 OR, /* OR is an operator of two arguments that
234 matches either of its arguments. */
235
236 LPAREN, /* LPAREN never appears in the parse tree,
237 it is only a lexeme. */
238
239 RPAREN, /* RPAREN never appears in the parse tree. */
240
241 ANYCHAR, /* ANYCHAR is a terminal symbol that matches
242 any multibyte (or single byte) characters.
243 It is used only if MB_CUR_MAX > 1. */
244
245 MBCSET, /* MBCSET is similar to CSET, but for
246 multibyte characters. */
247
248 WCHAR, /* Only returned by lex. wctok contains
249 the wide character representation. */
250
251 CSET /* CSET and (and any value greater) is a
252 terminal symbol that matches any of a
253 class of characters. */
254};
255
256
257/* States of the recognizer correspond to sets of positions in the parse
258 tree, together with the constraints under which they may be matched.
259 So a position is encoded as an index into the parse tree together with
260 a constraint. */
261typedef struct
262{
263 size_t index; /* Index into the parse array. */
264 unsigned int constraint; /* Constraint for matching this position. */
265} position;
266
267/* Sets of positions are stored as arrays. */
268typedef struct
269{
270 position *elems; /* Elements of this position set. */
271 size_t nelem; /* Number of elements in this set. */
272 size_t alloc; /* Number of elements allocated in ELEMS. */
273} position_set;
274
275/* Sets of leaves are also stored as arrays. */
276typedef struct
277{
278 size_t *elems; /* Elements of this position set. */
279 size_t nelem; /* Number of elements in this set. */
280} leaf_set;
281
282/* A state of the dfa consists of a set of positions, some flags,
283 and the token value of the lowest-numbered position of the state that
284 contains an END token. */
285typedef struct
286{
287 size_t hash; /* Hash of the positions of this state. */
288 position_set elems; /* Positions this state could match. */
289 unsigned char context; /* Context from previous state. */
290 char backref; /* True if this state matches a \<digit>. */
291 unsigned short constraint; /* Constraint for this state to accept. */
292 token first_end; /* Token value of the first END in elems. */
293 position_set mbps; /* Positions which can match multibyte
294 characters. e.g. period.
295 These staff are used only if
296 MB_CUR_MAX > 1. */
297} dfa_state;
298
299/* States are indexed by state_num values. These are normally
300 nonnegative but -1 is used as a special value. */
301typedef ptrdiff_t state_num;
302
303/* A bracket operator.
304 e.g. [a-c], [[:alpha:]], etc. */
305struct mb_char_classes
306{
307 ptrdiff_t cset;
308 int invert;
309 wchar_t *chars; /* Normal characters. */
310 size_t nchars;
311 wctype_t *ch_classes; /* Character classes. */
312 size_t nch_classes;
313 wchar_t *range_sts; /* Range characters (start of the range). */
314 wchar_t *range_ends; /* Range characters (end of the range). */
315 size_t nranges;
316 char **equivs; /* Equivalence classes. */
317 size_t nequivs;
318 char **coll_elems;
319 size_t ncoll_elems; /* Collating elements. */
320};
321
322/* A compiled regular expression. */
323struct dfa
324{
325 /* Fields filled by the scanner. */
326 charclass *charclasses; /* Array of character sets for CSET tokens. */
327 size_t cindex; /* Index for adding new charclasses. */
328 size_t calloc; /* Number of charclasses currently allocated. */
329
330 /* Fields filled by the parser. */
331 token *tokens; /* Postfix parse array. */
332 size_t tindex; /* Index for adding new tokens. */
333 size_t talloc; /* Number of tokens currently allocated. */
334 size_t depth; /* Depth required of an evaluation stack
335 used for depth-first traversal of the
336 parse tree. */
337 size_t nleaves; /* Number of leaves on the parse tree. */
338 size_t nregexps; /* Count of parallel regexps being built
339 with dfaparse(). */
340 unsigned int mb_cur_max; /* Cached value of MB_CUR_MAX. */
341 token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */
342
343 /* The following are used only if MB_CUR_MAX > 1. */
344
345 /* The value of multibyte_prop[i] is defined by following rule.
346 if tokens[i] < NOTCHAR
347 bit 0 : tokens[i] is the first byte of a character, including
348 single-byte characters.
349 bit 1 : tokens[i] is the last byte of a character, including
350 single-byte characters.
351
352 if tokens[i] = MBCSET
353 ("the index of mbcsets corresponding to this operator" << 2) + 3
354
355 e.g.
356 tokens
357 = 'single_byte_a', 'multi_byte_A', single_byte_b'
358 = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
359 multibyte_prop
360 = 3 , 1 , 0 , 2 , 3
361 */
362 size_t nmultibyte_prop;
363 int *multibyte_prop;
364
365 /* Array of the bracket expression in the DFA. */
366 struct mb_char_classes *mbcsets;
367 size_t nmbcsets;
368 size_t mbcsets_alloc;
369
370 /* Fields filled by the state builder. */
371 dfa_state *states; /* States of the dfa. */
372 state_num sindex; /* Index for adding new states. */
373 state_num salloc; /* Number of states currently allocated. */
374
375 /* Fields filled by the parse tree->NFA conversion. */
376 position_set *follows; /* Array of follow sets, indexed by position
377 index. The follow of a position is the set
378 of positions containing characters that
379 could conceivably follow a character
380 matching the given position in a string
381 matching the regexp. Allocated to the
382 maximum possible position index. */
383 int searchflag; /* True if we are supposed to build a searching
384 as opposed to an exact matcher. A searching
385 matcher finds the first and shortest string
386 matching a regexp anywhere in the buffer,
387 whereas an exact matcher finds the longest
388 string matching, but anchored to the
389 beginning of the buffer. */
390
391 /* Fields filled by dfaexec. */
392 state_num tralloc; /* Number of transition tables that have
393 slots so far. */
394 int trcount; /* Number of transition tables that have
395 actually been built. */
396 state_num **trans; /* Transition tables for states that can
397 never accept. If the transitions for a
398 state have not yet been computed, or the
399 state could possibly accept, its entry in
400 this table is NULL. */
401 state_num **realtrans; /* Trans always points to realtrans + 1; this
402 is so trans[-1] can contain NULL. */
403 state_num **fails; /* Transition tables after failing to accept
404 on a state that potentially could do so. */
405 int *success; /* Table of acceptance conditions used in
406 dfaexec and computed in build_state. */
407 state_num *newlines; /* Transitions on newlines. The entry for a
408 newline in any transition table is always
409 -1 so we can count lines without wasting
410 too many cycles. The transition for a
411 newline is stored separately and handled
412 as a special case. Newline is also used
413 as a sentinel at the end of the buffer. */
414 struct dfamust *musts; /* List of strings, at least one of which
415 is known to appear in any r.e. matching
416 the dfa. */
417};
418
419/* Some macros for user access to dfa internals. */
420
421/* ACCEPTING returns true if s could possibly be an accepting state of r. */
422#define ACCEPTING(s, r) ((r).states[s].constraint)
423
424/* ACCEPTS_IN_CONTEXT returns true if the given state accepts in the
425 specified context. */
426#define ACCEPTS_IN_CONTEXT(prev, curr, state, dfa) \
427 SUCCEEDS_IN_CONTEXT ((dfa).states[state].constraint, prev, curr)
428
429static void dfamust (struct dfa *dfa);
430static void regexp (void);
431
432/* These two macros are identical to the ones in gnulib's xalloc.h,
433 except that they not to case the result to "(t *)", and thus may
434 be used via type-free CALLOC and MALLOC macros. */
435#undef XNMALLOC
436#undef XCALLOC
437
438/* Allocate memory for N elements of type T, with error checking. */
439/* extern t *XNMALLOC (size_t n, typename t); */
440# define XNMALLOC(n, t) \
441 (sizeof (t) == 1 ? xmalloc (n) : xnmalloc (n, sizeof (t)))
442
443/* Allocate memory for N elements of type T, with error checking,
444 and zero it. */
445/* extern t *XCALLOC (size_t n, typename t); */
446# define XCALLOC(n, t) \
447 (sizeof (t) == 1 ? xzalloc (n) : xcalloc (n, sizeof (t)))
448
449#define CALLOC(p, n) do { (p) = XCALLOC (n, *(p)); } while (0)
450#define MALLOC(p, n) do { (p) = XNMALLOC (n, *(p)); } while (0)
451#define REALLOC(p, n) do {(p) = xnrealloc (p, n, sizeof (*(p))); } while (0)
452
453/* Reallocate an array of type *P if N_ALLOC is <= N_REQUIRED. */
454#define REALLOC_IF_NECESSARY(p, n_alloc, n_required) \
455 do \
456 { \
457 if ((n_alloc) <= (n_required)) \
458 { \
459 size_t new_n_alloc = (n_required) + !(p); \
460 (p) = x2nrealloc (p, &new_n_alloc, sizeof (*(p))); \
461 (n_alloc) = new_n_alloc; \
462 } \
463 } \
464 while (false)
465
466
467#ifdef DEBUG
468
469static void
470prtok (token t)
471{
472 char const *s;
473
474 if (t < 0)
475 fprintf (stderr, "END");
476 else if (t < NOTCHAR)
477 {
478 int ch = t;
479 fprintf (stderr, "%c", ch);
480 }
481 else
482 {
483 switch (t)
484 {
485 case EMPTY:
486 s = "EMPTY";
487 break;
488 case BACKREF:
489 s = "BACKREF";
490 break;
491 case BEGLINE:
492 s = "BEGLINE";
493 break;
494 case ENDLINE:
495 s = "ENDLINE";
496 break;
497 case BEGWORD:
498 s = "BEGWORD";
499 break;
500 case ENDWORD:
501 s = "ENDWORD";
502 break;
503 case LIMWORD:
504 s = "LIMWORD";
505 break;
506 case NOTLIMWORD:
507 s = "NOTLIMWORD";
508 break;
509 case QMARK:
510 s = "QMARK";
511 break;
512 case STAR:
513 s = "STAR";
514 break;
515 case PLUS:
516 s = "PLUS";
517 break;
518 case CAT:
519 s = "CAT";
520 break;
521 case OR:
522 s = "OR";
523 break;
524 case LPAREN:
525 s = "LPAREN";
526 break;
527 case RPAREN:
528 s = "RPAREN";
529 break;
530 case ANYCHAR:
531 s = "ANYCHAR";
532 break;
533 case MBCSET:
534 s = "MBCSET";
535 break;
536 default:
537 s = "CSET";
538 break;
539 }
540 fprintf (stderr, "%s", s);
541 }
542}
543#endif /* DEBUG */
544
545/* Stuff pertaining to charclasses. */
546
547static int
548tstbit (unsigned int b, charclass const c)
549{
550 return c[b / INTBITS] & 1 << b % INTBITS;
551}
552
553static void
554setbit (unsigned int b, charclass c)
555{
556 c[b / INTBITS] |= 1 << b % INTBITS;
557}
558
559static void
560clrbit (unsigned int b, charclass c)
561{
562 c[b / INTBITS] &= ~(1 << b % INTBITS);
563}
564
565static void
566copyset (charclass const src, charclass dst)
567{
568 memcpy (dst, src, sizeof (charclass));
569}
570
571static void
572zeroset (charclass s)
573{
574 memset (s, 0, sizeof (charclass));
575}
576
577static void
578notset (charclass s)
579{
580 int i;
581
582 for (i = 0; i < CHARCLASS_INTS; ++i)
583 s[i] = ~s[i];
584}
585
586static int
587equal (charclass const s1, charclass const s2)
588{
589 return memcmp (s1, s2, sizeof (charclass)) == 0;
590}
591
592/* A pointer to the current dfa is kept here during parsing. */
593static struct dfa *dfa;
594
595/* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
596static size_t
597charclass_index (charclass const s)
598{
599 size_t i;
600
601 for (i = 0; i < dfa->cindex; ++i)
602 if (equal (s, dfa->charclasses[i]))
603 return i;
604 REALLOC_IF_NECESSARY (dfa->charclasses, dfa->calloc, dfa->cindex + 1);
605 ++dfa->cindex;
606 copyset (s, dfa->charclasses[i]);
607 return i;
608}
609
610/* Syntax bits controlling the behavior of the lexical analyzer. */
611static reg_syntax_t syntax_bits, syntax_bits_set;
612
613/* Flag for case-folding letters into sets. */
614static int case_fold;
615
616/* End-of-line byte in data. */
617static unsigned char eolbyte;
618
619/* Cache of char-context values. */
620static int sbit[NOTCHAR];
621
622/* Set of characters considered letters. */
623static charclass letters;
624
625/* Set of characters that are newline. */
626static charclass newline;
627
628/* Add this to the test for whether a byte is word-constituent, since on
629 BSD-based systems, many values in the 128..255 range are classified as
630 alphabetic, while on glibc-based systems, they are not. */
631#ifdef __GLIBC__
632# define is_valid_unibyte_character(c) 1
633#else
634# define is_valid_unibyte_character(c) (! (MBS_SUPPORT && btowc (c) == WEOF))
635#endif
636
637/* Return non-zero if C is a 'word-constituent' byte; zero otherwise. */
638#define IS_WORD_CONSTITUENT(C) \
639 (is_valid_unibyte_character (C) && (isalnum (C) || (C) == '_'))
640
641static int
642char_context (unsigned char c)
643{
644 if (c == eolbyte || c == 0)
645 return CTX_NEWLINE;
646 if (IS_WORD_CONSTITUENT (c))
647 return CTX_LETTER;
648 return CTX_NONE;
649}
650
651static int
652wchar_context (wint_t wc)
653{
654 if (wc == (wchar_t) eolbyte || wc == 0)
655 return CTX_NEWLINE;
656 if (wc == L'_' || iswalnum (wc))
657 return CTX_LETTER;
658 return CTX_NONE;
659}
660
661/* Entry point to set syntax options. */
662void
663dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
664{
665 unsigned int i;
666
667 syntax_bits_set = 1;
668 syntax_bits = bits;
669 case_fold = fold;
670 eolbyte = eol;
671
672 for (i = 0; i < NOTCHAR; ++i)
673 {
674 sbit[i] = char_context (i);
675 switch (sbit[i])
676 {
677 case CTX_LETTER:
678 setbit (i, letters);
679 break;
680 case CTX_NEWLINE:
681 setbit (i, newline);
682 break;
683 }
684 }
685}
686
687/* Set a bit in the charclass for the given wchar_t. Do nothing if WC
688 is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1,
689 this may happen when folding case in weird Turkish locales where
690 dotless i/dotted I are not included in the chosen character set.
691 Return whether a bit was set in the charclass. */
692#if MBS_SUPPORT
693static bool
694setbit_wc (wint_t wc, charclass c)
695{
696 int b = wctob (wc);
697 if (b == EOF)
698 return false;
699
700 setbit (b, c);
701 return true;
702}
703
704/* Set a bit in the charclass for the given single byte character,
705 if it is valid in the current character set. */
706static void
707setbit_c (int b, charclass c)
708{
709 /* Do nothing if b is invalid in this character set. */
710 if (MB_CUR_MAX > 1 && btowc (b) == WEOF)
711 return;
712 setbit (b, c);
713}
714#else
715# define setbit_c setbit
716static inline bool
717setbit_wc (wint_t wc, charclass c)
718{
719 abort ();
720 /*NOTREACHED*/ return false;
721}
722#endif
723
724/* Like setbit_c, but if case is folded, set both cases of a letter. For
725 MB_CUR_MAX > 1, the resulting charset is only used as an optimization,
726 and the caller takes care of setting the appropriate field of struct
727 mb_char_classes. */
728static void
729setbit_case_fold_c (int b, charclass c)
730{
731 if (MB_CUR_MAX > 1)
732 {
733 wint_t wc = btowc (b);
734 if (wc == WEOF)
735 return;
736 setbit (b, c);
737 if (case_fold && iswalpha (wc))
738 setbit_wc (iswupper (wc) ? towlower (wc) : towupper (wc), c);
739 }
740 else
741 {
742 setbit (b, c);
743 if (case_fold && isalpha (b))
744 setbit_c (isupper (b) ? tolower (b) : toupper (b), c);
745 }
746}
747
748
749
750/* UTF-8 encoding allows some optimizations that we can't otherwise
751 assume in a multibyte encoding. */
752static inline int
753using_utf8 (void)
754{
755 static int utf8 = -1;
756 if (utf8 == -1)
757 {
758#if defined HAVE_LANGINFO_CODESET && MBS_SUPPORT
759 utf8 = (STREQ (nl_langinfo (CODESET), "UTF-8"));
760#else
761 utf8 = 0;
762#endif
763 }
764
765 return utf8;
766}
767
768/* Lexical analyzer. All the dross that deals with the obnoxious
769 GNU Regex syntax bits is located here. The poor, suffering
770 reader is referred to the GNU Regex documentation for the
771 meaning of the @#%!@#%^!@ syntax bits. */
772
773static char const *lexptr; /* Pointer to next input character. */
774static size_t lexleft; /* Number of characters remaining. */
775static token lasttok; /* Previous token returned; initially END. */
776static int laststart; /* True if we're separated from beginning or (, |
777 only by zero-width characters. */
778static size_t parens; /* Count of outstanding left parens. */
779static int minrep, maxrep; /* Repeat counts for {m,n}. */
780static int hard_LC_COLLATE; /* Nonzero if LC_COLLATE is hard. */
781
782static int cur_mb_len = 1; /* Length of the multibyte representation of
783 wctok. */
784/* These variables are used only if (MB_CUR_MAX > 1). */
785static mbstate_t mbs; /* Mbstate for mbrlen(). */
786static wchar_t wctok; /* Wide character representation of the current
787 multibyte character. */
788static unsigned char *mblen_buf; /* Correspond to the input buffer in dfaexec().
789 Each element store the amount of remain
790 byte of corresponding multibyte character
791 in the input string. A element's value
792 is 0 if corresponding character is a
793 single byte character.
794 e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
795 mblen_buf : 0, 3, 2, 1
796 */
797static wchar_t *inputwcs; /* Wide character representation of input
798 string in dfaexec().
799 The length of this array is same as
800 the length of input string(char array).
801 inputstring[i] is a single-byte char,
802 or 1st byte of a multibyte char.
803 And inputwcs[i] is the codepoint. */
804static unsigned char const *buf_begin; /* reference to begin in dfaexec(). */
805static unsigned char const *buf_end; /* reference to end in dfaexec(). */
806
807
808#if MBS_SUPPORT
809/* Note that characters become unsigned here. */
810# define FETCH_WC(c, wc, eoferr) \
811 do { \
812 if (! lexleft) \
813 { \
814 if ((eoferr) != 0) \
815 dfaerror (eoferr); \
816 else \
817 return lasttok = END; \
818 } \
819 else \
820 { \
821 wchar_t _wc; \
822 cur_mb_len = mbrtowc (&_wc, lexptr, lexleft, &mbs); \
823 if (cur_mb_len <= 0) \
824 { \
825 cur_mb_len = 1; \
826 --lexleft; \
827 (wc) = (c) = to_uchar (*lexptr++); \
828 } \
829 else \
830 { \
831 lexptr += cur_mb_len; \
832 lexleft -= cur_mb_len; \
833 (wc) = _wc; \
834 (c) = wctob (wc); \
835 } \
836 } \
837 } while(0)
838
839# define FETCH(c, eoferr) \
840 do { \
841 wint_t wc; \
842 FETCH_WC (c, wc, eoferr); \
843 } while (0)
844
845#else
846/* Note that characters become unsigned here. */
847# define FETCH(c, eoferr) \
848 do { \
849 if (! lexleft) \
850 { \
851 if ((eoferr) != 0) \
852 dfaerror (eoferr); \
853 else \
854 return lasttok = END; \
855 } \
856 (c) = to_uchar (*lexptr++); \
857 --lexleft; \
858 } while(0)
859
860# define FETCH_WC(c, unused, eoferr) FETCH (c, eoferr)
861
862#endif /* MBS_SUPPORT */
863
864#ifndef MIN
865# define MIN(a,b) ((a) < (b) ? (a) : (b))
866#endif
867
868typedef int predicate (int);
869
870/* The following list maps the names of the Posix named character classes
871 to predicate functions that determine whether a given character is in
872 the class. The leading [ has already been eaten by the lexical analyzer. */
873struct dfa_ctype
874{
875 const char *name;
876 predicate *func;
877 bool single_byte_only;
878};
879
880static const struct dfa_ctype prednames[] = {
881 {"alpha", isalpha, false},
882 {"upper", isupper, false},
883 {"lower", islower, false},
884 {"digit", isdigit, true},
885 {"xdigit", isxdigit, true},
886 {"space", isspace, false},
887 {"punct", ispunct, false},
888 {"alnum", isalnum, false},
889 {"print", isprint, false},
890 {"graph", isgraph, false},
891 {"cntrl", iscntrl, false},
892 {"blank", isblank, false},
893 {NULL, NULL, false}
894};
895
896static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
897find_pred (const char *str)
898{
899 unsigned int i;
900 for (i = 0; prednames[i].name; ++i)
901 if (STREQ (str, prednames[i].name))
902 break;
903
904 return &prednames[i];
905}
906
907/* Multibyte character handling sub-routine for lex.
908 This function parse a bracket expression and build a struct
909 mb_char_classes. */
910static token
911parse_bracket_exp (void)
912{
913 int invert;
914 int c, c1, c2;
915 charclass ccl;
916
917 /* Used to warn about [:space:].
918 Bit 0 = first character is a colon.
919 Bit 1 = last character is a colon.
920 Bit 2 = includes any other character but a colon.
921 Bit 3 = includes ranges, char/equiv classes or collation elements. */
922 int colon_warning_state;
923
924 wint_t wc;
925 wint_t wc2;
926 wint_t wc1 = 0;
927
928 /* Work area to build a mb_char_classes. */
929 struct mb_char_classes *work_mbc;
930 size_t chars_al, range_sts_al, range_ends_al, ch_classes_al,
931 equivs_al, coll_elems_al;
932
933 chars_al = 0;
934 range_sts_al = range_ends_al = 0;
935 ch_classes_al = equivs_al = coll_elems_al = 0;
936 if (MB_CUR_MAX > 1)
937 {
938 REALLOC_IF_NECESSARY (dfa->mbcsets, dfa->mbcsets_alloc,
939 dfa->nmbcsets + 1);
940
941 /* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
942 We will update dfa->multibyte_prop[] in addtok(), because we can't
943 decide the index in dfa->tokens[]. */
944
945 /* Initialize work area. */
946 work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]);
947 memset (work_mbc, 0, sizeof *work_mbc);
948 }
949 else
950 work_mbc = NULL;
951
952 memset (ccl, 0, sizeof ccl);
953 FETCH_WC (c, wc, _("unbalanced ["));
954 if (c == '^')
955 {
956 FETCH_WC (c, wc, _("unbalanced ["));
957 invert = 1;
958 }
959 else
960 invert = 0;
961
962 colon_warning_state = (c == ':');
963 do
964 {
965 c1 = EOF; /* mark c1 is not initialized". */
966 colon_warning_state &= ~2;
967
968 /* Note that if we're looking at some other [:...:] construct,
969 we just treat it as a bunch of ordinary characters. We can do
970 this because we assume regex has checked for syntax errors before
971 dfa is ever called. */
972 if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
973 {
974#define BRACKET_BUFFER_SIZE 128
975 char str[BRACKET_BUFFER_SIZE];
976 FETCH_WC (c1, wc1, _("unbalanced ["));
977
978 /* If pattern contains `[[:', `[[.', or `[[='. */
979 if (c1 == ':'
980 /* TODO: handle `[[.' and `[[=' also for MB_CUR_MAX == 1. */
981 || (MB_CUR_MAX > 1 && (c1 == '.' || c1 == '=')))
982 {
983 size_t len = 0;
984 for (;;)
985 {
986 FETCH_WC (c, wc, _("unbalanced ["));
987 if ((c == c1 && *lexptr == ']') || lexleft == 0)
988 break;
989 if (len < BRACKET_BUFFER_SIZE)
990 str[len++] = c;
991 else
992 /* This is in any case an invalid class name. */
993 str[0] = '\0';
994 }
995 str[len] = '\0';
996
997 /* Fetch bracket. */
998 FETCH_WC (c, wc, _("unbalanced ["));
999 if (c1 == ':')
1000 /* build character class. */
1001 {
1002 char const *class
1003 = (case_fold && (STREQ (str, "upper")
1004 || STREQ (str, "lower")) ? "alpha" : str);
1005 const struct dfa_ctype *pred = find_pred (class);
1006 if (!pred)
1007 dfaerror (_("invalid character class"));
1008
1009 if (MB_CUR_MAX > 1 && !pred->single_byte_only)
1010 {
1011 /* Store the character class as wctype_t. */
1012 wctype_t wt = wctype (class);
1013
1014 REALLOC_IF_NECESSARY (work_mbc->ch_classes,
1015 ch_classes_al,
1016 work_mbc->nch_classes + 1);
1017 work_mbc->ch_classes[work_mbc->nch_classes++] = wt;
1018 }
1019
1020 for (c2 = 0; c2 < NOTCHAR; ++c2)
1021 if (pred->func (c2))
1022 setbit_case_fold_c (c2, ccl);
1023 }
1024
1025 else if (MBS_SUPPORT && (c1 == '=' || c1 == '.'))
1026 {
1027 char *elem = xmemdup (str, len + 1);
1028
1029 if (c1 == '=')
1030 /* build equivalence class. */
1031 {
1032 REALLOC_IF_NECESSARY (work_mbc->equivs,
1033 equivs_al, work_mbc->nequivs + 1);
1034 work_mbc->equivs[work_mbc->nequivs++] = elem;
1035 }
1036
1037 if (c1 == '.')
1038 /* build collating element. */
1039 {
1040 REALLOC_IF_NECESSARY (work_mbc->coll_elems,
1041 coll_elems_al,
1042 work_mbc->ncoll_elems + 1);
1043 work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
1044 }
1045 }
1046 colon_warning_state |= 8;
1047
1048 /* Fetch new lookahead character. */
1049 FETCH_WC (c1, wc1, _("unbalanced ["));
1050 continue;
1051 }
1052
1053 /* We treat '[' as a normal character here. c/c1/wc/wc1
1054 are already set up. */
1055 }
1056
1057 if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1058 FETCH_WC (c, wc, _("unbalanced ["));
1059
1060 if (c1 == EOF)
1061 FETCH_WC (c1, wc1, _("unbalanced ["));
1062
1063 if (c1 == '-')
1064 /* build range characters. */
1065 {
1066 FETCH_WC (c2, wc2, _("unbalanced ["));
1067 if (c2 == ']')
1068 {
1069 /* In the case [x-], the - is an ordinary hyphen,
1070 which is left in c1, the lookahead character. */
1071 lexptr -= cur_mb_len;
1072 lexleft += cur_mb_len;
1073 }
1074 }
1075
1076 if (c1 == '-' && c2 != ']')
1077 {
1078 if (c2 == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1079 FETCH_WC (c2, wc2, _("unbalanced ["));
1080
1081 if (MB_CUR_MAX > 1)
1082 {
1083 /* When case folding map a range, say [m-z] (or even [M-z])
1084 to the pair of ranges, [m-z] [M-Z]. */
1085 REALLOC_IF_NECESSARY (work_mbc->range_sts,
1086 range_sts_al, work_mbc->nranges + 1);
1087 REALLOC_IF_NECESSARY (work_mbc->range_ends,
1088 range_ends_al, work_mbc->nranges + 1);
1089 work_mbc->range_sts[work_mbc->nranges] =
1090 case_fold ? towlower (wc) : (wchar_t) wc;
1091 work_mbc->range_ends[work_mbc->nranges++] =
1092 case_fold ? towlower (wc2) : (wchar_t) wc2;
1093
1094#ifndef GREP
1095 if (case_fold && (iswalpha (wc) || iswalpha (wc2)))
1096 {
1097 REALLOC_IF_NECESSARY (work_mbc->range_sts,
1098 range_sts_al, work_mbc->nranges + 1);
1099 work_mbc->range_sts[work_mbc->nranges] = towupper (wc);
1100 REALLOC_IF_NECESSARY (work_mbc->range_ends,
1101 range_ends_al, work_mbc->nranges + 1);
1102 work_mbc->range_ends[work_mbc->nranges++] = towupper (wc2);
1103 }
1104#endif
1105 }
1106 else
1107 {
1108 c1 = c;
1109 if (case_fold)
1110 {
1111 c1 = tolower (c1);
1112 c2 = tolower (c2);
1113 }
1114 if (!hard_LC_COLLATE)
1115 for (c = c1; c <= c2; c++)
1116 setbit_case_fold_c (c, ccl);
1117 else
1118 {
1119 /* Defer to the system regex library about the meaning
1120 of range expressions. */
1121 regex_t re;
1122 char pattern[6] = { '[', c1, '-', c2, ']', 0 };
1123 char subject[2] = { 0, 0 };
1124 regcomp (&re, pattern, REG_NOSUB);
1125 for (c = 0; c < NOTCHAR; ++c)
1126 {
1127 subject[0] = c;
1128 if (!(case_fold && isupper (c))
1129 && regexec (&re, subject, 0, NULL, 0) != REG_NOMATCH)
1130 setbit_case_fold_c (c, ccl);
1131 }
1132 regfree (&re);
1133 }
1134 }
1135
1136 colon_warning_state |= 8;
1137 FETCH_WC (c1, wc1, _("unbalanced ["));
1138 continue;
1139 }
1140
1141 colon_warning_state |= (c == ':') ? 2 : 4;
1142
1143 if (MB_CUR_MAX == 1)
1144 {
1145 setbit_case_fold_c (c, ccl);
1146 continue;
1147 }
1148
1149 if (case_fold && iswalpha (wc))
1150 {
1151 wc = towlower (wc);
1152 if (!setbit_wc (wc, ccl))
1153 {
1154 REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
1155 work_mbc->nchars + 1);
1156 work_mbc->chars[work_mbc->nchars++] = wc;
1157 }
1158#ifdef GREP
1159 continue;
1160#else
1161 wc = towupper (wc);
1162#endif
1163 }
1164 if (!setbit_wc (wc, ccl))
1165 {
1166 REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
1167 work_mbc->nchars + 1);
1168 work_mbc->chars[work_mbc->nchars++] = wc;
1169 }
1170 }
1171 while ((wc = wc1, (c = c1) != ']'));
1172
1173 if (colon_warning_state == 7)
1174 dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
1175
1176 if (MB_CUR_MAX > 1)
1177 {
1178 static charclass zeroclass;
1179 work_mbc->invert = invert;
1180 work_mbc->cset = equal (ccl, zeroclass) ? -1 : charclass_index (ccl);
1181 return MBCSET;
1182 }
1183
1184 if (invert)
1185 {
1186 assert (MB_CUR_MAX == 1);
1187 notset (ccl);
1188 if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1189 clrbit (eolbyte, ccl);
1190 }
1191
1192 return CSET + charclass_index (ccl);
1193}
1194
1195static token
1196lex (void)
1197{
1198 unsigned int c, c2;
1199 int backslash = 0;
1200 charclass ccl;
1201 int i;
1202
1203 /* Basic plan: We fetch a character. If it's a backslash,
1204 we set the backslash flag and go through the loop again.
1205 On the plus side, this avoids having a duplicate of the
1206 main switch inside the backslash case. On the minus side,
1207 it means that just about every case begins with
1208 "if (backslash) ...". */
1209 for (i = 0; i < 2; ++i)
1210 {
1211 if (MB_CUR_MAX > 1)
1212 {
1213 FETCH_WC (c, wctok, NULL);
1214 if ((int) c == EOF)
1215 goto normal_char;
1216 }
1217 else
1218 FETCH (c, NULL);
1219
1220 switch (c)
1221 {
1222 case '\\':
1223 if (backslash)
1224 goto normal_char;
1225 if (lexleft == 0)
1226 dfaerror (_("unfinished \\ escape"));
1227 backslash = 1;
1228 break;
1229
1230 case '^':
1231 if (backslash)
1232 goto normal_char;
1233 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1234 || lasttok == END || lasttok == LPAREN || lasttok == OR)
1235 return lasttok = BEGLINE;
1236 goto normal_char;
1237
1238 case '$':
1239 if (backslash)
1240 goto normal_char;
1241 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1242 || lexleft == 0
1243 || (syntax_bits & RE_NO_BK_PARENS
1244 ? lexleft > 0 && *lexptr == ')'
1245 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
1246 || (syntax_bits & RE_NO_BK_VBAR
1247 ? lexleft > 0 && *lexptr == '|'
1248 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|')
1249 || ((syntax_bits & RE_NEWLINE_ALT)
1250 && lexleft > 0 && *lexptr == '\n'))
1251 return lasttok = ENDLINE;
1252 goto normal_char;
1253
1254 case '1':
1255 case '2':
1256 case '3':
1257 case '4':
1258 case '5':
1259 case '6':
1260 case '7':
1261 case '8':
1262 case '9':
1263 if (backslash && !(syntax_bits & RE_NO_BK_REFS))
1264 {
1265 laststart = 0;
1266 return lasttok = BACKREF;
1267 }
1268 goto normal_char;
1269
1270 case '`':
1271 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1272 return lasttok = BEGLINE; /* FIXME: should be beginning of string */
1273 goto normal_char;
1274
1275 case '\'':
1276 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1277 return lasttok = ENDLINE; /* FIXME: should be end of string */
1278 goto normal_char;
1279
1280 case '<':
1281 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1282 return lasttok = BEGWORD;
1283 goto normal_char;
1284
1285 case '>':
1286 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1287 return lasttok = ENDWORD;
1288 goto normal_char;
1289
1290 case 'b':
1291 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1292 return lasttok = LIMWORD;
1293 goto normal_char;
1294
1295 case 'B':
1296 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1297 return lasttok = NOTLIMWORD;
1298 goto normal_char;
1299
1300 case '?':
1301 if (syntax_bits & RE_LIMITED_OPS)
1302 goto normal_char;
1303 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
1304 goto normal_char;
1305 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
1306 goto normal_char;
1307 return lasttok = QMARK;
1308
1309 case '*':
1310 if (backslash)
1311 goto normal_char;
1312 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
1313 goto normal_char;
1314 return lasttok = STAR;
1315
1316 case '+':
1317 if (syntax_bits & RE_LIMITED_OPS)
1318 goto normal_char;
1319 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
1320 goto normal_char;
1321 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
1322 goto normal_char;
1323 return lasttok = PLUS;
1324
1325 case '{':
1326 if (!(syntax_bits & RE_INTERVALS))
1327 goto normal_char;
1328 if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
1329 goto normal_char;
1330 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
1331 goto normal_char;
1332
1333 /* Cases:
1334 {M} - exact count
1335 {M,} - minimum count, maximum is infinity
1336 {,N} - 0 through N
1337 {,} - 0 to infinity (same as '*')
1338 {M,N} - M through N */
1339 {
1340 char const *p = lexptr;
1341 char const *lim = p + lexleft;
1342 minrep = maxrep = -1;
1343 for (; p != lim && ISASCIIDIGIT (*p); p++)
1344 {
1345 if (minrep < 0)
1346 minrep = *p - '0';
1347 else
1348 minrep = MIN (RE_DUP_MAX + 1, minrep * 10 + *p - '0');
1349 }
1350 if (p != lim)
1351 {
1352 if (*p != ',')
1353 maxrep = minrep;
1354 else
1355 {
1356 if (minrep < 0)
1357 minrep = 0;
1358 while (++p != lim && ISASCIIDIGIT (*p))
1359 {
1360 if (maxrep < 0)
1361 maxrep = *p - '0';
1362 else
1363 maxrep = MIN (RE_DUP_MAX + 1, maxrep * 10 + *p - '0');
1364 }
1365 }
1366 }
1367 if (! ((! backslash || (p != lim && *p++ == '\\'))
1368 && p != lim && *p++ == '}'
1369 && 0 <= minrep && (maxrep < 0 || minrep <= maxrep)))
1370 {
1371 if (syntax_bits & RE_INVALID_INTERVAL_ORD)
1372 goto normal_char;
1373 dfaerror (_("Invalid content of \\{\\}"));
1374 }
1375 if (RE_DUP_MAX < maxrep)
1376 dfaerror (_("Regular expression too big"));
1377 lexptr = p;
1378 lexleft = lim - p;
1379 }
1380 laststart = 0;
1381 return lasttok = REPMN;
1382
1383 case '|':
1384 if (syntax_bits & RE_LIMITED_OPS)
1385 goto normal_char;
1386 if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
1387 goto normal_char;
1388 laststart = 1;
1389 return lasttok = OR;
1390
1391 case '\n':
1392 if (syntax_bits & RE_LIMITED_OPS
1393 || backslash || !(syntax_bits & RE_NEWLINE_ALT))
1394 goto normal_char;
1395 laststart = 1;
1396 return lasttok = OR;
1397
1398 case '(':
1399 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
1400 goto normal_char;
1401 ++parens;
1402 laststart = 1;
1403 return lasttok = LPAREN;
1404
1405 case ')':
1406 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
1407 goto normal_char;
1408 if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1409 goto normal_char;
1410 --parens;
1411 laststart = 0;
1412 return lasttok = RPAREN;
1413
1414 case '.':
1415 if (backslash)
1416 goto normal_char;
1417 if (MB_CUR_MAX > 1)
1418 {
1419 /* In multibyte environment period must match with a single
1420 character not a byte. So we use ANYCHAR. */
1421 laststart = 0;
1422 return lasttok = ANYCHAR;
1423 }
1424 zeroset (ccl);
1425 notset (ccl);
1426 if (!(syntax_bits & RE_DOT_NEWLINE))
1427 clrbit (eolbyte, ccl);
1428 if (syntax_bits & RE_DOT_NOT_NULL)
1429 clrbit ('\0', ccl);
1430 laststart = 0;
1431 return lasttok = CSET + charclass_index (ccl);
1432
1433 case 's':
1434 case 'S':
1435 if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
1436 goto normal_char;
1437 zeroset (ccl);
1438 for (c2 = 0; c2 < NOTCHAR; ++c2)
1439 if (isspace (c2))
1440 setbit (c2, ccl);
1441 if (c == 'S')
1442 notset (ccl);
1443 laststart = 0;
1444 return lasttok = CSET + charclass_index (ccl);
1445
1446 case 'w':
1447 case 'W':
1448 if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
1449 goto normal_char;
1450 zeroset (ccl);
1451 for (c2 = 0; c2 < NOTCHAR; ++c2)
1452 if (IS_WORD_CONSTITUENT (c2))
1453 setbit (c2, ccl);
1454 if (c == 'W')
1455 notset (ccl);
1456 laststart = 0;
1457 return lasttok = CSET + charclass_index (ccl);
1458
1459 case '[':
1460 if (backslash)
1461 goto normal_char;
1462 laststart = 0;
1463 return lasttok = parse_bracket_exp ();
1464
1465 default:
1466 normal_char:
1467 laststart = 0;
1468 /* For multibyte character sets, folding is done in atom. Always
1469 return WCHAR. */
1470 if (MB_CUR_MAX > 1)
1471 return lasttok = WCHAR;
1472
1473 if (case_fold && isalpha (c))
1474 {
1475 zeroset (ccl);
1476 setbit_case_fold_c (c, ccl);
1477 return lasttok = CSET + charclass_index (ccl);
1478 }
1479
1480 return lasttok = c;
1481 }
1482 }
1483
1484 /* The above loop should consume at most a backslash
1485 and some other character. */
1486 abort ();
1487 return END; /* keeps pedantic compilers happy. */
1488}
1489
1490/* Recursive descent parser for regular expressions. */
1491
1492static token tok; /* Lookahead token. */
1493static size_t depth; /* Current depth of a hypothetical stack
1494 holding deferred productions. This is
1495 used to determine the depth that will be
1496 required of the real stack later on in
1497 dfaanalyze(). */
1498
1499static void
1500addtok_mb (token t, int mbprop)
1501{
1502 if (MB_CUR_MAX > 1)
1503 {
1504 REALLOC_IF_NECESSARY (dfa->multibyte_prop, dfa->nmultibyte_prop,
1505 dfa->tindex + 1);
1506 dfa->multibyte_prop[dfa->tindex] = mbprop;
1507 }
1508
1509 REALLOC_IF_NECESSARY (dfa->tokens, dfa->talloc, dfa->tindex + 1);
1510 dfa->tokens[dfa->tindex++] = t;
1511
1512 switch (t)
1513 {
1514 case QMARK:
1515 case STAR:
1516 case PLUS:
1517 break;
1518
1519 case CAT:
1520 case OR:
1521 --depth;
1522 break;
1523
1524 default:
1525 ++dfa->nleaves;
1526 case EMPTY:
1527 ++depth;
1528 break;
1529 }
1530 if (depth > dfa->depth)
1531 dfa->depth = depth;
1532}
1533
1534static void addtok_wc (wint_t wc);
1535
1536/* Add the given token to the parse tree, maintaining the depth count and
1537 updating the maximum depth if necessary. */
1538static void
1539addtok (token t)
1540{
1541 if (MB_CUR_MAX > 1 && t == MBCSET)
1542 {
1543 bool need_or = false;
1544 struct mb_char_classes *work_mbc = &dfa->mbcsets[dfa->nmbcsets - 1];
1545
1546 /* Extract wide characters into alternations for better performance.
1547 This does not require UTF-8. */
1548 if (!work_mbc->invert)
1549 {
1550 size_t i;
1551 for (i = 0; i < work_mbc->nchars; i++)
1552 {
1553 addtok_wc (work_mbc->chars[i]);
1554 if (need_or)
1555 addtok (OR);
1556 need_or = true;
1557 }
1558 work_mbc->nchars = 0;
1559 }
1560
1561 /* UTF-8 allows treating a simple, non-inverted MBCSET like a CSET. */
1562 if (work_mbc->invert
1563 || (!using_utf8 () && work_mbc->cset != -1)
1564 || work_mbc->nchars != 0
1565 || work_mbc->nch_classes != 0
1566 || work_mbc->nranges != 0
1567 || work_mbc->nequivs != 0 || work_mbc->ncoll_elems != 0)
1568 {
1569 addtok_mb (MBCSET, ((dfa->nmbcsets - 1) << 2) + 3);
1570 if (need_or)
1571 addtok (OR);
1572 }
1573 else
1574 {
1575 /* Characters have been handled above, so it is possible
1576 that the mbcset is empty now. Do nothing in that case. */
1577 if (work_mbc->cset != -1)
1578 {
1579 assert (using_utf8 ());
1580 addtok (CSET + work_mbc->cset);
1581 if (need_or)
1582 addtok (OR);
1583 }
1584 }
1585 }
1586 else
1587 {
1588 addtok_mb (t, 3);
1589 }
1590}
1591
1592#if MBS_SUPPORT
1593/* We treat a multibyte character as a single atom, so that DFA
1594 can treat a multibyte character as a single expression.
1595
1596 e.g. We construct following tree from "<mb1><mb2>".
1597 <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1598 <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1599static void
1600addtok_wc (wint_t wc)
1601{
1602 unsigned char buf[MB_LEN_MAX];
1603 mbstate_t s;
1604 int i;
1605 memset (&s, 0, sizeof s);
1606 cur_mb_len = wcrtomb ((char *) buf, wc, &s);
1607
1608 /* This is merely stop-gap. When cur_mb_len is 0 or negative,
1609 buf[0] is undefined, yet skipping the addtok_mb call altogether
1610 can result in heap corruption. */
1611 if (cur_mb_len <= 0)
1612 buf[0] = 0;
1613
1614 addtok_mb (buf[0], cur_mb_len == 1 ? 3 : 1);
1615 for (i = 1; i < cur_mb_len; i++)
1616 {
1617 addtok_mb (buf[i], i == cur_mb_len - 1 ? 2 : 0);
1618 addtok (CAT);
1619 }
1620}
1621#else
1622static void
1623addtok_wc (wint_t wc)
1624{
1625}
1626#endif
1627
1628static void
1629add_utf8_anychar (void)
1630{
1631#if MBS_SUPPORT
1632 static const charclass utf8_classes[5] = {
1633 {0, 0, 0, 0, ~0, ~0, 0, 0}, /* 80-bf: non-lead bytes */
1634 {~0, ~0, ~0, ~0, 0, 0, 0, 0}, /* 00-7f: 1-byte sequence */
1635 {0, 0, 0, 0, 0, 0, 0xfffffffcU, 0}, /* c2-df: 2-byte sequence */
1636 {0, 0, 0, 0, 0, 0, 0, 0xffff}, /* e0-ef: 3-byte sequence */
1637 {0, 0, 0, 0, 0, 0, 0, 0xff0000} /* f0-f7: 4-byte sequence */
1638 };
1639 const unsigned int n = sizeof (utf8_classes) / sizeof (utf8_classes[0]);
1640 unsigned int i;
1641
1642 /* Define the five character classes that are needed below. */
1643 if (dfa->utf8_anychar_classes[0] == 0)
1644 for (i = 0; i < n; i++)
1645 {
1646 charclass c;
1647 copyset (utf8_classes[i], c);
1648 if (i == 1)
1649 {
1650 if (!(syntax_bits & RE_DOT_NEWLINE))
1651 clrbit (eolbyte, c);
1652 if (syntax_bits & RE_DOT_NOT_NULL)
1653 clrbit ('\0', c);
1654 }
1655 dfa->utf8_anychar_classes[i] = CSET + charclass_index (c);
1656 }
1657
1658 /* A valid UTF-8 character is
1659
1660 ([0x00-0x7f]
1661 |[0xc2-0xdf][0x80-0xbf]
1662 |[0xe0-0xef[0x80-0xbf][0x80-0xbf]
1663 |[0xf0-f7][0x80-0xbf][0x80-0xbf][0x80-0xbf])
1664
1665 which I'll write more concisely "B|CA|DAA|EAAA". Factor the [0x00-0x7f]
1666 and you get "B|(C|(D|EA)A)A". And since the token buffer is in reverse
1667 Polish notation, you get "B C D E A CAT OR A CAT OR A CAT OR". */
1668 for (i = 1; i < n; i++)
1669 addtok (dfa->utf8_anychar_classes[i]);
1670 while (--i > 1)
1671 {
1672 addtok (dfa->utf8_anychar_classes[0]);
1673 addtok (CAT);
1674 addtok (OR);
1675 }
1676#endif
1677}
1678
1679/* The grammar understood by the parser is as follows.
1680
1681 regexp:
1682 regexp OR branch
1683 branch
1684
1685 branch:
1686 branch closure
1687 closure
1688
1689 closure:
1690 closure QMARK
1691 closure STAR
1692 closure PLUS
1693 closure REPMN
1694 atom
1695
1696 atom:
1697 <normal character>
1698 <multibyte character>
1699 ANYCHAR
1700 MBCSET
1701 CSET
1702 BACKREF
1703 BEGLINE
1704 ENDLINE
1705 BEGWORD
1706 ENDWORD
1707 LIMWORD
1708 NOTLIMWORD
1709 LPAREN regexp RPAREN
1710 <empty>
1711
1712 The parser builds a parse tree in postfix form in an array of tokens. */
1713
1714static void
1715atom (void)
1716{
1717 if (0)
1718 {
1719 /* empty */
1720 }
1721 else if (MBS_SUPPORT && tok == WCHAR)
1722 {
1723 addtok_wc (case_fold ? towlower (wctok) : wctok);
1724#ifndef GREP
1725 if (case_fold && iswalpha (wctok))
1726 {
1727 addtok_wc (towupper (wctok));
1728 addtok (OR);
1729 }
1730#endif
1731
1732 tok = lex ();
1733 }
1734 else if (MBS_SUPPORT && tok == ANYCHAR && using_utf8 ())
1735 {
1736 /* For UTF-8 expand the period to a series of CSETs that define a valid
1737 UTF-8 character. This avoids using the slow multibyte path. I'm
1738 pretty sure it would be both profitable and correct to do it for
1739 any encoding; however, the optimization must be done manually as
1740 it is done above in add_utf8_anychar. So, let's start with
1741 UTF-8: it is the most used, and the structure of the encoding
1742 makes the correctness more obvious. */
1743 add_utf8_anychar ();
1744 tok = lex ();
1745 }
1746 else if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
1747 || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
1748#if MBS_SUPPORT
1749 || tok == ANYCHAR || tok == MBCSET
1750#endif /* MBS_SUPPORT */
1751 || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
1752 {
1753 addtok (tok);
1754 tok = lex ();
1755 }
1756 else if (tok == LPAREN)
1757 {
1758 tok = lex ();
1759 regexp ();
1760 if (tok != RPAREN)
1761 dfaerror (_("unbalanced ("));
1762 tok = lex ();
1763 }
1764 else
1765 addtok (EMPTY);
1766}
1767
1768/* Return the number of tokens in the given subexpression. */
1769static size_t _GL_ATTRIBUTE_PURE
1770nsubtoks (size_t tindex)
1771{
1772 size_t ntoks1;
1773
1774 switch (dfa->tokens[tindex - 1])
1775 {
1776 default:
1777 return 1;
1778 case QMARK:
1779 case STAR:
1780 case PLUS:
1781 return 1 + nsubtoks (tindex - 1);
1782 case CAT:
1783 case OR:
1784 ntoks1 = nsubtoks (tindex - 1);
1785 return 1 + ntoks1 + nsubtoks (tindex - 1 - ntoks1);
1786 }
1787}
1788
1789/* Copy the given subexpression to the top of the tree. */
1790static void
1791copytoks (size_t tindex, size_t ntokens)
1792{
1793 size_t i;
1794
1795 for (i = 0; i < ntokens; ++i)
1796 {
1797 addtok (dfa->tokens[tindex + i]);
1798 /* Update index into multibyte csets. */
1799 if (MB_CUR_MAX > 1 && dfa->tokens[tindex + i] == MBCSET)
1800 dfa->multibyte_prop[dfa->tindex - 1] = dfa->multibyte_prop[tindex + i];
1801 }
1802}
1803
1804static void
1805closure (void)
1806{
1807 int i;
1808 size_t tindex, ntokens;
1809
1810 atom ();
1811 while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN)
1812 if (tok == REPMN && (minrep || maxrep))
1813 {
1814 ntokens = nsubtoks (dfa->tindex);
1815 tindex = dfa->tindex - ntokens;
1816 if (maxrep < 0)
1817 addtok (PLUS);
1818 if (minrep == 0)
1819 addtok (QMARK);
1820 for (i = 1; i < minrep; ++i)
1821 {
1822 copytoks (tindex, ntokens);
1823 addtok (CAT);
1824 }
1825 for (; i < maxrep; ++i)
1826 {
1827 copytoks (tindex, ntokens);
1828 addtok (QMARK);
1829 addtok (CAT);
1830 }
1831 tok = lex ();
1832 }
1833 else if (tok == REPMN)
1834 {
1835 dfa->tindex -= nsubtoks (dfa->tindex);
1836 tok = lex ();
1837 closure ();
1838 }
1839 else
1840 {
1841 addtok (tok);
1842 tok = lex ();
1843 }
1844}
1845
1846static void
1847branch (void)
1848{
1849 closure ();
1850 while (tok != RPAREN && tok != OR && tok >= 0)
1851 {
1852 closure ();
1853 addtok (CAT);
1854 }
1855}
1856
1857static void
1858regexp (void)
1859{
1860 branch ();
1861 while (tok == OR)
1862 {
1863 tok = lex ();
1864 branch ();
1865 addtok (OR);
1866 }
1867}
1868
1869/* Main entry point for the parser. S is a string to be parsed, len is the
1870 length of the string, so s can include NUL characters. D is a pointer to
1871 the struct dfa to parse into. */
1872void
1873dfaparse (char const *s, size_t len, struct dfa *d)
1874{
1875 dfa = d;
1876 lexptr = s;
1877 lexleft = len;
1878 lasttok = END;
1879 laststart = 1;
1880 parens = 0;
1881#ifdef LC_COLLATE
1882 hard_LC_COLLATE = hard_locale (LC_COLLATE);
1883#endif
1884 if (MB_CUR_MAX > 1)
1885 {
1886 cur_mb_len = 0;
1887 memset (&mbs, 0, sizeof mbs);
1888 }
1889
1890 if (!syntax_bits_set)
1891 dfaerror (_("no syntax specified"));
1892
1893 tok = lex ();
1894 depth = d->depth;
1895
1896 regexp ();
1897
1898 if (tok != END)
1899 dfaerror (_("unbalanced )"));
1900
1901 addtok (END - d->nregexps);
1902 addtok (CAT);
1903
1904 if (d->nregexps)
1905 addtok (OR);
1906
1907 ++d->nregexps;
1908}
1909
1910/* Some primitives for operating on sets of positions. */
1911
1912/* Copy one set to another; the destination must be large enough. */
1913static void
1914copy (position_set const *src, position_set * dst)
1915{
1916 REALLOC_IF_NECESSARY (dst->elems, dst->alloc, src->nelem);
1917 memcpy (dst->elems, src->elems, sizeof (dst->elems[0]) * src->nelem);
1918 dst->nelem = src->nelem;
1919}
1920
1921static void
1922alloc_position_set (position_set * s, size_t size)
1923{
1924 MALLOC (s->elems, size);
1925 s->alloc = size;
1926 s->nelem = 0;
1927}
1928
1929/* Insert position P in set S. S is maintained in sorted order on
1930 decreasing index. If there is already an entry in S with P.index
1931 then merge (logically-OR) P's constraints into the one in S.
1932 S->elems must point to an array large enough to hold the resulting set. */
1933static void
1934insert (position p, position_set * s)
1935{
1936 size_t count = s->nelem;
1937 size_t lo = 0, hi = count;
1938 size_t i;
1939 while (lo < hi)
1940 {
1941 size_t mid = (lo + hi) >> 1;
1942 if (s->elems[mid].index > p.index)
1943 lo = mid + 1;
1944 else
1945 hi = mid;
1946 }
1947
1948 if (lo < count && p.index == s->elems[lo].index)
1949 {
1950 s->elems[lo].constraint |= p.constraint;
1951 return;
1952 }
1953
1954 REALLOC_IF_NECESSARY (s->elems, s->alloc, count + 1);
1955 for (i = count; i > lo; i--)
1956 s->elems[i] = s->elems[i - 1];
1957 s->elems[lo] = p;
1958 ++s->nelem;
1959}
1960
1961/* Merge two sets of positions into a third. The result is exactly as if
1962 the positions of both sets were inserted into an initially empty set. */
1963static void
1964merge (position_set const *s1, position_set const *s2, position_set * m)
1965{
1966 size_t i = 0, j = 0;
1967
1968 REALLOC_IF_NECESSARY (m->elems, m->alloc, s1->nelem + s2->nelem);
1969 m->nelem = 0;
1970 while (i < s1->nelem && j < s2->nelem)
1971 if (s1->elems[i].index > s2->elems[j].index)
1972 m->elems[m->nelem++] = s1->elems[i++];
1973 else if (s1->elems[i].index < s2->elems[j].index)
1974 m->elems[m->nelem++] = s2->elems[j++];
1975 else
1976 {
1977 m->elems[m->nelem] = s1->elems[i++];
1978 m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;
1979 }
1980 while (i < s1->nelem)
1981 m->elems[m->nelem++] = s1->elems[i++];
1982 while (j < s2->nelem)
1983 m->elems[m->nelem++] = s2->elems[j++];
1984}
1985
1986/* Delete a position from a set. */
1987static void
1988delete (position p, position_set * s)
1989{
1990 size_t i;
1991
1992 for (i = 0; i < s->nelem; ++i)
1993 if (p.index == s->elems[i].index)
1994 break;
1995 if (i < s->nelem)
1996 for (--s->nelem; i < s->nelem; ++i)
1997 s->elems[i] = s->elems[i + 1];
1998}
1999
2000/* Find the index of the state corresponding to the given position set with
2001 the given preceding context, or create a new state if there is no such
2002 state. Context tells whether we got here on a newline or letter. */
2003static state_num
2004state_index (struct dfa *d, position_set const *s, int context)
2005{
2006 size_t hash = 0;
2007 int constraint;
2008 state_num i, j;
2009
2010 for (i = 0; i < s->nelem; ++i)
2011 hash ^= s->elems[i].index + s->elems[i].constraint;
2012
2013 /* Try to find a state that exactly matches the proposed one. */
2014 for (i = 0; i < d->sindex; ++i)
2015 {
2016 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
2017 || context != d->states[i].context)
2018 continue;
2019 for (j = 0; j < s->nelem; ++j)
2020 if (s->elems[j].constraint
2021 != d->states[i].elems.elems[j].constraint
2022 || s->elems[j].index != d->states[i].elems.elems[j].index)
2023 break;
2024 if (j == s->nelem)
2025 return i;
2026 }
2027
2028 /* We'll have to create a new state. */
2029 REALLOC_IF_NECESSARY (d->states, d->salloc, d->sindex + 1);
2030 d->states[i].hash = hash;
2031 alloc_position_set (&d->states[i].elems, s->nelem);
2032 copy (s, &d->states[i].elems);
2033 d->states[i].context = context;
2034 d->states[i].backref = 0;
2035 d->states[i].constraint = 0;
2036 d->states[i].first_end = 0;
2037 if (MBS_SUPPORT)
2038 {
2039 d->states[i].mbps.nelem = 0;
2040 d->states[i].mbps.elems = NULL;
2041 }
2042 for (j = 0; j < s->nelem; ++j)
2043 if (d->tokens[s->elems[j].index] < 0)
2044 {
2045 constraint = s->elems[j].constraint;
2046 if (SUCCEEDS_IN_CONTEXT (constraint, context, CTX_ANY))
2047 d->states[i].constraint |= constraint;
2048 if (!d->states[i].first_end)
2049 d->states[i].first_end = d->tokens[s->elems[j].index];
2050 }
2051 else if (d->tokens[s->elems[j].index] == BACKREF)
2052 {
2053 d->states[i].constraint = NO_CONSTRAINT;
2054 d->states[i].backref = 1;
2055 }
2056
2057 ++d->sindex;
2058
2059 return i;
2060}
2061
2062/* Find the epsilon closure of a set of positions. If any position of the set
2063 contains a symbol that matches the empty string in some context, replace
2064 that position with the elements of its follow labeled with an appropriate
2065 constraint. Repeat exhaustively until no funny positions are left.
2066 S->elems must be large enough to hold the result. */
2067static void
2068epsclosure (position_set * s, struct dfa const *d)
2069{
2070 size_t i, j;
2071 char *visited; /* array of booleans, enough to use char, not int */
2072 position p, old;
2073
2074 CALLOC (visited, d->tindex);
2075
2076 for (i = 0; i < s->nelem; ++i)
2077 if (d->tokens[s->elems[i].index] >= NOTCHAR
2078 && d->tokens[s->elems[i].index] != BACKREF
2079#if MBS_SUPPORT
2080 && d->tokens[s->elems[i].index] != ANYCHAR
2081 && d->tokens[s->elems[i].index] != MBCSET
2082#endif
2083 && d->tokens[s->elems[i].index] < CSET)
2084 {
2085 old = s->elems[i];
2086 p.constraint = old.constraint;
2087 delete (s->elems[i], s);
2088 if (visited[old.index])
2089 {
2090 --i;
2091 continue;
2092 }
2093 visited[old.index] = 1;
2094 switch (d->tokens[old.index])
2095 {
2096 case BEGLINE:
2097 p.constraint &= BEGLINE_CONSTRAINT;
2098 break;
2099 case ENDLINE:
2100 p.constraint &= ENDLINE_CONSTRAINT;
2101 break;
2102 case BEGWORD:
2103 p.constraint &= BEGWORD_CONSTRAINT;
2104 break;
2105 case ENDWORD:
2106 p.constraint &= ENDWORD_CONSTRAINT;
2107 break;
2108 case LIMWORD:
2109 p.constraint &= LIMWORD_CONSTRAINT;
2110 break;
2111 case NOTLIMWORD:
2112 p.constraint &= NOTLIMWORD_CONSTRAINT;
2113 break;
2114 default:
2115 break;
2116 }
2117 for (j = 0; j < d->follows[old.index].nelem; ++j)
2118 {
2119 p.index = d->follows[old.index].elems[j].index;
2120 insert (p, s);
2121 }
2122 /* Force rescan to start at the beginning. */
2123 i = -1;
2124 }
2125
2126 free (visited);
2127}
2128
2129/* Returns the set of contexts for which there is at least one
2130 character included in C. */
2131
2132static int
2133charclass_context (charclass c)
2134{
2135 int context = 0;
2136 unsigned int j;
2137
2138 if (tstbit (eolbyte, c))
2139 context |= CTX_NEWLINE;
2140
2141 for (j = 0; j < CHARCLASS_INTS; ++j)
2142 {
2143 if (c[j] & letters[j])
2144 context |= CTX_LETTER;
2145 if (c[j] & ~(letters[j] | newline[j]))
2146 context |= CTX_NONE;
2147 }
2148
2149 return context;
2150}
2151
2152/* Returns the contexts on which the position set S depends. Each context
2153 in the set of returned contexts (let's call it SC) may have a different
2154 follow set than other contexts in SC, and also different from the
2155 follow set of the complement set (sc ^ CTX_ANY). However, all contexts
2156 in the complement set will have the same follow set. */
2157
2158static int _GL_ATTRIBUTE_PURE
2159state_separate_contexts (position_set const *s)
2160{
2161 int separate_contexts = 0;
2162 size_t j;
2163
2164 for (j = 0; j < s->nelem; ++j)
2165 {
2166 if (PREV_NEWLINE_DEPENDENT (s->elems[j].constraint))
2167 separate_contexts |= CTX_NEWLINE;
2168 if (PREV_LETTER_DEPENDENT (s->elems[j].constraint))
2169 separate_contexts |= CTX_LETTER;
2170 }
2171
2172 return separate_contexts;
2173}
2174
2175
2176/* Perform bottom-up analysis on the parse tree, computing various functions.
2177 Note that at this point, we're pretending constructs like \< are real
2178 characters rather than constraints on what can follow them.
2179
2180 Nullable: A node is nullable if it is at the root of a regexp that can
2181 match the empty string.
2182 * EMPTY leaves are nullable.
2183 * No other leaf is nullable.
2184 * A QMARK or STAR node is nullable.
2185 * A PLUS node is nullable if its argument is nullable.
2186 * A CAT node is nullable if both its arguments are nullable.
2187 * An OR node is nullable if either argument is nullable.
2188
2189 Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
2190 that could correspond to the first character of a string matching the
2191 regexp rooted at the given node.
2192 * EMPTY leaves have empty firstpos.
2193 * The firstpos of a nonempty leaf is that leaf itself.
2194 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2195 argument.
2196 * The firstpos of a CAT node is the firstpos of the left argument, union
2197 the firstpos of the right if the left argument is nullable.
2198 * The firstpos of an OR node is the union of firstpos of each argument.
2199
2200 Lastpos: The lastpos of a node is the set of positions that could
2201 correspond to the last character of a string matching the regexp at
2202 the given node.
2203 * EMPTY leaves have empty lastpos.
2204 * The lastpos of a nonempty leaf is that leaf itself.
2205 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2206 argument.
2207 * The lastpos of a CAT node is the lastpos of its right argument, union
2208 the lastpos of the left if the right argument is nullable.
2209 * The lastpos of an OR node is the union of the lastpos of each argument.
2210
2211 Follow: The follow of a position is the set of positions that could
2212 correspond to the character following a character matching the node in
2213 a string matching the regexp. At this point we consider special symbols
2214 that match the empty string in some context to be just normal characters.
2215 Later, if we find that a special symbol is in a follow set, we will
2216 replace it with the elements of its follow, labeled with an appropriate
2217 constraint.
2218 * Every node in the firstpos of the argument of a STAR or PLUS node is in
2219 the follow of every node in the lastpos.
2220 * Every node in the firstpos of the second argument of a CAT node is in
2221 the follow of every node in the lastpos of the first argument.
2222
2223 Because of the postfix representation of the parse tree, the depth-first
2224 analysis is conveniently done by a linear scan with the aid of a stack.
2225 Sets are stored as arrays of the elements, obeying a stack-like allocation
2226 scheme; the number of elements in each set deeper in the stack can be
2227 used to determine the address of a particular set's array. */
2228void
2229dfaanalyze (struct dfa *d, int searchflag)
2230{
2231 int *nullable; /* Nullable stack. */
2232 size_t *nfirstpos; /* Element count stack for firstpos sets. */
2233 position *firstpos; /* Array where firstpos elements are stored. */
2234 size_t *nlastpos; /* Element count stack for lastpos sets. */
2235 position *lastpos; /* Array where lastpos elements are stored. */
2236 position_set tmp; /* Temporary set for merging sets. */
2237 position_set merged; /* Result of merging sets. */
2238 int separate_contexts; /* Context wanted by some position. */
2239 int *o_nullable;
2240 size_t *o_nfirst, *o_nlast;
2241 position *o_firstpos, *o_lastpos;
2242 size_t i, j;
2243 position *pos;
2244
2245#ifdef DEBUG
2246 fprintf (stderr, "dfaanalyze:\n");
2247 for (i = 0; i < d->tindex; ++i)
2248 {
2249 fprintf (stderr, " %zd:", i);
2250 prtok (d->tokens[i]);
2251 }
2252 putc ('\n', stderr);
2253#endif
2254
2255 d->searchflag = searchflag;
2256
2257 MALLOC (nullable, d->depth);
2258 o_nullable = nullable;
2259 MALLOC (nfirstpos, d->depth);
2260 o_nfirst = nfirstpos;
2261 MALLOC (firstpos, d->nleaves);
2262 o_firstpos = firstpos, firstpos += d->nleaves;
2263 MALLOC (nlastpos, d->depth);
2264 o_nlast = nlastpos;
2265 MALLOC (lastpos, d->nleaves);
2266 o_lastpos = lastpos, lastpos += d->nleaves;
2267 alloc_position_set (&merged, d->nleaves);
2268
2269 CALLOC (d->follows, d->tindex);
2270
2271 for (i = 0; i < d->tindex; ++i)
2272 {
2273 switch (d->tokens[i])
2274 {
2275 case EMPTY:
2276 /* The empty set is nullable. */
2277 *nullable++ = 1;
2278
2279 /* The firstpos and lastpos of the empty leaf are both empty. */
2280 *nfirstpos++ = *nlastpos++ = 0;
2281 break;
2282
2283 case STAR:
2284 case PLUS:
2285 /* Every element in the firstpos of the argument is in the follow
2286 of every element in the lastpos. */
2287 tmp.nelem = nfirstpos[-1];
2288 tmp.elems = firstpos;
2289 pos = lastpos;
2290 for (j = 0; j < nlastpos[-1]; ++j)
2291 {
2292 merge (&tmp, &d->follows[pos[j].index], &merged);
2293 copy (&merged, &d->follows[pos[j].index]);
2294 }
2295
2296 case QMARK:
2297 /* A QMARK or STAR node is automatically nullable. */
2298 if (d->tokens[i] != PLUS)
2299 nullable[-1] = 1;
2300 break;
2301
2302 case CAT:
2303 /* Every element in the firstpos of the second argument is in the
2304 follow of every element in the lastpos of the first argument. */
2305 tmp.nelem = nfirstpos[-1];
2306 tmp.elems = firstpos;
2307 pos = lastpos + nlastpos[-1];
2308 for (j = 0; j < nlastpos[-2]; ++j)
2309 {
2310 merge (&tmp, &d->follows[pos[j].index], &merged);
2311 copy (&merged, &d->follows[pos[j].index]);
2312 }
2313
2314 /* The firstpos of a CAT node is the firstpos of the first argument,
2315 union that of the second argument if the first is nullable. */
2316 if (nullable[-2])
2317 nfirstpos[-2] += nfirstpos[-1];
2318 else
2319 firstpos += nfirstpos[-1];
2320 --nfirstpos;
2321
2322 /* The lastpos of a CAT node is the lastpos of the second argument,
2323 union that of the first argument if the second is nullable. */
2324 if (nullable[-1])
2325 nlastpos[-2] += nlastpos[-1];
2326 else
2327 {
2328 pos = lastpos + nlastpos[-2];
2329 for (j = nlastpos[-1]; j-- > 0;)
2330 pos[j] = lastpos[j];
2331 lastpos += nlastpos[-2];
2332 nlastpos[-2] = nlastpos[-1];
2333 }
2334 --nlastpos;
2335
2336 /* A CAT node is nullable if both arguments are nullable. */
2337 nullable[-2] = nullable[-1] && nullable[-2];
2338 --nullable;
2339 break;
2340
2341 case OR:
2342 /* The firstpos is the union of the firstpos of each argument. */
2343 nfirstpos[-2] += nfirstpos[-1];
2344 --nfirstpos;
2345
2346 /* The lastpos is the union of the lastpos of each argument. */
2347 nlastpos[-2] += nlastpos[-1];
2348 --nlastpos;
2349
2350 /* An OR node is nullable if either argument is nullable. */
2351 nullable[-2] = nullable[-1] || nullable[-2];
2352 --nullable;
2353 break;
2354
2355 default:
2356 /* Anything else is a nonempty position. (Note that special
2357 constructs like \< are treated as nonempty strings here;
2358 an "epsilon closure" effectively makes them nullable later.
2359 Backreferences have to get a real position so we can detect
2360 transitions on them later. But they are nullable. */
2361 *nullable++ = d->tokens[i] == BACKREF;
2362
2363 /* This position is in its own firstpos and lastpos. */
2364 *nfirstpos++ = *nlastpos++ = 1;
2365 --firstpos, --lastpos;
2366 firstpos->index = lastpos->index = i;
2367 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2368
2369 /* Allocate the follow set for this position. */
2370 alloc_position_set (&d->follows[i], 1);
2371 break;
2372 }
2373#ifdef DEBUG
2374 /* ... balance the above nonsyntactic #ifdef goo... */
2375 fprintf (stderr, "node %zd:", i);
2376 prtok (d->tokens[i]);
2377 putc ('\n', stderr);
2378 fprintf (stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");
2379 fprintf (stderr, " firstpos:");
2380 for (j = nfirstpos[-1]; j-- > 0;)
2381 {
2382 fprintf (stderr, " %zd:", firstpos[j].index);
2383 prtok (d->tokens[firstpos[j].index]);
2384 }
2385 fprintf (stderr, "\n lastpos:");
2386 for (j = nlastpos[-1]; j-- > 0;)
2387 {
2388 fprintf (stderr, " %zd:", lastpos[j].index);
2389 prtok (d->tokens[lastpos[j].index]);
2390 }
2391 putc ('\n', stderr);
2392#endif
2393 }
2394
2395 /* For each follow set that is the follow set of a real position, replace
2396 it with its epsilon closure. */
2397 for (i = 0; i < d->tindex; ++i)
2398 if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
2399#if MBS_SUPPORT
2400 || d->tokens[i] == ANYCHAR || d->tokens[i] == MBCSET
2401#endif
2402 || d->tokens[i] >= CSET)
2403 {
2404#ifdef DEBUG
2405 fprintf (stderr, "follows(%zd:", i);
2406 prtok (d->tokens[i]);
2407 fprintf (stderr, "):");
2408 for (j = d->follows[i].nelem; j-- > 0;)
2409 {
2410 fprintf (stderr, " %zd:", d->follows[i].elems[j].index);
2411 prtok (d->tokens[d->follows[i].elems[j].index]);
2412 }
2413 putc ('\n', stderr);
2414#endif
2415 copy (&d->follows[i], &merged);
2416 epsclosure (&merged, d);
2417 copy (&merged, &d->follows[i]);
2418 }
2419
2420 /* Get the epsilon closure of the firstpos of the regexp. The result will
2421 be the set of positions of state 0. */
2422 merged.nelem = 0;
2423 for (i = 0; i < nfirstpos[-1]; ++i)
2424 insert (firstpos[i], &merged);
2425 epsclosure (&merged, d);
2426
2427 /* Build the initial state. */
2428 d->salloc = 1;
2429 d->sindex = 0;
2430 MALLOC (d->states, d->salloc);
2431
2432 separate_contexts = state_separate_contexts (&merged);
2433 state_index (d, &merged,
2434 (separate_contexts & CTX_NEWLINE
2435 ? CTX_NEWLINE : separate_contexts ^ CTX_ANY));
2436
2437 free (o_nullable);
2438 free (o_nfirst);
2439 free (o_firstpos);
2440 free (o_nlast);
2441 free (o_lastpos);
2442 free (merged.elems);
2443}
2444
2445
2446/* Find, for each character, the transition out of state s of d, and store
2447 it in the appropriate slot of trans.
2448
2449 We divide the positions of s into groups (positions can appear in more
2450 than one group). Each group is labeled with a set of characters that
2451 every position in the group matches (taking into account, if necessary,
2452 preceding context information of s). For each group, find the union
2453 of the its elements' follows. This set is the set of positions of the
2454 new state. For each character in the group's label, set the transition
2455 on this character to be to a state corresponding to the set's positions,
2456 and its associated backward context information, if necessary.
2457
2458 If we are building a searching matcher, we include the positions of state
2459 0 in every state.
2460
2461 The collection of groups is constructed by building an equivalence-class
2462 partition of the positions of s.
2463
2464 For each position, find the set of characters C that it matches. Eliminate
2465 any characters from C that fail on grounds of backward context.
2466
2467 Search through the groups, looking for a group whose label L has nonempty
2468 intersection with C. If L - C is nonempty, create a new group labeled
2469 L - C and having the same positions as the current group, and set L to
2470 the intersection of L and C. Insert the position in this group, set
2471 C = C - L, and resume scanning.
2472
2473 If after comparing with every group there are characters remaining in C,
2474 create a new group labeled with the characters of C and insert this
2475 position in that group. */
2476void
2477dfastate (state_num s, struct dfa *d, state_num trans[])
2478{
2479 leaf_set *grps; /* As many as will ever be needed. */
2480 charclass *labels; /* Labels corresponding to the groups. */
2481 size_t ngrps = 0; /* Number of groups actually used. */
2482 position pos; /* Current position being considered. */
2483 charclass matches; /* Set of matching characters. */
2484 int matchesf; /* True if matches is nonempty. */
2485 charclass intersect; /* Intersection with some label set. */
2486 int intersectf; /* True if intersect is nonempty. */
2487 charclass leftovers; /* Stuff in the label that didn't match. */
2488 int leftoversf; /* True if leftovers is nonempty. */
2489 position_set follows; /* Union of the follows of some group. */
2490 position_set tmp; /* Temporary space for merging sets. */
2491 int possible_contexts; /* Contexts that this group can match. */
2492 int separate_contexts; /* Context that new state wants to know. */
2493 state_num state; /* New state. */
2494 state_num state_newline; /* New state on a newline transition. */
2495 state_num state_letter; /* New state on a letter transition. */
2496 int next_isnt_1st_byte = 0; /* Flag if we can't add state0. */
2497 size_t i, j, k;
2498
2499 MALLOC (grps, NOTCHAR);
2500 MALLOC (labels, NOTCHAR);
2501
2502 zeroset (matches);
2503
2504 for (i = 0; i < d->states[s].elems.nelem; ++i)
2505 {
2506 pos = d->states[s].elems.elems[i];
2507 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
2508 setbit (d->tokens[pos.index], matches);
2509 else if (d->tokens[pos.index] >= CSET)
2510 copyset (d->charclasses[d->tokens[pos.index] - CSET], matches);
2511 else if (MBS_SUPPORT
2512 && (d->tokens[pos.index] == ANYCHAR
2513 || d->tokens[pos.index] == MBCSET))
2514 /* MB_CUR_MAX > 1 */
2515 {
2516 /* ANYCHAR and MBCSET must match with a single character, so we
2517 must put it to d->states[s].mbps, which contains the positions
2518 which can match with a single character not a byte. */
2519 if (d->states[s].mbps.nelem == 0)
2520 alloc_position_set (&d->states[s].mbps, 1);
2521 insert (pos, &(d->states[s].mbps));
2522 continue;
2523 }
2524 else
2525 continue;
2526
2527 /* Some characters may need to be eliminated from matches because
2528 they fail in the current context. */
2529 if (pos.constraint != NO_CONSTRAINT)
2530 {
2531 if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
2532 d->states[s].context, CTX_NEWLINE))
2533 for (j = 0; j < CHARCLASS_INTS; ++j)
2534 matches[j] &= ~newline[j];
2535 if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
2536 d->states[s].context, CTX_LETTER))
2537 for (j = 0; j < CHARCLASS_INTS; ++j)
2538 matches[j] &= ~letters[j];
2539 if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
2540 d->states[s].context, CTX_NONE))
2541 for (j = 0; j < CHARCLASS_INTS; ++j)
2542 matches[j] &= letters[j] | newline[j];
2543
2544 /* If there are no characters left, there's no point in going on. */
2545 for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
2546 continue;
2547 if (j == CHARCLASS_INTS)
2548 continue;
2549 }
2550
2551 for (j = 0; j < ngrps; ++j)
2552 {
2553 /* If matches contains a single character only, and the current
2554 group's label doesn't contain that character, go on to the
2555 next group. */
2556 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
2557 && !tstbit (d->tokens[pos.index], labels[j]))
2558 continue;
2559
2560 /* Check if this group's label has a nonempty intersection with
2561 matches. */
2562 intersectf = 0;
2563 for (k = 0; k < CHARCLASS_INTS; ++k)
2564 (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
2565 if (!intersectf)
2566 continue;
2567
2568 /* It does; now find the set differences both ways. */
2569 leftoversf = matchesf = 0;
2570 for (k = 0; k < CHARCLASS_INTS; ++k)
2571 {
2572 /* Even an optimizing compiler can't know this for sure. */
2573 int match = matches[k], label = labels[j][k];
2574
2575 (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
2576 (matches[k] = match & ~label) ? (matchesf = 1) : 0;
2577 }
2578
2579 /* If there were leftovers, create a new group labeled with them. */
2580 if (leftoversf)
2581 {
2582 copyset (leftovers, labels[ngrps]);
2583 copyset (intersect, labels[j]);
2584 MALLOC (grps[ngrps].elems, d->nleaves);
2585 memcpy (grps[ngrps].elems, grps[j].elems,
2586 sizeof (grps[j].elems[0]) * grps[j].nelem);
2587 grps[ngrps].nelem = grps[j].nelem;
2588 ++ngrps;
2589 }
2590
2591 /* Put the position in the current group. The constraint is
2592 irrelevant here. */
2593 grps[j].elems[grps[j].nelem++] = pos.index;
2594
2595 /* If every character matching the current position has been
2596 accounted for, we're done. */
2597 if (!matchesf)
2598 break;
2599 }
2600
2601 /* If we've passed the last group, and there are still characters
2602 unaccounted for, then we'll have to create a new group. */
2603 if (j == ngrps)
2604 {
2605 copyset (matches, labels[ngrps]);
2606 zeroset (matches);
2607 MALLOC (grps[ngrps].elems, d->nleaves);
2608 grps[ngrps].nelem = 1;
2609 grps[ngrps].elems[0] = pos.index;
2610 ++ngrps;
2611 }
2612 }
2613
2614 alloc_position_set (&follows, d->nleaves);
2615 alloc_position_set (&tmp, d->nleaves);
2616
2617 /* If we are a searching matcher, the default transition is to a state
2618 containing the positions of state 0, otherwise the default transition
2619 is to fail miserably. */
2620 if (d->searchflag)
2621 {
2622 /* Find the state(s) corresponding to the positions of state 0. */
2623 copy (&d->states[0].elems, &follows);
2624 separate_contexts = state_separate_contexts (&follows);
2625 state = state_index (d, &follows, separate_contexts ^ CTX_ANY);
2626 if (separate_contexts & CTX_NEWLINE)
2627 state_newline = state_index (d, &follows, CTX_NEWLINE);
2628 else
2629 state_newline = state;
2630 if (separate_contexts & CTX_LETTER)
2631 state_letter = state_index (d, &follows, CTX_LETTER);
2632 else
2633 state_letter = state;
2634
2635 for (i = 0; i < NOTCHAR; ++i)
2636 trans[i] = (IS_WORD_CONSTITUENT (i)) ? state_letter : state;
2637 trans[eolbyte] = state_newline;
2638 }
2639 else
2640 for (i = 0; i < NOTCHAR; ++i)
2641 trans[i] = -1;
2642
2643 for (i = 0; i < ngrps; ++i)
2644 {
2645 follows.nelem = 0;
2646
2647 /* Find the union of the follows of the positions of the group.
2648 This is a hideously inefficient loop. Fix it someday. */
2649 for (j = 0; j < grps[i].nelem; ++j)
2650 for (k = 0; k < d->follows[grps[i].elems[j]].nelem; ++k)
2651 insert (d->follows[grps[i].elems[j]].elems[k], &follows);
2652
2653 if (d->mb_cur_max > 1)
2654 {
2655 /* If a token in follows.elems is not 1st byte of a multibyte
2656 character, or the states of follows must accept the bytes
2657 which are not 1st byte of the multibyte character.
2658 Then, if a state of follows encounter a byte, it must not be
2659 a 1st byte of a multibyte character nor single byte character.
2660 We cansel to add state[0].follows to next state, because
2661 state[0] must accept 1st-byte
2662
2663 For example, we assume <sb a> is a certain single byte
2664 character, <mb A> is a certain multibyte character, and the
2665 codepoint of <sb a> equals the 2nd byte of the codepoint of
2666 <mb A>.
2667 When state[0] accepts <sb a>, state[i] transit to state[i+1]
2668 by accepting accepts 1st byte of <mb A>, and state[i+1]
2669 accepts 2nd byte of <mb A>, if state[i+1] encounter the
2670 codepoint of <sb a>, it must not be <sb a> but 2nd byte of
2671 <mb A>, so we cannot add state[0]. */
2672
2673 next_isnt_1st_byte = 0;
2674 for (j = 0; j < follows.nelem; ++j)
2675 {
2676 if (!(d->multibyte_prop[follows.elems[j].index] & 1))
2677 {
2678 next_isnt_1st_byte = 1;
2679 break;
2680 }
2681 }
2682 }
2683
2684 /* If we are building a searching matcher, throw in the positions
2685 of state 0 as well. */
2686 if (d->searchflag
2687 && (!MBS_SUPPORT || (d->mb_cur_max == 1 || !next_isnt_1st_byte)))
2688 for (j = 0; j < d->states[0].elems.nelem; ++j)
2689 insert (d->states[0].elems.elems[j], &follows);
2690
2691 /* Find out if the new state will want any context information. */
2692 possible_contexts = charclass_context (labels[i]);
2693 separate_contexts = state_separate_contexts (&follows);
2694
2695 /* Find the state(s) corresponding to the union of the follows. */
2696 if ((separate_contexts & possible_contexts) != possible_contexts)
2697 state = state_index (d, &follows, separate_contexts ^ CTX_ANY);
2698 else
2699 state = -1;
2700 if (separate_contexts & possible_contexts & CTX_NEWLINE)
2701 state_newline = state_index (d, &follows, CTX_NEWLINE);
2702 else
2703 state_newline = state;
2704 if (separate_contexts & possible_contexts & CTX_LETTER)
2705 state_letter = state_index (d, &follows, CTX_LETTER);
2706 else
2707 state_letter = state;
2708
2709 /* Set the transitions for each character in the current label. */
2710 for (j = 0; j < CHARCLASS_INTS; ++j)
2711 for (k = 0; k < INTBITS; ++k)
2712 if (labels[i][j] & 1 << k)
2713 {
2714 int c = j * INTBITS + k;
2715
2716 if (c == eolbyte)
2717 trans[c] = state_newline;
2718 else if (IS_WORD_CONSTITUENT (c))
2719 trans[c] = state_letter;
2720 else if (c < NOTCHAR)
2721 trans[c] = state;
2722 }
2723 }
2724
2725 for (i = 0; i < ngrps; ++i)
2726 free (grps[i].elems);
2727 free (follows.elems);
2728 free (tmp.elems);
2729 free (grps);
2730 free (labels);
2731}
2732
2733/* Some routines for manipulating a compiled dfa's transition tables.
2734 Each state may or may not have a transition table; if it does, and it
2735 is a non-accepting state, then d->trans[state] points to its table.
2736 If it is an accepting state then d->fails[state] points to its table.
2737 If it has no table at all, then d->trans[state] is NULL.
2738 TODO: Improve this comment, get rid of the unnecessary redundancy. */
2739
2740static void
2741build_state (state_num s, struct dfa *d)
2742{
2743 state_num *trans; /* The new transition table. */
2744 state_num i;
2745
2746 /* Set an upper limit on the number of transition tables that will ever
2747 exist at once. 1024 is arbitrary. The idea is that the frequently
2748 used transition tables will be quickly rebuilt, whereas the ones that
2749 were only needed once or twice will be cleared away. */
2750 if (d->trcount >= 1024)
2751 {
2752 for (i = 0; i < d->tralloc; ++i)
2753 {
2754 free (d->trans[i]);
2755 free (d->fails[i]);
2756 d->trans[i] = d->fails[i] = NULL;
2757 }
2758 d->trcount = 0;
2759 }
2760
2761 ++d->trcount;
2762
2763 /* Set up the success bits for this state. */
2764 d->success[s] = 0;
2765 if (ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_NEWLINE, s, *d))
2766 d->success[s] |= CTX_NEWLINE;
2767 if (ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_LETTER, s, *d))
2768 d->success[s] |= CTX_LETTER;
2769 if (ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_NONE, s, *d))
2770 d->success[s] |= CTX_NONE;
2771
2772 MALLOC (trans, NOTCHAR);
2773 dfastate (s, d, trans);
2774
2775 /* Now go through the new transition table, and make sure that the trans
2776 and fail arrays are allocated large enough to hold a pointer for the
2777 largest state mentioned in the table. */
2778 for (i = 0; i < NOTCHAR; ++i)
2779 if (trans[i] >= d->tralloc)
2780 {
2781 state_num oldalloc = d->tralloc;
2782
2783 while (trans[i] >= d->tralloc)
2784 d->tralloc *= 2;
2785 REALLOC (d->realtrans, d->tralloc + 1);
2786 d->trans = d->realtrans + 1;
2787 REALLOC (d->fails, d->tralloc);
2788 REALLOC (d->success, d->tralloc);
2789 REALLOC (d->newlines, d->tralloc);
2790 while (oldalloc < d->tralloc)
2791 {
2792 d->trans[oldalloc] = NULL;
2793 d->fails[oldalloc++] = NULL;
2794 }
2795 }
2796
2797 /* Keep the newline transition in a special place so we can use it as
2798 a sentinel. */
2799 d->newlines[s] = trans[eolbyte];
2800 trans[eolbyte] = -1;
2801
2802 if (ACCEPTING (s, *d))
2803 d->fails[s] = trans;
2804 else
2805 d->trans[s] = trans;
2806}
2807
2808static void
2809build_state_zero (struct dfa *d)
2810{
2811 d->tralloc = 1;
2812 d->trcount = 0;
2813 CALLOC (d->realtrans, d->tralloc + 1);
2814 d->trans = d->realtrans + 1;
2815 CALLOC (d->fails, d->tralloc);
2816 MALLOC (d->success, d->tralloc);
2817 MALLOC (d->newlines, d->tralloc);
2818 build_state (0, d);
2819}
2820
2821/* Multibyte character handling sub-routines for dfaexec. */
2822
2823/* Initial state may encounter the byte which is not a single byte character
2824 nor 1st byte of a multibyte character. But it is incorrect for initial
2825 state to accept such a byte.
2826 For example, in sjis encoding the regular expression like "\\" accepts
2827 the codepoint 0x5c, but should not accept the 2nd byte of the codepoint
2828 0x815c. Then Initial state must skip the bytes which are not a single byte
2829 character nor 1st byte of a multibyte character. */
2830#define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p) \
2831 if (s == 0) \
2832 { \
2833 while (inputwcs[p - buf_begin] == 0 \
2834 && mblen_buf[p - buf_begin] > 0 \
2835 && (unsigned char const *) p < buf_end) \
2836 ++p; \
2837 if ((char *) p >= end) \
2838 { \
2839 free (mblen_buf); \
2840 free (inputwcs); \
2841 *end = saved_end; \
2842 return NULL; \
2843 } \
2844 }
2845
2846static void
2847realloc_trans_if_necessary (struct dfa *d, state_num new_state)
2848{
2849 /* Make sure that the trans and fail arrays are allocated large enough
2850 to hold a pointer for the new state. */
2851 if (new_state >= d->tralloc)
2852 {
2853 state_num oldalloc = d->tralloc;
2854
2855 while (new_state >= d->tralloc)
2856 d->tralloc *= 2;
2857 REALLOC (d->realtrans, d->tralloc + 1);
2858 d->trans = d->realtrans + 1;
2859 REALLOC (d->fails, d->tralloc);
2860 REALLOC (d->success, d->tralloc);
2861 REALLOC (d->newlines, d->tralloc);
2862 while (oldalloc < d->tralloc)
2863 {
2864 d->trans[oldalloc] = NULL;
2865 d->fails[oldalloc++] = NULL;
2866 }
2867 }
2868}
2869
2870/* Return values of transit_state_singlebyte(), and
2871 transit_state_consume_1char. */
2872typedef enum
2873{
2874 TRANSIT_STATE_IN_PROGRESS, /* State transition has not finished. */
2875 TRANSIT_STATE_DONE, /* State transition has finished. */
2876 TRANSIT_STATE_END_BUFFER /* Reach the end of the buffer. */
2877} status_transit_state;
2878
2879/* Consume a single byte and transit state from 's' to '*next_state'.
2880 This function is almost same as the state transition routin in dfaexec().
2881 But state transition is done just once, otherwise matching succeed or
2882 reach the end of the buffer. */
2883static status_transit_state
2884transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const *p,
2885 state_num * next_state)
2886{
2887 state_num *t;
2888 state_num works = s;
2889
2890 status_transit_state rval = TRANSIT_STATE_IN_PROGRESS;
2891
2892 while (rval == TRANSIT_STATE_IN_PROGRESS)
2893 {
2894 if ((t = d->trans[works]) != NULL)
2895 {
2896 works = t[*p];
2897 rval = TRANSIT_STATE_DONE;
2898 if (works < 0)
2899 works = 0;
2900 }
2901 else if (works < 0)
2902 {
2903 if (p == buf_end)
2904 {
2905 /* At the moment, it must not happen. */
2906 abort ();
2907 }
2908 works = 0;
2909 }
2910 else if (d->fails[works])
2911 {
2912 works = d->fails[works][*p];
2913 rval = TRANSIT_STATE_DONE;
2914 }
2915 else
2916 {
2917 build_state (works, d);
2918 }
2919 }
2920 *next_state = works;
2921 return rval;
2922}
2923
2924/* Match a "." against the current context. buf_begin[IDX] is the
2925 current position. Return the length of the match, in bytes.
2926 POS is the position of the ".". */
2927static int
2928match_anychar (struct dfa *d, state_num s, position pos, size_t idx)
2929{
2930 int context;
2931 wchar_t wc;
2932 int mbclen;
2933
2934 wc = inputwcs[idx];
2935 mbclen = (mblen_buf[idx] == 0) ? 1 : mblen_buf[idx];
2936
2937 /* Check syntax bits. */
2938 if (wc == (wchar_t) eolbyte)
2939 {
2940 if (!(syntax_bits & RE_DOT_NEWLINE))
2941 return 0;
2942 }
2943 else if (wc == (wchar_t) '\0')
2944 {
2945 if (syntax_bits & RE_DOT_NOT_NULL)
2946 return 0;
2947 }
2948
2949 context = wchar_context (wc);
2950 if (!SUCCEEDS_IN_CONTEXT (pos.constraint, d->states[s].context, context))
2951 return 0;
2952
2953 return mbclen;
2954}
2955
2956/* Match a bracket expression against the current context.
2957 buf_begin[IDX] is the current position.
2958 Return the length of the match, in bytes.
2959 POS is the position of the bracket expression. */
2960static int
2961match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx)
2962{
2963 size_t i;
2964 int match; /* Flag which represent that matching succeed. */
2965 int match_len; /* Length of the character (or collating element)
2966 with which this operator match. */
2967 int op_len; /* Length of the operator. */
2968 char buffer[128];
2969 wchar_t wcbuf[6];
2970
2971 /* Pointer to the structure to which we are currently referring. */
2972 struct mb_char_classes *work_mbc;
2973
2974 int context;
2975 wchar_t wc; /* Current referring character. */
2976
2977 wc = inputwcs[idx];
2978
2979 /* Check syntax bits. */
2980 if (wc == (wchar_t) eolbyte)
2981 {
2982 if (!(syntax_bits & RE_DOT_NEWLINE))
2983 return 0;
2984 }
2985 else if (wc == (wchar_t) '\0')
2986 {
2987 if (syntax_bits & RE_DOT_NOT_NULL)
2988 return 0;
2989 }
2990
2991 context = wchar_context (wc);
2992 if (!SUCCEEDS_IN_CONTEXT (pos.constraint, d->states[s].context, context))
2993 return 0;
2994
2995 /* Assign the current referring operator to work_mbc. */
2996 work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]);
2997 match = !work_mbc->invert;
2998 match_len = (mblen_buf[idx] == 0) ? 1 : mblen_buf[idx];
2999
3000 /* Match in range 0-255? */
3001 if (wc < NOTCHAR && work_mbc->cset != -1
3002 && tstbit ((unsigned char) wc, d->charclasses[work_mbc->cset]))
3003 goto charset_matched;
3004
3005 /* match with a character class? */
3006 for (i = 0; i < work_mbc->nch_classes; i++)
3007 {
3008 if (iswctype ((wint_t) wc, work_mbc->ch_classes[i]))
3009 goto charset_matched;
3010 }
3011
3012 strncpy (buffer, (char const *) buf_begin + idx, match_len);
3013 buffer[match_len] = '\0';
3014
3015 /* match with an equivalence class? */
3016 for (i = 0; i < work_mbc->nequivs; i++)
3017 {
3018 op_len = strlen (work_mbc->equivs[i]);
3019 strncpy (buffer, (char const *) buf_begin + idx, op_len);
3020 buffer[op_len] = '\0';
3021 if (strcoll (work_mbc->equivs[i], buffer) == 0)
3022 {
3023 match_len = op_len;
3024 goto charset_matched;
3025 }
3026 }
3027
3028 /* match with a collating element? */
3029 for (i = 0; i < work_mbc->ncoll_elems; i++)
3030 {
3031 op_len = strlen (work_mbc->coll_elems[i]);
3032 strncpy (buffer, (char const *) buf_begin + idx, op_len);
3033 buffer[op_len] = '\0';
3034
3035 if (strcoll (work_mbc->coll_elems[i], buffer) == 0)
3036 {
3037 match_len = op_len;
3038 goto charset_matched;
3039 }
3040 }
3041
3042 wcbuf[0] = wc;
3043 wcbuf[1] = wcbuf[3] = wcbuf[5] = '\0';
3044
3045 /* match with a range? */
3046 for (i = 0; i < work_mbc->nranges; i++)
3047 {
3048 wcbuf[2] = work_mbc->range_sts[i];
3049 wcbuf[4] = work_mbc->range_ends[i];
3050
3051 if (wcscoll (wcbuf, wcbuf + 2) >= 0 && wcscoll (wcbuf + 4, wcbuf) >= 0)
3052 goto charset_matched;
3053 }
3054
3055 /* match with a character? */
3056 for (i = 0; i < work_mbc->nchars; i++)
3057 {
3058 if (wc == work_mbc->chars[i])
3059 goto charset_matched;
3060 }
3061
3062 match = !match;
3063
3064charset_matched:
3065 return match ? match_len : 0;
3066}
3067
3068/* Check each of `d->states[s].mbps.elem' can match or not. Then return the
3069 array which corresponds to `d->states[s].mbps.elem' and each element of
3070 the array contains the amount of the bytes with which the element can
3071 match.
3072 `idx' is the index from the buf_begin, and it is the current position
3073 in the buffer.
3074 Caller MUST free the array which this function return. */
3075static int *
3076check_matching_with_multibyte_ops (struct dfa *d, state_num s, size_t idx)
3077{
3078 size_t i;
3079 int *rarray;
3080
3081 MALLOC (rarray, d->states[s].mbps.nelem);
3082 for (i = 0; i < d->states[s].mbps.nelem; ++i)
3083 {
3084 position pos = d->states[s].mbps.elems[i];
3085 switch (d->tokens[pos.index])
3086 {
3087 case ANYCHAR:
3088 rarray[i] = match_anychar (d, s, pos, idx);
3089 break;
3090 case MBCSET:
3091 rarray[i] = match_mb_charset (d, s, pos, idx);
3092 break;
3093 default:
3094 break; /* cannot happen. */
3095 }
3096 }
3097 return rarray;
3098}
3099
3100/* Consume a single character and enumerate all of the positions which can
3101 be next position from the state `s'.
3102 `match_lens' is the input. It can be NULL, but it can also be the output
3103 of check_matching_with_multibyte_ops() for optimization.
3104 `mbclen' and `pps' are the output. `mbclen' is the length of the
3105 character consumed, and `pps' is the set this function enumerate. */
3106static status_transit_state
3107transit_state_consume_1char (struct dfa *d, state_num s,
3108 unsigned char const **pp,
3109 int *match_lens, int *mbclen, position_set * pps)
3110{
3111 size_t i, j;
3112 int k;
3113 state_num s1, s2;
3114 int *work_mbls;
3115 status_transit_state rs = TRANSIT_STATE_DONE;
3116
3117 /* Calculate the length of the (single/multi byte) character
3118 to which p points. */
3119 *mbclen = (mblen_buf[*pp - buf_begin] == 0) ? 1 : mblen_buf[*pp - buf_begin];
3120
3121 /* Calculate the state which can be reached from the state `s' by
3122 consuming `*mbclen' single bytes from the buffer. */
3123 s1 = s;
3124 for (k = 0; k < *mbclen; k++)
3125 {
3126 s2 = s1;
3127 rs = transit_state_singlebyte (d, s2, (*pp)++, &s1);
3128 }
3129 /* Copy the positions contained by `s1' to the set `pps'. */
3130 copy (&(d->states[s1].elems), pps);
3131
3132 /* Check (input) match_lens, and initialize if it is NULL. */
3133 if (match_lens == NULL && d->states[s].mbps.nelem != 0)
3134 work_mbls = check_matching_with_multibyte_ops (d, s, *pp - buf_begin);
3135 else
3136 work_mbls = match_lens;
3137
3138 /* Add all of the positions which can be reached from `s' by consuming
3139 a single character. */
3140 for (i = 0; i < d->states[s].mbps.nelem; i++)
3141 {
3142 if (work_mbls[i] == *mbclen)
3143 for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
3144 j++)
3145 insert (d->follows[d->states[s].mbps.elems[i].index].elems[j], pps);
3146 }
3147
3148 if (match_lens == NULL && work_mbls != NULL)
3149 free (work_mbls);
3150
3151 /* FIXME: this return value is always ignored. */
3152 return rs;
3153}
3154
3155/* Transit state from s, then return new state and update the pointer of the
3156 buffer. This function is for some operator which can match with a multi-
3157 byte character or a collating element (which may be multi characters). */
3158static state_num
3159transit_state (struct dfa *d, state_num s, unsigned char const **pp)
3160{
3161 state_num s1;
3162 int mbclen; /* The length of current input multibyte character. */
3163 int maxlen = 0;
3164 size_t i, j;
3165 int *match_lens = NULL;
3166 size_t nelem = d->states[s].mbps.nelem; /* Just a alias. */
3167 position_set follows;
3168 unsigned char const *p1 = *pp;
3169 wchar_t wc;
3170
3171 if (nelem > 0)
3172 /* This state has (a) multibyte operator(s).
3173 We check whether each of them can match or not. */
3174 {
3175 /* Note: caller must free the return value of this function. */
3176 match_lens = check_matching_with_multibyte_ops (d, s, *pp - buf_begin);
3177
3178 for (i = 0; i < nelem; i++)
3179 /* Search the operator which match the longest string,
3180 in this state. */
3181 {
3182 if (match_lens[i] > maxlen)
3183 maxlen = match_lens[i];
3184 }
3185 }
3186
3187 if (nelem == 0 || maxlen == 0)
3188 /* This state has no multibyte operator which can match.
3189 We need to check only one single byte character. */
3190 {
3191 status_transit_state rs;
3192 rs = transit_state_singlebyte (d, s, *pp, &s1);
3193
3194 /* We must update the pointer if state transition succeeded. */
3195 if (rs == TRANSIT_STATE_DONE)
3196 ++*pp;
3197
3198 free (match_lens);
3199 return s1;
3200 }
3201
3202 /* This state has some operators which can match a multibyte character. */
3203 alloc_position_set (&follows, d->nleaves);
3204
3205 /* `maxlen' may be longer than the length of a character, because it may
3206 not be a character but a (multi character) collating element.
3207 We enumerate all of the positions which `s' can reach by consuming
3208 `maxlen' bytes. */
3209 transit_state_consume_1char (d, s, pp, match_lens, &mbclen, &follows);
3210
3211 wc = inputwcs[*pp - mbclen - buf_begin];
3212 s1 = state_index (d, &follows, wchar_context (wc));
3213 realloc_trans_if_necessary (d, s1);
3214
3215 while (*pp - p1 < maxlen)
3216 {
3217 transit_state_consume_1char (d, s1, pp, NULL, &mbclen, &follows);
3218
3219 for (i = 0; i < nelem; i++)
3220 {
3221 if (match_lens[i] == *pp - p1)
3222 for (j = 0;
3223 j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++)
3224 insert (d->follows[d->states[s1].mbps.elems[i].index].elems[j],
3225 &follows);
3226 }
3227
3228 wc = inputwcs[*pp - mbclen - buf_begin];
3229 s1 = state_index (d, &follows, wchar_context (wc));
3230 realloc_trans_if_necessary (d, s1);
3231 }
3232 free (match_lens);
3233 free (follows.elems);
3234 return s1;
3235}
3236
3237
3238/* Initialize mblen_buf and inputwcs with data from the next line. */
3239
3240static void
3241prepare_wc_buf (const char *begin, const char *end)
3242{
3243#if MBS_SUPPORT
3244 unsigned char eol = eolbyte;
3245 size_t remain_bytes, i;
3246
3247 buf_begin = (unsigned char *) begin;
3248
3249 remain_bytes = 0;
3250 for (i = 0; i < end - begin + 1; i++)
3251 {
3252 if (remain_bytes == 0)
3253 {
3254 remain_bytes
3255 = mbrtowc (inputwcs + i, begin + i, end - begin - i + 1, &mbs);
3256 if (remain_bytes < 1
3257 || remain_bytes == (size_t) -1
3258 || remain_bytes == (size_t) -2
3259 || (remain_bytes == 1 && inputwcs[i] == (wchar_t) begin[i]))
3260 {
3261 remain_bytes = 0;
3262 inputwcs[i] = (wchar_t) begin[i];
3263 mblen_buf[i] = 0;
3264 if (begin[i] == eol)
3265 break;
3266 }
3267 else
3268 {
3269 mblen_buf[i] = remain_bytes;
3270 remain_bytes--;
3271 }
3272 }
3273 else
3274 {
3275 mblen_buf[i] = remain_bytes;
3276 inputwcs[i] = 0;
3277 remain_bytes--;
3278 }
3279 }
3280
3281 buf_end = (unsigned char *) (begin + i);
3282 mblen_buf[i] = 0;
3283 inputwcs[i] = 0; /* sentinel */
3284#endif /* MBS_SUPPORT */
3285}
3286
3287/* Search through a buffer looking for a match to the given struct dfa.
3288 Find the first occurrence of a string matching the regexp in the
3289 buffer, and the shortest possible version thereof. Return a pointer to
3290 the first character after the match, or NULL if none is found. BEGIN
3291 points to the beginning of the buffer, and END points to the first byte
3292 after its end. Note however that we store a sentinel byte (usually
3293 newline) in *END, so the actual buffer must be one byte longer.
3294 When ALLOW_NL is nonzero, newlines may appear in the matching string.
3295 If COUNT is non-NULL, increment *COUNT once for each newline processed.
3296 Finally, if BACKREF is non-NULL set *BACKREF to indicate whether we
3297 encountered a back-reference (1) or not (0). The caller may use this
3298 to decide whether to fall back on a backtracking matcher. */
3299char *
3300dfaexec (struct dfa *d, char const *begin, char *end,
3301 int allow_nl, size_t *count, int *backref)
3302{
3303 state_num s, s1; /* Current state. */
3304 unsigned char const *p; /* Current input character. */
3305 state_num **trans, *t; /* Copy of d->trans so it can be optimized
3306 into a register. */
3307 unsigned char eol = eolbyte; /* Likewise for eolbyte. */
3308 unsigned char saved_end;
3309
3310 if (!d->tralloc)
3311 build_state_zero (d);
3312
3313 s = s1 = 0;
3314 p = (unsigned char const *) begin;
3315 trans = d->trans;
3316 saved_end = *(unsigned char *) end;
3317 *end = eol;
3318
3319 if (d->mb_cur_max > 1)
3320 {
3321 MALLOC (mblen_buf, end - begin + 2);
3322 MALLOC (inputwcs, end - begin + 2);
3323 memset (&mbs, 0, sizeof (mbstate_t));
3324 prepare_wc_buf ((const char *) p, end);
3325 }
3326
3327 for (;;)
3328 {
3329 if (d->mb_cur_max > 1)
3330 while ((t = trans[s]) != NULL)
3331 {
3332 if (p > buf_end)
3333 break;
3334 s1 = s;
3335 SKIP_REMAINS_MB_IF_INITIAL_STATE (s, p);
3336
3337 if (d->states[s].mbps.nelem == 0)
3338 {
3339 s = t[*p++];
3340 continue;
3341 }
3342
3343 /* Falling back to the glibc matcher in this case gives
3344 better performance (up to 25% better on [a-z], for
3345 example) and enables support for collating symbols and
3346 equivalence classes. */
3347 if (backref)
3348 {
3349 *backref = 1;
3350 free (mblen_buf);
3351 free (inputwcs);
3352 *end = saved_end;
3353 return (char *) p;
3354 }
3355
3356 /* Can match with a multibyte character (and multi character
3357 collating element). Transition table might be updated. */
3358 s = transit_state (d, s, &p);
3359 trans = d->trans;
3360 }
3361 else
3362 {
3363 while ((t = trans[s]) != NULL)
3364 {
3365 s1 = t[*p++];
3366 if ((t = trans[s1]) == NULL)
3367 {
3368 state_num tmp = s;
3369 s = s1;
3370 s1 = tmp; /* swap */
3371 break;
3372 }
3373 s = t[*p++];
3374 }
3375 }
3376
3377 if (s >= 0 && (char *) p <= end && d->fails[s])
3378 {
3379 if (d->success[s] & sbit[*p])
3380 {
3381 if (backref)
3382 *backref = (d->states[s].backref != 0);
3383 if (d->mb_cur_max > 1)
3384 {
3385 free (mblen_buf);
3386 free (inputwcs);
3387 }
3388 *end = saved_end;
3389 return (char *) p;
3390 }
3391
3392 s1 = s;
3393 if (d->mb_cur_max > 1)
3394 {
3395 /* Can match with a multibyte character (and multicharacter
3396 collating element). Transition table might be updated. */
3397 s = transit_state (d, s, &p);
3398 trans = d->trans;
3399 }
3400 else
3401 s = d->fails[s][*p++];
3402 continue;
3403 }
3404
3405 /* If the previous character was a newline, count it. */
3406 if ((char *) p <= end && p[-1] == eol)
3407 {
3408 if (count)
3409 ++*count;
3410
3411 if (d->mb_cur_max > 1)
3412 prepare_wc_buf ((const char *) p, end);
3413 }
3414
3415 /* Check if we've run off the end of the buffer. */
3416 if ((char *) p > end)
3417 {
3418 if (d->mb_cur_max > 1)
3419 {
3420 free (mblen_buf);
3421 free (inputwcs);
3422 }
3423 *end = saved_end;
3424 return NULL;
3425 }
3426
3427 if (s >= 0)
3428 {
3429 build_state (s, d);
3430 trans = d->trans;
3431 continue;
3432 }
3433
3434 if (p[-1] == eol && allow_nl)
3435 {
3436 s = d->newlines[s1];
3437 continue;
3438 }
3439
3440 s = 0;
3441 }
3442}
3443
3444static void
3445free_mbdata (struct dfa *d)
3446{
3447 size_t i;
3448
3449 free (d->multibyte_prop);
3450 d->multibyte_prop = NULL;
3451
3452 for (i = 0; i < d->nmbcsets; ++i)
3453 {
3454 size_t j;
3455 struct mb_char_classes *p = &(d->mbcsets[i]);
3456 free (p->chars);
3457 free (p->ch_classes);
3458 free (p->range_sts);
3459 free (p->range_ends);
3460
3461 for (j = 0; j < p->nequivs; ++j)
3462 free (p->equivs[j]);
3463 free (p->equivs);
3464
3465 for (j = 0; j < p->ncoll_elems; ++j)
3466 free (p->coll_elems[j]);
3467 free (p->coll_elems);
3468 }
3469
3470 free (d->mbcsets);
3471 d->mbcsets = NULL;
3472 d->nmbcsets = 0;
3473}
3474
3475/* Initialize the components of a dfa that the other routines don't
3476 initialize for themselves. */
3477void
3478dfainit (struct dfa *d)
3479{
3480 memset (d, 0, sizeof *d);
3481
3482 d->calloc = 1;
3483 MALLOC (d->charclasses, d->calloc);
3484
3485 d->talloc = 1;
3486 MALLOC (d->tokens, d->talloc);
3487
3488 d->mb_cur_max = MB_CUR_MAX;
3489
3490 if (d->mb_cur_max > 1)
3491 {
3492 d->nmultibyte_prop = 1;
3493 MALLOC (d->multibyte_prop, d->nmultibyte_prop);
3494 d->mbcsets_alloc = 1;
3495 MALLOC (d->mbcsets, d->mbcsets_alloc);
3496 }
3497}
3498
3499static void
3500dfaoptimize (struct dfa *d)
3501{
3502 size_t i;
3503
3504 if (!MBS_SUPPORT || !using_utf8 ())
3505 return;
3506
3507 for (i = 0; i < d->tindex; ++i)
3508 {
3509 switch (d->tokens[i])
3510 {
3511 case ANYCHAR:
3512 /* Lowered. */
3513 abort ();
3514 case MBCSET:
3515 /* Requires multi-byte algorithm. */
3516 return;
3517 default:
3518 break;
3519 }
3520 }
3521
3522 free_mbdata (d);
3523 d->mb_cur_max = 1;
3524}
3525
3526/* Parse and analyze a single string of the given length. */
3527void
3528dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
3529{
3530 dfainit (d);
3531 dfaparse (s, len, d);
3532 dfamust (d);
3533 dfaoptimize (d);
3534 dfaanalyze (d, searchflag);
3535}
3536
3537/* Free the storage held by the components of a dfa. */
3538void
3539dfafree (struct dfa *d)
3540{
3541 size_t i;
3542 struct dfamust *dm, *ndm;
3543
3544 free (d->charclasses);
3545 free (d->tokens);
3546
3547 if (d->mb_cur_max > 1)
3548 free_mbdata (d);
3549
3550 for (i = 0; i < d->sindex; ++i)
3551 {
3552 free (d->states[i].elems.elems);
3553 if (MBS_SUPPORT)
3554 free (d->states[i].mbps.elems);
3555 }
3556 free (d->states);
3557 for (i = 0; i < d->tindex; ++i)
3558 free (d->follows[i].elems);
3559 free (d->follows);
3560 for (i = 0; i < d->tralloc; ++i)
3561 {
3562 free (d->trans[i]);
3563 free (d->fails[i]);
3564 }
3565 free (d->realtrans);
3566 free (d->fails);
3567 free (d->newlines);
3568 free (d->success);
3569 for (dm = d->musts; dm; dm = ndm)
3570 {
3571 ndm = dm->next;
3572 free (dm->must);
3573 free (dm);
3574 }
3575}
3576
3577/* Having found the postfix representation of the regular expression,
3578 try to find a long sequence of characters that must appear in any line
3579 containing the r.e.
3580 Finding a "longest" sequence is beyond the scope here;
3581 we take an easy way out and hope for the best.
3582 (Take "(ab|a)b"--please.)
3583
3584 We do a bottom-up calculation of sequences of characters that must appear
3585 in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3586 representation:
3587 sequences that must appear at the left of the match ("left")
3588 sequences that must appear at the right of the match ("right")
3589 lists of sequences that must appear somewhere in the match ("in")
3590 sequences that must constitute the match ("is")
3591
3592 When we get to the root of the tree, we use one of the longest of its
3593 calculated "in" sequences as our answer. The sequence we find is returned in
3594 d->must (where "d" is the single argument passed to "dfamust");
3595 the length of the sequence is returned in d->mustn.
3596
3597 The sequences calculated for the various types of node (in pseudo ANSI c)
3598 are shown below. "p" is the operand of unary operators (and the left-hand
3599 operand of binary operators); "q" is the right-hand operand of binary
3600 operators.
3601
3602 "ZERO" means "a zero-length sequence" below.
3603
3604 Type left right is in
3605 ---- ---- ----- -- --
3606 char c # c # c # c # c
3607
3608 ANYCHAR ZERO ZERO ZERO ZERO
3609
3610 MBCSET ZERO ZERO ZERO ZERO
3611
3612 CSET ZERO ZERO ZERO ZERO
3613
3614 STAR ZERO ZERO ZERO ZERO
3615
3616 QMARK ZERO ZERO ZERO ZERO
3617
3618 PLUS p->left p->right ZERO p->in
3619
3620 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
3621 p->left : q->right : q->is!=ZERO) ? q->in plus
3622 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
3623 ZERO
3624
3625 OR longest common longest common (do p->is and substrings common to
3626 leading trailing q->is have same p->in and q->in
3627 (sub)sequence (sub)sequence length and
3628 of p->left of p->right content) ?
3629 and q->left and q->right p->is : NULL
3630
3631 If there's anything else we recognize in the tree, all four sequences get set
3632 to zero-length sequences. If there's something we don't recognize in the tree,
3633 we just return a zero-length sequence.
3634
3635 Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3636 'aaa')?
3637
3638 And. . .is it here or someplace that we might ponder "optimizations" such as
3639 egrep 'psi|epsilon' -> egrep 'psi'
3640 egrep 'pepsi|epsilon' -> egrep 'epsi'
3641 (Yes, we now find "epsi" as a "string
3642 that must occur", but we might also
3643 simplify the *entire* r.e. being sought)
3644 grep '[c]' -> grep 'c'
3645 grep '(ab|a)b' -> grep 'ab'
3646 grep 'ab*' -> grep 'a'
3647 grep 'a*b' -> grep 'b'
3648
3649 There are several issues:
3650
3651 Is optimization easy (enough)?
3652
3653 Does optimization actually accomplish anything,
3654 or is the automaton you get from "psi|epsilon" (for example)
3655 the same as the one you get from "psi" (for example)?
3656
3657 Are optimizable r.e.'s likely to be used in real-life situations
3658 (something like 'ab*' is probably unlikely; something like is
3659 'psi|epsilon' is likelier)? */
3660
3661static char *
3662icatalloc (char *old, char const *new)
3663{
3664 char *result;
3665 size_t oldsize = old == NULL ? 0 : strlen (old);
3666 size_t newsize = new == NULL ? 0 : strlen (new);
3667 if (newsize == 0)
3668 return old;
3669 result = xrealloc (old, oldsize + newsize + 1);
3670 memcpy (result + oldsize, new, newsize + 1);
3671 return result;
3672}
3673
3674static char *
3675icpyalloc (char const *string)
3676{
3677 return icatalloc (NULL, string);
3678}
3679
3680static char *_GL_ATTRIBUTE_PURE
3681istrstr (char const *lookin, char const *lookfor)
3682{
3683 char const *cp;
3684 size_t len;
3685
3686 len = strlen (lookfor);
3687 for (cp = lookin; *cp != '\0'; ++cp)
3688 if (strncmp (cp, lookfor, len) == 0)
3689 return (char *) cp;
3690 return NULL;
3691}
3692
3693static void
3694freelist (char **cpp)
3695{
3696 size_t i;
3697
3698 if (cpp == NULL)
3699 return;
3700 for (i = 0; cpp[i] != NULL; ++i)
3701 {
3702 free (cpp[i]);
3703 cpp[i] = NULL;
3704 }
3705}
3706
3707static char **
3708enlist (char **cpp, char *new, size_t len)
3709{
3710 size_t i, j;
3711
3712 if (cpp == NULL)
3713 return NULL;
3714 if ((new = icpyalloc (new)) == NULL)
3715 {
3716 freelist (cpp);
3717 return NULL;
3718 }
3719 new[len] = '\0';
3720 /* Is there already something in the list that's new (or longer)? */
3721 for (i = 0; cpp[i] != NULL; ++i)
3722 if (istrstr (cpp[i], new) != NULL)
3723 {
3724 free (new);
3725 return cpp;
3726 }
3727 /* Eliminate any obsoleted strings. */
3728 j = 0;
3729 while (cpp[j] != NULL)
3730 if (istrstr (new, cpp[j]) == NULL)
3731 ++j;
3732 else
3733 {
3734 free (cpp[j]);
3735 if (--i == j)
3736 break;
3737 cpp[j] = cpp[i];
3738 cpp[i] = NULL;
3739 }
3740 /* Add the new string. */
3741 REALLOC (cpp, i + 2);
3742 cpp[i] = new;
3743 cpp[i + 1] = NULL;
3744 return cpp;
3745}
3746
3747/* Given pointers to two strings, return a pointer to an allocated
3748 list of their distinct common substrings. Return NULL if something
3749 seems wild. */
3750static char **
3751comsubs (char *left, char const *right)
3752{
3753 char **cpp;
3754 char *lcp;
3755 char *rcp;
3756 size_t i, len;
3757
3758 if (left == NULL || right == NULL)
3759 return NULL;
3760 cpp = malloc (sizeof *cpp);
3761 if (cpp == NULL)
3762 return NULL;
3763 cpp[0] = NULL;
3764 for (lcp = left; *lcp != '\0'; ++lcp)
3765 {
3766 len = 0;
3767 rcp = strchr (right, *lcp);
3768 while (rcp != NULL)
3769 {
3770 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3771 continue;
3772 if (i > len)
3773 len = i;
3774 rcp = strchr (rcp + 1, *lcp);
3775 }
3776 if (len == 0)
3777 continue;
3778 {
3779 char **p = enlist (cpp, lcp, len);
3780 if (p == NULL)
3781 {
3782 freelist (cpp);
3783 cpp = NULL;
3784 break;
3785 }
3786 cpp = p;
3787 }
3788 }
3789 return cpp;
3790}
3791
3792static char **
3793addlists (char **old, char **new)
3794{
3795 size_t i;
3796
3797 if (old == NULL || new == NULL)
3798 return NULL;
3799 for (i = 0; new[i] != NULL; ++i)
3800 {
3801 old = enlist (old, new[i], strlen (new[i]));
3802 if (old == NULL)
3803 break;
3804 }
3805 return old;
3806}
3807
3808/* Given two lists of substrings, return a new list giving substrings
3809 common to both. */
3810static char **
3811inboth (char **left, char **right)
3812{
3813 char **both;
3814 char **temp;
3815 size_t lnum, rnum;
3816
3817 if (left == NULL || right == NULL)
3818 return NULL;
3819 both = malloc (sizeof *both);
3820 if (both == NULL)
3821 return NULL;
3822 both[0] = NULL;
3823 for (lnum = 0; left[lnum] != NULL; ++lnum)
3824 {
3825 for (rnum = 0; right[rnum] != NULL; ++rnum)
3826 {
3827 temp = comsubs (left[lnum], right[rnum]);
3828 if (temp == NULL)
3829 {
3830 freelist (both);
3831 return NULL;
3832 }
3833 both = addlists (both, temp);
3834 freelist (temp);
3835 free (temp);
3836 if (both == NULL)
3837 return NULL;
3838 }
3839 }
3840 return both;
3841}
3842
3843typedef struct
3844{
3845 char **in;
3846 char *left;
3847 char *right;
3848 char *is;
3849} must;
3850
3851static void
3852resetmust (must * mp)
3853{
3854 mp->left[0] = mp->right[0] = mp->is[0] = '\0';
3855 freelist (mp->in);
3856}
3857
3858static void
3859dfamust (struct dfa *d)
3860{
3861 must *musts;
3862 must *mp;
3863 char *result;
3864 size_t ri;
3865 size_t i;
3866 int exact;
3867 token t;
3868 static must must0;
3869 struct dfamust *dm;
3870 static char empty_string[] = "";
3871
3872 result = empty_string;
3873 exact = 0;
3874 MALLOC (musts, d->tindex + 1);
3875 mp = musts;
3876 for (i = 0; i <= d->tindex; ++i)
3877 mp[i] = must0;
3878 for (i = 0; i <= d->tindex; ++i)
3879 {
3880 mp[i].in = xmalloc (sizeof *mp[i].in);
3881 mp[i].left = xmalloc (2);
3882 mp[i].right = xmalloc (2);
3883 mp[i].is = xmalloc (2);
3884 mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
3885 mp[i].in[0] = NULL;
3886 }
3887#ifdef DEBUG
3888 fprintf (stderr, "dfamust:\n");
3889 for (i = 0; i < d->tindex; ++i)
3890 {
3891 fprintf (stderr, " %zd:", i);
3892 prtok (d->tokens[i]);
3893 }
3894 putc ('\n', stderr);
3895#endif
3896 for (ri = 0; ri < d->tindex; ++ri)
3897 {
3898 switch (t = d->tokens[ri])
3899 {
3900 case LPAREN:
3901 case RPAREN:
3902 assert (!"neither LPAREN nor RPAREN may appear here");
3903 case EMPTY:
3904 case BEGLINE:
3905 case ENDLINE:
3906 case BEGWORD:
3907 case ENDWORD:
3908 case LIMWORD:
3909 case NOTLIMWORD:
3910 case BACKREF:
3911 resetmust (mp);
3912 break;
3913 case STAR:
3914 case QMARK:
3915 assert (musts < mp);
3916 --mp;
3917 resetmust (mp);
3918 break;
3919 case OR:
3920 assert (&musts[2] <= mp);
3921 {
3922 char **new;
3923 must *lmp;
3924 must *rmp;
3925 size_t j, ln, rn, n;
3926
3927 rmp = --mp;
3928 lmp = --mp;
3929 /* Guaranteed to be. Unlikely, but. . . */
3930 if (!STREQ (lmp->is, rmp->is))
3931 lmp->is[0] = '\0';
3932 /* Left side--easy */
3933 i = 0;
3934 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
3935 ++i;
3936 lmp->left[i] = '\0';
3937 /* Right side */
3938 ln = strlen (lmp->right);
3939 rn = strlen (rmp->right);
3940 n = ln;
3941 if (n > rn)
3942 n = rn;
3943 for (i = 0; i < n; ++i)
3944 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
3945 break;
3946 for (j = 0; j < i; ++j)
3947 lmp->right[j] = lmp->right[(ln - i) + j];
3948 lmp->right[j] = '\0';
3949 new = inboth (lmp->in, rmp->in);
3950 if (new == NULL)
3951 goto done;
3952 freelist (lmp->in);
3953 free (lmp->in);
3954 lmp->in = new;
3955 }
3956 break;
3957 case PLUS:
3958 assert (musts < mp);
3959 --mp;
3960 mp->is[0] = '\0';
3961 break;
3962 case END:
3963 assert (mp == &musts[1]);
3964 for (i = 0; musts[0].in[i] != NULL; ++i)
3965 if (strlen (musts[0].in[i]) > strlen (result))
3966 result = musts[0].in[i];
3967 if (STREQ (result, musts[0].is))
3968 exact = 1;
3969 goto done;
3970 case CAT:
3971 assert (&musts[2] <= mp);
3972 {
3973 must *lmp;
3974 must *rmp;
3975
3976 rmp = --mp;
3977 lmp = --mp;
3978 /* In. Everything in left, plus everything in
3979 right, plus concatenation of
3980 left's right and right's left. */
3981 lmp->in = addlists (lmp->in, rmp->in);
3982 if (lmp->in == NULL)
3983 goto done;
3984 if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
3985 {
3986 char *tp;
3987
3988 tp = icpyalloc (lmp->right);
3989 tp = icatalloc (tp, rmp->left);
3990 lmp->in = enlist (lmp->in, tp, strlen (tp));
3991 free (tp);
3992 if (lmp->in == NULL)
3993 goto done;
3994 }
3995 /* Left-hand */
3996 if (lmp->is[0] != '\0')
3997 {
3998 lmp->left = icatalloc (lmp->left, rmp->left);
3999 if (lmp->left == NULL)
4000 goto done;
4001 }
4002 /* Right-hand */
4003 if (rmp->is[0] == '\0')
4004 lmp->right[0] = '\0';
4005 lmp->right = icatalloc (lmp->right, rmp->right);
4006 if (lmp->right == NULL)
4007 goto done;
4008 /* Guaranteed to be */
4009 if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
4010 {
4011 lmp->is = icatalloc (lmp->is, rmp->is);
4012 if (lmp->is == NULL)
4013 goto done;
4014 }
4015 else
4016 lmp->is[0] = '\0';
4017 }
4018 break;
4019 default:
4020 if (t < END)
4021 {
4022 assert (!"oops! t >= END");
4023 }
4024 else if (t == '\0')
4025 {
4026 /* not on *my* shift */
4027 goto done;
4028 }
4029 else if (t >= CSET || !MBS_SUPPORT || t == ANYCHAR || t == MBCSET)
4030 {
4031 /* easy enough */
4032 resetmust (mp);
4033 }
4034 else
4035 {
4036 /* plain character */
4037 resetmust (mp);
4038 mp->is[0] = mp->left[0] = mp->right[0] = t;
4039 mp->is[1] = mp->left[1] = mp->right[1] = '\0';
4040 mp->in = enlist (mp->in, mp->is, (size_t) 1);
4041 if (mp->in == NULL)
4042 goto done;
4043 }
4044 break;
4045 }
4046#ifdef DEBUG
4047 fprintf (stderr, " node: %zd:", ri);
4048 prtok (d->tokens[ri]);
4049 fprintf (stderr, "\n in:");
4050 for (i = 0; mp->in[i]; ++i)
4051 fprintf (stderr, " \"%s\"", mp->in[i]);
4052 fprintf (stderr, "\n is: \"%s\"\n", mp->is);
4053 fprintf (stderr, " left: \"%s\"\n", mp->left);
4054 fprintf (stderr, " right: \"%s\"\n", mp->right);
4055#endif
4056 ++mp;
4057 }
4058done:
4059 if (strlen (result))
4060 {
4061 MALLOC (dm, 1);
4062 dm->exact = exact;
4063 dm->must = xmemdup (result, strlen (result) + 1);
4064 dm->next = d->musts;
4065 d->musts = dm;
4066 }
4067 mp = musts;
4068 for (i = 0; i <= d->tindex; ++i)
4069 {
4070 freelist (mp[i].in);
4071 free (mp[i].in);
4072 free (mp[i].left);
4073 free (mp[i].right);
4074 free (mp[i].is);
4075 }
4076 free (mp);
4077}
4078
4079struct dfa *
4080dfaalloc (void)
4081{
4082 return xmalloc (sizeof (struct dfa));
4083}
4084
4085struct dfamust *_GL_ATTRIBUTE_PURE
4086dfamusts (struct dfa const *d)
4087{
4088 return d->musts;
4089}
4090
4091/* vim:set shiftwidth=2: */
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette