dfa.c@ 3611

Last change on this file since 3611 was 3611, checked in by bird, 7 months ago
vendor/sed/current: GNU sed 4.9 (sed-4.9.tar.xz sha256:6e226b732e1cd739464ad6862bd1a1aba42d7982922da7a53519631d24975181)
File size: 135.0 KB

Line
1	/* dfa.c - deterministic extended regexp routines for GNU
2	Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2022 Free Software
3	Foundation, Inc.
4
5	This program is free software; you can redistribute it and/or modify
6	it under the terms of the GNU General Public License as published by
7	the Free Software Foundation, either version 3, or (at your option)
8	any later version.
9
10	This program is distributed in the hope that it will be useful,
11	but WITHOUT ANY WARRANTY; without even the implied warranty of
12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	GNU General Public License for more details.
14
15	You should have received a copy of the GNU General Public License
16	along with this program; if not, write to the Free Software
17	Foundation, Inc.,
18	51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */
19
20	/* Written June, 1988 by Mike Haertel
21	Modified July, 1988 by Arthur David Olson to assist BMG speedups */
22
23	#include <config.h>
24
25	#include "dfa.h"
26
27	#include "flexmember.h"
28	#include "idx.h"
29	#include "verify.h"
30
31	#include <assert.h>
32	#include <ctype.h>
33	#include <stdint.h>
34	#include <stdio.h>
35	#include <stdlib.h>
36	#include <limits.h>
37	#include <string.h>
38
39	/* Pacify gcc -Wanalyzer-null-dereference in areas where GCC
40	understandably cannot deduce that the input comes from a
41	well-formed regular expression. There's little point to the
42	runtime overhead of 'assert' instead of 'assume_nonnull' when the
43	MMU will check anyway. */
44	#define assume_nonnull(x) assume ((x) != NULL)
45
46	static bool
47	str_eq (char const a, char const b)
48	{
49	return strcmp (a, b) == 0;
50	}
51
52	static bool
53	c_isdigit (char c)
54	{
55	return '0' <= c && c <= '9';
56	}
57
58	#include "gettext.h"
59	#define _(str) gettext (str)
60
61	#include <wchar.h>
62	#include <wctype.h>
63
64	#include "xalloc.h"
65	#include "localeinfo.h"
66
67	#ifndef FALLTHROUGH
68	# if 201710L < __STDC_VERSION__
69	# define FALLTHROUGH [[__fallthrough__]]
70	# elif (__GNUC__ >= 7) \|\| (__clang_major__ >= 10)
71	# define FALLTHROUGH __attribute__ ((__fallthrough__))
72	# else
73	# define FALLTHROUGH ((void) 0)
74	# endif
75	#endif
76
77	#ifndef MIN
78	# define MIN(a,b) ((a) < (b) ? (a) : (b))
79	#endif
80
81	/* HPUX defines these as macros in sys/param.h. */
82	#ifdef setbit
83	# undef setbit
84	#endif
85	#ifdef clrbit
86	# undef clrbit
87	#endif
88
89	/* For code that does not use Gnulib’s isblank module. */
90	#if !defined isblank && !defined HAVE_ISBLANK && !defined GNULIB_ISBLANK
91	# define isblank dfa_isblank
92	static int
93	isblank (int c)
94	{
95	return c == ' ' \|\| c == '\t';
96	}
97	#endif
98
99	/* First integer value that is greater than any character code. */
100	enum { NOTCHAR = 1 << CHAR_BIT };
101
102	#ifdef UINT_LEAST64_MAX
103
104	/* Number of bits used in a charclass word. */
105	enum { CHARCLASS_WORD_BITS = 64 };
106
107	/* This represents part of a character class. It must be unsigned and
108	at least CHARCLASS_WORD_BITS wide. Any excess bits are zero. */
109	typedef uint_least64_t charclass_word;
110
111	/* Part of a charclass initializer that represents 64 bits' worth of a
112	charclass, where LO and HI are the low and high-order 32 bits of
113	the 64-bit quantity. */
114	# define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
115
116	#else
117	/* Fallbacks for pre-C99 hosts that lack 64-bit integers. */
118	enum { CHARCLASS_WORD_BITS = 32 };
119	typedef unsigned long charclass_word;
120	# define CHARCLASS_PAIR(lo, hi) lo, hi
121	#endif
122
123	/* An initializer for a charclass whose 32-bit words are A through H. */
124	#define CHARCLASS_INIT(a, b, c, d, e, f, g, h) \
125	{{ \
126	CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d), \
127	CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h) \
128	}}
129
130	/* The maximum useful value of a charclass_word; all used bits are 1. */
131	static charclass_word const CHARCLASS_WORD_MASK
132	= ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1;
133
134	/* Number of words required to hold a bit for every character. */
135	enum
136	{
137	CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
138	};
139
140	/* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
141	typedef struct { charclass_word w[CHARCLASS_WORDS]; } charclass;
142
143	/* Convert a possibly-signed character to an unsigned character. This is
144	a bit safer than casting to unsigned char, since it catches some type
145	errors that the cast doesn't. */
146	static unsigned char
147	to_uchar (char ch)
148	{
149	return ch;
150	}
151
152	/* Contexts tell us whether a character is a newline or a word constituent.
153	Word-constituent characters are those that satisfy iswalnum, plus '_'.
154	Each character has a single CTX_* value; bitmasks of CTX_* values denote
155	a particular character class.
156
157	A state also stores a context value, which is a bitmask of CTX_* values.
158	A state's context represents a set of characters that the state's
159	predecessors must match. For example, a state whose context does not
160	include CTX_LETTER will never have transitions where the previous
161	character is a word constituent. A state whose context is CTX_ANY
162	might have transitions from any character. */
163
164	enum
165	{
166	CTX_NONE = 1,
167	CTX_LETTER = 2,
168	CTX_NEWLINE = 4,
169	CTX_ANY = 7
170	};
171
172	/* Sometimes characters can only be matched depending on the surrounding
173	context. Such context decisions depend on what the previous character
174	was, and the value of the current (lookahead) character. Context
175	dependent constraints are encoded as 9-bit integers. Each bit that
176	is set indicates that the constraint succeeds in the corresponding
177	context.
178
179	bit 6-8 - valid contexts when next character is CTX_NEWLINE
180	bit 3-5 - valid contexts when next character is CTX_LETTER
181	bit 0-2 - valid contexts when next character is CTX_NONE
182
183	succeeds_in_context determines whether a given constraint
184	succeeds in a particular context. Prev is a bitmask of possible
185	context values for the previous character, curr is the (single-bit)
186	context value for the lookahead character. */
187	static int
188	newline_constraint (int constraint)
189	{
190	return (constraint >> 6) & 7;
191	}
192	static int
193	letter_constraint (int constraint)
194	{
195	return (constraint >> 3) & 7;
196	}
197	static int
198	other_constraint (int constraint)
199	{
200	return constraint & 7;
201	}
202
203	static bool
204	succeeds_in_context (int constraint, int prev, int curr)
205	{
206	return !! (((curr & CTX_NONE ? other_constraint (constraint) : 0) \
207	\| (curr & CTX_LETTER ? letter_constraint (constraint) : 0) \
208	\| (curr & CTX_NEWLINE ? newline_constraint (constraint) : 0)) \
209	& prev);
210	}
211
212	/* The following describe what a constraint depends on. */
213	static bool
214	prev_newline_dependent (int constraint)
215	{
216	return ((constraint ^ constraint >> 2) & 0111) != 0;
217	}
218	static bool
219	prev_letter_dependent (int constraint)
220	{
221	return ((constraint ^ constraint >> 1) & 0111) != 0;
222	}
223
224	/* Tokens that match the empty string subject to some constraint actually
225	work by applying that constraint to determine what may follow them,
226	taking into account what has gone before. The following values are
227	the constraints corresponding to the special tokens previously defined. */
228	enum
229	{
230	NO_CONSTRAINT = 0777,
231	BEGLINE_CONSTRAINT = 0444,
232	ENDLINE_CONSTRAINT = 0700,
233	BEGWORD_CONSTRAINT = 0050,
234	ENDWORD_CONSTRAINT = 0202,
235	LIMWORD_CONSTRAINT = 0252,
236	NOTLIMWORD_CONSTRAINT = 0525
237	};
238
239	/* The regexp is parsed into an array of tokens in postfix form. Some tokens
240	are operators and others are terminal symbols. Most (but not all) of these
241	codes are returned by the lexical analyzer. */
242
243	typedef ptrdiff_t token;
244	static token const TOKEN_MAX = PTRDIFF_MAX;
245
246	/* States are indexed by state_num values. These are normally
247	nonnegative but -1 is used as a special value. */
248	typedef ptrdiff_t state_num;
249
250	/* Predefined token values. */
251	enum
252	{
253	END = -1, /* END is a terminal symbol that matches the
254	end of input; any value of END or less in
255	the parse tree is such a symbol. Accepting
256	states of the DFA are those that would have
257	a transition on END. This is -1, not some
258	more-negative value, to tweak the speed of
259	comparisons to END. */
260
261	/* Ordinary character values are terminal symbols that match themselves. */
262
263	/* CSET must come last in the following list of special tokens. Otherwise,
264	the list order matters only for performance. Related special tokens
265	should have nearby values so that code like (t == ANYCHAR \|\| t == MBCSET
266	\|\| CSET <= t) can be done with a single machine-level comparison. */
267
268	EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches
269	the empty string. */
270
271	QMARK, /* QMARK is an operator of one argument that
272	matches zero or one occurrences of its
273	argument. */
274
275	STAR, /* STAR is an operator of one argument that
276	matches the Kleene closure (zero or more
277	occurrences) of its argument. */
278
279	PLUS, /* PLUS is an operator of one argument that
280	matches the positive closure (one or more
281	occurrences) of its argument. */
282
283	REPMN, /* REPMN is a lexical token corresponding
284	to the {m,n} construct. REPMN never
285	appears in the compiled token vector. */
286
287	CAT, /* CAT is an operator of two arguments that
288	matches the concatenation of its
289	arguments. CAT is never returned by the
290	lexical analyzer. */
291
292	OR, /* OR is an operator of two arguments that
293	matches either of its arguments. */
294
295	LPAREN, /* LPAREN never appears in the parse tree,
296	it is only a lexeme. */
297
298	RPAREN, /* RPAREN never appears in the parse tree. */
299
300	WCHAR, /* Only returned by lex. wctok contains
301	the wide character representation. */
302
303	ANYCHAR, /* ANYCHAR is a terminal symbol that matches
304	a valid multibyte (or single byte) character.
305	It is used only if MB_CUR_MAX > 1. */
306
307	BEG, /* BEG is an initial symbol that matches the
308	beginning of input. */
309
310	BEGLINE, /* BEGLINE is a terminal symbol that matches
311	the empty string at the beginning of a
312	line. */
313
314	ENDLINE, /* ENDLINE is a terminal symbol that matches
315	the empty string at the end of a line. */
316
317	BEGWORD, /* BEGWORD is a terminal symbol that matches
318	the empty string at the beginning of a
319	word. */
320
321	ENDWORD, /* ENDWORD is a terminal symbol that matches
322	the empty string at the end of a word. */
323
324	LIMWORD, /* LIMWORD is a terminal symbol that matches
325	the empty string at the beginning or the
326	end of a word. */
327
328	NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that
329	matches the empty string not at
330	the beginning or end of a word. */
331
332	BACKREF, /* BACKREF is generated by \<digit>
333	or by any other construct that
334	is not completely handled. If the scanner
335	detects a transition on backref, it returns
336	a kind of "semi-success" indicating that
337	the match will have to be verified with
338	a backtracking matcher. */
339
340	MBCSET, /* MBCSET is similar to CSET, but for
341	multibyte characters. */
342
343	CSET /* CSET and (and any value greater) is a
344	terminal symbol that matches any of a
345	class of characters. */
346	};
347
348
349	/* States of the recognizer correspond to sets of positions in the parse
350	tree, together with the constraints under which they may be matched.
351	So a position is encoded as an index into the parse tree together with
352	a constraint. */
353	typedef struct
354	{
355	idx_t index; /* Index into the parse array. */
356	unsigned int constraint; /* Constraint for matching this position. */
357	} position;
358
359	/* Sets of positions are stored as arrays. */
360	typedef struct
361	{
362	position elems; / Elements of this position set. */
363	idx_t nelem; /* Number of elements in this set. */
364	idx_t alloc; /* Number of elements allocated in ELEMS. */
365	} position_set;
366
367	/* A state of the dfa consists of a set of positions, some flags,
368	and the token value of the lowest-numbered position of the state that
369	contains an END token. */
370	typedef struct
371	{
372	size_t hash; /* Hash of the positions of this state. */
373	position_set elems; /* Positions this state could match. */
374	unsigned char context; /* Context from previous state. */
375	unsigned short constraint; /* Constraint for this state to accept. */
376	position_set mbps; /* Positions which can match multibyte
377	characters or the follows, e.g., period.
378	Used only if MB_CUR_MAX > 1. */
379	state_num mb_trindex; /* Index of this state in MB_TRANS, or
380	negative if the state does not have
381	ANYCHAR. */
382	} dfa_state;
383
384	/* Maximum for any transition table count. This should be at least 3,
385	for the initial state setup. */
386	enum { MAX_TRCOUNT = 1024 };
387
388	/* A bracket operator.
389	e.g., [a-c], [[:alpha:]], etc. */
390	struct mb_char_classes
391	{
392	ptrdiff_t cset;
393	bool invert;
394	wchar_t chars; / Normal characters. */
395	idx_t nchars;
396	idx_t nchars_alloc;
397	};
398
399	struct regex_syntax
400	{
401	/* Syntax bits controlling the behavior of the lexical analyzer. */
402	reg_syntax_t syntax_bits;
403	int dfaopts;
404	bool syntax_bits_set;
405
406	/* Flag for case-folding letters into sets. */
407	bool case_fold;
408
409	/* End-of-line byte in data. */
410	unsigned char eolbyte;
411
412	/* Cache of char-context values. */
413	char sbit[NOTCHAR];
414
415	/* If never_trail[B], the byte B cannot be a non-initial byte in a
416	multibyte character. */
417	bool never_trail[NOTCHAR];
418
419	/* Set of characters considered letters. */
420	charclass letters;
421
422	/* Set of characters that are newline. */
423	charclass newline;
424	};
425
426	/* Lexical analyzer. All the dross that deals with the obnoxious
427	GNU Regex syntax bits is located here. The poor, suffering
428	reader is referred to the GNU Regex documentation for the
429	meaning of the @#%!@#%^!@ syntax bits. */
430	struct lexer_state
431	{
432	char const ptr; / Pointer to next input character. */
433	idx_t left; /* Number of characters remaining. */
434	token lasttok; /* Previous token returned; initially END. */
435	idx_t parens; /* Count of outstanding left parens. */
436	int minrep, maxrep; /* Repeat counts for {m,n}. */
437
438	/* Wide character representation of the current multibyte character,
439	or WEOF if there was an encoding error. Used only if
440	MB_CUR_MAX > 1. */
441	wint_t wctok;
442
443	/* The most recently analyzed multibyte bracket expression. */
444	struct mb_char_classes brack;
445
446	/* We're separated from beginning or (, \| only by zero-width characters. */
447	bool laststart;
448	};
449
450	/* Recursive descent parser for regular expressions. */
451
452	struct parser_state
453	{
454	token tok; /* Lookahead token. */
455	idx_t depth; /* Current depth of a hypothetical stack
456	holding deferred productions. This is
457	used to determine the depth that will be
458	required of the real stack later on in
459	dfaanalyze. */
460	};
461
462	/* A compiled regular expression. */
463	struct dfa
464	{
465	/* Fields filled by the scanner. */
466	charclass charclasses; / Array of character sets for CSET tokens. */
467	idx_t cindex; /* Index for adding new charclasses. */
468	idx_t calloc; /* Number of charclasses allocated. */
469	ptrdiff_t canychar; /* Index of anychar class, or -1. */
470
471	/* Scanner state */
472	struct lexer_state lex;
473
474	/* Parser state */
475	struct parser_state parse;
476
477	/* Fields filled by the parser. */
478	token tokens; / Postfix parse array. */
479	idx_t tindex; /* Index for adding new tokens. */
480	idx_t talloc; /* Number of tokens currently allocated. */
481	idx_t depth; /* Depth required of an evaluation stack
482	used for depth-first traversal of the
483	parse tree. */
484	idx_t nleaves; /* Number of non-EMPTY leaves
485	in the parse tree. */
486	idx_t nregexps; /* Count of parallel regexps being built
487	with dfaparse. */
488	bool fast; /* The DFA is fast. */
489	bool epsilon; /* Does a token match only the empty string? */
490	token utf8_anychar_classes[9]; /* To lower ANYCHAR in UTF-8 locales. */
491	mbstate_t mbs; /* Multibyte conversion state. */
492
493	/* The following are valid only if MB_CUR_MAX > 1. */
494
495	/* The value of multibyte_prop[i] is defined by following rule.
496	if tokens[i] < NOTCHAR
497	bit 0 : tokens[i] is the first byte of a character, including
498	single-byte characters.
499	bit 1 : tokens[i] is the last byte of a character, including
500	single-byte characters.
501
502	e.g.
503	tokens
504	= 'single_byte_a', 'multi_byte_A', single_byte_b'
505	= 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
506	multibyte_prop
507	= 3 , 1 , 0 , 2 , 3
508	*/
509	char *multibyte_prop;
510
511	/* Fields filled by the superset. */
512	struct dfa superset; / Hint of the dfa. */
513
514	/* Fields filled by the state builder. */
515	dfa_state states; / States of the dfa. */
516	state_num sindex; /* Index for adding new states. */
517	idx_t salloc; /* Number of states currently allocated. */
518
519	/* Fields filled by the parse tree->NFA conversion. */
520	position_set follows; / Array of follow sets, indexed by position
521	index. The follow of a position is the set
522	of positions containing characters that
523	could conceivably follow a character
524	matching the given position in a string
525	matching the regexp. Allocated to the
526	maximum possible position index. */
527	bool searchflag; /* We are supposed to build a searching
528	as opposed to an exact matcher. A searching
529	matcher finds the first and shortest string
530	matching a regexp anywhere in the buffer,
531	whereas an exact matcher finds the longest
532	string matching, but anchored to the
533	beginning of the buffer. */
534
535	/* Fields filled by dfaanalyze. */
536	int constraints; / Array of union of accepting constraints
537	in the follow of a position. */
538	int separates; / Array of contexts on follow of a
539	position. */
540
541	/* Fields filled by dfaexec. */
542	state_num tralloc; /* Number of transition tables that have
543	slots so far, not counting trans[-1] and
544	trans[-2]. */
545	int trcount; /* Number of transition tables that have
546	been built, other than for initial
547	states. */
548	int min_trcount; /* Number of initial states. Equivalently,
549	the minimum state number for which trcount
550	counts transitions. */
551	state_num *trans; / Transition tables for states that can
552	never accept. If the transitions for a
553	state have not yet been computed, or the
554	state could possibly accept, its entry in
555	this table is NULL. This points to two
556	past the start of the allocated array,
557	and trans[-1] and trans[-2] are always
558	NULL. */
559	state_num *fails; / Transition tables after failing to accept
560	on a state that potentially could do so.
561	If trans[i] is non-null, fails[i] must
562	be null. */
563	char success; / Table of acceptance conditions used in
564	dfaexec and computed in build_state. */
565	state_num newlines; / Transitions on newlines. The entry for a
566	newline in any transition table is always
567	-1 so we can count lines without wasting
568	too many cycles. The transition for a
569	newline is stored separately and handled
570	as a special case. Newline is also used
571	as a sentinel at the end of the buffer. */
572	state_num initstate_notbol; /* Initial state for CTX_LETTER and CTX_NONE
573	context in multibyte locales, in which we
574	do not distinguish between their contexts,
575	as not supported word. */
576	position_set mb_follows; /* Follow set added by ANYCHAR on demand. */
577	state_num *mb_trans; / Transition tables for states with
578	ANYCHAR. */
579	state_num mb_trcount; /* Number of transition tables for states with
580	ANYCHAR that have actually been built. */
581
582	/* Syntax configuration. This is near the end so that dfacopysyntax
583	can memset up to here. */
584	struct regex_syntax syntax;
585
586	/* Information derived from the locale. This is at the end so that
587	a quick memset need not clear it specially. */
588
589	/* dfaexec implementation. */
590	char (dfaexec) (struct dfa , char const , char *,
591	bool, idx_t , bool );
592
593	/* Other cached information derived from the locale. */
594	struct localeinfo localeinfo;
595	};
596
597	/* User access to dfa internals. */
598
599	/* S could possibly be an accepting state of R. */
600	static bool
601	accepting (state_num s, struct dfa const *r)
602	{
603	return r->states[s].constraint != 0;
604	}
605
606	/* STATE accepts in the specified context. */
607	static bool
608	accepts_in_context (int prev, int curr, state_num state, struct dfa const *dfa)
609	{
610	return succeeds_in_context (dfa->states[state].constraint, prev, curr);
611	}
612
613	static void regexp (struct dfa *dfa);
614
615	/* Store into *PWC the result of converting the leading bytes of the
616	multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
617	and updating the conversion state in *D. On conversion error,
618	convert just a single byte, to WEOF. Return the number of bytes
619	converted.
620
621	This differs from mbrtowc (PWC, S, N, &D->mbs) as follows:
622
623	* PWC points to wint_t, not to wchar_t.
624	* The last arg is a dfa *D instead of merely a multibyte conversion
625	state D->mbs.
626	* N is idx_t not size_t, and must be at least 1.
627	* S[N - 1] must be a sentinel byte.
628	* Shift encodings are not supported.
629	* The return value is always in the range 1..N.
630	* D->mbs is always valid afterwards.
631	* PWC is always set to something. /
632	static int
633	mbs_to_wchar (wint_t pwc, char const s, idx_t n, struct dfa *d)
634	{
635	unsigned char uc = s[0];
636	wint_t wc = d->localeinfo.sbctowc[uc];
637
638	if (wc == WEOF)
639	{
640	wchar_t wch;
641	size_t nbytes = mbrtowc (&wch, s, n, &d->mbs);
642	if (0 < nbytes && nbytes < (size_t) -2)
643	{
644	*pwc = wch;
645	return nbytes;
646	}
647	memset (&d->mbs, 0, sizeof d->mbs);
648	}
649
650	*pwc = wc;
651	return 1;
652	}
653
654	#ifdef DEBUG
655
656	static void
657	prtok (token t)
658	{
659	if (t <= END)
660	fprintf (stderr, "END");
661	else if (0 <= t && t < NOTCHAR)
662	{
663	unsigned int ch = t;
664	fprintf (stderr, "0x%02x", ch);
665	}
666	else
667	{
668	char const *s;
669	switch (t)
670	{
671	case BEG:
672	s = "BEG";
673	break;
674	case EMPTY:
675	s = "EMPTY";
676	break;
677	case BACKREF:
678	s = "BACKREF";
679	break;
680	case BEGLINE:
681	s = "BEGLINE";
682	break;
683	case ENDLINE:
684	s = "ENDLINE";
685	break;
686	case BEGWORD:
687	s = "BEGWORD";
688	break;
689	case ENDWORD:
690	s = "ENDWORD";
691	break;
692	case LIMWORD:
693	s = "LIMWORD";
694	break;
695	case NOTLIMWORD:
696	s = "NOTLIMWORD";
697	break;
698	case QMARK:
699	s = "QMARK";
700	break;
701	case STAR:
702	s = "STAR";
703	break;
704	case PLUS:
705	s = "PLUS";
706	break;
707	case CAT:
708	s = "CAT";
709	break;
710	case OR:
711	s = "OR";
712	break;
713	case LPAREN:
714	s = "LPAREN";
715	break;
716	case RPAREN:
717	s = "RPAREN";
718	break;
719	case ANYCHAR:
720	s = "ANYCHAR";
721	break;
722	case MBCSET:
723	s = "MBCSET";
724	break;
725	default:
726	s = "CSET";
727	break;
728	}
729	fprintf (stderr, "%s", s);
730	}
731	}
732	#endif /* DEBUG */
733
734	/* Stuff pertaining to charclasses. */
735
736	static bool
737	tstbit (unsigned int b, charclass const *c)
738	{
739	return c->w[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
740	}
741
742	static void
743	setbit (unsigned int b, charclass *c)
744	{
745	charclass_word one = 1;
746	c->w[b / CHARCLASS_WORD_BITS] \|= one << b % CHARCLASS_WORD_BITS;
747	}
748
749	static void
750	clrbit (unsigned int b, charclass *c)
751	{
752	charclass_word one = 1;
753	c->w[b / CHARCLASS_WORD_BITS] &= ~(one << b % CHARCLASS_WORD_BITS);
754	}
755
756	static void
757	zeroset (charclass *s)
758	{
759	memset (s, 0, sizeof *s);
760	}
761
762	static void
763	fillset (charclass *s)
764	{
765	for (int i = 0; i < CHARCLASS_WORDS; i++)
766	s->w[i] = CHARCLASS_WORD_MASK;
767	}
768
769	static void
770	notset (charclass *s)
771	{
772	for (int i = 0; i < CHARCLASS_WORDS; ++i)
773	s->w[i] = CHARCLASS_WORD_MASK & ~s->w[i];
774	}
775
776	static bool
777	equal (charclass const s1, charclass const s2)
778	{
779	charclass_word w = 0;
780	for (int i = 0; i < CHARCLASS_WORDS; i++)
781	w \|= s1->w[i] ^ s2->w[i];
782	return w == 0;
783	}
784
785	static bool
786	emptyset (charclass const *s)
787	{
788	charclass_word w = 0;
789	for (int i = 0; i < CHARCLASS_WORDS; i++)
790	w \|= s->w[i];
791	return w == 0;
792	}
793
794	/* Ensure that the array addressed by PA holds at least I + 1 items.
795	Either return PA, or reallocate the array and return its new address.
796	Although PA may be null, the returned value is never null.
797
798	The array holds NITEMS items, where 0 <= I <= NITEMS; *NITEMS
799	is updated on reallocation. If PA is null, *NITEMS must be zero.
800	Do not allocate more than NITEMS_MAX items total; -1 means no limit.
801	ITEM_SIZE is the size of one item; it must be positive.
802	Avoid O(N*2) behavior on arrays growing linearly. /
803	static void *
804	maybe_realloc (void pa, idx_t i, idx_t nitems,
805	ptrdiff_t nitems_max, idx_t item_size)
806	{
807	if (i < *nitems)
808	return pa;
809	return xpalloc (pa, nitems, 1, nitems_max, item_size);
810	}
811
812	/* In DFA D, find the index of charclass S, or allocate a new one. */
813	static idx_t
814	charclass_index (struct dfa d, charclass const s)
815	{
816	idx_t i;
817
818	for (i = 0; i < d->cindex; ++i)
819	if (equal (s, &d->charclasses[i]))
820	return i;
821	d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
822	TOKEN_MAX - CSET, sizeof *d->charclasses);
823	++d->cindex;
824	d->charclasses[i] = *s;
825	return i;
826	}
827
828	static bool
829	unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
830	{
831	return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) \|\| (c) == '_');
832	}
833
834	static int
835	char_context (struct dfa const *dfa, unsigned char c)
836	{
837	if (c == dfa->syntax.eolbyte && !(dfa->syntax.dfaopts & DFA_ANCHOR))
838	return CTX_NEWLINE;
839	if (unibyte_word_constituent (dfa, c))
840	return CTX_LETTER;
841	return CTX_NONE;
842	}
843
844	/* Set a bit in the charclass for the given wchar_t. Do nothing if WC
845	is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1,
846	this may happen when folding case in weird Turkish locales where
847	dotless i/dotted I are not included in the chosen character set.
848	Return whether a bit was set in the charclass. */
849	static bool
850	setbit_wc (wint_t wc, charclass *c)
851	{
852	int b = wctob (wc);
853	if (b < 0)
854	return false;
855
856	setbit (b, c);
857	return true;
858	}
859
860	/* Set a bit for B and its case variants in the charclass C.
861	MB_CUR_MAX must be 1. */
862	static void
863	setbit_case_fold_c (int b, charclass *c)
864	{
865	int ub = toupper (b);
866	for (int i = 0; i < NOTCHAR; i++)
867	if (toupper (i) == ub)
868	setbit (i, c);
869	}
870
871	/* Fetch the next lexical input character from the pattern. There
872	must at least one byte of pattern input. Set DFA->lex.wctok to the
873	value of the character or to WEOF depending on whether the input is
874	a valid multibyte character (possibly of length 1). Then return
875	the next input byte value, except return EOF if the input is a
876	multibyte character of length greater than 1. */
877	static int
878	fetch_wc (struct dfa *dfa)
879	{
880	int nbytes = mbs_to_wchar (&dfa->lex.wctok, dfa->lex.ptr, dfa->lex.left,
881	dfa);
882	int c = nbytes == 1 ? to_uchar (dfa->lex.ptr[0]) : EOF;
883	dfa->lex.ptr += nbytes;
884	dfa->lex.left -= nbytes;
885	return c;
886	}
887
888	/* If there is no more input, report an error about unbalanced brackets.
889	Otherwise, behave as with fetch_wc (DFA). */
890	static int
891	bracket_fetch_wc (struct dfa *dfa)
892	{
893	if (! dfa->lex.left)
894	dfaerror (_("unbalanced ["));
895	return fetch_wc (dfa);
896	}
897
898	typedef int predicate (int);
899
900	/* The following list maps the names of the Posix named character classes
901	to predicate functions that determine whether a given character is in
902	the class. The leading [ has already been eaten by the lexical
903	analyzer. */
904	struct dfa_ctype
905	{
906	const char *name;
907	predicate *func;
908	bool single_byte_only;
909	};
910
911	static const struct dfa_ctype prednames[] = {
912	{"alpha", isalpha, false},
913	{"upper", isupper, false},
914	{"lower", islower, false},
915	{"digit", isdigit, true},
916	{"xdigit", isxdigit, false},
917	{"space", isspace, false},
918	{"punct", ispunct, false},
919	{"alnum", isalnum, false},
920	{"print", isprint, false},
921	{"graph", isgraph, false},
922	{"cntrl", iscntrl, false},
923	{"blank", isblank, false},
924	{NULL, NULL, false}
925	};
926
927	static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
928	find_pred (const char *str)
929	{
930	for (int i = 0; prednames[i].name; i++)
931	if (str_eq (str, prednames[i].name))
932	return &prednames[i];
933	return NULL;
934	}
935
936	/* Parse a bracket expression, which possibly includes multibyte
937	characters. */
938	static token
939	parse_bracket_exp (struct dfa *dfa)
940	{
941	/* This is a bracket expression that dfaexec is known to
942	process correctly. */
943	bool known_bracket_exp = true;
944
945	/* Used to warn about [:space:].
946	Bit 0 = first character is a colon.
947	Bit 1 = last character is a colon.
948	Bit 2 = includes any other character but a colon.
949	Bit 3 = includes ranges, char/equiv classes or collation elements. */
950	int colon_warning_state;
951
952	dfa->lex.brack.nchars = 0;
953	charclass ccl;
954	zeroset (&ccl);
955	int c = bracket_fetch_wc (dfa);
956	bool invert = c == '^';
957	if (invert)
958	{
959	c = bracket_fetch_wc (dfa);
960	known_bracket_exp = dfa->localeinfo.simple;
961	}
962	wint_t wc = dfa->lex.wctok;
963	int c1;
964	wint_t wc1;
965	colon_warning_state = (c == ':');
966	do
967	{
968	c1 = NOTCHAR; /* Mark c1 as not initialized. */
969	colon_warning_state &= ~2;
970
971	/* Note that if we're looking at some other [:...:] construct,
972	we just treat it as a bunch of ordinary characters. We can do
973	this because we assume regex has checked for syntax errors before
974	dfa is ever called. */
975	if (c == '[')
976	{
977	c1 = bracket_fetch_wc (dfa);
978	wc1 = dfa->lex.wctok;
979
980	if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES))
981	\|\| c1 == '.' \|\| c1 == '=')
982	{
983	enum { MAX_BRACKET_STRING_LEN = 32 };
984	char str[MAX_BRACKET_STRING_LEN + 1];
985	int len = 0;
986	for (;;)
987	{
988	c = bracket_fetch_wc (dfa);
989	if (dfa->lex.left == 0
990	\|\| (c == c1 && dfa->lex.ptr[0] == ']'))
991	break;
992	if (len < MAX_BRACKET_STRING_LEN)
993	str[len++] = c;
994	else
995	/* This is in any case an invalid class name. */
996	str[0] = '\0';
997	}
998	str[len] = '\0';
999
1000	/* Fetch bracket. */
1001	c = bracket_fetch_wc (dfa);
1002	wc = dfa->lex.wctok;
1003	if (c1 == ':')
1004	/* Build character class. POSIX allows character
1005	classes to match multicharacter collating elements,
1006	but the regex code does not support that, so do not
1007	worry about that possibility. */
1008	{
1009	char const *class
1010	= (dfa->syntax.case_fold && (str_eq (str, "upper")
1011	\|\| str_eq (str, "lower"))
1012	? "alpha" : str);
1013	const struct dfa_ctype *pred = find_pred (class);
1014	if (!pred)
1015	dfaerror (_("invalid character class"));
1016
1017	if (dfa->localeinfo.multibyte && !pred->single_byte_only)
1018	known_bracket_exp = false;
1019	else
1020	for (int c2 = 0; c2 < NOTCHAR; ++c2)
1021	if (pred->func (c2))
1022	setbit (c2, &ccl);
1023	}
1024	else
1025	known_bracket_exp = false;
1026
1027	colon_warning_state \|= 8;
1028
1029	/* Fetch new lookahead character. */
1030	c1 = bracket_fetch_wc (dfa);
1031	wc1 = dfa->lex.wctok;
1032	continue;
1033	}
1034
1035	/* We treat '[' as a normal character here. c/c1/wc/wc1
1036	are already set up. */
1037	}
1038
1039	if (c == '\\'
1040	&& (dfa->syntax.syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1041	{
1042	c = bracket_fetch_wc (dfa);
1043	wc = dfa->lex.wctok;
1044	}
1045
1046	if (c1 == NOTCHAR)
1047	{
1048	c1 = bracket_fetch_wc (dfa);
1049	wc1 = dfa->lex.wctok;
1050	}
1051
1052	if (c1 == '-')
1053	/* build range characters. */
1054	{
1055	int c2 = bracket_fetch_wc (dfa);
1056	wint_t wc2 = dfa->lex.wctok;
1057
1058	/* A bracket expression like [a-[.aa.]] matches an unknown set.
1059	Treat it like [-a[.aa.]] while parsing it, and
1060	remember that the set is unknown. */
1061	if (c2 == '[' && dfa->lex.ptr[0] == '.')
1062	{
1063	known_bracket_exp = false;
1064	c2 = ']';
1065	}
1066
1067	if (c2 == ']')
1068	{
1069	/* In the case [x-], the - is an ordinary hyphen,
1070	which is left in c1, the lookahead character. */
1071	dfa->lex.ptr--;
1072	dfa->lex.left++;
1073	}
1074	else
1075	{
1076	if (c2 == '\\' && (dfa->syntax.syntax_bits
1077	& RE_BACKSLASH_ESCAPE_IN_LISTS))
1078	{
1079	c2 = bracket_fetch_wc (dfa);
1080	wc2 = dfa->lex.wctok;
1081	}
1082
1083	colon_warning_state \|= 8;
1084	c1 = bracket_fetch_wc (dfa);
1085	wc1 = dfa->lex.wctok;
1086
1087	/* Treat [x-y] as a range if x != y. */
1088	if (wc != wc2 \|\| wc == WEOF)
1089	{
1090	if (dfa->localeinfo.simple
1091	\|\| (c_isdigit (c) & c_isdigit (c2)))
1092	{
1093	for (int ci = c; ci <= c2; ci++)
1094	if (dfa->syntax.case_fold && isalpha (ci))
1095	setbit_case_fold_c (ci, &ccl);
1096	else
1097	setbit (ci, &ccl);
1098	}
1099	else
1100	known_bracket_exp = false;
1101
1102	continue;
1103	}
1104	}
1105	}
1106
1107	colon_warning_state \|= (c == ':') ? 2 : 4;
1108
1109	if (!dfa->localeinfo.multibyte)
1110	{
1111	if (dfa->syntax.case_fold && isalpha (c))
1112	setbit_case_fold_c (c, &ccl);
1113	else
1114	setbit (c, &ccl);
1115	continue;
1116	}
1117
1118	if (wc == WEOF)
1119	known_bracket_exp = false;
1120	else
1121	{
1122	wchar_t folded[CASE_FOLDED_BUFSIZE + 1];
1123	int n = (dfa->syntax.case_fold
1124	? case_folded_counterparts (wc, folded + 1) + 1
1125	: 1);
1126	folded[0] = wc;
1127	for (int i = 0; i < n; i++)
1128	if (!setbit_wc (folded[i], &ccl))
1129	{
1130	dfa->lex.brack.chars
1131	= maybe_realloc (dfa->lex.brack.chars, dfa->lex.brack.nchars,
1132	&dfa->lex.brack.nchars_alloc, -1,
1133	sizeof *dfa->lex.brack.chars);
1134	dfa->lex.brack.chars[dfa->lex.brack.nchars++] = folded[i];
1135	}
1136	}
1137	}
1138	while ((wc = wc1, (c = c1) != ']'));
1139
1140	if (colon_warning_state == 7)
1141	((dfa->syntax.dfaopts & DFA_CONFUSING_BRACKETS_ERROR
1142	? dfaerror : dfawarn)
1143	(_("character class syntax is [[:space:]], not [:space:]")));
1144
1145	if (! known_bracket_exp)
1146	return BACKREF;
1147
1148	if (dfa->localeinfo.multibyte && (invert \|\| dfa->lex.brack.nchars != 0))
1149	{
1150	dfa->lex.brack.invert = invert;
1151	dfa->lex.brack.cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
1152	return MBCSET;
1153	}
1154
1155	if (invert)
1156	{
1157	notset (&ccl);
1158	if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1159	clrbit ('\n', &ccl);
1160	}
1161
1162	return CSET + charclass_index (dfa, &ccl);
1163	}
1164
1165	struct lexptr
1166	{
1167	char const *ptr;
1168	idx_t left;
1169	};
1170
1171	static void
1172	push_lex_state (struct dfa dfa, struct lexptr ls, char const *s)
1173	{
1174	ls->ptr = dfa->lex.ptr;
1175	ls->left = dfa->lex.left;
1176	dfa->lex.ptr = s;
1177	dfa->lex.left = strlen (s);
1178	}
1179
1180	static void
1181	pop_lex_state (struct dfa dfa, struct lexptr const ls)
1182	{
1183	dfa->lex.ptr = ls->ptr;
1184	dfa->lex.left = ls->left;
1185	}
1186
1187	static token
1188	lex (struct dfa *dfa)
1189	{
1190	bool backslash = false;
1191
1192	/* Basic plan: We fetch a character. If it's a backslash,
1193	we set the backslash flag and go through the loop again.
1194	On the plus side, this avoids having a duplicate of the
1195	main switch inside the backslash case. On the minus side,
1196	it means that just about every case tests the backslash flag. */
1197	for (int i = 0; i < 2; ++i)
1198	{
1199	if (! dfa->lex.left)
1200	return dfa->lex.lasttok = END;
1201	int c = fetch_wc (dfa);
1202
1203	switch (c)
1204	{
1205	case '\\':
1206	if (backslash)
1207	goto normal_char;
1208	if (dfa->lex.left == 0)
1209	dfaerror (_("unfinished \\ escape"));
1210	backslash = true;
1211	break;
1212
1213	case '^':
1214	if (backslash)
1215	goto normal_char;
1216	if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1217	\|\| dfa->lex.lasttok == END \|\| dfa->lex.lasttok == LPAREN
1218	\|\| dfa->lex.lasttok == OR)
1219	return dfa->lex.lasttok = BEGLINE;
1220	goto normal_char;
1221
1222	case '$':
1223	if (backslash)
1224	goto normal_char;
1225	if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1226	\|\| dfa->lex.left == 0
1227	\|\| ((dfa->lex.left
1228	> !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
1229	&& (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
1230	& (dfa->lex.ptr[0] == '\\')]
1231	== ')'))
1232	\|\| ((dfa->lex.left
1233	> !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
1234	&& (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
1235	& (dfa->lex.ptr[0] == '\\')]
1236	== '\|'))
1237	\|\| ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
1238	&& dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
1239	return dfa->lex.lasttok = ENDLINE;
1240	goto normal_char;
1241
1242	case '1':
1243	case '2':
1244	case '3':
1245	case '4':
1246	case '5':
1247	case '6':
1248	case '7':
1249	case '8':
1250	case '9':
1251	if (!backslash)
1252	goto normal_char;
1253	if (dfa->syntax.syntax_bits & RE_NO_BK_REFS)
1254	goto stray_backslash;
1255
1256	dfa->lex.laststart = false;
1257	return dfa->lex.lasttok = BACKREF;
1258
1259	case '`':
1260	if (!backslash)
1261	goto normal_char;
1262	if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1263	goto stray_backslash;
1264
1265	/* FIXME: should be beginning of string */
1266	return dfa->lex.lasttok = BEGLINE;
1267
1268	case '\'':
1269	if (!backslash)
1270	goto normal_char;
1271	if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1272	goto stray_backslash;
1273
1274	/* FIXME: should be end of string */
1275	return dfa->lex.lasttok = ENDLINE;
1276
1277	case '<':
1278	if (!backslash)
1279	goto normal_char;
1280	if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1281	goto stray_backslash;
1282
1283	return dfa->lex.lasttok = BEGWORD;
1284
1285	case '>':
1286	if (!backslash)
1287	goto normal_char;
1288	if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1289	goto stray_backslash;
1290
1291	return dfa->lex.lasttok = ENDWORD;
1292
1293	case 'b':
1294	if (!backslash)
1295	goto normal_char;
1296	if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1297	goto stray_backslash;
1298
1299	return dfa->lex.lasttok = LIMWORD;
1300
1301	case 'B':
1302	if (!backslash)
1303	goto normal_char;
1304	if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1305	goto stray_backslash;
1306
1307	return dfa->lex.lasttok = NOTLIMWORD;
1308
1309	case '?':
1310	if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1311	goto default_case;
1312	if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1313	goto normal_char;
1314	if (dfa->lex.laststart)
1315	{
1316	if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1317	goto default_case;
1318	if (dfa->syntax.dfaopts & DFA_PLUS_WARN)
1319	dfawarn (_("? at start of expression"));
1320	}
1321	return dfa->lex.lasttok = QMARK;
1322
1323	case '*':
1324	if (backslash)
1325	goto normal_char;
1326	if (dfa->lex.laststart)
1327	{
1328	if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1329	goto default_case;
1330	if (dfa->syntax.dfaopts & DFA_STAR_WARN)
1331	dfawarn (_("* at start of expression"));
1332	}
1333	return dfa->lex.lasttok = STAR;
1334
1335	case '+':
1336	if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1337	goto default_case;
1338	if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1339	goto normal_char;
1340	if (dfa->lex.laststart)
1341	{
1342	if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1343	goto default_case;
1344	if (dfa->syntax.dfaopts & DFA_PLUS_WARN)
1345	dfawarn (_("+ at start of expression"));
1346	}
1347	return dfa->lex.lasttok = PLUS;
1348
1349	case '{':
1350	if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
1351	goto default_case;
1352	if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
1353	goto normal_char;
1354
1355	/* Cases:
1356	{M} - exact count
1357	{M,} - minimum count, maximum is infinity
1358	{,N} - 0 through N
1359	{,} - 0 to infinity (same as '*')
1360	{M,N} - M through N */
1361	{
1362	char const *p = dfa->lex.ptr;
1363	char const *lim = p + dfa->lex.left;
1364	dfa->lex.minrep = dfa->lex.maxrep = -1;
1365	for (; p != lim && c_isdigit (*p); p++)
1366	dfa->lex.minrep = (dfa->lex.minrep < 0
1367	? *p - '0'
1368	: MIN (RE_DUP_MAX + 1,
1369	dfa->lex.minrep * 10 + *p - '0'));
1370	if (p != lim)
1371	{
1372	if (*p != ',')
1373	dfa->lex.maxrep = dfa->lex.minrep;
1374	else
1375	{
1376	if (dfa->lex.minrep < 0)
1377	dfa->lex.minrep = 0;
1378	while (++p != lim && c_isdigit (*p))
1379	dfa->lex.maxrep
1380	= (dfa->lex.maxrep < 0
1381	? *p - '0'
1382	: MIN (RE_DUP_MAX + 1,
1383	dfa->lex.maxrep * 10 + *p - '0'));
1384	}
1385	}
1386	bool invalid_content
1387	= ! ((! backslash \|\| (p != lim && *p++ == '\\'))
1388	&& p != lim && *p++ == '}'
1389	&& 0 <= dfa->lex.minrep
1390	&& (dfa->lex.maxrep < 0
1391	\|\| dfa->lex.minrep <= dfa->lex.maxrep));
1392	if (invalid_content
1393	&& (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD))
1394	goto normal_char;
1395	if (dfa->lex.laststart)
1396	{
1397	if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1398	goto default_case;
1399	if (dfa->syntax.dfaopts & DFA_PLUS_WARN)
1400	dfawarn (_("{...} at start of expression"));
1401	}
1402	if (invalid_content)
1403	dfaerror (_("invalid content of \\{\\}"));
1404	if (RE_DUP_MAX < dfa->lex.maxrep)
1405	dfaerror (_("regular expression too big"));
1406	dfa->lex.ptr = p;
1407	dfa->lex.left = lim - p;
1408	}
1409	dfa->lex.laststart = false;
1410	return dfa->lex.lasttok = REPMN;
1411
1412	case '\|':
1413	if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1414	goto default_case;
1415	if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
1416	goto normal_char;
1417	dfa->lex.laststart = true;
1418	return dfa->lex.lasttok = OR;
1419
1420	case '\n':
1421	if (!(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
1422	goto default_case;
1423	if (backslash)
1424	goto normal_char;
1425	dfa->lex.laststart = true;
1426	return dfa->lex.lasttok = OR;
1427
1428	case '(':
1429	if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1430	goto normal_char;
1431	dfa->lex.parens++;
1432	dfa->lex.laststart = true;
1433	return dfa->lex.lasttok = LPAREN;
1434
1435	case ')':
1436	if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1437	goto normal_char;
1438	if (dfa->lex.parens == 0
1439	&& dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1440	goto normal_char;
1441	dfa->lex.parens--;
1442	dfa->lex.laststart = false;
1443	return dfa->lex.lasttok = RPAREN;
1444
1445	case '.':
1446	if (backslash)
1447	goto normal_char;
1448	if (dfa->canychar < 0)
1449	{
1450	charclass ccl;
1451	fillset (&ccl);
1452	if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1453	clrbit ('\n', &ccl);
1454	if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1455	clrbit ('\0', &ccl);
1456	if (dfa->localeinfo.multibyte)
1457	for (int c2 = 0; c2 < NOTCHAR; c2++)
1458	if (dfa->localeinfo.sbctowc[c2] == WEOF)
1459	clrbit (c2, &ccl);
1460	dfa->canychar = charclass_index (dfa, &ccl);
1461	}
1462	dfa->lex.laststart = false;
1463	return dfa->lex.lasttok = (dfa->localeinfo.multibyte
1464	? ANYCHAR
1465	: CSET + dfa->canychar);
1466
1467	case 's':
1468	case 'S':
1469	if (!backslash)
1470	goto normal_char;
1471	if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1472	goto stray_backslash;
1473
1474	if (!dfa->localeinfo.multibyte)
1475	{
1476	charclass ccl;
1477	zeroset (&ccl);
1478	for (int c2 = 0; c2 < NOTCHAR; ++c2)
1479	if (isspace (c2))
1480	setbit (c2, &ccl);
1481	if (c == 'S')
1482	notset (&ccl);
1483	dfa->lex.laststart = false;
1484	return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1485	}
1486
1487	/* FIXME: see if optimizing this, as is done with ANYCHAR and
1488	add_utf8_anychar, makes sense. */
1489
1490	/* \s and \S are documented to be equivalent to [[:space:]] and
1491	[^[:space:]] respectively, so tell the lexer to process those
1492	strings, each minus its "already processed" '['. */
1493	{
1494	struct lexptr ls;
1495	push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
1496	dfa->lex.lasttok = parse_bracket_exp (dfa);
1497	pop_lex_state (dfa, &ls);
1498	}
1499
1500	dfa->lex.laststart = false;
1501	return dfa->lex.lasttok;
1502
1503	case 'w':
1504	case 'W':
1505	if (!backslash)
1506	goto normal_char;
1507	if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1508	goto stray_backslash;
1509
1510	if (!dfa->localeinfo.multibyte)
1511	{
1512	charclass ccl;
1513	zeroset (&ccl);
1514	for (int c2 = 0; c2 < NOTCHAR; ++c2)
1515	if (dfa->syntax.sbit[c2] == CTX_LETTER)
1516	setbit (c2, &ccl);
1517	if (c == 'W')
1518	notset (&ccl);
1519	dfa->lex.laststart = false;
1520	return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1521	}
1522
1523	/* FIXME: see if optimizing this, as is done with ANYCHAR and
1524	add_utf8_anychar, makes sense. */
1525
1526	/* \w and \W are documented to be equivalent to [_[:alnum:]] and
1527	[^_[:alnum:]] respectively, so tell the lexer to process those
1528	strings, each minus its "already processed" '['. */
1529	{
1530	struct lexptr ls;
1531	push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
1532	dfa->lex.lasttok = parse_bracket_exp (dfa);
1533	pop_lex_state (dfa, &ls);
1534	}
1535
1536	dfa->lex.laststart = false;
1537	return dfa->lex.lasttok;
1538
1539	case '[':
1540	if (backslash)
1541	goto normal_char;
1542	dfa->lex.laststart = false;
1543	return dfa->lex.lasttok = parse_bracket_exp (dfa);
1544
1545	default:
1546	default_case:
1547	if (!backslash)
1548	goto normal_char;
1549	stray_backslash:
1550	if (dfa->syntax.dfaopts & DFA_STRAY_BACKSLASH_WARN)
1551	{
1552	char const *msg;
1553	char msgbuf[100];
1554	if (!iswprint (dfa->lex.wctok))
1555	msg = _("stray \\ before unprintable character");
1556	else if (iswspace (dfa->lex.wctok))
1557	msg = _("stray \\ before white space");
1558	else
1559	{
1560	int n = snprintf (msgbuf, sizeof msgbuf,
1561	_("stray \\ before %lc"), dfa->lex.wctok);
1562	msg = 0 <= n && n < sizeof msgbuf ? msgbuf : _("stray \\");
1563	}
1564	dfawarn (msg);
1565	}
1566	FALLTHROUGH;
1567	case ']': case '}':
1568	normal_char:
1569	dfa->lex.laststart = false;
1570	/* For multibyte character sets, folding is done in atom. Always
1571	return WCHAR. */
1572	if (dfa->localeinfo.multibyte)
1573	return dfa->lex.lasttok = WCHAR;
1574
1575	if (dfa->syntax.case_fold && isalpha (c))
1576	{
1577	charclass ccl;
1578	zeroset (&ccl);
1579	setbit_case_fold_c (c, &ccl);
1580	return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1581	}
1582
1583	return dfa->lex.lasttok = c;
1584	}
1585	}
1586
1587	/* The above loop should consume at most a backslash
1588	and some other character. */
1589	abort ();
1590	return END; /* keeps pedantic compilers happy. */
1591	}
1592
1593	static void
1594	addtok_mb (struct dfa *dfa, token t, char mbprop)
1595	{
1596	if (dfa->talloc == dfa->tindex)
1597	{
1598	dfa->tokens = xpalloc (dfa->tokens, &dfa->talloc, 1, -1,
1599	sizeof *dfa->tokens);
1600	if (dfa->localeinfo.multibyte)
1601	dfa->multibyte_prop = xreallocarray (dfa->multibyte_prop, dfa->talloc,
1602	sizeof *dfa->multibyte_prop);
1603	}
1604	if (dfa->localeinfo.multibyte)
1605	dfa->multibyte_prop[dfa->tindex] = mbprop;
1606	dfa->tokens[dfa->tindex++] = t;
1607
1608	switch (t)
1609	{
1610	case QMARK:
1611	case STAR:
1612	case PLUS:
1613	break;
1614
1615	case CAT:
1616	case OR:
1617	dfa->parse.depth--;
1618	break;
1619
1620	case EMPTY:
1621	dfa->epsilon = true;
1622	goto increment_depth;
1623
1624	case BACKREF:
1625	dfa->fast = false;
1626	goto increment_nleaves;
1627
1628	case BEGLINE:
1629	case ENDLINE:
1630	case BEGWORD:
1631	case ENDWORD:
1632	case LIMWORD:
1633	case NOTLIMWORD:
1634	dfa->epsilon = true;
1635	FALLTHROUGH;
1636	default:
1637	increment_nleaves:
1638	dfa->nleaves++;
1639	increment_depth:
1640	dfa->parse.depth++;
1641	if (dfa->depth < dfa->parse.depth)
1642	dfa->depth = dfa->parse.depth;
1643	break;
1644	}
1645	}
1646
1647	static void addtok_wc (struct dfa *dfa, wint_t wc);
1648
1649	/* Add the given token to the parse tree, maintaining the depth count and
1650	updating the maximum depth if necessary. */
1651	static void
1652	addtok (struct dfa *dfa, token t)
1653	{
1654	if (dfa->localeinfo.multibyte && t == MBCSET)
1655	{
1656	bool need_or = false;
1657
1658	/* Extract wide characters into alternations for better performance.
1659	This does not require UTF-8. */
1660	for (idx_t i = 0; i < dfa->lex.brack.nchars; i++)
1661	{
1662	addtok_wc (dfa, dfa->lex.brack.chars[i]);
1663	if (need_or)
1664	addtok (dfa, OR);
1665	need_or = true;
1666	}
1667	dfa->lex.brack.nchars = 0;
1668
1669	/* Wide characters have been handled above, so it is possible
1670	that the set is empty now. Do nothing in that case. */
1671	if (dfa->lex.brack.cset != -1)
1672	{
1673	addtok (dfa, CSET + dfa->lex.brack.cset);
1674	if (need_or)
1675	addtok (dfa, OR);
1676	}
1677	}
1678	else
1679	{
1680	addtok_mb (dfa, t, 3);
1681	}
1682	}
1683
1684	/* We treat a multibyte character as a single atom, so that DFA
1685	can treat a multibyte character as a single expression.
1686
1687	e.g., we construct the following tree from "<mb1><mb2>".
1688	<mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1689	<mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1690	static void
1691	addtok_wc (struct dfa *dfa, wint_t wc)
1692	{
1693	unsigned char buf[MB_LEN_MAX];
1694	mbstate_t s = { 0 };
1695	size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
1696	int buflen;
1697
1698	if (stored_bytes != (size_t) -1)
1699	buflen = stored_bytes;
1700	else
1701	{
1702	/* This is merely stop-gap. buf[0] is undefined, yet skipping
1703	the addtok_mb call altogether can corrupt the heap. */
1704	buflen = 1;
1705	buf[0] = 0;
1706	}
1707
1708	addtok_mb (dfa, buf[0], buflen == 1 ? 3 : 1);
1709	for (int i = 1; i < buflen; i++)
1710	{
1711	addtok_mb (dfa, buf[i], i == buflen - 1 ? 2 : 0);
1712	addtok (dfa, CAT);
1713	}
1714	}
1715
1716	static void
1717	add_utf8_anychar (struct dfa *dfa)
1718	{
1719	/* Since the Unicode Standard Version 4.0.0 (2003), a well-formed
1720	UTF-8 byte sequence has been defined as follows:
1721
1722	([\x00-\x7f]
1723	\|[\xc2-\xdf][\x80-\xbf]
1724	\|[\xe0][\xa0-\xbf][\x80-\xbf]
1725	\|[\xe1-\xec\xee-\xef][\x80-\xbf][\x80-\xbf]
1726	\|[\xed][\x80-\x9f][\x80-\xbf]
1727	\|[\xf0][\x90-\xbf][\x80-\xbf][\x80-\xbf])
1728	\|[\xf1-\xf3][\x80-\xbf][\x80-\xbf][\x80-\xbf]
1729	\|[\xf4][\x80-\x8f][\x80-\xbf][\x80-\xbf])
1730
1731	which I'll write more concisely "A\|BC\|DEC\|FCC\|GHC\|IJCC\|KCCC\|LMCC",
1732	where A = [\x00-\x7f], B = [\xc2-\xdf], C = [\x80-\xbf],
1733	D = [\xe0], E = [\xa0-\xbf], F = [\xe1-\xec\xee-\xef], G = [\xed],
1734	H = [\x80-\x9f], I = [\xf0],
1735	J = [\x90-\xbf], K = [\xf1-\xf3], L = [\xf4], M = [\x80-\x8f].
1736
1737	This can be refactored to "A\|(B\|DE\|GH\|(F\|IJ\|LM\|KC)C)C". */
1738
1739	/* Mnemonics for classes containing two or more bytes. */
1740	enum { A, B, C, E, F, H, J, K, M };
1741
1742	/* Mnemonics for single-byte tokens. */
1743	enum { D_token = 0xe0, G_token = 0xed, I_token = 0xf0, L_token = 0xf4 };
1744
1745	static charclass const utf8_classes[] = {
1746	/* A. 00-7f: 1-byte sequence. */
1747	CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
1748
1749	/* B. c2-df: 1st byte of a 2-byte sequence. */
1750	CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
1751
1752	/* C. 80-bf: non-leading bytes. */
1753	CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
1754
1755	/* D. e0 (just a token). */
1756
1757	/* E. a0-bf: 2nd byte of a "DEC" sequence. */
1758	CHARCLASS_INIT (0, 0, 0, 0, 0, 0xffffffff, 0, 0),
1759
1760	/* F. e1-ec + ee-ef: 1st byte of an "FCC" sequence. */
1761	CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xdffe),
1762
1763	/* G. ed (just a token). */
1764
1765	/* H. 80-9f: 2nd byte of a "GHC" sequence. */
1766	CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0, 0, 0),
1767
1768	/* I. f0 (just a token). */
1769
1770	/* J. 90-bf: 2nd byte of an "IJCC" sequence. */
1771	CHARCLASS_INIT (0, 0, 0, 0, 0xffff0000, 0xffffffff, 0, 0),
1772
1773	/* K. f1-f3: 1st byte of a "KCCC" sequence. */
1774	CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xe0000),
1775
1776	/* L. f4 (just a token). */
1777
1778	/* M. 80-8f: 2nd byte of a "LMCC" sequence. */
1779	CHARCLASS_INIT (0, 0, 0, 0, 0xffff, 0, 0, 0),
1780	};
1781
1782	/* Define the character classes that are needed below. */
1783	if (dfa->utf8_anychar_classes[0] == 0)
1784	{
1785	charclass c = utf8_classes[0];
1786	if (! (dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1787	clrbit ('\n', &c);
1788	if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1789	clrbit ('\0', &c);
1790	dfa->utf8_anychar_classes[0] = CSET + charclass_index (dfa, &c);
1791
1792	for (int i = 1; i < sizeof utf8_classes / sizeof *utf8_classes; i++)
1793	dfa->utf8_anychar_classes[i]
1794	= CSET + charclass_index (dfa, &utf8_classes[i]);
1795	}
1796
1797	/* Implement the "A\|(B\|DE\|GH\|(F\|IJ\|LM\|KC)C)C" pattern mentioned above.
1798	The token buffer is in reverse Polish order, so we get
1799	"A B D E CAT OR G H CAT OR F I J CAT OR L M CAT OR K
1800	C CAT OR C CAT OR C CAT OR". */
1801	addtok (dfa, dfa->utf8_anychar_classes[A]);
1802	addtok (dfa, dfa->utf8_anychar_classes[B]);
1803	addtok (dfa, D_token);
1804	addtok (dfa, dfa->utf8_anychar_classes[E]);
1805	addtok (dfa, CAT);
1806	addtok (dfa, OR);
1807	addtok (dfa, G_token);
1808	addtok (dfa, dfa->utf8_anychar_classes[H]);
1809	addtok (dfa, CAT);
1810	addtok (dfa, OR);
1811	addtok (dfa, dfa->utf8_anychar_classes[F]);
1812	addtok (dfa, I_token);
1813	addtok (dfa, dfa->utf8_anychar_classes[J]);
1814	addtok (dfa, CAT);
1815	addtok (dfa, OR);
1816	addtok (dfa, L_token);
1817	addtok (dfa, dfa->utf8_anychar_classes[M]);
1818	addtok (dfa, CAT);
1819	addtok (dfa, OR);
1820	addtok (dfa, dfa->utf8_anychar_classes[K]);
1821	for (int i = 0; i < 3; i++)
1822	{
1823	addtok (dfa, dfa->utf8_anychar_classes[C]);
1824	addtok (dfa, CAT);
1825	addtok (dfa, OR);
1826	}
1827	}
1828
1829	/* The grammar understood by the parser is as follows.
1830
1831	regexp:
1832	regexp OR branch
1833	branch
1834
1835	branch:
1836	branch closure
1837	closure
1838
1839	closure:
1840	closure QMARK
1841	closure STAR
1842	closure PLUS
1843	closure REPMN
1844	atom
1845
1846	atom:
1847	<normal character>
1848	<multibyte character>
1849	ANYCHAR
1850	MBCSET
1851	CSET
1852	BACKREF
1853	BEGLINE
1854	ENDLINE
1855	BEGWORD
1856	ENDWORD
1857	LIMWORD
1858	NOTLIMWORD
1859	LPAREN regexp RPAREN
1860	<empty>
1861
1862	The parser builds a parse tree in postfix form in an array of tokens. */
1863
1864	static void
1865	atom (struct dfa *dfa)
1866	{
1867	if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
1868	\|\| dfa->parse.tok >= CSET
1869	\|\| dfa->parse.tok == BEG \|\| dfa->parse.tok == BACKREF
1870	\|\| dfa->parse.tok == BEGLINE \|\| dfa->parse.tok == ENDLINE
1871	\|\| dfa->parse.tok == BEGWORD \|\| dfa->parse.tok == ENDWORD
1872	\|\| dfa->parse.tok == LIMWORD \|\| dfa->parse.tok == NOTLIMWORD
1873	\|\| dfa->parse.tok == ANYCHAR \|\| dfa->parse.tok == MBCSET)
1874	{
1875	if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
1876	{
1877	/* For UTF-8 expand the period to a series of CSETs that define a
1878	valid UTF-8 character. This avoids using the slow multibyte
1879	path. I'm pretty sure it would be both profitable and correct to
1880	do it for any encoding; however, the optimization must be done
1881	manually as it is done above in add_utf8_anychar. So, let's
1882	start with UTF-8: it is the most used, and the structure of the
1883	encoding makes the correctness more obvious. */
1884	add_utf8_anychar (dfa);
1885	}
1886	else
1887	addtok (dfa, dfa->parse.tok);
1888	dfa->parse.tok = lex (dfa);
1889	}
1890	else if (dfa->parse.tok == WCHAR)
1891	{
1892	if (dfa->lex.wctok == WEOF)
1893	addtok (dfa, BACKREF);
1894	else
1895	{
1896	addtok_wc (dfa, dfa->lex.wctok);
1897
1898	if (dfa->syntax.case_fold)
1899	{
1900	wchar_t folded[CASE_FOLDED_BUFSIZE];
1901	int n = case_folded_counterparts (dfa->lex.wctok, folded);
1902	for (int i = 0; i < n; i++)
1903	{
1904	addtok_wc (dfa, folded[i]);
1905	addtok (dfa, OR);
1906	}
1907	}
1908	}
1909
1910	dfa->parse.tok = lex (dfa);
1911	}
1912	else if (dfa->parse.tok == LPAREN)
1913	{
1914	dfa->parse.tok = lex (dfa);
1915	regexp (dfa);
1916	if (dfa->parse.tok != RPAREN)
1917	dfaerror (_("unbalanced ("));
1918	dfa->parse.tok = lex (dfa);
1919	}
1920	else
1921	addtok (dfa, EMPTY);
1922	}
1923
1924	/* Return the number of tokens in the given subexpression. */
1925	static idx_t _GL_ATTRIBUTE_PURE
1926	nsubtoks (struct dfa const *dfa, idx_t tindex)
1927	{
1928	switch (dfa->tokens[tindex - 1])
1929	{
1930	default:
1931	return 1;
1932	case QMARK:
1933	case STAR:
1934	case PLUS:
1935	return 1 + nsubtoks (dfa, tindex - 1);
1936	case CAT:
1937	case OR:
1938	{
1939	idx_t ntoks1 = nsubtoks (dfa, tindex - 1);
1940	return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
1941	}
1942	}
1943	}
1944
1945	/* Copy the given subexpression to the top of the tree. */
1946	static void
1947	copytoks (struct dfa *dfa, idx_t tindex, idx_t ntokens)
1948	{
1949	if (dfa->localeinfo.multibyte)
1950	for (idx_t i = 0; i < ntokens; i++)
1951	addtok_mb (dfa, dfa->tokens[tindex + i],
1952	dfa->multibyte_prop[tindex + i]);
1953	else
1954	for (idx_t i = 0; i < ntokens; i++)
1955	addtok_mb (dfa, dfa->tokens[tindex + i], 3);
1956	}
1957
1958	static void
1959	closure (struct dfa *dfa)
1960	{
1961	atom (dfa);
1962	while (dfa->parse.tok == QMARK \|\| dfa->parse.tok == STAR
1963	\|\| dfa->parse.tok == PLUS \|\| dfa->parse.tok == REPMN)
1964	if (dfa->parse.tok == REPMN && (dfa->lex.minrep \|\| dfa->lex.maxrep))
1965	{
1966	idx_t ntokens = nsubtoks (dfa, dfa->tindex);
1967	idx_t tindex = dfa->tindex - ntokens;
1968	if (dfa->lex.maxrep < 0)
1969	addtok (dfa, PLUS);
1970	if (dfa->lex.minrep == 0)
1971	addtok (dfa, QMARK);
1972	int i;
1973	for (i = 1; i < dfa->lex.minrep; i++)
1974	{
1975	copytoks (dfa, tindex, ntokens);
1976	addtok (dfa, CAT);
1977	}
1978	for (; i < dfa->lex.maxrep; i++)
1979	{
1980	copytoks (dfa, tindex, ntokens);
1981	addtok (dfa, QMARK);
1982	addtok (dfa, CAT);
1983	}
1984	dfa->parse.tok = lex (dfa);
1985	}
1986	else if (dfa->parse.tok == REPMN)
1987	{
1988	dfa->tindex -= nsubtoks (dfa, dfa->tindex);
1989	dfa->parse.tok = lex (dfa);
1990	closure (dfa);
1991	}
1992	else
1993	{
1994	addtok (dfa, dfa->parse.tok);
1995	dfa->parse.tok = lex (dfa);
1996	}
1997	}
1998
1999	static void
2000	branch (struct dfa* dfa)
2001	{
2002	closure (dfa);
2003	while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
2004	&& dfa->parse.tok >= 0)
2005	{
2006	closure (dfa);
2007	addtok (dfa, CAT);
2008	}
2009	}
2010
2011	static void
2012	regexp (struct dfa *dfa)
2013	{
2014	branch (dfa);
2015	while (dfa->parse.tok == OR)
2016	{
2017	dfa->parse.tok = lex (dfa);
2018	branch (dfa);
2019	addtok (dfa, OR);
2020	}
2021	}
2022
2023	/* Parse a string S of length LEN into D. S can include NUL characters.
2024	This is the main entry point for the parser. */
2025	void
2026	dfaparse (char const s, idx_t len, struct dfa d)
2027	{
2028	d->lex.ptr = s;
2029	d->lex.left = len;
2030	d->lex.lasttok = END;
2031	d->lex.laststart = true;
2032
2033	if (!d->syntax.syntax_bits_set)
2034	dfaerror (_("no syntax specified"));
2035
2036	if (!d->nregexps)
2037	addtok (d, BEG);
2038
2039	d->parse.tok = lex (d);
2040	d->parse.depth = d->depth;
2041
2042	regexp (d);
2043
2044	if (d->parse.tok != END)
2045	dfaerror (_("unbalanced )"));
2046
2047	addtok (d, END - d->nregexps);
2048	addtok (d, CAT);
2049
2050	if (d->nregexps)
2051	addtok (d, OR);
2052
2053	++d->nregexps;
2054	}
2055
2056	/* Some primitives for operating on sets of positions. */
2057
2058	/* Copy one set to another. */
2059	static void
2060	copy (position_set const src, position_set dst)
2061	{
2062	if (dst->alloc < src->nelem)
2063	{
2064	free (dst->elems);
2065	dst->elems = xpalloc (NULL, &dst->alloc, src->nelem - dst->alloc, -1,
2066	sizeof *dst->elems);
2067	}
2068	dst->nelem = src->nelem;
2069	if (src->nelem != 0)
2070	memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
2071	}
2072
2073	static void
2074	alloc_position_set (position_set *s, idx_t size)
2075	{
2076	s->elems = xnmalloc (size, sizeof *s->elems);
2077	s->alloc = size;
2078	s->nelem = 0;
2079	}
2080
2081	/* Insert position P in set S. S is maintained in sorted order on
2082	decreasing index. If there is already an entry in S with P.index
2083	then merge (logically-OR) P's constraints into the one in S.
2084	S->elems must point to an array large enough to hold the resulting set. */
2085	static void
2086	insert (position p, position_set *s)
2087	{
2088	idx_t count = s->nelem;
2089	idx_t lo = 0, hi = count;
2090	while (lo < hi)
2091	{
2092	idx_t mid = (lo + hi) >> 1;
2093	if (s->elems[mid].index < p.index)
2094	lo = mid + 1;
2095	else if (s->elems[mid].index == p.index)
2096	{
2097	s->elems[mid].constraint \|= p.constraint;
2098	return;
2099	}
2100	else
2101	hi = mid;
2102	}
2103
2104	s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2105	for (idx_t i = count; i > lo; i--)
2106	s->elems[i] = s->elems[i - 1];
2107	s->elems[lo] = p;
2108	++s->nelem;
2109	}
2110
2111	static void
2112	append (position p, position_set *s)
2113	{
2114	idx_t count = s->nelem;
2115	s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2116	s->elems[s->nelem++] = p;
2117	}
2118
2119	/* Merge S1 and S2 (with the additional constraint C2) into M. The
2120	result is as if the positions of S1, and of S2 with the additional
2121	constraint C2, were inserted into an initially empty set. */
2122	static void
2123	merge_constrained (position_set const s1, position_set const s2,
2124	unsigned int c2, position_set *m)
2125	{
2126	idx_t i = 0, j = 0;
2127
2128	if (m->alloc - s1->nelem < s2->nelem)
2129	{
2130	free (m->elems);
2131	m->alloc = s1->nelem;
2132	m->elems = xpalloc (NULL, &m->alloc, s2->nelem, -1, sizeof *m->elems);
2133	}
2134	m->nelem = 0;
2135	while (i < s1->nelem \|\| j < s2->nelem)
2136	if (! (j < s2->nelem)
2137	\|\| (i < s1->nelem && s1->elems[i].index <= s2->elems[j].index))
2138	{
2139	unsigned int c = ((i < s1->nelem && j < s2->nelem
2140	&& s1->elems[i].index == s2->elems[j].index)
2141	? s2->elems[j++].constraint & c2
2142	: 0);
2143	m->elems[m->nelem].index = s1->elems[i].index;
2144	m->elems[m->nelem++].constraint = s1->elems[i++].constraint \| c;
2145	}
2146	else
2147	{
2148	if (s2->elems[j].constraint & c2)
2149	{
2150	m->elems[m->nelem].index = s2->elems[j].index;
2151	m->elems[m->nelem++].constraint = s2->elems[j].constraint & c2;
2152	}
2153	j++;
2154	}
2155	}
2156
2157	/* Merge two sets of positions into a third. The result is exactly as if
2158	the positions of both sets were inserted into an initially empty set. */
2159	static void
2160	merge (position_set const s1, position_set const s2, position_set *m)
2161	{
2162	merge_constrained (s1, s2, -1, m);
2163	}
2164
2165	/* Merge into DST all the elements of SRC, possibly destroying
2166	the contents of the temporary M. */
2167	static void
2168	merge2 (position_set dst, position_set const src, position_set *m)
2169	{
2170	if (src->nelem < 4)
2171	{
2172	for (idx_t i = 0; i < src->nelem; i++)
2173	insert (src->elems[i], dst);
2174	}
2175	else
2176	{
2177	merge (src, dst, m);
2178	copy (m, dst);
2179	}
2180	}
2181
2182	/* Delete a position from a set. Return the nonzero constraint of the
2183	deleted position, or zero if there was no such position. */
2184	static unsigned int
2185	delete (idx_t del, position_set *s)
2186	{
2187	idx_t count = s->nelem;
2188	idx_t lo = 0, hi = count;
2189	while (lo < hi)
2190	{
2191	idx_t mid = (lo + hi) >> 1;
2192	if (s->elems[mid].index < del)
2193	lo = mid + 1;
2194	else if (s->elems[mid].index == del)
2195	{
2196	unsigned int c = s->elems[mid].constraint;
2197	idx_t i;
2198	for (i = mid; i + 1 < count; i++)
2199	s->elems[i] = s->elems[i + 1];
2200	s->nelem = i;
2201	return c;
2202	}
2203	else
2204	hi = mid;
2205	}
2206	return 0;
2207	}
2208
2209	/* Replace a position with the followed set. */
2210	static void
2211	replace (position_set dst, idx_t del, position_set add,
2212	unsigned int constraint, position_set *tmp)
2213	{
2214	unsigned int c = delete (del, dst) & constraint;
2215
2216	if (c)
2217	{
2218	copy (dst, tmp);
2219	merge_constrained (tmp, add, c, dst);
2220	}
2221	}
2222
2223	/* Find the index of the state corresponding to the given position set with
2224	the given preceding context, or create a new state if there is no such
2225	state. Context tells whether we got here on a newline or letter. */
2226	static state_num
2227	state_index (struct dfa d, position_set const s, int context)
2228	{
2229	size_t hash = 0;
2230	int constraint = 0;
2231	state_num i;
2232
2233	for (i = 0; i < s->nelem; ++i)
2234	{
2235	idx_t ind = s->elems[i].index;
2236	hash ^= ind + s->elems[i].constraint;
2237	}
2238
2239	/* Try to find a state that exactly matches the proposed one. */
2240	for (i = 0; i < d->sindex; ++i)
2241	{
2242	if (hash != d->states[i].hash \|\| s->nelem != d->states[i].elems.nelem
2243	\|\| context != d->states[i].context)
2244	continue;
2245	state_num j;
2246	for (j = 0; j < s->nelem; ++j)
2247	if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint
2248	\|\| s->elems[j].index != d->states[i].elems.elems[j].index)
2249	break;
2250	if (j == s->nelem)
2251	return i;
2252	}
2253
2254	#ifdef DEBUG
2255	fprintf (stderr, "new state %td\n nextpos:", i);
2256	for (state_num j = 0; j < s->nelem; j++)
2257	{
2258	fprintf (stderr, " %td:", s->elems[j].index);
2259	prtok (d->tokens[s->elems[j].index]);
2260	}
2261	fprintf (stderr, "\n context:");
2262	if (context ^ CTX_ANY)
2263	{
2264	if (context & CTX_NONE)
2265	fprintf (stderr, " CTX_NONE");
2266	if (context & CTX_LETTER)
2267	fprintf (stderr, " CTX_LETTER");
2268	if (context & CTX_NEWLINE)
2269	fprintf (stderr, " CTX_NEWLINE");
2270	}
2271	else
2272	fprintf (stderr, " CTX_ANY");
2273	fprintf (stderr, "\n");
2274	#endif
2275
2276	for (state_num j = 0; j < s->nelem; j++)
2277	{
2278	int c = d->constraints[s->elems[j].index];
2279
2280	if (c != 0)
2281	{
2282	if (succeeds_in_context (c, context, CTX_ANY))
2283	constraint \|= c;
2284	}
2285	else if (d->tokens[s->elems[j].index] == BACKREF)
2286	constraint = NO_CONSTRAINT;
2287	}
2288
2289
2290	/* Create a new state. */
2291	d->states = maybe_realloc (d->states, d->sindex, &d->salloc, -1,
2292	sizeof *d->states);
2293	d->states[i].hash = hash;
2294	alloc_position_set (&d->states[i].elems, s->nelem);
2295	copy (s, &d->states[i].elems);
2296	d->states[i].context = context;
2297	d->states[i].constraint = constraint;
2298	d->states[i].mbps.nelem = 0;
2299	d->states[i].mbps.elems = NULL;
2300	d->states[i].mb_trindex = -1;
2301
2302	++d->sindex;
2303
2304	return i;
2305	}
2306
2307	/* Find the epsilon closure of D's set of positions. If any position of the set
2308	contains a symbol that matches the empty string in some context, replace
2309	that position with the elements of its follow labeled with an appropriate
2310	constraint. Repeat exhaustively until no funny positions are left.
2311	S->elems must be large enough to hold the result. BACKWARD is D's
2312	backward set; use and update it too. */
2313	static void
2314	epsclosure (struct dfa const d, position_set backward)
2315	{
2316	position_set tmp;
2317	alloc_position_set (&tmp, d->nleaves);
2318	for (idx_t i = 0; i < d->tindex; i++)
2319	if (0 < d->follows[i].nelem)
2320	{
2321	unsigned int constraint;
2322	switch (d->tokens[i])
2323	{
2324	default:
2325	continue;
2326
2327	case BEGLINE:
2328	constraint = BEGLINE_CONSTRAINT;
2329	break;
2330	case ENDLINE:
2331	constraint = ENDLINE_CONSTRAINT;
2332	break;
2333	case BEGWORD:
2334	constraint = BEGWORD_CONSTRAINT;
2335	break;
2336	case ENDWORD:
2337	constraint = ENDWORD_CONSTRAINT;
2338	break;
2339	case LIMWORD:
2340	constraint = LIMWORD_CONSTRAINT;
2341	break;
2342	case NOTLIMWORD:
2343	constraint = NOTLIMWORD_CONSTRAINT;
2344	break;
2345	case EMPTY:
2346	constraint = NO_CONSTRAINT;
2347	break;
2348	}
2349
2350	delete (i, &d->follows[i]);
2351
2352	for (idx_t j = 0; j < backward[i].nelem; j++)
2353	replace (&d->follows[backward[i].elems[j].index], i, &d->follows[i],
2354	constraint, &tmp);
2355	for (idx_t j = 0; j < d->follows[i].nelem; j++)
2356	replace (&backward[d->follows[i].elems[j].index], i, &backward[i],
2357	NO_CONSTRAINT, &tmp);
2358	}
2359	free (tmp.elems);
2360	}
2361
2362	/* Returns the set of contexts for which there is at least one
2363	character included in C. */
2364
2365	static int
2366	charclass_context (struct dfa const dfa, charclass const c)
2367	{
2368	int context = 0;
2369
2370	for (int j = 0; j < CHARCLASS_WORDS; j++)
2371	{
2372	if (c->w[j] & dfa->syntax.newline.w[j])
2373	context \|= CTX_NEWLINE;
2374	if (c->w[j] & dfa->syntax.letters.w[j])
2375	context \|= CTX_LETTER;
2376	if (c->w[j] & ~(dfa->syntax.letters.w[j] \| dfa->syntax.newline.w[j]))
2377	context \|= CTX_NONE;
2378	}
2379
2380	return context;
2381	}
2382
2383	/* Returns the contexts on which the position set S depends. Each context
2384	in the set of returned contexts (let's call it SC) may have a different
2385	follow set than other contexts in SC, and also different from the
2386	follow set of the complement set (sc ^ CTX_ANY). However, all contexts
2387	in the complement set will have the same follow set. */
2388
2389	static int _GL_ATTRIBUTE_PURE
2390	state_separate_contexts (struct dfa d, position_set const s)
2391	{
2392	int separate_contexts = 0;
2393
2394	for (idx_t j = 0; j < s->nelem; j++)
2395	separate_contexts \|= d->separates[s->elems[j].index];
2396
2397	return separate_contexts;
2398	}
2399
2400	enum
2401	{
2402	/* Single token is repeated. It is distinguished from non-repeated. */
2403	OPT_REPEAT = (1 << 0),
2404
2405	/* Multiple tokens are repeated. This flag is on at head of tokens. The
2406	node is not merged. */
2407	OPT_LPAREN = (1 << 1),
2408
2409	/* Multiple branches are joined. The node is not merged. */
2410	OPT_RPAREN = (1 << 2),
2411
2412	/* The node is walked. If the node is found in walking again, OPT_RPAREN
2413	flag is turned on. */
2414	OPT_WALKED = (1 << 3),
2415
2416	/* The node is queued. The node is not queued again. */
2417	OPT_QUEUED = (1 << 4)
2418	};
2419
2420	static void
2421	merge_nfa_state (struct dfa d, idx_t tindex, char flags,
2422	position_set *merged)
2423	{
2424	position_set *follows = d->follows;
2425	idx_t nelem = 0;
2426
2427	for (idx_t i = 0; i < follows[tindex].nelem; i++)
2428	{
2429	idx_t sindex = follows[tindex].elems[i].index;
2430
2431	/* Skip the node as pruned in future. */
2432	unsigned int iconstraint = follows[tindex].elems[i].constraint;
2433	if (iconstraint == 0)
2434	continue;
2435
2436	if (d->tokens[follows[tindex].elems[i].index] <= END)
2437	{
2438	d->constraints[tindex] \|= follows[tindex].elems[i].constraint;
2439	continue;
2440	}
2441
2442	if (sindex != tindex && !(flags[sindex] & (OPT_LPAREN \| OPT_RPAREN)))
2443	{
2444	idx_t j;
2445
2446	for (j = 0; j < nelem; j++)
2447	{
2448	idx_t dindex = follows[tindex].elems[j].index;
2449
2450	if (dindex == tindex)
2451	continue;
2452
2453	if (follows[tindex].elems[j].constraint != iconstraint)
2454	continue;
2455
2456	if (flags[dindex] & (OPT_LPAREN \| OPT_RPAREN))
2457	continue;
2458
2459	if (d->tokens[sindex] != d->tokens[dindex])
2460	continue;
2461
2462	if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT)
2463	continue;
2464
2465	if (flags[sindex] & OPT_REPEAT)
2466	delete (sindex, &follows[sindex]);
2467
2468	merge2 (&follows[dindex], &follows[sindex], merged);
2469
2470	break;
2471	}
2472
2473	if (j < nelem)
2474	continue;
2475	}
2476
2477	follows[tindex].elems[nelem++] = follows[tindex].elems[i];
2478	flags[sindex] \|= OPT_QUEUED;
2479	}
2480
2481	follows[tindex].nelem = nelem;
2482	}
2483
2484	static int
2485	compare (const void a, const void b)
2486	{
2487	position const p = a, q = b;
2488	return (p->index > q->index) - (p->index < q->index);
2489	}
2490
2491	static void
2492	reorder_tokens (struct dfa *d)
2493	{
2494	idx_t nleaves = 0;
2495	ptrdiff_t map = xnmalloc (d->tindex, sizeof map);
2496	map[0] = nleaves++;
2497	for (idx_t i = 1; i < d->tindex; i++)
2498	map[i] = -1;
2499
2500	token tokens = xnmalloc (d->nleaves, sizeof tokens);
2501	position_set follows = xnmalloc (d->nleaves, sizeof follows);
2502	int constraints = xnmalloc (d->nleaves, sizeof constraints);
2503	char *multibyte_prop = (d->localeinfo.multibyte
2504	? xnmalloc (d->nleaves, sizeof *multibyte_prop)
2505	: NULL);
2506
2507	for (idx_t i = 0; i < d->tindex; i++)
2508	{
2509	if (map[i] < 0)
2510	{
2511	free (d->follows[i].elems);
2512	d->follows[i].elems = NULL;
2513	d->follows[i].nelem = 0;
2514	continue;
2515	}
2516
2517	tokens[map[i]] = d->tokens[i];
2518	follows[map[i]] = d->follows[i];
2519	constraints[map[i]] = d->constraints[i];
2520
2521	if (multibyte_prop != NULL)
2522	multibyte_prop[map[i]] = d->multibyte_prop[i];
2523
2524	for (idx_t j = 0; j < d->follows[i].nelem; j++)
2525	{
2526	if (map[d->follows[i].elems[j].index] == -1)
2527	map[d->follows[i].elems[j].index] = nleaves++;
2528
2529	d->follows[i].elems[j].index = map[d->follows[i].elems[j].index];
2530	}
2531
2532	qsort (d->follows[i].elems, d->follows[i].nelem,
2533	sizeof *d->follows[i].elems, compare);
2534	}
2535
2536	for (idx_t i = 0; i < nleaves; i++)
2537	{
2538	d->tokens[i] = tokens[i];
2539	d->follows[i] = follows[i];
2540	d->constraints[i] = constraints[i];
2541
2542	if (multibyte_prop != NULL)
2543	d->multibyte_prop[i] = multibyte_prop[i];
2544	}
2545
2546	d->tindex = d->nleaves = nleaves;
2547
2548	free (tokens);
2549	free (follows);
2550	free (constraints);
2551	free (multibyte_prop);
2552	free (map);
2553	}
2554
2555	static void
2556	dfaoptimize (struct dfa *d)
2557	{
2558	char *flags = xizalloc (d->tindex);
2559
2560	for (idx_t i = 0; i < d->tindex; i++)
2561	{
2562	for (idx_t j = 0; j < d->follows[i].nelem; j++)
2563	{
2564	if (d->follows[i].elems[j].index == i)
2565	flags[d->follows[i].elems[j].index] \|= OPT_REPEAT;
2566	else if (d->follows[i].elems[j].index < i)
2567	flags[d->follows[i].elems[j].index] \|= OPT_LPAREN;
2568	else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED)
2569	flags[d->follows[i].elems[j].index] \|= OPT_RPAREN;
2570	else
2571	flags[d->follows[i].elems[j].index] \|= OPT_WALKED;
2572	}
2573	}
2574
2575	flags[0] \|= OPT_QUEUED;
2576
2577	position_set merged0;
2578	position_set *merged = &merged0;
2579	alloc_position_set (merged, d->nleaves);
2580
2581	d->constraints = xicalloc (d->tindex, sizeof *d->constraints);
2582
2583	for (idx_t i = 0; i < d->tindex; i++)
2584	if (flags[i] & OPT_QUEUED)
2585	merge_nfa_state (d, i, flags, merged);
2586
2587	reorder_tokens (d);
2588
2589	free (merged->elems);
2590	free (flags);
2591	}
2592
2593	/* Perform bottom-up analysis on the parse tree, computing various functions.
2594	Note that at this point, we're pretending constructs like \< are real
2595	characters rather than constraints on what can follow them.
2596
2597	Nullable: A node is nullable if it is at the root of a regexp that can
2598	match the empty string.
2599	* EMPTY leaves are nullable.
2600	* No other leaf is nullable.
2601	* A QMARK or STAR node is nullable.
2602	* A PLUS node is nullable if its argument is nullable.
2603	* A CAT node is nullable if both its arguments are nullable.
2604	* An OR node is nullable if either argument is nullable.
2605
2606	Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
2607	that could correspond to the first character of a string matching the
2608	regexp rooted at the given node.
2609	* EMPTY leaves have empty firstpos.
2610	* The firstpos of a nonempty leaf is that leaf itself.
2611	* The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2612	argument.
2613	* The firstpos of a CAT node is the firstpos of the left argument, union
2614	the firstpos of the right if the left argument is nullable.
2615	* The firstpos of an OR node is the union of firstpos of each argument.
2616
2617	Lastpos: The lastpos of a node is the set of positions that could
2618	correspond to the last character of a string matching the regexp at
2619	the given node.
2620	* EMPTY leaves have empty lastpos.
2621	* The lastpos of a nonempty leaf is that leaf itself.
2622	* The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2623	argument.
2624	* The lastpos of a CAT node is the lastpos of its right argument, union
2625	the lastpos of the left if the right argument is nullable.
2626	* The lastpos of an OR node is the union of the lastpos of each argument.
2627
2628	Follow: The follow of a position is the set of positions that could
2629	correspond to the character following a character matching the node in
2630	a string matching the regexp. At this point we consider special symbols
2631	that match the empty string in some context to be just normal characters.
2632	Later, if we find that a special symbol is in a follow set, we will
2633	replace it with the elements of its follow, labeled with an appropriate
2634	constraint.
2635	* Every node in the firstpos of the argument of a STAR or PLUS node is in
2636	the follow of every node in the lastpos.
2637	* Every node in the firstpos of the second argument of a CAT node is in
2638	the follow of every node in the lastpos of the first argument.
2639
2640	Because of the postfix representation of the parse tree, the depth-first
2641	analysis is conveniently done by a linear scan with the aid of a stack.
2642	Sets are stored as arrays of the elements, obeying a stack-like allocation
2643	scheme; the number of elements in each set deeper in the stack can be
2644	used to determine the address of a particular set's array. */
2645	static void
2646	dfaanalyze (struct dfa *d, bool searchflag)
2647	{
2648	/* Array allocated to hold position sets. */
2649	position posalloc = xnmalloc (d->nleaves, 2 sizeof *posalloc);
2650	/* Firstpos and lastpos elements. */
2651	position *firstpos = posalloc;
2652	position *lastpos = firstpos + d->nleaves;
2653	position pos;
2654	position_set tmp;
2655
2656	/* Stack for element counts and nullable flags. */
2657	struct
2658	{
2659	/* Whether the entry is nullable. */
2660	bool nullable;
2661
2662	/* Counts of firstpos and lastpos sets. */
2663	idx_t nfirstpos;
2664	idx_t nlastpos;
2665	} stkalloc = xnmalloc (d->depth, sizeof stkalloc), *stk = stkalloc;
2666
2667	position_set merged; /* Result of merging sets. */
2668
2669	addtok (d, CAT);
2670	idx_t tindex = d->tindex;
2671
2672	#ifdef DEBUG
2673	fprintf (stderr, "dfaanalyze:\n");
2674	for (idx_t i = 0; i < tindex; i++)
2675	{
2676	fprintf (stderr, " %td:", i);
2677	prtok (d->tokens[i]);
2678	}
2679	putc ('\n', stderr);
2680	#endif
2681
2682	d->searchflag = searchflag;
2683	alloc_position_set (&merged, d->nleaves);
2684	d->follows = xicalloc (tindex, sizeof *d->follows);
2685	position_set *backward
2686	= d->epsilon ? xicalloc (tindex, sizeof *backward) : NULL;
2687
2688	for (idx_t i = 0; i < tindex; i++)
2689	{
2690	switch (d->tokens[i])
2691	{
2692	case EMPTY:
2693	/* The empty set is nullable. */
2694	stk->nullable = true;
2695
2696	/* The firstpos and lastpos of the empty leaf are both empty. */
2697	stk->nfirstpos = stk->nlastpos = 0;
2698	stk++;
2699	break;
2700
2701	case STAR:
2702	case PLUS:
2703	/* Every element in the lastpos of the argument is in the backward
2704	set of every element in the firstpos. */
2705	if (d->epsilon)
2706	{
2707	tmp.elems = lastpos - stk[-1].nlastpos;
2708	tmp.nelem = stk[-1].nlastpos;
2709	for (position *p = firstpos - stk[-1].nfirstpos;
2710	p < firstpos; p++)
2711	merge2 (&backward[p->index], &tmp, &merged);
2712	}
2713
2714	/* Every element in the firstpos of the argument is in the follow
2715	of every element in the lastpos. */
2716	{
2717	tmp.elems = firstpos - stk[-1].nfirstpos;
2718	tmp.nelem = stk[-1].nfirstpos;
2719	for (position *p = lastpos - stk[-1].nlastpos; p < lastpos; p++)
2720	merge2 (&d->follows[p->index], &tmp, &merged);
2721	}
2722	FALLTHROUGH;
2723	case QMARK:
2724	/* A QMARK or STAR node is automatically nullable. */
2725	if (d->tokens[i] != PLUS)
2726	stk[-1].nullable = true;
2727	break;
2728
2729	case CAT:
2730	/* Every element in the lastpos of the first argument is in
2731	the backward set of every element in the firstpos of the
2732	second argument. */
2733	if (backward)
2734	{
2735	tmp.nelem = stk[-2].nlastpos;
2736	tmp.elems = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2737	for (position *p = firstpos - stk[-1].nfirstpos;
2738	p < firstpos; p++)
2739	merge2 (&backward[p->index], &tmp, &merged);
2740	}
2741
2742	/* Every element in the firstpos of the second argument is in the
2743	follow of every element in the lastpos of the first argument. */
2744	{
2745	tmp.nelem = stk[-1].nfirstpos;
2746	tmp.elems = firstpos - stk[-1].nfirstpos;
2747	for (position *plim = lastpos - stk[-1].nlastpos,
2748	*p = plim - stk[-2].nlastpos;
2749	p < plim; p++)
2750	merge2 (&d->follows[p->index], &tmp, &merged);
2751	}
2752
2753	/* The firstpos of a CAT node is the firstpos of the first argument,
2754	union that of the second argument if the first is nullable. */
2755	if (stk[-2].nullable)
2756	stk[-2].nfirstpos += stk[-1].nfirstpos;
2757	else
2758	firstpos -= stk[-1].nfirstpos;
2759
2760	/* The lastpos of a CAT node is the lastpos of the second argument,
2761	union that of the first argument if the second is nullable. */
2762	if (stk[-1].nullable)
2763	stk[-2].nlastpos += stk[-1].nlastpos;
2764	else
2765	{
2766	position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2767	for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2768	p[j] = p[j + stk[-2].nlastpos];
2769	lastpos -= stk[-2].nlastpos;
2770	stk[-2].nlastpos = stk[-1].nlastpos;
2771	}
2772
2773	/* A CAT node is nullable if both arguments are nullable. */
2774	stk[-2].nullable &= stk[-1].nullable;
2775	stk--;
2776	break;
2777
2778	case OR:
2779	/* The firstpos is the union of the firstpos of each argument. */
2780	stk[-2].nfirstpos += stk[-1].nfirstpos;
2781
2782	/* The lastpos is the union of the lastpos of each argument. */
2783	stk[-2].nlastpos += stk[-1].nlastpos;
2784
2785	/* An OR node is nullable if either argument is nullable. */
2786	stk[-2].nullable \|= stk[-1].nullable;
2787	stk--;
2788	break;
2789
2790	default:
2791	/* Anything else is a nonempty position. (Note that special
2792	constructs like \< are treated as nonempty strings here;
2793	an "epsilon closure" effectively makes them nullable later.
2794	Backreferences have to get a real position so we can detect
2795	transitions on them later. But they are nullable. */
2796	stk->nullable = d->tokens[i] == BACKREF;
2797
2798	/* This position is in its own firstpos and lastpos. */
2799	stk->nfirstpos = stk->nlastpos = 1;
2800	stk++;
2801
2802	firstpos->index = lastpos->index = i;
2803	firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2804	firstpos++, lastpos++;
2805
2806	break;
2807	}
2808	#ifdef DEBUG
2809	/* ... balance the above nonsyntactic #ifdef goo... */
2810	fprintf (stderr, "node %td:", i);
2811	prtok (d->tokens[i]);
2812	putc ('\n', stderr);
2813	fprintf (stderr,
2814	stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
2815	fprintf (stderr, " firstpos:");
2816	for (idx_t j = 0; j < stk[-1].nfirstpos; j++)
2817	{
2818	fprintf (stderr, " %td:", firstpos[j - stk[-1].nfirstpos].index);
2819	prtok (d->tokens[firstpos[j - stk[-1].nfirstpos].index]);
2820	}
2821	fprintf (stderr, "\n lastpos:");
2822	for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2823	{
2824	fprintf (stderr, " %td:", lastpos[j - stk[-1].nlastpos].index);
2825	prtok (d->tokens[lastpos[j - stk[-1].nlastpos].index]);
2826	}
2827	putc ('\n', stderr);
2828	#endif
2829	}
2830
2831	if (backward)
2832	{
2833	/* For each follow set that is the follow set of a real position,
2834	replace it with its epsilon closure. */
2835	epsclosure (d, backward);
2836
2837	for (idx_t i = 0; i < tindex; i++)
2838	free (backward[i].elems);
2839	free (backward);
2840	}
2841
2842	dfaoptimize (d);
2843
2844	#ifdef DEBUG
2845	for (idx_t i = 0; i < tindex; i++)
2846	if (d->tokens[i] == BEG \|\| d->tokens[i] < NOTCHAR
2847	\|\| d->tokens[i] == BACKREF \|\| d->tokens[i] == ANYCHAR
2848	\|\| d->tokens[i] == MBCSET \|\| d->tokens[i] >= CSET)
2849	{
2850	fprintf (stderr, "follows(%td:", i);
2851	prtok (d->tokens[i]);
2852	fprintf (stderr, "):");
2853	for (idx_t j = 0; j < d->follows[i].nelem; j++)
2854	{
2855	fprintf (stderr, " %td:", d->follows[i].elems[j].index);
2856	prtok (d->tokens[d->follows[i].elems[j].index]);
2857	}
2858	putc ('\n', stderr);
2859	}
2860	#endif
2861
2862	pos.index = 0;
2863	pos.constraint = NO_CONSTRAINT;
2864
2865	alloc_position_set (&tmp, 1);
2866
2867	append (pos, &tmp);
2868
2869	d->separates = xicalloc (tindex, sizeof *d->separates);
2870
2871	for (idx_t i = 0; i < tindex; i++)
2872	{
2873	if (prev_newline_dependent (d->constraints[i]))
2874	d->separates[i] \|= CTX_NEWLINE;
2875	if (prev_letter_dependent (d->constraints[i]))
2876	d->separates[i] \|= CTX_LETTER;
2877
2878	for (idx_t j = 0; j < d->follows[i].nelem; j++)
2879	{
2880	if (prev_newline_dependent (d->follows[i].elems[j].constraint))
2881	d->separates[i] \|= CTX_NEWLINE;
2882	if (prev_letter_dependent (d->follows[i].elems[j].constraint))
2883	d->separates[i] \|= CTX_LETTER;
2884	}
2885	}
2886
2887	/* Context wanted by some position. */
2888	int separate_contexts = state_separate_contexts (d, &tmp);
2889
2890	/* Build the initial state. */
2891	if (separate_contexts & CTX_NEWLINE)
2892	state_index (d, &tmp, CTX_NEWLINE);
2893	d->initstate_notbol = d->min_trcount
2894	= state_index (d, &tmp, separate_contexts ^ CTX_ANY);
2895	if (separate_contexts & CTX_LETTER)
2896	d->min_trcount = state_index (d, &tmp, CTX_LETTER);
2897	d->min_trcount++;
2898	d->trcount = 0;
2899
2900	free (posalloc);
2901	free (stkalloc);
2902	free (merged.elems);
2903	free (tmp.elems);
2904	}
2905
2906	/* Make sure D's state arrays are large enough to hold NEW_STATE. */
2907	static void
2908	realloc_trans_if_necessary (struct dfa *d)
2909	{
2910	state_num oldalloc = d->tralloc;
2911	if (oldalloc < d->sindex)
2912	{
2913	state_num **realtrans = d->trans ? d->trans - 2 : NULL;
2914	idx_t newalloc1 = realtrans ? d->tralloc + 2 : 0;
2915	realtrans = xpalloc (realtrans, &newalloc1, d->sindex - oldalloc,
2916	-1, sizeof *realtrans);
2917	realtrans[0] = realtrans[1] = NULL;
2918	d->trans = realtrans + 2;
2919	idx_t newalloc = d->tralloc = newalloc1 - 2;
2920	d->fails = xreallocarray (d->fails, newalloc, sizeof *d->fails);
2921	d->success = xreallocarray (d->success, newalloc, sizeof *d->success);
2922	d->newlines = xreallocarray (d->newlines, newalloc, sizeof *d->newlines);
2923	if (d->localeinfo.multibyte)
2924	{
2925	realtrans = d->mb_trans ? d->mb_trans - 2 : NULL;
2926	realtrans = xreallocarray (realtrans, newalloc1, sizeof *realtrans);
2927	if (oldalloc == 0)
2928	realtrans[0] = realtrans[1] = NULL;
2929	d->mb_trans = realtrans + 2;
2930	}
2931	for (; oldalloc < newalloc; oldalloc++)
2932	{
2933	d->trans[oldalloc] = NULL;
2934	d->fails[oldalloc] = NULL;
2935	if (d->localeinfo.multibyte)
2936	d->mb_trans[oldalloc] = NULL;
2937	}
2938	}
2939	}
2940
2941	/*
2942	Calculate the transition table for a new state derived from state s
2943	for a compiled dfa d after input character uc, and return the new
2944	state number.
2945
2946	Do not worry about all possible input characters; calculate just the group
2947	of positions that match uc. Label it with the set of characters that
2948	every position in the group matches (taking into account, if necessary,
2949	preceding context information of s). Then find the union
2950	of these positions' follows, i.e., the set of positions of the
2951	new state. For each character in the group's label, set the transition
2952	on this character to be to a state corresponding to the set's positions,
2953	and its associated backward context information, if necessary.
2954
2955	When building a searching matcher, include the positions of state
2956	0 in every state.
2957
2958	The group is constructed by building an equivalence-class
2959	partition of the positions of s.
2960
2961	For each position, find the set of characters C that it matches. Eliminate
2962	any characters from C that fail on grounds of backward context.
2963
2964	Check whether the group's label L has nonempty
2965	intersection with C. If L - C is nonempty, create a new group labeled
2966	L - C and having the same positions as the current group, and set L to
2967	the intersection of L and C. Insert the position in the group, set
2968	C = C - L, and resume scanning.
2969
2970	If after comparing with every group there are characters remaining in C,
2971	create a new group labeled with the characters of C and insert this
2972	position in that group. */
2973
2974	static state_num
2975	build_state (state_num s, struct dfa *d, unsigned char uc)
2976	{
2977	position_set follows; /* Union of the follows for each
2978	position of the current state. */
2979	position_set group; /* Positions that match the input char. */
2980	position_set tmp; /* Temporary space for merging sets. */
2981	state_num state; /* New state. */
2982	state_num state_newline; /* New state on a newline transition. */
2983	state_num state_letter; /* New state on a letter transition. */
2984
2985	#ifdef DEBUG
2986	fprintf (stderr, "build state %td\n", s);
2987	#endif
2988
2989	/* A pointer to the new transition table, and the table itself. */
2990	state_num **ptrans = (accepting (s, d) ? d->fails : d->trans) + s;
2991	state_num trans = ptrans;
2992
2993	if (!trans)
2994	{
2995	/* MAX_TRCOUNT is an arbitrary upper limit on the number of
2996	transition tables that can exist at once, other than for
2997	initial states. Often-used transition tables are quickly
2998	rebuilt, whereas rarely-used ones are cleared away. */
2999	if (MAX_TRCOUNT <= d->trcount)
3000	{
3001	for (state_num i = d->min_trcount; i < d->tralloc; i++)
3002	{
3003	free (d->trans[i]);
3004	free (d->fails[i]);
3005	d->trans[i] = d->fails[i] = NULL;
3006	}
3007	d->trcount = 0;
3008	}
3009
3010	d->trcount++;
3011	ptrans = trans = xmalloc (NOTCHAR sizeof *trans);
3012
3013	/* Fill transition table with a default value which means that the
3014	transited state has not been calculated yet. */
3015	for (int i = 0; i < NOTCHAR; i++)
3016	trans[i] = -2;
3017	}
3018
3019	/* Set up the success bits for this state. */
3020	d->success[s] = 0;
3021	if (accepts_in_context (d->states[s].context, CTX_NEWLINE, s, d))
3022	d->success[s] \|= CTX_NEWLINE;
3023	if (accepts_in_context (d->states[s].context, CTX_LETTER, s, d))
3024	d->success[s] \|= CTX_LETTER;
3025	if (accepts_in_context (d->states[s].context, CTX_NONE, s, d))
3026	d->success[s] \|= CTX_NONE;
3027
3028	alloc_position_set (&follows, d->nleaves);
3029
3030	/* Find the union of the follows of the positions of the group.
3031	This is a hideously inefficient loop. Fix it someday. */
3032	for (idx_t j = 0; j < d->states[s].elems.nelem; j++)
3033	for (idx_t k = 0;
3034	k < d->follows[d->states[s].elems.elems[j].index].nelem; ++k)
3035	insert (d->follows[d->states[s].elems.elems[j].index].elems[k],
3036	&follows);
3037
3038	/* Positions that match the input char. */
3039	alloc_position_set (&group, d->nleaves);
3040
3041	/* The group's label. */
3042	charclass label;
3043	fillset (&label);
3044
3045	for (idx_t i = 0; i < follows.nelem; i++)
3046	{
3047	charclass matches; /* Set of matching characters. */
3048	position pos = follows.elems[i];
3049	bool matched = false;
3050	if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
3051	{
3052	zeroset (&matches);
3053	setbit (d->tokens[pos.index], &matches);
3054	if (d->tokens[pos.index] == uc)
3055	matched = true;
3056	}
3057	else if (d->tokens[pos.index] >= CSET)
3058	{
3059	matches = d->charclasses[d->tokens[pos.index] - CSET];
3060	if (tstbit (uc, &matches))
3061	matched = true;
3062	}
3063	else if (d->tokens[pos.index] == ANYCHAR)
3064	{
3065	matches = d->charclasses[d->canychar];
3066	if (tstbit (uc, &matches))
3067	matched = true;
3068
3069	/* ANYCHAR must match with a single character, so we must put
3070	it to D->states[s].mbps which contains the positions which
3071	can match with a single character not a byte. If all
3072	positions which has ANYCHAR does not depend on context of
3073	next character, we put the follows instead of it to
3074	D->states[s].mbps to optimize. */
3075	if (succeeds_in_context (pos.constraint, d->states[s].context,
3076	CTX_NONE))
3077	{
3078	if (d->states[s].mbps.nelem == 0)
3079	alloc_position_set (&d->states[s].mbps, 1);
3080	insert (pos, &d->states[s].mbps);
3081	}
3082	}
3083	else
3084	continue;
3085
3086	/* Some characters may need to be eliminated from matches because
3087	they fail in the current context. */
3088	if (pos.constraint != NO_CONSTRAINT)
3089	{
3090	if (!succeeds_in_context (pos.constraint,
3091	d->states[s].context, CTX_NEWLINE))
3092	for (int j = 0; j < CHARCLASS_WORDS; j++)
3093	matches.w[j] &= ~d->syntax.newline.w[j];
3094	if (!succeeds_in_context (pos.constraint,
3095	d->states[s].context, CTX_LETTER))
3096	for (int j = 0; j < CHARCLASS_WORDS; ++j)
3097	matches.w[j] &= ~d->syntax.letters.w[j];
3098	if (!succeeds_in_context (pos.constraint,
3099	d->states[s].context, CTX_NONE))
3100	for (int j = 0; j < CHARCLASS_WORDS; ++j)
3101	matches.w[j] &= d->syntax.letters.w[j] \| d->syntax.newline.w[j];
3102
3103	/* If there are no characters left, there's no point in going on. */
3104	if (emptyset (&matches))
3105	continue;
3106
3107	/* If we have reset the bit that made us declare "matched", reset
3108	that indicator, too. This is required to avoid an infinite loop
3109	with this command: echo cx \| LC_ALL=C grep -E 'c\b[x ]' */
3110	if (!tstbit (uc, &matches))
3111	matched = false;
3112	}
3113
3114	#ifdef DEBUG
3115	fprintf (stderr, " nextpos %td:", pos.index);
3116	prtok (d->tokens[pos.index]);
3117	fprintf (stderr, " of");
3118	for (unsigned j = 0; j < NOTCHAR; j++)
3119	if (tstbit (j, &matches))
3120	fprintf (stderr, " 0x%02x", j);
3121	fprintf (stderr, "\n");
3122	#endif
3123
3124	if (matched)
3125	{
3126	for (int k = 0; k < CHARCLASS_WORDS; ++k)
3127	label.w[k] &= matches.w[k];
3128	append (pos, &group);
3129	}
3130	else
3131	{
3132	for (int k = 0; k < CHARCLASS_WORDS; ++k)
3133	label.w[k] &= ~matches.w[k];
3134	}
3135	}
3136
3137	alloc_position_set (&tmp, d->nleaves);
3138
3139	if (group.nelem > 0)
3140	{
3141	/* If we are building a searching matcher, throw in the positions
3142	of state 0 as well, if possible. */
3143	if (d->searchflag)
3144	{
3145	/* If a token in follows.elems is not 1st byte of a multibyte
3146	character, or the states of follows must accept the bytes
3147	which are not 1st byte of the multibyte character.
3148	Then, if a state of follows encounters a byte, it must not be
3149	a 1st byte of a multibyte character nor a single byte character.
3150	In this case, do not add state[0].follows to next state, because
3151	state[0] must accept 1st-byte.
3152
3153	For example, suppose <sb a> is a certain single byte character,
3154	<mb A> is a certain multibyte character, and the codepoint of
3155	<sb a> equals the 2nd byte of the codepoint of <mb A>. When
3156	state[0] accepts <sb a>, state[i] transits to state[i+1] by
3157	accepting the 1st byte of <mb A>, and state[i+1] accepts the
3158	2nd byte of <mb A>, if state[i+1] encounters the codepoint of
3159	<sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do
3160	not add state[0]. */
3161
3162	bool mergeit = !d->localeinfo.multibyte;
3163	if (!mergeit)
3164	{
3165	mergeit = true;
3166	for (idx_t j = 0; mergeit && j < group.nelem; j++)
3167	mergeit &= d->multibyte_prop[group.elems[j].index];
3168	}
3169	if (mergeit)
3170	merge2 (&group, &d->states[0].elems, &tmp);
3171	}
3172
3173	/* Find out if the new state will want any context information,
3174	by calculating possible contexts that the group can match,
3175	and separate contexts that the new state wants to know. */
3176	int possible_contexts = charclass_context (d, &label);
3177	int separate_contexts = state_separate_contexts (d, &group);
3178
3179	/* Find the state(s) corresponding to the union of the follows. */
3180	if (possible_contexts & ~separate_contexts)
3181	state = state_index (d, &group, separate_contexts ^ CTX_ANY);
3182	else
3183	state = -1;
3184	if (separate_contexts & possible_contexts & CTX_NEWLINE)
3185	state_newline = state_index (d, &group, CTX_NEWLINE);
3186	else
3187	state_newline = state;
3188	if (separate_contexts & possible_contexts & CTX_LETTER)
3189	state_letter = state_index (d, &group, CTX_LETTER);
3190	else
3191	state_letter = state;
3192
3193	/* Reallocate now, to reallocate any newline transition properly. */
3194	realloc_trans_if_necessary (d);
3195	}
3196
3197	/* If we are a searching matcher, the default transition is to a state
3198	containing the positions of state 0, otherwise the default transition
3199	is to fail miserably. */
3200	else if (d->searchflag)
3201	{
3202	state_newline = 0;
3203	state_letter = d->min_trcount - 1;
3204	state = d->initstate_notbol;
3205	}
3206	else
3207	{
3208	state_newline = -1;
3209	state_letter = -1;
3210	state = -1;
3211	}
3212
3213	/* Set the transitions for each character in the label. */
3214	for (int i = 0; i < NOTCHAR; i++)
3215	if (tstbit (i, &label))
3216	switch (d->syntax.sbit[i])
3217	{
3218	case CTX_NEWLINE:
3219	trans[i] = state_newline;
3220	break;
3221	case CTX_LETTER:
3222	trans[i] = state_letter;
3223	break;
3224	default:
3225	trans[i] = state;
3226	break;
3227	}
3228
3229	#ifdef DEBUG
3230	fprintf (stderr, "trans table %td", s);
3231	for (int i = 0; i < NOTCHAR; ++i)
3232	{
3233	if (!(i & 0xf))
3234	fprintf (stderr, "\n");
3235	fprintf (stderr, " %2td", trans[i]);
3236	}
3237	fprintf (stderr, "\n");
3238	#endif
3239
3240	free (group.elems);
3241	free (follows.elems);
3242	free (tmp.elems);
3243
3244	/* Keep the newline transition in a special place so we can use it as
3245	a sentinel. */
3246	if (tstbit (d->syntax.eolbyte, &label))
3247	{
3248	d->newlines[s] = trans[d->syntax.eolbyte];
3249	trans[d->syntax.eolbyte] = -1;
3250	}
3251
3252	return trans[uc];
3253	}
3254
3255	/* Multibyte character handling sub-routines for dfaexec. */
3256
3257	/* Consume a single byte and transit state from 's' to '*next_state'.
3258	This function is almost same as the state transition routin in dfaexec.
3259	But state transition is done just once, otherwise matching succeed or
3260	reach the end of the buffer. */
3261	static state_num
3262	transit_state_singlebyte (struct dfa d, state_num s, unsigned char const *pp)
3263	{
3264	state_num *t;
3265
3266	if (d->trans[s])
3267	t = d->trans[s];
3268	else if (d->fails[s])
3269	t = d->fails[s];
3270	else
3271	{
3272	build_state (s, d, **pp);
3273	if (d->trans[s])
3274	t = d->trans[s];
3275	else
3276	{
3277	t = d->fails[s];
3278	assert (t);
3279	}
3280	}
3281
3282	if (t[**pp] == -2)
3283	build_state (s, d, **pp);
3284
3285	return t[(pp)++];
3286	}
3287
3288	/* Transit state from s, then return new state and update the pointer of
3289	the buffer. This function is for a period operator which can match a
3290	multi-byte character. */
3291	static state_num
3292	transit_state (struct dfa d, state_num s, unsigned char const *pp,
3293	unsigned char const *end)
3294	{
3295	wint_t wc;
3296
3297	int mbclen = mbs_to_wchar (&wc, (char const ) pp, end - *pp, d);
3298
3299	/* This state has some operators which can match a multibyte character. */
3300	d->mb_follows.nelem = 0;
3301
3302	/* Calculate the state which can be reached from the state 's' by
3303	consuming 'mbclen' single bytes from the buffer. */
3304	state_num s1 = s;
3305	int mbci;
3306	for (mbci = 0; mbci < mbclen && (mbci == 0 \|\| d->min_trcount <= s); mbci++)
3307	s = transit_state_singlebyte (d, s, pp);
3308	*pp += mbclen - mbci;
3309
3310	if (wc == WEOF)
3311	{
3312	/* It is an invalid character, so ANYCHAR is not accepted. */
3313	return s;
3314	}
3315
3316	/* If all positions which have ANYCHAR do not depend on the context
3317	of the next character, calculate the next state with
3318	pre-calculated follows and cache the result. */
3319	if (d->states[s1].mb_trindex < 0)
3320	{
3321	if (MAX_TRCOUNT <= d->mb_trcount)
3322	{
3323	state_num s3;
3324	for (s3 = -1; s3 < d->tralloc; s3++)
3325	{
3326	free (d->mb_trans[s3]);
3327	d->mb_trans[s3] = NULL;
3328	}
3329
3330	for (state_num i = 0; i < d->sindex; i++)
3331	d->states[i].mb_trindex = -1;
3332	d->mb_trcount = 0;
3333	}
3334	d->states[s1].mb_trindex = d->mb_trcount++;
3335	}
3336
3337	if (! d->mb_trans[s])
3338	{
3339	enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
3340	enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
3341	d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
3342	for (int i = 0; i < MAX_TRCOUNT; i++)
3343	d->mb_trans[s][i] = -1;
3344	}
3345	else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
3346	return d->mb_trans[s][d->states[s1].mb_trindex];
3347
3348	if (s == -1)
3349	copy (&d->states[s1].mbps, &d->mb_follows);
3350	else
3351	merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
3352
3353	int separate_contexts = state_separate_contexts (d, &d->mb_follows);
3354	state_num s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
3355	realloc_trans_if_necessary (d);
3356
3357	d->mb_trans[s][d->states[s1].mb_trindex] = s2;
3358
3359	return s2;
3360	}
3361
3362	/* The initial state may encounter a byte which is not a single byte character
3363	nor the first byte of a multibyte character. But it is incorrect for the
3364	initial state to accept such a byte. For example, in Shift JIS the regular
3365	expression "\\" accepts the codepoint 0x5c, but should not accept the second
3366	byte of the codepoint 0x815c. Then the initial state must skip the bytes
3367	that are not a single byte character nor the first byte of a multibyte
3368	character.
3369
3370	Given DFA state d, use mbs_to_wchar to advance MBP until it reaches
3371	or exceeds P, and return the advanced MBP. If WCP is non-NULL and
3372	the result is greater than P, set *WCP to the final wide character
3373	processed, or to WEOF if no wide character is processed. Otherwise,
3374	if WCP is non-NULL, *WCP may or may not be updated.
3375
3376	Both P and MBP must be no larger than END. */
3377	static unsigned char const *
3378	skip_remains_mb (struct dfa d, unsigned char const p,
3379	unsigned char const mbp, char const end)
3380	{
3381	if (d->syntax.never_trail[*p])
3382	return p;
3383	while (mbp < p)
3384	{
3385	wint_t wc;
3386	mbp += mbs_to_wchar (&wc, (char const *) mbp,
3387	end - (char const *) mbp, d);
3388	}
3389	return mbp;
3390	}
3391
3392	/* Search through a buffer looking for a match to the struct dfa *D.
3393	Find the first occurrence of a string matching the regexp in the
3394	buffer, and the shortest possible version thereof. Return a pointer to
3395	the first character after the match, or NULL if none is found. BEGIN
3396	points to the beginning of the buffer, and END points to the first byte
3397	after its end. Note however that we store a sentinel byte (usually
3398	newline) in *END, so the actual buffer must be one byte longer.
3399	When ALLOW_NL, newlines may appear in the matching string.
3400	If COUNT is non-NULL, increment *COUNT once for each newline processed.
3401	If MULTIBYTE, the input consists of multibyte characters and/or
3402	encoding-error bytes. Otherwise, it consists of single-byte characters.
3403	Here is the list of features that make this DFA matcher punt:
3404	- [M-N] range in non-simple locale: regex is up to 25% faster on [a-z]
3405	- [^...] in non-simple locale
3406	- [[=foo=]] or [[.foo.]]
3407	- [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
3408	- back-reference: (.)\1
3409	- word-delimiter in multibyte locale: \<, \>, \b, \B
3410	See struct localeinfo.simple for the definition of "simple locale". */
3411
3412	static inline char *
3413	dfaexec_main (struct dfa d, char const begin, char *end, bool allow_nl,
3414	idx_t *count, bool multibyte)
3415	{
3416	if (MAX_TRCOUNT <= d->sindex)
3417	{
3418	for (state_num s = d->min_trcount; s < d->sindex; s++)
3419	{
3420	free (d->states[s].elems.elems);
3421	free (d->states[s].mbps.elems);
3422	}
3423	d->sindex = d->min_trcount;
3424
3425	if (d->trans)
3426	{
3427	for (state_num s = 0; s < d->tralloc; s++)
3428	{
3429	free (d->trans[s]);
3430	free (d->fails[s]);
3431	d->trans[s] = d->fails[s] = NULL;
3432	}
3433	d->trcount = 0;
3434	}
3435
3436	if (d->localeinfo.multibyte && d->mb_trans)
3437	{
3438	for (state_num s = -1; s < d->tralloc; s++)
3439	{
3440	free (d->mb_trans[s]);
3441	d->mb_trans[s] = NULL;
3442	}
3443	for (state_num s = 0; s < d->min_trcount; s++)
3444	d->states[s].mb_trindex = -1;
3445	d->mb_trcount = 0;
3446	}
3447	}
3448
3449	if (!d->tralloc)
3450	realloc_trans_if_necessary (d);
3451
3452	/* Current state. */
3453	state_num s = 0, s1 = 0;
3454
3455	/* Current input character. */
3456	unsigned char const p = (unsigned char const ) begin;
3457	unsigned char const *mbp = p;
3458
3459	/* Copy of d->trans so it can be optimized into a register. */
3460	state_num **trans = d->trans;
3461	unsigned char eol = d->syntax.eolbyte; /* Likewise for eolbyte. */
3462	unsigned char saved_end = (unsigned char ) end;
3463	*end = eol;
3464
3465	if (multibyte)
3466	{
3467	memset (&d->mbs, 0, sizeof d->mbs);
3468	if (d->mb_follows.alloc == 0)
3469	alloc_position_set (&d->mb_follows, d->nleaves);
3470	}
3471
3472	idx_t nlcount = 0;
3473	for (;;)
3474	{
3475	state_num *t;
3476	while ((t = trans[s]) != NULL)
3477	{
3478	if (s < d->min_trcount)
3479	{
3480	if (!multibyte \|\| d->states[s].mbps.nelem == 0)
3481	{
3482	while (t[*p] == s)
3483	p++;
3484	}
3485	if (multibyte)
3486	p = mbp = skip_remains_mb (d, p, mbp, end);
3487	}
3488
3489	if (multibyte)
3490	{
3491	s1 = s;
3492
3493	if (d->states[s].mbps.nelem == 0
3494	\|\| d->localeinfo.sbctowc[p] != WEOF \|\| (char ) p >= end)
3495	{
3496	/* If an input character does not match ANYCHAR, do it
3497	like a single-byte character. */
3498	s = t[*p++];
3499	}
3500	else
3501	{
3502	s = transit_state (d, s, &p, (unsigned char *) end);
3503	mbp = p;
3504	trans = d->trans;
3505	}
3506	}
3507	else
3508	{
3509	s1 = t[*p++];
3510	t = trans[s1];
3511	if (! t)
3512	{
3513	state_num tmp = s;
3514	s = s1;
3515	s1 = tmp; /* swap */
3516	break;
3517	}
3518	if (s < d->min_trcount)
3519	{
3520	while (t[*p] == s1)
3521	p++;
3522	}
3523	s = t[*p++];
3524	}
3525	}
3526
3527	if (s < 0)
3528	{
3529	if (s == -2)
3530	{
3531	s = build_state (s1, d, p[-1]);
3532	trans = d->trans;
3533	}
3534	else if ((char *) p <= end && p[-1] == eol && 0 <= d->newlines[s1])
3535	{
3536	/* The previous character was a newline. Count it, and skip
3537	checking of multibyte character boundary until here. */
3538	nlcount++;
3539	mbp = p;
3540
3541	s = (allow_nl ? d->newlines[s1]
3542	: d->syntax.sbit[eol] == CTX_NEWLINE ? 0
3543	: d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
3544	: d->initstate_notbol);
3545	}
3546	else
3547	{
3548	p = NULL;
3549	goto done;
3550	}
3551	}
3552	else if (d->fails[s])
3553	{
3554	if ((d->success[s] & d->syntax.sbit[*p])
3555	\|\| ((char *) p == end
3556	&& accepts_in_context (d->states[s].context, CTX_NEWLINE, s,
3557	d)))
3558	goto done;
3559
3560	if (multibyte && s < d->min_trcount)
3561	p = mbp = skip_remains_mb (d, p, mbp, end);
3562
3563	s1 = s;
3564	if (!multibyte \|\| d->states[s].mbps.nelem == 0
3565	\|\| d->localeinfo.sbctowc[p] != WEOF \|\| (char ) p >= end)
3566	{
3567	/* If a input character does not match ANYCHAR, do it
3568	like a single-byte character. */
3569	s = d->fails[s][*p++];
3570	}
3571	else
3572	{
3573	s = transit_state (d, s, &p, (unsigned char *) end);
3574	mbp = p;
3575	trans = d->trans;
3576	}
3577	}
3578	else
3579	{
3580	build_state (s, d, p[0]);
3581	trans = d->trans;
3582	}
3583	}
3584
3585	done:
3586	if (count)
3587	*count += nlcount;
3588	*end = saved_end;
3589	return (char *) p;
3590	}
3591
3592	/* Specialized versions of dfaexec for multibyte and single-byte cases.
3593	This is for performance, as dfaexec_main is an inline function. */
3594
3595	static char *
3596	dfaexec_mb (struct dfa d, char const begin, char *end,
3597	bool allow_nl, idx_t count, bool backref)
3598	{
3599	return dfaexec_main (d, begin, end, allow_nl, count, true);
3600	}
3601
3602	static char *
3603	dfaexec_sb (struct dfa d, char const begin, char *end,
3604	bool allow_nl, idx_t count, bool backref)
3605	{
3606	return dfaexec_main (d, begin, end, allow_nl, count, false);
3607	}
3608
3609	/* Always set *BACKREF and return BEGIN. Use this wrapper for
3610	any regexp that uses a construct not supported by this code. */
3611	static char *
3612	dfaexec_noop (struct dfa d, char const begin, char *end,
3613	bool allow_nl, idx_t count, bool backref)
3614	{
3615	*backref = true;
3616	return (char *) begin;
3617	}
3618
3619	/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
3620	but faster and set *BACKREF if the DFA code does not support this
3621	regexp usage. */
3622
3623	char *
3624	dfaexec (struct dfa d, char const begin, char *end,
3625	bool allow_nl, idx_t count, bool backref)
3626	{
3627	return d->dfaexec (d, begin, end, allow_nl, count, backref);
3628	}
3629
3630	struct dfa *
3631	dfasuperset (struct dfa const *d)
3632	{
3633	return d->superset;
3634	}
3635
3636	bool
3637	dfaisfast (struct dfa const *d)
3638	{
3639	return d->fast;
3640	}
3641
3642	static void
3643	free_mbdata (struct dfa *d)
3644	{
3645	free (d->multibyte_prop);
3646	free (d->lex.brack.chars);
3647	free (d->mb_follows.elems);
3648
3649	if (d->mb_trans)
3650	{
3651	state_num s;
3652	for (s = -1; s < d->tralloc; s++)
3653	free (d->mb_trans[s]);
3654	free (d->mb_trans - 2);
3655	}
3656	}
3657
3658	/* Return true if every construct in D is supported by this DFA matcher. */
3659	bool
3660	dfasupported (struct dfa const *d)
3661	{
3662	for (idx_t i = 0; i < d->tindex; i++)
3663	{
3664	switch (d->tokens[i])
3665	{
3666	case BEGWORD:
3667	case ENDWORD:
3668	case LIMWORD:
3669	case NOTLIMWORD:
3670	if (!d->localeinfo.multibyte)
3671	continue;
3672	FALLTHROUGH;
3673	case BACKREF:
3674	case MBCSET:
3675	return false;
3676	}
3677	}
3678	return true;
3679	}
3680
3681	/* Disable use of the superset DFA if it is not likely to help
3682	performance. */
3683	static void
3684	maybe_disable_superset_dfa (struct dfa *d)
3685	{
3686	if (!d->localeinfo.using_utf8)
3687	return;
3688
3689	bool have_backref = false;
3690	for (idx_t i = 0; i < d->tindex; i++)
3691	{
3692	switch (d->tokens[i])
3693	{
3694	case ANYCHAR:
3695	/* Lowered. */
3696	abort ();
3697	case BACKREF:
3698	have_backref = true;
3699	break;
3700	case MBCSET:
3701	/* Requires multi-byte algorithm. */
3702	return;
3703	default:
3704	break;
3705	}
3706	}
3707
3708	if (!have_backref && d->superset)
3709	{
3710	/* The superset DFA is not likely to be much faster, so remove it. */
3711	dfafree (d->superset);
3712	free (d->superset);
3713	d->superset = NULL;
3714	}
3715
3716	free_mbdata (d);
3717	d->localeinfo.multibyte = false;
3718	d->dfaexec = dfaexec_sb;
3719	d->fast = true;
3720	}
3721
3722	static void
3723	dfassbuild (struct dfa *d)
3724	{
3725	struct dfa *sup = dfaalloc ();
3726
3727	sup = d;
3728	sup->localeinfo.multibyte = false;
3729	sup->dfaexec = dfaexec_sb;
3730	sup->multibyte_prop = NULL;
3731	sup->superset = NULL;
3732	sup->states = NULL;
3733	sup->sindex = 0;
3734	sup->constraints = NULL;
3735	sup->separates = NULL;
3736	sup->follows = NULL;
3737	sup->tralloc = 0;
3738	sup->trans = NULL;
3739	sup->fails = NULL;
3740	sup->success = NULL;
3741	sup->newlines = NULL;
3742
3743	sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
3744	if (d->cindex)
3745	{
3746	memcpy (sup->charclasses, d->charclasses,
3747	d->cindex * sizeof *sup->charclasses);
3748	}
3749
3750	sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
3751	sup->talloc = d->tindex * 2;
3752
3753	bool have_achar = false;
3754	bool have_nchar = false;
3755	idx_t j;
3756	for (idx_t i = j = 0; i < d->tindex; i++)
3757	{
3758	switch (d->tokens[i])
3759	{
3760	case ANYCHAR:
3761	case MBCSET:
3762	case BACKREF:
3763	{
3764	charclass ccl;
3765	fillset (&ccl);
3766	sup->tokens[j++] = CSET + charclass_index (sup, &ccl);
3767	sup->tokens[j++] = STAR;
3768	if (d->tokens[i + 1] == QMARK \|\| d->tokens[i + 1] == STAR
3769	\|\| d->tokens[i + 1] == PLUS)
3770	i++;
3771	have_achar = true;
3772	}
3773	break;
3774	case BEGWORD:
3775	case ENDWORD:
3776	case LIMWORD:
3777	case NOTLIMWORD:
3778	if (d->localeinfo.multibyte)
3779	{
3780	/* These constraints aren't supported in a multibyte locale.
3781	Ignore them in the superset DFA. */
3782	sup->tokens[j++] = EMPTY;
3783	break;
3784	}
3785	FALLTHROUGH;
3786	default:
3787	sup->tokens[j++] = d->tokens[i];
3788	if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
3789	\|\| d->tokens[i] >= CSET)
3790	have_nchar = true;
3791	break;
3792	}
3793	}
3794	sup->tindex = j;
3795
3796	if (have_nchar && (have_achar \|\| d->localeinfo.multibyte))
3797	d->superset = sup;
3798	else
3799	{
3800	dfafree (sup);
3801	free (sup);
3802	}
3803	}
3804
3805	/* Parse a string S of length LEN into D (but skip this step if S is null).
3806	Then analyze D and build a matcher for it.
3807	SEARCHFLAG says whether to build a searching or an exact matcher. */
3808	void
3809	dfacomp (char const s, idx_t len, struct dfa d, bool searchflag)
3810	{
3811	if (s != NULL)
3812	dfaparse (s, len, d);
3813
3814	dfassbuild (d);
3815
3816	if (dfasupported (d))
3817	{
3818	maybe_disable_superset_dfa (d);
3819	dfaanalyze (d, searchflag);
3820	}
3821	else
3822	{
3823	d->dfaexec = dfaexec_noop;
3824	}
3825
3826	if (d->superset)
3827	{
3828	d->fast = true;
3829	dfaanalyze (d->superset, searchflag);
3830	}
3831	}
3832
3833	/* Free the storage held by the components of a dfa. */
3834	void
3835	dfafree (struct dfa *d)
3836	{
3837	free (d->charclasses);
3838	free (d->tokens);
3839
3840	if (d->localeinfo.multibyte)
3841	free_mbdata (d);
3842
3843	free (d->constraints);
3844	free (d->separates);
3845
3846	for (idx_t i = 0; i < d->sindex; i++)
3847	{
3848	free (d->states[i].elems.elems);
3849	free (d->states[i].mbps.elems);
3850	}
3851	free (d->states);
3852
3853	if (d->follows)
3854	{
3855	for (idx_t i = 0; i < d->tindex; i++)
3856	free (d->follows[i].elems);
3857	free (d->follows);
3858	}
3859
3860	if (d->trans)
3861	{
3862	for (idx_t i = 0; i < d->tralloc; i++)
3863	{
3864	free (d->trans[i]);
3865	free (d->fails[i]);
3866	}
3867
3868	free (d->trans - 2);
3869	free (d->fails);
3870	free (d->newlines);
3871	free (d->success);
3872	}
3873
3874	if (d->superset)
3875	{
3876	dfafree (d->superset);
3877	free (d->superset);
3878	}
3879	}
3880
3881	/* Having found the postfix representation of the regular expression,
3882	try to find a long sequence of characters that must appear in any line
3883	containing the r.e.
3884	Finding a "longest" sequence is beyond the scope here;
3885	we take an easy way out and hope for the best.
3886	(Take "(ab\|a)b"--please.)
3887
3888	We do a bottom-up calculation of sequences of characters that must appear
3889	in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3890	representation:
3891	sequences that must appear at the left of the match ("left")
3892	sequences that must appear at the right of the match ("right")
3893	lists of sequences that must appear somewhere in the match ("in")
3894	sequences that must constitute the match ("is")
3895
3896	When we get to the root of the tree, we use one of the longest of its
3897	calculated "in" sequences as our answer.
3898
3899	The sequences calculated for the various types of node (in pseudo ANSI c)
3900	are shown below. "p" is the operand of unary operators (and the left-hand
3901	operand of binary operators); "q" is the right-hand operand of binary
3902	operators.
3903
3904	"ZERO" means "a zero-length sequence" below.
3905
3906	Type left right is in
3907	---- ---- ----- -- --
3908	char c # c # c # c # c
3909
3910	ANYCHAR ZERO ZERO ZERO ZERO
3911
3912	MBCSET ZERO ZERO ZERO ZERO
3913
3914	CSET ZERO ZERO ZERO ZERO
3915
3916	STAR ZERO ZERO ZERO ZERO
3917
3918	QMARK ZERO ZERO ZERO ZERO
3919
3920	PLUS p->left p->right ZERO p->in
3921
3922	CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
3923	p->left : q->right : q->is!=ZERO) ? q->in plus
3924	p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
3925	ZERO
3926
3927	OR longest common longest common (do p->is and substrings common
3928	leading trailing to q->is have same p->in and
3929	(sub)sequence (sub)sequence q->in length and content) ?
3930	of p->left of p->right
3931	and q->left and q->right p->is : NULL
3932
3933	If there's anything else we recognize in the tree, all four sequences get set
3934	to zero-length sequences. If there's something we don't recognize in the
3935	tree, we just return a zero-length sequence.
3936
3937	Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3938	'aaa')?
3939
3940	And ... is it here or someplace that we might ponder "optimizations" such as
3941	egrep 'psi\|epsilon' -> egrep 'psi'
3942	egrep 'pepsi\|epsilon' -> egrep 'epsi'
3943	(Yes, we now find "epsi" as a "string
3944	that must occur", but we might also
3945	simplify the entire r.e. being sought)
3946	grep '[c]' -> grep 'c'
3947	grep '(ab\|a)b' -> grep 'ab'
3948	grep 'ab*' -> grep 'a'
3949	grep 'a*b' -> grep 'b'
3950
3951	There are several issues:
3952
3953	Is optimization easy (enough)?
3954
3955	Does optimization actually accomplish anything,
3956	or is the automaton you get from "psi\|epsilon" (for example)
3957	the same as the one you get from "psi" (for example)?
3958
3959	Are optimizable r.e.'s likely to be used in real-life situations
3960	(something like 'ab*' is probably unlikely; something like is
3961	'psi\|epsilon' is likelier)? */
3962
3963	static char *
3964	icatalloc (char old, char const new)
3965	{
3966	idx_t newsize = strlen (new);
3967	if (newsize == 0)
3968	return old;
3969	idx_t oldsize = strlen (old);
3970	char *result = xirealloc (old, oldsize + newsize + 1);
3971	memcpy (result + oldsize, new, newsize + 1);
3972	return result;
3973	}
3974
3975	static void
3976	freelist (char **cpp)
3977	{
3978	while (*cpp)
3979	free (*cpp++);
3980	}
3981
3982	static char **
3983	enlistnew (char *cpp, char new)
3984	{
3985	/* Is there already something in the list that's new (or longer)? */
3986	idx_t i;
3987	for (i = 0; cpp[i] != NULL; i++)
3988	if (strstr (cpp[i], new) != NULL)
3989	{
3990	free (new);
3991	return cpp;
3992	}
3993	/* Eliminate any obsoleted strings. */
3994	for (idx_t j = 0; cpp[j] != NULL; )
3995	if (strstr (new, cpp[j]) == NULL)
3996	++j;
3997	else
3998	{
3999	free (cpp[j]);
4000	if (--i == j)
4001	break;
4002	cpp[j] = cpp[i];
4003	cpp[i] = NULL;
4004	}
4005	/* Add the new string. */
4006	cpp = xreallocarray (cpp, i + 2, sizeof *cpp);
4007	cpp[i] = new;
4008	cpp[i + 1] = NULL;
4009	return cpp;
4010	}
4011
4012	static char **
4013	enlist (char *cpp, char const str, idx_t len)
4014	{
4015	return enlistnew (cpp, ximemdup0 (str, len));
4016	}
4017
4018	/* Given pointers to two strings, return a pointer to an allocated
4019	list of their distinct common substrings. */
4020	static char **
4021	comsubs (char left, char const right)
4022	{
4023	char *cpp = xzalloc (sizeof cpp);
4024
4025	for (char lcp = left; lcp != '\0'; lcp++)
4026	{
4027	idx_t len = 0;
4028	char rcp = strchr (right, lcp);
4029	while (rcp != NULL)
4030	{
4031	idx_t i;
4032	for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
4033	continue;
4034	if (i > len)
4035	len = i;
4036	rcp = strchr (rcp + 1, *lcp);
4037	}
4038	if (len != 0)
4039	cpp = enlist (cpp, lcp, len);
4040	}
4041	return cpp;
4042	}
4043
4044	static char **
4045	addlists (char old, char new)
4046	{
4047	for (; *new; new++)
4048	old = enlistnew (old, xstrdup (*new));
4049	return old;
4050	}
4051
4052	/* Given two lists of substrings, return a new list giving substrings
4053	common to both. */
4054	static char **
4055	inboth (char left, char right)
4056	{
4057	char *both = xzalloc (sizeof both);
4058
4059	for (idx_t lnum = 0; left[lnum] != NULL; lnum++)
4060	{
4061	for (idx_t rnum = 0; right[rnum] != NULL; rnum++)
4062	{
4063	char **temp = comsubs (left[lnum], right[rnum]);
4064	both = addlists (both, temp);
4065	freelist (temp);
4066	free (temp);
4067	}
4068	}
4069	return both;
4070	}
4071
4072	typedef struct must must;
4073
4074	struct must
4075	{
4076	char **in;
4077	char *left;
4078	char *right;
4079	char *is;
4080	bool begline;
4081	bool endline;
4082	must *prev;
4083	};
4084
4085	static must *
4086	allocmust (must *mp, idx_t size)
4087	{
4088	must new_mp = xmalloc (sizeof new_mp);
4089	new_mp->in = xzalloc (sizeof *new_mp->in);
4090	new_mp->left = xizalloc (size);
4091	new_mp->right = xizalloc (size);
4092	new_mp->is = xizalloc (size);
4093	new_mp->begline = false;
4094	new_mp->endline = false;
4095	new_mp->prev = mp;
4096	return new_mp;
4097	}
4098
4099	static void
4100	resetmust (must *mp)
4101	{
4102	freelist (mp->in);
4103	mp->in[0] = NULL;
4104	mp->left[0] = mp->right[0] = mp->is[0] = '\0';
4105	mp->begline = false;
4106	mp->endline = false;
4107	}
4108
4109	static void
4110	freemust (must *mp)
4111	{
4112	freelist (mp->in);
4113	free (mp->in);
4114	free (mp->left);
4115	free (mp->right);
4116	free (mp->is);
4117	free (mp);
4118	}
4119
4120	struct dfamust *
4121	dfamust (struct dfa const *d)
4122	{
4123	must *mp = NULL;
4124	char const *result = "";
4125	bool exact = false;
4126	bool begline = false;
4127	bool endline = false;
4128	bool need_begline = false;
4129	bool need_endline = false;
4130	bool case_fold_unibyte = d->syntax.case_fold & !d->localeinfo.multibyte;
4131
4132	for (idx_t ri = 1; ri + 1 < d->tindex; ri++)
4133	{
4134	token t = d->tokens[ri];
4135	switch (t)
4136	{
4137	case BEGLINE:
4138	mp = allocmust (mp, 2);
4139	mp->begline = true;
4140	need_begline = true;
4141	break;
4142	case ENDLINE:
4143	mp = allocmust (mp, 2);
4144	mp->endline = true;
4145	need_endline = true;
4146	break;
4147	case LPAREN:
4148	case RPAREN:
4149	assert (!"neither LPAREN nor RPAREN may appear here");
4150
4151	case EMPTY:
4152	case BEGWORD:
4153	case ENDWORD:
4154	case LIMWORD:
4155	case NOTLIMWORD:
4156	case BACKREF:
4157	case ANYCHAR:
4158	case MBCSET:
4159	mp = allocmust (mp, 2);
4160	break;
4161
4162	case STAR:
4163	case QMARK:
4164	assume_nonnull (mp);
4165	resetmust (mp);
4166	break;
4167
4168	case OR:
4169	{
4170	char **new;
4171	must *rmp = mp;
4172	assume_nonnull (rmp);
4173	must *lmp = mp = mp->prev;
4174	assume_nonnull (lmp);
4175	idx_t j, ln, rn, n;
4176
4177	/* Guaranteed to be. Unlikely, but ... */
4178	if (str_eq (lmp->is, rmp->is))
4179	{
4180	lmp->begline &= rmp->begline;
4181	lmp->endline &= rmp->endline;
4182	}
4183	else
4184	{
4185	lmp->is[0] = '\0';
4186	lmp->begline = false;
4187	lmp->endline = false;
4188	}
4189	/* Left side--easy */
4190	idx_t i = 0;
4191	while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
4192	++i;
4193	lmp->left[i] = '\0';
4194	/* Right side */
4195	ln = strlen (lmp->right);
4196	rn = strlen (rmp->right);
4197	n = ln;
4198	if (n > rn)
4199	n = rn;
4200	for (i = 0; i < n; ++i)
4201	if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
4202	break;
4203	for (j = 0; j < i; ++j)
4204	lmp->right[j] = lmp->right[(ln - i) + j];
4205	lmp->right[j] = '\0';
4206	new = inboth (lmp->in, rmp->in);
4207	freelist (lmp->in);
4208	free (lmp->in);
4209	lmp->in = new;
4210	freemust (rmp);
4211	}
4212	break;
4213
4214	case PLUS:
4215	assume_nonnull (mp);
4216	mp->is[0] = '\0';
4217	break;
4218
4219	case END:
4220	assume_nonnull (mp);
4221	assert (!mp->prev);
4222	for (idx_t i = 0; mp->in[i] != NULL; i++)
4223	if (strlen (mp->in[i]) > strlen (result))
4224	result = mp->in[i];
4225	if (str_eq (result, mp->is))
4226	{
4227	if ((!need_begline \|\| mp->begline) && (!need_endline
4228	\|\| mp->endline))
4229	exact = true;
4230	begline = mp->begline;
4231	endline = mp->endline;
4232	}
4233	goto done;
4234
4235	case CAT:
4236	{
4237	must *rmp = mp;
4238	assume_nonnull (rmp);
4239	must *lmp = mp = mp->prev;
4240	assume_nonnull (lmp);
4241
4242	/* In. Everything in left, plus everything in
4243	right, plus concatenation of
4244	left's right and right's left. */
4245	lmp->in = addlists (lmp->in, rmp->in);
4246	if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
4247	{
4248	idx_t lrlen = strlen (lmp->right);
4249	idx_t rllen = strlen (rmp->left);
4250	char *tp = ximalloc (lrlen + rllen + 1);
4251	memcpy (tp + lrlen, rmp->left, rllen + 1);
4252	memcpy (tp, lmp->right, lrlen);
4253	lmp->in = enlistnew (lmp->in, tp);
4254	}
4255	/* Left-hand */
4256	if (lmp->is[0] != '\0')
4257	lmp->left = icatalloc (lmp->left, rmp->left);
4258	/* Right-hand */
4259	if (rmp->is[0] == '\0')
4260	lmp->right[0] = '\0';
4261	lmp->right = icatalloc (lmp->right, rmp->right);
4262	/* Guaranteed to be */
4263	if ((lmp->is[0] != '\0' \|\| lmp->begline)
4264	&& (rmp->is[0] != '\0' \|\| rmp->endline))
4265	{
4266	lmp->is = icatalloc (lmp->is, rmp->is);
4267	lmp->endline = rmp->endline;
4268	}
4269	else
4270	{
4271	lmp->is[0] = '\0';
4272	lmp->begline = false;
4273	lmp->endline = false;
4274	}
4275	freemust (rmp);
4276	}
4277	break;
4278
4279	case '\0':
4280	/* Not on my shift. */
4281	goto done;
4282
4283	default:
4284	if (CSET <= t)
4285	{
4286	/* If T is a singleton, or if case-folding in a unibyte
4287	locale and T's members all case-fold to the same char,
4288	convert T to one of its members. Otherwise, do
4289	nothing further with T. */
4290	charclass *ccl = &d->charclasses[t - CSET];
4291	int j;
4292	for (j = 0; j < NOTCHAR; j++)
4293	if (tstbit (j, ccl))
4294	break;
4295	if (! (j < NOTCHAR))
4296	{
4297	mp = allocmust (mp, 2);
4298	break;
4299	}
4300	t = j;
4301	while (++j < NOTCHAR)
4302	if (tstbit (j, ccl)
4303	&& ! (case_fold_unibyte
4304	&& toupper (j) == toupper (t)))
4305	break;
4306	if (j < NOTCHAR)
4307	{
4308	mp = allocmust (mp, 2);
4309	break;
4310	}
4311	}
4312
4313	idx_t rj = ri + 2;
4314	if (d->tokens[ri + 1] == CAT)
4315	{
4316	for (; rj < d->tindex - 1; rj += 2)
4317	{
4318	if ((rj != ri && (d->tokens[rj] <= 0
4319	\|\| NOTCHAR <= d->tokens[rj]))
4320	\|\| d->tokens[rj + 1] != CAT)
4321	break;
4322	}
4323	}
4324	mp = allocmust (mp, ((rj - ri) >> 1) + 1);
4325	mp->is[0] = mp->left[0] = mp->right[0]
4326	= case_fold_unibyte ? toupper (t) : t;
4327
4328	idx_t i;
4329	for (i = 1; ri + 2 < rj; i++)
4330	{
4331	ri += 2;
4332	t = d->tokens[ri];
4333	mp->is[i] = mp->left[i] = mp->right[i]
4334	= case_fold_unibyte ? toupper (t) : t;
4335	}
4336	mp->is[i] = mp->left[i] = mp->right[i] = '\0';
4337	mp->in = enlist (mp->in, mp->is, i);
4338	break;
4339	}
4340	}
4341	done:;
4342
4343	struct dfamust *dm = NULL;
4344	if (*result)
4345	{
4346	dm = xmalloc (FLEXSIZEOF (struct dfamust, must, strlen (result) + 1));
4347	dm->exact = exact;
4348	dm->begline = begline;
4349	dm->endline = endline;
4350	strcpy (dm->must, result);
4351	}
4352
4353	while (mp)
4354	{
4355	must *prev = mp->prev;
4356	freemust (mp);
4357	mp = prev;
4358	}
4359
4360	return dm;
4361	}
4362
4363	void
4364	dfamustfree (struct dfamust *dm)
4365	{
4366	free (dm);
4367	}
4368
4369	struct dfa *
4370	dfaalloc (void)
4371	{
4372	return xmalloc (sizeof (struct dfa));
4373	}
4374
4375	/* Initialize DFA. */
4376	void
4377	dfasyntax (struct dfa dfa, struct localeinfo const linfo,
4378	reg_syntax_t bits, int dfaopts)
4379	{
4380	memset (dfa, 0, offsetof (struct dfa, dfaexec));
4381	dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
4382	dfa->localeinfo = *linfo;
4383
4384	dfa->fast = !dfa->localeinfo.multibyte;
4385
4386	dfa->canychar = -1;
4387	dfa->syntax.syntax_bits_set = true;
4388	dfa->syntax.case_fold = (bits & RE_ICASE) != 0;
4389	dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
4390	dfa->syntax.syntax_bits = bits;
4391	dfa->syntax.dfaopts = dfaopts;
4392
4393	for (int i = CHAR_MIN; i <= CHAR_MAX; ++i)
4394	{
4395	unsigned char uc = i;
4396
4397	dfa->syntax.sbit[uc] = char_context (dfa, uc);
4398	switch (dfa->syntax.sbit[uc])
4399	{
4400	case CTX_LETTER:
4401	setbit (uc, &dfa->syntax.letters);
4402	break;
4403	case CTX_NEWLINE:
4404	setbit (uc, &dfa->syntax.newline);
4405	break;
4406	}
4407
4408	/* POSIX requires that the five bytes in "\n\r./" (including the
4409	terminating NUL) cannot occur inside a multibyte character. */
4410	dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
4411	? (uc & 0xc0) != 0x80
4412	: strchr ("\n\r./", uc) != NULL);
4413	}
4414	}
4415
4416	/* Initialize TO by copying FROM's syntax settings. */
4417	void
4418	dfacopysyntax (struct dfa to, struct dfa const from)
4419	{
4420	memset (to, 0, offsetof (struct dfa, syntax));
4421	to->canychar = -1;
4422	to->fast = from->fast;
4423	to->syntax = from->syntax;
4424	to->dfaexec = from->dfaexec;
4425	to->localeinfo = from->localeinfo;
4426	}
4427
4428	/* vim:set shiftwidth=2: */

Note: See TracBrowser for help on using the repository browser.

source: kBuild/vendor/sed/current/lib/dfa.c@ 3611

Download in other formats: