dfa.c@ 3657

Last change on this file since 3657 was 3657, checked in by bird, 6 months ago
sed/dfa.c: Workaround for Visual C++ 2022 (amd64) optimizer bug.
File size: 136.3 KB

Line
1	/* dfa.c - deterministic extended regexp routines for GNU
2	Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2022 Free Software
3	Foundation, Inc.
4
5	This program is free software; you can redistribute it and/or modify
6	it under the terms of the GNU General Public License as published by
7	the Free Software Foundation, either version 3, or (at your option)
8	any later version.
9
10	This program is distributed in the hope that it will be useful,
11	but WITHOUT ANY WARRANTY; without even the implied warranty of
12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	GNU General Public License for more details.
14
15	You should have received a copy of the GNU General Public License
16	along with this program; if not, write to the Free Software
17	Foundation, Inc.,
18	51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */
19
20	/* Written June, 1988 by Mike Haertel
21	Modified July, 1988 by Arthur David Olson to assist BMG speedups */
22
23	#include <config.h>
24
25	#include "dfa.h"
26
27	#include "flexmember.h"
28	#include "idx.h"
29	#include "verify.h"
30
31	#include <assert.h>
32	#include <ctype.h>
33	#include <stdint.h>
34	#include <stdio.h>
35	#include <stdlib.h>
36	#include <limits.h>
37	#include <string.h>
38
/* Tell the compiler that X is non-null.  This quiets
   gcc -Wanalyzer-null-dereference in places where GCC cannot deduce
   that the input comes from a well-formed regular expression.
   'assume' is used rather than 'assert' because the runtime overhead
   of 'assert' buys little when the MMU will check anyway.  */
#define assume_nonnull(x) \
  assume (NULL != (x))
45
/* Return true if the NUL-terminated strings A and B have the same
   contents.  Both arguments must be pointers to strings; they are
   passed straight to strcmp.  */
static bool
str_eq (char const *a, char const *b)
{
  return strcmp (a, b) == 0;
}
51
/* Return true if C lies between the character constants '0' and '9'
   inclusive.  No <ctype.h> function is consulted.  */
static bool
c_isdigit (char c)
{
  if (c < '0')
    return false;
  return c <= '9';
}
57
58	#include "gettext.h"
59	#define _(str) gettext (str)
60
61	#include <wchar.h>
62	#include <wctype.h>
63
64	#include "xalloc.h"
65	#include "localeinfo.h"
66
/* FALLTHROUGH marks an intentional fall through between switch cases.
   Use the standard attribute when the language version provides it,
   the GNU attribute on compilers that support it, and otherwise an
   empty statement.  */
#ifndef FALLTHROUGH
# if 201710L < __STDC_VERSION__
#  define FALLTHROUGH [[__fallthrough__]]
# elif (__GNUC__ >= 7) || (__clang_major__ >= 10)
#  define FALLTHROUGH __attribute__ ((__fallthrough__))
# else
#  define FALLTHROUGH ((void) 0)
# endif
#endif
76
/* MIN (A, B) yields the smaller of A and B.  Either argument may be
   evaluated twice, so avoid arguments with side effects.  */
#if !defined MIN
# define MIN(a,b) \
   ((a) < (b) ? (a) : (b))
#endif
80
/* HPUX defines setbit and clrbit as macros in sys/param.h.  Drop any
   such definitions so the names can be used for functions below;
   #undef of a name that is not a macro is harmless.  */
#undef setbit
#undef clrbit
88
/* For code that does not use Gnulib's isblank module: supply a
   fallback under the name dfa_isblank and make 'isblank' refer to it.  */
#if !defined isblank && !defined HAVE_ISBLANK && !defined GNULIB_ISBLANK
# define isblank dfa_isblank
/* Return nonzero if C is a space or a horizontal tab.  */
static int
isblank (int c)
{
  return c == ' ' || c == '\t';
}
#endif
98
/* The smallest integer that exceeds every character code.  */
enum
{
  NOTCHAR = 1 << CHAR_BIT
};
101
#if defined UINT_LEAST64_MAX

/* How many bits of a charclass word are used.  */
enum { CHARCLASS_WORD_BITS = 64 };

/* One piece of a character class.  The type must be unsigned and at
   least CHARCLASS_WORD_BITS wide; any bits beyond that are zero.  */
typedef uint_least64_t charclass_word;

/* Build the part of a charclass initializer that covers 64 bits,
   given LO and HI, the low-order and high-order 32 bits of the
   64-bit quantity.  */
# define CHARCLASS_PAIR(lo, hi) \
    (((charclass_word) (hi) << 32) + (lo))

#else

/* Fallbacks for pre-C99 hosts that lack 64-bit integers: use 32-bit
   words, so a pair is simply two consecutive initializers.  */
enum { CHARCLASS_WORD_BITS = 32 };
typedef unsigned long charclass_word;
# define CHARCLASS_PAIR(lo, hi) lo, hi

#endif
122
/* Expand to an initializer for a charclass, given its eight 32-bit
   words A through H in order.  */
#define CHARCLASS_INIT(a, b, c, d, e, f, g, h) \
  {{ \
    CHARCLASS_PAIR (a, b), \
    CHARCLASS_PAIR (c, d), \
    CHARCLASS_PAIR (e, f), \
    CHARCLASS_PAIR (g, h) \
  }}
129
130	/* The maximum useful value of a charclass_word; all used bits are 1. */
131	static charclass_word const CHARCLASS_WORD_MASK
132	= ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1;
133
134	/* Number of words required to hold a bit for every character. */
135	enum
136	{
137	CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
138	};
139
140	/* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
141	typedef struct { charclass_word w[CHARCLASS_WORDS]; } charclass;
142
/* Return CH, a possibly-signed character, converted to unsigned char.
   Going through a function is a little safer than a cast to
   unsigned char, because the parameter type catches some type errors
   that a cast would silently accept.  */
static unsigned char
to_uchar (char ch)
{
  unsigned char converted = ch;
  return converted;
}
151
/* A context says whether a character is a newline or a word
   constituent.  Word constituents are the characters that satisfy
   iswalnum, plus '_'.  Every character has exactly one CTX_* value,
   and a bitmask of CTX_* values denotes a class of characters.

   Each state also carries a context, a bitmask of CTX_* values naming
   the set of characters that the state's predecessors must match.
   For instance, a state whose context lacks CTX_LETTER never has a
   transition where the previous character is a word constituent,
   whereas a state whose context is CTX_ANY might have transitions
   from any character.  */

enum
{
  CTX_NONE = 1 << 0,
  CTX_LETTER = 1 << 1,
  CTX_NEWLINE = 1 << 2,
  CTX_ANY = CTX_NONE | CTX_LETTER | CTX_NEWLINE
};
171
172	/* Sometimes characters can only be matched depending on the surrounding
173	context. Such context decisions depend on what the previous character
174	was, and the value of the current (lookahead) character. Context
175	dependent constraints are encoded as 9-bit integers. Each bit that
176	is set indicates that the constraint succeeds in the corresponding
177	context.
178
179	bit 6-8 - valid contexts when next character is CTX_NEWLINE
180	bit 3-5 - valid contexts when next character is CTX_LETTER
181	bit 0-2 - valid contexts when next character is CTX_NONE
182
183	succeeds_in_context determines whether a given constraint
184	succeeds in a particular context. Prev is a bitmask of possible
185	context values for the previous character, curr is the (single-bit)
186	context value for the lookahead character. */
/* Return bits 6-8 of CONSTRAINT, moved down to bits 0-2.  */
static int
newline_constraint (int constraint)
{
  int shifted = constraint >> 6;
  return shifted & 7;
}
/* Return bits 3-5 of CONSTRAINT, moved down to bits 0-2.  */
static int
letter_constraint (int constraint)
{
  int shifted = constraint >> 3;
  return shifted & 7;
}
/* Return bits 0-2 of CONSTRAINT.  */
static int
other_constraint (int constraint)
{
  int const low_three_bits = 7;
  return constraint & low_three_bits;
}
202
203	static bool
204	succeeds_in_context (int constraint, int prev, int curr)
205	{
206	return !! (((curr & CTX_NONE ? other_constraint (constraint) : 0) \
207	\| (curr & CTX_LETTER ? letter_constraint (constraint) : 0) \
208	\| (curr & CTX_NEWLINE ? newline_constraint (constraint) : 0)) \
209	& prev);
210	}
211
212	/* The following describe what a constraint depends on. */
/* Return true if, within any of the three 3-bit fields of CONSTRAINT,
   bit 0 and bit 2 of the field differ.  */
static bool
prev_newline_dependent (int constraint)
{
  int differing = (constraint >> 2) ^ constraint;
  return (differing & 0111) != 0;
}
/* Return true if, within any of the three 3-bit fields of CONSTRAINT,
   bit 0 and bit 1 of the field differ.  */
static bool
prev_letter_dependent (int constraint)
{
  int differing = (constraint >> 1) ^ constraint;
  return (differing & 0111) != 0;
}
223
/* A token that matches the empty string subject to a constraint works
   by applying that constraint to decide what may follow it, taking
   into account what came before.  These are the constraint values
   that correspond to the special tokens; each is a 9-bit value
   written in octal, one octal digit per lookahead context.  */
enum
{
  /* Succeeds in every context.  */
  NO_CONSTRAINT = 0777,

  /* Line anchors.  */
  BEGLINE_CONSTRAINT = 0444,
  ENDLINE_CONSTRAINT = 0700,

  /* Word anchors.  */
  BEGWORD_CONSTRAINT = 0050,
  ENDWORD_CONSTRAINT = 0202,
  LIMWORD_CONSTRAINT = 0252,
  NOTLIMWORD_CONSTRAINT = 0525
};
238
/* Parsing turns the regexp into an array of tokens in postfix form.
   Some tokens are operators and the rest are terminal symbols.  Most
   of these codes, though not all, are returned by the lexical
   analyzer.  */

typedef ptrdiff_t token;

/* The largest value a token can have.  */
static token const TOKEN_MAX = PTRDIFF_MAX;
245
/* A state_num value indexes a state.  Such values are normally
   nonnegative, with -1 reserved as a special value.  */
typedef ptrdiff_t state_num;
249
250	/* Predefined token values. */
251	enum
252	{
253	END = -1, /* END is a terminal symbol that matches the
254	end of input; any value of END or less in
255	the parse tree is such a symbol. Accepting
256	states of the DFA are those that would have
257	a transition on END. This is -1, not some
258	more-negative value, to tweak the speed of
259	comparisons to END. */
260
261	/* Ordinary character values are terminal symbols that match themselves. */
262
263	/* CSET must come last in the following list of special tokens. Otherwise,
264	the list order matters only for performance. Related special tokens
265	should have nearby values so that code like (t == ANYCHAR \|\| t == MBCSET
266	\|\| CSET <= t) can be done with a single machine-level comparison. */
267
268	EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches
269	the empty string. */
270
271	QMARK, /* QMARK is an operator of one argument that
272	matches zero or one occurrences of its
273	argument. */
274
275	STAR, /* STAR is an operator of one argument that
276	matches the Kleene closure (zero or more
277	occurrences) of its argument. */
278
279	PLUS, /* PLUS is an operator of one argument that
280	matches the positive closure (one or more
281	occurrences) of its argument. */
282
283	REPMN, /* REPMN is a lexical token corresponding
284	to the {m,n} construct. REPMN never
285	appears in the compiled token vector. */
286
287	CAT, /* CAT is an operator of two arguments that
288	matches the concatenation of its
289	arguments. CAT is never returned by the
290	lexical analyzer. */
291
292	OR, /* OR is an operator of two arguments that
293	matches either of its arguments. */
294
295	LPAREN, /* LPAREN never appears in the parse tree,
296	it is only a lexeme. */
297
298	RPAREN, /* RPAREN never appears in the parse tree. */
299
300	WCHAR, /* Only returned by lex. wctok contains
301	the wide character representation. */
302
303	ANYCHAR, /* ANYCHAR is a terminal symbol that matches
304	a valid multibyte (or single byte) character.
305	It is used only if MB_CUR_MAX > 1. */
306
307	BEG, /* BEG is an initial symbol that matches the
308	beginning of input. */
309
310	BEGLINE, /* BEGLINE is a terminal symbol that matches
311	the empty string at the beginning of a
312	line. */
313
314	ENDLINE, /* ENDLINE is a terminal symbol that matches
315	the empty string at the end of a line. */
316
317	BEGWORD, /* BEGWORD is a terminal symbol that matches
318	the empty string at the beginning of a
319	word. */
320
321	ENDWORD, /* ENDWORD is a terminal symbol that matches
322	the empty string at the end of a word. */
323
324	LIMWORD, /* LIMWORD is a terminal symbol that matches
325	the empty string at the beginning or the
326	end of a word. */
327
328	NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that
329	matches the empty string not at
330	the beginning or end of a word. */
331
332	BACKREF, /* BACKREF is generated by \<digit>
333	or by any other construct that
334	is not completely handled. If the scanner
335	detects a transition on backref, it returns
336	a kind of "semi-success" indicating that
337	the match will have to be verified with
338	a backtracking matcher. */
339
340	MBCSET, /* MBCSET is similar to CSET, but for
341	multibyte characters. */
342
343	CSET /* CSET and (and any value greater) is a
344	terminal symbol that matches any of a
345	class of characters. */
346	};
347
348
349	/* States of the recognizer correspond to sets of positions in the parse
350	tree, together with the constraints under which they may be matched.
351	So a position is encoded as an index into the parse tree together with
352	a constraint. */
353	typedef struct
354	{
355	idx_t index; /* Index into the parse array. */
356	unsigned int constraint; /* Constraint for matching this position. */
357	} position;
358
359	/* Sets of positions are stored as arrays. */
360	typedef struct
361	{
362	position elems; / Elements of this position set. */
363	idx_t nelem; /* Number of elements in this set. */
364	idx_t alloc; /* Number of elements allocated in ELEMS. */
365	} position_set;
366
367	/* A state of the dfa consists of a set of positions, some flags,
368	and the token value of the lowest-numbered position of the state that
369	contains an END token. */
370	typedef struct
371	{
372	size_t hash; /* Hash of the positions of this state. */
373	position_set elems; /* Positions this state could match. */
374	unsigned char context; /* Context from previous state. */
375	unsigned short constraint; /* Constraint for this state to accept. */
376	position_set mbps; /* Positions which can match multibyte
377	characters or the follows, e.g., period.
378	Used only if MB_CUR_MAX > 1. */
379	state_num mb_trindex; /* Index of this state in MB_TRANS, or
380	negative if the state does not have
381	ANYCHAR. */
382	} dfa_state;
383
/* Upper bound on any transition table count.  The initial state setup
   needs this to be at least 3.  */
enum
{
  MAX_TRCOUNT = 1024
};
387
388	/* A bracket operator.
389	e.g., [a-c], [[:alpha:]], etc. */
390	struct mb_char_classes
391	{
392	ptrdiff_t cset;
393	bool invert;
394	wchar_t chars; / Normal characters. */
395	idx_t nchars;
396	idx_t nchars_alloc;
397	};
398
399	struct regex_syntax
400	{
401	/* Syntax bits controlling the behavior of the lexical analyzer. */
402	reg_syntax_t syntax_bits;
403	int dfaopts;
404	bool syntax_bits_set;
405
406	/* Flag for case-folding letters into sets. */
407	bool case_fold;
408
409	/* End-of-line byte in data. */
410	unsigned char eolbyte;
411
412	/* Cache of char-context values. */
413	char sbit[NOTCHAR];
414
415	/* If never_trail[B], the byte B cannot be a non-initial byte in a
416	multibyte character. */
417	bool never_trail[NOTCHAR];
418
419	/* Set of characters considered letters. */
420	charclass letters;
421
422	/* Set of characters that are newline. */
423	charclass newline;
424	};
425
426	/* Lexical analyzer. All the dross that deals with the obnoxious
427	GNU Regex syntax bits is located here. The poor, suffering
428	reader is referred to the GNU Regex documentation for the
429	meaning of the @#%!@#%^!@ syntax bits. */
430	struct lexer_state
431	{
432	char const ptr; / Pointer to next input character. */
433	idx_t left; /* Number of characters remaining. */
434	token lasttok; /* Previous token returned; initially END. */
435	idx_t parens; /* Count of outstanding left parens. */
436	int minrep, maxrep; /* Repeat counts for {m,n}. */
437
438	/* Wide character representation of the current multibyte character,
439	or WEOF if there was an encoding error. Used only if
440	MB_CUR_MAX > 1. */
441	wint_t wctok;
442
443	/* The most recently analyzed multibyte bracket expression. */
444	struct mb_char_classes brack;
445
446	/* We're separated from beginning or (, \| only by zero-width characters. */
447	bool laststart;
448	};
449
450	/* Recursive descent parser for regular expressions. */
451
452	struct parser_state
453	{
454	token tok; /* Lookahead token. */
455	idx_t depth; /* Current depth of a hypothetical stack
456	holding deferred productions. This is
457	used to determine the depth that will be
458	required of the real stack later on in
459	dfaanalyze. */
460	};
461
462	/* A compiled regular expression. */
463	struct dfa
464	{
465	/* Fields filled by the scanner. */
466	charclass charclasses; / Array of character sets for CSET tokens. */
467	idx_t cindex; /* Index for adding new charclasses. */
468	idx_t calloc; /* Number of charclasses allocated. */
469	ptrdiff_t canychar; /* Index of anychar class, or -1. */
470
471	/* Scanner state */
472	struct lexer_state lex;
473
474	/* Parser state */
475	struct parser_state parse;
476
477	/* Fields filled by the parser. */
478	token tokens; / Postfix parse array. */
479	idx_t tindex; /* Index for adding new tokens. */
480	idx_t talloc; /* Number of tokens currently allocated. */
481	idx_t depth; /* Depth required of an evaluation stack
482	used for depth-first traversal of the
483	parse tree. */
484	idx_t nleaves; /* Number of non-EMPTY leaves
485	in the parse tree. */
486	idx_t nregexps; /* Count of parallel regexps being built
487	with dfaparse. */
488	bool fast; /* The DFA is fast. */
489	bool epsilon; /* Does a token match only the empty string? */
490	token utf8_anychar_classes[9]; /* To lower ANYCHAR in UTF-8 locales. */
491	mbstate_t mbs; /* Multibyte conversion state. */
492
493	/* The following are valid only if MB_CUR_MAX > 1. */
494
495	/* The value of multibyte_prop[i] is defined by following rule.
496	if tokens[i] < NOTCHAR
497	bit 0 : tokens[i] is the first byte of a character, including
498	single-byte characters.
499	bit 1 : tokens[i] is the last byte of a character, including
500	single-byte characters.
501
502	e.g.
503	tokens
504	= 'single_byte_a', 'multi_byte_A', single_byte_b'
505	= 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
506	multibyte_prop
507	= 3 , 1 , 0 , 2 , 3
508	*/
509	char *multibyte_prop;
510
511	/* Fields filled by the superset. */
512	struct dfa superset; / Hint of the dfa. */
513
514	/* Fields filled by the state builder. */
515	dfa_state states; / States of the dfa. */
516	state_num sindex; /* Index for adding new states. */
517	idx_t salloc; /* Number of states currently allocated. */
518
519	/* Fields filled by the parse tree->NFA conversion. */
520	position_set follows; / Array of follow sets, indexed by position
521	index. The follow of a position is the set
522	of positions containing characters that
523	could conceivably follow a character
524	matching the given position in a string
525	matching the regexp. Allocated to the
526	maximum possible position index. */
527	bool searchflag; /* We are supposed to build a searching
528	as opposed to an exact matcher. A searching
529	matcher finds the first and shortest string
530	matching a regexp anywhere in the buffer,
531	whereas an exact matcher finds the longest
532	string matching, but anchored to the
533	beginning of the buffer. */
534
535	/* Fields filled by dfaanalyze. */
536	int constraints; / Array of union of accepting constraints
537	in the follow of a position. */
538	int separates; / Array of contexts on follow of a
539	position. */
540
541	/* Fields filled by dfaexec. */
542	state_num tralloc; /* Number of transition tables that have
543	slots so far, not counting trans[-1] and
544	trans[-2]. */
545	int trcount; /* Number of transition tables that have
546	been built, other than for initial
547	states. */
548	int min_trcount; /* Number of initial states. Equivalently,
549	the minimum state number for which trcount
550	counts transitions. */
551	state_num *trans; / Transition tables for states that can
552	never accept. If the transitions for a
553	state have not yet been computed, or the
554	state could possibly accept, its entry in
555	this table is NULL. This points to two
556	past the start of the allocated array,
557	and trans[-1] and trans[-2] are always
558	NULL. */
559	state_num *fails; / Transition tables after failing to accept
560	on a state that potentially could do so.
561	If trans[i] is non-null, fails[i] must
562	be null. */
563	char success; / Table of acceptance conditions used in
564	dfaexec and computed in build_state. */
565	state_num newlines; / Transitions on newlines. The entry for a
566	newline in any transition table is always
567	-1 so we can count lines without wasting
568	too many cycles. The transition for a
569	newline is stored separately and handled
570	as a special case. Newline is also used
571	as a sentinel at the end of the buffer. */
572	state_num initstate_notbol; /* Initial state for CTX_LETTER and CTX_NONE
573	context in multibyte locales, in which we
574	do not distinguish between their contexts,
575	as not supported word. */
576	position_set mb_follows; /* Follow set added by ANYCHAR on demand. */
577	state_num *mb_trans; / Transition tables for states with
578	ANYCHAR. */
579	state_num mb_trcount; /* Number of transition tables for states with
580	ANYCHAR that have actually been built. */
581
582	/* Syntax configuration. This is near the end so that dfacopysyntax
583	can memset up to here. */
584	struct regex_syntax syntax;
585
586	/* Information derived from the locale. This is at the end so that
587	a quick memset need not clear it specially. */
588
589	/* dfaexec implementation. */
590	char (dfaexec) (struct dfa , char const , char *,
591	bool, idx_t , bool );
592
593	/* Other cached information derived from the locale. */
594	struct localeinfo localeinfo;
595	};
596
597	/* User access to dfa internals. */
598
599	/* S could possibly be an accepting state of R. */
600	static bool
601	accepting (state_num s, struct dfa const *r)
602	{
603	return r->states[s].constraint != 0;
604	}
605
606	/* STATE accepts in the specified context. */
607	static bool
608	accepts_in_context (int prev, int curr, state_num state, struct dfa const *dfa)
609	{
610	return succeeds_in_context (dfa->states[state].constraint, prev, curr);
611	}
612
613	static void regexp (struct dfa *dfa);
614
615	/* Store into *PWC the result of converting the leading bytes of the
616	multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
617	and updating the conversion state in *D. On conversion error,
618	convert just a single byte, to WEOF. Return the number of bytes
619	converted.
620
621	This differs from mbrtowc (PWC, S, N, &D->mbs) as follows:
622
623	* PWC points to wint_t, not to wchar_t.
624	* The last arg is a dfa *D instead of merely a multibyte conversion
625	state D->mbs.
626	* N is idx_t not size_t, and must be at least 1.
627	* S[N - 1] must be a sentinel byte.
628	* Shift encodings are not supported.
629	* The return value is always in the range 1..N.
630	* D->mbs is always valid afterwards.
631	* PWC is always set to something. /
632	static int
633	mbs_to_wchar (wint_t pwc, char const s, idx_t n, struct dfa *d)
634	{
635	unsigned char uc = s[0];
636	wint_t wc = d->localeinfo.sbctowc[uc];
637
638	if (wc == WEOF)
639	{
640	wchar_t wch;
641	size_t nbytes = mbrtowc (&wch, s, n, &d->mbs);
642	if (0 < nbytes && nbytes < (size_t) -2)
643	{
644	*pwc = wch;
645	return nbytes;
646	}
647	memset (&d->mbs, 0, sizeof d->mbs);
648	}
649
650	*pwc = wc;
651	return 1;
652	}
653
#ifdef DEBUG

/* Print a readable name for token T on stderr.  Values of END or less
   print as "END", character values print in hexadecimal, and special
   tokens print by name; anything not listed in the table below prints
   as "CSET".  */
static void
prtok (token t)
{
  static struct
  {
    token tok;
    char const *name;
  } const names[] =
    {
      { BEG, "BEG" },
      { EMPTY, "EMPTY" },
      { BACKREF, "BACKREF" },
      { BEGLINE, "BEGLINE" },
      { ENDLINE, "ENDLINE" },
      { BEGWORD, "BEGWORD" },
      { ENDWORD, "ENDWORD" },
      { LIMWORD, "LIMWORD" },
      { NOTLIMWORD, "NOTLIMWORD" },
      { QMARK, "QMARK" },
      { STAR, "STAR" },
      { PLUS, "PLUS" },
      { CAT, "CAT" },
      { OR, "OR" },
      { LPAREN, "LPAREN" },
      { RPAREN, "RPAREN" },
      { ANYCHAR, "ANYCHAR" },
      { MBCSET, "MBCSET" }
    };

  if (t <= END)
    {
      fprintf (stderr, "END");
      return;
    }

  if (0 <= t && t < NOTCHAR)
    {
      unsigned int ch = t;
      fprintf (stderr, "0x%02x", ch);
      return;
    }

  /* Look the token up by value; unlisted tokens are reported as CSET.  */
  char const *s = "CSET";
  for (size_t k = 0; k < sizeof names / sizeof names[0]; k++)
    if (names[k].tok == t)
      {
        s = names[k].name;
        break;
      }
  fprintf (stderr, "%s", s);
}
#endif /* DEBUG */
733
734	/* Stuff pertaining to charclasses. */
735
736	static bool
737	tstbit (unsigned int b, charclass const *c)
738	{
739	return c->w[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
740	}
741
742	static void
743	setbit (unsigned int b, charclass *c)
744	{
745	charclass_word one = 1;
746	c->w[b / CHARCLASS_WORD_BITS] \|= one << b % CHARCLASS_WORD_BITS;
747	}
748
749	static void
750	clrbit (unsigned int b, charclass *c)
751	{
752	charclass_word one = 1;
753	c->w[b / CHARCLASS_WORD_BITS] &= ~(one << b % CHARCLASS_WORD_BITS);
754	}
755
756	static void
757	zeroset (charclass *s)
758	{
759	memset (s, 0, sizeof *s);
760	}
761
762	static void
763	fillset (charclass *s)
764	{
765	for (int i = 0; i < CHARCLASS_WORDS; i++)
766	s->w[i] = CHARCLASS_WORD_MASK;
767	}
768
769	static void
770	notset (charclass *s)
771	{
772	for (int i = 0; i < CHARCLASS_WORDS; ++i)
773	s->w[i] = CHARCLASS_WORD_MASK & ~s->w[i];
774	}
775
776	static bool
777	equal (charclass const s1, charclass const s2)
778	{
779	charclass_word w = 0;
780	for (int i = 0; i < CHARCLASS_WORDS; i++)
781	w \|= s1->w[i] ^ s2->w[i];
782	return w == 0;
783	}
784
785	static bool
786	emptyset (charclass const *s)
787	{
788	charclass_word w = 0;
789	for (int i = 0; i < CHARCLASS_WORDS; i++)
790	w \|= s->w[i];
791	return w == 0;
792	}
793
794	/* Ensure that the array addressed by PA holds at least I + 1 items.
795	Either return PA, or reallocate the array and return its new address.
796	Although PA may be null, the returned value is never null.
797
798	The array holds NITEMS items, where 0 <= I <= NITEMS; *NITEMS
799	is updated on reallocation. If PA is null, *NITEMS must be zero.
800	Do not allocate more than NITEMS_MAX items total; -1 means no limit.
801	ITEM_SIZE is the size of one item; it must be positive.
802	Avoid O(N*2) behavior on arrays growing linearly. /
803	static void *
804	maybe_realloc (void pa, idx_t i, idx_t nitems,
805	ptrdiff_t nitems_max, idx_t item_size)
806	{
807	if (i < *nitems)
808	return pa;
809	return xpalloc (pa, nitems, 1, nitems_max, item_size);
810	}
811
812	/* In DFA D, find the index of charclass S, or allocate a new one. */
813	static idx_t
814	charclass_index (struct dfa d, charclass const s)
815	{
816	idx_t i;
817
818	for (i = 0; i < d->cindex; ++i)
819	if (equal (s, &d->charclasses[i]))
820	return i;
821	d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
822	TOKEN_MAX - CSET, sizeof *d->charclasses);
823	++d->cindex;
824	d->charclasses[i] = *s;
825	return i;
826	}
827
828	static bool
829	unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
830	{
831	return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) \|\| (c) == '_');
832	}
833
834	static int
835	char_context (struct dfa const *dfa, unsigned char c)
836	{
837	if (c == dfa->syntax.eolbyte && !(dfa->syntax.dfaopts & DFA_ANCHOR))
838	return CTX_NEWLINE;
839	if (unibyte_word_constituent (dfa, c))
840	return CTX_LETTER;
841	return CTX_NONE;
842	}
843
844	/* Set a bit in the charclass for the given wchar_t. Do nothing if WC
845	is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1,
846	this may happen when folding case in weird Turkish locales where
847	dotless i/dotted I are not included in the chosen character set.
848	Return whether a bit was set in the charclass. */
849	static bool
850	setbit_wc (wint_t wc, charclass *c)
851	{
852	int b = wctob (wc);
853	if (b < 0)
854	return false;
855
856	setbit (b, c);
857	return true;
858	}
859
860	/* Set a bit for B and its case variants in the charclass C.
861	MB_CUR_MAX must be 1. */
862	static void
863	setbit_case_fold_c (int b, charclass *c)
864	{
865	int ub = toupper (b);
866	for (int i = 0; i < NOTCHAR; i++)
867	if (toupper (i) == ub)
868	setbit (i, c);
869	}
870
871	/* Fetch the next lexical input character from the pattern. There
872	must at least one byte of pattern input. Set DFA->lex.wctok to the
873	value of the character or to WEOF depending on whether the input is
874	a valid multibyte character (possibly of length 1). Then return
875	the next input byte value, except return EOF if the input is a
876	multibyte character of length greater than 1. */
877	static int
878	fetch_wc (struct dfa *dfa)
879	{
880	int nbytes = mbs_to_wchar (&dfa->lex.wctok, dfa->lex.ptr, dfa->lex.left,
881	dfa);
882	int c = nbytes == 1 ? to_uchar (dfa->lex.ptr[0]) : EOF;
883	dfa->lex.ptr += nbytes;
884	dfa->lex.left -= nbytes;
885	return c;
886	}
887
888	/* If there is no more input, report an error about unbalanced brackets.
889	Otherwise, behave as with fetch_wc (DFA). */
890	static int
891	bracket_fetch_wc (struct dfa *dfa)
892	{
893	if (! dfa->lex.left)
894	dfaerror (_("unbalanced ["));
895	return fetch_wc (dfa);
896	}
897
/* The type of a <ctype.h>-style classification function.  */
typedef int predicate (int);

/* One entry of the table that maps the name of a Posix named
   character class to the predicate function that decides whether a
   given character belongs to the class.  The leading [ has already
   been eaten by the lexical analyzer.  */
struct dfa_ctype
{
  /* Class name, e.g. "alpha".  */
  const char *name;

  /* Membership test for the class.  */
  predicate *func;

  /* When true, the class is handled byte by byte even in a multibyte
     locale.  */
  bool single_byte_only;
};
910
911	static const struct dfa_ctype prednames[] = {
912	{"alpha", isalpha, false},
913	{"upper", isupper, false},
914	{"lower", islower, false},
915	{"digit", isdigit, true},
916	{"xdigit", isxdigit, false},
917	{"space", isspace, false},
918	{"punct", ispunct, false},
919	{"alnum", isalnum, false},
920	{"print", isprint, false},
921	{"graph", isgraph, false},
922	{"cntrl", iscntrl, false},
923	{"blank", isblank, false},
924	{NULL, NULL, false}
925	};
926
927	static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
928	find_pred (const char *str)
929	{
930	for (int i = 0; prednames[i].name; i++)
931	if (str_eq (str, prednames[i].name))
932	return &prednames[i];
933	return NULL;
934	}
935
936	/* Parse a bracket expression, which possibly includes multibyte
937	characters. */
938	static token
939	parse_bracket_exp (struct dfa *dfa)
940	{
941	/* This is a bracket expression that dfaexec is known to
942	process correctly. */
943	bool known_bracket_exp = true;
944
945	/* Used to warn about [:space:].
946	Bit 0 = first character is a colon.
947	Bit 1 = last character is a colon.
948	Bit 2 = includes any other character but a colon.
949	Bit 3 = includes ranges, char/equiv classes or collation elements. */
950	int colon_warning_state;
951
952	dfa->lex.brack.nchars = 0;
953	charclass ccl;
954	zeroset (&ccl);
955	int c = bracket_fetch_wc (dfa);
956	bool invert = c == '^';
957	if (invert)
958	{
959	c = bracket_fetch_wc (dfa);
960	known_bracket_exp = dfa->localeinfo.simple;
961	}
962	wint_t wc = dfa->lex.wctok;
963	int c1;
964	wint_t wc1;
965	colon_warning_state = (c == ':');
966	do
967	{
968	c1 = NOTCHAR; /* Mark c1 as not initialized. */
969	colon_warning_state &= ~2;
970
971	/* Note that if we're looking at some other [:...:] construct,
972	we just treat it as a bunch of ordinary characters. We can do
973	this because we assume regex has checked for syntax errors before
974	dfa is ever called. */
975	if (c == '[')
976	{
977	c1 = bracket_fetch_wc (dfa);
978	wc1 = dfa->lex.wctok;
979
980	if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES))
981	\|\| c1 == '.' \|\| c1 == '=')
982	{
983	enum { MAX_BRACKET_STRING_LEN = 32 };
984	char str[MAX_BRACKET_STRING_LEN + 1];
985	int len = 0;
986	for (;;)
987	{
988	c = bracket_fetch_wc (dfa);
989	if (dfa->lex.left == 0
990	\|\| (c == c1 && dfa->lex.ptr[0] == ']'))
991	break;
992	if (len < MAX_BRACKET_STRING_LEN)
993	str[len++] = c;
994	else
995	/* This is in any case an invalid class name. */
996	str[0] = '\0';
997	}
998	str[len] = '\0';
999
1000	/* Fetch bracket. */
1001	c = bracket_fetch_wc (dfa);
1002	wc = dfa->lex.wctok;
1003	if (c1 == ':')
1004	/* Build character class. POSIX allows character
1005	classes to match multicharacter collating elements,
1006	but the regex code does not support that, so do not
1007	worry about that possibility. */
1008	{
1009	char const *class
1010	= (dfa->syntax.case_fold && (str_eq (str, "upper")
1011	\|\| str_eq (str, "lower"))
1012	? "alpha" : str);
1013	const struct dfa_ctype *pred = find_pred (class);
1014	if (!pred)
1015	dfaerror (_("invalid character class"));
1016
1017	if (dfa->localeinfo.multibyte && !pred->single_byte_only)
1018	known_bracket_exp = false;
1019	else
1020	for (int c2 = 0; c2 < NOTCHAR; ++c2)
1021	if (pred->func (c2))
1022	setbit (c2, &ccl);
1023	}
1024	else
1025	known_bracket_exp = false;
1026
1027	colon_warning_state \|= 8;
1028
1029	/* Fetch new lookahead character. */
1030	c1 = bracket_fetch_wc (dfa);
1031	wc1 = dfa->lex.wctok;
1032	continue;
1033	}
1034
1035	/* We treat '[' as a normal character here. c/c1/wc/wc1
1036	are already set up. */
1037	}
1038
1039	if (c == '\\'
1040	&& (dfa->syntax.syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1041	{
1042	c = bracket_fetch_wc (dfa);
1043	wc = dfa->lex.wctok;
1044	}
1045
1046	if (c1 == NOTCHAR)
1047	{
1048	c1 = bracket_fetch_wc (dfa);
1049	wc1 = dfa->lex.wctok;
1050	}
1051
1052	if (c1 == '-')
1053	/* build range characters. */
1054	{
1055	int c2 = bracket_fetch_wc (dfa);
1056	wint_t wc2 = dfa->lex.wctok;
1057
1058	/* A bracket expression like [a-[.aa.]] matches an unknown set.
1059	Treat it like [-a[.aa.]] while parsing it, and
1060	remember that the set is unknown. */
1061	if (c2 == '[' && dfa->lex.ptr[0] == '.')
1062	{
1063	known_bracket_exp = false;
1064	c2 = ']';
1065	}
1066
1067	if (c2 == ']')
1068	{
1069	/* In the case [x-], the - is an ordinary hyphen,
1070	which is left in c1, the lookahead character. */
1071	dfa->lex.ptr--;
1072	dfa->lex.left++;
1073	}
1074	else
1075	{
1076	if (c2 == '\\' && (dfa->syntax.syntax_bits
1077	& RE_BACKSLASH_ESCAPE_IN_LISTS))
1078	{
1079	c2 = bracket_fetch_wc (dfa);
1080	wc2 = dfa->lex.wctok;
1081	}
1082
1083	colon_warning_state \|= 8;
1084	c1 = bracket_fetch_wc (dfa);
1085	wc1 = dfa->lex.wctok;
1086
1087	/* Treat [x-y] as a range if x != y. */
1088	if (wc != wc2 \|\| wc == WEOF)
1089	{
1090	if (dfa->localeinfo.simple
1091	\|\| (c_isdigit (c) & c_isdigit (c2)))
1092	{
1093	for (int ci = c; ci <= c2; ci++)
1094	if (dfa->syntax.case_fold && isalpha (ci))
1095	setbit_case_fold_c (ci, &ccl);
1096	else
1097	setbit (ci, &ccl);
1098	}
1099	else
1100	known_bracket_exp = false;
1101
1102	continue;
1103	}
1104	}
1105	}
1106
1107	colon_warning_state \|= (c == ':') ? 2 : 4;
1108
1109	if (!dfa->localeinfo.multibyte)
1110	{
1111	if (dfa->syntax.case_fold && isalpha (c))
1112	setbit_case_fold_c (c, &ccl);
1113	else
1114	setbit (c, &ccl);
1115	continue;
1116	}
1117
1118	if (wc == WEOF)
1119	known_bracket_exp = false;
1120	else
1121	{
1122	wchar_t folded[CASE_FOLDED_BUFSIZE + 1];
1123	int n = (dfa->syntax.case_fold
1124	? case_folded_counterparts (wc, folded + 1) + 1
1125	: 1);
1126	folded[0] = wc;
1127	for (int i = 0; i < n; i++)
1128	if (!setbit_wc (folded[i], &ccl))
1129	{
1130	dfa->lex.brack.chars
1131	= maybe_realloc (dfa->lex.brack.chars, dfa->lex.brack.nchars,
1132	&dfa->lex.brack.nchars_alloc, -1,
1133	sizeof *dfa->lex.brack.chars);
1134	dfa->lex.brack.chars[dfa->lex.brack.nchars++] = folded[i];
1135	}
1136	}
1137	}
1138	while ((wc = wc1, (c = c1) != ']'));
1139
1140	if (colon_warning_state == 7)
1141	((dfa->syntax.dfaopts & DFA_CONFUSING_BRACKETS_ERROR
1142	? dfaerror : dfawarn)
1143	(_("character class syntax is [[:space:]], not [:space:]")));
1144
1145	if (! known_bracket_exp)
1146	return BACKREF;
1147
1148	if (dfa->localeinfo.multibyte && (invert \|\| dfa->lex.brack.nchars != 0))
1149	{
1150	dfa->lex.brack.invert = invert;
1151	dfa->lex.brack.cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
1152	return MBCSET;
1153	}
1154
1155	if (invert)
1156	{
1157	notset (&ccl);
1158	if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1159	clrbit ('\n', &ccl);
1160	}
1161
1162	return CSET + charclass_index (dfa, &ccl);
1163	}
1164
/* A saved lexer input cursor.  push_lex_state fills one of these from
   the dfa's lex.ptr and lex.left fields, and pop_lex_state writes the
   saved values back.  */
struct lexptr
{
  char const *ptr;	/* Saved value of dfa->lex.ptr.  */
  idx_t left;		/* Saved value of dfa->lex.left.  */
};
1170
1171	static void
1172	push_lex_state (struct dfa dfa, struct lexptr ls, char const *s)
1173	{
1174	ls->ptr = dfa->lex.ptr;
1175	ls->left = dfa->lex.left;
1176	dfa->lex.ptr = s;
1177	dfa->lex.left = strlen (s);
1178	}
1179
1180	static void
1181	pop_lex_state (struct dfa dfa, struct lexptr const ls)
1182	{
1183	dfa->lex.ptr = ls->ptr;
1184	dfa->lex.left = ls->left;
1185	}
1186
1187	static token
1188	lex (struct dfa *dfa)
1189	{
1190	bool backslash = false;
1191
1192	/* Basic plan: We fetch a character. If it's a backslash,
1193	we set the backslash flag and go through the loop again.
1194	On the plus side, this avoids having a duplicate of the
1195	main switch inside the backslash case. On the minus side,
1196	it means that just about every case tests the backslash flag. */
1197	for (int i = 0; i < 2; ++i)
1198	{
1199	if (! dfa->lex.left)
1200	return dfa->lex.lasttok = END;
1201	int c = fetch_wc (dfa);
1202
1203	switch (c)
1204	{
1205	case '\\':
1206	if (backslash)
1207	goto normal_char;
1208	if (dfa->lex.left == 0)
1209	dfaerror (_("unfinished \\ escape"));
1210	backslash = true;
1211	break;
1212
1213	case '^':
1214	if (backslash)
1215	goto normal_char;
1216	if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1217	\|\| dfa->lex.lasttok == END \|\| dfa->lex.lasttok == LPAREN
1218	\|\| dfa->lex.lasttok == OR)
1219	return dfa->lex.lasttok = BEGLINE;
1220	goto normal_char;
1221
1222	case '$':
1223	if (backslash)
1224	goto normal_char;
1225	/* kmk: cl v19.29.30139/amd64 messes this function up when optimizing
1226	for speed, workaround is to optimize it for size instead. The
1227	symptom is that the following SED expression fail to match:
1228	s/^[0-9a-fA-F]\{1,\} $00[0-9a-fA-F]$ ABS notype External \| $[^.]\{1,\}$\.$.*$$/ 1=\1 2=\2 3=\3/
1229
1230	Seems the exact problem is that it gets the indexing here wrong:
1231	dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS) & (dfa->lex.ptr[0] == '\\')]
1232	It forgets to do the ` dfa->lex.ptr[0] == '\\' ` part and instead
1233	ANDs with a register initialized to zero. Rewriting the
1234	expressions using the tinary operator works around the problem,
1235	although the resulting code is a lot bulkier.
1236	*/
1237	if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1238	\|\| dfa->lex.left == 0
1239	#ifdef _MSC_VER /* see above */
1240	\|\| (!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
1241	? dfa->lex.left > 1 && dfa->lex.ptr[dfa->lex.ptr[0] == '\\'] == ')'
1242	: dfa->lex.left > 0 && dfa->lex.ptr[0] == ')')
1243	#else
1244	\|\| ((dfa->lex.left
1245	> !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
1246	&& (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
1247	& (dfa->lex.ptr[0] == '\\')]
1248	== ')'))
1249	#endif
1250	#ifdef _MSC_VER /* see above */
1251	\|\| (!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
1252	? dfa->lex.left > 1 && dfa->lex.ptr[dfa->lex.ptr[0] == '\\'] == '\|'
1253	: dfa->lex.left > 0 && dfa->lex.ptr[0] == '\|')
1254	#else
1255	\|\| ((dfa->lex.left
1256	> !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
1257	&& (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
1258	& (dfa->lex.ptr[0] == '\\')]
1259	== '\|'))
1260	#endif
1261	\|\| ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
1262	&& dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
1263	return dfa->lex.lasttok = ENDLINE;
1264	goto normal_char;
1265
1266	case '1':
1267	case '2':
1268	case '3':
1269	case '4':
1270	case '5':
1271	case '6':
1272	case '7':
1273	case '8':
1274	case '9':
1275	if (!backslash)
1276	goto normal_char;
1277	if (dfa->syntax.syntax_bits & RE_NO_BK_REFS)
1278	goto stray_backslash;
1279
1280	dfa->lex.laststart = false;
1281	return dfa->lex.lasttok = BACKREF;
1282
1283	case '`':
1284	if (!backslash)
1285	goto normal_char;
1286	if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1287	goto stray_backslash;
1288
1289	/* FIXME: should be beginning of string */
1290	return dfa->lex.lasttok = BEGLINE;
1291
1292	case '\'':
1293	if (!backslash)
1294	goto normal_char;
1295	if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1296	goto stray_backslash;
1297
1298	/* FIXME: should be end of string */
1299	return dfa->lex.lasttok = ENDLINE;
1300
1301	case '<':
1302	if (!backslash)
1303	goto normal_char;
1304	if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1305	goto stray_backslash;
1306
1307	return dfa->lex.lasttok = BEGWORD;
1308
1309	case '>':
1310	if (!backslash)
1311	goto normal_char;
1312	if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1313	goto stray_backslash;
1314
1315	return dfa->lex.lasttok = ENDWORD;
1316
1317	case 'b':
1318	if (!backslash)
1319	goto normal_char;
1320	if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1321	goto stray_backslash;
1322
1323	return dfa->lex.lasttok = LIMWORD;
1324
1325	case 'B':
1326	if (!backslash)
1327	goto normal_char;
1328	if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1329	goto stray_backslash;
1330
1331	return dfa->lex.lasttok = NOTLIMWORD;
1332
1333	case '?':
1334	if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1335	goto default_case;
1336	if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1337	goto normal_char;
1338	if (dfa->lex.laststart)
1339	{
1340	if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1341	goto default_case;
1342	if (dfa->syntax.dfaopts & DFA_PLUS_WARN)
1343	dfawarn (_("? at start of expression"));
1344	}
1345	return dfa->lex.lasttok = QMARK;
1346
1347	case '*':
1348	if (backslash)
1349	goto normal_char;
1350	if (dfa->lex.laststart)
1351	{
1352	if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1353	goto default_case;
1354	if (dfa->syntax.dfaopts & DFA_STAR_WARN)
1355	dfawarn (_("* at start of expression"));
1356	}
1357	return dfa->lex.lasttok = STAR;
1358
1359	case '+':
1360	if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1361	goto default_case;
1362	if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1363	goto normal_char;
1364	if (dfa->lex.laststart)
1365	{
1366	if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1367	goto default_case;
1368	if (dfa->syntax.dfaopts & DFA_PLUS_WARN)
1369	dfawarn (_("+ at start of expression"));
1370	}
1371	return dfa->lex.lasttok = PLUS;
1372
1373	case '{':
1374	if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
1375	goto default_case;
1376	if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
1377	goto normal_char;
1378
1379	/* Cases:
1380	{M} - exact count
1381	{M,} - minimum count, maximum is infinity
1382	{,N} - 0 through N
1383	{,} - 0 to infinity (same as '*')
1384	{M,N} - M through N */
1385	{
1386	char const *p = dfa->lex.ptr;
1387	char const *lim = p + dfa->lex.left;
1388	dfa->lex.minrep = dfa->lex.maxrep = -1;
1389	for (; p != lim && c_isdigit (*p); p++)
1390	dfa->lex.minrep = (dfa->lex.minrep < 0
1391	? *p - '0'
1392	: MIN (RE_DUP_MAX + 1,
1393	dfa->lex.minrep * 10 + *p - '0'));
1394	if (p != lim)
1395	{
1396	if (*p != ',')
1397	dfa->lex.maxrep = dfa->lex.minrep;
1398	else
1399	{
1400	if (dfa->lex.minrep < 0)
1401	dfa->lex.minrep = 0;
1402	while (++p != lim && c_isdigit (*p))
1403	dfa->lex.maxrep
1404	= (dfa->lex.maxrep < 0
1405	? *p - '0'
1406	: MIN (RE_DUP_MAX + 1,
1407	dfa->lex.maxrep * 10 + *p - '0'));
1408	}
1409	}
1410	bool invalid_content
1411	= ! ((! backslash \|\| (p != lim && *p++ == '\\'))
1412	&& p != lim && *p++ == '}'
1413	&& 0 <= dfa->lex.minrep
1414	&& (dfa->lex.maxrep < 0
1415	\|\| dfa->lex.minrep <= dfa->lex.maxrep));
1416	if (invalid_content
1417	&& (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD))
1418	goto normal_char;
1419	if (dfa->lex.laststart)
1420	{
1421	if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1422	goto default_case;
1423	if (dfa->syntax.dfaopts & DFA_PLUS_WARN)
1424	dfawarn (_("{...} at start of expression"));
1425	}
1426	if (invalid_content)
1427	dfaerror (_("invalid content of \\{\\}"));
1428	if (RE_DUP_MAX < dfa->lex.maxrep)
1429	dfaerror (_("regular expression too big"));
1430	dfa->lex.ptr = p;
1431	dfa->lex.left = lim - p;
1432	}
1433	dfa->lex.laststart = false;
1434	return dfa->lex.lasttok = REPMN;
1435
1436	case '\|':
1437	if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1438	goto default_case;
1439	if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
1440	goto normal_char;
1441	dfa->lex.laststart = true;
1442	return dfa->lex.lasttok = OR;
1443
1444	case '\n':
1445	if (!(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
1446	goto default_case;
1447	if (backslash)
1448	goto normal_char;
1449	dfa->lex.laststart = true;
1450	return dfa->lex.lasttok = OR;
1451
1452	case '(':
1453	if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1454	goto normal_char;
1455	dfa->lex.parens++;
1456	dfa->lex.laststart = true;
1457	return dfa->lex.lasttok = LPAREN;
1458
1459	case ')':
1460	if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1461	goto normal_char;
1462	if (dfa->lex.parens == 0
1463	&& dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1464	goto normal_char;
1465	dfa->lex.parens--;
1466	dfa->lex.laststart = false;
1467	return dfa->lex.lasttok = RPAREN;
1468
1469	case '.':
1470	if (backslash)
1471	goto normal_char;
1472	if (dfa->canychar < 0)
1473	{
1474	charclass ccl;
1475	fillset (&ccl);
1476	if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1477	clrbit ('\n', &ccl);
1478	if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1479	clrbit ('\0', &ccl);
1480	if (dfa->localeinfo.multibyte)
1481	for (int c2 = 0; c2 < NOTCHAR; c2++)
1482	if (dfa->localeinfo.sbctowc[c2] == WEOF)
1483	clrbit (c2, &ccl);
1484	dfa->canychar = charclass_index (dfa, &ccl);
1485	}
1486	dfa->lex.laststart = false;
1487	return dfa->lex.lasttok = (dfa->localeinfo.multibyte
1488	? ANYCHAR
1489	: CSET + dfa->canychar);
1490
1491	case 's':
1492	case 'S':
1493	if (!backslash)
1494	goto normal_char;
1495	if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1496	goto stray_backslash;
1497
1498	if (!dfa->localeinfo.multibyte)
1499	{
1500	charclass ccl;
1501	zeroset (&ccl);
1502	for (int c2 = 0; c2 < NOTCHAR; ++c2)
1503	if (isspace (c2))
1504	setbit (c2, &ccl);
1505	if (c == 'S')
1506	notset (&ccl);
1507	dfa->lex.laststart = false;
1508	return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1509	}
1510
1511	/* FIXME: see if optimizing this, as is done with ANYCHAR and
1512	add_utf8_anychar, makes sense. */
1513
1514	/* \s and \S are documented to be equivalent to [[:space:]] and
1515	[^[:space:]] respectively, so tell the lexer to process those
1516	strings, each minus its "already processed" '['. */
1517	{
1518	struct lexptr ls;
1519	push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
1520	dfa->lex.lasttok = parse_bracket_exp (dfa);
1521	pop_lex_state (dfa, &ls);
1522	}
1523
1524	dfa->lex.laststart = false;
1525	return dfa->lex.lasttok;
1526
1527	case 'w':
1528	case 'W':
1529	if (!backslash)
1530	goto normal_char;
1531	if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1532	goto stray_backslash;
1533
1534	if (!dfa->localeinfo.multibyte)
1535	{
1536	charclass ccl;
1537	zeroset (&ccl);
1538	for (int c2 = 0; c2 < NOTCHAR; ++c2)
1539	if (dfa->syntax.sbit[c2] == CTX_LETTER)
1540	setbit (c2, &ccl);
1541	if (c == 'W')
1542	notset (&ccl);
1543	dfa->lex.laststart = false;
1544	return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1545	}
1546
1547	/* FIXME: see if optimizing this, as is done with ANYCHAR and
1548	add_utf8_anychar, makes sense. */
1549
1550	/* \w and \W are documented to be equivalent to [_[:alnum:]] and
1551	[^_[:alnum:]] respectively, so tell the lexer to process those
1552	strings, each minus its "already processed" '['. */
1553	{
1554	struct lexptr ls;
1555	push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
1556	dfa->lex.lasttok = parse_bracket_exp (dfa);
1557	pop_lex_state (dfa, &ls);
1558	}
1559
1560	dfa->lex.laststart = false;
1561	return dfa->lex.lasttok;
1562
1563	case '[':
1564	if (backslash)
1565	goto normal_char;
1566	dfa->lex.laststart = false;
1567	return dfa->lex.lasttok = parse_bracket_exp (dfa);
1568
1569	default:
1570	default_case:
1571	if (!backslash)
1572	goto normal_char;
1573	stray_backslash:
1574	if (dfa->syntax.dfaopts & DFA_STRAY_BACKSLASH_WARN)
1575	{
1576	char const *msg;
1577	char msgbuf[100];
1578	if (!iswprint (dfa->lex.wctok))
1579	msg = _("stray \\ before unprintable character");
1580	else if (iswspace (dfa->lex.wctok))
1581	msg = _("stray \\ before white space");
1582	else
1583	{
1584	int n = snprintf (msgbuf, sizeof msgbuf,
1585	_("stray \\ before %lc"), dfa->lex.wctok);
1586	msg = 0 <= n && n < sizeof msgbuf ? msgbuf : _("stray \\");
1587	}
1588	dfawarn (msg);
1589	}
1590	FALLTHROUGH;
1591	case ']': case '}':
1592	normal_char:
1593	dfa->lex.laststart = false;
1594	/* For multibyte character sets, folding is done in atom. Always
1595	return WCHAR. */
1596	if (dfa->localeinfo.multibyte)
1597	return dfa->lex.lasttok = WCHAR;
1598
1599	if (dfa->syntax.case_fold && isalpha (c))
1600	{
1601	charclass ccl;
1602	zeroset (&ccl);
1603	setbit_case_fold_c (c, &ccl);
1604	return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1605	}
1606
1607	return dfa->lex.lasttok = c;
1608	}
1609	}
1610
1611	/* The above loop should consume at most a backslash
1612	and some other character. */
1613	abort ();
1614	return END; /* keeps pedantic compilers happy. */
1615	}
1616
1617	static void
1618	addtok_mb (struct dfa *dfa, token t, char mbprop)
1619	{
1620	if (dfa->talloc == dfa->tindex)
1621	{
1622	dfa->tokens = xpalloc (dfa->tokens, &dfa->talloc, 1, -1,
1623	sizeof *dfa->tokens);
1624	if (dfa->localeinfo.multibyte)
1625	dfa->multibyte_prop = xreallocarray (dfa->multibyte_prop, dfa->talloc,
1626	sizeof *dfa->multibyte_prop);
1627	}
1628	if (dfa->localeinfo.multibyte)
1629	dfa->multibyte_prop[dfa->tindex] = mbprop;
1630	dfa->tokens[dfa->tindex++] = t;
1631
1632	switch (t)
1633	{
1634	case QMARK:
1635	case STAR:
1636	case PLUS:
1637	break;
1638
1639	case CAT:
1640	case OR:
1641	dfa->parse.depth--;
1642	break;
1643
1644	case EMPTY:
1645	dfa->epsilon = true;
1646	goto increment_depth;
1647
1648	case BACKREF:
1649	dfa->fast = false;
1650	goto increment_nleaves;
1651
1652	case BEGLINE:
1653	case ENDLINE:
1654	case BEGWORD:
1655	case ENDWORD:
1656	case LIMWORD:
1657	case NOTLIMWORD:
1658	dfa->epsilon = true;
1659	FALLTHROUGH;
1660	default:
1661	increment_nleaves:
1662	dfa->nleaves++;
1663	increment_depth:
1664	dfa->parse.depth++;
1665	if (dfa->depth < dfa->parse.depth)
1666	dfa->depth = dfa->parse.depth;
1667	break;
1668	}
1669	}
1670
1671	static void addtok_wc (struct dfa *dfa, wint_t wc);
1672
1673	/* Add the given token to the parse tree, maintaining the depth count and
1674	updating the maximum depth if necessary. */
1675	static void
1676	addtok (struct dfa *dfa, token t)
1677	{
1678	if (dfa->localeinfo.multibyte && t == MBCSET)
1679	{
1680	bool need_or = false;
1681
1682	/* Extract wide characters into alternations for better performance.
1683	This does not require UTF-8. */
1684	for (idx_t i = 0; i < dfa->lex.brack.nchars; i++)
1685	{
1686	addtok_wc (dfa, dfa->lex.brack.chars[i]);
1687	if (need_or)
1688	addtok (dfa, OR);
1689	need_or = true;
1690	}
1691	dfa->lex.brack.nchars = 0;
1692
1693	/* Wide characters have been handled above, so it is possible
1694	that the set is empty now. Do nothing in that case. */
1695	if (dfa->lex.brack.cset != -1)
1696	{
1697	addtok (dfa, CSET + dfa->lex.brack.cset);
1698	if (need_or)
1699	addtok (dfa, OR);
1700	}
1701	}
1702	else
1703	{
1704	addtok_mb (dfa, t, 3);
1705	}
1706	}
1707
1708	/* We treat a multibyte character as a single atom, so that DFA
1709	can treat a multibyte character as a single expression.
1710
1711	e.g., we construct the following tree from "<mb1><mb2>".
1712	<mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1713	<mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1714	static void
1715	addtok_wc (struct dfa *dfa, wint_t wc)
1716	{
1717	unsigned char buf[MB_LEN_MAX];
1718	mbstate_t s = { 0 };
1719	size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
1720	int buflen;
1721
1722	if (stored_bytes != (size_t) -1)
1723	buflen = stored_bytes;
1724	else
1725	{
1726	/* This is merely stop-gap. buf[0] is undefined, yet skipping
1727	the addtok_mb call altogether can corrupt the heap. */
1728	buflen = 1;
1729	buf[0] = 0;
1730	}
1731
1732	addtok_mb (dfa, buf[0], buflen == 1 ? 3 : 1);
1733	for (int i = 1; i < buflen; i++)
1734	{
1735	addtok_mb (dfa, buf[i], i == buflen - 1 ? 2 : 0);
1736	addtok (dfa, CAT);
1737	}
1738	}
1739
1740	static void
1741	add_utf8_anychar (struct dfa *dfa)
1742	{
1743	/* Since the Unicode Standard Version 4.0.0 (2003), a well-formed
1744	UTF-8 byte sequence has been defined as follows:
1745
1746	([\x00-\x7f]
1747	\|[\xc2-\xdf][\x80-\xbf]
1748	\|[\xe0][\xa0-\xbf][\x80-\xbf]
1749	\|[\xe1-\xec\xee-\xef][\x80-\xbf][\x80-\xbf]
1750	\|[\xed][\x80-\x9f][\x80-\xbf]
1751	\|[\xf0][\x90-\xbf][\x80-\xbf][\x80-\xbf])
1752	\|[\xf1-\xf3][\x80-\xbf][\x80-\xbf][\x80-\xbf]
1753	\|[\xf4][\x80-\x8f][\x80-\xbf][\x80-\xbf])
1754
1755	which I'll write more concisely "A\|BC\|DEC\|FCC\|GHC\|IJCC\|KCCC\|LMCC",
1756	where A = [\x00-\x7f], B = [\xc2-\xdf], C = [\x80-\xbf],
1757	D = [\xe0], E = [\xa0-\xbf], F = [\xe1-\xec\xee-\xef], G = [\xed],
1758	H = [\x80-\x9f], I = [\xf0],
1759	J = [\x90-\xbf], K = [\xf1-\xf3], L = [\xf4], M = [\x80-\x8f].
1760
1761	This can be refactored to "A\|(B\|DE\|GH\|(F\|IJ\|LM\|KC)C)C". */
1762
1763	/* Mnemonics for classes containing two or more bytes. */
1764	enum { A, B, C, E, F, H, J, K, M };
1765
1766	/* Mnemonics for single-byte tokens. */
1767	enum { D_token = 0xe0, G_token = 0xed, I_token = 0xf0, L_token = 0xf4 };
1768
1769	static charclass const utf8_classes[] = {
1770	/* A. 00-7f: 1-byte sequence. */
1771	CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
1772
1773	/* B. c2-df: 1st byte of a 2-byte sequence. */
1774	CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
1775
1776	/* C. 80-bf: non-leading bytes. */
1777	CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
1778
1779	/* D. e0 (just a token). */
1780
1781	/* E. a0-bf: 2nd byte of a "DEC" sequence. */
1782	CHARCLASS_INIT (0, 0, 0, 0, 0, 0xffffffff, 0, 0),
1783
1784	/* F. e1-ec + ee-ef: 1st byte of an "FCC" sequence. */
1785	CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xdffe),
1786
1787	/* G. ed (just a token). */
1788
1789	/* H. 80-9f: 2nd byte of a "GHC" sequence. */
1790	CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0, 0, 0),
1791
1792	/* I. f0 (just a token). */
1793
1794	/* J. 90-bf: 2nd byte of an "IJCC" sequence. */
1795	CHARCLASS_INIT (0, 0, 0, 0, 0xffff0000, 0xffffffff, 0, 0),
1796
1797	/* K. f1-f3: 1st byte of a "KCCC" sequence. */
1798	CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xe0000),
1799
1800	/* L. f4 (just a token). */
1801
1802	/* M. 80-8f: 2nd byte of a "LMCC" sequence. */
1803	CHARCLASS_INIT (0, 0, 0, 0, 0xffff, 0, 0, 0),
1804	};
1805
1806	/* Define the character classes that are needed below. */
1807	if (dfa->utf8_anychar_classes[0] == 0)
1808	{
1809	charclass c = utf8_classes[0];
1810	if (! (dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1811	clrbit ('\n', &c);
1812	if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1813	clrbit ('\0', &c);
1814	dfa->utf8_anychar_classes[0] = CSET + charclass_index (dfa, &c);
1815
1816	for (int i = 1; i < sizeof utf8_classes / sizeof *utf8_classes; i++)
1817	dfa->utf8_anychar_classes[i]
1818	= CSET + charclass_index (dfa, &utf8_classes[i]);
1819	}
1820
1821	/* Implement the "A\|(B\|DE\|GH\|(F\|IJ\|LM\|KC)C)C" pattern mentioned above.
1822	The token buffer is in reverse Polish order, so we get
1823	"A B D E CAT OR G H CAT OR F I J CAT OR L M CAT OR K
1824	C CAT OR C CAT OR C CAT OR". */
1825	addtok (dfa, dfa->utf8_anychar_classes[A]);
1826	addtok (dfa, dfa->utf8_anychar_classes[B]);
1827	addtok (dfa, D_token);
1828	addtok (dfa, dfa->utf8_anychar_classes[E]);
1829	addtok (dfa, CAT);
1830	addtok (dfa, OR);
1831	addtok (dfa, G_token);
1832	addtok (dfa, dfa->utf8_anychar_classes[H]);
1833	addtok (dfa, CAT);
1834	addtok (dfa, OR);
1835	addtok (dfa, dfa->utf8_anychar_classes[F]);
1836	addtok (dfa, I_token);
1837	addtok (dfa, dfa->utf8_anychar_classes[J]);
1838	addtok (dfa, CAT);
1839	addtok (dfa, OR);
1840	addtok (dfa, L_token);
1841	addtok (dfa, dfa->utf8_anychar_classes[M]);
1842	addtok (dfa, CAT);
1843	addtok (dfa, OR);
1844	addtok (dfa, dfa->utf8_anychar_classes[K]);
1845	for (int i = 0; i < 3; i++)
1846	{
1847	addtok (dfa, dfa->utf8_anychar_classes[C]);
1848	addtok (dfa, CAT);
1849	addtok (dfa, OR);
1850	}
1851	}
1852
1853	/* The grammar understood by the parser is as follows.
1854
1855	regexp:
1856	regexp OR branch
1857	branch
1858
1859	branch:
1860	branch closure
1861	closure
1862
1863	closure:
1864	closure QMARK
1865	closure STAR
1866	closure PLUS
1867	closure REPMN
1868	atom
1869
1870	atom:
1871	<normal character>
1872	<multibyte character>
1873	ANYCHAR
1874	MBCSET
1875	CSET
1876	BACKREF
1877	BEGLINE
1878	ENDLINE
1879	BEGWORD
1880	ENDWORD
1881	LIMWORD
1882	NOTLIMWORD
1883	LPAREN regexp RPAREN
1884	<empty>
1885
1886	The parser builds a parse tree in postfix form in an array of tokens. */
1887
1888	static void
1889	atom (struct dfa *dfa)
1890	{
1891	if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
1892	\|\| dfa->parse.tok >= CSET
1893	\|\| dfa->parse.tok == BEG \|\| dfa->parse.tok == BACKREF
1894	\|\| dfa->parse.tok == BEGLINE \|\| dfa->parse.tok == ENDLINE
1895	\|\| dfa->parse.tok == BEGWORD \|\| dfa->parse.tok == ENDWORD
1896	\|\| dfa->parse.tok == LIMWORD \|\| dfa->parse.tok == NOTLIMWORD
1897	\|\| dfa->parse.tok == ANYCHAR \|\| dfa->parse.tok == MBCSET)
1898	{
1899	if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
1900	{
1901	/* For UTF-8 expand the period to a series of CSETs that define a
1902	valid UTF-8 character. This avoids using the slow multibyte
1903	path. I'm pretty sure it would be both profitable and correct to
1904	do it for any encoding; however, the optimization must be done
1905	manually as it is done above in add_utf8_anychar. So, let's
1906	start with UTF-8: it is the most used, and the structure of the
1907	encoding makes the correctness more obvious. */
1908	add_utf8_anychar (dfa);
1909	}
1910	else
1911	addtok (dfa, dfa->parse.tok);
1912	dfa->parse.tok = lex (dfa);
1913	}
1914	else if (dfa->parse.tok == WCHAR)
1915	{
1916	if (dfa->lex.wctok == WEOF)
1917	addtok (dfa, BACKREF);
1918	else
1919	{
1920	addtok_wc (dfa, dfa->lex.wctok);
1921
1922	if (dfa->syntax.case_fold)
1923	{
1924	wchar_t folded[CASE_FOLDED_BUFSIZE];
1925	int n = case_folded_counterparts (dfa->lex.wctok, folded);
1926	for (int i = 0; i < n; i++)
1927	{
1928	addtok_wc (dfa, folded[i]);
1929	addtok (dfa, OR);
1930	}
1931	}
1932	}
1933
1934	dfa->parse.tok = lex (dfa);
1935	}
1936	else if (dfa->parse.tok == LPAREN)
1937	{
1938	dfa->parse.tok = lex (dfa);
1939	regexp (dfa);
1940	if (dfa->parse.tok != RPAREN)
1941	dfaerror (_("unbalanced ("));
1942	dfa->parse.tok = lex (dfa);
1943	}
1944	else
1945	addtok (dfa, EMPTY);
1946	}
1947
1948	/* Return the number of tokens in the given subexpression. */
1949	static idx_t _GL_ATTRIBUTE_PURE
1950	nsubtoks (struct dfa const *dfa, idx_t tindex)
1951	{
1952	switch (dfa->tokens[tindex - 1])
1953	{
1954	default:
1955	return 1;
1956	case QMARK:
1957	case STAR:
1958	case PLUS:
1959	return 1 + nsubtoks (dfa, tindex - 1);
1960	case CAT:
1961	case OR:
1962	{
1963	idx_t ntoks1 = nsubtoks (dfa, tindex - 1);
1964	return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
1965	}
1966	}
1967	}
1968
1969	/* Copy the given subexpression to the top of the tree. */
1970	static void
1971	copytoks (struct dfa *dfa, idx_t tindex, idx_t ntokens)
1972	{
1973	if (dfa->localeinfo.multibyte)
1974	for (idx_t i = 0; i < ntokens; i++)
1975	addtok_mb (dfa, dfa->tokens[tindex + i],
1976	dfa->multibyte_prop[tindex + i]);
1977	else
1978	for (idx_t i = 0; i < ntokens; i++)
1979	addtok_mb (dfa, dfa->tokens[tindex + i], 3);
1980	}
1981
1982	static void
1983	closure (struct dfa *dfa)
1984	{
1985	atom (dfa);
1986	while (dfa->parse.tok == QMARK \|\| dfa->parse.tok == STAR
1987	\|\| dfa->parse.tok == PLUS \|\| dfa->parse.tok == REPMN)
1988	if (dfa->parse.tok == REPMN && (dfa->lex.minrep \|\| dfa->lex.maxrep))
1989	{
1990	idx_t ntokens = nsubtoks (dfa, dfa->tindex);
1991	idx_t tindex = dfa->tindex - ntokens;
1992	if (dfa->lex.maxrep < 0)
1993	addtok (dfa, PLUS);
1994	if (dfa->lex.minrep == 0)
1995	addtok (dfa, QMARK);
1996	int i;
1997	for (i = 1; i < dfa->lex.minrep; i++)
1998	{
1999	copytoks (dfa, tindex, ntokens);
2000	addtok (dfa, CAT);
2001	}
2002	for (; i < dfa->lex.maxrep; i++)
2003	{
2004	copytoks (dfa, tindex, ntokens);
2005	addtok (dfa, QMARK);
2006	addtok (dfa, CAT);
2007	}
2008	dfa->parse.tok = lex (dfa);
2009	}
2010	else if (dfa->parse.tok == REPMN)
2011	{
2012	dfa->tindex -= nsubtoks (dfa, dfa->tindex);
2013	dfa->parse.tok = lex (dfa);
2014	closure (dfa);
2015	}
2016	else
2017	{
2018	addtok (dfa, dfa->parse.tok);
2019	dfa->parse.tok = lex (dfa);
2020	}
2021	}
2022
2023	static void
2024	branch (struct dfa* dfa)
2025	{
2026	closure (dfa);
2027	while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
2028	&& dfa->parse.tok >= 0)
2029	{
2030	closure (dfa);
2031	addtok (dfa, CAT);
2032	}
2033	}
2034
2035	static void
2036	regexp (struct dfa *dfa)
2037	{
2038	branch (dfa);
2039	while (dfa->parse.tok == OR)
2040	{
2041	dfa->parse.tok = lex (dfa);
2042	branch (dfa);
2043	addtok (dfa, OR);
2044	}
2045	}
2046
2047	/* Parse a string S of length LEN into D. S can include NUL characters.
2048	This is the main entry point for the parser. */
2049	void
2050	dfaparse (char const s, idx_t len, struct dfa d)
2051	{
2052	d->lex.ptr = s;
2053	d->lex.left = len;
2054	d->lex.lasttok = END;
2055	d->lex.laststart = true;
2056
2057	if (!d->syntax.syntax_bits_set)
2058	dfaerror (_("no syntax specified"));
2059
2060	if (!d->nregexps)
2061	addtok (d, BEG);
2062
2063	d->parse.tok = lex (d);
2064	d->parse.depth = d->depth;
2065
2066	regexp (d);
2067
2068	if (d->parse.tok != END)
2069	dfaerror (_("unbalanced )"));
2070
2071	addtok (d, END - d->nregexps);
2072	addtok (d, CAT);
2073
2074	if (d->nregexps)
2075	addtok (d, OR);
2076
2077	++d->nregexps;
2078	}
2079
2080	/* Some primitives for operating on sets of positions. */
2081
2082	/* Copy one set to another. */
2083	static void
2084	copy (position_set const src, position_set dst)
2085	{
2086	if (dst->alloc < src->nelem)
2087	{
2088	free (dst->elems);
2089	dst->elems = xpalloc (NULL, &dst->alloc, src->nelem - dst->alloc, -1,
2090	sizeof *dst->elems);
2091	}
2092	dst->nelem = src->nelem;
2093	if (src->nelem != 0)
2094	memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
2095	}
2096
2097	static void
2098	alloc_position_set (position_set *s, idx_t size)
2099	{
2100	s->elems = xnmalloc (size, sizeof *s->elems);
2101	s->alloc = size;
2102	s->nelem = 0;
2103	}
2104
2105	/* Insert position P in set S. S is maintained in sorted order on
2106	decreasing index. If there is already an entry in S with P.index
2107	then merge (logically-OR) P's constraints into the one in S.
2108	S->elems must point to an array large enough to hold the resulting set. */
2109	static void
2110	insert (position p, position_set *s)
2111	{
2112	idx_t count = s->nelem;
2113	idx_t lo = 0, hi = count;
2114	while (lo < hi)
2115	{
2116	idx_t mid = (lo + hi) >> 1;
2117	if (s->elems[mid].index < p.index)
2118	lo = mid + 1;
2119	else if (s->elems[mid].index == p.index)
2120	{
2121	s->elems[mid].constraint \|= p.constraint;
2122	return;
2123	}
2124	else
2125	hi = mid;
2126	}
2127
2128	s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2129	for (idx_t i = count; i > lo; i--)
2130	s->elems[i] = s->elems[i - 1];
2131	s->elems[lo] = p;
2132	++s->nelem;
2133	}
2134
2135	static void
2136	append (position p, position_set *s)
2137	{
2138	idx_t count = s->nelem;
2139	s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2140	s->elems[s->nelem++] = p;
2141	}
2142
2143	/* Merge S1 and S2 (with the additional constraint C2) into M. The
2144	result is as if the positions of S1, and of S2 with the additional
2145	constraint C2, were inserted into an initially empty set. */
2146	static void
2147	merge_constrained (position_set const s1, position_set const s2,
2148	unsigned int c2, position_set *m)
2149	{
2150	idx_t i = 0, j = 0;
2151
2152	if (m->alloc - s1->nelem < s2->nelem)
2153	{
2154	free (m->elems);
2155	m->alloc = s1->nelem;
2156	m->elems = xpalloc (NULL, &m->alloc, s2->nelem, -1, sizeof *m->elems);
2157	}
2158	m->nelem = 0;
2159	while (i < s1->nelem \|\| j < s2->nelem)
2160	if (! (j < s2->nelem)
2161	\|\| (i < s1->nelem && s1->elems[i].index <= s2->elems[j].index))
2162	{
2163	unsigned int c = ((i < s1->nelem && j < s2->nelem
2164	&& s1->elems[i].index == s2->elems[j].index)
2165	? s2->elems[j++].constraint & c2
2166	: 0);
2167	m->elems[m->nelem].index = s1->elems[i].index;
2168	m->elems[m->nelem++].constraint = s1->elems[i++].constraint \| c;
2169	}
2170	else
2171	{
2172	if (s2->elems[j].constraint & c2)
2173	{
2174	m->elems[m->nelem].index = s2->elems[j].index;
2175	m->elems[m->nelem++].constraint = s2->elems[j].constraint & c2;
2176	}
2177	j++;
2178	}
2179	}
2180
2181	/* Merge two sets of positions into a third. The result is exactly as if
2182	the positions of both sets were inserted into an initially empty set. */
2183	static void
2184	merge (position_set const s1, position_set const s2, position_set *m)
2185	{
2186	merge_constrained (s1, s2, -1, m);
2187	}
2188
2189	/* Merge into DST all the elements of SRC, possibly destroying
2190	the contents of the temporary M. */
2191	static void
2192	merge2 (position_set dst, position_set const src, position_set *m)
2193	{
2194	if (src->nelem < 4)
2195	{
2196	for (idx_t i = 0; i < src->nelem; i++)
2197	insert (src->elems[i], dst);
2198	}
2199	else
2200	{
2201	merge (src, dst, m);
2202	copy (m, dst);
2203	}
2204	}
2205
2206	/* Delete a position from a set. Return the nonzero constraint of the
2207	deleted position, or zero if there was no such position. */
2208	static unsigned int
2209	delete (idx_t del, position_set *s)
2210	{
2211	idx_t count = s->nelem;
2212	idx_t lo = 0, hi = count;
2213	while (lo < hi)
2214	{
2215	idx_t mid = (lo + hi) >> 1;
2216	if (s->elems[mid].index < del)
2217	lo = mid + 1;
2218	else if (s->elems[mid].index == del)
2219	{
2220	unsigned int c = s->elems[mid].constraint;
2221	idx_t i;
2222	for (i = mid; i + 1 < count; i++)
2223	s->elems[i] = s->elems[i + 1];
2224	s->nelem = i;
2225	return c;
2226	}
2227	else
2228	hi = mid;
2229	}
2230	return 0;
2231	}
2232
2233	/* Replace a position with the followed set. */
2234	static void
2235	replace (position_set dst, idx_t del, position_set add,
2236	unsigned int constraint, position_set *tmp)
2237	{
2238	unsigned int c = delete (del, dst) & constraint;
2239
2240	if (c)
2241	{
2242	copy (dst, tmp);
2243	merge_constrained (tmp, add, c, dst);
2244	}
2245	}
2246
2247	/* Find the index of the state corresponding to the given position set with
2248	the given preceding context, or create a new state if there is no such
2249	state. Context tells whether we got here on a newline or letter. */
2250	static state_num
2251	state_index (struct dfa d, position_set const s, int context)
2252	{
2253	size_t hash = 0;
2254	int constraint = 0;
2255	state_num i;
2256
2257	for (i = 0; i < s->nelem; ++i)
2258	{
2259	idx_t ind = s->elems[i].index;
2260	hash ^= ind + s->elems[i].constraint;
2261	}
2262
2263	/* Try to find a state that exactly matches the proposed one. */
2264	for (i = 0; i < d->sindex; ++i)
2265	{
2266	if (hash != d->states[i].hash \|\| s->nelem != d->states[i].elems.nelem
2267	\|\| context != d->states[i].context)
2268	continue;
2269	state_num j;
2270	for (j = 0; j < s->nelem; ++j)
2271	if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint
2272	\|\| s->elems[j].index != d->states[i].elems.elems[j].index)
2273	break;
2274	if (j == s->nelem)
2275	return i;
2276	}
2277
2278	#ifdef DEBUG
2279	fprintf (stderr, "new state %td\n nextpos:", i);
2280	for (state_num j = 0; j < s->nelem; j++)
2281	{
2282	fprintf (stderr, " %td:", s->elems[j].index);
2283	prtok (d->tokens[s->elems[j].index]);
2284	}
2285	fprintf (stderr, "\n context:");
2286	if (context ^ CTX_ANY)
2287	{
2288	if (context & CTX_NONE)
2289	fprintf (stderr, " CTX_NONE");
2290	if (context & CTX_LETTER)
2291	fprintf (stderr, " CTX_LETTER");
2292	if (context & CTX_NEWLINE)
2293	fprintf (stderr, " CTX_NEWLINE");
2294	}
2295	else
2296	fprintf (stderr, " CTX_ANY");
2297	fprintf (stderr, "\n");
2298	#endif
2299
2300	for (state_num j = 0; j < s->nelem; j++)
2301	{
2302	int c = d->constraints[s->elems[j].index];
2303
2304	if (c != 0)
2305	{
2306	if (succeeds_in_context (c, context, CTX_ANY))
2307	constraint \|= c;
2308	}
2309	else if (d->tokens[s->elems[j].index] == BACKREF)
2310	constraint = NO_CONSTRAINT;
2311	}
2312
2313
2314	/* Create a new state. */
2315	d->states = maybe_realloc (d->states, d->sindex, &d->salloc, -1,
2316	sizeof *d->states);
2317	d->states[i].hash = hash;
2318	alloc_position_set (&d->states[i].elems, s->nelem);
2319	copy (s, &d->states[i].elems);
2320	d->states[i].context = context;
2321	d->states[i].constraint = constraint;
2322	d->states[i].mbps.nelem = 0;
2323	d->states[i].mbps.elems = NULL;
2324	d->states[i].mb_trindex = -1;
2325
2326	++d->sindex;
2327
2328	return i;
2329	}
2330
2331	/* Find the epsilon closure of D's set of positions. If any position of the set
2332	contains a symbol that matches the empty string in some context, replace
2333	that position with the elements of its follow labeled with an appropriate
2334	constraint. Repeat exhaustively until no funny positions are left.
2335	S->elems must be large enough to hold the result. BACKWARD is D's
2336	backward set; use and update it too. */
2337	static void
2338	epsclosure (struct dfa const d, position_set backward)
2339	{
2340	position_set tmp;
2341	alloc_position_set (&tmp, d->nleaves);
2342	for (idx_t i = 0; i < d->tindex; i++)
2343	if (0 < d->follows[i].nelem)
2344	{
2345	unsigned int constraint;
2346	switch (d->tokens[i])
2347	{
2348	default:
2349	continue;
2350
2351	case BEGLINE:
2352	constraint = BEGLINE_CONSTRAINT;
2353	break;
2354	case ENDLINE:
2355	constraint = ENDLINE_CONSTRAINT;
2356	break;
2357	case BEGWORD:
2358	constraint = BEGWORD_CONSTRAINT;
2359	break;
2360	case ENDWORD:
2361	constraint = ENDWORD_CONSTRAINT;
2362	break;
2363	case LIMWORD:
2364	constraint = LIMWORD_CONSTRAINT;
2365	break;
2366	case NOTLIMWORD:
2367	constraint = NOTLIMWORD_CONSTRAINT;
2368	break;
2369	case EMPTY:
2370	constraint = NO_CONSTRAINT;
2371	break;
2372	}
2373
2374	delete (i, &d->follows[i]);
2375
2376	for (idx_t j = 0; j < backward[i].nelem; j++)
2377	replace (&d->follows[backward[i].elems[j].index], i, &d->follows[i],
2378	constraint, &tmp);
2379	for (idx_t j = 0; j < d->follows[i].nelem; j++)
2380	replace (&backward[d->follows[i].elems[j].index], i, &backward[i],
2381	NO_CONSTRAINT, &tmp);
2382	}
2383	free (tmp.elems);
2384	}
2385
2386	/* Returns the set of contexts for which there is at least one
2387	character included in C. */
2388
2389	static int
2390	charclass_context (struct dfa const dfa, charclass const c)
2391	{
2392	int context = 0;
2393
2394	for (int j = 0; j < CHARCLASS_WORDS; j++)
2395	{
2396	if (c->w[j] & dfa->syntax.newline.w[j])
2397	context \|= CTX_NEWLINE;
2398	if (c->w[j] & dfa->syntax.letters.w[j])
2399	context \|= CTX_LETTER;
2400	if (c->w[j] & ~(dfa->syntax.letters.w[j] \| dfa->syntax.newline.w[j]))
2401	context \|= CTX_NONE;
2402	}
2403
2404	return context;
2405	}
2406
2407	/* Returns the contexts on which the position set S depends. Each context
2408	in the set of returned contexts (let's call it SC) may have a different
2409	follow set than other contexts in SC, and also different from the
2410	follow set of the complement set (sc ^ CTX_ANY). However, all contexts
2411	in the complement set will have the same follow set. */
2412
2413	static int _GL_ATTRIBUTE_PURE
2414	state_separate_contexts (struct dfa d, position_set const s)
2415	{
2416	int separate_contexts = 0;
2417
2418	for (idx_t j = 0; j < s->nelem; j++)
2419	separate_contexts \|= d->separates[s->elems[j].index];
2420
2421	return separate_contexts;
2422	}
2423
/* Flag bits kept per token while the NFA is being optimized.  */
enum
{
  /* A single token is repeated; such a token is told apart from a
     non-repeated one.  */
  OPT_REPEAT = 1 << 0,

  /* Several tokens are repeated, and this token is the head of them.
     A node with this flag is not merged.  */
  OPT_LPAREN = 1 << 1,

  /* Several branches join at this node.  A node with this flag is
     not merged.  */
  OPT_RPAREN = 1 << 2,

  /* The node has been walked already.  Finding it again while
     walking turns on OPT_RPAREN.  */
  OPT_WALKED = 1 << 3,

  /* The node has been queued, and is not queued a second time.  */
  OPT_QUEUED = 1 << 4
};
2443
2444	static void
2445	merge_nfa_state (struct dfa d, idx_t tindex, char flags,
2446	position_set *merged)
2447	{
2448	position_set *follows = d->follows;
2449	idx_t nelem = 0;
2450
2451	for (idx_t i = 0; i < follows[tindex].nelem; i++)
2452	{
2453	idx_t sindex = follows[tindex].elems[i].index;
2454
2455	/* Skip the node as pruned in future. */
2456	unsigned int iconstraint = follows[tindex].elems[i].constraint;
2457	if (iconstraint == 0)
2458	continue;
2459
2460	if (d->tokens[follows[tindex].elems[i].index] <= END)
2461	{
2462	d->constraints[tindex] \|= follows[tindex].elems[i].constraint;
2463	continue;
2464	}
2465
2466	if (sindex != tindex && !(flags[sindex] & (OPT_LPAREN \| OPT_RPAREN)))
2467	{
2468	idx_t j;
2469
2470	for (j = 0; j < nelem; j++)
2471	{
2472	idx_t dindex = follows[tindex].elems[j].index;
2473
2474	if (dindex == tindex)
2475	continue;
2476
2477	if (follows[tindex].elems[j].constraint != iconstraint)
2478	continue;
2479
2480	if (flags[dindex] & (OPT_LPAREN \| OPT_RPAREN))
2481	continue;
2482
2483	if (d->tokens[sindex] != d->tokens[dindex])
2484	continue;
2485
2486	if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT)
2487	continue;
2488
2489	if (flags[sindex] & OPT_REPEAT)
2490	delete (sindex, &follows[sindex]);
2491
2492	merge2 (&follows[dindex], &follows[sindex], merged);
2493
2494	break;
2495	}
2496
2497	if (j < nelem)
2498	continue;
2499	}
2500
2501	follows[tindex].elems[nelem++] = follows[tindex].elems[i];
2502	flags[sindex] \|= OPT_QUEUED;
2503	}
2504
2505	follows[tindex].nelem = nelem;
2506	}
2507
2508	static int
2509	compare (const void a, const void b)
2510	{
2511	position const p = a, q = b;
2512	return (p->index > q->index) - (p->index < q->index);
2513	}
2514
2515	static void
2516	reorder_tokens (struct dfa *d)
2517	{
2518	idx_t nleaves = 0;
2519	ptrdiff_t map = xnmalloc (d->tindex, sizeof map);
2520	map[0] = nleaves++;
2521	for (idx_t i = 1; i < d->tindex; i++)
2522	map[i] = -1;
2523
2524	token tokens = xnmalloc (d->nleaves, sizeof tokens);
2525	position_set follows = xnmalloc (d->nleaves, sizeof follows);
2526	int constraints = xnmalloc (d->nleaves, sizeof constraints);
2527	char *multibyte_prop = (d->localeinfo.multibyte
2528	? xnmalloc (d->nleaves, sizeof *multibyte_prop)
2529	: NULL);
2530
2531	for (idx_t i = 0; i < d->tindex; i++)
2532	{
2533	if (map[i] < 0)
2534	{
2535	free (d->follows[i].elems);
2536	d->follows[i].elems = NULL;
2537	d->follows[i].nelem = 0;
2538	continue;
2539	}
2540
2541	tokens[map[i]] = d->tokens[i];
2542	follows[map[i]] = d->follows[i];
2543	constraints[map[i]] = d->constraints[i];
2544
2545	if (multibyte_prop != NULL)
2546	multibyte_prop[map[i]] = d->multibyte_prop[i];
2547
2548	for (idx_t j = 0; j < d->follows[i].nelem; j++)
2549	{
2550	if (map[d->follows[i].elems[j].index] == -1)
2551	map[d->follows[i].elems[j].index] = nleaves++;
2552
2553	d->follows[i].elems[j].index = map[d->follows[i].elems[j].index];
2554	}
2555
2556	qsort (d->follows[i].elems, d->follows[i].nelem,
2557	sizeof *d->follows[i].elems, compare);
2558	}
2559
2560	for (idx_t i = 0; i < nleaves; i++)
2561	{
2562	d->tokens[i] = tokens[i];
2563	d->follows[i] = follows[i];
2564	d->constraints[i] = constraints[i];
2565
2566	if (multibyte_prop != NULL)
2567	d->multibyte_prop[i] = multibyte_prop[i];
2568	}
2569
2570	d->tindex = d->nleaves = nleaves;
2571
2572	free (tokens);
2573	free (follows);
2574	free (constraints);
2575	free (multibyte_prop);
2576	free (map);
2577	}
2578
2579	static void
2580	dfaoptimize (struct dfa *d)
2581	{
2582	char *flags = xizalloc (d->tindex);
2583
2584	for (idx_t i = 0; i < d->tindex; i++)
2585	{
2586	for (idx_t j = 0; j < d->follows[i].nelem; j++)
2587	{
2588	if (d->follows[i].elems[j].index == i)
2589	flags[d->follows[i].elems[j].index] \|= OPT_REPEAT;
2590	else if (d->follows[i].elems[j].index < i)
2591	flags[d->follows[i].elems[j].index] \|= OPT_LPAREN;
2592	else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED)
2593	flags[d->follows[i].elems[j].index] \|= OPT_RPAREN;
2594	else
2595	flags[d->follows[i].elems[j].index] \|= OPT_WALKED;
2596	}
2597	}
2598
2599	flags[0] \|= OPT_QUEUED;
2600
2601	position_set merged0;
2602	position_set *merged = &merged0;
2603	alloc_position_set (merged, d->nleaves);
2604
2605	d->constraints = xicalloc (d->tindex, sizeof *d->constraints);
2606
2607	for (idx_t i = 0; i < d->tindex; i++)
2608	if (flags[i] & OPT_QUEUED)
2609	merge_nfa_state (d, i, flags, merged);
2610
2611	reorder_tokens (d);
2612
2613	free (merged->elems);
2614	free (flags);
2615	}
2616
2617	/* Perform bottom-up analysis on the parse tree, computing various functions.
2618	Note that at this point, we're pretending constructs like \< are real
2619	characters rather than constraints on what can follow them.
2620
2621	Nullable: A node is nullable if it is at the root of a regexp that can
2622	match the empty string.
2623	* EMPTY leaves are nullable.
2624	* No other leaf is nullable.
2625	* A QMARK or STAR node is nullable.
2626	* A PLUS node is nullable if its argument is nullable.
2627	* A CAT node is nullable if both its arguments are nullable.
2628	* An OR node is nullable if either argument is nullable.
2629
2630	Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
2631	that could correspond to the first character of a string matching the
2632	regexp rooted at the given node.
2633	* EMPTY leaves have empty firstpos.
2634	* The firstpos of a nonempty leaf is that leaf itself.
2635	* The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2636	argument.
2637	* The firstpos of a CAT node is the firstpos of the left argument, union
2638	the firstpos of the right if the left argument is nullable.
2639	* The firstpos of an OR node is the union of firstpos of each argument.
2640
2641	Lastpos: The lastpos of a node is the set of positions that could
2642	correspond to the last character of a string matching the regexp at
2643	the given node.
2644	* EMPTY leaves have empty lastpos.
2645	* The lastpos of a nonempty leaf is that leaf itself.
2646	* The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2647	argument.
2648	* The lastpos of a CAT node is the lastpos of its right argument, union
2649	the lastpos of the left if the right argument is nullable.
2650	* The lastpos of an OR node is the union of the lastpos of each argument.
2651
2652	Follow: The follow of a position is the set of positions that could
2653	correspond to the character following a character matching the node in
2654	a string matching the regexp. At this point we consider special symbols
2655	that match the empty string in some context to be just normal characters.
2656	Later, if we find that a special symbol is in a follow set, we will
2657	replace it with the elements of its follow, labeled with an appropriate
2658	constraint.
2659	* Every node in the firstpos of the argument of a STAR or PLUS node is in
2660	the follow of every node in the lastpos.
2661	* Every node in the firstpos of the second argument of a CAT node is in
2662	the follow of every node in the lastpos of the first argument.
2663
2664	Because of the postfix representation of the parse tree, the depth-first
2665	analysis is conveniently done by a linear scan with the aid of a stack.
2666	Sets are stored as arrays of the elements, obeying a stack-like allocation
2667	scheme; the number of elements in each set deeper in the stack can be
2668	used to determine the address of a particular set's array. */
2669	static void
2670	dfaanalyze (struct dfa *d, bool searchflag)
2671	{
2672	/* Array allocated to hold position sets. */
2673	position posalloc = xnmalloc (d->nleaves, 2 sizeof *posalloc);
2674	/* Firstpos and lastpos elements. */
2675	position *firstpos = posalloc;
2676	position *lastpos = firstpos + d->nleaves;
2677	position pos;
2678	position_set tmp;
2679
2680	/* Stack for element counts and nullable flags. */
2681	struct
2682	{
2683	/* Whether the entry is nullable. */
2684	bool nullable;
2685
2686	/* Counts of firstpos and lastpos sets. */
2687	idx_t nfirstpos;
2688	idx_t nlastpos;
2689	} stkalloc = xnmalloc (d->depth, sizeof stkalloc), *stk = stkalloc;
2690
2691	position_set merged; /* Result of merging sets. */
2692
2693	addtok (d, CAT);
2694	idx_t tindex = d->tindex;
2695
2696	#ifdef DEBUG
2697	fprintf (stderr, "dfaanalyze:\n");
2698	for (idx_t i = 0; i < tindex; i++)
2699	{
2700	fprintf (stderr, " %td:", i);
2701	prtok (d->tokens[i]);
2702	}
2703	putc ('\n', stderr);
2704	#endif
2705
2706	d->searchflag = searchflag;
2707	alloc_position_set (&merged, d->nleaves);
2708	d->follows = xicalloc (tindex, sizeof *d->follows);
2709	position_set *backward
2710	= d->epsilon ? xicalloc (tindex, sizeof *backward) : NULL;
2711
2712	for (idx_t i = 0; i < tindex; i++)
2713	{
2714	switch (d->tokens[i])
2715	{
2716	case EMPTY:
2717	/* The empty set is nullable. */
2718	stk->nullable = true;
2719
2720	/* The firstpos and lastpos of the empty leaf are both empty. */
2721	stk->nfirstpos = stk->nlastpos = 0;
2722	stk++;
2723	break;
2724
2725	case STAR:
2726	case PLUS:
2727	/* Every element in the lastpos of the argument is in the backward
2728	set of every element in the firstpos. */
2729	if (d->epsilon)
2730	{
2731	tmp.elems = lastpos - stk[-1].nlastpos;
2732	tmp.nelem = stk[-1].nlastpos;
2733	for (position *p = firstpos - stk[-1].nfirstpos;
2734	p < firstpos; p++)
2735	merge2 (&backward[p->index], &tmp, &merged);
2736	}
2737
2738	/* Every element in the firstpos of the argument is in the follow
2739	of every element in the lastpos. */
2740	{
2741	tmp.elems = firstpos - stk[-1].nfirstpos;
2742	tmp.nelem = stk[-1].nfirstpos;
2743	for (position *p = lastpos - stk[-1].nlastpos; p < lastpos; p++)
2744	merge2 (&d->follows[p->index], &tmp, &merged);
2745	}
2746	FALLTHROUGH;
2747	case QMARK:
2748	/* A QMARK or STAR node is automatically nullable. */
2749	if (d->tokens[i] != PLUS)
2750	stk[-1].nullable = true;
2751	break;
2752
2753	case CAT:
2754	/* Every element in the lastpos of the first argument is in
2755	the backward set of every element in the firstpos of the
2756	second argument. */
2757	if (backward)
2758	{
2759	tmp.nelem = stk[-2].nlastpos;
2760	tmp.elems = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2761	for (position *p = firstpos - stk[-1].nfirstpos;
2762	p < firstpos; p++)
2763	merge2 (&backward[p->index], &tmp, &merged);
2764	}
2765
2766	/* Every element in the firstpos of the second argument is in the
2767	follow of every element in the lastpos of the first argument. */
2768	{
2769	tmp.nelem = stk[-1].nfirstpos;
2770	tmp.elems = firstpos - stk[-1].nfirstpos;
2771	for (position *plim = lastpos - stk[-1].nlastpos,
2772	*p = plim - stk[-2].nlastpos;
2773	p < plim; p++)
2774	merge2 (&d->follows[p->index], &tmp, &merged);
2775	}
2776
2777	/* The firstpos of a CAT node is the firstpos of the first argument,
2778	union that of the second argument if the first is nullable. */
2779	if (stk[-2].nullable)
2780	stk[-2].nfirstpos += stk[-1].nfirstpos;
2781	else
2782	firstpos -= stk[-1].nfirstpos;
2783
2784	/* The lastpos of a CAT node is the lastpos of the second argument,
2785	union that of the first argument if the second is nullable. */
2786	if (stk[-1].nullable)
2787	stk[-2].nlastpos += stk[-1].nlastpos;
2788	else
2789	{
2790	position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2791	for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2792	p[j] = p[j + stk[-2].nlastpos];
2793	lastpos -= stk[-2].nlastpos;
2794	stk[-2].nlastpos = stk[-1].nlastpos;
2795	}
2796
2797	/* A CAT node is nullable if both arguments are nullable. */
2798	stk[-2].nullable &= stk[-1].nullable;
2799	stk--;
2800	break;
2801
2802	case OR:
2803	/* The firstpos is the union of the firstpos of each argument. */
2804	stk[-2].nfirstpos += stk[-1].nfirstpos;
2805
2806	/* The lastpos is the union of the lastpos of each argument. */
2807	stk[-2].nlastpos += stk[-1].nlastpos;
2808
2809	/* An OR node is nullable if either argument is nullable. */
2810	stk[-2].nullable \|= stk[-1].nullable;
2811	stk--;
2812	break;
2813
2814	default:
2815	/* Anything else is a nonempty position. (Note that special
2816	constructs like \< are treated as nonempty strings here;
2817	an "epsilon closure" effectively makes them nullable later.
2818	Backreferences have to get a real position so we can detect
2819	transitions on them later. But they are nullable. */
2820	stk->nullable = d->tokens[i] == BACKREF;
2821
2822	/* This position is in its own firstpos and lastpos. */
2823	stk->nfirstpos = stk->nlastpos = 1;
2824	stk++;
2825
2826	firstpos->index = lastpos->index = i;
2827	firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2828	firstpos++, lastpos++;
2829
2830	break;
2831	}
2832	#ifdef DEBUG
2833	/* ... balance the above nonsyntactic #ifdef goo... */
2834	fprintf (stderr, "node %td:", i);
2835	prtok (d->tokens[i]);
2836	putc ('\n', stderr);
2837	fprintf (stderr,
2838	stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
2839	fprintf (stderr, " firstpos:");
2840	for (idx_t j = 0; j < stk[-1].nfirstpos; j++)
2841	{
2842	fprintf (stderr, " %td:", firstpos[j - stk[-1].nfirstpos].index);
2843	prtok (d->tokens[firstpos[j - stk[-1].nfirstpos].index]);
2844	}
2845	fprintf (stderr, "\n lastpos:");
2846	for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2847	{
2848	fprintf (stderr, " %td:", lastpos[j - stk[-1].nlastpos].index);
2849	prtok (d->tokens[lastpos[j - stk[-1].nlastpos].index]);
2850	}
2851	putc ('\n', stderr);
2852	#endif
2853	}
2854
2855	if (backward)
2856	{
2857	/* For each follow set that is the follow set of a real position,
2858	replace it with its epsilon closure. */
2859	epsclosure (d, backward);
2860
2861	for (idx_t i = 0; i < tindex; i++)
2862	free (backward[i].elems);
2863	free (backward);
2864	}
2865
2866	dfaoptimize (d);
2867
2868	#ifdef DEBUG
2869	for (idx_t i = 0; i < tindex; i++)
2870	if (d->tokens[i] == BEG \|\| d->tokens[i] < NOTCHAR
2871	\|\| d->tokens[i] == BACKREF \|\| d->tokens[i] == ANYCHAR
2872	\|\| d->tokens[i] == MBCSET \|\| d->tokens[i] >= CSET)
2873	{
2874	fprintf (stderr, "follows(%td:", i);
2875	prtok (d->tokens[i]);
2876	fprintf (stderr, "):");
2877	for (idx_t j = 0; j < d->follows[i].nelem; j++)
2878	{
2879	fprintf (stderr, " %td:", d->follows[i].elems[j].index);
2880	prtok (d->tokens[d->follows[i].elems[j].index]);
2881	}
2882	putc ('\n', stderr);
2883	}
2884	#endif
2885
2886	pos.index = 0;
2887	pos.constraint = NO_CONSTRAINT;
2888
2889	alloc_position_set (&tmp, 1);
2890
2891	append (pos, &tmp);
2892
2893	d->separates = xicalloc (tindex, sizeof *d->separates);
2894
2895	for (idx_t i = 0; i < tindex; i++)
2896	{
2897	if (prev_newline_dependent (d->constraints[i]))
2898	d->separates[i] \|= CTX_NEWLINE;
2899	if (prev_letter_dependent (d->constraints[i]))
2900	d->separates[i] \|= CTX_LETTER;
2901
2902	for (idx_t j = 0; j < d->follows[i].nelem; j++)
2903	{
2904	if (prev_newline_dependent (d->follows[i].elems[j].constraint))
2905	d->separates[i] \|= CTX_NEWLINE;
2906	if (prev_letter_dependent (d->follows[i].elems[j].constraint))
2907	d->separates[i] \|= CTX_LETTER;
2908	}
2909	}
2910
2911	/* Context wanted by some position. */
2912	int separate_contexts = state_separate_contexts (d, &tmp);
2913
2914	/* Build the initial state. */
2915	if (separate_contexts & CTX_NEWLINE)
2916	state_index (d, &tmp, CTX_NEWLINE);
2917	d->initstate_notbol = d->min_trcount
2918	= state_index (d, &tmp, separate_contexts ^ CTX_ANY);
2919	if (separate_contexts & CTX_LETTER)
2920	d->min_trcount = state_index (d, &tmp, CTX_LETTER);
2921	d->min_trcount++;
2922	d->trcount = 0;
2923
2924	free (posalloc);
2925	free (stkalloc);
2926	free (merged.elems);
2927	free (tmp.elems);
2928	}
2929
2930	/* Make sure D's state arrays are large enough to hold NEW_STATE. */
2931	static void
2932	realloc_trans_if_necessary (struct dfa *d)
2933	{
2934	state_num oldalloc = d->tralloc;
2935	if (oldalloc < d->sindex)
2936	{
2937	state_num **realtrans = d->trans ? d->trans - 2 : NULL;
2938	idx_t newalloc1 = realtrans ? d->tralloc + 2 : 0;
2939	realtrans = xpalloc (realtrans, &newalloc1, d->sindex - oldalloc,
2940	-1, sizeof *realtrans);
2941	realtrans[0] = realtrans[1] = NULL;
2942	d->trans = realtrans + 2;
2943	idx_t newalloc = d->tralloc = newalloc1 - 2;
2944	d->fails = xreallocarray (d->fails, newalloc, sizeof *d->fails);
2945	d->success = xreallocarray (d->success, newalloc, sizeof *d->success);
2946	d->newlines = xreallocarray (d->newlines, newalloc, sizeof *d->newlines);
2947	if (d->localeinfo.multibyte)
2948	{
2949	realtrans = d->mb_trans ? d->mb_trans - 2 : NULL;
2950	realtrans = xreallocarray (realtrans, newalloc1, sizeof *realtrans);
2951	if (oldalloc == 0)
2952	realtrans[0] = realtrans[1] = NULL;
2953	d->mb_trans = realtrans + 2;
2954	}
2955	for (; oldalloc < newalloc; oldalloc++)
2956	{
2957	d->trans[oldalloc] = NULL;
2958	d->fails[oldalloc] = NULL;
2959	if (d->localeinfo.multibyte)
2960	d->mb_trans[oldalloc] = NULL;
2961	}
2962	}
2963	}
2964
2965	/*
2966	Calculate the transition table for a new state derived from state s
2967	for a compiled dfa d after input character uc, and return the new
2968	state number.
2969
2970	Do not worry about all possible input characters; calculate just the group
2971	of positions that match uc. Label it with the set of characters that
2972	every position in the group matches (taking into account, if necessary,
2973	preceding context information of s). Then find the union
2974	of these positions' follows, i.e., the set of positions of the
2975	new state. For each character in the group's label, set the transition
2976	on this character to be to a state corresponding to the set's positions,
2977	and its associated backward context information, if necessary.
2978
2979	When building a searching matcher, include the positions of state
2980	0 in every state.
2981
2982	The group is constructed by building an equivalence-class
2983	partition of the positions of s.
2984
2985	For each position, find the set of characters C that it matches. Eliminate
2986	any characters from C that fail on grounds of backward context.
2987
2988	Check whether the group's label L has nonempty
2989	intersection with C. If L - C is nonempty, create a new group labeled
2990	L - C and having the same positions as the current group, and set L to
2991	the intersection of L and C. Insert the position in the group, set
2992	C = C - L, and resume scanning.
2993
2994	If after comparing with every group there are characters remaining in C,
2995	create a new group labeled with the characters of C and insert this
2996	position in that group. */
2997
2998	static state_num
2999	build_state (state_num s, struct dfa *d, unsigned char uc)
3000	{
3001	position_set follows; /* Union of the follows for each
3002	position of the current state. */
3003	position_set group; /* Positions that match the input char. */
3004	position_set tmp; /* Temporary space for merging sets. */
3005	state_num state; /* New state. */
3006	state_num state_newline; /* New state on a newline transition. */
3007	state_num state_letter; /* New state on a letter transition. */
3008
3009	#ifdef DEBUG
3010	fprintf (stderr, "build state %td\n", s);
3011	#endif
3012
3013	/* A pointer to the new transition table, and the table itself. */
3014	state_num **ptrans = (accepting (s, d) ? d->fails : d->trans) + s;
3015	state_num trans = ptrans;
3016
3017	if (!trans)
3018	{
3019	/* MAX_TRCOUNT is an arbitrary upper limit on the number of
3020	transition tables that can exist at once, other than for
3021	initial states. Often-used transition tables are quickly
3022	rebuilt, whereas rarely-used ones are cleared away. */
3023	if (MAX_TRCOUNT <= d->trcount)
3024	{
3025	for (state_num i = d->min_trcount; i < d->tralloc; i++)
3026	{
3027	free (d->trans[i]);
3028	free (d->fails[i]);
3029	d->trans[i] = d->fails[i] = NULL;
3030	}
3031	d->trcount = 0;
3032	}
3033
3034	d->trcount++;
3035	ptrans = trans = xmalloc (NOTCHAR sizeof *trans);
3036
3037	/* Fill transition table with a default value which means that the
3038	transited state has not been calculated yet. */
3039	for (int i = 0; i < NOTCHAR; i++)
3040	trans[i] = -2;
3041	}
3042
3043	/* Set up the success bits for this state. */
3044	d->success[s] = 0;
3045	if (accepts_in_context (d->states[s].context, CTX_NEWLINE, s, d))
3046	d->success[s] \|= CTX_NEWLINE;
3047	if (accepts_in_context (d->states[s].context, CTX_LETTER, s, d))
3048	d->success[s] \|= CTX_LETTER;
3049	if (accepts_in_context (d->states[s].context, CTX_NONE, s, d))
3050	d->success[s] \|= CTX_NONE;
3051
3052	alloc_position_set (&follows, d->nleaves);
3053
3054	/* Find the union of the follows of the positions of the group.
3055	This is a hideously inefficient loop. Fix it someday. */
3056	for (idx_t j = 0; j < d->states[s].elems.nelem; j++)
3057	for (idx_t k = 0;
3058	k < d->follows[d->states[s].elems.elems[j].index].nelem; ++k)
3059	insert (d->follows[d->states[s].elems.elems[j].index].elems[k],
3060	&follows);
3061
3062	/* Positions that match the input char. */
3063	alloc_position_set (&group, d->nleaves);
3064
3065	/* The group's label. */
3066	charclass label;
3067	fillset (&label);
3068
3069	for (idx_t i = 0; i < follows.nelem; i++)
3070	{
3071	charclass matches; /* Set of matching characters. */
3072	position pos = follows.elems[i];
3073	bool matched = false;
3074	if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
3075	{
3076	zeroset (&matches);
3077	setbit (d->tokens[pos.index], &matches);
3078	if (d->tokens[pos.index] == uc)
3079	matched = true;
3080	}
3081	else if (d->tokens[pos.index] >= CSET)
3082	{
3083	matches = d->charclasses[d->tokens[pos.index] - CSET];
3084	if (tstbit (uc, &matches))
3085	matched = true;
3086	}
3087	else if (d->tokens[pos.index] == ANYCHAR)
3088	{
3089	matches = d->charclasses[d->canychar];
3090	if (tstbit (uc, &matches))
3091	matched = true;
3092
3093	/* ANYCHAR must match with a single character, so we must put
3094	it to D->states[s].mbps which contains the positions which
3095	can match with a single character not a byte. If all
3096	positions which has ANYCHAR does not depend on context of
3097	next character, we put the follows instead of it to
3098	D->states[s].mbps to optimize. */
3099	if (succeeds_in_context (pos.constraint, d->states[s].context,
3100	CTX_NONE))
3101	{
3102	if (d->states[s].mbps.nelem == 0)
3103	alloc_position_set (&d->states[s].mbps, 1);
3104	insert (pos, &d->states[s].mbps);
3105	}
3106	}
3107	else
3108	continue;
3109
3110	/* Some characters may need to be eliminated from matches because
3111	they fail in the current context. */
3112	if (pos.constraint != NO_CONSTRAINT)
3113	{
3114	if (!succeeds_in_context (pos.constraint,
3115	d->states[s].context, CTX_NEWLINE))
3116	for (int j = 0; j < CHARCLASS_WORDS; j++)
3117	matches.w[j] &= ~d->syntax.newline.w[j];
3118	if (!succeeds_in_context (pos.constraint,
3119	d->states[s].context, CTX_LETTER))
3120	for (int j = 0; j < CHARCLASS_WORDS; ++j)
3121	matches.w[j] &= ~d->syntax.letters.w[j];
3122	if (!succeeds_in_context (pos.constraint,
3123	d->states[s].context, CTX_NONE))
3124	for (int j = 0; j < CHARCLASS_WORDS; ++j)
3125	matches.w[j] &= d->syntax.letters.w[j] \| d->syntax.newline.w[j];
3126
3127	/* If there are no characters left, there's no point in going on. */
3128	if (emptyset (&matches))
3129	continue;
3130
3131	/* If we have reset the bit that made us declare "matched", reset
3132	that indicator, too. This is required to avoid an infinite loop
3133	with this command: echo cx \| LC_ALL=C grep -E 'c\b[x ]' */
3134	if (!tstbit (uc, &matches))
3135	matched = false;
3136	}
3137
3138	#ifdef DEBUG
3139	fprintf (stderr, " nextpos %td:", pos.index);
3140	prtok (d->tokens[pos.index]);
3141	fprintf (stderr, " of");
3142	for (unsigned j = 0; j < NOTCHAR; j++)
3143	if (tstbit (j, &matches))
3144	fprintf (stderr, " 0x%02x", j);
3145	fprintf (stderr, "\n");
3146	#endif
3147
3148	if (matched)
3149	{
3150	for (int k = 0; k < CHARCLASS_WORDS; ++k)
3151	label.w[k] &= matches.w[k];
3152	append (pos, &group);
3153	}
3154	else
3155	{
3156	for (int k = 0; k < CHARCLASS_WORDS; ++k)
3157	label.w[k] &= ~matches.w[k];
3158	}
3159	}
3160
3161	alloc_position_set (&tmp, d->nleaves);
3162
3163	if (group.nelem > 0)
3164	{
3165	/* If we are building a searching matcher, throw in the positions
3166	of state 0 as well, if possible. */
3167	if (d->searchflag)
3168	{
3169	/* If a token in follows.elems is not 1st byte of a multibyte
3170	character, or the states of follows must accept the bytes
3171	which are not 1st byte of the multibyte character.
3172	Then, if a state of follows encounters a byte, it must not be
3173	a 1st byte of a multibyte character nor a single byte character.
3174	In this case, do not add state[0].follows to next state, because
3175	state[0] must accept 1st-byte.
3176
3177	For example, suppose <sb a> is a certain single byte character,
3178	<mb A> is a certain multibyte character, and the codepoint of
3179	<sb a> equals the 2nd byte of the codepoint of <mb A>. When
3180	state[0] accepts <sb a>, state[i] transits to state[i+1] by
3181	accepting the 1st byte of <mb A>, and state[i+1] accepts the
3182	2nd byte of <mb A>, if state[i+1] encounters the codepoint of
3183	<sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do
3184	not add state[0]. */
3185
3186	bool mergeit = !d->localeinfo.multibyte;
3187	if (!mergeit)
3188	{
3189	mergeit = true;
3190	for (idx_t j = 0; mergeit && j < group.nelem; j++)
3191	mergeit &= d->multibyte_prop[group.elems[j].index];
3192	}
3193	if (mergeit)
3194	merge2 (&group, &d->states[0].elems, &tmp);
3195	}
3196
3197	/* Find out if the new state will want any context information,
3198	by calculating possible contexts that the group can match,
3199	and separate contexts that the new state wants to know. */
3200	int possible_contexts = charclass_context (d, &label);
3201	int separate_contexts = state_separate_contexts (d, &group);
3202
3203	/* Find the state(s) corresponding to the union of the follows. */
3204	if (possible_contexts & ~separate_contexts)
3205	state = state_index (d, &group, separate_contexts ^ CTX_ANY);
3206	else
3207	state = -1;
3208	if (separate_contexts & possible_contexts & CTX_NEWLINE)
3209	state_newline = state_index (d, &group, CTX_NEWLINE);
3210	else
3211	state_newline = state;
3212	if (separate_contexts & possible_contexts & CTX_LETTER)
3213	state_letter = state_index (d, &group, CTX_LETTER);
3214	else
3215	state_letter = state;
3216
3217	/* Reallocate now, to reallocate any newline transition properly. */
3218	realloc_trans_if_necessary (d);
3219	}
3220
3221	/* If we are a searching matcher, the default transition is to a state
3222	containing the positions of state 0, otherwise the default transition
3223	is to fail miserably. */
3224	else if (d->searchflag)
3225	{
3226	state_newline = 0;
3227	state_letter = d->min_trcount - 1;
3228	state = d->initstate_notbol;
3229	}
3230	else
3231	{
3232	state_newline = -1;
3233	state_letter = -1;
3234	state = -1;
3235	}
3236
3237	/* Set the transitions for each character in the label. */
3238	for (int i = 0; i < NOTCHAR; i++)
3239	if (tstbit (i, &label))
3240	switch (d->syntax.sbit[i])
3241	{
3242	case CTX_NEWLINE:
3243	trans[i] = state_newline;
3244	break;
3245	case CTX_LETTER:
3246	trans[i] = state_letter;
3247	break;
3248	default:
3249	trans[i] = state;
3250	break;
3251	}
3252
3253	#ifdef DEBUG
3254	fprintf (stderr, "trans table %td", s);
3255	for (int i = 0; i < NOTCHAR; ++i)
3256	{
3257	if (!(i & 0xf))
3258	fprintf (stderr, "\n");
3259	fprintf (stderr, " %2td", trans[i]);
3260	}
3261	fprintf (stderr, "\n");
3262	#endif
3263
3264	free (group.elems);
3265	free (follows.elems);
3266	free (tmp.elems);
3267
3268	/* Keep the newline transition in a special place so we can use it as
3269	a sentinel. */
3270	if (tstbit (d->syntax.eolbyte, &label))
3271	{
3272	d->newlines[s] = trans[d->syntax.eolbyte];
3273	trans[d->syntax.eolbyte] = -1;
3274	}
3275
3276	return trans[uc];
3277	}
3278
3279	/* Multibyte character handling sub-routines for dfaexec. */
3280
3281	/* Consume a single byte and transit state from 's' to '*next_state'.
3282	This function is almost same as the state transition routin in dfaexec.
3283	But state transition is done just once, otherwise matching succeed or
3284	reach the end of the buffer. */
3285	static state_num
3286	transit_state_singlebyte (struct dfa d, state_num s, unsigned char const *pp)
3287	{
3288	state_num *t;
3289
3290	if (d->trans[s])
3291	t = d->trans[s];
3292	else if (d->fails[s])
3293	t = d->fails[s];
3294	else
3295	{
3296	build_state (s, d, **pp);
3297	if (d->trans[s])
3298	t = d->trans[s];
3299	else
3300	{
3301	t = d->fails[s];
3302	assert (t);
3303	}
3304	}
3305
3306	if (t[**pp] == -2)
3307	build_state (s, d, **pp);
3308
3309	return t[(pp)++];
3310	}
3311
3312	/* Transit state from s, then return new state and update the pointer of
3313	the buffer. This function is for a period operator which can match a
3314	multi-byte character. */
3315	static state_num
3316	transit_state (struct dfa d, state_num s, unsigned char const *pp,
3317	unsigned char const *end)
3318	{
3319	wint_t wc;
3320
3321	int mbclen = mbs_to_wchar (&wc, (char const ) pp, end - *pp, d);
3322
3323	/* This state has some operators which can match a multibyte character. */
3324	d->mb_follows.nelem = 0;
3325
3326	/* Calculate the state which can be reached from the state 's' by
3327	consuming 'mbclen' single bytes from the buffer. */
3328	state_num s1 = s;
3329	int mbci;
3330	for (mbci = 0; mbci < mbclen && (mbci == 0 \|\| d->min_trcount <= s); mbci++)
3331	s = transit_state_singlebyte (d, s, pp);
3332	*pp += mbclen - mbci;
3333
3334	if (wc == WEOF)
3335	{
3336	/* It is an invalid character, so ANYCHAR is not accepted. */
3337	return s;
3338	}
3339
3340	/* If all positions which have ANYCHAR do not depend on the context
3341	of the next character, calculate the next state with
3342	pre-calculated follows and cache the result. */
3343	if (d->states[s1].mb_trindex < 0)
3344	{
3345	if (MAX_TRCOUNT <= d->mb_trcount)
3346	{
3347	state_num s3;
3348	for (s3 = -1; s3 < d->tralloc; s3++)
3349	{
3350	free (d->mb_trans[s3]);
3351	d->mb_trans[s3] = NULL;
3352	}
3353
3354	for (state_num i = 0; i < d->sindex; i++)
3355	d->states[i].mb_trindex = -1;
3356	d->mb_trcount = 0;
3357	}
3358	d->states[s1].mb_trindex = d->mb_trcount++;
3359	}
3360
3361	if (! d->mb_trans[s])
3362	{
3363	enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
3364	enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
3365	d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
3366	for (int i = 0; i < MAX_TRCOUNT; i++)
3367	d->mb_trans[s][i] = -1;
3368	}
3369	else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
3370	return d->mb_trans[s][d->states[s1].mb_trindex];
3371
3372	if (s == -1)
3373	copy (&d->states[s1].mbps, &d->mb_follows);
3374	else
3375	merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
3376
3377	int separate_contexts = state_separate_contexts (d, &d->mb_follows);
3378	state_num s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
3379	realloc_trans_if_necessary (d);
3380
3381	d->mb_trans[s][d->states[s1].mb_trindex] = s2;
3382
3383	return s2;
3384	}
3385
3386	/* The initial state may encounter a byte which is not a single byte character
3387	nor the first byte of a multibyte character. But it is incorrect for the
3388	initial state to accept such a byte. For example, in Shift JIS the regular
3389	expression "\\" accepts the codepoint 0x5c, but should not accept the second
3390	byte of the codepoint 0x815c. Then the initial state must skip the bytes
3391	that are not a single byte character nor the first byte of a multibyte
3392	character.
3393
3394	Given DFA state d, use mbs_to_wchar to advance MBP until it reaches
3395	or exceeds P, and return the advanced MBP. If WCP is non-NULL and
3396	the result is greater than P, set *WCP to the final wide character
3397	processed, or to WEOF if no wide character is processed. Otherwise,
3398	if WCP is non-NULL, *WCP may or may not be updated.
3399
3400	Both P and MBP must be no larger than END. */
3401	static unsigned char const *
3402	skip_remains_mb (struct dfa d, unsigned char const p,
3403	unsigned char const mbp, char const end)
3404	{
3405	if (d->syntax.never_trail[*p])
3406	return p;
3407	while (mbp < p)
3408	{
3409	wint_t wc;
3410	mbp += mbs_to_wchar (&wc, (char const *) mbp,
3411	end - (char const *) mbp, d);
3412	}
3413	return mbp;
3414	}
3415
3416	/* Search through a buffer looking for a match to the struct dfa *D.
3417	Find the first occurrence of a string matching the regexp in the
3418	buffer, and the shortest possible version thereof. Return a pointer to
3419	the first character after the match, or NULL if none is found. BEGIN
3420	points to the beginning of the buffer, and END points to the first byte
3421	after its end. Note however that we store a sentinel byte (usually
3422	newline) in *END, so the actual buffer must be one byte longer.
3423	When ALLOW_NL, newlines may appear in the matching string.
3424	If COUNT is non-NULL, increment *COUNT once for each newline processed.
3425	If MULTIBYTE, the input consists of multibyte characters and/or
3426	encoding-error bytes. Otherwise, it consists of single-byte characters.
3427	Here is the list of features that make this DFA matcher punt:
3428	- [M-N] range in non-simple locale: regex is up to 25% faster on [a-z]
3429	- [^...] in non-simple locale
3430	- [[=foo=]] or [[.foo.]]
3431	- [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
3432	- back-reference: (.)\1
3433	- word-delimiter in multibyte locale: \<, \>, \b, \B
3434	See struct localeinfo.simple for the definition of "simple locale". */
3435
3436	static inline char *
3437	dfaexec_main (struct dfa d, char const begin, char *end, bool allow_nl,
3438	idx_t *count, bool multibyte)
3439	{
3440	if (MAX_TRCOUNT <= d->sindex)
3441	{
3442	for (state_num s = d->min_trcount; s < d->sindex; s++)
3443	{
3444	free (d->states[s].elems.elems);
3445	free (d->states[s].mbps.elems);
3446	}
3447	d->sindex = d->min_trcount;
3448
3449	if (d->trans)
3450	{
3451	for (state_num s = 0; s < d->tralloc; s++)
3452	{
3453	free (d->trans[s]);
3454	free (d->fails[s]);
3455	d->trans[s] = d->fails[s] = NULL;
3456	}
3457	d->trcount = 0;
3458	}
3459
3460	if (d->localeinfo.multibyte && d->mb_trans)
3461	{
3462	for (state_num s = -1; s < d->tralloc; s++)
3463	{
3464	free (d->mb_trans[s]);
3465	d->mb_trans[s] = NULL;
3466	}
3467	for (state_num s = 0; s < d->min_trcount; s++)
3468	d->states[s].mb_trindex = -1;
3469	d->mb_trcount = 0;
3470	}
3471	}
3472
3473	if (!d->tralloc)
3474	realloc_trans_if_necessary (d);
3475
3476	/* Current state. */
3477	state_num s = 0, s1 = 0;
3478
3479	/* Current input character. */
3480	unsigned char const p = (unsigned char const ) begin;
3481	unsigned char const *mbp = p;
3482
3483	/* Copy of d->trans so it can be optimized into a register. */
3484	state_num **trans = d->trans;
3485	unsigned char eol = d->syntax.eolbyte; /* Likewise for eolbyte. */
3486	unsigned char saved_end = (unsigned char ) end;
3487	*end = eol;
3488
3489	if (multibyte)
3490	{
3491	memset (&d->mbs, 0, sizeof d->mbs);
3492	if (d->mb_follows.alloc == 0)
3493	alloc_position_set (&d->mb_follows, d->nleaves);
3494	}
3495
3496	idx_t nlcount = 0;
3497	for (;;)
3498	{
3499	state_num *t;
3500	while ((t = trans[s]) != NULL)
3501	{
3502	if (s < d->min_trcount)
3503	{
3504	if (!multibyte \|\| d->states[s].mbps.nelem == 0)
3505	{
3506	while (t[*p] == s)
3507	p++;
3508	}
3509	if (multibyte)
3510	p = mbp = skip_remains_mb (d, p, mbp, end);
3511	}
3512
3513	if (multibyte)
3514	{
3515	s1 = s;
3516
3517	if (d->states[s].mbps.nelem == 0
3518	\|\| d->localeinfo.sbctowc[p] != WEOF \|\| (char ) p >= end)
3519	{
3520	/* If an input character does not match ANYCHAR, do it
3521	like a single-byte character. */
3522	s = t[*p++];
3523	}
3524	else
3525	{
3526	s = transit_state (d, s, &p, (unsigned char *) end);
3527	mbp = p;
3528	trans = d->trans;
3529	}
3530	}
3531	else
3532	{
3533	s1 = t[*p++];
3534	t = trans[s1];
3535	if (! t)
3536	{
3537	state_num tmp = s;
3538	s = s1;
3539	s1 = tmp; /* swap */
3540	break;
3541	}
3542	if (s < d->min_trcount)
3543	{
3544	while (t[*p] == s1)
3545	p++;
3546	}
3547	s = t[*p++];
3548	}
3549	}
3550
3551	if (s < 0)
3552	{
3553	if (s == -2)
3554	{
3555	s = build_state (s1, d, p[-1]);
3556	trans = d->trans;
3557	}
3558	else if ((char *) p <= end && p[-1] == eol && 0 <= d->newlines[s1])
3559	{
3560	/* The previous character was a newline. Count it, and skip
3561	checking of multibyte character boundary until here. */
3562	nlcount++;
3563	mbp = p;
3564
3565	s = (allow_nl ? d->newlines[s1]
3566	: d->syntax.sbit[eol] == CTX_NEWLINE ? 0
3567	: d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
3568	: d->initstate_notbol);
3569	}
3570	else
3571	{
3572	p = NULL;
3573	goto done;
3574	}
3575	}
3576	else if (d->fails[s])
3577	{
3578	if ((d->success[s] & d->syntax.sbit[*p])
3579	\|\| ((char *) p == end
3580	&& accepts_in_context (d->states[s].context, CTX_NEWLINE, s,
3581	d)))
3582	goto done;
3583
3584	if (multibyte && s < d->min_trcount)
3585	p = mbp = skip_remains_mb (d, p, mbp, end);
3586
3587	s1 = s;
3588	if (!multibyte \|\| d->states[s].mbps.nelem == 0
3589	\|\| d->localeinfo.sbctowc[p] != WEOF \|\| (char ) p >= end)
3590	{
3591	/* If a input character does not match ANYCHAR, do it
3592	like a single-byte character. */
3593	s = d->fails[s][*p++];
3594	}
3595	else
3596	{
3597	s = transit_state (d, s, &p, (unsigned char *) end);
3598	mbp = p;
3599	trans = d->trans;
3600	}
3601	}
3602	else
3603	{
3604	build_state (s, d, p[0]);
3605	trans = d->trans;
3606	}
3607	}
3608
3609	done:
3610	if (count)
3611	*count += nlcount;
3612	*end = saved_end;
3613	return (char *) p;
3614	}
3615
3616	/* Specialized versions of dfaexec for multibyte and single-byte cases.
3617	This is for performance, as dfaexec_main is an inline function. */
3618
3619	static char *
3620	dfaexec_mb (struct dfa d, char const begin, char *end,
3621	bool allow_nl, idx_t count, bool backref)
3622	{
3623	return dfaexec_main (d, begin, end, allow_nl, count, true);
3624	}
3625
3626	static char *
3627	dfaexec_sb (struct dfa d, char const begin, char *end,
3628	bool allow_nl, idx_t count, bool backref)
3629	{
3630	return dfaexec_main (d, begin, end, allow_nl, count, false);
3631	}
3632
3633	/* Always set *BACKREF and return BEGIN. Use this wrapper for
3634	any regexp that uses a construct not supported by this code. */
3635	static char *
3636	dfaexec_noop (struct dfa d, char const begin, char *end,
3637	bool allow_nl, idx_t count, bool backref)
3638	{
3639	*backref = true;
3640	return (char *) begin;
3641	}
3642
3643	/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
3644	but faster and set *BACKREF if the DFA code does not support this
3645	regexp usage. */
3646
3647	char *
3648	dfaexec (struct dfa d, char const begin, char *end,
3649	bool allow_nl, idx_t count, bool backref)
3650	{
3651	return d->dfaexec (d, begin, end, allow_nl, count, backref);
3652	}
3653
3654	struct dfa *
3655	dfasuperset (struct dfa const *d)
3656	{
3657	return d->superset;
3658	}
3659
3660	bool
3661	dfaisfast (struct dfa const *d)
3662	{
3663	return d->fast;
3664	}
3665
3666	static void
3667	free_mbdata (struct dfa *d)
3668	{
3669	free (d->multibyte_prop);
3670	free (d->lex.brack.chars);
3671	free (d->mb_follows.elems);
3672
3673	if (d->mb_trans)
3674	{
3675	state_num s;
3676	for (s = -1; s < d->tralloc; s++)
3677	free (d->mb_trans[s]);
3678	free (d->mb_trans - 2);
3679	}
3680	}
3681
3682	/* Return true if every construct in D is supported by this DFA matcher. */
3683	bool
3684	dfasupported (struct dfa const *d)
3685	{
3686	for (idx_t i = 0; i < d->tindex; i++)
3687	{
3688	switch (d->tokens[i])
3689	{
3690	case BEGWORD:
3691	case ENDWORD:
3692	case LIMWORD:
3693	case NOTLIMWORD:
3694	if (!d->localeinfo.multibyte)
3695	continue;
3696	FALLTHROUGH;
3697	case BACKREF:
3698	case MBCSET:
3699	return false;
3700	}
3701	}
3702	return true;
3703	}
3704
3705	/* Disable use of the superset DFA if it is not likely to help
3706	performance. */
3707	static void
3708	maybe_disable_superset_dfa (struct dfa *d)
3709	{
3710	if (!d->localeinfo.using_utf8)
3711	return;
3712
3713	bool have_backref = false;
3714	for (idx_t i = 0; i < d->tindex; i++)
3715	{
3716	switch (d->tokens[i])
3717	{
3718	case ANYCHAR:
3719	/* Lowered. */
3720	abort ();
3721	case BACKREF:
3722	have_backref = true;
3723	break;
3724	case MBCSET:
3725	/* Requires multi-byte algorithm. */
3726	return;
3727	default:
3728	break;
3729	}
3730	}
3731
3732	if (!have_backref && d->superset)
3733	{
3734	/* The superset DFA is not likely to be much faster, so remove it. */
3735	dfafree (d->superset);
3736	free (d->superset);
3737	d->superset = NULL;
3738	}
3739
3740	free_mbdata (d);
3741	d->localeinfo.multibyte = false;
3742	d->dfaexec = dfaexec_sb;
3743	d->fast = true;
3744	}
3745
3746	static void
3747	dfassbuild (struct dfa *d)
3748	{
3749	struct dfa *sup = dfaalloc ();
3750
3751	sup = d;
3752	sup->localeinfo.multibyte = false;
3753	sup->dfaexec = dfaexec_sb;
3754	sup->multibyte_prop = NULL;
3755	sup->superset = NULL;
3756	sup->states = NULL;
3757	sup->sindex = 0;
3758	sup->constraints = NULL;
3759	sup->separates = NULL;
3760	sup->follows = NULL;
3761	sup->tralloc = 0;
3762	sup->trans = NULL;
3763	sup->fails = NULL;
3764	sup->success = NULL;
3765	sup->newlines = NULL;
3766
3767	sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
3768	if (d->cindex)
3769	{
3770	memcpy (sup->charclasses, d->charclasses,
3771	d->cindex * sizeof *sup->charclasses);
3772	}
3773
3774	sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
3775	sup->talloc = d->tindex * 2;
3776
3777	bool have_achar = false;
3778	bool have_nchar = false;
3779	idx_t j;
3780	for (idx_t i = j = 0; i < d->tindex; i++)
3781	{
3782	switch (d->tokens[i])
3783	{
3784	case ANYCHAR:
3785	case MBCSET:
3786	case BACKREF:
3787	{
3788	charclass ccl;
3789	fillset (&ccl);
3790	sup->tokens[j++] = CSET + charclass_index (sup, &ccl);
3791	sup->tokens[j++] = STAR;
3792	if (d->tokens[i + 1] == QMARK \|\| d->tokens[i + 1] == STAR
3793	\|\| d->tokens[i + 1] == PLUS)
3794	i++;
3795	have_achar = true;
3796	}
3797	break;
3798	case BEGWORD:
3799	case ENDWORD:
3800	case LIMWORD:
3801	case NOTLIMWORD:
3802	if (d->localeinfo.multibyte)
3803	{
3804	/* These constraints aren't supported in a multibyte locale.
3805	Ignore them in the superset DFA. */
3806	sup->tokens[j++] = EMPTY;
3807	break;
3808	}
3809	FALLTHROUGH;
3810	default:
3811	sup->tokens[j++] = d->tokens[i];
3812	if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
3813	\|\| d->tokens[i] >= CSET)
3814	have_nchar = true;
3815	break;
3816	}
3817	}
3818	sup->tindex = j;
3819
3820	if (have_nchar && (have_achar \|\| d->localeinfo.multibyte))
3821	d->superset = sup;
3822	else
3823	{
3824	dfafree (sup);
3825	free (sup);
3826	}
3827	}
3828
3829	/* Parse a string S of length LEN into D (but skip this step if S is null).
3830	Then analyze D and build a matcher for it.
3831	SEARCHFLAG says whether to build a searching or an exact matcher. */
3832	void
3833	dfacomp (char const s, idx_t len, struct dfa d, bool searchflag)
3834	{
3835	if (s != NULL)
3836	dfaparse (s, len, d);
3837
3838	dfassbuild (d);
3839
3840	if (dfasupported (d))
3841	{
3842	maybe_disable_superset_dfa (d);
3843	dfaanalyze (d, searchflag);
3844	}
3845	else
3846	{
3847	d->dfaexec = dfaexec_noop;
3848	}
3849
3850	if (d->superset)
3851	{
3852	d->fast = true;
3853	dfaanalyze (d->superset, searchflag);
3854	}
3855	}
3856
3857	/* Free the storage held by the components of a dfa. */
3858	void
3859	dfafree (struct dfa *d)
3860	{
3861	free (d->charclasses);
3862	free (d->tokens);
3863
3864	if (d->localeinfo.multibyte)
3865	free_mbdata (d);
3866
3867	free (d->constraints);
3868	free (d->separates);
3869
3870	for (idx_t i = 0; i < d->sindex; i++)
3871	{
3872	free (d->states[i].elems.elems);
3873	free (d->states[i].mbps.elems);
3874	}
3875	free (d->states);
3876
3877	if (d->follows)
3878	{
3879	for (idx_t i = 0; i < d->tindex; i++)
3880	free (d->follows[i].elems);
3881	free (d->follows);
3882	}
3883
3884	if (d->trans)
3885	{
3886	for (idx_t i = 0; i < d->tralloc; i++)
3887	{
3888	free (d->trans[i]);
3889	free (d->fails[i]);
3890	}
3891
3892	free (d->trans - 2);
3893	free (d->fails);
3894	free (d->newlines);
3895	free (d->success);
3896	}
3897
3898	if (d->superset)
3899	{
3900	dfafree (d->superset);
3901	free (d->superset);
3902	}
3903	}
3904
3905	/* Having found the postfix representation of the regular expression,
3906	try to find a long sequence of characters that must appear in any line
3907	containing the r.e.
3908	Finding a "longest" sequence is beyond the scope here;
3909	we take an easy way out and hope for the best.
3910	(Take "(ab\|a)b"--please.)
3911
3912	We do a bottom-up calculation of sequences of characters that must appear
3913	in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3914	representation:
3915	sequences that must appear at the left of the match ("left")
3916	sequences that must appear at the right of the match ("right")
3917	lists of sequences that must appear somewhere in the match ("in")
3918	sequences that must constitute the match ("is")
3919
3920	When we get to the root of the tree, we use one of the longest of its
3921	calculated "in" sequences as our answer.
3922
3923	The sequences calculated for the various types of node (in pseudo ANSI c)
3924	are shown below. "p" is the operand of unary operators (and the left-hand
3925	operand of binary operators); "q" is the right-hand operand of binary
3926	operators.
3927
3928	"ZERO" means "a zero-length sequence" below.
3929
3930	Type left right is in
3931	---- ---- ----- -- --
3932	char c # c # c # c # c
3933
3934	ANYCHAR ZERO ZERO ZERO ZERO
3935
3936	MBCSET ZERO ZERO ZERO ZERO
3937
3938	CSET ZERO ZERO ZERO ZERO
3939
3940	STAR ZERO ZERO ZERO ZERO
3941
3942	QMARK ZERO ZERO ZERO ZERO
3943
3944	PLUS p->left p->right ZERO p->in
3945
3946	CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
3947	p->left : q->right : q->is!=ZERO) ? q->in plus
3948	p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
3949	ZERO
3950
3951	OR longest common longest common (do p->is and substrings common
3952	leading trailing to q->is have same p->in and
3953	(sub)sequence (sub)sequence q->in length and content) ?
3954	of p->left of p->right
3955	and q->left and q->right p->is : NULL
3956
3957	If there's anything else we recognize in the tree, all four sequences get set
3958	to zero-length sequences. If there's something we don't recognize in the
3959	tree, we just return a zero-length sequence.
3960
3961	Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3962	'aaa')?
3963
3964	And ... is it here or someplace that we might ponder "optimizations" such as
3965	egrep 'psi\|epsilon' -> egrep 'psi'
3966	egrep 'pepsi\|epsilon' -> egrep 'epsi'
3967	(Yes, we now find "epsi" as a "string
3968	that must occur", but we might also
3969	simplify the entire r.e. being sought)
3970	grep '[c]' -> grep 'c'
3971	grep '(ab\|a)b' -> grep 'ab'
3972	grep 'ab*' -> grep 'a'
3973	grep 'a*b' -> grep 'b'
3974
3975	There are several issues:
3976
3977	Is optimization easy (enough)?
3978
3979	Does optimization actually accomplish anything,
3980	or is the automaton you get from "psi\|epsilon" (for example)
3981	the same as the one you get from "psi" (for example)?
3982
3983	Are optimizable r.e.'s likely to be used in real-life situations
3984	(something like 'ab*' is probably unlikely; something like
3985	'psi|epsilon' is likelier)? */
3986
3987	static char *
3988	icatalloc (char old, char const new)
3989	{
3990	idx_t newsize = strlen (new);
3991	if (newsize == 0)
3992	return old;
3993	idx_t oldsize = strlen (old);
3994	char *result = xirealloc (old, oldsize + newsize + 1);
3995	memcpy (result + oldsize, new, newsize + 1);
3996	return result;
3997	}
3998
/* Free every string in the NULL-terminated list CPP.  The array itself
   is not freed and its entries are left dangling; callers either free
   the array or reset its first entry afterwards.  */
static void
freelist (char **cpp)
{
  for (char **entry = cpp; *entry != NULL; entry++)
    free (*entry);
}
4005
4006	static char **
4007	enlistnew (char *cpp, char new)
4008	{
4009	/* Is there already something in the list that's new (or longer)? */
4010	idx_t i;
4011	for (i = 0; cpp[i] != NULL; i++)
4012	if (strstr (cpp[i], new) != NULL)
4013	{
4014	free (new);
4015	return cpp;
4016	}
4017	/* Eliminate any obsoleted strings. */
4018	for (idx_t j = 0; cpp[j] != NULL; )
4019	if (strstr (new, cpp[j]) == NULL)
4020	++j;
4021	else
4022	{
4023	free (cpp[j]);
4024	if (--i == j)
4025	break;
4026	cpp[j] = cpp[i];
4027	cpp[i] = NULL;
4028	}
4029	/* Add the new string. */
4030	cpp = xreallocarray (cpp, i + 2, sizeof *cpp);
4031	cpp[i] = new;
4032	cpp[i + 1] = NULL;
4033	return cpp;
4034	}
4035
4036	static char **
4037	enlist (char *cpp, char const str, idx_t len)
4038	{
4039	return enlistnew (cpp, ximemdup0 (str, len));
4040	}
4041
4042	/* Given pointers to two strings, return a pointer to an allocated
4043	list of their distinct common substrings. */
4044	static char **
4045	comsubs (char left, char const right)
4046	{
4047	char *cpp = xzalloc (sizeof cpp);
4048
4049	for (char lcp = left; lcp != '\0'; lcp++)
4050	{
4051	idx_t len = 0;
4052	char rcp = strchr (right, lcp);
4053	while (rcp != NULL)
4054	{
4055	idx_t i;
4056	for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
4057	continue;
4058	if (i > len)
4059	len = i;
4060	rcp = strchr (rcp + 1, *lcp);
4061	}
4062	if (len != 0)
4063	cpp = enlist (cpp, lcp, len);
4064	}
4065	return cpp;
4066	}
4067
/* Add a copy of every string in the NULL-terminated list NEW to the
   list OLD, and return the resulting list.  NEW and its strings are left
   untouched, because each one is duplicated with xstrdup first.  */
static char **
addlists (char **old, char **new)
{
  for (; *new; new++)
    old = enlistnew (old, xstrdup (*new));
  return old;
}
4075
4076	/* Given two lists of substrings, return a new list giving substrings
4077	common to both. */
4078	static char **
4079	inboth (char left, char right)
4080	{
4081	char *both = xzalloc (sizeof both);
4082
4083	for (idx_t lnum = 0; left[lnum] != NULL; lnum++)
4084	{
4085	for (idx_t rnum = 0; right[rnum] != NULL; rnum++)
4086	{
4087	char **temp = comsubs (left[lnum], right[rnum]);
4088	both = addlists (both, temp);
4089	freelist (temp);
4090	free (temp);
4091	}
4092	}
4093	return both;
4094	}
4095
typedef struct must must;

/* One entry on the stack that dfamust keeps while walking the token
   list.  */
struct must
{
  char **in;     /* NULL-terminated list of strings that must appear
                    somewhere in the match.  */
  char *left;    /* String that must appear at the left of the match.  */
  char *right;   /* String that must appear at the right of the match.  */
  char *is;      /* String that must constitute the whole match.  */
  bool begline;  /* Set for a BEGLINE token.  */
  bool endline;  /* Set for an ENDLINE token.  */
  must *prev;    /* Next entry down the stack, or NULL.  */
};
4108
4109	static must *
4110	allocmust (must *mp, idx_t size)
4111	{
4112	must new_mp = xmalloc (sizeof new_mp);
4113	new_mp->in = xzalloc (sizeof *new_mp->in);
4114	new_mp->left = xizalloc (size);
4115	new_mp->right = xizalloc (size);
4116	new_mp->is = xizalloc (size);
4117	new_mp->begline = false;
4118	new_mp->endline = false;
4119	new_mp->prev = mp;
4120	return new_mp;
4121	}
4122
4123	static void
4124	resetmust (must *mp)
4125	{
4126	freelist (mp->in);
4127	mp->in[0] = NULL;
4128	mp->left[0] = mp->right[0] = mp->is[0] = '\0';
4129	mp->begline = false;
4130	mp->endline = false;
4131	}
4132
4133	static void
4134	freemust (must *mp)
4135	{
4136	freelist (mp->in);
4137	free (mp->in);
4138	free (mp->left);
4139	free (mp->right);
4140	free (mp->is);
4141	free (mp);
4142	}
4143
4144	struct dfamust *
4145	dfamust (struct dfa const *d)
4146	{
4147	must *mp = NULL;
4148	char const *result = "";
4149	bool exact = false;
4150	bool begline = false;
4151	bool endline = false;
4152	bool need_begline = false;
4153	bool need_endline = false;
4154	bool case_fold_unibyte = d->syntax.case_fold & !d->localeinfo.multibyte;
4155
4156	for (idx_t ri = 1; ri + 1 < d->tindex; ri++)
4157	{
4158	token t = d->tokens[ri];
4159	switch (t)
4160	{
4161	case BEGLINE:
4162	mp = allocmust (mp, 2);
4163	mp->begline = true;
4164	need_begline = true;
4165	break;
4166	case ENDLINE:
4167	mp = allocmust (mp, 2);
4168	mp->endline = true;
4169	need_endline = true;
4170	break;
4171	case LPAREN:
4172	case RPAREN:
4173	assert (!"neither LPAREN nor RPAREN may appear here");
4174
4175	case EMPTY:
4176	case BEGWORD:
4177	case ENDWORD:
4178	case LIMWORD:
4179	case NOTLIMWORD:
4180	case BACKREF:
4181	case ANYCHAR:
4182	case MBCSET:
4183	mp = allocmust (mp, 2);
4184	break;
4185
4186	case STAR:
4187	case QMARK:
4188	assume_nonnull (mp);
4189	resetmust (mp);
4190	break;
4191
4192	case OR:
4193	{
4194	char **new;
4195	must *rmp = mp;
4196	assume_nonnull (rmp);
4197	must *lmp = mp = mp->prev;
4198	assume_nonnull (lmp);
4199	idx_t j, ln, rn, n;
4200
4201	/* Guaranteed to be. Unlikely, but ... */
4202	if (str_eq (lmp->is, rmp->is))
4203	{
4204	lmp->begline &= rmp->begline;
4205	lmp->endline &= rmp->endline;
4206	}
4207	else
4208	{
4209	lmp->is[0] = '\0';
4210	lmp->begline = false;
4211	lmp->endline = false;
4212	}
4213	/* Left side--easy */
4214	idx_t i = 0;
4215	while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
4216	++i;
4217	lmp->left[i] = '\0';
4218	/* Right side */
4219	ln = strlen (lmp->right);
4220	rn = strlen (rmp->right);
4221	n = ln;
4222	if (n > rn)
4223	n = rn;
4224	for (i = 0; i < n; ++i)
4225	if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
4226	break;
4227	for (j = 0; j < i; ++j)
4228	lmp->right[j] = lmp->right[(ln - i) + j];
4229	lmp->right[j] = '\0';
4230	new = inboth (lmp->in, rmp->in);
4231	freelist (lmp->in);
4232	free (lmp->in);
4233	lmp->in = new;
4234	freemust (rmp);
4235	}
4236	break;
4237
4238	case PLUS:
4239	assume_nonnull (mp);
4240	mp->is[0] = '\0';
4241	break;
4242
4243	case END:
4244	assume_nonnull (mp);
4245	assert (!mp->prev);
4246	for (idx_t i = 0; mp->in[i] != NULL; i++)
4247	if (strlen (mp->in[i]) > strlen (result))
4248	result = mp->in[i];
4249	if (str_eq (result, mp->is))
4250	{
4251	if ((!need_begline \|\| mp->begline) && (!need_endline
4252	\|\| mp->endline))
4253	exact = true;
4254	begline = mp->begline;
4255	endline = mp->endline;
4256	}
4257	goto done;
4258
4259	case CAT:
4260	{
4261	must *rmp = mp;
4262	assume_nonnull (rmp);
4263	must *lmp = mp = mp->prev;
4264	assume_nonnull (lmp);
4265
4266	/* In. Everything in left, plus everything in
4267	right, plus concatenation of
4268	left's right and right's left. */
4269	lmp->in = addlists (lmp->in, rmp->in);
4270	if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
4271	{
4272	idx_t lrlen = strlen (lmp->right);
4273	idx_t rllen = strlen (rmp->left);
4274	char *tp = ximalloc (lrlen + rllen + 1);
4275	memcpy (tp + lrlen, rmp->left, rllen + 1);
4276	memcpy (tp, lmp->right, lrlen);
4277	lmp->in = enlistnew (lmp->in, tp);
4278	}
4279	/* Left-hand */
4280	if (lmp->is[0] != '\0')
4281	lmp->left = icatalloc (lmp->left, rmp->left);
4282	/* Right-hand */
4283	if (rmp->is[0] == '\0')
4284	lmp->right[0] = '\0';
4285	lmp->right = icatalloc (lmp->right, rmp->right);
4286	/* Guaranteed to be */
4287	if ((lmp->is[0] != '\0' \|\| lmp->begline)
4288	&& (rmp->is[0] != '\0' \|\| rmp->endline))
4289	{
4290	lmp->is = icatalloc (lmp->is, rmp->is);
4291	lmp->endline = rmp->endline;
4292	}
4293	else
4294	{
4295	lmp->is[0] = '\0';
4296	lmp->begline = false;
4297	lmp->endline = false;
4298	}
4299	freemust (rmp);
4300	}
4301	break;
4302
4303	case '\0':
4304	/* Not on my shift. */
4305	goto done;
4306
4307	default:
4308	if (CSET <= t)
4309	{
4310	/* If T is a singleton, or if case-folding in a unibyte
4311	locale and T's members all case-fold to the same char,
4312	convert T to one of its members. Otherwise, do
4313	nothing further with T. */
4314	charclass *ccl = &d->charclasses[t - CSET];
4315	int j;
4316	for (j = 0; j < NOTCHAR; j++)
4317	if (tstbit (j, ccl))
4318	break;
4319	if (! (j < NOTCHAR))
4320	{
4321	mp = allocmust (mp, 2);
4322	break;
4323	}
4324	t = j;
4325	while (++j < NOTCHAR)
4326	if (tstbit (j, ccl)
4327	&& ! (case_fold_unibyte
4328	&& toupper (j) == toupper (t)))
4329	break;
4330	if (j < NOTCHAR)
4331	{
4332	mp = allocmust (mp, 2);
4333	break;
4334	}
4335	}
4336
4337	idx_t rj = ri + 2;
4338	if (d->tokens[ri + 1] == CAT)
4339	{
4340	for (; rj < d->tindex - 1; rj += 2)
4341	{
4342	if ((rj != ri && (d->tokens[rj] <= 0
4343	\|\| NOTCHAR <= d->tokens[rj]))
4344	\|\| d->tokens[rj + 1] != CAT)
4345	break;
4346	}
4347	}
4348	mp = allocmust (mp, ((rj - ri) >> 1) + 1);
4349	mp->is[0] = mp->left[0] = mp->right[0]
4350	= case_fold_unibyte ? toupper (t) : t;
4351
4352	idx_t i;
4353	for (i = 1; ri + 2 < rj; i++)
4354	{
4355	ri += 2;
4356	t = d->tokens[ri];
4357	mp->is[i] = mp->left[i] = mp->right[i]
4358	= case_fold_unibyte ? toupper (t) : t;
4359	}
4360	mp->is[i] = mp->left[i] = mp->right[i] = '\0';
4361	mp->in = enlist (mp->in, mp->is, i);
4362	break;
4363	}
4364	}
4365	done:;
4366
4367	struct dfamust *dm = NULL;
4368	if (*result)
4369	{
4370	dm = xmalloc (FLEXSIZEOF (struct dfamust, must, strlen (result) + 1));
4371	dm->exact = exact;
4372	dm->begline = begline;
4373	dm->endline = endline;
4374	strcpy (dm->must, result);
4375	}
4376
4377	while (mp)
4378	{
4379	must *prev = mp->prev;
4380	freemust (mp);
4381	mp = prev;
4382	}
4383
4384	return dm;
4385	}
4386
/* Release the storage occupied by DM.  DM is handed straight to
   'free', so a null pointer is accepted and ignored.  */
void
dfamustfree (struct dfamust *dm)
{
  free (dm);
}
4392
4393	struct dfa *
4394	dfaalloc (void)
4395	{
4396	return xmalloc (sizeof (struct dfa));
4397	}
4398
4399	/* Initialize DFA. */
4400	void
4401	dfasyntax (struct dfa dfa, struct localeinfo const linfo,
4402	reg_syntax_t bits, int dfaopts)
4403	{
4404	memset (dfa, 0, offsetof (struct dfa, dfaexec));
4405	dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
4406	dfa->localeinfo = *linfo;
4407
4408	dfa->fast = !dfa->localeinfo.multibyte;
4409
4410	dfa->canychar = -1;
4411	dfa->syntax.syntax_bits_set = true;
4412	dfa->syntax.case_fold = (bits & RE_ICASE) != 0;
4413	dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
4414	dfa->syntax.syntax_bits = bits;
4415	dfa->syntax.dfaopts = dfaopts;
4416
4417	for (int i = CHAR_MIN; i <= CHAR_MAX; ++i)
4418	{
4419	unsigned char uc = i;
4420
4421	dfa->syntax.sbit[uc] = char_context (dfa, uc);
4422	switch (dfa->syntax.sbit[uc])
4423	{
4424	case CTX_LETTER:
4425	setbit (uc, &dfa->syntax.letters);
4426	break;
4427	case CTX_NEWLINE:
4428	setbit (uc, &dfa->syntax.newline);
4429	break;
4430	}
4431
4432	/* POSIX requires that the five bytes in "\n\r./" (including the
4433	terminating NUL) cannot occur inside a multibyte character. */
4434	dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
4435	? (uc & 0xc0) != 0x80
4436	: strchr ("\n\r./", uc) != NULL);
4437	}
4438	}
4439
4440	/* Initialize TO by copying FROM's syntax settings. */
4441	void
4442	dfacopysyntax (struct dfa to, struct dfa const from)
4443	{
4444	memset (to, 0, offsetof (struct dfa, syntax));
4445	to->canychar = -1;
4446	to->fast = from->fast;
4447	to->syntax = from->syntax;
4448	to->dfaexec = from->dfaexec;
4449	to->localeinfo = from->localeinfo;
4450	}
4451
4452	/* vim:set shiftwidth=2: */

Note: See TracBrowser for help on using the repository browser.

source: kBuild/trunk/src/sed/lib/dfa.c@ 3657

Download in other formats: