dfa.c@ 3646

Last change on this file since 3646 was 3532, checked in by bird, 3 years ago
grep: Initial windows config and adjustments.
Property svn:eol-style set to `native`
File size: 133.4 KB

Line
1	/* dfa.c - deterministic extended regexp routines for GNU
2	Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2021 Free Software
3	Foundation, Inc.
4
5	This program is free software; you can redistribute it and/or modify
6	it under the terms of the GNU General Public License as published by
7	the Free Software Foundation; either version 3, or (at your option)
8	any later version.
9
10	This program is distributed in the hope that it will be useful,
11	but WITHOUT ANY WARRANTY; without even the implied warranty of
12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	GNU General Public License for more details.
14
15	You should have received a copy of the GNU General Public License
16	along with this program; if not, write to the Free Software
17	Foundation, Inc.,
18	51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */
19
20	/* Written June, 1988 by Mike Haertel
21	Modified July, 1988 by Arthur David Olson to assist BMG speedups */
22
23	#include <config.h>
24
25	#include "dfa.h"
26
27	#include "flexmember.h"
28	#include "idx.h"
29	#include "verify.h"
30
31	#include <assert.h>
32	#include <ctype.h>
33	#include <stdint.h>
34	#include <stdio.h>
35	#include <stdlib.h>
36	#include <limits.h>
37	#include <string.h>
38
39	/* Pacify gcc -Wanalyzer-null-dereference in areas where GCC
40	understandably cannot deduce that the input comes from a
41	well-formed regular expression. There's little point to the
42	runtime overhead of 'assert' instead of 'assume_nonnull' when the
43	MMU will check anyway. */
44	#define assume_nonnull(x) assume ((x) != NULL)
45
46	static bool
47	streq (char const a, char const b)
48	{
49	return strcmp (a, b) == 0;
50	}
51
52	static bool
53	isasciidigit (char c)
54	{
55	return '0' <= c && c <= '9';
56	}
57
58	#include "gettext.h"
59	#define _(str) gettext (str)
60
61	#include <wchar.h>
62
63	#include "xalloc.h"
64	#include "localeinfo.h"
65
66	#ifndef FALLTHROUGH
67	# if 201710L < __STDC_VERSION__
68	# define FALLTHROUGH [[__fallthrough__]]
69	# elif (__GNUC__ >= 7) \|\| (__clang_major__ >= 10)
70	# define FALLTHROUGH __attribute__ ((__fallthrough__))
71	# else
72	# define FALLTHROUGH ((void) 0)
73	# endif
74	#endif
75
76	#ifndef MIN
77	# define MIN(a,b) ((a) < (b) ? (a) : (b))
78	#endif
79
80	/* HPUX defines these as macros in sys/param.h. */
81	#ifdef setbit
82	# undef setbit
83	#endif
84	#ifdef clrbit
85	# undef clrbit
86	#endif
87
88	/* For code that does not use Gnulib’s isblank module. */
89	#if !defined isblank && !defined HAVE_ISBLANK && !defined GNULIB_ISBLANK
90	# define isblank dfa_isblank
91	static int
92	isblank (int c)
93	{
94	return c == ' ' \|\| c == '\t';
95	}
96	#endif
97
98	/* First integer value that is greater than any character code. */
99	enum { NOTCHAR = 1 << CHAR_BIT };
100
101	#ifdef UINT_LEAST64_MAX
102
103	/* Number of bits used in a charclass word. */
104	enum { CHARCLASS_WORD_BITS = 64 };
105
106	/* This represents part of a character class. It must be unsigned and
107	at least CHARCLASS_WORD_BITS wide. Any excess bits are zero. */
108	typedef uint_least64_t charclass_word;
109
110	/* Part of a charclass initializer that represents 64 bits' worth of a
111	charclass, where LO and HI are the low and high-order 32 bits of
112	the 64-bit quantity. */
113	# define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
114
115	#else
116	/* Fallbacks for pre-C99 hosts that lack 64-bit integers. */
117	enum { CHARCLASS_WORD_BITS = 32 };
118	typedef unsigned long charclass_word;
119	# define CHARCLASS_PAIR(lo, hi) lo, hi
120	#endif
121
122	/* An initializer for a charclass whose 32-bit words are A through H. */
123	#define CHARCLASS_INIT(a, b, c, d, e, f, g, h) \
124	{{ \
125	CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d), \
126	CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h) \
127	}}
128
129	/* The maximum useful value of a charclass_word; all used bits are 1. */
130	static charclass_word const CHARCLASS_WORD_MASK
131	= ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1;
132
133	/* Number of words required to hold a bit for every character. */
134	enum
135	{
136	CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
137	};
138
139	/* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
140	typedef struct { charclass_word w[CHARCLASS_WORDS]; } charclass;
141
142	/* Convert a possibly-signed character to an unsigned character. This is
143	a bit safer than casting to unsigned char, since it catches some type
144	errors that the cast doesn't. */
145	static unsigned char
146	to_uchar (char ch)
147	{
148	return ch;
149	}
150
151	/* Contexts tell us whether a character is a newline or a word constituent.
152	Word-constituent characters are those that satisfy iswalnum, plus '_'.
153	Each character has a single CTX_* value; bitmasks of CTX_* values denote
154	a particular character class.
155
156	A state also stores a context value, which is a bitmask of CTX_* values.
157	A state's context represents a set of characters that the state's
158	predecessors must match. For example, a state whose context does not
159	include CTX_LETTER will never have transitions where the previous
160	character is a word constituent. A state whose context is CTX_ANY
161	might have transitions from any character. */
162
163	enum
164	{
165	CTX_NONE = 1,
166	CTX_LETTER = 2,
167	CTX_NEWLINE = 4,
168	CTX_ANY = 7
169	};
170
171	/* Sometimes characters can only be matched depending on the surrounding
172	context. Such context decisions depend on what the previous character
173	was, and the value of the current (lookahead) character. Context
174	dependent constraints are encoded as 9-bit integers. Each bit that
175	is set indicates that the constraint succeeds in the corresponding
176	context.
177
178	bit 6-8 - valid contexts when next character is CTX_NEWLINE
179	bit 3-5 - valid contexts when next character is CTX_LETTER
180	bit 0-2 - valid contexts when next character is CTX_NONE
181
182	succeeds_in_context determines whether a given constraint
183	succeeds in a particular context. Prev is a bitmask of possible
184	context values for the previous character, curr is the (single-bit)
185	context value for the lookahead character. */
186	static int
187	newline_constraint (int constraint)
188	{
189	return (constraint >> 6) & 7;
190	}
191	static int
192	letter_constraint (int constraint)
193	{
194	return (constraint >> 3) & 7;
195	}
196	static int
197	other_constraint (int constraint)
198	{
199	return constraint & 7;
200	}
201
202	static bool
203	succeeds_in_context (int constraint, int prev, int curr)
204	{
205	return !! (((curr & CTX_NONE ? other_constraint (constraint) : 0) \
206	\| (curr & CTX_LETTER ? letter_constraint (constraint) : 0) \
207	\| (curr & CTX_NEWLINE ? newline_constraint (constraint) : 0)) \
208	& prev);
209	}
210
211	/* The following describe what a constraint depends on. */
212	static bool
213	prev_newline_dependent (int constraint)
214	{
215	return ((constraint ^ constraint >> 2) & 0111) != 0;
216	}
217	static bool
218	prev_letter_dependent (int constraint)
219	{
220	return ((constraint ^ constraint >> 1) & 0111) != 0;
221	}
222
223	/* Tokens that match the empty string subject to some constraint actually
224	work by applying that constraint to determine what may follow them,
225	taking into account what has gone before. The following values are
226	the constraints corresponding to the special tokens previously defined. */
227	enum
228	{
229	NO_CONSTRAINT = 0777,
230	BEGLINE_CONSTRAINT = 0444,
231	ENDLINE_CONSTRAINT = 0700,
232	BEGWORD_CONSTRAINT = 0050,
233	ENDWORD_CONSTRAINT = 0202,
234	LIMWORD_CONSTRAINT = 0252,
235	NOTLIMWORD_CONSTRAINT = 0525
236	};
237
238	/* The regexp is parsed into an array of tokens in postfix form. Some tokens
239	are operators and others are terminal symbols. Most (but not all) of these
240	codes are returned by the lexical analyzer. */
241
242	typedef ptrdiff_t token;
243	static token const TOKEN_MAX = PTRDIFF_MAX;
244
245	/* States are indexed by state_num values. These are normally
246	nonnegative but -1 is used as a special value. */
247	typedef ptrdiff_t state_num;
248
249	/* Predefined token values. */
250	enum
251	{
252	END = -1, /* END is a terminal symbol that matches the
253	end of input; any value of END or less in
254	the parse tree is such a symbol. Accepting
255	states of the DFA are those that would have
256	a transition on END. This is -1, not some
257	more-negative value, to tweak the speed of
258	comparisons to END. */
259
260	/* Ordinary character values are terminal symbols that match themselves. */
261
262	/* CSET must come last in the following list of special tokens. Otherwise,
263	the list order matters only for performance. Related special tokens
264	should have nearby values so that code like (t == ANYCHAR \|\| t == MBCSET
265	\|\| CSET <= t) can be done with a single machine-level comparison. */
266
267	EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches
268	the empty string. */
269
270	QMARK, /* QMARK is an operator of one argument that
271	matches zero or one occurrences of its
272	argument. */
273
274	STAR, /* STAR is an operator of one argument that
275	matches the Kleene closure (zero or more
276	occurrences) of its argument. */
277
278	PLUS, /* PLUS is an operator of one argument that
279	matches the positive closure (one or more
280	occurrences) of its argument. */
281
282	REPMN, /* REPMN is a lexical token corresponding
283	to the {m,n} construct. REPMN never
284	appears in the compiled token vector. */
285
286	CAT, /* CAT is an operator of two arguments that
287	matches the concatenation of its
288	arguments. CAT is never returned by the
289	lexical analyzer. */
290
291	OR, /* OR is an operator of two arguments that
292	matches either of its arguments. */
293
294	LPAREN, /* LPAREN never appears in the parse tree,
295	it is only a lexeme. */
296
297	RPAREN, /* RPAREN never appears in the parse tree. */
298
299	#if defined(KMK_GREP) && defined(KBUILD_OS_WINDOWS)
300	# define WCHAR DFA_WCHAR
301	#endif
302	WCHAR, /* Only returned by lex. wctok contains
303	the wide character representation. */
304
305	ANYCHAR, /* ANYCHAR is a terminal symbol that matches
306	a valid multibyte (or single byte) character.
307	It is used only if MB_CUR_MAX > 1. */
308
309	BEG, /* BEG is an initial symbol that matches the
310	beginning of input. */
311
312	BEGLINE, /* BEGLINE is a terminal symbol that matches
313	the empty string at the beginning of a
314	line. */
315
316	ENDLINE, /* ENDLINE is a terminal symbol that matches
317	the empty string at the end of a line. */
318
319	BEGWORD, /* BEGWORD is a terminal symbol that matches
320	the empty string at the beginning of a
321	word. */
322
323	ENDWORD, /* ENDWORD is a terminal symbol that matches
324	the empty string at the end of a word. */
325
326	LIMWORD, /* LIMWORD is a terminal symbol that matches
327	the empty string at the beginning or the
328	end of a word. */
329
330	NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that
331	matches the empty string not at
332	the beginning or end of a word. */
333
334	BACKREF, /* BACKREF is generated by \<digit>
335	or by any other construct that
336	is not completely handled. If the scanner
337	detects a transition on backref, it returns
338	a kind of "semi-success" indicating that
339	the match will have to be verified with
340	a backtracking matcher. */
341
342	MBCSET, /* MBCSET is similar to CSET, but for
343	multibyte characters. */
344
345	CSET /* CSET and (and any value greater) is a
346	terminal symbol that matches any of a
347	class of characters. */
348	};
349
350
351	/* States of the recognizer correspond to sets of positions in the parse
352	tree, together with the constraints under which they may be matched.
353	So a position is encoded as an index into the parse tree together with
354	a constraint. */
355	typedef struct
356	{
357	idx_t index; /* Index into the parse array. */
358	unsigned int constraint; /* Constraint for matching this position. */
359	} position;
360
361	/* Sets of positions are stored as arrays. */
362	typedef struct
363	{
364	position elems; / Elements of this position set. */
365	idx_t nelem; /* Number of elements in this set. */
366	idx_t alloc; /* Number of elements allocated in ELEMS. */
367	} position_set;
368
369	/* A state of the dfa consists of a set of positions, some flags,
370	and the token value of the lowest-numbered position of the state that
371	contains an END token. */
372	typedef struct
373	{
374	size_t hash; /* Hash of the positions of this state. */
375	position_set elems; /* Positions this state could match. */
376	unsigned char context; /* Context from previous state. */
377	unsigned short constraint; /* Constraint for this state to accept. */
378	position_set mbps; /* Positions which can match multibyte
379	characters or the follows, e.g., period.
380	Used only if MB_CUR_MAX > 1. */
381	state_num mb_trindex; /* Index of this state in MB_TRANS, or
382	negative if the state does not have
383	ANYCHAR. */
384	} dfa_state;
385
386	/* Maximum for any transition table count. This should be at least 3,
387	for the initial state setup. */
388	enum { MAX_TRCOUNT = 1024 };
389
390	/* A bracket operator.
391	e.g., [a-c], [[:alpha:]], etc. */
392	struct mb_char_classes
393	{
394	ptrdiff_t cset;
395	bool invert;
396	wchar_t chars; / Normal characters. */
397	idx_t nchars;
398	idx_t nchars_alloc;
399	};
400
401	struct regex_syntax
402	{
403	/* Syntax bits controlling the behavior of the lexical analyzer. */
404	reg_syntax_t syntax_bits;
405	bool syntax_bits_set;
406
407	/* Flag for case-folding letters into sets. */
408	bool case_fold;
409
410	/* True if ^ and $ match only the start and end of data, and do not match
411	end-of-line within data. */
412	bool anchor;
413
414	/* End-of-line byte in data. */
415	unsigned char eolbyte;
416
417	/* Cache of char-context values. */
418	char sbit[NOTCHAR];
419
420	/* If never_trail[B], the byte B cannot be a non-initial byte in a
421	multibyte character. */
422	bool never_trail[NOTCHAR];
423
424	/* Set of characters considered letters. */
425	charclass letters;
426
427	/* Set of characters that are newline. */
428	charclass newline;
429	};
430
431	/* Lexical analyzer. All the dross that deals with the obnoxious
432	GNU Regex syntax bits is located here. The poor, suffering
433	reader is referred to the GNU Regex documentation for the
434	meaning of the @#%!@#%^!@ syntax bits. */
435	struct lexer_state
436	{
437	char const ptr; / Pointer to next input character. */
438	idx_t left; /* Number of characters remaining. */
439	token lasttok; /* Previous token returned; initially END. */
440	idx_t parens; /* Count of outstanding left parens. */
441	int minrep, maxrep; /* Repeat counts for {m,n}. */
442
443	/* Wide character representation of the current multibyte character,
444	or WEOF if there was an encoding error. Used only if
445	MB_CUR_MAX > 1. */
446	wint_t wctok;
447
448	/* The most recently analyzed multibyte bracket expression. */
449	struct mb_char_classes brack;
450
451	/* We're separated from beginning or (, \| only by zero-width characters. */
452	bool laststart;
453	};
454
455	/* Recursive descent parser for regular expressions. */
456
457	struct parser_state
458	{
459	token tok; /* Lookahead token. */
460	idx_t depth; /* Current depth of a hypothetical stack
461	holding deferred productions. This is
462	used to determine the depth that will be
463	required of the real stack later on in
464	dfaanalyze. */
465	};
466
467	/* A compiled regular expression. */
468	struct dfa
469	{
470	/* Fields filled by the scanner. */
471	charclass charclasses; / Array of character sets for CSET tokens. */
472	idx_t cindex; /* Index for adding new charclasses. */
473	idx_t calloc; /* Number of charclasses allocated. */
474	ptrdiff_t canychar; /* Index of anychar class, or -1. */
475
476	/* Scanner state */
477	struct lexer_state lex;
478
479	/* Parser state */
480	struct parser_state parse;
481
482	/* Fields filled by the parser. */
483	token tokens; / Postfix parse array. */
484	idx_t tindex; /* Index for adding new tokens. */
485	idx_t talloc; /* Number of tokens currently allocated. */
486	idx_t depth; /* Depth required of an evaluation stack
487	used for depth-first traversal of the
488	parse tree. */
489	idx_t nleaves; /* Number of non-EMPTY leaves
490	in the parse tree. */
491	idx_t nregexps; /* Count of parallel regexps being built
492	with dfaparse. */
493	bool fast; /* The DFA is fast. */
494	bool epsilon; /* Does a token match only the empty string? */
495	token utf8_anychar_classes[9]; /* To lower ANYCHAR in UTF-8 locales. */
496	mbstate_t mbs; /* Multibyte conversion state. */
497
498	/* The following are valid only if MB_CUR_MAX > 1. */
499
500	/* The value of multibyte_prop[i] is defined by following rule.
501	if tokens[i] < NOTCHAR
502	bit 0 : tokens[i] is the first byte of a character, including
503	single-byte characters.
504	bit 1 : tokens[i] is the last byte of a character, including
505	single-byte characters.
506
507	e.g.
508	tokens
509	= 'single_byte_a', 'multi_byte_A', single_byte_b'
510	= 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
511	multibyte_prop
512	= 3 , 1 , 0 , 2 , 3
513	*/
514	char *multibyte_prop;
515
516	/* Fields filled by the superset. */
517	struct dfa superset; / Hint of the dfa. */
518
519	/* Fields filled by the state builder. */
520	dfa_state states; / States of the dfa. */
521	state_num sindex; /* Index for adding new states. */
522	idx_t salloc; /* Number of states currently allocated. */
523
524	/* Fields filled by the parse tree->NFA conversion. */
525	position_set follows; / Array of follow sets, indexed by position
526	index. The follow of a position is the set
527	of positions containing characters that
528	could conceivably follow a character
529	matching the given position in a string
530	matching the regexp. Allocated to the
531	maximum possible position index. */
532	bool searchflag; /* We are supposed to build a searching
533	as opposed to an exact matcher. A searching
534	matcher finds the first and shortest string
535	matching a regexp anywhere in the buffer,
536	whereas an exact matcher finds the longest
537	string matching, but anchored to the
538	beginning of the buffer. */
539
540	/* Fields filled by dfaanalyze. */
541	int constraints; / Array of union of accepting constraints
542	in the follow of a position. */
543	int separates; / Array of contexts on follow of a
544	position. */
545
546	/* Fields filled by dfaexec. */
547	state_num tralloc; /* Number of transition tables that have
548	slots so far, not counting trans[-1] and
549	trans[-2]. */
550	int trcount; /* Number of transition tables that have
551	been built, other than for initial
552	states. */
553	int min_trcount; /* Number of initial states. Equivalently,
554	the minimum state number for which trcount
555	counts transitions. */
556	state_num *trans; / Transition tables for states that can
557	never accept. If the transitions for a
558	state have not yet been computed, or the
559	state could possibly accept, its entry in
560	this table is NULL. This points to two
561	past the start of the allocated array,
562	and trans[-1] and trans[-2] are always
563	NULL. */
564	state_num *fails; / Transition tables after failing to accept
565	on a state that potentially could do so.
566	If trans[i] is non-null, fails[i] must
567	be null. */
568	char success; / Table of acceptance conditions used in
569	dfaexec and computed in build_state. */
570	state_num newlines; / Transitions on newlines. The entry for a
571	newline in any transition table is always
572	-1 so we can count lines without wasting
573	too many cycles. The transition for a
574	newline is stored separately and handled
575	as a special case. Newline is also used
576	as a sentinel at the end of the buffer. */
577	state_num initstate_notbol; /* Initial state for CTX_LETTER and CTX_NONE
578	context in multibyte locales, in which we
579	do not distinguish between their contexts,
580	as not supported word. */
581	position_set mb_follows; /* Follow set added by ANYCHAR on demand. */
582	state_num *mb_trans; / Transition tables for states with
583	ANYCHAR. */
584	state_num mb_trcount; /* Number of transition tables for states with
585	ANYCHAR that have actually been built. */
586
587	/* Syntax configuration. This is near the end so that dfacopysyntax
588	can memset up to here. */
589	struct regex_syntax syntax;
590
591	/* Information derived from the locale. This is at the end so that
592	a quick memset need not clear it specially. */
593
594	/* dfaexec implementation. */
595	char (dfaexec) (struct dfa , char const , char *,
596	bool, ptrdiff_t , bool );
597
598	/* Other cached information derived from the locale. */
599	struct localeinfo localeinfo;
600	};
601
602	/* User access to dfa internals. */
603
604	/* S could possibly be an accepting state of R. */
605	static bool
606	accepting (state_num s, struct dfa const *r)
607	{
608	return r->states[s].constraint != 0;
609	}
610
611	/* STATE accepts in the specified context. */
612	static bool
613	accepts_in_context (int prev, int curr, state_num state, struct dfa const *dfa)
614	{
615	return succeeds_in_context (dfa->states[state].constraint, prev, curr);
616	}
617
618	static void regexp (struct dfa *dfa);
619
620	/* Store into *PWC the result of converting the leading bytes of the
621	multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
622	and updating the conversion state in *D. On conversion error,
623	convert just a single byte, to WEOF. Return the number of bytes
624	converted.
625
626	This differs from mbrtowc (PWC, S, N, &D->mbs) as follows:
627
628	* PWC points to wint_t, not to wchar_t.
629	* The last arg is a dfa *D instead of merely a multibyte conversion
630	state D->mbs.
631	* N is idx_t not size_t, and must be at least 1.
632	* S[N - 1] must be a sentinel byte.
633	* Shift encodings are not supported.
634	* The return value is always in the range 1..N.
635	* D->mbs is always valid afterwards.
636	* PWC is always set to something. /
637	static int
638	mbs_to_wchar (wint_t pwc, char const s, idx_t n, struct dfa *d)
639	{
640	unsigned char uc = s[0];
641	wint_t wc = d->localeinfo.sbctowc[uc];
642
643	if (wc == WEOF)
644	{
645	wchar_t wch;
646	size_t nbytes = mbrtowc (&wch, s, n, &d->mbs);
647	if (0 < nbytes && nbytes < (size_t) -2)
648	{
649	*pwc = wch;
650	return nbytes;
651	}
652	memset (&d->mbs, 0, sizeof d->mbs);
653	}
654
655	*pwc = wc;
656	return 1;
657	}
658
659	#ifdef DEBUG
660
661	static void
662	prtok (token t)
663	{
664	if (t <= END)
665	fprintf (stderr, "END");
666	else if (0 <= t && t < NOTCHAR)
667	{
668	unsigned int ch = t;
669	fprintf (stderr, "0x%02x", ch);
670	}
671	else
672	{
673	char const *s;
674	switch (t)
675	{
676	case BEG:
677	s = "BEG";
678	break;
679	case EMPTY:
680	s = "EMPTY";
681	break;
682	case BACKREF:
683	s = "BACKREF";
684	break;
685	case BEGLINE:
686	s = "BEGLINE";
687	break;
688	case ENDLINE:
689	s = "ENDLINE";
690	break;
691	case BEGWORD:
692	s = "BEGWORD";
693	break;
694	case ENDWORD:
695	s = "ENDWORD";
696	break;
697	case LIMWORD:
698	s = "LIMWORD";
699	break;
700	case NOTLIMWORD:
701	s = "NOTLIMWORD";
702	break;
703	case QMARK:
704	s = "QMARK";
705	break;
706	case STAR:
707	s = "STAR";
708	break;
709	case PLUS:
710	s = "PLUS";
711	break;
712	case CAT:
713	s = "CAT";
714	break;
715	case OR:
716	s = "OR";
717	break;
718	case LPAREN:
719	s = "LPAREN";
720	break;
721	case RPAREN:
722	s = "RPAREN";
723	break;
724	case ANYCHAR:
725	s = "ANYCHAR";
726	break;
727	case MBCSET:
728	s = "MBCSET";
729	break;
730	default:
731	s = "CSET";
732	break;
733	}
734	fprintf (stderr, "%s", s);
735	}
736	}
737	#endif /* DEBUG */
738
739	/* Stuff pertaining to charclasses. */
740
741	static bool
742	tstbit (unsigned int b, charclass const *c)
743	{
744	return c->w[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
745	}
746
747	static void
748	setbit (unsigned int b, charclass *c)
749	{
750	charclass_word one = 1;
751	c->w[b / CHARCLASS_WORD_BITS] \|= one << b % CHARCLASS_WORD_BITS;
752	}
753
754	static void
755	clrbit (unsigned int b, charclass *c)
756	{
757	charclass_word one = 1;
758	c->w[b / CHARCLASS_WORD_BITS] &= ~(one << b % CHARCLASS_WORD_BITS);
759	}
760
761	static void
762	zeroset (charclass *s)
763	{
764	memset (s, 0, sizeof *s);
765	}
766
767	static void
768	fillset (charclass *s)
769	{
770	for (int i = 0; i < CHARCLASS_WORDS; i++)
771	s->w[i] = CHARCLASS_WORD_MASK;
772	}
773
774	static void
775	notset (charclass *s)
776	{
777	for (int i = 0; i < CHARCLASS_WORDS; ++i)
778	s->w[i] = CHARCLASS_WORD_MASK & ~s->w[i];
779	}
780
781	static bool
782	equal (charclass const s1, charclass const s2)
783	{
784	charclass_word w = 0;
785	for (int i = 0; i < CHARCLASS_WORDS; i++)
786	w \|= s1->w[i] ^ s2->w[i];
787	return w == 0;
788	}
789
790	static bool
791	emptyset (charclass const *s)
792	{
793	charclass_word w = 0;
794	for (int i = 0; i < CHARCLASS_WORDS; i++)
795	w \|= s->w[i];
796	return w == 0;
797	}
798
799	/* Ensure that the array addressed by PA holds at least I + 1 items.
800	Either return PA, or reallocate the array and return its new address.
801	Although PA may be null, the returned value is never null.
802
803	The array holds NITEMS items, where 0 <= I <= NITEMS; *NITEMS
804	is updated on reallocation. If PA is null, *NITEMS must be zero.
805	Do not allocate more than NITEMS_MAX items total; -1 means no limit.
806	ITEM_SIZE is the size of one item; it must be positive.
807	Avoid O(N*2) behavior on arrays growing linearly. /
808	static void *
809	maybe_realloc (void pa, idx_t i, idx_t nitems,
810	ptrdiff_t nitems_max, idx_t item_size)
811	{
812	if (i < *nitems)
813	return pa;
814	return xpalloc (pa, nitems, 1, nitems_max, item_size);
815	}
816
817	/* In DFA D, find the index of charclass S, or allocate a new one. */
818	static idx_t
819	charclass_index (struct dfa d, charclass const s)
820	{
821	idx_t i;
822
823	for (i = 0; i < d->cindex; ++i)
824	if (equal (s, &d->charclasses[i]))
825	return i;
826	d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
827	TOKEN_MAX - CSET, sizeof *d->charclasses);
828	++d->cindex;
829	d->charclasses[i] = *s;
830	return i;
831	}
832
833	static bool
834	unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
835	{
836	return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) \|\| (c) == '_');
837	}
838
839	static int
840	char_context (struct dfa const *dfa, unsigned char c)
841	{
842	if (c == dfa->syntax.eolbyte && !dfa->syntax.anchor)
843	return CTX_NEWLINE;
844	if (unibyte_word_constituent (dfa, c))
845	return CTX_LETTER;
846	return CTX_NONE;
847	}
848
849	/* Set a bit in the charclass for the given wchar_t. Do nothing if WC
850	is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1,
851	this may happen when folding case in weird Turkish locales where
852	dotless i/dotted I are not included in the chosen character set.
853	Return whether a bit was set in the charclass. */
854	static bool
855	setbit_wc (wint_t wc, charclass *c)
856	{
857	int b = wctob (wc);
858	if (b < 0)
859	return false;
860
861	setbit (b, c);
862	return true;
863	}
864
865	/* Set a bit for B and its case variants in the charclass C.
866	MB_CUR_MAX must be 1. */
867	static void
868	setbit_case_fold_c (int b, charclass *c)
869	{
870	int ub = toupper (b);
871	for (int i = 0; i < NOTCHAR; i++)
872	if (toupper (i) == ub)
873	setbit (i, c);
874	}
875
876	/* Fetch the next lexical input character from the pattern. There
877	must at least one byte of pattern input. Set DFA->lex.wctok to the
878	value of the character or to WEOF depending on whether the input is
879	a valid multibyte character (possibly of length 1). Then return
880	the next input byte value, except return EOF if the input is a
881	multibyte character of length greater than 1. */
882	static int
883	fetch_wc (struct dfa *dfa)
884	{
885	int nbytes = mbs_to_wchar (&dfa->lex.wctok, dfa->lex.ptr, dfa->lex.left,
886	dfa);
887	int c = nbytes == 1 ? to_uchar (dfa->lex.ptr[0]) : EOF;
888	dfa->lex.ptr += nbytes;
889	dfa->lex.left -= nbytes;
890	return c;
891	}
892
893	/* If there is no more input, report an error about unbalanced brackets.
894	Otherwise, behave as with fetch_wc (DFA). */
895	static int
896	bracket_fetch_wc (struct dfa *dfa)
897	{
898	if (! dfa->lex.left)
899	dfaerror (_("unbalanced ["));
900	return fetch_wc (dfa);
901	}
902
903	typedef int predicate (int);
904
905	/* The following list maps the names of the Posix named character classes
906	to predicate functions that determine whether a given character is in
907	the class. The leading [ has already been eaten by the lexical
908	analyzer. */
909	struct dfa_ctype
910	{
911	const char *name;
912	predicate *func;
913	bool single_byte_only;
914	};
915
916	static const struct dfa_ctype prednames[] = {
917	{"alpha", isalpha, false},
918	{"upper", isupper, false},
919	{"lower", islower, false},
920	{"digit", isdigit, true},
921	{"xdigit", isxdigit, false},
922	{"space", isspace, false},
923	{"punct", ispunct, false},
924	{"alnum", isalnum, false},
925	{"print", isprint, false},
926	{"graph", isgraph, false},
927	{"cntrl", iscntrl, false},
928	{"blank", isblank, false},
929	{NULL, NULL, false}
930	};
931
932	static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
933	find_pred (const char *str)
934	{
935	for (int i = 0; prednames[i].name; i++)
936	if (streq (str, prednames[i].name))
937	return &prednames[i];
938	return NULL;
939	}
940
941	/* Parse a bracket expression, which possibly includes multibyte
942	characters. */
943	static token
944	parse_bracket_exp (struct dfa *dfa)
945	{
946	/* This is a bracket expression that dfaexec is known to
947	process correctly. */
948	bool known_bracket_exp = true;
949
950	/* Used to warn about [:space:].
951	Bit 0 = first character is a colon.
952	Bit 1 = last character is a colon.
953	Bit 2 = includes any other character but a colon.
954	Bit 3 = includes ranges, char/equiv classes or collation elements. */
955	int colon_warning_state;
956
957	dfa->lex.brack.nchars = 0;
958	charclass ccl;
959	zeroset (&ccl);
960	int c = bracket_fetch_wc (dfa);
961	bool invert = c == '^';
962	if (invert)
963	{
964	c = bracket_fetch_wc (dfa);
965	known_bracket_exp = dfa->localeinfo.simple;
966	}
967	wint_t wc = dfa->lex.wctok;
968	int c1;
969	wint_t wc1;
970	colon_warning_state = (c == ':');
971	do
972	{
973	c1 = NOTCHAR; /* Mark c1 as not initialized. */
974	colon_warning_state &= ~2;
975
976	/* Note that if we're looking at some other [:...:] construct,
977	we just treat it as a bunch of ordinary characters. We can do
978	this because we assume regex has checked for syntax errors before
979	dfa is ever called. */
980	if (c == '[')
981	{
982	c1 = bracket_fetch_wc (dfa);
983	wc1 = dfa->lex.wctok;
984
985	if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES))
986	\|\| c1 == '.' \|\| c1 == '=')
987	{
988	enum { MAX_BRACKET_STRING_LEN = 32 };
989	char str[MAX_BRACKET_STRING_LEN + 1];
990	int len = 0;
991	for (;;)
992	{
993	c = bracket_fetch_wc (dfa);
994	if (dfa->lex.left == 0
995	\|\| (c == c1 && dfa->lex.ptr[0] == ']'))
996	break;
997	if (len < MAX_BRACKET_STRING_LEN)
998	str[len++] = c;
999	else
1000	/* This is in any case an invalid class name. */
1001	str[0] = '\0';
1002	}
1003	str[len] = '\0';
1004
1005	/* Fetch bracket. */
1006	c = bracket_fetch_wc (dfa);
1007	wc = dfa->lex.wctok;
1008	if (c1 == ':')
1009	/* Build character class. POSIX allows character
1010	classes to match multicharacter collating elements,
1011	but the regex code does not support that, so do not
1012	worry about that possibility. */
1013	{
1014	char const *class
1015	= (dfa->syntax.case_fold && (streq (str, "upper")
1016	\|\| streq (str, "lower"))
1017	? "alpha" : str);
1018	const struct dfa_ctype *pred = find_pred (class);
1019	if (!pred)
1020	dfaerror (_("invalid character class"));
1021
1022	if (dfa->localeinfo.multibyte && !pred->single_byte_only)
1023	known_bracket_exp = false;
1024	else
1025	for (int c2 = 0; c2 < NOTCHAR; ++c2)
1026	if (pred->func (c2))
1027	setbit (c2, &ccl);
1028	}
1029	else
1030	known_bracket_exp = false;
1031
1032	colon_warning_state \|= 8;
1033
1034	/* Fetch new lookahead character. */
1035	c1 = bracket_fetch_wc (dfa);
1036	wc1 = dfa->lex.wctok;
1037	continue;
1038	}
1039
1040	/* We treat '[' as a normal character here. c/c1/wc/wc1
1041	are already set up. */
1042	}
1043
1044	if (c == '\\'
1045	&& (dfa->syntax.syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1046	{
1047	c = bracket_fetch_wc (dfa);
1048	wc = dfa->lex.wctok;
1049	}
1050
1051	if (c1 == NOTCHAR)
1052	{
1053	c1 = bracket_fetch_wc (dfa);
1054	wc1 = dfa->lex.wctok;
1055	}
1056
1057	if (c1 == '-')
1058	/* build range characters. */
1059	{
1060	int c2 = bracket_fetch_wc (dfa);
1061	wint_t wc2 = dfa->lex.wctok;
1062
1063	/* A bracket expression like [a-[.aa.]] matches an unknown set.
1064	Treat it like [-a[.aa.]] while parsing it, and
1065	remember that the set is unknown. */
1066	if (c2 == '[' && dfa->lex.ptr[0] == '.')
1067	{
1068	known_bracket_exp = false;
1069	c2 = ']';
1070	}
1071
1072	if (c2 == ']')
1073	{
1074	/* In the case [x-], the - is an ordinary hyphen,
1075	which is left in c1, the lookahead character. */
1076	dfa->lex.ptr--;
1077	dfa->lex.left++;
1078	}
1079	else
1080	{
1081	if (c2 == '\\' && (dfa->syntax.syntax_bits
1082	& RE_BACKSLASH_ESCAPE_IN_LISTS))
1083	{
1084	c2 = bracket_fetch_wc (dfa);
1085	wc2 = dfa->lex.wctok;
1086	}
1087
1088	colon_warning_state \|= 8;
1089	c1 = bracket_fetch_wc (dfa);
1090	wc1 = dfa->lex.wctok;
1091
1092	/* Treat [x-y] as a range if x != y. */
1093	if (wc != wc2 \|\| wc == WEOF)
1094	{
1095	if (dfa->localeinfo.simple
1096	\|\| (isasciidigit (c) & isasciidigit (c2)))
1097	{
1098	for (int ci = c; ci <= c2; ci++)
1099	if (dfa->syntax.case_fold && isalpha (ci))
1100	setbit_case_fold_c (ci, &ccl);
1101	else
1102	setbit (ci, &ccl);
1103	}
1104	else
1105	known_bracket_exp = false;
1106
1107	continue;
1108	}
1109	}
1110	}
1111
1112	colon_warning_state \|= (c == ':') ? 2 : 4;
1113
1114	if (!dfa->localeinfo.multibyte)
1115	{
1116	if (dfa->syntax.case_fold && isalpha (c))
1117	setbit_case_fold_c (c, &ccl);
1118	else
1119	setbit (c, &ccl);
1120	continue;
1121	}
1122
1123	if (wc == WEOF)
1124	known_bracket_exp = false;
1125	else
1126	{
1127	wchar_t folded[CASE_FOLDED_BUFSIZE + 1];
1128	int n = (dfa->syntax.case_fold
1129	? case_folded_counterparts (wc, folded + 1) + 1
1130	: 1);
1131	folded[0] = wc;
1132	for (int i = 0; i < n; i++)
1133	if (!setbit_wc (folded[i], &ccl))
1134	{
1135	dfa->lex.brack.chars
1136	= maybe_realloc (dfa->lex.brack.chars, dfa->lex.brack.nchars,
1137	&dfa->lex.brack.nchars_alloc, -1,
1138	sizeof *dfa->lex.brack.chars);
1139	dfa->lex.brack.chars[dfa->lex.brack.nchars++] = folded[i];
1140	}
1141	}
1142	}
1143	while ((wc = wc1, (c = c1) != ']'));
1144
1145	if (colon_warning_state == 7)
1146	dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
1147
1148	if (! known_bracket_exp)
1149	return BACKREF;
1150
1151	if (dfa->localeinfo.multibyte && (invert \|\| dfa->lex.brack.nchars != 0))
1152	{
1153	dfa->lex.brack.invert = invert;
1154	dfa->lex.brack.cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
1155	return MBCSET;
1156	}
1157
1158	if (invert)
1159	{
1160	notset (&ccl);
1161	if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1162	clrbit ('\n', &ccl);
1163	}
1164
1165	return CSET + charclass_index (dfa, &ccl);
1166	}
1167
1168	struct lexptr
1169	{
1170	char const *ptr;
1171	idx_t left;
1172	};
1173
1174	static void
1175	push_lex_state (struct dfa dfa, struct lexptr ls, char const *s)
1176	{
1177	ls->ptr = dfa->lex.ptr;
1178	ls->left = dfa->lex.left;
1179	dfa->lex.ptr = s;
1180	dfa->lex.left = strlen (s);
1181	}
1182
1183	static void
1184	pop_lex_state (struct dfa dfa, struct lexptr const ls)
1185	{
1186	dfa->lex.ptr = ls->ptr;
1187	dfa->lex.left = ls->left;
1188	}
1189
1190	static token
1191	lex (struct dfa *dfa)
1192	{
1193	bool backslash = false;
1194
1195	/* Basic plan: We fetch a character. If it's a backslash,
1196	we set the backslash flag and go through the loop again.
1197	On the plus side, this avoids having a duplicate of the
1198	main switch inside the backslash case. On the minus side,
1199	it means that just about every case begins with
1200	"if (backslash) ...". */
1201	for (int i = 0; i < 2; ++i)
1202	{
1203	if (! dfa->lex.left)
1204	return dfa->lex.lasttok = END;
1205	int c = fetch_wc (dfa);
1206
1207	switch (c)
1208	{
1209	case '\\':
1210	if (backslash)
1211	goto normal_char;
1212	if (dfa->lex.left == 0)
1213	dfaerror (_("unfinished \\ escape"));
1214	backslash = true;
1215	break;
1216
1217	case '^':
1218	if (backslash)
1219	goto normal_char;
1220	if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1221	\|\| dfa->lex.lasttok == END \|\| dfa->lex.lasttok == LPAREN
1222	\|\| dfa->lex.lasttok == OR)
1223	return dfa->lex.lasttok = BEGLINE;
1224	goto normal_char;
1225
1226	case '$':
1227	if (backslash)
1228	goto normal_char;
1229	if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1230	\|\| dfa->lex.left == 0
1231	\|\| ((dfa->lex.left
1232	> !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
1233	&& (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
1234	& (dfa->lex.ptr[0] == '\\')]
1235	== ')'))
1236	\|\| ((dfa->lex.left
1237	> !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
1238	&& (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
1239	& (dfa->lex.ptr[0] == '\\')]
1240	== '\|'))
1241	\|\| ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
1242	&& dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
1243	return dfa->lex.lasttok = ENDLINE;
1244	goto normal_char;
1245
1246	case '1':
1247	case '2':
1248	case '3':
1249	case '4':
1250	case '5':
1251	case '6':
1252	case '7':
1253	case '8':
1254	case '9':
1255	if (backslash && !(dfa->syntax.syntax_bits & RE_NO_BK_REFS))
1256	{
1257	dfa->lex.laststart = false;
1258	return dfa->lex.lasttok = BACKREF;
1259	}
1260	goto normal_char;
1261
1262	case '`':
1263	if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1264	{
1265	/* FIXME: should be beginning of string */
1266	return dfa->lex.lasttok = BEGLINE;
1267	}
1268	goto normal_char;
1269
1270	case '\'':
1271	if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1272	{
1273	/* FIXME: should be end of string */
1274	return dfa->lex.lasttok = ENDLINE;
1275	}
1276	goto normal_char;
1277
1278	case '<':
1279	if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1280	return dfa->lex.lasttok = BEGWORD;
1281	goto normal_char;
1282
1283	case '>':
1284	if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1285	return dfa->lex.lasttok = ENDWORD;
1286	goto normal_char;
1287
1288	case 'b':
1289	if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1290	return dfa->lex.lasttok = LIMWORD;
1291	goto normal_char;
1292
1293	case 'B':
1294	if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1295	return dfa->lex.lasttok = NOTLIMWORD;
1296	goto normal_char;
1297
1298	case '?':
1299	if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1300	goto normal_char;
1301	if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1302	goto normal_char;
1303	if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1304	&& dfa->lex.laststart)
1305	goto normal_char;
1306	return dfa->lex.lasttok = QMARK;
1307
1308	case '*':
1309	if (backslash)
1310	goto normal_char;
1311	if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1312	&& dfa->lex.laststart)
1313	goto normal_char;
1314	return dfa->lex.lasttok = STAR;
1315
1316	case '+':
1317	if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1318	goto normal_char;
1319	if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1320	goto normal_char;
1321	if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1322	&& dfa->lex.laststart)
1323	goto normal_char;
1324	return dfa->lex.lasttok = PLUS;
1325
1326	case '{':
1327	if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
1328	goto normal_char;
1329	if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
1330	goto normal_char;
1331	if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1332	&& dfa->lex.laststart)
1333	goto normal_char;
1334
1335	/* Cases:
1336	{M} - exact count
1337	{M,} - minimum count, maximum is infinity
1338	{,N} - 0 through N
1339	{,} - 0 to infinity (same as '*')
1340	{M,N} - M through N */
1341	{
1342	char const *p = dfa->lex.ptr;
1343	char const *lim = p + dfa->lex.left;
1344	dfa->lex.minrep = dfa->lex.maxrep = -1;
1345	for (; p != lim && isasciidigit (*p); p++)
1346	dfa->lex.minrep = (dfa->lex.minrep < 0
1347	? *p - '0'
1348	: MIN (RE_DUP_MAX + 1,
1349	dfa->lex.minrep * 10 + *p - '0'));
1350	if (p != lim)
1351	{
1352	if (*p != ',')
1353	dfa->lex.maxrep = dfa->lex.minrep;
1354	else
1355	{
1356	if (dfa->lex.minrep < 0)
1357	dfa->lex.minrep = 0;
1358	while (++p != lim && isasciidigit (*p))
1359	dfa->lex.maxrep
1360	= (dfa->lex.maxrep < 0
1361	? *p - '0'
1362	: MIN (RE_DUP_MAX + 1,
1363	dfa->lex.maxrep * 10 + *p - '0'));
1364	}
1365	}
1366	if (! ((! backslash \|\| (p != lim && *p++ == '\\'))
1367	&& p != lim && *p++ == '}'
1368	&& 0 <= dfa->lex.minrep
1369	&& (dfa->lex.maxrep < 0
1370	\|\| dfa->lex.minrep <= dfa->lex.maxrep)))
1371	{
1372	if (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD)
1373	goto normal_char;
1374	dfaerror (_("invalid content of \\{\\}"));
1375	}
1376	if (RE_DUP_MAX < dfa->lex.maxrep)
1377	dfaerror (_("regular expression too big"));
1378	dfa->lex.ptr = p;
1379	dfa->lex.left = lim - p;
1380	}
1381	dfa->lex.laststart = false;
1382	return dfa->lex.lasttok = REPMN;
1383
1384	case '\|':
1385	if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1386	goto normal_char;
1387	if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
1388	goto normal_char;
1389	dfa->lex.laststart = true;
1390	return dfa->lex.lasttok = OR;
1391
1392	case '\n':
1393	if (dfa->syntax.syntax_bits & RE_LIMITED_OPS
1394	\|\| backslash \|\| !(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
1395	goto normal_char;
1396	dfa->lex.laststart = true;
1397	return dfa->lex.lasttok = OR;
1398
1399	case '(':
1400	if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1401	goto normal_char;
1402	dfa->lex.parens++;
1403	dfa->lex.laststart = true;
1404	return dfa->lex.lasttok = LPAREN;
1405
1406	case ')':
1407	if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1408	goto normal_char;
1409	if (dfa->lex.parens == 0
1410	&& dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1411	goto normal_char;
1412	dfa->lex.parens--;
1413	dfa->lex.laststart = false;
1414	return dfa->lex.lasttok = RPAREN;
1415
1416	case '.':
1417	if (backslash)
1418	goto normal_char;
1419	if (dfa->canychar < 0)
1420	{
1421	charclass ccl;
1422	fillset (&ccl);
1423	if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1424	clrbit ('\n', &ccl);
1425	if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1426	clrbit ('\0', &ccl);
1427	if (dfa->localeinfo.multibyte)
1428	for (int c2 = 0; c2 < NOTCHAR; c2++)
1429	if (dfa->localeinfo.sbctowc[c2] == WEOF)
1430	clrbit (c2, &ccl);
1431	dfa->canychar = charclass_index (dfa, &ccl);
1432	}
1433	dfa->lex.laststart = false;
1434	return dfa->lex.lasttok = (dfa->localeinfo.multibyte
1435	? ANYCHAR
1436	: CSET + dfa->canychar);
1437
1438	case 's':
1439	case 'S':
1440	if (!backslash \|\| (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1441	goto normal_char;
1442	if (!dfa->localeinfo.multibyte)
1443	{
1444	charclass ccl;
1445	zeroset (&ccl);
1446	for (int c2 = 0; c2 < NOTCHAR; ++c2)
1447	if (isspace (c2))
1448	setbit (c2, &ccl);
1449	if (c == 'S')
1450	notset (&ccl);
1451	dfa->lex.laststart = false;
1452	return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1453	}
1454
1455	/* FIXME: see if optimizing this, as is done with ANYCHAR and
1456	add_utf8_anychar, makes sense. */
1457
1458	/* \s and \S are documented to be equivalent to [[:space:]] and
1459	[^[:space:]] respectively, so tell the lexer to process those
1460	strings, each minus its "already processed" '['. */
1461	{
1462	struct lexptr ls;
1463	push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
1464	dfa->lex.lasttok = parse_bracket_exp (dfa);
1465	pop_lex_state (dfa, &ls);
1466	}
1467
1468	dfa->lex.laststart = false;
1469	return dfa->lex.lasttok;
1470
1471	case 'w':
1472	case 'W':
1473	if (!backslash \|\| (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1474	goto normal_char;
1475
1476	if (!dfa->localeinfo.multibyte)
1477	{
1478	charclass ccl;
1479	zeroset (&ccl);
1480	for (int c2 = 0; c2 < NOTCHAR; ++c2)
1481	if (dfa->syntax.sbit[c2] == CTX_LETTER)
1482	setbit (c2, &ccl);
1483	if (c == 'W')
1484	notset (&ccl);
1485	dfa->lex.laststart = false;
1486	return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1487	}
1488
1489	/* FIXME: see if optimizing this, as is done with ANYCHAR and
1490	add_utf8_anychar, makes sense. */
1491
1492	/* \w and \W are documented to be equivalent to [_[:alnum:]] and
1493	[^_[:alnum:]] respectively, so tell the lexer to process those
1494	strings, each minus its "already processed" '['. */
1495	{
1496	struct lexptr ls;
1497	push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
1498	dfa->lex.lasttok = parse_bracket_exp (dfa);
1499	pop_lex_state (dfa, &ls);
1500	}
1501
1502	dfa->lex.laststart = false;
1503	return dfa->lex.lasttok;
1504
1505	case '[':
1506	if (backslash)
1507	goto normal_char;
1508	dfa->lex.laststart = false;
1509	return dfa->lex.lasttok = parse_bracket_exp (dfa);
1510
1511	default:
1512	normal_char:
1513	dfa->lex.laststart = false;
1514	/* For multibyte character sets, folding is done in atom. Always
1515	return WCHAR. */
1516	if (dfa->localeinfo.multibyte)
1517	return dfa->lex.lasttok = WCHAR;
1518
1519	if (dfa->syntax.case_fold && isalpha (c))
1520	{
1521	charclass ccl;
1522	zeroset (&ccl);
1523	setbit_case_fold_c (c, &ccl);
1524	return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1525	}
1526
1527	return dfa->lex.lasttok = c;
1528	}
1529	}
1530
1531	/* The above loop should consume at most a backslash
1532	and some other character. */
1533	abort ();
1534	return END; /* keeps pedantic compilers happy. */
1535	}
1536
1537	static void
1538	addtok_mb (struct dfa *dfa, token t, char mbprop)
1539	{
1540	if (dfa->talloc == dfa->tindex)
1541	{
1542	dfa->tokens = xpalloc (dfa->tokens, &dfa->talloc, 1, -1,
1543	sizeof *dfa->tokens);
1544	if (dfa->localeinfo.multibyte)
1545	dfa->multibyte_prop = xreallocarray (dfa->multibyte_prop, dfa->talloc,
1546	sizeof *dfa->multibyte_prop);
1547	}
1548	if (dfa->localeinfo.multibyte)
1549	dfa->multibyte_prop[dfa->tindex] = mbprop;
1550	dfa->tokens[dfa->tindex++] = t;
1551
1552	switch (t)
1553	{
1554	case QMARK:
1555	case STAR:
1556	case PLUS:
1557	break;
1558
1559	case CAT:
1560	case OR:
1561	dfa->parse.depth--;
1562	break;
1563
1564	case EMPTY:
1565	dfa->epsilon = true;
1566	goto increment_depth;
1567
1568	case BACKREF:
1569	dfa->fast = false;
1570	goto increment_nleaves;
1571
1572	case BEGLINE:
1573	case ENDLINE:
1574	case BEGWORD:
1575	case ENDWORD:
1576	case LIMWORD:
1577	case NOTLIMWORD:
1578	dfa->epsilon = true;
1579	FALLTHROUGH;
1580	default:
1581	increment_nleaves:
1582	dfa->nleaves++;
1583	increment_depth:
1584	dfa->parse.depth++;
1585	if (dfa->depth < dfa->parse.depth)
1586	dfa->depth = dfa->parse.depth;
1587	break;
1588	}
1589	}
1590
1591	static void addtok_wc (struct dfa *dfa, wint_t wc);
1592
1593	/* Add the given token to the parse tree, maintaining the depth count and
1594	updating the maximum depth if necessary. */
1595	static void
1596	addtok (struct dfa *dfa, token t)
1597	{
1598	if (dfa->localeinfo.multibyte && t == MBCSET)
1599	{
1600	bool need_or = false;
1601
1602	/* Extract wide characters into alternations for better performance.
1603	This does not require UTF-8. */
1604	for (idx_t i = 0; i < dfa->lex.brack.nchars; i++)
1605	{
1606	addtok_wc (dfa, dfa->lex.brack.chars[i]);
1607	if (need_or)
1608	addtok (dfa, OR);
1609	need_or = true;
1610	}
1611	dfa->lex.brack.nchars = 0;
1612
1613	/* Wide characters have been handled above, so it is possible
1614	that the set is empty now. Do nothing in that case. */
1615	if (dfa->lex.brack.cset != -1)
1616	{
1617	addtok (dfa, CSET + dfa->lex.brack.cset);
1618	if (need_or)
1619	addtok (dfa, OR);
1620	}
1621	}
1622	else
1623	{
1624	addtok_mb (dfa, t, 3);
1625	}
1626	}
1627
1628	/* We treat a multibyte character as a single atom, so that DFA
1629	can treat a multibyte character as a single expression.
1630
1631	e.g., we construct the following tree from "<mb1><mb2>".
1632	<mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1633	<mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1634	static void
1635	addtok_wc (struct dfa *dfa, wint_t wc)
1636	{
1637	unsigned char buf[MB_LEN_MAX];
1638	mbstate_t s = { 0 };
1639	size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
1640	int buflen;
1641
1642	if (stored_bytes != (size_t) -1)
1643	buflen = stored_bytes;
1644	else
1645	{
1646	/* This is merely stop-gap. buf[0] is undefined, yet skipping
1647	the addtok_mb call altogether can corrupt the heap. */
1648	buflen = 1;
1649	buf[0] = 0;
1650	}
1651
1652	addtok_mb (dfa, buf[0], buflen == 1 ? 3 : 1);
1653	for (int i = 1; i < buflen; i++)
1654	{
1655	addtok_mb (dfa, buf[i], i == buflen - 1 ? 2 : 0);
1656	addtok (dfa, CAT);
1657	}
1658	}
1659
1660	static void
1661	add_utf8_anychar (struct dfa *dfa)
1662	{
1663	/* Since the Unicode Standard Version 4.0.0 (2003), a well-formed
1664	UTF-8 byte sequence has been defined as follows:
1665
1666	([\x00-\x7f]
1667	\|[\xc2-\xdf][\x80-\xbf]
1668	\|[\xe0][\xa0-\xbf][\x80-\xbf]
1669	\|[\xe1-\xec\xee-\xef][\x80-\xbf][\x80-\xbf]
1670	\|[\xed][\x80-\x9f][\x80-\xbf]
1671	\|[\xf0][\x90-\xbf][\x80-\xbf][\x80-\xbf])
1672	\|[\xf1-\xf3][\x80-\xbf][\x80-\xbf][\x80-\xbf]
1673	\|[\xf4][\x80-\x8f][\x80-\xbf][\x80-\xbf])
1674
1675	which I'll write more concisely "A\|BC\|DEC\|FCC\|GHC\|IJCC\|KCCC\|LMCC",
1676	where A = [\x00-\x7f], B = [\xc2-\xdf], C = [\x80-\xbf],
1677	D = [\xe0], E = [\xa0-\xbf], F = [\xe1-\xec\xee-\xef], G = [\xed],
1678	H = [\x80-\x9f], I = [\xf0],
1679	J = [\x90-\xbf], K = [\xf1-\xf3], L = [\xf4], M = [\x80-\x8f].
1680
1681	This can be refactored to "A\|(B\|DE\|GH\|(F\|IJ\|LM\|KC)C)C". */
1682
1683	/* Mnemonics for classes containing two or more bytes. */
1684	enum { A, B, C, E, F, H, J, K, M };
1685
1686	/* Mnemonics for single-byte tokens. */
1687	enum { D_token = 0xe0, G_token = 0xed, I_token = 0xf0, L_token = 0xf4 };
1688
1689	static charclass const utf8_classes[] = {
1690	/* A. 00-7f: 1-byte sequence. */
1691	CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
1692
1693	/* B. c2-df: 1st byte of a 2-byte sequence. */
1694	CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
1695
1696	/* C. 80-bf: non-leading bytes. */
1697	CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
1698
1699	/* D. e0 (just a token). */
1700
1701	/* E. a0-bf: 2nd byte of a "DEC" sequence. */
1702	CHARCLASS_INIT (0, 0, 0, 0, 0, 0xffffffff, 0, 0),
1703
1704	/* F. e1-ec + ee-ef: 1st byte of an "FCC" sequence. */
1705	CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xdffe),
1706
1707	/* G. ed (just a token). */
1708
1709	/* H. 80-9f: 2nd byte of a "GHC" sequence. */
1710	CHARCLASS_INIT (0, 0, 0, 0, 0xffff, 0, 0, 0),
1711
1712	/* I. f0 (just a token). */
1713
1714	/* J. 90-bf: 2nd byte of an "IJCC" sequence. */
1715	CHARCLASS_INIT (0, 0, 0, 0, 0xffff0000, 0xffffffff, 0, 0),
1716
1717	/* K. f1-f3: 1st byte of a "KCCC" sequence. */
1718	CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xe0000),
1719
1720	/* L. f4 (just a token). */
1721
1722	/* M. 80-8f: 2nd byte of a "LMCC" sequence. */
1723	CHARCLASS_INIT (0, 0, 0, 0, 0xff, 0, 0, 0),
1724	};
1725
1726	/* Define the character classes that are needed below. */
1727	if (dfa->utf8_anychar_classes[0] == 0)
1728	{
1729	charclass c = utf8_classes[0];
1730	if (! (dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1731	clrbit ('\n', &c);
1732	if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1733	clrbit ('\0', &c);
1734	dfa->utf8_anychar_classes[0] = CSET + charclass_index (dfa, &c);
1735
1736	for (int i = 1; i < sizeof utf8_classes / sizeof *utf8_classes; i++)
1737	dfa->utf8_anychar_classes[i]
1738	= CSET + charclass_index (dfa, &utf8_classes[i]);
1739	}
1740
1741	/* Implement the "A\|(B\|DE\|GH\|(F\|IJ\|LM\|KC)C)C" pattern mentioned above.
1742	The token buffer is in reverse Polish order, so we get
1743	"A B D E CAT OR G H CAT OR F I J CAT OR L M CAT OR K
1744	C CAT OR C CAT OR C CAT OR". */
1745	addtok (dfa, dfa->utf8_anychar_classes[A]);
1746	addtok (dfa, dfa->utf8_anychar_classes[B]);
1747	addtok (dfa, D_token);
1748	addtok (dfa, dfa->utf8_anychar_classes[E]);
1749	addtok (dfa, CAT);
1750	addtok (dfa, OR);
1751	addtok (dfa, G_token);
1752	addtok (dfa, dfa->utf8_anychar_classes[H]);
1753	addtok (dfa, CAT);
1754	addtok (dfa, OR);
1755	addtok (dfa, dfa->utf8_anychar_classes[F]);
1756	addtok (dfa, I_token);
1757	addtok (dfa, dfa->utf8_anychar_classes[J]);
1758	addtok (dfa, CAT);
1759	addtok (dfa, OR);
1760	addtok (dfa, L_token);
1761	addtok (dfa, dfa->utf8_anychar_classes[M]);
1762	addtok (dfa, CAT);
1763	addtok (dfa, OR);
1764	addtok (dfa, dfa->utf8_anychar_classes[K]);
1765	for (int i = 0; i < 3; i++)
1766	{
1767	addtok (dfa, dfa->utf8_anychar_classes[C]);
1768	addtok (dfa, CAT);
1769	addtok (dfa, OR);
1770	}
1771	}
1772
1773	/* The grammar understood by the parser is as follows.
1774
1775	regexp:
1776	regexp OR branch
1777	branch
1778
1779	branch:
1780	branch closure
1781	closure
1782
1783	closure:
1784	closure QMARK
1785	closure STAR
1786	closure PLUS
1787	closure REPMN
1788	atom
1789
1790	atom:
1791	<normal character>
1792	<multibyte character>
1793	ANYCHAR
1794	MBCSET
1795	CSET
1796	BACKREF
1797	BEGLINE
1798	ENDLINE
1799	BEGWORD
1800	ENDWORD
1801	LIMWORD
1802	NOTLIMWORD
1803	LPAREN regexp RPAREN
1804	<empty>
1805
1806	The parser builds a parse tree in postfix form in an array of tokens. */
1807
1808	static void
1809	atom (struct dfa *dfa)
1810	{
1811	if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
1812	\|\| dfa->parse.tok >= CSET
1813	\|\| dfa->parse.tok == BEG \|\| dfa->parse.tok == BACKREF
1814	\|\| dfa->parse.tok == BEGLINE \|\| dfa->parse.tok == ENDLINE
1815	\|\| dfa->parse.tok == BEGWORD \|\| dfa->parse.tok == ENDWORD
1816	\|\| dfa->parse.tok == LIMWORD \|\| dfa->parse.tok == NOTLIMWORD
1817	\|\| dfa->parse.tok == ANYCHAR \|\| dfa->parse.tok == MBCSET)
1818	{
1819	if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
1820	{
1821	/* For UTF-8 expand the period to a series of CSETs that define a
1822	valid UTF-8 character. This avoids using the slow multibyte
1823	path. I'm pretty sure it would be both profitable and correct to
1824	do it for any encoding; however, the optimization must be done
1825	manually as it is done above in add_utf8_anychar. So, let's
1826	start with UTF-8: it is the most used, and the structure of the
1827	encoding makes the correctness more obvious. */
1828	add_utf8_anychar (dfa);
1829	}
1830	else
1831	addtok (dfa, dfa->parse.tok);
1832	dfa->parse.tok = lex (dfa);
1833	}
1834	else if (dfa->parse.tok == WCHAR)
1835	{
1836	if (dfa->lex.wctok == WEOF)
1837	addtok (dfa, BACKREF);
1838	else
1839	{
1840	addtok_wc (dfa, dfa->lex.wctok);
1841
1842	if (dfa->syntax.case_fold)
1843	{
1844	wchar_t folded[CASE_FOLDED_BUFSIZE];
1845	int n = case_folded_counterparts (dfa->lex.wctok, folded);
1846	for (int i = 0; i < n; i++)
1847	{
1848	addtok_wc (dfa, folded[i]);
1849	addtok (dfa, OR);
1850	}
1851	}
1852	}
1853
1854	dfa->parse.tok = lex (dfa);
1855	}
1856	else if (dfa->parse.tok == LPAREN)
1857	{
1858	dfa->parse.tok = lex (dfa);
1859	regexp (dfa);
1860	if (dfa->parse.tok != RPAREN)
1861	dfaerror (_("unbalanced ("));
1862	dfa->parse.tok = lex (dfa);
1863	}
1864	else
1865	addtok (dfa, EMPTY);
1866	}
1867
1868	/* Return the number of tokens in the given subexpression. */
1869	static idx_t _GL_ATTRIBUTE_PURE
1870	nsubtoks (struct dfa const *dfa, idx_t tindex)
1871	{
1872	switch (dfa->tokens[tindex - 1])
1873	{
1874	default:
1875	return 1;
1876	case QMARK:
1877	case STAR:
1878	case PLUS:
1879	return 1 + nsubtoks (dfa, tindex - 1);
1880	case CAT:
1881	case OR:
1882	{
1883	idx_t ntoks1 = nsubtoks (dfa, tindex - 1);
1884	return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
1885	}
1886	}
1887	}
1888
1889	/* Copy the given subexpression to the top of the tree. */
1890	static void
1891	copytoks (struct dfa *dfa, idx_t tindex, idx_t ntokens)
1892	{
1893	if (dfa->localeinfo.multibyte)
1894	for (idx_t i = 0; i < ntokens; i++)
1895	addtok_mb (dfa, dfa->tokens[tindex + i],
1896	dfa->multibyte_prop[tindex + i]);
1897	else
1898	for (idx_t i = 0; i < ntokens; i++)
1899	addtok_mb (dfa, dfa->tokens[tindex + i], 3);
1900	}
1901
1902	static void
1903	closure (struct dfa *dfa)
1904	{
1905	atom (dfa);
1906	while (dfa->parse.tok == QMARK \|\| dfa->parse.tok == STAR
1907	\|\| dfa->parse.tok == PLUS \|\| dfa->parse.tok == REPMN)
1908	if (dfa->parse.tok == REPMN && (dfa->lex.minrep \|\| dfa->lex.maxrep))
1909	{
1910	idx_t ntokens = nsubtoks (dfa, dfa->tindex);
1911	idx_t tindex = dfa->tindex - ntokens;
1912	if (dfa->lex.maxrep < 0)
1913	addtok (dfa, PLUS);
1914	if (dfa->lex.minrep == 0)
1915	addtok (dfa, QMARK);
1916	int i;
1917	for (i = 1; i < dfa->lex.minrep; i++)
1918	{
1919	copytoks (dfa, tindex, ntokens);
1920	addtok (dfa, CAT);
1921	}
1922	for (; i < dfa->lex.maxrep; i++)
1923	{
1924	copytoks (dfa, tindex, ntokens);
1925	addtok (dfa, QMARK);
1926	addtok (dfa, CAT);
1927	}
1928	dfa->parse.tok = lex (dfa);
1929	}
1930	else if (dfa->parse.tok == REPMN)
1931	{
1932	dfa->tindex -= nsubtoks (dfa, dfa->tindex);
1933	dfa->parse.tok = lex (dfa);
1934	closure (dfa);
1935	}
1936	else
1937	{
1938	addtok (dfa, dfa->parse.tok);
1939	dfa->parse.tok = lex (dfa);
1940	}
1941	}
1942
1943	static void
1944	branch (struct dfa* dfa)
1945	{
1946	closure (dfa);
1947	while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
1948	&& dfa->parse.tok >= 0)
1949	{
1950	closure (dfa);
1951	addtok (dfa, CAT);
1952	}
1953	}
1954
1955	static void
1956	regexp (struct dfa *dfa)
1957	{
1958	branch (dfa);
1959	while (dfa->parse.tok == OR)
1960	{
1961	dfa->parse.tok = lex (dfa);
1962	branch (dfa);
1963	addtok (dfa, OR);
1964	}
1965	}
1966
1967	/* Parse a string S of length LEN into D. S can include NUL characters.
1968	This is the main entry point for the parser. */
1969	void
1970	dfaparse (char const s, idx_t len, struct dfa d)
1971	{
1972	d->lex.ptr = s;
1973	d->lex.left = len;
1974	d->lex.lasttok = END;
1975	d->lex.laststart = true;
1976
1977	if (!d->syntax.syntax_bits_set)
1978	dfaerror (_("no syntax specified"));
1979
1980	if (!d->nregexps)
1981	addtok (d, BEG);
1982
1983	d->parse.tok = lex (d);
1984	d->parse.depth = d->depth;
1985
1986	regexp (d);
1987
1988	if (d->parse.tok != END)
1989	dfaerror (_("unbalanced )"));
1990
1991	addtok (d, END - d->nregexps);
1992	addtok (d, CAT);
1993
1994	if (d->nregexps)
1995	addtok (d, OR);
1996
1997	++d->nregexps;
1998	}
1999
2000	/* Some primitives for operating on sets of positions. */
2001
2002	/* Copy one set to another. */
2003	static void
2004	copy (position_set const src, position_set dst)
2005	{
2006	if (dst->alloc < src->nelem)
2007	{
2008	free (dst->elems);
2009	dst->elems = xpalloc (NULL, &dst->alloc, src->nelem - dst->alloc, -1,
2010	sizeof *dst->elems);
2011	}
2012	dst->nelem = src->nelem;
2013	if (src->nelem != 0)
2014	memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
2015	}
2016
2017	static void
2018	alloc_position_set (position_set *s, idx_t size)
2019	{
2020	s->elems = xnmalloc (size, sizeof *s->elems);
2021	s->alloc = size;
2022	s->nelem = 0;
2023	}
2024
2025	/* Insert position P in set S. S is maintained in sorted order on
2026	decreasing index. If there is already an entry in S with P.index
2027	then merge (logically-OR) P's constraints into the one in S.
2028	S->elems must point to an array large enough to hold the resulting set. */
2029	static void
2030	insert (position p, position_set *s)
2031	{
2032	idx_t count = s->nelem;
2033	idx_t lo = 0, hi = count;
2034	while (lo < hi)
2035	{
2036	idx_t mid = (lo + hi) >> 1;
2037	if (s->elems[mid].index < p.index)
2038	lo = mid + 1;
2039	else if (s->elems[mid].index == p.index)
2040	{
2041	s->elems[mid].constraint \|= p.constraint;
2042	return;
2043	}
2044	else
2045	hi = mid;
2046	}
2047
2048	s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2049	for (idx_t i = count; i > lo; i--)
2050	s->elems[i] = s->elems[i - 1];
2051	s->elems[lo] = p;
2052	++s->nelem;
2053	}
2054
2055	static void
2056	append (position p, position_set *s)
2057	{
2058	idx_t count = s->nelem;
2059	s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2060	s->elems[s->nelem++] = p;
2061	}
2062
2063	/* Merge S1 and S2 (with the additional constraint C2) into M. The
2064	result is as if the positions of S1, and of S2 with the additional
2065	constraint C2, were inserted into an initially empty set. */
2066	static void
2067	merge_constrained (position_set const s1, position_set const s2,
2068	unsigned int c2, position_set *m)
2069	{
2070	idx_t i = 0, j = 0;
2071
2072	if (m->alloc - s1->nelem < s2->nelem)
2073	{
2074	free (m->elems);
2075	m->alloc = s1->nelem;
2076	m->elems = xpalloc (NULL, &m->alloc, s2->nelem, -1, sizeof *m->elems);
2077	}
2078	m->nelem = 0;
2079	while (i < s1->nelem \|\| j < s2->nelem)
2080	if (! (j < s2->nelem)
2081	\|\| (i < s1->nelem && s1->elems[i].index <= s2->elems[j].index))
2082	{
2083	unsigned int c = ((i < s1->nelem && j < s2->nelem
2084	&& s1->elems[i].index == s2->elems[j].index)
2085	? s2->elems[j++].constraint & c2
2086	: 0);
2087	m->elems[m->nelem].index = s1->elems[i].index;
2088	m->elems[m->nelem++].constraint = s1->elems[i++].constraint \| c;
2089	}
2090	else
2091	{
2092	if (s2->elems[j].constraint & c2)
2093	{
2094	m->elems[m->nelem].index = s2->elems[j].index;
2095	m->elems[m->nelem++].constraint = s2->elems[j].constraint & c2;
2096	}
2097	j++;
2098	}
2099	}
2100
2101	/* Merge two sets of positions into a third. The result is exactly as if
2102	the positions of both sets were inserted into an initially empty set. */
2103	static void
2104	merge (position_set const s1, position_set const s2, position_set *m)
2105	{
2106	merge_constrained (s1, s2, -1, m);
2107	}
2108
2109	/* Merge into DST all the elements of SRC, possibly destroying
2110	the contents of the temporary M. */
2111	static void
2112	merge2 (position_set dst, position_set const src, position_set *m)
2113	{
2114	if (src->nelem < 4)
2115	{
2116	for (idx_t i = 0; i < src->nelem; i++)
2117	insert (src->elems[i], dst);
2118	}
2119	else
2120	{
2121	merge (src, dst, m);
2122	copy (m, dst);
2123	}
2124	}
2125
2126	/* Delete a position from a set. Return the nonzero constraint of the
2127	deleted position, or zero if there was no such position. */
2128	static unsigned int
2129	delete (idx_t del, position_set *s)
2130	{
2131	idx_t count = s->nelem;
2132	idx_t lo = 0, hi = count;
2133	while (lo < hi)
2134	{
2135	idx_t mid = (lo + hi) >> 1;
2136	if (s->elems[mid].index < del)
2137	lo = mid + 1;
2138	else if (s->elems[mid].index == del)
2139	{
2140	unsigned int c = s->elems[mid].constraint;
2141	idx_t i;
2142	for (i = mid; i + 1 < count; i++)
2143	s->elems[i] = s->elems[i + 1];
2144	s->nelem = i;
2145	return c;
2146	}
2147	else
2148	hi = mid;
2149	}
2150	return 0;
2151	}
2152
2153	/* Replace a position with the followed set. */
2154	static void
2155	replace (position_set dst, idx_t del, position_set add,
2156	unsigned int constraint, position_set *tmp)
2157	{
2158	unsigned int c = delete (del, dst) & constraint;
2159
2160	if (c)
2161	{
2162	copy (dst, tmp);
2163	merge_constrained (tmp, add, c, dst);
2164	}
2165	}
2166
2167	/* Find the index of the state corresponding to the given position set with
2168	the given preceding context, or create a new state if there is no such
2169	state. Context tells whether we got here on a newline or letter. */
2170	static state_num
2171	state_index (struct dfa d, position_set const s, int context)
2172	{
2173	size_t hash = 0;
2174	int constraint = 0;
2175	state_num i;
2176
2177	for (i = 0; i < s->nelem; ++i)
2178	{
2179	idx_t ind = s->elems[i].index;
2180	hash ^= ind + s->elems[i].constraint;
2181	}
2182
2183	/* Try to find a state that exactly matches the proposed one. */
2184	for (i = 0; i < d->sindex; ++i)
2185	{
2186	if (hash != d->states[i].hash \|\| s->nelem != d->states[i].elems.nelem
2187	\|\| context != d->states[i].context)
2188	continue;
2189	state_num j;
2190	for (j = 0; j < s->nelem; ++j)
2191	if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint
2192	\|\| s->elems[j].index != d->states[i].elems.elems[j].index)
2193	break;
2194	if (j == s->nelem)
2195	return i;
2196	}
2197
2198	#ifdef DEBUG
2199	fprintf (stderr, "new state %td\n nextpos:", i);
2200	for (state_num j = 0; j < s->nelem; j++)
2201	{
2202	fprintf (stderr, " %td:", s->elems[j].index);
2203	prtok (d->tokens[s->elems[j].index]);
2204	}
2205	fprintf (stderr, "\n context:");
2206	if (context ^ CTX_ANY)
2207	{
2208	if (context & CTX_NONE)
2209	fprintf (stderr, " CTX_NONE");
2210	if (context & CTX_LETTER)
2211	fprintf (stderr, " CTX_LETTER");
2212	if (context & CTX_NEWLINE)
2213	fprintf (stderr, " CTX_NEWLINE");
2214	}
2215	else
2216	fprintf (stderr, " CTX_ANY");
2217	fprintf (stderr, "\n");
2218	#endif
2219
2220	for (state_num j = 0; j < s->nelem; j++)
2221	{
2222	int c = d->constraints[s->elems[j].index];
2223
2224	if (c != 0)
2225	{
2226	if (succeeds_in_context (c, context, CTX_ANY))
2227	constraint \|= c;
2228	}
2229	else if (d->tokens[s->elems[j].index] == BACKREF)
2230	constraint = NO_CONSTRAINT;
2231	}
2232
2233
2234	/* Create a new state. */
2235	d->states = maybe_realloc (d->states, d->sindex, &d->salloc, -1,
2236	sizeof *d->states);
2237	d->states[i].hash = hash;
2238	alloc_position_set (&d->states[i].elems, s->nelem);
2239	copy (s, &d->states[i].elems);
2240	d->states[i].context = context;
2241	d->states[i].constraint = constraint;
2242	d->states[i].mbps.nelem = 0;
2243	d->states[i].mbps.elems = NULL;
2244	d->states[i].mb_trindex = -1;
2245
2246	++d->sindex;
2247
2248	return i;
2249	}
2250
2251	/* Find the epsilon closure of D's set of positions. If any position of the set
2252	contains a symbol that matches the empty string in some context, replace
2253	that position with the elements of its follow labeled with an appropriate
2254	constraint. Repeat exhaustively until no funny positions are left.
2255	S->elems must be large enough to hold the result. BACKWARD is D's
2256	backward set; use and update it too. */
2257	static void
2258	epsclosure (struct dfa const d, position_set backward)
2259	{
2260	position_set tmp;
2261	alloc_position_set (&tmp, d->nleaves);
2262	for (idx_t i = 0; i < d->tindex; i++)
2263	if (0 < d->follows[i].nelem)
2264	{
2265	unsigned int constraint;
2266	switch (d->tokens[i])
2267	{
2268	default:
2269	continue;
2270
2271	case BEGLINE:
2272	constraint = BEGLINE_CONSTRAINT;
2273	break;
2274	case ENDLINE:
2275	constraint = ENDLINE_CONSTRAINT;
2276	break;
2277	case BEGWORD:
2278	constraint = BEGWORD_CONSTRAINT;
2279	break;
2280	case ENDWORD:
2281	constraint = ENDWORD_CONSTRAINT;
2282	break;
2283	case LIMWORD:
2284	constraint = LIMWORD_CONSTRAINT;
2285	break;
2286	case NOTLIMWORD:
2287	constraint = NOTLIMWORD_CONSTRAINT;
2288	break;
2289	case EMPTY:
2290	constraint = NO_CONSTRAINT;
2291	break;
2292	}
2293
2294	delete (i, &d->follows[i]);
2295
2296	for (idx_t j = 0; j < backward[i].nelem; j++)
2297	replace (&d->follows[backward[i].elems[j].index], i, &d->follows[i],
2298	constraint, &tmp);
2299	for (idx_t j = 0; j < d->follows[i].nelem; j++)
2300	replace (&backward[d->follows[i].elems[j].index], i, &backward[i],
2301	NO_CONSTRAINT, &tmp);
2302	}
2303	free (tmp.elems);
2304	}
2305
2306	/* Returns the set of contexts for which there is at least one
2307	character included in C. */
2308
2309	static int
2310	charclass_context (struct dfa const dfa, charclass const c)
2311	{
2312	int context = 0;
2313
2314	for (int j = 0; j < CHARCLASS_WORDS; j++)
2315	{
2316	if (c->w[j] & dfa->syntax.newline.w[j])
2317	context \|= CTX_NEWLINE;
2318	if (c->w[j] & dfa->syntax.letters.w[j])
2319	context \|= CTX_LETTER;
2320	if (c->w[j] & ~(dfa->syntax.letters.w[j] \| dfa->syntax.newline.w[j]))
2321	context \|= CTX_NONE;
2322	}
2323
2324	return context;
2325	}
2326
2327	/* Returns the contexts on which the position set S depends. Each context
2328	in the set of returned contexts (let's call it SC) may have a different
2329	follow set than other contexts in SC, and also different from the
2330	follow set of the complement set (sc ^ CTX_ANY). However, all contexts
2331	in the complement set will have the same follow set. */
2332
2333	static int _GL_ATTRIBUTE_PURE
2334	state_separate_contexts (struct dfa d, position_set const s)
2335	{
2336	int separate_contexts = 0;
2337
2338	for (idx_t j = 0; j < s->nelem; j++)
2339	separate_contexts \|= d->separates[s->elems[j].index];
2340
2341	return separate_contexts;
2342	}
2343
2344	enum
2345	{
2346	/* Single token is repeated. It is distinguished from non-repeated. */
2347	OPT_REPEAT = (1 << 0),
2348
2349	/* Multiple tokens are repeated. This flag is on at head of tokens. The
2350	node is not merged. */
2351	OPT_LPAREN = (1 << 1),
2352
2353	/* Multiple branches are joined. The node is not merged. */
2354	OPT_RPAREN = (1 << 2),
2355
2356	/* The node is walked. If the node is found in walking again, OPT_RPAREN
2357	flag is turned on. */
2358	OPT_WALKED = (1 << 3),
2359
2360	/* The node is queued. The node is not queued again. */
2361	OPT_QUEUED = (1 << 4)
2362	};
2363
2364	static void
2365	merge_nfa_state (struct dfa d, idx_t tindex, char flags,
2366	position_set *merged)
2367	{
2368	position_set *follows = d->follows;
2369	idx_t nelem = 0;
2370
2371	for (idx_t i = 0; i < follows[tindex].nelem; i++)
2372	{
2373	idx_t sindex = follows[tindex].elems[i].index;
2374
2375	/* Skip the node as pruned in future. */
2376	unsigned int iconstraint = follows[tindex].elems[i].constraint;
2377	if (iconstraint == 0)
2378	continue;
2379
2380	if (d->tokens[follows[tindex].elems[i].index] <= END)
2381	{
2382	d->constraints[tindex] \|= follows[tindex].elems[i].constraint;
2383	continue;
2384	}
2385
2386	if (sindex != tindex && !(flags[sindex] & (OPT_LPAREN \| OPT_RPAREN)))
2387	{
2388	idx_t j;
2389
2390	for (j = 0; j < nelem; j++)
2391	{
2392	idx_t dindex = follows[tindex].elems[j].index;
2393
2394	if (dindex == tindex)
2395	continue;
2396
2397	if (follows[tindex].elems[j].constraint != iconstraint)
2398	continue;
2399
2400	if (flags[dindex] & (OPT_LPAREN \| OPT_RPAREN))
2401	continue;
2402
2403	if (d->tokens[sindex] != d->tokens[dindex])
2404	continue;
2405
2406	if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT)
2407	continue;
2408
2409	if (flags[sindex] & OPT_REPEAT)
2410	delete (sindex, &follows[sindex]);
2411
2412	merge2 (&follows[dindex], &follows[sindex], merged);
2413
2414	break;
2415	}
2416
2417	if (j < nelem)
2418	continue;
2419	}
2420
2421	follows[tindex].elems[nelem++] = follows[tindex].elems[i];
2422	flags[sindex] \|= OPT_QUEUED;
2423	}
2424
2425	follows[tindex].nelem = nelem;
2426	}
2427
2428	static int
2429	compare (const void a, const void b)
2430	{
2431	position const p = a, q = b;
2432	return (p->index > q->index) - (p->index < q->index);
2433	}
2434
2435	static void
2436	reorder_tokens (struct dfa *d)
2437	{
2438	idx_t nleaves = 0;
2439	ptrdiff_t map = xnmalloc (d->tindex, sizeof map);
2440	map[0] = nleaves++;
2441	for (idx_t i = 1; i < d->tindex; i++)
2442	map[i] = -1;
2443
2444	token tokens = xnmalloc (d->nleaves, sizeof tokens);
2445	position_set follows = xnmalloc (d->nleaves, sizeof follows);
2446	int constraints = xnmalloc (d->nleaves, sizeof constraints);
2447	char *multibyte_prop = (d->localeinfo.multibyte
2448	? xnmalloc (d->nleaves, sizeof *multibyte_prop)
2449	: NULL);
2450
2451	for (idx_t i = 0; i < d->tindex; i++)
2452	{
2453	if (map[i] < 0)
2454	{
2455	free (d->follows[i].elems);
2456	d->follows[i].elems = NULL;
2457	d->follows[i].nelem = 0;
2458	continue;
2459	}
2460
2461	tokens[map[i]] = d->tokens[i];
2462	follows[map[i]] = d->follows[i];
2463	constraints[map[i]] = d->constraints[i];
2464
2465	if (multibyte_prop != NULL)
2466	multibyte_prop[map[i]] = d->multibyte_prop[i];
2467
2468	for (idx_t j = 0; j < d->follows[i].nelem; j++)
2469	{
2470	if (map[d->follows[i].elems[j].index] == -1)
2471	map[d->follows[i].elems[j].index] = nleaves++;
2472
2473	d->follows[i].elems[j].index = map[d->follows[i].elems[j].index];
2474	}
2475
2476	qsort (d->follows[i].elems, d->follows[i].nelem,
2477	sizeof *d->follows[i].elems, compare);
2478	}
2479
2480	for (idx_t i = 0; i < nleaves; i++)
2481	{
2482	d->tokens[i] = tokens[i];
2483	d->follows[i] = follows[i];
2484	d->constraints[i] = constraints[i];
2485
2486	if (multibyte_prop != NULL)
2487	d->multibyte_prop[i] = multibyte_prop[i];
2488	}
2489
2490	d->tindex = d->nleaves = nleaves;
2491
2492	free (tokens);
2493	free (follows);
2494	free (constraints);
2495	free (multibyte_prop);
2496	free (map);
2497	}
2498
2499	static void
2500	dfaoptimize (struct dfa *d)
2501	{
2502	char *flags = xizalloc (d->tindex);
2503
2504	for (idx_t i = 0; i < d->tindex; i++)
2505	{
2506	for (idx_t j = 0; j < d->follows[i].nelem; j++)
2507	{
2508	if (d->follows[i].elems[j].index == i)
2509	flags[d->follows[i].elems[j].index] \|= OPT_REPEAT;
2510	else if (d->follows[i].elems[j].index < i)
2511	flags[d->follows[i].elems[j].index] \|= OPT_LPAREN;
2512	else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED)
2513	flags[d->follows[i].elems[j].index] \|= OPT_RPAREN;
2514	else
2515	flags[d->follows[i].elems[j].index] \|= OPT_WALKED;
2516	}
2517	}
2518
2519	flags[0] \|= OPT_QUEUED;
2520
2521	position_set merged0;
2522	position_set *merged = &merged0;
2523	alloc_position_set (merged, d->nleaves);
2524
2525	d->constraints = xicalloc (d->tindex, sizeof *d->constraints);
2526
2527	for (idx_t i = 0; i < d->tindex; i++)
2528	if (flags[i] & OPT_QUEUED)
2529	merge_nfa_state (d, i, flags, merged);
2530
2531	reorder_tokens (d);
2532
2533	free (merged->elems);
2534	free (flags);
2535	}
2536
2537	/* Perform bottom-up analysis on the parse tree, computing various functions.
2538	Note that at this point, we're pretending constructs like \< are real
2539	characters rather than constraints on what can follow them.
2540
2541	Nullable: A node is nullable if it is at the root of a regexp that can
2542	match the empty string.
2543	* EMPTY leaves are nullable.
2544	* No other leaf is nullable.
2545	* A QMARK or STAR node is nullable.
2546	* A PLUS node is nullable if its argument is nullable.
2547	* A CAT node is nullable if both its arguments are nullable.
2548	* An OR node is nullable if either argument is nullable.
2549
2550	Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
2551	that could correspond to the first character of a string matching the
2552	regexp rooted at the given node.
2553	* EMPTY leaves have empty firstpos.
2554	* The firstpos of a nonempty leaf is that leaf itself.
2555	* The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2556	argument.
2557	* The firstpos of a CAT node is the firstpos of the left argument, union
2558	the firstpos of the right if the left argument is nullable.
2559	* The firstpos of an OR node is the union of firstpos of each argument.
2560
2561	Lastpos: The lastpos of a node is the set of positions that could
2562	correspond to the last character of a string matching the regexp at
2563	the given node.
2564	* EMPTY leaves have empty lastpos.
2565	* The lastpos of a nonempty leaf is that leaf itself.
2566	* The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2567	argument.
2568	* The lastpos of a CAT node is the lastpos of its right argument, union
2569	the lastpos of the left if the right argument is nullable.
2570	* The lastpos of an OR node is the union of the lastpos of each argument.
2571
2572	Follow: The follow of a position is the set of positions that could
2573	correspond to the character following a character matching the node in
2574	a string matching the regexp. At this point we consider special symbols
2575	that match the empty string in some context to be just normal characters.
2576	Later, if we find that a special symbol is in a follow set, we will
2577	replace it with the elements of its follow, labeled with an appropriate
2578	constraint.
2579	* Every node in the firstpos of the argument of a STAR or PLUS node is in
2580	the follow of every node in the lastpos.
2581	* Every node in the firstpos of the second argument of a CAT node is in
2582	the follow of every node in the lastpos of the first argument.
2583
2584	Because of the postfix representation of the parse tree, the depth-first
2585	analysis is conveniently done by a linear scan with the aid of a stack.
2586	Sets are stored as arrays of the elements, obeying a stack-like allocation
2587	scheme; the number of elements in each set deeper in the stack can be
2588	used to determine the address of a particular set's array. */
2589	static void
2590	dfaanalyze (struct dfa *d, bool searchflag)
2591	{
2592	/* Array allocated to hold position sets. */
2593	position posalloc = xnmalloc (d->nleaves, 2 sizeof *posalloc);
2594	/* Firstpos and lastpos elements. */
2595	position *firstpos = posalloc;
2596	position *lastpos = firstpos + d->nleaves;
2597	position pos;
2598	position_set tmp;
2599
2600	/* Stack for element counts and nullable flags. */
2601	struct
2602	{
2603	/* Whether the entry is nullable. */
2604	bool nullable;
2605
2606	/* Counts of firstpos and lastpos sets. */
2607	idx_t nfirstpos;
2608	idx_t nlastpos;
2609	} stkalloc = xnmalloc (d->depth, sizeof stkalloc), *stk = stkalloc;
2610
2611	position_set merged; /* Result of merging sets. */
2612
2613	addtok (d, CAT);
2614	idx_t tindex = d->tindex;
2615
2616	#ifdef DEBUG
2617	fprintf (stderr, "dfaanalyze:\n");
2618	for (idx_t i = 0; i < tindex; i++)
2619	{
2620	fprintf (stderr, " %td:", i);
2621	prtok (d->tokens[i]);
2622	}
2623	putc ('\n', stderr);
2624	#endif
2625
2626	d->searchflag = searchflag;
2627	alloc_position_set (&merged, d->nleaves);
2628	d->follows = xicalloc (tindex, sizeof *d->follows);
2629	position_set *backward
2630	= d->epsilon ? xicalloc (tindex, sizeof *backward) : NULL;
2631
2632	for (idx_t i = 0; i < tindex; i++)
2633	{
2634	switch (d->tokens[i])
2635	{
2636	case EMPTY:
2637	/* The empty set is nullable. */
2638	stk->nullable = true;
2639
2640	/* The firstpos and lastpos of the empty leaf are both empty. */
2641	stk->nfirstpos = stk->nlastpos = 0;
2642	stk++;
2643	break;
2644
2645	case STAR:
2646	case PLUS:
2647	/* Every element in the lastpos of the argument is in the backward
2648	set of every element in the firstpos. */
2649	if (d->epsilon)
2650	{
2651	tmp.elems = lastpos - stk[-1].nlastpos;
2652	tmp.nelem = stk[-1].nlastpos;
2653	for (position *p = firstpos - stk[-1].nfirstpos;
2654	p < firstpos; p++)
2655	merge2 (&backward[p->index], &tmp, &merged);
2656	}
2657
2658	/* Every element in the firstpos of the argument is in the follow
2659	of every element in the lastpos. */
2660	{
2661	tmp.elems = firstpos - stk[-1].nfirstpos;
2662	tmp.nelem = stk[-1].nfirstpos;
2663	for (position *p = lastpos - stk[-1].nlastpos; p < lastpos; p++)
2664	merge2 (&d->follows[p->index], &tmp, &merged);
2665	}
2666	FALLTHROUGH;
2667	case QMARK:
2668	/* A QMARK or STAR node is automatically nullable. */
2669	if (d->tokens[i] != PLUS)
2670	stk[-1].nullable = true;
2671	break;
2672
2673	case CAT:
2674	/* Every element in the lastpos of the first argument is in
2675	the backward set of every element in the firstpos of the
2676	second argument. */
2677	if (backward)
2678	{
2679	tmp.nelem = stk[-2].nlastpos;
2680	tmp.elems = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2681	for (position *p = firstpos - stk[-1].nfirstpos;
2682	p < firstpos; p++)
2683	merge2 (&backward[p->index], &tmp, &merged);
2684	}
2685
2686	/* Every element in the firstpos of the second argument is in the
2687	follow of every element in the lastpos of the first argument. */
2688	{
2689	tmp.nelem = stk[-1].nfirstpos;
2690	tmp.elems = firstpos - stk[-1].nfirstpos;
2691	for (position *plim = lastpos - stk[-1].nlastpos,
2692	*p = plim - stk[-2].nlastpos;
2693	p < plim; p++)
2694	merge2 (&d->follows[p->index], &tmp, &merged);
2695	}
2696
2697	/* The firstpos of a CAT node is the firstpos of the first argument,
2698	union that of the second argument if the first is nullable. */
2699	if (stk[-2].nullable)
2700	stk[-2].nfirstpos += stk[-1].nfirstpos;
2701	else
2702	firstpos -= stk[-1].nfirstpos;
2703
2704	/* The lastpos of a CAT node is the lastpos of the second argument,
2705	union that of the first argument if the second is nullable. */
2706	if (stk[-1].nullable)
2707	stk[-2].nlastpos += stk[-1].nlastpos;
2708	else
2709	{
2710	position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2711	for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2712	p[j] = p[j + stk[-2].nlastpos];
2713	lastpos -= stk[-2].nlastpos;
2714	stk[-2].nlastpos = stk[-1].nlastpos;
2715	}
2716
2717	/* A CAT node is nullable if both arguments are nullable. */
2718	stk[-2].nullable &= stk[-1].nullable;
2719	stk--;
2720	break;
2721
2722	case OR:
2723	/* The firstpos is the union of the firstpos of each argument. */
2724	stk[-2].nfirstpos += stk[-1].nfirstpos;
2725
2726	/* The lastpos is the union of the lastpos of each argument. */
2727	stk[-2].nlastpos += stk[-1].nlastpos;
2728
2729	/* An OR node is nullable if either argument is nullable. */
2730	stk[-2].nullable \|= stk[-1].nullable;
2731	stk--;
2732	break;
2733
2734	default:
2735	/* Anything else is a nonempty position. (Note that special
2736	constructs like \< are treated as nonempty strings here;
2737	an "epsilon closure" effectively makes them nullable later.
2738	Backreferences have to get a real position so we can detect
2739	transitions on them later. But they are nullable. */
2740	stk->nullable = d->tokens[i] == BACKREF;
2741
2742	/* This position is in its own firstpos and lastpos. */
2743	stk->nfirstpos = stk->nlastpos = 1;
2744	stk++;
2745
2746	firstpos->index = lastpos->index = i;
2747	firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2748	firstpos++, lastpos++;
2749
2750	break;
2751	}
2752	#ifdef DEBUG
2753	/* ... balance the above nonsyntactic #ifdef goo... */
2754	fprintf (stderr, "node %td:", i);
2755	prtok (d->tokens[i]);
2756	putc ('\n', stderr);
2757	fprintf (stderr,
2758	stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
2759	fprintf (stderr, " firstpos:");
2760	for (idx_t j = 0; j < stk[-1].nfirstpos; j++)
2761	{
2762	fprintf (stderr, " %td:", firstpos[j - stk[-1].nfirstpos].index);
2763	prtok (d->tokens[firstpos[j - stk[-1].nfirstpos].index]);
2764	}
2765	fprintf (stderr, "\n lastpos:");
2766	for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2767	{
2768	fprintf (stderr, " %td:", lastpos[j - stk[-1].nlastpos].index);
2769	prtok (d->tokens[lastpos[j - stk[-1].nlastpos].index]);
2770	}
2771	putc ('\n', stderr);
2772	#endif
2773	}
2774
2775	if (backward)
2776	{
2777	/* For each follow set that is the follow set of a real position,
2778	replace it with its epsilon closure. */
2779	epsclosure (d, backward);
2780
2781	for (idx_t i = 0; i < tindex; i++)
2782	free (backward[i].elems);
2783	free (backward);
2784	}
2785
2786	dfaoptimize (d);
2787
2788	#ifdef DEBUG
2789	for (idx_t i = 0; i < tindex; i++)
2790	if (d->tokens[i] == BEG \|\| d->tokens[i] < NOTCHAR
2791	\|\| d->tokens[i] == BACKREF \|\| d->tokens[i] == ANYCHAR
2792	\|\| d->tokens[i] == MBCSET \|\| d->tokens[i] >= CSET)
2793	{
2794	fprintf (stderr, "follows(%td:", i);
2795	prtok (d->tokens[i]);
2796	fprintf (stderr, "):");
2797	for (idx_t j = 0; j < d->follows[i].nelem; j++)
2798	{
2799	fprintf (stderr, " %td:", d->follows[i].elems[j].index);
2800	prtok (d->tokens[d->follows[i].elems[j].index]);
2801	}
2802	putc ('\n', stderr);
2803	}
2804	#endif
2805
2806	pos.index = 0;
2807	pos.constraint = NO_CONSTRAINT;
2808
2809	alloc_position_set (&tmp, 1);
2810
2811	append (pos, &tmp);
2812
2813	d->separates = xicalloc (tindex, sizeof *d->separates);
2814
2815	for (idx_t i = 0; i < tindex; i++)
2816	{
2817	if (prev_newline_dependent (d->constraints[i]))
2818	d->separates[i] \|= CTX_NEWLINE;
2819	if (prev_letter_dependent (d->constraints[i]))
2820	d->separates[i] \|= CTX_LETTER;
2821
2822	for (idx_t j = 0; j < d->follows[i].nelem; j++)
2823	{
2824	if (prev_newline_dependent (d->follows[i].elems[j].constraint))
2825	d->separates[i] \|= CTX_NEWLINE;
2826	if (prev_letter_dependent (d->follows[i].elems[j].constraint))
2827	d->separates[i] \|= CTX_LETTER;
2828	}
2829	}
2830
2831	/* Context wanted by some position. */
2832	int separate_contexts = state_separate_contexts (d, &tmp);
2833
2834	/* Build the initial state. */
2835	if (separate_contexts & CTX_NEWLINE)
2836	state_index (d, &tmp, CTX_NEWLINE);
2837	d->initstate_notbol = d->min_trcount
2838	= state_index (d, &tmp, separate_contexts ^ CTX_ANY);
2839	if (separate_contexts & CTX_LETTER)
2840	d->min_trcount = state_index (d, &tmp, CTX_LETTER);
2841	d->min_trcount++;
2842	d->trcount = 0;
2843
2844	free (posalloc);
2845	free (stkalloc);
2846	free (merged.elems);
2847	free (tmp.elems);
2848	}
2849
2850	/* Make sure D's state arrays are large enough to hold NEW_STATE. */
2851	static void
2852	realloc_trans_if_necessary (struct dfa *d)
2853	{
2854	state_num oldalloc = d->tralloc;
2855	if (oldalloc < d->sindex)
2856	{
2857	state_num **realtrans = d->trans ? d->trans - 2 : NULL;
2858	idx_t newalloc1 = realtrans ? d->tralloc + 2 : 0;
2859	realtrans = xpalloc (realtrans, &newalloc1, d->sindex - oldalloc,
2860	-1, sizeof *realtrans);
2861	realtrans[0] = realtrans[1] = NULL;
2862	d->trans = realtrans + 2;
2863	idx_t newalloc = d->tralloc = newalloc1 - 2;
2864	d->fails = xreallocarray (d->fails, newalloc, sizeof *d->fails);
2865	d->success = xreallocarray (d->success, newalloc, sizeof *d->success);
2866	d->newlines = xreallocarray (d->newlines, newalloc, sizeof *d->newlines);
2867	if (d->localeinfo.multibyte)
2868	{
2869	realtrans = d->mb_trans ? d->mb_trans - 2 : NULL;
2870	realtrans = xreallocarray (realtrans, newalloc1, sizeof *realtrans);
2871	if (oldalloc == 0)
2872	realtrans[0] = realtrans[1] = NULL;
2873	d->mb_trans = realtrans + 2;
2874	}
2875	for (; oldalloc < newalloc; oldalloc++)
2876	{
2877	d->trans[oldalloc] = NULL;
2878	d->fails[oldalloc] = NULL;
2879	if (d->localeinfo.multibyte)
2880	d->mb_trans[oldalloc] = NULL;
2881	}
2882	}
2883	}
2884
2885	/*
2886	Calculate the transition table for a new state derived from state s
2887	for a compiled dfa d after input character uc, and return the new
2888	state number.
2889
2890	Do not worry about all possible input characters; calculate just the group
2891	of positions that match uc. Label it with the set of characters that
2892	every position in the group matches (taking into account, if necessary,
2893	preceding context information of s). Then find the union
2894	of these positions' follows, i.e., the set of positions of the
2895	new state. For each character in the group's label, set the transition
2896	on this character to be to a state corresponding to the set's positions,
2897	and its associated backward context information, if necessary.
2898
2899	When building a searching matcher, include the positions of state
2900	0 in every state.
2901
2902	The group is constructed by building an equivalence-class
2903	partition of the positions of s.
2904
2905	For each position, find the set of characters C that it matches. Eliminate
2906	any characters from C that fail on grounds of backward context.
2907
2908	Check whether the group's label L has nonempty
2909	intersection with C. If L - C is nonempty, create a new group labeled
2910	L - C and having the same positions as the current group, and set L to
2911	the intersection of L and C. Insert the position in the group, set
2912	C = C - L, and resume scanning.
2913
2914	If after comparing with every group there are characters remaining in C,
2915	create a new group labeled with the characters of C and insert this
2916	position in that group. */
2917
2918	static state_num
2919	build_state (state_num s, struct dfa *d, unsigned char uc)
2920	{
2921	position_set follows; /* Union of the follows for each
2922	position of the current state. */
2923	position_set group; /* Positions that match the input char. */
2924	position_set tmp; /* Temporary space for merging sets. */
2925	state_num state; /* New state. */
2926	state_num state_newline; /* New state on a newline transition. */
2927	state_num state_letter; /* New state on a letter transition. */
2928
2929	#ifdef DEBUG
2930	fprintf (stderr, "build state %td\n", s);
2931	#endif
2932
2933	/* A pointer to the new transition table, and the table itself. */
2934	state_num **ptrans = (accepting (s, d) ? d->fails : d->trans) + s;
2935	state_num trans = ptrans;
2936
2937	if (!trans)
2938	{
2939	/* MAX_TRCOUNT is an arbitrary upper limit on the number of
2940	transition tables that can exist at once, other than for
2941	initial states. Often-used transition tables are quickly
2942	rebuilt, whereas rarely-used ones are cleared away. */
2943	if (MAX_TRCOUNT <= d->trcount)
2944	{
2945	for (state_num i = d->min_trcount; i < d->tralloc; i++)
2946	{
2947	free (d->trans[i]);
2948	free (d->fails[i]);
2949	d->trans[i] = d->fails[i] = NULL;
2950	}
2951	d->trcount = 0;
2952	}
2953
2954	d->trcount++;
2955	ptrans = trans = xmalloc (NOTCHAR sizeof *trans);
2956
2957	/* Fill transition table with a default value which means that the
2958	transited state has not been calculated yet. */
2959	for (int i = 0; i < NOTCHAR; i++)
2960	trans[i] = -2;
2961	}
2962
2963	/* Set up the success bits for this state. */
2964	d->success[s] = 0;
2965	if (accepts_in_context (d->states[s].context, CTX_NEWLINE, s, d))
2966	d->success[s] \|= CTX_NEWLINE;
2967	if (accepts_in_context (d->states[s].context, CTX_LETTER, s, d))
2968	d->success[s] \|= CTX_LETTER;
2969	if (accepts_in_context (d->states[s].context, CTX_NONE, s, d))
2970	d->success[s] \|= CTX_NONE;
2971
2972	alloc_position_set (&follows, d->nleaves);
2973
2974	/* Find the union of the follows of the positions of the group.
2975	This is a hideously inefficient loop. Fix it someday. */
2976	for (idx_t j = 0; j < d->states[s].elems.nelem; j++)
2977	for (idx_t k = 0;
2978	k < d->follows[d->states[s].elems.elems[j].index].nelem; ++k)
2979	insert (d->follows[d->states[s].elems.elems[j].index].elems[k],
2980	&follows);
2981
2982	/* Positions that match the input char. */
2983	alloc_position_set (&group, d->nleaves);
2984
2985	/* The group's label. */
2986	charclass label;
2987	fillset (&label);
2988
2989	for (idx_t i = 0; i < follows.nelem; i++)
2990	{
2991	charclass matches; /* Set of matching characters. */
2992	position pos = follows.elems[i];
2993	bool matched = false;
2994	if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
2995	{
2996	zeroset (&matches);
2997	setbit (d->tokens[pos.index], &matches);
2998	if (d->tokens[pos.index] == uc)
2999	matched = true;
3000	}
3001	else if (d->tokens[pos.index] >= CSET)
3002	{
3003	matches = d->charclasses[d->tokens[pos.index] - CSET];
3004	if (tstbit (uc, &matches))
3005	matched = true;
3006	}
3007	else if (d->tokens[pos.index] == ANYCHAR)
3008	{
3009	matches = d->charclasses[d->canychar];
3010	if (tstbit (uc, &matches))
3011	matched = true;
3012
3013	/* ANYCHAR must match with a single character, so we must put
3014	it to D->states[s].mbps which contains the positions which
3015	can match with a single character not a byte. If all
3016	positions which has ANYCHAR does not depend on context of
3017	next character, we put the follows instead of it to
3018	D->states[s].mbps to optimize. */
3019	if (succeeds_in_context (pos.constraint, d->states[s].context,
3020	CTX_NONE))
3021	{
3022	if (d->states[s].mbps.nelem == 0)
3023	alloc_position_set (&d->states[s].mbps, 1);
3024	insert (pos, &d->states[s].mbps);
3025	}
3026	}
3027	else
3028	continue;
3029
3030	/* Some characters may need to be eliminated from matches because
3031	they fail in the current context. */
3032	if (pos.constraint != NO_CONSTRAINT)
3033	{
3034	if (!succeeds_in_context (pos.constraint,
3035	d->states[s].context, CTX_NEWLINE))
3036	for (int j = 0; j < CHARCLASS_WORDS; j++)
3037	matches.w[j] &= ~d->syntax.newline.w[j];
3038	if (!succeeds_in_context (pos.constraint,
3039	d->states[s].context, CTX_LETTER))
3040	for (int j = 0; j < CHARCLASS_WORDS; ++j)
3041	matches.w[j] &= ~d->syntax.letters.w[j];
3042	if (!succeeds_in_context (pos.constraint,
3043	d->states[s].context, CTX_NONE))
3044	for (int j = 0; j < CHARCLASS_WORDS; ++j)
3045	matches.w[j] &= d->syntax.letters.w[j] \| d->syntax.newline.w[j];
3046
3047	/* If there are no characters left, there's no point in going on. */
3048	if (emptyset (&matches))
3049	continue;
3050
3051	/* If we have reset the bit that made us declare "matched", reset
3052	that indicator, too. This is required to avoid an infinite loop
3053	with this command: echo cx \| LC_ALL=C grep -E 'c\b[x ]' */
3054	if (!tstbit (uc, &matches))
3055	matched = false;
3056	}
3057
3058	#ifdef DEBUG
3059	fprintf (stderr, " nextpos %td:", pos.index);
3060	prtok (d->tokens[pos.index]);
3061	fprintf (stderr, " of");
3062	for (unsigned j = 0; j < NOTCHAR; j++)
3063	if (tstbit (j, &matches))
3064	fprintf (stderr, " 0x%02x", j);
3065	fprintf (stderr, "\n");
3066	#endif
3067
3068	if (matched)
3069	{
3070	for (int k = 0; k < CHARCLASS_WORDS; ++k)
3071	label.w[k] &= matches.w[k];
3072	append (pos, &group);
3073	}
3074	else
3075	{
3076	for (int k = 0; k < CHARCLASS_WORDS; ++k)
3077	label.w[k] &= ~matches.w[k];
3078	}
3079	}
3080
3081	alloc_position_set (&tmp, d->nleaves);
3082
3083	if (group.nelem > 0)
3084	{
3085	/* If we are building a searching matcher, throw in the positions
3086	of state 0 as well, if possible. */
3087	if (d->searchflag)
3088	{
3089	/* If a token in follows.elems is not 1st byte of a multibyte
3090	character, or the states of follows must accept the bytes
3091	which are not 1st byte of the multibyte character.
3092	Then, if a state of follows encounters a byte, it must not be
3093	a 1st byte of a multibyte character nor a single byte character.
3094	In this case, do not add state[0].follows to next state, because
3095	state[0] must accept 1st-byte.
3096
3097	For example, suppose <sb a> is a certain single byte character,
3098	<mb A> is a certain multibyte character, and the codepoint of
3099	<sb a> equals the 2nd byte of the codepoint of <mb A>. When
3100	state[0] accepts <sb a>, state[i] transits to state[i+1] by
3101	accepting the 1st byte of <mb A>, and state[i+1] accepts the
3102	2nd byte of <mb A>, if state[i+1] encounters the codepoint of
3103	<sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do
3104	not add state[0]. */
3105
3106	bool mergeit = !d->localeinfo.multibyte;
3107	if (!mergeit)
3108	{
3109	mergeit = true;
3110	for (idx_t j = 0; mergeit && j < group.nelem; j++)
3111	mergeit &= d->multibyte_prop[group.elems[j].index];
3112	}
3113	if (mergeit)
3114	merge2 (&group, &d->states[0].elems, &tmp);
3115	}
3116
3117	/* Find out if the new state will want any context information,
3118	by calculating possible contexts that the group can match,
3119	and separate contexts that the new state wants to know. */
3120	int possible_contexts = charclass_context (d, &label);
3121	int separate_contexts = state_separate_contexts (d, &group);
3122
3123	/* Find the state(s) corresponding to the union of the follows. */
3124	if (possible_contexts & ~separate_contexts)
3125	state = state_index (d, &group, separate_contexts ^ CTX_ANY);
3126	else
3127	state = -1;
3128	if (separate_contexts & possible_contexts & CTX_NEWLINE)
3129	state_newline = state_index (d, &group, CTX_NEWLINE);
3130	else
3131	state_newline = state;
3132	if (separate_contexts & possible_contexts & CTX_LETTER)
3133	state_letter = state_index (d, &group, CTX_LETTER);
3134	else
3135	state_letter = state;
3136
3137	/* Reallocate now, to reallocate any newline transition properly. */
3138	realloc_trans_if_necessary (d);
3139	}
3140
3141	/* If we are a searching matcher, the default transition is to a state
3142	containing the positions of state 0, otherwise the default transition
3143	is to fail miserably. */
3144	else if (d->searchflag)
3145	{
3146	state_newline = 0;
3147	state_letter = d->min_trcount - 1;
3148	state = d->initstate_notbol;
3149	}
3150	else
3151	{
3152	state_newline = -1;
3153	state_letter = -1;
3154	state = -1;
3155	}
3156
3157	/* Set the transitions for each character in the label. */
3158	for (int i = 0; i < NOTCHAR; i++)
3159	if (tstbit (i, &label))
3160	switch (d->syntax.sbit[i])
3161	{
3162	case CTX_NEWLINE:
3163	trans[i] = state_newline;
3164	break;
3165	case CTX_LETTER:
3166	trans[i] = state_letter;
3167	break;
3168	default:
3169	trans[i] = state;
3170	break;
3171	}
3172
3173	#ifdef DEBUG
3174	fprintf (stderr, "trans table %td", s);
3175	for (int i = 0; i < NOTCHAR; ++i)
3176	{
3177	if (!(i & 0xf))
3178	fprintf (stderr, "\n");
3179	fprintf (stderr, " %2td", trans[i]);
3180	}
3181	fprintf (stderr, "\n");
3182	#endif
3183
3184	free (group.elems);
3185	free (follows.elems);
3186	free (tmp.elems);
3187
3188	/* Keep the newline transition in a special place so we can use it as
3189	a sentinel. */
3190	if (tstbit (d->syntax.eolbyte, &label))
3191	{
3192	d->newlines[s] = trans[d->syntax.eolbyte];
3193	trans[d->syntax.eolbyte] = -1;
3194	}
3195
3196	return trans[uc];
3197	}
3198
3199	/* Multibyte character handling sub-routines for dfaexec. */
3200
3201	/* Consume a single byte and transit state from 's' to '*next_state'.
3202	This function is almost same as the state transition routin in dfaexec.
3203	But state transition is done just once, otherwise matching succeed or
3204	reach the end of the buffer. */
3205	static state_num
3206	transit_state_singlebyte (struct dfa d, state_num s, unsigned char const *pp)
3207	{
3208	state_num *t;
3209
3210	if (d->trans[s])
3211	t = d->trans[s];
3212	else if (d->fails[s])
3213	t = d->fails[s];
3214	else
3215	{
3216	build_state (s, d, **pp);
3217	if (d->trans[s])
3218	t = d->trans[s];
3219	else
3220	{
3221	t = d->fails[s];
3222	assert (t);
3223	}
3224	}
3225
3226	if (t[**pp] == -2)
3227	build_state (s, d, **pp);
3228
3229	return t[(pp)++];
3230	}
3231
3232	/* Transit state from s, then return new state and update the pointer of
3233	the buffer. This function is for a period operator which can match a
3234	multi-byte character. */
3235	static state_num
3236	transit_state (struct dfa d, state_num s, unsigned char const *pp,
3237	unsigned char const *end)
3238	{
3239	wint_t wc;
3240
3241	int mbclen = mbs_to_wchar (&wc, (char const ) pp, end - *pp, d);
3242
3243	/* This state has some operators which can match a multibyte character. */
3244	d->mb_follows.nelem = 0;
3245
3246	/* Calculate the state which can be reached from the state 's' by
3247	consuming 'mbclen' single bytes from the buffer. */
3248	state_num s1 = s;
3249	int mbci;
3250	for (mbci = 0; mbci < mbclen && (mbci == 0 \|\| d->min_trcount <= s); mbci++)
3251	s = transit_state_singlebyte (d, s, pp);
3252	*pp += mbclen - mbci;
3253
3254	if (wc == WEOF)
3255	{
3256	/* It is an invalid character, so ANYCHAR is not accepted. */
3257	return s;
3258	}
3259
3260	/* If all positions which have ANYCHAR do not depend on the context
3261	of the next character, calculate the next state with
3262	pre-calculated follows and cache the result. */
3263	if (d->states[s1].mb_trindex < 0)
3264	{
3265	if (MAX_TRCOUNT <= d->mb_trcount)
3266	{
3267	state_num s3;
3268	for (s3 = -1; s3 < d->tralloc; s3++)
3269	{
3270	free (d->mb_trans[s3]);
3271	d->mb_trans[s3] = NULL;
3272	}
3273
3274	for (state_num i = 0; i < d->sindex; i++)
3275	d->states[i].mb_trindex = -1;
3276	d->mb_trcount = 0;
3277	}
3278	d->states[s1].mb_trindex = d->mb_trcount++;
3279	}
3280
3281	if (! d->mb_trans[s])
3282	{
3283	enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
3284	enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
3285	d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
3286	for (int i = 0; i < MAX_TRCOUNT; i++)
3287	d->mb_trans[s][i] = -1;
3288	}
3289	else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
3290	return d->mb_trans[s][d->states[s1].mb_trindex];
3291
3292	if (s == -1)
3293	copy (&d->states[s1].mbps, &d->mb_follows);
3294	else
3295	merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
3296
3297	int separate_contexts = state_separate_contexts (d, &d->mb_follows);
3298	state_num s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
3299	realloc_trans_if_necessary (d);
3300
3301	d->mb_trans[s][d->states[s1].mb_trindex] = s2;
3302
3303	return s2;
3304	}
3305
3306	/* The initial state may encounter a byte which is not a single byte character
3307	nor the first byte of a multibyte character. But it is incorrect for the
3308	initial state to accept such a byte. For example, in Shift JIS the regular
3309	expression "\\" accepts the codepoint 0x5c, but should not accept the second
3310	byte of the codepoint 0x815c. Then the initial state must skip the bytes
3311	that are not a single byte character nor the first byte of a multibyte
3312	character.
3313
3314	Given DFA state d, use mbs_to_wchar to advance MBP until it reaches
3315	or exceeds P, and return the advanced MBP. If WCP is non-NULL and
3316	the result is greater than P, set *WCP to the final wide character
3317	processed, or to WEOF if no wide character is processed. Otherwise,
3318	if WCP is non-NULL, *WCP may or may not be updated.
3319
3320	Both P and MBP must be no larger than END. */
3321	static unsigned char const *
3322	skip_remains_mb (struct dfa d, unsigned char const p,
3323	unsigned char const mbp, char const end)
3324	{
3325	if (d->syntax.never_trail[*p])
3326	return p;
3327	while (mbp < p)
3328	{
3329	wint_t wc;
3330	mbp += mbs_to_wchar (&wc, (char const *) mbp,
3331	end - (char const *) mbp, d);
3332	}
3333	return mbp;
3334	}
3335
3336	/* Search through a buffer looking for a match to the struct dfa *D.
3337	Find the first occurrence of a string matching the regexp in the
3338	buffer, and the shortest possible version thereof. Return a pointer to
3339	the first character after the match, or NULL if none is found. BEGIN
3340	points to the beginning of the buffer, and END points to the first byte
3341	after its end. Note however that we store a sentinel byte (usually
3342	newline) in *END, so the actual buffer must be one byte longer.
3343	When ALLOW_NL, newlines may appear in the matching string.
3344	If COUNT is non-NULL, increment *COUNT once for each newline processed.
3345	If MULTIBYTE, the input consists of multibyte characters and/or
3346	encoding-error bytes. Otherwise, it consists of single-byte characters.
3347	Here is the list of features that make this DFA matcher punt:
3348	- [M-N] range in non-simple locale: regex is up to 25% faster on [a-z]
3349	- [^...] in non-simple locale
3350	- [[=foo=]] or [[.foo.]]
3351	- [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
3352	- back-reference: (.)\1
3353	- word-delimiter in multibyte locale: \<, \>, \b, \B
3354	See struct localeinfo.simple for the definition of "simple locale". */
3355
3356	static inline char *
3357	dfaexec_main (struct dfa d, char const begin, char *end, bool allow_nl,
3358	ptrdiff_t *count, bool multibyte)
3359	{
3360	if (MAX_TRCOUNT <= d->sindex)
3361	{
3362	for (state_num s = d->min_trcount; s < d->sindex; s++)
3363	{
3364	free (d->states[s].elems.elems);
3365	free (d->states[s].mbps.elems);
3366	}
3367	d->sindex = d->min_trcount;
3368
3369	if (d->trans)
3370	{
3371	for (state_num s = 0; s < d->tralloc; s++)
3372	{
3373	free (d->trans[s]);
3374	free (d->fails[s]);
3375	d->trans[s] = d->fails[s] = NULL;
3376	}
3377	d->trcount = 0;
3378	}
3379
3380	if (d->localeinfo.multibyte && d->mb_trans)
3381	{
3382	for (state_num s = -1; s < d->tralloc; s++)
3383	{
3384	free (d->mb_trans[s]);
3385	d->mb_trans[s] = NULL;
3386	}
3387	for (state_num s = 0; s < d->min_trcount; s++)
3388	d->states[s].mb_trindex = -1;
3389	d->mb_trcount = 0;
3390	}
3391	}
3392
3393	if (!d->tralloc)
3394	realloc_trans_if_necessary (d);
3395
3396	/* Current state. */
3397	state_num s = 0, s1 = 0;
3398
3399	/* Current input character. */
3400	unsigned char const p = (unsigned char const ) begin;
3401	unsigned char const *mbp = p;
3402
3403	/* Copy of d->trans so it can be optimized into a register. */
3404	state_num **trans = d->trans;
3405	unsigned char eol = d->syntax.eolbyte; /* Likewise for eolbyte. */
3406	unsigned char saved_end = (unsigned char ) end;
3407	*end = eol;
3408
3409	if (multibyte)
3410	{
3411	memset (&d->mbs, 0, sizeof d->mbs);
3412	if (d->mb_follows.alloc == 0)
3413	alloc_position_set (&d->mb_follows, d->nleaves);
3414	}
3415
3416	idx_t nlcount = 0;
3417	for (;;)
3418	{
3419	state_num *t;
3420	while ((t = trans[s]) != NULL)
3421	{
3422	if (s < d->min_trcount)
3423	{
3424	if (!multibyte \|\| d->states[s].mbps.nelem == 0)
3425	{
3426	while (t[*p] == s)
3427	p++;
3428	}
3429	if (multibyte)
3430	p = mbp = skip_remains_mb (d, p, mbp, end);
3431	}
3432
3433	if (multibyte)
3434	{
3435	s1 = s;
3436
3437	if (d->states[s].mbps.nelem == 0
3438	\|\| d->localeinfo.sbctowc[p] != WEOF \|\| (char ) p >= end)
3439	{
3440	/* If an input character does not match ANYCHAR, do it
3441	like a single-byte character. */
3442	s = t[*p++];
3443	}
3444	else
3445	{
3446	s = transit_state (d, s, &p, (unsigned char *) end);
3447	mbp = p;
3448	trans = d->trans;
3449	}
3450	}
3451	else
3452	{
3453	s1 = t[*p++];
3454	t = trans[s1];
3455	if (! t)
3456	{
3457	state_num tmp = s;
3458	s = s1;
3459	s1 = tmp; /* swap */
3460	break;
3461	}
3462	if (s < d->min_trcount)
3463	{
3464	while (t[*p] == s1)
3465	p++;
3466	}
3467	s = t[*p++];
3468	}
3469	}
3470
3471	if (s < 0)
3472	{
3473	if (s == -2)
3474	{
3475	s = build_state (s1, d, p[-1]);
3476	trans = d->trans;
3477	}
3478	else if ((char *) p <= end && p[-1] == eol && 0 <= d->newlines[s1])
3479	{
3480	/* The previous character was a newline. Count it, and skip
3481	checking of multibyte character boundary until here. */
3482	nlcount++;
3483	mbp = p;
3484
3485	s = (allow_nl ? d->newlines[s1]
3486	: d->syntax.sbit[eol] == CTX_NEWLINE ? 0
3487	: d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
3488	: d->initstate_notbol);
3489	}
3490	else
3491	{
3492	p = NULL;
3493	goto done;
3494	}
3495	}
3496	else if (d->fails[s])
3497	{
3498	if ((d->success[s] & d->syntax.sbit[*p])
3499	\|\| ((char *) p == end
3500	&& accepts_in_context (d->states[s].context, CTX_NEWLINE, s,
3501	d)))
3502	goto done;
3503
3504	if (multibyte && s < d->min_trcount)
3505	p = mbp = skip_remains_mb (d, p, mbp, end);
3506
3507	s1 = s;
3508	if (!multibyte \|\| d->states[s].mbps.nelem == 0
3509	\|\| d->localeinfo.sbctowc[p] != WEOF \|\| (char ) p >= end)
3510	{
3511	/* If a input character does not match ANYCHAR, do it
3512	like a single-byte character. */
3513	s = d->fails[s][*p++];
3514	}
3515	else
3516	{
3517	s = transit_state (d, s, &p, (unsigned char *) end);
3518	mbp = p;
3519	trans = d->trans;
3520	}
3521	}
3522	else
3523	{
3524	build_state (s, d, p[0]);
3525	trans = d->trans;
3526	}
3527	}
3528
3529	done:
3530	if (count)
3531	*count += nlcount;
3532	*end = saved_end;
3533	return (char *) p;
3534	}
3535
3536	/* Specialized versions of dfaexec for multibyte and single-byte cases.
3537	This is for performance, as dfaexec_main is an inline function. */
3538
3539	static char *
3540	dfaexec_mb (struct dfa d, char const begin, char *end,
3541	bool allow_nl, ptrdiff_t count, bool backref)
3542	{
3543	return dfaexec_main (d, begin, end, allow_nl, count, true);
3544	}
3545
3546	static char *
3547	dfaexec_sb (struct dfa d, char const begin, char *end,
3548	bool allow_nl, ptrdiff_t count, bool backref)
3549	{
3550	return dfaexec_main (d, begin, end, allow_nl, count, false);
3551	}
3552
3553	/* Always set *BACKREF and return BEGIN. Use this wrapper for
3554	any regexp that uses a construct not supported by this code. */
3555	static char *
3556	dfaexec_noop (struct dfa d, char const begin, char *end,
3557	bool allow_nl, ptrdiff_t count, bool backref)
3558	{
3559	*backref = true;
3560	return (char *) begin;
3561	}
3562
3563	/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
3564	but faster and set *BACKREF if the DFA code does not support this
3565	regexp usage. */
3566
3567	char *
3568	dfaexec (struct dfa d, char const begin, char *end,
3569	bool allow_nl, ptrdiff_t count, bool backref)
3570	{
3571	return d->dfaexec (d, begin, end, allow_nl, count, backref);
3572	}
3573
3574	struct dfa *
3575	dfasuperset (struct dfa const *d)
3576	{
3577	return d->superset;
3578	}
3579
3580	bool
3581	dfaisfast (struct dfa const *d)
3582	{
3583	return d->fast;
3584	}
3585
3586	static void
3587	free_mbdata (struct dfa *d)
3588	{
3589	free (d->multibyte_prop);
3590	free (d->lex.brack.chars);
3591	free (d->mb_follows.elems);
3592
3593	if (d->mb_trans)
3594	{
3595	state_num s;
3596	for (s = -1; s < d->tralloc; s++)
3597	free (d->mb_trans[s]);
3598	free (d->mb_trans - 2);
3599	}
3600	}
3601
3602	/* Return true if every construct in D is supported by this DFA matcher. */
3603	bool
3604	dfasupported (struct dfa const *d)
3605	{
3606	for (idx_t i = 0; i < d->tindex; i++)
3607	{
3608	switch (d->tokens[i])
3609	{
3610	case BEGWORD:
3611	case ENDWORD:
3612	case LIMWORD:
3613	case NOTLIMWORD:
3614	if (!d->localeinfo.multibyte)
3615	continue;
3616	FALLTHROUGH;
3617	case BACKREF:
3618	case MBCSET:
3619	return false;
3620	}
3621	}
3622	return true;
3623	}
3624
3625	/* Disable use of the superset DFA if it is not likely to help
3626	performance. */
3627	static void
3628	maybe_disable_superset_dfa (struct dfa *d)
3629	{
3630	if (!d->localeinfo.using_utf8)
3631	return;
3632
3633	bool have_backref = false;
3634	for (idx_t i = 0; i < d->tindex; i++)
3635	{
3636	switch (d->tokens[i])
3637	{
3638	case ANYCHAR:
3639	/* Lowered. */
3640	abort ();
3641	case BACKREF:
3642	have_backref = true;
3643	break;
3644	case MBCSET:
3645	/* Requires multi-byte algorithm. */
3646	return;
3647	default:
3648	break;
3649	}
3650	}
3651
3652	if (!have_backref && d->superset)
3653	{
3654	/* The superset DFA is not likely to be much faster, so remove it. */
3655	dfafree (d->superset);
3656	free (d->superset);
3657	d->superset = NULL;
3658	}
3659
3660	free_mbdata (d);
3661	d->localeinfo.multibyte = false;
3662	d->dfaexec = dfaexec_sb;
3663	d->fast = true;
3664	}
3665
3666	static void
3667	dfassbuild (struct dfa *d)
3668	{
3669	struct dfa *sup = dfaalloc ();
3670
3671	sup = d;
3672	sup->localeinfo.multibyte = false;
3673	sup->dfaexec = dfaexec_sb;
3674	sup->multibyte_prop = NULL;
3675	sup->superset = NULL;
3676	sup->states = NULL;
3677	sup->sindex = 0;
3678	sup->constraints = NULL;
3679	sup->separates = NULL;
3680	sup->follows = NULL;
3681	sup->tralloc = 0;
3682	sup->trans = NULL;
3683	sup->fails = NULL;
3684	sup->success = NULL;
3685	sup->newlines = NULL;
3686
3687	sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
3688	if (d->cindex)
3689	{
3690	memcpy (sup->charclasses, d->charclasses,
3691	d->cindex * sizeof *sup->charclasses);
3692	}
3693
3694	sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
3695	sup->talloc = d->tindex * 2;
3696
3697	bool have_achar = false;
3698	bool have_nchar = false;
3699	idx_t j;
3700	for (idx_t i = j = 0; i < d->tindex; i++)
3701	{
3702	switch (d->tokens[i])
3703	{
3704	case ANYCHAR:
3705	case MBCSET:
3706	case BACKREF:
3707	{
3708	charclass ccl;
3709	fillset (&ccl);
3710	sup->tokens[j++] = CSET + charclass_index (sup, &ccl);
3711	sup->tokens[j++] = STAR;
3712	if (d->tokens[i + 1] == QMARK \|\| d->tokens[i + 1] == STAR
3713	\|\| d->tokens[i + 1] == PLUS)
3714	i++;
3715	have_achar = true;
3716	}
3717	break;
3718	case BEGWORD:
3719	case ENDWORD:
3720	case LIMWORD:
3721	case NOTLIMWORD:
3722	if (d->localeinfo.multibyte)
3723	{
3724	/* These constraints aren't supported in a multibyte locale.
3725	Ignore them in the superset DFA. */
3726	sup->tokens[j++] = EMPTY;
3727	break;
3728	}
3729	FALLTHROUGH;
3730	default:
3731	sup->tokens[j++] = d->tokens[i];
3732	if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
3733	\|\| d->tokens[i] >= CSET)
3734	have_nchar = true;
3735	break;
3736	}
3737	}
3738	sup->tindex = j;
3739
3740	if (have_nchar && (have_achar \|\| d->localeinfo.multibyte))
3741	d->superset = sup;
3742	else
3743	{
3744	dfafree (sup);
3745	free (sup);
3746	}
3747	}
3748
3749	/* Parse a string S of length LEN into D (but skip this step if S is null).
3750	Then analyze D and build a matcher for it.
3751	SEARCHFLAG says whether to build a searching or an exact matcher. */
3752	void
3753	dfacomp (char const s, idx_t len, struct dfa d, bool searchflag)
3754	{
3755	if (s != NULL)
3756	dfaparse (s, len, d);
3757
3758	dfassbuild (d);
3759
3760	if (dfasupported (d))
3761	{
3762	maybe_disable_superset_dfa (d);
3763	dfaanalyze (d, searchflag);
3764	}
3765	else
3766	{
3767	d->dfaexec = dfaexec_noop;
3768	}
3769
3770	if (d->superset)
3771	{
3772	d->fast = true;
3773	dfaanalyze (d->superset, searchflag);
3774	}
3775	}
3776
3777	/* Free the storage held by the components of a dfa. */
3778	void
3779	dfafree (struct dfa *d)
3780	{
3781	free (d->charclasses);
3782	free (d->tokens);
3783
3784	if (d->localeinfo.multibyte)
3785	free_mbdata (d);
3786
3787	free (d->constraints);
3788	free (d->separates);
3789
3790	for (idx_t i = 0; i < d->sindex; i++)
3791	{
3792	free (d->states[i].elems.elems);
3793	free (d->states[i].mbps.elems);
3794	}
3795	free (d->states);
3796
3797	if (d->follows)
3798	{
3799	for (idx_t i = 0; i < d->tindex; i++)
3800	free (d->follows[i].elems);
3801	free (d->follows);
3802	}
3803
3804	if (d->trans)
3805	{
3806	for (idx_t i = 0; i < d->tralloc; i++)
3807	{
3808	free (d->trans[i]);
3809	free (d->fails[i]);
3810	}
3811
3812	free (d->trans - 2);
3813	free (d->fails);
3814	free (d->newlines);
3815	free (d->success);
3816	}
3817
3818	if (d->superset)
3819	{
3820	dfafree (d->superset);
3821	free (d->superset);
3822	}
3823	}
3824
3825	/* Having found the postfix representation of the regular expression,
3826	try to find a long sequence of characters that must appear in any line
3827	containing the r.e.
3828	Finding a "longest" sequence is beyond the scope here;
3829	we take an easy way out and hope for the best.
3830	(Take "(ab\|a)b"--please.)
3831
3832	We do a bottom-up calculation of sequences of characters that must appear
3833	in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3834	representation:
3835	sequences that must appear at the left of the match ("left")
3836	sequences that must appear at the right of the match ("right")
3837	lists of sequences that must appear somewhere in the match ("in")
3838	sequences that must constitute the match ("is")
3839
3840	When we get to the root of the tree, we use one of the longest of its
3841	calculated "in" sequences as our answer.
3842
3843	The sequences calculated for the various types of node (in pseudo ANSI c)
3844	are shown below. "p" is the operand of unary operators (and the left-hand
3845	operand of binary operators); "q" is the right-hand operand of binary
3846	operators.
3847
3848	"ZERO" means "a zero-length sequence" below.
3849
3850	Type left right is in
3851	---- ---- ----- -- --
3852	char c # c # c # c # c
3853
3854	ANYCHAR ZERO ZERO ZERO ZERO
3855
3856	MBCSET ZERO ZERO ZERO ZERO
3857
3858	CSET ZERO ZERO ZERO ZERO
3859
3860	STAR ZERO ZERO ZERO ZERO
3861
3862	QMARK ZERO ZERO ZERO ZERO
3863
3864	PLUS p->left p->right ZERO p->in
3865
3866	CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
3867	p->left : q->right : q->is!=ZERO) ? q->in plus
3868	p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
3869	ZERO
3870
3871	OR longest common longest common (do p->is and substrings common
3872	leading trailing to q->is have same p->in and
3873	(sub)sequence (sub)sequence q->in length and content) ?
3874	of p->left of p->right
3875	and q->left and q->right p->is : NULL
3876
3877	If there's anything else we recognize in the tree, all four sequences get set
3878	to zero-length sequences. If there's something we don't recognize in the
3879	tree, we just return a zero-length sequence.
3880
3881	Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3882	'aaa')?
3883
3884	And ... is it here or someplace that we might ponder "optimizations" such as
3885	egrep 'psi\|epsilon' -> egrep 'psi'
3886	egrep 'pepsi\|epsilon' -> egrep 'epsi'
3887	(Yes, we now find "epsi" as a "string
3888	that must occur", but we might also
3889	simplify the entire r.e. being sought)
3890	grep '[c]' -> grep 'c'
3891	grep '(ab\|a)b' -> grep 'ab'
3892	grep 'ab*' -> grep 'a'
3893	grep 'a*b' -> grep 'b'
3894
3895	There are several issues:
3896
3897	Is optimization easy (enough)?
3898
3899	Does optimization actually accomplish anything,
3900	or is the automaton you get from "psi\|epsilon" (for example)
3901	the same as the one you get from "psi" (for example)?
3902
3903	Are optimizable r.e.'s likely to be used in real-life situations
3904	(something like 'ab*' is probably unlikely; something like is
3905	'psi\|epsilon' is likelier)? */
3906
3907	static char *
3908	icatalloc (char old, char const new)
3909	{
3910	idx_t newsize = strlen (new);
3911	if (newsize == 0)
3912	return old;
3913	idx_t oldsize = strlen (old);
3914	char *result = xirealloc (old, oldsize + newsize + 1);
3915	memcpy (result + oldsize, new, newsize + 1);
3916	return result;
3917	}
3918
3919	static void
3920	freelist (char **cpp)
3921	{
3922	while (*cpp)
3923	free (*cpp++);
3924	}
3925
3926	static char **
3927	enlistnew (char *cpp, char new)
3928	{
3929	/* Is there already something in the list that's new (or longer)? */
3930	idx_t i;
3931	for (i = 0; cpp[i] != NULL; i++)
3932	if (strstr (cpp[i], new) != NULL)
3933	{
3934	free (new);
3935	return cpp;
3936	}
3937	/* Eliminate any obsoleted strings. */
3938	for (idx_t j = 0; cpp[j] != NULL; )
3939	if (strstr (new, cpp[j]) == NULL)
3940	++j;
3941	else
3942	{
3943	free (cpp[j]);
3944	if (--i == j)
3945	break;
3946	cpp[j] = cpp[i];
3947	cpp[i] = NULL;
3948	}
3949	/* Add the new string. */
3950	cpp = xreallocarray (cpp, i + 2, sizeof *cpp);
3951	cpp[i] = new;
3952	cpp[i + 1] = NULL;
3953	return cpp;
3954	}
3955
3956	static char **
3957	enlist (char *cpp, char const str, idx_t len)
3958	{
3959	return enlistnew (cpp, ximemdup0 (str, len));
3960	}
3961
3962	/* Given pointers to two strings, return a pointer to an allocated
3963	list of their distinct common substrings. */
3964	static char **
3965	comsubs (char left, char const right)
3966	{
3967	char *cpp = xzalloc (sizeof cpp);
3968
3969	for (char lcp = left; lcp != '\0'; lcp++)
3970	{
3971	idx_t len = 0;
3972	char rcp = strchr (right, lcp);
3973	while (rcp != NULL)
3974	{
3975	idx_t i;
3976	for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3977	continue;
3978	if (i > len)
3979	len = i;
3980	rcp = strchr (rcp + 1, *lcp);
3981	}
3982	if (len != 0)
3983	cpp = enlist (cpp, lcp, len);
3984	}
3985	return cpp;
3986	}
3987
3988	static char **
3989	addlists (char old, char new)
3990	{
3991	for (; *new; new++)
3992	old = enlistnew (old, xstrdup (*new));
3993	return old;
3994	}
3995
3996	/* Given two lists of substrings, return a new list giving substrings
3997	common to both. */
3998	static char **
3999	inboth (char left, char right)
4000	{
4001	char *both = xzalloc (sizeof both);
4002
4003	for (idx_t lnum = 0; left[lnum] != NULL; lnum++)
4004	{
4005	for (idx_t rnum = 0; right[rnum] != NULL; rnum++)
4006	{
4007	char **temp = comsubs (left[lnum], right[rnum]);
4008	both = addlists (both, temp);
4009	freelist (temp);
4010	free (temp);
4011	}
4012	}
4013	return both;
4014	}
4015
4016	typedef struct must must;
4017
4018	struct must
4019	{
4020	char **in;
4021	char *left;
4022	char *right;
4023	char *is;
4024	bool begline;
4025	bool endline;
4026	must *prev;
4027	};
4028
4029	static must *
4030	allocmust (must *mp, idx_t size)
4031	{
4032	must new_mp = xmalloc (sizeof new_mp);
4033	new_mp->in = xzalloc (sizeof *new_mp->in);
4034	new_mp->left = xizalloc (size);
4035	new_mp->right = xizalloc (size);
4036	new_mp->is = xizalloc (size);
4037	new_mp->begline = false;
4038	new_mp->endline = false;
4039	new_mp->prev = mp;
4040	return new_mp;
4041	}
4042
4043	static void
4044	resetmust (must *mp)
4045	{
4046	freelist (mp->in);
4047	mp->in[0] = NULL;
4048	mp->left[0] = mp->right[0] = mp->is[0] = '\0';
4049	mp->begline = false;
4050	mp->endline = false;
4051	}
4052
4053	static void
4054	freemust (must *mp)
4055	{
4056	freelist (mp->in);
4057	free (mp->in);
4058	free (mp->left);
4059	free (mp->right);
4060	free (mp->is);
4061	free (mp);
4062	}
4063
4064	struct dfamust *
4065	dfamust (struct dfa const *d)
4066	{
4067	must *mp = NULL;
4068	char const *result = "";
4069	bool exact = false;
4070	bool begline = false;
4071	bool endline = false;
4072	bool need_begline = false;
4073	bool need_endline = false;
4074	bool case_fold_unibyte = d->syntax.case_fold & !d->localeinfo.multibyte;
4075
4076	for (idx_t ri = 1; ri + 1 < d->tindex; ri++)
4077	{
4078	token t = d->tokens[ri];
4079	switch (t)
4080	{
4081	case BEGLINE:
4082	mp = allocmust (mp, 2);
4083	mp->begline = true;
4084	need_begline = true;
4085	break;
4086	case ENDLINE:
4087	mp = allocmust (mp, 2);
4088	mp->endline = true;
4089	need_endline = true;
4090	break;
4091	case LPAREN:
4092	case RPAREN:
4093	assert (!"neither LPAREN nor RPAREN may appear here");
4094
4095	case EMPTY:
4096	case BEGWORD:
4097	case ENDWORD:
4098	case LIMWORD:
4099	case NOTLIMWORD:
4100	case BACKREF:
4101	case ANYCHAR:
4102	case MBCSET:
4103	mp = allocmust (mp, 2);
4104	break;
4105
4106	case STAR:
4107	case QMARK:
4108	assume_nonnull (mp);
4109	resetmust (mp);
4110	break;
4111
4112	case OR:
4113	{
4114	char **new;
4115	must *rmp = mp;
4116	assume_nonnull (rmp);
4117	must *lmp = mp = mp->prev;
4118	assume_nonnull (lmp);
4119	idx_t j, ln, rn, n;
4120
4121	/* Guaranteed to be. Unlikely, but ... */
4122	if (streq (lmp->is, rmp->is))
4123	{
4124	lmp->begline &= rmp->begline;
4125	lmp->endline &= rmp->endline;
4126	}
4127	else
4128	{
4129	lmp->is[0] = '\0';
4130	lmp->begline = false;
4131	lmp->endline = false;
4132	}
4133	/* Left side--easy */
4134	idx_t i = 0;
4135	while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
4136	++i;
4137	lmp->left[i] = '\0';
4138	/* Right side */
4139	ln = strlen (lmp->right);
4140	rn = strlen (rmp->right);
4141	n = ln;
4142	if (n > rn)
4143	n = rn;
4144	for (i = 0; i < n; ++i)
4145	if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
4146	break;
4147	for (j = 0; j < i; ++j)
4148	lmp->right[j] = lmp->right[(ln - i) + j];
4149	lmp->right[j] = '\0';
4150	new = inboth (lmp->in, rmp->in);
4151	freelist (lmp->in);
4152	free (lmp->in);
4153	lmp->in = new;
4154	freemust (rmp);
4155	}
4156	break;
4157
4158	case PLUS:
4159	assume_nonnull (mp);
4160	mp->is[0] = '\0';
4161	break;
4162
4163	case END:
4164	assume_nonnull (mp);
4165	assert (!mp->prev);
4166	for (idx_t i = 0; mp->in[i] != NULL; i++)
4167	if (strlen (mp->in[i]) > strlen (result))
4168	result = mp->in[i];
4169	if (streq (result, mp->is))
4170	{
4171	if ((!need_begline \|\| mp->begline) && (!need_endline
4172	\|\| mp->endline))
4173	exact = true;
4174	begline = mp->begline;
4175	endline = mp->endline;
4176	}
4177	goto done;
4178
4179	case CAT:
4180	{
4181	must *rmp = mp;
4182	assume_nonnull (rmp);
4183	must *lmp = mp = mp->prev;
4184	assume_nonnull (lmp);
4185
4186	/* In. Everything in left, plus everything in
4187	right, plus concatenation of
4188	left's right and right's left. */
4189	lmp->in = addlists (lmp->in, rmp->in);
4190	if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
4191	{
4192	idx_t lrlen = strlen (lmp->right);
4193	idx_t rllen = strlen (rmp->left);
4194	char *tp = ximalloc (lrlen + rllen + 1);
4195	memcpy (tp + lrlen, rmp->left, rllen + 1);
4196	memcpy (tp, lmp->right, lrlen);
4197	lmp->in = enlistnew (lmp->in, tp);
4198	}
4199	/* Left-hand */
4200	if (lmp->is[0] != '\0')
4201	lmp->left = icatalloc (lmp->left, rmp->left);
4202	/* Right-hand */
4203	if (rmp->is[0] == '\0')
4204	lmp->right[0] = '\0';
4205	lmp->right = icatalloc (lmp->right, rmp->right);
4206	/* Guaranteed to be */
4207	if ((lmp->is[0] != '\0' \|\| lmp->begline)
4208	&& (rmp->is[0] != '\0' \|\| rmp->endline))
4209	{
4210	lmp->is = icatalloc (lmp->is, rmp->is);
4211	lmp->endline = rmp->endline;
4212	}
4213	else
4214	{
4215	lmp->is[0] = '\0';
4216	lmp->begline = false;
4217	lmp->endline = false;
4218	}
4219	freemust (rmp);
4220	}
4221	break;
4222
4223	case '\0':
4224	/* Not on my shift. */
4225	goto done;
4226
4227	default:
4228	if (CSET <= t)
4229	{
4230	/* If T is a singleton, or if case-folding in a unibyte
4231	locale and T's members all case-fold to the same char,
4232	convert T to one of its members. Otherwise, do
4233	nothing further with T. */
4234	charclass *ccl = &d->charclasses[t - CSET];
4235	int j;
4236	for (j = 0; j < NOTCHAR; j++)
4237	if (tstbit (j, ccl))
4238	break;
4239	if (! (j < NOTCHAR))
4240	{
4241	mp = allocmust (mp, 2);
4242	break;
4243	}
4244	t = j;
4245	while (++j < NOTCHAR)
4246	if (tstbit (j, ccl)
4247	&& ! (case_fold_unibyte
4248	&& toupper (j) == toupper (t)))
4249	break;
4250	if (j < NOTCHAR)
4251	{
4252	mp = allocmust (mp, 2);
4253	break;
4254	}
4255	}
4256
4257	idx_t rj = ri + 2;
4258	if (d->tokens[ri + 1] == CAT)
4259	{
4260	for (; rj < d->tindex - 1; rj += 2)
4261	{
4262	if ((rj != ri && (d->tokens[rj] <= 0
4263	\|\| NOTCHAR <= d->tokens[rj]))
4264	\|\| d->tokens[rj + 1] != CAT)
4265	break;
4266	}
4267	}
4268	mp = allocmust (mp, ((rj - ri) >> 1) + 1);
4269	mp->is[0] = mp->left[0] = mp->right[0]
4270	= case_fold_unibyte ? toupper (t) : t;
4271
4272	idx_t i;
4273	for (i = 1; ri + 2 < rj; i++)
4274	{
4275	ri += 2;
4276	t = d->tokens[ri];
4277	mp->is[i] = mp->left[i] = mp->right[i]
4278	= case_fold_unibyte ? toupper (t) : t;
4279	}
4280	mp->is[i] = mp->left[i] = mp->right[i] = '\0';
4281	mp->in = enlist (mp->in, mp->is, i);
4282	break;
4283	}
4284	}
4285	done:;
4286
4287	struct dfamust *dm = NULL;
4288	if (*result)
4289	{
4290	dm = xmalloc (FLEXSIZEOF (struct dfamust, must, strlen (result) + 1));
4291	dm->exact = exact;
4292	dm->begline = begline;
4293	dm->endline = endline;
4294	strcpy (dm->must, result);
4295	}
4296
4297	while (mp)
4298	{
4299	must *prev = mp->prev;
4300	freemust (mp);
4301	mp = prev;
4302	}
4303
4304	return dm;
4305	}
4306
4307	void
4308	dfamustfree (struct dfamust *dm)
4309	{
4310	free (dm);
4311	}
4312
4313	struct dfa *
4314	dfaalloc (void)
4315	{
4316	return xmalloc (sizeof (struct dfa));
4317	}
4318
4319	/* Initialize DFA. */
4320	void
4321	dfasyntax (struct dfa dfa, struct localeinfo const linfo,
4322	reg_syntax_t bits, int dfaopts)
4323	{
4324	memset (dfa, 0, offsetof (struct dfa, dfaexec));
4325	dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
4326	dfa->localeinfo = *linfo;
4327
4328	dfa->fast = !dfa->localeinfo.multibyte;
4329
4330	dfa->canychar = -1;
4331	dfa->syntax.syntax_bits_set = true;
4332	dfa->syntax.case_fold = (bits & RE_ICASE) != 0;
4333	dfa->syntax.anchor = (dfaopts & DFA_ANCHOR) != 0;
4334	dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
4335	dfa->syntax.syntax_bits = bits;
4336
4337	for (int i = CHAR_MIN; i <= CHAR_MAX; ++i)
4338	{
4339	unsigned char uc = i;
4340
4341	dfa->syntax.sbit[uc] = char_context (dfa, uc);
4342	switch (dfa->syntax.sbit[uc])
4343	{
4344	case CTX_LETTER:
4345	setbit (uc, &dfa->syntax.letters);
4346	break;
4347	case CTX_NEWLINE:
4348	setbit (uc, &dfa->syntax.newline);
4349	break;
4350	}
4351
4352	/* POSIX requires that the five bytes in "\n\r./" (including the
4353	terminating NUL) cannot occur inside a multibyte character. */
4354	dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
4355	? (uc & 0xc0) != 0x80
4356	: strchr ("\n\r./", uc) != NULL);
4357	}
4358	}
4359
4360	/* Initialize TO by copying FROM's syntax settings. */
4361	void
4362	dfacopysyntax (struct dfa to, struct dfa const from)
4363	{
4364	memset (to, 0, offsetof (struct dfa, syntax));
4365	to->canychar = -1;
4366	to->fast = from->fast;
4367	to->syntax = from->syntax;
4368	to->dfaexec = from->dfaexec;
4369	to->localeinfo = from->localeinfo;
4370	}
4371
4372	/* vim:set shiftwidth=2: */

Note: See TracBrowser for help on using the repository browser.

source: kBuild/trunk/src/grep/lib/dfa.c@ 3646

Download in other formats: