dfa.c@ 3530

Last change on this file since 3530 was 3529, checked in by bird, 3 years ago
Imported grep 3.7 from grep-3.7.tar.gz (sha256: c22b0cf2d4f6bbe599c902387e8058990e1eee99aef333a203829e5fd3dbb342), applying minimal auto-props.
Property svn:eol-style set to `native`
File size: 133.3 KB

Line
1	/* dfa.c - deterministic extended regexp routines for GNU
2	Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2021 Free Software
3	Foundation, Inc.
4
5	This program is free software; you can redistribute it and/or modify
6	it under the terms of the GNU General Public License as published by
7	the Free Software Foundation; either version 3, or (at your option)
8	any later version.
9
10	This program is distributed in the hope that it will be useful,
11	but WITHOUT ANY WARRANTY; without even the implied warranty of
12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	GNU General Public License for more details.
14
15	You should have received a copy of the GNU General Public License
16	along with this program; if not, write to the Free Software
17	Foundation, Inc.,
18	51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */
19
20	/* Written June, 1988 by Mike Haertel
21	Modified July, 1988 by Arthur David Olson to assist BMG speedups */
22
23	#include <config.h>
24
25	#include "dfa.h"
26
27	#include "flexmember.h"
28	#include "idx.h"
29	#include "verify.h"
30
31	#include <assert.h>
32	#include <ctype.h>
33	#include <stdint.h>
34	#include <stdio.h>
35	#include <stdlib.h>
36	#include <limits.h>
37	#include <string.h>
38
39	/* Pacify gcc -Wanalyzer-null-dereference in areas where GCC
40	understandably cannot deduce that the input comes from a
41	well-formed regular expression. There's little point to the
42	runtime overhead of 'assert' instead of 'assume_nonnull' when the
43	MMU will check anyway. */
44	#define assume_nonnull(x) assume ((x) != NULL)
45
46	static bool
47	streq (char const a, char const b)
48	{
49	return strcmp (a, b) == 0;
50	}
51
52	static bool
53	isasciidigit (char c)
54	{
55	return '0' <= c && c <= '9';
56	}
57
58	#include "gettext.h"
59	#define _(str) gettext (str)
60
61	#include <wchar.h>
62
63	#include "xalloc.h"
64	#include "localeinfo.h"
65
66	#ifndef FALLTHROUGH
67	# if 201710L < __STDC_VERSION__
68	# define FALLTHROUGH [[__fallthrough__]]
69	# elif (__GNUC__ >= 7) \|\| (__clang_major__ >= 10)
70	# define FALLTHROUGH __attribute__ ((__fallthrough__))
71	# else
72	# define FALLTHROUGH ((void) 0)
73	# endif
74	#endif
75
76	#ifndef MIN
77	# define MIN(a,b) ((a) < (b) ? (a) : (b))
78	#endif
79
80	/* HPUX defines these as macros in sys/param.h. */
81	#ifdef setbit
82	# undef setbit
83	#endif
84	#ifdef clrbit
85	# undef clrbit
86	#endif
87
88	/* For code that does not use Gnulib’s isblank module. */
89	#if !defined isblank && !defined HAVE_ISBLANK && !defined GNULIB_ISBLANK
90	# define isblank dfa_isblank
91	static int
92	isblank (int c)
93	{
94	return c == ' ' \|\| c == '\t';
95	}
96	#endif
97
98	/* First integer value that is greater than any character code. */
99	enum { NOTCHAR = 1 << CHAR_BIT };
100
101	#ifdef UINT_LEAST64_MAX
102
103	/* Number of bits used in a charclass word. */
104	enum { CHARCLASS_WORD_BITS = 64 };
105
106	/* This represents part of a character class. It must be unsigned and
107	at least CHARCLASS_WORD_BITS wide. Any excess bits are zero. */
108	typedef uint_least64_t charclass_word;
109
110	/* Part of a charclass initializer that represents 64 bits' worth of a
111	charclass, where LO and HI are the low and high-order 32 bits of
112	the 64-bit quantity. */
113	# define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
114
115	#else
116	/* Fallbacks for pre-C99 hosts that lack 64-bit integers. */
117	enum { CHARCLASS_WORD_BITS = 32 };
118	typedef unsigned long charclass_word;
119	# define CHARCLASS_PAIR(lo, hi) lo, hi
120	#endif
121
122	/* An initializer for a charclass whose 32-bit words are A through H. */
123	#define CHARCLASS_INIT(a, b, c, d, e, f, g, h) \
124	{{ \
125	CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d), \
126	CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h) \
127	}}
128
129	/* The maximum useful value of a charclass_word; all used bits are 1. */
130	static charclass_word const CHARCLASS_WORD_MASK
131	= ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1;
132
133	/* Number of words required to hold a bit for every character. */
134	enum
135	{
136	CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
137	};
138
139	/* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
140	typedef struct { charclass_word w[CHARCLASS_WORDS]; } charclass;
141
142	/* Convert a possibly-signed character to an unsigned character. This is
143	a bit safer than casting to unsigned char, since it catches some type
144	errors that the cast doesn't. */
145	static unsigned char
146	to_uchar (char ch)
147	{
148	return ch;
149	}
150
151	/* Contexts tell us whether a character is a newline or a word constituent.
152	Word-constituent characters are those that satisfy iswalnum, plus '_'.
153	Each character has a single CTX_* value; bitmasks of CTX_* values denote
154	a particular character class.
155
156	A state also stores a context value, which is a bitmask of CTX_* values.
157	A state's context represents a set of characters that the state's
158	predecessors must match. For example, a state whose context does not
159	include CTX_LETTER will never have transitions where the previous
160	character is a word constituent. A state whose context is CTX_ANY
161	might have transitions from any character. */
162
163	enum
164	{
165	CTX_NONE = 1,
166	CTX_LETTER = 2,
167	CTX_NEWLINE = 4,
168	CTX_ANY = 7
169	};
170
171	/* Sometimes characters can only be matched depending on the surrounding
172	context. Such context decisions depend on what the previous character
173	was, and the value of the current (lookahead) character. Context
174	dependent constraints are encoded as 9-bit integers. Each bit that
175	is set indicates that the constraint succeeds in the corresponding
176	context.
177
178	bit 6-8 - valid contexts when next character is CTX_NEWLINE
179	bit 3-5 - valid contexts when next character is CTX_LETTER
180	bit 0-2 - valid contexts when next character is CTX_NONE
181
182	succeeds_in_context determines whether a given constraint
183	succeeds in a particular context. Prev is a bitmask of possible
184	context values for the previous character, curr is the (single-bit)
185	context value for the lookahead character. */
186	static int
187	newline_constraint (int constraint)
188	{
189	return (constraint >> 6) & 7;
190	}
191	static int
192	letter_constraint (int constraint)
193	{
194	return (constraint >> 3) & 7;
195	}
196	static int
197	other_constraint (int constraint)
198	{
199	return constraint & 7;
200	}
201
202	static bool
203	succeeds_in_context (int constraint, int prev, int curr)
204	{
205	return !! (((curr & CTX_NONE ? other_constraint (constraint) : 0) \
206	\| (curr & CTX_LETTER ? letter_constraint (constraint) : 0) \
207	\| (curr & CTX_NEWLINE ? newline_constraint (constraint) : 0)) \
208	& prev);
209	}
210
211	/* The following describe what a constraint depends on. */
212	static bool
213	prev_newline_dependent (int constraint)
214	{
215	return ((constraint ^ constraint >> 2) & 0111) != 0;
216	}
217	static bool
218	prev_letter_dependent (int constraint)
219	{
220	return ((constraint ^ constraint >> 1) & 0111) != 0;
221	}
222
223	/* Tokens that match the empty string subject to some constraint actually
224	work by applying that constraint to determine what may follow them,
225	taking into account what has gone before. The following values are
226	the constraints corresponding to the special tokens previously defined. */
227	enum
228	{
229	NO_CONSTRAINT = 0777,
230	BEGLINE_CONSTRAINT = 0444,
231	ENDLINE_CONSTRAINT = 0700,
232	BEGWORD_CONSTRAINT = 0050,
233	ENDWORD_CONSTRAINT = 0202,
234	LIMWORD_CONSTRAINT = 0252,
235	NOTLIMWORD_CONSTRAINT = 0525
236	};
237
238	/* The regexp is parsed into an array of tokens in postfix form. Some tokens
239	are operators and others are terminal symbols. Most (but not all) of these
240	codes are returned by the lexical analyzer. */
241
242	typedef ptrdiff_t token;
243	static token const TOKEN_MAX = PTRDIFF_MAX;
244
245	/* States are indexed by state_num values. These are normally
246	nonnegative but -1 is used as a special value. */
247	typedef ptrdiff_t state_num;
248
249	/* Predefined token values. */
250	enum
251	{
252	END = -1, /* END is a terminal symbol that matches the
253	end of input; any value of END or less in
254	the parse tree is such a symbol. Accepting
255	states of the DFA are those that would have
256	a transition on END. This is -1, not some
257	more-negative value, to tweak the speed of
258	comparisons to END. */
259
260	/* Ordinary character values are terminal symbols that match themselves. */
261
262	/* CSET must come last in the following list of special tokens. Otherwise,
263	the list order matters only for performance. Related special tokens
264	should have nearby values so that code like (t == ANYCHAR \|\| t == MBCSET
265	\|\| CSET <= t) can be done with a single machine-level comparison. */
266
267	EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches
268	the empty string. */
269
270	QMARK, /* QMARK is an operator of one argument that
271	matches zero or one occurrences of its
272	argument. */
273
274	STAR, /* STAR is an operator of one argument that
275	matches the Kleene closure (zero or more
276	occurrences) of its argument. */
277
278	PLUS, /* PLUS is an operator of one argument that
279	matches the positive closure (one or more
280	occurrences) of its argument. */
281
282	REPMN, /* REPMN is a lexical token corresponding
283	to the {m,n} construct. REPMN never
284	appears in the compiled token vector. */
285
286	CAT, /* CAT is an operator of two arguments that
287	matches the concatenation of its
288	arguments. CAT is never returned by the
289	lexical analyzer. */
290
291	OR, /* OR is an operator of two arguments that
292	matches either of its arguments. */
293
294	LPAREN, /* LPAREN never appears in the parse tree,
295	it is only a lexeme. */
296
297	RPAREN, /* RPAREN never appears in the parse tree. */
298
299	WCHAR, /* Only returned by lex. wctok contains
300	the wide character representation. */
301
302	ANYCHAR, /* ANYCHAR is a terminal symbol that matches
303	a valid multibyte (or single byte) character.
304	It is used only if MB_CUR_MAX > 1. */
305
306	BEG, /* BEG is an initial symbol that matches the
307	beginning of input. */
308
309	BEGLINE, /* BEGLINE is a terminal symbol that matches
310	the empty string at the beginning of a
311	line. */
312
313	ENDLINE, /* ENDLINE is a terminal symbol that matches
314	the empty string at the end of a line. */
315
316	BEGWORD, /* BEGWORD is a terminal symbol that matches
317	the empty string at the beginning of a
318	word. */
319
320	ENDWORD, /* ENDWORD is a terminal symbol that matches
321	the empty string at the end of a word. */
322
323	LIMWORD, /* LIMWORD is a terminal symbol that matches
324	the empty string at the beginning or the
325	end of a word. */
326
327	NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that
328	matches the empty string not at
329	the beginning or end of a word. */
330
331	BACKREF, /* BACKREF is generated by \<digit>
332	or by any other construct that
333	is not completely handled. If the scanner
334	detects a transition on backref, it returns
335	a kind of "semi-success" indicating that
336	the match will have to be verified with
337	a backtracking matcher. */
338
339	MBCSET, /* MBCSET is similar to CSET, but for
340	multibyte characters. */
341
342	CSET /* CSET and (and any value greater) is a
343	terminal symbol that matches any of a
344	class of characters. */
345	};
346
347
348	/* States of the recognizer correspond to sets of positions in the parse
349	tree, together with the constraints under which they may be matched.
350	So a position is encoded as an index into the parse tree together with
351	a constraint. */
352	typedef struct
353	{
354	idx_t index; /* Index into the parse array. */
355	unsigned int constraint; /* Constraint for matching this position. */
356	} position;
357
358	/* Sets of positions are stored as arrays. */
359	typedef struct
360	{
361	position elems; / Elements of this position set. */
362	idx_t nelem; /* Number of elements in this set. */
363	idx_t alloc; /* Number of elements allocated in ELEMS. */
364	} position_set;
365
366	/* A state of the dfa consists of a set of positions, some flags,
367	and the token value of the lowest-numbered position of the state that
368	contains an END token. */
369	typedef struct
370	{
371	size_t hash; /* Hash of the positions of this state. */
372	position_set elems; /* Positions this state could match. */
373	unsigned char context; /* Context from previous state. */
374	unsigned short constraint; /* Constraint for this state to accept. */
375	position_set mbps; /* Positions which can match multibyte
376	characters or the follows, e.g., period.
377	Used only if MB_CUR_MAX > 1. */
378	state_num mb_trindex; /* Index of this state in MB_TRANS, or
379	negative if the state does not have
380	ANYCHAR. */
381	} dfa_state;
382
383	/* Maximum for any transition table count. This should be at least 3,
384	for the initial state setup. */
385	enum { MAX_TRCOUNT = 1024 };
386
387	/* A bracket operator.
388	e.g., [a-c], [[:alpha:]], etc. */
389	struct mb_char_classes
390	{
391	ptrdiff_t cset;
392	bool invert;
393	wchar_t chars; / Normal characters. */
394	idx_t nchars;
395	idx_t nchars_alloc;
396	};
397
398	struct regex_syntax
399	{
400	/* Syntax bits controlling the behavior of the lexical analyzer. */
401	reg_syntax_t syntax_bits;
402	bool syntax_bits_set;
403
404	/* Flag for case-folding letters into sets. */
405	bool case_fold;
406
407	/* True if ^ and $ match only the start and end of data, and do not match
408	end-of-line within data. */
409	bool anchor;
410
411	/* End-of-line byte in data. */
412	unsigned char eolbyte;
413
414	/* Cache of char-context values. */
415	char sbit[NOTCHAR];
416
417	/* If never_trail[B], the byte B cannot be a non-initial byte in a
418	multibyte character. */
419	bool never_trail[NOTCHAR];
420
421	/* Set of characters considered letters. */
422	charclass letters;
423
424	/* Set of characters that are newline. */
425	charclass newline;
426	};
427
428	/* Lexical analyzer. All the dross that deals with the obnoxious
429	GNU Regex syntax bits is located here. The poor, suffering
430	reader is referred to the GNU Regex documentation for the
431	meaning of the @#%!@#%^!@ syntax bits. */
432	struct lexer_state
433	{
434	char const ptr; / Pointer to next input character. */
435	idx_t left; /* Number of characters remaining. */
436	token lasttok; /* Previous token returned; initially END. */
437	idx_t parens; /* Count of outstanding left parens. */
438	int minrep, maxrep; /* Repeat counts for {m,n}. */
439
440	/* Wide character representation of the current multibyte character,
441	or WEOF if there was an encoding error. Used only if
442	MB_CUR_MAX > 1. */
443	wint_t wctok;
444
445	/* The most recently analyzed multibyte bracket expression. */
446	struct mb_char_classes brack;
447
448	/* We're separated from beginning or (, \| only by zero-width characters. */
449	bool laststart;
450	};
451
452	/* Recursive descent parser for regular expressions. */
453
454	struct parser_state
455	{
456	token tok; /* Lookahead token. */
457	idx_t depth; /* Current depth of a hypothetical stack
458	holding deferred productions. This is
459	used to determine the depth that will be
460	required of the real stack later on in
461	dfaanalyze. */
462	};
463
464	/* A compiled regular expression. */
465	struct dfa
466	{
467	/* Fields filled by the scanner. */
468	charclass charclasses; / Array of character sets for CSET tokens. */
469	idx_t cindex; /* Index for adding new charclasses. */
470	idx_t calloc; /* Number of charclasses allocated. */
471	ptrdiff_t canychar; /* Index of anychar class, or -1. */
472
473	/* Scanner state */
474	struct lexer_state lex;
475
476	/* Parser state */
477	struct parser_state parse;
478
479	/* Fields filled by the parser. */
480	token tokens; / Postfix parse array. */
481	idx_t tindex; /* Index for adding new tokens. */
482	idx_t talloc; /* Number of tokens currently allocated. */
483	idx_t depth; /* Depth required of an evaluation stack
484	used for depth-first traversal of the
485	parse tree. */
486	idx_t nleaves; /* Number of non-EMPTY leaves
487	in the parse tree. */
488	idx_t nregexps; /* Count of parallel regexps being built
489	with dfaparse. */
490	bool fast; /* The DFA is fast. */
491	bool epsilon; /* Does a token match only the empty string? */
492	token utf8_anychar_classes[9]; /* To lower ANYCHAR in UTF-8 locales. */
493	mbstate_t mbs; /* Multibyte conversion state. */
494
495	/* The following are valid only if MB_CUR_MAX > 1. */
496
497	/* The value of multibyte_prop[i] is defined by following rule.
498	if tokens[i] < NOTCHAR
499	bit 0 : tokens[i] is the first byte of a character, including
500	single-byte characters.
501	bit 1 : tokens[i] is the last byte of a character, including
502	single-byte characters.
503
504	e.g.
505	tokens
506	= 'single_byte_a', 'multi_byte_A', single_byte_b'
507	= 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
508	multibyte_prop
509	= 3 , 1 , 0 , 2 , 3
510	*/
511	char *multibyte_prop;
512
513	/* Fields filled by the superset. */
514	struct dfa superset; / Hint of the dfa. */
515
516	/* Fields filled by the state builder. */
517	dfa_state states; / States of the dfa. */
518	state_num sindex; /* Index for adding new states. */
519	idx_t salloc; /* Number of states currently allocated. */
520
521	/* Fields filled by the parse tree->NFA conversion. */
522	position_set follows; / Array of follow sets, indexed by position
523	index. The follow of a position is the set
524	of positions containing characters that
525	could conceivably follow a character
526	matching the given position in a string
527	matching the regexp. Allocated to the
528	maximum possible position index. */
529	bool searchflag; /* We are supposed to build a searching
530	as opposed to an exact matcher. A searching
531	matcher finds the first and shortest string
532	matching a regexp anywhere in the buffer,
533	whereas an exact matcher finds the longest
534	string matching, but anchored to the
535	beginning of the buffer. */
536
537	/* Fields filled by dfaanalyze. */
538	int constraints; / Array of union of accepting constraints
539	in the follow of a position. */
540	int separates; / Array of contexts on follow of a
541	position. */
542
543	/* Fields filled by dfaexec. */
544	state_num tralloc; /* Number of transition tables that have
545	slots so far, not counting trans[-1] and
546	trans[-2]. */
547	int trcount; /* Number of transition tables that have
548	been built, other than for initial
549	states. */
550	int min_trcount; /* Number of initial states. Equivalently,
551	the minimum state number for which trcount
552	counts transitions. */
553	state_num *trans; / Transition tables for states that can
554	never accept. If the transitions for a
555	state have not yet been computed, or the
556	state could possibly accept, its entry in
557	this table is NULL. This points to two
558	past the start of the allocated array,
559	and trans[-1] and trans[-2] are always
560	NULL. */
561	state_num *fails; / Transition tables after failing to accept
562	on a state that potentially could do so.
563	If trans[i] is non-null, fails[i] must
564	be null. */
565	char success; / Table of acceptance conditions used in
566	dfaexec and computed in build_state. */
567	state_num newlines; / Transitions on newlines. The entry for a
568	newline in any transition table is always
569	-1 so we can count lines without wasting
570	too many cycles. The transition for a
571	newline is stored separately and handled
572	as a special case. Newline is also used
573	as a sentinel at the end of the buffer. */
574	state_num initstate_notbol; /* Initial state for CTX_LETTER and CTX_NONE
575	context in multibyte locales, in which we
576	do not distinguish between their contexts,
577	as not supported word. */
578	position_set mb_follows; /* Follow set added by ANYCHAR on demand. */
579	state_num *mb_trans; / Transition tables for states with
580	ANYCHAR. */
581	state_num mb_trcount; /* Number of transition tables for states with
582	ANYCHAR that have actually been built. */
583
584	/* Syntax configuration. This is near the end so that dfacopysyntax
585	can memset up to here. */
586	struct regex_syntax syntax;
587
588	/* Information derived from the locale. This is at the end so that
589	a quick memset need not clear it specially. */
590
591	/* dfaexec implementation. */
592	char (dfaexec) (struct dfa , char const , char *,
593	bool, ptrdiff_t , bool );
594
595	/* Other cached information derived from the locale. */
596	struct localeinfo localeinfo;
597	};
598
599	/* User access to dfa internals. */
600
601	/* S could possibly be an accepting state of R. */
602	static bool
603	accepting (state_num s, struct dfa const *r)
604	{
605	return r->states[s].constraint != 0;
606	}
607
608	/* STATE accepts in the specified context. */
609	static bool
610	accepts_in_context (int prev, int curr, state_num state, struct dfa const *dfa)
611	{
612	return succeeds_in_context (dfa->states[state].constraint, prev, curr);
613	}
614
615	static void regexp (struct dfa *dfa);
616
617	/* Store into *PWC the result of converting the leading bytes of the
618	multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
619	and updating the conversion state in *D. On conversion error,
620	convert just a single byte, to WEOF. Return the number of bytes
621	converted.
622
623	This differs from mbrtowc (PWC, S, N, &D->mbs) as follows:
624
625	* PWC points to wint_t, not to wchar_t.
626	* The last arg is a dfa *D instead of merely a multibyte conversion
627	state D->mbs.
628	* N is idx_t not size_t, and must be at least 1.
629	* S[N - 1] must be a sentinel byte.
630	* Shift encodings are not supported.
631	* The return value is always in the range 1..N.
632	* D->mbs is always valid afterwards.
633	* PWC is always set to something. /
634	static int
635	mbs_to_wchar (wint_t pwc, char const s, idx_t n, struct dfa *d)
636	{
637	unsigned char uc = s[0];
638	wint_t wc = d->localeinfo.sbctowc[uc];
639
640	if (wc == WEOF)
641	{
642	wchar_t wch;
643	size_t nbytes = mbrtowc (&wch, s, n, &d->mbs);
644	if (0 < nbytes && nbytes < (size_t) -2)
645	{
646	*pwc = wch;
647	return nbytes;
648	}
649	memset (&d->mbs, 0, sizeof d->mbs);
650	}
651
652	*pwc = wc;
653	return 1;
654	}
655
656	#ifdef DEBUG
657
658	static void
659	prtok (token t)
660	{
661	if (t <= END)
662	fprintf (stderr, "END");
663	else if (0 <= t && t < NOTCHAR)
664	{
665	unsigned int ch = t;
666	fprintf (stderr, "0x%02x", ch);
667	}
668	else
669	{
670	char const *s;
671	switch (t)
672	{
673	case BEG:
674	s = "BEG";
675	break;
676	case EMPTY:
677	s = "EMPTY";
678	break;
679	case BACKREF:
680	s = "BACKREF";
681	break;
682	case BEGLINE:
683	s = "BEGLINE";
684	break;
685	case ENDLINE:
686	s = "ENDLINE";
687	break;
688	case BEGWORD:
689	s = "BEGWORD";
690	break;
691	case ENDWORD:
692	s = "ENDWORD";
693	break;
694	case LIMWORD:
695	s = "LIMWORD";
696	break;
697	case NOTLIMWORD:
698	s = "NOTLIMWORD";
699	break;
700	case QMARK:
701	s = "QMARK";
702	break;
703	case STAR:
704	s = "STAR";
705	break;
706	case PLUS:
707	s = "PLUS";
708	break;
709	case CAT:
710	s = "CAT";
711	break;
712	case OR:
713	s = "OR";
714	break;
715	case LPAREN:
716	s = "LPAREN";
717	break;
718	case RPAREN:
719	s = "RPAREN";
720	break;
721	case ANYCHAR:
722	s = "ANYCHAR";
723	break;
724	case MBCSET:
725	s = "MBCSET";
726	break;
727	default:
728	s = "CSET";
729	break;
730	}
731	fprintf (stderr, "%s", s);
732	}
733	}
734	#endif /* DEBUG */
735
736	/* Stuff pertaining to charclasses. */
737
738	static bool
739	tstbit (unsigned int b, charclass const *c)
740	{
741	return c->w[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
742	}
743
744	static void
745	setbit (unsigned int b, charclass *c)
746	{
747	charclass_word one = 1;
748	c->w[b / CHARCLASS_WORD_BITS] \|= one << b % CHARCLASS_WORD_BITS;
749	}
750
751	static void
752	clrbit (unsigned int b, charclass *c)
753	{
754	charclass_word one = 1;
755	c->w[b / CHARCLASS_WORD_BITS] &= ~(one << b % CHARCLASS_WORD_BITS);
756	}
757
758	static void
759	zeroset (charclass *s)
760	{
761	memset (s, 0, sizeof *s);
762	}
763
764	static void
765	fillset (charclass *s)
766	{
767	for (int i = 0; i < CHARCLASS_WORDS; i++)
768	s->w[i] = CHARCLASS_WORD_MASK;
769	}
770
771	static void
772	notset (charclass *s)
773	{
774	for (int i = 0; i < CHARCLASS_WORDS; ++i)
775	s->w[i] = CHARCLASS_WORD_MASK & ~s->w[i];
776	}
777
778	static bool
779	equal (charclass const s1, charclass const s2)
780	{
781	charclass_word w = 0;
782	for (int i = 0; i < CHARCLASS_WORDS; i++)
783	w \|= s1->w[i] ^ s2->w[i];
784	return w == 0;
785	}
786
787	static bool
788	emptyset (charclass const *s)
789	{
790	charclass_word w = 0;
791	for (int i = 0; i < CHARCLASS_WORDS; i++)
792	w \|= s->w[i];
793	return w == 0;
794	}
795
796	/* Ensure that the array addressed by PA holds at least I + 1 items.
797	Either return PA, or reallocate the array and return its new address.
798	Although PA may be null, the returned value is never null.
799
800	The array holds NITEMS items, where 0 <= I <= NITEMS; *NITEMS
801	is updated on reallocation. If PA is null, *NITEMS must be zero.
802	Do not allocate more than NITEMS_MAX items total; -1 means no limit.
803	ITEM_SIZE is the size of one item; it must be positive.
804	Avoid O(N*2) behavior on arrays growing linearly. /
805	static void *
806	maybe_realloc (void pa, idx_t i, idx_t nitems,
807	ptrdiff_t nitems_max, idx_t item_size)
808	{
809	if (i < *nitems)
810	return pa;
811	return xpalloc (pa, nitems, 1, nitems_max, item_size);
812	}
813
814	/* In DFA D, find the index of charclass S, or allocate a new one. */
815	static idx_t
816	charclass_index (struct dfa d, charclass const s)
817	{
818	idx_t i;
819
820	for (i = 0; i < d->cindex; ++i)
821	if (equal (s, &d->charclasses[i]))
822	return i;
823	d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
824	TOKEN_MAX - CSET, sizeof *d->charclasses);
825	++d->cindex;
826	d->charclasses[i] = *s;
827	return i;
828	}
829
830	static bool
831	unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
832	{
833	return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) \|\| (c) == '_');
834	}
835
836	static int
837	char_context (struct dfa const *dfa, unsigned char c)
838	{
839	if (c == dfa->syntax.eolbyte && !dfa->syntax.anchor)
840	return CTX_NEWLINE;
841	if (unibyte_word_constituent (dfa, c))
842	return CTX_LETTER;
843	return CTX_NONE;
844	}
845
846	/* Set a bit in the charclass for the given wchar_t. Do nothing if WC
847	is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1,
848	this may happen when folding case in weird Turkish locales where
849	dotless i/dotted I are not included in the chosen character set.
850	Return whether a bit was set in the charclass. */
851	static bool
852	setbit_wc (wint_t wc, charclass *c)
853	{
854	int b = wctob (wc);
855	if (b < 0)
856	return false;
857
858	setbit (b, c);
859	return true;
860	}
861
862	/* Set a bit for B and its case variants in the charclass C.
863	MB_CUR_MAX must be 1. */
864	static void
865	setbit_case_fold_c (int b, charclass *c)
866	{
867	int ub = toupper (b);
868	for (int i = 0; i < NOTCHAR; i++)
869	if (toupper (i) == ub)
870	setbit (i, c);
871	}
872
873	/* Fetch the next lexical input character from the pattern. There
874	must at least one byte of pattern input. Set DFA->lex.wctok to the
875	value of the character or to WEOF depending on whether the input is
876	a valid multibyte character (possibly of length 1). Then return
877	the next input byte value, except return EOF if the input is a
878	multibyte character of length greater than 1. */
879	static int
880	fetch_wc (struct dfa *dfa)
881	{
882	int nbytes = mbs_to_wchar (&dfa->lex.wctok, dfa->lex.ptr, dfa->lex.left,
883	dfa);
884	int c = nbytes == 1 ? to_uchar (dfa->lex.ptr[0]) : EOF;
885	dfa->lex.ptr += nbytes;
886	dfa->lex.left -= nbytes;
887	return c;
888	}
889
890	/* If there is no more input, report an error about unbalanced brackets.
891	Otherwise, behave as with fetch_wc (DFA). */
892	static int
893	bracket_fetch_wc (struct dfa *dfa)
894	{
895	if (! dfa->lex.left)
896	dfaerror (_("unbalanced ["));
897	return fetch_wc (dfa);
898	}
899
900	typedef int predicate (int);
901
902	/* The following list maps the names of the Posix named character classes
903	to predicate functions that determine whether a given character is in
904	the class. The leading [ has already been eaten by the lexical
905	analyzer. */
906	struct dfa_ctype
907	{
908	const char *name;
909	predicate *func;
910	bool single_byte_only;
911	};
912
913	static const struct dfa_ctype prednames[] = {
914	{"alpha", isalpha, false},
915	{"upper", isupper, false},
916	{"lower", islower, false},
917	{"digit", isdigit, true},
918	{"xdigit", isxdigit, false},
919	{"space", isspace, false},
920	{"punct", ispunct, false},
921	{"alnum", isalnum, false},
922	{"print", isprint, false},
923	{"graph", isgraph, false},
924	{"cntrl", iscntrl, false},
925	{"blank", isblank, false},
926	{NULL, NULL, false}
927	};
928
929	static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
930	find_pred (const char *str)
931	{
932	for (int i = 0; prednames[i].name; i++)
933	if (streq (str, prednames[i].name))
934	return &prednames[i];
935	return NULL;
936	}
937
938	/* Parse a bracket expression, which possibly includes multibyte
939	characters. */
940	static token
941	parse_bracket_exp (struct dfa *dfa)
942	{
943	/* This is a bracket expression that dfaexec is known to
944	process correctly. */
945	bool known_bracket_exp = true;
946
947	/* Used to warn about [:space:].
948	Bit 0 = first character is a colon.
949	Bit 1 = last character is a colon.
950	Bit 2 = includes any other character but a colon.
951	Bit 3 = includes ranges, char/equiv classes or collation elements. */
952	int colon_warning_state;
953
954	dfa->lex.brack.nchars = 0;
955	charclass ccl;
956	zeroset (&ccl);
957	int c = bracket_fetch_wc (dfa);
958	bool invert = c == '^';
959	if (invert)
960	{
961	c = bracket_fetch_wc (dfa);
962	known_bracket_exp = dfa->localeinfo.simple;
963	}
964	wint_t wc = dfa->lex.wctok;
965	int c1;
966	wint_t wc1;
967	colon_warning_state = (c == ':');
968	do
969	{
970	c1 = NOTCHAR; /* Mark c1 as not initialized. */
971	colon_warning_state &= ~2;
972
973	/* Note that if we're looking at some other [:...:] construct,
974	we just treat it as a bunch of ordinary characters. We can do
975	this because we assume regex has checked for syntax errors before
976	dfa is ever called. */
977	if (c == '[')
978	{
979	c1 = bracket_fetch_wc (dfa);
980	wc1 = dfa->lex.wctok;
981
982	if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES))
983	\|\| c1 == '.' \|\| c1 == '=')
984	{
985	enum { MAX_BRACKET_STRING_LEN = 32 };
986	char str[MAX_BRACKET_STRING_LEN + 1];
987	int len = 0;
988	for (;;)
989	{
990	c = bracket_fetch_wc (dfa);
991	if (dfa->lex.left == 0
992	\|\| (c == c1 && dfa->lex.ptr[0] == ']'))
993	break;
994	if (len < MAX_BRACKET_STRING_LEN)
995	str[len++] = c;
996	else
997	/* This is in any case an invalid class name. */
998	str[0] = '\0';
999	}
1000	str[len] = '\0';
1001
1002	/* Fetch bracket. */
1003	c = bracket_fetch_wc (dfa);
1004	wc = dfa->lex.wctok;
1005	if (c1 == ':')
1006	/* Build character class. POSIX allows character
1007	classes to match multicharacter collating elements,
1008	but the regex code does not support that, so do not
1009	worry about that possibility. */
1010	{
1011	char const *class
1012	= (dfa->syntax.case_fold && (streq (str, "upper")
1013	\|\| streq (str, "lower"))
1014	? "alpha" : str);
1015	const struct dfa_ctype *pred = find_pred (class);
1016	if (!pred)
1017	dfaerror (_("invalid character class"));
1018
1019	if (dfa->localeinfo.multibyte && !pred->single_byte_only)
1020	known_bracket_exp = false;
1021	else
1022	for (int c2 = 0; c2 < NOTCHAR; ++c2)
1023	if (pred->func (c2))
1024	setbit (c2, &ccl);
1025	}
1026	else
1027	known_bracket_exp = false;
1028
1029	colon_warning_state \|= 8;
1030
1031	/* Fetch new lookahead character. */
1032	c1 = bracket_fetch_wc (dfa);
1033	wc1 = dfa->lex.wctok;
1034	continue;
1035	}
1036
1037	/* We treat '[' as a normal character here. c/c1/wc/wc1
1038	are already set up. */
1039	}
1040
1041	if (c == '\\'
1042	&& (dfa->syntax.syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1043	{
1044	c = bracket_fetch_wc (dfa);
1045	wc = dfa->lex.wctok;
1046	}
1047
1048	if (c1 == NOTCHAR)
1049	{
1050	c1 = bracket_fetch_wc (dfa);
1051	wc1 = dfa->lex.wctok;
1052	}
1053
1054	if (c1 == '-')
1055	/* build range characters. */
1056	{
1057	int c2 = bracket_fetch_wc (dfa);
1058	wint_t wc2 = dfa->lex.wctok;
1059
1060	/* A bracket expression like [a-[.aa.]] matches an unknown set.
1061	Treat it like [-a[.aa.]] while parsing it, and
1062	remember that the set is unknown. */
1063	if (c2 == '[' && dfa->lex.ptr[0] == '.')
1064	{
1065	known_bracket_exp = false;
1066	c2 = ']';
1067	}
1068
1069	if (c2 == ']')
1070	{
1071	/* In the case [x-], the - is an ordinary hyphen,
1072	which is left in c1, the lookahead character. */
1073	dfa->lex.ptr--;
1074	dfa->lex.left++;
1075	}
1076	else
1077	{
1078	if (c2 == '\\' && (dfa->syntax.syntax_bits
1079	& RE_BACKSLASH_ESCAPE_IN_LISTS))
1080	{
1081	c2 = bracket_fetch_wc (dfa);
1082	wc2 = dfa->lex.wctok;
1083	}
1084
1085	colon_warning_state \|= 8;
1086	c1 = bracket_fetch_wc (dfa);
1087	wc1 = dfa->lex.wctok;
1088
1089	/* Treat [x-y] as a range if x != y. */
1090	if (wc != wc2 \|\| wc == WEOF)
1091	{
1092	if (dfa->localeinfo.simple
1093	\|\| (isasciidigit (c) & isasciidigit (c2)))
1094	{
1095	for (int ci = c; ci <= c2; ci++)
1096	if (dfa->syntax.case_fold && isalpha (ci))
1097	setbit_case_fold_c (ci, &ccl);
1098	else
1099	setbit (ci, &ccl);
1100	}
1101	else
1102	known_bracket_exp = false;
1103
1104	continue;
1105	}
1106	}
1107	}
1108
1109	colon_warning_state \|= (c == ':') ? 2 : 4;
1110
1111	if (!dfa->localeinfo.multibyte)
1112	{
1113	if (dfa->syntax.case_fold && isalpha (c))
1114	setbit_case_fold_c (c, &ccl);
1115	else
1116	setbit (c, &ccl);
1117	continue;
1118	}
1119
1120	if (wc == WEOF)
1121	known_bracket_exp = false;
1122	else
1123	{
1124	wchar_t folded[CASE_FOLDED_BUFSIZE + 1];
1125	int n = (dfa->syntax.case_fold
1126	? case_folded_counterparts (wc, folded + 1) + 1
1127	: 1);
1128	folded[0] = wc;
1129	for (int i = 0; i < n; i++)
1130	if (!setbit_wc (folded[i], &ccl))
1131	{
1132	dfa->lex.brack.chars
1133	= maybe_realloc (dfa->lex.brack.chars, dfa->lex.brack.nchars,
1134	&dfa->lex.brack.nchars_alloc, -1,
1135	sizeof *dfa->lex.brack.chars);
1136	dfa->lex.brack.chars[dfa->lex.brack.nchars++] = folded[i];
1137	}
1138	}
1139	}
1140	while ((wc = wc1, (c = c1) != ']'));
1141
1142	if (colon_warning_state == 7)
1143	dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
1144
1145	if (! known_bracket_exp)
1146	return BACKREF;
1147
1148	if (dfa->localeinfo.multibyte && (invert \|\| dfa->lex.brack.nchars != 0))
1149	{
1150	dfa->lex.brack.invert = invert;
1151	dfa->lex.brack.cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
1152	return MBCSET;
1153	}
1154
1155	if (invert)
1156	{
1157	notset (&ccl);
1158	if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1159	clrbit ('\n', &ccl);
1160	}
1161
1162	return CSET + charclass_index (dfa, &ccl);
1163	}
1164
1165	struct lexptr
1166	{
1167	char const *ptr;
1168	idx_t left;
1169	};
1170
1171	static void
1172	push_lex_state (struct dfa dfa, struct lexptr ls, char const *s)
1173	{
1174	ls->ptr = dfa->lex.ptr;
1175	ls->left = dfa->lex.left;
1176	dfa->lex.ptr = s;
1177	dfa->lex.left = strlen (s);
1178	}
1179
1180	static void
1181	pop_lex_state (struct dfa dfa, struct lexptr const ls)
1182	{
1183	dfa->lex.ptr = ls->ptr;
1184	dfa->lex.left = ls->left;
1185	}
1186
1187	static token
1188	lex (struct dfa *dfa)
1189	{
1190	bool backslash = false;
1191
1192	/* Basic plan: We fetch a character. If it's a backslash,
1193	we set the backslash flag and go through the loop again.
1194	On the plus side, this avoids having a duplicate of the
1195	main switch inside the backslash case. On the minus side,
1196	it means that just about every case begins with
1197	"if (backslash) ...". */
1198	for (int i = 0; i < 2; ++i)
1199	{
1200	if (! dfa->lex.left)
1201	return dfa->lex.lasttok = END;
1202	int c = fetch_wc (dfa);
1203
1204	switch (c)
1205	{
1206	case '\\':
1207	if (backslash)
1208	goto normal_char;
1209	if (dfa->lex.left == 0)
1210	dfaerror (_("unfinished \\ escape"));
1211	backslash = true;
1212	break;
1213
1214	case '^':
1215	if (backslash)
1216	goto normal_char;
1217	if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1218	\|\| dfa->lex.lasttok == END \|\| dfa->lex.lasttok == LPAREN
1219	\|\| dfa->lex.lasttok == OR)
1220	return dfa->lex.lasttok = BEGLINE;
1221	goto normal_char;
1222
1223	case '$':
1224	if (backslash)
1225	goto normal_char;
1226	if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1227	\|\| dfa->lex.left == 0
1228	\|\| ((dfa->lex.left
1229	> !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
1230	&& (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
1231	& (dfa->lex.ptr[0] == '\\')]
1232	== ')'))
1233	\|\| ((dfa->lex.left
1234	> !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
1235	&& (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
1236	& (dfa->lex.ptr[0] == '\\')]
1237	== '\|'))
1238	\|\| ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
1239	&& dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
1240	return dfa->lex.lasttok = ENDLINE;
1241	goto normal_char;
1242
1243	case '1':
1244	case '2':
1245	case '3':
1246	case '4':
1247	case '5':
1248	case '6':
1249	case '7':
1250	case '8':
1251	case '9':
1252	if (backslash && !(dfa->syntax.syntax_bits & RE_NO_BK_REFS))
1253	{
1254	dfa->lex.laststart = false;
1255	return dfa->lex.lasttok = BACKREF;
1256	}
1257	goto normal_char;
1258
1259	case '`':
1260	if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1261	{
1262	/* FIXME: should be beginning of string */
1263	return dfa->lex.lasttok = BEGLINE;
1264	}
1265	goto normal_char;
1266
1267	case '\'':
1268	if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1269	{
1270	/* FIXME: should be end of string */
1271	return dfa->lex.lasttok = ENDLINE;
1272	}
1273	goto normal_char;
1274
1275	case '<':
1276	if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1277	return dfa->lex.lasttok = BEGWORD;
1278	goto normal_char;
1279
1280	case '>':
1281	if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1282	return dfa->lex.lasttok = ENDWORD;
1283	goto normal_char;
1284
1285	case 'b':
1286	if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1287	return dfa->lex.lasttok = LIMWORD;
1288	goto normal_char;
1289
1290	case 'B':
1291	if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1292	return dfa->lex.lasttok = NOTLIMWORD;
1293	goto normal_char;
1294
1295	case '?':
1296	if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1297	goto normal_char;
1298	if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1299	goto normal_char;
1300	if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1301	&& dfa->lex.laststart)
1302	goto normal_char;
1303	return dfa->lex.lasttok = QMARK;
1304
1305	case '*':
1306	if (backslash)
1307	goto normal_char;
1308	if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1309	&& dfa->lex.laststart)
1310	goto normal_char;
1311	return dfa->lex.lasttok = STAR;
1312
1313	case '+':
1314	if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1315	goto normal_char;
1316	if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1317	goto normal_char;
1318	if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1319	&& dfa->lex.laststart)
1320	goto normal_char;
1321	return dfa->lex.lasttok = PLUS;
1322
1323	case '{':
1324	if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
1325	goto normal_char;
1326	if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
1327	goto normal_char;
1328	if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1329	&& dfa->lex.laststart)
1330	goto normal_char;
1331
1332	/* Cases:
1333	{M} - exact count
1334	{M,} - minimum count, maximum is infinity
1335	{,N} - 0 through N
1336	{,} - 0 to infinity (same as '*')
1337	{M,N} - M through N */
1338	{
1339	char const *p = dfa->lex.ptr;
1340	char const *lim = p + dfa->lex.left;
1341	dfa->lex.minrep = dfa->lex.maxrep = -1;
1342	for (; p != lim && isasciidigit (*p); p++)
1343	dfa->lex.minrep = (dfa->lex.minrep < 0
1344	? *p - '0'
1345	: MIN (RE_DUP_MAX + 1,
1346	dfa->lex.minrep * 10 + *p - '0'));
1347	if (p != lim)
1348	{
1349	if (*p != ',')
1350	dfa->lex.maxrep = dfa->lex.minrep;
1351	else
1352	{
1353	if (dfa->lex.minrep < 0)
1354	dfa->lex.minrep = 0;
1355	while (++p != lim && isasciidigit (*p))
1356	dfa->lex.maxrep
1357	= (dfa->lex.maxrep < 0
1358	? *p - '0'
1359	: MIN (RE_DUP_MAX + 1,
1360	dfa->lex.maxrep * 10 + *p - '0'));
1361	}
1362	}
1363	if (! ((! backslash \|\| (p != lim && *p++ == '\\'))
1364	&& p != lim && *p++ == '}'
1365	&& 0 <= dfa->lex.minrep
1366	&& (dfa->lex.maxrep < 0
1367	\|\| dfa->lex.minrep <= dfa->lex.maxrep)))
1368	{
1369	if (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD)
1370	goto normal_char;
1371	dfaerror (_("invalid content of \\{\\}"));
1372	}
1373	if (RE_DUP_MAX < dfa->lex.maxrep)
1374	dfaerror (_("regular expression too big"));
1375	dfa->lex.ptr = p;
1376	dfa->lex.left = lim - p;
1377	}
1378	dfa->lex.laststart = false;
1379	return dfa->lex.lasttok = REPMN;
1380
1381	case '\|':
1382	if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1383	goto normal_char;
1384	if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
1385	goto normal_char;
1386	dfa->lex.laststart = true;
1387	return dfa->lex.lasttok = OR;
1388
1389	case '\n':
1390	if (dfa->syntax.syntax_bits & RE_LIMITED_OPS
1391	\|\| backslash \|\| !(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
1392	goto normal_char;
1393	dfa->lex.laststart = true;
1394	return dfa->lex.lasttok = OR;
1395
1396	case '(':
1397	if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1398	goto normal_char;
1399	dfa->lex.parens++;
1400	dfa->lex.laststart = true;
1401	return dfa->lex.lasttok = LPAREN;
1402
1403	case ')':
1404	if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1405	goto normal_char;
1406	if (dfa->lex.parens == 0
1407	&& dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1408	goto normal_char;
1409	dfa->lex.parens--;
1410	dfa->lex.laststart = false;
1411	return dfa->lex.lasttok = RPAREN;
1412
1413	case '.':
1414	if (backslash)
1415	goto normal_char;
1416	if (dfa->canychar < 0)
1417	{
1418	charclass ccl;
1419	fillset (&ccl);
1420	if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1421	clrbit ('\n', &ccl);
1422	if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1423	clrbit ('\0', &ccl);
1424	if (dfa->localeinfo.multibyte)
1425	for (int c2 = 0; c2 < NOTCHAR; c2++)
1426	if (dfa->localeinfo.sbctowc[c2] == WEOF)
1427	clrbit (c2, &ccl);
1428	dfa->canychar = charclass_index (dfa, &ccl);
1429	}
1430	dfa->lex.laststart = false;
1431	return dfa->lex.lasttok = (dfa->localeinfo.multibyte
1432	? ANYCHAR
1433	: CSET + dfa->canychar);
1434
1435	case 's':
1436	case 'S':
1437	if (!backslash \|\| (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1438	goto normal_char;
1439	if (!dfa->localeinfo.multibyte)
1440	{
1441	charclass ccl;
1442	zeroset (&ccl);
1443	for (int c2 = 0; c2 < NOTCHAR; ++c2)
1444	if (isspace (c2))
1445	setbit (c2, &ccl);
1446	if (c == 'S')
1447	notset (&ccl);
1448	dfa->lex.laststart = false;
1449	return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1450	}
1451
1452	/* FIXME: see if optimizing this, as is done with ANYCHAR and
1453	add_utf8_anychar, makes sense. */
1454
1455	/* \s and \S are documented to be equivalent to [[:space:]] and
1456	[^[:space:]] respectively, so tell the lexer to process those
1457	strings, each minus its "already processed" '['. */
1458	{
1459	struct lexptr ls;
1460	push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
1461	dfa->lex.lasttok = parse_bracket_exp (dfa);
1462	pop_lex_state (dfa, &ls);
1463	}
1464
1465	dfa->lex.laststart = false;
1466	return dfa->lex.lasttok;
1467
1468	case 'w':
1469	case 'W':
1470	if (!backslash \|\| (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1471	goto normal_char;
1472
1473	if (!dfa->localeinfo.multibyte)
1474	{
1475	charclass ccl;
1476	zeroset (&ccl);
1477	for (int c2 = 0; c2 < NOTCHAR; ++c2)
1478	if (dfa->syntax.sbit[c2] == CTX_LETTER)
1479	setbit (c2, &ccl);
1480	if (c == 'W')
1481	notset (&ccl);
1482	dfa->lex.laststart = false;
1483	return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1484	}
1485
1486	/* FIXME: see if optimizing this, as is done with ANYCHAR and
1487	add_utf8_anychar, makes sense. */
1488
1489	/* \w and \W are documented to be equivalent to [_[:alnum:]] and
1490	[^_[:alnum:]] respectively, so tell the lexer to process those
1491	strings, each minus its "already processed" '['. */
1492	{
1493	struct lexptr ls;
1494	push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
1495	dfa->lex.lasttok = parse_bracket_exp (dfa);
1496	pop_lex_state (dfa, &ls);
1497	}
1498
1499	dfa->lex.laststart = false;
1500	return dfa->lex.lasttok;
1501
1502	case '[':
1503	if (backslash)
1504	goto normal_char;
1505	dfa->lex.laststart = false;
1506	return dfa->lex.lasttok = parse_bracket_exp (dfa);
1507
1508	default:
1509	normal_char:
1510	dfa->lex.laststart = false;
1511	/* For multibyte character sets, folding is done in atom. Always
1512	return WCHAR. */
1513	if (dfa->localeinfo.multibyte)
1514	return dfa->lex.lasttok = WCHAR;
1515
1516	if (dfa->syntax.case_fold && isalpha (c))
1517	{
1518	charclass ccl;
1519	zeroset (&ccl);
1520	setbit_case_fold_c (c, &ccl);
1521	return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1522	}
1523
1524	return dfa->lex.lasttok = c;
1525	}
1526	}
1527
1528	/* The above loop should consume at most a backslash
1529	and some other character. */
1530	abort ();
1531	return END; /* keeps pedantic compilers happy. */
1532	}
1533
1534	static void
1535	addtok_mb (struct dfa *dfa, token t, char mbprop)
1536	{
1537	if (dfa->talloc == dfa->tindex)
1538	{
1539	dfa->tokens = xpalloc (dfa->tokens, &dfa->talloc, 1, -1,
1540	sizeof *dfa->tokens);
1541	if (dfa->localeinfo.multibyte)
1542	dfa->multibyte_prop = xreallocarray (dfa->multibyte_prop, dfa->talloc,
1543	sizeof *dfa->multibyte_prop);
1544	}
1545	if (dfa->localeinfo.multibyte)
1546	dfa->multibyte_prop[dfa->tindex] = mbprop;
1547	dfa->tokens[dfa->tindex++] = t;
1548
1549	switch (t)
1550	{
1551	case QMARK:
1552	case STAR:
1553	case PLUS:
1554	break;
1555
1556	case CAT:
1557	case OR:
1558	dfa->parse.depth--;
1559	break;
1560
1561	case EMPTY:
1562	dfa->epsilon = true;
1563	goto increment_depth;
1564
1565	case BACKREF:
1566	dfa->fast = false;
1567	goto increment_nleaves;
1568
1569	case BEGLINE:
1570	case ENDLINE:
1571	case BEGWORD:
1572	case ENDWORD:
1573	case LIMWORD:
1574	case NOTLIMWORD:
1575	dfa->epsilon = true;
1576	FALLTHROUGH;
1577	default:
1578	increment_nleaves:
1579	dfa->nleaves++;
1580	increment_depth:
1581	dfa->parse.depth++;
1582	if (dfa->depth < dfa->parse.depth)
1583	dfa->depth = dfa->parse.depth;
1584	break;
1585	}
1586	}
1587
1588	static void addtok_wc (struct dfa *dfa, wint_t wc);
1589
1590	/* Add the given token to the parse tree, maintaining the depth count and
1591	updating the maximum depth if necessary. */
1592	static void
1593	addtok (struct dfa *dfa, token t)
1594	{
1595	if (dfa->localeinfo.multibyte && t == MBCSET)
1596	{
1597	bool need_or = false;
1598
1599	/* Extract wide characters into alternations for better performance.
1600	This does not require UTF-8. */
1601	for (idx_t i = 0; i < dfa->lex.brack.nchars; i++)
1602	{
1603	addtok_wc (dfa, dfa->lex.brack.chars[i]);
1604	if (need_or)
1605	addtok (dfa, OR);
1606	need_or = true;
1607	}
1608	dfa->lex.brack.nchars = 0;
1609
1610	/* Wide characters have been handled above, so it is possible
1611	that the set is empty now. Do nothing in that case. */
1612	if (dfa->lex.brack.cset != -1)
1613	{
1614	addtok (dfa, CSET + dfa->lex.brack.cset);
1615	if (need_or)
1616	addtok (dfa, OR);
1617	}
1618	}
1619	else
1620	{
1621	addtok_mb (dfa, t, 3);
1622	}
1623	}
1624
1625	/* We treat a multibyte character as a single atom, so that DFA
1626	can treat a multibyte character as a single expression.
1627
1628	e.g., we construct the following tree from "<mb1><mb2>".
1629	<mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1630	<mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1631	static void
1632	addtok_wc (struct dfa *dfa, wint_t wc)
1633	{
1634	unsigned char buf[MB_LEN_MAX];
1635	mbstate_t s = { 0 };
1636	size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
1637	int buflen;
1638
1639	if (stored_bytes != (size_t) -1)
1640	buflen = stored_bytes;
1641	else
1642	{
1643	/* This is merely stop-gap. buf[0] is undefined, yet skipping
1644	the addtok_mb call altogether can corrupt the heap. */
1645	buflen = 1;
1646	buf[0] = 0;
1647	}
1648
1649	addtok_mb (dfa, buf[0], buflen == 1 ? 3 : 1);
1650	for (int i = 1; i < buflen; i++)
1651	{
1652	addtok_mb (dfa, buf[i], i == buflen - 1 ? 2 : 0);
1653	addtok (dfa, CAT);
1654	}
1655	}
1656
1657	static void
1658	add_utf8_anychar (struct dfa *dfa)
1659	{
1660	/* Since the Unicode Standard Version 4.0.0 (2003), a well-formed
1661	UTF-8 byte sequence has been defined as follows:
1662
1663	([\x00-\x7f]
1664	\|[\xc2-\xdf][\x80-\xbf]
1665	\|[\xe0][\xa0-\xbf][\x80-\xbf]
1666	\|[\xe1-\xec\xee-\xef][\x80-\xbf][\x80-\xbf]
1667	\|[\xed][\x80-\x9f][\x80-\xbf]
1668	\|[\xf0][\x90-\xbf][\x80-\xbf][\x80-\xbf])
1669	\|[\xf1-\xf3][\x80-\xbf][\x80-\xbf][\x80-\xbf]
1670	\|[\xf4][\x80-\x8f][\x80-\xbf][\x80-\xbf])
1671
1672	which I'll write more concisely "A\|BC\|DEC\|FCC\|GHC\|IJCC\|KCCC\|LMCC",
1673	where A = [\x00-\x7f], B = [\xc2-\xdf], C = [\x80-\xbf],
1674	D = [\xe0], E = [\xa0-\xbf], F = [\xe1-\xec\xee-\xef], G = [\xed],
1675	H = [\x80-\x9f], I = [\xf0],
1676	J = [\x90-\xbf], K = [\xf1-\xf3], L = [\xf4], M = [\x80-\x8f].
1677
1678	This can be refactored to "A\|(B\|DE\|GH\|(F\|IJ\|LM\|KC)C)C". */
1679
1680	/* Mnemonics for classes containing two or more bytes. */
1681	enum { A, B, C, E, F, H, J, K, M };
1682
1683	/* Mnemonics for single-byte tokens. */
1684	enum { D_token = 0xe0, G_token = 0xed, I_token = 0xf0, L_token = 0xf4 };
1685
1686	static charclass const utf8_classes[] = {
1687	/* A. 00-7f: 1-byte sequence. */
1688	CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
1689
1690	/* B. c2-df: 1st byte of a 2-byte sequence. */
1691	CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
1692
1693	/* C. 80-bf: non-leading bytes. */
1694	CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
1695
1696	/* D. e0 (just a token). */
1697
1698	/* E. a0-bf: 2nd byte of a "DEC" sequence. */
1699	CHARCLASS_INIT (0, 0, 0, 0, 0, 0xffffffff, 0, 0),
1700
1701	/* F. e1-ec + ee-ef: 1st byte of an "FCC" sequence. */
1702	CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xdffe),
1703
1704	/* G. ed (just a token). */
1705
1706	/* H. 80-9f: 2nd byte of a "GHC" sequence. */
1707	CHARCLASS_INIT (0, 0, 0, 0, 0xffff, 0, 0, 0),
1708
1709	/* I. f0 (just a token). */
1710
1711	/* J. 90-bf: 2nd byte of an "IJCC" sequence. */
1712	CHARCLASS_INIT (0, 0, 0, 0, 0xffff0000, 0xffffffff, 0, 0),
1713
1714	/* K. f1-f3: 1st byte of a "KCCC" sequence. */
1715	CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xe0000),
1716
1717	/* L. f4 (just a token). */
1718
1719	/* M. 80-8f: 2nd byte of a "LMCC" sequence. */
1720	CHARCLASS_INIT (0, 0, 0, 0, 0xff, 0, 0, 0),
1721	};
1722
1723	/* Define the character classes that are needed below. */
1724	if (dfa->utf8_anychar_classes[0] == 0)
1725	{
1726	charclass c = utf8_classes[0];
1727	if (! (dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1728	clrbit ('\n', &c);
1729	if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1730	clrbit ('\0', &c);
1731	dfa->utf8_anychar_classes[0] = CSET + charclass_index (dfa, &c);
1732
1733	for (int i = 1; i < sizeof utf8_classes / sizeof *utf8_classes; i++)
1734	dfa->utf8_anychar_classes[i]
1735	= CSET + charclass_index (dfa, &utf8_classes[i]);
1736	}
1737
1738	/* Implement the "A\|(B\|DE\|GH\|(F\|IJ\|LM\|KC)C)C" pattern mentioned above.
1739	The token buffer is in reverse Polish order, so we get
1740	"A B D E CAT OR G H CAT OR F I J CAT OR L M CAT OR K
1741	C CAT OR C CAT OR C CAT OR". */
1742	addtok (dfa, dfa->utf8_anychar_classes[A]);
1743	addtok (dfa, dfa->utf8_anychar_classes[B]);
1744	addtok (dfa, D_token);
1745	addtok (dfa, dfa->utf8_anychar_classes[E]);
1746	addtok (dfa, CAT);
1747	addtok (dfa, OR);
1748	addtok (dfa, G_token);
1749	addtok (dfa, dfa->utf8_anychar_classes[H]);
1750	addtok (dfa, CAT);
1751	addtok (dfa, OR);
1752	addtok (dfa, dfa->utf8_anychar_classes[F]);
1753	addtok (dfa, I_token);
1754	addtok (dfa, dfa->utf8_anychar_classes[J]);
1755	addtok (dfa, CAT);
1756	addtok (dfa, OR);
1757	addtok (dfa, L_token);
1758	addtok (dfa, dfa->utf8_anychar_classes[M]);
1759	addtok (dfa, CAT);
1760	addtok (dfa, OR);
1761	addtok (dfa, dfa->utf8_anychar_classes[K]);
1762	for (int i = 0; i < 3; i++)
1763	{
1764	addtok (dfa, dfa->utf8_anychar_classes[C]);
1765	addtok (dfa, CAT);
1766	addtok (dfa, OR);
1767	}
1768	}
1769
1770	/* The grammar understood by the parser is as follows.
1771
1772	regexp:
1773	regexp OR branch
1774	branch
1775
1776	branch:
1777	branch closure
1778	closure
1779
1780	closure:
1781	closure QMARK
1782	closure STAR
1783	closure PLUS
1784	closure REPMN
1785	atom
1786
1787	atom:
1788	<normal character>
1789	<multibyte character>
1790	ANYCHAR
1791	MBCSET
1792	CSET
1793	BACKREF
1794	BEGLINE
1795	ENDLINE
1796	BEGWORD
1797	ENDWORD
1798	LIMWORD
1799	NOTLIMWORD
1800	LPAREN regexp RPAREN
1801	<empty>
1802
1803	The parser builds a parse tree in postfix form in an array of tokens. */
1804
1805	static void
1806	atom (struct dfa *dfa)
1807	{
1808	if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
1809	\|\| dfa->parse.tok >= CSET
1810	\|\| dfa->parse.tok == BEG \|\| dfa->parse.tok == BACKREF
1811	\|\| dfa->parse.tok == BEGLINE \|\| dfa->parse.tok == ENDLINE
1812	\|\| dfa->parse.tok == BEGWORD \|\| dfa->parse.tok == ENDWORD
1813	\|\| dfa->parse.tok == LIMWORD \|\| dfa->parse.tok == NOTLIMWORD
1814	\|\| dfa->parse.tok == ANYCHAR \|\| dfa->parse.tok == MBCSET)
1815	{
1816	if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
1817	{
1818	/* For UTF-8 expand the period to a series of CSETs that define a
1819	valid UTF-8 character. This avoids using the slow multibyte
1820	path. I'm pretty sure it would be both profitable and correct to
1821	do it for any encoding; however, the optimization must be done
1822	manually as it is done above in add_utf8_anychar. So, let's
1823	start with UTF-8: it is the most used, and the structure of the
1824	encoding makes the correctness more obvious. */
1825	add_utf8_anychar (dfa);
1826	}
1827	else
1828	addtok (dfa, dfa->parse.tok);
1829	dfa->parse.tok = lex (dfa);
1830	}
1831	else if (dfa->parse.tok == WCHAR)
1832	{
1833	if (dfa->lex.wctok == WEOF)
1834	addtok (dfa, BACKREF);
1835	else
1836	{
1837	addtok_wc (dfa, dfa->lex.wctok);
1838
1839	if (dfa->syntax.case_fold)
1840	{
1841	wchar_t folded[CASE_FOLDED_BUFSIZE];
1842	int n = case_folded_counterparts (dfa->lex.wctok, folded);
1843	for (int i = 0; i < n; i++)
1844	{
1845	addtok_wc (dfa, folded[i]);
1846	addtok (dfa, OR);
1847	}
1848	}
1849	}
1850
1851	dfa->parse.tok = lex (dfa);
1852	}
1853	else if (dfa->parse.tok == LPAREN)
1854	{
1855	dfa->parse.tok = lex (dfa);
1856	regexp (dfa);
1857	if (dfa->parse.tok != RPAREN)
1858	dfaerror (_("unbalanced ("));
1859	dfa->parse.tok = lex (dfa);
1860	}
1861	else
1862	addtok (dfa, EMPTY);
1863	}
1864
1865	/* Return the number of tokens in the given subexpression. */
1866	static idx_t _GL_ATTRIBUTE_PURE
1867	nsubtoks (struct dfa const *dfa, idx_t tindex)
1868	{
1869	switch (dfa->tokens[tindex - 1])
1870	{
1871	default:
1872	return 1;
1873	case QMARK:
1874	case STAR:
1875	case PLUS:
1876	return 1 + nsubtoks (dfa, tindex - 1);
1877	case CAT:
1878	case OR:
1879	{
1880	idx_t ntoks1 = nsubtoks (dfa, tindex - 1);
1881	return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
1882	}
1883	}
1884	}
1885
1886	/* Copy the given subexpression to the top of the tree. */
1887	static void
1888	copytoks (struct dfa *dfa, idx_t tindex, idx_t ntokens)
1889	{
1890	if (dfa->localeinfo.multibyte)
1891	for (idx_t i = 0; i < ntokens; i++)
1892	addtok_mb (dfa, dfa->tokens[tindex + i],
1893	dfa->multibyte_prop[tindex + i]);
1894	else
1895	for (idx_t i = 0; i < ntokens; i++)
1896	addtok_mb (dfa, dfa->tokens[tindex + i], 3);
1897	}
1898
1899	static void
1900	closure (struct dfa *dfa)
1901	{
1902	atom (dfa);
1903	while (dfa->parse.tok == QMARK \|\| dfa->parse.tok == STAR
1904	\|\| dfa->parse.tok == PLUS \|\| dfa->parse.tok == REPMN)
1905	if (dfa->parse.tok == REPMN && (dfa->lex.minrep \|\| dfa->lex.maxrep))
1906	{
1907	idx_t ntokens = nsubtoks (dfa, dfa->tindex);
1908	idx_t tindex = dfa->tindex - ntokens;
1909	if (dfa->lex.maxrep < 0)
1910	addtok (dfa, PLUS);
1911	if (dfa->lex.minrep == 0)
1912	addtok (dfa, QMARK);
1913	int i;
1914	for (i = 1; i < dfa->lex.minrep; i++)
1915	{
1916	copytoks (dfa, tindex, ntokens);
1917	addtok (dfa, CAT);
1918	}
1919	for (; i < dfa->lex.maxrep; i++)
1920	{
1921	copytoks (dfa, tindex, ntokens);
1922	addtok (dfa, QMARK);
1923	addtok (dfa, CAT);
1924	}
1925	dfa->parse.tok = lex (dfa);
1926	}
1927	else if (dfa->parse.tok == REPMN)
1928	{
1929	dfa->tindex -= nsubtoks (dfa, dfa->tindex);
1930	dfa->parse.tok = lex (dfa);
1931	closure (dfa);
1932	}
1933	else
1934	{
1935	addtok (dfa, dfa->parse.tok);
1936	dfa->parse.tok = lex (dfa);
1937	}
1938	}
1939
1940	static void
1941	branch (struct dfa* dfa)
1942	{
1943	closure (dfa);
1944	while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
1945	&& dfa->parse.tok >= 0)
1946	{
1947	closure (dfa);
1948	addtok (dfa, CAT);
1949	}
1950	}
1951
1952	static void
1953	regexp (struct dfa *dfa)
1954	{
1955	branch (dfa);
1956	while (dfa->parse.tok == OR)
1957	{
1958	dfa->parse.tok = lex (dfa);
1959	branch (dfa);
1960	addtok (dfa, OR);
1961	}
1962	}
1963
1964	/* Parse a string S of length LEN into D. S can include NUL characters.
1965	This is the main entry point for the parser. */
1966	void
1967	dfaparse (char const s, idx_t len, struct dfa d)
1968	{
1969	d->lex.ptr = s;
1970	d->lex.left = len;
1971	d->lex.lasttok = END;
1972	d->lex.laststart = true;
1973
1974	if (!d->syntax.syntax_bits_set)
1975	dfaerror (_("no syntax specified"));
1976
1977	if (!d->nregexps)
1978	addtok (d, BEG);
1979
1980	d->parse.tok = lex (d);
1981	d->parse.depth = d->depth;
1982
1983	regexp (d);
1984
1985	if (d->parse.tok != END)
1986	dfaerror (_("unbalanced )"));
1987
1988	addtok (d, END - d->nregexps);
1989	addtok (d, CAT);
1990
1991	if (d->nregexps)
1992	addtok (d, OR);
1993
1994	++d->nregexps;
1995	}
1996
1997	/* Some primitives for operating on sets of positions. */
1998
1999	/* Copy one set to another. */
2000	static void
2001	copy (position_set const src, position_set dst)
2002	{
2003	if (dst->alloc < src->nelem)
2004	{
2005	free (dst->elems);
2006	dst->elems = xpalloc (NULL, &dst->alloc, src->nelem - dst->alloc, -1,
2007	sizeof *dst->elems);
2008	}
2009	dst->nelem = src->nelem;
2010	if (src->nelem != 0)
2011	memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
2012	}
2013
2014	static void
2015	alloc_position_set (position_set *s, idx_t size)
2016	{
2017	s->elems = xnmalloc (size, sizeof *s->elems);
2018	s->alloc = size;
2019	s->nelem = 0;
2020	}
2021
2022	/* Insert position P in set S. S is maintained in sorted order on
2023	decreasing index. If there is already an entry in S with P.index
2024	then merge (logically-OR) P's constraints into the one in S.
2025	S->elems must point to an array large enough to hold the resulting set. */
2026	static void
2027	insert (position p, position_set *s)
2028	{
2029	idx_t count = s->nelem;
2030	idx_t lo = 0, hi = count;
2031	while (lo < hi)
2032	{
2033	idx_t mid = (lo + hi) >> 1;
2034	if (s->elems[mid].index < p.index)
2035	lo = mid + 1;
2036	else if (s->elems[mid].index == p.index)
2037	{
2038	s->elems[mid].constraint \|= p.constraint;
2039	return;
2040	}
2041	else
2042	hi = mid;
2043	}
2044
2045	s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2046	for (idx_t i = count; i > lo; i--)
2047	s->elems[i] = s->elems[i - 1];
2048	s->elems[lo] = p;
2049	++s->nelem;
2050	}
2051
2052	static void
2053	append (position p, position_set *s)
2054	{
2055	idx_t count = s->nelem;
2056	s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2057	s->elems[s->nelem++] = p;
2058	}
2059
2060	/* Merge S1 and S2 (with the additional constraint C2) into M. The
2061	result is as if the positions of S1, and of S2 with the additional
2062	constraint C2, were inserted into an initially empty set. */
2063	static void
2064	merge_constrained (position_set const s1, position_set const s2,
2065	unsigned int c2, position_set *m)
2066	{
2067	idx_t i = 0, j = 0;
2068
2069	if (m->alloc - s1->nelem < s2->nelem)
2070	{
2071	free (m->elems);
2072	m->alloc = s1->nelem;
2073	m->elems = xpalloc (NULL, &m->alloc, s2->nelem, -1, sizeof *m->elems);
2074	}
2075	m->nelem = 0;
2076	while (i < s1->nelem \|\| j < s2->nelem)
2077	if (! (j < s2->nelem)
2078	\|\| (i < s1->nelem && s1->elems[i].index <= s2->elems[j].index))
2079	{
2080	unsigned int c = ((i < s1->nelem && j < s2->nelem
2081	&& s1->elems[i].index == s2->elems[j].index)
2082	? s2->elems[j++].constraint & c2
2083	: 0);
2084	m->elems[m->nelem].index = s1->elems[i].index;
2085	m->elems[m->nelem++].constraint = s1->elems[i++].constraint \| c;
2086	}
2087	else
2088	{
2089	if (s2->elems[j].constraint & c2)
2090	{
2091	m->elems[m->nelem].index = s2->elems[j].index;
2092	m->elems[m->nelem++].constraint = s2->elems[j].constraint & c2;
2093	}
2094	j++;
2095	}
2096	}
2097
2098	/* Merge two sets of positions into a third. The result is exactly as if
2099	the positions of both sets were inserted into an initially empty set. */
2100	static void
2101	merge (position_set const s1, position_set const s2, position_set *m)
2102	{
2103	merge_constrained (s1, s2, -1, m);
2104	}
2105
2106	/* Merge into DST all the elements of SRC, possibly destroying
2107	the contents of the temporary M. */
2108	static void
2109	merge2 (position_set dst, position_set const src, position_set *m)
2110	{
2111	if (src->nelem < 4)
2112	{
2113	for (idx_t i = 0; i < src->nelem; i++)
2114	insert (src->elems[i], dst);
2115	}
2116	else
2117	{
2118	merge (src, dst, m);
2119	copy (m, dst);
2120	}
2121	}
2122
2123	/* Delete a position from a set. Return the nonzero constraint of the
2124	deleted position, or zero if there was no such position. */
2125	static unsigned int
2126	delete (idx_t del, position_set *s)
2127	{
2128	idx_t count = s->nelem;
2129	idx_t lo = 0, hi = count;
2130	while (lo < hi)
2131	{
2132	idx_t mid = (lo + hi) >> 1;
2133	if (s->elems[mid].index < del)
2134	lo = mid + 1;
2135	else if (s->elems[mid].index == del)
2136	{
2137	unsigned int c = s->elems[mid].constraint;
2138	idx_t i;
2139	for (i = mid; i + 1 < count; i++)
2140	s->elems[i] = s->elems[i + 1];
2141	s->nelem = i;
2142	return c;
2143	}
2144	else
2145	hi = mid;
2146	}
2147	return 0;
2148	}
2149
2150	/* Replace a position with the followed set. */
2151	static void
2152	replace (position_set dst, idx_t del, position_set add,
2153	unsigned int constraint, position_set *tmp)
2154	{
2155	unsigned int c = delete (del, dst) & constraint;
2156
2157	if (c)
2158	{
2159	copy (dst, tmp);
2160	merge_constrained (tmp, add, c, dst);
2161	}
2162	}
2163
2164	/* Find the index of the state corresponding to the given position set with
2165	the given preceding context, or create a new state if there is no such
2166	state. Context tells whether we got here on a newline or letter. */
2167	static state_num
2168	state_index (struct dfa d, position_set const s, int context)
2169	{
2170	size_t hash = 0;
2171	int constraint = 0;
2172	state_num i;
2173
2174	for (i = 0; i < s->nelem; ++i)
2175	{
2176	idx_t ind = s->elems[i].index;
2177	hash ^= ind + s->elems[i].constraint;
2178	}
2179
2180	/* Try to find a state that exactly matches the proposed one. */
2181	for (i = 0; i < d->sindex; ++i)
2182	{
2183	if (hash != d->states[i].hash \|\| s->nelem != d->states[i].elems.nelem
2184	\|\| context != d->states[i].context)
2185	continue;
2186	state_num j;
2187	for (j = 0; j < s->nelem; ++j)
2188	if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint
2189	\|\| s->elems[j].index != d->states[i].elems.elems[j].index)
2190	break;
2191	if (j == s->nelem)
2192	return i;
2193	}
2194
2195	#ifdef DEBUG
2196	fprintf (stderr, "new state %td\n nextpos:", i);
2197	for (state_num j = 0; j < s->nelem; j++)
2198	{
2199	fprintf (stderr, " %td:", s->elems[j].index);
2200	prtok (d->tokens[s->elems[j].index]);
2201	}
2202	fprintf (stderr, "\n context:");
2203	if (context ^ CTX_ANY)
2204	{
2205	if (context & CTX_NONE)
2206	fprintf (stderr, " CTX_NONE");
2207	if (context & CTX_LETTER)
2208	fprintf (stderr, " CTX_LETTER");
2209	if (context & CTX_NEWLINE)
2210	fprintf (stderr, " CTX_NEWLINE");
2211	}
2212	else
2213	fprintf (stderr, " CTX_ANY");
2214	fprintf (stderr, "\n");
2215	#endif
2216
2217	for (state_num j = 0; j < s->nelem; j++)
2218	{
2219	int c = d->constraints[s->elems[j].index];
2220
2221	if (c != 0)
2222	{
2223	if (succeeds_in_context (c, context, CTX_ANY))
2224	constraint \|= c;
2225	}
2226	else if (d->tokens[s->elems[j].index] == BACKREF)
2227	constraint = NO_CONSTRAINT;
2228	}
2229
2230
2231	/* Create a new state. */
2232	d->states = maybe_realloc (d->states, d->sindex, &d->salloc, -1,
2233	sizeof *d->states);
2234	d->states[i].hash = hash;
2235	alloc_position_set (&d->states[i].elems, s->nelem);
2236	copy (s, &d->states[i].elems);
2237	d->states[i].context = context;
2238	d->states[i].constraint = constraint;
2239	d->states[i].mbps.nelem = 0;
2240	d->states[i].mbps.elems = NULL;
2241	d->states[i].mb_trindex = -1;
2242
2243	++d->sindex;
2244
2245	return i;
2246	}
2247
2248	/* Find the epsilon closure of D's set of positions. If any position of the set
2249	contains a symbol that matches the empty string in some context, replace
2250	that position with the elements of its follow labeled with an appropriate
2251	constraint. Repeat exhaustively until no funny positions are left.
2252	S->elems must be large enough to hold the result. BACKWARD is D's
2253	backward set; use and update it too. */
2254	static void
2255	epsclosure (struct dfa const d, position_set backward)
2256	{
2257	position_set tmp;
2258	alloc_position_set (&tmp, d->nleaves);
2259	for (idx_t i = 0; i < d->tindex; i++)
2260	if (0 < d->follows[i].nelem)
2261	{
2262	unsigned int constraint;
2263	switch (d->tokens[i])
2264	{
2265	default:
2266	continue;
2267
2268	case BEGLINE:
2269	constraint = BEGLINE_CONSTRAINT;
2270	break;
2271	case ENDLINE:
2272	constraint = ENDLINE_CONSTRAINT;
2273	break;
2274	case BEGWORD:
2275	constraint = BEGWORD_CONSTRAINT;
2276	break;
2277	case ENDWORD:
2278	constraint = ENDWORD_CONSTRAINT;
2279	break;
2280	case LIMWORD:
2281	constraint = LIMWORD_CONSTRAINT;
2282	break;
2283	case NOTLIMWORD:
2284	constraint = NOTLIMWORD_CONSTRAINT;
2285	break;
2286	case EMPTY:
2287	constraint = NO_CONSTRAINT;
2288	break;
2289	}
2290
2291	delete (i, &d->follows[i]);
2292
2293	for (idx_t j = 0; j < backward[i].nelem; j++)
2294	replace (&d->follows[backward[i].elems[j].index], i, &d->follows[i],
2295	constraint, &tmp);
2296	for (idx_t j = 0; j < d->follows[i].nelem; j++)
2297	replace (&backward[d->follows[i].elems[j].index], i, &backward[i],
2298	NO_CONSTRAINT, &tmp);
2299	}
2300	free (tmp.elems);
2301	}
2302
2303	/* Returns the set of contexts for which there is at least one
2304	character included in C. */
2305
2306	static int
2307	charclass_context (struct dfa const dfa, charclass const c)
2308	{
2309	int context = 0;
2310
2311	for (int j = 0; j < CHARCLASS_WORDS; j++)
2312	{
2313	if (c->w[j] & dfa->syntax.newline.w[j])
2314	context \|= CTX_NEWLINE;
2315	if (c->w[j] & dfa->syntax.letters.w[j])
2316	context \|= CTX_LETTER;
2317	if (c->w[j] & ~(dfa->syntax.letters.w[j] \| dfa->syntax.newline.w[j]))
2318	context \|= CTX_NONE;
2319	}
2320
2321	return context;
2322	}
2323
2324	/* Returns the contexts on which the position set S depends. Each context
2325	in the set of returned contexts (let's call it SC) may have a different
2326	follow set than other contexts in SC, and also different from the
2327	follow set of the complement set (sc ^ CTX_ANY). However, all contexts
2328	in the complement set will have the same follow set. */
2329
2330	static int _GL_ATTRIBUTE_PURE
2331	state_separate_contexts (struct dfa d, position_set const s)
2332	{
2333	int separate_contexts = 0;
2334
2335	for (idx_t j = 0; j < s->nelem; j++)
2336	separate_contexts \|= d->separates[s->elems[j].index];
2337
2338	return separate_contexts;
2339	}
2340
2341	enum
2342	{
2343	/* Single token is repeated. It is distinguished from non-repeated. */
2344	OPT_REPEAT = (1 << 0),
2345
2346	/* Multiple tokens are repeated. This flag is on at head of tokens. The
2347	node is not merged. */
2348	OPT_LPAREN = (1 << 1),
2349
2350	/* Multiple branches are joined. The node is not merged. */
2351	OPT_RPAREN = (1 << 2),
2352
2353	/* The node is walked. If the node is found in walking again, OPT_RPAREN
2354	flag is turned on. */
2355	OPT_WALKED = (1 << 3),
2356
2357	/* The node is queued. The node is not queued again. */
2358	OPT_QUEUED = (1 << 4)
2359	};
2360
2361	static void
2362	merge_nfa_state (struct dfa d, idx_t tindex, char flags,
2363	position_set *merged)
2364	{
2365	position_set *follows = d->follows;
2366	idx_t nelem = 0;
2367
2368	for (idx_t i = 0; i < follows[tindex].nelem; i++)
2369	{
2370	idx_t sindex = follows[tindex].elems[i].index;
2371
2372	/* Skip the node as pruned in future. */
2373	unsigned int iconstraint = follows[tindex].elems[i].constraint;
2374	if (iconstraint == 0)
2375	continue;
2376
2377	if (d->tokens[follows[tindex].elems[i].index] <= END)
2378	{
2379	d->constraints[tindex] \|= follows[tindex].elems[i].constraint;
2380	continue;
2381	}
2382
2383	if (sindex != tindex && !(flags[sindex] & (OPT_LPAREN \| OPT_RPAREN)))
2384	{
2385	idx_t j;
2386
2387	for (j = 0; j < nelem; j++)
2388	{
2389	idx_t dindex = follows[tindex].elems[j].index;
2390
2391	if (dindex == tindex)
2392	continue;
2393
2394	if (follows[tindex].elems[j].constraint != iconstraint)
2395	continue;
2396
2397	if (flags[dindex] & (OPT_LPAREN \| OPT_RPAREN))
2398	continue;
2399
2400	if (d->tokens[sindex] != d->tokens[dindex])
2401	continue;
2402
2403	if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT)
2404	continue;
2405
2406	if (flags[sindex] & OPT_REPEAT)
2407	delete (sindex, &follows[sindex]);
2408
2409	merge2 (&follows[dindex], &follows[sindex], merged);
2410
2411	break;
2412	}
2413
2414	if (j < nelem)
2415	continue;
2416	}
2417
2418	follows[tindex].elems[nelem++] = follows[tindex].elems[i];
2419	flags[sindex] \|= OPT_QUEUED;
2420	}
2421
2422	follows[tindex].nelem = nelem;
2423	}
2424
2425	static int
2426	compare (const void a, const void b)
2427	{
2428	position const p = a, q = b;
2429	return (p->index > q->index) - (p->index < q->index);
2430	}
2431
2432	static void
2433	reorder_tokens (struct dfa *d)
2434	{
2435	idx_t nleaves = 0;
2436	ptrdiff_t map = xnmalloc (d->tindex, sizeof map);
2437	map[0] = nleaves++;
2438	for (idx_t i = 1; i < d->tindex; i++)
2439	map[i] = -1;
2440
2441	token tokens = xnmalloc (d->nleaves, sizeof tokens);
2442	position_set follows = xnmalloc (d->nleaves, sizeof follows);
2443	int constraints = xnmalloc (d->nleaves, sizeof constraints);
2444	char *multibyte_prop = (d->localeinfo.multibyte
2445	? xnmalloc (d->nleaves, sizeof *multibyte_prop)
2446	: NULL);
2447
2448	for (idx_t i = 0; i < d->tindex; i++)
2449	{
2450	if (map[i] < 0)
2451	{
2452	free (d->follows[i].elems);
2453	d->follows[i].elems = NULL;
2454	d->follows[i].nelem = 0;
2455	continue;
2456	}
2457
2458	tokens[map[i]] = d->tokens[i];
2459	follows[map[i]] = d->follows[i];
2460	constraints[map[i]] = d->constraints[i];
2461
2462	if (multibyte_prop != NULL)
2463	multibyte_prop[map[i]] = d->multibyte_prop[i];
2464
2465	for (idx_t j = 0; j < d->follows[i].nelem; j++)
2466	{
2467	if (map[d->follows[i].elems[j].index] == -1)
2468	map[d->follows[i].elems[j].index] = nleaves++;
2469
2470	d->follows[i].elems[j].index = map[d->follows[i].elems[j].index];
2471	}
2472
2473	qsort (d->follows[i].elems, d->follows[i].nelem,
2474	sizeof *d->follows[i].elems, compare);
2475	}
2476
2477	for (idx_t i = 0; i < nleaves; i++)
2478	{
2479	d->tokens[i] = tokens[i];
2480	d->follows[i] = follows[i];
2481	d->constraints[i] = constraints[i];
2482
2483	if (multibyte_prop != NULL)
2484	d->multibyte_prop[i] = multibyte_prop[i];
2485	}
2486
2487	d->tindex = d->nleaves = nleaves;
2488
2489	free (tokens);
2490	free (follows);
2491	free (constraints);
2492	free (multibyte_prop);
2493	free (map);
2494	}
2495
2496	static void
2497	dfaoptimize (struct dfa *d)
2498	{
2499	char *flags = xizalloc (d->tindex);
2500
2501	for (idx_t i = 0; i < d->tindex; i++)
2502	{
2503	for (idx_t j = 0; j < d->follows[i].nelem; j++)
2504	{
2505	if (d->follows[i].elems[j].index == i)
2506	flags[d->follows[i].elems[j].index] \|= OPT_REPEAT;
2507	else if (d->follows[i].elems[j].index < i)
2508	flags[d->follows[i].elems[j].index] \|= OPT_LPAREN;
2509	else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED)
2510	flags[d->follows[i].elems[j].index] \|= OPT_RPAREN;
2511	else
2512	flags[d->follows[i].elems[j].index] \|= OPT_WALKED;
2513	}
2514	}
2515
2516	flags[0] \|= OPT_QUEUED;
2517
2518	position_set merged0;
2519	position_set *merged = &merged0;
2520	alloc_position_set (merged, d->nleaves);
2521
2522	d->constraints = xicalloc (d->tindex, sizeof *d->constraints);
2523
2524	for (idx_t i = 0; i < d->tindex; i++)
2525	if (flags[i] & OPT_QUEUED)
2526	merge_nfa_state (d, i, flags, merged);
2527
2528	reorder_tokens (d);
2529
2530	free (merged->elems);
2531	free (flags);
2532	}
2533
2534	/* Perform bottom-up analysis on the parse tree, computing various functions.
2535	Note that at this point, we're pretending constructs like \< are real
2536	characters rather than constraints on what can follow them.
2537
2538	Nullable: A node is nullable if it is at the root of a regexp that can
2539	match the empty string.
2540	* EMPTY leaves are nullable.
2541	* No other leaf is nullable.
2542	* A QMARK or STAR node is nullable.
2543	* A PLUS node is nullable if its argument is nullable.
2544	* A CAT node is nullable if both its arguments are nullable.
2545	* An OR node is nullable if either argument is nullable.
2546
2547	Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
2548	that could correspond to the first character of a string matching the
2549	regexp rooted at the given node.
2550	* EMPTY leaves have empty firstpos.
2551	* The firstpos of a nonempty leaf is that leaf itself.
2552	* The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2553	argument.
2554	* The firstpos of a CAT node is the firstpos of the left argument, union
2555	the firstpos of the right if the left argument is nullable.
2556	* The firstpos of an OR node is the union of firstpos of each argument.
2557
2558	Lastpos: The lastpos of a node is the set of positions that could
2559	correspond to the last character of a string matching the regexp at
2560	the given node.
2561	* EMPTY leaves have empty lastpos.
2562	* The lastpos of a nonempty leaf is that leaf itself.
2563	* The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2564	argument.
2565	* The lastpos of a CAT node is the lastpos of its right argument, union
2566	the lastpos of the left if the right argument is nullable.
2567	* The lastpos of an OR node is the union of the lastpos of each argument.
2568
2569	Follow: The follow of a position is the set of positions that could
2570	correspond to the character following a character matching the node in
2571	a string matching the regexp. At this point we consider special symbols
2572	that match the empty string in some context to be just normal characters.
2573	Later, if we find that a special symbol is in a follow set, we will
2574	replace it with the elements of its follow, labeled with an appropriate
2575	constraint.
2576	* Every node in the firstpos of the argument of a STAR or PLUS node is in
2577	the follow of every node in the lastpos.
2578	* Every node in the firstpos of the second argument of a CAT node is in
2579	the follow of every node in the lastpos of the first argument.
2580
2581	Because of the postfix representation of the parse tree, the depth-first
2582	analysis is conveniently done by a linear scan with the aid of a stack.
2583	Sets are stored as arrays of the elements, obeying a stack-like allocation
2584	scheme; the number of elements in each set deeper in the stack can be
2585	used to determine the address of a particular set's array. */
2586	static void
2587	dfaanalyze (struct dfa *d, bool searchflag)
2588	{
2589	/* Array allocated to hold position sets. */
2590	position posalloc = xnmalloc (d->nleaves, 2 sizeof *posalloc);
2591	/* Firstpos and lastpos elements. */
2592	position *firstpos = posalloc;
2593	position *lastpos = firstpos + d->nleaves;
2594	position pos;
2595	position_set tmp;
2596
2597	/* Stack for element counts and nullable flags. */
2598	struct
2599	{
2600	/* Whether the entry is nullable. */
2601	bool nullable;
2602
2603	/* Counts of firstpos and lastpos sets. */
2604	idx_t nfirstpos;
2605	idx_t nlastpos;
2606	} stkalloc = xnmalloc (d->depth, sizeof stkalloc), *stk = stkalloc;
2607
2608	position_set merged; /* Result of merging sets. */
2609
2610	addtok (d, CAT);
2611	idx_t tindex = d->tindex;
2612
2613	#ifdef DEBUG
2614	fprintf (stderr, "dfaanalyze:\n");
2615	for (idx_t i = 0; i < tindex; i++)
2616	{
2617	fprintf (stderr, " %td:", i);
2618	prtok (d->tokens[i]);
2619	}
2620	putc ('\n', stderr);
2621	#endif
2622
2623	d->searchflag = searchflag;
2624	alloc_position_set (&merged, d->nleaves);
2625	d->follows = xicalloc (tindex, sizeof *d->follows);
2626	position_set *backward
2627	= d->epsilon ? xicalloc (tindex, sizeof *backward) : NULL;
2628
2629	for (idx_t i = 0; i < tindex; i++)
2630	{
2631	switch (d->tokens[i])
2632	{
2633	case EMPTY:
2634	/* The empty set is nullable. */
2635	stk->nullable = true;
2636
2637	/* The firstpos and lastpos of the empty leaf are both empty. */
2638	stk->nfirstpos = stk->nlastpos = 0;
2639	stk++;
2640	break;
2641
2642	case STAR:
2643	case PLUS:
2644	/* Every element in the lastpos of the argument is in the backward
2645	set of every element in the firstpos. */
2646	if (d->epsilon)
2647	{
2648	tmp.elems = lastpos - stk[-1].nlastpos;
2649	tmp.nelem = stk[-1].nlastpos;
2650	for (position *p = firstpos - stk[-1].nfirstpos;
2651	p < firstpos; p++)
2652	merge2 (&backward[p->index], &tmp, &merged);
2653	}
2654
2655	/* Every element in the firstpos of the argument is in the follow
2656	of every element in the lastpos. */
2657	{
2658	tmp.elems = firstpos - stk[-1].nfirstpos;
2659	tmp.nelem = stk[-1].nfirstpos;
2660	for (position *p = lastpos - stk[-1].nlastpos; p < lastpos; p++)
2661	merge2 (&d->follows[p->index], &tmp, &merged);
2662	}
2663	FALLTHROUGH;
2664	case QMARK:
2665	/* A QMARK or STAR node is automatically nullable. */
2666	if (d->tokens[i] != PLUS)
2667	stk[-1].nullable = true;
2668	break;
2669
2670	case CAT:
2671	/* Every element in the lastpos of the first argument is in
2672	the backward set of every element in the firstpos of the
2673	second argument. */
2674	if (backward)
2675	{
2676	tmp.nelem = stk[-2].nlastpos;
2677	tmp.elems = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2678	for (position *p = firstpos - stk[-1].nfirstpos;
2679	p < firstpos; p++)
2680	merge2 (&backward[p->index], &tmp, &merged);
2681	}
2682
2683	/* Every element in the firstpos of the second argument is in the
2684	follow of every element in the lastpos of the first argument. */
2685	{
2686	tmp.nelem = stk[-1].nfirstpos;
2687	tmp.elems = firstpos - stk[-1].nfirstpos;
2688	for (position *plim = lastpos - stk[-1].nlastpos,
2689	*p = plim - stk[-2].nlastpos;
2690	p < plim; p++)
2691	merge2 (&d->follows[p->index], &tmp, &merged);
2692	}
2693
2694	/* The firstpos of a CAT node is the firstpos of the first argument,
2695	union that of the second argument if the first is nullable. */
2696	if (stk[-2].nullable)
2697	stk[-2].nfirstpos += stk[-1].nfirstpos;
2698	else
2699	firstpos -= stk[-1].nfirstpos;
2700
2701	/* The lastpos of a CAT node is the lastpos of the second argument,
2702	union that of the first argument if the second is nullable. */
2703	if (stk[-1].nullable)
2704	stk[-2].nlastpos += stk[-1].nlastpos;
2705	else
2706	{
2707	position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2708	for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2709	p[j] = p[j + stk[-2].nlastpos];
2710	lastpos -= stk[-2].nlastpos;
2711	stk[-2].nlastpos = stk[-1].nlastpos;
2712	}
2713
2714	/* A CAT node is nullable if both arguments are nullable. */
2715	stk[-2].nullable &= stk[-1].nullable;
2716	stk--;
2717	break;
2718
2719	case OR:
2720	/* The firstpos is the union of the firstpos of each argument. */
2721	stk[-2].nfirstpos += stk[-1].nfirstpos;
2722
2723	/* The lastpos is the union of the lastpos of each argument. */
2724	stk[-2].nlastpos += stk[-1].nlastpos;
2725
2726	/* An OR node is nullable if either argument is nullable. */
2727	stk[-2].nullable \|= stk[-1].nullable;
2728	stk--;
2729	break;
2730
2731	default:
2732	/* Anything else is a nonempty position. (Note that special
2733	constructs like \< are treated as nonempty strings here;
2734	an "epsilon closure" effectively makes them nullable later.
2735	Backreferences have to get a real position so we can detect
2736	transitions on them later. But they are nullable. */
2737	stk->nullable = d->tokens[i] == BACKREF;
2738
2739	/* This position is in its own firstpos and lastpos. */
2740	stk->nfirstpos = stk->nlastpos = 1;
2741	stk++;
2742
2743	firstpos->index = lastpos->index = i;
2744	firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2745	firstpos++, lastpos++;
2746
2747	break;
2748	}
2749	#ifdef DEBUG
2750	/* ... balance the above nonsyntactic #ifdef goo... */
2751	fprintf (stderr, "node %td:", i);
2752	prtok (d->tokens[i]);
2753	putc ('\n', stderr);
2754	fprintf (stderr,
2755	stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
2756	fprintf (stderr, " firstpos:");
2757	for (idx_t j = 0; j < stk[-1].nfirstpos; j++)
2758	{
2759	fprintf (stderr, " %td:", firstpos[j - stk[-1].nfirstpos].index);
2760	prtok (d->tokens[firstpos[j - stk[-1].nfirstpos].index]);
2761	}
2762	fprintf (stderr, "\n lastpos:");
2763	for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2764	{
2765	fprintf (stderr, " %td:", lastpos[j - stk[-1].nlastpos].index);
2766	prtok (d->tokens[lastpos[j - stk[-1].nlastpos].index]);
2767	}
2768	putc ('\n', stderr);
2769	#endif
2770	}
2771
2772	if (backward)
2773	{
2774	/* For each follow set that is the follow set of a real position,
2775	replace it with its epsilon closure. */
2776	epsclosure (d, backward);
2777
2778	for (idx_t i = 0; i < tindex; i++)
2779	free (backward[i].elems);
2780	free (backward);
2781	}
2782
2783	dfaoptimize (d);
2784
2785	#ifdef DEBUG
2786	for (idx_t i = 0; i < tindex; i++)
2787	if (d->tokens[i] == BEG \|\| d->tokens[i] < NOTCHAR
2788	\|\| d->tokens[i] == BACKREF \|\| d->tokens[i] == ANYCHAR
2789	\|\| d->tokens[i] == MBCSET \|\| d->tokens[i] >= CSET)
2790	{
2791	fprintf (stderr, "follows(%td:", i);
2792	prtok (d->tokens[i]);
2793	fprintf (stderr, "):");
2794	for (idx_t j = 0; j < d->follows[i].nelem; j++)
2795	{
2796	fprintf (stderr, " %td:", d->follows[i].elems[j].index);
2797	prtok (d->tokens[d->follows[i].elems[j].index]);
2798	}
2799	putc ('\n', stderr);
2800	}
2801	#endif
2802
2803	pos.index = 0;
2804	pos.constraint = NO_CONSTRAINT;
2805
2806	alloc_position_set (&tmp, 1);
2807
2808	append (pos, &tmp);
2809
2810	d->separates = xicalloc (tindex, sizeof *d->separates);
2811
2812	for (idx_t i = 0; i < tindex; i++)
2813	{
2814	if (prev_newline_dependent (d->constraints[i]))
2815	d->separates[i] \|= CTX_NEWLINE;
2816	if (prev_letter_dependent (d->constraints[i]))
2817	d->separates[i] \|= CTX_LETTER;
2818
2819	for (idx_t j = 0; j < d->follows[i].nelem; j++)
2820	{
2821	if (prev_newline_dependent (d->follows[i].elems[j].constraint))
2822	d->separates[i] \|= CTX_NEWLINE;
2823	if (prev_letter_dependent (d->follows[i].elems[j].constraint))
2824	d->separates[i] \|= CTX_LETTER;
2825	}
2826	}
2827
2828	/* Context wanted by some position. */
2829	int separate_contexts = state_separate_contexts (d, &tmp);
2830
2831	/* Build the initial state. */
2832	if (separate_contexts & CTX_NEWLINE)
2833	state_index (d, &tmp, CTX_NEWLINE);
2834	d->initstate_notbol = d->min_trcount
2835	= state_index (d, &tmp, separate_contexts ^ CTX_ANY);
2836	if (separate_contexts & CTX_LETTER)
2837	d->min_trcount = state_index (d, &tmp, CTX_LETTER);
2838	d->min_trcount++;
2839	d->trcount = 0;
2840
2841	free (posalloc);
2842	free (stkalloc);
2843	free (merged.elems);
2844	free (tmp.elems);
2845	}
2846
2847	/* Make sure D's state arrays are large enough to hold NEW_STATE. */
2848	static void
2849	realloc_trans_if_necessary (struct dfa *d)
2850	{
2851	state_num oldalloc = d->tralloc;
2852	if (oldalloc < d->sindex)
2853	{
2854	state_num **realtrans = d->trans ? d->trans - 2 : NULL;
2855	idx_t newalloc1 = realtrans ? d->tralloc + 2 : 0;
2856	realtrans = xpalloc (realtrans, &newalloc1, d->sindex - oldalloc,
2857	-1, sizeof *realtrans);
2858	realtrans[0] = realtrans[1] = NULL;
2859	d->trans = realtrans + 2;
2860	idx_t newalloc = d->tralloc = newalloc1 - 2;
2861	d->fails = xreallocarray (d->fails, newalloc, sizeof *d->fails);
2862	d->success = xreallocarray (d->success, newalloc, sizeof *d->success);
2863	d->newlines = xreallocarray (d->newlines, newalloc, sizeof *d->newlines);
2864	if (d->localeinfo.multibyte)
2865	{
2866	realtrans = d->mb_trans ? d->mb_trans - 2 : NULL;
2867	realtrans = xreallocarray (realtrans, newalloc1, sizeof *realtrans);
2868	if (oldalloc == 0)
2869	realtrans[0] = realtrans[1] = NULL;
2870	d->mb_trans = realtrans + 2;
2871	}
2872	for (; oldalloc < newalloc; oldalloc++)
2873	{
2874	d->trans[oldalloc] = NULL;
2875	d->fails[oldalloc] = NULL;
2876	if (d->localeinfo.multibyte)
2877	d->mb_trans[oldalloc] = NULL;
2878	}
2879	}
2880	}
2881
2882	/*
2883	Calculate the transition table for a new state derived from state s
2884	for a compiled dfa d after input character uc, and return the new
2885	state number.
2886
2887	Do not worry about all possible input characters; calculate just the group
2888	of positions that match uc. Label it with the set of characters that
2889	every position in the group matches (taking into account, if necessary,
2890	preceding context information of s). Then find the union
2891	of these positions' follows, i.e., the set of positions of the
2892	new state. For each character in the group's label, set the transition
2893	on this character to be to a state corresponding to the set's positions,
2894	and its associated backward context information, if necessary.
2895
2896	When building a searching matcher, include the positions of state
2897	0 in every state.
2898
2899	The group is constructed by building an equivalence-class
2900	partition of the positions of s.
2901
2902	For each position, find the set of characters C that it matches. Eliminate
2903	any characters from C that fail on grounds of backward context.
2904
2905	Check whether the group's label L has nonempty
2906	intersection with C. If L - C is nonempty, create a new group labeled
2907	L - C and having the same positions as the current group, and set L to
2908	the intersection of L and C. Insert the position in the group, set
2909	C = C - L, and resume scanning.
2910
2911	If after comparing with every group there are characters remaining in C,
2912	create a new group labeled with the characters of C and insert this
2913	position in that group. */
2914
2915	static state_num
2916	build_state (state_num s, struct dfa *d, unsigned char uc)
2917	{
2918	position_set follows; /* Union of the follows for each
2919	position of the current state. */
2920	position_set group; /* Positions that match the input char. */
2921	position_set tmp; /* Temporary space for merging sets. */
2922	state_num state; /* New state. */
2923	state_num state_newline; /* New state on a newline transition. */
2924	state_num state_letter; /* New state on a letter transition. */
2925
2926	#ifdef DEBUG
2927	fprintf (stderr, "build state %td\n", s);
2928	#endif
2929
2930	/* A pointer to the new transition table, and the table itself. */
2931	state_num **ptrans = (accepting (s, d) ? d->fails : d->trans) + s;
2932	state_num trans = ptrans;
2933
2934	if (!trans)
2935	{
2936	/* MAX_TRCOUNT is an arbitrary upper limit on the number of
2937	transition tables that can exist at once, other than for
2938	initial states. Often-used transition tables are quickly
2939	rebuilt, whereas rarely-used ones are cleared away. */
2940	if (MAX_TRCOUNT <= d->trcount)
2941	{
2942	for (state_num i = d->min_trcount; i < d->tralloc; i++)
2943	{
2944	free (d->trans[i]);
2945	free (d->fails[i]);
2946	d->trans[i] = d->fails[i] = NULL;
2947	}
2948	d->trcount = 0;
2949	}
2950
2951	d->trcount++;
2952	ptrans = trans = xmalloc (NOTCHAR sizeof *trans);
2953
2954	/* Fill transition table with a default value which means that the
2955	transited state has not been calculated yet. */
2956	for (int i = 0; i < NOTCHAR; i++)
2957	trans[i] = -2;
2958	}
2959
2960	/* Set up the success bits for this state. */
2961	d->success[s] = 0;
2962	if (accepts_in_context (d->states[s].context, CTX_NEWLINE, s, d))
2963	d->success[s] \|= CTX_NEWLINE;
2964	if (accepts_in_context (d->states[s].context, CTX_LETTER, s, d))
2965	d->success[s] \|= CTX_LETTER;
2966	if (accepts_in_context (d->states[s].context, CTX_NONE, s, d))
2967	d->success[s] \|= CTX_NONE;
2968
2969	alloc_position_set (&follows, d->nleaves);
2970
2971	/* Find the union of the follows of the positions of the group.
2972	This is a hideously inefficient loop. Fix it someday. */
2973	for (idx_t j = 0; j < d->states[s].elems.nelem; j++)
2974	for (idx_t k = 0;
2975	k < d->follows[d->states[s].elems.elems[j].index].nelem; ++k)
2976	insert (d->follows[d->states[s].elems.elems[j].index].elems[k],
2977	&follows);
2978
2979	/* Positions that match the input char. */
2980	alloc_position_set (&group, d->nleaves);
2981
2982	/* The group's label. */
2983	charclass label;
2984	fillset (&label);
2985
2986	for (idx_t i = 0; i < follows.nelem; i++)
2987	{
2988	charclass matches; /* Set of matching characters. */
2989	position pos = follows.elems[i];
2990	bool matched = false;
2991	if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
2992	{
2993	zeroset (&matches);
2994	setbit (d->tokens[pos.index], &matches);
2995	if (d->tokens[pos.index] == uc)
2996	matched = true;
2997	}
2998	else if (d->tokens[pos.index] >= CSET)
2999	{
3000	matches = d->charclasses[d->tokens[pos.index] - CSET];
3001	if (tstbit (uc, &matches))
3002	matched = true;
3003	}
3004	else if (d->tokens[pos.index] == ANYCHAR)
3005	{
3006	matches = d->charclasses[d->canychar];
3007	if (tstbit (uc, &matches))
3008	matched = true;
3009
3010	/* ANYCHAR must match with a single character, so we must put
3011	it to D->states[s].mbps which contains the positions which
3012	can match with a single character not a byte. If all
3013	positions which has ANYCHAR does not depend on context of
3014	next character, we put the follows instead of it to
3015	D->states[s].mbps to optimize. */
3016	if (succeeds_in_context (pos.constraint, d->states[s].context,
3017	CTX_NONE))
3018	{
3019	if (d->states[s].mbps.nelem == 0)
3020	alloc_position_set (&d->states[s].mbps, 1);
3021	insert (pos, &d->states[s].mbps);
3022	}
3023	}
3024	else
3025	continue;
3026
3027	/* Some characters may need to be eliminated from matches because
3028	they fail in the current context. */
3029	if (pos.constraint != NO_CONSTRAINT)
3030	{
3031	if (!succeeds_in_context (pos.constraint,
3032	d->states[s].context, CTX_NEWLINE))
3033	for (int j = 0; j < CHARCLASS_WORDS; j++)
3034	matches.w[j] &= ~d->syntax.newline.w[j];
3035	if (!succeeds_in_context (pos.constraint,
3036	d->states[s].context, CTX_LETTER))
3037	for (int j = 0; j < CHARCLASS_WORDS; ++j)
3038	matches.w[j] &= ~d->syntax.letters.w[j];
3039	if (!succeeds_in_context (pos.constraint,
3040	d->states[s].context, CTX_NONE))
3041	for (int j = 0; j < CHARCLASS_WORDS; ++j)
3042	matches.w[j] &= d->syntax.letters.w[j] \| d->syntax.newline.w[j];
3043
3044	/* If there are no characters left, there's no point in going on. */
3045	if (emptyset (&matches))
3046	continue;
3047
3048	/* If we have reset the bit that made us declare "matched", reset
3049	that indicator, too. This is required to avoid an infinite loop
3050	with this command: echo cx \| LC_ALL=C grep -E 'c\b[x ]' */
3051	if (!tstbit (uc, &matches))
3052	matched = false;
3053	}
3054
3055	#ifdef DEBUG
3056	fprintf (stderr, " nextpos %td:", pos.index);
3057	prtok (d->tokens[pos.index]);
3058	fprintf (stderr, " of");
3059	for (unsigned j = 0; j < NOTCHAR; j++)
3060	if (tstbit (j, &matches))
3061	fprintf (stderr, " 0x%02x", j);
3062	fprintf (stderr, "\n");
3063	#endif
3064
3065	if (matched)
3066	{
3067	for (int k = 0; k < CHARCLASS_WORDS; ++k)
3068	label.w[k] &= matches.w[k];
3069	append (pos, &group);
3070	}
3071	else
3072	{
3073	for (int k = 0; k < CHARCLASS_WORDS; ++k)
3074	label.w[k] &= ~matches.w[k];
3075	}
3076	}
3077
3078	alloc_position_set (&tmp, d->nleaves);
3079
3080	if (group.nelem > 0)
3081	{
3082	/* If we are building a searching matcher, throw in the positions
3083	of state 0 as well, if possible. */
3084	if (d->searchflag)
3085	{
3086	/* If a token in follows.elems is not 1st byte of a multibyte
3087	character, or the states of follows must accept the bytes
3088	which are not 1st byte of the multibyte character.
3089	Then, if a state of follows encounters a byte, it must not be
3090	a 1st byte of a multibyte character nor a single byte character.
3091	In this case, do not add state[0].follows to next state, because
3092	state[0] must accept 1st-byte.
3093
3094	For example, suppose <sb a> is a certain single byte character,
3095	<mb A> is a certain multibyte character, and the codepoint of
3096	<sb a> equals the 2nd byte of the codepoint of <mb A>. When
3097	state[0] accepts <sb a>, state[i] transits to state[i+1] by
3098	accepting the 1st byte of <mb A>, and state[i+1] accepts the
3099	2nd byte of <mb A>, if state[i+1] encounters the codepoint of
3100	<sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do
3101	not add state[0]. */
3102
3103	bool mergeit = !d->localeinfo.multibyte;
3104	if (!mergeit)
3105	{
3106	mergeit = true;
3107	for (idx_t j = 0; mergeit && j < group.nelem; j++)
3108	mergeit &= d->multibyte_prop[group.elems[j].index];
3109	}
3110	if (mergeit)
3111	merge2 (&group, &d->states[0].elems, &tmp);
3112	}
3113
3114	/* Find out if the new state will want any context information,
3115	by calculating possible contexts that the group can match,
3116	and separate contexts that the new state wants to know. */
3117	int possible_contexts = charclass_context (d, &label);
3118	int separate_contexts = state_separate_contexts (d, &group);
3119
3120	/* Find the state(s) corresponding to the union of the follows. */
3121	if (possible_contexts & ~separate_contexts)
3122	state = state_index (d, &group, separate_contexts ^ CTX_ANY);
3123	else
3124	state = -1;
3125	if (separate_contexts & possible_contexts & CTX_NEWLINE)
3126	state_newline = state_index (d, &group, CTX_NEWLINE);
3127	else
3128	state_newline = state;
3129	if (separate_contexts & possible_contexts & CTX_LETTER)
3130	state_letter = state_index (d, &group, CTX_LETTER);
3131	else
3132	state_letter = state;
3133
3134	/* Reallocate now, to reallocate any newline transition properly. */
3135	realloc_trans_if_necessary (d);
3136	}
3137
3138	/* If we are a searching matcher, the default transition is to a state
3139	containing the positions of state 0, otherwise the default transition
3140	is to fail miserably. */
3141	else if (d->searchflag)
3142	{
3143	state_newline = 0;
3144	state_letter = d->min_trcount - 1;
3145	state = d->initstate_notbol;
3146	}
3147	else
3148	{
3149	state_newline = -1;
3150	state_letter = -1;
3151	state = -1;
3152	}
3153
3154	/* Set the transitions for each character in the label. */
3155	for (int i = 0; i < NOTCHAR; i++)
3156	if (tstbit (i, &label))
3157	switch (d->syntax.sbit[i])
3158	{
3159	case CTX_NEWLINE:
3160	trans[i] = state_newline;
3161	break;
3162	case CTX_LETTER:
3163	trans[i] = state_letter;
3164	break;
3165	default:
3166	trans[i] = state;
3167	break;
3168	}
3169
3170	#ifdef DEBUG
3171	fprintf (stderr, "trans table %td", s);
3172	for (int i = 0; i < NOTCHAR; ++i)
3173	{
3174	if (!(i & 0xf))
3175	fprintf (stderr, "\n");
3176	fprintf (stderr, " %2td", trans[i]);
3177	}
3178	fprintf (stderr, "\n");
3179	#endif
3180
3181	free (group.elems);
3182	free (follows.elems);
3183	free (tmp.elems);
3184
3185	/* Keep the newline transition in a special place so we can use it as
3186	a sentinel. */
3187	if (tstbit (d->syntax.eolbyte, &label))
3188	{
3189	d->newlines[s] = trans[d->syntax.eolbyte];
3190	trans[d->syntax.eolbyte] = -1;
3191	}
3192
3193	return trans[uc];
3194	}
3195
3196	/* Multibyte character handling sub-routines for dfaexec. */
3197
3198	/* Consume a single byte and transit state from 's' to '*next_state'.
3199	This function is almost same as the state transition routin in dfaexec.
3200	But state transition is done just once, otherwise matching succeed or
3201	reach the end of the buffer. */
3202	static state_num
3203	transit_state_singlebyte (struct dfa d, state_num s, unsigned char const *pp)
3204	{
3205	state_num *t;
3206
3207	if (d->trans[s])
3208	t = d->trans[s];
3209	else if (d->fails[s])
3210	t = d->fails[s];
3211	else
3212	{
3213	build_state (s, d, **pp);
3214	if (d->trans[s])
3215	t = d->trans[s];
3216	else
3217	{
3218	t = d->fails[s];
3219	assert (t);
3220	}
3221	}
3222
3223	if (t[**pp] == -2)
3224	build_state (s, d, **pp);
3225
3226	return t[(pp)++];
3227	}
3228
3229	/* Transit state from s, then return new state and update the pointer of
3230	the buffer. This function is for a period operator which can match a
3231	multi-byte character. */
3232	static state_num
3233	transit_state (struct dfa d, state_num s, unsigned char const *pp,
3234	unsigned char const *end)
3235	{
3236	wint_t wc;
3237
3238	int mbclen = mbs_to_wchar (&wc, (char const ) pp, end - *pp, d);
3239
3240	/* This state has some operators which can match a multibyte character. */
3241	d->mb_follows.nelem = 0;
3242
3243	/* Calculate the state which can be reached from the state 's' by
3244	consuming 'mbclen' single bytes from the buffer. */
3245	state_num s1 = s;
3246	int mbci;
3247	for (mbci = 0; mbci < mbclen && (mbci == 0 \|\| d->min_trcount <= s); mbci++)
3248	s = transit_state_singlebyte (d, s, pp);
3249	*pp += mbclen - mbci;
3250
3251	if (wc == WEOF)
3252	{
3253	/* It is an invalid character, so ANYCHAR is not accepted. */
3254	return s;
3255	}
3256
3257	/* If all positions which have ANYCHAR do not depend on the context
3258	of the next character, calculate the next state with
3259	pre-calculated follows and cache the result. */
3260	if (d->states[s1].mb_trindex < 0)
3261	{
3262	if (MAX_TRCOUNT <= d->mb_trcount)
3263	{
3264	state_num s3;
3265	for (s3 = -1; s3 < d->tralloc; s3++)
3266	{
3267	free (d->mb_trans[s3]);
3268	d->mb_trans[s3] = NULL;
3269	}
3270
3271	for (state_num i = 0; i < d->sindex; i++)
3272	d->states[i].mb_trindex = -1;
3273	d->mb_trcount = 0;
3274	}
3275	d->states[s1].mb_trindex = d->mb_trcount++;
3276	}
3277
3278	if (! d->mb_trans[s])
3279	{
3280	enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
3281	enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
3282	d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
3283	for (int i = 0; i < MAX_TRCOUNT; i++)
3284	d->mb_trans[s][i] = -1;
3285	}
3286	else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
3287	return d->mb_trans[s][d->states[s1].mb_trindex];
3288
3289	if (s == -1)
3290	copy (&d->states[s1].mbps, &d->mb_follows);
3291	else
3292	merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
3293
3294	int separate_contexts = state_separate_contexts (d, &d->mb_follows);
3295	state_num s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
3296	realloc_trans_if_necessary (d);
3297
3298	d->mb_trans[s][d->states[s1].mb_trindex] = s2;
3299
3300	return s2;
3301	}
3302
3303	/* The initial state may encounter a byte which is not a single byte character
3304	nor the first byte of a multibyte character. But it is incorrect for the
3305	initial state to accept such a byte. For example, in Shift JIS the regular
3306	expression "\\" accepts the codepoint 0x5c, but should not accept the second
3307	byte of the codepoint 0x815c. Then the initial state must skip the bytes
3308	that are not a single byte character nor the first byte of a multibyte
3309	character.
3310
3311	Given DFA state d, use mbs_to_wchar to advance MBP until it reaches
3312	or exceeds P, and return the advanced MBP. If WCP is non-NULL and
3313	the result is greater than P, set *WCP to the final wide character
3314	processed, or to WEOF if no wide character is processed. Otherwise,
3315	if WCP is non-NULL, *WCP may or may not be updated.
3316
3317	Both P and MBP must be no larger than END. */
3318	static unsigned char const *
3319	skip_remains_mb (struct dfa d, unsigned char const p,
3320	unsigned char const mbp, char const end)
3321	{
3322	if (d->syntax.never_trail[*p])
3323	return p;
3324	while (mbp < p)
3325	{
3326	wint_t wc;
3327	mbp += mbs_to_wchar (&wc, (char const *) mbp,
3328	end - (char const *) mbp, d);
3329	}
3330	return mbp;
3331	}
3332
3333	/* Search through a buffer looking for a match to the struct dfa *D.
3334	Find the first occurrence of a string matching the regexp in the
3335	buffer, and the shortest possible version thereof. Return a pointer to
3336	the first character after the match, or NULL if none is found. BEGIN
3337	points to the beginning of the buffer, and END points to the first byte
3338	after its end. Note however that we store a sentinel byte (usually
3339	newline) in *END, so the actual buffer must be one byte longer.
3340	When ALLOW_NL, newlines may appear in the matching string.
3341	If COUNT is non-NULL, increment *COUNT once for each newline processed.
3342	If MULTIBYTE, the input consists of multibyte characters and/or
3343	encoding-error bytes. Otherwise, it consists of single-byte characters.
3344	Here is the list of features that make this DFA matcher punt:
3345	- [M-N] range in non-simple locale: regex is up to 25% faster on [a-z]
3346	- [^...] in non-simple locale
3347	- [[=foo=]] or [[.foo.]]
3348	- [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
3349	- back-reference: (.)\1
3350	- word-delimiter in multibyte locale: \<, \>, \b, \B
3351	See struct localeinfo.simple for the definition of "simple locale". */
3352
3353	static inline char *
3354	dfaexec_main (struct dfa d, char const begin, char *end, bool allow_nl,
3355	ptrdiff_t *count, bool multibyte)
3356	{
3357	if (MAX_TRCOUNT <= d->sindex)
3358	{
3359	for (state_num s = d->min_trcount; s < d->sindex; s++)
3360	{
3361	free (d->states[s].elems.elems);
3362	free (d->states[s].mbps.elems);
3363	}
3364	d->sindex = d->min_trcount;
3365
3366	if (d->trans)
3367	{
3368	for (state_num s = 0; s < d->tralloc; s++)
3369	{
3370	free (d->trans[s]);
3371	free (d->fails[s]);
3372	d->trans[s] = d->fails[s] = NULL;
3373	}
3374	d->trcount = 0;
3375	}
3376
3377	if (d->localeinfo.multibyte && d->mb_trans)
3378	{
3379	for (state_num s = -1; s < d->tralloc; s++)
3380	{
3381	free (d->mb_trans[s]);
3382	d->mb_trans[s] = NULL;
3383	}
3384	for (state_num s = 0; s < d->min_trcount; s++)
3385	d->states[s].mb_trindex = -1;
3386	d->mb_trcount = 0;
3387	}
3388	}
3389
3390	if (!d->tralloc)
3391	realloc_trans_if_necessary (d);
3392
3393	/* Current state. */
3394	state_num s = 0, s1 = 0;
3395
3396	/* Current input character. */
3397	unsigned char const p = (unsigned char const ) begin;
3398	unsigned char const *mbp = p;
3399
3400	/* Copy of d->trans so it can be optimized into a register. */
3401	state_num **trans = d->trans;
3402	unsigned char eol = d->syntax.eolbyte; /* Likewise for eolbyte. */
3403	unsigned char saved_end = (unsigned char ) end;
3404	*end = eol;
3405
3406	if (multibyte)
3407	{
3408	memset (&d->mbs, 0, sizeof d->mbs);
3409	if (d->mb_follows.alloc == 0)
3410	alloc_position_set (&d->mb_follows, d->nleaves);
3411	}
3412
3413	idx_t nlcount = 0;
3414	for (;;)
3415	{
3416	state_num *t;
3417	while ((t = trans[s]) != NULL)
3418	{
3419	if (s < d->min_trcount)
3420	{
3421	if (!multibyte \|\| d->states[s].mbps.nelem == 0)
3422	{
3423	while (t[*p] == s)
3424	p++;
3425	}
3426	if (multibyte)
3427	p = mbp = skip_remains_mb (d, p, mbp, end);
3428	}
3429
3430	if (multibyte)
3431	{
3432	s1 = s;
3433
3434	if (d->states[s].mbps.nelem == 0
3435	\|\| d->localeinfo.sbctowc[p] != WEOF \|\| (char ) p >= end)
3436	{
3437	/* If an input character does not match ANYCHAR, do it
3438	like a single-byte character. */
3439	s = t[*p++];
3440	}
3441	else
3442	{
3443	s = transit_state (d, s, &p, (unsigned char *) end);
3444	mbp = p;
3445	trans = d->trans;
3446	}
3447	}
3448	else
3449	{
3450	s1 = t[*p++];
3451	t = trans[s1];
3452	if (! t)
3453	{
3454	state_num tmp = s;
3455	s = s1;
3456	s1 = tmp; /* swap */
3457	break;
3458	}
3459	if (s < d->min_trcount)
3460	{
3461	while (t[*p] == s1)
3462	p++;
3463	}
3464	s = t[*p++];
3465	}
3466	}
3467
3468	if (s < 0)
3469	{
3470	if (s == -2)
3471	{
3472	s = build_state (s1, d, p[-1]);
3473	trans = d->trans;
3474	}
3475	else if ((char *) p <= end && p[-1] == eol && 0 <= d->newlines[s1])
3476	{
3477	/* The previous character was a newline. Count it, and skip
3478	checking of multibyte character boundary until here. */
3479	nlcount++;
3480	mbp = p;
3481
3482	s = (allow_nl ? d->newlines[s1]
3483	: d->syntax.sbit[eol] == CTX_NEWLINE ? 0
3484	: d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
3485	: d->initstate_notbol);
3486	}
3487	else
3488	{
3489	p = NULL;
3490	goto done;
3491	}
3492	}
3493	else if (d->fails[s])
3494	{
3495	if ((d->success[s] & d->syntax.sbit[*p])
3496	\|\| ((char *) p == end
3497	&& accepts_in_context (d->states[s].context, CTX_NEWLINE, s,
3498	d)))
3499	goto done;
3500
3501	if (multibyte && s < d->min_trcount)
3502	p = mbp = skip_remains_mb (d, p, mbp, end);
3503
3504	s1 = s;
3505	if (!multibyte \|\| d->states[s].mbps.nelem == 0
3506	\|\| d->localeinfo.sbctowc[p] != WEOF \|\| (char ) p >= end)
3507	{
3508	/* If a input character does not match ANYCHAR, do it
3509	like a single-byte character. */
3510	s = d->fails[s][*p++];
3511	}
3512	else
3513	{
3514	s = transit_state (d, s, &p, (unsigned char *) end);
3515	mbp = p;
3516	trans = d->trans;
3517	}
3518	}
3519	else
3520	{
3521	build_state (s, d, p[0]);
3522	trans = d->trans;
3523	}
3524	}
3525
3526	done:
3527	if (count)
3528	*count += nlcount;
3529	*end = saved_end;
3530	return (char *) p;
3531	}
3532
3533	/* Specialized versions of dfaexec for multibyte and single-byte cases.
3534	This is for performance, as dfaexec_main is an inline function. */
3535
3536	static char *
3537	dfaexec_mb (struct dfa d, char const begin, char *end,
3538	bool allow_nl, ptrdiff_t count, bool backref)
3539	{
3540	return dfaexec_main (d, begin, end, allow_nl, count, true);
3541	}
3542
3543	static char *
3544	dfaexec_sb (struct dfa d, char const begin, char *end,
3545	bool allow_nl, ptrdiff_t count, bool backref)
3546	{
3547	return dfaexec_main (d, begin, end, allow_nl, count, false);
3548	}
3549
3550	/* Always set *BACKREF and return BEGIN. Use this wrapper for
3551	any regexp that uses a construct not supported by this code. */
3552	static char *
3553	dfaexec_noop (struct dfa d, char const begin, char *end,
3554	bool allow_nl, ptrdiff_t count, bool backref)
3555	{
3556	*backref = true;
3557	return (char *) begin;
3558	}
3559
3560	/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
3561	but faster and set *BACKREF if the DFA code does not support this
3562	regexp usage. */
3563
3564	char *
3565	dfaexec (struct dfa d, char const begin, char *end,
3566	bool allow_nl, ptrdiff_t count, bool backref)
3567	{
3568	return d->dfaexec (d, begin, end, allow_nl, count, backref);
3569	}
3570
3571	struct dfa *
3572	dfasuperset (struct dfa const *d)
3573	{
3574	return d->superset;
3575	}
3576
3577	bool
3578	dfaisfast (struct dfa const *d)
3579	{
3580	return d->fast;
3581	}
3582
3583	static void
3584	free_mbdata (struct dfa *d)
3585	{
3586	free (d->multibyte_prop);
3587	free (d->lex.brack.chars);
3588	free (d->mb_follows.elems);
3589
3590	if (d->mb_trans)
3591	{
3592	state_num s;
3593	for (s = -1; s < d->tralloc; s++)
3594	free (d->mb_trans[s]);
3595	free (d->mb_trans - 2);
3596	}
3597	}
3598
3599	/* Return true if every construct in D is supported by this DFA matcher. */
3600	bool
3601	dfasupported (struct dfa const *d)
3602	{
3603	for (idx_t i = 0; i < d->tindex; i++)
3604	{
3605	switch (d->tokens[i])
3606	{
3607	case BEGWORD:
3608	case ENDWORD:
3609	case LIMWORD:
3610	case NOTLIMWORD:
3611	if (!d->localeinfo.multibyte)
3612	continue;
3613	FALLTHROUGH;
3614	case BACKREF:
3615	case MBCSET:
3616	return false;
3617	}
3618	}
3619	return true;
3620	}
3621
3622	/* Disable use of the superset DFA if it is not likely to help
3623	performance. */
3624	static void
3625	maybe_disable_superset_dfa (struct dfa *d)
3626	{
3627	if (!d->localeinfo.using_utf8)
3628	return;
3629
3630	bool have_backref = false;
3631	for (idx_t i = 0; i < d->tindex; i++)
3632	{
3633	switch (d->tokens[i])
3634	{
3635	case ANYCHAR:
3636	/* Lowered. */
3637	abort ();
3638	case BACKREF:
3639	have_backref = true;
3640	break;
3641	case MBCSET:
3642	/* Requires multi-byte algorithm. */
3643	return;
3644	default:
3645	break;
3646	}
3647	}
3648
3649	if (!have_backref && d->superset)
3650	{
3651	/* The superset DFA is not likely to be much faster, so remove it. */
3652	dfafree (d->superset);
3653	free (d->superset);
3654	d->superset = NULL;
3655	}
3656
3657	free_mbdata (d);
3658	d->localeinfo.multibyte = false;
3659	d->dfaexec = dfaexec_sb;
3660	d->fast = true;
3661	}
3662
3663	static void
3664	dfassbuild (struct dfa *d)
3665	{
3666	struct dfa *sup = dfaalloc ();
3667
3668	sup = d;
3669	sup->localeinfo.multibyte = false;
3670	sup->dfaexec = dfaexec_sb;
3671	sup->multibyte_prop = NULL;
3672	sup->superset = NULL;
3673	sup->states = NULL;
3674	sup->sindex = 0;
3675	sup->constraints = NULL;
3676	sup->separates = NULL;
3677	sup->follows = NULL;
3678	sup->tralloc = 0;
3679	sup->trans = NULL;
3680	sup->fails = NULL;
3681	sup->success = NULL;
3682	sup->newlines = NULL;
3683
3684	sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
3685	if (d->cindex)
3686	{
3687	memcpy (sup->charclasses, d->charclasses,
3688	d->cindex * sizeof *sup->charclasses);
3689	}
3690
3691	sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
3692	sup->talloc = d->tindex * 2;
3693
3694	bool have_achar = false;
3695	bool have_nchar = false;
3696	idx_t j;
3697	for (idx_t i = j = 0; i < d->tindex; i++)
3698	{
3699	switch (d->tokens[i])
3700	{
3701	case ANYCHAR:
3702	case MBCSET:
3703	case BACKREF:
3704	{
3705	charclass ccl;
3706	fillset (&ccl);
3707	sup->tokens[j++] = CSET + charclass_index (sup, &ccl);
3708	sup->tokens[j++] = STAR;
3709	if (d->tokens[i + 1] == QMARK \|\| d->tokens[i + 1] == STAR
3710	\|\| d->tokens[i + 1] == PLUS)
3711	i++;
3712	have_achar = true;
3713	}
3714	break;
3715	case BEGWORD:
3716	case ENDWORD:
3717	case LIMWORD:
3718	case NOTLIMWORD:
3719	if (d->localeinfo.multibyte)
3720	{
3721	/* These constraints aren't supported in a multibyte locale.
3722	Ignore them in the superset DFA. */
3723	sup->tokens[j++] = EMPTY;
3724	break;
3725	}
3726	FALLTHROUGH;
3727	default:
3728	sup->tokens[j++] = d->tokens[i];
3729	if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
3730	\|\| d->tokens[i] >= CSET)
3731	have_nchar = true;
3732	break;
3733	}
3734	}
3735	sup->tindex = j;
3736
3737	if (have_nchar && (have_achar \|\| d->localeinfo.multibyte))
3738	d->superset = sup;
3739	else
3740	{
3741	dfafree (sup);
3742	free (sup);
3743	}
3744	}
3745
3746	/* Parse a string S of length LEN into D (but skip this step if S is null).
3747	Then analyze D and build a matcher for it.
3748	SEARCHFLAG says whether to build a searching or an exact matcher. */
3749	void
3750	dfacomp (char const s, idx_t len, struct dfa d, bool searchflag)
3751	{
3752	if (s != NULL)
3753	dfaparse (s, len, d);
3754
3755	dfassbuild (d);
3756
3757	if (dfasupported (d))
3758	{
3759	maybe_disable_superset_dfa (d);
3760	dfaanalyze (d, searchflag);
3761	}
3762	else
3763	{
3764	d->dfaexec = dfaexec_noop;
3765	}
3766
3767	if (d->superset)
3768	{
3769	d->fast = true;
3770	dfaanalyze (d->superset, searchflag);
3771	}
3772	}
3773
3774	/* Free the storage held by the components of a dfa. */
3775	void
3776	dfafree (struct dfa *d)
3777	{
3778	free (d->charclasses);
3779	free (d->tokens);
3780
3781	if (d->localeinfo.multibyte)
3782	free_mbdata (d);
3783
3784	free (d->constraints);
3785	free (d->separates);
3786
3787	for (idx_t i = 0; i < d->sindex; i++)
3788	{
3789	free (d->states[i].elems.elems);
3790	free (d->states[i].mbps.elems);
3791	}
3792	free (d->states);
3793
3794	if (d->follows)
3795	{
3796	for (idx_t i = 0; i < d->tindex; i++)
3797	free (d->follows[i].elems);
3798	free (d->follows);
3799	}
3800
3801	if (d->trans)
3802	{
3803	for (idx_t i = 0; i < d->tralloc; i++)
3804	{
3805	free (d->trans[i]);
3806	free (d->fails[i]);
3807	}
3808
3809	free (d->trans - 2);
3810	free (d->fails);
3811	free (d->newlines);
3812	free (d->success);
3813	}
3814
3815	if (d->superset)
3816	{
3817	dfafree (d->superset);
3818	free (d->superset);
3819	}
3820	}
3821
3822	/* Having found the postfix representation of the regular expression,
3823	try to find a long sequence of characters that must appear in any line
3824	containing the r.e.
3825	Finding a "longest" sequence is beyond the scope here;
3826	we take an easy way out and hope for the best.
3827	(Take "(ab\|a)b"--please.)
3828
3829	We do a bottom-up calculation of sequences of characters that must appear
3830	in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3831	representation:
3832	sequences that must appear at the left of the match ("left")
3833	sequences that must appear at the right of the match ("right")
3834	lists of sequences that must appear somewhere in the match ("in")
3835	sequences that must constitute the match ("is")
3836
3837	When we get to the root of the tree, we use one of the longest of its
3838	calculated "in" sequences as our answer.
3839
3840	The sequences calculated for the various types of node (in pseudo ANSI c)
3841	are shown below. "p" is the operand of unary operators (and the left-hand
3842	operand of binary operators); "q" is the right-hand operand of binary
3843	operators.
3844
3845	"ZERO" means "a zero-length sequence" below.
3846
3847	Type left right is in
3848	---- ---- ----- -- --
3849	char c # c # c # c # c
3850
3851	ANYCHAR ZERO ZERO ZERO ZERO
3852
3853	MBCSET ZERO ZERO ZERO ZERO
3854
3855	CSET ZERO ZERO ZERO ZERO
3856
3857	STAR ZERO ZERO ZERO ZERO
3858
3859	QMARK ZERO ZERO ZERO ZERO
3860
3861	PLUS p->left p->right ZERO p->in
3862
3863	CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
3864	p->left : q->right : q->is!=ZERO) ? q->in plus
3865	p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
3866	ZERO
3867
3868	OR longest common longest common (do p->is and substrings common
3869	leading trailing to q->is have same p->in and
3870	(sub)sequence (sub)sequence q->in length and content) ?
3871	of p->left of p->right
3872	and q->left and q->right p->is : NULL
3873
3874	If there's anything else we recognize in the tree, all four sequences get set
3875	to zero-length sequences. If there's something we don't recognize in the
3876	tree, we just return a zero-length sequence.
3877
3878	Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3879	'aaa')?
3880
3881	And ... is it here or someplace that we might ponder "optimizations" such as
3882	egrep 'psi\|epsilon' -> egrep 'psi'
3883	egrep 'pepsi\|epsilon' -> egrep 'epsi'
3884	(Yes, we now find "epsi" as a "string
3885	that must occur", but we might also
3886	simplify the entire r.e. being sought)
3887	grep '[c]' -> grep 'c'
3888	grep '(ab\|a)b' -> grep 'ab'
3889	grep 'ab*' -> grep 'a'
3890	grep 'a*b' -> grep 'b'
3891
3892	There are several issues:
3893
3894	Is optimization easy (enough)?
3895
3896	Does optimization actually accomplish anything,
3897	or is the automaton you get from "psi\|epsilon" (for example)
3898	the same as the one you get from "psi" (for example)?
3899
3900	Are optimizable r.e.'s likely to be used in real-life situations
3901	(something like 'ab*' is probably unlikely; something like is
3902	'psi\|epsilon' is likelier)? */
3903
3904	static char *
3905	icatalloc (char old, char const new)
3906	{
3907	idx_t newsize = strlen (new);
3908	if (newsize == 0)
3909	return old;
3910	idx_t oldsize = strlen (old);
3911	char *result = xirealloc (old, oldsize + newsize + 1);
3912	memcpy (result + oldsize, new, newsize + 1);
3913	return result;
3914	}
3915
3916	static void
3917	freelist (char **cpp)
3918	{
3919	while (*cpp)
3920	free (*cpp++);
3921	}
3922
3923	static char **
3924	enlistnew (char *cpp, char new)
3925	{
3926	/* Is there already something in the list that's new (or longer)? */
3927	idx_t i;
3928	for (i = 0; cpp[i] != NULL; i++)
3929	if (strstr (cpp[i], new) != NULL)
3930	{
3931	free (new);
3932	return cpp;
3933	}
3934	/* Eliminate any obsoleted strings. */
3935	for (idx_t j = 0; cpp[j] != NULL; )
3936	if (strstr (new, cpp[j]) == NULL)
3937	++j;
3938	else
3939	{
3940	free (cpp[j]);
3941	if (--i == j)
3942	break;
3943	cpp[j] = cpp[i];
3944	cpp[i] = NULL;
3945	}
3946	/* Add the new string. */
3947	cpp = xreallocarray (cpp, i + 2, sizeof *cpp);
3948	cpp[i] = new;
3949	cpp[i + 1] = NULL;
3950	return cpp;
3951	}
3952
3953	static char **
3954	enlist (char *cpp, char const str, idx_t len)
3955	{
3956	return enlistnew (cpp, ximemdup0 (str, len));
3957	}
3958
3959	/* Given pointers to two strings, return a pointer to an allocated
3960	list of their distinct common substrings. */
3961	static char **
3962	comsubs (char left, char const right)
3963	{
3964	char *cpp = xzalloc (sizeof cpp);
3965
3966	for (char lcp = left; lcp != '\0'; lcp++)
3967	{
3968	idx_t len = 0;
3969	char rcp = strchr (right, lcp);
3970	while (rcp != NULL)
3971	{
3972	idx_t i;
3973	for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3974	continue;
3975	if (i > len)
3976	len = i;
3977	rcp = strchr (rcp + 1, *lcp);
3978	}
3979	if (len != 0)
3980	cpp = enlist (cpp, lcp, len);
3981	}
3982	return cpp;
3983	}
3984
3985	static char **
3986	addlists (char old, char new)
3987	{
3988	for (; *new; new++)
3989	old = enlistnew (old, xstrdup (*new));
3990	return old;
3991	}
3992
3993	/* Given two lists of substrings, return a new list giving substrings
3994	common to both. */
3995	static char **
3996	inboth (char left, char right)
3997	{
3998	char *both = xzalloc (sizeof both);
3999
4000	for (idx_t lnum = 0; left[lnum] != NULL; lnum++)
4001	{
4002	for (idx_t rnum = 0; right[rnum] != NULL; rnum++)
4003	{
4004	char **temp = comsubs (left[lnum], right[rnum]);
4005	both = addlists (both, temp);
4006	freelist (temp);
4007	free (temp);
4008	}
4009	}
4010	return both;
4011	}
4012
4013	typedef struct must must;
4014
4015	struct must
4016	{
4017	char **in;
4018	char *left;
4019	char *right;
4020	char *is;
4021	bool begline;
4022	bool endline;
4023	must *prev;
4024	};
4025
4026	static must *
4027	allocmust (must *mp, idx_t size)
4028	{
4029	must new_mp = xmalloc (sizeof new_mp);
4030	new_mp->in = xzalloc (sizeof *new_mp->in);
4031	new_mp->left = xizalloc (size);
4032	new_mp->right = xizalloc (size);
4033	new_mp->is = xizalloc (size);
4034	new_mp->begline = false;
4035	new_mp->endline = false;
4036	new_mp->prev = mp;
4037	return new_mp;
4038	}
4039
4040	static void
4041	resetmust (must *mp)
4042	{
4043	freelist (mp->in);
4044	mp->in[0] = NULL;
4045	mp->left[0] = mp->right[0] = mp->is[0] = '\0';
4046	mp->begline = false;
4047	mp->endline = false;
4048	}
4049
4050	static void
4051	freemust (must *mp)
4052	{
4053	freelist (mp->in);
4054	free (mp->in);
4055	free (mp->left);
4056	free (mp->right);
4057	free (mp->is);
4058	free (mp);
4059	}
4060
4061	struct dfamust *
4062	dfamust (struct dfa const *d)
4063	{
4064	must *mp = NULL;
4065	char const *result = "";
4066	bool exact = false;
4067	bool begline = false;
4068	bool endline = false;
4069	bool need_begline = false;
4070	bool need_endline = false;
4071	bool case_fold_unibyte = d->syntax.case_fold & !d->localeinfo.multibyte;
4072
4073	for (idx_t ri = 1; ri + 1 < d->tindex; ri++)
4074	{
4075	token t = d->tokens[ri];
4076	switch (t)
4077	{
4078	case BEGLINE:
4079	mp = allocmust (mp, 2);
4080	mp->begline = true;
4081	need_begline = true;
4082	break;
4083	case ENDLINE:
4084	mp = allocmust (mp, 2);
4085	mp->endline = true;
4086	need_endline = true;
4087	break;
4088	case LPAREN:
4089	case RPAREN:
4090	assert (!"neither LPAREN nor RPAREN may appear here");
4091
4092	case EMPTY:
4093	case BEGWORD:
4094	case ENDWORD:
4095	case LIMWORD:
4096	case NOTLIMWORD:
4097	case BACKREF:
4098	case ANYCHAR:
4099	case MBCSET:
4100	mp = allocmust (mp, 2);
4101	break;
4102
4103	case STAR:
4104	case QMARK:
4105	assume_nonnull (mp);
4106	resetmust (mp);
4107	break;
4108
4109	case OR:
4110	{
4111	char **new;
4112	must *rmp = mp;
4113	assume_nonnull (rmp);
4114	must *lmp = mp = mp->prev;
4115	assume_nonnull (lmp);
4116	idx_t j, ln, rn, n;
4117
4118	/* Guaranteed to be. Unlikely, but ... */
4119	if (streq (lmp->is, rmp->is))
4120	{
4121	lmp->begline &= rmp->begline;
4122	lmp->endline &= rmp->endline;
4123	}
4124	else
4125	{
4126	lmp->is[0] = '\0';
4127	lmp->begline = false;
4128	lmp->endline = false;
4129	}
4130	/* Left side--easy */
4131	idx_t i = 0;
4132	while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
4133	++i;
4134	lmp->left[i] = '\0';
4135	/* Right side */
4136	ln = strlen (lmp->right);
4137	rn = strlen (rmp->right);
4138	n = ln;
4139	if (n > rn)
4140	n = rn;
4141	for (i = 0; i < n; ++i)
4142	if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
4143	break;
4144	for (j = 0; j < i; ++j)
4145	lmp->right[j] = lmp->right[(ln - i) + j];
4146	lmp->right[j] = '\0';
4147	new = inboth (lmp->in, rmp->in);
4148	freelist (lmp->in);
4149	free (lmp->in);
4150	lmp->in = new;
4151	freemust (rmp);
4152	}
4153	break;
4154
4155	case PLUS:
4156	assume_nonnull (mp);
4157	mp->is[0] = '\0';
4158	break;
4159
4160	case END:
4161	assume_nonnull (mp);
4162	assert (!mp->prev);
4163	for (idx_t i = 0; mp->in[i] != NULL; i++)
4164	if (strlen (mp->in[i]) > strlen (result))
4165	result = mp->in[i];
4166	if (streq (result, mp->is))
4167	{
4168	if ((!need_begline \|\| mp->begline) && (!need_endline
4169	\|\| mp->endline))
4170	exact = true;
4171	begline = mp->begline;
4172	endline = mp->endline;
4173	}
4174	goto done;
4175
4176	case CAT:
4177	{
4178	must *rmp = mp;
4179	assume_nonnull (rmp);
4180	must *lmp = mp = mp->prev;
4181	assume_nonnull (lmp);
4182
4183	/* In. Everything in left, plus everything in
4184	right, plus concatenation of
4185	left's right and right's left. */
4186	lmp->in = addlists (lmp->in, rmp->in);
4187	if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
4188	{
4189	idx_t lrlen = strlen (lmp->right);
4190	idx_t rllen = strlen (rmp->left);
4191	char *tp = ximalloc (lrlen + rllen + 1);
4192	memcpy (tp + lrlen, rmp->left, rllen + 1);
4193	memcpy (tp, lmp->right, lrlen);
4194	lmp->in = enlistnew (lmp->in, tp);
4195	}
4196	/* Left-hand */
4197	if (lmp->is[0] != '\0')
4198	lmp->left = icatalloc (lmp->left, rmp->left);
4199	/* Right-hand */
4200	if (rmp->is[0] == '\0')
4201	lmp->right[0] = '\0';
4202	lmp->right = icatalloc (lmp->right, rmp->right);
4203	/* Guaranteed to be */
4204	if ((lmp->is[0] != '\0' \|\| lmp->begline)
4205	&& (rmp->is[0] != '\0' \|\| rmp->endline))
4206	{
4207	lmp->is = icatalloc (lmp->is, rmp->is);
4208	lmp->endline = rmp->endline;
4209	}
4210	else
4211	{
4212	lmp->is[0] = '\0';
4213	lmp->begline = false;
4214	lmp->endline = false;
4215	}
4216	freemust (rmp);
4217	}
4218	break;
4219
4220	case '\0':
4221	/* Not on my shift. */
4222	goto done;
4223
4224	default:
4225	if (CSET <= t)
4226	{
4227	/* If T is a singleton, or if case-folding in a unibyte
4228	locale and T's members all case-fold to the same char,
4229	convert T to one of its members. Otherwise, do
4230	nothing further with T. */
4231	charclass *ccl = &d->charclasses[t - CSET];
4232	int j;
4233	for (j = 0; j < NOTCHAR; j++)
4234	if (tstbit (j, ccl))
4235	break;
4236	if (! (j < NOTCHAR))
4237	{
4238	mp = allocmust (mp, 2);
4239	break;
4240	}
4241	t = j;
4242	while (++j < NOTCHAR)
4243	if (tstbit (j, ccl)
4244	&& ! (case_fold_unibyte
4245	&& toupper (j) == toupper (t)))
4246	break;
4247	if (j < NOTCHAR)
4248	{
4249	mp = allocmust (mp, 2);
4250	break;
4251	}
4252	}
4253
4254	idx_t rj = ri + 2;
4255	if (d->tokens[ri + 1] == CAT)
4256	{
4257	for (; rj < d->tindex - 1; rj += 2)
4258	{
4259	if ((rj != ri && (d->tokens[rj] <= 0
4260	\|\| NOTCHAR <= d->tokens[rj]))
4261	\|\| d->tokens[rj + 1] != CAT)
4262	break;
4263	}
4264	}
4265	mp = allocmust (mp, ((rj - ri) >> 1) + 1);
4266	mp->is[0] = mp->left[0] = mp->right[0]
4267	= case_fold_unibyte ? toupper (t) : t;
4268
4269	idx_t i;
4270	for (i = 1; ri + 2 < rj; i++)
4271	{
4272	ri += 2;
4273	t = d->tokens[ri];
4274	mp->is[i] = mp->left[i] = mp->right[i]
4275	= case_fold_unibyte ? toupper (t) : t;
4276	}
4277	mp->is[i] = mp->left[i] = mp->right[i] = '\0';
4278	mp->in = enlist (mp->in, mp->is, i);
4279	break;
4280	}
4281	}
4282	done:;
4283
4284	struct dfamust *dm = NULL;
4285	if (*result)
4286	{
4287	dm = xmalloc (FLEXSIZEOF (struct dfamust, must, strlen (result) + 1));
4288	dm->exact = exact;
4289	dm->begline = begline;
4290	dm->endline = endline;
4291	strcpy (dm->must, result);
4292	}
4293
4294	while (mp)
4295	{
4296	must *prev = mp->prev;
4297	freemust (mp);
4298	mp = prev;
4299	}
4300
4301	return dm;
4302	}
4303
4304	void
4305	dfamustfree (struct dfamust *dm)
4306	{
4307	free (dm);
4308	}
4309
4310	struct dfa *
4311	dfaalloc (void)
4312	{
4313	return xmalloc (sizeof (struct dfa));
4314	}
4315
4316	/* Initialize DFA. */
4317	void
4318	dfasyntax (struct dfa dfa, struct localeinfo const linfo,
4319	reg_syntax_t bits, int dfaopts)
4320	{
4321	memset (dfa, 0, offsetof (struct dfa, dfaexec));
4322	dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
4323	dfa->localeinfo = *linfo;
4324
4325	dfa->fast = !dfa->localeinfo.multibyte;
4326
4327	dfa->canychar = -1;
4328	dfa->syntax.syntax_bits_set = true;
4329	dfa->syntax.case_fold = (bits & RE_ICASE) != 0;
4330	dfa->syntax.anchor = (dfaopts & DFA_ANCHOR) != 0;
4331	dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
4332	dfa->syntax.syntax_bits = bits;
4333
4334	for (int i = CHAR_MIN; i <= CHAR_MAX; ++i)
4335	{
4336	unsigned char uc = i;
4337
4338	dfa->syntax.sbit[uc] = char_context (dfa, uc);
4339	switch (dfa->syntax.sbit[uc])
4340	{
4341	case CTX_LETTER:
4342	setbit (uc, &dfa->syntax.letters);
4343	break;
4344	case CTX_NEWLINE:
4345	setbit (uc, &dfa->syntax.newline);
4346	break;
4347	}
4348
4349	/* POSIX requires that the five bytes in "\n\r./" (including the
4350	terminating NUL) cannot occur inside a multibyte character. */
4351	dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
4352	? (uc & 0xc0) != 0x80
4353	: strchr ("\n\r./", uc) != NULL);
4354	}
4355	}
4356
4357	/* Initialize TO by copying FROM's syntax settings. */
4358	void
4359	dfacopysyntax (struct dfa to, struct dfa const from)
4360	{
4361	memset (to, 0, offsetof (struct dfa, syntax));
4362	to->canychar = -1;
4363	to->fast = from->fast;
4364	to->syntax = from->syntax;
4365	to->dfaexec = from->dfaexec;
4366	to->localeinfo = from->localeinfo;
4367	}
4368
4369	/* vim:set shiftwidth=2: */

Note: See TracBrowser for help on using the repository browser.

source: kBuild/vendor/grep/current/lib/dfa.c@ 3530

Download in other formats: