dfa.c@ 3576

Last change on this file since 3576 was 2595, checked in by bird, 13 years ago
gnu grep version 2.12 (grep-2.12.tar.xz, md5sum=8d2f0346d08b13c18afb81f0e8aa1e2f)
Property svn:eol-style set to `native`
File size: 124.9 KB

Line
1	/* dfa.c - deterministic extended regexp routines for GNU
2	Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2012 Free Software
3	Foundation, Inc.
4
5	This program is free software; you can redistribute it and/or modify
6	it under the terms of the GNU General Public License as published by
7	the Free Software Foundation; either version 3, or (at your option)
8	any later version.
9
10	This program is distributed in the hope that it will be useful,
11	but WITHOUT ANY WARRANTY; without even the implied warranty of
12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	GNU General Public License for more details.
14
15	You should have received a copy of the GNU General Public License
16	along with this program; if not, write to the Free Software
17	Foundation, Inc.,
18	51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */
19
20	/* Written June, 1988 by Mike Haertel
21	Modified July, 1988 by Arthur David Olson to assist BMG speedups */
22
23	#include <config.h>
24	#include <assert.h>
25	#include <ctype.h>
26	#include <stdio.h>
27	#include <sys/types.h>
28	#include <stdlib.h>
29	#include <limits.h>
30	#include <string.h>
31	#include <locale.h>
32
33	#define STREQ(a, b) (strcmp (a, b) == 0)
34
35	/* ISASCIIDIGIT differs from isdigit, as follows:
36	- Its arg may be any int or unsigned int; it need not be an unsigned char.
37	- It's guaranteed to evaluate its argument exactly once.
38	- It's typically faster.
39	Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
40	only '0' through '9' are digits. Prefer ISASCIIDIGIT to isdigit unless
41	it's important to use the locale's definition of `digit' even when the
42	host does not conform to Posix. */
43	#define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
44
45	/* gettext.h ensures that we don't use gettext if ENABLE_NLS is not defined */
46	#include "gettext.h"
47	#define _(str) gettext (str)
48
49	#include "mbsupport.h" /* defines MBS_SUPPORT if appropriate */
50	#include <wchar.h>
51	#include <wctype.h>
52
53	#if HAVE_LANGINFO_CODESET
54	# include <langinfo.h>
55	#endif
56
57	#include "regex.h"
58	#include "dfa.h"
59	#include "hard-locale.h"
60	#include "xalloc.h"
61
62	/* HPUX, define those as macros in sys/param.h */
63	#ifdef setbit
64	# undef setbit
65	#endif
66	#ifdef clrbit
67	# undef clrbit
68	#endif
69
70	/* Number of bits in an unsigned char. */
71	#ifndef CHARBITS
72	# define CHARBITS 8
73	#endif
74
75	/* First integer value that is greater than any character code. */
76	#define NOTCHAR (1 << CHARBITS)
77
78	/* INTBITS need not be exact, just a lower bound. */
79	#ifndef INTBITS
80	# define INTBITS (CHARBITS * sizeof (int))
81	#endif
82
83	/* Number of ints required to hold a bit for every character. */
84	#define CHARCLASS_INTS ((NOTCHAR + INTBITS - 1) / INTBITS)
85
86	/* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
87	typedef int charclass[CHARCLASS_INTS];
88
89	/* Convert a possibly-signed character to an unsigned character. This is
90	a bit safer than casting to unsigned char, since it catches some type
91	errors that the cast doesn't. */
92	static inline unsigned char
93	to_uchar (char ch)
94	{
95	return ch;
96	}
97
98	/* Contexts tell us whether a character is a newline or a word constituent.
99	Word-constituent characters are those that satisfy iswalnum(), plus '_'.
100	Each character has a single CTX_* value; bitmasks of CTX_* values denote
101	a particular character class.
102
103	A state also stores a context value, which is a bitmask of CTX_* values.
104	A state's context represents a set of characters that the state's
105	predecessors must match. For example, a state whose context does not
106	include CTX_LETTER will never have transitions where the previous
107	character is a word constituent. A state whose context is CTX_ANY
108	might have transitions from any character. */
109
110	#define CTX_NONE 1
111	#define CTX_LETTER 2
112	#define CTX_NEWLINE 4
113	#define CTX_ANY 7
114
115	/* Sometimes characters can only be matched depending on the surrounding
116	context. Such context decisions depend on what the previous character
117	was, and the value of the current (lookahead) character. Context
118	dependent constraints are encoded as 8 bit integers. Each bit that
119	is set indicates that the constraint succeeds in the corresponding
120	context.
121
122	bit 8-11 - valid contexts when next character is CTX_NEWLINE
123	bit 4-7 - valid contexts when next character is CTX_LETTER
124	bit 0-3 - valid contexts when next character is CTX_NONE
125
126	The macro SUCCEEDS_IN_CONTEXT determines whether a given constraint
127	succeeds in a particular context. Prev is a bitmask of possible
128	context values for the previous character, curr is the (single-bit)
129	context value for the lookahead character. */
130	#define NEWLINE_CONSTRAINT(constraint) (((constraint) >> 8) & 0xf)
131	#define LETTER_CONSTRAINT(constraint) (((constraint) >> 4) & 0xf)
132	#define OTHER_CONSTRAINT(constraint) ((constraint) & 0xf)
133
134	#define SUCCEEDS_IN_CONTEXT(constraint, prev, curr) \
135	((((curr) & CTX_NONE ? OTHER_CONSTRAINT(constraint) : 0) \
136	\| ((curr) & CTX_LETTER ? LETTER_CONSTRAINT(constraint) : 0) \
137	\| ((curr) & CTX_NEWLINE ? NEWLINE_CONSTRAINT(constraint) : 0)) & (prev))
138
139	/* The following macros give information about what a constraint depends on. */
140	#define PREV_NEWLINE_CONSTRAINT(constraint) (((constraint) >> 2) & 0x111)
141	#define PREV_LETTER_CONSTRAINT(constraint) (((constraint) >> 1) & 0x111)
142	#define PREV_OTHER_CONSTRAINT(constraint) ((constraint) & 0x111)
143
144	#define PREV_NEWLINE_DEPENDENT(constraint) \
145	(PREV_NEWLINE_CONSTRAINT (constraint) != PREV_OTHER_CONSTRAINT (constraint))
146	#define PREV_LETTER_DEPENDENT(constraint) \
147	(PREV_LETTER_CONSTRAINT (constraint) != PREV_OTHER_CONSTRAINT (constraint))
148
149	/* Tokens that match the empty string subject to some constraint actually
150	work by applying that constraint to determine what may follow them,
151	taking into account what has gone before. The following values are
152	the constraints corresponding to the special tokens previously defined. */
153	#define NO_CONSTRAINT 0x777
154	#define BEGLINE_CONSTRAINT 0x444
155	#define ENDLINE_CONSTRAINT 0x700
156	#define BEGWORD_CONSTRAINT 0x050
157	#define ENDWORD_CONSTRAINT 0x202
158	#define LIMWORD_CONSTRAINT 0x252
159	#define NOTLIMWORD_CONSTRAINT 0x525
160
161	/* The regexp is parsed into an array of tokens in postfix form. Some tokens
162	are operators and others are terminal symbols. Most (but not all) of these
163	codes are returned by the lexical analyzer. */
164
165	typedef ptrdiff_t token;
166
167	/* Predefined token values. */
168	enum
169	{
170	END = -1, /* END is a terminal symbol that matches the
171	end of input; any value of END or less in
172	the parse tree is such a symbol. Accepting
173	states of the DFA are those that would have
174	a transition on END. */
175
176	/* Ordinary character values are terminal symbols that match themselves. */
177
178	EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches
179	the empty string. */
180
181	BACKREF, /* BACKREF is generated by \<digit>; it
182	is not completely handled. If the scanner
183	detects a transition on backref, it returns
184	a kind of "semi-success" indicating that
185	the match will have to be verified with
186	a backtracking matcher. */
187
188	BEGLINE, /* BEGLINE is a terminal symbol that matches
189	the empty string if it is at the beginning
190	of a line. */
191
192	ENDLINE, /* ENDLINE is a terminal symbol that matches
193	the empty string if it is at the end of
194	a line. */
195
196	BEGWORD, /* BEGWORD is a terminal symbol that matches
197	the empty string if it is at the beginning
198	of a word. */
199
200	ENDWORD, /* ENDWORD is a terminal symbol that matches
201	the empty string if it is at the end of
202	a word. */
203
204	LIMWORD, /* LIMWORD is a terminal symbol that matches
205	the empty string if it is at the beginning
206	or the end of a word. */
207
208	NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that
209	matches the empty string if it is not at
210	the beginning or end of a word. */
211
212	QMARK, /* QMARK is an operator of one argument that
213	matches zero or one occurrences of its
214	argument. */
215
216	STAR, /* STAR is an operator of one argument that
217	matches the Kleene closure (zero or more
218	occurrences) of its argument. */
219
220	PLUS, /* PLUS is an operator of one argument that
221	matches the positive closure (one or more
222	occurrences) of its argument. */
223
224	REPMN, /* REPMN is a lexical token corresponding
225	to the {m,n} construct. REPMN never
226	appears in the compiled token vector. */
227
228	CAT, /* CAT is an operator of two arguments that
229	matches the concatenation of its
230	arguments. CAT is never returned by the
231	lexical analyzer. */
232
233	OR, /* OR is an operator of two arguments that
234	matches either of its arguments. */
235
236	LPAREN, /* LPAREN never appears in the parse tree,
237	it is only a lexeme. */
238
239	RPAREN, /* RPAREN never appears in the parse tree. */
240
241	ANYCHAR, /* ANYCHAR is a terminal symbol that matches
242	any multibyte (or single byte) characters.
243	It is used only if MB_CUR_MAX > 1. */
244
245	MBCSET, /* MBCSET is similar to CSET, but for
246	multibyte characters. */
247
248	WCHAR, /* Only returned by lex. wctok contains
249	the wide character representation. */
250
251	CSET /* CSET and (and any value greater) is a
252	terminal symbol that matches any of a
253	class of characters. */
254	};
255
256
257	/* States of the recognizer correspond to sets of positions in the parse
258	tree, together with the constraints under which they may be matched.
259	So a position is encoded as an index into the parse tree together with
260	a constraint. */
261	typedef struct
262	{
263	size_t index; /* Index into the parse array. */
264	unsigned int constraint; /* Constraint for matching this position. */
265	} position;
266
267	/* Sets of positions are stored as arrays. */
268	typedef struct
269	{
270	position elems; / Elements of this position set. */
271	size_t nelem; /* Number of elements in this set. */
272	size_t alloc; /* Number of elements allocated in ELEMS. */
273	} position_set;
274
275	/* Sets of leaves are also stored as arrays. */
276	typedef struct
277	{
278	size_t elems; / Elements of this position set. */
279	size_t nelem; /* Number of elements in this set. */
280	} leaf_set;
281
282	/* A state of the dfa consists of a set of positions, some flags,
283	and the token value of the lowest-numbered position of the state that
284	contains an END token. */
285	typedef struct
286	{
287	size_t hash; /* Hash of the positions of this state. */
288	position_set elems; /* Positions this state could match. */
289	unsigned char context; /* Context from previous state. */
290	char backref; /* True if this state matches a \<digit>. */
291	unsigned short constraint; /* Constraint for this state to accept. */
292	token first_end; /* Token value of the first END in elems. */
293	position_set mbps; /* Positions which can match multibyte
294	characters. e.g. period.
295	These staff are used only if
296	MB_CUR_MAX > 1. */
297	} dfa_state;
298
299	/* States are indexed by state_num values. These are normally
300	nonnegative but -1 is used as a special value. */
301	typedef ptrdiff_t state_num;
302
303	/* A bracket operator.
304	e.g. [a-c], [[:alpha:]], etc. */
305	struct mb_char_classes
306	{
307	ptrdiff_t cset;
308	int invert;
309	wchar_t chars; / Normal characters. */
310	size_t nchars;
311	wctype_t ch_classes; / Character classes. */
312	size_t nch_classes;
313	wchar_t range_sts; / Range characters (start of the range). */
314	wchar_t range_ends; / Range characters (end of the range). */
315	size_t nranges;
316	char *equivs; / Equivalence classes. */
317	size_t nequivs;
318	char **coll_elems;
319	size_t ncoll_elems; /* Collating elements. */
320	};
321
322	/* A compiled regular expression. */
323	struct dfa
324	{
325	/* Fields filled by the scanner. */
326	charclass charclasses; / Array of character sets for CSET tokens. */
327	size_t cindex; /* Index for adding new charclasses. */
328	size_t calloc; /* Number of charclasses currently allocated. */
329
330	/* Fields filled by the parser. */
331	token tokens; / Postfix parse array. */
332	size_t tindex; /* Index for adding new tokens. */
333	size_t talloc; /* Number of tokens currently allocated. */
334	size_t depth; /* Depth required of an evaluation stack
335	used for depth-first traversal of the
336	parse tree. */
337	size_t nleaves; /* Number of leaves on the parse tree. */
338	size_t nregexps; /* Count of parallel regexps being built
339	with dfaparse(). */
340	unsigned int mb_cur_max; /* Cached value of MB_CUR_MAX. */
341	token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */
342
343	/* The following are used only if MB_CUR_MAX > 1. */
344
345	/* The value of multibyte_prop[i] is defined by following rule.
346	if tokens[i] < NOTCHAR
347	bit 0 : tokens[i] is the first byte of a character, including
348	single-byte characters.
349	bit 1 : tokens[i] is the last byte of a character, including
350	single-byte characters.
351
352	if tokens[i] = MBCSET
353	("the index of mbcsets corresponding to this operator" << 2) + 3
354
355	e.g.
356	tokens
357	= 'single_byte_a', 'multi_byte_A', single_byte_b'
358	= 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
359	multibyte_prop
360	= 3 , 1 , 0 , 2 , 3
361	*/
362	size_t nmultibyte_prop;
363	int *multibyte_prop;
364
365	/* Array of the bracket expression in the DFA. */
366	struct mb_char_classes *mbcsets;
367	size_t nmbcsets;
368	size_t mbcsets_alloc;
369
370	/* Fields filled by the state builder. */
371	dfa_state states; / States of the dfa. */
372	state_num sindex; /* Index for adding new states. */
373	state_num salloc; /* Number of states currently allocated. */
374
375	/* Fields filled by the parse tree->NFA conversion. */
376	position_set follows; / Array of follow sets, indexed by position
377	index. The follow of a position is the set
378	of positions containing characters that
379	could conceivably follow a character
380	matching the given position in a string
381	matching the regexp. Allocated to the
382	maximum possible position index. */
383	int searchflag; /* True if we are supposed to build a searching
384	as opposed to an exact matcher. A searching
385	matcher finds the first and shortest string
386	matching a regexp anywhere in the buffer,
387	whereas an exact matcher finds the longest
388	string matching, but anchored to the
389	beginning of the buffer. */
390
391	/* Fields filled by dfaexec. */
392	state_num tralloc; /* Number of transition tables that have
393	slots so far. */
394	int trcount; /* Number of transition tables that have
395	actually been built. */
396	state_num *trans; / Transition tables for states that can
397	never accept. If the transitions for a
398	state have not yet been computed, or the
399	state could possibly accept, its entry in
400	this table is NULL. */
401	state_num *realtrans; / Trans always points to realtrans + 1; this
402	is so trans[-1] can contain NULL. */
403	state_num *fails; / Transition tables after failing to accept
404	on a state that potentially could do so. */
405	int success; / Table of acceptance conditions used in
406	dfaexec and computed in build_state. */
407	state_num newlines; / Transitions on newlines. The entry for a
408	newline in any transition table is always
409	-1 so we can count lines without wasting
410	too many cycles. The transition for a
411	newline is stored separately and handled
412	as a special case. Newline is also used
413	as a sentinel at the end of the buffer. */
414	struct dfamust musts; / List of strings, at least one of which
415	is known to appear in any r.e. matching
416	the dfa. */
417	};
418
419	/* Some macros for user access to dfa internals. */
420
421	/* ACCEPTING returns true if s could possibly be an accepting state of r. */
422	#define ACCEPTING(s, r) ((r).states[s].constraint)
423
424	/* ACCEPTS_IN_CONTEXT returns true if the given state accepts in the
425	specified context. */
426	#define ACCEPTS_IN_CONTEXT(prev, curr, state, dfa) \
427	SUCCEEDS_IN_CONTEXT ((dfa).states[state].constraint, prev, curr)
428
429	static void dfamust (struct dfa *dfa);
430	static void regexp (void);
431
432	/* These two macros are identical to the ones in gnulib's xalloc.h,
433	except that they not to case the result to "(t *)", and thus may
434	be used via type-free CALLOC and MALLOC macros. */
435	#undef XNMALLOC
436	#undef XCALLOC
437
438	/* Allocate memory for N elements of type T, with error checking. */
439	/* extern t XNMALLOC (size_t n, typename t); /
440	# define XNMALLOC(n, t) \
441	(sizeof (t) == 1 ? xmalloc (n) : xnmalloc (n, sizeof (t)))
442
443	/* Allocate memory for N elements of type T, with error checking,
444	and zero it. */
445	/* extern t XCALLOC (size_t n, typename t); /
446	# define XCALLOC(n, t) \
447	(sizeof (t) == 1 ? xzalloc (n) : xcalloc (n, sizeof (t)))
448
449	#define CALLOC(p, n) do { (p) = XCALLOC (n, *(p)); } while (0)
450	#define MALLOC(p, n) do { (p) = XNMALLOC (n, *(p)); } while (0)
451	#define REALLOC(p, n) do {(p) = xnrealloc (p, n, sizeof (*(p))); } while (0)
452
453	/* Reallocate an array of type P if N_ALLOC is <= N_REQUIRED. /
454	#define REALLOC_IF_NECESSARY(p, n_alloc, n_required) \
455	do \
456	{ \
457	if ((n_alloc) <= (n_required)) \
458	{ \
459	size_t new_n_alloc = (n_required) + !(p); \
460	(p) = x2nrealloc (p, &new_n_alloc, sizeof (*(p))); \
461	(n_alloc) = new_n_alloc; \
462	} \
463	} \
464	while (false)
465
466
467	#ifdef DEBUG
468
469	static void
470	prtok (token t)
471	{
472	char const *s;
473
474	if (t < 0)
475	fprintf (stderr, "END");
476	else if (t < NOTCHAR)
477	{
478	int ch = t;
479	fprintf (stderr, "%c", ch);
480	}
481	else
482	{
483	switch (t)
484	{
485	case EMPTY:
486	s = "EMPTY";
487	break;
488	case BACKREF:
489	s = "BACKREF";
490	break;
491	case BEGLINE:
492	s = "BEGLINE";
493	break;
494	case ENDLINE:
495	s = "ENDLINE";
496	break;
497	case BEGWORD:
498	s = "BEGWORD";
499	break;
500	case ENDWORD:
501	s = "ENDWORD";
502	break;
503	case LIMWORD:
504	s = "LIMWORD";
505	break;
506	case NOTLIMWORD:
507	s = "NOTLIMWORD";
508	break;
509	case QMARK:
510	s = "QMARK";
511	break;
512	case STAR:
513	s = "STAR";
514	break;
515	case PLUS:
516	s = "PLUS";
517	break;
518	case CAT:
519	s = "CAT";
520	break;
521	case OR:
522	s = "OR";
523	break;
524	case LPAREN:
525	s = "LPAREN";
526	break;
527	case RPAREN:
528	s = "RPAREN";
529	break;
530	case ANYCHAR:
531	s = "ANYCHAR";
532	break;
533	case MBCSET:
534	s = "MBCSET";
535	break;
536	default:
537	s = "CSET";
538	break;
539	}
540	fprintf (stderr, "%s", s);
541	}
542	}
543	#endif /* DEBUG */
544
545	/* Stuff pertaining to charclasses. */
546
547	static int
548	tstbit (unsigned int b, charclass const c)
549	{
550	return c[b / INTBITS] & 1 << b % INTBITS;
551	}
552
553	static void
554	setbit (unsigned int b, charclass c)
555	{
556	c[b / INTBITS] \|= 1 << b % INTBITS;
557	}
558
559	static void
560	clrbit (unsigned int b, charclass c)
561	{
562	c[b / INTBITS] &= ~(1 << b % INTBITS);
563	}
564
565	static void
566	copyset (charclass const src, charclass dst)
567	{
568	memcpy (dst, src, sizeof (charclass));
569	}
570
571	static void
572	zeroset (charclass s)
573	{
574	memset (s, 0, sizeof (charclass));
575	}
576
577	static void
578	notset (charclass s)
579	{
580	int i;
581
582	for (i = 0; i < CHARCLASS_INTS; ++i)
583	s[i] = ~s[i];
584	}
585
586	static int
587	equal (charclass const s1, charclass const s2)
588	{
589	return memcmp (s1, s2, sizeof (charclass)) == 0;
590	}
591
592	/* A pointer to the current dfa is kept here during parsing. */
593	static struct dfa *dfa;
594
595	/* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
596	static size_t
597	charclass_index (charclass const s)
598	{
599	size_t i;
600
601	for (i = 0; i < dfa->cindex; ++i)
602	if (equal (s, dfa->charclasses[i]))
603	return i;
604	REALLOC_IF_NECESSARY (dfa->charclasses, dfa->calloc, dfa->cindex + 1);
605	++dfa->cindex;
606	copyset (s, dfa->charclasses[i]);
607	return i;
608	}
609
610	/* Syntax bits controlling the behavior of the lexical analyzer. */
611	static reg_syntax_t syntax_bits, syntax_bits_set;
612
613	/* Flag for case-folding letters into sets. */
614	static int case_fold;
615
616	/* End-of-line byte in data. */
617	static unsigned char eolbyte;
618
619	/* Cache of char-context values. */
620	static int sbit[NOTCHAR];
621
622	/* Set of characters considered letters. */
623	static charclass letters;
624
625	/* Set of characters that are newline. */
626	static charclass newline;
627
628	/* Add this to the test for whether a byte is word-constituent, since on
629	BSD-based systems, many values in the 128..255 range are classified as
630	alphabetic, while on glibc-based systems, they are not. */
631	#ifdef __GLIBC__
632	# define is_valid_unibyte_character(c) 1
633	#else
634	# define is_valid_unibyte_character(c) (! (MBS_SUPPORT && btowc (c) == WEOF))
635	#endif
636
637	/* Return non-zero if C is a 'word-constituent' byte; zero otherwise. */
638	#define IS_WORD_CONSTITUENT(C) \
639	(is_valid_unibyte_character (C) && (isalnum (C) \|\| (C) == '_'))
640
641	static int
642	char_context (unsigned char c)
643	{
644	if (c == eolbyte \|\| c == 0)
645	return CTX_NEWLINE;
646	if (IS_WORD_CONSTITUENT (c))
647	return CTX_LETTER;
648	return CTX_NONE;
649	}
650
651	static int
652	wchar_context (wint_t wc)
653	{
654	if (wc == (wchar_t) eolbyte \|\| wc == 0)
655	return CTX_NEWLINE;
656	if (wc == L'_' \|\| iswalnum (wc))
657	return CTX_LETTER;
658	return CTX_NONE;
659	}
660
661	/* Entry point to set syntax options. */
662	void
663	dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
664	{
665	unsigned int i;
666
667	syntax_bits_set = 1;
668	syntax_bits = bits;
669	case_fold = fold;
670	eolbyte = eol;
671
672	for (i = 0; i < NOTCHAR; ++i)
673	{
674	sbit[i] = char_context (i);
675	switch (sbit[i])
676	{
677	case CTX_LETTER:
678	setbit (i, letters);
679	break;
680	case CTX_NEWLINE:
681	setbit (i, newline);
682	break;
683	}
684	}
685	}
686
687	/* Set a bit in the charclass for the given wchar_t. Do nothing if WC
688	is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1,
689	this may happen when folding case in weird Turkish locales where
690	dotless i/dotted I are not included in the chosen character set.
691	Return whether a bit was set in the charclass. */
692	#if MBS_SUPPORT
693	static bool
694	setbit_wc (wint_t wc, charclass c)
695	{
696	int b = wctob (wc);
697	if (b == EOF)
698	return false;
699
700	setbit (b, c);
701	return true;
702	}
703
704	/* Set a bit in the charclass for the given single byte character,
705	if it is valid in the current character set. */
706	static void
707	setbit_c (int b, charclass c)
708	{
709	/* Do nothing if b is invalid in this character set. */
710	if (MB_CUR_MAX > 1 && btowc (b) == WEOF)
711	return;
712	setbit (b, c);
713	}
714	#else
715	# define setbit_c setbit
716	static inline bool
717	setbit_wc (wint_t wc, charclass c)
718	{
719	abort ();
720	/NOTREACHED/ return false;
721	}
722	#endif
723
724	/* Like setbit_c, but if case is folded, set both cases of a letter. For
725	MB_CUR_MAX > 1, the resulting charset is only used as an optimization,
726	and the caller takes care of setting the appropriate field of struct
727	mb_char_classes. */
728	static void
729	setbit_case_fold_c (int b, charclass c)
730	{
731	if (MB_CUR_MAX > 1)
732	{
733	wint_t wc = btowc (b);
734	if (wc == WEOF)
735	return;
736	setbit (b, c);
737	if (case_fold && iswalpha (wc))
738	setbit_wc (iswupper (wc) ? towlower (wc) : towupper (wc), c);
739	}
740	else
741	{
742	setbit (b, c);
743	if (case_fold && isalpha (b))
744	setbit_c (isupper (b) ? tolower (b) : toupper (b), c);
745	}
746	}
747
748
749
750	/* UTF-8 encoding allows some optimizations that we can't otherwise
751	assume in a multibyte encoding. */
752	static inline int
753	using_utf8 (void)
754	{
755	static int utf8 = -1;
756	if (utf8 == -1)
757	{
758	#if defined HAVE_LANGINFO_CODESET && MBS_SUPPORT
759	utf8 = (STREQ (nl_langinfo (CODESET), "UTF-8"));
760	#else
761	utf8 = 0;
762	#endif
763	}
764
765	return utf8;
766	}
767
768	/* Lexical analyzer. All the dross that deals with the obnoxious
769	GNU Regex syntax bits is located here. The poor, suffering
770	reader is referred to the GNU Regex documentation for the
771	meaning of the @#%!@#%^!@ syntax bits. */
772
773	static char const lexptr; / Pointer to next input character. */
774	static size_t lexleft; /* Number of characters remaining. */
775	static token lasttok; /* Previous token returned; initially END. */
776	static int laststart; /* True if we're separated from beginning or (, \|
777	only by zero-width characters. */
778	static size_t parens; /* Count of outstanding left parens. */
779	static int minrep, maxrep; /* Repeat counts for {m,n}. */
780	static int hard_LC_COLLATE; /* Nonzero if LC_COLLATE is hard. */
781
782	static int cur_mb_len = 1; /* Length of the multibyte representation of
783	wctok. */
784	/* These variables are used only if (MB_CUR_MAX > 1). */
785	static mbstate_t mbs; /* Mbstate for mbrlen(). */
786	static wchar_t wctok; /* Wide character representation of the current
787	multibyte character. */
788	static unsigned char mblen_buf; / Correspond to the input buffer in dfaexec().
789	Each element store the amount of remain
790	byte of corresponding multibyte character
791	in the input string. A element's value
792	is 0 if corresponding character is a
793	single byte character.
794	e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
795	mblen_buf : 0, 3, 2, 1
796	*/
797	static wchar_t inputwcs; / Wide character representation of input
798	string in dfaexec().
799	The length of this array is same as
800	the length of input string(char array).
801	inputstring[i] is a single-byte char,
802	or 1st byte of a multibyte char.
803	And inputwcs[i] is the codepoint. */
804	static unsigned char const buf_begin; / reference to begin in dfaexec(). */
805	static unsigned char const buf_end; / reference to end in dfaexec(). */
806
807
808	#if MBS_SUPPORT
809	/* Note that characters become unsigned here. */
810	# define FETCH_WC(c, wc, eoferr) \
811	do { \
812	if (! lexleft) \
813	{ \
814	if ((eoferr) != 0) \
815	dfaerror (eoferr); \
816	else \
817	return lasttok = END; \
818	} \
819	else \
820	{ \
821	wchar_t _wc; \
822	cur_mb_len = mbrtowc (&_wc, lexptr, lexleft, &mbs); \
823	if (cur_mb_len <= 0) \
824	{ \
825	cur_mb_len = 1; \
826	--lexleft; \
827	(wc) = (c) = to_uchar (*lexptr++); \
828	} \
829	else \
830	{ \
831	lexptr += cur_mb_len; \
832	lexleft -= cur_mb_len; \
833	(wc) = _wc; \
834	(c) = wctob (wc); \
835	} \
836	} \
837	} while(0)
838
839	# define FETCH(c, eoferr) \
840	do { \
841	wint_t wc; \
842	FETCH_WC (c, wc, eoferr); \
843	} while (0)
844
845	#else
846	/* Note that characters become unsigned here. */
847	# define FETCH(c, eoferr) \
848	do { \
849	if (! lexleft) \
850	{ \
851	if ((eoferr) != 0) \
852	dfaerror (eoferr); \
853	else \
854	return lasttok = END; \
855	} \
856	(c) = to_uchar (*lexptr++); \
857	--lexleft; \
858	} while(0)
859
860	# define FETCH_WC(c, unused, eoferr) FETCH (c, eoferr)
861
862	#endif /* MBS_SUPPORT */
863
864	#ifndef MIN
865	# define MIN(a,b) ((a) < (b) ? (a) : (b))
866	#endif
867
868	typedef int predicate (int);
869
870	/* The following list maps the names of the Posix named character classes
871	to predicate functions that determine whether a given character is in
872	the class. The leading [ has already been eaten by the lexical analyzer. */
873	struct dfa_ctype
874	{
875	const char *name;
876	predicate *func;
877	bool single_byte_only;
878	};
879
880	static const struct dfa_ctype prednames[] = {
881	{"alpha", isalpha, false},
882	{"upper", isupper, false},
883	{"lower", islower, false},
884	{"digit", isdigit, true},
885	{"xdigit", isxdigit, true},
886	{"space", isspace, false},
887	{"punct", ispunct, false},
888	{"alnum", isalnum, false},
889	{"print", isprint, false},
890	{"graph", isgraph, false},
891	{"cntrl", iscntrl, false},
892	{"blank", isblank, false},
893	{NULL, NULL, false}
894	};
895
896	static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
897	find_pred (const char *str)
898	{
899	unsigned int i;
900	for (i = 0; prednames[i].name; ++i)
901	if (STREQ (str, prednames[i].name))
902	break;
903
904	return &prednames[i];
905	}
906
907	/* Multibyte character handling sub-routine for lex.
908	This function parse a bracket expression and build a struct
909	mb_char_classes. */
910	static token
911	parse_bracket_exp (void)
912	{
913	int invert;
914	int c, c1, c2;
915	charclass ccl;
916
917	/* Used to warn about [:space:].
918	Bit 0 = first character is a colon.
919	Bit 1 = last character is a colon.
920	Bit 2 = includes any other character but a colon.
921	Bit 3 = includes ranges, char/equiv classes or collation elements. */
922	int colon_warning_state;
923
924	wint_t wc;
925	wint_t wc2;
926	wint_t wc1 = 0;
927
928	/* Work area to build a mb_char_classes. */
929	struct mb_char_classes *work_mbc;
930	size_t chars_al, range_sts_al, range_ends_al, ch_classes_al,
931	equivs_al, coll_elems_al;
932
933	chars_al = 0;
934	range_sts_al = range_ends_al = 0;
935	ch_classes_al = equivs_al = coll_elems_al = 0;
936	if (MB_CUR_MAX > 1)
937	{
938	REALLOC_IF_NECESSARY (dfa->mbcsets, dfa->mbcsets_alloc,
939	dfa->nmbcsets + 1);
940
941	/* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
942	We will update dfa->multibyte_prop[] in addtok(), because we can't
943	decide the index in dfa->tokens[]. */
944
945	/* Initialize work area. */
946	work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]);
947	memset (work_mbc, 0, sizeof *work_mbc);
948	}
949	else
950	work_mbc = NULL;
951
952	memset (ccl, 0, sizeof ccl);
953	FETCH_WC (c, wc, _("unbalanced ["));
954	if (c == '^')
955	{
956	FETCH_WC (c, wc, _("unbalanced ["));
957	invert = 1;
958	}
959	else
960	invert = 0;
961
962	colon_warning_state = (c == ':');
963	do
964	{
965	c1 = EOF; /* mark c1 is not initialized". */
966	colon_warning_state &= ~2;
967
968	/* Note that if we're looking at some other [:...:] construct,
969	we just treat it as a bunch of ordinary characters. We can do
970	this because we assume regex has checked for syntax errors before
971	dfa is ever called. */
972	if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
973	{
974	#define BRACKET_BUFFER_SIZE 128
975	char str[BRACKET_BUFFER_SIZE];
976	FETCH_WC (c1, wc1, _("unbalanced ["));
977
978	/* If pattern contains `[[:', `[[.', or `[[='. */
979	if (c1 == ':'
980	/* TODO: handle `[[.' and `[[=' also for MB_CUR_MAX == 1. */
981	\|\| (MB_CUR_MAX > 1 && (c1 == '.' \|\| c1 == '=')))
982	{
983	size_t len = 0;
984	for (;;)
985	{
986	FETCH_WC (c, wc, _("unbalanced ["));
987	if ((c == c1 && *lexptr == ']') \|\| lexleft == 0)
988	break;
989	if (len < BRACKET_BUFFER_SIZE)
990	str[len++] = c;
991	else
992	/* This is in any case an invalid class name. */
993	str[0] = '\0';
994	}
995	str[len] = '\0';
996
997	/* Fetch bracket. */
998	FETCH_WC (c, wc, _("unbalanced ["));
999	if (c1 == ':')
1000	/* build character class. */
1001	{
1002	char const *class
1003	= (case_fold && (STREQ (str, "upper")
1004	\|\| STREQ (str, "lower")) ? "alpha" : str);
1005	const struct dfa_ctype *pred = find_pred (class);
1006	if (!pred)
1007	dfaerror (_("invalid character class"));
1008
1009	if (MB_CUR_MAX > 1 && !pred->single_byte_only)
1010	{
1011	/* Store the character class as wctype_t. */
1012	wctype_t wt = wctype (class);
1013
1014	REALLOC_IF_NECESSARY (work_mbc->ch_classes,
1015	ch_classes_al,
1016	work_mbc->nch_classes + 1);
1017	work_mbc->ch_classes[work_mbc->nch_classes++] = wt;
1018	}
1019
1020	for (c2 = 0; c2 < NOTCHAR; ++c2)
1021	if (pred->func (c2))
1022	setbit_case_fold_c (c2, ccl);
1023	}
1024
1025	else if (MBS_SUPPORT && (c1 == '=' \|\| c1 == '.'))
1026	{
1027	char *elem = xmemdup (str, len + 1);
1028
1029	if (c1 == '=')
1030	/* build equivalence class. */
1031	{
1032	REALLOC_IF_NECESSARY (work_mbc->equivs,
1033	equivs_al, work_mbc->nequivs + 1);
1034	work_mbc->equivs[work_mbc->nequivs++] = elem;
1035	}
1036
1037	if (c1 == '.')
1038	/* build collating element. */
1039	{
1040	REALLOC_IF_NECESSARY (work_mbc->coll_elems,
1041	coll_elems_al,
1042	work_mbc->ncoll_elems + 1);
1043	work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
1044	}
1045	}
1046	colon_warning_state \|= 8;
1047
1048	/* Fetch new lookahead character. */
1049	FETCH_WC (c1, wc1, _("unbalanced ["));
1050	continue;
1051	}
1052
1053	/* We treat '[' as a normal character here. c/c1/wc/wc1
1054	are already set up. */
1055	}
1056
1057	if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1058	FETCH_WC (c, wc, _("unbalanced ["));
1059
1060	if (c1 == EOF)
1061	FETCH_WC (c1, wc1, _("unbalanced ["));
1062
1063	if (c1 == '-')
1064	/* build range characters. */
1065	{
1066	FETCH_WC (c2, wc2, _("unbalanced ["));
1067	if (c2 == ']')
1068	{
1069	/* In the case [x-], the - is an ordinary hyphen,
1070	which is left in c1, the lookahead character. */
1071	lexptr -= cur_mb_len;
1072	lexleft += cur_mb_len;
1073	}
1074	}
1075
1076	if (c1 == '-' && c2 != ']')
1077	{
1078	if (c2 == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1079	FETCH_WC (c2, wc2, _("unbalanced ["));
1080
1081	if (MB_CUR_MAX > 1)
1082	{
1083	/* When case folding map a range, say [m-z] (or even [M-z])
1084	to the pair of ranges, [m-z] [M-Z]. */
1085	REALLOC_IF_NECESSARY (work_mbc->range_sts,
1086	range_sts_al, work_mbc->nranges + 1);
1087	REALLOC_IF_NECESSARY (work_mbc->range_ends,
1088	range_ends_al, work_mbc->nranges + 1);
1089	work_mbc->range_sts[work_mbc->nranges] =
1090	case_fold ? towlower (wc) : (wchar_t) wc;
1091	work_mbc->range_ends[work_mbc->nranges++] =
1092	case_fold ? towlower (wc2) : (wchar_t) wc2;
1093
1094	#ifndef GREP
1095	if (case_fold && (iswalpha (wc) \|\| iswalpha (wc2)))
1096	{
1097	REALLOC_IF_NECESSARY (work_mbc->range_sts,
1098	range_sts_al, work_mbc->nranges + 1);
1099	work_mbc->range_sts[work_mbc->nranges] = towupper (wc);
1100	REALLOC_IF_NECESSARY (work_mbc->range_ends,
1101	range_ends_al, work_mbc->nranges + 1);
1102	work_mbc->range_ends[work_mbc->nranges++] = towupper (wc2);
1103	}
1104	#endif
1105	}
1106	else
1107	{
1108	c1 = c;
1109	if (case_fold)
1110	{
1111	c1 = tolower (c1);
1112	c2 = tolower (c2);
1113	}
1114	if (!hard_LC_COLLATE)
1115	for (c = c1; c <= c2; c++)
1116	setbit_case_fold_c (c, ccl);
1117	else
1118	{
1119	/* Defer to the system regex library about the meaning
1120	of range expressions. */
1121	regex_t re;
1122	char pattern[6] = { '[', c1, '-', c2, ']', 0 };
1123	char subject[2] = { 0, 0 };
1124	regcomp (&re, pattern, REG_NOSUB);
1125	for (c = 0; c < NOTCHAR; ++c)
1126	{
1127	subject[0] = c;
1128	if (!(case_fold && isupper (c))
1129	&& regexec (&re, subject, 0, NULL, 0) != REG_NOMATCH)
1130	setbit_case_fold_c (c, ccl);
1131	}
1132	regfree (&re);
1133	}
1134	}
1135
1136	colon_warning_state \|= 8;
1137	FETCH_WC (c1, wc1, _("unbalanced ["));
1138	continue;
1139	}
1140
1141	colon_warning_state \|= (c == ':') ? 2 : 4;
1142
1143	if (MB_CUR_MAX == 1)
1144	{
1145	setbit_case_fold_c (c, ccl);
1146	continue;
1147	}
1148
1149	if (case_fold && iswalpha (wc))
1150	{
1151	wc = towlower (wc);
1152	if (!setbit_wc (wc, ccl))
1153	{
1154	REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
1155	work_mbc->nchars + 1);
1156	work_mbc->chars[work_mbc->nchars++] = wc;
1157	}
1158	#ifdef GREP
1159	continue;
1160	#else
1161	wc = towupper (wc);
1162	#endif
1163	}
1164	if (!setbit_wc (wc, ccl))
1165	{
1166	REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
1167	work_mbc->nchars + 1);
1168	work_mbc->chars[work_mbc->nchars++] = wc;
1169	}
1170	}
1171	while ((wc = wc1, (c = c1) != ']'));
1172
1173	if (colon_warning_state == 7)
1174	dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
1175
1176	if (MB_CUR_MAX > 1)
1177	{
1178	static charclass zeroclass;
1179	work_mbc->invert = invert;
1180	work_mbc->cset = equal (ccl, zeroclass) ? -1 : charclass_index (ccl);
1181	return MBCSET;
1182	}
1183
1184	if (invert)
1185	{
1186	assert (MB_CUR_MAX == 1);
1187	notset (ccl);
1188	if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1189	clrbit (eolbyte, ccl);
1190	}
1191
1192	return CSET + charclass_index (ccl);
1193	}
1194
1195	static token
1196	lex (void)
1197	{
1198	unsigned int c, c2;
1199	int backslash = 0;
1200	charclass ccl;
1201	int i;
1202
1203	/* Basic plan: We fetch a character. If it's a backslash,
1204	we set the backslash flag and go through the loop again.
1205	On the plus side, this avoids having a duplicate of the
1206	main switch inside the backslash case. On the minus side,
1207	it means that just about every case begins with
1208	"if (backslash) ...". */
1209	for (i = 0; i < 2; ++i)
1210	{
1211	if (MB_CUR_MAX > 1)
1212	{
1213	FETCH_WC (c, wctok, NULL);
1214	if ((int) c == EOF)
1215	goto normal_char;
1216	}
1217	else
1218	FETCH (c, NULL);
1219
1220	switch (c)
1221	{
1222	case '\\':
1223	if (backslash)
1224	goto normal_char;
1225	if (lexleft == 0)
1226	dfaerror (_("unfinished \\ escape"));
1227	backslash = 1;
1228	break;
1229
1230	case '^':
1231	if (backslash)
1232	goto normal_char;
1233	if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1234	\|\| lasttok == END \|\| lasttok == LPAREN \|\| lasttok == OR)
1235	return lasttok = BEGLINE;
1236	goto normal_char;
1237
1238	case '$':
1239	if (backslash)
1240	goto normal_char;
1241	if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1242	\|\| lexleft == 0
1243	\|\| (syntax_bits & RE_NO_BK_PARENS
1244	? lexleft > 0 && *lexptr == ')'
1245	: lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
1246	\|\| (syntax_bits & RE_NO_BK_VBAR
1247	? lexleft > 0 && *lexptr == '\|'
1248	: lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '\|')
1249	\|\| ((syntax_bits & RE_NEWLINE_ALT)
1250	&& lexleft > 0 && *lexptr == '\n'))
1251	return lasttok = ENDLINE;
1252	goto normal_char;
1253
1254	case '1':
1255	case '2':
1256	case '3':
1257	case '4':
1258	case '5':
1259	case '6':
1260	case '7':
1261	case '8':
1262	case '9':
1263	if (backslash && !(syntax_bits & RE_NO_BK_REFS))
1264	{
1265	laststart = 0;
1266	return lasttok = BACKREF;
1267	}
1268	goto normal_char;
1269
1270	case '`':
1271	if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1272	return lasttok = BEGLINE; /* FIXME: should be beginning of string */
1273	goto normal_char;
1274
1275	case '\'':
1276	if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1277	return lasttok = ENDLINE; /* FIXME: should be end of string */
1278	goto normal_char;
1279
1280	case '<':
1281	if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1282	return lasttok = BEGWORD;
1283	goto normal_char;
1284
1285	case '>':
1286	if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1287	return lasttok = ENDWORD;
1288	goto normal_char;
1289
1290	case 'b':
1291	if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1292	return lasttok = LIMWORD;
1293	goto normal_char;
1294
1295	case 'B':
1296	if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1297	return lasttok = NOTLIMWORD;
1298	goto normal_char;
1299
1300	case '?':
1301	if (syntax_bits & RE_LIMITED_OPS)
1302	goto normal_char;
1303	if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
1304	goto normal_char;
1305	if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
1306	goto normal_char;
1307	return lasttok = QMARK;
1308
1309	case '*':
1310	if (backslash)
1311	goto normal_char;
1312	if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
1313	goto normal_char;
1314	return lasttok = STAR;
1315
1316	case '+':
1317	if (syntax_bits & RE_LIMITED_OPS)
1318	goto normal_char;
1319	if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
1320	goto normal_char;
1321	if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
1322	goto normal_char;
1323	return lasttok = PLUS;
1324
1325	case '{':
1326	if (!(syntax_bits & RE_INTERVALS))
1327	goto normal_char;
1328	if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
1329	goto normal_char;
1330	if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
1331	goto normal_char;
1332
1333	/* Cases:
1334	{M} - exact count
1335	{M,} - minimum count, maximum is infinity
1336	{,N} - 0 through N
1337	{,} - 0 to infinity (same as '*')
1338	{M,N} - M through N */
1339	{
1340	char const *p = lexptr;
1341	char const *lim = p + lexleft;
1342	minrep = maxrep = -1;
1343	for (; p != lim && ISASCIIDIGIT (*p); p++)
1344	{
1345	if (minrep < 0)
1346	minrep = *p - '0';
1347	else
1348	minrep = MIN (RE_DUP_MAX + 1, minrep * 10 + *p - '0');
1349	}
1350	if (p != lim)
1351	{
1352	if (*p != ',')
1353	maxrep = minrep;
1354	else
1355	{
1356	if (minrep < 0)
1357	minrep = 0;
1358	while (++p != lim && ISASCIIDIGIT (*p))
1359	{
1360	if (maxrep < 0)
1361	maxrep = *p - '0';
1362	else
1363	maxrep = MIN (RE_DUP_MAX + 1, maxrep * 10 + *p - '0');
1364	}
1365	}
1366	}
1367	if (! ((! backslash \|\| (p != lim && *p++ == '\\'))
1368	&& p != lim && *p++ == '}'
1369	&& 0 <= minrep && (maxrep < 0 \|\| minrep <= maxrep)))
1370	{
1371	if (syntax_bits & RE_INVALID_INTERVAL_ORD)
1372	goto normal_char;
1373	dfaerror (_("Invalid content of \\{\\}"));
1374	}
1375	if (RE_DUP_MAX < maxrep)
1376	dfaerror (_("Regular expression too big"));
1377	lexptr = p;
1378	lexleft = lim - p;
1379	}
1380	laststart = 0;
1381	return lasttok = REPMN;
1382
1383	case '\|':
1384	if (syntax_bits & RE_LIMITED_OPS)
1385	goto normal_char;
1386	if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
1387	goto normal_char;
1388	laststart = 1;
1389	return lasttok = OR;
1390
1391	case '\n':
1392	if (syntax_bits & RE_LIMITED_OPS
1393	\|\| backslash \|\| !(syntax_bits & RE_NEWLINE_ALT))
1394	goto normal_char;
1395	laststart = 1;
1396	return lasttok = OR;
1397
1398	case '(':
1399	if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
1400	goto normal_char;
1401	++parens;
1402	laststart = 1;
1403	return lasttok = LPAREN;
1404
1405	case ')':
1406	if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
1407	goto normal_char;
1408	if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1409	goto normal_char;
1410	--parens;
1411	laststart = 0;
1412	return lasttok = RPAREN;
1413
1414	case '.':
1415	if (backslash)
1416	goto normal_char;
1417	if (MB_CUR_MAX > 1)
1418	{
1419	/* In multibyte environment period must match with a single
1420	character not a byte. So we use ANYCHAR. */
1421	laststart = 0;
1422	return lasttok = ANYCHAR;
1423	}
1424	zeroset (ccl);
1425	notset (ccl);
1426	if (!(syntax_bits & RE_DOT_NEWLINE))
1427	clrbit (eolbyte, ccl);
1428	if (syntax_bits & RE_DOT_NOT_NULL)
1429	clrbit ('\0', ccl);
1430	laststart = 0;
1431	return lasttok = CSET + charclass_index (ccl);
1432
1433	case 's':
1434	case 'S':
1435	if (!backslash \|\| (syntax_bits & RE_NO_GNU_OPS))
1436	goto normal_char;
1437	zeroset (ccl);
1438	for (c2 = 0; c2 < NOTCHAR; ++c2)
1439	if (isspace (c2))
1440	setbit (c2, ccl);
1441	if (c == 'S')
1442	notset (ccl);
1443	laststart = 0;
1444	return lasttok = CSET + charclass_index (ccl);
1445
1446	case 'w':
1447	case 'W':
1448	if (!backslash \|\| (syntax_bits & RE_NO_GNU_OPS))
1449	goto normal_char;
1450	zeroset (ccl);
1451	for (c2 = 0; c2 < NOTCHAR; ++c2)
1452	if (IS_WORD_CONSTITUENT (c2))
1453	setbit (c2, ccl);
1454	if (c == 'W')
1455	notset (ccl);
1456	laststart = 0;
1457	return lasttok = CSET + charclass_index (ccl);
1458
1459	case '[':
1460	if (backslash)
1461	goto normal_char;
1462	laststart = 0;
1463	return lasttok = parse_bracket_exp ();
1464
1465	default:
1466	normal_char:
1467	laststart = 0;
1468	/* For multibyte character sets, folding is done in atom. Always
1469	return WCHAR. */
1470	if (MB_CUR_MAX > 1)
1471	return lasttok = WCHAR;
1472
1473	if (case_fold && isalpha (c))
1474	{
1475	zeroset (ccl);
1476	setbit_case_fold_c (c, ccl);
1477	return lasttok = CSET + charclass_index (ccl);
1478	}
1479
1480	return lasttok = c;
1481	}
1482	}
1483
1484	/* The above loop should consume at most a backslash
1485	and some other character. */
1486	abort ();
1487	return END; /* keeps pedantic compilers happy. */
1488	}
1489
1490	/* Recursive descent parser for regular expressions. */
1491
1492	static token tok; /* Lookahead token. */
1493	static size_t depth; /* Current depth of a hypothetical stack
1494	holding deferred productions. This is
1495	used to determine the depth that will be
1496	required of the real stack later on in
1497	dfaanalyze(). */
1498
1499	static void
1500	addtok_mb (token t, int mbprop)
1501	{
1502	if (MB_CUR_MAX > 1)
1503	{
1504	REALLOC_IF_NECESSARY (dfa->multibyte_prop, dfa->nmultibyte_prop,
1505	dfa->tindex + 1);
1506	dfa->multibyte_prop[dfa->tindex] = mbprop;
1507	}
1508
1509	REALLOC_IF_NECESSARY (dfa->tokens, dfa->talloc, dfa->tindex + 1);
1510	dfa->tokens[dfa->tindex++] = t;
1511
1512	switch (t)
1513	{
1514	case QMARK:
1515	case STAR:
1516	case PLUS:
1517	break;
1518
1519	case CAT:
1520	case OR:
1521	--depth;
1522	break;
1523
1524	default:
1525	++dfa->nleaves;
1526	case EMPTY:
1527	++depth;
1528	break;
1529	}
1530	if (depth > dfa->depth)
1531	dfa->depth = depth;
1532	}
1533
1534	static void addtok_wc (wint_t wc);
1535
1536	/* Add the given token to the parse tree, maintaining the depth count and
1537	updating the maximum depth if necessary. */
1538	static void
1539	addtok (token t)
1540	{
1541	if (MB_CUR_MAX > 1 && t == MBCSET)
1542	{
1543	bool need_or = false;
1544	struct mb_char_classes *work_mbc = &dfa->mbcsets[dfa->nmbcsets - 1];
1545
1546	/* Extract wide characters into alternations for better performance.
1547	This does not require UTF-8. */
1548	if (!work_mbc->invert)
1549	{
1550	size_t i;
1551	for (i = 0; i < work_mbc->nchars; i++)
1552	{
1553	addtok_wc (work_mbc->chars[i]);
1554	if (need_or)
1555	addtok (OR);
1556	need_or = true;
1557	}
1558	work_mbc->nchars = 0;
1559	}
1560
1561	/* UTF-8 allows treating a simple, non-inverted MBCSET like a CSET. */
1562	if (work_mbc->invert
1563	\|\| (!using_utf8 () && work_mbc->cset != -1)
1564	\|\| work_mbc->nchars != 0
1565	\|\| work_mbc->nch_classes != 0
1566	\|\| work_mbc->nranges != 0
1567	\|\| work_mbc->nequivs != 0 \|\| work_mbc->ncoll_elems != 0)
1568	{
1569	addtok_mb (MBCSET, ((dfa->nmbcsets - 1) << 2) + 3);
1570	if (need_or)
1571	addtok (OR);
1572	}
1573	else
1574	{
1575	/* Characters have been handled above, so it is possible
1576	that the mbcset is empty now. Do nothing in that case. */
1577	if (work_mbc->cset != -1)
1578	{
1579	assert (using_utf8 ());
1580	addtok (CSET + work_mbc->cset);
1581	if (need_or)
1582	addtok (OR);
1583	}
1584	}
1585	}
1586	else
1587	{
1588	addtok_mb (t, 3);
1589	}
1590	}
1591
1592	#if MBS_SUPPORT
1593	/* We treat a multibyte character as a single atom, so that DFA
1594	can treat a multibyte character as a single expression.
1595
1596	e.g. We construct following tree from "<mb1><mb2>".
1597	<mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1598	<mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1599	static void
1600	addtok_wc (wint_t wc)
1601	{
1602	unsigned char buf[MB_LEN_MAX];
1603	mbstate_t s;
1604	int i;
1605	memset (&s, 0, sizeof s);
1606	cur_mb_len = wcrtomb ((char *) buf, wc, &s);
1607
1608	/* This is merely stop-gap. When cur_mb_len is 0 or negative,
1609	buf[0] is undefined, yet skipping the addtok_mb call altogether
1610	can result in heap corruption. */
1611	if (cur_mb_len <= 0)
1612	buf[0] = 0;
1613
1614	addtok_mb (buf[0], cur_mb_len == 1 ? 3 : 1);
1615	for (i = 1; i < cur_mb_len; i++)
1616	{
1617	addtok_mb (buf[i], i == cur_mb_len - 1 ? 2 : 0);
1618	addtok (CAT);
1619	}
1620	}
1621	#else
1622	static void
1623	addtok_wc (wint_t wc)
1624	{
1625	}
1626	#endif
1627
1628	static void
1629	add_utf8_anychar (void)
1630	{
1631	#if MBS_SUPPORT
1632	static const charclass utf8_classes[5] = {
1633	{0, 0, 0, 0, ~0, ~0, 0, 0}, /* 80-bf: non-lead bytes */
1634	{~0, ~0, ~0, ~0, 0, 0, 0, 0}, /* 00-7f: 1-byte sequence */
1635	{0, 0, 0, 0, 0, 0, 0xfffffffcU, 0}, /* c2-df: 2-byte sequence */
1636	{0, 0, 0, 0, 0, 0, 0, 0xffff}, /* e0-ef: 3-byte sequence */
1637	{0, 0, 0, 0, 0, 0, 0, 0xff0000} /* f0-f7: 4-byte sequence */
1638	};
1639	const unsigned int n = sizeof (utf8_classes) / sizeof (utf8_classes[0]);
1640	unsigned int i;
1641
1642	/* Define the five character classes that are needed below. */
1643	if (dfa->utf8_anychar_classes[0] == 0)
1644	for (i = 0; i < n; i++)
1645	{
1646	charclass c;
1647	copyset (utf8_classes[i], c);
1648	if (i == 1)
1649	{
1650	if (!(syntax_bits & RE_DOT_NEWLINE))
1651	clrbit (eolbyte, c);
1652	if (syntax_bits & RE_DOT_NOT_NULL)
1653	clrbit ('\0', c);
1654	}
1655	dfa->utf8_anychar_classes[i] = CSET + charclass_index (c);
1656	}
1657
1658	/* A valid UTF-8 character is
1659
1660	([0x00-0x7f]
1661	\|[0xc2-0xdf][0x80-0xbf]
1662	\|[0xe0-0xef[0x80-0xbf][0x80-0xbf]
1663	\|[0xf0-f7][0x80-0xbf][0x80-0xbf][0x80-0xbf])
1664
1665	which I'll write more concisely "B\|CA\|DAA\|EAAA". Factor the [0x00-0x7f]
1666	and you get "B\|(C\|(D\|EA)A)A". And since the token buffer is in reverse
1667	Polish notation, you get "B C D E A CAT OR A CAT OR A CAT OR". */
1668	for (i = 1; i < n; i++)
1669	addtok (dfa->utf8_anychar_classes[i]);
1670	while (--i > 1)
1671	{
1672	addtok (dfa->utf8_anychar_classes[0]);
1673	addtok (CAT);
1674	addtok (OR);
1675	}
1676	#endif
1677	}
1678
1679	/* The grammar understood by the parser is as follows.
1680
1681	regexp:
1682	regexp OR branch
1683	branch
1684
1685	branch:
1686	branch closure
1687	closure
1688
1689	closure:
1690	closure QMARK
1691	closure STAR
1692	closure PLUS
1693	closure REPMN
1694	atom
1695
1696	atom:
1697	<normal character>
1698	<multibyte character>
1699	ANYCHAR
1700	MBCSET
1701	CSET
1702	BACKREF
1703	BEGLINE
1704	ENDLINE
1705	BEGWORD
1706	ENDWORD
1707	LIMWORD
1708	NOTLIMWORD
1709	LPAREN regexp RPAREN
1710	<empty>
1711
1712	The parser builds a parse tree in postfix form in an array of tokens. */
1713
1714	static void
1715	atom (void)
1716	{
1717	if (0)
1718	{
1719	/* empty */
1720	}
1721	else if (MBS_SUPPORT && tok == WCHAR)
1722	{
1723	addtok_wc (case_fold ? towlower (wctok) : wctok);
1724	#ifndef GREP
1725	if (case_fold && iswalpha (wctok))
1726	{
1727	addtok_wc (towupper (wctok));
1728	addtok (OR);
1729	}
1730	#endif
1731
1732	tok = lex ();
1733	}
1734	else if (MBS_SUPPORT && tok == ANYCHAR && using_utf8 ())
1735	{
1736	/* For UTF-8 expand the period to a series of CSETs that define a valid
1737	UTF-8 character. This avoids using the slow multibyte path. I'm
1738	pretty sure it would be both profitable and correct to do it for
1739	any encoding; however, the optimization must be done manually as
1740	it is done above in add_utf8_anychar. So, let's start with
1741	UTF-8: it is the most used, and the structure of the encoding
1742	makes the correctness more obvious. */
1743	add_utf8_anychar ();
1744	tok = lex ();
1745	}
1746	else if ((tok >= 0 && tok < NOTCHAR) \|\| tok >= CSET \|\| tok == BACKREF
1747	\|\| tok == BEGLINE \|\| tok == ENDLINE \|\| tok == BEGWORD
1748	#if MBS_SUPPORT
1749	\|\| tok == ANYCHAR \|\| tok == MBCSET
1750	#endif /* MBS_SUPPORT */
1751	\|\| tok == ENDWORD \|\| tok == LIMWORD \|\| tok == NOTLIMWORD)
1752	{
1753	addtok (tok);
1754	tok = lex ();
1755	}
1756	else if (tok == LPAREN)
1757	{
1758	tok = lex ();
1759	regexp ();
1760	if (tok != RPAREN)
1761	dfaerror (_("unbalanced ("));
1762	tok = lex ();
1763	}
1764	else
1765	addtok (EMPTY);
1766	}
1767
1768	/* Return the number of tokens in the given subexpression. */
1769	static size_t _GL_ATTRIBUTE_PURE
1770	nsubtoks (size_t tindex)
1771	{
1772	size_t ntoks1;
1773
1774	switch (dfa->tokens[tindex - 1])
1775	{
1776	default:
1777	return 1;
1778	case QMARK:
1779	case STAR:
1780	case PLUS:
1781	return 1 + nsubtoks (tindex - 1);
1782	case CAT:
1783	case OR:
1784	ntoks1 = nsubtoks (tindex - 1);
1785	return 1 + ntoks1 + nsubtoks (tindex - 1 - ntoks1);
1786	}
1787	}
1788
1789	/* Copy the given subexpression to the top of the tree. */
1790	static void
1791	copytoks (size_t tindex, size_t ntokens)
1792	{
1793	size_t i;
1794
1795	for (i = 0; i < ntokens; ++i)
1796	{
1797	addtok (dfa->tokens[tindex + i]);
1798	/* Update index into multibyte csets. */
1799	if (MB_CUR_MAX > 1 && dfa->tokens[tindex + i] == MBCSET)
1800	dfa->multibyte_prop[dfa->tindex - 1] = dfa->multibyte_prop[tindex + i];
1801	}
1802	}
1803
1804	static void
1805	closure (void)
1806	{
1807	int i;
1808	size_t tindex, ntokens;
1809
1810	atom ();
1811	while (tok == QMARK \|\| tok == STAR \|\| tok == PLUS \|\| tok == REPMN)
1812	if (tok == REPMN && (minrep \|\| maxrep))
1813	{
1814	ntokens = nsubtoks (dfa->tindex);
1815	tindex = dfa->tindex - ntokens;
1816	if (maxrep < 0)
1817	addtok (PLUS);
1818	if (minrep == 0)
1819	addtok (QMARK);
1820	for (i = 1; i < minrep; ++i)
1821	{
1822	copytoks (tindex, ntokens);
1823	addtok (CAT);
1824	}
1825	for (; i < maxrep; ++i)
1826	{
1827	copytoks (tindex, ntokens);
1828	addtok (QMARK);
1829	addtok (CAT);
1830	}
1831	tok = lex ();
1832	}
1833	else if (tok == REPMN)
1834	{
1835	dfa->tindex -= nsubtoks (dfa->tindex);
1836	tok = lex ();
1837	closure ();
1838	}
1839	else
1840	{
1841	addtok (tok);
1842	tok = lex ();
1843	}
1844	}
1845
1846	static void
1847	branch (void)
1848	{
1849	closure ();
1850	while (tok != RPAREN && tok != OR && tok >= 0)
1851	{
1852	closure ();
1853	addtok (CAT);
1854	}
1855	}
1856
1857	static void
1858	regexp (void)
1859	{
1860	branch ();
1861	while (tok == OR)
1862	{
1863	tok = lex ();
1864	branch ();
1865	addtok (OR);
1866	}
1867	}
1868
1869	/* Main entry point for the parser. S is a string to be parsed, len is the
1870	length of the string, so s can include NUL characters. D is a pointer to
1871	the struct dfa to parse into. */
1872	void
1873	dfaparse (char const s, size_t len, struct dfa d)
1874	{
1875	dfa = d;
1876	lexptr = s;
1877	lexleft = len;
1878	lasttok = END;
1879	laststart = 1;
1880	parens = 0;
1881	#ifdef LC_COLLATE
1882	hard_LC_COLLATE = hard_locale (LC_COLLATE);
1883	#endif
1884	if (MB_CUR_MAX > 1)
1885	{
1886	cur_mb_len = 0;
1887	memset (&mbs, 0, sizeof mbs);
1888	}
1889
1890	if (!syntax_bits_set)
1891	dfaerror (_("no syntax specified"));
1892
1893	tok = lex ();
1894	depth = d->depth;
1895
1896	regexp ();
1897
1898	if (tok != END)
1899	dfaerror (_("unbalanced )"));
1900
1901	addtok (END - d->nregexps);
1902	addtok (CAT);
1903
1904	if (d->nregexps)
1905	addtok (OR);
1906
1907	++d->nregexps;
1908	}
1909
1910	/* Some primitives for operating on sets of positions. */
1911
1912	/* Copy one set to another; the destination must be large enough. */
1913	static void
1914	copy (position_set const src, position_set dst)
1915	{
1916	REALLOC_IF_NECESSARY (dst->elems, dst->alloc, src->nelem);
1917	memcpy (dst->elems, src->elems, sizeof (dst->elems[0]) * src->nelem);
1918	dst->nelem = src->nelem;
1919	}
1920
1921	static void
1922	alloc_position_set (position_set * s, size_t size)
1923	{
1924	MALLOC (s->elems, size);
1925	s->alloc = size;
1926	s->nelem = 0;
1927	}
1928
1929	/* Insert position P in set S. S is maintained in sorted order on
1930	decreasing index. If there is already an entry in S with P.index
1931	then merge (logically-OR) P's constraints into the one in S.
1932	S->elems must point to an array large enough to hold the resulting set. */
1933	static void
1934	insert (position p, position_set * s)
1935	{
1936	size_t count = s->nelem;
1937	size_t lo = 0, hi = count;
1938	size_t i;
1939	while (lo < hi)
1940	{
1941	size_t mid = (lo + hi) >> 1;
1942	if (s->elems[mid].index > p.index)
1943	lo = mid + 1;
1944	else
1945	hi = mid;
1946	}
1947
1948	if (lo < count && p.index == s->elems[lo].index)
1949	{
1950	s->elems[lo].constraint \|= p.constraint;
1951	return;
1952	}
1953
1954	REALLOC_IF_NECESSARY (s->elems, s->alloc, count + 1);
1955	for (i = count; i > lo; i--)
1956	s->elems[i] = s->elems[i - 1];
1957	s->elems[lo] = p;
1958	++s->nelem;
1959	}
1960
1961	/* Merge two sets of positions into a third. The result is exactly as if
1962	the positions of both sets were inserted into an initially empty set. */
1963	static void
1964	merge (position_set const s1, position_set const s2, position_set * m)
1965	{
1966	size_t i = 0, j = 0;
1967
1968	REALLOC_IF_NECESSARY (m->elems, m->alloc, s1->nelem + s2->nelem);
1969	m->nelem = 0;
1970	while (i < s1->nelem && j < s2->nelem)
1971	if (s1->elems[i].index > s2->elems[j].index)
1972	m->elems[m->nelem++] = s1->elems[i++];
1973	else if (s1->elems[i].index < s2->elems[j].index)
1974	m->elems[m->nelem++] = s2->elems[j++];
1975	else
1976	{
1977	m->elems[m->nelem] = s1->elems[i++];
1978	m->elems[m->nelem++].constraint \|= s2->elems[j++].constraint;
1979	}
1980	while (i < s1->nelem)
1981	m->elems[m->nelem++] = s1->elems[i++];
1982	while (j < s2->nelem)
1983	m->elems[m->nelem++] = s2->elems[j++];
1984	}
1985
1986	/* Delete a position from a set. */
1987	static void
1988	delete (position p, position_set * s)
1989	{
1990	size_t i;
1991
1992	for (i = 0; i < s->nelem; ++i)
1993	if (p.index == s->elems[i].index)
1994	break;
1995	if (i < s->nelem)
1996	for (--s->nelem; i < s->nelem; ++i)
1997	s->elems[i] = s->elems[i + 1];
1998	}
1999
2000	/* Find the index of the state corresponding to the given position set with
2001	the given preceding context, or create a new state if there is no such
2002	state. Context tells whether we got here on a newline or letter. */
2003	static state_num
2004	state_index (struct dfa d, position_set const s, int context)
2005	{
2006	size_t hash = 0;
2007	int constraint;
2008	state_num i, j;
2009
2010	for (i = 0; i < s->nelem; ++i)
2011	hash ^= s->elems[i].index + s->elems[i].constraint;
2012
2013	/* Try to find a state that exactly matches the proposed one. */
2014	for (i = 0; i < d->sindex; ++i)
2015	{
2016	if (hash != d->states[i].hash \|\| s->nelem != d->states[i].elems.nelem
2017	\|\| context != d->states[i].context)
2018	continue;
2019	for (j = 0; j < s->nelem; ++j)
2020	if (s->elems[j].constraint
2021	!= d->states[i].elems.elems[j].constraint
2022	\|\| s->elems[j].index != d->states[i].elems.elems[j].index)
2023	break;
2024	if (j == s->nelem)
2025	return i;
2026	}
2027
2028	/* We'll have to create a new state. */
2029	REALLOC_IF_NECESSARY (d->states, d->salloc, d->sindex + 1);
2030	d->states[i].hash = hash;
2031	alloc_position_set (&d->states[i].elems, s->nelem);
2032	copy (s, &d->states[i].elems);
2033	d->states[i].context = context;
2034	d->states[i].backref = 0;
2035	d->states[i].constraint = 0;
2036	d->states[i].first_end = 0;
2037	if (MBS_SUPPORT)
2038	{
2039	d->states[i].mbps.nelem = 0;
2040	d->states[i].mbps.elems = NULL;
2041	}
2042	for (j = 0; j < s->nelem; ++j)
2043	if (d->tokens[s->elems[j].index] < 0)
2044	{
2045	constraint = s->elems[j].constraint;
2046	if (SUCCEEDS_IN_CONTEXT (constraint, context, CTX_ANY))
2047	d->states[i].constraint \|= constraint;
2048	if (!d->states[i].first_end)
2049	d->states[i].first_end = d->tokens[s->elems[j].index];
2050	}
2051	else if (d->tokens[s->elems[j].index] == BACKREF)
2052	{
2053	d->states[i].constraint = NO_CONSTRAINT;
2054	d->states[i].backref = 1;
2055	}
2056
2057	++d->sindex;
2058
2059	return i;
2060	}
2061
2062	/* Find the epsilon closure of a set of positions. If any position of the set
2063	contains a symbol that matches the empty string in some context, replace
2064	that position with the elements of its follow labeled with an appropriate
2065	constraint. Repeat exhaustively until no funny positions are left.
2066	S->elems must be large enough to hold the result. */
2067	static void
2068	epsclosure (position_set * s, struct dfa const *d)
2069	{
2070	size_t i, j;
2071	char visited; / array of booleans, enough to use char, not int */
2072	position p, old;
2073
2074	CALLOC (visited, d->tindex);
2075
2076	for (i = 0; i < s->nelem; ++i)
2077	if (d->tokens[s->elems[i].index] >= NOTCHAR
2078	&& d->tokens[s->elems[i].index] != BACKREF
2079	#if MBS_SUPPORT
2080	&& d->tokens[s->elems[i].index] != ANYCHAR
2081	&& d->tokens[s->elems[i].index] != MBCSET
2082	#endif
2083	&& d->tokens[s->elems[i].index] < CSET)
2084	{
2085	old = s->elems[i];
2086	p.constraint = old.constraint;
2087	delete (s->elems[i], s);
2088	if (visited[old.index])
2089	{
2090	--i;
2091	continue;
2092	}
2093	visited[old.index] = 1;
2094	switch (d->tokens[old.index])
2095	{
2096	case BEGLINE:
2097	p.constraint &= BEGLINE_CONSTRAINT;
2098	break;
2099	case ENDLINE:
2100	p.constraint &= ENDLINE_CONSTRAINT;
2101	break;
2102	case BEGWORD:
2103	p.constraint &= BEGWORD_CONSTRAINT;
2104	break;
2105	case ENDWORD:
2106	p.constraint &= ENDWORD_CONSTRAINT;
2107	break;
2108	case LIMWORD:
2109	p.constraint &= LIMWORD_CONSTRAINT;
2110	break;
2111	case NOTLIMWORD:
2112	p.constraint &= NOTLIMWORD_CONSTRAINT;
2113	break;
2114	default:
2115	break;
2116	}
2117	for (j = 0; j < d->follows[old.index].nelem; ++j)
2118	{
2119	p.index = d->follows[old.index].elems[j].index;
2120	insert (p, s);
2121	}
2122	/* Force rescan to start at the beginning. */
2123	i = -1;
2124	}
2125
2126	free (visited);
2127	}
2128
2129	/* Returns the set of contexts for which there is at least one
2130	character included in C. */
2131
2132	static int
2133	charclass_context (charclass c)
2134	{
2135	int context = 0;
2136	unsigned int j;
2137
2138	if (tstbit (eolbyte, c))
2139	context \|= CTX_NEWLINE;
2140
2141	for (j = 0; j < CHARCLASS_INTS; ++j)
2142	{
2143	if (c[j] & letters[j])
2144	context \|= CTX_LETTER;
2145	if (c[j] & ~(letters[j] \| newline[j]))
2146	context \|= CTX_NONE;
2147	}
2148
2149	return context;
2150	}
2151
2152	/* Returns the contexts on which the position set S depends. Each context
2153	in the set of returned contexts (let's call it SC) may have a different
2154	follow set than other contexts in SC, and also different from the
2155	follow set of the complement set (sc ^ CTX_ANY). However, all contexts
2156	in the complement set will have the same follow set. */
2157
2158	static int _GL_ATTRIBUTE_PURE
2159	state_separate_contexts (position_set const *s)
2160	{
2161	int separate_contexts = 0;
2162	size_t j;
2163
2164	for (j = 0; j < s->nelem; ++j)
2165	{
2166	if (PREV_NEWLINE_DEPENDENT (s->elems[j].constraint))
2167	separate_contexts \|= CTX_NEWLINE;
2168	if (PREV_LETTER_DEPENDENT (s->elems[j].constraint))
2169	separate_contexts \|= CTX_LETTER;
2170	}
2171
2172	return separate_contexts;
2173	}
2174
2175
2176	/* Perform bottom-up analysis on the parse tree, computing various functions.
2177	Note that at this point, we're pretending constructs like \< are real
2178	characters rather than constraints on what can follow them.
2179
2180	Nullable: A node is nullable if it is at the root of a regexp that can
2181	match the empty string.
2182	* EMPTY leaves are nullable.
2183	* No other leaf is nullable.
2184	* A QMARK or STAR node is nullable.
2185	* A PLUS node is nullable if its argument is nullable.
2186	* A CAT node is nullable if both its arguments are nullable.
2187	* An OR node is nullable if either argument is nullable.
2188
2189	Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
2190	that could correspond to the first character of a string matching the
2191	regexp rooted at the given node.
2192	* EMPTY leaves have empty firstpos.
2193	* The firstpos of a nonempty leaf is that leaf itself.
2194	* The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2195	argument.
2196	* The firstpos of a CAT node is the firstpos of the left argument, union
2197	the firstpos of the right if the left argument is nullable.
2198	* The firstpos of an OR node is the union of firstpos of each argument.
2199
2200	Lastpos: The lastpos of a node is the set of positions that could
2201	correspond to the last character of a string matching the regexp at
2202	the given node.
2203	* EMPTY leaves have empty lastpos.
2204	* The lastpos of a nonempty leaf is that leaf itself.
2205	* The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2206	argument.
2207	* The lastpos of a CAT node is the lastpos of its right argument, union
2208	the lastpos of the left if the right argument is nullable.
2209	* The lastpos of an OR node is the union of the lastpos of each argument.
2210
2211	Follow: The follow of a position is the set of positions that could
2212	correspond to the character following a character matching the node in
2213	a string matching the regexp. At this point we consider special symbols
2214	that match the empty string in some context to be just normal characters.
2215	Later, if we find that a special symbol is in a follow set, we will
2216	replace it with the elements of its follow, labeled with an appropriate
2217	constraint.
2218	* Every node in the firstpos of the argument of a STAR or PLUS node is in
2219	the follow of every node in the lastpos.
2220	* Every node in the firstpos of the second argument of a CAT node is in
2221	the follow of every node in the lastpos of the first argument.
2222
2223	Because of the postfix representation of the parse tree, the depth-first
2224	analysis is conveniently done by a linear scan with the aid of a stack.
2225	Sets are stored as arrays of the elements, obeying a stack-like allocation
2226	scheme; the number of elements in each set deeper in the stack can be
2227	used to determine the address of a particular set's array. */
2228	void
2229	dfaanalyze (struct dfa *d, int searchflag)
2230	{
2231	int nullable; / Nullable stack. */
2232	size_t nfirstpos; / Element count stack for firstpos sets. */
2233	position firstpos; / Array where firstpos elements are stored. */
2234	size_t nlastpos; / Element count stack for lastpos sets. */
2235	position lastpos; / Array where lastpos elements are stored. */
2236	position_set tmp; /* Temporary set for merging sets. */
2237	position_set merged; /* Result of merging sets. */
2238	int separate_contexts; /* Context wanted by some position. */
2239	int *o_nullable;
2240	size_t o_nfirst, o_nlast;
2241	position o_firstpos, o_lastpos;
2242	size_t i, j;
2243	position *pos;
2244
2245	#ifdef DEBUG
2246	fprintf (stderr, "dfaanalyze:\n");
2247	for (i = 0; i < d->tindex; ++i)
2248	{
2249	fprintf (stderr, " %zd:", i);
2250	prtok (d->tokens[i]);
2251	}
2252	putc ('\n', stderr);
2253	#endif
2254
2255	d->searchflag = searchflag;
2256
2257	MALLOC (nullable, d->depth);
2258	o_nullable = nullable;
2259	MALLOC (nfirstpos, d->depth);
2260	o_nfirst = nfirstpos;
2261	MALLOC (firstpos, d->nleaves);
2262	o_firstpos = firstpos, firstpos += d->nleaves;
2263	MALLOC (nlastpos, d->depth);
2264	o_nlast = nlastpos;
2265	MALLOC (lastpos, d->nleaves);
2266	o_lastpos = lastpos, lastpos += d->nleaves;
2267	alloc_position_set (&merged, d->nleaves);
2268
2269	CALLOC (d->follows, d->tindex);
2270
2271	for (i = 0; i < d->tindex; ++i)
2272	{
2273	switch (d->tokens[i])
2274	{
2275	case EMPTY:
2276	/* The empty set is nullable. */
2277	*nullable++ = 1;
2278
2279	/* The firstpos and lastpos of the empty leaf are both empty. */
2280	nfirstpos++ = nlastpos++ = 0;
2281	break;
2282
2283	case STAR:
2284	case PLUS:
2285	/* Every element in the firstpos of the argument is in the follow
2286	of every element in the lastpos. */
2287	tmp.nelem = nfirstpos[-1];
2288	tmp.elems = firstpos;
2289	pos = lastpos;
2290	for (j = 0; j < nlastpos[-1]; ++j)
2291	{
2292	merge (&tmp, &d->follows[pos[j].index], &merged);
2293	copy (&merged, &d->follows[pos[j].index]);
2294	}
2295
2296	case QMARK:
2297	/* A QMARK or STAR node is automatically nullable. */
2298	if (d->tokens[i] != PLUS)
2299	nullable[-1] = 1;
2300	break;
2301
2302	case CAT:
2303	/* Every element in the firstpos of the second argument is in the
2304	follow of every element in the lastpos of the first argument. */
2305	tmp.nelem = nfirstpos[-1];
2306	tmp.elems = firstpos;
2307	pos = lastpos + nlastpos[-1];
2308	for (j = 0; j < nlastpos[-2]; ++j)
2309	{
2310	merge (&tmp, &d->follows[pos[j].index], &merged);
2311	copy (&merged, &d->follows[pos[j].index]);
2312	}
2313
2314	/* The firstpos of a CAT node is the firstpos of the first argument,
2315	union that of the second argument if the first is nullable. */
2316	if (nullable[-2])
2317	nfirstpos[-2] += nfirstpos[-1];
2318	else
2319	firstpos += nfirstpos[-1];
2320	--nfirstpos;
2321
2322	/* The lastpos of a CAT node is the lastpos of the second argument,
2323	union that of the first argument if the second is nullable. */
2324	if (nullable[-1])
2325	nlastpos[-2] += nlastpos[-1];
2326	else
2327	{
2328	pos = lastpos + nlastpos[-2];
2329	for (j = nlastpos[-1]; j-- > 0;)
2330	pos[j] = lastpos[j];
2331	lastpos += nlastpos[-2];
2332	nlastpos[-2] = nlastpos[-1];
2333	}
2334	--nlastpos;
2335
2336	/* A CAT node is nullable if both arguments are nullable. */
2337	nullable[-2] = nullable[-1] && nullable[-2];
2338	--nullable;
2339	break;
2340
2341	case OR:
2342	/* The firstpos is the union of the firstpos of each argument. */
2343	nfirstpos[-2] += nfirstpos[-1];
2344	--nfirstpos;
2345
2346	/* The lastpos is the union of the lastpos of each argument. */
2347	nlastpos[-2] += nlastpos[-1];
2348	--nlastpos;
2349
2350	/* An OR node is nullable if either argument is nullable. */
2351	nullable[-2] = nullable[-1] \|\| nullable[-2];
2352	--nullable;
2353	break;
2354
2355	default:
2356	/* Anything else is a nonempty position. (Note that special
2357	constructs like \< are treated as nonempty strings here;
2358	an "epsilon closure" effectively makes them nullable later.
2359	Backreferences have to get a real position so we can detect
2360	transitions on them later. But they are nullable. */
2361	*nullable++ = d->tokens[i] == BACKREF;
2362
2363	/* This position is in its own firstpos and lastpos. */
2364	nfirstpos++ = nlastpos++ = 1;
2365	--firstpos, --lastpos;
2366	firstpos->index = lastpos->index = i;
2367	firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2368
2369	/* Allocate the follow set for this position. */
2370	alloc_position_set (&d->follows[i], 1);
2371	break;
2372	}
2373	#ifdef DEBUG
2374	/* ... balance the above nonsyntactic #ifdef goo... */
2375	fprintf (stderr, "node %zd:", i);
2376	prtok (d->tokens[i]);
2377	putc ('\n', stderr);
2378	fprintf (stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");
2379	fprintf (stderr, " firstpos:");
2380	for (j = nfirstpos[-1]; j-- > 0;)
2381	{
2382	fprintf (stderr, " %zd:", firstpos[j].index);
2383	prtok (d->tokens[firstpos[j].index]);
2384	}
2385	fprintf (stderr, "\n lastpos:");
2386	for (j = nlastpos[-1]; j-- > 0;)
2387	{
2388	fprintf (stderr, " %zd:", lastpos[j].index);
2389	prtok (d->tokens[lastpos[j].index]);
2390	}
2391	putc ('\n', stderr);
2392	#endif
2393	}
2394
2395	/* For each follow set that is the follow set of a real position, replace
2396	it with its epsilon closure. */
2397	for (i = 0; i < d->tindex; ++i)
2398	if (d->tokens[i] < NOTCHAR \|\| d->tokens[i] == BACKREF
2399	#if MBS_SUPPORT
2400	\|\| d->tokens[i] == ANYCHAR \|\| d->tokens[i] == MBCSET
2401	#endif
2402	\|\| d->tokens[i] >= CSET)
2403	{
2404	#ifdef DEBUG
2405	fprintf (stderr, "follows(%zd:", i);
2406	prtok (d->tokens[i]);
2407	fprintf (stderr, "):");
2408	for (j = d->follows[i].nelem; j-- > 0;)
2409	{
2410	fprintf (stderr, " %zd:", d->follows[i].elems[j].index);
2411	prtok (d->tokens[d->follows[i].elems[j].index]);
2412	}
2413	putc ('\n', stderr);
2414	#endif
2415	copy (&d->follows[i], &merged);
2416	epsclosure (&merged, d);
2417	copy (&merged, &d->follows[i]);
2418	}
2419
2420	/* Get the epsilon closure of the firstpos of the regexp. The result will
2421	be the set of positions of state 0. */
2422	merged.nelem = 0;
2423	for (i = 0; i < nfirstpos[-1]; ++i)
2424	insert (firstpos[i], &merged);
2425	epsclosure (&merged, d);
2426
2427	/* Build the initial state. */
2428	d->salloc = 1;
2429	d->sindex = 0;
2430	MALLOC (d->states, d->salloc);
2431
2432	separate_contexts = state_separate_contexts (&merged);
2433	state_index (d, &merged,
2434	(separate_contexts & CTX_NEWLINE
2435	? CTX_NEWLINE : separate_contexts ^ CTX_ANY));
2436
2437	free (o_nullable);
2438	free (o_nfirst);
2439	free (o_firstpos);
2440	free (o_nlast);
2441	free (o_lastpos);
2442	free (merged.elems);
2443	}
2444
2445
2446	/* Find, for each character, the transition out of state s of d, and store
2447	it in the appropriate slot of trans.
2448
2449	We divide the positions of s into groups (positions can appear in more
2450	than one group). Each group is labeled with a set of characters that
2451	every position in the group matches (taking into account, if necessary,
2452	preceding context information of s). For each group, find the union
2453	of the its elements' follows. This set is the set of positions of the
2454	new state. For each character in the group's label, set the transition
2455	on this character to be to a state corresponding to the set's positions,
2456	and its associated backward context information, if necessary.
2457
2458	If we are building a searching matcher, we include the positions of state
2459	0 in every state.
2460
2461	The collection of groups is constructed by building an equivalence-class
2462	partition of the positions of s.
2463
2464	For each position, find the set of characters C that it matches. Eliminate
2465	any characters from C that fail on grounds of backward context.
2466
2467	Search through the groups, looking for a group whose label L has nonempty
2468	intersection with C. If L - C is nonempty, create a new group labeled
2469	L - C and having the same positions as the current group, and set L to
2470	the intersection of L and C. Insert the position in this group, set
2471	C = C - L, and resume scanning.
2472
2473	If after comparing with every group there are characters remaining in C,
2474	create a new group labeled with the characters of C and insert this
2475	position in that group. */
2476	void
2477	dfastate (state_num s, struct dfa *d, state_num trans[])
2478	{
2479	leaf_set grps; / As many as will ever be needed. */
2480	charclass labels; / Labels corresponding to the groups. */
2481	size_t ngrps = 0; /* Number of groups actually used. */
2482	position pos; /* Current position being considered. */
2483	charclass matches; /* Set of matching characters. */
2484	int matchesf; /* True if matches is nonempty. */
2485	charclass intersect; /* Intersection with some label set. */
2486	int intersectf; /* True if intersect is nonempty. */
2487	charclass leftovers; /* Stuff in the label that didn't match. */
2488	int leftoversf; /* True if leftovers is nonempty. */
2489	position_set follows; /* Union of the follows of some group. */
2490	position_set tmp; /* Temporary space for merging sets. */
2491	int possible_contexts; /* Contexts that this group can match. */
2492	int separate_contexts; /* Context that new state wants to know. */
2493	state_num state; /* New state. */
2494	state_num state_newline; /* New state on a newline transition. */
2495	state_num state_letter; /* New state on a letter transition. */
2496	int next_isnt_1st_byte = 0; /* Flag if we can't add state0. */
2497	size_t i, j, k;
2498
2499	MALLOC (grps, NOTCHAR);
2500	MALLOC (labels, NOTCHAR);
2501
2502	zeroset (matches);
2503
2504	for (i = 0; i < d->states[s].elems.nelem; ++i)
2505	{
2506	pos = d->states[s].elems.elems[i];
2507	if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
2508	setbit (d->tokens[pos.index], matches);
2509	else if (d->tokens[pos.index] >= CSET)
2510	copyset (d->charclasses[d->tokens[pos.index] - CSET], matches);
2511	else if (MBS_SUPPORT
2512	&& (d->tokens[pos.index] == ANYCHAR
2513	\|\| d->tokens[pos.index] == MBCSET))
2514	/* MB_CUR_MAX > 1 */
2515	{
2516	/* ANYCHAR and MBCSET must match with a single character, so we
2517	must put it to d->states[s].mbps, which contains the positions
2518	which can match with a single character not a byte. */
2519	if (d->states[s].mbps.nelem == 0)
2520	alloc_position_set (&d->states[s].mbps, 1);
2521	insert (pos, &(d->states[s].mbps));
2522	continue;
2523	}
2524	else
2525	continue;
2526
2527	/* Some characters may need to be eliminated from matches because
2528	they fail in the current context. */
2529	if (pos.constraint != NO_CONSTRAINT)
2530	{
2531	if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
2532	d->states[s].context, CTX_NEWLINE))
2533	for (j = 0; j < CHARCLASS_INTS; ++j)
2534	matches[j] &= ~newline[j];
2535	if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
2536	d->states[s].context, CTX_LETTER))
2537	for (j = 0; j < CHARCLASS_INTS; ++j)
2538	matches[j] &= ~letters[j];
2539	if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
2540	d->states[s].context, CTX_NONE))
2541	for (j = 0; j < CHARCLASS_INTS; ++j)
2542	matches[j] &= letters[j] \| newline[j];
2543
2544	/* If there are no characters left, there's no point in going on. */
2545	for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
2546	continue;
2547	if (j == CHARCLASS_INTS)
2548	continue;
2549	}
2550
2551	for (j = 0; j < ngrps; ++j)
2552	{
2553	/* If matches contains a single character only, and the current
2554	group's label doesn't contain that character, go on to the
2555	next group. */
2556	if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
2557	&& !tstbit (d->tokens[pos.index], labels[j]))
2558	continue;
2559
2560	/* Check if this group's label has a nonempty intersection with
2561	matches. */
2562	intersectf = 0;
2563	for (k = 0; k < CHARCLASS_INTS; ++k)
2564	(intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
2565	if (!intersectf)
2566	continue;
2567
2568	/* It does; now find the set differences both ways. */
2569	leftoversf = matchesf = 0;
2570	for (k = 0; k < CHARCLASS_INTS; ++k)
2571	{
2572	/* Even an optimizing compiler can't know this for sure. */
2573	int match = matches[k], label = labels[j][k];
2574
2575	(leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
2576	(matches[k] = match & ~label) ? (matchesf = 1) : 0;
2577	}
2578
2579	/* If there were leftovers, create a new group labeled with them. */
2580	if (leftoversf)
2581	{
2582	copyset (leftovers, labels[ngrps]);
2583	copyset (intersect, labels[j]);
2584	MALLOC (grps[ngrps].elems, d->nleaves);
2585	memcpy (grps[ngrps].elems, grps[j].elems,
2586	sizeof (grps[j].elems[0]) * grps[j].nelem);
2587	grps[ngrps].nelem = grps[j].nelem;
2588	++ngrps;
2589	}
2590
2591	/* Put the position in the current group. The constraint is
2592	irrelevant here. */
2593	grps[j].elems[grps[j].nelem++] = pos.index;
2594
2595	/* If every character matching the current position has been
2596	accounted for, we're done. */
2597	if (!matchesf)
2598	break;
2599	}
2600
2601	/* If we've passed the last group, and there are still characters
2602	unaccounted for, then we'll have to create a new group. */
2603	if (j == ngrps)
2604	{
2605	copyset (matches, labels[ngrps]);
2606	zeroset (matches);
2607	MALLOC (grps[ngrps].elems, d->nleaves);
2608	grps[ngrps].nelem = 1;
2609	grps[ngrps].elems[0] = pos.index;
2610	++ngrps;
2611	}
2612	}
2613
2614	alloc_position_set (&follows, d->nleaves);
2615	alloc_position_set (&tmp, d->nleaves);
2616
2617	/* If we are a searching matcher, the default transition is to a state
2618	containing the positions of state 0, otherwise the default transition
2619	is to fail miserably. */
2620	if (d->searchflag)
2621	{
2622	/* Find the state(s) corresponding to the positions of state 0. */
2623	copy (&d->states[0].elems, &follows);
2624	separate_contexts = state_separate_contexts (&follows);
2625	state = state_index (d, &follows, separate_contexts ^ CTX_ANY);
2626	if (separate_contexts & CTX_NEWLINE)
2627	state_newline = state_index (d, &follows, CTX_NEWLINE);
2628	else
2629	state_newline = state;
2630	if (separate_contexts & CTX_LETTER)
2631	state_letter = state_index (d, &follows, CTX_LETTER);
2632	else
2633	state_letter = state;
2634
2635	for (i = 0; i < NOTCHAR; ++i)
2636	trans[i] = (IS_WORD_CONSTITUENT (i)) ? state_letter : state;
2637	trans[eolbyte] = state_newline;
2638	}
2639	else
2640	for (i = 0; i < NOTCHAR; ++i)
2641	trans[i] = -1;
2642
2643	for (i = 0; i < ngrps; ++i)
2644	{
2645	follows.nelem = 0;
2646
2647	/* Find the union of the follows of the positions of the group.
2648	This is a hideously inefficient loop. Fix it someday. */
2649	for (j = 0; j < grps[i].nelem; ++j)
2650	for (k = 0; k < d->follows[grps[i].elems[j]].nelem; ++k)
2651	insert (d->follows[grps[i].elems[j]].elems[k], &follows);
2652
2653	if (d->mb_cur_max > 1)
2654	{
2655	/* If a token in follows.elems is not 1st byte of a multibyte
2656	character, or the states of follows must accept the bytes
2657	which are not 1st byte of the multibyte character.
2658	Then, if a state of follows encounter a byte, it must not be
2659	a 1st byte of a multibyte character nor single byte character.
2660	We cansel to add state[0].follows to next state, because
2661	state[0] must accept 1st-byte
2662
2663	For example, we assume <sb a> is a certain single byte
2664	character, <mb A> is a certain multibyte character, and the
2665	codepoint of <sb a> equals the 2nd byte of the codepoint of
2666	<mb A>.
2667	When state[0] accepts <sb a>, state[i] transit to state[i+1]
2668	by accepting accepts 1st byte of <mb A>, and state[i+1]
2669	accepts 2nd byte of <mb A>, if state[i+1] encounter the
2670	codepoint of <sb a>, it must not be <sb a> but 2nd byte of
2671	<mb A>, so we cannot add state[0]. */
2672
2673	next_isnt_1st_byte = 0;
2674	for (j = 0; j < follows.nelem; ++j)
2675	{
2676	if (!(d->multibyte_prop[follows.elems[j].index] & 1))
2677	{
2678	next_isnt_1st_byte = 1;
2679	break;
2680	}
2681	}
2682	}
2683
2684	/* If we are building a searching matcher, throw in the positions
2685	of state 0 as well. */
2686	if (d->searchflag
2687	&& (!MBS_SUPPORT \|\| (d->mb_cur_max == 1 \|\| !next_isnt_1st_byte)))
2688	for (j = 0; j < d->states[0].elems.nelem; ++j)
2689	insert (d->states[0].elems.elems[j], &follows);
2690
2691	/* Find out if the new state will want any context information. */
2692	possible_contexts = charclass_context (labels[i]);
2693	separate_contexts = state_separate_contexts (&follows);
2694
2695	/* Find the state(s) corresponding to the union of the follows. */
2696	if ((separate_contexts & possible_contexts) != possible_contexts)
2697	state = state_index (d, &follows, separate_contexts ^ CTX_ANY);
2698	else
2699	state = -1;
2700	if (separate_contexts & possible_contexts & CTX_NEWLINE)
2701	state_newline = state_index (d, &follows, CTX_NEWLINE);
2702	else
2703	state_newline = state;
2704	if (separate_contexts & possible_contexts & CTX_LETTER)
2705	state_letter = state_index (d, &follows, CTX_LETTER);
2706	else
2707	state_letter = state;
2708
2709	/* Set the transitions for each character in the current label. */
2710	for (j = 0; j < CHARCLASS_INTS; ++j)
2711	for (k = 0; k < INTBITS; ++k)
2712	if (labels[i][j] & 1 << k)
2713	{
2714	int c = j * INTBITS + k;
2715
2716	if (c == eolbyte)
2717	trans[c] = state_newline;
2718	else if (IS_WORD_CONSTITUENT (c))
2719	trans[c] = state_letter;
2720	else if (c < NOTCHAR)
2721	trans[c] = state;
2722	}
2723	}
2724
2725	for (i = 0; i < ngrps; ++i)
2726	free (grps[i].elems);
2727	free (follows.elems);
2728	free (tmp.elems);
2729	free (grps);
2730	free (labels);
2731	}
2732
2733	/* Some routines for manipulating a compiled dfa's transition tables.
2734	Each state may or may not have a transition table; if it does, and it
2735	is a non-accepting state, then d->trans[state] points to its table.
2736	If it is an accepting state then d->fails[state] points to its table.
2737	If it has no table at all, then d->trans[state] is NULL.
2738	TODO: Improve this comment, get rid of the unnecessary redundancy. */
2739
2740	static void
2741	build_state (state_num s, struct dfa *d)
2742	{
2743	state_num trans; / The new transition table. */
2744	state_num i;
2745
2746	/* Set an upper limit on the number of transition tables that will ever
2747	exist at once. 1024 is arbitrary. The idea is that the frequently
2748	used transition tables will be quickly rebuilt, whereas the ones that
2749	were only needed once or twice will be cleared away. */
2750	if (d->trcount >= 1024)
2751	{
2752	for (i = 0; i < d->tralloc; ++i)
2753	{
2754	free (d->trans[i]);
2755	free (d->fails[i]);
2756	d->trans[i] = d->fails[i] = NULL;
2757	}
2758	d->trcount = 0;
2759	}
2760
2761	++d->trcount;
2762
2763	/* Set up the success bits for this state. */
2764	d->success[s] = 0;
2765	if (ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_NEWLINE, s, *d))
2766	d->success[s] \|= CTX_NEWLINE;
2767	if (ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_LETTER, s, *d))
2768	d->success[s] \|= CTX_LETTER;
2769	if (ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_NONE, s, *d))
2770	d->success[s] \|= CTX_NONE;
2771
2772	MALLOC (trans, NOTCHAR);
2773	dfastate (s, d, trans);
2774
2775	/* Now go through the new transition table, and make sure that the trans
2776	and fail arrays are allocated large enough to hold a pointer for the
2777	largest state mentioned in the table. */
2778	for (i = 0; i < NOTCHAR; ++i)
2779	if (trans[i] >= d->tralloc)
2780	{
2781	state_num oldalloc = d->tralloc;
2782
2783	while (trans[i] >= d->tralloc)
2784	d->tralloc *= 2;
2785	REALLOC (d->realtrans, d->tralloc + 1);
2786	d->trans = d->realtrans + 1;
2787	REALLOC (d->fails, d->tralloc);
2788	REALLOC (d->success, d->tralloc);
2789	REALLOC (d->newlines, d->tralloc);
2790	while (oldalloc < d->tralloc)
2791	{
2792	d->trans[oldalloc] = NULL;
2793	d->fails[oldalloc++] = NULL;
2794	}
2795	}
2796
2797	/* Keep the newline transition in a special place so we can use it as
2798	a sentinel. */
2799	d->newlines[s] = trans[eolbyte];
2800	trans[eolbyte] = -1;
2801
2802	if (ACCEPTING (s, *d))
2803	d->fails[s] = trans;
2804	else
2805	d->trans[s] = trans;
2806	}
2807
2808	static void
2809	build_state_zero (struct dfa *d)
2810	{
2811	d->tralloc = 1;
2812	d->trcount = 0;
2813	CALLOC (d->realtrans, d->tralloc + 1);
2814	d->trans = d->realtrans + 1;
2815	CALLOC (d->fails, d->tralloc);
2816	MALLOC (d->success, d->tralloc);
2817	MALLOC (d->newlines, d->tralloc);
2818	build_state (0, d);
2819	}
2820
2821	/* Multibyte character handling sub-routines for dfaexec. */
2822
2823	/* Initial state may encounter the byte which is not a single byte character
2824	nor 1st byte of a multibyte character. But it is incorrect for initial
2825	state to accept such a byte.
2826	For example, in sjis encoding the regular expression like "\\" accepts
2827	the codepoint 0x5c, but should not accept the 2nd byte of the codepoint
2828	0x815c. Then Initial state must skip the bytes which are not a single byte
2829	character nor 1st byte of a multibyte character. */
2830	#define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p) \
2831	if (s == 0) \
2832	{ \
2833	while (inputwcs[p - buf_begin] == 0 \
2834	&& mblen_buf[p - buf_begin] > 0 \
2835	&& (unsigned char const *) p < buf_end) \
2836	++p; \
2837	if ((char *) p >= end) \
2838	{ \
2839	free (mblen_buf); \
2840	free (inputwcs); \
2841	*end = saved_end; \
2842	return NULL; \
2843	} \
2844	}
2845
2846	static void
2847	realloc_trans_if_necessary (struct dfa *d, state_num new_state)
2848	{
2849	/* Make sure that the trans and fail arrays are allocated large enough
2850	to hold a pointer for the new state. */
2851	if (new_state >= d->tralloc)
2852	{
2853	state_num oldalloc = d->tralloc;
2854
2855	while (new_state >= d->tralloc)
2856	d->tralloc *= 2;
2857	REALLOC (d->realtrans, d->tralloc + 1);
2858	d->trans = d->realtrans + 1;
2859	REALLOC (d->fails, d->tralloc);
2860	REALLOC (d->success, d->tralloc);
2861	REALLOC (d->newlines, d->tralloc);
2862	while (oldalloc < d->tralloc)
2863	{
2864	d->trans[oldalloc] = NULL;
2865	d->fails[oldalloc++] = NULL;
2866	}
2867	}
2868	}
2869
2870	/* Return values of transit_state_singlebyte(), and
2871	transit_state_consume_1char. */
2872	typedef enum
2873	{
2874	TRANSIT_STATE_IN_PROGRESS, /* State transition has not finished. */
2875	TRANSIT_STATE_DONE, /* State transition has finished. */
2876	TRANSIT_STATE_END_BUFFER /* Reach the end of the buffer. */
2877	} status_transit_state;
2878
2879	/* Consume a single byte and transit state from 's' to '*next_state'.
2880	This function is almost same as the state transition routin in dfaexec().
2881	But state transition is done just once, otherwise matching succeed or
2882	reach the end of the buffer. */
2883	static status_transit_state
2884	transit_state_singlebyte (struct dfa d, state_num s, unsigned char const p,
2885	state_num * next_state)
2886	{
2887	state_num *t;
2888	state_num works = s;
2889
2890	status_transit_state rval = TRANSIT_STATE_IN_PROGRESS;
2891
2892	while (rval == TRANSIT_STATE_IN_PROGRESS)
2893	{
2894	if ((t = d->trans[works]) != NULL)
2895	{
2896	works = t[*p];
2897	rval = TRANSIT_STATE_DONE;
2898	if (works < 0)
2899	works = 0;
2900	}
2901	else if (works < 0)
2902	{
2903	if (p == buf_end)
2904	{
2905	/* At the moment, it must not happen. */
2906	abort ();
2907	}
2908	works = 0;
2909	}
2910	else if (d->fails[works])
2911	{
2912	works = d->fails[works][*p];
2913	rval = TRANSIT_STATE_DONE;
2914	}
2915	else
2916	{
2917	build_state (works, d);
2918	}
2919	}
2920	*next_state = works;
2921	return rval;
2922	}
2923
2924	/* Match a "." against the current context. buf_begin[IDX] is the
2925	current position. Return the length of the match, in bytes.
2926	POS is the position of the ".". */
2927	static int
2928	match_anychar (struct dfa *d, state_num s, position pos, size_t idx)
2929	{
2930	int context;
2931	wchar_t wc;
2932	int mbclen;
2933
2934	wc = inputwcs[idx];
2935	mbclen = (mblen_buf[idx] == 0) ? 1 : mblen_buf[idx];
2936
2937	/* Check syntax bits. */
2938	if (wc == (wchar_t) eolbyte)
2939	{
2940	if (!(syntax_bits & RE_DOT_NEWLINE))
2941	return 0;
2942	}
2943	else if (wc == (wchar_t) '\0')
2944	{
2945	if (syntax_bits & RE_DOT_NOT_NULL)
2946	return 0;
2947	}
2948
2949	context = wchar_context (wc);
2950	if (!SUCCEEDS_IN_CONTEXT (pos.constraint, d->states[s].context, context))
2951	return 0;
2952
2953	return mbclen;
2954	}
2955
2956	/* Match a bracket expression against the current context.
2957	buf_begin[IDX] is the current position.
2958	Return the length of the match, in bytes.
2959	POS is the position of the bracket expression. */
2960	static int
2961	match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx)
2962	{
2963	size_t i;
2964	int match; /* Flag which represent that matching succeed. */
2965	int match_len; /* Length of the character (or collating element)
2966	with which this operator match. */
2967	int op_len; /* Length of the operator. */
2968	char buffer[128];
2969	wchar_t wcbuf[6];
2970
2971	/* Pointer to the structure to which we are currently referring. */
2972	struct mb_char_classes *work_mbc;
2973
2974	int context;
2975	wchar_t wc; /* Current referring character. */
2976
2977	wc = inputwcs[idx];
2978
2979	/* Check syntax bits. */
2980	if (wc == (wchar_t) eolbyte)
2981	{
2982	if (!(syntax_bits & RE_DOT_NEWLINE))
2983	return 0;
2984	}
2985	else if (wc == (wchar_t) '\0')
2986	{
2987	if (syntax_bits & RE_DOT_NOT_NULL)
2988	return 0;
2989	}
2990
2991	context = wchar_context (wc);
2992	if (!SUCCEEDS_IN_CONTEXT (pos.constraint, d->states[s].context, context))
2993	return 0;
2994
2995	/* Assign the current referring operator to work_mbc. */
2996	work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]);
2997	match = !work_mbc->invert;
2998	match_len = (mblen_buf[idx] == 0) ? 1 : mblen_buf[idx];
2999
3000	/* Match in range 0-255? */
3001	if (wc < NOTCHAR && work_mbc->cset != -1
3002	&& tstbit ((unsigned char) wc, d->charclasses[work_mbc->cset]))
3003	goto charset_matched;
3004
3005	/* match with a character class? */
3006	for (i = 0; i < work_mbc->nch_classes; i++)
3007	{
3008	if (iswctype ((wint_t) wc, work_mbc->ch_classes[i]))
3009	goto charset_matched;
3010	}
3011
3012	strncpy (buffer, (char const *) buf_begin + idx, match_len);
3013	buffer[match_len] = '\0';
3014
3015	/* match with an equivalence class? */
3016	for (i = 0; i < work_mbc->nequivs; i++)
3017	{
3018	op_len = strlen (work_mbc->equivs[i]);
3019	strncpy (buffer, (char const *) buf_begin + idx, op_len);
3020	buffer[op_len] = '\0';
3021	if (strcoll (work_mbc->equivs[i], buffer) == 0)
3022	{
3023	match_len = op_len;
3024	goto charset_matched;
3025	}
3026	}
3027
3028	/* match with a collating element? */
3029	for (i = 0; i < work_mbc->ncoll_elems; i++)
3030	{
3031	op_len = strlen (work_mbc->coll_elems[i]);
3032	strncpy (buffer, (char const *) buf_begin + idx, op_len);
3033	buffer[op_len] = '\0';
3034
3035	if (strcoll (work_mbc->coll_elems[i], buffer) == 0)
3036	{
3037	match_len = op_len;
3038	goto charset_matched;
3039	}
3040	}
3041
3042	wcbuf[0] = wc;
3043	wcbuf[1] = wcbuf[3] = wcbuf[5] = '\0';
3044
3045	/* match with a range? */
3046	for (i = 0; i < work_mbc->nranges; i++)
3047	{
3048	wcbuf[2] = work_mbc->range_sts[i];
3049	wcbuf[4] = work_mbc->range_ends[i];
3050
3051	if (wcscoll (wcbuf, wcbuf + 2) >= 0 && wcscoll (wcbuf + 4, wcbuf) >= 0)
3052	goto charset_matched;
3053	}
3054
3055	/* match with a character? */
3056	for (i = 0; i < work_mbc->nchars; i++)
3057	{
3058	if (wc == work_mbc->chars[i])
3059	goto charset_matched;
3060	}
3061
3062	match = !match;
3063
3064	charset_matched:
3065	return match ? match_len : 0;
3066	}
3067
3068	/* Check each of `d->states[s].mbps.elem' can match or not. Then return the
3069	array which corresponds to `d->states[s].mbps.elem' and each element of
3070	the array contains the amount of the bytes with which the element can
3071	match.
3072	`idx' is the index from the buf_begin, and it is the current position
3073	in the buffer.
3074	Caller MUST free the array which this function return. */
3075	static int *
3076	check_matching_with_multibyte_ops (struct dfa *d, state_num s, size_t idx)
3077	{
3078	size_t i;
3079	int *rarray;
3080
3081	MALLOC (rarray, d->states[s].mbps.nelem);
3082	for (i = 0; i < d->states[s].mbps.nelem; ++i)
3083	{
3084	position pos = d->states[s].mbps.elems[i];
3085	switch (d->tokens[pos.index])
3086	{
3087	case ANYCHAR:
3088	rarray[i] = match_anychar (d, s, pos, idx);
3089	break;
3090	case MBCSET:
3091	rarray[i] = match_mb_charset (d, s, pos, idx);
3092	break;
3093	default:
3094	break; /* cannot happen. */
3095	}
3096	}
3097	return rarray;
3098	}
3099
3100	/* Consume a single character and enumerate all of the positions which can
3101	be next position from the state `s'.
3102	`match_lens' is the input. It can be NULL, but it can also be the output
3103	of check_matching_with_multibyte_ops() for optimization.
3104	`mbclen' and `pps' are the output. `mbclen' is the length of the
3105	character consumed, and `pps' is the set this function enumerate. */
3106	static status_transit_state
3107	transit_state_consume_1char (struct dfa *d, state_num s,
3108	unsigned char const **pp,
3109	int match_lens, int mbclen, position_set * pps)
3110	{
3111	size_t i, j;
3112	int k;
3113	state_num s1, s2;
3114	int *work_mbls;
3115	status_transit_state rs = TRANSIT_STATE_DONE;
3116
3117	/* Calculate the length of the (single/multi byte) character
3118	to which p points. */
3119	mbclen = (mblen_buf[pp - buf_begin] == 0) ? 1 : mblen_buf[*pp - buf_begin];
3120
3121	/* Calculate the state which can be reached from the state `s' by
3122	consuming `mbclen' single bytes from the buffer. /
3123	s1 = s;
3124	for (k = 0; k < *mbclen; k++)
3125	{
3126	s2 = s1;
3127	rs = transit_state_singlebyte (d, s2, (*pp)++, &s1);
3128	}
3129	/* Copy the positions contained by `s1' to the set `pps'. */
3130	copy (&(d->states[s1].elems), pps);
3131
3132	/* Check (input) match_lens, and initialize if it is NULL. */
3133	if (match_lens == NULL && d->states[s].mbps.nelem != 0)
3134	work_mbls = check_matching_with_multibyte_ops (d, s, *pp - buf_begin);
3135	else
3136	work_mbls = match_lens;
3137
3138	/* Add all of the positions which can be reached from `s' by consuming
3139	a single character. */
3140	for (i = 0; i < d->states[s].mbps.nelem; i++)
3141	{
3142	if (work_mbls[i] == *mbclen)
3143	for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
3144	j++)
3145	insert (d->follows[d->states[s].mbps.elems[i].index].elems[j], pps);
3146	}
3147
3148	if (match_lens == NULL && work_mbls != NULL)
3149	free (work_mbls);
3150
3151	/* FIXME: this return value is always ignored. */
3152	return rs;
3153	}
3154
3155	/* Transit state from s, then return new state and update the pointer of the
3156	buffer. This function is for some operator which can match with a multi-
3157	byte character or a collating element (which may be multi characters). */
3158	static state_num
3159	transit_state (struct dfa d, state_num s, unsigned char const *pp)
3160	{
3161	state_num s1;
3162	int mbclen; /* The length of current input multibyte character. */
3163	int maxlen = 0;
3164	size_t i, j;
3165	int *match_lens = NULL;
3166	size_t nelem = d->states[s].mbps.nelem; /* Just a alias. */
3167	position_set follows;
3168	unsigned char const p1 = pp;
3169	wchar_t wc;
3170
3171	if (nelem > 0)
3172	/* This state has (a) multibyte operator(s).
3173	We check whether each of them can match or not. */
3174	{
3175	/* Note: caller must free the return value of this function. */
3176	match_lens = check_matching_with_multibyte_ops (d, s, *pp - buf_begin);
3177
3178	for (i = 0; i < nelem; i++)
3179	/* Search the operator which match the longest string,
3180	in this state. */
3181	{
3182	if (match_lens[i] > maxlen)
3183	maxlen = match_lens[i];
3184	}
3185	}
3186
3187	if (nelem == 0 \|\| maxlen == 0)
3188	/* This state has no multibyte operator which can match.
3189	We need to check only one single byte character. */
3190	{
3191	status_transit_state rs;
3192	rs = transit_state_singlebyte (d, s, *pp, &s1);
3193
3194	/* We must update the pointer if state transition succeeded. */
3195	if (rs == TRANSIT_STATE_DONE)
3196	++*pp;
3197
3198	free (match_lens);
3199	return s1;
3200	}
3201
3202	/* This state has some operators which can match a multibyte character. */
3203	alloc_position_set (&follows, d->nleaves);
3204
3205	/* `maxlen' may be longer than the length of a character, because it may
3206	not be a character but a (multi character) collating element.
3207	We enumerate all of the positions which `s' can reach by consuming
3208	`maxlen' bytes. */
3209	transit_state_consume_1char (d, s, pp, match_lens, &mbclen, &follows);
3210
3211	wc = inputwcs[*pp - mbclen - buf_begin];
3212	s1 = state_index (d, &follows, wchar_context (wc));
3213	realloc_trans_if_necessary (d, s1);
3214
3215	while (*pp - p1 < maxlen)
3216	{
3217	transit_state_consume_1char (d, s1, pp, NULL, &mbclen, &follows);
3218
3219	for (i = 0; i < nelem; i++)
3220	{
3221	if (match_lens[i] == *pp - p1)
3222	for (j = 0;
3223	j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++)
3224	insert (d->follows[d->states[s1].mbps.elems[i].index].elems[j],
3225	&follows);
3226	}
3227
3228	wc = inputwcs[*pp - mbclen - buf_begin];
3229	s1 = state_index (d, &follows, wchar_context (wc));
3230	realloc_trans_if_necessary (d, s1);
3231	}
3232	free (match_lens);
3233	free (follows.elems);
3234	return s1;
3235	}
3236
3237
3238	/* Initialize mblen_buf and inputwcs with data from the next line. */
3239
3240	static void
3241	prepare_wc_buf (const char begin, const char end)
3242	{
3243	#if MBS_SUPPORT
3244	unsigned char eol = eolbyte;
3245	size_t remain_bytes, i;
3246
3247	buf_begin = (unsigned char *) begin;
3248
3249	remain_bytes = 0;
3250	for (i = 0; i < end - begin + 1; i++)
3251	{
3252	if (remain_bytes == 0)
3253	{
3254	remain_bytes
3255	= mbrtowc (inputwcs + i, begin + i, end - begin - i + 1, &mbs);
3256	if (remain_bytes < 1
3257	\|\| remain_bytes == (size_t) -1
3258	\|\| remain_bytes == (size_t) -2
3259	\|\| (remain_bytes == 1 && inputwcs[i] == (wchar_t) begin[i]))
3260	{
3261	remain_bytes = 0;
3262	inputwcs[i] = (wchar_t) begin[i];
3263	mblen_buf[i] = 0;
3264	if (begin[i] == eol)
3265	break;
3266	}
3267	else
3268	{
3269	mblen_buf[i] = remain_bytes;
3270	remain_bytes--;
3271	}
3272	}
3273	else
3274	{
3275	mblen_buf[i] = remain_bytes;
3276	inputwcs[i] = 0;
3277	remain_bytes--;
3278	}
3279	}
3280
3281	buf_end = (unsigned char *) (begin + i);
3282	mblen_buf[i] = 0;
3283	inputwcs[i] = 0; /* sentinel */
3284	#endif /* MBS_SUPPORT */
3285	}
3286
3287	/* Search through a buffer looking for a match to the given struct dfa.
3288	Find the first occurrence of a string matching the regexp in the
3289	buffer, and the shortest possible version thereof. Return a pointer to
3290	the first character after the match, or NULL if none is found. BEGIN
3291	points to the beginning of the buffer, and END points to the first byte
3292	after its end. Note however that we store a sentinel byte (usually
3293	newline) in *END, so the actual buffer must be one byte longer.
3294	When ALLOW_NL is nonzero, newlines may appear in the matching string.
3295	If COUNT is non-NULL, increment *COUNT once for each newline processed.
3296	Finally, if BACKREF is non-NULL set *BACKREF to indicate whether we
3297	encountered a back-reference (1) or not (0). The caller may use this
3298	to decide whether to fall back on a backtracking matcher. */
3299	char *
3300	dfaexec (struct dfa d, char const begin, char *end,
3301	int allow_nl, size_t count, int backref)
3302	{
3303	state_num s, s1; /* Current state. */
3304	unsigned char const p; / Current input character. */
3305	state_num *trans, t; /* Copy of d->trans so it can be optimized
3306	into a register. */
3307	unsigned char eol = eolbyte; /* Likewise for eolbyte. */
3308	unsigned char saved_end;
3309
3310	if (!d->tralloc)
3311	build_state_zero (d);
3312
3313	s = s1 = 0;
3314	p = (unsigned char const *) begin;
3315	trans = d->trans;
3316	saved_end = (unsigned char ) end;
3317	*end = eol;
3318
3319	if (d->mb_cur_max > 1)
3320	{
3321	MALLOC (mblen_buf, end - begin + 2);
3322	MALLOC (inputwcs, end - begin + 2);
3323	memset (&mbs, 0, sizeof (mbstate_t));
3324	prepare_wc_buf ((const char *) p, end);
3325	}
3326
3327	for (;;)
3328	{
3329	if (d->mb_cur_max > 1)
3330	while ((t = trans[s]) != NULL)
3331	{
3332	if (p > buf_end)
3333	break;
3334	s1 = s;
3335	SKIP_REMAINS_MB_IF_INITIAL_STATE (s, p);
3336
3337	if (d->states[s].mbps.nelem == 0)
3338	{
3339	s = t[*p++];
3340	continue;
3341	}
3342
3343	/* Falling back to the glibc matcher in this case gives
3344	better performance (up to 25% better on [a-z], for
3345	example) and enables support for collating symbols and
3346	equivalence classes. */
3347	if (backref)
3348	{
3349	*backref = 1;
3350	free (mblen_buf);
3351	free (inputwcs);
3352	*end = saved_end;
3353	return (char *) p;
3354	}
3355
3356	/* Can match with a multibyte character (and multi character
3357	collating element). Transition table might be updated. */
3358	s = transit_state (d, s, &p);
3359	trans = d->trans;
3360	}
3361	else
3362	{
3363	while ((t = trans[s]) != NULL)
3364	{
3365	s1 = t[*p++];
3366	if ((t = trans[s1]) == NULL)
3367	{
3368	state_num tmp = s;
3369	s = s1;
3370	s1 = tmp; /* swap */
3371	break;
3372	}
3373	s = t[*p++];
3374	}
3375	}
3376
3377	if (s >= 0 && (char *) p <= end && d->fails[s])
3378	{
3379	if (d->success[s] & sbit[*p])
3380	{
3381	if (backref)
3382	*backref = (d->states[s].backref != 0);
3383	if (d->mb_cur_max > 1)
3384	{
3385	free (mblen_buf);
3386	free (inputwcs);
3387	}
3388	*end = saved_end;
3389	return (char *) p;
3390	}
3391
3392	s1 = s;
3393	if (d->mb_cur_max > 1)
3394	{
3395	/* Can match with a multibyte character (and multicharacter
3396	collating element). Transition table might be updated. */
3397	s = transit_state (d, s, &p);
3398	trans = d->trans;
3399	}
3400	else
3401	s = d->fails[s][*p++];
3402	continue;
3403	}
3404
3405	/* If the previous character was a newline, count it. */
3406	if ((char *) p <= end && p[-1] == eol)
3407	{
3408	if (count)
3409	++*count;
3410
3411	if (d->mb_cur_max > 1)
3412	prepare_wc_buf ((const char *) p, end);
3413	}
3414
3415	/* Check if we've run off the end of the buffer. */
3416	if ((char *) p > end)
3417	{
3418	if (d->mb_cur_max > 1)
3419	{
3420	free (mblen_buf);
3421	free (inputwcs);
3422	}
3423	*end = saved_end;
3424	return NULL;
3425	}
3426
3427	if (s >= 0)
3428	{
3429	build_state (s, d);
3430	trans = d->trans;
3431	continue;
3432	}
3433
3434	if (p[-1] == eol && allow_nl)
3435	{
3436	s = d->newlines[s1];
3437	continue;
3438	}
3439
3440	s = 0;
3441	}
3442	}
3443
3444	static void
3445	free_mbdata (struct dfa *d)
3446	{
3447	size_t i;
3448
3449	free (d->multibyte_prop);
3450	d->multibyte_prop = NULL;
3451
3452	for (i = 0; i < d->nmbcsets; ++i)
3453	{
3454	size_t j;
3455	struct mb_char_classes *p = &(d->mbcsets[i]);
3456	free (p->chars);
3457	free (p->ch_classes);
3458	free (p->range_sts);
3459	free (p->range_ends);
3460
3461	for (j = 0; j < p->nequivs; ++j)
3462	free (p->equivs[j]);
3463	free (p->equivs);
3464
3465	for (j = 0; j < p->ncoll_elems; ++j)
3466	free (p->coll_elems[j]);
3467	free (p->coll_elems);
3468	}
3469
3470	free (d->mbcsets);
3471	d->mbcsets = NULL;
3472	d->nmbcsets = 0;
3473	}
3474
3475	/* Initialize the components of a dfa that the other routines don't
3476	initialize for themselves. */
3477	void
3478	dfainit (struct dfa *d)
3479	{
3480	memset (d, 0, sizeof *d);
3481
3482	d->calloc = 1;
3483	MALLOC (d->charclasses, d->calloc);
3484
3485	d->talloc = 1;
3486	MALLOC (d->tokens, d->talloc);
3487
3488	d->mb_cur_max = MB_CUR_MAX;
3489
3490	if (d->mb_cur_max > 1)
3491	{
3492	d->nmultibyte_prop = 1;
3493	MALLOC (d->multibyte_prop, d->nmultibyte_prop);
3494	d->mbcsets_alloc = 1;
3495	MALLOC (d->mbcsets, d->mbcsets_alloc);
3496	}
3497	}
3498
3499	static void
3500	dfaoptimize (struct dfa *d)
3501	{
3502	size_t i;
3503
3504	if (!MBS_SUPPORT \|\| !using_utf8 ())
3505	return;
3506
3507	for (i = 0; i < d->tindex; ++i)
3508	{
3509	switch (d->tokens[i])
3510	{
3511	case ANYCHAR:
3512	/* Lowered. */
3513	abort ();
3514	case MBCSET:
3515	/* Requires multi-byte algorithm. */
3516	return;
3517	default:
3518	break;
3519	}
3520	}
3521
3522	free_mbdata (d);
3523	d->mb_cur_max = 1;
3524	}
3525
3526	/* Parse and analyze a single string of the given length. */
3527	void
3528	dfacomp (char const s, size_t len, struct dfa d, int searchflag)
3529	{
3530	dfainit (d);
3531	dfaparse (s, len, d);
3532	dfamust (d);
3533	dfaoptimize (d);
3534	dfaanalyze (d, searchflag);
3535	}
3536
3537	/* Free the storage held by the components of a dfa. */
3538	void
3539	dfafree (struct dfa *d)
3540	{
3541	size_t i;
3542	struct dfamust dm, ndm;
3543
3544	free (d->charclasses);
3545	free (d->tokens);
3546
3547	if (d->mb_cur_max > 1)
3548	free_mbdata (d);
3549
3550	for (i = 0; i < d->sindex; ++i)
3551	{
3552	free (d->states[i].elems.elems);
3553	if (MBS_SUPPORT)
3554	free (d->states[i].mbps.elems);
3555	}
3556	free (d->states);
3557	for (i = 0; i < d->tindex; ++i)
3558	free (d->follows[i].elems);
3559	free (d->follows);
3560	for (i = 0; i < d->tralloc; ++i)
3561	{
3562	free (d->trans[i]);
3563	free (d->fails[i]);
3564	}
3565	free (d->realtrans);
3566	free (d->fails);
3567	free (d->newlines);
3568	free (d->success);
3569	for (dm = d->musts; dm; dm = ndm)
3570	{
3571	ndm = dm->next;
3572	free (dm->must);
3573	free (dm);
3574	}
3575	}
3576
3577	/* Having found the postfix representation of the regular expression,
3578	try to find a long sequence of characters that must appear in any line
3579	containing the r.e.
3580	Finding a "longest" sequence is beyond the scope here;
3581	we take an easy way out and hope for the best.
3582	(Take "(ab\|a)b"--please.)
3583
3584	We do a bottom-up calculation of sequences of characters that must appear
3585	in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3586	representation:
3587	sequences that must appear at the left of the match ("left")
3588	sequences that must appear at the right of the match ("right")
3589	lists of sequences that must appear somewhere in the match ("in")
3590	sequences that must constitute the match ("is")
3591
3592	When we get to the root of the tree, we use one of the longest of its
3593	calculated "in" sequences as our answer. The sequence we find is returned in
3594	d->must (where "d" is the single argument passed to "dfamust");
3595	the length of the sequence is returned in d->mustn.
3596
3597	The sequences calculated for the various types of node (in pseudo ANSI c)
3598	are shown below. "p" is the operand of unary operators (and the left-hand
3599	operand of binary operators); "q" is the right-hand operand of binary
3600	operators.
3601
3602	"ZERO" means "a zero-length sequence" below.
3603
3604	Type left right is in
3605	---- ---- ----- -- --
3606	char c # c # c # c # c
3607
3608	ANYCHAR ZERO ZERO ZERO ZERO
3609
3610	MBCSET ZERO ZERO ZERO ZERO
3611
3612	CSET ZERO ZERO ZERO ZERO
3613
3614	STAR ZERO ZERO ZERO ZERO
3615
3616	QMARK ZERO ZERO ZERO ZERO
3617
3618	PLUS p->left p->right ZERO p->in
3619
3620	CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
3621	p->left : q->right : q->is!=ZERO) ? q->in plus
3622	p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
3623	ZERO
3624
3625	OR longest common longest common (do p->is and substrings common to
3626	leading trailing q->is have same p->in and q->in
3627	(sub)sequence (sub)sequence length and
3628	of p->left of p->right content) ?
3629	and q->left and q->right p->is : NULL
3630
3631	If there's anything else we recognize in the tree, all four sequences get set
3632	to zero-length sequences. If there's something we don't recognize in the tree,
3633	we just return a zero-length sequence.
3634
3635	Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3636	'aaa')?
3637
3638	And. . .is it here or someplace that we might ponder "optimizations" such as
3639	egrep 'psi\|epsilon' -> egrep 'psi'
3640	egrep 'pepsi\|epsilon' -> egrep 'epsi'
3641	(Yes, we now find "epsi" as a "string
3642	that must occur", but we might also
3643	simplify the entire r.e. being sought)
3644	grep '[c]' -> grep 'c'
3645	grep '(ab\|a)b' -> grep 'ab'
3646	grep 'ab*' -> grep 'a'
3647	grep 'a*b' -> grep 'b'
3648
3649	There are several issues:
3650
3651	Is optimization easy (enough)?
3652
3653	Does optimization actually accomplish anything,
3654	or is the automaton you get from "psi\|epsilon" (for example)
3655	the same as the one you get from "psi" (for example)?
3656
3657	Are optimizable r.e.'s likely to be used in real-life situations
3658	(something like 'ab*' is probably unlikely; something like is
3659	'psi\|epsilon' is likelier)? */
3660
3661	static char *
3662	icatalloc (char old, char const new)
3663	{
3664	char *result;
3665	size_t oldsize = old == NULL ? 0 : strlen (old);
3666	size_t newsize = new == NULL ? 0 : strlen (new);
3667	if (newsize == 0)
3668	return old;
3669	result = xrealloc (old, oldsize + newsize + 1);
3670	memcpy (result + oldsize, new, newsize + 1);
3671	return result;
3672	}
3673
3674	static char *
3675	icpyalloc (char const *string)
3676	{
3677	return icatalloc (NULL, string);
3678	}
3679
3680	static char *_GL_ATTRIBUTE_PURE
3681	istrstr (char const lookin, char const lookfor)
3682	{
3683	char const *cp;
3684	size_t len;
3685
3686	len = strlen (lookfor);
3687	for (cp = lookin; *cp != '\0'; ++cp)
3688	if (strncmp (cp, lookfor, len) == 0)
3689	return (char *) cp;
3690	return NULL;
3691	}
3692
3693	static void
3694	freelist (char **cpp)
3695	{
3696	size_t i;
3697
3698	if (cpp == NULL)
3699	return;
3700	for (i = 0; cpp[i] != NULL; ++i)
3701	{
3702	free (cpp[i]);
3703	cpp[i] = NULL;
3704	}
3705	}
3706
3707	static char **
3708	enlist (char *cpp, char new, size_t len)
3709	{
3710	size_t i, j;
3711
3712	if (cpp == NULL)
3713	return NULL;
3714	if ((new = icpyalloc (new)) == NULL)
3715	{
3716	freelist (cpp);
3717	return NULL;
3718	}
3719	new[len] = '\0';
3720	/* Is there already something in the list that's new (or longer)? */
3721	for (i = 0; cpp[i] != NULL; ++i)
3722	if (istrstr (cpp[i], new) != NULL)
3723	{
3724	free (new);
3725	return cpp;
3726	}
3727	/* Eliminate any obsoleted strings. */
3728	j = 0;
3729	while (cpp[j] != NULL)
3730	if (istrstr (new, cpp[j]) == NULL)
3731	++j;
3732	else
3733	{
3734	free (cpp[j]);
3735	if (--i == j)
3736	break;
3737	cpp[j] = cpp[i];
3738	cpp[i] = NULL;
3739	}
3740	/* Add the new string. */
3741	REALLOC (cpp, i + 2);
3742	cpp[i] = new;
3743	cpp[i + 1] = NULL;
3744	return cpp;
3745	}
3746
3747	/* Given pointers to two strings, return a pointer to an allocated
3748	list of their distinct common substrings. Return NULL if something
3749	seems wild. */
3750	static char **
3751	comsubs (char left, char const right)
3752	{
3753	char **cpp;
3754	char *lcp;
3755	char *rcp;
3756	size_t i, len;
3757
3758	if (left == NULL \|\| right == NULL)
3759	return NULL;
3760	cpp = malloc (sizeof *cpp);
3761	if (cpp == NULL)
3762	return NULL;
3763	cpp[0] = NULL;
3764	for (lcp = left; *lcp != '\0'; ++lcp)
3765	{
3766	len = 0;
3767	rcp = strchr (right, *lcp);
3768	while (rcp != NULL)
3769	{
3770	for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3771	continue;
3772	if (i > len)
3773	len = i;
3774	rcp = strchr (rcp + 1, *lcp);
3775	}
3776	if (len == 0)
3777	continue;
3778	{
3779	char **p = enlist (cpp, lcp, len);
3780	if (p == NULL)
3781	{
3782	freelist (cpp);
3783	cpp = NULL;
3784	break;
3785	}
3786	cpp = p;
3787	}
3788	}
3789	return cpp;
3790	}
3791
3792	static char **
3793	addlists (char old, char new)
3794	{
3795	size_t i;
3796
3797	if (old == NULL \|\| new == NULL)
3798	return NULL;
3799	for (i = 0; new[i] != NULL; ++i)
3800	{
3801	old = enlist (old, new[i], strlen (new[i]));
3802	if (old == NULL)
3803	break;
3804	}
3805	return old;
3806	}
3807
3808	/* Given two lists of substrings, return a new list giving substrings
3809	common to both. */
3810	static char **
3811	inboth (char left, char right)
3812	{
3813	char **both;
3814	char **temp;
3815	size_t lnum, rnum;
3816
3817	if (left == NULL \|\| right == NULL)
3818	return NULL;
3819	both = malloc (sizeof *both);
3820	if (both == NULL)
3821	return NULL;
3822	both[0] = NULL;
3823	for (lnum = 0; left[lnum] != NULL; ++lnum)
3824	{
3825	for (rnum = 0; right[rnum] != NULL; ++rnum)
3826	{
3827	temp = comsubs (left[lnum], right[rnum]);
3828	if (temp == NULL)
3829	{
3830	freelist (both);
3831	return NULL;
3832	}
3833	both = addlists (both, temp);
3834	freelist (temp);
3835	free (temp);
3836	if (both == NULL)
3837	return NULL;
3838	}
3839	}
3840	return both;
3841	}
3842
3843	typedef struct
3844	{
3845	char **in;
3846	char *left;
3847	char *right;
3848	char *is;
3849	} must;
3850
3851	static void
3852	resetmust (must * mp)
3853	{
3854	mp->left[0] = mp->right[0] = mp->is[0] = '\0';
3855	freelist (mp->in);
3856	}
3857
3858	static void
3859	dfamust (struct dfa *d)
3860	{
3861	must *musts;
3862	must *mp;
3863	char *result;
3864	size_t ri;
3865	size_t i;
3866	int exact;
3867	token t;
3868	static must must0;
3869	struct dfamust *dm;
3870	static char empty_string[] = "";
3871
3872	result = empty_string;
3873	exact = 0;
3874	MALLOC (musts, d->tindex + 1);
3875	mp = musts;
3876	for (i = 0; i <= d->tindex; ++i)
3877	mp[i] = must0;
3878	for (i = 0; i <= d->tindex; ++i)
3879	{
3880	mp[i].in = xmalloc (sizeof *mp[i].in);
3881	mp[i].left = xmalloc (2);
3882	mp[i].right = xmalloc (2);
3883	mp[i].is = xmalloc (2);
3884	mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
3885	mp[i].in[0] = NULL;
3886	}
3887	#ifdef DEBUG
3888	fprintf (stderr, "dfamust:\n");
3889	for (i = 0; i < d->tindex; ++i)
3890	{
3891	fprintf (stderr, " %zd:", i);
3892	prtok (d->tokens[i]);
3893	}
3894	putc ('\n', stderr);
3895	#endif
3896	for (ri = 0; ri < d->tindex; ++ri)
3897	{
3898	switch (t = d->tokens[ri])
3899	{
3900	case LPAREN:
3901	case RPAREN:
3902	assert (!"neither LPAREN nor RPAREN may appear here");
3903	case EMPTY:
3904	case BEGLINE:
3905	case ENDLINE:
3906	case BEGWORD:
3907	case ENDWORD:
3908	case LIMWORD:
3909	case NOTLIMWORD:
3910	case BACKREF:
3911	resetmust (mp);
3912	break;
3913	case STAR:
3914	case QMARK:
3915	assert (musts < mp);
3916	--mp;
3917	resetmust (mp);
3918	break;
3919	case OR:
3920	assert (&musts[2] <= mp);
3921	{
3922	char **new;
3923	must *lmp;
3924	must *rmp;
3925	size_t j, ln, rn, n;
3926
3927	rmp = --mp;
3928	lmp = --mp;
3929	/* Guaranteed to be. Unlikely, but. . . */
3930	if (!STREQ (lmp->is, rmp->is))
3931	lmp->is[0] = '\0';
3932	/* Left side--easy */
3933	i = 0;
3934	while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
3935	++i;
3936	lmp->left[i] = '\0';
3937	/* Right side */
3938	ln = strlen (lmp->right);
3939	rn = strlen (rmp->right);
3940	n = ln;
3941	if (n > rn)
3942	n = rn;
3943	for (i = 0; i < n; ++i)
3944	if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
3945	break;
3946	for (j = 0; j < i; ++j)
3947	lmp->right[j] = lmp->right[(ln - i) + j];
3948	lmp->right[j] = '\0';
3949	new = inboth (lmp->in, rmp->in);
3950	if (new == NULL)
3951	goto done;
3952	freelist (lmp->in);
3953	free (lmp->in);
3954	lmp->in = new;
3955	}
3956	break;
3957	case PLUS:
3958	assert (musts < mp);
3959	--mp;
3960	mp->is[0] = '\0';
3961	break;
3962	case END:
3963	assert (mp == &musts[1]);
3964	for (i = 0; musts[0].in[i] != NULL; ++i)
3965	if (strlen (musts[0].in[i]) > strlen (result))
3966	result = musts[0].in[i];
3967	if (STREQ (result, musts[0].is))
3968	exact = 1;
3969	goto done;
3970	case CAT:
3971	assert (&musts[2] <= mp);
3972	{
3973	must *lmp;
3974	must *rmp;
3975
3976	rmp = --mp;
3977	lmp = --mp;
3978	/* In. Everything in left, plus everything in
3979	right, plus concatenation of
3980	left's right and right's left. */
3981	lmp->in = addlists (lmp->in, rmp->in);
3982	if (lmp->in == NULL)
3983	goto done;
3984	if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
3985	{
3986	char *tp;
3987
3988	tp = icpyalloc (lmp->right);
3989	tp = icatalloc (tp, rmp->left);
3990	lmp->in = enlist (lmp->in, tp, strlen (tp));
3991	free (tp);
3992	if (lmp->in == NULL)
3993	goto done;
3994	}
3995	/* Left-hand */
3996	if (lmp->is[0] != '\0')
3997	{
3998	lmp->left = icatalloc (lmp->left, rmp->left);
3999	if (lmp->left == NULL)
4000	goto done;
4001	}
4002	/* Right-hand */
4003	if (rmp->is[0] == '\0')
4004	lmp->right[0] = '\0';
4005	lmp->right = icatalloc (lmp->right, rmp->right);
4006	if (lmp->right == NULL)
4007	goto done;
4008	/* Guaranteed to be */
4009	if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
4010	{
4011	lmp->is = icatalloc (lmp->is, rmp->is);
4012	if (lmp->is == NULL)
4013	goto done;
4014	}
4015	else
4016	lmp->is[0] = '\0';
4017	}
4018	break;
4019	default:
4020	if (t < END)
4021	{
4022	assert (!"oops! t >= END");
4023	}
4024	else if (t == '\0')
4025	{
4026	/* not on my shift */
4027	goto done;
4028	}
4029	else if (t >= CSET \|\| !MBS_SUPPORT \|\| t == ANYCHAR \|\| t == MBCSET)
4030	{
4031	/* easy enough */
4032	resetmust (mp);
4033	}
4034	else
4035	{
4036	/* plain character */
4037	resetmust (mp);
4038	mp->is[0] = mp->left[0] = mp->right[0] = t;
4039	mp->is[1] = mp->left[1] = mp->right[1] = '\0';
4040	mp->in = enlist (mp->in, mp->is, (size_t) 1);
4041	if (mp->in == NULL)
4042	goto done;
4043	}
4044	break;
4045	}
4046	#ifdef DEBUG
4047	fprintf (stderr, " node: %zd:", ri);
4048	prtok (d->tokens[ri]);
4049	fprintf (stderr, "\n in:");
4050	for (i = 0; mp->in[i]; ++i)
4051	fprintf (stderr, " \"%s\"", mp->in[i]);
4052	fprintf (stderr, "\n is: \"%s\"\n", mp->is);
4053	fprintf (stderr, " left: \"%s\"\n", mp->left);
4054	fprintf (stderr, " right: \"%s\"\n", mp->right);
4055	#endif
4056	++mp;
4057	}
4058	done:
4059	if (strlen (result))
4060	{
4061	MALLOC (dm, 1);
4062	dm->exact = exact;
4063	dm->must = xmemdup (result, strlen (result) + 1);
4064	dm->next = d->musts;
4065	d->musts = dm;
4066	}
4067	mp = musts;
4068	for (i = 0; i <= d->tindex; ++i)
4069	{
4070	freelist (mp[i].in);
4071	free (mp[i].in);
4072	free (mp[i].left);
4073	free (mp[i].right);
4074	free (mp[i].is);
4075	}
4076	free (mp);
4077	}
4078
4079	struct dfa *
4080	dfaalloc (void)
4081	{
4082	return xmalloc (sizeof (struct dfa));
4083	}
4084
4085	struct dfamust *_GL_ATTRIBUTE_PURE
4086	dfamusts (struct dfa const *d)
4087	{
4088	return d->musts;
4089	}
4090
4091	/* vim:set shiftwidth=2: */

Note: See TracBrowser for help on using the repository browser.

source: kBuild/vendor/grep/2.12/src/dfa.c@ 3576

Download in other formats: