regex_internal.c@ 3576

Last change on this file since 3576 was 2595, checked in by bird, 13 years ago
gnu grep version 2.12 (grep-2.12.tar.xz, md5sum=8d2f0346d08b13c18afb81f0e8aa1e2f)
Property svn:eol-style set to `native`
File size: 47.6 KB

Line
1	/* Extended regular expression matching and search library.
2	Copyright (C) 2002-2012 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4	Contributed by Isamu Hasegawa <[email protected]>.
5
6	This program is free software; you can redistribute it and/or modify
7	it under the terms of the GNU General Public License as published by
8	the Free Software Foundation; either version 3, or (at your option)
9	any later version.
10
11	This program is distributed in the hope that it will be useful,
12	but WITHOUT ANY WARRANTY; without even the implied warranty of
13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	GNU General Public License for more details.
15
16	You should have received a copy of the GNU General Public License along
17	with this program; if not, see <http://www.gnu.org/licenses/>. */
18
19	#include "verify.h"
20	#include "intprops.h"
21	static void re_string_construct_common (const char *str, Idx len,
22	re_string_t *pstr,
23	RE_TRANSLATE_TYPE trans, bool icase,
24	const re_dfa_t *dfa) internal_function;
25	static re_dfastate_t create_ci_newstate (const re_dfa_t dfa,
26	const re_node_set *nodes,
27	re_hashval_t hash) internal_function;
28	static re_dfastate_t create_cd_newstate (const re_dfa_t dfa,
29	const re_node_set *nodes,
30	unsigned int context,
31	re_hashval_t hash) internal_function;
32
33
34	/* Functions for string operation. */
35
36	/* This function allocate the buffers. It is necessary to call
37	re_string_reconstruct before using the object. */
38
39	static reg_errcode_t
40	internal_function __attribute_warn_unused_result__
41	re_string_allocate (re_string_t pstr, const char str, Idx len, Idx init_len,
42	RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
43	{
44	reg_errcode_t ret;
45	Idx init_buf_len;
46
47	/* Ensure at least one character fits into the buffers. */
48	if (init_len < dfa->mb_cur_max)
49	init_len = dfa->mb_cur_max;
50	init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
51	re_string_construct_common (str, len, pstr, trans, icase, dfa);
52
53	ret = re_string_realloc_buffers (pstr, init_buf_len);
54	if (BE (ret != REG_NOERROR, 0))
55	return ret;
56
57	pstr->word_char = dfa->word_char;
58	pstr->word_ops_used = dfa->word_ops_used;
59	pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
60	pstr->valid_len = (pstr->mbs_allocated \|\| dfa->mb_cur_max > 1) ? 0 : len;
61	pstr->valid_raw_len = pstr->valid_len;
62	return REG_NOERROR;
63	}
64
65	/* This function allocate the buffers, and initialize them. */
66
67	static reg_errcode_t
68	internal_function __attribute_warn_unused_result__
69	re_string_construct (re_string_t pstr, const char str, Idx len,
70	RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
71	{
72	reg_errcode_t ret;
73	memset (pstr, '\0', sizeof (re_string_t));
74	re_string_construct_common (str, len, pstr, trans, icase, dfa);
75
76	if (len > 0)
77	{
78	ret = re_string_realloc_buffers (pstr, len + 1);
79	if (BE (ret != REG_NOERROR, 0))
80	return ret;
81	}
82	pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
83
84	if (icase)
85	{
86	#ifdef RE_ENABLE_I18N
87	if (dfa->mb_cur_max > 1)
88	{
89	while (1)
90	{
91	ret = build_wcs_upper_buffer (pstr);
92	if (BE (ret != REG_NOERROR, 0))
93	return ret;
94	if (pstr->valid_raw_len >= len)
95	break;
96	if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
97	break;
98	ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
99	if (BE (ret != REG_NOERROR, 0))
100	return ret;
101	}
102	}
103	else
104	#endif /* RE_ENABLE_I18N */
105	build_upper_buffer (pstr);
106	}
107	else
108	{
109	#ifdef RE_ENABLE_I18N
110	if (dfa->mb_cur_max > 1)
111	build_wcs_buffer (pstr);
112	else
113	#endif /* RE_ENABLE_I18N */
114	{
115	if (trans != NULL)
116	re_string_translate_buffer (pstr);
117	else
118	{
119	pstr->valid_len = pstr->bufs_len;
120	pstr->valid_raw_len = pstr->bufs_len;
121	}
122	}
123	}
124
125	return REG_NOERROR;
126	}
127
128	/* Helper functions for re_string_allocate, and re_string_construct. */
129
130	static reg_errcode_t
131	internal_function __attribute_warn_unused_result__
132	re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
133	{
134	#ifdef RE_ENABLE_I18N
135	if (pstr->mb_cur_max > 1)
136	{
137	wint_t *new_wcs;
138
139	/* Avoid overflow in realloc. */
140	const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
141	if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_buf_len, 0))
142	return REG_ESPACE;
143
144	new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
145	if (BE (new_wcs == NULL, 0))
146	return REG_ESPACE;
147	pstr->wcs = new_wcs;
148	if (pstr->offsets != NULL)
149	{
150	Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
151	if (BE (new_offsets == NULL, 0))
152	return REG_ESPACE;
153	pstr->offsets = new_offsets;
154	}
155	}
156	#endif /* RE_ENABLE_I18N */
157	if (pstr->mbs_allocated)
158	{
159	unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
160	new_buf_len);
161	if (BE (new_mbs == NULL, 0))
162	return REG_ESPACE;
163	pstr->mbs = new_mbs;
164	}
165	pstr->bufs_len = new_buf_len;
166	return REG_NOERROR;
167	}
168
169
170	static void
171	internal_function
172	re_string_construct_common (const char str, Idx len, re_string_t pstr,
173	RE_TRANSLATE_TYPE trans, bool icase,
174	const re_dfa_t *dfa)
175	{
176	pstr->raw_mbs = (const unsigned char *) str;
177	pstr->len = len;
178	pstr->raw_len = len;
179	pstr->trans = trans;
180	pstr->icase = icase;
181	pstr->mbs_allocated = (trans != NULL \|\| icase);
182	pstr->mb_cur_max = dfa->mb_cur_max;
183	pstr->is_utf8 = dfa->is_utf8;
184	pstr->map_notascii = dfa->map_notascii;
185	pstr->stop = pstr->len;
186	pstr->raw_stop = pstr->stop;
187	}
188
189	#ifdef RE_ENABLE_I18N
190
191	/* Build wide character buffer PSTR->WCS.
192	If the byte sequence of the string are:
193	<mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
194	Then wide character buffer will be:
195	<wc1> , WEOF , <wc2> , WEOF , <wc3>
196	We use WEOF for padding, they indicate that the position isn't
197	a first byte of a multibyte character.
198
199	Note that this function assumes PSTR->VALID_LEN elements are already
200	built and starts from PSTR->VALID_LEN. */
201
202	static void
203	internal_function
204	build_wcs_buffer (re_string_t *pstr)
205	{
206	#ifdef _LIBC
207	unsigned char buf[MB_LEN_MAX];
208	assert (MB_LEN_MAX >= pstr->mb_cur_max);
209	#else
210	unsigned char buf[64];
211	#endif
212	mbstate_t prev_st;
213	Idx byte_idx, end_idx, remain_len;
214	size_t mbclen;
215
216	/* Build the buffers from pstr->valid_len to either pstr->len or
217	pstr->bufs_len. */
218	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
219	for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
220	{
221	wchar_t wc;
222	const char *p;
223
224	remain_len = end_idx - byte_idx;
225	prev_st = pstr->cur_state;
226	/* Apply the translation if we need. */
227	if (BE (pstr->trans != NULL, 0))
228	{
229	int i, ch;
230
231	for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
232	{
233	ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
234	buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
235	}
236	p = (const char *) buf;
237	}
238	else
239	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
240	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
241	if (BE (mbclen == (size_t) -1 \|\| mbclen == 0
242	\|\| (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len), 0))
243	{
244	/* We treat these cases as a singlebyte character. */
245	mbclen = 1;
246	wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
247	if (BE (pstr->trans != NULL, 0))
248	wc = pstr->trans[wc];
249	pstr->cur_state = prev_st;
250	}
251	else if (BE (mbclen == (size_t) -2, 0))
252	{
253	/* The buffer doesn't have enough space, finish to build. */
254	pstr->cur_state = prev_st;
255	break;
256	}
257
258	/* Write wide character and padding. */
259	pstr->wcs[byte_idx++] = wc;
260	/* Write paddings. */
261	for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
262	pstr->wcs[byte_idx++] = WEOF;
263	}
264	pstr->valid_len = byte_idx;
265	pstr->valid_raw_len = byte_idx;
266	}
267
268	/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
269	but for REG_ICASE. */
270
271	static reg_errcode_t
272	internal_function __attribute_warn_unused_result__
273	build_wcs_upper_buffer (re_string_t *pstr)
274	{
275	mbstate_t prev_st;
276	Idx src_idx, byte_idx, end_idx, remain_len;
277	size_t mbclen;
278	#ifdef _LIBC
279	char buf[MB_LEN_MAX];
280	assert (MB_LEN_MAX >= pstr->mb_cur_max);
281	#else
282	char buf[64];
283	#endif
284
285	byte_idx = pstr->valid_len;
286	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
287
288	/* The following optimization assumes that ASCII characters can be
289	mapped to wide characters with a simple cast. */
290	if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
291	{
292	while (byte_idx < end_idx)
293	{
294	wchar_t wc;
295
296	if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
297	&& mbsinit (&pstr->cur_state))
298	{
299	/* In case of a singlebyte character. */
300	pstr->mbs[byte_idx]
301	= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
302	/* The next step uses the assumption that wchar_t is encoded
303	ASCII-safe: all ASCII values can be converted like this. */
304	pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
305	++byte_idx;
306	continue;
307	}
308
309	remain_len = end_idx - byte_idx;
310	prev_st = pstr->cur_state;
311	mbclen = __mbrtowc (&wc,
312	((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
313	+ byte_idx), remain_len, &pstr->cur_state);
314	if (BE (mbclen < (size_t) -2, 1))
315	{
316	wchar_t wcu = wc;
317	if (iswlower (wc))
318	{
319	size_t mbcdlen;
320
321	wcu = towupper (wc);
322	mbcdlen = wcrtomb (buf, wcu, &prev_st);
323	if (BE (mbclen == mbcdlen, 1))
324	memcpy (pstr->mbs + byte_idx, buf, mbclen);
325	else
326	{
327	src_idx = byte_idx;
328	goto offsets_needed;
329	}
330	}
331	else
332	memcpy (pstr->mbs + byte_idx,
333	pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
334	pstr->wcs[byte_idx++] = wcu;
335	/* Write paddings. */
336	for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
337	pstr->wcs[byte_idx++] = WEOF;
338	}
339	else if (mbclen == (size_t) -1 \|\| mbclen == 0
340	\|\| (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
341	{
342	/* It is an invalid character, an incomplete character
343	at the end of the string, or '\0'. Just use the byte. */
344	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
345	pstr->mbs[byte_idx] = ch;
346	/* And also cast it to wide char. */
347	pstr->wcs[byte_idx++] = (wchar_t) ch;
348	if (BE (mbclen == (size_t) -1, 0))
349	pstr->cur_state = prev_st;
350	}
351	else
352	{
353	/* The buffer doesn't have enough space, finish to build. */
354	pstr->cur_state = prev_st;
355	break;
356	}
357	}
358	pstr->valid_len = byte_idx;
359	pstr->valid_raw_len = byte_idx;
360	return REG_NOERROR;
361	}
362	else
363	for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
364	{
365	wchar_t wc;
366	const char *p;
367	offsets_needed:
368	remain_len = end_idx - byte_idx;
369	prev_st = pstr->cur_state;
370	if (BE (pstr->trans != NULL, 0))
371	{
372	int i, ch;
373
374	for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
375	{
376	ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
377	buf[i] = pstr->trans[ch];
378	}
379	p = (const char *) buf;
380	}
381	else
382	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
383	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
384	if (BE (mbclen < (size_t) -2, 1))
385	{
386	wchar_t wcu = wc;
387	if (iswlower (wc))
388	{
389	size_t mbcdlen;
390
391	wcu = towupper (wc);
392	mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
393	if (BE (mbclen == mbcdlen, 1))
394	memcpy (pstr->mbs + byte_idx, buf, mbclen);
395	else if (mbcdlen != (size_t) -1)
396	{
397	size_t i;
398
399	if (byte_idx + mbcdlen > pstr->bufs_len)
400	{
401	pstr->cur_state = prev_st;
402	break;
403	}
404
405	if (pstr->offsets == NULL)
406	{
407	pstr->offsets = re_malloc (Idx, pstr->bufs_len);
408
409	if (pstr->offsets == NULL)
410	return REG_ESPACE;
411	}
412	if (!pstr->offsets_needed)
413	{
414	for (i = 0; i < (size_t) byte_idx; ++i)
415	pstr->offsets[i] = i;
416	pstr->offsets_needed = 1;
417	}
418
419	memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
420	pstr->wcs[byte_idx] = wcu;
421	pstr->offsets[byte_idx] = src_idx;
422	for (i = 1; i < mbcdlen; ++i)
423	{
424	pstr->offsets[byte_idx + i]
425	= src_idx + (i < mbclen ? i : mbclen - 1);
426	pstr->wcs[byte_idx + i] = WEOF;
427	}
428	pstr->len += mbcdlen - mbclen;
429	if (pstr->raw_stop > src_idx)
430	pstr->stop += mbcdlen - mbclen;
431	end_idx = (pstr->bufs_len > pstr->len)
432	? pstr->len : pstr->bufs_len;
433	byte_idx += mbcdlen;
434	src_idx += mbclen;
435	continue;
436	}
437	else
438	memcpy (pstr->mbs + byte_idx, p, mbclen);
439	}
440	else
441	memcpy (pstr->mbs + byte_idx, p, mbclen);
442
443	if (BE (pstr->offsets_needed != 0, 0))
444	{
445	size_t i;
446	for (i = 0; i < mbclen; ++i)
447	pstr->offsets[byte_idx + i] = src_idx + i;
448	}
449	src_idx += mbclen;
450
451	pstr->wcs[byte_idx++] = wcu;
452	/* Write paddings. */
453	for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
454	pstr->wcs[byte_idx++] = WEOF;
455	}
456	else if (mbclen == (size_t) -1 \|\| mbclen == 0
457	\|\| (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
458	{
459	/* It is an invalid character or '\0'. Just use the byte. */
460	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
461
462	if (BE (pstr->trans != NULL, 0))
463	ch = pstr->trans [ch];
464	pstr->mbs[byte_idx] = ch;
465
466	if (BE (pstr->offsets_needed != 0, 0))
467	pstr->offsets[byte_idx] = src_idx;
468	++src_idx;
469
470	/* And also cast it to wide char. */
471	pstr->wcs[byte_idx++] = (wchar_t) ch;
472	if (BE (mbclen == (size_t) -1, 0))
473	pstr->cur_state = prev_st;
474	}
475	else
476	{
477	/* The buffer doesn't have enough space, finish to build. */
478	pstr->cur_state = prev_st;
479	break;
480	}
481	}
482	pstr->valid_len = byte_idx;
483	pstr->valid_raw_len = src_idx;
484	return REG_NOERROR;
485	}
486
487	/* Skip characters until the index becomes greater than NEW_RAW_IDX.
488	Return the index. */
489
490	static Idx
491	internal_function
492	re_string_skip_chars (re_string_t pstr, Idx new_raw_idx, wint_t last_wc)
493	{
494	mbstate_t prev_st;
495	Idx rawbuf_idx;
496	size_t mbclen;
497	wint_t wc = WEOF;
498
499	/* Skip the characters which are not necessary to check. */
500	for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
501	rawbuf_idx < new_raw_idx;)
502	{
503	wchar_t wc2;
504	Idx remain_len = pstr->raw_len - rawbuf_idx;
505	prev_st = pstr->cur_state;
506	mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
507	remain_len, &pstr->cur_state);
508	if (BE (mbclen == (size_t) -2 \|\| mbclen == (size_t) -1 \|\| mbclen == 0, 0))
509	{
510	/* We treat these cases as a single byte character. */
511	if (mbclen == 0 \|\| remain_len == 0)
512	wc = L'\0';
513	else
514	wc = (unsigned char ) (pstr->raw_mbs + rawbuf_idx);
515	mbclen = 1;
516	pstr->cur_state = prev_st;
517	}
518	else
519	wc = wc2;
520	/* Then proceed the next character. */
521	rawbuf_idx += mbclen;
522	}
523	*last_wc = wc;
524	return rawbuf_idx;
525	}
526	#endif /* RE_ENABLE_I18N */
527
528	/* Build the buffer PSTR->MBS, and apply the translation if we need.
529	This function is used in case of REG_ICASE. */
530
531	static void
532	internal_function
533	build_upper_buffer (re_string_t *pstr)
534	{
535	Idx char_idx, end_idx;
536	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
537
538	for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
539	{
540	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
541	if (BE (pstr->trans != NULL, 0))
542	ch = pstr->trans[ch];
543	if (islower (ch))
544	pstr->mbs[char_idx] = toupper (ch);
545	else
546	pstr->mbs[char_idx] = ch;
547	}
548	pstr->valid_len = char_idx;
549	pstr->valid_raw_len = char_idx;
550	}
551
552	/* Apply TRANS to the buffer in PSTR. */
553
554	static void
555	internal_function
556	re_string_translate_buffer (re_string_t *pstr)
557	{
558	Idx buf_idx, end_idx;
559	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
560
561	for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
562	{
563	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
564	pstr->mbs[buf_idx] = pstr->trans[ch];
565	}
566
567	pstr->valid_len = buf_idx;
568	pstr->valid_raw_len = buf_idx;
569	}
570
571	/* This function re-construct the buffers.
572	Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
573	convert to upper case in case of REG_ICASE, apply translation. */
574
575	static reg_errcode_t
576	internal_function __attribute_warn_unused_result__
577	re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
578	{
579	Idx offset;
580
581	if (BE (pstr->raw_mbs_idx <= idx, 0))
582	offset = idx - pstr->raw_mbs_idx;
583	else
584	{
585	/* Reset buffer. */
586	#ifdef RE_ENABLE_I18N
587	if (pstr->mb_cur_max > 1)
588	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
589	#endif /* RE_ENABLE_I18N */
590	pstr->len = pstr->raw_len;
591	pstr->stop = pstr->raw_stop;
592	pstr->valid_len = 0;
593	pstr->raw_mbs_idx = 0;
594	pstr->valid_raw_len = 0;
595	pstr->offsets_needed = 0;
596	pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
597	: CONTEXT_NEWLINE \| CONTEXT_BEGBUF);
598	if (!pstr->mbs_allocated)
599	pstr->mbs = (unsigned char *) pstr->raw_mbs;
600	offset = idx;
601	}
602
603	if (BE (offset != 0, 1))
604	{
605	/* Should the already checked characters be kept? */
606	if (BE (offset < pstr->valid_raw_len, 1))
607	{
608	/* Yes, move them to the front of the buffer. */
609	#ifdef RE_ENABLE_I18N
610	if (BE (pstr->offsets_needed, 0))
611	{
612	Idx low = 0, high = pstr->valid_len, mid;
613	do
614	{
615	mid = (high + low) / 2;
616	if (pstr->offsets[mid] > offset)
617	high = mid;
618	else if (pstr->offsets[mid] < offset)
619	low = mid + 1;
620	else
621	break;
622	}
623	while (low < high);
624	if (pstr->offsets[mid] < offset)
625	++mid;
626	pstr->tip_context = re_string_context_at (pstr, mid - 1,
627	eflags);
628	/* This can be quite complicated, so handle specially
629	only the common and easy case where the character with
630	different length representation of lower and upper
631	case is present at or after offset. */
632	if (pstr->valid_len > offset
633	&& mid == offset && pstr->offsets[mid] == offset)
634	{
635	memmove (pstr->wcs, pstr->wcs + offset,
636	(pstr->valid_len - offset) * sizeof (wint_t));
637	memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
638	pstr->valid_len -= offset;
639	pstr->valid_raw_len -= offset;
640	for (low = 0; low < pstr->valid_len; low++)
641	pstr->offsets[low] = pstr->offsets[low + offset] - offset;
642	}
643	else
644	{
645	/* Otherwise, just find out how long the partial multibyte
646	character at offset is and fill it with WEOF/255. */
647	pstr->len = pstr->raw_len - idx + offset;
648	pstr->stop = pstr->raw_stop - idx + offset;
649	pstr->offsets_needed = 0;
650	while (mid > 0 && pstr->offsets[mid - 1] == offset)
651	--mid;
652	while (mid < pstr->valid_len)
653	if (pstr->wcs[mid] != WEOF)
654	break;
655	else
656	++mid;
657	if (mid == pstr->valid_len)
658	pstr->valid_len = 0;
659	else
660	{
661	pstr->valid_len = pstr->offsets[mid] - offset;
662	if (pstr->valid_len)
663	{
664	for (low = 0; low < pstr->valid_len; ++low)
665	pstr->wcs[low] = WEOF;
666	memset (pstr->mbs, 255, pstr->valid_len);
667	}
668	}
669	pstr->valid_raw_len = pstr->valid_len;
670	}
671	}
672	else
673	#endif
674	{
675	pstr->tip_context = re_string_context_at (pstr, offset - 1,
676	eflags);
677	#ifdef RE_ENABLE_I18N
678	if (pstr->mb_cur_max > 1)
679	memmove (pstr->wcs, pstr->wcs + offset,
680	(pstr->valid_len - offset) * sizeof (wint_t));
681	#endif /* RE_ENABLE_I18N */
682	if (BE (pstr->mbs_allocated, 0))
683	memmove (pstr->mbs, pstr->mbs + offset,
684	pstr->valid_len - offset);
685	pstr->valid_len -= offset;
686	pstr->valid_raw_len -= offset;
687	#if DEBUG
688	assert (pstr->valid_len > 0);
689	#endif
690	}
691	}
692	else
693	{
694	#ifdef RE_ENABLE_I18N
695	/* No, skip all characters until IDX. */
696	Idx prev_valid_len = pstr->valid_len;
697
698	if (BE (pstr->offsets_needed, 0))
699	{
700	pstr->len = pstr->raw_len - idx + offset;
701	pstr->stop = pstr->raw_stop - idx + offset;
702	pstr->offsets_needed = 0;
703	}
704	#endif
705	pstr->valid_len = 0;
706	#ifdef RE_ENABLE_I18N
707	if (pstr->mb_cur_max > 1)
708	{
709	Idx wcs_idx;
710	wint_t wc = WEOF;
711
712	if (pstr->is_utf8)
713	{
714	const unsigned char raw, p, *end;
715
716	/* Special case UTF-8. Multi-byte chars start with any
717	byte other than 0x80 - 0xbf. */
718	raw = pstr->raw_mbs + pstr->raw_mbs_idx;
719	end = raw + (offset - pstr->mb_cur_max);
720	if (end < pstr->raw_mbs)
721	end = pstr->raw_mbs;
722	p = raw + offset - 1;
723	#ifdef _LIBC
724	/* We know the wchar_t encoding is UCS4, so for the simple
725	case, ASCII characters, skip the conversion step. */
726	if (isascii (*p) && BE (pstr->trans == NULL, 1))
727	{
728	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
729	/* pstr->valid_len = 0; */
730	wc = (wchar_t) *p;
731	}
732	else
733	#endif
734	for (; p >= end; --p)
735	if ((*p & 0xc0) != 0x80)
736	{
737	mbstate_t cur_state;
738	wchar_t wc2;
739	Idx mlen = raw + pstr->len - p;
740	unsigned char buf[6];
741	size_t mbclen;
742
743	const unsigned char *pp = p;
744	if (BE (pstr->trans != NULL, 0))
745	{
746	int i = mlen < 6 ? mlen : 6;
747	while (--i >= 0)
748	buf[i] = pstr->trans[p[i]];
749	pp = buf;
750	}
751	/* XXX Don't use mbrtowc, we know which conversion
752	to use (UTF-8 -> UCS4). */
753	memset (&cur_state, 0, sizeof (cur_state));
754	mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
755	&cur_state);
756	if (raw + offset - p <= mbclen
757	&& mbclen < (size_t) -2)
758	{
759	memset (&pstr->cur_state, '\0',
760	sizeof (mbstate_t));
761	pstr->valid_len = mbclen - (raw + offset - p);
762	wc = wc2;
763	}
764	break;
765	}
766	}
767
768	if (wc == WEOF)
769	pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
770	if (wc == WEOF)
771	pstr->tip_context
772	= re_string_context_at (pstr, prev_valid_len - 1, eflags);
773	else
774	pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
775	&& IS_WIDE_WORD_CHAR (wc))
776	? CONTEXT_WORD
777	: ((IS_WIDE_NEWLINE (wc)
778	&& pstr->newline_anchor)
779	? CONTEXT_NEWLINE : 0));
780	if (BE (pstr->valid_len, 0))
781	{
782	for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
783	pstr->wcs[wcs_idx] = WEOF;
784	if (pstr->mbs_allocated)
785	memset (pstr->mbs, 255, pstr->valid_len);
786	}
787	pstr->valid_raw_len = pstr->valid_len;
788	}
789	else
790	#endif /* RE_ENABLE_I18N */
791	{
792	int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
793	pstr->valid_raw_len = 0;
794	if (pstr->trans)
795	c = pstr->trans[c];
796	pstr->tip_context = (bitset_contain (pstr->word_char, c)
797	? CONTEXT_WORD
798	: ((IS_NEWLINE (c) && pstr->newline_anchor)
799	? CONTEXT_NEWLINE : 0));
800	}
801	}
802	if (!BE (pstr->mbs_allocated, 0))
803	pstr->mbs += offset;
804	}
805	pstr->raw_mbs_idx = idx;
806	pstr->len -= offset;
807	pstr->stop -= offset;
808
809	/* Then build the buffers. */
810	#ifdef RE_ENABLE_I18N
811	if (pstr->mb_cur_max > 1)
812	{
813	if (pstr->icase)
814	{
815	reg_errcode_t ret = build_wcs_upper_buffer (pstr);
816	if (BE (ret != REG_NOERROR, 0))
817	return ret;
818	}
819	else
820	build_wcs_buffer (pstr);
821	}
822	else
823	#endif /* RE_ENABLE_I18N */
824	if (BE (pstr->mbs_allocated, 0))
825	{
826	if (pstr->icase)
827	build_upper_buffer (pstr);
828	else if (pstr->trans != NULL)
829	re_string_translate_buffer (pstr);
830	}
831	else
832	pstr->valid_len = pstr->len;
833
834	pstr->cur_idx = 0;
835	return REG_NOERROR;
836	}
837
838	static unsigned char
839	internal_function __attribute ((pure))
840	re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
841	{
842	int ch;
843	Idx off;
844
845	/* Handle the common (easiest) cases first. */
846	if (BE (!pstr->mbs_allocated, 1))
847	return re_string_peek_byte (pstr, idx);
848
849	#ifdef RE_ENABLE_I18N
850	if (pstr->mb_cur_max > 1
851	&& ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
852	return re_string_peek_byte (pstr, idx);
853	#endif
854
855	off = pstr->cur_idx + idx;
856	#ifdef RE_ENABLE_I18N
857	if (pstr->offsets_needed)
858	off = pstr->offsets[off];
859	#endif
860
861	ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
862
863	#ifdef RE_ENABLE_I18N
864	/* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
865	this function returns CAPITAL LETTER I instead of first byte of
866	DOTLESS SMALL LETTER I. The latter would confuse the parser,
867	since peek_byte_case doesn't advance cur_idx in any way. */
868	if (pstr->offsets_needed && !isascii (ch))
869	return re_string_peek_byte (pstr, idx);
870	#endif
871
872	return ch;
873	}
874
875	static unsigned char
876	internal_function
877	re_string_fetch_byte_case (re_string_t *pstr)
878	{
879	if (BE (!pstr->mbs_allocated, 1))
880	return re_string_fetch_byte (pstr);
881
882	#ifdef RE_ENABLE_I18N
883	if (pstr->offsets_needed)
884	{
885	Idx off;
886	int ch;
887
888	/* For tr_TR.UTF-8 [[:islower:]] there is
889	[[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
890	in that case the whole multi-byte character and return
891	the original letter. On the other side, with
892	[[: DOTLESS SMALL LETTER I return [[:I, as doing
893	anything else would complicate things too much. */
894
895	if (!re_string_first_byte (pstr, pstr->cur_idx))
896	return re_string_fetch_byte (pstr);
897
898	off = pstr->offsets[pstr->cur_idx];
899	ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
900
901	if (! isascii (ch))
902	return re_string_fetch_byte (pstr);
903
904	re_string_skip_bytes (pstr,
905	re_string_char_size_at (pstr, pstr->cur_idx));
906	return ch;
907	}
908	#endif
909
910	return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
911	}
912
913	static void
914	internal_function
915	re_string_destruct (re_string_t *pstr)
916	{
917	#ifdef RE_ENABLE_I18N
918	re_free (pstr->wcs);
919	re_free (pstr->offsets);
920	#endif /* RE_ENABLE_I18N */
921	if (pstr->mbs_allocated)
922	re_free (pstr->mbs);
923	}
924
925	/* Return the context at IDX in INPUT. */
926
927	static unsigned int
928	internal_function
929	re_string_context_at (const re_string_t *input, Idx idx, int eflags)
930	{
931	int c;
932	if (BE (! REG_VALID_INDEX (idx), 0))
933	/* In this case, we use the value stored in input->tip_context,
934	since we can't know the character in input->mbs[-1] here. */
935	return input->tip_context;
936	if (BE (idx == input->len, 0))
937	return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
938	: CONTEXT_NEWLINE \| CONTEXT_ENDBUF);
939	#ifdef RE_ENABLE_I18N
940	if (input->mb_cur_max > 1)
941	{
942	wint_t wc;
943	Idx wc_idx = idx;
944	while(input->wcs[wc_idx] == WEOF)
945	{
946	#ifdef DEBUG
947	/* It must not happen. */
948	assert (REG_VALID_INDEX (wc_idx));
949	#endif
950	--wc_idx;
951	if (! REG_VALID_INDEX (wc_idx))
952	return input->tip_context;
953	}
954	wc = input->wcs[wc_idx];
955	if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
956	return CONTEXT_WORD;
957	return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
958	? CONTEXT_NEWLINE : 0);
959	}
960	else
961	#endif
962	{
963	c = re_string_byte_at (input, idx);
964	if (bitset_contain (input->word_char, c))
965	return CONTEXT_WORD;
966	return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
967	}
968	}
969
970
971	/* Functions for set operation. */
972
973	static reg_errcode_t
974	internal_function __attribute_warn_unused_result__
975	re_node_set_alloc (re_node_set *set, Idx size)
976	{
977	set->alloc = size;
978	set->nelem = 0;
979	set->elems = re_malloc (Idx, size);
980	if (BE (set->elems == NULL, 0))
981	return REG_ESPACE;
982	return REG_NOERROR;
983	}
984
985	static reg_errcode_t
986	internal_function __attribute_warn_unused_result__
987	re_node_set_init_1 (re_node_set *set, Idx elem)
988	{
989	set->alloc = 1;
990	set->nelem = 1;
991	set->elems = re_malloc (Idx, 1);
992	if (BE (set->elems == NULL, 0))
993	{
994	set->alloc = set->nelem = 0;
995	return REG_ESPACE;
996	}
997	set->elems[0] = elem;
998	return REG_NOERROR;
999	}
1000
1001	static reg_errcode_t
1002	internal_function __attribute_warn_unused_result__
1003	re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
1004	{
1005	set->alloc = 2;
1006	set->elems = re_malloc (Idx, 2);
1007	if (BE (set->elems == NULL, 0))
1008	return REG_ESPACE;
1009	if (elem1 == elem2)
1010	{
1011	set->nelem = 1;
1012	set->elems[0] = elem1;
1013	}
1014	else
1015	{
1016	set->nelem = 2;
1017	if (elem1 < elem2)
1018	{
1019	set->elems[0] = elem1;
1020	set->elems[1] = elem2;
1021	}
1022	else
1023	{
1024	set->elems[0] = elem2;
1025	set->elems[1] = elem1;
1026	}
1027	}
1028	return REG_NOERROR;
1029	}
1030
1031	static reg_errcode_t
1032	internal_function __attribute_warn_unused_result__
1033	re_node_set_init_copy (re_node_set dest, const re_node_set src)
1034	{
1035	dest->nelem = src->nelem;
1036	if (src->nelem > 0)
1037	{
1038	dest->alloc = dest->nelem;
1039	dest->elems = re_malloc (Idx, dest->alloc);
1040	if (BE (dest->elems == NULL, 0))
1041	{
1042	dest->alloc = dest->nelem = 0;
1043	return REG_ESPACE;
1044	}
1045	memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1046	}
1047	else
1048	re_node_set_init_empty (dest);
1049	return REG_NOERROR;
1050	}
1051
1052	/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1053	DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1054	Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1055
1056	static reg_errcode_t
1057	internal_function __attribute_warn_unused_result__
1058	re_node_set_add_intersect (re_node_set dest, const re_node_set src1,
1059	const re_node_set *src2)
1060	{
1061	Idx i1, i2, is, id, delta, sbase;
1062	if (src1->nelem == 0 \|\| src2->nelem == 0)
1063	return REG_NOERROR;
1064
1065	/* We need dest->nelem + 2 * elems_in_intersection; this is a
1066	conservative estimate. */
1067	if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1068	{
1069	Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1070	Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1071	if (BE (new_elems == NULL, 0))
1072	return REG_ESPACE;
1073	dest->elems = new_elems;
1074	dest->alloc = new_alloc;
1075	}
1076
1077	/* Find the items in the intersection of SRC1 and SRC2, and copy
1078	into the top of DEST those that are not already in DEST itself. */
1079	sbase = dest->nelem + src1->nelem + src2->nelem;
1080	i1 = src1->nelem - 1;
1081	i2 = src2->nelem - 1;
1082	id = dest->nelem - 1;
1083	for (;;)
1084	{
1085	if (src1->elems[i1] == src2->elems[i2])
1086	{
1087	/* Try to find the item in DEST. Maybe we could binary search? */
1088	while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1089	--id;
1090
1091	if (! REG_VALID_INDEX (id) \|\| dest->elems[id] != src1->elems[i1])
1092	dest->elems[--sbase] = src1->elems[i1];
1093
1094	if (! REG_VALID_INDEX (--i1) \|\| ! REG_VALID_INDEX (--i2))
1095	break;
1096	}
1097
1098	/* Lower the highest of the two items. */
1099	else if (src1->elems[i1] < src2->elems[i2])
1100	{
1101	if (! REG_VALID_INDEX (--i2))
1102	break;
1103	}
1104	else
1105	{
1106	if (! REG_VALID_INDEX (--i1))
1107	break;
1108	}
1109	}
1110
1111	id = dest->nelem - 1;
1112	is = dest->nelem + src1->nelem + src2->nelem - 1;
1113	delta = is - sbase + 1;
1114
1115	/* Now copy. When DELTA becomes zero, the remaining
1116	DEST elements are already in place; this is more or
1117	less the same loop that is in re_node_set_merge. */
1118	dest->nelem += delta;
1119	if (delta > 0 && REG_VALID_INDEX (id))
1120	for (;;)
1121	{
1122	if (dest->elems[is] > dest->elems[id])
1123	{
1124	/* Copy from the top. */
1125	dest->elems[id + delta--] = dest->elems[is--];
1126	if (delta == 0)
1127	break;
1128	}
1129	else
1130	{
1131	/* Slide from the bottom. */
1132	dest->elems[id + delta] = dest->elems[id];
1133	if (! REG_VALID_INDEX (--id))
1134	break;
1135	}
1136	}
1137
1138	/* Copy remaining SRC elements. */
1139	memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1140
1141	return REG_NOERROR;
1142	}
1143
1144	/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1145	DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1146
1147	static reg_errcode_t
1148	internal_function __attribute_warn_unused_result__
1149	re_node_set_init_union (re_node_set dest, const re_node_set src1,
1150	const re_node_set *src2)
1151	{
1152	Idx i1, i2, id;
1153	if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1154	{
1155	dest->alloc = src1->nelem + src2->nelem;
1156	dest->elems = re_malloc (Idx, dest->alloc);
1157	if (BE (dest->elems == NULL, 0))
1158	return REG_ESPACE;
1159	}
1160	else
1161	{
1162	if (src1 != NULL && src1->nelem > 0)
1163	return re_node_set_init_copy (dest, src1);
1164	else if (src2 != NULL && src2->nelem > 0)
1165	return re_node_set_init_copy (dest, src2);
1166	else
1167	re_node_set_init_empty (dest);
1168	return REG_NOERROR;
1169	}
1170	for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1171	{
1172	if (src1->elems[i1] > src2->elems[i2])
1173	{
1174	dest->elems[id++] = src2->elems[i2++];
1175	continue;
1176	}
1177	if (src1->elems[i1] == src2->elems[i2])
1178	++i2;
1179	dest->elems[id++] = src1->elems[i1++];
1180	}
1181	if (i1 < src1->nelem)
1182	{
1183	memcpy (dest->elems + id, src1->elems + i1,
1184	(src1->nelem - i1) * sizeof (Idx));
1185	id += src1->nelem - i1;
1186	}
1187	else if (i2 < src2->nelem)
1188	{
1189	memcpy (dest->elems + id, src2->elems + i2,
1190	(src2->nelem - i2) * sizeof (Idx));
1191	id += src2->nelem - i2;
1192	}
1193	dest->nelem = id;
1194	return REG_NOERROR;
1195	}
1196
1197	/* Calculate the union set of the sets DEST and SRC. And store it to
1198	DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1199
1200	static reg_errcode_t
1201	internal_function __attribute_warn_unused_result__
1202	re_node_set_merge (re_node_set dest, const re_node_set src)
1203	{
1204	Idx is, id, sbase, delta;
1205	if (src == NULL \|\| src->nelem == 0)
1206	return REG_NOERROR;
1207	if (dest->alloc < 2 * src->nelem + dest->nelem)
1208	{
1209	Idx new_alloc = 2 * (src->nelem + dest->alloc);
1210	Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1211	if (BE (new_buffer == NULL, 0))
1212	return REG_ESPACE;
1213	dest->elems = new_buffer;
1214	dest->alloc = new_alloc;
1215	}
1216
1217	if (BE (dest->nelem == 0, 0))
1218	{
1219	dest->nelem = src->nelem;
1220	memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1221	return REG_NOERROR;
1222	}
1223
1224	/* Copy into the top of DEST the items of SRC that are not
1225	found in DEST. Maybe we could binary search in DEST? */
1226	for (sbase = dest->nelem + 2 * src->nelem,
1227	is = src->nelem - 1, id = dest->nelem - 1;
1228	REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1229	{
1230	if (dest->elems[id] == src->elems[is])
1231	is--, id--;
1232	else if (dest->elems[id] < src->elems[is])
1233	dest->elems[--sbase] = src->elems[is--];
1234	else /* if (dest->elems[id] > src->elems[is]) */
1235	--id;
1236	}
1237
1238	if (REG_VALID_INDEX (is))
1239	{
1240	/* If DEST is exhausted, the remaining items of SRC must be unique. */
1241	sbase -= is + 1;
1242	memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1243	}
1244
1245	id = dest->nelem - 1;
1246	is = dest->nelem + 2 * src->nelem - 1;
1247	delta = is - sbase + 1;
1248	if (delta == 0)
1249	return REG_NOERROR;
1250
1251	/* Now copy. When DELTA becomes zero, the remaining
1252	DEST elements are already in place. */
1253	dest->nelem += delta;
1254	for (;;)
1255	{
1256	if (dest->elems[is] > dest->elems[id])
1257	{
1258	/* Copy from the top. */
1259	dest->elems[id + delta--] = dest->elems[is--];
1260	if (delta == 0)
1261	break;
1262	}
1263	else
1264	{
1265	/* Slide from the bottom. */
1266	dest->elems[id + delta] = dest->elems[id];
1267	if (! REG_VALID_INDEX (--id))
1268	{
1269	/* Copy remaining SRC elements. */
1270	memcpy (dest->elems, dest->elems + sbase,
1271	delta * sizeof (Idx));
1272	break;
1273	}
1274	}
1275	}
1276
1277	return REG_NOERROR;
1278	}
1279
1280	/* Insert the new element ELEM to the re_node_set* SET.
1281	SET should not already have ELEM.
1282	Return true if successful. */
1283
1284	static bool
1285	internal_function __attribute_warn_unused_result__
1286	re_node_set_insert (re_node_set *set, Idx elem)
1287	{
1288	Idx idx;
1289	/* In case the set is empty. */
1290	if (set->alloc == 0)
1291	return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1292
1293	if (BE (set->nelem, 0) == 0)
1294	{
1295	/* We already guaranteed above that set->alloc != 0. */
1296	set->elems[0] = elem;
1297	++set->nelem;
1298	return true;
1299	}
1300
1301	/* Realloc if we need. */
1302	if (set->alloc == set->nelem)
1303	{
1304	Idx *new_elems;
1305	set->alloc = set->alloc * 2;
1306	new_elems = re_realloc (set->elems, Idx, set->alloc);
1307	if (BE (new_elems == NULL, 0))
1308	return false;
1309	set->elems = new_elems;
1310	}
1311
1312	/* Move the elements which follows the new element. Test the
1313	first element separately to skip a check in the inner loop. */
1314	if (elem < set->elems[0])
1315	{
1316	idx = 0;
1317	for (idx = set->nelem; idx > 0; idx--)
1318	set->elems[idx] = set->elems[idx - 1];
1319	}
1320	else
1321	{
1322	for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1323	set->elems[idx] = set->elems[idx - 1];
1324	}
1325
1326	/* Insert the new element. */
1327	set->elems[idx] = elem;
1328	++set->nelem;
1329	return true;
1330	}
1331
1332	/* Insert the new element ELEM to the re_node_set* SET.
1333	SET should not already have any element greater than or equal to ELEM.
1334	Return true if successful. */
1335
1336	static bool
1337	internal_function __attribute_warn_unused_result__
1338	re_node_set_insert_last (re_node_set *set, Idx elem)
1339	{
1340	/* Realloc if we need. */
1341	if (set->alloc == set->nelem)
1342	{
1343	Idx *new_elems;
1344	set->alloc = (set->alloc + 1) * 2;
1345	new_elems = re_realloc (set->elems, Idx, set->alloc);
1346	if (BE (new_elems == NULL, 0))
1347	return false;
1348	set->elems = new_elems;
1349	}
1350
1351	/* Insert the new element. */
1352	set->elems[set->nelem++] = elem;
1353	return true;
1354	}
1355
1356	/* Compare two node sets SET1 and SET2.
1357	Return true if SET1 and SET2 are equivalent. */
1358
1359	static bool
1360	internal_function __attribute ((pure))
1361	re_node_set_compare (const re_node_set set1, const re_node_set set2)
1362	{
1363	Idx i;
1364	if (set1 == NULL \|\| set2 == NULL \|\| set1->nelem != set2->nelem)
1365	return false;
1366	for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1367	if (set1->elems[i] != set2->elems[i])
1368	return false;
1369	return true;
1370	}
1371
1372	/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1373
1374	static Idx
1375	internal_function __attribute ((pure))
1376	re_node_set_contains (const re_node_set *set, Idx elem)
1377	{
1378	__re_size_t idx, right, mid;
1379	if (! REG_VALID_NONZERO_INDEX (set->nelem))
1380	return 0;
1381
1382	/* Binary search the element. */
1383	idx = 0;
1384	right = set->nelem - 1;
1385	while (idx < right)
1386	{
1387	mid = (idx + right) / 2;
1388	if (set->elems[mid] < elem)
1389	idx = mid + 1;
1390	else
1391	right = mid;
1392	}
1393	return set->elems[idx] == elem ? idx + 1 : 0;
1394	}
1395
1396	static void
1397	internal_function
1398	re_node_set_remove_at (re_node_set *set, Idx idx)
1399	{
1400	verify (! TYPE_SIGNED (Idx));
1401	/* if (idx < 0)
1402	return; */
1403	if (idx >= set->nelem)
1404	return;
1405	--set->nelem;
1406	for (; idx < set->nelem; idx++)
1407	set->elems[idx] = set->elems[idx + 1];
1408	}
1409
1410
1411
1412	/* Add the token TOKEN to dfa->nodes, and return the index of the token.
1413	Or return REG_MISSING if an error occurred. */
1414
1415	static Idx
1416	internal_function
1417	re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1418	{
1419	if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1420	{
1421	size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1422	Idx new_nexts, new_indices;
1423	re_node_set new_edests, new_eclosures;
1424	re_token_t *new_nodes;
1425
1426	/* Avoid overflows in realloc. */
1427	const size_t max_object_size = MAX (sizeof (re_token_t),
1428	MAX (sizeof (re_node_set),
1429	sizeof (Idx)));
1430	if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_nodes_alloc, 0))
1431	return REG_MISSING;
1432
1433	new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1434	if (BE (new_nodes == NULL, 0))
1435	return REG_MISSING;
1436	dfa->nodes = new_nodes;
1437	new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1438	new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1439	new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1440	new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1441	if (BE (new_nexts == NULL \|\| new_indices == NULL
1442	\|\| new_edests == NULL \|\| new_eclosures == NULL, 0))
1443	return REG_MISSING;
1444	dfa->nexts = new_nexts;
1445	dfa->org_indices = new_indices;
1446	dfa->edests = new_edests;
1447	dfa->eclosures = new_eclosures;
1448	dfa->nodes_alloc = new_nodes_alloc;
1449	}
1450	dfa->nodes[dfa->nodes_len] = token;
1451	dfa->nodes[dfa->nodes_len].constraint = 0;
1452	#ifdef RE_ENABLE_I18N
1453	{
1454	int type = token.type;
1455	dfa->nodes[dfa->nodes_len].accept_mb =
1456	(type == OP_PERIOD && dfa->mb_cur_max > 1) \|\| type == COMPLEX_BRACKET;
1457	}
1458	#endif
1459	dfa->nexts[dfa->nodes_len] = REG_MISSING;
1460	re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1461	re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1462	return dfa->nodes_len++;
1463	}
1464
1465	static inline re_hashval_t
1466	internal_function
1467	calc_state_hash (const re_node_set *nodes, unsigned int context)
1468	{
1469	re_hashval_t hash = nodes->nelem + context;
1470	Idx i;
1471	for (i = 0 ; i < nodes->nelem ; i++)
1472	hash += nodes->elems[i];
1473	return hash;
1474	}
1475
1476	/* Search for the state whose node_set is equivalent to NODES.
1477	Return the pointer to the state, if we found it in the DFA.
1478	Otherwise create the new one and return it. In case of an error
1479	return NULL and set the error code in ERR.
1480	Note: - We assume NULL as the invalid state, then it is possible that
1481	return value is NULL and ERR is REG_NOERROR.
1482	- We never return non-NULL value in case of any errors, it is for
1483	optimization. */
1484
1485	static re_dfastate_t *
1486	internal_function __attribute_warn_unused_result__
1487	re_acquire_state (reg_errcode_t err, const re_dfa_t dfa,
1488	const re_node_set *nodes)
1489	{
1490	re_hashval_t hash;
1491	re_dfastate_t *new_state;
1492	struct re_state_table_entry *spot;
1493	Idx i;
1494	#ifdef lint
1495	/* Suppress bogus uninitialized-variable warnings. */
1496	*err = REG_NOERROR;
1497	#endif
1498	if (BE (nodes->nelem == 0, 0))
1499	{
1500	*err = REG_NOERROR;
1501	return NULL;
1502	}
1503	hash = calc_state_hash (nodes, 0);
1504	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1505
1506	for (i = 0 ; i < spot->num ; i++)
1507	{
1508	re_dfastate_t *state = spot->array[i];
1509	if (hash != state->hash)
1510	continue;
1511	if (re_node_set_compare (&state->nodes, nodes))
1512	return state;
1513	}
1514
1515	/* There are no appropriate state in the dfa, create the new one. */
1516	new_state = create_ci_newstate (dfa, nodes, hash);
1517	if (BE (new_state == NULL, 0))
1518	*err = REG_ESPACE;
1519
1520	return new_state;
1521	}
1522
1523	/* Search for the state whose node_set is equivalent to NODES and
1524	whose context is equivalent to CONTEXT.
1525	Return the pointer to the state, if we found it in the DFA.
1526	Otherwise create the new one and return it. In case of an error
1527	return NULL and set the error code in ERR.
1528	Note: - We assume NULL as the invalid state, then it is possible that
1529	return value is NULL and ERR is REG_NOERROR.
1530	- We never return non-NULL value in case of any errors, it is for
1531	optimization. */
1532
1533	static re_dfastate_t *
1534	internal_function __attribute_warn_unused_result__
1535	re_acquire_state_context (reg_errcode_t err, const re_dfa_t dfa,
1536	const re_node_set *nodes, unsigned int context)
1537	{
1538	re_hashval_t hash;
1539	re_dfastate_t *new_state;
1540	struct re_state_table_entry *spot;
1541	Idx i;
1542	#ifdef lint
1543	/* Suppress bogus uninitialized-variable warnings. */
1544	*err = REG_NOERROR;
1545	#endif
1546	if (nodes->nelem == 0)
1547	{
1548	*err = REG_NOERROR;
1549	return NULL;
1550	}
1551	hash = calc_state_hash (nodes, context);
1552	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1553
1554	for (i = 0 ; i < spot->num ; i++)
1555	{
1556	re_dfastate_t *state = spot->array[i];
1557	if (state->hash == hash
1558	&& state->context == context
1559	&& re_node_set_compare (state->entrance_nodes, nodes))
1560	return state;
1561	}
1562	/* There are no appropriate state in 'dfa', create the new one. */
1563	new_state = create_cd_newstate (dfa, nodes, context, hash);
1564	if (BE (new_state == NULL, 0))
1565	*err = REG_ESPACE;
1566
1567	return new_state;
1568	}
1569
1570	/* Finish initialization of the new state NEWSTATE, and using its hash value
1571	HASH put in the appropriate bucket of DFA's state table. Return value
1572	indicates the error code if failed. */
1573
1574	static reg_errcode_t
1575	__attribute_warn_unused_result__
1576	register_state (const re_dfa_t dfa, re_dfastate_t newstate,
1577	re_hashval_t hash)
1578	{
1579	struct re_state_table_entry *spot;
1580	reg_errcode_t err;
1581	Idx i;
1582
1583	newstate->hash = hash;
1584	err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1585	if (BE (err != REG_NOERROR, 0))
1586	return REG_ESPACE;
1587	for (i = 0; i < newstate->nodes.nelem; i++)
1588	{
1589	Idx elem = newstate->nodes.elems[i];
1590	if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1591	if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1592	return REG_ESPACE;
1593	}
1594
1595	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1596	if (BE (spot->alloc <= spot->num, 0))
1597	{
1598	Idx new_alloc = 2 * spot->num + 2;
1599	re_dfastate_t *new_array = re_realloc (spot->array, re_dfastate_t ,
1600	new_alloc);
1601	if (BE (new_array == NULL, 0))
1602	return REG_ESPACE;
1603	spot->array = new_array;
1604	spot->alloc = new_alloc;
1605	}
1606	spot->array[spot->num++] = newstate;
1607	return REG_NOERROR;
1608	}
1609
1610	static void
1611	free_state (re_dfastate_t *state)
1612	{
1613	re_node_set_free (&state->non_eps_nodes);
1614	re_node_set_free (&state->inveclosure);
1615	if (state->entrance_nodes != &state->nodes)
1616	{
1617	re_node_set_free (state->entrance_nodes);
1618	re_free (state->entrance_nodes);
1619	}
1620	re_node_set_free (&state->nodes);
1621	re_free (state->word_trtable);
1622	re_free (state->trtable);
1623	re_free (state);
1624	}
1625
1626	/* Create the new state which is independent of contexts.
1627	Return the new state if succeeded, otherwise return NULL. */
1628
1629	static re_dfastate_t *
1630	internal_function __attribute_warn_unused_result__
1631	create_ci_newstate (const re_dfa_t dfa, const re_node_set nodes,
1632	re_hashval_t hash)
1633	{
1634	Idx i;
1635	reg_errcode_t err;
1636	re_dfastate_t *newstate;
1637
1638	newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1639	if (BE (newstate == NULL, 0))
1640	return NULL;
1641	err = re_node_set_init_copy (&newstate->nodes, nodes);
1642	if (BE (err != REG_NOERROR, 0))
1643	{
1644	re_free (newstate);
1645	return NULL;
1646	}
1647
1648	newstate->entrance_nodes = &newstate->nodes;
1649	for (i = 0 ; i < nodes->nelem ; i++)
1650	{
1651	re_token_t *node = dfa->nodes + nodes->elems[i];
1652	re_token_type_t type = node->type;
1653	if (type == CHARACTER && !node->constraint)
1654	continue;
1655	#ifdef RE_ENABLE_I18N
1656	newstate->accept_mb \|= node->accept_mb;
1657	#endif /* RE_ENABLE_I18N */
1658
1659	/* If the state has the halt node, the state is a halt state. */
1660	if (type == END_OF_RE)
1661	newstate->halt = 1;
1662	else if (type == OP_BACK_REF)
1663	newstate->has_backref = 1;
1664	else if (type == ANCHOR \|\| node->constraint)
1665	newstate->has_constraint = 1;
1666	}
1667	err = register_state (dfa, newstate, hash);
1668	if (BE (err != REG_NOERROR, 0))
1669	{
1670	free_state (newstate);
1671	newstate = NULL;
1672	}
1673	return newstate;
1674	}
1675
1676	/* Create the new state which is depend on the context CONTEXT.
1677	Return the new state if succeeded, otherwise return NULL. */
1678
1679	static re_dfastate_t *
1680	internal_function __attribute_warn_unused_result__
1681	create_cd_newstate (const re_dfa_t dfa, const re_node_set nodes,
1682	unsigned int context, re_hashval_t hash)
1683	{
1684	Idx i, nctx_nodes = 0;
1685	reg_errcode_t err;
1686	re_dfastate_t *newstate;
1687
1688	newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1689	if (BE (newstate == NULL, 0))
1690	return NULL;
1691	err = re_node_set_init_copy (&newstate->nodes, nodes);
1692	if (BE (err != REG_NOERROR, 0))
1693	{
1694	re_free (newstate);
1695	return NULL;
1696	}
1697
1698	newstate->context = context;
1699	newstate->entrance_nodes = &newstate->nodes;
1700
1701	for (i = 0 ; i < nodes->nelem ; i++)
1702	{
1703	re_token_t *node = dfa->nodes + nodes->elems[i];
1704	re_token_type_t type = node->type;
1705	unsigned int constraint = node->constraint;
1706
1707	if (type == CHARACTER && !constraint)
1708	continue;
1709	#ifdef RE_ENABLE_I18N
1710	newstate->accept_mb \|= node->accept_mb;
1711	#endif /* RE_ENABLE_I18N */
1712
1713	/* If the state has the halt node, the state is a halt state. */
1714	if (type == END_OF_RE)
1715	newstate->halt = 1;
1716	else if (type == OP_BACK_REF)
1717	newstate->has_backref = 1;
1718
1719	if (constraint)
1720	{
1721	if (newstate->entrance_nodes == &newstate->nodes)
1722	{
1723	newstate->entrance_nodes = re_malloc (re_node_set, 1);
1724	if (BE (newstate->entrance_nodes == NULL, 0))
1725	{
1726	free_state (newstate);
1727	return NULL;
1728	}
1729	if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1730	!= REG_NOERROR)
1731	return NULL;
1732	nctx_nodes = 0;
1733	newstate->has_constraint = 1;
1734	}
1735
1736	if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1737	{
1738	re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1739	++nctx_nodes;
1740	}
1741	}
1742	}
1743	err = register_state (dfa, newstate, hash);
1744	if (BE (err != REG_NOERROR, 0))
1745	{
1746	free_state (newstate);
1747	newstate = NULL;
1748	}
1749	return newstate;
1750	}

Note: See TracBrowser for help on using the repository browser.

source: kBuild/vendor/grep/2.12/lib/regex_internal.c@ 3576

Download in other formats: