regex_internal.c@ 3613

Last change on this file since 3613 was 3613, checked in by bird, 6 months ago
src/sed: Merged in changes between 4.1.5 and 4.9 from the vendor branch. (svn merge ^{/vendor/sed/4.1.5}/vendor/sed/current .)
File size: 47.5 KB

Line
1	/* Extended regular expression matching and search library.
2	Copyright (C) 2002-2022 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4	Contributed by Isamu Hasegawa <[email protected]>.
5
6	The GNU C Library is free software; you can redistribute it and/or
7	modify it under the terms of the GNU Lesser General Public
8	License as published by the Free Software Foundation; either
9	version 2.1 of the License, or (at your option) any later version.
10
11	The GNU C Library is distributed in the hope that it will be useful,
12	but WITHOUT ANY WARRANTY; without even the implied warranty of
13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14	Lesser General Public License for more details.
15
16	You should have received a copy of the GNU Lesser General Public
17	License along with the GNU C Library; if not, see
18	<https://www.gnu.org/licenses/>. */
19
20	static void re_string_construct_common (const char *str, Idx len,
21	re_string_t *pstr,
22	RE_TRANSLATE_TYPE trans, bool icase,
23	const re_dfa_t *dfa);
24	static re_dfastate_t create_ci_newstate (const re_dfa_t dfa,
25	const re_node_set *nodes,
26	re_hashval_t hash);
27	static re_dfastate_t create_cd_newstate (const re_dfa_t dfa,
28	const re_node_set *nodes,
29	unsigned int context,
30	re_hashval_t hash);
31	static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
32	Idx new_buf_len);
33	static void build_wcs_buffer (re_string_t *pstr);
34	static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr);
35	static void build_upper_buffer (re_string_t *pstr);
36	static void re_string_translate_buffer (re_string_t *pstr);
37	static unsigned int re_string_context_at (const re_string_t *input, Idx idx,
38	int eflags) __attribute__ ((pure));
39
40
41	/* Functions for string operation. */
42
43	/* This function allocate the buffers. It is necessary to call
44	re_string_reconstruct before using the object. */
45
46	static reg_errcode_t
47	__attribute_warn_unused_result__
48	re_string_allocate (re_string_t pstr, const char str, Idx len, Idx init_len,
49	RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
50	{
51	reg_errcode_t ret;
52	Idx init_buf_len;
53
54	/* Ensure at least one character fits into the buffers. */
55	if (init_len < dfa->mb_cur_max)
56	init_len = dfa->mb_cur_max;
57	init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
58	re_string_construct_common (str, len, pstr, trans, icase, dfa);
59
60	ret = re_string_realloc_buffers (pstr, init_buf_len);
61	if (__glibc_unlikely (ret != REG_NOERROR))
62	return ret;
63
64	pstr->word_char = dfa->word_char;
65	pstr->word_ops_used = dfa->word_ops_used;
66	pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
67	pstr->valid_len = (pstr->mbs_allocated \|\| dfa->mb_cur_max > 1) ? 0 : len;
68	pstr->valid_raw_len = pstr->valid_len;
69	return REG_NOERROR;
70	}
71
72	/* This function allocate the buffers, and initialize them. */
73
74	static reg_errcode_t
75	__attribute_warn_unused_result__
76	re_string_construct (re_string_t pstr, const char str, Idx len,
77	RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
78	{
79	reg_errcode_t ret;
80	memset (pstr, '\0', sizeof (re_string_t));
81	re_string_construct_common (str, len, pstr, trans, icase, dfa);
82
83	if (len > 0)
84	{
85	ret = re_string_realloc_buffers (pstr, len + 1);
86	if (__glibc_unlikely (ret != REG_NOERROR))
87	return ret;
88	}
89	pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
90
91	if (icase)
92	{
93	if (dfa->mb_cur_max > 1)
94	{
95	while (1)
96	{
97	ret = build_wcs_upper_buffer (pstr);
98	if (__glibc_unlikely (ret != REG_NOERROR))
99	return ret;
100	if (pstr->valid_raw_len >= len)
101	break;
102	if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
103	break;
104	ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
105	if (__glibc_unlikely (ret != REG_NOERROR))
106	return ret;
107	}
108	}
109	else
110	build_upper_buffer (pstr);
111	}
112	else
113	{
114	if (dfa->mb_cur_max > 1)
115	build_wcs_buffer (pstr);
116	else
117	{
118	if (trans != NULL)
119	re_string_translate_buffer (pstr);
120	else
121	{
122	pstr->valid_len = pstr->bufs_len;
123	pstr->valid_raw_len = pstr->bufs_len;
124	}
125	}
126	}
127
128	return REG_NOERROR;
129	}
130
131	/* Helper functions for re_string_allocate, and re_string_construct. */
132
133	static reg_errcode_t
134	__attribute_warn_unused_result__
135	re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
136	{
137	if (pstr->mb_cur_max > 1)
138	{
139	wint_t *new_wcs;
140
141	/* Avoid overflow in realloc. */
142	const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
143	if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
144	< new_buf_len))
145	return REG_ESPACE;
146
147	new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
148	if (__glibc_unlikely (new_wcs == NULL))
149	return REG_ESPACE;
150	pstr->wcs = new_wcs;
151	if (pstr->offsets != NULL)
152	{
153	Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
154	if (__glibc_unlikely (new_offsets == NULL))
155	return REG_ESPACE;
156	pstr->offsets = new_offsets;
157	}
158	}
159	if (pstr->mbs_allocated)
160	{
161	unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
162	new_buf_len);
163	if (__glibc_unlikely (new_mbs == NULL))
164	return REG_ESPACE;
165	pstr->mbs = new_mbs;
166	}
167	pstr->bufs_len = new_buf_len;
168	return REG_NOERROR;
169	}
170
171
172	static void
173	re_string_construct_common (const char str, Idx len, re_string_t pstr,
174	RE_TRANSLATE_TYPE trans, bool icase,
175	const re_dfa_t *dfa)
176	{
177	pstr->raw_mbs = (const unsigned char *) str;
178	pstr->len = len;
179	pstr->raw_len = len;
180	pstr->trans = trans;
181	pstr->icase = icase;
182	pstr->mbs_allocated = (trans != NULL \|\| icase);
183	pstr->mb_cur_max = dfa->mb_cur_max;
184	pstr->is_utf8 = dfa->is_utf8;
185	pstr->map_notascii = dfa->map_notascii;
186	pstr->stop = pstr->len;
187	pstr->raw_stop = pstr->stop;
188	}
189
190
191	/* Build wide character buffer PSTR->WCS.
192	If the byte sequence of the string are:
193	<mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
194	Then wide character buffer will be:
195	<wc1> , WEOF , <wc2> , WEOF , <wc3>
196	We use WEOF for padding, they indicate that the position isn't
197	a first byte of a multibyte character.
198
199	Note that this function assumes PSTR->VALID_LEN elements are already
200	built and starts from PSTR->VALID_LEN. */
201
202	static void
203	build_wcs_buffer (re_string_t *pstr)
204	{
205	#ifdef _LIBC
206	unsigned char buf[MB_LEN_MAX];
207	DEBUG_ASSERT (MB_LEN_MAX >= pstr->mb_cur_max);
208	#else
209	unsigned char buf[64];
210	#endif
211	mbstate_t prev_st;
212	Idx byte_idx, end_idx, remain_len;
213	size_t mbclen;
214
215	/* Build the buffers from pstr->valid_len to either pstr->len or
216	pstr->bufs_len. */
217	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
218	for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
219	{
220	wchar_t wc;
221	const char *p;
222
223	remain_len = end_idx - byte_idx;
224	prev_st = pstr->cur_state;
225	/* Apply the translation if we need. */
226	if (__glibc_unlikely (pstr->trans != NULL))
227	{
228	int i, ch;
229
230	for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
231	{
232	ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
233	buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
234	}
235	p = (const char *) buf;
236	}
237	else
238	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
239	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
240	if (__glibc_unlikely (mbclen == (size_t) -1 \|\| mbclen == 0
241	\|\| (mbclen == (size_t) -2
242	&& pstr->bufs_len >= pstr->len)))
243	{
244	/* We treat these cases as a singlebyte character. */
245	mbclen = 1;
246	wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
247	if (__glibc_unlikely (pstr->trans != NULL))
248	wc = pstr->trans[wc];
249	pstr->cur_state = prev_st;
250	}
251	else if (__glibc_unlikely (mbclen == (size_t) -2))
252	{
253	/* The buffer doesn't have enough space, finish to build. */
254	pstr->cur_state = prev_st;
255	break;
256	}
257
258	/* Write wide character and padding. */
259	pstr->wcs[byte_idx++] = wc;
260	/* Write paddings. */
261	for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
262	pstr->wcs[byte_idx++] = WEOF;
263	}
264	pstr->valid_len = byte_idx;
265	pstr->valid_raw_len = byte_idx;
266	}
267
268	/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
269	but for REG_ICASE. */
270
271	static reg_errcode_t
272	__attribute_warn_unused_result__
273	build_wcs_upper_buffer (re_string_t *pstr)
274	{
275	mbstate_t prev_st;
276	Idx src_idx, byte_idx, end_idx, remain_len;
277	size_t mbclen;
278	#ifdef _LIBC
279	char buf[MB_LEN_MAX];
280	DEBUG_ASSERT (pstr->mb_cur_max <= MB_LEN_MAX);
281	#else
282	char buf[64];
283	#endif
284
285	byte_idx = pstr->valid_len;
286	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
287
288	/* The following optimization assumes that ASCII characters can be
289	mapped to wide characters with a simple cast. */
290	if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
291	{
292	while (byte_idx < end_idx)
293	{
294	wchar_t wc;
295	unsigned char ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
296
297	if (isascii (ch) && mbsinit (&pstr->cur_state))
298	{
299	/* The next step uses the assumption that wchar_t is encoded
300	ASCII-safe: all ASCII values can be converted like this. */
301	wchar_t wcu = __towupper (ch);
302	if (isascii (wcu))
303	{
304	pstr->mbs[byte_idx] = wcu;
305	pstr->wcs[byte_idx] = wcu;
306	byte_idx++;
307	continue;
308	}
309	}
310
311	remain_len = end_idx - byte_idx;
312	prev_st = pstr->cur_state;
313	mbclen = __mbrtowc (&wc,
314	((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
315	+ byte_idx), remain_len, &pstr->cur_state);
316	if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
317	{
318	wchar_t wcu = __towupper (wc);
319	if (wcu != wc)
320	{
321	size_t mbcdlen;
322
323	mbcdlen = __wcrtomb (buf, wcu, &prev_st);
324	if (__glibc_likely (mbclen == mbcdlen))
325	memcpy (pstr->mbs + byte_idx, buf, mbclen);
326	else
327	{
328	src_idx = byte_idx;
329	goto offsets_needed;
330	}
331	}
332	else
333	memcpy (pstr->mbs + byte_idx,
334	pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
335	pstr->wcs[byte_idx++] = wcu;
336	/* Write paddings. */
337	for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
338	pstr->wcs[byte_idx++] = WEOF;
339	}
340	else if (mbclen == (size_t) -1 \|\| mbclen == 0
341	\|\| (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
342	{
343	/* It is an invalid character, an incomplete character
344	at the end of the string, or '\0'. Just use the byte. */
345	pstr->mbs[byte_idx] = ch;
346	/* And also cast it to wide char. */
347	pstr->wcs[byte_idx++] = (wchar_t) ch;
348	if (__glibc_unlikely (mbclen == (size_t) -1))
349	pstr->cur_state = prev_st;
350	}
351	else
352	{
353	/* The buffer doesn't have enough space, finish to build. */
354	pstr->cur_state = prev_st;
355	break;
356	}
357	}
358	pstr->valid_len = byte_idx;
359	pstr->valid_raw_len = byte_idx;
360	return REG_NOERROR;
361	}
362	else
363	for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
364	{
365	wchar_t wc;
366	const char *p;
367	offsets_needed:
368	remain_len = end_idx - byte_idx;
369	prev_st = pstr->cur_state;
370	if (__glibc_unlikely (pstr->trans != NULL))
371	{
372	int i, ch;
373
374	for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
375	{
376	ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
377	buf[i] = pstr->trans[ch];
378	}
379	p = (const char *) buf;
380	}
381	else
382	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
383	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
384	if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
385	{
386	wchar_t wcu = __towupper (wc);
387	if (wcu != wc)
388	{
389	size_t mbcdlen;
390
391	mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
392	if (__glibc_likely (mbclen == mbcdlen))
393	memcpy (pstr->mbs + byte_idx, buf, mbclen);
394	else if (mbcdlen != (size_t) -1)
395	{
396	size_t i;
397
398	if (byte_idx + mbcdlen > pstr->bufs_len)
399	{
400	pstr->cur_state = prev_st;
401	break;
402	}
403
404	if (pstr->offsets == NULL)
405	{
406	pstr->offsets = re_malloc (Idx, pstr->bufs_len);
407
408	if (pstr->offsets == NULL)
409	return REG_ESPACE;
410	}
411	if (!pstr->offsets_needed)
412	{
413	for (i = 0; i < (size_t) byte_idx; ++i)
414	pstr->offsets[i] = i;
415	pstr->offsets_needed = 1;
416	}
417
418	memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
419	pstr->wcs[byte_idx] = wcu;
420	pstr->offsets[byte_idx] = src_idx;
421	for (i = 1; i < mbcdlen; ++i)
422	{
423	pstr->offsets[byte_idx + i]
424	= src_idx + (i < mbclen ? i : mbclen - 1);
425	pstr->wcs[byte_idx + i] = WEOF;
426	}
427	pstr->len += mbcdlen - mbclen;
428	if (pstr->raw_stop > src_idx)
429	pstr->stop += mbcdlen - mbclen;
430	end_idx = (pstr->bufs_len > pstr->len)
431	? pstr->len : pstr->bufs_len;
432	byte_idx += mbcdlen;
433	src_idx += mbclen;
434	continue;
435	}
436	else
437	memcpy (pstr->mbs + byte_idx, p, mbclen);
438	}
439	else
440	memcpy (pstr->mbs + byte_idx, p, mbclen);
441
442	if (__glibc_unlikely (pstr->offsets_needed != 0))
443	{
444	size_t i;
445	for (i = 0; i < mbclen; ++i)
446	pstr->offsets[byte_idx + i] = src_idx + i;
447	}
448	src_idx += mbclen;
449
450	pstr->wcs[byte_idx++] = wcu;
451	/* Write paddings. */
452	for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
453	pstr->wcs[byte_idx++] = WEOF;
454	}
455	else if (mbclen == (size_t) -1 \|\| mbclen == 0
456	\|\| (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
457	{
458	/* It is an invalid character or '\0'. Just use the byte. */
459	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
460
461	if (__glibc_unlikely (pstr->trans != NULL))
462	ch = pstr->trans [ch];
463	pstr->mbs[byte_idx] = ch;
464
465	if (__glibc_unlikely (pstr->offsets_needed != 0))
466	pstr->offsets[byte_idx] = src_idx;
467	++src_idx;
468
469	/* And also cast it to wide char. */
470	pstr->wcs[byte_idx++] = (wchar_t) ch;
471	if (__glibc_unlikely (mbclen == (size_t) -1))
472	pstr->cur_state = prev_st;
473	}
474	else
475	{
476	/* The buffer doesn't have enough space, finish to build. */
477	pstr->cur_state = prev_st;
478	break;
479	}
480	}
481	pstr->valid_len = byte_idx;
482	pstr->valid_raw_len = src_idx;
483	return REG_NOERROR;
484	}
485
486	/* Skip characters until the index becomes greater than NEW_RAW_IDX.
487	Return the index. */
488
489	static Idx
490	re_string_skip_chars (re_string_t pstr, Idx new_raw_idx, wint_t last_wc)
491	{
492	mbstate_t prev_st;
493	Idx rawbuf_idx;
494	size_t mbclen;
495	wint_t wc = WEOF;
496
497	/* Skip the characters which are not necessary to check. */
498	for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
499	rawbuf_idx < new_raw_idx;)
500	{
501	wchar_t wc2;
502	Idx remain_len = pstr->raw_len - rawbuf_idx;
503	prev_st = pstr->cur_state;
504	mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
505	remain_len, &pstr->cur_state);
506	if (__glibc_unlikely (mbclen == (size_t) -2 \|\| mbclen == (size_t) -1
507	\|\| mbclen == 0))
508	{
509	/* We treat these cases as a single byte character. */
510	if (mbclen == 0 \|\| remain_len == 0)
511	wc = L'\0';
512	else
513	wc = (unsigned char ) (pstr->raw_mbs + rawbuf_idx);
514	mbclen = 1;
515	pstr->cur_state = prev_st;
516	}
517	else
518	wc = wc2;
519	/* Then proceed the next character. */
520	rawbuf_idx += mbclen;
521	}
522	*last_wc = wc;
523	return rawbuf_idx;
524	}
525
526	/* Build the buffer PSTR->MBS, and apply the translation if we need.
527	This function is used in case of REG_ICASE. */
528
529	static void
530	build_upper_buffer (re_string_t *pstr)
531	{
532	Idx char_idx, end_idx;
533	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
534
535	for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
536	{
537	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
538	if (__glibc_unlikely (pstr->trans != NULL))
539	ch = pstr->trans[ch];
540	pstr->mbs[char_idx] = toupper (ch);
541	}
542	pstr->valid_len = char_idx;
543	pstr->valid_raw_len = char_idx;
544	}
545
546	/* Apply TRANS to the buffer in PSTR. */
547
548	static void
549	re_string_translate_buffer (re_string_t *pstr)
550	{
551	Idx buf_idx, end_idx;
552	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
553
554	for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
555	{
556	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
557	pstr->mbs[buf_idx] = pstr->trans[ch];
558	}
559
560	pstr->valid_len = buf_idx;
561	pstr->valid_raw_len = buf_idx;
562	}
563
564	/* This function re-construct the buffers.
565	Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
566	convert to upper case in case of REG_ICASE, apply translation. */
567
568	static reg_errcode_t
569	__attribute_warn_unused_result__
570	re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
571	{
572	Idx offset;
573
574	if (__glibc_unlikely (pstr->raw_mbs_idx <= idx))
575	offset = idx - pstr->raw_mbs_idx;
576	else
577	{
578	/* Reset buffer. */
579	if (pstr->mb_cur_max > 1)
580	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
581	pstr->len = pstr->raw_len;
582	pstr->stop = pstr->raw_stop;
583	pstr->valid_len = 0;
584	pstr->raw_mbs_idx = 0;
585	pstr->valid_raw_len = 0;
586	pstr->offsets_needed = 0;
587	pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
588	: CONTEXT_NEWLINE \| CONTEXT_BEGBUF);
589	if (!pstr->mbs_allocated)
590	pstr->mbs = (unsigned char *) pstr->raw_mbs;
591	offset = idx;
592	}
593
594	if (__glibc_likely (offset != 0))
595	{
596	/* Should the already checked characters be kept? */
597	if (__glibc_likely (offset < pstr->valid_raw_len))
598	{
599	/* Yes, move them to the front of the buffer. */
600	if (__glibc_unlikely (pstr->offsets_needed))
601	{
602	Idx low = 0, high = pstr->valid_len, mid;
603	do
604	{
605	mid = (high + low) / 2;
606	if (pstr->offsets[mid] > offset)
607	high = mid;
608	else if (pstr->offsets[mid] < offset)
609	low = mid + 1;
610	else
611	break;
612	}
613	while (low < high);
614	if (pstr->offsets[mid] < offset)
615	++mid;
616	pstr->tip_context = re_string_context_at (pstr, mid - 1,
617	eflags);
618	/* This can be quite complicated, so handle specially
619	only the common and easy case where the character with
620	different length representation of lower and upper
621	case is present at or after offset. */
622	if (pstr->valid_len > offset
623	&& mid == offset && pstr->offsets[mid] == offset)
624	{
625	memmove (pstr->wcs, pstr->wcs + offset,
626	(pstr->valid_len - offset) * sizeof (wint_t));
627	memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
628	pstr->valid_len -= offset;
629	pstr->valid_raw_len -= offset;
630	for (low = 0; low < pstr->valid_len; low++)
631	pstr->offsets[low] = pstr->offsets[low + offset] - offset;
632	}
633	else
634	{
635	/* Otherwise, just find out how long the partial multibyte
636	character at offset is and fill it with WEOF/255. */
637	pstr->len = pstr->raw_len - idx + offset;
638	pstr->stop = pstr->raw_stop - idx + offset;
639	pstr->offsets_needed = 0;
640	while (mid > 0 && pstr->offsets[mid - 1] == offset)
641	--mid;
642	while (mid < pstr->valid_len)
643	if (pstr->wcs[mid] != WEOF)
644	break;
645	else
646	++mid;
647	if (mid == pstr->valid_len)
648	pstr->valid_len = 0;
649	else
650	{
651	pstr->valid_len = pstr->offsets[mid] - offset;
652	if (pstr->valid_len)
653	{
654	for (low = 0; low < pstr->valid_len; ++low)
655	pstr->wcs[low] = WEOF;
656	memset (pstr->mbs, 255, pstr->valid_len);
657	}
658	}
659	pstr->valid_raw_len = pstr->valid_len;
660	}
661	}
662	else
663	{
664	pstr->tip_context = re_string_context_at (pstr, offset - 1,
665	eflags);
666	if (pstr->mb_cur_max > 1)
667	memmove (pstr->wcs, pstr->wcs + offset,
668	(pstr->valid_len - offset) * sizeof (wint_t));
669	if (__glibc_unlikely (pstr->mbs_allocated))
670	memmove (pstr->mbs, pstr->mbs + offset,
671	pstr->valid_len - offset);
672	pstr->valid_len -= offset;
673	pstr->valid_raw_len -= offset;
674	DEBUG_ASSERT (pstr->valid_len > 0);
675	}
676	}
677	else
678	{
679	/* No, skip all characters until IDX. */
680	Idx prev_valid_len = pstr->valid_len;
681
682	if (__glibc_unlikely (pstr->offsets_needed))
683	{
684	pstr->len = pstr->raw_len - idx + offset;
685	pstr->stop = pstr->raw_stop - idx + offset;
686	pstr->offsets_needed = 0;
687	}
688	pstr->valid_len = 0;
689	if (pstr->mb_cur_max > 1)
690	{
691	Idx wcs_idx;
692	wint_t wc = WEOF;
693
694	if (pstr->is_utf8)
695	{
696	const unsigned char raw, p, *end;
697
698	/* Special case UTF-8. Multi-byte chars start with any
699	byte other than 0x80 - 0xbf. */
700	raw = pstr->raw_mbs + pstr->raw_mbs_idx;
701	end = raw + (offset - pstr->mb_cur_max);
702	if (end < pstr->raw_mbs)
703	end = pstr->raw_mbs;
704	p = raw + offset - 1;
705	#ifdef _LIBC
706	/* We know the wchar_t encoding is UCS4, so for the simple
707	case, ASCII characters, skip the conversion step. */
708	if (isascii (*p) && __glibc_likely (pstr->trans == NULL))
709	{
710	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
711	/* pstr->valid_len = 0; */
712	wc = (wchar_t) *p;
713	}
714	else
715	#endif
716	for (; p >= end; --p)
717	if ((*p & 0xc0) != 0x80)
718	{
719	mbstate_t cur_state;
720	wchar_t wc2;
721	Idx mlen = raw + pstr->len - p;
722	unsigned char buf[6];
723	size_t mbclen;
724
725	const unsigned char *pp = p;
726	if (__glibc_unlikely (pstr->trans != NULL))
727	{
728	int i = mlen < 6 ? mlen : 6;
729	while (--i >= 0)
730	buf[i] = pstr->trans[p[i]];
731	pp = buf;
732	}
733	/* XXX Don't use mbrtowc, we know which conversion
734	to use (UTF-8 -> UCS4). */
735	memset (&cur_state, 0, sizeof (cur_state));
736	mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
737	&cur_state);
738	if (raw + offset - p <= mbclen
739	&& mbclen < (size_t) -2)
740	{
741	memset (&pstr->cur_state, '\0',
742	sizeof (mbstate_t));
743	pstr->valid_len = mbclen - (raw + offset - p);
744	wc = wc2;
745	}
746	break;
747	}
748	}
749
750	if (wc == WEOF)
751	pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
752	if (wc == WEOF)
753	pstr->tip_context
754	= re_string_context_at (pstr, prev_valid_len - 1, eflags);
755	else
756	pstr->tip_context = ((__glibc_unlikely (pstr->word_ops_used != 0)
757	&& IS_WIDE_WORD_CHAR (wc))
758	? CONTEXT_WORD
759	: ((IS_WIDE_NEWLINE (wc)
760	&& pstr->newline_anchor)
761	? CONTEXT_NEWLINE : 0));
762	if (__glibc_unlikely (pstr->valid_len))
763	{
764	for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
765	pstr->wcs[wcs_idx] = WEOF;
766	if (pstr->mbs_allocated)
767	memset (pstr->mbs, 255, pstr->valid_len);
768	}
769	pstr->valid_raw_len = pstr->valid_len;
770	}
771	else
772	{
773	int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
774	pstr->valid_raw_len = 0;
775	if (pstr->trans)
776	c = pstr->trans[c];
777	pstr->tip_context = (bitset_contain (pstr->word_char, c)
778	? CONTEXT_WORD
779	: ((IS_NEWLINE (c) && pstr->newline_anchor)
780	? CONTEXT_NEWLINE : 0));
781	}
782	}
783	if (!__glibc_unlikely (pstr->mbs_allocated))
784	pstr->mbs += offset;
785	}
786	pstr->raw_mbs_idx = idx;
787	pstr->len -= offset;
788	pstr->stop -= offset;
789
790	/* Then build the buffers. */
791	if (pstr->mb_cur_max > 1)
792	{
793	if (pstr->icase)
794	{
795	reg_errcode_t ret = build_wcs_upper_buffer (pstr);
796	if (__glibc_unlikely (ret != REG_NOERROR))
797	return ret;
798	}
799	else
800	build_wcs_buffer (pstr);
801	}
802	else
803	if (__glibc_unlikely (pstr->mbs_allocated))
804	{
805	if (pstr->icase)
806	build_upper_buffer (pstr);
807	else if (pstr->trans != NULL)
808	re_string_translate_buffer (pstr);
809	}
810	else
811	pstr->valid_len = pstr->len;
812
813	pstr->cur_idx = 0;
814	return REG_NOERROR;
815	}
816
817	static unsigned char
818	__attribute__ ((pure))
819	re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
820	{
821	int ch;
822	Idx off;
823
824	/* Handle the common (easiest) cases first. */
825	if (__glibc_likely (!pstr->mbs_allocated))
826	return re_string_peek_byte (pstr, idx);
827
828	if (pstr->mb_cur_max > 1
829	&& ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
830	return re_string_peek_byte (pstr, idx);
831
832	off = pstr->cur_idx + idx;
833	if (pstr->offsets_needed)
834	off = pstr->offsets[off];
835
836	ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
837
838	/* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
839	this function returns CAPITAL LETTER I instead of first byte of
840	DOTLESS SMALL LETTER I. The latter would confuse the parser,
841	since peek_byte_case doesn't advance cur_idx in any way. */
842	if (pstr->offsets_needed && !isascii (ch))
843	return re_string_peek_byte (pstr, idx);
844
845	return ch;
846	}
847
848	static unsigned char
849	re_string_fetch_byte_case (re_string_t *pstr)
850	{
851	if (__glibc_likely (!pstr->mbs_allocated))
852	return re_string_fetch_byte (pstr);
853
854	if (pstr->offsets_needed)
855	{
856	Idx off;
857	int ch;
858
859	/* For tr_TR.UTF-8 [[:islower:]] there is
860	[[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
861	in that case the whole multi-byte character and return
862	the original letter. On the other side, with
863	[[: DOTLESS SMALL LETTER I return [[:I, as doing
864	anything else would complicate things too much. */
865
866	if (!re_string_first_byte (pstr, pstr->cur_idx))
867	return re_string_fetch_byte (pstr);
868
869	off = pstr->offsets[pstr->cur_idx];
870	ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
871
872	if (! isascii (ch))
873	return re_string_fetch_byte (pstr);
874
875	re_string_skip_bytes (pstr,
876	re_string_char_size_at (pstr, pstr->cur_idx));
877	return ch;
878	}
879
880	return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
881	}
882
883	static void
884	re_string_destruct (re_string_t *pstr)
885	{
886	re_free (pstr->wcs);
887	re_free (pstr->offsets);
888	if (pstr->mbs_allocated)
889	re_free (pstr->mbs);
890	}
891
892	/* Return the context at IDX in INPUT. */
893
894	static unsigned int
895	re_string_context_at (const re_string_t *input, Idx idx, int eflags)
896	{
897	int c;
898	if (__glibc_unlikely (idx < 0))
899	/* In this case, we use the value stored in input->tip_context,
900	since we can't know the character in input->mbs[-1] here. */
901	return input->tip_context;
902	if (__glibc_unlikely (idx == input->len))
903	return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
904	: CONTEXT_NEWLINE \| CONTEXT_ENDBUF);
905	if (input->mb_cur_max > 1)
906	{
907	wint_t wc;
908	Idx wc_idx = idx;
909	while(input->wcs[wc_idx] == WEOF)
910	{
911	DEBUG_ASSERT (wc_idx >= 0);
912	--wc_idx;
913	if (wc_idx < 0)
914	return input->tip_context;
915	}
916	wc = input->wcs[wc_idx];
917	if (__glibc_unlikely (input->word_ops_used != 0)
918	&& IS_WIDE_WORD_CHAR (wc))
919	return CONTEXT_WORD;
920	return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
921	? CONTEXT_NEWLINE : 0);
922	}
923	else
924	{
925	c = re_string_byte_at (input, idx);
926	if (bitset_contain (input->word_char, c))
927	return CONTEXT_WORD;
928	return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
929	}
930	}
931
932
933	/* Functions for set operation. */
934
935	static reg_errcode_t
936	__attribute_warn_unused_result__
937	re_node_set_alloc (re_node_set *set, Idx size)
938	{
939	set->alloc = size;
940	set->nelem = 0;
941	set->elems = re_malloc (Idx, size);
942	if (__glibc_unlikely (set->elems == NULL)
943	&& (MALLOC_0_IS_NONNULL \|\| size != 0))
944	return REG_ESPACE;
945	return REG_NOERROR;
946	}
947
948	static reg_errcode_t
949	__attribute_warn_unused_result__
950	re_node_set_init_1 (re_node_set *set, Idx elem)
951	{
952	set->alloc = 1;
953	set->nelem = 1;
954	set->elems = re_malloc (Idx, 1);
955	if (__glibc_unlikely (set->elems == NULL))
956	{
957	set->alloc = set->nelem = 0;
958	return REG_ESPACE;
959	}
960	set->elems[0] = elem;
961	return REG_NOERROR;
962	}
963
964	static reg_errcode_t
965	__attribute_warn_unused_result__
966	re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
967	{
968	set->alloc = 2;
969	set->elems = re_malloc (Idx, 2);
970	if (__glibc_unlikely (set->elems == NULL))
971	return REG_ESPACE;
972	if (elem1 == elem2)
973	{
974	set->nelem = 1;
975	set->elems[0] = elem1;
976	}
977	else
978	{
979	set->nelem = 2;
980	if (elem1 < elem2)
981	{
982	set->elems[0] = elem1;
983	set->elems[1] = elem2;
984	}
985	else
986	{
987	set->elems[0] = elem2;
988	set->elems[1] = elem1;
989	}
990	}
991	return REG_NOERROR;
992	}
993
994	static reg_errcode_t
995	__attribute_warn_unused_result__
996	re_node_set_init_copy (re_node_set dest, const re_node_set src)
997	{
998	dest->nelem = src->nelem;
999	if (src->nelem > 0)
1000	{
1001	dest->alloc = dest->nelem;
1002	dest->elems = re_malloc (Idx, dest->alloc);
1003	if (__glibc_unlikely (dest->elems == NULL))
1004	{
1005	dest->alloc = dest->nelem = 0;
1006	return REG_ESPACE;
1007	}
1008	memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1009	}
1010	else
1011	re_node_set_init_empty (dest);
1012	return REG_NOERROR;
1013	}
1014
1015	/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1016	DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1017	Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1018
1019	static reg_errcode_t
1020	__attribute_warn_unused_result__
1021	re_node_set_add_intersect (re_node_set dest, const re_node_set src1,
1022	const re_node_set *src2)
1023	{
1024	Idx i1, i2, is, id, delta, sbase;
1025	if (src1->nelem == 0 \|\| src2->nelem == 0)
1026	return REG_NOERROR;
1027
1028	/* We need dest->nelem + 2 * elems_in_intersection; this is a
1029	conservative estimate. */
1030	if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1031	{
1032	Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1033	Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1034	if (__glibc_unlikely (new_elems == NULL))
1035	return REG_ESPACE;
1036	dest->elems = new_elems;
1037	dest->alloc = new_alloc;
1038	}
1039
1040	/* Find the items in the intersection of SRC1 and SRC2, and copy
1041	into the top of DEST those that are not already in DEST itself. */
1042	sbase = dest->nelem + src1->nelem + src2->nelem;
1043	i1 = src1->nelem - 1;
1044	i2 = src2->nelem - 1;
1045	id = dest->nelem - 1;
1046	for (;;)
1047	{
1048	if (src1->elems[i1] == src2->elems[i2])
1049	{
1050	/* Try to find the item in DEST. Maybe we could binary search? */
1051	while (id >= 0 && dest->elems[id] > src1->elems[i1])
1052	--id;
1053
1054	if (id < 0 \|\| dest->elems[id] != src1->elems[i1])
1055	dest->elems[--sbase] = src1->elems[i1];
1056
1057	if (--i1 < 0 \|\| --i2 < 0)
1058	break;
1059	}
1060
1061	/* Lower the highest of the two items. */
1062	else if (src1->elems[i1] < src2->elems[i2])
1063	{
1064	if (--i2 < 0)
1065	break;
1066	}
1067	else
1068	{
1069	if (--i1 < 0)
1070	break;
1071	}
1072	}
1073
1074	id = dest->nelem - 1;
1075	is = dest->nelem + src1->nelem + src2->nelem - 1;
1076	delta = is - sbase + 1;
1077
1078	/* Now copy. When DELTA becomes zero, the remaining
1079	DEST elements are already in place; this is more or
1080	less the same loop that is in re_node_set_merge. */
1081	dest->nelem += delta;
1082	if (delta > 0 && id >= 0)
1083	for (;;)
1084	{
1085	if (dest->elems[is] > dest->elems[id])
1086	{
1087	/* Copy from the top. */
1088	dest->elems[id + delta--] = dest->elems[is--];
1089	if (delta == 0)
1090	break;
1091	}
1092	else
1093	{
1094	/* Slide from the bottom. */
1095	dest->elems[id + delta] = dest->elems[id];
1096	if (--id < 0)
1097	break;
1098	}
1099	}
1100
1101	/* Copy remaining SRC elements. */
1102	memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1103
1104	return REG_NOERROR;
1105	}
1106
1107	/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1108	DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1109
1110	static reg_errcode_t
1111	__attribute_warn_unused_result__
1112	re_node_set_init_union (re_node_set dest, const re_node_set src1,
1113	const re_node_set *src2)
1114	{
1115	Idx i1, i2, id;
1116	if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1117	{
1118	dest->alloc = src1->nelem + src2->nelem;
1119	dest->elems = re_malloc (Idx, dest->alloc);
1120	if (__glibc_unlikely (dest->elems == NULL))
1121	return REG_ESPACE;
1122	}
1123	else
1124	{
1125	if (src1 != NULL && src1->nelem > 0)
1126	return re_node_set_init_copy (dest, src1);
1127	else if (src2 != NULL && src2->nelem > 0)
1128	return re_node_set_init_copy (dest, src2);
1129	else
1130	re_node_set_init_empty (dest);
1131	return REG_NOERROR;
1132	}
1133	for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1134	{
1135	if (src1->elems[i1] > src2->elems[i2])
1136	{
1137	dest->elems[id++] = src2->elems[i2++];
1138	continue;
1139	}
1140	if (src1->elems[i1] == src2->elems[i2])
1141	++i2;
1142	dest->elems[id++] = src1->elems[i1++];
1143	}
1144	if (i1 < src1->nelem)
1145	{
1146	memcpy (dest->elems + id, src1->elems + i1,
1147	(src1->nelem - i1) * sizeof (Idx));
1148	id += src1->nelem - i1;
1149	}
1150	else if (i2 < src2->nelem)
1151	{
1152	memcpy (dest->elems + id, src2->elems + i2,
1153	(src2->nelem - i2) * sizeof (Idx));
1154	id += src2->nelem - i2;
1155	}
1156	dest->nelem = id;
1157	return REG_NOERROR;
1158	}
1159
1160	/* Calculate the union set of the sets DEST and SRC. And store it to
1161	DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1162
1163	static reg_errcode_t
1164	__attribute_warn_unused_result__
1165	re_node_set_merge (re_node_set dest, const re_node_set src)
1166	{
1167	Idx is, id, sbase, delta;
1168	if (src == NULL \|\| src->nelem == 0)
1169	return REG_NOERROR;
1170	if (dest->alloc < 2 * src->nelem + dest->nelem)
1171	{
1172	Idx new_alloc = 2 * (src->nelem + dest->alloc);
1173	Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1174	if (__glibc_unlikely (new_buffer == NULL))
1175	return REG_ESPACE;
1176	dest->elems = new_buffer;
1177	dest->alloc = new_alloc;
1178	}
1179
1180	if (__glibc_unlikely (dest->nelem == 0))
1181	{
1182	/* Although we already guaranteed above that dest->alloc != 0 and
1183	therefore dest->elems != NULL, add a debug assertion to pacify
1184	GCC 11.2.1's -fanalyzer. */
1185	DEBUG_ASSERT (dest->elems);
1186	dest->nelem = src->nelem;
1187	memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1188	return REG_NOERROR;
1189	}
1190
1191	/* Copy into the top of DEST the items of SRC that are not
1192	found in DEST. Maybe we could binary search in DEST? */
1193	for (sbase = dest->nelem + 2 * src->nelem,
1194	is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1195	{
1196	if (dest->elems[id] == src->elems[is])
1197	is--, id--;
1198	else if (dest->elems[id] < src->elems[is])
1199	dest->elems[--sbase] = src->elems[is--];
1200	else /* if (dest->elems[id] > src->elems[is]) */
1201	--id;
1202	}
1203
1204	if (is >= 0)
1205	{
1206	/* If DEST is exhausted, the remaining items of SRC must be unique. */
1207	sbase -= is + 1;
1208	memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1209	}
1210
1211	id = dest->nelem - 1;
1212	is = dest->nelem + 2 * src->nelem - 1;
1213	delta = is - sbase + 1;
1214	if (delta == 0)
1215	return REG_NOERROR;
1216
1217	/* Now copy. When DELTA becomes zero, the remaining
1218	DEST elements are already in place. */
1219	dest->nelem += delta;
1220	for (;;)
1221	{
1222	if (dest->elems[is] > dest->elems[id])
1223	{
1224	/* Copy from the top. */
1225	dest->elems[id + delta--] = dest->elems[is--];
1226	if (delta == 0)
1227	break;
1228	}
1229	else
1230	{
1231	/* Slide from the bottom. */
1232	dest->elems[id + delta] = dest->elems[id];
1233	if (--id < 0)
1234	{
1235	/* Copy remaining SRC elements. */
1236	memcpy (dest->elems, dest->elems + sbase,
1237	delta * sizeof (Idx));
1238	break;
1239	}
1240	}
1241	}
1242
1243	return REG_NOERROR;
1244	}
1245
1246	/* Insert the new element ELEM to the re_node_set* SET.
1247	SET should not already have ELEM.
1248	Return true if successful. */
1249
1250	static bool
1251	__attribute_warn_unused_result__
1252	re_node_set_insert (re_node_set *set, Idx elem)
1253	{
1254	Idx idx;
1255	/* In case the set is empty. */
1256	if (set->alloc == 0)
1257	return __glibc_likely (re_node_set_init_1 (set, elem) == REG_NOERROR);
1258
1259	if (__glibc_unlikely (set->nelem) == 0)
1260	{
1261	/* Although we already guaranteed above that set->alloc != 0 and
1262	therefore set->elems != NULL, add a debug assertion to pacify
1263	GCC 11.2 -fanalyzer. */
1264	DEBUG_ASSERT (set->elems);
1265	set->elems[0] = elem;
1266	++set->nelem;
1267	return true;
1268	}
1269
1270	/* Realloc if we need. */
1271	if (set->alloc == set->nelem)
1272	{
1273	Idx *new_elems;
1274	set->alloc = set->alloc * 2;
1275	new_elems = re_realloc (set->elems, Idx, set->alloc);
1276	if (__glibc_unlikely (new_elems == NULL))
1277	return false;
1278	set->elems = new_elems;
1279	}
1280
1281	/* Move the elements which follows the new element. Test the
1282	first element separately to skip a check in the inner loop. */
1283	if (elem < set->elems[0])
1284	{
1285	for (idx = set->nelem; idx > 0; idx--)
1286	set->elems[idx] = set->elems[idx - 1];
1287	}
1288	else
1289	{
1290	for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1291	set->elems[idx] = set->elems[idx - 1];
1292	DEBUG_ASSERT (set->elems[idx - 1] < elem);
1293	}
1294
1295	/* Insert the new element. */
1296	set->elems[idx] = elem;
1297	++set->nelem;
1298	return true;
1299	}
1300
1301	/* Insert the new element ELEM to the re_node_set* SET.
1302	SET should not already have any element greater than or equal to ELEM.
1303	Return true if successful. */
1304
1305	static bool
1306	__attribute_warn_unused_result__
1307	re_node_set_insert_last (re_node_set *set, Idx elem)
1308	{
1309	/* Realloc if we need. */
1310	if (set->alloc == set->nelem)
1311	{
1312	Idx *new_elems;
1313	set->alloc = (set->alloc + 1) * 2;
1314	new_elems = re_realloc (set->elems, Idx, set->alloc);
1315	if (__glibc_unlikely (new_elems == NULL))
1316	return false;
1317	set->elems = new_elems;
1318	}
1319
1320	/* Insert the new element. */
1321	set->elems[set->nelem++] = elem;
1322	return true;
1323	}
1324
1325	/* Compare two node sets SET1 and SET2.
1326	Return true if SET1 and SET2 are equivalent. */
1327
1328	static bool
1329	__attribute__ ((pure))
1330	re_node_set_compare (const re_node_set set1, const re_node_set set2)
1331	{
1332	Idx i;
1333	if (set1 == NULL \|\| set2 == NULL \|\| set1->nelem != set2->nelem)
1334	return false;
1335	for (i = set1->nelem ; --i >= 0 ; )
1336	if (set1->elems[i] != set2->elems[i])
1337	return false;
1338	return true;
1339	}
1340
1341	/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1342
1343	static Idx
1344	__attribute__ ((pure))
1345	re_node_set_contains (const re_node_set *set, Idx elem)
1346	{
1347	__re_size_t idx, right, mid;
1348	if (set->nelem <= 0)
1349	return 0;
1350
1351	/* Binary search the element. */
1352	idx = 0;
1353	right = set->nelem - 1;
1354	while (idx < right)
1355	{
1356	mid = (idx + right) / 2;
1357	if (set->elems[mid] < elem)
1358	idx = mid + 1;
1359	else
1360	right = mid;
1361	}
1362	return set->elems[idx] == elem ? idx + 1 : 0;
1363	}
1364
1365	static void
1366	re_node_set_remove_at (re_node_set *set, Idx idx)
1367	{
1368	if (idx < 0 \|\| idx >= set->nelem)
1369	return;
1370	--set->nelem;
1371	for (; idx < set->nelem; idx++)
1372	set->elems[idx] = set->elems[idx + 1];
1373	}
1374
1375
1376
1377	/* Add the token TOKEN to dfa->nodes, and return the index of the token.
1378	Or return -1 if an error occurred. */
1379
1380	static Idx
1381	re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1382	{
1383	if (__glibc_unlikely (dfa->nodes_len >= dfa->nodes_alloc))
1384	{
1385	size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1386	Idx new_nexts, new_indices;
1387	re_node_set new_edests, new_eclosures;
1388	re_token_t *new_nodes;
1389
1390	/* Avoid overflows in realloc. */
1391	const size_t max_object_size = MAX (sizeof (re_token_t),
1392	MAX (sizeof (re_node_set),
1393	sizeof (Idx)));
1394	if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
1395	< new_nodes_alloc))
1396	return -1;
1397
1398	new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1399	if (__glibc_unlikely (new_nodes == NULL))
1400	return -1;
1401	dfa->nodes = new_nodes;
1402	dfa->nodes_alloc = new_nodes_alloc;
1403	new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1404	if (new_nexts != NULL)
1405	dfa->nexts = new_nexts;
1406	new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1407	if (new_indices != NULL)
1408	dfa->org_indices = new_indices;
1409	new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1410	if (new_edests != NULL)
1411	dfa->edests = new_edests;
1412	new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1413	if (new_eclosures != NULL)
1414	dfa->eclosures = new_eclosures;
1415	if (__glibc_unlikely (new_nexts == NULL \|\| new_indices == NULL
1416	\|\| new_edests == NULL \|\| new_eclosures == NULL))
1417	return -1;
1418	}
1419	dfa->nodes[dfa->nodes_len] = token;
1420	dfa->nodes[dfa->nodes_len].constraint = 0;
1421	dfa->nodes[dfa->nodes_len].accept_mb =
1422	((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1423	\|\| token.type == COMPLEX_BRACKET);
1424	dfa->nexts[dfa->nodes_len] = -1;
1425	re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1426	re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1427	return dfa->nodes_len++;
1428	}
1429
1430	static re_hashval_t
1431	calc_state_hash (const re_node_set *nodes, unsigned int context)
1432	{
1433	re_hashval_t hash = nodes->nelem + context;
1434	Idx i;
1435	for (i = 0 ; i < nodes->nelem ; i++)
1436	hash += nodes->elems[i];
1437	return hash;
1438	}
1439
1440	/* Search for the state whose node_set is equivalent to NODES.
1441	Return the pointer to the state, if we found it in the DFA.
1442	Otherwise create the new one and return it. In case of an error
1443	return NULL and set the error code in ERR.
1444	Note: - We assume NULL as the invalid state, then it is possible that
1445	return value is NULL and ERR is REG_NOERROR.
1446	- We never return non-NULL value in case of any errors, it is for
1447	optimization. */
1448
1449	static re_dfastate_t *
1450	__attribute_warn_unused_result__
1451	re_acquire_state (reg_errcode_t err, const re_dfa_t dfa,
1452	const re_node_set *nodes)
1453	{
1454	re_hashval_t hash;
1455	re_dfastate_t *new_state;
1456	struct re_state_table_entry *spot;
1457	Idx i;
1458	#if defined GCC_LINT \|\| defined lint
1459	/* Suppress bogus uninitialized-variable warnings. */
1460	*err = REG_NOERROR;
1461	#endif
1462	if (__glibc_unlikely (nodes->nelem == 0))
1463	{
1464	*err = REG_NOERROR;
1465	return NULL;
1466	}
1467	hash = calc_state_hash (nodes, 0);
1468	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1469
1470	for (i = 0 ; i < spot->num ; i++)
1471	{
1472	re_dfastate_t *state = spot->array[i];
1473	if (hash != state->hash)
1474	continue;
1475	if (re_node_set_compare (&state->nodes, nodes))
1476	return state;
1477	}
1478
1479	/* There are no appropriate state in the dfa, create the new one. */
1480	new_state = create_ci_newstate (dfa, nodes, hash);
1481	if (__glibc_unlikely (new_state == NULL))
1482	*err = REG_ESPACE;
1483
1484	return new_state;
1485	}
1486
1487	/* Search for the state whose node_set is equivalent to NODES and
1488	whose context is equivalent to CONTEXT.
1489	Return the pointer to the state, if we found it in the DFA.
1490	Otherwise create the new one and return it. In case of an error
1491	return NULL and set the error code in ERR.
1492	Note: - We assume NULL as the invalid state, then it is possible that
1493	return value is NULL and ERR is REG_NOERROR.
1494	- We never return non-NULL value in case of any errors, it is for
1495	optimization. */
1496
1497	static re_dfastate_t *
1498	__attribute_warn_unused_result__
1499	re_acquire_state_context (reg_errcode_t err, const re_dfa_t dfa,
1500	const re_node_set *nodes, unsigned int context)
1501	{
1502	re_hashval_t hash;
1503	re_dfastate_t *new_state;
1504	struct re_state_table_entry *spot;
1505	Idx i;
1506	#if defined GCC_LINT \|\| defined lint
1507	/* Suppress bogus uninitialized-variable warnings. */
1508	*err = REG_NOERROR;
1509	#endif
1510	if (nodes->nelem == 0)
1511	{
1512	*err = REG_NOERROR;
1513	return NULL;
1514	}
1515	hash = calc_state_hash (nodes, context);
1516	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1517
1518	for (i = 0 ; i < spot->num ; i++)
1519	{
1520	re_dfastate_t *state = spot->array[i];
1521	if (state->hash == hash
1522	&& state->context == context
1523	&& re_node_set_compare (state->entrance_nodes, nodes))
1524	return state;
1525	}
1526	/* There are no appropriate state in 'dfa', create the new one. */
1527	new_state = create_cd_newstate (dfa, nodes, context, hash);
1528	if (__glibc_unlikely (new_state == NULL))
1529	*err = REG_ESPACE;
1530
1531	return new_state;
1532	}
1533
1534	/* Finish initialization of the new state NEWSTATE, and using its hash value
1535	HASH put in the appropriate bucket of DFA's state table. Return value
1536	indicates the error code if failed. */
1537
1538	static reg_errcode_t
1539	__attribute_warn_unused_result__
1540	register_state (const re_dfa_t dfa, re_dfastate_t newstate,
1541	re_hashval_t hash)
1542	{
1543	struct re_state_table_entry *spot;
1544	reg_errcode_t err;
1545	Idx i;
1546
1547	newstate->hash = hash;
1548	err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1549	if (__glibc_unlikely (err != REG_NOERROR))
1550	return REG_ESPACE;
1551	for (i = 0; i < newstate->nodes.nelem; i++)
1552	{
1553	Idx elem = newstate->nodes.elems[i];
1554	if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1555	if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1556	return REG_ESPACE;
1557	}
1558
1559	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1560	if (__glibc_unlikely (spot->alloc <= spot->num))
1561	{
1562	Idx new_alloc = 2 * spot->num + 2;
1563	re_dfastate_t *new_array = re_realloc (spot->array, re_dfastate_t ,
1564	new_alloc);
1565	if (__glibc_unlikely (new_array == NULL))
1566	return REG_ESPACE;
1567	spot->array = new_array;
1568	spot->alloc = new_alloc;
1569	}
1570	spot->array[spot->num++] = newstate;
1571	return REG_NOERROR;
1572	}
1573
1574	static void
1575	free_state (re_dfastate_t *state)
1576	{
1577	re_node_set_free (&state->non_eps_nodes);
1578	re_node_set_free (&state->inveclosure);
1579	if (state->entrance_nodes != &state->nodes)
1580	{
1581	re_node_set_free (state->entrance_nodes);
1582	re_free (state->entrance_nodes);
1583	}
1584	re_node_set_free (&state->nodes);
1585	re_free (state->word_trtable);
1586	re_free (state->trtable);
1587	re_free (state);
1588	}
1589
1590	/* Create the new state which is independent of contexts.
1591	Return the new state if succeeded, otherwise return NULL. */
1592
1593	static re_dfastate_t *
1594	__attribute_warn_unused_result__
1595	create_ci_newstate (const re_dfa_t dfa, const re_node_set nodes,
1596	re_hashval_t hash)
1597	{
1598	Idx i;
1599	reg_errcode_t err;
1600	re_dfastate_t *newstate;
1601
1602	newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1603	if (__glibc_unlikely (newstate == NULL))
1604	return NULL;
1605	err = re_node_set_init_copy (&newstate->nodes, nodes);
1606	if (__glibc_unlikely (err != REG_NOERROR))
1607	{
1608	re_free (newstate);
1609	return NULL;
1610	}
1611
1612	newstate->entrance_nodes = &newstate->nodes;
1613	for (i = 0 ; i < nodes->nelem ; i++)
1614	{
1615	re_token_t *node = dfa->nodes + nodes->elems[i];
1616	re_token_type_t type = node->type;
1617	if (type == CHARACTER && !node->constraint)
1618	continue;
1619	newstate->accept_mb \|= node->accept_mb;
1620
1621	/* If the state has the halt node, the state is a halt state. */
1622	if (type == END_OF_RE)
1623	newstate->halt = 1;
1624	else if (type == OP_BACK_REF)
1625	newstate->has_backref = 1;
1626	else if (type == ANCHOR \|\| node->constraint)
1627	newstate->has_constraint = 1;
1628	}
1629	err = register_state (dfa, newstate, hash);
1630	if (__glibc_unlikely (err != REG_NOERROR))
1631	{
1632	free_state (newstate);
1633	newstate = NULL;
1634	}
1635	return newstate;
1636	}
1637
1638	/* Create the new state which is depend on the context CONTEXT.
1639	Return the new state if succeeded, otherwise return NULL. */
1640
1641	static re_dfastate_t *
1642	__attribute_warn_unused_result__
1643	create_cd_newstate (const re_dfa_t dfa, const re_node_set nodes,
1644	unsigned int context, re_hashval_t hash)
1645	{
1646	Idx i, nctx_nodes = 0;
1647	reg_errcode_t err;
1648	re_dfastate_t *newstate;
1649
1650	newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1651	if (__glibc_unlikely (newstate == NULL))
1652	return NULL;
1653	err = re_node_set_init_copy (&newstate->nodes, nodes);
1654	if (__glibc_unlikely (err != REG_NOERROR))
1655	{
1656	re_free (newstate);
1657	return NULL;
1658	}
1659
1660	newstate->context = context;
1661	newstate->entrance_nodes = &newstate->nodes;
1662
1663	for (i = 0 ; i < nodes->nelem ; i++)
1664	{
1665	re_token_t *node = dfa->nodes + nodes->elems[i];
1666	re_token_type_t type = node->type;
1667	unsigned int constraint = node->constraint;
1668
1669	if (type == CHARACTER && !constraint)
1670	continue;
1671	newstate->accept_mb \|= node->accept_mb;
1672
1673	/* If the state has the halt node, the state is a halt state. */
1674	if (type == END_OF_RE)
1675	newstate->halt = 1;
1676	else if (type == OP_BACK_REF)
1677	newstate->has_backref = 1;
1678
1679	if (constraint)
1680	{
1681	if (newstate->entrance_nodes == &newstate->nodes)
1682	{
1683	re_node_set *entrance_nodes = re_malloc (re_node_set, 1);
1684	if (__glibc_unlikely (entrance_nodes == NULL))
1685	{
1686	free_state (newstate);
1687	return NULL;
1688	}
1689	newstate->entrance_nodes = entrance_nodes;
1690	if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1691	!= REG_NOERROR)
1692	{
1693	free_state (newstate);
1694	return NULL;
1695	}
1696	nctx_nodes = 0;
1697	newstate->has_constraint = 1;
1698	}
1699
1700	if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1701	{
1702	re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1703	++nctx_nodes;
1704	}
1705	}
1706	}
1707	err = register_state (dfa, newstate, hash);
1708	if (__glibc_unlikely (err != REG_NOERROR))
1709	{
1710	free_state (newstate);
1711	newstate = NULL;
1712	}
1713	return newstate;
1714	}

Note: See TracBrowser for help on using the repository browser.

source: kBuild/trunk/src/sed/lib/regex_internal.c@ 3613

Download in other formats: