regex_internal.c@ 3530

Last change on this file since 3530 was 3529, checked in by bird, 3 years ago
Imported grep 3.7 from grep-3.7.tar.gz (sha256: c22b0cf2d4f6bbe599c902387e8058990e1eee99aef333a203829e5fd3dbb342), applying minimal auto-props.
Property svn:eol-style set to `native`
File size: 48.3 KB

Line
1	/* Extended regular expression matching and search library.
2	Copyright (C) 2002-2021 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4	Contributed by Isamu Hasegawa <[email protected]>.
5
6	The GNU C Library is free software; you can redistribute it and/or
7	modify it under the terms of the GNU Lesser General Public
8	License as published by the Free Software Foundation; either
9	version 2.1 of the License, or (at your option) any later version.
10
11	The GNU C Library is distributed in the hope that it will be useful,
12	but WITHOUT ANY WARRANTY; without even the implied warranty of
13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14	Lesser General Public License for more details.
15
16	You should have received a copy of the GNU Lesser General Public
17	License along with the GNU C Library; if not, see
18	<https://www.gnu.org/licenses/>. */
19
20	static void re_string_construct_common (const char *str, Idx len,
21	re_string_t *pstr,
22	RE_TRANSLATE_TYPE trans, bool icase,
23	const re_dfa_t *dfa);
24	static re_dfastate_t create_ci_newstate (const re_dfa_t dfa,
25	const re_node_set *nodes,
26	re_hashval_t hash);
27	static re_dfastate_t create_cd_newstate (const re_dfa_t dfa,
28	const re_node_set *nodes,
29	unsigned int context,
30	re_hashval_t hash);
31	static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
32	Idx new_buf_len);
33	#ifdef RE_ENABLE_I18N
34	static void build_wcs_buffer (re_string_t *pstr);
35	static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr);
36	#endif /* RE_ENABLE_I18N */
37	static void build_upper_buffer (re_string_t *pstr);
38	static void re_string_translate_buffer (re_string_t *pstr);
39	static unsigned int re_string_context_at (const re_string_t *input, Idx idx,
40	int eflags) __attribute__ ((pure));
41
42
43	/* Functions for string operation. */
44
45	/* This function allocate the buffers. It is necessary to call
46	re_string_reconstruct before using the object. */
47
48	static reg_errcode_t
49	__attribute_warn_unused_result__
50	re_string_allocate (re_string_t pstr, const char str, Idx len, Idx init_len,
51	RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
52	{
53	reg_errcode_t ret;
54	Idx init_buf_len;
55
56	/* Ensure at least one character fits into the buffers. */
57	if (init_len < dfa->mb_cur_max)
58	init_len = dfa->mb_cur_max;
59	init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
60	re_string_construct_common (str, len, pstr, trans, icase, dfa);
61
62	ret = re_string_realloc_buffers (pstr, init_buf_len);
63	if (__glibc_unlikely (ret != REG_NOERROR))
64	return ret;
65
66	pstr->word_char = dfa->word_char;
67	pstr->word_ops_used = dfa->word_ops_used;
68	pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
69	pstr->valid_len = (pstr->mbs_allocated \|\| dfa->mb_cur_max > 1) ? 0 : len;
70	pstr->valid_raw_len = pstr->valid_len;
71	return REG_NOERROR;
72	}
73
74	/* This function allocate the buffers, and initialize them. */
75
76	static reg_errcode_t
77	__attribute_warn_unused_result__
78	re_string_construct (re_string_t pstr, const char str, Idx len,
79	RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
80	{
81	reg_errcode_t ret;
82	memset (pstr, '\0', sizeof (re_string_t));
83	re_string_construct_common (str, len, pstr, trans, icase, dfa);
84
85	if (len > 0)
86	{
87	ret = re_string_realloc_buffers (pstr, len + 1);
88	if (__glibc_unlikely (ret != REG_NOERROR))
89	return ret;
90	}
91	pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
92
93	if (icase)
94	{
95	#ifdef RE_ENABLE_I18N
96	if (dfa->mb_cur_max > 1)
97	{
98	while (1)
99	{
100	ret = build_wcs_upper_buffer (pstr);
101	if (__glibc_unlikely (ret != REG_NOERROR))
102	return ret;
103	if (pstr->valid_raw_len >= len)
104	break;
105	if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
106	break;
107	ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
108	if (__glibc_unlikely (ret != REG_NOERROR))
109	return ret;
110	}
111	}
112	else
113	#endif /* RE_ENABLE_I18N */
114	build_upper_buffer (pstr);
115	}
116	else
117	{
118	#ifdef RE_ENABLE_I18N
119	if (dfa->mb_cur_max > 1)
120	build_wcs_buffer (pstr);
121	else
122	#endif /* RE_ENABLE_I18N */
123	{
124	if (trans != NULL)
125	re_string_translate_buffer (pstr);
126	else
127	{
128	pstr->valid_len = pstr->bufs_len;
129	pstr->valid_raw_len = pstr->bufs_len;
130	}
131	}
132	}
133
134	return REG_NOERROR;
135	}
136
137	/* Helper functions for re_string_allocate, and re_string_construct. */
138
139	static reg_errcode_t
140	__attribute_warn_unused_result__
141	re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
142	{
143	#ifdef RE_ENABLE_I18N
144	if (pstr->mb_cur_max > 1)
145	{
146	wint_t *new_wcs;
147
148	/* Avoid overflow in realloc. */
149	const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
150	if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
151	< new_buf_len))
152	return REG_ESPACE;
153
154	new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
155	if (__glibc_unlikely (new_wcs == NULL))
156	return REG_ESPACE;
157	pstr->wcs = new_wcs;
158	if (pstr->offsets != NULL)
159	{
160	Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
161	if (__glibc_unlikely (new_offsets == NULL))
162	return REG_ESPACE;
163	pstr->offsets = new_offsets;
164	}
165	}
166	#endif /* RE_ENABLE_I18N */
167	if (pstr->mbs_allocated)
168	{
169	unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
170	new_buf_len);
171	if (__glibc_unlikely (new_mbs == NULL))
172	return REG_ESPACE;
173	pstr->mbs = new_mbs;
174	}
175	pstr->bufs_len = new_buf_len;
176	return REG_NOERROR;
177	}
178
179
180	static void
181	re_string_construct_common (const char str, Idx len, re_string_t pstr,
182	RE_TRANSLATE_TYPE trans, bool icase,
183	const re_dfa_t *dfa)
184	{
185	pstr->raw_mbs = (const unsigned char *) str;
186	pstr->len = len;
187	pstr->raw_len = len;
188	pstr->trans = trans;
189	pstr->icase = icase;
190	pstr->mbs_allocated = (trans != NULL \|\| icase);
191	pstr->mb_cur_max = dfa->mb_cur_max;
192	pstr->is_utf8 = dfa->is_utf8;
193	pstr->map_notascii = dfa->map_notascii;
194	pstr->stop = pstr->len;
195	pstr->raw_stop = pstr->stop;
196	}
197
198	#ifdef RE_ENABLE_I18N
199
200	/* Build wide character buffer PSTR->WCS.
201	If the byte sequence of the string are:
202	<mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
203	Then wide character buffer will be:
204	<wc1> , WEOF , <wc2> , WEOF , <wc3>
205	We use WEOF for padding, they indicate that the position isn't
206	a first byte of a multibyte character.
207
208	Note that this function assumes PSTR->VALID_LEN elements are already
209	built and starts from PSTR->VALID_LEN. */
210
211	static void
212	build_wcs_buffer (re_string_t *pstr)
213	{
214	#ifdef _LIBC
215	unsigned char buf[MB_LEN_MAX];
216	DEBUG_ASSERT (MB_LEN_MAX >= pstr->mb_cur_max);
217	#else
218	unsigned char buf[64];
219	#endif
220	mbstate_t prev_st;
221	Idx byte_idx, end_idx, remain_len;
222	size_t mbclen;
223
224	/* Build the buffers from pstr->valid_len to either pstr->len or
225	pstr->bufs_len. */
226	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
227	for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
228	{
229	wchar_t wc;
230	const char *p;
231
232	remain_len = end_idx - byte_idx;
233	prev_st = pstr->cur_state;
234	/* Apply the translation if we need. */
235	if (__glibc_unlikely (pstr->trans != NULL))
236	{
237	int i, ch;
238
239	for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
240	{
241	ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
242	buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
243	}
244	p = (const char *) buf;
245	}
246	else
247	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
248	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
249	if (__glibc_unlikely (mbclen == (size_t) -1 \|\| mbclen == 0
250	\|\| (mbclen == (size_t) -2
251	&& pstr->bufs_len >= pstr->len)))
252	{
253	/* We treat these cases as a singlebyte character. */
254	mbclen = 1;
255	wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
256	if (__glibc_unlikely (pstr->trans != NULL))
257	wc = pstr->trans[wc];
258	pstr->cur_state = prev_st;
259	}
260	else if (__glibc_unlikely (mbclen == (size_t) -2))
261	{
262	/* The buffer doesn't have enough space, finish to build. */
263	pstr->cur_state = prev_st;
264	break;
265	}
266
267	/* Write wide character and padding. */
268	pstr->wcs[byte_idx++] = wc;
269	/* Write paddings. */
270	for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
271	pstr->wcs[byte_idx++] = WEOF;
272	}
273	pstr->valid_len = byte_idx;
274	pstr->valid_raw_len = byte_idx;
275	}
276
277	/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
278	but for REG_ICASE. */
279
280	static reg_errcode_t
281	__attribute_warn_unused_result__
282	build_wcs_upper_buffer (re_string_t *pstr)
283	{
284	mbstate_t prev_st;
285	Idx src_idx, byte_idx, end_idx, remain_len;
286	size_t mbclen;
287	#ifdef _LIBC
288	char buf[MB_LEN_MAX];
289	DEBUG_ASSERT (pstr->mb_cur_max <= MB_LEN_MAX);
290	#else
291	char buf[64];
292	#endif
293
294	byte_idx = pstr->valid_len;
295	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
296
297	/* The following optimization assumes that ASCII characters can be
298	mapped to wide characters with a simple cast. */
299	if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
300	{
301	while (byte_idx < end_idx)
302	{
303	wchar_t wc;
304	unsigned char ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
305
306	if (isascii (ch) && mbsinit (&pstr->cur_state))
307	{
308	/* The next step uses the assumption that wchar_t is encoded
309	ASCII-safe: all ASCII values can be converted like this. */
310	wchar_t wcu = __towupper (ch);
311	if (isascii (wcu))
312	{
313	pstr->mbs[byte_idx] = wcu;
314	pstr->wcs[byte_idx] = wcu;
315	byte_idx++;
316	continue;
317	}
318	}
319
320	remain_len = end_idx - byte_idx;
321	prev_st = pstr->cur_state;
322	mbclen = __mbrtowc (&wc,
323	((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
324	+ byte_idx), remain_len, &pstr->cur_state);
325	if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
326	{
327	wchar_t wcu = __towupper (wc);
328	if (wcu != wc)
329	{
330	size_t mbcdlen;
331
332	mbcdlen = __wcrtomb (buf, wcu, &prev_st);
333	if (__glibc_likely (mbclen == mbcdlen))
334	memcpy (pstr->mbs + byte_idx, buf, mbclen);
335	else
336	{
337	src_idx = byte_idx;
338	goto offsets_needed;
339	}
340	}
341	else
342	memcpy (pstr->mbs + byte_idx,
343	pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
344	pstr->wcs[byte_idx++] = wcu;
345	/* Write paddings. */
346	for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
347	pstr->wcs[byte_idx++] = WEOF;
348	}
349	else if (mbclen == (size_t) -1 \|\| mbclen == 0
350	\|\| (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
351	{
352	/* It is an invalid character, an incomplete character
353	at the end of the string, or '\0'. Just use the byte. */
354	pstr->mbs[byte_idx] = ch;
355	/* And also cast it to wide char. */
356	pstr->wcs[byte_idx++] = (wchar_t) ch;
357	if (__glibc_unlikely (mbclen == (size_t) -1))
358	pstr->cur_state = prev_st;
359	}
360	else
361	{
362	/* The buffer doesn't have enough space, finish to build. */
363	pstr->cur_state = prev_st;
364	break;
365	}
366	}
367	pstr->valid_len = byte_idx;
368	pstr->valid_raw_len = byte_idx;
369	return REG_NOERROR;
370	}
371	else
372	for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
373	{
374	wchar_t wc;
375	const char *p;
376	offsets_needed:
377	remain_len = end_idx - byte_idx;
378	prev_st = pstr->cur_state;
379	if (__glibc_unlikely (pstr->trans != NULL))
380	{
381	int i, ch;
382
383	for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
384	{
385	ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
386	buf[i] = pstr->trans[ch];
387	}
388	p = (const char *) buf;
389	}
390	else
391	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
392	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
393	if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
394	{
395	wchar_t wcu = __towupper (wc);
396	if (wcu != wc)
397	{
398	size_t mbcdlen;
399
400	mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
401	if (__glibc_likely (mbclen == mbcdlen))
402	memcpy (pstr->mbs + byte_idx, buf, mbclen);
403	else if (mbcdlen != (size_t) -1)
404	{
405	size_t i;
406
407	if (byte_idx + mbcdlen > pstr->bufs_len)
408	{
409	pstr->cur_state = prev_st;
410	break;
411	}
412
413	if (pstr->offsets == NULL)
414	{
415	pstr->offsets = re_malloc (Idx, pstr->bufs_len);
416
417	if (pstr->offsets == NULL)
418	return REG_ESPACE;
419	}
420	if (!pstr->offsets_needed)
421	{
422	for (i = 0; i < (size_t) byte_idx; ++i)
423	pstr->offsets[i] = i;
424	pstr->offsets_needed = 1;
425	}
426
427	memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
428	pstr->wcs[byte_idx] = wcu;
429	pstr->offsets[byte_idx] = src_idx;
430	for (i = 1; i < mbcdlen; ++i)
431	{
432	pstr->offsets[byte_idx + i]
433	= src_idx + (i < mbclen ? i : mbclen - 1);
434	pstr->wcs[byte_idx + i] = WEOF;
435	}
436	pstr->len += mbcdlen - mbclen;
437	if (pstr->raw_stop > src_idx)
438	pstr->stop += mbcdlen - mbclen;
439	end_idx = (pstr->bufs_len > pstr->len)
440	? pstr->len : pstr->bufs_len;
441	byte_idx += mbcdlen;
442	src_idx += mbclen;
443	continue;
444	}
445	else
446	memcpy (pstr->mbs + byte_idx, p, mbclen);
447	}
448	else
449	memcpy (pstr->mbs + byte_idx, p, mbclen);
450
451	if (__glibc_unlikely (pstr->offsets_needed != 0))
452	{
453	size_t i;
454	for (i = 0; i < mbclen; ++i)
455	pstr->offsets[byte_idx + i] = src_idx + i;
456	}
457	src_idx += mbclen;
458
459	pstr->wcs[byte_idx++] = wcu;
460	/* Write paddings. */
461	for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
462	pstr->wcs[byte_idx++] = WEOF;
463	}
464	else if (mbclen == (size_t) -1 \|\| mbclen == 0
465	\|\| (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
466	{
467	/* It is an invalid character or '\0'. Just use the byte. */
468	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
469
470	if (__glibc_unlikely (pstr->trans != NULL))
471	ch = pstr->trans [ch];
472	pstr->mbs[byte_idx] = ch;
473
474	if (__glibc_unlikely (pstr->offsets_needed != 0))
475	pstr->offsets[byte_idx] = src_idx;
476	++src_idx;
477
478	/* And also cast it to wide char. */
479	pstr->wcs[byte_idx++] = (wchar_t) ch;
480	if (__glibc_unlikely (mbclen == (size_t) -1))
481	pstr->cur_state = prev_st;
482	}
483	else
484	{
485	/* The buffer doesn't have enough space, finish to build. */
486	pstr->cur_state = prev_st;
487	break;
488	}
489	}
490	pstr->valid_len = byte_idx;
491	pstr->valid_raw_len = src_idx;
492	return REG_NOERROR;
493	}
494
495	/* Skip characters until the index becomes greater than NEW_RAW_IDX.
496	Return the index. */
497
498	static Idx
499	re_string_skip_chars (re_string_t pstr, Idx new_raw_idx, wint_t last_wc)
500	{
501	mbstate_t prev_st;
502	Idx rawbuf_idx;
503	size_t mbclen;
504	wint_t wc = WEOF;
505
506	/* Skip the characters which are not necessary to check. */
507	for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
508	rawbuf_idx < new_raw_idx;)
509	{
510	wchar_t wc2;
511	Idx remain_len = pstr->raw_len - rawbuf_idx;
512	prev_st = pstr->cur_state;
513	mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
514	remain_len, &pstr->cur_state);
515	if (__glibc_unlikely (mbclen == (size_t) -2 \|\| mbclen == (size_t) -1
516	\|\| mbclen == 0))
517	{
518	/* We treat these cases as a single byte character. */
519	if (mbclen == 0 \|\| remain_len == 0)
520	wc = L'\0';
521	else
522	wc = (unsigned char ) (pstr->raw_mbs + rawbuf_idx);
523	mbclen = 1;
524	pstr->cur_state = prev_st;
525	}
526	else
527	wc = wc2;
528	/* Then proceed the next character. */
529	rawbuf_idx += mbclen;
530	}
531	*last_wc = wc;
532	return rawbuf_idx;
533	}
534	#endif /* RE_ENABLE_I18N */
535
536	/* Build the buffer PSTR->MBS, and apply the translation if we need.
537	This function is used in case of REG_ICASE. */
538
539	static void
540	build_upper_buffer (re_string_t *pstr)
541	{
542	Idx char_idx, end_idx;
543	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
544
545	for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
546	{
547	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
548	if (__glibc_unlikely (pstr->trans != NULL))
549	ch = pstr->trans[ch];
550	pstr->mbs[char_idx] = toupper (ch);
551	}
552	pstr->valid_len = char_idx;
553	pstr->valid_raw_len = char_idx;
554	}
555
556	/* Apply TRANS to the buffer in PSTR. */
557
558	static void
559	re_string_translate_buffer (re_string_t *pstr)
560	{
561	Idx buf_idx, end_idx;
562	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
563
564	for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
565	{
566	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
567	pstr->mbs[buf_idx] = pstr->trans[ch];
568	}
569
570	pstr->valid_len = buf_idx;
571	pstr->valid_raw_len = buf_idx;
572	}
573
574	/* This function re-construct the buffers.
575	Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
576	convert to upper case in case of REG_ICASE, apply translation. */
577
578	static reg_errcode_t
579	__attribute_warn_unused_result__
580	re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
581	{
582	Idx offset;
583
584	if (__glibc_unlikely (pstr->raw_mbs_idx <= idx))
585	offset = idx - pstr->raw_mbs_idx;
586	else
587	{
588	/* Reset buffer. */
589	#ifdef RE_ENABLE_I18N
590	if (pstr->mb_cur_max > 1)
591	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
592	#endif /* RE_ENABLE_I18N */
593	pstr->len = pstr->raw_len;
594	pstr->stop = pstr->raw_stop;
595	pstr->valid_len = 0;
596	pstr->raw_mbs_idx = 0;
597	pstr->valid_raw_len = 0;
598	pstr->offsets_needed = 0;
599	pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
600	: CONTEXT_NEWLINE \| CONTEXT_BEGBUF);
601	if (!pstr->mbs_allocated)
602	pstr->mbs = (unsigned char *) pstr->raw_mbs;
603	offset = idx;
604	}
605
606	if (__glibc_likely (offset != 0))
607	{
608	/* Should the already checked characters be kept? */
609	if (__glibc_likely (offset < pstr->valid_raw_len))
610	{
611	/* Yes, move them to the front of the buffer. */
612	#ifdef RE_ENABLE_I18N
613	if (__glibc_unlikely (pstr->offsets_needed))
614	{
615	Idx low = 0, high = pstr->valid_len, mid;
616	do
617	{
618	mid = (high + low) / 2;
619	if (pstr->offsets[mid] > offset)
620	high = mid;
621	else if (pstr->offsets[mid] < offset)
622	low = mid + 1;
623	else
624	break;
625	}
626	while (low < high);
627	if (pstr->offsets[mid] < offset)
628	++mid;
629	pstr->tip_context = re_string_context_at (pstr, mid - 1,
630	eflags);
631	/* This can be quite complicated, so handle specially
632	only the common and easy case where the character with
633	different length representation of lower and upper
634	case is present at or after offset. */
635	if (pstr->valid_len > offset
636	&& mid == offset && pstr->offsets[mid] == offset)
637	{
638	memmove (pstr->wcs, pstr->wcs + offset,
639	(pstr->valid_len - offset) * sizeof (wint_t));
640	memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
641	pstr->valid_len -= offset;
642	pstr->valid_raw_len -= offset;
643	for (low = 0; low < pstr->valid_len; low++)
644	pstr->offsets[low] = pstr->offsets[low + offset] - offset;
645	}
646	else
647	{
648	/* Otherwise, just find out how long the partial multibyte
649	character at offset is and fill it with WEOF/255. */
650	pstr->len = pstr->raw_len - idx + offset;
651	pstr->stop = pstr->raw_stop - idx + offset;
652	pstr->offsets_needed = 0;
653	while (mid > 0 && pstr->offsets[mid - 1] == offset)
654	--mid;
655	while (mid < pstr->valid_len)
656	if (pstr->wcs[mid] != WEOF)
657	break;
658	else
659	++mid;
660	if (mid == pstr->valid_len)
661	pstr->valid_len = 0;
662	else
663	{
664	pstr->valid_len = pstr->offsets[mid] - offset;
665	if (pstr->valid_len)
666	{
667	for (low = 0; low < pstr->valid_len; ++low)
668	pstr->wcs[low] = WEOF;
669	memset (pstr->mbs, 255, pstr->valid_len);
670	}
671	}
672	pstr->valid_raw_len = pstr->valid_len;
673	}
674	}
675	else
676	#endif
677	{
678	pstr->tip_context = re_string_context_at (pstr, offset - 1,
679	eflags);
680	#ifdef RE_ENABLE_I18N
681	if (pstr->mb_cur_max > 1)
682	memmove (pstr->wcs, pstr->wcs + offset,
683	(pstr->valid_len - offset) * sizeof (wint_t));
684	#endif /* RE_ENABLE_I18N */
685	if (__glibc_unlikely (pstr->mbs_allocated))
686	memmove (pstr->mbs, pstr->mbs + offset,
687	pstr->valid_len - offset);
688	pstr->valid_len -= offset;
689	pstr->valid_raw_len -= offset;
690	DEBUG_ASSERT (pstr->valid_len > 0);
691	}
692	}
693	else
694	{
695	#ifdef RE_ENABLE_I18N
696	/* No, skip all characters until IDX. */
697	Idx prev_valid_len = pstr->valid_len;
698
699	if (__glibc_unlikely (pstr->offsets_needed))
700	{
701	pstr->len = pstr->raw_len - idx + offset;
702	pstr->stop = pstr->raw_stop - idx + offset;
703	pstr->offsets_needed = 0;
704	}
705	#endif
706	pstr->valid_len = 0;
707	#ifdef RE_ENABLE_I18N
708	if (pstr->mb_cur_max > 1)
709	{
710	Idx wcs_idx;
711	wint_t wc = WEOF;
712
713	if (pstr->is_utf8)
714	{
715	const unsigned char raw, p, *end;
716
717	/* Special case UTF-8. Multi-byte chars start with any
718	byte other than 0x80 - 0xbf. */
719	raw = pstr->raw_mbs + pstr->raw_mbs_idx;
720	end = raw + (offset - pstr->mb_cur_max);
721	if (end < pstr->raw_mbs)
722	end = pstr->raw_mbs;
723	p = raw + offset - 1;
724	#ifdef _LIBC
725	/* We know the wchar_t encoding is UCS4, so for the simple
726	case, ASCII characters, skip the conversion step. */
727	if (isascii (*p) && __glibc_likely (pstr->trans == NULL))
728	{
729	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
730	/* pstr->valid_len = 0; */
731	wc = (wchar_t) *p;
732	}
733	else
734	#endif
735	for (; p >= end; --p)
736	if ((*p & 0xc0) != 0x80)
737	{
738	mbstate_t cur_state;
739	wchar_t wc2;
740	Idx mlen = raw + pstr->len - p;
741	unsigned char buf[6];
742	size_t mbclen;
743
744	const unsigned char *pp = p;
745	if (__glibc_unlikely (pstr->trans != NULL))
746	{
747	int i = mlen < 6 ? mlen : 6;
748	while (--i >= 0)
749	buf[i] = pstr->trans[p[i]];
750	pp = buf;
751	}
752	/* XXX Don't use mbrtowc, we know which conversion
753	to use (UTF-8 -> UCS4). */
754	memset (&cur_state, 0, sizeof (cur_state));
755	mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
756	&cur_state);
757	if (raw + offset - p <= mbclen
758	&& mbclen < (size_t) -2)
759	{
760	memset (&pstr->cur_state, '\0',
761	sizeof (mbstate_t));
762	pstr->valid_len = mbclen - (raw + offset - p);
763	wc = wc2;
764	}
765	break;
766	}
767	}
768
769	if (wc == WEOF)
770	pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
771	if (wc == WEOF)
772	pstr->tip_context
773	= re_string_context_at (pstr, prev_valid_len - 1, eflags);
774	else
775	pstr->tip_context = ((__glibc_unlikely (pstr->word_ops_used != 0)
776	&& IS_WIDE_WORD_CHAR (wc))
777	? CONTEXT_WORD
778	: ((IS_WIDE_NEWLINE (wc)
779	&& pstr->newline_anchor)
780	? CONTEXT_NEWLINE : 0));
781	if (__glibc_unlikely (pstr->valid_len))
782	{
783	for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
784	pstr->wcs[wcs_idx] = WEOF;
785	if (pstr->mbs_allocated)
786	memset (pstr->mbs, 255, pstr->valid_len);
787	}
788	pstr->valid_raw_len = pstr->valid_len;
789	}
790	else
791	#endif /* RE_ENABLE_I18N */
792	{
793	int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
794	pstr->valid_raw_len = 0;
795	if (pstr->trans)
796	c = pstr->trans[c];
797	pstr->tip_context = (bitset_contain (pstr->word_char, c)
798	? CONTEXT_WORD
799	: ((IS_NEWLINE (c) && pstr->newline_anchor)
800	? CONTEXT_NEWLINE : 0));
801	}
802	}
803	if (!__glibc_unlikely (pstr->mbs_allocated))
804	pstr->mbs += offset;
805	}
806	pstr->raw_mbs_idx = idx;
807	pstr->len -= offset;
808	pstr->stop -= offset;
809
810	/* Then build the buffers. */
811	#ifdef RE_ENABLE_I18N
812	if (pstr->mb_cur_max > 1)
813	{
814	if (pstr->icase)
815	{
816	reg_errcode_t ret = build_wcs_upper_buffer (pstr);
817	if (__glibc_unlikely (ret != REG_NOERROR))
818	return ret;
819	}
820	else
821	build_wcs_buffer (pstr);
822	}
823	else
824	#endif /* RE_ENABLE_I18N */
825	if (__glibc_unlikely (pstr->mbs_allocated))
826	{
827	if (pstr->icase)
828	build_upper_buffer (pstr);
829	else if (pstr->trans != NULL)
830	re_string_translate_buffer (pstr);
831	}
832	else
833	pstr->valid_len = pstr->len;
834
835	pstr->cur_idx = 0;
836	return REG_NOERROR;
837	}
838
839	static unsigned char
840	__attribute__ ((pure))
841	re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
842	{
843	int ch;
844	Idx off;
845
846	/* Handle the common (easiest) cases first. */
847	if (__glibc_likely (!pstr->mbs_allocated))
848	return re_string_peek_byte (pstr, idx);
849
850	#ifdef RE_ENABLE_I18N
851	if (pstr->mb_cur_max > 1
852	&& ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
853	return re_string_peek_byte (pstr, idx);
854	#endif
855
856	off = pstr->cur_idx + idx;
857	#ifdef RE_ENABLE_I18N
858	if (pstr->offsets_needed)
859	off = pstr->offsets[off];
860	#endif
861
862	ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
863
864	#ifdef RE_ENABLE_I18N
865	/* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
866	this function returns CAPITAL LETTER I instead of first byte of
867	DOTLESS SMALL LETTER I. The latter would confuse the parser,
868	since peek_byte_case doesn't advance cur_idx in any way. */
869	if (pstr->offsets_needed && !isascii (ch))
870	return re_string_peek_byte (pstr, idx);
871	#endif
872
873	return ch;
874	}
875
876	static unsigned char
877	re_string_fetch_byte_case (re_string_t *pstr)
878	{
879	if (__glibc_likely (!pstr->mbs_allocated))
880	return re_string_fetch_byte (pstr);
881
882	#ifdef RE_ENABLE_I18N
883	if (pstr->offsets_needed)
884	{
885	Idx off;
886	int ch;
887
888	/* For tr_TR.UTF-8 [[:islower:]] there is
889	[[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
890	in that case the whole multi-byte character and return
891	the original letter. On the other side, with
892	[[: DOTLESS SMALL LETTER I return [[:I, as doing
893	anything else would complicate things too much. */
894
895	if (!re_string_first_byte (pstr, pstr->cur_idx))
896	return re_string_fetch_byte (pstr);
897
898	off = pstr->offsets[pstr->cur_idx];
899	ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
900
901	if (! isascii (ch))
902	return re_string_fetch_byte (pstr);
903
904	re_string_skip_bytes (pstr,
905	re_string_char_size_at (pstr, pstr->cur_idx));
906	return ch;
907	}
908	#endif
909
910	return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
911	}
912
913	static void
914	re_string_destruct (re_string_t *pstr)
915	{
916	#ifdef RE_ENABLE_I18N
917	re_free (pstr->wcs);
918	re_free (pstr->offsets);
919	#endif /* RE_ENABLE_I18N */
920	if (pstr->mbs_allocated)
921	re_free (pstr->mbs);
922	}
923
924	/* Return the context at IDX in INPUT. */
925
926	static unsigned int
927	re_string_context_at (const re_string_t *input, Idx idx, int eflags)
928	{
929	int c;
930	if (__glibc_unlikely (idx < 0))
931	/* In this case, we use the value stored in input->tip_context,
932	since we can't know the character in input->mbs[-1] here. */
933	return input->tip_context;
934	if (__glibc_unlikely (idx == input->len))
935	return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
936	: CONTEXT_NEWLINE \| CONTEXT_ENDBUF);
937	#ifdef RE_ENABLE_I18N
938	if (input->mb_cur_max > 1)
939	{
940	wint_t wc;
941	Idx wc_idx = idx;
942	while(input->wcs[wc_idx] == WEOF)
943	{
944	DEBUG_ASSERT (wc_idx >= 0);
945	--wc_idx;
946	if (wc_idx < 0)
947	return input->tip_context;
948	}
949	wc = input->wcs[wc_idx];
950	if (__glibc_unlikely (input->word_ops_used != 0)
951	&& IS_WIDE_WORD_CHAR (wc))
952	return CONTEXT_WORD;
953	return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
954	? CONTEXT_NEWLINE : 0);
955	}
956	else
957	#endif
958	{
959	c = re_string_byte_at (input, idx);
960	if (bitset_contain (input->word_char, c))
961	return CONTEXT_WORD;
962	return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
963	}
964	}
965
966
967	/* Functions for set operation. */
968
969	static reg_errcode_t
970	__attribute_warn_unused_result__
971	re_node_set_alloc (re_node_set *set, Idx size)
972	{
973	set->alloc = size;
974	set->nelem = 0;
975	set->elems = re_malloc (Idx, size);
976	if (__glibc_unlikely (set->elems == NULL)
977	&& (MALLOC_0_IS_NONNULL \|\| size != 0))
978	return REG_ESPACE;
979	return REG_NOERROR;
980	}
981
982	static reg_errcode_t
983	__attribute_warn_unused_result__
984	re_node_set_init_1 (re_node_set *set, Idx elem)
985	{
986	set->alloc = 1;
987	set->nelem = 1;
988	set->elems = re_malloc (Idx, 1);
989	if (__glibc_unlikely (set->elems == NULL))
990	{
991	set->alloc = set->nelem = 0;
992	return REG_ESPACE;
993	}
994	set->elems[0] = elem;
995	return REG_NOERROR;
996	}
997
998	static reg_errcode_t
999	__attribute_warn_unused_result__
1000	re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
1001	{
1002	set->alloc = 2;
1003	set->elems = re_malloc (Idx, 2);
1004	if (__glibc_unlikely (set->elems == NULL))
1005	return REG_ESPACE;
1006	if (elem1 == elem2)
1007	{
1008	set->nelem = 1;
1009	set->elems[0] = elem1;
1010	}
1011	else
1012	{
1013	set->nelem = 2;
1014	if (elem1 < elem2)
1015	{
1016	set->elems[0] = elem1;
1017	set->elems[1] = elem2;
1018	}
1019	else
1020	{
1021	set->elems[0] = elem2;
1022	set->elems[1] = elem1;
1023	}
1024	}
1025	return REG_NOERROR;
1026	}
1027
1028	static reg_errcode_t
1029	__attribute_warn_unused_result__
1030	re_node_set_init_copy (re_node_set dest, const re_node_set src)
1031	{
1032	dest->nelem = src->nelem;
1033	if (src->nelem > 0)
1034	{
1035	dest->alloc = dest->nelem;
1036	dest->elems = re_malloc (Idx, dest->alloc);
1037	if (__glibc_unlikely (dest->elems == NULL))
1038	{
1039	dest->alloc = dest->nelem = 0;
1040	return REG_ESPACE;
1041	}
1042	memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1043	}
1044	else
1045	re_node_set_init_empty (dest);
1046	return REG_NOERROR;
1047	}
1048
1049	/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1050	DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1051	Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1052
1053	static reg_errcode_t
1054	__attribute_warn_unused_result__
1055	re_node_set_add_intersect (re_node_set dest, const re_node_set src1,
1056	const re_node_set *src2)
1057	{
1058	Idx i1, i2, is, id, delta, sbase;
1059	if (src1->nelem == 0 \|\| src2->nelem == 0)
1060	return REG_NOERROR;
1061
1062	/* We need dest->nelem + 2 * elems_in_intersection; this is a
1063	conservative estimate. */
1064	if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1065	{
1066	Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1067	Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1068	if (__glibc_unlikely (new_elems == NULL))
1069	return REG_ESPACE;
1070	dest->elems = new_elems;
1071	dest->alloc = new_alloc;
1072	}
1073
1074	/* Find the items in the intersection of SRC1 and SRC2, and copy
1075	into the top of DEST those that are not already in DEST itself. */
1076	sbase = dest->nelem + src1->nelem + src2->nelem;
1077	i1 = src1->nelem - 1;
1078	i2 = src2->nelem - 1;
1079	id = dest->nelem - 1;
1080	for (;;)
1081	{
1082	if (src1->elems[i1] == src2->elems[i2])
1083	{
1084	/* Try to find the item in DEST. Maybe we could binary search? */
1085	while (id >= 0 && dest->elems[id] > src1->elems[i1])
1086	--id;
1087
1088	if (id < 0 \|\| dest->elems[id] != src1->elems[i1])
1089	dest->elems[--sbase] = src1->elems[i1];
1090
1091	if (--i1 < 0 \|\| --i2 < 0)
1092	break;
1093	}
1094
1095	/* Lower the highest of the two items. */
1096	else if (src1->elems[i1] < src2->elems[i2])
1097	{
1098	if (--i2 < 0)
1099	break;
1100	}
1101	else
1102	{
1103	if (--i1 < 0)
1104	break;
1105	}
1106	}
1107
1108	id = dest->nelem - 1;
1109	is = dest->nelem + src1->nelem + src2->nelem - 1;
1110	delta = is - sbase + 1;
1111
1112	/* Now copy. When DELTA becomes zero, the remaining
1113	DEST elements are already in place; this is more or
1114	less the same loop that is in re_node_set_merge. */
1115	dest->nelem += delta;
1116	if (delta > 0 && id >= 0)
1117	for (;;)
1118	{
1119	if (dest->elems[is] > dest->elems[id])
1120	{
1121	/* Copy from the top. */
1122	dest->elems[id + delta--] = dest->elems[is--];
1123	if (delta == 0)
1124	break;
1125	}
1126	else
1127	{
1128	/* Slide from the bottom. */
1129	dest->elems[id + delta] = dest->elems[id];
1130	if (--id < 0)
1131	break;
1132	}
1133	}
1134
1135	/* Copy remaining SRC elements. */
1136	memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1137
1138	return REG_NOERROR;
1139	}
1140
1141	/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1142	DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1143
1144	static reg_errcode_t
1145	__attribute_warn_unused_result__
1146	re_node_set_init_union (re_node_set dest, const re_node_set src1,
1147	const re_node_set *src2)
1148	{
1149	Idx i1, i2, id;
1150	if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1151	{
1152	dest->alloc = src1->nelem + src2->nelem;
1153	dest->elems = re_malloc (Idx, dest->alloc);
1154	if (__glibc_unlikely (dest->elems == NULL))
1155	return REG_ESPACE;
1156	}
1157	else
1158	{
1159	if (src1 != NULL && src1->nelem > 0)
1160	return re_node_set_init_copy (dest, src1);
1161	else if (src2 != NULL && src2->nelem > 0)
1162	return re_node_set_init_copy (dest, src2);
1163	else
1164	re_node_set_init_empty (dest);
1165	return REG_NOERROR;
1166	}
1167	for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1168	{
1169	if (src1->elems[i1] > src2->elems[i2])
1170	{
1171	dest->elems[id++] = src2->elems[i2++];
1172	continue;
1173	}
1174	if (src1->elems[i1] == src2->elems[i2])
1175	++i2;
1176	dest->elems[id++] = src1->elems[i1++];
1177	}
1178	if (i1 < src1->nelem)
1179	{
1180	memcpy (dest->elems + id, src1->elems + i1,
1181	(src1->nelem - i1) * sizeof (Idx));
1182	id += src1->nelem - i1;
1183	}
1184	else if (i2 < src2->nelem)
1185	{
1186	memcpy (dest->elems + id, src2->elems + i2,
1187	(src2->nelem - i2) * sizeof (Idx));
1188	id += src2->nelem - i2;
1189	}
1190	dest->nelem = id;
1191	return REG_NOERROR;
1192	}
1193
1194	/* Calculate the union set of the sets DEST and SRC. And store it to
1195	DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1196
1197	static reg_errcode_t
1198	__attribute_warn_unused_result__
1199	re_node_set_merge (re_node_set dest, const re_node_set src)
1200	{
1201	Idx is, id, sbase, delta;
1202	if (src == NULL \|\| src->nelem == 0)
1203	return REG_NOERROR;
1204	if (dest->alloc < 2 * src->nelem + dest->nelem)
1205	{
1206	Idx new_alloc = 2 * (src->nelem + dest->alloc);
1207	Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1208	if (__glibc_unlikely (new_buffer == NULL))
1209	return REG_ESPACE;
1210	dest->elems = new_buffer;
1211	dest->alloc = new_alloc;
1212	}
1213
1214	if (__glibc_unlikely (dest->nelem == 0))
1215	{
1216	/* Although we already guaranteed above that dest->alloc != 0 and
1217	therefore dest->elems != NULL, add a debug assertion to pacify
1218	GCC 11.2.1's -fanalyzer. */
1219	DEBUG_ASSERT (dest->elems);
1220	dest->nelem = src->nelem;
1221	memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1222	return REG_NOERROR;
1223	}
1224
1225	/* Copy into the top of DEST the items of SRC that are not
1226	found in DEST. Maybe we could binary search in DEST? */
1227	for (sbase = dest->nelem + 2 * src->nelem,
1228	is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1229	{
1230	if (dest->elems[id] == src->elems[is])
1231	is--, id--;
1232	else if (dest->elems[id] < src->elems[is])
1233	dest->elems[--sbase] = src->elems[is--];
1234	else /* if (dest->elems[id] > src->elems[is]) */
1235	--id;
1236	}
1237
1238	if (is >= 0)
1239	{
1240	/* If DEST is exhausted, the remaining items of SRC must be unique. */
1241	sbase -= is + 1;
1242	memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1243	}
1244
1245	id = dest->nelem - 1;
1246	is = dest->nelem + 2 * src->nelem - 1;
1247	delta = is - sbase + 1;
1248	if (delta == 0)
1249	return REG_NOERROR;
1250
1251	/* Now copy. When DELTA becomes zero, the remaining
1252	DEST elements are already in place. */
1253	dest->nelem += delta;
1254	for (;;)
1255	{
1256	if (dest->elems[is] > dest->elems[id])
1257	{
1258	/* Copy from the top. */
1259	dest->elems[id + delta--] = dest->elems[is--];
1260	if (delta == 0)
1261	break;
1262	}
1263	else
1264	{
1265	/* Slide from the bottom. */
1266	dest->elems[id + delta] = dest->elems[id];
1267	if (--id < 0)
1268	{
1269	/* Copy remaining SRC elements. */
1270	memcpy (dest->elems, dest->elems + sbase,
1271	delta * sizeof (Idx));
1272	break;
1273	}
1274	}
1275	}
1276
1277	return REG_NOERROR;
1278	}
1279
1280	/* Insert the new element ELEM to the re_node_set* SET.
1281	SET should not already have ELEM.
1282	Return true if successful. */
1283
1284	static bool
1285	__attribute_warn_unused_result__
1286	re_node_set_insert (re_node_set *set, Idx elem)
1287	{
1288	Idx idx;
1289	/* In case the set is empty. */
1290	if (set->alloc == 0)
1291	return __glibc_likely (re_node_set_init_1 (set, elem) == REG_NOERROR);
1292
1293	if (__glibc_unlikely (set->nelem) == 0)
1294	{
1295	/* Although we already guaranteed above that set->alloc != 0 and
1296	therefore set->elems != NULL, add a debug assertion to pacify
1297	GCC 11.2 -fanalyzer. */
1298	DEBUG_ASSERT (set->elems);
1299	set->elems[0] = elem;
1300	++set->nelem;
1301	return true;
1302	}
1303
1304	/* Realloc if we need. */
1305	if (set->alloc == set->nelem)
1306	{
1307	Idx *new_elems;
1308	set->alloc = set->alloc * 2;
1309	new_elems = re_realloc (set->elems, Idx, set->alloc);
1310	if (__glibc_unlikely (new_elems == NULL))
1311	return false;
1312	set->elems = new_elems;
1313	}
1314
1315	/* Move the elements which follows the new element. Test the
1316	first element separately to skip a check in the inner loop. */
1317	if (elem < set->elems[0])
1318	{
1319	for (idx = set->nelem; idx > 0; idx--)
1320	set->elems[idx] = set->elems[idx - 1];
1321	}
1322	else
1323	{
1324	for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1325	set->elems[idx] = set->elems[idx - 1];
1326	DEBUG_ASSERT (set->elems[idx - 1] < elem);
1327	}
1328
1329	/* Insert the new element. */
1330	set->elems[idx] = elem;
1331	++set->nelem;
1332	return true;
1333	}
1334
1335	/* Insert the new element ELEM to the re_node_set* SET.
1336	SET should not already have any element greater than or equal to ELEM.
1337	Return true if successful. */
1338
1339	static bool
1340	__attribute_warn_unused_result__
1341	re_node_set_insert_last (re_node_set *set, Idx elem)
1342	{
1343	/* Realloc if we need. */
1344	if (set->alloc == set->nelem)
1345	{
1346	Idx *new_elems;
1347	set->alloc = (set->alloc + 1) * 2;
1348	new_elems = re_realloc (set->elems, Idx, set->alloc);
1349	if (__glibc_unlikely (new_elems == NULL))
1350	return false;
1351	set->elems = new_elems;
1352	}
1353
1354	/* Insert the new element. */
1355	set->elems[set->nelem++] = elem;
1356	return true;
1357	}
1358
1359	/* Compare two node sets SET1 and SET2.
1360	Return true if SET1 and SET2 are equivalent. */
1361
1362	static bool
1363	__attribute__ ((pure))
1364	re_node_set_compare (const re_node_set set1, const re_node_set set2)
1365	{
1366	Idx i;
1367	if (set1 == NULL \|\| set2 == NULL \|\| set1->nelem != set2->nelem)
1368	return false;
1369	for (i = set1->nelem ; --i >= 0 ; )
1370	if (set1->elems[i] != set2->elems[i])
1371	return false;
1372	return true;
1373	}
1374
1375	/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1376
1377	static Idx
1378	__attribute__ ((pure))
1379	re_node_set_contains (const re_node_set *set, Idx elem)
1380	{
1381	__re_size_t idx, right, mid;
1382	if (set->nelem <= 0)
1383	return 0;
1384
1385	/* Binary search the element. */
1386	idx = 0;
1387	right = set->nelem - 1;
1388	while (idx < right)
1389	{
1390	mid = (idx + right) / 2;
1391	if (set->elems[mid] < elem)
1392	idx = mid + 1;
1393	else
1394	right = mid;
1395	}
1396	return set->elems[idx] == elem ? idx + 1 : 0;
1397	}
1398
1399	static void
1400	re_node_set_remove_at (re_node_set *set, Idx idx)
1401	{
1402	if (idx < 0 \|\| idx >= set->nelem)
1403	return;
1404	--set->nelem;
1405	for (; idx < set->nelem; idx++)
1406	set->elems[idx] = set->elems[idx + 1];
1407	}
1408
1409
1410
1411	/* Add the token TOKEN to dfa->nodes, and return the index of the token.
1412	Or return -1 if an error occurred. */
1413
1414	static Idx
1415	re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1416	{
1417	if (__glibc_unlikely (dfa->nodes_len >= dfa->nodes_alloc))
1418	{
1419	size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1420	Idx new_nexts, new_indices;
1421	re_node_set new_edests, new_eclosures;
1422	re_token_t *new_nodes;
1423
1424	/* Avoid overflows in realloc. */
1425	const size_t max_object_size = MAX (sizeof (re_token_t),
1426	MAX (sizeof (re_node_set),
1427	sizeof (Idx)));
1428	if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
1429	< new_nodes_alloc))
1430	return -1;
1431
1432	new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1433	if (__glibc_unlikely (new_nodes == NULL))
1434	return -1;
1435	dfa->nodes = new_nodes;
1436	new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1437	new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1438	new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1439	new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1440	if (__glibc_unlikely (new_nexts == NULL \|\| new_indices == NULL
1441	\|\| new_edests == NULL \|\| new_eclosures == NULL))
1442	{
1443	re_free (new_nexts);
1444	re_free (new_indices);
1445	re_free (new_edests);
1446	re_free (new_eclosures);
1447	return -1;
1448	}
1449	dfa->nexts = new_nexts;
1450	dfa->org_indices = new_indices;
1451	dfa->edests = new_edests;
1452	dfa->eclosures = new_eclosures;
1453	dfa->nodes_alloc = new_nodes_alloc;
1454	}
1455	dfa->nodes[dfa->nodes_len] = token;
1456	dfa->nodes[dfa->nodes_len].constraint = 0;
1457	#ifdef RE_ENABLE_I18N
1458	dfa->nodes[dfa->nodes_len].accept_mb =
1459	((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1460	\|\| token.type == COMPLEX_BRACKET);
1461	#endif
1462	dfa->nexts[dfa->nodes_len] = -1;
1463	re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1464	re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1465	return dfa->nodes_len++;
1466	}
1467
1468	static re_hashval_t
1469	calc_state_hash (const re_node_set *nodes, unsigned int context)
1470	{
1471	re_hashval_t hash = nodes->nelem + context;
1472	Idx i;
1473	for (i = 0 ; i < nodes->nelem ; i++)
1474	hash += nodes->elems[i];
1475	return hash;
1476	}
1477
1478	/* Search for the state whose node_set is equivalent to NODES.
1479	Return the pointer to the state, if we found it in the DFA.
1480	Otherwise create the new one and return it. In case of an error
1481	return NULL and set the error code in ERR.
1482	Note: - We assume NULL as the invalid state, then it is possible that
1483	return value is NULL and ERR is REG_NOERROR.
1484	- We never return non-NULL value in case of any errors, it is for
1485	optimization. */
1486
1487	static re_dfastate_t *
1488	__attribute_warn_unused_result__
1489	re_acquire_state (reg_errcode_t err, const re_dfa_t dfa,
1490	const re_node_set *nodes)
1491	{
1492	re_hashval_t hash;
1493	re_dfastate_t *new_state;
1494	struct re_state_table_entry *spot;
1495	Idx i;
1496	#if defined GCC_LINT \|\| defined lint
1497	/* Suppress bogus uninitialized-variable warnings. */
1498	*err = REG_NOERROR;
1499	#endif
1500	if (__glibc_unlikely (nodes->nelem == 0))
1501	{
1502	*err = REG_NOERROR;
1503	return NULL;
1504	}
1505	hash = calc_state_hash (nodes, 0);
1506	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1507
1508	for (i = 0 ; i < spot->num ; i++)
1509	{
1510	re_dfastate_t *state = spot->array[i];
1511	if (hash != state->hash)
1512	continue;
1513	if (re_node_set_compare (&state->nodes, nodes))
1514	return state;
1515	}
1516
1517	/* There are no appropriate state in the dfa, create the new one. */
1518	new_state = create_ci_newstate (dfa, nodes, hash);
1519	if (__glibc_unlikely (new_state == NULL))
1520	*err = REG_ESPACE;
1521
1522	return new_state;
1523	}
1524
1525	/* Search for the state whose node_set is equivalent to NODES and
1526	whose context is equivalent to CONTEXT.
1527	Return the pointer to the state, if we found it in the DFA.
1528	Otherwise create the new one and return it. In case of an error
1529	return NULL and set the error code in ERR.
1530	Note: - We assume NULL as the invalid state, then it is possible that
1531	return value is NULL and ERR is REG_NOERROR.
1532	- We never return non-NULL value in case of any errors, it is for
1533	optimization. */
1534
1535	static re_dfastate_t *
1536	__attribute_warn_unused_result__
1537	re_acquire_state_context (reg_errcode_t err, const re_dfa_t dfa,
1538	const re_node_set *nodes, unsigned int context)
1539	{
1540	re_hashval_t hash;
1541	re_dfastate_t *new_state;
1542	struct re_state_table_entry *spot;
1543	Idx i;
1544	#if defined GCC_LINT \|\| defined lint
1545	/* Suppress bogus uninitialized-variable warnings. */
1546	*err = REG_NOERROR;
1547	#endif
1548	if (nodes->nelem == 0)
1549	{
1550	*err = REG_NOERROR;
1551	return NULL;
1552	}
1553	hash = calc_state_hash (nodes, context);
1554	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1555
1556	for (i = 0 ; i < spot->num ; i++)
1557	{
1558	re_dfastate_t *state = spot->array[i];
1559	if (state->hash == hash
1560	&& state->context == context
1561	&& re_node_set_compare (state->entrance_nodes, nodes))
1562	return state;
1563	}
1564	/* There are no appropriate state in 'dfa', create the new one. */
1565	new_state = create_cd_newstate (dfa, nodes, context, hash);
1566	if (__glibc_unlikely (new_state == NULL))
1567	*err = REG_ESPACE;
1568
1569	return new_state;
1570	}
1571
1572	/* Finish initialization of the new state NEWSTATE, and using its hash value
1573	HASH put in the appropriate bucket of DFA's state table. Return value
1574	indicates the error code if failed. */
1575
1576	static reg_errcode_t
1577	__attribute_warn_unused_result__
1578	register_state (const re_dfa_t dfa, re_dfastate_t newstate,
1579	re_hashval_t hash)
1580	{
1581	struct re_state_table_entry *spot;
1582	reg_errcode_t err;
1583	Idx i;
1584
1585	newstate->hash = hash;
1586	err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1587	if (__glibc_unlikely (err != REG_NOERROR))
1588	return REG_ESPACE;
1589	for (i = 0; i < newstate->nodes.nelem; i++)
1590	{
1591	Idx elem = newstate->nodes.elems[i];
1592	if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1593	if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1594	return REG_ESPACE;
1595	}
1596
1597	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1598	if (__glibc_unlikely (spot->alloc <= spot->num))
1599	{
1600	Idx new_alloc = 2 * spot->num + 2;
1601	re_dfastate_t *new_array = re_realloc (spot->array, re_dfastate_t ,
1602	new_alloc);
1603	if (__glibc_unlikely (new_array == NULL))
1604	return REG_ESPACE;
1605	spot->array = new_array;
1606	spot->alloc = new_alloc;
1607	}
1608	spot->array[spot->num++] = newstate;
1609	return REG_NOERROR;
1610	}
1611
1612	static void
1613	free_state (re_dfastate_t *state)
1614	{
1615	re_node_set_free (&state->non_eps_nodes);
1616	re_node_set_free (&state->inveclosure);
1617	if (state->entrance_nodes != &state->nodes)
1618	{
1619	re_node_set_free (state->entrance_nodes);
1620	re_free (state->entrance_nodes);
1621	}
1622	re_node_set_free (&state->nodes);
1623	re_free (state->word_trtable);
1624	re_free (state->trtable);
1625	re_free (state);
1626	}
1627
1628	/* Create the new state which is independent of contexts.
1629	Return the new state if succeeded, otherwise return NULL. */
1630
1631	static re_dfastate_t *
1632	__attribute_warn_unused_result__
1633	create_ci_newstate (const re_dfa_t dfa, const re_node_set nodes,
1634	re_hashval_t hash)
1635	{
1636	Idx i;
1637	reg_errcode_t err;
1638	re_dfastate_t *newstate;
1639
1640	newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1641	if (__glibc_unlikely (newstate == NULL))
1642	return NULL;
1643	err = re_node_set_init_copy (&newstate->nodes, nodes);
1644	if (__glibc_unlikely (err != REG_NOERROR))
1645	{
1646	re_free (newstate);
1647	return NULL;
1648	}
1649
1650	newstate->entrance_nodes = &newstate->nodes;
1651	for (i = 0 ; i < nodes->nelem ; i++)
1652	{
1653	re_token_t *node = dfa->nodes + nodes->elems[i];
1654	re_token_type_t type = node->type;
1655	if (type == CHARACTER && !node->constraint)
1656	continue;
1657	#ifdef RE_ENABLE_I18N
1658	newstate->accept_mb \|= node->accept_mb;
1659	#endif /* RE_ENABLE_I18N */
1660
1661	/* If the state has the halt node, the state is a halt state. */
1662	if (type == END_OF_RE)
1663	newstate->halt = 1;
1664	else if (type == OP_BACK_REF)
1665	newstate->has_backref = 1;
1666	else if (type == ANCHOR \|\| node->constraint)
1667	newstate->has_constraint = 1;
1668	}
1669	err = register_state (dfa, newstate, hash);
1670	if (__glibc_unlikely (err != REG_NOERROR))
1671	{
1672	free_state (newstate);
1673	newstate = NULL;
1674	}
1675	return newstate;
1676	}
1677
1678	/* Create the new state which is depend on the context CONTEXT.
1679	Return the new state if succeeded, otherwise return NULL. */
1680
1681	static re_dfastate_t *
1682	__attribute_warn_unused_result__
1683	create_cd_newstate (const re_dfa_t dfa, const re_node_set nodes,
1684	unsigned int context, re_hashval_t hash)
1685	{
1686	Idx i, nctx_nodes = 0;
1687	reg_errcode_t err;
1688	re_dfastate_t *newstate;
1689
1690	newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1691	if (__glibc_unlikely (newstate == NULL))
1692	return NULL;
1693	err = re_node_set_init_copy (&newstate->nodes, nodes);
1694	if (__glibc_unlikely (err != REG_NOERROR))
1695	{
1696	re_free (newstate);
1697	return NULL;
1698	}
1699
1700	newstate->context = context;
1701	newstate->entrance_nodes = &newstate->nodes;
1702
1703	for (i = 0 ; i < nodes->nelem ; i++)
1704	{
1705	re_token_t *node = dfa->nodes + nodes->elems[i];
1706	re_token_type_t type = node->type;
1707	unsigned int constraint = node->constraint;
1708
1709	if (type == CHARACTER && !constraint)
1710	continue;
1711	#ifdef RE_ENABLE_I18N
1712	newstate->accept_mb \|= node->accept_mb;
1713	#endif /* RE_ENABLE_I18N */
1714
1715	/* If the state has the halt node, the state is a halt state. */
1716	if (type == END_OF_RE)
1717	newstate->halt = 1;
1718	else if (type == OP_BACK_REF)
1719	newstate->has_backref = 1;
1720
1721	if (constraint)
1722	{
1723	if (newstate->entrance_nodes == &newstate->nodes)
1724	{
1725	re_node_set *entrance_nodes = re_malloc (re_node_set, 1);
1726	if (__glibc_unlikely (entrance_nodes == NULL))
1727	{
1728	free_state (newstate);
1729	return NULL;
1730	}
1731	newstate->entrance_nodes = entrance_nodes;
1732	if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1733	!= REG_NOERROR)
1734	{
1735	free_state (newstate);
1736	return NULL;
1737	}
1738	nctx_nodes = 0;
1739	newstate->has_constraint = 1;
1740	}
1741
1742	if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1743	{
1744	re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1745	++nctx_nodes;
1746	}
1747	}
1748	}
1749	err = register_state (dfa, newstate, hash);
1750	if (__glibc_unlikely (err != REG_NOERROR))
1751	{
1752	free_state (newstate);
1753	newstate = NULL;
1754	}
1755	return newstate;
1756	}

Note: See TracBrowser for help on using the repository browser.

source: kBuild/vendor/grep/current/lib/regex_internal.c@ 3530

Download in other formats: