VirtualBox

source: kBuild/trunk/src/sed/lib/regex_internal.c@ 3613

Last change on this file since 3613 was 3613, checked in by bird, 6 months ago

src/sed: Merged in changes between 4.1.5 and 4.9 from the vendor branch. (svn merge /vendor/sed/4.1.5 /vendor/sed/current .)

File size: 47.5 KB
Line 
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <[email protected]>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
19
20static void re_string_construct_common (const char *str, Idx len,
21 re_string_t *pstr,
22 RE_TRANSLATE_TYPE trans, bool icase,
23 const re_dfa_t *dfa);
24static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
25 const re_node_set *nodes,
26 re_hashval_t hash);
27static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
28 const re_node_set *nodes,
29 unsigned int context,
30 re_hashval_t hash);
31static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
32 Idx new_buf_len);
33static void build_wcs_buffer (re_string_t *pstr);
34static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr);
35static void build_upper_buffer (re_string_t *pstr);
36static void re_string_translate_buffer (re_string_t *pstr);
37static unsigned int re_string_context_at (const re_string_t *input, Idx idx,
38 int eflags) __attribute__ ((pure));
39
40
41/* Functions for string operation. */
42
43/* This function allocate the buffers. It is necessary to call
44 re_string_reconstruct before using the object. */
45
46static reg_errcode_t
47__attribute_warn_unused_result__
48re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
49 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
50{
51 reg_errcode_t ret;
52 Idx init_buf_len;
53
54 /* Ensure at least one character fits into the buffers. */
55 if (init_len < dfa->mb_cur_max)
56 init_len = dfa->mb_cur_max;
57 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
58 re_string_construct_common (str, len, pstr, trans, icase, dfa);
59
60 ret = re_string_realloc_buffers (pstr, init_buf_len);
61 if (__glibc_unlikely (ret != REG_NOERROR))
62 return ret;
63
64 pstr->word_char = dfa->word_char;
65 pstr->word_ops_used = dfa->word_ops_used;
66 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
67 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
68 pstr->valid_raw_len = pstr->valid_len;
69 return REG_NOERROR;
70}
71
72/* This function allocate the buffers, and initialize them. */
73
74static reg_errcode_t
75__attribute_warn_unused_result__
76re_string_construct (re_string_t *pstr, const char *str, Idx len,
77 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
78{
79 reg_errcode_t ret;
80 memset (pstr, '\0', sizeof (re_string_t));
81 re_string_construct_common (str, len, pstr, trans, icase, dfa);
82
83 if (len > 0)
84 {
85 ret = re_string_realloc_buffers (pstr, len + 1);
86 if (__glibc_unlikely (ret != REG_NOERROR))
87 return ret;
88 }
89 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
90
91 if (icase)
92 {
93 if (dfa->mb_cur_max > 1)
94 {
95 while (1)
96 {
97 ret = build_wcs_upper_buffer (pstr);
98 if (__glibc_unlikely (ret != REG_NOERROR))
99 return ret;
100 if (pstr->valid_raw_len >= len)
101 break;
102 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
103 break;
104 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
105 if (__glibc_unlikely (ret != REG_NOERROR))
106 return ret;
107 }
108 }
109 else
110 build_upper_buffer (pstr);
111 }
112 else
113 {
114 if (dfa->mb_cur_max > 1)
115 build_wcs_buffer (pstr);
116 else
117 {
118 if (trans != NULL)
119 re_string_translate_buffer (pstr);
120 else
121 {
122 pstr->valid_len = pstr->bufs_len;
123 pstr->valid_raw_len = pstr->bufs_len;
124 }
125 }
126 }
127
128 return REG_NOERROR;
129}
130
131/* Helper functions for re_string_allocate, and re_string_construct. */
132
133static reg_errcode_t
134__attribute_warn_unused_result__
135re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
136{
137 if (pstr->mb_cur_max > 1)
138 {
139 wint_t *new_wcs;
140
141 /* Avoid overflow in realloc. */
142 const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
143 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
144 < new_buf_len))
145 return REG_ESPACE;
146
147 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
148 if (__glibc_unlikely (new_wcs == NULL))
149 return REG_ESPACE;
150 pstr->wcs = new_wcs;
151 if (pstr->offsets != NULL)
152 {
153 Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
154 if (__glibc_unlikely (new_offsets == NULL))
155 return REG_ESPACE;
156 pstr->offsets = new_offsets;
157 }
158 }
159 if (pstr->mbs_allocated)
160 {
161 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
162 new_buf_len);
163 if (__glibc_unlikely (new_mbs == NULL))
164 return REG_ESPACE;
165 pstr->mbs = new_mbs;
166 }
167 pstr->bufs_len = new_buf_len;
168 return REG_NOERROR;
169}
170
171
172static void
173re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
174 RE_TRANSLATE_TYPE trans, bool icase,
175 const re_dfa_t *dfa)
176{
177 pstr->raw_mbs = (const unsigned char *) str;
178 pstr->len = len;
179 pstr->raw_len = len;
180 pstr->trans = trans;
181 pstr->icase = icase;
182 pstr->mbs_allocated = (trans != NULL || icase);
183 pstr->mb_cur_max = dfa->mb_cur_max;
184 pstr->is_utf8 = dfa->is_utf8;
185 pstr->map_notascii = dfa->map_notascii;
186 pstr->stop = pstr->len;
187 pstr->raw_stop = pstr->stop;
188}
189
190
191/* Build wide character buffer PSTR->WCS.
192 If the byte sequence of the string are:
193 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
194 Then wide character buffer will be:
195 <wc1> , WEOF , <wc2> , WEOF , <wc3>
196 We use WEOF for padding, they indicate that the position isn't
197 a first byte of a multibyte character.
198
199 Note that this function assumes PSTR->VALID_LEN elements are already
200 built and starts from PSTR->VALID_LEN. */
201
202static void
203build_wcs_buffer (re_string_t *pstr)
204{
205#ifdef _LIBC
206 unsigned char buf[MB_LEN_MAX];
207 DEBUG_ASSERT (MB_LEN_MAX >= pstr->mb_cur_max);
208#else
209 unsigned char buf[64];
210#endif
211 mbstate_t prev_st;
212 Idx byte_idx, end_idx, remain_len;
213 size_t mbclen;
214
215 /* Build the buffers from pstr->valid_len to either pstr->len or
216 pstr->bufs_len. */
217 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
218 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
219 {
220 wchar_t wc;
221 const char *p;
222
223 remain_len = end_idx - byte_idx;
224 prev_st = pstr->cur_state;
225 /* Apply the translation if we need. */
226 if (__glibc_unlikely (pstr->trans != NULL))
227 {
228 int i, ch;
229
230 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
231 {
232 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
233 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
234 }
235 p = (const char *) buf;
236 }
237 else
238 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
239 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
240 if (__glibc_unlikely (mbclen == (size_t) -1 || mbclen == 0
241 || (mbclen == (size_t) -2
242 && pstr->bufs_len >= pstr->len)))
243 {
244 /* We treat these cases as a singlebyte character. */
245 mbclen = 1;
246 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
247 if (__glibc_unlikely (pstr->trans != NULL))
248 wc = pstr->trans[wc];
249 pstr->cur_state = prev_st;
250 }
251 else if (__glibc_unlikely (mbclen == (size_t) -2))
252 {
253 /* The buffer doesn't have enough space, finish to build. */
254 pstr->cur_state = prev_st;
255 break;
256 }
257
258 /* Write wide character and padding. */
259 pstr->wcs[byte_idx++] = wc;
260 /* Write paddings. */
261 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
262 pstr->wcs[byte_idx++] = WEOF;
263 }
264 pstr->valid_len = byte_idx;
265 pstr->valid_raw_len = byte_idx;
266}
267
268/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
269 but for REG_ICASE. */
270
271static reg_errcode_t
272__attribute_warn_unused_result__
273build_wcs_upper_buffer (re_string_t *pstr)
274{
275 mbstate_t prev_st;
276 Idx src_idx, byte_idx, end_idx, remain_len;
277 size_t mbclen;
278#ifdef _LIBC
279 char buf[MB_LEN_MAX];
280 DEBUG_ASSERT (pstr->mb_cur_max <= MB_LEN_MAX);
281#else
282 char buf[64];
283#endif
284
285 byte_idx = pstr->valid_len;
286 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
287
288 /* The following optimization assumes that ASCII characters can be
289 mapped to wide characters with a simple cast. */
290 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
291 {
292 while (byte_idx < end_idx)
293 {
294 wchar_t wc;
295 unsigned char ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
296
297 if (isascii (ch) && mbsinit (&pstr->cur_state))
298 {
299 /* The next step uses the assumption that wchar_t is encoded
300 ASCII-safe: all ASCII values can be converted like this. */
301 wchar_t wcu = __towupper (ch);
302 if (isascii (wcu))
303 {
304 pstr->mbs[byte_idx] = wcu;
305 pstr->wcs[byte_idx] = wcu;
306 byte_idx++;
307 continue;
308 }
309 }
310
311 remain_len = end_idx - byte_idx;
312 prev_st = pstr->cur_state;
313 mbclen = __mbrtowc (&wc,
314 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
315 + byte_idx), remain_len, &pstr->cur_state);
316 if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
317 {
318 wchar_t wcu = __towupper (wc);
319 if (wcu != wc)
320 {
321 size_t mbcdlen;
322
323 mbcdlen = __wcrtomb (buf, wcu, &prev_st);
324 if (__glibc_likely (mbclen == mbcdlen))
325 memcpy (pstr->mbs + byte_idx, buf, mbclen);
326 else
327 {
328 src_idx = byte_idx;
329 goto offsets_needed;
330 }
331 }
332 else
333 memcpy (pstr->mbs + byte_idx,
334 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
335 pstr->wcs[byte_idx++] = wcu;
336 /* Write paddings. */
337 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
338 pstr->wcs[byte_idx++] = WEOF;
339 }
340 else if (mbclen == (size_t) -1 || mbclen == 0
341 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
342 {
343 /* It is an invalid character, an incomplete character
344 at the end of the string, or '\0'. Just use the byte. */
345 pstr->mbs[byte_idx] = ch;
346 /* And also cast it to wide char. */
347 pstr->wcs[byte_idx++] = (wchar_t) ch;
348 if (__glibc_unlikely (mbclen == (size_t) -1))
349 pstr->cur_state = prev_st;
350 }
351 else
352 {
353 /* The buffer doesn't have enough space, finish to build. */
354 pstr->cur_state = prev_st;
355 break;
356 }
357 }
358 pstr->valid_len = byte_idx;
359 pstr->valid_raw_len = byte_idx;
360 return REG_NOERROR;
361 }
362 else
363 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
364 {
365 wchar_t wc;
366 const char *p;
367 offsets_needed:
368 remain_len = end_idx - byte_idx;
369 prev_st = pstr->cur_state;
370 if (__glibc_unlikely (pstr->trans != NULL))
371 {
372 int i, ch;
373
374 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
375 {
376 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
377 buf[i] = pstr->trans[ch];
378 }
379 p = (const char *) buf;
380 }
381 else
382 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
383 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
384 if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
385 {
386 wchar_t wcu = __towupper (wc);
387 if (wcu != wc)
388 {
389 size_t mbcdlen;
390
391 mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
392 if (__glibc_likely (mbclen == mbcdlen))
393 memcpy (pstr->mbs + byte_idx, buf, mbclen);
394 else if (mbcdlen != (size_t) -1)
395 {
396 size_t i;
397
398 if (byte_idx + mbcdlen > pstr->bufs_len)
399 {
400 pstr->cur_state = prev_st;
401 break;
402 }
403
404 if (pstr->offsets == NULL)
405 {
406 pstr->offsets = re_malloc (Idx, pstr->bufs_len);
407
408 if (pstr->offsets == NULL)
409 return REG_ESPACE;
410 }
411 if (!pstr->offsets_needed)
412 {
413 for (i = 0; i < (size_t) byte_idx; ++i)
414 pstr->offsets[i] = i;
415 pstr->offsets_needed = 1;
416 }
417
418 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
419 pstr->wcs[byte_idx] = wcu;
420 pstr->offsets[byte_idx] = src_idx;
421 for (i = 1; i < mbcdlen; ++i)
422 {
423 pstr->offsets[byte_idx + i]
424 = src_idx + (i < mbclen ? i : mbclen - 1);
425 pstr->wcs[byte_idx + i] = WEOF;
426 }
427 pstr->len += mbcdlen - mbclen;
428 if (pstr->raw_stop > src_idx)
429 pstr->stop += mbcdlen - mbclen;
430 end_idx = (pstr->bufs_len > pstr->len)
431 ? pstr->len : pstr->bufs_len;
432 byte_idx += mbcdlen;
433 src_idx += mbclen;
434 continue;
435 }
436 else
437 memcpy (pstr->mbs + byte_idx, p, mbclen);
438 }
439 else
440 memcpy (pstr->mbs + byte_idx, p, mbclen);
441
442 if (__glibc_unlikely (pstr->offsets_needed != 0))
443 {
444 size_t i;
445 for (i = 0; i < mbclen; ++i)
446 pstr->offsets[byte_idx + i] = src_idx + i;
447 }
448 src_idx += mbclen;
449
450 pstr->wcs[byte_idx++] = wcu;
451 /* Write paddings. */
452 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
453 pstr->wcs[byte_idx++] = WEOF;
454 }
455 else if (mbclen == (size_t) -1 || mbclen == 0
456 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
457 {
458 /* It is an invalid character or '\0'. Just use the byte. */
459 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
460
461 if (__glibc_unlikely (pstr->trans != NULL))
462 ch = pstr->trans [ch];
463 pstr->mbs[byte_idx] = ch;
464
465 if (__glibc_unlikely (pstr->offsets_needed != 0))
466 pstr->offsets[byte_idx] = src_idx;
467 ++src_idx;
468
469 /* And also cast it to wide char. */
470 pstr->wcs[byte_idx++] = (wchar_t) ch;
471 if (__glibc_unlikely (mbclen == (size_t) -1))
472 pstr->cur_state = prev_st;
473 }
474 else
475 {
476 /* The buffer doesn't have enough space, finish to build. */
477 pstr->cur_state = prev_st;
478 break;
479 }
480 }
481 pstr->valid_len = byte_idx;
482 pstr->valid_raw_len = src_idx;
483 return REG_NOERROR;
484}
485
486/* Skip characters until the index becomes greater than NEW_RAW_IDX.
487 Return the index. */
488
489static Idx
490re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
491{
492 mbstate_t prev_st;
493 Idx rawbuf_idx;
494 size_t mbclen;
495 wint_t wc = WEOF;
496
497 /* Skip the characters which are not necessary to check. */
498 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
499 rawbuf_idx < new_raw_idx;)
500 {
501 wchar_t wc2;
502 Idx remain_len = pstr->raw_len - rawbuf_idx;
503 prev_st = pstr->cur_state;
504 mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
505 remain_len, &pstr->cur_state);
506 if (__glibc_unlikely (mbclen == (size_t) -2 || mbclen == (size_t) -1
507 || mbclen == 0))
508 {
509 /* We treat these cases as a single byte character. */
510 if (mbclen == 0 || remain_len == 0)
511 wc = L'\0';
512 else
513 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
514 mbclen = 1;
515 pstr->cur_state = prev_st;
516 }
517 else
518 wc = wc2;
519 /* Then proceed the next character. */
520 rawbuf_idx += mbclen;
521 }
522 *last_wc = wc;
523 return rawbuf_idx;
524}
525
526/* Build the buffer PSTR->MBS, and apply the translation if we need.
527 This function is used in case of REG_ICASE. */
528
529static void
530build_upper_buffer (re_string_t *pstr)
531{
532 Idx char_idx, end_idx;
533 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
534
535 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
536 {
537 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
538 if (__glibc_unlikely (pstr->trans != NULL))
539 ch = pstr->trans[ch];
540 pstr->mbs[char_idx] = toupper (ch);
541 }
542 pstr->valid_len = char_idx;
543 pstr->valid_raw_len = char_idx;
544}
545
546/* Apply TRANS to the buffer in PSTR. */
547
548static void
549re_string_translate_buffer (re_string_t *pstr)
550{
551 Idx buf_idx, end_idx;
552 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
553
554 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
555 {
556 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
557 pstr->mbs[buf_idx] = pstr->trans[ch];
558 }
559
560 pstr->valid_len = buf_idx;
561 pstr->valid_raw_len = buf_idx;
562}
563
564/* This function re-construct the buffers.
565 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
566 convert to upper case in case of REG_ICASE, apply translation. */
567
568static reg_errcode_t
569__attribute_warn_unused_result__
570re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
571{
572 Idx offset;
573
574 if (__glibc_unlikely (pstr->raw_mbs_idx <= idx))
575 offset = idx - pstr->raw_mbs_idx;
576 else
577 {
578 /* Reset buffer. */
579 if (pstr->mb_cur_max > 1)
580 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
581 pstr->len = pstr->raw_len;
582 pstr->stop = pstr->raw_stop;
583 pstr->valid_len = 0;
584 pstr->raw_mbs_idx = 0;
585 pstr->valid_raw_len = 0;
586 pstr->offsets_needed = 0;
587 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
588 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
589 if (!pstr->mbs_allocated)
590 pstr->mbs = (unsigned char *) pstr->raw_mbs;
591 offset = idx;
592 }
593
594 if (__glibc_likely (offset != 0))
595 {
596 /* Should the already checked characters be kept? */
597 if (__glibc_likely (offset < pstr->valid_raw_len))
598 {
599 /* Yes, move them to the front of the buffer. */
600 if (__glibc_unlikely (pstr->offsets_needed))
601 {
602 Idx low = 0, high = pstr->valid_len, mid;
603 do
604 {
605 mid = (high + low) / 2;
606 if (pstr->offsets[mid] > offset)
607 high = mid;
608 else if (pstr->offsets[mid] < offset)
609 low = mid + 1;
610 else
611 break;
612 }
613 while (low < high);
614 if (pstr->offsets[mid] < offset)
615 ++mid;
616 pstr->tip_context = re_string_context_at (pstr, mid - 1,
617 eflags);
618 /* This can be quite complicated, so handle specially
619 only the common and easy case where the character with
620 different length representation of lower and upper
621 case is present at or after offset. */
622 if (pstr->valid_len > offset
623 && mid == offset && pstr->offsets[mid] == offset)
624 {
625 memmove (pstr->wcs, pstr->wcs + offset,
626 (pstr->valid_len - offset) * sizeof (wint_t));
627 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
628 pstr->valid_len -= offset;
629 pstr->valid_raw_len -= offset;
630 for (low = 0; low < pstr->valid_len; low++)
631 pstr->offsets[low] = pstr->offsets[low + offset] - offset;
632 }
633 else
634 {
635 /* Otherwise, just find out how long the partial multibyte
636 character at offset is and fill it with WEOF/255. */
637 pstr->len = pstr->raw_len - idx + offset;
638 pstr->stop = pstr->raw_stop - idx + offset;
639 pstr->offsets_needed = 0;
640 while (mid > 0 && pstr->offsets[mid - 1] == offset)
641 --mid;
642 while (mid < pstr->valid_len)
643 if (pstr->wcs[mid] != WEOF)
644 break;
645 else
646 ++mid;
647 if (mid == pstr->valid_len)
648 pstr->valid_len = 0;
649 else
650 {
651 pstr->valid_len = pstr->offsets[mid] - offset;
652 if (pstr->valid_len)
653 {
654 for (low = 0; low < pstr->valid_len; ++low)
655 pstr->wcs[low] = WEOF;
656 memset (pstr->mbs, 255, pstr->valid_len);
657 }
658 }
659 pstr->valid_raw_len = pstr->valid_len;
660 }
661 }
662 else
663 {
664 pstr->tip_context = re_string_context_at (pstr, offset - 1,
665 eflags);
666 if (pstr->mb_cur_max > 1)
667 memmove (pstr->wcs, pstr->wcs + offset,
668 (pstr->valid_len - offset) * sizeof (wint_t));
669 if (__glibc_unlikely (pstr->mbs_allocated))
670 memmove (pstr->mbs, pstr->mbs + offset,
671 pstr->valid_len - offset);
672 pstr->valid_len -= offset;
673 pstr->valid_raw_len -= offset;
674 DEBUG_ASSERT (pstr->valid_len > 0);
675 }
676 }
677 else
678 {
679 /* No, skip all characters until IDX. */
680 Idx prev_valid_len = pstr->valid_len;
681
682 if (__glibc_unlikely (pstr->offsets_needed))
683 {
684 pstr->len = pstr->raw_len - idx + offset;
685 pstr->stop = pstr->raw_stop - idx + offset;
686 pstr->offsets_needed = 0;
687 }
688 pstr->valid_len = 0;
689 if (pstr->mb_cur_max > 1)
690 {
691 Idx wcs_idx;
692 wint_t wc = WEOF;
693
694 if (pstr->is_utf8)
695 {
696 const unsigned char *raw, *p, *end;
697
698 /* Special case UTF-8. Multi-byte chars start with any
699 byte other than 0x80 - 0xbf. */
700 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
701 end = raw + (offset - pstr->mb_cur_max);
702 if (end < pstr->raw_mbs)
703 end = pstr->raw_mbs;
704 p = raw + offset - 1;
705#ifdef _LIBC
706 /* We know the wchar_t encoding is UCS4, so for the simple
707 case, ASCII characters, skip the conversion step. */
708 if (isascii (*p) && __glibc_likely (pstr->trans == NULL))
709 {
710 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
711 /* pstr->valid_len = 0; */
712 wc = (wchar_t) *p;
713 }
714 else
715#endif
716 for (; p >= end; --p)
717 if ((*p & 0xc0) != 0x80)
718 {
719 mbstate_t cur_state;
720 wchar_t wc2;
721 Idx mlen = raw + pstr->len - p;
722 unsigned char buf[6];
723 size_t mbclen;
724
725 const unsigned char *pp = p;
726 if (__glibc_unlikely (pstr->trans != NULL))
727 {
728 int i = mlen < 6 ? mlen : 6;
729 while (--i >= 0)
730 buf[i] = pstr->trans[p[i]];
731 pp = buf;
732 }
733 /* XXX Don't use mbrtowc, we know which conversion
734 to use (UTF-8 -> UCS4). */
735 memset (&cur_state, 0, sizeof (cur_state));
736 mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
737 &cur_state);
738 if (raw + offset - p <= mbclen
739 && mbclen < (size_t) -2)
740 {
741 memset (&pstr->cur_state, '\0',
742 sizeof (mbstate_t));
743 pstr->valid_len = mbclen - (raw + offset - p);
744 wc = wc2;
745 }
746 break;
747 }
748 }
749
750 if (wc == WEOF)
751 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
752 if (wc == WEOF)
753 pstr->tip_context
754 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
755 else
756 pstr->tip_context = ((__glibc_unlikely (pstr->word_ops_used != 0)
757 && IS_WIDE_WORD_CHAR (wc))
758 ? CONTEXT_WORD
759 : ((IS_WIDE_NEWLINE (wc)
760 && pstr->newline_anchor)
761 ? CONTEXT_NEWLINE : 0));
762 if (__glibc_unlikely (pstr->valid_len))
763 {
764 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
765 pstr->wcs[wcs_idx] = WEOF;
766 if (pstr->mbs_allocated)
767 memset (pstr->mbs, 255, pstr->valid_len);
768 }
769 pstr->valid_raw_len = pstr->valid_len;
770 }
771 else
772 {
773 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
774 pstr->valid_raw_len = 0;
775 if (pstr->trans)
776 c = pstr->trans[c];
777 pstr->tip_context = (bitset_contain (pstr->word_char, c)
778 ? CONTEXT_WORD
779 : ((IS_NEWLINE (c) && pstr->newline_anchor)
780 ? CONTEXT_NEWLINE : 0));
781 }
782 }
783 if (!__glibc_unlikely (pstr->mbs_allocated))
784 pstr->mbs += offset;
785 }
786 pstr->raw_mbs_idx = idx;
787 pstr->len -= offset;
788 pstr->stop -= offset;
789
790 /* Then build the buffers. */
791 if (pstr->mb_cur_max > 1)
792 {
793 if (pstr->icase)
794 {
795 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
796 if (__glibc_unlikely (ret != REG_NOERROR))
797 return ret;
798 }
799 else
800 build_wcs_buffer (pstr);
801 }
802 else
803 if (__glibc_unlikely (pstr->mbs_allocated))
804 {
805 if (pstr->icase)
806 build_upper_buffer (pstr);
807 else if (pstr->trans != NULL)
808 re_string_translate_buffer (pstr);
809 }
810 else
811 pstr->valid_len = pstr->len;
812
813 pstr->cur_idx = 0;
814 return REG_NOERROR;
815}
816
817static unsigned char
818__attribute__ ((pure))
819re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
820{
821 int ch;
822 Idx off;
823
824 /* Handle the common (easiest) cases first. */
825 if (__glibc_likely (!pstr->mbs_allocated))
826 return re_string_peek_byte (pstr, idx);
827
828 if (pstr->mb_cur_max > 1
829 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
830 return re_string_peek_byte (pstr, idx);
831
832 off = pstr->cur_idx + idx;
833 if (pstr->offsets_needed)
834 off = pstr->offsets[off];
835
836 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
837
838 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
839 this function returns CAPITAL LETTER I instead of first byte of
840 DOTLESS SMALL LETTER I. The latter would confuse the parser,
841 since peek_byte_case doesn't advance cur_idx in any way. */
842 if (pstr->offsets_needed && !isascii (ch))
843 return re_string_peek_byte (pstr, idx);
844
845 return ch;
846}
847
848static unsigned char
849re_string_fetch_byte_case (re_string_t *pstr)
850{
851 if (__glibc_likely (!pstr->mbs_allocated))
852 return re_string_fetch_byte (pstr);
853
854 if (pstr->offsets_needed)
855 {
856 Idx off;
857 int ch;
858
859 /* For tr_TR.UTF-8 [[:islower:]] there is
860 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
861 in that case the whole multi-byte character and return
862 the original letter. On the other side, with
863 [[: DOTLESS SMALL LETTER I return [[:I, as doing
864 anything else would complicate things too much. */
865
866 if (!re_string_first_byte (pstr, pstr->cur_idx))
867 return re_string_fetch_byte (pstr);
868
869 off = pstr->offsets[pstr->cur_idx];
870 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
871
872 if (! isascii (ch))
873 return re_string_fetch_byte (pstr);
874
875 re_string_skip_bytes (pstr,
876 re_string_char_size_at (pstr, pstr->cur_idx));
877 return ch;
878 }
879
880 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
881}
882
883static void
884re_string_destruct (re_string_t *pstr)
885{
886 re_free (pstr->wcs);
887 re_free (pstr->offsets);
888 if (pstr->mbs_allocated)
889 re_free (pstr->mbs);
890}
891
892/* Return the context at IDX in INPUT. */
893
894static unsigned int
895re_string_context_at (const re_string_t *input, Idx idx, int eflags)
896{
897 int c;
898 if (__glibc_unlikely (idx < 0))
899 /* In this case, we use the value stored in input->tip_context,
900 since we can't know the character in input->mbs[-1] here. */
901 return input->tip_context;
902 if (__glibc_unlikely (idx == input->len))
903 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
904 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
905 if (input->mb_cur_max > 1)
906 {
907 wint_t wc;
908 Idx wc_idx = idx;
909 while(input->wcs[wc_idx] == WEOF)
910 {
911 DEBUG_ASSERT (wc_idx >= 0);
912 --wc_idx;
913 if (wc_idx < 0)
914 return input->tip_context;
915 }
916 wc = input->wcs[wc_idx];
917 if (__glibc_unlikely (input->word_ops_used != 0)
918 && IS_WIDE_WORD_CHAR (wc))
919 return CONTEXT_WORD;
920 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
921 ? CONTEXT_NEWLINE : 0);
922 }
923 else
924 {
925 c = re_string_byte_at (input, idx);
926 if (bitset_contain (input->word_char, c))
927 return CONTEXT_WORD;
928 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
929 }
930}
931
932
933/* Functions for set operation. */
934
935static reg_errcode_t
936__attribute_warn_unused_result__
937re_node_set_alloc (re_node_set *set, Idx size)
938{
939 set->alloc = size;
940 set->nelem = 0;
941 set->elems = re_malloc (Idx, size);
942 if (__glibc_unlikely (set->elems == NULL)
943 && (MALLOC_0_IS_NONNULL || size != 0))
944 return REG_ESPACE;
945 return REG_NOERROR;
946}
947
948static reg_errcode_t
949__attribute_warn_unused_result__
950re_node_set_init_1 (re_node_set *set, Idx elem)
951{
952 set->alloc = 1;
953 set->nelem = 1;
954 set->elems = re_malloc (Idx, 1);
955 if (__glibc_unlikely (set->elems == NULL))
956 {
957 set->alloc = set->nelem = 0;
958 return REG_ESPACE;
959 }
960 set->elems[0] = elem;
961 return REG_NOERROR;
962}
963
964static reg_errcode_t
965__attribute_warn_unused_result__
966re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
967{
968 set->alloc = 2;
969 set->elems = re_malloc (Idx, 2);
970 if (__glibc_unlikely (set->elems == NULL))
971 return REG_ESPACE;
972 if (elem1 == elem2)
973 {
974 set->nelem = 1;
975 set->elems[0] = elem1;
976 }
977 else
978 {
979 set->nelem = 2;
980 if (elem1 < elem2)
981 {
982 set->elems[0] = elem1;
983 set->elems[1] = elem2;
984 }
985 else
986 {
987 set->elems[0] = elem2;
988 set->elems[1] = elem1;
989 }
990 }
991 return REG_NOERROR;
992}
993
994static reg_errcode_t
995__attribute_warn_unused_result__
996re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
997{
998 dest->nelem = src->nelem;
999 if (src->nelem > 0)
1000 {
1001 dest->alloc = dest->nelem;
1002 dest->elems = re_malloc (Idx, dest->alloc);
1003 if (__glibc_unlikely (dest->elems == NULL))
1004 {
1005 dest->alloc = dest->nelem = 0;
1006 return REG_ESPACE;
1007 }
1008 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1009 }
1010 else
1011 re_node_set_init_empty (dest);
1012 return REG_NOERROR;
1013}
1014
1015/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1016 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1017 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1018
1019static reg_errcode_t
1020__attribute_warn_unused_result__
1021re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1022 const re_node_set *src2)
1023{
1024 Idx i1, i2, is, id, delta, sbase;
1025 if (src1->nelem == 0 || src2->nelem == 0)
1026 return REG_NOERROR;
1027
1028 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1029 conservative estimate. */
1030 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1031 {
1032 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1033 Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1034 if (__glibc_unlikely (new_elems == NULL))
1035 return REG_ESPACE;
1036 dest->elems = new_elems;
1037 dest->alloc = new_alloc;
1038 }
1039
1040 /* Find the items in the intersection of SRC1 and SRC2, and copy
1041 into the top of DEST those that are not already in DEST itself. */
1042 sbase = dest->nelem + src1->nelem + src2->nelem;
1043 i1 = src1->nelem - 1;
1044 i2 = src2->nelem - 1;
1045 id = dest->nelem - 1;
1046 for (;;)
1047 {
1048 if (src1->elems[i1] == src2->elems[i2])
1049 {
1050 /* Try to find the item in DEST. Maybe we could binary search? */
1051 while (id >= 0 && dest->elems[id] > src1->elems[i1])
1052 --id;
1053
1054 if (id < 0 || dest->elems[id] != src1->elems[i1])
1055 dest->elems[--sbase] = src1->elems[i1];
1056
1057 if (--i1 < 0 || --i2 < 0)
1058 break;
1059 }
1060
1061 /* Lower the highest of the two items. */
1062 else if (src1->elems[i1] < src2->elems[i2])
1063 {
1064 if (--i2 < 0)
1065 break;
1066 }
1067 else
1068 {
1069 if (--i1 < 0)
1070 break;
1071 }
1072 }
1073
1074 id = dest->nelem - 1;
1075 is = dest->nelem + src1->nelem + src2->nelem - 1;
1076 delta = is - sbase + 1;
1077
1078 /* Now copy. When DELTA becomes zero, the remaining
1079 DEST elements are already in place; this is more or
1080 less the same loop that is in re_node_set_merge. */
1081 dest->nelem += delta;
1082 if (delta > 0 && id >= 0)
1083 for (;;)
1084 {
1085 if (dest->elems[is] > dest->elems[id])
1086 {
1087 /* Copy from the top. */
1088 dest->elems[id + delta--] = dest->elems[is--];
1089 if (delta == 0)
1090 break;
1091 }
1092 else
1093 {
1094 /* Slide from the bottom. */
1095 dest->elems[id + delta] = dest->elems[id];
1096 if (--id < 0)
1097 break;
1098 }
1099 }
1100
1101 /* Copy remaining SRC elements. */
1102 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1103
1104 return REG_NOERROR;
1105}
1106
1107/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1108 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1109
1110static reg_errcode_t
1111__attribute_warn_unused_result__
1112re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1113 const re_node_set *src2)
1114{
1115 Idx i1, i2, id;
1116 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1117 {
1118 dest->alloc = src1->nelem + src2->nelem;
1119 dest->elems = re_malloc (Idx, dest->alloc);
1120 if (__glibc_unlikely (dest->elems == NULL))
1121 return REG_ESPACE;
1122 }
1123 else
1124 {
1125 if (src1 != NULL && src1->nelem > 0)
1126 return re_node_set_init_copy (dest, src1);
1127 else if (src2 != NULL && src2->nelem > 0)
1128 return re_node_set_init_copy (dest, src2);
1129 else
1130 re_node_set_init_empty (dest);
1131 return REG_NOERROR;
1132 }
1133 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1134 {
1135 if (src1->elems[i1] > src2->elems[i2])
1136 {
1137 dest->elems[id++] = src2->elems[i2++];
1138 continue;
1139 }
1140 if (src1->elems[i1] == src2->elems[i2])
1141 ++i2;
1142 dest->elems[id++] = src1->elems[i1++];
1143 }
1144 if (i1 < src1->nelem)
1145 {
1146 memcpy (dest->elems + id, src1->elems + i1,
1147 (src1->nelem - i1) * sizeof (Idx));
1148 id += src1->nelem - i1;
1149 }
1150 else if (i2 < src2->nelem)
1151 {
1152 memcpy (dest->elems + id, src2->elems + i2,
1153 (src2->nelem - i2) * sizeof (Idx));
1154 id += src2->nelem - i2;
1155 }
1156 dest->nelem = id;
1157 return REG_NOERROR;
1158}
1159
1160/* Calculate the union set of the sets DEST and SRC. And store it to
1161 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1162
1163static reg_errcode_t
1164__attribute_warn_unused_result__
1165re_node_set_merge (re_node_set *dest, const re_node_set *src)
1166{
1167 Idx is, id, sbase, delta;
1168 if (src == NULL || src->nelem == 0)
1169 return REG_NOERROR;
1170 if (dest->alloc < 2 * src->nelem + dest->nelem)
1171 {
1172 Idx new_alloc = 2 * (src->nelem + dest->alloc);
1173 Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1174 if (__glibc_unlikely (new_buffer == NULL))
1175 return REG_ESPACE;
1176 dest->elems = new_buffer;
1177 dest->alloc = new_alloc;
1178 }
1179
1180 if (__glibc_unlikely (dest->nelem == 0))
1181 {
1182 /* Although we already guaranteed above that dest->alloc != 0 and
1183 therefore dest->elems != NULL, add a debug assertion to pacify
1184 GCC 11.2.1's -fanalyzer. */
1185 DEBUG_ASSERT (dest->elems);
1186 dest->nelem = src->nelem;
1187 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1188 return REG_NOERROR;
1189 }
1190
1191 /* Copy into the top of DEST the items of SRC that are not
1192 found in DEST. Maybe we could binary search in DEST? */
1193 for (sbase = dest->nelem + 2 * src->nelem,
1194 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1195 {
1196 if (dest->elems[id] == src->elems[is])
1197 is--, id--;
1198 else if (dest->elems[id] < src->elems[is])
1199 dest->elems[--sbase] = src->elems[is--];
1200 else /* if (dest->elems[id] > src->elems[is]) */
1201 --id;
1202 }
1203
1204 if (is >= 0)
1205 {
1206 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1207 sbase -= is + 1;
1208 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1209 }
1210
1211 id = dest->nelem - 1;
1212 is = dest->nelem + 2 * src->nelem - 1;
1213 delta = is - sbase + 1;
1214 if (delta == 0)
1215 return REG_NOERROR;
1216
1217 /* Now copy. When DELTA becomes zero, the remaining
1218 DEST elements are already in place. */
1219 dest->nelem += delta;
1220 for (;;)
1221 {
1222 if (dest->elems[is] > dest->elems[id])
1223 {
1224 /* Copy from the top. */
1225 dest->elems[id + delta--] = dest->elems[is--];
1226 if (delta == 0)
1227 break;
1228 }
1229 else
1230 {
1231 /* Slide from the bottom. */
1232 dest->elems[id + delta] = dest->elems[id];
1233 if (--id < 0)
1234 {
1235 /* Copy remaining SRC elements. */
1236 memcpy (dest->elems, dest->elems + sbase,
1237 delta * sizeof (Idx));
1238 break;
1239 }
1240 }
1241 }
1242
1243 return REG_NOERROR;
1244}
1245
1246/* Insert the new element ELEM to the re_node_set* SET.
1247 SET should not already have ELEM.
1248 Return true if successful. */
1249
1250static bool
1251__attribute_warn_unused_result__
1252re_node_set_insert (re_node_set *set, Idx elem)
1253{
1254 Idx idx;
1255 /* In case the set is empty. */
1256 if (set->alloc == 0)
1257 return __glibc_likely (re_node_set_init_1 (set, elem) == REG_NOERROR);
1258
1259 if (__glibc_unlikely (set->nelem) == 0)
1260 {
1261 /* Although we already guaranteed above that set->alloc != 0 and
1262 therefore set->elems != NULL, add a debug assertion to pacify
1263 GCC 11.2 -fanalyzer. */
1264 DEBUG_ASSERT (set->elems);
1265 set->elems[0] = elem;
1266 ++set->nelem;
1267 return true;
1268 }
1269
1270 /* Realloc if we need. */
1271 if (set->alloc == set->nelem)
1272 {
1273 Idx *new_elems;
1274 set->alloc = set->alloc * 2;
1275 new_elems = re_realloc (set->elems, Idx, set->alloc);
1276 if (__glibc_unlikely (new_elems == NULL))
1277 return false;
1278 set->elems = new_elems;
1279 }
1280
1281 /* Move the elements which follows the new element. Test the
1282 first element separately to skip a check in the inner loop. */
1283 if (elem < set->elems[0])
1284 {
1285 for (idx = set->nelem; idx > 0; idx--)
1286 set->elems[idx] = set->elems[idx - 1];
1287 }
1288 else
1289 {
1290 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1291 set->elems[idx] = set->elems[idx - 1];
1292 DEBUG_ASSERT (set->elems[idx - 1] < elem);
1293 }
1294
1295 /* Insert the new element. */
1296 set->elems[idx] = elem;
1297 ++set->nelem;
1298 return true;
1299}
1300
1301/* Insert the new element ELEM to the re_node_set* SET.
1302 SET should not already have any element greater than or equal to ELEM.
1303 Return true if successful. */
1304
1305static bool
1306__attribute_warn_unused_result__
1307re_node_set_insert_last (re_node_set *set, Idx elem)
1308{
1309 /* Realloc if we need. */
1310 if (set->alloc == set->nelem)
1311 {
1312 Idx *new_elems;
1313 set->alloc = (set->alloc + 1) * 2;
1314 new_elems = re_realloc (set->elems, Idx, set->alloc);
1315 if (__glibc_unlikely (new_elems == NULL))
1316 return false;
1317 set->elems = new_elems;
1318 }
1319
1320 /* Insert the new element. */
1321 set->elems[set->nelem++] = elem;
1322 return true;
1323}
1324
1325/* Compare two node sets SET1 and SET2.
1326 Return true if SET1 and SET2 are equivalent. */
1327
1328static bool
1329__attribute__ ((pure))
1330re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1331{
1332 Idx i;
1333 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1334 return false;
1335 for (i = set1->nelem ; --i >= 0 ; )
1336 if (set1->elems[i] != set2->elems[i])
1337 return false;
1338 return true;
1339}
1340
1341/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1342
1343static Idx
1344__attribute__ ((pure))
1345re_node_set_contains (const re_node_set *set, Idx elem)
1346{
1347 __re_size_t idx, right, mid;
1348 if (set->nelem <= 0)
1349 return 0;
1350
1351 /* Binary search the element. */
1352 idx = 0;
1353 right = set->nelem - 1;
1354 while (idx < right)
1355 {
1356 mid = (idx + right) / 2;
1357 if (set->elems[mid] < elem)
1358 idx = mid + 1;
1359 else
1360 right = mid;
1361 }
1362 return set->elems[idx] == elem ? idx + 1 : 0;
1363}
1364
1365static void
1366re_node_set_remove_at (re_node_set *set, Idx idx)
1367{
1368 if (idx < 0 || idx >= set->nelem)
1369 return;
1370 --set->nelem;
1371 for (; idx < set->nelem; idx++)
1372 set->elems[idx] = set->elems[idx + 1];
1373}
1374
1375
1376
1377/* Add the token TOKEN to dfa->nodes, and return the index of the token.
1378 Or return -1 if an error occurred. */
1379
1380static Idx
1381re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1382{
1383 if (__glibc_unlikely (dfa->nodes_len >= dfa->nodes_alloc))
1384 {
1385 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1386 Idx *new_nexts, *new_indices;
1387 re_node_set *new_edests, *new_eclosures;
1388 re_token_t *new_nodes;
1389
1390 /* Avoid overflows in realloc. */
1391 const size_t max_object_size = MAX (sizeof (re_token_t),
1392 MAX (sizeof (re_node_set),
1393 sizeof (Idx)));
1394 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
1395 < new_nodes_alloc))
1396 return -1;
1397
1398 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1399 if (__glibc_unlikely (new_nodes == NULL))
1400 return -1;
1401 dfa->nodes = new_nodes;
1402 dfa->nodes_alloc = new_nodes_alloc;
1403 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1404 if (new_nexts != NULL)
1405 dfa->nexts = new_nexts;
1406 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1407 if (new_indices != NULL)
1408 dfa->org_indices = new_indices;
1409 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1410 if (new_edests != NULL)
1411 dfa->edests = new_edests;
1412 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1413 if (new_eclosures != NULL)
1414 dfa->eclosures = new_eclosures;
1415 if (__glibc_unlikely (new_nexts == NULL || new_indices == NULL
1416 || new_edests == NULL || new_eclosures == NULL))
1417 return -1;
1418 }
1419 dfa->nodes[dfa->nodes_len] = token;
1420 dfa->nodes[dfa->nodes_len].constraint = 0;
1421 dfa->nodes[dfa->nodes_len].accept_mb =
1422 ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1423 || token.type == COMPLEX_BRACKET);
1424 dfa->nexts[dfa->nodes_len] = -1;
1425 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1426 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1427 return dfa->nodes_len++;
1428}
1429
1430static re_hashval_t
1431calc_state_hash (const re_node_set *nodes, unsigned int context)
1432{
1433 re_hashval_t hash = nodes->nelem + context;
1434 Idx i;
1435 for (i = 0 ; i < nodes->nelem ; i++)
1436 hash += nodes->elems[i];
1437 return hash;
1438}
1439
1440/* Search for the state whose node_set is equivalent to NODES.
1441 Return the pointer to the state, if we found it in the DFA.
1442 Otherwise create the new one and return it. In case of an error
1443 return NULL and set the error code in ERR.
1444 Note: - We assume NULL as the invalid state, then it is possible that
1445 return value is NULL and ERR is REG_NOERROR.
1446 - We never return non-NULL value in case of any errors, it is for
1447 optimization. */
1448
1449static re_dfastate_t *
1450__attribute_warn_unused_result__
1451re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1452 const re_node_set *nodes)
1453{
1454 re_hashval_t hash;
1455 re_dfastate_t *new_state;
1456 struct re_state_table_entry *spot;
1457 Idx i;
1458#if defined GCC_LINT || defined lint
1459 /* Suppress bogus uninitialized-variable warnings. */
1460 *err = REG_NOERROR;
1461#endif
1462 if (__glibc_unlikely (nodes->nelem == 0))
1463 {
1464 *err = REG_NOERROR;
1465 return NULL;
1466 }
1467 hash = calc_state_hash (nodes, 0);
1468 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1469
1470 for (i = 0 ; i < spot->num ; i++)
1471 {
1472 re_dfastate_t *state = spot->array[i];
1473 if (hash != state->hash)
1474 continue;
1475 if (re_node_set_compare (&state->nodes, nodes))
1476 return state;
1477 }
1478
1479 /* There are no appropriate state in the dfa, create the new one. */
1480 new_state = create_ci_newstate (dfa, nodes, hash);
1481 if (__glibc_unlikely (new_state == NULL))
1482 *err = REG_ESPACE;
1483
1484 return new_state;
1485}
1486
1487/* Search for the state whose node_set is equivalent to NODES and
1488 whose context is equivalent to CONTEXT.
1489 Return the pointer to the state, if we found it in the DFA.
1490 Otherwise create the new one and return it. In case of an error
1491 return NULL and set the error code in ERR.
1492 Note: - We assume NULL as the invalid state, then it is possible that
1493 return value is NULL and ERR is REG_NOERROR.
1494 - We never return non-NULL value in case of any errors, it is for
1495 optimization. */
1496
1497static re_dfastate_t *
1498__attribute_warn_unused_result__
1499re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1500 const re_node_set *nodes, unsigned int context)
1501{
1502 re_hashval_t hash;
1503 re_dfastate_t *new_state;
1504 struct re_state_table_entry *spot;
1505 Idx i;
1506#if defined GCC_LINT || defined lint
1507 /* Suppress bogus uninitialized-variable warnings. */
1508 *err = REG_NOERROR;
1509#endif
1510 if (nodes->nelem == 0)
1511 {
1512 *err = REG_NOERROR;
1513 return NULL;
1514 }
1515 hash = calc_state_hash (nodes, context);
1516 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1517
1518 for (i = 0 ; i < spot->num ; i++)
1519 {
1520 re_dfastate_t *state = spot->array[i];
1521 if (state->hash == hash
1522 && state->context == context
1523 && re_node_set_compare (state->entrance_nodes, nodes))
1524 return state;
1525 }
1526 /* There are no appropriate state in 'dfa', create the new one. */
1527 new_state = create_cd_newstate (dfa, nodes, context, hash);
1528 if (__glibc_unlikely (new_state == NULL))
1529 *err = REG_ESPACE;
1530
1531 return new_state;
1532}
1533
1534/* Finish initialization of the new state NEWSTATE, and using its hash value
1535 HASH put in the appropriate bucket of DFA's state table. Return value
1536 indicates the error code if failed. */
1537
1538static reg_errcode_t
1539__attribute_warn_unused_result__
1540register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1541 re_hashval_t hash)
1542{
1543 struct re_state_table_entry *spot;
1544 reg_errcode_t err;
1545 Idx i;
1546
1547 newstate->hash = hash;
1548 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1549 if (__glibc_unlikely (err != REG_NOERROR))
1550 return REG_ESPACE;
1551 for (i = 0; i < newstate->nodes.nelem; i++)
1552 {
1553 Idx elem = newstate->nodes.elems[i];
1554 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1555 if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1556 return REG_ESPACE;
1557 }
1558
1559 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1560 if (__glibc_unlikely (spot->alloc <= spot->num))
1561 {
1562 Idx new_alloc = 2 * spot->num + 2;
1563 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1564 new_alloc);
1565 if (__glibc_unlikely (new_array == NULL))
1566 return REG_ESPACE;
1567 spot->array = new_array;
1568 spot->alloc = new_alloc;
1569 }
1570 spot->array[spot->num++] = newstate;
1571 return REG_NOERROR;
1572}
1573
1574static void
1575free_state (re_dfastate_t *state)
1576{
1577 re_node_set_free (&state->non_eps_nodes);
1578 re_node_set_free (&state->inveclosure);
1579 if (state->entrance_nodes != &state->nodes)
1580 {
1581 re_node_set_free (state->entrance_nodes);
1582 re_free (state->entrance_nodes);
1583 }
1584 re_node_set_free (&state->nodes);
1585 re_free (state->word_trtable);
1586 re_free (state->trtable);
1587 re_free (state);
1588}
1589
1590/* Create the new state which is independent of contexts.
1591 Return the new state if succeeded, otherwise return NULL. */
1592
1593static re_dfastate_t *
1594__attribute_warn_unused_result__
1595create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1596 re_hashval_t hash)
1597{
1598 Idx i;
1599 reg_errcode_t err;
1600 re_dfastate_t *newstate;
1601
1602 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1603 if (__glibc_unlikely (newstate == NULL))
1604 return NULL;
1605 err = re_node_set_init_copy (&newstate->nodes, nodes);
1606 if (__glibc_unlikely (err != REG_NOERROR))
1607 {
1608 re_free (newstate);
1609 return NULL;
1610 }
1611
1612 newstate->entrance_nodes = &newstate->nodes;
1613 for (i = 0 ; i < nodes->nelem ; i++)
1614 {
1615 re_token_t *node = dfa->nodes + nodes->elems[i];
1616 re_token_type_t type = node->type;
1617 if (type == CHARACTER && !node->constraint)
1618 continue;
1619 newstate->accept_mb |= node->accept_mb;
1620
1621 /* If the state has the halt node, the state is a halt state. */
1622 if (type == END_OF_RE)
1623 newstate->halt = 1;
1624 else if (type == OP_BACK_REF)
1625 newstate->has_backref = 1;
1626 else if (type == ANCHOR || node->constraint)
1627 newstate->has_constraint = 1;
1628 }
1629 err = register_state (dfa, newstate, hash);
1630 if (__glibc_unlikely (err != REG_NOERROR))
1631 {
1632 free_state (newstate);
1633 newstate = NULL;
1634 }
1635 return newstate;
1636}
1637
1638/* Create the new state which is depend on the context CONTEXT.
1639 Return the new state if succeeded, otherwise return NULL. */
1640
1641static re_dfastate_t *
1642__attribute_warn_unused_result__
1643create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1644 unsigned int context, re_hashval_t hash)
1645{
1646 Idx i, nctx_nodes = 0;
1647 reg_errcode_t err;
1648 re_dfastate_t *newstate;
1649
1650 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1651 if (__glibc_unlikely (newstate == NULL))
1652 return NULL;
1653 err = re_node_set_init_copy (&newstate->nodes, nodes);
1654 if (__glibc_unlikely (err != REG_NOERROR))
1655 {
1656 re_free (newstate);
1657 return NULL;
1658 }
1659
1660 newstate->context = context;
1661 newstate->entrance_nodes = &newstate->nodes;
1662
1663 for (i = 0 ; i < nodes->nelem ; i++)
1664 {
1665 re_token_t *node = dfa->nodes + nodes->elems[i];
1666 re_token_type_t type = node->type;
1667 unsigned int constraint = node->constraint;
1668
1669 if (type == CHARACTER && !constraint)
1670 continue;
1671 newstate->accept_mb |= node->accept_mb;
1672
1673 /* If the state has the halt node, the state is a halt state. */
1674 if (type == END_OF_RE)
1675 newstate->halt = 1;
1676 else if (type == OP_BACK_REF)
1677 newstate->has_backref = 1;
1678
1679 if (constraint)
1680 {
1681 if (newstate->entrance_nodes == &newstate->nodes)
1682 {
1683 re_node_set *entrance_nodes = re_malloc (re_node_set, 1);
1684 if (__glibc_unlikely (entrance_nodes == NULL))
1685 {
1686 free_state (newstate);
1687 return NULL;
1688 }
1689 newstate->entrance_nodes = entrance_nodes;
1690 if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1691 != REG_NOERROR)
1692 {
1693 free_state (newstate);
1694 return NULL;
1695 }
1696 nctx_nodes = 0;
1697 newstate->has_constraint = 1;
1698 }
1699
1700 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1701 {
1702 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1703 ++nctx_nodes;
1704 }
1705 }
1706 }
1707 err = register_state (dfa, newstate, hash);
1708 if (__glibc_unlikely (err != REG_NOERROR))
1709 {
1710 free_state (newstate);
1711 newstate = NULL;
1712 }
1713 return newstate;
1714}
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette