VirtualBox

source: kBuild/vendor/grep/current/lib/regex_internal.c@ 3530

Last change on this file since 3530 was 3529, checked in by bird, 3 years ago

Imported grep 3.7 from grep-3.7.tar.gz (sha256: c22b0cf2d4f6bbe599c902387e8058990e1eee99aef333a203829e5fd3dbb342), applying minimal auto-props.

  • Property svn:eol-style set to native
File size: 48.3 KB
Line 
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2021 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <[email protected]>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
19
20static void re_string_construct_common (const char *str, Idx len,
21 re_string_t *pstr,
22 RE_TRANSLATE_TYPE trans, bool icase,
23 const re_dfa_t *dfa);
24static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
25 const re_node_set *nodes,
26 re_hashval_t hash);
27static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
28 const re_node_set *nodes,
29 unsigned int context,
30 re_hashval_t hash);
31static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
32 Idx new_buf_len);
33#ifdef RE_ENABLE_I18N
34static void build_wcs_buffer (re_string_t *pstr);
35static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr);
36#endif /* RE_ENABLE_I18N */
37static void build_upper_buffer (re_string_t *pstr);
38static void re_string_translate_buffer (re_string_t *pstr);
39static unsigned int re_string_context_at (const re_string_t *input, Idx idx,
40 int eflags) __attribute__ ((pure));
41
42
43/* Functions for string operation. */
44
45/* This function allocate the buffers. It is necessary to call
46 re_string_reconstruct before using the object. */
47
48static reg_errcode_t
49__attribute_warn_unused_result__
50re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
51 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
52{
53 reg_errcode_t ret;
54 Idx init_buf_len;
55
56 /* Ensure at least one character fits into the buffers. */
57 if (init_len < dfa->mb_cur_max)
58 init_len = dfa->mb_cur_max;
59 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
60 re_string_construct_common (str, len, pstr, trans, icase, dfa);
61
62 ret = re_string_realloc_buffers (pstr, init_buf_len);
63 if (__glibc_unlikely (ret != REG_NOERROR))
64 return ret;
65
66 pstr->word_char = dfa->word_char;
67 pstr->word_ops_used = dfa->word_ops_used;
68 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
69 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
70 pstr->valid_raw_len = pstr->valid_len;
71 return REG_NOERROR;
72}
73
74/* This function allocate the buffers, and initialize them. */
75
76static reg_errcode_t
77__attribute_warn_unused_result__
78re_string_construct (re_string_t *pstr, const char *str, Idx len,
79 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
80{
81 reg_errcode_t ret;
82 memset (pstr, '\0', sizeof (re_string_t));
83 re_string_construct_common (str, len, pstr, trans, icase, dfa);
84
85 if (len > 0)
86 {
87 ret = re_string_realloc_buffers (pstr, len + 1);
88 if (__glibc_unlikely (ret != REG_NOERROR))
89 return ret;
90 }
91 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
92
93 if (icase)
94 {
95#ifdef RE_ENABLE_I18N
96 if (dfa->mb_cur_max > 1)
97 {
98 while (1)
99 {
100 ret = build_wcs_upper_buffer (pstr);
101 if (__glibc_unlikely (ret != REG_NOERROR))
102 return ret;
103 if (pstr->valid_raw_len >= len)
104 break;
105 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
106 break;
107 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
108 if (__glibc_unlikely (ret != REG_NOERROR))
109 return ret;
110 }
111 }
112 else
113#endif /* RE_ENABLE_I18N */
114 build_upper_buffer (pstr);
115 }
116 else
117 {
118#ifdef RE_ENABLE_I18N
119 if (dfa->mb_cur_max > 1)
120 build_wcs_buffer (pstr);
121 else
122#endif /* RE_ENABLE_I18N */
123 {
124 if (trans != NULL)
125 re_string_translate_buffer (pstr);
126 else
127 {
128 pstr->valid_len = pstr->bufs_len;
129 pstr->valid_raw_len = pstr->bufs_len;
130 }
131 }
132 }
133
134 return REG_NOERROR;
135}
136
137/* Helper functions for re_string_allocate, and re_string_construct. */
138
139static reg_errcode_t
140__attribute_warn_unused_result__
141re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
142{
143#ifdef RE_ENABLE_I18N
144 if (pstr->mb_cur_max > 1)
145 {
146 wint_t *new_wcs;
147
148 /* Avoid overflow in realloc. */
149 const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
150 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
151 < new_buf_len))
152 return REG_ESPACE;
153
154 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
155 if (__glibc_unlikely (new_wcs == NULL))
156 return REG_ESPACE;
157 pstr->wcs = new_wcs;
158 if (pstr->offsets != NULL)
159 {
160 Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
161 if (__glibc_unlikely (new_offsets == NULL))
162 return REG_ESPACE;
163 pstr->offsets = new_offsets;
164 }
165 }
166#endif /* RE_ENABLE_I18N */
167 if (pstr->mbs_allocated)
168 {
169 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
170 new_buf_len);
171 if (__glibc_unlikely (new_mbs == NULL))
172 return REG_ESPACE;
173 pstr->mbs = new_mbs;
174 }
175 pstr->bufs_len = new_buf_len;
176 return REG_NOERROR;
177}
178
179
180static void
181re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
182 RE_TRANSLATE_TYPE trans, bool icase,
183 const re_dfa_t *dfa)
184{
185 pstr->raw_mbs = (const unsigned char *) str;
186 pstr->len = len;
187 pstr->raw_len = len;
188 pstr->trans = trans;
189 pstr->icase = icase;
190 pstr->mbs_allocated = (trans != NULL || icase);
191 pstr->mb_cur_max = dfa->mb_cur_max;
192 pstr->is_utf8 = dfa->is_utf8;
193 pstr->map_notascii = dfa->map_notascii;
194 pstr->stop = pstr->len;
195 pstr->raw_stop = pstr->stop;
196}
197
198#ifdef RE_ENABLE_I18N
199
200/* Build wide character buffer PSTR->WCS.
201 If the byte sequence of the string are:
202 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
203 Then wide character buffer will be:
204 <wc1> , WEOF , <wc2> , WEOF , <wc3>
205 We use WEOF for padding, they indicate that the position isn't
206 a first byte of a multibyte character.
207
208 Note that this function assumes PSTR->VALID_LEN elements are already
209 built and starts from PSTR->VALID_LEN. */
210
211static void
212build_wcs_buffer (re_string_t *pstr)
213{
214#ifdef _LIBC
215 unsigned char buf[MB_LEN_MAX];
216 DEBUG_ASSERT (MB_LEN_MAX >= pstr->mb_cur_max);
217#else
218 unsigned char buf[64];
219#endif
220 mbstate_t prev_st;
221 Idx byte_idx, end_idx, remain_len;
222 size_t mbclen;
223
224 /* Build the buffers from pstr->valid_len to either pstr->len or
225 pstr->bufs_len. */
226 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
227 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
228 {
229 wchar_t wc;
230 const char *p;
231
232 remain_len = end_idx - byte_idx;
233 prev_st = pstr->cur_state;
234 /* Apply the translation if we need. */
235 if (__glibc_unlikely (pstr->trans != NULL))
236 {
237 int i, ch;
238
239 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
240 {
241 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
242 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
243 }
244 p = (const char *) buf;
245 }
246 else
247 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
248 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
249 if (__glibc_unlikely (mbclen == (size_t) -1 || mbclen == 0
250 || (mbclen == (size_t) -2
251 && pstr->bufs_len >= pstr->len)))
252 {
253 /* We treat these cases as a singlebyte character. */
254 mbclen = 1;
255 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
256 if (__glibc_unlikely (pstr->trans != NULL))
257 wc = pstr->trans[wc];
258 pstr->cur_state = prev_st;
259 }
260 else if (__glibc_unlikely (mbclen == (size_t) -2))
261 {
262 /* The buffer doesn't have enough space, finish to build. */
263 pstr->cur_state = prev_st;
264 break;
265 }
266
267 /* Write wide character and padding. */
268 pstr->wcs[byte_idx++] = wc;
269 /* Write paddings. */
270 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
271 pstr->wcs[byte_idx++] = WEOF;
272 }
273 pstr->valid_len = byte_idx;
274 pstr->valid_raw_len = byte_idx;
275}
276
277/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
278 but for REG_ICASE. */
279
280static reg_errcode_t
281__attribute_warn_unused_result__
282build_wcs_upper_buffer (re_string_t *pstr)
283{
284 mbstate_t prev_st;
285 Idx src_idx, byte_idx, end_idx, remain_len;
286 size_t mbclen;
287#ifdef _LIBC
288 char buf[MB_LEN_MAX];
289 DEBUG_ASSERT (pstr->mb_cur_max <= MB_LEN_MAX);
290#else
291 char buf[64];
292#endif
293
294 byte_idx = pstr->valid_len;
295 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
296
297 /* The following optimization assumes that ASCII characters can be
298 mapped to wide characters with a simple cast. */
299 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
300 {
301 while (byte_idx < end_idx)
302 {
303 wchar_t wc;
304 unsigned char ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
305
306 if (isascii (ch) && mbsinit (&pstr->cur_state))
307 {
308 /* The next step uses the assumption that wchar_t is encoded
309 ASCII-safe: all ASCII values can be converted like this. */
310 wchar_t wcu = __towupper (ch);
311 if (isascii (wcu))
312 {
313 pstr->mbs[byte_idx] = wcu;
314 pstr->wcs[byte_idx] = wcu;
315 byte_idx++;
316 continue;
317 }
318 }
319
320 remain_len = end_idx - byte_idx;
321 prev_st = pstr->cur_state;
322 mbclen = __mbrtowc (&wc,
323 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
324 + byte_idx), remain_len, &pstr->cur_state);
325 if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
326 {
327 wchar_t wcu = __towupper (wc);
328 if (wcu != wc)
329 {
330 size_t mbcdlen;
331
332 mbcdlen = __wcrtomb (buf, wcu, &prev_st);
333 if (__glibc_likely (mbclen == mbcdlen))
334 memcpy (pstr->mbs + byte_idx, buf, mbclen);
335 else
336 {
337 src_idx = byte_idx;
338 goto offsets_needed;
339 }
340 }
341 else
342 memcpy (pstr->mbs + byte_idx,
343 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
344 pstr->wcs[byte_idx++] = wcu;
345 /* Write paddings. */
346 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
347 pstr->wcs[byte_idx++] = WEOF;
348 }
349 else if (mbclen == (size_t) -1 || mbclen == 0
350 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
351 {
352 /* It is an invalid character, an incomplete character
353 at the end of the string, or '\0'. Just use the byte. */
354 pstr->mbs[byte_idx] = ch;
355 /* And also cast it to wide char. */
356 pstr->wcs[byte_idx++] = (wchar_t) ch;
357 if (__glibc_unlikely (mbclen == (size_t) -1))
358 pstr->cur_state = prev_st;
359 }
360 else
361 {
362 /* The buffer doesn't have enough space, finish to build. */
363 pstr->cur_state = prev_st;
364 break;
365 }
366 }
367 pstr->valid_len = byte_idx;
368 pstr->valid_raw_len = byte_idx;
369 return REG_NOERROR;
370 }
371 else
372 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
373 {
374 wchar_t wc;
375 const char *p;
376 offsets_needed:
377 remain_len = end_idx - byte_idx;
378 prev_st = pstr->cur_state;
379 if (__glibc_unlikely (pstr->trans != NULL))
380 {
381 int i, ch;
382
383 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
384 {
385 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
386 buf[i] = pstr->trans[ch];
387 }
388 p = (const char *) buf;
389 }
390 else
391 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
392 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
393 if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
394 {
395 wchar_t wcu = __towupper (wc);
396 if (wcu != wc)
397 {
398 size_t mbcdlen;
399
400 mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
401 if (__glibc_likely (mbclen == mbcdlen))
402 memcpy (pstr->mbs + byte_idx, buf, mbclen);
403 else if (mbcdlen != (size_t) -1)
404 {
405 size_t i;
406
407 if (byte_idx + mbcdlen > pstr->bufs_len)
408 {
409 pstr->cur_state = prev_st;
410 break;
411 }
412
413 if (pstr->offsets == NULL)
414 {
415 pstr->offsets = re_malloc (Idx, pstr->bufs_len);
416
417 if (pstr->offsets == NULL)
418 return REG_ESPACE;
419 }
420 if (!pstr->offsets_needed)
421 {
422 for (i = 0; i < (size_t) byte_idx; ++i)
423 pstr->offsets[i] = i;
424 pstr->offsets_needed = 1;
425 }
426
427 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
428 pstr->wcs[byte_idx] = wcu;
429 pstr->offsets[byte_idx] = src_idx;
430 for (i = 1; i < mbcdlen; ++i)
431 {
432 pstr->offsets[byte_idx + i]
433 = src_idx + (i < mbclen ? i : mbclen - 1);
434 pstr->wcs[byte_idx + i] = WEOF;
435 }
436 pstr->len += mbcdlen - mbclen;
437 if (pstr->raw_stop > src_idx)
438 pstr->stop += mbcdlen - mbclen;
439 end_idx = (pstr->bufs_len > pstr->len)
440 ? pstr->len : pstr->bufs_len;
441 byte_idx += mbcdlen;
442 src_idx += mbclen;
443 continue;
444 }
445 else
446 memcpy (pstr->mbs + byte_idx, p, mbclen);
447 }
448 else
449 memcpy (pstr->mbs + byte_idx, p, mbclen);
450
451 if (__glibc_unlikely (pstr->offsets_needed != 0))
452 {
453 size_t i;
454 for (i = 0; i < mbclen; ++i)
455 pstr->offsets[byte_idx + i] = src_idx + i;
456 }
457 src_idx += mbclen;
458
459 pstr->wcs[byte_idx++] = wcu;
460 /* Write paddings. */
461 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
462 pstr->wcs[byte_idx++] = WEOF;
463 }
464 else if (mbclen == (size_t) -1 || mbclen == 0
465 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
466 {
467 /* It is an invalid character or '\0'. Just use the byte. */
468 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
469
470 if (__glibc_unlikely (pstr->trans != NULL))
471 ch = pstr->trans [ch];
472 pstr->mbs[byte_idx] = ch;
473
474 if (__glibc_unlikely (pstr->offsets_needed != 0))
475 pstr->offsets[byte_idx] = src_idx;
476 ++src_idx;
477
478 /* And also cast it to wide char. */
479 pstr->wcs[byte_idx++] = (wchar_t) ch;
480 if (__glibc_unlikely (mbclen == (size_t) -1))
481 pstr->cur_state = prev_st;
482 }
483 else
484 {
485 /* The buffer doesn't have enough space, finish to build. */
486 pstr->cur_state = prev_st;
487 break;
488 }
489 }
490 pstr->valid_len = byte_idx;
491 pstr->valid_raw_len = src_idx;
492 return REG_NOERROR;
493}
494
495/* Skip characters until the index becomes greater than NEW_RAW_IDX.
496 Return the index. */
497
498static Idx
499re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
500{
501 mbstate_t prev_st;
502 Idx rawbuf_idx;
503 size_t mbclen;
504 wint_t wc = WEOF;
505
506 /* Skip the characters which are not necessary to check. */
507 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
508 rawbuf_idx < new_raw_idx;)
509 {
510 wchar_t wc2;
511 Idx remain_len = pstr->raw_len - rawbuf_idx;
512 prev_st = pstr->cur_state;
513 mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
514 remain_len, &pstr->cur_state);
515 if (__glibc_unlikely (mbclen == (size_t) -2 || mbclen == (size_t) -1
516 || mbclen == 0))
517 {
518 /* We treat these cases as a single byte character. */
519 if (mbclen == 0 || remain_len == 0)
520 wc = L'\0';
521 else
522 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
523 mbclen = 1;
524 pstr->cur_state = prev_st;
525 }
526 else
527 wc = wc2;
528 /* Then proceed the next character. */
529 rawbuf_idx += mbclen;
530 }
531 *last_wc = wc;
532 return rawbuf_idx;
533}
534#endif /* RE_ENABLE_I18N */
535
536/* Build the buffer PSTR->MBS, and apply the translation if we need.
537 This function is used in case of REG_ICASE. */
538
539static void
540build_upper_buffer (re_string_t *pstr)
541{
542 Idx char_idx, end_idx;
543 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
544
545 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
546 {
547 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
548 if (__glibc_unlikely (pstr->trans != NULL))
549 ch = pstr->trans[ch];
550 pstr->mbs[char_idx] = toupper (ch);
551 }
552 pstr->valid_len = char_idx;
553 pstr->valid_raw_len = char_idx;
554}
555
556/* Apply TRANS to the buffer in PSTR. */
557
558static void
559re_string_translate_buffer (re_string_t *pstr)
560{
561 Idx buf_idx, end_idx;
562 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
563
564 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
565 {
566 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
567 pstr->mbs[buf_idx] = pstr->trans[ch];
568 }
569
570 pstr->valid_len = buf_idx;
571 pstr->valid_raw_len = buf_idx;
572}
573
574/* This function re-construct the buffers.
575 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
576 convert to upper case in case of REG_ICASE, apply translation. */
577
578static reg_errcode_t
579__attribute_warn_unused_result__
580re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
581{
582 Idx offset;
583
584 if (__glibc_unlikely (pstr->raw_mbs_idx <= idx))
585 offset = idx - pstr->raw_mbs_idx;
586 else
587 {
588 /* Reset buffer. */
589#ifdef RE_ENABLE_I18N
590 if (pstr->mb_cur_max > 1)
591 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
592#endif /* RE_ENABLE_I18N */
593 pstr->len = pstr->raw_len;
594 pstr->stop = pstr->raw_stop;
595 pstr->valid_len = 0;
596 pstr->raw_mbs_idx = 0;
597 pstr->valid_raw_len = 0;
598 pstr->offsets_needed = 0;
599 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
600 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
601 if (!pstr->mbs_allocated)
602 pstr->mbs = (unsigned char *) pstr->raw_mbs;
603 offset = idx;
604 }
605
606 if (__glibc_likely (offset != 0))
607 {
608 /* Should the already checked characters be kept? */
609 if (__glibc_likely (offset < pstr->valid_raw_len))
610 {
611 /* Yes, move them to the front of the buffer. */
612#ifdef RE_ENABLE_I18N
613 if (__glibc_unlikely (pstr->offsets_needed))
614 {
615 Idx low = 0, high = pstr->valid_len, mid;
616 do
617 {
618 mid = (high + low) / 2;
619 if (pstr->offsets[mid] > offset)
620 high = mid;
621 else if (pstr->offsets[mid] < offset)
622 low = mid + 1;
623 else
624 break;
625 }
626 while (low < high);
627 if (pstr->offsets[mid] < offset)
628 ++mid;
629 pstr->tip_context = re_string_context_at (pstr, mid - 1,
630 eflags);
631 /* This can be quite complicated, so handle specially
632 only the common and easy case where the character with
633 different length representation of lower and upper
634 case is present at or after offset. */
635 if (pstr->valid_len > offset
636 && mid == offset && pstr->offsets[mid] == offset)
637 {
638 memmove (pstr->wcs, pstr->wcs + offset,
639 (pstr->valid_len - offset) * sizeof (wint_t));
640 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
641 pstr->valid_len -= offset;
642 pstr->valid_raw_len -= offset;
643 for (low = 0; low < pstr->valid_len; low++)
644 pstr->offsets[low] = pstr->offsets[low + offset] - offset;
645 }
646 else
647 {
648 /* Otherwise, just find out how long the partial multibyte
649 character at offset is and fill it with WEOF/255. */
650 pstr->len = pstr->raw_len - idx + offset;
651 pstr->stop = pstr->raw_stop - idx + offset;
652 pstr->offsets_needed = 0;
653 while (mid > 0 && pstr->offsets[mid - 1] == offset)
654 --mid;
655 while (mid < pstr->valid_len)
656 if (pstr->wcs[mid] != WEOF)
657 break;
658 else
659 ++mid;
660 if (mid == pstr->valid_len)
661 pstr->valid_len = 0;
662 else
663 {
664 pstr->valid_len = pstr->offsets[mid] - offset;
665 if (pstr->valid_len)
666 {
667 for (low = 0; low < pstr->valid_len; ++low)
668 pstr->wcs[low] = WEOF;
669 memset (pstr->mbs, 255, pstr->valid_len);
670 }
671 }
672 pstr->valid_raw_len = pstr->valid_len;
673 }
674 }
675 else
676#endif
677 {
678 pstr->tip_context = re_string_context_at (pstr, offset - 1,
679 eflags);
680#ifdef RE_ENABLE_I18N
681 if (pstr->mb_cur_max > 1)
682 memmove (pstr->wcs, pstr->wcs + offset,
683 (pstr->valid_len - offset) * sizeof (wint_t));
684#endif /* RE_ENABLE_I18N */
685 if (__glibc_unlikely (pstr->mbs_allocated))
686 memmove (pstr->mbs, pstr->mbs + offset,
687 pstr->valid_len - offset);
688 pstr->valid_len -= offset;
689 pstr->valid_raw_len -= offset;
690 DEBUG_ASSERT (pstr->valid_len > 0);
691 }
692 }
693 else
694 {
695#ifdef RE_ENABLE_I18N
696 /* No, skip all characters until IDX. */
697 Idx prev_valid_len = pstr->valid_len;
698
699 if (__glibc_unlikely (pstr->offsets_needed))
700 {
701 pstr->len = pstr->raw_len - idx + offset;
702 pstr->stop = pstr->raw_stop - idx + offset;
703 pstr->offsets_needed = 0;
704 }
705#endif
706 pstr->valid_len = 0;
707#ifdef RE_ENABLE_I18N
708 if (pstr->mb_cur_max > 1)
709 {
710 Idx wcs_idx;
711 wint_t wc = WEOF;
712
713 if (pstr->is_utf8)
714 {
715 const unsigned char *raw, *p, *end;
716
717 /* Special case UTF-8. Multi-byte chars start with any
718 byte other than 0x80 - 0xbf. */
719 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
720 end = raw + (offset - pstr->mb_cur_max);
721 if (end < pstr->raw_mbs)
722 end = pstr->raw_mbs;
723 p = raw + offset - 1;
724#ifdef _LIBC
725 /* We know the wchar_t encoding is UCS4, so for the simple
726 case, ASCII characters, skip the conversion step. */
727 if (isascii (*p) && __glibc_likely (pstr->trans == NULL))
728 {
729 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
730 /* pstr->valid_len = 0; */
731 wc = (wchar_t) *p;
732 }
733 else
734#endif
735 for (; p >= end; --p)
736 if ((*p & 0xc0) != 0x80)
737 {
738 mbstate_t cur_state;
739 wchar_t wc2;
740 Idx mlen = raw + pstr->len - p;
741 unsigned char buf[6];
742 size_t mbclen;
743
744 const unsigned char *pp = p;
745 if (__glibc_unlikely (pstr->trans != NULL))
746 {
747 int i = mlen < 6 ? mlen : 6;
748 while (--i >= 0)
749 buf[i] = pstr->trans[p[i]];
750 pp = buf;
751 }
752 /* XXX Don't use mbrtowc, we know which conversion
753 to use (UTF-8 -> UCS4). */
754 memset (&cur_state, 0, sizeof (cur_state));
755 mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
756 &cur_state);
757 if (raw + offset - p <= mbclen
758 && mbclen < (size_t) -2)
759 {
760 memset (&pstr->cur_state, '\0',
761 sizeof (mbstate_t));
762 pstr->valid_len = mbclen - (raw + offset - p);
763 wc = wc2;
764 }
765 break;
766 }
767 }
768
769 if (wc == WEOF)
770 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
771 if (wc == WEOF)
772 pstr->tip_context
773 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
774 else
775 pstr->tip_context = ((__glibc_unlikely (pstr->word_ops_used != 0)
776 && IS_WIDE_WORD_CHAR (wc))
777 ? CONTEXT_WORD
778 : ((IS_WIDE_NEWLINE (wc)
779 && pstr->newline_anchor)
780 ? CONTEXT_NEWLINE : 0));
781 if (__glibc_unlikely (pstr->valid_len))
782 {
783 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
784 pstr->wcs[wcs_idx] = WEOF;
785 if (pstr->mbs_allocated)
786 memset (pstr->mbs, 255, pstr->valid_len);
787 }
788 pstr->valid_raw_len = pstr->valid_len;
789 }
790 else
791#endif /* RE_ENABLE_I18N */
792 {
793 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
794 pstr->valid_raw_len = 0;
795 if (pstr->trans)
796 c = pstr->trans[c];
797 pstr->tip_context = (bitset_contain (pstr->word_char, c)
798 ? CONTEXT_WORD
799 : ((IS_NEWLINE (c) && pstr->newline_anchor)
800 ? CONTEXT_NEWLINE : 0));
801 }
802 }
803 if (!__glibc_unlikely (pstr->mbs_allocated))
804 pstr->mbs += offset;
805 }
806 pstr->raw_mbs_idx = idx;
807 pstr->len -= offset;
808 pstr->stop -= offset;
809
810 /* Then build the buffers. */
811#ifdef RE_ENABLE_I18N
812 if (pstr->mb_cur_max > 1)
813 {
814 if (pstr->icase)
815 {
816 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
817 if (__glibc_unlikely (ret != REG_NOERROR))
818 return ret;
819 }
820 else
821 build_wcs_buffer (pstr);
822 }
823 else
824#endif /* RE_ENABLE_I18N */
825 if (__glibc_unlikely (pstr->mbs_allocated))
826 {
827 if (pstr->icase)
828 build_upper_buffer (pstr);
829 else if (pstr->trans != NULL)
830 re_string_translate_buffer (pstr);
831 }
832 else
833 pstr->valid_len = pstr->len;
834
835 pstr->cur_idx = 0;
836 return REG_NOERROR;
837}
838
839static unsigned char
840__attribute__ ((pure))
841re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
842{
843 int ch;
844 Idx off;
845
846 /* Handle the common (easiest) cases first. */
847 if (__glibc_likely (!pstr->mbs_allocated))
848 return re_string_peek_byte (pstr, idx);
849
850#ifdef RE_ENABLE_I18N
851 if (pstr->mb_cur_max > 1
852 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
853 return re_string_peek_byte (pstr, idx);
854#endif
855
856 off = pstr->cur_idx + idx;
857#ifdef RE_ENABLE_I18N
858 if (pstr->offsets_needed)
859 off = pstr->offsets[off];
860#endif
861
862 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
863
864#ifdef RE_ENABLE_I18N
865 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
866 this function returns CAPITAL LETTER I instead of first byte of
867 DOTLESS SMALL LETTER I. The latter would confuse the parser,
868 since peek_byte_case doesn't advance cur_idx in any way. */
869 if (pstr->offsets_needed && !isascii (ch))
870 return re_string_peek_byte (pstr, idx);
871#endif
872
873 return ch;
874}
875
876static unsigned char
877re_string_fetch_byte_case (re_string_t *pstr)
878{
879 if (__glibc_likely (!pstr->mbs_allocated))
880 return re_string_fetch_byte (pstr);
881
882#ifdef RE_ENABLE_I18N
883 if (pstr->offsets_needed)
884 {
885 Idx off;
886 int ch;
887
888 /* For tr_TR.UTF-8 [[:islower:]] there is
889 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
890 in that case the whole multi-byte character and return
891 the original letter. On the other side, with
892 [[: DOTLESS SMALL LETTER I return [[:I, as doing
893 anything else would complicate things too much. */
894
895 if (!re_string_first_byte (pstr, pstr->cur_idx))
896 return re_string_fetch_byte (pstr);
897
898 off = pstr->offsets[pstr->cur_idx];
899 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
900
901 if (! isascii (ch))
902 return re_string_fetch_byte (pstr);
903
904 re_string_skip_bytes (pstr,
905 re_string_char_size_at (pstr, pstr->cur_idx));
906 return ch;
907 }
908#endif
909
910 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
911}
912
913static void
914re_string_destruct (re_string_t *pstr)
915{
916#ifdef RE_ENABLE_I18N
917 re_free (pstr->wcs);
918 re_free (pstr->offsets);
919#endif /* RE_ENABLE_I18N */
920 if (pstr->mbs_allocated)
921 re_free (pstr->mbs);
922}
923
924/* Return the context at IDX in INPUT. */
925
926static unsigned int
927re_string_context_at (const re_string_t *input, Idx idx, int eflags)
928{
929 int c;
930 if (__glibc_unlikely (idx < 0))
931 /* In this case, we use the value stored in input->tip_context,
932 since we can't know the character in input->mbs[-1] here. */
933 return input->tip_context;
934 if (__glibc_unlikely (idx == input->len))
935 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
936 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
937#ifdef RE_ENABLE_I18N
938 if (input->mb_cur_max > 1)
939 {
940 wint_t wc;
941 Idx wc_idx = idx;
942 while(input->wcs[wc_idx] == WEOF)
943 {
944 DEBUG_ASSERT (wc_idx >= 0);
945 --wc_idx;
946 if (wc_idx < 0)
947 return input->tip_context;
948 }
949 wc = input->wcs[wc_idx];
950 if (__glibc_unlikely (input->word_ops_used != 0)
951 && IS_WIDE_WORD_CHAR (wc))
952 return CONTEXT_WORD;
953 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
954 ? CONTEXT_NEWLINE : 0);
955 }
956 else
957#endif
958 {
959 c = re_string_byte_at (input, idx);
960 if (bitset_contain (input->word_char, c))
961 return CONTEXT_WORD;
962 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
963 }
964}
965
966
967/* Functions for set operation. */
968
969static reg_errcode_t
970__attribute_warn_unused_result__
971re_node_set_alloc (re_node_set *set, Idx size)
972{
973 set->alloc = size;
974 set->nelem = 0;
975 set->elems = re_malloc (Idx, size);
976 if (__glibc_unlikely (set->elems == NULL)
977 && (MALLOC_0_IS_NONNULL || size != 0))
978 return REG_ESPACE;
979 return REG_NOERROR;
980}
981
982static reg_errcode_t
983__attribute_warn_unused_result__
984re_node_set_init_1 (re_node_set *set, Idx elem)
985{
986 set->alloc = 1;
987 set->nelem = 1;
988 set->elems = re_malloc (Idx, 1);
989 if (__glibc_unlikely (set->elems == NULL))
990 {
991 set->alloc = set->nelem = 0;
992 return REG_ESPACE;
993 }
994 set->elems[0] = elem;
995 return REG_NOERROR;
996}
997
998static reg_errcode_t
999__attribute_warn_unused_result__
1000re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
1001{
1002 set->alloc = 2;
1003 set->elems = re_malloc (Idx, 2);
1004 if (__glibc_unlikely (set->elems == NULL))
1005 return REG_ESPACE;
1006 if (elem1 == elem2)
1007 {
1008 set->nelem = 1;
1009 set->elems[0] = elem1;
1010 }
1011 else
1012 {
1013 set->nelem = 2;
1014 if (elem1 < elem2)
1015 {
1016 set->elems[0] = elem1;
1017 set->elems[1] = elem2;
1018 }
1019 else
1020 {
1021 set->elems[0] = elem2;
1022 set->elems[1] = elem1;
1023 }
1024 }
1025 return REG_NOERROR;
1026}
1027
1028static reg_errcode_t
1029__attribute_warn_unused_result__
1030re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1031{
1032 dest->nelem = src->nelem;
1033 if (src->nelem > 0)
1034 {
1035 dest->alloc = dest->nelem;
1036 dest->elems = re_malloc (Idx, dest->alloc);
1037 if (__glibc_unlikely (dest->elems == NULL))
1038 {
1039 dest->alloc = dest->nelem = 0;
1040 return REG_ESPACE;
1041 }
1042 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1043 }
1044 else
1045 re_node_set_init_empty (dest);
1046 return REG_NOERROR;
1047}
1048
1049/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1050 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1051 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1052
1053static reg_errcode_t
1054__attribute_warn_unused_result__
1055re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1056 const re_node_set *src2)
1057{
1058 Idx i1, i2, is, id, delta, sbase;
1059 if (src1->nelem == 0 || src2->nelem == 0)
1060 return REG_NOERROR;
1061
1062 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1063 conservative estimate. */
1064 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1065 {
1066 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1067 Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1068 if (__glibc_unlikely (new_elems == NULL))
1069 return REG_ESPACE;
1070 dest->elems = new_elems;
1071 dest->alloc = new_alloc;
1072 }
1073
1074 /* Find the items in the intersection of SRC1 and SRC2, and copy
1075 into the top of DEST those that are not already in DEST itself. */
1076 sbase = dest->nelem + src1->nelem + src2->nelem;
1077 i1 = src1->nelem - 1;
1078 i2 = src2->nelem - 1;
1079 id = dest->nelem - 1;
1080 for (;;)
1081 {
1082 if (src1->elems[i1] == src2->elems[i2])
1083 {
1084 /* Try to find the item in DEST. Maybe we could binary search? */
1085 while (id >= 0 && dest->elems[id] > src1->elems[i1])
1086 --id;
1087
1088 if (id < 0 || dest->elems[id] != src1->elems[i1])
1089 dest->elems[--sbase] = src1->elems[i1];
1090
1091 if (--i1 < 0 || --i2 < 0)
1092 break;
1093 }
1094
1095 /* Lower the highest of the two items. */
1096 else if (src1->elems[i1] < src2->elems[i2])
1097 {
1098 if (--i2 < 0)
1099 break;
1100 }
1101 else
1102 {
1103 if (--i1 < 0)
1104 break;
1105 }
1106 }
1107
1108 id = dest->nelem - 1;
1109 is = dest->nelem + src1->nelem + src2->nelem - 1;
1110 delta = is - sbase + 1;
1111
1112 /* Now copy. When DELTA becomes zero, the remaining
1113 DEST elements are already in place; this is more or
1114 less the same loop that is in re_node_set_merge. */
1115 dest->nelem += delta;
1116 if (delta > 0 && id >= 0)
1117 for (;;)
1118 {
1119 if (dest->elems[is] > dest->elems[id])
1120 {
1121 /* Copy from the top. */
1122 dest->elems[id + delta--] = dest->elems[is--];
1123 if (delta == 0)
1124 break;
1125 }
1126 else
1127 {
1128 /* Slide from the bottom. */
1129 dest->elems[id + delta] = dest->elems[id];
1130 if (--id < 0)
1131 break;
1132 }
1133 }
1134
1135 /* Copy remaining SRC elements. */
1136 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1137
1138 return REG_NOERROR;
1139}
1140
1141/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1142 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1143
1144static reg_errcode_t
1145__attribute_warn_unused_result__
1146re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1147 const re_node_set *src2)
1148{
1149 Idx i1, i2, id;
1150 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1151 {
1152 dest->alloc = src1->nelem + src2->nelem;
1153 dest->elems = re_malloc (Idx, dest->alloc);
1154 if (__glibc_unlikely (dest->elems == NULL))
1155 return REG_ESPACE;
1156 }
1157 else
1158 {
1159 if (src1 != NULL && src1->nelem > 0)
1160 return re_node_set_init_copy (dest, src1);
1161 else if (src2 != NULL && src2->nelem > 0)
1162 return re_node_set_init_copy (dest, src2);
1163 else
1164 re_node_set_init_empty (dest);
1165 return REG_NOERROR;
1166 }
1167 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1168 {
1169 if (src1->elems[i1] > src2->elems[i2])
1170 {
1171 dest->elems[id++] = src2->elems[i2++];
1172 continue;
1173 }
1174 if (src1->elems[i1] == src2->elems[i2])
1175 ++i2;
1176 dest->elems[id++] = src1->elems[i1++];
1177 }
1178 if (i1 < src1->nelem)
1179 {
1180 memcpy (dest->elems + id, src1->elems + i1,
1181 (src1->nelem - i1) * sizeof (Idx));
1182 id += src1->nelem - i1;
1183 }
1184 else if (i2 < src2->nelem)
1185 {
1186 memcpy (dest->elems + id, src2->elems + i2,
1187 (src2->nelem - i2) * sizeof (Idx));
1188 id += src2->nelem - i2;
1189 }
1190 dest->nelem = id;
1191 return REG_NOERROR;
1192}
1193
1194/* Calculate the union set of the sets DEST and SRC. And store it to
1195 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1196
1197static reg_errcode_t
1198__attribute_warn_unused_result__
1199re_node_set_merge (re_node_set *dest, const re_node_set *src)
1200{
1201 Idx is, id, sbase, delta;
1202 if (src == NULL || src->nelem == 0)
1203 return REG_NOERROR;
1204 if (dest->alloc < 2 * src->nelem + dest->nelem)
1205 {
1206 Idx new_alloc = 2 * (src->nelem + dest->alloc);
1207 Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1208 if (__glibc_unlikely (new_buffer == NULL))
1209 return REG_ESPACE;
1210 dest->elems = new_buffer;
1211 dest->alloc = new_alloc;
1212 }
1213
1214 if (__glibc_unlikely (dest->nelem == 0))
1215 {
1216 /* Although we already guaranteed above that dest->alloc != 0 and
1217 therefore dest->elems != NULL, add a debug assertion to pacify
1218 GCC 11.2.1's -fanalyzer. */
1219 DEBUG_ASSERT (dest->elems);
1220 dest->nelem = src->nelem;
1221 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1222 return REG_NOERROR;
1223 }
1224
1225 /* Copy into the top of DEST the items of SRC that are not
1226 found in DEST. Maybe we could binary search in DEST? */
1227 for (sbase = dest->nelem + 2 * src->nelem,
1228 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1229 {
1230 if (dest->elems[id] == src->elems[is])
1231 is--, id--;
1232 else if (dest->elems[id] < src->elems[is])
1233 dest->elems[--sbase] = src->elems[is--];
1234 else /* if (dest->elems[id] > src->elems[is]) */
1235 --id;
1236 }
1237
1238 if (is >= 0)
1239 {
1240 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1241 sbase -= is + 1;
1242 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1243 }
1244
1245 id = dest->nelem - 1;
1246 is = dest->nelem + 2 * src->nelem - 1;
1247 delta = is - sbase + 1;
1248 if (delta == 0)
1249 return REG_NOERROR;
1250
1251 /* Now copy. When DELTA becomes zero, the remaining
1252 DEST elements are already in place. */
1253 dest->nelem += delta;
1254 for (;;)
1255 {
1256 if (dest->elems[is] > dest->elems[id])
1257 {
1258 /* Copy from the top. */
1259 dest->elems[id + delta--] = dest->elems[is--];
1260 if (delta == 0)
1261 break;
1262 }
1263 else
1264 {
1265 /* Slide from the bottom. */
1266 dest->elems[id + delta] = dest->elems[id];
1267 if (--id < 0)
1268 {
1269 /* Copy remaining SRC elements. */
1270 memcpy (dest->elems, dest->elems + sbase,
1271 delta * sizeof (Idx));
1272 break;
1273 }
1274 }
1275 }
1276
1277 return REG_NOERROR;
1278}
1279
1280/* Insert the new element ELEM to the re_node_set* SET.
1281 SET should not already have ELEM.
1282 Return true if successful. */
1283
1284static bool
1285__attribute_warn_unused_result__
1286re_node_set_insert (re_node_set *set, Idx elem)
1287{
1288 Idx idx;
1289 /* In case the set is empty. */
1290 if (set->alloc == 0)
1291 return __glibc_likely (re_node_set_init_1 (set, elem) == REG_NOERROR);
1292
1293 if (__glibc_unlikely (set->nelem) == 0)
1294 {
1295 /* Although we already guaranteed above that set->alloc != 0 and
1296 therefore set->elems != NULL, add a debug assertion to pacify
1297 GCC 11.2 -fanalyzer. */
1298 DEBUG_ASSERT (set->elems);
1299 set->elems[0] = elem;
1300 ++set->nelem;
1301 return true;
1302 }
1303
1304 /* Realloc if we need. */
1305 if (set->alloc == set->nelem)
1306 {
1307 Idx *new_elems;
1308 set->alloc = set->alloc * 2;
1309 new_elems = re_realloc (set->elems, Idx, set->alloc);
1310 if (__glibc_unlikely (new_elems == NULL))
1311 return false;
1312 set->elems = new_elems;
1313 }
1314
1315 /* Move the elements which follows the new element. Test the
1316 first element separately to skip a check in the inner loop. */
1317 if (elem < set->elems[0])
1318 {
1319 for (idx = set->nelem; idx > 0; idx--)
1320 set->elems[idx] = set->elems[idx - 1];
1321 }
1322 else
1323 {
1324 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1325 set->elems[idx] = set->elems[idx - 1];
1326 DEBUG_ASSERT (set->elems[idx - 1] < elem);
1327 }
1328
1329 /* Insert the new element. */
1330 set->elems[idx] = elem;
1331 ++set->nelem;
1332 return true;
1333}
1334
1335/* Insert the new element ELEM to the re_node_set* SET.
1336 SET should not already have any element greater than or equal to ELEM.
1337 Return true if successful. */
1338
1339static bool
1340__attribute_warn_unused_result__
1341re_node_set_insert_last (re_node_set *set, Idx elem)
1342{
1343 /* Realloc if we need. */
1344 if (set->alloc == set->nelem)
1345 {
1346 Idx *new_elems;
1347 set->alloc = (set->alloc + 1) * 2;
1348 new_elems = re_realloc (set->elems, Idx, set->alloc);
1349 if (__glibc_unlikely (new_elems == NULL))
1350 return false;
1351 set->elems = new_elems;
1352 }
1353
1354 /* Insert the new element. */
1355 set->elems[set->nelem++] = elem;
1356 return true;
1357}
1358
1359/* Compare two node sets SET1 and SET2.
1360 Return true if SET1 and SET2 are equivalent. */
1361
1362static bool
1363__attribute__ ((pure))
1364re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1365{
1366 Idx i;
1367 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1368 return false;
1369 for (i = set1->nelem ; --i >= 0 ; )
1370 if (set1->elems[i] != set2->elems[i])
1371 return false;
1372 return true;
1373}
1374
1375/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1376
1377static Idx
1378__attribute__ ((pure))
1379re_node_set_contains (const re_node_set *set, Idx elem)
1380{
1381 __re_size_t idx, right, mid;
1382 if (set->nelem <= 0)
1383 return 0;
1384
1385 /* Binary search the element. */
1386 idx = 0;
1387 right = set->nelem - 1;
1388 while (idx < right)
1389 {
1390 mid = (idx + right) / 2;
1391 if (set->elems[mid] < elem)
1392 idx = mid + 1;
1393 else
1394 right = mid;
1395 }
1396 return set->elems[idx] == elem ? idx + 1 : 0;
1397}
1398
1399static void
1400re_node_set_remove_at (re_node_set *set, Idx idx)
1401{
1402 if (idx < 0 || idx >= set->nelem)
1403 return;
1404 --set->nelem;
1405 for (; idx < set->nelem; idx++)
1406 set->elems[idx] = set->elems[idx + 1];
1407}
1408
1409
1410
1411/* Add the token TOKEN to dfa->nodes, and return the index of the token.
1412 Or return -1 if an error occurred. */
1413
1414static Idx
1415re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1416{
1417 if (__glibc_unlikely (dfa->nodes_len >= dfa->nodes_alloc))
1418 {
1419 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1420 Idx *new_nexts, *new_indices;
1421 re_node_set *new_edests, *new_eclosures;
1422 re_token_t *new_nodes;
1423
1424 /* Avoid overflows in realloc. */
1425 const size_t max_object_size = MAX (sizeof (re_token_t),
1426 MAX (sizeof (re_node_set),
1427 sizeof (Idx)));
1428 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
1429 < new_nodes_alloc))
1430 return -1;
1431
1432 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1433 if (__glibc_unlikely (new_nodes == NULL))
1434 return -1;
1435 dfa->nodes = new_nodes;
1436 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1437 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1438 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1439 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1440 if (__glibc_unlikely (new_nexts == NULL || new_indices == NULL
1441 || new_edests == NULL || new_eclosures == NULL))
1442 {
1443 re_free (new_nexts);
1444 re_free (new_indices);
1445 re_free (new_edests);
1446 re_free (new_eclosures);
1447 return -1;
1448 }
1449 dfa->nexts = new_nexts;
1450 dfa->org_indices = new_indices;
1451 dfa->edests = new_edests;
1452 dfa->eclosures = new_eclosures;
1453 dfa->nodes_alloc = new_nodes_alloc;
1454 }
1455 dfa->nodes[dfa->nodes_len] = token;
1456 dfa->nodes[dfa->nodes_len].constraint = 0;
1457#ifdef RE_ENABLE_I18N
1458 dfa->nodes[dfa->nodes_len].accept_mb =
1459 ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1460 || token.type == COMPLEX_BRACKET);
1461#endif
1462 dfa->nexts[dfa->nodes_len] = -1;
1463 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1464 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1465 return dfa->nodes_len++;
1466}
1467
1468static re_hashval_t
1469calc_state_hash (const re_node_set *nodes, unsigned int context)
1470{
1471 re_hashval_t hash = nodes->nelem + context;
1472 Idx i;
1473 for (i = 0 ; i < nodes->nelem ; i++)
1474 hash += nodes->elems[i];
1475 return hash;
1476}
1477
1478/* Search for the state whose node_set is equivalent to NODES.
1479 Return the pointer to the state, if we found it in the DFA.
1480 Otherwise create the new one and return it. In case of an error
1481 return NULL and set the error code in ERR.
1482 Note: - We assume NULL as the invalid state, then it is possible that
1483 return value is NULL and ERR is REG_NOERROR.
1484 - We never return non-NULL value in case of any errors, it is for
1485 optimization. */
1486
1487static re_dfastate_t *
1488__attribute_warn_unused_result__
1489re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1490 const re_node_set *nodes)
1491{
1492 re_hashval_t hash;
1493 re_dfastate_t *new_state;
1494 struct re_state_table_entry *spot;
1495 Idx i;
1496#if defined GCC_LINT || defined lint
1497 /* Suppress bogus uninitialized-variable warnings. */
1498 *err = REG_NOERROR;
1499#endif
1500 if (__glibc_unlikely (nodes->nelem == 0))
1501 {
1502 *err = REG_NOERROR;
1503 return NULL;
1504 }
1505 hash = calc_state_hash (nodes, 0);
1506 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1507
1508 for (i = 0 ; i < spot->num ; i++)
1509 {
1510 re_dfastate_t *state = spot->array[i];
1511 if (hash != state->hash)
1512 continue;
1513 if (re_node_set_compare (&state->nodes, nodes))
1514 return state;
1515 }
1516
1517 /* There are no appropriate state in the dfa, create the new one. */
1518 new_state = create_ci_newstate (dfa, nodes, hash);
1519 if (__glibc_unlikely (new_state == NULL))
1520 *err = REG_ESPACE;
1521
1522 return new_state;
1523}
1524
1525/* Search for the state whose node_set is equivalent to NODES and
1526 whose context is equivalent to CONTEXT.
1527 Return the pointer to the state, if we found it in the DFA.
1528 Otherwise create the new one and return it. In case of an error
1529 return NULL and set the error code in ERR.
1530 Note: - We assume NULL as the invalid state, then it is possible that
1531 return value is NULL and ERR is REG_NOERROR.
1532 - We never return non-NULL value in case of any errors, it is for
1533 optimization. */
1534
1535static re_dfastate_t *
1536__attribute_warn_unused_result__
1537re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1538 const re_node_set *nodes, unsigned int context)
1539{
1540 re_hashval_t hash;
1541 re_dfastate_t *new_state;
1542 struct re_state_table_entry *spot;
1543 Idx i;
1544#if defined GCC_LINT || defined lint
1545 /* Suppress bogus uninitialized-variable warnings. */
1546 *err = REG_NOERROR;
1547#endif
1548 if (nodes->nelem == 0)
1549 {
1550 *err = REG_NOERROR;
1551 return NULL;
1552 }
1553 hash = calc_state_hash (nodes, context);
1554 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1555
1556 for (i = 0 ; i < spot->num ; i++)
1557 {
1558 re_dfastate_t *state = spot->array[i];
1559 if (state->hash == hash
1560 && state->context == context
1561 && re_node_set_compare (state->entrance_nodes, nodes))
1562 return state;
1563 }
1564 /* There are no appropriate state in 'dfa', create the new one. */
1565 new_state = create_cd_newstate (dfa, nodes, context, hash);
1566 if (__glibc_unlikely (new_state == NULL))
1567 *err = REG_ESPACE;
1568
1569 return new_state;
1570}
1571
1572/* Finish initialization of the new state NEWSTATE, and using its hash value
1573 HASH put in the appropriate bucket of DFA's state table. Return value
1574 indicates the error code if failed. */
1575
1576static reg_errcode_t
1577__attribute_warn_unused_result__
1578register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1579 re_hashval_t hash)
1580{
1581 struct re_state_table_entry *spot;
1582 reg_errcode_t err;
1583 Idx i;
1584
1585 newstate->hash = hash;
1586 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1587 if (__glibc_unlikely (err != REG_NOERROR))
1588 return REG_ESPACE;
1589 for (i = 0; i < newstate->nodes.nelem; i++)
1590 {
1591 Idx elem = newstate->nodes.elems[i];
1592 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1593 if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1594 return REG_ESPACE;
1595 }
1596
1597 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1598 if (__glibc_unlikely (spot->alloc <= spot->num))
1599 {
1600 Idx new_alloc = 2 * spot->num + 2;
1601 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1602 new_alloc);
1603 if (__glibc_unlikely (new_array == NULL))
1604 return REG_ESPACE;
1605 spot->array = new_array;
1606 spot->alloc = new_alloc;
1607 }
1608 spot->array[spot->num++] = newstate;
1609 return REG_NOERROR;
1610}
1611
1612static void
1613free_state (re_dfastate_t *state)
1614{
1615 re_node_set_free (&state->non_eps_nodes);
1616 re_node_set_free (&state->inveclosure);
1617 if (state->entrance_nodes != &state->nodes)
1618 {
1619 re_node_set_free (state->entrance_nodes);
1620 re_free (state->entrance_nodes);
1621 }
1622 re_node_set_free (&state->nodes);
1623 re_free (state->word_trtable);
1624 re_free (state->trtable);
1625 re_free (state);
1626}
1627
1628/* Create the new state which is independent of contexts.
1629 Return the new state if succeeded, otherwise return NULL. */
1630
1631static re_dfastate_t *
1632__attribute_warn_unused_result__
1633create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1634 re_hashval_t hash)
1635{
1636 Idx i;
1637 reg_errcode_t err;
1638 re_dfastate_t *newstate;
1639
1640 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1641 if (__glibc_unlikely (newstate == NULL))
1642 return NULL;
1643 err = re_node_set_init_copy (&newstate->nodes, nodes);
1644 if (__glibc_unlikely (err != REG_NOERROR))
1645 {
1646 re_free (newstate);
1647 return NULL;
1648 }
1649
1650 newstate->entrance_nodes = &newstate->nodes;
1651 for (i = 0 ; i < nodes->nelem ; i++)
1652 {
1653 re_token_t *node = dfa->nodes + nodes->elems[i];
1654 re_token_type_t type = node->type;
1655 if (type == CHARACTER && !node->constraint)
1656 continue;
1657#ifdef RE_ENABLE_I18N
1658 newstate->accept_mb |= node->accept_mb;
1659#endif /* RE_ENABLE_I18N */
1660
1661 /* If the state has the halt node, the state is a halt state. */
1662 if (type == END_OF_RE)
1663 newstate->halt = 1;
1664 else if (type == OP_BACK_REF)
1665 newstate->has_backref = 1;
1666 else if (type == ANCHOR || node->constraint)
1667 newstate->has_constraint = 1;
1668 }
1669 err = register_state (dfa, newstate, hash);
1670 if (__glibc_unlikely (err != REG_NOERROR))
1671 {
1672 free_state (newstate);
1673 newstate = NULL;
1674 }
1675 return newstate;
1676}
1677
1678/* Create the new state which is depend on the context CONTEXT.
1679 Return the new state if succeeded, otherwise return NULL. */
1680
1681static re_dfastate_t *
1682__attribute_warn_unused_result__
1683create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1684 unsigned int context, re_hashval_t hash)
1685{
1686 Idx i, nctx_nodes = 0;
1687 reg_errcode_t err;
1688 re_dfastate_t *newstate;
1689
1690 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1691 if (__glibc_unlikely (newstate == NULL))
1692 return NULL;
1693 err = re_node_set_init_copy (&newstate->nodes, nodes);
1694 if (__glibc_unlikely (err != REG_NOERROR))
1695 {
1696 re_free (newstate);
1697 return NULL;
1698 }
1699
1700 newstate->context = context;
1701 newstate->entrance_nodes = &newstate->nodes;
1702
1703 for (i = 0 ; i < nodes->nelem ; i++)
1704 {
1705 re_token_t *node = dfa->nodes + nodes->elems[i];
1706 re_token_type_t type = node->type;
1707 unsigned int constraint = node->constraint;
1708
1709 if (type == CHARACTER && !constraint)
1710 continue;
1711#ifdef RE_ENABLE_I18N
1712 newstate->accept_mb |= node->accept_mb;
1713#endif /* RE_ENABLE_I18N */
1714
1715 /* If the state has the halt node, the state is a halt state. */
1716 if (type == END_OF_RE)
1717 newstate->halt = 1;
1718 else if (type == OP_BACK_REF)
1719 newstate->has_backref = 1;
1720
1721 if (constraint)
1722 {
1723 if (newstate->entrance_nodes == &newstate->nodes)
1724 {
1725 re_node_set *entrance_nodes = re_malloc (re_node_set, 1);
1726 if (__glibc_unlikely (entrance_nodes == NULL))
1727 {
1728 free_state (newstate);
1729 return NULL;
1730 }
1731 newstate->entrance_nodes = entrance_nodes;
1732 if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1733 != REG_NOERROR)
1734 {
1735 free_state (newstate);
1736 return NULL;
1737 }
1738 nctx_nodes = 0;
1739 newstate->has_constraint = 1;
1740 }
1741
1742 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1743 {
1744 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1745 ++nctx_nodes;
1746 }
1747 }
1748 }
1749 err = register_state (dfa, newstate, hash);
1750 if (__glibc_unlikely (err != REG_NOERROR))
1751 {
1752 free_state (newstate);
1753 newstate = NULL;
1754 }
1755 return newstate;
1756}
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette