VirtualBox

source: kBuild/vendor/grep/2.12/lib/regex_internal.c@ 3576

Last change on this file since 3576 was 2595, checked in by bird, 13 years ago

gnu grep version 2.12 (grep-2.12.tar.xz, md5sum=8d2f0346d08b13c18afb81f0e8aa1e2f)

  • Property svn:eol-style set to native
File size: 47.6 KB
Line 
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2012 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <[email protected]>.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, see <http://www.gnu.org/licenses/>. */
18
19#include "verify.h"
20#include "intprops.h"
21static void re_string_construct_common (const char *str, Idx len,
22 re_string_t *pstr,
23 RE_TRANSLATE_TYPE trans, bool icase,
24 const re_dfa_t *dfa) internal_function;
25static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
26 const re_node_set *nodes,
27 re_hashval_t hash) internal_function;
28static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
29 const re_node_set *nodes,
30 unsigned int context,
31 re_hashval_t hash) internal_function;
32
33
34/* Functions for string operation. */
35
36/* This function allocate the buffers. It is necessary to call
37 re_string_reconstruct before using the object. */
38
39static reg_errcode_t
40internal_function __attribute_warn_unused_result__
41re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
42 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
43{
44 reg_errcode_t ret;
45 Idx init_buf_len;
46
47 /* Ensure at least one character fits into the buffers. */
48 if (init_len < dfa->mb_cur_max)
49 init_len = dfa->mb_cur_max;
50 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
51 re_string_construct_common (str, len, pstr, trans, icase, dfa);
52
53 ret = re_string_realloc_buffers (pstr, init_buf_len);
54 if (BE (ret != REG_NOERROR, 0))
55 return ret;
56
57 pstr->word_char = dfa->word_char;
58 pstr->word_ops_used = dfa->word_ops_used;
59 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
60 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
61 pstr->valid_raw_len = pstr->valid_len;
62 return REG_NOERROR;
63}
64
65/* This function allocate the buffers, and initialize them. */
66
67static reg_errcode_t
68internal_function __attribute_warn_unused_result__
69re_string_construct (re_string_t *pstr, const char *str, Idx len,
70 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
71{
72 reg_errcode_t ret;
73 memset (pstr, '\0', sizeof (re_string_t));
74 re_string_construct_common (str, len, pstr, trans, icase, dfa);
75
76 if (len > 0)
77 {
78 ret = re_string_realloc_buffers (pstr, len + 1);
79 if (BE (ret != REG_NOERROR, 0))
80 return ret;
81 }
82 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
83
84 if (icase)
85 {
86#ifdef RE_ENABLE_I18N
87 if (dfa->mb_cur_max > 1)
88 {
89 while (1)
90 {
91 ret = build_wcs_upper_buffer (pstr);
92 if (BE (ret != REG_NOERROR, 0))
93 return ret;
94 if (pstr->valid_raw_len >= len)
95 break;
96 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
97 break;
98 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
99 if (BE (ret != REG_NOERROR, 0))
100 return ret;
101 }
102 }
103 else
104#endif /* RE_ENABLE_I18N */
105 build_upper_buffer (pstr);
106 }
107 else
108 {
109#ifdef RE_ENABLE_I18N
110 if (dfa->mb_cur_max > 1)
111 build_wcs_buffer (pstr);
112 else
113#endif /* RE_ENABLE_I18N */
114 {
115 if (trans != NULL)
116 re_string_translate_buffer (pstr);
117 else
118 {
119 pstr->valid_len = pstr->bufs_len;
120 pstr->valid_raw_len = pstr->bufs_len;
121 }
122 }
123 }
124
125 return REG_NOERROR;
126}
127
128/* Helper functions for re_string_allocate, and re_string_construct. */
129
130static reg_errcode_t
131internal_function __attribute_warn_unused_result__
132re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
133{
134#ifdef RE_ENABLE_I18N
135 if (pstr->mb_cur_max > 1)
136 {
137 wint_t *new_wcs;
138
139 /* Avoid overflow in realloc. */
140 const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
141 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_buf_len, 0))
142 return REG_ESPACE;
143
144 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
145 if (BE (new_wcs == NULL, 0))
146 return REG_ESPACE;
147 pstr->wcs = new_wcs;
148 if (pstr->offsets != NULL)
149 {
150 Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
151 if (BE (new_offsets == NULL, 0))
152 return REG_ESPACE;
153 pstr->offsets = new_offsets;
154 }
155 }
156#endif /* RE_ENABLE_I18N */
157 if (pstr->mbs_allocated)
158 {
159 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
160 new_buf_len);
161 if (BE (new_mbs == NULL, 0))
162 return REG_ESPACE;
163 pstr->mbs = new_mbs;
164 }
165 pstr->bufs_len = new_buf_len;
166 return REG_NOERROR;
167}
168
169
170static void
171internal_function
172re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
173 RE_TRANSLATE_TYPE trans, bool icase,
174 const re_dfa_t *dfa)
175{
176 pstr->raw_mbs = (const unsigned char *) str;
177 pstr->len = len;
178 pstr->raw_len = len;
179 pstr->trans = trans;
180 pstr->icase = icase;
181 pstr->mbs_allocated = (trans != NULL || icase);
182 pstr->mb_cur_max = dfa->mb_cur_max;
183 pstr->is_utf8 = dfa->is_utf8;
184 pstr->map_notascii = dfa->map_notascii;
185 pstr->stop = pstr->len;
186 pstr->raw_stop = pstr->stop;
187}
188
189#ifdef RE_ENABLE_I18N
190
191/* Build wide character buffer PSTR->WCS.
192 If the byte sequence of the string are:
193 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
194 Then wide character buffer will be:
195 <wc1> , WEOF , <wc2> , WEOF , <wc3>
196 We use WEOF for padding, they indicate that the position isn't
197 a first byte of a multibyte character.
198
199 Note that this function assumes PSTR->VALID_LEN elements are already
200 built and starts from PSTR->VALID_LEN. */
201
202static void
203internal_function
204build_wcs_buffer (re_string_t *pstr)
205{
206#ifdef _LIBC
207 unsigned char buf[MB_LEN_MAX];
208 assert (MB_LEN_MAX >= pstr->mb_cur_max);
209#else
210 unsigned char buf[64];
211#endif
212 mbstate_t prev_st;
213 Idx byte_idx, end_idx, remain_len;
214 size_t mbclen;
215
216 /* Build the buffers from pstr->valid_len to either pstr->len or
217 pstr->bufs_len. */
218 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
219 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
220 {
221 wchar_t wc;
222 const char *p;
223
224 remain_len = end_idx - byte_idx;
225 prev_st = pstr->cur_state;
226 /* Apply the translation if we need. */
227 if (BE (pstr->trans != NULL, 0))
228 {
229 int i, ch;
230
231 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
232 {
233 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
234 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
235 }
236 p = (const char *) buf;
237 }
238 else
239 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
240 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
241 if (BE (mbclen == (size_t) -1 || mbclen == 0
242 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len), 0))
243 {
244 /* We treat these cases as a singlebyte character. */
245 mbclen = 1;
246 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
247 if (BE (pstr->trans != NULL, 0))
248 wc = pstr->trans[wc];
249 pstr->cur_state = prev_st;
250 }
251 else if (BE (mbclen == (size_t) -2, 0))
252 {
253 /* The buffer doesn't have enough space, finish to build. */
254 pstr->cur_state = prev_st;
255 break;
256 }
257
258 /* Write wide character and padding. */
259 pstr->wcs[byte_idx++] = wc;
260 /* Write paddings. */
261 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
262 pstr->wcs[byte_idx++] = WEOF;
263 }
264 pstr->valid_len = byte_idx;
265 pstr->valid_raw_len = byte_idx;
266}
267
268/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
269 but for REG_ICASE. */
270
271static reg_errcode_t
272internal_function __attribute_warn_unused_result__
273build_wcs_upper_buffer (re_string_t *pstr)
274{
275 mbstate_t prev_st;
276 Idx src_idx, byte_idx, end_idx, remain_len;
277 size_t mbclen;
278#ifdef _LIBC
279 char buf[MB_LEN_MAX];
280 assert (MB_LEN_MAX >= pstr->mb_cur_max);
281#else
282 char buf[64];
283#endif
284
285 byte_idx = pstr->valid_len;
286 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
287
288 /* The following optimization assumes that ASCII characters can be
289 mapped to wide characters with a simple cast. */
290 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
291 {
292 while (byte_idx < end_idx)
293 {
294 wchar_t wc;
295
296 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
297 && mbsinit (&pstr->cur_state))
298 {
299 /* In case of a singlebyte character. */
300 pstr->mbs[byte_idx]
301 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
302 /* The next step uses the assumption that wchar_t is encoded
303 ASCII-safe: all ASCII values can be converted like this. */
304 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
305 ++byte_idx;
306 continue;
307 }
308
309 remain_len = end_idx - byte_idx;
310 prev_st = pstr->cur_state;
311 mbclen = __mbrtowc (&wc,
312 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
313 + byte_idx), remain_len, &pstr->cur_state);
314 if (BE (mbclen < (size_t) -2, 1))
315 {
316 wchar_t wcu = wc;
317 if (iswlower (wc))
318 {
319 size_t mbcdlen;
320
321 wcu = towupper (wc);
322 mbcdlen = wcrtomb (buf, wcu, &prev_st);
323 if (BE (mbclen == mbcdlen, 1))
324 memcpy (pstr->mbs + byte_idx, buf, mbclen);
325 else
326 {
327 src_idx = byte_idx;
328 goto offsets_needed;
329 }
330 }
331 else
332 memcpy (pstr->mbs + byte_idx,
333 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
334 pstr->wcs[byte_idx++] = wcu;
335 /* Write paddings. */
336 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
337 pstr->wcs[byte_idx++] = WEOF;
338 }
339 else if (mbclen == (size_t) -1 || mbclen == 0
340 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
341 {
342 /* It is an invalid character, an incomplete character
343 at the end of the string, or '\0'. Just use the byte. */
344 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
345 pstr->mbs[byte_idx] = ch;
346 /* And also cast it to wide char. */
347 pstr->wcs[byte_idx++] = (wchar_t) ch;
348 if (BE (mbclen == (size_t) -1, 0))
349 pstr->cur_state = prev_st;
350 }
351 else
352 {
353 /* The buffer doesn't have enough space, finish to build. */
354 pstr->cur_state = prev_st;
355 break;
356 }
357 }
358 pstr->valid_len = byte_idx;
359 pstr->valid_raw_len = byte_idx;
360 return REG_NOERROR;
361 }
362 else
363 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
364 {
365 wchar_t wc;
366 const char *p;
367 offsets_needed:
368 remain_len = end_idx - byte_idx;
369 prev_st = pstr->cur_state;
370 if (BE (pstr->trans != NULL, 0))
371 {
372 int i, ch;
373
374 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
375 {
376 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
377 buf[i] = pstr->trans[ch];
378 }
379 p = (const char *) buf;
380 }
381 else
382 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
383 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
384 if (BE (mbclen < (size_t) -2, 1))
385 {
386 wchar_t wcu = wc;
387 if (iswlower (wc))
388 {
389 size_t mbcdlen;
390
391 wcu = towupper (wc);
392 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
393 if (BE (mbclen == mbcdlen, 1))
394 memcpy (pstr->mbs + byte_idx, buf, mbclen);
395 else if (mbcdlen != (size_t) -1)
396 {
397 size_t i;
398
399 if (byte_idx + mbcdlen > pstr->bufs_len)
400 {
401 pstr->cur_state = prev_st;
402 break;
403 }
404
405 if (pstr->offsets == NULL)
406 {
407 pstr->offsets = re_malloc (Idx, pstr->bufs_len);
408
409 if (pstr->offsets == NULL)
410 return REG_ESPACE;
411 }
412 if (!pstr->offsets_needed)
413 {
414 for (i = 0; i < (size_t) byte_idx; ++i)
415 pstr->offsets[i] = i;
416 pstr->offsets_needed = 1;
417 }
418
419 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
420 pstr->wcs[byte_idx] = wcu;
421 pstr->offsets[byte_idx] = src_idx;
422 for (i = 1; i < mbcdlen; ++i)
423 {
424 pstr->offsets[byte_idx + i]
425 = src_idx + (i < mbclen ? i : mbclen - 1);
426 pstr->wcs[byte_idx + i] = WEOF;
427 }
428 pstr->len += mbcdlen - mbclen;
429 if (pstr->raw_stop > src_idx)
430 pstr->stop += mbcdlen - mbclen;
431 end_idx = (pstr->bufs_len > pstr->len)
432 ? pstr->len : pstr->bufs_len;
433 byte_idx += mbcdlen;
434 src_idx += mbclen;
435 continue;
436 }
437 else
438 memcpy (pstr->mbs + byte_idx, p, mbclen);
439 }
440 else
441 memcpy (pstr->mbs + byte_idx, p, mbclen);
442
443 if (BE (pstr->offsets_needed != 0, 0))
444 {
445 size_t i;
446 for (i = 0; i < mbclen; ++i)
447 pstr->offsets[byte_idx + i] = src_idx + i;
448 }
449 src_idx += mbclen;
450
451 pstr->wcs[byte_idx++] = wcu;
452 /* Write paddings. */
453 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
454 pstr->wcs[byte_idx++] = WEOF;
455 }
456 else if (mbclen == (size_t) -1 || mbclen == 0
457 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
458 {
459 /* It is an invalid character or '\0'. Just use the byte. */
460 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
461
462 if (BE (pstr->trans != NULL, 0))
463 ch = pstr->trans [ch];
464 pstr->mbs[byte_idx] = ch;
465
466 if (BE (pstr->offsets_needed != 0, 0))
467 pstr->offsets[byte_idx] = src_idx;
468 ++src_idx;
469
470 /* And also cast it to wide char. */
471 pstr->wcs[byte_idx++] = (wchar_t) ch;
472 if (BE (mbclen == (size_t) -1, 0))
473 pstr->cur_state = prev_st;
474 }
475 else
476 {
477 /* The buffer doesn't have enough space, finish to build. */
478 pstr->cur_state = prev_st;
479 break;
480 }
481 }
482 pstr->valid_len = byte_idx;
483 pstr->valid_raw_len = src_idx;
484 return REG_NOERROR;
485}
486
487/* Skip characters until the index becomes greater than NEW_RAW_IDX.
488 Return the index. */
489
490static Idx
491internal_function
492re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
493{
494 mbstate_t prev_st;
495 Idx rawbuf_idx;
496 size_t mbclen;
497 wint_t wc = WEOF;
498
499 /* Skip the characters which are not necessary to check. */
500 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
501 rawbuf_idx < new_raw_idx;)
502 {
503 wchar_t wc2;
504 Idx remain_len = pstr->raw_len - rawbuf_idx;
505 prev_st = pstr->cur_state;
506 mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
507 remain_len, &pstr->cur_state);
508 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
509 {
510 /* We treat these cases as a single byte character. */
511 if (mbclen == 0 || remain_len == 0)
512 wc = L'\0';
513 else
514 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
515 mbclen = 1;
516 pstr->cur_state = prev_st;
517 }
518 else
519 wc = wc2;
520 /* Then proceed the next character. */
521 rawbuf_idx += mbclen;
522 }
523 *last_wc = wc;
524 return rawbuf_idx;
525}
526#endif /* RE_ENABLE_I18N */
527
528/* Build the buffer PSTR->MBS, and apply the translation if we need.
529 This function is used in case of REG_ICASE. */
530
531static void
532internal_function
533build_upper_buffer (re_string_t *pstr)
534{
535 Idx char_idx, end_idx;
536 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
537
538 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
539 {
540 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
541 if (BE (pstr->trans != NULL, 0))
542 ch = pstr->trans[ch];
543 if (islower (ch))
544 pstr->mbs[char_idx] = toupper (ch);
545 else
546 pstr->mbs[char_idx] = ch;
547 }
548 pstr->valid_len = char_idx;
549 pstr->valid_raw_len = char_idx;
550}
551
552/* Apply TRANS to the buffer in PSTR. */
553
554static void
555internal_function
556re_string_translate_buffer (re_string_t *pstr)
557{
558 Idx buf_idx, end_idx;
559 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
560
561 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
562 {
563 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
564 pstr->mbs[buf_idx] = pstr->trans[ch];
565 }
566
567 pstr->valid_len = buf_idx;
568 pstr->valid_raw_len = buf_idx;
569}
570
571/* This function re-construct the buffers.
572 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
573 convert to upper case in case of REG_ICASE, apply translation. */
574
575static reg_errcode_t
576internal_function __attribute_warn_unused_result__
577re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
578{
579 Idx offset;
580
581 if (BE (pstr->raw_mbs_idx <= idx, 0))
582 offset = idx - pstr->raw_mbs_idx;
583 else
584 {
585 /* Reset buffer. */
586#ifdef RE_ENABLE_I18N
587 if (pstr->mb_cur_max > 1)
588 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
589#endif /* RE_ENABLE_I18N */
590 pstr->len = pstr->raw_len;
591 pstr->stop = pstr->raw_stop;
592 pstr->valid_len = 0;
593 pstr->raw_mbs_idx = 0;
594 pstr->valid_raw_len = 0;
595 pstr->offsets_needed = 0;
596 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
597 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
598 if (!pstr->mbs_allocated)
599 pstr->mbs = (unsigned char *) pstr->raw_mbs;
600 offset = idx;
601 }
602
603 if (BE (offset != 0, 1))
604 {
605 /* Should the already checked characters be kept? */
606 if (BE (offset < pstr->valid_raw_len, 1))
607 {
608 /* Yes, move them to the front of the buffer. */
609#ifdef RE_ENABLE_I18N
610 if (BE (pstr->offsets_needed, 0))
611 {
612 Idx low = 0, high = pstr->valid_len, mid;
613 do
614 {
615 mid = (high + low) / 2;
616 if (pstr->offsets[mid] > offset)
617 high = mid;
618 else if (pstr->offsets[mid] < offset)
619 low = mid + 1;
620 else
621 break;
622 }
623 while (low < high);
624 if (pstr->offsets[mid] < offset)
625 ++mid;
626 pstr->tip_context = re_string_context_at (pstr, mid - 1,
627 eflags);
628 /* This can be quite complicated, so handle specially
629 only the common and easy case where the character with
630 different length representation of lower and upper
631 case is present at or after offset. */
632 if (pstr->valid_len > offset
633 && mid == offset && pstr->offsets[mid] == offset)
634 {
635 memmove (pstr->wcs, pstr->wcs + offset,
636 (pstr->valid_len - offset) * sizeof (wint_t));
637 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
638 pstr->valid_len -= offset;
639 pstr->valid_raw_len -= offset;
640 for (low = 0; low < pstr->valid_len; low++)
641 pstr->offsets[low] = pstr->offsets[low + offset] - offset;
642 }
643 else
644 {
645 /* Otherwise, just find out how long the partial multibyte
646 character at offset is and fill it with WEOF/255. */
647 pstr->len = pstr->raw_len - idx + offset;
648 pstr->stop = pstr->raw_stop - idx + offset;
649 pstr->offsets_needed = 0;
650 while (mid > 0 && pstr->offsets[mid - 1] == offset)
651 --mid;
652 while (mid < pstr->valid_len)
653 if (pstr->wcs[mid] != WEOF)
654 break;
655 else
656 ++mid;
657 if (mid == pstr->valid_len)
658 pstr->valid_len = 0;
659 else
660 {
661 pstr->valid_len = pstr->offsets[mid] - offset;
662 if (pstr->valid_len)
663 {
664 for (low = 0; low < pstr->valid_len; ++low)
665 pstr->wcs[low] = WEOF;
666 memset (pstr->mbs, 255, pstr->valid_len);
667 }
668 }
669 pstr->valid_raw_len = pstr->valid_len;
670 }
671 }
672 else
673#endif
674 {
675 pstr->tip_context = re_string_context_at (pstr, offset - 1,
676 eflags);
677#ifdef RE_ENABLE_I18N
678 if (pstr->mb_cur_max > 1)
679 memmove (pstr->wcs, pstr->wcs + offset,
680 (pstr->valid_len - offset) * sizeof (wint_t));
681#endif /* RE_ENABLE_I18N */
682 if (BE (pstr->mbs_allocated, 0))
683 memmove (pstr->mbs, pstr->mbs + offset,
684 pstr->valid_len - offset);
685 pstr->valid_len -= offset;
686 pstr->valid_raw_len -= offset;
687#if DEBUG
688 assert (pstr->valid_len > 0);
689#endif
690 }
691 }
692 else
693 {
694#ifdef RE_ENABLE_I18N
695 /* No, skip all characters until IDX. */
696 Idx prev_valid_len = pstr->valid_len;
697
698 if (BE (pstr->offsets_needed, 0))
699 {
700 pstr->len = pstr->raw_len - idx + offset;
701 pstr->stop = pstr->raw_stop - idx + offset;
702 pstr->offsets_needed = 0;
703 }
704#endif
705 pstr->valid_len = 0;
706#ifdef RE_ENABLE_I18N
707 if (pstr->mb_cur_max > 1)
708 {
709 Idx wcs_idx;
710 wint_t wc = WEOF;
711
712 if (pstr->is_utf8)
713 {
714 const unsigned char *raw, *p, *end;
715
716 /* Special case UTF-8. Multi-byte chars start with any
717 byte other than 0x80 - 0xbf. */
718 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
719 end = raw + (offset - pstr->mb_cur_max);
720 if (end < pstr->raw_mbs)
721 end = pstr->raw_mbs;
722 p = raw + offset - 1;
723#ifdef _LIBC
724 /* We know the wchar_t encoding is UCS4, so for the simple
725 case, ASCII characters, skip the conversion step. */
726 if (isascii (*p) && BE (pstr->trans == NULL, 1))
727 {
728 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
729 /* pstr->valid_len = 0; */
730 wc = (wchar_t) *p;
731 }
732 else
733#endif
734 for (; p >= end; --p)
735 if ((*p & 0xc0) != 0x80)
736 {
737 mbstate_t cur_state;
738 wchar_t wc2;
739 Idx mlen = raw + pstr->len - p;
740 unsigned char buf[6];
741 size_t mbclen;
742
743 const unsigned char *pp = p;
744 if (BE (pstr->trans != NULL, 0))
745 {
746 int i = mlen < 6 ? mlen : 6;
747 while (--i >= 0)
748 buf[i] = pstr->trans[p[i]];
749 pp = buf;
750 }
751 /* XXX Don't use mbrtowc, we know which conversion
752 to use (UTF-8 -> UCS4). */
753 memset (&cur_state, 0, sizeof (cur_state));
754 mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
755 &cur_state);
756 if (raw + offset - p <= mbclen
757 && mbclen < (size_t) -2)
758 {
759 memset (&pstr->cur_state, '\0',
760 sizeof (mbstate_t));
761 pstr->valid_len = mbclen - (raw + offset - p);
762 wc = wc2;
763 }
764 break;
765 }
766 }
767
768 if (wc == WEOF)
769 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
770 if (wc == WEOF)
771 pstr->tip_context
772 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
773 else
774 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
775 && IS_WIDE_WORD_CHAR (wc))
776 ? CONTEXT_WORD
777 : ((IS_WIDE_NEWLINE (wc)
778 && pstr->newline_anchor)
779 ? CONTEXT_NEWLINE : 0));
780 if (BE (pstr->valid_len, 0))
781 {
782 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
783 pstr->wcs[wcs_idx] = WEOF;
784 if (pstr->mbs_allocated)
785 memset (pstr->mbs, 255, pstr->valid_len);
786 }
787 pstr->valid_raw_len = pstr->valid_len;
788 }
789 else
790#endif /* RE_ENABLE_I18N */
791 {
792 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
793 pstr->valid_raw_len = 0;
794 if (pstr->trans)
795 c = pstr->trans[c];
796 pstr->tip_context = (bitset_contain (pstr->word_char, c)
797 ? CONTEXT_WORD
798 : ((IS_NEWLINE (c) && pstr->newline_anchor)
799 ? CONTEXT_NEWLINE : 0));
800 }
801 }
802 if (!BE (pstr->mbs_allocated, 0))
803 pstr->mbs += offset;
804 }
805 pstr->raw_mbs_idx = idx;
806 pstr->len -= offset;
807 pstr->stop -= offset;
808
809 /* Then build the buffers. */
810#ifdef RE_ENABLE_I18N
811 if (pstr->mb_cur_max > 1)
812 {
813 if (pstr->icase)
814 {
815 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
816 if (BE (ret != REG_NOERROR, 0))
817 return ret;
818 }
819 else
820 build_wcs_buffer (pstr);
821 }
822 else
823#endif /* RE_ENABLE_I18N */
824 if (BE (pstr->mbs_allocated, 0))
825 {
826 if (pstr->icase)
827 build_upper_buffer (pstr);
828 else if (pstr->trans != NULL)
829 re_string_translate_buffer (pstr);
830 }
831 else
832 pstr->valid_len = pstr->len;
833
834 pstr->cur_idx = 0;
835 return REG_NOERROR;
836}
837
838static unsigned char
839internal_function __attribute ((pure))
840re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
841{
842 int ch;
843 Idx off;
844
845 /* Handle the common (easiest) cases first. */
846 if (BE (!pstr->mbs_allocated, 1))
847 return re_string_peek_byte (pstr, idx);
848
849#ifdef RE_ENABLE_I18N
850 if (pstr->mb_cur_max > 1
851 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
852 return re_string_peek_byte (pstr, idx);
853#endif
854
855 off = pstr->cur_idx + idx;
856#ifdef RE_ENABLE_I18N
857 if (pstr->offsets_needed)
858 off = pstr->offsets[off];
859#endif
860
861 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
862
863#ifdef RE_ENABLE_I18N
864 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
865 this function returns CAPITAL LETTER I instead of first byte of
866 DOTLESS SMALL LETTER I. The latter would confuse the parser,
867 since peek_byte_case doesn't advance cur_idx in any way. */
868 if (pstr->offsets_needed && !isascii (ch))
869 return re_string_peek_byte (pstr, idx);
870#endif
871
872 return ch;
873}
874
875static unsigned char
876internal_function
877re_string_fetch_byte_case (re_string_t *pstr)
878{
879 if (BE (!pstr->mbs_allocated, 1))
880 return re_string_fetch_byte (pstr);
881
882#ifdef RE_ENABLE_I18N
883 if (pstr->offsets_needed)
884 {
885 Idx off;
886 int ch;
887
888 /* For tr_TR.UTF-8 [[:islower:]] there is
889 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
890 in that case the whole multi-byte character and return
891 the original letter. On the other side, with
892 [[: DOTLESS SMALL LETTER I return [[:I, as doing
893 anything else would complicate things too much. */
894
895 if (!re_string_first_byte (pstr, pstr->cur_idx))
896 return re_string_fetch_byte (pstr);
897
898 off = pstr->offsets[pstr->cur_idx];
899 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
900
901 if (! isascii (ch))
902 return re_string_fetch_byte (pstr);
903
904 re_string_skip_bytes (pstr,
905 re_string_char_size_at (pstr, pstr->cur_idx));
906 return ch;
907 }
908#endif
909
910 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
911}
912
913static void
914internal_function
915re_string_destruct (re_string_t *pstr)
916{
917#ifdef RE_ENABLE_I18N
918 re_free (pstr->wcs);
919 re_free (pstr->offsets);
920#endif /* RE_ENABLE_I18N */
921 if (pstr->mbs_allocated)
922 re_free (pstr->mbs);
923}
924
925/* Return the context at IDX in INPUT. */
926
927static unsigned int
928internal_function
929re_string_context_at (const re_string_t *input, Idx idx, int eflags)
930{
931 int c;
932 if (BE (! REG_VALID_INDEX (idx), 0))
933 /* In this case, we use the value stored in input->tip_context,
934 since we can't know the character in input->mbs[-1] here. */
935 return input->tip_context;
936 if (BE (idx == input->len, 0))
937 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
938 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
939#ifdef RE_ENABLE_I18N
940 if (input->mb_cur_max > 1)
941 {
942 wint_t wc;
943 Idx wc_idx = idx;
944 while(input->wcs[wc_idx] == WEOF)
945 {
946#ifdef DEBUG
947 /* It must not happen. */
948 assert (REG_VALID_INDEX (wc_idx));
949#endif
950 --wc_idx;
951 if (! REG_VALID_INDEX (wc_idx))
952 return input->tip_context;
953 }
954 wc = input->wcs[wc_idx];
955 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
956 return CONTEXT_WORD;
957 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
958 ? CONTEXT_NEWLINE : 0);
959 }
960 else
961#endif
962 {
963 c = re_string_byte_at (input, idx);
964 if (bitset_contain (input->word_char, c))
965 return CONTEXT_WORD;
966 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
967 }
968}
969
970
971/* Functions for set operation. */
972
973static reg_errcode_t
974internal_function __attribute_warn_unused_result__
975re_node_set_alloc (re_node_set *set, Idx size)
976{
977 set->alloc = size;
978 set->nelem = 0;
979 set->elems = re_malloc (Idx, size);
980 if (BE (set->elems == NULL, 0))
981 return REG_ESPACE;
982 return REG_NOERROR;
983}
984
985static reg_errcode_t
986internal_function __attribute_warn_unused_result__
987re_node_set_init_1 (re_node_set *set, Idx elem)
988{
989 set->alloc = 1;
990 set->nelem = 1;
991 set->elems = re_malloc (Idx, 1);
992 if (BE (set->elems == NULL, 0))
993 {
994 set->alloc = set->nelem = 0;
995 return REG_ESPACE;
996 }
997 set->elems[0] = elem;
998 return REG_NOERROR;
999}
1000
1001static reg_errcode_t
1002internal_function __attribute_warn_unused_result__
1003re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
1004{
1005 set->alloc = 2;
1006 set->elems = re_malloc (Idx, 2);
1007 if (BE (set->elems == NULL, 0))
1008 return REG_ESPACE;
1009 if (elem1 == elem2)
1010 {
1011 set->nelem = 1;
1012 set->elems[0] = elem1;
1013 }
1014 else
1015 {
1016 set->nelem = 2;
1017 if (elem1 < elem2)
1018 {
1019 set->elems[0] = elem1;
1020 set->elems[1] = elem2;
1021 }
1022 else
1023 {
1024 set->elems[0] = elem2;
1025 set->elems[1] = elem1;
1026 }
1027 }
1028 return REG_NOERROR;
1029}
1030
1031static reg_errcode_t
1032internal_function __attribute_warn_unused_result__
1033re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1034{
1035 dest->nelem = src->nelem;
1036 if (src->nelem > 0)
1037 {
1038 dest->alloc = dest->nelem;
1039 dest->elems = re_malloc (Idx, dest->alloc);
1040 if (BE (dest->elems == NULL, 0))
1041 {
1042 dest->alloc = dest->nelem = 0;
1043 return REG_ESPACE;
1044 }
1045 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1046 }
1047 else
1048 re_node_set_init_empty (dest);
1049 return REG_NOERROR;
1050}
1051
1052/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1053 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1054 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1055
1056static reg_errcode_t
1057internal_function __attribute_warn_unused_result__
1058re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1059 const re_node_set *src2)
1060{
1061 Idx i1, i2, is, id, delta, sbase;
1062 if (src1->nelem == 0 || src2->nelem == 0)
1063 return REG_NOERROR;
1064
1065 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1066 conservative estimate. */
1067 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1068 {
1069 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1070 Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1071 if (BE (new_elems == NULL, 0))
1072 return REG_ESPACE;
1073 dest->elems = new_elems;
1074 dest->alloc = new_alloc;
1075 }
1076
1077 /* Find the items in the intersection of SRC1 and SRC2, and copy
1078 into the top of DEST those that are not already in DEST itself. */
1079 sbase = dest->nelem + src1->nelem + src2->nelem;
1080 i1 = src1->nelem - 1;
1081 i2 = src2->nelem - 1;
1082 id = dest->nelem - 1;
1083 for (;;)
1084 {
1085 if (src1->elems[i1] == src2->elems[i2])
1086 {
1087 /* Try to find the item in DEST. Maybe we could binary search? */
1088 while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1089 --id;
1090
1091 if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
1092 dest->elems[--sbase] = src1->elems[i1];
1093
1094 if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1095 break;
1096 }
1097
1098 /* Lower the highest of the two items. */
1099 else if (src1->elems[i1] < src2->elems[i2])
1100 {
1101 if (! REG_VALID_INDEX (--i2))
1102 break;
1103 }
1104 else
1105 {
1106 if (! REG_VALID_INDEX (--i1))
1107 break;
1108 }
1109 }
1110
1111 id = dest->nelem - 1;
1112 is = dest->nelem + src1->nelem + src2->nelem - 1;
1113 delta = is - sbase + 1;
1114
1115 /* Now copy. When DELTA becomes zero, the remaining
1116 DEST elements are already in place; this is more or
1117 less the same loop that is in re_node_set_merge. */
1118 dest->nelem += delta;
1119 if (delta > 0 && REG_VALID_INDEX (id))
1120 for (;;)
1121 {
1122 if (dest->elems[is] > dest->elems[id])
1123 {
1124 /* Copy from the top. */
1125 dest->elems[id + delta--] = dest->elems[is--];
1126 if (delta == 0)
1127 break;
1128 }
1129 else
1130 {
1131 /* Slide from the bottom. */
1132 dest->elems[id + delta] = dest->elems[id];
1133 if (! REG_VALID_INDEX (--id))
1134 break;
1135 }
1136 }
1137
1138 /* Copy remaining SRC elements. */
1139 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1140
1141 return REG_NOERROR;
1142}
1143
1144/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1145 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1146
1147static reg_errcode_t
1148internal_function __attribute_warn_unused_result__
1149re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1150 const re_node_set *src2)
1151{
1152 Idx i1, i2, id;
1153 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1154 {
1155 dest->alloc = src1->nelem + src2->nelem;
1156 dest->elems = re_malloc (Idx, dest->alloc);
1157 if (BE (dest->elems == NULL, 0))
1158 return REG_ESPACE;
1159 }
1160 else
1161 {
1162 if (src1 != NULL && src1->nelem > 0)
1163 return re_node_set_init_copy (dest, src1);
1164 else if (src2 != NULL && src2->nelem > 0)
1165 return re_node_set_init_copy (dest, src2);
1166 else
1167 re_node_set_init_empty (dest);
1168 return REG_NOERROR;
1169 }
1170 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1171 {
1172 if (src1->elems[i1] > src2->elems[i2])
1173 {
1174 dest->elems[id++] = src2->elems[i2++];
1175 continue;
1176 }
1177 if (src1->elems[i1] == src2->elems[i2])
1178 ++i2;
1179 dest->elems[id++] = src1->elems[i1++];
1180 }
1181 if (i1 < src1->nelem)
1182 {
1183 memcpy (dest->elems + id, src1->elems + i1,
1184 (src1->nelem - i1) * sizeof (Idx));
1185 id += src1->nelem - i1;
1186 }
1187 else if (i2 < src2->nelem)
1188 {
1189 memcpy (dest->elems + id, src2->elems + i2,
1190 (src2->nelem - i2) * sizeof (Idx));
1191 id += src2->nelem - i2;
1192 }
1193 dest->nelem = id;
1194 return REG_NOERROR;
1195}
1196
1197/* Calculate the union set of the sets DEST and SRC. And store it to
1198 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1199
1200static reg_errcode_t
1201internal_function __attribute_warn_unused_result__
1202re_node_set_merge (re_node_set *dest, const re_node_set *src)
1203{
1204 Idx is, id, sbase, delta;
1205 if (src == NULL || src->nelem == 0)
1206 return REG_NOERROR;
1207 if (dest->alloc < 2 * src->nelem + dest->nelem)
1208 {
1209 Idx new_alloc = 2 * (src->nelem + dest->alloc);
1210 Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1211 if (BE (new_buffer == NULL, 0))
1212 return REG_ESPACE;
1213 dest->elems = new_buffer;
1214 dest->alloc = new_alloc;
1215 }
1216
1217 if (BE (dest->nelem == 0, 0))
1218 {
1219 dest->nelem = src->nelem;
1220 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1221 return REG_NOERROR;
1222 }
1223
1224 /* Copy into the top of DEST the items of SRC that are not
1225 found in DEST. Maybe we could binary search in DEST? */
1226 for (sbase = dest->nelem + 2 * src->nelem,
1227 is = src->nelem - 1, id = dest->nelem - 1;
1228 REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1229 {
1230 if (dest->elems[id] == src->elems[is])
1231 is--, id--;
1232 else if (dest->elems[id] < src->elems[is])
1233 dest->elems[--sbase] = src->elems[is--];
1234 else /* if (dest->elems[id] > src->elems[is]) */
1235 --id;
1236 }
1237
1238 if (REG_VALID_INDEX (is))
1239 {
1240 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1241 sbase -= is + 1;
1242 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1243 }
1244
1245 id = dest->nelem - 1;
1246 is = dest->nelem + 2 * src->nelem - 1;
1247 delta = is - sbase + 1;
1248 if (delta == 0)
1249 return REG_NOERROR;
1250
1251 /* Now copy. When DELTA becomes zero, the remaining
1252 DEST elements are already in place. */
1253 dest->nelem += delta;
1254 for (;;)
1255 {
1256 if (dest->elems[is] > dest->elems[id])
1257 {
1258 /* Copy from the top. */
1259 dest->elems[id + delta--] = dest->elems[is--];
1260 if (delta == 0)
1261 break;
1262 }
1263 else
1264 {
1265 /* Slide from the bottom. */
1266 dest->elems[id + delta] = dest->elems[id];
1267 if (! REG_VALID_INDEX (--id))
1268 {
1269 /* Copy remaining SRC elements. */
1270 memcpy (dest->elems, dest->elems + sbase,
1271 delta * sizeof (Idx));
1272 break;
1273 }
1274 }
1275 }
1276
1277 return REG_NOERROR;
1278}
1279
1280/* Insert the new element ELEM to the re_node_set* SET.
1281 SET should not already have ELEM.
1282 Return true if successful. */
1283
1284static bool
1285internal_function __attribute_warn_unused_result__
1286re_node_set_insert (re_node_set *set, Idx elem)
1287{
1288 Idx idx;
1289 /* In case the set is empty. */
1290 if (set->alloc == 0)
1291 return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1292
1293 if (BE (set->nelem, 0) == 0)
1294 {
1295 /* We already guaranteed above that set->alloc != 0. */
1296 set->elems[0] = elem;
1297 ++set->nelem;
1298 return true;
1299 }
1300
1301 /* Realloc if we need. */
1302 if (set->alloc == set->nelem)
1303 {
1304 Idx *new_elems;
1305 set->alloc = set->alloc * 2;
1306 new_elems = re_realloc (set->elems, Idx, set->alloc);
1307 if (BE (new_elems == NULL, 0))
1308 return false;
1309 set->elems = new_elems;
1310 }
1311
1312 /* Move the elements which follows the new element. Test the
1313 first element separately to skip a check in the inner loop. */
1314 if (elem < set->elems[0])
1315 {
1316 idx = 0;
1317 for (idx = set->nelem; idx > 0; idx--)
1318 set->elems[idx] = set->elems[idx - 1];
1319 }
1320 else
1321 {
1322 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1323 set->elems[idx] = set->elems[idx - 1];
1324 }
1325
1326 /* Insert the new element. */
1327 set->elems[idx] = elem;
1328 ++set->nelem;
1329 return true;
1330}
1331
1332/* Insert the new element ELEM to the re_node_set* SET.
1333 SET should not already have any element greater than or equal to ELEM.
1334 Return true if successful. */
1335
1336static bool
1337internal_function __attribute_warn_unused_result__
1338re_node_set_insert_last (re_node_set *set, Idx elem)
1339{
1340 /* Realloc if we need. */
1341 if (set->alloc == set->nelem)
1342 {
1343 Idx *new_elems;
1344 set->alloc = (set->alloc + 1) * 2;
1345 new_elems = re_realloc (set->elems, Idx, set->alloc);
1346 if (BE (new_elems == NULL, 0))
1347 return false;
1348 set->elems = new_elems;
1349 }
1350
1351 /* Insert the new element. */
1352 set->elems[set->nelem++] = elem;
1353 return true;
1354}
1355
1356/* Compare two node sets SET1 and SET2.
1357 Return true if SET1 and SET2 are equivalent. */
1358
1359static bool
1360internal_function __attribute ((pure))
1361re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1362{
1363 Idx i;
1364 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1365 return false;
1366 for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1367 if (set1->elems[i] != set2->elems[i])
1368 return false;
1369 return true;
1370}
1371
1372/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1373
1374static Idx
1375internal_function __attribute ((pure))
1376re_node_set_contains (const re_node_set *set, Idx elem)
1377{
1378 __re_size_t idx, right, mid;
1379 if (! REG_VALID_NONZERO_INDEX (set->nelem))
1380 return 0;
1381
1382 /* Binary search the element. */
1383 idx = 0;
1384 right = set->nelem - 1;
1385 while (idx < right)
1386 {
1387 mid = (idx + right) / 2;
1388 if (set->elems[mid] < elem)
1389 idx = mid + 1;
1390 else
1391 right = mid;
1392 }
1393 return set->elems[idx] == elem ? idx + 1 : 0;
1394}
1395
1396static void
1397internal_function
1398re_node_set_remove_at (re_node_set *set, Idx idx)
1399{
1400 verify (! TYPE_SIGNED (Idx));
1401 /* if (idx < 0)
1402 return; */
1403 if (idx >= set->nelem)
1404 return;
1405 --set->nelem;
1406 for (; idx < set->nelem; idx++)
1407 set->elems[idx] = set->elems[idx + 1];
1408}
1409
1410
1411
1412/* Add the token TOKEN to dfa->nodes, and return the index of the token.
1413 Or return REG_MISSING if an error occurred. */
1414
1415static Idx
1416internal_function
1417re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1418{
1419 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1420 {
1421 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1422 Idx *new_nexts, *new_indices;
1423 re_node_set *new_edests, *new_eclosures;
1424 re_token_t *new_nodes;
1425
1426 /* Avoid overflows in realloc. */
1427 const size_t max_object_size = MAX (sizeof (re_token_t),
1428 MAX (sizeof (re_node_set),
1429 sizeof (Idx)));
1430 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_nodes_alloc, 0))
1431 return REG_MISSING;
1432
1433 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1434 if (BE (new_nodes == NULL, 0))
1435 return REG_MISSING;
1436 dfa->nodes = new_nodes;
1437 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1438 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1439 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1440 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1441 if (BE (new_nexts == NULL || new_indices == NULL
1442 || new_edests == NULL || new_eclosures == NULL, 0))
1443 return REG_MISSING;
1444 dfa->nexts = new_nexts;
1445 dfa->org_indices = new_indices;
1446 dfa->edests = new_edests;
1447 dfa->eclosures = new_eclosures;
1448 dfa->nodes_alloc = new_nodes_alloc;
1449 }
1450 dfa->nodes[dfa->nodes_len] = token;
1451 dfa->nodes[dfa->nodes_len].constraint = 0;
1452#ifdef RE_ENABLE_I18N
1453 {
1454 int type = token.type;
1455 dfa->nodes[dfa->nodes_len].accept_mb =
1456 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1457 }
1458#endif
1459 dfa->nexts[dfa->nodes_len] = REG_MISSING;
1460 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1461 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1462 return dfa->nodes_len++;
1463}
1464
1465static inline re_hashval_t
1466internal_function
1467calc_state_hash (const re_node_set *nodes, unsigned int context)
1468{
1469 re_hashval_t hash = nodes->nelem + context;
1470 Idx i;
1471 for (i = 0 ; i < nodes->nelem ; i++)
1472 hash += nodes->elems[i];
1473 return hash;
1474}
1475
1476/* Search for the state whose node_set is equivalent to NODES.
1477 Return the pointer to the state, if we found it in the DFA.
1478 Otherwise create the new one and return it. In case of an error
1479 return NULL and set the error code in ERR.
1480 Note: - We assume NULL as the invalid state, then it is possible that
1481 return value is NULL and ERR is REG_NOERROR.
1482 - We never return non-NULL value in case of any errors, it is for
1483 optimization. */
1484
1485static re_dfastate_t *
1486internal_function __attribute_warn_unused_result__
1487re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1488 const re_node_set *nodes)
1489{
1490 re_hashval_t hash;
1491 re_dfastate_t *new_state;
1492 struct re_state_table_entry *spot;
1493 Idx i;
1494#ifdef lint
1495 /* Suppress bogus uninitialized-variable warnings. */
1496 *err = REG_NOERROR;
1497#endif
1498 if (BE (nodes->nelem == 0, 0))
1499 {
1500 *err = REG_NOERROR;
1501 return NULL;
1502 }
1503 hash = calc_state_hash (nodes, 0);
1504 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1505
1506 for (i = 0 ; i < spot->num ; i++)
1507 {
1508 re_dfastate_t *state = spot->array[i];
1509 if (hash != state->hash)
1510 continue;
1511 if (re_node_set_compare (&state->nodes, nodes))
1512 return state;
1513 }
1514
1515 /* There are no appropriate state in the dfa, create the new one. */
1516 new_state = create_ci_newstate (dfa, nodes, hash);
1517 if (BE (new_state == NULL, 0))
1518 *err = REG_ESPACE;
1519
1520 return new_state;
1521}
1522
1523/* Search for the state whose node_set is equivalent to NODES and
1524 whose context is equivalent to CONTEXT.
1525 Return the pointer to the state, if we found it in the DFA.
1526 Otherwise create the new one and return it. In case of an error
1527 return NULL and set the error code in ERR.
1528 Note: - We assume NULL as the invalid state, then it is possible that
1529 return value is NULL and ERR is REG_NOERROR.
1530 - We never return non-NULL value in case of any errors, it is for
1531 optimization. */
1532
1533static re_dfastate_t *
1534internal_function __attribute_warn_unused_result__
1535re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1536 const re_node_set *nodes, unsigned int context)
1537{
1538 re_hashval_t hash;
1539 re_dfastate_t *new_state;
1540 struct re_state_table_entry *spot;
1541 Idx i;
1542#ifdef lint
1543 /* Suppress bogus uninitialized-variable warnings. */
1544 *err = REG_NOERROR;
1545#endif
1546 if (nodes->nelem == 0)
1547 {
1548 *err = REG_NOERROR;
1549 return NULL;
1550 }
1551 hash = calc_state_hash (nodes, context);
1552 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1553
1554 for (i = 0 ; i < spot->num ; i++)
1555 {
1556 re_dfastate_t *state = spot->array[i];
1557 if (state->hash == hash
1558 && state->context == context
1559 && re_node_set_compare (state->entrance_nodes, nodes))
1560 return state;
1561 }
1562 /* There are no appropriate state in 'dfa', create the new one. */
1563 new_state = create_cd_newstate (dfa, nodes, context, hash);
1564 if (BE (new_state == NULL, 0))
1565 *err = REG_ESPACE;
1566
1567 return new_state;
1568}
1569
1570/* Finish initialization of the new state NEWSTATE, and using its hash value
1571 HASH put in the appropriate bucket of DFA's state table. Return value
1572 indicates the error code if failed. */
1573
1574static reg_errcode_t
1575__attribute_warn_unused_result__
1576register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1577 re_hashval_t hash)
1578{
1579 struct re_state_table_entry *spot;
1580 reg_errcode_t err;
1581 Idx i;
1582
1583 newstate->hash = hash;
1584 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1585 if (BE (err != REG_NOERROR, 0))
1586 return REG_ESPACE;
1587 for (i = 0; i < newstate->nodes.nelem; i++)
1588 {
1589 Idx elem = newstate->nodes.elems[i];
1590 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1591 if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1592 return REG_ESPACE;
1593 }
1594
1595 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1596 if (BE (spot->alloc <= spot->num, 0))
1597 {
1598 Idx new_alloc = 2 * spot->num + 2;
1599 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1600 new_alloc);
1601 if (BE (new_array == NULL, 0))
1602 return REG_ESPACE;
1603 spot->array = new_array;
1604 spot->alloc = new_alloc;
1605 }
1606 spot->array[spot->num++] = newstate;
1607 return REG_NOERROR;
1608}
1609
1610static void
1611free_state (re_dfastate_t *state)
1612{
1613 re_node_set_free (&state->non_eps_nodes);
1614 re_node_set_free (&state->inveclosure);
1615 if (state->entrance_nodes != &state->nodes)
1616 {
1617 re_node_set_free (state->entrance_nodes);
1618 re_free (state->entrance_nodes);
1619 }
1620 re_node_set_free (&state->nodes);
1621 re_free (state->word_trtable);
1622 re_free (state->trtable);
1623 re_free (state);
1624}
1625
1626/* Create the new state which is independent of contexts.
1627 Return the new state if succeeded, otherwise return NULL. */
1628
1629static re_dfastate_t *
1630internal_function __attribute_warn_unused_result__
1631create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1632 re_hashval_t hash)
1633{
1634 Idx i;
1635 reg_errcode_t err;
1636 re_dfastate_t *newstate;
1637
1638 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1639 if (BE (newstate == NULL, 0))
1640 return NULL;
1641 err = re_node_set_init_copy (&newstate->nodes, nodes);
1642 if (BE (err != REG_NOERROR, 0))
1643 {
1644 re_free (newstate);
1645 return NULL;
1646 }
1647
1648 newstate->entrance_nodes = &newstate->nodes;
1649 for (i = 0 ; i < nodes->nelem ; i++)
1650 {
1651 re_token_t *node = dfa->nodes + nodes->elems[i];
1652 re_token_type_t type = node->type;
1653 if (type == CHARACTER && !node->constraint)
1654 continue;
1655#ifdef RE_ENABLE_I18N
1656 newstate->accept_mb |= node->accept_mb;
1657#endif /* RE_ENABLE_I18N */
1658
1659 /* If the state has the halt node, the state is a halt state. */
1660 if (type == END_OF_RE)
1661 newstate->halt = 1;
1662 else if (type == OP_BACK_REF)
1663 newstate->has_backref = 1;
1664 else if (type == ANCHOR || node->constraint)
1665 newstate->has_constraint = 1;
1666 }
1667 err = register_state (dfa, newstate, hash);
1668 if (BE (err != REG_NOERROR, 0))
1669 {
1670 free_state (newstate);
1671 newstate = NULL;
1672 }
1673 return newstate;
1674}
1675
1676/* Create the new state which is depend on the context CONTEXT.
1677 Return the new state if succeeded, otherwise return NULL. */
1678
1679static re_dfastate_t *
1680internal_function __attribute_warn_unused_result__
1681create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1682 unsigned int context, re_hashval_t hash)
1683{
1684 Idx i, nctx_nodes = 0;
1685 reg_errcode_t err;
1686 re_dfastate_t *newstate;
1687
1688 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1689 if (BE (newstate == NULL, 0))
1690 return NULL;
1691 err = re_node_set_init_copy (&newstate->nodes, nodes);
1692 if (BE (err != REG_NOERROR, 0))
1693 {
1694 re_free (newstate);
1695 return NULL;
1696 }
1697
1698 newstate->context = context;
1699 newstate->entrance_nodes = &newstate->nodes;
1700
1701 for (i = 0 ; i < nodes->nelem ; i++)
1702 {
1703 re_token_t *node = dfa->nodes + nodes->elems[i];
1704 re_token_type_t type = node->type;
1705 unsigned int constraint = node->constraint;
1706
1707 if (type == CHARACTER && !constraint)
1708 continue;
1709#ifdef RE_ENABLE_I18N
1710 newstate->accept_mb |= node->accept_mb;
1711#endif /* RE_ENABLE_I18N */
1712
1713 /* If the state has the halt node, the state is a halt state. */
1714 if (type == END_OF_RE)
1715 newstate->halt = 1;
1716 else if (type == OP_BACK_REF)
1717 newstate->has_backref = 1;
1718
1719 if (constraint)
1720 {
1721 if (newstate->entrance_nodes == &newstate->nodes)
1722 {
1723 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1724 if (BE (newstate->entrance_nodes == NULL, 0))
1725 {
1726 free_state (newstate);
1727 return NULL;
1728 }
1729 if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1730 != REG_NOERROR)
1731 return NULL;
1732 nctx_nodes = 0;
1733 newstate->has_constraint = 1;
1734 }
1735
1736 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1737 {
1738 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1739 ++nctx_nodes;
1740 }
1741 }
1742 }
1743 err = register_state (dfa, newstate, hash);
1744 if (BE (err != REG_NOERROR, 0))
1745 {
1746 free_state (newstate);
1747 newstate = NULL;
1748 }
1749 return newstate;
1750}
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette