VirtualBox

source: kBuild/trunk/src/sed/lib/regex_internal.c@ 602

Last change on this file since 602 was 599, checked in by bird, 18 years ago

GNU sed 4.1.5.

File size: 44.0 KB
Line 
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <[email protected]>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21static void re_string_construct_common (const char *str, int len,
22 re_string_t *pstr,
23 RE_TRANSLATE_TYPE trans, int icase,
24 const re_dfa_t *dfa) internal_function;
25static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
26 const re_node_set *nodes,
27 unsigned int hash) internal_function;
28static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
29 const re_node_set *nodes,
30 unsigned int context,
31 unsigned int hash) internal_function;
32
33
34/* Functions for string operation. */
35
36/* This function allocate the buffers. It is necessary to call
37 re_string_reconstruct before using the object. */
38
39static reg_errcode_t
40internal_function
41re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
42 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
43{
44 reg_errcode_t ret;
45 int init_buf_len;
46
47 /* Ensure at least one character fits into the buffers. */
48 if (init_len < dfa->mb_cur_max)
49 init_len = dfa->mb_cur_max;
50 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
51 re_string_construct_common (str, len, pstr, trans, icase, dfa);
52
53 ret = re_string_realloc_buffers (pstr, init_buf_len);
54 if (BE (ret != REG_NOERROR, 0))
55 return ret;
56
57 pstr->word_char = dfa->word_char;
58 pstr->word_ops_used = dfa->word_ops_used;
59 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
60 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
61 pstr->valid_raw_len = pstr->valid_len;
62 return REG_NOERROR;
63}
64
65/* This function allocate the buffers, and initialize them. */
66
67static reg_errcode_t
68internal_function
69re_string_construct (re_string_t *pstr, const char *str, int len,
70 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
71{
72 reg_errcode_t ret;
73 memset (pstr, '\0', sizeof (re_string_t));
74 re_string_construct_common (str, len, pstr, trans, icase, dfa);
75
76 if (len > 0)
77 {
78 ret = re_string_realloc_buffers (pstr, len + 1);
79 if (BE (ret != REG_NOERROR, 0))
80 return ret;
81 }
82 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
83
84 if (icase)
85 {
86#ifdef RE_ENABLE_I18N
87 if (dfa->mb_cur_max > 1)
88 {
89 while (1)
90 {
91 ret = build_wcs_upper_buffer (pstr);
92 if (BE (ret != REG_NOERROR, 0))
93 return ret;
94 if (pstr->valid_raw_len >= len)
95 break;
96 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
97 break;
98 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
99 if (BE (ret != REG_NOERROR, 0))
100 return ret;
101 }
102 }
103 else
104#endif /* RE_ENABLE_I18N */
105 build_upper_buffer (pstr);
106 }
107 else
108 {
109#ifdef RE_ENABLE_I18N
110 if (dfa->mb_cur_max > 1)
111 build_wcs_buffer (pstr);
112 else
113#endif /* RE_ENABLE_I18N */
114 {
115 if (trans != NULL)
116 re_string_translate_buffer (pstr);
117 else
118 {
119 pstr->valid_len = pstr->bufs_len;
120 pstr->valid_raw_len = pstr->bufs_len;
121 }
122 }
123 }
124
125 return REG_NOERROR;
126}
127
128/* Helper functions for re_string_allocate, and re_string_construct. */
129
130static reg_errcode_t
131internal_function
132re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
133{
134#ifdef RE_ENABLE_I18N
135 if (pstr->mb_cur_max > 1)
136 {
137 wint_t *new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
138 if (BE (new_wcs == NULL, 0))
139 return REG_ESPACE;
140 pstr->wcs = new_wcs;
141 if (pstr->offsets != NULL)
142 {
143 int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
144 if (BE (new_offsets == NULL, 0))
145 return REG_ESPACE;
146 pstr->offsets = new_offsets;
147 }
148 }
149#endif /* RE_ENABLE_I18N */
150 if (pstr->mbs_allocated)
151 {
152 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
153 new_buf_len);
154 if (BE (new_mbs == NULL, 0))
155 return REG_ESPACE;
156 pstr->mbs = new_mbs;
157 }
158 pstr->bufs_len = new_buf_len;
159 return REG_NOERROR;
160}
161
162
163static void
164internal_function
165re_string_construct_common (const char *str, int len, re_string_t *pstr,
166 RE_TRANSLATE_TYPE trans, int icase,
167 const re_dfa_t *dfa)
168{
169 pstr->raw_mbs = (const unsigned char *) str;
170 pstr->len = len;
171 pstr->raw_len = len;
172 pstr->trans = trans;
173 pstr->icase = icase ? 1 : 0;
174 pstr->mbs_allocated = (trans != NULL || icase);
175 pstr->mb_cur_max = dfa->mb_cur_max;
176 pstr->is_utf8 = dfa->is_utf8;
177 pstr->map_notascii = dfa->map_notascii;
178 pstr->stop = pstr->len;
179 pstr->raw_stop = pstr->stop;
180}
181
182#ifdef RE_ENABLE_I18N
183
184/* Build wide character buffer PSTR->WCS.
185 If the byte sequence of the string are:
186 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
187 Then wide character buffer will be:
188 <wc1> , WEOF , <wc2> , WEOF , <wc3>
189 We use WEOF for padding, they indicate that the position isn't
190 a first byte of a multibyte character.
191
192 Note that this function assumes PSTR->VALID_LEN elements are already
193 built and starts from PSTR->VALID_LEN. */
194
195static void
196internal_function
197build_wcs_buffer (re_string_t *pstr)
198{
199#ifdef _LIBC
200 unsigned char buf[MB_LEN_MAX];
201 assert (MB_LEN_MAX >= pstr->mb_cur_max);
202#else
203 unsigned char buf[64];
204#endif
205 mbstate_t prev_st;
206 int byte_idx, end_idx, remain_len;
207 size_t mbclen;
208
209 /* Build the buffers from pstr->valid_len to either pstr->len or
210 pstr->bufs_len. */
211 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
212 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
213 {
214 wchar_t wc;
215 const char *p;
216
217 remain_len = end_idx - byte_idx;
218 prev_st = pstr->cur_state;
219 /* Apply the translation if we need. */
220 if (BE (pstr->trans != NULL, 0))
221 {
222 int i, ch;
223
224 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
225 {
226 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
227 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
228 }
229 p = (const char *) buf;
230 }
231 else
232 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
233 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
234 if (BE (mbclen == (size_t) -2, 0))
235 {
236 /* The buffer doesn't have enough space, finish to build. */
237 pstr->cur_state = prev_st;
238 break;
239 }
240 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
241 {
242 /* We treat these cases as a singlebyte character. */
243 mbclen = 1;
244 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
245 if (BE (pstr->trans != NULL, 0))
246 wc = pstr->trans[wc];
247 pstr->cur_state = prev_st;
248 }
249
250 /* Write wide character and padding. */
251 pstr->wcs[byte_idx++] = wc;
252 /* Write paddings. */
253 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
254 pstr->wcs[byte_idx++] = WEOF;
255 }
256 pstr->valid_len = byte_idx;
257 pstr->valid_raw_len = byte_idx;
258}
259
260/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
261 but for REG_ICASE. */
262
263static int
264internal_function
265build_wcs_upper_buffer (re_string_t *pstr)
266{
267 mbstate_t prev_st;
268 int src_idx, byte_idx, end_idx, remain_len;
269 size_t mbclen;
270#ifdef _LIBC
271 char buf[MB_LEN_MAX];
272 assert (MB_LEN_MAX >= pstr->mb_cur_max);
273#else
274 char buf[64];
275#endif
276
277 byte_idx = pstr->valid_len;
278 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
279
280 /* The following optimization assumes that ASCII characters can be
281 mapped to wide characters with a simple cast. */
282 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
283 {
284 while (byte_idx < end_idx)
285 {
286 wchar_t wc;
287
288 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
289 && mbsinit (&pstr->cur_state))
290 {
291 /* In case of a singlebyte character. */
292 pstr->mbs[byte_idx]
293 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
294 /* The next step uses the assumption that wchar_t is encoded
295 ASCII-safe: all ASCII values can be converted like this. */
296 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
297 ++byte_idx;
298 continue;
299 }
300
301 remain_len = end_idx - byte_idx;
302 prev_st = pstr->cur_state;
303 mbclen = mbrtowc (&wc,
304 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
305 + byte_idx), remain_len, &pstr->cur_state);
306 if (BE (mbclen + 2 > 2, 1))
307 {
308 wchar_t wcu = wc;
309 if (iswlower (wc))
310 {
311 size_t mbcdlen;
312
313 wcu = towupper (wc);
314 mbcdlen = wcrtomb (buf, wcu, &prev_st);
315 if (BE (mbclen == mbcdlen, 1))
316 memcpy (pstr->mbs + byte_idx, buf, mbclen);
317 else
318 {
319 src_idx = byte_idx;
320 goto offsets_needed;
321 }
322 }
323 else
324 memcpy (pstr->mbs + byte_idx,
325 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
326 pstr->wcs[byte_idx++] = wcu;
327 /* Write paddings. */
328 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
329 pstr->wcs[byte_idx++] = WEOF;
330 }
331 else if (mbclen == (size_t) -1 || mbclen == 0)
332 {
333 /* It is an invalid character or '\0'. Just use the byte. */
334 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
335 pstr->mbs[byte_idx] = ch;
336 /* And also cast it to wide char. */
337 pstr->wcs[byte_idx++] = (wchar_t) ch;
338 if (BE (mbclen == (size_t) -1, 0))
339 pstr->cur_state = prev_st;
340 }
341 else
342 {
343 /* The buffer doesn't have enough space, finish to build. */
344 pstr->cur_state = prev_st;
345 break;
346 }
347 }
348 pstr->valid_len = byte_idx;
349 pstr->valid_raw_len = byte_idx;
350 return REG_NOERROR;
351 }
352 else
353 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
354 {
355 wchar_t wc;
356 const char *p;
357 offsets_needed:
358 remain_len = end_idx - byte_idx;
359 prev_st = pstr->cur_state;
360 if (BE (pstr->trans != NULL, 0))
361 {
362 int i, ch;
363
364 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
365 {
366 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
367 buf[i] = pstr->trans[ch];
368 }
369 p = (const char *) buf;
370 }
371 else
372 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
373 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
374 if (BE (mbclen + 2 > 2, 1))
375 {
376 wchar_t wcu = wc;
377 if (iswlower (wc))
378 {
379 size_t mbcdlen;
380
381 wcu = towupper (wc);
382 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
383 if (BE (mbclen == mbcdlen, 1))
384 memcpy (pstr->mbs + byte_idx, buf, mbclen);
385 else if (mbcdlen != (size_t) -1)
386 {
387 size_t i;
388
389 if (byte_idx + mbcdlen > pstr->bufs_len)
390 {
391 pstr->cur_state = prev_st;
392 break;
393 }
394
395 if (pstr->offsets == NULL)
396 {
397 pstr->offsets = re_malloc (int, pstr->bufs_len);
398
399 if (pstr->offsets == NULL)
400 return REG_ESPACE;
401 }
402 if (!pstr->offsets_needed)
403 {
404 for (i = 0; i < (size_t) byte_idx; ++i)
405 pstr->offsets[i] = i;
406 pstr->offsets_needed = 1;
407 }
408
409 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
410 pstr->wcs[byte_idx] = wcu;
411 pstr->offsets[byte_idx] = src_idx;
412 for (i = 1; i < mbcdlen; ++i)
413 {
414 pstr->offsets[byte_idx + i]
415 = src_idx + (i < mbclen ? i : mbclen - 1);
416 pstr->wcs[byte_idx + i] = WEOF;
417 }
418 pstr->len += mbcdlen - mbclen;
419 if (pstr->raw_stop > src_idx)
420 pstr->stop += mbcdlen - mbclen;
421 end_idx = (pstr->bufs_len > pstr->len)
422 ? pstr->len : pstr->bufs_len;
423 byte_idx += mbcdlen;
424 src_idx += mbclen;
425 continue;
426 }
427 else
428 memcpy (pstr->mbs + byte_idx, p, mbclen);
429 }
430 else
431 memcpy (pstr->mbs + byte_idx, p, mbclen);
432
433 if (BE (pstr->offsets_needed != 0, 0))
434 {
435 size_t i;
436 for (i = 0; i < mbclen; ++i)
437 pstr->offsets[byte_idx + i] = src_idx + i;
438 }
439 src_idx += mbclen;
440
441 pstr->wcs[byte_idx++] = wcu;
442 /* Write paddings. */
443 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
444 pstr->wcs[byte_idx++] = WEOF;
445 }
446 else if (mbclen == (size_t) -1 || mbclen == 0)
447 {
448 /* It is an invalid character or '\0'. Just use the byte. */
449 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
450
451 if (BE (pstr->trans != NULL, 0))
452 ch = pstr->trans [ch];
453 pstr->mbs[byte_idx] = ch;
454
455 if (BE (pstr->offsets_needed != 0, 0))
456 pstr->offsets[byte_idx] = src_idx;
457 ++src_idx;
458
459 /* And also cast it to wide char. */
460 pstr->wcs[byte_idx++] = (wchar_t) ch;
461 if (BE (mbclen == (size_t) -1, 0))
462 pstr->cur_state = prev_st;
463 }
464 else
465 {
466 /* The buffer doesn't have enough space, finish to build. */
467 pstr->cur_state = prev_st;
468 break;
469 }
470 }
471 pstr->valid_len = byte_idx;
472 pstr->valid_raw_len = src_idx;
473 return REG_NOERROR;
474}
475
476/* Skip characters until the index becomes greater than NEW_RAW_IDX.
477 Return the index. */
478
479static int
480internal_function
481re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
482{
483 mbstate_t prev_st;
484 int rawbuf_idx;
485 size_t mbclen;
486 wchar_t wc = 0;
487
488 /* Skip the characters which are not necessary to check. */
489 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
490 rawbuf_idx < new_raw_idx;)
491 {
492 int remain_len;
493 remain_len = pstr->len - rawbuf_idx;
494 prev_st = pstr->cur_state;
495 mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
496 remain_len, &pstr->cur_state);
497 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
498 {
499 /* We treat these cases as a singlebyte character. */
500 mbclen = 1;
501 pstr->cur_state = prev_st;
502 }
503 /* Then proceed the next character. */
504 rawbuf_idx += mbclen;
505 }
506 *last_wc = (wint_t) wc;
507 return rawbuf_idx;
508}
509#endif /* RE_ENABLE_I18N */
510
511/* Build the buffer PSTR->MBS, and apply the translation if we need.
512 This function is used in case of REG_ICASE. */
513
514static void
515internal_function
516build_upper_buffer (re_string_t *pstr)
517{
518 int char_idx, end_idx;
519 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
520
521 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
522 {
523 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
524 if (BE (pstr->trans != NULL, 0))
525 ch = pstr->trans[ch];
526 if (islower (ch))
527 pstr->mbs[char_idx] = toupper (ch);
528 else
529 pstr->mbs[char_idx] = ch;
530 }
531 pstr->valid_len = char_idx;
532 pstr->valid_raw_len = char_idx;
533}
534
535/* Apply TRANS to the buffer in PSTR. */
536
537static void
538internal_function
539re_string_translate_buffer (re_string_t *pstr)
540{
541 int buf_idx, end_idx;
542 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
543
544 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
545 {
546 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
547 pstr->mbs[buf_idx] = pstr->trans[ch];
548 }
549
550 pstr->valid_len = buf_idx;
551 pstr->valid_raw_len = buf_idx;
552}
553
554/* This function re-construct the buffers.
555 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
556 convert to upper case in case of REG_ICASE, apply translation. */
557
558static reg_errcode_t
559internal_function
560re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
561{
562 int offset = idx - pstr->raw_mbs_idx;
563 if (BE (offset < 0, 0))
564 {
565 /* Reset buffer. */
566#ifdef RE_ENABLE_I18N
567 if (pstr->mb_cur_max > 1)
568 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
569#endif /* RE_ENABLE_I18N */
570 pstr->len = pstr->raw_len;
571 pstr->stop = pstr->raw_stop;
572 pstr->valid_len = 0;
573 pstr->raw_mbs_idx = 0;
574 pstr->valid_raw_len = 0;
575 pstr->offsets_needed = 0;
576 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
577 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
578 if (!pstr->mbs_allocated)
579 pstr->mbs = (unsigned char *) pstr->raw_mbs;
580 offset = idx;
581 }
582
583 if (BE (offset != 0, 1))
584 {
585 /* Are the characters which are already checked remain? */
586 if (BE (offset < pstr->valid_raw_len, 1)
587#ifdef RE_ENABLE_I18N
588 /* Handling this would enlarge the code too much.
589 Accept a slowdown in that case. */
590 && pstr->offsets_needed == 0
591#endif
592 )
593 {
594 /* Yes, move them to the front of the buffer. */
595 pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
596#ifdef RE_ENABLE_I18N
597 if (pstr->mb_cur_max > 1)
598 memmove (pstr->wcs, pstr->wcs + offset,
599 (pstr->valid_len - offset) * sizeof (wint_t));
600#endif /* RE_ENABLE_I18N */
601 if (BE (pstr->mbs_allocated, 0))
602 memmove (pstr->mbs, pstr->mbs + offset,
603 pstr->valid_len - offset);
604 pstr->valid_len -= offset;
605 pstr->valid_raw_len -= offset;
606#if DEBUG
607 assert (pstr->valid_len > 0);
608#endif
609 }
610 else
611 {
612 /* No, skip all characters until IDX. */
613#ifdef RE_ENABLE_I18N
614 if (BE (pstr->offsets_needed, 0))
615 {
616 pstr->len = pstr->raw_len - idx + offset;
617 pstr->stop = pstr->raw_stop - idx + offset;
618 pstr->offsets_needed = 0;
619 }
620#endif
621 pstr->valid_len = 0;
622 pstr->valid_raw_len = 0;
623#ifdef RE_ENABLE_I18N
624 if (pstr->mb_cur_max > 1)
625 {
626 int wcs_idx;
627 wint_t wc = WEOF;
628
629 if (pstr->is_utf8)
630 {
631 const unsigned char *raw, *p, *q, *end;
632
633 /* Special case UTF-8. Multi-byte chars start with any
634 byte other than 0x80 - 0xbf. */
635 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
636 end = raw + (offset - pstr->mb_cur_max);
637 p = raw + offset - 1;
638#ifdef _LIBC
639 /* We know the wchar_t encoding is UCS4, so for the simple
640 case, ASCII characters, skip the conversion step. */
641 if (isascii (*p) && BE (pstr->trans == NULL, 1))
642 {
643 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
644 pstr->valid_len = 0;
645 wc = (wchar_t) *p;
646 }
647 else
648#endif
649 for (; p >= end; --p)
650 if ((*p & 0xc0) != 0x80)
651 {
652 mbstate_t cur_state;
653 wchar_t wc2;
654 int mlen = raw + pstr->len - p;
655 unsigned char buf[6];
656 size_t mbclen;
657
658 q = p;
659 if (BE (pstr->trans != NULL, 0))
660 {
661 int i = mlen < 6 ? mlen : 6;
662 while (--i >= 0)
663 buf[i] = pstr->trans[p[i]];
664 q = buf;
665 }
666 /* XXX Don't use mbrtowc, we know which conversion
667 to use (UTF-8 -> UCS4). */
668 memset (&cur_state, 0, sizeof (cur_state));
669 mbclen = mbrtowc (&wc2, (const char *) p, mlen,
670 &cur_state);
671 if (raw + offset - p <= mbclen
672 && mbclen < (size_t) -2)
673 {
674 memset (&pstr->cur_state, '\0',
675 sizeof (mbstate_t));
676 pstr->valid_len = mbclen - (raw + offset - p);
677 wc = wc2;
678 }
679 break;
680 }
681 }
682
683 if (wc == WEOF)
684 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
685 if (BE (pstr->valid_len, 0))
686 {
687 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
688 pstr->wcs[wcs_idx] = WEOF;
689 if (pstr->mbs_allocated)
690 memset (pstr->mbs, 255, pstr->valid_len);
691 }
692 pstr->valid_raw_len = pstr->valid_len;
693 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
694 && IS_WIDE_WORD_CHAR (wc))
695 ? CONTEXT_WORD
696 : ((IS_WIDE_NEWLINE (wc)
697 && pstr->newline_anchor)
698 ? CONTEXT_NEWLINE : 0));
699 }
700 else
701#endif /* RE_ENABLE_I18N */
702 {
703 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
704 if (pstr->trans)
705 c = pstr->trans[c];
706 pstr->tip_context = (bitset_contain (pstr->word_char, c)
707 ? CONTEXT_WORD
708 : ((IS_NEWLINE (c) && pstr->newline_anchor)
709 ? CONTEXT_NEWLINE : 0));
710 }
711 }
712 if (!BE (pstr->mbs_allocated, 0))
713 pstr->mbs += offset;
714 }
715 pstr->raw_mbs_idx = idx;
716 pstr->len -= offset;
717 pstr->stop -= offset;
718
719 /* Then build the buffers. */
720#ifdef RE_ENABLE_I18N
721 if (pstr->mb_cur_max > 1)
722 {
723 if (pstr->icase)
724 {
725 int ret = build_wcs_upper_buffer (pstr);
726 if (BE (ret != REG_NOERROR, 0))
727 return ret;
728 }
729 else
730 build_wcs_buffer (pstr);
731 }
732 else
733#endif /* RE_ENABLE_I18N */
734 if (BE (pstr->mbs_allocated, 0))
735 {
736 if (pstr->icase)
737 build_upper_buffer (pstr);
738 else if (pstr->trans != NULL)
739 re_string_translate_buffer (pstr);
740 }
741 else
742 pstr->valid_len = pstr->len;
743
744 pstr->cur_idx = 0;
745 return REG_NOERROR;
746}
747
748static unsigned char
749internal_function __attribute ((pure))
750re_string_peek_byte_case (const re_string_t *pstr, int idx)
751{
752 int ch, off;
753
754 /* Handle the common (easiest) cases first. */
755 if (BE (!pstr->mbs_allocated, 1))
756 return re_string_peek_byte (pstr, idx);
757
758#ifdef RE_ENABLE_I18N
759 if (pstr->mb_cur_max > 1
760 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
761 return re_string_peek_byte (pstr, idx);
762#endif
763
764 off = pstr->cur_idx + idx;
765#ifdef RE_ENABLE_I18N
766 if (pstr->offsets_needed)
767 off = pstr->offsets[off];
768#endif
769
770 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
771
772#ifdef RE_ENABLE_I18N
773 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
774 this function returns CAPITAL LETTER I instead of first byte of
775 DOTLESS SMALL LETTER I. The latter would confuse the parser,
776 since peek_byte_case doesn't advance cur_idx in any way. */
777 if (pstr->offsets_needed && !isascii (ch))
778 return re_string_peek_byte (pstr, idx);
779#endif
780
781 return ch;
782}
783
784static unsigned char
785internal_function __attribute ((pure))
786re_string_fetch_byte_case (re_string_t *pstr)
787{
788 if (BE (!pstr->mbs_allocated, 1))
789 return re_string_fetch_byte (pstr);
790
791#ifdef RE_ENABLE_I18N
792 if (pstr->offsets_needed)
793 {
794 int off, ch;
795
796 /* For tr_TR.UTF-8 [[:islower:]] there is
797 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
798 in that case the whole multi-byte character and return
799 the original letter. On the other side, with
800 [[: DOTLESS SMALL LETTER I return [[:I, as doing
801 anything else would complicate things too much. */
802
803 if (!re_string_first_byte (pstr, pstr->cur_idx))
804 return re_string_fetch_byte (pstr);
805
806 off = pstr->offsets[pstr->cur_idx];
807 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
808
809 if (! isascii (ch))
810 return re_string_fetch_byte (pstr);
811
812 re_string_skip_bytes (pstr,
813 re_string_char_size_at (pstr, pstr->cur_idx));
814 return ch;
815 }
816#endif
817
818 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
819}
820
821static void
822internal_function
823re_string_destruct (re_string_t *pstr)
824{
825#ifdef RE_ENABLE_I18N
826 re_free (pstr->wcs);
827 re_free (pstr->offsets);
828#endif /* RE_ENABLE_I18N */
829 if (pstr->mbs_allocated)
830 re_free (pstr->mbs);
831}
832
833/* Return the context at IDX in INPUT. */
834
835static unsigned int
836internal_function
837re_string_context_at (const re_string_t *input, int idx, int eflags)
838{
839 int c;
840 if (BE (idx < 0, 0))
841 /* In this case, we use the value stored in input->tip_context,
842 since we can't know the character in input->mbs[-1] here. */
843 return input->tip_context;
844 if (BE (idx == input->len, 0))
845 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
846 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
847#ifdef RE_ENABLE_I18N
848 if (input->mb_cur_max > 1)
849 {
850 wint_t wc;
851 int wc_idx = idx;
852 while(input->wcs[wc_idx] == WEOF)
853 {
854#ifdef DEBUG
855 /* It must not happen. */
856 assert (wc_idx >= 0);
857#endif
858 --wc_idx;
859 if (wc_idx < 0)
860 return input->tip_context;
861 }
862 wc = input->wcs[wc_idx];
863 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
864 return CONTEXT_WORD;
865 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
866 ? CONTEXT_NEWLINE : 0);
867 }
868 else
869#endif
870 {
871 c = re_string_byte_at (input, idx);
872 if (bitset_contain (input->word_char, c))
873 return CONTEXT_WORD;
874 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
875 }
876}
877
878
879/* Functions for set operation. */
880
881static reg_errcode_t
882internal_function
883re_node_set_alloc (re_node_set *set, int size)
884{
885 set->alloc = size;
886 set->nelem = 0;
887 set->elems = re_malloc (int, size);
888 if (BE (set->elems == NULL, 0))
889 return REG_ESPACE;
890 return REG_NOERROR;
891}
892
893static reg_errcode_t
894internal_function
895re_node_set_init_1 (re_node_set *set, int elem)
896{
897 set->alloc = 1;
898 set->nelem = 1;
899 set->elems = re_malloc (int, 1);
900 if (BE (set->elems == NULL, 0))
901 {
902 set->alloc = set->nelem = 0;
903 return REG_ESPACE;
904 }
905 set->elems[0] = elem;
906 return REG_NOERROR;
907}
908
909static reg_errcode_t
910internal_function
911re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
912{
913 set->alloc = 2;
914 set->elems = re_malloc (int, 2);
915 if (BE (set->elems == NULL, 0))
916 return REG_ESPACE;
917 if (elem1 == elem2)
918 {
919 set->nelem = 1;
920 set->elems[0] = elem1;
921 }
922 else
923 {
924 set->nelem = 2;
925 if (elem1 < elem2)
926 {
927 set->elems[0] = elem1;
928 set->elems[1] = elem2;
929 }
930 else
931 {
932 set->elems[0] = elem2;
933 set->elems[1] = elem1;
934 }
935 }
936 return REG_NOERROR;
937}
938
939static reg_errcode_t
940internal_function
941re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
942{
943 dest->nelem = src->nelem;
944 if (src->nelem > 0)
945 {
946 dest->alloc = dest->nelem;
947 dest->elems = re_malloc (int, dest->alloc);
948 if (BE (dest->elems == NULL, 0))
949 {
950 dest->alloc = dest->nelem = 0;
951 return REG_ESPACE;
952 }
953 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
954 }
955 else
956 re_node_set_init_empty (dest);
957 return REG_NOERROR;
958}
959
960/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
961 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
962 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
963
964static reg_errcode_t
965internal_function
966re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
967 const re_node_set *src2)
968{
969 int i1, i2, is, id, delta, sbase;
970 if (src1->nelem == 0 || src2->nelem == 0)
971 return REG_NOERROR;
972
973 /* We need dest->nelem + 2 * elems_in_intersection; this is a
974 conservative estimate. */
975 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
976 {
977 int new_alloc = src1->nelem + src2->nelem + dest->alloc;
978 int *new_elems = re_realloc (dest->elems, int, new_alloc);
979 if (BE (new_elems == NULL, 0))
980 return REG_ESPACE;
981 dest->elems = new_elems;
982 dest->alloc = new_alloc;
983 }
984
985 /* Find the items in the intersection of SRC1 and SRC2, and copy
986 into the top of DEST those that are not already in DEST itself. */
987 sbase = dest->nelem + src1->nelem + src2->nelem;
988 i1 = src1->nelem - 1;
989 i2 = src2->nelem - 1;
990 id = dest->nelem - 1;
991 for (;;)
992 {
993 if (src1->elems[i1] == src2->elems[i2])
994 {
995 /* Try to find the item in DEST. Maybe we could binary search? */
996 while (id >= 0 && dest->elems[id] > src1->elems[i1])
997 --id;
998
999 if (id < 0 || dest->elems[id] != src1->elems[i1])
1000 dest->elems[--sbase] = src1->elems[i1];
1001
1002 if (--i1 < 0 || --i2 < 0)
1003 break;
1004 }
1005
1006 /* Lower the highest of the two items. */
1007 else if (src1->elems[i1] < src2->elems[i2])
1008 {
1009 if (--i2 < 0)
1010 break;
1011 }
1012 else
1013 {
1014 if (--i1 < 0)
1015 break;
1016 }
1017 }
1018
1019 id = dest->nelem - 1;
1020 is = dest->nelem + src1->nelem + src2->nelem - 1;
1021 delta = is - sbase + 1;
1022
1023 /* Now copy. When DELTA becomes zero, the remaining
1024 DEST elements are already in place; this is more or
1025 less the same loop that is in re_node_set_merge. */
1026 dest->nelem += delta;
1027 if (delta > 0 && id >= 0)
1028 for (;;)
1029 {
1030 if (dest->elems[is] > dest->elems[id])
1031 {
1032 /* Copy from the top. */
1033 dest->elems[id + delta--] = dest->elems[is--];
1034 if (delta == 0)
1035 break;
1036 }
1037 else
1038 {
1039 /* Slide from the bottom. */
1040 dest->elems[id + delta] = dest->elems[id];
1041 if (--id < 0)
1042 break;
1043 }
1044 }
1045
1046 /* Copy remaining SRC elements. */
1047 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1048
1049 return REG_NOERROR;
1050}
1051
1052/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1053 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1054
1055static reg_errcode_t
1056internal_function
1057re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1058 const re_node_set *src2)
1059{
1060 int i1, i2, id;
1061 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1062 {
1063 dest->alloc = src1->nelem + src2->nelem;
1064 dest->elems = re_malloc (int, dest->alloc);
1065 if (BE (dest->elems == NULL, 0))
1066 return REG_ESPACE;
1067 }
1068 else
1069 {
1070 if (src1 != NULL && src1->nelem > 0)
1071 return re_node_set_init_copy (dest, src1);
1072 else if (src2 != NULL && src2->nelem > 0)
1073 return re_node_set_init_copy (dest, src2);
1074 else
1075 re_node_set_init_empty (dest);
1076 return REG_NOERROR;
1077 }
1078 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1079 {
1080 if (src1->elems[i1] > src2->elems[i2])
1081 {
1082 dest->elems[id++] = src2->elems[i2++];
1083 continue;
1084 }
1085 if (src1->elems[i1] == src2->elems[i2])
1086 ++i2;
1087 dest->elems[id++] = src1->elems[i1++];
1088 }
1089 if (i1 < src1->nelem)
1090 {
1091 memcpy (dest->elems + id, src1->elems + i1,
1092 (src1->nelem - i1) * sizeof (int));
1093 id += src1->nelem - i1;
1094 }
1095 else if (i2 < src2->nelem)
1096 {
1097 memcpy (dest->elems + id, src2->elems + i2,
1098 (src2->nelem - i2) * sizeof (int));
1099 id += src2->nelem - i2;
1100 }
1101 dest->nelem = id;
1102 return REG_NOERROR;
1103}
1104
1105/* Calculate the union set of the sets DEST and SRC. And store it to
1106 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1107
1108static reg_errcode_t
1109internal_function
1110re_node_set_merge (re_node_set *dest, const re_node_set *src)
1111{
1112 int is, id, sbase, delta;
1113 if (src == NULL || src->nelem == 0)
1114 return REG_NOERROR;
1115 if (dest->alloc < 2 * src->nelem + dest->nelem)
1116 {
1117 int new_alloc = 2 * (src->nelem + dest->alloc);
1118 int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1119 if (BE (new_buffer == NULL, 0))
1120 return REG_ESPACE;
1121 dest->elems = new_buffer;
1122 dest->alloc = new_alloc;
1123 }
1124
1125 if (BE (dest->nelem == 0, 0))
1126 {
1127 dest->nelem = src->nelem;
1128 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1129 return REG_NOERROR;
1130 }
1131
1132 /* Copy into the top of DEST the items of SRC that are not
1133 found in DEST. Maybe we could binary search in DEST? */
1134 for (sbase = dest->nelem + 2 * src->nelem,
1135 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1136 {
1137 if (dest->elems[id] == src->elems[is])
1138 is--, id--;
1139 else if (dest->elems[id] < src->elems[is])
1140 dest->elems[--sbase] = src->elems[is--];
1141 else /* if (dest->elems[id] > src->elems[is]) */
1142 --id;
1143 }
1144
1145 if (is >= 0)
1146 {
1147 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1148 sbase -= is + 1;
1149 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1150 }
1151
1152 id = dest->nelem - 1;
1153 is = dest->nelem + 2 * src->nelem - 1;
1154 delta = is - sbase + 1;
1155 if (delta == 0)
1156 return REG_NOERROR;
1157
1158 /* Now copy. When DELTA becomes zero, the remaining
1159 DEST elements are already in place. */
1160 dest->nelem += delta;
1161 for (;;)
1162 {
1163 if (dest->elems[is] > dest->elems[id])
1164 {
1165 /* Copy from the top. */
1166 dest->elems[id + delta--] = dest->elems[is--];
1167 if (delta == 0)
1168 break;
1169 }
1170 else
1171 {
1172 /* Slide from the bottom. */
1173 dest->elems[id + delta] = dest->elems[id];
1174 if (--id < 0)
1175 {
1176 /* Copy remaining SRC elements. */
1177 memcpy (dest->elems, dest->elems + sbase,
1178 delta * sizeof (int));
1179 break;
1180 }
1181 }
1182 }
1183
1184 return REG_NOERROR;
1185}
1186
1187/* Insert the new element ELEM to the re_node_set* SET.
1188 SET should not already have ELEM.
1189 return -1 if an error is occured, return 1 otherwise. */
1190
1191static int
1192internal_function
1193re_node_set_insert (re_node_set *set, int elem)
1194{
1195 int idx;
1196 /* In case the set is empty. */
1197 if (set->alloc == 0)
1198 {
1199 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1200 return 1;
1201 else
1202 return -1;
1203 }
1204
1205 if (BE (set->nelem, 0) == 0)
1206 {
1207 /* We already guaranteed above that set->alloc != 0. */
1208 set->elems[0] = elem;
1209 ++set->nelem;
1210 return 1;
1211 }
1212
1213 /* Realloc if we need. */
1214 if (set->alloc == set->nelem)
1215 {
1216 int *new_elems;
1217 set->alloc = set->alloc * 2;
1218 new_elems = re_realloc (set->elems, int, set->alloc);
1219 if (BE (new_elems == NULL, 0))
1220 return -1;
1221 set->elems = new_elems;
1222 }
1223
1224 /* Move the elements which follows the new element. Test the
1225 first element separately to skip a check in the inner loop. */
1226 if (elem < set->elems[0])
1227 {
1228 idx = 0;
1229 for (idx = set->nelem; idx > 0; idx--)
1230 set->elems[idx] = set->elems[idx - 1];
1231 }
1232 else
1233 {
1234 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1235 set->elems[idx] = set->elems[idx - 1];
1236 }
1237
1238 /* Insert the new element. */
1239 set->elems[idx] = elem;
1240 ++set->nelem;
1241 return 1;
1242}
1243
1244/* Insert the new element ELEM to the re_node_set* SET.
1245 SET should not already have any element greater than or equal to ELEM.
1246 Return -1 if an error is occured, return 1 otherwise. */
1247
1248static int
1249internal_function
1250re_node_set_insert_last (re_node_set *set, int elem)
1251{
1252 /* Realloc if we need. */
1253 if (set->alloc == set->nelem)
1254 {
1255 int *new_elems;
1256 set->alloc = (set->alloc + 1) * 2;
1257 new_elems = re_realloc (set->elems, int, set->alloc);
1258 if (BE (new_elems == NULL, 0))
1259 return -1;
1260 set->elems = new_elems;
1261 }
1262
1263 /* Insert the new element. */
1264 set->elems[set->nelem++] = elem;
1265 return 1;
1266}
1267
1268/* Compare two node sets SET1 and SET2.
1269 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1270
1271static int
1272internal_function __attribute ((pure))
1273re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1274{
1275 int i;
1276 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1277 return 0;
1278 for (i = set1->nelem ; --i >= 0 ; )
1279 if (set1->elems[i] != set2->elems[i])
1280 return 0;
1281 return 1;
1282}
1283
1284/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1285
1286static int
1287internal_function __attribute ((pure))
1288re_node_set_contains (const re_node_set *set, int elem)
1289{
1290 unsigned int idx, right, mid;
1291 if (set->nelem <= 0)
1292 return 0;
1293
1294 /* Binary search the element. */
1295 idx = 0;
1296 right = set->nelem - 1;
1297 while (idx < right)
1298 {
1299 mid = (idx + right) / 2;
1300 if (set->elems[mid] < elem)
1301 idx = mid + 1;
1302 else
1303 right = mid;
1304 }
1305 return set->elems[idx] == elem ? idx + 1 : 0;
1306}
1307
1308static void
1309internal_function
1310re_node_set_remove_at (re_node_set *set, int idx)
1311{
1312 if (idx < 0 || idx >= set->nelem)
1313 return;
1314 --set->nelem;
1315 for (; idx < set->nelem; idx++)
1316 set->elems[idx] = set->elems[idx + 1];
1317}
1318
1319
1320
1321/* Add the token TOKEN to dfa->nodes, and return the index of the token.
1322 Or return -1, if an error will be occured. */
1323
1324static int
1325internal_function
1326re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1327{
1328 int type = token.type;
1329 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1330 {
1331 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1332 int *new_nexts, *new_indices;
1333 re_node_set *new_edests, *new_eclosures;
1334 re_token_t *new_nodes;
1335
1336 /* Avoid overflows. */
1337 if (BE (new_nodes_alloc < dfa->nodes_alloc, 0))
1338 return -1;
1339
1340 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1341 if (BE (new_nodes == NULL, 0))
1342 return -1;
1343 dfa->nodes = new_nodes;
1344 new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1345 new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1346 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1347 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1348 if (BE (new_nexts == NULL || new_indices == NULL
1349 || new_edests == NULL || new_eclosures == NULL, 0))
1350 return -1;
1351 dfa->nexts = new_nexts;
1352 dfa->org_indices = new_indices;
1353 dfa->edests = new_edests;
1354 dfa->eclosures = new_eclosures;
1355 dfa->nodes_alloc = new_nodes_alloc;
1356 }
1357 dfa->nodes[dfa->nodes_len] = token;
1358 dfa->nodes[dfa->nodes_len].constraint = 0;
1359#ifdef RE_ENABLE_I18N
1360 dfa->nodes[dfa->nodes_len].accept_mb =
1361 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1362#endif
1363 dfa->nexts[dfa->nodes_len] = -1;
1364 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1365 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1366 return dfa->nodes_len++;
1367}
1368
1369static inline unsigned int
1370internal_function
1371calc_state_hash (const re_node_set *nodes, unsigned int context)
1372{
1373 unsigned int hash = nodes->nelem + context;
1374 int i;
1375 for (i = 0 ; i < nodes->nelem ; i++)
1376 hash += nodes->elems[i];
1377 return hash;
1378}
1379
1380/* Search for the state whose node_set is equivalent to NODES.
1381 Return the pointer to the state, if we found it in the DFA.
1382 Otherwise create the new one and return it. In case of an error
1383 return NULL and set the error code in ERR.
1384 Note: - We assume NULL as the invalid state, then it is possible that
1385 return value is NULL and ERR is REG_NOERROR.
1386 - We never return non-NULL value in case of any errors, it is for
1387 optimization. */
1388
1389static re_dfastate_t *
1390internal_function
1391re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1392 const re_node_set *nodes)
1393{
1394 unsigned int hash;
1395 re_dfastate_t *new_state;
1396 struct re_state_table_entry *spot;
1397 int i;
1398 if (BE (nodes->nelem == 0, 0))
1399 {
1400 *err = REG_NOERROR;
1401 return NULL;
1402 }
1403 hash = calc_state_hash (nodes, 0);
1404 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1405
1406 for (i = 0 ; i < spot->num ; i++)
1407 {
1408 re_dfastate_t *state = spot->array[i];
1409 if (hash != state->hash)
1410 continue;
1411 if (re_node_set_compare (&state->nodes, nodes))
1412 return state;
1413 }
1414
1415 /* There are no appropriate state in the dfa, create the new one. */
1416 new_state = create_ci_newstate (dfa, nodes, hash);
1417 if (BE (new_state == NULL, 0))
1418 *err = REG_ESPACE;
1419
1420 return new_state;
1421}
1422
1423/* Search for the state whose node_set is equivalent to NODES and
1424 whose context is equivalent to CONTEXT.
1425 Return the pointer to the state, if we found it in the DFA.
1426 Otherwise create the new one and return it. In case of an error
1427 return NULL and set the error code in ERR.
1428 Note: - We assume NULL as the invalid state, then it is possible that
1429 return value is NULL and ERR is REG_NOERROR.
1430 - We never return non-NULL value in case of any errors, it is for
1431 optimization. */
1432
1433static re_dfastate_t *
1434internal_function
1435re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1436 const re_node_set *nodes, unsigned int context)
1437{
1438 unsigned int hash;
1439 re_dfastate_t *new_state;
1440 struct re_state_table_entry *spot;
1441 int i;
1442 if (nodes->nelem == 0)
1443 {
1444 *err = REG_NOERROR;
1445 return NULL;
1446 }
1447 hash = calc_state_hash (nodes, context);
1448 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1449
1450 for (i = 0 ; i < spot->num ; i++)
1451 {
1452 re_dfastate_t *state = spot->array[i];
1453 if (state->hash == hash
1454 && state->context == context
1455 && re_node_set_compare (state->entrance_nodes, nodes))
1456 return state;
1457 }
1458 /* There are no appropriate state in `dfa', create the new one. */
1459 new_state = create_cd_newstate (dfa, nodes, context, hash);
1460 if (BE (new_state == NULL, 0))
1461 *err = REG_ESPACE;
1462
1463 return new_state;
1464}
1465
1466/* Finish initialization of the new state NEWSTATE, and using its hash value
1467 HASH put in the appropriate bucket of DFA's state table. Return value
1468 indicates the error code if failed. */
1469
1470static reg_errcode_t
1471register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1472 unsigned int hash)
1473{
1474 struct re_state_table_entry *spot;
1475 reg_errcode_t err;
1476 int i;
1477
1478 newstate->hash = hash;
1479 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1480 if (BE (err != REG_NOERROR, 0))
1481 return REG_ESPACE;
1482 for (i = 0; i < newstate->nodes.nelem; i++)
1483 {
1484 int elem = newstate->nodes.elems[i];
1485 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1486 re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1487 }
1488
1489 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1490 if (BE (spot->alloc <= spot->num, 0))
1491 {
1492 int new_alloc = 2 * spot->num + 2;
1493 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1494 new_alloc);
1495 if (BE (new_array == NULL, 0))
1496 return REG_ESPACE;
1497 spot->array = new_array;
1498 spot->alloc = new_alloc;
1499 }
1500 spot->array[spot->num++] = newstate;
1501 return REG_NOERROR;
1502}
1503
1504static void
1505free_state (re_dfastate_t *state)
1506{
1507 re_node_set_free (&state->non_eps_nodes);
1508 re_node_set_free (&state->inveclosure);
1509 if (state->entrance_nodes != &state->nodes)
1510 {
1511 re_node_set_free (state->entrance_nodes);
1512 re_free (state->entrance_nodes);
1513 }
1514 re_node_set_free (&state->nodes);
1515 re_free (state->word_trtable);
1516 re_free (state->trtable);
1517 re_free (state);
1518}
1519
1520/* Create the new state which is independ of contexts.
1521 Return the new state if succeeded, otherwise return NULL. */
1522
1523static re_dfastate_t *
1524internal_function
1525create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1526 unsigned int hash)
1527{
1528 int i;
1529 reg_errcode_t err;
1530 re_dfastate_t *newstate;
1531
1532 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1533 if (BE (newstate == NULL, 0))
1534 return NULL;
1535 err = re_node_set_init_copy (&newstate->nodes, nodes);
1536 if (BE (err != REG_NOERROR, 0))
1537 {
1538 re_free (newstate);
1539 return NULL;
1540 }
1541
1542 newstate->entrance_nodes = &newstate->nodes;
1543 for (i = 0 ; i < nodes->nelem ; i++)
1544 {
1545 re_token_t *node = dfa->nodes + nodes->elems[i];
1546 re_token_type_t type = node->type;
1547 if (type == CHARACTER && !node->constraint)
1548 continue;
1549#ifdef RE_ENABLE_I18N
1550 newstate->accept_mb |= node->accept_mb;
1551#endif /* RE_ENABLE_I18N */
1552
1553 /* If the state has the halt node, the state is a halt state. */
1554 if (type == END_OF_RE)
1555 newstate->halt = 1;
1556 else if (type == OP_BACK_REF)
1557 newstate->has_backref = 1;
1558 else if (type == ANCHOR || node->constraint)
1559 newstate->has_constraint = 1;
1560 }
1561 err = register_state (dfa, newstate, hash);
1562 if (BE (err != REG_NOERROR, 0))
1563 {
1564 free_state (newstate);
1565 newstate = NULL;
1566 }
1567 return newstate;
1568}
1569
1570/* Create the new state which is depend on the context CONTEXT.
1571 Return the new state if succeeded, otherwise return NULL. */
1572
1573static re_dfastate_t *
1574internal_function
1575create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1576 unsigned int context, unsigned int hash)
1577{
1578 int i, nctx_nodes = 0;
1579 reg_errcode_t err;
1580 re_dfastate_t *newstate;
1581
1582 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1583 if (BE (newstate == NULL, 0))
1584 return NULL;
1585 err = re_node_set_init_copy (&newstate->nodes, nodes);
1586 if (BE (err != REG_NOERROR, 0))
1587 {
1588 re_free (newstate);
1589 return NULL;
1590 }
1591
1592 newstate->context = context;
1593 newstate->entrance_nodes = &newstate->nodes;
1594
1595 for (i = 0 ; i < nodes->nelem ; i++)
1596 {
1597 unsigned int constraint = 0;
1598 re_token_t *node = dfa->nodes + nodes->elems[i];
1599 re_token_type_t type = node->type;
1600 if (node->constraint)
1601 constraint = node->constraint;
1602
1603 if (type == CHARACTER && !constraint)
1604 continue;
1605#ifdef RE_ENABLE_I18N
1606 newstate->accept_mb |= node->accept_mb;
1607#endif /* RE_ENABLE_I18N */
1608
1609 /* If the state has the halt node, the state is a halt state. */
1610 if (type == END_OF_RE)
1611 newstate->halt = 1;
1612 else if (type == OP_BACK_REF)
1613 newstate->has_backref = 1;
1614 else if (type == ANCHOR)
1615 constraint = node->opr.ctx_type;
1616
1617 if (constraint)
1618 {
1619 if (newstate->entrance_nodes == &newstate->nodes)
1620 {
1621 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1622 if (BE (newstate->entrance_nodes == NULL, 0))
1623 {
1624 free_state (newstate);
1625 return NULL;
1626 }
1627 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1628 nctx_nodes = 0;
1629 newstate->has_constraint = 1;
1630 }
1631
1632 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1633 {
1634 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1635 ++nctx_nodes;
1636 }
1637 }
1638 }
1639 err = register_state (dfa, newstate, hash);
1640 if (BE (err != REG_NOERROR, 0))
1641 {
1642 free_state (newstate);
1643 newstate = NULL;
1644 }
1645 return newstate;
1646}
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette