VirtualBox

source: kBuild/vendor/grep/2.12/src/dfasearch.c@ 3576

Last change on this file since 3576 was 2595, checked in by bird, 13 years ago

gnu grep version 2.12 (grep-2.12.tar.xz, md5sum=8d2f0346d08b13c18afb81f0e8aa1e2f)

  • Property svn:eol-style set to native
File size: 13.7 KB
Line 
1/* dfasearch.c - searching subroutines using dfa and regex for grep.
2 Copyright 1992, 1998, 2000, 2007, 2009-2012 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
17 02110-1301, USA. */
18
19/* Written August 1992 by Mike Haertel. */
20
21#include <config.h>
22#include "intprops.h"
23#include "search.h"
24#include "dfa.h"
25
26/* For -w, we also consider _ to be word constituent. */
27#define WCHAR(C) (isalnum (C) || (C) == '_')
28
29/* KWset compiled pattern. For Ecompile and Gcompile, we compile
30 a list of strings, at least one of which is known to occur in
31 any string matching the regexp. */
32static kwset_t kwset;
33
34/* DFA compiled regexp. */
35static struct dfa *dfa;
36
37/* The Regex compiled patterns. */
38static struct patterns
39{
40 /* Regex compiled regexp. */
41 struct re_pattern_buffer regexbuf;
42 struct re_registers regs; /* This is here on account of a BRAIN-DEAD
43 Q@#%!# library interface in regex.c. */
44} patterns0;
45
46static struct patterns *patterns;
47static size_t pcount;
48
49void
50dfaerror (char const *mesg)
51{
52 error (EXIT_TROUBLE, 0, "%s", mesg);
53
54 /* notreached */
55 /* Tell static analyzers that this function does not return. */
56 abort ();
57}
58
59/* For now, the sole dfawarn-eliciting condition (use of a regexp
60 like '[:lower:]') is unequivocally an error, so treat it as such,
61 when possible. */
62void
63dfawarn (char const *mesg)
64{
65 static enum { DW_NONE = 0, DW_POSIX, DW_GNU } mode;
66 if (mode == DW_NONE)
67 mode = (getenv ("POSIXLY_CORRECT") ? DW_POSIX : DW_GNU);
68 if (mode == DW_GNU)
69 dfaerror (mesg);
70}
71
72/* Number of compiled fixed strings known to exactly match the regexp.
73 If kwsexec returns < kwset_exact_matches, then we don't need to
74 call the regexp matcher at all. */
75static size_t kwset_exact_matches;
76
77static char const *
78kwsincr_case (const char *must)
79{
80 size_t n = strlen (must);
81 const char *buf = (match_icase && MB_CUR_MAX > 1
82 ? mbtolower (must, &n)
83 : must);
84 return kwsincr (kwset, buf, n);
85}
86
87/* If the DFA turns out to have some set of fixed strings one of
88 which must occur in the match, then we build a kwset matcher
89 to find those strings, and thus quickly filter out impossible
90 matches. */
91static void
92kwsmusts (void)
93{
94 struct dfamust const *dm;
95 char const *err;
96
97 dm = dfamusts (dfa);
98 if (dm)
99 {
100 kwsinit (&kwset);
101 /* First, we compile in the substrings known to be exact
102 matches. The kwset matcher will return the index
103 of the matching string that it chooses. */
104 for (; dm; dm = dm->next)
105 {
106 if (!dm->exact)
107 continue;
108 ++kwset_exact_matches;
109 if ((err = kwsincr_case (dm->must)) != NULL)
110 error (EXIT_TROUBLE, 0, "%s", err);
111 }
112 /* Now, we compile the substrings that will require
113 the use of the regexp matcher. */
114 for (dm = dfamusts (dfa); dm; dm = dm->next)
115 {
116 if (dm->exact)
117 continue;
118 if ((err = kwsincr_case (dm->must)) != NULL)
119 error (EXIT_TROUBLE, 0, "%s", err);
120 }
121 if ((err = kwsprep (kwset)) != NULL)
122 error (EXIT_TROUBLE, 0, "%s", err);
123 }
124}
125
126void
127GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits)
128{
129 const char *err;
130 const char *p, *sep;
131 size_t total = size;
132 char *motif;
133
134 if (match_icase)
135 syntax_bits |= RE_ICASE;
136 re_set_syntax (syntax_bits);
137 dfasyntax (syntax_bits, match_icase, eolbyte);
138
139 /* For GNU regex compiler we have to pass the patterns separately to detect
140 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
141 GNU regex should have raise a syntax error. The same for backref, where
142 the backref should have been local to each pattern. */
143 p = pattern;
144 do
145 {
146 size_t len;
147 sep = memchr (p, '\n', total);
148 if (sep)
149 {
150 len = sep - p;
151 sep++;
152 total -= (len + 1);
153 }
154 else
155 {
156 len = total;
157 total = 0;
158 }
159
160 patterns = xnrealloc (patterns, pcount + 1, sizeof *patterns);
161 patterns[pcount] = patterns0;
162
163 if ((err = re_compile_pattern (p, len,
164 &(patterns[pcount].regexbuf))) != NULL)
165 error (EXIT_TROUBLE, 0, "%s", err);
166 pcount++;
167
168 p = sep;
169 } while (sep && total != 0);
170
171 /* In the match_words and match_lines cases, we use a different pattern
172 for the DFA matcher that will quickly throw out cases that won't work.
173 Then if DFA succeeds we do some hairy stuff using the regex matcher
174 to decide whether the match should really count. */
175 if (match_words || match_lines)
176 {
177 static char const line_beg_no_bk[] = "^(";
178 static char const line_end_no_bk[] = ")$";
179 static char const word_beg_no_bk[] = "(^|[^[:alnum:]_])(";
180 static char const word_end_no_bk[] = ")([^[:alnum:]_]|$)";
181 static char const line_beg_bk[] = "^\\(";
182 static char const line_end_bk[] = "\\)$";
183 static char const word_beg_bk[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
184 static char const word_end_bk[] = "\\)\\([^[:alnum:]_]\\|$\\)";
185 int bk = !(syntax_bits & RE_NO_BK_PARENS);
186 char *n = xmalloc (sizeof word_beg_bk - 1 + size + sizeof word_end_bk);
187
188 strcpy (n, match_lines ? (bk ? line_beg_bk : line_beg_no_bk)
189 : (bk ? word_beg_bk : word_beg_no_bk));
190 total = strlen(n);
191 memcpy (n + total, pattern, size);
192 total += size;
193 strcpy (n + total, match_lines ? (bk ? line_end_bk : line_end_no_bk)
194 : (bk ? word_end_bk : word_end_no_bk));
195 total += strlen (n + total);
196 pattern = motif = n;
197 size = total;
198 }
199 else
200 motif = NULL;
201
202 dfa = dfaalloc ();
203 dfacomp (pattern, size, dfa, 1);
204 kwsmusts ();
205
206 free(motif);
207}
208
209size_t
210EGexecute (char const *buf, size_t size, size_t *match_size,
211 char const *start_ptr)
212{
213 char const *buflim, *beg, *end, *match, *best_match, *mb_start;
214 char eol = eolbyte;
215 int backref;
216 regoff_t start;
217 ptrdiff_t len, best_len;
218 struct kwsmatch kwsm;
219 size_t i, ret_val;
220 if (MB_CUR_MAX > 1)
221 {
222 if (match_icase)
223 {
224 /* mbtolower adds a NUL byte at the end. That will provide
225 space for the sentinel byte dfaexec may add. */
226 char *case_buf = mbtolower (buf, &size);
227 if (start_ptr)
228 start_ptr = case_buf + (start_ptr - buf);
229 buf = case_buf;
230 }
231 }
232
233 mb_start = buf;
234 buflim = buf + size;
235
236 for (beg = end = buf; end < buflim; beg = end)
237 {
238 if (!start_ptr)
239 {
240 /* We don't care about an exact match. */
241 if (kwset)
242 {
243 /* Find a possible match using the KWset matcher. */
244 size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
245 if (offset == (size_t) -1)
246 goto failure;
247 beg += offset;
248 /* Narrow down to the line containing the candidate, and
249 run it through DFA. */
250 if ((end = memchr(beg, eol, buflim - beg)) != NULL)
251 end++;
252 else
253 end = buflim;
254 match = beg;
255 while (beg > buf && beg[-1] != eol)
256 --beg;
257 if (kwsm.index < kwset_exact_matches)
258 {
259 if (!MBS_SUPPORT)
260 goto success;
261
262 if (mb_start < beg)
263 mb_start = beg;
264 if (MB_CUR_MAX == 1
265 || !is_mb_middle (&mb_start, match, buflim,
266 kwsm.size[0]))
267 goto success;
268 }
269 if (dfaexec (dfa, beg, (char *) end, 0, NULL, &backref) == NULL)
270 continue;
271 }
272 else
273 {
274 /* No good fixed strings; start with DFA. */
275 char const *next_beg = dfaexec (dfa, beg, (char *) buflim,
276 0, NULL, &backref);
277 if (next_beg == NULL)
278 break;
279 /* Narrow down to the line we've found. */
280 beg = next_beg;
281 if ((end = memchr(beg, eol, buflim - beg)) != NULL)
282 end++;
283 else
284 end = buflim;
285 while (beg > buf && beg[-1] != eol)
286 --beg;
287 }
288 /* Successful, no backreferences encountered! */
289 if (!backref)
290 goto success;
291 }
292 else
293 {
294 /* We are looking for the leftmost (then longest) exact match.
295 We will go through the outer loop only once. */
296 beg = start_ptr;
297 end = buflim;
298 }
299
300 /* If the "line" is longer than the maximum regexp offset,
301 die as if we've run out of memory. */
302 if (TYPE_MAXIMUM (regoff_t) < end - buf - 1)
303 xalloc_die ();
304
305 /* If we've made it to this point, this means DFA has seen
306 a probable match, and we need to run it through Regex. */
307 best_match = end;
308 best_len = 0;
309 for (i = 0; i < pcount; i++)
310 {
311 patterns[i].regexbuf.not_eol = 0;
312 start = re_search (&(patterns[i].regexbuf),
313 buf, end - buf - 1,
314 beg - buf, end - beg - 1,
315 &(patterns[i].regs));
316 if (start < -1)
317 xalloc_die ();
318 else if (0 <= start)
319 {
320 len = patterns[i].regs.end[0] - start;
321 match = buf + start;
322 if (match > best_match)
323 continue;
324 if (start_ptr && !match_words)
325 goto assess_pattern_match;
326 if ((!match_lines && !match_words)
327 || (match_lines && len == end - beg - 1))
328 {
329 match = beg;
330 len = end - beg;
331 goto assess_pattern_match;
332 }
333 /* If -w, check if the match aligns with word boundaries.
334 We do this iteratively because:
335 (a) the line may contain more than one occurrence of the
336 pattern, and
337 (b) Several alternatives in the pattern might be valid at a
338 given point, and we may need to consider a shorter one to
339 find a word boundary. */
340 if (match_words)
341 while (match <= best_match)
342 {
343 if ((match == buf || !WCHAR ((unsigned char) match[-1]))
344 && (start + len == end - buf - 1
345 || !WCHAR ((unsigned char) match[len])))
346 goto assess_pattern_match;
347 if (len > 0)
348 {
349 /* Try a shorter length anchored at the same place. */
350 --len;
351 patterns[i].regexbuf.not_eol = 1;
352 len = re_match (&(patterns[i].regexbuf),
353 buf, match + len - beg, match - buf,
354 &(patterns[i].regs));
355 if (len < -1)
356 xalloc_die ();
357 }
358 if (len <= 0)
359 {
360 /* Try looking further on. */
361 if (match == end - 1)
362 break;
363 match++;
364 patterns[i].regexbuf.not_eol = 0;
365 start = re_search (&(patterns[i].regexbuf),
366 buf, end - buf - 1,
367 match - buf, end - match - 1,
368 &(patterns[i].regs));
369 if (start < 0)
370 {
371 if (start < -1)
372 xalloc_die ();
373 break;
374 }
375 len = patterns[i].regs.end[0] - start;
376 match = buf + start;
377 }
378 } /* while (match <= best_match) */
379 continue;
380 assess_pattern_match:
381 if (!start_ptr)
382 {
383 /* Good enough for a non-exact match.
384 No need to look at further patterns, if any. */
385 goto success;
386 }
387 if (match < best_match || (match == best_match && len > best_len))
388 {
389 /* Best exact match: leftmost, then longest. */
390 best_match = match;
391 best_len = len;
392 }
393 } /* if re_search >= 0 */
394 } /* for Regex patterns. */
395 if (best_match < end)
396 {
397 /* We have found an exact match. We were just
398 waiting for the best one (leftmost then longest). */
399 beg = best_match;
400 len = best_len;
401 goto success_in_len;
402 }
403 } /* for (beg = end ..) */
404
405 failure:
406 ret_val = -1;
407 goto out;
408
409 success:
410 len = end - beg;
411 success_in_len:
412 *match_size = len;
413 ret_val = beg - buf;
414 out:
415 return ret_val;
416}
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette