#! /bin/sh
# Test whether \s matches SP and UTF-8 multi-byte white space characters.
#
# Copyright (C) 2013-2021 Free Software Foundation, Inc.
#
# Copying and distribution of this file, with or without modification,
# are permitted in any medium without royalty provided the copyright
# notice and this notice are preserved.

# Load the shared test harness from $srcdir (defaulting to the current
# directory), then call its path_prepend_ helper with ../src.
# NOTE(review): presumably this puts the freshly built grep first on PATH.
. "${srcdir=.}/init.sh"
path_prepend_ ../src

# Harness helper; presumably skips the test when no en_US.UTF-8 locale
# is available -- confirm against init.sh.
require_en_utf8_locale_

# Everything below runs in the en_US.UTF-8 locale.
LC_ALL=en_US.UTF-8; export LC_ALL

# It would have been nice to be able to use all UTF-8 characters
# with the Unicode WSpace=Y character property,
# https://en.wikipedia.org/wiki/Whitespace_character, but that
# would currently cause distracting failures everywhere I've tried.
# Instead, I've listed each with an indicator column, telling what
# this test should do if the system's locale/tools produce the
# wrong answer.

# The values in that column:
# X  required on all systems (fail if \s or \S fail to work as expected)
# x  required on "modern enough" systems
# O  optional: \s or \S misbehavior elicits a warning, but never failure

# Build the list of test cases, one blank-free word per character:
# the indicator letter immediately followed by the character's UTF-8
# bytes in \xHH notation (e.g. "O\xc2\xa0" for NO-BREAK SPACE).
#
# The first sed command removes everything through the last colon and
# the blanks after it; the second replaces each run of one or more
# blanks with "\x".  The run is written "[ ][ ]*" rather than " *" so
# that it can never match the empty string (which would sprinkle "\x"
# between every character), even if the spacing in this file is reflowed.
utf8_space_characters=$(sed 's/.*:[ ]*//;s/[ ][ ]*/\\x/g' <<\EOF
U+0009 Horizontal Tab: X 09
U+000A Line feed: O 0a
U+000B Vertical Tab: X 0b
U+000C Form feed: X 0c
U+000D Carriage return: X 0d
U+0020 SPACE: X 20
U+0085 Next line: O 85
U+00A0 NO-BREAK SPACE: O c2 a0
U+1680 OGHAM SPACE MARK: x e1 9a 80
U+2000 EN QUAD: x e2 80 80
U+2001 EM QUAD: x e2 80 81
U+2002 EN SPACE: x e2 80 82
U+2003 EM SPACE: x e2 80 83
U+2004 THREE-PER-EM SPACE: x e2 80 84
U+2005 FOUR-PER-EM SPACE: x e2 80 85
U+2006 SIX-PER-EM SPACE: x e2 80 86
U+2007 FIGURE SPACE: O e2 80 87
U+2008 PUNCTUATION SPACE: x e2 80 88
U+2009 THIN SPACE: x e2 80 89
U+200A HAIR SPACE: x e2 80 8a
U+200B ZERO WIDTH SPACE: O e2 80 8b
U+202F NARROW NO-BREAK SPACE: O e2 80 af
U+205F MEDIUM MATHEMATICAL SPACE: x e2 81 9f
U+3000 IDEOGRAPHIC SPACE: x e3 80 80
EOF
)

# Overall test result; 0 means "no failure recorded so far".
fail=0

# When the host is not "modern enough", an "x"-marked character that is
# not classified as white space only produces a warning.  Too many
# systems have inadequate UTF-8 tables in this respect, and that lack
# should not discourage or confuse those who consider whether to
# install grep.
#
# "Modern enough" is, arbitrarily, "Fedora 20 or newer", detected by
# looking at /etc/redhat-release.  Tested additions welcome.
if grep -iE 'fedora release [2-9][0-9]+\b' /etc/redhat-release \
     >/dev/null 2>&1; then
  modern_enough=1
else
  modern_enough=0
fi

# Check every listed character: \s must match it and \S must not.
# Each word is an indicator letter followed by the character's bytes;
# the letter decides whether a wrong answer is an ERROR (sets fail=1)
# or just a warning.  The expansion is deliberately unquoted so that
# the list is split into one word per character.
for i in $utf8_space_characters; do
  # Pick the severity label used in the diagnostics.
  m=ERROR
  case $i in
    X*)
      ;;
    x*)
      if test "$modern_enough" != 1; then
        m=warning
      fi
      ;;
    O*)
      m=warning
      ;;
    *)
      warn_ "unexpected prefix: $i"
      exit 1
      ;;
  esac

  # Install the handler run after each diagnostic: only ERROR-level
  # problems flip the overall result.
  # NOTE(review): the definitions go through eval as in the original;
  # presumably for shell portability -- confirm before simplifying.
  if test "$m" = ERROR; then
    eval 'fail() { fail=1; }'
  else
    eval 'fail() { :; }'
  fi

  # Strip the prefix byte.
  i=${i#?}

  # A line consisting solely of this character must match ^\s$ ...
  if hex_printf_ "$i" | grep -q '^\s$'; then
    :
  else
    warn_ " $m: \\s failed to match $i in the $LC_ALL locale"
    fail
  fi

  # ... and must contain nothing that \S matches (grep exits with 1).
  if hex_printf_ "$i" | returns_ 1 grep -q '\S'; then
    :
  else
    warn_ " $m: \\S mistakenly matched $i in the $LC_ALL locale"
    fail
  fi
done

# This is a separate test, only nominally related to \s.
# It is solely to get coverage of a code path (exercising dfa.c's
# match_mb_charset function) that would have otherwise been untouched.
# However, as of the change-set adding this new test, match_mb_charset
# is unreachable via grep.
# Feed grep a single NUL byte: '^\s?$' is expected not to match it
# (exit status 1), with stdout and stderr both captured in "out".
printf '\0' | returns_ 1 grep -aE '^\s?$' > out 2>&1 || fail=1

# "out" must be empty.  Record a failure if it is not; the status of
# this comparison used to be discarded, so unexpected output with the
# expected exit status went unnoticed.
compare /dev/null out || fail=1

Exit $fail