multibyte-white-space

Last change on this file was 3529, checked in by bird, 3 years ago
Imported grep 3.7 from grep-3.7.tar.gz (sha256: c22b0cf2d4f6bbe599c902387e8058990e1eee99aef333a203829e5fd3dbb342), applying minimal auto-props.
Property svn:executable set to `*`
File size: 3.5 KB

Line
#! /bin/sh
# Test whether \s matches SP and UTF-8 multi-byte white space characters.
#
# Copyright (C) 2013-2021 Free Software Foundation, Inc.
#
# Copying and distribution of this file, with or without modification,
# are permitted in any medium without royalty provided the copyright
# notice and this notice are preserved.

# Source the shared test helpers; srcdir defaults to "." when unset.
# NOTE(review): path_prepend_ presumably puts ../src first on PATH so
# the grep under test is the one that runs -- confirm in init.sh.
. "${srcdir=.}/init.sh"; path_prepend_ ../src

# NOTE(review): presumably skips the test when no en_US.UTF-8 locale
# is available -- confirm in init.sh.
require_en_utf8_locale_

# Run everything below in the UTF-8 locale.
LC_ALL=en_US.UTF-8
export LC_ALL

# It would have been nice to be able to use all UTF8 characters
# with the Unicode WSpace=Y character property,
# https://en.wikipedia.org/wiki/Whitespace_character, but that
# would currently cause distracting failures everywhere I've tried.
# Instead, I've listed each with an indicator column, telling what
# this test should do if the system's locale/tools produce the
# wrong answer.

# The values in that column:
# X required on all systems (fail if \s or \S fail to work as expected)
# x required on "modern enough" systems
# O optional: \s or \S misbehavior elicits a warning, but never failure

# Turn each table row into one word of the form <indicator>\xHH[\xHH...]:
# the first sed command deletes everything up to and including the last
# colon and any blanks after it; the second replaces each run of blanks
# with a literal "\x".  For example the first row becomes X\x09.
utf8_space_characters=$(sed 's/.*: *//;s/  */\\x/g' <<\EOF
U+0009 Horizontal Tab: X 09
U+000A Line feed: O 0a
U+000B Vertical Tab: X 0b
U+000C Form feed: X 0c
U+000D Carriage return: X 0d
U+0020 SPACE: X 20
U+0085 Next line: O 85
U+00A0 NO-BREAK SPACE: O c2 a0
U+1680 OGHAM SPACE MARK: x e1 9a 80
U+2000 EN QUAD: x e2 80 80
U+2001 EM QUAD: x e2 80 81
U+2002 EN SPACE: x e2 80 82
U+2003 EM SPACE: x e2 80 83
U+2004 THREE-PER-EM SPACE: x e2 80 84
U+2005 FOUR-PER-EM SPACE: x e2 80 85
U+2006 SIX-PER-EM SPACE: x e2 80 86
U+2007 FIGURE SPACE: O e2 80 87
U+2008 PUNCTUATION SPACE: x e2 80 88
U+2009 THIN SPACE: x e2 80 89
U+200A HAIR SPACE: x e2 80 8a
U+200B ZERO WIDTH SPACE: O e2 80 8b
U+202F NARROW NO-BREAK SPACE: O e2 80 af
U+205F MEDIUM MATHEMATICAL SPACE: x e2 81 9f
U+3000 IDEOGRAPHIC SPACE: x e3 80 80
EOF
)

# Overall test status; set to 1 by any hard failure below.
fail=0

# On systems that are not "modern enough," simply warn when an "x"-marked
# character is not classified as white space. Too many systems
# have inadequate UTF-8 tables in this respect, and that lack should not
# discourage/confuse those who consider whether to install grep.

# As for what constitutes "modern enough", I've arbitrarily started
# with "Fedora 20 or newer". Tested additions welcome.
# A missing or unreadable /etc/redhat-release simply leaves the flag at 0;
# grep's output and diagnostics are discarded on purpose.
modern_enough=0
grep -iE 'fedora release [2-9][0-9]+\b' /etc/redhat-release >/dev/null 2>&1 \
  && modern_enough=1

# Check every listed character against both \s and \S.
# $utf8_space_characters is expanded unquoted on purpose: each
# white-space-separated word is one <indicator>\xHH... entry.
for i in $utf8_space_characters; do
  # Default policy: a mismatch is a hard error, so the fail function
  # sets the fail variable.  For warning-only entries it is redefined
  # below as a no-op.
  eval 'fail() { fail=1; }'
  m=ERROR
  case $i in
    X*) ;;
    x*) test $modern_enough = 1 || { eval 'fail() { :; }'; m=warning; } ;;
    O*) m=warning; eval 'fail() { :; }' ;;
    *) warn_ "unexpected prefix: $i"; exit 1 ;;
  esac

  # Strip the prefix byte.
  i=${i#?}

  # NOTE(review): hex_printf_ presumably emits the bytes named by the
  # \xHH escapes, and returns_ 1 presumably succeeds only when the
  # command exits with status 1 -- confirm both in init.sh.
  hex_printf_ "$i" | grep -q '^\s$' \
    || { warn_ " $m: \\s failed to match $i in the $LC_ALL locale"; fail; }
  hex_printf_ "$i" | returns_ 1 grep -q '\S' \
    || { warn_ " $m: \\S mistakenly matched $i in the $LC_ALL locale"; fail; }
done


# This is a separate test, only nominally related to \s.
# It is solely to get coverage of a code path (exercising dfa.c's
# match_mb_charset function) that would have otherwise been untouched.
# However, as of the change-set adding this new test, match_mb_charset
# is unreachable via grep.
printf '\0' | returns_ 1 grep -aE '^\s?$' > out 2>&1 || fail=1
# grep must have produced no output at all (stdout and stderr are both
# captured in "out"); record a failure if it did, rather than discarding
# the result of the comparison.
compare /dev/null out || fail=1

Exit $fail

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format