summaryrefslogtreecommitdiff
path: root/t/lib-unicode-nfc-nfd.sh
blob: 22232247efc34d28804311f44ead4661b9ad2afd (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# Help detect how Unicode NFC and NFD are handled on the filesystem.
 
# A simple character that has a NFD form.
#
# NFC:       U+00e9 LATIN SMALL LETTER E WITH ACUTE
# UTF8(NFC): \xc3 \xa9
#
# NFD:       U+0065 LATIN SMALL LETTER E
#            U+0301 COMBINING ACUTE ACCENT
# UTF8(NFD): \x65  +  \xcc \x81
#
# The bytes are written as octal escapes below because POSIX printf
# only specifies the "\ddd" form; "\x" hex escapes are an extension
# that some printf builtins (e.g. dash) do not implement.
#
utf8_nfc=$(printf "\303\251")
utf8_nfd=$(printf "\145\314\201")
 
# Is the OS or the filesystem "Unicode composition sensitive"?
#
# That is, does the OS or the filesystem allow files to exist with
# both the NFC and NFD spellings?  Or, does the OS/FS lie to us and
# tell us that the NFC and NFD forms are equivalent?
#
# This is or may be independent of what type of filesystem we have,
# since it might be handled by the OS at a layer above the FS.
# For example, testing on MacOS shows that APFS, HFS+, and FAT32
# all report a collision.
#
# This does not tell us how the Unicode pathname will be spelled
# on disk, but rather only that the two spellings "collide".  We
# will examine the actual on disk spelling in a later prereq.
#
# The script succeeds only when both mkdir calls succeed, i.e. when
# creating the NFD spelling does not collide with the NFC spelling.
#
test_lazy_prereq UNICODE_COMPOSITION_SENSITIVE '
	mkdir "trial_${utf8_nfc}" &&
	mkdir "trial_${utf8_nfd}"
'
 
# Is the spelling of an NFC pathname preserved on disk?
#
# On MacOS with HFS+ and FAT32, NFC paths are converted into NFD,
# and on APFS, NFC paths are preserved.  As we have established
# above, this is independent of "composition sensitivity".
#
# The grep pattern is the hex spelling of "c_" (63 5f) followed by
# the UTF-8 bytes of the NFC form (c3 a9); it only matches when the
# directory listing reports the name exactly as we created it.
#
test_lazy_prereq UNICODE_NFC_PRESERVED '
	mkdir "c_${utf8_nfc}" &&
	ls | test-tool hexdump >dump &&
	grep "63 5f c3 a9" dump
'
 
# Is the spelling of an NFD pathname preserved on disk?
#
# The grep pattern is the hex spelling of "d_" (64 5f) followed by
# the UTF-8 bytes of the NFD form (65 cc 81); it only matches when
# the directory listing reports the name exactly as we created it.
#
test_lazy_prereq UNICODE_NFD_PRESERVED '
	mkdir "d_${utf8_nfd}" &&
	ls | test-tool hexdump >dump &&
	grep "64 5f 65 cc 81" dump
'
 
# The following _DOUBLE_ forms are more for my curiosity,
# but there may be quirks lurking when there are multiple
# combining characters in non-canonical order.
 
# Unicode also allows multiple combining characters
# that can be decomposed in pieces.
#
# NFC:        U+1f67 GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI
# UTF8(NFC):  \xe1 \xbd \xa7
#
# NFD1:       U+1f61 GREEK SMALL LETTER OMEGA WITH DASIA
#             U+0342 COMBINING GREEK PERISPOMENI
# UTF8(NFD1): \xe1 \xbd \xa1  +  \xcd \x82
#
# But U+1f61 decomposes into
# NFD2:       U+03c9 GREEK SMALL LETTER OMEGA
#             U+0314 COMBINING REVERSED COMMA ABOVE
# UTF8(NFD2): \xcf \x89  +  \xcc \x94
#
# Yielding:   \xcf \x89  +  \xcc \x94  +  \xcd \x82
#
# Note that I've used the canonical ordering of the
# combining characters.  It is also possible to
# swap them.  My testing shows that that non-standard
# ordering also causes a collision in mkdir.  However,
# the resulting names don't draw correctly on the
# terminal (implying that the on-disk format also has
# them out of order).
#
# The bytes are written as octal escapes below because POSIX printf
# only specifies the "\ddd" form; "\x" hex escapes are an extension
# that some printf builtins (e.g. dash) do not implement.
#
greek_nfc=$(printf "\341\275\247")
greek_nfd1=$(printf "\341\275\241\315\202")
greek_nfd2=$(printf "\317\211\314\224\315\202")
 
# See if a double decomposition also collides.
#
# The script succeeds only when both mkdir calls succeed, i.e. when
# the fully decomposed (NFD2) spelling does not collide with the
# NFC spelling.
#
test_lazy_prereq UNICODE_DOUBLE_COMPOSITION_SENSITIVE '
	mkdir "trial_${greek_nfc}" &&
	mkdir "trial_${greek_nfd2}"
'
 
# See if the NFC spelling appears on the disk.
#
# The grep pattern is the hex spelling of "c_" (63 5f) followed by
# the UTF-8 bytes of the NFC form (e1 bd a7).
#
test_lazy_prereq UNICODE_DOUBLE_NFC_PRESERVED '
	mkdir "c_${greek_nfc}" &&
	ls | test-tool hexdump >dump &&
	grep "63 5f e1 bd a7" dump
'
 
# See if the NFD spelling appears on the disk.
#
# The grep pattern is the hex spelling of "d_" (64 5f) followed by
# the UTF-8 bytes of the fully decomposed form (cf 89 cc 94 cd 82).
#
test_lazy_prereq UNICODE_DOUBLE_NFD_PRESERVED '
	mkdir "d_${greek_nfd2}" &&
	ls | test-tool hexdump >dump &&
	grep "64 5f cf 89 cc 94 cd 82" dump
'
 
# Optional debugging report.  It was useful when trying to
# understand the various (OS, FS) quirks WRT Unicode and how
# composition/decomposition is handled -- for example, when
# comparing (macOS, APFS), (macOS, HFS) and (macOS, FAT32).
#
# The report is rather noisy, so it is only printed when
# $unicode_debug is exactly "true".
#
case "$unicode_debug" in
true)
	# Single combining character: collision, then on-disk spelling.
	if test_have_prereq UNICODE_COMPOSITION_SENSITIVE
	then
		echo "NFC and NFD are distinct on this OS/filesystem."
	else
		echo "NFC and NFD are aliases on this OS/filesystem."
	fi

	if test_have_prereq UNICODE_NFC_PRESERVED
	then
		echo "NFC maintains original spelling."
	else
		echo "NFC is modified."
	fi

	if test_have_prereq UNICODE_NFD_PRESERVED
	then
		echo "NFD maintains original spelling."
	else
		echo "NFD is modified."
	fi

	# Two combining characters: collision, then on-disk spelling.
	if test_have_prereq UNICODE_DOUBLE_COMPOSITION_SENSITIVE
	then
		echo "DOUBLE NFC and NFD are distinct on this OS/filesystem."
	else
		echo "DOUBLE NFC and NFD are aliases on this OS/filesystem."
	fi

	if test_have_prereq UNICODE_DOUBLE_NFC_PRESERVED
	then
		echo "Double NFC maintains original spelling."
	else
		echo "Double NFC is modified."
	fi

	if test_have_prereq UNICODE_DOUBLE_NFD_PRESERVED
	then
		echo "Double NFD maintains original spelling."
	else
		echo "Double NFD is modified."
	fi
	;;
esac