# Help detect how Unicode NFC and NFD are handled on the filesystem.
#
# This file only defines variables and registers lazy prerequisites; it
# is meant to be sourced by a test script after the test library that
# provides test_lazy_prereq and test_have_prereq.
#
# NOTE: the byte sequences below are produced with octal escapes.
# POSIX printf only specifies "\ddd" octal escapes; "\xHH" is an
# extension that some /bin/sh implementations (e.g. dash) print
# literally.  The hex values are kept in the comments for reference.

# A simple character that has a NFD form.
#
# NFC:       U+00e9 LATIN SMALL LETTER E WITH ACUTE
# UTF8(NFC): \xc3 \xa9
#
# NFD:       U+0065 LATIN SMALL LETTER E
#            U+0301 COMBINING ACUTE ACCENT
# UTF8(NFD): \x65  +  \xcc \x81
#
utf8_nfc=$(printf '\303\251')
utf8_nfd=$(printf '\145\314\201')

# Is the OS or the filesystem "Unicode composition sensitive"?
#
# That is, does the OS or the filesystem allow files to exist with
# both the NFC and NFD spellings?  Or, does the OS/FS lie to us and
# tell us that the NFC and NFD forms are equivalent.
#
# This is or may be independent of what type of filesystem we have,
# since it might be handled by the OS at a layer above the FS.
# Testing shows on MacOS using APFS, HFS+, and FAT32 reports a
# collision, for example.
#
# This does not tell us how the Unicode pathname will be spelled
# on disk, but rather only that the two spellings "collide".  We
# will examine the actual on disk spelling in a later prereq.
#
# The second mkdir fails when the two spellings are treated as the
# same name, so the prerequisite is only satisfied when both succeed.
#
test_lazy_prereq UNICODE_COMPOSITION_SENSITIVE '
	mkdir trial_${utf8_nfc} &&
	mkdir trial_${utf8_nfd}
'

# Is the spelling of an NFC pathname preserved on disk?
#
# On MacOS with HFS+ and FAT32, NFC paths are converted into NFD
# and on APFS, NFC paths are preserved.  As we have established
# above, this is independent of "composition sensitivity".
#
# The grep pattern is the hex spelling of "c_" (63 5f) followed by
# the NFC bytes.  (Assumes "test-tool hexdump" prints lowercase,
# space-separated hex bytes -- confirm against the tool.)
#
test_lazy_prereq UNICODE_NFC_PRESERVED '
	mkdir c_${utf8_nfc} &&
	ls | test-tool hexdump >dump &&
	grep "63 5f c3 a9" dump
'

# Is the spelling of an NFD pathname preserved on disk?
#
# The grep pattern is the hex spelling of "d_" (64 5f) followed by
# the NFD bytes.
#
test_lazy_prereq UNICODE_NFD_PRESERVED '
	mkdir d_${utf8_nfd} &&
	ls | test-tool hexdump >dump &&
	grep "64 5f 65 cc 81" dump
'

# The following _DOUBLE_ forms are more for my curiosity,
# but there may be quirks lurking when there are multiple
# combining characters in non-canonical order.

# Unicode also allows multiple combining characters
# that can be decomposed in pieces.
#
# NFC:        U+1f67 GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI
# UTF8(NFC):  \xe1 \xbd \xa7
#
# NFD1:       U+1f61 GREEK SMALL LETTER OMEGA WITH DASIA
#             U+0342 COMBINING GREEK PERISPOMENI
# UTF8(NFD1): \xe1 \xbd \xa1  +  \xcd \x82
#
# But U+1f61 decomposes into
# NFD2:       U+03c9 GREEK SMALL LETTER OMEGA
#             U+0314 COMBINING REVERSED COMMA ABOVE
# UTF8(NFD2): \xcf \x89  +  \xcc \x94
#
# Yielding:   \xcf \x89  +  \xcc \x94  +  \xcd \x82
#
# Note that I've used the canonical ordering of the
# combining characters.  It is also possible to
# swap them.  My testing shows that the non-standard
# ordering also causes a collision in mkdir.  However,
# the resulting names don't draw correctly on the
# terminal (implying that the on-disk format also has
# them out of order).
#
greek_nfc=$(printf '\341\275\247')
greek_nfd1=$(printf '\341\275\241\315\202')
greek_nfd2=$(printf '\317\211\314\224\315\202')

# See if a double decomposition also collides.
#
test_lazy_prereq UNICODE_DOUBLE_COMPOSITION_SENSITIVE '
	mkdir trial_${greek_nfc} &&
	mkdir trial_${greek_nfd2}
'

# See if the NFC spelling appears on the disk.
#
test_lazy_prereq UNICODE_DOUBLE_NFC_PRESERVED '
	mkdir c_${greek_nfc} &&
	ls | test-tool hexdump >dump &&
	grep "63 5f e1 bd a7" dump
'

# See if the NFD spelling appears on the disk.
#
test_lazy_prereq UNICODE_DOUBLE_NFD_PRESERVED '
	mkdir d_${greek_nfd2} &&
	ls | test-tool hexdump >dump &&
	grep "64 5f cf 89 cc 94 cd 82" dump
'

# The following is for debugging.  I found it useful when
# trying to understand the various (OS, FS) quirks WRT
# Unicode and how composition/decomposition is handled.
# For example, when trying to understand how (macOS, APFS)
# and (macOS, HFS) and (macOS, FAT32) compare.
#
# It is rather noisy, so it is disabled by default.  Set
# unicode_debug=true before sourcing this file to enable it.
#
if test "$unicode_debug" = "true"
then
	if test_have_prereq UNICODE_COMPOSITION_SENSITIVE
	then
		echo NFC and NFD are distinct on this OS/filesystem.
	else
		echo NFC and NFD are aliases on this OS/filesystem.
	fi

	if test_have_prereq UNICODE_NFC_PRESERVED
	then
		echo NFC maintains original spelling.
	else
		echo NFC is modified.
	fi

	if test_have_prereq UNICODE_NFD_PRESERVED
	then
		echo NFD maintains original spelling.
	else
		echo NFD is modified.
	fi

	if test_have_prereq UNICODE_DOUBLE_COMPOSITION_SENSITIVE
	then
		echo DOUBLE NFC and NFD are distinct on this OS/filesystem.
	else
		echo DOUBLE NFC and NFD are aliases on this OS/filesystem.
	fi

	if test_have_prereq UNICODE_DOUBLE_NFC_PRESERVED
	then
		echo Double NFC maintains original spelling.
	else
		echo Double NFC is modified.
	fi

	if test_have_prereq UNICODE_DOUBLE_NFD_PRESERVED
	then
		echo Double NFD maintains original spelling.
	else
		echo Double NFD is modified.
	fi
fi