summaryrefslogtreecommitdiff
path: root/utf8.c
diff options
context:
space:
mode:
author: Junio C Hamano <gitster@pobox.com>, 2014-06-06 18:29:38 (GMT)
committer: Junio C Hamano <gitster@pobox.com>, 2014-06-06 18:29:38 (GMT)
commit: 334d40e951fa3b3961135b3183633706d976c4bd (patch)
tree: 445e33f7e58e9e7e9b30be0952b6bf493ac0931c /utf8.c
parent: a0460132a740d8ff0c08dcbd54520f1b795298b9 (diff)
parent: 9c94389c3ee02df891100b894c1790a524268d91 (diff)
downloadgit-334d40e951fa3b3961135b3183633706d976c4bd.zip
git-334d40e951fa3b3961135b3183633706d976c4bd.tar.gz
git-334d40e951fa3b3961135b3183633706d976c4bd.tar.bz2
Merge branch 'tb/unicode-6.3-zero-width'
Update the logic to compute the display width needed for UTF-8 strings, and allow us to more easily maintain the tables used in that logic. In a follow-up patch, we may want to let users choose whether codepoints with ambiguous widths are treated as double or single width.

* tb/unicode-6.3-zero-width:
  - utf8: make it easier to auto-update git_wcwidth()
  - utf8.c: use a table for double_width
Diffstat (limited to 'utf8.c')
-rw-r--r--utf8.c76
1 files changed, 7 insertions, 69 deletions
diff --git a/utf8.c b/utf8.c
index 77c28d4..b30790d 100644
--- a/utf8.c
+++ b/utf8.c
@@ -80,52 +80,8 @@ static int git_wcwidth(ucs_char_t ch)
{
/*
* Sorted list of non-overlapping intervals of non-spacing characters,
- * generated by
- * "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c".
*/
- static const struct interval combining[] = {
- { 0x0300, 0x036F }, { 0x0483, 0x0489 }, { 0x0591, 0x05BD },
- { 0x05BF, 0x05BF }, { 0x05C1, 0x05C2 }, { 0x05C4, 0x05C5 },
- { 0x05C7, 0x05C7 }, { 0x0600, 0x0604 }, { 0x0610, 0x061A },
- { 0x064B, 0x065F }, { 0x0670, 0x0670 }, { 0x06D6, 0x06E4 },
- { 0x06E7, 0x06E8 }, { 0x06EA, 0x06ED }, { 0x070F, 0x070F },
- { 0x0711, 0x0711 }, { 0x0730, 0x074A }, { 0x07A6, 0x07B0 },
- { 0x0901, 0x0902 }, { 0x093C, 0x093C }, { 0x0941, 0x0948 },
- { 0x094D, 0x094D }, { 0x0951, 0x0954 }, { 0x0962, 0x0963 },
- { 0x0981, 0x0981 }, { 0x09BC, 0x09BC }, { 0x09C1, 0x09C4 },
- { 0x09CD, 0x09CD }, { 0x09E2, 0x09E3 }, { 0x0A01, 0x0A02 },
- { 0x0A3C, 0x0A3C }, { 0x0A41, 0x0A42 }, { 0x0A47, 0x0A48 },
- { 0x0A4B, 0x0A4D }, { 0x0A70, 0x0A71 }, { 0x0A81, 0x0A82 },
- { 0x0ABC, 0x0ABC }, { 0x0AC1, 0x0AC5 }, { 0x0AC7, 0x0AC8 },
- { 0x0ACD, 0x0ACD }, { 0x0AE2, 0x0AE3 }, { 0x0B01, 0x0B01 },
- { 0x0B3C, 0x0B3C }, { 0x0B3F, 0x0B3F }, { 0x0B41, 0x0B43 },
- { 0x0B4D, 0x0B4D }, { 0x0B56, 0x0B56 }, { 0x0B82, 0x0B82 },
- { 0x0BC0, 0x0BC0 }, { 0x0BCD, 0x0BCD }, { 0x0C3E, 0x0C40 },
- { 0x0C46, 0x0C48 }, { 0x0C4A, 0x0C4D }, { 0x0C55, 0x0C56 },
- { 0x0CBC, 0x0CBC }, { 0x0CBF, 0x0CBF }, { 0x0CC6, 0x0CC6 },
- { 0x0CCC, 0x0CCD }, { 0x0D41, 0x0D43 }, { 0x0D4D, 0x0D4D },
- { 0x0DCA, 0x0DCA }, { 0x0DD2, 0x0DD4 }, { 0x0DD6, 0x0DD6 },
- { 0x0E31, 0x0E31 }, { 0x0E34, 0x0E3A }, { 0x0E47, 0x0E4E },
- { 0x0EB1, 0x0EB1 }, { 0x0EB4, 0x0EB9 }, { 0x0EBB, 0x0EBC },
- { 0x0EC8, 0x0ECD }, { 0x0F18, 0x0F19 }, { 0x0F35, 0x0F35 },
- { 0x0F37, 0x0F37 }, { 0x0F39, 0x0F39 }, { 0x0F71, 0x0F7E },
- { 0x0F80, 0x0F84 }, { 0x0F86, 0x0F87 }, { 0x0F90, 0x0F97 },
- { 0x0F99, 0x0FBC }, { 0x0FC6, 0x0FC6 }, { 0x102D, 0x1030 },
- { 0x1032, 0x1032 }, { 0x1036, 0x1037 }, { 0x1039, 0x1039 },
- { 0x1058, 0x1059 }, { 0x1160, 0x11FF }, { 0x1712, 0x1714 },
- { 0x1732, 0x1734 }, { 0x1752, 0x1753 }, { 0x1772, 0x1773 },
- { 0x17B4, 0x17B5 }, { 0x17B7, 0x17BD }, { 0x17C6, 0x17C6 },
- { 0x17C9, 0x17D3 }, { 0x17DD, 0x17DD }, { 0x180B, 0x180D },
- { 0x18A9, 0x18A9 }, { 0x1920, 0x1922 }, { 0x1927, 0x1928 },
- { 0x1932, 0x1932 }, { 0x1939, 0x193B }, { 0x200B, 0x200F },
- { 0x202A, 0x202E }, { 0x2060, 0x2063 }, { 0x206A, 0x206F },
- { 0x20D0, 0x20EA }, { 0x302A, 0x302F }, { 0x3099, 0x309A },
- { 0xFB1E, 0xFB1E }, { 0xFE00, 0xFE0F }, { 0xFE20, 0xFE23 },
- { 0xFEFF, 0xFEFF }, { 0xFFF9, 0xFFFB }, { 0x1D167, 0x1D169 },
- { 0x1D173, 0x1D182 }, { 0x1D185, 0x1D18B },
- { 0x1D1AA, 0x1D1AD }, { 0xE0001, 0xE0001 },
- { 0xE0020, 0xE007F }, { 0xE0100, 0xE01EF }
- };
+#include "unicode_width.h"
/* test for 8-bit control characters */
if (ch == 0)
@@ -134,34 +90,16 @@ static int git_wcwidth(ucs_char_t ch)
return -1;
/* binary search in table of non-spacing characters */
- if (bisearch(ch, combining, sizeof(combining)
+ if (bisearch(ch, zero_width, sizeof(zero_width)
/ sizeof(struct interval) - 1))
return 0;
- /*
- * If we arrive here, ch is neither a combining nor a C0/C1
- * control character.
- */
+ /* binary search in table of double width characters */
+ if (bisearch(ch, double_width, sizeof(double_width)
+ / sizeof(struct interval) - 1))
+ return 2;
- return 1 +
- (ch >= 0x1100 &&
- /* Hangul Jamo init. consonants */
- (ch <= 0x115f ||
- ch == 0x2329 || ch == 0x232a ||
- /* CJK ... Yi */
- (ch >= 0x2e80 && ch <= 0xa4cf &&
- ch != 0x303f) ||
- /* Hangul Syllables */
- (ch >= 0xac00 && ch <= 0xd7a3) ||
- /* CJK Compatibility Ideographs */
- (ch >= 0xf900 && ch <= 0xfaff) ||
- /* CJK Compatibility Forms */
- (ch >= 0xfe30 && ch <= 0xfe6f) ||
- /* Fullwidth Forms */
- (ch >= 0xff00 && ch <= 0xff60) ||
- (ch >= 0xffe0 && ch <= 0xffe6) ||
- (ch >= 0x20000 && ch <= 0x2fffd) ||
- (ch >= 0x30000 && ch <= 0x3fffd)));
+ return 1;
}
/*