summaryrefslogtreecommitdiff
path: root/convert.c
diff options
context:
space:
mode:
authorLars Schneider <larsxschneider@gmail.com>2018-04-15 18:16:10 (GMT)
committerJunio C Hamano <gitster@pobox.com>2018-04-16 02:40:56 (GMT)
commite92d6225361eba5ff34696122d1491dc7ace2a5a (patch)
treedcb3ac5d5b153ce1052c139c52cacfc4a0706b0e /convert.c
parent541d059cd903bb8e510f876ea2bc33719b76b33c (diff)
downloadgit-e92d6225361eba5ff34696122d1491dc7ace2a5a.zip
git-e92d6225361eba5ff34696122d1491dc7ace2a5a.tar.gz
git-e92d6225361eba5ff34696122d1491dc7ace2a5a.tar.bz2
convert: add round trip check based on 'core.checkRoundtripEncoding'
UTF supports lossless conversion round tripping and conversions between UTF and other encodings are mostly round trip safe as Unicode aims to be a superset of all other character encodings. However, certain encodings (e.g. SHIFT-JIS) are known to have round trip issues [1]. Add 'core.checkRoundtripEncoding', which contains a comma separated list of encodings, to define for what encodings Git should check the conversion round trip if they are used in the 'working-tree-encoding' attribute. Set SHIFT-JIS as default value for 'core.checkRoundtripEncoding'. [1] https://support.microsoft.com/en-us/help/170559/prb-conversion-problem-between-shift-jis-and-unicode Signed-off-by: Lars Schneider <larsxschneider@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
Diffstat (limited to 'convert.c')
-rw-r--r--convert.c77
1 files changed, 77 insertions, 0 deletions
diff --git a/convert.c b/convert.c
index bc35c33..1ae6301 100644
--- a/convert.c
+++ b/convert.c
@@ -347,6 +347,42 @@ static void trace_encoding(const char *context, const char *path,
strbuf_release(&trace);
}
+static int check_roundtrip(const char *enc_name)
+{
+ /*
+ * check_roundtrip_encoding contains a string of comma and/or
+ * space separated encodings (eg. "UTF-16, ASCII, CP1125").
+ * Search for the given encoding in that string.
+ */
+ const char *found = strcasestr(check_roundtrip_encoding, enc_name);
+ const char *next;
+ int len;
+ if (!found)
+ return 0;
+ next = found + strlen(enc_name);
+ len = strlen(check_roundtrip_encoding);
+ return (found && (
+ /*
+ * check that the found encoding is at the
+ * beginning of check_roundtrip_encoding or
+ * that it is prefixed with a space or comma
+ */
+ found == check_roundtrip_encoding || (
+ (isspace(found[-1]) || found[-1] == ',')
+ )
+ ) && (
+ /*
+ * check that the found encoding is at the
+ * end of check_roundtrip_encoding or
+ * that it is suffixed with a space or comma
+ */
+ next == check_roundtrip_encoding + len || (
+ next < check_roundtrip_encoding + len &&
+ (isspace(next[0]) || next[0] == ',')
+ )
+ ));
+}
+
static const char *default_encoding = "UTF-8";
static int encode_to_git(const char *path, const char *src, size_t src_len,
@@ -395,6 +431,47 @@ static int encode_to_git(const char *path, const char *src, size_t src_len,
}
trace_encoding("destination", path, default_encoding, dst, dst_len);
+ /*
+ * UTF supports lossless conversion round tripping [1] and conversions
+ * between UTF and other encodings are mostly round trip safe as
+ * Unicode aims to be a superset of all other character encodings.
+ * However, certain encodings (e.g. SHIFT-JIS) are known to have round
+ * trip issues [2]. Check the round trip conversion for all encodings
+ * listed in core.checkRoundtripEncoding.
+ *
+ * The round trip check is only performed if content is written to Git.
+ * This ensures that no information is lost during conversion to/from
+ * the internal UTF-8 representation.
+ *
+ * Please note, the code below is not tested because I was not able to
+ * generate a faulty round trip without an iconv error. Iconv errors
+ * are already caught above.
+ *
+ * [1] http://unicode.org/faq/utf_bom.html#gen2
+ * [2] https://support.microsoft.com/en-us/help/170559/prb-conversion-problem-between-shift-jis-and-unicode
+ */
+ if (die_on_error && check_roundtrip(enc)) {
+ char *re_src;
+ int re_src_len;
+
+ re_src = reencode_string_len(dst, dst_len,
+ enc, default_encoding,
+ &re_src_len);
+
+ trace_printf("Checking roundtrip encoding for %s...\n", enc);
+ trace_encoding("reencoded source", path, enc,
+ re_src, re_src_len);
+
+ if (!re_src || src_len != re_src_len ||
+ memcmp(src, re_src, src_len)) {
+ const char* msg = _("encoding '%s' from %s to %s and "
+ "back is not the same");
+ die(msg, path, enc, default_encoding);
+ }
+
+ free(re_src);
+ }
+
strbuf_attach(buf, dst, dst_len, dst_len + 1);
return 1;
}