summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJunio C Hamano <gitster@pobox.com>2019-06-13 20:19:39 (GMT)
committerJunio C Hamano <gitster@pobox.com>2019-06-13 20:19:40 (GMT)
commit66dc7b68e4a332c662980b0266b9a1cb545f40ff (patch)
treeb0a77642e65e7dbb8eff9e6ed2b4c9b065e0c144
parentc0e78f7e4687e7bd5ff0b83974b28c1cc81c5487 (diff)
parente80001f8fd7f608c6d05ce0ab2ebfc90d724a307 (diff)
downloadgit-66dc7b68e4a332c662980b0266b9a1cb545f40ff.zip
git-66dc7b68e4a332c662980b0266b9a1cb545f40ff.tar.gz
git-66dc7b68e4a332c662980b0266b9a1cb545f40ff.tar.bz2
Merge branch 'en/fast-export-encoding'
The "git fast-export/import" pair has been taught to handle commits with log messages in encoding other than UTF-8 better. * en/fast-export-encoding: fast-export: do automatic reencoding of commit messages only if requested fast-export: differentiate between explicitly UTF-8 and implicitly UTF-8 fast-export: avoid stripping encoding header if we cannot reencode fast-import: support 'encoding' commit header t9350: fix encoding test to actually test reencoding
-rw-r--r--Documentation/git-fast-export.txt7
-rw-r--r--Documentation/git-fast-import.txt7
-rw-r--r--builtin/fast-export.c55
-rw-r--r--fast-import.c11
-rwxr-xr-xt/t9300-fast-import.sh20
-rwxr-xr-xt/t9350-fast-export.sh78
-rw-r--r--t/t9350/broken-iso-8859-7-commit-message.txt1
-rw-r--r--t/t9350/simple-iso-8859-7-commit-message.txt1
8 files changed, 163 insertions, 17 deletions
diff --git a/Documentation/git-fast-export.txt b/Documentation/git-fast-export.txt
index 64c01ba..11427ac 100644
--- a/Documentation/git-fast-export.txt
+++ b/Documentation/git-fast-export.txt
@@ -129,6 +129,13 @@ marks the same across runs.
for intermediary filters (e.g. for rewriting commit messages
which refer to older commits, or for stripping blobs by id).
+--reencode=(yes|no|abort)::
+ Specify how to handle `encoding` header in commit objects. When
+ asking to 'abort' (which is the default), this program will die
+ when encountering such a commit object. With 'yes', the commit
+ message will be reencoded into UTF-8. With 'no', the original
+ encoding will be preserved.
+
--refspec::
Apply the specified refspec to each ref exported. Multiple of them can
be specified.
diff --git a/Documentation/git-fast-import.txt b/Documentation/git-fast-import.txt
index d65cdb3..7baf9e4 100644
--- a/Documentation/git-fast-import.txt
+++ b/Documentation/git-fast-import.txt
@@ -388,6 +388,7 @@ change to the project.
original-oid?
('author' (SP <name>)? SP LT <email> GT SP <when> LF)?
'committer' (SP <name>)? SP LT <email> GT SP <when> LF
+ ('encoding' SP <encoding>)?
data
('from' SP <commit-ish> LF)?
('merge' SP <commit-ish> LF)?
@@ -455,6 +456,12 @@ that was selected by the --date-format=<fmt> command-line option.
See ``Date Formats'' above for the set of supported formats, and
their syntax.
+`encoding`
+^^^^^^^^^^
+The optional `encoding` command indicates the encoding of the commit
+message. Most commits are UTF-8 and the encoding is omitted, but this
+allows importing commit messages into git without first reencoding them.
+
`from`
^^^^^^
The `from` command is used to specify the commit to initialize
diff --git a/builtin/fast-export.c b/builtin/fast-export.c
index 9e28348..c22cef3 100644
--- a/builtin/fast-export.c
+++ b/builtin/fast-export.c
@@ -33,6 +33,7 @@ static const char *fast_export_usage[] = {
static int progress;
static enum { SIGNED_TAG_ABORT, VERBATIM, WARN, WARN_STRIP, STRIP } signed_tag_mode = SIGNED_TAG_ABORT;
static enum { TAG_FILTERING_ABORT, DROP, REWRITE } tag_of_filtered_mode = TAG_FILTERING_ABORT;
+static enum { REENCODE_ABORT, REENCODE_YES, REENCODE_NO } reencode_mode = REENCODE_ABORT;
static int fake_missing_tagger;
static int use_done_feature;
static int no_data;
@@ -77,6 +78,31 @@ static int parse_opt_tag_of_filtered_mode(const struct option *opt,
return 0;
}
+static int parse_opt_reencode_mode(const struct option *opt,
+ const char *arg, int unset)
+{
+ if (unset) {
+ reencode_mode = REENCODE_ABORT;
+ return 0;
+ }
+
+ switch (git_parse_maybe_bool(arg)) {
+ case 0:
+ reencode_mode = REENCODE_NO;
+ break;
+ case 1:
+ reencode_mode = REENCODE_YES;
+ break;
+ default:
+ if (!strcasecmp(arg, "abort"))
+ reencode_mode = REENCODE_ABORT;
+ else
+ return error("Unknown reencoding mode: %s", arg);
+ }
+
+ return 0;
+}
+
static struct decoration idnums;
static uint32_t last_idnum;
@@ -453,7 +479,7 @@ static const char *find_encoding(const char *begin, const char *end)
bol = memmem(begin, end ? end - begin : strlen(begin),
needle, strlen(needle));
if (!bol)
- return git_commit_encoding;
+ return NULL;
bol += strlen(needle);
eol = strchrnul(bol, '\n');
*eol = '\0';
@@ -633,18 +659,32 @@ static void handle_commit(struct commit *commit, struct rev_info *rev,
}
mark_next_object(&commit->object);
- if (anonymize)
+ if (anonymize) {
reencoded = anonymize_commit_message(message);
- else if (!is_encoding_utf8(encoding))
- reencoded = reencode_string(message, "UTF-8", encoding);
+ } else if (encoding) {
+ switch(reencode_mode) {
+ case REENCODE_YES:
+ reencoded = reencode_string(message, "UTF-8", encoding);
+ break;
+ case REENCODE_NO:
+ break;
+ case REENCODE_ABORT:
+ die("Encountered commit-specific encoding %s in commit "
+ "%s; use --reencode=[yes|no] to handle it",
+ encoding, oid_to_hex(&commit->object.oid));
+ }
+ }
if (!commit->parents)
printf("reset %s\n", refname);
printf("commit %s\nmark :%"PRIu32"\n", refname, last_idnum);
if (show_original_ids)
printf("original-oid %s\n", oid_to_hex(&commit->object.oid));
- printf("%.*s\n%.*s\ndata %u\n%s",
+ printf("%.*s\n%.*s\n",
(int)(author_end - author), author,
- (int)(committer_end - committer), committer,
+ (int)(committer_end - committer), committer);
+ if (!reencoded && encoding)
+ printf("encoding %s\n", encoding);
+ printf("data %u\n%s",
(unsigned)(reencoded
? strlen(reencoded) : message
? strlen(message) : 0),
@@ -1088,6 +1128,9 @@ int cmd_fast_export(int argc, const char **argv, const char *prefix)
OPT_CALLBACK(0, "tag-of-filtered-object", &tag_of_filtered_mode, N_("mode"),
N_("select handling of tags that tag filtered objects"),
parse_opt_tag_of_filtered_mode),
+ OPT_CALLBACK(0, "reencode", &reencode_mode, N_("mode"),
+ N_("select handling of commit messages in an alternate encoding"),
+ parse_opt_reencode_mode),
OPT_STRING(0, "export-marks", &export_filename, N_("file"),
N_("Dump marks to this file")),
OPT_STRING(0, "import-marks", &import_filename, N_("file"),
diff --git a/fast-import.c b/fast-import.c
index f38d04f..76a7bd3 100644
--- a/fast-import.c
+++ b/fast-import.c
@@ -2585,6 +2585,7 @@ static void parse_new_commit(const char *arg)
struct branch *b;
char *author = NULL;
char *committer = NULL;
+ const char *encoding = NULL;
struct hash_list *merge_list = NULL;
unsigned int merge_count;
unsigned char prev_fanout, new_fanout;
@@ -2607,6 +2608,8 @@ static void parse_new_commit(const char *arg)
}
if (!committer)
die("Expected committer but didn't get one");
+ if (skip_prefix(command_buf.buf, "encoding ", &encoding))
+ read_next_command();
parse_data(&msg, 0, NULL);
read_next_command();
parse_from(b);
@@ -2670,9 +2673,13 @@ static void parse_new_commit(const char *arg)
}
strbuf_addf(&new_data,
"author %s\n"
- "committer %s\n"
- "\n",
+ "committer %s\n",
author ? author : committer, committer);
+ if (encoding)
+ strbuf_addf(&new_data,
+ "encoding %s\n",
+ encoding);
+ strbuf_addch(&new_data, '\n');
strbuf_addbuf(&new_data, &msg);
free(author);
free(committer);
diff --git a/t/t9300-fast-import.sh b/t/t9300-fast-import.sh
index 3668263..141b7fa 100755
--- a/t/t9300-fast-import.sh
+++ b/t/t9300-fast-import.sh
@@ -3299,4 +3299,24 @@ test_expect_success !MINGW 'W: get-mark & empty orphan commit with erroneous thi
sed -e s/LFs/LLL/ W-input | tr L "\n" | test_must_fail git fast-import
'
+###
+### series X (other new features)
+###
+
+test_expect_success 'X: handling encoding' '
+ test_tick &&
+ cat >input <<-INPUT_END &&
+ commit refs/heads/encoding
+ committer $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> $GIT_COMMITTER_DATE
+ encoding iso-8859-7
+ data <<COMMIT
+ INPUT_END
+
+ printf "Pi: \360\nCOMMIT\n" >>input &&
+
+ git fast-import <input &&
+ git cat-file -p encoding | grep $(printf "\360") &&
+ git log -1 --format=%B encoding | grep $(printf "\317\200")
+'
+
test_done
diff --git a/t/t9350-fast-export.sh b/t/t9350-fast-export.sh
index 5690fe2..b4004e0 100755
--- a/t/t9350-fast-export.sh
+++ b/t/t9350-fast-export.sh
@@ -94,22 +94,83 @@ test_expect_success 'fast-export --show-original-ids | git fast-import' '
test $MUSS = $(git rev-parse --verify refs/tags/muss)
'
-test_expect_success 'iso-8859-1' '
+test_expect_success 'reencoding iso-8859-7' '
- git config i18n.commitencoding ISO8859-1 &&
- # use author and committer name in ISO-8859-1 to match it.
- . "$TEST_DIRECTORY"/t3901/8859-1.txt &&
+ test_when_finished "git reset --hard HEAD~1" &&
+ test_config i18n.commitencoding iso-8859-7 &&
test_tick &&
echo rosten >file &&
- git commit -s -m den file &&
- git fast-export wer^..wer >iso8859-1.fi &&
- sed "s/wer/i18n/" iso8859-1.fi |
+ git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
+ git fast-export --reencode=yes wer^..wer >iso-8859-7.fi &&
+ sed "s/wer/i18n/" iso-8859-7.fi |
(cd new &&
git fast-import &&
+ # The commit object, if not re-encoded, would be 240 bytes.
+ # Removing the "encoding iso-8859-7\n" header drops 20 bytes.
+ # Re-encoding the Pi character from \xF0 (\360) in iso-8859-7
+ # to \xCF\x80 (\317\200) in UTF-8 adds a byte. Check for
+ # the expected size.
+ test 221 -eq "$(git cat-file -s i18n)" &&
+ # ...and for the expected translation of bytes.
git cat-file commit i18n >actual &&
- grep "Áéí óú" actual)
+ grep $(printf "\317\200") actual &&
+ # Also make sure the commit does not have the "encoding" header
+ ! grep ^encoding actual)
+'
+
+test_expect_success 'aborting on iso-8859-7' '
+ test_when_finished "git reset --hard HEAD~1" &&
+ test_config i18n.commitencoding iso-8859-7 &&
+ echo rosten >file &&
+ git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
+ test_must_fail git fast-export --reencode=abort wer^..wer >iso-8859-7.fi
'
+
+test_expect_success 'preserving iso-8859-7' '
+
+ test_when_finished "git reset --hard HEAD~1" &&
+ test_config i18n.commitencoding iso-8859-7 &&
+ echo rosten >file &&
+ git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
+ git fast-export --reencode=no wer^..wer >iso-8859-7.fi &&
+ sed "s/wer/i18n-no-recoding/" iso-8859-7.fi |
+ (cd new &&
+ git fast-import &&
+ # The commit object, if not re-encoded, is 240 bytes.
+ # Removing the "encoding iso-8859-7\n" header would drops 20
+ # bytes. Re-encoding the Pi character from \xF0 (\360) in
+ # iso-8859-7 to \xCF\x80 (\317\200) in UTF-8 adds a byte.
+ # Check for the expected size...
+ test 240 -eq "$(git cat-file -s i18n-no-recoding)" &&
+ # ...as well as the expected byte.
+ git cat-file commit i18n-no-recoding >actual &&
+ grep $(printf "\360") actual &&
+ # Also make sure the commit has the "encoding" header
+ grep ^encoding actual)
+'
+
+test_expect_success 'encoding preserved if reencoding fails' '
+
+ test_when_finished "git reset --hard HEAD~1" &&
+ test_config i18n.commitencoding iso-8859-7 &&
+ echo rosten >file &&
+ git commit -s -F "$TEST_DIRECTORY/t9350/broken-iso-8859-7-commit-message.txt" file &&
+ git fast-export --reencode=yes wer^..wer >iso-8859-7.fi &&
+ sed "s/wer/i18n-invalid/" iso-8859-7.fi |
+ (cd new &&
+ git fast-import &&
+ git cat-file commit i18n-invalid >actual &&
+ # Make sure the commit still has the encoding header
+ grep ^encoding actual &&
+ # Verify that the commit has the expected size; i.e.
+ # that no bytes were re-encoded to a different encoding.
+ test 252 -eq "$(git cat-file -s i18n-invalid)" &&
+ # ...and check for the original special bytes
+ grep $(printf "\360") actual &&
+ grep $(printf "\377") actual)
+'
+
test_expect_success 'import/export-marks' '
git checkout -b marks master &&
@@ -224,7 +285,6 @@ GIT_COMMITTER_NAME='C O Mitter'; export GIT_COMMITTER_NAME
test_expect_success 'setup copies' '
- git config --unset i18n.commitencoding &&
git checkout -b copy rein &&
git mv file file3 &&
git commit -m move1 &&
diff --git a/t/t9350/broken-iso-8859-7-commit-message.txt b/t/t9350/broken-iso-8859-7-commit-message.txt
new file mode 100644
index 0000000..d06ad75
--- /dev/null
+++ b/t/t9350/broken-iso-8859-7-commit-message.txt
@@ -0,0 +1 @@
+Pi: ; Invalid: \ No newline at end of file
diff --git a/t/t9350/simple-iso-8859-7-commit-message.txt b/t/t9350/simple-iso-8859-7-commit-message.txt
new file mode 100644
index 0000000..8b3f0c3
--- /dev/null
+++ b/t/t9350/simple-iso-8859-7-commit-message.txt
@@ -0,0 +1 @@
+Pi: \ No newline at end of file