summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--sha1-lookup.c47
-rw-r--r--t/lib-pack.sh78
-rwxr-xr-xt/t5308-pack-detect-duplicates.sh73
3 files changed, 198 insertions, 0 deletions
diff --git a/sha1-lookup.c b/sha1-lookup.c
index c4dc55d..2dd8515 100644
--- a/sha1-lookup.c
+++ b/sha1-lookup.c
@@ -204,7 +204,54 @@ int sha1_entry_pos(const void *table,
* byte 0 thru (ofs-1) are the same between
* lo and hi; ofs is the first byte that is
* different.
+ *
+ * If ofs==20, then no bytes are different,
+ * meaning we have entries with duplicate
+ * keys. We know that we are in a solid run
+ * of this entry (because the entries are
+ * sorted, and our lo and hi are the same,
+ * there can be nothing but this single key
+ * in between). So we can stop the search.
+ * Either one of these entries is it (and
+ * we do not care which), or we do not have
+ * it.
+ *
+ * Furthermore, we know that one of our
+ * endpoints must be the edge of the run of
+ * duplicates. For example, given this
+ * sequence:
+ *
+ * idx 0 1 2 3 4 5
+ * key A C C C C D
+ *
+ * If we are searching for "B", we might
+ * hit the duplicate run at lo=1, hi=3
+ * (e.g., by first mi=3, then mi=0). But we
+ * can never have lo > 1, because B < C.
+ * That is, if our key is less than the
+ * run, we know that "lo" is the edge, but
+ * we can say nothing of "hi". Similarly,
+ * if our key is greater than the run, we
+ * know that "hi" is the edge, but we can
+ * say nothing of "lo".
+ *
+ * Therefore if we do not find it, we also
+ * know where it would go if it did exist:
+ * just on the far side of the edge that we
+ * know about.
*/
+ if (ofs == 20) {
+ mi = lo;
+ mi_key = base + elem_size * mi + key_offset;
+ cmp = memcmp(mi_key, key, 20);
+ if (!cmp)
+ return mi;
+ if (cmp < 0)
+ return -1 - hi;
+ else
+ return -1 - lo;
+ }
+
hiv = hi_key[ofs_0];
if (ofs_0 < 19)
hiv = (hiv << 8) | hi_key[ofs_0+1];
diff --git a/t/lib-pack.sh b/t/lib-pack.sh
new file mode 100644
index 0000000..fecd5a0
--- /dev/null
+++ b/t/lib-pack.sh
@@ -0,0 +1,78 @@
+#!/bin/sh
+#
+# Support routines for hand-crafting weird or malicious packs.
+#
+# You can make a complete pack like:
+#
+# pack_header 2 >foo.pack &&
+# pack_obj e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 >>foo.pack &&
+# pack_obj e68fe8129b546b101aee9510c5328e7f21ca1d18 >>foo.pack &&
+# pack_trailer foo.pack
+
+# Print the big-endian 4-byte octal representation of $1
+uint32_octal () {
+ n=$1
+ printf '\%o' $(($n / 16777216)); n=$((n % 16777216))
+ printf '\%o' $(($n / 65536)); n=$((n % 65536))
+ printf '\%o' $(($n / 256)); n=$((n % 256))
+ printf '\%o' $(($n ));
+}
+
+# Print the big-endian 4-byte binary representation of $1
+uint32_binary () {
+ printf "$(uint32_octal "$1")"
+}
+
+# Print a pack header, version 2, for a pack with $1 objects
+pack_header () {
+ printf 'PACK' &&
+ printf '\0\0\0\2' &&
+ uint32_binary "$1"
+}
+
+# Print the pack data for object $1, as a delta against object $2 (or as a full
+# object if $2 is missing or empty). The output is suitable for including
+# directly in the packfile, and represents the entirety of the object entry.
+# Doing this on the fly (especially picking your deltas) is quite tricky, so we
+# have hardcoded some well-known objects. See the case statements below for the
+# complete list.
+pack_obj () {
+ case "$1" in
+ # empty blob
+ e69de29bb2d1d6434b8b29ae775ad8c2e48c5391)
+ case "$2" in
+ '')
+ printf '\060\170\234\003\0\0\0\0\1'
+ return
+ ;;
+ esac
+ ;;
+
+ # blob containing "\7\76"
+ e68fe8129b546b101aee9510c5328e7f21ca1d18)
+ case "$2" in
+ '')
+ printf '\062\170\234\143\267\3\0\0\116\0\106'
+ return
+ ;;
+ esac
+ ;;
+ esac
+
+ echo >&2 "BUG: don't know how to print $1${2:+ (from $2)}"
+ return 1
+}
+
+# Compute and append pack trailer to "$1"
+pack_trailer () {
+ test-sha1 -b <"$1" >trailer.tmp &&
+ cat trailer.tmp >>"$1" &&
+ rm -f trailer.tmp
+}
+
+# Remove any existing packs to make sure that
+# whatever we index next will be the pack that we
+# actually use.
+clear_packs () {
+ rm -f .git/objects/pack/*
+}
diff --git a/t/t5308-pack-detect-duplicates.sh b/t/t5308-pack-detect-duplicates.sh
new file mode 100755
index 0000000..04fe242
--- /dev/null
+++ b/t/t5308-pack-detect-duplicates.sh
@@ -0,0 +1,73 @@
+#!/bin/sh
+
+test_description='handling of duplicate objects in incoming packfiles'
+. ./test-lib.sh
+. "$TEST_DIRECTORY"/lib-pack.sh
+
+# The sha1s we have in our pack. It's important that these have the same
+# starting byte, so that they end up in the same fanout section of the index.
+# That lets us make sure we are exercising the binary search with both sets.
+LO_SHA1=e68fe8129b546b101aee9510c5328e7f21ca1d18
+HI_SHA1=e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
+
+# And here's a "missing sha1" which will produce failed lookups. It must also
+# be in the same fanout section, and should be between the two (so that during
+# our binary search, we are sure to end up looking at one or the other of the
+# duplicate runs).
+MISSING_SHA1='e69d000000000000000000000000000000000000'
+
+# git will never intentionally create packfiles with
+# duplicate objects, so we have to construct them by hand.
+#
+# $1 is the name of the packfile to create
+#
+# $2 is the number of times to duplicate each object
+create_pack () {
+ pack_header "$((2 * $2))" >"$1" &&
+ for i in $(test_seq 1 "$2"); do
+ pack_obj $LO_SHA1 &&
+ pack_obj $HI_SHA1
+ done >>"$1" &&
+ pack_trailer "$1"
+}
+
+# double-check that create_pack actually works
+test_expect_success 'pack with no duplicates' '
+ create_pack no-dups.pack 1 &&
+ git index-pack --stdin <no-dups.pack
+'
+
+test_expect_success 'index-pack will allow duplicate objects by default' '
+ clear_packs &&
+ create_pack dups.pack 100 &&
+ git index-pack --stdin <dups.pack
+'
+
+test_expect_success 'create batch-check test vectors' '
+ cat >input <<-EOF &&
+ $LO_SHA1
+ $HI_SHA1
+ $MISSING_SHA1
+ EOF
+ cat >expect <<-EOF
+ $LO_SHA1 blob 2
+ $HI_SHA1 blob 0
+ $MISSING_SHA1 missing
+ EOF
+'
+
+test_expect_success 'lookup in duplicated pack (binary search)' '
+ git cat-file --batch-check <input >actual &&
+ test_cmp expect actual
+'
+
+test_expect_success 'lookup in duplicated pack (GIT_USE_LOOKUP)' '
+ (
+ GIT_USE_LOOKUP=1 &&
+ export GIT_USE_LOOKUP &&
+ git cat-file --batch-check <input >actual
+ ) &&
+ test_cmp expect actual
+'
+
+test_done