From 3be7efcafceeae3400cd830be89c9601b43f3716 Mon Sep 17 00:00:00 2001
From: Garima Singh <garima.singh@microsoft.com>
Date: Mon, 30 Mar 2020 00:31:23 +0000
Subject: commit-graph: define and use MAX_NUM_CHUNKS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is a minor cleanup to make it easier to change
the number of chunks being written to the commit
graph.

Reviewed-by: Jakub Narębski <jnareb@gmail.com>
Signed-off-by: Garima Singh <garima.singh@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>

diff --git a/commit-graph.c b/commit-graph.c
index f013a84..e4f1a5b 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -23,6 +23,7 @@
 #define GRAPH_CHUNKID_DATA 0x43444154 /* "CDAT" */
 #define GRAPH_CHUNKID_EXTRAEDGES 0x45444745 /* "EDGE" */
 #define GRAPH_CHUNKID_BASE 0x42415345 /* "BASE" */
+#define MAX_NUM_CHUNKS 5
 
 #define GRAPH_DATA_WIDTH (the_hash_algo->rawsz + 16)
 
@@ -1350,8 +1351,8 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
 	int fd;
 	struct hashfile *f;
 	struct lock_file lk = LOCK_INIT;
-	uint32_t chunk_ids[6];
-	uint64_t chunk_offsets[6];
+	uint32_t chunk_ids[MAX_NUM_CHUNKS + 1];
+	uint64_t chunk_offsets[MAX_NUM_CHUNKS + 1];
 	const unsigned hashsz = the_hash_algo->rawsz;
 	struct strbuf progress_title = STRBUF_INIT;
 	int num_chunks = 3;
-- 
cgit v0.10.2-6-g49f6


From f52207a45ca9e7cfbe431f4ffff79b3fdbcf3a37 Mon Sep 17 00:00:00 2001
From: Garima Singh <garima.singh@microsoft.com>
Date: Mon, 30 Mar 2020 00:31:24 +0000
Subject: bloom.c: add the murmur3 hash implementation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In preparation for computing changed paths Bloom filters,
implement the Murmur3 hash algorithm as described in [1].
It hashes the given data using the given seed and produces
a uniformly distributed hash value.

[1] https://en.wikipedia.org/wiki/MurmurHash#Algorithm

Helped-by: Derrick Stolee <dstolee@microsoft.com>
Helped-by: Szeder Gábor <szeder.dev@gmail.com>
Reviewed-by: Jakub Narębski <jnareb@gmail.com>
Signed-off-by: Garima Singh <garima.singh@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>

diff --git a/Makefile b/Makefile
index ef1ff22..491f75e 100644
--- a/Makefile
+++ b/Makefile
@@ -695,6 +695,7 @@ X =
 PROGRAMS += $(patsubst %.o,git-%$X,$(PROGRAM_OBJS))
 
 TEST_BUILTINS_OBJS += test-advise.o
+TEST_BUILTINS_OBJS += test-bloom.o
 TEST_BUILTINS_OBJS += test-chmtime.o
 TEST_BUILTINS_OBJS += test-config.o
 TEST_BUILTINS_OBJS += test-ctype.o
@@ -840,6 +841,7 @@ LIB_OBJS += base85.o
 LIB_OBJS += bisect.o
 LIB_OBJS += blame.o
 LIB_OBJS += blob.o
+LIB_OBJS += bloom.o
 LIB_OBJS += branch.o
 LIB_OBJS += bulk-checkin.o
 LIB_OBJS += bundle.o
diff --git a/bloom.c b/bloom.c
new file mode 100644
index 0000000..40e8763
--- /dev/null
+++ b/bloom.c
@@ -0,0 +1,73 @@
+#include "git-compat-util.h"
+#include "bloom.h"
+
+static uint32_t rotate_left(uint32_t value, int32_t count)
+{
+	uint32_t mask = 8 * sizeof(uint32_t) - 1;
+	count &= mask;
+	return ((value << count) | (value >> ((-count) & mask)));
+}
+
+/*
+ * Calculate the murmur3 32-bit hash value for the given data
+ * using the given seed.
+ * Produces a uniformly distributed hash value.
+ * Not considered to be cryptographically secure.
+ * Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
+ */
+uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len)
+{
+	const uint32_t c1 = 0xcc9e2d51;
+	const uint32_t c2 = 0x1b873593;
+	const uint32_t r1 = 15;
+	const uint32_t r2 = 13;
+	const uint32_t m = 5;
+	const uint32_t n = 0xe6546b64;
+	int i;
+	uint32_t k1 = 0;
+	const char *tail;
+
+	int len4 = len / sizeof(uint32_t);
+
+	uint32_t k;
+	for (i = 0; i < len4; i++) {
+		uint32_t byte1 = (uint32_t)data[4*i];
+		uint32_t byte2 = ((uint32_t)data[4*i + 1]) << 8;
+		uint32_t byte3 = ((uint32_t)data[4*i + 2]) << 16;
+		uint32_t byte4 = ((uint32_t)data[4*i + 3]) << 24;
+		k = byte1 | byte2 | byte3 | byte4;
+		k *= c1;
+		k = rotate_left(k, r1);
+		k *= c2;
+
+		seed ^= k;
+		seed = rotate_left(seed, r2) * m + n;
+	}
+
+	tail = (data + len4 * sizeof(uint32_t));
+
+	switch (len & (sizeof(uint32_t) - 1)) {
+	case 3:
+		k1 ^= ((uint32_t)tail[2]) << 16;
+		/*-fallthrough*/
+	case 2:
+		k1 ^= ((uint32_t)tail[1]) << 8;
+		/*-fallthrough*/
+	case 1:
+		k1 ^= ((uint32_t)tail[0]) << 0;
+		k1 *= c1;
+		k1 = rotate_left(k1, r1);
+		k1 *= c2;
+		seed ^= k1;
+		break;
+	}
+
+	seed ^= (uint32_t)len;
+	seed ^= (seed >> 16);
+	seed *= 0x85ebca6b;
+	seed ^= (seed >> 13);
+	seed *= 0xc2b2ae35;
+	seed ^= (seed >> 16);
+
+	return seed;
+}
\ No newline at end of file
diff --git a/bloom.h b/bloom.h
new file mode 100644
index 0000000..d0fcc5f
--- /dev/null
+++ b/bloom.h
@@ -0,0 +1,13 @@
+#ifndef BLOOM_H
+#define BLOOM_H
+
+/*
+ * Calculate the murmur3 32-bit hash value for the given data
+ * using the given seed.
+ * Produces a uniformly distributed hash value.
+ * Not considered to be cryptographically secure.
+ * Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
+ */
+uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len);
+
+#endif
\ No newline at end of file
diff --git a/t/helper/test-bloom.c b/t/helper/test-bloom.c
new file mode 100644
index 0000000..60ee204
--- /dev/null
+++ b/t/helper/test-bloom.c
@@ -0,0 +1,13 @@
+#include "git-compat-util.h"
+#include "bloom.h"
+#include "test-tool.h"
+
+int cmd__bloom(int argc, const char **argv)
+{
+	if (!strcmp(argv[1], "get_murmur3")) {
+		uint32_t hashed = murmur3_seeded(0, argv[2], strlen(argv[2]));
+		printf("Murmur3 Hash with seed=0:0x%08x\n", hashed);
+	}
+
+	return 0;
+}
\ No newline at end of file
diff --git a/t/helper/test-tool.c b/t/helper/test-tool.c
index 31eedcd..6e26bd6 100644
--- a/t/helper/test-tool.c
+++ b/t/helper/test-tool.c
@@ -15,6 +15,7 @@ struct test_cmd {
 
 static struct test_cmd cmds[] = {
 	{ "advise", cmd__advise_if_enabled },
+	{ "bloom", cmd__bloom },
 	{ "chmtime", cmd__chmtime },
 	{ "config", cmd__config },
 	{ "ctype", cmd__ctype },
diff --git a/t/helper/test-tool.h b/t/helper/test-tool.h
index 4eb5e66..dceeef1 100644
--- a/t/helper/test-tool.h
+++ b/t/helper/test-tool.h
@@ -5,6 +5,7 @@
 #include "git-compat-util.h"
 
 int cmd__advise_if_enabled(int argc, const char **argv);
+int cmd__bloom(int argc, const char **argv);
 int cmd__chmtime(int argc, const char **argv);
 int cmd__config(int argc, const char **argv);
 int cmd__ctype(int argc, const char **argv);
diff --git a/t/t0095-bloom.sh b/t/t0095-bloom.sh
new file mode 100755
index 0000000..2dad8c4
--- /dev/null
+++ b/t/t0095-bloom.sh
@@ -0,0 +1,30 @@
+#!/bin/sh
+
+test_description='Testing the various Bloom filter computations in bloom.c'
+. ./test-lib.sh
+
+test_expect_success 'compute unseeded murmur3 hash for empty string' '
+	cat >expect <<-\EOF &&
+	Murmur3 Hash with seed=0:0x00000000
+	EOF
+	test-tool bloom get_murmur3 "" >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'compute unseeded murmur3 hash for test string 1' '
+	cat >expect <<-\EOF &&
+	Murmur3 Hash with seed=0:0x627b0c2c
+	EOF
+	test-tool bloom get_murmur3 "Hello world!" >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'compute unseeded murmur3 hash for test string 2' '
+	cat >expect <<-\EOF &&
+	Murmur3 Hash with seed=0:0x2e4ff723
+	EOF
+	test-tool bloom get_murmur3 "The quick brown fox jumps over the lazy dog" >actual &&
+	test_cmp expect actual
+'
+
+test_done
\ No newline at end of file
-- 
cgit v0.10.2-6-g49f6


From f1294eaf7fbf7673567b698b11e062566b9f1035 Mon Sep 17 00:00:00 2001
From: Garima Singh <garima.singh@microsoft.com>
Date: Mon, 30 Mar 2020 00:31:25 +0000
Subject: bloom.c: introduce core Bloom filter constructs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce the constructs for Bloom filters, Bloom filter keys
and Bloom filter settings.
For details on what Bloom filters are and how they work, refer
to Dr. Derrick Stolee's blog post [1]. It provides a concise
explanation of the adoption of Bloom filters as described in
[2] and [3].

Implementation specifics:
1. We currently use 7 and 10 for the number of hashes and the
   size of each entry respectively. They served as great starting
   values, the mathematical details behind this choice are
   described in [1] and [4]. The implementation, while not
   completely open to it at the moment, is flexible enough to allow
   for tweaking these settings in the future.

   Note: The performance gains we have observed with these values
   are significant enough that we did not need to tweak these
   settings. The performance numbers are included in the cover letter
   of this series and in the commit message of the subsequent commit
   where we use Bloom filters to speed up `git log -- path`.

2. As described in [1] and [3], we do not need 7 independent hashing
   functions. We use the Murmur3 hashing scheme, seed it twice and
   then combine those to procure an arbitrary number of hash values.

3. The filters will be sized according to the number of changes in
   each commit, in multiples of 8 bit words.

[1] Derrick Stolee
      "Supercharging the Git Commit Graph IV: Bloom Filters"
      https://devblogs.microsoft.com/devops/super-charging-the-git-commit-graph-iv-Bloom-filters/

[2] Flavio Bonomi, Michael Mitzenmacher, Rina Panigrahy, Sushil Singh, George Varghese
    "An Improved Construction for Counting Bloom Filters"
    http://theory.stanford.edu/~rinap/papers/esa2006b.pdf
    https://doi.org/10.1007/11841036_61

[3] Peter C. Dillinger and Panagiotis Manolios
    "Bloom Filters in Probabilistic Verification"
    http://www.ccs.neu.edu/home/pete/pub/Bloom-filters-verification.pdf
    https://doi.org/10.1007/978-3-540-30494-4_26

[4] Thomas Mueller Graf, Daniel Lemire
    "Xor Filters: Faster and Smaller Than Bloom and Cuckoo Filters"
    https://arxiv.org/abs/1912.08258

Helped-by: Derrick Stolee <dstolee@microsoft.com>
Reviewed-by: Jakub Narębski <jnareb@gmail.com>
Signed-off-by: Garima Singh <garima.singh@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>

diff --git a/bloom.c b/bloom.c
index 40e8763..888b67f 100644
--- a/bloom.c
+++ b/bloom.c
@@ -8,6 +8,11 @@ static uint32_t rotate_left(uint32_t value, int32_t count)
 	return ((value << count) | (value >> ((-count) & mask)));
 }
 
+static inline unsigned char get_bitmask(uint32_t pos)
+{
+	return ((unsigned char)1) << (pos & (BITS_PER_WORD - 1));
+}
+
 /*
  * Calculate the murmur3 32-bit hash value for the given data
  * using the given seed.
@@ -70,4 +75,35 @@ uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len)
 	seed ^= (seed >> 16);
 
 	return seed;
-}
\ No newline at end of file
+}
+
+void fill_bloom_key(const char *data,
+					size_t len,
+					struct bloom_key *key,
+					const struct bloom_filter_settings *settings)
+{
+	int i;
+	const uint32_t seed0 = 0x293ae76f;
+	const uint32_t seed1 = 0x7e646e2c;
+	const uint32_t hash0 = murmur3_seeded(seed0, data, len);
+	const uint32_t hash1 = murmur3_seeded(seed1, data, len);
+
+	key->hashes = (uint32_t *)xcalloc(settings->num_hashes, sizeof(uint32_t));
+	for (i = 0; i < settings->num_hashes; i++)
+		key->hashes[i] = hash0 + i * hash1;
+}
+
+void add_key_to_filter(const struct bloom_key *key,
+					   struct bloom_filter *filter,
+					   const struct bloom_filter_settings *settings)
+{
+	int i;
+	uint64_t mod = filter->len * BITS_PER_WORD;
+
+	for (i = 0; i < settings->num_hashes; i++) {
+		uint64_t hash_mod = key->hashes[i] % mod;
+		uint64_t block_pos = hash_mod / BITS_PER_WORD;
+
+		filter->data[block_pos] |= get_bitmask(hash_mod);
+	}
+}
diff --git a/bloom.h b/bloom.h
index d0fcc5f..b9ce422 100644
--- a/bloom.h
+++ b/bloom.h
@@ -1,6 +1,60 @@
 #ifndef BLOOM_H
 #define BLOOM_H
 
+struct bloom_filter_settings {
+	/*
+	 * The version of the hashing technique being used.
+	 * We currently only support version = 1 which is
+	 * the seeded murmur3 hashing technique implemented
+	 * in bloom.c.
+	 */
+	uint32_t hash_version;
+
+	/*
+	 * The number of times a path is hashed, i.e. the
+	 * number of bit positions tht cumulatively
+	 * determine whether a path is present in the
+	 * Bloom filter.
+	 */
+	uint32_t num_hashes;
+
+	/*
+	 * The minimum number of bits per entry in the Bloom
+	 * filter. If the filter contains 'n' entries, then
+	 * filter size is the minimum number of 8-bit words
+	 * that contain n*b bits.
+	 */
+	uint32_t bits_per_entry;
+};
+
+#define DEFAULT_BLOOM_FILTER_SETTINGS { 1, 7, 10 }
+#define BITS_PER_WORD 8
+
+/*
+ * A bloom_filter struct represents a data segment to
+ * use when testing hash values. The 'len' member
+ * dictates how many entries are stored in
+ * 'data'.
+ */
+struct bloom_filter {
+	unsigned char *data;
+	size_t len;
+};
+
+/*
+ * A bloom_key represents the k hash values for a
+ * given string. These can be precomputed and
+ * stored in a bloom_key for re-use when testing
+ * against a bloom_filter. The number of hashes is
+ * given by the Bloom filter settings and is the same
+ * for all Bloom filters and keys interacting with
+ * the loaded version of the commit graph file and
+ * the Bloom data chunks.
+ */
+struct bloom_key {
+	uint32_t *hashes;
+};
+
 /*
  * Calculate the murmur3 32-bit hash value for the given data
  * using the given seed.
@@ -10,4 +64,13 @@
  */
 uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len);
 
+void fill_bloom_key(const char *data,
+		    size_t len,
+		    struct bloom_key *key,
+		    const struct bloom_filter_settings *settings);
+
+void add_key_to_filter(const struct bloom_key *key,
+					   struct bloom_filter *filter,
+					   const struct bloom_filter_settings *settings);
+
 #endif
\ No newline at end of file
diff --git a/t/helper/test-bloom.c b/t/helper/test-bloom.c
index 60ee204..20460cd 100644
--- a/t/helper/test-bloom.c
+++ b/t/helper/test-bloom.c
@@ -2,6 +2,36 @@
 #include "bloom.h"
 #include "test-tool.h"
 
+struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS;
+
+static void add_string_to_filter(const char *data, struct bloom_filter *filter) {
+		struct bloom_key key;
+		int i;
+
+		fill_bloom_key(data, strlen(data), &key, &settings);
+		printf("Hashes:");
+		for (i = 0; i < settings.num_hashes; i++){
+			printf("0x%08x|", key.hashes[i]);
+		}
+		printf("\n");
+		add_key_to_filter(&key, filter, &settings);
+}
+
+static void print_bloom_filter(struct bloom_filter *filter) {
+	int i;
+
+	if (!filter) {
+		printf("No filter.\n");
+		return;
+	}
+	printf("Filter_Length:%d\n", (int)filter->len);
+	printf("Filter_Data:");
+	for (i = 0; i < filter->len; i++){
+		printf("%02x|", filter->data[i]);
+	}
+	printf("\n");
+}
+
 int cmd__bloom(int argc, const char **argv)
 {
 	if (!strcmp(argv[1], "get_murmur3")) {
@@ -9,5 +39,23 @@ int cmd__bloom(int argc, const char **argv)
 		printf("Murmur3 Hash with seed=0:0x%08x\n", hashed);
 	}
 
+    if (!strcmp(argv[1], "generate_filter")) {
+		struct bloom_filter filter;
+		int i = 2;
+		filter.len =  (settings.bits_per_entry + BITS_PER_WORD - 1) / BITS_PER_WORD;
+		filter.data = xcalloc(filter.len, sizeof(unsigned char));
+
+		if (!argv[2]){
+			die("at least one input string expected");
+		}
+
+		while (argv[i]) {
+			add_string_to_filter(argv[i], &filter);
+			i++;
+		}
+
+		print_bloom_filter(&filter);
+	}
+
 	return 0;
 }
\ No newline at end of file
diff --git a/t/t0095-bloom.sh b/t/t0095-bloom.sh
index 2dad8c4..36a086c 100755
--- a/t/t0095-bloom.sh
+++ b/t/t0095-bloom.sh
@@ -27,4 +27,44 @@ test_expect_success 'compute unseeded murmur3 hash for test string 2' '
 	test_cmp expect actual
 '
 
+test_expect_success 'compute bloom key for empty string' '
+	cat >expect <<-\EOF &&
+	Hashes:0x5615800c|0x5b966560|0x61174ab4|0x66983008|0x6c19155c|0x7199fab0|0x771ae004|
+	Filter_Length:2
+	Filter_Data:11|11|
+	EOF
+	test-tool bloom generate_filter "" >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'compute bloom key for whitespace' '
+	cat >expect <<-\EOF &&
+	Hashes:0xf178874c|0x5f3d6eb6|0xcd025620|0x3ac73d8a|0xa88c24f4|0x16510c5e|0x8415f3c8|
+	Filter_Length:2
+	Filter_Data:51|55|
+	EOF
+	test-tool bloom generate_filter " " >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'compute bloom key for test string 1' '
+	cat >expect <<-\EOF &&
+	Hashes:0xb270de9b|0x1bb6f26e|0x84fd0641|0xee431a14|0x57892de7|0xc0cf41ba|0x2a15558d|
+	Filter_Length:2
+	Filter_Data:92|6c|
+	EOF
+	test-tool bloom generate_filter "Hello world!" >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'compute bloom key for test string 2' '
+	cat >expect <<-\EOF &&
+	Hashes:0x20ab385b|0xf5237fe2|0xc99bc769|0x9e140ef0|0x728c5677|0x47049dfe|0x1b7ce585|
+	Filter_Length:2
+	Filter_Data:a5|4a|
+	EOF
+	test-tool bloom generate_filter "file.txt" >actual &&
+	test_cmp expect actual
+'
+
 test_done
\ No newline at end of file
-- 
cgit v0.10.2-6-g49f6


From ed591febb4a201ce48b34a4e90027414cd0d7966 Mon Sep 17 00:00:00 2001
From: Garima Singh <garima.singh@microsoft.com>
Date: Mon, 30 Mar 2020 00:31:26 +0000
Subject: bloom.c: core Bloom filter implementation for changed paths.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add the core implementation for computing Bloom filters for
the paths changed between a commit and it's first parent.

We fill the Bloom filters as (const char *data, int len) pairs
as `struct bloom_filters" within a commit slab.

Filters for commits with no changes and more than 512 changes,
is represented with a filter of length zero. There is no gain
in distinguishing between a computed filter of length zero for
a commit with no changes, and an uncomputed filter for new commits
or for commits with more than 512 changes. The effect on
`git log -- path` is the same in both cases. We will fall back to
the normal diffing algorithm when we can't benefit from the
existence of Bloom filters.

Helped-by: Jeff King <peff@peff.net>
Helped-by: Derrick Stolee <dstolee@microsoft.com>
Reviewed-by: Jakub Narębski <jnareb@gmail.com>
Signed-off-by: Garima Singh <garima.singh@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>

diff --git a/bloom.c b/bloom.c
index 888b67f..881a984 100644
--- a/bloom.c
+++ b/bloom.c
@@ -1,5 +1,18 @@
 #include "git-compat-util.h"
 #include "bloom.h"
+#include "diff.h"
+#include "diffcore.h"
+#include "revision.h"
+#include "hashmap.h"
+
+define_commit_slab(bloom_filter_slab, struct bloom_filter);
+
+struct bloom_filter_slab bloom_filters;
+
+struct pathmap_hash_entry {
+    struct hashmap_entry entry;
+    const char path[FLEX_ARRAY];
+};
 
 static uint32_t rotate_left(uint32_t value, int32_t count)
 {
@@ -107,3 +120,87 @@ void add_key_to_filter(const struct bloom_key *key,
 		filter->data[block_pos] |= get_bitmask(hash_mod);
 	}
 }
+
+void init_bloom_filters(void)
+{
+	init_bloom_filter_slab(&bloom_filters);
+}
+
+struct bloom_filter *get_bloom_filter(struct repository *r,
+				      struct commit *c)
+{
+	struct bloom_filter *filter;
+	struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS;
+	int i;
+	struct diff_options diffopt;
+
+	if (bloom_filters.slab_size == 0)
+		return NULL;
+
+	filter = bloom_filter_slab_at(&bloom_filters, c);
+
+	repo_diff_setup(r, &diffopt);
+	diffopt.flags.recursive = 1;
+	diff_setup_done(&diffopt);
+
+	if (c->parents)
+		diff_tree_oid(&c->parents->item->object.oid, &c->object.oid, "", &diffopt);
+	else
+		diff_tree_oid(NULL, &c->object.oid, "", &diffopt);
+	diffcore_std(&diffopt);
+
+	if (diff_queued_diff.nr <= 512) {
+		struct hashmap pathmap;
+		struct pathmap_hash_entry *e;
+		struct hashmap_iter iter;
+		hashmap_init(&pathmap, NULL, NULL, 0);
+
+		for (i = 0; i < diff_queued_diff.nr; i++) {
+			const char *path = diff_queued_diff.queue[i]->two->path;
+
+			/*
+			* Add each leading directory of the changed file, i.e. for
+			* 'dir/subdir/file' add 'dir' and 'dir/subdir' as well, so
+			* the Bloom filter could be used to speed up commands like
+			* 'git log dir/subdir', too.
+			*
+			* Note that directories are added without the trailing '/'.
+			*/
+			do {
+				char *last_slash = strrchr(path, '/');
+
+				FLEX_ALLOC_STR(e, path, path);
+				hashmap_entry_init(&e->entry, strhash(path));
+				hashmap_add(&pathmap, &e->entry);
+
+				if (!last_slash)
+					last_slash = (char*)path;
+				*last_slash = '\0';
+
+			} while (*path);
+
+			diff_free_filepair(diff_queued_diff.queue[i]);
+		}
+
+		filter->len = (hashmap_get_size(&pathmap) * settings.bits_per_entry + BITS_PER_WORD - 1) / BITS_PER_WORD;
+		filter->data = xcalloc(filter->len, sizeof(unsigned char));
+
+		hashmap_for_each_entry(&pathmap, &iter, e, entry) {
+			struct bloom_key key;
+			fill_bloom_key(e->path, strlen(e->path), &key, &settings);
+			add_key_to_filter(&key, filter, &settings);
+		}
+
+		hashmap_free_entries(&pathmap, struct pathmap_hash_entry, entry);
+	} else {
+		for (i = 0; i < diff_queued_diff.nr; i++)
+			diff_free_filepair(diff_queued_diff.queue[i]);
+		filter->data = NULL;
+		filter->len = 0;
+	}
+
+	free(diff_queued_diff.queue);
+	DIFF_QUEUE_CLEAR(&diff_queued_diff);
+
+	return filter;
+}
diff --git a/bloom.h b/bloom.h
index b9ce422..85ab8e9 100644
--- a/bloom.h
+++ b/bloom.h
@@ -1,6 +1,9 @@
 #ifndef BLOOM_H
 #define BLOOM_H
 
+struct commit;
+struct repository;
+
 struct bloom_filter_settings {
 	/*
 	 * The version of the hashing technique being used.
@@ -73,4 +76,9 @@ void add_key_to_filter(const struct bloom_key *key,
 					   struct bloom_filter *filter,
 					   const struct bloom_filter_settings *settings);
 
+void init_bloom_filters(void);
+
+struct bloom_filter *get_bloom_filter(struct repository *r,
+				      struct commit *c);
+
 #endif
\ No newline at end of file
diff --git a/t/helper/test-bloom.c b/t/helper/test-bloom.c
index 20460cd..f18d1b7 100644
--- a/t/helper/test-bloom.c
+++ b/t/helper/test-bloom.c
@@ -1,6 +1,7 @@
 #include "git-compat-util.h"
 #include "bloom.h"
 #include "test-tool.h"
+#include "commit.h"
 
 struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS;
 
@@ -32,6 +33,16 @@ static void print_bloom_filter(struct bloom_filter *filter) {
 	printf("\n");
 }
 
+static void get_bloom_filter_for_commit(const struct object_id *commit_oid)
+{
+	struct commit *c;
+	struct bloom_filter *filter;
+	setup_git_directory();
+	c = lookup_commit(the_repository, commit_oid);
+	filter = get_bloom_filter(the_repository, c);
+	print_bloom_filter(filter);
+}
+
 int cmd__bloom(int argc, const char **argv)
 {
 	if (!strcmp(argv[1], "get_murmur3")) {
@@ -57,5 +68,14 @@ int cmd__bloom(int argc, const char **argv)
 		print_bloom_filter(&filter);
 	}
 
+    if (!strcmp(argv[1], "get_filter_for_commit")) {
+		struct object_id oid;
+		const char *end;
+		if (parse_oid_hex(argv[2], &oid, &end))
+			die("cannot parse oid '%s'", argv[2]);
+		init_bloom_filters();
+		get_bloom_filter_for_commit(&oid);
+	}
+
 	return 0;
 }
\ No newline at end of file
diff --git a/t/t0095-bloom.sh b/t/t0095-bloom.sh
index 36a086c..8f9eef1 100755
--- a/t/t0095-bloom.sh
+++ b/t/t0095-bloom.sh
@@ -67,4 +67,51 @@ test_expect_success 'compute bloom key for test string 2' '
 	test_cmp expect actual
 '
 
+test_expect_success 'get bloom filters for commit with no changes' '
+	git init &&
+	git commit --allow-empty -m "c0" &&
+	cat >expect <<-\EOF &&
+	Filter_Length:0
+	Filter_Data:
+	EOF
+	test-tool bloom get_filter_for_commit "$(git rev-parse HEAD)" >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'get bloom filter for commit with 10 changes' '
+	rm actual &&
+	rm expect &&
+	mkdir smallDir &&
+	for i in $(test_seq 0 9)
+	do
+		echo $i >smallDir/$i
+	done &&
+	git add smallDir &&
+	git commit -m "commit with 10 changes" &&
+	cat >expect <<-\EOF &&
+	Filter_Length:25
+	Filter_Data:82|a0|65|47|0c|92|90|c0|a1|40|02|a0|e2|40|e0|04|0a|9a|66|cf|80|19|85|42|23|
+	EOF
+	test-tool bloom get_filter_for_commit "$(git rev-parse HEAD)" >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success EXPENSIVE 'get bloom filter for commit with 513 changes' '
+	rm actual &&
+	rm expect &&
+	mkdir bigDir &&
+	for i in $(test_seq 0 512)
+	do
+		echo $i >bigDir/$i
+	done &&
+	git add bigDir &&
+	git commit -m "commit with 513 changes" &&
+	cat >expect <<-\EOF &&
+	Filter_Length:0
+	Filter_Data:
+	EOF
+	test-tool bloom get_filter_for_commit "$(git rev-parse HEAD)" >actual &&
+	test_cmp expect actual
+'
+
 test_done
\ No newline at end of file
-- 
cgit v0.10.2-6-g49f6


From e3696980163bdbd3bc56e5ffc69e8770015f366f Mon Sep 17 00:00:00 2001
From: Derrick Stolee <dstolee@microsoft.com>
Date: Mon, 30 Mar 2020 00:31:27 +0000
Subject: diff: halt tree-diff early after max_changes

When computing the changed-paths bloom filters for the commit-graph,
we limit the size of the filter by restricting the number of paths
in the diff. Instead of computing a large diff and then ignoring the
result, it is better to halt the diff computation early.

Create a new "max_changes" option in struct diff_options. If non-zero,
then halt the diff computation after discovering strictly more changed
paths. This includes paths corresponding to trees that change.

Use this max_changes option in the bloom filter calculations. This
reduces the time taken to compute the filters for the Linux kernel
repo from 2m50s to 2m35s. On a large internal repository with ~500
commits that perform tree-wide changes, the time reduced from
6m15s to 3m48s.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Garima Singh <garima.singh@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>

diff --git a/bloom.c b/bloom.c
index 881a984..a16eee9 100644
--- a/bloom.c
+++ b/bloom.c
@@ -133,6 +133,7 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
 	struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS;
 	int i;
 	struct diff_options diffopt;
+	int max_changes = 512;
 
 	if (bloom_filters.slab_size == 0)
 		return NULL;
@@ -141,6 +142,7 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
 
 	repo_diff_setup(r, &diffopt);
 	diffopt.flags.recursive = 1;
+	diffopt.max_changes = max_changes;
 	diff_setup_done(&diffopt);
 
 	if (c->parents)
@@ -149,7 +151,7 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
 		diff_tree_oid(NULL, &c->object.oid, "", &diffopt);
 	diffcore_std(&diffopt);
 
-	if (diff_queued_diff.nr <= 512) {
+	if (diff_queued_diff.nr <= max_changes) {
 		struct hashmap pathmap;
 		struct pathmap_hash_entry *e;
 		struct hashmap_iter iter;
diff --git a/diff.h b/diff.h
index 6febe7e..9443dc1 100644
--- a/diff.h
+++ b/diff.h
@@ -285,6 +285,11 @@ struct diff_options {
 	/* Number of hexdigits to abbreviate raw format output to. */
 	int abbrev;
 
+	/* If non-zero, then stop computing after this many changes. */
+	int max_changes;
+	/* For internal use only. */
+	int num_changes;
+
 	int ita_invisible_in_index;
 /* white-space error highlighting */
 #define WSEH_NEW (1<<12)
diff --git a/tree-diff.c b/tree-diff.c
index 33ded7f..f3d303c 100644
--- a/tree-diff.c
+++ b/tree-diff.c
@@ -434,6 +434,9 @@ static struct combine_diff_path *ll_diff_tree_paths(
 		if (diff_can_quit_early(opt))
 			break;
 
+		if (opt->max_changes && opt->num_changes > opt->max_changes)
+			break;
+
 		if (opt->pathspec.nr) {
 			skip_uninteresting(&t, base, opt);
 			for (i = 0; i < nparent; i++)
@@ -518,6 +521,7 @@ static struct combine_diff_path *ll_diff_tree_paths(
 
 			/* t↓ */
 			update_tree_entry(&t);
+			opt->num_changes++;
 		}
 
 		/* t > p[imin] */
@@ -535,6 +539,7 @@ static struct combine_diff_path *ll_diff_tree_paths(
 		skip_emit_tp:
 			/* ∀ pi=p[imin]  pi↓ */
 			update_tp_entries(tp, nparent);
+			opt->num_changes++;
 		}
 	}
 
@@ -552,6 +557,7 @@ struct combine_diff_path *diff_tree_paths(
 	const struct object_id **parents_oid, int nparent,
 	struct strbuf *base, struct diff_options *opt)
 {
+	opt->num_changes = 0;
 	p = ll_diff_tree_paths(p, oid, parents_oid, nparent, base, opt);
 
 	/*
-- 
cgit v0.10.2-6-g49f6


From f97b9325f6d7ad2a28bfaf6fab3197d207fcb278 Mon Sep 17 00:00:00 2001
From: Garima Singh <garima.singh@microsoft.com>
Date: Mon, 30 Mar 2020 00:31:28 +0000
Subject: commit-graph: compute Bloom filters for changed paths

Add new COMMIT_GRAPH_WRITE_CHANGED_PATHS flag that makes Git compute
Bloom filters for the paths that changed between a commit and it's
first parent, for each commit in the commit-graph.  This computation
is done on a commit-by-commit basis.

We will write these Bloom filters to the commit-graph file, to store
this data on disk, in the next change in this series.

Helped-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Garima Singh <garima.singh@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>

diff --git a/commit-graph.c b/commit-graph.c
index e4f1a5b..862a00d 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -16,6 +16,7 @@
 #include "hashmap.h"
 #include "replace-object.h"
 #include "progress.h"
+#include "bloom.h"
 
 #define GRAPH_SIGNATURE 0x43475048 /* "CGPH" */
 #define GRAPH_CHUNKID_OIDFANOUT 0x4f494446 /* "OIDF" */
@@ -789,9 +790,11 @@ struct write_commit_graph_context {
 	unsigned append:1,
 		 report_progress:1,
 		 split:1,
-		 check_oids:1;
+		 check_oids:1,
+		 changed_paths:1;
 
 	const struct split_commit_graph_opts *split_opts;
+	size_t total_bloom_filter_data_size;
 };
 
 static void write_graph_chunk_fanout(struct hashfile *f,
@@ -1134,6 +1137,28 @@ static void compute_generation_numbers(struct write_commit_graph_context *ctx)
 	stop_progress(&ctx->progress);
 }
 
+static void compute_bloom_filters(struct write_commit_graph_context *ctx)
+{
+	int i;
+	struct progress *progress = NULL;
+
+	init_bloom_filters();
+
+	if (ctx->report_progress)
+		progress = start_delayed_progress(
+			_("Computing commit changed paths Bloom filters"),
+			ctx->commits.nr);
+
+	for (i = 0; i < ctx->commits.nr; i++) {
+		struct commit *c = ctx->commits.list[i];
+		struct bloom_filter *filter = get_bloom_filter(ctx->r, c);
+		ctx->total_bloom_filter_data_size += sizeof(unsigned char) * filter->len;
+		display_progress(progress, i + 1);
+	}
+
+	stop_progress(&progress);
+}
+
 static int add_ref_to_list(const char *refname,
 			   const struct object_id *oid,
 			   int flags, void *cb_data)
@@ -1776,6 +1801,8 @@ int write_commit_graph(struct object_directory *odb,
 	ctx->split = flags & COMMIT_GRAPH_WRITE_SPLIT ? 1 : 0;
 	ctx->check_oids = flags & COMMIT_GRAPH_WRITE_CHECK_OIDS ? 1 : 0;
 	ctx->split_opts = split_opts;
+	ctx->changed_paths = flags & COMMIT_GRAPH_WRITE_BLOOM_FILTERS ? 1 : 0;
+	ctx->total_bloom_filter_data_size = 0;
 
 	if (ctx->split) {
 		struct commit_graph *g;
@@ -1870,6 +1897,9 @@ int write_commit_graph(struct object_directory *odb,
 
 	compute_generation_numbers(ctx);
 
+	if (ctx->changed_paths)
+		compute_bloom_filters(ctx);
+
 	res = write_commit_graph_file(ctx);
 
 	if (ctx->split)
diff --git a/commit-graph.h b/commit-graph.h
index e87a6f6..86be812 100644
--- a/commit-graph.h
+++ b/commit-graph.h
@@ -79,7 +79,8 @@ enum commit_graph_write_flags {
 	COMMIT_GRAPH_WRITE_PROGRESS   = (1 << 1),
 	COMMIT_GRAPH_WRITE_SPLIT      = (1 << 2),
 	/* Make sure that each OID in the input is a valid commit OID. */
-	COMMIT_GRAPH_WRITE_CHECK_OIDS = (1 << 3)
+	COMMIT_GRAPH_WRITE_CHECK_OIDS = (1 << 3),
+	COMMIT_GRAPH_WRITE_BLOOM_FILTERS = (1 << 4),
 };
 
 struct split_commit_graph_opts {
-- 
cgit v0.10.2-6-g49f6


From d21ee7d111073dfd7a86f6fe870d0c1ec6a07126 Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Mon, 30 Mar 2020 00:31:29 +0000
Subject: commit-graph: examine changed-path objects in pack order

Looking at the diff of commit objects in pack order is much faster than
in sha1 order, as it gives locality to the access of tree deltas
(whereas sha1 order is effectively random). Unfortunately the
commit-graph code sorts the commits (several times, sometimes as an oid
and sometimes a pointer-to-commit), and we ultimately traverse in sha1
order.

Instead, let's remember the position at which we see each commit, and
traverse in that order when looking at bloom filters. This drops my time
for "git commit-graph write --changed-paths" in linux.git from ~4
minutes to ~1.5 minutes.

Probably the "--reachable" code path would want something similar.

Or alternatively, we could use a different data structure (either a
hash, or maybe even just a bit in "struct commit") to keep track of
which oids we've seen, etc instead of sorting. And then we could keep
the original order.

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Garima Singh <garima.singh@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>

diff --git a/commit-graph.c b/commit-graph.c
index 862a00d..31b06f8 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -17,6 +17,7 @@
 #include "replace-object.h"
 #include "progress.h"
 #include "bloom.h"
+#include "commit-slab.h"
 
 #define GRAPH_SIGNATURE 0x43475048 /* "CGPH" */
 #define GRAPH_CHUNKID_OIDFANOUT 0x4f494446 /* "OIDF" */
@@ -46,9 +47,32 @@
 /* Remember to update object flag allocation in object.h */
 #define REACHABLE       (1u<<15)
 
-char *get_commit_graph_filename(struct object_directory *odb)
+/* Keep track of the order in which commits are added to our list. */
+define_commit_slab(commit_pos, int);
+static struct commit_pos commit_pos = COMMIT_SLAB_INIT(1, commit_pos);
+
+static void set_commit_pos(struct repository *r, const struct object_id *oid)
+{
+	static int32_t max_pos;
+	struct commit *commit = lookup_commit(r, oid);
+
+	if (!commit)
+		return; /* should never happen, but be lenient */
+
+	*commit_pos_at(&commit_pos, commit) = max_pos++;
+}
+
+static int commit_pos_cmp(const void *va, const void *vb)
 {
-	return xstrfmt("%s/info/commit-graph", odb->path);
+	const struct commit *a = *(const struct commit **)va;
+	const struct commit *b = *(const struct commit **)vb;
+	return commit_pos_at(&commit_pos, a) -
+	       commit_pos_at(&commit_pos, b);
+}
+
+char *get_commit_graph_filename(struct object_directory *obj_dir)
+{
+	return xstrfmt("%s/info/commit-graph", obj_dir->path);
 }
 
 static char *get_split_graph_filename(struct object_directory *odb,
@@ -1021,6 +1045,8 @@ static int add_packed_commits(const struct object_id *oid,
 	oidcpy(&(ctx->oids.list[ctx->oids.nr]), oid);
 	ctx->oids.nr++;
 
+	set_commit_pos(ctx->r, oid);
+
 	return 0;
 }
 
@@ -1141,6 +1167,7 @@ static void compute_bloom_filters(struct write_commit_graph_context *ctx)
 {
 	int i;
 	struct progress *progress = NULL;
+	struct commit **sorted_commits;
 
 	init_bloom_filters();
 
@@ -1149,13 +1176,18 @@ static void compute_bloom_filters(struct write_commit_graph_context *ctx)
 			_("Computing commit changed paths Bloom filters"),
 			ctx->commits.nr);
 
+	ALLOC_ARRAY(sorted_commits, ctx->commits.nr);
+	COPY_ARRAY(sorted_commits, ctx->commits.list, ctx->commits.nr);
+	QSORT(sorted_commits, ctx->commits.nr, commit_pos_cmp);
+
 	for (i = 0; i < ctx->commits.nr; i++) {
-		struct commit *c = ctx->commits.list[i];
+		struct commit *c = sorted_commits[i];
 		struct bloom_filter *filter = get_bloom_filter(ctx->r, c);
 		ctx->total_bloom_filter_data_size += sizeof(unsigned char) * filter->len;
 		display_progress(progress, i + 1);
 	}
 
+	free(sorted_commits);
 	stop_progress(&progress);
 }
 
-- 
cgit v0.10.2-6-g49f6


From 3d11275505694ce4e5256516de1c5dd90e749303 Mon Sep 17 00:00:00 2001
From: Garima Singh <garima.singh@microsoft.com>
Date: Mon, 30 Mar 2020 00:31:30 +0000
Subject: commit-graph: examine commits by generation number

When running 'git commit-graph write --changed-paths', we sort the
commits by pack-order to save time when computing the changed-paths
bloom filters. This does not help when finding the commits via the
'--reachable' flag.

If not using pack-order, then sort by generation number before
examining the diff. Commits with similar generation are more likely
to have many trees in common, making the diff faster.

On the Linux kernel repository, this change reduced the computation
time for 'git commit-graph write --reachable --changed-paths' from
3m00s to 1m37s.

Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Garima Singh <garima.singh@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>

diff --git a/commit-graph.c b/commit-graph.c
index 31b06f8..732c81f 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -70,6 +70,25 @@ static int commit_pos_cmp(const void *va, const void *vb)
 	       commit_pos_at(&commit_pos, b);
 }
 
+static int commit_gen_cmp(const void *va, const void *vb)
+{
+	const struct commit *a = *(const struct commit **)va;
+	const struct commit *b = *(const struct commit **)vb;
+
+	/* lower generation commits first */
+	if (a->generation < b->generation)
+		return -1;
+	else if (a->generation > b->generation)
+		return 1;
+
+	/* use date as a heuristic when generations are equal */
+	if (a->date < b->date)
+		return -1;
+	else if (a->date > b->date)
+		return 1;
+	return 0;
+}
+
 char *get_commit_graph_filename(struct object_directory *obj_dir)
 {
 	return xstrfmt("%s/info/commit-graph", obj_dir->path);
@@ -815,7 +834,8 @@ struct write_commit_graph_context {
 		 report_progress:1,
 		 split:1,
 		 check_oids:1,
-		 changed_paths:1;
+		 changed_paths:1,
+		 order_by_pack:1;
 
 	const struct split_commit_graph_opts *split_opts;
 	size_t total_bloom_filter_data_size;
@@ -1178,7 +1198,11 @@ static void compute_bloom_filters(struct write_commit_graph_context *ctx)
 
 	ALLOC_ARRAY(sorted_commits, ctx->commits.nr);
 	COPY_ARRAY(sorted_commits, ctx->commits.list, ctx->commits.nr);
-	QSORT(sorted_commits, ctx->commits.nr, commit_pos_cmp);
+
+	if (ctx->order_by_pack)
+		QSORT(sorted_commits, ctx->commits.nr, commit_pos_cmp);
+	else
+		QSORT(sorted_commits, ctx->commits.nr, commit_gen_cmp);
 
 	for (i = 0; i < ctx->commits.nr; i++) {
 		struct commit *c = sorted_commits[i];
@@ -1884,6 +1908,7 @@ int write_commit_graph(struct object_directory *odb,
 	}
 
 	if (pack_indexes) {
+		ctx->order_by_pack = 1;
 		if ((res = fill_oids_from_packs(ctx, pack_indexes)))
 			goto cleanup;
 	}
@@ -1893,8 +1918,10 @@ int write_commit_graph(struct object_directory *odb,
 			goto cleanup;
 	}
 
-	if (!pack_indexes && !commit_hex)
+	if (!pack_indexes && !commit_hex) {
+		ctx->order_by_pack = 1;
 		fill_oids_from_all_packs(ctx);
+	}
 
 	close_reachable(ctx);
 
-- 
cgit v0.10.2-6-g49f6


From 76ffbca71a9c89d1e530f734e16a70b3924f4bea Mon Sep 17 00:00:00 2001
From: Garima Singh <garima.singh@microsoft.com>
Date: Mon, 6 Apr 2020 16:59:49 +0000
Subject: commit-graph: write Bloom filters to commit graph file

Update the technical documentation for commit-graph-format with
the formats for the Bloom filter index (BIDX) and Bloom filter
data (BDAT) chunks. Write the computed Bloom filters information
to the commit graph file using this format.

Helped-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Garima Singh <garima.singh@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>

diff --git a/Documentation/technical/commit-graph-format.txt b/Documentation/technical/commit-graph-format.txt
index a4f1744..de56f9f 100644
--- a/Documentation/technical/commit-graph-format.txt
+++ b/Documentation/technical/commit-graph-format.txt
@@ -17,6 +17,9 @@ metadata, including:
 - The parents of the commit, stored using positional references within
   the graph file.
 
+- The Bloom filter of the commit carrying the paths that were changed between
+  the commit and its first parent, if requested.
+
 These positional references are stored as unsigned 32-bit integers
 corresponding to the array position within the list of commit OIDs. Due
 to some special constants we use to track parents, we can store at most
@@ -93,6 +96,33 @@ CHUNK DATA:
       positions for the parents until reaching a value with the most-significant
       bit on. The other bits correspond to the position of the last parent.
 
+  Bloom Filter Index (ID: {'B', 'I', 'D', 'X'}) (N * 4 bytes) [Optional]
+    * The ith entry, BIDX[i], stores the number of 8-byte word blocks in all
+      Bloom filters from commit 0 to commit i (inclusive) in lexicographic
+      order. The Bloom filter for the i-th commit spans from BIDX[i-1] to
+      BIDX[i] (plus header length), where BIDX[-1] is 0.
+    * The BIDX chunk is ignored if the BDAT chunk is not present.
+
+  Bloom Filter Data (ID: {'B', 'D', 'A', 'T'}) [Optional]
+    * It starts with header consisting of three unsigned 32-bit integers:
+      - Version of the hash algorithm being used. We currently only support
+	value 1 which corresponds to the 32-bit version of the murmur3 hash
+	implemented exactly as described in
+	https://en.wikipedia.org/wiki/MurmurHash#Algorithm and the double
+	hashing technique using seed values 0x293ae76f and 0x7e646e2 as
+	described in https://doi.org/10.1007/978-3-540-30494-4_26 "Bloom Filters
+	in Probabilistic Verification"
+      - The number of times a path is hashed and hence the number of bit positions
+	      that cumulatively determine whether a file is present in the commit.
+      - The minimum number of bits 'b' per entry in the Bloom filter. If the filter
+	      contains 'n' entries, then the filter size is the minimum number of 64-bit
+	      words that contain n*b bits.
+    * The rest of the chunk is the concatenation of all the computed Bloom
+      filters for the commits in lexicographic order.
+    * Note: Commits with no changes or more than 512 changes have Bloom filters
+      of length zero.
+    * The BDAT chunk is present if and only if BIDX is present.
+
   Base Graphs List (ID: {'B', 'A', 'S', 'E'}) [Optional]
       This list of H-byte hashes describe a set of B commit-graph files that
       form a commit-graph chain. The graph position for the ith commit in this
diff --git a/commit-graph.c b/commit-graph.c
index 732c81f..a8b6b5c 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -24,8 +24,10 @@
 #define GRAPH_CHUNKID_OIDLOOKUP 0x4f49444c /* "OIDL" */
 #define GRAPH_CHUNKID_DATA 0x43444154 /* "CDAT" */
 #define GRAPH_CHUNKID_EXTRAEDGES 0x45444745 /* "EDGE" */
+#define GRAPH_CHUNKID_BLOOMINDEXES 0x42494458 /* "BIDX" */
+#define GRAPH_CHUNKID_BLOOMDATA 0x42444154 /* "BDAT" */
 #define GRAPH_CHUNKID_BASE 0x42415345 /* "BASE" */
-#define MAX_NUM_CHUNKS 5
+#define MAX_NUM_CHUNKS 7
 
 #define GRAPH_DATA_WIDTH (the_hash_algo->rawsz + 16)
 
@@ -319,6 +321,32 @@ struct commit_graph *parse_commit_graph(void *graph_map, int fd,
 				chunk_repeated = 1;
 			else
 				graph->chunk_base_graphs = data + chunk_offset;
+			break;
+
+		case GRAPH_CHUNKID_BLOOMINDEXES:
+			if (graph->chunk_bloom_indexes)
+				chunk_repeated = 1;
+			else
+				graph->chunk_bloom_indexes = data + chunk_offset;
+			break;
+
+		case GRAPH_CHUNKID_BLOOMDATA:
+			if (graph->chunk_bloom_data)
+				chunk_repeated = 1;
+			else {
+				uint32_t hash_version;
+				graph->chunk_bloom_data = data + chunk_offset;
+				hash_version = get_be32(data + chunk_offset);
+
+				if (hash_version != 1)
+					break;
+
+				graph->bloom_filter_settings = xmalloc(sizeof(struct bloom_filter_settings));
+				graph->bloom_filter_settings->hash_version = hash_version;
+				graph->bloom_filter_settings->num_hashes = get_be32(data + chunk_offset + 4);
+				graph->bloom_filter_settings->bits_per_entry = get_be32(data + chunk_offset + 8);
+			}
+			break;
 		}
 
 		if (chunk_repeated) {
@@ -337,6 +365,15 @@ struct commit_graph *parse_commit_graph(void *graph_map, int fd,
 		last_chunk_offset = chunk_offset;
 	}
 
+	if (graph->chunk_bloom_indexes && graph->chunk_bloom_data) {
+		init_bloom_filters();
+	} else {
+		/* We need both the bloom chunks to exist together. Else ignore the data */
+		graph->chunk_bloom_indexes = NULL;
+		graph->chunk_bloom_data = NULL;
+		graph->bloom_filter_settings = NULL;
+	}
+
 	hashcpy(graph->oid.hash, graph->data + graph->data_len - graph->hash_len);
 
 	if (verify_commit_graph_lite(graph)) {
@@ -1034,6 +1071,59 @@ static void write_graph_chunk_extra_edges(struct hashfile *f,
 	}
 }
 
+static void write_graph_chunk_bloom_indexes(struct hashfile *f,
+					    struct write_commit_graph_context *ctx)
+{
+	struct commit **list = ctx->commits.list;
+	struct commit **last = ctx->commits.list + ctx->commits.nr;
+	uint32_t cur_pos = 0;
+	struct progress *progress = NULL;
+	int i = 0;
+
+	if (ctx->report_progress)
+		progress = start_delayed_progress(
+			_("Writing changed paths Bloom filters index"),
+			ctx->commits.nr);
+
+	while (list < last) {
+		struct bloom_filter *filter = get_bloom_filter(ctx->r, *list);
+		cur_pos += filter->len;
+		display_progress(progress, ++i);
+		hashwrite_be32(f, cur_pos);
+		list++;
+	}
+
+	stop_progress(&progress);
+}
+
+static void write_graph_chunk_bloom_data(struct hashfile *f,
+					 struct write_commit_graph_context *ctx,
+					 const struct bloom_filter_settings *settings)
+{
+	struct commit **list = ctx->commits.list;
+	struct commit **last = ctx->commits.list + ctx->commits.nr;
+	struct progress *progress = NULL;
+	int i = 0;
+
+	if (ctx->report_progress)
+		progress = start_delayed_progress(
+			_("Writing changed paths Bloom filters data"),
+			ctx->commits.nr);
+
+	hashwrite_be32(f, settings->hash_version);
+	hashwrite_be32(f, settings->num_hashes);
+	hashwrite_be32(f, settings->bits_per_entry);
+
+	while (list < last) {
+		struct bloom_filter *filter = get_bloom_filter(ctx->r, *list);
+		display_progress(progress, ++i);
+		hashwrite(f, filter->data, filter->len * sizeof(unsigned char));
+		list++;
+	}
+
+	stop_progress(&progress);
+}
+
 static int oid_compare(const void *_a, const void *_b)
 {
 	const struct object_id *a = (const struct object_id *)_a;
@@ -1438,6 +1528,7 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
 	struct strbuf progress_title = STRBUF_INIT;
 	int num_chunks = 3;
 	struct object_id file_hash;
+	const struct bloom_filter_settings bloom_settings = DEFAULT_BLOOM_FILTER_SETTINGS;
 
 	if (ctx->split) {
 		struct strbuf tmp_file = STRBUF_INIT;
@@ -1482,6 +1573,12 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
 		chunk_ids[num_chunks] = GRAPH_CHUNKID_EXTRAEDGES;
 		num_chunks++;
 	}
+	if (ctx->changed_paths) {
+		chunk_ids[num_chunks] = GRAPH_CHUNKID_BLOOMINDEXES;
+		num_chunks++;
+		chunk_ids[num_chunks] = GRAPH_CHUNKID_BLOOMDATA;
+		num_chunks++;
+	}
 	if (ctx->num_commit_graphs_after > 1) {
 		chunk_ids[num_chunks] = GRAPH_CHUNKID_BASE;
 		num_chunks++;
@@ -1500,6 +1597,15 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
 						4 * ctx->num_extra_edges;
 		num_chunks++;
 	}
+	if (ctx->changed_paths) {
+		chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] +
+						sizeof(uint32_t) * ctx->commits.nr;
+		num_chunks++;
+
+		chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] +
+						sizeof(uint32_t) * 3 + ctx->total_bloom_filter_data_size;
+		num_chunks++;
+	}
 	if (ctx->num_commit_graphs_after > 1) {
 		chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] +
 						hashsz * (ctx->num_commit_graphs_after - 1);
@@ -1537,6 +1643,10 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
 	write_graph_chunk_data(f, hashsz, ctx);
 	if (ctx->num_extra_edges)
 		write_graph_chunk_extra_edges(f, ctx);
+	if (ctx->changed_paths) {
+		write_graph_chunk_bloom_indexes(f, ctx);
+		write_graph_chunk_bloom_data(f, ctx, &bloom_settings);
+	}
 	if (ctx->num_commit_graphs_after > 1 &&
 	    write_graph_chunk_base(f, ctx)) {
 		return -1;
@@ -2184,6 +2294,7 @@ void free_commit_graph(struct commit_graph *g)
 		close(g->graph_fd);
 	}
 	free(g->filename);
+	free(g->bloom_filter_settings);
 	free(g);
 }
 
diff --git a/commit-graph.h b/commit-graph.h
index 86be812..8e7a8e0 100644
--- a/commit-graph.h
+++ b/commit-graph.h
@@ -11,6 +11,7 @@
 #define GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD "GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD"
 
 struct commit;
+struct bloom_filter_settings;
 
 char *get_commit_graph_filename(struct object_directory *odb);
 int open_commit_graph(const char *graph_file, int *fd, struct stat *st);
@@ -59,6 +60,10 @@ struct commit_graph {
 	const unsigned char *chunk_commit_data;
 	const unsigned char *chunk_extra_edges;
 	const unsigned char *chunk_base_graphs;
+	const unsigned char *chunk_bloom_indexes;
+	const unsigned char *chunk_bloom_data;
+
+	struct bloom_filter_settings *bloom_filter_settings;
 };
 
 struct commit_graph *load_commit_graph_one_fd_st(int fd, struct stat *st,
-- 
cgit v0.10.2-6-g49f6


From 1217c03e7b87b15f2c78af5b1e1915a675050454 Mon Sep 17 00:00:00 2001
From: Garima Singh <garima.singh@microsoft.com>
Date: Mon, 6 Apr 2020 16:59:50 +0000
Subject: commit-graph: reuse existing Bloom filters during write

Add logic to
a) parse Bloom filter information from the commit graph file and,
b) re-use existing Bloom filters.

See Documentation/technical/commit-graph-format for the format in which
the Bloom filter information is written to the commit graph file.

To read Bloom filter for a given commit with lexicographic position
'i' we need to:
1. Read BIDX[i] which essentially gives us the starting index in BDAT for
   filter of commit i+1. It is essentially the index past the end
   of the filter of commit i. It is called end_index in the code.

2. For i>0, read BIDX[i-1] which will give us the starting index in BDAT
   for filter of commit i. It is called the start_index in the code.
   For the first commit, where i = 0, Bloom filter data starts at the
   beginning, just past the header in the BDAT chunk. Hence, start_index
   will be 0.

3. The length of the filter will be end_index - start_index, because
   BIDX[i] gives the cumulative 8-byte words including the ith
   commit's filter.

We toggle whether Bloom filters should be recomputed based on the
compute_if_not_present flag.

Helped-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Garima Singh <garima.singh@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>

diff --git a/bloom.c b/bloom.c
index a16eee9..0f714dd 100644
--- a/bloom.c
+++ b/bloom.c
@@ -4,6 +4,8 @@
 #include "diffcore.h"
 #include "revision.h"
 #include "hashmap.h"
+#include "commit-graph.h"
+#include "commit.h"
 
 define_commit_slab(bloom_filter_slab, struct bloom_filter);
 
@@ -26,6 +28,36 @@ static inline unsigned char get_bitmask(uint32_t pos)
 	return ((unsigned char)1) << (pos & (BITS_PER_WORD - 1));
 }
 
+static int load_bloom_filter_from_graph(struct commit_graph *g,
+				   struct bloom_filter *filter,
+				   struct commit *c)
+{
+	uint32_t lex_pos, start_index, end_index;
+
+	while (c->graph_pos < g->num_commits_in_base)
+		g = g->base_graph;
+
+	/* The commit graph commit 'c' lives in doesn't carry bloom filters. */
+	if (!g->chunk_bloom_indexes)
+		return 0;
+
+	lex_pos = c->graph_pos - g->num_commits_in_base;
+
+	end_index = get_be32(g->chunk_bloom_indexes + 4 * lex_pos);
+
+	if (lex_pos > 0)
+		start_index = get_be32(g->chunk_bloom_indexes + 4 * (lex_pos - 1));
+	else
+		start_index = 0;
+
+	filter->len = end_index - start_index;
+	filter->data = (unsigned char *)(g->chunk_bloom_data +
+					sizeof(unsigned char) * start_index +
+					BLOOMDATA_CHUNK_HEADER_SIZE);
+
+	return 1;
+}
+
 /*
  * Calculate the murmur3 32-bit hash value for the given data
  * using the given seed.
@@ -127,7 +159,8 @@ void init_bloom_filters(void)
 }
 
 struct bloom_filter *get_bloom_filter(struct repository *r,
-				      struct commit *c)
+				      struct commit *c,
+					  int compute_if_not_present)
 {
 	struct bloom_filter *filter;
 	struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS;
@@ -140,6 +173,20 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
 
 	filter = bloom_filter_slab_at(&bloom_filters, c);
 
+	if (!filter->data) {
+		load_commit_graph_info(r, c);
+		if (c->graph_pos != COMMIT_NOT_FROM_GRAPH &&
+			r->objects->commit_graph->chunk_bloom_indexes) {
+			if (load_bloom_filter_from_graph(r->objects->commit_graph, filter, c))
+				return filter;
+			else
+				return NULL;
+		}
+	}
+
+	if (filter->data || !compute_if_not_present)
+		return filter;
+
 	repo_diff_setup(r, &diffopt);
 	diffopt.flags.recursive = 1;
 	diffopt.max_changes = max_changes;
diff --git a/bloom.h b/bloom.h
index 85ab8e9..760d712 100644
--- a/bloom.h
+++ b/bloom.h
@@ -32,6 +32,7 @@ struct bloom_filter_settings {
 
 #define DEFAULT_BLOOM_FILTER_SETTINGS { 1, 7, 10 }
 #define BITS_PER_WORD 8
+#define BLOOMDATA_CHUNK_HEADER_SIZE 3 * sizeof(uint32_t)
 
 /*
  * A bloom_filter struct represents a data segment to
@@ -79,6 +80,7 @@ void add_key_to_filter(const struct bloom_key *key,
 void init_bloom_filters(void);
 
 struct bloom_filter *get_bloom_filter(struct repository *r,
-				      struct commit *c);
+				      struct commit *c,
+				      int compute_if_not_present);
 
 #endif
\ No newline at end of file
diff --git a/commit-graph.c b/commit-graph.c
index a8b6b5c..7766862 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -1086,7 +1086,7 @@ static void write_graph_chunk_bloom_indexes(struct hashfile *f,
 			ctx->commits.nr);
 
 	while (list < last) {
-		struct bloom_filter *filter = get_bloom_filter(ctx->r, *list);
+		struct bloom_filter *filter = get_bloom_filter(ctx->r, *list, 0);
 		cur_pos += filter->len;
 		display_progress(progress, ++i);
 		hashwrite_be32(f, cur_pos);
@@ -1115,7 +1115,7 @@ static void write_graph_chunk_bloom_data(struct hashfile *f,
 	hashwrite_be32(f, settings->bits_per_entry);
 
 	while (list < last) {
-		struct bloom_filter *filter = get_bloom_filter(ctx->r, *list);
+		struct bloom_filter *filter = get_bloom_filter(ctx->r, *list, 0);
 		display_progress(progress, ++i);
 		hashwrite(f, filter->data, filter->len * sizeof(unsigned char));
 		list++;
@@ -1296,7 +1296,7 @@ static void compute_bloom_filters(struct write_commit_graph_context *ctx)
 
 	for (i = 0; i < ctx->commits.nr; i++) {
 		struct commit *c = sorted_commits[i];
-		struct bloom_filter *filter = get_bloom_filter(ctx->r, c);
+		struct bloom_filter *filter = get_bloom_filter(ctx->r, c, 1);
 		ctx->total_bloom_filter_data_size += sizeof(unsigned char) * filter->len;
 		display_progress(progress, i + 1);
 	}
diff --git a/t/helper/test-bloom.c b/t/helper/test-bloom.c
index f18d1b7..ce41266 100644
--- a/t/helper/test-bloom.c
+++ b/t/helper/test-bloom.c
@@ -39,7 +39,7 @@ static void get_bloom_filter_for_commit(const struct object_id *commit_oid)
 	struct bloom_filter *filter;
 	setup_git_directory();
 	c = lookup_commit(the_repository, commit_oid);
-	filter = get_bloom_filter(the_repository, c);
+	filter = get_bloom_filter(the_repository, c, 1);
 	print_bloom_filter(filter);
 }
 
-- 
cgit v0.10.2-6-g49f6


From d38e07b8c44ffdb73e7eba1b7f6a73eb7eb0d5f9 Mon Sep 17 00:00:00 2001
From: Garima Singh <garima.singh@microsoft.com>
Date: Mon, 6 Apr 2020 16:59:51 +0000
Subject: commit-graph: add --changed-paths option to write subcommand

Add --changed-paths option to git commit-graph write. This option will
allow users to compute information about the paths that have changed
between a commit and its first parent, and write it into the commit graph
file. If the option is passed to the write subcommand we set the
COMMIT_GRAPH_WRITE_BLOOM_FILTERS flag and pass it down to the
commit-graph logic.

Helped-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Garima Singh <garima.singh@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>

diff --git a/Documentation/git-commit-graph.txt b/Documentation/git-commit-graph.txt
index 28d1fee..f4b13c0 100644
--- a/Documentation/git-commit-graph.txt
+++ b/Documentation/git-commit-graph.txt
@@ -57,6 +57,11 @@ or `--stdin-packs`.)
 With the `--append` option, include all commits that are present in the
 existing commit-graph file.
 +
+With the `--changed-paths` option, compute and write information about the
+paths changed between a commit and it's first parent. This operation can
+take a while on large repositories. It provides significant performance gains
+for getting history of a directory or a file with `git log -- <path>`.
++
 With the `--split` option, write the commit-graph as a chain of multiple
 commit-graph files stored in `<dir>/info/commit-graphs`. The new commits
 not already in the commit-graph are added in a new "tip" file. This file
diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c
index d1ab662..cacb5d0 100644
--- a/builtin/commit-graph.c
+++ b/builtin/commit-graph.c
@@ -9,7 +9,7 @@
 
 static char const * const builtin_commit_graph_usage[] = {
 	N_("git commit-graph verify [--object-dir <objdir>] [--shallow] [--[no-]progress]"),
-	N_("git commit-graph write [--object-dir <objdir>] [--append|--split] [--reachable|--stdin-packs|--stdin-commits] [--[no-]progress] <split options>"),
+	N_("git commit-graph write [--object-dir <objdir>] [--append|--split] [--reachable|--stdin-packs|--stdin-commits] [--changed-paths] [--[no-]progress] <split options>"),
 	NULL
 };
 
@@ -19,7 +19,7 @@ static const char * const builtin_commit_graph_verify_usage[] = {
 };
 
 static const char * const builtin_commit_graph_write_usage[] = {
-	N_("git commit-graph write [--object-dir <objdir>] [--append|--split] [--reachable|--stdin-packs|--stdin-commits] [--[no-]progress] <split options>"),
+	N_("git commit-graph write [--object-dir <objdir>] [--append|--split] [--reachable|--stdin-packs|--stdin-commits] [--changed-paths] [--[no-]progress] <split options>"),
 	NULL
 };
 
@@ -32,6 +32,7 @@ static struct opts_commit_graph {
 	int split;
 	int shallow;
 	int progress;
+	int enable_changed_paths;
 } opts;
 
 static struct object_directory *find_odb(struct repository *r,
@@ -135,6 +136,8 @@ static int graph_write(int argc, const char **argv)
 			N_("start walk at commits listed by stdin")),
 		OPT_BOOL(0, "append", &opts.append,
 			N_("include all commits already in the commit-graph file")),
+		OPT_BOOL(0, "changed-paths", &opts.enable_changed_paths,
+			N_("enable computation for changed paths")),
 		OPT_BOOL(0, "progress", &opts.progress, N_("force progress reporting")),
 		OPT_BOOL(0, "split", &opts.split,
 			N_("allow writing an incremental commit-graph file")),
@@ -168,6 +171,8 @@ static int graph_write(int argc, const char **argv)
 		flags |= COMMIT_GRAPH_WRITE_SPLIT;
 	if (opts.progress)
 		flags |= COMMIT_GRAPH_WRITE_PROGRESS;
+	if (opts.enable_changed_paths)
+		flags |= COMMIT_GRAPH_WRITE_BLOOM_FILTERS;
 
 	read_replace_refs = 0;
 	odb = find_odb(the_repository, opts.obj_dir);
-- 
cgit v0.10.2-6-g49f6


From a56b9464cd0a49317fafde080ae4e73c5430ac9b Mon Sep 17 00:00:00 2001
From: Garima Singh <garima.singh@microsoft.com>
Date: Mon, 6 Apr 2020 16:59:52 +0000
Subject: revision.c: use Bloom filters to speed up path based revision walks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Revision walk will now use Bloom filters for commits to speed up
revision walks for a particular path (for computing history for
that path), if they are present in the commit-graph file.

We load the Bloom filters during the prepare_revision_walk step,
currently only when dealing with a single pathspec. Extending
it to work with multiple pathspecs can be explored and built on
top of this series in the future.

While comparing trees in rev_compare_trees(), if the Bloom filter
says that the file is not different between the two trees, we don't
need to compute the expensive diff. This is where we get our
performance gains. The other response of the Bloom filter is '`:maybe',
in which case we fall back to the full diff calculation to determine
if the path was changed in the commit.

We do not try to use Bloom filters when the '--walk-reflogs' option
is specified. The '--walk-reflogs' option does not walk the commit
ancestry chain like the rest of the options. Incorporating the
performance gains when walking reflog entries would add more
complexity, and can be explored in a later series.

Performance Gains:
We tested the performance of `git log -- <path>` on the git repo, the linux
and some internal large repos, with a variety of paths of varying depths.

On the git and linux repos:
- we observed a 2x to 5x speed up.

On a large internal repo with files seated 6-10 levels deep in the tree:
- we observed 10x to 20x speed ups, with some paths going up to 28 times
  faster.

Helped-by: Derrick Stolee <dstolee@microsoft.com
Helped-by: SZEDER Gábor <szeder.dev@gmail.com>
Helped-by: Jonathan Tan <jonathantanmy@google.com>
Signed-off-by: Garima Singh <garima.singh@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>

diff --git a/bloom.c b/bloom.c
index 0f714dd..c5b461d 100644
--- a/bloom.c
+++ b/bloom.c
@@ -253,3 +253,23 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
 
 	return filter;
 }
+
+int bloom_filter_contains(const struct bloom_filter *filter,
+			  const struct bloom_key *key,
+			  const struct bloom_filter_settings *settings)
+{
+	int i;
+	uint64_t mod = filter->len * BITS_PER_WORD;
+
+	if (!mod)
+		return -1;
+
+	for (i = 0; i < settings->num_hashes; i++) {
+		uint64_t hash_mod = key->hashes[i] % mod;
+		uint64_t block_pos = hash_mod / BITS_PER_WORD;
+		if (!(filter->data[block_pos] & get_bitmask(hash_mod)))
+			return 0;
+	}
+
+	return 1;
+}
\ No newline at end of file
diff --git a/bloom.h b/bloom.h
index 760d712..b935186 100644
--- a/bloom.h
+++ b/bloom.h
@@ -83,4 +83,8 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
 				      struct commit *c,
 				      int compute_if_not_present);
 
+int bloom_filter_contains(const struct bloom_filter *filter,
+			  const struct bloom_key *key,
+			  const struct bloom_filter_settings *settings);
+
 #endif
\ No newline at end of file
diff --git a/revision.c b/revision.c
index 8136929..d3fcb7c 100644
--- a/revision.c
+++ b/revision.c
@@ -29,6 +29,7 @@
 #include "prio-queue.h"
 #include "hashmap.h"
 #include "utf8.h"
+#include "bloom.h"
 
 volatile show_early_output_fn_t show_early_output;
 
@@ -624,11 +625,80 @@ static void file_change(struct diff_options *options,
 	options->flags.has_changes = 1;
 }
 
+static void prepare_to_use_bloom_filter(struct rev_info *revs)
+{
+	struct pathspec_item *pi;
+	char *path_alloc = NULL;
+	const char *path;
+	int last_index;
+	int len;
+
+	if (!revs->commits)
+	    return;
+
+	repo_parse_commit(revs->repo, revs->commits->item);
+
+	if (!revs->repo->objects->commit_graph)
+		return;
+
+	revs->bloom_filter_settings = revs->repo->objects->commit_graph->bloom_filter_settings;
+	if (!revs->bloom_filter_settings)
+		return;
+
+	pi = &revs->pruning.pathspec.items[0];
+	last_index = pi->len - 1;
+
+	/* remove single trailing slash from path, if needed */
+	if (pi->match[last_index] == '/') {
+	    path_alloc = xstrdup(pi->match);
+	    path_alloc[last_index] = '\0';
+	    path = path_alloc;
+	} else
+	    path = pi->match;
+
+	len = strlen(path);
+
+	revs->bloom_key = xmalloc(sizeof(struct bloom_key));
+	fill_bloom_key(path, len, revs->bloom_key, revs->bloom_filter_settings);
+
+	free(path_alloc);
+}
+
+static int check_maybe_different_in_bloom_filter(struct rev_info *revs,
+						 struct commit *commit)
+{
+	struct bloom_filter *filter;
+	int result;
+
+	if (!revs->repo->objects->commit_graph)
+		return -1;
+
+	if (commit->generation == GENERATION_NUMBER_INFINITY)
+		return -1;
+
+	filter = get_bloom_filter(revs->repo, commit, 0);
+
+	if (!filter) {
+		return -1;
+	}
+
+	if (!filter->len) {
+		return -1;
+	}
+
+	result = bloom_filter_contains(filter,
+				       revs->bloom_key,
+				       revs->bloom_filter_settings);
+
+	return result;
+}
+
 static int rev_compare_tree(struct rev_info *revs,
-			    struct commit *parent, struct commit *commit)
+			    struct commit *parent, struct commit *commit, int nth_parent)
 {
 	struct tree *t1 = get_commit_tree(parent);
 	struct tree *t2 = get_commit_tree(commit);
+	int bloom_ret = 1;
 
 	if (!t1)
 		return REV_TREE_NEW;
@@ -653,11 +723,19 @@ static int rev_compare_tree(struct rev_info *revs,
 			return REV_TREE_SAME;
 	}
 
+	if (revs->bloom_key && !nth_parent) {
+		bloom_ret = check_maybe_different_in_bloom_filter(revs, commit);
+
+		if (bloom_ret == 0)
+			return REV_TREE_SAME;
+	}
+
 	tree_difference = REV_TREE_SAME;
 	revs->pruning.flags.has_changes = 0;
 	if (diff_tree_oid(&t1->object.oid, &t2->object.oid, "",
 			   &revs->pruning) < 0)
 		return REV_TREE_DIFFERENT;
+
 	return tree_difference;
 }
 
@@ -855,7 +933,7 @@ static void try_to_simplify_commit(struct rev_info *revs, struct commit *commit)
 			die("cannot simplify commit %s (because of %s)",
 			    oid_to_hex(&commit->object.oid),
 			    oid_to_hex(&p->object.oid));
-		switch (rev_compare_tree(revs, p, commit)) {
+		switch (rev_compare_tree(revs, p, commit, nth_parent)) {
 		case REV_TREE_SAME:
 			if (!revs->simplify_history || !relevant_commit(p)) {
 				/* Even if a merge with an uninteresting
@@ -3362,6 +3440,8 @@ int prepare_revision_walk(struct rev_info *revs)
 				       FOR_EACH_OBJECT_PROMISOR_ONLY);
 	}
 
+	if (revs->pruning.pathspec.nr == 1 && !revs->reflog_info)
+		prepare_to_use_bloom_filter(revs);
 	if (revs->no_walk != REVISION_WALK_NO_WALK_UNSORTED)
 		commit_list_sort_by_date(&revs->commits);
 	if (revs->no_walk)
@@ -3379,6 +3459,7 @@ int prepare_revision_walk(struct rev_info *revs)
 		simplify_merges(revs);
 	if (revs->children.name)
 		set_children(revs);
+
 	return 0;
 }
 
diff --git a/revision.h b/revision.h
index 475f048..7c026fe 100644
--- a/revision.h
+++ b/revision.h
@@ -56,6 +56,8 @@ struct repository;
 struct rev_info;
 struct string_list;
 struct saved_parents;
+struct bloom_key;
+struct bloom_filter_settings;
 define_shared_commit_slab(revision_sources, char *);
 
 struct rev_cmdline_info {
@@ -291,6 +293,15 @@ struct rev_info {
 	struct revision_sources *sources;
 
 	struct topo_walk_info *topo_walk_info;
+
+	/* Commit graph bloom filter fields */
+	/* The bloom filter key for the pathspec */
+	struct bloom_key *bloom_key;
+	/*
+	 * The bloom filter settings used to generate the key.
+	 * This is loaded from the commit-graph being used.
+	 */
+	struct bloom_filter_settings *bloom_filter_settings;
 };
 
 int ref_excluded(struct string_list *, const char *path);
-- 
cgit v0.10.2-6-g49f6


From 42e50e78c6fd8978c2218bbd7b3483ae51d5e3f9 Mon Sep 17 00:00:00 2001
From: Garima Singh <garima.singh@microsoft.com>
Date: Mon, 6 Apr 2020 16:59:53 +0000
Subject: revision.c: add trace2 stats around Bloom filter usage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add trace2 statistics around Bloom filter usage and behavior
for 'git log -- path' commands that are hoping to benefit from
the presence of computed changed paths Bloom filters.

These statistics are great for performance analysis work and
for formal testing, which we will see in the commit following
this one.

Helped-by: Derrick Stolee <dstolee@microsoft.com
Helped-by: SZEDER Gábor <szeder.dev@gmail.com>
Helped-by: Jonathan Tan <jonathantanmy@google.com>
Signed-off-by: Garima Singh <garima.singh@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>

diff --git a/revision.c b/revision.c
index d3fcb7c..2b06ee7 100644
--- a/revision.c
+++ b/revision.c
@@ -30,6 +30,7 @@
 #include "hashmap.h"
 #include "utf8.h"
 #include "bloom.h"
+#include "json-writer.h"
 
 volatile show_early_output_fn_t show_early_output;
 
@@ -625,6 +626,30 @@ static void file_change(struct diff_options *options,
 	options->flags.has_changes = 1;
 }
 
+static int bloom_filter_atexit_registered;
+static unsigned int count_bloom_filter_maybe;
+static unsigned int count_bloom_filter_definitely_not;
+static unsigned int count_bloom_filter_false_positive;
+static unsigned int count_bloom_filter_not_present;
+static unsigned int count_bloom_filter_length_zero;
+
+static void trace2_bloom_filter_statistics_atexit(void)
+{
+	struct json_writer jw = JSON_WRITER_INIT;
+
+	jw_object_begin(&jw, 0);
+	jw_object_intmax(&jw, "filter_not_present", count_bloom_filter_not_present);
+	jw_object_intmax(&jw, "zero_length_filter", count_bloom_filter_length_zero);
+	jw_object_intmax(&jw, "maybe", count_bloom_filter_maybe);
+	jw_object_intmax(&jw, "definitely_not", count_bloom_filter_definitely_not);
+	jw_object_intmax(&jw, "false_positive", count_bloom_filter_false_positive);
+	jw_end(&jw);
+
+	trace2_data_json("bloom", the_repository, "statistics", &jw);
+
+	jw_release(&jw);
+}
+
 static void prepare_to_use_bloom_filter(struct rev_info *revs)
 {
 	struct pathspec_item *pi;
@@ -661,6 +686,11 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs)
 	revs->bloom_key = xmalloc(sizeof(struct bloom_key));
 	fill_bloom_key(path, len, revs->bloom_key, revs->bloom_filter_settings);
 
+	if (trace2_is_enabled() && !bloom_filter_atexit_registered) {
+		atexit(trace2_bloom_filter_statistics_atexit);
+		bloom_filter_atexit_registered = 1;
+	}
+
 	free(path_alloc);
 }
 
@@ -679,10 +709,12 @@ static int check_maybe_different_in_bloom_filter(struct rev_info *revs,
 	filter = get_bloom_filter(revs->repo, commit, 0);
 
 	if (!filter) {
+		count_bloom_filter_not_present++;
 		return -1;
 	}
 
 	if (!filter->len) {
+		count_bloom_filter_length_zero++;
 		return -1;
 	}
 
@@ -690,6 +722,11 @@ static int check_maybe_different_in_bloom_filter(struct rev_info *revs,
 				       revs->bloom_key,
 				       revs->bloom_filter_settings);
 
+	if (result)
+		count_bloom_filter_maybe++;
+	else
+		count_bloom_filter_definitely_not++;
+
 	return result;
 }
 
@@ -736,6 +773,10 @@ static int rev_compare_tree(struct rev_info *revs,
 			   &revs->pruning) < 0)
 		return REV_TREE_DIFFERENT;
 
+	if (!nth_parent)
+		if (bloom_ret == 1 && tree_difference == REV_TREE_SAME)
+			count_bloom_filter_false_positive++;
+
 	return tree_difference;
 }
 
-- 
cgit v0.10.2-6-g49f6


From a759bfa9eeb2a080d7c5c0a3c4096db5438c06bf Mon Sep 17 00:00:00 2001
From: Garima Singh <garima.singh@microsoft.com>
Date: Mon, 6 Apr 2020 16:59:54 +0000
Subject: t4216: add end to end tests for git log with Bloom filters

These tests exercises writing commit graph with Bloom filters
and exercises 'git log -- path' with all the applicable
options. They check that the output is the same with and
without Bloom filters, confirm Bloom filters were used by
checking if trace2 statistics were logged correctly.

Also confirms cases where Bloom filters are not used:
1. Multiple path specs,
2. --walk-reflogs (see patch titled 'revision.c: use Bloom filters...'
   for details,
3. If the latest commit graph does not have Bloom filters

Signed-off-by: Garima Singh <garima.singh@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>

diff --git a/t/helper/test-read-graph.c b/t/helper/test-read-graph.c
index f8a4617..4223ff3 100644
--- a/t/helper/test-read-graph.c
+++ b/t/helper/test-read-graph.c
@@ -45,6 +45,10 @@ int cmd__read_graph(int argc, const char **argv)
 		printf(" commit_metadata");
 	if (graph->chunk_extra_edges)
 		printf(" extra_edges");
+	if (graph->chunk_bloom_indexes)
+		printf(" bloom_indexes");
+	if (graph->chunk_bloom_data)
+		printf(" bloom_data");
 	printf("\n");
 
 	UNLEAK(graph);
diff --git a/t/t4216-log-bloom.sh b/t/t4216-log-bloom.sh
new file mode 100755
index 0000000..c7011f3
--- /dev/null
+++ b/t/t4216-log-bloom.sh
@@ -0,0 +1,155 @@
+#!/bin/sh
+
+test_description='git log for a path with Bloom filters'
+. ./test-lib.sh
+
+GIT_TEST_COMMIT_GRAPH=0
+GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=0
+
+test_expect_success 'setup test - repo, commits, commit graph, log outputs' '
+	git init &&
+	mkdir A A/B A/B/C &&
+	test_commit c1 A/file1 &&
+	test_commit c2 A/B/file2 &&
+	test_commit c3 A/B/C/file3 &&
+	test_commit c4 A/file1 &&
+	test_commit c5 A/B/file2 &&
+	test_commit c6 A/B/C/file3 &&
+	test_commit c7 A/file1 &&
+	test_commit c8 A/B/file2 &&
+	test_commit c9 A/B/C/file3 &&
+	test_commit c10 file_to_be_deleted &&
+	git checkout -b side HEAD~4 &&
+	test_commit side-1 file4 &&
+	git checkout master &&
+	git merge side &&
+	test_commit c11 file5 &&
+	mv file5 file5_renamed &&
+	git add file5_renamed &&
+	git commit -m "rename" &&
+	rm file_to_be_deleted &&
+	git add . &&
+	git commit -m "file removed" &&
+	git commit-graph write --reachable --changed-paths
+'
+graph_read_expect () {
+	NUM_CHUNKS=5
+	cat >expect <<- EOF
+	header: 43475048 1 1 $NUM_CHUNKS 0
+	num_commits: $1
+	chunks: oid_fanout oid_lookup commit_metadata bloom_indexes bloom_data
+	EOF
+	test-tool read-graph >actual &&
+	test_cmp expect actual
+}
+
+test_expect_success 'commit-graph write wrote out the bloom chunks' '
+	graph_read_expect 15
+'
+
+# Turn off any inherited trace2 settings for this test.
+sane_unset GIT_TRACE2 GIT_TRACE2_PERF GIT_TRACE2_EVENT
+sane_unset GIT_TRACE2_PERF_BRIEF
+sane_unset GIT_TRACE2_CONFIG_PARAMS
+
+setup () {
+	rm "$TRASH_DIRECTORY/trace.perf"
+	git -c core.commitGraph=false log --pretty="format:%s" $1 >log_wo_bloom &&
+	GIT_TRACE2_PERF="$TRASH_DIRECTORY/trace.perf" git -c core.commitGraph=true log --pretty="format:%s" $1 >log_w_bloom
+}
+
+test_bloom_filters_used () {
+	log_args=$1
+	bloom_trace_prefix="statistics:{\"filter_not_present\":0,\"zero_length_filter\":0,\"maybe\""
+	setup "$log_args" &&
+	grep -q "$bloom_trace_prefix" "$TRASH_DIRECTORY/trace.perf" &&
+	test_cmp log_wo_bloom log_w_bloom &&
+    test_path_is_file "$TRASH_DIRECTORY/trace.perf"
+}
+
+test_bloom_filters_not_used () {
+	log_args=$1
+	setup "$log_args" &&
+	!(grep -q "statistics:{\"filter_not_present\":" "$TRASH_DIRECTORY/trace.perf") &&
+	test_cmp log_wo_bloom log_w_bloom
+}
+
+for path in A A/B A/B/C A/file1 A/B/file2 A/B/C/file3 file4 file5 file5_renamed file_to_be_deleted
+do
+	for option in "" \
+	      "--all" \
+		      "--full-history" \
+		      "--full-history --simplify-merges" \
+		      "--simplify-merges" \
+		      "--simplify-by-decoration" \
+		      "--follow" \
+		      "--first-parent" \
+		      "--topo-order" \
+		      "--date-order" \
+		      "--author-date-order" \
+		      "--ancestry-path side..master"
+	do
+		test_expect_success "git log option: $option for path: $path" '
+			test_bloom_filters_used "$option -- $path"
+		'
+	done
+done
+
+test_expect_success 'git log -- folder works with and without the trailing slash' '
+	test_bloom_filters_used "-- A" &&
+	test_bloom_filters_used "-- A/"
+'
+
+test_expect_success 'git log for path that does not exist. ' '
+	test_bloom_filters_used "-- path_does_not_exist"
+'
+
+test_expect_success 'git log with --walk-reflogs does not use Bloom filters' '
+	test_bloom_filters_not_used "--walk-reflogs -- A"
+'
+
+test_expect_success 'git log -- multiple path specs does not use Bloom filters' '
+	test_bloom_filters_not_used "-- file4 A/file1"
+'
+
+test_expect_success 'git log with wildcard that resolves to a single path uses Bloom filters' '
+	test_bloom_filters_used "-- *4" &&
+	test_bloom_filters_used "-- *renamed"
+'
+
+test_expect_success 'git log with wildcard that resolves to a multiple paths does not uses Bloom filters' '
+	test_bloom_filters_not_used "-- *" &&
+	test_bloom_filters_not_used "-- file*"
+'
+
+test_expect_success 'setup - add commit-graph to the chain without Bloom filters' '
+	test_commit c14 A/anotherFile2 &&
+	test_commit c15 A/B/anotherFile2 &&
+	test_commit c16 A/B/C/anotherFile2 &&
+	GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=0 git commit-graph write --reachable --split &&
+	test_line_count = 2 .git/objects/info/commit-graphs/commit-graph-chain
+'
+
+test_expect_success 'Do not use Bloom filters if the latest graph does not have Bloom filters.' '
+	test_bloom_filters_not_used "-- A/B"
+'
+
+test_expect_success 'setup - add commit-graph to the chain with Bloom filters' '
+	test_commit c17 A/anotherFile3 &&
+	git commit-graph write --reachable --changed-paths --split &&
+	test_line_count = 3 .git/objects/info/commit-graphs/commit-graph-chain
+'
+
+test_bloom_filters_used_when_some_filters_are_missing () {
+	log_args=$1
+	bloom_trace_prefix="statistics:{\"filter_not_present\":3,\"zero_length_filter\":0,\"maybe\":8,\"definitely_not\":6"
+	setup "$log_args" &&
+	grep -q "$bloom_trace_prefix" "$TRASH_DIRECTORY/trace.perf" &&
+	test_cmp log_wo_bloom log_w_bloom
+}
+
+test_expect_success 'Use Bloom filters if they exist in the latest but not all commit graphs in the chain.' '
+	test_bloom_filters_used_when_some_filters_are_missing "-- A/B"
+'
+
+test_done
\ No newline at end of file
-- 
cgit v0.10.2-6-g49f6


From d5b873c832d832e44523d1d2a9d29afe2b84c84f Mon Sep 17 00:00:00 2001
From: Garima Singh <garima.singh@microsoft.com>
Date: Mon, 6 Apr 2020 16:59:55 +0000
Subject: commit-graph: add GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS test flag

Add GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS test flag to the test setup suite
in order to toggle writing Bloom filters when running any of the git tests.
If set to true, we will compute and write Bloom filters every time a test
calls `git commit-graph write`, as if the `--changed-paths` option was
passed in.

The test suite passes when GIT_TEST_COMMIT_GRAPH and
GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS are enabled.

Helped-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Garima Singh <garima.singh@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>

diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c
index cacb5d0..5900983 100644
--- a/builtin/commit-graph.c
+++ b/builtin/commit-graph.c
@@ -171,7 +171,8 @@ static int graph_write(int argc, const char **argv)
 		flags |= COMMIT_GRAPH_WRITE_SPLIT;
 	if (opts.progress)
 		flags |= COMMIT_GRAPH_WRITE_PROGRESS;
-	if (opts.enable_changed_paths)
+	if (opts.enable_changed_paths ||
+	    git_env_bool(GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS, 0))
 		flags |= COMMIT_GRAPH_WRITE_BLOOM_FILTERS;
 
 	read_replace_refs = 0;
diff --git a/ci/run-build-and-tests.sh b/ci/run-build-and-tests.sh
index 4df54c4..17e25aa 100755
--- a/ci/run-build-and-tests.sh
+++ b/ci/run-build-and-tests.sh
@@ -19,6 +19,7 @@ linux-gcc)
 	export GIT_TEST_OE_SIZE=10
 	export GIT_TEST_OE_DELTA_SIZE=5
 	export GIT_TEST_COMMIT_GRAPH=1
+	export GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=1
 	export GIT_TEST_MULTI_PACK_INDEX=1
 	export GIT_TEST_ADD_I_USE_BUILTIN=1
 	make test
diff --git a/commit-graph.h b/commit-graph.h
index 8e7a8e0..8655d06 100644
--- a/commit-graph.h
+++ b/commit-graph.h
@@ -9,6 +9,7 @@
 
 #define GIT_TEST_COMMIT_GRAPH "GIT_TEST_COMMIT_GRAPH"
 #define GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD "GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD"
+#define GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS "GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS"
 
 struct commit;
 struct bloom_filter_settings;
diff --git a/t/README b/t/README
index da5b24f..8ad9bc1 100644
--- a/t/README
+++ b/t/README
@@ -378,6 +378,11 @@ GIT_TEST_COMMIT_GRAPH=<boolean>, when true, forces the commit-graph to
 be written after every 'git commit' command, and overrides the
 'core.commitGraph' setting to true.
 
+GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=<boolean>, when true, forces
+commit-graph write to compute and write changed path Bloom filters for
+every 'git commit-graph write', as if the `--changed-paths` option was
+passed in.
+
 GIT_TEST_FSMONITOR=$PWD/t7519/fsmonitor-all exercises the fsmonitor
 code path for utilizing a file system monitor to speed up detecting
 new or changed files.
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 9bf920a..18304a6 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -3,6 +3,8 @@
 test_description='commit graph'
 . ./test-lib.sh
 
+GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=0
+
 test_expect_success 'setup full repo' '
 	mkdir full &&
 	cd "$TRASH_DIRECTORY/full" &&
diff --git a/t/t5324-split-commit-graph.sh b/t/t5324-split-commit-graph.sh
index 53b2e6b..d3f1f2c 100755
--- a/t/t5324-split-commit-graph.sh
+++ b/t/t5324-split-commit-graph.sh
@@ -4,6 +4,7 @@ test_description='split commit graph'
 . ./test-lib.sh
 
 GIT_TEST_COMMIT_GRAPH=0
+GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=0
 
 test_expect_success 'setup repo' '
 	git init &&
-- 
cgit v0.10.2-6-g49f6


From caf388caa101be90b7ec43d7f78ca4e935fc0150 Mon Sep 17 00:00:00 2001
From: Derrick Stolee <dstolee@microsoft.com>
Date: Thu, 9 Apr 2020 13:00:11 +0000
Subject: bloom: ignore renames when computing changed paths

The changed-path Bloom filters record an entry in the filter for
every path that was changed. This includes every add and delete,
regardless of whether a rename was detected. Detecting renames
causes significant performance issues, but also will trigger
downloading missing blobs in partial clone.

The simple fix is to disable rename detection when computing a
changed-path Bloom filter. This should already be disabled by
default, but it is good to explicitly enforce the intended
behavior.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>

diff --git a/bloom.c b/bloom.c
index c5b461d..dd9bab9 100644
--- a/bloom.c
+++ b/bloom.c
@@ -189,6 +189,7 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
 
 	repo_diff_setup(r, &diffopt);
 	diffopt.flags.recursive = 1;
+	diffopt.detect_rename = 0;
 	diffopt.max_changes = max_changes;
 	diff_setup_done(&diffopt);
 
-- 
cgit v0.10.2-6-g49f6