summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJunio C Hamano <gitster@pobox.com>2020-02-21 02:06:05 (GMT)
committerJunio C Hamano <gitster@pobox.com>2020-02-21 02:06:05 (GMT)
commiteb33976b5c8a4de3bc94e696f37f682a7c066c30 (patch)
tree14028b1f26d29b151bb15105d5dbfb4b72f4ca5e
parent0321685eba14150ecdb33de3e6a18abc38bd33a3 (diff)
parent80214a40c4a95af2b2f1b953cbc95740af20f630 (diff)
downloadgit-eb33976b5c8a4de3bc94e696f37f682a7c066c30.zip
git-eb33976b5c8a4de3bc94e696f37f682a7c066c30.tar.gz
git-eb33976b5c8a4de3bc94e696f37f682a7c066c30.tar.bz2
Merge branch 'gs/commit-graph-path-filter' into pu
Introduce an extension to the commit-graph to make it efficient to check for the paths that were modified at each commit using Bloom filters. * gs/commit-graph-path-filter: (bytesex breakage band-aid) commit-graph: add GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS test flag revision.c: use Bloom filters to speed up path based revision walks commit-graph: add --changed-paths option to write subcommand commit-graph: reuse existing Bloom filters during write. commit-graph: write Bloom filters to commit graph file commit-graph: examine commits by generation number commit-graph: examine changed-path objects in pack order commit-graph: compute Bloom filters for changed paths diff: halt tree-diff early after max_changes bloom: core Bloom filter implementation for changed paths commit-graph: use MAX_NUM_CHUNKS
-rw-r--r--Documentation/git-commit-graph.txt5
-rw-r--r--Documentation/technical/commit-graph-format.txt24
-rw-r--r--Makefile2
-rw-r--r--bloom.c277
-rw-r--r--bloom.h58
-rw-r--r--builtin/commit-graph.c8
-rwxr-xr-xci/run-build-and-tests.sh1
-rw-r--r--commit-graph.c211
-rw-r--r--commit-graph.h9
-rw-r--r--diff.h5
-rw-r--r--revision.c124
-rw-r--r--revision.h11
-rw-r--r--t/README5
-rw-r--r--t/helper/test-bloom.c84
-rw-r--r--t/helper/test-read-graph.c4
-rw-r--r--t/helper/test-tool.c1
-rw-r--r--t/helper/test-tool.h1
-rwxr-xr-xt/t0095-bloom.sh113
-rwxr-xr-xt/t4216-log-bloom.sh143
-rwxr-xr-xt/t5318-commit-graph.sh2
-rwxr-xr-xt/t5324-split-commit-graph.sh1
-rw-r--r--tree-diff.c6
22 files changed, 1088 insertions, 7 deletions
diff --git a/Documentation/git-commit-graph.txt b/Documentation/git-commit-graph.txt
index b210cef..717d52c 100644
--- a/Documentation/git-commit-graph.txt
+++ b/Documentation/git-commit-graph.txt
@@ -76,6 +76,11 @@ conducted, and the remaining options are ignored. Conversely, if
remaining options are ignored. A bare `--split` defers to the remaining
options.
+
+With the `--changed-paths` option, compute and write information about the
+paths changed between a commit and it's first parent. This operation can
+take a while on large repositories. It provides significant performance gains
+for getting history of a directory or a file with `git log -- <path>`.
++
* If `--size-multiple=<X>` is not specified, let `X` equal 2. If the new
tip file would have `N` commits and the previous tip has `M` commits and
`X` times `N` is greater than `M`, instead merge the two files into a
diff --git a/Documentation/technical/commit-graph-format.txt b/Documentation/technical/commit-graph-format.txt
index a4f1744..22e5116 100644
--- a/Documentation/technical/commit-graph-format.txt
+++ b/Documentation/technical/commit-graph-format.txt
@@ -17,6 +17,9 @@ metadata, including:
- The parents of the commit, stored using positional references within
the graph file.
+- The Bloom filter of the commit carrying the paths that were changed between
+ the commit and its first parent.
+
These positional references are stored as unsigned 32-bit integers
corresponding to the array position within the list of commit OIDs. Due
to some special constants we use to track parents, we can store at most
@@ -93,6 +96,27 @@ CHUNK DATA:
positions for the parents until reaching a value with the most-significant
bit on. The other bits correspond to the position of the last parent.
+ Bloom Filter Index (ID: {'B', 'I', 'D', 'X'}) (N * 4 bytes) [Optional]
+ * The ith entry, BIDX[i], stores the number of 8-byte word blocks in all
+ Bloom filters from commit 0 to commit i (inclusive) in lexicographic
+ order. The Bloom filter for the i-th commit spans from BIDX[i-1] to
+ BIDX[i] (plus header length), where BIDX[-1] is 0.
+ * The BIDX chunk is ignored if the BDAT chunk is not present.
+
+ Bloom Filter Data (ID: {'B', 'D', 'A', 'T'}) [Optional]
+ * It starts with header consisting of three unsigned 32-bit integers:
+ - Version of the hash algorithm being used. We currently only support
+ value 1 which implies the murmur3 hash implemented exactly as described
+ in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
+ - The number of times a path is hashed and hence the number of bit positions
+ that cumulatively determine whether a file is present in the commit.
+ - The minimum number of bits 'b' per entry in the Bloom filter. If the filter
+ contains 'n' entries, then the filter size is the minimum number of 64-bit
+ words that contain n*b bits.
+ * The rest of the chunk is the concatenation of all the computed Bloom
+ filters for the commits in lexicographic order.
+ * The BDAT chunk is present iff BIDX is present.
+
Base Graphs List (ID: {'B', 'A', 'S', 'E'}) [Optional]
This list of H-byte hashes describe a set of B commit-graph files that
form a commit-graph chain. The graph position for the ith commit in this
diff --git a/Makefile b/Makefile
index 339e09c..e9cbdf3 100644
--- a/Makefile
+++ b/Makefile
@@ -696,6 +696,7 @@ X =
PROGRAMS += $(patsubst %.o,git-%$X,$(PROGRAM_OBJS))
+TEST_BUILTINS_OBJS += test-bloom.o
TEST_BUILTINS_OBJS += test-advise.o
TEST_BUILTINS_OBJS += test-chmtime.o
TEST_BUILTINS_OBJS += test-config.o
@@ -844,6 +845,7 @@ LIB_OBJS += base85.o
LIB_OBJS += bisect.o
LIB_OBJS += blame.o
LIB_OBJS += blob.o
+LIB_OBJS += bloom.o
LIB_OBJS += branch.o
LIB_OBJS += bulk-checkin.o
LIB_OBJS += bundle.o
diff --git a/bloom.c b/bloom.c
new file mode 100644
index 0000000..0a59d9f
--- /dev/null
+++ b/bloom.c
@@ -0,0 +1,277 @@
+#include "git-compat-util.h"
+#include "bloom.h"
+#include "commit.h"
+#include "commit-slab.h"
+#include "commit-graph.h"
+#include "object-store.h"
+#include "diff.h"
+#include "diffcore.h"
+#include "revision.h"
+#include "hashmap.h"
+
+define_commit_slab(bloom_filter_slab, struct bloom_filter);
+
+struct bloom_filter_slab bloom_filters;
+
+struct pathmap_hash_entry {
+ struct hashmap_entry entry;
+ const char path[FLEX_ARRAY];
+};
+
+static uint32_t rotate_right(uint32_t value, int32_t count)
+{
+ uint32_t mask = 8 * sizeof(uint32_t) - 1;
+ count &= mask;
+ return ((value >> count) | (value << ((-count) & mask)));
+}
+
+/*
+ * Calculate a hash value for the given data using the given seed.
+ * Produces a uniformly distributed hash value.
+ * Not considered to be cryptographically secure.
+ * Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
+ **/
+static uint32_t seed_murmur3(uint32_t seed, const char *data, int len)
+{
+ const uint32_t c1 = 0xcc9e2d51;
+ const uint32_t c2 = 0x1b873593;
+ const uint32_t r1 = 15;
+ const uint32_t r2 = 13;
+ const uint32_t m = 5;
+ const uint32_t n = 0xe6546b64;
+ int i;
+ uint32_t k1 = 0;
+ const char *tail;
+
+ int len4 = len / sizeof(uint32_t);
+
+ uint32_t k;
+ for (i = 0; i < len4; i++) {
+ uint32_t byte1 = (uint32_t)data[4*i];
+ uint32_t byte2 = ((uint32_t)data[4*i + 1]) << 8;
+ uint32_t byte3 = ((uint32_t)data[4*i + 2]) << 16;
+ uint32_t byte4 = ((uint32_t)data[4*i + 3]) << 24;
+ k = byte1 | byte2 | byte3 | byte4;
+ k *= c1;
+ k = rotate_right(k, r1);
+ k *= c2;
+
+ seed ^= k;
+ seed = rotate_right(seed, r2) * m + n;
+ }
+
+ tail = (data + len4 * sizeof(uint32_t));
+
+ switch (len & (sizeof(uint32_t) - 1)) {
+ case 3:
+ k1 ^= ((uint32_t)tail[2]) << 16;
+ /*-fallthrough*/
+ case 2:
+ k1 ^= ((uint32_t)tail[1]) << 8;
+ /*-fallthrough*/
+ case 1:
+ k1 ^= ((uint32_t)tail[0]) << 0;
+ k1 *= c1;
+ k1 = rotate_right(k1, r1);
+ k1 *= c2;
+ seed ^= k1;
+ break;
+ }
+
+ seed ^= (uint32_t)len;
+ seed ^= (seed >> 16);
+ seed *= 0x85ebca6b;
+ seed ^= (seed >> 13);
+ seed *= 0xc2b2ae35;
+ seed ^= (seed >> 16);
+
+ return seed;
+}
+
+static inline unsigned char get_bitmask(uint32_t pos)
+{
+ return ((unsigned char)1) << (pos & (BITS_PER_WORD - 1));
+}
+
+void load_bloom_filters(void)
+{
+ init_bloom_filter_slab(&bloom_filters);
+}
+
+void fill_bloom_key(const char *data,
+ int len,
+ struct bloom_key *key,
+ struct bloom_filter_settings *settings)
+{
+ int i;
+ const uint32_t seed0 = 0x293ae76f;
+ const uint32_t seed1 = 0x7e646e2c;
+ const uint32_t hash0 = seed_murmur3(seed0, data, len);
+ const uint32_t hash1 = seed_murmur3(seed1, data, len);
+
+ key->hashes = (uint32_t *)xcalloc(settings->num_hashes, sizeof(uint32_t));
+ for (i = 0; i < settings->num_hashes; i++)
+ key->hashes[i] = hash0 + i * hash1;
+}
+
+void add_key_to_filter(struct bloom_key *key,
+ struct bloom_filter *filter,
+ struct bloom_filter_settings *settings)
+{
+ int i;
+ uint64_t mod = filter->len * BITS_PER_WORD;
+
+ for (i = 0; i < settings->num_hashes; i++) {
+ uint64_t hash_mod = key->hashes[i] % mod;
+ uint64_t block_pos = hash_mod / BITS_PER_WORD;
+
+ filter->data[block_pos] |= get_bitmask(hash_mod);
+ }
+}
+
+static int load_bloom_filter_from_graph(struct commit_graph *g,
+ struct bloom_filter *filter,
+ struct commit *c)
+{
+ uint32_t lex_pos, start_index, end_index;
+
+ while (c->graph_pos < g->num_commits_in_base)
+ g = g->base_graph;
+
+ /* The commit graph commit 'c' lives in doesn't carry bloom filters. */
+ if (!g->chunk_bloom_indexes)
+ return 0;
+
+ lex_pos = c->graph_pos - g->num_commits_in_base;
+
+ end_index = get_be32(g->chunk_bloom_indexes + 4 * lex_pos);
+
+ if (lex_pos)
+ start_index = get_be32(g->chunk_bloom_indexes + 4 * (lex_pos - 1));
+ else
+ start_index = 0;
+
+ filter->len = end_index - start_index;
+ filter->data = (unsigned char *)(g->chunk_bloom_data +
+ sizeof(unsigned char) * start_index +
+ BLOOMDATA_CHUNK_HEADER_SIZE);
+
+ return 1;
+}
+
+struct bloom_filter *get_bloom_filter(struct repository *r,
+ struct commit *c,
+ int compute_if_not_present)
+{
+ struct bloom_filter *filter;
+ struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS;
+ int i;
+ struct diff_options diffopt;
+ int max_changes = 512;
+
+ if (!bloom_filters.slab_size)
+ return NULL;
+
+ filter = bloom_filter_slab_at(&bloom_filters, c);
+
+ if (!filter->data) {
+ load_commit_graph_info(r, c);
+ if (c->graph_pos != COMMIT_NOT_FROM_GRAPH &&
+ r->objects->commit_graph->chunk_bloom_indexes) {
+ if (load_bloom_filter_from_graph(r->objects->commit_graph, filter, c))
+ return filter;
+ else
+ return NULL;
+ }
+ }
+
+ if (filter->data || !compute_if_not_present)
+ return filter;
+
+ repo_diff_setup(r, &diffopt);
+ diffopt.flags.recursive = 1;
+ diffopt.max_changes = max_changes;
+ diff_setup_done(&diffopt);
+
+ if (c->parents)
+ diff_tree_oid(&c->parents->item->object.oid, &c->object.oid, "", &diffopt);
+ else
+ diff_tree_oid(NULL, &c->object.oid, "", &diffopt);
+ diffcore_std(&diffopt);
+
+ if (diff_queued_diff.nr <= max_changes) {
+ struct hashmap pathmap;
+ struct pathmap_hash_entry* e;
+ struct hashmap_iter iter;
+ hashmap_init(&pathmap, NULL, NULL, 0);
+
+ for (i = 0; i < diff_queued_diff.nr; i++) {
+ const char* path = diff_queued_diff.queue[i]->two->path;
+ const char* p = path;
+
+ /*
+ * Add each leading directory of the changed file, i.e. for
+ * 'dir/subdir/file' add 'dir' and 'dir/subdir' as well, so
+ * the Bloom filter could be used to speed up commands like
+ * 'git log dir/subdir', too.
+ *
+ * Note that directories are added without the trailing '/'.
+ */
+ do {
+ char* last_slash = strrchr(p, '/');
+
+ FLEX_ALLOC_STR(e, path, path);
+ hashmap_entry_init(&e->entry, strhash(p));
+ hashmap_add(&pathmap, &e->entry);
+
+ if (!last_slash)
+ last_slash = (char*)p;
+ *last_slash = '\0';
+
+ } while (*p);
+
+ diff_free_filepair(diff_queued_diff.queue[i]);
+ }
+
+ filter->len = (hashmap_get_size(&pathmap) * settings.bits_per_entry + BITS_PER_WORD - 1) / BITS_PER_WORD;
+ filter->data = xcalloc(filter->len, sizeof(unsigned char));
+
+ hashmap_for_each_entry(&pathmap, &iter, e, entry) {
+ struct bloom_key key;
+ fill_bloom_key(e->path, strlen(e->path), &key, &settings);
+ add_key_to_filter(&key, filter, &settings);
+ }
+
+ hashmap_free_entries(&pathmap, struct pathmap_hash_entry, entry);
+ } else {
+ for (i = 0; i < diff_queued_diff.nr; i++)
+ diff_free_filepair(diff_queued_diff.queue[i]);
+ filter->data = NULL;
+ filter->len = 0;
+ }
+
+ free(diff_queued_diff.queue);
+ DIFF_QUEUE_CLEAR(&diff_queued_diff);
+
+ return filter;
+}
+
+int bloom_filter_contains(struct bloom_filter *filter,
+ struct bloom_key *key,
+ struct bloom_filter_settings *settings)
+{
+ int i;
+ uint64_t mod = filter->len * BITS_PER_WORD;
+
+ if (!mod)
+ return -1;
+
+ for (i = 0; i < settings->num_hashes; i++) {
+ uint64_t hash_mod = key->hashes[i] % mod;
+ uint64_t block_pos = hash_mod / BITS_PER_WORD;
+ if (!(filter->data[block_pos] & get_bitmask(hash_mod)))
+ return 0;
+ }
+
+ return 1;
+}
diff --git a/bloom.h b/bloom.h
new file mode 100644
index 0000000..9604723
--- /dev/null
+++ b/bloom.h
@@ -0,0 +1,58 @@
+#ifndef BLOOM_H
+#define BLOOM_H
+
+struct commit;
+struct repository;
+struct commit_graph;
+
+struct bloom_filter_settings {
+ uint32_t hash_version;
+ uint32_t num_hashes;
+ uint32_t bits_per_entry;
+};
+
+#define DEFAULT_BLOOM_FILTER_SETTINGS { 1, 7, 10 }
+#define BITS_PER_WORD 8
+#define BLOOMDATA_CHUNK_HEADER_SIZE 3*sizeof(uint32_t)
+
+/*
+ * A bloom_filter struct represents a data segment to
+ * use when testing hash values. The 'len' member
+ * dictates how many uint64_t entries are stored in
+ * 'data'.
+ */
+struct bloom_filter {
+ unsigned char *data;
+ int len;
+};
+
+/*
+ * A bloom_key represents the k hash values for a
+ * given hash input. These can be precomputed and
+ * stored in a bloom_key for re-use when testing
+ * against a bloom_filter.
+ */
+struct bloom_key {
+ uint32_t *hashes;
+};
+
+void load_bloom_filters(void);
+
+void fill_bloom_key(const char *data,
+ int len,
+ struct bloom_key *key,
+ struct bloom_filter_settings *settings);
+
+void add_key_to_filter(struct bloom_key *key,
+ struct bloom_filter *filter,
+ struct bloom_filter_settings *settings);
+
+struct bloom_filter *get_bloom_filter(struct repository *r,
+ struct commit *c,
+ int compute_if_not_present);
+
+int bloom_filter_contains(struct bloom_filter *filter,
+ struct bloom_key *key,
+ struct bloom_filter_settings *settings);
+
+#endif
diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c
index a71af88..70f1fb2 100644
--- a/builtin/commit-graph.c
+++ b/builtin/commit-graph.c
@@ -12,6 +12,7 @@ static char const * const builtin_commit_graph_usage[] = {
N_("git commit-graph write [--object-dir <objdir>] "
"[--split[=<strategy>]] "
"[--input=<reachable|stdin-packs|stdin-commits|append|none>] "
+ "[--changed-paths] "
"[--[no-]progress] <split options>"),
NULL
};
@@ -25,6 +26,7 @@ static const char * const builtin_commit_graph_write_usage[] = {
N_("git commit-graph write [--object-dir <objdir>] "
"[--split[=<strategy>]] "
"[--input=<reachable|stdin-packs|stdin-commits|append|none>] "
+ "[--changed-paths] "
"[--[no-]progress] <split options>"),
NULL
};
@@ -43,6 +45,7 @@ static struct opts_commit_graph {
int split;
int shallow;
int progress;
+ int enable_changed_paths;
} opts;
static struct object_directory *find_odb(struct repository *r,
@@ -195,6 +198,8 @@ static int graph_write(int argc, const char **argv)
OPT_BIT(0, "append", &opts.input,
N_("include all commits already in the commit-graph file"),
COMMIT_GRAPH_INPUT_APPEND),
+ OPT_BOOL(0, "changed-paths", &opts.enable_changed_paths,
+ N_("enable computation for changed paths")),
OPT_BOOL(0, "progress", &opts.progress, N_("force progress reporting")),
OPT_CALLBACK_F(0, "split", &split_opts.flags, NULL,
N_("allow writing an incremental commit-graph file"),
@@ -234,6 +239,9 @@ static int graph_write(int argc, const char **argv)
flags |= COMMIT_GRAPH_WRITE_SPLIT;
if (opts.progress)
flags |= COMMIT_GRAPH_WRITE_PROGRESS;
+ if (opts.enable_changed_paths ||
+ git_env_bool(GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS, 0))
+ flags |= COMMIT_GRAPH_WRITE_BLOOM_FILTERS;
read_replace_refs = 0;
odb = find_odb(the_repository, opts.obj_dir);
diff --git a/ci/run-build-and-tests.sh b/ci/run-build-and-tests.sh
index 4df54c4..17e25aa 100755
--- a/ci/run-build-and-tests.sh
+++ b/ci/run-build-and-tests.sh
@@ -19,6 +19,7 @@ linux-gcc)
export GIT_TEST_OE_SIZE=10
export GIT_TEST_OE_DELTA_SIZE=5
export GIT_TEST_COMMIT_GRAPH=1
+ export GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=1
export GIT_TEST_MULTI_PACK_INDEX=1
export GIT_TEST_ADD_I_USE_BUILTIN=1
make test
diff --git a/commit-graph.c b/commit-graph.c
index 417b7ea..b1e2a92 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -16,13 +16,18 @@
#include "hashmap.h"
#include "replace-object.h"
#include "progress.h"
+#include "bloom.h"
+#include "commit-slab.h"
#define GRAPH_SIGNATURE 0x43475048 /* "CGPH" */
#define GRAPH_CHUNKID_OIDFANOUT 0x4f494446 /* "OIDF" */
#define GRAPH_CHUNKID_OIDLOOKUP 0x4f49444c /* "OIDL" */
#define GRAPH_CHUNKID_DATA 0x43444154 /* "CDAT" */
#define GRAPH_CHUNKID_EXTRAEDGES 0x45444745 /* "EDGE" */
+#define GRAPH_CHUNKID_BLOOMINDEXES 0x42494458 /* "BIDX" */
+#define GRAPH_CHUNKID_BLOOMDATA 0x42444154 /* "BDAT" */
#define GRAPH_CHUNKID_BASE 0x42415345 /* "BASE" */
+#define MAX_NUM_CHUNKS 7
#define GRAPH_DATA_WIDTH (the_hash_algo->rawsz + 16)
@@ -44,6 +49,48 @@
/* Remember to update object flag allocation in object.h */
#define REACHABLE (1u<<15)
+/* Keep track of the order in which commits are added to our list. */
+define_commit_slab(commit_pos, int);
+static struct commit_pos commit_pos = COMMIT_SLAB_INIT(1, commit_pos);
+
+static void set_commit_pos(struct repository *r, const struct object_id *oid)
+{
+ static int32_t max_pos;
+ struct commit *commit = lookup_commit(r, oid);
+
+ if (!commit)
+ return; /* should never happen, but be lenient */
+
+ *commit_pos_at(&commit_pos, commit) = max_pos++;
+}
+
+static int commit_pos_cmp(const void *va, const void *vb)
+{
+ const struct commit *a = *(const struct commit **)va;
+ const struct commit *b = *(const struct commit **)vb;
+ return commit_pos_at(&commit_pos, a) -
+ commit_pos_at(&commit_pos, b);
+}
+
+static int commit_gen_cmp(const void *va, const void *vb)
+{
+ const struct commit *a = *(const struct commit **)va;
+ const struct commit *b = *(const struct commit **)vb;
+
+ /* lower generation commits first */
+ if (a->generation < b->generation)
+ return -1;
+ else if (a->generation > b->generation)
+ return 1;
+
+ /* use date as a heuristic when generations are equal */
+ if (a->date < b->date)
+ return -1;
+ else if (a->date > b->date)
+ return 1;
+ return 0;
+}
+
char *get_commit_graph_filename(struct object_directory *odb)
{
return xstrfmt("%s/info/commit-graph", odb->path);
@@ -274,6 +321,32 @@ struct commit_graph *parse_commit_graph(void *graph_map, int fd,
chunk_repeated = 1;
else
graph->chunk_base_graphs = data + chunk_offset;
+ break;
+
+ case GRAPH_CHUNKID_BLOOMINDEXES:
+ if (graph->chunk_bloom_indexes)
+ chunk_repeated = 1;
+ else
+ graph->chunk_bloom_indexes = data + chunk_offset;
+ break;
+
+ case GRAPH_CHUNKID_BLOOMDATA:
+ if (graph->chunk_bloom_data)
+ chunk_repeated = 1;
+ else {
+ uint32_t hash_version;
+ graph->chunk_bloom_data = data + chunk_offset;
+ hash_version = get_be32(data + chunk_offset);
+
+ if (hash_version != 1)
+ break;
+
+ graph->bloom_filter_settings = xmalloc(sizeof(struct bloom_filter_settings));
+ graph->bloom_filter_settings->hash_version = hash_version;
+ graph->bloom_filter_settings->num_hashes = get_be32(data + chunk_offset + 4);
+ graph->bloom_filter_settings->bits_per_entry = get_be32(data + chunk_offset + 8);
+ }
+ break;
}
if (chunk_repeated) {
@@ -292,6 +365,17 @@ struct commit_graph *parse_commit_graph(void *graph_map, int fd,
last_chunk_offset = chunk_offset;
}
+ /* We need both the bloom chunks to exist together. Else ignore the data */
+ if ((graph->chunk_bloom_indexes && !graph->chunk_bloom_data)
+ || (!graph->chunk_bloom_indexes && graph->chunk_bloom_data)) {
+ graph->chunk_bloom_indexes = NULL;
+ graph->chunk_bloom_data = NULL;
+ graph->bloom_filter_settings = NULL;
+ }
+
+ if (graph->chunk_bloom_indexes && graph->chunk_bloom_data)
+ load_bloom_filters();
+
hashcpy(graph->oid.hash, graph->data + graph->data_len - graph->hash_len);
if (verify_commit_graph_lite(graph)) {
@@ -789,9 +873,12 @@ struct write_commit_graph_context {
report_progress:1,
split:1,
check_oids:1,
- no_input:1;
+ no_input:1,
+ changed_paths:1,
+ order_by_pack:1;
const struct split_commit_graph_opts *split_opts;
+ uint32_t total_bloom_filter_data_size;
};
static void write_graph_chunk_fanout(struct hashfile *f,
@@ -987,6 +1074,59 @@ static void write_graph_chunk_extra_edges(struct hashfile *f,
}
}
+static void write_graph_chunk_bloom_indexes(struct hashfile *f,
+ struct write_commit_graph_context *ctx)
+{
+ struct commit **list = ctx->commits.list;
+ struct commit **last = ctx->commits.list + ctx->commits.nr;
+ uint32_t cur_pos = 0;
+ struct progress *progress = NULL;
+ int i = 0;
+
+ if (ctx->report_progress)
+ progress = start_delayed_progress(
+ _("Writing changed paths Bloom filters index"),
+ ctx->commits.nr);
+
+ while (list < last) {
+ struct bloom_filter *filter = get_bloom_filter(ctx->r, *list, 0);
+ cur_pos += filter->len;
+ display_progress(progress, ++i);
+ hashwrite_be32(f, cur_pos);
+ list++;
+ }
+
+ stop_progress(&progress);
+}
+
+static void write_graph_chunk_bloom_data(struct hashfile *f,
+ struct write_commit_graph_context *ctx,
+ struct bloom_filter_settings *settings)
+{
+ struct commit **list = ctx->commits.list;
+ struct commit **last = ctx->commits.list + ctx->commits.nr;
+ struct progress *progress = NULL;
+ int i = 0;
+
+ if (ctx->report_progress)
+ progress = start_delayed_progress(
+ _("Writing changed paths Bloom filters data"),
+ ctx->commits.nr);
+
+ hashwrite_be32(f, settings->hash_version);
+ hashwrite_be32(f, settings->num_hashes);
+ hashwrite_be32(f, settings->bits_per_entry);
+
+ while (list < last) {
+ struct bloom_filter *filter = get_bloom_filter(ctx->r, *list, 0);
+ display_progress(progress, ++i);
+ hashwrite(f, filter->data, filter->len * sizeof(unsigned char));
+ list++;
+ }
+
+ stop_progress(&progress);
+}
+
static int oid_compare(const void *_a, const void *_b)
{
const struct object_id *a = (const struct object_id *)_a;
@@ -1018,6 +1158,8 @@ static int add_packed_commits(const struct object_id *oid,
oidcpy(&(ctx->oids.list[ctx->oids.nr]), oid);
ctx->oids.nr++;
+ set_commit_pos(ctx->r, oid);
+
return 0;
}
@@ -1134,6 +1276,38 @@ static void compute_generation_numbers(struct write_commit_graph_context *ctx)
stop_progress(&ctx->progress);
}
+static void compute_bloom_filters(struct write_commit_graph_context *ctx)
+{
+ int i;
+ struct progress *progress = NULL;
+ struct commit **sorted_by_pos;
+
+ load_bloom_filters();
+
+ if (ctx->report_progress)
+ progress = start_delayed_progress(
+ _("Computing changed paths Bloom filters"),
+ ctx->commits.nr);
+
+ ALLOC_ARRAY(sorted_by_pos, ctx->commits.nr);
+ COPY_ARRAY(sorted_by_pos, ctx->commits.list, ctx->commits.nr);
+
+ if (ctx->order_by_pack)
+ QSORT(sorted_by_pos, ctx->commits.nr, commit_pos_cmp);
+ else
+ QSORT(sorted_by_pos, ctx->commits.nr, commit_gen_cmp);
+
+ for (i = 0; i < ctx->commits.nr; i++) {
+ struct commit *c = sorted_by_pos[i];
+ struct bloom_filter *filter = get_bloom_filter(ctx->r, c, 1);
+ ctx->total_bloom_filter_data_size += sizeof(unsigned char) * filter->len;
+ display_progress(progress, i + 1);
+ }
+
+ free(sorted_by_pos);
+ stop_progress(&progress);
+}
+
static int add_ref_to_list(const char *refname,
const struct object_id *oid,
int flags, void *cb_data)
@@ -1351,12 +1525,13 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
int fd;
struct hashfile *f;
struct lock_file lk = LOCK_INIT;
- uint32_t chunk_ids[6];
- uint64_t chunk_offsets[6];
+ uint32_t chunk_ids[MAX_NUM_CHUNKS + 1];
+ uint64_t chunk_offsets[MAX_NUM_CHUNKS + 1];
const unsigned hashsz = the_hash_algo->rawsz;
struct strbuf progress_title = STRBUF_INIT;
int num_chunks = 3;
struct object_id file_hash;
+ struct bloom_filter_settings bloom_settings = DEFAULT_BLOOM_FILTER_SETTINGS;
if (ctx->split) {
struct strbuf tmp_file = STRBUF_INIT;
@@ -1401,6 +1576,12 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
chunk_ids[num_chunks] = GRAPH_CHUNKID_EXTRAEDGES;
num_chunks++;
}
+ if (ctx->changed_paths) {
+ chunk_ids[num_chunks] = GRAPH_CHUNKID_BLOOMINDEXES;
+ num_chunks++;
+ chunk_ids[num_chunks] = GRAPH_CHUNKID_BLOOMDATA;
+ num_chunks++;
+ }
if (ctx->num_commit_graphs_after > 1) {
chunk_ids[num_chunks] = GRAPH_CHUNKID_BASE;
num_chunks++;
@@ -1419,6 +1600,15 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
4 * ctx->num_extra_edges;
num_chunks++;
}
+ if (ctx->changed_paths) {
+ chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] +
+ sizeof(uint32_t) * ctx->commits.nr;
+ num_chunks++;
+
+ chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] +
+ sizeof(uint32_t) * 3 + ctx->total_bloom_filter_data_size;
+ num_chunks++;
+ }
if (ctx->num_commit_graphs_after > 1) {
chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] +
hashsz * (ctx->num_commit_graphs_after - 1);
@@ -1456,6 +1646,10 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
write_graph_chunk_data(f, hashsz, ctx);
if (ctx->num_extra_edges)
write_graph_chunk_extra_edges(f, ctx);
+ if (ctx->changed_paths) {
+ write_graph_chunk_bloom_indexes(f, ctx);
+ write_graph_chunk_bloom_data(f, ctx, &bloom_settings);
+ }
if (ctx->num_commit_graphs_after > 1 &&
write_graph_chunk_base(f, ctx)) {
return -1;
@@ -1787,6 +1981,8 @@ int write_commit_graph(struct object_directory *odb,
ctx->check_oids = flags & COMMIT_GRAPH_WRITE_CHECK_OIDS ? 1 : 0;
ctx->split_opts = split_opts;
ctx->no_input = flags & COMMIT_GRAPH_WRITE_NO_INPUT ? 1 : 0;
+ ctx->changed_paths = flags & COMMIT_GRAPH_WRITE_BLOOM_FILTERS ? 1 : 0;
+ ctx->total_bloom_filter_data_size = 0;
if (ctx->split) {
struct commit_graph *g;
@@ -1836,6 +2032,7 @@ int write_commit_graph(struct object_directory *odb,
}
if (pack_indexes) {
+ ctx->order_by_pack = 1;
if ((res = fill_oids_from_packs(ctx, pack_indexes)))
goto cleanup;
}
@@ -1845,8 +2042,10 @@ int write_commit_graph(struct object_directory *odb,
goto cleanup;
}
- if (!ctx->no_input && !pack_indexes && !commit_hex)
+ if (!ctx->no_input && !pack_indexes && !commit_hex) {
+ ctx->order_by_pack = 1;
fill_oids_from_all_packs(ctx);
+ }
close_reachable(ctx);
@@ -1881,6 +2080,9 @@ int write_commit_graph(struct object_directory *odb,
compute_generation_numbers(ctx);
+ if (ctx->changed_paths)
+ compute_bloom_filters(ctx);
+
res = write_commit_graph_file(ctx);
if (ctx->split)
@@ -2105,6 +2307,7 @@ void free_commit_graph(struct commit_graph *g)
g->data = NULL;
close(g->graph_fd);
}
+ free(g->bloom_filter_settings);
free(g->filename);
free(g);
}
diff --git a/commit-graph.h b/commit-graph.h
index df7f3f5..8f5f62d 100644
--- a/commit-graph.h
+++ b/commit-graph.h
@@ -9,8 +9,10 @@
#define GIT_TEST_COMMIT_GRAPH "GIT_TEST_COMMIT_GRAPH"
#define GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD "GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD"
+#define GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS "GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS"
struct commit;
+struct bloom_filter_settings;
char *get_commit_graph_filename(struct object_directory *odb);
int open_commit_graph(const char *graph_file, int *fd, struct stat *st);
@@ -59,6 +61,10 @@ struct commit_graph {
const unsigned char *chunk_commit_data;
const unsigned char *chunk_extra_edges;
const unsigned char *chunk_base_graphs;
+ const unsigned char *chunk_bloom_indexes;
+ const unsigned char *chunk_bloom_data;
+
+ struct bloom_filter_settings *bloom_filter_settings;
};
struct commit_graph *load_commit_graph_one_fd_st(int fd, struct stat *st,
@@ -80,7 +86,8 @@ enum commit_graph_write_flags {
COMMIT_GRAPH_WRITE_SPLIT = (1 << 2),
/* Make sure that each OID in the input is a valid commit OID. */
COMMIT_GRAPH_WRITE_CHECK_OIDS = (1 << 3),
- COMMIT_GRAPH_WRITE_NO_INPUT = (1 << 4)
+ COMMIT_GRAPH_WRITE_NO_INPUT = (1 << 4),
+ COMMIT_GRAPH_WRITE_BLOOM_FILTERS = (1 << 5),
};
enum commit_graph_split_flags {
diff --git a/diff.h b/diff.h
index 6febe7e..9443dc1 100644
--- a/diff.h
+++ b/diff.h
@@ -285,6 +285,11 @@ struct diff_options {
/* Number of hexdigits to abbreviate raw format output to. */
int abbrev;
+ /* If non-zero, then stop computing after this many changes. */
+ int max_changes;
+ /* For internal use only. */
+ int num_changes;
+
int ita_invisible_in_index;
/* white-space error highlighting */
#define WSEH_NEW (1<<12)
diff --git a/revision.c b/revision.c
index 8136929..d1622af 100644
--- a/revision.c
+++ b/revision.c
@@ -29,6 +29,8 @@
#include "prio-queue.h"
#include "hashmap.h"
#include "utf8.h"
+#include "bloom.h"
+#include "json-writer.h"
volatile show_early_output_fn_t show_early_output;
@@ -624,11 +626,114 @@ static void file_change(struct diff_options *options,
options->flags.has_changes = 1;
}
+static int bloom_filter_atexit_registered;
+static unsigned int count_bloom_filter_maybe;
+static unsigned int count_bloom_filter_definitely_not;
+static unsigned int count_bloom_filter_false_positive;
+static unsigned int count_bloom_filter_not_present;
+static unsigned int count_bloom_filter_length_zero;
+
+static void trace2_bloom_filter_statistics_atexit(void)
+{
+ struct json_writer jw = JSON_WRITER_INIT;
+
+ jw_object_begin(&jw, 0);
+ jw_object_intmax(&jw, "filter_not_present", count_bloom_filter_not_present);
+ jw_object_intmax(&jw, "zero_length_filter", count_bloom_filter_length_zero);
+ jw_object_intmax(&jw, "maybe", count_bloom_filter_maybe);
+ jw_object_intmax(&jw, "definitely_not", count_bloom_filter_definitely_not);
+ jw_end(&jw);
+
+ trace2_data_json("bloom", the_repository, "statistics", &jw);
+
+ jw_release(&jw);
+}
+
+static void prepare_to_use_bloom_filter(struct rev_info *revs)
+{
+ struct pathspec_item *pi;
+ char *path_alloc = NULL;
+ const char *path;
+ int last_index;
+ int len;
+
+ if (!revs->commits)
+ return;
+
+ repo_parse_commit(revs->repo, revs->commits->item);
+
+ if (!revs->repo->objects->commit_graph)
+ return;
+
+ revs->bloom_filter_settings = revs->repo->objects->commit_graph->bloom_filter_settings;
+ if (!revs->bloom_filter_settings)
+ return;
+
+ pi = &revs->pruning.pathspec.items[0];
+ last_index = pi->len - 1;
+
+ if (pi->match[last_index] == '/') {
+ path_alloc = xstrdup(pi->match);
+ path_alloc[last_index] = '\0';
+ path = path_alloc;
+ } else
+ path = pi->match;
+
+ len = strlen(path);
+
+ revs->bloom_key = xmalloc(sizeof(struct bloom_key));
+ fill_bloom_key(path, len, revs->bloom_key, revs->bloom_filter_settings);
+
+ if (trace2_is_enabled() && !bloom_filter_atexit_registered) {
+ atexit(trace2_bloom_filter_statistics_atexit);
+ bloom_filter_atexit_registered = 1;
+ }
+
+ free(path_alloc);
+}
+
+static int check_maybe_different_in_bloom_filter(struct rev_info *revs,
+ struct commit *commit)
+{
+ struct bloom_filter *filter;
+ int result;
+
+ if (!revs->repo->objects->commit_graph)
+ return -1;
+
+ if (commit->generation == GENERATION_NUMBER_INFINITY)
+ return -1;
+
+ filter = get_bloom_filter(revs->repo, commit, 0);
+
+ if (!filter) {
+ count_bloom_filter_not_present++;
+ return -1;
+ }
+
+ if (!filter->len) {
+ count_bloom_filter_length_zero++;
+ return -1;
+ }
+
+ result = bloom_filter_contains(filter,
+ revs->bloom_key,
+ revs->bloom_filter_settings);
+
+ if (result)
+ count_bloom_filter_maybe++;
+ else
+ count_bloom_filter_definitely_not++;
+
+ return result;
+}
+
static int rev_compare_tree(struct rev_info *revs,
- struct commit *parent, struct commit *commit)
+ struct commit *parent, struct commit *commit, int nth_parent)
{
struct tree *t1 = get_commit_tree(parent);
struct tree *t2 = get_commit_tree(commit);
+ int bloom_ret = 1;
if (!t1)
return REV_TREE_NEW;
@@ -653,11 +758,23 @@ static int rev_compare_tree(struct rev_info *revs,
return REV_TREE_SAME;
}
+ if (revs->pruning.pathspec.nr == 1 && !revs->reflog_info && !nth_parent) {
+ bloom_ret = check_maybe_different_in_bloom_filter(revs, commit);
+
+ if (bloom_ret == 0)
+ return REV_TREE_SAME;
+ }
+
tree_difference = REV_TREE_SAME;
revs->pruning.flags.has_changes = 0;
if (diff_tree_oid(&t1->object.oid, &t2->object.oid, "",
&revs->pruning) < 0)
return REV_TREE_DIFFERENT;
+
+ if (!nth_parent)
+ if (bloom_ret == 1 && tree_difference == REV_TREE_SAME)
+ count_bloom_filter_false_positive++;
+
return tree_difference;
}
@@ -855,7 +972,7 @@ static void try_to_simplify_commit(struct rev_info *revs, struct commit *commit)
die("cannot simplify commit %s (because of %s)",
oid_to_hex(&commit->object.oid),
oid_to_hex(&p->object.oid));
- switch (rev_compare_tree(revs, p, commit)) {
+ switch (rev_compare_tree(revs, p, commit, nth_parent)) {
case REV_TREE_SAME:
if (!revs->simplify_history || !relevant_commit(p)) {
/* Even if a merge with an uninteresting
@@ -3362,6 +3479,8 @@ int prepare_revision_walk(struct rev_info *revs)
FOR_EACH_OBJECT_PROMISOR_ONLY);
}
+ if (revs->pruning.pathspec.nr == 1 && !revs->reflog_info)
+ prepare_to_use_bloom_filter(revs);
if (revs->no_walk != REVISION_WALK_NO_WALK_UNSORTED)
commit_list_sort_by_date(&revs->commits);
if (revs->no_walk)
@@ -3379,6 +3498,7 @@ int prepare_revision_walk(struct rev_info *revs)
simplify_merges(revs);
if (revs->children.name)
set_children(revs);
+
return 0;
}
diff --git a/revision.h b/revision.h
index 475f048..7c026fe 100644
--- a/revision.h
+++ b/revision.h
@@ -56,6 +56,8 @@ struct repository;
struct rev_info;
struct string_list;
struct saved_parents;
+struct bloom_key;
+struct bloom_filter_settings;
define_shared_commit_slab(revision_sources, char *);
struct rev_cmdline_info {
@@ -291,6 +293,15 @@ struct rev_info {
struct revision_sources *sources;
struct topo_walk_info *topo_walk_info;
+
+ /* Commit graph bloom filter fields */
+ /* The bloom filter key for the pathspec */
+ struct bloom_key *bloom_key;
+ /*
+ * The bloom filter settings used to generate the key.
+ * This is loaded from the commit-graph being used.
+ */
+ struct bloom_filter_settings *bloom_filter_settings;
};
int ref_excluded(struct string_list *, const char *path);
diff --git a/t/README b/t/README
index 9afd61e..8c577dc 100644
--- a/t/README
+++ b/t/README
@@ -378,6 +378,11 @@ GIT_TEST_COMMIT_GRAPH=<boolean>, when true, forces the commit-graph to
be written after every 'git commit' command, and overrides the
'core.commitGraph' setting to true.
+GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=<boolean>, when true, forces
+commit-graph write to compute and write changed path Bloom filters for
+every 'git commit-graph write', as if the `--changed-paths` option was
+passed in.
+
GIT_TEST_FSMONITOR=$PWD/t7519/fsmonitor-all exercises the fsmonitor
code path for utilizing a file system monitor to speed up detecting
new or changed files.
diff --git a/t/helper/test-bloom.c b/t/helper/test-bloom.c
new file mode 100644
index 0000000..8fa2d8f
--- /dev/null
+++ b/t/helper/test-bloom.c
@@ -0,0 +1,84 @@
+#include "test-tool.h"
+#include "git-compat-util.h"
+#include "bloom.h"
+#include "test-tool.h"
+#include "cache.h"
+#include "commit-graph.h"
+#include "commit.h"
+#include "config.h"
+#include "object-store.h"
+#include "object.h"
+#include "repository.h"
+#include "tree.h"
+
+struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS;
+
+static void print_bloom_filter(struct bloom_filter *filter) {
+ int i;
+
+ if (!filter) {
+ printf("No filter.\n");
+ return;
+ }
+ printf("Filter_Length:%d\n", filter->len);
+ printf("Filter_Data:");
+ for (i = 0; i < filter->len; i++){
+ printf("%02x|", filter->data[i]);
+ }
+ printf("\n");
+}
+
+static void add_string_to_filter(const char *data, struct bloom_filter *filter) {
+ struct bloom_key key;
+ int i;
+
+ fill_bloom_key(data, strlen(data), &key, &settings);
+ printf("Hashes:");
+ for (i = 0; i < settings.num_hashes; i++){
+ printf("%08x|", key.hashes[i]);
+ }
+ printf("\n");
+ add_key_to_filter(&key, filter, &settings);
+}
+
+static void get_bloom_filter_for_commit(const struct object_id *commit_oid)
+{
+ struct commit *c;
+ struct bloom_filter *filter;
+ setup_git_directory();
+ c = lookup_commit(the_repository, commit_oid);
+ filter = get_bloom_filter(the_repository, c, 1);
+ print_bloom_filter(filter);
+}
+
+int cmd__bloom(int argc, const char **argv)
+{
+ if (!strcmp(argv[1], "generate_filter")) {
+ struct bloom_filter filter;
+ int i = 2;
+ filter.len = (settings.bits_per_entry + BITS_PER_WORD - 1) / BITS_PER_WORD;
+ filter.data = xcalloc(filter.len, sizeof(unsigned char));
+
+ if (!argv[2]){
+ die("at least one input string expected");
+ }
+
+ while (argv[i]) {
+ add_string_to_filter(argv[i], &filter);
+ i++;
+ }
+
+ print_bloom_filter(&filter);
+ }
+
+ if (!strcmp(argv[1], "get_filter_for_commit")) {
+ struct object_id oid;
+ const char *end;
+ if (parse_oid_hex(argv[2], &oid, &end))
+ die("cannot parse oid '%s'", argv[2]);
+ load_bloom_filters();
+ get_bloom_filter_for_commit(&oid);
+ }
+
+ return 0;
+}
diff --git a/t/helper/test-read-graph.c b/t/helper/test-read-graph.c
index f8a4617..4223ff3 100644
--- a/t/helper/test-read-graph.c
+++ b/t/helper/test-read-graph.c
@@ -45,6 +45,10 @@ int cmd__read_graph(int argc, const char **argv)
printf(" commit_metadata");
if (graph->chunk_extra_edges)
printf(" extra_edges");
+ if (graph->chunk_bloom_indexes)
+ printf(" bloom_indexes");
+ if (graph->chunk_bloom_data)
+ printf(" bloom_data");
printf("\n");
UNLEAK(graph);
diff --git a/t/helper/test-tool.c b/t/helper/test-tool.c
index 31eedcd..6e26bd6 100644
--- a/t/helper/test-tool.c
+++ b/t/helper/test-tool.c
@@ -15,6 +15,7 @@ struct test_cmd {
static struct test_cmd cmds[] = {
{ "advise", cmd__advise_if_enabled },
+ { "bloom", cmd__bloom },
{ "chmtime", cmd__chmtime },
{ "config", cmd__config },
{ "ctype", cmd__ctype },
diff --git a/t/helper/test-tool.h b/t/helper/test-tool.h
index 4eb5e66..dceeef1 100644
--- a/t/helper/test-tool.h
+++ b/t/helper/test-tool.h
@@ -5,6 +5,7 @@
#include "git-compat-util.h"
int cmd__advise_if_enabled(int argc, const char **argv);
+int cmd__bloom(int argc, const char **argv);
int cmd__chmtime(int argc, const char **argv);
int cmd__config(int argc, const char **argv);
int cmd__ctype(int argc, const char **argv);
diff --git a/t/t0095-bloom.sh b/t/t0095-bloom.sh
new file mode 100755
index 0000000..5827321
--- /dev/null
+++ b/t/t0095-bloom.sh
@@ -0,0 +1,113 @@
+#!/bin/sh
+
+test_description='test bloom.c'
+. ./test-lib.sh
+
+test_expect_success 'compute bloom key for empty string' '
+ cat >expect <<-\EOF &&
+ Hashes:5615800c|5b966560|61174ab4|66983008|6c19155c|7199fab0|771ae004|
+ Filter_Length:2
+ Filter_Data:11|11|
+ EOF
+ test-tool bloom generate_filter "" >actual &&
+ test_cmp expect actual
+'
+
+test_expect_success 'compute bloom key for whitespace' '
+ cat >expect <<-\EOF &&
+ Hashes:1bf014e6|8a91b50b|f9335530|67d4f555|d676957a|4518359f|b3b9d5c4|
+ Filter_Length:2
+ Filter_Data:71|8c|
+ EOF
+ test-tool bloom generate_filter " " >actual &&
+ test_cmp expect actual
+'
+
+test_expect_success 'compute bloom key for a root level folder' '
+ cat >expect <<-\EOF &&
+ Hashes:1a21016f|fff1c06d|e5c27f6b|cb933e69|b163fd67|9734bc65|7d057b63|
+ Filter_Length:2
+ Filter_Data:a8|aa|
+ EOF
+ test-tool bloom generate_filter "A" >actual &&
+ test_cmp expect actual
+'
+
+test_expect_success 'compute bloom key for a root level file' '
+ cat >expect <<-\EOF &&
+ Hashes:e2d51107|30970605|7e58fb03|cc1af001|19dce4ff|679ed9fd|b560cefb|
+ Filter_Length:2
+ Filter_Data:aa|a8|
+ EOF
+ test-tool bloom generate_filter "file.txt" >actual &&
+ test_cmp expect actual
+'
+
+test_expect_success 'compute bloom key for a deep folder' '
+ cat >expect <<-\EOF &&
+ Hashes:864cf838|27f055cd|c993b362|6b3710f7|0cda6e8c|ae7dcc21|502129b6|
+ Filter_Length:2
+ Filter_Data:c6|31|
+ EOF
+ test-tool bloom generate_filter "A/B/C/D/E" >actual &&
+ test_cmp expect actual
+'
+
+test_expect_success 'compute bloom key for a deep file' '
+ cat >expect <<-\EOF &&
+ Hashes:07cdf850|4af629c7|8e1e5b3e|d1468cb5|146ebe2c|5796efa3|9abf211a|
+ Filter_Length:2
+ Filter_Data:a9|54|
+ EOF
+ test-tool bloom generate_filter "A/B/C/D/E/file.txt" >actual &&
+ test_cmp expect actual
+'
+
+test_expect_success 'get bloom filters for commit with no changes' '
+ git init &&
+ git commit --allow-empty -m "c0" &&
+ cat >expect <<-\EOF &&
+ Filter_Length:0
+ Filter_Data:
+ EOF
+ test-tool bloom get_filter_for_commit "$(git rev-parse HEAD)" >actual &&
+ test_cmp expect actual
+'
+
+test_expect_success 'get bloom filter for commit with 10 changes' '
+ rm actual &&
+ rm expect &&
+ mkdir smallDir &&
+ for i in $(test_seq 0 9)
+ do
+ echo $i >smallDir/$i
+ done &&
+ git add smallDir &&
+ git commit -m "commit with 10 changes" &&
+ cat >expect <<-\EOF &&
+ Filter_Length:25
+ Filter_Data:c2|0b|b8|c0|10|88|f0|1d|c1|0c|01|a4|01|28|81|80|01|30|10|d0|92|be|88|10|8a|
+ EOF
+ test-tool bloom get_filter_for_commit "$(git rev-parse HEAD)" >actual &&
+ test_cmp expect actual
+'
+
+test_expect_success EXPENSIVE 'get bloom filter for commit with 513 changes' '
+ rm actual &&
+ rm expect &&
+ mkdir bigDir &&
+ for i in $(test_seq 0 512)
+ do
+ echo $i >bigDir/$i
+ done &&
+ git add bigDir &&
+ git commit -m "commit with 513 changes" &&
+ cat >expect <<-\EOF &&
+ Filter_Length:0
+ Filter_Data:
+ EOF
+ test-tool bloom get_filter_for_commit "$(git rev-parse HEAD)" >actual &&
+ test_cmp expect actual
+'
+
+test_done
diff --git a/t/t4216-log-bloom.sh b/t/t4216-log-bloom.sh
new file mode 100755
index 0000000..7acebb3
--- /dev/null
+++ b/t/t4216-log-bloom.sh
@@ -0,0 +1,143 @@
+#!/bin/sh
+
+test_description='git log for a path with bloom filters'
+. ./test-lib.sh
+
+GIT_TEST_COMMIT_GRAPH=0
+GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=0
+
+test_expect_success 'setup test - repo, commits, commit graph, log outputs' '
+ git init &&
+ mkdir A A/B A/B/C &&
+ test_commit c1 A/file1 &&
+ test_commit c2 A/B/file2 &&
+ test_commit c3 A/B/C/file3 &&
+ test_commit c4 A/file1 &&
+ test_commit c5 A/B/file2 &&
+ test_commit c6 A/B/C/file3 &&
+ test_commit c7 A/file1 &&
+ test_commit c8 A/B/file2 &&
+ test_commit c9 A/B/C/file3 &&
+ git checkout -b side HEAD~4 &&
+ test_commit side-1 file4 &&
+ git checkout master &&
+ git merge side &&
+ test_commit c10 file5 &&
+ mv file5 file5_renamed &&
+ git add file5_renamed &&
+ git commit -m "rename" &&
+ git commit-graph write --reachable --changed-paths
+'
+graph_read_expect() {
+ OPTIONAL=""
+ NUM_CHUNKS=5
+ cat >expect <<- EOF
+ header: 43475048 1 1 $NUM_CHUNKS 0
+ num_commits: $1
+ chunks: oid_fanout oid_lookup commit_metadata bloom_indexes bloom_data
+ EOF
+ test-tool read-graph >output &&
+ test_cmp expect output
+}
+
+test_expect_success 'commit-graph write wrote out the bloom chunks' '
+ graph_read_expect 13
+'
+
+setup() {
+ rm output
+ rm "$TRASH_DIRECTORY/trace.perf"
+ git -c core.commitGraph=false log --pretty="format:%s" $1 >log_wo_bloom
+ GIT_TRACE2_PERF="$TRASH_DIRECTORY/trace.perf" git -c core.commitGraph=true log --pretty="format:%s" $1 >log_w_bloom
+}
+
+test_bloom_filters_used() {
+ log_args=$1
+ bloom_trace_prefix="statistics:{\"filter_not_present\":0,\"zero_length_filter\":0,\"maybe\""
+ setup "$log_args"
+ grep -q "$bloom_trace_prefix" "$TRASH_DIRECTORY/trace.perf" && test_cmp log_wo_bloom log_w_bloom
+}
+
+test_bloom_filters_not_used() {
+ log_args=$1
+ setup "$log_args"
+ !(grep -q "statistics:{\"filter_not_present\":" "$TRASH_DIRECTORY/trace.perf") && test_cmp log_wo_bloom log_w_bloom
+}
+
+for path in A A/B A/B/C A/file1 A/B/file2 A/B/C/file3 file4 file5_renamed
+do
+ for option in "" \
+ "--full-history" \
+ "--full-history --simplify-merges" \
+ "--simplify-merges" \
+ "--simplify-by-decoration" \
+ "--follow" \
+ "--first-parent" \
+ "--topo-order" \
+ "--date-order" \
+ "--author-date-order" \
+ "--ancestry-path side..master"
+ do
+ test_expect_success "git log option: $option for path: $path" '
+ test_bloom_filters_used "$option -- $path"
+ '
+ done
+done
+
+test_expect_success 'git log -- folder works with and without the trailing slash' '
+ test_bloom_filters_used "-- A" &&
+ test_bloom_filters_used "-- A/"
+'
+
+test_expect_success 'git log for path that does not exist. ' '
+ test_bloom_filters_used "-- path_does_not_exist"
+'
+
+test_expect_success 'git log with --walk-reflogs does not use bloom filters' '
+ test_bloom_filters_not_used "--walk-reflogs -- A"
+'
+
+test_expect_success 'git log -- multiple path specs does not use bloom filters' '
+ test_bloom_filters_not_used "-- file4 A/file1"
+'
+
+test_expect_success 'git log with wildcard that resolves to a single path uses bloom filters' '
+ test_bloom_filters_used "-- *4" &&
+ test_bloom_filters_used "-- *renamed"
+'
+
+test_expect_success 'git log with wildcard that resolves to a multiple paths does not uses bloom filters' '
+ test_bloom_filters_not_used "-- *" &&
+ test_bloom_filters_not_used "-- file*"
+'
+
+test_expect_success 'setup - add commit-graph to the chain without bloom filters' '
+ test_commit c14 A/anotherFile2 &&
+ test_commit c15 A/B/anotherFile2 &&
+ test_commit c16 A/B/C/anotherFile2 &&
+ GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=0 git commit-graph write --reachable --split &&
+ test_line_count = 2 .git/objects/info/commit-graphs/commit-graph-chain
+'
+
+test_expect_success 'git log does not use bloom filters if the latest graph does not have bloom filters.' '
+ test_bloom_filters_not_used "-- A/B"
+'
+
+test_expect_success 'setup - add commit-graph to the chain with bloom filters' '
+ test_commit c17 A/anotherFile3 &&
+ git commit-graph write --reachable --changed-paths --split &&
+ test_line_count = 3 .git/objects/info/commit-graphs/commit-graph-chain
+'
+
+test_bloom_filters_used_when_some_filters_are_missing() {
+ log_args=$1
+ bloom_trace_prefix="statistics:{\"filter_not_present\":3,\"zero_length_filter\":0,\"maybe\":6,\"definitely_not\":6"
+ setup "$log_args"
+ grep -q "$bloom_trace_prefix" "$TRASH_DIRECTORY/trace.perf" && test_cmp log_wo_bloom log_w_bloom
+}
+
+test_expect_success 'git log uses bloom filters if they exist in the latest but not all commit graphs in the chain.' '
+ test_bloom_filters_used_when_some_filters_are_missing "-- A/B"
+'
+
+test_done
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 117dca3..b7e11e3 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -3,6 +3,8 @@
test_description='commit graph'
. ./test-lib.sh
+GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=0
+
test_expect_success 'setup full repo' '
mkdir full &&
cd "$TRASH_DIRECTORY/full" &&
diff --git a/t/t5324-split-commit-graph.sh b/t/t5324-split-commit-graph.sh
index 7614f39..c648018 100755
--- a/t/t5324-split-commit-graph.sh
+++ b/t/t5324-split-commit-graph.sh
@@ -4,6 +4,7 @@ test_description='split commit graph'
. ./test-lib.sh
GIT_TEST_COMMIT_GRAPH=0
+GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=0
test_expect_success 'setup repo' '
git init &&
diff --git a/tree-diff.c b/tree-diff.c
index 33ded7f..f3d303c 100644
--- a/tree-diff.c
+++ b/tree-diff.c
@@ -434,6 +434,9 @@ static struct combine_diff_path *ll_diff_tree_paths(
if (diff_can_quit_early(opt))
break;
+ if (opt->max_changes && opt->num_changes > opt->max_changes)
+ break;
+
if (opt->pathspec.nr) {
skip_uninteresting(&t, base, opt);
for (i = 0; i < nparent; i++)
@@ -518,6 +521,7 @@ static struct combine_diff_path *ll_diff_tree_paths(
/* t↓ */
update_tree_entry(&t);
+ opt->num_changes++;
}
/* t > p[imin] */
@@ -535,6 +539,7 @@ static struct combine_diff_path *ll_diff_tree_paths(
skip_emit_tp:
/* ∀ pi=p[imin] pi↓ */
update_tp_entries(tp, nparent);
+ opt->num_changes++;
}
}
@@ -552,6 +557,7 @@ struct combine_diff_path *diff_tree_paths(
const struct object_id **parents_oid, int nparent,
struct strbuf *base, struct diff_options *opt)
{
+ opt->num_changes = 0;
p = ll_diff_tree_paths(p, oid, parents_oid, nparent, base, opt);
/*