From 3be7efcafceeae3400cd830be89c9601b43f3716 Mon Sep 17 00:00:00 2001 From: Garima Singh Date: Mon, 30 Mar 2020 00:31:23 +0000 Subject: commit-graph: define and use MAX_NUM_CHUNKS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a minor cleanup to make it easier to change the number of chunks being written to the commit graph. Reviewed-by: Jakub Narębski Signed-off-by: Garima Singh Signed-off-by: Junio C Hamano diff --git a/commit-graph.c b/commit-graph.c index f013a84..e4f1a5b 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -23,6 +23,7 @@ #define GRAPH_CHUNKID_DATA 0x43444154 /* "CDAT" */ #define GRAPH_CHUNKID_EXTRAEDGES 0x45444745 /* "EDGE" */ #define GRAPH_CHUNKID_BASE 0x42415345 /* "BASE" */ +#define MAX_NUM_CHUNKS 5 #define GRAPH_DATA_WIDTH (the_hash_algo->rawsz + 16) @@ -1350,8 +1351,8 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx) int fd; struct hashfile *f; struct lock_file lk = LOCK_INIT; - uint32_t chunk_ids[6]; - uint64_t chunk_offsets[6]; + uint32_t chunk_ids[MAX_NUM_CHUNKS + 1]; + uint64_t chunk_offsets[MAX_NUM_CHUNKS + 1]; const unsigned hashsz = the_hash_algo->rawsz; struct strbuf progress_title = STRBUF_INIT; int num_chunks = 3; -- cgit v0.10.2-6-g49f6 From f52207a45ca9e7cfbe431f4ffff79b3fdbcf3a37 Mon Sep 17 00:00:00 2001 From: Garima Singh Date: Mon, 30 Mar 2020 00:31:24 +0000 Subject: bloom.c: add the murmur3 hash implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation for computing changed paths Bloom filters, implement the Murmur3 hash algorithm as described in [1]. It hashes the given data using the given seed and produces a uniformly distributed hash value. 
[1] https://en.wikipedia.org/wiki/MurmurHash#Algorithm Helped-by: Derrick Stolee Helped-by: Szeder Gábor Reviewed-by: Jakub Narębski Signed-off-by: Garima Singh Signed-off-by: Junio C Hamano diff --git a/Makefile b/Makefile index ef1ff22..491f75e 100644 --- a/Makefile +++ b/Makefile @@ -695,6 +695,7 @@ X = PROGRAMS += $(patsubst %.o,git-%$X,$(PROGRAM_OBJS)) TEST_BUILTINS_OBJS += test-advise.o +TEST_BUILTINS_OBJS += test-bloom.o TEST_BUILTINS_OBJS += test-chmtime.o TEST_BUILTINS_OBJS += test-config.o TEST_BUILTINS_OBJS += test-ctype.o @@ -840,6 +841,7 @@ LIB_OBJS += base85.o LIB_OBJS += bisect.o LIB_OBJS += blame.o LIB_OBJS += blob.o +LIB_OBJS += bloom.o LIB_OBJS += branch.o LIB_OBJS += bulk-checkin.o LIB_OBJS += bundle.o diff --git a/bloom.c b/bloom.c new file mode 100644 index 0000000..40e8763 --- /dev/null +++ b/bloom.c @@ -0,0 +1,73 @@ +#include "git-compat-util.h" +#include "bloom.h" + +static uint32_t rotate_left(uint32_t value, int32_t count) +{ + uint32_t mask = 8 * sizeof(uint32_t) - 1; + count &= mask; + return ((value << count) | (value >> ((-count) & mask))); +} + +/* + * Calculate the murmur3 32-bit hash value for the given data + * using the given seed. + * Produces a uniformly distributed hash value. + * Not considered to be cryptographically secure. 
+ * Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm + */ +uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len) +{ + const uint32_t c1 = 0xcc9e2d51; + const uint32_t c2 = 0x1b873593; + const uint32_t r1 = 15; + const uint32_t r2 = 13; + const uint32_t m = 5; + const uint32_t n = 0xe6546b64; + int i; + uint32_t k1 = 0; + const char *tail; + + int len4 = len / sizeof(uint32_t); + + uint32_t k; + for (i = 0; i < len4; i++) { + uint32_t byte1 = (uint32_t)data[4*i]; + uint32_t byte2 = ((uint32_t)data[4*i + 1]) << 8; + uint32_t byte3 = ((uint32_t)data[4*i + 2]) << 16; + uint32_t byte4 = ((uint32_t)data[4*i + 3]) << 24; + k = byte1 | byte2 | byte3 | byte4; + k *= c1; + k = rotate_left(k, r1); + k *= c2; + + seed ^= k; + seed = rotate_left(seed, r2) * m + n; + } + + tail = (data + len4 * sizeof(uint32_t)); + + switch (len & (sizeof(uint32_t) - 1)) { + case 3: + k1 ^= ((uint32_t)tail[2]) << 16; + /*-fallthrough*/ + case 2: + k1 ^= ((uint32_t)tail[1]) << 8; + /*-fallthrough*/ + case 1: + k1 ^= ((uint32_t)tail[0]) << 0; + k1 *= c1; + k1 = rotate_left(k1, r1); + k1 *= c2; + seed ^= k1; + break; + } + + seed ^= (uint32_t)len; + seed ^= (seed >> 16); + seed *= 0x85ebca6b; + seed ^= (seed >> 13); + seed *= 0xc2b2ae35; + seed ^= (seed >> 16); + + return seed; +} \ No newline at end of file diff --git a/bloom.h b/bloom.h new file mode 100644 index 0000000..d0fcc5f --- /dev/null +++ b/bloom.h @@ -0,0 +1,13 @@ +#ifndef BLOOM_H +#define BLOOM_H + +/* + * Calculate the murmur3 32-bit hash value for the given data + * using the given seed. + * Produces a uniformly distributed hash value. + * Not considered to be cryptographically secure. 
+ * Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm + */ +uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len); + +#endif \ No newline at end of file diff --git a/t/helper/test-bloom.c b/t/helper/test-bloom.c new file mode 100644 index 0000000..60ee204 --- /dev/null +++ b/t/helper/test-bloom.c @@ -0,0 +1,13 @@ +#include "git-compat-util.h" +#include "bloom.h" +#include "test-tool.h" + +int cmd__bloom(int argc, const char **argv) +{ + if (!strcmp(argv[1], "get_murmur3")) { + uint32_t hashed = murmur3_seeded(0, argv[2], strlen(argv[2])); + printf("Murmur3 Hash with seed=0:0x%08x\n", hashed); + } + + return 0; +} \ No newline at end of file diff --git a/t/helper/test-tool.c b/t/helper/test-tool.c index 31eedcd..6e26bd6 100644 --- a/t/helper/test-tool.c +++ b/t/helper/test-tool.c @@ -15,6 +15,7 @@ struct test_cmd { static struct test_cmd cmds[] = { { "advise", cmd__advise_if_enabled }, + { "bloom", cmd__bloom }, { "chmtime", cmd__chmtime }, { "config", cmd__config }, { "ctype", cmd__ctype }, diff --git a/t/helper/test-tool.h b/t/helper/test-tool.h index 4eb5e66..dceeef1 100644 --- a/t/helper/test-tool.h +++ b/t/helper/test-tool.h @@ -5,6 +5,7 @@ #include "git-compat-util.h" int cmd__advise_if_enabled(int argc, const char **argv); +int cmd__bloom(int argc, const char **argv); int cmd__chmtime(int argc, const char **argv); int cmd__config(int argc, const char **argv); int cmd__ctype(int argc, const char **argv); diff --git a/t/t0095-bloom.sh b/t/t0095-bloom.sh new file mode 100755 index 0000000..2dad8c4 --- /dev/null +++ b/t/t0095-bloom.sh @@ -0,0 +1,30 @@ +#!/bin/sh + +test_description='Testing the various Bloom filter computations in bloom.c' +. 
./test-lib.sh + +test_expect_success 'compute unseeded murmur3 hash for empty string' ' + cat >expect <<-\EOF && + Murmur3 Hash with seed=0:0x00000000 + EOF + test-tool bloom get_murmur3 "" >actual && + test_cmp expect actual +' + +test_expect_success 'compute unseeded murmur3 hash for test string 1' ' + cat >expect <<-\EOF && + Murmur3 Hash with seed=0:0x627b0c2c + EOF + test-tool bloom get_murmur3 "Hello world!" >actual && + test_cmp expect actual +' + +test_expect_success 'compute unseeded murmur3 hash for test string 2' ' + cat >expect <<-\EOF && + Murmur3 Hash with seed=0:0x2e4ff723 + EOF + test-tool bloom get_murmur3 "The quick brown fox jumps over the lazy dog" >actual && + test_cmp expect actual +' + +test_done \ No newline at end of file -- cgit v0.10.2-6-g49f6 From f1294eaf7fbf7673567b698b11e062566b9f1035 Mon Sep 17 00:00:00 2001 From: Garima Singh Date: Mon, 30 Mar 2020 00:31:25 +0000 Subject: bloom.c: introduce core Bloom filter constructs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce the constructs for Bloom filters, Bloom filter keys and Bloom filter settings. For details on what Bloom filters are and how they work, refer to Dr. Derrick Stolee's blog post [1]. It provides a concise explanation of the adoption of Bloom filters as described in [2] and [3]. Implementation specifics: 1. We currently use 7 and 10 for the number of hashes and the size of each entry respectively. They served as great starting values, the mathematical details behind this choice are described in [1] and [4]. The implementation, while not completely open to it at the moment, is flexible enough to allow for tweaking these settings in the future. Note: The performance gains we have observed with these values are significant enough that we did not need to tweak these settings. 
The performance numbers are included in the cover letter of this series and in the commit message of the subsequent commit where we use Bloom filters to speed up `git log -- path`. 2. As described in [1] and [3], we do not need 7 independent hashing functions. We use the Murmur3 hashing scheme, seed it twice and then combine those to procure an arbitrary number of hash values. 3. The filters will be sized according to the number of changes in each commit, in multiples of 8 bit words. [1] Derrick Stolee "Supercharging the Git Commit Graph IV: Bloom Filters" https://devblogs.microsoft.com/devops/super-charging-the-git-commit-graph-iv-Bloom-filters/ [2] Flavio Bonomi, Michael Mitzenmacher, Rina Panigrahy, Sushil Singh, George Varghese "An Improved Construction for Counting Bloom Filters" http://theory.stanford.edu/~rinap/papers/esa2006b.pdf https://doi.org/10.1007/11841036_61 [3] Peter C. Dillinger and Panagiotis Manolios "Bloom Filters in Probabilistic Verification" http://www.ccs.neu.edu/home/pete/pub/Bloom-filters-verification.pdf https://doi.org/10.1007/978-3-540-30494-4_26 [4] Thomas Mueller Graf, Daniel Lemire "Xor Filters: Faster and Smaller Than Bloom and Cuckoo Filters" https://arxiv.org/abs/1912.08258 Helped-by: Derrick Stolee Reviewed-by: Jakub Narębski Signed-off-by: Garima Singh Signed-off-by: Junio C Hamano diff --git a/bloom.c b/bloom.c index 40e8763..888b67f 100644 --- a/bloom.c +++ b/bloom.c @@ -8,6 +8,11 @@ static uint32_t rotate_left(uint32_t value, int32_t count) return ((value << count) | (value >> ((-count) & mask))); } +static inline unsigned char get_bitmask(uint32_t pos) +{ + return ((unsigned char)1) << (pos & (BITS_PER_WORD - 1)); +} + /* * Calculate the murmur3 32-bit hash value for the given data * using the given seed. 
@@ -70,4 +75,35 @@ uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len) seed ^= (seed >> 16); return seed; -} \ No newline at end of file +} + +void fill_bloom_key(const char *data, + size_t len, + struct bloom_key *key, + const struct bloom_filter_settings *settings) +{ + int i; + const uint32_t seed0 = 0x293ae76f; + const uint32_t seed1 = 0x7e646e2c; + const uint32_t hash0 = murmur3_seeded(seed0, data, len); + const uint32_t hash1 = murmur3_seeded(seed1, data, len); + + key->hashes = (uint32_t *)xcalloc(settings->num_hashes, sizeof(uint32_t)); + for (i = 0; i < settings->num_hashes; i++) + key->hashes[i] = hash0 + i * hash1; +} + +void add_key_to_filter(const struct bloom_key *key, + struct bloom_filter *filter, + const struct bloom_filter_settings *settings) +{ + int i; + uint64_t mod = filter->len * BITS_PER_WORD; + + for (i = 0; i < settings->num_hashes; i++) { + uint64_t hash_mod = key->hashes[i] % mod; + uint64_t block_pos = hash_mod / BITS_PER_WORD; + + filter->data[block_pos] |= get_bitmask(hash_mod); + } +} diff --git a/bloom.h b/bloom.h index d0fcc5f..b9ce422 100644 --- a/bloom.h +++ b/bloom.h @@ -1,6 +1,60 @@ #ifndef BLOOM_H #define BLOOM_H +struct bloom_filter_settings { + /* + * The version of the hashing technique being used. + * We currently only support version = 1 which is + * the seeded murmur3 hashing technique implemented + * in bloom.c. + */ + uint32_t hash_version; + + /* + * The number of times a path is hashed, i.e. the + * number of bit positions that cumulatively + * determine whether a path is present in the + * Bloom filter. + */ + uint32_t num_hashes; + + /* + * The minimum number of bits per entry in the Bloom + * filter. If the filter contains 'n' entries, then + * filter size is the minimum number of 8-bit words + * that contain n*b bits. 
+ */ + uint32_t bits_per_entry; +}; + +#define DEFAULT_BLOOM_FILTER_SETTINGS { 1, 7, 10 } +#define BITS_PER_WORD 8 + +/* + * A bloom_filter struct represents a data segment to + * use when testing hash values. The 'len' member + * dictates how many entries are stored in + * 'data'. + */ +struct bloom_filter { + unsigned char *data; + size_t len; +}; + +/* + * A bloom_key represents the k hash values for a + * given string. These can be precomputed and + * stored in a bloom_key for re-use when testing + * against a bloom_filter. The number of hashes is + * given by the Bloom filter settings and is the same + * for all Bloom filters and keys interacting with + * the loaded version of the commit graph file and + * the Bloom data chunks. + */ +struct bloom_key { + uint32_t *hashes; +}; + /* * Calculate the murmur3 32-bit hash value for the given data * using the given seed. @@ -10,4 +64,13 @@ */ uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len); +void fill_bloom_key(const char *data, + size_t len, + struct bloom_key *key, + const struct bloom_filter_settings *settings); + +void add_key_to_filter(const struct bloom_key *key, + struct bloom_filter *filter, + const struct bloom_filter_settings *settings); + #endif \ No newline at end of file diff --git a/t/helper/test-bloom.c b/t/helper/test-bloom.c index 60ee204..20460cd 100644 --- a/t/helper/test-bloom.c +++ b/t/helper/test-bloom.c @@ -2,6 +2,36 @@ #include "bloom.h" #include "test-tool.h" +struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS; + +static void add_string_to_filter(const char *data, struct bloom_filter *filter) { + struct bloom_key key; + int i; + + fill_bloom_key(data, strlen(data), &key, &settings); + printf("Hashes:"); + for (i = 0; i < settings.num_hashes; i++){ + printf("0x%08x|", key.hashes[i]); + } + printf("\n"); + add_key_to_filter(&key, filter, &settings); +} + +static void print_bloom_filter(struct bloom_filter *filter) { + int i; + + if (!filter) { + 
printf("No filter.\n"); + return; + } + printf("Filter_Length:%d\n", (int)filter->len); + printf("Filter_Data:"); + for (i = 0; i < filter->len; i++){ + printf("%02x|", filter->data[i]); + } + printf("\n"); +} + int cmd__bloom(int argc, const char **argv) { if (!strcmp(argv[1], "get_murmur3")) { @@ -9,5 +39,23 @@ int cmd__bloom(int argc, const char **argv) printf("Murmur3 Hash with seed=0:0x%08x\n", hashed); } + if (!strcmp(argv[1], "generate_filter")) { + struct bloom_filter filter; + int i = 2; + filter.len = (settings.bits_per_entry + BITS_PER_WORD - 1) / BITS_PER_WORD; + filter.data = xcalloc(filter.len, sizeof(unsigned char)); + + if (!argv[2]){ + die("at least one input string expected"); + } + + while (argv[i]) { + add_string_to_filter(argv[i], &filter); + i++; + } + + print_bloom_filter(&filter); + } + return 0; } \ No newline at end of file diff --git a/t/t0095-bloom.sh b/t/t0095-bloom.sh index 2dad8c4..36a086c 100755 --- a/t/t0095-bloom.sh +++ b/t/t0095-bloom.sh @@ -27,4 +27,44 @@ test_expect_success 'compute unseeded murmur3 hash for test string 2' ' test_cmp expect actual ' +test_expect_success 'compute bloom key for empty string' ' + cat >expect <<-\EOF && + Hashes:0x5615800c|0x5b966560|0x61174ab4|0x66983008|0x6c19155c|0x7199fab0|0x771ae004| + Filter_Length:2 + Filter_Data:11|11| + EOF + test-tool bloom generate_filter "" >actual && + test_cmp expect actual +' + +test_expect_success 'compute bloom key for whitespace' ' + cat >expect <<-\EOF && + Hashes:0xf178874c|0x5f3d6eb6|0xcd025620|0x3ac73d8a|0xa88c24f4|0x16510c5e|0x8415f3c8| + Filter_Length:2 + Filter_Data:51|55| + EOF + test-tool bloom generate_filter " " >actual && + test_cmp expect actual +' + +test_expect_success 'compute bloom key for test string 1' ' + cat >expect <<-\EOF && + Hashes:0xb270de9b|0x1bb6f26e|0x84fd0641|0xee431a14|0x57892de7|0xc0cf41ba|0x2a15558d| + Filter_Length:2 + Filter_Data:92|6c| + EOF + test-tool bloom generate_filter "Hello world!" 
>actual && + test_cmp expect actual +' + +test_expect_success 'compute bloom key for test string 2' ' + cat >expect <<-\EOF && + Hashes:0x20ab385b|0xf5237fe2|0xc99bc769|0x9e140ef0|0x728c5677|0x47049dfe|0x1b7ce585| + Filter_Length:2 + Filter_Data:a5|4a| + EOF + test-tool bloom generate_filter "file.txt" >actual && + test_cmp expect actual +' + test_done \ No newline at end of file -- cgit v0.10.2-6-g49f6 From ed591febb4a201ce48b34a4e90027414cd0d7966 Mon Sep 17 00:00:00 2001 From: Garima Singh Date: Mon, 30 Mar 2020 00:31:26 +0000 Subject: bloom.c: core Bloom filter implementation for changed paths. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the core implementation for computing Bloom filters for the paths changed between a commit and its first parent. We fill the Bloom filters as (const char *data, int len) pairs as `struct bloom_filter` within a commit slab. Filters for commits with no changes, and for commits with more than 512 changes, are represented with a filter of length zero. There is no gain in distinguishing between a computed filter of length zero for a commit with no changes, and an uncomputed filter for new commits or for commits with more than 512 changes. The effect on `git log -- path` is the same in both cases. We will fall back to the normal diffing algorithm when we can't benefit from the existence of Bloom filters. 
Helped-by: Jeff King Helped-by: Derrick Stolee Reviewed-by: Jakub Narębski Signed-off-by: Garima Singh Signed-off-by: Junio C Hamano diff --git a/bloom.c b/bloom.c index 888b67f..881a984 100644 --- a/bloom.c +++ b/bloom.c @@ -1,5 +1,18 @@ #include "git-compat-util.h" #include "bloom.h" +#include "diff.h" +#include "diffcore.h" +#include "revision.h" +#include "hashmap.h" + +define_commit_slab(bloom_filter_slab, struct bloom_filter); + +struct bloom_filter_slab bloom_filters; + +struct pathmap_hash_entry { + struct hashmap_entry entry; + const char path[FLEX_ARRAY]; +}; static uint32_t rotate_left(uint32_t value, int32_t count) { @@ -107,3 +120,87 @@ void add_key_to_filter(const struct bloom_key *key, filter->data[block_pos] |= get_bitmask(hash_mod); } } + +void init_bloom_filters(void) +{ + init_bloom_filter_slab(&bloom_filters); +} + +struct bloom_filter *get_bloom_filter(struct repository *r, + struct commit *c) +{ + struct bloom_filter *filter; + struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS; + int i; + struct diff_options diffopt; + + if (bloom_filters.slab_size == 0) + return NULL; + + filter = bloom_filter_slab_at(&bloom_filters, c); + + repo_diff_setup(r, &diffopt); + diffopt.flags.recursive = 1; + diff_setup_done(&diffopt); + + if (c->parents) + diff_tree_oid(&c->parents->item->object.oid, &c->object.oid, "", &diffopt); + else + diff_tree_oid(NULL, &c->object.oid, "", &diffopt); + diffcore_std(&diffopt); + + if (diff_queued_diff.nr <= 512) { + struct hashmap pathmap; + struct pathmap_hash_entry *e; + struct hashmap_iter iter; + hashmap_init(&pathmap, NULL, NULL, 0); + + for (i = 0; i < diff_queued_diff.nr; i++) { + const char *path = diff_queued_diff.queue[i]->two->path; + + /* + * Add each leading directory of the changed file, i.e. for + * 'dir/subdir/file' add 'dir' and 'dir/subdir' as well, so + * the Bloom filter could be used to speed up commands like + * 'git log dir/subdir', too. 
+ * + * Note that directories are added without the trailing '/'. + */ + do { + char *last_slash = strrchr(path, '/'); + + FLEX_ALLOC_STR(e, path, path); + hashmap_entry_init(&e->entry, strhash(path)); + hashmap_add(&pathmap, &e->entry); + + if (!last_slash) + last_slash = (char*)path; + *last_slash = '\0'; + + } while (*path); + + diff_free_filepair(diff_queued_diff.queue[i]); + } + + filter->len = (hashmap_get_size(&pathmap) * settings.bits_per_entry + BITS_PER_WORD - 1) / BITS_PER_WORD; + filter->data = xcalloc(filter->len, sizeof(unsigned char)); + + hashmap_for_each_entry(&pathmap, &iter, e, entry) { + struct bloom_key key; + fill_bloom_key(e->path, strlen(e->path), &key, &settings); + add_key_to_filter(&key, filter, &settings); + } + + hashmap_free_entries(&pathmap, struct pathmap_hash_entry, entry); + } else { + for (i = 0; i < diff_queued_diff.nr; i++) + diff_free_filepair(diff_queued_diff.queue[i]); + filter->data = NULL; + filter->len = 0; + } + + free(diff_queued_diff.queue); + DIFF_QUEUE_CLEAR(&diff_queued_diff); + + return filter; +} diff --git a/bloom.h b/bloom.h index b9ce422..85ab8e9 100644 --- a/bloom.h +++ b/bloom.h @@ -1,6 +1,9 @@ #ifndef BLOOM_H #define BLOOM_H +struct commit; +struct repository; + struct bloom_filter_settings { /* * The version of the hashing technique being used. 
@@ -73,4 +76,9 @@ void add_key_to_filter(const struct bloom_key *key, struct bloom_filter *filter, const struct bloom_filter_settings *settings); +void init_bloom_filters(void); + +struct bloom_filter *get_bloom_filter(struct repository *r, + struct commit *c); + #endif \ No newline at end of file diff --git a/t/helper/test-bloom.c b/t/helper/test-bloom.c index 20460cd..f18d1b7 100644 --- a/t/helper/test-bloom.c +++ b/t/helper/test-bloom.c @@ -1,6 +1,7 @@ #include "git-compat-util.h" #include "bloom.h" #include "test-tool.h" +#include "commit.h" struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS; @@ -32,6 +33,16 @@ static void print_bloom_filter(struct bloom_filter *filter) { printf("\n"); } +static void get_bloom_filter_for_commit(const struct object_id *commit_oid) +{ + struct commit *c; + struct bloom_filter *filter; + setup_git_directory(); + c = lookup_commit(the_repository, commit_oid); + filter = get_bloom_filter(the_repository, c); + print_bloom_filter(filter); +} + int cmd__bloom(int argc, const char **argv) { if (!strcmp(argv[1], "get_murmur3")) { @@ -57,5 +68,14 @@ int cmd__bloom(int argc, const char **argv) print_bloom_filter(&filter); } + if (!strcmp(argv[1], "get_filter_for_commit")) { + struct object_id oid; + const char *end; + if (parse_oid_hex(argv[2], &oid, &end)) + die("cannot parse oid '%s'", argv[2]); + init_bloom_filters(); + get_bloom_filter_for_commit(&oid); + } + return 0; } \ No newline at end of file diff --git a/t/t0095-bloom.sh b/t/t0095-bloom.sh index 36a086c..8f9eef1 100755 --- a/t/t0095-bloom.sh +++ b/t/t0095-bloom.sh @@ -67,4 +67,51 @@ test_expect_success 'compute bloom key for test string 2' ' test_cmp expect actual ' +test_expect_success 'get bloom filters for commit with no changes' ' + git init && + git commit --allow-empty -m "c0" && + cat >expect <<-\EOF && + Filter_Length:0 + Filter_Data: + EOF + test-tool bloom get_filter_for_commit "$(git rev-parse HEAD)" >actual && + test_cmp expect actual +' + 
+test_expect_success 'get bloom filter for commit with 10 changes' ' + rm actual && + rm expect && + mkdir smallDir && + for i in $(test_seq 0 9) + do + echo $i >smallDir/$i + done && + git add smallDir && + git commit -m "commit with 10 changes" && + cat >expect <<-\EOF && + Filter_Length:25 + Filter_Data:82|a0|65|47|0c|92|90|c0|a1|40|02|a0|e2|40|e0|04|0a|9a|66|cf|80|19|85|42|23| + EOF + test-tool bloom get_filter_for_commit "$(git rev-parse HEAD)" >actual && + test_cmp expect actual +' + +test_expect_success EXPENSIVE 'get bloom filter for commit with 513 changes' ' + rm actual && + rm expect && + mkdir bigDir && + for i in $(test_seq 0 512) + do + echo $i >bigDir/$i + done && + git add bigDir && + git commit -m "commit with 513 changes" && + cat >expect <<-\EOF && + Filter_Length:0 + Filter_Data: + EOF + test-tool bloom get_filter_for_commit "$(git rev-parse HEAD)" >actual && + test_cmp expect actual +' + test_done \ No newline at end of file -- cgit v0.10.2-6-g49f6 From e3696980163bdbd3bc56e5ffc69e8770015f366f Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 30 Mar 2020 00:31:27 +0000 Subject: diff: halt tree-diff early after max_changes When computing the changed-paths bloom filters for the commit-graph, we limit the size of the filter by restricting the number of paths in the diff. Instead of computing a large diff and then ignoring the result, it is better to halt the diff computation early. Create a new "max_changes" option in struct diff_options. If non-zero, then halt the diff computation after discovering strictly more changed paths. This includes paths corresponding to trees that change. Use this max_changes option in the bloom filter calculations. This reduces the time taken to compute the filters for the Linux kernel repo from 2m50s to 2m35s. On a large internal repository with ~500 commits that perform tree-wide changes, the time reduced from 6m15s to 3m48s. 
Signed-off-by: Derrick Stolee Signed-off-by: Garima Singh Signed-off-by: Junio C Hamano diff --git a/bloom.c b/bloom.c index 881a984..a16eee9 100644 --- a/bloom.c +++ b/bloom.c @@ -133,6 +133,7 @@ struct bloom_filter *get_bloom_filter(struct repository *r, struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS; int i; struct diff_options diffopt; + int max_changes = 512; if (bloom_filters.slab_size == 0) return NULL; @@ -141,6 +142,7 @@ struct bloom_filter *get_bloom_filter(struct repository *r, repo_diff_setup(r, &diffopt); diffopt.flags.recursive = 1; + diffopt.max_changes = max_changes; diff_setup_done(&diffopt); if (c->parents) @@ -149,7 +151,7 @@ struct bloom_filter *get_bloom_filter(struct repository *r, diff_tree_oid(NULL, &c->object.oid, "", &diffopt); diffcore_std(&diffopt); - if (diff_queued_diff.nr <= 512) { + if (diff_queued_diff.nr <= max_changes) { struct hashmap pathmap; struct pathmap_hash_entry *e; struct hashmap_iter iter; diff --git a/diff.h b/diff.h index 6febe7e..9443dc1 100644 --- a/diff.h +++ b/diff.h @@ -285,6 +285,11 @@ struct diff_options { /* Number of hexdigits to abbreviate raw format output to. */ int abbrev; + /* If non-zero, then stop computing after this many changes. */ + int max_changes; + /* For internal use only. 
*/ + int num_changes; + int ita_invisible_in_index; /* white-space error highlighting */ #define WSEH_NEW (1<<12) diff --git a/tree-diff.c b/tree-diff.c index 33ded7f..f3d303c 100644 --- a/tree-diff.c +++ b/tree-diff.c @@ -434,6 +434,9 @@ static struct combine_diff_path *ll_diff_tree_paths( if (diff_can_quit_early(opt)) break; + if (opt->max_changes && opt->num_changes > opt->max_changes) + break; + if (opt->pathspec.nr) { skip_uninteresting(&t, base, opt); for (i = 0; i < nparent; i++) @@ -518,6 +521,7 @@ static struct combine_diff_path *ll_diff_tree_paths( /* t↓ */ update_tree_entry(&t); + opt->num_changes++; } /* t > p[imin] */ @@ -535,6 +539,7 @@ static struct combine_diff_path *ll_diff_tree_paths( skip_emit_tp: /* ∀ pi=p[imin] pi↓ */ update_tp_entries(tp, nparent); + opt->num_changes++; } } @@ -552,6 +557,7 @@ struct combine_diff_path *diff_tree_paths( const struct object_id **parents_oid, int nparent, struct strbuf *base, struct diff_options *opt) { + opt->num_changes = 0; p = ll_diff_tree_paths(p, oid, parents_oid, nparent, base, opt); /* -- cgit v0.10.2-6-g49f6 From f97b9325f6d7ad2a28bfaf6fab3197d207fcb278 Mon Sep 17 00:00:00 2001 From: Garima Singh Date: Mon, 30 Mar 2020 00:31:28 +0000 Subject: commit-graph: compute Bloom filters for changed paths Add new COMMIT_GRAPH_WRITE_BLOOM_FILTERS flag that makes Git compute Bloom filters for the paths that changed between a commit and its first parent, for each commit in the commit-graph. This computation is done on a commit-by-commit basis. We will write these Bloom filters to the commit-graph file, to store this data on disk, in the next change in this series. 
Helped-by: Derrick Stolee Signed-off-by: Garima Singh Signed-off-by: Junio C Hamano diff --git a/commit-graph.c b/commit-graph.c index e4f1a5b..862a00d 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -16,6 +16,7 @@ #include "hashmap.h" #include "replace-object.h" #include "progress.h" +#include "bloom.h" #define GRAPH_SIGNATURE 0x43475048 /* "CGPH" */ #define GRAPH_CHUNKID_OIDFANOUT 0x4f494446 /* "OIDF" */ @@ -789,9 +790,11 @@ struct write_commit_graph_context { unsigned append:1, report_progress:1, split:1, - check_oids:1; + check_oids:1, + changed_paths:1; const struct split_commit_graph_opts *split_opts; + size_t total_bloom_filter_data_size; }; static void write_graph_chunk_fanout(struct hashfile *f, @@ -1134,6 +1137,28 @@ static void compute_generation_numbers(struct write_commit_graph_context *ctx) stop_progress(&ctx->progress); } +static void compute_bloom_filters(struct write_commit_graph_context *ctx) +{ + int i; + struct progress *progress = NULL; + + init_bloom_filters(); + + if (ctx->report_progress) + progress = start_delayed_progress( + _("Computing commit changed paths Bloom filters"), + ctx->commits.nr); + + for (i = 0; i < ctx->commits.nr; i++) { + struct commit *c = ctx->commits.list[i]; + struct bloom_filter *filter = get_bloom_filter(ctx->r, c); + ctx->total_bloom_filter_data_size += sizeof(unsigned char) * filter->len; + display_progress(progress, i + 1); + } + + stop_progress(&progress); +} + static int add_ref_to_list(const char *refname, const struct object_id *oid, int flags, void *cb_data) @@ -1776,6 +1801,8 @@ int write_commit_graph(struct object_directory *odb, ctx->split = flags & COMMIT_GRAPH_WRITE_SPLIT ? 1 : 0; ctx->check_oids = flags & COMMIT_GRAPH_WRITE_CHECK_OIDS ? 1 : 0; ctx->split_opts = split_opts; + ctx->changed_paths = flags & COMMIT_GRAPH_WRITE_BLOOM_FILTERS ? 
1 : 0; + ctx->total_bloom_filter_data_size = 0; if (ctx->split) { struct commit_graph *g; @@ -1870,6 +1897,9 @@ int write_commit_graph(struct object_directory *odb, compute_generation_numbers(ctx); + if (ctx->changed_paths) + compute_bloom_filters(ctx); + res = write_commit_graph_file(ctx); if (ctx->split) diff --git a/commit-graph.h b/commit-graph.h index e87a6f6..86be812 100644 --- a/commit-graph.h +++ b/commit-graph.h @@ -79,7 +79,8 @@ enum commit_graph_write_flags { COMMIT_GRAPH_WRITE_PROGRESS = (1 << 1), COMMIT_GRAPH_WRITE_SPLIT = (1 << 2), /* Make sure that each OID in the input is a valid commit OID. */ - COMMIT_GRAPH_WRITE_CHECK_OIDS = (1 << 3) + COMMIT_GRAPH_WRITE_CHECK_OIDS = (1 << 3), + COMMIT_GRAPH_WRITE_BLOOM_FILTERS = (1 << 4), }; struct split_commit_graph_opts { -- cgit v0.10.2-6-g49f6 From d21ee7d111073dfd7a86f6fe870d0c1ec6a07126 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Mon, 30 Mar 2020 00:31:29 +0000 Subject: commit-graph: examine changed-path objects in pack order Looking at the diff of commit objects in pack order is much faster than in sha1 order, as it gives locality to the access of tree deltas (whereas sha1 order is effectively random). Unfortunately the commit-graph code sorts the commits (several times, sometimes as an oid and sometimes a pointer-to-commit), and we ultimately traverse in sha1 order. Instead, let's remember the position at which we see each commit, and traverse in that order when looking at bloom filters. This drops my time for "git commit-graph write --changed-paths" in linux.git from ~4 minutes to ~1.5 minutes. Probably the "--reachable" code path would want something similar. Or alternatively, we could use a different data structure (either a hash, or maybe even just a bit in "struct commit") to keep track of which oids we've seen, etc instead of sorting. And then we could keep the original order. 
Signed-off-by: Jeff King Signed-off-by: Garima Singh Signed-off-by: Junio C Hamano diff --git a/commit-graph.c b/commit-graph.c index 862a00d..31b06f8 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -17,6 +17,7 @@ #include "replace-object.h" #include "progress.h" #include "bloom.h" +#include "commit-slab.h" #define GRAPH_SIGNATURE 0x43475048 /* "CGPH" */ #define GRAPH_CHUNKID_OIDFANOUT 0x4f494446 /* "OIDF" */ @@ -46,9 +47,32 @@ /* Remember to update object flag allocation in object.h */ #define REACHABLE (1u<<15) -char *get_commit_graph_filename(struct object_directory *odb) +/* Keep track of the order in which commits are added to our list. */ +define_commit_slab(commit_pos, int); +static struct commit_pos commit_pos = COMMIT_SLAB_INIT(1, commit_pos); + +static void set_commit_pos(struct repository *r, const struct object_id *oid) +{ + static int32_t max_pos; + struct commit *commit = lookup_commit(r, oid); + + if (!commit) + return; /* should never happen, but be lenient */ + + *commit_pos_at(&commit_pos, commit) = max_pos++; +} + +static int commit_pos_cmp(const void *va, const void *vb) { - return xstrfmt("%s/info/commit-graph", odb->path); + const struct commit *a = *(const struct commit **)va; + const struct commit *b = *(const struct commit **)vb; + return commit_pos_at(&commit_pos, a) - + commit_pos_at(&commit_pos, b); +} + +char *get_commit_graph_filename(struct object_directory *obj_dir) +{ + return xstrfmt("%s/info/commit-graph", obj_dir->path); } static char *get_split_graph_filename(struct object_directory *odb, @@ -1021,6 +1045,8 @@ static int add_packed_commits(const struct object_id *oid, oidcpy(&(ctx->oids.list[ctx->oids.nr]), oid); ctx->oids.nr++; + set_commit_pos(ctx->r, oid); + return 0; } @@ -1141,6 +1167,7 @@ static void compute_bloom_filters(struct write_commit_graph_context *ctx) { int i; struct progress *progress = NULL; + struct commit **sorted_commits; init_bloom_filters(); @@ -1149,13 +1176,18 @@ static void 
compute_bloom_filters(struct write_commit_graph_context *ctx) _("Computing commit changed paths Bloom filters"), ctx->commits.nr); + ALLOC_ARRAY(sorted_commits, ctx->commits.nr); + COPY_ARRAY(sorted_commits, ctx->commits.list, ctx->commits.nr); + QSORT(sorted_commits, ctx->commits.nr, commit_pos_cmp); + for (i = 0; i < ctx->commits.nr; i++) { - struct commit *c = ctx->commits.list[i]; + struct commit *c = sorted_commits[i]; struct bloom_filter *filter = get_bloom_filter(ctx->r, c); ctx->total_bloom_filter_data_size += sizeof(unsigned char) * filter->len; display_progress(progress, i + 1); } + free(sorted_commits); stop_progress(&progress); } -- cgit v0.10.2-6-g49f6 From 3d11275505694ce4e5256516de1c5dd90e749303 Mon Sep 17 00:00:00 2001 From: Garima Singh Date: Mon, 30 Mar 2020 00:31:30 +0000 Subject: commit-graph: examine commits by generation number When running 'git commit-graph write --changed-paths', we sort the commits by pack-order to save time when computing the changed-paths bloom filters. This does not help when finding the commits via the '--reachable' flag. If not using pack-order, then sort by generation number before examining the diff. Commits with similar generation are more likely to have many trees in common, making the diff faster. On the Linux kernel repository, this change reduced the computation time for 'git commit-graph write --reachable --changed-paths' from 3m00s to 1m37s. 
Helped-by: Jeff King Signed-off-by: Derrick Stolee Signed-off-by: Garima Singh Signed-off-by: Junio C Hamano diff --git a/commit-graph.c b/commit-graph.c index 31b06f8..732c81f 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -70,6 +70,25 @@ static int commit_pos_cmp(const void *va, const void *vb) commit_pos_at(&commit_pos, b); } +static int commit_gen_cmp(const void *va, const void *vb) +{ + const struct commit *a = *(const struct commit **)va; + const struct commit *b = *(const struct commit **)vb; + + /* lower generation commits first */ + if (a->generation < b->generation) + return -1; + else if (a->generation > b->generation) + return 1; + + /* use date as a heuristic when generations are equal */ + if (a->date < b->date) + return -1; + else if (a->date > b->date) + return 1; + return 0; +} + char *get_commit_graph_filename(struct object_directory *obj_dir) { return xstrfmt("%s/info/commit-graph", obj_dir->path); @@ -815,7 +834,8 @@ struct write_commit_graph_context { report_progress:1, split:1, check_oids:1, - changed_paths:1; + changed_paths:1, + order_by_pack:1; const struct split_commit_graph_opts *split_opts; size_t total_bloom_filter_data_size; @@ -1178,7 +1198,11 @@ static void compute_bloom_filters(struct write_commit_graph_context *ctx) ALLOC_ARRAY(sorted_commits, ctx->commits.nr); COPY_ARRAY(sorted_commits, ctx->commits.list, ctx->commits.nr); - QSORT(sorted_commits, ctx->commits.nr, commit_pos_cmp); + + if (ctx->order_by_pack) + QSORT(sorted_commits, ctx->commits.nr, commit_pos_cmp); + else + QSORT(sorted_commits, ctx->commits.nr, commit_gen_cmp); for (i = 0; i < ctx->commits.nr; i++) { struct commit *c = sorted_commits[i]; @@ -1884,6 +1908,7 @@ int write_commit_graph(struct object_directory *odb, } if (pack_indexes) { + ctx->order_by_pack = 1; if ((res = fill_oids_from_packs(ctx, pack_indexes))) goto cleanup; } @@ -1893,8 +1918,10 @@ int write_commit_graph(struct object_directory *odb, goto cleanup; } - if (!pack_indexes && !commit_hex) + if 
(!pack_indexes && !commit_hex) { + ctx->order_by_pack = 1; fill_oids_from_all_packs(ctx); + } close_reachable(ctx); -- cgit v0.10.2-6-g49f6 From 76ffbca71a9c89d1e530f734e16a70b3924f4bea Mon Sep 17 00:00:00 2001 From: Garima Singh Date: Mon, 6 Apr 2020 16:59:49 +0000 Subject: commit-graph: write Bloom filters to commit graph file Update the technical documentation for commit-graph-format with the formats for the Bloom filter index (BIDX) and Bloom filter data (BDAT) chunks. Write the computed Bloom filters information to the commit graph file using this format. Helped-by: Derrick Stolee Signed-off-by: Garima Singh Signed-off-by: Junio C Hamano diff --git a/Documentation/technical/commit-graph-format.txt b/Documentation/technical/commit-graph-format.txt index a4f1744..de56f9f 100644 --- a/Documentation/technical/commit-graph-format.txt +++ b/Documentation/technical/commit-graph-format.txt @@ -17,6 +17,9 @@ metadata, including: - The parents of the commit, stored using positional references within the graph file. +- The Bloom filter of the commit carrying the paths that were changed between + the commit and its first parent, if requested. + These positional references are stored as unsigned 32-bit integers corresponding to the array position within the list of commit OIDs. Due to some special constants we use to track parents, we can store at most @@ -93,6 +96,33 @@ CHUNK DATA: positions for the parents until reaching a value with the most-significant bit on. The other bits correspond to the position of the last parent. + Bloom Filter Index (ID: {'B', 'I', 'D', 'X'}) (N * 4 bytes) [Optional] + * The ith entry, BIDX[i], stores the number of 8-byte word blocks in all + Bloom filters from commit 0 to commit i (inclusive) in lexicographic + order. The Bloom filter for the i-th commit spans from BIDX[i-1] to + BIDX[i] (plus header length), where BIDX[-1] is 0. + * The BIDX chunk is ignored if the BDAT chunk is not present. 
+ + Bloom Filter Data (ID: {'B', 'D', 'A', 'T'}) [Optional] + * It starts with header consisting of three unsigned 32-bit integers: + - Version of the hash algorithm being used. We currently only support + value 1 which corresponds to the 32-bit version of the murmur3 hash + implemented exactly as described in + https://en.wikipedia.org/wiki/MurmurHash#Algorithm and the double + hashing technique using seed values 0x293ae76f and 0x7e646e2 as + described in https://doi.org/10.1007/978-3-540-30494-4_26 "Bloom Filters + in Probabilistic Verification" + - The number of times a path is hashed and hence the number of bit positions + that cumulatively determine whether a file is present in the commit. + - The minimum number of bits 'b' per entry in the Bloom filter. If the filter + contains 'n' entries, then the filter size is the minimum number of 64-bit + words that contain n*b bits. + * The rest of the chunk is the concatenation of all the computed Bloom + filters for the commits in lexicographic order. + * Note: Commits with no changes or more than 512 changes have Bloom filters + of length zero. + * The BDAT chunk is present if and only if BIDX is present. + Base Graphs List (ID: {'B', 'A', 'S', 'E'}) [Optional] This list of H-byte hashes describe a set of B commit-graph files that form a commit-graph chain. 
The graph position for the ith commit in this diff --git a/commit-graph.c b/commit-graph.c index 732c81f..a8b6b5c 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -24,8 +24,10 @@ #define GRAPH_CHUNKID_OIDLOOKUP 0x4f49444c /* "OIDL" */ #define GRAPH_CHUNKID_DATA 0x43444154 /* "CDAT" */ #define GRAPH_CHUNKID_EXTRAEDGES 0x45444745 /* "EDGE" */ +#define GRAPH_CHUNKID_BLOOMINDEXES 0x42494458 /* "BIDX" */ +#define GRAPH_CHUNKID_BLOOMDATA 0x42444154 /* "BDAT" */ #define GRAPH_CHUNKID_BASE 0x42415345 /* "BASE" */ -#define MAX_NUM_CHUNKS 5 +#define MAX_NUM_CHUNKS 7 #define GRAPH_DATA_WIDTH (the_hash_algo->rawsz + 16) @@ -319,6 +321,32 @@ struct commit_graph *parse_commit_graph(void *graph_map, int fd, chunk_repeated = 1; else graph->chunk_base_graphs = data + chunk_offset; + break; + + case GRAPH_CHUNKID_BLOOMINDEXES: + if (graph->chunk_bloom_indexes) + chunk_repeated = 1; + else + graph->chunk_bloom_indexes = data + chunk_offset; + break; + + case GRAPH_CHUNKID_BLOOMDATA: + if (graph->chunk_bloom_data) + chunk_repeated = 1; + else { + uint32_t hash_version; + graph->chunk_bloom_data = data + chunk_offset; + hash_version = get_be32(data + chunk_offset); + + if (hash_version != 1) + break; + + graph->bloom_filter_settings = xmalloc(sizeof(struct bloom_filter_settings)); + graph->bloom_filter_settings->hash_version = hash_version; + graph->bloom_filter_settings->num_hashes = get_be32(data + chunk_offset + 4); + graph->bloom_filter_settings->bits_per_entry = get_be32(data + chunk_offset + 8); + } + break; } if (chunk_repeated) { @@ -337,6 +365,15 @@ struct commit_graph *parse_commit_graph(void *graph_map, int fd, last_chunk_offset = chunk_offset; } + if (graph->chunk_bloom_indexes && graph->chunk_bloom_data) { + init_bloom_filters(); + } else { + /* We need both the bloom chunks to exist together. 
Else ignore the data */ + graph->chunk_bloom_indexes = NULL; + graph->chunk_bloom_data = NULL; + graph->bloom_filter_settings = NULL; + } + hashcpy(graph->oid.hash, graph->data + graph->data_len - graph->hash_len); if (verify_commit_graph_lite(graph)) { @@ -1034,6 +1071,59 @@ static void write_graph_chunk_extra_edges(struct hashfile *f, } } +static void write_graph_chunk_bloom_indexes(struct hashfile *f, + struct write_commit_graph_context *ctx) +{ + struct commit **list = ctx->commits.list; + struct commit **last = ctx->commits.list + ctx->commits.nr; + uint32_t cur_pos = 0; + struct progress *progress = NULL; + int i = 0; + + if (ctx->report_progress) + progress = start_delayed_progress( + _("Writing changed paths Bloom filters index"), + ctx->commits.nr); + + while (list < last) { + struct bloom_filter *filter = get_bloom_filter(ctx->r, *list); + cur_pos += filter->len; + display_progress(progress, ++i); + hashwrite_be32(f, cur_pos); + list++; + } + + stop_progress(&progress); +} + +static void write_graph_chunk_bloom_data(struct hashfile *f, + struct write_commit_graph_context *ctx, + const struct bloom_filter_settings *settings) +{ + struct commit **list = ctx->commits.list; + struct commit **last = ctx->commits.list + ctx->commits.nr; + struct progress *progress = NULL; + int i = 0; + + if (ctx->report_progress) + progress = start_delayed_progress( + _("Writing changed paths Bloom filters data"), + ctx->commits.nr); + + hashwrite_be32(f, settings->hash_version); + hashwrite_be32(f, settings->num_hashes); + hashwrite_be32(f, settings->bits_per_entry); + + while (list < last) { + struct bloom_filter *filter = get_bloom_filter(ctx->r, *list); + display_progress(progress, ++i); + hashwrite(f, filter->data, filter->len * sizeof(unsigned char)); + list++; + } + + stop_progress(&progress); +} + static int oid_compare(const void *_a, const void *_b) { const struct object_id *a = (const struct object_id *)_a; @@ -1438,6 +1528,7 @@ static int 
write_commit_graph_file(struct write_commit_graph_context *ctx) struct strbuf progress_title = STRBUF_INIT; int num_chunks = 3; struct object_id file_hash; + const struct bloom_filter_settings bloom_settings = DEFAULT_BLOOM_FILTER_SETTINGS; if (ctx->split) { struct strbuf tmp_file = STRBUF_INIT; @@ -1482,6 +1573,12 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx) chunk_ids[num_chunks] = GRAPH_CHUNKID_EXTRAEDGES; num_chunks++; } + if (ctx->changed_paths) { + chunk_ids[num_chunks] = GRAPH_CHUNKID_BLOOMINDEXES; + num_chunks++; + chunk_ids[num_chunks] = GRAPH_CHUNKID_BLOOMDATA; + num_chunks++; + } if (ctx->num_commit_graphs_after > 1) { chunk_ids[num_chunks] = GRAPH_CHUNKID_BASE; num_chunks++; @@ -1500,6 +1597,15 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx) 4 * ctx->num_extra_edges; num_chunks++; } + if (ctx->changed_paths) { + chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] + + sizeof(uint32_t) * ctx->commits.nr; + num_chunks++; + + chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] + + sizeof(uint32_t) * 3 + ctx->total_bloom_filter_data_size; + num_chunks++; + } if (ctx->num_commit_graphs_after > 1) { chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] + hashsz * (ctx->num_commit_graphs_after - 1); @@ -1537,6 +1643,10 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx) write_graph_chunk_data(f, hashsz, ctx); if (ctx->num_extra_edges) write_graph_chunk_extra_edges(f, ctx); + if (ctx->changed_paths) { + write_graph_chunk_bloom_indexes(f, ctx); + write_graph_chunk_bloom_data(f, ctx, &bloom_settings); + } if (ctx->num_commit_graphs_after > 1 && write_graph_chunk_base(f, ctx)) { return -1; @@ -2184,6 +2294,7 @@ void free_commit_graph(struct commit_graph *g) close(g->graph_fd); } free(g->filename); + free(g->bloom_filter_settings); free(g); } diff --git a/commit-graph.h b/commit-graph.h index 86be812..8e7a8e0 100644 --- a/commit-graph.h +++ b/commit-graph.h @@ 
-11,6 +11,7 @@ #define GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD "GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD" struct commit; +struct bloom_filter_settings; char *get_commit_graph_filename(struct object_directory *odb); int open_commit_graph(const char *graph_file, int *fd, struct stat *st); @@ -59,6 +60,10 @@ struct commit_graph { const unsigned char *chunk_commit_data; const unsigned char *chunk_extra_edges; const unsigned char *chunk_base_graphs; + const unsigned char *chunk_bloom_indexes; + const unsigned char *chunk_bloom_data; + + struct bloom_filter_settings *bloom_filter_settings; }; struct commit_graph *load_commit_graph_one_fd_st(int fd, struct stat *st, -- cgit v0.10.2-6-g49f6 From 1217c03e7b87b15f2c78af5b1e1915a675050454 Mon Sep 17 00:00:00 2001 From: Garima Singh Date: Mon, 6 Apr 2020 16:59:50 +0000 Subject: commit-graph: reuse existing Bloom filters during write Add logic to a) parse Bloom filter information from the commit graph file and, b) re-use existing Bloom filters. See Documentation/technical/commit-graph-format for the format in which the Bloom filter information is written to the commit graph file. To read Bloom filter for a given commit with lexicographic position 'i' we need to: 1. Read BIDX[i] which essentially gives us the starting index in BDAT for filter of commit i+1. It is essentially the index past the end of the filter of commit i. It is called end_index in the code. 2. For i>0, read BIDX[i-1] which will give us the starting index in BDAT for filter of commit i. It is called the start_index in the code. For the first commit, where i = 0, Bloom filter data starts at the beginning, just past the header in the BDAT chunk. Hence, start_index will be 0. 3. The length of the filter will be end_index - start_index, because BIDX[i] gives the cumulative 8-byte words including the ith commit's filter. We toggle whether Bloom filters should be recomputed based on the compute_if_not_present flag. 
Helped-by: Derrick Stolee Signed-off-by: Garima Singh Signed-off-by: Junio C Hamano diff --git a/bloom.c b/bloom.c index a16eee9..0f714dd 100644 --- a/bloom.c +++ b/bloom.c @@ -4,6 +4,8 @@ #include "diffcore.h" #include "revision.h" #include "hashmap.h" +#include "commit-graph.h" +#include "commit.h" define_commit_slab(bloom_filter_slab, struct bloom_filter); @@ -26,6 +28,36 @@ static inline unsigned char get_bitmask(uint32_t pos) return ((unsigned char)1) << (pos & (BITS_PER_WORD - 1)); } +static int load_bloom_filter_from_graph(struct commit_graph *g, + struct bloom_filter *filter, + struct commit *c) +{ + uint32_t lex_pos, start_index, end_index; + + while (c->graph_pos < g->num_commits_in_base) + g = g->base_graph; + + /* The commit graph commit 'c' lives in doesn't carry bloom filters. */ + if (!g->chunk_bloom_indexes) + return 0; + + lex_pos = c->graph_pos - g->num_commits_in_base; + + end_index = get_be32(g->chunk_bloom_indexes + 4 * lex_pos); + + if (lex_pos > 0) + start_index = get_be32(g->chunk_bloom_indexes + 4 * (lex_pos - 1)); + else + start_index = 0; + + filter->len = end_index - start_index; + filter->data = (unsigned char *)(g->chunk_bloom_data + + sizeof(unsigned char) * start_index + + BLOOMDATA_CHUNK_HEADER_SIZE); + + return 1; +} + /* * Calculate the murmur3 32-bit hash value for the given data * using the given seed. 
@@ -127,7 +159,8 @@ void init_bloom_filters(void) } struct bloom_filter *get_bloom_filter(struct repository *r, - struct commit *c) + struct commit *c, + int compute_if_not_present) { struct bloom_filter *filter; struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS; @@ -140,6 +173,20 @@ struct bloom_filter *get_bloom_filter(struct repository *r, filter = bloom_filter_slab_at(&bloom_filters, c); + if (!filter->data) { + load_commit_graph_info(r, c); + if (c->graph_pos != COMMIT_NOT_FROM_GRAPH && + r->objects->commit_graph->chunk_bloom_indexes) { + if (load_bloom_filter_from_graph(r->objects->commit_graph, filter, c)) + return filter; + else + return NULL; + } + } + + if (filter->data || !compute_if_not_present) + return filter; + repo_diff_setup(r, &diffopt); diffopt.flags.recursive = 1; diffopt.max_changes = max_changes; diff --git a/bloom.h b/bloom.h index 85ab8e9..760d712 100644 --- a/bloom.h +++ b/bloom.h @@ -32,6 +32,7 @@ struct bloom_filter_settings { #define DEFAULT_BLOOM_FILTER_SETTINGS { 1, 7, 10 } #define BITS_PER_WORD 8 +#define BLOOMDATA_CHUNK_HEADER_SIZE 3 * sizeof(uint32_t) /* * A bloom_filter struct represents a data segment to @@ -79,6 +80,7 @@ void add_key_to_filter(const struct bloom_key *key, void init_bloom_filters(void); struct bloom_filter *get_bloom_filter(struct repository *r, - struct commit *c); + struct commit *c, + int compute_if_not_present); #endif \ No newline at end of file diff --git a/commit-graph.c b/commit-graph.c index a8b6b5c..7766862 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -1086,7 +1086,7 @@ static void write_graph_chunk_bloom_indexes(struct hashfile *f, ctx->commits.nr); while (list < last) { - struct bloom_filter *filter = get_bloom_filter(ctx->r, *list); + struct bloom_filter *filter = get_bloom_filter(ctx->r, *list, 0); cur_pos += filter->len; display_progress(progress, ++i); hashwrite_be32(f, cur_pos); @@ -1115,7 +1115,7 @@ static void write_graph_chunk_bloom_data(struct hashfile *f, 
hashwrite_be32(f, settings->bits_per_entry); while (list < last) { - struct bloom_filter *filter = get_bloom_filter(ctx->r, *list); + struct bloom_filter *filter = get_bloom_filter(ctx->r, *list, 0); display_progress(progress, ++i); hashwrite(f, filter->data, filter->len * sizeof(unsigned char)); list++; @@ -1296,7 +1296,7 @@ static void compute_bloom_filters(struct write_commit_graph_context *ctx) for (i = 0; i < ctx->commits.nr; i++) { struct commit *c = sorted_commits[i]; - struct bloom_filter *filter = get_bloom_filter(ctx->r, c); + struct bloom_filter *filter = get_bloom_filter(ctx->r, c, 1); ctx->total_bloom_filter_data_size += sizeof(unsigned char) * filter->len; display_progress(progress, i + 1); } diff --git a/t/helper/test-bloom.c b/t/helper/test-bloom.c index f18d1b7..ce41266 100644 --- a/t/helper/test-bloom.c +++ b/t/helper/test-bloom.c @@ -39,7 +39,7 @@ static void get_bloom_filter_for_commit(const struct object_id *commit_oid) struct bloom_filter *filter; setup_git_directory(); c = lookup_commit(the_repository, commit_oid); - filter = get_bloom_filter(the_repository, c); + filter = get_bloom_filter(the_repository, c, 1); print_bloom_filter(filter); } -- cgit v0.10.2-6-g49f6 From d38e07b8c44ffdb73e7eba1b7f6a73eb7eb0d5f9 Mon Sep 17 00:00:00 2001 From: Garima Singh Date: Mon, 6 Apr 2020 16:59:51 +0000 Subject: commit-graph: add --changed-paths option to write subcommand Add --changed-paths option to git commit-graph write. This option will allow users to compute information about the paths that have changed between a commit and its first parent, and write it into the commit graph file. If the option is passed to the write subcommand we set the COMMIT_GRAPH_WRITE_BLOOM_FILTERS flag and pass it down to the commit-graph logic. 
Helped-by: Derrick Stolee Signed-off-by: Garima Singh Signed-off-by: Junio C Hamano diff --git a/Documentation/git-commit-graph.txt b/Documentation/git-commit-graph.txt index 28d1fee..f4b13c0 100644 --- a/Documentation/git-commit-graph.txt +++ b/Documentation/git-commit-graph.txt @@ -57,6 +57,11 @@ or `--stdin-packs`.) With the `--append` option, include all commits that are present in the existing commit-graph file. + +With the `--changed-paths` option, compute and write information about the +paths changed between a commit and its first parent. This operation can +take a while on large repositories. It provides significant performance gains +for getting history of a directory or a file with `git log -- <path>`. ++ With the `--split` option, write the commit-graph as a chain of multiple commit-graph files stored in `/info/commit-graphs`. The new commits not already in the commit-graph are added in a new "tip" file. This file diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c index d1ab662..cacb5d0 100644 --- a/builtin/commit-graph.c +++ b/builtin/commit-graph.c @@ -9,7 +9,7 @@ static char const * const builtin_commit_graph_usage[] = { N_("git commit-graph verify [--object-dir ] [--shallow] [--[no-]progress]"), - N_("git commit-graph write [--object-dir ] [--append|--split] [--reachable|--stdin-packs|--stdin-commits] [--[no-]progress] "), + N_("git commit-graph write [--object-dir ] [--append|--split] [--reachable|--stdin-packs|--stdin-commits] [--changed-paths] [--[no-]progress] "), NULL }; @@ -19,7 +19,7 @@ static const char * const builtin_commit_graph_verify_usage[] = { }; static const char * const builtin_commit_graph_write_usage[] = { - N_("git commit-graph write [--object-dir ] [--append|--split] [--reachable|--stdin-packs|--stdin-commits] [--[no-]progress] "), + N_("git commit-graph write [--object-dir ] [--append|--split] [--reachable|--stdin-packs|--stdin-commits] [--changed-paths] [--[no-]progress] "), NULL }; @@ -32,6 +32,7 @@ static struct 
opts_commit_graph { int split; int shallow; int progress; + int enable_changed_paths; } opts; static struct object_directory *find_odb(struct repository *r, @@ -135,6 +136,8 @@ static int graph_write(int argc, const char **argv) N_("start walk at commits listed by stdin")), OPT_BOOL(0, "append", &opts.append, N_("include all commits already in the commit-graph file")), + OPT_BOOL(0, "changed-paths", &opts.enable_changed_paths, + N_("enable computation for changed paths")), OPT_BOOL(0, "progress", &opts.progress, N_("force progress reporting")), OPT_BOOL(0, "split", &opts.split, N_("allow writing an incremental commit-graph file")), @@ -168,6 +171,8 @@ static int graph_write(int argc, const char **argv) flags |= COMMIT_GRAPH_WRITE_SPLIT; if (opts.progress) flags |= COMMIT_GRAPH_WRITE_PROGRESS; + if (opts.enable_changed_paths) + flags |= COMMIT_GRAPH_WRITE_BLOOM_FILTERS; read_replace_refs = 0; odb = find_odb(the_repository, opts.obj_dir); -- cgit v0.10.2-6-g49f6 From a56b9464cd0a49317fafde080ae4e73c5430ac9b Mon Sep 17 00:00:00 2001 From: Garima Singh Date: Mon, 6 Apr 2020 16:59:52 +0000 Subject: revision.c: use Bloom filters to speed up path based revision walks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Revision walk will now use Bloom filters for commits to speed up revision walks for a particular path (for computing history for that path), if they are present in the commit-graph file. We load the Bloom filters during the prepare_revision_walk step, currently only when dealing with a single pathspec. Extending it to work with multiple pathspecs can be explored and built on top of this series in the future. While comparing trees in rev_compare_trees(), if the Bloom filter says that the file is not different between the two trees, we don't need to compute the expensive diff. This is where we get our performance gains. 
The other response of the Bloom filter is 'maybe', in which case we fall back to the full diff calculation to determine if the path was changed in the commit. We do not try to use Bloom filters when the '--walk-reflogs' option is specified. The '--walk-reflogs' option does not walk the commit ancestry chain like the rest of the options. Incorporating the performance gains when walking reflog entries would add more complexity, and can be explored in a later series. Performance Gains: We tested the performance of `git log -- <path>` on the git repo, the linux and some internal large repos, with a variety of paths of varying depths. On the git and linux repos: - we observed a 2x to 5x speed up. On a large internal repo with files seated 6-10 levels deep in the tree: - we observed 10x to 20x speed ups, with some paths going up to 28 times faster. Helped-by: Derrick Stolee Helped-by: Jonathan Tan Signed-off-by: Garima Singh Signed-off-by: Junio C Hamano diff --git a/bloom.c b/bloom.c index 0f714dd..c5b461d 100644 --- a/bloom.c +++ b/bloom.c @@ -253,3 +253,23 @@ struct bloom_filter *get_bloom_filter(struct repository *r, return filter; } + +int bloom_filter_contains(const struct bloom_filter *filter, + const struct bloom_key *key, + const struct bloom_filter_settings *settings) +{ + int i; + uint64_t mod = filter->len * BITS_PER_WORD; + + if (!mod) + return -1; + + for (i = 0; i < settings->num_hashes; i++) { + uint64_t hash_mod = key->hashes[i] % mod; + uint64_t block_pos = hash_mod / BITS_PER_WORD; + if (!(filter->data[block_pos] & get_bitmask(hash_mod))) + return 0; + } + + return 1; +} \ No newline at end of file diff --git a/bloom.h b/bloom.h index 760d712..b935186 100644 --- a/bloom.h +++ b/bloom.h @@ -83,4 +83,8 @@ struct bloom_filter *get_bloom_filter(struct repository *r, struct commit *c, int compute_if_not_present); +int bloom_filter_contains(const struct bloom_filter *filter, + const struct bloom_key *key, + const struct bloom_filter_settings *settings); + #endif 
\ No newline at end of file diff --git a/revision.c b/revision.c index 8136929..d3fcb7c 100644 --- a/revision.c +++ b/revision.c @@ -29,6 +29,7 @@ #include "prio-queue.h" #include "hashmap.h" #include "utf8.h" +#include "bloom.h" volatile show_early_output_fn_t show_early_output; @@ -624,11 +625,80 @@ static void file_change(struct diff_options *options, options->flags.has_changes = 1; } +static void prepare_to_use_bloom_filter(struct rev_info *revs) +{ + struct pathspec_item *pi; + char *path_alloc = NULL; + const char *path; + int last_index; + int len; + + if (!revs->commits) + return; + + repo_parse_commit(revs->repo, revs->commits->item); + + if (!revs->repo->objects->commit_graph) + return; + + revs->bloom_filter_settings = revs->repo->objects->commit_graph->bloom_filter_settings; + if (!revs->bloom_filter_settings) + return; + + pi = &revs->pruning.pathspec.items[0]; + last_index = pi->len - 1; + + /* remove single trailing slash from path, if needed */ + if (pi->match[last_index] == '/') { + path_alloc = xstrdup(pi->match); + path_alloc[last_index] = '\0'; + path = path_alloc; + } else + path = pi->match; + + len = strlen(path); + + revs->bloom_key = xmalloc(sizeof(struct bloom_key)); + fill_bloom_key(path, len, revs->bloom_key, revs->bloom_filter_settings); + + free(path_alloc); +} + +static int check_maybe_different_in_bloom_filter(struct rev_info *revs, + struct commit *commit) +{ + struct bloom_filter *filter; + int result; + + if (!revs->repo->objects->commit_graph) + return -1; + + if (commit->generation == GENERATION_NUMBER_INFINITY) + return -1; + + filter = get_bloom_filter(revs->repo, commit, 0); + + if (!filter) { + return -1; + } + + if (!filter->len) { + return -1; + } + + result = bloom_filter_contains(filter, + revs->bloom_key, + revs->bloom_filter_settings); + + return result; +} + static int rev_compare_tree(struct rev_info *revs, - struct commit *parent, struct commit *commit) + struct commit *parent, struct commit *commit, int nth_parent) 
{ struct tree *t1 = get_commit_tree(parent); struct tree *t2 = get_commit_tree(commit); + int bloom_ret = 1; if (!t1) return REV_TREE_NEW; @@ -653,11 +723,19 @@ static int rev_compare_tree(struct rev_info *revs, return REV_TREE_SAME; } + if (revs->bloom_key && !nth_parent) { + bloom_ret = check_maybe_different_in_bloom_filter(revs, commit); + + if (bloom_ret == 0) + return REV_TREE_SAME; + } + tree_difference = REV_TREE_SAME; revs->pruning.flags.has_changes = 0; if (diff_tree_oid(&t1->object.oid, &t2->object.oid, "", &revs->pruning) < 0) return REV_TREE_DIFFERENT; + return tree_difference; } @@ -855,7 +933,7 @@ static void try_to_simplify_commit(struct rev_info *revs, struct commit *commit) die("cannot simplify commit %s (because of %s)", oid_to_hex(&commit->object.oid), oid_to_hex(&p->object.oid)); - switch (rev_compare_tree(revs, p, commit)) { + switch (rev_compare_tree(revs, p, commit, nth_parent)) { case REV_TREE_SAME: if (!revs->simplify_history || !relevant_commit(p)) { /* Even if a merge with an uninteresting @@ -3362,6 +3440,8 @@ int prepare_revision_walk(struct rev_info *revs) FOR_EACH_OBJECT_PROMISOR_ONLY); } + if (revs->pruning.pathspec.nr == 1 && !revs->reflog_info) + prepare_to_use_bloom_filter(revs); if (revs->no_walk != REVISION_WALK_NO_WALK_UNSORTED) commit_list_sort_by_date(&revs->commits); if (revs->no_walk) @@ -3379,6 +3459,7 @@ int prepare_revision_walk(struct rev_info *revs) simplify_merges(revs); if (revs->children.name) set_children(revs); + return 0; } diff --git a/revision.h b/revision.h index 475f048..7c026fe 100644 --- a/revision.h +++ b/revision.h @@ -56,6 +56,8 @@ struct repository; struct rev_info; struct string_list; struct saved_parents; +struct bloom_key; +struct bloom_filter_settings; define_shared_commit_slab(revision_sources, char *); struct rev_cmdline_info { @@ -291,6 +293,15 @@ struct rev_info { struct revision_sources *sources; struct topo_walk_info *topo_walk_info; + + /* Commit graph bloom filter fields */ + /* The bloom 
filter key for the pathspec */ + struct bloom_key *bloom_key; + /* + * The bloom filter settings used to generate the key. + * This is loaded from the commit-graph being used. + */ + struct bloom_filter_settings *bloom_filter_settings; }; int ref_excluded(struct string_list *, const char *path); -- cgit v0.10.2-6-g49f6 From 42e50e78c6fd8978c2218bbd7b3483ae51d5e3f9 Mon Sep 17 00:00:00 2001 From: Garima Singh Date: Mon, 6 Apr 2020 16:59:53 +0000 Subject: revision.c: add trace2 stats around Bloom filter usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add trace2 statistics around Bloom filter usage and behavior for 'git log -- path' commands that are hoping to benefit from the presence of computed changed paths Bloom filters. These statistics are great for performance analysis work and for formal testing, which we will see in the commit following this one. Helped-by: Derrick Stolee Helped-by: Jonathan Tan Signed-off-by: Garima Singh Signed-off-by: Junio C Hamano diff --git a/revision.c b/revision.c index d3fcb7c..2b06ee7 100644 --- a/revision.c +++ b/revision.c @@ -30,6 +30,7 @@ #include "hashmap.h" #include "utf8.h" #include "bloom.h" +#include "json-writer.h" volatile show_early_output_fn_t show_early_output; @@ -625,6 +626,30 @@ static void file_change(struct diff_options *options, options->flags.has_changes = 1; } +static int bloom_filter_atexit_registered; +static unsigned int count_bloom_filter_maybe; +static unsigned int count_bloom_filter_definitely_not; +static unsigned int count_bloom_filter_false_positive; +static unsigned int count_bloom_filter_not_present; +static unsigned int count_bloom_filter_length_zero; + +static void trace2_bloom_filter_statistics_atexit(void) +{ + struct json_writer jw = JSON_WRITER_INIT; + + jw_object_begin(&jw, 0); + jw_object_intmax(&jw, "filter_not_present", count_bloom_filter_not_present); + jw_object_intmax(&jw, "zero_length_filter", count_bloom_filter_length_zero); + 
jw_object_intmax(&jw, "maybe", count_bloom_filter_maybe); + jw_object_intmax(&jw, "definitely_not", count_bloom_filter_definitely_not); + jw_object_intmax(&jw, "false_positive", count_bloom_filter_false_positive); + jw_end(&jw); + + trace2_data_json("bloom", the_repository, "statistics", &jw); + + jw_release(&jw); +} + static void prepare_to_use_bloom_filter(struct rev_info *revs) { struct pathspec_item *pi; @@ -661,6 +686,11 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs) revs->bloom_key = xmalloc(sizeof(struct bloom_key)); fill_bloom_key(path, len, revs->bloom_key, revs->bloom_filter_settings); + if (trace2_is_enabled() && !bloom_filter_atexit_registered) { + atexit(trace2_bloom_filter_statistics_atexit); + bloom_filter_atexit_registered = 1; + } + free(path_alloc); } @@ -679,10 +709,12 @@ static int check_maybe_different_in_bloom_filter(struct rev_info *revs, filter = get_bloom_filter(revs->repo, commit, 0); if (!filter) { + count_bloom_filter_not_present++; return -1; } if (!filter->len) { + count_bloom_filter_length_zero++; return -1; } @@ -690,6 +722,11 @@ static int check_maybe_different_in_bloom_filter(struct rev_info *revs, revs->bloom_key, revs->bloom_filter_settings); + if (result) + count_bloom_filter_maybe++; + else + count_bloom_filter_definitely_not++; + return result; } @@ -736,6 +773,10 @@ static int rev_compare_tree(struct rev_info *revs, &revs->pruning) < 0) return REV_TREE_DIFFERENT; + if (!nth_parent) + if (bloom_ret == 1 && tree_difference == REV_TREE_SAME) + count_bloom_filter_false_positive++; + return tree_difference; } -- cgit v0.10.2-6-g49f6 From a759bfa9eeb2a080d7c5c0a3c4096db5438c06bf Mon Sep 17 00:00:00 2001 From: Garima Singh Date: Mon, 6 Apr 2020 16:59:54 +0000 Subject: t4216: add end to end tests for git log with Bloom filters These tests exercise writing commit graph with Bloom filters and exercise 'git log -- path' with all the applicable options. 
They check that the output is the same with and without Bloom filters, and confirm that Bloom filters were used by checking that trace2 statistics were logged correctly. They also confirm the cases where Bloom filters are not used: 1. Multiple path specs, 2. --walk-reflogs (see patch titled 'revision.c: use Bloom filters...' for details), 3. If the latest commit graph does not have Bloom filters Signed-off-by: Garima Singh Signed-off-by: Junio C Hamano diff --git a/t/helper/test-read-graph.c b/t/helper/test-read-graph.c index f8a4617..4223ff3 100644 --- a/t/helper/test-read-graph.c +++ b/t/helper/test-read-graph.c @@ -45,6 +45,10 @@ int cmd__read_graph(int argc, const char **argv) printf(" commit_metadata"); if (graph->chunk_extra_edges) printf(" extra_edges"); + if (graph->chunk_bloom_indexes) + printf(" bloom_indexes"); + if (graph->chunk_bloom_data) + printf(" bloom_data"); printf("\n"); UNLEAK(graph); diff --git a/t/t4216-log-bloom.sh b/t/t4216-log-bloom.sh new file mode 100755 index 0000000..c7011f3 --- /dev/null +++ b/t/t4216-log-bloom.sh @@ -0,0 +1,155 @@ +#!/bin/sh + +test_description='git log for a path with Bloom filters' +. ./test-lib.sh + +GIT_TEST_COMMIT_GRAPH=0 +GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=0 + +test_expect_success 'setup test - repo, commits, commit graph, log outputs' ' + git init && + mkdir A A/B A/B/C && + test_commit c1 A/file1 && + test_commit c2 A/B/file2 && + test_commit c3 A/B/C/file3 && + test_commit c4 A/file1 && + test_commit c5 A/B/file2 && + test_commit c6 A/B/C/file3 && + test_commit c7 A/file1 && + test_commit c8 A/B/file2 && + test_commit c9 A/B/C/file3 && + test_commit c10 file_to_be_deleted && + git checkout -b side HEAD~4 && + test_commit side-1 file4 && + git checkout master && + git merge side && + test_commit c11 file5 && + mv file5 file5_renamed && + git add file5_renamed && + git commit -m "rename" && + rm file_to_be_deleted && + git add . 
&& + git commit -m "file removed" && + git commit-graph write --reachable --changed-paths +' +graph_read_expect () { + NUM_CHUNKS=5 + cat >expect <<- EOF + header: 43475048 1 1 $NUM_CHUNKS 0 + num_commits: $1 + chunks: oid_fanout oid_lookup commit_metadata bloom_indexes bloom_data + EOF + test-tool read-graph >actual && + test_cmp expect actual +} + +test_expect_success 'commit-graph write wrote out the bloom chunks' ' + graph_read_expect 15 +' + +# Turn off any inherited trace2 settings for this test. +sane_unset GIT_TRACE2 GIT_TRACE2_PERF GIT_TRACE2_EVENT +sane_unset GIT_TRACE2_PERF_BRIEF +sane_unset GIT_TRACE2_CONFIG_PARAMS + +setup () { + rm "$TRASH_DIRECTORY/trace.perf" + git -c core.commitGraph=false log --pretty="format:%s" $1 >log_wo_bloom && + GIT_TRACE2_PERF="$TRASH_DIRECTORY/trace.perf" git -c core.commitGraph=true log --pretty="format:%s" $1 >log_w_bloom +} + +test_bloom_filters_used () { + log_args=$1 + bloom_trace_prefix="statistics:{\"filter_not_present\":0,\"zero_length_filter\":0,\"maybe\"" + setup "$log_args" && + grep -q "$bloom_trace_prefix" "$TRASH_DIRECTORY/trace.perf" && + test_cmp log_wo_bloom log_w_bloom && + test_path_is_file "$TRASH_DIRECTORY/trace.perf" +} + +test_bloom_filters_not_used () { + log_args=$1 + setup "$log_args" && + !(grep -q "statistics:{\"filter_not_present\":" "$TRASH_DIRECTORY/trace.perf") && + test_cmp log_wo_bloom log_w_bloom +} + +for path in A A/B A/B/C A/file1 A/B/file2 A/B/C/file3 file4 file5 file5_renamed file_to_be_deleted +do + for option in "" \ + "--all" \ + "--full-history" \ + "--full-history --simplify-merges" \ + "--simplify-merges" \ + "--simplify-by-decoration" \ + "--follow" \ + "--first-parent" \ + "--topo-order" \ + "--date-order" \ + "--author-date-order" \ + "--ancestry-path side..master" + do + test_expect_success "git log option: $option for path: $path" ' + test_bloom_filters_used "$option -- $path" + ' + done +done + +test_expect_success 'git log -- folder works with and without the trailing 
slash' ' + test_bloom_filters_used "-- A" && + test_bloom_filters_used "-- A/" +' + +test_expect_success 'git log for path that does not exist. ' ' + test_bloom_filters_used "-- path_does_not_exist" +' + +test_expect_success 'git log with --walk-reflogs does not use Bloom filters' ' + test_bloom_filters_not_used "--walk-reflogs -- A" +' + +test_expect_success 'git log -- multiple path specs does not use Bloom filters' ' + test_bloom_filters_not_used "-- file4 A/file1" +' + +test_expect_success 'git log with wildcard that resolves to a single path uses Bloom filters' ' + test_bloom_filters_used "-- *4" && + test_bloom_filters_used "-- *renamed" +' + +test_expect_success 'git log with wildcard that resolves to a multiple paths does not uses Bloom filters' ' + test_bloom_filters_not_used "-- *" && + test_bloom_filters_not_used "-- file*" +' + +test_expect_success 'setup - add commit-graph to the chain without Bloom filters' ' + test_commit c14 A/anotherFile2 && + test_commit c15 A/B/anotherFile2 && + test_commit c16 A/B/C/anotherFile2 && + GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=0 git commit-graph write --reachable --split && + test_line_count = 2 .git/objects/info/commit-graphs/commit-graph-chain +' + +test_expect_success 'Do not use Bloom filters if the latest graph does not have Bloom filters.' 
' + test_bloom_filters_not_used "-- A/B" +' + +test_expect_success 'setup - add commit-graph to the chain with Bloom filters' ' + test_commit c17 A/anotherFile3 && + git commit-graph write --reachable --changed-paths --split && + test_line_count = 3 .git/objects/info/commit-graphs/commit-graph-chain +' + +test_bloom_filters_used_when_some_filters_are_missing () { + log_args=$1 + bloom_trace_prefix="statistics:{\"filter_not_present\":3,\"zero_length_filter\":0,\"maybe\":8,\"definitely_not\":6" + setup "$log_args" && + grep -q "$bloom_trace_prefix" "$TRASH_DIRECTORY/trace.perf" && + test_cmp log_wo_bloom log_w_bloom +} + +test_expect_success 'Use Bloom filters if they exist in the latest but not all commit graphs in the chain.' ' + test_bloom_filters_used_when_some_filters_are_missing "-- A/B" +' + +test_done \ No newline at end of file -- cgit v0.10.2-6-g49f6 From d5b873c832d832e44523d1d2a9d29afe2b84c84f Mon Sep 17 00:00:00 2001 From: Garima Singh Date: Mon, 6 Apr 2020 16:59:55 +0000 Subject: commit-graph: add GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS test flag Add GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS test flag to the test setup suite in order to toggle writing Bloom filters when running any of the git tests. If set to true, we will compute and write Bloom filters every time a test calls `git commit-graph write`, as if the `--changed-paths` option was passed in. The test suite passes when GIT_TEST_COMMIT_GRAPH and GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS are enabled. 
Helped-by: Derrick Stolee Signed-off-by: Garima Singh Signed-off-by: Junio C Hamano diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c index cacb5d0..5900983 100644 --- a/builtin/commit-graph.c +++ b/builtin/commit-graph.c @@ -171,7 +171,8 @@ static int graph_write(int argc, const char **argv) flags |= COMMIT_GRAPH_WRITE_SPLIT; if (opts.progress) flags |= COMMIT_GRAPH_WRITE_PROGRESS; - if (opts.enable_changed_paths) + if (opts.enable_changed_paths || + git_env_bool(GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS, 0)) flags |= COMMIT_GRAPH_WRITE_BLOOM_FILTERS; read_replace_refs = 0; diff --git a/ci/run-build-and-tests.sh b/ci/run-build-and-tests.sh index 4df54c4..17e25aa 100755 --- a/ci/run-build-and-tests.sh +++ b/ci/run-build-and-tests.sh @@ -19,6 +19,7 @@ linux-gcc) export GIT_TEST_OE_SIZE=10 export GIT_TEST_OE_DELTA_SIZE=5 export GIT_TEST_COMMIT_GRAPH=1 + export GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=1 export GIT_TEST_MULTI_PACK_INDEX=1 export GIT_TEST_ADD_I_USE_BUILTIN=1 make test diff --git a/commit-graph.h b/commit-graph.h index 8e7a8e0..8655d06 100644 --- a/commit-graph.h +++ b/commit-graph.h @@ -9,6 +9,7 @@ #define GIT_TEST_COMMIT_GRAPH "GIT_TEST_COMMIT_GRAPH" #define GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD "GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD" +#define GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS "GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS" struct commit; struct bloom_filter_settings; diff --git a/t/README b/t/README index da5b24f..8ad9bc1 100644 --- a/t/README +++ b/t/README @@ -378,6 +378,11 @@ GIT_TEST_COMMIT_GRAPH=<boolean>, when true, forces the commit-graph to be written after every 'git commit' command, and overrides the 'core.commitGraph' setting to true. +GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=<boolean>, when true, forces +commit-graph write to compute and write changed path Bloom filters for +every 'git commit-graph write', as if the `--changed-paths` option was +passed in. 
+ GIT_TEST_FSMONITOR=$PWD/t7519/fsmonitor-all exercises the fsmonitor code path for utilizing a file system monitor to speed up detecting new or changed files. diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh index 9bf920a..18304a6 100755 --- a/t/t5318-commit-graph.sh +++ b/t/t5318-commit-graph.sh @@ -3,6 +3,8 @@ test_description='commit graph' . ./test-lib.sh +GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=0 + test_expect_success 'setup full repo' ' mkdir full && cd "$TRASH_DIRECTORY/full" && diff --git a/t/t5324-split-commit-graph.sh b/t/t5324-split-commit-graph.sh index 53b2e6b..d3f1f2c 100755 --- a/t/t5324-split-commit-graph.sh +++ b/t/t5324-split-commit-graph.sh @@ -4,6 +4,7 @@ test_description='split commit graph' . ./test-lib.sh GIT_TEST_COMMIT_GRAPH=0 +GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=0 test_expect_success 'setup repo' ' git init && -- cgit v0.10.2-6-g49f6 From caf388caa101be90b7ec43d7f78ca4e935fc0150 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Thu, 9 Apr 2020 13:00:11 +0000 Subject: bloom: ignore renames when computing changed paths The changed-path Bloom filters record an entry in the filter for every path that was changed. This includes every add and delete, regardless of whether a rename was detected. Detecting renames not only causes significant performance issues, but will also trigger downloading missing blobs in a partial clone. The simple fix is to disable rename detection when computing a changed-path Bloom filter. This should already be disabled by default, but it is good to explicitly enforce the intended behavior. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano diff --git a/bloom.c b/bloom.c index c5b461d..dd9bab9 100644 --- a/bloom.c +++ b/bloom.c @@ -189,6 +189,7 @@ struct bloom_filter *get_bloom_filter(struct repository *r, repo_diff_setup(r, &diffopt); diffopt.flags.recursive = 1; + diffopt.detect_rename = 0; diffopt.max_changes = max_changes; diff_setup_done(&diffopt); -- cgit v0.10.2-6-g49f6