From 77023ea3c3951be97286bc241ae88bc6c860e2b7 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 29 Jul 2016 00:06:09 -0400 Subject: t/perf: add tests for many-pack scenarios Git's pack storage does efficient (log n) lookups in a single packfile's index, but if we have multiple packfiles, we have to linearly search each for a given object. This patch introduces some timing tests for cases where we have a large number of packs, so that we can measure any improvements we make in the following patches. The main thing we want to time is object lookup. To do this, we measure "git rev-list --objects --all", which does a fairly large number of object lookups (essentially one per object in the repository). However, we also measure the time to do a full repack, which is interesting for two reasons. One is that in addition to the usual pack lookup, it has its own linear iteration over the list of packs. And two is that because it it is the tool one uses to go from an inefficient many-pack situation back to a single pack, we care about its performance not only at marginal numbers of packs, but at the extreme cases (e.g., if you somehow end up with 5,000 packs, it is the only way to get back to 1 pack, so we need to make sure it performs well). We measure the performance of each command in three scenarios: 1 pack, 50 packs, and 1,000 packs. The 1-pack case is a baseline; any optimizations we do to handle multiple packs cannot possibly perform better than this. The 50-pack case is as far as Git should generally allow your repository to go, if you have auto-gc enabled with the default settings. So this represents the maximum performance improvement we would expect under normal circumstances. The 1,000-pack case is hopefully rare, though I have seen it in the wild where automatic maintenance was broken for some time (and the repository continued to receive pushes). This represents cases where we care less about general performance, but want to make sure that a full repack command does not take excessively long. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano diff --git a/t/perf/p5303-many-packs.sh b/t/perf/p5303-many-packs.sh new file mode 100755 index 0000000..3779851 --- /dev/null +++ b/t/perf/p5303-many-packs.sh @@ -0,0 +1,87 @@ +#!/bin/sh + +test_description='performance with large numbers of packs' +. ./perf-lib.sh + +test_perf_large_repo + +# A real many-pack situation would probably come from having a lot of pushes +# over time. We don't know how big each push would be, but we can fake it by +# just walking the first-parent chain and having every 5 commits be their own +# "push". This isn't _entirely_ accurate, as real pushes would have some +# duplicate objects due to thin-pack fixing, but it's a reasonable +# approximation. +# +# And then all of the rest of the objects can go in a single packfile that +# represents the state before any of those pushes (actually, we'll generate +# that first because in such a setup it would be the oldest pack, and we sort +# the packs by reverse mtime inside git). +repack_into_n () { + rm -rf staging && + mkdir staging && + + git rev-list --first-parent HEAD | + sed -n '1~5p' | + head -n "$1" | + perl -e 'print reverse <>' \ + >pushes + + # create base packfile + head -n 1 pushes | + git pack-objects --delta-base-offset --revs staging/pack + + # and then incrementals between each pair of commits + last= && + while read rev + do + if test -n "$last"; then + { + echo "$rev" && + echo "^$last" + } | + git pack-objects --delta-base-offset --revs \ + staging/pack || return 1 + fi + last=$rev + done /dev/null + ' + + # This simulates the interesting part of the repack, which is the + # actual pack generation, without smudging the on-disk setup + # between trials. + test_perf "repack ($nr_packs)" ' + git pack-objects --keep-true-parents \ + --honor-pack-keep --non-empty --all \ + --reflog --indexed-objects --delta-base-offset \ + --stdout /dev/null + ' +done + +test_done -- cgit v0.10.2-6-g49f6 From 3157c880f6afab26af4f3e4eaceee68fc1b482a8 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 29 Jul 2016 00:06:48 -0400 Subject: sha1_file: drop free_pack_by_name The point of this function is to drop an entry from the "packed_git" cache that points to a file we might be overwriting, because our contents may not be the same (and hence the only caller was pack-objects as it moved a temporary packfile into place). In older versions of git, this could happen because the names of packfiles were derived from the set of objects they contained, not the actual bits on disk. But since 1190a1a (pack-objects: name pack files after trailer hash, 2013-12-05), the name reflects the actual bits on disk, and any two packfiles with the same name can be used interchangeably. Dropping this function not only saves a few lines of code, it makes the lifetime of "struct packed_git" much easier to reason about: namely, we now do not ever free these structs. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano diff --git a/cache.h b/cache.h index 2bf97cc..51c366c 100644 --- a/cache.h +++ b/cache.h @@ -1410,7 +1410,6 @@ extern unsigned char *use_pack(struct packed_git *, struct pack_window **, off_t extern void close_pack_windows(struct packed_git *); extern void close_all_packs(void); extern void unuse_pack(struct pack_window **); -extern void free_pack_by_name(const char *); extern void clear_delta_base_cache(void); extern struct packed_git *add_packed_git(const char *path, size_t path_len, int local); diff --git a/pack-write.c b/pack-write.c index 33293ce..ea0b788 100644 --- a/pack-write.c +++ b/pack-write.c @@ -354,7 +354,6 @@ void finish_tmp_packfile(struct strbuf *name_buffer, die_errno("unable to make temporary index file readable"); strbuf_addf(name_buffer, "%s.pack", sha1_to_hex(sha1)); - free_pack_by_name(name_buffer->buf); if (rename(pack_tmp_name, name_buffer->buf)) die_errno("unable to rename temporary pack file"); diff --git a/sha1_file.c b/sha1_file.c index d5e1121..e045d2f 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -891,36 +891,6 @@ void close_pack_index(struct packed_git *p) } } -/* - * This is used by git-repack in case a newly created pack happens to - * contain the same set of objects as an existing one. In that case - * the resulting file might be different even if its name would be the - * same. It is best to close any reference to the old pack before it is - * replaced on disk. Of course no index pointers or windows for given pack - * must subsist at this point. If ever objects from this pack are requested - * again, the new version of the pack will be reinitialized through - * reprepare_packed_git(). - */ -void free_pack_by_name(const char *pack_name) -{ - struct packed_git *p, **pp = &packed_git; - - while (*pp) { - p = *pp; - if (strcmp(pack_name, p->pack_name) == 0) { - clear_delta_base_cache(); - close_pack(p); - free(p->bad_object_sha1); - *pp = p->next; - if (last_found_pack == p) - last_found_pack = NULL; - free(p); - return; - } - pp = &p->next; - } -} - static unsigned int get_max_fd_limit(void) { #ifdef RLIMIT_NOFILE -- cgit v0.10.2-6-g49f6 From 002f206faf09d39abee80f916cabaf3ab342b74f Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 29 Jul 2016 00:06:59 -0400 Subject: add generic most-recently-used list There are a few places in Git that would benefit from a fast most-recently-used cache (e.g., the list of packs, which we search linearly but would like to order based on locality). This patch introduces a generic list that can be used to store arbitrary pointers in most-recently-used order. The implementation is just a doubly-linked list, where "marking" an item as used moves it to the front of the list. Insertion and marking are O(1), and iteration is O(n). There's no lookup support provided; if you need fast lookups, you are better off with a different data structure in the first place. There is also no deletion support. This would not be hard to do, but it's not necessary for handling pack structs, which are created and never removed. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano diff --git a/Makefile b/Makefile index c1e88df..3b82ce0 100644 --- a/Makefile +++ b/Makefile @@ -751,6 +751,7 @@ LIB_OBJS += merge.o LIB_OBJS += merge-blobs.o LIB_OBJS += merge-recursive.o LIB_OBJS += mergesort.o +LIB_OBJS += mru.o LIB_OBJS += name-hash.o LIB_OBJS += notes.o LIB_OBJS += notes-cache.o diff --git a/mru.c b/mru.c new file mode 100644 index 0000000..9dedae0 --- /dev/null +++ b/mru.c @@ -0,0 +1,50 @@ +#include "cache.h" +#include "mru.h" + +void mru_append(struct mru *mru, void *item) +{ + struct mru_entry *cur = xmalloc(sizeof(*cur)); + cur->item = item; + cur->prev = mru->tail; + cur->next = NULL; + + if (mru->tail) + mru->tail->next = cur; + else + mru->head = cur; + mru->tail = cur; +} + +void mru_mark(struct mru *mru, struct mru_entry *entry) +{ + /* If we're already at the front of the list, nothing to do */ + if (mru->head == entry) + return; + + /* Otherwise, remove us from our current slot... */ + if (entry->prev) + entry->prev->next = entry->next; + if (entry->next) + entry->next->prev = entry->prev; + else + mru->tail = entry->prev; + + /* And insert us at the beginning. */ + entry->prev = NULL; + entry->next = mru->head; + if (mru->head) + mru->head->prev = entry; + mru->head = entry; +} + +void mru_clear(struct mru *mru) +{ + struct mru_entry *p = mru->head; + + while (p) { + struct mru_entry *to_free = p; + p = p->next; + free(to_free); + } + mru->head = mru->tail = NULL; +} diff --git a/mru.h b/mru.h new file mode 100644 index 0000000..42e4aea --- /dev/null +++ b/mru.h @@ -0,0 +1,45 @@ +#ifndef MRU_H +#define MRU_H + +/** + * A simple most-recently-used cache, backed by a doubly-linked list. + * + * Usage is roughly: + * + * // Create a list. Zero-initialization is required. + * static struct mru cache; + * mru_append(&cache, item); + * ... + * + * // Iterate in MRU order. + * struct mru_entry *p; + * for (p = cache.head; p; p = p->next) { + * if (matches(p->item)) + * break; + * } + * + * // Mark an item as used, moving it to the front of the list. + * mru_mark(&cache, p); + * + * // Reset the list to empty, cleaning up all resources. + * mru_clear(&cache); + * + * Note that you SHOULD NOT call mru_mark() and then continue traversing the + * list; it reorders the marked item to the front of the list, and therefore + * you will begin traversing the whole list again. + */ + +struct mru_entry { + void *item; + struct mru_entry *prev, *next; +}; + +struct mru { + struct mru_entry *head, *tail; +}; + +void mru_append(struct mru *mru, void *item); +void mru_mark(struct mru *mru, struct mru_entry *entry); +void mru_clear(struct mru *mru); + +#endif /* MRU_H */ -- cgit v0.10.2-6-g49f6 From a73cdd21c4514bf0890d4c1ac756fe02262b1192 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 29 Jul 2016 00:09:46 -0400 Subject: find_pack_entry: replace last_found_pack with MRU cache Each pack has an index for looking up entries in O(log n) time, but if we have multiple packs, we have to scan through them linearly. This can produce a measurable overhead for some operations. We dealt with this long ago in f7c22cc (always start looking up objects in the last used pack first, 2007-05-30), which keeps what is essentially a 1-element most-recently-used cache. In theory, we should be able to do better by keeping a similar but longer cache, that is the same length as the pack-list itself. Since we now have a convenient generic MRU structure, we can plug it in and measure. Here are the numbers for running p5303 against linux.git: Test HEAD^ HEAD ------------------------------------------------------------------------ 5303.3: rev-list (1) 31.56(31.28+0.27) 31.30(31.08+0.20) -0.8% 5303.4: repack (1) 40.62(39.35+2.36) 40.60(39.27+2.44) -0.0% 5303.6: rev-list (50) 31.31(31.06+0.23) 31.23(31.00+0.22) -0.3% 5303.7: repack (50) 58.65(69.12+1.94) 58.27(68.64+2.05) -0.6% 5303.9: rev-list (1000) 38.74(38.40+0.33) 31.87(31.62+0.24) -17.7% 5303.10: repack (1000) 367.20(441.80+4.62) 342.00(414.04+3.72) -6.9% The main numbers of interest here are the rev-list ones (since that is exercising the normal object lookup code path). The single-pack case shouldn't improve at all; the 260ms speedup there is just part of the run-to-run noise (but it's important to note that we didn't make anything worse with the overhead of maintaining our cache). In the 50-pack case, we see similar results. There may be a slight improvement, but it's mostly within the noise. The 1000-pack case does show a big improvement, though. That carries over to the repack case, as well. Even though we haven't touched its pack-search loop yet, it does still do a lot of normal object lookups (e.g., for the internal revision walk), and so improves. As a point of reference, I also ran the 1000-pack test against a version of HEAD^ with the last_found_pack optimization disabled. It takes ~60s, so that gives an indication of how much even the single-element cache is helping. For comparison, here's a smaller repository, git.git: Test HEAD^ HEAD --------------------------------------------------------------------- 5303.3: rev-list (1) 1.56(1.54+0.01) 1.54(1.51+0.02) -1.3% 5303.4: repack (1) 1.84(1.80+0.10) 1.82(1.80+0.09) -1.1% 5303.6: rev-list (50) 1.58(1.55+0.02) 1.59(1.57+0.01) +0.6% 5303.7: repack (50) 2.50(3.18+0.04) 2.50(3.14+0.04) +0.0% 5303.9: rev-list (1000) 2.76(2.71+0.04) 2.24(2.21+0.02) -18.8% 5303.10: repack (1000) 13.21(19.56+0.25) 11.66(18.01+0.21) -11.7% You can see that the percentage improvement is similar. That's because the lookup we are optimizing is roughly O(nr_objects * nr_packs). Since the number of packs is constant in both tests, we'd expect the improvement to be linear in the number of objects. But the whole process is also linear in the number of objects, so the improvement is a constant factor. The exact improvement does also depend on the contents of the packs. In p5303, the extra packs all have 5 first-parent commits in them, which is a reasonable simulation of a pushed-to repository. But it also means that only 250 first-parent commits are in those packs (compared to almost 50,000 total in linux.git), and the rest are in the huge "base" pack. So once we start looking at history in taht big pack, that's where we'll find most everything, and even the 1-element cache gets close to 100% cache hits. You could almost certainly show better numbers with a more pathological case (e.g., distributing the objects more evenly across the packs). But that's simply not that realistic a scenario, so it makes more sense to focus on these numbers. The implementation itself is a straightforward application of the MRU code. We provide an MRU-ordered list of packs that shadows the packed_git list. This is easy to do because we only create and revise the pack list in one place. The "reprepare" code path actually drops the whole MRU and replaces it for simplicity. It would be more efficient to just add new entries, but there's not much point in optimizing here; repreparing happens rarely, and only after doing a lot of other expensive work. The key things to keep optimized are traversal (which is just a normal linked list, albeit with one extra level of indirection over the regular packed_git list), and marking (which is a constant number of pointer assignments, though slightly more than the old last_found_pack was; it doesn't seem to create a measurable slowdown, though). Signed-off-by: Jeff King Signed-off-by: Junio C Hamano diff --git a/cache.h b/cache.h index 51c366c..40671d1 100644 --- a/cache.h +++ b/cache.h @@ -1371,6 +1371,13 @@ extern struct packed_git { char pack_name[FLEX_ARRAY]; /* more */ } *packed_git; +/* + * A most-recently-used ordered version of the packed_git list, which can + * be iterated instead of packed_git (and marked via mru_mark). + */ +struct mru; +extern struct mru *packed_git_mru; + struct pack_entry { off_t offset; unsigned char sha1[20]; diff --git a/sha1_file.c b/sha1_file.c index e045d2f..4eb3318 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -23,6 +23,7 @@ #include "bulk-checkin.h" #include "streaming.h" #include "dir.h" +#include "mru.h" #ifndef O_NOATIME #if defined(__linux__) && (defined(__i386__) || defined(__PPC__)) @@ -59,14 +60,6 @@ static struct cached_object empty_tree = { 0 }; -/* - * A pointer to the last packed_git in which an object was found. - * When an object is sought, we look in this packfile first, because - * objects that are looked up at similar times are often in the same - * packfile as one another. - */ -static struct packed_git *last_found_pack; - static struct cached_object *find_cached_object(const unsigned char *sha1) { int i; @@ -522,6 +515,9 @@ static size_t peak_pack_mapped; static size_t pack_mapped; struct packed_git *packed_git; +static struct mru packed_git_mru_storage; +struct mru *packed_git_mru = &packed_git_mru_storage; + void pack_report(void) { fprintf(stderr, @@ -1355,6 +1351,15 @@ static void rearrange_packed_git(void) free(ary); } +static void prepare_packed_git_mru(void) +{ + struct packed_git *p; + + mru_clear(packed_git_mru); + for (p = packed_git; p; p = p->next) + mru_append(packed_git_mru, p); +} + static int prepare_packed_git_run_once = 0; void prepare_packed_git(void) { @@ -1370,6 +1375,7 @@ void prepare_packed_git(void) alt->name[-1] = '/'; } rearrange_packed_git(); + prepare_packed_git_mru(); prepare_packed_git_run_once = 1; } @@ -2574,21 +2580,15 @@ static int fill_pack_entry(const unsigned char *sha1, */ static int find_pack_entry(const unsigned char *sha1, struct pack_entry *e) { - struct packed_git *p; + struct mru_entry *p; prepare_packed_git(); if (!packed_git) return 0; - if (last_found_pack && fill_pack_entry(sha1, e, last_found_pack)) - return 1; - - for (p = packed_git; p; p = p->next) { - if (p == last_found_pack) - continue; /* we already checked this one */ - - if (fill_pack_entry(sha1, e, p)) { - last_found_pack = p; + for (p = packed_git_mru->head; p; p = p->next) { + if (fill_pack_entry(sha1, e, p->item)) { + mru_mark(packed_git_mru, p); return 1; } } -- cgit v0.10.2-6-g49f6 From cd3799679533328dcf262549c9d6466b07628df1 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 29 Jul 2016 00:10:31 -0400 Subject: pack-objects: break out of want_object loop early When pack-objects collects the list of objects to pack (either from stdin, or via its internal rev-list), it filters each one through want_object_in_pack(). This function loops through each existing packfile, looking for the object. When we find it, we mark the pack/offset combo for later use. However, we can't just return "yes, we want it" at that point. If --honor-pack-keep is in effect, we must keep looking to find it in _all_ packs, to make sure none of them has a .keep. Likewise, if --local is in effect, we must make sure it is not present in any non-local pack. As a result, the sum effort of these calls is effectively O(nr_objects * nr_packs). In an ordinary repository, we have only a handful of packs, and this doesn't make a big difference. But in pathological cases, it can slow the counting phase to a crawl. This patch notices the case that we have neither "--local" nor "--honor-pack-keep" in effect and breaks out of the loop early, after finding the first instance. Note that our worst case is still "objects * packs" (i.e., we might find each object in the last pack we look in), but in practice we will often break out early. On an "average" repo, my git.git with 8 packs, this shows a modest 2% (a few dozen milliseconds) improvement in the counting-objects phase of "git pack-objects --all Signed-off-by: Junio C Hamano diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index a2f8cfd..8ad11f2 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -977,6 +977,22 @@ static int want_object_in_pack(const unsigned char *sha1, return 1; if (incremental) return 0; + + /* + * When asked to do --local (do not include an + * object that appears in a pack we borrow + * from elsewhere) or --honor-pack-keep (do not + * include an object that appears in a pack marked + * with .keep), we need to make sure no copy of this + * object come from in _any_ pack that causes us to + * omit it, and need to complete this loop. When + * neither option is in effect, we know the object + * we just found is going to be packed, so break + * out of the loop to return 1 now. + */ + if (!ignore_packed_keep && !local) + break; + if (local && !p->pack_local) return 0; if (ignore_packed_keep && p->pack_local && p->pack_keep) -- cgit v0.10.2-6-g49f6 From 56dfeb62638760fa78a442a97f19abf1af374d29 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 29 Jul 2016 00:11:31 -0400 Subject: pack-objects: compute local/ignore_pack_keep early In want_object_in_pack(), we can exit early from our loop if neither "local" nor "ignore_pack_keep" are set. If they are, however, we must examine each pack to see if it has the object and is non-local or has a ".keep". It's quite common for there to be no non-local or .keep packs at all, in which case we know ahead of time that looking further will be pointless. We can pre-compute this by simply iterating over the list of packs ahead of time, and dropping the flags if there are no packs that could match. Another similar strategy would be to modify the loop in want_object_in_pack() to notice that we have already found the object once, and that we are looping only to check for "local" and "keep" attributes. If a pack has neither of those, we can skip the call to find_pack_entry_one(), which is the expensive part of the loop. This has two advantages: - it isn't all-or-nothing; we still get some improvement when there's a small number of kept or non-local packs, and a large number of non-kept local packs - it eliminates any possible race where we add new non-local or kept packs after our initial scan. In practice, I don't think this race matters; we already cache the packed_git information, so somebody who adds a new pack or .keep file after we've started will not be noticed at all, unless we happen to need to call reprepare_packed_git() because a lookup fails. In other words, we're already racy, and the race is not a big deal (losing the race means we might include an object in the pack that would not otherwise be, which is an acceptable outcome). However, it also has a disadvantage: we still loop over the rest of the packs for each object to check their flags. This is much less expensive than doing the object lookup, but still not free. So if we wanted to implement that strategy to cover the non-all-or-nothing cases, we could do so in addition to this one (so you get the most speedup in the all-or-nothing case, and the best we can do in the other cases). But given that the all-or-nothing case is likely the most common, it is probably not worth the trouble, and we can revisit this later if evidence points otherwise. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 8ad11f2..c4c2a3c 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -46,6 +46,7 @@ static int keep_unreachable, unpack_unreachable, include_tag; static unsigned long unpack_unreachable_expiration; static int pack_loose_unreachable; static int local; +static int have_non_local_packs; static int incremental; static int ignore_packed_keep; static int allow_ofs_delta; @@ -990,7 +991,8 @@ static int want_object_in_pack(const unsigned char *sha1, * we just found is going to be packed, so break * out of the loop to return 1 now. */ - if (!ignore_packed_keep && !local) + if (!ignore_packed_keep && + (!local || !have_non_local_packs)) break; if (local && !p->pack_local) @@ -2799,6 +2801,28 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) progress = 2; prepare_packed_git(); + if (ignore_packed_keep) { + struct packed_git *p; + for (p = packed_git; p; p = p->next) + if (p->pack_local && p->pack_keep) + break; + if (!p) /* no keep-able packs found */ + ignore_packed_keep = 0; + } + if (local) { + /* + * unlike ignore_packed_keep above, we do not want to + * unset "local" based on looking at packs, as it + * also covers non-local objects + */ + struct packed_git *p; + for (p = packed_git; p; p = p->next) { + if (!p->pack_local) { + have_non_local_packs = 1; + break; + } + } + } if (progress) progress_state = start_progress(_("Counting objects"), 0); -- cgit v0.10.2-6-g49f6