summaryrefslogtreecommitdiff
path: root/reachable.c
diff options
context:
space:
mode:
author Taylor Blau <me@ttaylorr.com> 2023-06-07 22:58:17 (GMT)
committer Junio C Hamano <gitster@pobox.com> 2023-06-12 21:12:20 (GMT)
commit 4dc16e2cb05fa467c3ef507679ae625f785770cf (patch)
tree a0381d78d36569fd27c055d32ae9b01c20461b2f /reachable.c
parent 01e9ca4a40e04cceeed01da05bed182556daa005 (diff)
download git-4dc16e2cb05fa467c3ef507679ae625f785770cf.zip
git-4dc16e2cb05fa467c3ef507679ae625f785770cf.tar.gz
git-4dc16e2cb05fa467c3ef507679ae625f785770cf.tar.bz2
gc: introduce `gc.recentObjectsHook`
This patch introduces a new multi-valued configuration option, `gc.recentObjectsHook`, as a means to mark certain objects as recent (and thus exempt from garbage collection), regardless of their age. When performing a garbage collection operation on a repository with unreachable objects, Git makes its decision on what to do with those objects based on whether or not they are recent. Generally speaking, unreachable-but-recent objects stay in the repository, and older objects are discarded. However, we have no convenient way to keep certain precious, unreachable objects around in the repository, even if they have aged out and would otherwise be pruned. Our options today consist of: - Point references at the reachability tips of any objects you consider precious, which may be undesirable or infeasible if there are many such objects. - Track them via the reflog, which may be undesirable since the reflog's lifetime is limited to that of the reference it's tracking (and callers may want to keep those unreachable objects around for longer). - Extend the grace period, which may keep around other objects that the caller *does* want to discard. - Manually modify the mtimes of objects you want to keep. If those objects are already loose, this is easy enough to do (you can just enumerate and `touch -m` each one). But if they are packed, you will either end up modifying the mtimes of *all* objects in that pack, or be forced to write out a loose copy of that object, both of which may be undesirable. Even worse, if they are in a cruft pack, that requires modifying its `*.mtimes` file by hand, since there is no exposed plumbing for this. - Force the caller to construct the pack of objects they want to keep themselves, and then mark the pack as kept by adding a ".keep" file. This works, but is burdensome for the caller, and having extra packs is awkward as you roll forward your cruft pack.
This patch introduces a new option to the above list via the `gc.recentObjectsHook` configuration, which allows the caller to specify a program (or set of programs) whose output is a set of objects to treat as recent, regardless of their true age. The implementation is straightforward. Git enumerates recent objects via `add_unseen_recent_objects_to_traversal()`, which enumerates loose and packed objects, and eventually calls `add_recent_object()` on any objects for which `want_recent_object()`'s conditions are met. This patch modifies the recency condition from simply "is the mtime of this object more recent than the cutoff?" to "[...] or, is this object mentioned by at least one `gc.recentObjectsHook`?". Depending on whether or not we are generating a cruft pack, this allows the caller to do one of two things: - If generating a cruft pack, the caller is able to retain additional objects via the cruft pack, even if they would have otherwise been pruned due to their age. - If not generating a cruft pack, the caller is likewise able to retain additional objects as loose. A potential alternative here is to introduce a new mode to alter the contents of the reachable pack instead of the cruft one. One could imagine a new option to `pack-objects`, say `--extra-reachable-tips`, that does the same thing as above, adding the visited set of objects along the traversal to the pack. But this has the unfortunate side-effect of altering the reachability closure of that pack. If parts of the unreachable object graph mentioned by one or more of the "extra reachable tips" programs are not closed, then the resulting pack won't be either. This makes it impossible in the general case to write out reachability bitmaps for that pack, since closure is a requirement there.
Instead, keep these unreachable objects in the cruft pack (or set of unreachable, loose objects), to ensure that we can continue to have a pack containing just reachable objects, which is always safe to write a bitmap over. Helped-by: Jeff King <peff@peff.net> Signed-off-by: Taylor Blau <me@ttaylorr.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
Diffstat (limited to 'reachable.c')
-rw-r--r-- reachable.c 79
1 files changed, 76 insertions, 3 deletions
diff --git a/reachable.c b/reachable.c
index 7a42da5..60a7336 100644
--- a/reachable.c
+++ b/reachable.c
@@ -16,6 +16,8 @@
#include "object-store.h"
#include "pack-bitmap.h"
#include "pack-mtimes.h"
+#include "config.h"
+#include "run-command.h"
struct connectivity_progress {
struct progress *progress;
@@ -67,12 +69,75 @@ struct recent_data {
timestamp_t timestamp;
report_recent_object_fn *cb;
int ignore_in_core_kept_packs;
+
+ struct oidset extra_recent_oids;
+ int extra_recent_oids_loaded;
};
+static int run_one_gc_recent_objects_hook(struct oidset *set,
+ const char *args)
+{
+ struct child_process cmd = CHILD_PROCESS_INIT;
+ struct strbuf buf = STRBUF_INIT;
+ FILE *out;
+ int ret = 0;
+
+ cmd.use_shell = 1;
+ cmd.out = -1;
+
+ strvec_push(&cmd.args, args);
+
+ if (start_command(&cmd))
+ return -1;
+
+ out = xfdopen(cmd.out, "r");
+ while (strbuf_getline(&buf, out) != EOF) {
+ struct object_id oid;
+ const char *rest;
+
+ if (parse_oid_hex(buf.buf, &oid, &rest) || *rest) {
+ ret = error(_("invalid extra cruft tip: '%s'"), buf.buf);
+ break;
+ }
+
+ oidset_insert(set, &oid);
+ }
+
+ fclose(out);
+ ret |= finish_command(&cmd);
+
+ strbuf_release(&buf);
+ return ret;
+}
+
/*
 * Fill data->extra_recent_oids by running every program listed under the
 * multi-valued config key gc.recentobjectshook.
 *
 * The loaded flag is set before anything else, so the flag is set even
 * when the config lookup returns non-zero and no program is run.
 * Dies if any single hook returns non-zero.
 */
+static void load_gc_recent_objects(struct recent_data *data)
+{
+ const struct string_list *programs;
+ int ret = 0;
+ size_t i;
+
+ data->extra_recent_oids_loaded = 1;
+
/* A non-zero lookup result leaves the set untouched. NOTE(review): presumably this means the key is unset; confirm against config.h */
+ if (git_config_get_string_multi("gc.recentobjectshook", &programs))
+ return;
+
/* All hooks insert into the same oidset. */
+ for (i = 0; i < programs->nr; i++) {
+ ret = run_one_gc_recent_objects_hook(&data->extra_recent_oids,
+ programs->items[i].string);
+ if (ret)
+ die(_("unable to enumerate additional recent objects"));
+ }
+}
+
/*
 * Decide whether an object counts as recent.
 *
 * Before this change the answer was only whether mtime is greater than
 * data->timestamp (the removed line below). After it, an object whose
 * mtime does not exceed data->timestamp is still reported as recent when
 * its ID is contained in data->extra_recent_oids.
 *
 * The extra set is loaded on the first call that gets past the mtime
 * check, guarded by data->extra_recent_oids_loaded.
 */
static int obj_is_recent(const struct object_id *oid, timestamp_t mtime,
struct recent_data *data)
{
- return mtime > data->timestamp;
/* The mtime check comes first, so this path never triggers loading the hooks. */
+ if (mtime > data->timestamp)
+ return 1;
+
+ if (!data->extra_recent_oids_loaded)
+ load_gc_recent_objects(data);
+ return oidset_contains(&data->extra_recent_oids, oid);
}
static void add_recent_object(const struct object_id *oid,
@@ -199,16 +264,24 @@ int add_unseen_recent_objects_to_traversal(struct rev_info *revs,
data.cb = cb;
data.ignore_in_core_kept_packs = ignore_in_core_kept_packs;
+ oidset_init(&data.extra_recent_oids, 0);
+ data.extra_recent_oids_loaded = 0;
+
r = for_each_loose_object(add_recent_loose, &data,
FOR_EACH_OBJECT_LOCAL_ONLY);
if (r)
- return r;
+ goto done;
flags = FOR_EACH_OBJECT_LOCAL_ONLY | FOR_EACH_OBJECT_PACK_ORDER;
if (ignore_in_core_kept_packs)
flags |= FOR_EACH_OBJECT_SKIP_IN_CORE_KEPT_PACKS;
- return for_each_packed_object(add_recent_packed, &data, flags);
+ r = for_each_packed_object(add_recent_packed, &data, flags);
+
+done:
+ oidset_clear(&data.extra_recent_oids);
+
+ return r;
}
static int mark_object_seen(const struct object_id *oid,