summaryrefslogtreecommitdiff
path: root/builtin
diff options
context:
space:
mode:
authorJeff King <peff@peff.net>2021-02-09 10:53:50 (GMT)
committerJunio C Hamano <gitster@pobox.com>2021-02-11 17:57:55 (GMT)
commit16950f8384afa5106b1ce57da07a964c2aaef3f7 (patch)
tree608e6e56eef19255cfa4637cd54b49af26f4f3e1 /builtin
parent3803a3a0993045605d7f3db363188ce377e917c8 (diff)
downloadgit-16950f8384afa5106b1ce57da07a964c2aaef3f7.zip
git-16950f8384afa5106b1ce57da07a964c2aaef3f7.tar.gz
git-16950f8384afa5106b1ce57da07a964c2aaef3f7.tar.bz2
rev-list: add --disk-usage option for calculating disk usage
It can sometimes be useful to see which refs are contributing to the overall repository size (e.g., does some branch have a bunch of objects not found elsewhere in history, which indicates that deleting it would shrink the size of a clone). You can find that out by generating a list of objects, getting their sizes from cat-file, and then summing them, like: git rev-list --objects --no-object-names main..branch git cat-file --batch-check='%(objectsize:disk)' | perl -lne '$total += $_; END { print $total }' Though note that the caveats from git-cat-file(1) apply here. We "blame" base objects more than their deltas, even though the relationship could easily be flipped. Still, it can be a useful rough measure. But one problem is that it's slow to run. Teaching rev-list to sum up the sizes can be much faster for two reasons: 1. It skips all of the piping of object names and sizes. 2. If bitmaps are in use, for objects that are in the bitmapped packfile we can skip the oid_object_info() lookup entirely, and just ask the revindex for the on-disk size. This patch implements a --disk-usage option which produces the same answer in a fraction of the time. Here are some timings using a clone of torvalds/linux: [rev-list piped to cat-file, no bitmaps] $ time git rev-list --objects --no-object-names --all | git cat-file --buffer --batch-check='%(objectsize:disk)' | perl -lne '$total += $_; END { print $total }' 1459938510 real 0m29.635s user 0m38.003s sys 0m1.093s [internal, no bitmaps] $ time git rev-list --disk-usage --objects --all 1459938510 real 0m31.262s user 0m30.885s sys 0m0.376s Even though the wall-clock time is slightly worse due to parallelism, notice the CPU savings between the two. We saved 21% of the CPU just by avoiding the pipes. But the real win is with bitmaps. If we use them without the new option: [rev-list piped to cat-file, bitmaps] $ time git rev-list --objects --no-object-names --all --use-bitmap-index | git cat-file --batch-check='%(objectsize:disk)' | perl -lne '$total += $_; END { print $total }' 1459938510 real 0m6.244s user 0m8.452s sys 0m0.311s then we're faster to generate the list of objects, but we still spend a lot of time piping and looking things up. But if we do both together: [internal, bitmaps] $ time git rev-list --disk-usage --objects --all --use-bitmap-index 1459938510 real 0m0.219s user 0m0.169s sys 0m0.049s then we get the same answer much faster. For "--all", that answer will correspond closely to "du objects/pack", of course. But we're actually checking reachability here, so we're still fast when we ask for more interesting things: $ time git rev-list --disk-usage --use-bitmap-index v5.0..v5.10 374798628 real 0m0.429s user 0m0.356s sys 0m0.072s Signed-off-by: Jeff King <peff@peff.net> Signed-off-by: Junio C Hamano <gitster@pobox.com>
Diffstat (limited to 'builtin')
-rw-r--r--builtin/rev-list.c46
1 files changed, 46 insertions, 0 deletions
diff --git a/builtin/rev-list.c b/builtin/rev-list.c
index 25c6c3b..b4d8ea0 100644
--- a/builtin/rev-list.c
+++ b/builtin/rev-list.c
@@ -80,6 +80,19 @@ static int arg_show_object_names = 1;
#define DEFAULT_OIDSET_SIZE (16*1024)
+static int show_disk_usage;
+static off_t total_disk_usage;
+
+static off_t get_object_disk_usage(struct object *obj)
+{
+ off_t size;
+ struct object_info oi = OBJECT_INFO_INIT;
+ oi.disk_sizep = &size;
+ if (oid_object_info_extended(the_repository, &obj->oid, &oi, 0) < 0)
+ die(_("unable to get disk usage of %s"), oid_to_hex(&obj->oid));
+ return size;
+}
+
static void finish_commit(struct commit *commit);
static void show_commit(struct commit *commit, void *data)
{
@@ -88,6 +101,9 @@ static void show_commit(struct commit *commit, void *data)
display_progress(progress, ++progress_counter);
+ if (show_disk_usage)
+ total_disk_usage += get_object_disk_usage(&commit->object);
+
if (info->flags & REV_LIST_QUIET) {
finish_commit(commit);
return;
@@ -258,6 +274,8 @@ static void show_object(struct object *obj, const char *name, void *cb_data)
if (finish_object(obj, name, cb_data))
return;
display_progress(progress, ++progress_counter);
+ if (show_disk_usage)
+ total_disk_usage += get_object_disk_usage(obj);
if (info->flags & REV_LIST_QUIET)
return;
@@ -452,6 +470,23 @@ static int try_bitmap_traversal(struct rev_info *revs,
return 0;
}
+static int try_bitmap_disk_usage(struct rev_info *revs,
+ struct list_objects_filter_options *filter)
+{
+ struct bitmap_index *bitmap_git;
+
+ if (!show_disk_usage)
+ return -1;
+
+ bitmap_git = prepare_bitmap_walk(revs, filter);
+ if (!bitmap_git)
+ return -1;
+
+ printf("%"PRIuMAX"\n",
+ (uintmax_t)get_disk_usage_from_bitmap(bitmap_git, revs));
+ return 0;
+}
+
int cmd_rev_list(int argc, const char **argv, const char *prefix)
{
struct rev_info revs;
@@ -584,6 +619,12 @@ int cmd_rev_list(int argc, const char **argv, const char *prefix)
continue;
}
+ if (!strcmp(arg, "--disk-usage")) {
+ show_disk_usage = 1;
+ info.flags |= REV_LIST_QUIET;
+ continue;
+ }
+
usage(rev_list_usage);
}
@@ -626,6 +667,8 @@ int cmd_rev_list(int argc, const char **argv, const char *prefix)
if (use_bitmap_index) {
if (!try_bitmap_count(&revs, &filter_options))
return 0;
+ if (!try_bitmap_disk_usage(&revs, &filter_options))
+ return 0;
if (!try_bitmap_traversal(&revs, &filter_options))
return 0;
}
@@ -690,5 +733,8 @@ int cmd_rev_list(int argc, const char **argv, const char *prefix)
printf("%d\n", revs.count_left + revs.count_right);
}
+ if (show_disk_usage)
+ printf("%"PRIuMAX"\n", (uintmax_t)total_disk_usage);
+
return 0;
}