From 96168827802b08c4adf2a036594ab235b2c5630f Mon Sep 17 00:00:00 2001
From: Matheus Tavares <matheus.bernardino@usp.br>
Date: Tue, 4 May 2021 13:27:28 -0300
Subject: make_transient_cache_entry(): optionally alloc from mem_pool

Allow make_transient_cache_entry() to optionally receive a mem_pool
struct in which it should allocate the entry. This will be used in the
following patch, to store some transient entries which should persist
until parallel checkout finishes.

Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>

diff --git a/builtin/checkout--worker.c b/builtin/checkout--worker.c
index 31e0de2..289a9b8 100644
--- a/builtin/checkout--worker.c
+++ b/builtin/checkout--worker.c
@@ -39,7 +39,7 @@ static void packet_to_pc_item(const char *buffer, int len,
 	}
 
 	memset(pc_item, 0, sizeof(*pc_item));
-	pc_item->ce = make_empty_transient_cache_entry(fixed_portion->name_len);
+	pc_item->ce = make_empty_transient_cache_entry(fixed_portion->name_len, NULL);
 	pc_item->ce->ce_namelen = fixed_portion->name_len;
 	pc_item->ce->ce_mode = fixed_portion->ce_mode;
 	memcpy(pc_item->ce->name, variant, pc_item->ce->ce_namelen);
diff --git a/builtin/checkout.c b/builtin/checkout.c
index 4c696ef..db667d0 100644
--- a/builtin/checkout.c
+++ b/builtin/checkout.c
@@ -291,7 +291,7 @@ static int checkout_merged(int pos, const struct checkout *state, int *nr_checko
 	if (write_object_file(result_buf.ptr, result_buf.size, blob_type, &oid))
 		die(_("Unable to add merge result for '%s'"), path);
 	free(result_buf.ptr);
-	ce = make_transient_cache_entry(mode, &oid, path, 2);
+	ce = make_transient_cache_entry(mode, &oid, path, 2, NULL);
 	if (!ce)
 		die(_("make_cache_entry failed for path '%s'"), path);
 	status = checkout_entry(ce, state, NULL, nr_checkouts);
diff --git a/builtin/difftool.c b/builtin/difftool.c
index ef25729..afacbcd 100644
--- a/builtin/difftool.c
+++ b/builtin/difftool.c
@@ -323,7 +323,7 @@ static int checkout_path(unsigned mode, struct object_id *oid,
 	struct cache_entry *ce;
 	int ret;
 
-	ce = make_transient_cache_entry(mode, oid, path, 0);
+	ce = make_transient_cache_entry(mode, oid, path, 0, NULL);
 	ret = checkout_entry(ce, state, NULL, NULL);
 
 	discard_cache_entry(ce);
diff --git a/cache.h b/cache.h
index 148d9ab..d6dab6c 100644
--- a/cache.h
+++ b/cache.h
@@ -356,16 +356,20 @@ struct cache_entry *make_empty_cache_entry(struct index_state *istate,
 					   size_t name_len);
 
 /*
- * Create a cache_entry that is not intended to be added to an index.
- * Caller is responsible for discarding the cache_entry
- * with `discard_cache_entry`.
+ * Create a cache_entry that is not intended to be added to an index. If
+ * `ce_mem_pool` is not NULL, the entry is allocated within the given memory
+ * pool. Caller is responsible for discarding "loose" entries with
+ * `discard_cache_entry()` and the memory pool with
+ * `mem_pool_discard(ce_mem_pool, should_validate_cache_entries())`.
  */
 struct cache_entry *make_transient_cache_entry(unsigned int mode,
 					       const struct object_id *oid,
 					       const char *path,
-					       int stage);
+					       int stage,
+					       struct mem_pool *ce_mem_pool);
 
-struct cache_entry *make_empty_transient_cache_entry(size_t name_len);
+struct cache_entry *make_empty_transient_cache_entry(size_t len,
+						     struct mem_pool *ce_mem_pool);
 
 /*
  * Discard cache entry.
diff --git a/read-cache.c b/read-cache.c
index 5a907af..b46be4a 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -813,8 +813,11 @@ struct cache_entry *make_empty_cache_entry(struct index_state *istate, size_t le
 	return mem_pool__ce_calloc(find_mem_pool(istate), len);
 }
 
-struct cache_entry *make_empty_transient_cache_entry(size_t len)
+struct cache_entry *make_empty_transient_cache_entry(size_t len,
+						     struct mem_pool *ce_mem_pool)
 {
+	if (ce_mem_pool)
+		return mem_pool__ce_calloc(ce_mem_pool, len);
 	return xcalloc(1, cache_entry_size(len));
 }
 
@@ -848,8 +851,11 @@ struct cache_entry *make_cache_entry(struct index_state *istate,
 	return ret;
 }
 
-struct cache_entry *make_transient_cache_entry(unsigned int mode, const struct object_id *oid,
-					       const char *path, int stage)
+struct cache_entry *make_transient_cache_entry(unsigned int mode,
+					       const struct object_id *oid,
+					       const char *path,
+					       int stage,
+					       struct mem_pool *ce_mem_pool)
 {
 	struct cache_entry *ce;
 	int len;
@@ -860,7 +866,7 @@ struct cache_entry *make_transient_cache_entry(unsigned int mode, const struct o
 	}
 
 	len = strlen(path);
-	ce = make_empty_transient_cache_entry(len);
+	ce = make_empty_transient_cache_entry(len, ce_mem_pool);
 
 	oidcpy(&ce->oid, oid);
 	memcpy(ce->name, path, len);
diff --git a/unpack-trees.c b/unpack-trees.c
index 4b77e52..fa5b7ab 100644
--- a/unpack-trees.c
+++ b/unpack-trees.c
@@ -1034,7 +1034,7 @@ static struct cache_entry *create_ce_entry(const struct traverse_info *info,
 	size_t len = traverse_path_len(info, tree_entry_len(n));
 	struct cache_entry *ce =
 		is_transient ?
-		make_empty_transient_cache_entry(len) :
+		make_empty_transient_cache_entry(len, NULL) :
 		make_empty_cache_entry(istate, len);
 
 	ce->ce_mode = create_ce_mode(n->mode);
-- 
cgit v0.10.2-6-g49f6


From 60539506329d60c80610ccbbfe1d18c746c4ae52 Mon Sep 17 00:00:00 2001
From: Matheus Tavares <matheus.bernardino@usp.br>
Date: Tue, 4 May 2021 13:27:29 -0300
Subject: builtin/checkout.c: complete parallel checkout support

Pathspec-limited checkouts (like `git checkout *.txt`) are performed by
a code path that doesn't yet support parallel checkout because it calls
checkout_entry() directly, instead of unpack_trees(). Let's add parallel
checkout support for this code path too.

The transient cache entries allocated in checkout_merged() are now
allocated in a mem_pool which is only discarded after parallel checkout
finishes. This is done because the entries need to be valid when
run_parallel_checkout() is called.

Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>

diff --git a/builtin/checkout.c b/builtin/checkout.c
index db667d0..99384d5 100644
--- a/builtin/checkout.c
+++ b/builtin/checkout.c
@@ -27,6 +27,7 @@
 #include "wt-status.h"
 #include "xdiff-interface.h"
 #include "entry.h"
+#include "parallel-checkout.h"
 
 static const char * const checkout_usage[] = {
 	N_("git checkout [<options>] <branch>"),
@@ -230,7 +231,8 @@ static int checkout_stage(int stage, const struct cache_entry *ce, int pos,
 		return error(_("path '%s' does not have their version"), ce->name);
 }
 
-static int checkout_merged(int pos, const struct checkout *state, int *nr_checkouts)
+static int checkout_merged(int pos, const struct checkout *state,
+			   int *nr_checkouts, struct mem_pool *ce_mem_pool)
 {
 	struct cache_entry *ce = active_cache[pos];
 	const char *path = ce->name;
@@ -291,11 +293,10 @@ static int checkout_merged(int pos, const struct checkout *state, int *nr_checko
 	if (write_object_file(result_buf.ptr, result_buf.size, blob_type, &oid))
 		die(_("Unable to add merge result for '%s'"), path);
 	free(result_buf.ptr);
-	ce = make_transient_cache_entry(mode, &oid, path, 2, NULL);
+	ce = make_transient_cache_entry(mode, &oid, path, 2, ce_mem_pool);
 	if (!ce)
 		die(_("make_cache_entry failed for path '%s'"), path);
 	status = checkout_entry(ce, state, NULL, nr_checkouts);
-	discard_cache_entry(ce);
 	return status;
 }
 
@@ -359,16 +360,23 @@ static int checkout_worktree(const struct checkout_opts *opts,
 	int nr_checkouts = 0, nr_unmerged = 0;
 	int errs = 0;
 	int pos;
+	int pc_workers, pc_threshold;
+	struct mem_pool ce_mem_pool;
 
 	state.force = 1;
 	state.refresh_cache = 1;
 	state.istate = &the_index;
 
+	mem_pool_init(&ce_mem_pool, 0);
+	get_parallel_checkout_configs(&pc_workers, &pc_threshold);
 	init_checkout_metadata(&state.meta, info->refname,
 			       info->commit ? &info->commit->object.oid : &info->oid,
 			       NULL);
 
 	enable_delayed_checkout(&state);
+	if (pc_workers > 1)
+		init_parallel_checkout();
+
 	for (pos = 0; pos < active_nr; pos++) {
 		struct cache_entry *ce = active_cache[pos];
 		if (ce->ce_flags & CE_MATCHED) {
@@ -384,10 +392,15 @@ static int checkout_worktree(const struct checkout_opts *opts,
 						       &nr_checkouts, opts->overlay_mode);
 			else if (opts->merge)
 				errs |= checkout_merged(pos, &state,
-							&nr_unmerged);
+							&nr_unmerged,
+							&ce_mem_pool);
 			pos = skip_same_name(ce, pos) - 1;
 		}
 	}
+	if (pc_workers > 1)
+		errs |= run_parallel_checkout(&state, pc_workers, pc_threshold,
+					      NULL, NULL);
+	mem_pool_discard(&ce_mem_pool, should_validate_cache_entries());
 	remove_marked_cache_entries(&the_index, 1);
 	remove_scheduled_dirs();
 	errs |= finish_delayed_checkout(&state, &nr_checkouts);
-- 
cgit v0.10.2-6-g49f6


From 70b052b209e53eb1f81bc925ea6aac0491228af0 Mon Sep 17 00:00:00 2001
From: Matheus Tavares <matheus.bernardino@usp.br>
Date: Tue, 4 May 2021 13:27:30 -0300
Subject: checkout-index: add parallel checkout support

Allow checkout-index to use the parallel checkout framework, honoring
the checkout.workers configuration.

There are two code paths in checkout-index which call
`checkout_entry()`, and thus, can make use of parallel checkout:
`checkout_file()`, which is used to write paths explicitly given at the
command line; and `checkout_all()`, which is used to write all paths in
the index, when the `--all` option is given.

In both operation modes, checkout-index doesn't abort immediately on a
`checkout_entry()` failure. Instead, it tries to check out all remaining
paths before exiting with a non-zero exit code. To keep this behavior
when parallel checkout is being used, we must allow
`run_parallel_checkout()` to try writing the queued entries before we
exit, even if we already got an error code from a previous
`checkout_entry()` call.

However, `checkout_all()` doesn't return on errors, it calls `exit()`
with code 128. We could make it call `run_parallel_checkout()` before
exiting, but it makes the code easier to follow if we unify the exit
path for both checkout-index modes at `cmd_checkout_index()`, and let
this function take care of the interactions with the parallel checkout
API. So let's do that.

With this change, we also have to consider whether we want to keep using
128 as the error code for `git checkout-index --all`, while we use 1 for
`git checkout-index <path>` (even when the actual error is the same).
Since there is not much value in having code 128 only for `--all`, and
there is no mention of it in the docs (so it's unlikely that changing
it will break any existing script), let's make both modes exit with code
1 on `checkout_entry()` errors.

Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>

diff --git a/builtin/checkout-index.c b/builtin/checkout-index.c
index c0bf4ac..e8a82ea 100644
--- a/builtin/checkout-index.c
+++ b/builtin/checkout-index.c
@@ -12,6 +12,7 @@
 #include "cache-tree.h"
 #include "parse-options.h"
 #include "entry.h"
+#include "parallel-checkout.h"
 
 #define CHECKOUT_ALL 4
 static int nul_term_line;
@@ -115,7 +116,7 @@ static int checkout_file(const char *name, const char *prefix)
 	return -1;
 }
 
-static void checkout_all(const char *prefix, int prefix_length)
+static int checkout_all(const char *prefix, int prefix_length)
 {
 	int i, errs = 0;
 	struct cache_entry *last_ce = NULL;
@@ -142,11 +143,7 @@ static void checkout_all(const char *prefix, int prefix_length)
 	}
 	if (last_ce && to_tempfile)
 		write_tempfile_record(last_ce->name, prefix);
-	if (errs)
-		/* we have already done our error reporting.
-		 * exit with the same code as die().
-		 */
-		exit(128);
+	return !!errs;
 }
 
 static const char * const builtin_checkout_index_usage[] = {
@@ -182,6 +179,7 @@ int cmd_checkout_index(int argc, const char **argv, const char *prefix)
 	int force = 0, quiet = 0, not_new = 0;
 	int index_opt = 0;
 	int err = 0;
+	int pc_workers, pc_threshold;
 	struct option builtin_checkout_index_options[] = {
 		OPT_BOOL('a', "all", &all,
 			N_("check out all files in the index")),
@@ -236,6 +234,10 @@ int cmd_checkout_index(int argc, const char **argv, const char *prefix)
 		hold_locked_index(&lock_file, LOCK_DIE_ON_ERROR);
 	}
 
+	get_parallel_checkout_configs(&pc_workers, &pc_threshold);
+	if (pc_workers > 1)
+		init_parallel_checkout();
+
 	/* Check out named files first */
 	for (i = 0; i < argc; i++) {
 		const char *arg = argv[i];
@@ -275,12 +277,16 @@ int cmd_checkout_index(int argc, const char **argv, const char *prefix)
 		strbuf_release(&buf);
 	}
 
+	if (all)
+		err |= checkout_all(prefix, prefix_length);
+
+	if (pc_workers > 1)
+		err |= run_parallel_checkout(&state, pc_workers, pc_threshold,
+					     NULL, NULL);
+
 	if (err)
 		return 1;
 
-	if (all)
-		checkout_all(prefix, prefix_length);
-
 	if (is_lock_file_locked(&lock_file) &&
 	    write_locked_index(&the_index, &lock_file, COMMIT_LOCK))
 		die("Unable to write new index file");
-- 
cgit v0.10.2-6-g49f6


From d0e5d357000f44af5ac80f4bab5809d0d8b196d3 Mon Sep 17 00:00:00 2001
From: Matheus Tavares <matheus.bernardino@usp.br>
Date: Tue, 4 May 2021 13:27:31 -0300
Subject: parallel-checkout: add tests for basic operations

Add tests to populate the working tree during clone and checkout using
sequential and parallel mode, to confirm that they produce identical
results. Also test basic checkout mechanics, such as checking for
symlinks in the leading directories and adherence to --force.

Note: some helper functions are added to a common lib file which is only
included by t2080 for now. But they will also be used by other
parallel-checkout tests in the following patches.

Co-authored-by: Jeff Hostetler <jeffhost@microsoft.com>
Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>

diff --git a/t/lib-parallel-checkout.sh b/t/lib-parallel-checkout.sh
new file mode 100644
index 0000000..f60b22e
--- /dev/null
+++ b/t/lib-parallel-checkout.sh
@@ -0,0 +1,42 @@
+# Helpers for tests invoking parallel-checkout
+
+set_checkout_config () {
+	if test $# -ne 2
+	then
+		BUG "usage: set_checkout_config <workers> <threshold>"
+	fi &&
+
+	test_config_global checkout.workers $1 &&
+	test_config_global checkout.thresholdForParallelism $2
+}
+
+# Run "${@:2}" and check that $1 checkout workers were used
+test_checkout_workers () {
+	if test $# -lt 2
+	then
+		BUG "too few arguments to test_checkout_workers"
+	fi &&
+
+	local expected_workers=$1 &&
+	shift &&
+
+	local trace_file=trace-test-checkout-workers &&
+	rm -f "$trace_file" &&
+	GIT_TRACE2="$(pwd)/$trace_file" "$@" &&
+
+	local workers=$(grep "child_start\[..*\] git checkout--worker" "$trace_file" | wc -l) &&
+	test $workers -eq $expected_workers &&
+	rm "$trace_file"
+}
+
+# Verify that both the working tree and the index were created correctly
+verify_checkout () {
+	if test $# -ne 1
+	then
+		BUG "usage: verify_checkout <repository path>"
+	fi &&
+
+	git -C "$1" diff-index --ignore-submodules=none --exit-code HEAD -- &&
+	git -C "$1" status --porcelain >"$1".status &&
+	test_must_be_empty "$1".status
+}
diff --git a/t/t2080-parallel-checkout-basics.sh b/t/t2080-parallel-checkout-basics.sh
new file mode 100755
index 0000000..7087818
--- /dev/null
+++ b/t/t2080-parallel-checkout-basics.sh
@@ -0,0 +1,229 @@
+#!/bin/sh
+
+test_description='parallel-checkout basics
+
+Ensure that parallel-checkout basically works on clone and checkout, spawning
+the required number of workers and correctly populating both the index and the
+working tree.
+'
+
+TEST_NO_CREATE_REPO=1
+. ./test-lib.sh
+. "$TEST_DIRECTORY/lib-parallel-checkout.sh"
+
+# Test parallel-checkout with a branch switch containing a variety of file
+# creations, deletions, and modifications, involving different entry types.
+# The branches B1 and B2 have the following paths:
+#
+#      B1                 B2
+#  a/a (file)         a   (file)
+#  b   (file)         b/b (file)
+#
+#  c/c (file)         c   (symlink)
+#  d   (symlink)      d/d (file)
+#
+#  e/e (file)         e   (submodule)
+#  f   (submodule)    f/f (file)
+#
+#  g   (submodule)    g   (symlink)
+#  h   (symlink)      h   (submodule)
+#
+# Additionally, the following paths are present on both branches, but with
+# different contents:
+#
+#  i   (file)         i   (file)
+#  j   (symlink)      j   (symlink)
+#  k   (submodule)    k   (submodule)
+#
+# And the following paths are only present in one of the branches:
+#
+#  l/l (file)         -
+#  -                  m/m (file)
+#
+test_expect_success 'setup repo for checkout with various types of changes' '
+	git init sub &&
+	(
+		cd sub &&
+		git checkout -b B2 &&
+		echo B2 >file &&
+		git add file &&
+		git commit -m file &&
+
+		git checkout -b B1 &&
+		echo B1 >file &&
+		git add file &&
+		git commit -m file
+	) &&
+
+	git init various &&
+	(
+		cd various &&
+
+		git checkout -b B1 &&
+		mkdir a c e &&
+		echo a/a >a/a &&
+		echo b >b &&
+		echo c/c >c/c &&
+		test_ln_s_add c d &&
+		echo e/e >e/e &&
+		git submodule add ../sub f &&
+		git submodule add ../sub g &&
+		test_ln_s_add c h &&
+
+		echo "B1 i" >i &&
+		test_ln_s_add c j &&
+		git submodule add -b B1 ../sub k &&
+		mkdir l &&
+		echo l/l >l/l &&
+
+		git add . &&
+		git commit -m B1 &&
+
+		git checkout -b B2 &&
+		git rm -rf :^.gitmodules :^k &&
+		mkdir b d f &&
+		echo a >a &&
+		echo b/b >b/b &&
+		test_ln_s_add b c &&
+		echo d/d >d/d &&
+		git submodule add ../sub e &&
+		echo f/f >f/f &&
+		test_ln_s_add b g &&
+		git submodule add ../sub h &&
+
+		echo "B2 i" >i &&
+		test_ln_s_add b j &&
+		git -C k checkout B2 &&
+		mkdir m &&
+		echo m/m >m/m &&
+
+		git add . &&
+		git commit -m B2 &&
+
+		git checkout --recurse-submodules B1
+	)
+'
+
+for mode in sequential parallel sequential-fallback
+do
+	case $mode in
+	sequential)          workers=1 threshold=0 expected_workers=0 ;;
+	parallel)            workers=2 threshold=0 expected_workers=2 ;;
+	sequential-fallback) workers=2 threshold=100 expected_workers=0 ;;
+	esac
+
+	test_expect_success "$mode checkout" '
+		repo=various_$mode &&
+		cp -R various $repo &&
+
+		# The just copied files have more recent timestamps than their
+		# associated index entries. So refresh the cached timestamps
+		# to avoid an "entry not up-to-date" error from `git checkout`.
+		# We only have to do this for the submodules as `git checkout`
+		# will already refresh the superproject index before performing
+		# the up-to-date check.
+		#
+		git -C $repo submodule foreach "git update-index --refresh" &&
+
+		set_checkout_config $workers $threshold &&
+		test_checkout_workers $expected_workers \
+			git -C $repo checkout --recurse-submodules B2 &&
+		verify_checkout $repo
+	'
+done
+
+for mode in parallel sequential-fallback
+do
+	case $mode in
+	parallel)            workers=2 threshold=0 expected_workers=2 ;;
+	sequential-fallback) workers=2 threshold=100 expected_workers=0 ;;
+	esac
+
+	test_expect_success "$mode checkout on clone" '
+		repo=various_${mode}_clone &&
+		set_checkout_config $workers $threshold &&
+		test_checkout_workers $expected_workers \
+			git clone --recurse-submodules --branch B2 various $repo &&
+		verify_checkout $repo
+	'
+done
+
+# Just to be paranoid, actually compare the working trees' contents directly.
+test_expect_success 'compare the working trees' '
+	rm -rf various_*/.git &&
+	rm -rf various_*/*/.git &&
+
+	# We use `git diff` instead of `diff -r` because the latter would
+	# follow symlinks, and not all `diff` implementations support the
+	# `--no-dereference` option.
+	#
+	git diff --no-index various_sequential various_parallel &&
+	git diff --no-index various_sequential various_parallel_clone &&
+	git diff --no-index various_sequential various_sequential-fallback &&
+	git diff --no-index various_sequential various_sequential-fallback_clone
+'
+
+# Currently, each submodule is checked out in a separated child process, but
+# these subprocesses must also be able to use parallel checkout workers to
+# write the submodules' entries.
+test_expect_success 'submodules can use parallel checkout' '
+	set_checkout_config 2 0 &&
+	git init super &&
+	(
+		cd super &&
+		git init sub &&
+		test_commit -C sub A &&
+		test_commit -C sub B &&
+		git submodule add ./sub &&
+		git commit -m sub &&
+		rm sub/* &&
+		test_checkout_workers 2 git checkout --recurse-submodules .
+	)
+'
+
+test_expect_success 'parallel checkout respects --[no]-force' '
+	set_checkout_config 2 0 &&
+	git init dirty &&
+	(
+		cd dirty &&
+		mkdir D &&
+		test_commit D/F &&
+		test_commit F &&
+
+		rm -rf D &&
+		echo changed >D &&
+		echo changed >F.t &&
+
+		# We expect 0 workers because there is nothing to be done
+		test_checkout_workers 0 git checkout HEAD &&
+		test_path_is_file D &&
+		grep changed D &&
+		grep changed F.t &&
+
+		test_checkout_workers 2 git checkout --force HEAD &&
+		test_path_is_dir D &&
+		grep D/F D/F.t &&
+		grep F F.t
+	)
+'
+
+test_expect_success SYMLINKS 'parallel checkout checks for symlinks in leading dirs' '
+	set_checkout_config 2 0 &&
+	git init symlinks &&
+	(
+		cd symlinks &&
+		mkdir D untracked &&
+		# Commit 2 files to have enough work for 2 parallel workers
+		test_commit D/A &&
+		test_commit D/B &&
+		rm -rf D &&
+		ln -s untracked D &&
+
+		test_checkout_workers 2 git checkout --force HEAD &&
+		! test -h D &&
+		grep D/A D/A.t &&
+		grep D/B D/B.t
+	)
+'
+
+test_done
-- 
cgit v0.10.2-6-g49f6


From 6a7bc9d11823239183dd5f6547b13824a6a15fc2 Mon Sep 17 00:00:00 2001
From: Matheus Tavares <matheus.bernardino@usp.br>
Date: Tue, 4 May 2021 13:27:32 -0300
Subject: parallel-checkout: add tests related to path collisions

Add tests to confirm that path collisions are properly detected by
checkout workers, both to avoid race conditions and to report colliding
entries on clone.

Co-authored-by: Jeff Hostetler <jeffhost@microsoft.com>
Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>

diff --git a/parallel-checkout.c b/parallel-checkout.c
index 09e8b10..6fb3f1e 100644
--- a/parallel-checkout.c
+++ b/parallel-checkout.c
@@ -8,6 +8,7 @@
 #include "sigchain.h"
 #include "streaming.h"
 #include "thread-utils.h"
+#include "trace2.h"
 
 struct pc_worker {
 	struct child_process cp;
@@ -326,6 +327,7 @@ void write_pc_item(struct parallel_checkout_item *pc_item,
 	if (dir_sep && !has_dirs_only_path(path.buf, dir_sep - path.buf,
 					   state->base_dir_len)) {
 		pc_item->status = PC_ITEM_COLLIDED;
+		trace2_data_string("pcheckout", NULL, "collision/dirname", path.buf);
 		goto out;
 	}
 
@@ -341,6 +343,8 @@ void write_pc_item(struct parallel_checkout_item *pc_item,
 			 * call should have already caught these cases.
 			 */
 			pc_item->status = PC_ITEM_COLLIDED;
+			trace2_data_string("pcheckout", NULL,
+					   "collision/basename", path.buf);
 		} else {
 			error_errno("failed to open file '%s'", path.buf);
 			pc_item->status = PC_ITEM_FAILED;
diff --git a/t/lib-parallel-checkout.sh b/t/lib-parallel-checkout.sh
index f60b22e..d674042 100644
--- a/t/lib-parallel-checkout.sh
+++ b/t/lib-parallel-checkout.sh
@@ -22,12 +22,12 @@ test_checkout_workers () {
 
 	local trace_file=trace-test-checkout-workers &&
 	rm -f "$trace_file" &&
-	GIT_TRACE2="$(pwd)/$trace_file" "$@" &&
+	GIT_TRACE2="$(pwd)/$trace_file" "$@" 2>&8 &&
 
 	local workers=$(grep "child_start\[..*\] git checkout--worker" "$trace_file" | wc -l) &&
 	test $workers -eq $expected_workers &&
 	rm "$trace_file"
-}
+} 8>&2 2>&4
 
 # Verify that both the working tree and the index were created correctly
 verify_checkout () {
diff --git a/t/t2081-parallel-checkout-collisions.sh b/t/t2081-parallel-checkout-collisions.sh
new file mode 100755
index 0000000..f6fcfc0
--- /dev/null
+++ b/t/t2081-parallel-checkout-collisions.sh
@@ -0,0 +1,162 @@
+#!/bin/sh
+
+test_description="path collisions during parallel checkout
+
+Parallel checkout must detect path collisions to:
+
+1) Avoid racily writing to different paths that represent the same file on disk.
+2) Report the colliding entries on clone.
+
+The tests in this file exercise parallel checkout's collision detection code in
+both these mechanics.
+"
+
+. ./test-lib.sh
+. "$TEST_DIRECTORY/lib-parallel-checkout.sh"
+
+TEST_ROOT="$PWD"
+
+test_expect_success CASE_INSENSITIVE_FS 'setup' '
+	empty_oid=$(git hash-object -w --stdin </dev/null) &&
+	cat >objs <<-EOF &&
+	100644 $empty_oid	FILE_X
+	100644 $empty_oid	FILE_x
+	100644 $empty_oid	file_X
+	100644 $empty_oid	file_x
+	EOF
+	git update-index --index-info <objs &&
+	git commit -m "colliding files" &&
+	git tag basename_collision &&
+
+	write_script "$TEST_ROOT"/logger_script <<-\EOF
+	echo "$@" >>filter.log
+	EOF
+'
+
+test_workers_in_event_trace ()
+{
+	test $1 -eq $(grep ".event.:.child_start..*checkout--worker" $2 | wc -l)
+}
+
+test_expect_success CASE_INSENSITIVE_FS 'worker detects basename collision' '
+	GIT_TRACE2_EVENT="$(pwd)/trace" git \
+		-c checkout.workers=2 -c checkout.thresholdForParallelism=0 \
+		checkout . &&
+
+	test_workers_in_event_trace 2 trace &&
+	collisions=$(grep -i "category.:.pcheckout.,.key.:.collision/basename.,.value.:.file_x.}" trace | wc -l) &&
+	test $collisions -eq 3
+'
+
+test_expect_success CASE_INSENSITIVE_FS 'worker detects dirname collision' '
+	test_config filter.logger.smudge "\"$TEST_ROOT/logger_script\" %f" &&
+	empty_oid=$(git hash-object -w --stdin </dev/null) &&
+
+	# By setting a filter command to "a", we make it ineligible for parallel
+	# checkout, and thus it is checked out *first*. This way we can ensure
+	# that "A/B" and "A/C" will both collide with the regular file "a".
+	#
+	attr_oid=$(echo "a filter=logger" | git hash-object -w --stdin) &&
+
+	cat >objs <<-EOF &&
+	100644 $empty_oid	A/B
+	100644 $empty_oid	A/C
+	100644 $empty_oid	a
+	100644 $attr_oid	.gitattributes
+	EOF
+	git rm -rf . &&
+	git update-index --index-info <objs &&
+
+	rm -f trace filter.log &&
+	GIT_TRACE2_EVENT="$(pwd)/trace" git \
+		-c checkout.workers=2 -c checkout.thresholdForParallelism=0 \
+		checkout . &&
+
+	# Check that "a" (and only "a") was filtered
+	echo a >expected.log &&
+	test_cmp filter.log expected.log &&
+
+	# Check that it used the right number of workers and detected the collisions
+	test_workers_in_event_trace 2 trace &&
+	grep "category.:.pcheckout.,.key.:.collision/dirname.,.value.:.A/B.}" trace &&
+	grep "category.:.pcheckout.,.key.:.collision/dirname.,.value.:.A/C.}" trace
+'
+
+test_expect_success SYMLINKS,CASE_INSENSITIVE_FS 'do not follow symlinks colliding with leading dir' '
+	empty_oid=$(git hash-object -w --stdin </dev/null) &&
+	symlink_oid=$(echo "./e" | git hash-object -w --stdin) &&
+	mkdir e &&
+
+	cat >objs <<-EOF &&
+	120000 $symlink_oid	D
+	100644 $empty_oid	d/x
+	100644 $empty_oid	e/y
+	EOF
+	git rm -rf . &&
+	git update-index --index-info <objs &&
+
+	set_checkout_config 2 0 &&
+	test_checkout_workers 2 git checkout . &&
+	test_path_is_dir e &&
+	test_path_is_missing e/x
+'
+
+# The two following tests check that parallel checkout correctly reports
+# colliding entries on clone. The sequential code detects a collision by
+# calling lstat() before trying to open(O_CREAT) a file. (Note that this only
+# works for clone.) Then, to find the pair of a colliding item k, it searches
+# cache_entry[0, k-1]. This is not sufficient in parallel checkout because:
+#
+# - A colliding file may be created between the lstat() and open() calls;
+# - A colliding entry might appear in the second half of the cache_entry array.
+#
+test_expect_success CASE_INSENSITIVE_FS 'collision report on clone (w/ racy file creation)' '
+	git reset --hard basename_collision &&
+	set_checkout_config 2 0 &&
+	test_checkout_workers 2 git clone . clone-repo 2>stderr &&
+
+	grep FILE_X stderr &&
+	grep FILE_x stderr &&
+	grep file_X stderr &&
+	grep file_x stderr &&
+	grep "the following paths have collided" stderr
+'
+
+# This test ensures that the collision report code is correctly looking for
+# colliding peers in the second half of the cache_entry array. This is done by
+# defining a smudge command for the *last* array entry, which makes it
+# non-eligible for parallel-checkout. Thus, it is checked out *first*, before
+# spawning the workers.
+#
+# Note: this test doesn't work on Windows because, on this system, the
+# collision report code uses strcmp() to find the colliding pairs when
+# core.ignoreCase is false. And we need this setting for this test so that only
+# 'file_x' matches the pattern of the filter attribute. But the test works on
+# OSX, where the colliding pairs are found using inode.
+#
+test_expect_success CASE_INSENSITIVE_FS,!MINGW,!CYGWIN \
+	'collision report on clone (w/ colliding peer after the detected entry)' '
+
+	test_config_global filter.logger.smudge "\"$TEST_ROOT/logger_script\" %f" &&
+	git reset --hard basename_collision &&
+	echo "file_x filter=logger" >.gitattributes &&
+	git add .gitattributes &&
+	git commit -m "filter for file_x" &&
+
+	rm -rf clone-repo &&
+	set_checkout_config 2 0 &&
+	test_checkout_workers 2 \
+		git -c core.ignoreCase=false clone . clone-repo 2>stderr &&
+
+	grep FILE_X stderr &&
+	grep FILE_x stderr &&
+	grep file_X stderr &&
+	grep file_x stderr &&
+	grep "the following paths have collided" stderr &&
+
+	# Check that only "file_x" was filtered
+	echo file_x >expected.log &&
+	test_cmp clone-repo/filter.log expected.log
+'
+
+test_done
-- 
cgit v0.10.2-6-g49f6


From 2fa3cbadcdc63c3a723c2fd490f1d84552854bee Mon Sep 17 00:00:00 2001
From: Matheus Tavares <matheus.bernardino@usp.br>
Date: Tue, 4 May 2021 13:27:33 -0300
Subject: t0028: extract encoding helpers to lib-encoding.sh

The following patch will add tests outside t0028 which will also need to
re-encode some strings. Extract the auxiliary encoding functions from
t0028 to a common lib file so that they can be reused.

Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>

diff --git a/t/lib-encoding.sh b/t/lib-encoding.sh
new file mode 100644
index 0000000..2dabc8c
--- /dev/null
+++ b/t/lib-encoding.sh
@@ -0,0 +1,25 @@
+# Encoding helpers
+
+test_lazy_prereq NO_UTF16_BOM '
+	test $(printf abc | iconv -f UTF-8 -t UTF-16 | wc -c) = 6
+'
+
+test_lazy_prereq NO_UTF32_BOM '
+	test $(printf abc | iconv -f UTF-8 -t UTF-32 | wc -c) = 12
+'
+
+write_utf16 () {
+	if test_have_prereq NO_UTF16_BOM
+	then
+		printf '\376\377'
+	fi &&
+	iconv -f UTF-8 -t UTF-16
+}
+
+write_utf32 () {
+	if test_have_prereq NO_UTF32_BOM
+	then
+		printf '\0\0\376\377'
+	fi &&
+	iconv -f UTF-8 -t UTF-32
+}
diff --git a/t/t0028-working-tree-encoding.sh b/t/t0028-working-tree-encoding.sh
index f970a98..82905a2 100755
--- a/t/t0028-working-tree-encoding.sh
+++ b/t/t0028-working-tree-encoding.sh
@@ -6,33 +6,10 @@ GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
 export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
 
 . ./test-lib.sh
+. "$TEST_DIRECTORY/lib-encoding.sh"
 
 GIT_TRACE_WORKING_TREE_ENCODING=1 && export GIT_TRACE_WORKING_TREE_ENCODING
 
-test_lazy_prereq NO_UTF16_BOM '
-	test $(printf abc | iconv -f UTF-8 -t UTF-16 | wc -c) = 6
-'
-
-test_lazy_prereq NO_UTF32_BOM '
-	test $(printf abc | iconv -f UTF-8 -t UTF-32 | wc -c) = 12
-'
-
-write_utf16 () {
-	if test_have_prereq NO_UTF16_BOM
-	then
-		printf '\376\377'
-	fi &&
-	iconv -f UTF-8 -t UTF-16
-}
-
-write_utf32 () {
-	if test_have_prereq NO_UTF32_BOM
-	then
-		printf '\0\0\376\377'
-	fi &&
-	iconv -f UTF-8 -t UTF-32
-}
-
 test_expect_success 'setup test files' '
 	git config core.eol lf &&
 
-- 
cgit v0.10.2-6-g49f6


From d5904220bccc7f9e49a507d969bcc83428e582df Mon Sep 17 00:00:00 2001
From: Matheus Tavares <matheus.bernardino@usp.br>
Date: Tue, 4 May 2021 13:27:34 -0300
Subject: parallel-checkout: add tests related to .gitattributes

Add tests to confirm that the `struct conv_attrs` data is correctly
passed from the main process to the workers, and that they can properly
convert the blobs before writing them to the working tree.

Also check that parallel-ineligible entries, such as regular files that
require external filters, are correctly smudged and written when
parallel-checkout is enabled.

Co-authored-by: Jeff Hostetler <jeffhost@microsoft.com>
Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>

diff --git a/t/t2082-parallel-checkout-attributes.sh b/t/t2082-parallel-checkout-attributes.sh
new file mode 100755
index 0000000..2525457
--- /dev/null
+++ b/t/t2082-parallel-checkout-attributes.sh
@@ -0,0 +1,194 @@
+#!/bin/sh
+
+test_description='parallel-checkout: attributes
+
+Verify that parallel-checkout correctly creates files that require
+conversions, as specified in .gitattributes. The main point here is
+to check that the conv_attr data is correctly sent to the workers
+and that it contains sufficient information to smudge files
+properly (without access to the index or attribute stack).
+'
+
+TEST_NO_CREATE_REPO=1
+. ./test-lib.sh
+. "$TEST_DIRECTORY/lib-parallel-checkout.sh"
+. "$TEST_DIRECTORY/lib-encoding.sh"
+
+test_expect_success 'parallel-checkout with ident' '
+	set_checkout_config 2 0 &&
+	git init ident &&
+	(
+		cd ident &&
+		echo "A ident" >.gitattributes &&
+		echo "\$Id\$" >A &&
+		echo "\$Id\$" >B &&
+		git add -A &&
+		git commit -m id &&
+
+		rm A B &&
+		test_checkout_workers 2 git reset --hard &&
+		hexsz=$(test_oid hexsz) &&
+		grep -E "\\\$Id: [0-9a-f]{$hexsz} \\\$" A &&
+		grep "\\\$Id\\\$" B
+	)
+'
+
+test_expect_success 'parallel-checkout with re-encoding' '
+	set_checkout_config 2 0 &&
+	git init encoding &&
+	(
+		cd encoding &&
+		echo text >utf8-text &&
+		write_utf16 <utf8-text >utf16-text &&
+
+		echo "A working-tree-encoding=UTF-16" >.gitattributes &&
+		cp utf16-text A &&
+		cp utf8-text B &&
+		git add A B .gitattributes &&
+		git commit -m encoding &&
+
+		# Check that A is stored in UTF-8
+		git cat-file -p :A >A.internal &&
+		test_cmp_bin utf8-text A.internal &&
+
+		rm A B &&
+		test_checkout_workers 2 git checkout A B &&
+
+		# Check that A (and only A) is re-encoded during checkout
+		test_cmp_bin utf16-text A &&
+		test_cmp_bin utf8-text B
+	)
+'
+
+test_expect_success 'parallel-checkout with eol conversions' '
+	set_checkout_config 2 0 &&
+	git init eol &&
+	(
+		cd eol &&
+		printf "multi\r\nline\r\ntext" >crlf-text &&
+		printf "multi\nline\ntext" >lf-text &&
+
+		git config core.autocrlf false &&
+		echo "A eol=crlf" >.gitattributes &&
+		cp crlf-text A &&
+		cp lf-text B &&
+		git add A B .gitattributes &&
+		git commit -m eol &&
+
+		# Check that A is stored with LF format
+		git cat-file -p :A >A.internal &&
+		test_cmp_bin lf-text A.internal &&
+
+		rm A B &&
+		test_checkout_workers 2 git checkout A B &&
+
+		# Check that A (and only A) is converted to CRLF during checkout
+		test_cmp_bin crlf-text A &&
+		test_cmp_bin lf-text B
+	)
+'
+
+# Entries that require an external filter are not eligible for parallel
+# checkout. Check that both the parallel-eligible and non-eligible entries are
+# properly written in a single checkout operation.
+#
+test_expect_success 'parallel-checkout and external filter' '
+	set_checkout_config 2 0 &&
+	git init filter &&
+	(
+		cd filter &&
+		write_script <<-\EOF rot13.sh &&
+		tr \
+		  "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" \
+		  "nopqrstuvwxyzabcdefghijklmNOPQRSTUVWXYZABCDEFGHIJKLM"
+		EOF
+
+		git config filter.rot13.clean "\"$(pwd)/rot13.sh\"" &&
+		git config filter.rot13.smudge "\"$(pwd)/rot13.sh\"" &&
+		git config filter.rot13.required true &&
+
+		echo abcd >original &&
+		echo nopq >rot13 &&
+
+		echo "A filter=rot13" >.gitattributes &&
+		cp original A &&
+		cp original B &&
+		cp original C &&
+		git add A B C .gitattributes &&
+		git commit -m filter &&
+
+		# Check that A (and only A) was cleaned
+		git cat-file -p :A >A.internal &&
+		test_cmp rot13 A.internal &&
+		git cat-file -p :B >B.internal &&
+		test_cmp original B.internal &&
+		git cat-file -p :C >C.internal &&
+		test_cmp original C.internal &&
+
+		rm A B C *.internal &&
+		test_checkout_workers 2 git checkout A B C &&
+
+		# Check that A (and only A) was smudged during checkout
+		test_cmp original A &&
+		test_cmp original B &&
+		test_cmp original C
+	)
+'
+
+# The delayed queue is independent from the parallel queue, and they should be
+# able to work together in the same checkout process.
+#
+test_expect_success PERL 'parallel-checkout and delayed checkout' '
+	write_script rot13-filter.pl "$PERL_PATH" \
+		<"$TEST_DIRECTORY"/t0021/rot13-filter.pl &&
+
+	test_config_global filter.delay.process \
+		"\"$(pwd)/rot13-filter.pl\" --always-delay \"$(pwd)/delayed.log\" clean smudge delay" &&
+	test_config_global filter.delay.required true &&
+
+	echo "abcd" >original &&
+	echo "nopq" >rot13 &&
+
+	git init delayed &&
+	(
+		cd delayed &&
+		echo "*.d filter=delay" >.gitattributes &&
+		cp ../original W.d &&
+		cp ../original X.d &&
+		cp ../original Y &&
+		cp ../original Z &&
+		git add -A &&
+		git commit -m delayed &&
+
+		# Check that *.d files were cleaned
+		git cat-file -p :W.d >W.d.internal &&
+		test_cmp W.d.internal ../rot13 &&
+		git cat-file -p :X.d >X.d.internal &&
+		test_cmp X.d.internal ../rot13 &&
+		git cat-file -p :Y >Y.internal &&
+		test_cmp Y.internal ../original &&
+		git cat-file -p :Z >Z.internal &&
+		test_cmp Z.internal ../original &&
+
+		rm *
+	) &&
+
+	set_checkout_config 2 0 &&
+	test_checkout_workers 2 git -C delayed checkout -f &&
+	verify_checkout delayed &&
+
+	# Check that the *.d files got to the delay queue and were filtered
+	grep "smudge W.d .* \[DELAYED\]" delayed.log &&
+	grep "smudge X.d .* \[DELAYED\]" delayed.log &&
+	test_cmp delayed/W.d original &&
+	test_cmp delayed/X.d original &&
+
+	# Check that the parallel-eligible entries went to the right queue and
+	# were not filtered
+	! grep "smudge Y .* \[DELAYED\]" delayed.log &&
+	! grep "smudge Z .* \[DELAYED\]" delayed.log &&
+	test_cmp delayed/Y original &&
+	test_cmp delayed/Z original
+'
+
+test_done
-- 
cgit v0.10.2-6-g49f6


From 87094fc2daa9613c2fad454dbb068a8f23ce8de8 Mon Sep 17 00:00:00 2001
From: Matheus Tavares <matheus.bernardino@usp.br>
Date: Tue, 4 May 2021 13:27:35 -0300
Subject: ci: run test round with parallel-checkout enabled

We already have tests for the basic parallel-checkout operations. But
this code can also be executed by other commands, such as
git-read-tree and git-sparse-checkout, which are currently not tested
with multiple workers. To promote a wider test coverage without
duplicating tests:

1. Add the GIT_TEST_CHECKOUT_WORKERS environment variable, to optionally
   force parallel-checkout execution during the whole test suite.

2. Set this variable (with a value of 2) in the second test round of our
   linux-gcc CI job. This round runs `make test` again with some
   optional GIT_TEST_* variables enabled, so there is no additional
   overhead in exercising the parallel-checkout code here.

Note that tests checking out fewer than two parallel-eligible entries
will fall back to the sequential mode. Nevertheless, it's still a good
exercise for the parallel-checkout framework as the fallback codepath
also writes the queued entries using the parallel-checkout functions
(only without spawning any worker).

Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>

diff --git a/ci/run-build-and-tests.sh b/ci/run-build-and-tests.sh
index a66b5e8..23b28e7 100755
--- a/ci/run-build-and-tests.sh
+++ b/ci/run-build-and-tests.sh
@@ -25,6 +25,7 @@ linux-gcc)
 	export GIT_TEST_ADD_I_USE_BUILTIN=1
 	export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=master
 	export GIT_TEST_WRITE_REV_INDEX=1
+	export GIT_TEST_CHECKOUT_WORKERS=2
 	make test
 	;;
 linux-clang)
diff --git a/parallel-checkout.c b/parallel-checkout.c
index 6fb3f1e..6b1af32 100644
--- a/parallel-checkout.c
+++ b/parallel-checkout.c
@@ -35,6 +35,20 @@ static const int DEFAULT_NUM_WORKERS = 1;
 
 void get_parallel_checkout_configs(int *num_workers, int *threshold)
 {
+	char *env_workers = getenv("GIT_TEST_CHECKOUT_WORKERS");
+
+	if (env_workers && *env_workers) {
+		if (strtol_i(env_workers, 10, num_workers)) {
+			die("invalid value for GIT_TEST_CHECKOUT_WORKERS: '%s'",
+			    env_workers);
+		}
+		if (*num_workers < 1)
+			*num_workers = online_cpus();
+
+		*threshold = 0;
+		return;
+	}
+
 	if (git_config_get_int("checkout.workers", num_workers))
 		*num_workers = DEFAULT_NUM_WORKERS;
 	else if (*num_workers < 1)
diff --git a/t/README b/t/README
index fd9375b..a194488 100644
--- a/t/README
+++ b/t/README
@@ -436,6 +436,10 @@ and "sha256".
 GIT_TEST_WRITE_REV_INDEX=<boolean>, when true enables the
 'pack.writeReverseIndex' setting.
 
+GIT_TEST_CHECKOUT_WORKERS=<n> overrides the 'checkout.workers' setting
+to <n> and 'checkout.thresholdForParallelism' to 0, forcing the
+execution of the parallel-checkout code.
+
 Naming Tests
 ------------
 
diff --git a/t/lib-parallel-checkout.sh b/t/lib-parallel-checkout.sh
index d674042..21f5759 100644
--- a/t/lib-parallel-checkout.sh
+++ b/t/lib-parallel-checkout.sh
@@ -1,5 +1,8 @@
 # Helpers for tests invoking parallel-checkout
 
+# Parallel checkout tests need full control of the number of workers
+unset GIT_TEST_CHECKOUT_WORKERS
+
 set_checkout_config () {
 	if test $# -ne 2
 	then
-- 
cgit v0.10.2-6-g49f6