path: root/dir.h
diff options
author Derrick Stolee <> 2019-11-21 22:04:41 (GMT)
committer Junio C Hamano <> 2019-11-22 07:11:44 (GMT)
commit 96cc8ab5318cd57c8bc203b8f064b35883b2386f (patch)
tree 34c7c46a9525f853ba710fe070fd191060ac2cdd /dir.h
parent 879321eb0bec25779386445d65242452825155be (diff)
sparse-checkout: use hashmaps for cone patterns
The parent and recursive patterns allowed by the "cone mode" option in sparse-checkout are restrictive enough that we can avoid using the regex parsing. Everything is based on prefix matches, so we can use hashsets to store the prefixes from the sparse-checkout file. When checking a path, we can strip path entries from the path and check the hashset for an exact match. As a test, I created a cone-mode sparse-checkout file for the Linux repository that actually includes every file. This was constructed by taking every folder in the Linux repo and creating the pattern pairs here: /$folder/ !/$folder/*/ This resulted in a sparse-checkout file with 8,296 patterns. Running 'git read-tree -mu HEAD' on this file had the following performance: core.sparseCheckout=false: 0.21 s (0.00 s) core.sparseCheckout=true: 3.75 s (3.50 s) core.sparseCheckoutCone=true: 0.23 s (0.01 s) The times in parentheses above correspond to the time spent in the first clear_ce_flags() call, according to the trace2 performance traces. While this example is contrived, it demonstrates how these patterns can slow the sparse-checkout feature. Helped-by: Eric Wong <> Helped-by: Johannes Schindelin <> Signed-off-by: Derrick Stolee <> Signed-off-by: Junio C Hamano <>
Diffstat (limited to 'dir.h')
1 files changed, 31 insertions, 0 deletions
diff --git a/dir.h b/dir.h
index 2fbdef0..f8edbca 100644
--- a/dir.h
+++ b/dir.h
@@ -4,6 +4,7 @@
/* See Documentation/technical/api-directory-listing.txt */
#include "cache.h"
+#include "hashmap.h"
#include "strbuf.h"
struct dir_entry {
@@ -37,6 +38,13 @@ struct path_pattern {
int srcpos;
+/* used for hashmaps for cone patterns */
+struct pattern_entry {
+ struct hashmap_entry ent;
+ char *pattern;
+ size_t patternlen;
* Each excludes file will be parsed into a fresh exclude_list which
* is appended to the relevant exclude_list_group (either EXC_DIRS or
@@ -55,6 +63,26 @@ struct pattern_list {
const char *src;
struct path_pattern **patterns;
+ /*
+ * While scanning the excludes, we attempt to match the patterns
+ * with a more restricted set that allows us to use hashsets for
+ * matching logic, which is faster than the linear lookup in the
+ * excludes array above. If non-zero, that check succeeded.
+ */
+ unsigned use_cone_patterns;
+ unsigned full_cone;
+ /*
+ * Stores paths where everything starting with those paths
+ * is included.
+ */
+ struct hashmap recursive_hashmap;
+ /*
+ * Used to check single-level parents of blobs.
+ */
+ struct hashmap parent_hashmap;
@@ -271,6 +299,9 @@ int is_excluded(struct dir_struct *dir,
struct index_state *istate,
const char *name, int *dtype);
+int hashmap_contains_parent(struct hashmap *map,
+ const char *path,
+ struct strbuf *buffer);
struct pattern_list *add_pattern_list(struct dir_struct *dir,
int group_type, const char *src);
int add_patterns_from_file_to_list(const char *fname, const char *base, int baselen,