From 37a251436447f5190105d01f18c6b398474be3d7 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sat, 27 Feb 2021 00:30:39 +0000 Subject: diffcore-rename: use directory rename guided basename comparisons A previous commit noted that it is very common for people to move files across directories while keeping their filename the same. The last few commits took advantage of this and showed that we can accelerate rename detection significantly using basenames; since files with the same basename serve as likely rename candidates, we can check those first and remove them from the rename candidate pool if they are sufficiently similar. Unfortunately, the previous optimization was limited by the fact that the remaining basenames after exact rename detection are not always unique. Many repositories have hundreds of build files with the same name (e.g. Makefile, .gitignore, build.gradle, etc.), and may even have hundreds of source files with the same name. (For example, the linux kernel has 100 setup.c, 87 irq.c, and 112 core.c files. A repository at $DAYJOB has a lot of ObjectFactory.java and Plugin.java files). For these files with non-unique basenames, we are faced with the task of attempting to determine or guess which directory they may have been relocated to. Such a task is precisely the job of directory rename detection. However, there are two catches: (1) the directory rename detection code has traditionally been part of the merge machinery rather than diffcore-rename.c, and (2) directory rename detection currently runs after regular rename detection is complete. The 1st catch is just an implementation issue that can be overcome by some code shuffling. The 2nd requires us to add a further approximation: we only have access to exact renames at this point, so we need to do directory rename detection based on just exact renames. In some cases we won't have exact renames, in which case this extra optimization won't apply. We also choose to not apply the optimization unless we know that the underlying directory was removed, which will require extra data to be passed in to diffcore_rename_extended(). Also, even if we get a prediction about which directory a file may have relocated to, we will still need to check to see if there is a file in the predicted directory, and then compare the two files to see if they meet the higher min_basename_score threshold required for marking the two files as renames. This commit introduces an idx_possible_rename() function which will do this directory rename detection for us and give us the index within rename_dst of the resulting filename. For now, this function is hardcoded to return -1 (not found) and just hooks up how its results would be used once we have a more complete implementation in place. Reviewed-by: Derrick Stolee Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano diff --git a/Documentation/gitdiffcore.txt b/Documentation/gitdiffcore.txt index 80fcf95..8673a5c 100644 --- a/Documentation/gitdiffcore.txt +++ b/Documentation/gitdiffcore.txt @@ -186,7 +186,7 @@ mark a file pair as a rename and stop considering other candidates for better matches. At most, one comparison is done per file in this preliminary pass; so if there are several remaining ext.txt files throughout the directory hierarchy after exact rename detection, this -preliminary step will be skipped for those files. +preliminary step may be skipped for those files. Note. When the "-C" option is used with `--find-copies-harder` option, 'git diff-{asterisk}' commands feed unmodified filepairs to diff --git a/diffcore-rename.c b/diffcore-rename.c index 4155818..b305568 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -379,6 +379,12 @@ static const char *get_basename(const char *filename) return base ? base + 1 : filename; } +static int idx_possible_rename(char *filename) +{ + /* Unconditionally return -1, "not found", for now */ + return -1; +} + static int find_basename_matches(struct diff_options *options, int minimum_score) { @@ -415,8 +421,6 @@ static int find_basename_matches(struct diff_options *options, int i, renames = 0; struct strintmap sources; struct strintmap dests; - struct hashmap_iter iter; - struct strmap_entry *entry; /* * The prefeteching stuff wants to know if it can skip prefetching @@ -466,17 +470,39 @@ static int find_basename_matches(struct diff_options *options, } /* Now look for basename matchups and do similarity estimation */ - strintmap_for_each_entry(&sources, &iter, entry) { - const char *base = entry->key; - intptr_t src_index = (intptr_t)entry->value; + for (i = 0; i < rename_src_nr; ++i) { + char *filename = rename_src[i].p->one->path; + const char *base = NULL; + intptr_t src_index; intptr_t dst_index; - if (src_index == -1) - continue; - if (0 <= (dst_index = strintmap_get(&dests, base))) { + /* + * If the basename is unique among remaining sources, then + * src_index will equal 'i' and we can attempt to match it + * to a unique basename in the destinations. Otherwise, + * use directory rename heuristics, if possible. + */ + base = get_basename(filename); + src_index = strintmap_get(&sources, base); + assert(src_index == -1 || src_index == i); + + if (strintmap_contains(&dests, base)) { struct diff_filespec *one, *two; int score; + /* Find a matching destination, if possible */ + dst_index = strintmap_get(&dests, base); + if (src_index == -1 || dst_index == -1) { + src_index = i; + dst_index = idx_possible_rename(filename); + } + if (dst_index == -1) + continue; + + /* Ignore this dest if already used in a rename */ + if (rename_dst[dst_index].is_rename) + continue; /* already used previously */ + /* Estimate the similarity */ one = rename_src[src_index].p->one; two = rename_dst[dst_index].p->two; -- cgit v0.10.2-6-g49f6 From bde8b9f34c57f0ad567890de181ff94741ef2cbd Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sat, 27 Feb 2021 00:30:40 +0000 Subject: diffcore-rename: provide basic implementation of idx_possible_rename() Add a new struct dir_rename_info with various values we need inside our idx_possible_rename() function introduced in the previous commit. Add a basic implementation for this function showing how we plan to use the variables, but which will just return early with a value of -1 (not found) when those variables are not set up. Future commits will do the work necessary to set up those other variables so that idx_possible_rename() does not always return -1. Reviewed-by: Derrick Stolee Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano diff --git a/diffcore-rename.c b/diffcore-rename.c index b305568..edb0eff 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -367,6 +367,19 @@ static int find_exact_renames(struct diff_options *options) return renames; } +struct dir_rename_info { + struct strintmap idx_map; + struct strmap dir_rename_guess; + struct strmap *dir_rename_count; + unsigned setup; +}; + +static char *get_dirname(const char *filename) +{ + char *slash = strrchr(filename, '/'); + return slash ? xstrndup(filename, slash - filename) : xstrdup(""); +} + static const char *get_basename(const char *filename) { /* @@ -379,14 +392,86 @@ static const char *get_basename(const char *filename) return base ? base + 1 : filename; } -static int idx_possible_rename(char *filename) +static int idx_possible_rename(char *filename, struct dir_rename_info *info) { - /* Unconditionally return -1, "not found", for now */ - return -1; + /* + * Our comparison of files with the same basename (see + * find_basename_matches() below), is only helpful when after exact + * rename detection we have exactly one file with a given basename + * among the rename sources and also only exactly one file with + * that basename among the rename destinations. When we have + * multiple files with the same basename in either set, we do not + * know which to compare against. However, there are some + * filenames that occur in large numbers (particularly + * build-related filenames such as 'Makefile', '.gitignore', or + * 'build.gradle' that potentially exist within every single + * subdirectory), and for performance we want to be able to quickly + * find renames for these files too. + * + * The reason basename comparisons are a useful heuristic was that it + * is common for people to move files across directories while keeping + * their filename the same. If we had a way of determining or even + * making a good educated guess about which directory these non-unique + * basename files had moved the file to, we could check it. + * Luckily... + * + * When an entire directory is in fact renamed, we have two factors + * helping us out: + * (a) the original directory disappeared giving us a hint + * about when we can apply an extra heuristic. + * (a) we often have several files within that directory and + * subdirectories that are renamed without changes + * So, rules for a heuristic: + * (0) If there basename matches are non-unique (the condition under + * which this function is called) AND + * (1) the directory in which the file was found has disappeared + * (i.e. dirs_removed is non-NULL and has a relevant entry) THEN + * (2) use exact renames of files within the directory to determine + * where the directory is likely to have been renamed to. IF + * there is at least one exact rename from within that + * directory, we can proceed. + * (3) If there are multiple places the directory could have been + * renamed to based on exact renames, ignore all but one of them. + * Just use the destination with the most renames going to it. + * (4) Check if applying that directory rename to the original file + * would result in a destination filename that is in the + * potential rename set. If so, return the index of the + * destination file (the index within rename_dst). + * (5) Compare the original file and returned destination for + * similarity, and if they are sufficiently similar, record the + * rename. + * + * This function, idx_possible_rename(), is only responsible for (4). + * The conditions/steps in (1)-(3) will be handled via setting up + * dir_rename_count and dir_rename_guess in a future + * initialize_dir_rename_info() function. Steps (0) and (5) are + * handled by the caller of this function. + */ + char *old_dir, *new_dir; + struct strbuf new_path = STRBUF_INIT; + int idx; + + if (!info->setup) + return -1; + + old_dir = get_dirname(filename); + new_dir = strmap_get(&info->dir_rename_guess, old_dir); + free(old_dir); + if (!new_dir) + return -1; + + strbuf_addstr(&new_path, new_dir); + strbuf_addch(&new_path, '/'); + strbuf_addstr(&new_path, get_basename(filename)); + + idx = strintmap_get(&info->idx_map, new_path.buf); + strbuf_release(&new_path); + return idx; } static int find_basename_matches(struct diff_options *options, - int minimum_score) + int minimum_score, + struct dir_rename_info *info) { /* * When I checked in early 2020, over 76% of file renames in linux @@ -494,7 +579,7 @@ static int find_basename_matches(struct diff_options *options, dst_index = strintmap_get(&dests, base); if (src_index == -1 || dst_index == -1) { src_index = i; - dst_index = idx_possible_rename(filename); + dst_index = idx_possible_rename(filename, info); } if (dst_index == -1) continue; @@ -677,8 +762,10 @@ void diffcore_rename(struct diff_options *options) int num_destinations, dst_cnt; int num_sources, want_copies; struct progress *progress = NULL; + struct dir_rename_info info; trace2_region_enter("diff", "setup", options->repo); + info.setup = 0; want_copies = (detect_rename == DIFF_DETECT_COPY); if (!minimum_score) minimum_score = DEFAULT_RENAME_SCORE; @@ -774,7 +861,8 @@ void diffcore_rename(struct diff_options *options) /* Utilize file basenames to quickly find renames. */ trace2_region_enter("diff", "basename matches", options->repo); rename_count += find_basename_matches(options, - min_basename_score); + min_basename_score, + &info); trace2_region_leave("diff", "basename matches", options->repo); /* -- cgit v0.10.2-6-g49f6 From ae8cf74d3f9e76332138ee8bb911b3fa53f17bbf Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sat, 27 Feb 2021 00:30:41 +0000 Subject: diffcore-rename: add a mapping of destination names to their indices Compute a mapping of full filename to the index within rename_dst where that filename is found, and store it in idx_map. idx_possible_rename() needs this to quickly finding an array entry in rename_dst given the pathname. While at it, add placeholder initializations for dir_rename_count and dir_rename_guess; these will be more fully populated in subsequent commits. Reviewed-by: Derrick Stolee Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano diff --git a/diffcore-rename.c b/diffcore-rename.c index edb0eff..8eeb8c7 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -380,6 +380,45 @@ static char *get_dirname(const char *filename) return slash ? xstrndup(filename, slash - filename) : xstrdup(""); } +static void initialize_dir_rename_info(struct dir_rename_info *info) +{ + int i; + + info->setup = 1; + + strintmap_init_with_options(&info->idx_map, -1, NULL, 0); + strmap_init_with_options(&info->dir_rename_guess, NULL, 0); + info->dir_rename_count = NULL; + + /* + * Loop setting up both info->idx_map. + */ + for (i = 0; i < rename_dst_nr; ++i) { + /* + * For non-renamed files, make idx_map contain mapping of + * filename -> index (index within rename_dst, that is) + */ + if (!rename_dst[i].is_rename) { + char *filename = rename_dst[i].p->two->path; + strintmap_set(&info->idx_map, filename, i); + } + } +} + +static void cleanup_dir_rename_info(struct dir_rename_info *info) +{ + if (!info->setup) + return; + + /* idx_map */ + strintmap_clear(&info->idx_map); + + /* dir_rename_guess */ + strmap_clear(&info->dir_rename_guess, 1); + + /* Nothing to do for dir_rename_count, yet */ +} + static const char *get_basename(const char *filename) { /* @@ -858,6 +897,11 @@ void diffcore_rename(struct diff_options *options) remove_unneeded_paths_from_src(want_copies); trace2_region_leave("diff", "cull after exact", options->repo); + /* Preparation for basename-driven matching. */ + trace2_region_enter("diff", "dir rename setup", options->repo); + initialize_dir_rename_info(&info); + trace2_region_leave("diff", "dir rename setup", options->repo); + /* Utilize file basenames to quickly find renames. */ trace2_region_enter("diff", "basename matches", options->repo); rename_count += find_basename_matches(options, @@ -1026,6 +1070,7 @@ void diffcore_rename(struct diff_options *options) if (rename_dst[i].filespec_to_free) free_filespec(rename_dst[i].filespec_to_free); + cleanup_dir_rename_info(&info); FREE_AND_NULL(rename_dst); rename_dst_nr = rename_dst_alloc = 0; FREE_AND_NULL(rename_src); -- cgit v0.10.2-6-g49f6 From 0c4fd732f043be570e08b51c475ff9f2e2066912 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sat, 27 Feb 2021 00:30:42 +0000 Subject: Move computation of dir_rename_count from merge-ort to diffcore-rename Move the computation of dir_rename_count from merge-ort.c to diffcore-rename.c, making slight adjustments to the data structures based on the move. While the diffstat looks large, viewing this commit with --color-moved makes it clear that only about 20 lines changed. With this patch, the computation of dir_rename_count is still only done after inexact rename detection, but subsequent commits will add a preliminary computation of dir_rename_count after exact rename detection, followed by some updates after inexact rename detection. Reviewed-by: Derrick Stolee Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano diff --git a/diffcore-rename.c b/diffcore-rename.c index 8eeb8c7..39e23d5 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -380,6 +380,129 @@ static char *get_dirname(const char *filename) return slash ? xstrndup(filename, slash - filename) : xstrdup(""); } +static void dirname_munge(char *filename) +{ + char *slash = strrchr(filename, '/'); + if (!slash) + slash = filename; + *slash = '\0'; +} + +static void increment_count(struct strmap *dir_rename_count, + char *old_dir, + char *new_dir) +{ + struct strintmap *counts; + struct strmap_entry *e; + + /* Get the {new_dirs -> counts} mapping using old_dir */ + e = strmap_get_entry(dir_rename_count, old_dir); + if (e) { + counts = e->value; + } else { + counts = xmalloc(sizeof(*counts)); + strintmap_init_with_options(counts, 0, NULL, 1); + strmap_put(dir_rename_count, old_dir, counts); + } + + /* Increment the count for new_dir */ + strintmap_incr(counts, new_dir, 1); +} + +static void update_dir_rename_counts(struct strmap *dir_rename_count, + struct strset *dirs_removed, + const char *oldname, + const char *newname) +{ + char *old_dir = xstrdup(oldname); + char *new_dir = xstrdup(newname); + char new_dir_first_char = new_dir[0]; + int first_time_in_loop = 1; + + while (1) { + dirname_munge(old_dir); + dirname_munge(new_dir); + + /* + * When renaming + * "a/b/c/d/e/foo.c" -> "a/b/some/thing/else/e/foo.c" + * then this suggests that both + * a/b/c/d/e/ => a/b/some/thing/else/e/ + * a/b/c/d/ => a/b/some/thing/else/ + * so we want to increment counters for both. We do NOT, + * however, also want to suggest that there was the following + * rename: + * a/b/c/ => a/b/some/thing/ + * so we need to quit at that point. + * + * Note the when first_time_in_loop, we only strip off the + * basename, and we don't care if that's different. + */ + if (!first_time_in_loop) { + char *old_sub_dir = strchr(old_dir, '\0')+1; + char *new_sub_dir = strchr(new_dir, '\0')+1; + if (!*new_dir) { + /* + * Special case when renaming to root directory, + * i.e. when new_dir == "". In this case, we had + * something like + * a/b/subdir => subdir + * and so dirname_munge() sets things up so that + * old_dir = "a/b\0subdir\0" + * new_dir = "\0ubdir\0" + * We didn't have a '/' to overwrite a '\0' onto + * in new_dir, so we have to compare differently. + */ + if (new_dir_first_char != old_sub_dir[0] || + strcmp(old_sub_dir+1, new_sub_dir)) + break; + } else { + if (strcmp(old_sub_dir, new_sub_dir)) + break; + } + } + + if (strset_contains(dirs_removed, old_dir)) + increment_count(dir_rename_count, old_dir, new_dir); + else + break; + + /* If we hit toplevel directory ("") for old or new dir, quit */ + if (!*old_dir || !*new_dir) + break; + + first_time_in_loop = 0; + } + + /* Free resources we don't need anymore */ + free(old_dir); + free(new_dir); +} + +static void compute_dir_rename_counts(struct strmap *dir_rename_count, + struct strset *dirs_removed) +{ + int i; + + /* Set up dir_rename_count */ + for (i = 0; i < rename_dst_nr; ++i) { + /* File not part of directory rename counts if not a rename */ + if (!rename_dst[i].is_rename) + continue; + + /* + * Make dir_rename_count contain a map of a map: + * old_directory -> {new_directory -> count} + * In other words, for every pair look at the directories for + * the old filename and the new filename and count how many + * times that pairing occurs. + */ + update_dir_rename_counts(dir_rename_count, dirs_removed, + rename_dst[i].p->one->path, + rename_dst[i].p->two->path); + } +} + static void initialize_dir_rename_info(struct dir_rename_info *info) { int i; @@ -790,7 +913,9 @@ static void remove_unneeded_paths_from_src(int detecting_copies) rename_src_nr = new_num_src; } -void diffcore_rename(struct diff_options *options) +void diffcore_rename_extended(struct diff_options *options, + struct strset *dirs_removed, + struct strmap *dir_rename_count) { int detect_rename = options->detect_rename; int minimum_score = options->rename_score; @@ -805,6 +930,7 @@ void diffcore_rename(struct diff_options *options) trace2_region_enter("diff", "setup", options->repo); info.setup = 0; + assert(!dir_rename_count || strmap_empty(dir_rename_count)); want_copies = (detect_rename == DIFF_DETECT_COPY); if (!minimum_score) minimum_score = DEFAULT_RENAME_SCORE; @@ -999,6 +1125,11 @@ void diffcore_rename(struct diff_options *options) trace2_region_leave("diff", "inexact renames", options->repo); cleanup: + /* + * Now that renames have been computed, compute dir_rename_count */ + if (dirs_removed && dir_rename_count) + compute_dir_rename_counts(dir_rename_count, dirs_removed); + /* At this point, we have found some renames and copies and they * are recorded in rename_dst. The original list is still in *q. */ @@ -1082,3 +1213,8 @@ void diffcore_rename(struct diff_options *options) trace2_region_leave("diff", "write back to queue", options->repo); return; } + +void diffcore_rename(struct diff_options *options) +{ + diffcore_rename_extended(options, NULL, NULL); +} diff --git a/diffcore.h b/diffcore.h index d2a63c5..db55d38 100644 --- a/diffcore.h +++ b/diffcore.h @@ -8,6 +8,8 @@ struct diff_options; struct repository; +struct strmap; +struct strset; struct userdiff_driver; /* This header file is internal between diff.c and its diff transformers @@ -161,6 +163,9 @@ void diff_q(struct diff_queue_struct *, struct diff_filepair *); void diffcore_break(struct repository *, int); void diffcore_rename(struct diff_options *); +void diffcore_rename_extended(struct diff_options *options, + struct strset *dirs_removed, + struct strmap *dir_rename_count); void diffcore_merge_broken(void); void diffcore_pickaxe(struct diff_options *); void diffcore_order(const char *orderfile); diff --git a/merge-ort.c b/merge-ort.c index 603d30c..c4467e0 100644 --- a/merge-ort.c +++ b/merge-ort.c @@ -1302,131 +1302,6 @@ static char *handle_path_level_conflicts(struct merge_options *opt, return new_path; } -static void dirname_munge(char *filename) -{ - char *slash = strrchr(filename, '/'); - if (!slash) - slash = filename; - *slash = '\0'; -} - -static void increment_count(struct strmap *dir_rename_count, - char *old_dir, - char *new_dir) -{ - struct strintmap *counts; - struct strmap_entry *e; - - /* Get the {new_dirs -> counts} mapping using old_dir */ - e = strmap_get_entry(dir_rename_count, old_dir); - if (e) { - counts = e->value; - } else { - counts = xmalloc(sizeof(*counts)); - strintmap_init_with_options(counts, 0, NULL, 1); - strmap_put(dir_rename_count, old_dir, counts); - } - - /* Increment the count for new_dir */ - strintmap_incr(counts, new_dir, 1); -} - -static void update_dir_rename_counts(struct strmap *dir_rename_count, - struct strset *dirs_removed, - const char *oldname, - const char *newname) -{ - char *old_dir = xstrdup(oldname); - char *new_dir = xstrdup(newname); - char new_dir_first_char = new_dir[0]; - int first_time_in_loop = 1; - - while (1) { - dirname_munge(old_dir); - dirname_munge(new_dir); - - /* - * When renaming - * "a/b/c/d/e/foo.c" -> "a/b/some/thing/else/e/foo.c" - * then this suggests that both - * a/b/c/d/e/ => a/b/some/thing/else/e/ - * a/b/c/d/ => a/b/some/thing/else/ - * so we want to increment counters for both. We do NOT, - * however, also want to suggest that there was the following - * rename: - * a/b/c/ => a/b/some/thing/ - * so we need to quit at that point. - * - * Note the when first_time_in_loop, we only strip off the - * basename, and we don't care if that's different. - */ - if (!first_time_in_loop) { - char *old_sub_dir = strchr(old_dir, '\0')+1; - char *new_sub_dir = strchr(new_dir, '\0')+1; - if (!*new_dir) { - /* - * Special case when renaming to root directory, - * i.e. when new_dir == "". In this case, we had - * something like - * a/b/subdir => subdir - * and so dirname_munge() sets things up so that - * old_dir = "a/b\0subdir\0" - * new_dir = "\0ubdir\0" - * We didn't have a '/' to overwrite a '\0' onto - * in new_dir, so we have to compare differently. - */ - if (new_dir_first_char != old_sub_dir[0] || - strcmp(old_sub_dir+1, new_sub_dir)) - break; - } else { - if (strcmp(old_sub_dir, new_sub_dir)) - break; - } - } - - if (strset_contains(dirs_removed, old_dir)) - increment_count(dir_rename_count, old_dir, new_dir); - else - break; - - /* If we hit toplevel directory ("") for old or new dir, quit */ - if (!*old_dir || !*new_dir) - break; - - first_time_in_loop = 0; - } - - /* Free resources we don't need anymore */ - free(old_dir); - free(new_dir); -} - -static void compute_rename_counts(struct diff_queue_struct *pairs, - struct strmap *dir_rename_count, - struct strset *dirs_removed) -{ - int i; - - for (i = 0; i < pairs->nr; ++i) { - struct diff_filepair *pair = pairs->queue[i]; - - /* File not part of directory rename if it wasn't renamed */ - if (pair->status != 'R') - continue; - - /* - * Make dir_rename_count contain a map of a map: - * old_directory -> {new_directory -> count} - * In other words, for every pair look at the directories for - * the old filename and the new filename and count how many - * times that pairing occurs. - */ - update_dir_rename_counts(dir_rename_count, dirs_removed, - pair->one->path, - pair->two->path); - } -} - static void get_provisional_directory_renames(struct merge_options *opt, unsigned side, int *clean) @@ -1435,9 +1310,6 @@ static void get_provisional_directory_renames(struct merge_options *opt, struct strmap_entry *entry; struct rename_info *renames = &opt->priv->renames; - compute_rename_counts(&renames->pairs[side], - &renames->dir_rename_count[side], - &renames->dirs_removed[side]); /* * Collapse * dir_rename_count: old_directory -> {new_directory -> count} @@ -2162,7 +2034,9 @@ static void detect_regular_renames(struct merge_options *opt, diff_queued_diff = renames->pairs[side_index]; trace2_region_enter("diff", "diffcore_rename", opt->repo); - diffcore_rename(&diff_opts); + diffcore_rename_extended(&diff_opts, + &renames->dirs_removed[side_index], + &renames->dir_rename_count[side_index]); trace2_region_leave("diff", "diffcore_rename", opt->repo); resolve_diffpair_statuses(&diff_queued_diff); -- cgit v0.10.2-6-g49f6 From cd52e0050f0aa18bb35cda08f2dcbe94943df2cf Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sat, 27 Feb 2021 00:30:43 +0000 Subject: diffcore-rename: add function for clearing dir_rename_count As we adjust the usage of dir_rename_count we want to have a function for clearing, or partially clearing it out. Add a partial_clear_dir_rename_count() function for this purpose. Reviewed-by: Derrick Stolee Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano diff --git a/diffcore-rename.c b/diffcore-rename.c index 39e23d5..7dd475f 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -528,6 +528,18 @@ static void initialize_dir_rename_info(struct dir_rename_info *info) } } +void partial_clear_dir_rename_count(struct strmap *dir_rename_count) +{ + struct hashmap_iter iter; + struct strmap_entry *entry; + + strmap_for_each_entry(dir_rename_count, &iter, entry) { + struct strintmap *counts = entry->value; + strintmap_clear(counts); + } + strmap_partial_clear(dir_rename_count, 1); +} + static void cleanup_dir_rename_info(struct dir_rename_info *info) { if (!info->setup) diff --git a/diffcore.h b/diffcore.h index db55d38..c6ba64a 100644 --- a/diffcore.h +++ b/diffcore.h @@ -161,6 +161,8 @@ struct diff_filepair *diff_queue(struct diff_queue_struct *, struct diff_filespec *); void diff_q(struct diff_queue_struct *, struct diff_filepair *); +void partial_clear_dir_rename_count(struct strmap *dir_rename_count); + void diffcore_break(struct repository *, int); void diffcore_rename(struct diff_options *); void diffcore_rename_extended(struct diff_options *options, diff --git a/merge-ort.c b/merge-ort.c index c4467e0..467404c 100644 --- a/merge-ort.c +++ b/merge-ort.c @@ -351,17 +351,11 @@ static void clear_or_reinit_internal_opts(struct merge_options_internal *opti, /* Free memory used by various renames maps */ for (i = MERGE_SIDE1; i <= MERGE_SIDE2; ++i) { - struct hashmap_iter iter; - struct strmap_entry *entry; - strset_func(&renames->dirs_removed[i]); - strmap_for_each_entry(&renames->dir_rename_count[i], - &iter, entry) { - struct strintmap *counts = entry->value; - strintmap_clear(counts); - } - strmap_func(&renames->dir_rename_count[i], 1); + partial_clear_dir_rename_count(&renames->dir_rename_count[i]); + if (!reinitialize) + strmap_clear(&renames->dir_rename_count[i], 1); strmap_func(&renames->dir_renames[i], 0); } -- cgit v0.10.2-6-g49f6 From b6e3d2743410b0cfa749c1690de7f59683c0f372 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sat, 27 Feb 2021 00:30:44 +0000 Subject: diffcore-rename: move dir_rename_counts into dir_rename_info struct This continues the migration of the directory rename detection code into diffcore-rename, now taking the simple step of combining it with the dir_rename_info struct. Future commits will then make dir_rename_counts be computed in stages, and add computation of dir_rename_guess. Reviewed-by: Derrick Stolee Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano diff --git a/diffcore-rename.c b/diffcore-rename.c index 7dd475f..a1ccf14 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -388,7 +388,7 @@ static void dirname_munge(char *filename) *slash = '\0'; } -static void increment_count(struct strmap *dir_rename_count, +static void increment_count(struct dir_rename_info *info, char *old_dir, char *new_dir) { @@ -396,20 +396,20 @@ static void increment_count(struct strmap *dir_rename_count, struct strmap_entry *e; /* Get the {new_dirs -> counts} mapping using old_dir */ - e = strmap_get_entry(dir_rename_count, old_dir); + e = strmap_get_entry(info->dir_rename_count, old_dir); if (e) { counts = e->value; } else { counts = xmalloc(sizeof(*counts)); strintmap_init_with_options(counts, 0, NULL, 1); - strmap_put(dir_rename_count, old_dir, counts); + strmap_put(info->dir_rename_count, old_dir, counts); } /* Increment the count for new_dir */ strintmap_incr(counts, new_dir, 1); } -static void update_dir_rename_counts(struct strmap *dir_rename_count, +static void update_dir_rename_counts(struct dir_rename_info *info, struct strset *dirs_removed, const char *oldname, const char *newname) @@ -463,7 +463,7 @@ static void update_dir_rename_counts(struct strmap *dir_rename_count, } if (strset_contains(dirs_removed, old_dir)) - increment_count(dir_rename_count, old_dir, new_dir); + increment_count(info, old_dir, new_dir); else break; @@ -479,12 +479,15 @@ static void update_dir_rename_counts(struct strmap *dir_rename_count, free(new_dir); } -static void compute_dir_rename_counts(struct strmap *dir_rename_count, - struct strset *dirs_removed) +static void compute_dir_rename_counts(struct dir_rename_info *info, + struct strset *dirs_removed, + struct strmap *dir_rename_count) { int i; - /* Set up dir_rename_count */ + info->setup = 1; + info->dir_rename_count = dir_rename_count; + for (i = 0; i < rename_dst_nr; ++i) { /* File not part of directory rename counts if not a rename */ if (!rename_dst[i].is_rename) @@ -497,7 +500,7 @@ static void compute_dir_rename_counts(struct strmap *dir_rename_count, * the old filename and the new filename and count how many * times that pairing occurs. */ - update_dir_rename_counts(dir_rename_count, dirs_removed, + update_dir_rename_counts(info, dirs_removed, rename_dst[i].p->one->path, rename_dst[i].p->two->path); } @@ -551,7 +554,9 @@ static void cleanup_dir_rename_info(struct dir_rename_info *info) /* dir_rename_guess */ strmap_clear(&info->dir_rename_guess, 1); - /* Nothing to do for dir_rename_count, yet */ + /* dir_rename_count */ + partial_clear_dir_rename_count(info->dir_rename_count); + strmap_clear(info->dir_rename_count, 1); } static const char *get_basename(const char *filename) @@ -1140,7 +1145,7 @@ void diffcore_rename_extended(struct diff_options *options, /* * Now that renames have been computed, compute dir_rename_count */ if (dirs_removed && dir_rename_count) - compute_dir_rename_counts(dir_rename_count, dirs_removed); + compute_dir_rename_counts(&info, dirs_removed, dir_rename_count); /* At this point, we have found some renames and copies and they * are recorded in rename_dst. The original list is still in *q. -- cgit v0.10.2-6-g49f6 From b1473019e8b2b4aafdf578ab3dade36c9c4d419d Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sat, 27 Feb 2021 00:30:45 +0000 Subject: diffcore-rename: extend cleanup_dir_rename_info() When diffcore_rename_extended() is passed a NULL dir_rename_count, we will still want to create a temporary one for use by find_basename_matches(), but have it fully deallocated before diffcore_rename_extended() returns. However, when diffcore_rename_extended() is passed a dir_rename_count, we want to fill that strmap with appropriate values and return it. However, for our interim purposes we may also add entries corresponding to directories that cannot have been renamed due to still existing on both sides. Extend cleanup_dir_rename_info() to handle these two different cases, cleaning up the relevant bits of information for each case. Reviewed-by: Derrick Stolee Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano diff --git a/diffcore-rename.c b/diffcore-rename.c index a1ccf14..2cf9c47 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -543,8 +543,15 @@ void partial_clear_dir_rename_count(struct strmap *dir_rename_count) strmap_partial_clear(dir_rename_count, 1); } -static void cleanup_dir_rename_info(struct dir_rename_info *info) +static void cleanup_dir_rename_info(struct dir_rename_info *info, + struct strset *dirs_removed, + int keep_dir_rename_count) { + struct hashmap_iter iter; + struct strmap_entry *entry; + struct string_list to_remove = STRING_LIST_INIT_NODUP; + int i; + if (!info->setup) return; @@ -555,8 +562,33 @@ static void cleanup_dir_rename_info(struct dir_rename_info *info) strmap_clear(&info->dir_rename_guess, 1); /* dir_rename_count */ - partial_clear_dir_rename_count(info->dir_rename_count); - strmap_clear(info->dir_rename_count, 1); + if (!keep_dir_rename_count) { + partial_clear_dir_rename_count(info->dir_rename_count); + strmap_clear(info->dir_rename_count, 1); + FREE_AND_NULL(info->dir_rename_count); + return; + } + + /* + * Although dir_rename_count was passed in + * diffcore_rename_extended() and we want to keep it around and + * return it to that caller, we first want to remove any data + * associated with directories that weren't renamed. + */ + strmap_for_each_entry(info->dir_rename_count, &iter, entry) { + const char *source_dir = entry->key; + struct strintmap *counts = entry->value; + + if (!strset_contains(dirs_removed, source_dir)) { + string_list_append(&to_remove, source_dir); + strintmap_clear(counts); + continue; + } + } + for (i = 0; i < to_remove.nr; ++i) + strmap_remove(info->dir_rename_count, + to_remove.items[i].string, 1); + string_list_clear(&to_remove, 0); } static const char *get_basename(const char *filename) @@ -1218,7 +1250,7 @@ void diffcore_rename_extended(struct diff_options *options, if (rename_dst[i].filespec_to_free) free_filespec(rename_dst[i].filespec_to_free); - cleanup_dir_rename_info(&info); + cleanup_dir_rename_info(&info, dirs_removed, dir_rename_count != NULL); FREE_AND_NULL(rename_dst); rename_dst_nr = rename_dst_alloc = 0; FREE_AND_NULL(rename_src); -- cgit v0.10.2-6-g49f6 From 1ad69eb0dcfa61e175e7540517bd4e6e7a66822a Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sat, 27 Feb 2021 00:30:46 +0000 Subject: diffcore-rename: compute dir_rename_counts in stages Compute dir_rename_counts based just on exact renames to start, as that can provide us useful information in find_basename_matches(). This is done by moving the code from compute_dir_rename_counts() into initialize_dir_rename_info(), resulting in it being computed earlier and based just on exact renames. Since that's an incomplete result, we augment the counts via calling update_dir_rename_counts() after each basename-guide and inexact rename detection match is found. Reviewed-by: Derrick Stolee Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano diff --git a/diffcore-rename.c b/diffcore-rename.c index 2cf9c47..10f8f4a 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -419,6 +419,28 @@ static void update_dir_rename_counts(struct dir_rename_info *info, char new_dir_first_char = new_dir[0]; int first_time_in_loop = 1; + if (!info->setup) + /* + * info->setup is 0 here in two cases: (1) all auxiliary + * vars (like dirs_removed) were NULL so + * initialize_dir_rename_info() returned early, or (2) + * either break detection or copy detection are active so + * that we never called initialize_dir_rename_info(). In + * the former case, we don't have enough info to know if + * directories were renamed (because dirs_removed lets us + * know about a necessary prerequisite, namely if they were + * removed), and in the latter, we don't care about + * directory renames or find_basename_matches. + * + * This matters because both basename and inexact matching + * will also call update_dir_rename_counts(). In either of + * the above two cases info->dir_rename_counts will not + * have been properly initialized which prevents us from + * updating it, but in these two cases we don't care about + * dir_rename_counts anyway, so we can just exit early. + */ + return; + while (1) { dirname_munge(old_dir); dirname_munge(new_dir); @@ -479,45 +501,29 @@ static void update_dir_rename_counts(struct dir_rename_info *info, free(new_dir); } -static void compute_dir_rename_counts(struct dir_rename_info *info, - struct strset *dirs_removed, - struct strmap *dir_rename_count) +static void initialize_dir_rename_info(struct dir_rename_info *info, + struct strset *dirs_removed, + struct strmap *dir_rename_count) { int i; - info->setup = 1; - info->dir_rename_count = dir_rename_count; - - for (i = 0; i < rename_dst_nr; ++i) { - /* File not part of directory rename counts if not a rename */ - if (!rename_dst[i].is_rename) - continue; - - /* - * Make dir_rename_count contain a map of a map: - * old_directory -> {new_directory -> count} - * In other words, for every pair look at the directories for - * the old filename and the new filename and count how many - * times that pairing occurs. - */ - update_dir_rename_counts(info, dirs_removed, - rename_dst[i].p->one->path, - rename_dst[i].p->two->path); + if (!dirs_removed) { + info->setup = 0; + return; } -} - -static void initialize_dir_rename_info(struct dir_rename_info *info) -{ - int i; - info->setup = 1; + info->dir_rename_count = dir_rename_count; + if (!info->dir_rename_count) { + info->dir_rename_count = xmalloc(sizeof(*dir_rename_count)); + strmap_init(info->dir_rename_count); + } strintmap_init_with_options(&info->idx_map, -1, NULL, 0); strmap_init_with_options(&info->dir_rename_guess, NULL, 0); - info->dir_rename_count = NULL; /* - * Loop setting up both info->idx_map. + * Loop setting up both info->idx_map, and doing setup of + * info->dir_rename_count. */ for (i = 0; i < rename_dst_nr; ++i) { /* @@ -527,7 +533,20 @@ static void initialize_dir_rename_info(struct dir_rename_info *info) if (!rename_dst[i].is_rename) { char *filename = rename_dst[i].p->two->path; strintmap_set(&info->idx_map, filename, i); + continue; } + + /* + * For everything else (i.e. renamed files), make + * dir_rename_count contain a map of a map: + * old_directory -> {new_directory -> count} + * In other words, for every pair look at the directories for + * the old filename and the new filename and count how many + * times that pairing occurs. + */ + update_dir_rename_counts(info, dirs_removed, + rename_dst[i].p->one->path, + rename_dst[i].p->two->path); } } @@ -682,7 +701,8 @@ static int idx_possible_rename(char *filename, struct dir_rename_info *info) static int find_basename_matches(struct diff_options *options, int minimum_score, - struct dir_rename_info *info) + struct dir_rename_info *info, + struct strset *dirs_removed) { /* * When I checked in early 2020, over 76% of file renames in linux @@ -810,6 +830,8 @@ static int find_basename_matches(struct diff_options *options, continue; record_rename_pair(dst_index, src_index, score); renames++; + update_dir_rename_counts(info, dirs_removed, + one->path, two->path); /* * Found a rename so don't need text anymore; if we @@ -893,7 +915,12 @@ static int too_many_rename_candidates(int num_destinations, int num_sources, return 1; } -static int find_renames(struct diff_score *mx, int dst_cnt, int minimum_score, int copies) +static int find_renames(struct diff_score *mx, + int dst_cnt, + int minimum_score, + int copies, + struct dir_rename_info *info, + struct strset *dirs_removed) { int count = 0, i; @@ -910,6 +937,9 @@ static int find_renames(struct diff_score *mx, int dst_cnt, int minimum_score, i continue; record_rename_pair(mx[i].dst, mx[i].src, mx[i].score); count++; + update_dir_rename_counts(info, dirs_removed, + rename_src[mx[i].src].p->one->path, + rename_dst[mx[i].dst].p->two->path); } return count; } @@ -981,6 +1011,8 @@ void diffcore_rename_extended(struct diff_options *options, info.setup = 0; assert(!dir_rename_count || strmap_empty(dir_rename_count)); want_copies = (detect_rename == DIFF_DETECT_COPY); + if (dirs_removed && (break_idx || want_copies)) + BUG("dirs_removed incompatible with break/copy detection"); if (!minimum_score) minimum_score = DEFAULT_RENAME_SCORE; @@ -1074,14 +1106,15 @@ void diffcore_rename_extended(struct diff_options *options, /* Preparation for basename-driven matching. */ trace2_region_enter("diff", "dir rename setup", options->repo); - initialize_dir_rename_info(&info); + initialize_dir_rename_info(&info, + dirs_removed, dir_rename_count); trace2_region_leave("diff", "dir rename setup", options->repo); /* Utilize file basenames to quickly find renames. */ trace2_region_enter("diff", "basename matches", options->repo); rename_count += find_basename_matches(options, min_basename_score, - &info); + &info, dirs_removed); trace2_region_leave("diff", "basename matches", options->repo); /* @@ -1167,18 +1200,15 @@ void diffcore_rename_extended(struct diff_options *options, /* cost matrix sorted by most to least similar pair */ STABLE_QSORT(mx, dst_cnt * NUM_CANDIDATE_PER_DST, score_compare); - rename_count += find_renames(mx, dst_cnt, minimum_score, 0); + rename_count += find_renames(mx, dst_cnt, minimum_score, 0, + &info, dirs_removed); if (want_copies) - rename_count += find_renames(mx, dst_cnt, minimum_score, 1); + rename_count += find_renames(mx, dst_cnt, minimum_score, 1, + &info, dirs_removed); free(mx); trace2_region_leave("diff", "inexact renames", options->repo); cleanup: - /* - * Now that renames have been computed, compute dir_rename_count */ - if (dirs_removed && dir_rename_count) - compute_dir_rename_counts(&info, dirs_removed, dir_rename_count); - /* At this point, we have found some renames and copies and they * are recorded in rename_dst. The original list is still in *q. */ -- cgit v0.10.2-6-g49f6 From 333899e1e3f15010a85588e67a4ef0f664966c44 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sat, 27 Feb 2021 00:30:47 +0000 Subject: diffcore-rename: limit dir_rename_counts computation to relevant dirs We are using dir_rename_counts to count the number of other directories that files within a directory moved to. We only need this information for directories that disappeared, though, so we can return early from update_dir_rename_counts() for other paths. If dirs_removed is passed to diffcore_rename_extended(), then it provides the relevant bits of information for us to limit this counting to relevant dirs. If dirs_removed is not passed, we would need to compute some replacement in order to do this limiting. Introduce a new info->relevant_source_dirs variable for this purpose, even though at this stage we will only set it to dirs_removed for simplicity. Reviewed-by: Derrick Stolee Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano diff --git a/diffcore-rename.c b/diffcore-rename.c index 10f8f4a..e5fa0cb 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -371,6 +371,7 @@ struct dir_rename_info { struct strintmap idx_map; struct strmap dir_rename_guess; struct strmap *dir_rename_count; + struct strset *relevant_source_dirs; unsigned setup; }; @@ -442,7 +443,13 @@ static void update_dir_rename_counts(struct dir_rename_info *info, return; while (1) { + /* Get old_dir, skip if its directory isn't relevant. */ dirname_munge(old_dir); + if (info->relevant_source_dirs && + !strset_contains(info->relevant_source_dirs, old_dir)) + break; + + /* Get new_dir */ dirname_munge(new_dir); /* @@ -521,6 +528,9 @@ static void initialize_dir_rename_info(struct dir_rename_info *info, strintmap_init_with_options(&info->idx_map, -1, NULL, 0); strmap_init_with_options(&info->dir_rename_guess, NULL, 0); + /* Setup info->relevant_source_dirs */ + info->relevant_source_dirs = dirs_removed; + /* * Loop setting up both info->idx_map, and doing setup of * info->dir_rename_count. -- cgit v0.10.2-6-g49f6 From 81afdf7a2e625a7ecfb17c00c871b409e853694d Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sat, 27 Feb 2021 00:30:48 +0000 Subject: diffcore-rename: compute dir_rename_guess from dir_rename_counts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit dir_rename_counts has a mapping of a mapping, in particular, it has old_dir => { new_dir => count } We want a simple mapping of old_dir => new_dir based on which new_dir had the highest count for a given old_dir. Compute this and store it in dir_rename_guess. This is the final piece of the puzzle needed to make our guesses at which directory files have been moved to when basenames aren't unique. For the testcases mentioned in commit 557ac0350d ("merge-ort: begin performance work; instrument with trace2_region_* calls", 2020-10-28), this change improves the performance as follows: Before After no-renames: 12.775 s ± 0.062 s 12.596 s ± 0.061 s mega-renames: 188.754 s ± 0.284 s 130.465 s ± 0.259 s just-one-mega: 5.599 s ± 0.019 s 3.958 s ± 0.010 s Reviewed-by: Derrick Stolee Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano diff --git a/diffcore-rename.c b/diffcore-rename.c index e5fa0cb..1fe902e 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -389,6 +389,24 @@ static void dirname_munge(char *filename) *slash = '\0'; } +static const char *get_highest_rename_path(struct strintmap *counts) +{ + int highest_count = 0; + const char *highest_destination_dir = NULL; + struct hashmap_iter iter; + struct strmap_entry *entry; + + strintmap_for_each_entry(counts, &iter, entry) { + const char *destination_dir = entry->key; + intptr_t count = (intptr_t)entry->value; + if (count > highest_count) { + highest_count = count; + highest_destination_dir = destination_dir; + } + } + return highest_destination_dir; +} + static void increment_count(struct dir_rename_info *info, char *old_dir, char *new_dir) @@ -512,6 +530,8 @@ static void initialize_dir_rename_info(struct dir_rename_info *info, struct strset *dirs_removed, struct strmap *dir_rename_count) { + struct hashmap_iter iter; + struct strmap_entry *entry; int i; if (!dirs_removed) { @@ -558,6 +578,23 @@ static void initialize_dir_rename_info(struct dir_rename_info *info, rename_dst[i].p->one->path, rename_dst[i].p->two->path); } + + /* + * Now we collapse + * dir_rename_count: old_directory -> {new_directory -> count} + * down to + * dir_rename_guess: old_directory -> best_new_directory + * where best_new_directory is the one with the highest count. + */ + strmap_for_each_entry(info->dir_rename_count, &iter, entry) { + /* entry->key is source_dir */ + struct strintmap *counts = entry->value; + char *best_newdir; + + best_newdir = xstrdup(get_highest_rename_path(counts)); + strmap_put(&info->dir_rename_guess, entry->key, + best_newdir); + } } void partial_clear_dir_rename_count(struct strmap *dir_rename_count) @@ -682,10 +719,10 @@ static int idx_possible_rename(char *filename, struct dir_rename_info *info) * rename. * * This function, idx_possible_rename(), is only responsible for (4). - * The conditions/steps in (1)-(3) will be handled via setting up - * dir_rename_count and dir_rename_guess in a future - * initialize_dir_rename_info() function. Steps (0) and (5) are - * handled by the caller of this function. + * The conditions/steps in (1)-(3) are handled via setting up + * dir_rename_count and dir_rename_guess in + * initialize_dir_rename_info(). Steps (0) and (5) are handled by + * the caller of this function. */ char *old_dir, *new_dir; struct strbuf new_path = STRBUF_INIT; -- cgit v0.10.2-6-g49f6