From a93bedada88dc15b0143708e1cb87c8fe9b9c705 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erik=20Elfstr=C3=B6m?= Date: Tue, 9 Jun 2015 20:24:35 +0200 Subject: setup: add gentle version of read_gitfile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit read_gitfile will die on most error cases. This makes it unsuitable for speculative calls. Extract the core logic and provide a gentle version that returns NULL on failure. The first usecase of the new gentle version will be to probe for submodules during git clean. Helped-by: Junio C Hamano Helped-by: Jeff King Signed-off-by: Erik Elfström Signed-off-by: Junio C Hamano diff --git a/cache.h b/cache.h index 571c98f..25578cb 100644 --- a/cache.h +++ b/cache.h @@ -446,7 +446,16 @@ extern int get_common_dir(struct strbuf *sb, const char *gitdir); extern const char *get_git_namespace(void); extern const char *strip_namespace(const char *namespaced_ref); extern const char *get_git_work_tree(void); -extern const char *read_gitfile(const char *path); + +#define READ_GITFILE_ERR_STAT_FAILED 1 +#define READ_GITFILE_ERR_NOT_A_FILE 2 +#define READ_GITFILE_ERR_OPEN_FAILED 3 +#define READ_GITFILE_ERR_READ_FAILED 4 +#define READ_GITFILE_ERR_INVALID_FORMAT 5 +#define READ_GITFILE_ERR_NO_PATH 6 +#define READ_GITFILE_ERR_NOT_A_REPO 7 +extern const char *read_gitfile_gently(const char *path, int *return_error_code); +#define read_gitfile(path) read_gitfile_gently((path), NULL) extern const char *resolve_gitdir(const char *suspect); extern void set_git_work_tree(const char *tree); diff --git a/setup.c b/setup.c index 863ddfd..4748b63 100644 --- a/setup.c +++ b/setup.c @@ -406,35 +406,53 @@ static void update_linked_gitdir(const char *gitfile, const char *gitdir) /* * Try to read the location of the git directory from the .git file, * return path to git directory if found. + * + * On failure, if return_error_code is not NULL, return_error_code + * will be set to an error code and NULL will be returned. If + * return_error_code is NULL the function will die instead (for most + * cases). */ -const char *read_gitfile(const char *path) +const char *read_gitfile_gently(const char *path, int *return_error_code) { - char *buf; - char *dir; + int error_code = 0; + char *buf = NULL; + char *dir = NULL; const char *slash; struct stat st; int fd; ssize_t len; - if (stat(path, &st)) - return NULL; - if (!S_ISREG(st.st_mode)) - return NULL; + if (stat(path, &st)) { + error_code = READ_GITFILE_ERR_STAT_FAILED; + goto cleanup_return; + } + if (!S_ISREG(st.st_mode)) { + error_code = READ_GITFILE_ERR_NOT_A_FILE; + goto cleanup_return; + } fd = open(path, O_RDONLY); - if (fd < 0) - die_errno("Error opening '%s'", path); + if (fd < 0) { + error_code = READ_GITFILE_ERR_OPEN_FAILED; + goto cleanup_return; + } buf = xmalloc(st.st_size + 1); len = read_in_full(fd, buf, st.st_size); close(fd); - if (len != st.st_size) - die("Error reading %s", path); + if (len != st.st_size) { + error_code = READ_GITFILE_ERR_READ_FAILED; + goto cleanup_return; + } buf[len] = '\0'; - if (!starts_with(buf, "gitdir: ")) - die("Invalid gitfile format: %s", path); + if (!starts_with(buf, "gitdir: ")) { + error_code = READ_GITFILE_ERR_INVALID_FORMAT; + goto cleanup_return; + } while (buf[len - 1] == '\n' || buf[len - 1] == '\r') len--; - if (len < 9) - die("No path in gitfile: %s", path); + if (len < 9) { + error_code = READ_GITFILE_ERR_NO_PATH; + goto cleanup_return; + } buf[len] = '\0'; dir = buf + 8; @@ -448,14 +466,42 @@ const char *read_gitfile(const char *path) free(buf); buf = dir; } - - if (!is_git_directory(dir)) - die("Not a git repository: %s", dir); - + if (!is_git_directory(dir)) { + error_code = READ_GITFILE_ERR_NOT_A_REPO; + goto cleanup_return; + } update_linked_gitdir(path, dir); path = real_path(dir); +cleanup_return: free(buf); + + if (return_error_code) + *return_error_code = error_code; + + if (error_code) { + if (return_error_code) + return NULL; + + switch (error_code) { + case READ_GITFILE_ERR_STAT_FAILED: + case READ_GITFILE_ERR_NOT_A_FILE: + return NULL; + case READ_GITFILE_ERR_OPEN_FAILED: + die_errno("Error opening '%s'", path); + case READ_GITFILE_ERR_READ_FAILED: + die("Error reading %s", path); + case READ_GITFILE_ERR_INVALID_FORMAT: + die("Invalid gitfile format: %s", path); + case READ_GITFILE_ERR_NO_PATH: + die("No path in gitfile: %s", path); + case READ_GITFILE_ERR_NOT_A_REPO: + die("Not a git repository: %s", dir); + default: + assert(0); + } + } + return path; } -- cgit v0.10.2-6-g49f6 From 921bdd96afc17ca055af261066eabdf026cb2195 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erik=20Elfstr=C3=B6m?= Date: Mon, 15 Jun 2015 21:39:52 +0200 Subject: setup: sanity check file size in read_gitfile_gently MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit read_gitfile_gently will allocate a buffer to fit the entire file that should be read. Add a sanity check of the file size before opening to avoid allocating a potentially huge amount of memory if we come across a large file that someone happened to name ".git". The limit is set to a sufficiently unreasonable size that should never be exceeded by a genuine .git file. Signed-off-by: Erik Elfström Signed-off-by: Junio C Hamano diff --git a/cache.h b/cache.h index 25578cb..858d9b3 100644 --- a/cache.h +++ b/cache.h @@ -454,6 +454,7 @@ extern const char *get_git_work_tree(void); #define READ_GITFILE_ERR_INVALID_FORMAT 5 #define READ_GITFILE_ERR_NO_PATH 6 #define READ_GITFILE_ERR_NOT_A_REPO 7 +#define READ_GITFILE_ERR_TOO_LARGE 8 extern const char *read_gitfile_gently(const char *path, int *return_error_code); #define read_gitfile(path) read_gitfile_gently((path), NULL) extern const char *resolve_gitdir(const char *suspect); diff --git a/setup.c b/setup.c index 4748b63..a03ca94 100644 --- a/setup.c +++ b/setup.c @@ -414,6 +414,7 @@ static void update_linked_gitdir(const char *gitfile, const char *gitdir) */ const char *read_gitfile_gently(const char *path, int *return_error_code) { + const int max_file_size = 1 << 20; /* 1MB */ int error_code = 0; char *buf = NULL; char *dir = NULL; @@ -430,6 +431,10 @@ const char *read_gitfile_gently(const char *path, int *return_error_code) error_code = READ_GITFILE_ERR_NOT_A_FILE; goto cleanup_return; } + if (st.st_size > max_file_size) { + error_code = READ_GITFILE_ERR_TOO_LARGE; + goto cleanup_return; + } fd = open(path, O_RDONLY); if (fd < 0) { error_code = READ_GITFILE_ERR_OPEN_FAILED; @@ -489,6 +494,8 @@ cleanup_return: return NULL; case READ_GITFILE_ERR_OPEN_FAILED: die_errno("Error opening '%s'", path); + case READ_GITFILE_ERR_TOO_LARGE: + die("Too large to be a .git file: '%s'", path); case READ_GITFILE_ERR_READ_FAILED: die("Error reading %s", path); case READ_GITFILE_ERR_INVALID_FORMAT: -- cgit v0.10.2-6-g49f6 From 91479b9c72f1ffb13493736954149a34fddaf3aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erik=20Elfstr=C3=B6m?= Date: Mon, 15 Jun 2015 21:39:53 +0200 Subject: t7300: add tests to document behavior of clean and nested git MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Erik Elfström Signed-off-by: Junio C Hamano diff --git a/t/t7300-clean.sh b/t/t7300-clean.sh index 99be5d9..fbfdf2d 100755 --- a/t/t7300-clean.sh +++ b/t/t7300-clean.sh @@ -455,6 +455,148 @@ test_expect_success 'nested git work tree' ' ! test -d bar ' +test_expect_failure 'should clean things that almost look like git but are not' ' + rm -fr almost_git almost_bare_git almost_submodule && + mkdir -p almost_git/.git/objects && + mkdir -p almost_git/.git/refs && + cat >almost_git/.git/HEAD <<-\EOF && + garbage + EOF + cp -r almost_git/.git/ almost_bare_git && + mkdir almost_submodule/ && + cat >almost_submodule/.git <<-\EOF && + garbage + EOF + test_when_finished "rm -rf almost_*" && + ## This will fail due to die("Invalid gitfile format: %s", path); in + ## setup.c:read_gitfile. + git clean -f -d && + test_path_is_missing almost_git && + test_path_is_missing almost_bare_git && + test_path_is_missing almost_submodule +' + +test_expect_success 'should not clean submodules' ' + rm -fr repo to_clean sub1 sub2 && + mkdir repo to_clean && + ( + cd repo && + git init && + test_commit msg hello.world + ) && + git submodule add ./repo/.git sub1 && + git commit -m "sub1" && + git branch before_sub2 && + git submodule add ./repo/.git sub2 && + git commit -m "sub2" && + git checkout before_sub2 && + >to_clean/should_clean.this && + git clean -f -d && + test_path_is_file repo/.git/index && + test_path_is_file repo/hello.world && + test_path_is_file sub1/.git && + test_path_is_file sub1/hello.world && + test_path_is_file sub2/.git && + test_path_is_file sub2/hello.world && + test_path_is_missing to_clean +' + +test_expect_failure 'should avoid cleaning possible submodules' ' + rm -fr to_clean possible_sub1 && + mkdir to_clean possible_sub1 && + test_when_finished "rm -rf possible_sub*" && + echo "gitdir: foo" >possible_sub1/.git && + >possible_sub1/hello.world && + chmod 0 possible_sub1/.git && + >to_clean/should_clean.this && + git clean -f -d && + test_path_is_file possible_sub1/.git && + test_path_is_file possible_sub1/hello.world && + test_path_is_missing to_clean +' + +test_expect_failure 'nested (empty) git should be kept' ' + rm -fr empty_repo to_clean && + git init empty_repo && + mkdir to_clean && + >to_clean/should_clean.this && + git clean -f -d && + test_path_is_file empty_repo/.git/HEAD && + test_path_is_missing to_clean +' + +test_expect_success 'nested bare repositories should be cleaned' ' + rm -fr bare1 bare2 subdir && + git init --bare bare1 && + git clone --local --bare . bare2 && + mkdir subdir && + cp -r bare2 subdir/bare3 && + git clean -f -d && + test_path_is_missing bare1 && + test_path_is_missing bare2 && + test_path_is_missing subdir +' + +test_expect_success 'nested (empty) bare repositories should be cleaned even when in .git' ' + rm -fr strange_bare && + mkdir strange_bare && + git init --bare strange_bare/.git && + git clean -f -d && + test_path_is_missing strange_bare +' + +test_expect_failure 'nested (non-empty) bare repositories should be cleaned even when in .git' ' + rm -fr strange_bare && + mkdir strange_bare && + git clone --local --bare . strange_bare/.git && + git clean -f -d && + test_path_is_missing strange_bare +' + +test_expect_success 'giving path in nested git work tree will remove it' ' + rm -fr repo && + mkdir repo && + ( + cd repo && + git init && + mkdir -p bar/baz && + test_commit msg bar/baz/hello.world + ) && + git clean -f -d repo/bar/baz && + test_path_is_file repo/.git/HEAD && + test_path_is_dir repo/bar/ && + test_path_is_missing repo/bar/baz +' + +test_expect_success 'giving path to nested .git will not remove it' ' + rm -fr repo && + mkdir repo untracked && + ( + cd repo && + git init && + test_commit msg hello.world + ) && + git clean -f -d repo/.git && + test_path_is_file repo/.git/HEAD && + test_path_is_dir repo/.git/refs && + test_path_is_dir repo/.git/objects && + test_path_is_dir untracked/ +' + +test_expect_success 'giving path to nested .git/ will remove contents' ' + rm -fr repo untracked && + mkdir repo untracked && + ( + cd repo && + git init && + test_commit msg hello.world + ) && + git clean -f -d repo/.git/ && + test_path_is_dir repo/.git && + test_dir_is_empty repo/.git && + test_path_is_dir untracked/ +' + test_expect_success 'force removal of nested git work tree' ' rm -fr foo bar baz && mkdir -p foo bar baz/boo && -- cgit v0.10.2-6-g49f6 From f49a5650ab536a31a22c2b48398e0e29bb59269f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erik=20Elfstr=C3=B6m?= Date: Mon, 15 Jun 2015 21:39:54 +0200 Subject: p7300: add performance tests for clean MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The tests are run in dry-run mode to avoid having to restore the test directories for each timed iteration. Using dry-run is an acceptable compromise since we are mostly interested in the initial computation of what to clean and not so much in the cleaning it self. Signed-off-by: Erik Elfström Signed-off-by: Junio C Hamano diff --git a/t/perf/p7300-clean.sh b/t/perf/p7300-clean.sh new file mode 100755 index 0000000..ec94cdd --- /dev/null +++ b/t/perf/p7300-clean.sh @@ -0,0 +1,31 @@ +#!/bin/sh + +test_description="Test git-clean performance" + +. ./perf-lib.sh + +test_perf_default_repo +test_checkout_worktree + +test_expect_success 'setup untracked directory with many sub dirs' ' + rm -rf 500_sub_dirs 100000_sub_dirs clean_test_dir && + mkdir 500_sub_dirs 100000_sub_dirs clean_test_dir && + for i in $(test_seq 1 500) + do + mkdir 500_sub_dirs/dir$i || return $? + done && + for i in $(test_seq 1 200) + do + cp -r 500_sub_dirs 100000_sub_dirs/dir$i || return $? + done +' + +test_perf 'clean many untracked sub dirs, check for nested git' ' + git clean -n -q -f -d 100000_sub_dirs/ +' + +test_perf 'clean many untracked sub dirs, ignore nested git' ' + git clean -n -q -f -f -d 100000_sub_dirs/ +' + +test_done -- cgit v0.10.2-6-g49f6 From 0179ca7a626e0a6c7bf5eaccf88dead307306dee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erik=20Elfstr=C3=B6m?= Date: Mon, 15 Jun 2015 21:39:55 +0200 Subject: clean: improve performance when removing lots of directories MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit "git clean" uses resolve_gitlink_ref() to check for the presence of nested git repositories, but it has the drawback of creating a ref_cache entry for every directory that should potentially be cleaned. The linear search through the ref_cache list causes a massive performance hit for large number of directories. Modify clean.c:remove_dirs to use setup.c:is_git_directory and setup.c:read_gitfile_gently instead. Both these functions will open files and parse contents when they find something that looks like a git repository. This is ok from a performance standpoint since finding repository candidates should be comparatively rare. Using is_git_directory and read_gitfile_gently should give a more standardized check for what is and what isn't a git repository but also gives three behavioral changes. The first change is that we will now detect and avoid cleaning empty nested git repositories (only init run). This is desirable. Second, we will no longer die when cleaning a file named ".git" with garbage content (it will be cleaned instead). This is also desirable. The last change is that we will detect and avoid cleaning empty bare repositories that have been placed in a directory named ".git". This is not desirable but should have no real user impact since we already fail to clean non-empty bare repositories in the same scenario. This is thus deemed acceptable. On top of this we add some extra precautions. If read_gitfile_gently fails to open the git file, read the git file or verify the path in the git file we assume that the path with the git file is a valid repository and avoid cleaning. Update t7300 to reflect these changes in behavior. The time to clean an untracked directory containing 100000 sub directories went from 61s to 1.7s after this change. Helped-by: Jeff King Signed-off-by: Erik Elfström Signed-off-by: Junio C Hamano diff --git a/builtin/clean.c b/builtin/clean.c index 6dcb72e..df53def 100644 --- a/builtin/clean.c +++ b/builtin/clean.c @@ -10,7 +10,6 @@ #include "cache.h" #include "dir.h" #include "parse-options.h" -#include "refs.h" #include "string-list.h" #include "quote.h" #include "column.h" @@ -148,6 +147,31 @@ static int exclude_cb(const struct option *opt, const char *arg, int unset) return 0; } +/* + * Return 1 if the given path is the root of a git repository or + * submodule else 0. Will not return 1 for bare repositories with the + * exception of creating a bare repository in "foo/.git" and calling + * is_git_repository("foo"). + */ +static int is_git_repository(struct strbuf *path) +{ + int ret = 0; + int gitfile_error; + size_t orig_path_len = path->len; + assert(orig_path_len != 0); + if (path->buf[orig_path_len - 1] != '/') + strbuf_addch(path, '/'); + strbuf_addstr(path, ".git"); + if (read_gitfile_gently(path->buf, &gitfile_error) || is_git_directory(path->buf)) + ret = 1; + if (gitfile_error == READ_GITFILE_ERR_OPEN_FAILED || + gitfile_error == READ_GITFILE_ERR_READ_FAILED) + ret = 1; /* This could be a real .git file, take the + * safe option and avoid cleaning */ + strbuf_setlen(path, orig_path_len); + return ret; +} + static int remove_dirs(struct strbuf *path, const char *prefix, int force_flag, int dry_run, int quiet, int *dir_gone) { @@ -155,13 +179,11 @@ static int remove_dirs(struct strbuf *path, const char *prefix, int force_flag, struct strbuf quoted = STRBUF_INIT; struct dirent *e; int res = 0, ret = 0, gone = 1, original_len = path->len, len; - unsigned char submodule_head[20]; struct string_list dels = STRING_LIST_INIT_DUP; *dir_gone = 1; - if ((force_flag & REMOVE_DIR_KEEP_NESTED_GIT) && - !resolve_gitlink_ref(path->buf, "HEAD", submodule_head)) { + if ((force_flag & REMOVE_DIR_KEEP_NESTED_GIT) && is_git_repository(path)) { if (!quiet) { quote_path_relative(path->buf, prefix, "ed); printf(dry_run ? _(msg_would_skip_git_dir) : _(msg_skip_git_dir), diff --git a/t/t7300-clean.sh b/t/t7300-clean.sh index fbfdf2d..32e96da 100755 --- a/t/t7300-clean.sh +++ b/t/t7300-clean.sh @@ -455,7 +455,7 @@ test_expect_success 'nested git work tree' ' ! test -d bar ' -test_expect_failure 'should clean things that almost look like git but are not' ' +test_expect_success 'should clean things that almost look like git but are not' ' rm -fr almost_git almost_bare_git almost_submodule && mkdir -p almost_git/.git/objects && mkdir -p almost_git/.git/refs && @@ -468,8 +468,6 @@ test_expect_failure 'should clean things that almost look like git but are not' garbage EOF test_when_finished "rm -rf almost_*" && - ## This will fail due to die("Invalid gitfile format: %s", path); in - ## setup.c:read_gitfile. git clean -f -d && test_path_is_missing almost_git && test_path_is_missing almost_bare_git && @@ -501,7 +499,7 @@ test_expect_success 'should not clean submodules' ' test_path_is_missing to_clean ' -test_expect_failure 'should avoid cleaning possible submodules' ' +test_expect_success 'should avoid cleaning possible submodules' ' rm -fr to_clean possible_sub1 && mkdir to_clean possible_sub1 && test_when_finished "rm -rf possible_sub*" && @@ -515,7 +513,7 @@ test_expect_failure 'should avoid cleaning possible submodules' ' test_path_is_missing to_clean ' -test_expect_failure 'nested (empty) git should be kept' ' +test_expect_success 'nested (empty) git should be kept' ' rm -fr empty_repo to_clean && git init empty_repo && mkdir to_clean && @@ -537,7 +535,7 @@ test_expect_success 'nested bare repositories should be cleaned' ' test_path_is_missing subdir ' -test_expect_success 'nested (empty) bare repositories should be cleaned even when in .git' ' +test_expect_failure 'nested (empty) bare repositories should be cleaned even when in .git' ' rm -fr strange_bare && mkdir strange_bare && git init --bare strange_bare/.git && -- cgit v0.10.2-6-g49f6 From 38ae8784074852c8e7b651f4f6e44e07466da7e1 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 26 Jun 2015 05:03:31 -0400 Subject: read_gitfile_gently: fix use-after-free The "dir" variable is a pointer into the "buf" array. When we hit the cleanup_return path, the first thing we do is free(buf); but one of the error messages prints "dir", which will access the memory after the free. We can fix this by reorganizing the error path a little. We act on the fatal, error-printing conditions first, as they want to access memory and do not care about freeing. Then we free any memory, and finally return. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano diff --git a/setup.c b/setup.c index a03ca94..97bb5e3 100644 --- a/setup.c +++ b/setup.c @@ -479,19 +479,14 @@ const char *read_gitfile_gently(const char *path, int *return_error_code) path = real_path(dir); cleanup_return: - free(buf); - if (return_error_code) *return_error_code = error_code; - - if (error_code) { - if (return_error_code) - return NULL; - + else if (error_code) { switch (error_code) { case READ_GITFILE_ERR_STAT_FAILED: case READ_GITFILE_ERR_NOT_A_FILE: - return NULL; + /* non-fatal; follow return path */ + break; case READ_GITFILE_ERR_OPEN_FAILED: die_errno("Error opening '%s'", path); case READ_GITFILE_ERR_TOO_LARGE: @@ -509,7 +504,8 @@ cleanup_return: } } - return path; + free(buf); + return error_code ? NULL : path; } static const char *setup_explicit_git_dir(const char *gitdirenv, -- cgit v0.10.2-6-g49f6