From cbc985a1f403a09dd22511aca3e6699d3f3c1c7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlo=20Marcelo=20Arenas=20Bel=C3=B3n?= Date: Tue, 2 Nov 2021 15:46:04 +0000 Subject: test-genzeros: allow more than 2G zeros in Windows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit d5cfd142ec (tests: teach the test-tool to generate NUL bytes and use it, 2019-02-14), add a way to generate zeroes in a portable way without using /dev/zero (needed by HP NonStop), but uses a long variable that is limited to 2^31 in Windows. Use instead a (POSIX/C99) intmax_t that is at least 64bit wide in 64-bit Windows to use in a future test. Signed-off-by: Carlo Marcelo Arenas Belón Signed-off-by: Johannes Schindelin Signed-off-by: Junio C Hamano diff --git a/t/helper/test-genzeros.c b/t/helper/test-genzeros.c index 9532f5b..b1197e9 100644 --- a/t/helper/test-genzeros.c +++ b/t/helper/test-genzeros.c @@ -3,14 +3,14 @@ int cmd__genzeros(int argc, const char **argv) { - long count; + intmax_t count; if (argc > 2) { fprintf(stderr, "usage: %s []\n", argv[0]); return 1; } - count = argc > 1 ? strtol(argv[1], NULL, 0) : -1L; + count = argc > 1 ? strtoimax(argv[1], NULL, 0) : -1; while (count < 0 || count--) { if (putchar(0) == EOF) -- cgit v0.10.2-6-g49f6 From df7000cd910a5f6991f73991b87d838a5d75e2fc Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Tue, 2 Nov 2021 15:46:05 +0000 Subject: test-tool genzeros: generate large amounts of data more efficiently In this developer's tests, producing one gigabyte worth of NULs in a busy loop that writes out individual bytes, unbuffered, took ~27sec. Writing chunked 256kB buffers instead only took ~0.6sec This matters because we are about to introduce a pair of test cases that want to be able to produce 5GB of NULs, and we cannot use `/dev/zero` because of the HP NonStop platform's lack of support for that device. Signed-off-by: Johannes Schindelin Signed-off-by: Junio C Hamano diff --git a/t/helper/test-genzeros.c b/t/helper/test-genzeros.c index b1197e9..8ca988d 100644 --- a/t/helper/test-genzeros.c +++ b/t/helper/test-genzeros.c @@ -3,7 +3,10 @@ int cmd__genzeros(int argc, const char **argv) { + /* static, so that it is NUL-initialized */ + static const char zeros[256 * 1024]; intmax_t count; + ssize_t n; if (argc > 2) { fprintf(stderr, "usage: %s []\n", argv[0]); @@ -12,9 +15,19 @@ int cmd__genzeros(int argc, const char **argv) count = argc > 1 ? strtoimax(argv[1], NULL, 0) : -1; - while (count < 0 || count--) { - if (putchar(0) == EOF) + /* Writing out individual NUL bytes is slow... */ + while (count < 0) + if (write(1, zeros, ARRAY_SIZE(zeros)) < 0) return -1; + + while (count > 0) { + n = write(1, zeros, count < ARRAY_SIZE(zeros) ? + count : ARRAY_SIZE(zeros)); + + if (n < 0) + return -1; + + count -= n; } return 0; -- cgit v0.10.2-6-g49f6 From 970fa57f768a276bddbc4bd1450bb1c3feb20f2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlo=20Marcelo=20Arenas=20Bel=C3=B3n?= Date: Tue, 2 Nov 2021 15:46:06 +0000 Subject: test-lib: add prerequisite for 64-bit platforms MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow tests that assume a 64-bit `size_t` to be skipped in 32-bit platforms and regardless of the size of `long`. This imitates the `LONG_IS_64BIT` prerequisite. Signed-off-by: Carlo Marcelo Arenas Belón Signed-off-by: Johannes Schindelin Signed-off-by: Junio C Hamano diff --git a/t/test-lib.sh b/t/test-lib.sh index adaf035..af1a94c 100644 --- a/t/test-lib.sh +++ b/t/test-lib.sh @@ -1642,6 +1642,10 @@ build_option () { sed -ne "s/^$1: //p" } +test_lazy_prereq SIZE_T_IS_64BIT ' + test 8 -eq "$(build_option sizeof-size_t)" +' + test_lazy_prereq LONG_IS_64BIT ' test 8 -le "$(build_option sizeof-long)" ' -- cgit v0.10.2-6-g49f6 From b79541af7a1a7b7f2438f43196b1774d7b71e852 Mon Sep 17 00:00:00 2001 From: Matt Cooper Date: Tue, 2 Nov 2021 15:46:07 +0000 Subject: t1051: introduce a smudge filter test for extremely large files The filter system allows for alterations to file contents when they're added to the database or working tree. ("Smudge" when moving to the working tree; "clean" when moving to the database.) This is used natively to handle CRLF to LF conversions. It's also employed by Git-LFS to replace large files from the working tree with small tracking files in the repo and vice versa. Git reads the entire smudged file into memory to convert it into a "clean" form to be used in-core. While this is inefficient, there's a more insidious problem on some platforms due to inconsistency between using unsigned long and size_t for the same type of data (size of a file in bytes). On most 64-bit platforms, unsigned long is 64 bits, and size_t is typedef'd to unsigned long. On Windows, however, unsigned long is only 32 bits (and therefore on 64-bit Windows, size_t is typedef'd to unsigned long long in order to be 64 bits). Practically speaking, this means 64-bit Windows users of Git-LFS can't handle files larger than 2^32 bytes. Other 64-bit platforms don't suffer this limitation. This commit introduces a test exposing the issue; future commits make it pass. The test simulates the way Git-LFS works by having a tiny file checked into the repository and expanding it to a huge file on checkout. Helped-by: Johannes Schindelin Signed-off-by: Matt Cooper Signed-off-by: Johannes Schindelin Signed-off-by: Junio C Hamano diff --git a/t/t1051-large-conversion.sh b/t/t1051-large-conversion.sh index 8b7640b..e7f9f0b 100755 --- a/t/t1051-large-conversion.sh +++ b/t/t1051-large-conversion.sh @@ -83,4 +83,19 @@ test_expect_success 'ident converts on output' ' test_cmp small.clean large.clean ' +# This smudge filter prepends 5GB of zeros to the file it checks out. This +# ensures that smudging doesn't mangle large files on 64-bit Windows. +test_expect_failure EXPENSIVE,SIZE_T_IS_64BIT,!LONG_IS_64BIT \ + 'files over 4GB convert on output' ' + test_commit test small "a small file" && + small_size=$(test_file_size small) && + test_config filter.makelarge.smudge \ + "test-tool genzeros $((5*1024*1024*1024)) && cat" && + echo "small filter=makelarge" >.gitattributes && + rm small && + git checkout -- small && + size=$(test_file_size small) && + test "$size" -eq $((5 * 1024 * 1024 * 1024 + $small_size)) +' + test_done -- cgit v0.10.2-6-g49f6 From e9aa762cc72e6cf8fd76fefe5ca2b5064be1a821 Mon Sep 17 00:00:00 2001 From: Matt Cooper Date: Tue, 2 Nov 2021 15:46:08 +0000 Subject: odb: teach read_blob_entry to use size_t There is mixed use of size_t and unsigned long to deal with sizes in the codebase. Recall that Windows defines unsigned long as 32 bits even on 64-bit platforms, meaning that converting size_t to unsigned long narrows the range. This mostly doesn't cause a problem since Git rarely deals with files larger than 2^32 bytes. But adjunct systems such as Git LFS, which use smudge/clean filters to keep huge files out of the repository, may have huge file contents passed through some of the functions in entry.c and convert.c. On Windows, this results in a truncated file being written to the workdir. I traced this to one specific use of unsigned long in write_entry (and a similar instance in write_pc_item_to_fd for parallel checkout). That appeared to be for the call to read_blob_entry, which expects a pointer to unsigned long. By altering the signature of read_blob_entry to expect a size_t, write_entry can be switched to use size_t internally (which all of its callers and most of its callees already used). To avoid touching dozens of additional files, read_blob_entry uses a local unsigned long to call a chain of functions which aren't prepared to accept size_t. Helped-by: Johannes Schindelin Signed-off-by: Matt Cooper Signed-off-by: Johannes Schindelin Signed-off-by: Junio C Hamano diff --git a/entry.c b/entry.c index 711ee06..4cb3942 100644 --- a/entry.c +++ b/entry.c @@ -82,11 +82,13 @@ static int create_file(const char *path, unsigned int mode) return open(path, O_WRONLY | O_CREAT | O_EXCL, mode); } -void *read_blob_entry(const struct cache_entry *ce, unsigned long *size) +void *read_blob_entry(const struct cache_entry *ce, size_t *size) { enum object_type type; - void *blob_data = read_object_file(&ce->oid, &type, size); + unsigned long ul; + void *blob_data = read_object_file(&ce->oid, &type, &ul); + *size = ul; if (blob_data) { if (type == OBJ_BLOB) return blob_data; @@ -270,7 +272,7 @@ static int write_entry(struct cache_entry *ce, char *path, struct conv_attrs *ca int fd, ret, fstat_done = 0; char *new_blob; struct strbuf buf = STRBUF_INIT; - unsigned long size; + size_t size; ssize_t wrote; size_t newsize = 0; struct stat st; diff --git a/entry.h b/entry.h index b8c0e17..61ee8c1 100644 --- a/entry.h +++ b/entry.h @@ -51,7 +51,7 @@ int finish_delayed_checkout(struct checkout *state, int *nr_checkouts); */ void unlink_entry(const struct cache_entry *ce); -void *read_blob_entry(const struct cache_entry *ce, unsigned long *size); +void *read_blob_entry(const struct cache_entry *ce, size_t *size); int fstat_checkout_output(int fd, const struct checkout *state, struct stat *st); void update_ce_after_write(const struct checkout *state, struct cache_entry *ce, struct stat *st); diff --git a/parallel-checkout.c b/parallel-checkout.c index 6b1af32..b6f4a25 100644 --- a/parallel-checkout.c +++ b/parallel-checkout.c @@ -261,7 +261,7 @@ static int write_pc_item_to_fd(struct parallel_checkout_item *pc_item, int fd, struct stream_filter *filter; struct strbuf buf = STRBUF_INIT; char *blob; - unsigned long size; + size_t size; ssize_t wrote; /* Sanity check */ diff --git a/t/t1051-large-conversion.sh b/t/t1051-large-conversion.sh index e7f9f0b..e6d52f9 100755 --- a/t/t1051-large-conversion.sh +++ b/t/t1051-large-conversion.sh @@ -85,7 +85,7 @@ test_expect_success 'ident converts on output' ' # This smudge filter prepends 5GB of zeros to the file it checks out. This # ensures that smudging doesn't mangle large files on 64-bit Windows. -test_expect_failure EXPENSIVE,SIZE_T_IS_64BIT,!LONG_IS_64BIT \ +test_expect_success EXPENSIVE,SIZE_T_IS_64BIT,!LONG_IS_64BIT \ 'files over 4GB convert on output' ' test_commit test small "a small file" && small_size=$(test_file_size small) && -- cgit v0.10.2-6-g49f6 From e2ffeae3f6795939e1af9f8e2b5fa151343eec66 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Tue, 2 Nov 2021 15:46:09 +0000 Subject: git-compat-util: introduce more size_t helpers We will use them in the next commit. Signed-off-by: Johannes Schindelin Signed-off-by: Junio C Hamano diff --git a/git-compat-util.h b/git-compat-util.h index a508dbe..1f41e56 100644 --- a/git-compat-util.h +++ b/git-compat-util.h @@ -113,6 +113,14 @@ #define unsigned_mult_overflows(a, b) \ ((a) && (b) > maximum_unsigned_value_of_type(a) / (a)) +/* + * Returns true if the left shift of "a" by "shift" bits will + * overflow. The type of "a" must be unsigned. + */ +#define unsigned_left_shift_overflows(a, shift) \ + ((shift) < bitsizeof(a) && \ + (a) > maximum_unsigned_value_of_type(a) >> (shift)) + #ifdef __GNUC__ #define TYPEOF(x) (__typeof__(x)) #else @@ -859,6 +867,23 @@ static inline size_t st_sub(size_t a, size_t b) return a - b; } +static inline size_t st_left_shift(size_t a, unsigned shift) +{ + if (unsigned_left_shift_overflows(a, shift)) + die("size_t overflow: %"PRIuMAX" << %u", + (uintmax_t)a, shift); + return a << shift; +} + +static inline unsigned long cast_size_t_to_ulong(size_t a) +{ + if (a != (unsigned long)a) + die("object too large to read on this platform: %" + PRIuMAX" is cut off to %lu", + (uintmax_t)a, (unsigned long)a); + return (unsigned long)a; +} + #ifdef HAVE_ALLOCA_H # include # define xalloca(size) (alloca(size)) -- cgit v0.10.2-6-g49f6 From d6a09e795d77ddc76c5141047340dc3cb62355f4 Mon Sep 17 00:00:00 2001 From: Matt Cooper Date: Tue, 2 Nov 2021 15:46:10 +0000 Subject: odb: guard against data loss checking out a huge file This introduces an additional guard for platforms where `unsigned long` and `size_t` are not of the same size. If the size of an object in the database would overflow `unsigned long`, instead we now exit with an error. A complete fix will have to update _many_ other functions throughout the codebase to use `size_t` instead of `unsigned long`. It will have to be implemented at some stage. This commit puts in a stop-gap for the time being. Helped-by: Johannes Schindelin Signed-off-by: Matt Cooper Signed-off-by: Johannes Schindelin Signed-off-by: Junio C Hamano diff --git a/delta.h b/delta.h index 2df5fe1..8a56ec0 100644 --- a/delta.h +++ b/delta.h @@ -90,15 +90,15 @@ static inline unsigned long get_delta_hdr_size(const unsigned char **datap, const unsigned char *top) { const unsigned char *data = *datap; - unsigned long cmd, size = 0; + size_t cmd, size = 0; int i = 0; do { cmd = *data++; - size |= (cmd & 0x7f) << i; + size |= st_left_shift(cmd & 0x7f, i); i += 7; } while (cmd & 0x80 && data < top); *datap = data; - return size; + return cast_size_t_to_ulong(size); } #endif diff --git a/object-file.c b/object-file.c index f233b44..70e456f 100644 --- a/object-file.c +++ b/object-file.c @@ -1344,7 +1344,7 @@ static int parse_loose_header_extended(const char *hdr, struct object_info *oi, unsigned int flags) { const char *type_buf = hdr; - unsigned long size; + size_t size; int type, type_len = 0; /* @@ -1388,12 +1388,12 @@ static int parse_loose_header_extended(const char *hdr, struct object_info *oi, if (c > 9) break; hdr++; - size = size * 10 + c; + size = st_add(st_mult(size, 10), c); } } if (oi->sizep) - *oi->sizep = size; + *oi->sizep = cast_size_t_to_ulong(size); /* * The length must be followed by a zero byte diff --git a/packfile.c b/packfile.c index 755aa7a..3ccea00 100644 --- a/packfile.c +++ b/packfile.c @@ -1059,7 +1059,7 @@ unsigned long unpack_object_header_buffer(const unsigned char *buf, unsigned long len, enum object_type *type, unsigned long *sizep) { unsigned shift; - unsigned long size, c; + size_t size, c; unsigned long used = 0; c = buf[used++]; @@ -1073,10 +1073,10 @@ unsigned long unpack_object_header_buffer(const unsigned char *buf, break; } c = buf[used++]; - size += (c & 0x7f) << shift; + size = st_add(size, st_left_shift(c & 0x7f, shift)); shift += 7; } - *sizep = size; + *sizep = cast_size_t_to_ulong(size); return used; } -- cgit v0.10.2-6-g49f6 From 596b5e77c960cc57ad2e68407b298411ec5e8cb8 Mon Sep 17 00:00:00 2001 From: Matt Cooper Date: Tue, 2 Nov 2021 15:46:11 +0000 Subject: clean/smudge: allow clean filters to process extremely large files The filter system allows for alterations to file contents when they're moved between the database and the worktree. We already made sure that it is possible for smudge filters to produce contents that are larger than `unsigned long` can represent (which matters on systems where `unsigned long` is narrower than `size_t`, most notably 64-bit Windows). Now we make sure that clean filters can _consume_ contents that are larger than that. Note that this commit only allows clean filters' _input_ to be larger than can be represented by `unsigned long`. This change makes only a very minute dent into the much larger project to teach Git to use `size_t` instead of `unsigned long` wherever appropriate. Helped-by: Johannes Schindelin Signed-off-by: Matt Cooper Signed-off-by: Johannes Schindelin Signed-off-by: Junio C Hamano diff --git a/convert.c b/convert.c index fd9c84b..5ad6dfc 100644 --- a/convert.c +++ b/convert.c @@ -613,7 +613,7 @@ static int crlf_to_worktree(const char *src, size_t len, struct strbuf *buf, struct filter_params { const char *src; - unsigned long size; + size_t size; int fd; const char *cmd; const char *path; diff --git a/t/t1051-large-conversion.sh b/t/t1051-large-conversion.sh index e6d52f9..042b0e4 100755 --- a/t/t1051-large-conversion.sh +++ b/t/t1051-large-conversion.sh @@ -98,4 +98,15 @@ test_expect_success EXPENSIVE,SIZE_T_IS_64BIT,!LONG_IS_64BIT \ test "$size" -eq $((5 * 1024 * 1024 * 1024 + $small_size)) ' +# This clean filter writes down the size of input it receives. By checking against +# the actual size, we ensure that cleaning doesn't mangle large files on 64-bit Windows. +test_expect_success EXPENSIVE,SIZE_T_IS_64BIT,!LONG_IS_64BIT \ + 'files over 4GB convert on input' ' + test-tool genzeros $((5*1024*1024*1024)) >big && + test_config filter.checklarge.clean "wc -c >big.size" && + echo "big filter=checklarge" >.gitattributes && + git add big && + test $(test_file_size big) -eq $(cat big.size) +' + test_done -- cgit v0.10.2-6-g49f6