csum-file.h: increase hashfile buffer size

The hashfile API uses a hard-coded buffer size of 8KB and has ever since it was introduced in c38138c (git-pack-objects: write the pack files with a SHA1 csum, 2005-06-26). It performs a similar function to the hashing buffers in read-cache.c, but that code was updated from 8KB to 128KB in f279894 (read-cache: make the index write buffer size 128K, 2021-02-18). The justification there was that do_write_index() improves from 1.02s to 0.72s. Since our end goal is to have the index writing code use the hashfile API, we need to unify this buffer size to avoid a performance regression. There is a buffer, 'check_buffer', that is used to verify the check_fd file descriptor. When this buffer increases to 128K to fit the data being flushed, it causes the stack to overflow the limits placed in the test suite. To avoid issues with stack size, move both 'buffer' and 'check_buffer' to be heap pointers within 'struct hashfile'. The 'check_buffer' member is left as NULL unless check_fd is set in hashfd_check(). Both buffers are cleared as part of finalize_hashfile() which also frees the full structure. Since these buffers are now on the heap, we can adjust their size based on the needs of the consumer. In particular, callers to hashfd_throughput() are expecting to report progress indicators as the buffer flushes. These callers would prefer the smaller 8k buffer to avoid large delays between updates, especially for users with slower networks. When the progress indicator is not used, the larger buffer is preferrable. By adding a new trace2 region in the chunk-format API, we can see that the writing portion of 'git multi-pack-index write' lowers from ~1.49s to ~1.47s on a Linux machine. These effects may be more pronounced or diminished on other filesystems. The end-to-end timing is too noisy to have a definitive change either way. Signed-off-by: Derrick Stolee <dstolee@microsoft.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
author: Derrick Stolee <dstolee@microsoft.com> 2021-05-18 18:32:46 (GMT)
committer: Junio C Hamano <gitster@pobox.com> 2021-05-19 07:41:21 (GMT)
commit: 2ca245f8be56e3269d02076b658e825b91236e5d (patch)
tree: ef5896f7799d8049983f21703c40cf5ecec83c0e /csum-file.c
parent: 68142e117c080b7a563d681dc34c7df2ab945df5 (diff)
download: git-2ca245f8be56e3269d02076b658e825b91236e5d.zip
git-2ca245f8be56e3269d02076b658e825b91236e5d.tar.gz
git-2ca245f8be56e3269d02076b658e825b91236e5d.tar.bz2
1 files changed, 57 insertions, 20 deletions
diff --git a/csum-file.c b/csum-file.c
index 3c26389..3487d28 100644
--- a/csum-file.c
+++ b/csum-file.c
@@ -11,19 +11,24 @@
 #include "progress.h"
 #include "csum-file.h"
 
+static void verify_buffer_or_die(struct hashfile *f,
+				 const void *buf,
+				 unsigned int count)
+{
+	ssize_t ret = read_in_full(f->check_fd, f->check_buffer, count);
+
+	if (ret < 0)
+		die_errno("%s: sha1 file read error", f->name);
+	if (ret != count)
+		die("%s: sha1 file truncated", f->name);
+	if (memcmp(buf, f->check_buffer, count))
+		die("sha1 file '%s' validation error", f->name);
+}
+
 static void flush(struct hashfile *f, const void *buf, unsigned int count)
 {
-	if (0 <= f->check_fd && count)  {
-		unsigned char check_buffer[8192];
-		ssize_t ret = read_in_full(f->check_fd, check_buffer, count);
-
-		if (ret < 0)
-			die_errno("%s: sha1 file read error", f->name);
-		if (ret != count)
-			die("%s: sha1 file truncated", f->name);
-		if (memcmp(buf, check_buffer, count))
-			die("sha1 file '%s' validation error", f->name);
-	}
+	if (0 <= f->check_fd && count)
+		verify_buffer_or_die(f, buf, count);
 
 	if (write_in_full(f->fd, buf, count) < 0) {
 		if (errno == ENOSPC)
@@ -46,6 +51,13 @@ void hashflush(struct hashfile *f)
 	}
 }
 
+static void free_hashfile(struct hashfile *f)
+{
+	free(f->buffer);
+	free(f->check_buffer);
+	free(f);
+}
+
 int finalize_hashfile(struct hashfile *f, unsigned char *result, unsigned int flags)
 {
 	int fd;
@@ -75,20 +87,20 @@ int finalize_hashfile(struct hashfile *f, unsigned char *result, unsigned int fl
 		if (close(f->check_fd))
 			die_errno("%s: sha1 file error on close", f->name);
 	}
-	free(f);
+	free_hashfile(f);
 	return fd;
 }
 
 void hashwrite(struct hashfile *f, const void *buf, unsigned int count)
 {
 	while (count) {
-		unsigned left = sizeof(f->buffer) - f->offset;
+		unsigned left = f->buffer_len - f->offset;
 		unsigned nr = count > left ? left : count;
 
 		if (f->do_crc)
 			f->crc32 = crc32(f->crc32, buf, nr);
 
-		if (nr == sizeof(f->buffer)) {
+		if (nr == f->buffer_len) {
 			/*
 			 * Flush a full batch worth of data directly
 			 * from the input, skipping the memcpy() to
@@ -114,11 +126,6 @@ void hashwrite(struct hashfile *f, const void *buf, unsigned int count)
 	}
 }
 
-struct hashfile *hashfd(int fd, const char *name)
-{
-	return hashfd_throughput(fd, name, NULL);
-}
-
 struct hashfile *hashfd_check(const char *name)
 {
 	int sink, check;
@@ -132,10 +139,14 @@ struct hashfile *hashfd_check(const char *name)
 		die_errno("unable to open '%s'", name);
 	f = hashfd(sink, name);
 	f->check_fd = check;
+	f->check_buffer = xmalloc(f->buffer_len);
+
 	return f;
 }
 
-struct hashfile *hashfd_throughput(int fd, const char *name, struct progress *tp)
+static struct hashfile *hashfd_internal(int fd, const char *name,
+					struct progress *tp,
+					size_t buffer_len)
 {
 	struct hashfile *f = xmalloc(sizeof(*f));
 	f->fd = fd;
@@ -146,9 +157,35 @@ struct hashfile *hashfd_throughput(int fd, const char *name, struct progress *tp
 	f->name = name;
 	f->do_crc = 0;
 	the_hash_algo->init_fn(&f->ctx);
+
+	f->buffer_len = buffer_len;
+	f->buffer = xmalloc(buffer_len);
+	f->check_buffer = NULL;
+
 	return f;
 }
 
+struct hashfile *hashfd(int fd, const char *name)
+{
+	/*
+	 * Since we are not going to use a progress meter to
+	 * measure the rate of data passing through this hashfile,
+	 * use a larger buffer size to reduce fsync() calls.
+	 */
+	return hashfd_internal(fd, name, NULL, 128 * 1024);
+}
+
+struct hashfile *hashfd_throughput(int fd, const char *name, struct progress *tp)
+{
+	/*
+	 * Since we are expecting to report progress of the
+	 * write into this hashfile, use a smaller buffer
+	 * size so the progress indicators arrive at a more
+	 * frequent rate.
+	 */
+	return hashfd_internal(fd, name, tp, 8 * 1024);
+}
+
 void hashfile_checkpoint(struct hashfile *f, struct hashfile_checkpoint *checkpoint)
 {
 	hashflush(f);
author	Derrick Stolee <dstolee@microsoft.com>	2021-05-18 18:32:46 (GMT)
committer	Junio C Hamano <gitster@pobox.com>	2021-05-19 07:41:21 (GMT)
commit	2ca245f8be56e3269d02076b658e825b91236e5d (patch)
tree	ef5896f7799d8049983f21703c40cf5ecec83c0e /csum-file.c
parent	68142e117c080b7a563d681dc34c7df2ab945df5 (diff)
download	git-2ca245f8be56e3269d02076b658e825b91236e5d.zip git-2ca245f8be56e3269d02076b658e825b91236e5d.tar.gz git-2ca245f8be56e3269d02076b658e825b91236e5d.tar.bz2