40 files changed, 1974 insertions, 1463 deletions
diff --git a/reftable/basics.c b/reftable/basics.c
index f761e48..fea711d 100644
--- a/reftable/basics.c
+++ b/reftable/basics.c
@@ -27,7 +27,7 @@ void put_be16(uint8_t *out, uint16_t i)
 	out[1] = (uint8_t)(i & 0xff);
 }
 
-int binsearch(size_t sz, int (*f)(size_t k, void *args), void *args)
+size_t binsearch(size_t sz, int (*f)(size_t k, void *args), void *args)
 {
 	size_t lo = 0;
 	size_t hi = sz;
@@ -39,8 +39,11 @@ int binsearch(size_t sz, int (*f)(size_t k, void *args), void *args)
 	 */
 	while (hi - lo > 1) {
 		size_t mid = lo + (hi - lo) / 2;
+		int ret = f(mid, args);
+		if (ret < 0)
+			return sz;
 
-		if (f(mid, args))
+		if (ret > 0)
 			hi = mid;
 		else
 			lo = mid;
@@ -64,12 +67,11 @@ void free_names(char **a)
 	reftable_free(a);
 }
 
-int names_length(char **names)
+size_t names_length(char **names)
 {
 	char **p = names;
-	for (; *p; p++) {
-		/* empty */
-	}
+	while (*p)
+		p++;
 	return p - names;
 }
 
@@ -89,17 +91,13 @@ void parse_names(char *buf, int size, char ***namesp)
 			next = end;
 		}
 		if (p < next) {
-			if (names_len == names_cap) {
-				names_cap = 2 * names_cap + 1;
-				names = reftable_realloc(
-					names, names_cap * sizeof(*names));
-			}
+			REFTABLE_ALLOC_GROW(names, names_len + 1, names_cap);
 			names[names_len++] = xstrdup(p);
 		}
 		p = next + 1;
 	}
 
-	names = reftable_realloc(names, (names_len + 1) * sizeof(*names));
+	REFTABLE_REALLOC_ARRAY(names, names_len + 1);
 	names[names_len] = NULL;
 	*namesp = names;
 }
diff --git a/reftable/basics.h b/reftable/basics.h
index 096b368..523ecd5 100644
--- a/reftable/basics.h
+++ b/reftable/basics.h
@@ -22,13 +22,14 @@ uint32_t get_be24(uint8_t *in);
 void put_be16(uint8_t *out, uint16_t i);
 
 /*
- * find smallest index i in [0, sz) at which f(i) is true, assuming
- * that f is ascending. Return sz if f(i) is false for all indices.
+ * find smallest index i in [0, sz) at which `f(i) > 0`, assuming that f is
+ * ascending. Return sz if `f(i) == 0` for all indices. The search is aborted
+ * and `sz` is returned in case `f(i) < 0`.
  *
  * Contrary to bsearch(3), this returns something useful if the argument is not
  * found.
  */
-int binsearch(size_t sz, int (*f)(size_t k, void *args), void *args);
+size_t binsearch(size_t sz, int (*f)(size_t k, void *args), void *args);
 
 /*
  * Frees a NULL terminated array of malloced strings. The array itself is also
@@ -44,14 +45,27 @@ void parse_names(char *buf, int size, char ***namesp);
 int names_equal(char **a, char **b);
 
 /* returns the array size of a NULL-terminated array of strings. */
-int names_length(char **names);
+size_t names_length(char **names);
 
 /* Allocation routines; they invoke the functions set through
  * reftable_set_alloc() */
 void *reftable_malloc(size_t sz);
 void *reftable_realloc(void *p, size_t sz);
 void reftable_free(void *p);
-void *reftable_calloc(size_t sz);
+void *reftable_calloc(size_t nelem, size_t elsize);
+
+#define REFTABLE_ALLOC_ARRAY(x, alloc) (x) = reftable_malloc(st_mult(sizeof(*(x)), (alloc)))
+#define REFTABLE_CALLOC_ARRAY(x, alloc) (x) = reftable_calloc((alloc), sizeof(*(x)))
+#define REFTABLE_REALLOC_ARRAY(x, alloc) (x) = reftable_realloc((x), st_mult(sizeof(*(x)), (alloc)))
+#define REFTABLE_ALLOC_GROW(x, nr, alloc) \
+	do { \
+		if ((nr) > alloc) { \
+			alloc = 2 * (alloc) + 1; \
+			if (alloc < (nr)) \
+				alloc = (nr); \
+			REFTABLE_REALLOC_ARRAY(x, alloc); \
+		} \
+	} while (0)
 
 /* Find the longest shared prefix size of `a` and `b` */
 struct strbuf;
diff --git a/reftable/basics_test.c b/reftable/basics_test.c
index 1fcd229..997c4d9 100644
--- a/reftable/basics_test.c
+++ b/reftable/basics_test.c
@@ -12,40 +12,47 @@ https://developers.google.com/open-source/licenses/bsd
 #include "test_framework.h"
 #include "reftable-tests.h"
 
-struct binsearch_args {
-	int key;
-	int *arr;
+struct integer_needle_lesseq_args {
+	int needle;
+	int *haystack;
 };
 
-static int binsearch_func(size_t i, void *void_args)
+static int integer_needle_lesseq(size_t i, void *_args)
 {
-	struct binsearch_args *args = void_args;
-
-	return args->key < args->arr[i];
+	struct integer_needle_lesseq_args *args = _args;
+	return args->needle <= args->haystack[i];
 }
 
 static void test_binsearch(void)
 {
-	int arr[] = { 2, 4, 6, 8, 10 };
-	size_t sz = ARRAY_SIZE(arr);
-	struct binsearch_args args = {
-		.arr = arr,
+	int haystack[] = { 2, 4, 6, 8, 10 };
+	struct {
+		int needle;
+		size_t expected_idx;
+	} testcases[] = {
+		{-9000, 0},
+		{-1, 0},
+		{0, 0},
+		{2, 0},
+		{3, 1},
+		{4, 1},
+		{7, 3},
+		{9, 4},
+		{10, 4},
+		{11, 5},
+		{9000, 5},
 	};
+	size_t i = 0;
 
-	int i = 0;
-	for (i = 1; i < 11; i++) {
-		int res;
-		args.key = i;
-		res = binsearch(sz, &binsearch_func, &args);
+	for (i = 0; i < ARRAY_SIZE(testcases); i++) {
+		struct integer_needle_lesseq_args args = {
+			.haystack = haystack,
+			.needle = testcases[i].needle,
+		};
+		size_t idx;
 
-		if (res < sz) {
-			EXPECT(args.key < arr[res]);
-			if (res > 0) {
-				EXPECT(args.key >= arr[res - 1]);
-			}
-		} else {
-			EXPECT(args.key == 10 || args.key == 11);
-		}
+		idx = binsearch(ARRAY_SIZE(haystack), &integer_needle_lesseq, &args);
+		EXPECT(idx == testcases[i].expected_idx);
 	}
 }
 
diff --git a/reftable/block.c b/reftable/block.c
index 34d4d07..3e87460 100644
--- a/reftable/block.c
+++ b/reftable/block.c
@@ -51,12 +51,7 @@ static int block_writer_register_restart(struct block_writer *w, int n,
 	if (2 + 3 * rlen + n > w->block_size - w->next)
 		return -1;
 	if (is_restart) {
-		if (w->restart_len == w->restart_cap) {
-			w->restart_cap = w->restart_cap * 2 + 1;
-			w->restarts = reftable_realloc(
-				w->restarts, sizeof(uint32_t) * w->restart_cap);
-		}
-
+		REFTABLE_ALLOC_GROW(w->restarts, w->restart_len + 1, w->restart_cap);
 		w->restarts[w->restart_len++] = w->next;
 	}
 
@@ -148,8 +143,10 @@ int block_writer_finish(struct block_writer *w)
 		int block_header_skip = 4 + w->header_off;
 		uLongf src_len = w->next - block_header_skip;
 		uLongf dest_cap = src_len * 1.001 + 12;
+		uint8_t *compressed;
+
+		REFTABLE_ALLOC_ARRAY(compressed, dest_cap);
 
-		uint8_t *compressed = reftable_malloc(dest_cap);
 		while (1) {
 			uLongf out_dest_len = dest_cap;
 			int zresult = compress2(compressed, &out_dest_len,
@@ -178,11 +175,6 @@ int block_writer_finish(struct block_writer *w)
 	return w->next;
 }
 
-uint8_t block_reader_type(struct block_reader *r)
-{
-	return r->block.data[r->header_off];
-}
-
 int block_reader_init(struct block_reader *br, struct reftable_block *block,
 		      uint32_t header_off, uint32_t table_block_size,
 		      int hash_size)
@@ -194,7 +186,8 @@ int block_reader_init(struct block_reader *br, struct reftable_block *block,
 	uint16_t restart_count = 0;
 	uint32_t restart_start = 0;
 	uint8_t *restart_bytes = NULL;
-	uint8_t *uncompressed = NULL;
+
+	reftable_block_done(&br->block);
 
 	if (!reftable_is_block_type(typ)) {
 		err =  REFTABLE_FORMAT_ERROR;
@@ -202,37 +195,57 @@ int block_reader_init(struct block_reader *br, struct reftable_block *block,
 	}
 
 	if (typ == BLOCK_TYPE_LOG) {
-		int block_header_skip = 4 + header_off;
-		uLongf dst_len = sz - block_header_skip; /* total size of dest
-							    buffer. */
-		uLongf src_len = block->len - block_header_skip;
-		/* Log blocks specify the *uncompressed* size in their header.
-		 */
-		uncompressed = reftable_malloc(sz);
+		uint32_t block_header_skip = 4 + header_off;
+		uLong dst_len = sz - block_header_skip;
+		uLong src_len = block->len - block_header_skip;
+
+		/* Log blocks specify the *uncompressed* size in their header. */
+		REFTABLE_ALLOC_GROW(br->uncompressed_data, sz,
+				    br->uncompressed_cap);
 
 		/* Copy over the block header verbatim. It's not compressed. */
-		memcpy(uncompressed, block->data, block_header_skip);
+		memcpy(br->uncompressed_data, block->data, block_header_skip);
 
-		/* Uncompress */
-		if (Z_OK !=
-		    uncompress2(uncompressed + block_header_skip, &dst_len,
-				block->data + block_header_skip, &src_len)) {
+		if (!br->zstream) {
+			REFTABLE_CALLOC_ARRAY(br->zstream, 1);
+			err = inflateInit(br->zstream);
+		} else {
+			err = inflateReset(br->zstream);
+		}
+		if (err != Z_OK) {
+			err = REFTABLE_ZLIB_ERROR;
+			goto done;
+		}
+
+		br->zstream->next_in = block->data + block_header_skip;
+		br->zstream->avail_in = src_len;
+		br->zstream->next_out = br->uncompressed_data + block_header_skip;
+		br->zstream->avail_out = dst_len;
+
+		/*
+		 * We know both input as well as output size, and we know that
+		 * the sizes should never be bigger than `uInt_MAX` because
+		 * blocks can at most be 16MB large. We can thus use `Z_FINISH`
+		 * here to instruct zlib to inflate the data in one go, which
+		 * is more efficient than using `Z_NO_FLUSH`.
+		 */
+		err = inflate(br->zstream, Z_FINISH);
+		if (err != Z_STREAM_END) {
 			err = REFTABLE_ZLIB_ERROR;
 			goto done;
 		}
+		err = 0;
 
-		if (dst_len + block_header_skip != sz) {
+		if (br->zstream->total_out + block_header_skip != sz) {
 			err = REFTABLE_FORMAT_ERROR;
 			goto done;
 		}
 
 		/* We're done with the input data. */
 		reftable_block_done(block);
-		block->data = uncompressed;
-		uncompressed = NULL;
+		block->data = br->uncompressed_data;
 		block->len = sz;
-		block->source = malloc_block_source();
-		full_block_size = src_len + block_header_skip;
+		full_block_size = src_len + block_header_skip - br->zstream->avail_in;
 	} else if (full_block_size == 0) {
 		full_block_size = sz;
 	} else if (sz < full_block_size && sz < block->len &&
@@ -260,176 +273,242 @@ int block_reader_init(struct block_reader *br, struct reftable_block *block,
 	br->restart_bytes = restart_bytes;
 
 done:
-	reftable_free(uncompressed);
 	return err;
 }
 
-static uint32_t block_reader_restart_offset(struct block_reader *br, int i)
+void block_reader_release(struct block_reader *br)
+{
+	inflateEnd(br->zstream);
+	reftable_free(br->zstream);
+	reftable_free(br->uncompressed_data);
+	reftable_block_done(&br->block);
+}
+
+uint8_t block_reader_type(const struct block_reader *r)
+{
+	return r->block.data[r->header_off];
+}
+
+int block_reader_first_key(const struct block_reader *br, struct strbuf *key)
+{
+	int off = br->header_off + 4, n;
+	struct string_view in = {
+		.buf = br->block.data + off,
+		.len = br->block_len - off,
+	};
+	uint8_t extra = 0;
+
+	strbuf_reset(key);
+
+	n = reftable_decode_key(key, &extra, in);
+	if (n < 0)
+		return n;
+	if (!key->len)
+		return REFTABLE_FORMAT_ERROR;
+
+	return 0;
+}
+
+static uint32_t block_reader_restart_offset(const struct block_reader *br, int i)
 {
 	return get_be24(br->restart_bytes + 3 * i);
 }
 
-void block_reader_start(struct block_reader *br, struct block_iter *it)
+void block_iter_seek_start(struct block_iter *it, const struct block_reader *br)
 {
-	it->br = br;
+	it->block = br->block.data;
+	it->block_len = br->block_len;
+	it->hash_size = br->hash_size;
 	strbuf_reset(&it->last_key);
 	it->next_off = br->header_off + 4;
 }
 
-struct restart_find_args {
+struct restart_needle_less_args {
 	int error;
-	struct strbuf key;
-	struct block_reader *r;
+	struct strbuf needle;
+	const struct block_reader *reader;
 };
 
-static int restart_key_less(size_t idx, void *args)
+static int restart_needle_less(size_t idx, void *_args)
 {
-	struct restart_find_args *a = args;
-	uint32_t off = block_reader_restart_offset(a->r, idx);
+	struct restart_needle_less_args *args = _args;
+	uint32_t off = block_reader_restart_offset(args->reader, idx);
 	struct string_view in = {
-		.buf = a->r->block.data + off,
-		.len = a->r->block_len - off,
+		.buf = args->reader->block.data + off,
+		.len = args->reader->block_len - off,
 	};
-
-	/* the restart key is verbatim in the block, so this could avoid the
-	   alloc for decoding the key */
-	struct strbuf rkey = STRBUF_INIT;
-	struct strbuf last_key = STRBUF_INIT;
-	uint8_t unused_extra;
-	int n = reftable_decode_key(&rkey, &unused_extra, last_key, in);
-	int result;
-	if (n < 0) {
-		a->error = 1;
+	uint64_t prefix_len, suffix_len;
+	uint8_t extra;
+	int n;
+
+	/*
+	 * Records at restart points are stored without prefix compression, so
+	 * there is no need to fully decode the record key here. This removes
+	 * the need for allocating memory.
+	 */
+	n = reftable_decode_keylen(in, &prefix_len, &suffix_len, &extra);
+	if (n < 0 || prefix_len) {
+		args->error = 1;
 		return -1;
 	}
 
-	result = strbuf_cmp(&a->key, &rkey);
-	strbuf_release(&rkey);
-	return result;
-}
+	string_view_consume(&in, n);
+	if (suffix_len > in.len) {
+		args->error = 1;
+		return -1;
+	}
 
-void block_iter_copy_from(struct block_iter *dest, struct block_iter *src)
-{
-	dest->br = src->br;
-	dest->next_off = src->next_off;
-	strbuf_reset(&dest->last_key);
-	strbuf_addbuf(&dest->last_key, &src->last_key);
+	n = memcmp(args->needle.buf, in.buf,
+		   args->needle.len < suffix_len ? args->needle.len : suffix_len);
+	if (n)
+		return n < 0;
+	return args->needle.len < suffix_len;
 }
 
 int block_iter_next(struct block_iter *it, struct reftable_record *rec)
 {
 	struct string_view in = {
-		.buf = it->br->block.data + it->next_off,
-		.len = it->br->block_len - it->next_off,
+		.buf = (unsigned char *) it->block + it->next_off,
+		.len = it->block_len - it->next_off,
 	};
 	struct string_view start = in;
-	struct strbuf key = STRBUF_INIT;
 	uint8_t extra = 0;
 	int n = 0;
 
-	if (it->next_off >= it->br->block_len)
+	if (it->next_off >= it->block_len)
 		return 1;
 
-	n = reftable_decode_key(&key, &extra, it->last_key, in);
+	n = reftable_decode_key(&it->last_key, &extra, in);
 	if (n < 0)
 		return -1;
-
-	if (!key.len)
+	if (!it->last_key.len)
 		return REFTABLE_FORMAT_ERROR;
 
 	string_view_consume(&in, n);
-	n = reftable_record_decode(rec, key, extra, in, it->br->hash_size);
+	n = reftable_record_decode(rec, it->last_key, extra, in, it->hash_size,
+				   &it->scratch);
 	if (n < 0)
 		return -1;
 	string_view_consume(&in, n);
 
-	strbuf_reset(&it->last_key);
-	strbuf_addbuf(&it->last_key, &key);
 	it->next_off += start.len - in.len;
-	strbuf_release(&key);
 	return 0;
 }
 
-int block_reader_first_key(struct block_reader *br, struct strbuf *key)
+void block_iter_reset(struct block_iter *it)
 {
-	struct strbuf empty = STRBUF_INIT;
-	int off = br->header_off + 4;
-	struct string_view in = {
-		.buf = br->block.data + off,
-		.len = br->block_len - off,
-	};
-
-	uint8_t extra = 0;
-	int n = reftable_decode_key(key, &extra, empty, in);
-	if (n < 0)
-		return n;
-	if (!key->len)
-		return REFTABLE_FORMAT_ERROR;
-
-	return 0;
-}
-
-int block_iter_seek(struct block_iter *it, struct strbuf *want)
-{
-	return block_reader_seek(it->br, it, want);
+	strbuf_reset(&it->last_key);
+	it->next_off = 0;
+	it->block = NULL;
+	it->block_len = 0;
+	it->hash_size = 0;
 }
 
 void block_iter_close(struct block_iter *it)
 {
 	strbuf_release(&it->last_key);
+	strbuf_release(&it->scratch);
 }
 
-int block_reader_seek(struct block_reader *br, struct block_iter *it,
-		      struct strbuf *want)
+int block_iter_seek_key(struct block_iter *it, const struct block_reader *br,
+			struct strbuf *want)
 {
-	struct restart_find_args args = {
-		.key = *want,
-		.r = br,
+	struct restart_needle_less_args args = {
+		.needle = *want,
+		.reader = br,
 	};
-	struct reftable_record rec = reftable_new_record(block_reader_type(br));
-	struct strbuf key = STRBUF_INIT;
+	struct reftable_record rec;
 	int err = 0;
-	struct block_iter next = {
-		.last_key = STRBUF_INIT,
-	};
-
-	int i = binsearch(br->restart_count, &restart_key_less, &args);
+	size_t i;
+
+	/*
+	 * Perform a binary search over the block's restart points, which
+	 * avoids doing a linear scan over the whole block. Like this, we
+	 * identify the section of the block that should contain our key.
+	 *
+	 * Note that we explicitly search for the first restart point _greater_
+	 * than the sought-after record, not _greater or equal_ to it. In case
+	 * the sought-after record is located directly at the restart point we
+	 * would otherwise start doing the linear search at the preceding
+	 * restart point. While that works alright, we would end up scanning
+	 * too many record.
+	 */
+	i = binsearch(br->restart_count, &restart_needle_less, &args);
 	if (args.error) {
 		err = REFTABLE_FORMAT_ERROR;
 		goto done;
 	}
 
-	it->br = br;
-	if (i > 0) {
-		i--;
-		it->next_off = block_reader_restart_offset(br, i);
-	} else {
+	/*
+	 * Now there are multiple cases:
+	 *
+	 *   - `i == 0`: The wanted record is smaller than the record found at
+	 *     the first restart point. As the first restart point is the first
+	 *     record in the block, our wanted record cannot be located in this
+	 *     block at all. We still need to position the iterator so that the
+	 *     next call to `block_iter_next()` will yield an end-of-iterator
+	 *     signal.
+	 *
+	 *   - `i == restart_count`: The wanted record was not found at any of
+	 *     the restart points. As there is no restart point at the end of
+	 *     the section the record may thus be contained in the last block.
+	 *
+	 *   - `i > 0`: The wanted record must be contained in the section
+	 *     before the found restart point. We thus do a linear search
+	 *     starting from the preceding restart point.
+	 */
+	if (i > 0)
+		it->next_off = block_reader_restart_offset(br, i - 1);
+	else
 		it->next_off = br->header_off + 4;
-	}
-
-	/* We're looking for the last entry less/equal than the wanted key, so
-	   we have to go one entry too far and then back up.
-	*/
+	it->block = br->block.data;
+	it->block_len = br->block_len;
+	it->hash_size = br->hash_size;
+
+	reftable_record_init(&rec, block_reader_type(br));
+
+	/*
+	 * We're looking for the last entry less than the wanted key so that
+	 * the next call to `block_reader_next()` would yield the wanted
+	 * record. We thus don't want to position our reader at the sought
+	 * after record, but one before. To do so, we have to go one entry too
+	 * far and then back up.
+	 */
 	while (1) {
-		block_iter_copy_from(&next, it);
-		err = block_iter_next(&next, &rec);
+		size_t prev_off = it->next_off;
+
+		err = block_iter_next(it, &rec);
 		if (err < 0)
 			goto done;
-
-		reftable_record_key(&rec, &key);
-		if (err > 0 || strbuf_cmp(&key, want) >= 0) {
+		if (err > 0) {
+			it->next_off = prev_off;
 			err = 0;
 			goto done;
 		}
 
-		block_iter_copy_from(it, &next);
+		/*
+		 * Check whether the current key is greater or equal to the
+		 * sought-after key. In case it is greater we know that the
+		 * record does not exist in the block and can thus abort early.
+		 * In case it is equal to the sought-after key we have found
+		 * the desired record.
+		 *
+		 * Note that we store the next record's key record directly in
+		 * `last_key` without restoring the key of the preceding record
+		 * in case we need to go one record back. This is safe to do as
+		 * `block_iter_next()` would return the ref whose key is equal
+		 * to `last_key` now, and naturally all keys share a prefix
+		 * with themselves.
+		 */
+		reftable_record_key(&rec, &it->last_key);
+		if (strbuf_cmp(&it->last_key, want) >= 0) {
+			it->next_off = prev_off;
+			goto done;
+		}
 	}
 
 done:
-	strbuf_release(&key);
-	strbuf_release(&next.last_key);
 	reftable_record_release(&rec);
-
 	return err;
 }
 
diff --git a/reftable/block.h b/reftable/block.h
index 87c7753..ea4384a 100644
--- a/reftable/block.h
+++ b/reftable/block.h
@@ -56,6 +56,8 @@ int block_writer_finish(struct block_writer *w);
 /* clears out internally allocated block_writer members. */
 void block_writer_release(struct block_writer *bw);
 
+struct z_stream;
+
 /* Read a block. */
 struct block_reader {
 	/* offset of the block header; nonzero for the first block in a
@@ -66,6 +68,11 @@ struct block_reader {
 	struct reftable_block block;
 	int hash_size;
 
+	/* Uncompressed data for log entries. */
+	z_stream *zstream;
+	unsigned char *uncompressed_data;
+	size_t uncompressed_cap;
+
 	/* size of the data, excluding restart data. */
 	uint32_t block_len;
 	uint8_t *restart_bytes;
@@ -76,41 +83,49 @@ struct block_reader {
 	uint32_t full_block_size;
 };
 
+/* initializes a block reader. */
+int block_reader_init(struct block_reader *br, struct reftable_block *bl,
+		      uint32_t header_off, uint32_t table_block_size,
+		      int hash_size);
+
+void block_reader_release(struct block_reader *br);
+
+/* Returns the block type (eg. 'r' for refs) */
+uint8_t block_reader_type(const struct block_reader *r);
+
+/* Decodes the first key in the block */
+int block_reader_first_key(const struct block_reader *br, struct strbuf *key);
+
 /* Iterate over entries in a block */
 struct block_iter {
 	/* offset within the block of the next entry to read. */
 	uint32_t next_off;
-	struct block_reader *br;
+	const unsigned char *block;
+	size_t block_len;
+	int hash_size;
 
 	/* key for last entry we read. */
 	struct strbuf last_key;
+	struct strbuf scratch;
 };
 
-/* initializes a block reader. */
-int block_reader_init(struct block_reader *br, struct reftable_block *bl,
-		      uint32_t header_off, uint32_t table_block_size,
-		      int hash_size);
+#define BLOCK_ITER_INIT { \
+	.last_key = STRBUF_INIT, \
+	.scratch = STRBUF_INIT, \
+}
 
 /* Position `it` at start of the block */
-void block_reader_start(struct block_reader *br, struct block_iter *it);
+void block_iter_seek_start(struct block_iter *it, const struct block_reader *br);
 
 /* Position `it` to the `want` key in the block */
-int block_reader_seek(struct block_reader *br, struct block_iter *it,
-		      struct strbuf *want);
-
-/* Returns the block type (eg. 'r' for refs) */
-uint8_t block_reader_type(struct block_reader *r);
-
-/* Decodes the first key in the block */
-int block_reader_first_key(struct block_reader *br, struct strbuf *key);
-
-void block_iter_copy_from(struct block_iter *dest, struct block_iter *src);
+int block_iter_seek_key(struct block_iter *it, const struct block_reader *br,
+			struct strbuf *want);
 
 /* return < 0 for error, 0 for OK, > 0 for EOF. */
 int block_iter_next(struct block_iter *it, struct reftable_record *rec);
 
-/* Seek to `want` with in the block pointed to by `it` */
-int block_iter_seek(struct block_iter *it, struct strbuf *want);
+/* Reset the block iterator to pristine state without releasing its memory. */
+void block_iter_reset(struct block_iter *it);
 
 /* deallocate memory for `it`. The block reader and its block is left intact. */
 void block_iter_close(struct block_iter *it);
diff --git a/reftable/block_test.c b/reftable/block_test.c
index cb88af4..26a9cfb 100644
--- a/reftable/block_test.c
+++ b/reftable/block_test.c
@@ -32,11 +32,11 @@ static void test_block_read_write(void)
 	int i = 0;
 	int n;
 	struct block_reader br = { 0 };
-	struct block_iter it = { .last_key = STRBUF_INIT };
+	struct block_iter it = BLOCK_ITER_INIT;
 	int j = 0;
 	struct strbuf want = STRBUF_INIT;
 
-	block.data = reftable_calloc(block_size);
+	REFTABLE_CALLOC_ARRAY(block.data, block_size);
 	block.len = block_size;
 	block.source = malloc_block_source();
 	block_writer_init(&bw, BLOCK_TYPE_REF, block.data, block_size,
@@ -49,13 +49,11 @@ static void test_block_read_write(void)
 
 	for (i = 0; i < N; i++) {
 		char name[100];
-		uint8_t hash[GIT_SHA1_RAWSZ];
 		snprintf(name, sizeof(name), "branch%02d", i);
-		memset(hash, i, sizeof(hash));
 
 		rec.u.ref.refname = name;
 		rec.u.ref.value_type = REFTABLE_REF_VAL1;
-		rec.u.ref.value.val1 = hash;
+		memset(rec.u.ref.value.val1, i, GIT_SHA1_RAWSZ);
 
 		names[i] = xstrdup(name);
 		n = block_writer_add(&bw, &rec);
@@ -71,7 +69,7 @@ static void test_block_read_write(void)
 
 	block_reader_init(&br, &block, header_off, block_size, GIT_SHA1_RAWSZ);
 
-	block_reader_start(&br, &it);
+	block_iter_seek_start(&it, &br);
 
 	while (1) {
 		int r = block_iter_next(&it, &rec);
@@ -87,11 +85,11 @@ static void test_block_read_write(void)
 	block_iter_close(&it);
 
 	for (i = 0; i < N; i++) {
-		struct block_iter it = { .last_key = STRBUF_INIT };
+		struct block_iter it = BLOCK_ITER_INIT;
 		strbuf_reset(&want);
 		strbuf_addstr(&want, names[i]);
 
-		n = block_reader_seek(&br, &it, &want);
+		n = block_iter_seek_key(&it, &br, &want);
 		EXPECT(n == 0);
 
 		n = block_iter_next(&it, &rec);
@@ -100,7 +98,7 @@ static void test_block_read_write(void)
 		EXPECT_STREQ(names[i], rec.u.ref.refname);
 
 		want.len--;
-		n = block_reader_seek(&br, &it, &want);
+		n = block_iter_seek_key(&it, &br, &want);
 		EXPECT(n == 0);
 
 		n = block_iter_next(&it, &rec);
diff --git a/reftable/blocksource.c b/reftable/blocksource.c
index 8331b34..eeed254 100644
--- a/reftable/blocksource.c
+++ b/reftable/blocksource.c
@@ -29,7 +29,7 @@ static int strbuf_read_block(void *v, struct reftable_block *dest, uint64_t off,
 {
 	struct strbuf *b = v;
 	assert(off + size <= b->len);
-	dest->data = reftable_calloc(size);
+	REFTABLE_CALLOC_ARRAY(dest->data, size);
 	memcpy(dest->data, b->buf + off, size);
 	dest->len = size;
 	return size;
@@ -76,8 +76,8 @@ struct reftable_block_source malloc_block_source(void)
 }
 
 struct file_block_source {
-	int fd;
 	uint64_t size;
+	unsigned char *data;
 };
 
 static uint64_t file_size(void *b)
@@ -87,19 +87,12 @@ static uint64_t file_size(void *b)
 
 static void file_return_block(void *b, struct reftable_block *dest)
 {
-	if (dest->len)
-		memset(dest->data, 0xff, dest->len);
-	reftable_free(dest->data);
 }
 
-static void file_close(void *b)
+static void file_close(void *v)
 {
-	int fd = ((struct file_block_source *)b)->fd;
-	if (fd > 0) {
-		close(fd);
-		((struct file_block_source *)b)->fd = 0;
-	}
-
+	struct file_block_source *b = v;
+	munmap(b->data, b->size);
 	reftable_free(b);
 }
 
@@ -108,9 +101,7 @@ static int file_read_block(void *v, struct reftable_block *dest, uint64_t off,
 {
 	struct file_block_source *b = v;
 	assert(off + size <= b->size);
-	dest->data = reftable_malloc(size);
-	if (pread(b->fd, dest->data, size, off) != size)
-		return -1;
+	dest->data = b->data + off;
 	dest->len = size;
 	return size;
 }
@@ -125,26 +116,26 @@ static struct reftable_block_source_vtable file_vtable = {
 int reftable_block_source_from_file(struct reftable_block_source *bs,
 				    const char *name)
 {
-	struct stat st = { 0 };
-	int err = 0;
-	int fd = open(name, O_RDONLY);
-	struct file_block_source *p = NULL;
+	struct file_block_source *p;
+	struct stat st;
+	int fd;
+
+	fd = open(name, O_RDONLY);
 	if (fd < 0) {
-		if (errno == ENOENT) {
+		if (errno == ENOENT)
 			return REFTABLE_NOT_EXIST_ERROR;
-		}
 		return -1;
 	}
 
-	err = fstat(fd, &st);
-	if (err < 0) {
+	if (fstat(fd, &st) < 0) {
 		close(fd);
 		return REFTABLE_IO_ERROR;
 	}
 
-	p = reftable_calloc(sizeof(struct file_block_source));
+	REFTABLE_CALLOC_ARRAY(p, 1);
 	p->size = st.st_size;
-	p->fd = fd;
+	p->data = xmmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
+	close(fd);
 
 	assert(!bs->ops);
 	bs->ops = &file_vtable;
diff --git a/reftable/dump.c b/reftable/dump.c
index 155953d..26e0393 100644
--- a/reftable/dump.c
+++ b/reftable/dump.c
@@ -7,18 +7,16 @@ https://developers.google.com/open-source/licenses/bsd
 */
 
 #include "git-compat-util.h"
-#include "hash.h"
+#include "hash-ll.h"
 
 #include "reftable-blocksource.h"
 #include "reftable-error.h"
-#include "reftable-merged.h"
 #include "reftable-record.h"
 #include "reftable-tests.h"
 #include "reftable-writer.h"
 #include "reftable-iterator.h"
 #include "reftable-reader.h"
 #include "reftable-stack.h"
-#include "reftable-generic.h"
 
 #include <stddef.h>
 #include <stdio.h>
diff --git a/reftable/error.c b/reftable/error.c
index 93941f2..cfb7a0f 100644
--- a/reftable/error.c
+++ b/reftable/error.c
@@ -6,6 +6,7 @@ license that can be found in the LICENSE file or at
 https://developers.google.com/open-source/licenses/bsd
 */
 
+#include "system.h"
 #include "reftable-error.h"
 
 #include <stdio.h>
@@ -21,7 +22,7 @@ const char *reftable_error_str(int err)
 	case REFTABLE_NOT_EXIST_ERROR:
 		return "file does not exist";
 	case REFTABLE_LOCK_ERROR:
-		return "data is outdated";
+		return "data is locked";
 	case REFTABLE_API_ERROR:
 		return "misuse of the reftable API";
 	case REFTABLE_ZLIB_ERROR:
@@ -34,6 +35,8 @@ const char *reftable_error_str(int err)
 		return "invalid refname";
 	case REFTABLE_ENTRY_TOO_BIG_ERROR:
 		return "entry too large";
+	case REFTABLE_OUTDATED_ERROR:
+		return "data concurrently modified";
 	case -1:
 		return "general error";
 	default:
diff --git a/reftable/generic.c b/reftable/generic.c
index 57f8032..b9f1c7c 100644
--- a/reftable/generic.c
+++ b/reftable/generic.c
@@ -6,7 +6,6 @@ license that can be found in the LICENSE file or at
 https://developers.google.com/open-source/licenses/bsd
 */
 
-#include "basics.h"
 #include "constants.h"
 #include "record.h"
 #include "generic.h"
diff --git a/reftable/iter.c b/reftable/iter.c
index a8d174c..aa9ac19 100644
--- a/reftable/iter.c
+++ b/reftable/iter.c
@@ -16,11 +16,6 @@ https://developers.google.com/open-source/licenses/bsd
 #include "reader.h"
 #include "reftable-error.h"
 
-int iterator_is_null(struct reftable_iterator *it)
-{
-	return !it->ops;
-}
-
 static void filtering_ref_iterator_close(void *iter_arg)
 {
 	struct filtering_ref_iterator *fri = iter_arg;
@@ -120,7 +115,7 @@ static int indexed_table_ref_iter_next_block(struct indexed_table_ref_iter *it)
 		/* indexed block does not exist. */
 		return REFTABLE_FORMAT_ERROR;
 	}
-	block_reader_start(&it->block_reader, &it->cur);
+	block_iter_seek_start(&it->cur, &it->block_reader);
 	return 0;
 }
 
@@ -160,8 +155,7 @@ int new_indexed_table_ref_iter(struct indexed_table_ref_iter **dest,
 			       int oid_len, uint64_t *offsets, int offset_len)
 {
 	struct indexed_table_ref_iter empty = INDEXED_TABLE_REF_ITER_INIT;
-	struct indexed_table_ref_iter *itr =
-		reftable_calloc(sizeof(struct indexed_table_ref_iter));
+	struct indexed_table_ref_iter *itr = reftable_calloc(1, sizeof(*itr));
 	int err = 0;
 
 	*itr = empty;
diff --git a/reftable/iter.h b/reftable/iter.h
index 09eb0cb..537431b 100644
--- a/reftable/iter.h
+++ b/reftable/iter.h
@@ -16,10 +16,6 @@ https://developers.google.com/open-source/licenses/bsd
 #include "reftable-iterator.h"
 #include "reftable-generic.h"
 
-/* Returns true for a zeroed out iterator, such as the one returned from
- * iterator_destroy. */
-int iterator_is_null(struct reftable_iterator *it);
-
 /* iterator that produces only ref records that point to `oid` */
 struct filtering_ref_iterator {
 	int double_check;
@@ -53,10 +49,10 @@ struct indexed_table_ref_iter {
 	int is_finished;
 };
 
-#define INDEXED_TABLE_REF_ITER_INIT                                     \
-	{                                                               \
-		.cur = { .last_key = STRBUF_INIT }, .oid = STRBUF_INIT, \
-	}
+#define INDEXED_TABLE_REF_ITER_INIT { \
+	.cur = BLOCK_ITER_INIT, \
+	.oid = STRBUF_INIT, \
+}
 
 void iterator_from_indexed_table_ref_iter(struct reftable_iterator *it,
 					  struct indexed_table_ref_iter *itr);
diff --git a/reftable/merged.c b/reftable/merged.c
index 2a6efa1..f85a24c 100644
--- a/reftable/merged.c
+++ b/reftable/merged.c
@@ -11,33 +11,45 @@ https://developers.google.com/open-source/licenses/bsd
 #include "constants.h"
 #include "iter.h"
 #include "pq.h"
-#include "reader.h"
 #include "record.h"
 #include "generic.h"
 #include "reftable-merged.h"
 #include "reftable-error.h"
 #include "system.h"
 
+struct merged_subiter {
+	struct reftable_iterator iter;
+	struct reftable_record rec;
+};
+
+struct merged_iter {
+	struct merged_subiter *subiters;
+	struct merged_iter_pqueue pq;
+	uint32_t hash_id;
+	size_t stack_len;
+	uint8_t typ;
+	int suppress_deletions;
+	ssize_t advance_index;
+};
+
 static int merged_iter_init(struct merged_iter *mi)
 {
-	int i = 0;
-	for (i = 0; i < mi->stack_len; i++) {
-		struct reftable_record rec = reftable_new_record(mi->typ);
-		int err = iterator_next(&mi->stack[i], &rec);
-		if (err < 0) {
+	for (size_t i = 0; i < mi->stack_len; i++) {
+		struct pq_entry e = {
+			.index = i,
+			.rec = &mi->subiters[i].rec,
+		};
+		int err;
+
+		reftable_record_init(&mi->subiters[i].rec, mi->typ);
+		err = iterator_next(&mi->subiters[i].iter,
+				    &mi->subiters[i].rec);
+		if (err < 0)
 			return err;
-		}
+		if (err > 0)
+			continue;
 
-		if (err > 0) {
-			reftable_iterator_destroy(&mi->stack[i]);
-			reftable_record_release(&rec);
-		} else {
-			struct pq_entry e = {
-				.rec = rec,
-				.index = i,
-			};
-			merged_iter_pqueue_add(&mi->pq, e);
-		}
+		merged_iter_pqueue_add(&mi->pq, &e);
 	}
 
 	return 0;
@@ -46,56 +58,68 @@ static int merged_iter_init(struct merged_iter *mi)
 static void merged_iter_close(void *p)
 {
 	struct merged_iter *mi = p;
-	int i = 0;
+
 	merged_iter_pqueue_release(&mi->pq);
-	for (i = 0; i < mi->stack_len; i++) {
-		reftable_iterator_destroy(&mi->stack[i]);
+	for (size_t i = 0; i < mi->stack_len; i++) {
+		reftable_iterator_destroy(&mi->subiters[i].iter);
+		reftable_record_release(&mi->subiters[i].rec);
 	}
-	reftable_free(mi->stack);
+	reftable_free(mi->subiters);
 }
 
-static int merged_iter_advance_nonnull_subiter(struct merged_iter *mi,
-					       size_t idx)
+static int merged_iter_advance_subiter(struct merged_iter *mi, size_t idx)
 {
 	struct pq_entry e = {
-		.rec = reftable_new_record(mi->typ),
 		.index = idx,
+		.rec = &mi->subiters[idx].rec,
 	};
-	int err = iterator_next(&mi->stack[idx], &e.rec);
-	if (err < 0)
-		return err;
+	int err;
 
-	if (err > 0) {
-		reftable_iterator_destroy(&mi->stack[idx]);
-		reftable_record_release(&e.rec);
-		return 0;
-	}
+	err = iterator_next(&mi->subiters[idx].iter, &mi->subiters[idx].rec);
+	if (err)
+		return err;
 
-	merged_iter_pqueue_add(&mi->pq, e);
+	merged_iter_pqueue_add(&mi->pq, &e);
 	return 0;
 }
 
-static int merged_iter_advance_subiter(struct merged_iter *mi, size_t idx)
-{
-	if (iterator_is_null(&mi->stack[idx]))
-		return 0;
-	return merged_iter_advance_nonnull_subiter(mi, idx);
-}
-
 static int merged_iter_next_entry(struct merged_iter *mi,
 				  struct reftable_record *rec)
 {
-	struct strbuf entry_key = STRBUF_INIT;
 	struct pq_entry entry = { 0 };
-	int err = 0;
+	int err = 0, empty;
+
+	empty = merged_iter_pqueue_is_empty(mi->pq);
+
+	if (mi->advance_index >= 0) {
+		/*
+		 * When there are no pqueue entries then we only have a single
+		 * subiter left. There is no need to use the pqueue in that
+		 * case anymore as we know that the subiter will return entries
+		 * in the correct order already.
+		 *
+		 * While this may sound like a very specific edge case, it may
+		 * happen more frequently than you think. Most repositories
+		 * will end up having a single large base table that contains
+		 * most of the refs. It's thus likely that we exhaust all
+		 * subiters but the one from that base ref.
+		 */
+		if (empty)
+			return iterator_next(&mi->subiters[mi->advance_index].iter,
+					     rec);
+
+		err = merged_iter_advance_subiter(mi, mi->advance_index);
+		if (err < 0)
+			return err;
+		if (!err)
+			empty = 0;
+		mi->advance_index = -1;
+	}
 
-	if (merged_iter_pqueue_is_empty(mi->pq))
+	if (empty)
 		return 1;
 
 	entry = merged_iter_pqueue_remove(&mi->pq);
-	err = merged_iter_advance_subiter(mi, entry.index);
-	if (err < 0)
-		return err;
 
 	/*
 	  One can also use reftable as datacenter-local storage, where the ref
@@ -105,57 +129,38 @@ static int merged_iter_next_entry(struct merged_iter *mi,
 	  such a deployment, the loop below must be changed to collect all
 	  entries for the same key, and return new the newest one.
 	*/
-	reftable_record_key(&entry.rec, &entry_key);
 	while (!merged_iter_pqueue_is_empty(mi->pq)) {
 		struct pq_entry top = merged_iter_pqueue_top(mi->pq);
-		struct strbuf k = STRBUF_INIT;
-		int err = 0, cmp = 0;
-
-		reftable_record_key(&top.rec, &k);
+		int cmp;
 
-		cmp = strbuf_cmp(&k, &entry_key);
-		strbuf_release(&k);
-
-		if (cmp > 0) {
+		cmp = reftable_record_cmp(top.rec, entry.rec);
+		if (cmp > 0)
 			break;
-		}
 
 		merged_iter_pqueue_remove(&mi->pq);
 		err = merged_iter_advance_subiter(mi, top.index);
-		if (err < 0) {
+		if (err < 0)
 			return err;
-		}
-		reftable_record_release(&top.rec);
 	}
 
-	reftable_record_copy_from(rec, &entry.rec, hash_size(mi->hash_id));
-	reftable_record_release(&entry.rec);
-	strbuf_release(&entry_key);
+	mi->advance_index = entry.index;
+	SWAP(*rec, *entry.rec);
 	return 0;
 }
 
-static int merged_iter_next(struct merged_iter *mi, struct reftable_record *rec)
+static int merged_iter_next_void(void *p, struct reftable_record *rec)
 {
+	struct merged_iter *mi = p;
 	while (1) {
 		int err = merged_iter_next_entry(mi, rec);
-		if (err == 0 && mi->suppress_deletions &&
-		    reftable_record_is_deletion(rec)) {
+		if (err)
+			return err;
+		if (mi->suppress_deletions && reftable_record_is_deletion(rec))
 			continue;
-		}
-
-		return err;
+		return 0;
 	}
 }
 
-static int merged_iter_next_void(void *p, struct reftable_record *rec)
-{
-	struct merged_iter *mi = p;
-	if (merged_iter_pqueue_is_empty(mi->pq))
-		return 1;
-
-	return merged_iter_next(mi, rec);
-}
-
 static struct reftable_iterator_vtable merged_iter_vtable = {
 	.next = &merged_iter_next_void,
 	.close = &merged_iter_close,
@@ -170,14 +175,14 @@ static void iterator_from_merged_iter(struct reftable_iterator *it,
 }
 
 int reftable_new_merged_table(struct reftable_merged_table **dest,
-			      struct reftable_table *stack, int n,
+			      struct reftable_table *stack, size_t n,
 			      uint32_t hash_id)
 {
 	struct reftable_merged_table *m = NULL;
 	uint64_t last_max = 0;
 	uint64_t first_min = 0;
-	int i = 0;
-	for (i = 0; i < n; i++) {
+
+	for (size_t i = 0; i < n; i++) {
 		uint64_t min = reftable_table_min_update_index(&stack[i]);
 		uint64_t max = reftable_table_max_update_index(&stack[i]);
 
@@ -192,7 +197,7 @@ int reftable_new_merged_table(struct reftable_merged_table **dest,
 		}
 	}
 
-	m = reftable_calloc(sizeof(struct reftable_merged_table));
+	REFTABLE_CALLOC_ARRAY(m, 1);
 	m->stack = stack;
 	m->stack_len = n;
 	m->min = first_min;
@@ -241,48 +246,37 @@ static int merged_table_seek_record(struct reftable_merged_table *mt,
 				    struct reftable_iterator *it,
 				    struct reftable_record *rec)
 {
-	struct reftable_iterator *iters = reftable_calloc(
-		sizeof(struct reftable_iterator) * mt->stack_len);
 	struct merged_iter merged = {
-		.stack = iters,
 		.typ = reftable_record_type(rec),
 		.hash_id = mt->hash_id,
 		.suppress_deletions = mt->suppress_deletions,
+		.advance_index = -1,
 	};
-	int n = 0;
-	int err = 0;
-	int i = 0;
-	for (i = 0; i < mt->stack_len && err == 0; i++) {
-		int e = reftable_table_seek_record(&mt->stack[i], &iters[n],
-						   rec);
-		if (e < 0) {
-			err = e;
-		}
-		if (e == 0) {
-			n++;
-		}
-	}
-	if (err < 0) {
-		int i = 0;
-		for (i = 0; i < n; i++) {
-			reftable_iterator_destroy(&iters[i]);
-		}
-		reftable_free(iters);
-		return err;
+	struct merged_iter *p;
+	int err;
+
+	REFTABLE_CALLOC_ARRAY(merged.subiters, mt->stack_len);
+	for (size_t i = 0; i < mt->stack_len; i++) {
+		err = reftable_table_seek_record(&mt->stack[i],
+						 &merged.subiters[merged.stack_len].iter, rec);
+		if (err < 0)
+			goto out;
+		if (!err)
+			merged.stack_len++;
 	}
 
-	merged.stack_len = n;
 	err = merged_iter_init(&merged);
-	if (err < 0) {
+	if (err < 0)
+		goto out;
+
+	p = reftable_malloc(sizeof(struct merged_iter));
+	*p = merged;
+	iterator_from_merged_iter(it, p);
+
+out:
+	if (err < 0)
 		merged_iter_close(&merged);
-		return err;
-	} else {
-		struct merged_iter *p =
-			reftable_malloc(sizeof(struct merged_iter));
-		*p = merged;
-		iterator_from_merged_iter(it, p);
-	}
-	return 0;
+	return err;
 }
 
 int reftable_merged_table_seek_ref(struct reftable_merged_table *mt,
diff --git a/reftable/merged.h b/reftable/merged.h
index 7d9f95d..a2571db 100644
--- a/reftable/merged.h
+++ b/reftable/merged.h
@@ -9,7 +9,7 @@ https://developers.google.com/open-source/licenses/bsd
 #ifndef MERGED_H
 #define MERGED_H
 
-#include "pq.h"
+#include "system.h"
 
 struct reftable_merged_table {
 	struct reftable_table *stack;
@@ -24,15 +24,6 @@ struct reftable_merged_table {
 	uint64_t max;
 };
 
-struct merged_iter {
-	struct reftable_iterator *stack;
-	uint32_t hash_id;
-	size_t stack_len;
-	uint8_t typ;
-	int suppress_deletions;
-	struct merged_iter_pqueue pq;
-};
-
 void merged_table_release(struct reftable_merged_table *mt);
 
 #endif
diff --git a/reftable/merged_test.c b/reftable/merged_test.c
index d08c16a..530fc82 100644
--- a/reftable/merged_test.c
+++ b/reftable/merged_test.c
@@ -12,7 +12,6 @@ https://developers.google.com/open-source/licenses/bsd
 
 #include "basics.h"
 #include "blocksource.h"
-#include "constants.h"
 #include "reader.h"
 #include "record.h"
 #include "test_framework.h"
@@ -43,7 +42,7 @@ static void write_test_table(struct strbuf *buf,
 		}
 	}
 
-	w = reftable_new_writer(&strbuf_add_void, buf, &opts);
+	w = reftable_new_writer(&strbuf_add_void, &noop_flush, buf, &opts);
 	reftable_writer_set_limits(w, min, max);
 
 	for (i = 0; i < n; i++) {
@@ -71,7 +70,7 @@ static void write_test_log_table(struct strbuf *buf,
 		.exact_log_message = 1,
 	};
 	struct reftable_writer *w = NULL;
-	w = reftable_new_writer(&strbuf_add_void, buf, &opts);
+	w = reftable_new_writer(&strbuf_add_void, &noop_flush, buf, &opts);
 	reftable_writer_set_limits(w, update_index, update_index);
 
 	for (i = 0; i < n; i++) {
@@ -89,16 +88,17 @@ static struct reftable_merged_table *
 merged_table_from_records(struct reftable_ref_record **refs,
 			  struct reftable_block_source **source,
 			  struct reftable_reader ***readers, int *sizes,
-			  struct strbuf *buf, int n)
+			  struct strbuf *buf, size_t n)
 {
-	int i = 0;
 	struct reftable_merged_table *mt = NULL;
+	struct reftable_table *tabs;
 	int err;
-	struct reftable_table *tabs =
-		reftable_calloc(n * sizeof(struct reftable_table));
-	*readers = reftable_calloc(n * sizeof(struct reftable_reader *));
-	*source = reftable_calloc(n * sizeof(**source));
-	for (i = 0; i < n; i++) {
+
+	REFTABLE_CALLOC_ARRAY(tabs, n);
+	REFTABLE_CALLOC_ARRAY(*readers, n);
+	REFTABLE_CALLOC_ARRAY(*source, n);
+
+	for (size_t i = 0; i < n; i++) {
 		write_test_table(&buf[i], refs[i], sizes[i]);
 		block_source_from_strbuf(&(*source)[i], &buf[i]);
 
@@ -123,13 +123,11 @@ static void readers_destroy(struct reftable_reader **readers, size_t n)
 
 static void test_merged_between(void)
 {
-	uint8_t hash1[GIT_SHA1_RAWSZ] = { 1, 2, 3, 0 };
-
 	struct reftable_ref_record r1[] = { {
 		.refname = "b",
 		.update_index = 1,
 		.value_type = REFTABLE_REF_VAL1,
-		.value.val1 = hash1,
+		.value.val1 = { 1, 2, 3, 0 },
 	} };
 	struct reftable_ref_record r2[] = { {
 		.refname = "a",
@@ -165,26 +163,24 @@ static void test_merged_between(void)
 
 static void test_merged(void)
 {
-	uint8_t hash1[GIT_SHA1_RAWSZ] = { 1 };
-	uint8_t hash2[GIT_SHA1_RAWSZ] = { 2 };
 	struct reftable_ref_record r1[] = {
 		{
 			.refname = "a",
 			.update_index = 1,
 			.value_type = REFTABLE_REF_VAL1,
-			.value.val1 = hash1,
+			.value.val1 = { 1 },
 		},
 		{
 			.refname = "b",
 			.update_index = 1,
 			.value_type = REFTABLE_REF_VAL1,
-			.value.val1 = hash1,
+			.value.val1 = { 1 },
 		},
 		{
 			.refname = "c",
 			.update_index = 1,
 			.value_type = REFTABLE_REF_VAL1,
-			.value.val1 = hash1,
+			.value.val1 = { 1 },
 		}
 	};
 	struct reftable_ref_record r2[] = { {
@@ -197,13 +193,13 @@ static void test_merged(void)
 			.refname = "c",
 			.update_index = 3,
 			.value_type = REFTABLE_REF_VAL1,
-			.value.val1 = hash2,
+			.value.val1 = { 2 },
 		},
 		{
 			.refname = "d",
 			.update_index = 3,
 			.value_type = REFTABLE_REF_VAL1,
-			.value.val1 = hash1,
+			.value.val1 = { 1 },
 		},
 	};
 
@@ -236,14 +232,10 @@ static void test_merged(void)
 	while (len < 100) { /* cap loops/recursion. */
 		struct reftable_ref_record ref = { NULL };
 		int err = reftable_iterator_next_ref(&it, &ref);
-		if (err > 0) {
+		if (err > 0)
 			break;
-		}
-		if (len == cap) {
-			cap = 2 * cap + 1;
-			out = reftable_realloc(
-				out, sizeof(struct reftable_ref_record) * cap);
-		}
+
+		REFTABLE_ALLOC_GROW(out, len + 1, cap);
 		out[len++] = ref;
 	}
 	reftable_iterator_destroy(&it);
@@ -270,16 +262,17 @@ static struct reftable_merged_table *
 merged_table_from_log_records(struct reftable_log_record **logs,
 			      struct reftable_block_source **source,
 			      struct reftable_reader ***readers, int *sizes,
-			      struct strbuf *buf, int n)
+			      struct strbuf *buf, size_t n)
 {
-	int i = 0;
 	struct reftable_merged_table *mt = NULL;
+	struct reftable_table *tabs;
 	int err;
-	struct reftable_table *tabs =
-		reftable_calloc(n * sizeof(struct reftable_table));
-	*readers = reftable_calloc(n * sizeof(struct reftable_reader *));
-	*source = reftable_calloc(n * sizeof(**source));
-	for (i = 0; i < n; i++) {
+
+	REFTABLE_CALLOC_ARRAY(tabs, n);
+	REFTABLE_CALLOC_ARRAY(*readers, n);
+	REFTABLE_CALLOC_ARRAY(*source, n);
+
+	for (size_t i = 0; i < n; i++) {
 		write_test_log_table(&buf[i], logs[i], sizes[i], i + 1);
 		block_source_from_strbuf(&(*source)[i], &buf[i]);
 
@@ -296,16 +289,13 @@ merged_table_from_log_records(struct reftable_log_record **logs,
 
 static void test_merged_logs(void)
 {
-	uint8_t hash1[GIT_SHA1_RAWSZ] = { 1 };
-	uint8_t hash2[GIT_SHA1_RAWSZ] = { 2 };
-	uint8_t hash3[GIT_SHA1_RAWSZ] = { 3 };
 	struct reftable_log_record r1[] = {
 		{
 			.refname = "a",
 			.update_index = 2,
 			.value_type = REFTABLE_LOG_UPDATE,
 			.value.update = {
-				.old_hash = hash2,
+				.old_hash = { 2 },
 				/* deletion */
 				.name = "jane doe",
 				.email = "jane@invalid",
@@ -317,8 +307,8 @@ static void test_merged_logs(void)
 			.update_index = 1,
 			.value_type = REFTABLE_LOG_UPDATE,
 			.value.update = {
-				.old_hash = hash1,
-				.new_hash = hash2,
+				.old_hash = { 1 },
+				.new_hash = { 2 },
 				.name = "jane doe",
 				.email = "jane@invalid",
 				.message = "message1",
@@ -331,7 +321,7 @@ static void test_merged_logs(void)
 			.update_index = 3,
 			.value_type = REFTABLE_LOG_UPDATE,
 			.value.update = {
-				.new_hash = hash3,
+				.new_hash = { 3 },
 				.name = "jane doe",
 				.email = "jane@invalid",
 				.message = "message3",
@@ -373,14 +363,10 @@ static void test_merged_logs(void)
 	while (len < 100) { /* cap loops/recursion. */
 		struct reftable_log_record log = { NULL };
 		int err = reftable_iterator_next_log(&it, &log);
-		if (err > 0) {
+		if (err > 0)
 			break;
-		}
-		if (len == cap) {
-			cap = 2 * cap + 1;
-			out = reftable_realloc(
-				out, sizeof(struct reftable_log_record) * cap);
-		}
+
+		REFTABLE_ALLOC_GROW(out, len + 1, cap);
 		out[len++] = log;
 	}
 	reftable_iterator_destroy(&it);
@@ -417,7 +403,7 @@ static void test_default_write_opts(void)
 	struct reftable_write_options opts = { 0 };
 	struct strbuf buf = STRBUF_INIT;
 	struct reftable_writer *w =
-		reftable_new_writer(&strbuf_add_void, &buf, &opts);
+		reftable_new_writer(&strbuf_add_void, &noop_flush, &buf, &opts);
 
 	struct reftable_ref_record rec = {
 		.refname = "master",
@@ -425,7 +411,7 @@ static void test_default_write_opts(void)
 	};
 	int err;
 	struct reftable_block_source source = { NULL };
-	struct reftable_table *tab = reftable_calloc(sizeof(*tab) * 1);
+	struct reftable_table *tab = reftable_calloc(1, sizeof(*tab));
 	uint32_t hash_id;
 	struct reftable_reader *rd = NULL;
 	struct reftable_merged_table *merged = NULL;
diff --git a/reftable/pq.c b/reftable/pq.c
index 96ca6dd..7fb45d8 100644
--- a/reftable/pq.c
+++ b/reftable/pq.c
@@ -14,33 +14,12 @@ https://developers.google.com/open-source/licenses/bsd
 
 int pq_less(struct pq_entry *a, struct pq_entry *b)
 {
-	struct strbuf ak = STRBUF_INIT;
-	struct strbuf bk = STRBUF_INIT;
-	int cmp = 0;
-	reftable_record_key(&a->rec, &ak);
-	reftable_record_key(&b->rec, &bk);
-
-	cmp = strbuf_cmp(&ak, &bk);
-
-	strbuf_release(&ak);
-	strbuf_release(&bk);
-
+	int cmp = reftable_record_cmp(a->rec, b->rec);
 	if (cmp == 0)
 		return a->index > b->index;
-
 	return cmp < 0;
 }
 
-struct pq_entry merged_iter_pqueue_top(struct merged_iter_pqueue pq)
-{
-	return pq.heap[0];
-}
-
-int merged_iter_pqueue_is_empty(struct merged_iter_pqueue pq)
-{
-	return pq.len == 0;
-}
-
 struct pq_entry merged_iter_pqueue_remove(struct merged_iter_pqueue *pq)
 {
 	int i = 0;
@@ -71,17 +50,13 @@ struct pq_entry merged_iter_pqueue_remove(struct merged_iter_pqueue *pq)
 	return e;
 }
 
-void merged_iter_pqueue_add(struct merged_iter_pqueue *pq, struct pq_entry e)
+void merged_iter_pqueue_add(struct merged_iter_pqueue *pq, const struct pq_entry *e)
 {
 	int i = 0;
 
-	if (pq->len == pq->cap) {
-		pq->cap = 2 * pq->cap + 1;
-		pq->heap = reftable_realloc(pq->heap,
-					    pq->cap * sizeof(struct pq_entry));
-	}
+	REFTABLE_ALLOC_GROW(pq->heap, pq->len + 1, pq->cap);
+	pq->heap[pq->len++] = *e;
 
-	pq->heap[pq->len++] = e;
 	i = pq->len - 1;
 	while (i > 0) {
 		int j = (i - 1) / 2;
@@ -97,10 +72,6 @@ void merged_iter_pqueue_add(struct merged_iter_pqueue *pq, struct pq_entry e)
 
 void merged_iter_pqueue_release(struct merged_iter_pqueue *pq)
 {
-	int i = 0;
-	for (i = 0; i < pq->len; i++) {
-		reftable_record_release(&pq->heap[i].rec);
-	}
 	FREE_AND_NULL(pq->heap);
-	pq->len = pq->cap = 0;
+	memset(pq, 0, sizeof(*pq));
 }
diff --git a/reftable/pq.h b/reftable/pq.h
index 56fc1b6..f796c23 100644
--- a/reftable/pq.h
+++ b/reftable/pq.h
@@ -12,8 +12,8 @@ https://developers.google.com/open-source/licenses/bsd
 #include "record.h"
 
 struct pq_entry {
-	int index;
-	struct reftable_record rec;
+	size_t index;
+	struct reftable_record *rec;
 };
 
 struct merged_iter_pqueue {
@@ -22,12 +22,20 @@ struct merged_iter_pqueue {
 	size_t cap;
 };
 
-struct pq_entry merged_iter_pqueue_top(struct merged_iter_pqueue pq);
-int merged_iter_pqueue_is_empty(struct merged_iter_pqueue pq);
 void merged_iter_pqueue_check(struct merged_iter_pqueue pq);
 struct pq_entry merged_iter_pqueue_remove(struct merged_iter_pqueue *pq);
-void merged_iter_pqueue_add(struct merged_iter_pqueue *pq, struct pq_entry e);
+void merged_iter_pqueue_add(struct merged_iter_pqueue *pq, const struct pq_entry *e);
 void merged_iter_pqueue_release(struct merged_iter_pqueue *pq);
 int pq_less(struct pq_entry *a, struct pq_entry *b);
 
+static inline struct pq_entry merged_iter_pqueue_top(struct merged_iter_pqueue pq)
+{
+	return pq.heap[0];
+}
+
+static inline int merged_iter_pqueue_is_empty(struct merged_iter_pqueue pq)
+{
+	return pq.len == 0;
+}
+
 #endif
diff --git a/reftable/pq_test.c b/reftable/pq_test.c
index 7de5e88..b7d3c80 100644
--- a/reftable/pq_test.c
+++ b/reftable/pq_test.c
@@ -27,48 +27,43 @@ void merged_iter_pqueue_check(struct merged_iter_pqueue pq)
 
 static void test_pq(void)
 {
-	char *names[54] = { NULL };
-	int N = ARRAY_SIZE(names) - 1;
-
 	struct merged_iter_pqueue pq = { NULL };
+	struct reftable_record recs[54];
+	int N = ARRAY_SIZE(recs) - 1, i;
 	char *last = NULL;
 
-	int i = 0;
 	for (i = 0; i < N; i++) {
-		char name[100];
-		snprintf(name, sizeof(name), "%02d", i);
-		names[i] = xstrdup(name);
+		struct strbuf refname = STRBUF_INIT;
+		strbuf_addf(&refname, "%02d", i);
+
+		reftable_record_init(&recs[i], BLOCK_TYPE_REF);
+		recs[i].u.ref.refname = strbuf_detach(&refname, NULL);
 	}
 
 	i = 1;
 	do {
-		struct pq_entry e = { .rec = { .type = BLOCK_TYPE_REF,
-					       .u.ref = {
-						       .refname = names[i],
-					       } } };
-		merged_iter_pqueue_add(&pq, e);
+		struct pq_entry e = {
+			.rec = &recs[i],
+		};
+
+		merged_iter_pqueue_add(&pq, &e);
 		merged_iter_pqueue_check(pq);
+
 		i = (i * 7) % N;
 	} while (i != 1);
 
 	while (!merged_iter_pqueue_is_empty(pq)) {
 		struct pq_entry e = merged_iter_pqueue_remove(&pq);
-		struct reftable_record *rec = &e.rec;
 		merged_iter_pqueue_check(pq);
 
-		EXPECT(reftable_record_type(rec) == BLOCK_TYPE_REF);
-		if (last) {
-			EXPECT(strcmp(last, rec->u.ref.refname) < 0);
-		}
-		// this is names[i], so don't dealloc.
-		last = rec->u.ref.refname;
-		rec->u.ref.refname = NULL;
-		reftable_record_release(rec);
-	}
-	for (i = 0; i < N; i++) {
-		reftable_free(names[i]);
+		EXPECT(reftable_record_type(e.rec) == BLOCK_TYPE_REF);
+		if (last)
+			EXPECT(strcmp(last, e.rec->u.ref.refname) < 0);
+		last = e.rec->u.ref.refname;
 	}
 
+	for (i = 0; i < N; i++)
+		reftable_record_release(&recs[i]);
 	merged_iter_pqueue_release(&pq);
 }
 
diff --git a/reftable/publicbasics.c b/reftable/publicbasics.c
index 0ad7d5c..44b84a1 100644
--- a/reftable/publicbasics.c
+++ b/reftable/publicbasics.c
@@ -6,10 +6,10 @@ license that can be found in the LICENSE file or at
 https://developers.google.com/open-source/licenses/bsd
 */
 
+#include "system.h"
 #include "reftable-malloc.h"
 
 #include "basics.h"
-#include "system.h"
 
 static void *(*reftable_malloc_ptr)(size_t sz);
 static void *(*reftable_realloc_ptr)(void *, size_t);
@@ -37,8 +37,9 @@ void reftable_free(void *p)
 		free(p);
 }
 
-void *reftable_calloc(size_t sz)
+void *reftable_calloc(size_t nelem, size_t elsize)
 {
+	size_t sz = st_mult(nelem, elsize);
 	void *p = reftable_malloc(sz);
 	memset(p, 0, sz);
 	return p;
diff --git a/reftable/reader.c b/reftable/reader.c
index 54b4025..481dff1 100644
--- a/reftable/reader.c
+++ b/reftable/reader.c
@@ -16,7 +16,6 @@ https://developers.google.com/open-source/licenses/bsd
 #include "record.h"
 #include "reftable-error.h"
 #include "reftable-generic.h"
-#include "tree.h"
 
 uint64_t block_source_size(struct reftable_block_source *source)
 {
@@ -221,22 +220,12 @@ struct table_iter {
 	struct reftable_reader *r;
 	uint8_t typ;
 	uint64_t block_off;
+	struct block_reader br;
 	struct block_iter bi;
 	int is_finished;
 };
-#define TABLE_ITER_INIT                          \
-	{                                        \
-		.bi = {.last_key = STRBUF_INIT } \
-	}
-
-static void table_iter_copy_from(struct table_iter *dest,
-				 struct table_iter *src)
-{
-	dest->r = src->r;
-	dest->typ = src->typ;
-	dest->block_off = src->block_off;
-	dest->is_finished = src->is_finished;
-	block_iter_copy_from(&dest->bi, &src->bi);
+#define TABLE_ITER_INIT { \
+	.bi = BLOCK_ITER_INIT \
 }
 
 static int table_iter_next_in_block(struct table_iter *ti,
@@ -252,14 +241,8 @@ static int table_iter_next_in_block(struct table_iter *ti,
 
 static void table_iter_block_done(struct table_iter *ti)
 {
-	if (!ti->bi.br) {
-		return;
-	}
-	reftable_block_done(&ti->bi.br->block);
-	FREE_AND_NULL(ti->bi.br);
-
-	ti->bi.last_key.len = 0;
-	ti->bi.next_off = 0;
+	block_reader_release(&ti->br);
+	block_iter_reset(&ti->bi);
 }
 
 static int32_t extract_block_size(uint8_t *data, uint8_t *typ, uint64_t off,
@@ -323,32 +306,27 @@ done:
 	return err;
 }
 
-static int table_iter_next_block(struct table_iter *dest,
-				 struct table_iter *src)
+static void table_iter_close(struct table_iter *ti)
 {
-	uint64_t next_block_off = src->block_off + src->bi.br->full_block_size;
-	struct block_reader br = { 0 };
-	int err = 0;
+	table_iter_block_done(ti);
+	block_iter_close(&ti->bi);
+}
 
-	dest->r = src->r;
-	dest->typ = src->typ;
-	dest->block_off = next_block_off;
+static int table_iter_next_block(struct table_iter *ti)
+{
+	uint64_t next_block_off = ti->block_off + ti->br.full_block_size;
+	int err;
 
-	err = reader_init_block_reader(src->r, &br, next_block_off, src->typ);
-	if (err > 0) {
-		dest->is_finished = 1;
-		return 1;
-	}
-	if (err != 0)
+	err = reader_init_block_reader(ti->r, &ti->br, next_block_off, ti->typ);
+	if (err > 0)
+		ti->is_finished = 1;
+	if (err)
 		return err;
-	else {
-		struct block_reader *brp =
-			reftable_malloc(sizeof(struct block_reader));
-		*brp = br;
 
-		dest->is_finished = 0;
-		block_reader_start(brp, &dest->bi);
-	}
+	ti->block_off = next_block_off;
+	ti->is_finished = 0;
+	block_iter_seek_start(&ti->bi, &ti->br);
+
 	return 0;
 }
 
@@ -358,27 +336,30 @@ static int table_iter_next(struct table_iter *ti, struct reftable_record *rec)
 		return REFTABLE_API_ERROR;
 
 	while (1) {
-		struct table_iter next = TABLE_ITER_INIT;
-		int err = 0;
-		if (ti->is_finished) {
+		int err;
+
+		if (ti->is_finished)
 			return 1;
-		}
 
+		/*
+		 * Check whether the current block still has more records. If
+		 * so, return it. If the iterator returns positive then the
+		 * current block has been exhausted.
+		 */
 		err = table_iter_next_in_block(ti, rec);
-		if (err <= 0) {
+		if (err <= 0)
 			return err;
-		}
 
-		err = table_iter_next_block(&next, ti);
-		if (err != 0) {
+		/*
+		 * Otherwise, we need to continue to the next block in the
+		 * table and retry. If there are no more blocks then the
+		 * iterator is drained.
+		 */
+		err = table_iter_next_block(ti);
+		if (err) {
 			ti->is_finished = 1;
-		}
-		table_iter_block_done(ti);
-		if (err != 0) {
 			return err;
 		}
-		table_iter_copy_from(ti, &next);
-		block_iter_close(&next.bi);
 	}
 }
 
@@ -387,16 +368,14 @@ static int table_iter_next_void(void *ti, struct reftable_record *rec)
 	return table_iter_next(ti, rec);
 }
 
-static void table_iter_close(void *p)
+static void table_iter_close_void(void *ti)
 {
-	struct table_iter *ti = p;
-	table_iter_block_done(ti);
-	block_iter_close(&ti->bi);
+	table_iter_close(ti);
 }
 
 static struct reftable_iterator_vtable table_iter_vtable = {
 	.next = &table_iter_next_void,
-	.close = &table_iter_close,
+	.close = &table_iter_close_void,
 };
 
 static void iterator_from_table_iter(struct reftable_iterator *it,
@@ -411,19 +390,16 @@ static int reader_table_iter_at(struct reftable_reader *r,
 				struct table_iter *ti, uint64_t off,
 				uint8_t typ)
 {
-	struct block_reader br = { 0 };
-	struct block_reader *brp = NULL;
+	int err;
 
-	int err = reader_init_block_reader(r, &br, off, typ);
+	err = reader_init_block_reader(r, &ti->br, off, typ);
 	if (err != 0)
 		return err;
 
-	brp = reftable_malloc(sizeof(struct block_reader));
-	*brp = br;
 	ti->r = r;
-	ti->typ = block_reader_type(brp);
+	ti->typ = block_reader_type(&ti->br);
 	ti->block_off = off;
-	block_reader_start(brp, &ti->bi);
+	block_iter_seek_start(&ti->bi, &ti->br);
 	return 0;
 }
 
@@ -443,28 +419,57 @@ static int reader_start(struct reftable_reader *r, struct table_iter *ti,
 	return reader_table_iter_at(r, ti, off, typ);
 }
 
-static int reader_seek_linear(struct reftable_reader *r, struct table_iter *ti,
+static int reader_seek_linear(struct table_iter *ti,
 			      struct reftable_record *want)
 {
-	struct reftable_record rec =
-		reftable_new_record(reftable_record_type(want));
 	struct strbuf want_key = STRBUF_INIT;
 	struct strbuf got_key = STRBUF_INIT;
-	struct table_iter next = TABLE_ITER_INIT;
+	struct reftable_record rec;
 	int err = -1;
 
+	reftable_record_init(&rec, reftable_record_type(want));
 	reftable_record_key(want, &want_key);
 
+	/*
+	 * First we need to locate the block that must contain our record. To
+	 * do so we scan through blocks linearly until we find the first block
+	 * whose first key is bigger than our wanted key. Once we have found
+	 * that block we know that the key must be contained in the preceding
+	 * block.
+	 *
+	 * This algorithm is somewhat unfortunate because it means that we
+	 * always have to seek one block too far and then back up. But as we
+	 * can only decode the _first_ key of a block but not its _last_ key we
+	 * have no other way to do this.
+	 */
 	while (1) {
-		err = table_iter_next_block(&next, ti);
+		struct table_iter next = *ti;
+
+		/*
+		 * We must be careful to not modify underlying data of `ti`
+		 * because we may find that `next` does not contain our desired
+		 * block, but that `ti` does. In that case, we would discard
+		 * `next` and continue with `ti`.
+		 *
+		 * This also means that we cannot reuse allocated memory for
+		 * `next` here. While it would be great if we could, it should
+		 * in practice not be too bad given that we should only ever
+		 * end up doing linear seeks with at most three blocks. As soon
+		 * as we have more than three blocks we would have an index, so
+		 * we would not do a linear search there anymore.
+		 */
+		memset(&next.br.block, 0, sizeof(next.br.block));
+		next.br.zstream = NULL;
+		next.br.uncompressed_data = NULL;
+		next.br.uncompressed_cap = 0;
+
+		err = table_iter_next_block(&next);
 		if (err < 0)
 			goto done;
-
-		if (err > 0) {
+		if (err > 0)
 			break;
-		}
 
-		err = block_reader_first_key(next.bi.br, &got_key);
+		err = block_reader_first_key(&next.br, &got_key);
 		if (err < 0)
 			goto done;
 
@@ -474,16 +479,20 @@ static int reader_seek_linear(struct reftable_reader *r, struct table_iter *ti,
 		}
 
 		table_iter_block_done(ti);
-		table_iter_copy_from(ti, &next);
+		*ti = next;
 	}
 
-	err = block_iter_seek(&ti->bi, &want_key);
+	/*
+	 * We have located the block that must contain our record, so we seek
+	 * the wanted key inside of it. If the block does not contain our key
+	 * we know that the corresponding record does not exist.
+	 */
+	err = block_iter_seek_key(&ti->bi, &ti->br, &want_key);
 	if (err < 0)
 		goto done;
 	err = 0;
 
 done:
-	block_iter_close(&next.bi);
 	reftable_record_release(&rec);
 	strbuf_release(&want_key);
 	strbuf_release(&got_key);
@@ -502,6 +511,7 @@ static int reader_seek_indexed(struct reftable_reader *r,
 		.u.idx = { .last_key = STRBUF_INIT },
 	};
 	struct table_iter index_iter = TABLE_ITER_INIT;
+	struct table_iter empty = TABLE_ITER_INIT;
 	struct table_iter next = TABLE_ITER_INIT;
 	int err = 0;
 
@@ -510,10 +520,39 @@ static int reader_seek_indexed(struct reftable_reader *r,
 	if (err < 0)
 		goto done;
 
-	err = reader_seek_linear(r, &index_iter, &want_index);
+	/*
+	 * The index may consist of multiple levels, where each level may have
+	 * multiple index blocks. We start by doing a linear search in the
+	 * highest layer that identifies the relevant index block as well as
+	 * the record inside that block that corresponds to our wanted key.
+	 */
+	err = reader_seek_linear(&index_iter, &want_index);
+	if (err < 0)
+		goto done;
+
+	/*
+	 * Traverse down the levels until we find a non-index entry.
+	 */
 	while (1) {
+		/*
+		 * In case we seek a record that does not exist the index iter
+		 * will tell us that the iterator is over. This works because
+		 * the last index entry of the current level will contain the
+		 * last key it knows about. So in case our seeked key is larger
+		 * than the last indexed key we know that it won't exist.
+		 *
+		 * There is one subtlety in the layout of the index section
+		 * that makes this work as expected: the highest-level index is
+		 * at end of the section and will point backwards and thus we
+		 * start reading from the end of the index section, not the
+		 * beginning.
+		 *
+		 * If that wasn't the case and the order was reversed then the
+		 * linear seek would seek into the lower levels and traverse
+		 * all levels of the index only to find out that the key does
+		 * not exist.
+		 */
 		err = table_iter_next(&index_iter, &index_result);
-		table_iter_block_done(&index_iter);
 		if (err != 0)
 			goto done;
 
@@ -522,7 +561,7 @@ static int reader_seek_indexed(struct reftable_reader *r,
 		if (err != 0)
 			goto done;
 
-		err = block_iter_seek(&next.bi, &want_index.u.idx.last_key);
+		err = block_iter_seek_key(&next.bi, &next.br, &want_index.u.idx.last_key);
 		if (err < 0)
 			goto done;
 
@@ -536,19 +575,20 @@ static int reader_seek_indexed(struct reftable_reader *r,
 			break;
 		}
 
-		table_iter_copy_from(&index_iter, &next);
+		table_iter_close(&index_iter);
+		index_iter = next;
+		next = empty;
 	}
 
 	if (err == 0) {
-		struct table_iter empty = TABLE_ITER_INIT;
-		struct table_iter *malloced =
-			reftable_calloc(sizeof(struct table_iter));
-		*malloced = empty;
-		table_iter_copy_from(malloced, &next);
+		struct table_iter *malloced = reftable_calloc(1, sizeof(*malloced));
+		*malloced = next;
+		next = empty;
 		iterator_from_table_iter(it, malloced);
 	}
+
 done:
-	block_iter_close(&next.bi);
+	table_iter_close(&next);
 	table_iter_close(&index_iter);
 	reftable_record_release(&want_index);
 	reftable_record_release(&index_result);
@@ -562,25 +602,28 @@ static int reader_seek_internal(struct reftable_reader *r,
 	struct reftable_reader_offsets *offs =
 		reader_offsets_for(r, reftable_record_type(rec));
 	uint64_t idx = offs->index_offset;
-	struct table_iter ti = TABLE_ITER_INIT;
-	int err = 0;
+	struct table_iter ti = TABLE_ITER_INIT, *p;
+	int err;
+
 	if (idx > 0)
 		return reader_seek_indexed(r, it, rec);
 
 	err = reader_start(r, &ti, reftable_record_type(rec), 0);
 	if (err < 0)
-		return err;
-	err = reader_seek_linear(r, &ti, rec);
+		goto out;
+
+	err = reader_seek_linear(&ti, rec);
 	if (err < 0)
-		return err;
-	else {
-		struct table_iter *p =
-			reftable_malloc(sizeof(struct table_iter));
-		*p = ti;
-		iterator_from_table_iter(it, p);
-	}
+		goto out;
 
-	return 0;
+	REFTABLE_ALLOC_ARRAY(p, 1);
+	*p = ti;
+	iterator_from_table_iter(it, p);
+
+out:
+	if (err)
+		table_iter_close(&ti);
+	return err;
 }
 
 static int reader_seek(struct reftable_reader *r, struct reftable_iterator *it,
@@ -637,8 +680,7 @@ void reader_close(struct reftable_reader *r)
 int reftable_new_reader(struct reftable_reader **p,
 			struct reftable_block_source *src, char const *name)
 {
-	struct reftable_reader *rd =
-		reftable_calloc(sizeof(struct reftable_reader));
+	struct reftable_reader *rd = reftable_calloc(1, sizeof(*rd));
 	int err = init_reader(rd, src, name);
 	if (err == 0) {
 		*p = rd;
@@ -713,7 +755,7 @@ static int reftable_reader_refs_for_unindexed(struct reftable_reader *r,
 					      uint8_t *oid)
 {
 	struct table_iter ti_empty = TABLE_ITER_INIT;
-	struct table_iter *ti = reftable_calloc(sizeof(struct table_iter));
+	struct table_iter *ti = reftable_calloc(1, sizeof(*ti));
 	struct filtering_ref_iterator *filter = NULL;
 	struct filtering_ref_iterator empty = FILTERING_REF_ITERATOR_INIT;
 	int oid_len = hash_size(r->hash_id);
diff --git a/reftable/readwrite_test.c b/reftable/readwrite_test.c
index 469ab79..a6dbd21 100644
--- a/reftable/readwrite_test.c
+++ b/reftable/readwrite_test.c
@@ -11,7 +11,6 @@ https://developers.google.com/open-source/licenses/bsd
 #include "basics.h"
 #include "block.h"
 #include "blocksource.h"
-#include "constants.h"
 #include "reader.h"
 #include "record.h"
 #include "test_framework.h"
@@ -52,26 +51,25 @@ static void write_table(char ***names, struct strbuf *buf, int N,
 		.hash_id = hash_id,
 	};
 	struct reftable_writer *w =
-		reftable_new_writer(&strbuf_add_void, buf, &opts);
+		reftable_new_writer(&strbuf_add_void, &noop_flush, buf, &opts);
 	struct reftable_ref_record ref = { NULL };
 	int i = 0, n;
 	struct reftable_log_record log = { NULL };
 	const struct reftable_stats *stats = NULL;
-	*names = reftable_calloc(sizeof(char *) * (N + 1));
+
+	REFTABLE_CALLOC_ARRAY(*names, N + 1);
+
 	reftable_writer_set_limits(w, update_index, update_index);
 	for (i = 0; i < N; i++) {
-		uint8_t hash[GIT_SHA256_RAWSZ] = { 0 };
 		char name[100];
 		int n;
 
-		set_test_hash(hash, i);
-
 		snprintf(name, sizeof(name), "refs/heads/branch%02d", i);
 
 		ref.refname = name;
 		ref.update_index = update_index;
 		ref.value_type = REFTABLE_REF_VAL1;
-		ref.value.val1 = hash;
+		set_test_hash(ref.value.val1, i);
 		(*names)[i] = xstrdup(name);
 
 		n = reftable_writer_add_ref(w, &ref);
@@ -79,18 +77,15 @@ static void write_table(char ***names, struct strbuf *buf, int N,
 	}
 
 	for (i = 0; i < N; i++) {
-		uint8_t hash[GIT_SHA256_RAWSZ] = { 0 };
 		char name[100];
 		int n;
 
-		set_test_hash(hash, i);
-
 		snprintf(name, sizeof(name), "refs/heads/branch%02d", i);
 
 		log.refname = name;
 		log.update_index = update_index;
 		log.value_type = REFTABLE_LOG_UPDATE;
-		log.value.update.new_hash = hash;
+		set_test_hash(log.value.update.new_hash, i);
 		log.value.update.message = "message";
 
 		n = reftable_writer_add_log(w, &log);
@@ -134,18 +129,15 @@ static void test_log_buffer_size(void)
 					   .message = "commit: 9\n",
 				   } } };
 	struct reftable_writer *w =
-		reftable_new_writer(&strbuf_add_void, &buf, &opts);
+		reftable_new_writer(&strbuf_add_void, &noop_flush, &buf, &opts);
 
 	/* This tests buffer extension for log compression. Must use a random
 	   hash, to ensure that the compressed part is larger than the original.
 	*/
-	uint8_t hash1[GIT_SHA1_RAWSZ], hash2[GIT_SHA1_RAWSZ];
 	for (i = 0; i < GIT_SHA1_RAWSZ; i++) {
-		hash1[i] = (uint8_t)(rand() % 256);
-		hash2[i] = (uint8_t)(rand() % 256);
+		log.value.update.old_hash[i] = (uint8_t)(git_rand() % 256);
+		log.value.update.new_hash[i] = (uint8_t)(git_rand() % 256);
 	}
-	log.value.update.old_hash = hash1;
-	log.value.update.new_hash = hash2;
 	reftable_writer_set_limits(w, update_index, update_index);
 	err = reftable_writer_add_log(w, &log);
 	EXPECT_ERR(err);
@@ -163,25 +155,26 @@ static void test_log_overflow(void)
 		.block_size = ARRAY_SIZE(msg),
 	};
 	int err;
-	struct reftable_log_record
-		log = { .refname = "refs/heads/master",
-			.update_index = 0xa,
-			.value_type = REFTABLE_LOG_UPDATE,
-			.value = { .update = {
-					   .name = "Han-Wen Nienhuys",
-					   .email = "hanwen@google.com",
-					   .tz_offset = 100,
-					   .time = 0x5e430672,
-					   .message = msg,
-				   } } };
+	struct reftable_log_record log = {
+		.refname = "refs/heads/master",
+		.update_index = 0xa,
+		.value_type = REFTABLE_LOG_UPDATE,
+		.value = {
+			.update = {
+				.old_hash = { 1 },
+				.new_hash = { 2 },
+				.name = "Han-Wen Nienhuys",
+				.email = "hanwen@google.com",
+				.tz_offset = 100,
+				.time = 0x5e430672,
+				.message = msg,
+			},
+		},
+	};
 	struct reftable_writer *w =
-		reftable_new_writer(&strbuf_add_void, &buf, &opts);
-
-	uint8_t hash1[GIT_SHA1_RAWSZ]  = {1}, hash2[GIT_SHA1_RAWSZ] = { 2 };
+		reftable_new_writer(&strbuf_add_void, &noop_flush, &buf, &opts);
 
 	memset(msg, 'x', sizeof(msg) - 1);
-	log.value.update.old_hash = hash1;
-	log.value.update.new_hash = hash2;
 	reftable_writer_set_limits(w, update_index, update_index);
 	err = reftable_writer_add_log(w, &log);
 	EXPECT(err == REFTABLE_ENTRY_TOO_BIG_ERROR);
@@ -192,7 +185,7 @@ static void test_log_overflow(void)
 static void test_log_write_read(void)
 {
 	int N = 2;
-	char **names = reftable_calloc(sizeof(char *) * (N + 1));
+	char **names = reftable_calloc(N + 1, sizeof(*names));
 	int err;
 	struct reftable_write_options opts = {
 		.block_size = 256,
@@ -206,7 +199,7 @@ static void test_log_write_read(void)
 	struct reftable_block_source source = { NULL };
 	struct strbuf buf = STRBUF_INIT;
 	struct reftable_writer *w =
-		reftable_new_writer(&strbuf_add_void, &buf, &opts);
+		reftable_new_writer(&strbuf_add_void, &noop_flush, &buf, &opts);
 	const struct reftable_stats *stats = NULL;
 	reftable_writer_set_limits(w, 0, N);
 	for (i = 0; i < N; i++) {
@@ -221,16 +214,13 @@ static void test_log_write_read(void)
 		EXPECT_ERR(err);
 	}
 	for (i = 0; i < N; i++) {
-		uint8_t hash1[GIT_SHA1_RAWSZ], hash2[GIT_SHA1_RAWSZ];
 		struct reftable_log_record log = { NULL };
-		set_test_hash(hash1, i);
-		set_test_hash(hash2, i + 1);
 
 		log.refname = names[i];
 		log.update_index = i;
 		log.value_type = REFTABLE_LOG_UPDATE;
-		log.value.update.old_hash = hash1;
-		log.value.update.new_hash = hash2;
+		set_test_hash(log.value.update.old_hash, i);
+		set_test_hash(log.value.update.new_hash, i + 1);
 
 		err = reftable_writer_add_log(w, &log);
 		EXPECT_ERR(err);
@@ -298,20 +288,17 @@ static void test_log_zlib_corruption(void)
 	struct reftable_block_source source = { 0 };
 	struct strbuf buf = STRBUF_INIT;
 	struct reftable_writer *w =
-		reftable_new_writer(&strbuf_add_void, &buf, &opts);
+		reftable_new_writer(&strbuf_add_void, &noop_flush, &buf, &opts);
 	const struct reftable_stats *stats = NULL;
-	uint8_t hash1[GIT_SHA1_RAWSZ] = { 1 };
-	uint8_t hash2[GIT_SHA1_RAWSZ] = { 2 };
 	char message[100] = { 0 };
 	int err, i, n;
-
 	struct reftable_log_record log = {
 		.refname = "refname",
 		.value_type = REFTABLE_LOG_UPDATE,
 		.value = {
 			.update = {
-				.new_hash = hash1,
-				.old_hash = hash2,
+				.new_hash = { 1 },
+				.old_hash = { 2 },
 				.name = "My Name",
 				.email = "myname@invalid",
 				.message = message,
@@ -320,7 +307,7 @@ static void test_log_zlib_corruption(void)
 	};
 
 	for (i = 0; i < sizeof(message) - 1; i++)
-		message[i] = (uint8_t)(rand() % 64 + ' ');
+		message[i] = (uint8_t)(git_rand() % 64 + ' ');
 
 	reftable_writer_set_limits(w, 1, 1);
 
@@ -523,7 +510,7 @@ static void test_table_read_write_seek_index(void)
 static void test_table_refs_for(int indexed)
 {
 	int N = 50;
-	char **want_names = reftable_calloc(sizeof(char *) * (N + 1));
+	char **want_names = reftable_calloc(N + 1, sizeof(*want_names));
 	int want_names_len = 0;
 	uint8_t want_hash[GIT_SHA1_RAWSZ];
 
@@ -539,7 +526,7 @@ static void test_table_refs_for(int indexed)
 
 	struct strbuf buf = STRBUF_INIT;
 	struct reftable_writer *w =
-		reftable_new_writer(&strbuf_add_void, &buf, &opts);
+		reftable_new_writer(&strbuf_add_void, &noop_flush, &buf, &opts);
 
 	struct reftable_iterator it = { NULL };
 	int j;
@@ -550,8 +537,6 @@ static void test_table_refs_for(int indexed)
 		uint8_t hash[GIT_SHA1_RAWSZ];
 		char fill[51] = { 0 };
 		char name[100];
-		uint8_t hash1[GIT_SHA1_RAWSZ];
-		uint8_t hash2[GIT_SHA1_RAWSZ];
 		struct reftable_ref_record ref = { NULL };
 
 		memset(hash, i, sizeof(hash));
@@ -561,11 +546,9 @@ static void test_table_refs_for(int indexed)
 		name[40] = 0;
 		ref.refname = name;
 
-		set_test_hash(hash1, i / 4);
-		set_test_hash(hash2, 3 + i / 4);
 		ref.value_type = REFTABLE_REF_VAL2;
-		ref.value.val2.value = hash1;
-		ref.value.val2.target_value = hash2;
+		set_test_hash(ref.value.val2.value, i / 4);
+		set_test_hash(ref.value.val2.target_value, 3 + i / 4);
 
 		/* 80 bytes / entry, so 3 entries per block. Yields 17
 		 */
@@ -573,8 +556,8 @@ static void test_table_refs_for(int indexed)
 		n = reftable_writer_add_ref(w, &ref);
 		EXPECT(n == 0);
 
-		if (!memcmp(hash1, want_hash, GIT_SHA1_RAWSZ) ||
-		    !memcmp(hash2, want_hash, GIT_SHA1_RAWSZ)) {
+		if (!memcmp(ref.value.val2.value, want_hash, GIT_SHA1_RAWSZ) ||
+		    !memcmp(ref.value.val2.target_value, want_hash, GIT_SHA1_RAWSZ)) {
 			want_names[want_names_len++] = xstrdup(name);
 		}
 	}
@@ -636,7 +619,7 @@ static void test_write_empty_table(void)
 	struct reftable_write_options opts = { 0 };
 	struct strbuf buf = STRBUF_INIT;
 	struct reftable_writer *w =
-		reftable_new_writer(&strbuf_add_void, &buf, &opts);
+		reftable_new_writer(&strbuf_add_void, &noop_flush, &buf, &opts);
 	struct reftable_block_source source = { NULL };
 	struct reftable_reader *rd = NULL;
 	struct reftable_ref_record rec = { NULL };
@@ -674,12 +657,11 @@ static void test_write_object_id_min_length(void)
 	};
 	struct strbuf buf = STRBUF_INIT;
 	struct reftable_writer *w =
-		reftable_new_writer(&strbuf_add_void, &buf, &opts);
-	uint8_t hash[GIT_SHA1_RAWSZ] = {42};
+		reftable_new_writer(&strbuf_add_void, &noop_flush, &buf, &opts);
 	struct reftable_ref_record ref = {
 		.update_index = 1,
 		.value_type = REFTABLE_REF_VAL1,
-		.value.val1 = hash,
+		.value.val1 = {42},
 	};
 	int err;
 	int i;
@@ -710,12 +692,11 @@ static void test_write_object_id_length(void)
 	};
 	struct strbuf buf = STRBUF_INIT;
 	struct reftable_writer *w =
-		reftable_new_writer(&strbuf_add_void, &buf, &opts);
-	uint8_t hash[GIT_SHA1_RAWSZ] = {42};
+		reftable_new_writer(&strbuf_add_void, &noop_flush, &buf, &opts);
 	struct reftable_ref_record ref = {
 		.update_index = 1,
 		.value_type = REFTABLE_REF_VAL1,
-		.value.val1 = hash,
+		.value.val1 = {42},
 	};
 	int err;
 	int i;
@@ -745,7 +726,7 @@ static void test_write_empty_key(void)
 	struct reftable_write_options opts = { 0 };
 	struct strbuf buf = STRBUF_INIT;
 	struct reftable_writer *w =
-		reftable_new_writer(&strbuf_add_void, &buf, &opts);
+		reftable_new_writer(&strbuf_add_void, &noop_flush, &buf, &opts);
 	struct reftable_ref_record ref = {
 		.refname = "",
 		.update_index = 1,
@@ -768,7 +749,7 @@ static void test_write_key_order(void)
 	struct reftable_write_options opts = { 0 };
 	struct strbuf buf = STRBUF_INIT;
 	struct reftable_writer *w =
-		reftable_new_writer(&strbuf_add_void, &buf, &opts);
+		reftable_new_writer(&strbuf_add_void, &noop_flush, &buf, &opts);
 	struct reftable_ref_record refs[2] = {
 		{
 			.refname = "b",
@@ -798,6 +779,138 @@ static void test_write_key_order(void)
 	strbuf_release(&buf);
 }
 
+static void test_write_multiple_indices(void)
+{
+	struct reftable_write_options opts = {
+		.block_size = 100,
+	};
+	struct strbuf writer_buf = STRBUF_INIT, buf = STRBUF_INIT;
+	struct reftable_block_source source = { 0 };
+	struct reftable_iterator it = { 0 };
+	const struct reftable_stats *stats;
+	struct reftable_writer *writer;
+	struct reftable_reader *reader;
+	int err, i;
+
+	writer = reftable_new_writer(&strbuf_add_void, &noop_flush, &writer_buf, &opts);
+	reftable_writer_set_limits(writer, 1, 1);
+	for (i = 0; i < 100; i++) {
+		struct reftable_ref_record ref = {
+			.update_index = 1,
+			.value_type = REFTABLE_REF_VAL1,
+			.value.val1 = {i},
+		};
+
+		strbuf_reset(&buf);
+		strbuf_addf(&buf, "refs/heads/%04d", i);
+		ref.refname = buf.buf,
+
+		err = reftable_writer_add_ref(writer, &ref);
+		EXPECT_ERR(err);
+	}
+
+	for (i = 0; i < 100; i++) {
+		struct reftable_log_record log = {
+			.update_index = 1,
+			.value_type = REFTABLE_LOG_UPDATE,
+			.value.update = {
+				.old_hash = { i },
+				.new_hash = { i },
+			},
+		};
+
+		strbuf_reset(&buf);
+		strbuf_addf(&buf, "refs/heads/%04d", i);
+		log.refname = buf.buf,
+
+		err = reftable_writer_add_log(writer, &log);
+		EXPECT_ERR(err);
+	}
+
+	reftable_writer_close(writer);
+
+	/*
+	 * The written data should be sufficiently large to result in indices
+	 * for each of the block types.
+	 */
+	stats = reftable_writer_stats(writer);
+	EXPECT(stats->ref_stats.index_offset > 0);
+	EXPECT(stats->obj_stats.index_offset > 0);
+	EXPECT(stats->log_stats.index_offset > 0);
+
+	block_source_from_strbuf(&source, &writer_buf);
+	err = reftable_new_reader(&reader, &source, "filename");
+	EXPECT_ERR(err);
+
+	/*
+	 * Seeking the log uses the log index now. In case there is any
+	 * confusion regarding indices we would notice here.
+	 */
+	err = reftable_reader_seek_log(reader, &it, "");
+	EXPECT_ERR(err);
+
+	reftable_iterator_destroy(&it);
+	reftable_writer_free(writer);
+	reftable_reader_free(reader);
+	strbuf_release(&writer_buf);
+	strbuf_release(&buf);
+}
+
+static void test_write_multi_level_index(void)
+{
+	struct reftable_write_options opts = {
+		.block_size = 100,
+	};
+	struct strbuf writer_buf = STRBUF_INIT, buf = STRBUF_INIT;
+	struct reftable_block_source source = { 0 };
+	struct reftable_iterator it = { 0 };
+	const struct reftable_stats *stats;
+	struct reftable_writer *writer;
+	struct reftable_reader *reader;
+	int err;
+
+	writer = reftable_new_writer(&strbuf_add_void, &noop_flush, &writer_buf, &opts);
+	reftable_writer_set_limits(writer, 1, 1);
+	for (size_t i = 0; i < 200; i++) {
+		struct reftable_ref_record ref = {
+			.update_index = 1,
+			.value_type = REFTABLE_REF_VAL1,
+			.value.val1 = {i},
+		};
+
+		strbuf_reset(&buf);
+		strbuf_addf(&buf, "refs/heads/%03" PRIuMAX, (uintmax_t)i);
+		ref.refname = buf.buf,
+
+		err = reftable_writer_add_ref(writer, &ref);
+		EXPECT_ERR(err);
+	}
+	reftable_writer_close(writer);
+
+	/*
+	 * The written refs should be sufficiently large to result in a
+	 * multi-level index.
+	 */
+	stats = reftable_writer_stats(writer);
+	EXPECT(stats->ref_stats.max_index_level == 2);
+
+	block_source_from_strbuf(&source, &writer_buf);
+	err = reftable_new_reader(&reader, &source, "filename");
+	EXPECT_ERR(err);
+
+	/*
+	 * Seeking the last ref should work as expected.
+	 */
+	err = reftable_reader_seek_ref(reader, &it, "refs/heads/199");
+	EXPECT_ERR(err);
+
+	reftable_iterator_destroy(&it);
+	reftable_writer_free(writer);
+	reftable_reader_free(reader);
+	strbuf_release(&writer_buf);
+	strbuf_release(&buf);
+}
+
 static void test_corrupt_table_empty(void)
 {
 	struct strbuf buf = STRBUF_INIT;
@@ -847,5 +960,7 @@ int readwrite_test_main(int argc, const char *argv[])
 	RUN_TEST(test_log_overflow);
 	RUN_TEST(test_write_object_id_length);
 	RUN_TEST(test_write_object_id_min_length);
+	RUN_TEST(test_write_multiple_indices);
+	RUN_TEST(test_write_multi_level_index);
 	return 0;
 }
diff --git a/reftable/record.c b/reftable/record.c
index fbaa1fb..5506f3e 100644
--- a/reftable/record.c
+++ b/reftable/record.c
@@ -76,7 +76,7 @@ int reftable_is_block_type(uint8_t typ)
 	return 0;
 }
 
-uint8_t *reftable_ref_record_val1(const struct reftable_ref_record *rec)
+const unsigned char *reftable_ref_record_val1(const struct reftable_ref_record *rec)
 {
 	switch (rec->value_type) {
 	case REFTABLE_REF_VAL1:
@@ -88,7 +88,7 @@ uint8_t *reftable_ref_record_val1(const struct reftable_ref_record *rec)
 	}
 }
 
-uint8_t *reftable_ref_record_val2(const struct reftable_ref_record *rec)
+const unsigned char *reftable_ref_record_val2(const struct reftable_ref_record *rec)
 {
 	switch (rec->value_type) {
 	case REFTABLE_REF_VAL2:
@@ -159,34 +159,49 @@ int reftable_encode_key(int *restart, struct string_view dest,
 	return start.len - dest.len;
 }
 
-int reftable_decode_key(struct strbuf *key, uint8_t *extra,
-			struct strbuf last_key, struct string_view in)
+int reftable_decode_keylen(struct string_view in,
+			   uint64_t *prefix_len,
+			   uint64_t *suffix_len,
+			   uint8_t *extra)
 {
-	int start_len = in.len;
-	uint64_t prefix_len = 0;
-	uint64_t suffix_len = 0;
-	int n = get_var_int(&prefix_len, &in);
+	size_t start_len = in.len;
+	int n;
+
+	n = get_var_int(prefix_len, &in);
 	if (n < 0)
 		return -1;
 	string_view_consume(&in, n);
 
-	if (prefix_len > last_key.len)
-		return -1;
-
-	n = get_var_int(&suffix_len, &in);
+	n = get_var_int(suffix_len, &in);
 	if (n <= 0)
 		return -1;
 	string_view_consume(&in, n);
 
-	*extra = (uint8_t)(suffix_len & 0x7);
-	suffix_len >>= 3;
+	*extra = (uint8_t)(*suffix_len & 0x7);
+	*suffix_len >>= 3;
+
+	return start_len - in.len;
+}
+
+int reftable_decode_key(struct strbuf *last_key, uint8_t *extra,
+			struct string_view in)
+{
+	int start_len = in.len;
+	uint64_t prefix_len = 0;
+	uint64_t suffix_len = 0;
+	int n;
+
+	n = reftable_decode_keylen(in, &prefix_len, &suffix_len, extra);
+	if (n < 0)
+		return -1;
+	string_view_consume(&in, n);
 
-	if (in.len < suffix_len)
+	if (in.len < suffix_len ||
+	    prefix_len > last_key->len)
 		return -1;
 
-	strbuf_reset(key);
-	strbuf_add(key, last_key.buf, prefix_len);
-	strbuf_add(key, in.buf, suffix_len);
+	strbuf_setlen(last_key, prefix_len);
+	strbuf_add(last_key, in.buf, suffix_len);
 	string_view_consume(&in, suffix_len);
 
 	return start_len - in.len;
@@ -205,27 +220,36 @@ static void reftable_ref_record_copy_from(void *rec, const void *src_rec,
 {
 	struct reftable_ref_record *ref = rec;
 	const struct reftable_ref_record *src = src_rec;
+	char *refname = NULL;
+	size_t refname_cap = 0;
+
 	assert(hash_size > 0);
 
-	/* This is simple and correct, but we could probably reuse the hash
-	 * fields. */
+	SWAP(refname, ref->refname);
+	SWAP(refname_cap, ref->refname_cap);
 	reftable_ref_record_release(ref);
+	SWAP(ref->refname, refname);
+	SWAP(ref->refname_cap, refname_cap);
+
 	if (src->refname) {
-		ref->refname = xstrdup(src->refname);
+		size_t refname_len = strlen(src->refname);
+
+		REFTABLE_ALLOC_GROW(ref->refname, refname_len + 1,
+				    ref->refname_cap);
+		memcpy(ref->refname, src->refname, refname_len);
+		ref->refname[refname_len] = 0;
 	}
+
 	ref->update_index = src->update_index;
 	ref->value_type = src->value_type;
 	switch (src->value_type) {
 	case REFTABLE_REF_DELETION:
 		break;
 	case REFTABLE_REF_VAL1:
-		ref->value.val1 = reftable_malloc(hash_size);
 		memcpy(ref->value.val1, src->value.val1, hash_size);
 		break;
 	case REFTABLE_REF_VAL2:
-		ref->value.val2.value = reftable_malloc(hash_size);
 		memcpy(ref->value.val2.value, src->value.val2.value, hash_size);
-		ref->value.val2.target_value = reftable_malloc(hash_size);
 		memcpy(ref->value.val2.target_value,
 		       src->value.val2.target_value, hash_size);
 		break;
@@ -242,7 +266,7 @@ static char hexdigit(int c)
 	return 'a' + (c - 10);
 }
 
-static void hex_format(char *dest, uint8_t *src, int hash_size)
+static void hex_format(char *dest, const unsigned char *src, int hash_size)
 {
 	assert(hash_size > 0);
 	if (src) {
@@ -299,11 +323,8 @@ void reftable_ref_record_release(struct reftable_ref_record *ref)
 		reftable_free(ref->value.symref);
 		break;
 	case REFTABLE_REF_VAL2:
-		reftable_free(ref->value.val2.target_value);
-		reftable_free(ref->value.val2.value);
 		break;
 	case REFTABLE_REF_VAL1:
-		reftable_free(ref->value.val1);
 		break;
 	case REFTABLE_REF_DELETION:
 		break;
@@ -369,24 +390,33 @@ static int reftable_ref_record_encode(const void *rec, struct string_view s,
 
 static int reftable_ref_record_decode(void *rec, struct strbuf key,
 				      uint8_t val_type, struct string_view in,
-				      int hash_size)
+				      int hash_size, struct strbuf *scratch)
 {
 	struct reftable_ref_record *r = rec;
 	struct string_view start = in;
 	uint64_t update_index = 0;
-	int n = get_var_int(&update_index, &in);
+	const char *refname = NULL;
+	size_t refname_cap = 0;
+	int n;
+
+	assert(hash_size > 0);
+
+	n = get_var_int(&update_index, &in);
 	if (n < 0)
 		return n;
 	string_view_consume(&in, n);
 
+	SWAP(refname, r->refname);
+	SWAP(refname_cap, r->refname_cap);
 	reftable_ref_record_release(r);
+	SWAP(r->refname, refname);
+	SWAP(r->refname_cap, refname_cap);
 
-	assert(hash_size > 0);
-
-	r->refname = reftable_realloc(r->refname, key.len + 1);
+	REFTABLE_ALLOC_GROW(r->refname, key.len + 1, r->refname_cap);
 	memcpy(r->refname, key.buf, key.len);
-	r->update_index = update_index;
 	r->refname[key.len] = 0;
+
+	r->update_index = update_index;
 	r->value_type = val_type;
 	switch (val_type) {
 	case REFTABLE_REF_VAL1:
@@ -394,7 +424,6 @@ static int reftable_ref_record_decode(void *rec, struct strbuf key,
 			return -1;
 		}
 
-		r->value.val1 = reftable_malloc(hash_size);
 		memcpy(r->value.val1, in.buf, hash_size);
 		string_view_consume(&in, hash_size);
 		break;
@@ -404,23 +433,20 @@ static int reftable_ref_record_decode(void *rec, struct strbuf key,
 			return -1;
 		}
 
-		r->value.val2.value = reftable_malloc(hash_size);
 		memcpy(r->value.val2.value, in.buf, hash_size);
 		string_view_consume(&in, hash_size);
 
-		r->value.val2.target_value = reftable_malloc(hash_size);
 		memcpy(r->value.val2.target_value, in.buf, hash_size);
 		string_view_consume(&in, hash_size);
 		break;
 
 	case REFTABLE_REF_SYMREF: {
-		struct strbuf dest = STRBUF_INIT;
-		int n = decode_string(&dest, in);
+		int n = decode_string(scratch, in);
 		if (n < 0) {
 			return -1;
 		}
 		string_view_consume(&in, n);
-		r->value.symref = dest.buf;
+		r->value.symref = strbuf_detach(scratch, NULL);
 	} break;
 
 	case REFTABLE_REF_DELETION:
@@ -439,7 +465,6 @@ static int reftable_ref_record_is_deletion_void(const void *p)
 		(const struct reftable_ref_record *)p);
 }
 
-
 static int reftable_ref_record_equal_void(const void *a,
 					  const void *b, int hash_size)
 {
@@ -448,6 +473,13 @@ static int reftable_ref_record_equal_void(const void *a,
 	return reftable_ref_record_equal(ra, rb, hash_size);
 }
 
+static int reftable_ref_record_cmp_void(const void *_a, const void *_b)
+{
+	const struct reftable_ref_record *a = _a;
+	const struct reftable_ref_record *b = _b;
+	return strcmp(a->refname, b->refname);
+}
+
 static void reftable_ref_record_print_void(const void *rec,
 					   int hash_size)
 {
@@ -464,6 +496,7 @@ static struct reftable_record_vtable reftable_ref_record_vtable = {
 	.release = &reftable_ref_record_release_void,
 	.is_deletion = &reftable_ref_record_is_deletion_void,
 	.equal = &reftable_ref_record_equal_void,
+	.cmp = &reftable_ref_record_cmp_void,
 	.print = &reftable_ref_record_print_void,
 };
 
@@ -506,12 +539,13 @@ static void reftable_obj_record_copy_from(void *rec, const void *src_rec,
 		(const struct reftable_obj_record *)src_rec;
 
 	reftable_obj_record_release(obj);
-	obj->hash_prefix = reftable_malloc(src->hash_prefix_len);
+
+	REFTABLE_ALLOC_ARRAY(obj->hash_prefix, src->hash_prefix_len);
 	obj->hash_prefix_len = src->hash_prefix_len;
 	if (src->hash_prefix_len)
 		memcpy(obj->hash_prefix, src->hash_prefix, obj->hash_prefix_len);
 
-	obj->offsets = reftable_malloc(src->offset_len * sizeof(uint64_t));
+	REFTABLE_ALLOC_ARRAY(obj->offsets, src->offset_len);
 	obj->offset_len = src->offset_len;
 	COPY_ARRAY(obj->offsets, src->offsets, src->offset_len);
 }
@@ -560,7 +594,7 @@ static int reftable_obj_record_encode(const void *rec, struct string_view s,
 
 static int reftable_obj_record_decode(void *rec, struct strbuf key,
 				      uint8_t val_type, struct string_view in,
-				      int hash_size)
+				      int hash_size, struct strbuf *scratch UNUSED)
 {
 	struct string_view start = in;
 	struct reftable_obj_record *r = rec;
@@ -568,7 +602,10 @@ static int reftable_obj_record_decode(void *rec, struct strbuf key,
 	int n = 0;
 	uint64_t last;
 	int j;
-	r->hash_prefix = reftable_malloc(key.len);
+
+	reftable_obj_record_release(r);
+
+	REFTABLE_ALLOC_ARRAY(r->hash_prefix, key.len);
 	memcpy(r->hash_prefix, key.buf, key.len);
 	r->hash_prefix_len = key.len;
 
@@ -586,7 +623,7 @@ static int reftable_obj_record_decode(void *rec, struct strbuf key,
 	if (count == 0)
 		return start.len - in.len;
 
-	r->offsets = reftable_malloc(count * sizeof(uint64_t));
+	REFTABLE_ALLOC_ARRAY(r->offsets, count);
 	r->offset_len = count;
 
 	n = get_var_int(&r->offsets[0], &in);
@@ -634,6 +671,25 @@ static int reftable_obj_record_equal_void(const void *a, const void *b, int hash
 	return 1;
 }
 
+static int reftable_obj_record_cmp_void(const void *_a, const void *_b)
+{
+	const struct reftable_obj_record *a = _a;
+	const struct reftable_obj_record *b = _b;
+	int cmp;
+
+	cmp = memcmp(a->hash_prefix, b->hash_prefix,
+		     a->hash_prefix_len > b->hash_prefix_len ?
+		     a->hash_prefix_len : b->hash_prefix_len);
+	if (cmp)
+		return cmp;
+
+	/*
+	 * When the prefix is the same then the object record that is longer is
+	 * considered to be bigger.
+	 */
+	return a->hash_prefix_len - b->hash_prefix_len;
+}
+
 static struct reftable_record_vtable reftable_obj_record_vtable = {
 	.key = &reftable_obj_record_key,
 	.type = BLOCK_TYPE_OBJ,
@@ -644,6 +700,7 @@ static struct reftable_record_vtable reftable_obj_record_vtable = {
 	.release = &reftable_obj_record_release,
 	.is_deletion = &not_a_deletion,
 	.equal = &reftable_obj_record_equal_void,
+	.cmp = &reftable_obj_record_cmp_void,
 	.print = &reftable_obj_record_print,
 };
 
@@ -723,16 +780,10 @@ static void reftable_log_record_copy_from(void *rec, const void *src_rec,
 				xstrdup(dst->value.update.message);
 		}
 
-		if (dst->value.update.new_hash) {
-			dst->value.update.new_hash = reftable_malloc(hash_size);
-			memcpy(dst->value.update.new_hash,
-			       src->value.update.new_hash, hash_size);
-		}
-		if (dst->value.update.old_hash) {
-			dst->value.update.old_hash = reftable_malloc(hash_size);
-			memcpy(dst->value.update.old_hash,
-			       src->value.update.old_hash, hash_size);
-		}
+		memcpy(dst->value.update.new_hash,
+		       src->value.update.new_hash, hash_size);
+		memcpy(dst->value.update.old_hash,
+		       src->value.update.old_hash, hash_size);
 		break;
 	}
 }
@@ -750,8 +801,6 @@ void reftable_log_record_release(struct reftable_log_record *r)
 	case REFTABLE_LOG_DELETION:
 		break;
 	case REFTABLE_LOG_UPDATE:
-		reftable_free(r->value.update.new_hash);
-		reftable_free(r->value.update.old_hash);
 		reftable_free(r->value.update.name);
 		reftable_free(r->value.update.email);
 		reftable_free(r->value.update.message);
@@ -768,33 +817,20 @@ static uint8_t reftable_log_record_val_type(const void *rec)
 	return reftable_log_record_is_deletion(log) ? 0 : 1;
 }
 
-static uint8_t zero[GIT_SHA256_RAWSZ] = { 0 };
-
 static int reftable_log_record_encode(const void *rec, struct string_view s,
 				      int hash_size)
 {
 	const struct reftable_log_record *r = rec;
 	struct string_view start = s;
 	int n = 0;
-	uint8_t *oldh = NULL;
-	uint8_t *newh = NULL;
 	if (reftable_log_record_is_deletion(r))
 		return 0;
 
-	oldh = r->value.update.old_hash;
-	newh = r->value.update.new_hash;
-	if (!oldh) {
-		oldh = zero;
-	}
-	if (!newh) {
-		newh = zero;
-	}
-
 	if (s.len < 2 * hash_size)
 		return -1;
 
-	memcpy(s.buf, oldh, hash_size);
-	memcpy(s.buf + hash_size, newh, hash_size);
+	memcpy(s.buf, r->value.update.old_hash, hash_size);
+	memcpy(s.buf + hash_size, r->value.update.new_hash, hash_size);
 	string_view_consume(&s, 2 * hash_size);
 
 	n = encode_string(r->value.update.name ? r->value.update.name : "", s);
@@ -830,19 +866,18 @@ static int reftable_log_record_encode(const void *rec, struct string_view s,
 
 static int reftable_log_record_decode(void *rec, struct strbuf key,
 				      uint8_t val_type, struct string_view in,
-				      int hash_size)
+				      int hash_size, struct strbuf *scratch)
 {
 	struct string_view start = in;
 	struct reftable_log_record *r = rec;
 	uint64_t max = 0;
 	uint64_t ts = 0;
-	struct strbuf dest = STRBUF_INIT;
 	int n;
 
 	if (key.len <= 9 || key.buf[key.len - 9] != 0)
 		return REFTABLE_FORMAT_ERROR;
 
-	r->refname = reftable_realloc(r->refname, key.len - 8);
+	REFTABLE_ALLOC_GROW(r->refname, key.len - 8, r->refname_cap);
 	memcpy(r->refname, key.buf, key.len - 8);
 	ts = get_be64(key.buf + key.len - 8);
 
@@ -851,9 +886,8 @@ static int reftable_log_record_decode(void *rec, struct strbuf key,
 	if (val_type != r->value_type) {
 		switch (r->value_type) {
 		case REFTABLE_LOG_UPDATE:
-			FREE_AND_NULL(r->value.update.old_hash);
-			FREE_AND_NULL(r->value.update.new_hash);
 			FREE_AND_NULL(r->value.update.message);
+			r->value.update.message_cap = 0;
 			FREE_AND_NULL(r->value.update.email);
 			FREE_AND_NULL(r->value.update.name);
 			break;
@@ -869,36 +903,43 @@ static int reftable_log_record_decode(void *rec, struct strbuf key,
 	if (in.len < 2 * hash_size)
 		return REFTABLE_FORMAT_ERROR;
 
-	r->value.update.old_hash =
-		reftable_realloc(r->value.update.old_hash, hash_size);
-	r->value.update.new_hash =
-		reftable_realloc(r->value.update.new_hash, hash_size);
-
 	memcpy(r->value.update.old_hash, in.buf, hash_size);
 	memcpy(r->value.update.new_hash, in.buf + hash_size, hash_size);
 
 	string_view_consume(&in, 2 * hash_size);
 
-	n = decode_string(&dest, in);
+	n = decode_string(scratch, in);
 	if (n < 0)
 		goto done;
 	string_view_consume(&in, n);
 
-	r->value.update.name =
-		reftable_realloc(r->value.update.name, dest.len + 1);
-	memcpy(r->value.update.name, dest.buf, dest.len);
-	r->value.update.name[dest.len] = 0;
+	/*
+	 * In almost all cases we can expect the reflog name to not change for
+	 * reflog entries as they are tied to the local identity, not to the
+	 * target commits. As an optimization for this common case we can thus
+	 * skip copying over the name in case it's accurate already.
+	 */
+	if (!r->value.update.name ||
+	    strcmp(r->value.update.name, scratch->buf)) {
+		r->value.update.name =
+			reftable_realloc(r->value.update.name, scratch->len + 1);
+		memcpy(r->value.update.name, scratch->buf, scratch->len);
+		r->value.update.name[scratch->len] = 0;
+	}
 
-	strbuf_reset(&dest);
-	n = decode_string(&dest, in);
+	n = decode_string(scratch, in);
 	if (n < 0)
 		goto done;
 	string_view_consume(&in, n);
 
-	r->value.update.email =
-		reftable_realloc(r->value.update.email, dest.len + 1);
-	memcpy(r->value.update.email, dest.buf, dest.len);
-	r->value.update.email[dest.len] = 0;
+	/* Same as above, but for the reflog email. */
+	if (!r->value.update.email ||
+	    strcmp(r->value.update.email, scratch->buf)) {
+		r->value.update.email =
+			reftable_realloc(r->value.update.email, scratch->len + 1);
+		memcpy(r->value.update.email, scratch->buf, scratch->len);
+		r->value.update.email[scratch->len] = 0;
+	}
 
 	ts = 0;
 	n = get_var_int(&ts, &in);
@@ -912,22 +953,19 @@ static int reftable_log_record_decode(void *rec, struct strbuf key,
 	r->value.update.tz_offset = get_be16(in.buf);
 	string_view_consume(&in, 2);
 
-	strbuf_reset(&dest);
-	n = decode_string(&dest, in);
+	n = decode_string(scratch, in);
 	if (n < 0)
 		goto done;
 	string_view_consume(&in, n);
 
-	r->value.update.message =
-		reftable_realloc(r->value.update.message, dest.len + 1);
-	memcpy(r->value.update.message, dest.buf, dest.len);
-	r->value.update.message[dest.len] = 0;
+	REFTABLE_ALLOC_GROW(r->value.update.message, scratch->len + 1,
+			    r->value.update.message_cap);
+	memcpy(r->value.update.message, scratch->buf, scratch->len);
+	r->value.update.message[scratch->len] = 0;
 
-	strbuf_release(&dest);
 	return start.len - in.len;
 
 done:
-	strbuf_release(&dest);
 	return REFTABLE_FORMAT_ERROR;
 }
 
@@ -943,17 +981,6 @@ static int null_streq(char *a, char *b)
 	return 0 == strcmp(a, b);
 }
 
-static int zero_hash_eq(uint8_t *a, uint8_t *b, int sz)
-{
-	if (!a)
-		a = zero;
-
-	if (!b)
-		b = zero;
-
-	return !memcmp(a, b, sz);
-}
-
 static int reftable_log_record_equal_void(const void *a,
 					  const void *b, int hash_size)
 {
@@ -962,6 +989,22 @@ static int reftable_log_record_equal_void(const void *a,
 					 hash_size);
 }
 
+static int reftable_log_record_cmp_void(const void *_a, const void *_b)
+{
+	const struct reftable_log_record *a = _a;
+	const struct reftable_log_record *b = _b;
+	int cmp = strcmp(a->refname, b->refname);
+	if (cmp)
+		return cmp;
+
+	/*
+	 * Note that the comparison here is reversed. This is because the
+	 * update index is reversed when comparing keys. For reference, see how
+	 * we handle this in reftable_log_record_key()`.
+	 */
+	return b->update_index - a->update_index;
+}
+
 int reftable_log_record_equal(const struct reftable_log_record *a,
 			      const struct reftable_log_record *b, int hash_size)
 {
@@ -981,10 +1024,10 @@ int reftable_log_record_equal(const struct reftable_log_record *a,
 				  b->value.update.email) &&
 		       null_streq(a->value.update.message,
 				  b->value.update.message) &&
-		       zero_hash_eq(a->value.update.old_hash,
-				    b->value.update.old_hash, hash_size) &&
-		       zero_hash_eq(a->value.update.new_hash,
-				    b->value.update.new_hash, hash_size);
+		       !memcmp(a->value.update.old_hash,
+			       b->value.update.old_hash, hash_size) &&
+		       !memcmp(a->value.update.new_hash,
+			       b->value.update.new_hash, hash_size);
 	}
 
 	abort();
@@ -1011,6 +1054,7 @@ static struct reftable_record_vtable reftable_log_record_vtable = {
 	.release = &reftable_log_record_release_void,
 	.is_deletion = &reftable_log_record_is_deletion_void,
 	.equal = &reftable_log_record_equal_void,
+	.cmp = &reftable_log_record_cmp_void,
 	.print = &reftable_log_record_print_void,
 };
 
@@ -1061,7 +1105,7 @@ static int reftable_index_record_encode(const void *rec, struct string_view out,
 
 static int reftable_index_record_decode(void *rec, struct strbuf key,
 					uint8_t val_type, struct string_view in,
-					int hash_size)
+					int hash_size, struct strbuf *scratch UNUSED)
 {
 	struct string_view start = in;
 	struct reftable_index_record *r = rec;
@@ -1086,6 +1130,13 @@ static int reftable_index_record_equal(const void *a, const void *b, int hash_si
 	return ia->offset == ib->offset && !strbuf_cmp(&ia->last_key, &ib->last_key);
 }
 
+static int reftable_index_record_cmp(const void *_a, const void *_b)
+{
+	const struct reftable_index_record *a = _a;
+	const struct reftable_index_record *b = _b;
+	return strbuf_cmp(&a->last_key, &b->last_key);
+}
+
 static void reftable_index_record_print(const void *rec, int hash_size)
 {
 	const struct reftable_index_record *idx = rec;
@@ -1103,6 +1154,7 @@ static struct reftable_record_vtable reftable_index_record_vtable = {
 	.release = &reftable_index_record_release,
 	.is_deletion = &not_a_deletion,
 	.equal = &reftable_index_record_equal,
+	.cmp = &reftable_index_record_cmp,
 	.print = &reftable_index_record_print,
 };
 
@@ -1111,11 +1163,6 @@ void reftable_record_key(struct reftable_record *rec, struct strbuf *dest)
 	reftable_record_vtable(rec)->key(reftable_record_data(rec), dest);
 }
 
-uint8_t reftable_record_type(struct reftable_record *rec)
-{
-	return rec->type;
-}
-
 int reftable_record_encode(struct reftable_record *rec, struct string_view dest,
 			   int hash_size)
 {
@@ -1139,10 +1186,12 @@ uint8_t reftable_record_val_type(struct reftable_record *rec)
 }
 
 int reftable_record_decode(struct reftable_record *rec, struct strbuf key,
-			   uint8_t extra, struct string_view src, int hash_size)
+			   uint8_t extra, struct string_view src, int hash_size,
+			   struct strbuf *scratch)
 {
 	return reftable_record_vtable(rec)->decode(reftable_record_data(rec),
-						   key, extra, src, hash_size);
+						   key, extra, src, hash_size,
+						   scratch);
 }
 
 void reftable_record_release(struct reftable_record *rec)
@@ -1156,6 +1205,14 @@ int reftable_record_is_deletion(struct reftable_record *rec)
 		reftable_record_data(rec));
 }
 
+int reftable_record_cmp(struct reftable_record *a, struct reftable_record *b)
+{
+	if (a->type != b->type)
+		BUG("cannot compare reftable records of different type");
+	return reftable_record_vtable(a)->cmp(
+		reftable_record_data(a), reftable_record_data(b));
+}
+
 int reftable_record_equal(struct reftable_record *a, struct reftable_record *b, int hash_size)
 {
 	if (a->type != b->type)
@@ -1164,7 +1221,7 @@ int reftable_record_equal(struct reftable_record *a, struct reftable_record *b,
 		reftable_record_data(a), reftable_record_data(b), hash_size);
 }
 
-static int hash_equal(uint8_t *a, uint8_t *b, int hash_size)
+static int hash_equal(const unsigned char *a, const unsigned char *b, int hash_size)
 {
 	if (a && b)
 		return !memcmp(a, b, hash_size);
@@ -1229,12 +1286,6 @@ int reftable_log_record_is_deletion(const struct reftable_log_record *log)
 	return (log->value_type == REFTABLE_LOG_DELETION);
 }
 
-void string_view_consume(struct string_view *s, int n)
-{
-	s->buf += n;
-	s->len -= n;
-}
-
 static void *reftable_record_data(struct reftable_record *rec)
 {
 	switch (rec->type) {
@@ -1266,45 +1317,22 @@ reftable_record_vtable(struct reftable_record *rec)
 	abort();
 }
 
-struct reftable_record reftable_new_record(uint8_t typ)
+void reftable_record_init(struct reftable_record *rec, uint8_t typ)
 {
-	struct reftable_record clean = {
-		.type = typ,
-	};
+	memset(rec, 0, sizeof(*rec));
+	rec->type = typ;
 
-	/* the following is involved, but the naive solution (just return
-	 * `clean` as is, except for BLOCK_TYPE_INDEX), returns a garbage
-	 * clean.u.obj.offsets pointer on Windows VS CI.  Go figure.
-	 */
 	switch (typ) {
-	case BLOCK_TYPE_OBJ:
-	{
-		struct reftable_obj_record obj = { 0 };
-		clean.u.obj = obj;
-		break;
-	}
-	case BLOCK_TYPE_INDEX:
-	{
-		struct reftable_index_record idx = {
-			.last_key = STRBUF_INIT,
-		};
-		clean.u.idx = idx;
-		break;
-	}
 	case BLOCK_TYPE_REF:
-	{
-		struct reftable_ref_record ref = { 0 };
-		clean.u.ref = ref;
-		break;
-	}
 	case BLOCK_TYPE_LOG:
-	{
-		struct reftable_log_record log = { 0 };
-		clean.u.log = log;
-		break;
-	}
+	case BLOCK_TYPE_OBJ:
+		return;
+	case BLOCK_TYPE_INDEX:
+		strbuf_init(&rec->u.idx.last_key, 0);
+		return;
+	default:
+		BUG("unhandled record type");
 	}
-	return clean;
 }
 
 void reftable_record_print(struct reftable_record *rec, int hash_size)
diff --git a/reftable/record.h b/reftable/record.h
index fd80cd4..d778133 100644
--- a/reftable/record.h
+++ b/reftable/record.h
@@ -25,7 +25,11 @@ struct string_view {
 };
 
 /* Advance `s.buf` by `n`, and decrease length. */
-void string_view_consume(struct string_view *s, int n);
+static inline void string_view_consume(struct string_view *s, int n)
+{
+	s->buf += n;
+	s->len -= n;
+}
 
 /* utilities for de/encoding varints */
 
@@ -51,7 +55,8 @@ struct reftable_record_vtable {
 
 	/* decode data from `src` into the record. */
 	int (*decode)(void *rec, struct strbuf key, uint8_t extra,
-		      struct string_view src, int hash_size);
+		      struct string_view src, int hash_size,
+		      struct strbuf *scratch);
 
 	/* deallocate and null the record. */
 	void (*release)(void *rec);
@@ -62,6 +67,12 @@ struct reftable_record_vtable {
 	/* Are two records equal? This assumes they have the same type. Returns 0 for non-equal. */
 	int (*equal)(const void *a, const void *b, int hash_size);
 
+	/*
+	 * Compare keys of two records with each other. The records must have
+	 * the same type.
+	 */
+	int (*cmp)(const void *a, const void *b);
+
 	/* Print on stdout, for debugging. */
 	void (*print)(const void *rec, int hash_size);
 };
@@ -69,18 +80,24 @@ struct reftable_record_vtable {
 /* returns true for recognized block types. Block start with the block type. */
 int reftable_is_block_type(uint8_t typ);
 
-/* return an initialized record for the given type */
-struct reftable_record reftable_new_record(uint8_t typ);
-
 /* Encode `key` into `dest`. Sets `is_restart` to indicate a restart. Returns
  * number of bytes written. */
 int reftable_encode_key(int *is_restart, struct string_view dest,
 			struct strbuf prev_key, struct strbuf key,
 			uint8_t extra);
 
-/* Decode into `key` and `extra` from `in` */
-int reftable_decode_key(struct strbuf *key, uint8_t *extra,
-			struct strbuf last_key, struct string_view in);
+/* Decode a record's key lengths. */
+int reftable_decode_keylen(struct string_view in,
+			   uint64_t *prefix_len,
+			   uint64_t *suffix_len,
+			   uint8_t *extra);
+
+/*
+ * Decode into `last_key` and `extra` from `in`. `last_key` is expected to
+ * contain the decoded key of the preceding record, if any.
+ */
+int reftable_decode_key(struct strbuf *last_key, uint8_t *extra,
+			struct string_view in);
 
 /* reftable_index_record are used internally to speed up lookups. */
 struct reftable_index_record {
@@ -100,8 +117,8 @@ struct reftable_obj_record {
 /* record is a generic wrapper for different types of records. It is normally
  * created on the stack, or embedded within another struct. If the type is
  * known, a fresh instance can be initialized explicitly. Otherwise, use
- * reftable_new_record() to initialize generically (as the index_record is not
- * valid as 0-initialized structure)
+ * `reftable_record_init()` to initialize generically (as the index_record is
+ * not valid as 0-initialized structure)
  */
 struct reftable_record {
 	uint8_t type;
@@ -113,11 +130,14 @@ struct reftable_record {
 	} u;
 };
 
+/* Initialize the reftable record for the given type */
+void reftable_record_init(struct reftable_record *rec, uint8_t typ);
+
 /* see struct record_vtable */
+int reftable_record_cmp(struct reftable_record *a, struct reftable_record *b);
 int reftable_record_equal(struct reftable_record *a, struct reftable_record *b, int hash_size);
 void reftable_record_print(struct reftable_record *rec, int hash_size);
 void reftable_record_key(struct reftable_record *rec, struct strbuf *dest);
-uint8_t reftable_record_type(struct reftable_record *rec);
 void reftable_record_copy_from(struct reftable_record *rec,
 			       struct reftable_record *src, int hash_size);
 uint8_t reftable_record_val_type(struct reftable_record *rec);
@@ -125,9 +145,14 @@ int reftable_record_encode(struct reftable_record *rec, struct string_view dest,
 			   int hash_size);
 int reftable_record_decode(struct reftable_record *rec, struct strbuf key,
 			   uint8_t extra, struct string_view src,
-			   int hash_size);
+			   int hash_size, struct strbuf *scratch);
 int reftable_record_is_deletion(struct reftable_record *rec);
 
+static inline uint8_t reftable_record_type(struct reftable_record *rec)
+{
+	return rec->type;
+}
+
 /* frees and zeroes out the embedded record */
 void reftable_record_release(struct reftable_record *rec);
 
diff --git a/reftable/record_test.c b/reftable/record_test.c
index 70ae78f..c158ee7 100644
--- a/reftable/record_test.c
+++ b/reftable/record_test.c
@@ -16,11 +16,11 @@
 
 static void test_copy(struct reftable_record *rec)
 {
-	struct reftable_record copy = { 0 };
+	struct reftable_record copy;
 	uint8_t typ;
 
 	typ = reftable_record_type(rec);
-	copy = reftable_new_record(typ);
+	reftable_record_init(&copy, typ);
 	reftable_record_copy_from(&copy, rec, GIT_SHA1_RAWSZ);
 	/* do it twice to catch memory leaks */
 	reftable_record_copy_from(&copy, rec, GIT_SHA1_RAWSZ);
@@ -99,6 +99,7 @@ static void set_hash(uint8_t *h, int j)
 
 static void test_reftable_ref_record_roundtrip(void)
 {
+	struct strbuf scratch = STRBUF_INIT;
 	int i = 0;
 
 	for (i = REFTABLE_REF_DELETION; i < REFTABLE_NR_REF_VALUETYPES; i++) {
@@ -119,15 +120,10 @@ static void test_reftable_ref_record_roundtrip(void)
 		case REFTABLE_REF_DELETION:
 			break;
 		case REFTABLE_REF_VAL1:
-			in.u.ref.value.val1 = reftable_malloc(GIT_SHA1_RAWSZ);
 			set_hash(in.u.ref.value.val1, 1);
 			break;
 		case REFTABLE_REF_VAL2:
-			in.u.ref.value.val2.value =
-				reftable_malloc(GIT_SHA1_RAWSZ);
 			set_hash(in.u.ref.value.val2.value, 1);
-			in.u.ref.value.val2.target_value =
-				reftable_malloc(GIT_SHA1_RAWSZ);
 			set_hash(in.u.ref.value.val2.target_value, 2);
 			break;
 		case REFTABLE_REF_SYMREF:
@@ -145,7 +141,7 @@ static void test_reftable_ref_record_roundtrip(void)
 		EXPECT(n > 0);
 
 		/* decode into a non-zero reftable_record to test for leaks. */
-		m = reftable_record_decode(&out, key, i, dest, GIT_SHA1_RAWSZ);
+		m = reftable_record_decode(&out, key, i, dest, GIT_SHA1_RAWSZ, &scratch);
 		EXPECT(n == m);
 
 		EXPECT(reftable_ref_record_equal(&in.u.ref, &out.u.ref,
@@ -155,6 +151,8 @@ static void test_reftable_ref_record_roundtrip(void)
 		strbuf_release(&key);
 		reftable_record_release(&out);
 	}
+
+	strbuf_release(&scratch);
 }
 
 static void test_reftable_log_record_equal(void)
@@ -180,7 +178,6 @@ static void test_reftable_log_record_equal(void)
 static void test_reftable_log_record_roundtrip(void)
 {
 	int i;
-
 	struct reftable_log_record in[] = {
 		{
 			.refname = xstrdup("refs/heads/master"),
@@ -188,8 +185,6 @@ static void test_reftable_log_record_roundtrip(void)
 			.value_type = REFTABLE_LOG_UPDATE,
 			.value = {
 				.update = {
-					.old_hash = reftable_malloc(GIT_SHA1_RAWSZ),
-					.new_hash = reftable_malloc(GIT_SHA1_RAWSZ),
 					.name = xstrdup("han-wen"),
 					.email = xstrdup("hanwen@google.com"),
 					.message = xstrdup("test"),
@@ -207,15 +202,10 @@ static void test_reftable_log_record_roundtrip(void)
 			.refname = xstrdup("branch"),
 			.update_index = 33,
 			.value_type = REFTABLE_LOG_UPDATE,
-			.value = {
-				.update = {
-					.old_hash = reftable_malloc(GIT_SHA1_RAWSZ),
-					.new_hash = reftable_malloc(GIT_SHA1_RAWSZ),
-					/* rest of fields left empty. */
-				},
-			},
 		}
 	};
+	struct strbuf scratch = STRBUF_INIT;
+
 	set_test_hash(in[0].value.update.new_hash, 1);
 	set_test_hash(in[0].value.update.old_hash, 2);
 	set_test_hash(in[2].value.update.new_hash, 3);
@@ -236,8 +226,6 @@ static void test_reftable_log_record_roundtrip(void)
 				.value_type = REFTABLE_LOG_UPDATE,
 				.value = {
 					.update = {
-						.new_hash = reftable_calloc(GIT_SHA1_RAWSZ),
-						.old_hash = reftable_calloc(GIT_SHA1_RAWSZ),
 						.name = xstrdup("old name"),
 						.email = xstrdup("old@email"),
 						.message = xstrdup("old message"),
@@ -257,7 +245,7 @@ static void test_reftable_log_record_roundtrip(void)
 		EXPECT(n >= 0);
 		valtype = reftable_record_val_type(&rec);
 		m = reftable_record_decode(&out, key, valtype, dest,
-					   GIT_SHA1_RAWSZ);
+					   GIT_SHA1_RAWSZ, &scratch);
 		EXPECT(n == m);
 
 		EXPECT(reftable_log_record_equal(&in[i], &out.u.log,
@@ -266,6 +254,8 @@ static void test_reftable_log_record_roundtrip(void)
 		strbuf_release(&key);
 		reftable_record_release(&out);
 	}
+
+	strbuf_release(&scratch);
 }
 
 static void test_u24_roundtrip(void)
@@ -300,7 +290,8 @@ static void test_key_roundtrip(void)
 	EXPECT(!restart);
 	EXPECT(n > 0);
 
-	m = reftable_decode_key(&roundtrip, &rt_extra, last_key, dest);
+	strbuf_addstr(&roundtrip, "refs/heads/master");
+	m = reftable_decode_key(&roundtrip, &rt_extra, dest);
 	EXPECT(n == m);
 	EXPECT(0 == strbuf_cmp(&key, &roundtrip));
 	EXPECT(rt_extra == extra);
@@ -314,23 +305,27 @@ static void test_reftable_obj_record_roundtrip(void)
 {
 	uint8_t testHash1[GIT_SHA1_RAWSZ] = { 1, 2, 3, 4, 0 };
 	uint64_t till9[] = { 1, 2, 3, 4, 500, 600, 700, 800, 9000 };
-	struct reftable_obj_record recs[3] = { {
-						       .hash_prefix = testHash1,
-						       .hash_prefix_len = 5,
-						       .offsets = till9,
-						       .offset_len = 3,
-					       },
-					       {
-						       .hash_prefix = testHash1,
-						       .hash_prefix_len = 5,
-						       .offsets = till9,
-						       .offset_len = 9,
-					       },
-					       {
-						       .hash_prefix = testHash1,
-						       .hash_prefix_len = 5,
-					       } };
+	struct reftable_obj_record recs[3] = {
+		{
+			.hash_prefix = testHash1,
+			.hash_prefix_len = 5,
+			.offsets = till9,
+			.offset_len = 3,
+		},
+		{
+			.hash_prefix = testHash1,
+			.hash_prefix_len = 5,
+			.offsets = till9,
+			.offset_len = 9,
+		},
+		{
+			.hash_prefix = testHash1,
+			.hash_prefix_len = 5,
+		},
+	};
+	struct strbuf scratch = STRBUF_INIT;
 	int i = 0;
+
 	for (i = 0; i < ARRAY_SIZE(recs); i++) {
 		uint8_t buffer[1024] = { 0 };
 		struct string_view dest = {
@@ -354,13 +349,15 @@ static void test_reftable_obj_record_roundtrip(void)
 		EXPECT(n > 0);
 		extra = reftable_record_val_type(&in);
 		m = reftable_record_decode(&out, key, extra, dest,
-					   GIT_SHA1_RAWSZ);
+					   GIT_SHA1_RAWSZ, &scratch);
 		EXPECT(n == m);
 
 		EXPECT(reftable_record_equal(&in, &out, GIT_SHA1_RAWSZ));
 		strbuf_release(&key);
 		reftable_record_release(&out);
 	}
+
+	strbuf_release(&scratch);
 }
 
 static void test_reftable_index_record_roundtrip(void)
@@ -377,6 +374,7 @@ static void test_reftable_index_record_roundtrip(void)
 		.buf = buffer,
 		.len = sizeof(buffer),
 	};
+	struct strbuf scratch = STRBUF_INIT;
 	struct strbuf key = STRBUF_INIT;
 	struct reftable_record out = {
 		.type = BLOCK_TYPE_INDEX,
@@ -394,13 +392,15 @@ static void test_reftable_index_record_roundtrip(void)
 	EXPECT(n > 0);
 
 	extra = reftable_record_val_type(&in);
-	m = reftable_record_decode(&out, key, extra, dest, GIT_SHA1_RAWSZ);
+	m = reftable_record_decode(&out, key, extra, dest, GIT_SHA1_RAWSZ,
+				   &scratch);
 	EXPECT(m == n);
 
 	EXPECT(reftable_record_equal(&in, &out, GIT_SHA1_RAWSZ));
 
 	reftable_record_release(&out);
 	strbuf_release(&key);
+	strbuf_release(&scratch);
 	strbuf_release(&in.u.idx.last_key);
 }
 
diff --git a/reftable/refname.c b/reftable/refname.c
index 9573496..bbfde15 100644
--- a/reftable/refname.c
+++ b/reftable/refname.c
@@ -12,15 +12,15 @@
 #include "refname.h"
 #include "reftable-iterator.h"
 
-struct find_arg {
-	char **names;
-	const char *want;
+struct refname_needle_lesseq_args {
+	char **haystack;
+	const char *needle;
 };
 
-static int find_name(size_t k, void *arg)
+static int refname_needle_lesseq(size_t k, void *_args)
 {
-	struct find_arg *f_arg = arg;
-	return strcmp(f_arg->names[k], f_arg->want) >= 0;
+	struct refname_needle_lesseq_args *args = _args;
+	return strcmp(args->needle, args->haystack[k]) <= 0;
 }
 
 static int modification_has_ref(struct modification *mod, const char *name)
@@ -29,25 +29,23 @@ static int modification_has_ref(struct modification *mod, const char *name)
 	int err = 0;
 
 	if (mod->add_len > 0) {
-		struct find_arg arg = {
-			.names = mod->add,
-			.want = name,
+		struct refname_needle_lesseq_args args = {
+			.haystack = mod->add,
+			.needle = name,
 		};
-		int idx = binsearch(mod->add_len, find_name, &arg);
-		if (idx < mod->add_len && !strcmp(mod->add[idx], name)) {
+		size_t idx = binsearch(mod->add_len, refname_needle_lesseq, &args);
+		if (idx < mod->add_len && !strcmp(mod->add[idx], name))
 			return 0;
-		}
 	}
 
 	if (mod->del_len > 0) {
-		struct find_arg arg = {
-			.names = mod->del,
-			.want = name,
+		struct refname_needle_lesseq_args args = {
+			.haystack = mod->del,
+			.needle = name,
 		};
-		int idx = binsearch(mod->del_len, find_name, &arg);
-		if (idx < mod->del_len && !strcmp(mod->del[idx], name)) {
+		size_t idx = binsearch(mod->del_len, refname_needle_lesseq, &args);
+		if (idx < mod->del_len && !strcmp(mod->del[idx], name))
 			return 1;
-		}
 	}
 
 	err = reftable_table_read_ref(&mod->tab, name, &ref);
@@ -73,11 +71,11 @@ static int modification_has_ref_with_prefix(struct modification *mod,
 	int err = 0;
 
 	if (mod->add_len > 0) {
-		struct find_arg arg = {
-			.names = mod->add,
-			.want = prefix,
+		struct refname_needle_lesseq_args args = {
+			.haystack = mod->add,
+			.needle = prefix,
 		};
-		int idx = binsearch(mod->add_len, find_name, &arg);
+		size_t idx = binsearch(mod->add_len, refname_needle_lesseq, &args);
 		if (idx < mod->add_len &&
 		    !strncmp(prefix, mod->add[idx], strlen(prefix)))
 			goto done;
@@ -92,15 +90,14 @@ static int modification_has_ref_with_prefix(struct modification *mod,
 			goto done;
 
 		if (mod->del_len > 0) {
-			struct find_arg arg = {
-				.names = mod->del,
-				.want = ref.refname,
+			struct refname_needle_lesseq_args args = {
+				.haystack = mod->del,
+				.needle = ref.refname,
 			};
-			int idx = binsearch(mod->del_len, find_name, &arg);
+			size_t idx = binsearch(mod->del_len, refname_needle_lesseq, &args);
 			if (idx < mod->del_len &&
-			    !strcmp(ref.refname, mod->del[idx])) {
+			    !strcmp(ref.refname, mod->del[idx]))
 				continue;
-			}
 		}
 
 		if (strncmp(ref.refname, prefix, strlen(prefix))) {
@@ -140,8 +137,8 @@ int validate_ref_record_addition(struct reftable_table tab,
 {
 	struct modification mod = {
 		.tab = tab,
-		.add = reftable_calloc(sizeof(char *) * sz),
-		.del = reftable_calloc(sizeof(char *) * sz),
+		.add = reftable_calloc(sz, sizeof(*mod.add)),
+		.del = reftable_calloc(sz, sizeof(*mod.del)),
 	};
 	int i = 0;
 	int err = 0;
diff --git a/reftable/refname_test.c b/reftable/refname_test.c
index 8645cd9..b9cc625 100644
--- a/reftable/refname_test.c
+++ b/reftable/refname_test.c
@@ -9,7 +9,6 @@ https://developers.google.com/open-source/licenses/bsd
 #include "basics.h"
 #include "block.h"
 #include "blocksource.h"
-#include "constants.h"
 #include "reader.h"
 #include "record.h"
 #include "refname.h"
@@ -31,7 +30,7 @@ static void test_conflict(void)
 	struct reftable_write_options opts = { 0 };
 	struct strbuf buf = STRBUF_INIT;
 	struct reftable_writer *w =
-		reftable_new_writer(&strbuf_add_void, &buf, &opts);
+		reftable_new_writer(&strbuf_add_void, &noop_flush, &buf, &opts);
 	struct reftable_ref_record rec = {
 		.refname = "a/b",
 		.value_type = REFTABLE_REF_SYMREF,
diff --git a/reftable/reftable-error.h b/reftable/reftable-error.h
index 4c457aa..e9b07c9 100644
--- a/reftable/reftable-error.h
+++ b/reftable/reftable-error.h
@@ -25,7 +25,7 @@ enum reftable_error {
 	 */
 	REFTABLE_NOT_EXIST_ERROR = -4,
 
-	/* Trying to write out-of-date data. */
+	/* Trying to access locked data. */
 	REFTABLE_LOCK_ERROR = -5,
 
 	/* Misuse of the API:
@@ -57,6 +57,9 @@ enum reftable_error {
 	/* Entry does not fit. This can happen when writing outsize reflog
 	   messages. */
 	REFTABLE_ENTRY_TOO_BIG_ERROR = -11,
+
+	/* Trying to write out-of-date data. */
+	REFTABLE_OUTDATED_ERROR = -12,
 };
 
 /* convert the numeric error code to a string. The string should not be
diff --git a/reftable/reftable-merged.h b/reftable/reftable-merged.h
index 1a6d169..c91a2d8 100644
--- a/reftable/reftable-merged.h
+++ b/reftable/reftable-merged.h
@@ -33,7 +33,7 @@ struct reftable_table;
    the stack array.
 */
 int reftable_new_merged_table(struct reftable_merged_table **dest,
-			      struct reftable_table *stack, int n,
+			      struct reftable_table *stack, size_t n,
 			      uint32_t hash_id);
 
 /* returns an iterator positioned just before 'name' */
diff --git a/reftable/reftable-record.h b/reftable/reftable-record.h
index 67104f8..2a2943c 100644
--- a/reftable/reftable-record.h
+++ b/reftable/reftable-record.h
@@ -9,6 +9,7 @@ https://developers.google.com/open-source/licenses/bsd
 #ifndef REFTABLE_RECORD_H
 #define REFTABLE_RECORD_H
 
+#include "hash-ll.h"
 #include <stdint.h>
 
 /*
@@ -21,6 +22,7 @@ https://developers.google.com/open-source/licenses/bsd
 /* reftable_ref_record holds a ref database entry target_value */
 struct reftable_ref_record {
 	char *refname; /* Name of the ref, malloced. */
+	size_t refname_cap;
 	uint64_t update_index; /* Logical timestamp at which this value is
 				* written */
 
@@ -38,10 +40,10 @@ struct reftable_ref_record {
 #define REFTABLE_NR_REF_VALUETYPES 4
 	} value_type;
 	union {
-		uint8_t *val1; /* malloced hash. */
+		unsigned char val1[GIT_MAX_RAWSZ];
 		struct {
-			uint8_t *value; /* first value, malloced hash  */
-			uint8_t *target_value; /* second value, malloced hash */
+			unsigned char value[GIT_MAX_RAWSZ]; /* first hash  */
+			unsigned char target_value[GIT_MAX_RAWSZ]; /* second hash */
 		} val2;
 		char *symref; /* referent, malloced 0-terminated string */
 	} value;
@@ -49,11 +51,11 @@ struct reftable_ref_record {
 
 /* Returns the first hash, or NULL if `rec` is not of type
  * REFTABLE_REF_VAL1 or REFTABLE_REF_VAL2. */
-uint8_t *reftable_ref_record_val1(const struct reftable_ref_record *rec);
+const unsigned char *reftable_ref_record_val1(const struct reftable_ref_record *rec);
 
 /* Returns the second hash, or NULL if `rec` is not of type
  * REFTABLE_REF_VAL2. */
-uint8_t *reftable_ref_record_val2(const struct reftable_ref_record *rec);
+const unsigned char *reftable_ref_record_val2(const struct reftable_ref_record *rec);
 
 /* returns whether 'ref' represents a deletion */
 int reftable_ref_record_is_deletion(const struct reftable_ref_record *ref);
@@ -72,6 +74,7 @@ int reftable_ref_record_equal(const struct reftable_ref_record *a,
 /* reftable_log_record holds a reflog entry */
 struct reftable_log_record {
 	char *refname;
+	size_t refname_cap;
 	uint64_t update_index; /* logical timestamp of a transactional update.
 				*/
 
@@ -86,13 +89,14 @@ struct reftable_log_record {
 
 	union {
 		struct {
-			uint8_t *new_hash;
-			uint8_t *old_hash;
+			unsigned char new_hash[GIT_MAX_RAWSZ];
+			unsigned char old_hash[GIT_MAX_RAWSZ];
 			char *name;
 			char *email;
 			uint64_t time;
 			int16_t tz_offset;
 			char *message;
+			size_t message_cap;
 		} update;
 	} value;
 };
diff --git a/reftable/reftable-writer.h b/reftable/reftable-writer.h
index db8de19..155bf0b 100644
--- a/reftable/reftable-writer.h
+++ b/reftable/reftable-writer.h
@@ -46,6 +46,9 @@ struct reftable_write_options {
 	 *   is a single line, and add '\n' if missing.
 	 */
 	unsigned exact_log_message : 1;
+
+	/* boolean: Prevent auto-compaction of tables. */
+	unsigned disable_auto_compact : 1;
 };
 
 /* reftable_block_stats holds statistics for a single block type */
@@ -88,6 +91,7 @@ struct reftable_stats {
 /* reftable_new_writer creates a new writer */
 struct reftable_writer *
 reftable_new_writer(ssize_t (*writer_func)(void *, const void *, size_t),
+		    int (*flush_func)(void *),
 		    void *writer_arg, struct reftable_write_options *opts);
 
 /* Set the range of update indices for the records we will add. When writing a
diff --git a/reftable/stack.c b/reftable/stack.c
index ddbdf1b..80266bc 100644
--- a/reftable/stack.c
+++ b/reftable/stack.c
@@ -8,6 +8,7 @@ https://developers.google.com/open-source/licenses/bsd
 
 #include "stack.h"
 
+#include "../write-or-die.h"
 #include "system.h"
 #include "merged.h"
 #include "reader.h"
@@ -16,13 +17,15 @@ https://developers.google.com/open-source/licenses/bsd
 #include "reftable-record.h"
 #include "reftable-merged.h"
 #include "writer.h"
+#include "tempfile.h"
 
 static int stack_try_add(struct reftable_stack *st,
 			 int (*write_table)(struct reftable_writer *wr,
 					    void *arg),
 			 void *arg);
 static int stack_write_compact(struct reftable_stack *st,
-			       struct reftable_writer *wr, int first, int last,
+			       struct reftable_writer *wr,
+			       size_t first, size_t last,
 			       struct reftable_log_expiry_config *config);
 static int stack_check_addition(struct reftable_stack *st,
 				const char *new_tab_name);
@@ -42,14 +45,20 @@ static void stack_filename(struct strbuf *dest, struct reftable_stack *st,
 static ssize_t reftable_fd_write(void *arg, const void *data, size_t sz)
 {
 	int *fdp = (int *)arg;
-	return write(*fdp, data, sz);
+	return write_in_full(*fdp, data, sz);
+}
+
+static int reftable_fd_flush(void *arg)
+{
+	int *fdp = (int *)arg;
+
+	return fsync_component(FSYNC_COMPONENT_REFERENCE, *fdp);
 }
 
 int reftable_new_stack(struct reftable_stack **dest, const char *dir,
 		       struct reftable_write_options config)
 {
-	struct reftable_stack *p =
-		reftable_calloc(sizeof(struct reftable_stack));
+	struct reftable_stack *p = reftable_calloc(1, sizeof(*p));
 	struct strbuf list_file_name = STRBUF_INIT;
 	int err = 0;
 
@@ -64,6 +73,7 @@ int reftable_new_stack(struct reftable_stack **dest, const char *dir,
 	strbuf_addstr(&list_file_name, "/tables.list");
 
 	p->list_file = strbuf_detach(&list_file_name, NULL);
+	p->list_fd = -1;
 	p->reftable_dir = xstrdup(dir);
 	p->config = config;
 
@@ -91,8 +101,8 @@ static int fd_read_lines(int fd, char ***namesp)
 		goto done;
 	}
 
-	buf = reftable_malloc(size + 1);
-	if (read(fd, buf, size) != size) {
+	REFTABLE_ALLOC_ARRAY(buf, size + 1);
+	if (read_in_full(fd, buf, size) != size) {
 		err = REFTABLE_IO_ERROR;
 		goto done;
 	}
@@ -111,7 +121,7 @@ int read_lines(const char *filename, char ***namesp)
 	int err = 0;
 	if (fd < 0) {
 		if (errno == ENOENT) {
-			*namesp = reftable_calloc(sizeof(char *));
+			REFTABLE_CALLOC_ARRAY(*namesp, 1);
 			return 0;
 		}
 
@@ -173,6 +183,12 @@ void reftable_stack_destroy(struct reftable_stack *st)
 		st->readers_len = 0;
 		FREE_AND_NULL(st->readers);
 	}
+
+	if (st->list_fd >= 0) {
+		close(st->list_fd);
+		st->list_fd = -1;
+	}
+
 	FREE_AND_NULL(st->list_file);
 	FREE_AND_NULL(st->reftable_dir);
 	reftable_free(st);
@@ -182,8 +198,7 @@ void reftable_stack_destroy(struct reftable_stack *st)
 static struct reftable_reader **stack_copy_readers(struct reftable_stack *st,
 						   int cur_len)
 {
-	struct reftable_reader **cur =
-		reftable_calloc(sizeof(struct reftable_reader *) * cur_len);
+	struct reftable_reader **cur = reftable_calloc(cur_len, sizeof(*cur));
 	int i = 0;
 	for (i = 0; i < cur_len; i++) {
 		cur[i] = st->readers[i];
@@ -194,17 +209,18 @@ static struct reftable_reader **stack_copy_readers(struct reftable_stack *st,
 static int reftable_stack_reload_once(struct reftable_stack *st, char **names,
 				      int reuse_open)
 {
-	int cur_len = !st->merged ? 0 : st->merged->stack_len;
+	size_t cur_len = !st->merged ? 0 : st->merged->stack_len;
 	struct reftable_reader **cur = stack_copy_readers(st, cur_len);
-	int err = 0;
-	int names_len = names_length(names);
+	size_t names_len = names_length(names);
 	struct reftable_reader **new_readers =
-		reftable_calloc(sizeof(struct reftable_reader *) * names_len);
+		reftable_calloc(names_len, sizeof(*new_readers));
 	struct reftable_table *new_tables =
-		reftable_calloc(sizeof(struct reftable_table) * names_len);
-	int new_readers_len = 0;
+		reftable_calloc(names_len, sizeof(*new_tables));
+	size_t new_readers_len = 0;
 	struct reftable_merged_table *new_merged = NULL;
-	int i;
+	struct strbuf table_path = STRBUF_INIT;
+	int err = 0;
+	size_t i;
 
 	while (*names) {
 		struct reftable_reader *rd = NULL;
@@ -212,24 +228,20 @@ static int reftable_stack_reload_once(struct reftable_stack *st, char **names,
 
 		/* this is linear; we assume compaction keeps the number of
 		   tables under control so this is not quadratic. */
-		int j = 0;
-		for (j = 0; reuse_open && j < cur_len; j++) {
-			if (cur[j] && 0 == strcmp(cur[j]->name, name)) {
-				rd = cur[j];
-				cur[j] = NULL;
+		for (i = 0; reuse_open && i < cur_len; i++) {
+			if (cur[i] && 0 == strcmp(cur[i]->name, name)) {
+				rd = cur[i];
+				cur[i] = NULL;
 				break;
 			}
 		}
 
 		if (!rd) {
 			struct reftable_block_source src = { NULL };
-			struct strbuf table_path = STRBUF_INIT;
 			stack_filename(&table_path, st, name);
 
 			err = reftable_block_source_from_file(&src,
 							      table_path.buf);
-			strbuf_release(&table_path);
-
 			if (err < 0)
 				goto done;
 
@@ -267,16 +279,13 @@ static int reftable_stack_reload_once(struct reftable_stack *st, char **names,
 	for (i = 0; i < cur_len; i++) {
 		if (cur[i]) {
 			const char *name = reader_name(cur[i]);
-			struct strbuf filename = STRBUF_INIT;
-			stack_filename(&filename, st, name);
+			stack_filename(&table_path, st, name);
 
 			reader_close(cur[i]);
 			reftable_reader_free(cur[i]);
 
 			/* On Windows, can only unlink after closing. */
-			unlink(filename.buf);
-
-			strbuf_release(&filename);
+			unlink(table_path.buf);
 		}
 	}
 
@@ -288,6 +297,7 @@ done:
 	reftable_free(new_readers);
 	reftable_free(new_tables);
 	reftable_free(cur);
+	strbuf_release(&table_path);
 	return err;
 }
 
@@ -306,69 +316,134 @@ static int tv_cmp(struct timeval *a, struct timeval *b)
 static int reftable_stack_reload_maybe_reuse(struct reftable_stack *st,
 					     int reuse_open)
 {
-	struct timeval deadline = { 0 };
-	int err = gettimeofday(&deadline, NULL);
+	char **names = NULL, **names_after = NULL;
+	struct timeval deadline;
 	int64_t delay = 0;
-	int tries = 0;
-	if (err < 0)
-		return err;
+	int tries = 0, err;
+	int fd = -1;
 
+	err = gettimeofday(&deadline, NULL);
+	if (err < 0)
+		goto out;
 	deadline.tv_sec += 3;
+
 	while (1) {
-		char **names = NULL;
-		char **names_after = NULL;
-		struct timeval now = { 0 };
-		int err = gettimeofday(&now, NULL);
-		int err2 = 0;
-		if (err < 0) {
-			return err;
-		}
+		struct timeval now;
+
+		err = gettimeofday(&now, NULL);
+		if (err < 0)
+			goto out;
 
-		/* Only look at deadlines after the first few times. This
-		   simplifies debugging in GDB */
+		/*
+		 * Only look at deadlines after the first few times. This
+		 * simplifies debugging in GDB.
+		 */
 		tries++;
-		if (tries > 3 && tv_cmp(&now, &deadline) >= 0) {
-			break;
-		}
+		if (tries > 3 && tv_cmp(&now, &deadline) >= 0)
+			goto out;
 
-		err = read_lines(st->list_file, &names);
-		if (err < 0) {
-			free_names(names);
-			return err;
-		}
-		err = reftable_stack_reload_once(st, names, reuse_open);
-		if (err == 0) {
-			free_names(names);
-			break;
-		}
-		if (err != REFTABLE_NOT_EXIST_ERROR) {
-			free_names(names);
-			return err;
-		}
+		fd = open(st->list_file, O_RDONLY);
+		if (fd < 0) {
+			if (errno != ENOENT) {
+				err = REFTABLE_IO_ERROR;
+				goto out;
+			}
 
-		/* err == REFTABLE_NOT_EXIST_ERROR can be caused by a concurrent
-		   writer. Check if there was one by checking if the name list
-		   changed.
-		*/
-		err2 = read_lines(st->list_file, &names_after);
-		if (err2 < 0) {
-			free_names(names);
-			return err2;
+			REFTABLE_CALLOC_ARRAY(names, 1);
+		} else {
+			err = fd_read_lines(fd, &names);
+			if (err < 0)
+				goto out;
 		}
 
+		err = reftable_stack_reload_once(st, names, reuse_open);
+		if (!err)
+			break;
+		if (err != REFTABLE_NOT_EXIST_ERROR)
+			goto out;
+
+		/*
+		 * REFTABLE_NOT_EXIST_ERROR can be caused by a concurrent
+		 * writer. Check if there was one by checking if the name list
+		 * changed.
+		 */
+		err = read_lines(st->list_file, &names_after);
+		if (err < 0)
+			goto out;
 		if (names_equal(names_after, names)) {
-			free_names(names);
-			free_names(names_after);
-			return err;
+			err = REFTABLE_NOT_EXIST_ERROR;
+			goto out;
 		}
+
 		free_names(names);
+		names = NULL;
 		free_names(names_after);
+		names_after = NULL;
+		close(fd);
+		fd = -1;
 
 		delay = delay + (delay * rand()) / RAND_MAX + 1;
 		sleep_millisec(delay);
 	}
 
-	return 0;
+out:
+	/*
+	 * Invalidate the stat cache. It is sufficient to only close the file
+	 * descriptor and keep the cached stat info because we never use the
+	 * latter when the former is negative.
+	 */
+	if (st->list_fd >= 0) {
+		close(st->list_fd);
+		st->list_fd = -1;
+	}
+
+	/*
+	 * Cache stat information in case it provides a useful signal to us.
+	 * According to POSIX, "The st_ino and st_dev fields taken together
+	 * uniquely identify the file within the system." That being said,
+	 * Windows is not POSIX compliant and we do not have these fields
+	 * available. So the information we have there is insufficient to
+	 * determine whether two file descriptors point to the same file.
+	 *
+	 * While we could fall back to using other signals like the file's
+	 * mtime, those are not sufficient to avoid races. We thus refrain from
+	 * using the stat cache on such systems and fall back to the secondary
+	 * caching mechanism, which is to check whether contents of the file
+	 * have changed.
+	 *
+	 * On other systems which are POSIX compliant we must keep the file
+	 * descriptor open. This is to avoid a race condition where two
+	 * processes access the reftable stack at the same point in time:
+	 *
+	 *   1. A reads the reftable stack and caches its stat info.
+	 *
+	 *   2. B updates the stack, appending a new table to "tables.list".
+	 *      This will both use a new inode and result in a different file
+	 *      size, thus invalidating A's cache in theory.
+	 *
+	 *   3. B decides to auto-compact the stack and merges two tables. The
+	 *      file size now matches what A has cached again. Furthermore, the
+	 *      filesystem may decide to recycle the inode number of the file
+	 *      we have replaced in (2) because it is not in use anymore.
+	 *
+	 *   4. A reloads the reftable stack. Neither the inode number nor the
+	 *      file size changed. If the timestamps did not change either then
+	 *      we think the cached copy of our stack is up-to-date.
+	 *
+	 * By keeping the file descriptor open the inode number cannot be
+	 * recycled, mitigating the race.
+	 */
+	if (!err && fd >= 0 && !fstat(fd, &st->list_st) &&
+	    st->list_st.st_dev && st->list_st.st_ino) {
+		st->list_fd = fd;
+		fd = -1;
+	}
+
+	if (fd >= 0)
+		close(fd);
+	free_names(names);
+	free_names(names_after);
+	return err;
 }
 
 /* -1 = error
@@ -377,8 +452,44 @@ static int reftable_stack_reload_maybe_reuse(struct reftable_stack *st,
 static int stack_uptodate(struct reftable_stack *st)
 {
 	char **names = NULL;
-	int err = read_lines(st->list_file, &names);
+	int err;
 	int i = 0;
+
+	/*
+	 * When we have cached stat information available then we use it to
+	 * verify whether the file has been rewritten.
+	 *
+	 * Note that we explicitly do not want to use `stat_validity_check()`
+	 * and friends here because they may end up not comparing the `st_dev`
+	 * and `st_ino` fields. These functions thus cannot guarantee that we
+	 * indeed still have the same file.
+	 */
+	if (st->list_fd >= 0) {
+		struct stat list_st;
+
+		if (stat(st->list_file, &list_st) < 0) {
+			/*
+			 * It's fine for "tables.list" to not exist. In that
+			 * case, we have to refresh when the loaded stack has
+			 * any readers.
+			 */
+			if (errno == ENOENT)
+				return !!st->readers_len;
+			return REFTABLE_IO_ERROR;
+		}
+
+		/*
+		 * When "tables.list" refers to the same file we can assume
+		 * that it didn't change. This is because we always use
+		 * rename(3P) to update the file and never write to it
+		 * directly.
+		 */
+		if (st->list_st.st_dev == list_st.st_dev &&
+		    st->list_st.st_ino == list_st.st_ino)
+			return 0;
+	}
+
+	err = read_lines(st->list_file, &names);
 	if (err < 0)
 		return err;
 
@@ -418,25 +529,22 @@ int reftable_stack_add(struct reftable_stack *st,
 {
 	int err = stack_try_add(st, write, arg);
 	if (err < 0) {
-		if (err == REFTABLE_LOCK_ERROR) {
+		if (err == REFTABLE_OUTDATED_ERROR) {
 			/* Ignore error return, we want to propagate
-			   REFTABLE_LOCK_ERROR.
+			   REFTABLE_OUTDATED_ERROR.
 			*/
 			reftable_stack_reload(st);
 		}
 		return err;
 	}
 
-	if (!st->disable_auto_compact)
-		return reftable_stack_auto_compact(st);
-
 	return 0;
 }
 
 static void format_name(struct strbuf *dest, uint64_t min, uint64_t max)
 {
 	char buf[100];
-	uint32_t rnd = (uint32_t)rand();
+	uint32_t rnd = (uint32_t)git_rand();
 	snprintf(buf, sizeof(buf), "0x%012" PRIx64 "-0x%012" PRIx64 "-%08x",
 		 min, max, rnd);
 	strbuf_reset(dest);
@@ -444,33 +552,27 @@ static void format_name(struct strbuf *dest, uint64_t min, uint64_t max)
 }
 
 struct reftable_addition {
-	int lock_file_fd;
-	struct strbuf lock_file_name;
+	struct tempfile *lock_file;
 	struct reftable_stack *stack;
 
 	char **new_tables;
-	int new_tables_len;
+	size_t new_tables_len, new_tables_cap;
 	uint64_t next_update_index;
 };
 
-#define REFTABLE_ADDITION_INIT                \
-	{                                     \
-		.lock_file_name = STRBUF_INIT \
-	}
+#define REFTABLE_ADDITION_INIT {0}
 
 static int reftable_stack_init_addition(struct reftable_addition *add,
 					struct reftable_stack *st)
 {
+	struct strbuf lock_file_name = STRBUF_INIT;
 	int err = 0;
 	add->stack = st;
 
-	strbuf_reset(&add->lock_file_name);
-	strbuf_addstr(&add->lock_file_name, st->list_file);
-	strbuf_addstr(&add->lock_file_name, ".lock");
+	strbuf_addf(&lock_file_name, "%s.lock", st->list_file);
 
-	add->lock_file_fd = open(add->lock_file_name.buf,
-				 O_EXCL | O_CREAT | O_WRONLY, 0666);
-	if (add->lock_file_fd < 0) {
+	add->lock_file = create_tempfile(lock_file_name.buf);
+	if (!add->lock_file) {
 		if (errno == EEXIST) {
 			err = REFTABLE_LOCK_ERROR;
 		} else {
@@ -479,7 +581,7 @@ static int reftable_stack_init_addition(struct reftable_addition *add,
 		goto done;
 	}
 	if (st->config.default_permissions) {
-		if (chmod(add->lock_file_name.buf, st->config.default_permissions) < 0) {
+		if (chmod(add->lock_file->filename.buf, st->config.default_permissions) < 0) {
 			err = REFTABLE_IO_ERROR;
 			goto done;
 		}
@@ -488,9 +590,8 @@ static int reftable_stack_init_addition(struct reftable_addition *add,
 	err = stack_uptodate(st);
 	if (err < 0)
 		goto done;
-
-	if (err > 1) {
-		err = REFTABLE_LOCK_ERROR;
+	if (err > 0) {
+		err = REFTABLE_OUTDATED_ERROR;
 		goto done;
 	}
 
@@ -499,13 +600,15 @@ done:
 	if (err) {
 		reftable_addition_close(add);
 	}
+	strbuf_release(&lock_file_name);
 	return err;
 }
 
 static void reftable_addition_close(struct reftable_addition *add)
 {
-	int i = 0;
 	struct strbuf nm = STRBUF_INIT;
+	size_t i;
+
 	for (i = 0; i < add->new_tables_len; i++) {
 		stack_filename(&nm, add->stack, add->new_tables[i]);
 		unlink(nm.buf);
@@ -515,16 +618,9 @@ static void reftable_addition_close(struct reftable_addition *add)
 	reftable_free(add->new_tables);
 	add->new_tables = NULL;
 	add->new_tables_len = 0;
+	add->new_tables_cap = 0;
 
-	if (add->lock_file_fd > 0) {
-		close(add->lock_file_fd);
-		add->lock_file_fd = 0;
-	}
-	if (add->lock_file_name.len > 0) {
-		unlink(add->lock_file_name.buf);
-		strbuf_release(&add->lock_file_name);
-	}
-
+	delete_tempfile(&add->lock_file);
 	strbuf_release(&nm);
 }
 
@@ -540,8 +636,10 @@ void reftable_addition_destroy(struct reftable_addition *add)
 int reftable_addition_commit(struct reftable_addition *add)
 {
 	struct strbuf table_list = STRBUF_INIT;
-	int i = 0;
+	int lock_file_fd = get_tempfile_fd(add->lock_file);
 	int err = 0;
+	size_t i;
+
 	if (add->new_tables_len == 0)
 		goto done;
 
@@ -554,36 +652,48 @@ int reftable_addition_commit(struct reftable_addition *add)
 		strbuf_addstr(&table_list, "\n");
 	}
 
-	err = write(add->lock_file_fd, table_list.buf, table_list.len);
+	err = write_in_full(lock_file_fd, table_list.buf, table_list.len);
 	strbuf_release(&table_list);
 	if (err < 0) {
 		err = REFTABLE_IO_ERROR;
 		goto done;
 	}
 
-	err = close(add->lock_file_fd);
-	add->lock_file_fd = 0;
-	if (err < 0) {
-		err = REFTABLE_IO_ERROR;
-		goto done;
-	}
+	fsync_component_or_die(FSYNC_COMPONENT_REFERENCE, lock_file_fd,
+			       get_tempfile_path(add->lock_file));
 
-	err = rename(add->lock_file_name.buf, add->stack->list_file);
+	err = rename_tempfile(&add->lock_file, add->stack->list_file);
 	if (err < 0) {
 		err = REFTABLE_IO_ERROR;
 		goto done;
 	}
 
 	/* success, no more state to clean up. */
-	strbuf_release(&add->lock_file_name);
-	for (i = 0; i < add->new_tables_len; i++) {
+	for (i = 0; i < add->new_tables_len; i++)
 		reftable_free(add->new_tables[i]);
-	}
 	reftable_free(add->new_tables);
 	add->new_tables = NULL;
 	add->new_tables_len = 0;
+	add->new_tables_cap = 0;
+
+	err = reftable_stack_reload_maybe_reuse(add->stack, 1);
+	if (err)
+		goto done;
+
+	if (!add->stack->config.disable_auto_compact) {
+		/*
+		 * Auto-compact the stack to keep the number of tables in
+		 * control. It is possible that a concurrent writer is already
+		 * trying to compact parts of the stack, which would lead to a
+		 * `REFTABLE_LOCK_ERROR` because parts of the stack are locked
+		 * already. This is a benign error though, so we ignore it.
+		 */
+		err = reftable_stack_auto_compact(add->stack);
+		if (err < 0 && err != REFTABLE_LOCK_ERROR)
+			goto done;
+		err = 0;
+	}
 
-	err = reftable_stack_reload(add->stack);
 done:
 	reftable_addition_close(add);
 	return err;
@@ -594,7 +704,7 @@ int reftable_stack_new_addition(struct reftable_addition **dest,
 {
 	int err = 0;
 	struct reftable_addition empty = REFTABLE_ADDITION_INIT;
-	*dest = reftable_calloc(sizeof(**dest));
+	REFTABLE_CALLOC_ARRAY(*dest, 1);
 	**dest = empty;
 	err = reftable_stack_init_addition(*dest, st);
 	if (err) {
@@ -613,10 +723,6 @@ static int stack_try_add(struct reftable_stack *st,
 	int err = reftable_stack_init_addition(&add, st);
 	if (err < 0)
 		goto done;
-	if (err > 0) {
-		err = REFTABLE_LOCK_ERROR;
-		goto done;
-	}
 
 	err = reftable_addition_add(&add, write_table, arg);
 	if (err < 0)
@@ -637,8 +743,9 @@ int reftable_addition_add(struct reftable_addition *add,
 	struct strbuf tab_file_name = STRBUF_INIT;
 	struct strbuf next_name = STRBUF_INIT;
 	struct reftable_writer *wr = NULL;
+	struct tempfile *tab_file = NULL;
 	int err = 0;
-	int tab_fd = 0;
+	int tab_fd;
 
 	strbuf_reset(&next_name);
 	format_name(&next_name, add->next_update_index, add->next_update_index);
@@ -646,18 +753,21 @@ int reftable_addition_add(struct reftable_addition *add,
 	stack_filename(&temp_tab_file_name, add->stack, next_name.buf);
 	strbuf_addstr(&temp_tab_file_name, ".temp.XXXXXX");
 
-	tab_fd = mkstemp(temp_tab_file_name.buf);
-	if (tab_fd < 0) {
+	tab_file = mks_tempfile(temp_tab_file_name.buf);
+	if (!tab_file) {
 		err = REFTABLE_IO_ERROR;
 		goto done;
 	}
 	if (add->stack->config.default_permissions) {
-		if (chmod(temp_tab_file_name.buf, add->stack->config.default_permissions)) {
+		if (chmod(get_tempfile_path(tab_file),
+			  add->stack->config.default_permissions)) {
 			err = REFTABLE_IO_ERROR;
 			goto done;
 		}
 	}
-	wr = reftable_new_writer(reftable_fd_write, &tab_fd,
+	tab_fd = get_tempfile_fd(tab_file);
+
+	wr = reftable_new_writer(reftable_fd_write, reftable_fd_flush, &tab_fd,
 				 &add->stack->config);
 	err = write_table(wr, arg);
 	if (err < 0)
@@ -671,14 +781,13 @@ int reftable_addition_add(struct reftable_addition *add,
 	if (err < 0)
 		goto done;
 
-	err = close(tab_fd);
-	tab_fd = 0;
+	err = close_tempfile_gently(tab_file);
 	if (err < 0) {
 		err = REFTABLE_IO_ERROR;
 		goto done;
 	}
 
-	err = stack_check_addition(add->stack, temp_tab_file_name.buf);
+	err = stack_check_addition(add->stack, get_tempfile_path(tab_file));
 	if (err < 0)
 		goto done;
 
@@ -689,33 +798,23 @@ int reftable_addition_add(struct reftable_addition *add,
 
 	format_name(&next_name, wr->min_update_index, wr->max_update_index);
 	strbuf_addstr(&next_name, ".ref");
-
 	stack_filename(&tab_file_name, add->stack, next_name.buf);
 
 	/*
 	  On windows, this relies on rand() picking a unique destination name.
 	  Maybe we should do retry loop as well?
 	 */
-	err = rename(temp_tab_file_name.buf, tab_file_name.buf);
+	err = rename_tempfile(&tab_file, tab_file_name.buf);
 	if (err < 0) {
 		err = REFTABLE_IO_ERROR;
 		goto done;
 	}
 
-	add->new_tables = reftable_realloc(add->new_tables,
-					   sizeof(*add->new_tables) *
-						   (add->new_tables_len + 1));
-	add->new_tables[add->new_tables_len] = strbuf_detach(&next_name, NULL);
-	add->new_tables_len++;
+	REFTABLE_ALLOC_GROW(add->new_tables, add->new_tables_len + 1,
+			    add->new_tables_cap);
+	add->new_tables[add->new_tables_len++] = strbuf_detach(&next_name, NULL);
 done:
-	if (tab_fd > 0) {
-		close(tab_fd);
-		tab_fd = 0;
-	}
-	if (temp_tab_file_name.len > 0) {
-		unlink(temp_tab_file_name.buf);
-	}
-
+	delete_tempfile(&tab_file);
 	strbuf_release(&temp_tab_file_name);
 	strbuf_release(&tab_file_name);
 	strbuf_release(&next_name);
@@ -732,66 +831,77 @@ uint64_t reftable_stack_next_update_index(struct reftable_stack *st)
 	return 1;
 }
 
-static int stack_compact_locked(struct reftable_stack *st, int first, int last,
-				struct strbuf *temp_tab,
-				struct reftable_log_expiry_config *config)
+static int stack_compact_locked(struct reftable_stack *st,
+				size_t first, size_t last,
+				struct reftable_log_expiry_config *config,
+				struct tempfile **tab_file_out)
 {
 	struct strbuf next_name = STRBUF_INIT;
-	int tab_fd = -1;
+	struct strbuf tab_file_path = STRBUF_INIT;
 	struct reftable_writer *wr = NULL;
-	int err = 0;
+	struct tempfile *tab_file;
+	int tab_fd, err = 0;
 
 	format_name(&next_name,
 		    reftable_reader_min_update_index(st->readers[first]),
 		    reftable_reader_max_update_index(st->readers[last]));
+	stack_filename(&tab_file_path, st, next_name.buf);
+	strbuf_addstr(&tab_file_path, ".temp.XXXXXX");
 
-	stack_filename(temp_tab, st, next_name.buf);
-	strbuf_addstr(temp_tab, ".temp.XXXXXX");
+	tab_file = mks_tempfile(tab_file_path.buf);
+	if (!tab_file) {
+		err = REFTABLE_IO_ERROR;
+		goto done;
+	}
+	tab_fd = get_tempfile_fd(tab_file);
 
-	tab_fd = mkstemp(temp_tab->buf);
-	wr = reftable_new_writer(reftable_fd_write, &tab_fd, &st->config);
+	if (st->config.default_permissions &&
+	    chmod(get_tempfile_path(tab_file), st->config.default_permissions) < 0) {
+		err = REFTABLE_IO_ERROR;
+		goto done;
+	}
 
+	wr = reftable_new_writer(reftable_fd_write, reftable_fd_flush,
+				 &tab_fd, &st->config);
 	err = stack_write_compact(st, wr, first, last, config);
 	if (err < 0)
 		goto done;
+
 	err = reftable_writer_close(wr);
 	if (err < 0)
 		goto done;
 
-	err = close(tab_fd);
-	tab_fd = 0;
+	err = close_tempfile_gently(tab_file);
+	if (err < 0)
+		goto done;
+
+	*tab_file_out = tab_file;
+	tab_file = NULL;
 
 done:
+	delete_tempfile(&tab_file);
 	reftable_writer_free(wr);
-	if (tab_fd > 0) {
-		close(tab_fd);
-		tab_fd = 0;
-	}
-	if (err != 0 && temp_tab->len > 0) {
-		unlink(temp_tab->buf);
-		strbuf_release(temp_tab);
-	}
 	strbuf_release(&next_name);
+	strbuf_release(&tab_file_path);
 	return err;
 }
 
 static int stack_write_compact(struct reftable_stack *st,
-			       struct reftable_writer *wr, int first, int last,
+			       struct reftable_writer *wr,
+			       size_t first, size_t last,
 			       struct reftable_log_expiry_config *config)
 {
-	int subtabs_len = last - first + 1;
+	size_t subtabs_len = last - first + 1;
 	struct reftable_table *subtabs = reftable_calloc(
-		sizeof(struct reftable_table) * (last - first + 1));
+		last - first + 1, sizeof(*subtabs));
 	struct reftable_merged_table *mt = NULL;
-	int err = 0;
 	struct reftable_iterator it = { NULL };
 	struct reftable_ref_record ref = { NULL };
 	struct reftable_log_record log = { NULL };
-
 	uint64_t entries = 0;
+	int err = 0;
 
-	int i = 0, j = 0;
-	for (i = first, j = 0; i <= last; i++) {
+	for (size_t i = first, j = 0; i <= last; i++) {
 		struct reftable_reader *t = st->readers[i];
 		reftable_table_from_reader(&subtabs[j++], t);
 		st->stats.bytes += t->size;
@@ -816,18 +926,16 @@ static int stack_write_compact(struct reftable_stack *st,
 			err = 0;
 			break;
 		}
-		if (err < 0) {
-			break;
-		}
+		if (err < 0)
+			goto done;
 
 		if (first == 0 && reftable_ref_record_is_deletion(&ref)) {
 			continue;
 		}
 
 		err = reftable_writer_add_ref(wr, &ref);
-		if (err < 0) {
-			break;
-		}
+		if (err < 0)
+			goto done;
 		entries++;
 	}
 	reftable_iterator_destroy(&it);
@@ -842,9 +950,8 @@ static int stack_write_compact(struct reftable_stack *st,
 			err = 0;
 			break;
 		}
-		if (err < 0) {
-			break;
-		}
+		if (err < 0)
+			goto done;
 		if (first == 0 && reftable_log_record_is_deletion(&log)) {
 			continue;
 		}
@@ -860,9 +967,8 @@ static int stack_write_compact(struct reftable_stack *st,
 		}
 
 		err = reftable_writer_add_log(wr, &log);
-		if (err < 0) {
-			break;
-		}
+		if (err < 0)
+			goto done;
 		entries++;
 	}
 
@@ -878,27 +984,28 @@ done:
 	return err;
 }
 
-/* <  0: error. 0 == OK, > 0 attempt failed; could retry. */
-static int stack_compact_range(struct reftable_stack *st, int first, int last,
+/*
+ * Compact all tables in the range `[first, last)` into a single new table.
+ *
+ * This function returns `0` on success or a code `< 0` on failure. When the
+ * stack or any of the tables in the specified range are already locked then
+ * this function returns `REFTABLE_LOCK_ERROR`. This is a benign error that
+ * callers can either ignore, or they may choose to retry compaction after some
+ * amount of time.
+ */
+static int stack_compact_range(struct reftable_stack *st,
+			       size_t first, size_t last,
 			       struct reftable_log_expiry_config *expiry)
 {
-	struct strbuf temp_tab_file_name = STRBUF_INIT;
+	struct strbuf tables_list_buf = STRBUF_INIT;
 	struct strbuf new_table_name = STRBUF_INIT;
-	struct strbuf lock_file_name = STRBUF_INIT;
-	struct strbuf ref_list_contents = STRBUF_INIT;
 	struct strbuf new_table_path = STRBUF_INIT;
-	int err = 0;
-	int have_lock = 0;
-	int lock_file_fd = -1;
-	int compact_count = last - first + 1;
-	char **listp = NULL;
-	char **delete_on_success =
-		reftable_calloc(sizeof(char *) * (compact_count + 1));
-	char **subtable_locks =
-		reftable_calloc(sizeof(char *) * (compact_count + 1));
-	int i = 0;
-	int j = 0;
-	int is_empty_table = 0;
+	struct strbuf table_name = STRBUF_INIT;
+	struct lock_file tables_list_lock = LOCK_INIT;
+	struct lock_file *table_locks = NULL;
+	struct tempfile *new_table = NULL;
+	int is_empty_table = 0, err = 0;
+	size_t i;
 
 	if (first > last || (!expiry && first == last)) {
 		err = 0;
@@ -907,196 +1014,200 @@ static int stack_compact_range(struct reftable_stack *st, int first, int last,
 
 	st->stats.attempts++;
 
-	strbuf_reset(&lock_file_name);
-	strbuf_addstr(&lock_file_name, st->list_file);
-	strbuf_addstr(&lock_file_name, ".lock");
-
-	lock_file_fd =
-		open(lock_file_name.buf, O_EXCL | O_CREAT | O_WRONLY, 0666);
-	if (lock_file_fd < 0) {
-		if (errno == EEXIST) {
-			err = 1;
-		} else {
+	/*
+	 * Hold the lock so that we can read "tables.list" and lock all tables
+	 * which are part of the user-specified range.
+	 */
+	err = hold_lock_file_for_update(&tables_list_lock, st->list_file,
+					LOCK_NO_DEREF);
+	if (err < 0) {
+		if (errno == EEXIST)
+			err = REFTABLE_LOCK_ERROR;
+		else
 			err = REFTABLE_IO_ERROR;
-		}
 		goto done;
 	}
-	/* Don't want to write to the lock for now.  */
-	close(lock_file_fd);
-	lock_file_fd = -1;
 
-	have_lock = 1;
 	err = stack_uptodate(st);
-	if (err != 0)
+	if (err)
 		goto done;
 
-	for (i = first, j = 0; i <= last; i++) {
-		struct strbuf subtab_file_name = STRBUF_INIT;
-		struct strbuf subtab_lock = STRBUF_INIT;
-		int sublock_file_fd = -1;
-
-		stack_filename(&subtab_file_name, st,
-			       reader_name(st->readers[i]));
-
-		strbuf_reset(&subtab_lock);
-		strbuf_addbuf(&subtab_lock, &subtab_file_name);
-		strbuf_addstr(&subtab_lock, ".lock");
-
-		sublock_file_fd = open(subtab_lock.buf,
-				       O_EXCL | O_CREAT | O_WRONLY, 0666);
-		if (sublock_file_fd >= 0) {
-			close(sublock_file_fd);
-		} else if (sublock_file_fd < 0) {
-			if (errno == EEXIST) {
-				err = 1;
-			} else {
+	/*
+	 * Lock all tables in the user-provided range. This is the slice of our
+	 * stack which we'll compact.
+	 */
+	REFTABLE_CALLOC_ARRAY(table_locks, last - first + 1);
+	for (i = first; i <= last; i++) {
+		stack_filename(&table_name, st, reader_name(st->readers[i]));
+
+		err = hold_lock_file_for_update(&table_locks[i - first],
+						table_name.buf, LOCK_NO_DEREF);
+		if (err < 0) {
+			if (errno == EEXIST)
+				err = REFTABLE_LOCK_ERROR;
+			else
 				err = REFTABLE_IO_ERROR;
-			}
+			goto done;
 		}
 
-		subtable_locks[j] = subtab_lock.buf;
-		delete_on_success[j] = subtab_file_name.buf;
-		j++;
-
-		if (err != 0)
+		/*
+		 * We need to close the lockfiles as we might otherwise easily
+		 * run into file descriptor exhaustion when we compress a lot
+		 * of tables.
+		 */
+		err = close_lock_file_gently(&table_locks[i - first]);
+		if (err < 0) {
+			err = REFTABLE_IO_ERROR;
 			goto done;
+		}
 	}
 
-	err = unlink(lock_file_name.buf);
-	if (err < 0)
+	/*
+	 * We have locked all tables in our range and can thus release the
+	 * "tables.list" lock while compacting the locked tables. This allows
+	 * concurrent updates to the stack to proceed.
+	 */
+	err = rollback_lock_file(&tables_list_lock);
+	if (err < 0) {
+		err = REFTABLE_IO_ERROR;
 		goto done;
-	have_lock = 0;
-
-	err = stack_compact_locked(st, first, last, &temp_tab_file_name,
-				   expiry);
-	/* Compaction + tombstones can create an empty table out of non-empty
-	 * tables. */
-	is_empty_table = (err == REFTABLE_EMPTY_TABLE_ERROR);
-	if (is_empty_table) {
-		err = 0;
 	}
-	if (err < 0)
-		goto done;
 
-	lock_file_fd =
-		open(lock_file_name.buf, O_EXCL | O_CREAT | O_WRONLY, 0666);
-	if (lock_file_fd < 0) {
-		if (errno == EEXIST) {
-			err = 1;
-		} else {
+	/*
+	 * Compact the now-locked tables into a new table. Note that compacting
+	 * these tables may end up with an empty new table in case tombstones
+	 * end up cancelling out all refs in that range.
+	 */
+	err = stack_compact_locked(st, first, last, expiry, &new_table);
+	if (err < 0) {
+		if (err != REFTABLE_EMPTY_TABLE_ERROR)
+			goto done;
+		is_empty_table = 1;
+	}
+
+	/*
+	 * Now that we have written the new, compacted table we need to re-lock
+	 * "tables.list". We'll then replace the compacted range of tables with
+	 * the new table.
+	 */
+	err = hold_lock_file_for_update(&tables_list_lock, st->list_file,
+					LOCK_NO_DEREF);
+	if (err < 0) {
+		if (errno == EEXIST)
+			err = REFTABLE_LOCK_ERROR;
+		else
 			err = REFTABLE_IO_ERROR;
-		}
 		goto done;
 	}
-	have_lock = 1;
+
 	if (st->config.default_permissions) {
-		if (chmod(lock_file_name.buf, st->config.default_permissions) < 0) {
+		if (chmod(get_lock_file_path(&tables_list_lock),
+			  st->config.default_permissions) < 0) {
 			err = REFTABLE_IO_ERROR;
 			goto done;
 		}
 	}
 
-	format_name(&new_table_name, st->readers[first]->min_update_index,
-		    st->readers[last]->max_update_index);
-	strbuf_addstr(&new_table_name, ".ref");
-
-	stack_filename(&new_table_path, st, new_table_name.buf);
-
+	/*
+	 * If the resulting compacted table is not empty, then we need to move
+	 * it into place now.
+	 */
 	if (!is_empty_table) {
-		/* retry? */
-		err = rename(temp_tab_file_name.buf, new_table_path.buf);
+		format_name(&new_table_name, st->readers[first]->min_update_index,
+			    st->readers[last]->max_update_index);
+		strbuf_addstr(&new_table_name, ".ref");
+		stack_filename(&new_table_path, st, new_table_name.buf);
+
+		err = rename_tempfile(&new_table, new_table_path.buf);
 		if (err < 0) {
 			err = REFTABLE_IO_ERROR;
 			goto done;
 		}
 	}
 
-	for (i = 0; i < first; i++) {
-		strbuf_addstr(&ref_list_contents, st->readers[i]->name);
-		strbuf_addstr(&ref_list_contents, "\n");
-	}
-	if (!is_empty_table) {
-		strbuf_addbuf(&ref_list_contents, &new_table_name);
-		strbuf_addstr(&ref_list_contents, "\n");
-	}
-	for (i = last + 1; i < st->merged->stack_len; i++) {
-		strbuf_addstr(&ref_list_contents, st->readers[i]->name);
-		strbuf_addstr(&ref_list_contents, "\n");
-	}
-
-	err = write(lock_file_fd, ref_list_contents.buf, ref_list_contents.len);
+	/*
+	 * Write the new "tables.list" contents with the compacted table we
+	 * have just written. In case the compacted table became empty we
+	 * simply skip writing it.
+	 */
+	for (i = 0; i < first; i++)
+		strbuf_addf(&tables_list_buf, "%s\n", st->readers[i]->name);
+	if (!is_empty_table)
+		strbuf_addf(&tables_list_buf, "%s\n", new_table_name.buf);
+	for (i = last + 1; i < st->merged->stack_len; i++)
+		strbuf_addf(&tables_list_buf, "%s\n", st->readers[i]->name);
+
+	err = write_in_full(get_lock_file_fd(&tables_list_lock),
+			    tables_list_buf.buf, tables_list_buf.len);
 	if (err < 0) {
 		err = REFTABLE_IO_ERROR;
 		unlink(new_table_path.buf);
 		goto done;
 	}
-	err = close(lock_file_fd);
-	lock_file_fd = -1;
+
+	err = fsync_component(FSYNC_COMPONENT_REFERENCE, get_lock_file_fd(&tables_list_lock));
 	if (err < 0) {
 		err = REFTABLE_IO_ERROR;
 		unlink(new_table_path.buf);
 		goto done;
 	}
 
-	err = rename(lock_file_name.buf, st->list_file);
+	err = commit_lock_file(&tables_list_lock);
 	if (err < 0) {
 		err = REFTABLE_IO_ERROR;
 		unlink(new_table_path.buf);
 		goto done;
 	}
-	have_lock = 0;
 
-	/* Reload the stack before deleting. On windows, we can only delete the
-	   files after we closed them.
-	*/
+	/*
+	 * Reload the stack before deleting the compacted tables. We can only
+	 * delete the files after we closed them on Windows, so this needs to
+	 * happen first.
+	 */
 	err = reftable_stack_reload_maybe_reuse(st, first < last);
+	if (err < 0)
+		goto done;
 
-	listp = delete_on_success;
-	while (*listp) {
-		if (strcmp(*listp, new_table_path.buf)) {
-			unlink(*listp);
-		}
-		listp++;
+	/*
+	 * Delete the old tables. They may still be in use by concurrent
+	 * readers, so it is expected that unlinking tables may fail.
+	 */
+	for (i = first; i <= last; i++) {
+		struct lock_file *table_lock = &table_locks[i - first];
+		char *table_path = get_locked_file_path(table_lock);
+		unlink(table_path);
+		free(table_path);
 	}
 
 done:
-	free_names(delete_on_success);
+	rollback_lock_file(&tables_list_lock);
+	for (i = first; table_locks && i <= last; i++)
+		rollback_lock_file(&table_locks[i - first]);
+	reftable_free(table_locks);
 
-	listp = subtable_locks;
-	while (*listp) {
-		unlink(*listp);
-		listp++;
-	}
-	free_names(subtable_locks);
-	if (lock_file_fd >= 0) {
-		close(lock_file_fd);
-		lock_file_fd = -1;
-	}
-	if (have_lock) {
-		unlink(lock_file_name.buf);
-	}
+	delete_tempfile(&new_table);
 	strbuf_release(&new_table_name);
 	strbuf_release(&new_table_path);
-	strbuf_release(&ref_list_contents);
-	strbuf_release(&temp_tab_file_name);
-	strbuf_release(&lock_file_name);
+
+	strbuf_release(&tables_list_buf);
+	strbuf_release(&table_name);
 	return err;
 }
 
 int reftable_stack_compact_all(struct reftable_stack *st,
 			       struct reftable_log_expiry_config *config)
 {
-	return stack_compact_range(st, 0, st->merged->stack_len - 1, config);
+	return stack_compact_range(st, 0, st->merged->stack_len ?
+			st->merged->stack_len - 1 : 0, config);
 }
 
-static int stack_compact_range_stats(struct reftable_stack *st, int first,
-				     int last,
+static int stack_compact_range_stats(struct reftable_stack *st,
+				     size_t first, size_t last,
 				     struct reftable_log_expiry_config *config)
 {
 	int err = stack_compact_range(st, first, last, config);
-	if (err > 0) {
+	if (err == REFTABLE_LOCK_ERROR)
 		st->stats.failures++;
-	}
 	return err;
 }
 
@@ -1105,84 +1216,82 @@ static int segment_size(struct segment *s)
 	return s->end - s->start;
 }
 
-int fastlog2(uint64_t sz)
+struct segment suggest_compaction_segment(uint64_t *sizes, size_t n)
 {
-	int l = 0;
-	if (sz == 0)
-		return 0;
-	for (; sz; sz /= 2) {
-		l++;
-	}
-	return l - 1;
-}
-
-struct segment *sizes_to_segments(int *seglen, uint64_t *sizes, int n)
-{
-	struct segment *segs = reftable_calloc(sizeof(struct segment) * n);
-	int next = 0;
-	struct segment cur = { 0 };
-	int i = 0;
+	struct segment seg = { 0 };
+	uint64_t bytes;
+	size_t i;
 
-	if (n == 0) {
-		*seglen = 0;
-		return segs;
-	}
-	for (i = 0; i < n; i++) {
-		int log = fastlog2(sizes[i]);
-		if (cur.log != log && cur.bytes > 0) {
-			struct segment fresh = {
-				.start = i,
-			};
+	/*
+	 * If there are no tables or only a single one then we don't have to
+	 * compact anything. The sequence is geometric by definition already.
+	 */
+	if (n <= 1)
+		return seg;
 
-			segs[next++] = cur;
-			cur = fresh;
+	/*
+	 * Find the ending table of the compaction segment needed to restore the
+	 * geometric sequence. Note that the segment end is exclusive.
+	 *
+	 * To do so, we iterate backwards starting from the most recent table
+	 * until a valid segment end is found. If the preceding table is smaller
+	 * than the current table multiplied by the geometric factor (2), the
+	 * compaction segment end has been identified.
+	 *
+	 * Tables after the ending point are not added to the byte count because
+	 * they are already valid members of the geometric sequence. Due to the
+	 * properties of a geometric sequence, it is not possible for the sum of
+	 * these tables to exceed the value of the ending point table.
+	 *
+	 * Example table size sequence requiring no compaction:
+	 * 	64, 32, 16, 8, 4, 2, 1
+	 *
+	 * Example table size sequence where compaction segment end is set to
+	 * the last table. Since the segment end is exclusive, the last table is
+	 * excluded during subsequent compaction and the table with size 3 is
+	 * the final table included:
+	 * 	64, 32, 16, 8, 4, 3, 1
+	 */
+	for (i = n - 1; i > 0; i--) {
+		if (sizes[i - 1] < sizes[i] * 2) {
+			seg.end = i + 1;
+			bytes = sizes[i];
+			break;
 		}
-
-		cur.log = log;
-		cur.end = i + 1;
-		cur.bytes += sizes[i];
 	}
-	segs[next++] = cur;
-	*seglen = next;
-	return segs;
-}
 
-struct segment suggest_compaction_segment(uint64_t *sizes, int n)
-{
-	int seglen = 0;
-	struct segment *segs = sizes_to_segments(&seglen, sizes, n);
-	struct segment min_seg = {
-		.log = 64,
-	};
-	int i = 0;
-	for (i = 0; i < seglen; i++) {
-		if (segment_size(&segs[i]) == 1) {
-			continue;
-		}
-
-		if (segs[i].log < min_seg.log) {
-			min_seg = segs[i];
-		}
-	}
+	/*
+	 * Find the starting table of the compaction segment by iterating
+	 * through the remaining tables and keeping track of the accumulated
+	 * size of all tables seen from the segment end table. The previous
+	 * table is compared to the accumulated size because the tables from the
+	 * segment end are merged backwards recursively.
+	 *
+	 * Note that we keep iterating even after we have found the first
+	 * starting point. This is because there may be tables in the stack
+	 * preceding that first starting point which violate the geometric
+	 * sequence.
+	 *
+	 * Example compaction segment start set to table with size 32:
+	 * 	128, 32, 16, 8, 4, 3, 1
+	 */
+	for (; i > 0; i--) {
+		uint64_t curr = bytes;
+		bytes += sizes[i - 1];
 
-	while (min_seg.start > 0) {
-		int prev = min_seg.start - 1;
-		if (fastlog2(min_seg.bytes) < fastlog2(sizes[prev])) {
-			break;
+		if (sizes[i - 1] < curr * 2) {
+			seg.start = i - 1;
+			seg.bytes = bytes;
 		}
-
-		min_seg.start = prev;
-		min_seg.bytes += sizes[prev];
 	}
 
-	reftable_free(segs);
-	return min_seg;
+	return seg;
 }
 
 static uint64_t *stack_table_sizes_for_compaction(struct reftable_stack *st)
 {
 	uint64_t *sizes =
-		reftable_calloc(sizeof(uint64_t) * st->merged->stack_len);
+		reftable_calloc(st->merged->stack_len, sizeof(*sizes));
 	int version = (st->config.hash_id == GIT_SHA1_FORMAT_ID) ? 1 : 2;
 	int overhead = header_size(version) - 1;
 	int i = 0;
@@ -1281,17 +1390,12 @@ static int stack_check_addition(struct reftable_stack *st,
 	while (1) {
 		struct reftable_ref_record ref = { NULL };
 		err = reftable_iterator_next_ref(&it, &ref);
-		if (err > 0) {
+		if (err > 0)
 			break;
-		}
 		if (err < 0)
 			goto done;
 
-		if (len >= cap) {
-			cap = 2 * cap + 1;
-			refs = reftable_realloc(refs, cap * sizeof(refs[0]));
-		}
-
+		REFTABLE_ALLOC_GROW(refs, len + 1, cap);
 		refs[len++] = ref;
 	}
 
diff --git a/reftable/stack.h b/reftable/stack.h
index f570058..d43efa4 100644
--- a/reftable/stack.h
+++ b/reftable/stack.h
@@ -14,9 +14,11 @@ https://developers.google.com/open-source/licenses/bsd
 #include "reftable-stack.h"
 
 struct reftable_stack {
+	struct stat list_st;
 	char *list_file;
+	int list_fd;
+
 	char *reftable_dir;
-	int disable_auto_compact;
 
 	struct reftable_write_options config;
 
@@ -29,13 +31,10 @@ struct reftable_stack {
 int read_lines(const char *filename, char ***lines);
 
 struct segment {
-	int start, end;
-	int log;
+	size_t start, end;
 	uint64_t bytes;
 };
 
-int fastlog2(uint64_t sz);
-struct segment *sizes_to_segments(int *seglen, uint64_t *sizes, int n);
-struct segment suggest_compaction_segment(uint64_t *sizes, int n);
+struct segment suggest_compaction_segment(uint64_t *sizes, size_t n);
 
 #endif
diff --git a/reftable/stack_test.c b/reftable/stack_test.c
index d0b7175..1df3ffc 100644
--- a/reftable/stack_test.c
+++ b/reftable/stack_test.c
@@ -13,7 +13,6 @@ https://developers.google.com/open-source/licenses/bsd
 #include "reftable-reader.h"
 #include "merged.h"
 #include "basics.h"
-#include "constants.h"
 #include "record.h"
 #include "test_framework.h"
 #include "reftable-tests.h"
@@ -39,7 +38,17 @@ static int count_dir_entries(const char *dirname)
 		return 0;
 
 	while ((d = readdir(dir))) {
-		if (!strcmp(d->d_name, "..") || !strcmp(d->d_name, "."))
+		/*
+		 * Besides skipping over "." and "..", we also need to
+		 * skip over other files that have a leading ".". This
+		 * is due to behaviour of NFS, which will rename files
+		 * to ".nfs*" to emulate delete-on-last-close.
+		 *
+		 * In any case this should be fine as the reftable
+		 * library will never write files with leading dots
+		 * anyway.
+		 */
+		if (starts_with(d->d_name, "."))
 			continue;
 		len++;
 	}
@@ -78,7 +87,7 @@ static void test_read_file(void)
 	int i = 0;
 
 	EXPECT(fd > 0);
-	n = write(fd, out, strlen(out));
+	n = write_in_full(fd, out, strlen(out));
 	EXPECT(n == strlen(out));
 	err = close(fd);
 	EXPECT(err >= 0);
@@ -233,7 +242,7 @@ static void test_reftable_stack_uptodate(void)
 	EXPECT_ERR(err);
 
 	err = reftable_stack_add(st2, &write_test_ref, &ref2);
-	EXPECT(err == REFTABLE_LOCK_ERROR);
+	EXPECT(err == REFTABLE_OUTDATED_ERROR);
 
 	err = reftable_stack_reload(st2);
 	EXPECT_ERR(err);
@@ -289,6 +298,104 @@ static void test_reftable_stack_transaction_api(void)
 	clear_dir(dir);
 }
 
+static void test_reftable_stack_transaction_api_performs_auto_compaction(void)
+{
+	char *dir = get_tmp_dir(__LINE__);
+	struct reftable_write_options cfg = {0};
+	struct reftable_addition *add = NULL;
+	struct reftable_stack *st = NULL;
+	int i, n = 20, err;
+
+	err = reftable_new_stack(&st, dir, cfg);
+	EXPECT_ERR(err);
+
+	for (i = 0; i <= n; i++) {
+		struct reftable_ref_record ref = {
+			.update_index = reftable_stack_next_update_index(st),
+			.value_type = REFTABLE_REF_SYMREF,
+			.value.symref = "master",
+		};
+		char name[100];
+
+		snprintf(name, sizeof(name), "branch%04d", i);
+		ref.refname = name;
+
+		/*
+		 * Disable auto-compaction for all but the last runs. Like this
+		 * we can ensure that we indeed honor this setting and have
+		 * better control over when exactly auto compaction runs.
+		 */
+		st->config.disable_auto_compact = i != n;
+
+		err = reftable_stack_new_addition(&add, st);
+		EXPECT_ERR(err);
+
+		err = reftable_addition_add(add, &write_test_ref, &ref);
+		EXPECT_ERR(err);
+
+		err = reftable_addition_commit(add);
+		EXPECT_ERR(err);
+
+		reftable_addition_destroy(add);
+
+		/*
+		 * The stack length should grow continuously for all runs where
+		 * auto compaction is disabled. When enabled, we should merge
+		 * all tables in the stack.
+		 */
+		if (i != n)
+			EXPECT(st->merged->stack_len == i + 1);
+		else
+			EXPECT(st->merged->stack_len == 1);
+	}
+
+	reftable_stack_destroy(st);
+	clear_dir(dir);
+}
+
+static void test_reftable_stack_auto_compaction_fails_gracefully(void)
+{
+	struct reftable_ref_record ref = {
+		.refname = "refs/heads/master",
+		.update_index = 1,
+		.value_type = REFTABLE_REF_VAL1,
+		.value.val1 = {0x01},
+	};
+	struct reftable_write_options cfg = {0};
+	struct reftable_stack *st;
+	struct strbuf table_path = STRBUF_INIT;
+	char *dir = get_tmp_dir(__LINE__);
+	int err;
+
+	err = reftable_new_stack(&st, dir, cfg);
+	EXPECT_ERR(err);
+
+	err = reftable_stack_add(st, write_test_ref, &ref);
+	EXPECT_ERR(err);
+	EXPECT(st->merged->stack_len == 1);
+	EXPECT(st->stats.attempts == 0);
+	EXPECT(st->stats.failures == 0);
+
+	/*
+	 * Lock the newly written table such that it cannot be compacted.
+	 * Adding a new table to the stack should not be impacted by this, even
+	 * though auto-compaction will now fail.
+	 */
+	strbuf_addf(&table_path, "%s/%s.lock", dir, st->readers[0]->name);
+	write_file_buf(table_path.buf, "", 0);
+
+	ref.update_index = 2;
+	err = reftable_stack_add(st, write_test_ref, &ref);
+	EXPECT_ERR(err);
+	EXPECT(st->merged->stack_len == 2);
+	EXPECT(st->stats.attempts == 1);
+	EXPECT(st->stats.failures == 1);
+
+	reftable_stack_destroy(st);
+	strbuf_release(&table_path);
+	clear_dir(dir);
+}
+
 static void test_reftable_stack_validate_refname(void)
 {
 	struct reftable_write_options cfg = { 0 };
@@ -389,18 +496,19 @@ static void test_reftable_stack_add(void)
 	int err = 0;
 	struct reftable_write_options cfg = {
 		.exact_log_message = 1,
+		.default_permissions = 0660,
+		.disable_auto_compact = 1,
 	};
 	struct reftable_stack *st = NULL;
 	char *dir = get_tmp_dir(__LINE__);
-
 	struct reftable_ref_record refs[2] = { { NULL } };
 	struct reftable_log_record logs[2] = { { NULL } };
+	struct strbuf path = STRBUF_INIT;
+	struct stat stat_result;
 	int N = ARRAY_SIZE(refs);
 
-
 	err = reftable_new_stack(&st, dir, cfg);
 	EXPECT_ERR(err);
-	st->disable_auto_compact = 1;
 
 	for (i = 0; i < N; i++) {
 		char buf[256];
@@ -408,14 +516,11 @@ static void test_reftable_stack_add(void)
 		refs[i].refname = xstrdup(buf);
 		refs[i].update_index = i + 1;
 		refs[i].value_type = REFTABLE_REF_VAL1;
-		refs[i].value.val1 = reftable_malloc(GIT_SHA1_RAWSZ);
 		set_test_hash(refs[i].value.val1, i);
 
 		logs[i].refname = xstrdup(buf);
 		logs[i].update_index = N + i + 1;
 		logs[i].value_type = REFTABLE_LOG_UPDATE;
-
-		logs[i].value.update.new_hash = reftable_malloc(GIT_SHA1_RAWSZ);
 		logs[i].value.update.email = xstrdup("identity@invalid");
 		set_test_hash(logs[i].value.update.new_hash, i);
 	}
@@ -456,12 +561,32 @@ static void test_reftable_stack_add(void)
 		reftable_log_record_release(&dest);
 	}
 
+#ifndef GIT_WINDOWS_NATIVE
+	strbuf_addstr(&path, dir);
+	strbuf_addstr(&path, "/tables.list");
+	err = stat(path.buf, &stat_result);
+	EXPECT(!err);
+	EXPECT((stat_result.st_mode & 0777) == cfg.default_permissions);
+
+	strbuf_reset(&path);
+	strbuf_addstr(&path, dir);
+	strbuf_addstr(&path, "/");
+	/* do not try at home; not an external API for reftable. */
+	strbuf_addstr(&path, st->readers[0]->name);
+	err = stat(path.buf, &stat_result);
+	EXPECT(!err);
+	EXPECT((stat_result.st_mode & 0777) == cfg.default_permissions);
+#else
+	(void) stat_result;
+#endif
+
 	/* cleanup */
 	reftable_stack_destroy(st);
 	for (i = 0; i < N; i++) {
 		reftable_ref_record_release(&refs[i]);
 		reftable_log_record_release(&logs[i]);
 	}
+	strbuf_release(&path);
 	clear_dir(dir);
 }
 
@@ -473,16 +598,17 @@ static void test_reftable_stack_log_normalize(void)
 	};
 	struct reftable_stack *st = NULL;
 	char *dir = get_tmp_dir(__LINE__);
-
-	uint8_t h1[GIT_SHA1_RAWSZ] = { 0x01 }, h2[GIT_SHA1_RAWSZ] = { 0x02 };
-
-	struct reftable_log_record input = { .refname = "branch",
-					     .update_index = 1,
-					     .value_type = REFTABLE_LOG_UPDATE,
-					     .value = { .update = {
-								.new_hash = h1,
-								.old_hash = h2,
-							} } };
+	struct reftable_log_record input = {
+		.refname = "branch",
+		.update_index = 1,
+		.value_type = REFTABLE_LOG_UPDATE,
+		.value = {
+			.update = {
+				.new_hash = { 1 },
+				.old_hash = { 2 },
+			},
+		},
+	};
 	struct reftable_log_record dest = {
 		.update_index = 0,
 	};
@@ -545,7 +671,6 @@ static void test_reftable_stack_tombstone(void)
 		refs[i].update_index = i + 1;
 		if (i % 2 == 0) {
 			refs[i].value_type = REFTABLE_REF_VAL1;
-			refs[i].value.val1 = reftable_malloc(GIT_SHA1_RAWSZ);
 			set_test_hash(refs[i].value.val1, i);
 		}
 
@@ -554,8 +679,6 @@ static void test_reftable_stack_tombstone(void)
 		logs[i].update_index = 42;
 		if (i % 2 == 0) {
 			logs[i].value_type = REFTABLE_LOG_UPDATE;
-			logs[i].value.update.new_hash =
-				reftable_malloc(GIT_SHA1_RAWSZ);
 			set_test_hash(logs[i].value.update.new_hash, i);
 			logs[i].value.update.email =
 				xstrdup("identity@invalid");
@@ -647,60 +770,13 @@ static void test_reftable_stack_hash_id(void)
 	clear_dir(dir);
 }
 
-static void test_log2(void)
-{
-	EXPECT(1 == fastlog2(3));
-	EXPECT(2 == fastlog2(4));
-	EXPECT(2 == fastlog2(5));
-}
-
-static void test_sizes_to_segments(void)
-{
-	uint64_t sizes[] = { 2, 3, 4, 5, 7, 9 };
-	/* .................0  1  2  3  4  5 */
-
-	int seglen = 0;
-	struct segment *segs =
-		sizes_to_segments(&seglen, sizes, ARRAY_SIZE(sizes));
-	EXPECT(segs[2].log == 3);
-	EXPECT(segs[2].start == 5);
-	EXPECT(segs[2].end == 6);
-
-	EXPECT(segs[1].log == 2);
-	EXPECT(segs[1].start == 2);
-	EXPECT(segs[1].end == 5);
-	reftable_free(segs);
-}
-
-static void test_sizes_to_segments_empty(void)
-{
-	int seglen = 0;
-	struct segment *segs = sizes_to_segments(&seglen, NULL, 0);
-	EXPECT(seglen == 0);
-	reftable_free(segs);
-}
-
-static void test_sizes_to_segments_all_equal(void)
-{
-	uint64_t sizes[] = { 5, 5 };
-
-	int seglen = 0;
-	struct segment *segs =
-		sizes_to_segments(&seglen, sizes, ARRAY_SIZE(sizes));
-	EXPECT(seglen == 1);
-	EXPECT(segs[0].start == 0);
-	EXPECT(segs[0].end == 2);
-	reftable_free(segs);
-}
-
 static void test_suggest_compaction_segment(void)
 {
-	uint64_t sizes[] = { 128, 64, 17, 16, 9, 9, 9, 16, 16 };
-	/* .................0    1    2  3   4  5  6 */
+	uint64_t sizes[] = { 512, 64, 17, 16, 9, 9, 9, 16, 2, 16 };
 	struct segment min =
 		suggest_compaction_segment(sizes, ARRAY_SIZE(sizes));
-	EXPECT(min.start == 2);
-	EXPECT(min.end == 7);
+	EXPECT(min.start == 1);
+	EXPECT(min.end == 10);
 }
 
 static void test_suggest_compaction_segment_nothing(void)
@@ -738,7 +814,6 @@ static void test_reflog_expire(void)
 		logs[i].update_index = i;
 		logs[i].value_type = REFTABLE_LOG_UPDATE;
 		logs[i].value.update.time = i;
-		logs[i].value.update.new_hash = reftable_malloc(GIT_SHA1_RAWSZ);
 		logs[i].value.update.email = xstrdup("identity@invalid");
 		set_test_hash(logs[i].value.update.new_hash, i);
 	}
@@ -812,9 +887,21 @@ static void test_empty_add(void)
 	reftable_stack_destroy(st2);
 }
 
+static int fastlog2(uint64_t sz)
+{
+	int l = 0;
+	if (sz == 0)
+		return 0;
+	for (; sz; sz /= 2)
+		l++;
+	return l - 1;
+}
+
 static void test_reftable_stack_auto_compaction(void)
 {
-	struct reftable_write_options cfg = { 0 };
+	struct reftable_write_options cfg = {
+		.disable_auto_compact = 1,
+	};
 	struct reftable_stack *st = NULL;
 	char *dir = get_tmp_dir(__LINE__);
 
@@ -824,7 +911,6 @@ static void test_reftable_stack_auto_compaction(void)
 	err = reftable_new_stack(&st, dir, cfg);
 	EXPECT_ERR(err);
 
-	st->disable_auto_compact = 1; /* call manually below for coverage. */
 	for (i = 0; i < N; i++) {
 		char name[100];
 		struct reftable_ref_record ref = {
@@ -850,6 +936,54 @@ static void test_reftable_stack_auto_compaction(void)
 	clear_dir(dir);
 }
 
+static void test_reftable_stack_add_performs_auto_compaction(void)
+{
+	struct reftable_write_options cfg = { 0 };
+	struct reftable_stack *st = NULL;
+	struct strbuf refname = STRBUF_INIT;
+	char *dir = get_tmp_dir(__LINE__);
+	int err, i, n = 20;
+
+	err = reftable_new_stack(&st, dir, cfg);
+	EXPECT_ERR(err);
+
+	for (i = 0; i <= n; i++) {
+		struct reftable_ref_record ref = {
+			.update_index = reftable_stack_next_update_index(st),
+			.value_type = REFTABLE_REF_SYMREF,
+			.value.symref = "master",
+		};
+
+		/*
+		 * Disable auto-compaction for all but the last runs. Like this
+		 * we can ensure that we indeed honor this setting and have
+		 * better control over when exactly auto compaction runs.
+		 */
+		st->config.disable_auto_compact = i != n;
+
+		strbuf_reset(&refname);
+		strbuf_addf(&refname, "branch-%04d", i);
+		ref.refname = refname.buf;
+
+		err = reftable_stack_add(st, &write_test_ref, &ref);
+		EXPECT_ERR(err);
+
+		/*
+		 * The stack length should grow continuously for all runs where
+		 * auto compaction is disabled. When enabled, we should merge
+		 * all tables in the stack.
+		 */
+		if (i != n)
+			EXPECT(st->merged->stack_len == i + 1);
+		else
+			EXPECT(st->merged->stack_len == 1);
+	}
+
+	reftable_stack_destroy(st);
+	strbuf_release(&refname);
+	clear_dir(dir);
+}
+
 static void test_reftable_stack_compaction_concurrent(void)
 {
 	struct reftable_write_options cfg = { 0 };
@@ -952,7 +1086,6 @@ static void test_reftable_stack_compaction_concurrent_clean(void)
 int stack_test_main(int argc, const char *argv[])
 {
 	RUN_TEST(test_empty_add);
-	RUN_TEST(test_log2);
 	RUN_TEST(test_names_equal);
 	RUN_TEST(test_parse_names);
 	RUN_TEST(test_read_file);
@@ -960,6 +1093,7 @@ int stack_test_main(int argc, const char *argv[])
 	RUN_TEST(test_reftable_stack_add);
 	RUN_TEST(test_reftable_stack_add_one);
 	RUN_TEST(test_reftable_stack_auto_compaction);
+	RUN_TEST(test_reftable_stack_add_performs_auto_compaction);
 	RUN_TEST(test_reftable_stack_compaction_concurrent);
 	RUN_TEST(test_reftable_stack_compaction_concurrent_clean);
 	RUN_TEST(test_reftable_stack_hash_id);
@@ -967,12 +1101,11 @@ int stack_test_main(int argc, const char *argv[])
 	RUN_TEST(test_reftable_stack_log_normalize);
 	RUN_TEST(test_reftable_stack_tombstone);
 	RUN_TEST(test_reftable_stack_transaction_api);
+	RUN_TEST(test_reftable_stack_transaction_api_performs_auto_compaction);
+	RUN_TEST(test_reftable_stack_auto_compaction_fails_gracefully);
 	RUN_TEST(test_reftable_stack_update_index_check);
 	RUN_TEST(test_reftable_stack_uptodate);
 	RUN_TEST(test_reftable_stack_validate_refname);
-	RUN_TEST(test_sizes_to_segments);
-	RUN_TEST(test_sizes_to_segments_all_equal);
-	RUN_TEST(test_sizes_to_segments_empty);
 	RUN_TEST(test_suggest_compaction_segment);
 	RUN_TEST(test_suggest_compaction_segment_nothing);
 	return 0;
diff --git a/reftable/system.h b/reftable/system.h
index 18f9207..5d8b6de 100644
--- a/reftable/system.h
+++ b/reftable/system.h
@@ -12,8 +12,10 @@ https://developers.google.com/open-source/licenses/bsd
 /* This header glues the reftable library to the rest of Git */
 
 #include "git-compat-util.h"
+#include "lockfile.h"
 #include "strbuf.h"
-#include "hash.h" /* hash ID, sizes.*/
+#include "tempfile.h"
+#include "hash-ll.h" /* hash ID, sizes.*/
 #include "dir.h" /* remove_dir_recursively, for tests.*/
 
 int hash_size(uint32_t id);
diff --git a/reftable/test_framework.c b/reftable/test_framework.c
index 84ac972..4066924 100644
--- a/reftable/test_framework.c
+++ b/reftable/test_framework.c
@@ -9,7 +9,6 @@ https://developers.google.com/open-source/licenses/bsd
 #include "system.h"
 #include "test_framework.h"
 
-#include "basics.h"
 
 void set_test_hash(uint8_t *p, int i)
 {
@@ -21,3 +20,8 @@ ssize_t strbuf_add_void(void *b, const void *data, size_t sz)
 	strbuf_add(b, data, sz);
 	return sz;
 }
+
+int noop_flush(void *arg)
+{
+	return 0;
+}
diff --git a/reftable/test_framework.h b/reftable/test_framework.h
index 774cb27..687390f 100644
--- a/reftable/test_framework.h
+++ b/reftable/test_framework.h
@@ -12,32 +12,38 @@ https://developers.google.com/open-source/licenses/bsd
 #include "system.h"
 #include "reftable-error.h"
 
-#define EXPECT_ERR(c)                                                  \
-	if (c != 0) {                                                  \
-		fflush(stderr);                                        \
-		fflush(stdout);                                        \
-		fprintf(stderr, "%s: %d: error == %d (%s), want 0\n",  \
-			__FILE__, __LINE__, c, reftable_error_str(c)); \
-		abort();                                               \
-	}
-
-#define EXPECT_STREQ(a, b)                                               \
-	if (strcmp(a, b)) {                                              \
-		fflush(stderr);                                          \
-		fflush(stdout);                                          \
-		fprintf(stderr, "%s:%d: %s (%s) != %s (%s)\n", __FILE__, \
-			__LINE__, #a, a, #b, b);                         \
-		abort();                                                 \
-	}
-
-#define EXPECT(c)                                                          \
-	if (!(c)) {                                                        \
-		fflush(stderr);                                            \
-		fflush(stdout);                                            \
-		fprintf(stderr, "%s: %d: failed assertion %s\n", __FILE__, \
-			__LINE__, #c);                                     \
-		abort();                                                   \
-	}
+#define EXPECT_ERR(c)                                                          \
+	do {                                                                   \
+		if (c != 0) {                                                  \
+			fflush(stderr);                                        \
+			fflush(stdout);                                        \
+			fprintf(stderr, "%s: %d: error == %d (%s), want 0\n",  \
+				__FILE__, __LINE__, c, reftable_error_str(c)); \
+			abort();                                               \
+		}                                                              \
+	} while (0)
+
+#define EXPECT_STREQ(a, b)                                                       \
+	do {                                                                     \
+		if (strcmp(a, b)) {                                              \
+			fflush(stderr);                                          \
+			fflush(stdout);                                          \
+			fprintf(stderr, "%s:%d: %s (%s) != %s (%s)\n", __FILE__, \
+				__LINE__, #a, a, #b, b);                         \
+			abort();                                                 \
+		}                                                                \
+	} while (0)
+
+#define EXPECT(c)                                                                  \
+	do {                                                                       \
+		if (!(c)) {                                                        \
+			fflush(stderr);                                            \
+			fflush(stdout);                                            \
+			fprintf(stderr, "%s: %d: failed assertion %s\n", __FILE__, \
+				__LINE__, #c);                                     \
+			abort();                                                   \
+		}                                                                  \
+	} while (0)
 
 #define RUN_TEST(f)                          \
 	fprintf(stderr, "running %s\n", #f); \
@@ -50,4 +56,6 @@ void set_test_hash(uint8_t *p, int i);
  */
 ssize_t strbuf_add_void(void *b, const void *data, size_t sz);
 
+int noop_flush(void *);
+
 #endif
diff --git a/reftable/tree.c b/reftable/tree.c
index b8899e0..528f33a 100644
--- a/reftable/tree.c
+++ b/reftable/tree.c
@@ -6,10 +6,10 @@ license that can be found in the LICENSE file or at
 https://developers.google.com/open-source/licenses/bsd
 */
 
+#include "system.h"
 #include "tree.h"
 
 #include "basics.h"
-#include "system.h"
 
 struct tree_node *tree_search(void *key, struct tree_node **rootp,
 			      int (*compare)(const void *, const void *),
@@ -20,8 +20,8 @@ struct tree_node *tree_search(void *key, struct tree_node **rootp,
 		if (!insert) {
 			return NULL;
 		} else {
-			struct tree_node *n =
-				reftable_calloc(sizeof(struct tree_node));
+			struct tree_node *n;
+			REFTABLE_CALLOC_ARRAY(n, 1);
 			n->key = key;
 			*rootp = n;
 			return *rootp;
diff --git a/reftable/tree_test.c b/reftable/tree_test.c
index cbff125..6961a65 100644
--- a/reftable/tree_test.c
+++ b/reftable/tree_test.c
@@ -6,10 +6,9 @@ license that can be found in the LICENSE file or at
 https://developers.google.com/open-source/licenses/bsd
 */
 
+#include "system.h"
 #include "tree.h"
 
-#include "basics.h"
-#include "record.h"
 #include "test_framework.h"
 #include "reftable-tests.h"
 
diff --git a/reftable/writer.c b/reftable/writer.c
index 2e322a5..1d9ff0f 100644
--- a/reftable/writer.c
+++ b/reftable/writer.c
@@ -49,7 +49,7 @@ static int padded_write(struct reftable_writer *w, uint8_t *data, size_t len,
 {
 	int n = 0;
 	if (w->pending_padding > 0) {
-		uint8_t *zeroed = reftable_calloc(w->pending_padding);
+		uint8_t *zeroed = reftable_calloc(w->pending_padding, sizeof(*zeroed));
 		int n = w->write(w->write_arg, zeroed, w->pending_padding);
 		if (n < 0)
 			return n;
@@ -121,10 +121,10 @@ static struct strbuf reftable_empty_strbuf = STRBUF_INIT;
 
 struct reftable_writer *
 reftable_new_writer(ssize_t (*writer_func)(void *, const void *, size_t),
+		    int (*flush_func)(void *),
 		    void *writer_arg, struct reftable_write_options *opts)
 {
-	struct reftable_writer *wp =
-		reftable_calloc(sizeof(struct reftable_writer));
+	struct reftable_writer *wp = reftable_calloc(1, sizeof(*wp));
 	strbuf_init(&wp->block_writer_data.last_key, 0);
 	options_set_defaults(opts);
 	if (opts->block_size >= (1 << 24)) {
@@ -132,10 +132,11 @@ reftable_new_writer(ssize_t (*writer_func)(void *, const void *, size_t),
 		abort();
 	}
 	wp->last_key = reftable_empty_strbuf;
-	wp->block = reftable_calloc(opts->block_size);
+	REFTABLE_CALLOC_ARRAY(wp->block, opts->block_size);
 	wp->write = writer_func;
 	wp->write_arg = writer_arg;
 	wp->opts = *opts;
+	wp->flush = flush_func;
 	writer_reinit_block_writer(wp, BLOCK_TYPE_REF);
 
 	return wp;
@@ -200,12 +201,7 @@ static void writer_index_hash(struct reftable_writer *w, struct strbuf *hash)
 		return;
 	}
 
-	if (key->offset_len == key->offset_cap) {
-		key->offset_cap = 2 * key->offset_cap + 1;
-		key->offsets = reftable_realloc(
-			key->offsets, sizeof(uint64_t) * key->offset_cap);
-	}
-
+	REFTABLE_ALLOC_GROW(key->offsets, key->offset_len + 1, key->offset_cap);
 	key->offsets[key->offset_len++] = off;
 }
 
@@ -377,20 +373,39 @@ int reftable_writer_add_logs(struct reftable_writer *w,
 
 static int writer_finish_section(struct reftable_writer *w)
 {
+	struct reftable_block_stats *bstats = NULL;
 	uint8_t typ = block_writer_type(w->block_writer);
 	uint64_t index_start = 0;
 	int max_level = 0;
-	int threshold = w->opts.unpadded ? 1 : 3;
+	size_t threshold = w->opts.unpadded ? 1 : 3;
 	int before_blocks = w->stats.idx_stats.blocks;
-	int err = writer_flush_block(w);
-	int i = 0;
-	struct reftable_block_stats *bstats = NULL;
+	int err;
+
+	err = writer_flush_block(w);
 	if (err < 0)
 		return err;
 
+	/*
+	 * When the section we are about to index has a lot of blocks then the
+	 * index itself may span across multiple blocks, as well. This would
+	 * require a linear scan over index blocks only to find the desired
+	 * indexed block, which is inefficient. Instead, we write a multi-level
+	 * index where index records of level N+1 will refer to index blocks of
+	 * level N. This isn't constant time, either, but at least logarithmic.
+	 *
+	 * This loop handles writing this multi-level index. Note that we write
+	 * the lowest-level index pointing to the indexed blocks first. We then
+	 * continue writing additional index levels until the current level has
+	 * less blocks than the threshold so that the highest level will be at
+	 * the end of the index section.
+	 *
+	 * Readers are thus required to start reading the index section from
+	 * its end, which is why we set `index_start` to the beginning of the
+	 * last index section.
+	 */
 	while (w->index_len > threshold) {
 		struct reftable_index_record *idx = NULL;
-		int idx_len = 0;
+		size_t i, idx_len;
 
 		max_level++;
 		index_start = w->next;
@@ -409,35 +424,28 @@ static int writer_finish_section(struct reftable_writer *w)
 					.idx = idx[i],
 				},
 			};
-			if (block_writer_add(w->block_writer, &rec) == 0) {
-				continue;
-			}
 
-			err = writer_flush_block(w);
+			err = writer_add_record(w, &rec);
 			if (err < 0)
 				return err;
+		}
 
-			writer_reinit_block_writer(w, BLOCK_TYPE_INDEX);
+		err = writer_flush_block(w);
+		if (err < 0)
+			return err;
 
-			err = block_writer_add(w->block_writer, &rec);
-			if (err != 0) {
-				/* write into fresh block should always succeed
-				 */
-				abort();
-			}
-		}
-		for (i = 0; i < idx_len; i++) {
+		for (i = 0; i < idx_len; i++)
 			strbuf_release(&idx[i].last_key);
-		}
 		reftable_free(idx);
 	}
 
+	/*
+	 * The index may still contain a number of index blocks lower than the
+	 * threshold. Clear it so that these entries don't leak into the next
+	 * index section.
+	 */
 	writer_clear_index(w);
 
-	err = writer_flush_block(w);
-	if (err < 0)
-		return err;
-
 	bstats = writer_reftable_block_stats(w, typ);
 	bstats->index_blocks = w->stats.idx_stats.blocks - before_blocks;
 	bstats->index_offset = index_start;
@@ -603,6 +611,12 @@ int reftable_writer_close(struct reftable_writer *w)
 	put_be32(p, crc32(0, footer, p - footer));
 	p += 4;
 
+	err = w->flush(w->write_arg);
+	if (err < 0) {
+		err = REFTABLE_IO_ERROR;
+		goto done;
+	}
+
 	err = padded_write(w, footer, footer_size(writer_version(w)), 0);
 	if (err < 0)
 		goto done;
@@ -622,11 +636,8 @@ done:
 
 static void writer_clear_index(struct reftable_writer *w)
 {
-	int i = 0;
-	for (i = 0; i < w->index_len; i++) {
+	for (size_t i = 0; i < w->index_len; i++)
 		strbuf_release(&w->index[i].last_key);
-	}
-
 	FREE_AND_NULL(w->index);
 	w->index_len = 0;
 	w->index_cap = 0;
@@ -674,12 +685,7 @@ static int writer_flush_nonempty_block(struct reftable_writer *w)
 	if (err < 0)
 		return err;
 
-	if (w->index_cap == w->index_len) {
-		w->index_cap = 2 * w->index_cap + 1;
-		w->index = reftable_realloc(
-			w->index,
-			sizeof(struct reftable_index_record) * w->index_cap);
-	}
+	REFTABLE_ALLOC_GROW(w->index, w->index_len + 1, w->index_cap);
 
 	ir.offset = w->next;
 	strbuf_reset(&ir.last_key);
diff --git a/reftable/writer.h b/reftable/writer.h
index 09b8867..8d0df9c 100644
--- a/reftable/writer.h
+++ b/reftable/writer.h
@@ -16,6 +16,7 @@ https://developers.google.com/open-source/licenses/bsd
 
 struct reftable_writer {
 	ssize_t (*write)(void *, const void *, size_t);
+	int (*flush)(void *);
 	void *write_arg;
 	int pending_padding;
 	struct strbuf last_key;