From 77aa03d6c7f07db4a5d34afe8f5b3a55e801057c Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Tue, 24 Jul 2018 06:50:10 -0400
Subject: reencode_string: use st_add/st_mult helpers

When converting a string with iconv, if the output buffer
isn't big enough, we grow it. But our growth is done without
any concern for integer overflow. So when we add:

  outalloc = sofar + insz * 2 + 32;

we may end up wrapping outalloc (which is a size_t), and
allocating a too-small buffer. We then manipulate it
further:

  outsz = outalloc - sofar - 1;

and feed outsz back to iconv. If outalloc is wrapped and
smaller than sofar, we'll end up with a small allocation but
feed a very large outsz to iconv, which could result in it
overflowing the buffer.

Can we use this to construct an attack wherein the victim
clones a repository with a very large commit object with an
encoding header, and running "git log" reencodes it into
utf8, causing an overflow?

An attack of this sort is likely impossible in practice.
"sofar" is how many output bytes we've written total, and
"insz" is the number of input bytes remaining. Imagine our
input doubles in size as we output it (which is easy to do
by converting latin1 to utf8, for example), and that we
start with N input bytes. Our initial output buffer also
starts at N bytes, so after the first call we'd have N/2
input bytes remaining (insz), and have written N bytes
(sofar). That means our next allocation will be
(N + N/2 * 2 + 32) bytes, or (2N + 32).

We can therefore overflow a 32-bit size_t with a commit
message that's just under 2^31 bytes, assuming it consists
mostly of "doubling" sequences (e.g., latin1 0xe1 which
becomes utf8 0xc3 0xa1).

But we'll never make it that far with such a message. We'll
be spending 2^31 bytes on the original string. And our
initial output buffer will also be 2^31 bytes. Which is not
going to succeed on a system with a 32-bit size_t, since
there will be other things using the address space, too. The
initial malloc will fail.

If we imagine instead that we can triple the size when
converting, then our second allocation becomes
(N + 2/3N * 2 + 32), or (7/3N + 32). That still requires two
allocations of 3/7 of our address space (6/7 of the total)
to succeed.

If we imagine we can quadruple, it becomes (5/2N + 32); we
need to be able to allocate 4/5 of the address space to
succeed.

This might start to get plausible. But is it possible to get
a 4-to-1 increase in size? Probably if you're converting to
some obscure encoding. But since git defaults to utf8 for
its output, that's the likely destination encoding for an
attack. And while there are 4-character utf8 sequences, it's
unlikely that you'd be able find a single-byte source
sequence in any encoding.

So this is certainly buggy code which should be fixed, but
it is probably not a useful attack vector.

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>

diff --git a/utf8.c b/utf8.c
index d55e20c..a2fd24c 100644
--- a/utf8.c
+++ b/utf8.c
@@ -477,7 +477,7 @@ char *reencode_string_iconv(const char *in, size_t insz, iconv_t conv, int *outs
 	iconv_ibp cp;
 
 	outsz = insz;
-	outalloc = outsz + 1; /* for terminating NUL */
+	outalloc = st_add(outsz, 1); /* for terminating NUL */
 	out = xmalloc(outalloc);
 	outpos = out;
 	cp = (iconv_ibp)in;
@@ -497,7 +497,7 @@ char *reencode_string_iconv(const char *in, size_t insz, iconv_t conv, int *outs
 			 * converting the rest.
 			 */
 			sofar = outpos - out;
-			outalloc = sofar + insz * 2 + 32;
+			outalloc = st_add3(sofar, st_mult(insz, 2), 32);
 			out = xrealloc(out, outalloc);
 			outpos = out + sofar;
 			outsz = outalloc - sofar - 1;
-- 
cgit v0.10.2-6-g49f6


From c7d017d7e1cca37ca20f73c11fa9f1b319a2c3a5 Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Tue, 24 Jul 2018 06:50:33 -0400
Subject: reencode_string: use size_t for string lengths

The iconv interface takes a size_t, which is the appropriate
type for an in-memory buffer. But our reencode_string_*
functions use integers, meaning we may get confusing results
when the sizes exceed INT_MAX. Let's use size_t
consistently.

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>

diff --git a/convert.c b/convert.c
index 64d0d30..e012959 100644
--- a/convert.c
+++ b/convert.c
@@ -389,7 +389,7 @@ static int encode_to_git(const char *path, const char *src, size_t src_len,
 			 struct strbuf *buf, const char *enc, int conv_flags)
 {
 	char *dst;
-	int dst_len;
+	size_t dst_len;
 	int die_on_error = conv_flags & CONV_WRITE_OBJECT;
 
 	/*
@@ -452,7 +452,7 @@ static int encode_to_git(const char *path, const char *src, size_t src_len,
 	 */
 	if (die_on_error && check_roundtrip(enc)) {
 		char *re_src;
-		int re_src_len;
+		size_t re_src_len;
 
 		re_src = reencode_string_len(dst, dst_len,
 					     enc, default_encoding,
@@ -480,7 +480,7 @@ static int encode_to_worktree(const char *path, const char *src, size_t src_len,
 			      struct strbuf *buf, const char *enc)
 {
 	char *dst;
-	int dst_len;
+	size_t dst_len;
 
 	/*
 	 * No encoding is specified or there is nothing to encode.
diff --git a/pretty.c b/pretty.c
index 703fa6f..e1e4060 100644
--- a/pretty.c
+++ b/pretty.c
@@ -1538,7 +1538,7 @@ void format_commit_message(const struct commit *commit,
 	}
 
 	if (output_enc) {
-		int outsz;
+		size_t outsz;
 		char *out = reencode_string_len(sb->buf, sb->len,
 						output_enc, utf8, &outsz);
 		if (out)
diff --git a/strbuf.c b/strbuf.c
index b0716ac..e79758b 100644
--- a/strbuf.c
+++ b/strbuf.c
@@ -134,7 +134,7 @@ void strbuf_ltrim(struct strbuf *sb)
 int strbuf_reencode(struct strbuf *sb, const char *from, const char *to)
 {
 	char *out;
-	int len;
+	size_t len;
 
 	if (same_encoding(from, to))
 		return 0;
diff --git a/utf8.c b/utf8.c
index a2fd24c..edcd1e8 100644
--- a/utf8.c
+++ b/utf8.c
@@ -470,7 +470,7 @@ int utf8_fprintf(FILE *stream, const char *format, ...)
 #else
 	typedef char * iconv_ibp;
 #endif
-char *reencode_string_iconv(const char *in, size_t insz, iconv_t conv, int *outsz_p)
+char *reencode_string_iconv(const char *in, size_t insz, iconv_t conv, size_t *outsz_p)
 {
 	size_t outsz, outalloc;
 	char *out, *outpos;
@@ -534,9 +534,9 @@ static const char *fallback_encoding(const char *name)
 	return name;
 }
 
-char *reencode_string_len(const char *in, int insz,
+char *reencode_string_len(const char *in, size_t insz,
 			  const char *out_encoding, const char *in_encoding,
-			  int *outsz)
+			  size_t *outsz)
 {
 	iconv_t conv;
 	char *out;
diff --git a/utf8.h b/utf8.h
index db73a2d..ce1c269 100644
--- a/utf8.h
+++ b/utf8.h
@@ -25,14 +25,14 @@ void strbuf_utf8_replace(struct strbuf *sb, int pos, int width,
 
 #ifndef NO_ICONV
 char *reencode_string_iconv(const char *in, size_t insz,
-			    iconv_t conv, int *outsz);
-char *reencode_string_len(const char *in, int insz,
+			    iconv_t conv, size_t *outsz);
+char *reencode_string_len(const char *in, size_t insz,
 			  const char *out_encoding,
 			  const char *in_encoding,
-			  int *outsz);
+			  size_t *outsz);
 #else
-static inline char *reencode_string_len(const char *a, int b,
-					const char *c, const char *d, int *e)
+static inline char *reencode_string_len(const char *a, size_t b,
+					const char *c, const char *d, size_t *e)
 { if (e) *e = 0; return NULL; }
 #endif
 
-- 
cgit v0.10.2-6-g49f6


From 26114c00be2cd49b97b18df69a909d3330886e9d Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Tue, 24 Jul 2018 06:51:08 -0400
Subject: strbuf: use size_t for length in intermediate variables

A few strbuf functions store the length of a strbuf in a
temporary variable. We should always use size_t for this, as
it's possible for a strbuf to exceed an "int" (e.g., a 2GB
string on a 64-bit system). This is unlikely in practice,
but we should try to behave sensibly on silly or malicious
input.

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>

diff --git a/strbuf.c b/strbuf.c
index e79758b..6ff1f80 100644
--- a/strbuf.c
+++ b/strbuf.c
@@ -209,7 +209,7 @@ void strbuf_list_free(struct strbuf **sbs)
 
 int strbuf_cmp(const struct strbuf *a, const struct strbuf *b)
 {
-	int len = a->len < b->len ? a->len: b->len;
+	size_t len = a->len < b->len ? a->len: b->len;
 	int cmp = memcmp(a->buf, b->buf, len);
 	if (cmp)
 		return cmp;
@@ -389,7 +389,7 @@ size_t strbuf_expand_dict_cb(struct strbuf *sb, const char *placeholder,
 
 void strbuf_addbuf_percentquote(struct strbuf *dst, const struct strbuf *src)
 {
-	int i, len = src->len;
+	size_t i, len = src->len;
 
 	for (i = 0; i < len; i++) {
 		if (src->buf[i] == '%')
@@ -960,7 +960,7 @@ static size_t cleanup(char *line, size_t len)
  */
 void strbuf_stripspace(struct strbuf *sb, int skip_comments)
 {
-	int empties = 0;
+	size_t empties = 0;
 	size_t i, j, len, newlen;
 	char *eol;
 
-- 
cgit v0.10.2-6-g49f6


From f3e76ed228d60688b49dbc2735e4633e55969e30 Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Tue, 24 Jul 2018 06:51:25 -0400
Subject: strbuf_readlink: use ssize_t

The return type of readlink() is ssize_t, not int. This
probably doesn't matter in practice, as it would require a
2GB symlink destination, but it doesn't hurt to be careful.

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>

diff --git a/strbuf.c b/strbuf.c
index 6ff1f80..db9069c 100644
--- a/strbuf.c
+++ b/strbuf.c
@@ -469,7 +469,7 @@ int strbuf_readlink(struct strbuf *sb, const char *path, size_t hint)
 		hint = 32;
 
 	while (hint < STRBUF_MAXLINK) {
-		int len;
+		ssize_t len;
 
 		strbuf_grow(sb, hint);
 		len = readlink(path, sb->buf, hint);
-- 
cgit v0.10.2-6-g49f6


From 765b496dc6963ad8aaf40e9ac5dee358aa7fea47 Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Tue, 24 Jul 2018 06:51:39 -0400
Subject: pass st.st_size as hint for strbuf_readlink()

When we initially added the strbuf_readlink() function in
b11b7e13f4 (Add generic 'strbuf_readlink()' helper function,
2008-12-17), the point was that we generally have a _guess_
as to the correct size based on the stat information, but we
can't necessarily trust it.

Over the years, a few callers have grown up that simply pass
in 0, even though they have the stat information. Let's have
them pass in their hint for consistency (and in theory
efficiency, since it may avoid an extra resize/syscall loop,
but neither location is probably performance critical).

Note that st.st_size is actually an off_t, so in theory we
need xsize_t() here. But none of the other callsites use it,
and since this is just a hint, it doesn't matter either way
(if we wrap we'll simply start with a too-small hint and
then eventually complain when we cannot allocate the
memory).

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>

diff --git a/builtin/init-db.c b/builtin/init-db.c
index 4ecf909..12ddda7 100644
--- a/builtin/init-db.c
+++ b/builtin/init-db.c
@@ -73,7 +73,8 @@ static void copy_templates_1(struct strbuf *path, struct strbuf *template_path,
 			continue;
 		else if (S_ISLNK(st_template.st_mode)) {
 			struct strbuf lnk = STRBUF_INIT;
-			if (strbuf_readlink(&lnk, template_path->buf, 0) < 0)
+			if (strbuf_readlink(&lnk, template_path->buf,
+					    st_template.st_size) < 0)
 				die_errno(_("cannot readlink '%s'"), template_path->buf);
 			if (symlink(lnk.buf, path->buf))
 				die_errno(_("cannot symlink '%s' '%s'"),
diff --git a/refs/files-backend.c b/refs/files-backend.c
index a9a066d..c110c25 100644
--- a/refs/files-backend.c
+++ b/refs/files-backend.c
@@ -363,7 +363,7 @@ stat_ref:
 	/* Follow "normalized" - ie "refs/.." symlinks by hand */
 	if (S_ISLNK(st.st_mode)) {
 		strbuf_reset(&sb_contents);
-		if (strbuf_readlink(&sb_contents, path, 0) < 0) {
+		if (strbuf_readlink(&sb_contents, path, st.st_size) < 0) {
 			if (errno == ENOENT || errno == EINVAL)
 				/* inconsistent with lstat; retry */
 				goto stat_ref;
-- 
cgit v0.10.2-6-g49f6


From 7726d360b5ba859ae2b6ceefc5d88cc518c78063 Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Tue, 24 Jul 2018 06:52:29 -0400
Subject: strbuf_humanise: use unsigned variables

All of the numeric formatting done by this function uses
"%u", but we pass in a signed "int". The actual range
doesn't matter here, since the conditional makes sure we're
always showing reasonably small numbers. And even gcc's
format-checker does not seem to mind. But it's potentially
confusing to a reader of the code to see the mismatch.

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>

diff --git a/strbuf.c b/strbuf.c
index db9069c..54f29bb 100644
--- a/strbuf.c
+++ b/strbuf.c
@@ -734,18 +734,18 @@ void strbuf_humanise_bytes(struct strbuf *buf, off_t bytes)
 {
 	if (bytes > 1 << 30) {
 		strbuf_addf(buf, "%u.%2.2u GiB",
-			    (int)(bytes >> 30),
-			    (int)(bytes & ((1 << 30) - 1)) / 10737419);
+			    (unsigned)(bytes >> 30),
+			    (unsigned)(bytes & ((1 << 30) - 1)) / 10737419);
 	} else if (bytes > 1 << 20) {
-		int x = bytes + 5243;  /* for rounding */
+		unsigned x = bytes + 5243;  /* for rounding */
 		strbuf_addf(buf, "%u.%2.2u MiB",
 			    x >> 20, ((x & ((1 << 20) - 1)) * 100) >> 20);
 	} else if (bytes > 1 << 10) {
-		int x = bytes + 5;  /* for rounding */
+		unsigned x = bytes + 5;  /* for rounding */
 		strbuf_addf(buf, "%u.%2.2u KiB",
 			    x >> 10, ((x & ((1 << 10) - 1)) * 100) >> 10);
 	} else {
-		strbuf_addf(buf, "%u bytes", (int)bytes);
+		strbuf_addf(buf, "%u bytes", (unsigned)bytes);
 	}
 }
 
-- 
cgit v0.10.2-6-g49f6