From 77aa03d6c7f07db4a5d34afe8f5b3a55e801057c Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 24 Jul 2018 06:50:10 -0400 Subject: reencode_string: use st_add/st_mult helpers When converting a string with iconv, if the output buffer isn't big enough, we grow it. But our growth is done without any concern for integer overflow. So when we add: outalloc = sofar + insz * 2 + 32; we may end up wrapping outalloc (which is a size_t), and allocating a too-small buffer. We then manipulate it further: outsz = outalloc - sofar - 1; and feed outsz back to iconv. If outalloc is wrapped and smaller than sofar, we'll end up with a small allocation but feed a very large outsz to iconv, which could result in it overflowing the buffer. Can we use this to construct an attack wherein the victim clones a repository with a very large commit object with an encoding header, and running "git log" reencodes it into utf8, causing an overflow? An attack of this sort is likely impossible in practice. "sofar" is how many output bytes we've written total, and "insz" is the number of input bytes remaining. Imagine our input doubles in size as we output it (which is easy to do by converting latin1 to utf8, for example), and that we start with N input bytes. Our initial output buffer also starts at N bytes, so after the first call we'd have N/2 input bytes remaining (insz), and have written N bytes (sofar). That means our next allocation will be (N + N/2 * 2 + 32) bytes, or (2N + 32). We can therefore overflow a 32-bit size_t with a commit message that's just under 2^31 bytes, assuming it consists mostly of "doubling" sequences (e.g., latin1 0xe1 which becomes utf8 0xc3 0xa1). But we'll never make it that far with such a message. We'll be spending 2^31 bytes on the original string. And our initial output buffer will also be 2^31 bytes. Which is not going to succeed on a system with a 32-bit size_t, since there will be other things using the address space, too. The initial malloc will fail. 
If we imagine instead that we can triple the size when converting, then our second allocation becomes (N + 2/3N * 2 + 32), or (7/3N + 32). That still requires two allocations of 3/7 of our address space (6/7 of the total) to succeed. If we imagine we can quadruple, it becomes (5/2N + 32); we need to be able to allocate 4/5 of the address space to succeed. This might start to get plausible. But is it possible to get a 4-to-1 increase in size? Probably if you're converting to some obscure encoding. But since git defaults to utf8 for its output, that's the likely destination encoding for an attack. And while there are 4-character utf8 sequences, it's unlikely that you'd be able to find a single-byte source sequence in any encoding. So this is certainly buggy code which should be fixed, but it is probably not a useful attack vector. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano diff --git a/utf8.c b/utf8.c index d55e20c..a2fd24c 100644 --- a/utf8.c +++ b/utf8.c @@ -477,7 +477,7 @@ char *reencode_string_iconv(const char *in, size_t insz, iconv_t conv, int *outs iconv_ibp cp; outsz = insz; - outalloc = outsz + 1; /* for terminating NUL */ + outalloc = st_add(outsz, 1); /* for terminating NUL */ out = xmalloc(outalloc); outpos = out; cp = (iconv_ibp)in; @@ -497,7 +497,7 @@ char *reencode_string_iconv(const char *in, size_t insz, iconv_t conv, int *outs * converting the rest. */ sofar = outpos - out; - outalloc = sofar + insz * 2 + 32; + outalloc = st_add3(sofar, st_mult(insz, 2), 32); out = xrealloc(out, outalloc); outpos = out + sofar; outsz = outalloc - sofar - 1; -- cgit v0.10.2-6-g49f6 From c7d017d7e1cca37ca20f73c11fa9f1b319a2c3a5 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 24 Jul 2018 06:50:33 -0400 Subject: reencode_string: use size_t for string lengths The iconv interface takes a size_t, which is the appropriate type for an in-memory buffer. 
But our reencode_string_* functions use integers, meaning we may get confusing results when the sizes exceed INT_MAX. Let's use size_t consistently. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano diff --git a/convert.c b/convert.c index 64d0d30..e012959 100644 --- a/convert.c +++ b/convert.c @@ -389,7 +389,7 @@ static int encode_to_git(const char *path, const char *src, size_t src_len, struct strbuf *buf, const char *enc, int conv_flags) { char *dst; - int dst_len; + size_t dst_len; int die_on_error = conv_flags & CONV_WRITE_OBJECT; /* @@ -452,7 +452,7 @@ static int encode_to_git(const char *path, const char *src, size_t src_len, */ if (die_on_error && check_roundtrip(enc)) { char *re_src; - int re_src_len; + size_t re_src_len; re_src = reencode_string_len(dst, dst_len, enc, default_encoding, @@ -480,7 +480,7 @@ static int encode_to_worktree(const char *path, const char *src, size_t src_len, struct strbuf *buf, const char *enc) { char *dst; - int dst_len; + size_t dst_len; /* * No encoding is specified or there is nothing to encode. diff --git a/pretty.c b/pretty.c index 703fa6f..e1e4060 100644 --- a/pretty.c +++ b/pretty.c @@ -1538,7 +1538,7 @@ void format_commit_message(const struct commit *commit, } if (output_enc) { - int outsz; + size_t outsz; char *out = reencode_string_len(sb->buf, sb->len, output_enc, utf8, &outsz); if (out) diff --git a/strbuf.c b/strbuf.c index b0716ac..e79758b 100644 --- a/strbuf.c +++ b/strbuf.c @@ -134,7 +134,7 @@ void strbuf_ltrim(struct strbuf *sb) int strbuf_reencode(struct strbuf *sb, const char *from, const char *to) { char *out; - int len; + size_t len; if (same_encoding(from, to)) return 0; diff --git a/utf8.c b/utf8.c index a2fd24c..edcd1e8 100644 --- a/utf8.c +++ b/utf8.c @@ -470,7 +470,7 @@ int utf8_fprintf(FILE *stream, const char *format, ...) 
#else typedef char * iconv_ibp; #endif -char *reencode_string_iconv(const char *in, size_t insz, iconv_t conv, int *outsz_p) +char *reencode_string_iconv(const char *in, size_t insz, iconv_t conv, size_t *outsz_p) { size_t outsz, outalloc; char *out, *outpos; @@ -534,9 +534,9 @@ static const char *fallback_encoding(const char *name) return name; } -char *reencode_string_len(const char *in, int insz, +char *reencode_string_len(const char *in, size_t insz, const char *out_encoding, const char *in_encoding, - int *outsz) + size_t *outsz) { iconv_t conv; char *out; diff --git a/utf8.h b/utf8.h index db73a2d..ce1c269 100644 --- a/utf8.h +++ b/utf8.h @@ -25,14 +25,14 @@ void strbuf_utf8_replace(struct strbuf *sb, int pos, int width, #ifndef NO_ICONV char *reencode_string_iconv(const char *in, size_t insz, - iconv_t conv, int *outsz); -char *reencode_string_len(const char *in, int insz, + iconv_t conv, size_t *outsz); +char *reencode_string_len(const char *in, size_t insz, const char *out_encoding, const char *in_encoding, - int *outsz); + size_t *outsz); #else -static inline char *reencode_string_len(const char *a, int b, - const char *c, const char *d, int *e) +static inline char *reencode_string_len(const char *a, size_t b, + const char *c, const char *d, size_t *e) { if (e) *e = 0; return NULL; } #endif -- cgit v0.10.2-6-g49f6 From 26114c00be2cd49b97b18df69a909d3330886e9d Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 24 Jul 2018 06:51:08 -0400 Subject: strbuf: use size_t for length in intermediate variables A few strbuf functions store the length of a strbuf in a temporary variable. We should always use size_t for this, as it's possible for a strbuf to exceed an "int" (e.g., a 2GB string on a 64-bit system). This is unlikely in practice, but we should try to behave sensibly on silly or malicious input. 
Signed-off-by: Jeff King Signed-off-by: Junio C Hamano diff --git a/strbuf.c b/strbuf.c index e79758b..6ff1f80 100644 --- a/strbuf.c +++ b/strbuf.c @@ -209,7 +209,7 @@ void strbuf_list_free(struct strbuf **sbs) int strbuf_cmp(const struct strbuf *a, const struct strbuf *b) { - int len = a->len < b->len ? a->len: b->len; + size_t len = a->len < b->len ? a->len: b->len; int cmp = memcmp(a->buf, b->buf, len); if (cmp) return cmp; @@ -389,7 +389,7 @@ size_t strbuf_expand_dict_cb(struct strbuf *sb, const char *placeholder, void strbuf_addbuf_percentquote(struct strbuf *dst, const struct strbuf *src) { - int i, len = src->len; + size_t i, len = src->len; for (i = 0; i < len; i++) { if (src->buf[i] == '%') @@ -960,7 +960,7 @@ static size_t cleanup(char *line, size_t len) */ void strbuf_stripspace(struct strbuf *sb, int skip_comments) { - int empties = 0; + size_t empties = 0; size_t i, j, len, newlen; char *eol; -- cgit v0.10.2-6-g49f6 From f3e76ed228d60688b49dbc2735e4633e55969e30 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 24 Jul 2018 06:51:25 -0400 Subject: strbuf_readlink: use ssize_t The return type of readlink() is ssize_t, not int. This probably doesn't matter in practice, as it would require a 2GB symlink destination, but it doesn't hurt to be careful. 
Signed-off-by: Jeff King Signed-off-by: Junio C Hamano diff --git a/strbuf.c b/strbuf.c index 6ff1f80..db9069c 100644 --- a/strbuf.c +++ b/strbuf.c @@ -469,7 +469,7 @@ int strbuf_readlink(struct strbuf *sb, const char *path, size_t hint) hint = 32; while (hint < STRBUF_MAXLINK) { - int len; + ssize_t len; strbuf_grow(sb, hint); len = readlink(path, sb->buf, hint); -- cgit v0.10.2-6-g49f6 From 765b496dc6963ad8aaf40e9ac5dee358aa7fea47 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 24 Jul 2018 06:51:39 -0400 Subject: pass st.st_size as hint for strbuf_readlink() When we initially added the strbuf_readlink() function in b11b7e13f4 (Add generic 'strbuf_readlink()' helper function, 2008-12-17), the point was that we generally have a _guess_ as to the correct size based on the stat information, but we can't necessarily trust it. Over the years, a few callers have grown up that simply pass in 0, even though they have the stat information. Let's have them pass in their hint for consistency (and in theory efficiency, since it may avoid an extra resize/syscall loop, but neither location is probably performance critical). Note that st.st_size is actually an off_t, so in theory we need xsize_t() here. But none of the other callsites use it, and since this is just a hint, it doesn't matter either way (if we wrap we'll simply start with a too-small hint and then eventually complain when we cannot allocate the memory). 
Signed-off-by: Jeff King Signed-off-by: Junio C Hamano diff --git a/builtin/init-db.c b/builtin/init-db.c index 4ecf909..12ddda7 100644 --- a/builtin/init-db.c +++ b/builtin/init-db.c @@ -73,7 +73,8 @@ static void copy_templates_1(struct strbuf *path, struct strbuf *template_path, continue; else if (S_ISLNK(st_template.st_mode)) { struct strbuf lnk = STRBUF_INIT; - if (strbuf_readlink(&lnk, template_path->buf, 0) < 0) + if (strbuf_readlink(&lnk, template_path->buf, + st_template.st_size) < 0) die_errno(_("cannot readlink '%s'"), template_path->buf); if (symlink(lnk.buf, path->buf)) die_errno(_("cannot symlink '%s' '%s'"), diff --git a/refs/files-backend.c b/refs/files-backend.c index a9a066d..c110c25 100644 --- a/refs/files-backend.c +++ b/refs/files-backend.c @@ -363,7 +363,7 @@ stat_ref: /* Follow "normalized" - ie "refs/.." symlinks by hand */ if (S_ISLNK(st.st_mode)) { strbuf_reset(&sb_contents); - if (strbuf_readlink(&sb_contents, path, 0) < 0) { + if (strbuf_readlink(&sb_contents, path, st.st_size) < 0) { if (errno == ENOENT || errno == EINVAL) /* inconsistent with lstat; retry */ goto stat_ref; -- cgit v0.10.2-6-g49f6 From 7726d360b5ba859ae2b6ceefc5d88cc518c78063 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 24 Jul 2018 06:52:29 -0400 Subject: strbuf_humanise: use unsigned variables All of the numeric formatting done by this function uses "%u", but we pass in a signed "int". The actual range doesn't matter here, since the conditional makes sure we're always showing reasonably small numbers. And even gcc's format-checker does not seem to mind. But it's potentially confusing to a reader of the code to see the mismatch. 
Signed-off-by: Jeff King Signed-off-by: Junio C Hamano diff --git a/strbuf.c b/strbuf.c index db9069c..54f29bb 100644 --- a/strbuf.c +++ b/strbuf.c @@ -734,18 +734,18 @@ void strbuf_humanise_bytes(struct strbuf *buf, off_t bytes) { if (bytes > 1 << 30) { strbuf_addf(buf, "%u.%2.2u GiB", - (int)(bytes >> 30), - (int)(bytes & ((1 << 30) - 1)) / 10737419); + (unsigned)(bytes >> 30), + (unsigned)(bytes & ((1 << 30) - 1)) / 10737419); } else if (bytes > 1 << 20) { - int x = bytes + 5243; /* for rounding */ + unsigned x = bytes + 5243; /* for rounding */ strbuf_addf(buf, "%u.%2.2u MiB", x >> 20, ((x & ((1 << 20) - 1)) * 100) >> 20); } else if (bytes > 1 << 10) { - int x = bytes + 5; /* for rounding */ + unsigned x = bytes + 5; /* for rounding */ strbuf_addf(buf, "%u.%2.2u KiB", x >> 10, ((x & ((1 << 10) - 1)) * 100) >> 10); } else { - strbuf_addf(buf, "%u bytes", (int)bytes); + strbuf_addf(buf, "%u bytes", (unsigned)bytes); } } -- cgit v0.10.2-6-g49f6