From fae9627470615df5d40f2892e518447058567316 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 28 Feb 2024 17:37:13 -0500 Subject: upload-pack: drop separate v2 "haves" array When upload-pack sees a "have" line in the v0 protocol, it immediately calls got_oid() with its argument and potentially produces an ACK response. In the v2 protocol, we simply record the argument in an oid_array, and only later process all of the "have" objects by calling the equivalent of got_oid() on the contents of the array. This makes some sense, as v2 is a pure request/response protocol, as opposed to v0's asynchronous negotiation phase. But there's a downside: a client can send us an infinite number of garbage "have" lines, which we'll happily slurp into the array, consuming memory. Whereas in v0, they are limited by the number of objects in the repository (because got_oid() only records objects we have ourselves, and we avoid duplicates by setting a flag on the object struct). We can make v2 behave more like v0 by also calling got_oid() directly when v2 parses a "have" line. Calling it early like this is OK because got_oid() itself does not interact with the client; it only confirms that we have the object and sets a few flags. Note that unlike v0, v2 does not ever (before or after this patch) check the return code of got_oid(), which lets the caller know whether we have the object. But again, that makes sense; v0 is using it to asynchronously tell the client to stop sending. In v2's synchronous protocol, we just discard those entries (and decide how to ACK at the end of each round). There is one slight tweak we need, though. In v2's state machine, we reach the SEND_ACKS state if the other side sent us any "have" lines, whether they were useful or not. Right now we do that by checking whether the "have" array had any entries, but if we record only the useful ones, that doesn't work. Instead, we can add a simple boolean that tells us whether we saw any have line (even if it was useless). This lets us drop the "haves" array entirely, as we're now placing objects directly into the "have_obj" object array (which is where got_oid() put them in the long run anyway). And as a bonus, we can drop the secondary "common" array used in process_haves_and_send_acks(). It was essentially a copy of "haves" minus the objects we do not have. But now that we are using "have_obj" directly, we know everything in it is useful. So in addition to protecting ourselves against malicious input, we should slightly lower our memory usage for normal inputs. Note that there is one user-visible effect. The trace2 output records the number of "haves". Previously this was the total number of "have" lines we saw, but now is the number of useful ones. We could retain the original meaning by keeping a separate counter, but it doesn't seem worth the effort; this trace info is for debugging and metrics, and arguably the count of common oids is at least as useful as the total count. Reported-by: Benjamin Flesch Signed-off-by: Jeff King Signed-off-by: Junio C Hamano diff --git a/upload-pack.c b/upload-pack.c index 2537aff..7a83127 100644 --- a/upload-pack.c +++ b/upload-pack.c @@ -61,7 +61,6 @@ struct upload_pack_data { struct string_list symref; /* v0 only */ struct object_array want_obj; struct object_array have_obj; - struct oid_array haves; /* v2 only */ struct string_list wanted_refs; /* v2 only */ struct strvec hidden_refs; @@ -113,6 +112,7 @@ struct upload_pack_data { unsigned done : 1; /* v2 only */ unsigned allow_ref_in_want : 1; /* v2 only */ unsigned allow_sideband_all : 1; /* v2 only */ + unsigned seen_haves : 1; /* v2 only */ unsigned advertise_sid : 1; unsigned sent_capabilities : 1; }; @@ -124,7 +124,6 @@ static void upload_pack_data_init(struct upload_pack_data *data) struct strvec hidden_refs = STRVEC_INIT; struct object_array want_obj = OBJECT_ARRAY_INIT; struct object_array have_obj = OBJECT_ARRAY_INIT; - struct oid_array haves = OID_ARRAY_INIT; struct object_array shallows = OBJECT_ARRAY_INIT; struct string_list deepen_not = STRING_LIST_INIT_DUP; struct string_list uri_protocols = STRING_LIST_INIT_DUP; @@ -137,7 +136,6 @@ static void upload_pack_data_init(struct upload_pack_data *data) data->hidden_refs = hidden_refs; data->want_obj = want_obj; data->have_obj = have_obj; - data->haves = haves; data->shallows = shallows; data->deepen_not = deepen_not; data->uri_protocols = uri_protocols; @@ -159,7 +157,6 @@ static void upload_pack_data_clear(struct upload_pack_data *data) strvec_clear(&data->hidden_refs); object_array_clear(&data->want_obj); object_array_clear(&data->have_obj); - oid_array_clear(&data->haves); object_array_clear(&data->shallows); string_list_clear(&data->deepen_not, 0); object_array_clear(&data->extra_edge_obj); @@ -1532,15 +1529,14 @@ static int parse_want_ref(struct packet_writer *writer, const char *line, return 0; } -static int parse_have(const char *line, struct oid_array *haves) +static int parse_have(const char *line, struct upload_pack_data *data) { const char *arg; if (skip_prefix(line, "have ", &arg)) { struct object_id oid; - if (get_oid_hex(arg, &oid)) - die("git upload-pack: expected SHA1 object, got '%s'", arg); - oid_array_append(haves, &oid); + got_oid(data, arg, &oid); + data->seen_haves = 1; return 1; } @@ -1552,7 +1548,7 @@ static void trace2_fetch_info(struct upload_pack_data *data) struct json_writer jw = JSON_WRITER_INIT; jw_object_begin(&jw, 0); - jw_object_intmax(&jw, "haves", data->haves.nr); + jw_object_intmax(&jw, "haves", data->have_obj.nr); jw_object_intmax(&jw, "wants", data->want_obj.nr); jw_object_intmax(&jw, "want-refs", data->wanted_refs.nr); jw_object_intmax(&jw, "depth", data->depth); @@ -1586,7 +1582,7 @@ static void process_args(struct packet_reader *request, &data->hidden_refs, &data->want_obj)) continue; /* process have line */ - if (parse_have(arg, &data->haves)) + if (parse_have(arg, data)) continue; /* process args like thin-pack */ @@ -1664,27 +1660,7 @@ static void process_args(struct packet_reader *request, trace2_fetch_info(data); } -static int process_haves(struct upload_pack_data *data, struct oid_array *common) -{ - int i; - - /* Process haves */ - for (i = 0; i < data->haves.nr; i++) { - const struct object_id *oid = &data->haves.oid[i]; - - if (!repo_has_object_file_with_flags(the_repository, oid, - OBJECT_INFO_QUICK | OBJECT_INFO_SKIP_FETCH_OBJECT)) - continue; - - oid_array_append(common, oid); - - do_got_oid(data, oid); - } - - return 0; -} - -static int send_acks(struct upload_pack_data *data, struct oid_array *acks) +static int send_acks(struct upload_pack_data *data, struct object_array *acks) { int i; @@ -1696,7 +1672,7 @@ static int send_acks(struct upload_pack_data *data, struct oid_array *acks) for (i = 0; i < acks->nr; i++) { packet_writer_write(&data->writer, "ACK %s\n", - oid_to_hex(&acks->oid[i])); + oid_to_hex(&acks->objects[i].item->oid)); } if (!data->wait_for_done && ok_to_give_up(data)) { @@ -1710,13 +1686,11 @@ static int send_acks(struct upload_pack_data *data, struct oid_array *acks) static int process_haves_and_send_acks(struct upload_pack_data *data) { - struct oid_array common = OID_ARRAY_INIT; int ret = 0; - process_haves(data, &common); if (data->done) { ret = 1; - } else if (send_acks(data, &common)) { + } else if (send_acks(data, &data->have_obj)) { packet_writer_delim(&data->writer); ret = 1; } else { @@ -1725,8 +1699,6 @@ static int process_haves_and_send_acks(struct upload_pack_data *data) ret = 0; } - oid_array_clear(&data->haves); - oid_array_clear(&common); return ret; } @@ -1796,7 +1768,7 @@ int upload_pack_v2(struct repository *r UNUSED, struct packet_reader *request) * they didn't want anything. */ state = FETCH_DONE; - } else if (data.haves.nr) { + } else if (data.seen_haves) { /* * Request had 'have' lines, so lets ACK them. */ -- cgit v0.10.2-6-g49f6 From 720ba25d993423ab7b12b9bbdc62dadaa78d315e Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 28 Feb 2024 17:37:20 -0500 Subject: upload-pack: switch deepen-not list to an oid_array When we see a "deepen-not" line from the client, we verify that the given name can be resolved as a ref, and then add it to a string list to be passed later to an internal "rev-list --not" traversal. We record the actual refname in the string list (so the traversal resolves it again later), but we'd be better off recording the resolved oid: 1. There's a tiny bit of wasted work in resolving it twice. 2. There's a small race condition with simultaneous updates; the later traversal may resolve to a different value (or not at all). This shouldn't cause any bad behavior (we do not care about the value in this first resolution, so whatever value rev-list gets is OK) but it could mean a confusing error message (if upload-pack fails to resolve the ref it produces a useful message, but a failing traversal later results in just "revision walk setup failed"). 3. It makes it simpler to de-duplicate the results. We don't de-dup at all right now, but we will in the next patch. >From the client's perspective the behavior should be the same. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano diff --git a/upload-pack.c b/upload-pack.c index 7a83127..f6395b2 100644 --- a/upload-pack.c +++ b/upload-pack.c @@ -65,7 +65,7 @@ struct upload_pack_data { struct strvec hidden_refs; struct object_array shallows; - struct string_list deepen_not; + struct oid_array deepen_not; struct object_array extra_edge_obj; int depth; timestamp_t deepen_since; @@ -125,7 +125,7 @@ static void upload_pack_data_init(struct upload_pack_data *data) struct object_array want_obj = OBJECT_ARRAY_INIT; struct object_array have_obj = OBJECT_ARRAY_INIT; struct object_array shallows = OBJECT_ARRAY_INIT; - struct string_list deepen_not = STRING_LIST_INIT_DUP; + struct oid_array deepen_not = OID_ARRAY_INIT; struct string_list uri_protocols = STRING_LIST_INIT_DUP; struct object_array extra_edge_obj = OBJECT_ARRAY_INIT; struct string_list allowed_filters = STRING_LIST_INIT_DUP; @@ -158,7 +158,7 @@ static void upload_pack_data_clear(struct upload_pack_data *data) object_array_clear(&data->want_obj); object_array_clear(&data->have_obj); object_array_clear(&data->shallows); - string_list_clear(&data->deepen_not, 0); + oid_array_clear(&data->deepen_not); object_array_clear(&data->extra_edge_obj); list_objects_filter_release(&data->filter_options); string_list_clear(&data->allowed_filters, 0); @@ -926,8 +926,8 @@ static int send_shallow_list(struct upload_pack_data *data) if (data->deepen_not.nr) { strvec_push(&av, "--not"); for (i = 0; i < data->deepen_not.nr; i++) { - struct string_list_item *s = data->deepen_not.items + i; - strvec_push(&av, s->string); + struct object_id *oid = data->deepen_not.oid + i; + strvec_push(&av, oid_to_hex(oid)); } strvec_push(&av, "--not"); } @@ -1004,7 +1004,7 @@ static int process_deepen_since(const char *line, timestamp_t *deepen_since, int return 0; } -static int process_deepen_not(const char *line, struct string_list *deepen_not, int *deepen_rev_list) +static int process_deepen_not(const char *line, struct oid_array *deepen_not, int *deepen_rev_list) { const char *arg; if (skip_prefix(line, "deepen-not ", &arg)) { @@ -1012,7 +1012,7 @@ static int process_deepen_not(const char *line, struct string_list *deepen_not, struct object_id oid; if (expand_ref(the_repository, arg, strlen(arg), &oid, &ref) != 1) die("git upload-pack: ambiguous deepen-not: %s", line); - string_list_append(deepen_not, ref); + oid_array_append(deepen_not, &oid); free(ref); *deepen_rev_list = 1; return 1; -- cgit v0.10.2-6-g49f6 From 388b96df319c62e9c8d984921b8967db37481d8a Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 28 Feb 2024 17:37:44 -0500 Subject: upload-pack: use oidset for deepen_not list We record the oid of every deepen-not line the client sends to us. For a well-behaved client, the resulting array should be bounded by the number of unique refs we have. But because there's no de-duplication, a malicious client can cause the array to grow unbounded by just sending the same "refs/heads/foo" over and over (assuming such a ref exists). Since the deepen-not list is just being fed to a "rev-list --not" traversal, the order of items doesn't matter. So we can replace the oid_array with an oidset which notices and skips duplicates. That bounds the memory in malicious cases to be linear in the number of unique refs. And even in non-malicious cases, there may be a slight improvement in memory usage if multiple refs point to the same oid (though in practice this list is probably pretty tiny anyway, as it comes from the user specifying "--shallow-exclude" on the client fetch). Note that in the trace2 output we'll now output the number of de-duplicated objects, rather than the total number of "deepen-not" lines we received. This is arguably a more useful value for tracing / debugging anyway. Reported-by: Benjamin Flesch Signed-off-by: Jeff King Signed-off-by: Junio C Hamano diff --git a/upload-pack.c b/upload-pack.c index f6395b2..ebad902 100644 --- a/upload-pack.c +++ b/upload-pack.c @@ -65,7 +65,7 @@ struct upload_pack_data { struct strvec hidden_refs; struct object_array shallows; - struct oid_array deepen_not; + struct oidset deepen_not; struct object_array extra_edge_obj; int depth; timestamp_t deepen_since; @@ -125,7 +125,7 @@ static void upload_pack_data_init(struct upload_pack_data *data) struct object_array want_obj = OBJECT_ARRAY_INIT; struct object_array have_obj = OBJECT_ARRAY_INIT; struct object_array shallows = OBJECT_ARRAY_INIT; - struct oid_array deepen_not = OID_ARRAY_INIT; + struct oidset deepen_not = OID_ARRAY_INIT; struct string_list uri_protocols = STRING_LIST_INIT_DUP; struct object_array extra_edge_obj = OBJECT_ARRAY_INIT; struct string_list allowed_filters = STRING_LIST_INIT_DUP; @@ -158,7 +158,7 @@ static void upload_pack_data_clear(struct upload_pack_data *data) object_array_clear(&data->want_obj); object_array_clear(&data->have_obj); object_array_clear(&data->shallows); - oid_array_clear(&data->deepen_not); + oidset_clear(&data->deepen_not); object_array_clear(&data->extra_edge_obj); list_objects_filter_release(&data->filter_options); string_list_clear(&data->allowed_filters, 0); @@ -923,12 +923,13 @@ static int send_shallow_list(struct upload_pack_data *data) strvec_push(&av, "rev-list"); if (data->deepen_since) strvec_pushf(&av, "--max-age=%"PRItime, data->deepen_since); - if (data->deepen_not.nr) { + if (oidset_size(&data->deepen_not)) { + const struct object_id *oid; + struct oidset_iter iter; strvec_push(&av, "--not"); - for (i = 0; i < data->deepen_not.nr; i++) { - struct object_id *oid = data->deepen_not.oid + i; + oidset_iter_init(&data->deepen_not, &iter); + while ((oid = oidset_iter_next(&iter))) strvec_push(&av, oid_to_hex(oid)); - } strvec_push(&av, "--not"); } for (i = 0; i < data->want_obj.nr; i++) { @@ -1004,7 +1005,7 @@ static int process_deepen_since(const char *line, timestamp_t *deepen_since, int return 0; } -static int process_deepen_not(const char *line, struct oid_array *deepen_not, int *deepen_rev_list) +static int process_deepen_not(const char *line, struct oidset *deepen_not, int *deepen_rev_list) { const char *arg; if (skip_prefix(line, "deepen-not ", &arg)) { @@ -1012,7 +1013,7 @@ static int process_deepen_not(const char *line, struct oid_array *deepen_not, in struct object_id oid; if (expand_ref(the_repository, arg, strlen(arg), &oid, &ref) != 1) die("git upload-pack: ambiguous deepen-not: %s", line); - oid_array_append(deepen_not, &oid); + oidset_insert(deepen_not, &oid); free(ref); *deepen_rev_list = 1; return 1; @@ -1554,7 +1555,7 @@ static void trace2_fetch_info(struct upload_pack_data *data) jw_object_intmax(&jw, "depth", data->depth); jw_object_intmax(&jw, "shallows", data->shallows.nr); jw_object_bool(&jw, "deepen-since", data->deepen_since); - jw_object_intmax(&jw, "deepen-not", data->deepen_not.nr); + jw_object_intmax(&jw, "deepen-not", oidset_size(&data->deepen_not)); jw_object_bool(&jw, "deepen-relative", data->deepen_relative); if (data->filter_options.choice) jw_object_string(&jw, "filter", list_object_filter_config_name(data->filter_options.choice)); -- cgit v0.10.2-6-g49f6 From b065063c570f8dfb25eccb5094fef7e89a1883f2 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 28 Feb 2024 17:38:40 -0500 Subject: upload-pack: use a strmap for want-ref lines When the "ref-in-want" capability is advertised (which it is not by default), then upload-pack processes a "want-ref" line from the client by checking that the name is a valid ref and recording it in a string-list. In theory this list should grow no larger than the number of refs in the server-side repository. But since we don't do any de-duplication, a client which sends "want-ref refs/heads/foo" over and over will cause the array to grow without bound. We can fix this by switching to strmap, which efficiently detects duplicates. There are two client-visible changes here: 1. The "wanted-refs" response will now be in an apparently-random order (based on iterating the hashmap) rather than the order given by the client. The protocol documentation is quiet on ordering here. The current fetch-pack implementation is happy with any order, as it looks up each returned ref using a binary search in its local sorted list. JGit seems to implement want-ref on the server side, but has no client-side support. libgit2 doesn't support either side. It would obviously be possible to record the original order or to use the strmap as an auxiliary data structure. But if the client doesn't care, we may as well do the simplest thing. 2. We'll now reject duplicates explicitly as a protocol error. The client should never send them (and our current implementation, even when asked to "git fetch master:one master:two" will de-dup on the client side). If we wanted to be more forgiving, we could perhaps just throw away the duplicates. But then our "wanted-refs" response back to the client would omit the duplicates, and it's hard to say what a client that accidentally sent a duplicate would do with that. So I think we're better off to complain loudly before anybody accidentally writes such a client. Let's also add a note to the protocol documentation clarifying that duplicates are forbidden. As discussed above, this was already the intent, but it's not very explicit. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano diff --git a/Documentation/gitprotocol-v2.txt b/Documentation/gitprotocol-v2.txt index 0b800ab..837ea61 100644 --- a/Documentation/gitprotocol-v2.txt +++ b/Documentation/gitprotocol-v2.txt @@ -346,7 +346,8 @@ the 'wanted-refs' section in the server's response as explained below. want-ref Indicates to the server that the client wants to retrieve a particular ref, where is the full name of a ref on the - server. + server. It is a protocol error to send want-ref for the + same ref more than once. If the 'sideband-all' feature is advertised, the following argument can be included in the client's request: diff --git a/upload-pack.c b/upload-pack.c index ebad902..8b47576 100644 --- a/upload-pack.c +++ b/upload-pack.c @@ -28,6 +28,7 @@ #include "shallow.h" #include "write-or-die.h" #include "json-writer.h" +#include "strmap.h" /* Remember to update object flag allocation in object.h */ #define THEY_HAVE (1u << 11) @@ -61,7 +62,7 @@ struct upload_pack_data { struct string_list symref; /* v0 only */ struct object_array want_obj; struct object_array have_obj; - struct string_list wanted_refs; /* v2 only */ + struct strmap wanted_refs; /* v2 only */ struct strvec hidden_refs; struct object_array shallows; @@ -120,7 +121,7 @@ struct upload_pack_data { static void upload_pack_data_init(struct upload_pack_data *data) { struct string_list symref = STRING_LIST_INIT_DUP; - struct string_list wanted_refs = STRING_LIST_INIT_DUP; + struct strmap wanted_refs = STRMAP_INIT; struct strvec hidden_refs = STRVEC_INIT; struct object_array want_obj = OBJECT_ARRAY_INIT; struct object_array have_obj = OBJECT_ARRAY_INIT; @@ -153,7 +154,7 @@ static void upload_pack_data_init(struct upload_pack_data *data) static void upload_pack_data_clear(struct upload_pack_data *data) { string_list_clear(&data->symref, 1); - string_list_clear(&data->wanted_refs, 1); + strmap_clear(&data->wanted_refs, 1); strvec_clear(&data->hidden_refs); object_array_clear(&data->want_obj); object_array_clear(&data->have_obj); @@ -1488,14 +1489,13 @@ static int parse_want(struct packet_writer *writer, const char *line, } static int parse_want_ref(struct packet_writer *writer, const char *line, - struct string_list *wanted_refs, + struct strmap *wanted_refs, struct strvec *hidden_refs, struct object_array *want_obj) { const char *refname_nons; if (skip_prefix(line, "want-ref ", &refname_nons)) { struct object_id oid; - struct string_list_item *item; struct object *o = NULL; struct strbuf refname = STRBUF_INIT; @@ -1507,8 +1507,11 @@ static int parse_want_ref(struct packet_writer *writer, const char *line, } strbuf_release(&refname); - item = string_list_append(wanted_refs, refname_nons); - item->util = oiddup(&oid); + if (strmap_put(wanted_refs, refname_nons, oiddup(&oid))) { + packet_writer_error(writer, "duplicate want-ref %s", + refname_nons); + die("duplicate want-ref %s", refname_nons); + } if (!starts_with(refname_nons, "refs/tags/")) { struct commit *commit = lookup_commit_in_graph(the_repository, &oid); @@ -1551,7 +1554,7 @@ static void trace2_fetch_info(struct upload_pack_data *data) jw_object_begin(&jw, 0); jw_object_intmax(&jw, "haves", data->have_obj.nr); jw_object_intmax(&jw, "wants", data->want_obj.nr); - jw_object_intmax(&jw, "want-refs", data->wanted_refs.nr); + jw_object_intmax(&jw, "want-refs", strmap_get_size(&data->wanted_refs)); jw_object_intmax(&jw, "depth", data->depth); jw_object_intmax(&jw, "shallows", data->shallows.nr); jw_object_bool(&jw, "deepen-since", data->deepen_since); @@ -1705,17 +1708,18 @@ static int process_haves_and_send_acks(struct upload_pack_data *data) static void send_wanted_ref_info(struct upload_pack_data *data) { - const struct string_list_item *item; + struct hashmap_iter iter; + const struct strmap_entry *e; - if (!data->wanted_refs.nr) + if (strmap_empty(&data->wanted_refs)) return; packet_writer_write(&data->writer, "wanted-refs\n"); - for_each_string_list_item(item, &data->wanted_refs) { + strmap_for_each_entry(&data->wanted_refs, &iter, e) { packet_writer_write(&data->writer, "%s %s\n", - oid_to_hex(item->util), - item->string); + oid_to_hex(e->value), + e->key); } packet_writer_delim(&data->writer); -- cgit v0.10.2-6-g49f6 From 179776f9e6c4bd2f423c78fcf8b1e4cb4afc4c45 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 28 Feb 2024 17:38:46 -0500 Subject: upload-pack: accept only a single packfile-uri line When we see a packfile-uri line from the client, we use string_list_split() to split it on commas and store the result in a string_list. A single packfile-uri line is therefore limited to storing ~64kb, the size of a pkt-line. But we'll happily accept multiple such lines, and each line appends to the string list, growing without bound. In theory this could be useful, making: 0017packfile-uris http 0018packfile-uris https equivalent to: 001dpackfile-uris http,https But the protocol documentation doesn't indicate that this should work (and indeed, refers to this in the singular as "the following argument can be included in the client's request"). And the client-side implementation in fetch-pack has always sent a single line (JGit appears to understand the line on the server side but has no client-side implementation, and libgit2 understands neither). If we were worried about compatibility, we could instead just put a limit on the maximum number of values we'd accept. The current client implementation limits itself to only two values: "http" and "https", so something like "256" would be more than enough. But accepting only a single line seems more in line with the protocol documentation, and matches other parts of the protocol (e.g., we will not accept a second "filter" line). We'll also make this more explicit in the protocol documentation; as above, I think this was always the intent, but there's no harm in making it clear. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano diff --git a/Documentation/gitprotocol-v2.txt b/Documentation/gitprotocol-v2.txt index 837ea61..414bc62 100644 --- a/Documentation/gitprotocol-v2.txt +++ b/Documentation/gitprotocol-v2.txt @@ -362,7 +362,8 @@ included in the client's request: If the 'packfile-uris' feature is advertised, the following argument can be included in the client's request as well as the potential addition of the 'packfile-uris' section in the server's response as -explained below. +explained below. Note that at most one `packfile-uris` line can be sent +to the server. packfile-uris Indicates to the server that the client is willing to receive diff --git a/upload-pack.c b/upload-pack.c index 8b47576..2a5c526 100644 --- a/upload-pack.c +++ b/upload-pack.c @@ -1646,6 +1646,9 @@ static void process_args(struct packet_reader *request, } if (skip_prefix(arg, "packfile-uris ", &p)) { + if (data->uri_protocols.nr) + send_err_and_die(data, + "multiple packfile-uris lines forbidden"); string_list_split(&data->uri_protocols, p, ',', -1); continue; } -- cgit v0.10.2-6-g49f6 From 8c735b11decc96d673140b268e4a257629e1dad4 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Wed, 28 Feb 2024 17:38:58 -0500 Subject: upload-pack: disallow object-info capability by default We added an "object-info" capability to the v2 upload-pack protocol in a2ba162cda (object-info: support for retrieving object info, 2021-04-20). In the almost 3 years since, we have not added any client-side support, and it does not appear to exist in other implementations either (JGit understands the verb on the server side, but not on the client side). Since this largely unused code is accessible over the network by default, it increases the attack surface of upload-pack. I don't know of any particularly severe problem, but one issue is that because of the request/response nature of the v2 protocol, it will happily read an unbounded number of packets, adding each one to a string list (without regard to whether they are objects we know about, duplicates, etc). This may be something we want to improve in the long run, but in the short term it makes sense to disable the feature entirely. We'll add a config option as an escape hatch for anybody who wants to develop the feature further. A more gentle option would be to add the config option to let people disable it manually, but leave it enabled by default. But given that there's no client side support, that seems like the wrong balance with security. Disabling by default will slow adoption a bit once client-side support does become available (there were some patches[1] in 2022, but nothing got merged and there's been nothing since). But clients have to deal with older servers that do not understand the option anyway (and the capability system handles that), so it will just be a matter of servers flipping their config at that point (and hopefully once any unbounded allocations have been addressed). [jk: this is a patch that GitHub has been running for several years, but rebased forward and with a new commit message for upstream] [1] https://lore.kernel.org/git/20220208231911.725273-1-calvinwan@google.com/ Signed-off-by: Taylor Blau Signed-off-by: Jeff King Signed-off-by: Junio C Hamano diff --git a/Documentation/config/transfer.txt b/Documentation/config/transfer.txt index a9cbdb8..f1ce50f 100644 --- a/Documentation/config/transfer.txt +++ b/Documentation/config/transfer.txt @@ -121,3 +121,7 @@ transfer.bundleURI:: information from the remote server (if advertised) and download bundles before continuing the clone through the Git protocol. Defaults to `false`. + +transfer.advertiseObjectInfo:: + When `true`, the `object-info` capability is advertised by + servers. Defaults to false. diff --git a/serve.c b/serve.c index a1d7113..aa651b7 100644 --- a/serve.c +++ b/serve.c @@ -12,6 +12,7 @@ #include "trace2.h" static int advertise_sid = -1; +static int advertise_object_info = -1; static int client_hash_algo = GIT_HASH_SHA1; static int always_advertise(struct repository *r UNUSED, @@ -67,6 +68,17 @@ static void session_id_receive(struct repository *r UNUSED, trace2_data_string("transfer", NULL, "client-sid", client_sid); } +static int object_info_advertise(struct repository *r, struct strbuf *value UNUSED) +{ + if (advertise_object_info == -1 && + repo_config_get_bool(r, "transfer.advertiseobjectinfo", + &advertise_object_info)) { + /* disabled by default */ + advertise_object_info = 0; + } + return advertise_object_info; +} + struct protocol_capability { /* * The name of the capability. The server uses this name when @@ -135,7 +147,7 @@ static struct protocol_capability capabilities[] = { }, { .name = "object-info", - .advertise = always_advertise, + .advertise = object_info_advertise, .command = cap_object_info, }, { diff --git a/t/t5555-http-smart-common.sh b/t/t5555-http-smart-common.sh index b1cfe8b..3dcb334 100755 --- a/t/t5555-http-smart-common.sh +++ b/t/t5555-http-smart-common.sh @@ -131,7 +131,6 @@ test_expect_success 'git upload-pack --advertise-refs: v2' ' fetch=shallow wait-for-done server-option object-format=$(test_oid algo) - object-info 0000 EOF diff --git a/t/t5701-git-serve.sh b/t/t5701-git-serve.sh index 3591bc2..c48830d 100755 --- a/t/t5701-git-serve.sh +++ b/t/t5701-git-serve.sh @@ -20,7 +20,6 @@ test_expect_success 'test capability advertisement' ' fetch=shallow wait-for-done server-option object-format=$(test_oid algo) - object-info EOF cat >expect.trailer <<-EOF && 0000 @@ -323,6 +322,8 @@ test_expect_success 'unexpected lines are not allowed in fetch request' ' # Test the basics of object-info # test_expect_success 'basics of object-info' ' + test_config transfer.advertiseObjectInfo true && + test-tool pkt-line pack >in <<-EOF && command=object-info object-format=$(test_oid algo) @@ -380,4 +381,25 @@ test_expect_success 'basics of bundle-uri: dies if not enabled' ' test_must_be_empty out ' +test_expect_success 'object-info missing from capabilities when disabled' ' + test_config transfer.advertiseObjectInfo false && + + GIT_TEST_SIDEBAND_ALL=0 test-tool serve-v2 \ + --advertise-capabilities >out && + test-tool pkt-line unpack actual && + + ! grep object.info actual +' + +test_expect_success 'object-info commands rejected when disabled' ' + test_config transfer.advertiseObjectInfo false && + + test-tool pkt-line pack >in <<-EOF && + command=object-info + EOF + + test_must_fail test-tool serve-v2 --stateless-rpc err && + grep invalid.command err +' + test_done -- cgit v0.10.2-6-g49f6 From 5f64279443520922949ea89babe9bd3712c11fec Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 28 Feb 2024 17:39:00 -0500 Subject: upload-pack: always turn off save_commit_buffer When the client sends us "want $oid" lines, we call parse_object($oid) to get an object struct. It's important to parse the commits because we need to traverse them in the negotiation phase. But of course we don't need to hold on to the commit messages for each one. We've turned off the save_commit_buffer flag in get_common_commits() for a long time, since f0243f26f6 (git-upload-pack: More efficient usage of the has_sha1 array, 2005-10-28). That helps with the commits we see while actually traversing. But: 1. That function is only used by the v0 protocol. I think the v2 protocol's code path leaves the flag on (and thus pays the extra memory penalty), though I didn't measure it specifically. 2. If the client sends us a bunch of "want" lines, that happens before the negotiation phase. So we'll hold on to all of those commit messages. Generally the number of "want" lines scales with the refs, not with the number of objects in the repo. But a malicious client could send a lot in order to waste memory. As an example of (2), if I generate a request to fetch all commits in git.git like this: pktline() { local msg="$*" printf "%04x%s\n" $((1+4+${#msg})) "$msg" } want_commits() { pktline command=fetch printf 0001 git cat-file --batch-all-objects --batch-check='%(objectname) %(objecttype)' | while read oid type; do test "$type" = "commit" || continue pktline want $oid done pktline done printf 0000 } want_commits | GIT_PROTOCOL=version=2 valgrind --tool=massif git-upload-pack . >/dev/null before this patch upload-pack peaks at ~125MB, and after at ~35MB. The difference is not coincidentally about the same as the sum of all commit object sizes as computed by: git cat-file --batch-all-objects --batch-check='%(objecttype) %(objectsize)' | perl -alne '$v += $F[1] if $F[0] eq "commit"; END { print $v }' In a larger repository like linux.git, that number is ~1GB. In a repository with a full commit-graph file this will have no impact (and the commit graph would save us from parsing at all, so is a much better solution!). But it's easy to do, might help a little in real-world cases (where even if you have a commit graph it might not be fully up to date), and helps a lot for a worst-case malicious request. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano diff --git a/builtin/upload-pack.c b/builtin/upload-pack.c index 9b021ef..15afb97 100644 --- a/builtin/upload-pack.c +++ b/builtin/upload-pack.c @@ -8,6 +8,7 @@ #include "replace-object.h" #include "upload-pack.h" #include "serve.h" +#include "commit.h" static const char * const upload_pack_usage[] = { N_("git-upload-pack [--[no-]strict] [--timeout=] [--stateless-rpc]\n" @@ -37,6 +38,7 @@ int cmd_upload_pack(int argc, const char **argv, const char *prefix) packet_trace_identity("upload-pack"); disable_replace_refs(); + save_commit_buffer = 0; argc = parse_options(argc, argv, prefix, options, upload_pack_usage, 0); diff --git a/upload-pack.c b/upload-pack.c index 2a5c526..3970bb9 100644 --- a/upload-pack.c +++ b/upload-pack.c @@ -526,8 +526,6 @@ static int get_common_commits(struct upload_pack_data *data, int got_other = 0; int sent_ready = 0; - save_commit_buffer = 0; - for (;;) { const char *arg; -- cgit v0.10.2-6-g49f6 From a6ca601cdf82dda85db86c12973f2ecd746cb4bd Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 28 Feb 2024 17:39:03 -0500 Subject: upload-pack: use PARSE_OBJECT_SKIP_HASH_CHECK in more places In commit 0bc2557951 (upload-pack: skip parse-object re-hashing of "want" objects, 2022-09-06), we optimized the parse_object() calls for v2 "want" lines from the client so that they avoided parsing blobs, and so that they used the commit-graph rather than parsing commit objects from scratch. We should extend that to two other spots: 1. We parse "have" objects in the got_oid() function. These won't generally be non-commits (unlike "want" lines from a partial clone). But we still benefit from the use of the commit-graph. 2. For v0, the "want" lines are parsed in receive_needs(). These are also less likely to be non-commits because by default they have to be ref tips. There are config options you might set to allow non-tip objects, but you'd mostly do so to support partial clones, and clients recent enough to support partial clone will generally speak v2 anyway. So I don't expect this change to improve performance much for day-to-day operations. But both are possible denial-of-service vectors, where an attacker can waste our time by sending over a large number of objects to parse (of course we may waste even more time serving a pack to them, but we try as much as possible to optimize that in pack-objects; we should do what we can here in upload-pack, too). With this patch, running p5600 with GIT_TEST_PROTOCOL_VERSION=0 shows similar results to what we saw in 0bc2557951 (which ran with the v2 protocol by default). Here are the numbers for linux.git: Test HEAD^ HEAD ----------------------------------------------------------------------------- 5600.3: checkout of result 50.91(87.95+2.93) 41.75(79.00+3.18) -18.0% Or for a more extreme (and malicious) case, we can claim to "have" every blob in git.git over the v0 protocol: $ { echo "0032want $(git rev-parse HEAD)" printf 0000 git cat-file --batch-all-objects --batch-check='%(objectname) %(objecttype)' | perl -alne 'print "0032have $F[0]" if $F[1] eq "blob"' } >input $ time ./git.old upload-pack . /dev/null real 0m52.951s user 0m51.633s sys 0m1.304s $ time ./git.new upload-pack . /dev/null real 0m0.261s user 0m0.156s sys 0m0.105s (Note that these don't actually compute a pack because of the hacky protocol usage, so those numbers are representing the raw blob-parsing effort done by upload-pack). Signed-off-by: Jeff King Signed-off-by: Junio C Hamano diff --git a/upload-pack.c b/upload-pack.c index 3970bb9..b721155 100644 --- a/upload-pack.c +++ b/upload-pack.c @@ -469,7 +469,8 @@ static void create_pack_file(struct upload_pack_data *pack_data, static int do_got_oid(struct upload_pack_data *data, const struct object_id *oid) { int we_knew_they_have = 0; - struct object *o = parse_object(the_repository, oid); + struct object *o = parse_object_with_flags(the_repository, oid, + PARSE_OBJECT_SKIP_HASH_CHECK); if (!o) die("oops (%s)", oid_to_hex(oid)); @@ -1148,7 +1149,8 @@ static void receive_needs(struct upload_pack_data *data, free(client_sid); } - o = parse_object(the_repository, &oid_buf); + o = parse_object_with_flags(the_repository, &oid_buf, + PARSE_OBJECT_SKIP_HASH_CHECK); if (!o) { packet_writer_error(&data->writer, "upload-pack: not our ref %s", -- cgit v0.10.2-6-g49f6 From 6cd05e768b7e54ca48b16fb0214df4c70aecd46c Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 28 Feb 2024 17:39:07 -0500 Subject: upload-pack: free tree buffers after parsing When a client sends us a "want" or "have" line, we call parse_object() to get an object struct. If the object is a tree, then the parsed state means that tree->buffer points to the uncompressed contents of the tree. But we don't really care about it. We only really need to parse commits and tags; for trees and blobs, the important output is just a "struct object" with the correct type. But much worse, we do not ever free that tree buffer. It's not leaked in the traditional sense, in that we still have a pointer to it from the global object hash. But if the client requests many trees, we'll hold all of their contents in memory at the same time. Nobody really noticed because it's rare for clients to directly request a tree. It might happen for a lightweight tag pointing straight at a tree, or it might happen for a "tree:depth" partial clone filling in missing trees. But it's also possible for a malicious client to request a lot of trees, causing upload-pack's memory to balloon. For example, without this patch, requesting every tree in git.git like: pktline() { local msg="$*" printf "%04x%s\n" $((1+4+${#msg})) "$msg" } want_trees() { pktline command=fetch printf 0001 git cat-file --batch-all-objects --batch-check='%(objectname) %(objecttype)' | while read oid type; do test "$type" = "tree" || continue pktline want $oid done pktline done printf 0000 } want_trees | GIT_PROTOCOL=version=2 valgrind --tool=massif ./git upload-pack . >/dev/null shows a peak heap usage of ~3.7GB. Which is just about the sum of the sizes of all of the uncompressed trees. For linux.git, it's closer to 17GB. So the obvious thing to do is to call free_tree_buffer() after we realize that we've parsed a tree. We know that upload-pack won't need it later. But let's push the logic into parse_object_with_flags(), telling it to discard the tree buffer immediately. There are two reasons for this. One, all of the relevant call-sites already call the with_options variant to pass the SKIP_HASH flag. So it actually ends up as less code than manually free-ing in each spot. And two, it enables an extra optimization that I'll discuss below. I've touched all of the sites that currently use SKIP_HASH in upload-pack. That drops the peak heap of the upload-pack invocation above from 3.7GB to ~24MB. I've also modified the caller in get_reference(); a partial clone benefits from its use in pack-objects for the reasons given in 0bc2557951 (upload-pack: skip parse-object re-hashing of "want" objects, 2022-09-06), where we were measuring blob requests. But note that the results of get_reference() are used for traversing, as well; so we really would _eventually_ use the tree contents. That makes this at first glance a space/time tradeoff: we won't hold all of the trees in memory at once, but we'll have to reload them each when it comes time to traverse. And here's where our extra optimization comes in. If the caller is not going to immediately look at the tree contents, and it doesn't care about checking the hash, then parse_object() can simply skip loading the tree entirely, just like we do for blobs! And now it's not a space/time tradeoff in get_reference() anymore. It's just a lazy-load: we're delaying reading the tree contents until it's time to actually traverse them one by one. And of course for upload-pack, this optimization means we never load the trees at all, saving lots of CPU time. Timing the "every tree from git.git" request above shows upload-pack dropping from 32 seconds of CPU to 19 (the remainder is mostly due to pack-objects actually sending the pack; timing just the upload-pack portion shows we go from 13s to ~0.28s). These are all highly gamed numbers, of course. For real-world partial-clone requests we're saving only a small bit of time in practice. But it does help harden upload-pack against malicious denial-of-service attacks. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano diff --git a/object.c b/object.c index 2c61e4c..e305a30 100644 --- a/object.c +++ b/object.c @@ -272,6 +272,7 @@ struct object *parse_object_with_flags(struct repository *r, enum parse_object_flags flags) { int skip_hash = !!(flags & PARSE_OBJECT_SKIP_HASH_CHECK); + int discard_tree = !!(flags & PARSE_OBJECT_DISCARD_TREE); unsigned long size; enum object_type type; int eaten; @@ -299,6 +300,17 @@ struct object *parse_object_with_flags(struct repository *r, return lookup_object(r, oid); } + /* + * If the caller does not care about the tree buffer and does not + * care about checking the hash, we can simply verify that we + * have the on-disk object with the correct type. + */ + if (skip_hash && discard_tree && + (!obj || obj->type == OBJ_TREE) && + oid_object_info(r, oid, NULL) == OBJ_TREE) { + return &lookup_tree(r, oid)->object; + } + buffer = repo_read_object_file(r, oid, &type, &size); if (buffer) { if (!skip_hash && @@ -312,6 +324,8 @@ struct object *parse_object_with_flags(struct repository *r, buffer, &eaten); if (!eaten) free(buffer); + if (discard_tree && type == OBJ_TREE) + free_tree_buffer((struct tree *)obj); return obj; } return NULL; diff --git a/object.h b/object.h index 114d459..c7123ca 100644 --- a/object.h +++ b/object.h @@ -197,6 +197,7 @@ void *object_as_type(struct object *obj, enum object_type type, int quiet); */ enum parse_object_flags { PARSE_OBJECT_SKIP_HASH_CHECK = 1 << 0, + PARSE_OBJECT_DISCARD_TREE = 1 << 1, }; struct object *parse_object(struct repository *r, const struct object_id *oid); struct object *parse_object_with_flags(struct repository *r, diff --git a/revision.c b/revision.c index 2424c9b..b10f63a 100644 --- a/revision.c +++ b/revision.c @@ -381,7 +381,8 @@ static struct object *get_reference(struct rev_info *revs, const char *name, object = parse_object_with_flags(revs->repo, oid, revs->verify_objects ? 0 : - PARSE_OBJECT_SKIP_HASH_CHECK); + PARSE_OBJECT_SKIP_HASH_CHECK | + PARSE_OBJECT_DISCARD_TREE); if (!object) { if (revs->ignore_missing) diff --git a/upload-pack.c b/upload-pack.c index b721155..761af4a 100644 --- a/upload-pack.c +++ b/upload-pack.c @@ -470,7 +470,8 @@ static int do_got_oid(struct upload_pack_data *data, const struct object_id *oid { int we_knew_they_have = 0; struct object *o = parse_object_with_flags(the_repository, oid, - PARSE_OBJECT_SKIP_HASH_CHECK); + PARSE_OBJECT_SKIP_HASH_CHECK | + PARSE_OBJECT_DISCARD_TREE); if (!o) die("oops (%s)", oid_to_hex(oid)); @@ -1150,7 +1151,8 @@ static void receive_needs(struct upload_pack_data *data, } o = parse_object_with_flags(the_repository, &oid_buf, - PARSE_OBJECT_SKIP_HASH_CHECK); + PARSE_OBJECT_SKIP_HASH_CHECK | + PARSE_OBJECT_DISCARD_TREE); if (!o) { packet_writer_error(&data->writer, "upload-pack: not our ref %s", @@ -1467,7 +1469,8 @@ static int parse_want(struct packet_writer *writer, const char *line, "expected to get oid, not '%s'", line); o = parse_object_with_flags(the_repository, &oid, - PARSE_OBJECT_SKIP_HASH_CHECK); + PARSE_OBJECT_SKIP_HASH_CHECK | + PARSE_OBJECT_DISCARD_TREE); if (!o) { packet_writer_error(writer, -- cgit v0.10.2-6-g49f6