From 93b709c79eea6231b7d75a6817245a416b4f8fb5 Mon Sep 17 00:00:00 2001 From: Jonathan Nieder Date: Sun, 10 Oct 2010 21:46:24 -0500 Subject: vcs-svn: improve support for reading large files Move from uint32_t to off_t as the fundamental unit of length used by the line_buffer library. Performance would get worse if anything but I think it's worth it for support of deltas that need to skip large pieces (> 4 GiB). Exception: buffer_read_string still takes a uint32_t, since it keeps its result in an in-core obj_pool. Callers still have to be updated to take advantage of this. Signed-off-by: Jonathan Nieder Signed-off-by: David Barr Signed-off-by: Jonathan Nieder diff --git a/vcs-svn/line_buffer.c b/vcs-svn/line_buffer.c index eb8a6a7..747de07 100644 --- a/vcs-svn/line_buffer.c +++ b/vcs-svn/line_buffer.c @@ -104,7 +104,7 @@ void buffer_read_binary(struct line_buffer *buf, strbuf_fread(sb, size, buf->infile); } -void buffer_copy_bytes(struct line_buffer *buf, uint32_t len) +void buffer_copy_bytes(struct line_buffer *buf, off_t len) { char byte_buffer[COPY_BUFFER_LEN]; uint32_t in; @@ -120,7 +120,7 @@ void buffer_copy_bytes(struct line_buffer *buf, uint32_t len) } } -void buffer_skip_bytes(struct line_buffer *buf, uint32_t len) +void buffer_skip_bytes(struct line_buffer *buf, off_t len) { char byte_buffer[COPY_BUFFER_LEN]; uint32_t in; diff --git a/vcs-svn/line_buffer.h b/vcs-svn/line_buffer.h index 3c9629e..a090dd6 100644 --- a/vcs-svn/line_buffer.h +++ b/vcs-svn/line_buffer.h @@ -26,7 +26,7 @@ char *buffer_read_line(struct line_buffer *buf); char *buffer_read_string(struct line_buffer *buf, uint32_t len); int buffer_read_char(struct line_buffer *buf); void buffer_read_binary(struct line_buffer *buf, struct strbuf *sb, uint32_t len); -void buffer_copy_bytes(struct line_buffer *buf, uint32_t len); -void buffer_skip_bytes(struct line_buffer *buf, uint32_t len); +void buffer_copy_bytes(struct line_buffer *buf, off_t len); +void buffer_skip_bytes(struct line_buffer *buf, off_t len); #endif -- cgit v0.10.2-6-g49f6 From d234f54b2f82067699f36593188e687fc7dc321a Mon Sep 17 00:00:00 2001 From: Jonathan Nieder Date: Sun, 10 Oct 2010 21:44:21 -0500 Subject: vcs-svn: make buffer_skip_bytes return length read Currently there is no way to detect when input ended if it ended early during buffer_skip_bytes. Tell the calling program how many bytes were actually skipped for easier debugging. Existing callers will still ignore early EOF. Signed-off-by: Jonathan Nieder Signed-off-by: David Barr Signed-off-by: Jonathan Nieder diff --git a/vcs-svn/line_buffer.c b/vcs-svn/line_buffer.c index 747de07..39d52b8 100644 --- a/vcs-svn/line_buffer.c +++ b/vcs-svn/line_buffer.c @@ -120,15 +120,16 @@ void buffer_copy_bytes(struct line_buffer *buf, off_t len) } } -void buffer_skip_bytes(struct line_buffer *buf, off_t len) +off_t buffer_skip_bytes(struct line_buffer *buf, off_t nbytes) { char byte_buffer[COPY_BUFFER_LEN]; - uint32_t in; - while (len > 0 && !feof(buf->infile) && !ferror(buf->infile)) { - in = len < COPY_BUFFER_LEN ? len : COPY_BUFFER_LEN; - in = fread(byte_buffer, 1, in, buf->infile); - len -= in; + off_t done = 0; + while (done < nbytes && !feof(buf->infile) && !ferror(buf->infile)) { + off_t len = nbytes - done; + size_t in = len < COPY_BUFFER_LEN ? len : COPY_BUFFER_LEN; + done += fread(byte_buffer, 1, in, buf->infile); } + return done; } void buffer_reset(struct line_buffer *buf) diff --git a/vcs-svn/line_buffer.h b/vcs-svn/line_buffer.h index a090dd6..7d10f9c 100644 --- a/vcs-svn/line_buffer.h +++ b/vcs-svn/line_buffer.h @@ -27,6 +27,6 @@ char *buffer_read_string(struct line_buffer *buf, uint32_t len); int buffer_read_char(struct line_buffer *buf); void buffer_read_binary(struct line_buffer *buf, struct strbuf *sb, uint32_t len); void buffer_copy_bytes(struct line_buffer *buf, off_t len); -void buffer_skip_bytes(struct line_buffer *buf, off_t len); +off_t buffer_skip_bytes(struct line_buffer *buf, off_t len); #endif diff --git a/vcs-svn/line_buffer.txt b/vcs-svn/line_buffer.txt index e89cc41..4ef0755 100644 --- a/vcs-svn/line_buffer.txt +++ b/vcs-svn/line_buffer.txt @@ -76,7 +76,8 @@ Functions `buffer_skip_bytes`:: Discards `len` bytes from the input stream (stopping early - if necessary because of an error or eof). + if necessary because of an error or eof). Return value is + the number of bytes successfully read. `buffer_reset`:: Deallocates non-static buffers. -- cgit v0.10.2-6-g49f6 From 26557fc1b37480d184a32de025b060aa1aa231db Mon Sep 17 00:00:00 2001 From: Jonathan Nieder Date: Tue, 28 Dec 2010 04:26:17 -0600 Subject: vcs-svn: make buffer_copy_bytes return length read Currently buffer_copy_bytes does not report to its caller whether it encountered an early end of file. Add a return value representing the number of bytes read (but not the number of bytes copied). This way all three unusual conditions can be distinguished: input error with buffer_ferror, output error with ferror(outfile), early end of input by checking the return value. Signed-off-by: Jonathan Nieder Signed-off-by: David Barr Signed-off-by: Jonathan Nieder diff --git a/vcs-svn/line_buffer.c b/vcs-svn/line_buffer.c index 39d52b8..33e733a 100644 --- a/vcs-svn/line_buffer.c +++ b/vcs-svn/line_buffer.c @@ -104,20 +104,20 @@ void buffer_read_binary(struct line_buffer *buf, strbuf_fread(sb, size, buf->infile); } -void buffer_copy_bytes(struct line_buffer *buf, off_t len) +off_t buffer_copy_bytes(struct line_buffer *buf, off_t nbytes) { char byte_buffer[COPY_BUFFER_LEN]; - uint32_t in; - while (len > 0 && !feof(buf->infile) && !ferror(buf->infile)) { - in = len < COPY_BUFFER_LEN ? len : COPY_BUFFER_LEN; + off_t done = 0; + while (done < nbytes && !feof(buf->infile) && !ferror(buf->infile)) { + off_t len = nbytes - done; + size_t in = len < COPY_BUFFER_LEN ? len : COPY_BUFFER_LEN; in = fread(byte_buffer, 1, in, buf->infile); - len -= in; + done += in; fwrite(byte_buffer, 1, in, stdout); - if (ferror(stdout)) { - buffer_skip_bytes(buf, len); - return; - } + if (ferror(stdout)) + return done + buffer_skip_bytes(buf, nbytes - done); } + return done; } off_t buffer_skip_bytes(struct line_buffer *buf, off_t nbytes) diff --git a/vcs-svn/line_buffer.h b/vcs-svn/line_buffer.h index 7d10f9c..f5c468a 100644 --- a/vcs-svn/line_buffer.h +++ b/vcs-svn/line_buffer.h @@ -26,7 +26,8 @@ char *buffer_read_line(struct line_buffer *buf); char *buffer_read_string(struct line_buffer *buf, uint32_t len); int buffer_read_char(struct line_buffer *buf); void buffer_read_binary(struct line_buffer *buf, struct strbuf *sb, uint32_t len); -void buffer_copy_bytes(struct line_buffer *buf, off_t len); +/* Returns number of bytes read (not necessarily written). */ +off_t buffer_copy_bytes(struct line_buffer *buf, off_t len); off_t buffer_skip_bytes(struct line_buffer *buf, off_t len); #endif -- cgit v0.10.2-6-g49f6 From c9d1c8ba059577e64fb2213cb0c5f3c4619c7519 Mon Sep 17 00:00:00 2001 From: Jonathan Nieder Date: Tue, 28 Dec 2010 04:30:54 -0600 Subject: vcs-svn: improve reporting of input errors Catch input errors and exit early enough to print a reasonable diagnosis based on errno. Signed-off-by: Jonathan Nieder Signed-off-by: David Barr Signed-off-by: Jonathan Nieder diff --git a/vcs-svn/fast_export.c b/vcs-svn/fast_export.c index 260cf50..07a8353 100644 --- a/vcs-svn/fast_export.c +++ b/vcs-svn/fast_export.c @@ -63,14 +63,23 @@ void fast_export_commit(uint32_t revision, uint32_t author, char *log, printf("progress Imported commit %"PRIu32".\n\n", revision); } +static void die_short_read(struct line_buffer *input) +{ + if (buffer_ferror(input)) + die_errno("error reading dump file"); + die("invalid dump: unexpected end of file"); +} + void fast_export_blob(uint32_t mode, uint32_t mark, uint32_t len, struct line_buffer *input) { if (mode == REPO_MODE_LNK) { /* svn symlink blobs start with "link " */ - buffer_skip_bytes(input, 5); len -= 5; + if (buffer_skip_bytes(input, 5) != 5) + die_short_read(input); } printf("blob\nmark :%"PRIu32"\ndata %"PRIu32"\n", mark, len); - buffer_copy_bytes(input, len); + if (buffer_copy_bytes(input, len) != len) + die_short_read(input); fputc('\n', stdout); } diff --git a/vcs-svn/svndump.c b/vcs-svn/svndump.c index e6d84ba..15f822e 100644 --- a/vcs-svn/svndump.c +++ b/vcs-svn/svndump.c @@ -149,6 +149,13 @@ static void handle_property(uint32_t key, const char *val, uint32_t len, } } +static void die_short_read(void) +{ + if (buffer_ferror(&input)) + die_errno("error reading dump file"); + die("invalid dump: unexpected end of file"); +} + static void read_props(void) { uint32_t key = ~0; @@ -170,12 +177,21 @@ static void read_props(void) uint32_t len; const char *val; const char type = t[0]; + int ch; if (!type || t[1] != ' ') die("invalid property line: %s\n", t); len = atoi(&t[2]); val = buffer_read_string(&input, len); - buffer_skip_bytes(&input, 1); /* Discard trailing newline. */ + if (!val || strlen(val) != len) + die_short_read(); + + /* Discard trailing newline. */ + ch = buffer_read_char(&input); + if (ch == EOF) + die_short_read(); + if (ch != '\n') + die("invalid dump: expected newline after %s", val); switch (type) { case 'K': @@ -344,7 +360,11 @@ void svndump_read(const char *url) node_ctx.prop_delta = !strcmp(val, "true"); } else if (key == keys.content_length) { len = atoi(val); - buffer_read_line(&input); + t = buffer_read_line(&input); + if (!t) + die_short_read(); + if (*t) + die("invalid dump: expected blank line after content length header"); if (active_ctx == REV_CTX) { read_props(); } else if (active_ctx == NODE_CTX) { @@ -352,10 +372,13 @@ void svndump_read(const char *url) active_ctx = REV_CTX; } else { fprintf(stderr, "Unexpected content length header: %"PRIu32"\n", len); - buffer_skip_bytes(&input, len); + if (buffer_skip_bytes(&input, len) != len) + die_short_read(); } } } + if (buffer_ferror(&input)) + die_short_read(); if (active_ctx == NODE_CTX) handle_node(); if (active_ctx != DUMP_CTX) -- cgit v0.10.2-6-g49f6 From 044ad2906a5e4b805bc8c8d121466d8ff94ecbfb Mon Sep 17 00:00:00 2001 From: David Barr Date: Mon, 13 Dec 2010 19:13:24 +1100 Subject: vcs-svn: implement perfect hash for node-prop keys Instead of interning property names and comparing their string_pool keys, look them up in a table by string length, which should be about as fast. This is a small step towards removing dependence on string_pool. Signed-off-by: David Barr Signed-off-by: Jonathan Nieder diff --git a/vcs-svn/svndump.c b/vcs-svn/svndump.c index 15f822e..322d1cd 100644 --- a/vcs-svn/svndump.c +++ b/vcs-svn/svndump.c @@ -14,6 +14,12 @@ #include "obj_pool.h" #include "string_pool.h" +/* + * Compare start of string to literal of equal length; + * must be guarded by length test. + */ +#define constcmp(s, ref) memcmp(s, ref, sizeof(ref) - 1) + #define NODEACT_REPLACE 4 #define NODEACT_DELETE 3 #define NODEACT_ADD 2 @@ -58,8 +64,7 @@ static struct { } dump_ctx; static struct { - uint32_t svn_log, svn_author, svn_date, svn_executable, svn_special, uuid, - revision_number, node_path, node_kind, node_action, + uint32_t uuid, revision_number, node_path, node_kind, node_action, node_copyfrom_path, node_copyfrom_rev, text_content_length, prop_content_length, content_length, svn_fs_dump_format_version, /* version 3 format */ @@ -96,11 +101,6 @@ static void reset_dump_ctx(uint32_t url) static void init_keys(void) { - keys.svn_log = pool_intern("svn:log"); - keys.svn_author = pool_intern("svn:author"); - keys.svn_date = pool_intern("svn:date"); - keys.svn_executable = pool_intern("svn:executable"); - keys.svn_special = pool_intern("svn:special"); keys.uuid = pool_intern("UUID"); keys.revision_number = pool_intern("Revision-number"); keys.node_path = pool_intern("Node-path"); @@ -117,22 +117,43 @@ static void init_keys(void) keys.prop_delta = pool_intern("Prop-delta"); } -static void handle_property(uint32_t key, const char *val, uint32_t len, +static void handle_property(const struct strbuf *key_buf, + const char *val, uint32_t len, uint32_t *type_set) { - if (key == keys.svn_log) { + const char *key = key_buf->buf; + size_t keylen = key_buf->len; + + switch (keylen + 1) { + case sizeof("svn:log"): + if (constcmp(key, "svn:log")) + break; if (!val) die("invalid dump: unsets svn:log"); /* Value length excludes terminating nul. */ rev_ctx.log = log_copy(len + 1, val); - } else if (key == keys.svn_author) { + break; + case sizeof("svn:author"): + if (constcmp(key, "svn:author")) + break; rev_ctx.author = pool_intern(val); - } else if (key == keys.svn_date) { + break; + case sizeof("svn:date"): + if (constcmp(key, "svn:date")) + break; if (!val) die("invalid dump: unsets svn:date"); if (parse_date_basic(val, &rev_ctx.timestamp, NULL)) warning("invalid timestamp: %s", val); - } else if (key == keys.svn_executable || key == keys.svn_special) { + break; + case sizeof("svn:executable"): + case sizeof("svn:special"): + if (keylen == strlen("svn:executable") && + constcmp(key, "svn:executable")) + break; + if (keylen == strlen("svn:special") && + constcmp(key, "svn:special")) + break; if (*type_set) { if (!val) return; @@ -143,7 +164,7 @@ static void handle_property(uint32_t key, const char *val, uint32_t len, return; } *type_set = 1; - node_ctx.type = key == keys.svn_executable ? + node_ctx.type = keylen == strlen("svn:executable") ? REPO_MODE_EXE : REPO_MODE_LNK; } @@ -158,7 +179,7 @@ static void die_short_read(void) static void read_props(void) { - uint32_t key = ~0; + static struct strbuf key = STRBUF_INIT; const char *t; /* * NEEDSWORK: to support simple mode changes like @@ -195,16 +216,19 @@ static void read_props(void) switch (type) { case 'K': - key = pool_intern(val); - continue; case 'D': - key = pool_intern(val); + strbuf_reset(&key); + if (val) + strbuf_add(&key, val, len); + if (type == 'K') + continue; + assert(type == 'D'); val = NULL; len = 0; /* fall through */ case 'V': - handle_property(key, val, len, &type_set); - key = ~0; + handle_property(&key, val, len, &type_set); + strbuf_reset(&key); continue; default: die("invalid property line: %s\n", t); -- cgit v0.10.2-6-g49f6 From 90c0a3cfe390208c86144bf97ec8fa5610febe0f Mon Sep 17 00:00:00 2001 From: David Barr Date: Mon, 13 Dec 2010 19:56:01 +1100 Subject: vcs-svn: implement perfect hash for top-level keys Instead of interning property names and comparing their string_pool keys, look them up in a table by string length, which should be about as fast. Another small step towards removing dependence on string_pool altogether. Signed-off-by: David Barr Signed-off-by: Jonathan Nieder diff --git a/vcs-svn/svndump.c b/vcs-svn/svndump.c index 322d1cd..77680a3 100644 --- a/vcs-svn/svndump.c +++ b/vcs-svn/svndump.c @@ -63,14 +63,6 @@ static struct { uint32_t version, uuid, url; } dump_ctx; -static struct { - uint32_t uuid, revision_number, node_path, node_kind, node_action, - node_copyfrom_path, node_copyfrom_rev, text_content_length, - prop_content_length, content_length, svn_fs_dump_format_version, - /* version 3 format */ - text_delta, prop_delta; -} keys; - static void reset_node_ctx(char *fname) { node_ctx.type = 0; @@ -99,24 +91,6 @@ static void reset_dump_ctx(uint32_t url) dump_ctx.uuid = ~0; } -static void init_keys(void) -{ - keys.uuid = pool_intern("UUID"); - keys.revision_number = pool_intern("Revision-number"); - keys.node_path = pool_intern("Node-path"); - keys.node_kind = pool_intern("Node-kind"); - keys.node_action = pool_intern("Node-action"); - keys.node_copyfrom_path = pool_intern("Node-copyfrom-path"); - keys.node_copyfrom_rev = pool_intern("Node-copyfrom-rev"); - keys.text_content_length = pool_intern("Text-content-length"); - keys.prop_content_length = pool_intern("Prop-content-length"); - keys.content_length = pool_intern("Content-length"); - keys.svn_fs_dump_format_version = pool_intern("SVN-fs-dump-format-version"); - /* version 3 format (Subversion 1.1.0) */ - keys.text_delta = pool_intern("Text-delta"); - keys.prop_delta = pool_intern("Prop-delta"); -} - static void handle_property(const struct strbuf *key_buf, const char *val, uint32_t len, uint32_t *type_set) @@ -320,44 +294,61 @@ void svndump_read(const char *url) char *t; uint32_t active_ctx = DUMP_CTX; uint32_t len; - uint32_t key; reset_dump_ctx(pool_intern(url)); while ((t = buffer_read_line(&input))) { val = strstr(t, ": "); if (!val) continue; - *val++ = '\0'; - *val++ = '\0'; - key = pool_intern(t); + val += 2; - if (key == keys.svn_fs_dump_format_version) { + /* strlen(key) + 1 */ + switch (val - t - 1) { + case sizeof("SVN-fs-dump-format-version"): + if (constcmp(t, "SVN-fs-dump-format-version")) + continue; dump_ctx.version = atoi(val); if (dump_ctx.version > 3) die("expected svn dump format version <= 3, found %"PRIu32, dump_ctx.version); - } else if (key == keys.uuid) { + break; + case sizeof("UUID"): + if (constcmp(t, "UUID")) + continue; dump_ctx.uuid = pool_intern(val); - } else if (key == keys.revision_number) { + break; + case sizeof("Revision-number"): + if (constcmp(t, "Revision-number")) + continue; if (active_ctx == NODE_CTX) handle_node(); if (active_ctx != DUMP_CTX) handle_revision(); active_ctx = REV_CTX; reset_rev_ctx(atoi(val)); - } else if (key == keys.node_path) { - if (active_ctx == NODE_CTX) - handle_node(); - active_ctx = NODE_CTX; - reset_node_ctx(val); - } else if (key == keys.node_kind) { + break; + case sizeof("Node-path"): + if (prefixcmp(t, "Node-")) + continue; + if (!constcmp(t + strlen("Node-"), "path")) { + if (active_ctx == NODE_CTX) + handle_node(); + active_ctx = NODE_CTX; + reset_node_ctx(val); + break; + } + if (constcmp(t + strlen("Node-"), "kind")) + continue; if (!strcmp(val, "dir")) node_ctx.type = REPO_MODE_DIR; else if (!strcmp(val, "file")) node_ctx.type = REPO_MODE_BLB; else fprintf(stderr, "Unknown node-kind: %s\n", val); - } else if (key == keys.node_action) { + break; + case sizeof("Node-action"): + if (constcmp(t, "Node-action")) + continue; if (!strcmp(val, "delete")) { node_ctx.action = NODEACT_DELETE; } else if (!strcmp(val, "add")) { @@ -370,19 +361,38 @@ void svndump_read(const char *url) fprintf(stderr, "Unknown node-action: %s\n", val); node_ctx.action = NODEACT_UNKNOWN; } - } else if (key == keys.node_copyfrom_path) { + break; + case sizeof("Node-copyfrom-path"): + if (constcmp(t, "Node-copyfrom-path")) + continue; pool_tok_seq(REPO_MAX_PATH_DEPTH, node_ctx.src, "/", val); - } else if (key == keys.node_copyfrom_rev) { + break; + case sizeof("Node-copyfrom-rev"): + if (constcmp(t, "Node-copyfrom-rev")) + continue; node_ctx.srcRev = atoi(val); - } else if (key == keys.text_content_length) { - node_ctx.textLength = atoi(val); - } else if (key == keys.prop_content_length) { + break; + case sizeof("Text-content-length"): + if (!constcmp(t, "Text-content-length")) { + node_ctx.textLength = atoi(val); + break; + } + if (constcmp(t, "Prop-content-length")) + continue; node_ctx.propLength = atoi(val); - } else if (key == keys.text_delta) { - node_ctx.text_delta = !strcmp(val, "true"); - } else if (key == keys.prop_delta) { + break; + case sizeof("Text-delta"): + if (!constcmp(t, "Text-delta")) { + node_ctx.text_delta = !strcmp(val, "true"); + break; + } + if (constcmp(t, "Prop-delta")) + continue; node_ctx.prop_delta = !strcmp(val, "true"); - } else if (key == keys.content_length) { + break; + case sizeof("Content-length"): + if (constcmp(t, "Content-length")) + continue; len = atoi(val); t = buffer_read_line(&input); if (!t) @@ -417,7 +427,6 @@ int svndump_init(const char *filename) reset_dump_ctx(~0); reset_rev_ctx(0); reset_node_ctx(NULL); - init_keys(); return 0; } -- cgit v0.10.2-6-g49f6