/* * diff-delta.c: generate a delta between two buffers * * Many parts of this file have been lifted from LibXDiff version 0.10. * http://www.xmailserver.org/xdiff-lib.html * * LibXDiff was written by Davide Libenzi * Copyright (C) 2003 Davide Libenzi * * Many mods for GIT usage by Nicolas Pitre , (C) 2005. * * This file is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * Use of this within git automatically means that the LGPL * licensing gets turned into GPLv2 within this project. */ #include #include #include #include "delta.h" /* block size: min = 16, max = 64k, power of 2 */ #define BLK_SIZE 16 #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define GR_PRIME 0x9e370001 #define HASH(v, shift) (((unsigned int)(v) * GR_PRIME) >> (shift)) struct index { const unsigned char *ptr; unsigned int val; struct index *next; }; static struct index ** delta_index(const unsigned char *buf, unsigned long bufsize, unsigned long trg_bufsize, unsigned int *hash_shift) { unsigned int i, hsize, hshift, hlimit, entries, *hash_count; const unsigned char *data; struct index *entry, **hash; void *mem; /* determine index hash size */ entries = bufsize / BLK_SIZE; hsize = entries / 4; for (i = 4; (1 << i) < hsize && i < 31; i++); hsize = 1 << i; hshift = 32 - i; *hash_shift = hshift; /* allocate lookup index */ mem = malloc(hsize * sizeof(*hash) + entries * sizeof(*entry)); if (!mem) return NULL; hash = mem; entry = mem + hsize * sizeof(*hash); memset(hash, 0, hsize * sizeof(*hash)); /* allocate an array to count hash entries */ hash_count = calloc(hsize, sizeof(*hash_count)); if (!hash_count) { free(hash); return NULL; } /* then populate the index */ data = buf + entries * BLK_SIZE - BLK_SIZE; while (data >= buf) { unsigned int val = adler32(0, data, BLK_SIZE); i = HASH(val, hshift); entry->ptr = data; entry->val = val; entry->next = hash[i]; hash[i] = entry++; hash_count[i]++; data -= BLK_SIZE; } /* * Determine a limit on the number of entries in the same hash * bucket. This guard us against patological data sets causing * really bad hash distribution with most entries in the same hash * bucket that would bring us to O(m*n) computing costs (m and n * corresponding to reference and target buffer sizes). * * The more the target buffer is large, the more it is important to * have small entry lists for each hash buckets. With such a limit * the cost is bounded to something more like O(m+n). */ hlimit = (1 << 26) / trg_bufsize; if (hlimit < 4*BLK_SIZE) hlimit = 4*BLK_SIZE; /* * Now make sure none of the hash buckets has more entries than * we're willing to test. Otherwise we cull the entry list * uniformly to still preserve a good repartition across * the reference buffer. */ for (i = 0; i < hsize; i++) { if (hash_count[i] < hlimit) continue; entry = hash[i]; do { struct index *keep = entry; int skip = hash_count[i] / hlimit / 2; do { entry = entry->next; } while(--skip && entry); keep->next = entry; } while(entry); } free(hash_count); return hash; } /* provide the size of the copy opcode given the block offset and size */ #define COPYOP_SIZE(o, s) \ (!!(o & 0xff) + !!(o & 0xff00) + !!(o & 0xff0000) + !!(o & 0xff000000) + \ !!(s & 0xff) + !!(s & 0xff00) + 1) /* the maximum size for any opcode */ #define MAX_OP_SIZE COPYOP_SIZE(0xffffffff, 0xffffffff) void *diff_delta(void *from_buf, unsigned long from_size, void *to_buf, unsigned long to_size, unsigned long *delta_size, unsigned long max_size) { unsigned int i, outpos, outsize, hash_shift; int inscnt; const unsigned char *ref_data, *ref_top, *data, *top; unsigned char *out; struct index *entry, **hash; if (!from_size || !to_size) return NULL; hash = delta_index(from_buf, from_size, to_size, &hash_shift); if (!hash) return NULL; outpos = 0; outsize = 8192; if (max_size && outsize >= max_size) outsize = max_size + MAX_OP_SIZE + 1; out = malloc(outsize); if (!out) { free(hash); return NULL; } ref_data = from_buf; ref_top = from_buf + from_size; data = to_buf; top = to_buf + to_size; /* store reference buffer size */ out[outpos++] = from_size; from_size >>= 7; while (from_size) { out[outpos - 1] |= 0x80; out[outpos++] = from_size; from_size >>= 7; } /* store target buffer size */ out[outpos++] = to_size; to_size >>= 7; while (to_size) { out[outpos - 1] |= 0x80; out[outpos++] = to_size; to_size >>= 7; } inscnt = 0; while (data < top) { unsigned int moff = 0, msize = 0; if (data + BLK_SIZE <= top) { unsigned int val = adler32(0, data, BLK_SIZE); i = HASH(val, hash_shift); for (entry = hash[i]; entry; entry = entry->next) { const unsigned char *ref = entry->ptr; const unsigned char *src = data; unsigned int ref_size = ref_top - ref; if (entry->val != val) continue; if (ref_size > top - src) ref_size = top - src; if (ref_size > 0x10000) ref_size = 0x10000; if (ref_size <= msize) break; while (ref_size-- && *src++ == *ref) ref++; if (msize < ref - entry->ptr) { /* this is our best match so far */ msize = ref - entry->ptr; moff = entry->ptr - ref_data; } } } if (!msize || msize < COPYOP_SIZE(moff, msize)) { if (!inscnt) outpos++; out[outpos++] = *data++; inscnt++; if (inscnt == 0x7f) { out[outpos - inscnt - 1] = inscnt; inscnt = 0; } } else { unsigned char *op; if (inscnt) { while (moff && ref_data[moff-1] == data[-1]) { if (msize == 0x10000) break; /* we can match one byte back */ msize++; moff--; data--; outpos--; if (--inscnt) continue; outpos--; /* remove count slot */ inscnt--; /* make it -1 */ break; } out[outpos - inscnt - 1] = inscnt; inscnt = 0; } data += msize; op = out + outpos++; i = 0x80; if (moff & 0xff) { out[outpos++] = moff; i |= 0x01; } moff >>= 8; if (moff & 0xff) { out[outpos++] = moff; i |= 0x02; } moff >>= 8; if (moff & 0xff) { out[outpos++] = moff; i |= 0x04; } moff >>= 8; if (moff & 0xff) { out[outpos++] = moff; i |= 0x08; } if (msize & 0xff) { out[outpos++] = msize; i |= 0x10; } msize >>= 8; if (msize & 0xff) { out[outpos++] = msize; i |= 0x20; } *op = i; } if (outpos >= outsize - MAX_OP_SIZE) { void *tmp = out; outsize = outsize * 3 / 2; if (max_size && outsize >= max_size) outsize = max_size + MAX_OP_SIZE + 1; if (max_size && outpos > max_size) out = NULL; else out = realloc(out, outsize); if (!out) { free(tmp); free(hash); return NULL; } } } if (inscnt) out[outpos - inscnt - 1] = inscnt; free(hash); *delta_size = outpos; return out; }