summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJunio C Hamano <gitster@pobox.com>2014-07-21 18:18:44 (GMT)
committerJunio C Hamano <gitster@pobox.com>2014-07-21 18:18:44 (GMT)
commit9b1c2a3a8e625ea7f56e9ba3d3c0e31938faa738 (patch)
treeaa21c9fad684c6f6e16d4a1c69a6c6bd70dd54c3
parent0ac744305f5885207bf96fc5488fcc59782ffa96 (diff)
parent7b64d42d22206d9995a8f0cb3b515e623cac4702 (diff)
downloadgit-9b1c2a3a8e625ea7f56e9ba3d3c0e31938faa738.zip
git-9b1c2a3a8e625ea7f56e9ba3d3c0e31938faa738.tar.gz
git-9b1c2a3a8e625ea7f56e9ba3d3c0e31938faa738.tar.bz2
Merge branch 'kb/hashmap-updates'
* kb/hashmap-updates: hashmap: add string interning API hashmap: add simplified hashmap_get_from_hash() API hashmap: improve struct hashmap member documentation hashmap: factor out getting a hash code from a SHA1
-rw-r--r--Documentation/technical/api-hashmap.txt54
-rw-r--r--builtin/describe.c13
-rw-r--r--decorate.c5
-rw-r--r--diffcore-rename.c11
-rw-r--r--hashmap.c38
-rw-r--r--hashmap.h27
-rw-r--r--khash.h11
-rw-r--r--name-hash.c5
-rw-r--r--object.c13
-rw-r--r--pack-objects.c5
-rwxr-xr-xt/t0011-hashmap.sh13
-rw-r--r--test-hashmap.c25
12 files changed, 159 insertions, 61 deletions
diff --git a/Documentation/technical/api-hashmap.txt b/Documentation/technical/api-hashmap.txt
index b977ae8..ad7a5bd 100644
--- a/Documentation/technical/api-hashmap.txt
+++ b/Documentation/technical/api-hashmap.txt
@@ -8,11 +8,19 @@ Data Structures
`struct hashmap`::
- The hash table structure.
+ The hash table structure. Members can be used as follows, but should
+ not be modified directly:
+
-The `size` member keeps track of the total number of entries. The `cmpfn`
-member is a function used to compare two entries for equality. The `table` and
-`tablesize` members store the hash table and its size, respectively.
+The `size` member keeps track of the total number of entries (0 means the
+hashmap is empty).
++
+`tablesize` is the allocated size of the hash table. A non-0 value indicates
+that the hashmap is initialized. It may also be useful for statistical purposes
+(i.e. `size / tablesize` is the current load factor).
++
+`cmpfn` stores the comparison function specified in `hashmap_init()`. In
+advanced scenarios, it may be useful to change this, e.g. to switch between
+case-sensitive and case-insensitive lookup.
`struct hashmap_entry`::
@@ -58,6 +66,15 @@ Functions
+
`strihash` and `memihash` are case insensitive versions.
+`unsigned int sha1hash(const unsigned char *sha1)`::
+
+ Converts a cryptographic hash (e.g. SHA-1) into an int-sized hash code
+ for use in hash tables. Cryptographic hashes are supposed to have
+ uniform distribution, so in contrast to `memhash()`, this just copies
+ the first `sizeof(int)` bytes without shuffling any bits. Note that
+ the results will be different on big-endian and little-endian
+ platforms, so they should not be stored or transferred over the net.
+
`void hashmap_init(struct hashmap *map, hashmap_cmp_fn equals_function, size_t initial_size)`::
Initializes a hashmap structure.
@@ -101,6 +118,20 @@ hashmap_entry) that has at least been initialized with the proper hash code
If an entry with matching hash code is found, `key` and `keydata` are passed
to `hashmap_cmp_fn` to decide whether the entry matches the key.
+`void *hashmap_get_from_hash(const struct hashmap *map, unsigned int hash, const void *keydata)`::
+
+ Returns the hashmap entry for the specified hash code and key data,
+ or NULL if not found.
++
+`map` is the hashmap structure.
++
+`hash` is the hash code of the entry to look up.
++
+If an entry with matching hash code is found, `keydata` is passed to
+`hashmap_cmp_fn` to decide whether the entry matches the key. The
+`entry_or_key` parameter points to a bogus hashmap_entry structure that
+should not be used in the comparison.
+
`void *hashmap_get_next(const struct hashmap *map, const void *entry)`::
Returns the next equal hashmap entry, or NULL if not found. This can be
@@ -162,6 +193,21 @@ more entries.
`hashmap_iter_first` is a combination of both (i.e. initializes the iterator
and returns the first entry, if any).
+`const char *strintern(const char *string)`::
+`const void *memintern(const void *data, size_t len)`::
+
+ Returns the unique, interned version of the specified string or data,
+ similar to the `String.intern` API in Java and .NET, respectively.
+ Interned strings remain valid for the entire lifetime of the process.
++
+Can be used as `[x]strdup()` or `xmemdupz` replacement, except that interned
+strings / data must not be modified or freed.
++
+Interned strings are best used for short strings with high probability of
+duplicates.
++
+Uses a hashmap to store the pool of interned strings.
+
Usage example
-------------
diff --git a/builtin/describe.c b/builtin/describe.c
index 24d740c..ee6a3b9 100644
--- a/builtin/describe.c
+++ b/builtin/describe.c
@@ -56,18 +56,9 @@ static int commit_name_cmp(const struct commit_name *cn1,
return hashcmp(cn1->peeled, peeled ? peeled : cn2->peeled);
}
-static inline unsigned int hash_sha1(const unsigned char *sha1)
-{
- unsigned int hash;
- memcpy(&hash, sha1, sizeof(hash));
- return hash;
-}
-
static inline struct commit_name *find_commit_name(const unsigned char *peeled)
{
- struct commit_name key;
- hashmap_entry_init(&key, hash_sha1(peeled));
- return hashmap_get(&names, &key, peeled);
+ return hashmap_get_from_hash(&names, sha1hash(peeled), peeled);
}
static int replace_name(struct commit_name *e,
@@ -114,7 +105,7 @@ static void add_to_known_names(const char *path,
if (!e) {
e = xmalloc(sizeof(struct commit_name));
hashcpy(e->peeled, peeled);
- hashmap_entry_init(e, hash_sha1(peeled));
+ hashmap_entry_init(e, sha1hash(peeled));
hashmap_add(&names, e);
e->path = NULL;
}
diff --git a/decorate.c b/decorate.c
index 7cb5d29..b2aac90 100644
--- a/decorate.c
+++ b/decorate.c
@@ -8,10 +8,7 @@
static unsigned int hash_obj(const struct object *obj, unsigned int n)
{
- unsigned int hash;
-
- memcpy(&hash, obj->sha1, sizeof(unsigned int));
- return hash % n;
+ return sha1hash(obj->sha1) % n;
}
static void *insert_decoration(struct decoration *n, const struct object *base, void *decoration)
diff --git a/diffcore-rename.c b/diffcore-rename.c
index 749a35d..2e44a37 100644
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
@@ -242,14 +242,12 @@ struct file_similarity {
static unsigned int hash_filespec(struct diff_filespec *filespec)
{
- unsigned int hash;
if (!filespec->sha1_valid) {
if (diff_populate_filespec(filespec, 0))
return 0;
hash_sha1_file(filespec->data, filespec->size, "blob", filespec->sha1);
}
- memcpy(&hash, filespec->sha1, sizeof(hash));
- return hash;
+ return sha1hash(filespec->sha1);
}
static int find_identical_files(struct hashmap *srcs,
@@ -259,15 +257,14 @@ static int find_identical_files(struct hashmap *srcs,
int renames = 0;
struct diff_filespec *target = rename_dst[dst_index].two;
- struct file_similarity *p, *best, dst;
+ struct file_similarity *p, *best = NULL;
int i = 100, best_score = -1;
/*
* Find the best source match for specified destination.
*/
- best = NULL;
- hashmap_entry_init(&dst, hash_filespec(target));
- for (p = hashmap_get(srcs, &dst, NULL); p; p = hashmap_get_next(srcs, p)) {
+ p = hashmap_get_from_hash(srcs, hash_filespec(target), NULL);
+ for (; p; p = hashmap_get_next(srcs, p)) {
int score;
struct diff_filespec *source = p->filespec;
diff --git a/hashmap.c b/hashmap.c
index d1b8056..f693839 100644
--- a/hashmap.c
+++ b/hashmap.c
@@ -226,3 +226,41 @@ void *hashmap_iter_next(struct hashmap_iter *iter)
current = iter->map->table[iter->tablepos++];
}
}
+
+struct pool_entry {
+ struct hashmap_entry ent;
+ size_t len;
+ unsigned char data[FLEX_ARRAY];
+};
+
+static int pool_entry_cmp(const struct pool_entry *e1,
+ const struct pool_entry *e2,
+ const unsigned char *keydata)
+{
+ return e1->data != keydata &&
+ (e1->len != e2->len || memcmp(e1->data, keydata, e1->len));
+}
+
+const void *memintern(const void *data, size_t len)
+{
+ static struct hashmap map;
+ struct pool_entry key, *e;
+
+ /* initialize string pool hashmap */
+ if (!map.tablesize)
+ hashmap_init(&map, (hashmap_cmp_fn) pool_entry_cmp, 0);
+
+ /* lookup interned string in pool */
+ hashmap_entry_init(&key, memhash(data, len));
+ key.len = len;
+ e = hashmap_get(&map, &key, data);
+ if (!e) {
+ /* not found: create it */
+ e = xmallocz(sizeof(struct pool_entry) + len);
+ hashmap_entry_init(e, key.ent.hash);
+ e->len = len;
+ memcpy(e->data, data, len);
+ hashmap_add(&map, e);
+ }
+ return e->data;
+}
diff --git a/hashmap.h b/hashmap.h
index a816ad4..ab7958a 100644
--- a/hashmap.h
+++ b/hashmap.h
@@ -13,6 +13,17 @@ extern unsigned int strihash(const char *buf);
extern unsigned int memhash(const void *buf, size_t len);
extern unsigned int memihash(const void *buf, size_t len);
+static inline unsigned int sha1hash(const unsigned char *sha1)
+{
+ /*
+ * Equivalent to 'return *(unsigned int *)sha1;', but safe on
+ * platforms that don't support unaligned reads.
+ */
+ unsigned int hash;
+ memcpy(&hash, sha1, sizeof(hash));
+ return hash;
+}
+
/* data structures */
struct hashmap_entry {
@@ -57,6 +68,14 @@ extern void *hashmap_put(struct hashmap *map, void *entry);
extern void *hashmap_remove(struct hashmap *map, const void *key,
const void *keydata);
+static inline void *hashmap_get_from_hash(const struct hashmap *map,
+ unsigned int hash, const void *keydata)
+{
+ struct hashmap_entry key;
+ hashmap_entry_init(&key, hash);
+ return hashmap_get(map, &key, keydata);
+}
+
/* hashmap_iter functions */
extern void hashmap_iter_init(struct hashmap *map, struct hashmap_iter *iter);
@@ -68,4 +87,12 @@ static inline void *hashmap_iter_first(struct hashmap *map,
return hashmap_iter_next(iter);
}
+/* string interning */
+
+extern const void *memintern(const void *data, size_t len);
+static inline const char *strintern(const char *string)
+{
+ return memintern(string, strlen(string));
+}
+
#endif
diff --git a/khash.h b/khash.h
index 57ff603..06c7906 100644
--- a/khash.h
+++ b/khash.h
@@ -320,19 +320,12 @@ static const double __ac_HASH_UPPER = 0.77;
code; \
} }
-static inline khint_t __kh_oid_hash(const unsigned char *oid)
-{
- khint_t hash;
- memcpy(&hash, oid, sizeof(hash));
- return hash;
-}
-
#define __kh_oid_cmp(a, b) (hashcmp(a, b) == 0)
-KHASH_INIT(sha1, const unsigned char *, void *, 1, __kh_oid_hash, __kh_oid_cmp)
+KHASH_INIT(sha1, const unsigned char *, void *, 1, sha1hash, __kh_oid_cmp)
typedef kh_sha1_t khash_sha1;
-KHASH_INIT(sha1_pos, const unsigned char *, int, 1, __kh_oid_hash, __kh_oid_cmp)
+KHASH_INIT(sha1_pos, const unsigned char *, int, 1, sha1hash, __kh_oid_cmp)
typedef kh_sha1_pos_t khash_sha1_pos;
#endif /* __AC_KHASH_H */
diff --git a/name-hash.c b/name-hash.c
index 49fd508..702cd05 100644
--- a/name-hash.c
+++ b/name-hash.c
@@ -213,12 +213,11 @@ struct cache_entry *index_dir_exists(struct index_state *istate, const char *nam
struct cache_entry *index_file_exists(struct index_state *istate, const char *name, int namelen, int icase)
{
struct cache_entry *ce;
- struct hashmap_entry key;
lazy_init_name_hash(istate);
- hashmap_entry_init(&key, memihash(name, namelen));
- ce = hashmap_get(&istate->name_hash, &key, NULL);
+ ce = hashmap_get_from_hash(&istate->name_hash,
+ memihash(name, namelen), NULL);
while (ce) {
if (same_name(ce, name, namelen, icase))
return ce;
diff --git a/object.c b/object.c
index 9c31e9a..91c7c86 100644
--- a/object.c
+++ b/object.c
@@ -50,18 +50,7 @@ int type_from_string(const char *str)
*/
static unsigned int hash_obj(const unsigned char *sha1, unsigned int n)
{
- unsigned int hash;
-
- /*
- * Since the sha1 is essentially random, we just take the
- * required number of bits directly from the first
- * sizeof(unsigned int) bytes of sha1. First we have to copy
- * the bytes into a properly aligned integer. If we cared
- * about getting consistent results across architectures, we
- * would have to call ntohl() here, too.
- */
- memcpy(&hash, sha1, sizeof(unsigned int));
- return hash & (n - 1);
+ return sha1hash(sha1) & (n - 1);
}
/*
diff --git a/pack-objects.c b/pack-objects.c
index 4f36c32..9992f3e 100644
--- a/pack-objects.c
+++ b/pack-objects.c
@@ -7,10 +7,9 @@ static uint32_t locate_object_entry_hash(struct packing_data *pdata,
const unsigned char *sha1,
int *found)
{
- uint32_t i, hash, mask = (pdata->index_size - 1);
+ uint32_t i, mask = (pdata->index_size - 1);
- memcpy(&hash, sha1, sizeof(uint32_t));
- i = hash & mask;
+ i = sha1hash(sha1) & mask;
while (pdata->index[i] > 0) {
uint32_t pos = pdata->index[i] - 1;
diff --git a/t/t0011-hashmap.sh b/t/t0011-hashmap.sh
index 391e2b6..f97c805 100755
--- a/t/t0011-hashmap.sh
+++ b/t/t0011-hashmap.sh
@@ -237,4 +237,17 @@ test_expect_success 'grow / shrink' '
'
+test_expect_success 'string interning' '
+
+test_hashmap "intern value1
+intern Value1
+intern value2
+intern value2
+" "value1
+Value1
+value2
+value2"
+
+'
+
test_done
diff --git a/test-hashmap.c b/test-hashmap.c
index f5183fb..07aa7ec 100644
--- a/test-hashmap.c
+++ b/test-hashmap.c
@@ -115,9 +115,8 @@ static void perf_hashmap(unsigned int method, unsigned int rounds)
for (j = 0; j < rounds; j++) {
for (i = 0; i < TEST_SIZE; i++) {
- struct hashmap_entry key;
- hashmap_entry_init(&key, hashes[i]);
- hashmap_get(&map, &key, entries[i]->key);
+ hashmap_get_from_hash(&map, hashes[i],
+ entries[i]->key);
}
}
@@ -199,12 +198,8 @@ int main(int argc, char *argv[])
} else if (!strcmp("get", cmd) && l1) {
- /* setup static key */
- struct hashmap_entry key;
- hashmap_entry_init(&key, hash);
-
/* lookup entry in hashmap */
- entry = hashmap_get(&map, &key, p1);
+ entry = hashmap_get_from_hash(&map, hash, p1);
/* print result */
if (!entry)
@@ -239,6 +234,20 @@ int main(int argc, char *argv[])
/* print table sizes */
printf("%u %u\n", map.tablesize, map.size);
+ } else if (!strcmp("intern", cmd) && l1) {
+
+ /* test that strintern works */
+ const char *i1 = strintern(p1);
+ const char *i2 = strintern(p1);
+ if (strcmp(i1, p1))
+ printf("strintern(%s) returns %s\n", p1, i1);
+ else if (i1 == p1)
+ printf("strintern(%s) returns input pointer\n", p1);
+ else if (i1 != i2)
+ printf("strintern(%s) != strintern(%s)", i1, i2);
+ else
+ printf("%s\n", i1);
+
} else if (!strcmp("perfhashmap", cmd) && l1 && l2) {
perf_hashmap(atoi(p1), atoi(p2));