Eric Blake <ebb9@byu.net> writes: > > I got bored and attempted this. This implementation still passes the m4 > testsuite, although there might be corner cases not exercised by m4 (such as > shrinking the table on hash_delete) that I got wrong, so I'm posting it for > review now. In particular, I think I need to use xalloc_oversized in more > places.
In fact, the m4 testsuite never even triggered a rehash (so guess what I'm patching m4 to do today ;), but running autoconf on coreutils caused a coredump, due to a rather stupid bug in my placement of the failure label. Here's what I squashed to fix things: diff --git i/lib/hash.c w/lib/hash.c index f52c2ef..8c43b0f 100644 --- i/lib/hash.c +++ w/lib/hash.c @@ -714,7 +714,7 @@ allocate_entry (Hash_table *table) { new_size = table->overflow_size; new_size += new_size / 2; - if (new_size < table->overflow_size) + if (xalloc_oversized (new_size, sizeof *table->overflow)) return NULL; } new = realloc (table->overflow, new_size * sizeof *table->overflow); @@ -888,10 +888,10 @@ hash_rehash (Hash_table *table, size_t candidate) needed += table->overflow_size; if (needed < table->overflow_size + table->overflow_size / 2) needed = table->overflow_size + table->overflow_size / 2; - if (needed < table->overflow_size) - goto fail; if (needed < DEFAULT_MXFAST / sizeof *cursor) needed = DEFAULT_MXFAST / sizeof *cursor; + if (xalloc_oversized (needed, sizeof *cursor)) + goto fail; cursor = realloc (table->overflow, needed * sizeof *cursor); if (!cursor) goto fail; @@ -905,9 +905,6 @@ hash_rehash (Hash_table *table, size_t candidate) table->overflow = cursor; table->overflow_size = needed; } - fail: - free (new_table); - return false; } /* Merely reuse the extra old space into the new table. */ @@ -916,7 +913,7 @@ hash_rehash (Hash_table *table, size_t candidate) for (bucket = table->bucket; bucket < table->bucket_limit; bucket++) if (bucket->data) - for (cursor = bucket; cursor; cursor = next) + for (cursor = bucket; cursor != table->overflow; cursor = next) { void *data = cursor->data; struct hash_entry *new_bucket @@ -934,7 +931,7 @@ hash_rehash (Hash_table *table, size_t candidate) { /* Allocate or recycle an entry, when moving from a bucket header into a bucket overflow. 
*/ - struct hash_entry *new_entry = allocate_entry (new_table); + struct hash_entry *new_entry = allocate_entry (table); if (new_entry == NULL || table->overflow_size <= new_entry - table->overflow) @@ -961,7 +958,7 @@ hash_rehash (Hash_table *table, size_t candidate) new_bucket->data = data; new_table->n_buckets_used++; if (cursor != bucket) - free_entry (new_table, cursor); + free_entry (table, cursor); } } @@ -975,6 +972,10 @@ hash_rehash (Hash_table *table, size_t candidate) free (new_table); return true; + + fail: + free (new_table); + return false; } /* If ENTRY matches an entry already in the hash table, return the pointer And the current state of the patch (but more on that in the next email): From: Eric Blake <e...@byu.net> Date: Tue, 16 Jun 2009 16:38:52 -0600 Subject: [PATCH] hash: manage bucket overflows more efficiently * lib/hash.c [USE_OBSTACK]: Delete; the optimizations provided by an obstack have been folded in to mainline code. (hash_entry): Change type of next. (struct hash_table): Alter how overflow entries are tracked. (hash_get_max_bucket_length, hash_table_ok, hash_lookup) (hash_get_next, hash_get_entries, hash_do_for_each) (hash_initialize, hash_clear, hash_free, allocate_entry) (free_entry, hash_find_entry, hash_rehash, hash_insert) (hash_delete, hash_print): Adjust all clients. Signed-off-by: Eric Blake <e...@byu.net> --- ChangeLog | 13 +++ lib/hash.c | 293 ++++++++++++++++++++++++++++++++--------------------------- 2 files changed, 172 insertions(+), 134 deletions(-) diff --git a/ChangeLog b/ChangeLog index 48b7013..fa0b986 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,18 @@ 2009-06-16 Eric Blake <e...@byu.net> + hash: manage bucket overflows more efficiently + * lib/hash.c [USE_OBSTACK]: Delete; the optimizations provided by + an obstack have been folded in to mainline code. + (hash_entry): Change type of next. + (struct hash_table): Alter how overflow entries are tracked. 
+ (hash_get_max_bucket_length, hash_table_ok, hash_lookup) + (hash_get_next, hash_get_entries, hash_do_for_each) + (hash_initialize, hash_clear, hash_free, allocate_entry) + (free_entry, hash_find_entry, hash_rehash, hash_insert) + (hash_delete, hash_print): Adjust all clients. + +2009-06-16 Eric Blake <e...@byu.net> + hash: more minor cleanups * lib/hash.h (hash_entry): Make opaque, by moving... * lib/hash.c (hash_entry): ...here. diff --git a/lib/hash.c b/lib/hash.c index 7115932..8c43b0f 100644 --- a/lib/hash.c +++ b/lib/hash.c @@ -20,10 +20,6 @@ /* A generic hash table package. */ -/* Define USE_OBSTACK to 1 if you want the allocator to use obstacks instead - of malloc. If you change USE_OBSTACK, you have to recompile! Also, - memory allocation failures in the obstack are not reliably tracked. */ - #include <config.h> #include "hash.h" @@ -33,24 +29,18 @@ #include <stdio.h> #include <stdlib.h> -#if USE_OBSTACK -# include "obstack.h" -# ifndef obstack_chunk_alloc -# define obstack_chunk_alloc malloc -# endif -# ifndef obstack_chunk_free -# define obstack_chunk_free free -# endif -#endif - #ifndef SIZE_MAX # define SIZE_MAX ((size_t) -1) #endif +/* The approximate size to use for initial allocation. 64 bytes is + the largest "small" request for the GNU C library malloc. */ +enum { DEFAULT_MXFAST = 64 }; + struct hash_entry { void *data; - struct hash_entry *next; + size_t next; /* Index into overflow array, or 0 if end of chain. */ }; struct hash_table @@ -76,16 +66,13 @@ struct hash_table Hash_comparator comparator; Hash_data_freer data_freer; - /* A linked list of freed struct hash_entry structs. */ - struct hash_entry *free_entry_list; - -#if USE_OBSTACK - /* Whenever obstacks are used, it is possible to allocate all overflowed - entries into a single stack, so they all can be freed in a single - operation. It is not clear if the speedup is worth the trouble. - Also, allocation failure is not reliably detected. 
*/ - struct obstack entry_stack; -#endif + /* An array holding all overflowed entries, and size of that + array. If overflow_size is 0, then there is no overflow or + free entry storage; otherwise, overflow[0].next starts a list + of freed entries ready to be recycled, and overflow[0].data is + the address of the first unallocated entry. */ + struct hash_entry *overflow; + size_t overflow_size; }; /* A hash table contains many internal entries, each holding a pointer to @@ -188,7 +175,7 @@ hash_get_max_bucket_length (const Hash_table *table) struct hash_entry const *cursor = bucket; size_t bucket_length = 1; - while (cursor = cursor->next, cursor) + while (cursor->next && (cursor = table->overflow + cursor->next)) bucket_length++; if (bucket_length > max_bucket_length) @@ -199,7 +186,7 @@ hash_get_max_bucket_length (const Hash_table *table) return max_bucket_length; } -/* Do a mild validation of a hash table, by traversing it and checking two +/* Do a mild validation of a hash table, by traversing it and checking several statistics. */ bool @@ -208,7 +195,16 @@ hash_table_ok (const Hash_table *table) struct hash_entry const *bucket; size_t n_buckets_used = 0; size_t n_entries = 0; + size_t max_length = hash_get_max_bucket_length (table); + size_t max_index = 0; + if (table->overflow) + { + bucket = (struct hash_entry *) table->overflow->data; + max_index = bucket - table->overflow; + if (table->overflow_size < max_index) + return false; + } for (bucket = table->bucket; bucket < table->bucket_limit; bucket++) { if (bucket->data) @@ -218,14 +214,21 @@ hash_table_ok (const Hash_table *table) /* Count bucket head. */ n_buckets_used++; n_entries++; + if (max_index < cursor->next) + return false; /* Count bucket overflow. 
*/ - while (cursor = cursor->next, cursor) - n_entries++; + while (cursor->next && (cursor = table->overflow + cursor->next)) + { + if (max_index < cursor->next) + return false; + n_entries++; + } } } - if (n_buckets_used == table->n_buckets_used && n_entries == table->n_entries) + if (n_buckets_used == table->n_buckets_used && n_entries == table->n_entries + && (max_length <= 1 || max_length < table->overflow_size)) return true; return false; @@ -264,7 +267,8 @@ hash_lookup (const Hash_table *table, const void *entry) if (bucket->data == NULL) return NULL; - for (cursor = bucket; cursor; cursor = cursor->next) + for (cursor = bucket; cursor != table->overflow; + cursor = table->overflow + cursor->next) if (entry == cursor->data || table->comparator (entry, cursor->data)) return cursor->data; @@ -312,9 +316,10 @@ hash_get_next (const Hash_table *table, const void *entry) abort (); /* Find next entry in the same bucket. */ - for (cursor = bucket; cursor; cursor = cursor->next) + for (cursor = bucket; cursor != table->overflow; + cursor = table->overflow + cursor->next) if (cursor->data == entry && cursor->next) - return cursor->next->data; + return table->overflow[cursor->next].data; /* Find first entry in any subsequent bucket. 
*/ while (++bucket < table->bucket_limit) @@ -341,7 +346,8 @@ hash_get_entries (const Hash_table *table, void **buffer, { if (bucket->data) { - for (cursor = bucket; cursor; cursor = cursor->next) + for (cursor = bucket; cursor != table->overflow; + cursor = table->overflow + cursor->next) { if (counter >= buffer_size) return counter; @@ -373,7 +379,8 @@ hash_do_for_each (const Hash_table *table, Hash_processor processor, { if (bucket->data) { - for (cursor = bucket; cursor; cursor = cursor->next) + for (cursor = bucket; cursor != table->overflow; + cursor = table->overflow + cursor->next) { if (!(*processor) (cursor->data, processor_data)) return counter; @@ -595,10 +602,8 @@ hash_initialize (size_t candidate, const Hash_tuning *tuning, table->comparator = comparator; table->data_freer = data_freer; - table->free_entry_list = NULL; -#if USE_OBSTACK - obstack_init (&table->entry_stack); -#endif + table->overflow = NULL; + table->overflow_size = 0; return table; fail: @@ -620,10 +625,12 @@ hash_clear (Hash_table *table) if (bucket->data) { struct hash_entry *cursor; - struct hash_entry *next; + size_t next; /* Free the bucket overflow. */ - for (cursor = bucket->next; cursor; cursor = next) + for (cursor = table->overflow + bucket->next; + cursor != table->overflow; + cursor = table->overflow + next) { if (table->data_freer) (*table->data_freer) (cursor->data); @@ -632,15 +639,15 @@ hash_clear (Hash_table *table) next = cursor->next; /* Relinking is done one entry at a time, as it is to be expected that overflows are either rare or short. */ - cursor->next = table->free_entry_list; - table->free_entry_list = cursor; + cursor->next = table->overflow->next; + table->overflow->next = cursor - table->overflow; } /* Free the bucket head. 
*/ if (table->data_freer) (*table->data_freer) (bucket->data); bucket->data = NULL; - bucket->next = NULL; + bucket->next = 0; } } @@ -658,7 +665,6 @@ hash_free (Hash_table *table) { struct hash_entry *bucket; struct hash_entry *cursor; - struct hash_entry *next; /* Call the user data_freer function. */ if (table->data_freer && table->n_entries) @@ -667,7 +673,8 @@ hash_free (Hash_table *table) { if (bucket->data) { - for (cursor = bucket; cursor; cursor = cursor->next) + for (cursor = bucket; cursor != table->overflow; + cursor = table->overflow + cursor->next) { (*table->data_freer) (cursor->data); } @@ -675,30 +682,8 @@ hash_free (Hash_table *table) } } -#if USE_OBSTACK - - obstack_free (&table->entry_stack, NULL); - -#else - - /* Free all bucket overflowed entries. */ - for (bucket = table->bucket; bucket < table->bucket_limit; bucket++) - { - for (cursor = bucket->next; cursor; cursor = next) - { - next = cursor->next; - free (cursor); - } - } - - /* Also reclaim the internal list of previously freed entries. */ - for (cursor = table->free_entry_list; cursor; cursor = next) - { - next = cursor->next; - free (cursor); - } - -#endif + /* Bulk free all overflows and free entry list. */ + free (table->overflow); /* Free the remainder of the hash table structure. */ free (table->bucket); @@ -715,20 +700,50 @@ allocate_entry (Hash_table *table) { struct hash_entry *new; - if (table->free_entry_list) + if (!table->overflow || + (table->overflow->next == 0 + && ((struct hash_entry *) table->overflow->data + == table->overflow + table->overflow_size))) + { + /* Must reallocate a larger overflow table. 
*/ + size_t new_size; + + if (!table->overflow) + new_size = DEFAULT_MXFAST / sizeof *table->overflow; + else + { + new_size = table->overflow_size; + new_size += new_size / 2; + if (xalloc_oversized (new_size, sizeof *table->overflow)) + return NULL; + } + new = realloc (table->overflow, new_size * sizeof *table->overflow); + if (!new) + return NULL; + if (!table->overflow) + { + new->next = 0; + new->data = new + 1; + } + else + new->data = new + table->overflow_size; + table->overflow = new; + table->overflow_size = new_size; + } + + if (table->overflow->next) { - new = table->free_entry_list; - table->free_entry_list = new->next; + /* Recycle from the free list. */ + new = table->overflow + table->overflow->next; + table->overflow->next = new->next; } else { -#if USE_OBSTACK - new = obstack_alloc (&table->entry_stack, sizeof *new); -#else - new = malloc (sizeof *new); -#endif + /* Allocate from the tail of overflow. */ + new = (struct hash_entry *) table->overflow->data; + table->overflow->data = new + 1; } - + new->next = 0; return new; } @@ -738,9 +753,12 @@ allocate_entry (Hash_table *table) static void free_entry (Hash_table *table, struct hash_entry *entry) { + size_t index = entry - table->overflow; + if (table->overflow_size <= index) + abort (); entry->data = NULL; - entry->next = table->free_entry_list; - table->free_entry_list = entry; + entry->next = table->overflow->next; + table->overflow->next = index; } /* This private function is used to help with insertion and deletion. When @@ -775,7 +793,7 @@ hash_find_entry (Hash_table *table, const void *entry, { if (bucket->next) { - struct hash_entry *next = bucket->next; + struct hash_entry *next = table->overflow + bucket->next; /* Bump the first overflow entry into the bucket head, then save the previous first overflow entry for later recycling. */ @@ -792,15 +810,15 @@ hash_find_entry (Hash_table *table, const void *entry, } /* Scan the bucket overflow. 
*/ - for (cursor = bucket; cursor->next; cursor = cursor->next) + for (cursor = bucket; cursor->next; cursor = table->overflow + cursor->next) { - if ((*table->comparator) (entry, cursor->next->data)) + if ((*table->comparator) (entry, table->overflow[cursor->next].data)) { - void *data = cursor->next->data; + void *data = table->overflow[cursor->next].data; if (delete) { - struct hash_entry *next = cursor->next; + struct hash_entry *next = table->overflow + cursor->next; /* Unlink the entry to delete, then save the freed entry for later recycling. */ @@ -847,39 +865,55 @@ hash_rehash (Hash_table *table, size_t candidate) combined into one bucket during the rehash). */ { size_t needed = table->n_buckets_used - 1; - cursor = table->free_entry_list; - while (cursor) + size_t available; + if (table->overflow) { - cursor = cursor->next; - needed--; + cursor = (struct hash_entry *) table->overflow->data; + available = table->overflow_size - (cursor - table->overflow); + if (available < needed) + needed -= available; + else + needed = 0; + + cursor = table->overflow; + while (needed && cursor->next + && (cursor = table->overflow + cursor->next)) + needed--; } - while (needed--) + if (needed) { -#if USE_OBSTACK - cursor = obstack_alloc (&table->entry_stack, sizeof *cursor); -#else - cursor = malloc (sizeof *cursor); -#endif + /* Reallocate to the largest among table->overflow_size+needed, + table->overflow_size*1.5, and DEFAULT_MXFAST, to guarantee + O(N) overall cost. 
*/ + needed += table->overflow_size; + if (needed < table->overflow_size + table->overflow_size / 2) + needed = table->overflow_size + table->overflow_size / 2; + if (needed < DEFAULT_MXFAST / sizeof *cursor) + needed = DEFAULT_MXFAST / sizeof *cursor; + if (xalloc_oversized (needed, sizeof *cursor)) + goto fail; + cursor = realloc (table->overflow, needed * sizeof *cursor); if (!cursor) + goto fail; + if (!table->overflow) { - free (new_table); - return false; + cursor->next = 0; + cursor->data = cursor + 1; } - cursor->next = table->free_entry_list; - table->free_entry_list = cursor; + else + cursor->data = cursor + table->overflow_size; + table->overflow = cursor; + table->overflow_size = needed; } } /* Merely reuse the extra old space into the new table. */ -#if USE_OBSTACK - obstack_free (&new_table->entry_stack, NULL); - new_table->entry_stack = table->entry_stack; -#endif - new_table->free_entry_list = table->free_entry_list; + new_table->overflow = table->overflow; + new_table->overflow_size = table->overflow_size; for (bucket = table->bucket; bucket < table->bucket_limit; bucket++) if (bucket->data) - for (cursor = bucket; cursor; cursor = next) + for (cursor = bucket; cursor != table->overflow; cursor = next) { void *data = cursor->data; struct hash_entry *new_bucket @@ -889,7 +923,7 @@ hash_rehash (Hash_table *table, size_t candidate) if (! (new_bucket < new_table->bucket_limit)) abort (); - next = cursor->next; + next = table->overflow + cursor->next; if (new_bucket->data) { @@ -897,21 +931,22 @@ hash_rehash (Hash_table *table, size_t candidate) { /* Allocate or recycle an entry, when moving from a bucket header into a bucket overflow. 
*/ - struct hash_entry *new_entry = allocate_entry (new_table); + struct hash_entry *new_entry = allocate_entry (table); - if (new_entry == NULL) + if (new_entry == NULL + || table->overflow_size <= new_entry - table->overflow) abort (); new_entry->data = data; new_entry->next = new_bucket->next; - new_bucket->next = new_entry; + new_bucket->next = new_entry - table->overflow; } else { /* Merely relink an existing entry, when moving from a bucket overflow into a bucket overflow. */ cursor->next = new_bucket->next; - new_bucket->next = cursor; + new_bucket->next = cursor - table->overflow; } } else @@ -923,7 +958,7 @@ hash_rehash (Hash_table *table, size_t candidate) new_bucket->data = data; new_table->n_buckets_used++; if (cursor != bucket) - free_entry (new_table, cursor); + free_entry (table, cursor); } } @@ -932,14 +967,15 @@ hash_rehash (Hash_table *table, size_t candidate) table->bucket_limit = new_table->bucket_limit; table->n_buckets = new_table->n_buckets; table->n_buckets_used = new_table->n_buckets_used; - table->free_entry_list = new_table->free_entry_list; - /* table->n_entries already holds its value. */ -#if USE_OBSTACK - table->entry_stack = new_table->entry_stack; -#endif + /* table->n_entries, table->overflow, and table->overflow_size + already hold their values. */ free (new_table); return true; + + fail: + free (new_table); + return false; } /* If ENTRY matches an entry already in the hash table, return the pointer @@ -968,12 +1004,14 @@ hash_insert (Hash_table *table, const void *entry) if (new_entry == NULL) return NULL; + if (table->overflow_size <= new_entry - table->overflow) + abort (); /* Add ENTRY in the overflow of the bucket. 
*/ new_entry->data = (void *) entry; new_entry->next = bucket->next; - bucket->next = new_entry; + bucket->next = new_entry - table->overflow; table->n_entries++; return (void *) entry; } @@ -1060,21 +1098,7 @@ hash_delete (Hash_table *table, const void *entry) if (!hash_rehash (table, candidate)) { /* Failure to allocate memory in an attempt to - shrink the table is not fatal. But since memory - is low, we can at least be kind and free any - spare entries, rather than keeping them tied up - in the free entry list. */ -#if ! USE_OBSTACK - struct hash_entry *cursor = table->free_entry_list; - struct hash_entry *next; - while (cursor) - { - next = cursor->next; - free (cursor); - cursor = next; - } - table->free_entry_list = NULL; -#endif + shrink the table is not fatal. */ } } } @@ -1099,7 +1123,8 @@ hash_print (const Hash_table *table) if (bucket) printf ("%lu:\n", (unsigned long int) (bucket - table->bucket)); - for (cursor = bucket; cursor; cursor = cursor->next) + for (cursor = bucket; cursor != table->overflow; + cursor = table->overflow + cursor->next) { char const *s = cursor->data; /* FIXME */ -- 1.6.3.2