On 15:35, Andre Noll wrote: > > - Reviewing my hash table code (particularly the hash function) > > src/lib/htable.h src/lib/htable.c > > Will do.
First some general remarks: - check the return value of malloc() - keys are restricted to strings. it would be easy to extend this to arbitrary data by adding a "size" field. - IMHO "%p" is prefered for printing pointer variables. --------------------------------------- htable.h > struct hlink { > void *next; /* next hash item */ why not struct hlink *next? > char *key; /* key this item */ > uint32_t hash; /* hash for this key */ > }; > > class htable : public SMARTALLOC { > hlink **table; /* hash table */ > int loffset; /* link offset in item */ see below. > uint32_t num_items; /* current number of items */ > uint32_t max_items; /* maximum items before growing */ see below. > uint32_t buckets; /* size of hash table */ > uint32_t hash; /* temp storage */ > uint32_t index; /* temp storage */ > uint32_t mask; /* "remainder" mask */ > uint32_t rshift; /* amount to shift down */ > hlink *walkptr; /* table walk pointer */ This is only used in first() and next(). IMHO the code would be more readable if this were local to these two methods and mext() would take a pointer to the predecessor. > uint32_t walk_index; /* table walk index */ same here. > void hash_index(char *key); /* produce hash key,index */ > void grow_table(); /* grow the table */ > public: > htable(void *item, void *link, int tsize = 31); > ~htable() { destroy(); } > void init(void *item, void *link, int tsize = 31); > bool insert(char *key, void *item); > void *lookup(char *key); > void *first(); /* get first item in table */ > void *next(); /* get next item in table */ > void destroy(); > void stats(); /* print stats about the table */ > uint32_t size(); /* return size of table */ > }; ----------------------------------------htable.c > /* > * Create hash of key, stored in hash then > * create and return the pseudo random bucket index > */ > void htable::hash_index(char *key) > { > hash = 0; > for (char *p=key; *p; p++) { > hash += (hash << 3) + (uint32_t)*p; > } This ist just shifting and multiplication with the bytes in key. I would expect this hash to have quite some unneccessary collisions when used on strings, though I'm by no means an expert in this area. > /* Multiply by large prime number, take top bits, mask for remainder */ > index = ((hash * 1103515249) >> rshift) & mask; > Dmsg2(100, "Leave hash_index hash=0x%x index=%d\n", hash, index); > return; The return statement is unneccesary. > void htable::init(void *item, void *link, int tsize) > { > int pwr; > tsize >>= 2; > for (pwr=0; tsize; pwr++) { > tsize >>= 1; > } > loffset = (char *)link - (char *)item; This is a constant, known at compile time, so make it a preprocessor define using offsetof. > mask = ~((~0)<<pwr); /* 3 bits => table size = 8 */ > rshift = 30 - pwr; /* start using bits 28, 29, 30 */ > num_items = 0; /* number of entries in table */ > buckets = 1<<pwr; /* hash table size -- power of two */ > max_items = buckets * 4; /* allow average 4 entries per chain */ As max_items is always four times the number of buckets, we could get rid of max_items. More importantly, I think it is not optimal to store that many keys in the table because that means we'll have in average four hits per bucket. I'd rather choose max_items _smaller_ than the number of buckets. For instance the git code which uses hash tables extensively, grows its table whenever it is filled 66%. > table = (hlink **)malloc(buckets * sizeof(hlink *)); unneccessary cast. > memset(table, 0, buckets * sizeof(hlink *)); how about calloc() instead of malloc + memset? > /* > * Take each hash link and walk down the chain of items > * that hash there counting them (i.e. the hits), > * then report that number. Open addressing might be better than chaining to deal with hash collisions. In particular if items are never (or rarely) removed from the table. > * Obiously, the more hits in a chain, the more time > * it takes to reference them. Empty chains are not so > * hot either -- as it means unused or wasted space. > */ > #define MAX_COUNT 20 Unless I'm mistaken, the code will fail badly if there are more than MAX_COUNT keys that hash to the same value. > void htable::stats() > { > int hits[MAX_COUNT]; > int max = 0; > int i, j; > hlink *p; > printf("\n\nNumItems=%d\nTotal buckets=%d\n", num_items, buckets); > printf("Hits/bucket: buckets\n"); Dmesg? > for (i=0; i < MAX_COUNT; i++) { > hits[i] = 0; > } memset(hits, 0 MAXCOUNT * sizeof(int)); > for (i=0; i<(int)buckets; i++) { > p = table[i]; > j = 0; > while (p) { > p = (hlink *)(p->next); cast would become unneccessary if next were struct hlink*. > void htable::grow_table() > { > Dmsg1(100, "Grow called old size = %d\n", buckets); > /* Setup a bigger table */ > htable *big = (htable *)malloc(sizeof(htable)); unnecessary cast > big->loffset = loffset; > big->mask = mask<<1 | 1; > big->rshift = rshift - 1; > big->num_items = 0; > big->buckets = buckets * 2; > big->max_items = big->buckets * 4; DRY ;) > big->table = (hlink **)malloc(big->buckets * sizeof(hlink *)); > memset(big->table, 0, big->buckets * sizeof(hlink *)); unnecessary cast, calloc(). > /* > * We walk through the old smaller tree getting items, > * but since we are overwriting the colision links, we must > * explicitly save the item->next pointer and walk each > * colision chain ourselves. We do use next() for getting > * to the next bucket. > */ > for (void *item=first(); item; ) { > void *ni = ((hlink *)((char *)item+loffset))->next; /* save link > overwritten by insert */ It might be worth to improve readability by defining #define next_item(item) ((hlink *)((char *)((item)+(loffset)) as this calculation is needed several times. > Dmsg1(100, "Grow insert: %s\n", ((hlink *)((char *)item+loffset))->key); > big->insert(((hlink *)((char *)item+loffset))->key, item); > if (ni) { > item = (void *)((char *)ni-loffset); > } else { > walkptr = NULL; > item = next(); > } > } This rather ugly code would become unneccessary when using open addressing instead of chaining. > void *htable::next() > { > Dmsg1(100, "Enter next: walkptr=0x%x\n", (unsigned)walkptr); > if (walkptr) { > walkptr = (hlink *)(walkptr->next); unnecessary cast Regards Andre -- The only person who always got his work done by Friday was Robinson Crusoe
signature.asc
Description: Digital signature
------------------------------------------------------------------------- This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. http://sourceforge.net/powerbar/db2/
_______________________________________________ Bacula-users mailing list Bacula-users@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/bacula-users