Wayne Davison wrote: > http://rsync.samba.org/ftp/unpacked/rsync/patches/dynamic_hash.diff > > A line of credit would have been nice :-)
>One thing this patch does is to (1) leave the array allocated to its >largest size, (2) use realloc() if we need to make it bigger, (3) make >the minimum hash-table size 65537 (a prime). Some of these decisions >are debatable: > >The 1st item makes us more efficient in our malloc calls when sending >large files, but could waste sender-side memory when transferring a >single large file in the middle of a bunch of normal-sized files. > > With a minimal struct size of 65537, I doubt we would have too many reallocations. After all, a file needs to be over about 2.5 GB for us to need a larger array. Such files are rare enough, and the time it takes to allocate the array is fairly insignificant compared to their total handling time, that I think we can save the memory when handling smaller files. As for point 2 - isn't "realloc" potentially less efficient than just "malloc" if we intend to erase the array's content anyways? >The 3rd item might be a bit over the top, but we used to always allocate >a tag array of 65536 elements, and since I noticed some hash collisions >occurred in small files using a hash-table size of 11 items, I figured >it would be an acceptable overhead to make normal-sized files much more >likely to have no collisions at all. > > Personally, I agree that a minimum is a good idea. >Comments welcomed. Thanks again for your patch! >..wayne.. > > Ok. Here are a few comments: 1. I guess it's a matter of taste, but when you want to make sure a type has enough states to keep count of the number of elements in an array, I prefer using "size_t" to "int32". It's more upwards compatible. 2. In "sum_buf", sum1 is defined to be unsigned. It seems dangerous to me to hash it into a signed index, even if it's almost guarenteed to be ok. I'm attaching my proposed patch, incorporating all above comments. I also did some style suggestions (use sizeof(sum_table[0]) instead of sizeof(int32), initialize the chain element to -1 instead of 0, as that's the "null" value). Shachar
? .sender.c.swp ? dynamic_hash.patch ? patches/.dynamic_hash.diff.swp Index: match.c =================================================================== RCS file: /cvsroot/rsync/match.c,v retrieving revision 1.78 diff -u -r1.78 match.c --- match.c 24 Feb 2006 16:43:44 -0000 1.78 +++ match.c 26 Feb 2006 08:31:40 -0000 @@ -26,11 +26,6 @@ int updating_basis_file; -typedef unsigned short tag; - -#define TABLESIZE (1<<16) -#define NULL_TAG (-1) - static int false_alarms; static int tag_hits; static int matches; @@ -42,47 +37,39 @@ extern struct stats stats; -struct target { - tag t; - int32 i; -}; - -static struct target *targets; - -static int32 *tag_table; - -#define gettag2(s1,s2) (((s1) + (s2)) & 0xFFFF) -#define gettag(sum) gettag2((sum)&0xFFFF,(sum)>>16) - -static int compare_targets(struct target *t1,struct target *t2) -{ - return (int)t1->t - (int)t2->t; -} +static size_t tablesize; +static int32 *sum_table; +#define gettag2(s1,s2) gettag((s1) + ((s2)<<16)) +#define gettag(sum) ((sum)%tablesize) static void build_hash_table(struct sum_struct *s) { int32 i; + uint32 t; + size_t tablealloc=tablesize; - if (!tag_table) - tag_table = new_array(int32, TABLESIZE); + /* Dynamically calculate the hash table size so that the hash load + * is always about 80%. This number must be odd or s2 will not be + * able to span the entire set. */ + + tablesize = (s->count/8) * 10 + 11; + if (tablesize < 65537) + tablesize = 65537; /* a prime number */ + if (tablesize != tablealloc) { + free (sum_table); + sum_table = new_array(sum_table, uint32, tablesize); + if (!sum_table) + out_of_memory("build_hash_table"); + } - targets = new_array(struct target, s->count); - if (!tag_table || !targets) - out_of_memory("build_hash_table"); + memset(sum_table, 0xFF, tablesize * sizeof (sum_table[0])); for (i = 0; i < s->count; i++) { - targets[i].i = i; - targets[i].t = gettag(s->sums[i].sum1); + t = gettag(s->sums[i].sum1); + s->sums[i].chain = sum_table[t]; + sum_table[t] = i; } - - qsort(targets,s->count,sizeof(targets[0]),(int (*)())compare_targets); - - for (i = 0; i < TABLESIZE; i++) - tag_table[i] = NULL_TAG; - - for (i = s->count; i-- > 0; ) - tag_table[targets[i].t] = i; } @@ -176,20 +163,17 @@ } do { - tag t = gettag2(s1,s2); + int32 i; + size_t t = gettag2(s1,s2); int done_csum2 = 0; - int32 j = tag_table[t]; if (verbose > 4) rprintf(FINFO,"offset=%.0f sum=%08x\n",(double)offset,sum); - if (j == NULL_TAG) - goto null_tag; - sum = (s1 & 0xffff) | (s2 << 16); tag_hits++; - do { - int32 l, i = targets[j].i; + for (i = sum_table[t]; i >= 0; i = s->sums[i].chain) { + int32 l; if (sum != s->sums[i].sum1) continue; @@ -205,9 +189,10 @@ && !(s->sums[i].flags & SUMFLG_SAME_OFFSET)) continue; - if (verbose > 3) - rprintf(FINFO,"potential match at %.0f target=%.0f %.0f sum=%08x\n", - (double)offset,(double)j,(double)i,sum); + if (verbose > 3) { + rprintf(FINFO,"potential match at %.0f i=%ld sum=%08x\n", + (double)offset, (long)i, sum); + } if (!done_csum2) { map = (schar *)map_ptr(buf,offset,l); @@ -224,23 +209,23 @@ * one with an identical offset, so we prefer that over * the following want_i optimization. */ if (updating_basis_file) { - do { - int32 i2 = targets[j].i; + int32 i2; + for (i2 = i; i2 >= 0; i2 = s->sums[i2].chain) { if (s->sums[i2].offset != offset) continue; if (i2 != i) { if (sum != s->sums[i2].sum1) - break; + continue; if (memcmp(sum2, s->sums[i2].sum2, s->s2length) != 0) - break; + continue; i = i2; } /* This chunk was at the same offset on * both the sender and the receiver. */ s->sums[i].flags |= SUMFLG_SAME_OFFSET; goto set_want_i; - } while (++j < s->count && targets[j].t == t); + } } /* we've found a match, but now check to see @@ -266,9 +251,8 @@ s2 = sum >> 16; matches++; break; - } while (++j < s->count && targets[j].t == t); + } - null_tag: backup = offset - last_match; /* We sometimes read 1 byte prior to last_match... */ if (backup < 0) @@ -375,11 +359,6 @@ rprintf(FINFO,"sending file_sum\n"); write_buf(f,file_sum,MD4_SUM_LENGTH); - if (targets) { - free(targets); - targets=NULL; - } - if (verbose > 2) rprintf(FINFO, "false_alarms=%d tag_hits=%d matches=%d\n", false_alarms, tag_hits, matches); Index: rsync.h =================================================================== RCS file: /cvsroot/rsync/rsync.h,v retrieving revision 1.283 diff -u -r1.283 rsync.h --- rsync.h 3 Feb 2006 18:34:09 -0000 1.283 +++ rsync.h 26 Feb 2006 08:31:42 -0000 @@ -560,6 +560,7 @@ OFF_T offset; /**< offset in file of this chunk */ int32 len; /**< length of chunk of file */ uint32 sum1; /**< simple checksum */ + int32 chain; /**< hash-table chaining - index of next hash with colliding hash table position */ short flags; /**< flag bits */ char sum2[SUM_LENGTH]; /**< checksum */ }; Index: sender.c =================================================================== RCS file: /cvsroot/rsync/sender.c,v retrieving revision 1.92 diff -u -r1.92 sender.c --- sender.c 14 Jan 2006 20:26:24 -0000 1.92 +++ sender.c 26 Feb 2006 08:31:43 -0000 @@ -92,6 +92,7 @@ s->sums[i].offset = offset; s->sums[i].flags = 0; + s->sums[i].chain = -1; if (i == s->count-1 && s->remainder != 0) s->sums[i].len = s->remainder;
-- To unsubscribe or change options: https://lists.samba.org/mailman/listinfo/rsync Before posting, read: http://www.catb.org/~esr/faqs/smart-questions.html