Changeset: d21f7ebcdca4 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=d21f7ebcdca4 Modified Files: monetdb5/modules/mosaic/mosaic_prefix.c Branch: mosaic Log Message:
Find common prefix Using a block and a binary search to find the number of bits for the common prefix diffs (truncated from 526 to 300 lines): diff --git a/monetdb5/modules/mosaic/mosaic_prefix.c b/monetdb5/modules/mosaic/mosaic_prefix.c --- a/monetdb5/modules/mosaic/mosaic_prefix.c +++ b/monetdb5/modules/mosaic/mosaic_prefix.c @@ -20,8 +20,8 @@ /* * 2014-2016 author Martin Kersten * Bit_prefix compression - * Factor out the leading bits from a series of values. - * The prefix size is determined by the first two non-identical values. + * Factor out leading bits from a series of values. + * The prefix size is determined by looking ahead in a small block. * To use the bitvector, we limit the extracted tail to at most 32bits * The administration are 2 TPE values (mask,reference value) * The size of the residu is stored in the reference value lower bits @@ -40,6 +40,7 @@ MOSdump_prefix(Client cntxt, MOStask tas void *val = (void*)(((char*) blk) + MosaicBlkSize); mnstr_printf(cntxt->fdout,"#prefix "BUNFMT" ", MOSgetCnt(blk)); + switch(task->type){ case TYPE_bte: mnstr_printf(cntxt->fdout,"bte %hhd", *(bte*) val); break; @@ -143,7 +144,7 @@ MOSadvance_prefix(Client cntxt, MOStask bits = (int)(val & (~mask)); bytes = wordaligned(2 * sizeof(unsigned char),int); bytes += wordaligned(getBitVectorSize(MOSgetCnt(task->blk),bits) * sizeof(int), int); - task->blk = (MosaicBlk) (((char*) dst) + wordaligned(bytes, int)); + task->blk = (MosaicBlk) (((char*) dst) + bytes); } break; case 2: @@ -153,7 +154,7 @@ MOSadvance_prefix(Client cntxt, MOStask bits = (int)(val & (~mask)); bytes = wordaligned(2 * sizeof(unsigned short),int); bytes += wordaligned(getBitVectorSize(MOSgetCnt(task->blk),bits) * sizeof(int), int); - task->blk = (MosaicBlk) (((char*) dst) + wordaligned(bytes, int)); + task->blk = (MosaicBlk) (((char*) dst) + bytes); } break; case 4: @@ -173,7 +174,7 @@ MOSadvance_prefix(Client cntxt, MOStask bits = (int)(val & (~mask)); bytes = wordaligned(2 * sizeof(ulng),int); bytes += wordaligned(getBitVectorSize(MOSgetCnt(task->blk),bits) * sizeof(int), int); - task->blk = (MosaicBlk) (((char*) dst) + wordaligned(bytes, int)); + task->blk = (MosaicBlk) (((char*) dst) + bytes); } } #ifdef _DEBUG_MOSAIC_ @@ -189,7 +190,137 @@ MOSskip_prefix(Client cntxt, MOStask tas task->blk = 0; // ENDOFLIST } -// Find common prefix +// logarithmic search for common prefix in a given block +// use static prefix mask attempts + +static void +findPrefixBit(Client cntxt, unsigned char *v, int limit, int *bits, unsigned char *prefixmask) +{ + int i, step = 8, width = 0; + unsigned char prefix, mask; + do{ + step /=2; + mask = 1 ; + for( i=0; i < 8 - 1 - (width +step); i++) + mask = (mask <<1) | 1; + mask = ~mask; + prefix = v[0] & mask; + for(i=0; i< limit; i++) + if( (v[i] & mask) != prefix) + break; +#ifdef _DEBUG_PREFIX_ + mnstr_printf(cntxt->fdout,"#findprefix width %d step %d mask %o %d\n", width, step, mask, limit-i); +#endif + if( i == limit){ + width += step; + *bits = width; + *prefixmask = mask; + } + } while (step > 1); +#ifdef _DEBUG_PREFIX_ + mnstr_printf(cntxt->fdout,"#findprefix final %d \n", *bits); +#else + (void) cntxt; +#endif +} + +static void +findPrefixSht(Client cntxt, unsigned short *v, int limit, int *bits, unsigned short *prefixmask) +{ + int i, step = 16, width = 0; + unsigned short prefix, mask; +#ifdef _DEBUG_PREFIX_ + mnstr_printf(cntxt->fdout,"#findprefix start %u %d \n", *v, *bits); +#endif + do{ + step /=2; + mask = 1 ; + for( i=0; i < 16-1 - (width +step); i++) + mask = (mask <<1) | 1; + mask = ~mask; + prefix = v[0] & mask; + for(i=0; i< limit; i++) + if( (v[i] & mask) != prefix) + break; +#ifdef _DEBUG_PREFIX_ + mnstr_printf(cntxt->fdout,"#findprefix width %d step %d mask %o %d\n", width, step, mask, limit-i); +#endif + if( i == limit){ + width += step; + *bits = width; + *prefixmask = mask; + } + } while (step > 1); +#ifdef _DEBUG_PREFIX_ + mnstr_printf(cntxt->fdout,"#findprefix final %d \n", *bits); +#else + (void) cntxt; +#endif +} + +static void +findPrefixInt(Client cntxt, unsigned int *v, int limit, int *bits, unsigned int *prefixmask) +{ + int i, step = 32, width = 0; + unsigned int prefix, mask; + do{ + step /=2; + mask = 1 ; + for( i=0; i < 32-1 - (width +step); i++) + mask = (mask <<1) | 1; + mask = ~mask; + prefix = v[0] & mask; + for(i=0; i< limit; i++) + if( (v[i] & mask) != prefix) + break; +#ifdef _DEBUG_PREFIX_ + mnstr_printf(cntxt->fdout,"#findprefix width %d step %d mask %o %d\n", width, step, mask, limit-i); +#endif + if( i == limit){ + width += step; + *bits = width; + *prefixmask = mask; + } + } while (step > 1); +#ifdef _DEBUG_PREFIX_ + mnstr_printf(cntxt->fdout,"#findprefix final %d \n", *bits); +#else + (void) cntxt; +#endif +} + +static void +findPrefixLng(Client cntxt, ulng *v, int limit, int *bits, ulng *prefixmask) +{ + int i, step = 64, width = 0; + ulng prefix, mask; + do{ + step /=2; + mask = 1 ; + for( i=0; i < 64 - 1 - (width +step); i++) + mask = (mask <<1) | 1; + mask = ~mask; + prefix = v[0] & mask; + for(i=0; i< limit; i++) + if( (v[i] & mask) != prefix) + break; +#ifdef _DEBUG_PREFIX_ + mnstr_printf(cntxt->fdout,"#findprefix width %d step %d mask "LLFMT" %d\n", width, step, mask, limit-i); +#endif + if( i == limit){ + width += step; + *bits = width; + *prefixmask = mask; + } + } while (step > 1 && *bits < 32); + // we only use at most 32 bits as prefix due to bitvector implementation +#ifdef _DEBUG_PREFIX_ + mnstr_printf(cntxt->fdout,"#findprefix final %d \n", *bits); +#else + (void) cntxt; +#endif +} + #define Prefix(Prefix,Mask,X,Y,N) \ { int k, m = 1; \ for(k=0; k<N; k+=1, X>>=1, Y>>=1){\ @@ -201,6 +332,7 @@ MOSskip_prefix(Client cntxt, MOStask tas } +#define LOOKAHEAD (limit <10? limit:10) // calculate the expected reduction flt MOSestimate_prefix(Client cntxt, MOStask task) @@ -217,22 +349,13 @@ MOSestimate_prefix(Client cntxt, MOStask if( task->elm >= 2) switch(size){ case 1: - { unsigned char *v = ((unsigned char*) task->src) + task->start, *w= v+1, val= *v,val2= *w, mask; - // search first non-identical value - for(i = 0;i < limit-1; i++, w++) - if( *v != *w ){ - val2 = *w; - break; - } - // all are the same? - if ( i == limit -1) - break; - Prefix(prefixbits, mask, val, val2, 8); + { unsigned char *v = ((unsigned char*) task->src) + task->start, *w= v+1, val= *v, mask; + findPrefixBit(cntxt, v, LOOKAHEAD, &prefixbits, &mask); if( prefixbits == 0) break; #ifdef _DEBUG_PREFIX_ - mnstr_printf(cntxt->fdout,"#prefix estimate 1 %o %o val %d bits %d mask %o\n", + mnstr_printf(cntxt->fdout,"#prefix estimate size 1 %o %o val %d bits %d mask %o\n", *v,*w, val, prefixbits, mask); #endif if( task->range[MOSAIC_PREFIX] > task->start +1 /* need at least two*/){ @@ -264,20 +387,12 @@ MOSestimate_prefix(Client cntxt, MOStask } break; case 2: - { unsigned short *v = ((unsigned short*) task->src) + task->start, *w= v+1, val= *v,val2= *w, mask; - // search first non-identical value - for(i = 0;i < limit-1;i++, w++) - if( *v != *w ){ - val2 = *w; - break; - } - if ( i == limit-1) - break; - Prefix(prefixbits, mask, val, val2, 16); + { unsigned short *v = ((unsigned short*) task->src) + task->start, *w= v+1, val= *v, mask; + findPrefixSht(cntxt, v, LOOKAHEAD, &prefixbits, &mask); if( prefixbits == 0) break; #ifdef _DEBUG_PREFIX_ - mnstr_printf(cntxt->fdout,"#prefix estimate 2 %o %o elm "BUNFMT" val %d bits %d mask %o\n", + mnstr_printf(cntxt->fdout,"#prefix estimate size 2 %u %o elm "BUNFMT" val %d bits %d mask %o\n", *v,*w, i,val, prefixbits, mask); #endif @@ -309,22 +424,14 @@ MOSestimate_prefix(Client cntxt, MOStask } break; case 4: - { unsigned int *v = ((unsigned int*) task->src) + task->start, *w= v+1, val= *v,val2= *w, mask; - // search first non-identical value - for(i = 0;i < limit-1 ;i++, w++) - if( *v != *w ){ - val2 = *w; - break; - } - if ( i == limit-1) - break; - Prefix(prefixbits, mask, val, val2, 32); + { unsigned int *v = ((unsigned int*) task->src) + task->start, *w= v+1, val= *v, mask; + findPrefixInt(cntxt, v, LOOKAHEAD, &prefixbits,&mask); if( prefixbits == 0) break; #ifdef _DEBUG_PREFIX_ - mnstr_printf(cntxt->fdout,"#prefix estimate 4 %o %o elm "BUNFMT" val %d bits %d mask %o\n", - *v,*w, i, val, prefixbits, mask); + mnstr_printf(cntxt->fdout,"#prefix estimate size 4 %o elm "BUNFMT" val %d bits %d mask %o\n", + *v, i, val, prefixbits, mask); #endif if( task->range[MOSAIC_PREFIX] > task->start + 1){ bits = (task->range[MOSAIC_PREFIX] - task->start) * (32-prefixbits); @@ -355,17 +462,9 @@ MOSestimate_prefix(Client cntxt, MOStask } break; case 8: - { ulng *v = ((ulng*) task->src) + task->start, *w= v+1, val= *v,val2= *w, mask; - // search first non-identical value - for(i = 0; i < limit-1 ;i++, w++) - if( *v != *w ){ - val2 = *w; - break; - } - if ( i == limit-1 ) - break; - Prefix(prefixbits, mask, val, val2, 32); // bitvector has limit of 32-bit fields - if( prefixbits == 0 || 64-prefixbits >= 32) + { ulng *v = ((ulng*) task->src) + task->start, *w= v+1, val= *v,mask; + findPrefixLng(cntxt, v, LOOKAHEAD, &prefixbits,&mask); + if( prefixbits == 0 ) break; if( task->range[MOSAIC_PREFIX] > task->start + 1){ @@ -394,7 +493,7 @@ MOSestimate_prefix(Client cntxt, MOStask factor = ( (flt)i * sizeof(lng))/ store; } } -#ifdef _DEBUG_MOSAIC_ +#ifdef _DEBUG_PREFIX_ _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list