Changeset: d21f7ebcdca4 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=d21f7ebcdca4
Modified Files:
        monetdb5/modules/mosaic/mosaic_prefix.c
Branch: mosaic
Log Message:

Find common prefix
Using a block and a binary search to find
the number of bits for the common prefix


diffs (truncated from 526 to 300 lines):

diff --git a/monetdb5/modules/mosaic/mosaic_prefix.c 
b/monetdb5/modules/mosaic/mosaic_prefix.c
--- a/monetdb5/modules/mosaic/mosaic_prefix.c
+++ b/monetdb5/modules/mosaic/mosaic_prefix.c
@@ -20,8 +20,8 @@
 /*
  * 2014-2016 author Martin Kersten
  * Bit_prefix compression
- * Factor out the leading bits from a series of values.
- * The prefix size is determined by the first two non-identical values.
+ * Factor out leading bits from a series of values.
+ * The prefix size is determined by looking ahead in a small block.
  * To use the bitvector, we limit the extracted tail to at most 32bits
  * The administration are 2 TPE values (mask,reference value)
  * The size of the residu is stored in the reference value lower bits
@@ -40,6 +40,7 @@ MOSdump_prefix(Client cntxt, MOStask tas
        void *val = (void*)(((char*) blk) + MosaicBlkSize);
 
        mnstr_printf(cntxt->fdout,"#prefix "BUNFMT" ", MOSgetCnt(blk));
+
        switch(task->type){
        case TYPE_bte:
                mnstr_printf(cntxt->fdout,"bte %hhd", *(bte*) val); break;
@@ -143,7 +144,7 @@ MOSadvance_prefix(Client cntxt, MOStask 
                        bits = (int)(val & (~mask));
                        bytes = wordaligned(2 * sizeof(unsigned char),int);
                        bytes += 
wordaligned(getBitVectorSize(MOSgetCnt(task->blk),bits) * sizeof(int), int);
-                       task->blk = (MosaicBlk) (((char*) dst)  + 
wordaligned(bytes, int)); 
+                       task->blk = (MosaicBlk) (((char*) dst)  + bytes); 
                }
                break;
        case 2:
@@ -153,7 +154,7 @@ MOSadvance_prefix(Client cntxt, MOStask 
                        bits = (int)(val & (~mask));
                        bytes = wordaligned(2 * sizeof(unsigned short),int);
                        bytes += 
wordaligned(getBitVectorSize(MOSgetCnt(task->blk),bits) * sizeof(int), int);
-                       task->blk = (MosaicBlk) (((char*) dst)  + 
wordaligned(bytes, int)); 
+                       task->blk = (MosaicBlk) (((char*) dst)  + bytes); 
                }
                break;
        case 4:
@@ -173,7 +174,7 @@ MOSadvance_prefix(Client cntxt, MOStask 
                        bits = (int)(val & (~mask));
                        bytes = wordaligned(2 * sizeof(ulng),int);
                        bytes += 
wordaligned(getBitVectorSize(MOSgetCnt(task->blk),bits) * sizeof(int), int);
-                       task->blk = (MosaicBlk) (((char*) dst)  + 
wordaligned(bytes, int)); 
+                       task->blk = (MosaicBlk) (((char*) dst)  + bytes); 
                }
        }
 #ifdef _DEBUG_MOSAIC_
@@ -189,7 +190,137 @@ MOSskip_prefix(Client cntxt, MOStask tas
                task->blk = 0; // ENDOFLIST
 }
 
-// Find common prefix
+// logarithmic search for common prefix in a given block
+// use static prefix mask attempts
+
+static void
+findPrefixBit(Client cntxt, unsigned char *v, int limit, int *bits, unsigned 
char *prefixmask)
+{
+       int i, step = 8, width = 0;
+       unsigned char prefix, mask;
+       do{
+               step /=2;
+               mask = 1 ;
+               for( i=0; i < 8 - 1 - (width +step); i++)
+                       mask = (mask <<1) | 1;
+               mask = ~mask;
+               prefix = v[0] & mask;
+               for(i=0; i< limit; i++)
+                       if( (v[i] & mask) != prefix)
+                               break;
+#ifdef _DEBUG_PREFIX_
+               mnstr_printf(cntxt->fdout,"#findprefix width %d step %d mask %o 
 %d\n", width, step, mask, limit-i);
+#endif
+               if( i ==  limit){
+                       width += step;
+                       *bits = width;
+                       *prefixmask = mask;
+               }
+       } while (step > 1);
+#ifdef _DEBUG_PREFIX_
+       mnstr_printf(cntxt->fdout,"#findprefix final %d \n", *bits);
+#else
+       (void) cntxt;
+#endif
+}
+
+static void
+findPrefixSht(Client cntxt, unsigned short *v, int limit, int *bits, unsigned 
short *prefixmask)
+{
+       int i, step = 16, width = 0;
+       unsigned short prefix, mask;
+#ifdef _DEBUG_PREFIX_
+       mnstr_printf(cntxt->fdout,"#findprefix start %u %d \n", *v, *bits);
+#endif
+       do{
+               step /=2;
+               mask = 1 ;
+               for( i=0; i < 16-1 - (width +step); i++)
+                       mask = (mask <<1) | 1;
+               mask = ~mask;
+               prefix = v[0] & mask;
+               for(i=0; i< limit; i++)
+                       if( (v[i] & mask) != prefix)
+                               break;
+#ifdef _DEBUG_PREFIX_
+               mnstr_printf(cntxt->fdout,"#findprefix width %d step %d mask %o 
 %d\n", width, step, mask, limit-i);
+#endif
+               if( i ==  limit){
+                       width += step;
+                       *bits = width;
+                       *prefixmask = mask;
+               }
+       } while (step > 1);
+#ifdef _DEBUG_PREFIX_
+       mnstr_printf(cntxt->fdout,"#findprefix final %d \n", *bits);
+#else
+       (void) cntxt;
+#endif
+}
+
+static void
+findPrefixInt(Client cntxt, unsigned int *v, int limit, int *bits, unsigned 
int *prefixmask)
+{
+       int i, step = 32, width = 0;
+       unsigned int prefix, mask;
+       do{
+               step /=2;
+               mask = 1 ;
+               for( i=0; i < 32-1 - (width +step); i++)
+                       mask = (mask <<1) | 1;
+               mask = ~mask;
+               prefix = v[0] & mask;
+               for(i=0; i< limit; i++)
+                       if( (v[i] & mask) != prefix)
+                               break;
+#ifdef _DEBUG_PREFIX_
+               mnstr_printf(cntxt->fdout,"#findprefix width %d step %d mask %o 
 %d\n", width, step, mask, limit-i);
+#endif
+               if( i ==  limit){
+                       width += step;
+                       *bits = width;
+                       *prefixmask = mask;
+               }
+       } while (step > 1);
+#ifdef _DEBUG_PREFIX_
+       mnstr_printf(cntxt->fdout,"#findprefix final %d \n", *bits);
+#else
+       (void) cntxt;
+#endif
+}
+
+static void
+findPrefixLng(Client cntxt, ulng *v, int limit, int *bits, ulng *prefixmask)
+{
+       int i, step = 64, width = 0;
+       ulng prefix, mask;
+       do{
+               step /=2;
+               mask = 1 ;
+               for( i=0; i < 64 - 1 - (width +step); i++)
+                       mask = (mask <<1) | 1;
+               mask = ~mask;
+               prefix = v[0] & mask;
+               for(i=0; i< limit; i++)
+                       if( (v[i] & mask) != prefix)
+                               break;
+#ifdef _DEBUG_PREFIX_
+               mnstr_printf(cntxt->fdout,"#findprefix width %d step %d mask 
"LLFMT"  %d\n", width, step, mask, limit-i);
+#endif
+               if( i ==  limit){
+                       width += step;
+                       *bits = width;
+                       *prefixmask = mask;
+               }
+       } while (step > 1 && *bits < 32);
+       // we only use at most 32 bits as prefix due to bitvector implementation
+#ifdef _DEBUG_PREFIX_
+       mnstr_printf(cntxt->fdout,"#findprefix final %d \n", *bits);
+#else
+       (void) cntxt;
+#endif
+}
+
 #define Prefix(Prefix,Mask,X,Y,N) \
 { int k, m = 1; \
   for(k=0; k<N; k+=1, X>>=1, Y>>=1){\
@@ -201,6 +332,7 @@ MOSskip_prefix(Client cntxt, MOStask tas
 } 
 
 
+#define LOOKAHEAD  (limit <10? limit:10)
 // calculate the expected reduction 
 flt
 MOSestimate_prefix(Client cntxt, MOStask task)
@@ -217,22 +349,13 @@ MOSestimate_prefix(Client cntxt, MOStask
        if( task->elm >= 2)
        switch(size){
        case 1:
-               {       unsigned char *v = ((unsigned char*) task->src) + 
task->start, *w= v+1, val= *v,val2= *w, mask;
-                       // search first non-identical value
-                       for(i = 0;i < limit-1; i++, w++)
-                       if( *v != *w ){
-                               val2 = *w;
-                               break;
-                       }
-                       // all are the same?
-                       if ( i == limit -1)
-                               break;
-                       Prefix(prefixbits, mask, val, val2, 8);
+               {       unsigned char *v = ((unsigned char*) task->src) + 
task->start, *w= v+1, val= *v, mask;
+                       findPrefixBit(cntxt, v, LOOKAHEAD, &prefixbits, &mask);
                        if( prefixbits == 0)
                                break;
 
 #ifdef _DEBUG_PREFIX_
-            mnstr_printf(cntxt->fdout,"#prefix  estimate 1 %o %o val %d bits 
%d mask %o\n",
+            mnstr_printf(cntxt->fdout,"#prefix  estimate size 1 %o %o val %d 
bits %d mask %o\n",
                 *v,*w, val, prefixbits, mask);
 #endif
                        if( task->range[MOSAIC_PREFIX] > task->start +1 /* need 
at least two*/){
@@ -264,20 +387,12 @@ MOSestimate_prefix(Client cntxt, MOStask
                }
                break;
        case 2:
-               {       unsigned short *v = ((unsigned short*) task->src) + 
task->start, *w= v+1, val= *v,val2= *w, mask;
-                       // search first non-identical value
-                       for(i = 0;i < limit-1;i++, w++)
-                       if( *v != *w ){
-                               val2 = *w;
-                               break;
-                       }
-                       if ( i == limit-1)
-                               break;
-                       Prefix(prefixbits, mask, val, val2, 16);
+               {       unsigned short *v = ((unsigned short*) task->src) + 
task->start, *w= v+1, val= *v, mask;
+                       findPrefixSht(cntxt, v, LOOKAHEAD, &prefixbits, &mask);
                        if( prefixbits == 0)
                                break;
 #ifdef _DEBUG_PREFIX_
-            mnstr_printf(cntxt->fdout,"#prefix  estimate 2 %o %o elm "BUNFMT" 
val %d bits %d mask %o\n",
+            mnstr_printf(cntxt->fdout,"#prefix  estimate size 2 %u %o elm 
"BUNFMT" val %d bits %d mask %o\n",
                 *v,*w, i,val, prefixbits, mask);
 #endif
 
@@ -309,22 +424,14 @@ MOSestimate_prefix(Client cntxt, MOStask
                }
                break;
        case 4:
-               {       unsigned int *v = ((unsigned int*) task->src) + 
task->start, *w= v+1, val= *v,val2= *w, mask;
-                       // search first non-identical value
-                       for(i = 0;i < limit-1 ;i++, w++)
-                       if( *v != *w ){
-                               val2 = *w;
-                               break;
-                       }
-                       if ( i == limit-1)
-                               break;
-                       Prefix(prefixbits, mask, val, val2, 32);
+               {       unsigned int *v = ((unsigned int*) task->src) + 
task->start, *w= v+1, val= *v, mask;
+                       findPrefixInt(cntxt, v, LOOKAHEAD, &prefixbits,&mask);
                        if( prefixbits == 0)
                                break;
 
 #ifdef _DEBUG_PREFIX_
-            mnstr_printf(cntxt->fdout,"#prefix estimate 4 %o %o elm "BUNFMT" 
val %d bits %d mask %o\n",
-                *v,*w, i, val, prefixbits, mask);
+            mnstr_printf(cntxt->fdout,"#prefix estimate size 4 %o elm "BUNFMT" 
val %d bits %d mask %o\n",
+                *v, i, val, prefixbits, mask);
 #endif
                        if( task->range[MOSAIC_PREFIX] > task->start + 1){
                                bits = (task->range[MOSAIC_PREFIX] - 
task->start) * (32-prefixbits);
@@ -355,17 +462,9 @@ MOSestimate_prefix(Client cntxt, MOStask
                }
                break;
        case 8:
-               {       ulng *v = ((ulng*) task->src) + task->start, *w= v+1, 
val= *v,val2= *w, mask;
-                       // search first non-identical value
-                       for(i = 0; i < limit-1 ;i++, w++)
-                       if( *v != *w ){
-                               val2 = *w;
-                               break;
-                       }
-                       if ( i == limit-1 )
-                               break;
-                       Prefix(prefixbits, mask, val, val2, 32); // bitvector 
has limit of 32-bit fields
-                       if( prefixbits == 0 || 64-prefixbits >= 32)
+               {       ulng *v = ((ulng*) task->src) + task->start, *w= v+1, 
val= *v,mask;
+                       findPrefixLng(cntxt, v, LOOKAHEAD, &prefixbits,&mask);
+                       if( prefixbits == 0 )
                                break;
 
                        if( task->range[MOSAIC_PREFIX] > task->start + 1){
@@ -394,7 +493,7 @@ MOSestimate_prefix(Client cntxt, MOStask
                        factor = ( (flt)i * sizeof(lng))/ store;
                }
        }
-#ifdef _DEBUG_MOSAIC_
+#ifdef _DEBUG_PREFIX_
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to