Changeset: bbbfb9de755a for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=bbbfb9de755a
Modified Files:
        monetdb5/modules/mal/mosaic.c
        monetdb5/modules/mal/mosaic.h
        monetdb5/modules/mal/mosaic_dictionary.c
        monetdb5/modules/mal/mosaic_dictionary.h
Branch: mosaic
Log Message:

Use a global column dictionary


diffs (truncated from 617 to 300 lines):

diff --git a/monetdb5/modules/mal/mosaic.c b/monetdb5/modules/mal/mosaic.c
--- a/monetdb5/modules/mal/mosaic.c
+++ b/monetdb5/modules/mal/mosaic.c
@@ -284,7 +284,6 @@ MOScompressInternal(Client cntxt, int *r
                task->elm = BATcount(bsrc);
                task->size = bsrc->T->heap.free;
                task->timer = GDKusec();
-               task->dictsize = task->elm < 20? 2: DICTSIZE;
 
                MOSinit(task,bcompress);
                MOSinitHeader(task);
@@ -303,11 +302,11 @@ MOScompressInternal(Client cntxt, int *r
                task->elm = BATcount(bcompress);
                task->size = bcompress->T->heap.free;
                task->timer = GDKusec();
-               task->dictsize = task->elm < 20? 2: DICTSIZE;
 
                MOSinit(task,bsrc);
                MOSinitHeader(task);
        }
+       MOScreatedictionary(cntxt,task);
        // always start with an EOL block
        MOSsetTag(task->blk,MOSAIC_EOL);
 
@@ -582,7 +581,7 @@ MOSdecompressInternal(Client cntxt, int 
                throw(MAL, "mosaic.decompress", "cannot decompress tail-VIEW");
        }
 
-       bsrc = BATnew(b->htype,b->ttype,BATcount(b),TRANSIENT);
+       bsrc = BATnew(b->htype,b->ttype,BATcount(b)+ MosaicHdrSize,TRANSIENT);
        if ( bsrc == NULL) {
                BBPreleaseref(b->batCacheid);
                throw(MAL, "mosaic.decompress", MAL_MALLOC_FAIL);
@@ -614,13 +613,11 @@ MOSdecompressInternal(Client cntxt, int 
                MOSinit(task,bsrc);
                task->src = Tloc(b, BUNfirst(b));
                task->timer = GDKusec();
-               task->dictsize = BATcount(b) < 20? 2: DICTSIZE;
        } else { 
                // create a local decompressed copy
                MOSinit(task,b);
                task->src = Tloc(bsrc, BUNfirst(bsrc));
                task->timer = GDKusec();
-               task->dictsize = BATcount(b) < 20? 2: DICTSIZE;
        } 
 
        while(task->blk){
diff --git a/monetdb5/modules/mal/mosaic.h b/monetdb5/modules/mal/mosaic.h
--- a/monetdb5/modules/mal/mosaic.h
+++ b/monetdb5/modules/mal/mosaic.h
@@ -39,7 +39,7 @@
 #define MIN_INPUT_COUNT 1
 
 /* The compressor kinds currently hardwired */
-#define MOSAIC_METHODS 8
+#define MOSAIC_METHODS 10
 #define MOSAIC_NONE     0              // no compression at all
 #define MOSAIC_RLE      1              // use run-length encoding
 #define MOSAIC_DICT     2              // local dictionary encoding
@@ -47,7 +47,8 @@
 #define MOSAIC_LINEAR  4               // use an encoding for a linear sequence
 #define MOSAIC_VARIANCE        5               // adaptive dictionary over 
deltas
 #define MOSAIC_PREFIX  6               // prefix/postfix bitwise compression
-#define MOSAIC_ZONE            7               // adaptive zone map over 
non-compressed data
+#define MOSAIC_INDEX   7               // columnwise full dictionary index 
with bit indices
+#define MOSAIC_ZONE            8               // adaptive zone map over 
non-compressed data
 #define MOSAIC_EOL             9               // marker for the last block
 
 //Compression should have a significant reduction to apply.
@@ -63,6 +64,12 @@ typedef struct MOSAICHEADER{
        int top;
        oid oidbase[MOSAICINDEX];
        BUN offset[MOSAICINDEX];
+       int dictsize;
+#ifdef HAVE_HGE
+       hge dict[256];
+#else
+       lng dict[256];
+#endif
 } * MosaicHdr;
 
 // bit stuffed header block, currently 4 bytes wide
@@ -99,8 +106,9 @@ typedef struct MOSTASK{
        BAT *b;                 // source column
        BUN     elm;            // elements left to compress
        char *src;              // read pointer into source
+       BAT *index;             // collection of unique elements
+       BAT *freq;              // frequency of these elements
 
-       int dictsize;   // entries in a dictionary 
        lng  xsize,size;// original and compressed size
        lng timer;              // compression time
        void *min, *max;// space for zones indices
diff --git a/monetdb5/modules/mal/mosaic_dictionary.c 
b/monetdb5/modules/mal/mosaic_dictionary.c
--- a/monetdb5/modules/mal/mosaic_dictionary.c
+++ b/monetdb5/modules/mal/mosaic_dictionary.c
@@ -38,15 +38,15 @@ MOSadvance_dictionary(Client cntxt, MOSt
        task->start += MOSgetCnt(task->blk);
        switch(ATOMstorage(task->type)){
        //case TYPE_bte: CASE_bit: no compression achievable
-       case TYPE_sht: task->blk = (MosaicBlk)( ((char*)task->blk) + 
wordaligned(2* MosaicBlkSize + task->dictsize * sizeof(sht)+ sizeof(bte) * 
MOSgetCnt(task->blk),sht)); break;
-       case TYPE_int: task->blk = (MosaicBlk)( ((char*)task->blk) + 
wordaligned(2* MosaicBlkSize + task->dictsize * sizeof(int)+ sizeof(bte) * 
MOSgetCnt(task->blk),int)); break;
-       case TYPE_lng: task->blk = (MosaicBlk)( ((char*)task->blk) + 
wordaligned(2* MosaicBlkSize + task->dictsize * sizeof(lng)+ sizeof(bte) * 
MOSgetCnt(task->blk),lng)); break;
-       case TYPE_oid: task->blk = (MosaicBlk)( ((char*)task->blk) + 
wordaligned(2* MosaicBlkSize + task->dictsize * sizeof(oid)+ sizeof(bte) * 
MOSgetCnt(task->blk),oid)); break;
-       case TYPE_wrd: task->blk = (MosaicBlk)( ((char*)task->blk) + 
wordaligned(2* MosaicBlkSize + task->dictsize * sizeof(wrd)+ sizeof(bte) * 
MOSgetCnt(task->blk),wrd)); break;
-       case TYPE_flt: task->blk = (MosaicBlk)( ((char*)task->blk) + 
wordaligned(2* MosaicBlkSize + task->dictsize * sizeof(flt)+ sizeof(bte) * 
MOSgetCnt(task->blk),flt)); break;
-       case TYPE_dbl: task->blk = (MosaicBlk)( ((char*)task->blk) + 
wordaligned(2* MosaicBlkSize + task->dictsize * sizeof(dbl)+ sizeof(bte) * 
MOSgetCnt(task->blk),dbl)); break;
+       case TYPE_sht: task->blk = (MosaicBlk)( ((char*)task->blk) + 
wordaligned(MosaicBlkSize + sizeof(bte) * MOSgetCnt(task->blk),sht)); break;
+       case TYPE_int: task->blk = (MosaicBlk)( ((char*)task->blk) + 
wordaligned(MosaicBlkSize + sizeof(bte) * MOSgetCnt(task->blk),int)); break;
+       case TYPE_lng: task->blk = (MosaicBlk)( ((char*)task->blk) + 
wordaligned(MosaicBlkSize + sizeof(bte) * MOSgetCnt(task->blk),lng)); break;
+       case TYPE_oid: task->blk = (MosaicBlk)( ((char*)task->blk) + 
wordaligned(MosaicBlkSize + sizeof(bte) * MOSgetCnt(task->blk),oid)); break;
+       case TYPE_wrd: task->blk = (MosaicBlk)( ((char*)task->blk) + 
wordaligned(MosaicBlkSize + sizeof(bte) * MOSgetCnt(task->blk),wrd)); break;
+       case TYPE_flt: task->blk = (MosaicBlk)( ((char*)task->blk) + 
wordaligned(MosaicBlkSize + sizeof(bte) * MOSgetCnt(task->blk),flt)); break;
+       case TYPE_dbl: task->blk = (MosaicBlk)( ((char*)task->blk) + 
wordaligned(MosaicBlkSize + sizeof(bte) * MOSgetCnt(task->blk),dbl)); break;
 #ifdef HAVE_HGE
-       case TYPE_hge: task->blk = (MosaicBlk)( ((char*)task->blk) + 
wordaligned(2* MosaicBlkSize + task->dictsize * sizeof(hge)+ sizeof(bte) * 
MOSgetCnt(task->blk),hge)); break;
+       case TYPE_hge: task->blk = (MosaicBlk)( ((char*)task->blk) + 
wordaligned(MosaicBlkSize + sizeof(bte) * MOSgetCnt(task->blk),hge)); break;
 #endif
        }
 }
@@ -55,40 +55,38 @@ MOSadvance_dictionary(Client cntxt, MOSt
 void
 MOSdump_dictionary(Client cntxt, MOStask task)
 {
-       MosaicBlk blk= task->blk;
+       MosaicHdr hdr= task->hdr;
        int i;
-       int *size;
-       void *val = (void*)(((char*) blk) + MosaicBlkSize);
+       void *val = (void*)hdr->dict;
 
-       size = (int*) (((char*)blk) + MosaicBlkSize);
-       mnstr_printf(cntxt->fdout,"#dict " BUNFMT" ", MOSgetCnt(blk));
-       switch(task->type){
+       mnstr_printf(cntxt->fdout,"#");
+       switch(ATOMstorage(task->type)){
        case TYPE_sht:
-               for(i=0; i< *size; i++)
-               mnstr_printf(cntxt->fdout,"sht [%d] %hd",i, ((sht*) val)[i]); 
break;
+               for(i=0; i< hdr->dictsize; i++)
+               mnstr_printf(cntxt->fdout,"sht [%d] %hd ",i, ((sht*) val)[i]); 
break;
        case TYPE_int:
-               for(i=0; i< *size; i++)
-               mnstr_printf(cntxt->fdout,"int [%d] %d",i, ((int*) val)[i]); 
break;
+               for(i=0; i< hdr->dictsize; i++)
+               mnstr_printf(cntxt->fdout,"int [%d] %d ",i, ((int*) val)[i]); 
break;
        case  TYPE_oid:
-               for(i=0; i< *size; i++)
+               for(i=0; i< hdr->dictsize; i++)
                mnstr_printf(cntxt->fdout,"oid [%d] "OIDFMT, i, ((oid*) 
val)[i]); break;
        case  TYPE_lng:
-               for(i=0; i< *size; i++)
+               for(i=0; i< hdr->dictsize; i++)
                mnstr_printf(cntxt->fdout,"lng [%d] "LLFMT, i, ((lng*) 
val)[i]); break;
 #ifdef HAVE_HGE
        case  TYPE_hge:
-               for(i=0; i< *size; i++)
-               mnstr_printf(cntxt->fdout,"hge [%d] %.40g", i, (dbl) ((hge*) 
val)[i]); break;
+               for(i=0; i< hdr->dictsize; i++)
+               mnstr_printf(cntxt->fdout,"hge [%d] %.40g ", i, (dbl) ((hge*) 
val)[i]); break;
 #endif
        case  TYPE_wrd:
-               for(i=0; i< *size; i++)
+               for(i=0; i< hdr->dictsize; i++)
                mnstr_printf(cntxt->fdout,"wrd [%d] "SZFMT, i, ((wrd*) 
val)[i]); break;
        case TYPE_flt:
-               for(i=0; i< *size; i++)
-               mnstr_printf(cntxt->fdout,"flt [%d] %f",i, ((flt*) val)[i]); 
break;
+               for(i=0; i< hdr->dictsize; i++)
+               mnstr_printf(cntxt->fdout,"flt [%d] %f ",i, ((flt*) val)[i]); 
break;
        case TYPE_dbl:
-               for(i=0; i< *size; i++)
-               mnstr_printf(cntxt->fdout,"dbl [%d] %g",i, ((dbl*) val)[i]); 
break;
+               for(i=0; i< hdr->dictsize; i++)
+               mnstr_printf(cntxt->fdout,"dbl [%d] %g ",i, ((dbl*) val)[i]); 
break;
        }
        mnstr_printf(cntxt->fdout,"\n");
 }
@@ -103,38 +101,116 @@ MOSskip_dictionary(Client cntxt, MOStask
 
 #define estimateDict(TPE)\
 {      TPE *val = (TPE*)task->src;\
-       TPE *dict = (TPE*)GDKzalloc(sizeof(TPE) * task->dictsize);\
-       if( dict== NULL)\
-               return 1.0;\
        for(i =0; i<task->elm; i++, val++){\
-               for(j= 0; j< size; j++)\
-                       if( dict[j] == *val) {cnt++;break;}\
-               if ( j == size){\
-                       if ( size == task->dictsize)\
-                               break;\
-                       dict[j] = *val;\
-                       size++;\
-                       cnt++;\
-               }\
+               for(j= 0; j< hdr->dictsize; j++)\
+                       if( hdr->dict[j] == *val) break;\
        }\
-       GDKfree(dict);\
        if ( i > MOSlimit() ) i = MOSlimit();\
-       if(i) factor = (flt) ((int)i * sizeof(int)) / wordaligned(2* 
MosaicBlkSize + size * sizeof(TPE)+ sizeof(bte) * MOSgetCnt(task->blk),TPE);\
+       if(i) factor = (flt) ((int)i * sizeof(int)) / wordaligned( 
MosaicBlkSize + i,TPE);\
 }
 
+// store it in the compressed heap header directly
+// filter out the most frequent ones
+#define makeDict(TPE)\
+{      TPE *val = (TPE*)task->src;\
+       TPE *dict = (TPE*)hdr->dict,v;\
+       for(i =0; i< (int) task->elm; i++, val++){\
+               for(j= 0; j< hdr->dictsize; j++)\
+                       if( dict[j] == *val) break;\
+               if ( j == hdr->dictsize){\
+                       if ( hdr->dictsize == 256){\
+                               int min = 0;\
+                               for(j=1;j<256;j++)\
+                                       if( cnt[min] <cnt[j]) min = j;\
+                               j=min;\
+                               cnt[j]=0;\
+                               break;\
+                       }\
+                       dict[j] = *val;\
+                       cnt[j]++;\
+                       hdr->dictsize++;\
+               } else\
+                       cnt[j]++;\
+       }\
+       for(i=0; i< hdr->dictsize; i++)\
+               for(j=i+1; j< hdr->dictsize; j++)\
+                       if(dict[i] >dict[j]){\
+                               v= dict[i];\
+                               dict[i] = dict[j];\
+                               dict[j] = v;\
+                       }\
+}
+
+void
+MOScreatedictionary(Client cntxt, MOStask task)
+{  int i, j;
+       MosaicHdr hdr = task->hdr;
+       lng cnt[256];
+
+       (void) cntxt;
+       for(j=0;j<256;j++)
+               cnt[j]=0;
+       hdr->dictsize = 0;
+       switch(ATOMstorage(task->type)){
+       case TYPE_sht: makeDict(sht); break;
+       case TYPE_lng: makeDict(lng); break;
+       case TYPE_oid: makeDict(oid); break;
+       case TYPE_wrd: makeDict(wrd); break;
+       case TYPE_flt: makeDict(flt); break;
+       case TYPE_dbl: makeDict(dbl); break;
+#ifdef HAVE_HGE
+       case TYPE_hge: makeDict(hge); break;
+#endif
+       case TYPE_int:
+               {       int *val = (int*)task->src;
+                       int *dict = (int*)hdr->dict,v;
+
+                       for(i =0; i< (int) task->elm; i++, val++){
+                               for(j= 0; j< hdr->dictsize; j++)
+                                       if( dict[j] == *val) break;
+                               if ( j == hdr->dictsize){
+                                       if ( hdr->dictsize == 256){
+                                               int min = 0;
+                                               // select low frequent candidate
+                                               for(j=1;j<256;j++)
+                                                       if( cnt[min] <cnt[j]) 
min = j;
+                                               j=min;
+                                               cnt[j]=0;
+                                               break;
+                                       }
+                                       dict[j] = *val;
+                                       cnt[j]++;
+                                       hdr->dictsize++;
+                               } else
+                                       cnt[j]++;
+                       }
+                       // sort it
+                       for(i=0; i< hdr->dictsize; i++)
+                               for(j=i+1; j< hdr->dictsize; j++)
+                                       if(dict[i] >dict[j]){
+                                               v= dict[i];
+                                               dict[i] = dict[j];
+                                               dict[j] = v;
+                                       }
+               }
+       }
+#ifdef _DEBUG_MOSAIC_
+       MOSdump_dictionary(cntxt, task);
+#endif
+}
 // calculate the expected reduction using DICT in terms of elements compressed
 flt
 MOSestimate_dictionary(Client cntxt, MOStask task)
 {      BUN i = -1;
-       int cnt =0,j;
-       int size=0;
+       int j;
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to