Changeset: bbbfb9de755a for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=bbbfb9de755a Modified Files: monetdb5/modules/mal/mosaic.c monetdb5/modules/mal/mosaic.h monetdb5/modules/mal/mosaic_dictionary.c monetdb5/modules/mal/mosaic_dictionary.h Branch: mosaic Log Message:
Use a global column dictionary diffs (truncated from 617 to 300 lines): diff --git a/monetdb5/modules/mal/mosaic.c b/monetdb5/modules/mal/mosaic.c --- a/monetdb5/modules/mal/mosaic.c +++ b/monetdb5/modules/mal/mosaic.c @@ -284,7 +284,6 @@ MOScompressInternal(Client cntxt, int *r task->elm = BATcount(bsrc); task->size = bsrc->T->heap.free; task->timer = GDKusec(); - task->dictsize = task->elm < 20? 2: DICTSIZE; MOSinit(task,bcompress); MOSinitHeader(task); @@ -303,11 +302,11 @@ MOScompressInternal(Client cntxt, int *r task->elm = BATcount(bcompress); task->size = bcompress->T->heap.free; task->timer = GDKusec(); - task->dictsize = task->elm < 20? 2: DICTSIZE; MOSinit(task,bsrc); MOSinitHeader(task); } + MOScreatedictionary(cntxt,task); // always start with an EOL block MOSsetTag(task->blk,MOSAIC_EOL); @@ -582,7 +581,7 @@ MOSdecompressInternal(Client cntxt, int throw(MAL, "mosaic.decompress", "cannot decompress tail-VIEW"); } - bsrc = BATnew(b->htype,b->ttype,BATcount(b),TRANSIENT); + bsrc = BATnew(b->htype,b->ttype,BATcount(b)+ MosaicHdrSize,TRANSIENT); if ( bsrc == NULL) { BBPreleaseref(b->batCacheid); throw(MAL, "mosaic.decompress", MAL_MALLOC_FAIL); @@ -614,13 +613,11 @@ MOSdecompressInternal(Client cntxt, int MOSinit(task,bsrc); task->src = Tloc(b, BUNfirst(b)); task->timer = GDKusec(); - task->dictsize = BATcount(b) < 20? 2: DICTSIZE; } else { // create a local decompressed copy MOSinit(task,b); task->src = Tloc(bsrc, BUNfirst(bsrc)); task->timer = GDKusec(); - task->dictsize = BATcount(b) < 20? 2: DICTSIZE; } while(task->blk){ diff --git a/monetdb5/modules/mal/mosaic.h b/monetdb5/modules/mal/mosaic.h --- a/monetdb5/modules/mal/mosaic.h +++ b/monetdb5/modules/mal/mosaic.h @@ -39,7 +39,7 @@ #define MIN_INPUT_COUNT 1 /* The compressor kinds currently hardwired */ -#define MOSAIC_METHODS 8 +#define MOSAIC_METHODS 10 #define MOSAIC_NONE 0 // no compression at all #define MOSAIC_RLE 1 // use run-length encoding #define MOSAIC_DICT 2 // local dictionary encoding @@ -47,7 +47,8 @@ #define MOSAIC_LINEAR 4 // use an encoding for a linear sequence #define MOSAIC_VARIANCE 5 // adaptive dictionary over deltas #define MOSAIC_PREFIX 6 // prefix/postfix bitwise compression -#define MOSAIC_ZONE 7 // adaptive zone map over non-compressed data +#define MOSAIC_INDEX 7 // columnwise full dictionary index with bit indices +#define MOSAIC_ZONE 8 // adaptive zone map over non-compressed data #define MOSAIC_EOL 9 // marker for the last block //Compression should have a significant reduction to apply. @@ -63,6 +64,12 @@ typedef struct MOSAICHEADER{ int top; oid oidbase[MOSAICINDEX]; BUN offset[MOSAICINDEX]; + int dictsize; +#ifdef HAVE_HGE + hge dict[256]; +#else + lng dict[256]; +#endif } * MosaicHdr; // bit stuffed header block, currently 4 bytes wide @@ -99,8 +106,9 @@ typedef struct MOSTASK{ BAT *b; // source column BUN elm; // elements left to compress char *src; // read pointer into source + BAT *index; // collection of unique elements + BAT *freq; // frequency of these elements - int dictsize; // entries in a dictionary lng xsize,size;// original and compressed size lng timer; // compression time void *min, *max;// space for zones indices diff --git a/monetdb5/modules/mal/mosaic_dictionary.c b/monetdb5/modules/mal/mosaic_dictionary.c --- a/monetdb5/modules/mal/mosaic_dictionary.c +++ b/monetdb5/modules/mal/mosaic_dictionary.c @@ -38,15 +38,15 @@ MOSadvance_dictionary(Client cntxt, MOSt task->start += MOSgetCnt(task->blk); switch(ATOMstorage(task->type)){ //case TYPE_bte: CASE_bit: no compression achievable - case TYPE_sht: task->blk = (MosaicBlk)( ((char*)task->blk) + wordaligned(2* MosaicBlkSize + task->dictsize * sizeof(sht)+ sizeof(bte) * MOSgetCnt(task->blk),sht)); break; - case TYPE_int: task->blk = (MosaicBlk)( ((char*)task->blk) + wordaligned(2* MosaicBlkSize + task->dictsize * sizeof(int)+ sizeof(bte) * MOSgetCnt(task->blk),int)); break; - case TYPE_lng: task->blk = (MosaicBlk)( ((char*)task->blk) + wordaligned(2* MosaicBlkSize + task->dictsize * sizeof(lng)+ sizeof(bte) * MOSgetCnt(task->blk),lng)); break; - case TYPE_oid: task->blk = (MosaicBlk)( ((char*)task->blk) + wordaligned(2* MosaicBlkSize + task->dictsize * sizeof(oid)+ sizeof(bte) * MOSgetCnt(task->blk),oid)); break; - case TYPE_wrd: task->blk = (MosaicBlk)( ((char*)task->blk) + wordaligned(2* MosaicBlkSize + task->dictsize * sizeof(wrd)+ sizeof(bte) * MOSgetCnt(task->blk),wrd)); break; - case TYPE_flt: task->blk = (MosaicBlk)( ((char*)task->blk) + wordaligned(2* MosaicBlkSize + task->dictsize * sizeof(flt)+ sizeof(bte) * MOSgetCnt(task->blk),flt)); break; - case TYPE_dbl: task->blk = (MosaicBlk)( ((char*)task->blk) + wordaligned(2* MosaicBlkSize + task->dictsize * sizeof(dbl)+ sizeof(bte) * MOSgetCnt(task->blk),dbl)); break; + case TYPE_sht: task->blk = (MosaicBlk)( ((char*)task->blk) + wordaligned(MosaicBlkSize + sizeof(bte) * MOSgetCnt(task->blk),sht)); break; + case TYPE_int: task->blk = (MosaicBlk)( ((char*)task->blk) + wordaligned(MosaicBlkSize + sizeof(bte) * MOSgetCnt(task->blk),int)); break; + case TYPE_lng: task->blk = (MosaicBlk)( ((char*)task->blk) + wordaligned(MosaicBlkSize + sizeof(bte) * MOSgetCnt(task->blk),lng)); break; + case TYPE_oid: task->blk = (MosaicBlk)( ((char*)task->blk) + wordaligned(MosaicBlkSize + sizeof(bte) * MOSgetCnt(task->blk),oid)); break; + case TYPE_wrd: task->blk = (MosaicBlk)( ((char*)task->blk) + wordaligned(MosaicBlkSize + sizeof(bte) * MOSgetCnt(task->blk),wrd)); break; + case TYPE_flt: task->blk = (MosaicBlk)( ((char*)task->blk) + wordaligned(MosaicBlkSize + sizeof(bte) * MOSgetCnt(task->blk),flt)); break; + case TYPE_dbl: task->blk = (MosaicBlk)( ((char*)task->blk) + wordaligned(MosaicBlkSize + sizeof(bte) * MOSgetCnt(task->blk),dbl)); break; #ifdef HAVE_HGE - case TYPE_hge: task->blk = (MosaicBlk)( ((char*)task->blk) + wordaligned(2* MosaicBlkSize + task->dictsize * sizeof(hge)+ sizeof(bte) * MOSgetCnt(task->blk),hge)); break; + case TYPE_hge: task->blk = (MosaicBlk)( ((char*)task->blk) + wordaligned(MosaicBlkSize + sizeof(bte) * MOSgetCnt(task->blk),hge)); break; #endif } } @@ -55,40 +55,38 @@ MOSadvance_dictionary(Client cntxt, MOSt void MOSdump_dictionary(Client cntxt, MOStask task) { - MosaicBlk blk= task->blk; + MosaicHdr hdr= task->hdr; int i; - int *size; - void *val = (void*)(((char*) blk) + MosaicBlkSize); + void *val = (void*)hdr->dict; - size = (int*) (((char*)blk) + MosaicBlkSize); - mnstr_printf(cntxt->fdout,"#dict " BUNFMT" ", MOSgetCnt(blk)); - switch(task->type){ + mnstr_printf(cntxt->fdout,"#"); + switch(ATOMstorage(task->type)){ case TYPE_sht: - for(i=0; i< *size; i++) - mnstr_printf(cntxt->fdout,"sht [%d] %hd",i, ((sht*) val)[i]); break; + for(i=0; i< hdr->dictsize; i++) + mnstr_printf(cntxt->fdout,"sht [%d] %hd ",i, ((sht*) val)[i]); break; case TYPE_int: - for(i=0; i< *size; i++) - mnstr_printf(cntxt->fdout,"int [%d] %d",i, ((int*) val)[i]); break; + for(i=0; i< hdr->dictsize; i++) + mnstr_printf(cntxt->fdout,"int [%d] %d ",i, ((int*) val)[i]); break; case TYPE_oid: - for(i=0; i< *size; i++) + for(i=0; i< hdr->dictsize; i++) mnstr_printf(cntxt->fdout,"oid [%d] "OIDFMT, i, ((oid*) val)[i]); break; case TYPE_lng: - for(i=0; i< *size; i++) + for(i=0; i< hdr->dictsize; i++) mnstr_printf(cntxt->fdout,"lng [%d] "LLFMT, i, ((lng*) val)[i]); break; #ifdef HAVE_HGE case TYPE_hge: - for(i=0; i< *size; i++) - mnstr_printf(cntxt->fdout,"hge [%d] %.40g", i, (dbl) ((hge*) val)[i]); break; + for(i=0; i< hdr->dictsize; i++) + mnstr_printf(cntxt->fdout,"hge [%d] %.40g ", i, (dbl) ((hge*) val)[i]); break; #endif case TYPE_wrd: - for(i=0; i< *size; i++) + for(i=0; i< hdr->dictsize; i++) mnstr_printf(cntxt->fdout,"wrd [%d] "SZFMT, i, ((wrd*) val)[i]); break; case TYPE_flt: - for(i=0; i< *size; i++) - mnstr_printf(cntxt->fdout,"flt [%d] %f",i, ((flt*) val)[i]); break; + for(i=0; i< hdr->dictsize; i++) + mnstr_printf(cntxt->fdout,"flt [%d] %f ",i, ((flt*) val)[i]); break; case TYPE_dbl: - for(i=0; i< *size; i++) - mnstr_printf(cntxt->fdout,"dbl [%d] %g",i, ((dbl*) val)[i]); break; + for(i=0; i< hdr->dictsize; i++) + mnstr_printf(cntxt->fdout,"dbl [%d] %g ",i, ((dbl*) val)[i]); break; } mnstr_printf(cntxt->fdout,"\n"); } @@ -103,38 +101,116 @@ MOSskip_dictionary(Client cntxt, MOStask #define estimateDict(TPE)\ { TPE *val = (TPE*)task->src;\ - TPE *dict = (TPE*)GDKzalloc(sizeof(TPE) * task->dictsize);\ - if( dict== NULL)\ - return 1.0;\ for(i =0; i<task->elm; i++, val++){\ - for(j= 0; j< size; j++)\ - if( dict[j] == *val) {cnt++;break;}\ - if ( j == size){\ - if ( size == task->dictsize)\ - break;\ - dict[j] = *val;\ - size++;\ - cnt++;\ - }\ + for(j= 0; j< hdr->dictsize; j++)\ + if( hdr->dict[j] == *val) break;\ }\ - GDKfree(dict);\ if ( i > MOSlimit() ) i = MOSlimit();\ - if(i) factor = (flt) ((int)i * sizeof(int)) / wordaligned(2* MosaicBlkSize + size * sizeof(TPE)+ sizeof(bte) * MOSgetCnt(task->blk),TPE);\ + if(i) factor = (flt) ((int)i * sizeof(int)) / wordaligned( MosaicBlkSize + i,TPE);\ } +// store it in the compressed heap header directly +// filter out the most frequent ones +#define makeDict(TPE)\ +{ TPE *val = (TPE*)task->src;\ + TPE *dict = (TPE*)hdr->dict,v;\ + for(i =0; i< (int) task->elm; i++, val++){\ + for(j= 0; j< hdr->dictsize; j++)\ + if( dict[j] == *val) break;\ + if ( j == hdr->dictsize){\ + if ( hdr->dictsize == 256){\ + int min = 0;\ + for(j=1;j<256;j++)\ + if( cnt[min] <cnt[j]) min = j;\ + j=min;\ + cnt[j]=0;\ + break;\ + }\ + dict[j] = *val;\ + cnt[j]++;\ + hdr->dictsize++;\ + } else\ + cnt[j]++;\ + }\ + for(i=0; i< hdr->dictsize; i++)\ + for(j=i+1; j< hdr->dictsize; j++)\ + if(dict[i] >dict[j]){\ + v= dict[i];\ + dict[i] = dict[j];\ + dict[j] = v;\ + }\ +} + +void +MOScreatedictionary(Client cntxt, MOStask task) +{ int i, j; + MosaicHdr hdr = task->hdr; + lng cnt[256]; + + (void) cntxt; + for(j=0;j<256;j++) + cnt[j]=0; + hdr->dictsize = 0; + switch(ATOMstorage(task->type)){ + case TYPE_sht: makeDict(sht); break; + case TYPE_lng: makeDict(lng); break; + case TYPE_oid: makeDict(oid); break; + case TYPE_wrd: makeDict(wrd); break; + case TYPE_flt: makeDict(flt); break; + case TYPE_dbl: makeDict(dbl); break; +#ifdef HAVE_HGE + case TYPE_hge: makeDict(hge); break; +#endif + case TYPE_int: + { int *val = (int*)task->src; + int *dict = (int*)hdr->dict,v; + + for(i =0; i< (int) task->elm; i++, val++){ + for(j= 0; j< hdr->dictsize; j++) + if( dict[j] == *val) break; + if ( j == hdr->dictsize){ + if ( hdr->dictsize == 256){ + int min = 0; + // select low frequent candidate + for(j=1;j<256;j++) + if( cnt[min] <cnt[j]) min = j; + j=min; + cnt[j]=0; + break; + } + dict[j] = *val; + cnt[j]++; + hdr->dictsize++; + } else + cnt[j]++; + } + // sort it + for(i=0; i< hdr->dictsize; i++) + for(j=i+1; j< hdr->dictsize; j++) + if(dict[i] >dict[j]){ + v= dict[i]; + dict[i] = dict[j]; + dict[j] = v; + } + } + } +#ifdef _DEBUG_MOSAIC_ + MOSdump_dictionary(cntxt, task); +#endif +} // calculate the expected reduction using DICT in terms of elements compressed flt MOSestimate_dictionary(Client cntxt, MOStask task) { BUN i = -1; - int cnt =0,j; - int size=0; + int j; _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list