Changeset: 5a6592348b31 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=5a6592348b31 Modified Files: monetdb5/extras/rdf/rdfschema.c monetdb5/extras/rdf/rdfschema.h sql/backends/monet5/sql.mx Branch: rdf Log Message:
Create tables corresponding to type-specific CSs. Each base CS table is divided into a default-type table and a non-default-type table. These two tables are then combined into one view. This has been checked with a test dataset. diffs (truncated from 790 to 300 lines): diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -558,6 +558,48 @@ str printCSrelWithMaxSet(CSset *freqCSse } +static +void setdefaultSubCSs(SubCSSet *subcsset, int num, BAT *sbat, oid *subjSubCSMap,oid *subjCSMap, char *subjdefaultMap){ + + int i; + int j; + int tmpmaxfreq; + int defaultidx; + BUN p,q; + BATiter si; + oid *sbt; + oid csId; + oid subId; + + for (i = 0; i < num; i++){ + if (subcsset[i].numSubCS != 0){ + tmpmaxfreq = 0; + defaultidx = -1; + for (j = 0; j < subcsset[i].numSubCS; j++){ + if (subcsset[i].freq[j] > tmpmaxfreq){ + tmpmaxfreq = subcsset[i].freq[j]; + defaultidx = j; + } + } + + //Update default value + subcsset[i].subCSs[defaultidx].isdefault = 1; + + } + } + + si = bat_iterator(sbat); + + BATloop(sbat, p, q){ + sbt = (oid *) BUNtloc(si, p); + csId = subjCSMap[*sbt]; + subId = subjSubCSMap[*sbt]; + //printf("csId = " BUNFMT " | subId = " BUNFMT " \n", csId, subId); + if (subcsset[csId].subCSs[subId].isdefault == 1){ + subjdefaultMap[*sbt] = 1; + } + } +} static void printSubCSInformation(SubCSSet *subcsset, BAT* freqBat, int num, char isWriteTofile, int freqThreshold){ @@ -602,7 +644,7 @@ void printSubCSInformation(SubCSSet *sub for (i = 0; i < num; i++){ if (subcsset[i].numSubCS != 0){ freq = (int *) Tloc(freqBat, i); - fprintf(fout, "CS " BUNFMT ": ", subcsset[i].csId); + fprintf(fout, "CS " BUNFMT " (Freq: %d) : ", subcsset[i].csId, *freq); if (*freq > freqThreshold){ fprintf(foutfreq, BUNFMT " ", subcsset[i].csId); @@ -610,7 +652,11 @@ void printSubCSInformation(SubCSSet *sub } numSubCSFilter = 0; for (j = 0; j < subcsset[i].numSubCS; j++){ - fprintf(fout, BUNFMT " 
(%d) ", subcsset[i].subCSs[j].subCSId, subcsset[i].freq[j]); + if (subcsset[i].subCSs[j].isdefault == 1) + fprintf(fout, "(default) "BUNFMT " (%d) ", subcsset[i].subCSs[j].subCSId, subcsset[i].freq[j]); + else + fprintf(fout, BUNFMT " (%d) ", subcsset[i].subCSs[j].subCSId, subcsset[i].freq[j]); + // Check frequent subCS which appears in > 1% if (*freq < subcsset[i].freq[j]*10){ @@ -640,6 +686,7 @@ SubCS* creatSubCS(oid subCSId, int numP, subcs->subCSId = subCSId; subcs->numSubTypes = numP; subcs->sign = subCSsign; + subcs->isdefault = 0; return subcs; } @@ -656,7 +703,7 @@ SubCSSet* createaSubCSSet(oid csId){ } static -SubCSSet* initCS_SubCSMap(oid numSubCSSet){ +SubCSSet* initCS_SubCSSets(oid numSubCSSet){ oid i; SubCSSet *subcssets = (SubCSSet*) malloc(sizeof(SubCSSet) * numSubCSSet); SubCSSet *subcsset; @@ -741,7 +788,7 @@ void addSubCStoSet(SubCSSet *subcsSet, S } static -oid addSubCS(char *buff, int numP, int csId, SubCSSet* csSubCSMap){ +oid addSubCS(char *buff, int numP, int csId, SubCSSet* csSubCSSet){ SubCSSet *subcsset; oid subCSsign; char isFound; @@ -749,7 +796,7 @@ oid addSubCS(char *buff, int numP, int c SubCS *subCS; - subcsset = &(csSubCSMap[csId]); + subcsset = &(csSubCSSet[csId]); // Check the duplication subCSsign = RDF_hash_Tyleslist(buff, numP); @@ -2499,7 +2546,7 @@ str RDFassignCSId(int *ret, BAT *sbat, B static str RDFrelationships(int *ret, BAT *sbat, BATiter si, BATiter pi, BATiter oi, - oid *subjCSMap, oid *subjSubCSMap, SubCSSet *csSubCSMap, CSrel *csrelSet, BUN maxSoid, int maxNumPwithDup){ + oid *subjCSMap, oid *subjSubCSMap, SubCSSet *csSubCSSet, CSrel *csrelSet, BUN maxSoid, int maxNumPwithDup){ BUN p, q; oid *sbt = 0, *obt, *pbt; @@ -2529,10 +2576,10 @@ str RDFrelationships(int *ret, BAT *sbat sbt = (oid *) BUNtloc(si, p); if (*sbt != curS){ if (p != 0){ /* Not the first S */ - returnSubCSid = addSubCS(buffTypes, numPwithDup, subjCSMap[curS], csSubCSMap); + returnSubCSid = addSubCS(buffTypes, numPwithDup, subjCSMap[curS], 
csSubCSSet); //Get the subCSId - subjSubCSMap[*sbt] = returnSubCSid; + subjSubCSMap[curS] = returnSubCSid; } curS = *sbt; @@ -2574,7 +2621,7 @@ str RDFrelationships(int *ret, BAT *sbat } /* Check for the last CS */ - returnSubCSid = addSubCS(buffTypes, numPwithDup, subjCSMap[*sbt], csSubCSMap); + returnSubCSid = addSubCS(buffTypes, numPwithDup, subjCSMap[*sbt], csSubCSSet); subjSubCSMap[*sbt] = returnSubCSid; free (buffTypes); @@ -2745,13 +2792,14 @@ int ontmetadataCount = 0; /* Extract CS from SPO triples table */ str -RDFextractCSwithTypes(int *ret, bat *sbatid, bat *pbatid, bat *obatid, bat *mapbatid, int *freqThreshold, void *_freqCSset, oid **subjCSMap, oid *maxCSoid){ +RDFextractCSwithTypes(int *ret, bat *sbatid, bat *pbatid, bat *obatid, bat *mapbatid, int *freqThreshold, void *_freqCSset, oid **subjCSMap, oid *maxCSoid, char **subjdefaultMap){ BAT *sbat = NULL, *pbat = NULL, *obat = NULL, *mbat = NULL; BATiter si, pi, oi; /*iterator for BAT of s,p,o columns in spo table */ CSBats *csBats; oid *subjSubCSMap; /* Store the corresponding CS sub Id for each subject */ + BUN *maxSoid; int maxNumProp = 0; int maxNumPwithDup = 0; @@ -2760,7 +2808,7 @@ RDFextractCSwithTypes(int *ret, bat *sba CSrel *csrelToMaxFreqSet, *csrelFromMaxFreqSet; CSrel *csrelBetweenMaxFreqSet; CSmergeRel *csRelBetweenMergeFreqSet; - SubCSSet *csSubCSMap; + SubCSSet *csSubCSSet; int* csIdFreqIdxMap; /* Map a CSId to a freqIdx. Should be removed in the future .... 
*/ @@ -2811,8 +2859,10 @@ RDFextractCSwithTypes(int *ret, bat *sba *subjCSMap = (oid *) malloc (sizeof(oid) * ((*maxSoid) + 1)); subjSubCSMap = (oid *) malloc (sizeof(oid) * ((*maxSoid) + 1)); + *subjdefaultMap = (char *) malloc (sizeof(char) * ((*maxSoid) + 1)); initArray(*subjCSMap, (*maxSoid) + 1, BUN_NONE); + initCharArray(*subjdefaultMap,(*maxSoid) + 1, 0); //Phase 1: Assign an ID for each CS @@ -2840,14 +2890,16 @@ RDFextractCSwithTypes(int *ret, bat *sba csrelSet = initCSrelset(*maxCSoid + 1); - csSubCSMap = initCS_SubCSMap(*maxCSoid +1); - - RDFrelationships(ret, sbat, si, pi, oi, *subjCSMap, subjSubCSMap, csSubCSMap, csrelSet, *maxSoid, maxNumPwithDup); + csSubCSSet = initCS_SubCSSets(*maxCSoid +1); + + RDFrelationships(ret, sbat, si, pi, oi, *subjCSMap, subjSubCSMap, csSubCSSet, csrelSet, *maxSoid, maxNumPwithDup); printCSrelSet(csrelSet,csFreqMap, csBats->freqBat, *maxCSoid + 1, 1, *freqThreshold); - printSubCSInformation(csSubCSMap, csBats->freqBat, *maxCSoid + 1, 1, *freqThreshold); + setdefaultSubCSs(csSubCSSet,*maxCSoid + 1, sbat, subjSubCSMap, *subjCSMap, *subjdefaultMap); + + printSubCSInformation(csSubCSSet, csBats->freqBat, *maxCSoid + 1, 1, *freqThreshold); printf("Number of frequent CSs is: %d \n", freqCSset->numCSadded); @@ -2911,7 +2963,7 @@ RDFextractCSwithTypes(int *ret, bat *sba free (superCSFreqCSMap); free (superCSMergeMaxCSMap); - freeCS_SubCSMapSet(csSubCSMap, *maxCSoid + 1); + freeCS_SubCSMapSet(csSubCSSet, *maxCSoid + 1); free(csIdFreqIdxMap); free(csRelBetweenMergeFreqSet); @@ -3033,14 +3085,26 @@ BAT* getOriginalOBat(BAT *obat){ return origobat; } +/* + * In case of using type-specific cs table, we use one more bit at the + * position sizeof(BUN)*8 - NBITS_FOR_CSID - 1 for specifying whether + * a subject has the default data types for its properties or not. 
+ * Thus, the way to calculate the table idx and base idx is changed + * */ static -void getTblidFromSoid(oid Soid, int *tbidx, oid *baseSoid){ +void getTblidFromSoid(oid Soid, int *tbidx, oid *baseSoid, char *isdefault){ //int freqCSid; + *isdefault = 0; - *tbidx = (int) ((Soid >> (sizeof(BUN)*8 - NBITS_FOR_CSID)) & ((1 << (NBITS_FOR_CSID-1)) - 1)) ; - - *baseSoid = Soid - ((oid) (*tbidx) << (sizeof(BUN)*8 - NBITS_FOR_CSID)); + *tbidx = (int) ((Soid >> (sizeof(BUN)*8 - NBITS_FOR_CSID)) & ((1 << (NBITS_FOR_CSID-1)) - 1)) ; + +#if CSTYPE_TABLE == 1 + *isdefault = (char) ((Soid >> (sizeof(BUN)*8 - NBITS_FOR_CSID -1)) & 1 ) ; +#endif + + *baseSoid = Soid - ((oid) (*tbidx * 2 + *isdefault) << (sizeof(BUN)*8 - NBITS_FOR_CSID -1)); + *tbidx = *tbidx - 1; //return freqCSid; @@ -3114,8 +3178,11 @@ void initCStablesAndIdxMapping(CStableSt cstablestat->obat = BATnew(TYPE_void, TYPE_oid, smallbatsz); cstablestat->lastInsertedS = (oid**) malloc(sizeof(oid*) * k); - cstablestat->lstcstable = (CStable*) malloc(sizeof(CStable) * k); + #if CSTYPE_TABLE == 1 + cstablestat->lastInsertedSEx = (oid**) malloc(sizeof(oid*) * k); + cstablestat->lstcstableEx = (CStable*) malloc(sizeof(CStable) * k); + #endif k = 0; for (i = 0; i < freqCSset->numCSadded; i++){ @@ -3126,10 +3193,19 @@ void initCStablesAndIdxMapping(CStableSt cstablestat->lastInsertedS[k] = (oid*) malloc(sizeof(oid) * tmpNumProp); cstablestat->lstcstable[k].numCol = tmpNumProp; cstablestat->lstcstable[k].colBats = (BAT**)malloc(sizeof(BAT*) * tmpNumProp); - + #if CSTYPE_TABLE == 1 + cstablestat->lastInsertedSEx[k] = (oid*) malloc(sizeof(oid) * tmpNumProp); + cstablestat->lstcstableEx[k].numCol = tmpNumProp; + cstablestat->lstcstableEx[k].colBats = (BAT**)malloc(sizeof(BAT*) * tmpNumProp); + #endif + for(j = 0; j < tmpNumProp; j++){ cstablestat->lstcstable[k].colBats[j] = BATnew(TYPE_void, TYPE_oid, smallbatsz); //TODO: use exact aount for each BAT + #if CSTYPE_TABLE == 1 + cstablestat->lstcstableEx[k].colBats[j] = 
BATnew(TYPE_void, TYPE_oid, smallbatsz); + #endif + } k++; @@ -3167,10 +3243,19 @@ void freeCStableStat(CStableStat* cstabl for (i = 0; i < cstablestat->numTables; i++){ free(cstablestat->lstbatid[i]); free(cstablestat->lastInsertedS[i]); + #if CSTYPE_TABLE == 1 + free(cstablestat->lastInsertedSEx[i]); + #endif for (j = 0; j < cstablestat->numPropPerTable[i];j++){ BBPunfix(cstablestat->lstcstable[i].colBats[j]->batCacheid); + #if CSTYPE_TABLE == 1 + BBPunfix(cstablestat->lstcstableEx[i].colBats[j]->batCacheid); + #endif } free(cstablestat->lstcstable[i].colBats); + #if CSTYPE_TABLE == 1 + free(cstablestat->lstcstableEx[i].colBats); + #endif } BBPunfix(cstablestat->pbat->batCacheid); BBPunfix(cstablestat->sbat->batCacheid); @@ -3178,6 +3263,10 @@ void freeCStableStat(CStableStat* cstabl free(cstablestat->lstbatid); free(cstablestat->lastInsertedS); free(cstablestat->lstcstable); + #if CSTYPE_TABLE == 1 + free(cstablestat->lastInsertedSEx); + free(cstablestat->lstcstableEx); + #endif free(cstablestat->numPropPerTable); _______________________________________________ checkin-list mailing list checkin-list@monetdb.org http://mail.monetdb.org/mailman/listinfo/checkin-list