Changeset: 268c8c805182 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=268c8c805182
Modified Files:
	monetdb5/extras/rdf/rdfschema.c
	monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Collect statistics on CS properties' types which will be used for creating relational columns. Optimize and fix bug while using labels for finding maxCS and mergedCS diffs (truncated from 460 to 300 lines): diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -683,6 +683,86 @@ void printSubCSInformation(SubCSSet *sub } } +/* + * Init property types for each CS in FreqCSset (after merging) + * For each property, init with all possible types (MULTIVALUES + 1)) + * + * */ +static +CSPropTypes* initCSPropTypes(CSset* freqCSset, int numMergedCS){ + int numFreqCS = freqCSset->numCSadded; + int i, j, k ; + int id; + + CSPropTypes* csPropTypes = (CSPropTypes*)GDKmalloc(sizeof(CSPropTypes) * numMergedCS); + + id = 0; + for (i = 0; i < numFreqCS; i++){ + if (freqCSset->items[i].parentFreqIdx == -1){ // Only use the maximum or merge CS + csPropTypes[id].freqCSId = i; + csPropTypes[id].numProp = freqCSset->items[i].numProp; + csPropTypes[id].lstPropTypes = (PropTypes*) GDKmalloc(sizeof(PropTypes) * csPropTypes[id].numProp); + for (j = 0; j < csPropTypes[id].numProp; j++){ + csPropTypes[id].lstPropTypes[j].prop = freqCSset->items[i].lstProp[j]; + csPropTypes[id].lstPropTypes[j].numType = MULTIVALUES + 1; + csPropTypes[id].lstPropTypes[j].lstTypes = (char*)GDKmalloc(sizeof(char) * csPropTypes[id].lstPropTypes[j].numType); + csPropTypes[id].lstPropTypes[j].lstFreq = (int*)GDKmalloc(sizeof(int) * csPropTypes[id].lstPropTypes[j].numType); + for (k = 0; k < csPropTypes[id].lstPropTypes[j].numType; k++){ + csPropTypes[id].lstPropTypes[j].lstFreq[k] = 0; + } + + } + + id++; + } + } + + assert(id == numMergedCS); + + return csPropTypes; +} + + +/* + * Add types of properties + * Note that the property list is sorted by prop's oids + * E.g., buffP = {3, 5, 7} + * csPropTypes[tbIdx] contains properties {1,3,4,5,7} with types for each property and frequency of each <property, type> + * 
*/ +static +void addPropTypes(char *buffTypes, oid* buffP, int numP, int csId, int* csTblIdxMapping, CSPropTypes* csPropTypes){ + int i,j; + int tblId = csTblIdxMapping[csId]; + + j = 0; + if (tblId != -1){ + for (i = 0; i < numP; i++){ + while (csPropTypes[tblId].lstPropTypes[j].prop != buffP[i]){ + j++; + } + //j is position of the property buffP[i] in csPropTypes[tblId] + csPropTypes[tblId].lstPropTypes[j].lstFreq[(int)buffTypes[i]]++; + + } + } +} + +static +void freeCSPropTypes(CSPropTypes* csPropTypes, int numCS){ + int i,j; + + for (i = 0; i < numCS; i++){ + if (csPropTypes[i].freqCSId != -1){ + for (j = 0; j < csPropTypes[i].numProp; j++){ + free(csPropTypes[i].lstPropTypes[j].lstTypes); + free(csPropTypes[i].lstPropTypes[j].lstFreq); + } + free(csPropTypes[i].lstPropTypes); + } + } + free(csPropTypes); +} + static SubCS* creatSubCS(oid subCSId, int numP, char* buff, oid subCSsign){ SubCS *subcs = (SubCS*) malloc(sizeof(SubCS)); @@ -1854,6 +1934,7 @@ void getMaximumFreqCSs(CSset *freqCSset, int* coverage; int* freq; char isLabelComparable = 0; + char isDiffLabel = 0; (void) labels; (void) isLabelComparable; @@ -1866,34 +1947,30 @@ void getMaximumFreqCSs(CSset *freqCSset, if (strcmp(labels[i].name, "DUMMY") != 0) isLabelComparable = 1; for (j = (i+1); j < numFreqCS; j++){ - if (freqCSset->items[j].numProp > freqCSset->items[i].numProp){ - if (isSubset(freqCSset->items[j].lstProp, freqCSset->items[i].lstProp, - freqCSset->items[j].numProp,freqCSset->items[i].numProp) == 1) { - /* CSj is a superset of CSi */ - #if USE_LABEL_FINDING_MAXCS - if (isLabelComparable == 1 && strcmp(labels[i].name, labels[j].name) == 0) { - freqCSset->items[i].parentFreqIdx = j; - break; + isDiffLabel = 0; + #if USE_LABEL_FINDING_MAXCS + if (isLabelComparable == 0 || strcmp(labels[i].name, labels[j].name) != 0) { + isDiffLabel = 1; + } + #endif + + if (isDiffLabel == 0){ + if (freqCSset->items[j].numProp > freqCSset->items[i].numProp){ + if (isSubset(freqCSset->items[j].lstProp, 
freqCSset->items[i].lstProp, + freqCSset->items[j].numProp,freqCSset->items[i].numProp) == 1) { + /* CSj is a superset of CSi */ + freqCSset->items[i].parentFreqIdx = j; + break; } - #else - freqCSset->items[i].parentFreqIdx = j; - #endif - break; } - } - else if (freqCSset->items[j].numProp < freqCSset->items[i].numProp){ - if (isSubset(freqCSset->items[i].lstProp, freqCSset->items[j].lstProp, - freqCSset->items[i].numProp,freqCSset->items[j].numProp) == 1) { - /* CSj is a subset of CSi */ - #if USE_LABEL_FINDING_MAXCS - if (isLabelComparable == 1 && strcmp(labels[i].name, labels[j].name) == 0) { - freqCSset->items[j].parentFreqIdx = i; - } - #else - freqCSset->items[j].parentFreqIdx = i; - #endif - } - + else if (freqCSset->items[j].numProp < freqCSset->items[i].numProp){ + if (isSubset(freqCSset->items[i].lstProp, freqCSset->items[j].lstProp, + freqCSset->items[i].numProp,freqCSset->items[j].numProp) == 1) { + /* CSj is a subset of CSi */ + freqCSset->items[j].parentFreqIdx = i; + } + + } } //Do not need to consider the case that the numProps are the same @@ -2864,6 +2941,73 @@ str RDFrelationships(int *ret, BAT *sbat return MAL_SUCCEED; } + + +static +str RDFExtractCSPropTypes(int *ret, BAT *sbat, BATiter si, BATiter pi, BATiter oi, + oid *subjCSMap, int* csTblIdxMapping, CSPropTypes* csPropTypes, int maxNumPwithDup){ + + BUN p, q; + oid *sbt = 0, *obt, *pbt; + oid curS; /* current Subject oid */ + //oid CSoid = 0; /* Characteristic set oid */ + int numPwithDup; /* Number of properties for current S */ + char objType; + char* buffTypes; + oid* buffP; + oid curP; + + buffTypes = (char *) malloc(sizeof(char) * (maxNumPwithDup + 1)); + buffP = (oid *) malloc(sizeof(char) * (maxNumPwithDup + 1)); + + numPwithDup = 0; + curS = 0; + curP = 0; + + BATloop(sbat, p, q){ + sbt = (oid *) BUNtloc(si, p); + if (*sbt != curS){ + if (p != 0){ /* Not the first S */ + addPropTypes(buffTypes, buffP, numPwithDup, subjCSMap[curS], csTblIdxMapping, csPropTypes); + } + curS = *sbt; 
+ numPwithDup = 0; + curP = 0; + } + + obt = (oid *) BUNtloc(oi, p); + /* Check type of object */ + objType = (char) ((*obt) >> (sizeof(BUN)*8 - 4)) & 7 ; /* Get two bits 63th, 62nd from object oid */ + + pbt = (oid *) BUNtloc(pi, p); + + if (curP == *pbt){ + #if USE_MULTIPLICITY == 1 + // Update the object type for this P as MULTIVALUES + buffTypes[numPwithDup-1] = MULTIVALUES; + #else + buffTypes[numPwithDup] = objType; + numPwithDup++; + #endif + } + else{ + buffTypes[numPwithDup] = objType; + buffP[numPwithDup] = *pbt; + numPwithDup++; + curP = *pbt; + } + } + + /* Check for the last CS */ + addPropTypes(buffTypes, buffP, numPwithDup, subjCSMap[curS], csTblIdxMapping, csPropTypes); + + free (buffTypes); + + *ret = 1; + + return MAL_SUCCEED; +} + static void initCsRelBetweenMergeFreqSet(CSmergeRel *csRelBetweenMergeFreqSet, int num){ int i; @@ -3057,7 +3201,7 @@ int ontmetadataCount = 0; /* Extract CS from SPO triples table */ str -RDFextractCSwithTypes(int *ret, bat *sbatid, bat *pbatid, bat *obatid, bat *mapbatid, int *freqThreshold, void *_freqCSset, oid **subjCSMap, oid *maxCSoid, char **subjdefaultMap){ +RDFextractCSwithTypes(int *ret, bat *sbatid, bat *pbatid, bat *obatid, bat *mapbatid, int *freqThreshold, void *_freqCSset, oid **subjCSMap, oid *maxCSoid, char **subjdefaultMap,int *maxNumPwithDup){ BAT *sbat = NULL, *pbat = NULL, *obat = NULL, *mbat = NULL; BATiter si, pi, oi; /*iterator for BAT of s,p,o columns in spo table */ @@ -3067,7 +3211,6 @@ RDFextractCSwithTypes(int *ret, bat *sba BUN *maxSoid; int maxNumProp = 0; - int maxNumPwithDup = 0; char *csFreqMap; CSrel *csrelSet; CSrel *csrelToMaxFreqSet, *csrelFromMaxFreqSet; @@ -3084,6 +3227,7 @@ RDFextractCSwithTypes(int *ret, bat *sba clock_t curT; clock_t tmpLastT; + Labels *labels; if ((sbat = BATdescriptor(*sbatid)) == NULL) { @@ -3134,11 +3278,12 @@ RDFextractCSwithTypes(int *ret, bat *sba tmpLastT = clock(); + *maxNumPwithDup = 0; //Phase 1: Assign an ID for each CS #if STOREFULLCS - 
RDFassignCSId(ret, sbat, si, pi, oi, freqCSset, freqThreshold, csBats, *subjCSMap, maxCSoid, &maxNumProp, &maxNumPwithDup); + RDFassignCSId(ret, sbat, si, pi, oi, freqCSset, freqThreshold, csBats, *subjCSMap, maxCSoid, &maxNumProp, maxNumPwithDup); #else - RDFassignCSId(ret, sbat, si, pi, freqCSset, freqThreshold, csBats, *subjCSMap, maxCSoid, &maxNumProp, &maxNumPwithDup); + RDFassignCSId(ret, sbat, si, pi, freqCSset, freqThreshold, csBats, *subjCSMap, maxCSoid, &maxNumProp, maxNumPwithDup); #endif curT = clock(); @@ -3149,7 +3294,7 @@ RDFextractCSwithTypes(int *ret, bat *sba printf("Max CS oid: " BUNFMT "\n", *maxCSoid); - printf("Max Number of P (considering duplicated P): %d \n", maxNumPwithDup); + printf("Max Number of P (considering duplicated P): %d \n", *maxNumPwithDup); csFreqMap = (char*) malloc(sizeof(char) * (*maxCSoid +1)); initCharArray(csFreqMap, *maxCSoid +1, 0); @@ -3163,7 +3308,7 @@ RDFextractCSwithTypes(int *ret, bat *sba csSubCSSet = initCS_SubCSSets(*maxCSoid +1); - RDFrelationships(ret, sbat, si, pi, oi, *subjCSMap, subjSubCSMap, csSubCSSet, csrelSet, *maxSoid, maxNumPwithDup); + RDFrelationships(ret, sbat, si, pi, oi, *subjCSMap, subjSubCSMap, csSubCSSet, csrelSet, *maxSoid, *maxNumPwithDup); curT = clock(); printf (" ----- Exploring subCSs and FKs took %f seconds.\n", ((float)(curT - tmpLastT))/CLOCKS_PER_SEC); @@ -3229,6 +3374,7 @@ RDFextractCSwithTypes(int *ret, bat *sba curT = clock(); printf (" ----- Merging Frequent CSs took %f seconds.\n", ((float)(curT - tmpLastT))/CLOCKS_PER_SEC); tmpLastT = curT; + csRelBetweenMergeFreqSet = (CSmergeRel *) malloc (sizeof(CSmergeRel) * freqCSset->numCSadded); initCsRelBetweenMergeFreqSet(csRelBetweenMergeFreqSet, freqCSset->numCSadded); @@ -3438,23 +3584,15 @@ str triplesubsort(BAT **sbat, BAT **pbat } static -void initCStablesAndIdxMapping(CStableStat* cstablestat, CSset* freqCSset, int* csTblIdxMapping, int* mfreqIdxTblIdxMapping, int* mTblIdxFreqIdxMapping){ 
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
http://mail.monetdb.org/mailman/listinfo/checkin-list