Changeset: 63c5b2b0c903 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=63c5b2b0c903
Modified Files:
	monetdb5/extras/rdf/rdfschema.c
	monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Check primary keys in URI cols diffs (293 lines): diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -616,6 +616,7 @@ void initCSPropTypes(CSPropTypes* csProp csPropTypes[id].lstPropTypes[j].numType = MULTIVALUES + 1; csPropTypes[id].lstPropTypes[j].defaultType = STRING; csPropTypes[id].lstPropTypes[j].isMVProp = 0; + csPropTypes[id].lstPropTypes[j].isPKProp = 0; csPropTypes[id].lstPropTypes[j].numMvTypes = 0; csPropTypes[id].lstPropTypes[j].defColIdx = -1; csPropTypes[id].lstPropTypes[j].isFKProp = 0; @@ -3903,13 +3904,15 @@ static void getStatisticFinalCSs(CSset * //int *csPropNum; //int *csFreq; FILE *fout; - int i,j ; + int i,j, k ; char filename[100]; char tmpStr[20]; int maxNumtriple = 0; int minNumtriple = INT_MAX; int numMergeCS = 0; int totalCoverage = 0; + int totalCoverage10[10]; + int tmpNumProp10[10], maxNumProp10[10]; int freqId; int maxNumProp, tmpNumProp; @@ -3947,20 +3950,37 @@ static void getStatisticFinalCSs(CSset * minNumtriple = INT_MAX; maxNumProp = 0; tmpNumProp = 0; + for (k = 1; k < 10; k++) { + totalCoverage10[k] = totalCoverage; + maxNumProp10[k] = 0; + } for (i = 0; i < curNumMergeCS; i++){ freqId = mergeCSFreqCSMap[i]; if (freqCSset->items[freqId].parentFreqIdx == -1){ // Check whether it is a maximumCS // Output the result tmpNumProp = freqCSset->items[freqId].numProp; + for (k = 1; k < 10; k++) { + tmpNumProp10[k] = freqCSset->items[freqId].numProp; + } for (j = 0; j < freqCSset->items[freqId].numProp; j++){ //Check infrequent Prop if (isInfrequentProp(csPropTypes[i].lstPropTypes[j], freqCSset->items[freqId])){ totalCoverage = totalCoverage - csPropTypes[i].lstPropTypes[j].propCover; tmpNumProp--; } + + for (k = 1; k < 10; k++) { + if ((csPropTypes[i].lstPropTypes[j].propFreq * k) < freqCSset->items[freqId].support * INFREQ_PROP_THRESHOLD){ + totalCoverage10[k] = totalCoverage10[k] - csPropTypes[i].lstPropTypes[j].propCover; + 
tmpNumProp10[k]--; + }; + } } if (tmpNumProp > maxNumProp) maxNumProp = tmpNumProp; + for (k = 1; k < 10; k++){ + if (tmpNumProp10[k] > maxNumProp10[k]) maxNumProp10[k] = tmpNumProp10[k]; + } } } @@ -3968,6 +3988,11 @@ static void getStatisticFinalCSs(CSset * printf("Max number of props: %d \n", maxNumProp); printf("Total " BUNFMT " triples, coverred by final CSs: %d (%f percent) \n", BATcount(sbat), totalCoverage, 100 * ((float)totalCoverage/BATcount(sbat))); + printf("If Removing all INFREQUENT Prop (k times smaller threshold) \n"); + for (k = 1; k < 10; k++) { + printf("k = %d | Max # props: %d | coverred by final CSs: %d (%f percent) \n", k, maxNumProp10[k], totalCoverage10[k], 100 * ((float)totalCoverage10[k]/BATcount(sbat))); + } + //Check if remove all the final CS covering less than 10000 triples totalCoverage = 0; @@ -5213,6 +5238,8 @@ CSrel* getFKBetweenTableSet(CSrel *csrel CSrel* refinedCsRel; int propIdx; //Index of prop in list of props for each FreqCS int numRel = freqCSset->numCSadded; + int numOneToMany = 0; + int numManyToMany = 0; refinedCsRel = initCSrelset(numTables); @@ -5257,6 +5284,15 @@ CSrel* getFKBetweenTableSet(CSrel *csrel } assert(to < numTables); + if (rel.lstCnt[j] > freqCSset->items[toFreqId].support){ + //printf("ONE to MANY relatioship \n"); + numOneToMany++; + } + if (csPropTypes[from].lstPropTypes[propIdx].isMVProp){ + //printf("MANY to MANY relatioship \n"); + numManyToMany++; + } + addReltoCSRelWithFreq(from, to, rel.lstPropId[j], rel.lstCnt[j], rel.lstBlankCnt[j], &refinedCsRel[from]); //Add rel info to csPropTypes @@ -5265,6 +5301,9 @@ CSrel* getFKBetweenTableSet(CSrel *csrel } } + printf("FK relationship: Possible number of One-to-Many FK: %d \n", numOneToMany); + printf("FK relationship: Possible number of Many-to-Many FK: %d \n", numManyToMany); + return refinedCsRel; } @@ -5661,7 +5700,8 @@ RDFextractCSwithTypes(int *ret, bat *sba tmpNumRel = freqCSset->numCSadded; /* S6: Merged CS referred from the same CS via the same 
property */ - mergeMaxFreqCSByS6(tmpCSrelToMergeCS, freqCSset, labels, mergeCSFreqCSMap, curNumMergeCS, &mergecsId, ontmetadata, ontmetadataCount); + if (1) mergeMaxFreqCSByS6(tmpCSrelToMergeCS, freqCSset, labels, mergeCSFreqCSMap, curNumMergeCS, &mergecsId, ontmetadata, ontmetadataCount); + //printf("DISABLE S6 (For Testing) \n"); freeCSrelSet(tmpCSrelToMergeCS,tmpNumRel); @@ -6359,6 +6399,13 @@ str RDFdistTriplesToCSs(int *ret, bat *s //void* realObjValue = NULL; ValRecord vrRealObjValue; ValRecord vrCastedObjValue; + #if DETECT_PKCOL + BAT *tmpHashBat = NULL; + char isCheckDone = 0; + BUN tmpObjBun = BUN_NONE; + int numPKcols = 0; + char isPossiblePK = 0; + #endif if (TKNZRopen (NULL, &schema) != MAL_SUCCEED) { throw(RDF, "RDFdistTriplesToCSs", @@ -6408,6 +6455,10 @@ str RDFdistTriplesToCSs(int *ret, bat *s pbt = (oid *) BUNtloc(pi, p); sbt = (oid *) BUNtloc(si, p); obt = (oid *) BUNtloc(oi, p); + + //BATprint(pbat); + //BATprint(sbat); + //BATprint(obat); //printf(BUNFMT ": " BUNFMT " | " BUNFMT " | " BUNFMT "\n", p, *pbt, *sbt, *obt); getTblIdxFromS(*sbt, &tblIdx, &tmpSoid); @@ -6478,11 +6529,27 @@ str RDFdistTriplesToCSs(int *ret, bat *s //printf(" Tbl: %d | Col: %d \n", tblIdx, tmpColIdx); + istmpMVProp = csPropTypes[tblIdx].lstPropTypes[tmpPropIdx].isMVProp; + defaultType = csPropTypes[tblIdx].lstPropTypes[tmpPropIdx].defaultType; + #if DETECT_PKCOL + isPossiblePK = 1; + #if ONLY_URI_PK + if (defaultType != URI) isPossiblePK = 0; + #endif + #endif if (isSetLasttblIdx == 0){ lastColIdx = tmpColIdx; lastPropIdx = tmpPropIdx; lasttblIdx = tblIdx; cstablestat->lastInsertedS[tblIdx][tmpColIdx] = BUN_NONE; + #if DETECT_PKCOL + if (isPossiblePK){ + tmpHashBat = BATnew(TYPE_void, TYPE_oid, lastSubjId[tblIdx] + 1); + (void)BATprepareHash(BATmirror(tmpHashBat)); + isCheckDone = 0; + numPKcols++; + } + #endif isSetLasttblIdx = 1; } @@ -6498,11 +6565,38 @@ str RDFdistTriplesToCSs(int *ret, bat *s lasttblIdx = tblIdx; tmplastInsertedS = -1; 
cstablestat->lastInsertedS[tblIdx][tmpColIdx] = BUN_NONE; + #if DETECT_PKCOL + if (isPossiblePK){ + if (tmpHashBat != NULL){ + BBPreclaim(tmpHashBat); + tmpHashBat = NULL; + } + tmpHashBat = BATnew(TYPE_void, TYPE_oid, lastSubjId[tblIdx] + 1); + (void)BATprepareHash(BATmirror(tmpHashBat)); + csPropTypes[tblIdx].lstPropTypes[tmpPropIdx].isPKProp = 1; /* Assume that the object values are all unique*/ + isCheckDone = 0; + numPKcols++; + } + #endif } - - istmpMVProp = csPropTypes[tblIdx].lstPropTypes[tmpPropIdx].isMVProp; - defaultType = csPropTypes[tblIdx].lstPropTypes[tmpPropIdx].defaultType; + #if DETECT_PKCOL + else{ + if (isCheckDone == 0 && isPossiblePK){ + tmpObjBun = BUNfnd(BATmirror(tmpHashBat),(ptr) obt); + if (tmpObjBun == BUN_NONE){ + BUNappend(tmpHashBat,obt, TRUE); + } + else{ + isCheckDone = 1; + csPropTypes[tblIdx].lstPropTypes[tmpPropIdx].isPKProp = 0; + numPKcols--; + //printf("Found duplicated value at " BUNFMT " | " BUNFMT " | " BUNFMT "\n", *pbt, *sbt, *obt); + } + } + } + #endif + if (istmpMVProp == 1){ // This is a multi-valued prop //printf("Multi values prop \n"); @@ -6601,9 +6695,6 @@ str RDFdistTriplesToCSs(int *ret, bat *s continue; } - - - if (tmpTableType == MAINTBL){ curBat = cstablestat->lstcstable[tblIdx].colBats[tmpColIdx]; //printf(" tmpColIdx = %d \n",tmpColIdx); @@ -6646,7 +6737,14 @@ str RDFdistTriplesToCSs(int *ret, bat *s cstablestat->lastInsertedS[tblIdx][tmpColIdx] = tmpSoid; } - + + #if DETECT_PKCOL + if (tmpHashBat != NULL){ + BBPreclaim(tmpHashBat); + tmpHashBat = NULL; + } + printf("Number of possible PK cols is: %d \n", numPKcols); + #endif //HAVE TO GO THROUGH ALL BATS fillMissingvaluesAll(cstablestat, csPropTypes, lasttblIdx, lastColIdx, lastPropIdx, lastSubjId); @@ -6856,7 +6954,8 @@ RDFreorganize(int *ret, CStableStat *cst free(mfreqIdxTblIdxMapping); free(mTblIdxFreqIdxMapping); freeCSPropTypes(csPropTypes,numTables); - + printf("Finish & Exit exploring step! 
\n"); + return MAL_SUCCEED; } @@ -6962,10 +7061,16 @@ RDFreorganize(int *ret, CStableStat *cst //printPropStat(propStat,0); + curT = clock(); + printf (" Prepare and create sub-sorted PSO took %f seconds.\n", ((float)(curT - tmpLastT))/CLOCKS_PER_SEC); + tmpLastT = curT; if (RDFdistTriplesToCSs(ret, &sNewBat->batCacheid, &pNewBat->batCacheid, &oNewBat->batCacheid, mapbatid, propStat, cstablestat, csPropTypes, lastSubjId) != MAL_SUCCEED){ throw(RDF, "rdf.RDFreorganize", "Problem in distributing triples to BATs using CSs"); } + curT = clock(); + printf ("RDFdistTriplesToCSs process took %f seconds.\n", ((float)(curT - tmpLastT))/CLOCKS_PER_SEC); + tmpLastT = curT; freeCSrelSet(csRelMergeFreqSet,freqCSset->numCSadded); freeCSrelSet(csRelFinalFKs, numTables); diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h --- a/monetdb5/extras/rdf/rdfschema.h +++ b/monetdb5/extras/rdf/rdfschema.h @@ -218,6 +218,10 @@ typedef struct SubCSSet{ #define EXPORT_LABEL 1 /* Export labels: TODO: Only disable the */ + +#define DETECT_PKCOL 1 /* Detect whether a col can be a primary key col while reorganizing triples table*/ +#define ONLY_URI_PK 1 /* Only URI can be considered for PK */ + typedef struct CSset{ CS* items; int numOrigFreqCS; @@ -324,6 +328,7 @@ typedef struct PropTypes{ char* TableTypes; char defaultType; char isMVProp; /* = 1 if this prop is a multi-valued prop*/ + char isPKProp; /* = 1 if all the values in this columns is unique */ char numMvTypes; /* Number of extype BAT for this MV col */ char isFKProp; int refTblId; /* refTblId != -1 only when isFKProp = 1 */ _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list