Changeset: 63c5b2b0c903 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=63c5b2b0c903
Modified Files:
        monetdb5/extras/rdf/rdfschema.c
        monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

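Check whether URI columns can serve as primary keys (i.e. all object values in the column are unique) while distributing triples to the CS tables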


diffs (293 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -616,6 +616,7 @@ void initCSPropTypes(CSPropTypes* csProp
                                csPropTypes[id].lstPropTypes[j].numType = 
MULTIVALUES + 1;
                                csPropTypes[id].lstPropTypes[j].defaultType = 
STRING; 
                                csPropTypes[id].lstPropTypes[j].isMVProp = 0; 
+                               csPropTypes[id].lstPropTypes[j].isPKProp = 0; 
                                csPropTypes[id].lstPropTypes[j].numMvTypes = 0; 
                                csPropTypes[id].lstPropTypes[j].defColIdx = -1; 
                                csPropTypes[id].lstPropTypes[j].isFKProp = 0;
@@ -3903,13 +3904,15 @@ static void getStatisticFinalCSs(CSset *
        //int   *csPropNum; 
        //int   *csFreq; 
        FILE    *fout; 
-       int     i,j ; 
+       int     i,j, k ; 
        char    filename[100];
        char    tmpStr[20];
        int     maxNumtriple = 0; 
        int     minNumtriple = INT_MAX; 
        int     numMergeCS = 0; 
        int     totalCoverage = 0; 
+       int     totalCoverage10[10]; 
+       int     tmpNumProp10[10], maxNumProp10[10]; 
        int     freqId; 
        int     maxNumProp, tmpNumProp; 
 
@@ -3947,20 +3950,37 @@ static void getStatisticFinalCSs(CSset *
        minNumtriple = INT_MAX;
        maxNumProp = 0; 
        tmpNumProp = 0;
+       for (k = 1; k < 10; k++) {
+               totalCoverage10[k] = totalCoverage;
+               maxNumProp10[k] = 0; 
+       }
        for (i = 0; i < curNumMergeCS; i++){
                freqId = mergeCSFreqCSMap[i]; 
                if (freqCSset->items[freqId].parentFreqIdx == -1){              
// Check whether it is a maximumCS
                        // Output the result 
                        tmpNumProp = freqCSset->items[freqId].numProp;  
+                       for (k = 1; k < 10; k++) {
+                               tmpNumProp10[k] = 
freqCSset->items[freqId].numProp;
+                       }
                        for (j = 0; j < freqCSset->items[freqId].numProp; j++){
                                //Check infrequent Prop
                                if 
(isInfrequentProp(csPropTypes[i].lstPropTypes[j], freqCSset->items[freqId])){
                                        totalCoverage = totalCoverage -  
csPropTypes[i].lstPropTypes[j].propCover;
                                        tmpNumProp--; 
                                }
+
+                               for (k = 1; k < 10; k++) {
+                                       if 
((csPropTypes[i].lstPropTypes[j].propFreq * k)  < 
freqCSset->items[freqId].support * INFREQ_PROP_THRESHOLD){
+                                               totalCoverage10[k] = 
totalCoverage10[k] - csPropTypes[i].lstPropTypes[j].propCover;
+                                               tmpNumProp10[k]--; 
+                                       };
+                               }
                        }
 
                        if (tmpNumProp > maxNumProp) maxNumProp = tmpNumProp; 
+                       for (k = 1; k < 10; k++){
+                               if (tmpNumProp10[k] > maxNumProp10[k]) 
maxNumProp10[k] = tmpNumProp10[k]; 
+                       }
                }
        }
 
@@ -3968,6 +3988,11 @@ static void getStatisticFinalCSs(CSset *
        printf("Max number of props: %d \n", maxNumProp);
        printf("Total " BUNFMT " triples, coverred by final CSs: %d  (%f 
percent) \n", BATcount(sbat), totalCoverage, 100 * 
((float)totalCoverage/BATcount(sbat)));
 
+       printf("If Removing all INFREQUENT Prop (k times smaller threshold) 
\n");
+       for (k = 1; k < 10; k++) {
+               printf("k = %d  |  Max # props: %d | coverred by final CSs: %d  
(%f percent)  \n", k, maxNumProp10[k], totalCoverage10[k],  100 * 
((float)totalCoverage10[k]/BATcount(sbat)));
+       }
+
        //Check if remove all the final CS covering less than 10000 triples
        
        totalCoverage = 0;
@@ -5213,6 +5238,8 @@ CSrel* getFKBetweenTableSet(CSrel *csrel
        CSrel*  refinedCsRel;
        int     propIdx;        //Index of prop in list of props for each FreqCS
        int     numRel = freqCSset->numCSadded; 
+       int     numOneToMany = 0;
+       int     numManyToMany = 0;
 
        refinedCsRel = initCSrelset(numTables);
 
@@ -5257,6 +5284,15 @@ CSrel* getFKBetweenTableSet(CSrel *csrel
                        }
                        
                        assert(to < numTables);
+                       if (rel.lstCnt[j] > freqCSset->items[toFreqId].support){
+                               //printf("ONE to MANY relatioship \n"); 
+                               numOneToMany++;
+                       }
+                       if (csPropTypes[from].lstPropTypes[propIdx].isMVProp){
+                               //printf("MANY to MANY relatioship \n"); 
+                               numManyToMany++;
+                       }
+
                        addReltoCSRelWithFreq(from, to, rel.lstPropId[j], 
rel.lstCnt[j], rel.lstBlankCnt[j], &refinedCsRel[from]);
 
                        //Add rel info to csPropTypes
@@ -5265,6 +5301,9 @@ CSrel* getFKBetweenTableSet(CSrel *csrel
 
                }
        }
+       printf("FK relationship: Possible number of One-to-Many FK: %d \n", 
numOneToMany);
+       printf("FK relationship: Possible number of Many-to-Many FK: %d \n", 
numManyToMany);
+
        return refinedCsRel;
 }
 
@@ -5661,7 +5700,8 @@ RDFextractCSwithTypes(int *ret, bat *sba
        tmpNumRel = freqCSset->numCSadded; 
 
        /* S6: Merged CS referred from the same CS via the same property */
-       mergeMaxFreqCSByS6(tmpCSrelToMergeCS, freqCSset, labels, 
mergeCSFreqCSMap, curNumMergeCS,  &mergecsId, ontmetadata, ontmetadataCount);
+       if (1) mergeMaxFreqCSByS6(tmpCSrelToMergeCS, freqCSset, labels, 
mergeCSFreqCSMap, curNumMergeCS,  &mergecsId, ontmetadata, ontmetadataCount);
+       //printf("DISABLE S6 (For Testing) \n"); 
 
        freeCSrelSet(tmpCSrelToMergeCS,tmpNumRel);
 
@@ -6359,6 +6399,13 @@ str RDFdistTriplesToCSs(int *ret, bat *s
        //void*         realObjValue = NULL;
        ValRecord       vrRealObjValue;
        ValRecord       vrCastedObjValue; 
+       #if     DETECT_PKCOL
+       BAT     *tmpHashBat = NULL; 
+       char    isCheckDone = 0; 
+       BUN     tmpObjBun = BUN_NONE; 
+       int     numPKcols = 0; 
+       char    isPossiblePK = 0; 
+       #endif
        
        if (TKNZRopen (NULL, &schema) != MAL_SUCCEED) {
                throw(RDF, "RDFdistTriplesToCSs",
@@ -6408,6 +6455,10 @@ str RDFdistTriplesToCSs(int *ret, bat *s
                pbt = (oid *) BUNtloc(pi, p);
                sbt = (oid *) BUNtloc(si, p);
                obt = (oid *) BUNtloc(oi, p);
+
+               //BATprint(pbat);
+               //BATprint(sbat); 
+               //BATprint(obat); 
                
                //printf(BUNFMT ": " BUNFMT "  |  " BUNFMT " | " BUNFMT "\n", 
p, *pbt, *sbt, *obt); 
                getTblIdxFromS(*sbt, &tblIdx, &tmpSoid);        
@@ -6478,11 +6529,27 @@ str RDFdistTriplesToCSs(int *ret, bat *s
 
                //printf(" Tbl: %d   |   Col: %d \n", tblIdx, tmpColIdx);
                
+               istmpMVProp = 
csPropTypes[tblIdx].lstPropTypes[tmpPropIdx].isMVProp; 
+               defaultType = 
csPropTypes[tblIdx].lstPropTypes[tmpPropIdx].defaultType; 
+               #if     DETECT_PKCOL
+                       isPossiblePK = 1;
+                       #if ONLY_URI_PK
+                       if (defaultType != URI) isPossiblePK = 0; 
+                       #endif
+               #endif
                if (isSetLasttblIdx == 0){
                        lastColIdx = tmpColIdx;
                        lastPropIdx = tmpPropIdx; 
                        lasttblIdx = tblIdx;
                        cstablestat->lastInsertedS[tblIdx][tmpColIdx] = 
BUN_NONE;
+                       #if     DETECT_PKCOL
+                       if (isPossiblePK){
+                               tmpHashBat = BATnew(TYPE_void, TYPE_oid, 
lastSubjId[tblIdx] + 1);
+                               (void)BATprepareHash(BATmirror(tmpHashBat));
+                               isCheckDone = 0; 
+                               numPKcols++;
+                       }
+                       #endif
                        isSetLasttblIdx = 1; 
                }
 
@@ -6498,11 +6565,38 @@ str RDFdistTriplesToCSs(int *ret, bat *s
                        lasttblIdx = tblIdx;
                        tmplastInsertedS = -1;
                        cstablestat->lastInsertedS[tblIdx][tmpColIdx] = 
BUN_NONE;
+                       #if     DETECT_PKCOL    
+                       if (isPossiblePK){
+                               if (tmpHashBat != NULL){
+                                       BBPreclaim(tmpHashBat); 
+                                       tmpHashBat = NULL; 
+                               }
+                               tmpHashBat = BATnew(TYPE_void, TYPE_oid, 
lastSubjId[tblIdx] + 1);
+                               (void)BATprepareHash(BATmirror(tmpHashBat));
+                               
csPropTypes[tblIdx].lstPropTypes[tmpPropIdx].isPKProp = 1;  /* Assume that the 
object values are all unique*/
+                               isCheckDone = 0;
+                               numPKcols++;
+                       }
+                       #endif
                        
                }
-
-               istmpMVProp = 
csPropTypes[tblIdx].lstPropTypes[tmpPropIdx].isMVProp; 
-               defaultType = 
csPropTypes[tblIdx].lstPropTypes[tmpPropIdx].defaultType; 
+               #if     DETECT_PKCOL
+               else{
+                       if (isCheckDone == 0 && isPossiblePK){
+                               tmpObjBun = BUNfnd(BATmirror(tmpHashBat),(ptr) 
obt);
+                               if (tmpObjBun == BUN_NONE){
+                                       BUNappend(tmpHashBat,obt, TRUE);
+                               }
+                               else{
+                                       isCheckDone = 1; 
+                                       
csPropTypes[tblIdx].lstPropTypes[tmpPropIdx].isPKProp = 0; 
+                                       numPKcols--;
+                                       //printf("Found duplicated value at " 
BUNFMT "  |  " BUNFMT " | " BUNFMT "\n", *pbt, *sbt, *obt);
+                               }
+                       }
+               }
+               #endif
+
 
                if (istmpMVProp == 1){  // This is a multi-valued prop
                        //printf("Multi values prop \n"); 
@@ -6601,9 +6695,6 @@ str RDFdistTriplesToCSs(int *ret, bat *s
                        continue; 
                }
 
-
-               
-
                if (tmpTableType == MAINTBL){
                        curBat = 
cstablestat->lstcstable[tblIdx].colBats[tmpColIdx];
                        //printf(" tmpColIdx = %d \n",tmpColIdx);
@@ -6646,7 +6737,14 @@ str RDFdistTriplesToCSs(int *ret, bat *s
                cstablestat->lastInsertedS[tblIdx][tmpColIdx] = tmpSoid;
 
        }
-
+       
+       #if DETECT_PKCOL 
+       if (tmpHashBat != NULL){
+               BBPreclaim(tmpHashBat); 
+               tmpHashBat = NULL; 
+       }
+       printf("Number of possible PK cols is: %d \n", numPKcols); 
+       #endif
        //HAVE TO GO THROUGH ALL BATS
        fillMissingvaluesAll(cstablestat, csPropTypes, lasttblIdx, lastColIdx, 
lastPropIdx, lastSubjId);
 
@@ -6856,7 +6954,8 @@ RDFreorganize(int *ret, CStableStat *cst
                free(mfreqIdxTblIdxMapping);
                free(mTblIdxFreqIdxMapping);
                freeCSPropTypes(csPropTypes,numTables);
-
+               printf("Finish & Exit exploring step! \n"); 
+               
                return MAL_SUCCEED;
        }
 
@@ -6962,10 +7061,16 @@ RDFreorganize(int *ret, CStableStat *cst
        
        //printPropStat(propStat,0); 
        
+       curT = clock(); 
+       printf (" Prepare and create sub-sorted PSO took  %f seconds.\n", 
((float)(curT - tmpLastT))/CLOCKS_PER_SEC);
+       tmpLastT = curT;                
        if (RDFdistTriplesToCSs(ret, &sNewBat->batCacheid, 
&pNewBat->batCacheid, &oNewBat->batCacheid, mapbatid, propStat, cstablestat, 
csPropTypes, lastSubjId) != MAL_SUCCEED){
                throw(RDF, "rdf.RDFreorganize", "Problem in distributing 
triples to BATs using CSs");           
        }
                
+       curT = clock(); 
+       printf ("RDFdistTriplesToCSs process took  %f seconds.\n", 
((float)(curT - tmpLastT))/CLOCKS_PER_SEC);
+       tmpLastT = curT;                
 
        freeCSrelSet(csRelMergeFreqSet,freqCSset->numCSadded);
        freeCSrelSet(csRelFinalFKs, numTables); 
diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h
--- a/monetdb5/extras/rdf/rdfschema.h
+++ b/monetdb5/extras/rdf/rdfschema.h
@@ -218,6 +218,10 @@ typedef struct SubCSSet{
 
 #define EXPORT_LABEL           1       /* Export labels: TODO: Only disable 
the  */
 
+
+#define DETECT_PKCOL           1       /* Detect whether a col can be a 
primary key col while reorganizing triples table*/
+#define ONLY_URI_PK            1       /* Only URI can be considered for PK */
+
 typedef struct CSset{
        CS* items;
        int numOrigFreqCS; 
@@ -324,6 +328,7 @@ typedef struct PropTypes{
        char*   TableTypes;
        char    defaultType; 
        char    isMVProp;       /* = 1 if this prop is a multi-valued prop*/
+       char    isPKProp;       /* = 1 if all the values in this columns is 
unique */
        char    numMvTypes;     /* Number of extype BAT for this MV col */
        char    isFKProp; 
        int     refTblId;       /* refTblId != -1 only when isFKProp = 1 */
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to