Changeset: 08cf5e383bd5 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=08cf5e383bd5
Modified Files:
        monetdb5/extras/rdf/rdfschema.c
        monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

Filter for FK relationships


diffs (183 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -2594,6 +2594,7 @@ void mergeCSbyS4(CSset *freqCSset, CSlab
 
                        mergecs1 = (CS*)&(freqCSset->items[tmpParentIdx]);
                        mergecs2 = (CS*)&(freqCSset->items[freqId1]);
+                       printf("MaxCS: Merge freqCS %d and freqCS %d \n", 
tmpParentIdx, freqId1);
                        mergeConsistsOf(mergecs1, mergecs2);
                }
 
@@ -3125,7 +3126,7 @@ void doMerge(CSset *freqCSset, int ruleN
                addCStoSet(freqCSset,*mergecs);
                updateLabel(ruleNum, freqCSset, labels, 1, 
freqCSset->numCSadded - 1, freqId1, freqId2, name, ontmetadata, 
ontmetadataCount, NULL, -1);
                free(mergecs);
-
+               
                mergecsId[0]++;
        }
        else if (cs1->parentFreqIdx == -1 && cs2->parentFreqIdx != -1){
@@ -3216,29 +3217,44 @@ str mergeMaxFreqCSByS1(CSset *freqCSset,
                        tmpCount = 0; 
                        for (k = 0; k < labelStat->lstCount[i]; k++){
                                freqId1 = labelStat->freqIdList[i][k];
-                               if ((*labels)[freqId1].isOntology == 1) break; 
+                               if ((*labels)[freqId1].isOntology == 1) {
+                                       cs1 = &(freqCSset->items[freqId1]);
+                                       #if     NOT_MERGE_DIMENSIONCS
+                                       if (cs1->type == DIMENSIONCS) continue;
+                                       #endif
+                                       tmpCount++;
+                                       break; 
+                               }
                        }
-                       cs1 = &(freqCSset->items[freqId1]);
                        for (j = k+1; j < labelStat->lstCount[i]; j++){
                                freqId2 = labelStat->freqIdList[i][j];
                                cs2 = &(freqCSset->items[freqId2]);
                                #if     NOT_MERGE_DIMENSIONCS
-                               if (cs2->type == DIMENSIONCS) continue; 
+                               if (cs2->type == DIMENSIONCS) 
+                                       continue; 
                                #endif
                                if ((*labels)[freqId2].isOntology == 1){
+                                       printf("Merge FreqCS %d and FreqCS %d 
by Ontology name \n", freqId1, freqId2);
                                        doMerge(freqCSset, S1, cs1, cs2, 
freqId1, freqId2, mergecsId, labels, ontmetadata, ontmetadataCount, *name);
+                                       printf("Number of added cs in freqCS: 
%d \n", freqCSset->numCSadded); 
                                        tmpCount++;
                                }
                        }
-                       fprintf(fout, " %d names are same by Ontology. MergedCS 
has %d prop. \n", tmpCount, freqCSset->items[freqCSset->numCSadded -1].numProp);
+                       fprintf(fout, " %d freqCS merged as having same name by 
Ontology. MergedCS has %d prop. \n", tmpCount, 
freqCSset->items[freqCSset->numCSadded -1].numProp);
 
                        //For Type
                        tmpCount = 0;
                        for (k = 0; k < labelStat->lstCount[i]; k++){
                                freqId1 = labelStat->freqIdList[i][k];
-                               if ((*labels)[freqId1].isType == 1) break; 
+                               if ((*labels)[freqId1].isType == 1) {
+                                       cs1 = &(freqCSset->items[freqId1]);
+                                       #if     NOT_MERGE_DIMENSIONCS
+                                       if (cs1->type == DIMENSIONCS) continue;
+                                       #endif
+                                       tmpCount++;
+                                       break; 
+                               }
                        }
-                       cs1 = &(freqCSset->items[freqId1]);
                        for (j = k+1; j < labelStat->lstCount[i]; j++){
                                freqId2 = labelStat->freqIdList[i][j];
                                cs2 = &(freqCSset->items[freqId2]);
@@ -3246,19 +3262,27 @@ str mergeMaxFreqCSByS1(CSset *freqCSset,
                                if (cs2->type == DIMENSIONCS) continue; 
                                #endif
                                if ((*labels)[freqId2].isType == 1){
+                                       printf("Merge FreqCS %d and FreqCS %d 
by Type name \n", freqId1, freqId2);
                                        doMerge(freqCSset, S1, cs1, cs2, 
freqId1, freqId2, mergecsId, labels, ontmetadata, ontmetadataCount, *name);
+                                       printf("Number of added cs in freqCS: 
%d \n", freqCSset->numCSadded);                           
                                        tmpCount++;
                                }
                        }
-                       fprintf(fout, " %d names are same by TYPE. MergedCS has 
%d prop. \n", tmpCount, freqCSset->items[freqCSset->numCSadded -1].numProp);
+                       fprintf(fout, " %d freqCS merged as having same name by 
TYPE. MergedCS has %d prop. \n", tmpCount, 
freqCSset->items[freqCSset->numCSadded -1].numProp);
 
                        //For FK
                        tmpCount = 0;
                        for (k = 0; k < labelStat->lstCount[i]; k++){
                                freqId1 = labelStat->freqIdList[i][k];
-                               if ((*labels)[freqId1].isFK == 1) break; 
+                               if ((*labels)[freqId1].isFK == 1) {
+                                       cs1 = &(freqCSset->items[freqId1]);
+                                       #if     NOT_MERGE_DIMENSIONCS
+                                       if (cs1->type == DIMENSIONCS) continue;
+                                       #endif
+                                       tmpCount++;
+                                       break; 
+                               }
                        }
-                       cs1 = &(freqCSset->items[freqId1]);
                        for (j = k+1; j < labelStat->lstCount[i]; j++){
                                freqId2 = labelStat->freqIdList[i][j];
                                cs2 = &(freqCSset->items[freqId2]);
@@ -3266,13 +3290,14 @@ str mergeMaxFreqCSByS1(CSset *freqCSset,
                                if (cs2->type == DIMENSIONCS) continue; 
                                #endif
                                if ((*labels)[freqId2].isFK == 1){
+                                       printf("Merge FreqCS %d and FreqCS %d 
by FK name \n", freqId1, freqId2);
                                        doMerge(freqCSset, S1, cs1, cs2, 
freqId1, freqId2, mergecsId, labels, ontmetadata, ontmetadataCount, *name);
+                                       printf("Number of added cs in freqCS: 
%d \n", freqCSset->numCSadded);                                   
                                        tmpCount++;
                                }
                        }
                        #endif /* USE_MULTIWAY_MERGING */
-                       fprintf(fout, " %d names are same by FK. MergedCS has 
%d prop. \n", tmpCount, freqCSset->items[freqCSset->numCSadded -1].numProp);
-
+                       fprintf(fout, " %d freqCS merged as having same name by 
FK. MergedCS has %d prop. \n", tmpCount, freqCSset->items[freqCSset->numCSadded 
-1].numProp);
 
                        #if OUTPUT_FREQID_PER_LABEL
                        
@@ -3577,7 +3602,8 @@ void mergeCSByS3S5(CSset *freqCSset, CSl
                        if (simscore > SIM_TFIDF_THRESHOLD){
                        #else   
                        if (simscore > SIM_THRESHOLD) {
-                       #endif                          
+                       #endif          
+                               //printf("S3S5: merge freqCS %d and freqCS %d 
\n", freqId1, freqId2);
                                //Check whether these CS's belong to any mergeCS
                                if (cs1->parentFreqIdx == -1 && 
cs2->parentFreqIdx == -1){      /* New merge */
                                        mergecs = mergeTwoCSs(*cs1,*cs2, 
freqId1,freqId2, *mergecsId);
@@ -5087,7 +5113,7 @@ CSrel* getFKBetweenTableSet(CSrel *csrel
        refinedCsRel = initCSrelset(numTables);
 
        for (i = 0; i < numRel; ++i) {
-               if (csrelFreqSet[i].numRef == 0 || freqCSset->items[i].coverage 
> MINIMUM_TABLE_SIZE) continue; // ignore CS without relations
+               if (csrelFreqSet[i].numRef == 0) continue; // ignore CS without 
relations
                assert(freqCSset->items[i].parentFreqIdx == -1);
                rel = csrelFreqSet[i];
                from = mfreqIdxTblIdxMapping[i];
@@ -5112,7 +5138,15 @@ CSrel* getFKBetweenTableSet(CSrel *csrel
                        }
                        assert(propIdx < freqCSset->items[i].numProp);
                        
-                       if (csPropTypes[from].lstPropTypes[propIdx].propCover * 
MIN_FK_PROPCOVERAGE > rel.lstCnt[j]) continue; 
+
+                       //Filtering: for a big table (coverage > MINIMUM_TABLE_SIZE), it is
+                       // enough that at least a MIN_FK_PROPCOVERAGE fraction of the prop's
+                       // instances refer to a certain table; otherwise (small table),
+                       // all instances of that prop must refer to that table.
+                       if (freqCSset->items[i].coverage > MINIMUM_TABLE_SIZE){
+                               if 
(csPropTypes[from].lstPropTypes[propIdx].propCover * MIN_FK_PROPCOVERAGE > 
rel.lstCnt[j]) continue; 
+                       }
+                       else{
+                               if 
(csPropTypes[from].lstPropTypes[propIdx].propCover != rel.lstCnt[j]) continue; 
+                       }
                        
                        assert(to < numTables);
                        addReltoCSRelWithFreq(from, to, rel.lstPropId[j], 
rel.lstCnt[j], rel.lstBlankCnt[j], &refinedCsRel[from]);
@@ -5479,6 +5513,7 @@ RDFextractCSwithTypes(int *ret, bat *sba
 
        curT = clock(); 
        printf("Merging with S1 took %f. (Number of mergeCS: %d | NumconsistOf: 
%d) \n", ((float)(curT - tmpLastT))/CLOCKS_PER_SEC, curNumMergeCS, 
countNumberConsistOfCS(freqCSset));
+       printf("Number of added CS after S1: %d \n", freqCSset->numCSadded);
        tmpLastT = curT;
        
        /* ---------- S4 ------- */
diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h
--- a/monetdb5/extras/rdf/rdfschema.h
+++ b/monetdb5/extras/rdf/rdfschema.h
@@ -129,7 +129,7 @@ typedef struct PropStat {
                                                           
IR_DIMENSION_THRESHOLD_PERCENTAGE * totalFrequency
                                                           Number of IR 
references should be several times larger than the CS frequency
                                                        */
-#define NOT_MERGE_DIMENSIONCS  1
+#define NOT_MERGE_DIMENSIONCS  1               /* Default: 1. Set to 0 only for the example data */
 
 #define FILTER_INFREQ_FK_FOR_IR        1               /* We filter out all 
the dirty references from a CS */
 #define FILTER_THRESHOLD_FK_FOR_IR     0.1     /* The FK that their frequency 
< FILTER_THRESHOLD_FK_FOR_IR * FreqCS's frequency */     
@@ -204,6 +204,7 @@ typedef struct SubCSSet{
 #define MIN_PERCETAGE_S6 5     // Merge all CS refered by more than 
1/MIN_PERCETAGE_S6 percent of a CS via one property
 #define MIN_FROMTABLE_SIZE_S6 100  // The minimum size of the "from" table in 
S6. Meaning that 
                                    // the CS's to-be-merged in this rule must 
cover > MIN_FROMTABLE_SIZE_S6 / MIN_PERCETAGE_S6 triples
+//#define MIN_FROMTABLE_SIZE_S6 1              /* For example data */
 #define MINIMUM_TABLE_SIZE 10000   //The minimum number of triples coverred by 
a table (i.e., a final CS) 
 #define SAMPLE_FILTER_THRESHOLD 1  // SAMPLE_FILTER_THRESHOLD/ 100     
 #define HIGH_REFER_THRESHOLD 5
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to