Changeset: f7eebfbb3420 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=f7eebfbb3420
Modified Files:
        monetdb5/extras/rdf/rdfschema.c
        monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

Refined rules S6 (Merging all CS's referred via the same prop).

- We do not consider CSrelBetweenMergedCS, but use CSrelToMergeCS.
- Filtering more strickly by using the threshold 0.1


diffs (121 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -884,12 +884,16 @@ static
 int countNumberMergeCS(CSset *csSet){
        int i; 
        int num = 0;
+       int maxNumProp = 0; 
        for (i = 0; i < csSet->numCSadded; i ++){
                if (csSet->items[i].parentFreqIdx == -1){
                        num++;  
+                       if (csSet->items[i].numProp > maxNumProp) maxNumProp = 
csSet->items[i].numProp;
                }
        }
 
+       printf("Max number of prop among %d merged CS is: %d \n", num, 
maxNumProp);
+
        return num; 
 
 }
@@ -2749,7 +2753,7 @@ void generatecsRelSum(CSrel csRel, int f
 
        for (i = 0; i < csRel.numRef; i++){
                freq = freqCSset->items[csRel.origFreqIdx].support; 
-               if (freq < csRel.lstCnt[i] * 100){                      
+               if (freq < csRel.lstCnt[i] * MIN_PERCETAGE_S6){                 
                        propIdx = 0;
                        while (csRelSum->lstPropId[propIdx] != 
csRel.lstPropId[i])
                                propIdx++;
@@ -4450,6 +4454,43 @@ CSrel* generateCsRelBetweenMergeFreqSet(
        return csRelMergeFreqSet;
 }
 
+
+static
+CSrel* generateCsRelToMergeFreqSet(CSrel *csrelFreqSet, CSset *freqCSset){
+       int     i,j;
+       int     numFreqCS = freqCSset->numOrigFreqCS; 
+       int     from, to;
+       CSrel   rel;
+       CSrel*  csRelMergeFreqSet;
+       
+       csRelMergeFreqSet = initCSrelset(freqCSset->numCSadded);
+
+       for (i = 0; i < numFreqCS; ++i) {
+               if (csrelFreqSet[i].numRef == 0) continue; // ignore CS without 
relations
+               rel = csrelFreqSet[i];
+               // update the 'from' value
+               from = i;
+               /*
+               while (freqCSset->items[from].parentFreqIdx != -1) {
+                       from = freqCSset->items[from].parentFreqIdx;
+               }
+               assert(freqCSset->items[from].parentFreqIdx == -1);
+               */
+
+               for (j = 0; j < rel.numRef; ++j) {
+                       // update the 'to' value
+                       to = rel.lstRefFreqIdx[j];
+                       while (freqCSset->items[to].parentFreqIdx != -1) {
+                               to = freqCSset->items[to].parentFreqIdx;
+                       }
+                       assert(freqCSset->items[to].parentFreqIdx == -1);
+                       // add relation to new data structure
+                       addReltoCSRelWithFreq(from, to, rel.lstPropId[j], 
rel.lstCnt[j], rel.lstBlankCnt[j], &csRelMergeFreqSet[from]);
+               }
+       }
+       return csRelMergeFreqSet;
+}
+
 static
 void printCSRel(CSset *freqCSset, CSrel *csRelMergeFreqSet, int freqThreshold){
        FILE    *fout2,*fout2filter;
@@ -4554,6 +4595,7 @@ RDFextractCSwithTypes(int *ret, bat *sba
        OntoUsageNode   *ontoUsageTree = NULL;
        int             curNumMergeCS = 0; 
        int             tmpNumRel = 0;
+       CSrel           *tmpCSrelToMergeCS = NULL; 
 
        if ((sbat = BATdescriptor(*sbatid)) == NULL) {
                throw(MAL, "rdf.RDFextractCSwithTypes", RUNTIME_OBJECT_MISSING);
@@ -4681,6 +4723,10 @@ RDFextractCSwithTypes(int *ret, bat *sba
        
        printFreqCSSet(freqCSset, csBats->freqBat, mbat, 1, *freqThreshold, 
*labels); 
 
+       curNumMergeCS = countNumberMergeCS(freqCSset);
+       printf("Before using rules: Number of freqCS is: %d \n",curNumMergeCS);
+
+
        /* ---------- S1, S2 ------- */
        mergecsId = *maxCSoid + 1; 
 
@@ -4710,13 +4756,13 @@ RDFextractCSwithTypes(int *ret, bat *sba
        mergeCSFreqCSMap = (oid*) malloc(sizeof(oid) * curNumMergeCS);
        initMergeCSFreqCSMap(freqCSset, mergeCSFreqCSMap);
        
-       *csRelMergeFreqSet = generateCsRelBetweenMergeFreqSet(csrelSet, 
freqCSset);
+       tmpCSrelToMergeCS = generateCsRelToMergeFreqSet(csrelSet, freqCSset);
        tmpNumRel = freqCSset->numCSadded; 
 
        /* S6: Merged CS referred from the same CS via the same property */
-       mergeMaxFreqCSByS6(*csRelMergeFreqSet, freqCSset, labels, 
mergeCSFreqCSMap, curNumMergeCS,  &mergecsId, ontmetadata, ontmetadataCount);
-
-       freeCSrelSet(*csRelMergeFreqSet,tmpNumRel);
+       mergeMaxFreqCSByS6(tmpCSrelToMergeCS, freqCSset, labels, 
mergeCSFreqCSMap, curNumMergeCS,  &mergecsId, ontmetadata, ontmetadataCount);
+
+       freeCSrelSet(tmpCSrelToMergeCS,tmpNumRel);
 
        curNumMergeCS = countNumberMergeCS(freqCSset);
        curT = clock(); 
diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h
--- a/monetdb5/extras/rdf/rdfschema.h
+++ b/monetdb5/extras/rdf/rdfschema.h
@@ -173,6 +173,7 @@ typedef struct SubCSSet{
 #define SIM_THRESHOLD 0.6
 #define SIM_TFIDF_THRESHOLD 0.55
 #define IMPORTANCE_THRESHOLD 0.01
+#define MIN_PERCETAGE_S6 10    // Merge all CS refered by more than 
1/MIN_PERCETAGE_S6 percent of a CS via one property
 
 typedef struct CSset{
        CS* items;
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to