Changeset: af6b114f1b3a for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=af6b114f1b3a
Modified Files:
        monetdb5/extras/rdf/rdf.h
        monetdb5/extras/rdf/rdflabels.c
        monetdb5/extras/rdf/rdfschema.c
        monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

Tune S1 by using an alternative name for a freqCS and checking where the name 
comes from. Also validate the TF-IDF function in rdflabels.c.


diffs (truncated from 501 to 300 lines):

diff --git a/monetdb5/extras/rdf/rdf.h b/monetdb5/extras/rdf/rdf.h
--- a/monetdb5/extras/rdf/rdf.h
+++ b/monetdb5/extras/rdf/rdf.h
@@ -103,6 +103,11 @@ typedef enum {
 
 #define N_GRAPH_BAT (MAP_LEX+1)
 
+// When non-zero, CSlabel carries the isOntology/isType/isFK flags
+#define INFO_WHERE_NAME_FROM 1
+#define TOP_GENERAL_NAME 2     //Level of hierarchy in which a name is 
considered to be a general name
+                               //For example, PERSON and THING are at level 1      
+#define        USE_ALTERNATIVE_NAME 0  //Use a different, possibly better, name 
in place of a general name
+
 // Final data structure that stores the labels for tables and attributes
 typedef struct CSlabel {
        oid             name;           // table name
@@ -116,6 +121,11 @@ typedef struct CSlabel {
        int             hierarchyCount; // number of entries in the hierarchy 
list
        int             numProp;        // number of properties, copied from 
freqCSset->items[x].numProp
        oid             *lstProp;       // attribute names (same order as in 
freqCSset->items[x].lstProp)
+       #if     INFO_WHERE_NAME_FROM    
+       char            isOntology;     // First name is decided by ontology
+       char            isType;         // First name is decided based on Type
+       char            isFK;   
+       #endif
 } CSlabel;
 
 #endif /* _RDF_H_ */
diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -1167,6 +1167,7 @@ oid* getOntologyCandidates(oid** ontattr
                // remove subclass if superclass is in list
                for (k = 0; k < num; ++k) {
                        int found = 0;
+                       printf("    TFIDF score at %d is: %f \n",k, 
classStat[k].tfidfs);
                        if (classStat[k].tfidfs < ONTOLOGY_FREQ_THRESHOLD) 
break; // values not frequent enough (list is sorted by tfidfs)
                        for (j = 0; j < ontmetadataCount && (found == 0); ++j) {
                                oid muri = ontmetadata[0][j];
@@ -1346,8 +1347,19 @@ void createOntologyLookupResult(oid** re
                for (j = 0; j < ontologyCount; ++j) {
                        propOntologiesCount[j] = 0;
                }
+
+               printf("Get ontology for FreqId %d. Orignal numProp = %d \n", 
i, cs.numProp);
+
                propOntologies = findOntologies(cs, propOntologiesCount, 
&propOntologiesOids);
 
+               /*
+               printf("Prop ontologies count. \n");
+               for (j = 0; j < ontologyCount; ++j) {
+                       if (propOntologiesCount[j] > 0)
+                               printf("    (%d) props in ontology %d \n ", 
propOntologiesCount[j], j);
+               }
+               */
+
                // get class names
                resultCount[i] = 0;
                result[i] = getOntologyCandidates(ontattributes, 
ontattributesCount, ontmetadata, ontmetadataCount, &(resultCount[i]), 
propOntologiesOids, propOntologiesCount, ontologyCount, propStat);
@@ -1970,6 +1982,9 @@ void getTableName(CSlabel* label, int cs
                label->name = result[csIdx][0];
                label->hierarchy = getOntoHierarchy(label->name, 
&(label->hierarchyCount), ontmetadata, ontmetadataCount);
                nameFound = 1;
+               #if INFO_WHERE_NAME_FROM
+               label->isOntology = 1; 
+               #endif
        }
 
        if (!nameFound) {
@@ -2001,6 +2016,9 @@ void getTableName(CSlabel* label, int cs
                                label->hierarchy = 
getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata, 
ontmetadataCount);
                                free(tmpList);
                                nameFound = 1;
+                               #if INFO_WHERE_NAME_FROM
+                               label->isOntology = 1; 
+                               #endif
                        }
 
                        if (!nameFound) {
@@ -2010,6 +2028,10 @@ void getTableName(CSlabel* label, int cs
                                        label->hierarchy = 
getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata, 
ontmetadataCount);
                                        free(tmpList);
                                        nameFound = 1;
+                                       
+                                       #if INFO_WHERE_NAME_FROM
+                                       label->isOntology = 1; 
+                                       #endif
                                }
                        }
 
@@ -2019,6 +2041,10 @@ void getTableName(CSlabel* label, int cs
                                label->hierarchy = 
getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata, 
ontmetadataCount);
                                free(tmpList);
                                nameFound = 1;
+
+                               #if INFO_WHERE_NAME_FROM
+                               label->isOntology = 1; 
+                               #endif
                        }
                }
        }
@@ -2060,6 +2086,10 @@ void getTableName(CSlabel* label, int cs
                        // only one type attribute, use most frequent value 
(sorted)
                        label->name = tmpList[0];
                        nameFound = 1;
+                       #if INFO_WHERE_NAME_FROM
+                       label->isType = 1; 
+                       #endif
+
                }
        }
 
@@ -2071,6 +2101,10 @@ void getTableName(CSlabel* label, int cs
                                        if (typeStat[i].value == tmpList[j]) {
                                                label->name = tmpList[j];
                                                nameFound = 1;
+
+                                               #if INFO_WHERE_NAME_FROM
+                                               label->isType = 1; 
+                                               #endif
                                        }
                                }
                        }
@@ -2094,6 +2128,10 @@ void getTableName(CSlabel* label, int cs
                if (links[csIdx].num > 0) {
                        label->name = links[csIdx].fks[0].prop; // sorted
                        nameFound = 1;
+
+                       #if INFO_WHERE_NAME_FROM
+                       label->isFK = 1; 
+                       #endif
                }
        }
 
@@ -2138,6 +2176,11 @@ CSlabel* initLabels(CSset *freqCSset) {
                labels[i].hierarchyCount = 0;
                labels[i].numProp = 0;
                labels[i].lstProp = NULL;
+               #if INFO_WHERE_NAME_FROM
+               labels[i].isOntology = 0; 
+               labels[i].isType = 0; 
+               labels[i].isFK = 0; 
+               #endif
        }
        return labels;
 }
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -2778,7 +2778,7 @@ void generatecsRelSum(CSrel csRel, int f
 
        for (i = 0; i < csRel.numRef; i++){
                freq = freqCSset->items[csRel.origFreqIdx].support; 
-               if (freq < csRel.lstCnt[i] * MIN_PERCETAGE_S6){                 
+               if (freq > MIN_FROMTABLE_SIZE_S6 && freq < csRel.lstCnt[i] * 
MIN_PERCETAGE_S6){                 
                        propIdx = 0;
                        while (csRelSum->lstPropId[propIdx] != 
csRel.lstPropId[i])
                                propIdx++;
@@ -2811,6 +2811,38 @@ LabelStat* initLabelStat(void){
        return labelStat; 
 }
 
+/*
+ * Pick the name to use for candidate candIdx of the freqCS at freqIdx.
+ *
+ * The candidate is looked up in the hierarchy list of the label. It is
+ * returned unchanged when the hierarchy is not searched
+ * (hierarchyCount <= 1), when it does not appear in the hierarchy, or when
+ * it is found at an index greater than TOP_GENERAL_NAME. Otherwise it is
+ * considered too general and the next candidate (candIdx+1) is returned
+ * instead, provided there is one.
+ * */
+#if USE_ALTERNATIVE_NAME 
+static
+oid getMostSuitableName(CSlabel *labels, int freqIdx, int candIdx){
+       oid candidate; 
+       int i; 
+       candidate = labels[freqIdx].candidates[candIdx];
+
+       // No hierarchy to search: keep the candidate. Returning here also
+       // guarantees that i is never read before it has been assigned.
+       if (labels[freqIdx].hierarchyCount <= 1)
+               return candidate; 
+
+       for (i = 0; i < labels[freqIdx].hierarchyCount; i++){
+               if (labels[freqIdx].hierarchy[i] == candidate) break;
+       }
+       
+       if (i == labels[freqIdx].hierarchyCount)  // Not in the hierarchy
+               return candidate; 
+       else if (i > TOP_GENERAL_NAME)          // Not a too general candidate
+               return candidate; 
+       else if ((candIdx+1) < labels[freqIdx].candidatesCount){
+               // Too general: fall back to the next candidate
+               return labels[freqIdx].candidates[candIdx+1];
+       }
+               
+       //No choice                     
+       return candidate; 
+
+}
+#endif
+
 static
 void buildLabelStat(LabelStat *labelStat, CSlabel *labels, CSset *freqCSset, 
int k){
        int     i,j; 
@@ -2826,7 +2858,11 @@ void buildLabelStat(LabelStat *labelStat
                if (labels[i].name != BUN_NONE){
                        numCheck = (labels[i].candidatesCount > 
k)?k:labels[i].candidatesCount;
                        for (j = 0; j < numCheck; j++){
+                               #if USE_ALTERNATIVE_NAME
+                               candidate = getMostSuitableName(labels, i, j);
+                               #else
                                candidate = labels[i].candidates[j];
+                               #endif
                                bun = 
BUNfnd(BATmirror(labelStat->labelBat),(ptr) &candidate);
                                if (bun == BUN_NONE) {
                                        /*New string*/
@@ -2874,7 +2910,11 @@ void buildLabelStat(LabelStat *labelStat
                if (labels[i].name != BUN_NONE){
                        numCheck = (labels[i].candidatesCount > 
k)?k:labels[i].candidatesCount;
                        for (j = 0; j < numCheck; j++){
+                               #if USE_ALTERNATIVE_NAME
+                               candidate = getMostSuitableName(labels, i, j);
+                               #else
                                candidate = labels[i].candidates[j];
+                               #endif
                                bun = 
BUNfnd(BATmirror(labelStat->labelBat),(ptr) &candidate);
                                if (bun == BUN_NONE) {
                                        fprintf(stderr, "All the name should be 
stored already!\n");
@@ -2903,18 +2943,63 @@ void freeLabelStat(LabelStat *labelStat)
        free(labelStat);
 }
 
+/*
+ * Merge cs1 and cs2 (passed together with freqId1 and freqId2) and call
+ * updateLabel() for the merged CS that results.
+ *
+ * The action taken depends on parentFreqIdx of the two CS's:
+ *  - both are -1: a new merged CS is built with mergeTwoCSs() and appended
+ *    to freqCSset; mergecsId[0] is incremented
+ *  - exactly one is -1: that CS is added to the merged CS the other one
+ *    already belongs to
+ *  - both are set but differ: the two merged CS's are combined with
+ *    mergeTwomergeCS() and every member of the second one is re-pointed to
+ *    the parent of cs1
+ *  - both are set and equal: nothing is done
+ *
+ * ruleNum, labels, ontmetadata, ontmetadataCount and name are only passed
+ * through to updateLabel().
+ */
+static 
+void doMerge(CSset *freqCSset, int ruleNum, CS* cs1, CS* cs2, int freqId1, int 
freqId2, oid *mergecsId, CSlabel** labels, oid** ontmetadata, int 
ontmetadataCount, oid name){
+       CS      *mergecs; 
+       int             existMergecsId; 
+       CS              *existmergecs, *mergecs1, *mergecs2; 
+       int     k; 
+
+       //Check whether these CS's belong to any mergeCS
+       if (cs1->parentFreqIdx == -1 && cs2->parentFreqIdx == -1){      /* New 
merge */
+               mergecs = mergeTwoCSs(*cs1,*cs2, freqId1,freqId2, *mergecsId);
+               //addmergeCStoSet(mergecsSet, *mergecs);
+               // The parent index is taken from numCSadded before the add;
+               // updateLabel() below gets numCSadded - 1 after the add.
+               // NOTE(review): presumably addCStoSet() increments numCSadded
+               // by one, making both values the same index -- confirm.
+               cs1->parentFreqIdx = freqCSset->numCSadded;
+               cs2->parentFreqIdx = freqCSset->numCSadded;
+               addCStoSet(freqCSset,*mergecs);
+               updateLabel(ruleNum, freqCSset, labels, 1, 
freqCSset->numCSadded - 1, freqId1, freqId2, name, ontmetadata, 
ontmetadataCount, NULL, -1);
+               // *mergecs was handed to addCStoSet() by value; only the
+               // struct returned by mergeTwoCSs() is released here.
+               free(mergecs);
+
+               mergecsId[0]++;
+       }
+       // Only cs2 already has a parent: add cs1 to that merged CS
+       else if (cs1->parentFreqIdx == -1 && cs2->parentFreqIdx != -1){
+               existMergecsId = cs2->parentFreqIdx;
+               existmergecs = &(freqCSset->items[existMergecsId]);
+               mergeACStoExistingmergeCS(*cs1,freqId1, existmergecs);
+               cs1->parentFreqIdx = existMergecsId; 
+               updateLabel(ruleNum, freqCSset, labels, 0, existMergecsId, 
freqId1, freqId2, name, ontmetadata, ontmetadataCount, NULL, -1);
+       }
+       
+       // Only cs1 already has a parent: add cs2 to that merged CS
+       else if (cs1->parentFreqIdx != -1 && cs2->parentFreqIdx == -1){
+               existMergecsId = cs1->parentFreqIdx;
+               existmergecs = &(freqCSset->items[existMergecsId]);
+               mergeACStoExistingmergeCS(*cs2,freqId2, existmergecs);
+               cs2->parentFreqIdx = existMergecsId; 
+               updateLabel(ruleNum, freqCSset, labels, 0, existMergecsId, 
freqId1, freqId2, name, ontmetadata, ontmetadataCount, NULL, -1);
+       }
+       // Both have a parent and the parents differ: combine the two merged
+       // CS's. Nothing is done when both already share the same parent.
+       else if (cs1->parentFreqIdx != cs2->parentFreqIdx){
+               mergecs1 = &(freqCSset->items[cs1->parentFreqIdx]);
+               mergecs2 = &(freqCSset->items[cs2->parentFreqIdx]);
+               
+               mergeTwomergeCS(mergecs1, mergecs2, cs1->parentFreqIdx);
+
+               //Re-map for all maxCS in mergecs2
+               for (k = 0; k < mergecs2->numConsistsOf; k++){
+                       
freqCSset->items[mergecs2->lstConsistsOf[k]].parentFreqIdx = cs1->parentFreqIdx;
+               }
+               updateLabel(ruleNum, freqCSset, labels, 0, cs1->parentFreqIdx, 
freqId1, freqId2, name, ontmetadata, ontmetadataCount, NULL, -1);
+       }
+
+}
 
 static
-void mergeMaxFreqCSByS1(CSset *freqCSset, CSlabel** labels, oid *mergecsId, 
oid** ontmetadata, int ontmetadataCount){
+str mergeMaxFreqCSByS1(CSset *freqCSset, CSlabel** labels, oid *mergecsId, 
oid** ontmetadata, int ontmetadataCount){
        int             i; 
 
        #if !USE_MULTIWAY_MERGING
-       int             j,k;
+       int             j, k;
        int             freqId1, freqId2;
-       CS              *mergecs;
-       int             existMergecsId; 
        CS              *cs1, *cs2;
-       CS              *existmergecs, *mergecs1, *mergecs2; 
        #else
        int             *lstDistinctFreqId = NULL;              
        int             numDistinct = 0;
@@ -2923,6 +3008,17 @@ void mergeMaxFreqCSByS1(CSset *freqCSset
        #endif
        LabelStat       *labelStat = NULL; 
        oid             *name;
+       #if OUTPUT_FREQID_PER_LABEL
+       FILE            *fout;
+       char*           schema = "rdf";
+       int             ret = 0;
+       str             tmpLabel; 
+       int             tmpCount; 
+       
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to