Changeset: a5f2a79c980b for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=a5f2a79c980b
Modified Files:
        monetdb5/extras/rdf/rdflabels.c
        monetdb5/extras/rdf/rdfschema.c
        monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

Add hierarchy information for an ontology-based name regardless of where the
name comes from.

A name can be assigned based on the type value, ontology-class similarity, or
FK. Regardless of its source, if the name is found among the ontology classes,
we build the ontology hierarchy for that name.
This prevents the problem that occurred when most of the CS's from the dbpsb
dataset got their names from their type values. These type values are also
ontology classes, but we did not build any hierarchy information for those CS's.

Be stricter with the merging rule that uses the common ancestor: set the
importance threshold (IMPORTANCE_THRESHOLD) to 0.001 (previously 0.01).


diffs (226 lines):

diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -2186,6 +2186,7 @@ void getTableName(CSlabel* label, int cs
                maxFreq = typeAttributesHistogram[csIdx][i][0].freq;
                ontClassPos = BUNfnd(BATmirror(ontmetaBat), &maxDepthOid);
                if ( ontClassPos != BUN_NONE){
+                       foundOntologyTypeValue = 1;
                        maxDepth = ontclassSet[ontClassPos].hierDepth;
                }       
                else{
@@ -2332,7 +2333,6 @@ void getTableName(CSlabel* label, int cs
        
        if (choosenOntologyTypeValue == BUN_NONE && resultCount[csIdx] >= 1){
                label->name = result[csIdx][bestOntCandIdx];
-               label->hierarchy = getOntoHierarchy(label->name, 
&(label->hierarchyCount), ontmetadata, ontmetadataCount);
                nameFound = 1;
                #if INFO_WHERE_NAME_FROM
                label->isOntology = 1; 
@@ -2369,6 +2369,16 @@ void getTableName(CSlabel* label, int cs
                }
        }
        
+       
+       //Add hierarchy information for ontology-based name
+       if (nameFound){
+               ontClassPos = BUNfnd(BATmirror(ontmetaBat), &(label->name));
+               if ( ontClassPos != BUN_NONE){
+                       label->hierarchy = getOntoHierarchy(label->name, 
&(label->hierarchyCount), ontmetadata, ontmetadataCount);
+               }
+       }
+
+
        //if no name is found, check again the typecount to assign a name
        #if USE_BEST_TYPEVALUE_INSTEADOF_DUMMY
        if (!nameFound){
@@ -2610,11 +2620,10 @@ void printTree(OntoUsageNode* tree, int 
 }
 
 static
-void createOntoUsageTree(OntoUsageNode** tree, CSset* freqCSset, oid** 
ontmetadata, int ontmetadataCount, oid** result, int* resultCount, int 
typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** 
typeAttributesHistogramCount) {
-       int             i, j, k, l;
-       oid             *tmpList;
-       int             tmpListCount;
+void createOntoUsageTree(OntoUsageNode** tree, CSset* freqCSset, oid** 
ontmetadata, int ontmetadataCount, BAT *ontmetaBat,CSlabel* labels) {
+       int             i;
        int             numTuples = 0;
+       BUN             pos; 
 
        // init tree with an artifical root node
        (*tree) = (OntoUsageNode *) malloc(sizeof(OntoUsageNode));
@@ -2633,46 +2642,12 @@ void createOntoUsageTree(OntoUsageNode**
                int             hierarchyCount = 0;
                oid*            hierarchy;
 
-               // get ontology
-               // copied from getTableName
-               if (resultCount[i] == 0) {
-                       // no hierarchy --> ignore
-                       continue;
-               } else if (resultCount[i] == 1) {
-                       // one ontology class --> use it
-                       uri = result[i][0];
-               } else {
-                       // multiple ontology classes --> intersect with types
-                       tmpList = NULL;
-                       tmpListCount = 0;
-                       // search for type values
-                       for (l = 0; l < typeAttributesCount; ++l) {
-                               for (j = 0; j < 
typeAttributesHistogramCount[i][l]; ++j) {
-                                       if 
(typeAttributesHistogram[i][l][j].percent < TYPE_FREQ_THRESHOLD) break; // 
sorted
-                                       // intersect type with ontology classes
-                                       for (k = 0; k < resultCount[i]; ++k) {
-                                               if (result[i][k] == 
typeAttributesHistogram[i][l][j].value) {
-                                                       // found, copy ontology 
class to tmpList
-                                                       tmpList = (oid *) 
realloc(tmpList, sizeof(oid) * (tmpListCount + 1));
-                                                       if (!tmpList) 
fprintf(stderr, "ERROR: Couldn't realloc memory!\n");
-                                                       tmpList[tmpListCount] = 
result[i][k];
-                                                       tmpListCount += 1;
-                                               }
-                                       }
-                               }
-                       }
-                       if (tmpListCount == 1) {
-                               // only one left --> use it
-                               uri = tmpList[0];
-                       } else if (tmpListCount > 1) {
-                               // multiple left --> use the class that covers 
most attributes, most popular ontology, ...
-                               uri = tmpList[0]; // sorted
-                       } else {
-                               // empty intersection -> use the class that 
covers most attributes, most popular ontology, ..
-                               uri = result[i][0]; // sorted
-                       }
-                       free(tmpList);
-               }
+               uri = labels[i].name;   
+               if (uri == BUN_NONE) continue;  //No name freqCS
+       
+               //Check if the name is ontology name    
+               pos = BUNfnd(BATmirror(ontmetaBat), &uri);
+               if (pos == BUN_NONE) continue; // no ontology information, 
ignore
 
                // get ontology hierarchy
                hierarchy = getOntoHierarchy(uri, &hierarchyCount, ontmetadata, 
ontmetadataCount);
@@ -2872,7 +2847,7 @@ CSlabel* createLabels(CSset* freqCSset, 
 #endif
 
        // Collect ontology statistics (tree)
-       createOntoUsageTree(ontoUsageTree, freqCSset, ontmetadata, 
ontmetadataCount, ontologyLookupResult, ontologyLookupResultCount, 
typeAttributesCount, typeAttributesHistogram, typeAttributesHistogramCount);
+       createOntoUsageTree(ontoUsageTree, freqCSset, ontmetadata, 
ontmetadataCount, ontmetaBat, labels);
 
        free(ontologyLookupResultCount);
        freeOntologyLookupResult(ontologyLookupResult, 
ontologyLookupResutMatchedProp, freqCSset->numCSadded);
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -3868,6 +3868,7 @@ char isSemanticSimilar(int freqId1, int 
        int hCount1, hCount2; 
        int level; 
        OntoUsageNode *tmpNode; 
+               
        /*
        int k1, k2; 
        if (labels[freqId1].name == labels[freqId2].name)
@@ -3887,7 +3888,7 @@ char isSemanticSimilar(int freqId1, int 
                }
        }
        */
-
+       
        // Check for the most common ancestor
        hCount1 = labels[freqId1].hierarchyCount;
        hCount2 = labels[freqId2].hierarchyCount;
@@ -3899,17 +3900,17 @@ char isSemanticSimilar(int freqId1, int 
        printf("Finding common ancestor for %d and %d \n", freqId1, freqId2 );
        printf("FreqCS1: ");
        for (i = 0; i < hCount1; i++){
-               printf("  %s", labels[freqId1].hierarchy[hCount1-1-i]);
+               printf(" " BUNFMT, labels[freqId1].hierarchy[hCount1-1-i]);
        }
        printf(" \n ");
        printf("FreqCS2: ");
        for (i = 0; i < hCount2; i++){
-               printf("  %s", labels[freqId2].hierarchy[hCount2-1-i]);
+               printf(" " BUNFMT, labels[freqId2].hierarchy[hCount2-1-i]);
        }
        printf(" \n ");
        }
-
        */
+
        
        if (0){
        if ((freqId1 > numOrigFreqCS -1) || (freqId2 > numOrigFreqCS -1))
@@ -3951,8 +3952,10 @@ char isSemanticSimilar(int freqId1, int 
                        oid classOid;
                        BUN ontClassPos;
                        classOid = tmpNode->uri;
+
                        ontClassPos = BUNfnd(BATmirror(ontmetaBat), &classOid); 
                        assert(ontClassPos != BUN_NONE);        
+                       
                        /*
                        if (ontClassPos != BUN_NONE){
                                printf(" Specific level: %d \n", 
ontclassSet[ontClassPos].hierDepth);
@@ -4031,8 +4034,6 @@ void mergeCSByS2S4(CSset *freqCSset, CSl
        oid             name;           /* Name of the common ancestor */
        TFIDFInfo       *tfidfInfos;
        
-
-       
        (void) labels;
        (void) isLabelComparable;
 
@@ -4193,12 +4194,14 @@ void mergeCSByS2(CSset *freqCSset, CSlab
                #if     NOT_MERGE_DIMENSIONCS
                if (freqCSset->items[freqId1].type == DIMENSIONCS) continue; 
                #endif
+
+               if ((*labels)[freqId1].hierarchyCount < 1) continue; 
+
                for (j = (i+1); j < curNumMergeCS; j++){
                        freqId2 = mergeCSFreqCSMap[j];
                        #if     NOT_MERGE_DIMENSIONCS
                        if (freqCSset->items[freqId2].type == DIMENSIONCS) 
continue; 
                        #endif
-                       
                        if (isLabelComparable == 1 && 
isSemanticSimilar(freqId1, freqId2, (*labels), 
ontoUsageTree,freqCSset->numOrigFreqCS, &name, ontmetaBat, ontclassSet) == 1){
                                //printf("Same labels between freqCS %d and 
freqCS %d - Old simscore is %f \n", freqId1, freqId2, simscore);
                                doMerge(freqCSset, S2, freqId1, freqId2, 
mergecsId, labels, ontmetadata, ontmetadataCount, name);
@@ -4272,6 +4275,7 @@ void mergeCSByS4(CSset *freqCSset, CSlab
                        #else   
                        if (simscore > SIM_THRESHOLD) {
                        #endif  
+                               //printf("   Similarity score (%d and %d) 
cosine = %f \n", freqId1,freqId2,simscore);
                                /*
                                if ((*labels)[freqId1].name != BUN_NONE){
                                        takeOid((*labels)[freqId1].name, 
&freqCSname1);
@@ -7526,7 +7530,6 @@ RDFextractCSwithTypes(int *ret, bat *sba
        
        curNumMergeCS = countNumberMergeCS(freqCSset);
        printf("Before using rules: Number of freqCS is: %d \n",curNumMergeCS);
-
        
        /* ---------- S1 ------- */
        mergecsId = *maxCSoid + 1; 
@@ -7572,7 +7575,6 @@ RDFextractCSwithTypes(int *ret, bat *sba
        mergeCSFreqCSMap = (oid*) malloc(sizeof(oid) * curNumMergeCS);
        initMergeCSFreqCSMap(freqCSset, mergeCSFreqCSMap);
        
-
        /* S5: Merged CS referred from the same CS via the same property */
        if (1){
        tmpCSrelToMergeCS = generateCsRelToMergeFreqSet(csrelSet, freqCSset);
diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h
--- a/monetdb5/extras/rdf/rdfschema.h
+++ b/monetdb5/extras/rdf/rdfschema.h
@@ -238,7 +238,7 @@ typedef struct SubCSSet{
 #define SIM_THRESHOLD 0.6
 //#define SIM_TFIDF_THRESHOLD 0.55
 #define SIM_TFIDF_THRESHOLD 0.75
-#define IMPORTANCE_THRESHOLD 0.01 //This is used when merging CS's by common 
ancestor
+#define IMPORTANCE_THRESHOLD 0.001 //This is used when merging CS's by common 
ancestor
 #define COMMON_ANCESTOR_LOWEST_SPECIFIC_LEVEL 2 
 
 #define MIN_PERCETAGE_S5 5     // Merge all CS refered by more than 
1/MIN_PERCETAGE_S6 percent of a CS via one property
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to