MonetDB: rdf - Change the priority in assigning label to type-on...

Minh-Duc Pham Fri, 28 Mar 2014 02:53:26 -0700

Changeset: c8f249486552 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=c8f249486552
Modified Files:
        monetdb5/extras/rdf/rdflabels.c
        monetdb5/extras/rdf/rdflabels.h
        monetdb5/extras/rdf/rdfschema.c
        monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:


Change the priority order for assigning labels to type, then ontology, then FK

Slightly change the TF/IDF computation

Disable S3 (Sub-super) and S5 (FK).


diffs (truncated from 444 to 300 lines):

diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -1472,6 +1472,7 @@ void createOntologyLookupResult(oid** re
 
                // get class names
                resultCount[i] = 0;
+               
                result[i] = getOntologyCandidates(ontattributes, 
ontattributesCount, ontmetadata, ontmetadataCount, &(resultCount[i]), 
propOntologiesOids, propOntologiesCount, ontologyCount, propStat);
 
                for (j = 0; j < ontologyCount; ++j) {
@@ -2082,90 +2083,6 @@ void getTableName(CSlabel* label, int cs
        (void) ontmetaBat;
 
 
-       // --- ONTOLOGY ---
-       // add all ontology candidates to list of candidates
-       if (resultCount[csIdx] >= 1) {
-               label->candidatesOntology = resultCount[csIdx];
-               label->candidates = GDKrealloc(label->candidates, sizeof(oid) * 
(label->candidatesCount + resultCount[csIdx]));
-               if (!label->candidates) fprintf(stderr, "ERROR: Couldn't 
realloc memory!\n");
-               for (i = 0; i < resultCount[csIdx]; ++i) {
-                       label->candidates[label->candidatesCount + i] = 
result[csIdx][i];
-               }
-               label->candidatesCount += resultCount[csIdx];
-       }
-
-       // one ontology class --> use it
-       if (resultCount[csIdx] == 1) {
-               label->name = result[csIdx][0];
-               label->hierarchy = getOntoHierarchy(label->name, 
&(label->hierarchyCount), ontmetadata, ontmetadataCount);
-               nameFound = 1;
-               #if INFO_WHERE_NAME_FROM
-               label->isOntology = 1; 
-               #endif
-       }
-
-       if (!nameFound) {
-               // multiple ontology classes --> intersect with types
-               if (resultCount[csIdx] > 1) {
-                       tmpList = NULL;
-                       tmpListCount = 0;
-                       // search for type values
-                       for (i = 0; i < typeAttributesCount; ++i) {
-                               for (j = 0; j < 
typeAttributesHistogramCount[csIdx][i]; ++j) {
-                                       if 
(typeAttributesHistogram[csIdx][i][j].percent < TYPE_FREQ_THRESHOLD) break; // 
sorted
-
-                                       // intersect type with ontology classes
-                                       for (k = 0; k < resultCount[csIdx]; 
++k) {
-                                               if (result[csIdx][k] == 
typeAttributesHistogram[csIdx][i][j].value) {
-                                                       // found, copy ontology 
class to tmpList
-                                                       tmpList = (oid *) 
realloc(tmpList, sizeof(oid) * (tmpListCount + 1));
-                                                       if (!tmpList) 
fprintf(stderr, "ERROR: Couldn't realloc memory!\n");
-                                                       tmpList[tmpListCount] = 
result[csIdx][k];
-                                                       tmpListCount += 1;
-                                               }
-                                       }
-                               }
-                       }
-
-                       // only one left --> use it
-                       if (tmpListCount == 1) {
-                               label->name = tmpList[0];
-                               label->hierarchy = 
getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata, 
ontmetadataCount);
-                               free(tmpList);
-                               nameFound = 1;
-                               #if INFO_WHERE_NAME_FROM
-                               label->isOntology = 1; 
-                               #endif
-                       }
-
-                       if (!nameFound) {
-                               // multiple left --> use the class that covers 
most attributes, most popular ontology, ...
-                               if (tmpListCount > 1) {
-                                       label->name = tmpList[0]; // sorted
-                                       label->hierarchy = 
getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata, 
ontmetadataCount);
-                                       free(tmpList);
-                                       nameFound = 1;
-                                       
-                                       #if INFO_WHERE_NAME_FROM
-                                       label->isOntology = 1; 
-                                       #endif
-                               }
-                       }
-
-                       if (!nameFound) {
-                               // empty intersection -> use the class that 
covers most attributes, most popular ontology, ..
-                               label->name = result[csIdx][0]; // sorted
-                               label->hierarchy = 
getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata, 
ontmetadataCount);
-                               free(tmpList);
-                               nameFound = 1;
-
-                               #if INFO_WHERE_NAME_FROM
-                               label->isOntology = 1; 
-                               #endif
-                       }
-               }
-       }
-
        // --- TYPE ---
        // get most frequent type value per type attribute
        tmpList = NULL;
@@ -2204,7 +2121,7 @@ void getTableName(CSlabel* label, int cs
                        int freq;
                        if (typeAttributesHistogram[csIdx][i][j].percent < 
TYPE_FREQ_THRESHOLD) break;
                        depth = ontclassSet[BUNfnd(BATmirror(ontmetaBat), 
&typeAttributesHistogram[csIdx][i][j].value)].hierDepth;
-                       maxDepth = ontclassSet[BUNfnd(BATmirror(ontmetaBat), 
&maxDepthOid)].hierDepth;;
+                       maxDepth = ontclassSet[BUNfnd(BATmirror(ontmetaBat), 
&maxDepthOid)].hierDepth;
                        freq = typeAttributesHistogram[csIdx][i][j].freq;
                        if (depth > maxDepth) {
                                // choose value with higher hierarchy level
@@ -2269,6 +2186,95 @@ void getTableName(CSlabel* label, int cs
                }
        }
 
+
+       // --- ONTOLOGY ---
+       // add all ontology candidates to list of candidates
+       if (resultCount[csIdx] >= 1) {
+               label->candidatesOntology = resultCount[csIdx];
+               label->candidates = GDKrealloc(label->candidates, sizeof(oid) * 
(label->candidatesCount + resultCount[csIdx]));
+               if (!label->candidates) fprintf(stderr, "ERROR: Couldn't 
realloc memory!\n");
+               for (i = 0; i < resultCount[csIdx]; ++i) {
+                       label->candidates[label->candidatesCount + i] = 
result[csIdx][i];
+               }
+               label->candidatesCount += resultCount[csIdx];
+       }
+
+       // one ontology class --> use it
+       if (!nameFound){
+       if (resultCount[csIdx] == 1) {
+               label->name = result[csIdx][0];
+               label->hierarchy = getOntoHierarchy(label->name, 
&(label->hierarchyCount), ontmetadata, ontmetadataCount);
+               nameFound = 1;
+               #if INFO_WHERE_NAME_FROM
+               label->isOntology = 1; 
+               #endif
+       }
+       }
+
+       if (!nameFound) {
+               // multiple ontology classes --> intersect with types
+               if (resultCount[csIdx] > 1) {
+                       tmpList = NULL;
+                       tmpListCount = 0;
+                       // search for type values
+                       for (i = 0; i < typeAttributesCount; ++i) {
+                               for (j = 0; j < 
typeAttributesHistogramCount[csIdx][i]; ++j) {
+                                       if 
(typeAttributesHistogram[csIdx][i][j].percent < TYPE_FREQ_THRESHOLD) break; // 
sorted
+
+                                       // intersect type with ontology classes
+                                       for (k = 0; k < resultCount[csIdx]; 
++k) {
+                                               if (result[csIdx][k] == 
typeAttributesHistogram[csIdx][i][j].value) {
+                                                       // found, copy ontology 
class to tmpList
+                                                       tmpList = (oid *) 
realloc(tmpList, sizeof(oid) * (tmpListCount + 1));
+                                                       if (!tmpList) 
fprintf(stderr, "ERROR: Couldn't realloc memory!\n");
+                                                       tmpList[tmpListCount] = 
result[csIdx][k];
+                                                       tmpListCount += 1;
+                                               }
+                                       }
+                               }
+                       }
+
+                       // only one left --> use it
+                       if (tmpListCount == 1) {
+                               label->name = tmpList[0];
+                               label->hierarchy = 
getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata, 
ontmetadataCount);
+                               free(tmpList);
+                               nameFound = 1;
+                               #if INFO_WHERE_NAME_FROM
+                               label->isOntology = 1; 
+                               #endif
+                       }
+
+                       if (!nameFound) {
+                               // multiple left --> use the class that covers 
most attributes, most popular ontology, ...
+                               if (tmpListCount > 1) {
+                                       label->name = tmpList[0]; // sorted
+                                       label->hierarchy = 
getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata, 
ontmetadataCount);
+                                       free(tmpList);
+                                       nameFound = 1;
+                                       
+                                       #if INFO_WHERE_NAME_FROM
+                                       label->isOntology = 1; 
+                                       #endif
+                               }
+                       }
+
+                       if (!nameFound) {
+                               // empty intersection -> use the class that 
covers most attributes, most popular ontology, ..
+                               label->name = result[csIdx][0]; // sorted
+                               label->hierarchy = 
getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata, 
ontmetadataCount);
+                               free(tmpList);
+                               nameFound = 1;
+
+                               #if INFO_WHERE_NAME_FROM
+                               label->isOntology = 1; 
+                               #endif
+                       }
+               }
+       }
+
+
+
        // --- FK ---
        // add top3 fk values to list of candidates
        if (links[csIdx].num > 0) {
diff --git a/monetdb5/extras/rdf/rdflabels.h b/monetdb5/extras/rdf/rdflabels.h
--- a/monetdb5/extras/rdf/rdflabels.h
+++ b/monetdb5/extras/rdf/rdflabels.h
@@ -92,7 +92,8 @@ enum {
 
 #define FK_FREQ_THRESHOLD 25           // X % of the targeted subjects have to 
be in this table
 #define TYPE_FREQ_THRESHOLD 80         // X % of the type values have to be 
this value
-#define ONTOLOGY_FREQ_THRESHOLD 0.4    // similarity threshold for tfidf 
simularity for ontology classes
+//#define ONTOLOGY_FREQ_THRESHOLD 0.4  // similarity threshold for tfidf 
simularity for ontology classes
+#define ONTOLOGY_FREQ_THRESHOLD 0.8    // similarity threshold for tfidf 
simularity for ontology classes
 
 #define USE_SHORT_NAMES 1              // use getPropNameShort()
 #define USE_TYPE_NAMES 1               // use type attribute values for 
labeling
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -121,6 +121,26 @@ static void initcsIdFreqIdxMap(int* inpu
 
 
 
+str printTKNZStringFromOid(oid id){
+       int ret; 
+       char*   schema = "rdf";
+       str propStr; 
+
+       if (TKNZRopen (NULL, &schema) != MAL_SUCCEED) {
+               throw(RDF, "rdf.rdfschema",
+                               "could not open the tokenizer\n");
+       }
+
+       takeOid(id, &propStr);  
+       printf("String for "BUNFMT": %s\n", id, propStr);
+       
+       GDKfree(propStr); 
+       TKNZRclose(&ret);
+
+       return MAL_SUCCEED; 
+}
+
+
 char isCSTable(CS item){
        if (item.parentFreqIdx != -1) return 0; 
 
@@ -2507,6 +2527,11 @@ oid putaCStoHash(CSBats *csBats, oid* ke
                csId = *csoid; 
                addNewCS(csBats, fullPropStat, &csKey, key, csoid, num, 
numTriples, numTypeValues, rdftypeOntologyValues);
 
+               //if (csId == 73){
+               //      printf("Extra info for cs 73 is: ");
+               //      printTKNZStringFromOid(rdftypeOntologyValues[0]);
+               //}
+
                //Handle the case when freqThreshold == 1 
                if (isStoreFreqCS ==1 && freqThreshold == 1){
                        #if STOREFULLCS
@@ -2612,7 +2637,10 @@ static int isSubset(oid* arr1, oid* arr2
  * See http://disi.unitn.it/~bernardi/Courses/DL/Slides_11_12/measures.pdf
  * tf(t,d): Number of times t occurs in d. --> For a CS, tf(prop, aCS) = 1; 
  * idf(t): The rarity of a term t in the whold document collection
- * idf(t) = log(#totalNumOfCSs / #numberCSs_containing_t +1)
+ * idf(t) = log(#totalNumOfCSs / #numberCSs_containing_t)
+ * Note that, some function may use #numberCSs_containing_t + 1 as it can be 
division 
+ * by 0 if the term does not appear in any document. However, in our case, 
+ * every prop must appear in at least one CS
  * tf-idf(t,d,D) = tf(t,d) * idf(t,D)
  *
  * Note that: If we use normalize tf by dividing with maximum tf 
@@ -2621,7 +2649,7 @@ static int isSubset(oid* arr1, oid* arr2
 
 static 
 float tfidfComp(int numContainedCSs, int totalNumCSs){
-       return log((float)totalNumCSs/(1+numContainedCSs)); 
+       return log((float)totalNumCSs/(numContainedCSs)); 
 }
 
 /*
@@ -3014,15 +3042,33 @@ void getPropStatisticsFromMergeCSs(PropS
 
        for (i = 0; i < propStat->numAdded; i++){
                propStat->tfidfs[i] = 
tfidfComp(propStat->freqs[i],curNumMergeCS);
+
        }
 
        //BATprint(propStat->pBat); 
        /*
+       {
+       int ret; 
+       char*   schema = "rdf";
+       str propStr; 
+
+       if (TKNZRopen (NULL, &schema) != MAL_SUCCEED) {
+               printf("Fail in opening Tokenizer \n");
+       }
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

MonetDB: rdf - Change the priority in assigning label to type-on...

Reply via email to