Changeset: da0c1bd43bd3 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=da0c1bd43bd3
Modified Files:
        monetdb5/extras/rdf/rdflabels.c
        monetdb5/extras/rdf/rdflabels.h
Branch: rdf
Log Message:

Store type hierarchy for type values

Suggested by Peter
Instead of storing the leaf value per subject, store the whole hierarchy. By 
doing so, the frequencies are summed up on the more general levels of the 
hierarchy.
For example, 40% Politicians and 50% Athletes in a CS will be represented as 
(90% Thing, 90% Agent, 90% Person, 50% Athlete, 40% Politician), resulting in 
label candidate "Person" when threshold is set to 80%.


diffs (173 lines):

diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -851,47 +851,17 @@ int compareTypeAttributesFreqs (const vo
 #endif
 
 #if USE_TYPE_NAMES
-/* Analyze hierarchy in a list of type values, add all leaf values to the 
histogram. Values that are not present in the hierarchy tree built from the 
ontologies are NOT added to the histogram. */
+/* Add type values to the histogram. Values that are not present in the 
hierarchy tree built from the ontologies are NOT added to the histogram. */
 static
-void insertLeafsIntoTypeAttributesHistogram(oid* typeList, int typeListLength, 
TypeAttributesFreq*** typeAttributesHistogram, int** 
typeAttributesHistogramCount, int csFreqIdx, int type, BAT *ontmetaBat, 
OntClass *ontclassSet) {
-       int             i, j, k;
+void insertValuesIntoTypeAttributesHistogram(oid* typeList, int 
typeListLength, TypeAttributesFreq*** typeAttributesHistogram, int** 
typeAttributesHistogramCount, int csFreqIdx, int type, BAT *ontmetaBat) {
+       int             i, j;
        int             fit;
-       char            *leaf; // flag whether a type value in 'typeList' is a 
leaf (1) or not (0)
-       BUN             pos;
-       OntClass        hierarchy;
-
-       // start with: every type value is a leaf
-       leaf = GDKmalloc(sizeof(char) * typeListLength);
-       for (i = 0; i < typeListLength; ++i) leaf[i] = 1;
-
-       // analyze hierarchy
+
        for (i = 0; i < typeListLength; ++i) {
-               if (!leaf[i]) continue;
-               pos = BUNfnd(BATmirror(ontmetaBat), &typeList[i]);
-               if (pos == BUN_NONE) {
-                       // no ontology information for this type value, 
therefore it is not added to the hierarchy
-                       leaf[i] = 0;
-                       continue;
-               }
-
-               // get hierarchy of this type value
-               hierarchy = ontclassSet[pos];
-
-               // loop over superclasses, set leaf=0
-               for (j = 0; j < hierarchy.numsc; ++j) {
-                       for (k = 0; k < typeListLength; ++k) {
-                               if (i == k) continue;
-                               if (ontclassSet[hierarchy.scIdxes[j]].cOid == 
typeList[k]) {
-                                       // found superclass at position 'k'
-                                       leaf[k] = 0;
-                               }
-                       }
-               }
-       }
-
-       // add all leafs to the histogram
-       for (i = 0; i < typeListLength; ++i) {
-               if (!leaf[i]) continue;
+               BUN pos = BUNfnd(BATmirror(ontmetaBat), &typeList[i]);
+               if (pos == BUN_NONE) continue; // no ontology information, 
ignore
+
+               // add to histogram
                fit = 0;
                for (j = 0; j < typeAttributesHistogramCount[csFreqIdx][type]; 
++j) {
                        if (typeAttributesHistogram[csFreqIdx][type][j].value 
== typeList[i]) {
@@ -913,13 +883,11 @@ void insertLeafsIntoTypeAttributesHistog
                        
typeAttributesHistogram[csFreqIdx][type][typeAttributesHistogramCount[csFreqIdx][type]
 - 1].freq = 1;
                }
        }
-
-       GDKfree(leaf);
 }
 
 /* Loop through all subjects to collect frequency statistics for type 
attribute values. */
 static
-void createTypeAttributesHistogram(BAT *sbat, BATiter si, BATiter pi, BATiter 
oi, oid *subjCSMap, CSset *freqCSset, int *csIdFreqIdxMap, int 
typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** 
typeAttributesHistogramCount, char** typeAttributes, BAT *ontmetaBat, OntClass 
*ontclassSet) {
+void createTypeAttributesHistogram(BAT *sbat, BATiter si, BATiter pi, BATiter 
oi, oid *subjCSMap, CSset *freqCSset, int *csIdFreqIdxMap, int 
typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** 
typeAttributesHistogramCount, char** typeAttributes, BAT *ontmetaBat) {
        // looping, extracting
        BUN             p, q;
        oid             *sbt, *obt, *pbt;
@@ -987,7 +955,7 @@ void createTypeAttributesHistogram(BAT *
                                        } else {
                                                // analyze values and add to 
histogram
                                                csFreqIdx = 
csIdFreqIdxMap[subjCSMap[curS]]; // get csFreqIdx of last subject
-                                               
insertLeafsIntoTypeAttributesHistogram(typeValues, typeValuesSize, 
typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT, 
ontmetaBat, ontclassSet);
+                                               
insertValuesIntoTypeAttributesHistogram(typeValues, typeValuesSize, 
typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT, 
ontmetaBat);
                                                typeValuesSize = 0; // reset
                                        }
                                        curS = *sbt;
@@ -1009,7 +977,7 @@ void createTypeAttributesHistogram(BAT *
        // analyze and add last set of typeValues
        if (curS != BUN_NONE && typeValuesSize != 0) {
                csFreqIdx = csIdFreqIdxMap[subjCSMap[curS]]; // get csFreqIdx 
of last subject
-               insertLeafsIntoTypeAttributesHistogram(typeValues, 
typeValuesSize, typeAttributesHistogram, typeAttributesHistogramCount, 
csFreqIdx, curT, ontmetaBat, ontclassSet);
+               insertValuesIntoTypeAttributesHistogram(typeValues, 
typeValuesSize, typeAttributesHistogram, typeAttributesHistogramCount, 
csFreqIdx, curT, ontmetaBat);
        }
 
        GDKfree(typeValues);
@@ -1024,14 +992,10 @@ void createTypeAttributesHistogram(BAT *
        // assign percentage
        for (i = 0; i < freqCSset->numCSadded; ++i) {
                for (j = 0; j < typeAttributesCount; ++j) {
-                       int sum = 0;
-                       // get total count of values
-                       for (k = 0; k < typeAttributesHistogramCount[i][j]; 
++k) {
-                               sum += typeAttributesHistogram[i][j][k].freq;
-                       }
                        // assign percentage values for every value
                        for (k = 0; k < typeAttributesHistogramCount[i][j]; 
++k) {
-                               typeAttributesHistogram[i][j][k].percent = 
(int) (100.0 * typeAttributesHistogram[i][j][k].freq / sum + 0.5);
+                               typeAttributesHistogram[i][j][k].percent = 
(int) (100.0 * typeAttributesHistogram[i][j][k].freq / 
freqCSset->items[i].support + 0.5);
+
                        }
                }
        }
@@ -2111,10 +2075,11 @@ void getTableName(CSlabel* label, int cs
        oid             *tmpList;
        int             tmpListCount;
        char            nameFound = 0;
+       oid             maxDepthOid;
+       int             maxFreq;
 
 
        (void) ontmetaBat;
-       (void) ontclassSet;
 
 
        // --- ONTOLOGY ---
@@ -2230,7 +2195,28 @@ void getTableName(CSlabel* label, int cs
                if (typeAttributesHistogram[csIdx][i][0].percent < 
TYPE_FREQ_THRESHOLD) continue; // sorted
                tmpList = (oid *) realloc(tmpList, sizeof(oid) * (tmpListCount 
+ 1));
                if (!tmpList) fprintf(stderr, "ERROR: Couldn't realloc 
memory!\n");
-               tmpList[tmpListCount] = 
typeAttributesHistogram[csIdx][i][0].value;
+
+               // of all values that are >= TYPE_FREQ_THRESHOLD, choose the 
value with the highest hierarchy level ("deepest" value)
+               maxDepthOid = typeAttributesHistogram[csIdx][i][0].value;
+               maxFreq = typeAttributesHistogram[csIdx][i][0].freq;
+               for (j = 1; j < typeAttributesHistogramCount[csIdx][i]; ++j) {
+                       int depth, maxDepth;
+                       int freq;
+                       if (typeAttributesHistogram[csIdx][i][j].percent < 
TYPE_FREQ_THRESHOLD) break;
+                       depth = ontclassSet[BUNfnd(BATmirror(ontmetaBat), 
&typeAttributesHistogram[csIdx][i][j].value)].hierDepth;
+                       maxDepth = ontclassSet[BUNfnd(BATmirror(ontmetaBat), 
&maxDepthOid)].hierDepth;;
+                       freq = typeAttributesHistogram[csIdx][i][j].freq;
+                       if (depth > maxDepth) {
+                               // choose value with higher hierarchy level
+                               maxDepthOid = 
typeAttributesHistogram[csIdx][i][j].value;
+                               maxFreq = freq;
+                       } else if (depth == maxDepth && freq > maxFreq) {
+                               // if both values are on the same level, choose 
the value with higher frequency
+                               maxDepthOid = 
typeAttributesHistogram[csIdx][i][j].value;
+                               maxFreq = freq;
+                       }
+               }
+               tmpList[tmpListCount] = maxDepthOid;
                tmpListCount += 1;
        }
 
@@ -2735,7 +2721,7 @@ CSlabel* createLabels(CSset* freqCSset, 
        typeAttributesHistogramCount = 
initTypeAttributesHistogramCount(typeAttributesCount, freqCSset->numCSadded);
        typeAttributesHistogram = 
initTypeAttributesHistogram(typeAttributesCount, freqCSset->numCSadded);
 #if USE_TYPE_NAMES
-       createTypeAttributesHistogram(sbat, si, pi, oi, subjCSMap, freqCSset, 
csIdFreqIdxMap, typeAttributesCount, typeAttributesHistogram, 
typeAttributesHistogramCount, typeAttributes, ontmetaBat, ontclassSet);
+       createTypeAttributesHistogram(sbat, si, pi, oi, subjCSMap, freqCSset, 
csIdFreqIdxMap, typeAttributesCount, typeAttributesHistogram, 
typeAttributesHistogramCount, typeAttributes, ontmetaBat);
        typeStat = getTypeStats(&typeStatCount, freqCSset->numCSadded, 
typeAttributesCount, typeAttributesHistogram, typeAttributesHistogramCount);
 #else
        (void) sbat;
diff --git a/monetdb5/extras/rdf/rdflabels.h b/monetdb5/extras/rdf/rdflabels.h
--- a/monetdb5/extras/rdf/rdflabels.h
+++ b/monetdb5/extras/rdf/rdflabels.h
@@ -91,7 +91,7 @@ enum {
 } RULE; 
 
 #define FK_FREQ_THRESHOLD 25           // X % of the targeted subjects have to 
be in this table
-#define TYPE_FREQ_THRESHOLD 30         // X % of the type values have to be 
this value
+#define TYPE_FREQ_THRESHOLD 80         // X % of the type values have to be 
this value
 #define ONTOLOGY_FREQ_THRESHOLD 0.4    // similarity threshold for tfidf 
simularity for ontology classes
 
 #define USE_SHORT_NAMES 1              // use getPropNameShort()
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to