Changeset: 944815cdd7d6 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=944815cdd7d6
Modified Files:
        monetdb5/extras/rdf/rdflabels.c
        monetdb5/extras/rdf/rdflabels.h
Branch: rdf
Log Message:

Merge with Linnea's changes in rdflabels


diffs (188 lines):

diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -851,47 +851,17 @@ int compareTypeAttributesFreqs (const vo
 #endif
 
 #if USE_TYPE_NAMES
-/* Analyze hierarchy in a list of type values, add all leaf values to the 
histogram. Values that are not present in the hierarchy tree built from the 
ontologies are NOT added to the histogram. */
+/* Add type values to the histogram. Values that are not present in the 
hierarchy tree built from the ontologies are NOT added to the histogram. */
 static
-void insertLeafsIntoTypeAttributesHistogram(oid* typeList, int typeListLength, 
TypeAttributesFreq*** typeAttributesHistogram, int** 
typeAttributesHistogramCount, int csFreqIdx, int type, BAT *ontmetaBat, 
OntClass *ontclassSet) {
-       int             i, j, k;
+void insertValuesIntoTypeAttributesHistogram(oid* typeList, int 
typeListLength, TypeAttributesFreq*** typeAttributesHistogram, int** 
typeAttributesHistogramCount, int csFreqIdx, int type, BAT *ontmetaBat) {
+       int             i, j;
        int             fit;
-       char            *leaf; // flag whether a type value in 'typeList' is a 
leaf (1) or not (0)
-       BUN             pos;
-       OntClass        hierarchy;
-
-       // start with: every type value is a leaf
-       leaf = GDKmalloc(sizeof(char) * typeListLength);
-       for (i = 0; i < typeListLength; ++i) leaf[i] = 1;
-
-       // analyze hierarchy
+
        for (i = 0; i < typeListLength; ++i) {
-               if (!leaf[i]) continue;
-               pos = BUNfnd(BATmirror(ontmetaBat), &typeList[i]);
-               if (pos == BUN_NONE) {
-                       // no ontology information for this type value, 
therefore it is not added to the hierarchy
-                       leaf[i] = 0;
-                       continue;
-               }
-
-               // get hierarchy of this type value
-               hierarchy = ontclassSet[pos];
-
-               // loop over superclasses, set leaf=0
-               for (j = 0; j < hierarchy.numsc; ++j) {
-                       for (k = 0; k < typeListLength; ++k) {
-                               if (i == k) continue;
-                               if (ontclassSet[hierarchy.scIdxes[j]].cOid == 
typeList[k]) {
-                                       // found superclass at position 'k'
-                                       leaf[k] = 0;
-                               }
-                       }
-               }
-       }
-
-       // add all leafs to the histogram
-       for (i = 0; i < typeListLength; ++i) {
-               if (!leaf[i]) continue;
+               BUN pos = BUNfnd(BATmirror(ontmetaBat), &typeList[i]);
+               if (pos == BUN_NONE) continue; // no ontology information, 
ignore
+
+               // add to histogram
                fit = 0;
                for (j = 0; j < typeAttributesHistogramCount[csFreqIdx][type]; 
++j) {
                        if (typeAttributesHistogram[csFreqIdx][type][j].value 
== typeList[i]) {
@@ -913,13 +883,11 @@ void insertLeafsIntoTypeAttributesHistog
                        
typeAttributesHistogram[csFreqIdx][type][typeAttributesHistogramCount[csFreqIdx][type]
 - 1].freq = 1;
                }
        }
-
-       GDKfree(leaf);
 }
 
 /* Loop through all subjects to collect frequency statistics for type 
attribute values. */
 static
-void createTypeAttributesHistogram(BAT *sbat, BATiter si, BATiter pi, BATiter 
oi, oid *subjCSMap, CSset *freqCSset, int *csIdFreqIdxMap, int 
typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** 
typeAttributesHistogramCount, char** typeAttributes, BAT *ontmetaBat, OntClass 
*ontclassSet) {
+void createTypeAttributesHistogram(BAT *sbat, BATiter si, BATiter pi, BATiter 
oi, oid *subjCSMap, CSset *freqCSset, int *csIdFreqIdxMap, int 
typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** 
typeAttributesHistogramCount, char** typeAttributes, BAT *ontmetaBat) {
        // looping, extracting
        BUN             p, q;
        oid             *sbt, *obt, *pbt;
@@ -967,9 +935,7 @@ void createTypeAttributesHistogram(BAT *
                // check if property (*pbt) is a type
                for (i = 0; i < typeAttributesCount; ++i) {
                        if (*pbt == typeAttributesOids[i]) {
-
                                // prop is a type!
-                               csFreqIdx = csIdFreqIdxMap[subjCSMap[*sbt]];
 
                                // get object
                                obt = (oid *) BUNtloc(oi, p);
@@ -988,7 +954,8 @@ void createTypeAttributesHistogram(BAT *
                                                // nothing to add to histogram
                                        } else {
                                                // analyze values and add to 
histogram
-                                               
insertLeafsIntoTypeAttributesHistogram(typeValues, typeValuesSize, 
typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT, 
ontmetaBat, ontclassSet);
+                                               csFreqIdx = 
csIdFreqIdxMap[subjCSMap[curS]]; // get csFreqIdx of last subject
+                                               
insertValuesIntoTypeAttributesHistogram(typeValues, typeValuesSize, 
typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT, 
ontmetaBat);
                                                typeValuesSize = 0; // reset
                                        }
                                        curS = *sbt;
@@ -1008,7 +975,10 @@ void createTypeAttributesHistogram(BAT *
        }
 
        // analyze and add last set of typeValues
-       if (curS != BUN_NONE && typeValuesSize != 0) 
insertLeafsIntoTypeAttributesHistogram(typeValues, typeValuesSize, 
typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT, 
ontmetaBat, ontclassSet);
+       if (curS != BUN_NONE && typeValuesSize != 0) {
+               csFreqIdx = csIdFreqIdxMap[subjCSMap[curS]]; // get csFreqIdx 
of last subject
+               insertValuesIntoTypeAttributesHistogram(typeValues, 
typeValuesSize, typeAttributesHistogram, typeAttributesHistogramCount, 
csFreqIdx, curT, ontmetaBat);
+       }
 
        GDKfree(typeValues);
 
@@ -1022,14 +992,10 @@ void createTypeAttributesHistogram(BAT *
        // assign percentage
        for (i = 0; i < freqCSset->numCSadded; ++i) {
                for (j = 0; j < typeAttributesCount; ++j) {
-                       int sum = 0;
-                       // get total count of values
-                       for (k = 0; k < typeAttributesHistogramCount[i][j]; 
++k) {
-                               sum += typeAttributesHistogram[i][j][k].freq;
-                       }
                        // assign percentage values for every value
                        for (k = 0; k < typeAttributesHistogramCount[i][j]; 
++k) {
-                               typeAttributesHistogram[i][j][k].percent = 
(int) (100.0 * typeAttributesHistogram[i][j][k].freq / sum + 0.5);
+                               typeAttributesHistogram[i][j][k].percent = 
(int) (100.0 * typeAttributesHistogram[i][j][k].freq / 
freqCSset->items[i].support + 0.5);
+
                        }
                }
        }
@@ -2109,10 +2075,11 @@ void getTableName(CSlabel* label, int cs
        oid             *tmpList;
        int             tmpListCount;
        char            nameFound = 0;
+       oid             maxDepthOid;
+       int             maxFreq;
 
 
        (void) ontmetaBat;
-       (void) ontclassSet;
 
 
        // --- ONTOLOGY ---
@@ -2228,7 +2195,28 @@ void getTableName(CSlabel* label, int cs
                if (typeAttributesHistogram[csIdx][i][0].percent < 
TYPE_FREQ_THRESHOLD) continue; // sorted
                tmpList = (oid *) realloc(tmpList, sizeof(oid) * (tmpListCount 
+ 1));
                if (!tmpList) fprintf(stderr, "ERROR: Couldn't realloc 
memory!\n");
-               tmpList[tmpListCount] = 
typeAttributesHistogram[csIdx][i][0].value;
+
+               // of all values that are >= TYPE_FREQ_THRESHOLD, choose the 
value with the highest hierarchy level ("deepest" value)
+               maxDepthOid = typeAttributesHistogram[csIdx][i][0].value;
+               maxFreq = typeAttributesHistogram[csIdx][i][0].freq;
+               for (j = 1; j < typeAttributesHistogramCount[csIdx][i]; ++j) {
+                       int depth, maxDepth;
+                       int freq;
+                       if (typeAttributesHistogram[csIdx][i][j].percent < 
TYPE_FREQ_THRESHOLD) break;
+                       depth = ontclassSet[BUNfnd(BATmirror(ontmetaBat), 
&typeAttributesHistogram[csIdx][i][j].value)].hierDepth;
+                       maxDepth = ontclassSet[BUNfnd(BATmirror(ontmetaBat), 
&maxDepthOid)].hierDepth;;
+                       freq = typeAttributesHistogram[csIdx][i][j].freq;
+                       if (depth > maxDepth) {
+                               // choose value with higher hierarchy level
+                               maxDepthOid = 
typeAttributesHistogram[csIdx][i][j].value;
+                               maxFreq = freq;
+                       } else if (depth == maxDepth && freq > maxFreq) {
+                               // if both values are on the same level, choose 
the value with higher frequency
+                               maxDepthOid = 
typeAttributesHistogram[csIdx][i][j].value;
+                               maxFreq = freq;
+                       }
+               }
+               tmpList[tmpListCount] = maxDepthOid;
                tmpListCount += 1;
        }
 
@@ -2736,7 +2724,7 @@ CSlabel* createLabels(CSset* freqCSset, 
        typeAttributesHistogramCount = 
initTypeAttributesHistogramCount(typeAttributesCount, freqCSset->numCSadded);
        typeAttributesHistogram = 
initTypeAttributesHistogram(typeAttributesCount, freqCSset->numCSadded);
 #if USE_TYPE_NAMES
-       createTypeAttributesHistogram(sbat, si, pi, oi, subjCSMap, freqCSset, 
csIdFreqIdxMap, typeAttributesCount, typeAttributesHistogram, 
typeAttributesHistogramCount, typeAttributes, ontmetaBat, ontclassSet);
+       createTypeAttributesHistogram(sbat, si, pi, oi, subjCSMap, freqCSset, 
csIdFreqIdxMap, typeAttributesCount, typeAttributesHistogram, 
typeAttributesHistogramCount, typeAttributes, ontmetaBat);
        typeStat = getTypeStats(&typeStatCount, freqCSset->numCSadded, 
typeAttributesCount, typeAttributesHistogram, typeAttributesHistogramCount);
 #else
        (void) sbat;
diff --git a/monetdb5/extras/rdf/rdflabels.h b/monetdb5/extras/rdf/rdflabels.h
--- a/monetdb5/extras/rdf/rdflabels.h
+++ b/monetdb5/extras/rdf/rdflabels.h
@@ -91,8 +91,7 @@ enum {
 } RULE; 
 
 #define FK_FREQ_THRESHOLD 25           // X % of the targeted subjects have to 
be in this table
-//#define TYPE_FREQ_THRESHOLD 30               // X % of the type values have 
to be this value
-#define TYPE_FREQ_THRESHOLD 0          // X % of the type values have to be 
this value
+#define TYPE_FREQ_THRESHOLD 80         // X % of the type values have to be 
this value
 #define ONTOLOGY_FREQ_THRESHOLD 0.4    // similarity threshold for tfidf 
similarity for ontology classes
 
 #define USE_SHORT_NAMES 1              // use getPropNameShort()
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to