Changeset: da0c1bd43bd3 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=da0c1bd43bd3 Modified Files: monetdb5/extras/rdf/rdflabels.c monetdb5/extras/rdf/rdflabels.h Branch: rdf Log Message:
Store type hierarchy for type values. Suggested by Peter. Instead of storing the leaf value per subject, store the whole hierarchy. By doing so, the frequencies are summed up on the more general levels of the hierarchy. For example, 40% Politicians and 50% Athletes in a CS will be represented as (90% Thing, 90% Agent, 90% Person, 50% Athlete, 40% Politician), resulting in label candidate "Person" when threshold is set to 80%. diffs (173 lines): diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -851,47 +851,17 @@ int compareTypeAttributesFreqs (const vo #endif #if USE_TYPE_NAMES -/* Analyze hierarchy in a list of type values, add all leaf values to the histogram. Values that are not present in the hierarchy tree built from the ontologies are NOT added to the histogram. */ +/* Add type values to the histogram. Values that are not present in the hierarchy tree built from the ontologies are NOT added to the histogram. 
*/ static -void insertLeafsIntoTypeAttributesHistogram(oid* typeList, int typeListLength, TypeAttributesFreq*** typeAttributesHistogram, int** typeAttributesHistogramCount, int csFreqIdx, int type, BAT *ontmetaBat, OntClass *ontclassSet) { - int i, j, k; +void insertValuesIntoTypeAttributesHistogram(oid* typeList, int typeListLength, TypeAttributesFreq*** typeAttributesHistogram, int** typeAttributesHistogramCount, int csFreqIdx, int type, BAT *ontmetaBat) { + int i, j; int fit; - char *leaf; // flag whether a type value in 'typeList' is a leaf (1) or not (0) - BUN pos; - OntClass hierarchy; - - // start with: every type value is a leaf - leaf = GDKmalloc(sizeof(char) * typeListLength); - for (i = 0; i < typeListLength; ++i) leaf[i] = 1; - - // analyze hierarchy + for (i = 0; i < typeListLength; ++i) { - if (!leaf[i]) continue; - pos = BUNfnd(BATmirror(ontmetaBat), &typeList[i]); - if (pos == BUN_NONE) { - // no ontology information for this type value, therefore it is not added to the hierarchy - leaf[i] = 0; - continue; - } - - // get hierarchy of this type value - hierarchy = ontclassSet[pos]; - - // loop over superclasses, set leaf=0 - for (j = 0; j < hierarchy.numsc; ++j) { - for (k = 0; k < typeListLength; ++k) { - if (i == k) continue; - if (ontclassSet[hierarchy.scIdxes[j]].cOid == typeList[k]) { - // found superclass at position 'k' - leaf[k] = 0; - } - } - } - } - - // add all leafs to the histogram - for (i = 0; i < typeListLength; ++i) { - if (!leaf[i]) continue; + BUN pos = BUNfnd(BATmirror(ontmetaBat), &typeList[i]); + if (pos == BUN_NONE) continue; // no ontology information, ignore + + // add to histogram fit = 0; for (j = 0; j < typeAttributesHistogramCount[csFreqIdx][type]; ++j) { if (typeAttributesHistogram[csFreqIdx][type][j].value == typeList[i]) { @@ -913,13 +883,11 @@ void insertLeafsIntoTypeAttributesHistog typeAttributesHistogram[csFreqIdx][type][typeAttributesHistogramCount[csFreqIdx][type] - 1].freq = 1; } } - - GDKfree(leaf); } /* Loop 
through all subjects to collect frequency statistics for type attribute values. */ static -void createTypeAttributesHistogram(BAT *sbat, BATiter si, BATiter pi, BATiter oi, oid *subjCSMap, CSset *freqCSset, int *csIdFreqIdxMap, int typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** typeAttributesHistogramCount, char** typeAttributes, BAT *ontmetaBat, OntClass *ontclassSet) { +void createTypeAttributesHistogram(BAT *sbat, BATiter si, BATiter pi, BATiter oi, oid *subjCSMap, CSset *freqCSset, int *csIdFreqIdxMap, int typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** typeAttributesHistogramCount, char** typeAttributes, BAT *ontmetaBat) { // looping, extracting BUN p, q; oid *sbt, *obt, *pbt; @@ -987,7 +955,7 @@ void createTypeAttributesHistogram(BAT * } else { // analyze values and add to histogram csFreqIdx = csIdFreqIdxMap[subjCSMap[curS]]; // get csFreqIdx of last subject - insertLeafsIntoTypeAttributesHistogram(typeValues, typeValuesSize, typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT, ontmetaBat, ontclassSet); + insertValuesIntoTypeAttributesHistogram(typeValues, typeValuesSize, typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT, ontmetaBat); typeValuesSize = 0; // reset } curS = *sbt; @@ -1009,7 +977,7 @@ void createTypeAttributesHistogram(BAT * // analyze and add last set of typeValues if (curS != BUN_NONE && typeValuesSize != 0) { csFreqIdx = csIdFreqIdxMap[subjCSMap[curS]]; // get csFreqIdx of last subject - insertLeafsIntoTypeAttributesHistogram(typeValues, typeValuesSize, typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT, ontmetaBat, ontclassSet); + insertValuesIntoTypeAttributesHistogram(typeValues, typeValuesSize, typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT, ontmetaBat); } GDKfree(typeValues); @@ -1024,14 +992,10 @@ void createTypeAttributesHistogram(BAT * // assign percentage for (i = 0; i < freqCSset->numCSadded; ++i) 
{ for (j = 0; j < typeAttributesCount; ++j) { - int sum = 0; - // get total count of values - for (k = 0; k < typeAttributesHistogramCount[i][j]; ++k) { - sum += typeAttributesHistogram[i][j][k].freq; - } // assign percentage values for every value for (k = 0; k < typeAttributesHistogramCount[i][j]; ++k) { - typeAttributesHistogram[i][j][k].percent = (int) (100.0 * typeAttributesHistogram[i][j][k].freq / sum + 0.5); + typeAttributesHistogram[i][j][k].percent = (int) (100.0 * typeAttributesHistogram[i][j][k].freq / freqCSset->items[i].support + 0.5); + } } } @@ -2111,10 +2075,11 @@ void getTableName(CSlabel* label, int cs oid *tmpList; int tmpListCount; char nameFound = 0; + oid maxDepthOid; + int maxFreq; (void) ontmetaBat; - (void) ontclassSet; // --- ONTOLOGY --- @@ -2230,7 +2195,28 @@ void getTableName(CSlabel* label, int cs if (typeAttributesHistogram[csIdx][i][0].percent < TYPE_FREQ_THRESHOLD) continue; // sorted tmpList = (oid *) realloc(tmpList, sizeof(oid) * (tmpListCount + 1)); if (!tmpList) fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); - tmpList[tmpListCount] = typeAttributesHistogram[csIdx][i][0].value; + + // of all values that are >= TYPE_FREQ_THRESHOLD, choose the value with the highest hierarchy level ("deepest" value) + maxDepthOid = typeAttributesHistogram[csIdx][i][0].value; + maxFreq = typeAttributesHistogram[csIdx][i][0].freq; + for (j = 1; j < typeAttributesHistogramCount[csIdx][i]; ++j) { + int depth, maxDepth; + int freq; + if (typeAttributesHistogram[csIdx][i][j].percent < TYPE_FREQ_THRESHOLD) break; + depth = ontclassSet[BUNfnd(BATmirror(ontmetaBat), &typeAttributesHistogram[csIdx][i][j].value)].hierDepth; + maxDepth = ontclassSet[BUNfnd(BATmirror(ontmetaBat), &maxDepthOid)].hierDepth;; + freq = typeAttributesHistogram[csIdx][i][j].freq; + if (depth > maxDepth) { + // choose value with higher hierarchy level + maxDepthOid = typeAttributesHistogram[csIdx][i][j].value; + maxFreq = freq; + } else if (depth == maxDepth && freq > 
maxFreq) { + // if both values are on the same level, choose the value with higher frequency + maxDepthOid = typeAttributesHistogram[csIdx][i][j].value; + maxFreq = freq; + } + } + tmpList[tmpListCount] = maxDepthOid; tmpListCount += 1; } @@ -2735,7 +2721,7 @@ CSlabel* createLabels(CSset* freqCSset, typeAttributesHistogramCount = initTypeAttributesHistogramCount(typeAttributesCount, freqCSset->numCSadded); typeAttributesHistogram = initTypeAttributesHistogram(typeAttributesCount, freqCSset->numCSadded); #if USE_TYPE_NAMES - createTypeAttributesHistogram(sbat, si, pi, oi, subjCSMap, freqCSset, csIdFreqIdxMap, typeAttributesCount, typeAttributesHistogram, typeAttributesHistogramCount, typeAttributes, ontmetaBat, ontclassSet); + createTypeAttributesHistogram(sbat, si, pi, oi, subjCSMap, freqCSset, csIdFreqIdxMap, typeAttributesCount, typeAttributesHistogram, typeAttributesHistogramCount, typeAttributes, ontmetaBat); typeStat = getTypeStats(&typeStatCount, freqCSset->numCSadded, typeAttributesCount, typeAttributesHistogram, typeAttributesHistogramCount); #else (void) sbat; diff --git a/monetdb5/extras/rdf/rdflabels.h b/monetdb5/extras/rdf/rdflabels.h --- a/monetdb5/extras/rdf/rdflabels.h +++ b/monetdb5/extras/rdf/rdflabels.h @@ -91,7 +91,7 @@ enum { } RULE; #define FK_FREQ_THRESHOLD 25 // X % of the targeted subjects have to be in this table -#define TYPE_FREQ_THRESHOLD 30 // X % of the type values have to be this value +#define TYPE_FREQ_THRESHOLD 80 // X % of the type values have to be this value #define ONTOLOGY_FREQ_THRESHOLD 0.4 // similarity threshold for tfidf simularity for ontology classes #define USE_SHORT_NAMES 1 // use getPropNameShort() _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list