Changeset: 944815cdd7d6 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=944815cdd7d6
Modified Files:
	monetdb5/extras/rdf/rdflabels.c
	monetdb5/extras/rdf/rdflabels.h
Branch: rdf
Log Message:
Merge with Linnea changes in rdflabel diffs (188 lines): diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -851,47 +851,17 @@ int compareTypeAttributesFreqs (const vo #endif #if USE_TYPE_NAMES -/* Analyze hierarchy in a list of type values, add all leaf values to the histogram. Values that are not present in the hierarchy tree built from the ontologies are NOT added to the histogram. */ +/* Add type values to the histogram. Values that are not present in the hierarchy tree built from the ontologies are NOT added to the histogram. */ static -void insertLeafsIntoTypeAttributesHistogram(oid* typeList, int typeListLength, TypeAttributesFreq*** typeAttributesHistogram, int** typeAttributesHistogramCount, int csFreqIdx, int type, BAT *ontmetaBat, OntClass *ontclassSet) { - int i, j, k; +void insertValuesIntoTypeAttributesHistogram(oid* typeList, int typeListLength, TypeAttributesFreq*** typeAttributesHistogram, int** typeAttributesHistogramCount, int csFreqIdx, int type, BAT *ontmetaBat) { + int i, j; int fit; - char *leaf; // flag whether a type value in 'typeList' is a leaf (1) or not (0) - BUN pos; - OntClass hierarchy; - - // start with: every type value is a leaf - leaf = GDKmalloc(sizeof(char) * typeListLength); - for (i = 0; i < typeListLength; ++i) leaf[i] = 1; - - // analyze hierarchy + for (i = 0; i < typeListLength; ++i) { - if (!leaf[i]) continue; - pos = BUNfnd(BATmirror(ontmetaBat), &typeList[i]); - if (pos == BUN_NONE) { - // no ontology information for this type value, therefore it is not added to the hierarchy - leaf[i] = 0; - continue; - } - - // get hierarchy of this type value - hierarchy = ontclassSet[pos]; - - // loop over superclasses, set leaf=0 - for (j = 0; j < hierarchy.numsc; ++j) { - for (k = 0; k < typeListLength; ++k) { - if (i == k) continue; - if (ontclassSet[hierarchy.scIdxes[j]].cOid == typeList[k]) { - // found superclass at 
position 'k' - leaf[k] = 0; - } - } - } - } - - // add all leafs to the histogram - for (i = 0; i < typeListLength; ++i) { - if (!leaf[i]) continue; + BUN pos = BUNfnd(BATmirror(ontmetaBat), &typeList[i]); + if (pos == BUN_NONE) continue; // no ontology information, ignore + + // add to histogram fit = 0; for (j = 0; j < typeAttributesHistogramCount[csFreqIdx][type]; ++j) { if (typeAttributesHistogram[csFreqIdx][type][j].value == typeList[i]) { @@ -913,13 +883,11 @@ void insertLeafsIntoTypeAttributesHistog typeAttributesHistogram[csFreqIdx][type][typeAttributesHistogramCount[csFreqIdx][type] - 1].freq = 1; } } - - GDKfree(leaf); } /* Loop through all subjects to collect frequency statistics for type attribute values. */ static -void createTypeAttributesHistogram(BAT *sbat, BATiter si, BATiter pi, BATiter oi, oid *subjCSMap, CSset *freqCSset, int *csIdFreqIdxMap, int typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** typeAttributesHistogramCount, char** typeAttributes, BAT *ontmetaBat, OntClass *ontclassSet) { +void createTypeAttributesHistogram(BAT *sbat, BATiter si, BATiter pi, BATiter oi, oid *subjCSMap, CSset *freqCSset, int *csIdFreqIdxMap, int typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** typeAttributesHistogramCount, char** typeAttributes, BAT *ontmetaBat) { // looping, extracting BUN p, q; oid *sbt, *obt, *pbt; @@ -967,9 +935,7 @@ void createTypeAttributesHistogram(BAT * // check if property (*pbt) is a type for (i = 0; i < typeAttributesCount; ++i) { if (*pbt == typeAttributesOids[i]) { - // prop is a type! 
- csFreqIdx = csIdFreqIdxMap[subjCSMap[*sbt]]; // get object obt = (oid *) BUNtloc(oi, p); @@ -988,7 +954,8 @@ void createTypeAttributesHistogram(BAT * // nothing to add to histogram } else { // analyze values and add to histogram - insertLeafsIntoTypeAttributesHistogram(typeValues, typeValuesSize, typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT, ontmetaBat, ontclassSet); + csFreqIdx = csIdFreqIdxMap[subjCSMap[curS]]; // get csFreqIdx of last subject + insertValuesIntoTypeAttributesHistogram(typeValues, typeValuesSize, typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT, ontmetaBat); typeValuesSize = 0; // reset } curS = *sbt; @@ -1008,7 +975,10 @@ void createTypeAttributesHistogram(BAT * } // analyze and add last set of typeValues - if (curS != BUN_NONE && typeValuesSize != 0) insertLeafsIntoTypeAttributesHistogram(typeValues, typeValuesSize, typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT, ontmetaBat, ontclassSet); + if (curS != BUN_NONE && typeValuesSize != 0) { + csFreqIdx = csIdFreqIdxMap[subjCSMap[curS]]; // get csFreqIdx of last subject + insertValuesIntoTypeAttributesHistogram(typeValues, typeValuesSize, typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT, ontmetaBat); + } GDKfree(typeValues); @@ -1022,14 +992,10 @@ void createTypeAttributesHistogram(BAT * // assign percentage for (i = 0; i < freqCSset->numCSadded; ++i) { for (j = 0; j < typeAttributesCount; ++j) { - int sum = 0; - // get total count of values - for (k = 0; k < typeAttributesHistogramCount[i][j]; ++k) { - sum += typeAttributesHistogram[i][j][k].freq; - } // assign percentage values for every value for (k = 0; k < typeAttributesHistogramCount[i][j]; ++k) { - typeAttributesHistogram[i][j][k].percent = (int) (100.0 * typeAttributesHistogram[i][j][k].freq / sum + 0.5); + typeAttributesHistogram[i][j][k].percent = (int) (100.0 * typeAttributesHistogram[i][j][k].freq / freqCSset->items[i].support + 0.5); + } } } 
@@ -2109,10 +2075,11 @@ void getTableName(CSlabel* label, int cs oid *tmpList; int tmpListCount; char nameFound = 0; + oid maxDepthOid; + int maxFreq; (void) ontmetaBat; - (void) ontclassSet; // --- ONTOLOGY --- @@ -2228,7 +2195,28 @@ void getTableName(CSlabel* label, int cs if (typeAttributesHistogram[csIdx][i][0].percent < TYPE_FREQ_THRESHOLD) continue; // sorted tmpList = (oid *) realloc(tmpList, sizeof(oid) * (tmpListCount + 1)); if (!tmpList) fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); - tmpList[tmpListCount] = typeAttributesHistogram[csIdx][i][0].value; + + // of all values that are >= TYPE_FREQ_THRESHOLD, choose the value with the highest hierarchy level ("deepest" value) + maxDepthOid = typeAttributesHistogram[csIdx][i][0].value; + maxFreq = typeAttributesHistogram[csIdx][i][0].freq; + for (j = 1; j < typeAttributesHistogramCount[csIdx][i]; ++j) { + int depth, maxDepth; + int freq; + if (typeAttributesHistogram[csIdx][i][j].percent < TYPE_FREQ_THRESHOLD) break; + depth = ontclassSet[BUNfnd(BATmirror(ontmetaBat), &typeAttributesHistogram[csIdx][i][j].value)].hierDepth; + maxDepth = ontclassSet[BUNfnd(BATmirror(ontmetaBat), &maxDepthOid)].hierDepth;; + freq = typeAttributesHistogram[csIdx][i][j].freq; + if (depth > maxDepth) { + // choose value with higher hierarchy level + maxDepthOid = typeAttributesHistogram[csIdx][i][j].value; + maxFreq = freq; + } else if (depth == maxDepth && freq > maxFreq) { + // if both values are on the same level, choose the value with higher frequency + maxDepthOid = typeAttributesHistogram[csIdx][i][j].value; + maxFreq = freq; + } + } + tmpList[tmpListCount] = maxDepthOid; tmpListCount += 1; } @@ -2736,7 +2724,7 @@ CSlabel* createLabels(CSset* freqCSset, typeAttributesHistogramCount = initTypeAttributesHistogramCount(typeAttributesCount, freqCSset->numCSadded); typeAttributesHistogram = initTypeAttributesHistogram(typeAttributesCount, freqCSset->numCSadded); #if USE_TYPE_NAMES - createTypeAttributesHistogram(sbat, si, 
pi, oi, subjCSMap, freqCSset, csIdFreqIdxMap, typeAttributesCount, typeAttributesHistogram, typeAttributesHistogramCount, typeAttributes, ontmetaBat, ontclassSet); + createTypeAttributesHistogram(sbat, si, pi, oi, subjCSMap, freqCSset, csIdFreqIdxMap, typeAttributesCount, typeAttributesHistogram, typeAttributesHistogramCount, typeAttributes, ontmetaBat); typeStat = getTypeStats(&typeStatCount, freqCSset->numCSadded, typeAttributesCount, typeAttributesHistogram, typeAttributesHistogramCount); #else (void) sbat; diff --git a/monetdb5/extras/rdf/rdflabels.h b/monetdb5/extras/rdf/rdflabels.h --- a/monetdb5/extras/rdf/rdflabels.h +++ b/monetdb5/extras/rdf/rdflabels.h @@ -91,8 +91,7 @@ enum { } RULE; #define FK_FREQ_THRESHOLD 25 // X % of the targeted subjects have to be in this table -//#define TYPE_FREQ_THRESHOLD 30 // X % of the type values have to be this value -#define TYPE_FREQ_THRESHOLD 0 // X % of the type values have to be this value +#define TYPE_FREQ_THRESHOLD 80 // X % of the type values have to be this value #define ONTOLOGY_FREQ_THRESHOLD 0.4 // similarity threshold for tfidf simularity for ontology classes #define USE_SHORT_NAMES 1 // use getPropNameShort() _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list