Changeset: 3e4ece2b7085 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=3e4ece2b7085
Modified Files:
	monetdb5/extras/rdf/rdflabels.c
Branch: rdf
Log Message:
Improve label quality

- Computation of similarity between CSs and classes is now based on the assumption that all properties of a CS should belong to one ontology class, rather than on the assumption that the CS has to consist of ALL properties of the corresponding ontology class.
- Type values are usually multi-valued properties whose values represent the hierarchy the subject belongs to (e.g., if a subject in the DBpedia dataset has type 'Athlete', it also has the types 'Person', 'Agent', and 'Thing'). This hierarchy is analyzed, and only the most specific type value (the "leaf") is added to the data structures. This improves the label candidates that are computed using type values.

diffs (284 lines):

diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -851,19 +851,89 @@ int compareTypeAttributesFreqs (const vo #endif #if USE_TYPE_NAMES +/* Analyze hierarchy in a list of type values, add all leaf values to the histogram. Values that are not present in the hierarchy tree built from the ontologies are NOT added to the histogram. 
*/ +static +void insertLeafsIntoTypeAttributesHistogram(oid* typeList, int typeListLength, TypeAttributesFreq*** typeAttributesHistogram, int** typeAttributesHistogramCount, int csFreqIdx, int type, BAT *ontmetaBat, OntClass *ontclassSet) { + int i, j, k; + int fit; + char *leaf; // flag whether a type value in 'typeList' is a leaf (1) or not (0) + BUN pos; + OntClass hierarchy; + + // start with: every type value is a leaf + leaf = GDKmalloc(sizeof(char) * typeListLength); + for (i = 0; i < typeListLength; ++i) leaf[i] = 1; + + // analyze hierarchy + for (i = 0; i < typeListLength; ++i) { + if (!leaf[i]) continue; + pos = BUNfnd(BATmirror(ontmetaBat), &typeList[i]); + if (pos == BUN_NONE) { + // no ontology information for this type value, therefore it is not added to the hierarchy + leaf[i] = 0; + continue; + } + + // get hierarchy of this type value + hierarchy = ontclassSet[pos]; + + // loop over superclasses, set leaf=0 + for (j = 0; j < hierarchy.numsc; ++j) { + for (k = 0; k < typeListLength; ++k) { + if (i == k) continue; + if (ontclassSet[hierarchy.scIdxes[j]].cOid == typeList[k]) { + // found superclass at position 'k' + leaf[k] = 0; + } + } + } + } + + // add all leafs to the histogram + for (i = 0; i < typeListLength; ++i) { + if (!leaf[i]) continue; + fit = 0; + for (j = 0; j < typeAttributesHistogramCount[csFreqIdx][type]; ++j) { + if (typeAttributesHistogram[csFreqIdx][type][j].value == typeList[i]) { + // bucket exists + typeAttributesHistogram[csFreqIdx][type][j].freq += 1; + fit = 1; + break; + } + } + if (!fit) { + // bucket does not exist + // realloc + typeAttributesHistogramCount[csFreqIdx][type] += 1; + typeAttributesHistogram[csFreqIdx][type] = (TypeAttributesFreq *) realloc(typeAttributesHistogram[csFreqIdx][type], sizeof(TypeAttributesFreq) * typeAttributesHistogramCount[csFreqIdx][type]); + if (!typeAttributesHistogram[csFreqIdx][type]) fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); + + // insert value + 
typeAttributesHistogram[csFreqIdx][type][typeAttributesHistogramCount[csFreqIdx][type] - 1].value = typeList[i]; + typeAttributesHistogram[csFreqIdx][type][typeAttributesHistogramCount[csFreqIdx][type] - 1].freq = 1; + } + } + + GDKfree(leaf); +} + /* Loop through all subjects to collect frequency statistics for type attribute values. */ static -void createTypeAttributesHistogram(BAT *sbat, BATiter si, BATiter pi, BATiter oi, oid *subjCSMap, CSset *freqCSset, int *csIdFreqIdxMap, int typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** typeAttributesHistogramCount, char** typeAttributes) { +void createTypeAttributesHistogram(BAT *sbat, BATiter si, BATiter pi, BATiter oi, oid *subjCSMap, CSset *freqCSset, int *csIdFreqIdxMap, int typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** typeAttributesHistogramCount, char** typeAttributes, BAT *ontmetaBat, OntClass *ontclassSet) { // looping, extracting BUN p, q; oid *sbt, *obt, *pbt; char objType; oid objOid; int csFreqIdx; + oid curS; // last subject + int curT; // last type (index in 'typeAttributes' array) + oid *typeValues; // list of type values per subject and type + int typeValuesSize; + int typeValuesMaxSize = 10; // histogram int i, j, k; - int fit; oid *typeAttributesOids = malloc(sizeof(oid) * typeAttributesCount); @@ -878,6 +948,11 @@ void createTypeAttributesHistogram(BAT * TKNZRappend(&typeAttributesOids[i], &typeAttributes[i]); } + curS = BUN_NONE; + curT = -1; + typeValues = GDKmalloc(sizeof(oid) * typeValuesMaxSize); + if (!typeValues) fprintf(stderr, "ERROR: Couldn't malloc memory!\n"); + typeValuesSize = 0; BATloop(sbat, p, q) { // Get data sbt = (oid *) BUNtloc(si, p); @@ -907,32 +982,36 @@ void createTypeAttributesHistogram(BAT * objOid = objOid - (objType*2 + 1) * RDF_MIN_LITERAL; /* Get the real objOid from Map or Tokenizer */ } - // add object to histogram - fit = 0; - for (j = 0; j < typeAttributesHistogramCount[csFreqIdx][i]; ++j) { - if 
(typeAttributesHistogram[csFreqIdx][i][j].value == objOid) { - // bucket exists - typeAttributesHistogram[csFreqIdx][i][j].freq += 1; - fit = 1; - break; + // if finished looping over one subject or type, the list of type values is analyzed and added to the histogram + if (curS != *sbt || curT != i) { + if (curS == BUN_NONE || typeValuesSize == 0) { + // nothing to add to histogram + } else { + // analyze values and add to histogram + insertLeafsIntoTypeAttributesHistogram(typeValues, typeValuesSize, typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT, ontmetaBat, ontclassSet); + typeValuesSize = 0; // reset } + curS = *sbt; + curT = i; } - if (!fit) { - // bucket does not exist - // realloc - typeAttributesHistogramCount[csFreqIdx][i] += 1; - typeAttributesHistogram[csFreqIdx][i] = (TypeAttributesFreq *) realloc(typeAttributesHistogram[csFreqIdx][i], sizeof(TypeAttributesFreq) * typeAttributesHistogramCount[csFreqIdx][i]); - if (!typeAttributesHistogram[csFreqIdx][i]) fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); - - // insert value - typeAttributesHistogram[csFreqIdx][i][typeAttributesHistogramCount[csFreqIdx][i] - 1].value = objOid; - typeAttributesHistogram[csFreqIdx][i][typeAttributesHistogramCount[csFreqIdx][i] - 1].freq = 1; + // add value to list of type values + if (typeValuesSize == typeValuesMaxSize) { + // resize + typeValuesMaxSize *= 2; + typeValues = GDKrealloc(typeValues, sizeof(oid) * typeValuesMaxSize); + if (!typeValues) fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); } + typeValues[typeValuesSize++] = *obt; break; } } } + // analyze and add last set of typeValues + if (curS != BUN_NONE && typeValuesSize != 0) insertLeafsIntoTypeAttributesHistogram(typeValues, typeValuesSize, typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT, ontmetaBat, ontclassSet); + + GDKfree(typeValues); + // sort descending by frequency for (i = 0; i < freqCSset->numCSadded; ++i) { for (j = 0; j < typeAttributesCount; 
++j) { @@ -1094,7 +1173,7 @@ int compareOntologyCandidates (const voi #if USE_ONTOLOGY_NAMES /* For one CS: Calculate the ontology classes that are similar (tfidf) to the list of attributes. */ static -oid* getOntologyCandidates(oid** ontattributes, int ontattributesCount, oid** ontmetadata, int ontmetadataCount, int *resultCount, oid **listOids, int *listCount, int listNum, PropStat *propStat, float *totaltfidfsPerOntology) { +oid* getOntologyCandidates(oid** ontattributes, int ontattributesCount, oid** ontmetadata, int ontmetadataCount, int *resultCount, oid **listOids, int *listCount, int listNum, PropStat *propStat) { int i, j, k, l; oid *result = NULL; @@ -1147,6 +1226,7 @@ oid* getOntologyCandidates(oid** ontattr BUN p, bun; p = listOids[i][j]; bun = BUNfnd(BATmirror(propStat->pBat), (ptr) &p); + if (bun == BUN_NONE) continue; // property does not belong to an ontology class and therefore has no tfidfs score for (k = 0; k < candidatesCount[j]; ++k) { // for each candidate // search for this class int found = 0; @@ -1169,21 +1249,6 @@ oid* getOntologyCandidates(oid** ontattr } } } - - //[DUC --- add the total tfidf score for a ontology class] //TODO: Compute before, not here - for (l = 0; l < num; ++l){ - for (j = 0; j < ontmetadataCount; ++j) { - oid auri = ontmetadata[0][j]; - //printf("auri = " BUNFMT "\n", auri); - if (auri == classStat[l].ontoClass){ - //printf("Classstat %d (uri: "BUNFMT ") - Set totaltfidf with ontology %dth: %f \n", l, auri, j, totaltfidfsPerOntology[j]); - classStat[l].totaltfidfs = totaltfidfsPerOntology[j]; - break; - } - } - } - //[ ... 
DUC] - // calculate optimal tfidf score (all properties) & normalize tfidf sums totalTfidfs = 0.0; @@ -1194,11 +1259,7 @@ oid* getOntologyCandidates(oid** ontattr totalTfidfs += (propStat->tfidfs[bun] * propStat->tfidfs[bun]); } for (j = 0; j < num; ++j) { - //classStat[j].tfidfs /= totalTfidfs; //[DUC--modify] - //printf("original classStat[j].tfidfs = %f \n", classStat[j].tfidfs); - classStat[j].tfidfs = classStat[j].tfidfs / (sqrt(totalTfidfs)*sqrt(classStat[j].totaltfidfs)); - //printf("totalTfidfs = %f || classStat[j].totaltfidfs = %f || classStat[j].tfidfs = %f \n",totalTfidfs,classStat[j].totaltfidfs,classStat[j].tfidfs); - + classStat[j].tfidfs /= totalTfidfs; } // sort by tfidf desc @@ -1408,8 +1469,6 @@ static void createOntologyLookupResult(oid** result, CSset* freqCSset, int* resultCount, oid** ontattributes, int ontattributesCount, oid** ontmetadata, int ontmetadataCount) { int i, j; PropStat *propStat; - float* totaltfidfsPerOntology; //[DUC] - oid lastUri; propStat = initPropStat(); @@ -1417,34 +1476,6 @@ void createOntologyLookupResult(oid** re // Not the properties from freqCS //createPropStatistics(propStat, freqCSset->numCSadded, freqCSset); createPropStatistics(propStat, ontattributes, ontattributesCount); - - - lastUri = BUN_NONE; - totaltfidfsPerOntology = (float*) malloc(sizeof(float) * ontmetadataCount); - //printf("Init tfidf for all %d ontologies \n",ontmetadataCount ); - for (i = 0; i < ontmetadataCount; ++i) { - oid auri = ontmetadata[0][i]; - - if (auri == lastUri){ - //printf("Duplication at %d value " BUNFMT "\n", i, auri); - continue; - } - else lastUri = auri; - totaltfidfsPerOntology[i] = 0; - - for (j = 0; j < ontattributesCount; j++){ - oid tmpuri = ontattributes[0][j]; - oid aattr = ontattributes[1][j]; - if (auri == tmpuri){ - BUN bun = BUNfnd(BATmirror(propStat->pBat), (ptr) &aattr); - if (bun == BUN_NONE) printf("[Debug] This cannot happen \n"); - else - totaltfidfsPerOntology[i] += (propStat->tfidfs[bun] * 
propStat->tfidfs[bun]); - } - } - //printf("Computed totaltfidfsPerOntology of ontology %d: %f (uri = "BUNFMT")\n",i, totaltfidfsPerOntology[i],auri); - } - //... [DUC] for (i = 0; i < freqCSset->numCSadded; ++i) { CS cs; @@ -1475,7 +1506,7 @@ void createOntologyLookupResult(oid** re // get class names resultCount[i] = 0; - result[i] = getOntologyCandidates(ontattributes, ontattributesCount, ontmetadata, ontmetadataCount, &(resultCount[i]), propOntologiesOids, propOntologiesCount, ontologyCount, propStat,totaltfidfsPerOntology); + result[i] = getOntologyCandidates(ontattributes, ontattributesCount, ontmetadata, ontmetadataCount, &(resultCount[i]), propOntologiesOids, propOntologiesCount, ontologyCount, propStat); for (j = 0; j < ontologyCount; ++j) { free(propOntologies[j]); @@ -1486,7 +1517,6 @@ void createOntologyLookupResult(oid** re free(propOntologiesCount); } freePropStat(propStat); - free(totaltfidfsPerOntology); } #endif @@ -2703,7 +2733,7 @@ CSlabel* createLabels(CSset* freqCSset, typeAttributesHistogramCount = initTypeAttributesHistogramCount(typeAttributesCount, freqCSset->numCSadded); typeAttributesHistogram = initTypeAttributesHistogram(typeAttributesCount, freqCSset->numCSadded); #if USE_TYPE_NAMES - createTypeAttributesHistogram(sbat, si, pi, oi, subjCSMap, freqCSset, csIdFreqIdxMap, typeAttributesCount, typeAttributesHistogram, typeAttributesHistogramCount, typeAttributes); + createTypeAttributesHistogram(sbat, si, pi, oi, subjCSMap, freqCSset, csIdFreqIdxMap, typeAttributesCount, typeAttributesHistogram, typeAttributesHistogramCount, typeAttributes, ontmetaBat, ontclassSet); typeStat = getTypeStats(&typeStatCount, freqCSset->numCSadded, typeAttributesCount, typeAttributesHistogram, typeAttributesHistogramCount); #else (void) sbat; _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list