Changeset: 81ad328df8fa for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=81ad328df8fa
Modified Files:
  monetdb5/extras/rdf/rdflabels.c
Branch: rdf
Log Message:
Merge with the changes from Linnea diffs (284 lines): diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -851,19 +851,89 @@ int compareTypeAttributesFreqs (const vo #endif #if USE_TYPE_NAMES +/* Analyze hierarchy in a list of type values, add all leaf values to the histogram. Values that are not present in the hierarchy tree built from the ontologies are NOT added to the histogram. */ +static +void insertLeafsIntoTypeAttributesHistogram(oid* typeList, int typeListLength, TypeAttributesFreq*** typeAttributesHistogram, int** typeAttributesHistogramCount, int csFreqIdx, int type, BAT *ontmetaBat, OntClass *ontclassSet) { + int i, j, k; + int fit; + char *leaf; // flag whether a type value in 'typeList' is a leaf (1) or not (0) + BUN pos; + OntClass hierarchy; + + // start with: every type value is a leaf + leaf = GDKmalloc(sizeof(char) * typeListLength); + for (i = 0; i < typeListLength; ++i) leaf[i] = 1; + + // analyze hierarchy + for (i = 0; i < typeListLength; ++i) { + if (!leaf[i]) continue; + pos = BUNfnd(BATmirror(ontmetaBat), &typeList[i]); + if (pos == BUN_NONE) { + // no ontology information for this type value, therefore it is not added to the hierarchy + leaf[i] = 0; + continue; + } + + // get hierarchy of this type value + hierarchy = ontclassSet[pos]; + + // loop over superclasses, set leaf=0 + for (j = 0; j < hierarchy.numsc; ++j) { + for (k = 0; k < typeListLength; ++k) { + if (i == k) continue; + if (ontclassSet[hierarchy.scIdxes[j]].cOid == typeList[k]) { + // found superclass at position 'k' + leaf[k] = 0; + } + } + } + } + + // add all leafs to the histogram + for (i = 0; i < typeListLength; ++i) { + if (!leaf[i]) continue; + fit = 0; + for (j = 0; j < typeAttributesHistogramCount[csFreqIdx][type]; ++j) { + if (typeAttributesHistogram[csFreqIdx][type][j].value == typeList[i]) { + // bucket exists + 
typeAttributesHistogram[csFreqIdx][type][j].freq += 1; + fit = 1; + break; + } + } + if (!fit) { + // bucket does not exist + // realloc + typeAttributesHistogramCount[csFreqIdx][type] += 1; + typeAttributesHistogram[csFreqIdx][type] = (TypeAttributesFreq *) realloc(typeAttributesHistogram[csFreqIdx][type], sizeof(TypeAttributesFreq) * typeAttributesHistogramCount[csFreqIdx][type]); + if (!typeAttributesHistogram[csFreqIdx][type]) fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); + + // insert value + typeAttributesHistogram[csFreqIdx][type][typeAttributesHistogramCount[csFreqIdx][type] - 1].value = typeList[i]; + typeAttributesHistogram[csFreqIdx][type][typeAttributesHistogramCount[csFreqIdx][type] - 1].freq = 1; + } + } + + GDKfree(leaf); +} + /* Loop through all subjects to collect frequency statistics for type attribute values. */ static -void createTypeAttributesHistogram(BAT *sbat, BATiter si, BATiter pi, BATiter oi, oid *subjCSMap, CSset *freqCSset, int *csIdFreqIdxMap, int typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** typeAttributesHistogramCount, char** typeAttributes) { +void createTypeAttributesHistogram(BAT *sbat, BATiter si, BATiter pi, BATiter oi, oid *subjCSMap, CSset *freqCSset, int *csIdFreqIdxMap, int typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** typeAttributesHistogramCount, char** typeAttributes, BAT *ontmetaBat, OntClass *ontclassSet) { // looping, extracting BUN p, q; oid *sbt, *obt, *pbt; char objType; oid objOid; int csFreqIdx; + oid curS; // last subject + int curT; // last type (index in 'typeAttributes' array) + oid *typeValues; // list of type values per subject and type + int typeValuesSize; + int typeValuesMaxSize = 10; // histogram int i, j, k; - int fit; oid *typeAttributesOids = malloc(sizeof(oid) * typeAttributesCount); @@ -878,6 +948,11 @@ void createTypeAttributesHistogram(BAT * TKNZRappend(&typeAttributesOids[i], &typeAttributes[i]); } + curS = BUN_NONE; + curT = -1; + 
typeValues = GDKmalloc(sizeof(oid) * typeValuesMaxSize); + if (!typeValues) fprintf(stderr, "ERROR: Couldn't malloc memory!\n"); + typeValuesSize = 0; BATloop(sbat, p, q) { // Get data sbt = (oid *) BUNtloc(si, p); @@ -907,32 +982,36 @@ void createTypeAttributesHistogram(BAT * objOid = objOid - (objType*2 + 1) * RDF_MIN_LITERAL; /* Get the real objOid from Map or Tokenizer */ } - // add object to histogram - fit = 0; - for (j = 0; j < typeAttributesHistogramCount[csFreqIdx][i]; ++j) { - if (typeAttributesHistogram[csFreqIdx][i][j].value == objOid) { - // bucket exists - typeAttributesHistogram[csFreqIdx][i][j].freq += 1; - fit = 1; - break; + // if finished looping over one subject or type, the list of type values is analyzed and added to the histogram + if (curS != *sbt || curT != i) { + if (curS == BUN_NONE || typeValuesSize == 0) { + // nothing to add to histogram + } else { + // analyze values and add to histogram + insertLeafsIntoTypeAttributesHistogram(typeValues, typeValuesSize, typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT, ontmetaBat, ontclassSet); + typeValuesSize = 0; // reset } + curS = *sbt; + curT = i; } - if (!fit) { - // bucket does not exist - // realloc - typeAttributesHistogramCount[csFreqIdx][i] += 1; - typeAttributesHistogram[csFreqIdx][i] = (TypeAttributesFreq *) realloc(typeAttributesHistogram[csFreqIdx][i], sizeof(TypeAttributesFreq) * typeAttributesHistogramCount[csFreqIdx][i]); - if (!typeAttributesHistogram[csFreqIdx][i]) fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); - - // insert value - typeAttributesHistogram[csFreqIdx][i][typeAttributesHistogramCount[csFreqIdx][i] - 1].value = objOid; - typeAttributesHistogram[csFreqIdx][i][typeAttributesHistogramCount[csFreqIdx][i] - 1].freq = 1; + // add value to list of type values + if (typeValuesSize == typeValuesMaxSize) { + // resize + typeValuesMaxSize *= 2; + typeValues = GDKrealloc(typeValues, sizeof(oid) * typeValuesMaxSize); + if (!typeValues) 
fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); } + typeValues[typeValuesSize++] = *obt; break; } } } + // analyze and add last set of typeValues + if (curS != BUN_NONE && typeValuesSize != 0) insertLeafsIntoTypeAttributesHistogram(typeValues, typeValuesSize, typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT, ontmetaBat, ontclassSet); + + GDKfree(typeValues); + // sort descending by frequency for (i = 0; i < freqCSset->numCSadded; ++i) { for (j = 0; j < typeAttributesCount; ++j) { @@ -1094,7 +1173,7 @@ int compareOntologyCandidates (const voi #if USE_ONTOLOGY_NAMES /* For one CS: Calculate the ontology classes that are similar (tfidf) to the list of attributes. */ static -oid* getOntologyCandidates(oid** ontattributes, int ontattributesCount, oid** ontmetadata, int ontmetadataCount, int *resultCount, oid **listOids, int *listCount, int listNum, PropStat *propStat, float *totaltfidfsPerOntology) { +oid* getOntologyCandidates(oid** ontattributes, int ontattributesCount, oid** ontmetadata, int ontmetadataCount, int *resultCount, oid **listOids, int *listCount, int listNum, PropStat *propStat) { int i, j, k, l; oid *result = NULL; @@ -1147,6 +1226,7 @@ oid* getOntologyCandidates(oid** ontattr BUN p, bun; p = listOids[i][j]; bun = BUNfnd(BATmirror(propStat->pBat), (ptr) &p); + if (bun == BUN_NONE) continue; // property does not belong to an ontology class and therefore has no tfidfs score for (k = 0; k < candidatesCount[j]; ++k) { // for each candidate // search for this class int found = 0; @@ -1169,21 +1249,6 @@ oid* getOntologyCandidates(oid** ontattr } } } - - //[DUC --- add the total tfidf score for a ontology class] //TODO: Compute before, not here - for (l = 0; l < num; ++l){ - for (j = 0; j < ontmetadataCount; ++j) { - oid auri = ontmetadata[0][j]; - //printf("auri = " BUNFMT "\n", auri); - if (auri == classStat[l].ontoClass){ - //printf("Classstat %d (uri: "BUNFMT ") - Set totaltfidf with ontology %dth: %f \n", l, auri, j, 
totaltfidfsPerOntology[j]); - classStat[l].totaltfidfs = totaltfidfsPerOntology[j]; - break; - } - } - } - //[ ... DUC] - // calculate optimal tfidf score (all properties) & normalize tfidf sums totalTfidfs = 0.0; @@ -1194,11 +1259,7 @@ oid* getOntologyCandidates(oid** ontattr totalTfidfs += (propStat->tfidfs[bun] * propStat->tfidfs[bun]); } for (j = 0; j < num; ++j) { - //classStat[j].tfidfs /= totalTfidfs; //[DUC--modify] - //printf("original classStat[j].tfidfs = %f \n", classStat[j].tfidfs); - classStat[j].tfidfs = classStat[j].tfidfs / (sqrt(totalTfidfs)*sqrt(classStat[j].totaltfidfs)); - //printf("totalTfidfs = %f || classStat[j].totaltfidfs = %f || classStat[j].tfidfs = %f \n",totalTfidfs,classStat[j].totaltfidfs,classStat[j].tfidfs); - + classStat[j].tfidfs /= totalTfidfs; } // sort by tfidf desc @@ -1408,8 +1469,6 @@ static void createOntologyLookupResult(oid** result, CSset* freqCSset, int* resultCount, oid** ontattributes, int ontattributesCount, oid** ontmetadata, int ontmetadataCount) { int i, j; PropStat *propStat; - float* totaltfidfsPerOntology; //[DUC] - oid lastUri; propStat = initPropStat(); @@ -1417,34 +1476,6 @@ void createOntologyLookupResult(oid** re // Not the properties from freqCS //createPropStatistics(propStat, freqCSset->numCSadded, freqCSset); createPropStatistics(propStat, ontattributes, ontattributesCount); - - - lastUri = BUN_NONE; - totaltfidfsPerOntology = (float*) malloc(sizeof(float) * ontmetadataCount); - //printf("Init tfidf for all %d ontologies \n",ontmetadataCount ); - for (i = 0; i < ontmetadataCount; ++i) { - oid auri = ontmetadata[0][i]; - - if (auri == lastUri){ - //printf("Duplication at %d value " BUNFMT "\n", i, auri); - continue; - } - else lastUri = auri; - totaltfidfsPerOntology[i] = 0; - - for (j = 0; j < ontattributesCount; j++){ - oid tmpuri = ontattributes[0][j]; - oid aattr = ontattributes[1][j]; - if (auri == tmpuri){ - BUN bun = BUNfnd(BATmirror(propStat->pBat), (ptr) &aattr); - if (bun == BUN_NONE) 
printf("[Debug] This cannot happen \n"); - else - totaltfidfsPerOntology[i] += (propStat->tfidfs[bun] * propStat->tfidfs[bun]); - } - } - //printf("Computed totaltfidfsPerOntology of ontology %d: %f (uri = "BUNFMT")\n",i, totaltfidfsPerOntology[i],auri); - } - //... [DUC] for (i = 0; i < freqCSset->numCSadded; ++i) { CS cs; @@ -1475,7 +1506,7 @@ void createOntologyLookupResult(oid** re // get class names resultCount[i] = 0; - result[i] = getOntologyCandidates(ontattributes, ontattributesCount, ontmetadata, ontmetadataCount, &(resultCount[i]), propOntologiesOids, propOntologiesCount, ontologyCount, propStat,totaltfidfsPerOntology); + result[i] = getOntologyCandidates(ontattributes, ontattributesCount, ontmetadata, ontmetadataCount, &(resultCount[i]), propOntologiesOids, propOntologiesCount, ontologyCount, propStat); for (j = 0; j < ontologyCount; ++j) { free(propOntologies[j]); @@ -1486,7 +1517,6 @@ void createOntologyLookupResult(oid** re free(propOntologiesCount); } freePropStat(propStat); - free(totaltfidfsPerOntology); } #endif @@ -2703,7 +2733,7 @@ CSlabel* createLabels(CSset* freqCSset, typeAttributesHistogramCount = initTypeAttributesHistogramCount(typeAttributesCount, freqCSset->numCSadded); typeAttributesHistogram = initTypeAttributesHistogram(typeAttributesCount, freqCSset->numCSadded); #if USE_TYPE_NAMES - createTypeAttributesHistogram(sbat, si, pi, oi, subjCSMap, freqCSset, csIdFreqIdxMap, typeAttributesCount, typeAttributesHistogram, typeAttributesHistogramCount, typeAttributes); + createTypeAttributesHistogram(sbat, si, pi, oi, subjCSMap, freqCSset, csIdFreqIdxMap, typeAttributesCount, typeAttributesHistogram, typeAttributesHistogramCount, typeAttributes, ontmetaBat, ontclassSet); typeStat = getTypeStats(&typeStatCount, freqCSset->numCSadded, typeAttributesCount, typeAttributesHistogram, typeAttributesHistogramCount); #else (void) sbat; _______________________________________________ checkin-list mailing list checkin-list@monetdb.org 
https://www.monetdb.org/mailman/listinfo/checkin-list