Changeset: 81ad328df8fa for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=81ad328df8fa
Modified Files:
        monetdb5/extras/rdf/rdflabels.c
Branch: rdf
Log Message:

Merge with the changes from Linnea


diffs (284 lines):

diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -851,19 +851,89 @@ int compareTypeAttributesFreqs (const vo
 #endif
 
 #if USE_TYPE_NAMES
+/* Analyze hierarchy in a list of type values, add all leaf values to the 
histogram. Values that are not present in the hierarchy tree built from the 
ontologies are NOT added to the histogram. */
+static
+void insertLeafsIntoTypeAttributesHistogram(oid* typeList, int typeListLength, 
TypeAttributesFreq*** typeAttributesHistogram, int** 
typeAttributesHistogramCount, int csFreqIdx, int type, BAT *ontmetaBat, 
OntClass *ontclassSet) {
+       int             i, j, k;
+       int             fit;
+       char            *leaf; // flag whether a type value in 'typeList' is a 
leaf (1) or not (0)
+       BUN             pos;
+       OntClass        hierarchy;
+
+       // start with: every type value is a leaf
+       leaf = GDKmalloc(sizeof(char) * typeListLength);
+       for (i = 0; i < typeListLength; ++i) leaf[i] = 1;
+
+       // analyze hierarchy
+       for (i = 0; i < typeListLength; ++i) {
+               if (!leaf[i]) continue;
+               pos = BUNfnd(BATmirror(ontmetaBat), &typeList[i]);
+               if (pos == BUN_NONE) {
+                       // no ontology information for this type value, 
therefore it is not added to the hierarchy
+                       leaf[i] = 0;
+                       continue;
+               }
+
+               // get hierarchy of this type value
+               hierarchy = ontclassSet[pos];
+
+               // loop over superclasses, set leaf=0
+               for (j = 0; j < hierarchy.numsc; ++j) {
+                       for (k = 0; k < typeListLength; ++k) {
+                               if (i == k) continue;
+                               if (ontclassSet[hierarchy.scIdxes[j]].cOid == 
typeList[k]) {
+                                       // found superclass at position 'k'
+                                       leaf[k] = 0;
+                               }
+                       }
+               }
+       }
+
+       // add all leafs to the histogram
+       for (i = 0; i < typeListLength; ++i) {
+               if (!leaf[i]) continue;
+               fit = 0;
+               for (j = 0; j < typeAttributesHistogramCount[csFreqIdx][type]; 
++j) {
+                       if (typeAttributesHistogram[csFreqIdx][type][j].value 
== typeList[i]) {
+                               // bucket exists
+                               
typeAttributesHistogram[csFreqIdx][type][j].freq += 1;
+                               fit = 1;
+                               break;
+                       }
+               }
+               if (!fit) {
+                       // bucket does not exist
+                       // realloc
+                       typeAttributesHistogramCount[csFreqIdx][type] += 1;
+                       typeAttributesHistogram[csFreqIdx][type] = 
(TypeAttributesFreq *) realloc(typeAttributesHistogram[csFreqIdx][type], 
sizeof(TypeAttributesFreq) * typeAttributesHistogramCount[csFreqIdx][type]);
+                       if (!typeAttributesHistogram[csFreqIdx][type]) 
fprintf(stderr, "ERROR: Couldn't realloc memory!\n");
+
+                       // insert value
+                       
typeAttributesHistogram[csFreqIdx][type][typeAttributesHistogramCount[csFreqIdx][type]
 - 1].value = typeList[i];
+                       
typeAttributesHistogram[csFreqIdx][type][typeAttributesHistogramCount[csFreqIdx][type]
 - 1].freq = 1;
+               }
+       }
+
+       GDKfree(leaf);
+}
+
 /* Loop through all subjects to collect frequency statistics for type 
attribute values. */
 static
-void createTypeAttributesHistogram(BAT *sbat, BATiter si, BATiter pi, BATiter 
oi, oid *subjCSMap, CSset *freqCSset, int *csIdFreqIdxMap, int 
typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** 
typeAttributesHistogramCount, char** typeAttributes) {
+void createTypeAttributesHistogram(BAT *sbat, BATiter si, BATiter pi, BATiter 
oi, oid *subjCSMap, CSset *freqCSset, int *csIdFreqIdxMap, int 
typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** 
typeAttributesHistogramCount, char** typeAttributes, BAT *ontmetaBat, OntClass 
*ontclassSet) {
        // looping, extracting
        BUN             p, q;
        oid             *sbt, *obt, *pbt;
        char            objType;
        oid             objOid;
        int             csFreqIdx;
+       oid             curS; // last subject
+       int             curT; // last type (index in 'typeAttributes' array)
+       oid             *typeValues; // list of type values per subject and type
+       int             typeValuesSize;
+       int             typeValuesMaxSize = 10;
 
        // histogram
        int             i, j, k;
-       int             fit;
 
        oid             *typeAttributesOids = malloc(sizeof(oid) * 
typeAttributesCount);
 
@@ -878,6 +948,11 @@ void createTypeAttributesHistogram(BAT *
                TKNZRappend(&typeAttributesOids[i], &typeAttributes[i]);
        }
 
+       curS = BUN_NONE;
+       curT = -1;
+       typeValues = GDKmalloc(sizeof(oid) * typeValuesMaxSize);
+       if (!typeValues) fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+       typeValuesSize = 0;
        BATloop(sbat, p, q) {
                // Get data
                sbt = (oid *) BUNtloc(si, p);
@@ -907,32 +982,36 @@ void createTypeAttributesHistogram(BAT *
                                        objOid = objOid - (objType*2 + 1) *  
RDF_MIN_LITERAL;   /* Get the real objOid from Map or Tokenizer */
                                }
 
-                               // add object to histogram
-                               fit = 0;
-                               for (j = 0; j < 
typeAttributesHistogramCount[csFreqIdx][i]; ++j) {
-                                       if 
(typeAttributesHistogram[csFreqIdx][i][j].value == objOid) {
-                                               // bucket exists
-                                               
typeAttributesHistogram[csFreqIdx][i][j].freq += 1;
-                                               fit = 1;
-                                               break;
+                               // if finished looping over one subject or 
type, the list of type values is analyzed and added to the histogram
+                               if (curS != *sbt || curT != i) {
+                                       if (curS == BUN_NONE || typeValuesSize 
== 0) {
+                                               // nothing to add to histogram
+                                       } else {
+                                               // analyze values and add to 
histogram
+                                               
insertLeafsIntoTypeAttributesHistogram(typeValues, typeValuesSize, 
typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT, 
ontmetaBat, ontclassSet);
+                                               typeValuesSize = 0; // reset
                                        }
+                                       curS = *sbt;
+                                       curT = i;
                                }
-                               if (!fit) {
-                                       // bucket does not exist
-                                       // realloc
-                                       
typeAttributesHistogramCount[csFreqIdx][i] += 1;
-                                       typeAttributesHistogram[csFreqIdx][i] = 
(TypeAttributesFreq *) realloc(typeAttributesHistogram[csFreqIdx][i], 
sizeof(TypeAttributesFreq) * typeAttributesHistogramCount[csFreqIdx][i]);
-                                       if 
(!typeAttributesHistogram[csFreqIdx][i]) fprintf(stderr, "ERROR: Couldn't 
realloc memory!\n");
-
-                                       // insert value
-                                       
typeAttributesHistogram[csFreqIdx][i][typeAttributesHistogramCount[csFreqIdx][i]
 - 1].value = objOid;
-                                       
typeAttributesHistogram[csFreqIdx][i][typeAttributesHistogramCount[csFreqIdx][i]
 - 1].freq = 1;
+                               // add value to list of type values
+                               if (typeValuesSize == typeValuesMaxSize) {
+                                       // resize
+                                       typeValuesMaxSize *= 2;
+                                       typeValues = GDKrealloc(typeValues, 
sizeof(oid) * typeValuesMaxSize);
+                                       if (!typeValues) fprintf(stderr, 
"ERROR: Couldn't realloc memory!\n");
                                }
+                               typeValues[typeValuesSize++] = *obt;
                                break;
                        }
                }
        }
 
+       // analyze and add last set of typeValues
+       if (curS != BUN_NONE && typeValuesSize != 0) 
insertLeafsIntoTypeAttributesHistogram(typeValues, typeValuesSize, 
typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT, 
ontmetaBat, ontclassSet);
+
+       GDKfree(typeValues);
+
        // sort descending by frequency
        for (i = 0; i < freqCSset->numCSadded; ++i) {
                for (j = 0; j < typeAttributesCount; ++j) {
@@ -1094,7 +1173,7 @@ int compareOntologyCandidates (const voi
 #if USE_ONTOLOGY_NAMES
 /* For one CS: Calculate the ontology classes that are similar (tfidf) to the 
list of attributes. */
 static
-oid* getOntologyCandidates(oid** ontattributes, int ontattributesCount, oid** 
ontmetadata, int ontmetadataCount, int *resultCount, oid **listOids, int 
*listCount, int listNum, PropStat *propStat, float *totaltfidfsPerOntology) {
+oid* getOntologyCandidates(oid** ontattributes, int ontattributesCount, oid** 
ontmetadata, int ontmetadataCount, int *resultCount, oid **listOids, int 
*listCount, int listNum, PropStat *propStat) {
        int             i, j, k, l;
        oid             *result = NULL;
 
@@ -1147,6 +1226,7 @@ oid* getOntologyCandidates(oid** ontattr
                        BUN p, bun;
                        p = listOids[i][j];
                        bun = BUNfnd(BATmirror(propStat->pBat), (ptr) &p);
+                       if (bun == BUN_NONE) continue; // property does not 
belong to an ontology class and therefore has no tfidfs score
                        for (k = 0; k < candidatesCount[j]; ++k) { // for each 
candidate
                                // search for this class
                                int found = 0;
@@ -1169,21 +1249,6 @@ oid* getOntologyCandidates(oid** ontattr
                                }
                        }
                }
-               
-               //[DUC --- add the total tfidf score for a ontology class]  
//TODO: Compute before, not here
-               for (l = 0; l < num; ++l){
-                       for (j = 0; j < ontmetadataCount; ++j) {
-                               oid auri = ontmetadata[0][j];
-                               //printf("auri = " BUNFMT "\n", auri);
-                               if (auri == classStat[l].ontoClass){
-                                       //printf("Classstat %d (uri: "BUNFMT ") 
- Set totaltfidf with ontology %dth: %f \n", l, auri, j, 
totaltfidfsPerOntology[j]); 
-                                       classStat[l].totaltfidfs = 
totaltfidfsPerOntology[j]; 
-                                       break; 
-                               }
-                       }
-               }
-               //[ ... DUC]
-
 
                // calculate optimal tfidf score (all properties) & normalize 
tfidf sums
                totalTfidfs = 0.0;
@@ -1194,11 +1259,7 @@ oid* getOntologyCandidates(oid** ontattr
                        totalTfidfs += (propStat->tfidfs[bun] * 
propStat->tfidfs[bun]);
                }
                for (j = 0; j < num; ++j) {
-                       //classStat[j].tfidfs /= totalTfidfs;  //[DUC--modify]
-                       //printf("original classStat[j].tfidfs = %f \n", 
classStat[j].tfidfs);
-                       classStat[j].tfidfs = classStat[j].tfidfs / 
(sqrt(totalTfidfs)*sqrt(classStat[j].totaltfidfs));
-                       //printf("totalTfidfs = %f    || 
classStat[j].totaltfidfs =  %f || classStat[j].tfidfs = %f 
\n",totalTfidfs,classStat[j].totaltfidfs,classStat[j].tfidfs);
-                       
+                       classStat[j].tfidfs /= totalTfidfs;
                }
 
                // sort by tfidf desc
@@ -1408,8 +1469,6 @@ static
 void createOntologyLookupResult(oid** result, CSset* freqCSset, int* 
resultCount, oid** ontattributes, int ontattributesCount, oid** ontmetadata, 
int ontmetadataCount) {
        int             i, j;
        PropStat        *propStat;
-       float*          totaltfidfsPerOntology;         //[DUC]
-       oid             lastUri; 
 
        propStat = initPropStat();
 
@@ -1417,34 +1476,6 @@ void createOntologyLookupResult(oid** re
        // Not the properties from freqCS
        //createPropStatistics(propStat, freqCSset->numCSadded, freqCSset);
        createPropStatistics(propStat, ontattributes, ontattributesCount);
-       
-
-       lastUri = BUN_NONE; 
-       totaltfidfsPerOntology = (float*) malloc(sizeof(float) * 
ontmetadataCount);
-       //printf("Init tfidf for all %d ontologies \n",ontmetadataCount );
-       for (i = 0; i < ontmetadataCount; ++i) {
-               oid auri = ontmetadata[0][i];
-
-               if (auri == lastUri){ 
-                       //printf("Duplication at %d value " BUNFMT "\n", i, 
auri); 
-                       continue; 
-               }
-               else lastUri = auri; 
-               totaltfidfsPerOntology[i] = 0; 
-
-               for (j = 0; j < ontattributesCount; j++){
-                       oid tmpuri = ontattributes[0][j];
-                       oid aattr = ontattributes[1][j];
-                       if (auri == tmpuri){
-                               BUN bun = BUNfnd(BATmirror(propStat->pBat), 
(ptr) &aattr);
-                               if (bun == BUN_NONE) printf("[Debug] This 
cannot happen \n");
-                               else
-                                       totaltfidfsPerOntology[i] += 
(propStat->tfidfs[bun] * propStat->tfidfs[bun]);
-                       }       
-               }
-               //printf("Computed totaltfidfsPerOntology of ontology %d: %f 
(uri = "BUNFMT")\n",i, totaltfidfsPerOntology[i],auri);
-       }
-       //... [DUC]
 
        for (i = 0; i < freqCSset->numCSadded; ++i) {
                CS              cs;
@@ -1475,7 +1506,7 @@ void createOntologyLookupResult(oid** re
 
                // get class names
                resultCount[i] = 0;
-               result[i] = getOntologyCandidates(ontattributes, 
ontattributesCount, ontmetadata, ontmetadataCount, &(resultCount[i]), 
propOntologiesOids, propOntologiesCount, ontologyCount, 
propStat,totaltfidfsPerOntology);
+               result[i] = getOntologyCandidates(ontattributes, 
ontattributesCount, ontmetadata, ontmetadataCount, &(resultCount[i]), 
propOntologiesOids, propOntologiesCount, ontologyCount, propStat);
 
                for (j = 0; j < ontologyCount; ++j) {
                        free(propOntologies[j]);
@@ -1486,7 +1517,6 @@ void createOntologyLookupResult(oid** re
                free(propOntologiesCount);
        }
        freePropStat(propStat);
-       free(totaltfidfsPerOntology);
 }
 #endif
 
@@ -2703,7 +2733,7 @@ CSlabel* createLabels(CSset* freqCSset, 
        typeAttributesHistogramCount = 
initTypeAttributesHistogramCount(typeAttributesCount, freqCSset->numCSadded);
        typeAttributesHistogram = 
initTypeAttributesHistogram(typeAttributesCount, freqCSset->numCSadded);
 #if USE_TYPE_NAMES
-       createTypeAttributesHistogram(sbat, si, pi, oi, subjCSMap, freqCSset, 
csIdFreqIdxMap, typeAttributesCount, typeAttributesHistogram, 
typeAttributesHistogramCount, typeAttributes);
+       createTypeAttributesHistogram(sbat, si, pi, oi, subjCSMap, freqCSset, 
csIdFreqIdxMap, typeAttributesCount, typeAttributesHistogram, 
typeAttributesHistogramCount, typeAttributes, ontmetaBat, ontclassSet);
        typeStat = getTypeStats(&typeStatCount, freqCSset->numCSadded, 
typeAttributesCount, typeAttributesHistogram, typeAttributesHistogramCount);
 #else
        (void) sbat;
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to