Changeset: a5f2a79c980b for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=a5f2a79c980b Modified Files: monetdb5/extras/rdf/rdflabels.c monetdb5/extras/rdf/rdfschema.c monetdb5/extras/rdf/rdfschema.h Branch: rdf Log Message:
Add hierarchy information for an ontology-based name regardless of where the name comes from. The name can be assigned based on the type value, ontology-class similarity, or FK. However, if the name can be found among the ontology classes, we build the ontology hierarchy for that name. This prevents the problem that occurred when most of the CS's from the dbpsb dataset got their names from their type values: these type values are also ontology classes, but we did not build any hierarchy information for those CS's. Also make the merging rule that uses the common ancestor stricter by setting the importance threshold to 0.001. diffs (226 lines): diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -2186,6 +2186,7 @@ void getTableName(CSlabel* label, int cs maxFreq = typeAttributesHistogram[csIdx][i][0].freq; ontClassPos = BUNfnd(BATmirror(ontmetaBat), &maxDepthOid); if ( ontClassPos != BUN_NONE){ + foundOntologyTypeValue = 1; maxDepth = ontclassSet[ontClassPos].hierDepth; } else{ @@ -2332,7 +2333,6 @@ void getTableName(CSlabel* label, int cs if (choosenOntologyTypeValue == BUN_NONE && resultCount[csIdx] >= 1){ label->name = result[csIdx][bestOntCandIdx]; - label->hierarchy = getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata, ontmetadataCount); nameFound = 1; #if INFO_WHERE_NAME_FROM label->isOntology = 1; @@ -2369,6 +2369,16 @@ void getTableName(CSlabel* label, int cs } } + + //Add hierarchy information for ontology-based name + if (nameFound){ + ontClassPos = BUNfnd(BATmirror(ontmetaBat), &(label->name)); + if ( ontClassPos != BUN_NONE){ + label->hierarchy = getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata, ontmetadataCount); + } + } + + //if no name is found, check again the typecount to assign a name #if USE_BEST_TYPEVALUE_INSTEADOF_DUMMY if (!nameFound){ @@ -2610,11 +2620,10 @@ void printTree(OntoUsageNode* tree, int } static -void createOntoUsageTree(OntoUsageNode** 
tree, CSset* freqCSset, oid** ontmetadata, int ontmetadataCount, oid** result, int* resultCount, int typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** typeAttributesHistogramCount) { - int i, j, k, l; - oid *tmpList; - int tmpListCount; +void createOntoUsageTree(OntoUsageNode** tree, CSset* freqCSset, oid** ontmetadata, int ontmetadataCount, BAT *ontmetaBat,CSlabel* labels) { + int i; int numTuples = 0; + BUN pos; // init tree with an artifical root node (*tree) = (OntoUsageNode *) malloc(sizeof(OntoUsageNode)); @@ -2633,46 +2642,12 @@ void createOntoUsageTree(OntoUsageNode** int hierarchyCount = 0; oid* hierarchy; - // get ontology - // copied from getTableName - if (resultCount[i] == 0) { - // no hierarchy --> ignore - continue; - } else if (resultCount[i] == 1) { - // one ontology class --> use it - uri = result[i][0]; - } else { - // multiple ontology classes --> intersect with types - tmpList = NULL; - tmpListCount = 0; - // search for type values - for (l = 0; l < typeAttributesCount; ++l) { - for (j = 0; j < typeAttributesHistogramCount[i][l]; ++j) { - if (typeAttributesHistogram[i][l][j].percent < TYPE_FREQ_THRESHOLD) break; // sorted - // intersect type with ontology classes - for (k = 0; k < resultCount[i]; ++k) { - if (result[i][k] == typeAttributesHistogram[i][l][j].value) { - // found, copy ontology class to tmpList - tmpList = (oid *) realloc(tmpList, sizeof(oid) * (tmpListCount + 1)); - if (!tmpList) fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); - tmpList[tmpListCount] = result[i][k]; - tmpListCount += 1; - } - } - } - } - if (tmpListCount == 1) { - // only one left --> use it - uri = tmpList[0]; - } else if (tmpListCount > 1) { - // multiple left --> use the class that covers most attributes, most popular ontology, ... - uri = tmpList[0]; // sorted - } else { - // empty intersection -> use the class that covers most attributes, most popular ontology, .. 
- uri = result[i][0]; // sorted - } - free(tmpList); - } + uri = labels[i].name; + if (uri == BUN_NONE) continue; //No name freqCS + + //Check if the name is ontology name + pos = BUNfnd(BATmirror(ontmetaBat), &uri); + if (pos == BUN_NONE) continue; // no ontology information, ignore // get ontology hierarchy hierarchy = getOntoHierarchy(uri, &hierarchyCount, ontmetadata, ontmetadataCount); @@ -2872,7 +2847,7 @@ CSlabel* createLabels(CSset* freqCSset, #endif // Collect ontology statistics (tree) - createOntoUsageTree(ontoUsageTree, freqCSset, ontmetadata, ontmetadataCount, ontologyLookupResult, ontologyLookupResultCount, typeAttributesCount, typeAttributesHistogram, typeAttributesHistogramCount); + createOntoUsageTree(ontoUsageTree, freqCSset, ontmetadata, ontmetadataCount, ontmetaBat, labels); free(ontologyLookupResultCount); freeOntologyLookupResult(ontologyLookupResult, ontologyLookupResutMatchedProp, freqCSset->numCSadded); diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -3868,6 +3868,7 @@ char isSemanticSimilar(int freqId1, int int hCount1, hCount2; int level; OntoUsageNode *tmpNode; + /* int k1, k2; if (labels[freqId1].name == labels[freqId2].name) @@ -3887,7 +3888,7 @@ char isSemanticSimilar(int freqId1, int } } */ - + // Check for the most common ancestor hCount1 = labels[freqId1].hierarchyCount; hCount2 = labels[freqId2].hierarchyCount; @@ -3899,17 +3900,17 @@ char isSemanticSimilar(int freqId1, int printf("Finding common ancestor for %d and %d \n", freqId1, freqId2 ); printf("FreqCS1: "); for (i = 0; i < hCount1; i++){ - printf(" %s", labels[freqId1].hierarchy[hCount1-1-i]); + printf(" " BUNFMT, labels[freqId1].hierarchy[hCount1-1-i]); } printf(" \n "); printf("FreqCS2: "); for (i = 0; i < hCount2; i++){ - printf(" %s", labels[freqId2].hierarchy[hCount2-1-i]); + printf(" " BUNFMT, labels[freqId2].hierarchy[hCount2-1-i]); } printf(" \n "); } - */ + 
if (0){ if ((freqId1 > numOrigFreqCS -1) || (freqId2 > numOrigFreqCS -1)) @@ -3951,8 +3952,10 @@ char isSemanticSimilar(int freqId1, int oid classOid; BUN ontClassPos; classOid = tmpNode->uri; + ontClassPos = BUNfnd(BATmirror(ontmetaBat), &classOid); assert(ontClassPos != BUN_NONE); + /* if (ontClassPos != BUN_NONE){ printf(" Specific level: %d \n", ontclassSet[ontClassPos].hierDepth); @@ -4031,8 +4034,6 @@ void mergeCSByS2S4(CSset *freqCSset, CSl oid name; /* Name of the common ancestor */ TFIDFInfo *tfidfInfos; - - (void) labels; (void) isLabelComparable; @@ -4193,12 +4194,14 @@ void mergeCSByS2(CSset *freqCSset, CSlab #if NOT_MERGE_DIMENSIONCS if (freqCSset->items[freqId1].type == DIMENSIONCS) continue; #endif + + if ((*labels)[freqId1].hierarchyCount < 1) continue; + for (j = (i+1); j < curNumMergeCS; j++){ freqId2 = mergeCSFreqCSMap[j]; #if NOT_MERGE_DIMENSIONCS if (freqCSset->items[freqId2].type == DIMENSIONCS) continue; #endif - if (isLabelComparable == 1 && isSemanticSimilar(freqId1, freqId2, (*labels), ontoUsageTree,freqCSset->numOrigFreqCS, &name, ontmetaBat, ontclassSet) == 1){ //printf("Same labels between freqCS %d and freqCS %d - Old simscore is %f \n", freqId1, freqId2, simscore); doMerge(freqCSset, S2, freqId1, freqId2, mergecsId, labels, ontmetadata, ontmetadataCount, name); @@ -4272,6 +4275,7 @@ void mergeCSByS4(CSset *freqCSset, CSlab #else if (simscore > SIM_THRESHOLD) { #endif + //printf(" Similarity score (%d and %d) cosine = %f \n", freqId1,freqId2,simscore); /* if ((*labels)[freqId1].name != BUN_NONE){ takeOid((*labels)[freqId1].name, &freqCSname1); @@ -7526,7 +7530,6 @@ RDFextractCSwithTypes(int *ret, bat *sba curNumMergeCS = countNumberMergeCS(freqCSset); printf("Before using rules: Number of freqCS is: %d \n",curNumMergeCS); - /* ---------- S1 ------- */ mergecsId = *maxCSoid + 1; @@ -7572,7 +7575,6 @@ RDFextractCSwithTypes(int *ret, bat *sba mergeCSFreqCSMap = (oid*) malloc(sizeof(oid) * curNumMergeCS); initMergeCSFreqCSMap(freqCSset, 
mergeCSFreqCSMap); - /* S5: Merged CS referred from the same CS via the same property */ if (1){ tmpCSrelToMergeCS = generateCsRelToMergeFreqSet(csrelSet, freqCSset); diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h --- a/monetdb5/extras/rdf/rdfschema.h +++ b/monetdb5/extras/rdf/rdfschema.h @@ -238,7 +238,7 @@ typedef struct SubCSSet{ #define SIM_THRESHOLD 0.6 //#define SIM_TFIDF_THRESHOLD 0.55 #define SIM_TFIDF_THRESHOLD 0.75 -#define IMPORTANCE_THRESHOLD 0.01 //This is used when merging CS's by common ancestor +#define IMPORTANCE_THRESHOLD 0.001 //This is used when merging CS's by common ancestor #define COMMON_ANCESTOR_LOWEST_SPECIFIC_LEVEL 2 #define MIN_PERCETAGE_S5 5 // Merge all CS refered by more than 1/MIN_PERCETAGE_S6 percent of a CS via one property _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list