Changeset: 6b057271bba9 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=6b057271bba9
Modified Files:
	monetdb5/extras/rdf/rdfschema.c
	monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Allow using S2 for all merged CS. Increase the threshold for TF-IDF diffs (65 lines): diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -3799,9 +3799,11 @@ char isSemanticSimilar(int freqId1, int } */ - + + if (0){ if ((freqId1 > numOrigFreqCS -1) || (freqId2 > numOrigFreqCS -1)) return 0; + } for (i = 0; i < minCount; i++){ if (labels[freqId1].hierarchy[hCount1-1-i] != labels[freqId2].hierarchy[hCount2-1-i]) @@ -3821,6 +3823,7 @@ char isSemanticSimilar(int freqId1, int } level++; } + /* printf("The common ancestor of freqCS %d ("BUNFMT") and freqCS %d ("BUNFMT") is: "BUNFMT" --- %f \n", freqId1, labels[freqId1].name, freqId2, labels[freqId2].name, tmpNode->uri, tmpNode->percentage); @@ -3830,8 +3833,8 @@ char isSemanticSimilar(int freqId1, int */ if (tmpNode->percentage < IMPORTANCE_THRESHOLD) { - printf("Merge two CS's %d (Label: "BUNFMT") and %d (Label: "BUNFMT") using the common ancestor ("BUNFMT") at level %d (score: %f)\n", - freqId1, labels[freqId1].name, freqId2, labels[freqId2].name,tmpNode->uri, i,tmpNode->percentage); + //printf("Merge two CS's %d (Label: "BUNFMT") and %d (Label: "BUNFMT") using the common ancestor ("BUNFMT") at level %d (score: %f)\n", + // freqId1, labels[freqId1].name, freqId2, labels[freqId2].name,tmpNode->uri, i,tmpNode->percentage); (*ancestor) = tmpNode->uri; return 1; @@ -4070,7 +4073,7 @@ void mergeCSByS2(CSset *freqCSset, CSlab #if NOT_MERGE_DIMENSIONCS if (freqCSset->items[freqId2].type == DIMENSIONCS) continue; #endif - + if (isLabelComparable == 1 && isSemanticSimilar(freqId1, freqId2, (*labels), ontoUsageTree,freqCSset->numOrigFreqCS, &name) == 1){ //printf("Same labels between freqCS %d and freqCS %d - Old simscore is %f \n", freqId1, freqId2, simscore); doMerge(freqCSset, S2, freqId1, freqId2, mergecsId, labels, ontmetadata, ontmetadataCount, name); @@ -4137,6 +4140,7 @@ void mergeCSByS4(CSset *freqCSset, CSlab 
#else if (simscore > SIM_THRESHOLD) { #endif + //printf("Merge %d and %d with simscore = %f \n",freqId1, freqId2,simscore); doMerge(freqCSset, S4, freqId1, freqId2, mergecsId, labels, ontmetadata, ontmetadataCount, BUN_NONE); } } diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h --- a/monetdb5/extras/rdf/rdfschema.h +++ b/monetdb5/extras/rdf/rdfschema.h @@ -235,7 +235,8 @@ typedef struct SubCSSet{ //#define INIT_NUM_CS 9999 // workaround #define INIT_NUM_CS 1000 // workaround #define SIM_THRESHOLD 0.6 -#define SIM_TFIDF_THRESHOLD 0.55 +//#define SIM_TFIDF_THRESHOLD 0.55 +#define SIM_TFIDF_THRESHOLD 0.75 #define IMPORTANCE_THRESHOLD 0.01 #define MIN_PERCETAGE_S5 5 // Merge all CS refered by more than 1/MIN_PERCETAGE_S6 percent of a CS via one property #define MIN_FROMTABLE_SIZE_S5 100 // The minimum size of the "from" table in S6. Meaning that _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list