Changeset: a0bcec66e6b1 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=a0bcec66e6b1 Modified Files: monetdb5/extras/rdf/rdfschema.c monetdb5/extras/rdf/rdfschema.h Branch: rdf Log Message:
Improve S4: Merging CS's if they have at least one discriminating prop in common. diffs (238 lines): diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -2718,7 +2718,7 @@ float similarityScore(oid* arr1, oid* ar /*Using cosine similarity score with vector of tf-idfs for properties in each CS */ static float similarityScoreTFIDF(oid* arr1, oid* arr2, int m, int n, int *numCombineP, - TFIDFInfo *tfidfInfos, int mergeCSId1, int mergeCSId2){ + TFIDFInfo *tfidfInfos, int mergeCSId1, int mergeCSId2, char *existDiscriminatingProp){ int i = 0, j = 0; int numOverlap = 0; @@ -2734,6 +2734,8 @@ float similarityScoreTFIDF(oid* arr1, oi } else if( arr1[j] == arr2[i] ) { + if (tfidfInfos[mergeCSId1].lsttfidfs[j] > MIN_TFIDF_PROP_S4) *existDiscriminatingProp = 1; + sumXY += tfidfInfos[mergeCSId1].lsttfidfs[j] * tfidfInfos[mergeCSId1].lsttfidfs[j]; j++; i++; @@ -4016,160 +4018,6 @@ void freeTFIDFInfo(TFIDFInfo *tfidfInfos free(tfidfInfos); } -#if COMBINE_S2_S4 -static -void mergeCSByS2S4(CSset *freqCSset, CSlabel** labels, oid* mergeCSFreqCSMap, int curNumMergeCS, oid *mergecsId,OntoUsageNode *ontoUsageTree, oid **ontmetadata, int ontmetadataCount){ - int i, j, k; - int freqId1, freqId2; - float simscore = 0.0; - CS *mergecs; - int existMergecsId; - int numCombineP = 0; - CS *cs1, *cs2; - CS *existmergecs, *mergecs1, *mergecs2; - - PropStat *propStat; /* Store statistics about properties */ - char isLabelComparable = 0; - char isSameLabel = 0; - oid name; /* Name of the common ancestor */ - TFIDFInfo *tfidfInfos; - - (void) labels; - (void) isLabelComparable; - - - propStat = initPropStat(); - getPropStatisticsFromMergeCSs(propStat, curNumMergeCS, mergeCSFreqCSMap, freqCSset); /*TODO: Get PropStat from MaxCSs or From mergedCS only*/ - tfidfInfos = (TFIDFInfo*)malloc(sizeof(TFIDFInfo) * curNumMergeCS); - initTFIDFInfos(tfidfInfos, curNumMergeCS, mergeCSFreqCSMap, freqCSset, 
propStat); - - - for (i = 0; i < curNumMergeCS; i++){ - freqId1 = mergeCSFreqCSMap[i]; - //printf("Label of %d CS is %s \n", freqId1, (*labels)[freqId1].name); - isLabelComparable = 0; - if ((*labels)[freqId1].name != BUN_NONE) isLabelComparable = 1; // no "DUMMY" - - - #if NOT_MERGE_DIMENSIONCS - if (freqCSset->items[freqId1].type == DIMENSIONCS) continue; - #endif - for (j = (i+1); j < curNumMergeCS; j++){ - cs1 = (CS*) &(freqCSset->items[freqId1]); - - freqId2 = mergeCSFreqCSMap[j]; - cs2 = (CS*) &(freqCSset->items[freqId2]); - #if NOT_MERGE_DIMENSIONCS - if (cs2->type == DIMENSIONCS) continue; - #endif - isSameLabel = 0; - - #if USE_LABEL_FOR_MERGING - if (isLabelComparable == 1 && isSemanticSimilar(freqId1, freqId2, (*labels), ontoUsageTree,freqCSset->numOrigFreqCS, &name) == 1){ - //printf("Same labels between freqCS %d and freqCS %d - Old simscore is %f \n", freqId1, freqId2, simscore); - isSameLabel = 1; - simscore = 1; - } - #endif - - if (isSameLabel == 0){ - if(USINGTFIDF == 0){ - simscore = similarityScore(cs1->lstProp, cs2->lstProp, - cs1->numProp,cs2->numProp,&numCombineP); - - //printf("simscore Jaccard = %f \n", simscore); - } - else{ - simscore = similarityScoreTFIDF(cs1->lstProp, cs2->lstProp, - cs1->numProp,cs2->numProp,&numCombineP, tfidfInfos, i, j); - //printf(" Cosine = %f \n", simscore); - - } - } - - //simscore = 0.0; - #if USINGTFIDF - if (simscore > SIM_TFIDF_THRESHOLD){ - #else - if (simscore > SIM_THRESHOLD) { - #endif - //printf("S4: merge freqCS %d and freqCS %d (sim: %f)\n", freqId1, freqId2,simscore); - //Check whether these CS's belong to any mergeCS - if (cs1->parentFreqIdx == -1 && cs2->parentFreqIdx == -1){ /* New merge */ - mergecs = mergeTwoCSs(*cs1,*cs2, freqId1,freqId2, *mergecsId); - //addmergeCStoSet(mergecsSet, *mergecs); - cs1->parentFreqIdx = freqCSset->numCSadded; - cs2->parentFreqIdx = freqCSset->numCSadded; - addCStoSet(freqCSset,*mergecs); - if (isSameLabel) { - // rule S2 - updateLabel(S2, freqCSset, labels, 1, 
freqCSset->numCSadded - 1, freqId1, freqId2, name, ontmetadata, ontmetadataCount, NULL, -1); - } else { - // rule S4 - updateLabel(S4, freqCSset, labels, 1, freqCSset->numCSadded - 1, freqId1, freqId2, BUN_NONE, ontmetadata, ontmetadataCount, NULL, -1); - } - free(mergecs); - - mergecsId[0]++; - - - } - else if (cs1->parentFreqIdx == -1 && cs2->parentFreqIdx != -1){ - existMergecsId = cs2->parentFreqIdx; - existmergecs = (CS*) &(freqCSset->items[existMergecsId]); - mergeACStoExistingmergeCS(*cs1,freqId1, existmergecs); - cs1->parentFreqIdx = existMergecsId; - if (isSameLabel) { - // rule S2 - updateLabel(S2, freqCSset, labels, 0, existMergecsId, freqId1, freqId2, name, ontmetadata, ontmetadataCount, NULL, -1); - } else { - // rule S4 - updateLabel(S4, freqCSset, labels, 0, existMergecsId, freqId1, freqId2, BUN_NONE, ontmetadata, ontmetadataCount, NULL, -1); - } - } - - else if (cs1->parentFreqIdx != -1 && cs2->parentFreqIdx == -1){ - existMergecsId = cs1->parentFreqIdx; - existmergecs = (CS*)&(freqCSset->items[existMergecsId]); - mergeACStoExistingmergeCS(*cs2,freqId2, existmergecs); - cs2->parentFreqIdx = existMergecsId; - if (isSameLabel) { - // rule S2 - updateLabel(S2, freqCSset, labels, 0, existMergecsId, freqId1, freqId2, name, ontmetadata, ontmetadataCount, NULL, -1); - } else { - // rule S4 - updateLabel(S4, freqCSset, labels, 0, existMergecsId, freqId1, freqId2, BUN_NONE, ontmetadata, ontmetadataCount, NULL, -1); - } - } - else if (cs1->parentFreqIdx != cs2->parentFreqIdx){ - mergecs1 = (CS*)&(freqCSset->items[cs1->parentFreqIdx]); - mergecs2 = (CS*)&(freqCSset->items[cs2->parentFreqIdx]); - - mergeTwomergeCS(mergecs1, mergecs2, cs1->parentFreqIdx); - - //Re-map for all maxCS in mergecs2 - for (k = 0; k < mergecs2->numConsistsOf; k++){ - freqCSset->items[mergecs2->lstConsistsOf[k]].parentFreqIdx = cs1->parentFreqIdx; - } - if (isSameLabel) { - // rule S2 - updateLabel(S2, freqCSset, labels, 0, cs1->parentFreqIdx, freqId1, freqId2, name, ontmetadata, 
ontmetadataCount, NULL, -1); - } else { - // rule S4 - updateLabel(S4, freqCSset, labels, 0, cs1->parentFreqIdx, freqId1, freqId2, BUN_NONE, ontmetadata, ontmetadataCount, NULL, -1); - } - } - } - } - } - - - freePropStat(propStat); - freeTFIDFInfo(tfidfInfos, curNumMergeCS); - -} -#endif //COMBINE_S2_S4 - static void mergeCSByS2(CSset *freqCSset, CSlabel** labels, oid* mergeCSFreqCSMap, int curNumMergeCS, oid *mergecsId,OntoUsageNode *ontoUsageTree, oid **ontmetadata, int ontmetadataCount, BAT *ontmetaBat, OntClass *ontclassSet){ int i, j; @@ -4223,6 +4071,8 @@ void mergeCSByS4(CSset *freqCSset, CSlab PropStat *propStat; /* Store statistics about properties */ TFIDFInfo *tfidfInfos; + char existDiscriminatingProp = 0; + /* int ret; char* schema = "rdf"; @@ -4255,6 +4105,8 @@ void mergeCSByS4(CSset *freqCSset, CSlab #endif if (cs1->parentFreqIdx != -1 && cs1->parentFreqIdx == cs2->parentFreqIdx) continue; //They have already been merged + + existDiscriminatingProp = 0; if(USINGTFIDF == 0){ simscore = similarityScore(cs1->lstProp, cs2->lstProp, @@ -4264,20 +4116,19 @@ void mergeCSByS4(CSset *freqCSset, CSlab } else{ simscore = similarityScoreTFIDF(cs1->lstProp, cs2->lstProp, - cs1->numProp,cs2->numProp,&numCombineP, tfidfInfos, i, j); + cs1->numProp,cs2->numProp,&numCombineP, tfidfInfos, i, j, &existDiscriminatingProp); //printf(" Cosine = %f \n", simscore); } //simscore = 0.0; #if USINGTFIDF - if (simscore > SIM_TFIDF_THRESHOLD){ + if (simscore > SIM_TFIDF_THRESHOLD && existDiscriminatingProp){ #else if (simscore > SIM_THRESHOLD) { #endif - //printf(" Similarity score (%d and %d) cosine = %f \n", freqId1,freqId2,simscore); - /* - if ((*labels)[freqId1].name != BUN_NONE){ + /* + if ((*labels)[freqId1].name != BUN_NONE){ takeOid((*labels)[freqId1].name, &freqCSname1); printf("Merge %d (%s) and ",freqId1, freqCSname1); GDKfree(freqCSname1); diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h --- a/monetdb5/extras/rdf/rdfschema.h +++ 
b/monetdb5/extras/rdf/rdfschema.h @@ -248,6 +248,9 @@ typedef struct SubCSSet{ // Number of references > (Frequency of referredCS / MIN_TO_PERCETAGE_S5) #define MIN_TFIDF_PROP_S5 3 // The prop for FK in S5 must not be a common prop, it should be a discriminating one // This is for preventing the case of webpageID link in dbpedia +#define MIN_TFIDF_PROP_S4 3.5 // When we merge two CS's based on the tf-idf/cosine similarity score, we want + // to make sure that we do not merge two CS's that may have same set of really common properties + // such as type, description. They should have at least one discriminating prop in common. //#define MIN_FROMTABLE_SIZE_S5 1 /* For example data */ #define MINIMUM_TABLE_SIZE 10000 //The minimum number of triples coverred by a table (i.e., a final CS) _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list