Changeset: 8b1a1fc8fcb1 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=8b1a1fc8fcb1 Modified Files: monetdb5/extras/rdf/rdfschema.c monetdb5/extras/rdf/rdfschema.h Branch: rdf Log Message:
Separate rules S2 and S4 diffs (truncated from 309 to 300 lines): diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -2732,7 +2732,7 @@ void updateParentIdxAll(CSset *freqCSset * Here maximum frequent CS is a CS that there exist no other CS which contains that CS * */ static -void mergeCSbyS4(CSset *freqCSset, CSlabel** labels, oid *mergeCSFreqCSMap, int curNumMergeCS, oid **ontmetadata, int ontmetadataCount){ +void mergeCSbyS3(CSset *freqCSset, CSlabel** labels, oid *mergeCSFreqCSMap, int curNumMergeCS, oid **ontmetadata, int ontmetadataCount){ int numMergeCS = curNumMergeCS; int i, j; @@ -3177,7 +3177,7 @@ void generatecsRelSum(CSrel csRel, int f for (i = 0; i < csRel.numRef; i++){ freq = freqCSset->items[csRel.origFreqIdx].support; - if (freq > MIN_FROMTABLE_SIZE_S6 && freq < csRel.lstCnt[i] * MIN_PERCETAGE_S6){ + if (freq > MIN_FROMTABLE_SIZE_S5 && freq < csRel.lstCnt[i] * MIN_PERCETAGE_S5){ propIdx = 0; while (csRelSum->lstPropId[propIdx] != csRel.lstPropId[i]) propIdx++; @@ -3605,7 +3605,7 @@ str mergeMaxFreqCSByS1(CSset *freqCSset, } static -void mergeMaxFreqCSByS6(CSrel *csrelMergeFreqSet, CSset *freqCSset, CSlabel** labels, oid* mergeCSFreqCSMap, int curNumMergeCS, oid *mergecsId, oid** ontmetadata, int ontmetadataCount){ +void mergeMaxFreqCSByS5(CSrel *csrelMergeFreqSet, CSset *freqCSset, CSlabel** labels, oid* mergeCSFreqCSMap, int curNumMergeCS, oid *mergecsId, oid** ontmetadata, int ontmetadataCount){ int i; int freqId; //int relId; @@ -3632,7 +3632,7 @@ void mergeMaxFreqCSByS6(CSrel *csrelMerg //int numCombinedP = 0; int startIdx = 0; - printf("Start merging CS by using S6 \n"); + printf("Start merging CS by using S5[From FK] \n"); #if NO_OUTPUTFILE == 0 strcpy(filename, "csRelSum.txt"); @@ -3885,8 +3885,9 @@ void freeTFIDFInfo(TFIDFInfo *tfidfInfos free(tfidfInfos); } +#if COMBINE_S2_S4 static -void mergeCSByS3S5(CSset *freqCSset, CSlabel** 
labels, oid* mergeCSFreqCSMap, int curNumMergeCS, oid *mergecsId,OntoUsageNode *ontoUsageTree, oid **ontmetadata, int ontmetadataCount){ +void mergeCSByS2S4(CSset *freqCSset, CSlabel** labels, oid* mergeCSFreqCSMap, int curNumMergeCS, oid *mergecsId,OntoUsageNode *ontoUsageTree, oid **ontmetadata, int ontmetadataCount){ int i, j, k; int freqId1, freqId2; float simscore = 0.0; @@ -4038,8 +4039,114 @@ void mergeCSByS3S5(CSset *freqCSset, CSl freeTFIDFInfo(tfidfInfos, curNumMergeCS); } - - +#endif //COMBINE_S2_S4 + +static +void mergeCSByS2(CSset *freqCSset, CSlabel** labels, oid* mergeCSFreqCSMap, int curNumMergeCS, oid *mergecsId,OntoUsageNode *ontoUsageTree, oid **ontmetadata, int ontmetadataCount){ + int i, j; + int freqId1, freqId2; + + char isLabelComparable = 0; + oid name; /* Name of the common ancestor */ + + + + (void) labels; + (void) isLabelComparable; + + + + for (i = 0; i < curNumMergeCS; i++){ + freqId1 = mergeCSFreqCSMap[i]; + + isLabelComparable = 0; + if ((*labels)[freqId1].name != BUN_NONE) isLabelComparable = 1; // no "DUMMY" + + #if NOT_MERGE_DIMENSIONCS + if (freqCSset->items[freqId1].type == DIMENSIONCS) continue; + #endif + for (j = (i+1); j < curNumMergeCS; j++){ + freqId2 = mergeCSFreqCSMap[j]; + #if NOT_MERGE_DIMENSIONCS + if (freqCSset->items[freqId2].type == DIMENSIONCS) continue; + #endif + + if (isLabelComparable == 1 && isSemanticSimilar(freqId1, freqId2, (*labels), ontoUsageTree,freqCSset->numOrigFreqCS, &name) == 1){ + //printf("Same labels between freqCS %d and freqCS %d - Old simscore is %f \n", freqId1, freqId2, simscore); + doMerge(freqCSset, S2, freqId1, freqId2, mergecsId, labels, ontmetadata, ontmetadataCount, name); + } + + } + } + +} + +static +void mergeCSByS4(CSset *freqCSset, CSlabel** labels, oid* mergeCSFreqCSMap, int curNumMergeCS, oid *mergecsId,oid **ontmetadata, int ontmetadataCount){ + int i, j; + int freqId1, freqId2; + float simscore = 0.0; + CS *cs1, *cs2; + int numCombineP = 0; + + PropStat *propStat; /* Store 
statistics about properties */ + TFIDFInfo *tfidfInfos; + + + + (void) labels; + + propStat = initPropStat(); + getPropStatisticsFromMergeCSs(propStat, curNumMergeCS, mergeCSFreqCSMap, freqCSset); /*TODO: Get PropStat from MaxCSs or From mergedCS only*/ + tfidfInfos = (TFIDFInfo*)malloc(sizeof(TFIDFInfo) * curNumMergeCS); + initTFIDFInfos(tfidfInfos, curNumMergeCS, mergeCSFreqCSMap, freqCSset, propStat); + + + for (i = 0; i < curNumMergeCS; i++){ + freqId1 = mergeCSFreqCSMap[i]; + //printf("Label of %d CS is %s \n", freqId1, (*labels)[freqId1].name); + + #if NOT_MERGE_DIMENSIONCS + if (freqCSset->items[freqId1].type == DIMENSIONCS) continue; + #endif + for (j = (i+1); j < curNumMergeCS; j++){ + cs1 = (CS*) &(freqCSset->items[freqId1]); + + freqId2 = mergeCSFreqCSMap[j]; + cs2 = (CS*) &(freqCSset->items[freqId2]); + #if NOT_MERGE_DIMENSIONCS + if (cs2->type == DIMENSIONCS) continue; + #endif + + if(USINGTFIDF == 0){ + simscore = similarityScore(cs1->lstProp, cs2->lstProp, + cs1->numProp,cs2->numProp,&numCombineP); + + //printf("simscore Jaccard = %f \n", simscore); + } + else{ + simscore = similarityScoreTFIDF(cs1->lstProp, cs2->lstProp, + cs1->numProp,cs2->numProp,&numCombineP, tfidfInfos, i, j); + //printf(" Cosine = %f \n", simscore); + + } + + //simscore = 0.0; + #if USINGTFIDF + if (simscore > SIM_TFIDF_THRESHOLD){ + #else + if (simscore > SIM_THRESHOLD) { + #endif + doMerge(freqCSset, S4, freqId1, freqId2, mergecsId, labels, ontmetadata, ontmetadataCount, BUN_NONE); + } + } + } + + + freePropStat(propStat); + freeTFIDFInfo(tfidfInfos, curNumMergeCS); + +} static void putPtoHash(map_t pmap, int key, oid *poid, int support){ oid *getPoid; oid *putPoid; @@ -7055,7 +7162,7 @@ RDFextractCSwithTypes(int *ret, bat *sba printf("Before using rules: Number of freqCS is: %d \n",curNumMergeCS); - /* ---------- S1, S2 ------- */ + /* ---------- S1 ------- */ mergecsId = *maxCSoid + 1; mergeMaxFreqCSByS1(freqCSset, labels, &mergecsId, ontmetadata, ontmetadataCount); /*S1: 
Merge all freqCS's sharing top-3 candidates */ @@ -7075,13 +7182,13 @@ RDFextractCSwithTypes(int *ret, bat *sba #endif tmpLastT = curT; - /* ---------- S4 ------- */ + /* ---------- S3 ------- */ mergeCSFreqCSMap = (oid*) malloc(sizeof(oid) * curNumMergeCS); initMergeCSFreqCSMap(freqCSset, mergeCSFreqCSMap); if (0){ - /*S4: Merge two CS's having the subset-superset relationship */ - mergeCSbyS4(freqCSset, labels, mergeCSFreqCSMap,curNumMergeCS, ontmetadata, ontmetadataCount); + /*S3: Merge two CS's having the subset-superset relationship */ + mergeCSbyS3(freqCSset, labels, mergeCSFreqCSMap,curNumMergeCS, ontmetadata, ontmetadataCount); curNumMergeCS = countNumberMergeCS(freqCSset); curT = clock(); @@ -7094,26 +7201,26 @@ RDFextractCSwithTypes(int *ret, bat *sba tmpLastT = curT; } - /* ---------- S6 ------- */ + /* ---------- S5 ------- */ free(mergeCSFreqCSMap); mergeCSFreqCSMap = (oid*) malloc(sizeof(oid) * curNumMergeCS); initMergeCSFreqCSMap(freqCSset, mergeCSFreqCSMap); - /* S6: Merged CS referred from the same CS via the same property */ + /* S5: Merged CS referred from the same CS via the same property */ if (0){ tmpCSrelToMergeCS = generateCsRelToMergeFreqSet(csrelSet, freqCSset); tmpNumRel = freqCSset->numCSadded; - mergeMaxFreqCSByS6(tmpCSrelToMergeCS, freqCSset, labels, mergeCSFreqCSMap, curNumMergeCS, &mergecsId, ontmetadata, ontmetadataCount); - //printf("DISABLE S6 (For Testing) \n"); + mergeMaxFreqCSByS5(tmpCSrelToMergeCS, freqCSset, labels, mergeCSFreqCSMap, curNumMergeCS, &mergecsId, ontmetadata, ontmetadataCount); + //printf("DISABLE S5 (For Testing) \n"); freeCSrelSet(tmpCSrelToMergeCS,tmpNumRel); } curNumMergeCS = countNumberMergeCS(freqCSset); curT = clock(); - printf("Merging with S6 took %f. (Number of mergeCS: %d | NumconsistOf: %d) \n", ((float)(curT - tmpLastT))/CLOCKS_PER_SEC, curNumMergeCS, countNumberConsistOfCS(freqCSset)); + printf("Merging with S5 took %f. 
(Number of mergeCS: %d | NumconsistOf: %d) \n", ((float)(curT - tmpLastT))/CLOCKS_PER_SEC, curNumMergeCS, countNumberConsistOfCS(freqCSset)); #if STORE_PERFORMANCE_METRIC_INFO computeMetricsQ(freqCSset); @@ -7121,17 +7228,40 @@ RDFextractCSwithTypes(int *ret, bat *sba tmpLastT = curT; - /* S3, S5 */ + //S2: Common ancestor free(mergeCSFreqCSMap); mergeCSFreqCSMap = (oid*) malloc(sizeof(oid) * curNumMergeCS); initMergeCSFreqCSMap(freqCSset, mergeCSFreqCSMap); - mergeCSByS3S5(freqCSset, labels, mergeCSFreqCSMap, curNumMergeCS, &mergecsId, ontoUsageTree, ontmetadata, ontmetadataCount); - free(mergeCSFreqCSMap); + mergeCSByS2(freqCSset, labels, mergeCSFreqCSMap, curNumMergeCS, &mergecsId, ontoUsageTree, ontmetadata, ontmetadataCount); curNumMergeCS = countNumberMergeCS(freqCSset); curT = clock(); - printf ("Merging with S3, S5 took %f. (Number of mergeCS: %d) \n",((float)(curT - tmpLastT))/CLOCKS_PER_SEC, curNumMergeCS); + printf ("Merging with S2 took %f. (Number of mergeCS: %d) \n",((float)(curT - tmpLastT))/CLOCKS_PER_SEC, curNumMergeCS); + + #if NO_OUTPUTFILE == 0 + printMergedFreqCSSet(freqCSset, mbat, 1, *freqThreshold, *labels, 4); + #endif + + #if STORE_PERFORMANCE_METRIC_INFO + computeMetricsQ(freqCSset); + #endif + + tmpLastT = curT; + + + //S4: TF/IDF similarity + free(mergeCSFreqCSMap); + mergeCSFreqCSMap = (oid*) malloc(sizeof(oid) * curNumMergeCS); + initMergeCSFreqCSMap(freqCSset, mergeCSFreqCSMap); + + mergeCSByS4(freqCSset, labels, mergeCSFreqCSMap, curNumMergeCS, &mergecsId, ontmetadata, ontmetadataCount); + free(mergeCSFreqCSMap); + + curNumMergeCS = countNumberMergeCS(freqCSset); + curT = clock(); + printf ("Merging with S4 took %f. 
(Number of mergeCS: %d) \n",((float)(curT - tmpLastT))/CLOCKS_PER_SEC, curNumMergeCS); + #if NO_OUTPUTFILE == 0 printMergedFreqCSSet(freqCSset, mbat, 1, *freqThreshold, *labels, 5); #endif @@ -7142,12 +7272,11 @@ RDFextractCSwithTypes(int *ret, bat *sba tmpLastT = curT; + updateParentIdxAll(freqCSset); - //Finally, re-create mergeFreqSet - *csRelMergeFreqSet = generateCsRelBetweenMergeFreqSet(csrelSet, freqCSset); #if NO_OUTPUTFILE == 0 diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h --- a/monetdb5/extras/rdf/rdfschema.h +++ b/monetdb5/extras/rdf/rdfschema.h @@ -93,6 +93,7 @@ typedef struct PropStat { #define INIT_PROP_NUM 10 #define INIT_CS_PER_PROP 10 #define USINGTFIDF 1 +#define COMBINE_S2_S4 0 #define STOREFULLCS 1 /* Store full instance of a CS including the a subject and list of predicates, objects. Only use this for finding the name of the table corresponding to that CS */ @@ -236,10 +237,10 @@ typedef struct SubCSSet{ #define SIM_THRESHOLD 0.6 #define SIM_TFIDF_THRESHOLD 0.55 #define IMPORTANCE_THRESHOLD 0.01 -#define MIN_PERCETAGE_S6 5 // Merge all CS refered by more than 1/MIN_PERCETAGE_S6 percent of a CS via one property _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list