Changeset: 08cf5e383bd5 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=08cf5e383bd5
Modified Files:
	monetdb5/extras/rdf/rdfschema.c
	monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Filter for FK relationships diffs (183 lines): diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -2594,6 +2594,7 @@ void mergeCSbyS4(CSset *freqCSset, CSlab mergecs1 = (CS*)&(freqCSset->items[tmpParentIdx]); mergecs2 = (CS*)&(freqCSset->items[freqId1]); + printf("MaxCS: Merge freqCS %d and freqCS %d \n", tmpParentIdx, freqId1); mergeConsistsOf(mergecs1, mergecs2); } @@ -3125,7 +3126,7 @@ void doMerge(CSset *freqCSset, int ruleN addCStoSet(freqCSset,*mergecs); updateLabel(ruleNum, freqCSset, labels, 1, freqCSset->numCSadded - 1, freqId1, freqId2, name, ontmetadata, ontmetadataCount, NULL, -1); free(mergecs); - + mergecsId[0]++; } else if (cs1->parentFreqIdx == -1 && cs2->parentFreqIdx != -1){ @@ -3216,29 +3217,44 @@ str mergeMaxFreqCSByS1(CSset *freqCSset, tmpCount = 0; for (k = 0; k < labelStat->lstCount[i]; k++){ freqId1 = labelStat->freqIdList[i][k]; - if ((*labels)[freqId1].isOntology == 1) break; + if ((*labels)[freqId1].isOntology == 1) { + cs1 = &(freqCSset->items[freqId1]); + #if NOT_MERGE_DIMENSIONCS + if (cs1->type == DIMENSIONCS) continue; + #endif + tmpCount++; + break; + } } - cs1 = &(freqCSset->items[freqId1]); for (j = k+1; j < labelStat->lstCount[i]; j++){ freqId2 = labelStat->freqIdList[i][j]; cs2 = &(freqCSset->items[freqId2]); #if NOT_MERGE_DIMENSIONCS - if (cs2->type == DIMENSIONCS) continue; + if (cs2->type == DIMENSIONCS) + continue; #endif if ((*labels)[freqId2].isOntology == 1){ + printf("Merge FreqCS %d and FreqCS %d by Ontology name \n", freqId1, freqId2); doMerge(freqCSset, S1, cs1, cs2, freqId1, freqId2, mergecsId, labels, ontmetadata, ontmetadataCount, *name); + printf("Number of added cs in freqCS: %d \n", freqCSset->numCSadded); tmpCount++; } } - fprintf(fout, " %d names are same by Ontology. MergedCS has %d prop. 
\n", tmpCount, freqCSset->items[freqCSset->numCSadded -1].numProp); + fprintf(fout, " %d freqCS merged as having same name by Ontology. MergedCS has %d prop. \n", tmpCount, freqCSset->items[freqCSset->numCSadded -1].numProp); //For Type tmpCount = 0; for (k = 0; k < labelStat->lstCount[i]; k++){ freqId1 = labelStat->freqIdList[i][k]; - if ((*labels)[freqId1].isType == 1) break; + if ((*labels)[freqId1].isType == 1) { + cs1 = &(freqCSset->items[freqId1]); + #if NOT_MERGE_DIMENSIONCS + if (cs1->type == DIMENSIONCS) continue; + #endif + tmpCount++; + break; + } } - cs1 = &(freqCSset->items[freqId1]); for (j = k+1; j < labelStat->lstCount[i]; j++){ freqId2 = labelStat->freqIdList[i][j]; cs2 = &(freqCSset->items[freqId2]); @@ -3246,19 +3262,27 @@ str mergeMaxFreqCSByS1(CSset *freqCSset, if (cs2->type == DIMENSIONCS) continue; #endif if ((*labels)[freqId2].isType == 1){ + printf("Merge FreqCS %d and FreqCS %d by Type name \n", freqId1, freqId2); doMerge(freqCSset, S1, cs1, cs2, freqId1, freqId2, mergecsId, labels, ontmetadata, ontmetadataCount, *name); + printf("Number of added cs in freqCS: %d \n", freqCSset->numCSadded); tmpCount++; } } - fprintf(fout, " %d names are same by TYPE. MergedCS has %d prop. \n", tmpCount, freqCSset->items[freqCSset->numCSadded -1].numProp); + fprintf(fout, " %d freqCS merged as having same name by TYPE. MergedCS has %d prop. 
\n", tmpCount, freqCSset->items[freqCSset->numCSadded -1].numProp); //For FK tmpCount = 0; for (k = 0; k < labelStat->lstCount[i]; k++){ freqId1 = labelStat->freqIdList[i][k]; - if ((*labels)[freqId1].isFK == 1) break; + if ((*labels)[freqId1].isFK == 1) { + cs1 = &(freqCSset->items[freqId1]); + #if NOT_MERGE_DIMENSIONCS + if (cs1->type == DIMENSIONCS) continue; + #endif + tmpCount++; + break; + } } - cs1 = &(freqCSset->items[freqId1]); for (j = k+1; j < labelStat->lstCount[i]; j++){ freqId2 = labelStat->freqIdList[i][j]; cs2 = &(freqCSset->items[freqId2]); @@ -3266,13 +3290,14 @@ str mergeMaxFreqCSByS1(CSset *freqCSset, if (cs2->type == DIMENSIONCS) continue; #endif if ((*labels)[freqId2].isFK == 1){ + printf("Merge FreqCS %d and FreqCS %d by FK name \n", freqId1, freqId2); doMerge(freqCSset, S1, cs1, cs2, freqId1, freqId2, mergecsId, labels, ontmetadata, ontmetadataCount, *name); + printf("Number of added cs in freqCS: %d \n", freqCSset->numCSadded); tmpCount++; } } #endif /* USE_MULTIWAY_MERGING */ - fprintf(fout, " %d names are same by FK. MergedCS has %d prop. \n", tmpCount, freqCSset->items[freqCSset->numCSadded -1].numProp); - + fprintf(fout, " %d freqCS merged as having same name by FK. MergedCS has %d prop. 
\n", tmpCount, freqCSset->items[freqCSset->numCSadded -1].numProp); #if OUTPUT_FREQID_PER_LABEL @@ -3577,7 +3602,8 @@ void mergeCSByS3S5(CSset *freqCSset, CSl if (simscore > SIM_TFIDF_THRESHOLD){ #else if (simscore > SIM_THRESHOLD) { - #endif + #endif + //printf("S3S5: merge freqCS %d and freqCS %d \n", freqId1, freqId2); //Check whether these CS's belong to any mergeCS if (cs1->parentFreqIdx == -1 && cs2->parentFreqIdx == -1){ /* New merge */ mergecs = mergeTwoCSs(*cs1,*cs2, freqId1,freqId2, *mergecsId); @@ -5087,7 +5113,7 @@ CSrel* getFKBetweenTableSet(CSrel *csrel refinedCsRel = initCSrelset(numTables); for (i = 0; i < numRel; ++i) { - if (csrelFreqSet[i].numRef == 0 || freqCSset->items[i].coverage > MINIMUM_TABLE_SIZE) continue; // ignore CS without relations + if (csrelFreqSet[i].numRef == 0) continue; // ignore CS without relations assert(freqCSset->items[i].parentFreqIdx == -1); rel = csrelFreqSet[i]; from = mfreqIdxTblIdxMapping[i]; @@ -5112,7 +5138,15 @@ CSrel* getFKBetweenTableSet(CSrel *csrel } assert(propIdx < freqCSset->items[i].numProp); - if (csPropTypes[from].lstPropTypes[propIdx].propCover * MIN_FK_PROPCOVERAGE > rel.lstCnt[j]) continue; + + //Filtering: For big size table, if large number of prop's instances need to refer to a certain table + // else, all instances of that prop must refer to the certain table + if (freqCSset->items[i].coverage > MINIMUM_TABLE_SIZE){ + if (csPropTypes[from].lstPropTypes[propIdx].propCover * MIN_FK_PROPCOVERAGE > rel.lstCnt[j]) continue; + } + else{ + if (csPropTypes[from].lstPropTypes[propIdx].propCover != rel.lstCnt[j]) continue; + } assert(to < numTables); addReltoCSRelWithFreq(from, to, rel.lstPropId[j], rel.lstCnt[j], rel.lstBlankCnt[j], &refinedCsRel[from]); @@ -5479,6 +5513,7 @@ RDFextractCSwithTypes(int *ret, bat *sba curT = clock(); printf("Merging with S1 took %f. 
(Number of mergeCS: %d | NumconsistOf: %d) \n", ((float)(curT - tmpLastT))/CLOCKS_PER_SEC, curNumMergeCS, countNumberConsistOfCS(freqCSset)); + printf("Number of added CS after S1: %d \n", freqCSset->numCSadded); tmpLastT = curT; /* ---------- S4 ------- */ diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h --- a/monetdb5/extras/rdf/rdfschema.h +++ b/monetdb5/extras/rdf/rdfschema.h @@ -129,7 +129,7 @@ typedef struct PropStat { IR_DIMENSION_THRESHOLD_PERCENTAGE * totalFrequency Number of IR references should be several times larger than the CS frequency */ -#define NOT_MERGE_DIMENSIONCS 1 +#define NOT_MERGE_DIMENSIONCS 1 /* Default: 1, 0: Is for example data */ #define FILTER_INFREQ_FK_FOR_IR 1 /* We filter out all the dirty references from a CS */ #define FILTER_THRESHOLD_FK_FOR_IR 0.1 /* The FK that their frequency < FILTER_THRESHOLD_FK_FOR_IR * FreqCS's frequency */ @@ -204,6 +204,7 @@ typedef struct SubCSSet{ #define MIN_PERCETAGE_S6 5 // Merge all CS refered by more than 1/MIN_PERCETAGE_S6 percent of a CS via one property #define MIN_FROMTABLE_SIZE_S6 100 // The minimum size of the "from" table in S6. Meaning that // the CS's to-be-merged in this rule must cover > MIN_FROMTABLE_SIZE_S6 / MIN_PERCETAGE_S6 triples +//#define MIN_FROMTABLE_SIZE_S6 1 /* For example data */ #define MINIMUM_TABLE_SIZE 10000 //The minimum number of triples coverred by a table (i.e., a final CS) #define SAMPLE_FILTER_THRESHOLD 1 // SAMPLE_FILTER_THRESHOLD/ 100 #define HIGH_REFER_THRESHOLD 5 _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list