Changeset: ab84eb43b2d9 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=ab84eb43b2d9
Modified Files:
    monetdb5/extras/rdf/rdfschema.c
Branch: rdf
Log Message:
implement USE_LABEL_FINDING_MAXCS (but do not enable it) diffs (175 lines): diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -2970,12 +2970,68 @@ void updateParentIdxAll(CSset *freqCSset } } +#if USE_LABEL_FINDING_MAXCS +/* + * * Return 1 if there is semantic evidence against merging the two CS's, this is the case iff the two CS's have a hierarchy and their common ancestor is too generic (support above IMPORTANCE_THRESHOLD). + * */ +static +char isEvidenceAgainstMerging(int freqId1, int freqId2, CSlabel* labels, OntoUsageNode *tree) { + int i, j; + int level; + OntoUsageNode *tmpNode; + + // Get common ancestor + int hCount1 = labels[freqId1].hierarchyCount; + int hCount2 = labels[freqId2].hierarchyCount; + int minCount = (hCount1 > hCount2)?hCount2:hCount1; + + if (minCount == 0) { + // at least one CS does not have a hierarchy --> no semantic information --> no semantic evidence against merging + return 0; + } + + // get level where the hierarchies differ + for (i = 0; i < minCount; i++){ + if (labels[freqId1].hierarchy[hCount1-1-i] != labels[freqId2].hierarchy[hCount2-1-i]) break; + } + + if (i == 0) { + // not even the top level of the hierarchy is the same --> there is semantic evidence against merging the two CS's + return 1; + } else if (i == minCount) { + // same name --> no semantic evidence against merging + return 0; + } + + // get the common ancestor at level i + level = 0; + tmpNode = tree; + while(level < i){ + for (j = 0; j < tmpNode->numChildren; j++) { + if (tmpNode->lstChildren[j]->uri == labels[freqId1].hierarchy[hCount1-1-level]){ + tmpNode = tmpNode->lstChildren[j]; + break; + } + } + level++; + } + + if (tmpNode->percentage >= IMPORTANCE_THRESHOLD) { + // have common ancestor but it is too generic --> there is semantic evidence against merging the two CS's + return 1; + } else { + // common ancestor is specific --> no semantic evidence 
against merging + return 0; + } +} +#endif + /* * Get the maximum frequent CSs from a CSset * Here maximum frequent CS is a CS that there exist no other CS which contains that CS * */ static -void mergeCSbyS3(CSset *freqCSset, CSlabel** labels, oid *mergeCSFreqCSMap, int curNumMergeCS, oid **ontmetadata, int ontmetadataCount){ +void mergeCSbyS3(CSset *freqCSset, CSlabel** labels, oid *mergeCSFreqCSMap, int curNumMergeCS, oid **ontmetadata, int ontmetadataCount, OntoUsageNode *tree){ int numMergeCS = curNumMergeCS; int i, j; @@ -2983,13 +3039,12 @@ void mergeCSbyS3(CSset *freqCSset, CSlab int tmpParentIdx; int freqId1, freqId2; - #if USE_LABEL_FINDING_MAXCS - char isLabelComparable = 0; - #endif - char isDiffLabel = 0; int numP1, numP2; CS *mergecs1, *mergecs2; - (void) labels; + +#if !USE_LABEL_FINDING_MAXCS + (void) tree; +#endif printf("Retrieving maximum frequent CSs: \n"); @@ -3000,44 +3055,35 @@ void mergeCSbyS3(CSset *freqCSset, CSlab if (freqCSset->items[freqId1].type == DIMENSIONCS) continue; #endif - #if USE_LABEL_FINDING_MAXCS - isLabelComparable = 0; - if ((*labels)[i].name != BUN_NONE) isLabelComparable = 1; // no "DUMMY" - #endif - for (j = (i+1); j < numMergeCS; j++){ freqId2 = mergeCSFreqCSMap[j]; #if NOT_MERGE_DIMENSIONCS if (freqCSset->items[freqId2].type == DIMENSIONCS) continue; #endif - isDiffLabel = 0; - #if USE_LABEL_FINDING_MAXCS - if (isLabelComparable == 0 || strcmp((*labels)[freqId1].name, (*labels)[freqId2].name) != 0) { - isDiffLabel = 1; - } - #endif - - if (isDiffLabel == 0){ - numP2 = freqCSset->items[freqId2].numProp; - numP1 = freqCSset->items[freqId1].numProp; - if (numP2 > numP1 && (numP2-numP1)< MAX_SUB_SUPER_NUMPROP_DIF){ - if (isSubset(freqCSset->items[freqId2].lstProp, freqCSset->items[freqId1].lstProp, numP2,numP1) == 1) { - /* CSj is a superset of CSi */ - freqCSset->items[freqId1].parentFreqIdx = freqId2; - updateLabel(S3, freqCSset, labels, 0, freqId2, freqId1, freqId2, BUN_NONE, 0, 0, 0, ontmetadata, ontmetadataCount, 
NULL, -1); // name, isType, isOntology, isFK are not used for case CS - break; - } - } - else if (numP2 < numP1 && (numP1-numP2)< MAX_SUB_SUPER_NUMPROP_DIF){ - if (isSubset(freqCSset->items[freqId1].lstProp, freqCSset->items[freqId2].lstProp, - numP1,numP2) == 1) { - /* CSj is a subset of CSi */ - freqCSset->items[freqId2].parentFreqIdx = freqId1; - updateLabel(S3, freqCSset, labels, 0, freqId1, freqId1, freqId2, BUN_NONE, 0, 0, 0, ontmetadata, ontmetadataCount, NULL, -1); // name, isType, isOntology, isFK are not used for case CS - } - - } + numP2 = freqCSset->items[freqId2].numProp; + numP1 = freqCSset->items[freqId1].numProp; + if (numP2 > numP1 && (numP2-numP1)< MAX_SUB_SUPER_NUMPROP_DIF){ + if (isSubset(freqCSset->items[freqId2].lstProp, freqCSset->items[freqId1].lstProp, numP2,numP1) == 1) { + /* CSj is a superset of CSi */ +#if USE_LABEL_FINDING_MAXCS + if (isEvidenceAgainstMerging(freqId1, freqId2, *labels, tree)) continue; +#endif + freqCSset->items[freqId1].parentFreqIdx = freqId2; + updateLabel(S3, freqCSset, labels, 0, freqId2, freqId1, freqId2, BUN_NONE, 0, 0, 0, ontmetadata, ontmetadataCount, NULL, -1); // name, isType, isOntology, isFK are not used for case CS + break; + } + } + else if (numP2 < numP1 && (numP1-numP2)< MAX_SUB_SUPER_NUMPROP_DIF){ + if (isSubset(freqCSset->items[freqId1].lstProp, freqCSset->items[freqId2].lstProp, + numP1,numP2) == 1) { + /* CSj is a subset of CSi */ +#if USE_LABEL_FINDING_MAXCS + if (isEvidenceAgainstMerging(freqId1, freqId2, *labels, tree)) continue; +#endif + freqCSset->items[freqId2].parentFreqIdx = freqId1; + updateLabel(S3, freqCSset, labels, 0, freqId1, freqId1, freqId2, BUN_NONE, 0, 0, 0, ontmetadata, ontmetadataCount, NULL, -1); // name, isType, isOntology, isFK are not used for case CS + } } //Do not need to consider the case that the numProps are the same @@ -8856,12 +8902,12 @@ RDFextractCSwithTypes(int *ret, bat *sba if (0){ /*S3: Merge two CS's having the subset-superset relationship */ - 
mergeCSbyS3(freqCSset, labels, mergeCSFreqCSMap,curNumMergeCS, ontmetadata, ontmetadataCount); + mergeCSbyS3(freqCSset, labels, mergeCSFreqCSMap,curNumMergeCS, ontmetadata, ontmetadataCount, ontoUsageTree); curNumMergeCS = countNumberMergeCS(freqCSset); curT = clock(); - printf("Merging with S4 took %f. (Number of mergeCS: %d | NumconsistOf: %d) \n", ((float)(curT - tmpLastT))/CLOCKS_PER_SEC, curNumMergeCS, countNumberConsistOfCS(freqCSset)); - printf("Number of added CS after S4: %d \n", freqCSset->numCSadded); + printf("Merging with S3 took %f. (Number of mergeCS: %d | NumconsistOf: %d) \n", ((float)(curT - tmpLastT))/CLOCKS_PER_SEC, curNumMergeCS, countNumberConsistOfCS(freqCSset)); + printf("Number of added CS after S3: %d \n", freqCSset->numCSadded); #if STORE_PERFORMANCE_METRIC_INFO computeMetricsQ(freqCSset); _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list