Changeset: 953fd23b3505 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=953fd23b3505 Modified Files: monetdb5/extras/rdf/rdf_shredder.c monetdb5/extras/rdf/rdfschema.c monetdb5/extras/rdf/rdfschema.h Branch: rdf Log Message:
Use labels for finding maxCS and fix problems in using label in merging process. diffs (151 lines): diff --git a/monetdb5/extras/rdf/rdf_shredder.c b/monetdb5/extras/rdf/rdf_shredder.c --- a/monetdb5/extras/rdf/rdf_shredder.c +++ b/monetdb5/extras/rdf/rdf_shredder.c @@ -359,7 +359,7 @@ tripleHandler(void* user_data, const rap } else if (triple->object->type == RAPTOR_TERM_TYPE_LITERAL) { unsigned char* objStr; - ObjectType objType; + ObjectType objType = STRING; objStr = raptor_term_to_string(triple->object); objType = getObjectType(objStr, &realNumValue); diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -1853,19 +1853,31 @@ void getMaximumFreqCSs(CSset *freqCSset, int tmpParentIdx; int* coverage; int* freq; + char isLabelComparable = 0; (void) labels; + (void) isLabelComparable; printf("Retrieving maximum frequent CSs: \n"); for (i = 0; i < numFreqCS; i++){ if (freqCSset->items[i].parentFreqIdx != -1) continue; + isLabelComparable = 0; + if (strcmp(labels[i].name, "DUMMY") != 0) isLabelComparable = 1; + for (j = (i+1); j < numFreqCS; j++){ if (freqCSset->items[j].numProp > freqCSset->items[i].numProp){ if (isSubset(freqCSset->items[j].lstProp, freqCSset->items[i].lstProp, freqCSset->items[j].numProp,freqCSset->items[i].numProp) == 1) { /* CSj is a superset of CSi */ + #if USE_LABEL_FINDING_MAXCS + if (isLabelComparable == 1 && strcmp(labels[i].name, labels[j].name) == 0) { + freqCSset->items[i].parentFreqIdx = j; + break; + } + #else freqCSset->items[i].parentFreqIdx = j; + #endif break; } } @@ -1873,7 +1885,13 @@ void getMaximumFreqCSs(CSset *freqCSset, if (isSubset(freqCSset->items[i].lstProp, freqCSset->items[j].lstProp, freqCSset->items[i].numProp,freqCSset->items[j].numProp) == 1) { /* CSj is a subset of CSi */ + #if USE_LABEL_FINDING_MAXCS + if (isLabelComparable == 1 && strcmp(labels[i].name, labels[j].name) == 0) { + 
freqCSset->items[j].parentFreqIdx = i; + } + #else freqCSset->items[j].parentFreqIdx = i; + #endif } } @@ -2227,8 +2245,11 @@ void mergeMaximumFreqCSsAll(CSset *freqC PropStat *propStat; /* Store statistics about properties */ int nummergedCSs = 0; + char isLabelComparable = 0; + char isSameLabel = 0; (void) labels; + (void) isLabelComparable; for (i = 0; i < freqCSset->numCSadded; i++){ if (freqCSset->items[i].parentFreqIdx == -1){ @@ -2248,30 +2269,38 @@ void mergeMaximumFreqCSsAll(CSset *freqC for (i = 0; i < numMaxCSs; i++){ freqId1 = superCSFreqCSMap[i]; + //printf("Label of %d CS is %s \n", freqId1, labels[freqId1].name); + isLabelComparable = 0; + if (strcmp(labels[freqId1].name,"DUMMY") != 0) isLabelComparable = 1; + cs1 = (CS*) &(freqCSset->items[freqId1]); for (j = (i+1); j < numMaxCSs; j++){ freqId2 = superCSFreqCSMap[j]; cs2 = (CS*) &(freqCSset->items[freqId2]); - - if(USINGTFIDF == 0){ - simscore = similarityScore(cs1->lstProp, cs2->lstProp, - cs1->numProp,cs2->numProp,&numCombineP); - - //printf("simscore Jaccard = %f \n", simscore); - } - else{ - simscore = similarityScoreTFIDF(cs1->lstProp, cs2->lstProp, - cs1->numProp,cs2->numProp,&numCombineP, propStat); - //printf(" Cosine = %f \n", simscore); - - } - + isSameLabel = 0; + #if USE_LABEL_FOR_MERGING - if (strcmp(labels[freqId1].name, labels[freqId2].name) == 0){ - //printf("Same labels between freqCS %d and freqCS %d \n", freqId1, freqId2); + if (isLabelComparable == 1 && strcmp(labels[freqId1].name, labels[freqId2].name) == 0){ + //printf("Same labels between freqCS %d and freqCS %d - Old simscore is %f \n", freqId1, freqId2, simscore); + isSameLabel = 1; simscore = 1; } #endif + + if (isSameLabel == 0){ + if(USINGTFIDF == 0){ + simscore = similarityScore(cs1->lstProp, cs2->lstProp, + cs1->numProp,cs2->numProp,&numCombineP); + + //printf("simscore Jaccard = %f \n", simscore); + } + else{ + simscore = similarityScoreTFIDF(cs1->lstProp, cs2->lstProp, + cs1->numProp,cs2->numProp,&numCombineP, 
propStat); + //printf(" Cosine = %f \n", simscore); + + } + } //simscore = 0.0; #if USINGTFIDF @@ -3163,6 +3192,8 @@ RDFextractCSwithTypes(int *ret, bat *sba // Create label per freqCS csIdFreqIdxMap = (int *) malloc (sizeof(int) * (*maxCSoid + 1)); initcsIdFreqIdxMap(csIdFreqIdxMap, *maxCSoid + 1, -1, freqCSset); + printf("Using ontologies with %d ontattributesCount and %d ontmetadataCount \n",ontattributesCount,ontmetadataCount); + labels = createLabels(freqCSset, csrelSet, *maxCSoid + 1, sbat, si, pi, oi, *subjCSMap, mbat, csIdFreqIdxMap, *freqThreshold, ontattributes, ontattributesCount, ontmetadata, ontmetadataCount); diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h --- a/monetdb5/extras/rdf/rdfschema.h +++ b/monetdb5/extras/rdf/rdfschema.h @@ -88,7 +88,9 @@ typedef struct PropStat { #define FULL_PROP_STAT 1 // Only use for showing the statistic on all properties / all CSs. (Default should be 0) -#define USE_LABEL_FOR_MERGING 1 // Use the labels received from labeling process for finding maxCS and mergeCS + +#define USE_LABEL_FINDING_MAXCS 1 // Use the labels received from labeling process for finding maxCS +#define USE_LABEL_FOR_MERGING 1 // Use the labels received from labeling process for finding mergeCS typedef struct CS { _______________________________________________ checkin-list mailing list checkin-list@monetdb.org http://mail.monetdb.org/mailman/listinfo/checkin-list