Changeset: f6245c097bcc for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=f6245c097bcc Modified Files: monetdb5/extras/rdf/rdfschema.c monetdb5/extras/rdf/rdfschema.h Branch: rdf Log Message:
Improve the computation of S3S5 by pre-computing TFIDFs score for all CSs diffs (196 lines): diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -2514,43 +2514,13 @@ float similarityScore(oid* arr1, oid* ar /*Using cosine similarity score with vector of tf-idfs for properties in each CS */ static -float similarityScoreTFIDF(oid* arr1, oid* arr2, int m, int n, int *numCombineP, PropStat* propStat){ +float similarityScoreTFIDF(oid* arr1, oid* arr2, int m, int n, int *numCombineP, + TFIDFInfo *tfidfInfos, int mergeCSId1, int mergeCSId2){ int i = 0, j = 0; int numOverlap = 0; - float sumX2 = 0.0; - float sumY2 = 0.0; float sumXY = 0.0; - BUN bun; - BUN p; - float tfidfV; - - for (i = 0; i < m; i++){ - p = arr1[i]; - bun = BUNfnd(BATmirror(propStat->pBat),(ptr) &p); - if (bun == BUN_NONE) { - printf("This prop must be there!!!!\n"); - return 0.0; - } - else{ - tfidfV = propStat->tfidfs[bun]; - sumX2 += tfidfV*tfidfV; - } - } - - for (i = 0; i < n; i++){ - p = arr2[i]; - bun = BUNfnd(BATmirror(propStat->pBat),(ptr) &p); - if (bun == BUN_NONE) { - printf("This prop must be there!!!!\n"); - return 0.0; - } - else{ - tfidfV = propStat->tfidfs[bun]; - sumY2 += tfidfV*tfidfV; - } - } - + i = 0; j = 0; while( i < n && j < m ) @@ -2561,19 +2531,7 @@ float similarityScoreTFIDF(oid* arr1, oi } else if( arr1[j] == arr2[i] ) { - p = arr1[j]; - bun = BUNfnd(BATmirror(propStat->pBat),(ptr) &p); - - if (bun == BUN_NONE) { - printf("This prop must be there!!!!\n"); - return 0.0; - } - else{ - tfidfV = propStat->tfidfs[bun]; - // We can do this because the tfidfs of a property in any CS - // are the same - sumXY += tfidfV*tfidfV; - } + sumXY += tfidfInfos[mergeCSId1].lsttfidfs[j] * tfidfInfos[mergeCSId1].lsttfidfs[j]; j++; i++; numOverlap++; @@ -2586,7 +2544,7 @@ float similarityScoreTFIDF(oid* arr1, oi *numCombineP = m + n - numOverlap; - return ((float) sumXY / (sqrt(sumX2)*sqrt(sumY2))); + return ((float) sumXY / (tfidfInfos[mergeCSId1].totalTFIDF * tfidfInfos[mergeCSId2].totalTFIDF)); } /* @@ -3655,6 +3613,39 @@ char isSemanticSimilar(int freqId1, int } static +void initTFIDFInfos(TFIDFInfo *tfidfInfos, int curNumMergeCS, oid* mergeCSFreqCSMap, CSset *freqCSset, PropStat *propStat){ + int i, j; + int freqId; + CS *cs; + oid p; + float tfidfV; + float sum; + BUN bun = BUN_NONE; + for (i = 0; i < curNumMergeCS; i++){ + freqId = mergeCSFreqCSMap[i]; + cs = (CS*) &(freqCSset->items[freqId]); + tfidfInfos[i].freqId = freqId; + tfidfInfos[i].lsttfidfs = (float*)malloc(sizeof(float) * (cs->numProp)); + sum = 0.0; + for (j = 0; j < cs->numProp; j++){ + p = cs->lstProp[j]; + bun = BUNfnd(BATmirror(propStat->pBat),(ptr) &p); + if (bun == BUN_NONE) { + printf("This prop must be there!!!!\n"); + } + else{ + tfidfV = propStat->tfidfs[bun]; + sum += tfidfV*tfidfV; + } + tfidfInfos[i].lsttfidfs[j] = tfidfV; + + } + assert(sum > 0); + tfidfInfos[i].totalTFIDF = sqrt(sum); + } + +} +static void mergeCSByS3S5(CSset *freqCSset, CSlabel** labels, oid* mergeCSFreqCSMap, int curNumMergeCS, oid *mergecsId,OntoUsageNode *ontoUsageTree, oid **ontmetadata, int ontmetadataCount){ int i, j, k; int freqId1, freqId2; @@ -3669,7 +3660,7 @@ void mergeCSByS3S5(CSset *freqCSset, CSl char isLabelComparable = 0; char isSameLabel = 0; oid name; /* Name of the common ancestor */ - + TFIDFInfo *tfidfInfos; @@ -3679,6 +3670,9 @@ void mergeCSByS3S5(CSset *freqCSset, CSl propStat = initPropStat(); getPropStatisticsFromMergeCSs(propStat, curNumMergeCS, mergeCSFreqCSMap, freqCSset); /*TODO: Get PropStat from MaxCSs or From mergedCS only*/ + tfidfInfos = (TFIDFInfo*)malloc(sizeof(TFIDFInfo) * curNumMergeCS); + initTFIDFInfos(tfidfInfos, curNumMergeCS, mergeCSFreqCSMap, freqCSset, propStat); + for (i = 0; i < curNumMergeCS; i++){ freqId1 = mergeCSFreqCSMap[i]; @@ -3716,7 +3710,7 @@ void mergeCSByS3S5(CSset *freqCSset, CSl } else{ simscore = similarityScoreTFIDF(cs1->lstProp, cs2->lstProp, - cs1->numProp,cs2->numProp,&numCombineP, propStat); + cs1->numProp,cs2->numProp,&numCombineP, tfidfInfos, i, j); //printf(" Cosine = %f \n", simscore); } @@ -3963,7 +3957,7 @@ static void getStatisticFinalCSs(CSset * printf("\nTotal " BUNFMT " triples, coverred by %d final CSs: %d (%f percent) \n", BATcount(sbat), numTables, totalCoverage, 100 * ((float)totalCoverage/BATcount(sbat))); printf("Max number of triples coverred by one final CS: %d \n", maxNumtriple); printf("Min number of triples coverred by one final CS: %d \n", minNumtriple); - printf("Avg number of triples coverred by one final CS: %f \n", (float)(totalCoverage/numMergeCS)); + if (numMergeCS != 0) printf("Avg number of triples coverred by one final CS: %f \n", (float)(totalCoverage/numMergeCS)); //Check if remove all non-frequent Prop maxNumtriple = 0; diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h --- a/monetdb5/extras/rdf/rdfschema.h +++ b/monetdb5/extras/rdf/rdfschema.h @@ -85,7 +85,7 @@ typedef struct PropStat { #define INIT_PROP_NUM 10 #define INIT_CS_PER_PROP 10 -#define USINGTFIDF 0 +#define USINGTFIDF 1 #define STOREFULLCS 1 /* Store full instance of a CS including the a subject and list of predicates, objects. Only use this for finding the name of the table corresponding to that CS */ @@ -125,10 +125,12 @@ typedef struct PropStat { /* ---- For detecting dimension table */ #define NUM_ITERATION_FOR_IR 3 /* Number of iteration for indirect referrences to a CS (table) */ -#define IR_DIMENSION_THRESHOLD_PERCENTAGE 0.02 /* Score of indirect references that the CS can be considered as a dimension CS +#define IR_DIMENSION_THRESHOLD_PERCENTAGE 0.2 /* Score of indirect references that the CS can be considered as a dimension CS IR_DIMENSION_THRESHOLD_PERCENTAGE * totalFrequency Number of IR references should be several times larger than the CS frequency */ +//#define IR_DIMENSION_THRESHOLD_PERCENTAGE 0.02 //Value 0.2 is for example data only + #define NOT_MERGE_DIMENSIONCS 1 /* Default: 1, 0: Is for example data */ #define FILTER_INFREQ_FK_FOR_IR 1 /* We filter out all the dirty references from a CS */ @@ -162,6 +164,12 @@ typedef struct CS int numConsistsOf; } CS; +typedef struct TFIDFInfo { + int freqId; + float* lsttfidfs; //TFIDF score of each prop in a CS + float totalTFIDF; // sqrt of (Sum = Total tfidfV*tfidfV of all props in that CS) +} TFIDFInfo; + typedef struct SubCS { //oid csId; _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list