Changeset: 21c27a0ff296 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=21c27a0ff296 Modified Files: monetdb5/extras/rdf/rdf_shredder.c monetdb5/extras/rdf/rdfontologyload.c monetdb5/extras/rdf/rdfschema.c monetdb5/extras/rdf/rdfschema.h Branch: rdf Log Message:
Merge two maximumCSs based on the computed similarity score between them. diffs (truncated from 441 to 300 lines): diff --git a/monetdb5/extras/rdf/rdf_shredder.c b/monetdb5/extras/rdf/rdf_shredder.c --- a/monetdb5/extras/rdf/rdf_shredder.c +++ b/monetdb5/extras/rdf/rdf_shredder.c @@ -223,7 +223,7 @@ tripleHandler(void* user_data, const rap BUN bun = BUN_NONE; BAT **graph = pdata->graph; - printf("%s %s %s\n",raptor_term_to_string(triple->subject),raptor_term_to_string(triple->predicate),raptor_term_to_string(triple->object)); + //printf("%s %s %s\n",raptor_term_to_string(triple->subject),raptor_term_to_string(triple->predicate),raptor_term_to_string(triple->object)); if (pdata->error > pdata->lasterror){ unsigned char* objStr; int objLen; diff --git a/monetdb5/extras/rdf/rdfontologyload.c b/monetdb5/extras/rdf/rdfontologyload.c --- a/monetdb5/extras/rdf/rdfontologyload.c +++ b/monetdb5/extras/rdf/rdfontologyload.c @@ -126,7 +126,9 @@ tripleHandler(void* user_data, const rap parserData *pdata = ((parserData *) user_data); //BUN bun = BUN_NONE; //BAT **graph = pdata->graph; + printf("%s %s %s\n",raptor_term_to_string(triple->subject),raptor_term_to_string(triple->predicate),raptor_term_to_string(triple->object)); + pdata->tcount++; return; } diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -125,6 +125,25 @@ void addCStoSet(CSset *csSet, CS item) } static +void addmergeCStoSet(mergeCSset *mergecsSet, mergeCS item) +{ + void *_tmp; + if(mergecsSet->nummergeCSadded == mergecsSet->numAllocation) + { + mergecsSet->numAllocation += INIT_NUM_CS; + + _tmp = realloc(mergecsSet->items, (mergecsSet->numAllocation * sizeof(CS))); + + if (!_tmp){ + fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); + } + mergecsSet->items = (mergeCS*)_tmp; + } + mergecsSet->items[mergecsSet->nummergeCSadded] = item; + mergecsSet->nummergeCSadded++; +} + +static CSrel* creataCSrel(oid 
csoid){ CSrel *csrel = (CSrel*) malloc(sizeof(CSrel)); csrel->origCSoid = csoid; @@ -719,6 +738,16 @@ void freeCSset(CSset *csSet){ free(csSet); } +static +void freemergeCSset(mergeCSset *csSet){ + int i; + for(i = 0; i < csSet->nummergeCSadded; i ++){ + free(csSet->items[i].lstProp); + } + free(csSet->items); + free(csSet); +} + static CSset* initCSset(void){ CSset *csSet = (CSset*) malloc(sizeof(CSset)); @@ -729,6 +758,16 @@ CSset* initCSset(void){ return csSet; } +static +mergeCSset* initmergeCSset(void){ + mergeCSset *mergecsSet = (mergeCSset*) malloc(sizeof(mergeCSset)); + mergecsSet->items = (mergeCS*) malloc(sizeof(mergeCS) * INIT_NUM_CS); + mergecsSet->numAllocation = INIT_NUM_CS; + mergecsSet->nummergeCSadded = 0; + + return mergecsSet; +} + /* static void freeCS(CS *cs){ @@ -770,6 +809,95 @@ CS* creatCS(oid csId, int numP, oid* buf return cs; } +static +void mergeOidSets(oid* arr1, oid* arr2, oid* mergeArr, int m, int n, int numCombineP){ + + int i = 0, j = 0; + int pos = 0; + + while( j < m && i < n ) + { + if( arr1[j] < arr2[i] ){ + mergeArr[pos] = arr1[j]; + pos++; + j++; + } + else if( arr1[j] == arr2[i] ) + { + mergeArr[pos] = arr1[j]; + pos++; + j++; + i++; + } + else if( arr1[j] > arr2[i] ){ + mergeArr[pos] = arr2[i]; + pos++; + i++; + } + } + if (j == m && i < n){ + while (i < n){ + mergeArr[pos] = arr2[i]; + pos++; + i++; + } + } + + if (j < m && i == n){ + while (j < m){ + mergeArr[pos] = arr1[j]; + pos++; + j++; + } + } + + assert(pos == numCombineP); + /* + printf("pos = %d, numCombineP = %d\n", pos, numCombineP); + + for (i = 0; i < m; i++){ + printf(BUNFMT " ", arr1[i]); + } + + printf("\n"); + for (i = 0; i < n; i++){ + printf(BUNFMT " ", arr2[i]); + } + + + printf("\n"); + for (i = 0; i < pos; i++){ + printf(BUNFMT " ", mergeArr[i]); + } + + printf("\n"); + */ + + +} + +static +mergeCS* mergeTwoCSs(CS cs1, CS cs2, int numCombineP, int support, int coverage){ + + mergeCS *mergecs = (mergeCS*) malloc (sizeof (mergeCS)); + mergecs->id1 = 
cs1.csId; + mergecs->id2 = cs2.csId; + mergecs->lstProp = (oid*) malloc(sizeof(oid) * numCombineP); + + if (mergecs->lstProp == NULL){ + printf("Malloc failed. at %d", numCombineP); + exit(-1); + } + + mergeOidSets(cs1.lstProp, cs2.lstProp, mergecs->lstProp, cs1.numProp, cs2.numProp, numCombineP); + + mergecs->numProp = numCombineP; + mergecs->support = support; + mergecs->coverage = coverage; + + return mergecs; + +} static str printFreqCSSet(CSset *freqCSset, oid* csSuperCSMap, BAT *freqBat, BAT *mapbat, char isWriteTofile, int freqThreshold){ @@ -889,6 +1017,52 @@ str printFreqCSSet(CSset *freqCSset, oid return MAL_SUCCEED; } + +static +str printmergeCSSet(mergeCSset *mergecsSet, int freqThreshold){ + + int i,j; + FILE *fout; + char filename[100]; + char tmpStr[20]; + int ret; + + str propStr; + char* schema = "rdf"; + int nummergecs; + + nummergecs = mergecsSet->nummergeCSadded; + + if (TKNZRopen (NULL, &schema) != MAL_SUCCEED) { + throw(RDF, "rdf.rdfschema", + "could not open the tokenizer\n"); + } + + + strcpy(filename, "mergeCSFullInfo"); + sprintf(tmpStr, "%d", freqThreshold); + strcat(filename, tmpStr); + strcat(filename, ".txt"); + + fout = fopen(filename,"wt"); + + for (i = 0; i < nummergecs; i++){ + mergeCS cs = (mergeCS)mergecsSet->items[i]; + + fprintf(fout, "MergeCS %d: "BUNFMT " and " BUNFMT "\n",i,cs.id1, cs.id2); + for (j = 0; j < cs.numProp; j++){ + takeOid(cs.lstProp[j], &propStr); + fprintf(fout," %s\n", propStr); + } + fprintf(fout, "\n"); + } + + fclose(fout); + + TKNZRclose(&ret); + return MAL_SUCCEED; +} + /* * Hashing function for a set of values * Rely on djb2 http://www.cse.yorku.ca/~oz/hash.html @@ -1171,6 +1345,40 @@ static int isSubset(oid* arr1, oid* arr2 } /* + * Use Jaccard similarity coefficient for computing the + * similarity between two sets + * sim(A,B) = |A ∩ B| / |A ∪ B| + * Here each set contains distinct values only + * */ + +static +float similarityScore(oid* arr1, oid* arr2, int m, int n, int *numCombineP){ + + int i = 0, 
j = 0; + int numOverlap = 0; + + while( i < n && j < m ) + { + if( arr1[j] < arr2[i] ) + j++; + else if( arr1[j] == arr2[i] ) + { + j++; + i++; + numOverlap++; + } + else if( arr1[j] > arr2[i] ) + i++; + } + + *numCombineP = m + n - numOverlap; + + return ((float)numOverlap / (*numCombineP)); +} + + + +/* static void printCS(CS cs){ int i; @@ -1187,7 +1395,7 @@ void printCS(CS cs){ * Here maximum frequent CS is a CS that there exist no other CS which contains that CS * */ static -void getMaximumFreqCSs(CSset *freqCSset, oid* csSuperCSMap, BAT* coverageBat, int* superCSCoverage, BAT* freqBat, int* superCSFrequency, int numCS){ +void getMaximumFreqCSs(CSset *freqCSset, oid* csSuperCSMap, BAT* coverageBat, int* superCSCoverage, BAT* freqBat, int* superCSFrequency, int numCS, int *nMaxCSs){ int numFreqCS = freqCSset->numCSadded; int i, j; @@ -1224,6 +1432,8 @@ void getMaximumFreqCSs(CSset *freqCSset, //printCS( freqCSset->items[i]); } } + + *nMaxCSs = numMaxCSs; printf("Number of maximum CSs: %d / %d CSs \n", numMaxCSs, numCS); /* @@ -1268,7 +1478,39 @@ void getMaximumFreqCSs(CSset *freqCSset, */ } _______________________________________________ checkin-list mailing list checkin-list@monetdb.org http://mail.monetdb.org/mailman/listinfo/checkin-list