Changeset: 21c27a0ff296 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=21c27a0ff296
Modified Files:
        monetdb5/extras/rdf/rdf_shredder.c
        monetdb5/extras/rdf/rdfontologyload.c
        monetdb5/extras/rdf/rdfschema.c
        monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

Merge two maximumCSs based on the computed similarity score between them.


diffs (truncated from 441 to 300 lines):

diff --git a/monetdb5/extras/rdf/rdf_shredder.c 
b/monetdb5/extras/rdf/rdf_shredder.c
--- a/monetdb5/extras/rdf/rdf_shredder.c
+++ b/monetdb5/extras/rdf/rdf_shredder.c
@@ -223,7 +223,7 @@ tripleHandler(void* user_data, const rap
        BUN bun = BUN_NONE;
        BAT **graph = pdata->graph;
 
-       printf("%s   %s   
%s\n",raptor_term_to_string(triple->subject),raptor_term_to_string(triple->predicate),raptor_term_to_string(triple->object));
+       //printf("%s   %s   
%s\n",raptor_term_to_string(triple->subject),raptor_term_to_string(triple->predicate),raptor_term_to_string(triple->object));
        if (pdata->error > pdata->lasterror){
                unsigned char* objStr;
                int objLen; 
diff --git a/monetdb5/extras/rdf/rdfontologyload.c 
b/monetdb5/extras/rdf/rdfontologyload.c
--- a/monetdb5/extras/rdf/rdfontologyload.c
+++ b/monetdb5/extras/rdf/rdfontologyload.c
@@ -126,7 +126,9 @@ tripleHandler(void* user_data, const rap
        parserData *pdata = ((parserData *) user_data);
        //BUN bun = BUN_NONE;
        //BAT **graph = pdata->graph;
+       
        printf("%s   %s   
%s\n",raptor_term_to_string(triple->subject),raptor_term_to_string(triple->predicate),raptor_term_to_string(triple->object));
+       
        pdata->tcount++;
        return; 
 }
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -125,6 +125,25 @@ void addCStoSet(CSset *csSet, CS item)
 }
 
 static 
+void addmergeCStoSet(mergeCSset *mergecsSet, mergeCS item)
+{
+       void *_tmp; 
+       if(mergecsSet->nummergeCSadded == mergecsSet->numAllocation) 
+       { 
+               mergecsSet->numAllocation += INIT_NUM_CS; 
+               
+               _tmp = realloc(mergecsSet->items, (mergecsSet->numAllocation * 
sizeof(CS)));
+       
+               if (!_tmp){
+                       fprintf(stderr, "ERROR: Couldn't realloc memory!\n");
+               }
+               mergecsSet->items = (mergeCS*)_tmp;
+       }
+       mergecsSet->items[mergecsSet->nummergeCSadded] = item;
+       mergecsSet->nummergeCSadded++;
+}
+
+static 
 CSrel* creataCSrel(oid csoid){
        CSrel *csrel = (CSrel*) malloc(sizeof(CSrel));
        csrel->origCSoid = csoid; 
@@ -719,6 +738,16 @@ void freeCSset(CSset *csSet){
        free(csSet);    
 }
 
+/*
+ * Release a mergeCSset: first the property list owned by every stored
+ * item, then the item array, and finally the set structure itself.
+ */
+static
+void freemergeCSset(mergeCSset *csSet){
+       int idx = 0;
+
+       while (idx < csSet->nummergeCSadded){
+               free(csSet->items[idx].lstProp);
+               idx++;
+       }
+
+       free(csSet->items);
+       free(csSet);
+}
+
 static 
 CSset* initCSset(void){
        CSset *csSet = (CSset*) malloc(sizeof(CSset)); 
@@ -729,6 +758,16 @@ CSset* initCSset(void){
        return csSet;
 }
 
+/*
+ * Allocate an empty mergeCSset with room for INIT_NUM_CS items.
+ *
+ * Returns the new set, or NULL when either allocation fails (nothing is
+ * leaked in that case).
+ */
+static 
+mergeCSset* initmergeCSset(void){
+       mergeCSset *mergecsSet = (mergeCSset*) malloc(sizeof(*mergecsSet));
+
+       if (mergecsSet == NULL){
+               fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+               return NULL;
+       }
+
+       mergecsSet->items = (mergeCS*) malloc(sizeof(*mergecsSet->items) * INIT_NUM_CS);
+       if (mergecsSet->items == NULL){
+               fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+               free(mergecsSet);
+               return NULL;
+       }
+
+       mergecsSet->numAllocation = INIT_NUM_CS;
+       mergecsSet->nummergeCSadded = 0;
+
+       return mergecsSet;
+}
+
 /*
 static 
 void freeCS(CS *cs){
@@ -770,6 +809,95 @@ CS* creatCS(oid csId, int numP, oid* buf
        return cs; 
 }
 
+/*
+ * Write the union of arr1 (m entries) and arr2 (n entries) into mergeArr.
+ *
+ * The arrays are walked in lock-step, always emitting the smaller head
+ * value and emitting a value present in both only once; this linear merge
+ * relies on both inputs being in ascending order. The caller provides
+ * mergeArr and the expected size of the union, numCombineP, which is
+ * verified with an assertion once the merge is complete.
+ */
+static 
+void mergeOidSets(oid* arr1, oid* arr2, oid* mergeArr, int m, int n, int numCombineP){
+       int idx1 = 0;           /* cursor in arr1 */
+       int idx2 = 0;           /* cursor in arr2 */
+       int out = 0;            /* next free slot in mergeArr */
+
+       while (idx1 < m && idx2 < n){
+               if (arr1[idx1] < arr2[idx2]){
+                       mergeArr[out++] = arr1[idx1++];
+               }
+               else if (arr1[idx1] > arr2[idx2]){
+                       mergeArr[out++] = arr2[idx2++];
+               }
+               else {
+                       /* Same value on both sides: keep one copy, advance both. */
+                       mergeArr[out++] = arr1[idx1++];
+                       idx2++;
+               }
+       }
+
+       /* At most one of the two inputs still has unread entries. */
+       for (; idx2 < n; idx2++){
+               mergeArr[out++] = arr2[idx2];
+       }
+       for (; idx1 < m; idx1++){
+               mergeArr[out++] = arr1[idx1];
+       }
+
+       assert(out == numCombineP);
+}
+
+/*
+ * Build a heap-allocated mergeCS describing the merge of cs1 and cs2.
+ *
+ * The result records both source ids, the union of the two property
+ * lists (numCombineP entries, computed by mergeOidSets) and the given
+ * support and coverage values. The caller owns the returned structure
+ * and its lstProp array.
+ *
+ * The process is terminated with exit(-1) when memory cannot be
+ * obtained. A NULL result from a zero-sized request (numCombineP == 0)
+ * is not treated as a failure.
+ */
+static 
+mergeCS* mergeTwoCSs(CS cs1, CS cs2, int numCombineP, int support, int coverage){
+
+       mergeCS *mergecs = (mergeCS*) malloc (sizeof (mergeCS)); 
+
+       if (mergecs == NULL){
+               printf("Malloc failed. at %d", numCombineP);
+               exit(-1);
+       }
+
+       mergecs->id1 = cs1.csId;  
+       mergecs->id2 = cs2.csId; 
+       mergecs->lstProp = (oid*) malloc(sizeof(oid) * numCombineP); 
+
+       /* malloc(0) may legitimately return NULL, so only a non-empty request can fail. */
+       if (mergecs->lstProp == NULL && numCombineP > 0){
+               printf("Malloc failed. at %d", numCombineP);
+               exit(-1);
+       }
+
+       mergeOidSets(cs1.lstProp, cs2.lstProp, mergecs->lstProp, cs1.numProp, cs2.numProp, numCombineP); 
+
+       mergecs->numProp = numCombineP;
+       mergecs->support = support;
+       mergecs->coverage = coverage;
+       
+       return mergecs; 
+
+}
 
 static 
 str printFreqCSSet(CSset *freqCSset, oid* csSuperCSMap, BAT *freqBat, BAT 
*mapbat, char isWriteTofile, int freqThreshold){
@@ -889,6 +1017,52 @@ str printFreqCSSet(CSset *freqCSset, oid
        return MAL_SUCCEED;
 }
 
+
+/*
+ * Dump every merged CS in mergecsSet to the text file
+ * "mergeCSFullInfo<freqThreshold>.txt" in the current directory.
+ *
+ * For each item the two source CS ids are written, followed by one line
+ * per property holding the string that takeOid yields for that property
+ * oid. The "rdf" tokenizer is opened for the duration of the dump.
+ *
+ * Returns MAL_SUCCEED on success; throws an RDF exception when the
+ * tokenizer or the output file cannot be opened.
+ */
+static 
+str printmergeCSSet(mergeCSset *mergecsSet, int freqThreshold){
+
+       int     i,j; 
+       FILE    *fout; 
+       char    filename[100];
+       int     ret;
+
+       str     propStr; 
+       char*   schema = "rdf";
+       int     nummergecs;     
+
+       nummergecs = mergecsSet->nummergeCSadded; 
+       
+       if (TKNZRopen (NULL, &schema) != MAL_SUCCEED) {
+               throw(RDF, "rdf.rdfschema",
+                               "could not open the tokenizer\n");
+       }
+
+       /* Bounded formatting: the name can never overrun the buffer. */
+       snprintf(filename, sizeof(filename), "mergeCSFullInfo%d.txt", freqThreshold);
+
+       fout = fopen(filename,"wt"); 
+       if (fout == NULL) {
+               /* Do not leave the tokenizer open on the error path. */
+               TKNZRclose(&ret);
+               throw(RDF, "rdf.rdfschema",
+                               "could not open the output file\n");
+       }
+
+       for (i = 0; i < nummergecs; i++){
+               mergeCS cs = mergecsSet->items[i];
+               
+               fprintf(fout, "MergeCS %d: "BUNFMT " and " BUNFMT "\n",i,cs.id1, cs.id2);
+               for (j = 0; j < cs.numProp; j++){
+                       /* NOTE(review): takeOid's result is unchecked and propStr is never
+                        * released here -- confirm whether the caller must free it. */
+                       takeOid(cs.lstProp[j], &propStr);       
+                       fprintf(fout,"          %s\n", propStr);
+               }
+               fprintf(fout, "\n");
+       }
+
+       fclose(fout);
+       
+       TKNZRclose(&ret);
+       return MAL_SUCCEED;
+}
+
 /*
  * Hashing function for a set of values
  * Rely on djb2 http://www.cse.yorku.ca/~oz/hash.html
@@ -1171,6 +1345,40 @@ static int isSubset(oid* arr1, oid* arr2
 }
 
 /*
+ * Use Jaccard similarity coefficient for computing the 
+ * similarity between two sets
+ * sim(A,B) = |A ∩ B| / |A U B|
+ * Here each set contains distinct values only 
+ * */
+
+static 
+float similarityScore(oid* arr1, oid* arr2, int m, int n, int *numCombineP){
+       /*
+        * Walk arr1 (m entries) and arr2 (n entries) together, counting the
+        * values found in both. The single pass relies on both arrays being
+        * in ascending order. The size of the union is stored in
+        * *numCombineP and the ratio shared / union is returned.
+        */
+       int pos1 = 0;           /* cursor in arr1 */
+       int pos2 = 0;           /* cursor in arr2 */
+       int shared = 0;         /* values present in both arrays */
+
+       while (pos1 < m && pos2 < n){
+               if (arr1[pos1] == arr2[pos2]){
+                       shared++;
+                       pos1++;
+                       pos2++;
+               }
+               else if (arr1[pos1] < arr2[pos2]){
+                       pos1++;
+               }
+               else {
+                       pos2++;
+               }
+       }
+
+       *numCombineP = m + n - shared;
+
+       return ((float)shared / (*numCombineP));
+}
+
+
+
+/*
 static 
 void printCS(CS cs){
        int i; 
@@ -1187,7 +1395,7 @@ void printCS(CS cs){
  * Here maximum frequent CS is a CS that there exist no other CS which 
contains that CS
  * */
 static 
-void getMaximumFreqCSs(CSset *freqCSset, oid* csSuperCSMap, BAT* coverageBat, 
int* superCSCoverage, BAT* freqBat, int* superCSFrequency, int numCS){
+void getMaximumFreqCSs(CSset *freqCSset, oid* csSuperCSMap, BAT* coverageBat, 
int* superCSCoverage, BAT* freqBat, int* superCSFrequency, int numCS, int 
*nMaxCSs){
 
        int     numFreqCS = freqCSset->numCSadded; 
        int     i, j; 
@@ -1224,6 +1432,8 @@ void getMaximumFreqCSs(CSset *freqCSset,
                        //printCS( freqCSset->items[i]); 
                }
        }
+
+       *nMaxCSs = numMaxCSs;
        printf("Number of maximum CSs: %d / %d CSs \n", numMaxCSs, numCS);
 
        /*
@@ -1268,7 +1478,39 @@ void getMaximumFreqCSs(CSset *freqCSset,
        */
 }
 
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
http://mail.monetdb.org/mailman/listinfo/checkin-list

Reply via email to