Changeset: 9b6644b4d8f1 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=9b6644b4d8f1
Modified Files:
        monetdb5/extras/rdf/rdfschema.c
        monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

Detect the sub-family of each CS.

- Add data structures.

- Implement all functions for generating a signature for each subCS and
checking for duplicates.


diffs (truncated from 461 to 300 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -42,6 +42,30 @@ static void copyOidSet(oid* dest, oid* o
        }
 }
 
+
+static void copyTypesSet(char* dest, char* orig, int len){
+       memcpy(dest, orig, len * sizeof(char));
+}
+
+
+/*
+ * Hashing function for a set of values
+ * Rely on djb2 http://www.cse.yorku.ca/~oz/hash.html
+ *
+ */
+static oid RDF_hash_Tyleslist(char* types, int num){
+       //unsigned int hashCode = 5381u; 
+       oid  hashCode = 5381u;
+       int i; 
+
+       for (i = 0; i < num; i++){
+               hashCode = ((hashCode << 5) + hashCode) + types[i];
+       }
+       
+       // return 0x7fffffff & hashCode 
+       return hashCode;
+}
+
 /*
 static void printArray(oid* inputArr, int num){
        int i; 
@@ -110,7 +134,7 @@ void printCSrelSet(CSrel *csrelSet, int 
        int j; 
        for (i = 0; i < num; i++){
                if (csrelSet[i].numRef != 0){   //Only print CS with FK
-                       printf("Relationship i: ");
+                       printf("Relationship %d: ", i);
                        printf("CS " BUNFMT " --> ", csrelSet[i].origCSoid);
                        for (j = 0; j < csrelSet[i].numRef; j++){
                                printf(BUNFMT " (%d) ", 
csrelSet[i].lstRefCSoid[j],csrelSet[i].lstCnt[j]);      
@@ -121,6 +145,128 @@ void printCSrelSet(CSrel *csrelSet, int 
 }
 
 static 
+SubCS* creatSubCS(oid subCSId, int numP, char* buff, oid subCSsign){
+       SubCS *subcs = malloc(sizeof(SubCS)); 
+       subcs->subTypes =  (char*) malloc(sizeof(char) * numP);
+       
+       copyTypesSet(subcs->subTypes, buff, numP); 
+       subcs->subCSId = subCSId;
+       subcs->numSubTypes = numP; 
+       subcs->sign = subCSsign; 
+       return subcs; 
+}
+
+static 
+SubCSSet* createaSubCSSet(oid csId){
+       SubCSSet* subCSset = malloc(sizeof(SubCSSet));
+       subCSset->csId = csId; 
+       subCSset->numAllocation = INIT_NUM_SUBCS;
+       subCSset->numSubCS = 0;
+       subCSset->subCSs = malloc(sizeof(SubCS) * INIT_NUM_SUBCS);
+       subCSset->freq = malloc(sizeof(int) * INIT_NUM_SUBCS);
+
+       return subCSset;
+}
+
+static 
+SubCSSet* initCS_SubCSMap(oid numSubCSSet){
+       oid i; 
+       SubCSSet *subcssets = malloc(sizeof(SubCSSet) * numSubCSSet); 
+       SubCSSet *subcsset;
+       for (i = 0; i < numSubCSSet;i++){
+               subcsset = createaSubCSSet(i); 
+               subcssets[i] = (SubCSSet) *subcsset; 
+       }
+
+       return subcssets; 
+
+}
+static 
+char checkExistsubCS(oid subCSsign, char* types, int numTypes,  SubCSSet *subcsset, oid *existCSId){
+       char isFound = 0; 
+       int i; 
+       int j; 
+       for (i = 0; i < subcsset->numSubCS; i++){
+               if ((subcsset->subCSs[i].sign != subCSsign) || (subcsset->subCSs[i].numSubTypes != numTypes))
+                       continue; 
+               else{
+                       isFound = 1; 
+                       for (j = 0; j < numTypes; j++){
+                               if (subcsset->subCSs[i].subTypes[j] != types[j]){
+                                       isFound = 0; 
+                                       break; 
+                               }
+                       }
+
+                       if (isFound == 1){
+                               *existCSId = i; 
+                               return isFound; 
+                       }
+               }
+       }
+
+       *existCSId = subcsset->numSubCS;        //Id of new SubCS
+
+       return isFound; 
+}
+
+static 
+void addSubCStoSet(SubCSSet *subcsSet, SubCS item)
+{
+       void *_tmp; 
+       void *_tmp2; 
+
+       if(subcsSet->numSubCS == subcsSet->numAllocation) 
+       { 
+               subcsSet->numAllocation += INIT_NUM_SUBCS; 
+               
+               _tmp = realloc(subcsSet->subCSs, (subcsSet->numAllocation * sizeof(SubCS)));
+               _tmp2 = realloc(subcsSet->freq, (subcsSet->numAllocation * sizeof(int))); 
+       
+               if (!_tmp){
+                       fprintf(stderr, "ERROR: Couldn't realloc memory!\n");
+               }
+               subcsSet->subCSs = (SubCS*)_tmp;
+               subcsSet->freq = (int *) _tmp2; 
+       }
+
+       subcsSet->subCSs[subcsSet->numSubCS] = item;
+       subcsSet->freq[subcsSet->numSubCS] = 1;
+
+       subcsSet->numSubCS++;
+
+}
+
+static 
+oid addSubCS(char *buff, int numP, int csId, SubCSSet* csSubCSMap){
+       SubCSSet *subcsset;
+       oid subCSsign; 
+       char isFound; 
+       oid  subCSId; 
+       SubCS *subCS; 
+
+
+       subcsset = &(csSubCSMap[csId]);
+
+       // Check the duplication
+       subCSsign = RDF_hash_Tyleslist(buff, numP);
+
+       isFound = checkExistsubCS(subCSsign, buff, numP, subcsset, &subCSId);
+       
+       if (isFound == 0){      // Add new 
+               subCS = creatSubCS(subCSId, numP, buff, subCSsign);
+               addSubCStoSet(subcsset,*subCS);
+       }
+       else{                   // Exist
+               //Update frequency
+               subcsset->freq[subCSId]++;
+       }
+
+       return subCSId; 
+
+}
+
+static 
 void addReltoCSRel(oid origCSoid, oid refCSoid, CSrel *csrel)
 {
        void *_tmp; 
@@ -193,7 +339,7 @@ void freeCS(CS *cs){
 */
 
 static 
-CS* creatCS(oid subId, int numP, oid* buff){
+CS* creatCS(oid csId, int numP, oid* buff){
        CS *cs = malloc(sizeof(CS)); 
        cs->lstProp =  (oid*) malloc(sizeof(oid) * numP);
        
@@ -203,7 +349,7 @@ CS* creatCS(oid subId, int numP, oid* bu
        }
 
        copyOidSet(cs->lstProp, buff, numP); 
-       cs->subIdx = subId;
+       cs->csId = csId;
        cs->numProp = numP; 
        cs->numAllocation = numP; 
        cs->isSubset = 0; /*By default, this CS is not known to be a subset of any other CS*/
@@ -374,7 +520,7 @@ void addNewCS(CSBats *csBats, BUN* csKey
  *
  * */
 static 
-oid putaCStoHash(CSBats *csBats, oid subjId, oid* key, int num, 
+oid putaCStoHash(CSBats *csBats, oid* key, int num, 
                oid *csoid, char isStoreFreqCS, int freqThreshold, CSset *freqCSset){
        BUN     csKey; 
        int     *freq; 
@@ -412,7 +558,7 @@ oid putaCStoHash(CSBats *csBats, oid sub
                        if (isStoreFreqCS == 1){        /* Store the frequent CS to the CSset*/
                                //printf("FreqCS: Support = %d, Threshold %d  \n ", freq, freqThreshold);
                                if (*freq == freqThreshold){
-                                       freqCS = creatCS(subjId, num, key);     
        
+                                       freqCS = creatCS(csId, num, key);       
        
                                        addCStoSet(freqCSset, *freqCS);
                                }
                        }
@@ -715,7 +861,7 @@ void freeCSBats(CSBats *csBats){
 
 
 static 
-str RDFassignCSId(int *ret, BAT *sbat, BATiter si, BATiter pi, CSset *freqCSset, int *freqThreshold, CSBats* csBats, oid *subjCSMap, oid *maxCSoid){
+str RDFassignCSId(int *ret, BAT *sbat, BATiter si, BATiter pi, CSset *freqCSset, int *freqThreshold, CSBats* csBats, oid *subjCSMap, oid *maxCSoid, int *maxNumProp, int *maxNumPwithDup){
 
        BUN     p, q; 
        oid     *sbt, *pbt; 
@@ -723,9 +869,9 @@ str RDFassignCSId(int *ret, BAT *sbat, B
        oid     curP;           /* current Property oid */
        oid     CSoid = 0;      /* Characteristic set oid */
        int     numP;           /* Number of properties for current S */
+       int     numPwithDup = 0; 
        oid*    buff;    
        int     INIT_PROPERTY_NUM = 5000; 
-       int     maxNumProp = 0; 
        oid     returnCSid; 
        
        buff = (oid *) malloc (sizeof(oid) * INIT_PROPERTY_NUM);
@@ -739,12 +885,14 @@ str RDFassignCSId(int *ret, BAT *sbat, B
                sbt = (oid *) BUNtloc(si, p);           
                if (*sbt != curS){
                        if (p != 0){    /* Not the first S */
-                               returnCSid = putaCStoHash(csBats, curS, buff, numP, &CSoid, 1, *freqThreshold, freqCSset); 
+                               returnCSid = putaCStoHash(csBats, buff, numP, &CSoid, 1, *freqThreshold, freqCSset); 
 
                                subjCSMap[curS] = returnCSid;                   
        
 
-                               if (numP > maxNumProp) 
-                                       maxNumProp = numP; 
+                               if (numP > *maxNumProp) 
+                                       *maxNumProp = numP; 
+                               if (numPwithDup > *maxNumPwithDup)
+                                       *maxNumPwithDup = numPwithDup; 
                                if (returnCSid > *maxCSoid)
                                        *maxCSoid = returnCSid; 
                                 
@@ -752,6 +900,7 @@ str RDFassignCSId(int *ret, BAT *sbat, B
                        curS = *sbt; 
                        curP = 0;
                        numP = 0;
+                       numPwithDup = 0; 
                }
                                
                pbt = (oid *) BUNtloc(pi, p); 
@@ -766,17 +915,22 @@ str RDFassignCSId(int *ret, BAT *sbat, B
                        numP++; 
                        curP = *pbt; 
                }
+
+               numPwithDup++;
                
        }
        
        /*put the last CS */
-       returnCSid = putaCStoHash(csBats, curS, buff, numP, &CSoid, 1, *freqThreshold, freqCSset ); 
+       returnCSid = putaCStoHash(csBats, buff, numP, &CSoid, 1, *freqThreshold, freqCSset ); 
        
        subjCSMap[curS] = returnCSid;                           
 
-       if (numP > maxNumProp) 
-               maxNumProp = numP; 
-               
+       if (numP > *maxNumProp) 
+               *maxNumProp = numP; 
+
+       if (numPwithDup > *maxNumPwithDup)
+               *maxNumPwithDup = numPwithDup; 
+
        if (returnCSid > *maxCSoid)
                *maxCSoid = returnCSid; 
 
@@ -788,62 +942,48 @@ str RDFassignCSId(int *ret, BAT *sbat, B
 }
 
 static 
-str RDFrelationships(int *ret, BAT *sbat, BATiter si, BATiter pi, BATiter oi, CSset *freqCSset, 
-               int *freqThreshold, CSBats* csBats, oid *subjCSMap, BUN maxSoid, BUN maxCSoid){
+str RDFrelationships(int *ret, BAT *sbat, BATiter si, BATiter oi,  
+               oid *subjCSMap, oid *subjSubCSMap, BUN maxSoid, BUN maxCSoid, int maxNumPwithDup){
 
        BUN     p, q; 
-       oid     *sbt, *pbt, *obt; 
+       oid     *sbt, *obt; 
        oid     curS;           /* current Subject oid */
-       oid     curP;           /* current Property oid */
-       oid     CSoid = 0;      /* Characteristic set oid */
-       int     numP;           /* Number of properties for current S */
-       oid*    buff;    
-       int     INIT_PROPERTY_NUM = 5000; 
-       int     maxNumProp = 0; 
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
http://mail.monetdb.org/mailman/listinfo/checkin-list

Reply via email to