Changeset: 1034ded84d4d for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=1034ded84d4d
Modified Files:
        monetdb5/extras/rdf/rdf.h
        monetdb5/extras/rdf/rdf_shredder.c
        monetdb5/extras/rdf/rdf_shredder.mx
        monetdb5/extras/rdf/rdfschema.c
        monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

Change the implementation according to the new design.

- Use a BAT to store the hash value of each characteristic set (CS).
- Store the list of P's (properties) corresponding to each CS in a BAT.
  The offset of each list is stored in another BAT.


diffs (truncated from 438 to 300 lines):

diff --git a/monetdb5/extras/rdf/rdf.h b/monetdb5/extras/rdf/rdf.h
--- a/monetdb5/extras/rdf/rdf.h
+++ b/monetdb5/extras/rdf/rdf.h
@@ -64,6 +64,9 @@ typedef enum {
 
 #define STORE TRIPLE_STORE /* this should become a compile time option */
 
+#define batsz 10000000
+#define smallbatsz 100000
+
 #if STORE == TRIPLE_STORE
  typedef enum {
        S_sort, P_sort, O_sort, /* sorted */
diff --git a/monetdb5/extras/rdf/rdf_shredder.c 
b/monetdb5/extras/rdf/rdf_shredder.c
--- a/monetdb5/extras/rdf/rdf_shredder.c
+++ b/monetdb5/extras/rdf/rdf_shredder.c
@@ -37,7 +37,7 @@ typedef struct graphBATdef {
        int tailType;            /* type of right column */
 } graphBATdef;
 
-static BUN batsz = 10000000;
+//static BUN batsz = 10000000;
 
 /* this list should be kept alligned with the graphBATType enum */
 #if STORE == TRIPLE_STORE
diff --git a/monetdb5/extras/rdf/rdf_shredder.mx 
b/monetdb5/extras/rdf/rdf_shredder.mx
--- a/monetdb5/extras/rdf/rdf_shredder.mx
+++ b/monetdb5/extras/rdf/rdf_shredder.mx
@@ -40,8 +40,6 @@ typedef struct graphBATdef {
        int tailType;            /* type of right column */
 } graphBATdef;
 
-static BUN batsz = 10000000;
-
 /* this list should be kept alligned with the graphBATType enum */
 #if STORE == TRIPLE_STORE
  static graphBATdef graphdef[N_GRAPH_BAT] = {
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -26,6 +26,7 @@
 #include <gdk.h>
 #include <hashmap/hashmap.h>
 
+
 str
 RDFSchemaExplore(int *ret, str *tbname, str *clname)
 {
@@ -107,6 +108,75 @@ CS* creatCS(int subId, int numP, int* bu
        return cs; 
 }
 
+
+/*
+ * Hashing function for a set of values
+ * Rely on djb2 http://www.cse.yorku.ca/~oz/hash.html
+ *
+ */
+static unsigned int RDF_hash_intlist(int* key, int num){
+       unsigned int hashCode = 5381u; 
+       int i; 
+
+       for (i = 0; i < num; i++){
+               hashCode = ((hashCode << 5) + hashCode) + key[i];
+       }
+       
+       // return 0x7fffffff & hashCode 
+       return hashCode;
+}
+
+static 
+void appendArrayToBat(BAT *b, int* inArray, int num){
+       int i; 
+       BUN r = BUNlast(b);
+       if (r + num < b->batCapacity){
+               BATextend(b, smallbatsz); 
+       }
+       for (i = 0; i < num; i++){
+               memcpy(Tloc(b, BUNlast(b)), inArray, sizeof(int) * num); 
+       }
+       BATsetcount(b, (BUN) (b->batCount + num)); 
+       
+}
+
+static 
+void checkCSduplication(BAT* pOffsetBat, BAT* fullPBat, BUN pos, int* key, int 
numK){
+       BUN *offset; 
+       BUN *offset2; 
+       int numP; 
+       int i; 
+       BUN *existvalue; 
+
+       offset = (BUN *) Tloc(pOffsetBat, pos); 
+       if ((pos + 1) < pOffsetBat->batCount){
+               offset2 = (BUN *)Tloc(pOffsetBat, pos + 1);
+       }
+       else{
+               offset2 = malloc(sizeof(BUN)); 
+               *offset2 = BUNlast(fullPBat); 
+       }
+
+       numP = *offset2 - *offset; 
+
+       // Check each value
+       if (numK != numP) {
+               printf("No duplication \n");
+               return; 
+       }
+       else{
+               existvalue = (BUN *)Tloc(fullPBat, *offset);    
+               for (i = 0; i < numP; i++){
+                       if (key[i] != (int)*existvalue++) {
+                               printf("No duplication \n");
+                               return;
+                       }       
+               }
+       }
+       
+       printf("There is duplication \n");
+       return;
+} 
 /*
  * Put a CS to the hashmap. 
  * While putting CS to the hashmap, update the support (frequency) value 
@@ -114,39 +184,44 @@ CS* creatCS(int subId, int numP, int* bu
  * If yes, add that frequent CS to the freqCSset. 
  *
  * */
-static void putaCStoHash(map_t csmap, int* key, int num, oid *csoid, char 
isStoreFreqCS, int freqThreshold, CSset **freqCSset){
-       oid     *getCSoid; 
-       oid     *putCSoid; 
-       int     err; 
-       int*    csKey; 
+static 
+void putaCStoHash(BAT* hsKeyBat, BAT* pOffsetBat, BAT* fullPBat, oid subjId, 
int* key, int num, 
+               oid *csoid, char isStoreFreqCS, int freqThreshold, CSset 
**freqCSset){
+       int     csKey; 
        int     freq = 0; 
        CS      *freqCS; 
+       BUN     bun; 
+       BUN     offset; 
 
-       csKey = (int*) malloc(sizeof(int) * num);
-       if (csKey==NULL){
-               printf("Malloc failed. at %d", num);
-               exit(-1); 
-       }
+       csKey = RDF_hash_intlist(key, num);
+       bun = BUNfnd(BATmirror(hsKeyBat),(ptr) &csKey);
+       if (bun == BUN_NONE) {
+               if (hsKeyBat->T->hash && BATcount(hsKeyBat) > 4 * 
hsKeyBat->T->hash->mask) {
+                       HASHdestroy(hsKeyBat);
+                       BAThash(BATmirror(hsKeyBat), 2*BATcount(hsKeyBat));
+               }
+               hsKeyBat = BUNappend(hsKeyBat, (ptr) &csKey, TRUE);
 
-       copyIntSet(csKey, key, num); 
-       if (hashmap_get(csmap, csKey, num,(void**)(&getCSoid),1, &freq) != 
MAP_OK){
-               putCSoid = malloc(sizeof(oid)); 
-               *putCSoid = *csoid; 
+               (*csoid)++;
 
-               err = hashmap_put(csmap, csKey, num, 1,  putCSoid);     
-               assert(err == MAP_OK); 
+               offset = BUNlast(fullPBat);
+               /* Add list of p to fullPBat and pOffsetBat*/
+               BUNappend(pOffsetBat, &offset , TRUE);
+               appendArrayToBat(fullPBat, key, num);
 
-               (*csoid)++; 
        }
        else{
+               printf("This CS exists \n");    
+               /* Check whether it is really an duplication (same hashvalue 
but different list of */
+               checkCSduplication(pOffsetBat, fullPBat, bun, key, num );
+
                if (isStoreFreqCS == 1){        /* Store the frequent CS to the 
CSset*/
                        //printf("FreqCS: Support = %d, Threshold %d  \n ", 
freq, freqThreshold);
                        if (freq == freqThreshold){
-                               freqCS = creatCS(*getCSoid, num, key);          
+                               freqCS = creatCS(subjId, num, key);             
                                addCStoSet(*freqCSset, *freqCS);
                        }
                }
-               free(csKey); 
        }
 
 }
@@ -334,128 +409,29 @@ static void getStatisticCSsBySupports(ma
        free(statCS); 
 }
 
-/* Extract CS from SPO triples table */
-str
-RDFextractCS(int *ret, bat *sbatid, bat *pbatid, int *freqThreshold){
-       BUN     p, q; 
-       BAT     *sbat = NULL, *pbat = NULL; 
-       BATiter si, pi;         /*iterator for BAT of s,p columns in spo table 
*/
-       oid     *bt, *pbt; 
-       oid     curS;           /* current Subject oid */
-       oid     curP;           /* current Property oid */
-       oid     CSoid = 0;      /* Characteristic set oid */
-       int     numP;           /* Number of properties for current S */
-       map_t   csMap;          
-       int*    buff;    
-       int     INIT_PROPERTY_NUM = 5000; 
-       int     maxNumProp = 0; 
-       CSset   *freqCSset;     /* Set of frequent CSs */
-
-
-       buff = (int *) malloc (sizeof(int) * INIT_PROPERTY_NUM);
-       
-       if ((sbat = BATdescriptor(*sbatid)) == NULL) {
-               throw(MAL, "rdf.RDFextractCS", RUNTIME_OBJECT_MISSING);
-       }
-       if ((pbat = BATdescriptor(*pbatid)) == NULL) {
-               throw(MAL, "rdf.RDFextractCS", RUNTIME_OBJECT_MISSING);
-       }
-       
-       si = bat_iterator(sbat); 
-       pi = bat_iterator(pbat); 
-
-       /* Init a hashmap */
-       csMap = hashmap_new(); 
-       freqCSset = initCSset();
-
-       numP = 0;
-       curP = 0; 
-
-       printf("freqThreshold = %d \n", *freqThreshold);        
-       BATloop(sbat, p, q){
-               bt = (oid *) BUNtloc(si, p);            
-               if (*bt != curS){
-                       if (p != 0){    /* Not the first S */
-                               putaCStoHash(csMap, buff, numP, &CSoid, 1, 
*freqThreshold, &freqCSset); 
-                               
-                               if (numP > maxNumProp) 
-                                       maxNumProp = numP; 
-                       }
-                       curS = *bt; 
-                       curP = 0;
-                       numP = 0;
-               }
-                               
-               pbt = (oid *) BUNtloc(pi, p); 
-
-               if (numP > INIT_PROPERTY_NUM){
-                       throw(MAL, "rdf.RDFextractCS", "# of properties is 
greater than INIT_PROPERTY_NUM");
-                       exit(-1);
-               }
-               
-               if (curP != *pbt){      /* Multi values property */             
-                       buff[numP] = *pbt; 
-                       numP++; 
-                       curP = *pbt; 
-               }
-               //printf("Travel sbat at %d value: %d , for pbat: %d \n", (int) 
p, (int) *bt, (int) *pbt);
-       }
-       
-       /*put the last CS */
-       putaCStoHash(csMap, buff, numP, &CSoid, 1, *freqThreshold, &freqCSset 
); 
-
-       if (numP > maxNumProp) 
-               maxNumProp = numP; 
-               
-       printf("Number of frequent CSs is: %d \n", freqCSset->numCSadded);
-
-       /*get the statistic */
-
-       getTopFreqCSs(csMap,*freqThreshold);
-
-       getMaximumFreqCSs(freqCSset); 
-
-       //getStatisticCSsBySize(csMap,maxNumProp); 
-
-       getStatisticCSsBySupports(csMap, 5000, 1, 0);
-
-       BBPreclaim(sbat); 
-       BBPreclaim(pbat); 
-
-       free (buff); 
-
-       freeCSset(freqCSset); 
-
-       hashmap_free(csMap);
-
-       *ret = 1; 
-       return MAL_SUCCEED; 
-}
-
 /*
  * Get the refer CS 
  * Input: oid of a URI object 
  * Return the id of the CS
  * */
+
+/*
 static 
 str getReferCS(BAT *sbat, BAT *pbat, oid *obt){
 
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
http://mail.monetdb.org/mailman/listinfo/checkin-list

Reply via email to