Changeset: b384ccba763e for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=b384ccba763e
Modified Files:
        monetdb5/extras/rdf/hashmap/hashmap.c
        monetdb5/extras/rdf/rdfschema.c
        monetdb5/extras/rdf/rdfschema.h
        monetdb5/extras/rdf/rdfschema.mal
Branch: rdf
Log Message:

Add frequent Characteristic sets to a set.

- This is done while putting each CS into a hashmap.
Each CS is checked for its frequency (support);
if the support of a CS is >= a threshold value, the CS is added to the set.


diffs (244 lines):

diff --git a/monetdb5/extras/rdf/hashmap/hashmap.c 
b/monetdb5/extras/rdf/hashmap/hashmap.c
--- a/monetdb5/extras/rdf/hashmap/hashmap.c
+++ b/monetdb5/extras/rdf/hashmap/hashmap.c
@@ -65,6 +65,7 @@ static char intsetcmp(int* key1, int* ke
  * arr1 has m members, arr2 has n members
  * */
 
+/*
 static int isSubset(int* arr1, int* arr2, int m, int n)
 {
        int i = 0, j = 0;
@@ -90,7 +91,7 @@ static int isSubset(int* arr1, int* arr2
        else
                return 1;
 } 
-
+*/
 
 /*
  * Return the integer of the location in data
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -41,12 +41,85 @@ static void copyIntSet(int* dest, int* o
        }
 }
 
-static void putaCStoHash(map_t csmap, int* buff, int num, oid *csoid){
+
+static 
+void addCStoSet(CSset *csSet, CS item)
+{
+       void *_tmp; 
+       if(csSet->numCSadded == csSet->numAllocation) 
+       { 
+               csSet->numAllocation += INIT_NUM_CS; 
+               
+               _tmp = realloc(csSet->items, (csSet->numAllocation * 
sizeof(CS)));
+       
+               if (!_tmp){
+                       fprintf(stderr, "ERROR: Couldn't realloc memory!\n");
+               }
+               csSet->items = (CS*)_tmp;
+       }
+       csSet->items[csSet->numCSadded] = item;
+       csSet->numCSadded++;
+}
+
+static 
+void freeCSset(CSset *csSet){
+       int i;
+       for(i = 0; i < csSet->numCSadded; i ++){
+               free(csSet->items[i].lstProp);
+       }
+       free(csSet->items);
+       free(csSet);
+}
+
+static 
+CSset* initCSset(void){
+       CSset *csSet = malloc(sizeof(CSset)); 
+       csSet->items = malloc(sizeof(CS) * INIT_NUM_CS); 
+       csSet->numAllocation = INIT_NUM_CS;
+       csSet->numCSadded = 0;
+
+       return csSet;
+}
+
+/*
+static 
+void freeCS(CS *cs){
+       free(cs->lstProp);
+       free(cs);
+}
+*/
+
+static 
+CS* creatCS(int subId, int numP, int* buff){
+       CS *cs = malloc(sizeof(CS)); 
+       cs->lstProp =  (int*) malloc(sizeof(int) * numP);
+       
+       if (cs->lstProp == NULL){
+               printf("Malloc failed. at %d", numP);
+               exit(-1); 
+       }
+
+       copyIntSet(cs->lstProp, buff, numP); 
+       cs->subIdx = subId;
+       cs->numProp = numP; 
+       cs->numAllocation = numP; 
+       return cs; 
+}
+
+/*
+ * Put a CS to the hashmap. 
+ * While putting CS to the hashmap, update the support (frequency) value 
+ * for an existing CS, and check whether it becomes a frequent CS or not. 
+ * If yes, add that frequent CS to the freqCSset. 
+ *
+ * */
+static void putaCStoHash(map_t csmap, int* buff, int num, oid *csoid, char 
isStoreFreqCS, int freqThreshold, CSset *freqCSset){
        oid     *getCSoid; 
        oid     *putCSoid; 
        int     err; 
        int*    cs; 
        int     freq; 
+       CS      *freqCS; 
 
        cs = (int*) malloc(sizeof(int) * num);
        if (cs==NULL){
@@ -64,12 +137,20 @@ static void putaCStoHash(map_t csmap, in
 
                (*csoid)++; 
        }
-       else
+       else{
+               if (isStoreFreqCS == 1){        /* Store the frequent CS to the 
CSset*/
+                       if (freq == freqThreshold){
+                               freqCS = creatCS(*getCSoid, num, buff);         
+                               addCStoSet(freqCSset, *freqCS);
+                       }
+               }
                free(cs); 
+       }
 
 }
 
 
+
 static void putPtoHash(map_t pmap, int value, oid *poid, int support){
        oid     *getPoid; 
        oid     *putPoid; 
@@ -171,7 +252,7 @@ static void getStatisticCSsBySupports(ma
 
 /* Extract CS from SPO triples table */
 str
-RDFextractCS(int *ret, bat *sbatid, bat *pbatid){
+RDFextractCS(int *ret, bat *sbatid, bat *pbatid, int freqThreshold){
        BUN     p, q; 
        BAT     *sbat = NULL, *pbat = NULL; 
        BATiter si, pi;         /*iterator for BAT of s,p columns in spo table 
*/
@@ -184,6 +265,7 @@ RDFextractCS(int *ret, bat *sbatid, bat 
        int*    buff;    
        int     INIT_PROPERTY_NUM = 50000; 
        int     maxNumProp = 0; 
+       CSset   *freqCSset;     /* Set of frequent CSs */
 
        buff = (int *) malloc (sizeof(int) * INIT_PROPERTY_NUM);
        
@@ -199,6 +281,8 @@ RDFextractCS(int *ret, bat *sbatid, bat 
 
        /* Init a hashmap */
        csMap = hashmap_new(); 
+       freqCSset = initCSset();
+
        numP = 0;
        curP = 0; 
 
@@ -206,7 +290,7 @@ RDFextractCS(int *ret, bat *sbatid, bat 
                bt = (oid *) BUNtloc(si, p);            
                if (*bt != curS){
                        if (p != 0){    /* Not the first S */
-                               putaCStoHash(csMap, buff, numP, &CSoid); 
+                               putaCStoHash(csMap, buff, numP, &CSoid, 1, 
freqThreshold, freqCSset); 
                                
                                if (numP > maxNumProp) 
                                        maxNumProp = numP; 
@@ -232,11 +316,14 @@ RDFextractCS(int *ret, bat *sbatid, bat 
        }
        
        /*put the last CS */
-       putaCStoHash(csMap, buff, numP, &CSoid); 
+       putaCStoHash(csMap, buff, numP, &CSoid, 1, freqThreshold, freqCSset ); 
 
        if (numP > maxNumProp) 
                maxNumProp = numP; 
-                                       
+               
+       
+       printf("Number of frequent CSs is: %d \n", freqCSset->numCSadded);
+
        /*get the statistic */
        getTopFreqCSs(csMap,20);
 
@@ -248,6 +335,9 @@ RDFextractCS(int *ret, bat *sbatid, bat 
        BBPreclaim(pbat); 
 
        free (buff); 
+
+       freeCSset(freqCSset); 
+
        hashmap_free(csMap);
 
        *ret = 1; 
diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h
--- a/monetdb5/extras/rdf/rdfschema.h
+++ b/monetdb5/extras/rdf/rdfschema.h
@@ -24,19 +24,27 @@ rdf_export str
 RDFSchemaExplore(int *ret, str *tbname, str *clname);
 
 rdf_export str
-RDFextractCS(int *ret, bat *sbatid, bat *pbatid); 
+RDFextractCS(int *ret, bat *sbatid, bat *pbatid, int freqThreshold); 
 
 rdf_export str
 RDFextractPfromPSO(int *ret, bat *pbatid, bat *sbatid); 
 
-typedef struct SubProps
+typedef struct CS
 {
        int     subIdx;         //Id of subject
        int*    lstProp;        //List of properties' Ids
        int     numProp;
        int     numAllocation;
-} SubProps;
+} CS;
 
-SubProps* subPropSet;
+#define INIT_NUM_CS 100
+
+typedef struct CSset{
+       CS* items;
+       int numCSadded;
+       int numAllocation;
+} CSset; 
+
+CSset *freqCSs; 
 
 #endif /* _RDFSCHEMA_H_ */
diff --git a/monetdb5/extras/rdf/rdfschema.mal 
b/monetdb5/extras/rdf/rdfschema.mal
--- a/monetdb5/extras/rdf/rdfschema.mal
+++ b/monetdb5/extras/rdf/rdfschema.mal
@@ -22,9 +22,9 @@ command rdfschemaexplore(tbname:str, cln
 address RDFSchemaExplore
 comment "Explore the schema information from input table e.g., SPO in RDF";
 
-command rdfextractCS( sbat:bat[:any_1,:oid], pbat:bat[:any_2,:oid] ) :void
+command rdfextractCS( sbat:bat[:any_1,:oid], pbat:bat[:any_2,:oid], 
freqThreshold:int ) :void
 address RDFextractCS
-comment "Extract Characteristic sets from SPO table";
+comment "Extract Characteristic sets from SPO table. While extracting CSs, get 
the frequent CSs";
 
 command rdfextractPfromPSO(pbat:bat[:any_1,:oid], sbat:bat[:any_2,:oid]):void
 address RDFextractPfromPSO
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
http://mail.monetdb.org/mailman/listinfo/checkin-list

Reply via email to