Changeset: b384ccba763e for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=b384ccba763e Modified Files: monetdb5/extras/rdf/hashmap/hashmap.c monetdb5/extras/rdf/rdfschema.c monetdb5/extras/rdf/rdfschema.h monetdb5/extras/rdf/rdfschema.mal Branch: rdf Log Message:
Add frequent Characteristic sets to a set. - This is done while putting CS into a hashmap. Each CS will be checked for the frequency (support); if the support of a CS >= a threshold value, it will be added to the set diffs (244 lines): diff --git a/monetdb5/extras/rdf/hashmap/hashmap.c b/monetdb5/extras/rdf/hashmap/hashmap.c --- a/monetdb5/extras/rdf/hashmap/hashmap.c +++ b/monetdb5/extras/rdf/hashmap/hashmap.c @@ -65,6 +65,7 @@ static char intsetcmp(int* key1, int* ke * arr1 has m members, arr2 has n members * */ +/* static int isSubset(int* arr1, int* arr2, int m, int n) { int i = 0, j = 0; @@ -90,7 +91,7 @@ static int isSubset(int* arr1, int* arr2 else return 1; } - +*/ /* * Return the integer of the location in data diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -41,12 +41,85 @@ static void copyIntSet(int* dest, int* o } } -static void putaCStoHash(map_t csmap, int* buff, int num, oid *csoid){ + +static +void addCStoSet(CSset *csSet, CS item) +{ + void *_tmp; + if(csSet->numCSadded == csSet->numAllocation) + { + csSet->numAllocation += INIT_NUM_CS; + + _tmp = realloc(csSet->items, (csSet->numAllocation * sizeof(CS))); + + if (!_tmp){ + fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); + } + csSet->items = (CS*)_tmp; + } + csSet->items[csSet->numCSadded] = item; + csSet->numCSadded++; +} + +static +void freeCSset(CSset *csSet){ + int i; + for(i = 0; i < csSet->numCSadded; i ++){ + free(csSet->items[i].lstProp); + } + free(csSet->items); + free(csSet); +} + +static +CSset* initCSset(void){ + CSset *csSet = malloc(sizeof(CSset)); + csSet->items = malloc(sizeof(CS) * INIT_NUM_CS); + csSet->numAllocation = INIT_NUM_CS; + csSet->numCSadded = 0; + + return csSet; +} + +/* +static +void freeCS(CS *cs){ + free(cs->lstProp); + free(cs); +} +*/ + +static +CS* creatCS(int subId, int numP, int* buff){ + CS *cs = malloc(sizeof(CS)); + cs->lstProp = (int*) 
malloc(sizeof(int) * numP); + + if (cs->lstProp == NULL){ + printf("Malloc failed. at %d", numP); + exit(-1); + } + + copyIntSet(cs->lstProp, buff, numP); + cs->subIdx = subId; + cs->numProp = numP; + cs->numAllocation = numP; + return cs; +} + +/* + * Put a CS to the hashmap. + * While putting CS to the hashmap, update the support (frequency) value + * for an existing CS, and check whether it becomes a frequent CS or not. + * If yes, add that frequent CS to the freqCSset. + * + * */ +static void putaCStoHash(map_t csmap, int* buff, int num, oid *csoid, char isStoreFreqCS, int freqThreshold, CSset *freqCSset){ oid *getCSoid; oid *putCSoid; int err; int* cs; int freq; + CS *freqCS; cs = (int*) malloc(sizeof(int) * num); if (cs==NULL){ @@ -64,12 +137,20 @@ static void putaCStoHash(map_t csmap, in (*csoid)++; } - else + else{ + if (isStoreFreqCS == 1){ /* Store the frequent CS to the CSset*/ + if (freq == freqThreshold){ + freqCS = creatCS(*getCSoid, num, buff); + addCStoSet(freqCSset, *freqCS); + } + } free(cs); + } } + static void putPtoHash(map_t pmap, int value, oid *poid, int support){ oid *getPoid; oid *putPoid; @@ -171,7 +252,7 @@ static void getStatisticCSsBySupports(ma /* Extract CS from SPO triples table */ str -RDFextractCS(int *ret, bat *sbatid, bat *pbatid){ +RDFextractCS(int *ret, bat *sbatid, bat *pbatid, int freqThreshold){ BUN p, q; BAT *sbat = NULL, *pbat = NULL; BATiter si, pi; /*iterator for BAT of s,p columns in spo table */ @@ -184,6 +265,7 @@ RDFextractCS(int *ret, bat *sbatid, bat int* buff; int INIT_PROPERTY_NUM = 50000; int maxNumProp = 0; + CSset *freqCSset; /* Set of frequent CSs */ buff = (int *) malloc (sizeof(int) * INIT_PROPERTY_NUM); @@ -199,6 +281,8 @@ RDFextractCS(int *ret, bat *sbatid, bat /* Init a hashmap */ csMap = hashmap_new(); + freqCSset = initCSset(); + numP = 0; curP = 0; @@ -206,7 +290,7 @@ RDFextractCS(int *ret, bat *sbatid, bat bt = (oid *) BUNtloc(si, p); if (*bt != curS){ if (p != 0){ /* Not the first S */ - 
putaCStoHash(csMap, buff, numP, &CSoid); + putaCStoHash(csMap, buff, numP, &CSoid, 1, freqThreshold, freqCSset); if (numP > maxNumProp) maxNumProp = numP; @@ -232,11 +316,14 @@ RDFextractCS(int *ret, bat *sbatid, bat } /*put the last CS */ - putaCStoHash(csMap, buff, numP, &CSoid); + putaCStoHash(csMap, buff, numP, &CSoid, 1, freqThreshold, freqCSset ); if (numP > maxNumProp) maxNumProp = numP; - + + + printf("Number of frequent CSs is: %d \n", freqCSset->numCSadded); + /*get the statistic */ getTopFreqCSs(csMap,20); @@ -248,6 +335,9 @@ RDFextractCS(int *ret, bat *sbatid, bat BBPreclaim(pbat); free (buff); + + freeCSset(freqCSset); + hashmap_free(csMap); *ret = 1; diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h --- a/monetdb5/extras/rdf/rdfschema.h +++ b/monetdb5/extras/rdf/rdfschema.h @@ -24,19 +24,27 @@ rdf_export str RDFSchemaExplore(int *ret, str *tbname, str *clname); rdf_export str -RDFextractCS(int *ret, bat *sbatid, bat *pbatid); +RDFextractCS(int *ret, bat *sbatid, bat *pbatid, int freqThreshold); rdf_export str RDFextractPfromPSO(int *ret, bat *pbatid, bat *sbatid); -typedef struct SubProps +typedef struct CS { int subIdx; //Id of subject int* lstProp; //List of properties' Ids int numProp; int numAllocation; -} SubProps; +} CS; -SubProps* subPropSet; +#define INIT_NUM_CS 100 + +typedef struct CSset{ + CS* items; + int numCSadded; + int numAllocation; +} CSset; + +CSset *freqCSs; #endif /* _RDFSCHEMA_H_ */ diff --git a/monetdb5/extras/rdf/rdfschema.mal b/monetdb5/extras/rdf/rdfschema.mal --- a/monetdb5/extras/rdf/rdfschema.mal +++ b/monetdb5/extras/rdf/rdfschema.mal @@ -22,9 +22,9 @@ command rdfschemaexplore(tbname:str, cln address RDFSchemaExplore comment "Explore the schema information from input table e.g., SPO in RDF"; -command rdfextractCS( sbat:bat[:any_1,:oid], pbat:bat[:any_2,:oid] ) :void +command rdfextractCS( sbat:bat[:any_1,:oid], pbat:bat[:any_2,:oid], freqThreshold:int ) :void address RDFextractCS -comment 
"Extract Characteristic sets from SPO table"; +comment "Extract Characteristic sets from SPO table. While extracting CSs, get the frequent CSs"; command rdfextractPfromPSO(pbat:bat[:any_1,:oid], sbat:bat[:any_2,:oid]):void address RDFextractPfromPSO _______________________________________________ checkin-list mailing list checkin-list@monetdb.org http://mail.monetdb.org/mailman/listinfo/checkin-list