Changeset: 1034ded84d4d for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=1034ded84d4d Modified Files: monetdb5/extras/rdf/rdf.h monetdb5/extras/rdf/rdf_shredder.c monetdb5/extras/rdf/rdf_shredder.mx monetdb5/extras/rdf/rdfschema.c monetdb5/extras/rdf/rdfschema.h Branch: rdf Log Message:
Change the implementation according to the new design.
- Use a BAT to store the hash value of each characteristic set (CS).
- Store the list of P's corresponding to each CS in a BAT. The offset of each list is stored in another BAT.

diffs (truncated from 438 to 300 lines):

diff --git a/monetdb5/extras/rdf/rdf.h b/monetdb5/extras/rdf/rdf.h --- a/monetdb5/extras/rdf/rdf.h +++ b/monetdb5/extras/rdf/rdf.h @@ -64,6 +64,9 @@ typedef enum { #define STORE TRIPLE_STORE /* this should become a compile time option */ +#define batsz 10000000 +#define smallbatsz 100000 + #if STORE == TRIPLE_STORE typedef enum { S_sort, P_sort, O_sort, /* sorted */ diff --git a/monetdb5/extras/rdf/rdf_shredder.c b/monetdb5/extras/rdf/rdf_shredder.c --- a/monetdb5/extras/rdf/rdf_shredder.c +++ b/monetdb5/extras/rdf/rdf_shredder.c @@ -37,7 +37,7 @@ typedef struct graphBATdef { int tailType; /* type of right column */ } graphBATdef; -static BUN batsz = 10000000; +//static BUN batsz = 10000000; /* this list should be kept alligned with the graphBATType enum */ #if STORE == TRIPLE_STORE diff --git a/monetdb5/extras/rdf/rdf_shredder.mx b/monetdb5/extras/rdf/rdf_shredder.mx --- a/monetdb5/extras/rdf/rdf_shredder.mx +++ b/monetdb5/extras/rdf/rdf_shredder.mx @@ -40,8 +40,6 @@ typedef struct graphBATdef { int tailType; /* type of right column */ } graphBATdef; -static BUN batsz = 10000000; - /* this list should be kept alligned with the graphBATType enum */ #if STORE == TRIPLE_STORE static graphBATdef graphdef[N_GRAPH_BAT] = { diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -26,6 +26,7 @@ #include <gdk.h> #include <hashmap/hashmap.h> + str RDFSchemaExplore(int *ret, str *tbname, str *clname) { @@ -107,6 +108,75 @@ CS* creatCS(int subId, int numP, int* bu return cs; } + +/* + * Hashing function for a set of values + * Rely on djb2 http://www.cse.yorku.ca/~oz/hash.html + * + */ +static unsigned int
RDF_hash_intlist(int* key, int num){ + unsigned int hashCode = 5381u; + int i; + + for (i = 0; i < num; i++){ + hashCode = ((hashCode << 5) + hashCode) + key[i]; + } + + // return 0x7fffffff & hashCode + return hashCode; +} + +static +void appendArrayToBat(BAT *b, int* inArray, int num){ + int i; + BUN r = BUNlast(b); + if (r + num < b->batCapacity){ + BATextend(b, smallbatsz); + } + for (i = 0; i < num; i++){ + memcpy(Tloc(b, BUNlast(b)), inArray, sizeof(int) * num); + } + BATsetcount(b, (BUN) (b->batCount + num)); + +} + +static +void checkCSduplication(BAT* pOffsetBat, BAT* fullPBat, BUN pos, int* key, int numK){ + BUN *offset; + BUN *offset2; + int numP; + int i; + BUN *existvalue; + + offset = (BUN *) Tloc(pOffsetBat, pos); + if ((pos + 1) < pOffsetBat->batCount){ + offset2 = (BUN *)Tloc(pOffsetBat, pos + 1); + } + else{ + offset2 = malloc(sizeof(BUN)); + *offset2 = BUNlast(fullPBat); + } + + numP = *offset2 - *offset; + + // Check each value + if (numK != numP) { + printf("No duplication \n"); + return; + } + else{ + existvalue = (BUN *)Tloc(fullPBat, *offset); + for (i = 0; i < numP; i++){ + if (key[i] != (int)*existvalue++) { + printf("No duplication \n"); + return; + } + } + } + + printf("There is duplication \n"); + return; +} /* * Put a CS to the hashmap. * While putting CS to the hashmap, update the support (frequency) value @@ -114,39 +184,44 @@ CS* creatCS(int subId, int numP, int* bu * If yes, add that frequent CS to the freqCSset. * * */ -static void putaCStoHash(map_t csmap, int* key, int num, oid *csoid, char isStoreFreqCS, int freqThreshold, CSset **freqCSset){ - oid *getCSoid; - oid *putCSoid; - int err; - int* csKey; +static +void putaCStoHash(BAT* hsKeyBat, BAT* pOffsetBat, BAT* fullPBat, oid subjId, int* key, int num, + oid *csoid, char isStoreFreqCS, int freqThreshold, CSset **freqCSset){ + int csKey; int freq = 0; CS *freqCS; + BUN bun; + BUN offset; - csKey = (int*) malloc(sizeof(int) * num); - if (csKey==NULL){ - printf("Malloc failed. 
at %d", num); - exit(-1); - } + csKey = RDF_hash_intlist(key, num); + bun = BUNfnd(BATmirror(hsKeyBat),(ptr) &csKey); + if (bun == BUN_NONE) { + if (hsKeyBat->T->hash && BATcount(hsKeyBat) > 4 * hsKeyBat->T->hash->mask) { + HASHdestroy(hsKeyBat); + BAThash(BATmirror(hsKeyBat), 2*BATcount(hsKeyBat)); + } + hsKeyBat = BUNappend(hsKeyBat, (ptr) &csKey, TRUE); - copyIntSet(csKey, key, num); - if (hashmap_get(csmap, csKey, num,(void**)(&getCSoid),1, &freq) != MAP_OK){ - putCSoid = malloc(sizeof(oid)); - *putCSoid = *csoid; + (*csoid)++; - err = hashmap_put(csmap, csKey, num, 1, putCSoid); - assert(err == MAP_OK); + offset = BUNlast(fullPBat); + /* Add list of p to fullPBat and pOffsetBat*/ + BUNappend(pOffsetBat, &offset , TRUE); + appendArrayToBat(fullPBat, key, num); - (*csoid)++; } else{ + printf("This CS exists \n"); + /* Check whether it is really an duplication (same hashvalue but different list of */ + checkCSduplication(pOffsetBat, fullPBat, bun, key, num ); + if (isStoreFreqCS == 1){ /* Store the frequent CS to the CSset*/ //printf("FreqCS: Support = %d, Threshold %d \n ", freq, freqThreshold); if (freq == freqThreshold){ - freqCS = creatCS(*getCSoid, num, key); + freqCS = creatCS(subjId, num, key); addCStoSet(*freqCSset, *freqCS); } } - free(csKey); } } @@ -334,128 +409,29 @@ static void getStatisticCSsBySupports(ma free(statCS); } -/* Extract CS from SPO triples table */ -str -RDFextractCS(int *ret, bat *sbatid, bat *pbatid, int *freqThreshold){ - BUN p, q; - BAT *sbat = NULL, *pbat = NULL; - BATiter si, pi; /*iterator for BAT of s,p columns in spo table */ - oid *bt, *pbt; - oid curS; /* current Subject oid */ - oid curP; /* current Property oid */ - oid CSoid = 0; /* Characteristic set oid */ - int numP; /* Number of properties for current S */ - map_t csMap; - int* buff; - int INIT_PROPERTY_NUM = 5000; - int maxNumProp = 0; - CSset *freqCSset; /* Set of frequent CSs */ - - - buff = (int *) malloc (sizeof(int) * INIT_PROPERTY_NUM); - - if ((sbat = 
BATdescriptor(*sbatid)) == NULL) { - throw(MAL, "rdf.RDFextractCS", RUNTIME_OBJECT_MISSING); - } - if ((pbat = BATdescriptor(*pbatid)) == NULL) { - throw(MAL, "rdf.RDFextractCS", RUNTIME_OBJECT_MISSING); - } - - si = bat_iterator(sbat); - pi = bat_iterator(pbat); - - /* Init a hashmap */ - csMap = hashmap_new(); - freqCSset = initCSset(); - - numP = 0; - curP = 0; - - printf("freqThreshold = %d \n", *freqThreshold); - BATloop(sbat, p, q){ - bt = (oid *) BUNtloc(si, p); - if (*bt != curS){ - if (p != 0){ /* Not the first S */ - putaCStoHash(csMap, buff, numP, &CSoid, 1, *freqThreshold, &freqCSset); - - if (numP > maxNumProp) - maxNumProp = numP; - } - curS = *bt; - curP = 0; - numP = 0; - } - - pbt = (oid *) BUNtloc(pi, p); - - if (numP > INIT_PROPERTY_NUM){ - throw(MAL, "rdf.RDFextractCS", "# of properties is greater than INIT_PROPERTY_NUM"); - exit(-1); - } - - if (curP != *pbt){ /* Multi values property */ - buff[numP] = *pbt; - numP++; - curP = *pbt; - } - //printf("Travel sbat at %d value: %d , for pbat: %d \n", (int) p, (int) *bt, (int) *pbt); - } - - /*put the last CS */ - putaCStoHash(csMap, buff, numP, &CSoid, 1, *freqThreshold, &freqCSset ); - - if (numP > maxNumProp) - maxNumProp = numP; - - printf("Number of frequent CSs is: %d \n", freqCSset->numCSadded); - - /*get the statistic */ - - getTopFreqCSs(csMap,*freqThreshold); - - getMaximumFreqCSs(freqCSset); - - //getStatisticCSsBySize(csMap,maxNumProp); - - getStatisticCSsBySupports(csMap, 5000, 1, 0); - - BBPreclaim(sbat); - BBPreclaim(pbat); - - free (buff); - - freeCSset(freqCSset); - - hashmap_free(csMap); - - *ret = 1; - return MAL_SUCCEED; -} - /* * Get the refer CS * Input: oid of a URI object * Return the id of the CS * */ + +/* static str getReferCS(BAT *sbat, BAT *pbat, oid *obt){ _______________________________________________ checkin-list mailing list checkin-list@monetdb.org http://mail.monetdb.org/mailman/listinfo/checkin-list