Changeset: 9b6644b4d8f1 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=9b6644b4d8f1
Modified Files:
	monetdb5/extras/rdf/rdfschema.c
	monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Detect the sub-family of each CS. - Add data structures. - Implement all function for generating signature for each subCS, checking duplications. diffs (truncated from 461 to 300 lines): diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -42,6 +42,30 @@ static void copyOidSet(oid* dest, oid* o } } + +static void copyTypesSet(char* dest, char* orig, int len){ + memcpy(dest, orig, len * sizeof(char)); +} + + +/* + * Hashing function for a set of values + * Rely on djb2 http://www.cse.yorku.ca/~oz/hash.html + * + */ +static oid RDF_hash_Tyleslist(char* types, int num){ + //unsigned int hashCode = 5381u; + oid hashCode = 5381u; + int i; + + for (i = 0; i < num; i++){ + hashCode = ((hashCode << 5) + hashCode) + types[i]; + } + + // return 0x7fffffff & hashCode + return hashCode; +} + /* static void printArray(oid* inputArr, int num){ int i; @@ -110,7 +134,7 @@ void printCSrelSet(CSrel *csrelSet, int int j; for (i = 0; i < num; i++){ if (csrelSet[i].numRef != 0){ //Only print CS with FK - printf("Relationship i: "); + printf("Relationship %d: ", i); printf("CS " BUNFMT " --> ", csrelSet[i].origCSoid); for (j = 0; j < csrelSet[i].numRef; j++){ printf(BUNFMT " (%d) ", csrelSet[i].lstRefCSoid[j],csrelSet[i].lstCnt[j]); @@ -121,6 +145,128 @@ void printCSrelSet(CSrel *csrelSet, int } static +SubCS* creatSubCS(oid subCSId, int numP, char* buff, oid subCSsign){ + SubCS *subcs = malloc(sizeof(SubCS)); + subcs->subTypes = (char*) malloc(sizeof(char) * numP); + + copyTypesSet(subcs->subTypes, buff, numP); + subcs->subCSId = subCSId; + subcs->numSubTypes = numP; + subcs->sign = subCSsign; + return subcs; +} + +static +SubCSSet* createaSubCSSet(oid csId){ + SubCSSet* subCSset = malloc(sizeof(SubCSSet)); + subCSset->csId = csId; + subCSset->numAllocation = INIT_NUM_SUBCS; + subCSset->numSubCS = 0; + subCSset->subCSs = malloc(sizeof(SubCS) * INIT_NUM_SUBCS); + subCSset->freq = 
malloc(sizeof(int) * INIT_NUM_SUBCS); + + return subCSset; +} + +static +SubCSSet* initCS_SubCSMap(oid numSubCSSet){ + oid i; + SubCSSet *subcssets = malloc(sizeof(SubCSSet) * numSubCSSet); + SubCSSet *subcsset; + for (i = 0; i < numSubCSSet;i++){ + subcsset = createaSubCSSet(i); + subcssets[i] = (SubCSSet) *subcsset; + } + + return subcssets; + +} +static +char checkExistsubCS(oid subCSsign, char* types, int numTypes, SubCSSet *subcsset, oid *existCSId){ + char isFound = 0; + int i; + int j; + for (i = 0; i < subcsset->numSubCS; i++){ + if ((subcsset->subCSs[i].sign != subCSsign) || (subcsset->subCSs[i].numSubTypes != numTypes)) + continue; + else{ + isFound = 1; + for (j = 0; j < numTypes; j++){ + if (subcsset->subCSs[i].subTypes[j] != types[j]){ + isFound = 0; + break; + } + } + + if (isFound == 1){ + *existCSId = i; + return isFound; + } + } + } + + *existCSId = subcsset->numSubCS; //Id of new SubCS + + return isFound; +} + +static +void addSubCStoSet(SubCSSet *subcsSet, SubCS item) +{ + void *_tmp; + void *_tmp2; + + if(subcsSet->numSubCS == subcsSet->numAllocation) + { + subcsSet->numAllocation += INIT_NUM_SUBCS; + + _tmp = realloc(subcsSet->subCSs, (subcsSet->numAllocation * sizeof(SubCS))); + _tmp2 = realloc(subcsSet->freq, (subcsSet->numAllocation * sizeof(int))); + + if (!_tmp){ + fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); + } + subcsSet->subCSs = (SubCS*)_tmp; + subcsSet->freq = (int *) _tmp2; + } + + subcsSet->subCSs[subcsSet->numSubCS] = item; + subcsSet->freq[subcsSet->numSubCS] = 1; + + subcsSet->numSubCS++; + +} + +static +oid addSubCS(char *buff, int numP, int csId, SubCSSet* csSubCSMap){ + SubCSSet *subcsset; + oid subCSsign; + char isFound; + oid subCSId; + SubCS *subCS; + + + subcsset = &(csSubCSMap[csId]); + + // Check the duplication + subCSsign = RDF_hash_Tyleslist(buff, numP); + + isFound = checkExistsubCS(subCSsign, buff, numP, subcsset, &subCSId); + + if (isFound == 0){ // Add new + subCS = creatSubCS(subCSId, numP, buff, 
subCSsign); + addSubCStoSet(subcsset,*subCS); + } + else{ // Exist + //Update frequency + subcsset->freq[subCSId]++; + } + + return subCSId; + +} + +static void addReltoCSRel(oid origCSoid, oid refCSoid, CSrel *csrel) { void *_tmp; @@ -193,7 +339,7 @@ void freeCS(CS *cs){ */ static -CS* creatCS(oid subId, int numP, oid* buff){ +CS* creatCS(oid csId, int numP, oid* buff){ CS *cs = malloc(sizeof(CS)); cs->lstProp = (oid*) malloc(sizeof(oid) * numP); @@ -203,7 +349,7 @@ CS* creatCS(oid subId, int numP, oid* bu } copyOidSet(cs->lstProp, buff, numP); - cs->subIdx = subId; + cs->csId = csId; cs->numProp = numP; cs->numAllocation = numP; cs->isSubset = 0; /*By default, this CS is not known to be a subset of any other CS*/ @@ -374,7 +520,7 @@ void addNewCS(CSBats *csBats, BUN* csKey * * */ static -oid putaCStoHash(CSBats *csBats, oid subjId, oid* key, int num, +oid putaCStoHash(CSBats *csBats, oid* key, int num, oid *csoid, char isStoreFreqCS, int freqThreshold, CSset *freqCSset){ BUN csKey; int *freq; @@ -412,7 +558,7 @@ oid putaCStoHash(CSBats *csBats, oid sub if (isStoreFreqCS == 1){ /* Store the frequent CS to the CSset*/ //printf("FreqCS: Support = %d, Threshold %d \n ", freq, freqThreshold); if (*freq == freqThreshold){ - freqCS = creatCS(subjId, num, key); + freqCS = creatCS(csId, num, key); addCStoSet(freqCSset, *freqCS); } } @@ -715,7 +861,7 @@ void freeCSBats(CSBats *csBats){ static -str RDFassignCSId(int *ret, BAT *sbat, BATiter si, BATiter pi, CSset *freqCSset, int *freqThreshold, CSBats* csBats, oid *subjCSMap, oid *maxCSoid){ +str RDFassignCSId(int *ret, BAT *sbat, BATiter si, BATiter pi, CSset *freqCSset, int *freqThreshold, CSBats* csBats, oid *subjCSMap, oid *maxCSoid, int *maxNumProp, int *maxNumPwithDup){ BUN p, q; oid *sbt, *pbt; @@ -723,9 +869,9 @@ str RDFassignCSId(int *ret, BAT *sbat, B oid curP; /* current Property oid */ oid CSoid = 0; /* Characteristic set oid */ int numP; /* Number of properties for current S */ + int numPwithDup = 0; oid* buff; 
int INIT_PROPERTY_NUM = 5000; - int maxNumProp = 0; oid returnCSid; buff = (oid *) malloc (sizeof(oid) * INIT_PROPERTY_NUM); @@ -739,12 +885,14 @@ str RDFassignCSId(int *ret, BAT *sbat, B sbt = (oid *) BUNtloc(si, p); if (*sbt != curS){ if (p != 0){ /* Not the first S */ - returnCSid = putaCStoHash(csBats, curS, buff, numP, &CSoid, 1, *freqThreshold, freqCSset); + returnCSid = putaCStoHash(csBats, buff, numP, &CSoid, 1, *freqThreshold, freqCSset); subjCSMap[curS] = returnCSid; - if (numP > maxNumProp) - maxNumProp = numP; + if (numP > *maxNumProp) + *maxNumProp = numP; + if (numPwithDup > *maxNumPwithDup) + *maxNumPwithDup = numPwithDup; if (returnCSid > *maxCSoid) *maxCSoid = returnCSid; @@ -752,6 +900,7 @@ str RDFassignCSId(int *ret, BAT *sbat, B curS = *sbt; curP = 0; numP = 0; + numPwithDup = 0; } pbt = (oid *) BUNtloc(pi, p); @@ -766,17 +915,22 @@ str RDFassignCSId(int *ret, BAT *sbat, B numP++; curP = *pbt; } + + numPwithDup++; } /*put the last CS */ - returnCSid = putaCStoHash(csBats, curS, buff, numP, &CSoid, 1, *freqThreshold, freqCSset ); + returnCSid = putaCStoHash(csBats, buff, numP, &CSoid, 1, *freqThreshold, freqCSset ); subjCSMap[curS] = returnCSid; - if (numP > maxNumProp) - maxNumProp = numP; - + if (numP > *maxNumProp) + *maxNumProp = numP; + + if (numPwithDup > *maxNumPwithDup) + *maxNumPwithDup = numPwithDup; + if (returnCSid > *maxCSoid) *maxCSoid = returnCSid; @@ -788,62 +942,48 @@ str RDFassignCSId(int *ret, BAT *sbat, B } static -str RDFrelationships(int *ret, BAT *sbat, BATiter si, BATiter pi, BATiter oi, CSset *freqCSset, - int *freqThreshold, CSBats* csBats, oid *subjCSMap, BUN maxSoid, BUN maxCSoid){ +str RDFrelationships(int *ret, BAT *sbat, BATiter si, BATiter oi, + oid *subjCSMap, oid *subjSubCSMap, BUN maxSoid, BUN maxCSoid, int maxNumPwithDup){ BUN p, q; - oid *sbt, *pbt, *obt; + oid *sbt, *obt; oid curS; /* current Subject oid */ - oid curP; /* current Property oid */ - oid CSoid = 0; /* Characteristic set oid */ - int numP; /* 
Number of properties for current S */ - oid* buff; - int INIT_PROPERTY_NUM = 5000; - int maxNumProp = 0; _______________________________________________ checkin-list mailing list checkin-list@monetdb.org http://mail.monetdb.org/mailman/listinfo/checkin-list