Changeset: f087221703f5 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=f087221703f5 Modified Files: monetdb5/extras/rdf/hashmap/hashmap.c monetdb5/extras/rdf/rdfschema.c monetdb5/extras/rdf/rdfschema.h Branch: rdf Log Message:
Implement function for detecting maximum CSs from set of CSs diffs (203 lines): diff --git a/monetdb5/extras/rdf/hashmap/hashmap.c b/monetdb5/extras/rdf/hashmap/hashmap.c --- a/monetdb5/extras/rdf/hashmap/hashmap.c +++ b/monetdb5/extras/rdf/hashmap/hashmap.c @@ -61,37 +61,7 @@ static char intsetcmp(int* key1, int* ke return 0; } -/* Return 1 if sorted arr2[] is a subset of sorted arr1[] - * arr1 has m members, arr2 has n members - * */ -/* -static int isSubset(int* arr1, int* arr2, int m, int n) -{ - int i = 0, j = 0; - - if(m < n) - return 0; - - while( i < n && j < m ) - { - if( arr1[j] < arr2[i] ) - j++; - else if( arr1[j] == arr2[i] ) - { - j++; - i++; - } - else if( arr1[j] > arr2[i] ) - return 0; - } - - if( i < n ) - return 0; - else - return 1; -} -*/ /* * Return the integer of the location in data diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -103,6 +103,7 @@ CS* creatCS(int subId, int numP, int* bu cs->subIdx = subId; cs->numProp = numP; cs->numAllocation = numP; + cs->isSubset = 0; /*By default, this CS is not known to be a subset of any other CS*/ return cs; } @@ -139,6 +140,7 @@ static void putaCStoHash(map_t csmap, in } else{ if (isStoreFreqCS == 1){ /* Store the frequent CS to the CSset*/ + printf("FreqCS: Support = %d, Threshold %d \n ", freq, freqThreshold); if (freq == freqThreshold){ freqCS = creatCS(*getCSoid, num, buff); addCStoSet(freqCSset, *freqCS); @@ -149,6 +151,80 @@ static void putaCStoHash(map_t csmap, in } +/* Return 1 if sorted arr2[] is a subset of sorted arr1[] + * arr1 has m members, arr2 has n members + * */ + +static int isSubset(int* arr1, int* arr2, int m, int n) +{ + int i = 0, j = 0; + + if(m < n) + return 0; + + while( i < n && j < m ) + { + if( arr1[j] < arr2[i] ) + j++; + else if( arr1[j] == arr2[i] ) + { + j++; + i++; + } + else if( arr1[j] > arr2[i] ) + return 0; + } + + if( i < n ) + return 0; + else + return 
1; +} + +static +void printCS(CS cs){ + int i; + printf("CS %d: ", cs.subIdx); + for (i = 0; i < cs.numProp; i++){ + printf(" %d ", cs.lstProp[i]); + } + printf("\n"); +} + +/* + * Get the maximum frequent CSs from a CSset + * Here maximum frequent CS is a CS that there exist no other CS which contains that CS + * */ +static +void getMaximumFreqCSs(CSset *freqCSset){ + + int numCS = freqCSset->numCSadded; + int i, j; + + printf("Maximum frequent CSs: \n"); + + for (i = 0; i < numCS; i++){ + if (freqCSset->items[i].isSubset == 1) continue; + for (j = (i+1); j < numCS; j++){ + if (isSubset(freqCSset->items[i].lstProp, freqCSset->items[j].lstProp, + freqCSset->items[i].numProp,freqCSset->items[j].numProp) == 1) { + /* CSj is a subset of CSi */ + freqCSset->items[j].isSubset = 1; + } + else if (isSubset(freqCSset->items[j].lstProp, freqCSset->items[i].lstProp, + freqCSset->items[j].numProp,freqCSset->items[i].numProp) == 1) { + /* CSi is a subset of CSj */ + freqCSset->items[i].isSubset = 1; + break; + } + + } + /* By the end, if this CS is not a subset of any other CS */ + if (freqCSset->items[i].isSubset == 0) printCS( freqCSset->items[i]); + } +} + + static void putPtoHash(map_t pmap, int value, oid *poid, int support){ @@ -252,7 +328,7 @@ static void getStatisticCSsBySupports(ma /* Extract CS from SPO triples table */ str -RDFextractCS(int *ret, bat *sbatid, bat *pbatid, int freqThreshold){ +RDFextractCS(int *ret, bat *sbatid, bat *pbatid, int *freqThreshold){ BUN p, q; BAT *sbat = NULL, *pbat = NULL; BATiter si, pi; /*iterator for BAT of s,p columns in spo table */ @@ -286,11 +362,12 @@ RDFextractCS(int *ret, bat *sbatid, bat numP = 0; curP = 0; + printf("freqThreshold = %d \n", *freqThreshold); BATloop(sbat, p, q){ bt = (oid *) BUNtloc(si, p); if (*bt != curS){ if (p != 0){ /* Not the first S */ - putaCStoHash(csMap, buff, numP, &CSoid, 1, freqThreshold, freqCSset); + putaCStoHash(csMap, buff, numP, &CSoid, 1, *freqThreshold, freqCSset); if (numP > maxNumProp) 
maxNumProp = numP; @@ -316,7 +393,7 @@ RDFextractCS(int *ret, bat *sbatid, bat } /*put the last CS */ - putaCStoHash(csMap, buff, numP, &CSoid, 1, freqThreshold, freqCSset ); + putaCStoHash(csMap, buff, numP, &CSoid, 1, *freqThreshold, freqCSset ); if (numP > maxNumProp) maxNumProp = numP; @@ -325,6 +402,9 @@ RDFextractCS(int *ret, bat *sbatid, bat printf("Number of frequent CSs is: %d \n", freqCSset->numCSadded); /*get the statistic */ + + getMaximumFreqCSs(freqCSset); + getTopFreqCSs(csMap,20); getStatisticCSsBySize(csMap,maxNumProp); diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h --- a/monetdb5/extras/rdf/rdfschema.h +++ b/monetdb5/extras/rdf/rdfschema.h @@ -24,7 +24,7 @@ rdf_export str RDFSchemaExplore(int *ret, str *tbname, str *clname); rdf_export str -RDFextractCS(int *ret, bat *sbatid, bat *pbatid, int freqThreshold); +RDFextractCS(int *ret, bat *sbatid, bat *pbatid, int *freqThreshold); rdf_export str RDFextractPfromPSO(int *ret, bat *pbatid, bat *sbatid); @@ -35,6 +35,7 @@ typedef struct CS int* lstProp; //List of properties' Ids int numProp; int numAllocation; + char isSubset; } CS; #define INIT_NUM_CS 100 _______________________________________________ checkin-list mailing list checkin-list@monetdb.org http://mail.monetdb.org/mailman/listinfo/checkin-list