Changeset: c78661b8a206 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=c78661b8a206
Modified Files:
	monetdb5/extras/rdf/rdfschema.c
	monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
change sampleData generation to generate only 8 columns (ordered by support) diffs (169 lines): diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -761,14 +761,6 @@ char isInfrequentProp(PropTypes pt, CS c #if NO_OUTPUTFILE == 0 static -char isInfrequentSampleProp(CS freqCS, int propIdx){ - if (freqCS.lstPropSupport[propIdx] * 100 < freqCS.support * SAMPLE_FILTER_THRESHOLD) return 1; - else return 0; -} -#endif - -#if NO_OUTPUTFILE == 0 -static char isInfrequentSampleCol(CS freqCS, PropTypes pt){ if (pt.propFreq * 100 < freqCS.support * SAMPLE_FILTER_THRESHOLD) return 1; else return 0; @@ -5784,6 +5776,8 @@ str printSampleData(CSSample *csSample, char* schema = "rdf"; CSSample sample; CS freqCS; + int* propOrder; + int numPropsInSampleTable; char objType = 0; str objStr; oid objOid = BUN_NONE; @@ -5833,7 +5827,7 @@ str printSampleData(CSSample *csSample, for (i = 0; i < num; i++){ sample = csSample[i]; freqCS = freqCSset->items[sample.freqIdx]; - fprintf(fout,"Sample table %d Candidates: ", i); + fprintf(fout,"Table %d\n", i); for (j = 0; j < (int)sample.candidateCount; j++){ //fprintf(fout," " BUNFMT,sample.candidates[j]); if (sample.candidates[j] != BUN_NONE){ @@ -5844,10 +5838,12 @@ str printSampleData(CSSample *csSample, getStringName(sample.candidates[j], &canStr, mapi, mbat, 1); #if USE_SHORT_NAMES getPropNameShort(&canStrShort, canStr); - fprintf(fout,";%s", canStrShort); + if (j+1 == (int)sample.candidateCount) fprintf(fout, "%s", canStrShort); + else fprintf(fout, "%s;", canStrShort); GDKfree(canStrShort); #else - fprintf(fout,";%s", canStr); + if (j+1 == (int)sample.candidateCount) fprintf(fout, "%s", canStr); + else fprintf(fout, "%s;", canStr); #endif GDKfree(canStr); @@ -5882,6 +5878,35 @@ str printSampleData(CSSample *csSample, else fprintf(fouttb,"CREATE TABLE tbSample%d \n (\n", i); + //Number of tuples + fprintf(fout, "%d\n", freqCS.support); 
+ + // Compute property order (descending by support) and number of properties that are printed + if (sampleVersion > 1) { + numPropsInSampleTable = (sample.numProp>NUM_PROPS_IN_SAMPLE_DATA)?NUM_PROPS_IN_SAMPLE_DATA:sample.numProp; + propOrder = GDKmalloc(sizeof(int) * sample.numProp); + for (j = 0; j < sample.numProp; ++j) { + propOrder[j] = j; + } + + // insertion sort + // do not sort "Subject" (first property), it should remain at the first position + for (j = 2; j < sample.numProp; ++j) { + int tmpPos = propOrder[j]; + int tmpVal = freqCS.lstPropSupport[tmpPos]; + int k = j - 1; + while (k >= 1 && freqCS.lstPropSupport[propOrder[k]] < tmpVal) { // sort descending + propOrder[k + 1] = propOrder[k]; + k--; + } + propOrder[k + 1] = tmpPos; + } + + } else { + numPropsInSampleTable = sample.numProp; // all properties, no change in order because freqCS.lstPropSupport[] is not yet available + } + + //List of columns fprintf(fout,"Subject"); fprintf(fouttb,"SubjectCol string"); @@ -5891,14 +5916,15 @@ str printSampleData(CSSample *csSample, isDescription = 0; isImage = 0; isSite = 0; - for (j = 0; j < sample.numProp; j++){ + for (j = 0; j < numPropsInSampleTable; j++){ + int index = j; if (sampleVersion > 1){ //Do not consider infreq Prop - if (isInfrequentSampleProp(freqCS, j)) continue; + index = propOrder[index]; // apply mapping to change order of properties } #if USE_SHORT_NAMES propStrShort = NULL; #endif - takeOid(sample.lstProp[j], &propStr); + takeOid(sample.lstProp[index], &propStr); #if USE_SHORT_NAMES getPropNameShort(&propStrShort, propStr); fprintf(fout,";%s", propStrShort); @@ -5929,7 +5955,7 @@ str printSampleData(CSSample *csSample, strcmp(propStrShort,"fax_number") == 0 || strcmp(propStrShort,"app_id") == 0 ) - fprintf(fouttb,",\n%s_%d string",propStrShort,j); + fprintf(fouttb,",\n%s_%d string",propStrShort,index); else fprintf(fouttb,",\n%s string",propStrShort); @@ -5951,19 +5977,7 @@ str printSampleData(CSSample *csSample, } fprintf(fout, "\n"); 
fprintf(fouttb, "\n); \n \n"); - - //List of support - for (j = 0; j < sample.numProp; j++){ - if (sampleVersion > 1){ //Do not consider infreq Prop - if (isInfrequentSampleProp(freqCS, j)) continue; - fprintf(fout,";%d", freqCS.lstPropSupport[j]); - } - else{ - fprintf(fout,";%d", freqCS.support); - } - } - fprintf(fout, "\n"); - + fprintf(foutis, "echo \""); //All the instances for (k = 0; k < sample.numInstances; k++){ @@ -5982,10 +5996,11 @@ str printSampleData(CSSample *csSample, GDKfree(subjStr); for (j = 0; j < sample.numProp; j++){ + int index = j; if (sampleVersion > 1){ //Do not consider infreq Prop - if (isInfrequentSampleProp(freqCS, j)) continue; + index = propOrder[index]; // apply mapping to change order of properties } - objOid = sample.lstObj[j][k]; + objOid = sample.lstObj[index][k]; if (objOid == BUN_NONE){ fprintf(fout,";NULL"); fprintf(foutis,"|NULL"); @@ -6054,6 +6069,7 @@ str printSampleData(CSSample *csSample, } + if (sampleVersion > 1) GDKfree(propOrder); } fclose(fout); diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h --- a/monetdb5/extras/rdf/rdfschema.h +++ b/monetdb5/extras/rdf/rdfschema.h @@ -399,6 +399,7 @@ typedef struct CSPropTypes { #define NUM_SAMPLE_CANDIDATE 3 #define SAMPLE_FILTER_THRESHOLD 10 // SAMPLE_FILTER_THRESHOLD/ 100 #define GETSAMPLE_BEFOREMERGING 1 // Get the sample data before merging CS's +#define NUM_PROPS_IN_SAMPLE_DATA 8 // how many properties should be printed (including subject column) typedef struct CSSample{ int freqIdx; _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list