Changeset: b443cd8459e9 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=b443cd8459e9 Modified Files: monetdb5/extras/rdf/rdfschema.c monetdb5/extras/rdf/rdfschema.h Branch: rdf Log Message:
add properties with high tfidf scores to sample data diffs (170 lines): diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -5763,7 +5763,7 @@ void getTblName(str *name, oid nameId, B #if NO_OUTPUTFILE == 0 static -str printSampleData(CSSample *csSample, CSset *freqCSset, BAT *mbat, int num, int sampleVersion){ +str printSampleData(CSSample *csSample, CSset *freqCSset, BAT *mbat, int num, int sampleVersion, PropStat *propStat){ int i,j, k; FILE *fout, *fouttb, *foutis; @@ -5777,6 +5777,8 @@ str printSampleData(CSSample *csSample, CSSample sample; CS freqCS; int* propOrder; + int* propOrderTfidf; + float* tfidfValues; int numPropsInSampleTable; char objType = 0; str objStr; @@ -5883,14 +5885,19 @@ str printSampleData(CSSample *csSample, // Compute property order (descending by support) and number of properties that are printed if (sampleVersion > 1) { - numPropsInSampleTable = (sample.numProp>NUM_PROPS_IN_SAMPLE_DATA)?NUM_PROPS_IN_SAMPLE_DATA:sample.numProp; + int found = 0; + numPropsInSampleTable = (sample.numProp>(1+NUM_PROP_SUPPORT_SAMPLE+NUM_PROP_TFIDF_SAMPLE))?(1+NUM_PROP_SUPPORT_SAMPLE+NUM_PROP_TFIDF_SAMPLE):sample.numProp; propOrder = GDKmalloc(sizeof(int) * sample.numProp); + propOrderTfidf = GDKmalloc(sizeof(int) * sample.numProp); + tfidfValues = GDKmalloc(sizeof(float) * sample.numProp); for (j = 0; j < sample.numProp; ++j) { propOrder[j] = j; - } - - // insertion sort - // do not sort "Subject" (first property), it should remain at the first position + propOrderTfidf[j] = j; + } + + // To get the top <NUM_PROP_SUPPORT_SAMPLE> properties, sort all properties descending by support. + // The "subject" column remains at the first position regardless of its support. + // Sort using insertion sort. for (j = 2; j < sample.numProp; ++j) { int tmpPos = propOrder[j]; int tmpVal = freqCS.lstPropSupport[tmpPos]; @@ -5902,6 +5909,51 @@ str printSampleData(CSSample *csSample, propOrder[k + 1] = tmpPos; } + // To get the top <NUM_PROP_TFIDF_SAMPLE> properties, sort all properties descending by tf-idf score. + for (j = 1; j < sample.numProp; ++j) { + float tfidf; + BUN bun = BUNfnd(BATmirror(propStat->pBat),(ptr) &sample.lstProp[j]); + if (bun == BUN_NONE) { + printf("Error: property not found\n"); + } else { + tfidf = propStat->tfidfs[bun]; + } + tfidfValues[j] = tfidf; + } + + // Sort using insertion sort. Ignore "subject" column + for (j = 2; j < sample.numProp; ++j) { + int tmpPos = propOrderTfidf[j]; + float tmpVal = tfidfValues[tmpPos]; + int k = j - 1; + while (k >= 1 && tfidfValues[propOrderTfidf[k]] < tmpVal) { // sort descending + propOrderTfidf[k + 1] = propOrderTfidf[k]; + k--; + } + propOrderTfidf[k + 1] = tmpPos; + } + + // Add <NUM_PROP_TFIDF_SAMPLE> properties to propOrder that have a high tfidf score but are not yet in the top 1+NUM_PROP_TFIDF_SAMPLE values of propOrder + for (j = 1; j < sample.numProp; ++j) { + int prop, foundProp, bound; + if (found == NUM_PROP_TFIDF_SAMPLE) break; + prop = propOrderTfidf[j]; + // check if prop is already choosen + foundProp = 0; + bound = (1+NUM_PROP_SUPPORT_SAMPLE)>sample.numProp?sample.numProp:(1+NUM_PROP_SUPPORT_SAMPLE); //minimum + for (k = 1; k < bound; ++k) { + if (propOrder[k] == prop) { + foundProp = 1; + break; + } + } + if (!foundProp) { + // add prop to propOrder + // overwriting values is okay because the original values at position >= (1+NUM_PROP_SUPPORT_SAMPLE) in propOrder are not needed anymore + propOrder[1+NUM_PROP_SUPPORT_SAMPLE+found] = prop; + found++; + } + } } else { numPropsInSampleTable = sample.numProp; // all properties, no change in order because freqCS.lstPropSupport[] is not yet available } @@ -6069,7 +6121,11 @@ str printSampleData(CSSample *csSample, } - if (sampleVersion > 1) GDKfree(propOrder); + if (sampleVersion > 1) { + GDKfree(propOrder); + GDKfree(propOrderTfidf); + GDKfree(tfidfValues); + } } fclose(fout); @@ -6837,7 +6893,7 @@ void printFKMultiplicityFromCSPropTypes( #if NO_OUTPUTFILE == 0 static str getSampleData(int *ret, bat *mapbatid, int numTables, CSset* freqCSset, BAT *sbat, BATiter si, BATiter pi, BATiter oi, int* mTblIdxFreqIdxMapping, - CSlabel* labels, int* csTblIdxMapping, int maxNumPwithDup, oid* subjCSMap, int sampleVersion){ + CSlabel* labels, int* csTblIdxMapping, int maxNumPwithDup, oid* subjCSMap, int sampleVersion, PropStat *propStat){ BAT *outputBat = NULL, *mbat = NULL; CSSample *csSample; @@ -6858,7 +6914,7 @@ str getSampleData(int *ret, bat *mapbati initSampleData(csSample, outputBat, freqCSset, mTblIdxFreqIdxMapping, labels); RDFExtractSampleData(ret, sbat, si, pi, oi, subjCSMap, csTblIdxMapping, maxNumPwithDup, csSample, outputBat, numSampleTbl); printsubsetFromCSset(freqCSset, outputBat, mbat, numSampleTbl, mTblIdxFreqIdxMapping, labels, sampleVersion); - printSampleData(csSample, freqCSset, mbat, numSampleTbl, sampleVersion); + printSampleData(csSample, freqCSset, mbat, numSampleTbl, sampleVersion, propStat); freeSampleData(csSample, numSampleTbl); BBPreclaim(outputBat); BBPunfix(mbat->batCacheid); @@ -7301,7 +7357,7 @@ RDFextractCSwithTypes(int *ret, bat *sba #if NO_OUTPUTFILE == 0 getSampleData(ret, mapbatid, numTables, freqCSset, sbat, si, pi, oi, - mTblIdxFreqIdxMapping, *labels, csTblIdxMapping, *maxNumPwithDup, *subjCSMap, 1); + mTblIdxFreqIdxMapping, *labels, csTblIdxMapping, *maxNumPwithDup, *subjCSMap, 1, 0); // last parameter (propStat) is null, but is not used when sampleVersion==1 #endif @@ -8828,7 +8884,17 @@ RDFreorganize(int *ret, CStableStat *cst /* Extract sample data for the evaluation */ #if NO_OUTPUTFILE == 0 - getSampleData(ret, mapbatid, numTables, freqCSset, sbat, si, pi, oi, mTblIdxFreqIdxMapping, labels, csTblIdxMapping, maxNumPwithDup, subjCSMap, 2); + { + int curNumMergeCS = countNumberMergeCS(freqCSset); + oid* mergeCSFreqCSMap = (oid*) malloc(sizeof(oid) * curNumMergeCS); + PropStat *propStat2; + initMergeCSFreqCSMap(freqCSset, mergeCSFreqCSMap); + propStat2 = initPropStat(); + getPropStatisticsFromMergeCSs(propStat2, curNumMergeCS, mergeCSFreqCSMap, freqCSset); + getSampleData(ret, mapbatid, numTables, freqCSset, sbat, si, pi, oi, mTblIdxFreqIdxMapping, labels, csTblIdxMapping, maxNumPwithDup, subjCSMap, 2, propStat2); + freePropStat(propStat2); + free(mergeCSFreqCSMap); + } #endif diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h --- a/monetdb5/extras/rdf/rdfschema.h +++ b/monetdb5/extras/rdf/rdfschema.h @@ -399,7 +399,8 @@ typedef struct CSPropTypes { #define NUM_SAMPLE_CANDIDATE 999 // print all candidates #define SAMPLE_FILTER_THRESHOLD 10 // SAMPLE_FILTER_THRESHOLD/ 100 #define GETSAMPLE_BEFOREMERGING 1 // Get the sample data before merging CS's -#define NUM_PROPS_IN_SAMPLE_DATA 8 // how many properties should be printed (including subject column) +#define NUM_PROP_SUPPORT_SAMPLE 5 // how many properties should be added to the sample data because of a high support (excluding subject column) +#define NUM_PROP_TFIDF_SAMPLE 2 // how many properties should be added to the sample data because of a high tfidf score typedef struct CSSample{ int freqIdx; _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list