Changeset: b443cd8459e9 for MonetDB
Modified Files:
Branch: rdf
Log Message:

add properties with high tfidf scores to sample data

diffs (170 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -5763,7 +5763,7 @@ void getTblName(str *name, oid nameId, B
 #if NO_OUTPUTFILE == 0 
-str printSampleData(CSSample *csSample, CSset *freqCSset, BAT *mbat, int num, 
int sampleVersion){
+str printSampleData(CSSample *csSample, CSset *freqCSset, BAT *mbat, int num, 
int sampleVersion, PropStat *propStat){
        int     i,j, k; 
        FILE    *fout, *fouttb, *foutis; 
@@ -5777,6 +5777,8 @@ str printSampleData(CSSample *csSample, 
        CSSample        sample; 
        CS              freqCS; 
        int*    propOrder;
+       int*    propOrderTfidf;
+       float*  tfidfValues;
        int     numPropsInSampleTable;
        char    objType = 0; 
        str     objStr;         
@@ -5883,14 +5885,19 @@ str printSampleData(CSSample *csSample, 
                // Compute property order (descending by support) and number of 
properties that are printed
                if (sampleVersion > 1) {
-                       numPropsInSampleTable = 
+                       int found = 0;
+                       numPropsInSampleTable = 
                        propOrder = GDKmalloc(sizeof(int) * sample.numProp);
+                       propOrderTfidf = GDKmalloc(sizeof(int) * 
+                       tfidfValues = GDKmalloc(sizeof(float) * sample.numProp);
                        for (j = 0; j < sample.numProp; ++j) {
                                propOrder[j] = j;
-                       }
-                       // insertion sort
-                       // do not sort "Subject" (first property), it should 
remain at the first position
+                               propOrderTfidf[j] = j;
+                       }
+                       // To get the top <NUM_PROP_SUPPORT_SAMPLE> properties, 
sort all properties descending by support.
+                       // The "subject" column remains at the first position 
regardless of its support.
+                       // Sort using insertion sort.
                        for (j = 2; j < sample.numProp; ++j) {
                                int tmpPos = propOrder[j];
                                int tmpVal = freqCS.lstPropSupport[tmpPos];
@@ -5902,6 +5909,51 @@ str printSampleData(CSSample *csSample, 
                                propOrder[k + 1] = tmpPos;
+                       // To get the top <NUM_PROP_TFIDF_SAMPLE> properties, 
sort all properties descending by tf-idf score.
+                       for (j = 1; j < sample.numProp; ++j) {
+                               float tfidf;
+                               BUN bun = 
BUNfnd(BATmirror(propStat->pBat),(ptr) &sample.lstProp[j]);
+                               if (bun == BUN_NONE) {
+                                       printf("Error: property not found\n");
+                               } else {
+                                       tfidf = propStat->tfidfs[bun];
+                               }
+                               tfidfValues[j] = tfidf;
+                       }
+                       // Sort using insertion sort. Ignore "subject" column
+                       for (j = 2; j < sample.numProp; ++j) {
+                               int tmpPos = propOrderTfidf[j];
+                               float tmpVal = tfidfValues[tmpPos];
+                               int k = j - 1;
+                               while (k >= 1 && tfidfValues[propOrderTfidf[k]] 
< tmpVal) { // sort descending
+                                       propOrderTfidf[k + 1] = 
+                                       k--;
+                               }
+                               propOrderTfidf[k + 1] = tmpPos;
+                       }
+                       // Add <NUM_PROP_TFIDF_SAMPLE> properties to propOrder 
that have a high tfidf score but are not yet in the top 1+NUM_PROP_TFIDF_SAMPLE 
values of propOrder
+                       for (j = 1; j < sample.numProp; ++j) {
+                               int prop, foundProp, bound;
+                               if (found == NUM_PROP_TFIDF_SAMPLE) break;
+                               prop = propOrderTfidf[j];
+                               // check if prop is already chosen
+                               foundProp = 0;
+                               bound = 
+                               for (k = 1; k < bound; ++k) {
+                                       if (propOrder[k] == prop) {
+                                               foundProp = 1;
+                                               break;
+                                       }
+                               }
+                               if (!foundProp) {
+                                       // add prop to propOrder
+                                       // overwriting values is okay because 
the original values at position >= (1+NUM_PROP_SUPPORT_SAMPLE) in propOrder are 
not needed anymore
propOrder[1+NUM_PROP_SUPPORT_SAMPLE+found] = prop;
+                                       found++;
+                               }
+                       }
                } else {
                        numPropsInSampleTable = sample.numProp; // all 
properties, no change in order because freqCS.lstPropSupport[] is not yet 
@@ -6069,7 +6121,11 @@ str printSampleData(CSSample *csSample, 
-               if (sampleVersion > 1) GDKfree(propOrder);
+               if (sampleVersion > 1) {
+                       GDKfree(propOrder);
+                       GDKfree(propOrderTfidf);
+                       GDKfree(tfidfValues);
+               }
@@ -6837,7 +6893,7 @@ void printFKMultiplicityFromCSPropTypes(
 #if NO_OUTPUTFILE == 0 
 str getSampleData(int *ret, bat *mapbatid, int numTables, CSset* freqCSset, 
BAT *sbat, BATiter si, BATiter pi, BATiter oi, int* mTblIdxFreqIdxMapping, 
-               CSlabel* labels, int* csTblIdxMapping, int maxNumPwithDup, oid* 
subjCSMap, int sampleVersion){
+               CSlabel* labels, int* csTblIdxMapping, int maxNumPwithDup, oid* 
subjCSMap, int sampleVersion, PropStat *propStat){
        BAT             *outputBat = NULL, *mbat = NULL;
        CSSample        *csSample; 
@@ -6858,7 +6914,7 @@ str getSampleData(int *ret, bat *mapbati
        initSampleData(csSample, outputBat, freqCSset, mTblIdxFreqIdxMapping, 
        RDFExtractSampleData(ret, sbat, si, pi, oi, subjCSMap, csTblIdxMapping, 
maxNumPwithDup, csSample, outputBat, numSampleTbl);
        printsubsetFromCSset(freqCSset, outputBat, mbat, numSampleTbl, 
mTblIdxFreqIdxMapping, labels, sampleVersion);
-       printSampleData(csSample, freqCSset, mbat, numSampleTbl, sampleVersion);
+       printSampleData(csSample, freqCSset, mbat, numSampleTbl, sampleVersion, 
        freeSampleData(csSample, numSampleTbl);
@@ -7301,7 +7357,7 @@ RDFextractCSwithTypes(int *ret, bat *sba
        #if NO_OUTPUTFILE == 0 
        getSampleData(ret, mapbatid, numTables, freqCSset, sbat, si, pi, oi, 
-                       mTblIdxFreqIdxMapping, *labels, csTblIdxMapping, 
*maxNumPwithDup, *subjCSMap, 1); 
+                       mTblIdxFreqIdxMapping, *labels, csTblIdxMapping, 
*maxNumPwithDup, *subjCSMap, 1, 0); // last parameter (propStat) is null, but 
is not used when sampleVersion==1
@@ -8828,7 +8884,17 @@ RDFreorganize(int *ret, CStableStat *cst
        /* Extract sample data for the evaluation */
        #if NO_OUTPUTFILE == 0 
-       getSampleData(ret, mapbatid, numTables, freqCSset, sbat, si, pi, oi, 
mTblIdxFreqIdxMapping, labels, csTblIdxMapping, maxNumPwithDup, subjCSMap, 2); 
+       {
+       int curNumMergeCS = countNumberMergeCS(freqCSset);
+       oid* mergeCSFreqCSMap = (oid*) malloc(sizeof(oid) * curNumMergeCS);
+       PropStat *propStat2;
+        initMergeCSFreqCSMap(freqCSset, mergeCSFreqCSMap);
+       propStat2 = initPropStat();
+       getPropStatisticsFromMergeCSs(propStat2, curNumMergeCS, 
mergeCSFreqCSMap, freqCSset);
+       getSampleData(ret, mapbatid, numTables, freqCSset, sbat, si, pi, oi, 
mTblIdxFreqIdxMapping, labels, csTblIdxMapping, maxNumPwithDup, subjCSMap, 2, 
+       freePropStat(propStat2);
+       free(mergeCSFreqCSMap);
+       }
diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h
--- a/monetdb5/extras/rdf/rdfschema.h
+++ b/monetdb5/extras/rdf/rdfschema.h
@@ -399,7 +399,8 @@ typedef struct CSPropTypes {
 #define NUM_SAMPLE_CANDIDATE 999 // print all candidates
 #define GETSAMPLE_BEFOREMERGING 1  // Get the sample data before merging CS's
-#define NUM_PROPS_IN_SAMPLE_DATA 8 // how many properties should be printed 
(including subject column)
+#define NUM_PROP_SUPPORT_SAMPLE 5 // how many properties should be added to 
the sample data because of a high support (excluding subject column)
+#define NUM_PROP_TFIDF_SAMPLE 2 // how many properties should be added to the 
sample data because of a high tfidf score
 typedef struct CSSample{
        int     freqIdx;
