Changeset: c8adf21bfcde for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=c8adf21bfcde
Added Files:
	sql/server/rel_rdfscan.c
	sql/server/rel_rdfscan.h
Modified Files:
	monetdb5/extras/rdf/rdflabels.c
	monetdb5/extras/rdf/rdflabels.h
	monetdb5/optimizer/opt_pipes.c
	sql/backends/monet5/rel_bin.c
	sql/backends/monet5/sql.c
	sql/backends/monet5/sql_rdf.h
	sql/backends/monet5/sql_rdf_jgraph.c
	sql/include/sql_relation.h
	sql/server/Makefile.ag
	sql/server/rel_distribute.c
	sql/server/rel_dump.c
	sql/server/rel_optimizer.c
	sql/server/rel_select.c
Branch: rdf
Log Message:
Initial step in creating rdfscan + add the computation and usage of rankscore in labeling diffs (truncated from 970 to 300 lines): diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -403,6 +403,80 @@ getStringBetweenQuotes(str* out, str in) } } +#if TYPE_TFIDF_RANKING +/* + * Init the BATs for storing all type oids and their frequency + * */ +static +void initGlobalTypeBATs(BAT **glTypeValueBat, BAT **glTypeFreqBat){ + + *glTypeValueBat = BATnew(TYPE_void, TYPE_oid, smallbatsz, TRANSIENT); + BATseqbase(*glTypeValueBat, 0); + if (*glTypeValueBat == NULL) { + fprintf(stderr, "ERROR: Couldn't create BAT!\n"); + } + + (void)BAThash(*glTypeValueBat,0); + if (!((*glTypeValueBat)->T->hash)){ + fprintf(stderr, "ERROR: Couldn't create Hash for BAT!\n"); + } + + *glTypeFreqBat = BATnew(TYPE_void, TYPE_int, smallbatsz, TRANSIENT); + if (*glTypeFreqBat == NULL) { + fprintf(stderr, "ERROR: Couldn't create BAT!\n"); + } + +} + +static +void freeGlobalTypeBATs(BAT *glTypeValueBat, BAT *glTypeFreqBat){ + BBPunfix(glTypeValueBat->batCacheid); + BBPunfix(glTypeFreqBat->batCacheid); +} + +static +void addGlobalType(oid typevalue, BAT *glTypeValueBat, BAT *glTypeFreqBat){ + oid tmp; + BUN bun; + int freq; + + tmp = typevalue; + bun = BUNfnd(glTypeValueBat,(ptr) &tmp); + if (bun == BUN_NONE){ //New type value + if (glTypeValueBat->T->hash && BATcount(glTypeValueBat) > 4 * glTypeValueBat->T->hash->mask) { + HASHdestroy(glTypeValueBat); + BAThash(glTypeValueBat, 2*BATcount(glTypeValueBat)); + } + BUNappend(glTypeValueBat,&tmp, TRUE); + freq = 1; + BUNappend(glTypeFreqBat, &freq, TRUE); + } else{ + int *curfreq = (int *)Tloc(glTypeFreqBat, bun); + (*curfreq)++; + } +} + +static +int getTypeGlobalFrequency(oid typevalue, BAT *glTypeValueBat, BAT *glTypeFreqBat){ + + oid tmp; + BUN bun; + int ret = -1; + + tmp = typevalue; + bun = BUNfnd(glTypeValueBat,(ptr) &tmp); + if (bun == 
BUN_NONE){ //New type value + fprintf(stderr, "ERROR: This typevalue must be there!\n"); + } else{ + int *freq = (int *)Tloc(glTypeFreqBat, bun); + ret = *freq; + return ret; + } + return ret; +} +#endif + + #if USE_TYPE_NAMES static int compareTypeAttributesFreqs (const void * a, const void * b) { @@ -413,7 +487,7 @@ int compareTypeAttributesFreqs (const vo #if USE_TYPE_NAMES /* Add type values to the histogram. Values that are not present in the hierarchy tree built from the ontologies are NOT added to the histogram. */ static -void insertValuesIntoTypeAttributesHistogram(oid* typeList, int typeListLength, TypeAttributesFreq*** typeAttributesHistogram, int** typeAttributesHistogramCount, int csFreqIdx, int type, BAT *ontmetaBat) { +void insertValuesIntoTypeAttributesHistogram(oid* typeList, int typeListLength, TypeAttributesFreq*** typeAttributesHistogram, int** typeAttributesHistogramCount, int csFreqIdx, int type, BAT *ontmetaBat, BAT *glTypeValueBat, BAT *glTypeFreqBat) { int i, j; int fit; (void) ontmetaBat; @@ -444,9 +518,19 @@ void insertValuesIntoTypeAttributesHisto typeAttributesHistogram[csFreqIdx][type][typeAttributesHistogramCount[csFreqIdx][type] - 1].value = typeList[i]; typeAttributesHistogram[csFreqIdx][type][typeAttributesHistogramCount[csFreqIdx][type] - 1].freq = 1; } + + //Add to global types + #if TYPE_TFIDF_RANKING + addGlobalType(typeList[i], glTypeValueBat, glTypeFreqBat); + #else + (void) glTypeValueBat; + (void) glTypeFreqBat; + #endif } } + + /* Loop through all subjects to collect frequency statistics for type attribute values. 
*/ static void createTypeAttributesHistogram(BAT *sbat, BATiter si, BATiter pi, BATiter oi, oid *subjCSMap, CSset *freqCSset, int *csIdFreqIdxMap, int typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** typeAttributesHistogramCount, char** typeAttributes, BAT *ontmetaBat) { @@ -461,12 +545,21 @@ void createTypeAttributesHistogram(BAT * oid *typeValues; // list of type values per subject and type int typeValuesSize; int typeValuesMaxSize = 10; + int numS = 0; // histogram int i, j, k; oid *typeAttributesOids = malloc(sizeof(oid) * typeAttributesCount); + BAT *glTypeValueBat = NULL; //Store the oid of each type value + BAT *glTypeFreqBat = NULL; //Store the global frequency (#of subjects) of a type value + + #if TYPE_TFIDF_RANKING + int tmpgl_freq = 0; + initGlobalTypeBATs(&glTypeValueBat, &glTypeFreqBat); + #endif + if (BATcount(sbat) == 0) { fprintf(stderr, "sbat must not be empty"); /* otherwise, variable sbt is not initialized and thus @@ -517,10 +610,11 @@ void createTypeAttributesHistogram(BAT * } else { // analyze values and add to histogram csFreqIdx = csIdFreqIdxMap[subjCSMap[curS]]; // get csFreqIdx of last subject - insertValuesIntoTypeAttributesHistogram(typeValues, typeValuesSize, typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT, ontmetaBat); + insertValuesIntoTypeAttributesHistogram(typeValues, typeValuesSize, typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT, ontmetaBat, glTypeValueBat,glTypeFreqBat); typeValuesSize = 0; // reset } curS = *sbt; + numS++; curT = i; } // add value to list of type values @@ -539,7 +633,7 @@ void createTypeAttributesHistogram(BAT * // analyze and add last set of typeValues if (curS != BUN_NONE && typeValuesSize != 0) { csFreqIdx = csIdFreqIdxMap[subjCSMap[curS]]; // get csFreqIdx of last subject - insertValuesIntoTypeAttributesHistogram(typeValues, typeValuesSize, typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT, ontmetaBat); + 
insertValuesIntoTypeAttributesHistogram(typeValues, typeValuesSize, typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT, ontmetaBat, glTypeValueBat,glTypeFreqBat); } GDKfree(typeValues); @@ -551,18 +645,27 @@ void createTypeAttributesHistogram(BAT * } } - // assign percentage + (void) numS; + // assign percentage and tf-idf ranking score for (i = 0; i < freqCSset->numCSadded; ++i) { for (j = 0; j < typeAttributesCount; ++j) { // assign percentage values for every value for (k = 0; k < typeAttributesHistogramCount[i][j]; ++k) { typeAttributesHistogram[i][j][k].percent = (int) (100.0 * typeAttributesHistogram[i][j][k].freq / freqCSset->items[i].support + 0.5); - + #if TYPE_TFIDF_RANKING + tmpgl_freq = getTypeGlobalFrequency(typeAttributesHistogram[i][j][k].value, glTypeValueBat, glTypeFreqBat); + typeAttributesHistogram[i][j][k].rankscore = ((float) typeAttributesHistogram[i][j][k].percent * numS) / (float) tmpgl_freq; + //printf("numS = %d, oid "BUNFMT", typeAttributesHistogram[i][j][k].freq = %d, tmpgl_freq = %d, percent = %d , rankscore = %f\n", + // numS, typeAttributesHistogram[i][j][k].value, typeAttributesHistogram[i][j][k].freq, tmpgl_freq, typeAttributesHistogram[i][j][k].percent, typeAttributesHistogram[i][j][k].rankscore); + #endif } } } free(typeAttributesOids); + #if TYPE_TFIDF_RANKING + freeGlobalTypeBATs(glTypeValueBat, glTypeFreqBat); + #endif } #endif @@ -1316,6 +1419,13 @@ void getTableName(CSlabel* label, CSset* char nameFound = 0; oid maxDepthOid; int maxFreq; + + #if TYPE_TFIDF_RANKING + oid maxRankscoreOid; + float maxRankscore = 0.0; + float tmprankscore = 0.0; + int maxRankscoreFreq; + #endif //for choosing the right type values BUN ontClassPos; @@ -1339,7 +1449,7 @@ void getTableName(CSlabel* label, CSset* label->nameFreq = 0; label->ontologySimScore = 0.0; #endif - + for (i = 0; i < typeAttributesCount; ++i) { foundOntologyTypeValue = 0; if (typeAttributesHistogramCount[csIdx][i] == 0) continue; @@ -1373,6 +1483,12 @@ void 
getTableName(CSlabel* label, CSset* // of all values that are >= TYPE_FREQ_THRESHOLD, choose the value with the highest hierarchy level ("deepest" value) maxDepthOid = typeAttributesHistogram[csIdx][i][0].value; maxFreq = typeAttributesHistogram[csIdx][i][0].freq; + #if TYPE_TFIDF_RANKING + maxRankscore = typeAttributesHistogram[csIdx][i][0].rankscore; + maxRankscoreOid = typeAttributesHistogram[csIdx][i][0].value; + maxRankscoreFreq = typeAttributesHistogram[csIdx][i][0].freq; + #endif + ontClassPos = BUNfnd(ontmetaBat, &maxDepthOid); if ( ontClassPos != BUN_NONE){ foundOntologyTypeValue = 1; @@ -1405,6 +1521,14 @@ void getTableName(CSlabel* label, CSset* maxFreq = freq; } + #if TYPE_TFIDF_RANKING + tmprankscore = typeAttributesHistogram[csIdx][i][j].rankscore; + if (tmprankscore > maxRankscore){ + maxRankscore = tmprankscore; + maxRankscoreOid = typeAttributesHistogram[csIdx][i][j].value; + maxRankscoreFreq = typeAttributesHistogram[csIdx][i][j].freq; + } + #endif } } @@ -1414,6 +1538,10 @@ void getTableName(CSlabel* label, CSset* if (foundOntologyTypeValue){ choosenOntologyTypeValue = maxDepthOid; choosenFreq = maxFreq; + #if TYPE_TFIDF_RANKING + choosenOntologyTypeValue = maxRankscoreOid; + choosenFreq = maxRankscoreFreq; + #endif } } @@ -2027,6 +2155,7 @@ CSlabel* createLabels(CSset* freqCSset, clock_t tmpLastT; + str schema = "rdf"; int ret; diff --git a/monetdb5/extras/rdf/rdflabels.h b/monetdb5/extras/rdf/rdflabels.h --- a/monetdb5/extras/rdf/rdflabels.h +++ b/monetdb5/extras/rdf/rdflabels.h @@ -28,6 +28,7 @@ typedef struct TypeAttributesFreq { oid value; int freq; int percent; + float rankscore; //= percent / global } TypeAttributesFreq; // Statistics for a foreign key relationship @@ -106,6 +107,7 @@ enum { #define ONLY_USE_ONTOLOGYBASED_TYPE 0 #define USE_BEST_TYPEVALUE_INSTEADOF_DUMMY 1 //Use the most frequent type value instead of a dummy for the label name #define MIN_POSSIBLE_TYPE_FREQ_THRESHOLD 20 //However, that type must still appears in more than 
a minimum threshold +#define TYPE_TFIDF_RANKING 1 //Rank value of type property by using (percent in a CS) / (percent in all subjects) rdf_export void getPropNameShort(char** name, char* propStr); diff --git a/monetdb5/optimizer/opt_pipes.c b/monetdb5/optimizer/opt_pipes.c --- a/monetdb5/optimizer/opt_pipes.c +++ b/monetdb5/optimizer/opt_pipes.c @@ -116,6 +116,31 @@ static struct PIPELINES { "optimizer.generator();" "optimizer.garbageCollector();", "stable", NULL, NULL, 1}, +/* The rdf_opt_pipe is identical to the no_mitosis_pipe + * which is used for rdf/sparql queries + */ + {"rdf_opt_pipe", + "optimizer.inline();" + "optimizer.remap();" + "optimizer.costModel();" + "optimizer.coercions();" + "optimizer.evaluate();" + "optimizer.aliases();" + "optimizer.pushselect();" + "optimizer.mergetable();" + "optimizer.deadcode();" + "optimizer.commonTerms();" + "optimizer.joinPath();" + "optimizer.reorder();" + "optimizer.deadcode();" _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list