Changeset: cf71a2dd1319 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=cf71a2dd1319
Modified Files:
	monetdb5/extras/rdf/rdf_shredder.c
	monetdb5/extras/rdf/rdfschema.c
	monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Fix bug in rdf_shredder (while assigning value for objtype) which also caused the problem in exploring CS properties' types. diffs (221 lines): diff --git a/monetdb5/extras/rdf/rdf_shredder.c b/monetdb5/extras/rdf/rdf_shredder.c --- a/monetdb5/extras/rdf/rdf_shredder.c +++ b/monetdb5/extras/rdf/rdf_shredder.c @@ -158,7 +158,7 @@ rdf_BUNappend_unq_ForObj(parserData* pda /* Add the type here by changing 2 bits at position 62, 63 of oid */ *bun |= (BUN)objType << (sizeof(BUN)*8 - 4); - + //b = BUNappend(b, (ptr) (str)objStr, TRUE); b = BUNins(b, (ptr) bun, (ptr) (str)objStr, TRUE); @@ -235,7 +235,7 @@ char isInt(char *input){ static ObjectType getObjectType(unsigned char* objStr, BUN *realNumValue){ - ObjectType obType; + ObjectType obType = STRING; unsigned char* endpart; char* valuepart; const char* pos = NULL; @@ -275,6 +275,8 @@ getObjectType(unsigned char* objStr, BUN //printf("%s: String \n", objStr); } } + else + obType = STRING; return obType; } @@ -362,7 +364,7 @@ tripleHandler(void* user_data, const rap ObjectType objType = STRING; objStr = raptor_term_to_string(triple->object); objType = getObjectType(objStr, &realNumValue); - + rdf_BUNappend_unq_ForObj(pdata, graph[MAP_LEX], (str)objStr, objType, &bun); rdf_BUNappend(pdata, graph[O_sort], &bun); diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -722,7 +722,21 @@ CSPropTypes* initCSPropTypes(CSset* freq return csPropTypes; } - +static +void printCSPropTypes(CSPropTypes* csPropTypes, int numMergedCS, CSset* freqCSset){ + int i, j, k; + + for (i = 0; i < numMergedCS; i++){ + printf("MergedCS %d (Freq: %d): \n", i, freqCSset->items[csPropTypes[i].freqCSId].support); + for(j = 0; j < csPropTypes[i].numProp; j++){ + printf(" P " BUNFMT " : ", csPropTypes[i].lstPropTypes[j].prop); + for (k = 0; k < csPropTypes[i].lstPropTypes[j].numType; k++){ + printf(" Type %d (%d) | ", k, 
csPropTypes[i].lstPropTypes[j].lstFreq[k]); + } + printf("\n"); + } + } +} /* * Add types of properties * Note that the property list is sorted by prop's oids @@ -734,9 +748,11 @@ void addPropTypes(char *buffTypes, oid* int i,j; int tblId = csTblIdxMapping[csId]; + //printf("Add %d prop from CS %d to table %d \n", numP, csId, tblId); j = 0; if (tblId != -1){ for (i = 0; i < numP; i++){ + //printf(" P: " BUNFMT " Type: %d ", buffP[i], buffTypes[i]); while (csPropTypes[tblId].lstPropTypes[j].prop != buffP[i]){ j++; } @@ -745,6 +761,7 @@ void addPropTypes(char *buffTypes, oid* } } + //printf("\n"); } static @@ -4202,61 +4219,62 @@ RDFreorganize(int *ret, CStableStat *cst throw(RDF, "rdf.RDFreorganize", "Problem in extracting CSs"); } + + + printf("Start re-organizing triple store for " BUNFMT " CSs \n", maxCSoid); + + csTblIdxMapping = (int *) malloc (sizeof (int) * (maxCSoid + 1)); + initIntArray(csTblIdxMapping, (maxCSoid + 1), -1); + + mfreqIdxTblIdxMapping = (int *) malloc (sizeof (int) * freqCSset->numCSadded); + initIntArray(mfreqIdxTblIdxMapping , freqCSset->numCSadded, -1); + + mTblIdxFreqIdxMapping = (int *) malloc (sizeof (int) * freqCSset->numCSadded); // A little bit reduntdant space + initIntArray(mTblIdxFreqIdxMapping , freqCSset->numCSadded, -1); + + //Mapping from from CSId to TableIdx + initCSTableIdxMapping(freqCSset, csTblIdxMapping, mfreqIdxTblIdxMapping, mTblIdxFreqIdxMapping); + + // Init CStableStat + initCStables(cstablestat, freqCSset); + + if ((sbat = BATdescriptor(*sbatid)) == NULL) { + throw(MAL, "rdf.RDFreorganize", RUNTIME_OBJECT_MISSING); + } + + if ((obat = BATdescriptor(*obatid)) == NULL) { + BBPreleaseref(sbat->batCacheid); + throw(MAL, "rdf.RDFreorganize", RUNTIME_OBJECT_MISSING); + } + + if ((pbat = BATdescriptor(*pbatid)) == NULL) { + BBPreleaseref(sbat->batCacheid); + BBPreleaseref(obat->batCacheid); + throw(MAL, "rdf.RDFreorganize", RUNTIME_OBJECT_MISSING); + } + + si = bat_iterator(sbat); + pi = bat_iterator(pbat); + oi = 
bat_iterator(obat); + + /* Get possible types of each property in a table (i.e., mergedCS) */ + csPropTypes = initCSPropTypes(freqCSset, cstablestat->numTables); + RDFExtractCSPropTypes(ret, sbat, si, pi, oi, subjCSMap, csTblIdxMapping, csPropTypes, maxNumPwithDup); + printCSPropTypes(csPropTypes,cstablestat->numTables, freqCSset); + if (*mode == EXPLOREONLY){ printf("Only explore the schema information \n"); freeCSset(freqCSset); free(subjCSMap); free(subjdefaultMap); + free(csTblIdxMapping); + free(mfreqIdxTblIdxMapping); + free(mTblIdxFreqIdxMapping); return MAL_SUCCEED; } - - printf("Start re-organizing triple store for " BUNFMT " CSs \n", maxCSoid); - - csTblIdxMapping = (int *) malloc (sizeof (int) * (maxCSoid + 1)); - initIntArray(csTblIdxMapping, (maxCSoid + 1), -1); - - mfreqIdxTblIdxMapping = (int *) malloc (sizeof (int) * freqCSset->numCSadded); - initIntArray(mfreqIdxTblIdxMapping , freqCSset->numCSadded, -1); - - mTblIdxFreqIdxMapping = (int *) malloc (sizeof (int) * freqCSset->numCSadded); // A little bit reduntdant space - initIntArray(mTblIdxFreqIdxMapping , freqCSset->numCSadded, -1); - - //Mapping from from CSId to TableIdx - initCSTableIdxMapping(freqCSset, csTblIdxMapping, mfreqIdxTblIdxMapping, mTblIdxFreqIdxMapping); - - // Init CStableStat - initCStables(cstablestat, freqCSset); - - - lastSubjId = (oid *) malloc (sizeof(oid) * cstablestat->numTables); - initArray(lastSubjId, cstablestat->numTables, -1); - - lastSubjIdEx = (oid *) malloc (sizeof(oid) * cstablestat->numTables); - initArray(lastSubjIdEx, cstablestat->numTables, -1); - - if ((sbat = BATdescriptor(*sbatid)) == NULL) { - throw(MAL, "rdf.RDFreorganize", RUNTIME_OBJECT_MISSING); - } - - if ((obat = BATdescriptor(*obatid)) == NULL) { - BBPreleaseref(sbat->batCacheid); - throw(MAL, "rdf.RDFreorganize", RUNTIME_OBJECT_MISSING); - } - - if ((pbat = BATdescriptor(*pbatid)) == NULL) { - BBPreleaseref(sbat->batCacheid); - BBPreleaseref(obat->batCacheid); - throw(MAL, "rdf.RDFreorganize", 
RUNTIME_OBJECT_MISSING); - } - - si = bat_iterator(sbat); - pi = bat_iterator(pbat); - oi = bat_iterator(obat); - - /* Get possible types of each property in a table (i.e., mergedCS) */ - csPropTypes = initCSPropTypes(freqCSset, cstablestat->numTables); - RDFExtractCSPropTypes(ret, sbat, si, pi, oi, subjCSMap, csTblIdxMapping, csPropTypes, maxNumPwithDup); + + sNewBat = BATnew(TYPE_void, TYPE_oid, BATcount(sbat)); @@ -4281,6 +4299,11 @@ RDFreorganize(int *ret, CStableStat *cst BATseqbase(rmap, 0); + lastSubjId = (oid *) malloc (sizeof(oid) * cstablestat->numTables); + initArray(lastSubjId, cstablestat->numTables, -1); + + lastSubjIdEx = (oid *) malloc (sizeof(oid) * cstablestat->numTables); + initArray(lastSubjIdEx, cstablestat->numTables, -1); printf("Re-assigning Subject oids ... "); lastS = -1; diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h --- a/monetdb5/extras/rdf/rdfschema.h +++ b/monetdb5/extras/rdf/rdfschema.h @@ -88,8 +88,8 @@ typedef struct PropStat { #define FULL_PROP_STAT 1 // Only use for showing the statistic on all properties / all CSs. (Default should be 0) -#define USE_LABEL_FINDING_MAXCS 1 // Use the labels received from labeling process for finding maxCS -#define USE_LABEL_FOR_MERGING 1 // Use the labels received from labeling process for finding mergeCS +#define USE_LABEL_FINDING_MAXCS 0 // Use the labels received from labeling process for finding maxCS +#define USE_LABEL_FOR_MERGING 0 // Use the labels received from labeling process for finding mergeCS typedef struct CS { _______________________________________________ checkin-list mailing list checkin-list@monetdb.org http://mail.monetdb.org/mailman/listinfo/checkin-list