Changeset: bc147d8e4e56 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=bc147d8e4e56
Modified Files:
	monetdb5/extras/rdf/rdfschema.c
	monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Fix bug while removing subjects with lots of missing properties. diffs (truncated from 341 to 300 lines): diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -6002,59 +6002,66 @@ str initFullSampleData(CSSampleExtend *c int curStrLen =0; int tmpStrLen =0; oid *tmpOid = (oid *) BUNtail(tmpi, ranPosition); - //tmpOid refer to the keyBat of the mv bats - - //Get the range of multi-values in keyBat - tmpmvKeyBat = cstablestat->lstcstable[i].lstMVTables[j].keyBat; + if (*tmpOid != oid_nil){ + + //tmpOid refer to the keyBat of the mv bats + + //Get the range of multi-values in keyBat + tmpmvKeyBat = cstablestat->lstcstable[i].lstMVTables[j].keyBat; + + mvRefOid = *tmpOid; + tmpmvRefOid = (oid *) Tloc(tmpmvKeyBat, mvRefOid); + assert(tmpmvRefOid != NULL); + + //printf("First position for multivalues in keybat %d \n", (int) (*tmpmvRefOid)); + + tmpNumMVCols = cstablestat->lstcstable[i].lstMVTables[j].numCol; + //printf("Table %d colum %d is a mv col with %d types \n",i,j,tmpNumMVCols); + + tmpPos = *tmpOid; + while (*tmpmvRefOid == mvRefOid){ + //Concat the data from each column + for (mvColIdx =0; mvColIdx < tmpNumMVCols; mvColIdx++){ + tmpmvBat = cstablestat->lstcstable[i].lstMVTables[j].mvBats[mvColIdx]; + tmpObjType = getObjTypeFromBATtype(tmpmvBat->ttype); + if (getObjValueFromMVBat(&vrRealObjValue, &vrCastedObjValue, tmpPos, tmpObjType, tmpmvBat, lmap, rmap) == 1){ + //printf("Casted value at mvBat %d is %s \n",mvColIdx,vrCastedObjValue.val.sval); + tmpStrLen = strlen(vrCastedObjValue.val.sval); + if (tmpMVSampleStr == NULL){ + tmpMVSampleStr = (str) GDKmalloc(tmpStrLen + 1); + s = tmpMVSampleStr; + }else{ + tmpMVSampleStr = (str) GDKrealloc(tmpMVSampleStr, curStrLen + tmpStrLen + 2); + s = tmpMVSampleStr; + s += curStrLen; + } + + strcpy(s, vrCastedObjValue.val.sval); + s += tmpStrLen; + *s++ = ';'; + *s = '\0'; + + curStrLen = strlen(tmpMVSampleStr); + 
//printf("Current tmpMVSampleStr String %s --> curLen = %d \n",tmpMVSampleStr, curStrLen); + + VALclear(&vrCastedObjValue); + VALclear(&vrRealObjValue); + } + } + + + //Get next + tmpPos++; + if (tmpPos == BATcount(tmpmvKeyBat)) break; + + tmpmvRefOid = (oid *) Tloc(tmpmvKeyBat, tmpPos); + } + + } + //else{ + //printf("[Null] There is no set of multiple values for this subject"); - mvRefOid = *tmpOid; - tmpmvRefOid = (oid *) Tloc(tmpmvKeyBat, mvRefOid); - assert(tmpmvRefOid != NULL); - - //printf("First position for multivalues in keybat %d \n", (int) (*tmpmvRefOid)); - - tmpNumMVCols = cstablestat->lstcstable[i].lstMVTables[j].numCol; - //printf("Table %d colum %d is a mv col with %d types \n",i,j,tmpNumMVCols); - - tmpPos = *tmpOid; - while (*tmpmvRefOid == mvRefOid){ - //Concat the data from each column - for (mvColIdx =0; mvColIdx < tmpNumMVCols; mvColIdx++){ - tmpmvBat = cstablestat->lstcstable[i].lstMVTables[j].mvBats[mvColIdx]; - tmpObjType = getObjTypeFromBATtype(tmpmvBat->ttype); - if (getObjValueFromMVBat(&vrRealObjValue, &vrCastedObjValue, tmpPos, tmpObjType, tmpmvBat, lmap, rmap) == 1){ - //printf("Casted value at mvBat %d is %s \n",mvColIdx,vrCastedObjValue.val.sval); - tmpStrLen = strlen(vrCastedObjValue.val.sval); - if (tmpMVSampleStr == NULL){ - tmpMVSampleStr = (str) GDKmalloc(tmpStrLen + 1); - s = tmpMVSampleStr; - }else{ - tmpMVSampleStr = (str) GDKrealloc(tmpMVSampleStr, curStrLen + tmpStrLen + 2); - s = tmpMVSampleStr; - s += curStrLen; - } - - strcpy(s, vrCastedObjValue.val.sval); - s += tmpStrLen; - *s++ = ';'; - *s = '\0'; - - curStrLen = strlen(tmpMVSampleStr); - //printf("Current tmpMVSampleStr String %s --> curLen = %d \n",tmpMVSampleStr, curStrLen); - - VALclear(&vrCastedObjValue); - VALclear(&vrRealObjValue); - } - } - - - //Get next - tmpPos++; - if (tmpPos == BATcount(tmpmvKeyBat)) break; - - tmpmvRefOid = (oid *) Tloc(tmpmvKeyBat, tmpPos); - } - + //} if (tmpMVSampleStr != NULL){ tmpMVSampleStr = (str) GDKrealloc(tmpMVSampleStr, 
curStrLen + 1); tmpMVSampleStr[curStrLen] = '\0'; @@ -8898,7 +8905,7 @@ void getRealValue(ValPtr returnValue, oi }while (0) -str RDFdistTriplesToCSs(int *ret, bat *sbatid, bat *pbatid, bat *obatid, bat *mbatid, bat *lmapbatid, bat *rmapbatid, PropStat* propStat, CStableStat *cstablestat, CSPropTypes *csPropTypes, oid* lastSubjId){ +str RDFdistTriplesToCSs(int *ret, bat *sbatid, bat *pbatid, bat *obatid, bat *mbatid, bat *lmapbatid, bat *rmapbatid, PropStat* propStat, CStableStat *cstablestat, CSPropTypes *csPropTypes, oid* lastSubjId, char *isLotsNullSubj){ BAT *sbat = NULL, *pbat = NULL, *obat = NULL, *mbat = NULL, *lmap = NULL, *rmap = NULL; BATiter si,pi,oi, mi; @@ -8922,6 +8929,8 @@ str RDFdistTriplesToCSs(int *ret, bat *s int lasttblIdx = -1; int lastColIdx = -1; int lastPropIdx = -1; + int numEmptyBat = 0; + char isSetLasttblIdx = 0; ObjectType objType, defaultType; char tmpTableType = 0; @@ -8954,6 +8963,8 @@ str RDFdistTriplesToCSs(int *ret, bat *s char isFKCol = 0; #endif + (void) isLotsNullSubj; + maxOrigPbt = ((oid)1 << (sizeof(BUN)*8 - NBITS_FOR_CSID)) - 1; if (TKNZRopen (NULL, &schema) != MAL_SUCCEED) { throw(RDF, "RDFdistTriplesToCSs", @@ -9031,11 +9042,20 @@ str RDFdistTriplesToCSs(int *ret, bat *s //printf(" --> Tbl: %d tmpSoid: " BUNFMT " | Last SubjId " BUNFMT "\n", tblIdx,tmpSoid, lastSubjId[tblIdx]); - if (tblIdx == -1){ // This is for irregular triples, put them to pso table - insToPSO(cstablestat->pbat,cstablestat->sbat, cstablestat->obat, pbt, sbt, obt); - //printf(" ==> To PSO \n"); - isFKCol = 0; - continue; + if (tblIdx == -1){ + #if REMOVE_LOTSOFNULL_SUBJECT + if (isLotsNullSubj[*sbt] == 0){ + // This is for irregular triples, put them to pso table + insToPSO(cstablestat->pbat,cstablestat->sbat, cstablestat->obat, pbt, sbt, obt); + //printf(" ==> To PSO \n"); + isFKCol = 0; + continue; + } + #else + insToPSO(cstablestat->pbat,cstablestat->sbat, cstablestat->obat, pbt, sbt, obt); + isFKCol = 0; + continue; + #endif } if (*pbt != lastP){ @@ 
-9068,10 +9088,18 @@ str RDFdistTriplesToCSs(int *ret, bat *s } + #if REMOVE_LOTSOFNULL_SUBJECT + if (tblIdx == -1 && isLotsNullSubj[*sbt]){ + // A lots-of-null subject + insToPSO(cstablestat->pbat,cstablestat->sbat, cstablestat->obat, pbt, sbt, obt); + + continue; + } + #endif + objType = getObjType(*obt); assert (objType != BLANKNODE); - tmpPropIdx = tmpTblIdxPropIdxMap[tblIdx]; //printf(" PropIdx = %d \n", tmpPropIdx); tmpColIdx = csPropTypes[tblIdx].lstPropTypes[tmpPropIdx].defColIdx; @@ -9153,7 +9181,9 @@ str RDFdistTriplesToCSs(int *ret, bat *s #endif isSetLasttblIdx = 1; } - + + + /* New column. Finish with lastTblIdx and lastColIdx. Note: This lastColIdx is * the position of the prop in a final CS. Not the exact colIdx in MAINTBL or TYPETBL * */ @@ -9246,6 +9276,7 @@ str RDFdistTriplesToCSs(int *ret, bat *s #endif #if COUNT_DISTINCT_REFERRED_S if (isFKCol){ + assert(tmpFKHashBat != NULL); tmpFKRefBun = BUNfnd(BATmirror(tmpFKHashBat),(ptr) obt); if (tmpFKRefBun == BUN_NONE){ @@ -9264,8 +9295,7 @@ str RDFdistTriplesToCSs(int *ret, bat *s } #endif } - - + if (istmpMVProp == 1){ // This is a multi-valued prop //printf("Multi values prop \n"); if (*sbt != lastS){ @@ -9452,13 +9482,24 @@ str RDFdistTriplesToCSs(int *ret, bat *s throw(RDF, "rdf.RDFdistTriplesToCSs", "Problem in filling missing values all"); } - + numEmptyBat = 0; // Keep the batCacheId for (i = 0; i < cstablestat->numTables; i++){ //printf("----- Table %d ------ \n",i ); for (j = 0; j < cstablestat->numPropPerTable[i];j++){ //printf("Column %d \n", j); cstablestat->lstbatid[i][j] = cstablestat->lstcstable[i].colBats[j]->batCacheid; + tmpBat = cstablestat->lstcstable[i].colBats[j]; + if (BATcount(tmpBat) == 0) { + printf("Empty Bats at table %d column %d \n",i,j); + numEmptyBat++; + fillMissingvalues(tmpBat, (int)BATcount(tmpBat), (int)lastSubjId[i]); + } + if (j > 0) + if (BATcount(cstablestat->lstcstable[i].colBats[j]) > 0 && + BATcount(cstablestat->lstcstable[i].colBats[j-1]) > 0){ + 
assert(BATcount(cstablestat->lstcstable[i].colBats[j]) == BATcount(cstablestat->lstcstable[i].colBats[j-1])); + } //BATprint(cstablestat->lstcstable[i].colBats[j]); if (csPropTypes[i].lstPropTypes[j].isMVProp){ //printf("MV Columns: \n"); @@ -9475,6 +9516,9 @@ str RDFdistTriplesToCSs(int *ret, bat *s *ret = 1; printf(" ... Done \n"); + printf("Number of full empty bats %d \n",numEmptyBat); + + printf("Number of triples in PSO table is "BUNFMT"\n", BATcount(cstablestat->pbat)); BBPunfix(sbat->batCacheid); BBPunfix(pbat->batCacheid); @@ -9509,6 +9553,8 @@ RDFreorganize(int *ret, CStableStat *cst int freqIdx; int numSubjRemoved = 0; #endif + char *isLotsNullSubj = NULL; + oid lastS; oid l,r; bat oNewBatid, pNewBatid; @@ -9692,17 +9738,33 @@ RDFreorganize(int *ret, CStableStat *cst lastSubjId = (oid *) malloc (sizeof(oid) * cstablestat->numTables); initArray(lastSubjId, cstablestat->numTables, -1); - printf("Re-assigning Subject oids ... "); + #if REMOVE_LOTSOFNULL_SUBJECT + //TODO: Find the better way than using isLotsNullSubj array to keep + //the status of subject + isLotsNullSubj = (char *) malloc(sizeof(char) * BATcount(sbat) + 1); + initCharArray(isLotsNullSubj, BATcount(sbat) + 1,0); + #else + (void) isLotsNullSubj; + #endif + + printf("Re-assigning Subject oids ... \n"); lastS = -1; BATloop(sbat, p, q){ sbt = (oid *) BUNtloc(si, p); tblIdx = csTblIdxMapping[subjCSMap[*sbt]]; - + #if REMOVE_LOTSOFNULL_SUBJECT - freqIdx = csFreqCSMapping[subjCSMap[*sbt]]; - if (freqCSset->items[freqIdx].numProp < cstablestat->lstcstable[tblIdx].numCol * LOTSOFNULL_SUBJECT_THRESHOLD){ - tblIdx = -1; - numSubjRemoved++; + //TODO: If the subject is the target + // of an FK prop, do not remove that subject. This is hard to check. + // + if (tblIdx != -1){ + freqIdx = csFreqCSMapping[subjCSMap[*sbt]]; _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list