Changeset: 6af4c3c3560a for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=6af4c3c3560a
Modified Files:
	monetdb5/extras/rdf/rdfschema.c
	monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Remap freqCSid, tableIdx and modify CStable & PropStat for facilitating the re-organizing process diffs (truncated from 344 to 300 lines): diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -86,6 +86,12 @@ static void initArray(oid* inputArr, int } +static void initIntArray(int* inputArr, int num, oid defaultValue){ + int i; + for (i = 0; i < num; i++){ + inputArr[i] = defaultValue; + } +} static void initcsIdFreqIdxMap(int* inputArr, int num, int defaultValue, CSset *freqCSset){ int i; @@ -1795,13 +1801,14 @@ PropStat* initPropStat(void){ } static -void addaProp(PropStat* propStat, oid prop, int csIdx){ +void addaProp(PropStat* propStat, oid prop, int csIdx, int invertIdx){ BUN bun; BUN p; int* _tmp1; float* _tmp2; Postinglist* _tmp3; + int* _tmp4; p = prop; bun = BUNfnd(BATmirror(propStat->pBat),(ptr) &prop); @@ -1837,16 +1844,21 @@ void addaProp(PropStat* propStat, oid pr } propStat->plCSidx = (Postinglist*)_tmp3; + } propStat->freqs[propStat->numAdded] = 1; propStat->plCSidx[propStat->numAdded].lstIdx = (int *) malloc(sizeof(int) * INIT_CS_PER_PROP); + propStat->plCSidx[propStat->numAdded].lstInvertIdx = (int *) malloc(sizeof(int) * INIT_CS_PER_PROP); + + if (propStat->plCSidx[propStat->numAdded].lstIdx == NULL){ fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); } propStat->plCSidx[propStat->numAdded].lstIdx[0] = csIdx; + propStat->plCSidx[propStat->numAdded].lstInvertIdx[0] = invertIdx; propStat->plCSidx[propStat->numAdded].numAdded = 1; propStat->plCSidx[propStat->numAdded].numAllocation = INIT_CS_PER_PROP; @@ -1866,8 +1878,16 @@ void addaProp(PropStat* propStat, oid pr } propStat->plCSidx[bun].lstIdx = (int*) _tmp1; + _tmp4 = realloc(propStat->plCSidx[bun].lstInvertIdx, ((propStat->plCSidx[bun].numAllocation) * sizeof(int))); + if (!_tmp4){ + fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); + } + propStat->plCSidx[bun].lstInvertIdx = (int*) 
_tmp4; + } propStat->plCSidx[bun].lstIdx[propStat->plCSidx[bun].numAdded] = csIdx; + propStat->plCSidx[bun].lstInvertIdx[propStat->plCSidx[bun].numAdded] = invertIdx; + propStat->plCSidx[bun].numAdded++; } @@ -1885,7 +1905,7 @@ void getPropStatisticsFromMaxCSs(PropSta cs = (CS)freqCSset->items[freqId]; for (j = 0; j < cs.numProp; j++){ - addaProp(propStat, cs.lstProp[j],freqId); + addaProp(propStat, cs.lstProp[j],freqId, j); } } @@ -1904,7 +1924,7 @@ void getPropStatisticsFromMaxCSs(PropSta static -PropStat* getPropStatisticsFromFreqCSs(CSset* freqCSset, int *numdistinctMCS){ +PropStat* getPropStatisticsByTable(CSset* freqCSset, int* mfreqIdxTblIdxMapping, int *numdistinctMCS){ int i, j, k; CS cs; @@ -1921,7 +1941,7 @@ PropStat* getPropStatisticsFromFreqCSs(C cs = (CS)freqCSset->items[i]; k++; for (j = 0; j < cs.numProp; j++){ - addaProp(propStat, cs.lstProp[j], i); + addaProp(propStat, cs.lstProp[j], mfreqIdxTblIdxMapping[i], j); } } } @@ -1965,6 +1985,7 @@ void freePropStat(PropStat *propStat){ free(propStat->tfidfs); for (i = 0; i < propStat->numAdded; i++){ free(propStat->plCSidx[i].lstIdx); + free(propStat->plCSidx[i].lstInvertIdx); } free(propStat->plCSidx); free(propStat); @@ -3057,27 +3078,75 @@ str triplesubsort(BAT **sbat, BAT **pbat return MAL_SUCCEED; } -static -CStable* initCStables(PropStat* propStat, int num){ - CStable* cstable; - int i; +static +CStable* initCStablesAndIdxMapping(CSset* freqCSset, int* csTblIdxMapping, int* mfreqIdxTblIdxMapping, int* mTblIdxFreqIdxMapping){ + + int i, k; + CS cs; + CStable* cstable; + int tmpParentidx; + int tmpNumProp; + + cstable = (CStable *) malloc (sizeof (CStable)); - cstable = (CStable *) malloc (sizeof (CStable)); - cstable->lstbatid = (bat**) malloc(sizeof (bat*) * (propStat->numAdded)); - for(i = 0; i < propStat->numAdded;i++){ - cstable->lstbatid[i] = (bat*)malloc(sizeof(bat) * propStat->plCSidx[i].numAdded); + // Get the number of tables + k = 0; + for (i = 0; i < freqCSset->numCSadded; i++){ + if 
(freqCSset->items[i].parentFreqIdx == -1){ // Only use the maximum or merge CS + mfreqIdxTblIdxMapping[i] = k; + mTblIdxFreqIdxMapping[k] = i; + k++; + } } - cstable->numTables = num; + + // allocate memory space for cstable + cstable->numTables = k; + cstable->lstbatid = (bat**) malloc(sizeof (bat*) * k); + cstable->numPropPerTable = (int*) malloc(sizeof (int) * k); + cstable->lastInsertedS = (oid*) malloc(sizeof(oid) * k); + + + k = 0; + for (i = 0; i < freqCSset->numCSadded; i++){ + if (freqCSset->items[i].parentFreqIdx == -1){ // Only use the maximum or merge CS + tmpNumProp = freqCSset->items[i].numProp; + cstable->numPropPerTable[k] = tmpNumProp; + cstable->lstbatid[k] = (bat*) malloc (sizeof(bat) * tmpNumProp); + k++; + } + } + + // Mapping the csid directly to the index of the table ==> csTblIndxMapping + + for (i = 0; i < freqCSset->numOrigFreqCS; i++){ + cs = (CS)freqCSset->items[i]; + tmpParentidx = cs.parentFreqIdx; + + if (tmpParentidx == -1){ // maximumCS + csTblIdxMapping[cs.csId] = mfreqIdxTblIdxMapping[i]; + } + else{ // A normal CS or a maxCS that have a mergeCS as its parent + if (freqCSset->items[tmpParentidx].parentFreqIdx == -1){ + csTblIdxMapping[cs.csId] = mfreqIdxTblIdxMapping[tmpParentidx]; + } + else{ + csTblIdxMapping[cs.csId] = mfreqIdxTblIdxMapping[freqCSset->items[tmpParentidx].parentFreqIdx]; + } + } + + } + + return cstable; + } -str RDFdistTriplesToCSs(int *ret, bat *sbatid, bat *pbatid, bat *obatid, PropStat* propStat, int numdistinctMCS){ +str RDFdistTriplesToCSs(int *ret, bat *sbatid, bat *pbatid, bat *obatid, PropStat* propStat, CStable *cstable){ BAT *sbat = NULL, *pbat = NULL, *obat = NULL; BATiter si,pi,oi; BUN p,q; oid *pbt, *sbt, *obt; oid lastP, lastS; - CStable *cstable; int freqid; BUN ppos; @@ -3102,9 +3171,7 @@ str RDFdistTriplesToCSs(int *ret, bat *s oi = bat_iterator(obat); lastP = BUN_NONE; - lastS = BUN_NONE; - //Init cstable - cstable = initCStables(propStat, numdistinctMCS); + printf("Created cstable with %d 
tables \n", cstable->numTables); BATloop(pbat, p, q){ @@ -3154,7 +3221,6 @@ RDFreorganize(int *ret, bat *sbatid, bat CSset *freqCSset; /* Set of frequent CSs */ oid *subjCSMap = NULL; /* Store the corresponding CS Id for each subject */ - int i; oid maxCSoid = 0; BAT *sbat = NULL, *obat = NULL, *pbat = NULL; BATiter si; @@ -3163,13 +3229,16 @@ RDFreorganize(int *ret, bat *sbatid, bat BUN newId; oid *sbt; oid *lastSubjId; /* Store the last subject Id in each freqCS */ - oid freqId; + oid tblIdx; oid lastS; oid l,r; bat oNewBatid, pNewBatid; - oid *csMFreqCSMap; /* Store the mapping from a CS id to an index of a maxCS or mergeCS in freqCSset. */ + int *csTblIdxMapping; /* Store the mapping from a CS id to an index of a maxCS or mergeCS in freqCSset. */ + int *mfreqIdxTblIdxMapping; /* Store the mapping from the idx of a max/merge freqCS to the table Idx */ + int *mTblIdxFreqIdxMapping; /* Invert of mfreqIdxTblIdxMapping */ PropStat *propStat; int numdistinctMCS = 0; + CStable *cstable; freqCSset = initCSset(); @@ -3178,20 +3247,21 @@ RDFreorganize(int *ret, bat *sbatid, bat } printf("Start re-organizing triple store for " BUNFMT " CSs \n", maxCSoid); - csMFreqCSMap = (oid *) malloc (sizeof (oid) * (maxCSoid + 1)); - initArray(csMFreqCSMap, (maxCSoid + 1), BUN_NONE); - - - lastSubjId = (oid *) malloc (sizeof(oid) * freqCSset->numOrigFreqCS); - for (i = 0; i < freqCSset->numOrigFreqCS; i++){ - if (freqCSset->items[i].parentFreqIdx != -1){ // Use the maximum or merge CS instead - csMFreqCSMap[freqCSset->items[i].csId] = freqCSset->items[i].parentFreqIdx; - } - else - csMFreqCSMap[freqCSset->items[i].csId] = i; - - lastSubjId[i] = 0; - } + + csTblIdxMapping = (int *) malloc (sizeof (int) * (maxCSoid + 1)); + initIntArray(csTblIdxMapping, (maxCSoid + 1), -1); + + mfreqIdxTblIdxMapping = (int *) malloc (sizeof (int) * freqCSset->numCSadded); + initIntArray(mfreqIdxTblIdxMapping , freqCSset->numCSadded, -1); + + mTblIdxFreqIdxMapping = (int *) malloc (sizeof (int) * 
freqCSset->numCSadded); // A little bit reduntdant space + initIntArray(mTblIdxFreqIdxMapping , freqCSset->numCSadded, -1); + + //Mapping from from CSId to TableIdx + cstable = initCStablesAndIdxMapping(freqCSset, csTblIdxMapping, mfreqIdxTblIdxMapping, mTblIdxFreqIdxMapping); + + lastSubjId = (oid *) malloc (sizeof(oid) * cstable->numTables); + initArray(lastSubjId, cstable->numTables, 0); if ((sbat = BATdescriptor(*sbatid)) == NULL) { throw(MAL, "rdf.RDFreorganize", RUNTIME_OBJECT_MISSING); @@ -3236,12 +3306,12 @@ RDFreorganize(int *ret, bat *sbatid, bat lastS = -1; BATloop(sbat, p, q){ sbt = (oid *) BUNtloc(si, p); - freqId = csMFreqCSMap[subjCSMap[*sbt]]; - - if (freqId != BUN_NONE){ - - newId = lastSubjId[freqId]; - newId |= (BUN)freqId << (sizeof(BUN)*8 - NBITS_FOR_CSID); + tblIdx = csTblIdxMapping[subjCSMap[*sbt]]; + + if (tblIdx != BUN_NONE){ + + newId = lastSubjId[tblIdx]; + newId |= (BUN)tblIdx << (sizeof(BUN)*8 - NBITS_FOR_CSID); if (lastS != *sbt){ //new subject lastS = *sbt; @@ -3251,7 +3321,7 @@ RDFreorganize(int *ret, bat *sbatid, bat lmap = BUNappend(lmap, &l, TRUE); rmap = BUNappend(rmap, &r, TRUE); - lastSubjId[freqId]++; + lastSubjId[tblIdx]++; } } @@ -3300,16 +3370,17 @@ RDFreorganize(int *ret, bat *sbatid, bat BATprint(sNewBat); - propStat = getPropStatisticsFromFreqCSs(freqCSset, &numdistinctMCS); _______________________________________________ checkin-list mailing list checkin-list@monetdb.org http://mail.monetdb.org/mailman/listinfo/checkin-list