Changeset: ef6d72654574 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=ef6d72654574
Modified Files:
	monetdb5/extras/rdf/rdfschema.c
	monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Get the sample data before merging diffs (truncated from 366 to 300 lines): diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -686,6 +686,12 @@ char isInfrequentProp(PropTypes pt, CS c } +static +char isInfrequentSampleProp(CS freqCS, int propIdx){ + if (freqCS.lstPropSupport[propIdx] * 100 < freqCS.support * SAMPLE_FILTER_THRESHOLD) return 1; + else return 0; +} + static void genCSPropTypesColIdx(CSPropTypes* csPropTypes, int numMergedCS, CSset* freqCSset){ int i, j, k; @@ -1997,7 +2003,7 @@ str printmergeCSSet(CSset *freqCSset, in static -str printsubsetFromCSset(CSset *freqCSset, BAT* subsetIdxBat, int num, int* mergeCSFreqCSMap, CSlabel *label){ +str printsubsetFromCSset(CSset *freqCSset, BAT* subsetIdxBat, int num, int* mergeCSFreqCSMap, CSlabel *label, int sampleVersion){ int i,j; FILE *fout; @@ -2021,7 +2027,7 @@ str printsubsetFromCSset(CSset *freqCSse strcpy(filename, "selectedSubset"); - sprintf(tmpStr, "%d", num); + sprintf(tmpStr, "%d_v%d", num, sampleVersion); strcat(filename, tmpStr); strcat(filename, ".txt"); @@ -4889,12 +4895,12 @@ void getTblName(char *name, oid nameId){ } static -str printSampleData(CSSample *csSample, CSset *freqCSset, BAT *mbat, int num){ +str printSampleData(CSSample *csSample, CSset *freqCSset, BAT *mbat, int num, int sampleVersion){ int i,j, k; FILE *fout, *fouttb, *foutis; - char filename[100]; - char tmpStr[20]; + char filename[100], filename2[100], filename3[100]; + char tmpStr[20], tmpStr2[20], tmpStr3[20]; int ret; str propStr; @@ -4930,14 +4936,23 @@ str printSampleData(CSSample *csSample, strcpy(filename, "sampleData"); - sprintf(tmpStr, "%d", num); + sprintf(tmpStr, "%d_v%d", num,sampleVersion); strcat(filename, tmpStr); strcat(filename, ".txt"); - + strcpy(filename2, "createSampleTable"); + sprintf(tmpStr2, "%d_v%d", num,sampleVersion); + strcat(filename2, tmpStr2); + strcat(filename2, ".sh"); + + 
strcpy(filename3, "loadSampleToMonet"); + sprintf(tmpStr3, "%d_v%d", num,sampleVersion); + strcat(filename3, tmpStr3); + strcat(filename3, ".sh"); + fout = fopen(filename,"wt"); - fouttb = fopen("createSampleTable.sh","wt"); - foutis = fopen("loadSampleToMonet.sh","wt"); + fouttb = fopen(filename2,"wt"); + foutis = fopen(filename3,"wt"); for (i = 0; i < num; i++){ sample = csSample[i]; @@ -4999,7 +5014,9 @@ str printSampleData(CSSample *csSample, isImage = 0; isSite = 0; for (j = 0; j < sample.numProp; j++){ - if (freqCS.lstPropSupport[j] * 100 < freqCS.support * SAMPLE_FILTER_THRESHOLD) continue; + if (sampleVersion > 1){ //Do not consider infreq Prop + if (isInfrequentSampleProp(freqCS, j)) continue; + } #if USE_SHORT_NAMES propStrShort = NULL; #endif @@ -5059,8 +5076,13 @@ str printSampleData(CSSample *csSample, //List of support for (j = 0; j < sample.numProp; j++){ - if (freqCS.lstPropSupport[j] * 100 < freqCS.support * SAMPLE_FILTER_THRESHOLD) continue; - fprintf(fout,";%d", freqCS.lstPropSupport[j]); + if (sampleVersion > 1){ //Do not consider infreq Prop + if (isInfrequentSampleProp(freqCS, j)) continue; + fprintf(fout,";%d", freqCS.lstPropSupport[j]); + } + else{ + fprintf(fout,";%d", freqCS.support); + } } fprintf(fout, "\n"); @@ -5082,7 +5104,9 @@ str printSampleData(CSSample *csSample, GDKfree(subjStr); for (j = 0; j < sample.numProp; j++){ - if (freqCS.lstPropSupport[j] * 100 < freqCS.support * SAMPLE_FILTER_THRESHOLD) continue; + if (sampleVersion > 1){ //Do not consider infreq Prop + if (isInfrequentSampleProp(freqCS, j)) continue; + } objOid = sample.lstObj[j][k]; if (objOid == BUN_NONE){ fprintf(fout,";NULL"); @@ -5584,6 +5608,80 @@ void printFKMultiplicityFromCSPropTypes( } +static +str getSampleData(int *ret, bat *mapbatid, int numTables, CSset* freqCSset, BAT *sbat, BATiter si, BATiter pi, BATiter oi, int* mTblIdxFreqIdxMapping, + CSlabel* labels, int* csTblIdxMapping, int maxNumPwithDup, oid* subjCSMap, int sampleVersion){ + + BAT *outputBat = 
NULL, *mbat = NULL; + CSSample *csSample; + int numSampleTbl = 0; + + if ((mbat = BATdescriptor(*mapbatid)) == NULL) { + throw(MAL, "rdf.RDFreorganize", RUNTIME_OBJECT_MISSING); + } + //Generate evaluating tables + + numSampleTbl = (NUM_SAMPLETABLE > (numTables/2))?(numTables/2):NUM_SAMPLETABLE; + + printf("Select list of sample tables \n"); + outputBat = generateTablesForEvaluating(freqCSset, numSampleTbl, mTblIdxFreqIdxMapping, numTables); + assert (BATcount(outputBat) == (oid) numSampleTbl); + csSample = (CSSample*)malloc(sizeof(CSSample) * numSampleTbl); + printf("Select sample instances for %d tables \n", numSampleTbl); + initSampleData(csSample, outputBat, freqCSset, mTblIdxFreqIdxMapping, labels); + RDFExtractSampleData(ret, sbat, si, pi, oi, subjCSMap, csTblIdxMapping, maxNumPwithDup, csSample, outputBat, numSampleTbl); + printsubsetFromCSset(freqCSset, outputBat, numSampleTbl, mTblIdxFreqIdxMapping, labels, sampleVersion); + printSampleData(csSample, freqCSset, mbat, numSampleTbl, sampleVersion); + freeSampleData(csSample, numSampleTbl); + BBPreclaim(outputBat); + BBPunfix(mbat->batCacheid); + + return MAL_SUCCEED; +} + +static +void initCSTableIdxMapping(CSset* freqCSset, int* csTblIdxMapping, int* mfreqIdxTblIdxMapping, int* mTblIdxFreqIdxMapping, int *numTables){ + +int i, k; +CS cs; + int tmpParentidx; + + k = 0; + for (i = 0; i < freqCSset->numCSadded; i++){ + if (isCSTable(freqCSset->items[i])){ // Only use the not-removed maximum or merge CS + mfreqIdxTblIdxMapping[i] = k; + mTblIdxFreqIdxMapping[k] = i; + k++; + } + } + + *numTables = k; + + // Mapping the csid directly to the index of the table ==> csTblIndxMapping + + for (i = 0; i < freqCSset->numOrigFreqCS; i++){ + cs = (CS)freqCSset->items[i]; + tmpParentidx = cs.parentFreqIdx; + + if (tmpParentidx == -1){ // maximumCS + csTblIdxMapping[cs.csId] = mfreqIdxTblIdxMapping[i]; + } + else{ // A normal CS or a maxCS that have a mergeCS as its parent + if (freqCSset->items[tmpParentidx].parentFreqIdx 
== -1){ + csTblIdxMapping[cs.csId] = mfreqIdxTblIdxMapping[tmpParentidx]; + } + else{ + csTblIdxMapping[cs.csId] = mfreqIdxTblIdxMapping[freqCSset->items[tmpParentidx].parentFreqIdx]; + } + } + + } + + + //return cstablestat; + +} + // for storing ontology data oid **ontattributes = NULL; int ontattributesCount = 0; @@ -5768,12 +5866,37 @@ RDFextractCSwithTypes(int *ret, bat *sba curT = clock(); printf("Get number of indirect referrences to detect dimension tables !!! Took %f seconds.\n", ((float)(curT - tmpLastT))/CLOCKS_PER_SEC); tmpLastT = curT; + /*------------------------------------*/ + + { + int numTables = 0; + int *csTblIdxMapping, *mfreqIdxTblIdxMapping, *mTblIdxFreqIdxMapping; + + + csTblIdxMapping = (int *) malloc (sizeof (int) * (*maxCSoid + 1)); + initIntArray(csTblIdxMapping, (*maxCSoid + 1), -1); + + mfreqIdxTblIdxMapping = (int *) malloc (sizeof (int) * freqCSset->numCSadded); + initIntArray(mfreqIdxTblIdxMapping , freqCSset->numCSadded, -1); + + mTblIdxFreqIdxMapping = (int *) malloc (sizeof (int) * freqCSset->numCSadded); // TODO: little bit reduntdant space + initIntArray(mTblIdxFreqIdxMapping , freqCSset->numCSadded, -1); + + //Mapping from from CSId to TableIdx + printf("Init CS tableIdxMapping \n"); + initCSTableIdxMapping(freqCSset, csTblIdxMapping, mfreqIdxTblIdxMapping, mTblIdxFreqIdxMapping, &numTables); + + getSampleData(ret, mapbatid, numTables, freqCSset, sbat, si, pi, oi, + mTblIdxFreqIdxMapping, *labels, csTblIdxMapping, *maxNumPwithDup, *subjCSMap, 1); + + } + /*------------------------------------*/ curNumMergeCS = countNumberMergeCS(freqCSset); printf("Before using rules: Number of freqCS is: %d \n",curNumMergeCS); - + /* ---------- S1, S2 ------- */ mergecsId = *maxCSoid + 1; @@ -6203,48 +6326,7 @@ void initCStables(CStableStat* cstablest -static -void initCSTableIdxMapping(CSset* freqCSset, int* csTblIdxMapping, int* mfreqIdxTblIdxMapping, int* mTblIdxFreqIdxMapping, int *numTables){ - -int i, k; -CS cs; - int tmpParentidx; - - 
k = 0; - for (i = 0; i < freqCSset->numCSadded; i++){ - if (isCSTable(freqCSset->items[i])){ // Only use the not-removed maximum or merge CS - mfreqIdxTblIdxMapping[i] = k; - mTblIdxFreqIdxMapping[k] = i; - k++; - } - } - - *numTables = k; - - // Mapping the csid directly to the index of the table ==> csTblIndxMapping - - for (i = 0; i < freqCSset->numOrigFreqCS; i++){ - cs = (CS)freqCSset->items[i]; - tmpParentidx = cs.parentFreqIdx; - - if (tmpParentidx == -1){ // maximumCS - csTblIdxMapping[cs.csId] = mfreqIdxTblIdxMapping[i]; - } - else{ // A normal CS or a maxCS that have a mergeCS as its parent - if (freqCSset->items[tmpParentidx].parentFreqIdx == -1){ - csTblIdxMapping[cs.csId] = mfreqIdxTblIdxMapping[tmpParentidx]; - } - else{ - csTblIdxMapping[cs.csId] = mfreqIdxTblIdxMapping[freqCSset->items[tmpParentidx].parentFreqIdx]; - } - } - - } - - - //return cstablestat; - -} + void freeCStableStat(CStableStat* cstablestat){ int i,j, k; @@ -7133,7 +7215,7 @@ RDFreorganize(int *ret, CStableStat *cst CSset *freqCSset; /* Set of frequent CSs */ oid *subjCSMap = NULL; /* Store the corresponding CS Id for each subject */ oid maxCSoid = 0; - BAT *sbat = NULL, *obat = NULL, *pbat = NULL, *mbat = NULL; + BAT *sbat = NULL, *obat = NULL, *pbat = NULL; BATiter si,pi,oi; BUN p,q; BAT *sNewBat, *lmap, *rmap, *oNewBat, *origobat, *pNewBat; @@ -7160,7 +7242,6 @@ RDFreorganize(int *ret, CStableStat *cst //int curNumMergeCS; //oid *mergeCSFreqCSMap; - int numSampleTbl = 0; _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list