Changeset: 4d57d94c1068 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=4d57d94c1068 Modified Files: monetdb5/extras/rdf/rdf_shredder.c monetdb5/extras/rdf/rdfschema.c monetdb5/extras/rdf/rdfschema.h Branch: rdf Log Message:
Analyze and filter multi-valued columns. diffs (241 lines): diff --git a/monetdb5/extras/rdf/rdf_shredder.c b/monetdb5/extras/rdf/rdf_shredder.c --- a/monetdb5/extras/rdf/rdf_shredder.c +++ b/monetdb5/extras/rdf/rdf_shredder.c @@ -243,20 +243,20 @@ getObjectType(unsigned char* objStr, BUN *realNumValue = BUN_NONE; if (strlen((str)objStr) > 20){ - endpart = objStr + (strlen((str)objStr) - 19); //XMLSchema#dateTime> - //printf("Original: %s --> substring: %s \n", (str)objStr, (str)endpart); + endpart = objStr + (strlen((str)objStr) - 19); /* XMLSchema#dateTime> */ + /* printf("Original: %s --> substring: %s \n", (str)objStr, (str)endpart); */ if ( (pos = strstr((str)endpart , "XMLSchema#date>")) != NULL || (pos = strstr((str)endpart, "XMLSchema#dateTime>")) != NULL ){ obType = DATETIME; - //printf("%s: DateTime \n", objStr); + /* printf("%s: DateTime \n", objStr); */ } else if ((pos = strstr((str) endpart, "XMLSchema#int>")) != NULL || (pos = strstr((str)endpart, "XMLSchema#integer>")) != NULL){ obType = INTEGER; valuepart = substring((char*)objStr, 2 , (int) (pos - (str)objStr - 28)); - //printf("%s: Integer \n. Length of value %d ==> value %s \n", objStr, (int) (pos - (str)objStr - 28), valuepart); + /* printf("%s: Integer \n. 
Length of value %d ==> value %s \n", objStr, (int) (pos - (str)objStr - 28), valuepart); */ if (isInt(valuepart) == 1){ /* Check whether the real value is an integer */ *realNumValue = (BUN) atoi(valuepart); - //printf("Real value is: " BUNFMT " \n", *realNumValue); + /* printf("Real value is: " BUNFMT " \n", *realNumValue); */ } else obType = STRING; @@ -268,11 +268,11 @@ getObjectType(unsigned char* objStr, BUN || (pos = strstr((str) endpart, "XMLSchema#double>")) != NULL || (pos = strstr((str) endpart, "XMLSchema#decimal>")) != NULL){ obType = FLOAT; - //printf("%s: Float \n", objStr); + /* printf("%s: Float \n", objStr); */ } else { obType = STRING; - //printf("%s: String \n", objStr); + /* printf("%s: String \n", objStr); */ } } else @@ -758,6 +758,7 @@ RDFParser (BAT **graph, str *location, s dp = opendir (pdata->location); if (dp != NULL){ while ((ep = readdir (dp)) != NULL){ + printf("Checking file %s \n",ep->d_name); if (strstr (ep->d_name,".nt")!= NULL || strstr (ep->d_name,".ttl")!= NULL ){ sprintf(tmpfilename,"%s%s",pdata->location,ep->d_name); printf("Loading file %s ..",tmpfilename); diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -714,6 +714,11 @@ CSPropTypes* initCSPropTypes(CSset* freq csPropTypes[id].lstPropTypes = (PropTypes*) GDKmalloc(sizeof(PropTypes) * csPropTypes[id].numProp); for (j = 0; j < csPropTypes[id].numProp; j++){ csPropTypes[id].lstPropTypes[j].prop = freqCSset->items[i].lstProp[j]; + #if STAT_ANALYZE + csPropTypes[id].lstPropTypes[j].numNull = 0; + csPropTypes[id].lstPropTypes[j].numMVType = 0; + csPropTypes[id].lstPropTypes[j].numSingleType = 0; + #endif csPropTypes[id].lstPropTypes[j].propFreq = 0; csPropTypes[id].lstPropTypes[j].propCover = 0; csPropTypes[id].lstPropTypes[j].numType = MULTIVALUES + 1; @@ -783,22 +788,85 @@ void genCSPropTypesColIdx(CSPropTypes* c } + +} + +static +void printCSPropTypes(CSPropTypes* 
csPropTypes, int numMergedCS, CSset* freqCSset, int freqThreshold){ + char filename[100]; + char tmpStr[50]; + FILE *fout; + int i, j, k; + int numMVCS = 0; + int numMVCSFilter = 0; + int numMVCols = 0; + int numMVColsFilter = 0; + int numNonMVCS = 0; + char tmpIsMVCS = 0; + char tmpIsMVCSFilter = 0; + double threshold = 1.1; + double tmpRatio; + + strcpy(filename, "csPropTypes"); + sprintf(tmpStr, "%d", freqThreshold); + strcat(filename, tmpStr); + strcat(filename, ".txt"); + + fout = fopen(filename,"wt"); + /* Print cspropTypes */ for (i = 0; i < numMergedCS; i++){ - printf("MergedCS %d (Freq: %d): \n", i, freqCSset->items[csPropTypes[i].freqCSId].support); + fprintf(fout, "MergedCS %d (Freq: %d): \n", i, freqCSset->items[csPropTypes[i].freqCSId].support); + tmpIsMVCS = 0; + tmpIsMVCSFilter = 0; for(j = 0; j < csPropTypes[i].numProp; j++){ - printf(" P " BUNFMT "(%d | cov:%d):", csPropTypes[i].lstPropTypes[j].prop, csPropTypes[i].lstPropTypes[j].defaultType,csPropTypes[i].lstPropTypes[j].propCover); + if (csPropTypes[i].lstPropTypes[j].numMVType > 0){ + tmpIsMVCS = 1; + numMVCols++; + } + tmpRatio = (double) (csPropTypes[i].lstPropTypes[j].propCover / (csPropTypes[i].lstPropTypes[j].numSingleType + csPropTypes[i].lstPropTypes[j].numMVType)); + + if ((csPropTypes[i].lstPropTypes[j].numMVType > 0) && (tmpRatio > threshold)){ + tmpIsMVCSFilter = 1; + numMVColsFilter++; + } + + fprintf(fout, " P " BUNFMT "(%d | cov:%d | Null: %d | Single: %d | Multi: %d) \n", + csPropTypes[i].lstPropTypes[j].prop, csPropTypes[i].lstPropTypes[j].defaultType,csPropTypes[i].lstPropTypes[j].propCover, + csPropTypes[i].lstPropTypes[j].numNull, csPropTypes[i].lstPropTypes[j].numSingleType, csPropTypes[i].lstPropTypes[j].numMVType); + fprintf(fout, " "); for (k = 0; k < csPropTypes[i].lstPropTypes[j].numType; k++){ - printf(" Type %d (%d) | ", k, csPropTypes[i].lstPropTypes[j].lstFreq[k]); + fprintf(fout, " Type %d (%d) | ", k, csPropTypes[i].lstPropTypes[j].lstFreq[k]); } - printf("\n"); - 
printf(" "); + fprintf(fout, "\n"); + fprintf(fout, " "); for (k = 0; k < csPropTypes[i].lstPropTypes[j].numType; k++){ - printf(" Tbl %d (cl%d) | ", csPropTypes[i].lstPropTypes[j].TableTypes[k], csPropTypes[i].lstPropTypes[j].colIdxes[k]); + fprintf(fout, " Tbl %d (cl%d) | ", csPropTypes[i].lstPropTypes[j].TableTypes[k], csPropTypes[i].lstPropTypes[j].colIdxes[k]); } - printf("\n"); + fprintf(fout, "\n"); + } + + if (tmpIsMVCS == 1){ + numMVCS++; + } + + if (tmpIsMVCSFilter == 1){ + numMVCSFilter++; } } + numNonMVCS = numMergedCS - numMVCS; + fprintf(fout, "Number of tables with MV col: %d \n", numMVCS); + fprintf(fout, "Number of tables with NO MV col: %d \n", numNonMVCS); + fprintf(fout, "Number of MV cols: %d \n", numMVCols); + + fprintf(fout, "==== With filtering ==== \n"); + fprintf(fout, "Number of tables with MV col: %d \n", numMVCSFilter); + fprintf(fout, "Number of tables with NO MV col: %d \n", (numMergedCS - numMVCSFilter)); + fprintf(fout, "Number of MV cols: %d \n", numMVColsFilter); + + + fclose(fout); + } /* * Add types of properties @@ -817,6 +885,9 @@ void addPropTypes(char *buffTypes, oid* for (i = 0; i < numP; i++){ //printf(" P: " BUNFMT " Type: %d ", buffP[i], buffTypes[i]); while (csPropTypes[tblId].lstPropTypes[j].prop != buffP[i]){ + #if STAT_ANALYZE + csPropTypes[tblId].lstPropTypes[j].numNull++; + #endif j++; } //j is position of the property buffP[i] in csPropTypes[tblId] @@ -824,8 +895,24 @@ void addPropTypes(char *buffTypes, oid* csPropTypes[tblId].lstPropTypes[j].propFreq++; csPropTypes[tblId].lstPropTypes[j].propCover += buffCover[i]; csPropTypes[tblId].lstPropTypes[j].lstFreq[(int)buffTypes[i]]++; + #if STAT_ANALYZE + if (buffTypes[i] == MULTIVALUES){ + csPropTypes[tblId].lstPropTypes[j].numMVType++; + } + else{ + csPropTypes[tblId].lstPropTypes[j].numSingleType++; + } + #endif + + j++; } + #if STAT_ANALYZE + while (j < csPropTypes[tblId].numProp){ + csPropTypes[tblId].lstPropTypes[j].numNull++; + j++; + } + #endif } //printf("\n"); 
} @@ -3450,10 +3537,12 @@ RDFextractCSwithTypes(int *ret, bat *sba csIdFreqIdxMap = (int *) malloc (sizeof(int) * (*maxCSoid + 1)); initcsIdFreqIdxMap(csIdFreqIdxMap, *maxCSoid + 1, -1, freqCSset); printf("Using ontologies with %d ontattributesCount and %d ontmetadataCount \n",ontattributesCount,ontmetadataCount); - + labels = createLabels(freqCSset, csrelSet, *maxCSoid + 1, sbat, si, pi, oi, *subjCSMap, mbat, csIdFreqIdxMap, *freqThreshold, ontattributes, ontattributesCount, ontmetadata, ontmetadataCount); - + curT = clock(); + printf("Done labeling!!! Took %f seconds.\n", ((float)(curT - tmpLastT))/CLOCKS_PER_SEC); + tmpLastT = curT; getMaximumFreqCSs(freqCSset, labels, csBats->coverageBat, csBats->freqBat, *maxCSoid + 1, &numMaxCSs); @@ -4244,6 +4333,7 @@ RDFreorganize(int *ret, CStableStat *cst csPropTypes = initCSPropTypes(freqCSset, numTables); RDFExtractCSPropTypes(ret, sbat, si, pi, oi, subjCSMap, csTblIdxMapping, csPropTypes, maxNumPwithDup); genCSPropTypesColIdx(csPropTypes, numTables, freqCSset); + printCSPropTypes(csPropTypes, numTables, freqCSset, *freqThreshold); // Init CStableStat initCStables(cstablestat, freqCSset, csPropTypes, numTables); diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h --- a/monetdb5/extras/rdf/rdfschema.h +++ b/monetdb5/extras/rdf/rdfschema.h @@ -93,6 +93,8 @@ typedef struct PropStat { #define FULL_PROP_STAT 1 // Only use for showing the statistic on all properties / all CSs. 
(Default should be 0) +#define STAT_ANALYZE 1 // Only use for collecting the statistic on the number of multi/null/single-valued prop + #define USE_LABEL_FINDING_MAXCS 0 // Use the labels received from labeling process for finding maxCS #define USE_LABEL_FOR_MERGING 0 // Use the labels received from labeling process for finding mergeCS @@ -229,6 +231,11 @@ typedef struct CStableStat { typedef struct PropTypes{ oid prop; int numType; +#if STAT_ANALYZE + int numMVType; /* Number of subjects having this property as a multi-valued prop. */ + int numNull; /* Number of subjects that don't have obj value for this prop */ + int numSingleType; /* Number of subjects having this property as a single-valued prop */ +#endif int propFreq; /* without considering type = Table frequency*/ int propCover; /* = coverage of that property */ char* lstTypes; _______________________________________________ checkin-list mailing list checkin-list@monetdb.org http://mail.monetdb.org/mailman/listinfo/checkin-list