Changeset: 4d57d94c1068 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=4d57d94c1068
Modified Files:
        monetdb5/extras/rdf/rdf_shredder.c
        monetdb5/extras/rdf/rdfschema.c
        monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

Analyze and filter multi-valued columns.


diffs (241 lines):

diff --git a/monetdb5/extras/rdf/rdf_shredder.c 
b/monetdb5/extras/rdf/rdf_shredder.c
--- a/monetdb5/extras/rdf/rdf_shredder.c
+++ b/monetdb5/extras/rdf/rdf_shredder.c
@@ -243,20 +243,20 @@ getObjectType(unsigned char* objStr, BUN
        *realNumValue = BUN_NONE; 
 
        if (strlen((str)objStr) > 20){
-               endpart = objStr + (strlen((str)objStr) - 19);   
//XMLSchema#dateTime>
-               //printf("Original: %s  --> substring: %s \n", (str)objStr, 
(str)endpart);
+               endpart = objStr + (strlen((str)objStr) - 19);   /* 
XMLSchema#dateTime> */
+               /* printf("Original: %s  --> substring: %s \n", (str)objStr, 
(str)endpart); */
 
                if ( (pos = strstr((str)endpart , "XMLSchema#date>")) != NULL 
|| (pos = strstr((str)endpart, "XMLSchema#dateTime>")) != NULL ){
                        obType = DATETIME;
-                       //printf("%s: DateTime \n", objStr); 
+                       /* printf("%s: DateTime \n", objStr); */
                }
                else if ((pos = strstr((str) endpart, "XMLSchema#int>")) != 
NULL || (pos = strstr((str)endpart, "XMLSchema#integer>")) != NULL){
                        obType = INTEGER;
                        valuepart = substring((char*)objStr, 2 , (int) (pos - 
(str)objStr - 28)); 
-                       //printf("%s: Integer \n. Length of value %d ==> value 
%s \n", objStr, (int) (pos - (str)objStr - 28), valuepart);
+                       /* printf("%s: Integer \n. Length of value %d ==> value 
%s \n", objStr, (int) (pos - (str)objStr - 28), valuepart); */
                        if (isInt(valuepart) == 1){     /* Check whether the 
real value is an integer */
                                *realNumValue = (BUN) atoi(valuepart); 
-                               //printf("Real value is: " BUNFMT " \n", 
*realNumValue);
+                               /* printf("Real value is: " BUNFMT " \n", 
*realNumValue); */
                        }
                        else 
                                obType = STRING;        
@@ -268,11 +268,11 @@ getObjectType(unsigned char* objStr, BUN
                                || (pos = strstr((str) endpart, 
"XMLSchema#double>")) != NULL  
                                || (pos = strstr((str) endpart, 
"XMLSchema#decimal>")) != NULL){
                        obType = FLOAT;
-                       //printf("%s: Float \n", objStr);
+                       /* printf("%s: Float \n", objStr); */
                }
                else {
                        obType = STRING;
-                       //printf("%s: String \n", objStr); 
+                       /* printf("%s: String \n", objStr); */
                }
        }
        else
@@ -758,6 +758,7 @@ RDFParser (BAT **graph, str *location, s
                                dp = opendir (pdata->location);
                                if (dp != NULL){
                                        while ((ep = readdir (dp)) != NULL){
+                                               printf("Checking file %s 
\n",ep->d_name);       
                                                if (strstr (ep->d_name,".nt")!= 
NULL || strstr (ep->d_name,".ttl")!= NULL ){
                                                        
sprintf(tmpfilename,"%s%s",pdata->location,ep->d_name);
                                                        printf("Loading file %s 
..",tmpfilename);
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -714,6 +714,11 @@ CSPropTypes* initCSPropTypes(CSset* freq
                        csPropTypes[id].lstPropTypes = (PropTypes*) 
GDKmalloc(sizeof(PropTypes) * csPropTypes[id].numProp);
                        for (j = 0; j < csPropTypes[id].numProp; j++){
                                csPropTypes[id].lstPropTypes[j].prop = 
freqCSset->items[i].lstProp[j]; 
+                               #if STAT_ANALYZE
+                               csPropTypes[id].lstPropTypes[j].numNull = 0;
+                               csPropTypes[id].lstPropTypes[j].numMVType = 0;
+                               csPropTypes[id].lstPropTypes[j].numSingleType = 
0;              
+                               #endif
                                csPropTypes[id].lstPropTypes[j].propFreq = 0; 
                                csPropTypes[id].lstPropTypes[j].propCover = 0; 
                                csPropTypes[id].lstPropTypes[j].numType = 
MULTIVALUES + 1;
@@ -783,22 +788,85 @@ void genCSPropTypesColIdx(CSPropTypes* c
 
        }
 
+
+}
+
+static 
+void printCSPropTypes(CSPropTypes* csPropTypes, int numMergedCS, CSset* 
freqCSset, int freqThreshold){
+       char filename[100]; 
+       char tmpStr[50]; 
+       FILE *fout; 
+       int i, j, k; 
+       int     numMVCS = 0; 
+       int     numMVCSFilter = 0; 
+       int     numMVCols = 0; 
+       int     numMVColsFilter = 0;
+       int     numNonMVCS = 0; 
+       char    tmpIsMVCS = 0; 
+       char    tmpIsMVCSFilter = 0; 
+       double  threshold = 1.1; 
+       double  tmpRatio; 
+
+       strcpy(filename, "csPropTypes");
+       sprintf(tmpStr, "%d", freqThreshold);
+       strcat(filename, tmpStr);
+       strcat(filename, ".txt");
+
+       fout = fopen(filename,"wt"); 
+
        /* Print cspropTypes */
        for (i = 0; i < numMergedCS; i++){
-               printf("MergedCS %d (Freq: %d): \n", i, 
freqCSset->items[csPropTypes[i].freqCSId].support);
+               fprintf(fout, "MergedCS %d (Freq: %d): \n", i, 
freqCSset->items[csPropTypes[i].freqCSId].support);
+               tmpIsMVCS = 0;
+               tmpIsMVCSFilter = 0; 
                for(j = 0; j < csPropTypes[i].numProp; j++){
-                       printf("  P " BUNFMT "(%d | cov:%d):", 
csPropTypes[i].lstPropTypes[j].prop, 
csPropTypes[i].lstPropTypes[j].defaultType,csPropTypes[i].lstPropTypes[j].propCover);
+                       if (csPropTypes[i].lstPropTypes[j].numMVType > 0){
+                               tmpIsMVCS = 1; 
+                               numMVCols++;
+                       }
+                       tmpRatio = (double) 
(csPropTypes[i].lstPropTypes[j].propCover / 
(csPropTypes[i].lstPropTypes[j].numSingleType + 
csPropTypes[i].lstPropTypes[j].numMVType));
+
+                       if ((csPropTypes[i].lstPropTypes[j].numMVType > 0) && 
(tmpRatio > threshold)){
+                               tmpIsMVCSFilter = 1; 
+                               numMVColsFilter++;
+                       }
+
+                       fprintf(fout, "  P " BUNFMT "(%d | cov:%d | Null: %d | 
Single: %d | Multi: %d) \n", 
+                                       csPropTypes[i].lstPropTypes[j].prop, 
csPropTypes[i].lstPropTypes[j].defaultType,csPropTypes[i].lstPropTypes[j].propCover,
+                                       csPropTypes[i].lstPropTypes[j].numNull, 
csPropTypes[i].lstPropTypes[j].numSingleType, 
csPropTypes[i].lstPropTypes[j].numMVType);
+                       fprintf(fout, "         ");
                        for (k = 0; k < csPropTypes[i].lstPropTypes[j].numType; 
k++){
-                               printf(" Type %d (%d)  | ", k, 
csPropTypes[i].lstPropTypes[j].lstFreq[k]);
+                               fprintf(fout, " Type %d (%d)  | ", k, 
csPropTypes[i].lstPropTypes[j].lstFreq[k]);
                        }
-                       printf("\n");
-                       printf("         ");
+                       fprintf(fout, "\n");
+                       fprintf(fout, "         ");
                        for (k = 0; k < csPropTypes[i].lstPropTypes[j].numType; 
k++){
-                               printf(" Tbl %d (cl%d) | ", 
csPropTypes[i].lstPropTypes[j].TableTypes[k], 
csPropTypes[i].lstPropTypes[j].colIdxes[k]);
+                               fprintf(fout, " Tbl %d (cl%d) | ", 
csPropTypes[i].lstPropTypes[j].TableTypes[k], 
csPropTypes[i].lstPropTypes[j].colIdxes[k]);
                        }
-                       printf("\n");
+                       fprintf(fout, "\n");
+               }
+
+               if (tmpIsMVCS == 1){
+                       numMVCS++;
+               }
+
+               if (tmpIsMVCSFilter == 1){
+                       numMVCSFilter++;
                }
        }
+       numNonMVCS = numMergedCS - numMVCS;
+       fprintf(fout, "Number of tables with MV col: %d \n", numMVCS);
+       fprintf(fout, "Number of tables with NO MV col: %d \n", numNonMVCS);
+       fprintf(fout, "Number of MV cols: %d \n", numMVCols);
+
+       fprintf(fout, "==== With filtering ==== \n");
+       fprintf(fout, "Number of tables with MV col: %d \n", numMVCSFilter);
+       fprintf(fout, "Number of tables with NO MV col: %d \n", (numMergedCS - 
numMVCSFilter));
+       fprintf(fout, "Number of MV cols: %d \n", numMVColsFilter);
+
+
+       fclose(fout); 
+
 }
 /*
  * Add types of properties 
@@ -817,6 +885,9 @@ void addPropTypes(char *buffTypes, oid* 
                for (i = 0; i < numP; i++){
                        //printf("  P: " BUNFMT " Type: %d ", buffP[i], 
buffTypes[i]);
                        while (csPropTypes[tblId].lstPropTypes[j].prop != 
buffP[i]){
+                               #if STAT_ANALYZE
+                               csPropTypes[tblId].lstPropTypes[j].numNull++;
+                               #endif
                                j++;
                        }       
                        //j is position of the property buffP[i] in 
csPropTypes[tblId]
@@ -824,8 +895,24 @@ void addPropTypes(char *buffTypes, oid* 
                        csPropTypes[tblId].lstPropTypes[j].propFreq++;
                        csPropTypes[tblId].lstPropTypes[j].propCover += 
buffCover[i]; 
                        
csPropTypes[tblId].lstPropTypes[j].lstFreq[(int)buffTypes[i]]++; 
+                       #if STAT_ANALYZE
+                       if (buffTypes[i] == MULTIVALUES){
+                               csPropTypes[tblId].lstPropTypes[j].numMVType++;
+                       }
+                       else{
+                               
csPropTypes[tblId].lstPropTypes[j].numSingleType++;
+                       }
+                       #endif
+
+                       j++;
 
                }
+               #if STAT_ANALYZE
+               while (j < csPropTypes[tblId].numProp){
+                       csPropTypes[tblId].lstPropTypes[j].numNull++;
+                       j++;
+               }
+               #endif
        }
        //printf("\n");
 }
@@ -3450,10 +3537,12 @@ RDFextractCSwithTypes(int *ret, bat *sba
        csIdFreqIdxMap = (int *) malloc (sizeof(int) * (*maxCSoid + 1));
        initcsIdFreqIdxMap(csIdFreqIdxMap, *maxCSoid + 1, -1, freqCSset);
        printf("Using ontologies with %d ontattributesCount and %d 
ontmetadataCount \n",ontattributesCount,ontmetadataCount);
-
+       
        labels = createLabels(freqCSset, csrelSet, *maxCSoid + 1, sbat, si, pi, 
oi, *subjCSMap, mbat, csIdFreqIdxMap, *freqThreshold, ontattributes, 
ontattributesCount, ontmetadata, ontmetadataCount);
 
-
+       curT = clock(); 
+       printf("Done labeling!!! Took %f seconds.\n", ((float)(curT - 
tmpLastT))/CLOCKS_PER_SEC);
+       tmpLastT = curT;
 
        getMaximumFreqCSs(freqCSset, labels, csBats->coverageBat,  
csBats->freqBat, *maxCSoid + 1, &numMaxCSs); 
 
@@ -4244,6 +4333,7 @@ RDFreorganize(int *ret, CStableStat *cst
        csPropTypes = initCSPropTypes(freqCSset, numTables);
        RDFExtractCSPropTypes(ret, sbat, si, pi, oi, subjCSMap, 
csTblIdxMapping, csPropTypes, maxNumPwithDup);
        genCSPropTypesColIdx(csPropTypes, numTables, freqCSset);
+       printCSPropTypes(csPropTypes, numTables, freqCSset, *freqThreshold);
 
        // Init CStableStat
        initCStables(cstablestat, freqCSset, csPropTypes, numTables);
diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h
--- a/monetdb5/extras/rdf/rdfschema.h
+++ b/monetdb5/extras/rdf/rdfschema.h
@@ -93,6 +93,8 @@ typedef struct PropStat {
 
 #define FULL_PROP_STAT 1       // Only use for showing the statistic on all 
properties / all CSs. (Default should be 0)
 
+#define STAT_ANALYZE 1 // Only use for collecting the statistic on the number 
of multi/null/single-valued prop
+
 
 #define USE_LABEL_FINDING_MAXCS        0       // Use the labels received from 
labeling process for finding maxCS 
 #define USE_LABEL_FOR_MERGING  0       // Use the labels received from 
labeling process for finding mergeCS
@@ -229,6 +231,11 @@ typedef struct CStableStat {
 typedef struct PropTypes{
        oid     prop;
        int     numType; 
+#if STAT_ANALYZE       
+       int     numMVType;      /* Number of subjects having this property as a 
multi-valued prop. */
+       int     numNull;        /* Number of subjects that don't have obj value 
for this prop */
+       int     numSingleType;  /* Number of subjects having this property as a single-valued (non-multi-valued) prop. */
+#endif 
        int     propFreq;       /* without considering type = Table frequency*/
        int     propCover;      /* = coverage of that property */       
        char*   lstTypes; 
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
http://mail.monetdb.org/mailman/listinfo/checkin-list

Reply via email to