Changeset: 268c8c805182 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=268c8c805182
Modified Files:
        monetdb5/extras/rdf/rdfschema.c
        monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

Collect statistics on CS properties' types which will be used for creating 
relational columns.

Optimize the use of labels when finding maxCS and mergedCS, and fix a bug in it.


diffs (truncated from 460 to 300 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -683,6 +683,86 @@ void printSubCSInformation(SubCSSet *sub
        }
 }
 
+/*
+ * Build the property-type statistics array for FreqCSset (after merging).
+ *
+ * One CSPropTypes entry is created for each freqCS that has no parent
+ * (parentFreqIdx == -1), in increasing freqCS index order. Every property
+ * of such a CS gets MULTIVALUES + 1 type slots; the frequency counter of
+ * each slot starts at 0. lstTypes is allocated here but not filled in.
+ *
+ * Returns the GDKmalloc'ed array of numMergedCS entries.
+ */
+static 
+CSPropTypes* initCSPropTypes(CSset* freqCSset, int numMergedCS){
+       int numFreqCS = freqCSset->numCSadded;
+       int csIdx, propIdx, typeIdx;
+       int id = 0;             /* next free slot in the result array */
+       CSPropTypes *result;
+       CSPropTypes *entry;
+       PropTypes *pt;
+
+       result = (CSPropTypes*)GDKmalloc(sizeof(CSPropTypes) * numMergedCS);
+
+       for (csIdx = 0; csIdx < numFreqCS; csIdx++){
+               /* Only the maximum or merged CSs are used */
+               if (freqCSset->items[csIdx].parentFreqIdx != -1)
+                       continue;
+
+               entry = &result[id];
+               entry->freqCSId = csIdx;
+               entry->numProp = freqCSset->items[csIdx].numProp;
+               entry->lstPropTypes = (PropTypes*) GDKmalloc(sizeof(PropTypes) * entry->numProp);
+
+               for (propIdx = 0; propIdx < entry->numProp; propIdx++){
+                       pt = &entry->lstPropTypes[propIdx];
+                       pt->prop = freqCSset->items[csIdx].lstProp[propIdx];
+                       pt->numType = MULTIVALUES + 1;
+                       pt->lstTypes = (char*)GDKmalloc(sizeof(char) * pt->numType);
+                       pt->lstFreq = (int*)GDKmalloc(sizeof(int) * pt->numType);
+                       for (typeIdx = 0; typeIdx < pt->numType; typeIdx++){
+                               pt->lstFreq[typeIdx] = 0;
+                       }
+               }
+
+               id++;
+       }
+
+       assert(id == numMergedCS);
+
+       return result;
+}
+
+
+/*
+ * Add the types of one subject's properties to the statistics of its table.
+ *
+ * buffTypes[i] is the type recorded for property buffP[i] (0 <= i < numP).
+ * csId is mapped through csTblIdxMapping to a table index; nothing is
+ * counted when that index is -1.
+ *
+ * Note that the property list is sorted by prop's oids
+ * E.g., buffP = {3, 5, 7}
+ * csPropTypes[tblId] contains properties {1,3,4,5,7} with types for each
+ * property and frequency of each <property, type>
+ * Both lists are therefore walked once, in step.
+ */
+static 
+void addPropTypes(char *buffTypes, oid* buffP, int numP, int csId, int* csTblIdxMapping, CSPropTypes* csPropTypes){
+       int i, j;
+       int tblId = csTblIdxMapping[csId];
+       int numProp;
+       PropTypes *lst;
+
+       if (tblId == -1) return;
+
+       numProp = csPropTypes[tblId].numProp;
+       lst = csPropTypes[tblId].lstPropTypes;
+
+       j = 0;
+       for (i = 0; i < numP; i++){
+               /* Stop at the end of the table's list instead of reading past it */
+               while (j < numProp && lst[j].prop != buffP[i]){
+                       j++;
+               }
+               if (j >= numProp){
+                       /* buffP[i] is not in this table; j never moves back,
+                        * so no later property can be matched either */
+                       break;
+               }
+               //j is position of the property buffP[i] in csPropTypes[tblId]
+               lst[j].lstFreq[(int)buffTypes[i]]++;
+       }
+}
+
+/*
+ * Release a CSPropTypes array of numCS entries together with the
+ * per-property type and frequency arrays of every entry whose
+ * freqCSId is not -1.
+ *
+ * The memory is obtained with GDKmalloc in initCSPropTypes(), so it
+ * must be returned with GDKfree rather than the libc free().
+ */
+static
+void freeCSPropTypes(CSPropTypes* csPropTypes, int numCS){
+       int i,j; 
+
+       if (csPropTypes == NULL) return;
+
+       for (i = 0; i < numCS; i++){
+               if (csPropTypes[i].freqCSId != -1){
+                       for (j = 0; j < csPropTypes[i].numProp; j++){
+                               GDKfree(csPropTypes[i].lstPropTypes[j].lstTypes); 
+                               GDKfree(csPropTypes[i].lstPropTypes[j].lstFreq);
+                       }
+                       GDKfree(csPropTypes[i].lstPropTypes); 
+               }
+       }
+       GDKfree(csPropTypes);
+}
+
 static 
 SubCS* creatSubCS(oid subCSId, int numP, char* buff, oid subCSsign){
        SubCS *subcs = (SubCS*) malloc(sizeof(SubCS)); 
@@ -1854,6 +1934,7 @@ void getMaximumFreqCSs(CSset *freqCSset,
        int*    coverage; 
        int*    freq; 
        char    isLabelComparable = 0;
+       char    isDiffLabel = 0;
 
        (void) labels; 
        (void) isLabelComparable;
@@ -1866,34 +1947,30 @@ void getMaximumFreqCSs(CSset *freqCSset,
                if (strcmp(labels[i].name, "DUMMY") != 0) isLabelComparable = 1;
 
                for (j = (i+1); j < numFreqCS; j++){
-                       if (freqCSset->items[j].numProp > 
freqCSset->items[i].numProp){
-                               if (isSubset(freqCSset->items[j].lstProp, 
freqCSset->items[i].lstProp,  
-                                               
freqCSset->items[j].numProp,freqCSset->items[i].numProp) == 1) { 
-                                       /* CSj is a superset of CSi */
-                                       #if USE_LABEL_FINDING_MAXCS
-                                       if (isLabelComparable == 1 && 
strcmp(labels[i].name, labels[j].name) == 0) {
-                                               
freqCSset->items[i].parentFreqIdx = j;
-                                               break;
+                       isDiffLabel = 0; 
+                       #if USE_LABEL_FINDING_MAXCS
+                       if (isLabelComparable == 0 || strcmp(labels[i].name, 
labels[j].name) != 0) {
+                               isDiffLabel = 1; 
+                       }
+                       #endif
+
+                       if (isDiffLabel == 0){
+                               if (freqCSset->items[j].numProp > 
freqCSset->items[i].numProp){
+                                       if 
(isSubset(freqCSset->items[j].lstProp, freqCSset->items[i].lstProp,  
+                                                       
freqCSset->items[j].numProp,freqCSset->items[i].numProp) == 1) { 
+                                               /* CSj is a superset of CSi */
+                                               
freqCSset->items[i].parentFreqIdx = j; 
+                                               break; 
                                        }
-                                       #else   
-                                       freqCSset->items[i].parentFreqIdx = j; 
-                                       #endif
-                                       break; 
                                }
-                       }
-                       else if (freqCSset->items[j].numProp < 
freqCSset->items[i].numProp){
-                               if (isSubset(freqCSset->items[i].lstProp, 
freqCSset->items[j].lstProp,  
-                                               
freqCSset->items[i].numProp,freqCSset->items[j].numProp) == 1) { 
-                                       /* CSj is a subset of CSi */
-                                       #if USE_LABEL_FINDING_MAXCS
-                                       if (isLabelComparable == 1 && 
strcmp(labels[i].name, labels[j].name) == 0) {
-                                               
freqCSset->items[j].parentFreqIdx = i;
-                                       }
-                                       #else
-                                       freqCSset->items[j].parentFreqIdx = i; 
-                                       #endif
-                               }               
-                       
+                               else if (freqCSset->items[j].numProp < 
freqCSset->items[i].numProp){
+                                       if 
(isSubset(freqCSset->items[i].lstProp, freqCSset->items[j].lstProp,  
+                                                       
freqCSset->items[i].numProp,freqCSset->items[j].numProp) == 1) { 
+                                               /* CSj is a subset of CSi */
+                                               
freqCSset->items[j].parentFreqIdx = i; 
+                                       }               
+                               
+                               }
                        }
 
                        //Do not need to consider the case that the numProps 
are the same
@@ -2864,6 +2941,73 @@ str RDFrelationships(int *ret, BAT *sbat
        return MAL_SUCCEED; 
 }
 
+
+
+/*
+ * Walk the triples of sbat (iterators si/pi/oi give subject, predicate
+ * and object of each row) and, for each run of rows with the same
+ * subject, collect the predicates and the type tag of their objects.
+ * Each finished subject is handed to addPropTypes() together with its
+ * CS id (subjCSMap[subject]) so the counts land in csPropTypes.
+ *
+ * maxNumPwithDup sizes the per-subject scratch buffers
+ * (maxNumPwithDup + 1 slots each).
+ *
+ * Sets *ret to 1 and returns MAL_SUCCEED.
+ * NOTE(review): the malloc results are not checked - confirm the
+ * module's out-of-memory convention and add the checks.
+ */
+static 
+str RDFExtractCSPropTypes(int *ret, BAT *sbat, BATiter si, BATiter pi, BATiter oi,  
+               oid *subjCSMap, int* csTblIdxMapping, CSPropTypes* csPropTypes, int maxNumPwithDup){
+
+       BUN             p, q; 
+       oid             *sbt = 0, *obt, *pbt;
+       oid             curS;           /* current Subject oid */
+       int             numPwithDup;    /* Number of properties for current S */
+       char            objType;
+       char*           buffTypes; 
+       oid*            buffP;
+       oid             curP; 
+
+       buffTypes = (char *) malloc(sizeof(char) * (maxNumPwithDup + 1)); 
+       /* buffP holds oids, so it must be sized in oids, not in chars */
+       buffP = (oid *) malloc(sizeof(oid) * (maxNumPwithDup + 1));
+
+       numPwithDup = 0;
+       curS = 0; 
+       curP = 0; 
+
+       BATloop(sbat, p, q){
+               sbt = (oid *) BUNtloc(si, p);           
+               if (*sbt != curS){
+                       if (p != 0){    /* Not the first S */
+                               addPropTypes(buffTypes, buffP, numPwithDup, subjCSMap[curS], csTblIdxMapping, csPropTypes);
+                       }
+                       curS = *sbt; 
+                       numPwithDup = 0;
+                       curP = 0; 
+               }
+                               
+               obt = (oid *) BUNtloc(oi, p); 
+               /* Check type of object: shift away all but the top 4 bits of
+                * the oid and keep the lowest 3 of those as the type tag */
+               objType = (char) ((*obt) >> (sizeof(BUN)*8 - 4))  &  7 ;
+       
+               pbt = (oid *) BUNtloc(pi, p);
+
+               /* numPwithDup > 0: a predicate equal to the reset value of
+                * curP must not be taken for a repeat of a previous one */
+               if (numPwithDup > 0 && curP == *pbt){
+                       #if USE_MULTIPLICITY == 1       
+                       // Update the object type for this P as MULTIVALUES     
+                       buffTypes[numPwithDup-1] = MULTIVALUES; 
+                       #else
+                       buffTypes[numPwithDup] = objType;
+                       /* keep buffP in step with buffTypes for addPropTypes */
+                       buffP[numPwithDup] = *pbt;
+                       numPwithDup++;
+                       #endif
+               }
+               else{                   
+                       buffTypes[numPwithDup] = objType; 
+                       buffP[numPwithDup] = *pbt;
+                       numPwithDup++; 
+                       curP = *pbt; 
+               }
+       }
+       
+       /* Check for the last CS */
+       addPropTypes(buffTypes, buffP, numPwithDup, subjCSMap[curS], csTblIdxMapping, csPropTypes);
+
+       free (buffTypes); 
+       free (buffP);
+
+       *ret = 1; 
+
+       return MAL_SUCCEED; 
+}
+
 static
 void initCsRelBetweenMergeFreqSet(CSmergeRel *csRelBetweenMergeFreqSet, int 
num){
        int i;
@@ -3057,7 +3201,7 @@ int       ontmetadataCount = 0;
 
 /* Extract CS from SPO triples table */
 str
-RDFextractCSwithTypes(int *ret, bat *sbatid, bat *pbatid, bat *obatid, bat 
*mapbatid, int *freqThreshold, void *_freqCSset, oid **subjCSMap, oid 
*maxCSoid, char **subjdefaultMap){
+RDFextractCSwithTypes(int *ret, bat *sbatid, bat *pbatid, bat *obatid, bat 
*mapbatid, int *freqThreshold, void *_freqCSset, oid **subjCSMap, oid 
*maxCSoid, char **subjdefaultMap,int *maxNumPwithDup){
 
        BAT             *sbat = NULL, *pbat = NULL, *obat = NULL, *mbat = NULL; 
        BATiter         si, pi, oi;     /*iterator for BAT of s,p,o columns in 
spo table */
@@ -3067,7 +3211,6 @@ RDFextractCSwithTypes(int *ret, bat *sba
 
        BUN             *maxSoid;       
        int             maxNumProp = 0;
-       int             maxNumPwithDup = 0; 
        char            *csFreqMap; 
        CSrel           *csrelSet;
        CSrel           *csrelToMaxFreqSet, *csrelFromMaxFreqSet;
@@ -3084,6 +3227,7 @@ RDFextractCSwithTypes(int *ret, bat *sba
        clock_t         curT;
        clock_t         tmpLastT; 
 
+
        Labels          *labels;
 
        if ((sbat = BATdescriptor(*sbatid)) == NULL) {
@@ -3134,11 +3278,12 @@ RDFextractCSwithTypes(int *ret, bat *sba
        
        tmpLastT = clock();
 
+       *maxNumPwithDup  = 0;
        //Phase 1: Assign an ID for each CS
        #if STOREFULLCS
-       RDFassignCSId(ret, sbat, si, pi, oi, freqCSset, freqThreshold, csBats, 
*subjCSMap, maxCSoid, &maxNumProp, &maxNumPwithDup);
+       RDFassignCSId(ret, sbat, si, pi, oi, freqCSset, freqThreshold, csBats, 
*subjCSMap, maxCSoid, &maxNumProp, maxNumPwithDup);
        #else
-       RDFassignCSId(ret, sbat, si, pi, freqCSset, freqThreshold, csBats, 
*subjCSMap, maxCSoid, &maxNumProp, &maxNumPwithDup);
+       RDFassignCSId(ret, sbat, si, pi, freqCSset, freqThreshold, csBats, 
*subjCSMap, maxCSoid, &maxNumProp, maxNumPwithDup);
        #endif
        
        curT = clock(); 
@@ -3149,7 +3294,7 @@ RDFextractCSwithTypes(int *ret, bat *sba
 
        printf("Max CS oid: " BUNFMT "\n", *maxCSoid);
 
-       printf("Max Number of P (considering duplicated P): %d \n", 
maxNumPwithDup);
+       printf("Max Number of P (considering duplicated P): %d \n", 
*maxNumPwithDup);
 
        csFreqMap = (char*) malloc(sizeof(char) * (*maxCSoid +1)); 
        initCharArray(csFreqMap, *maxCSoid +1, 0); 
@@ -3163,7 +3308,7 @@ RDFextractCSwithTypes(int *ret, bat *sba
 
        csSubCSSet = initCS_SubCSSets(*maxCSoid +1); 
 
-       RDFrelationships(ret, sbat, si, pi, oi, *subjCSMap, subjSubCSMap, 
csSubCSSet, csrelSet, *maxSoid, maxNumPwithDup);
+       RDFrelationships(ret, sbat, si, pi, oi, *subjCSMap, subjSubCSMap, 
csSubCSSet, csrelSet, *maxSoid, *maxNumPwithDup);
 
        curT = clock(); 
        printf (" ----- Exploring subCSs and FKs took  %f seconds.\n", 
((float)(curT - tmpLastT))/CLOCKS_PER_SEC);
@@ -3229,6 +3374,7 @@ RDFextractCSwithTypes(int *ret, bat *sba
        curT = clock(); 
        printf (" ----- Merging Frequent CSs took  %f seconds.\n", 
((float)(curT - tmpLastT))/CLOCKS_PER_SEC);
        tmpLastT = curT;                
+       
 
        csRelBetweenMergeFreqSet = (CSmergeRel *) malloc (sizeof(CSmergeRel) * 
freqCSset->numCSadded);
        initCsRelBetweenMergeFreqSet(csRelBetweenMergeFreqSet, 
freqCSset->numCSadded);
@@ -3438,23 +3584,15 @@ str triplesubsort(BAT **sbat, BAT **pbat
 }
 
 static
-void initCStablesAndIdxMapping(CStableStat* cstablestat, CSset* freqCSset, 
int* csTblIdxMapping, int* mfreqIdxTblIdxMapping, int* mTblIdxFreqIdxMapping){
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
http://mail.monetdb.org/mailman/listinfo/checkin-list

Reply via email to