Changeset: f09253dc1656 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=f09253dc1656
Modified Files:
        monetdb5/extras/rdf/rdfschema.c
        monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

Modify merging rules so that dimension tables will not be merged


diffs (264 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -420,6 +420,25 @@ void getIRNums(CSrel *csrelSet, int num,
        free(lastIRScores);
 }
 
+/* Tag as DIMENSIONCS every item i in [0, num) of freqCSset whose refCount[i] >= its support and whose curIRScores[i] >= IR_DIMENSION_THRESHOLD, then print how many were tagged. */
+static 
+void updateFreqCStype(CSset *freqCSset, int num,  float *curIRScores, int 
*refCount){
+
+       int     i; 
+       int     numDimensionCS = 0; /* number of items tagged so far; only used for the report below */
+
+       for (i = 0; i < num; i++){
+               if (refCount[i] < freqCSset->items[i].support) continue; /* skip: reference count is below this CS's own support */
+               if (curIRScores[i] < IR_DIMENSION_THRESHOLD) continue; /* skip: IR score is below the dimension cutoff */
+
+               freqCSset->items[i].type = DIMENSIONCS;
+               numDimensionCS++;
+       }
+
+       printf("There are %d dimension CSs \n", numDimensionCS); /* diagnostic output only; the function returns nothing */
+
+}
+
 #if NEEDSUBCS
 static 
 void setdefaultSubCSs(SubCSSet *subcsset, int num, BAT *sbat, oid 
*subjSubCSMap,oid *subjCSMap, char *subjdefaultMap){
@@ -1421,7 +1440,6 @@ int* getDistinctList(int *lstMergeCSFreq
 
 }
 
-
 /* Calculate number of consistsOf in the merged CS 
  and  Update support and coverage: Total of all suppors */
 
@@ -2491,6 +2509,10 @@ void mergeCSbyS4(CSset *freqCSset, CSlab
        for (i = 0; i < numMergeCS; i++){
                freqId1 = mergeCSFreqCSMap[i]; 
                if (freqCSset->items[freqId1].parentFreqIdx != -1) continue;
+               #if     NOT_MERGE_DIMENSIONCS
+               if (freqCSset->items[freqId1].type == DIMENSIONCS) continue; 
+               #endif
+
                #if USE_LABEL_FINDING_MAXCS
                isLabelComparable = 0;
                if ((*labels)[i].name != BUN_NONE) isLabelComparable = 1; // no 
"DUMMY"
@@ -2498,6 +2520,10 @@ void mergeCSbyS4(CSset *freqCSset, CSlab
 
                for (j = (i+1); j < numMergeCS; j++){
                        freqId2 = mergeCSFreqCSMap[j];
+                       #if     NOT_MERGE_DIMENSIONCS
+                       if (freqCSset->items[freqId2].type == DIMENSIONCS) 
continue; 
+                       #endif
+
                        isDiffLabel = 0; 
                        #if USE_LABEL_FINDING_MAXCS
                        if (isLabelComparable == 0 || 
strcmp((*labels)[freqId1].name, (*labels)[freqId2].name) != 0) {
@@ -3182,6 +3208,9 @@ str mergeMaxFreqCSByS1(CSset *freqCSset,
                        for (j = k+1; j < labelStat->lstCount[i]; j++){
                                freqId2 = labelStat->freqIdList[i][j];
                                cs2 = &(freqCSset->items[freqId2]);
+                               #if     NOT_MERGE_DIMENSIONCS
+                               if (cs2->type == DIMENSIONCS) continue; 
+                               #endif
                                if ((*labels)[freqId2].isOntology == 1){
                                        doMerge(freqCSset, S1, cs1, cs2, 
freqId1, freqId2, mergecsId, labels, ontmetadata, ontmetadataCount, *name);
                                        tmpCount++;
@@ -3199,6 +3228,9 @@ str mergeMaxFreqCSByS1(CSset *freqCSset,
                        for (j = k+1; j < labelStat->lstCount[i]; j++){
                                freqId2 = labelStat->freqIdList[i][j];
                                cs2 = &(freqCSset->items[freqId2]);
+                               #if     NOT_MERGE_DIMENSIONCS
+                               if (cs2->type == DIMENSIONCS) continue; 
+                               #endif
                                if ((*labels)[freqId2].isType == 1){
                                        doMerge(freqCSset, S1, cs1, cs2, 
freqId1, freqId2, mergecsId, labels, ontmetadata, ontmetadataCount, *name);
                                        tmpCount++;
@@ -3216,6 +3248,9 @@ str mergeMaxFreqCSByS1(CSset *freqCSset,
                        for (j = k+1; j < labelStat->lstCount[i]; j++){
                                freqId2 = labelStat->freqIdList[i][j];
                                cs2 = &(freqCSset->items[freqId2]);
+                               #if     NOT_MERGE_DIMENSIONCS
+                               if (cs2->type == DIMENSIONCS) continue; 
+                               #endif
                                if ((*labels)[freqId2].isFK == 1){
                                        doMerge(freqCSset, S1, cs1, cs2, 
freqId1, freqId2, mergecsId, labels, ontmetadata, ontmetadataCount, *name);
                                        tmpCount++;
@@ -3269,11 +3304,7 @@ void mergeMaxFreqCSByS6(CSrel *csrelMerg
        int             j, k; 
        #if             !USE_MULTIWAY_MERGING
        int             freqId1, freqId2;
-       int             m; 
-       CS              *mergecs;
-       int             existMergecsId; 
        CS              *cs1, *cs2;
-       CS              *existmergecs, *mergecs1, *mergecs2; 
        #else
        int             *lstDistinctFreqId = NULL;              
        int             numDistinct = 0;
@@ -3285,6 +3316,7 @@ void mergeMaxFreqCSByS6(CSrel *csrelMerg
        FILE            *fout; 
        int             maxNumPropInMergeCS =0;
        //int           numCombinedP = 0; 
+       int             startIdx = 0; 
        
        printf("Start merging CS by using S6 \n");
 
@@ -3329,54 +3361,31 @@ void mergeMaxFreqCSByS6(CSrel *csrelMerg
                                                updateLabel(S5, freqCSset, 
labels, isNew, mergeFreqIdx, -1, -1, BUN_NONE, ontmetadata, ontmetadataCount, 
lstDistinctFreqId, numDistinct);
                                        }
                                        #else
-                                       freqId1 = csRelSum->freqIdList[j][0];
+
+                                       startIdx = 0;
+                                       #if     NOT_MERGE_DIMENSIONCS
+                                       while(startIdx < 
csRelSum->numPropRef[j]) {
+                                               freqId1 = 
csRelSum->freqIdList[j][startIdx];
+                                               cs1 = (CS*) 
&(freqCSset->items[freqId1]);
+                                               if (cs1->type == DIMENSIONCS) 
+                                                       startIdx++;
+                                               else 
+                                                       break;
+                                       }
+                                       #else
+                                       freqId1 = 
csRelSum->freqIdList[j][startIdx];
                                        cs1 = (CS*) 
&(freqCSset->items[freqId1]);
-                                       for (k = 1; k < 
csRelSum->numPropRef[j]; k++){
+                                       #endif
+
+                                       for (k = (startIdx+1); k < 
csRelSum->numPropRef[j]; k++){
                                                freqId2 = 
csRelSum->freqIdList[j][k];
                                                cs2 = (CS*) 
&(freqCSset->items[freqId2]);
-                                               //Check whether these CS's 
belong to any mergeCS
-                                               if (cs1->parentFreqIdx == -1 && 
cs2->parentFreqIdx == -1){      /* New merge */
-                                                       mergecs = 
mergeTwoCSs(*cs1,*cs2, freqId1,freqId2, *mergecsId);
-                                                       
//addmergeCStoSet(mergecsSet, *mergecs);
-                                                       cs1->parentFreqIdx = 
freqCSset->numCSadded;
-                                                       cs2->parentFreqIdx = 
freqCSset->numCSadded;
-                                                       //printf("Merge into %d 
\n", freqCSset->numCSadded);
-                                                       
addCStoSet(freqCSset,*mergecs);
-                                                       free(mergecs);
-
-                                                       mergecsId[0]++;
-                                                       updateLabel(S5, 
freqCSset, labels, 1, freqCSset->numCSadded - 1, freqId1, freqId2, BUN_NONE, 
ontmetadata, ontmetadataCount, NULL, -1);
-                                               }
-                                               else if (cs1->parentFreqIdx == 
-1 && cs2->parentFreqIdx != -1){
-                                                       existMergecsId = 
cs2->parentFreqIdx;
-                                                       existmergecs = (CS*) 
&(freqCSset->items[existMergecsId]);
-                                                       
mergeACStoExistingmergeCS(*cs1,freqId1, existmergecs);
-                                                       cs1->parentFreqIdx = 
existMergecsId; 
-                                                       //printf("Merge into 
"BUNFMT" \n", existMergecsId);
-                                                       updateLabel(S5, 
freqCSset, labels, 0, existMergecsId, freqId1, freqId2, BUN_NONE, ontmetadata, 
ontmetadataCount, NULL, -1);
-                                                       
-                                               }
-                                               
-                                               else if (cs1->parentFreqIdx != 
-1 && cs2->parentFreqIdx == -1){
-                                                       existMergecsId = 
cs1->parentFreqIdx;
-                                                       existmergecs = 
(CS*)&(freqCSset->items[existMergecsId]);
-                                                       
mergeACStoExistingmergeCS(*cs2,freqId2, existmergecs);
-                                                       cs2->parentFreqIdx = 
existMergecsId; 
-                                                       //printf("Merge into 
"BUNFMT" \n", existMergecsId);
-                                                       updateLabel(S5, 
freqCSset, labels, 0, existMergecsId, freqId1, freqId2, BUN_NONE, ontmetadata, 
ontmetadataCount, NULL, -1);
-                                               }
-                                               else if (cs1->parentFreqIdx != 
cs2->parentFreqIdx){
-                                                       mergecs1 = 
(CS*)&(freqCSset->items[cs1->parentFreqIdx]);
-                                                       mergecs2 = 
(CS*)&(freqCSset->items[cs2->parentFreqIdx]);
-                                                       
-                                                       
mergeTwomergeCS(mergecs1, mergecs2, cs1->parentFreqIdx);
-                                                       //printf("Merge into %d 
\n", cs1->parentFreqIdx);
-                                                       //Re-map for all maxCS 
in mergecs2
-                                                       for (m = 0; m < 
mergecs2->numConsistsOf; m++){
-                                                               
freqCSset->items[mergecs2->lstConsistsOf[m]].parentFreqIdx = cs1->parentFreqIdx;
-                                                       }
-                                                       updateLabel(S5, 
freqCSset, labels, 0, cs1->parentFreqIdx, freqId1, freqId2, BUN_NONE, 
ontmetadata, ontmetadataCount, NULL, -1);
-                                               }
+
+                                               #if     NOT_MERGE_DIMENSIONCS
+                                               if (cs2->type == DIMENSIONCS) 
continue; 
+                                               #endif
+
+                                               doMerge(freqCSset, S5, cs1, 
cs2, freqId1, freqId2, mergecsId, labels, ontmetadata, ontmetadataCount, 
BUN_NONE);
 
                                        }
 
@@ -3514,9 +3523,16 @@ void mergeCSByS3S5(CSset *freqCSset, CSl
                if ((*labels)[freqId1].name != BUN_NONE) isLabelComparable = 1; 
// no "DUMMY"
 
                cs1 = (CS*) &(freqCSset->items[freqId1]);
+                               
+               #if     NOT_MERGE_DIMENSIONCS
+               if (cs1->type == DIMENSIONCS) continue; 
+               #endif
                for (j = (i+1); j < curNumMergeCS; j++){
                        freqId2 = mergeCSFreqCSMap[j];
                        cs2 = (CS*) &(freqCSset->items[freqId2]);
+                       #if     NOT_MERGE_DIMENSIONCS
+                       if (cs2->type == DIMENSIONCS) continue; 
+                       #endif
                        isSameLabel = 0; 
 
                        #if     USE_LABEL_FOR_MERGING
@@ -4342,7 +4358,9 @@ str RDFrelationships(int *ret, BAT *sbat
 }
 
 
-
+/* 
+ * Add highly referenced CSs to freqCSset, and update the frequency and coverage 
of each freqCS
+ * */
 static
 str addHighRefCSsToFreqCS(BAT *pOffsetBat, BAT *freqBat, BAT *coverageBat, BAT 
*fullPBat, 
                int* refCount, CSset *freqCSset, int *csIdFreqIdxMap, int 
numCS, int threshold){
@@ -5359,7 +5377,9 @@ RDFextractCSwithTypes(int *ret, bat *sba
        refCount = (int *) malloc(sizeof(int) * (*maxCSoid + 1));
        initIntArray(refCount, (*maxCSoid + 1), 0); 
        RDFgetRefCounts(ret, sbat, si, pi,oi, *subjCSMap, maxNumProp, *maxSoid, 
refCount);
+
        addHighRefCSsToFreqCS(csBats->pOffsetBat, csBats->freqBat, 
csBats->coverageBat, csBats->fullPBat, refCount, freqCSset, csIdFreqIdxMap, 
*maxCSoid + 1, HIGH_REFER_THRESHOLD * (*freqThreshold)); 
+
        free(refCount);
        curT = clock();
        printf (" ----- Counting references and adding highly referred CS's 
took  %f seconds.\n", ((float)(curT - tmpLastT))/CLOCKS_PER_SEC);
@@ -5421,7 +5441,8 @@ RDFextractCSwithTypes(int *ret, bat *sba
        initIntArray(refCount, freqCSset->numCSadded, 0); 
 
        getOrigRefCount(csrelSet, freqCSset->numCSadded, refCount);  
-       getIRNums(csrelSet, freqCSset->numCSadded, refCount, 
curIRScores,NUM_ITERATION_FOR_IR);  
+       getIRNums(csrelSet, freqCSset->numCSadded, refCount, curIRScores, 
NUM_ITERATION_FOR_IR);  
+       updateFreqCStype(freqCSset, freqCSset->numCSadded, curIRScores, 
refCount);
 
        free(refCount); 
        free(curIRScores);
diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h
--- a/monetdb5/extras/rdf/rdfschema.h
+++ b/monetdb5/extras/rdf/rdfschema.h
@@ -52,7 +52,8 @@ typedef enum {
        NORMALCS, 
        FREQCS, 
        MAXCS, 
-       MERGECS
+       MERGECS,
+       DIMENSIONCS
 } CStype; 
 
 typedef struct {
@@ -119,8 +120,16 @@ typedef struct PropStat {
                                        */
 #define INFREQ_TYPE_THRESHOLD  0.1     /* Threshold that a type is consider as 
an infrequent type */
 
+
+
+/* ---- For detecting dimension table */
 #define        NUM_ITERATION_FOR_IR    3       /* Number of iteration for 
indirect referrences to a CS (table) */
 
+#define IR_DIMENSION_THRESHOLD 100000  /* Minimum indirect-reference (IR) score at which 
a CS can be considered a dimension CS. 
+                                          The number of IR references should be 
several times larger than the CS frequency.
+                                       */
+#define NOT_MERGE_DIMENSIONCS  1       /* When 1, the merging rules skip CSs whose type is DIMENSIONCS */
+
 typedef struct CS
 {
        oid     csId;           //Id of the CS
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to