Changeset: 182a90c795fc for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=182a90c795fc
Modified Files:
        monetdb5/extras/rdf/rdflabels.c
        monetdb5/extras/rdf/rdfretrieval.c
        monetdb5/extras/rdf/rdfretrieval.h
        sql/backends/monet5/sql.mx
Branch: rdf
Log Message:

Heuristic to maximize the edge-weights
Maximize the number of relationships between tuples in the selected subschema


diffs (273 lines):

diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -414,16 +414,16 @@ void convertToSQL(CSset *freqCSset, Rela
 
 static
 void createSQLMetadata(CSset* freqCSset, CSmergeRel* csRelBetweenMergeFreqSet, 
Labels* labels) {
-       char    **matrix = NULL; // matrix[from][to]
+       int     **matrix = NULL; // matrix[from][to] frequency
        int     i, j, k;
        FILE    *fout;
 
        // init
-       matrix = (char **) malloc(sizeof(char *) * freqCSset->numCSadded);
+       matrix = (int **) malloc(sizeof(int *) * freqCSset->numCSadded);
        if (!matrix) fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
 
        for (i = 0; i < freqCSset->numCSadded; ++i) {
-               matrix[i] = (char *) malloc(sizeof(char *) * 
freqCSset->numCSadded);
+               matrix[i] = (int *) malloc(sizeof(char *) * 
freqCSset->numCSadded);
                if (!matrix) fprintf(stderr, "ERROR: Couldn't realloc 
memory!\n");
 
                for (j = 0; j < freqCSset->numCSadded; ++j) {
@@ -449,7 +449,7 @@ void createSQLMetadata(CSset* freqCSset,
                                        int to = 
csRelBetweenMergeFreqSet[i].lstRefFreqIdx[k];
                                        if (i == to) continue; // ignore self 
references
                                        if ((int) (100.0 * 
csRelBetweenMergeFreqSet[i].lstCnt[k] / sum + 0.5) < FK_FREQ_THRESHOLD) 
continue; // foreign key is not frequent enough
-                                       matrix[i][to] = 1;
+                                       matrix[i][to] += 
csRelBetweenMergeFreqSet[i].lstCnt[k]; // multiple links from 'i' to 'to'? add 
the frequencies
                                }
                        }
                }
@@ -460,7 +460,7 @@ void createSQLMetadata(CSset* freqCSset,
        for (i = 0; i < freqCSset->numCSadded; ++i) {
                for (j = 0; j < freqCSset->numCSadded; ++j) {
                        if (matrix[i][j]) {
-                               fprintf(fout, "\"%d\",\"%d\"\n",i,j);
+                               fprintf(fout, "\"%d\",\"%d\",\"%d\"\n", i, j, 
matrix[i][j]);
                        }
                }
        }
@@ -480,7 +480,7 @@ void createSQLMetadata(CSset* freqCSset,
 
        fout = fopen("CSmetadata.sql", "wt");
        fprintf(fout, "CREATE TABLE table_id_freq (id VARCHAR(10), name 
VARCHAR(100), frequency VARCHAR(10));\n");
-       fprintf(fout, "CREATE TABLE adjacency_list (from_id VARCHAR(10), to_id 
VARCHAR(10));\n");
+       fprintf(fout, "CREATE TABLE adjacency_list (from_id VARCHAR(10), to_id 
VARCHAR(10), frequency VARCHAR(10));\n");
        fprintf(fout, "COPY INTO table_id_freq from 
'/export/scratch2/linnea/dbfarm/test/tableIdFreq.csv' USING DELIMITERS 
',','\\n','\"';\n");
        fprintf(fout, "COPY INTO adjacency_list from 
'/export/scratch2/linnea/dbfarm/test/adjacencyList.csv' USING DELIMITERS 
',','\\n','\"';");
        fclose(fout);
diff --git a/monetdb5/extras/rdf/rdfretrieval.c 
b/monetdb5/extras/rdf/rdfretrieval.c
--- a/monetdb5/extras/rdf/rdfretrieval.c
+++ b/monetdb5/extras/rdf/rdfretrieval.c
@@ -33,6 +33,15 @@ int edgeExists(long int from, long int t
 }
 
 static
+int getTableIndex(long int id, long int* table_id, int tableCount) {
+       int i;
+       for (i = 0; i < tableCount; ++i) {
+               if (table_id[i] == id) return i;
+       }
+       return -1;
+}
+
+static
 NodeStat* initNodeStats1(long int* table_freq, int tableCount) {
        NodeStat*       nodeStats = NULL;
        int             i;
@@ -466,12 +475,95 @@ int* retrieval3(int root, int numNodesMa
        return chosenNodes;
 }
 
-int* retrieval(int root, int numNodesMax, int* numNodesActual, long int* 
table_id, str* table_name, long int* table_freq, int tableCount, long int* 
adjacency_from, long int* adjacency_to, int adjacencyCount) {
-       if (SUBSCHEMA_HEURISTIC == 3) {
+static
+int* retrieval4(int root, int numNodesMax, int* numNodesActual, long int* 
table_id, str* table_name, long int* table_freq, int tableCount, long int* 
adjacency_from, long int* adjacency_to, long int* adjacency_freq, int 
adjacencyCount) {
+       int             numNodes;
+       int             *chosenNodes = NULL;
+       int             i, j;
+       int             sumSubjects = 0;
+       int             csCount = 0;
+       int             sumChosenSubjects = 0;
+
+       if (numNodesMax < 1) fprintf(stderr, "ERROR: numNodesMax < 1!\n");
+
+       chosenNodes = (int *) malloc(sizeof(int) * numNodesMax);
+       if (!chosenNodes) fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+
+       numNodes = 0;
+
+       // add root node
+       chosenNodes[numNodes] = root;
+       numNodes += 1;
+
+       // add nodes
+       while (numNodes < numNodesMax) {
+               int bestNextEdge = -1;
+               for (i = 0; i < adjacencyCount; ++i) {
+                       char foundFrom = 0;
+                       char foundTo = 0;
+                       for (j = 0; j < numNodes; ++j) {
+                               if (chosenNodes[j] == 
getTableIndex(adjacency_to[i], table_id, tableCount)) {
+                                       foundTo = 1;
+                                       break;
+                               }
+                       }
+                       for (j = 0; j < numNodes; ++j) {
+                               if (chosenNodes[j] == 
getTableIndex(adjacency_from[i], table_id, tableCount)) {
+                                       foundFrom = 1;
+                                       break;
+                               }
+                       }
+                       if (foundFrom && !foundTo) {
+                               // set or update
+                               if (bestNextEdge == -1) {
+                                       // first edge
+                                       bestNextEdge = i;
+                               } else {
+                                       if (adjacency_freq[i] > 
adjacency_freq[bestNextEdge]) bestNextEdge = i;
+                               }
+                       }
+               }
+               if (bestNextEdge == -1) {
+                       // no more edges
+                       break;
+               } else {
+                       chosenNodes[numNodes] = 
getTableIndex(adjacency_to[bestNextEdge], table_id, tableCount);
+                       numNodes += 1;
+               }
+       }
+
+       printf("SUBSCHEMA:\n");
+       for (i = 0; i < numNodes; ++i) {
+               str name = table_name[chosenNodes[i]];
+               printf("CS %s\n", name);
+       }
+
+       // statistics
+       for (i = 0; i < tableCount; ++i) {
+               csCount += 1;
+               sumSubjects += table_freq[i];
+               for (j = 0; j < numNodes; ++j) {
+                       if (chosenNodes[j] == i) sumChosenSubjects += 
table_freq[i];
+               }
+       }
+       printf("COVERAGE:\n");
+       printf("%d out of %d (%f %%) using %d out of %d tables (%f %%)\n", 
sumChosenSubjects, sumSubjects, (100.00 * sumChosenSubjects) / sumSubjects, 
numNodes, csCount, (100.00 * numNodes) / csCount);
+
+       *numNodesActual = numNodes;
+       return chosenNodes;
+}
+
+int* retrieval(int root, int numNodesMax, int* numNodesActual, long int* 
table_id, str* table_name, long int* table_freq, int tableCount, long int* 
adjacency_from, long int* adjacency_to, long int* adjacency_freq, int 
adjacencyCount) {
+       if (SUBSCHEMA_HEURISTIC == 4) {
+               return retrieval4(root, numNodesMax, numNodesActual, table_id, 
table_name, table_freq, tableCount, adjacency_from, adjacency_to, 
adjacency_freq, adjacencyCount);
+       } else if (SUBSCHEMA_HEURISTIC == 3) {
+               (void) adjacency_freq;
                return retrieval3(root, numNodesMax, numNodesActual, table_id, 
table_name, table_freq, tableCount, adjacency_from, adjacency_to, 
adjacencyCount);
        } else if (SUBSCHEMA_HEURISTIC == 2) {
+               (void) adjacency_freq;
                return retrieval2(root, numNodesMax, numNodesActual, table_id, 
table_name, table_freq, tableCount, adjacency_from, adjacency_to, 
adjacencyCount);
        }
        // SUBSCHEMA_HEURISTIC == 1 or other value
+       (void) adjacency_freq;
        return retrieval1(root, numNodesMax, numNodesActual, table_id, 
table_name, table_freq, tableCount, adjacency_from, adjacency_to, 
adjacencyCount);
 };
diff --git a/monetdb5/extras/rdf/rdfretrieval.h 
b/monetdb5/extras/rdf/rdfretrieval.h
--- a/monetdb5/extras/rdf/rdfretrieval.h
+++ b/monetdb5/extras/rdf/rdfretrieval.h
@@ -29,8 +29,8 @@ typedef struct NodeStat {
        int     origWeight;     // = CS frequency
 } NodeStat;
 
-#define SUBSCHEMA_HEURISTIC 1
+#define SUBSCHEMA_HEURISTIC 4
 
-int* retrieval(int root, int numNodesMax, int* numNodesActual, long int* 
table_id, str* table_name, long int* table_freq, int tableCount, long int* 
adjacency_from, long int* adjacency_to, int adjacencyCount);
+int* retrieval(int root, int numNodesMax, int* numNodesActual, long int* 
table_id, str* table_name, long int* table_freq, int tableCount, long int* 
adjacency_from, long int* adjacency_to, long int* adjacency_freq, int 
adjacencyCount);
 
 #endif /* _RDFRETRIEVAL_H_ */
diff --git a/sql/backends/monet5/sql.mx b/sql/backends/monet5/sql.mx
--- a/sql/backends/monet5/sql.mx
+++ b/sql/backends/monet5/sql.mx
@@ -7778,13 +7778,13 @@ SQLrdfRetrieveSubschema(Client cntxt, Ma
        int             i;
 
        BAT             *table_id_freq_id, *table_id_freq_name, 
*table_id_freq_freq;
-       BAT             *adjacency_list_from, *adjacency_list_to;
+       BAT             *adjacency_list_from, *adjacency_list_to, 
*adjacency_list_freq;
        BATiter         table_id_freq_id_i, table_id_freq_name_i, 
table_id_freq_freq_i;
-       BATiter         adjacency_list_from_i, adjacency_list_to_i;
+       BATiter         adjacency_list_from_i, adjacency_list_to_i, 
adjacency_list_freq_i;
        BUN             p, q;
        BUN             p2, q2;
        BUN             bun, bun2, bun3;
-       BUN             bun4, bun5;
+       BUN             bun4, bun5, bun6;
 
        int             tableCount = 0;
        long int        *table_id = NULL;
@@ -7792,6 +7792,7 @@ SQLrdfRetrieveSubschema(Client cntxt, Ma
        long int        *table_freq = NULL;
        long int        *adjacency_from = NULL;
        long int        *adjacency_to = NULL;
+       long int        *adjacency_freq = NULL;
        int             adjacencyCount = 0;
 
        int             root;
@@ -7810,18 +7811,21 @@ SQLrdfRetrieveSubschema(Client cntxt, Ma
        table_id_freq_freq = mvc_bind(m, "sys", "table_id_freq", "frequency", 
0);
        adjacency_list_from = mvc_bind(m, "sys", "adjacency_list", "from_id", 
0);
        adjacency_list_to = mvc_bind(m, "sys", "adjacency_list", "to_id", 0);
+       adjacency_list_freq = mvc_bind(m, "sys", "adjacency_list", "frequency", 
0);
 
        table_id_freq_id_i = bat_iterator(table_id_freq_id);
        table_id_freq_name_i = bat_iterator(table_id_freq_name);
        table_id_freq_freq_i = bat_iterator(table_id_freq_freq);
        adjacency_list_from_i = bat_iterator(adjacency_list_from);
        adjacency_list_to_i = bat_iterator(adjacency_list_to);
+       adjacency_list_freq_i = bat_iterator(adjacency_list_freq);
 
        bun = BUNfirst(table_id_freq_id);
        bun2 = BUNfirst(table_id_freq_name);
        bun3 = BUNfirst(table_id_freq_freq);
        bun4 = BUNfirst(adjacency_list_from);
        bun5 = BUNfirst(adjacency_list_to);
+       bun6 = BUNfirst(adjacency_list_freq);
 
        // store data
        BATloop(table_id_freq_id, p, q) {
@@ -7844,13 +7848,16 @@ SQLrdfRetrieveSubschema(Client cntxt, Ma
        BATloop(adjacency_list_from, p2, q2) {
                str from = (str) BUNtail(adjacency_list_from_i, bun4 + 
adjacencyCount);
                str to = (str) BUNtail(adjacency_list_to_i, bun5 + 
adjacencyCount);
+               str freq = (str) BUNtail(adjacency_list_freq_i, bun6 + 
adjacencyCount);
 
                adjacency_from = realloc(adjacency_from, sizeof(long int) * 
(adjacencyCount + 1));
                adjacency_to = realloc(adjacency_to, sizeof(long int) * 
(adjacencyCount + 1));
-               if (!adjacency_from || !adjacency_to) fprintf(stderr, "ERROR: 
Couldn't realloc memory!\n");
+               adjacency_freq = realloc(adjacency_freq, sizeof(long int) * 
(adjacencyCount + 1));
+               if (!adjacency_from || !adjacency_to || !adjacency_freq) 
fprintf(stderr, "ERROR: Couldn't realloc memory!\n");
 
                adjacency_from[adjacencyCount] = strtol(from, NULL, 10);
                adjacency_to[adjacencyCount] = strtol(to, NULL, 10);
+               adjacency_freq[adjacencyCount] = strtol(freq, NULL, 10);
                adjacencyCount += 1;
        }
 
@@ -7860,7 +7867,7 @@ SQLrdfRetrieveSubschema(Client cntxt, Ma
 
        // call retrieval function
        countActual = 0;
-       nodes = retrieval(root, *count, &countActual, table_id, table_name, 
table_freq, tableCount, adjacency_from, adjacency_to, adjacencyCount);
+       nodes = retrieval(root, *count, &countActual, table_id, table_name, 
table_freq, tableCount, adjacency_from, adjacency_to, adjacency_freq, 
adjacencyCount);
 
        // create schema
        name = "s123"; // TODO create schema name
@@ -7893,12 +7900,14 @@ SQLrdfRetrieveSubschema(Client cntxt, Ma
        BBPreclaim(table_id_freq_freq);
        BBPreclaim(adjacency_list_from);
        BBPreclaim(adjacency_list_to);
+       BBPreclaim(adjacency_list_freq);
 
        free(table_id);
        free(table_name);
        free(table_freq);
        free(adjacency_from);
        free(adjacency_to);
+       free(adjacency_freq);
 
        return MAL_SUCCEED;
 }
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
http://mail.monetdb.org/mailman/listinfo/checkin-list

Reply via email to