Changeset: 9a9a115446e0 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=9a9a115446e0
Modified Files:
        monetdb5/extras/rdf/rdfretrieval.c
        monetdb5/extras/rdf/rdfretrieval.h
Branch: rdf
Log Message:

Schema overview: first version of algorithm to choose tables that provide an 
overview of the SQL schema


diffs (289 lines):

diff --git a/monetdb5/extras/rdf/rdfretrieval.c 
b/monetdb5/extras/rdf/rdfretrieval.c
--- a/monetdb5/extras/rdf/rdfretrieval.c
+++ b/monetdb5/extras/rdf/rdfretrieval.c
@@ -553,8 +553,257 @@ int* retrieval4(int root, int numNodesMa
        return chosenNodes;
 }
 
+static
+char** initEdgesOverview(long int* table_id, int tableCount, long int* 
adjacency_from, long int* adjacency_to, int adjacencyCount) {
+       char            **edges;
+       int             i, j;
+
+       edges = (char **) malloc(sizeof(char *) * tableCount);
+       if (!edges) fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+
+       for (i = 0; i < tableCount; ++i) {
+               edges[i] = (char *) malloc(sizeof(char) * tableCount);
+               if (!edges[i]) fprintf(stderr, "ERROR: Couldn't malloc 
memory!\n");
+               for (j = 0; j < tableCount; ++j) {
+                       edges[i][j] = 0;
+               }
+               edges[i][i] = 1; // self-reachability
+       }
+
+       for (i = 0; i < adjacencyCount; ++i) {
+               long int from = adjacency_from[i];
+               long int to = adjacency_to[i];
+               int fromIdx = -1;
+               int toIdx = -1;
+
+               // index lookup
+               for (j = 0; j < tableCount; ++j) {
+                       if (table_id[j] == from) {fromIdx = j;}
+                       if (table_id[j] == to) {toIdx = j;}
+                       if (fromIdx > -1 && toIdx > -1) {break;}
+               }
+               assert(fromIdx > -1);
+               assert(toIdx > -1);
+
+               // set edge
+               edges[fromIdx][toIdx] = 1;
+       }
+
+       return edges;
+}
+
+static
+int compareOverviewNodes (const void * a, const void * b) {
+  return ( (*(Node*)b).reachabilityCount - (*(Node*)a).reachabilityCount ); // 
sort descending
+}
+
+static
+int* retrievalOverview(int* numNodesActual, long int* table_id, str* 
table_name, long int* table_freq, int tableCount, long int* adjacency_from, 
long int* adjacency_to, int adjacencyCount) {
+       int             i, j, k;
+       char            **edges;
+       int             sumSubjects = 0;
+       int             csCount = 0;
+       int             sumChosenSubjects = 0;
+
+       int             queue[tableCount]; // cyclic array
+       int             isInQueue[tableCount];
+       int             queuePosition; // next element in queue to view at
+       int             queueLength;
+       char            visited[tableCount];
+       int             subgraphSize;
+       Groups          groups;
+       int             *chosenNodes = NULL;
+
+       groups.count = 0;
+       groups.groups = NULL;
+
+       edges = initEdgesOverview(table_id, tableCount, adjacency_from, 
adjacency_to, adjacencyCount);
+
+       for (i = 0; i < tableCount; ++i) {
+               visited[i] = 0;
+       }
+
+       // split into disconnected subgraph (ignoring the direction of the 
edges) using BFS
+       while (1) {
+               int root = -1;
+               for (i = 0; i < tableCount; ++i) {
+                       if (!visited[i]) {
+                               root = i;
+                               break;
+                       }
+               }
+               if (root == -1) break; // all nodes have been visited, all 
subgraphs have been found
+               // init
+               subgraphSize = 0;
+
+               for (i = 0; i < tableCount; ++i) {
+                       queue[i] = -1;
+                       isInQueue[i] = 0;
+               }
+
+               // add root node
+               queue[0] = root;
+               queuePosition = 0;
+               queueLength = 1;
+
+               visited[root] = 1;
+               isInQueue[root] = 1;
+
+               // bfs
+               while (queueLength > 0) {
+                       // dequeue next value
+                       int node = queue[queuePosition % tableCount];
+                       visited[node] = 1;
+                       subgraphSize++;
+                       isInQueue[node] = 0;
+                       queuePosition += 1;
+                       queueLength -= 1;
+
+                       // for all adjacent edges
+                       for (i = 0; i < tableCount; ++i) {
+                               if (visited[i] || isInQueue[i]) continue;
+                               if (edges[node][i] || edges[i][node]) {
+                                       // ignore direction of edge
+
+                                       // enqueue
+                                       queue[((queueLength + queuePosition) % 
tableCount)] = i;
+                                       queueLength += 1;
+                                       isInQueue[i] = 1;
+                               }
+                       }
+               }
+
+               // store subgraph/group
+               j = 0;
+               // add group
+               groups.count++;
+               groups.groups = realloc(groups.groups, sizeof(Group) * 
groups.count);
+               if (!groups.groups) fprintf(stderr, "ERROR: Couldn't realloc 
memory!\n");
+               groups.groups[groups.count - 1].size = subgraphSize;
+               groups.groups[groups.count - 1].nodes = (Node *) 
malloc(sizeof(Node) * subgraphSize);
+
+               for (i = 0; i < tableCount; ++i) {
+                       if (visited[i] == 1) {
+                               // add to group
+                               groups.groups[groups.count - 1].nodes[j].idx = 
i;
+                               j++;
+                               visited[i] = 2; // node is still marked, but 
can be distinguished from nodes visited in other iterations
+                       }
+               }
+               assert(j == subgraphSize);
+       }
+
+       // transitive closure (Floyd-Warshall-Algorithm)
+       for (k = 0; k < tableCount; ++k) {
+               for (i = 0; i < tableCount; ++i) {
+                       for (j = 0; j < tableCount; ++j) {
+                               if (i == j || i == k || j == k) continue;
+                               if (edges[i][k] && edges[k][j]) edges[i][j] = 1;
+                       }
+               }
+       }
+
+       // select nodes to be shown in the overview schema
+       for (i = 0; i < groups.count; ++i) {
+               int found = 0;
+
+               // count how many group members can be reached from each node
+               for (j = 0; j < groups.groups[i].size; ++j) {
+                       int node = groups.groups[i].nodes[j].idx;
+                       int reachabilityCount = 0;
+                       for (k = 0; k < tableCount; ++k) {
+                               if (edges[node][k]) {
+                                       reachabilityCount++;
+                               }
+                       }
+                       groups.groups[i].nodes[j].reachabilityCount = 
reachabilityCount;
+
+                       if (reachabilityCount == groups.groups[i].size) {
+                               // found a node that links to all other nodes 
in that group --> add to overview schema
+                               (*numNodesActual) += 1;
+                               chosenNodes = realloc(chosenNodes, sizeof(int) 
* (*numNodesActual));
+                               if (!chosenNodes) fprintf(stderr, "ERROR: 
Couldn't realloc memory!\n");
+                               chosenNodes[*numNodesActual - 1] = node;
+                               found = 1;
+                               break;
+                       }
+               }
+               if (!found) {
+                       int node;
+                       char reachability[tableCount];
+                       int reachabilityCount = 0;
+                       int nextNode; // position in the (sorted) list of nodes 
to look at next
+
+                       // greedy
+                       qsort(groups.groups[i].nodes, groups.groups[i].size, 
sizeof(Node), compareOverviewNodes);
+
+                       // take first node (covers the most nodes)
+                       node = groups.groups[i].nodes[0].idx;
+                       (*numNodesActual) += 1;
+                       chosenNodes = realloc(chosenNodes, sizeof(int) * 
(*numNodesActual));
+                       if (!chosenNodes) fprintf(stderr, "ERROR: Couldn't 
realloc memory!\n");
+                       chosenNodes[*numNodesActual - 1] = node;
+                       nextNode = 1;
+                       // store reachability vector
+                       for (j = 0; j < tableCount; ++j) {
+                               if (edges[node][j]) {
+                                       reachability[j] = 1;
+                                       reachabilityCount++;
+                               } else {
+                                       reachability[j] = 0;
+                               }
+                       }
+                       assert (groups.groups[i].nodes[0].reachabilityCount == 
reachabilityCount);
+
+                       // take more nodes
+                       for (j = nextNode; j < tableCount; ++j) {
+                               int node = groups.groups[i].nodes[j].idx;;
+                               if (reachabilityCount == groups.groups[i].size) 
break;
+                               if (reachability[node]) continue;
+
+                               // take this node
+                               (*numNodesActual) += 1;
+                               chosenNodes = realloc(chosenNodes, sizeof(int) 
* (*numNodesActual));
+                               if (!chosenNodes) fprintf(stderr, "ERROR: 
Couldn't realloc memory!\n");
+                               chosenNodes[*numNodesActual - 1] = node;
+                               nextNode = (j + 1);
+
+                               // update reachability vector
+                               for (k = 0; k < tableCount; ++k) {
+                                       if (edges[node][k] && !reachability[k]) 
{
+                                               reachability[k] = 1;
+                                               reachabilityCount++;
+                                       }
+                               }
+                       }
+               }
+       }
+
+       printf("SUBSCHEMA:\n");
+       for (i = 0; i < *numNodesActual; ++i) {
+               str name = table_name[chosenNodes[i]];
+               printf("CS %s\n", name);
+       }
+
+       // statistics
+       for (i = 0; i < tableCount; ++i) {
+               csCount += 1;
+               sumSubjects += table_freq[i];
+               for (j = 0; j < *numNodesActual; ++j) {
+                       if (chosenNodes[j] == i) sumChosenSubjects += 
table_freq[i];
+               }
+       }
+       printf("COVERAGE:\n");
+       printf("%d out of %d (%f %%) using %d out of %d tables (%f %%)\n", 
sumChosenSubjects, sumSubjects, (100.00 * sumChosenSubjects) / sumSubjects, 
*numNodesActual, csCount, (100.00 * *numNodesActual) / csCount);
+       return chosenNodes;
+}
+
 int* retrieval(int root, int numNodesMax, int* numNodesActual, long int* 
table_id, str* table_name, long int* table_freq, int tableCount, long int* 
adjacency_from, long int* adjacency_to, long int* adjacency_freq, int 
adjacencyCount) {
-       if (SUBSCHEMA_HEURISTIC == 4) {
+       if (SUBSCHEMA_HEURISTIC == 5) {
+               (void) numNodesMax;
+               (void) root;
+               return retrievalOverview(numNodesActual, table_id, table_name, 
table_freq, tableCount, adjacency_from, adjacency_to, adjacencyCount);
+       } else if (SUBSCHEMA_HEURISTIC == 4) {
                return retrieval4(root, numNodesMax, numNodesActual, table_id, 
table_name, table_freq, tableCount, adjacency_from, adjacency_to, 
adjacency_freq, adjacencyCount);
        } else if (SUBSCHEMA_HEURISTIC == 3) {
                (void) adjacency_freq;
diff --git a/monetdb5/extras/rdf/rdfretrieval.h 
b/monetdb5/extras/rdf/rdfretrieval.h
--- a/monetdb5/extras/rdf/rdfretrieval.h
+++ b/monetdb5/extras/rdf/rdfretrieval.h
@@ -29,7 +29,22 @@ typedef struct NodeStat {
        int     origWeight;     // = CS frequency
 } NodeStat;
 
-#define SUBSCHEMA_HEURISTIC 4
+typedef struct Node {
+       int     idx;
+       int     reachabilityCount;      // how many members of the group can be 
reached from this node? (self-reachability is included, max value is 'size')
+} Node;
+
+typedef struct Group {
+       int     size;
+       Node*   nodes;
+} Group;
+
+typedef struct Groups {
+       int     count;
+       Group   *groups;
+} Groups;
+
+#define SUBSCHEMA_HEURISTIC 5
 
 int* retrieval(int root, int numNodesMax, int* numNodesActual, long int* 
table_id, str* table_name, long int* table_freq, int tableCount, long int* 
adjacency_from, long int* adjacency_to, long int* adjacency_freq, int 
adjacencyCount);
 
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
http://mail.monetdb.org/mailman/listinfo/checkin-list

Reply via email to