Changeset: 3db233ec0f01 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=3db233ec0f01 Added Files: monetdb5/extras/rdf/rdfretrieval.c monetdb5/extras/rdf/rdfretrieval.h Modified Files: monetdb5/extras/rdf/Makefile.ag monetdb5/extras/rdf/rdfschema.c Branch: rdf Log Message:
Sub schema retrieval heuristics Three Algorithms to choose a fixed number of connected CS's containing as much data as possible. diffs (truncated from 632 to 300 lines): diff --git a/monetdb5/extras/rdf/Makefile.ag b/monetdb5/extras/rdf/Makefile.ag --- a/monetdb5/extras/rdf/Makefile.ag +++ b/monetdb5/extras/rdf/Makefile.ag @@ -32,7 +32,7 @@ lib__rdf = { #MODULE NOINST #DIR = libdir/monetdb5 - SOURCES = rdf.h rdfschema.h rdflabels.h rdfparser.h rdfparser.c rdfontologyload.h rdfontologyload.c rdf_shredder.c rdfalgebra.c rdfschema.c rdflabels.c + SOURCES = rdf.h rdfschema.h rdflabels.h rdfretrieval.h rdfparser.h rdfparser.c rdfontologyload.h rdfontologyload.c rdf_shredder.c rdfalgebra.c rdfschema.c rdflabels.c rdfretrieval.c #SEP = _ # LIBS = ./hashmap/librdfhash diff --git a/monetdb5/extras/rdf/rdfretrieval.c b/monetdb5/extras/rdf/rdfretrieval.c new file mode 100644 --- /dev/null +++ b/monetdb5/extras/rdf/rdfretrieval.c @@ -0,0 +1,533 @@ +/* + * The contents of this file are subject to the MonetDB Public License + * Version 1.1 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * http://www.monetdb.org/Legal/MonetDBLicense + * + * Software distributed under the License is distributed on an "AS IS" + * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the + * License for the specific language governing rights and limitations + * under the License. + * + * The Original Code is the MonetDB Database System. + * + * The Initial Developer of the Original Code is CWI. + * Portions created by CWI are Copyright (C) 1997-July 2008 CWI. + * Copyright August 2008-2013 MonetDB B.V. + * All Rights Reserved. + */ + +#include "monetdb_config.h" +#include "rdf.h" +#include "rdfretrieval.h" +#include "rdfschema.h" +#include "rdflabels.h" + +static +char** initAdjacencyMatrix(int csCount) { + char **matrix = NULL; // matrix[from][to] + int i, j; + + matrix = (char **) malloc(sizeof(char *) * csCount); + if (!matrix) fprintf(stderr, "ERROR: Couldn't malloc memory!\n"); + + for (i = 0; i < csCount; ++i) { + matrix[i] = (char *) malloc(sizeof(char *) * csCount); + if (!matrix) fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); + + for (j = 0; j < csCount; ++j) { + matrix[i][j] = 0; + } + } + + return matrix; +} + +static +void createAdjacencyMatrix(char** matrix, CSset* freqCSset, CSmergeRel* csRelBetweenMergeFreqSet) { + int i, j, k; + + for (i = 0; i < freqCSset->numCSadded; ++i) { + if (freqCSset->items[i].parentFreqIdx != -1) continue; // ignore + + for (j = 0; j < freqCSset->items[i].numProp; ++j) { // propNo in CS order + // check foreign key frequency + int sum = 0; + for (k = 0; k < csRelBetweenMergeFreqSet[i].numRef; ++k) { + if (csRelBetweenMergeFreqSet[i].lstPropId[k] == freqCSset->items[i].lstProp[j]) { + sum += csRelBetweenMergeFreqSet[i].lstCnt[k]; + } + } + + for (k = 0; k < csRelBetweenMergeFreqSet[i].numRef; ++k) { // propNo in CSrel + if (csRelBetweenMergeFreqSet[i].lstPropId[k] == freqCSset->items[i].lstProp[j]) { + int to = csRelBetweenMergeFreqSet[i].lstRefFreqIdx[k]; + if (i == to) continue; // ignore self references + if ((int) (100.0 * csRelBetweenMergeFreqSet[i].lstCnt[k] / sum + 0.5) < FK_FREQ_THRESHOLD) continue; // foreign key is not frequent enough + matrix[i][to] = 1; + } + } + } + } +} + +static +NodeStat* initNodeStats(CSset* freqCSset) { + NodeStat* nodeStats = NULL; + int i; + + nodeStats = (NodeStat *) malloc(sizeof(NodeStat) * freqCSset->numCSadded); + if (!nodeStats) fprintf(stderr, "ERROR: Couldn't malloc memory!\n"); + + for (i = 0; i < freqCSset->numCSadded; ++i) { + if (freqCSset->items[i].parentFreqIdx != -1) continue; // ignore + nodeStats[i].origWeight = freqCSset->items[i].support; + nodeStats[i].weight = freqCSset->items[i].support; // weight = origWeight + nodeStats[i].steps = -1; + nodeStats[i].predecessor = -1; + } + + return nodeStats; +} + +static +NodeStat* initNodeStats23(CSset* freqCSset) { + NodeStat* nodeStats = NULL; + int i; + + nodeStats = (NodeStat *) malloc(sizeof(NodeStat) * freqCSset->numCSadded); + if (!nodeStats) fprintf(stderr, "ERROR: Couldn't malloc memory!\n"); + + for (i = 0; i < freqCSset->numCSadded; ++i) { + if (freqCSset->items[i].parentFreqIdx != -1) continue; // ignore + nodeStats[i].origWeight = freqCSset->items[i].support; + nodeStats[i].weight = 0; + nodeStats[i].steps = -1; // not used + nodeStats[i].predecessor = 0; // not used + } + + return nodeStats; +} + +static +void bfs1(int root, CSset* freqCSset, char** adjacencyMatrix, int* queue, int* visited, int* isInQueue, int* queuePosition, int* queueLength, NodeStat* nodeStats) { + int i; + + for (i = 0; i < freqCSset->numCSadded; ++i) { + if (adjacencyMatrix[root][i]) { + if (nodeStats[i].steps == -1) { + // no previous path to this node + nodeStats[i].weight = nodeStats[root].weight + nodeStats[i].origWeight; + nodeStats[i].steps = nodeStats[root].steps + 1; + nodeStats[i].predecessor = root; + } else { + // previous path to this node + // test if values should be updated + + // cycle detection + int cycle = 0; + int pathId = root; + while (pathId != -1) { + if (pathId == i) { + // cycle found + cycle = 1; + break; + } + pathId = nodeStats[pathId].predecessor; + } + if (cycle) continue; + + if (nodeStats[i].predecessor == root) { + // path to 'i' used 'root', has to be updated if the weight changes + if (((float) (nodeStats[root].weight + nodeStats[i].origWeight)) / (nodeStats[root].steps + 1) != ((float) nodeStats[i].weight) / nodeStats[i].steps) { + // set new weight and path + nodeStats[i].weight = nodeStats[root].weight + nodeStats[i].origWeight; + nodeStats[i].steps = nodeStats[root].steps + 1; + nodeStats[i].predecessor = root; + // update values for subsequent nodes + visited[i] = 0; + } + } else if (((float) (nodeStats[root].weight + nodeStats[i].origWeight)) / (nodeStats[root].steps + 1) > ((float) nodeStats[i].weight) / nodeStats[i].steps) { + // improved weight when accessing node 'i' via 'root' + // set new weight and path + nodeStats[i].weight = nodeStats[root].weight + nodeStats[i].origWeight; + nodeStats[i].steps = nodeStats[root].steps + 1; + nodeStats[i].predecessor = root; + // update values for subsequent nodes + visited[i] = 0; + } + } + + if (!visited[i] && !isInQueue[i]) { + // add to queue + queue[((*queueLength + *queuePosition) % freqCSset->numCSadded)] = i; + *queueLength += 1; + isInQueue[i] = 1; + } + + } + } + + if (*queueLength > 0) { + visited[queue[(*queuePosition % freqCSset->numCSadded)]] = 1; + isInQueue[queue[(*queuePosition % freqCSset->numCSadded)]] = 0; + *queuePosition += 1; + *queueLength -= 1; + bfs1(queue[((*queuePosition + freqCSset->numCSadded - 1) % freqCSset->numCSadded)], freqCSset, adjacencyMatrix, queue, visited, isInQueue, queuePosition, queueLength, nodeStats); + } +} + +static +void addNode1(char** adjacencyMatrix, NodeStat* nodeStats, CSset* freqCSset, int root, char initial) { + int queue[freqCSset->numCSadded]; // cyclic array + int visited[freqCSset->numCSadded]; + int isInQueue[freqCSset->numCSadded]; + int queuePosition; // next element in queue to view at + int queueLength; + int pathId, pathIdTmp; + int i; + + // init + for (i = 0; i < freqCSset->numCSadded; ++i) { + queue[i] = -1; + visited[i] = 0; + isInQueue[i] = 0; + } + visited[root] = 1; + queuePosition = 0; + queueLength = 0; + + if (initial) { + // mark root as a "chosen node" + nodeStats[root].steps = 0; + nodeStats[root].predecessor = -1; + nodeStats[root].weight = 0; + } else { + // add nodes on path to queue + int steps = nodeStats[root].steps; + int i = 0; + + pathId = root; + while (pathId != -1) { + ++i; + pathIdTmp = nodeStats[pathId].predecessor; // save predecessor + + // mark node as a "chosen node" + nodeStats[pathId].steps = 0; + nodeStats[pathId].predecessor = -1; + nodeStats[pathId].weight = 0; + + if (nodeStats[pathIdTmp].steps == 0) break; // found the end of the path of new nodes + queue[queueLength] = pathIdTmp; + queueLength += 1; + isInQueue[pathIdTmp] = 1; + + pathId = pathIdTmp; // move to predecessor + } + assert(steps == i); + } + + bfs1(root, freqCSset, adjacencyMatrix, queue, visited, isInQueue, &queuePosition, &queueLength, nodeStats); +} + +int* retrieval1(int root, int numNodesMax, int* numNodesActual, CSset* freqCSset, CSmergeRel* csRelBetweenMergeFreqSet) { + char **adjacencyMatrix = NULL; + NodeStat *nodeStats = NULL; + int numNodes; + int *chosenNodes = NULL; + int i, j; + int sumSubjects = 0; + int csCount = 0; + int sumChosenSubjects = 0; + + if (numNodesMax < 1) fprintf(stderr, "ERROR: numNodesMax < 1!\n"); + + chosenNodes = (int *) malloc(sizeof(int) * numNodesMax); + if (!chosenNodes) fprintf(stderr, "ERROR: Couldn't malloc memory!\n"); + + adjacencyMatrix = initAdjacencyMatrix(freqCSset->numCSadded); + createAdjacencyMatrix(adjacencyMatrix, freqCSset, csRelBetweenMergeFreqSet); + nodeStats = initNodeStats(freqCSset); + numNodes = 1; + + // add root node + addNode1(adjacencyMatrix, nodeStats, freqCSset, root, 1); + + // add nodes + while (numNodes < numNodesMax) { + // get top node (highest fraction (weight/steps)) + int top = -1; + for (i = 0; i < freqCSset->numCSadded; ++i) { + int topWeight, testWeight; + if (freqCSset->items[i].parentFreqIdx != -1) continue; // ignore + if (nodeStats[i].steps == -1) continue; // non-reachable + if (nodeStats[i].steps == 0) continue; // already chosen + if (numNodes + nodeStats[i].steps > numNodesMax) continue; // path too long + if (top == -1) { + top = i; // set first value + continue; + } + topWeight = ((float) nodeStats[top].weight) / nodeStats[top].steps; + testWeight = ((float) nodeStats[i].weight) / nodeStats[i].steps; + if (testWeight > topWeight) { + top = i; + } + } + if (top == -1) break; // not enough nodes found + numNodes += nodeStats[top].steps; + addNode1(adjacencyMatrix, nodeStats, freqCSset, top, 0); // add node(s) + } + + // store list of chosen nodes + printf("SUBSCHEMA:\n"); + j = 0; // counter for chosenNodes[] _______________________________________________ checkin-list mailing list checkin-list@monetdb.org http://mail.monetdb.org/mailman/listinfo/checkin-list