Changeset: 182a90c795fc for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=182a90c795fc Modified Files: monetdb5/extras/rdf/rdflabels.c monetdb5/extras/rdf/rdfretrieval.c monetdb5/extras/rdf/rdfretrieval.h sql/backends/monet5/sql.mx Branch: rdf Log Message:
Heuristic to maximize the edge-weights Maximize the number of relationships between tuples in the selected subschema diffs (273 lines): diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -414,16 +414,16 @@ void convertToSQL(CSset *freqCSset, Rela static void createSQLMetadata(CSset* freqCSset, CSmergeRel* csRelBetweenMergeFreqSet, Labels* labels) { - char **matrix = NULL; // matrix[from][to] + int **matrix = NULL; // matrix[from][to] frequency int i, j, k; FILE *fout; // init - matrix = (char **) malloc(sizeof(char *) * freqCSset->numCSadded); + matrix = (int **) malloc(sizeof(int *) * freqCSset->numCSadded); if (!matrix) fprintf(stderr, "ERROR: Couldn't malloc memory!\n"); for (i = 0; i < freqCSset->numCSadded; ++i) { - matrix[i] = (char *) malloc(sizeof(char *) * freqCSset->numCSadded); + matrix[i] = (int *) malloc(sizeof(char *) * freqCSset->numCSadded); if (!matrix) fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); for (j = 0; j < freqCSset->numCSadded; ++j) { @@ -449,7 +449,7 @@ void createSQLMetadata(CSset* freqCSset, int to = csRelBetweenMergeFreqSet[i].lstRefFreqIdx[k]; if (i == to) continue; // ignore self references if ((int) (100.0 * csRelBetweenMergeFreqSet[i].lstCnt[k] / sum + 0.5) < FK_FREQ_THRESHOLD) continue; // foreign key is not frequent enough - matrix[i][to] = 1; + matrix[i][to] += csRelBetweenMergeFreqSet[i].lstCnt[k]; // multiple links from 'i' to 'to'? add the frequencies } } } @@ -460,7 +460,7 @@ void createSQLMetadata(CSset* freqCSset, for (i = 0; i < freqCSset->numCSadded; ++i) { for (j = 0; j < freqCSset->numCSadded; ++j) { if (matrix[i][j]) { - fprintf(fout, "\"%d\",\"%d\"\n",i,j); + fprintf(fout, "\"%d\",\"%d\",\"%d\"\n", i, j, matrix[i][j]); } } } @@ -480,7 +480,7 @@ void createSQLMetadata(CSset* freqCSset, fout = fopen("CSmetadata.sql", "wt"); fprintf(fout, "CREATE TABLE table_id_freq (id VARCHAR(10), name VARCHAR(100), frequency VARCHAR(10));\n"); - fprintf(fout, "CREATE TABLE adjacency_list (from_id VARCHAR(10), to_id VARCHAR(10));\n"); + fprintf(fout, "CREATE TABLE adjacency_list (from_id VARCHAR(10), to_id VARCHAR(10), frequency VARCHAR(10));\n"); fprintf(fout, "COPY INTO table_id_freq from '/export/scratch2/linnea/dbfarm/test/tableIdFreq.csv' USING DELIMITERS ',','\\n','\"';\n"); fprintf(fout, "COPY INTO adjacency_list from '/export/scratch2/linnea/dbfarm/test/adjacencyList.csv' USING DELIMITERS ',','\\n','\"';"); fclose(fout); diff --git a/monetdb5/extras/rdf/rdfretrieval.c b/monetdb5/extras/rdf/rdfretrieval.c --- a/monetdb5/extras/rdf/rdfretrieval.c +++ b/monetdb5/extras/rdf/rdfretrieval.c @@ -33,6 +33,15 @@ int edgeExists(long int from, long int t } static +int getTableIndex(long int id, long int* table_id, int tableCount) { + int i; + for (i = 0; i < tableCount; ++i) { + if (table_id[i] == id) return i; + } + return -1; +} + +static NodeStat* initNodeStats1(long int* table_freq, int tableCount) { NodeStat* nodeStats = NULL; int i; @@ -466,12 +475,95 @@ int* retrieval3(int root, int numNodesMa return chosenNodes; } -int* retrieval(int root, int numNodesMax, int* numNodesActual, long int* table_id, str* table_name, long int* table_freq, int tableCount, long int* adjacency_from, long int* adjacency_to, int adjacencyCount) { - if (SUBSCHEMA_HEURISTIC == 3) { +static +int* retrieval4(int root, int numNodesMax, int* numNodesActual, long int* table_id, str* table_name, long int* table_freq, int tableCount, long int* adjacency_from, long int* adjacency_to, long int* adjacency_freq, int adjacencyCount) { + int numNodes; + int *chosenNodes = NULL; + int i, j; + int sumSubjects = 0; + int csCount = 0; + int sumChosenSubjects = 0; + + if (numNodesMax < 1) fprintf(stderr, "ERROR: numNodesMax < 1!\n"); + + chosenNodes = (int *) malloc(sizeof(int) * numNodesMax); + if (!chosenNodes) fprintf(stderr, "ERROR: Couldn't malloc memory!\n"); + + numNodes = 0; + + // add root node + chosenNodes[numNodes] = root; + numNodes += 1; + + // add nodes + while (numNodes < numNodesMax) { + int bestNextEdge = -1; + for (i = 0; i < adjacencyCount; ++i) { + char foundFrom = 0; + char foundTo = 0; + for (j = 0; j < numNodes; ++j) { + if (chosenNodes[j] == getTableIndex(adjacency_to[i], table_id, tableCount)) { + foundTo = 1; + break; + } + } + for (j = 0; j < numNodes; ++j) { + if (chosenNodes[j] == getTableIndex(adjacency_from[i], table_id, tableCount)) { + foundFrom = 1; + break; + } + } + if (foundFrom && !foundTo) { + // set or update + if (bestNextEdge == -1) { + // first edge + bestNextEdge = i; + } else { + if (adjacency_freq[i] > adjacency_freq[bestNextEdge]) bestNextEdge = i; + } + } + } + if (bestNextEdge == -1) { + // no more edges + break; + } else { + chosenNodes[numNodes] = getTableIndex(adjacency_to[bestNextEdge], table_id, tableCount); + numNodes += 1; + } + } + + printf("SUBSCHEMA:\n"); + for (i = 0; i < numNodes; ++i) { + str name = table_name[chosenNodes[i]]; + printf("CS %s\n", name); + } + + // statistics + for (i = 0; i < tableCount; ++i) { + csCount += 1; + sumSubjects += table_freq[i]; + for (j = 0; j < numNodes; ++j) { + if (chosenNodes[j] == i) sumChosenSubjects += table_freq[i]; + } + } + printf("COVERAGE:\n"); + printf("%d out of %d (%f %%) using %d out of %d tables (%f %%)\n", sumChosenSubjects, sumSubjects, (100.00 * sumChosenSubjects) / sumSubjects, numNodes, csCount, (100.00 * numNodes) / csCount); + + *numNodesActual = numNodes; + return chosenNodes; +} + +int* retrieval(int root, int numNodesMax, int* numNodesActual, long int* table_id, str* table_name, long int* table_freq, int tableCount, long int* adjacency_from, long int* adjacency_to, long int* adjacency_freq, int adjacencyCount) { + if (SUBSCHEMA_HEURISTIC == 4) { + return retrieval4(root, numNodesMax, numNodesActual, table_id, table_name, table_freq, tableCount, adjacency_from, adjacency_to, adjacency_freq, adjacencyCount); + } else if (SUBSCHEMA_HEURISTIC == 3) { + (void) adjacency_freq; return retrieval3(root, numNodesMax, numNodesActual, table_id, table_name, table_freq, tableCount, adjacency_from, adjacency_to, adjacencyCount); } else if (SUBSCHEMA_HEURISTIC == 2) { + (void) adjacency_freq; return retrieval2(root, numNodesMax, numNodesActual, table_id, table_name, table_freq, tableCount, adjacency_from, adjacency_to, adjacencyCount); } // SUBSCHEMA_HEURISTIC == 1 or other value + (void) adjacency_freq; return retrieval1(root, numNodesMax, numNodesActual, table_id, table_name, table_freq, tableCount, adjacency_from, adjacency_to, adjacencyCount); }; diff --git a/monetdb5/extras/rdf/rdfretrieval.h b/monetdb5/extras/rdf/rdfretrieval.h --- a/monetdb5/extras/rdf/rdfretrieval.h +++ b/monetdb5/extras/rdf/rdfretrieval.h @@ -29,8 +29,8 @@ typedef struct NodeStat { int origWeight; // = CS frequency } NodeStat; -#define SUBSCHEMA_HEURISTIC 1 +#define SUBSCHEMA_HEURISTIC 4 -int* retrieval(int root, int numNodesMax, int* numNodesActual, long int* table_id, str* table_name, long int* table_freq, int tableCount, long int* adjacency_from, long int* adjacency_to, int adjacencyCount); +int* retrieval(int root, int numNodesMax, int* numNodesActual, long int* table_id, str* table_name, long int* table_freq, int tableCount, long int* adjacency_from, long int* adjacency_to, long int* adjacency_freq, int adjacencyCount); #endif /* _RDFRETRIEVAL_H_ */ diff --git a/sql/backends/monet5/sql.mx b/sql/backends/monet5/sql.mx --- a/sql/backends/monet5/sql.mx +++ b/sql/backends/monet5/sql.mx @@ -7778,13 +7778,13 @@ SQLrdfRetrieveSubschema(Client cntxt, Ma int i; BAT *table_id_freq_id, *table_id_freq_name, *table_id_freq_freq; - BAT *adjacency_list_from, *adjacency_list_to; + BAT *adjacency_list_from, *adjacency_list_to, *adjacency_list_freq; BATiter table_id_freq_id_i, table_id_freq_name_i, table_id_freq_freq_i; - BATiter adjacency_list_from_i, adjacency_list_to_i; + BATiter adjacency_list_from_i, adjacency_list_to_i, adjacency_list_freq_i; BUN p, q; BUN p2, q2; BUN bun, bun2, bun3; - BUN bun4, bun5; + BUN bun4, bun5, bun6; int tableCount = 0; long int *table_id = NULL; @@ -7792,6 +7792,7 @@ SQLrdfRetrieveSubschema(Client cntxt, Ma long int *table_freq = NULL; long int *adjacency_from = NULL; long int *adjacency_to = NULL; + long int *adjacency_freq = NULL; int adjacencyCount = 0; int root; @@ -7810,18 +7811,21 @@ SQLrdfRetrieveSubschema(Client cntxt, Ma table_id_freq_freq = mvc_bind(m, "sys", "table_id_freq", "frequency", 0); adjacency_list_from = mvc_bind(m, "sys", "adjacency_list", "from_id", 0); adjacency_list_to = mvc_bind(m, "sys", "adjacency_list", "to_id", 0); + adjacency_list_freq = mvc_bind(m, "sys", "adjacency_list", "frequency", 0); table_id_freq_id_i = bat_iterator(table_id_freq_id); table_id_freq_name_i = bat_iterator(table_id_freq_name); table_id_freq_freq_i = bat_iterator(table_id_freq_freq); adjacency_list_from_i = bat_iterator(adjacency_list_from); adjacency_list_to_i = bat_iterator(adjacency_list_to); + adjacency_list_freq_i = bat_iterator(adjacency_list_freq); bun = BUNfirst(table_id_freq_id); bun2 = BUNfirst(table_id_freq_name); bun3 = BUNfirst(table_id_freq_freq); bun4 = BUNfirst(adjacency_list_from); bun5 = BUNfirst(adjacency_list_to); + bun6 = BUNfirst(adjacency_list_freq); // store data BATloop(table_id_freq_id, p, q) { @@ -7844,13 +7848,16 @@ SQLrdfRetrieveSubschema(Client cntxt, Ma BATloop(adjacency_list_from, p2, q2) { str from = (str) BUNtail(adjacency_list_from_i, bun4 + adjacencyCount); str to = (str) BUNtail(adjacency_list_to_i, bun5 + adjacencyCount); + str freq = (str) BUNtail(adjacency_list_freq_i, bun6 + adjacencyCount); adjacency_from = realloc(adjacency_from, sizeof(long int) * (adjacencyCount + 1)); adjacency_to = realloc(adjacency_to, sizeof(long int) * (adjacencyCount + 1)); - if (!adjacency_from || !adjacency_to) fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); + adjacency_freq = realloc(adjacency_freq, sizeof(long int) * (adjacencyCount + 1)); + if (!adjacency_from || !adjacency_to || !adjacency_freq) fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); adjacency_from[adjacencyCount] = strtol(from, NULL, 10); adjacency_to[adjacencyCount] = strtol(to, NULL, 10); + adjacency_freq[adjacencyCount] = strtol(freq, NULL, 10); adjacencyCount += 1; } @@ -7860,7 +7867,7 @@ SQLrdfRetrieveSubschema(Client cntxt, Ma // call retrieval function countActual = 0; - nodes = retrieval(root, *count, &countActual, table_id, table_name, table_freq, tableCount, adjacency_from, adjacency_to, adjacencyCount); + nodes = retrieval(root, *count, &countActual, table_id, table_name, table_freq, tableCount, adjacency_from, adjacency_to, adjacency_freq, adjacencyCount); // create schema name = "s123"; // TODO create schema name @@ -7893,12 +7900,14 @@ SQLrdfRetrieveSubschema(Client cntxt, Ma BBPreclaim(table_id_freq_freq); BBPreclaim(adjacency_list_from); BBPreclaim(adjacency_list_to); + BBPreclaim(adjacency_list_freq); free(table_id); free(table_name); free(table_freq); free(adjacency_from); free(adjacency_to); + free(adjacency_freq); return MAL_SUCCEED; } _______________________________________________ checkin-list mailing list checkin-list@monetdb.org http://mail.monetdb.org/mailman/listinfo/checkin-list