Changeset: 8c25b051ed3a for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=8c25b051ed3a Modified Files: monetdb5/extras/rdf/rdf.h monetdb5/extras/rdf/rdflabels.c monetdb5/extras/rdf/rdflabels.h monetdb5/extras/rdf/rdfschema.c monetdb5/extras/rdf/rdfschema.h Branch: rdf Log Message:
First draft of createFinalLabels, including new UML diagram generation. Diffs (truncated from 973 to 300 lines): diff --git a/monetdb5/extras/rdf/rdf.h b/monetdb5/extras/rdf/rdf.h --- a/monetdb5/extras/rdf/rdf.h +++ b/monetdb5/extras/rdf/rdf.h @@ -103,4 +103,13 @@ typedef enum { #define N_GRAPH_BAT (MAP_LEX+1) +// Final data structure that stores the labels for tables and attributes +typedef struct CSlabel { + str name; // table name + str *hierarchy; // hierarchy "bottom to top" + int hierarchyCount; // number of entries in the hierarchy list + int numProp; // number of properties, copied from freqCSset->items[x].numProp + char **lstProp; // attribute names (same order as in freqCSset->items[x].lstProp) +} CSlabel; + #endif /* _RDF_H_ */ diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -300,6 +300,73 @@ Relation*** initRelationMetadata(int** r return relationMetadata; } +/* Calculate frequency per foreign key relationship. 
*/ +static +Relation*** initRelationMetadata2(int** relationMetadataCount, CSmergeRel* csRelBetweenMergeFreqSet, CSset* freqCSset) { + int i, j, k; + Relation*** relationMetadata; + + int ret; + char* schema = "rdf"; + + TKNZRopen (NULL, &schema); + + relationMetadata = (Relation ***) malloc(sizeof(Relation **) * freqCSset->numCSadded); + if (!relationMetadata) fprintf(stderr, "ERROR: Couldn't malloc memory!\n"); + for (i = 0; i < freqCSset->numCSadded; ++i) { // CS + CS cs; + if (i == -1) continue; // ignore + cs = (CS) freqCSset->items[i]; + relationMetadata[i] = (Relation **) malloc (sizeof(Relation *) * cs.numProp); + if (!relationMetadata[i]) fprintf(stderr, "ERROR: Couldn't malloc memory!\n"); + for (j = 0; j < cs.numProp; ++j) { // propNo in CS order + int sum = 0; + relationMetadataCount[i][j] = 0; + relationMetadata[i][j] = NULL; + for (k = 0; k < csRelBetweenMergeFreqSet[i].numRef; ++k) { // propNo in CSrel + + if (csRelBetweenMergeFreqSet[i].lstPropId[k] == cs.lstProp[j]) { + int toId = csRelBetweenMergeFreqSet[i].lstRefFreqIdx[k]; + if (toId == -1) continue; // ignore + relationMetadataCount[i][j] += 1; + + // alloc/realloc + if (relationMetadataCount[i][j] == 1) { + // alloc + relationMetadata[i][j] = (Relation *) malloc (sizeof(Relation)); + if (!relationMetadata[i][j]) fprintf(stderr, "ERROR: Couldn't malloc memory!\n"); + relationMetadata[i][j][0].to = toId; + relationMetadata[i][j][0].from = i; + relationMetadata[i][j][0].freq = csRelBetweenMergeFreqSet[i].lstCnt[k]; + relationMetadata[i][j][0].percent = -1; + } else { + // realloc + relationMetadata[i][j] = (Relation *) realloc(relationMetadata[i][j], sizeof(Relation) * relationMetadataCount[i][j]); + if (!relationMetadata[i][j]) fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); + relationMetadata[i][j][relationMetadataCount[i][j] - 1].to = toId; + relationMetadata[i][j][relationMetadataCount[i][j] - 1].from = i; + relationMetadata[i][j][relationMetadataCount[i][j] - 1].freq = 
csRelBetweenMergeFreqSet[i].lstCnt[k]; + relationMetadata[i][j][relationMetadataCount[i][j] - 1].percent = -1; + } + } + } + + // get total count of values + for (k = 0; k < relationMetadataCount[i][j]; ++k) { + sum += relationMetadata[i][j][k].freq; + } + // assign percentage values for every value + for (k = 0; k < relationMetadataCount[i][j]; ++k) { + relationMetadata[i][j][k].percent = (int) (100.0 * relationMetadata[i][j][k].freq / sum + 0.5); + } + } + } + + TKNZRclose(&ret); + + return relationMetadata; +} + static IncidentFKs* initLinks(int csCount) { int i; @@ -381,7 +448,7 @@ void escapeURIforSQL(char* s) { /* Create SQL CREATE TABLE statements including foreign keys. */ static -void convertToSQL(CSset *freqCSset, Relation*** relationMetadata, int** relationMetadataCount, Labels* labels, int freqThreshold) { +void convertToSQL(CSset *freqCSset, Relation*** relationMetadata, int** relationMetadataCount, CSlabel* labels, int freqThreshold) { // tokenizer int ret; char* schema = "rdf"; @@ -408,6 +475,9 @@ void convertToSQL(CSset *freqCSset, Rela // create statement for every table for (i = 0; i < freqCSset->numCSadded; ++i) { char *temp; + + if (freqCSset->items[i].parentFreqIdx != -1) continue; // ignore + temp = (char *) malloc(sizeof(char) * (strlen(labels[i].name) + 1)); if (!temp) fprintf(stderr, "ERROR: Couldn't malloc memory!\n"); strcpy(temp, labels[i].name); @@ -434,6 +504,8 @@ void convertToSQL(CSset *freqCSset, Rela // add foreign key columns and add foreign keys for (i = 0; i < freqCSset->numCSadded; ++i) { + if (freqCSset->items[i].parentFreqIdx != -1) continue; // ignore + for (j = 0; j < labels[i].numProp; ++j) { char *temp2; int refCounter = 0; @@ -472,7 +544,7 @@ void convertToSQL(CSset *freqCSset, Rela } static -void createSQLMetadata(CSset* freqCSset, CSrel* csrelSet, int num, Labels* labels, int* csIdFreqIdxMap) { +void createSQLMetadata(CSset* freqCSset, CSmergeRel* csRelBetweenMergeFreqSet, CSlabel* labels) { int **matrix = NULL; // 
matrix[from][to] frequency int i, j, k; FILE *fout; @@ -491,28 +563,27 @@ void createSQLMetadata(CSset* freqCSset, } // set values - for (i = 0; i < num; ++i) { - CS cs; - int csId = csIdFreqIdxMap[i]; - if (csId == -1) continue; // ignore - cs = (CS) freqCSset->items[csId]; + for (i = 0; i < freqCSset->numCSadded; ++i) { + CS cs = (CS) freqCSset->items[i]; + + if (cs.parentFreqIdx != -1) continue; // ignore for (j = 0; j < cs.numProp; ++j) { // propNo in CS order // check foreign key frequency int sum = 0; - for (k = 0; k < csrelSet[i].numRef; ++k) { - if (csrelSet[i].lstPropId[k] == cs.lstProp[j]) { - sum += csrelSet[i].lstCnt[k]; + for (k = 0; k < csRelBetweenMergeFreqSet[i].numRef; ++k) { + if (csRelBetweenMergeFreqSet[i].lstPropId[k] == cs.lstProp[j]) { + sum += csRelBetweenMergeFreqSet[i].lstCnt[k]; } } - for (k = 0; k < csrelSet[i].numRef; ++k) { // propNo in CSrel - if (csrelSet[i].lstPropId[k] == cs.lstProp[j]) { - int toId = csIdFreqIdxMap[ csrelSet[i].lstRefCSoid[k] ]; + for (k = 0; k < csRelBetweenMergeFreqSet[i].numRef; ++k) { // propNo in CSrel + if (csRelBetweenMergeFreqSet[i].lstPropId[k] == cs.lstProp[j]) { + int toId = csRelBetweenMergeFreqSet[i].lstRefFreqIdx[k]; if (toId == -1) continue; // ignore if (i == toId) continue; // ignore self references - if ((int) (100.0 * csrelSet[i].lstCnt[k] / sum + 0.5) < FK_FREQ_THRESHOLD) continue; // foreign key is not frequent enough - matrix[csId][toId] += csrelSet[i].lstCnt[k]; // multiple links from 'i' to 'toId'? add the frequencies + if ((int) (100.0 * csRelBetweenMergeFreqSet[i].lstCnt[k] / sum + 0.5) < FK_FREQ_THRESHOLD) continue; // foreign key is not frequent enough + matrix[i][toId] += csRelBetweenMergeFreqSet[i].lstCnt[k]; // multiple links from 'i' to 'toId'? 
add the frequencies } } } @@ -533,6 +604,9 @@ void createSQLMetadata(CSset* freqCSset, fout = fopen("tableIdFreq.csv", "wt"); for (i = 0; i < freqCSset->numCSadded; ++i) { char *temp; + + if (freqCSset->items[i].parentFreqIdx != -1) continue; // ignore + temp = (char *) malloc(sizeof(char) * (strlen(labels[i].name) + 1)); if (!temp) fprintf(stderr, "ERROR: Couldn't malloc memory!\n"); strcpy(temp, labels[i].name); @@ -552,7 +626,7 @@ void createSQLMetadata(CSset* freqCSset, /* Simple representation of the final labels for tables and attributes. */ static -void printTxt(CSset* freqCSset, Labels* labels, int freqThreshold) { +void printTxt(CSset* freqCSset, CSlabel* labels, int freqThreshold) { FILE *fout; char filename[20], tmp[10]; int i, j; @@ -564,6 +638,8 @@ void printTxt(CSset* freqCSset, Labels* fout = fopen(filename, "wt"); for (i = 0; i < freqCSset->numCSadded; ++i) { + if (freqCSset->items[i].parentFreqIdx != -1) continue; // ignore + fprintf(fout, "%s (CS "BUNFMT"): ", labels[i].name, freqCSset->items[i].csId); for (j = 0; j < labels[i].numProp; ++j) { if (j + 1 < labels[i].numProp) fprintf(fout, "%s, ", labels[i].lstProp[j]); @@ -1138,8 +1214,9 @@ void createOntologyLookupResult(str** re #endif /* Print the dot code to draw a UML-like diagram. 
Call: dot -Tpdf -O <filename> to create <filename>.pdf */ +/* static -void printUML(CSset *freqCSset, int typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** typeAttributesHistogramCount, str** result, int* resultCount, IncidentFKs* links, Labels* labels, Relation*** relationMetadata, int** relationMetadataCount, int freqThreshold) { +void printUML(CSset *freqCSset, int typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** typeAttributesHistogramCount, str** result, int* resultCount, IncidentFKs* links, CSlabel* labels, Relation*** relationMetadata, int** relationMetadataCount, int freqThreshold) { str propStr, tmpStr; int ret; char* schema = "rdf"; @@ -1179,7 +1256,7 @@ void printUML(CSset *freqCSset, int type CS cs = (CS) freqCSset->items[i]; #if SHOW_CANDIDATES - /* DATA SOURCES */ + // DATA SOURCES resultStr = (char *) malloc(sizeof(char) * resultStrSize); if (!resultStr) fprintf(stderr, "ERROR: Couldn't malloc memory!\n"); strcpy(resultStr, "\0"); @@ -1439,11 +1516,204 @@ void printUML(CSset *freqCSset, int type TKNZRclose(&ret); } +*/ + +static +void printUML2(CSset *freqCSset, CSlabel* labels, Relation*** relationMetadata, int** relationMetadataCount, int freqThreshold) { + str propStr, tmpStr; + int ret; + char* schema = "rdf"; + + int i, j, k; + FILE *fout; + char filename[20], tmp[10]; + + int smallest = -1, biggest = -1; + + if (TKNZRopen (NULL, &schema) != MAL_SUCCEED) { + fprintf(stderr, "could not open the tokenizer\n"); + } + + strcpy(filename, "CS2max"); + sprintf(tmp, "%d", freqThreshold); + strcat(filename, tmp); + strcat(filename, ".dot"); + + fout = fopen(filename, "wt"); + + // header + fprintf(fout, "digraph g {\n"); + fprintf(fout, "graph[ratio=\"compress\"];\n"); + fprintf(fout, "node [shape=\"none\"];\n\n"); + + // find biggest and smallest table + for (i = 0; i < freqCSset->numCSadded; ++i) { + CS cs = (CS) freqCSset->items[i]; + if (cs.parentFreqIdx != -1) continue; // ignore + + // first values + 
if (smallest == -1) smallest = i; + if (biggest == -1) biggest = i; + + if (cs.coverage < freqCSset->items[smallest].coverage) smallest = i; + if (cs.coverage > freqCSset->items[biggest].coverage) biggest = i; + } + + for (i = 0; i < freqCSset->numCSadded; ++i) { + int width; + CS cs = (CS) freqCSset->items[i]; + if (cs.parentFreqIdx != -1) continue; // ignore + + // print header + width = (int) ((300 + 300 * (log10(freqCSset->items[i].coverage) - log10(freqCSset->items[smallest].coverage)) / (log10(freqCSset->items[biggest].coverage) - log10(freqCSset->items[smallest].coverage))) + 0.5); // width between 300 and 600 px, using logarithm + fprintf(fout, "\"" BUNFMT "\" [\n", cs.csId); + fprintf(fout, "label = <<TABLE BORDER=\"0\" CELLBORDER=\"1\" CELLSPACING=\"0\">\n"); + fprintf(fout, "<TR><TD WIDTH=\"%d\"><B>%s (#triples: %d)</B></TD></TR>\n", width, labels[i].name, cs.coverage); + + for (j = 0; j < cs.numProp; ++j) { + char *propStrEscaped = NULL; + char *propStrShort = NULL; + str color; + + takeOid(cs.lstProp[j], &tmpStr); + + // copy propStr to propStrEscaped because .dot-PORTs cannot contain colons and quotes + propStr = removeBrackets(tmpStr); + propStrEscaped = (char *) malloc(sizeof(char) * (strlen(propStr) + 1)); + if (!propStrEscaped) fprintf(stderr, "ERROR: Couldn't malloc memory!\n"); + memcpy(propStrEscaped, propStr, (strlen(propStr) + 1)); + escapeURI(propStrEscaped); + getPropNameShort(&propStrShort, propStr); + + // assign color (the more tuples the property occurs in, the darker + if ((1.0 * cs.lstPropSupport[j])/cs.support > 0.8) { + color = "#5555FF"; + } else if ((1.0 * cs.lstPropSupport[j])/cs.support > 0.6) { + color = "#7777FF"; + } else if ((1.0 * cs.lstPropSupport[j])/cs.support > 0.4) { + color = "#9999FF"; _______________________________________________ checkin-list mailing list checkin-list@monetdb.org http://mail.monetdb.org/mailman/listinfo/checkin-list