Changeset: 8c25b051ed3a for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=8c25b051ed3a
Modified Files:
        monetdb5/extras/rdf/rdf.h
        monetdb5/extras/rdf/rdflabels.c
        monetdb5/extras/rdf/rdflabels.h
        monetdb5/extras/rdf/rdfschema.c
        monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

First draft of createFinalLabels, including new UML diagram generation


diffs (truncated from 973 to 300 lines):

diff --git a/monetdb5/extras/rdf/rdf.h b/monetdb5/extras/rdf/rdf.h
--- a/monetdb5/extras/rdf/rdf.h
+++ b/monetdb5/extras/rdf/rdf.h
@@ -103,4 +103,13 @@ typedef enum {
 
 #define N_GRAPH_BAT (MAP_LEX+1)
 
+/*
+ * Final data structure that stores the labels for tables and attributes.
+ * Field comments are placed above each field so that long descriptions are
+ * not wrapped out of the comment by line-length limits.
+ */
+typedef struct CSlabel {
+       /* table name */
+       str             name;
+       /* hierarchy "bottom to top" */
+       str             *hierarchy;
+       /* number of entries in the hierarchy list */
+       int             hierarchyCount;
+       /* number of properties, copied from freqCSset->items[x].numProp */
+       int             numProp;
+       /* attribute names (same order as in freqCSset->items[x].lstProp) */
+       char            **lstProp;
+} CSlabel;
+
 #endif /* _RDF_H_ */
diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -300,6 +300,73 @@ Relation*** initRelationMetadata(int** r
        return relationMetadata;
 }
 
+/*
+ * Calculate frequency per foreign key relationship.
+ *
+ * For every CS i in freqCSset and every property j of that CS, collects the
+ * entries of csRelBetweenMergeFreqSet[i] whose property id equals
+ * cs.lstProp[j] and whose referenced index (lstRefFreqIdx) is not -1.
+ *
+ * relationMetadataCount[i][j] is overwritten with the number of relations
+ * collected; this function does not allocate it, so the caller has to provide
+ * at least numProp entries per CS.
+ *
+ * Each collected relation stores the source index (from = i), the referenced
+ * index (to), the frequency (freq = lstCnt[k]) and the rounded share of that
+ * frequency in the property's total (percent). percent is set to 0 when the
+ * total frequency of the property is 0.
+ *
+ * Returns the newly allocated relationMetadata[i][j][k] structure. On an
+ * allocation failure an error is printed, everything built so far is freed,
+ * the counts of the visited rows are reset to 0 and NULL is returned.
+ */
+static
+Relation*** initRelationMetadata2(int** relationMetadataCount, CSmergeRel* csRelBetweenMergeFreqSet, CSset* freqCSset) {
+       int             i, j, k;
+       int             ret;
+       int             failed = 0;
+       char*           schema = "rdf";
+       Relation***     relationMetadata;
+
+       if (TKNZRopen (NULL, &schema) != MAL_SUCCEED) {
+               fprintf(stderr, "could not open the tokenizer\n");
+       }
+
+       // calloc leaves untouched rows NULL, which keeps the cleanup below safe;
+       // at least one element is requested so that NULL always means failure
+       relationMetadata = (Relation ***) calloc(freqCSset->numCSadded > 0 ? freqCSset->numCSadded : 1, sizeof(Relation **));
+       if (!relationMetadata) {
+               fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+               TKNZRclose(&ret);
+               return NULL;
+       }
+
+       for (i = 0; i < freqCSset->numCSadded && !failed; ++i) { // CS
+               CS cs = (CS) freqCSset->items[i];
+
+               relationMetadata[i] = (Relation **) calloc(cs.numProp > 0 ? cs.numProp : 1, sizeof(Relation *));
+               if (!relationMetadata[i]) {
+                       fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+                       failed = 1;
+                       break;
+               }
+
+               for (j = 0; j < cs.numProp; ++j) { // propNo in CS order
+                       int sum = 0;    // total frequency of all relations of this property
+                       int count = 0;  // number of relations collected so far
+
+                       relationMetadataCount[i][j] = 0;
+
+                       for (k = 0; k < csRelBetweenMergeFreqSet[i].numRef; ++k) { // propNo in CSrel
+                               Relation *grown;
+                               int toId;
+
+                               if (csRelBetweenMergeFreqSet[i].lstPropId[k] != cs.lstProp[j]) continue;
+                               toId = csRelBetweenMergeFreqSet[i].lstRefFreqIdx[k];
+                               if (toId == -1) continue; // ignore
+
+                               // realloc(NULL, ...) behaves like malloc, so one path serves
+                               // both the first and all later entries; the old pointer is
+                               // kept until the call is known to have succeeded
+                               grown = (Relation *) realloc(relationMetadata[i][j], sizeof(Relation) * (count + 1));
+                               if (!grown) {
+                                       fprintf(stderr, "ERROR: Couldn't realloc memory!\n");
+                                       failed = 1;
+                                       break;
+                               }
+                               relationMetadata[i][j] = grown;
+
+                               grown[count].to = toId;
+                               grown[count].from = i;
+                               grown[count].freq = csRelBetweenMergeFreqSet[i].lstCnt[k];
+                               grown[count].percent = -1;
+                               sum += grown[count].freq;
+
+                               count += 1;
+                               relationMetadataCount[i][j] = count;
+                       }
+                       if (failed) break;
+
+                       // assign percentage values for every value
+                       for (k = 0; k < count; ++k) {
+                               Relation *rel = &relationMetadata[i][j][k];
+                               // guard against 0/0 when every frequency is 0
+                               rel->percent = (sum != 0) ? (int) (100.0 * rel->freq / sum + 0.5) : 0;
+                       }
+               }
+       }
+
+       if (failed) {
+               // release everything built so far; rows never reached are still NULL
+               for (i = 0; i < freqCSset->numCSadded; ++i) {
+                       if (!relationMetadata[i]) continue;
+                       for (j = 0; j < freqCSset->items[i].numProp; ++j) {
+                               free(relationMetadata[i][j]);
+                               relationMetadataCount[i][j] = 0;
+                       }
+                       free(relationMetadata[i]);
+               }
+               free(relationMetadata);
+               relationMetadata = NULL;
+       }
+
+       TKNZRclose(&ret);
+
+       return relationMetadata;
+}
+
 static
 IncidentFKs* initLinks(int csCount) {
        int             i;
@@ -381,7 +448,7 @@ void escapeURIforSQL(char* s) {
 
 /* Create SQL CREATE TABLE statements including foreign keys. */
 static
-void convertToSQL(CSset *freqCSset, Relation*** relationMetadata, int** 
relationMetadataCount, Labels* labels, int freqThreshold) {
+void convertToSQL(CSset *freqCSset, Relation*** relationMetadata, int** 
relationMetadataCount, CSlabel* labels, int freqThreshold) {
        // tokenizer
        int             ret;
        char*           schema = "rdf";
@@ -408,6 +475,9 @@ void convertToSQL(CSset *freqCSset, Rela
        // create statement for every table
        for (i = 0; i < freqCSset->numCSadded; ++i) {
                char *temp;
+
+               if (freqCSset->items[i].parentFreqIdx != -1) continue; // ignore
+
                temp = (char *) malloc(sizeof(char) * (strlen(labels[i].name) + 
1));
                if (!temp) fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
                strcpy(temp, labels[i].name);
@@ -434,6 +504,8 @@ void convertToSQL(CSset *freqCSset, Rela
 
        // add foreign key columns and add foreign keys
        for (i = 0; i < freqCSset->numCSadded; ++i) {
+               if (freqCSset->items[i].parentFreqIdx != -1) continue; // ignore
+
                for (j = 0; j < labels[i].numProp; ++j) {
                        char *temp2;
                        int refCounter = 0;
@@ -472,7 +544,7 @@ void convertToSQL(CSset *freqCSset, Rela
 }
 
 static
-void createSQLMetadata(CSset* freqCSset, CSrel* csrelSet, int num, Labels* 
labels, int* csIdFreqIdxMap) {
+void createSQLMetadata(CSset* freqCSset, CSmergeRel* csRelBetweenMergeFreqSet, 
CSlabel* labels) {
        int     **matrix = NULL; // matrix[from][to] frequency
        int     i, j, k;
        FILE    *fout;
@@ -491,28 +563,27 @@ void createSQLMetadata(CSset* freqCSset,
        }
 
        // set values
-       for (i = 0; i < num; ++i) {
-               CS cs;
-               int csId = csIdFreqIdxMap[i];
-               if (csId == -1) continue; // ignore
-               cs = (CS) freqCSset->items[csId];
+       for (i = 0; i < freqCSset->numCSadded; ++i) {
+               CS cs = (CS) freqCSset->items[i];
+
+               if (cs.parentFreqIdx != -1) continue; // ignore
 
                for (j = 0; j < cs.numProp; ++j) { // propNo in CS order
                        // check foreign key frequency
                        int sum = 0;
-                       for (k = 0; k < csrelSet[i].numRef; ++k) {
-                               if (csrelSet[i].lstPropId[k] == cs.lstProp[j]) {
-                                       sum += csrelSet[i].lstCnt[k];
+                       for (k = 0; k < csRelBetweenMergeFreqSet[i].numRef; 
++k) {
+                               if (csRelBetweenMergeFreqSet[i].lstPropId[k] == 
cs.lstProp[j]) {
+                                       sum += 
csRelBetweenMergeFreqSet[i].lstCnt[k];
                                }
                        }
 
-                       for (k = 0; k < csrelSet[i].numRef; ++k) { // propNo in 
CSrel
-                               if (csrelSet[i].lstPropId[k] == cs.lstProp[j]) {
-                                       int toId = csIdFreqIdxMap[ 
csrelSet[i].lstRefCSoid[k] ];
+                       for (k = 0; k < csRelBetweenMergeFreqSet[i].numRef; 
++k) { // propNo in CSrel
+                               if (csRelBetweenMergeFreqSet[i].lstPropId[k] == 
cs.lstProp[j]) {
+                                       int toId = 
csRelBetweenMergeFreqSet[i].lstRefFreqIdx[k];
                                        if (toId == -1) continue; // ignore
                                        if (i == toId) continue; // ignore self 
references
-                                       if ((int) (100.0 * 
csrelSet[i].lstCnt[k] / sum + 0.5) < FK_FREQ_THRESHOLD) continue; // foreign 
key is not frequent enough
-                                       matrix[csId][toId] += 
csrelSet[i].lstCnt[k]; // multiple links from 'i' to 'toId'? add the frequencies
+                                       if ((int) (100.0 * 
csRelBetweenMergeFreqSet[i].lstCnt[k] / sum + 0.5) < FK_FREQ_THRESHOLD) 
continue; // foreign key is not frequent enough
+                                       matrix[i][toId] += 
csRelBetweenMergeFreqSet[i].lstCnt[k]; // multiple links from 'i' to 'toId'? 
add the frequencies
                                }
                        }
                }
@@ -533,6 +604,9 @@ void createSQLMetadata(CSset* freqCSset,
        fout = fopen("tableIdFreq.csv", "wt");
        for (i = 0; i < freqCSset->numCSadded; ++i) {
                char *temp;
+
+               if (freqCSset->items[i].parentFreqIdx != -1) continue; // ignore
+
                temp = (char *) malloc(sizeof(char) * (strlen(labels[i].name) + 
1));
                if (!temp) fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
                strcpy(temp, labels[i].name);
@@ -552,7 +626,7 @@ void createSQLMetadata(CSset* freqCSset,
 
 /* Simple representation of the final labels for tables and attributes. */
 static
-void printTxt(CSset* freqCSset, Labels* labels, int freqThreshold) {
+void printTxt(CSset* freqCSset, CSlabel* labels, int freqThreshold) {
        FILE            *fout;
        char            filename[20], tmp[10];
        int             i, j;
@@ -564,6 +638,8 @@ void printTxt(CSset* freqCSset, Labels* 
 
        fout = fopen(filename, "wt");
        for (i = 0; i < freqCSset->numCSadded; ++i) {
+               if (freqCSset->items[i].parentFreqIdx != -1) continue; // ignore
+
                fprintf(fout, "%s (CS "BUNFMT"): ", labels[i].name, 
freqCSset->items[i].csId);
                for (j = 0; j < labels[i].numProp; ++j) {
                        if (j + 1 < labels[i].numProp) fprintf(fout, "%s, ", 
labels[i].lstProp[j]);
@@ -1138,8 +1214,9 @@ void createOntologyLookupResult(str** re
 #endif
 
 /* Print the dot code to draw an UML-like diagram. Call:   dot -Tpdf -O 
<filename>   to create <filename>.pdf */
+/*
 static
-void printUML(CSset *freqCSset, int typeAttributesCount, TypeAttributesFreq*** 
typeAttributesHistogram, int** typeAttributesHistogramCount, str** result, int* 
resultCount, IncidentFKs* links, Labels* labels, Relation*** relationMetadata, 
int** relationMetadataCount, int freqThreshold) {
+void printUML(CSset *freqCSset, int typeAttributesCount, TypeAttributesFreq*** 
typeAttributesHistogram, int** typeAttributesHistogramCount, str** result, int* 
resultCount, IncidentFKs* links, CSlabel* labels, Relation*** relationMetadata, 
int** relationMetadataCount, int freqThreshold) {
        str             propStr, tmpStr;
        int             ret;
        char*           schema = "rdf";
@@ -1179,7 +1256,7 @@ void printUML(CSset *freqCSset, int type
                CS cs = (CS) freqCSset->items[i];
 
 #if SHOW_CANDIDATES
-               /* DATA SOURCES */
+               // DATA SOURCES
                resultStr = (char *) malloc(sizeof(char) * resultStrSize);
                if (!resultStr) fprintf(stderr, "ERROR: Couldn't malloc 
memory!\n");
                strcpy(resultStr, "\0");
@@ -1439,11 +1516,204 @@ void printUML(CSset *freqCSset, int type
 
        TKNZRclose(&ret);
 }
+*/
+
+static
+void printUML2(CSset *freqCSset, CSlabel* labels, Relation*** 
relationMetadata, int** relationMetadataCount, int freqThreshold) {
+       str             propStr, tmpStr;
+       int             ret;
+       char*           schema = "rdf";
+
+       int             i, j, k;
+       FILE            *fout;
+       char            filename[20], tmp[10];
+
+       int             smallest = -1, biggest = -1;
+
+       if (TKNZRopen (NULL, &schema) != MAL_SUCCEED) {
+               fprintf(stderr, "could not open the tokenizer\n");
+       }
+
+       strcpy(filename, "CS2max");
+       sprintf(tmp, "%d", freqThreshold);
+       strcat(filename, tmp);
+       strcat(filename, ".dot");
+
+       fout = fopen(filename, "wt");
+
+       // header
+       fprintf(fout, "digraph g {\n");
+       fprintf(fout, "graph[ratio=\"compress\"];\n");
+       fprintf(fout, "node [shape=\"none\"];\n\n");
+
+       // find biggest and smallest table
+       for (i = 0; i < freqCSset->numCSadded; ++i) {
+               CS cs = (CS) freqCSset->items[i];
+               if (cs.parentFreqIdx != -1) continue; // ignore
+
+               // first values
+               if (smallest == -1) smallest = i;
+               if (biggest == -1) biggest = i;
+
+               if (cs.coverage < freqCSset->items[smallest].coverage) smallest 
= i;
+               if (cs.coverage > freqCSset->items[biggest].coverage) biggest = 
i;
+       }
+
+       for (i = 0; i < freqCSset->numCSadded; ++i) {
+               int width;
+               CS cs = (CS) freqCSset->items[i];
+               if (cs.parentFreqIdx != -1) continue; // ignore
+
+               // print header
+               width = (int) ((300 + 300 * 
(log10(freqCSset->items[i].coverage) - 
log10(freqCSset->items[smallest].coverage)) / 
(log10(freqCSset->items[biggest].coverage) - 
log10(freqCSset->items[smallest].coverage))) + 0.5); // width between 300 and 
600 px, using logarithm
+               fprintf(fout, "\"" BUNFMT "\" [\n", cs.csId);
+               fprintf(fout, "label = <<TABLE BORDER=\"0\" CELLBORDER=\"1\" 
CELLSPACING=\"0\">\n");
+               fprintf(fout, "<TR><TD WIDTH=\"%d\"><B>%s (#triples: 
%d)</B></TD></TR>\n", width, labels[i].name, cs.coverage);
+
+               for (j = 0; j < cs.numProp; ++j) {
+                       char    *propStrEscaped = NULL;
+                       char    *propStrShort = NULL;
+                       str color;
+
+                       takeOid(cs.lstProp[j], &tmpStr);
+
+                       // copy propStr to propStrEscaped because .dot-PORTs 
cannot contain colons and quotes
+                       propStr = removeBrackets(tmpStr);
+                       propStrEscaped = (char *) malloc(sizeof(char) * 
(strlen(propStr) + 1));
+                       if (!propStrEscaped) fprintf(stderr, "ERROR: Couldn't 
malloc memory!\n");
+                       memcpy(propStrEscaped, propStr, (strlen(propStr) + 1));
+                       escapeURI(propStrEscaped);
+                       getPropNameShort(&propStrShort, propStr);
+
+                       // assign color (the more tuples the property occurs 
in, the darker
+                       if ((1.0 * cs.lstPropSupport[j])/cs.support > 0.8) {
+                               color = "#5555FF";
+                       } else if ((1.0 * cs.lstPropSupport[j])/cs.support > 
0.6) {
+                               color = "#7777FF";
+                       } else if ((1.0 * cs.lstPropSupport[j])/cs.support > 
0.4) {
+                               color = "#9999FF";
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
http://mail.monetdb.org/mailman/listinfo/checkin-list

Reply via email to