Changeset: 9aa7d8033c08 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=9aa7d8033c08
Modified Files:
        monetdb5/extras/rdf/rdf.h
        monetdb5/extras/rdf/rdflabels.c
Branch: rdf
Log Message:

Add list of candidates for each CSlabel
Besides the name, a list of label candidates is stored for each CSlabel. The 
candidates are used by the CS merging algorithm.


diffs (truncated from 450 to 300 lines):

diff --git a/monetdb5/extras/rdf/rdf.h b/monetdb5/extras/rdf/rdf.h
--- a/monetdb5/extras/rdf/rdf.h
+++ b/monetdb5/extras/rdf/rdf.h
@@ -106,6 +106,8 @@ typedef enum {
 // Final data structure that stores the labels for tables and attributes
 typedef struct CSlabel {
        str             name;           // table name
+       str             *candidates;    // list of table name candidates, 
candidates[0] == name
+       int             candidatesCount;// number of entries in the candidates 
list
        str             *hierarchy;     // hierarchy "bottom to top"
        int             hierarchyCount; // number of entries in the hierarchy 
list
        int             numProp;        // number of properties, copied from 
freqCSset->items[x].numProp
diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -1712,8 +1712,20 @@ void getTableName(CSlabel* label, int cs
        int             i, j, k;
        str             *tmpList;
        int             tmpListCount;
+       char            nameFound = 0;
 
        // --- ONTOLOGY ---
+       // add all ontology candidates to list of candidates
+       if (resultCount[csIdx] >= 1) {
+               label->candidates = realloc(label->candidates, sizeof(str) * 
(label->candidatesCount + resultCount[csIdx]));
+               if (!label->candidates) fprintf(stderr, "ERROR: Couldn't 
realloc memory!\n");
+               for (i = 0; i < resultCount[csIdx]; ++i) {
+                       label->candidates[label->candidatesCount + i] = (char 
*) malloc(sizeof(char) * (strlen(result[csIdx][i]) + 1));
+                       strcpy(label->candidates[label->candidatesCount + i], 
result[csIdx][i]);
+               }
+               label->candidatesCount += resultCount[csIdx];
+       }
+
        // one ontology class --> use it
        if (resultCount[csIdx] == 1) {
 #if USE_SHORT_NAMES
@@ -1721,65 +1733,74 @@ void getTableName(CSlabel* label, int cs
 #else
                label->name = (char *) malloc(sizeof(char) * 
(strlen(result[csIdx][0]) + 1));
                strcpy(label->name, result[csIdx][0]);
+#endif
                label->hierarchy = getOntoHierarchy(label->name, 
&(label->hierarchyCount), ontmetadata, ontmetadataCount);
-#endif
-               return;
+               nameFound = 1;
        }
 
-       // multiple ontology classes --> intersect with types
-       if (resultCount[csIdx] > 1) {
-               tmpList = NULL;
-               tmpListCount = 0;
-               // search for type values
-               for (i = 0; i < typeAttributesCount; ++i) {
-                       for (j = 0; j < typeAttributesHistogramCount[csIdx][i]; 
++j) {
-                               if 
(typeAttributesHistogram[csIdx][i][j].percent < TYPE_FREQ_THRESHOLD) break; // 
sorted
-                               // intersect type with ontology classes
-                               for (k = 0; k < resultCount[csIdx]; ++k) {
-                                       if (strcmp(result[csIdx][k], 
typeAttributesHistogram[csIdx][i][j].value) == 0) {
-                                               // found, copy ontology class 
to tmpList
-                                               tmpList = (str *) 
realloc(tmpList, sizeof(str) * (tmpListCount + 1));
-                                               if (!tmpList) fprintf(stderr, 
"ERROR: Couldn't realloc memory!\n");
-                                               tmpList[tmpListCount] = 
result[csIdx][k]; // pointer, no copy
-                                               tmpListCount += 1;
+       if (!nameFound) {
+               // multiple ontology classes --> intersect with types
+               if (resultCount[csIdx] > 1) {
+                       tmpList = NULL;
+                       tmpListCount = 0;
+                       // search for type values
+                       for (i = 0; i < typeAttributesCount; ++i) {
+                               for (j = 0; j < 
typeAttributesHistogramCount[csIdx][i]; ++j) {
+                                       if 
(typeAttributesHistogram[csIdx][i][j].percent < TYPE_FREQ_THRESHOLD) break; // 
sorted
+                                       // intersect type with ontology classes
+                                       for (k = 0; k < resultCount[csIdx]; 
++k) {
+                                               if (strcmp(result[csIdx][k], 
typeAttributesHistogram[csIdx][i][j].value) == 0) {
+                                                       // found, copy ontology 
class to tmpList
+                                                       tmpList = (str *) 
realloc(tmpList, sizeof(str) * (tmpListCount + 1));
+                                                       if (!tmpList) 
fprintf(stderr, "ERROR: Couldn't realloc memory!\n");
+                                                       tmpList[tmpListCount] = 
result[csIdx][k]; // pointer, no copy
+                                                       tmpListCount += 1;
+                                               }
                                        }
                                }
                        }
+
+                       // only one left --> use it
+                       if (tmpListCount == 1) {
+#if USE_SHORT_NAMES
+                               getPropNameShort(&(label->name), tmpList[0]);
+#else
+                               label->name = (char *) malloc(sizeof(char) * 
(strlen(tmpList[0]) + 1));
+                               strcpy(label->name, tmpList[0]);
+#endif
+                               label->hierarchy = 
getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata, 
ontmetadataCount);
+                               free(tmpList);
+                               nameFound = 1;
+                       }
+
+                       if (!nameFound) {
+                               // multiple left --> use the class that covers 
most attributes, most popular ontology, ...
+                               if (tmpListCount > 1) {
+#if USE_SHORT_NAMES
+                                       getPropNameShort(&(label->name), 
tmpList[0]); // sorted
+#else
+                                       label->name = (char *) 
malloc(sizeof(char) * (strlen(tmpList[0]) + 1));
+                                       strcpy(label->name, tmpList[0]); // 
sorted
+#endif
+                                       label->hierarchy = 
getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata, 
ontmetadataCount);
+                                       free(tmpList);
+                                       nameFound = 1;
+                               }
+                       }
+
+                       if (!nameFound) {
+                               // empty intersection -> use the class that 
covers most attributes, most popular ontology, ..
+#if USE_SHORT_NAMES
+                               getPropNameShort(&(label->name), 
result[csIdx][0]); // sorted
+#else
+                               label->name = (char *) malloc(sizeof(char) * 
(strlen(result[csIdx][0]) + 1));
+                               strcpy(label->name, result[csIdx][0]); // sorted
+#endif
+                               label->hierarchy = 
getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata, 
ontmetadataCount);
+                               free(tmpList);
+                               nameFound = 1;
+                       }
                }
-               // only one left --> use it
-               if (tmpListCount == 1) {
-#if USE_SHORT_NAMES
-                       getPropNameShort(&(label->name), tmpList[0]);
-#else
-                       label->name = (char *) malloc(sizeof(char) * 
(strlen(tmpList[0]) + 1));
-                       strcpy(label->name, tmpList[0]);
-#endif
-                       label->hierarchy = getOntoHierarchy(label->name, 
&(label->hierarchyCount), ontmetadata, ontmetadataCount);
-                       free(tmpList);
-                       return;
-               }
-               // multiple left --> use the class that covers most attributes, 
most popular ontology, ...
-               if (tmpListCount > 1) {
-#if USE_SHORT_NAMES
-                       getPropNameShort(&(label->name), tmpList[0]); // sorted
-#else
-                       label->name = (char *) malloc(sizeof(char) * 
(strlen(tmpList[0]) + 1));
-                       strcpy(label->name, tmpList[0]); // sorted
-#endif
-                       label->hierarchy = getOntoHierarchy(label->name, 
&(label->hierarchyCount), ontmetadata, ontmetadataCount);
-                       free(tmpList);
-                       return;
-               }
-               // empty intersection -> use the class that covers most 
attributes, most popular ontology, ..
-#if USE_SHORT_NAMES
-               getPropNameShort(&(label->name), result[csIdx][0]); // sorted
-#else
-               label->name = (char *) malloc(sizeof(char) * 
(strlen(result[csIdx][0]) + 1));
-               strcpy(label->name, result[csIdx][0]); // sorted
-#endif
-               label->hierarchy = getOntoHierarchy(label->name, 
&(label->hierarchyCount), ontmetadata, ontmetadataCount);
-               free(tmpList);
-               return;
        }
 
        // --- TYPE ---
@@ -1794,54 +1815,107 @@ void getTableName(CSlabel* label, int cs
                tmpList[tmpListCount] = 
typeAttributesHistogram[csIdx][i][0].value; // pointer, no copy
                tmpListCount += 1;
        }
-       // one type attribute --> use most frequent one
-       if (tmpListCount == 1) {
-               // only one type attribute, use most frequent value (sorted)
-#if USE_SHORT_NAMES
-               getPropNameShort(&(label->name), tmpList[0]);
-#else
-               label->name = (char *) malloc(sizeof(char) * 
(strlen(tmpList[0]) + 1));
-               strcpy(label->name, tmpList[0]);
-#endif
-               return;
-       }
-       // multiple type attributes --> use the one with fewest occurances in 
other CS's
-       if (tmpListCount > 1) {
+
+       // add all most frequent type values to list of candidates
+       if (tmpListCount >= 1) {
+               int counter = 0;
+               label->candidates = realloc(label->candidates, sizeof(str) * 
(label->candidatesCount + tmpListCount));
+               if (!label->candidates) fprintf(stderr, "ERROR: Couldn't 
realloc memory!\n");
                for (i = 0; i < typeStatCount; ++i) {
                        for (j = 0; j < tmpListCount; ++j) {
                                if (strcmp(typeStat[i].value, tmpList[j]) == 0) 
{
+                                       
label->candidates[label->candidatesCount + counter] = (char *) 
malloc(sizeof(char) * (strlen(tmpList[j]) + 1));
+                                       
strcpy(label->candidates[label->candidatesCount + counter], tmpList[j]);
+                                       counter++;
+                               }
+                       }
+               }
+               assert(counter == tmpListCount);
+               label->candidatesCount += tmpListCount;
+       }
+
+       if (!nameFound) {
+               // one type attribute --> use most frequent one
+               if (tmpListCount == 1) {
+                       // only one type attribute, use most frequent value 
(sorted)
 #if USE_SHORT_NAMES
-                                       getPropNameShort(&(label->name), 
tmpList[j]);
+                       getPropNameShort(&(label->name), tmpList[0]);
 #else
-                                       label->name = (char *) 
malloc(sizeof(char) * (strlen(tmpList[j]) + 1));
-                                       strcpy(label->name, tmpList[j]);
+                       label->name = (char *) malloc(sizeof(char) * 
(strlen(tmpList[0]) + 1));
+                       strcpy(label->name, tmpList[0]);
 #endif
-                                       return;
+                       nameFound = 1;
+               }
+       }
+
+       if (!nameFound) {
+               // multiple type attributes --> use the one with fewest 
occurances in other CS's
+               if (tmpListCount > 1) {
+                       for (i = 0; i < typeStatCount && !nameFound; ++i) {
+                               for (j = 0; j < tmpListCount && !nameFound; 
++j) {
+                                       if (strcmp(typeStat[i].value, 
tmpList[j]) == 0) {
+#if USE_SHORT_NAMES
+                                               
getPropNameShort(&(label->name), tmpList[j]);
+#else
+                                               label->name = (char *) 
malloc(sizeof(char) * (strlen(tmpList[j]) + 1));
+                                               strcpy(label->name, tmpList[j]);
+#endif
+                                               nameFound = 1;
+                                       }
                                }
                        }
                }
        }
 
        // --- FK ---
-       // incident foreign keys --> use the one with the most occurances (num 
and freq)
+       // add top3 fk values to list of candidates
        if (links[csIdx].num > 0) {
-               str propStr, tmpStr;
-               takeOid(links[csIdx].fks[0].prop, &tmpStr); // sorted
-               propStr = removeBrackets(tmpStr);
+               label->candidates = realloc(label->candidates, sizeof(str) * 
(label->candidatesCount + MIN(3, links[csIdx].num)));
+               if (!label->candidates) fprintf(stderr, "ERROR: Couldn't 
realloc memory!\n");
+               for (i = 0; i < MIN(3, links[csIdx].num); ++i) {
+                       str propStr, tmpStr;
+                       takeOid(links[csIdx].fks[0].prop, &tmpStr);
+                       propStr = removeBrackets(tmpStr);
+
+                       label->candidates[label->candidatesCount + i] = (char 
*) malloc(sizeof(char) * (strlen(propStr) + 1));
+                       strcpy(label->candidates[label->candidatesCount + i], 
propStr);
+               }
+               label->candidatesCount += MIN(3, links[csIdx].num);
+       }
+
+       if (!nameFound) {
+               // incident foreign keys --> use the one with the most 
occurances (num and freq)
+               if (links[csIdx].num > 0) {
+                       str propStr, tmpStr;
+                       takeOid(links[csIdx].fks[0].prop, &tmpStr); // sorted
+                       propStr = removeBrackets(tmpStr);
 #if USE_SHORT_NAMES
-               getPropNameShort(&(label->name), propStr);
+                       getPropNameShort(&(label->name), propStr);
 #else
-               label->name = (char *) malloc(sizeof(char) * (strlen(propStr) + 
1));
-               strcpy(label->name, propStr);
+                       label->name = (char *) malloc(sizeof(char) * 
(strlen(propStr) + 1));
+                       strcpy(label->name, propStr);
 #endif
-               GDKfree(tmpStr);
-               GDKfree(propStr);
-               return;
+                       GDKfree(tmpStr);
+                       GDKfree(propStr);
+                       nameFound = 1;
+               }
        }
 
        // --- NOTHING ---
-       label->name = (char *) malloc(sizeof(char) * 6);
-       strcpy(label->name, "DUMMY");
+       if (label->candidatesCount == 0) {
+               label->candidates = realloc(label->candidates, sizeof(str));
+               if (!label->candidates) fprintf(stderr, "ERROR: Couldn't 
realloc memory!\n");
+               label->candidates[0] = (char *) malloc(sizeof(char) * 6);
+               strcpy(label->candidates[0], "DUMMY");
+               label->candidatesCount = 1;
+       }
+
+       if (!nameFound) {
+               label->name = (char *) malloc(sizeof(char) * 6);
+               strcpy(label->name, "DUMMY");
+               nameFound = 1;
+       }
+
        return;
 }
 #endif
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
http://mail.monetdb.org/mailman/listinfo/checkin-list

Reply via email to