Changeset: 9aa7d8033c08 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=9aa7d8033c08 Modified Files: monetdb5/extras/rdf/rdf.h monetdb5/extras/rdf/rdflabels.c Branch: rdf Log Message:
Add list of candidates for each CSlabel. Besides the name, a list of label candidates is stored for each CSlabel. The candidates are used by the CS merging algorithm. diffs (truncated from 450 to 300 lines): diff --git a/monetdb5/extras/rdf/rdf.h b/monetdb5/extras/rdf/rdf.h --- a/monetdb5/extras/rdf/rdf.h +++ b/monetdb5/extras/rdf/rdf.h @@ -106,6 +106,8 @@ typedef enum { // Final data structure that stores the labels for tables and attributes typedef struct CSlabel { str name; // table name + str *candidates; // list of table name candidates, candidates[0] == name + int candidatesCount;// number of entries in the candidates list str *hierarchy; // hierarchy "bottom to top" int hierarchyCount; // number of entries in the hierarchy list int numProp; // number of properties, copied from freqCSset->items[x].numProp diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -1712,8 +1712,20 @@ void getTableName(CSlabel* label, int cs int i, j, k; str *tmpList; int tmpListCount; + char nameFound = 0; // --- ONTOLOGY --- + // add all ontology candidates to list of candidates + if (resultCount[csIdx] >= 1) { + label->candidates = realloc(label->candidates, sizeof(str) * (label->candidatesCount + resultCount[csIdx])); + if (!label->candidates) fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); + for (i = 0; i < resultCount[csIdx]; ++i) { + label->candidates[label->candidatesCount + i] = (char *) malloc(sizeof(char) * (strlen(result[csIdx][i]) + 1)); + strcpy(label->candidates[label->candidatesCount + i], result[csIdx][i]); + } + label->candidatesCount += resultCount[csIdx]; + } + // one ontology class --> use it if (resultCount[csIdx] == 1) { #if USE_SHORT_NAMES @@ -1721,65 +1733,74 @@ void getTableName(CSlabel* label, int cs #else label->name = (char *) malloc(sizeof(char) * (strlen(result[csIdx][0]) + 1)); strcpy(label->name, result[csIdx][0]); +#endif label->hierarchy = 
getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata, ontmetadataCount); -#endif - return; + nameFound = 1; } - // multiple ontology classes --> intersect with types - if (resultCount[csIdx] > 1) { - tmpList = NULL; - tmpListCount = 0; - // search for type values - for (i = 0; i < typeAttributesCount; ++i) { - for (j = 0; j < typeAttributesHistogramCount[csIdx][i]; ++j) { - if (typeAttributesHistogram[csIdx][i][j].percent < TYPE_FREQ_THRESHOLD) break; // sorted - // intersect type with ontology classes - for (k = 0; k < resultCount[csIdx]; ++k) { - if (strcmp(result[csIdx][k], typeAttributesHistogram[csIdx][i][j].value) == 0) { - // found, copy ontology class to tmpList - tmpList = (str *) realloc(tmpList, sizeof(str) * (tmpListCount + 1)); - if (!tmpList) fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); - tmpList[tmpListCount] = result[csIdx][k]; // pointer, no copy - tmpListCount += 1; + if (!nameFound) { + // multiple ontology classes --> intersect with types + if (resultCount[csIdx] > 1) { + tmpList = NULL; + tmpListCount = 0; + // search for type values + for (i = 0; i < typeAttributesCount; ++i) { + for (j = 0; j < typeAttributesHistogramCount[csIdx][i]; ++j) { + if (typeAttributesHistogram[csIdx][i][j].percent < TYPE_FREQ_THRESHOLD) break; // sorted + // intersect type with ontology classes + for (k = 0; k < resultCount[csIdx]; ++k) { + if (strcmp(result[csIdx][k], typeAttributesHistogram[csIdx][i][j].value) == 0) { + // found, copy ontology class to tmpList + tmpList = (str *) realloc(tmpList, sizeof(str) * (tmpListCount + 1)); + if (!tmpList) fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); + tmpList[tmpListCount] = result[csIdx][k]; // pointer, no copy + tmpListCount += 1; + } } } } + + // only one left --> use it + if (tmpListCount == 1) { +#if USE_SHORT_NAMES + getPropNameShort(&(label->name), tmpList[0]); +#else + label->name = (char *) malloc(sizeof(char) * (strlen(tmpList[0]) + 1)); + strcpy(label->name, tmpList[0]); +#endif 
+ label->hierarchy = getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata, ontmetadataCount); + free(tmpList); + nameFound = 1; + } + + if (!nameFound) { + // multiple left --> use the class that covers most attributes, most popular ontology, ... + if (tmpListCount > 1) { +#if USE_SHORT_NAMES + getPropNameShort(&(label->name), tmpList[0]); // sorted +#else + label->name = (char *) malloc(sizeof(char) * (strlen(tmpList[0]) + 1)); + strcpy(label->name, tmpList[0]); // sorted +#endif + label->hierarchy = getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata, ontmetadataCount); + free(tmpList); + nameFound = 1; + } + } + + if (!nameFound) { + // empty intersection -> use the class that covers most attributes, most popular ontology, .. +#if USE_SHORT_NAMES + getPropNameShort(&(label->name), result[csIdx][0]); // sorted +#else + label->name = (char *) malloc(sizeof(char) * (strlen(result[csIdx][0]) + 1)); + strcpy(label->name, result[csIdx][0]); // sorted +#endif + label->hierarchy = getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata, ontmetadataCount); + free(tmpList); + nameFound = 1; + } } - // only one left --> use it - if (tmpListCount == 1) { -#if USE_SHORT_NAMES - getPropNameShort(&(label->name), tmpList[0]); -#else - label->name = (char *) malloc(sizeof(char) * (strlen(tmpList[0]) + 1)); - strcpy(label->name, tmpList[0]); -#endif - label->hierarchy = getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata, ontmetadataCount); - free(tmpList); - return; - } - // multiple left --> use the class that covers most attributes, most popular ontology, ... 
- if (tmpListCount > 1) { -#if USE_SHORT_NAMES - getPropNameShort(&(label->name), tmpList[0]); // sorted -#else - label->name = (char *) malloc(sizeof(char) * (strlen(tmpList[0]) + 1)); - strcpy(label->name, tmpList[0]); // sorted -#endif - label->hierarchy = getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata, ontmetadataCount); - free(tmpList); - return; - } - // empty intersection -> use the class that covers most attributes, most popular ontology, .. -#if USE_SHORT_NAMES - getPropNameShort(&(label->name), result[csIdx][0]); // sorted -#else - label->name = (char *) malloc(sizeof(char) * (strlen(result[csIdx][0]) + 1)); - strcpy(label->name, result[csIdx][0]); // sorted -#endif - label->hierarchy = getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata, ontmetadataCount); - free(tmpList); - return; } // --- TYPE --- @@ -1794,54 +1815,107 @@ void getTableName(CSlabel* label, int cs tmpList[tmpListCount] = typeAttributesHistogram[csIdx][i][0].value; // pointer, no copy tmpListCount += 1; } - // one type attribute --> use most frequent one - if (tmpListCount == 1) { - // only one type attribute, use most frequent value (sorted) -#if USE_SHORT_NAMES - getPropNameShort(&(label->name), tmpList[0]); -#else - label->name = (char *) malloc(sizeof(char) * (strlen(tmpList[0]) + 1)); - strcpy(label->name, tmpList[0]); -#endif - return; - } - // multiple type attributes --> use the one with fewest occurances in other CS's - if (tmpListCount > 1) { + + // add all most frequent type values to list of candidates + if (tmpListCount >= 1) { + int counter = 0; + label->candidates = realloc(label->candidates, sizeof(str) * (label->candidatesCount + tmpListCount)); + if (!label->candidates) fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); for (i = 0; i < typeStatCount; ++i) { for (j = 0; j < tmpListCount; ++j) { if (strcmp(typeStat[i].value, tmpList[j]) == 0) { + label->candidates[label->candidatesCount + counter] = (char *) malloc(sizeof(char) * 
(strlen(tmpList[j]) + 1)); + strcpy(label->candidates[label->candidatesCount + counter], tmpList[j]); + counter++; + } + } + } + assert(counter == tmpListCount); + label->candidatesCount += tmpListCount; + } + + if (!nameFound) { + // one type attribute --> use most frequent one + if (tmpListCount == 1) { + // only one type attribute, use most frequent value (sorted) #if USE_SHORT_NAMES - getPropNameShort(&(label->name), tmpList[j]); + getPropNameShort(&(label->name), tmpList[0]); #else - label->name = (char *) malloc(sizeof(char) * (strlen(tmpList[j]) + 1)); - strcpy(label->name, tmpList[j]); + label->name = (char *) malloc(sizeof(char) * (strlen(tmpList[0]) + 1)); + strcpy(label->name, tmpList[0]); #endif - return; + nameFound = 1; + } + } + + if (!nameFound) { + // multiple type attributes --> use the one with fewest occurances in other CS's + if (tmpListCount > 1) { + for (i = 0; i < typeStatCount && !nameFound; ++i) { + for (j = 0; j < tmpListCount && !nameFound; ++j) { + if (strcmp(typeStat[i].value, tmpList[j]) == 0) { +#if USE_SHORT_NAMES + getPropNameShort(&(label->name), tmpList[j]); +#else + label->name = (char *) malloc(sizeof(char) * (strlen(tmpList[j]) + 1)); + strcpy(label->name, tmpList[j]); +#endif + nameFound = 1; + } } } } } // --- FK --- - // incident foreign keys --> use the one with the most occurances (num and freq) + // add top3 fk values to list of candidates if (links[csIdx].num > 0) { - str propStr, tmpStr; - takeOid(links[csIdx].fks[0].prop, &tmpStr); // sorted - propStr = removeBrackets(tmpStr); + label->candidates = realloc(label->candidates, sizeof(str) * (label->candidatesCount + MIN(3, links[csIdx].num))); + if (!label->candidates) fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); + for (i = 0; i < MIN(3, links[csIdx].num); ++i) { + str propStr, tmpStr; + takeOid(links[csIdx].fks[0].prop, &tmpStr); + propStr = removeBrackets(tmpStr); + + label->candidates[label->candidatesCount + i] = (char *) malloc(sizeof(char) * 
(strlen(propStr) + 1)); + strcpy(label->candidates[label->candidatesCount + i], propStr); + } + label->candidatesCount += MIN(3, links[csIdx].num); + } + + if (!nameFound) { + // incident foreign keys --> use the one with the most occurances (num and freq) + if (links[csIdx].num > 0) { + str propStr, tmpStr; + takeOid(links[csIdx].fks[0].prop, &tmpStr); // sorted + propStr = removeBrackets(tmpStr); #if USE_SHORT_NAMES - getPropNameShort(&(label->name), propStr); + getPropNameShort(&(label->name), propStr); #else - label->name = (char *) malloc(sizeof(char) * (strlen(propStr) + 1)); - strcpy(label->name, propStr); + label->name = (char *) malloc(sizeof(char) * (strlen(propStr) + 1)); + strcpy(label->name, propStr); #endif - GDKfree(tmpStr); - GDKfree(propStr); - return; + GDKfree(tmpStr); + GDKfree(propStr); + nameFound = 1; + } } // --- NOTHING --- - label->name = (char *) malloc(sizeof(char) * 6); - strcpy(label->name, "DUMMY"); + if (label->candidatesCount == 0) { + label->candidates = realloc(label->candidates, sizeof(str)); + if (!label->candidates) fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); + label->candidates[0] = (char *) malloc(sizeof(char) * 6); + strcpy(label->candidates[0], "DUMMY"); + label->candidatesCount = 1; + } + + if (!nameFound) { + label->name = (char *) malloc(sizeof(char) * 6); + strcpy(label->name, "DUMMY"); + nameFound = 1; + } + return; } #endif _______________________________________________ checkin-list mailing list checkin-list@monetdb.org http://mail.monetdb.org/mailman/listinfo/checkin-list