Changeset: 4f9d12a701c4 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=4f9d12a701c4 Modified Files: monetdb5/extras/rdf/rdflabels.c monetdb5/extras/rdf/rdflabels.h monetdb5/extras/rdf/rdfschema.c Branch: rdf Log Message:
Add ontology tree Stores distribution of data, used for CS merging diffs (truncated from 331 to 300 lines): diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -1656,7 +1656,6 @@ void printUML2(CSset *freqCSset, CSlabel TKNZRclose(&ret); } -#if USE_TABLE_NAME static str* getOntoHierarchy(str ontology, int* hierarchyCount, str** ontmetadata, int ontmetadataCount) { int i; @@ -1677,7 +1676,7 @@ str* getOntoHierarchy(str ontology, int* // lookup superclass int foundTuple = 0; for (i = 0; i < ontmetadataCount; ++i) { - str muristr = ontmetadata[0][i]; + str muristr = ontmetadata[0][i]; str msuperstr = ontmetadata[1][i]; if (strcmp(hierarchy[(*hierarchyCount) - 1], muristr) == 0) { // found entry @@ -1707,8 +1706,6 @@ str* getOntoHierarchy(str ontology, int* return hierarchy; } -#endif - #if USE_TABLE_NAME /* For one CS: Choose the best table name out of all collected candidates (ontology, type, fk). */ @@ -1972,6 +1969,182 @@ void createLinks(CSset* freqCSset, Relat #endif static +void createOntoUsageTreeStatistics(OntoUsageNode* tree, int numTuples) { + int i; + + if (tree->numChildren == 0) { + // leaf node + tree->numOccurancesSum = tree->numOccurances; + tree->percentage = (1.0 * tree->numOccurancesSum) / numTuples; + } else { + // inner node + tree->numOccurancesSum = tree->numOccurances; + for (i = 0; i < tree->numChildren; ++i) { + createOntoUsageTreeStatistics(tree->lstChildren[i], numTuples); + // sum up data + tree->numOccurancesSum += tree->lstChildren[i]->numOccurancesSum; + } + tree->percentage = (1.0 * tree->numOccurancesSum) / numTuples; + } +} + +static +void addToOntoUsageTree(OntoUsageNode* tree, str* hierarchy, int hierarchyCount, int numTuples) { + int i; + str uri; + OntoUsageNode *leaf; + + if (hierarchyCount == 0) { + // found position in tree +// tree->numOccurances += numTuples; // TODO cs.support not yet available + tree->numOccurances += 1; + return; + } + + // search through children + uri = hierarchy[hierarchyCount - 1]; + hierarchyCount--; + for (i = 0; i < tree->numChildren; ++i) { + if (strcmp(tree->lstChildren[i]->uri, uri) == 0) { + // found + addToOntoUsageTree(tree->lstChildren[i], hierarchy, hierarchyCount, numTuples); + return; + } + } + + // child not found + // create leaf + leaf = (OntoUsageNode *) malloc(sizeof(OntoUsageNode)); + if (!leaf) + fprintf(stderr, "ERROR: Couldn't malloc memory!\n"); + leaf->parent = tree; + leaf->uri = (str) malloc(sizeof(char) * (strlen(uri) + 1)); + if (!leaf->uri) + fprintf(stderr, "ERROR: Couldn't malloc memory!\n"); + strcpy(leaf->uri, uri); + leaf->lstChildren = NULL; + leaf->numChildren = 0; + leaf->numOccurances = 0; + leaf->numOccurancesSum = 0; + leaf->percentage = 0.0; + // add to tree + tree->numChildren++; + tree->lstChildren = realloc(tree->lstChildren, sizeof(OntoUsageNode *) * tree->numChildren); + if (!tree->lstChildren) + fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); + tree->lstChildren[tree->numChildren - 1] = leaf; + // call + addToOntoUsageTree(leaf, hierarchy, hierarchyCount, numTuples); +} + + +static +void printTree(OntoUsageNode* tree, int level) { + int i; + printf("Level %d URI %s Count %d Sum %d Percent %.1f\n", level, tree->uri, tree->numOccurances, tree->numOccurancesSum, tree->percentage * 100); + for (i = 0; i < tree->numChildren; ++i) { + printTree(tree->lstChildren[i], level+1); + } +} + +static +void createOntoUsageTree(OntoUsageNode** tree, CSset* freqCSset, str** ontmetadata, int ontmetadataCount, str** result, int* resultCount, int typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** typeAttributesHistogramCount) { + int i, j, k; + str *tmpList; + int tmpListCount; + int numTuples = 0; + + // init tree with an artifical root node + (*tree) = (OntoUsageNode *) malloc(sizeof(OntoUsageNode)); + if (!(*tree)) + fprintf(stderr, "ERROR: Couldn't malloc memory!\n"); + (*tree)->parent = NULL; + (*tree)->uri = NULL; // artificial root; + (*tree)->lstChildren = NULL; + (*tree)->numChildren = 0; + (*tree)->numOccurances = 0; + (*tree)->numOccurancesSum = 0; + (*tree)->percentage = 0.0; + + // loop through data + for (i = 0; i < freqCSset->numCSadded; ++i) { + str uri; + int hierarchyCount = 0; + str* hierarchy; + + // get ontology + // copied from getTableName, TODO improve! + if (resultCount[i] == 0) { + // no hierarchy --> ignore + continue; + } else if (resultCount[i] == 1) { + // one ontology class --> use it + uri = (char *) malloc(sizeof(char) * (strlen(result[i][0]) + 1)); + if (!uri) + fprintf(stderr, "ERROR: Couldn't malloc memory!\n"); + strcpy(uri, result[i][0]); + } else { + // multiple ontology classes --> intersect with types + tmpList = NULL; + tmpListCount = 0; + // search for type values + for (i = 0; i < typeAttributesCount; ++i) { + for (j = 0; j < typeAttributesHistogramCount[i][i]; ++j) { + if (typeAttributesHistogram[i][i][j].percent < TYPE_FREQ_THRESHOLD) break; // sorted + // intersect type with ontology classes + for (k = 0; k < resultCount[i]; ++k) { + if (strcmp(result[i][k], typeAttributesHistogram[i][i][j].value) == 0) { + // found, copy ontology class to tmpList + tmpList = (str *) realloc(tmpList, sizeof(str) * (tmpListCount + 1)); + if (!tmpList) fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); + tmpList[tmpListCount] = result[i][k]; // pointer, no copy + tmpListCount += 1; + } + } + } + } + if (tmpListCount == 1) { + // only one left --> use it + uri = (char *) malloc(sizeof(char) * (strlen(tmpList[0]) + 1)); + if (!uri) + fprintf(stderr, "ERROR: Couldn't malloc memory!\n"); + strcpy(uri, tmpList[0]); + free(tmpList); + } else if (tmpListCount > 1) { + // multiple left --> use the class that covers most attributes, most popular ontology, ... + uri = (char *) malloc(sizeof(char) * (strlen(tmpList[0]) + 1)); + if (!uri) + fprintf(stderr, "ERROR: Couldn't malloc memory!\n"); + strcpy(uri, tmpList[0]); // sorted + free(tmpList); + } else { + // empty intersection -> use the class that covers most attributes, most popular ontology, .. + uri = (char *) malloc(sizeof(char) * (strlen(result[i][0]) + 1)); + if (!uri) + fprintf(stderr, "ERROR: Couldn't malloc memory!\n"); + strcpy(uri, result[i][0]); // sorted + free(tmpList); + } + } + + // get ontology hierarchy + hierarchy = getOntoHierarchy(uri, &hierarchyCount, ontmetadata, ontmetadataCount); + + // search class in tree and add CS to statistics + addToOntoUsageTree(*tree, hierarchy, hierarchyCount, freqCSset->items[i].support); +// numTuples += freqCSset->items[i].support; // update total number of tuples in dataset // TODO cs.support not yet available + numTuples += 1; + } + + // calculate summed parameters + createOntoUsageTreeStatistics(*tree, numTuples); + + // print + printf("Ontology tree:\n"); + printTree(*tree, 0); +} + +static void freeTypeAttributesHistogram(TypeAttributesFreq*** typeAttributesHistogram, int csCount, int typeAttributesCount) { int i, j; @@ -2111,7 +2284,7 @@ int* getSubCS(CSset* freqCSset, int csId } /* Creates labels for all CS (without a parent). */ -CSlabel* createLabels(CSset* freqCSset, CSrel* csrelSet, int num, BAT *sbat, BATiter si, BATiter pi, BATiter oi, oid *subjCSMap, BAT* mbat, int *csIdFreqIdxMap, str** ontattributes, int ontattributesCount, str** ontmetadata, int ontmetadataCount) { +CSlabel* createLabels(CSset* freqCSset, CSrel* csrelSet, int num, BAT *sbat, BATiter si, BATiter pi, BATiter oi, oid *subjCSMap, BAT* mbat, int *csIdFreqIdxMap, str** ontattributes, int ontattributesCount, str** ontmetadata, int ontmetadataCount, OntoUsageNode** ontoUsageTree) { #if USE_TYPE_NAMES char* typeAttributes[] = { "http://ogp.me/ns#type", @@ -2178,9 +2351,7 @@ CSlabel* createLabels(CSset* freqCSset, // freeOntmetadata(ontmetadata); #else (void) ontattributesCount; - (void) ontmetadataCount; (void) ontattributes; - (void) ontmetadata; #endif // Assigning Names @@ -2190,6 +2361,9 @@ CSlabel* createLabels(CSset* freqCSset, if (typeStatCount > 0) free(typeStat); #endif + // Collect ontology statistics (tree) + createOntoUsageTree(ontoUsageTree, freqCSset, ontmetadata, ontmetadataCount, ontologyLookupResult, ontologyLookupResultCount, typeAttributesCount, typeAttributesHistogram, typeAttributesHistogramCount); + free(ontologyLookupResultCount); freeOntologyLookupResult(ontologyLookupResult, freqCSset->numCSadded); freeTypeAttributesHistogram(typeAttributesHistogram, freqCSset->numCSadded, typeAttributesCount); @@ -2311,3 +2485,22 @@ void freeFinalLabels(CSlabel* labels, CS } free(labels); } + +void freeOntoUsageTree(OntoUsageNode* tree) { + int i; + + if (tree->numChildren == 0) { + // leaf node + free(tree->uri); + free(tree); + return; + } + + // inner node + for (i = 0; i < tree->numChildren; ++i) { + freeOntoUsageTree(tree->lstChildren[i]); + } + free(tree->lstChildren); + free(tree->uri); + free(tree); +} diff --git a/monetdb5/extras/rdf/rdflabels.h b/monetdb5/extras/rdf/rdflabels.h --- a/monetdb5/extras/rdf/rdflabels.h +++ b/monetdb5/extras/rdf/rdflabels.h @@ -68,6 +68,17 @@ typedef struct TypeStat { int freq; // number of CS's the value occurs in } TypeStat; +// Tree node to store the number of tuples per ontology class +typedef struct OntoUsageNode { + struct OntoUsageNode *parent; + struct OntoUsageNode **lstChildren; + str uri; // TODO uri==NULL <=> artificial root + int numChildren; + int numOccurances; // TODO overflow 2,000,000? + int numOccurancesSum; + float percentage; // TODO rename, range [0..1] +} OntoUsageNode; + #define FK_FREQ_THRESHOLD 25 // X % of the targeted subjects have to be in this table #define TYPE_FREQ_THRESHOLD 10 // X % of the type values have to be this value #define ONTOLOGY_FREQ_THRESHOLD 0.5 // similarity threshold for tfidf simularity for ontology classes @@ -80,7 +91,7 @@ typedef struct TypeStat { #define SHOW_CANDIDATES 0 // inserts a row in UML diagrams to show all candidate names rdf_export CSlabel* -createLabels(CSset* freqCSset, CSrel* csrelSet, int num, BAT *sbat, BATiter si, BATiter pi, BATiter oi, oid *subjCSMap, BAT* mapbat, int *csIdFreqIdxMap, str** ontattributes, int ontattributesCount, str** ontmetadata, int ontmetadataCount); +createLabels(CSset* freqCSset, CSrel* csrelSet, int num, BAT *sbat, BATiter si, BATiter pi, BATiter oi, oid *subjCSMap, BAT* mapbat, int *csIdFreqIdxMap, str** ontattributes, int ontattributesCount, str** ontmetadata, int ontmetadataCount, OntoUsageNode** ontoUsageTree); rdf_export CSlabel* createFinalLabels(CSlabel* labels, CSset* freqCSset, CSmergeRel* csRelBetweenMergeFreqSet, int freqThreshold); @@ -91,4 +102,7 @@ freeLabels(CSlabel* labels, CSset* freqC rdf_export void freeFinalLabels(CSlabel* labels, CSset* freqCSset); +rdf_export void +freeOntoUsageTree(OntoUsageNode* tree); _______________________________________________ checkin-list mailing list checkin-list@monetdb.org http://mail.monetdb.org/mailman/listinfo/checkin-list