Changeset: c8f249486552 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=c8f249486552 Modified Files: monetdb5/extras/rdf/rdflabels.c monetdb5/extras/rdf/rdflabels.h monetdb5/extras/rdf/rdfschema.c monetdb5/extras/rdf/rdfschema.h Branch: rdf Log Message:
Change the priority in assigning label to type-ontology-FK. Slightly change the TF/IDF computation. Disable S3 (Sub-super) and S5 (FK). diffs (truncated from 444 to 300 lines): diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -1472,6 +1472,7 @@ void createOntologyLookupResult(oid** re // get class names resultCount[i] = 0; + result[i] = getOntologyCandidates(ontattributes, ontattributesCount, ontmetadata, ontmetadataCount, &(resultCount[i]), propOntologiesOids, propOntologiesCount, ontologyCount, propStat); for (j = 0; j < ontologyCount; ++j) { @@ -2082,90 +2083,6 @@ void getTableName(CSlabel* label, int cs (void) ontmetaBat; - // --- ONTOLOGY --- - // add all ontology candidates to list of candidates - if (resultCount[csIdx] >= 1) { - label->candidatesOntology = resultCount[csIdx]; - label->candidates = GDKrealloc(label->candidates, sizeof(oid) * (label->candidatesCount + resultCount[csIdx])); - if (!label->candidates) fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); - for (i = 0; i < resultCount[csIdx]; ++i) { - label->candidates[label->candidatesCount + i] = result[csIdx][i]; - } - label->candidatesCount += resultCount[csIdx]; - } - - // one ontology class --> use it - if (resultCount[csIdx] == 1) { - label->name = result[csIdx][0]; - label->hierarchy = getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata, ontmetadataCount); - nameFound = 1; - #if INFO_WHERE_NAME_FROM - label->isOntology = 1; - #endif - } - - if (!nameFound) { - // multiple ontology classes --> intersect with types - if (resultCount[csIdx] > 1) { - tmpList = NULL; - tmpListCount = 0; - // search for type values - for (i = 0; i < typeAttributesCount; ++i) { - for (j = 0; j < typeAttributesHistogramCount[csIdx][i]; ++j) { - if (typeAttributesHistogram[csIdx][i][j].percent < TYPE_FREQ_THRESHOLD) break; // sorted - - // intersect type with ontology classes - for (k = 0; k < 
resultCount[csIdx]; ++k) { - if (result[csIdx][k] == typeAttributesHistogram[csIdx][i][j].value) { - // found, copy ontology class to tmpList - tmpList = (oid *) realloc(tmpList, sizeof(oid) * (tmpListCount + 1)); - if (!tmpList) fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); - tmpList[tmpListCount] = result[csIdx][k]; - tmpListCount += 1; - } - } - } - } - - // only one left --> use it - if (tmpListCount == 1) { - label->name = tmpList[0]; - label->hierarchy = getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata, ontmetadataCount); - free(tmpList); - nameFound = 1; - #if INFO_WHERE_NAME_FROM - label->isOntology = 1; - #endif - } - - if (!nameFound) { - // multiple left --> use the class that covers most attributes, most popular ontology, ... - if (tmpListCount > 1) { - label->name = tmpList[0]; // sorted - label->hierarchy = getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata, ontmetadataCount); - free(tmpList); - nameFound = 1; - - #if INFO_WHERE_NAME_FROM - label->isOntology = 1; - #endif - } - } - - if (!nameFound) { - // empty intersection -> use the class that covers most attributes, most popular ontology, .. 
- label->name = result[csIdx][0]; // sorted - label->hierarchy = getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata, ontmetadataCount); - free(tmpList); - nameFound = 1; - - #if INFO_WHERE_NAME_FROM - label->isOntology = 1; - #endif - } - } - } - // --- TYPE --- // get most frequent type value per type attribute tmpList = NULL; @@ -2204,7 +2121,7 @@ void getTableName(CSlabel* label, int cs int freq; if (typeAttributesHistogram[csIdx][i][j].percent < TYPE_FREQ_THRESHOLD) break; depth = ontclassSet[BUNfnd(BATmirror(ontmetaBat), &typeAttributesHistogram[csIdx][i][j].value)].hierDepth; - maxDepth = ontclassSet[BUNfnd(BATmirror(ontmetaBat), &maxDepthOid)].hierDepth;; + maxDepth = ontclassSet[BUNfnd(BATmirror(ontmetaBat), &maxDepthOid)].hierDepth; freq = typeAttributesHistogram[csIdx][i][j].freq; if (depth > maxDepth) { // choose value with higher hierarchy level @@ -2269,6 +2186,95 @@ void getTableName(CSlabel* label, int cs } } + + // --- ONTOLOGY --- + // add all ontology candidates to list of candidates + if (resultCount[csIdx] >= 1) { + label->candidatesOntology = resultCount[csIdx]; + label->candidates = GDKrealloc(label->candidates, sizeof(oid) * (label->candidatesCount + resultCount[csIdx])); + if (!label->candidates) fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); + for (i = 0; i < resultCount[csIdx]; ++i) { + label->candidates[label->candidatesCount + i] = result[csIdx][i]; + } + label->candidatesCount += resultCount[csIdx]; + } + + // one ontology class --> use it + if (!nameFound){ + if (resultCount[csIdx] == 1) { + label->name = result[csIdx][0]; + label->hierarchy = getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata, ontmetadataCount); + nameFound = 1; + #if INFO_WHERE_NAME_FROM + label->isOntology = 1; + #endif + } + } + + if (!nameFound) { + // multiple ontology classes --> intersect with types + if (resultCount[csIdx] > 1) { + tmpList = NULL; + tmpListCount = 0; + // search for type values + for (i = 0; i < 
typeAttributesCount; ++i) { + for (j = 0; j < typeAttributesHistogramCount[csIdx][i]; ++j) { + if (typeAttributesHistogram[csIdx][i][j].percent < TYPE_FREQ_THRESHOLD) break; // sorted + + // intersect type with ontology classes + for (k = 0; k < resultCount[csIdx]; ++k) { + if (result[csIdx][k] == typeAttributesHistogram[csIdx][i][j].value) { + // found, copy ontology class to tmpList + tmpList = (oid *) realloc(tmpList, sizeof(oid) * (tmpListCount + 1)); + if (!tmpList) fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); + tmpList[tmpListCount] = result[csIdx][k]; + tmpListCount += 1; + } + } + } + } + + // only one left --> use it + if (tmpListCount == 1) { + label->name = tmpList[0]; + label->hierarchy = getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata, ontmetadataCount); + free(tmpList); + nameFound = 1; + #if INFO_WHERE_NAME_FROM + label->isOntology = 1; + #endif + } + + if (!nameFound) { + // multiple left --> use the class that covers most attributes, most popular ontology, ... + if (tmpListCount > 1) { + label->name = tmpList[0]; // sorted + label->hierarchy = getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata, ontmetadataCount); + free(tmpList); + nameFound = 1; + + #if INFO_WHERE_NAME_FROM + label->isOntology = 1; + #endif + } + } + + if (!nameFound) { + // empty intersection -> use the class that covers most attributes, most popular ontology, .. 
+ label->name = result[csIdx][0]; // sorted + label->hierarchy = getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata, ontmetadataCount); + free(tmpList); + nameFound = 1; + + #if INFO_WHERE_NAME_FROM + label->isOntology = 1; + #endif + } + } + } + + + // --- FK --- // add top3 fk values to list of candidates if (links[csIdx].num > 0) { diff --git a/monetdb5/extras/rdf/rdflabels.h b/monetdb5/extras/rdf/rdflabels.h --- a/monetdb5/extras/rdf/rdflabels.h +++ b/monetdb5/extras/rdf/rdflabels.h @@ -92,7 +92,8 @@ enum { #define FK_FREQ_THRESHOLD 25 // X % of the targeted subjects have to be in this table #define TYPE_FREQ_THRESHOLD 80 // X % of the type values have to be this value -#define ONTOLOGY_FREQ_THRESHOLD 0.4 // similarity threshold for tfidf simularity for ontology classes +//#define ONTOLOGY_FREQ_THRESHOLD 0.4 // similarity threshold for tfidf simularity for ontology classes +#define ONTOLOGY_FREQ_THRESHOLD 0.8 // similarity threshold for tfidf simularity for ontology classes #define USE_SHORT_NAMES 1 // use getPropNameShort() #define USE_TYPE_NAMES 1 // use type attribute values for labeling diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -121,6 +121,26 @@ static void initcsIdFreqIdxMap(int* inpu +str printTKNZStringFromOid(oid id){ + int ret; + char* schema = "rdf"; + str propStr; + + if (TKNZRopen (NULL, &schema) != MAL_SUCCEED) { + throw(RDF, "rdf.rdfschema", + "could not open the tokenizer\n"); + } + + takeOid(id, &propStr); + printf("String for "BUNFMT": %s\n", id, propStr); + + GDKfree(propStr); + TKNZRclose(&ret); + + return MAL_SUCCEED; +} + + char isCSTable(CS item){ if (item.parentFreqIdx != -1) return 0; @@ -2507,6 +2527,11 @@ oid putaCStoHash(CSBats *csBats, oid* ke csId = *csoid; addNewCS(csBats, fullPropStat, &csKey, key, csoid, num, numTriples, numTypeValues, rdftypeOntologyValues); + //if (csId == 73){ + // printf("Extra 
info for cs 73 is: "); + // printTKNZStringFromOid(rdftypeOntologyValues[0]); + //} + //Handle the case when freqThreshold == 1 if (isStoreFreqCS ==1 && freqThreshold == 1){ #if STOREFULLCS @@ -2612,7 +2637,10 @@ static int isSubset(oid* arr1, oid* arr2 * See http://disi.unitn.it/~bernardi/Courses/DL/Slides_11_12/measures.pdf * tf(t,d): Number of times t occurs in d. --> For a CS, tf(prop, aCS) = 1; * idf(t): The rarity of a term t in the whold document collection - * idf(t) = log(#totalNumOfCSs / #numberCSs_containing_t +1) + * idf(t) = log(#totalNumOfCSs / #numberCSs_containing_t) + * Note that, some function may use #numberCSs_containing_t + 1 as it can be division + * by 0 if the term does not appear in any document. However, in our case, + * every prop must appear in at least one CS * tf-idf(t,d,D) = tf(t,d) * idf(t,D) * * Note that: If we use normalize tf by dividing with maximum tf @@ -2621,7 +2649,7 @@ static int isSubset(oid* arr1, oid* arr2 static float tfidfComp(int numContainedCSs, int totalNumCSs){ - return log((float)totalNumCSs/(1+numContainedCSs)); + return log((float)totalNumCSs/(numContainedCSs)); } /* @@ -3014,15 +3042,33 @@ void getPropStatisticsFromMergeCSs(PropS for (i = 0; i < propStat->numAdded; i++){ propStat->tfidfs[i] = tfidfComp(propStat->freqs[i],curNumMergeCS); + } //BATprint(propStat->pBat); /* + { + int ret; + char* schema = "rdf"; + str propStr; + + if (TKNZRopen (NULL, &schema) != MAL_SUCCEED) { + printf("Fail in opening Tokenizer \n"); + } _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list