Changeset: af6b114f1b3a for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=af6b114f1b3a
Modified Files:
	monetdb5/extras/rdf/rdf.h
	monetdb5/extras/rdf/rdflabels.c
	monetdb5/extras/rdf/rdfschema.c
	monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Tune S1 by using alternative name for a freqCS, checking where the name comes from. + validate TF-IDF function in rdflabels.c diffs (truncated from 501 to 300 lines): diff --git a/monetdb5/extras/rdf/rdf.h b/monetdb5/extras/rdf/rdf.h --- a/monetdb5/extras/rdf/rdf.h +++ b/monetdb5/extras/rdf/rdf.h @@ -103,6 +103,11 @@ typedef enum { #define N_GRAPH_BAT (MAP_LEX+1) +#define INFO_WHERE_NAME_FROM 1 +#define TOP_GENERAL_NAME 2 //Level of hierrachy in which a name is considered to be a general name + //For example, PERSON, THING is at level 1 +#define USE_ALTERNATIVE_NAME 0 //Use different but may be better name for a general name + // Final data structure that stores the labels for tables and attributes typedef struct CSlabel { oid name; // table name @@ -116,6 +121,11 @@ typedef struct CSlabel { int hierarchyCount; // number of entries in the hierarchy list int numProp; // number of properties, copied from freqCSset->items[x].numProp oid *lstProp; // attribute names (same order as in freqCSset->items[x].lstProp) + #if INFO_WHERE_NAME_FROM + char isOntology; // First name is decided by ontology + char isType; // First name is decided based on Type + char isFK; + #endif } CSlabel; #endif /* _RDF_H_ */ diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -1167,6 +1167,7 @@ oid* getOntologyCandidates(oid** ontattr // remove subclass if superclass is in list for (k = 0; k < num; ++k) { int found = 0; + printf(" TFIDF score at %d is: %f \n",k, classStat[k].tfidfs); if (classStat[k].tfidfs < ONTOLOGY_FREQ_THRESHOLD) break; // values not frequent enough (list is sorted by tfidfs) for (j = 0; j < ontmetadataCount && (found == 0); ++j) { oid muri = ontmetadata[0][j]; @@ -1346,8 +1347,19 @@ void createOntologyLookupResult(oid** re for (j = 0; j < ontologyCount; ++j) { propOntologiesCount[j] = 0; } + + printf("Get ontology for FreqId %d. 
Orignal numProp = %d \n", i, cs.numProp); + propOntologies = findOntologies(cs, propOntologiesCount, &propOntologiesOids); + /* + printf("Prop ontologies count. \n"); + for (j = 0; j < ontologyCount; ++j) { + if (propOntologiesCount[j] > 0) + printf(" (%d) props in ontology %d \n ", propOntologiesCount[j], j); + } + */ + // get class names resultCount[i] = 0; result[i] = getOntologyCandidates(ontattributes, ontattributesCount, ontmetadata, ontmetadataCount, &(resultCount[i]), propOntologiesOids, propOntologiesCount, ontologyCount, propStat); @@ -1970,6 +1982,9 @@ void getTableName(CSlabel* label, int cs label->name = result[csIdx][0]; label->hierarchy = getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata, ontmetadataCount); nameFound = 1; + #if INFO_WHERE_NAME_FROM + label->isOntology = 1; + #endif } if (!nameFound) { @@ -2001,6 +2016,9 @@ void getTableName(CSlabel* label, int cs label->hierarchy = getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata, ontmetadataCount); free(tmpList); nameFound = 1; + #if INFO_WHERE_NAME_FROM + label->isOntology = 1; + #endif } if (!nameFound) { @@ -2010,6 +2028,10 @@ void getTableName(CSlabel* label, int cs label->hierarchy = getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata, ontmetadataCount); free(tmpList); nameFound = 1; + + #if INFO_WHERE_NAME_FROM + label->isOntology = 1; + #endif } } @@ -2019,6 +2041,10 @@ void getTableName(CSlabel* label, int cs label->hierarchy = getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata, ontmetadataCount); free(tmpList); nameFound = 1; + + #if INFO_WHERE_NAME_FROM + label->isOntology = 1; + #endif } } } @@ -2060,6 +2086,10 @@ void getTableName(CSlabel* label, int cs // only one type attribute, use most frequent value (sorted) label->name = tmpList[0]; nameFound = 1; + #if INFO_WHERE_NAME_FROM + label->isType = 1; + #endif + } } @@ -2071,6 +2101,10 @@ void getTableName(CSlabel* label, int cs if (typeStat[i].value == tmpList[j]) { 
label->name = tmpList[j]; nameFound = 1; + + #if INFO_WHERE_NAME_FROM + label->isType = 1; + #endif } } } @@ -2094,6 +2128,10 @@ void getTableName(CSlabel* label, int cs if (links[csIdx].num > 0) { label->name = links[csIdx].fks[0].prop; // sorted nameFound = 1; + + #if INFO_WHERE_NAME_FROM + label->isFK = 1; + #endif } } @@ -2138,6 +2176,11 @@ CSlabel* initLabels(CSset *freqCSset) { labels[i].hierarchyCount = 0; labels[i].numProp = 0; labels[i].lstProp = NULL; + #if INFO_WHERE_NAME_FROM + labels[i].isOntology = 0; + labels[i].isType = 0; + labels[i].isFK = 0; + #endif } return labels; } diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -2778,7 +2778,7 @@ void generatecsRelSum(CSrel csRel, int f for (i = 0; i < csRel.numRef; i++){ freq = freqCSset->items[csRel.origFreqIdx].support; - if (freq < csRel.lstCnt[i] * MIN_PERCETAGE_S6){ + if (freq > MIN_FROMTABLE_SIZE_S6 && freq < csRel.lstCnt[i] * MIN_PERCETAGE_S6){ propIdx = 0; while (csRelSum->lstPropId[propIdx] != csRel.lstPropId[i]) propIdx++; @@ -2811,6 +2811,38 @@ LabelStat* initLabelStat(void){ return labelStat; } +/* + * + * */ +#if USE_ALTERNATIVE_NAME +static +oid getMostSuitableName(CSlabel *labels, int freqIdx, int candIdx){ + oid candidate; + int i; + candidate = labels[freqIdx].candidates[candIdx]; + + if (labels[freqIdx].hierarchyCount > 1){ + for (i = 0; i < labels[freqIdx].hierarchyCount; i++){ + if (labels[freqIdx].hierarchy[i] == candidate) break; + } + + } + + if (i == labels[freqIdx].hierarchyCount) // Not appears in the hierarchy + return candidate; + else if (i > TOP_GENERAL_NAME) // Not a too general candidate + return candidate; + else if ((candIdx+1) < labels[freqIdx].candidatesCount){ + //printf("Use another candidate \n"); + return labels[freqIdx].candidates[candIdx+1]; + } + + //No choice + return candidate; + +} +#endif + static void buildLabelStat(LabelStat *labelStat, CSlabel 
*labels, CSset *freqCSset, int k){ int i,j; @@ -2826,7 +2858,11 @@ void buildLabelStat(LabelStat *labelStat if (labels[i].name != BUN_NONE){ numCheck = (labels[i].candidatesCount > k)?k:labels[i].candidatesCount; for (j = 0; j < numCheck; j++){ + #if USE_ALTERNATIVE_NAME + candidate = getMostSuitableName(labels, i, j); + #else candidate = labels[i].candidates[j]; + #endif bun = BUNfnd(BATmirror(labelStat->labelBat),(ptr) &candidate); if (bun == BUN_NONE) { /*New string*/ @@ -2874,7 +2910,11 @@ void buildLabelStat(LabelStat *labelStat if (labels[i].name != BUN_NONE){ numCheck = (labels[i].candidatesCount > k)?k:labels[i].candidatesCount; for (j = 0; j < numCheck; j++){ + #if USE_ALTERNATIVE_NAME + candidate = getMostSuitableName(labels, i, j); + #else candidate = labels[i].candidates[j]; + #endif bun = BUNfnd(BATmirror(labelStat->labelBat),(ptr) &candidate); if (bun == BUN_NONE) { fprintf(stderr, "All the name should be stored already!\n"); @@ -2903,18 +2943,63 @@ void freeLabelStat(LabelStat *labelStat) free(labelStat); } +static +void doMerge(CSset *freqCSset, int ruleNum, CS* cs1, CS* cs2, int freqId1, int freqId2, oid *mergecsId, CSlabel** labels, oid** ontmetadata, int ontmetadataCount, oid name){ + CS *mergecs; + int existMergecsId; + CS *existmergecs, *mergecs1, *mergecs2; + int k; + + //Check whether these CS's belong to any mergeCS + if (cs1->parentFreqIdx == -1 && cs2->parentFreqIdx == -1){ /* New merge */ + mergecs = mergeTwoCSs(*cs1,*cs2, freqId1,freqId2, *mergecsId); + //addmergeCStoSet(mergecsSet, *mergecs); + cs1->parentFreqIdx = freqCSset->numCSadded; + cs2->parentFreqIdx = freqCSset->numCSadded; + addCStoSet(freqCSset,*mergecs); + updateLabel(ruleNum, freqCSset, labels, 1, freqCSset->numCSadded - 1, freqId1, freqId2, name, ontmetadata, ontmetadataCount, NULL, -1); + free(mergecs); + + mergecsId[0]++; + } + else if (cs1->parentFreqIdx == -1 && cs2->parentFreqIdx != -1){ + existMergecsId = cs2->parentFreqIdx; + existmergecs = 
&(freqCSset->items[existMergecsId]); + mergeACStoExistingmergeCS(*cs1,freqId1, existmergecs); + cs1->parentFreqIdx = existMergecsId; + updateLabel(ruleNum, freqCSset, labels, 0, existMergecsId, freqId1, freqId2, name, ontmetadata, ontmetadataCount, NULL, -1); + } + + else if (cs1->parentFreqIdx != -1 && cs2->parentFreqIdx == -1){ + existMergecsId = cs1->parentFreqIdx; + existmergecs = &(freqCSset->items[existMergecsId]); + mergeACStoExistingmergeCS(*cs2,freqId2, existmergecs); + cs2->parentFreqIdx = existMergecsId; + updateLabel(ruleNum, freqCSset, labels, 0, existMergecsId, freqId1, freqId2, name, ontmetadata, ontmetadataCount, NULL, -1); + } + else if (cs1->parentFreqIdx != cs2->parentFreqIdx){ + mergecs1 = &(freqCSset->items[cs1->parentFreqIdx]); + mergecs2 = &(freqCSset->items[cs2->parentFreqIdx]); + + mergeTwomergeCS(mergecs1, mergecs2, cs1->parentFreqIdx); + + //Re-map for all maxCS in mergecs2 + for (k = 0; k < mergecs2->numConsistsOf; k++){ + freqCSset->items[mergecs2->lstConsistsOf[k]].parentFreqIdx = cs1->parentFreqIdx; + } + updateLabel(ruleNum, freqCSset, labels, 0, cs1->parentFreqIdx, freqId1, freqId2, name, ontmetadata, ontmetadataCount, NULL, -1); + } + +} static -void mergeMaxFreqCSByS1(CSset *freqCSset, CSlabel** labels, oid *mergecsId, oid** ontmetadata, int ontmetadataCount){ +str mergeMaxFreqCSByS1(CSset *freqCSset, CSlabel** labels, oid *mergecsId, oid** ontmetadata, int ontmetadataCount){ int i; #if !USE_MULTIWAY_MERGING - int j,k; + int j, k; int freqId1, freqId2; - CS *mergecs; - int existMergecsId; CS *cs1, *cs2; - CS *existmergecs, *mergecs1, *mergecs2; #else int *lstDistinctFreqId = NULL; int numDistinct = 0; @@ -2923,6 +3008,17 @@ void mergeMaxFreqCSByS1(CSset *freqCSset #endif LabelStat *labelStat = NULL; oid *name; + #if OUTPUT_FREQID_PER_LABEL + FILE *fout; + char* schema = "rdf"; + int ret = 0; + str tmpLabel; + int tmpCount; + _______________________________________________ checkin-list mailing list checkin-list@monetdb.org 
https://www.monetdb.org/mailman/listinfo/checkin-list