Changeset: efb882d6b146 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=efb882d6b146 Modified Files: monetdb5/extras/rdf/rdf.h monetdb5/extras/rdf/rdflabels.c monetdb5/extras/rdf/rdfschema.h Branch: rdf Log Message:
Merge with Linnea's changes for updatelabel function diffs (truncated from 337 to 300 lines): diff --git a/monetdb5/extras/rdf/rdf.h b/monetdb5/extras/rdf/rdf.h --- a/monetdb5/extras/rdf/rdf.h +++ b/monetdb5/extras/rdf/rdf.h @@ -108,6 +108,10 @@ typedef struct CSlabel { oid name; // table name oid *candidates; // list of table name candidates, candidates[0] == name int candidatesCount;// number of entries in the candidates list + int candidatesNew; // number of candidates that are created during merging (e.g. ancestor name) + int candidatesOntology; // number of ontology candidates (first category) + int candidatesType; // number of type candidates (second category) + int candidatesFK; // number of fk candidates (third category) oid *hierarchy; // hierarchy "bottom to top" int hierarchyCount; // number of entries in the hierarchy list int numProp; // number of properties, copied from freqCSset->items[x].numProp diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -1830,6 +1830,50 @@ oid* getOntoHierarchy(oid ontology, int* return hierarchy; } +/* Remove duplicated candidate values and remove DUMMY values if better candidates exist + */ +static +void removeDuplicatedCandidates(CSlabel *label) { + int i, j; + int cNew = label->candidatesNew, cOnto = label->candidatesOntology, cType = label->candidatesType, cFK = label->candidatesFK; + + if (label->candidatesCount < 2) return; // no duplicates + + // loop through all candidates + for (i = 0; i < label->candidatesCount - 1; ++i) { + // search (direction: right) whether this value occurs again + int moveLeft = 0; + for (j = i + 1; j < label->candidatesCount; ++j) { + // find out which category (new, onto, type, fk) we are in + int *cPtr = NULL; + if (j < label->candidatesNew) cPtr = &cNew; + else if (j < label->candidatesNew + label->candidatesOntology) cPtr = &cOnto; + else if (j < label->candidatesNew + label->candidatesOntology + label->candidatesType) cPtr = &cType; + else cPtr = &cFK; + + if (label->candidates[i] == label->candidates[j] || label->candidates[j] == BUN_NONE) { + // DUMMY value will be overwritten + // OR: + // value occurs again, will be overwritten + moveLeft++; + (*cPtr)--; + } else { + // different value, keep it + label->candidates[j - moveLeft] = label->candidates[j]; + } + } + // value 'i' is unique now + // update counts + label->candidatesCount -= moveLeft; + label->candidatesNew = cNew; + label->candidatesOntology = cOnto; + label->candidatesType = cType; + label->candidatesFK = cFK; + } + + // DUMMY value on position 0 is kept to ensure that name == candidates[0] +} + #if USE_TABLE_NAME /* For one CS: Choose the best table name out of all collected candidates (ontology, type, fk). */ static @@ -1842,6 +1886,7 @@ void getTableName(CSlabel* label, int cs // --- ONTOLOGY --- // add all ontology candidates to list of candidates if (resultCount[csIdx] >= 1) { + label->candidatesOntology = resultCount[csIdx]; label->candidates = GDKrealloc(label->candidates, sizeof(oid) * (label->candidatesCount + resultCount[csIdx])); if (!label->candidates) fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); for (i = 0; i < resultCount[csIdx]; ++i) { @@ -1924,6 +1969,7 @@ void getTableName(CSlabel* label, int cs // add all most frequent type values to list of candidates if (tmpListCount >= 1) { int counter = 0; + label->candidatesType = tmpListCount; label->candidates = GDKrealloc(label->candidates, sizeof(oid) * (label->candidatesCount + tmpListCount)); if (!label->candidates) fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); for (i = 0; i < typeStatCount; ++i) { @@ -1964,6 +2010,7 @@ void getTableName(CSlabel* label, int cs // --- FK --- // add top3 fk values to list of candidates if (links[csIdx].num > 0) { + label->candidatesFK = MIN(3, links[csIdx].num); label->candidates = GDKrealloc(label->candidates, sizeof(oid) * (label->candidatesCount + MIN(3, links[csIdx].num))); if (!label->candidates) fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); for (i = 0; i < MIN(3, links[csIdx].num); ++i) { @@ -1982,6 +2029,7 @@ void getTableName(CSlabel* label, int cs // --- NOTHING --- if (label->candidatesCount == 0) { + label->candidatesNew = 1; label->candidates = GDKrealloc(label->candidates, sizeof(oid)); if (!label->candidates) fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); label->candidates[0] = BUN_NONE; @@ -1993,6 +2041,9 @@ void getTableName(CSlabel* label, int cs nameFound = 1; } + // de-duplicate + removeDuplicatedCandidates(label); + if(tmpList != NULL) free(tmpList); return; } @@ -2009,6 +2060,10 @@ CSlabel* initLabels(CSset *freqCSset) { labels[i].name = BUN_NONE; labels[i].candidates = NULL; labels[i].candidatesCount = 0; + labels[i].candidatesNew = 0; + labels[i].candidatesOntology = 0; + labels[i].candidatesType = 0; + labels[i].candidatesFK = 0; labels[i].hierarchy = NULL; labels[i].hierarchyCount = 0; labels[i].numProp = 0; @@ -2450,6 +2505,68 @@ CSlabel* createLabels(CSset* freqCSset, return labels; } +/* Merge two lists of candidates. + * Result: <common name> <ontology candidates CS1> <ontology candidates CS2> <type candidates CS1> <type candidates CS2> <FK candidates CS1> <FK candidates CS2> + */ +static +oid* mergeCandidates(int *candidatesCount, int *candidatesNew, int *candidatesOntology, int *candidatesType, int *candidatesFK, CSlabel cs1, CSlabel cs2, oid commonName) { + oid *candidates; + int counter = 0; + int i; + + (*candidatesCount) = cs1.candidatesCount + cs2.candidatesCount + 1; // +1 for common name + candidates = GDKmalloc(sizeof(oid) * (*candidatesCount)); + + candidates[counter] = commonName; + counter++; + + // copy "new" + for (i = 0; i < cs1.candidatesNew; ++i) { + candidates[counter] = cs1.candidates[i]; + counter++; + } + for (i = 0; i < cs2.candidatesNew; ++i) { + candidates[counter] = cs2.candidates[i]; + counter++; + } + (*candidatesNew) = counter; + + // copy "ontology" + for (i = 0; i < cs1.candidatesOntology; ++i) { + candidates[counter] = cs1.candidates[cs1.candidatesNew + i]; + counter++; + } + for (i = 0; i < cs2.candidatesOntology; ++i) { + candidates[counter] = cs2.candidates[cs2.candidatesNew + i]; + counter++; + } + (*candidatesOntology) = counter - (*candidatesNew); + + // copy "type" + for (i = 0; i < cs1.candidatesType; ++i) { + candidates[counter] = cs1.candidates[cs1.candidatesNew + cs1.candidatesOntology + i]; + counter++; + } + for (i = 0; i < cs2.candidatesType; ++i) { + candidates[counter] = cs2.candidates[cs2.candidatesNew + cs2.candidatesOntology + i]; + counter++; + } + (*candidatesType) = counter - (*candidatesNew) - (*candidatesOntology); + + // copy "fk" + for (i = 0; i < cs1.candidatesFK; ++i) { + candidates[counter] = cs1.candidates[cs1.candidatesNew + cs1.candidatesOntology + cs1.candidatesType + i]; + counter++; + } + for (i = 0; i < cs2.candidatesFK; ++i) { + candidates[counter] = cs2.candidates[cs2.candidatesNew + cs2.candidatesOntology + cs2.candidatesType + i]; + counter++; + } + (*candidatesFK) = counter - (*candidatesNew) - (*candidatesOntology) - (*candidatesType); + + return candidates; +} + /* Create labels for merged CS's. Uses rules S1 to S5 (new names!). * If no MERGECS is created (subset-superset relation), mergeCSFreqId contains the Id of the superset class. * For S1 and S2, parameter 'name' is used to avoid recomputation of CS names @@ -2457,15 +2574,19 @@ CSlabel* createLabels(CSset* freqCSset, str updateLabel(int ruleNumber, CSset *freqCSset, CSlabel **labels, int newCS, int mergeCSFreqId, int freqCS1, int freqCS2, oid name, oid **ontmetadata, int ontmetadataCount, int *lstFreqId, int numIds){ int i; int freqCS1Counter; - CSlabel *big; + CSlabel big, small; CSlabel *label; CS cs; #if USE_MULTIWAY_MERGING int tmpMaxCoverage; int tmpFreqId; #endif + oid *mergedCandidates = NULL; + int candidatesCount, candidatesNew, candidatesOntology, candidatesType, candidatesFK; + (void) lstFreqId; (void) numIds; + if (newCS) { // realloc labels *labels = GDKrealloc(*labels, sizeof(CSlabel) * freqCSset->numCSadded); @@ -2473,6 +2594,10 @@ str updateLabel(int ruleNumber, CSset *f (*labels)[mergeCSFreqId].name = BUN_NONE; (*labels)[mergeCSFreqId].candidates = NULL; (*labels)[mergeCSFreqId].candidatesCount = 0; + (*labels)[mergeCSFreqId].candidatesNew = 0; + (*labels)[mergeCSFreqId].candidatesOntology = 0; + (*labels)[mergeCSFreqId].candidatesType = 0; + (*labels)[mergeCSFreqId].candidatesFK = 0; (*labels)[mergeCSFreqId].hierarchy = NULL; (*labels)[mergeCSFreqId].hierarchyCount = 0; (*labels)[mergeCSFreqId].numProp = 0; @@ -2503,9 +2628,16 @@ str updateLabel(int ruleNumber, CSset *f (void)freqCS2; #else - // TODO candidates - //label->candidates = ; - //label->candidatesCount = ; + // candidates + mergedCandidates = mergeCandidates(&candidatesCount, &candidatesNew, &candidatesOntology, &candidatesType, &candidatesFK, (*labels)[freqCS1], (*labels)[freqCS2], label->name); + GDKfree(label->candidates); + label->candidates = mergedCandidates; // TODO check access outside function + label->candidatesCount = candidatesCount; + label->candidatesNew = candidatesNew; + label->candidatesOntology = candidatesOntology; + label->candidatesType = candidatesType; + label->candidatesFK = candidatesFK; + removeDuplicatedCandidates(label); // hierarchy if ((*labels)[freqCS1].name == label->name) { @@ -2541,9 +2673,16 @@ str updateLabel(int ruleNumber, CSset *f // use common ancestor label->name = name; - // TODO candidates - //label->candidates = ; - //label->candidatesCount = ; + // candidates + mergedCandidates = mergeCandidates(&candidatesCount, &candidatesNew, &candidatesOntology, &candidatesType, &candidatesFK, (*labels)[freqCS1], (*labels)[freqCS2], label->name); + GDKfree(label->candidates); + label->candidates = mergedCandidates; // TODO check access outside function + label->candidatesCount = candidatesCount; + label->candidatesNew = candidatesNew; + label->candidatesOntology = candidatesOntology; + label->candidatesType = candidatesType; + label->candidatesFK = candidatesFK; + removeDuplicatedCandidates(label); // hierarchy freqCS1Counter = (*labels)[freqCS1].hierarchyCount - 1; @@ -2565,7 +2704,18 @@ str updateLabel(int ruleNumber, CSset *f case S3: // subset-superset relation - // candidates already set + + // candidates + mergedCandidates = mergeCandidates(&candidatesCount, &candidatesNew, &candidatesOntology, &candidatesType, &candidatesFK, (*labels)[freqCS1], (*labels)[freqCS2], label->name); // freqCS1 is superCS, freqCS2 is subCS + GDKfree(label->candidates); + label->candidates = mergedCandidates; // TODO check access outside function + label->candidatesCount = candidatesCount; + label->candidatesNew = candidatesNew; + label->candidatesOntology = candidatesOntology; + label->candidatesType = candidatesType; + label->candidatesFK = candidatesFK; + removeDuplicatedCandidates(label); + // hierarchy already set // properties already set @@ -2587,25 +2737,34 @@ str updateLabel(int ruleNumber, CSset *f #else // use label of biggest CS (higher coverage value) if (freqCSset->items[freqCS1].coverage > freqCSset->items[freqCS2].coverage) { - big = &(*labels)[freqCS1]; + big = (*labels)[freqCS1]; + small = (*labels)[freqCS2]; } else { - big = &(*labels)[freqCS2]; + big = (*labels)[freqCS2]; + small = (*labels)[freqCS1]; } #endif - label->name = big->name; + label->name = big.name; - // TODO candidates - //label->candidatesCount = ; _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list