Changeset: eb32228c325e for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=eb32228c325e
Modified Files:
	monetdb5/extras/rdf/rdf.h
	monetdb5/extras/rdf/rdflabels.c
	monetdb5/extras/rdf/rdflabels.h
	monetdb5/extras/rdf/rdfontologyload.c
	monetdb5/extras/rdf/rdfschema.c
Branch: rdf
Log Message:
Store oids instead of strings to improve performance. Store oids during the labeling process and transform them into strings for export only. URI string format: <http://xxxxxxxx>/ diffs (truncated from 2090 to 300 lines): diff --git a/monetdb5/extras/rdf/rdf.h b/monetdb5/extras/rdf/rdf.h --- a/monetdb5/extras/rdf/rdf.h +++ b/monetdb5/extras/rdf/rdf.h @@ -105,13 +105,13 @@ typedef enum { // Final data structure that stores the labels for tables and attributes typedef struct CSlabel { - str name; // table name - str *candidates; // list of table name candidates, candidates[0] == name + oid name; // table name + oid *candidates; // list of table name candidates, candidates[0] == name int candidatesCount;// number of entries in the candidates list - str *hierarchy; // hierarchy "bottom to top" + oid *hierarchy; // hierarchy "bottom to top" int hierarchyCount; // number of entries in the hierarchy list int numProp; // number of properties, copied from freqCSset->items[x].numProp - char **lstProp; // attribute names (same order as in freqCSset->items[x].lstProp) + oid *lstProp; // attribute names (same order as in freqCSset->items[x].lstProp) } CSlabel; #endif /* _RDF_H_ */ diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -27,79 +27,79 @@ // list of known ontologies int ontologyCount = 73; ontology ontologies[] = { -{{"http:", "www.facebook.com", "2008"}, 3}, -{{"http:", "facebook.com", "2008"}, 3}, -{{"http:", "developers.facebook.com", "schema"}, 3}, -{{"https:", "www.facebook.com", "2008"}, 3}, -{{"http:", "purl.org", "dc", "elements", "1.1"}, 5}, // dc DublinCore -{{"http:", "purl.org", "dc", "terms"}, 4}, // DublinCore -{{"http:", "purl.org", "goodrelations", "v1"}, 4}, // GoodRelations -{{"http:", "purl.org", "rss", "1.0", "modules"}, 5}, -{{"http:", "purl.org", "stuff"}, 3}, -{{"http:", "www.purl.org", "stuff"}, 3}, -{{"http:", "ogp.me", "ns"}, 3}, -{{"https:", 
"ogp.me", "ns"}, 3}, -{{"http:", "www.w3.org", "1999", "02", "22-rdf-syntax-ns"}, 5}, // rdf -{{"http:", "www.w3.org", "2000", "01", "rdf-schema"}, 5}, // rdfs -{{"http:", "www.w3.org", "2004", "02", "skos", "core"}, 6}, // skos (Simple Knowledge Organization System) -{{"http:", "www.w3.org", "2002", "07", "owl"}, 5}, -{{"http:", "www.w3.org", "2006", "vcard", "ns"}, 5}, // vcard -{{"http:", "www.w3.org", "2001", "vcard-rdf", "3.0"}, 5}, -{{"http:", "www.w3.org", "2003", "01", "geo", "wgs84_pos"}, 6}, // geo -{{"http:", "www.w3.org", "1999", "xhtml", "vocab"}, 5}, // xhtml -{{"http:", "search.yahoo.com", "searchmonkey"}, 3}, -{{"https:", "search.yahoo.com", "searchmonkey"}, 3}, -{{"http:", "search.yahoo.co.jp", "searchmonkey"}, 3}, -{{"http:", "g.yahoo.com", "searchmonkey"}, 3}, -{{"http:", "opengraphprotocol.org", "schema"}, 3}, -{{"https:", "opengraphprotocol.org", "schema"}, 3}, -{{"http:", "opengraph.org", "schema"}, 3}, -{{"https:", "opengraph.org", "schema"}, 3}, -{{"http:", "creativecommons.org", "ns"}, 3}, // cc -{{"http:", "rdf.data-vocabulary.org"}, 2}, // by google -{{"http:", "rdfs.org", "sioc", "ns"}, 4}, // sioc (pronounced "shock", Semantically-Interlinked Online Communities Project) -{{"http:", "xmlns.com", "foaf", "0.1"}, 4}, // foaf (Friend of a Friend) -{{"http:", "mixi-platform.com", "ns"}, 3}, // japanese social graph -{{"http:", "commontag.org", "ns"}, 3}, -{{"http:", "semsl.org", "ontology"}, 3}, // semantic web for second life -{{"http:", "schema.org"}, 2}, -{{"http:", "openelectiondata.org", "0.1"}, 3}, -{{"http:", "search.aol.com", "rdf"}, 3}, -{{"http:", "www.loc.gov", "loc.terms", "relators"}, 4}, // library of congress -{{"http:", "dbpedia.org", "ontology"}, 3}, // dbo -{{"http:", "dbpedia.org", "resource"}, 3}, // dbpedia -{{"http:", "dbpedia.org", "property"}, 3}, // dbp -{{"http:", "www.aktors.org", "ontology", "portal"}, 4}, // akt (research, publications, ...) 
-{{"http:", "purl.org", "ontology", "bibo"}, 4}, // bibo (bibliography) -{{"http:", "purl.org", "ontology", "mo"}, 4}, // mo (music) -{{"http:", "www.geonames.org", "ontology"}, 3}, // geonames -{{"http:", "purl.org", "vocab", "frbr", "core"}, 5}, // frbr (Functional Requirements for Bibliographic Records) -{{"http:", "www.w3.org", "2001", "XMLSchema"}, 4}, // xsd -{{"http:", "www.w3.org", "2006", "time"}, 4}, // time -{{"http:", "purl.org", "NET", "c4dm", "event.owl"}, 5}, // event -{{"http:", "www.openarchives.org", "ore", "terms"}, 4}, // ore (Open Archive) -{{"http:", "purl.org", "vocab", "bio", "0.1"}, 5}, // bio (biographical data) -{{"http:", "www.holygoat.co.uk", "owl", "redwood", "0.1", "tags"}, 6}, // tag -{{"http:", "rdfs.org", "ns", "void"}, 4}, // void (Vocabulary of Interlinked Datasets) -{{"http:", "www.w3.org", "2006", "http"}, 4}, // http -{{"http:", "purl.uniprot.org", "core"}, 3}, // uniprot (protein annotation) -{{"http:", "umbel.org", "umbel"}, 3}, // umbel (Upper Mapping and Binding Exchange Layer) -{{"http:", "purl.org", "stuff", "rev"}, 4}, // rev (review) -{{"http:", "purl.org", "linked-data", "cube"}, 4}, // qb (data cube) -{{"http:", "www.w3.org", "ns", "org"}, 4}, // org (organizations) -{{"http:", "purl.org", "vocab", "vann"}, 4}, // vann (vocabulary for annotating vocabulary descriptions) -{{"http:", "data.ordnancesurvey.co.uk", "ontology", "admingeo"}, 4}, // admingeo (administrative geography and civil voting area) -{{"http:", "www.w3.org", "2007", "05", "powder-s"}, 5}, // wdrs (Web Description Resources) -{{"http:", "usefulinc.com", "ns", "doap"}, 4}, // doap (Description of a Project) -{{"http:", "lod.taxonconcept.org", "ontology", "txn.owl"}, 4}, // txn (TaxonConcept, species) -{{"http:", "xmlns.com", "wot", "0.1"}, 4}, // wot (Web Of Trust) -{{"http:", "purl.org", "net", "compass"}, 4}, // compass -{{"http:", "www.w3.org", "2004", "03", "trix", "rdfg-1"}, 6}, // rdfg (RDF graph) -{{"http:", "purl.org", "NET", "c4dm", 
"timeline.owl"}, 5}, // tl (timeline) -{{"http:", "purl.org", "dc", "dcam"}, 4}, // dcam (DublinCore metadata) -{{"http:", "swrc.ontoware.org", "ontology"}, 3}, // swrc (university, research) -{{"http:", "zeitkunst.org", "bibtex", "0.1", "bibtex.owl"}, 5}, // bib (bibTeX entries) -{{"http:", "purl.org", "ontology", "po"}, 4} // po (tv and radio programmes) +{{"<http:", "www.facebook.com", "2008"}, 3}, +{{"<http:", "facebook.com", "2008"}, 3}, +{{"<http:", "developers.facebook.com", "schema"}, 3}, +{{"<https:", "www.facebook.com", "2008"}, 3}, +{{"<http:", "purl.org", "dc", "elements", "1.1"}, 5}, // dc DublinCore +{{"<http:", "purl.org", "dc", "terms"}, 4}, // DublinCore +{{"<http:", "purl.org", "goodrelations", "v1"}, 4}, // GoodRelations +{{"<http:", "purl.org", "rss", "1.0", "modules"}, 5}, +{{"<http:", "purl.org", "stuff"}, 3}, +{{"<http:", "www.purl.org", "stuff"}, 3}, +{{"<http:", "ogp.me", "ns"}, 3}, +{{"<https:", "ogp.me", "ns"}, 3}, +{{"<http:", "www.w3.org", "1999", "02", "22-rdf-syntax-ns"}, 5}, // rdf +{{"<http:", "www.w3.org", "2000", "01", "rdf-schema"}, 5}, // rdfs +{{"<http:", "www.w3.org", "2004", "02", "skos", "core"}, 6}, // skos (Simple Knowledge Organization System) +{{"<http:", "www.w3.org", "2002", "07", "owl"}, 5}, +{{"<http:", "www.w3.org", "2006", "vcard", "ns"}, 5}, // vcard +{{"<http:", "www.w3.org", "2001", "vcard-rdf", "3.0"}, 5}, +{{"<http:", "www.w3.org", "2003", "01", "geo", "wgs84_pos"}, 6}, // geo +{{"<http:", "www.w3.org", "1999", "xhtml", "vocab"}, 5}, // xhtml +{{"<http:", "search.yahoo.com", "searchmonkey"}, 3}, +{{"<https:", "search.yahoo.com", "searchmonkey"}, 3}, +{{"<http:", "search.yahoo.co.jp", "searchmonkey"}, 3}, +{{"<http:", "g.yahoo.com", "searchmonkey"}, 3}, +{{"<http:", "opengraphprotocol.org", "schema"}, 3}, +{{"<https:", "opengraphprotocol.org", "schema"}, 3}, +{{"<http:", "opengraph.org", "schema"}, 3}, +{{"<https:", "opengraph.org", "schema"}, 3}, +{{"<http:", "creativecommons.org", "ns"}, 3}, // cc 
+{{"<http:", "rdf.data-vocabulary.org"}, 2}, // by google +{{"<http:", "rdfs.org", "sioc", "ns"}, 4}, // sioc (pronounced "shock", Semantically-Interlinked Online Communities Project) +{{"<http:", "xmlns.com", "foaf", "0.1"}, 4}, // foaf (Friend of a Friend) +{{"<http:", "mixi-platform.com", "ns"}, 3}, // japanese social graph +{{"<http:", "commontag.org", "ns"}, 3}, +{{"<http:", "semsl.org", "ontology"}, 3}, // semantic web for second life +{{"<http:", "schema.org"}, 2}, +{{"<http:", "openelectiondata.org", "0.1"}, 3}, +{{"<http:", "search.aol.com", "rdf"}, 3}, +{{"<http:", "www.loc.gov", "loc.terms", "relators"}, 4}, // library of congress +{{"<http:", "dbpedia.org", "ontology"}, 3}, // dbo +{{"<http:", "dbpedia.org", "resource"}, 3}, // dbpedia +{{"<http:", "dbpedia.org", "property"}, 3}, // dbp +{{"<http:", "www.aktors.org", "ontology", "portal"}, 4}, // akt (research, publications, ...) +{{"<http:", "purl.org", "ontology", "bibo"}, 4}, // bibo (bibliography) +{{"<http:", "purl.org", "ontology", "mo"}, 4}, // mo (music) +{{"<http:", "www.geonames.org", "ontology"}, 3}, // geonames +{{"<http:", "purl.org", "vocab", "frbr", "core"}, 5}, // frbr (Functional Requirements for Bibliographic Records) +{{"<http:", "www.w3.org", "2001", "XMLSchema"}, 4}, // xsd +{{"<http:", "www.w3.org", "2006", "time"}, 4}, // time +{{"<http:", "purl.org", "NET", "c4dm", "event.owl"}, 5}, // event +{{"<http:", "www.openarchives.org", "ore", "terms"}, 4}, // ore (Open Archive) +{{"<http:", "purl.org", "vocab", "bio", "0.1"}, 5}, // bio (biographical data) +{{"<http:", "www.holygoat.co.uk", "owl", "redwood", "0.1", "tags"}, 6}, // tag +{{"<http:", "rdfs.org", "ns", "void"}, 4}, // void (Vocabulary of Interlinked Datasets) +{{"<http:", "www.w3.org", "2006", "http"}, 4}, // http +{{"<http:", "purl.uniprot.org", "core"}, 3}, // uniprot (protein annotation) +{{"<http:", "umbel.org", "umbel"}, 3}, // umbel (Upper Mapping and Binding Exchange Layer) +{{"<http:", "purl.org", "stuff", "rev"}, 
4}, // rev (review) +{{"<http:", "purl.org", "linked-data", "cube"}, 4}, // qb (data cube) +{{"<http:", "www.w3.org", "ns", "org"}, 4}, // org (organizations) +{{"<http:", "purl.org", "vocab", "vann"}, 4}, // vann (vocabulary for annotating vocabulary descriptions) +{{"<http:", "data.ordnancesurvey.co.uk", "ontology", "admingeo"}, 4}, // admingeo (administrative geography and civil voting area) +{{"<http:", "www.w3.org", "2007", "05", "powder-s"}, 5}, // wdrs (Web Description Resources) +{{"<http:", "usefulinc.com", "ns", "doap"}, 4}, // doap (Description of a Project) +{{"<http:", "lod.taxonconcept.org", "ontology", "txn.owl"}, 4}, // txn (TaxonConcept, species) +{{"<http:", "xmlns.com", "wot", "0.1"}, 4}, // wot (Web Of Trust) +{{"<http:", "purl.org", "net", "compass"}, 4}, // compass +{{"<http:", "www.w3.org", "2004", "03", "trix", "rdfg-1"}, 6}, // rdfg (RDF graph) +{{"<http:", "purl.org", "NET", "c4dm", "timeline.owl"}, 5}, // tl (timeline) +{{"<http:", "purl.org", "dc", "dcam"}, 4}, // dcam (DublinCore metadata) +{{"<http:", "swrc.ontoware.org", "ontology"}, 3}, // swrc (university, research) +{{"<http:", "zeitkunst.org", "bibtex", "0.1", "bibtex.owl"}, 5}, // bib (bibTeX entries) +{{"<http:", "purl.org", "ontology", "po"}, 4} // po (tv and radio programmes) }; #if USE_SHORT_NAMES @@ -107,25 +107,25 @@ ontology ontologies[] = { static void getPropNameShort(char** name, char* propStr) { char *token; - char *uri, *uriPtr; + char *uri; + char *uriPtr; int length = 0; // number of tokens char **tokenizedUri = NULL; // list of tokens int i, j; int fit; // tokenize uri - uri = (char *) malloc(sizeof(char) * (strlen(propStr) + 1)); + uri = (char *) GDKmalloc(sizeof(char) * (strlen(propStr) + 1)); if (!uri) fprintf(stderr, "ERROR: Couldn't malloc memory!\n"); strcpy(uri, propStr); // uri will be modified during tokenization uriPtr = uri; // uri will be modified, uriPtr keeps original pointer token = strtok(uri, "/#"); while (token != NULL) { - tokenizedUri = 
realloc(tokenizedUri, sizeof(char*) * ++length); + tokenizedUri = GDKrealloc(tokenizedUri, sizeof(char*) * ++length); if (!tokenizedUri) fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); tokenizedUri[length - 1] = token; token = strtok(NULL, "/#"); } - free(uriPtr); // match with ontologies for (j = 0; j < ontologyCount; ++j) { @@ -142,7 +142,7 @@ void getPropNameShort(char** name, char* for (i = ontologies[j].length; i < length; ++i) { totalLength += (strlen(tokenizedUri[i]) + 1); // additional char for underscore } - (*name) = (char *) malloc(sizeof(char) * (totalLength + 1)); + (*name) = (char *) GDKmalloc(sizeof(char) * (totalLength + 1)); if (!(*name)) fprintf(stderr, "ERROR: Couldn't malloc memory!\n"); strcpy(*name, "\0"); @@ -153,7 +153,10 @@ void getPropNameShort(char** name, char* // remove trailing underscore (*name)[strlen(*name) - 1] = '\0'; - free(tokenizedUri); + if ((*name)[strlen(*name) - 1] == '>') (*name)[strlen(*name) - 1] = '\0'; // remove > + + GDKfree(tokenizedUri); + GDKfree(uriPtr); return; } } @@ -163,16 +166,19 @@ void getPropNameShort(char** name, char* if (length <= 1) { // value - (*name) = (char *) malloc(sizeof(char) * (strlen(propStr) + 1)); + (*name) = (char *) GDKmalloc(sizeof(char) * (strlen(propStr) + 1)); if (!(*name)) fprintf(stderr, "ERROR: Couldn't malloc memory!\n"); strcpy(*name, propStr); } else { - (*name) = (char *) malloc(sizeof(char) * (strlen(tokenizedUri[length - 1]) + 1)); + (*name) = (char *) GDKmalloc(sizeof(char) * (strlen(tokenizedUri[length - 1]) + 1)); if (!(*name)) fprintf(stderr, "ERROR: Couldn't malloc memory!\n"); strcpy(*name, tokenizedUri[length - 1]); } - free(tokenizedUri); + if ((*name)[strlen(*name) - 1] == '>') (*name)[strlen(*name) - 1] = '\0'; // remove > + + GDKfree(tokenizedUri); + GDKfree(uriPtr); return; } #endif @@ -238,11 +244,6 @@ Relation*** initRelationMetadata(int** r int i, j, k; Relation*** relationMetadata; - int ret; - char* schema = "rdf"; - - TKNZRopen (NULL, &schema); - 
relationMetadata = (Relation ***) malloc(sizeof(Relation **) * freqCSset->numCSadded); if (!relationMetadata) fprintf(stderr, "ERROR: Couldn't malloc memory!\n"); for (i = 0; i < num; ++i) { // CS @@ -293,8 +294,6 @@ Relation*** initRelationMetadata(int** r } } - TKNZRclose(&ret); - return relationMetadata; } @@ -304,11 +303,6 @@ Relation*** initRelationMetadata2(int** int i, j, k; Relation*** relationMetadata; - int ret; - char* schema = "rdf"; - - TKNZRopen (NULL, &schema); - relationMetadata = (Relation ***) malloc(sizeof(Relation **) * freqCSset->numCSadded); if (!relationMetadata) fprintf(stderr, "ERROR: Couldn't malloc memory!\n"); for (i = 0; i < freqCSset->numCSadded; ++i) { // CS @@ -360,8 +354,6 @@ Relation*** initRelationMetadata2(int** } } - TKNZRclose(&ret); - return relationMetadata; } @@ -439,7 +431,7 @@ void escapeURIforSQL(char* s) { int i; for (i = 0; i < (int) strlen(s); ++i) { - if (s[i] == ':' || s[i] == '"' || s[i] == ' ' || s[i] == '-') s[i] = '_'; + if (s[i] == ':' || s[i] == '"' || s[i] == ' ' || s[i] == '-' || s[i] == '<' || s[i] == '>' || s[i] == '/' || s[i] == '(' || s[i] == ')' || s[i] == '.' || s[i] == '%') s[i] = '_'; s[i] = tolower(s[i]); _______________________________________________ checkin-list mailing list checkin-list@monetdb.org http://mail.monetdb.org/mailman/listinfo/checkin-list