Changeset: 88c8fdf4e227 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=88c8fdf4e227 Modified Files: monetdb5/extras/rdf/rdf_shredder.mx sql/backends/monet5/sql.mx Branch: lodrdf Log Message:
Fix bug in SQLrdfShred() function (in sql/backends/monet5/sql.mx) while loading RDF data. Add the corresponding modification for this in rdf_shredder.mx. diffs (truncated from 1087 to 300 lines): diff --git a/monetdb5/extras/rdf/rdf_shredder.mx b/monetdb5/extras/rdf/rdf_shredder.mx --- a/monetdb5/extras/rdf/rdf_shredder.mx +++ b/monetdb5/extras/rdf/rdf_shredder.mx @@ -92,15 +92,15 @@ typedef struct parserData { int line; /* locator for errors */ int column; /* locator for errors */ /**GRAPH DATA */ - BAT *graph[N_GRAPH_BAT]; /* BATs for the result + BAT **graph; /* BATs for the result shredded RDF graph */ } parserData; -/* - * @- - * The (fatal) errors and warnings produced by the raptor parser are handled - * by the next three message handler functions. - */ +@{ +@- +The (fatal) errors and warnings produced by the raptor parser are handled +by the next three message handler functions. + @= raptor_exception @1->exception++; @1->exceptionMsg = @2; @@ -124,17 +124,17 @@ static void } } -@ +@} + @c @:rdf_parser_handler(fatal)@ @:rdf_parser_handler(error)@ @:rdf_parser_handler(warning)@ -/* - * @- - * The raptor parser needs to register a callback function that handles one triple - * at a time. Function rdf_parser_triple_handler() does exactly this. - */ +@- +The raptor parser needs to register a callback function that handles one triple +at a time. Function rdf_parser_triple_handler() does exactly this. 
+ @= rdf_insert #ifdef _TKNZR_H @:rdf_tknzr_insert(@2)@ @@ -146,7 +146,7 @@ static void @= rdf_BUNappend_unq_1 bun = BUNfnd(BATmirror(@1),(ptr)@2); if (bun == BUN_NONE) { - if (BATcount(@1) > 4 * @1->T->hash->mask) { + if (@1->T->hash && BATcount(@1) > 4 * @1->T->hash->mask) { HASHdestroy(@1); BAThash(BATmirror(@1), 2*BATcount(@1)); } @@ -185,7 +185,6 @@ if (@1 == NULL) { @:raptor_exception(pdata, "could not append in@1")@ } -@ @c static void tripleHandler(void* user_data, const raptor_statement* triple) @@ -229,10 +228,9 @@ tripleHandler(void* user_data, const rap return; } -/* - * @- - * Function RDFParser() is the entry point to parse an RDF document. - */ +@- +Function RDFParser() is the entry point to parse an RDF document. + @= set_handlers /* set callback handler for triples */ raptor_set_statement_handler (@1, @2, tripleHandler); @@ -241,7 +239,6 @@ raptor_set_fatal_error_handler (@1, @2, raptor_set_error_handler (@1, @2, errorHandler); raptor_set_warning_handler (@1, @2, warningHandler); -@ @c /* creates a BAT for the triple table */ static BAT* @@ -255,7 +252,6 @@ create_BAT(int ht, int tt, int size) /* disable all properties */ b->tsorted = FALSE; - b->trevsorted = FALSE; b->tdense = FALSE; b->tkey = FALSE; b->hdense = TRUE; @@ -264,7 +260,7 @@ create_BAT(int ht, int tt, int size) } static parserData* -parserData_create (str location) +parserData_create (str location, BAT** graph) { int i; @@ -277,8 +273,9 @@ parserData_create (str location) pdata->error = 0; pdata->warning = 0; pdata->location = location; + pdata->graph = graph; - for (i = 0; i < N_GRAPH_BAT; i++) { + for (i = 0; i <= N_GRAPH_BAT; i++) { pdata->graph[i] = NULL; } @@ -306,7 +303,7 @@ parserData_create (str location) return NULL; } /* MAP_LEX must have the key property */ - BATseqbase(pdata->graph[MAP_LEX], 1 << 30); + BATseqbase(pdata->graph[MAP_LEX], RDF_MIN_LITERAL); pdata->graph[MAP_LEX]->tkey = BOUND2BTRUE; pdata->graph[MAP_LEX]->T->nokey[0] = 0; pdata->graph[MAP_LEX]->T->nokey[1] = 
0; @@ -314,22 +311,21 @@ parserData_create (str location) return pdata; } -/* - * @- - * After the RDF document has been shredded into 3 bats and a lexical value - * dictionary, a post-shred processing step follows that orders the lexical - * dictionary, re-maps oids to match the ordered dictionary and finaly creates - * all 6 permutations of the (subject, predicate, object) order. - * - * However, it is still to be examined if it worth the time to refine the order - * of the last column. In most cases, during query time, the last column will need - * to be re-order for a subsequent sort-merge join. We introduce sort3 and sort2 - * so we can investigate both possibilities. In addition, the first column need to - * be stored only once for each couple of orders with the same first column. For - * example, it holds that S_SPO == S_SOP. - */ +@- +After the RDF document has been shredded into 3 bats and a lexical value +dictionary, a post-shred processing step follows that orders the lexical +dictionary, re-maps oids to match the ordered dictionary and finaly creates +all 6 permutations of the (subject, predicate, object) order. + +However, it is still to be examined if it worth the time to refine the order +of the last column. In most cases, during query time, the last column will need +to be re-order for a subsequent sort-merge join. We introduce sort3 and sort2 +so we can investigate both possibilities. In addition, the first column need to +be stored only once for each couple of orders with the same first column. For +example, it holds that S_SPO == S_SOP. 
+ @= order -@:order2(@1,@2,@3,@4)@ +@:order3(@1,@2,@3,@4)@ @= order2 if (!CTrefine(&ctref, @1, @2)) /* refine @2 given @1= sorted */ @@ -347,6 +343,7 @@ BBPcold(graph[@3_@4]->batCacheid); /* free ctref */ BBPreclaim(ctref); @ + @= order3 if ( !(CTrefine(&map_oid, @1, @2) /* refine @3 given @1= sorted */ && CTrefine(&ctref, map_oid, @3)))/* refine @4 given @3 */ @@ -366,6 +363,7 @@ BBPcold(graph[@3_@4]->batCacheid); /* free map_oid */ BBPreclaim(map_oid); @ + @c int CTrefine(BAT **ret, BAT *b, BAT *a); /* from modules/kernel/group.mx */ @@ -385,19 +383,19 @@ post_processing (parserData *pdata) /* order MAP_LEX */ BATorder(BATmirror(graph[MAP_LEX])); - map_oid = BATmark(graph[MAP_LEX], 1<<30); /* BATmark will create a copy */ + map_oid = BATmark(graph[MAP_LEX], RDF_MIN_LITERAL); /* BATmark will create a copy */ BATorder(map_oid); BATsetaccess(map_oid, BAT_READ); /* force BAtmark not to copy bat */ - map_oid = BATmirror(BATmark(BATmirror(map_oid), 1<<30)); + map_oid = BATmirror(BATmark(BATmirror(map_oid), RDF_MIN_LITERAL)); BATsetaccess(graph[MAP_LEX], BAT_READ); /* force BATmark not to copy bat */ - graph[MAP_LEX] = BATmirror(BATmark(BATmirror(graph[MAP_LEX]), 1<<30)); + graph[MAP_LEX] = BATmirror(BATmark(BATmirror(graph[MAP_LEX]), RDF_MIN_LITERAL)); /* convert old oids of O_sort to new ones */ bi = bat_iterator(graph[O_sort]); mi = bat_iterator(map_oid); BATloop(graph[O_sort], p, d) { bt = (oid *) BUNtloc(bi, p); - if (*bt >= (1 << 30)) { + if (*bt >= (RDF_MIN_LITERAL)) { BUNfndVOID(r, mi, bt); void_inplace(graph[O_sort], p, BUNtloc(mi, r), 1); } @@ -484,6 +482,7 @@ raptor_free_parser(rparser); raptor_free_uri(uri); raptor_finish(); @ + @= clean if (pdata != NULL) { for (iret = 0; iret < N_GRAPH_BAT; iret++) { @@ -493,10 +492,13 @@ if (pdata != NULL) { GDKfree(pdata); } @ + @c +#define RDF_CHUNK_SIZE 100*1024*1024 + /* Main RDF parser function that drives raptor */ str -RDFParser (int *retval, str *location, str *graphname, str *schema) +RDFParser (BAT **graph, str 
*location, str *graphname, str *schema) { raptor_parser *rparser; parserData *pdata; @@ -504,8 +506,6 @@ RDFParser (int *retval, str *location, s bit isURI; str ret; int iret; - BAT **graph; - BAT *retbat; (void) graphname; /* init tokenizer */ @@ -519,7 +519,7 @@ RDFParser (int *retval, str *location, s #endif /* Init pdata */ - pdata = parserData_create(*location); + pdata = parserData_create(*location,graph); if (pdata == NULL) { #ifdef _TKNZR_H TKNZRclose(&iret); @@ -555,16 +555,38 @@ RDFParser (int *retval, str *location, s uri = raptor_new_uri((unsigned char *) pdata->location); iret = raptor_parse_uri(rparser, uri, NULL); } else { - uri = raptor_new_uri( - raptor_uri_filename_to_uri_string(pdata->location)); - iret = raptor_parse_file(rparser, uri, NULL); + + /* Too slow loading --> use old code + FILE *fp = fopen(pdata->location, "r"); + char *buf = (char*) GDKmalloc(RDF_CHUNK_SIZE); + if (buf == NULL) { + throw(RDF, "rdf.rdfShred", + "could not allocate a %dMB file buffer\n", (int) (RDF_CHUNK_SIZE>>20)); + } + uri = raptor_new_uri(raptor_uri_filename_to_uri_string(pdata->location)); + iret = raptor_start_parse(rparser, uri); + while(fp && iret == 0) { + ssize_t len = (ssize_t) fread(buf, 1, RDF_CHUNK_SIZE, fp); + iret = raptor_parse_chunk(rparser, (const unsigned char*) buf, (size_t) len, len < RDF_CHUNK_SIZE); + } + fclose(fp); + + */ + + /* does/may? 
not work on large files -- therefore the abpove chunked read + iret = raptor_parse_file_stream(rparser, fp, pdata->location, uri); + */ + + /* Old code */ + uri = raptor_new_uri( + raptor_uri_filename_to_uri_string(pdata->location)); + iret = raptor_parse_file(rparser, uri, NULL); } @:clean_raptor@ #ifdef _TKNZR_H TKNZRclose(&iret); #endif - graph = pdata->graph; assert (pdata->tcount == BATcount(graph[S_sort]) && pdata->tcount == BATcount(graph[P_sort]) && pdata->tcount == BATcount(graph[O_sort])); @@ -593,25 +615,6 @@ RDFParser (int *retval, str *location, s @:clean@ throw(RDF, "rdf.rdfShred", "could not post-proccess data"); } - - /* prepare return bat of bats */ -/* XXX: BAT columns of TYPE_bat are no longer allowed: this function - * needs to be rewritten to return multiple BATs instead of a single - * BAT-of-batS */ - retbat = BATnew(TYPE_void, TYPE_bat, N_GRAPH_BAT); - if (retbat == NULL) { - @:clean@ - throw(RDF, "rdf.rdfShred", - "could not allocate enough memory for return bat"); _______________________________________________ checkin-list mailing list checkin-list@monetdb.org http://mail.monetdb.org/mailman/listinfo/checkin-list