MonetDB: lodrdf - Fix bug in SQLrdfShred() function (in sql/back...

Minh-Duc Pham Fri, 28 Sep 2012 01:56:23 -0700

Changeset: 88c8fdf4e227 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=88c8fdf4e227
Modified Files:
        monetdb5/extras/rdf/rdf_shredder.mx
        sql/backends/monet5/sql.mx
Branch: lodrdf
Log Message:


Fix a bug in the SQLrdfShred() function (in sql/backends/monet5/sql.mx) that
occurred while loading RDF data.
Add the corresponding modification in rdf_shredder.mx.


diffs (truncated from 1087 to 300 lines):

diff --git a/monetdb5/extras/rdf/rdf_shredder.mx 
b/monetdb5/extras/rdf/rdf_shredder.mx
--- a/monetdb5/extras/rdf/rdf_shredder.mx
+++ b/monetdb5/extras/rdf/rdf_shredder.mx
@@ -92,15 +92,15 @@ typedef struct parserData {
        int line;                     /* locator for errors     */
        int column;                   /* locator for errors     */
                                      /**GRAPH DATA             */
-       BAT *graph[N_GRAPH_BAT];      /* BATs for the result
+       BAT **graph;                  /* BATs for the result
                                         shredded RDF graph     */
 } parserData;
 
-/*
- * @-
- * The (fatal) errors and warnings produced by the raptor parser are handled
- * by the next three message handler functions.
- */
+@{
+@-
+The (fatal) errors and warnings produced by the raptor parser are handled
+by the next three message handler functions.
+
 @= raptor_exception
 @1->exception++;
 @1->exceptionMsg = @2;
@@ -124,17 +124,17 @@ static void
        }
 }
 
-@
+@}
+
 @c
 @:rdf_parser_handler(fatal)@
 @:rdf_parser_handler(error)@
 @:rdf_parser_handler(warning)@
 
-/*
- * @-
- * The raptor parser needs to register a callback function that handles one 
triple
- * at a time. Function rdf_parser_triple_handler() does exactly this.
- */
+@-
+The raptor parser needs to register a callback function that handles one triple
+at a time. Function rdf_parser_triple_handler() does exactly this.
+
 @= rdf_insert
 #ifdef _TKNZR_H
  @:rdf_tknzr_insert(@2)@
@@ -146,7 +146,7 @@ static void
 @= rdf_BUNappend_unq_1
 bun = BUNfnd(BATmirror(@1),(ptr)@2);
 if (bun == BUN_NONE) {
-       if (BATcount(@1) > 4 * @1->T->hash->mask) {
+       if (@1->T->hash && BATcount(@1) > 4 * @1->T->hash->mask) {
                HASHdestroy(@1);
                BAThash(BATmirror(@1), 2*BATcount(@1));
        }
@@ -185,7 +185,6 @@ if (@1 == NULL) {
        @:raptor_exception(pdata, "could not append in@1")@
 }
 
-@
 @c
 static void
 tripleHandler(void* user_data, const raptor_statement* triple)
@@ -229,10 +228,9 @@ tripleHandler(void* user_data, const rap
        return;
 }
 
-/*
- * @-
- * Function RDFParser() is the entry point to parse an RDF document.
- */
+@-
+Function RDFParser() is the entry point to parse an RDF document.
+
 @= set_handlers
 /* set callback handler for triples */
 raptor_set_statement_handler   (@1, @2, tripleHandler);
@@ -241,7 +239,6 @@ raptor_set_fatal_error_handler (@1, @2, 
 raptor_set_error_handler       (@1, @2, errorHandler);
 raptor_set_warning_handler     (@1, @2, warningHandler);
 
-@
 @c
 /* creates a BAT for the triple table */
 static BAT*
@@ -255,7 +252,6 @@ create_BAT(int ht, int tt, int size)
 
        /* disable all properties */
        b->tsorted = FALSE;
-       b->trevsorted = FALSE;
        b->tdense = FALSE;
        b->tkey = FALSE;
        b->hdense = TRUE;
@@ -264,7 +260,7 @@ create_BAT(int ht, int tt, int size)
 }
 
 static parserData*
-parserData_create (str location)
+parserData_create (str location, BAT** graph)
 {
        int i;
 
@@ -277,8 +273,9 @@ parserData_create (str location)
        pdata->error = 0;
        pdata->warning = 0;
        pdata->location = location;
+       pdata->graph = graph;
 
-       for (i = 0; i < N_GRAPH_BAT; i++) {
+       for (i = 0; i <= N_GRAPH_BAT; i++) {
                pdata->graph[i] = NULL;
        }
 
@@ -306,7 +303,7 @@ parserData_create (str location)
                return NULL;
        }
        /* MAP_LEX must have the key property */
-       BATseqbase(pdata->graph[MAP_LEX], 1 << 30);
+       BATseqbase(pdata->graph[MAP_LEX], RDF_MIN_LITERAL);
        pdata->graph[MAP_LEX]->tkey = BOUND2BTRUE;
        pdata->graph[MAP_LEX]->T->nokey[0] = 0;
        pdata->graph[MAP_LEX]->T->nokey[1] = 0;
@@ -314,22 +311,21 @@ parserData_create (str location)
        return pdata;
 }
 
-/*
- * @-
- * After the RDF document has been shredded into 3 bats and a lexical value
- * dictionary, a post-shred processing step follows that orders the lexical
- * dictionary, re-maps oids to match the ordered dictionary and finaly creates
- * all 6 permutations of the (subject, predicate, object) order.
- *
- * However, it is still to be examined if it worth the time to refine the order
- * of the last column. In most cases, during query time, the last column will 
need
- * to be re-order for a subsequent sort-merge join. We introduce sort3 and 
sort2
- * so we can investigate both possibilities. In addition, the first column 
need to
- * be stored only once for each couple of orders with the same first column. 
For
- * example, it holds that S_SPO == S_SOP.
- */
+@-
+After the RDF document has been shredded into 3 bats and a lexical value
+dictionary, a post-shred processing step follows that orders the lexical
+dictionary, re-maps oids to match the ordered dictionary and finaly creates
+all 6 permutations of the (subject, predicate, object) order.
+
+However, it is still to be examined if it worth the time to refine the order
+of the last column. In most cases, during query time, the last column will need
+to be re-order for a subsequent sort-merge join. We introduce sort3 and sort2
+so we can investigate both possibilities. In addition, the first column need to
+be stored only once for each couple of orders with the same first column. For
+example, it holds that S_SPO == S_SOP.
+
 @= order
-@:order2(@1,@2,@3,@4)@
+@:order3(@1,@2,@3,@4)@
 
 @= order2
 if (!CTrefine(&ctref, @1, @2))         /* refine @2 given @1= sorted  */
@@ -347,6 +343,7 @@ BBPcold(graph[@3_@4]->batCacheid);
 /* free ctref */
 BBPreclaim(ctref);
 @
+
 @= order3
 if ( !(CTrefine(&map_oid, @1, @2)         /* refine @3 given @1= sorted  */
                && CTrefine(&ctref, map_oid, @3)))/* refine @4 given @3         
 */
@@ -366,6 +363,7 @@ BBPcold(graph[@3_@4]->batCacheid);
 /* free map_oid */
 BBPreclaim(map_oid);
 @
+
 @c
 int CTrefine(BAT **ret, BAT *b, BAT *a); /* from modules/kernel/group.mx */
 
@@ -385,19 +383,19 @@ post_processing (parserData *pdata)
 
        /* order MAP_LEX */
        BATorder(BATmirror(graph[MAP_LEX]));
-       map_oid = BATmark(graph[MAP_LEX], 1<<30);   /* BATmark will create a 
copy */
+       map_oid = BATmark(graph[MAP_LEX], RDF_MIN_LITERAL);   /* BATmark will 
create a copy */
        BATorder(map_oid);
        BATsetaccess(map_oid, BAT_READ);        /* force BAtmark not to copy 
bat */
-       map_oid = BATmirror(BATmark(BATmirror(map_oid), 1<<30));
+       map_oid = BATmirror(BATmark(BATmirror(map_oid), RDF_MIN_LITERAL));
        BATsetaccess(graph[MAP_LEX], BAT_READ); /* force BATmark not to copy 
bat */
-       graph[MAP_LEX] = BATmirror(BATmark(BATmirror(graph[MAP_LEX]), 1<<30));
+       graph[MAP_LEX] = BATmirror(BATmark(BATmirror(graph[MAP_LEX]), 
RDF_MIN_LITERAL));
 
        /* convert old oids of O_sort to new ones */
        bi = bat_iterator(graph[O_sort]);
        mi = bat_iterator(map_oid);
        BATloop(graph[O_sort], p, d) {
                bt = (oid *) BUNtloc(bi, p);
-               if (*bt >= (1 << 30)) {
+               if (*bt >= (RDF_MIN_LITERAL)) {
                        BUNfndVOID(r, mi, bt);
                        void_inplace(graph[O_sort], p, BUNtloc(mi, r), 1);
                }
@@ -484,6 +482,7 @@ raptor_free_parser(rparser);
 raptor_free_uri(uri);
 raptor_finish();
 @
+
 @= clean
 if (pdata != NULL) {
        for (iret = 0; iret < N_GRAPH_BAT; iret++) {
@@ -493,10 +492,13 @@ if (pdata != NULL) {
        GDKfree(pdata);
 }
 @
+
 @c
+#define RDF_CHUNK_SIZE 100*1024*1024
+
 /* Main RDF parser function that drives raptor */
 str
-RDFParser (int *retval, str *location, str *graphname, str *schema)
+RDFParser (BAT **graph, str *location, str *graphname, str *schema)
 {
        raptor_parser *rparser;
        parserData *pdata;
@@ -504,8 +506,6 @@ RDFParser (int *retval, str *location, s
        bit isURI;
        str ret;
        int iret;
-       BAT **graph;
-       BAT *retbat;
        (void) graphname;
 
        /* init tokenizer */
@@ -519,7 +519,7 @@ RDFParser (int *retval, str *location, s
 #endif
 
        /* Init pdata  */
-       pdata = parserData_create(*location);
+       pdata = parserData_create(*location,graph);
        if (pdata == NULL) {
 #ifdef _TKNZR_H
                TKNZRclose(&iret);
@@ -555,16 +555,38 @@ RDFParser (int *retval, str *location, s
                uri = raptor_new_uri((unsigned char *) pdata->location);
                iret = raptor_parse_uri(rparser, uri, NULL);
        } else {
-               uri = raptor_new_uri(
-                               
raptor_uri_filename_to_uri_string(pdata->location));
-               iret = raptor_parse_file(rparser, uri, NULL);
+               
+               /* Too slow loading --> use old code 
+               FILE *fp = fopen(pdata->location, "r");
+               char *buf = (char*) GDKmalloc(RDF_CHUNK_SIZE);
+               if (buf == NULL) {
+                       throw(RDF, "rdf.rdfShred",
+                               "could not allocate a %dMB file buffer\n", 
(int) (RDF_CHUNK_SIZE>>20));
+               }
+               uri = 
raptor_new_uri(raptor_uri_filename_to_uri_string(pdata->location));
+               iret = raptor_start_parse(rparser, uri);
+               while(fp && iret == 0) {
+                       ssize_t len = (ssize_t) fread(buf, 1, RDF_CHUNK_SIZE, 
fp);  
+                       iret = raptor_parse_chunk(rparser, (const unsigned 
char*) buf, (size_t) len, len < RDF_CHUNK_SIZE);
+               }
+               fclose(fp);
+               
+               */
+
+               /* does/may? not work on large files -- therefore the abpove 
chunked read
+                   iret = raptor_parse_file_stream(rparser, fp, 
pdata->location, uri); 
+                */
+
+               /* Old code */
+                uri = raptor_new_uri(
+                                
raptor_uri_filename_to_uri_string(pdata->location));
+                iret = raptor_parse_file(rparser, uri, NULL);
        }
        @:clean_raptor@
 #ifdef _TKNZR_H
        TKNZRclose(&iret);
 #endif
 
-       graph = pdata->graph;
        assert (pdata->tcount == BATcount(graph[S_sort]) &&
                        pdata->tcount == BATcount(graph[P_sort]) &&
                        pdata->tcount == BATcount(graph[O_sort]));
@@ -593,25 +615,6 @@ RDFParser (int *retval, str *location, s
                @:clean@
                throw(RDF, "rdf.rdfShred", "could not post-proccess data");
        }
-
-       /* prepare return bat of bats */
-/* XXX: BAT columns of TYPE_bat are no longer allowed: this function
- * needs to be rewritten to return multiple BATs instead of a single
- * BAT-of-batS */
-       retbat = BATnew(TYPE_void, TYPE_bat, N_GRAPH_BAT);
-       if (retbat == NULL) {
-               @:clean@
-               throw(RDF, "rdf.rdfShred",
-                               "could not allocate enough memory for return 
bat");
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
http://mail.monetdb.org/mailman/listinfo/checkin-list

MonetDB: lodrdf - Fix bug in SQLrdfShred() function (in sql/back...

Reply via email to