Changeset: d065169e7cc9 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=d065169e7cc9
Modified Files:
	pathfinder/src/sqlhelpers/xmlshred/encoding.c
	pathfinder/src/sqlhelpers/xmlshred/guides.c
	pathfinder/src/sqlhelpers/xmlshred/hash.c
	pathfinder/src/sqlhelpers/xmlshred/include/encoding.h
	pathfinder/src/sqlhelpers/xmlshred/include/guides.h
	pathfinder/src/sqlhelpers/xmlshred/include/hash.h
	pathfinder/src/sqlhelpers/xmlshred/include/shred_helper.h
	pathfinder/src/sqlhelpers/xmlshred/main.c
Branch: default
Log Message:
Extended pfshred to shred a large number of small documents in one go.

-- Added two more shredder options:
   -d docname  to change the name of the document in the encoded
               XML representation
   -t          to interpret the input file as triples of document name,
               offset, and length of the XML document

   $ cat test.csv
   test.xml, 0, 33
   test.xml, 33, 21
   test.xml, 54, 16
   $ cat test.xml
   <a a="test"><b g="h"><c/></b></a><d>hello<e/>world</d><a>hello<d/></a>
   $ pfshred -f test.csv -t -o out -d example.xml
   $ sort -n out.csv
   0, 5, 0, 6, , "example.xml", 1,
   1, 4, 1, 1, 0, , 2, 0
   2, 0, 2, 2, 0, "test", 3, 0
   3, 2, 2, 1, 1, , 4, 0
   4, 0, 3, 2, 2, "h", 5, 0
   5, 0, 3, 1, 3, , 6, 0
   6, 4, 0, 6, , "example.xml", 1,
   7, 3, 1, 1, 4, , 7, 0
   8, 0, 2, 3, , "hello", 8,
   9, 0, 2, 1, 5, , 9, 0
   10, 0, 2, 3, , "world", 8,
   11, 3, 0, 6, , "example.xml", 1,
   12, 2, 1, 1, 0, , 2, 0
   13, 0, 2, 3, , "hello", 10,
   14, 0, 2, 1, 4, , 11, 0

-- Extended pfshred with a new format field 'root' that returns the pre
   value of the document node for all other nodes.

-- Changed guide collection to build a single guide for multiple documents.

-- Removed type xmlChar from the helper files.

-- Fixed various memory leaks.

diffs (truncated from 913 to 300 lines): diff -r 96991648d072 -r d065169e7cc9 pathfinder/src/sqlhelpers/xmlshred/encoding.c --- a/pathfinder/src/sqlhelpers/xmlshred/encoding.c Tue Oct 05 22:30:37 2010 +0200 +++ b/pathfinder/src/sqlhelpers/xmlshred/encoding.c Tue Oct 05 23:17:03 2010 +0200 @@ -96,10 +96,12 @@ static node_t stack[STACK_MAX]; static int level; static int max_level; +static nat root; static nat pre; static nat post; static nat rank; static nat att_id; +static guide_tree_t *guide; /* encoding format compilation */ @@ -248,6 +250,9 @@ static void lambda_d (node_t n) { print_number (out, n); } +static void lambda_r (node_t n) +{ fprintf (out, SSZFMT, n.root); } + static void lambda_t (node_t n) { print_value (out, n); } @@ -272,6 +277,7 @@ , ['n'] = lambda_n , ['u'] = lambda_u , ['d'] = lambda_d + , ['r'] = lambda_r , ['t'] = lambda_t , ['g'] = lambda_g , ['%'] = lambda_percent @@ -334,13 +340,13 @@ if (!localname) return -1; - localname_id = hashtable_find (localname_hash, localname); + localname_id = hashtable_find (localname_hash, (char *) localname); if (NOKEY (localname_id)) { /* key not found, create a new name id */ localname_id = global_localname_id++; /* add the (localname, localname_id) pair into the hash table */ - hashtable_insert (localname_hash, localname, localname_id); + hashtable_insert (localname_hash, (char *) localname, localname_id); /* print the name binding if necessary */ if (shredstate.names_separate) fprintf (out_names, "%i, \"%s\"\n", localname_id, (char*) localname); @@ -358,13 +364,13 @@ if (!URI) return -1; - uri_id = hashtable_find (uris_hash, URI); + uri_id = hashtable_find (uris_hash, (char *) URI); if (NOKEY (uri_id)) { /* key not found, create a new URI id */ uri_id = global_uri_id++; /* add the (URI, uri_id) pair to the hash table */ - hashtable_insert (uris_hash, URI, uri_id); + hashtable_insert (uris_hash, (char *) URI, uri_id); /* print the URI binding if necessary */ if (shredstate.names_separate) fprintf (out_uris, 
"%i, \"%s\"\n", uri_id, (char*) URI); @@ -378,6 +384,7 @@ const xmlChar *URI, const xmlChar *localname, const xmlChar *value) { + char *short_value = NULL; int valStrLen = -1; pre++; @@ -397,11 +404,16 @@ /* check if value is larger than text_size characters */ if (value && (valStrLen = xmlStrlen (value)) >= 0 && - (unsigned int) valStrLen > text_size) + (unsigned int) valStrLen > text_size) { text_stripped++; + short_value = strndup ((char *) value, text_size); + } + + assert (value); stack[level] = (node_t) { - .pre = pre + .root = root + , .pre = pre , .post = post , .pre_stretched = rank , .post_stretched = rank + 1 @@ -410,14 +422,12 @@ , .children = 0 , .level = level , .kind = kind - , .localname = xmlStrdup (localname) + , .localname = localname ? strdup ((char *) localname) : NULL , .localname_id = generate_localname_id (localname) - , .uri = xmlStrdup (URI) + , .uri = URI ? strdup ((char *) URI) : NULL , .uri_id = generate_uri_id (URI) - , .value = xmlStrndup (value, - MIN ((unsigned int) xmlStrlen (value), - text_size)) - , .guide = insert_guide_node_ (URI, localname, + , .value = short_value ? 
short_value : (char *) value + , .guide = insert_guide_node_ ((char *) URI, (char *) localname, stack[level-1].guide, kind) }; @@ -427,9 +437,18 @@ apply_fmt (stack[level]); - if (stack[level].localname) xmlFree (stack[level].localname); - if (stack[level].uri) xmlFree (stack[level].uri); - if (stack[level].value) xmlFree (stack[level].value); + if (stack[level].localname) { + free (stack[level].localname); + stack[level].localname = NULL; + } + if (stack[level].uri) { + free (stack[level].uri); + stack[level].uri = NULL; + } + if (short_value) { + free (short_value); + stack[level].value = NULL; + } level--; /* add one more child node */ @@ -452,20 +471,15 @@ { (void) ctx; - /* initialize everything with zero */ - pre = 0; - post = 0; - rank = 0; - level = 0; - max_level = 0; - att_id = 0; + root = pre; if (strlen (shredstate.doc_name) > MIN (FILENAME_MAX, text_size)) text_stripped++; /* create a new node */ stack[level] = (node_t) { - .pre = pre + .root = root + , .pre = pre , .post = 0 , .pre_stretched = rank , .post_stretched = 0 @@ -478,16 +492,11 @@ , .localname_id = -1 , .uri = NULL , .uri_id = -1 - , .value = xmlStrdup ((xmlChar *) shredstate.doc_name) - , .guide = insert_guide_node_ ( - NULL, - xmlCharStrndup ( - shredstate.doc_name, - MIN (FILENAME_MAX, text_size)), - NULL, - doc) + , .value = strdup (shredstate.doc_name) + , .guide = guide }; - + /* extend guide counter */ + guide->count++; } static void @@ -507,13 +516,16 @@ apply_fmt (stack[level]); + pre++; post++; if (shredstate.statistics) guide_occurrence (stack[level].guide); - if (stack[level].value) - xmlFree (stack[level].value); + if (stack[level].value) { + free (stack[level].value); + stack[level].value = NULL; + } } static void @@ -554,7 +566,8 @@ URI, TAG_SIZE); stack[level] = (node_t) { - .pre = pre + .root = root + , .pre = pre , .post = 0 , .pre_stretched = rank , .post_stretched = 0 @@ -563,12 +576,12 @@ , .children = 0 , .level = level , .kind = elem - , .localname = xmlStrdup 
(localname) + , .localname = strdup ((char *) localname) , .localname_id = generate_localname_id (localname) - , .uri = xmlStrdup (URI) + , .uri = strdup ((char *) URI) , .uri_id = generate_uri_id (URI) - , .value = (xmlChar *) NULL - , .guide = insert_guide_node_ (URI, localname, + , .value = NULL + , .guide = insert_guide_node_ ((char *) URI, (char *) localname, stack[level-1].guide, elem) }; @@ -608,7 +621,8 @@ if (shredstate.statistics) fprintf (out_attr, "," SSZFMT, guide_val_ ( - insert_guide_node_ (atts[2], atts[0], + insert_guide_node_ ((char *) atts[2], + (char *) atts[0], stack[level].guide, attr))); putc ('\n', out_attr); @@ -646,7 +660,7 @@ value lookups into a single path step. */ /* copy the text value of a text node if it is the only child */ if (buf[0] && pre == stack[level].pre) - stack[level].value = xmlStrndup (buf, text_size); + stack[level].value = strndup ((char *) buf, text_size); flush_buffer (); @@ -664,12 +678,18 @@ post++; /* free the memory allocated for the element name and the text value */ - if (stack[level].localname) - xmlFree (stack[level].localname); - if (stack[level].uri) - xmlFree (stack[level].uri); - if (stack[level].value) - xmlFree (stack[level].value); + if (stack[level].localname) { + free (stack[level].localname); + stack[level].localname = NULL; + } + if (stack[level].uri) { + free (stack[level].uri); + stack[level].uri = NULL; + } + if (stack[level].value) { + free (stack[level].value); + stack[level].value = NULL; + } level--; /* add one more child node */ @@ -776,23 +796,13 @@ } } -/** - * Main shredding procedure. 
- */ -void -SHshredder (const char *s, - FILE *shout, +static void +initialize (FILE *shout, FILE *attout, FILE *namesout, - FILE *urisout, - FILE *guideout, - shred_state_t *status) + FILE *urisout) { - /* XML parser context */ - xmlParserCtxtPtr ctx; - shredstate = *status; - - assert (shout); + char *doc_name; /* bind the different output files to global variables to make them accessible inside the callback functions */ @@ -828,8 +838,52 @@ print_uri = print_uri_str; } _______________________________________________ Checkin-list mailing list Checkin-list@monetdb.org http://mail.monetdb.org/mailman/listinfo/checkin-list