Changeset: d065169e7cc9 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=d065169e7cc9
Modified Files:
        pathfinder/src/sqlhelpers/xmlshred/encoding.c
        pathfinder/src/sqlhelpers/xmlshred/guides.c
        pathfinder/src/sqlhelpers/xmlshred/hash.c
        pathfinder/src/sqlhelpers/xmlshred/include/encoding.h
        pathfinder/src/sqlhelpers/xmlshred/include/guides.h
        pathfinder/src/sqlhelpers/xmlshred/include/hash.h
        pathfinder/src/sqlhelpers/xmlshred/include/shred_helper.h
        pathfinder/src/sqlhelpers/xmlshred/main.c
Branch: default
Log Message:

Extended pfshred to shred a large number of small documents in one go.

-- Added two more shredder options:

   -d docname to change the name of the document in the encoded
              XML representation
   -t         to interpret the input file as triples of document
              name, offset, and length of the XML document

   $ cat test.csv
   test.xml, 0, 33
   test.xml, 33, 21
   test.xml, 54, 16
   $ cat test.xml
   <a a="test"><b g="h"><c/></b></a><d>hello<e/>world</d><a>hello<d/></a>
   $ pfshred -f test.csv -t -o out -d example.xml
   $ sort -n out.csv
   0, 5, 0, 6, , "example.xml", 1,
   1, 4, 1, 1, 0, , 2, 0
   2, 0, 2, 2, 0, "test", 3, 0
   3, 2, 2, 1, 1, , 4, 0
   4, 0, 3, 2, 2, "h", 5, 0
   5, 0, 3, 1, 3, , 6, 0
   6, 4, 0, 6, , "example.xml", 1,
   7, 3, 1, 1, 4, , 7, 0
   8, 0, 2, 3, , "hello", 8,
   9, 0, 2, 1, 5, , 9, 0
   10, 0, 2, 3, , "world", 8,
   11, 3, 0, 6, , "example.xml", 1,
   12, 2, 1, 1, 0, , 2, 0
   13, 0, 2, 3, , "hello", 10,
   14, 0, 2, 1, 4, , 11, 0

-- Extended pfshred with a new format field 'root' (format character 'r')
   that prints, for every node, the pre value of its document node.

-- Changed guide collection to build a single guide for multiple documents.

-- Removed type xmlChar from the helper files.

-- Fixed various memory leaks.


diffs (truncated from 913 to 300 lines):

diff -r 96991648d072 -r d065169e7cc9 pathfinder/src/sqlhelpers/xmlshred/encoding.c
--- a/pathfinder/src/sqlhelpers/xmlshred/encoding.c     Tue Oct 05 22:30:37 2010 +0200
+++ b/pathfinder/src/sqlhelpers/xmlshred/encoding.c     Tue Oct 05 23:17:03 2010 +0200
@@ -96,10 +96,12 @@
 static node_t stack[STACK_MAX];
 static int level;
 static int max_level;
+static nat root;
 static nat pre;
 static nat post;
 static nat rank;
 static nat att_id;
+static guide_tree_t *guide;
              
 /* encoding format compilation */
 
@@ -248,6 +250,9 @@
 static void lambda_d (node_t n) 
 { print_number (out, n); }
 
+static void lambda_r (node_t n) 
+{ fprintf (out, SSZFMT, n.root); }
+
 static void lambda_t (node_t n) 
 { print_value (out, n); }
 
@@ -272,6 +277,7 @@
   , ['n'] = lambda_n
   , ['u'] = lambda_u
   , ['d'] = lambda_d
+  , ['r'] = lambda_r
   , ['t'] = lambda_t
   , ['g'] = lambda_g
   , ['%'] = lambda_percent
@@ -334,13 +340,13 @@
     if (!localname)
         return -1;
                      
-    localname_id = hashtable_find (localname_hash, localname);
+    localname_id = hashtable_find (localname_hash, (char *) localname);
 
     if (NOKEY (localname_id)) {
         /* key not found, create a new name id */
         localname_id = global_localname_id++;
         /* add the (localname, localname_id) pair into the hash table */
-        hashtable_insert (localname_hash, localname, localname_id);
+        hashtable_insert (localname_hash, (char *) localname, localname_id);
         /* print the name binding if necessary */
         if (shredstate.names_separate)
             fprintf (out_names, "%i, \"%s\"\n", localname_id, (char*) localname);
@@ -358,13 +364,13 @@
     if (!URI)
         return -1;
         
-    uri_id = hashtable_find (uris_hash, URI);
+    uri_id = hashtable_find (uris_hash, (char *) URI);
 
     if (NOKEY (uri_id)) {
         /* key not found, create a new URI id */
         uri_id = global_uri_id++;
         /* add the (URI, uri_id) pair to the hash table */
-        hashtable_insert (uris_hash, URI, uri_id);
+        hashtable_insert (uris_hash, (char *) URI, uri_id);
         /* print the URI binding if necessary */
         if (shredstate.names_separate)
             fprintf (out_uris, "%i, \"%s\"\n", uri_id, (char*) URI);
@@ -378,6 +384,7 @@
             const xmlChar *URI, const xmlChar *localname, 
             const xmlChar *value)
 {
+    char *short_value = NULL;
     int valStrLen = -1;
 
     pre++;
@@ -397,11 +404,16 @@
 
     /* check if value is larger than text_size characters */
     if (value && (valStrLen = xmlStrlen (value)) >= 0 &&
-        (unsigned int) valStrLen > text_size)
+        (unsigned int) valStrLen > text_size) {
         text_stripped++;
+        short_value = strndup ((char *) value, text_size);
+    }
+
+    assert (value);
 
     stack[level] = (node_t) {
-        .pre            = pre
+        .root           = root
+      , .pre            = pre
       , .post           = post
       , .pre_stretched  = rank
       , .post_stretched = rank + 1
@@ -410,14 +422,12 @@
       , .children       = 0
       , .level          = level
       , .kind           = kind
-      , .localname      = xmlStrdup (localname)
+      , .localname      = localname ? strdup ((char *) localname) : NULL
       , .localname_id   = generate_localname_id (localname)
-      , .uri            = xmlStrdup (URI)
+      , .uri            = URI ? strdup ((char *) URI) : NULL
       , .uri_id         = generate_uri_id (URI)
-      , .value          = xmlStrndup (value,
-                                      MIN ((unsigned int) xmlStrlen (value),
-                                           text_size))
-      , .guide          = insert_guide_node_ (URI, localname,
+      , .value          = short_value ? short_value : (char *) value
+      , .guide          = insert_guide_node_ ((char *) URI, (char *) localname,
                                               stack[level-1].guide,
                                               kind)
     };
@@ -427,9 +437,18 @@
     
     apply_fmt (stack[level]);
 
-    if (stack[level].localname) xmlFree (stack[level].localname);
-    if (stack[level].uri)       xmlFree (stack[level].uri);
-    if (stack[level].value)     xmlFree (stack[level].value);
+    if (stack[level].localname) {
+        free (stack[level].localname);
+        stack[level].localname = NULL;
+    }
+    if (stack[level].uri) {   
+        free (stack[level].uri);
+        stack[level].uri = NULL;
+    }
+    if (short_value) {
+        free (short_value);
+        stack[level].value = NULL;
+    }
 
     level--;
     /* add one more child node */
@@ -452,20 +471,15 @@
 {
     (void) ctx;
 
-    /* initialize everything with zero */
-    pre       = 0;
-    post      = 0;
-    rank      = 0;
-    level     = 0;
-    max_level = 0;
-    att_id    = 0;
+    root = pre;
 
     if (strlen (shredstate.doc_name) > MIN (FILENAME_MAX, text_size))
         text_stripped++;
 
     /* create a new node */
     stack[level] = (node_t) {
-          .pre            = pre
+          .root           = root
+        , .pre            = pre
         , .post           = 0
         , .pre_stretched  = rank
         , .post_stretched = 0
@@ -478,16 +492,11 @@
         , .localname_id   = -1
         , .uri            = NULL
         , .uri_id         = -1
-        , .value          = xmlStrdup ((xmlChar *) shredstate.doc_name)
-        , .guide          = insert_guide_node_ (
-                                NULL,
-                                xmlCharStrndup (
-                                    shredstate.doc_name,
-                                    MIN (FILENAME_MAX, text_size)),
-                                NULL,
-                                doc)
+        , .value          = strdup (shredstate.doc_name)
+        , .guide          = guide
     };
-
+    /* extend guide counter */
+    guide->count++;
 }
 
 static void
@@ -507,13 +516,16 @@
 
     apply_fmt (stack[level]);
 
+    pre++;
     post++;
 
     if (shredstate.statistics) 
         guide_occurrence (stack[level].guide);
 
-    if (stack[level].value) 
-        xmlFree (stack[level].value);
+    if (stack[level].value) {
+        free (stack[level].value);
+        stack[level].value = NULL;
+    }
 }
 
 static void
@@ -554,7 +566,8 @@
                  URI, TAG_SIZE);
 
     stack[level] = (node_t) {
-        .pre            = pre
+        .root           = root
+      , .pre            = pre
       , .post           = 0
       , .pre_stretched  = rank
       , .post_stretched = 0
@@ -563,12 +576,12 @@
       , .children       = 0
       , .level          = level
       , .kind           = elem
-      , .localname      = xmlStrdup (localname)
+      , .localname      = strdup ((char *) localname)
       , .localname_id   = generate_localname_id (localname) 
-      , .uri            = xmlStrdup (URI)
+      , .uri            = strdup ((char *) URI)
       , .uri_id         = generate_uri_id (URI)
-      , .value          = (xmlChar *) NULL
-      , .guide          = insert_guide_node_ (URI, localname,
+      , .value          = NULL
+      , .guide          = insert_guide_node_ ((char *) URI, (char *) localname,
                                               stack[level-1].guide,
                                               elem)
     };
@@ -608,7 +621,8 @@
                 if (shredstate.statistics)
                     fprintf (out_attr, "," SSZFMT,
                              guide_val_ (
-                                 insert_guide_node_ (atts[2], atts[0],
+                                 insert_guide_node_ ((char *) atts[2], 
+                                                     (char *) atts[0],
                                                      stack[level].guide,
                                                      attr)));
                 putc ('\n', out_attr);
@@ -646,7 +660,7 @@
    value lookups into a single path step. */
     /* copy the text value of a text node if it is the only child */
     if (buf[0] && pre == stack[level].pre)
-        stack[level].value = xmlStrndup (buf, text_size);
+        stack[level].value = strndup ((char *) buf, text_size);
 
     flush_buffer ();
 
@@ -664,12 +678,18 @@
     post++;
 
     /* free the memory allocated for the element name and the text value */
-    if (stack[level].localname) 
-        xmlFree (stack[level].localname);
-    if (stack[level].uri)       
-        xmlFree (stack[level].uri);
-    if (stack[level].value)     
-        xmlFree (stack[level].value);
+    if (stack[level].localname) {
+        free (stack[level].localname);
+        stack[level].localname = NULL;
+    }
+    if (stack[level].uri) {   
+        free (stack[level].uri);
+        stack[level].uri = NULL;
+    }
+    if (stack[level].value) {
+        free (stack[level].value);
+        stack[level].value = NULL;
+    }
 
     level--;
     /* add one more child node */
@@ -776,23 +796,13 @@
     }
 }
 
-/**
- * Main shredding procedure.
- */
-void
-SHshredder (const char *s, 
-            FILE *shout,
+static void
+initialize (FILE *shout,
             FILE *attout,
             FILE *namesout,
-            FILE *urisout,
-            FILE *guideout,
-            shred_state_t *status)
+            FILE *urisout)
 {
-    /* XML parser context */
-    xmlParserCtxtPtr ctx;
-    shredstate = *status;
-
-    assert (shout);
+    char *doc_name;
 
     /* bind the different output files to global variables
        to make them accessible inside the callback functions */
@@ -828,8 +838,52 @@
         print_uri       = print_uri_str;
     }
_______________________________________________
Checkin-list mailing list
Checkin-list@monetdb.org
http://mail.monetdb.org/mailman/listinfo/checkin-list

Reply via email to