Changeset: c416436c479c for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=c416436c479c
Modified Files:
        pathfinder/src/sqlhelpers/xmlshred/encoding.c
        pathfinder/src/sqlhelpers/xmlshred/guides.c
        pathfinder/src/sqlhelpers/xmlshred/include/encoding.h
        pathfinder/src/sqlhelpers/xmlshred/include/guides.h
        pathfinder/src/sqlhelpers/xmlshred/include/shred_helper.h
        pathfinder/src/sqlhelpers/xmlshred/main.c
Branch: default
Log Message:

Various extensions to pfshred:
- Added support for arbitrarily long tagnames and textnodes.
- Extended guide information with unique name and uri ids
  (cf., format options '%n' and '%u').
- Fixed an error when generating files without format option '%g'.
- Added new format options ('%X', '%U', '%N', '%r').
- Added option (-e) to disable string escaping.
- Added option (-x) to generate output for x100.


diffs (truncated from 770 to 300 lines):

diff --git a/pathfinder/src/sqlhelpers/xmlshred/encoding.c b/pathfinder/src/sqlhelpers/xmlshred/encoding.c
--- a/pathfinder/src/sqlhelpers/xmlshred/encoding.c
+++ b/pathfinder/src/sqlhelpers/xmlshred/encoding.c
@@ -58,14 +58,20 @@
 
 /* count the tags/text being truncated
  * during the shredding process */
-unsigned int tag_stripped;
+unsigned int prefix_stripped;
+unsigned int name_stripped;
+unsigned int uri_stripped;
 unsigned int text_stripped;
 
 static shred_state_t shredstate;
 
 /* Wrapper to ensure that guide information is only collected if needed */
-#define insert_guide_node_(u,n,p,k) ((shredstate.statistics)            \
-                                  ? insert_guide_node ((u),(n),(p),(k)) \
+#define insert_guide_node_(u,n,p,k) ((shredstate.statistics)                   \
+                                  ? insert_guide_node (                        \
+                                        (u),(n),                               \
+                                        generate_uri_id ((xmlChar *) u),       \
+                                        generate_localname_id ((xmlChar *) n), \
+                                        (p),(k))                               \
                                   : NULL)                            
 #define guide_val_(g)             ((shredstate.statistics) \
                                   ? (g)->guide             \
@@ -78,20 +84,15 @@
 #define BAILOUT(...) do { SHoops (SH_FATAL, __VA_ARGS__);   \
                           free_hashtable (localname_hash);  \
                           free_hashtable (uris_hash);       \
+                          free (buffer);                    \
                           exit (1);                         \
                         } while (0)
 
 
-/* id of empty namespace prefix */
-#define EMPTY_NS 0
-
-/* maximum localname/URI and text buffer sizes */
-#define TAG_SIZE 100
-#define BUF_SIZE 4096 
-                              
 /* buffer for XML node contents/values */
-static xmlChar buf[BUF_SIZE + 1];
-static int bufpos;
+static unsigned int bufsize;
+static xmlChar *buffer;
+static unsigned int bufpos;
 
 static node_t stack[STACK_MAX];
 static int level;
@@ -160,12 +161,18 @@
 void
 print_value (FILE *f, node_t tuple)
 {
+    if (tuple.value)
+        fputs (tuple.value, f);
+}
+
+void
+print_value_escaped (FILE *f, node_t tuple)
+{
     /* escape quotes in quoted string */
     if (tuple.value) {
         char *src = (char *) tuple.value;
         unsigned int start = 0, end;
-        putc ('"', f);
-        for (end = 0; end < text_size && tuple.value[end]; end++)
+        for (end = 0; tuple.value[end]; end++)
             /* escape quotes */
             if (tuple.value[end] == '"') {
                 print_text (f, &src[start], end - start);
@@ -174,13 +181,9 @@
             }
         if (start < end)
             print_text (f, &src[start], end - start);
-        putc ('"', f);
     }
 }
 
-void (*print_localname) (FILE *, node_t);
-void (*print_uri) (FILE *, node_t);
-    
 void
 print_localname_id (FILE *f, node_t tuple)
 {
@@ -190,10 +193,16 @@
 void
 print_localname_str (FILE *f, node_t tuple)
 {
-    fprintf (f, "\"%s\"", (char *) tuple.localname);
+    fputs ((char *) tuple.localname, f);
 }
     
 void
+print_prefix_str (FILE *f, node_t tuple)
+{
+    fputs ((char *) tuple.prefix, f);
+}
+
+void
 print_uri_id (FILE *f, node_t tuple)
 {
     fprintf (f, "%i", tuple.uri_id);
@@ -202,7 +211,7 @@
 void
 print_uri_str (FILE *f, node_t tuple)
 {
-    fprintf (f, "\"%s\"", (char *) tuple.uri);
+    fputs ((char *) tuple.uri, f);
 }
 
 /* implementation of formatting functions */
@@ -241,11 +250,20 @@
 static void lambda_P (node_t n) 
 { if (n.parent) lambda_E (*(n.parent)); }
 
+static void lambda_X (node_t n) 
+{ if (n.uri_id != -1) print_prefix_str (out, n); }
+
 static void lambda_n (node_t n) 
-{ if (n.localname_id != -1) print_localname (out, n); }
+{ if (n.localname_id != -1) print_localname_id (out, n); }
+
+static void lambda_N (node_t n) 
+{ if (n.localname_id != -1) print_localname_str (out, n); }
 
 static void lambda_u (node_t n) 
-{ if (n.uri_id != -1) print_uri (out, n); }
+{ if (n.uri_id != -1) print_uri_id (out, n); }
+
+static void lambda_U (node_t n) 
+{ if (n.uri_id != -1) print_uri_str (out, n); }
 
 static void lambda_d (node_t n) 
 { print_number (out, n); }
@@ -253,8 +271,10 @@
 static void lambda_r (node_t n) 
 { fprintf (out, SSZFMT, n.root); }
 
+void (*print_textnode) (FILE *, node_t);
+
 static void lambda_t (node_t n) 
-{ print_value (out, n); }
+{ print_textnode (out, n); }
 
 static void lambda_g (node_t n) 
 { fprintf (out, SSZFMT, guide_val_ (n.guide)); }
@@ -275,7 +295,10 @@
   , ['p'] = lambda_p
   , ['P'] = lambda_P
   , ['n'] = lambda_n
+  , ['X'] = lambda_X
+  , ['N'] = lambda_N
   , ['u'] = lambda_u
+  , ['U'] = lambda_U
   , ['d'] = lambda_d
   , ['r'] = lambda_r
   , ['t'] = lambda_t
@@ -381,11 +404,14 @@
 
 static void
 flush_node (kind_t kind, 
-            const xmlChar *URI, const xmlChar *localname, 
+            const xmlChar *localname, 
+            const xmlChar *prefix, const xmlChar *URI, 
             const xmlChar *value)
 {
-    char *short_value = NULL;
-    int valStrLen = -1;
+    char *short_value    = NULL,
+         *prefix_copy    = NULL,
+         *localname_copy = NULL,
+         *URI_copy       = NULL;
 
     pre++;
     rank++;
@@ -393,18 +419,30 @@
 
     max_level = MAX(level, max_level);
 
-    /* check if tagname is larger than TAG_SIZE characters */
-    if (localname && xmlStrlen (localname) > TAG_SIZE)
-        BAILOUT ("attribute local name `%s' exceeds %u characters", 
-                 localname, TAG_SIZE);
+    /* check if prefix is larger than text_size characters */
+    if (text_size && prefix && (unsigned int) xmlStrlen (prefix) > text_size) {
+        prefix_stripped++;
+        prefix_copy = strndup ((char *) prefix, text_size);
+    } else if (prefix)
+        prefix_copy = strdup ((char *) prefix);
+        
+    /* check if tagname is larger than text_size characters */
+    if (text_size && localname &&
+        (unsigned int) xmlStrlen (localname) > text_size) {
+        name_stripped++;
+        localname_copy = strndup ((char *) localname, text_size);
+    } else if (localname)
+        localname_copy = strdup ((char *) localname);
     
-    if (URI && xmlStrlen (URI) > TAG_SIZE)
-        BAILOUT ("namespace URI `%s' exceeds length of %u characters", 
-                 URI, TAG_SIZE);
+    /* check if uri is larger than text_size characters */
+    if (text_size && URI && (unsigned int) xmlStrlen (URI) > text_size) {
+        name_stripped++;
+        URI_copy = strndup ((char *) URI, text_size);
+    } else if (URI)
+        URI_copy = strdup ((char *) URI);
 
     /* check if value is larger than text_size characters */
-    if (value && (valStrLen = xmlStrlen (value)) >= 0 &&
-        (unsigned int) valStrLen > text_size) {
+    if (text_size && value && (unsigned int) xmlStrlen (value) > text_size) {
         text_stripped++;
         short_value = strndup ((char *) value, text_size);
     }
@@ -422,9 +460,10 @@
       , .children       = 0
       , .level          = level
       , .kind           = kind
-      , .localname      = localname ? strdup ((char *) localname) : NULL
+      , .prefix         = prefix_copy
+      , .localname      = localname_copy
       , .localname_id   = generate_localname_id (localname)
-      , .uri            = URI ? strdup ((char *) URI) : NULL
+      , .uri            = URI_copy
       , .uri_id         = generate_uri_id (URI)
       , .value          = short_value ? short_value : (char *) value
       , .guide          = insert_guide_node_ ((char *) URI, (char *) localname,
@@ -437,12 +476,16 @@
     
     apply_fmt (stack[level]);
 
-    if (stack[level].localname) {
-        free (stack[level].localname);
+    if (prefix_copy) {
+        free (prefix_copy);
+        stack[level].prefix = NULL;
+    }
+    if (localname_copy) {
+        free (localname_copy);
         stack[level].localname = NULL;
     }
-    if (stack[level].uri) {   
-        free (stack[level].uri);
+    if (URI_copy) {   
+        free (URI_copy);
         stack[level].uri = NULL;
     }
     if (short_value) {
@@ -458,23 +501,29 @@
 static void
 flush_buffer (void)
 {
-    if (buf[0]) {
-        flush_node (text, NULL, NULL, buf);
+    if (bufpos) {
+        flush_node (text, NULL, NULL, NULL, buffer);
     }
 
-    buf[0] = '\0';
+    buffer[0] = '\0';
     bufpos = 0;
 }
 
 static void
 start_document (void *ctx)
 {
+    char *doc_name;
+
     (void) ctx;
 
     root = pre;
 
-    if (strlen (shredstate.doc_name) > MIN (FILENAME_MAX, text_size))
+    if (text_size && strlen (shredstate.doc_name) > text_size) {
         text_stripped++;
+        doc_name = strndup (shredstate.doc_name, text_size);
+    } else
+        doc_name = strdup (shredstate.doc_name);
+
 
     /* create a new node */
     stack[level] = (node_t) {
@@ -488,15 +537,17 @@
         , .children       = 0
         , .level          = level
         , .kind           = doc
+        , .prefix         = NULL
         , .localname      = NULL
         , .localname_id   = -1
         , .uri            = NULL
         , .uri_id         = -1
-        , .value          = strdup (shredstate.doc_name)
+        , .value          = doc_name
         , .guide          = guide
     };
     /* extend guide counter */
_______________________________________________
Checkin-list mailing list
Checkin-list@monetdb.org
http://mail.monetdb.org/mailman/listinfo/checkin-list

Reply via email to