Changeset: c416436c479c for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=c416436c479c
Modified Files:
	pathfinder/src/sqlhelpers/xmlshred/encoding.c
	pathfinder/src/sqlhelpers/xmlshred/guides.c
	pathfinder/src/sqlhelpers/xmlshred/include/encoding.h
	pathfinder/src/sqlhelpers/xmlshred/include/guides.h
	pathfinder/src/sqlhelpers/xmlshred/include/shred_helper.h
	pathfinder/src/sqlhelpers/xmlshred/main.c
Branch: default
Log Message:
Various extensions to pfshred: - Added support for arbitrarily long tagnames and textnodes. - Extended guide information with unique name and uri ids (cf., format options '%n' and '%u'). - Fix error when generating files without format option '%g'. - Added new format options ('%X', '%U', '%N', '%r'). - Added option (-e) to disable string escaping. - Added option (-x) to generate output for x100. diffs (truncated from 770 to 300 lines): diff --git a/pathfinder/src/sqlhelpers/xmlshred/encoding.c b/pathfinder/src/sqlhelpers/xmlshred/encoding.c --- a/pathfinder/src/sqlhelpers/xmlshred/encoding.c +++ b/pathfinder/src/sqlhelpers/xmlshred/encoding.c @@ -58,14 +58,20 @@ /* count the tags/text being truncated * during the shredding process */ -unsigned int tag_stripped; +unsigned int prefix_stripped; +unsigned int name_stripped; +unsigned int uri_stripped; unsigned int text_stripped; static shred_state_t shredstate; /* Wrapper to ensure that guide information is only collected if needed */ -#define insert_guide_node_(u,n,p,k) ((shredstate.statistics) \ - ? insert_guide_node ((u),(n),(p),(k)) \ +#define insert_guide_node_(u,n,p,k) ((shredstate.statistics) \ + ? insert_guide_node ( \ + (u),(n), \ + generate_uri_id ((xmlChar *) u), \ + generate_localname_id ((xmlChar *) n), \ + (p),(k)) \ : NULL) #define guide_val_(g) ((shredstate.statistics) \ ? (g)->guide \ @@ -78,20 +84,15 @@ #define BAILOUT(...) 
do { SHoops (SH_FATAL, __VA_ARGS__); \ free_hashtable (localname_hash); \ free_hashtable (uris_hash); \ + free (buffer); \ exit (1); \ } while (0) -/* id of empty namespace prefix */ -#define EMPTY_NS 0 - -/* maximum localname/URI and text buffer sizes */ -#define TAG_SIZE 100 -#define BUF_SIZE 4096 - /* buffer for XML node contents/values */ -static xmlChar buf[BUF_SIZE + 1]; -static int bufpos; +static unsigned int bufsize; +static xmlChar *buffer; +static unsigned int bufpos; static node_t stack[STACK_MAX]; static int level; @@ -160,12 +161,18 @@ void print_value (FILE *f, node_t tuple) { + if (tuple.value) + fputs (tuple.value, f); +} + +void +print_value_escaped (FILE *f, node_t tuple) +{ /* escape quotes in quoted string */ if (tuple.value) { char *src = (char *) tuple.value; unsigned int start = 0, end; - putc ('"', f); - for (end = 0; end < text_size && tuple.value[end]; end++) + for (end = 0; tuple.value[end]; end++) /* escape quotes */ if (tuple.value[end] == '"') { print_text (f, &src[start], end - start); @@ -174,13 +181,9 @@ } if (start < end) print_text (f, &src[start], end - start); - putc ('"', f); } } -void (*print_localname) (FILE *, node_t); -void (*print_uri) (FILE *, node_t); - void print_localname_id (FILE *f, node_t tuple) { @@ -190,10 +193,16 @@ void print_localname_str (FILE *f, node_t tuple) { - fprintf (f, "\"%s\"", (char *) tuple.localname); + fputs ((char *) tuple.localname, f); } void +print_prefix_str (FILE *f, node_t tuple) +{ + fputs ((char *) tuple.prefix, f); +} + +void print_uri_id (FILE *f, node_t tuple) { fprintf (f, "%i", tuple.uri_id); @@ -202,7 +211,7 @@ void print_uri_str (FILE *f, node_t tuple) { - fprintf (f, "\"%s\"", (char *) tuple.uri); + fputs ((char *) tuple.uri, f); } /* implementation of formatting functions */ @@ -241,11 +250,20 @@ static void lambda_P (node_t n) { if (n.parent) lambda_E (*(n.parent)); } +static void lambda_X (node_t n) +{ if (n.uri_id != -1) print_prefix_str (out, n); } + static void lambda_n 
(node_t n) -{ if (n.localname_id != -1) print_localname (out, n); } +{ if (n.localname_id != -1) print_localname_id (out, n); } + +static void lambda_N (node_t n) +{ if (n.localname_id != -1) print_localname_str (out, n); } static void lambda_u (node_t n) -{ if (n.uri_id != -1) print_uri (out, n); } +{ if (n.uri_id != -1) print_uri_id (out, n); } + +static void lambda_U (node_t n) +{ if (n.uri_id != -1) print_uri_str (out, n); } static void lambda_d (node_t n) { print_number (out, n); } @@ -253,8 +271,10 @@ static void lambda_r (node_t n) { fprintf (out, SSZFMT, n.root); } +void (*print_textnode) (FILE *, node_t); + static void lambda_t (node_t n) -{ print_value (out, n); } +{ print_textnode (out, n); } static void lambda_g (node_t n) { fprintf (out, SSZFMT, guide_val_ (n.guide)); } @@ -275,7 +295,10 @@ , ['p'] = lambda_p , ['P'] = lambda_P , ['n'] = lambda_n + , ['X'] = lambda_X + , ['N'] = lambda_N , ['u'] = lambda_u + , ['U'] = lambda_U , ['d'] = lambda_d , ['r'] = lambda_r , ['t'] = lambda_t @@ -381,11 +404,14 @@ static void flush_node (kind_t kind, - const xmlChar *URI, const xmlChar *localname, + const xmlChar *localname, + const xmlChar *prefix, const xmlChar *URI, const xmlChar *value) { - char *short_value = NULL; - int valStrLen = -1; + char *short_value = NULL, + *prefix_copy = NULL, + *localname_copy = NULL, + *URI_copy = NULL; pre++; rank++; @@ -393,18 +419,30 @@ max_level = MAX(level, max_level); - /* check if tagname is larger than TAG_SIZE characters */ - if (localname && xmlStrlen (localname) > TAG_SIZE) - BAILOUT ("attribute local name `%s' exceeds %u characters", - localname, TAG_SIZE); + /* check if prefix is larger than text_size characters */ + if (text_size && prefix && (unsigned int) xmlStrlen (prefix) > text_size) { + prefix_stripped++; + prefix_copy = strndup ((char *) prefix, text_size); + } else if (prefix) + prefix_copy = strdup ((char *) prefix); + + /* check if tagname is larger than text_size characters */ + if (text_size && 
localname && + (unsigned int) xmlStrlen (localname) > text_size) { + name_stripped++; + localname_copy = strndup ((char *) localname, text_size); + } else if (localname) + localname_copy = strdup ((char *) localname); - if (URI && xmlStrlen (URI) > TAG_SIZE) - BAILOUT ("namespace URI `%s' exceeds length of %u characters", - URI, TAG_SIZE); + /* check if uri is larger than text_size characters */ + if (text_size && URI && (unsigned int) xmlStrlen (URI) > text_size) { + name_stripped++; + URI_copy = strndup ((char *) URI, text_size); + } else if (URI) + URI_copy = strdup ((char *) URI); /* check if value is larger than text_size characters */ - if (value && (valStrLen = xmlStrlen (value)) >= 0 && - (unsigned int) valStrLen > text_size) { + if (text_size && value && (unsigned int) xmlStrlen (value) > text_size) { text_stripped++; short_value = strndup ((char *) value, text_size); } @@ -422,9 +460,10 @@ , .children = 0 , .level = level , .kind = kind - , .localname = localname ? strdup ((char *) localname) : NULL + , .prefix = prefix_copy + , .localname = localname_copy , .localname_id = generate_localname_id (localname) - , .uri = URI ? strdup ((char *) URI) : NULL + , .uri = URI_copy , .uri_id = generate_uri_id (URI) , .value = short_value ? 
short_value : (char *) value , .guide = insert_guide_node_ ((char *) URI, (char *) localname, @@ -437,12 +476,16 @@ apply_fmt (stack[level]); - if (stack[level].localname) { - free (stack[level].localname); + if (prefix_copy) { + free (prefix_copy); + stack[level].prefix = NULL; + } + if (localname_copy) { + free (localname_copy); stack[level].localname = NULL; } - if (stack[level].uri) { - free (stack[level].uri); + if (URI_copy) { + free (URI_copy); stack[level].uri = NULL; } if (short_value) { @@ -458,23 +501,29 @@ static void flush_buffer (void) { - if (buf[0]) { - flush_node (text, NULL, NULL, buf); + if (bufpos) { + flush_node (text, NULL, NULL, NULL, buffer); } - buf[0] = '\0'; + buffer[0] = '\0'; bufpos = 0; } static void start_document (void *ctx) { + char *doc_name; + (void) ctx; root = pre; - if (strlen (shredstate.doc_name) > MIN (FILENAME_MAX, text_size)) + if (text_size && strlen (shredstate.doc_name) > text_size) { text_stripped++; + doc_name = strndup (shredstate.doc_name, text_size); + } else + doc_name = strdup (shredstate.doc_name); + /* create a new node */ stack[level] = (node_t) { @@ -488,15 +537,17 @@ , .children = 0 , .level = level , .kind = doc + , .prefix = NULL , .localname = NULL , .localname_id = -1 , .uri = NULL , .uri_id = -1 - , .value = strdup (shredstate.doc_name) + , .value = doc_name , .guide = guide }; /* extend guide counter */ _______________________________________________ Checkin-list mailing list Checkin-list@monetdb.org http://mail.monetdb.org/mailman/listinfo/checkin-list