Thom Brown wrote:
Would a test for mismatched or undefined namespaces be necessary?
For example:
Mismatched namespace:
<pg:foo xmlns:pg="http://postgresql.org/stuff">bar</my:foo>
Undefined namespace when used in conjunction with IS DOCUMENT:
<pg:foo xmlns:my="http://postgresql.org/stuff">bar</pg:foo>
Thanks for looking at my patch Thom. I hadn't thought of that particular
scenario and even though I didn't specifically code for it, the
underlying libxml call does correctly reject the mismatched namespace:
template1=# SELECT xml_is_well_formed('<pg:foo
xmlns:pg="http://postgresql.org/stuff">bar</my:foo>');
xml_is_well_formed
--------------------
f
(1 row)
In the attached patch I've added the example to the SGML documentation
and the regression tests.
Also, having a look at the following example from the patch:
SELECT xml_is_well_formed('<local:data
xmlns:local="http://127.0.0.1";><local:piece id="1">number
one</local:piece><local:piece id="2" /></local:data>');
xml_is_well_formed
--------------------
t
(1 row)
Just wondering about that semi-colon after the namespace definition.
Thom
The semi-colon is not supposed to be there, and I'm not sure where it's
come from. With Thunderbird I see the email with my patch as an
attachement, downloaded and viewing the file there are no instances of a
" followed by a ;. However, if I look at the message on the archive at
http://archives.postgresql.org/message-id/4c3871c2.8000...@mlfowler.com
I can see every URL that ends with a " has a ; following it. Should I
be escaping the " in the patch file in some way or this just an artifact
of HTML parsing a patch?
Regards,
--
Mike Fowler
Registered Linux user: 379787
*** a/contrib/xml2/xpath.c
--- b/contrib/xml2/xpath.c
***************
*** 27,33 **** PG_MODULE_MAGIC;
/* externally accessible functions */
- Datum xml_is_well_formed(PG_FUNCTION_ARGS);
Datum xml_encode_special_chars(PG_FUNCTION_ARGS);
Datum xpath_nodeset(PG_FUNCTION_ARGS);
Datum xpath_string(PG_FUNCTION_ARGS);
--- 27,32 ----
***************
*** 70,97 **** pgxml_parser_init(void)
xmlLoadExtDtdDefaultValue = 1;
}
-
- /* Returns true if document is well-formed */
-
- PG_FUNCTION_INFO_V1(xml_is_well_formed);
-
- Datum
- xml_is_well_formed(PG_FUNCTION_ARGS)
- {
- text *t = PG_GETARG_TEXT_P(0); /* document buffer */
- int32 docsize = VARSIZE(t) - VARHDRSZ;
- xmlDocPtr doctree;
-
- pgxml_parser_init();
-
- doctree = xmlParseMemory((char *) VARDATA(t), docsize);
- if (doctree == NULL)
- PG_RETURN_BOOL(false); /* i.e. not well-formed */
- xmlFreeDoc(doctree);
- PG_RETURN_BOOL(true);
- }
-
-
/* Encodes special characters (<, >, &, " and \r) as XML entities */
PG_FUNCTION_INFO_V1(xml_encode_special_chars);
--- 69,74 ----
*** a/doc/src/sgml/func.sgml
--- b/doc/src/sgml/func.sgml
***************
*** 8554,8562 **** SELECT xmlagg(x) FROM (SELECT * FROM test ORDER BY y DESC) AS tab;
]]></screen>
</para>
</sect3>
<sect3>
! <title>XML Predicates</title>
<indexterm>
<primary>IS DOCUMENT</primary>
--- 8554,8566 ----
]]></screen>
</para>
</sect3>
+ </sect2>
+
+ <sect2>
+ <title>XML Predicates</title>
<sect3>
! <title>IS DOCUMENT</title>
<indexterm>
<primary>IS DOCUMENT</primary>
***************
*** 8574,8579 **** SELECT xmlagg(x) FROM (SELECT * FROM test ORDER BY y DESC) AS tab;
--- 8578,8675 ----
between documents and content fragments.
</para>
</sect3>
+
+ <sect3>
+ <title>xml_is_well_formed</title>
+
+ <indexterm>
+ <primary>xml_is_well_formed</primary>
+ <secondary>well formed</secondary>
+ </indexterm>
+
+ <synopsis>
+ <function>xml_is_well_formed</function>(<replaceable>text</replaceable>)
+ </synopsis>
+
+ <para>
+ The function <function>xml_is_well_formed</function> evaluates whether
+ the <replaceable>text</replaceable> is well formed XML content, returning
+ a boolean.
+ </para>
+ <para>
+ Example:
+ <screen><![CDATA[
+ SELECT xml_is_well_formed('<foo>bar</foo>');
+ xml_is_well_formed
+ --------------------
+ t
+ (1 row)
+
+ SELECT xml_is_well_formed('<foo>bar</foo');
+ xml_is_well_formed
+ --------------------
+ f
+ (1 row)
+
+ SELECT xml_is_well_formed('<foo><bar>stuff</foo>');
+ xml_is_well_formed
+ --------------------
+ f
+ (1 row)
+ ]]></screen>
+ </para>
+ <para>
+ In addition to the structure checks, the function ensures that namespaces are correcty matched.
+ <screen><![CDATA[
+ SELECT xml_is_well_formed('<pg:foo xmlns:pg="http://postgresql.org/stuff">bar</my:foo>');
+ xml_is_well_formed
+ --------------------
+ f
+ (1 row)
+
+ SELECT xml_is_well_formed('<pg:foo xmlns:pg="http://postgresql.org/stuff">bar</pg:foo>');
+ xml_is_well_formed
+ --------------------
+ t
+ (1 row)
+ ]]></screen>
+ </para>
+ <para>
+ This function can be combined with the IS DOCUMENT predicate to prevent
+ invalid XML content errors from occuring in queries. For example, given a
+ table that may have rows with invalid XML mixed in with rows of valid
+ XML, <function>xml_is_well_formed</function> can be used to filter out all
+ the invalid rows.
+ </para>
+ <para>
+ Example:
+ <screen><![CDATA[
+ SELECT * FROM mixed;
+ data
+ ------------------------------
+ <foo>bar</foo>
+ <foo>bar</foo
+ <foo>bar</foo><bar>foo</bar>
+ <foo>bar</foo><bar>foo</bar
+ (4 rows)
+
+ SELECT COUNT(data) FROM mixed WHERE data::xml IS DOCUMENT;
+ ERROR: invalid XML content
+ DETAIL: Entity: line 1: parser error : expected '>'
+ <foo>bar</foo
+ ^
+ Entity: line 1: parser error : chunk is not well balanced
+ <foo>bar</foo
+ ^
+
+ SELECT COUNT(data) FROM mixed WHERE xml_is_well_formed(data) AND data::xml IS DOCUMENT;
+ count
+ -------
+ 1
+ (1 row)
+ ]]></screen>
+ </para>
+ </sect3>
</sect2>
<sect2 id="functions-xml-processing">
*** a/src/backend/utils/adt/xml.c
--- b/src/backend/utils/adt/xml.c
***************
*** 3293,3298 **** xml_xmlnodetoxmltype(xmlNodePtr cur)
--- 3293,3365 ----
}
#endif
+ Datum
+ xml_is_well_formed(PG_FUNCTION_ARGS)
+ {
+ #ifdef USE_LIBXML
+ text *data = PG_GETARG_TEXT_P(0);
+ bool result;
+ int res_code;
+ int32 len;
+ const xmlChar *string;
+ xmlParserCtxtPtr ctxt;
+ xmlDocPtr doc = NULL;
+
+ len = VARSIZE(data) - VARHDRSZ;
+ string = xml_text2xmlChar(data);
+
+ /* Start up libxml and its parser (no-ops if already done) */
+ pg_xml_init();
+ xmlInitParser();
+
+ ctxt = xmlNewParserCtxt();
+ if (ctxt == NULL)
+ xml_ereport(ERROR, ERRCODE_OUT_OF_MEMORY,
+ "could not allocate parser context");
+
+ PG_TRY();
+ {
+ size_t count;
+ xmlChar *version = NULL;
+ int standalone = -1;
+
+ res_code = parse_xml_decl(string, &count, &version, NULL, &standalone);
+ if (res_code != 0)
+ xml_ereport_by_code(ERROR, ERRCODE_INVALID_XML_CONTENT,
+ "invalid XML content: invalid XML declaration",
+ res_code);
+
+ doc = xmlNewDoc(version);
+ doc->encoding = xmlStrdup((const xmlChar *) "UTF-8");
+ doc->standalone = 1;
+
+ res_code = xmlParseBalancedChunkMemory(doc, NULL, NULL, 0, string + count, NULL);
+
+ result = !res_code;
+ }
+ PG_CATCH();
+ {
+ if (doc)
+ xmlFreeDoc(doc);
+ if (ctxt)
+ xmlFreeParserCtxt(ctxt);
+
+ PG_RE_THROW();
+ }
+ PG_END_TRY();
+
+ if (doc)
+ xmlFreeDoc(doc);
+ if (ctxt)
+ xmlFreeParserCtxt(ctxt);
+
+ return result;
+ #else
+ NO_XML_SUPPORT();
+ return 0;
+ #endif
+ }
+
/*
* Evaluate XPath expression and return array of XML values.
*** a/src/include/catalog/pg_proc.h
--- b/src/include/catalog/pg_proc.h
***************
*** 4385,4390 **** DESCR("evaluate XPath expression, with namespaces support");
--- 4385,4393 ----
DATA(insert OID = 2932 ( xpath PGNSP PGUID 14 1 0 0 f f f t f i 2 0 143 "25 142" _null_ _null_ _null_ _null_ "select pg_catalog.xpath($1, $2, ''{}''::pg_catalog.text[])" _null_ _null_ _null_ ));
DESCR("evaluate XPath expression");
+ DATA(insert OID = 3037 ( xml_is_well_formed PGNSP PGUID 12 1 0 0 f f f t f i 1 0 16 "25" _null_ _null_ _null_ _null_ xml_is_well_formed _null_ _null_ _null_ ));
+ DESCR("determine if a text fragment is well formed XML");
+
/* uuid */
DATA(insert OID = 2952 ( uuid_in PGNSP PGUID 12 1 0 0 f f f t f i 1 0 2950 "2275" _null_ _null_ _null_ _null_ uuid_in _null_ _null_ _null_ ));
DESCR("I/O");
*** a/src/include/utils/xml.h
--- b/src/include/utils/xml.h
***************
*** 46,51 **** extern Datum query_to_xmlschema(PG_FUNCTION_ARGS);
--- 46,52 ----
extern Datum cursor_to_xmlschema(PG_FUNCTION_ARGS);
extern Datum table_to_xml_and_xmlschema(PG_FUNCTION_ARGS);
extern Datum query_to_xml_and_xmlschema(PG_FUNCTION_ARGS);
+ extern Datum xml_is_well_formed(PG_FUNCTION_ARGS);
extern Datum schema_to_xml(PG_FUNCTION_ARGS);
extern Datum schema_to_xmlschema(PG_FUNCTION_ARGS);
*** a/src/test/regress/expected/xml.out
--- b/src/test/regress/expected/xml.out
***************
*** 502,504 **** SELECT xpath('//b', '<a>one <b>two</b> three <b>etc</b></a>');
--- 502,577 ----
{<b>two</b>,<b>etc</b>}
(1 row)
+ -- Test xml_is_well_formed
+ SELECT xml_is_well_formed('<>');
+ xml_is_well_formed
+ --------------------
+ f
+ (1 row)
+
+ SELECT xml_is_well_formed('abc');
+ xml_is_well_formed
+ --------------------
+ t
+ (1 row)
+
+ SELECT xml_is_well_formed('<abc/>');
+ xml_is_well_formed
+ --------------------
+ t
+ (1 row)
+
+ SELECT xml_is_well_formed('<foo>bar</foo>');
+ xml_is_well_formed
+ --------------------
+ t
+ (1 row)
+
+ SELECT xml_is_well_formed('<foo>bar</foo');
+ xml_is_well_formed
+ --------------------
+ f
+ (1 row)
+
+ SELECT xml_is_well_formed('<foo><bar>baz</foo>');
+ xml_is_well_formed
+ --------------------
+ f
+ (1 row)
+
+ SELECT xml_is_well_formed('<local:data xmlns:local="http://127.0.0.1"><local:piece id="1">number one</local:piece><local:piece id="2" /></local:data>');
+ xml_is_well_formed
+ --------------------
+ t
+ (1 row)
+
+ SELECT xml_is_well_formed('<foo>bar</foo>') AND '<foo>bar</foo>' IS DOCUMENT;
+ ?column?
+ ----------
+ t
+ (1 row)
+
+ SELECT xml_is_well_formed('<foo>bar</foo>baz') AND '<foo>bar</foo>baz' IS NOT DOCUMENT;
+ ?column?
+ ----------
+ t
+ (1 row)
+
+ SELECT xml_is_well_formed('<foo>bar</foo><bar>foo</bar>') AND '<foo>bar</foo><bar>foo</bar>' IS NOT DOCUMENT;
+ ?column?
+ ----------
+ t
+ (1 row)
+
+ SELECT xml_is_well_formed('<pg:foo xmlns:pg="http://postgresql.org/stuff">bar</my:foo>');
+ xml_is_well_formed
+ --------------------
+ f
+ (1 row)
+
+ SELECT xml_is_well_formed('<pg:foo xmlns:pg="http://postgresql.org/stuff">bar</pg:foo>');
+ xml_is_well_formed
+ --------------------
+ t
+ (1 row)
+
*** a/src/test/regress/sql/xml.sql
--- b/src/test/regress/sql/xml.sql
***************
*** 163,165 **** SELECT xpath('', '<!-- error -->');
--- 163,180 ----
SELECT xpath('//text()', '<local:data xmlns:local="http://127.0.0.1"><local:piece id="1">number one</local:piece><local:piece id="2" /></local:data>');
SELECT xpath('//loc:piece/@id', '<local:data xmlns:local="http://127.0.0.1"><local:piece id="1">number one</local:piece><local:piece id="2" /></local:data>', ARRAY[ARRAY['loc', 'http://127.0.0.1']]);
SELECT xpath('//b', '<a>one <b>two</b> three <b>etc</b></a>');
+
+ -- Test xml_is_well_formed
+
+ SELECT xml_is_well_formed('<>');
+ SELECT xml_is_well_formed('abc');
+ SELECT xml_is_well_formed('<abc/>');
+ SELECT xml_is_well_formed('<foo>bar</foo>');
+ SELECT xml_is_well_formed('<foo>bar</foo');
+ SELECT xml_is_well_formed('<foo><bar>baz</foo>');
+ SELECT xml_is_well_formed('<local:data xmlns:local="http://127.0.0.1"><local:piece id="1">number one</local:piece><local:piece id="2" /></local:data>');
+ SELECT xml_is_well_formed('<foo>bar</foo>') AND '<foo>bar</foo>' IS DOCUMENT;
+ SELECT xml_is_well_formed('<foo>bar</foo>baz') AND '<foo>bar</foo>baz' IS NOT DOCUMENT;
+ SELECT xml_is_well_formed('<foo>bar</foo><bar>foo</bar>') AND '<foo>bar</foo><bar>foo</bar>' IS NOT DOCUMENT;
+ SELECT xml_is_well_formed('<pg:foo xmlns:pg="http://postgresql.org/stuff">bar</my:foo>');
+ SELECT xml_is_well_formed('<pg:foo xmlns:pg="http://postgresql.org/stuff">bar</pg:foo>');
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers