While reviewing another patch, I noticed that the keep_comments argument
was missing a default value. To allow calling the function with only the
XML argument, I've added DEFAULT true to keep_comments.
SELECT xmlcanonicalize(xmldoc);
Updated patch (v16) attached.
Best, Jim
From 28f7c62de548468c8fb15ba95d4a68e2fdf0528e Mon Sep 17 00:00:00 2001
From: Jim Jones <jim.jo...@uni-muenster.de>
Date: Fri, 21 Mar 2025 09:46:49 +0100
Subject: [PATCH v16] Add xmlcanonicalize function
This patch adds the xmlcanonicalize function, which transforms an
XML document into its canonical form according to the W3C Canonical
XML Version 1.1 specification.
xmlcanonicalize(doc xml, keep_comments boolean DEFAULT true) -> xml
* doc: The XML document to be canonicalized.
* keep_comments: A flag indicating whether to preserve or discard
XML comments from the input document. If omitted, it defaults to
'true'.
This implementation is based on the xmlC14NDocDumpMemory function
from the C14N module of libxml2.
---
doc/src/sgml/func.sgml | 50 +++++++++++++++
src/backend/catalog/system_functions.sql | 6 ++
src/backend/utils/adt/xml.c | 43 +++++++++++++
src/include/catalog/pg_proc.dat | 3 +
src/test/regress/expected/xml.out | 82 ++++++++++++++++++++++++
src/test/regress/expected/xml_1.out | 80 +++++++++++++++++++++++
src/test/regress/expected/xml_2.out | 82 ++++++++++++++++++++++++
src/test/regress/sql/xml.sql | 52 +++++++++++++++
8 files changed, 398 insertions(+)
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 6fa1d6586b..7635112854 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -14585,6 +14585,56 @@ SELECT xmltext('< foo & bar >');
</para>
</sect3>
+<sect3 id="functions-producing-xml-xmlcanonicalize">
+ <title><literal>xmlcanonicalize</literal></title>
+
+ <indexterm>
+ <primary>xmlcanonicalize</primary>
+ </indexterm>
+
+<synopsis>
+<function>xmlcanonicalize</function> ( <parameter>doc</parameter> <type>xml</type> [, <parameter>keep_comments</parameter> <type>boolean</type> DEFAULT <literal>true</literal>] ) <returnvalue>xml</returnvalue>
+
+</synopsis>
+
+ <para>
+ This function transforms a given XML document into its <ulink url="https://www.w3.org/TR/xml-c14n11/#Terminology">canonical form</ulink>,
+ as defined by the <ulink url="https://www.w3.org/TR/xml-c14n11/">W3C Canonical XML 1.1 Specification</ulink>, which standardizes the document's
+ structure and syntax to facilitate comparison and validation.
+ The <parameter>keep_comments</parameter> parameter controls whether XML comments from the input document are preserved or discarded.
+ If omitted, it defaults to <literal>true</literal>.
+ </para>
+
+ <para>
+ Example:
+<screen><![CDATA[
+SELECT
+ xmlcanonicalize(
+ '<foo>
+ <!-- a comment -->
+ <bar c="3" b="2" a="1">42</bar>
+ <empty/>
+ </foo>'::xml);
+ xmlcanonicalize
+-----------------------------------------------------------------------------
+ <foo><!-- a comment --><bar a="1" b="2" c="3">42</bar><empty></empty></foo>
+(1 row)
+
+SELECT
+ xmlcanonicalize(
+ '<foo>
+ <!-- a comment -->
+ <bar c="3" b="2" a="1">42</bar>
+ <empty/>
+ </foo>'::xml, false);
+ xmlcanonicalize
+-----------------------------------------------------------
+ <foo><bar a="1" b="2" c="3">42</bar><empty></empty></foo>
+(1 row)
+]]></screen>
+ </para>
+ </sect3>
+
<sect3 id="functions-producing-xml-xmlcomment">
<title><literal>xmlcomment</literal></title>
diff --git a/src/backend/catalog/system_functions.sql b/src/backend/catalog/system_functions.sql
index 566f308e44..15c33335dc 100644
--- a/src/backend/catalog/system_functions.sql
+++ b/src/backend/catalog/system_functions.sql
@@ -268,6 +268,14 @@ CREATE OR REPLACE FUNCTION xpath_exists(text, xml)
IMMUTABLE PARALLEL SAFE STRICT COST 1
RETURN xpath_exists($1, $2, '{}'::text[]);
+-- The parameters are named so that the documented names can be used in
+-- named notation, e.g. xmlcanonicalize(doc, keep_comments => false).
+CREATE OR REPLACE FUNCTION xmlcanonicalize(doc xml, keep_comments boolean DEFAULT true)
+ RETURNS xml
+ LANGUAGE internal
+ IMMUTABLE PARALLEL SAFE STRICT
+AS 'xmlcanonicalize';
+
CREATE OR REPLACE FUNCTION pg_sleep_for(interval)
RETURNS void
LANGUAGE sql
diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c
index db8d0d6a7e..fb956710b8 100644
--- a/src/backend/utils/adt/xml.c
+++ b/src/backend/utils/adt/xml.c
@@ -58,6 +58,7 @@
#include <libxml/xmlwriter.h>
#include <libxml/xpath.h>
#include <libxml/xpathInternals.h>
+#include <libxml/c14n.h>
/*
* We used to check for xmlStructuredErrorContext via a configure test; but
@@ -544,6 +545,65 @@ xmltext(PG_FUNCTION_ARGS)
#endif /* not USE_LIBXML */
}
+/*
+ * xmlcanonicalize
+ *		Return the canonical form of an XML document, following the W3C
+ *		Canonical XML 1.1 specification.
+ *
+ * Argument 0 is the XML document; argument 1 (keep_comments) tells whether
+ * comments are kept in the output.  The work is done by libxml2's
+ * xmlC14NDocDumpMemory().
+ */
+Datum
+xmlcanonicalize(PG_FUNCTION_ARGS)
+{
+#ifdef USE_LIBXML
+	xmltype    *arg = PG_GETARG_XML_P(0);
+	bool		keep_comments = PG_GETARG_BOOL(1);
+	text	   *result = NULL;
+	xmlDocPtr	doc;
+	xmlChar    *volatile xmlbuf = NULL;	/* read in PG_FINALLY after a longjmp */
+
+	doc = xml_parse(arg, XMLOPTION_DOCUMENT, false,
+					GetDatabaseEncoding(), NULL, NULL, NULL);
+
+	/*
+	 * The parsed document and the output buffer must be released explicitly
+	 * with xmlFreeDoc()/xmlFree().  Do that in PG_FINALLY so that an error
+	 * raised below does not leak them.
+	 */
+	PG_TRY();
+	{
+		xmlChar    *buf = NULL;
+		int			nbytes;
+
+		/* XML_C14N_1_1 selects canonicalization per the C14N 1.1 standard */
+		nbytes = xmlC14NDocDumpMemory(doc, NULL, XML_C14N_1_1, NULL,
+									  keep_comments, &buf);
+		xmlbuf = buf;
+
+		if (nbytes < 0)
+			ereport(ERROR,
+					(errcode(ERRCODE_INTERNAL_ERROR),
+					 errmsg("could not canonicalize the given XML document")));
+
+		result = cstring_to_text_with_len((const char *) xmlbuf, nbytes);
+	}
+	PG_FINALLY();
+	{
+		if (xmlbuf != NULL)
+			xmlFree(xmlbuf);
+		if (doc != NULL)
+			xmlFreeDoc(doc);
+	}
+	PG_END_TRY();
+
+	PG_RETURN_XML_P(result);
+#else
+	NO_XML_SUPPORT();
+	return 0;
+#endif /* not USE_LIBXML */
+}
/*
* TODO: xmlconcat needs to merge the notations and unparsed entities
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 890822eaf7..f279943176 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -9071,6 +9071,9 @@
{ oid => '3813', descr => 'generate XML text node',
proname => 'xmltext', prorettype => 'xml', proargtypes => 'text',
prosrc => 'xmltext' },
+{ oid => '3814', descr => 'generate the canonical form of an XML document',
+ proname => 'xmlcanonicalize', prorettype => 'xml', proargtypes => 'xml bool',
+ prosrc => 'xmlcanonicalize' },
{ oid => '2923', descr => 'map table contents to XML',
proname => 'table_to_xml', procost => '100', provolatile => 's',
diff --git a/src/test/regress/expected/xml.out b/src/test/regress/expected/xml.out
index bcc743f485..63d0e71259 100644
--- a/src/test/regress/expected/xml.out
+++ b/src/test/regress/expected/xml.out
@@ -1877,3 +1877,85 @@ SELECT xmltext('x'|| '<P>73</P>'::xml || .42 || true || 'j'::char);
x<P>73</P>0.42truej
(1 row)
+-- xmlserialize: canonical
+CREATE TABLE xmlcanonicalize_test (doc xml);
+INSERT INTO xmlcanonicalize_test VALUES
+ ('<?xml version="1.0" encoding="ISO-8859-1"?>
+ <!DOCTYPE doc SYSTEM "doc.dtd" [
+ <!ENTITY val "42">
+ <!ATTLIST xyz attr CDATA "default">
+ ]>
+
+ <!-- attributes and namespces will be sorted -->
+ <foo a:attr="out" b:attr="sorted" attr2="all" attr="I am"
+ xmlns:b="http://www.ietf.org"
+ xmlns:a="http://www.w3.org"
+ xmlns="http://example.org">
+
+ <!-- Normalization of whitespace in start and end tags -->
+ <!-- Elimination of superfluous namespace declarations, as already declared in <foo> -->
+ <bar xmlns="" xmlns:a="http://www.w3.org" >&val;</bar >
+
+ <!-- empty element will be converted to start-end tag pair -->
+ <empty/>
+
+ <!-- text will be transcoded to UTF-8 -->
+ <transcode>1</transcode>
+
+ <!-- whitespace inside tag will be preserved -->
+ <whitespace> 321 </whitespace>
+
+ <!-- empty namespace will be removed of child tag -->
+ <emptyns xmlns="" >
+ <emptyns_child xmlns=""></emptyns_child>
+ </emptyns>
+
+ <!-- CDATA section will be replaced by its value -->
+ <compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>
+ </foo> <!-- comment outside root element --> ');
+SELECT xmlcanonicalize(doc, true) FROM xmlcanonicalize_test;
+ xmlcanonicalize
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ <!-- attributes and namespces will be sorted --> +
+ <foo xmlns="http://example.org" xmlns:a="http://www.w3.org" xmlns:b="http://www.ietf.org" attr="I am" attr2="all" b:attr="sorted" a:attr="out"><!-- Normalization of whitespace in start and end tags --><!-- Elimination of superfluous namespace declarations, as already declared in <foo> --><bar xmlns="">42</bar><!-- empty element will be converted to start-end tag pair --><empty></empty><!-- text will be transcoded to UTF-8 --><transcode>1</transcode><!-- whitespace inside tag will be preserved --><whitespace> 321 </whitespace><!-- empty namespace will be removed of child tag --><emptyns xmlns=""><emptyns_child></emptyns_child></emptyns><!-- CDATA section will be replaced by its value --><compute>value>"0" && value<"10" ?"valid":"error"</compute></foo>+
+ <!-- comment outside root element -->
+(1 row)
+
+SELECT xmlcanonicalize(doc, false) FROM xmlcanonicalize_test;
+ xmlcanonicalize
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ <foo xmlns="http://example.org" xmlns:a="http://www.w3.org" xmlns:b="http://www.ietf.org" attr="I am" attr2="all" b:attr="sorted" a:attr="out"><bar xmlns="">42</bar><empty></empty><transcode>1</transcode><whitespace> 321 </whitespace><emptyns xmlns=""><emptyns_child></emptyns_child></emptyns><compute>value>"0" && value<"10" ?"valid":"error"</compute></foo>
+(1 row)
+
+SELECT xmlcanonicalize(doc, true)::text = xmlcanonicalize(doc)::text FROM xmlcanonicalize_test;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT xmlcanonicalize(doc, NULL) FROM xmlcanonicalize_test;
+ xmlcanonicalize
+-----------------
+
+(1 row)
+
+SELECT xmlcanonicalize(NULL, true);
+ xmlcanonicalize
+-----------------
+
+(1 row)
+
+\set VERBOSITY terse
+SELECT xmlcanonicalize('', true);
+ERROR: invalid XML document
+SELECT xmlcanonicalize(' ', true);
+ERROR: invalid XML document
+SELECT xmlcanonicalize('foo', true);
+ERROR: invalid XML document
+SELECT xmlcanonicalize('');
+ERROR: invalid XML document
+SELECT xmlcanonicalize(' ');
+ERROR: invalid XML document
+SELECT xmlcanonicalize('foo');
+ERROR: invalid XML document
+\set VERBOSITY default
diff --git a/src/test/regress/expected/xml_1.out b/src/test/regress/expected/xml_1.out
index a1c5d31417..cf5dab372d 100644
--- a/src/test/regress/expected/xml_1.out
+++ b/src/test/regress/expected/xml_1.out
@@ -1492,3 +1492,83 @@ ERROR: unsupported XML feature
LINE 1: SELECT xmltext('x'|| '<P>73</P>'::xml || .42 || true || 'j':...
^
DETAIL: This functionality requires the server to be built with libxml support.
+-- xmlserialize: canonical
+CREATE TABLE xmlcanonicalize_test (doc xml);
+INSERT INTO xmlcanonicalize_test VALUES
+ ('<?xml version="1.0" encoding="ISO-8859-1"?>
+ <!DOCTYPE doc SYSTEM "doc.dtd" [
+ <!ENTITY val "42">
+ <!ATTLIST xyz attr CDATA "default">
+ ]>
+
+ <!-- attributes and namespces will be sorted -->
+ <foo a:attr="out" b:attr="sorted" attr2="all" attr="I am"
+ xmlns:b="http://www.ietf.org"
+ xmlns:a="http://www.w3.org"
+ xmlns="http://example.org">
+
+ <!-- Normalization of whitespace in start and end tags -->
+ <!-- Elimination of superfluous namespace declarations, as already declared in <foo> -->
+ <bar xmlns="" xmlns:a="http://www.w3.org" >&val;</bar >
+
+ <!-- empty element will be converted to start-end tag pair -->
+ <empty/>
+
+ <!-- text will be transcoded to UTF-8 -->
+ <transcode>1</transcode>
+
+ <!-- whitespace inside tag will be preserved -->
+ <whitespace> 321 </whitespace>
+
+ <!-- empty namespace will be removed of child tag -->
+ <emptyns xmlns="" >
+ <emptyns_child xmlns=""></emptyns_child>
+ </emptyns>
+
+ <!-- CDATA section will be replaced by its value -->
+ <compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>
+ </foo> <!-- comment outside root element --> ');
+ERROR: unsupported XML feature
+LINE 2: ('<?xml version="1.0" encoding="ISO-8859-1"?>
+ ^
+DETAIL: This functionality requires the server to be built with libxml support.
+SELECT xmlcanonicalize(doc, true) FROM xmlcanonicalize_test;
+ xmlcanonicalize
+-----------------
+(0 rows)
+
+SELECT xmlcanonicalize(doc, false) FROM xmlcanonicalize_test;
+ xmlcanonicalize
+-----------------
+(0 rows)
+
+SELECT xmlcanonicalize(doc, true)::text = xmlcanonicalize(doc)::text FROM xmlcanonicalize_test;
+ ?column?
+----------
+(0 rows)
+
+SELECT xmlcanonicalize(doc, NULL) FROM xmlcanonicalize_test;
+ xmlcanonicalize
+-----------------
+(0 rows)
+
+SELECT xmlcanonicalize(NULL, true);
+ xmlcanonicalize
+-----------------
+
+(1 row)
+
+\set VERBOSITY terse
+SELECT xmlcanonicalize('', true);
+ERROR: unsupported XML feature at character 24
+SELECT xmlcanonicalize(' ', true);
+ERROR: unsupported XML feature at character 24
+SELECT xmlcanonicalize('foo', true);
+ERROR: unsupported XML feature at character 24
+SELECT xmlcanonicalize('');
+ERROR: unsupported XML feature at character 24
+SELECT xmlcanonicalize(' ');
+ERROR: unsupported XML feature at character 24
+SELECT xmlcanonicalize('foo');
+ERROR: unsupported XML feature at character 24
+\set VERBOSITY default
diff --git a/src/test/regress/expected/xml_2.out b/src/test/regress/expected/xml_2.out
index 045641dae6..0735cfd9bc 100644
--- a/src/test/regress/expected/xml_2.out
+++ b/src/test/regress/expected/xml_2.out
@@ -1863,3 +1863,85 @@ SELECT xmltext('x'|| '<P>73</P>'::xml || .42 || true || 'j'::char);
x<P>73</P>0.42truej
(1 row)
+-- xmlserialize: canonical
+CREATE TABLE xmlcanonicalize_test (doc xml);
+INSERT INTO xmlcanonicalize_test VALUES
+ ('<?xml version="1.0" encoding="ISO-8859-1"?>
+ <!DOCTYPE doc SYSTEM "doc.dtd" [
+ <!ENTITY val "42">
+ <!ATTLIST xyz attr CDATA "default">
+ ]>
+
+ <!-- attributes and namespces will be sorted -->
+ <foo a:attr="out" b:attr="sorted" attr2="all" attr="I am"
+ xmlns:b="http://www.ietf.org"
+ xmlns:a="http://www.w3.org"
+ xmlns="http://example.org">
+
+ <!-- Normalization of whitespace in start and end tags -->
+ <!-- Elimination of superfluous namespace declarations, as already declared in <foo> -->
+ <bar xmlns="" xmlns:a="http://www.w3.org" >&val;</bar >
+
+ <!-- empty element will be converted to start-end tag pair -->
+ <empty/>
+
+ <!-- text will be transcoded to UTF-8 -->
+ <transcode>1</transcode>
+
+ <!-- whitespace inside tag will be preserved -->
+ <whitespace> 321 </whitespace>
+
+ <!-- empty namespace will be removed of child tag -->
+ <emptyns xmlns="" >
+ <emptyns_child xmlns=""></emptyns_child>
+ </emptyns>
+
+ <!-- CDATA section will be replaced by its value -->
+ <compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>
+ </foo> <!-- comment outside root element --> ');
+SELECT xmlcanonicalize(doc, true) FROM xmlcanonicalize_test;
+ xmlcanonicalize
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ <!-- attributes and namespces will be sorted --> +
+ <foo xmlns="http://example.org" xmlns:a="http://www.w3.org" xmlns:b="http://www.ietf.org" attr="I am" attr2="all" b:attr="sorted" a:attr="out"><!-- Normalization of whitespace in start and end tags --><!-- Elimination of superfluous namespace declarations, as already declared in <foo> --><bar xmlns="">42</bar><!-- empty element will be converted to start-end tag pair --><empty></empty><!-- text will be transcoded to UTF-8 --><transcode>1</transcode><!-- whitespace inside tag will be preserved --><whitespace> 321 </whitespace><!-- empty namespace will be removed of child tag --><emptyns xmlns=""><emptyns_child></emptyns_child></emptyns><!-- CDATA section will be replaced by its value --><compute>value>"0" && value<"10" ?"valid":"error"</compute></foo>+
+ <!-- comment outside root element -->
+(1 row)
+
+SELECT xmlcanonicalize(doc, false) FROM xmlcanonicalize_test;
+ xmlcanonicalize
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ <foo xmlns="http://example.org" xmlns:a="http://www.w3.org" xmlns:b="http://www.ietf.org" attr="I am" attr2="all" b:attr="sorted" a:attr="out"><bar xmlns="">42</bar><empty></empty><transcode>1</transcode><whitespace> 321 </whitespace><emptyns xmlns=""><emptyns_child></emptyns_child></emptyns><compute>value>"0" && value<"10" ?"valid":"error"</compute></foo>
+(1 row)
+
+SELECT xmlcanonicalize(doc, true)::text = xmlcanonicalize(doc)::text FROM xmlcanonicalize_test;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT xmlcanonicalize(doc, NULL) FROM xmlcanonicalize_test;
+ xmlcanonicalize
+-----------------
+
+(1 row)
+
+SELECT xmlcanonicalize(NULL, true);
+ xmlcanonicalize
+-----------------
+
+(1 row)
+
+\set VERBOSITY terse
+SELECT xmlcanonicalize('', true);
+ERROR: invalid XML document
+SELECT xmlcanonicalize(' ', true);
+ERROR: invalid XML document
+SELECT xmlcanonicalize('foo', true);
+ERROR: invalid XML document
+SELECT xmlcanonicalize('');
+ERROR: invalid XML document
+SELECT xmlcanonicalize(' ');
+ERROR: invalid XML document
+SELECT xmlcanonicalize('foo');
+ERROR: invalid XML document
+\set VERBOSITY default
diff --git a/src/test/regress/sql/xml.sql b/src/test/regress/sql/xml.sql
index 4c3520ce89..c0c5b2a419 100644
--- a/src/test/regress/sql/xml.sql
+++ b/src/test/regress/sql/xml.sql
@@ -677,3 +677,55 @@ SELECT xmltext(' ');
SELECT xmltext('foo `$_-+?=*^%!|/\()[]{}');
SELECT xmltext('foo & <"bar">');
SELECT xmltext('x'|| '<P>73</P>'::xml || .42 || true || 'j'::char);
+
+-- xmlserialize: canonical
+CREATE TABLE xmlcanonicalize_test (doc xml);
+INSERT INTO xmlcanonicalize_test VALUES
+ ('<?xml version="1.0" encoding="ISO-8859-1"?>
+ <!DOCTYPE doc SYSTEM "doc.dtd" [
+ <!ENTITY val "42">
+ <!ATTLIST xyz attr CDATA "default">
+ ]>
+
+ <!-- attributes and namespces will be sorted -->
+ <foo a:attr="out" b:attr="sorted" attr2="all" attr="I am"
+ xmlns:b="http://www.ietf.org"
+ xmlns:a="http://www.w3.org"
+ xmlns="http://example.org">
+
+ <!-- Normalization of whitespace in start and end tags -->
+ <!-- Elimination of superfluous namespace declarations, as already declared in <foo> -->
+ <bar xmlns="" xmlns:a="http://www.w3.org" >&val;</bar >
+
+ <!-- empty element will be converted to start-end tag pair -->
+ <empty/>
+
+ <!-- text will be transcoded to UTF-8 -->
+ <transcode>1</transcode>
+
+ <!-- whitespace inside tag will be preserved -->
+ <whitespace> 321 </whitespace>
+
+ <!-- empty namespace will be removed of child tag -->
+ <emptyns xmlns="" >
+ <emptyns_child xmlns=""></emptyns_child>
+ </emptyns>
+
+ <!-- CDATA section will be replaced by its value -->
+ <compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>
+ </foo> <!-- comment outside root element --> ');
+
+SELECT xmlcanonicalize(doc, true) FROM xmlcanonicalize_test;
+SELECT xmlcanonicalize(doc, false) FROM xmlcanonicalize_test;
+SELECT xmlcanonicalize(doc, true)::text = xmlcanonicalize(doc)::text FROM xmlcanonicalize_test;
+SELECT xmlcanonicalize(doc, NULL) FROM xmlcanonicalize_test;
+SELECT xmlcanonicalize(NULL, true);
+
+\set VERBOSITY terse
+SELECT xmlcanonicalize('', true);
+SELECT xmlcanonicalize(' ', true);
+SELECT xmlcanonicalize('foo', true);
+SELECT xmlcanonicalize('');
+SELECT xmlcanonicalize(' ');
+SELECT xmlcanonicalize('foo');
+\set VERBOSITY default
\ No newline at end of file
--
2.34.1