Hi Tom
On 06.09.24 18:34, Tom Lane wrote:
> I think it'd be quite foolish to assume that every extant and future
> version of libxml2 will share this glitch. Probably should use
> logic more like pg_strip_crlf(), although we can't use that directly.
Makes sense. I introduced this logic at the end of
xmltotext_with_options(), for the case where it is called with INDENT and a
DOCUMENT-type xml string.
SELECT xmlserialize(DOCUMENT '<foo><bar>42</bar></foo>' AS text INDENT);
xmlserialize
-----------------
<foo> +
<bar>42</bar>+
</foo>
(1 row)
The regression tests were updated accordingly - see patch v2-0002.
> Would it ever be the case that trailing whitespace would be valid
> data? In a bit of testing, it seems like that could be true in
> CONTENT mode but not DOCUMENT mode.
Yes, in case of CONTENT it is valid data and it will be preserved, as
CONTENT can be pretty much anything.
SELECT xmlserialize(CONTENT E'<foo><bar>42</bar></foo>\n\n\t\t\t' AS
text INDENT);
xmlserialize
--------------------------
<foo> +
<bar>42</bar> +
</foo> +
+
(1 row)
With DOCUMENT, trailing whitespace is superfluous and should be removed after
indentation. IIRC there's an xmlSaveToBuffer option called XML_SAVE_WSNONSIG
that can be used to preserve it.
Thanks
Best, Jim
From ba0aa7c2a822bd23a9b2ec5af07265bc3eba86ce Mon Sep 17 00:00:00 2001
From: Jim Jones <jim.jo...@uni-muenster.de>
Date: Fri, 6 Sep 2024 23:48:24 +0200
Subject: [PATCH v2 2/2] Bug fix: remove default trailing newline from
XMLSERIALIZE calls
By default, xmlDocContentDumpOutput adds a trailing newline to
DOCUMENT-typed XML strings when XMLSERIALIZE is called with the
INDENT option. This patch introduces a condition that checks whether
the serialized DOCUMENT ends with a trailing newline and, if so,
removes it. Regression tests were updated accordingly.
---
src/backend/utils/adt/xml.c | 18 +++++++++++++++++-
src/test/regress/expected/xml.out | 18 ++++++------------
src/test/regress/expected/xml_2.out | 18 ++++++------------
3 files changed, 29 insertions(+), 25 deletions(-)
diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c
index 1cd4929870..fa38d2b992 100644
--- a/src/backend/utils/adt/xml.c
+++ b/src/backend/utils/adt/xml.c
@@ -808,7 +808,23 @@ xmltotext_with_options(xmltype *data, XmlOptionType xmloption_arg, bool indent)
"could not close xmlSaveCtxtPtr");
}
- result = (text *) xmlBuffer_to_xmltype(buf);
+ /*
+ * xmlDocContentDumpOutput adds a trailing newline by default
+ * so we get rid of it here.
+ */
+ if (xmloption_arg == XMLOPTION_DOCUMENT)
+ {
+ char *str = (char *) xmlBufferContent(buf);
+ int len = xmlBufferLength(buf);
+
+ while (len > 0 && (str[len - 1] == '\n' ||
+ str[len - 1] == '\r'))
+ str[--len] = '\0';
+
+ result = cstring_to_text_with_len(str, len);
+ }
+ else
+ result = (text *) xmlBuffer_to_xmltype(buf);
}
PG_CATCH();
{
diff --git a/src/test/regress/expected/xml.out b/src/test/regress/expected/xml.out
index 6f073101a1..361a6f9b27 100644
--- a/src/test/regress/expected/xml.out
+++ b/src/test/regress/expected/xml.out
@@ -485,8 +485,7 @@ SELECT xmlserialize(DOCUMENT '<foo><bar><val x="y">42</val></bar></foo>' AS text
<bar> +
<val x="y">42</val>+
</bar> +
- </foo> +
-
+ </foo>
(1 row)
SELECT xmlserialize(CONTENT '<foo><bar><val x="y">42</val></bar></foo>' AS text INDENT);
@@ -546,8 +545,7 @@ SELECT xmlserialize(DOCUMENT '<foo><bar><val x="y">42</val><val x="y">text node<
<val x="y">42</val> +
<val x="y">text node<val>73</val></val>+
</bar> +
- </foo> +
-
+ </foo>
(1 row)
SELECT xmlserialize(CONTENT '<foo><bar><val x="y">42</val><val x="y">text node<val>73</val></val></bar></foo>' AS text INDENT);
@@ -601,8 +599,7 @@ SELECT xmlserialize(DOCUMENT '<?xml version="1.0" encoding="UTF-8"?><foo><bar><v
<bar> +
<val>73</val> +
</bar> +
- </foo> +
-
+ </foo>
(1 row)
SELECT xmlserialize(CONTENT '<?xml version="1.0" encoding="UTF-8"?><foo><bar><val>73</val></bar></foo>' AS text INDENT);
@@ -620,8 +617,7 @@ SELECT xmlserialize(DOCUMENT '<!DOCTYPE a><a/>' AS text INDENT);
xmlserialize
--------------
<!DOCTYPE a>+
- <a/> +
-
+ <a/>
(1 row)
SELECT xmlserialize(CONTENT '<!DOCTYPE a><a/>' AS text INDENT);
@@ -638,8 +634,7 @@ SELECT xmlserialize(DOCUMENT '<foo><bar></bar></foo>' AS text INDENT);
--------------
<foo> +
<bar/> +
- </foo> +
-
+ </foo>
(1 row)
SELECT xmlserialize(CONTENT '<foo><bar></bar></foo>' AS text INDENT);
@@ -669,8 +664,7 @@ SELECT xmlserialize(DOCUMENT '<foo> <bar></bar> </foo>' AS text INDENT);
--------------
<foo> +
<bar/> +
- </foo> +
-
+ </foo>
(1 row)
SELECT xmlserialize(CONTENT 'text node<foo> <bar></bar> </foo>' AS text INDENT);
diff --git a/src/test/regress/expected/xml_2.out b/src/test/regress/expected/xml_2.out
index 7b154da4ba..73c2851d3f 100644
--- a/src/test/regress/expected/xml_2.out
+++ b/src/test/regress/expected/xml_2.out
@@ -471,8 +471,7 @@ SELECT xmlserialize(DOCUMENT '<foo><bar><val x="y">42</val></bar></foo>' AS text
<bar> +
<val x="y">42</val>+
</bar> +
- </foo> +
-
+ </foo>
(1 row)
SELECT xmlserialize(CONTENT '<foo><bar><val x="y">42</val></bar></foo>' AS text INDENT);
@@ -532,8 +531,7 @@ SELECT xmlserialize(DOCUMENT '<foo><bar><val x="y">42</val><val x="y">text node<
<val x="y">42</val> +
<val x="y">text node<val>73</val></val>+
</bar> +
- </foo> +
-
+ </foo>
(1 row)
SELECT xmlserialize(CONTENT '<foo><bar><val x="y">42</val><val x="y">text node<val>73</val></val></bar></foo>' AS text INDENT);
@@ -587,8 +585,7 @@ SELECT xmlserialize(DOCUMENT '<?xml version="1.0" encoding="UTF-8"?><foo><bar><v
<bar> +
<val>73</val> +
</bar> +
- </foo> +
-
+ </foo>
(1 row)
SELECT xmlserialize(CONTENT '<?xml version="1.0" encoding="UTF-8"?><foo><bar><val>73</val></bar></foo>' AS text INDENT);
@@ -606,8 +603,7 @@ SELECT xmlserialize(DOCUMENT '<!DOCTYPE a><a/>' AS text INDENT);
xmlserialize
--------------
<!DOCTYPE a>+
- <a/> +
-
+ <a/>
(1 row)
SELECT xmlserialize(CONTENT '<!DOCTYPE a><a/>' AS text INDENT);
@@ -624,8 +620,7 @@ SELECT xmlserialize(DOCUMENT '<foo><bar></bar></foo>' AS text INDENT);
--------------
<foo> +
<bar/> +
- </foo> +
-
+ </foo>
(1 row)
SELECT xmlserialize(CONTENT '<foo><bar></bar></foo>' AS text INDENT);
@@ -655,8 +650,7 @@ SELECT xmlserialize(DOCUMENT '<foo> <bar></bar> </foo>' AS text INDENT);
--------------
<foo> +
<bar/> +
- </foo> +
-
+ </foo>
(1 row)
SELECT xmlserialize(CONTENT 'text node<foo> <bar></bar> </foo>' AS text INDENT);
--
2.34.1
From e474cdc457a91e96432f5a14c69b4ecbc0345579 Mon Sep 17 00:00:00 2001
From: Jim Jones <jim.jo...@uni-muenster.de>
Date: Thu, 29 Aug 2024 19:41:02 +0200
Subject: [PATCH v2 1/2] Bug fix for XMLSERIALIZE(...INDENT) for xml containing
blank nodes
This fixes XMLSERIALIZE(... INDENT) so that it ignores blank
nodes. It sets xml_parse's 'preserve_whitespace' parameter to
false if the INDENT flag was used in XMLSERIALIZE. New
regression tests are also included.
---
src/backend/utils/adt/xml.c | 10 ++++++++--
src/test/regress/expected/xml.out | 19 +++++++++++++++++++
src/test/regress/expected/xml_1.out | 11 +++++++++++
src/test/regress/expected/xml_2.out | 19 +++++++++++++++++++
src/test/regress/sql/xml.sql | 3 +++
5 files changed, 60 insertions(+), 2 deletions(-)
diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c
index 447e72b21e..1cd4929870 100644
--- a/src/backend/utils/adt/xml.c
+++ b/src/backend/utils/adt/xml.c
@@ -677,8 +677,14 @@ xmltotext_with_options(xmltype *data, XmlOptionType xmloption_arg, bool indent)
}
#ifdef USE_LIBXML
- /* Parse the input according to the xmloption */
- doc = xml_parse(data, xmloption_arg, true, GetDatabaseEncoding(),
+ /*
+ * Parse the input according to the xmloption.
+ * preserve_whitespace is set to false when the function should
+ * return indented xml; otherwise libxml2 will ignore elements
+ * that have whitespace between them.
+ */
+ doc = xml_parse(data, xmloption_arg, !indent ? true : false,
+ GetDatabaseEncoding(),
&parsed_xmloptiontype, &content_nodes,
(Node *) &escontext);
if (doc == NULL || escontext.error_occurred)
diff --git a/src/test/regress/expected/xml.out b/src/test/regress/expected/xml.out
index 93a79cda8f..6f073101a1 100644
--- a/src/test/regress/expected/xml.out
+++ b/src/test/regress/expected/xml.out
@@ -663,6 +663,25 @@ SELECT xmlserialize(CONTENT '<foo><bar><val x="y">42</val></bar></foo>' AS text
t
(1 row)
+-- indent xml strings containing blank nodes
+SELECT xmlserialize(DOCUMENT '<foo> <bar></bar> </foo>' AS text INDENT);
+ xmlserialize
+--------------
+ <foo> +
+ <bar/> +
+ </foo> +
+
+(1 row)
+
+SELECT xmlserialize(CONTENT 'text node<foo> <bar></bar> </foo>' AS text INDENT);
+ xmlserialize
+--------------
+ text node +
+ <foo> +
+ <bar/> +
+ </foo>
+(1 row)
+
SELECT xml '<foo>bar</foo>' IS DOCUMENT;
?column?
----------
diff --git a/src/test/regress/expected/xml_1.out b/src/test/regress/expected/xml_1.out
index 9323b84ae2..d26e10441e 100644
--- a/src/test/regress/expected/xml_1.out
+++ b/src/test/regress/expected/xml_1.out
@@ -443,6 +443,17 @@ ERROR: unsupported XML feature
LINE 1: SELECT xmlserialize(CONTENT '<foo><bar><val x="y">42</val><...
^
DETAIL: This functionality requires the server to be built with libxml support.
+-- indent xml strings containing blank nodes
+SELECT xmlserialize(DOCUMENT '<foo> <bar></bar> </foo>' AS text INDENT);
+ERROR: unsupported XML feature
+LINE 1: SELECT xmlserialize(DOCUMENT '<foo> <bar></bar> </foo>'...
+ ^
+DETAIL: This functionality requires the server to be built with libxml support.
+SELECT xmlserialize(CONTENT 'text node<foo> <bar></bar> </foo>' AS text INDENT);
+ERROR: unsupported XML feature
+LINE 1: SELECT xmlserialize(CONTENT 'text node<foo> <bar></bar> ...
+ ^
+DETAIL: This functionality requires the server to be built with libxml support.
SELECT xml '<foo>bar</foo>' IS DOCUMENT;
ERROR: unsupported XML feature
LINE 1: SELECT xml '<foo>bar</foo>' IS DOCUMENT;
diff --git a/src/test/regress/expected/xml_2.out b/src/test/regress/expected/xml_2.out
index f956322c69..7b154da4ba 100644
--- a/src/test/regress/expected/xml_2.out
+++ b/src/test/regress/expected/xml_2.out
@@ -649,6 +649,25 @@ SELECT xmlserialize(CONTENT '<foo><bar><val x="y">42</val></bar></foo>' AS text
t
(1 row)
+-- indent xml strings containing blank nodes
+SELECT xmlserialize(DOCUMENT '<foo> <bar></bar> </foo>' AS text INDENT);
+ xmlserialize
+--------------
+ <foo> +
+ <bar/> +
+ </foo> +
+
+(1 row)
+
+SELECT xmlserialize(CONTENT 'text node<foo> <bar></bar> </foo>' AS text INDENT);
+ xmlserialize
+--------------
+ text node +
+ <foo> +
+ <bar/> +
+ </foo>
+(1 row)
+
SELECT xml '<foo>bar</foo>' IS DOCUMENT;
?column?
----------
diff --git a/src/test/regress/sql/xml.sql b/src/test/regress/sql/xml.sql
index 953bac09e4..f752ecb142 100644
--- a/src/test/regress/sql/xml.sql
+++ b/src/test/regress/sql/xml.sql
@@ -168,6 +168,9 @@ SELECT xmlserialize(CONTENT '<foo><bar></bar></foo>' AS text INDENT);
-- 'no indent' = not using 'no indent'
SELECT xmlserialize(DOCUMENT '<foo><bar><val x="y">42</val></bar></foo>' AS text) = xmlserialize(DOCUMENT '<foo><bar><val x="y">42</val></bar></foo>' AS text NO INDENT);
SELECT xmlserialize(CONTENT '<foo><bar><val x="y">42</val></bar></foo>' AS text) = xmlserialize(CONTENT '<foo><bar><val x="y">42</val></bar></foo>' AS text NO INDENT);
+-- indent xml strings containing blank nodes
+SELECT xmlserialize(DOCUMENT '<foo> <bar></bar> </foo>' AS text INDENT);
+SELECT xmlserialize(CONTENT 'text node<foo> <bar></bar> </foo>' AS text INDENT);
SELECT xml '<foo>bar</foo>' IS DOCUMENT;
SELECT xml '<foo>bar</foo><bar>foo</bar>' IS DOCUMENT;
--
2.34.1